Pandas Advanced Tutorial

Python
A tutorial on pandas
Published

May 1, 2021

Pandas Tutorial

Show code
import pandas as pd
import numpy as np
Show code
from numpy.random import randn
np.random.seed(42)
Show code
df = pd.DataFrame(randn(5, 4), index="A B C D E".split(), columns='W X Y Z'.split())
df
W X Y Z
A 1.465649 -0.225776 0.067528 -1.424748
B -0.544383 0.110923 -1.150994 0.375698
C -0.600639 -0.291694 -0.601707 1.852278
D -0.013497 -1.057711 0.822545 -1.220844
E 0.208864 -1.959670 -1.328186 0.196861

Reset index

Show code
# Reset to default 0,1...n index
df.reset_index()
index W X Y Z
0 A 1.465649 -0.225776 0.067528 -1.424748
1 B -0.544383 0.110923 -1.150994 0.375698
2 C -0.600639 -0.291694 -0.601707 1.852278
3 D -0.013497 -1.057711 0.822545 -1.220844
4 E 0.208864 -1.959670 -1.328186 0.196861
Show code
newind = 'CA NY WY OR CO'.split()
Show code
df['States'] = newind
df
W X Y Z States
A 1.465649 -0.225776 0.067528 -1.424748 CA
B -0.544383 0.110923 -1.150994 0.375698 NY
C -0.600639 -0.291694 -0.601707 1.852278 WY
D -0.013497 -1.057711 0.822545 -1.220844 OR
E 0.208864 -1.959670 -1.328186 0.196861 CO
Show code
df.set_index('States',inplace=True)
df
W X Y Z
States
CA 1.465649 -0.225776 0.067528 -1.424748
NY -0.544383 0.110923 -1.150994 0.375698
WY -0.600639 -0.291694 -0.601707 1.852278
OR -0.013497 -1.057711 0.822545 -1.220844
CO 0.208864 -1.959670 -1.328186 0.196861

Groupby

Show code
import pandas as pd
# Create dataframe
data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
       'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
       'Sales':[200,120,340,124,243,350]}
df = pd.DataFrame(data)
Show code
df
Company Person Sales
0 GOOG Sam 200
1 GOOG Charlie 120
2 MSFT Amy 340
3 MSFT Vanessa 124
4 FB Carl 243
5 FB Sarah 350
Show code
df.groupby('Company')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000018A0C6A41D0>
Show code
by_comp = df.groupby("Company")
Show code
by_comp.mean()
Sales
Company
FB 296.5
GOOG 160.0
MSFT 232.0
Show code
df.groupby('Company').mean()
Sales
Company
FB 296.5
GOOG 160.0
MSFT 232.0
Show code
by_comp.std()
Sales
Company
FB 75.660426
GOOG 56.568542
MSFT 152.735065
Show code
by_comp.min()
Person Sales
Company
FB Carl 243
GOOG Charlie 120
MSFT Amy 124
Show code
by_comp.max()
Person Sales
Company
FB Sarah 350
GOOG Sam 200
MSFT Vanessa 340
Show code
by_comp.count()
Person Sales
Company
FB 2 2
GOOG 2 2
MSFT 2 2
Show code
by_comp.describe()
Sales
count mean std min 25% 50% 75% max
Company
FB 2.0 296.5 75.660426 243.0 269.75 296.5 323.25 350.0
GOOG 2.0 160.0 56.568542 120.0 140.00 160.0 180.00 200.0
MSFT 2.0 232.0 152.735065 124.0 178.00 232.0 286.00 340.0
Show code
by_comp.describe().transpose()
Company FB GOOG MSFT
Sales count 2.000000 2.000000 2.000000
mean 296.500000 160.000000 232.000000
std 75.660426 56.568542 152.735065
min 243.000000 120.000000 124.000000
25% 269.750000 140.000000 178.000000
50% 296.500000 160.000000 232.000000
75% 323.250000 180.000000 286.000000
max 350.000000 200.000000 340.000000
Show code
by_comp.describe().transpose()['GOOG']
Sales  count      2.000000
       mean     160.000000
       std       56.568542
       min      120.000000
       25%      140.000000
       50%      160.000000
       75%      180.000000
       max      200.000000
Name: GOOG, dtype: float64

apply

Show code
df = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']})
df.head()
col1 col2 col3
0 1 444 abc
1 2 555 def
2 3 666 ghi
3 4 444 xyz
Show code
def times2(x):
    return x*2
Show code
df['col1'].apply(times2)
0    2
1    4
2    6
3    8
Name: col1, dtype: int64
Show code
df['col3'].apply(len)
0    3
1    3
2    3
3    3
Name: col3, dtype: int64
Show code
df['col1'].sum()
10