Pandas Tutorial

import pandas as pd
import numpy as np
from numpy.random import randn
np.random.seed(42)
df = pd.DataFrame(randn(5, 4), index="A B C D E".split(), columns='W X Y Z'.split())
df
W X Y Z
A 1.465649 -0.225776 0.067528 -1.424748
B -0.544383 0.110923 -1.150994 0.375698
C -0.600639 -0.291694 -0.601707 1.852278
D -0.013497 -1.057711 0.822545 -1.220844
E 0.208864 -1.959670 -1.328186 0.196861

Reset index

# Reset to default 0,1...n index
df.reset_index()
index W X Y Z
0 A 1.465649 -0.225776 0.067528 -1.424748
1 B -0.544383 0.110923 -1.150994 0.375698
2 C -0.600639 -0.291694 -0.601707 1.852278
3 D -0.013497 -1.057711 0.822545 -1.220844
4 E 0.208864 -1.959670 -1.328186 0.196861
newind = 'CA NY WY OR CO'.split()
df['States'] = newind
df
W X Y Z States
A 1.465649 -0.225776 0.067528 -1.424748 CA
B -0.544383 0.110923 -1.150994 0.375698 NY
C -0.600639 -0.291694 -0.601707 1.852278 WY
D -0.013497 -1.057711 0.822545 -1.220844 OR
E 0.208864 -1.959670 -1.328186 0.196861 CO
df.set_index('States',inplace=True)
df
W X Y Z
States
CA 1.465649 -0.225776 0.067528 -1.424748
NY -0.544383 0.110923 -1.150994 0.375698
WY -0.600639 -0.291694 -0.601707 1.852278
OR -0.013497 -1.057711 0.822545 -1.220844
CO 0.208864 -1.959670 -1.328186 0.196861

Groupby

import pandas as pd
# Create dataframe
data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
       'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
       'Sales':[200,120,340,124,243,350]}
df = pd.DataFrame(data)
df
Company Person Sales
0 GOOG Sam 200
1 GOOG Charlie 120
2 MSFT Amy 340
3 MSFT Vanessa 124
4 FB Carl 243
5 FB Sarah 350
df.groupby('Company')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000018A0C6A41D0>
by_comp = df.groupby("Company")
by_comp.mean()
Sales
Company
FB 296.5
GOOG 160.0
MSFT 232.0
df.groupby('Company').mean()
Sales
Company
FB 296.5
GOOG 160.0
MSFT 232.0
by_comp.std()
Sales
Company
FB 75.660426
GOOG 56.568542
MSFT 152.735065
by_comp.min()
Person Sales
Company
FB Carl 243
GOOG Charlie 120
MSFT Amy 124
by_comp.max()
Person Sales
Company
FB Sarah 350
GOOG Sam 200
MSFT Vanessa 340
by_comp.count()
Person Sales
Company
FB 2 2
GOOG 2 2
MSFT 2 2
by_comp.describe()
Sales
count mean std min 25% 50% 75% max
Company
FB 2.0 296.5 75.660426 243.0 269.75 296.5 323.25 350.0
GOOG 2.0 160.0 56.568542 120.0 140.00 160.0 180.00 200.0
MSFT 2.0 232.0 152.735065 124.0 178.00 232.0 286.00 340.0
by_comp.describe().transpose()
Company FB GOOG MSFT
Sales count 2.000000 2.000000 2.000000
mean 296.500000 160.000000 232.000000
std 75.660426 56.568542 152.735065
min 243.000000 120.000000 124.000000
25% 269.750000 140.000000 178.000000
50% 296.500000 160.000000 232.000000
75% 323.250000 180.000000 286.000000
max 350.000000 200.000000 340.000000
by_comp.describe().transpose()['GOOG']
Sales  count      2.000000
       mean     160.000000
       std       56.568542
       min      120.000000
       25%      140.000000
       50%      160.000000
       75%      180.000000
       max      200.000000
Name: GOOG, dtype: float64

apply

df = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']})
df.head()
col1 col2 col3
0 1 444 abc
1 2 555 def
2 3 666 ghi
3 4 444 xyz
def times2(x):
    return x*2
df['col1'].apply(times2)
0    2
1    4
2    6
3    8
Name: col1, dtype: int64
df['col3'].apply(len)
0    3
1    3
2    3
3    3
Name: col3, dtype: int64
df['col1'].sum()
10