Profile picture

Co-founder @ RMOTR

Pandas Book Showcase

Last updated: November 5th, 20182018-11-05Project preview

This is just for demonstration purposes

This project includes a notebook from Wes McKinney's excellent book Python for Data Analysis. See the source code here.


Getting Started with pandas

In [1]:
import pandas as pd
import numpy as np

Introduction to pandas Data Structures

Series

In [4]:
obj = pd.Series([4, 7, -5, 3])
obj
Out[4]:
0    4
1    7
2   -5
3    3
dtype: int64
In [5]:
obj.values
Out[5]:
array([ 4,  7, -5,  3])
In [6]:
obj.index  # like range(4)
Out[6]:
RangeIndex(start=0, stop=4, step=1)
In [7]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2
Out[7]:
d    4
b    7
a   -5
c    3
dtype: int64
In [8]:
obj2.index  # A custom index
Out[8]:
Index(['d', 'b', 'a', 'c'], dtype='object')
In [9]:
obj2['a']
Out[9]:
-5
In [10]:
obj2['d'] = 6
obj2[['c', 'a', 'd']]
Out[10]:
c    3
a   -5
d    6
dtype: int64
In [11]:
obj2[obj2 > 0]
Out[11]:
d    6
b    7
c    3
dtype: int64
In [12]:
obj2 * 2
Out[12]:
d    12
b    14
a   -10
c     6
dtype: int64
In [13]:
np.exp(obj2)
Out[13]:
d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64
In [14]:
'b' in obj2, 'e' in obj2
Out[14]:
(True, False)
In [15]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3
Out[15]:
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
In [16]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4
Out[16]:
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
In [17]:
pd.isnull(obj4)
Out[17]:
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
In [18]:
pd.notnull(obj4)
Out[18]:
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
In [19]:
obj4.isnull()
Out[19]:
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
In [20]:
obj3 + obj4
Out[20]:
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64
In [21]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4
Out[21]:
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64
In [22]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj
Out[22]:
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

DataFrame

In [23]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
In [24]:
frame
Out[24]:
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
5 Nevada 2003 3.2
In [25]:
frame.head()
Out[25]:
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
In [26]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])
Out[26]:
year state pop
0 2000 Ohio 1.5
1 2001 Ohio 1.7
2 2002 Ohio 3.6
3 2001 Nevada 2.4
4 2002 Nevada 2.9
5 2003 Nevada 3.2
In [27]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four',
                             'five', 'six'])
frame2
Out[27]:
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN
six 2003 Nevada 3.2 NaN
In [28]:
frame2.columns
Out[28]:
Index(['year', 'state', 'pop', 'debt'], dtype='object')
In [29]:
frame2['state']
Out[29]:
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object
In [30]:
frame2.year
Out[30]:
one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64
In [31]:
frame2.loc['three']
Out[31]:
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object
In [32]:
frame2['debt'] = 16.5
frame2
Out[32]:
year state pop debt
one 2000 Ohio 1.5 16.5
two 2001 Ohio 1.7 16.5
three 2002 Ohio 3.6 16.5
four 2001 Nevada 2.4 16.5
five 2002 Nevada 2.9 16.5
six 2003 Nevada 3.2 16.5
In [33]:
frame2['debt'] = np.arange(6.)
frame2
Out[33]:
year state pop debt
one 2000 Ohio 1.5 0.0
two 2001 Ohio 1.7 1.0
three 2002 Ohio 3.6 2.0
four 2001 Nevada 2.4 3.0
five 2002 Nevada 2.9 4.0
six 2003 Nevada 3.2 5.0
In [34]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2
Out[34]:
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7
six 2003 Nevada 3.2 NaN
In [35]:
frame2['eastern'] = frame2.state == 'Ohio'
frame2
Out[35]:
year state pop debt eastern
one 2000 Ohio 1.5 NaN True
two 2001 Ohio 1.7 -1.2 True
three 2002 Ohio 3.6 NaN True
four 2001 Nevada 2.4 -1.5 False
five 2002 Nevada 2.9 -1.7 False
six 2003 Nevada 3.2 NaN False
In [36]:
del frame2['eastern']
frame2.columns
Out[36]:
Index(['year', 'state', 'pop', 'debt'], dtype='object')
In [37]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
In [38]:
frame3 = pd.DataFrame(pop)
frame3
Out[38]:
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
In [39]:
frame3.T
Out[39]:
2000 2001 2002
Nevada NaN 2.4 2.9
Ohio 1.5 1.7 3.6
In [40]:
pd.DataFrame(pop, index=pd.Series([2001, 2002, 2003]))
Out[40]:
Nevada Ohio
2001 2.4 1.7
2002 2.9 3.6
2003 NaN NaN
In [41]:
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}
pd.DataFrame(pdata)
Out[41]:
Ohio Nevada
2000 1.5 NaN
2001 1.7 2.4
In [42]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3
Out[42]:
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
In [43]:
frame3.values
Out[43]:
array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])
In [44]:
frame2.values
Out[44]:
array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

Index Objects

In [45]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index
Out[45]:
Index(['a', 'b', 'c'], dtype='object')
In [46]:
index[1:]
Out[46]:
Index(['b', 'c'], dtype='object')
In [190]:
index[1] = 'd' # Will raise TypeError
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-190-ef1d17e4669f> in <module>
----> 1 index[1] = 'd' # Will raise TypeError

/usr/local/lib/python3.6/site-packages/pandas/core/indexes/base.py in __setitem__(self, key, value)
   2048 
   2049     def __setitem__(self, key, value):
-> 2050         raise TypeError("Index does not support mutable operations")
   2051 
   2052     def __getitem__(self, key):

TypeError: Index does not support mutable operations
In [48]:
labels = pd.Index(np.arange(3))
labels
Out[48]:
Int64Index([0, 1, 2], dtype='int64')
In [49]:
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2
Out[49]:
0    1.5
1   -2.5
2    0.0
dtype: float64
In [50]:
obj2.index is labels
Out[50]:
True
In [51]:
frame3
Out[51]:
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
In [52]:
frame3.columns
Out[52]:
Index(['Nevada', 'Ohio'], dtype='object', name='state')
In [53]:
'Ohio' in frame3.columns
Out[53]:
True
In [54]:
2003 in frame3.index
Out[54]:
False
In [55]:
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels
Out[55]:
Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

Essential Functionality

Reindexing

In [56]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj
Out[56]:
d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
In [57]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2
Out[57]:
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
In [58]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3
Out[58]:
0      blue
2    purple
4    yellow
dtype: object
In [59]:
obj3.reindex(range(6), method='ffill')
Out[59]:
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
In [60]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
frame
Out[60]:
Ohio Texas California
a 0 1 2
c 3 4 5
d 6 7 8
In [61]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2
Out[61]:
Ohio Texas California
a 0.0 1.0 2.0
b NaN NaN NaN
c 3.0 4.0 5.0
d 6.0 7.0 8.0
In [62]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)
Out[62]:
Texas Utah California
a 1 NaN 2
c 4 NaN 5
d 7 NaN 8
In [63]:
frame.loc[['a', 'b', 'c', 'd'], states]
/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning: 
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.
Out[63]:
Texas Utah California
a 1.0 NaN 2.0
b NaN NaN NaN
c 4.0 NaN 5.0
d 7.0 NaN 8.0

Dropping Entries from an Axis

In [64]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj
Out[64]:
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
In [65]:
new_obj = obj.drop('c')
new_obj
Out[65]:
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
In [66]:
obj.drop(['d', 'c'])
Out[66]:
a    0.0
b    1.0
e    4.0
dtype: float64
In [67]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data
Out[67]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
In [68]:
data.drop(['Colorado', 'Ohio'])
Out[68]:
one two three four
Utah 8 9 10 11
New York 12 13 14 15
In [69]:
data.drop('two', axis=1)
Out[69]:
one three four
Ohio 0 2 3
Colorado 4 6 7
Utah 8 10 11
New York 12 14 15
In [70]:
data.drop(['two', 'four'], axis='columns')
Out[70]:
one three
Ohio 0 2
Colorado 4 6
Utah 8 10
New York 12 14
In [71]:
obj.drop('c', inplace=True)
obj
Out[71]:
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

Indexing, Selection, and Filtering

In [72]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj
Out[72]:
a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
In [73]:
obj['b']
Out[73]:
1.0
In [74]:
obj[1]
Out[74]:
1.0
In [75]:
obj[2:4]
Out[75]:
c    2.0
d    3.0
dtype: float64
In [76]:
obj[['b', 'a', 'd']]
Out[76]:
b    1.0
a    0.0
d    3.0
dtype: float64
In [77]:
obj[[1, 3]]
Out[77]:
b    1.0
d    3.0
dtype: float64
In [78]:
obj[obj < 2]
Out[78]:
a    0.0
b    1.0
dtype: float64
In [79]:
obj['b':'c']
Out[79]:
b    1.0
c    2.0
dtype: float64
In [80]:
obj['b':'c'] = 5
obj
Out[80]:
a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64
In [81]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data
Out[81]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
In [82]:
data['two']
Out[82]:
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64
In [83]:
data[['three', 'one']]
Out[83]:
three one
Ohio 2 0
Colorado 6 4
Utah 10 8
New York 14 12
In [84]:
data[:2]
Out[84]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
In [85]:
data[data['three'] > 5]
Out[85]:
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
In [86]:
data < 5
Out[86]:
one two three four
Ohio True True True True
Colorado True False False False
Utah False False False False
New York False False False False
In [87]:
data[data < 5] = 0
data
Out[87]:
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

Selection with loc and iloc

In [88]:
data.loc['Colorado', ['two', 'three']]
Out[88]:
two      5
three    6
Name: Colorado, dtype: int64
In [89]:
data.iloc[2, [3, 0, 1]]
Out[89]:
four    11
one      8
two      9
Name: Utah, dtype: int64
In [90]:
data.iloc[2]
Out[90]:
one       8
two       9
three    10
four     11
Name: Utah, dtype: int64
In [91]:
data.iloc[[1, 2], [3, 0, 1]]
Out[91]:
four one two
Colorado 7 0 5
Utah 11 8 9
In [92]:
data.loc[:'Utah', 'two']
Out[92]:
Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64
In [93]:
data.iloc[:, :3][data.three > 5]
Out[93]:
one two three
Colorado 0 5 6
Utah 8 9 10
New York 12 13 14

Integer Indexes

ser = pd.Series(np.arange(3.)) ser ser[-1]

In [94]:
ser = pd.Series(np.arange(3.))
In [95]:
ser
Out[95]:
0    0.0
1    1.0
2    2.0
dtype: float64
In [96]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser2[-1]
Out[96]:
2.0
In [97]:
ser[:1]
Out[97]:
0    0.0
dtype: float64
In [98]:
ser.loc[:1]
Out[98]:
0    0.0
1    1.0
dtype: float64
In [99]:
ser.iloc[:1]
Out[99]:
0    0.0
dtype: float64

Arithmetic and Data Alignment

In [100]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s1
Out[100]:
a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64
In [101]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=['a', 'c', 'e', 'f', 'g'])
s2
Out[101]:
a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64
In [102]:
s1 + s2
Out[102]:
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64
In [103]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df1
Out[103]:
b c d
Ohio 0.0 1.0 2.0
Texas 3.0 4.0 5.0
Colorado 6.0 7.0 8.0
In [104]:
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df2
Out[104]:
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0
In [105]:
df1 + df2
Out[105]:
b c d e
Colorado NaN NaN NaN NaN
Ohio 3.0 NaN 6.0 NaN
Oregon NaN NaN NaN NaN
Texas 9.0 NaN 12.0 NaN
Utah NaN NaN NaN NaN
In [106]:
df1 = pd.DataFrame({'A': [1, 2]})
df1
Out[106]:
A
0 1
1 2
In [107]:
df2 = pd.DataFrame({'B': [3, 4]})
df2
Out[107]:
B
0 3
1 4
In [108]:
df1 - df2
Out[108]:
A B
0 NaN NaN
1 NaN NaN

Arithmetic methods with fill values

In [109]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df1
Out[109]:
a b c d
0 0.0 1.0 2.0 3.0
1 4.0 5.0 6.0 7.0
2 8.0 9.0 10.0 11.0
In [110]:
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))
df2
Out[110]:
a b c d e
0 0.0 1.0 2.0 3.0 4.0
1 5.0 6.0 7.0 8.0 9.0
2 10.0 11.0 12.0 13.0 14.0
3 15.0 16.0 17.0 18.0 19.0
In [111]:
df2.loc[1, 'b'] = np.nan
df2
Out[111]:
a b c d e
0 0.0 1.0 2.0 3.0 4.0
1 5.0 NaN 7.0 8.0 9.0
2 10.0 11.0 12.0 13.0 14.0
3 15.0 16.0 17.0 18.0 19.0
In [112]:
df1 + df2
Out[112]:
a b c d e
0 0.0 2.0 4.0 6.0 NaN
1 9.0 NaN 13.0 15.0 NaN
2 18.0 20.0 22.0 24.0 NaN
3 NaN NaN NaN NaN NaN
In [113]:
df1.add(df2, fill_value=0)
Out[113]:
a b c d e
0 0.0 2.0 4.0 6.0 4.0
1 9.0 5.0 13.0 15.0 9.0
2 18.0 20.0 22.0 24.0 14.0
3 15.0 16.0 17.0 18.0 19.0
In [114]:
1 / df1
Out[114]:
a b c d
0 inf 1.000000 0.500000 0.333333
1 0.250000 0.200000 0.166667 0.142857
2 0.125000 0.111111 0.100000 0.090909
In [115]:
df1.rdiv(1)
Out[115]:
a b c d
0 inf 1.000000 0.500000 0.333333
1 0.250000 0.200000 0.166667 0.142857
2 0.125000 0.111111 0.100000 0.090909
In [116]:
df1.reindex(columns=df2.columns, fill_value=0)
Out[116]:
a b c d e
0 0.0 1.0 2.0 3.0 0
1 4.0 5.0 6.0 7.0 0
2 8.0 9.0 10.0 11.0 0

Operations between DataFrame and Series

In [117]:
arr = np.arange(12.).reshape((3, 4))
arr
Out[117]:
array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])
In [118]:
arr[0]
Out[118]:
array([0., 1., 2., 3.])
In [119]:
arr - arr[0]
Out[119]:
array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])
In [120]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
Out[120]:
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0
In [121]:
series = frame.iloc[0]
series
Out[121]:
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64
In [122]:
frame - series
Out[122]:
b d e
Utah 0.0 0.0 0.0
Ohio 3.0 3.0 3.0
Texas 6.0 6.0 6.0
Oregon 9.0 9.0 9.0
In [123]:
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
series2
Out[123]:
b    0
e    1
f    2
dtype: int64
In [124]:
frame + series2
Out[124]:
b d e f
Utah 0.0 NaN 3.0 NaN
Ohio 3.0 NaN 6.0 NaN
Texas 6.0 NaN 9.0 NaN
Oregon 9.0 NaN 12.0 NaN
In [125]:
series3 = frame['d']
series3
Out[125]:
Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64
In [126]:
frame.sub(series3, axis='index')
Out[126]:
b d e
Utah -1.0 0.0 1.0
Ohio -1.0 0.0 1.0
Texas -1.0 0.0 1.0
Oregon -1.0 0.0 1.0

Function Application and Mapping

In [127]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
Out[127]:
b d e
Utah -0.204708 0.478943 -0.519439
Ohio -0.555730 1.965781 1.393406
Texas 0.092908 0.281746 0.769023
Oregon 1.246435 1.007189 -1.296221
In [128]:
np.abs(frame)
Out[128]:
b d e
Utah 0.204708 0.478943 0.519439
Ohio 0.555730 1.965781 1.393406
Texas 0.092908 0.281746 0.769023
Oregon 1.246435 1.007189 1.296221
In [129]:
f = lambda x: x.max() - x.min()
frame.apply(f)
Out[129]:
b    1.802165
d    1.684034
e    2.689627
dtype: float64
In [130]:
frame.apply(f, axis='columns')
Out[130]:
Utah      0.998382
Ohio      2.521511
Texas     0.676115
Oregon    2.542656
dtype: float64
In [131]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

frame.apply(f)
Out[131]:
b d e
min -0.555730 0.281746 -1.296221
max 1.246435 1.965781 1.393406
In [132]:
format = lambda x: '%.2f' % x
frame.applymap(format)
Out[132]:
b d e
Utah -0.20 0.48 -0.52
Ohio -0.56 1.97 1.39
Texas 0.09 0.28 0.77
Oregon 1.25 1.01 -1.30
In [133]:
frame['e'].map(format)
Out[133]:
Utah      -0.52
Ohio       1.39
Texas      0.77
Oregon    -1.30
Name: e, dtype: object

Sorting and Ranking

In [134]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
In [135]:
obj.sort_index()
Out[135]:
a    1
b    2
c    3
d    0
dtype: int64
In [136]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
In [137]:
frame.sort_index()
Out[137]:
d a b c
one 4 5 6 7
three 0 1 2 3
In [138]:
frame.sort_index(axis=1)
Out[138]:
a b c d
three 1 2 3 0
one 5 6 7 4
In [139]:
frame.sort_index(axis=1, ascending=False)
Out[139]:
d c b a
three 0 3 2 1
one 4 7 6 5
In [140]:
obj = pd.Series([4, 7, -3, 2])
obj
Out[140]:
0    4
1    7
2   -3
3    2
dtype: int64
In [141]:
obj.sort_values()
Out[141]:
2   -3
3    2
0    4
1    7
dtype: int64
In [142]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj
Out[142]:
0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64
In [143]:
obj.sort_values()
Out[143]:
4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64
In [144]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
Out[144]:
b a
0 4 0
1 7 1
2 -3 0
3 2 1
In [145]:
frame
Out[145]:
b a
0 4 0
1 7 1
2 -3 0
3 2 1
In [146]:
frame.sort_values(by=['a', 'b'])
Out[146]:
b a
2 -3 0
0 4 0
3 2 1
1 7 1
In [147]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj
Out[147]:
0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64
In [148]:
obj.rank()
Out[148]:
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
In [149]:
obj.rank(method='first')
Out[149]:
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
In [150]:
# Assign tie values the maximum rank in the group
obj.rank(ascending=False, method='max')
Out[150]:
0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64
In [151]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame
Out[151]:
b a c
0 4.3 0 -2.0
1 7.0 1 5.0
2 -3.0 0 8.0
3 2.0 1 -2.5
In [152]:
frame.rank(axis='columns')
Out[152]:
b a c
0 3.0 2.0 1.0
1 3.0 1.0 2.0
2 1.0 2.0 3.0
3 3.0 2.0 1.0

Axis Indexes with Duplicate Labels

In [153]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj
Out[153]:
a    0
a    1
b    2
b    3
c    4
dtype: int64
In [154]:
obj.index.is_unique
Out[154]:
False
In [155]:
obj['a']
Out[155]:
a    0
a    1
dtype: int64
In [156]:
obj['c']
Out[156]:
4
In [157]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df
Out[157]:
0 1 2
a 0.274992 0.228913 1.352917
a 0.886429 -2.001637 -0.371843
b 1.669025 -0.438570 -0.539741
b 0.476985 3.248944 -1.021228
In [158]:
df.loc['b']
Out[158]:
0 1 2
b 1.669025 -0.438570 -0.539741
b 0.476985 3.248944 -1.021228

Summarizing and Computing Descriptive Statistics

In [159]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df
Out[159]:
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
In [160]:
df.sum()
Out[160]:
one    9.25
two   -5.80
dtype: float64
In [161]:
df.sum(axis='columns')
Out[161]:
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
In [162]:
df.mean(axis='columns', skipna=False)
Out[162]:
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64
In [163]:
df.idxmax()
Out[163]:
one    b
two    d
dtype: object
In [164]:
df.cumsum()
Out[164]:
one two
a 1.40 NaN
b 8.50 -4.5
c NaN NaN
d 9.25 -5.8
In [165]:
df.describe()
Out[165]:
one two
count 3.000000 2.000000
mean 3.083333 -2.900000
std 3.493685 2.262742
min 0.750000 -4.500000
25% 1.075000 -3.700000
50% 1.400000 -2.900000
75% 4.250000 -2.100000
max 7.100000 -1.300000
In [166]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj
Out[166]:
0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object
In [167]:
obj.describe()
Out[167]:
count     16
unique     3
top        a
freq       8
dtype: object

Correlation and Covariance

conda install pandas-datareader

In [168]:
price = pd.read_pickle('examples/yahoo_price.pkl')
price.head()
Out[168]:
AAPL GOOG IBM MSFT
Date
2010-01-04 27.990226 313.062468 113.304536 25.884104
2010-01-05 28.038618 311.683844 111.935822 25.892466
2010-01-06 27.592626 303.826685 111.208683 25.733566
2010-01-07 27.541619 296.753749 110.823732 25.465944
2010-01-08 27.724725 300.709808 111.935822 25.641571
In [169]:
volume = pd.read_pickle('examples/yahoo_volume.pkl')
volume.head()
Out[169]:
AAPL GOOG IBM MSFT
Date
2010-01-04 123432400 3927000 6155300 38409100
2010-01-05 150476200 6031900 6841400 49749600
2010-01-06 138040000 7987100 5605300 58182400
2010-01-07 119282800 12876600 5840600 50559700
2010-01-08 111902700 9483900 4197200 51197400

import pandas_datareader.data as web all_data = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

price = pd.DataFrame({ticker: data['Adj Close'] for ticker, data in all_data.items()}) volume = pd.DataFrame({ticker: data['Volume'] for ticker, data in all_data.items()})

In [170]:
returns = price.pct_change()
returns.tail()
Out[170]:
AAPL GOOG IBM MSFT
Date
2016-10-17 -0.000680 0.001837 0.002072 -0.003483
2016-10-18 -0.000681 0.019616 -0.026168 0.007690
2016-10-19 -0.002979 0.007846 0.003583 -0.002255
2016-10-20 -0.000512 -0.005652 0.001719 -0.004867
2016-10-21 -0.003930 0.003011 -0.012474 0.042096
In [171]:
returns['MSFT'].corr(returns['IBM'])
Out[171]:
0.49976361144151144
In [172]:
returns['MSFT'].cov(returns['IBM'])
Out[172]:
8.870655479703546e-05
In [173]:
returns.MSFT.corr(returns.IBM)
Out[173]:
0.49976361144151144
In [174]:
returns.corr()
Out[174]:
AAPL GOOG IBM MSFT
AAPL 1.000000 0.407919 0.386817 0.389695
GOOG 0.407919 1.000000 0.405099 0.465919
IBM 0.386817 0.405099 1.000000 0.499764
MSFT 0.389695 0.465919 0.499764 1.000000
In [175]:
returns.cov()
Out[175]:
AAPL GOOG IBM MSFT
AAPL 0.000277 0.000107 0.000078 0.000095
GOOG 0.000107 0.000251 0.000078 0.000108
IBM 0.000078 0.000078 0.000146 0.000089
MSFT 0.000095 0.000108 0.000089 0.000215
In [176]:
returns.corrwith(returns.IBM)
Out[176]:
AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764
dtype: float64
In [177]:
returns.corrwith(volume)
Out[177]:
AAPL   -0.075565
GOOG   -0.007067
IBM    -0.204849
MSFT   -0.092950
dtype: float64

Unique Values, Value Counts, and Membership

In [178]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
In [179]:
uniques = obj.unique()
uniques
Out[179]:
array(['c', 'a', 'd', 'b'], dtype=object)
In [180]:
obj.value_counts()
Out[180]:
a    3
c    3
b    2
d    1
dtype: int64
In [181]:
pd.value_counts(obj.values, sort=False)
Out[181]:
d    1
b    2
c    3
a    3
dtype: int64
In [182]:
mask = obj.isin(['b', 'c'])
mask
Out[182]:
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool
In [183]:
obj[mask]
Out[183]:
0    c
5    b
6    b
7    c
8    c
dtype: object
In [184]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
to_match
Out[184]:
0    c
1    a
2    b
3    b
4    c
5    a
dtype: object
In [185]:
unique_vals = pd.Series(['c', 'b', 'a'])
unique_vals
Out[185]:
0    c
1    b
2    a
dtype: object
In [186]:
pd.Index(unique_vals).get_indexer(to_match)
Out[186]:
array([0, 2, 1, 1, 0, 2])
In [187]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
data
Out[187]:
Qu1 Qu2 Qu3
0 1 2 1
1 3 3 5
2 4 1 2
3 3 2 4
4 4 3 4
In [188]:
result = data.apply(pd.value_counts).fillna(0)
result
Out[188]:
Qu1 Qu2 Qu3
1 1.0 1.0 1.0
2 0.0 2.0 1.0
3 2.0 2.0 0.0
4 2.0 0.0 2.0
5 0.0 0.0 1.0

Conclusion

In [189]:
pd.options.display.max_rows = PREVIOUS_MAX_ROWS
Notebooks AI
Notebooks AI Profile20060