Python - Learn Pandas

Last updated: August 25th, 2020 (2020-08-25) · Project preview

Conditional Selection (Boolean Arrays)

In [1]:
import pandas as pd
import numpy as np
In [11]:
# Sample country-level table used by every cell below: one row per country,
# labelled by country name, with four numeric indicators and a continent label.
df = pd.DataFrame(
    data={
        'Population': [35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
        'GDP': [1785387, 2833687, 3874437, 2167744, 4602367, 2950039, 17348075],
        'Surface Area': [9984670, 640679, 357114, 301336, 377930, 242495, 9525067],
        'HDI': [0.913, 0.888, 0.916, 0.873, 0.891, 0.907, 0.915],
        'Continent': ['America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe', 'America'],
    },
    # Row labels are supplied up front instead of being assigned to df.index afterwards.
    index=[
        'Canada',
        'France',
        'Germany',
        'Italy',
        'Japan',
        'United Kingdom',
        'United States',
    ],
)
In [12]:
df
Out[12]:
Population GDP Surface Area HDI Continent
Canada 35.467 1785387 9984670 0.913 America
France 63.951 2833687 640679 0.888 Europe
Germany 80.940 3874437 357114 0.916 Europe
Italy 60.665 2167744 301336 0.873 Europe
Japan 127.061 4602367 377930 0.891 Asia
United Kingdom 64.511 2950039 242495 0.907 Europe
United States 318.523 17348075 9525067 0.915 America
In [13]:
# Comparing a column with a scalar yields a boolean Series: one True/False per row label.
df['Population'] > 70
Out[13]:
Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: Population, dtype: bool
In [14]:
# A boolean Series passed to .loc acts as a row mask: only rows where it is True are kept.
df.loc[df['Population'] > 70]
Out[14]:
Population GDP Surface Area HDI Continent
Germany 80.940 3874437 357114 0.916 Europe
Japan 127.061 4602367 377930 0.891 Asia
United States 318.523 17348075 9525067 0.915 America
In [15]:
# Same row mask plus a single column label: the result is a Series, not a DataFrame.
df.loc[df['Population'] > 70, 'Population']
Out[15]:
Germany           80.940
Japan            127.061
United States    318.523
Name: Population, dtype: float64
In [16]:
# Same row mask with a list of column labels: the result is a DataFrame holding just those columns.
df.loc[df['Population'] > 70, ['Population', 'GDP']]
Out[16]:
Population GDP
Germany 80.940 3874437
Japan 127.061 4602367
United States 318.523 17348075

Dropping Stuff

In [17]:
# drop() with a row label returns a new frame without that row; the result is not
# assigned, so df itself still contains 'Canada' in the cells that follow.
df.drop('Canada')
Out[17]:
Population GDP Surface Area HDI Continent
France 63.951 2833687 640679 0.888 Europe
Germany 80.940 3874437 357114 0.916 Europe
Italy 60.665 2167744 301336 0.873 Europe
Japan 127.061 4602367 377930 0.891 Asia
United Kingdom 64.511 2950039 242495 0.907 Europe
United States 318.523 17348075 9525067 0.915 America
In [18]:
# columns=[...] removes columns instead of rows. As with the row drop, only the
# displayed result lacks them; df keeps 'Surface Area' and 'HDI'.
df.drop(columns=['Surface Area', 'HDI'])
Out[18]:
Population GDP Continent
Canada 35.467 1785387 America
France 63.951 2833687 Europe
Germany 80.940 3874437 Europe
Italy 60.665 2167744 Europe
Japan 127.061 4602367 Asia
United Kingdom 64.511 2950039 Europe
United States 318.523 17348075 America

Operations

In [20]:
# Double brackets select a one-column DataFrame (note the column header in the output);
# dividing by a scalar is applied to every value. df is not modified.
df[['Population']] / 10
Out[20]:
Population
Canada 3.5467
France 6.3951
Germany 8.0940
Italy 6.0665
Japan 12.7061
United Kingdom 6.4511
United States 31.8523

Modifying DataFrames

In [21]:
# A Series labelled by country name. It covers only three of the seven countries in df,
# which matters when it is assigned as a column in the next cell.
langs = pd.Series(
    ['French', 'German', 'Italian'],
    index=['France', 'Germany', 'Italy'],
    name='Language'
)
In [22]:
# Assigning a Series as a new column aligns on index labels: rows of df that have no
# matching label in langs are filled with NaN. This mutates df in place.
df['Language'] = langs
In [23]:
df
Out[23]:
Population GDP Surface Area HDI Continent Language
Canada 35.467 1785387 9984670 0.913 America NaN
France 63.951 2833687 640679 0.888 Europe French
Germany 80.940 3874437 357114 0.916 Europe German
Italy 60.665 2167744 301336 0.873 Europe Italian
Japan 127.061 4602367 377930 0.891 Asia NaN
United Kingdom 64.511 2950039 242495 0.907 Europe NaN
United States 318.523 17348075 9525067 0.915 America NaN

Replacing values per column

In [25]:
# Assigning a scalar broadcasts it to every row of the column.
# NOTE(review): the key here is 'Languages' (plural), which does not exist yet, so this
# ADDS a new column and leaves the existing 'Language' column (with its NaN values)
# untouched. The section heading is "Replacing values per column", so the intended
# key was presumably 'Language' -- confirm before relying on either column downstream.
df['Languages'] = 'English'
In [26]:
df
Out[26]:
Population GDP Surface Area HDI Continent Language Languages
Canada 35.467 1785387 9984670 0.913 America NaN English
France 63.951 2833687 640679 0.888 Europe French English
Germany 80.940 3874437 357114 0.916 Europe German English
Italy 60.665 2167744 301336 0.873 Europe Italian English
Japan 127.061 4602367 377930 0.891 Asia NaN English
United Kingdom 64.511 2950039 242495 0.907 Europe NaN English
United States 318.523 17348075 9525067 0.915 America NaN English

Renaming columns

In [27]:
# rename() takes old-label -> new-label mappings for columns and for the index.
# Keys that match no existing label ('Anual Popcorn Consumption', 'Argentina') are
# skipped without an error, as the output shows. The renamed frame is only displayed,
# not assigned, so df keeps its original labels.
df.rename(
    columns={
        'HDI': 'Human Development Index',
        'Anual Popcorn Consumption': 'APC'
    }, index={
        'United States': 'USA',
        'United Kingdom': 'UK',
        'Argentina': 'AR'
    })
Out[27]:
Population GDP Surface Area Human Development Index Continent Language Languages
Canada 35.467 1785387 9984670 0.913 America NaN English
France 63.951 2833687 640679 0.888 Europe French English
Germany 80.940 3874437 357114 0.916 Europe German English
Italy 60.665 2167744 301336 0.873 Europe Italian English
Japan 127.061 4602367 377930 0.891 Asia NaN English
UK 64.511 2950039 242495 0.907 Europe NaN English
USA 318.523 17348075 9525067 0.915 America NaN English
In [28]:
# A callable can be passed instead of a mapping; it is applied to every index label.
# Again the result is only displayed, so df is unchanged.
df.rename(index=str.upper)
Out[28]:
Population GDP Surface Area HDI Continent Language Languages
CANADA 35.467 1785387 9984670 0.913 America NaN English
FRANCE 63.951 2833687 640679 0.888 Europe French English
GERMANY 80.940 3874437 357114 0.916 Europe German English
ITALY 60.665 2167744 301336 0.873 Europe Italian English
JAPAN 127.061 4602367 377930 0.891 Asia NaN English
UNITED KINGDOM 64.511 2950039 242495 0.907 Europe NaN English
UNITED STATES 318.523 17348075 9525067 0.915 America NaN English
Notebooks AI
Notebooks AI Profile20060