import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
Reading data¶
We'll use the following Pokemon Data.
df = pd.read_csv('datasets/Pokemon.csv')
df.head()
DataFrame
s have an index. We can set it using set_index()
method:
df.set_index('#', inplace=True)
df.index.name = 'Id'
df.index
Internally, pandas
uses a very important library in the Data Science space which is numpy
. We can see the internal representation with .values
:
df.values
df.head()
Composition¶
# dataframe elements
df.size
# dataframe dimensions
df.ndim
# dataframe shape
df.shape
# traspose dataframe
df.T
Pandas Data Types¶
A data type is essentially an internal construct that a programming language uses to understand how to store and manipulate data.
Dataframes
have columns, which each one has types:
- object
- int64
- float64
- datetime64
- bool
- category
We can see the column names of our DataFrame
:
df.columns
We can set these the types of the columns if we need it to:
df['Type 1'] = df['Type 1'].astype('category')
df['Type 2'] = df['Type 2'].astype('category')
df.dtypes
df.info()
Special types will have benefits based on that:
df['Type 1'].value_counts()
Also, we can do all of this at read time:
df = pd.read_csv('datasets/Pokemon.csv',
index_col='#',
dtype={
'Type 1': 'category',
'Type 2': 'category'
})
df.head()
df.dtypes
df.tail()
Indexing and selection¶
df['Attack'].head()
df[['Attack', 'Defense']].head()
df.loc[3]
df.loc[3: 10]
df.loc[3, ['Name', 'Attack', 'Defense']]
df.loc[[2, 3], ['Name', 'Attack', 'Defense']]
df.loc[2:6, ['Name', 'Attack', 'Defense']]
We can also use sequential position:
df.iloc[0]
df.iloc[-1]
Summary statistics¶
A large number of methods collectively compute descriptive statistics and other related operations on DataFrame
.
df.describe()
df['Total'].mean()
df['Total'].sum()
df['Total'].max()
df['Total'].min()
Column distribution analysis¶
sns.distplot(df['Attack'], color='red')
sns.distplot(df['Defense'], color='blue')
sns.distplot(df['Speed'], color='green')
Correlation between base stats¶
correlations = df[['Total' ,'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']].corr()
sns.heatmap(correlations, annot=True)
sns.lmplot(x='Attack', y='Defense', data=df, height=7)
sns.lmplot(x='Attack', y='Total', data=df, height=7)
Filtering / Conditional Selection / Boolean arrays¶
Will allow you to select different elements from a DataFrame
or Series
.
df_small = df.loc[0:10]
df_small
df_small[[True, True, True, True, True, True, True, True, True, True]]
df_small[[True, False, False, True, False, False, True, False, False, True]]
But broadcasting operations will allow us to create those Boolean arrays dynamically.
For example, let's filter the Attack values higher than 80:
df_small['Attack'] > 80
So we can just use that expression to select just the Pokemons with Attack values higher than 50:
df_small.loc[df_small['Attack'] > 80]
df_small.loc[df_small['Attack'] > 80, 'Name']
df_small.loc[df_small['Attack'] > 80, ['Name', 'Type 1']]
df_small.loc[df_small['Type 1'] == 'Grass']
Combined operations¶
"Filters" can be combined with and
or or
operations:
# AND
(df_small['Attack'] > 80) & (df_small['Speed'] > 80)
df_small.loc[(df_small['Attack'] > 80) & (df_small['Speed'] > 80)]
# OR
(df_small['Attack'] > 80) | (df_small['Speed'] > 80)
df_small.loc[(df_small['Attack'] > 80) | (df_small['Speed'] > 80)]
# isin()
df_small.loc[df_small['Type 1'].isin(['Grass', 'Water'])]
Summary statistics after selecting¶
df_small.loc[df_small['Type 1'] == 'Fire', 'Total'].mean()
df_small.loc[df_small['Type 1'] == 'Water', 'Total'].mean()
df_small.loc[df_small['Type 1'] == 'Dragon']
df_small.loc[df_small['Type 1'] == 'Dragon', 'Total'].mean()
Also we can use the whole data:
df.loc[df['Legendary']]
total_count = df.shape[0]
legendary_count = df.loc[df['Legendary']].shape[0]
no_legendary_count = total_count - legendary_count
plt.pie([no_legendary_count, legendary_count],
labels=['Not Legendary', 'Legendary'],
autopct='%1.1f%%',
shadow=True,
startangle=90,
explode=(0, 0.1))
plt.title('Legendary Pokemons', fontsize=12)
plt.show()
df.loc[df['Generation'] == 1]
df.loc[df['Name'].str.contains('saur')]
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
ax1.tick_params(axis='x', rotation=45)
ax2.tick_params(axis='x', rotation=45)
sns.countplot(x='Type 1', data=df, ax=ax1)
sns.countplot(x='Type 2', data=df, ax=ax2)
Data Cleaning with Pandas¶
df.loc[1003] = ['Venusaur Mega', 'Grass', 'Poisson', 905, 150, 182, 143, 150, 140, 140, 1, False]
df.loc[1006] = ['Charizard Mega', 'Fire', 'Flying', 974, 178, 184, 138, 149, 165, 160, 1, False]
df.loc[1009] = ['Wartortle Mega', 'Water', np.nan, 950, 169, 183, 150, 165, 145, 138, 1, False]
df.loc[1015] = ['Beedrill Mega', 'Bug', 'Poisson', 495, 65, 150, 40, 15, 80, 145, np.nan, False]
df.loc[1018] = ['Pidgeot Mega', 'Normal', 'Flying', 579, 83, 80, 80, 135, 80, 121, np.nan, False]
df.loc[1099] = [np.nan, np.nan, np.nan, 579, 83, 80, 80, 135, 80, 121, 1, False]
df.tail()
Vectorized operations¶
We'll calculate a new Total Calculated
column using the following formula:
df['Total Calculated'] = df[['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']].sum(axis=1)
df.head()
Has our new Total Calculated
the same values that the original Total
column has?
(df['Total Calculated'] - df['Total']).sum() == 0
df.shape[0] == (df['Total Calculated'] == df['Total']).sum()
df['Total'].plot.hist()
sns.distplot(df['Total'])
Another calculated columns¶
We'll build two new Attack(Sp. Atk)
and Defense(Sp. Def)
columns containing both base and special stats at the same time:
df['Attack(Sp. Atk)'] = df.apply(lambda p: "{}({})".format(p['Attack'], p['Sp. Atk']), axis=1)
df['Defense(Sp. Def)'] = df.apply(lambda p: "{}({})".format(p['Defense'], p['Sp. Def']), axis=1)
df[['Name', 'Attack(Sp. Atk)', 'Defense(Sp. Def)']].head()
Outliers detection¶
#df['Total'].plot.box()
sns.boxplot(y='Total', data=df)
sns.boxplot(x='Generation', y='Total', data=df)
There are some outlier values... Let's get rid of them.
Cleaning and dealing with missing data¶
The first thing we're gonna do is to remove the outlier values detected.
q1 = df['Total'].quantile(0.25)
q3 = df['Total'].quantile(0.75)
iqr = q3 - q1 #Interquartile range
fence_low = q1 - 1.5*iqr
fence_high = q3 + 1.5*iqr
# "Outside" boxplot Pokemons
df.loc[(df['Total'] < fence_low) | (df['Total'] > fence_high)]
# Keep just the "inside" boxplot Pokemons
df = df.loc[(df['Total'] > fence_low) & (df['Total'] < fence_high)]
df['Total'].plot.box()
Cleaning missing values¶
Now we're gonna check missing values:
df.isna().sum()
We have missing values on the following columns:
Name
column: we'll drop that rows with missing values.Type 2
column: we'll just leave that as it is. It is possible for a Pokemon not to have Type 2 assigned.Generation
column: we'll fill that missing values withmethod='ffill'
(propagate last valid observation forward to next valid).
# Drop rows with missing value at 'Name' column
df.dropna(subset=['Name'], inplace=True)
# Fill rows with missing value at 'Generation' column with a '1' value
df['Generation'].fillna(method='ffill', inplace=True)
Another fill possibility is to fill using a fixed value:
df['Generation'].fillna(1, inplace=True)
Removing rows and columns¶
# remove row
df.drop([1, 2, 3]).head()
# remove column
df.drop(['Attack', 'Generation'], axis=1).head()
Sorting values¶
Weaker pokemons:
df.sort_values(by='Defense').head()
Strongest pokemons:
df.sort_values(by='Attack', ascending=False).head()
Grouping and Aggregations¶
Let's group our Pokemons per Generation
to get some aggregated information:
generations = df.groupby('Generation')
generations_attack = generations['Attack'].mean().sort_values(ascending=False)
generations_attack
sns.barplot(x=generations_attack.index,
y=generations_attack,
order=generations_attack.index)
generations = df.groupby('Generation')
generations_legendary = generations['Legendary'].sum().sort_values(ascending=False)
generations_legendary
sns.barplot(x=generations_legendary.index,
y=generations_legendary,
order=generations_legendary.index)
Also we can group using two columns at the same type.
In this case, we'll group by Generation
and Type 1
then get the mean Attack
per group:
generations_type = df.groupby(['Generation', 'Type 1'])
generations_type_attack = generations_type['Attack'].mean()
generations_type_attack.head(20)
formatted_df = generations_type_attack.unstack()
formatted_df.head(10)
fig, ax = plt.subplots(figsize=(16, 10))
sns.lineplot(data=formatted_df,
dashes=False,
ax=ax)
plt.legend(loc='center right', bbox_to_anchor=(1.1, 0.5))
Dynamic plots¶
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
from bokeh.palettes import Category10
output_notebook()
mypalette = Category10[10]
p = figure(plot_width=800, plot_height=500, x_range=(1, 7))
p.title.text = 'Click on legend entries to mute the corresponding lines'
for index, name in enumerate(formatted_df.columns):
current_color = mypalette[index % len(mypalette)]
line = p.line(x=formatted_df.index,
y=formatted_df.loc[:, name],
line_width=3,
color=current_color,
muted_color=current_color,
muted_alpha=0.05,
legend=name)
line.muted = True if index > 3 else False
p.legend.location = 'top_right'
p.legend.click_policy = 'mute'
show(p)