In [31]:
import pandas as pd
import numpy as np
In [43]:
# Read the CSV (path is relative to the current working directory) into a
# DataFrame, then preview its first five rows.
toy_dataset = pd.read_csv("toy_dataset.csv")
toy_dataset.head(5)
Out[43]:
In [22]:
# (number of rows, number of columns) of the loaded DataFrame.
toy_dataset.shape
Out[22]:
If we need the Age of the person in row 3 (zero-based position, i.e. the fourth row of the table), we can look it up by position:
In [44]:
# Scalar lookup by integer position: row position 3, column position 3
# (both zero-based, so this is the fourth row and the fourth column).
# NOTE(review): column position 3 is presumed to be the Age column - confirm
# against the header of toy_dataset.csv.
Age_3 = toy_dataset.iat[3, 3]
Age_3
Out[44]:
We saw how to calculate an empirical probability; now we use it to estimate the probability of the union of each pair of the following events:
A={The person chosen lives in San Diego}
B={The person chosen is Male}
C={The person chosen has an income of 78898.0}
In [45]:
def person(n_rows=None):
    """Draw the position of one row uniformly at random.

    Parameters
    ----------
    n_rows : int, optional
        Number of rows to draw from. When omitted, the current length of
        ``toy_dataset`` is used, so every row of the table can be picked.

    Returns
    -------
    int
        A row position in ``[0, n_rows - 1]``, suitable for ``.iloc``.

    Raises
    ------
    ValueError
        Raised by ``np.random.randint`` when ``n_rows`` is less than 1.
    """
    if n_rows is None:
        n_rows = len(toy_dataset)
    # np.random.randint excludes the upper bound, so passing the row count
    # itself (not row count - 1) is what makes the last row reachable.
    return np.random.randint(0, n_rows)
In [54]:
def empirical_prob(n=10000):
    """Estimate P(A or B) by repeated random sampling.

    Each of the ``n`` trials asks ``person()`` for a row position and counts
    the trial as a success when positional column 1 of that row equals
    "San Diego" or positional column 2 equals "Male".

    Parameters
    ----------
    n : int
        Number of random trials.

    Returns
    -------
    float
        Fraction of trials that were successes.

    Raises
    ------
    ZeroDivisionError
        If ``n`` is 0.
    """
    def in_event(row_pos):
        # `or` short-circuits: column 2 is only read when column 1 did not match.
        return (
            toy_dataset.iloc[row_pos, 1] == "San Diego"
            or toy_dataset.iloc[row_pos, 2] == "Male"
        )

    hits = sum(1 for _ in range(n) if in_event(person()))
    return hits / n


empirical_prob()
Out[54]:
Let's calculate the theoretical probability of $A\cup B$ using inclusion–exclusion: $P(A\cup B) = P(A) + P(B) - P(A\cap B)$.
In [62]:
# Exact P(A or B) over the whole table via inclusion-exclusion:
# (|A| + |B| - |A and B|) / total rows.
data_A = toy_dataset[toy_dataset["City"] == "San Diego"]
data_B = toy_dataset[toy_dataset["Gender"] == "Male"]
# Event C is not needed for A or B; the subset is still built so the name
# data_C stays defined exactly as before.
data_C = toy_dataset[toy_dataset["Income"] == 78898.0]
data_AnB = toy_dataset[(toy_dataset["City"] == "San Diego") & (toy_dataset["Gender"] == "Male")]

# Divide by the actual number of rows rather than a hard-coded 150000, so the
# result stays a valid probability if the CSV ever has a different length.
(len(data_A) + len(data_B) - len(data_AnB)) / len(toy_dataset)
Out[62]:
Another way: count directly the rows that satisfy at least one of the two conditions.
In [64]:
# Rows where the city is "San Diego" OR the gender is "Male" (element-wise |).
data_AB = toy_dataset[(toy_dataset["City"] == "San Diego") | (toy_dataset["Gender"] == "Male")]

# Share of such rows; the denominator is the table's real row count instead of
# a hard-coded 150000.
len(data_AB) / len(toy_dataset)
Out[64]:
Do the same for the other two pairs of events, $B\cup C$ and $A\cup C$:
In [ ]:
In [47]:
def empirical_prob(n=10000):
    """Estimate P(B or C) by repeated random sampling.

    Each of the ``n`` trials asks ``person()`` for a row position and counts
    the trial as a success when positional column 2 of that row equals "Male"
    or positional column 4 equals 78898.0.

    Parameters
    ----------
    n : int
        Number of random trials.

    Returns
    -------
    float
        Fraction of trials that were successes.

    Raises
    ------
    ZeroDivisionError
        If ``n`` is 0.
    """
    # Lazily draw one row position per trial, in trial order.
    picks = (person() for _ in range(n))
    hits = sum(
        # `or` short-circuits: column 4 is only read when column 2 did not match.
        bool(toy_dataset.iloc[idx, 2] == "Male" or toy_dataset.iloc[idx, 4] == 78898.0)
        for idx in picks
    )
    return hits / n


empirical_prob()
Out[47]:
In [55]:
# Rows where the gender is "Male" OR the income equals 78898.0 (element-wise |).
data_BC = toy_dataset[(toy_dataset["Gender"] == "Male") | (toy_dataset["Income"] == 78898.0)]

# Share of such rows; the denominator is the table's real row count instead of
# a hard-coded 150000.
len(data_BC) / len(toy_dataset)
Out[55]:
In [65]:
# Exact P(B or C) over the whole table via inclusion-exclusion:
# (|B| + |C| - |B and C|) / total rows.
data_B = toy_dataset[toy_dataset["Gender"] == "Male"]
data_C = toy_dataset[toy_dataset["Income"] == 78898.0]
data_CnB = toy_dataset[(toy_dataset["Gender"] == "Male") & (toy_dataset["Income"] == 78898.0)]

# Divide by the actual number of rows rather than a hard-coded 150000.
(len(data_B) + len(data_C) - len(data_CnB)) / len(toy_dataset)
Out[65]:
In [48]:
def empirical_prob(n=10000):
    """Estimate P(A or C) by repeated random sampling.

    Each of the ``n`` trials asks ``person()`` for a row position and counts
    the trial as a success when positional column 1 of that row equals
    "San Diego" or positional column 4 equals 78898.0.

    Parameters
    ----------
    n : int
        Number of random trials.

    Returns
    -------
    float
        Fraction of trials that were successes.

    Raises
    ------
    ZeroDivisionError
        If ``n`` is 0.
    """
    successes = 0
    for _ in range(n):
        chosen = person()
        in_san_diego = toy_dataset.iloc[chosen, 1] == "San Diego"
        # Guard clause: skip the trial when neither condition holds. Column 4
        # is only read when the city did not already match.
        if not in_san_diego and not (toy_dataset.iloc[chosen, 4] == 78898.0):
            continue
        successes += 1
    return successes / n


empirical_prob()
Out[48]:
In [56]:
# Rows where the city is "San Diego" OR the income equals 78898.0 (element-wise |).
data_AC = toy_dataset[(toy_dataset["City"] == "San Diego") | (toy_dataset["Income"] == 78898.0)]

# Share of such rows; the denominator is the table's real row count instead of
# a hard-coded 150000.
len(data_AC) / len(toy_dataset)
Out[56]:
In [66]:
# Exact P(A or C) over the whole table via inclusion-exclusion:
# (|A| + |C| - |A and C|) / total rows.
data_A = toy_dataset[toy_dataset["City"] == "San Diego"]
data_C = toy_dataset[toy_dataset["Income"] == 78898.0]
data_AnC = toy_dataset[(toy_dataset["City"] == "San Diego") & (toy_dataset["Income"] == 78898.0)]

# Divide by the actual number of rows rather than a hard-coded 150000.
(len(data_A) + len(data_C) - len(data_AnC)) / len(toy_dataset)
Out[66]:
In [ ]: