Last updated: June 4th, 2020
In [31]:
import pandas as pd
import numpy as np

In [43]:
# Load the whole CSV into memory; every later cell reads from this frame.
toy_dataset = pd.read_csv('toy_dataset.csv')
# Preview the first five rows (an assignment alone displays nothing).
toy_dataset.head()

Out[43]:
Number City Gender Age Income Illness
0 1 Dallas Male 41 40367.0 No
1 2 Dallas Male 54 45084.0 No
2 3 Dallas Male 42 52483.0 No
3 4 Dallas Male 40 40941.0 No
4 5 Dallas Male 46 50289.0 No
In [22]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
toy_dataset.shape

Out[22]:
(150000, 6)

If we need the Age of the person in row 3, we can look it up like this:

In [44]:
# Address the column by its label instead of the magic position 3, so the
# lookup keeps working if columns are reordered; the row is still selected
# positionally (position 3, i.e. the fourth row).
Age_3 = toy_dataset["Age"].iloc[3]
Age_3

Out[44]:
40

We saw how to calculate an empirical probability; now we use it to estimate the probability of the union of each pair of the following events:

A={The person chosen lives in San Diego}

B={The person chosen is Male}

C={The person chosen has an income of 78898.0}

In [45]:
def person(n_rows=None):
    """Draw the row position of one uniformly chosen person.

    Parameters
    ----------
    n_rows : int, optional
        Number of rows to choose from. When omitted, the length of the
        module-level ``toy_dataset`` frame is used.

    Returns
    -------
    int
        A position in ``[0, n_rows - 1]``, suitable for ``.iloc``.
    """
    if n_rows is None:
        n_rows = len(toy_dataset)
    # np.random.randint excludes its upper bound, so the row count itself is
    # passed; passing ``n_rows - 1`` would make the last row unreachable.
    return np.random.randint(0, n_rows)

In [54]:
def empirical_prob(n=10000, data=None):
    """Estimate P(A or B) by simulation.

    A = {the person lives in San Diego}, B = {the person is Male}.
    ``n`` people are drawn uniformly at random, with replacement, and the
    fraction of draws that fall in A or B is returned.

    Parameters
    ----------
    n : int
        Number of random draws.
    data : pandas.DataFrame, optional
        Frame with ``"City"`` and ``"Gender"`` columns. Defaults to the
        module-level ``toy_dataset``.

    Returns
    -------
    float
        Fraction of the draws for which the event occurred.
    """
    if data is None:
        data = toy_dataset
    # Draw all row positions at once; the upper bound is exclusive, so
    # len(data) makes every row (including the last) reachable.
    picks = np.random.randint(0, len(data), size=n)
    sample = data.iloc[picks]
    # Columns are addressed by label rather than by position.
    in_event = (sample["City"] == "San Diego") | (sample["Gender"] == "Male")
    # int(...) keeps the result a plain Python float.
    return int(in_event.sum()) / n
# Run the simulation with the default number of draws; the value changes
# from run to run because the draws are random.
empirical_prob()


Out[54]:
0.57384

Let's calculate the theoretical probability of $A\cup B$:

In [62]:
# Rows belonging to each event.
data_A = toy_dataset[toy_dataset["City"] == "San Diego"]
data_B = toy_dataset[toy_dataset["Gender"] == "Male"]
data_C = toy_dataset[toy_dataset["Income"] == 78898.0]  # not needed for A ∪ B
# Rows in both A and B (element-wise AND of the two conditions).
data_AnB = toy_dataset[(toy_dataset["City"] == "San Diego") & (toy_dataset["Gender"] == "Male")]
# Inclusion–exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|, divided by the actual
# number of rows instead of a hard-coded 150000.
(len(data_A) + len(data_B) - len(data_AnB)) / len(toy_dataset)

Out[62]:
0.5726533333333333

Another way

In [64]:
# Select the rows in A or B directly with an element-wise OR.
data_AB = toy_dataset[(toy_dataset["City"] == "San Diego") | (toy_dataset["Gender"] == "Male")]
# Divide by the actual row count rather than a hard-coded 150000.
len(data_AB) / len(toy_dataset)

Out[64]:
0.5726533333333333

Do the same for the other cases

In [ ]:


In [47]:
def empirical_prob(n=10000, data=None):
    """Estimate P(B or C) by simulation.

    B = {the person is Male}, C = {the person has an income of 78898.0}.
    ``n`` people are drawn uniformly at random, with replacement, and the
    fraction of draws that fall in B or C is returned.

    Parameters
    ----------
    n : int
        Number of random draws.
    data : pandas.DataFrame, optional
        Frame with ``"Gender"`` and ``"Income"`` columns. Defaults to the
        module-level ``toy_dataset``.

    Returns
    -------
    float
        Fraction of the draws for which the event occurred.
    """
    if data is None:
        data = toy_dataset
    # Draw all row positions at once; the upper bound is exclusive, so
    # len(data) makes every row (including the last) reachable.
    picks = np.random.randint(0, len(data), size=n)
    sample = data.iloc[picks]
    # Columns are addressed by label rather than by position.
    in_event = (sample["Gender"] == "Male") | (sample["Income"] == 78898.0)
    # int(...) keeps the result a plain Python float.
    return int(in_event.sum()) / n
# Run the simulation with the default number of draws; the value changes
# from run to run because the draws are random.
empirical_prob()

Out[47]:
0.5767
In [55]:
# Select the rows in B or C directly with an element-wise OR.
data_BC = toy_dataset[(toy_dataset["Gender"] == "Male") | (toy_dataset["Income"] == 78898.0)]
# Divide by the actual row count rather than a hard-coded 150000.
len(data_BC) / len(toy_dataset)

Out[55]:
0.5586733333333334
In [65]:
# Rows belonging to each event.
data_B = toy_dataset[toy_dataset["Gender"] == "Male"]
data_C = toy_dataset[toy_dataset["Income"] == 78898.0]
# Rows in both B and C (element-wise AND of the two conditions).
data_CnB = toy_dataset[(toy_dataset["Gender"] == "Male") & (toy_dataset["Income"] == 78898.0)]
# Inclusion–exclusion: |B ∪ C| = |B| + |C| - |B ∩ C|, divided by the actual
# number of rows instead of a hard-coded 150000.
(len(data_B) + len(data_C) - len(data_CnB)) / len(toy_dataset)

Out[65]:
0.5586733333333334
In [48]:
def empirical_prob(n=10000, data=None):
    """Estimate P(A or C) by simulation.

    A = {the person lives in San Diego}, C = {the person has an income of
    78898.0}. ``n`` people are drawn uniformly at random, with replacement,
    and the fraction of draws that fall in A or C is returned.

    Parameters
    ----------
    n : int
        Number of random draws.
    data : pandas.DataFrame, optional
        Frame with ``"City"`` and ``"Income"`` columns. Defaults to the
        module-level ``toy_dataset``.

    Returns
    -------
    float
        Fraction of the draws for which the event occurred.
    """
    if data is None:
        data = toy_dataset
    # Draw all row positions at once; the upper bound is exclusive, so
    # len(data) makes every row (including the last) reachable.
    picks = np.random.randint(0, len(data), size=n)
    sample = data.iloc[picks]
    # Columns are addressed by label rather than by position.
    in_event = (sample["City"] == "San Diego") | (sample["Income"] == 78898.0)
    # int(...) keeps the result a plain Python float.
    return int(in_event.sum()) / n
# Run the simulation with the default number of draws; the value changes
# from run to run because the draws are random.
empirical_prob()

Out[48]:
0.0323
In [56]:
# Select the rows in A or C directly with an element-wise OR.
data_AC = toy_dataset[(toy_dataset["City"] == "San Diego") | (toy_dataset["Income"] == 78898.0)]
# Divide by the actual row count rather than a hard-coded 150000.
len(data_AC) / len(toy_dataset)

Out[56]:
0.03254666666666667
In [66]:
# Rows belonging to each event.
data_A = toy_dataset[toy_dataset["City"] == "San Diego"]
data_C = toy_dataset[toy_dataset["Income"] == 78898.0]
# Rows in both A and C (element-wise AND of the two conditions).
data_AnC = toy_dataset[(toy_dataset["City"] == "San Diego") & (toy_dataset["Income"] == 78898.0)]
# Inclusion–exclusion: |A ∪ C| = |A| + |C| - |A ∩ C|, divided by the actual
# number of rows instead of a hard-coded 150000.
(len(data_A) + len(data_C) - len(data_AnC)) / len(toy_dataset)

Out[66]:
0.03254666666666667
In [ ]: