Inclusion–Exclusion Principle

Last updated: October 5th, 2020
In [1]:
import numpy as np


We can determine the probability of pick a number between 1 and 100 that is divisible by 2 and 3 using the inclusion–exclusion principle:

In [21]:
c_2=0
c_3=0
c_23=0
for i in range(1,101):
if i%3==0:
c_3+=1
if i%2==0:
c_2+=1
if i%3==0 and i%2==0:
c_23+=1
print((c_2+c_3-c_23)/100)

0.67


We calculate the empirical probability:

In [5]:
def number():
return np.random.randint(0,100)

In [20]:
def empirical_prob(n=10000):
positive_cases = 0
for i in range(n):
pick = number()
if (pick %2==0) or (pick %3==0):
positive_cases +=1
return positive_cases/n
empirical_prob()

Out[20]:
0.6773

If we want to calculate the probability of pick a number between 1 and 200 that is divisible by 7 but isn't divisible by 3, we can do the following:

In [39]:
c_n3=0
c_7=0
c_n3_7=0
for i in range(1,201):
if i%7==0:
c_7+=1
if i%3 !=0:
c_n3+=1
if i%7==0 and i%3 !=0:
c_n3_7+=1
print((c_n3+c_7-c_n3_7)/200)

0.715


And the empirical probability is…

In [49]:
def number1():
return np.random.randint(0,200)

In [51]:
def empirical_prob(n=10000):
positive_cases = 0
for i in range(n):
pick = number1()
if (pick %7==0) or (pick %3!=0):
positive_cases +=1
return positive_cases/n
empirical_prob()

Out[51]:
0.7107

Now, you calculate the probability of pick a number between 1 and 350 that is divisible by 5 but isn't divisible by 3 and 7.

In [ ]:


In [55]:
def number2():
return np.random.randint(0,350)

In [86]:
def empirical_prob(n=100000):
positive_cases = 0
for i in range(n):
pick = number2()
if (pick %5==0) or (pick %3!=0) or (pick %7!=0):
positive_cases +=1
return positive_cases/n
empirical_prob()

Out[86]:
0.96272
In [87]:
c_5=0
c_n3=0
c_n7=0
c_5_n3=0
c_5_n7=0
c_n3_n7=0
c_5_n3_n7=0
for i in range(1,349):
if i%5==0:
c_5+=1
if i%3 !=0:
c_n3+=1
if i%7 !=0:
c_n7+=1
if i%5==0 and i%3 !=0:
c_5_n3+=1
if i%5==0 and i%7 !=0:
c_5_n7+=1
if i%3!=0 and i%7 !=0:
c_n3_n7+=1
if i%5==0 and i%3!=0 and i%7!=0:
c_5_n3_n7+=1

print(((c_5+c_n3+c_n7)-(+c_5_n3+c_5_n7+c_n3_n7)+(c_5_n3_n7))/350)

0.9571428571428572


Let's go back to the dataset that we were working

In [80]:
toy_dataset = pd.read_csv('toy_dataset.csv')

Out[80]:
Number City Gender Age Income Illness
0 1 Dallas Male 41 40367.0 No
1 2 Dallas Male 54 45084.0 No
2 3 Dallas Male 42 52483.0 No
3 4 Dallas Male 40 40941.0 No
4 5 Dallas Male 46 50289.0 No

Calculate the probability of picking a person who live in San Diego, is male or has an income less than 50000.

In [ ]:


In [83]:
def empirical_prob(n=10000):
positive_cases = 0
for i in range(n):
pick = person()
if toy_dataset.iloc[pick,1] == "San Diego" or toy_dataset.iloc[pick,2] == "Male" or toy_dataset.iloc[pick,4]<50000:
positive_cases +=1
return positive_cases/n
empirical_prob()

Out[83]:
0.6239
In [85]:
data_A= toy_dataset[toy_dataset["City"] == "San Diego"]
data_B= toy_dataset[toy_dataset["Gender"]=="Male"]
data_C=  toy_dataset[toy_dataset["Income"]<50000]
data_AnB = toy_dataset[(toy_dataset["City"] == "San Diego") & (toy_dataset["Gender"]=="Male")]
data_AnC = toy_dataset[(toy_dataset["City"] == "San Diego") & (toy_dataset["Income"]<50000)]
data_BnC = toy_dataset[(toy_dataset["Gender"]=="Male") & (toy_dataset["Income"]<50000)]
data_AnBnC =  toy_dataset[(toy_dataset["City"] == "San Diego") & (toy_dataset["Gender"]=="Male") & (toy_dataset["Income"]<50000)]
(data_A.shape[0]+data_B.shape[0]+data_C.shape[0]-data_AnB.shape[0]-data_AnC.shape[0]-data_BnC.shape[0]+data_AnBnC.shape[0])/150000

Out[85]:
0.6233866666666666