Inclusion–Exclusion Principle

Last updated: June 4th, 20202020-06-04Project preview
In [1]:
import numpy as np

We can determine the probability of pick a number between 1 and 100 that is divisible by 2 and 3 using the inclusion–exclusion principle:

In [21]:
c_2=0
c_3=0
c_23=0
for i in range(1,101):  
 if i%3==0:
    c_3+=1 
 if i%2==0:
    c_2+=1
 if i%3==0 and i%2==0: 
    c_23+=1
print((c_2+c_3-c_23)/100) 
0.67

We calculate the empirical probability:

In [5]:
def number():
    return np.random.randint(0,100)
In [20]:
def empirical_prob(n=10000):
    positive_cases = 0
    for i in range(n):
        pick = number()
        if (pick %2==0) or (pick %3==0): 
            positive_cases +=1
    return positive_cases/n
empirical_prob()
Out[20]:
0.6773

If we want to calculate the probability of pick a number between 1 and 200 that is divisible by 7 but isn't divisible by 3, we can do the following:

In [39]:
c_n3=0
c_7=0
c_n3_7=0
for i in range(1,201):  
 if i%7==0:
    c_7+=1 
 if i%3 !=0:
    c_n3+=1
 if i%7==0 and i%3 !=0: 
    c_n3_7+=1
print((c_n3+c_7-c_n3_7)/200) 
0.715

And the empirical probability is…

In [49]:
def number1():
    return np.random.randint(0,200)
In [51]:
def empirical_prob(n=10000):
    positive_cases = 0
    for i in range(n):
        pick = number1()
        if (pick %7==0) or (pick %3!=0): 
            positive_cases +=1
    return positive_cases/n
empirical_prob()
Out[51]:
0.7107

Now, you calculate the probability of pick a number between 1 and 350 that is divisible by 5 but isn't divisible by 3 and 7.

In [ ]:
 
In [55]:
def number2():
    return np.random.randint(0,350)
In [86]:
def empirical_prob(n=100000):
    positive_cases = 0
    for i in range(n):
        pick = number2()
        if (pick %5==0) or (pick %3!=0) or (pick %7!=0): 
            positive_cases +=1
    return positive_cases/n
empirical_prob()
Out[86]:
0.96272
In [87]:
c_5=0
c_n3=0
c_n7=0
c_5_n3=0
c_5_n7=0
c_n3_n7=0
c_5_n3_n7=0
for i in range(1,349):  
 if i%5==0:
    c_5+=1
 if i%3 !=0:
    c_n3+=1
 if i%7 !=0:
    c_n7+=1
 if i%5==0 and i%3 !=0: 
    c_5_n3+=1
 if i%5==0 and i%7 !=0: 
    c_5_n7+=1
 if i%3!=0 and i%7 !=0: 
    c_n3_n7+=1    
 if i%5==0 and i%3!=0 and i%7!=0: 
    c_5_n3_n7+=1  
 
print(((c_5+c_n3+c_n7)-(+c_5_n3+c_5_n7+c_n3_n7)+(c_5_n3_n7))/350)
0.9571428571428572

Let's go back to the dataset that we were working

In [80]:
toy_dataset = pd.read_csv('toy_dataset.csv')
toy_dataset.head()
Out[80]:
Number City Gender Age Income Illness
0 1 Dallas Male 41 40367.0 No
1 2 Dallas Male 54 45084.0 No
2 3 Dallas Male 42 52483.0 No
3 4 Dallas Male 40 40941.0 No
4 5 Dallas Male 46 50289.0 No

Calculate the probability of picking a person who live in San Diego, is male or has an income less than 50000.

In [ ]:
 
In [83]:
def empirical_prob(n=10000):
    positive_cases = 0
    for i in range(n):
        pick = person()
        if toy_dataset.iloc[pick,1] == "San Diego" or toy_dataset.iloc[pick,2] == "Male" or toy_dataset.iloc[pick,4]<50000:
            positive_cases +=1
    return positive_cases/n
empirical_prob()
Out[83]:
0.6239
In [85]:
data_A= toy_dataset[toy_dataset["City"] == "San Diego"]
data_B= toy_dataset[toy_dataset["Gender"]=="Male"]
data_C=  toy_dataset[toy_dataset["Income"]<50000]
data_AnB = toy_dataset[(toy_dataset["City"] == "San Diego") & (toy_dataset["Gender"]=="Male")]
data_AnC = toy_dataset[(toy_dataset["City"] == "San Diego") & (toy_dataset["Income"]<50000)]
data_BnC = toy_dataset[(toy_dataset["Gender"]=="Male") & (toy_dataset["Income"]<50000)]
data_AnBnC =  toy_dataset[(toy_dataset["City"] == "San Diego") & (toy_dataset["Gender"]=="Male") & (toy_dataset["Income"]<50000)]
(data_A.shape[0]+data_B.shape[0]+data_C.shape[0]-data_AnB.shape[0]-data_AnC.shape[0]-data_BnC.shape[0]+data_AnBnC.shape[0])/150000
Out[85]:
0.6233866666666666
Notebooks AI
Notebooks AI Profile20060