Conditional Probability

Last updated: September 7th, 20202020-09-07Project preview
In [1]:
import pandas as pd
import numpy as np
In [2]:
toy_dataset = pd.read_csv('Churn Modeling.csv')
toy_dataset.head() 
Out[2]:
RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
0 1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1 101348.88 1
1 2 15647311 Hill 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0
2 3 15619304 Onio 502 France Female 42 8 159660.80 3 1 0 113931.57 1
3 4 15701354 Boni 699 France Female 39 1 0.00 2 0 0 93826.63 0
4 5 15737888 Mitchell 850 Spain Female 43 2 125510.82 1 1 1 79084.10 0

This is a dataset which contain some customers who are withdrawing their account from the bank due to some loss and other issues.

What is the probability of a person withdrawing their account given that the person is from France?

We saw in the video that we can calculate this using: $\dfrac{P(Exited\cap France)}{P(France)}$

In [3]:
data_AB = toy_dataset[(toy_dataset["Geography"] == "France") & (toy_dataset["Exited"]==1)]
data_A= toy_dataset[toy_dataset["Geography"] == "France"]
data_AB.shape[0]/data_A.shape[0]
Out[3]:
0.16154766653370561

Now we calculate this using empirical probability:

In [4]:
def number():
    return np.random.randint(0,toy_dataset.shape[0])
In [5]:
def empirical_probFE(n=10000):
    positive_cases = 0
    for i in range(n):
        e = number()
        if (toy_dataset.loc[e][4] == 'France'): 
            positive_cases = positive_cases + toy_dataset.loc[e][13]
    return positive_cases/n
empirical_probFE()
Out[5]:
0.082
In [6]:
def empirical_probF(n=10000):
    positive_cases = 0
    for i in range(n):
        e = number()
        if (toy_dataset.loc[e][4] == 'France'): 
            positive_cases = positive_cases + 1
    return positive_cases/n
empirical_probF()
Out[6]:
0.5083
In [7]:
empirical_probFE(10000)/empirical_probF(10000)
Out[7]:
0.1618207339081612

exercise 1: Is it more likely that a person is withdrawing their account given that person is female or male?

In [ ]:
 
In [8]:
data_FE = toy_dataset[(toy_dataset["Gender"] == "Female") & (toy_dataset["Exited"]==1)]
data_F= toy_dataset[toy_dataset["Gender"] == "Female"]
data_ME = toy_dataset[(toy_dataset["Gender"] == "Male") & (toy_dataset["Exited"]==1)]
PFE=data_FE.shape[0]/data_F.shape[0]
PME=data_ME.shape[0]/(toy_dataset.shape[0]-data_F.shape[0])
PFE, PME
Out[8]:
(0.2507153863086066, 0.16455928165658787)
In [9]:
def empirical_probFemaleE(n=10000):
    positive_cases = 0
    for i in range(n):
        e = number()
        if (toy_dataset.loc[e][5] == 'Female'): 
            positive_cases = positive_cases + toy_dataset.loc[e][13]
    return positive_cases/n
empirical_probFemaleE()
Out[9]:
0.1144
In [10]:
def empirical_probFemale(n=10000):
    positive_cases = 0
    for i in range(n):
        e = number()
        if (toy_dataset.loc[e][5] == 'Female'): 
            positive_cases = positive_cases + 1
    return positive_cases/n
empirical_probFemale()
Out[10]:
0.4486
In [11]:
def empirical_probMaleE(n=100000):
    positive_cases = 0
    for i in range(n):
        e = number()
        if (toy_dataset.loc[e][5] == 'Male'): 
            positive_cases = positive_cases + toy_dataset.loc[e][13]
    return positive_cases/n
empirical_probMaleE()
Out[11]:
0.09082
In [12]:
empirical_probMale = 1 - empirical_probFemale(100000)
In [13]:
empirical_probFemaleE(100000)/empirical_probFemale(100000), empirical_probMaleE(100000)/empirical_probMale
Out[13]:
(0.2518163549234812, 0.1638247158049138)
In [ ]:
 

exercise 2: Is it more likely that a person is withdrawing their account given that person have a Estimated Salary less than 100000?

In [ ]:
 
In [14]:
data_LE = toy_dataset[(toy_dataset["EstimatedSalary"] < 100000) & (toy_dataset["Exited"]==1)]
data_L= toy_dataset[toy_dataset["EstimatedSalary"] < 100000]
data_PE = toy_dataset[(toy_dataset["EstimatedSalary"] >= 100000) & (toy_dataset["Exited"]==1)]
PLE=data_LE.shape[0]/data_L.shape[0]
PPE=data_PE.shape[0]/(toy_dataset.shape[0]-data_L.shape[0])
PPE, PLE
Out[14]:
(0.20838323353293414, 0.19899799599198398)
In [15]:
def empirical_probSalaryPE(n=100000):
    positive_cases = 0
    for i in range(n):
        e = number()
        if (toy_dataset.loc[e][8] >= 100000): 
            positive_cases = positive_cases + toy_dataset.loc[e][13]
    return positive_cases/n
empirical_probSalaryPE()
Out[15]:
0.12369
In [16]:
def empirical_probSalaryP(n=100000):
    positive_cases = 0
    for i in range(n):
        e = number()
        if (toy_dataset.loc[e][8] >= 100000): 
            positive_cases = positive_cases + 1
    return positive_cases/n
empirical_probSalaryP()
Out[16]:
0.48111
In [17]:
def empirical_probSalaryLE(n=100000):
    positive_cases = 0
    for i in range(n):
        e = number()
        if (toy_dataset.loc[e][8] < 100000): 
            positive_cases = positive_cases + toy_dataset.loc[e][13]
    return positive_cases/n
empirical_probSalaryLE()
Out[17]:
0.08424
In [ ]:
empirical_probSalaryL = 1- empirical_probSalaryP(1000000)
In [ ]:
empirical_probSalaryPE(1000000)/empirical_probSalaryP(1000000), empirical_probSalaryLE(1000000)/empirical_probSalaryL

exercise 3: Is it more likely that a person is withdrawing their account given that person have a Credit Score more that 750?

In [ ]:
 
In [ ]:
data_CP750 = toy_dataset[(toy_dataset["CreditScore"] > 750) & (toy_dataset["Exited"]==1)]
data_P= toy_dataset[toy_dataset["CreditScore"] > 750]
data_CL750 = toy_dataset[(toy_dataset["CreditScore"] <= 750) & (toy_dataset["Exited"]==1)]
CL750=data_CL750.shape[0]/(toy_dataset.shape[0]-data_L.shape[0])
CP750=data_CP750.shape[0]/data_P.shape[0]
CP750, CL750
In [ ]:
def empirical_probCreditPE(n=100000):
    positive_cases = 0
    for i in range(n):
        e = number()
        if (toy_dataset.loc[e][3] > 750): 
            positive_cases = positive_cases + toy_dataset.loc[e][13]
    return positive_cases/n
empirical_probCreditP()
In [ ]:
def empirical_probCreditP(n=100000):
    positive_cases = 0
    for i in range(n):
        e = number()
        if (toy_dataset.loc[e][3] > 750): 
            positive_cases = positive_cases + 1
    return positive_cases/n
empirical_probCreditP()
In [ ]:
def empirical_probCreditLE(n=100000):
    positive_cases = 0
    for i in range(n):
        e = number()
        if (toy_dataset.loc[e][3] <= 750): 
            positive_cases = positive_cases + toy_dataset.loc[e][13]
    return positive_cases/n
empirical_probCreditLE()
In [ ]:
empirical_probCreditL = 1 - empirical_probCreditP(100000)
In [ ]:
empirical_probCreditPE(10000)/empirical_probCreditP(10000), empirical_probCreditLE(10000)/empirical_probCreditL
In [ ]:
 
In [ ]:
 
In [ ]:
 
Notebooks AI
Notebooks AI Profile20060