MoSayed Proj 1

Last updated: September 17th, 20202020-09-17Project preview
In [1]:
# The Sakila database 

# It is a normalised schema modelling a DVD rental store, featuring things like films, actors, film-actor relationships, 
# and a central inventory table that connects films, stores, and rentals.
# For db structure, please visit:
# https://dev.mysql.com/doc/sakila/en/sakila-structure.html
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

%matplotlib inline
In [3]:
conn = sqlite3.connect('sakila.db')
df = pd.read_sql('''
SELECT
        rental.rental_id, rental.rental_date, rental.return_date,
        customer.last_name AS customer_lastname,
        store.store_id,
        city.city AS rental_store_city,
        film.title AS film_title, film.rental_duration AS film_rental_duration,
        film.rental_rate AS film_rental_rate, film.replacement_cost AS film_replacement_cost,
        film.rating AS film_rating
    FROM rental
    INNER JOIN customer ON rental.customer_id == customer.customer_id
    INNER JOIN inventory ON rental.inventory_id == inventory.inventory_id
    INNER JOIN store ON inventory.store_id == store.store_id
    INNER JOIN address ON store.address_id == address.address_id
    INNER JOIN city ON address.city_id == city.city_id
    INNER JOIN film ON inventory.film_id == film.film_id
    ;
''', conn, index_col = 'rental_id', parse_dates = (['rental_date', 'return_date']))
In [4]:
df.head()
Out[4]:
rental_date return_date customer_lastname store_id rental_store_city film_title film_rental_duration film_rental_rate film_replacement_cost film_rating
rental_id
1 2005-05-24 22:53:30 2005-05-26 22:04:30 HUNTER 1 Lethbridge BLANKET BEVERLY 7 2.99 21.99 G
2 2005-05-24 22:54:33 2005-05-28 19:40:33 COLLAZO 2 Woodridge FREAKY POCUS 7 2.99 16.99 R
3 2005-05-24 23:03:39 2005-06-01 22:12:39 MURRELL 2 Woodridge GRADUATE LORD 7 2.99 14.99 G
4 2005-05-24 23:04:41 2005-06-03 01:43:41 PURDY 1 Lethbridge LOVE SUICIDES 6 0.99 21.99 R
5 2005-05-24 23:05:21 2005-06-02 04:33:21 HANSEN 2 Woodridge IDOLS SNATCHERS 5 2.99 29.99 NC-17
In [6]:
df.shape
Out[6]:
(16044, 10)
In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16044 entries, 1 to 16049
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   rental_date            16044 non-null  datetime64[ns]
 1   return_date            15861 non-null  datetime64[ns]
 2   customer_lastname      16044 non-null  object        
 3   store_id               16044 non-null  int64         
 4   rental_store_city      16044 non-null  object        
 5   film_title             16044 non-null  object        
 6   film_rental_duration   16044 non-null  int64         
 7   film_rental_rate       16044 non-null  float64       
 8   film_replacement_cost  16044 non-null  float64       
 9   film_rating            16044 non-null  object        
dtypes: datetime64[ns](2), float64(2), int64(2), object(4)
memory usage: 1.3+ MB
In [8]:
df.describe()
Out[8]:
store_id film_rental_duration film_rental_rate film_replacement_cost
count 16044.000000 16044.00000 16044.000000 16044.000000
mean 1.506171 4.93549 2.942630 20.215443
std 0.499978 1.40169 1.649678 6.081771
min 1.000000 3.00000 0.990000 9.990000
25% 1.000000 4.00000 0.990000 14.990000
50% 2.000000 5.00000 2.990000 20.990000
75% 2.000000 6.00000 4.990000 25.990000
max 2.000000 7.00000 4.990000 29.990000
In [9]:
# Numerical Analysis & Viualization
In [10]:
df['rental_store_city'].value_counts()
Out[10]:
Woodridge     8121
Lethbridge    7923
Name: rental_store_city, dtype: int64
In [11]:
df['rental_store_city'].value_counts().plot(kind='pie', figsize=(6,6))
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd506db5730>
In [12]:
# the rentals are nearly equal in both cites' stores
In [13]:
#
In [14]:
# Now the query is: are our products(movies) selling enough?
# Here is the analysis
In [15]:
# some statistics: 
# it is not correct to compare no of sales for the highest products without taking into consideration the cost of the 
# product, so we have to divide rental rate to cost then compare
In [16]:
df['rental_gain_return'] = df['film_rental_rate'] / df['film_replacement_cost'] * 100

df['rental_gain_return'].head()
Out[16]:
rental_id
1    13.597090
2    17.598587
3    19.946631
4     4.502046
5     9.969990
Name: rental_gain_return, dtype: float64
In [23]:
df['rental_gain_return'].median().round(1)
Out[23]:
13.6
In [24]:
# This means that the single rent of the movie compensates 13.6 % of the film cost,
# so, we need to rent the product with about average 7.4 times to compensate 100 % of the cost & start making profits
# Let's calculate the the average selling times of the movies & see...
In [26]:
df['film_title'].value_counts().mean().round(1)
Out[26]:
16.7
In [28]:
# the average selling times of the movies is 16.7 times which is much higher than 7.4, so the overall sales performance
# is supporting the cashflow surplus.
# the same steps can be applied to each moive individually & see which movie is making losses & which is acieving the
# highest profits
Notebooks AI
Notebooks AI Profile20060