Profile picture

WEB-DATA-ANALYSIS-82936

Last updated: December 9th, 20192019-12-09Project preview
In [1]:
# import pandas 
import pandas as pd
In [2]:
# load data from csv
df = pd.read_csv('https://dl.dropbox.com/s/mivftl713f52zjw/ga_sessions_31725.csv?dl=0')
df.head()
Out[2]:
Source / Medium Date Users New Users Sessions Bounce Rate Pages / Session Avg. Session Duration Engaged Users (Goal 2 Conversion Rate) Engaged Users (Goal 2 Completions) Engaged Users (Goal 2 Value)
0 google / organic 20180226 1812 1593 1911 37.89% 4.30 00:01:53 8.58% 164 $0.00
1 google / organic 20180321 1445 1184 1577 50.73% 3.65 00:02:04 7.99% 126 $0.00
2 google / organic 20180313 1412 1160 1515 50.30% 3.68 00:02:04 7.46% 113 $0.00
3 google / organic 20180116 1405 1126 1514 54.82% 3.43 00:01:42 7.53% 114 $0.00
4 google / organic 20180315 1403 1148 1509 49.77% 3.84 00:02:08 9.15% 138 $0.00
In [3]:
# get the data type of each column
df.dtypes
Out[3]:
Source / Medium                            object
Date                                        int64
Users                                       int64
New Users                                   int64
Sessions                                    int64
Bounce Rate                                object
Pages / Session                           float64
Avg. Session Duration                      object
Engaged Users (Goal 2 Conversion Rate)     object
Engaged Users (Goal 2 Completions)          int64
Engaged Users (Goal 2 Value)               object
dtype: object
In [4]:
# get shape of the dataframe (rows, columns)
df.shape
Out[4]:
(3121, 11)
In [5]:
# get all the column names
df.columns
Out[5]:
Index(['Source / Medium', 'Date', 'Users', 'New Users', 'Sessions',
       'Bounce Rate', 'Pages / Session', 'Avg. Session Duration',
       'Engaged Users (Goal 2 Conversion Rate)',
       'Engaged Users (Goal 2 Completions)', 'Engaged Users (Goal 2 Value)'],
      dtype='object')
In [6]:
# convert int64 to datetime for the date column
df['Date'] = df['Date'].apply(lambda x: pd.to_datetime(str(x), format='%Y%m%d'))
df = df.sort_values(by='Date')
df.head()
Out[6]:
Source / Medium Date Users New Users Sessions Bounce Rate Pages / Session Avg. Session Duration Engaged Users (Goal 2 Conversion Rate) Engaged Users (Goal 2 Completions) Engaged Users (Goal 2 Value)
2260 dfa / cpm 2018-01-01 1 1 1 100.00% 1.00 00:00:00 0.00% 0 $0.00
2889 pinterest.com / referral 2018-01-01 1 0 1 0.00% 2.00 00:00:05 0.00% 0 $0.00
842 Partners / affiliate 2018-01-01 20 17 27 55.56% 2.56 00:02:25 7.41% 2 $0.00
1477 qiita.com / referral 2018-01-01 4 4 4 25.00% 8.25 00:02:23 25.00% 1 $0.00
1546 baidu / organic 2018-01-01 3 3 3 100.00% 1.00 00:00:00 0.00% 0 $0.00
In [7]:
# what is the date range?
print(df['Date'].min())
print(df['Date'].max())
2018-01-01 00:00:00
2018-03-31 00:00:00
In [8]:
# what are the top 5 sources of traffic? (by sessions)
df_overview = df.groupby('Source / Medium')['Sessions'].sum().to_frame().reset_index()
df_overview = df_overview.sort_values(by='Sessions', ascending=False).reset_index(drop=True)
df_overview.head()
Out[8]:
Source / Medium Sessions
0 google / organic 110269
1 youtube.com / referral 54256
2 (direct) / (none) 41746
3 mall.googleplex.com / referral 20433
4 google / cpc 10250
In [9]:
# how many total new users did we have in this period?
df['New Users'].sum()
Out[9]:
199701
In [10]:
# what percentage of the new users came from google organically in this period?
df_google_organic = df[df['Source / Medium']=='google / organic']
result = (df_google_organic['New Users'].sum()/df['New Users'].sum())*100
str(round(result, 2))+'%'
Out[10]:
'42.94%'
In [11]:
# display plotly outputs in a cell
def enable_plotly_in_cell():
    import IPython
    from plotly.offline import init_notebook_mode
    display(IPython.core.display.HTML('''<script src="/static/components/requirejs/require.js"></script>'''))
    init_notebook_mode(connected=False)
In [12]:
# show me a trend of daily sessions from google organic for this peiod
from plotly.offline import iplot
import plotly.graph_objs as go

enable_plotly_in_cell()
data = [go.Scatter(x=df_google_organic['Date'], y=df_google_organic['Sessions'])]
iplot(data)