Profile picture

Data Scientist @ RMOTR

Final Project - StackOverflow

Last updated: October 8th, 20192019-10-08Project preview

rmotr

StackOverflow Developer Survey 2018

green-divider

In [15]:
import numpy as np
import pandas as pd
In [16]:
#!unzip data/stack-overflow-2018-developer-survey.zip
df = pd.read_csv('data/survey_results_public.csv')

green-divider

Getting all unique values within a multi-value column

In [17]:
df['DevType'].head(10)
Out[17]:
0                                 Full-stack developer
1    Database administrator;DevOps specialist;Full-...
2             Engineering manager;Full-stack developer
3                                 Full-stack developer
4    Data or business analyst;Desktop or enterprise...
5    Back-end developer;Database administrator;Fron...
6    Back-end developer;Front-end developer;Full-st...
7    Designer;Front-end developer;QA or test developer
8    Back-end developer;C-suite executive (CEO, CTO...
9                                             Designer
Name: DevType, dtype: object
In [18]:
df['DevType'].str.split(';').head(10)
Out[18]:
0                               [Full-stack developer]
1    [Database administrator, DevOps specialist, Fu...
2          [Engineering manager, Full-stack developer]
3                               [Full-stack developer]
4    [Data or business analyst, Desktop or enterpri...
5    [Back-end developer, Database administrator, F...
6    [Back-end developer, Front-end developer, Full...
7    [Designer, Front-end developer, QA or test dev...
8    [Back-end developer, C-suite executive (CEO, C...
9                                           [Designer]
Name: DevType, dtype: object
In [19]:
df['DevType'].str.split(';', expand=True).head()
Out[19]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
0 Full-stack developer None None None None None None None None None None None None None None None None None None None
1 Database administrator DevOps specialist Full-stack developer System administrator None None None None None None None None None None None None None None None None
2 Engineering manager Full-stack developer None None None None None None None None None None None None None None None None None None
3 Full-stack developer None None None None None None None None None None None None None None None None None None None
4 Data or business analyst Desktop or enterprise applications developer Game or graphics developer QA or test developer Student None None None None None None None None None None None None None None None
In [20]:
df['DevType'].str.split(';', expand=True).stack().head(10).to_frame()
Out[20]:
0
0 0 Full-stack developer
1 0 Database administrator
1 DevOps specialist
2 Full-stack developer
3 System administrator
2 0 Engineering manager
1 Full-stack developer
3 0 Full-stack developer
4 0 Data or business analyst
1 Desktop or enterprise applications developer
In [21]:
all_dev_types = df['DevType'].str.split(';', expand=True).stack().unique()
all_dev_types
Out[21]:
array(['Full-stack developer', 'Database administrator',
       'DevOps specialist', 'System administrator', 'Engineering manager',
       'Data or business analyst',
       'Desktop or enterprise applications developer',
       'Game or graphics developer', 'QA or test developer', 'Student',
       'Back-end developer', 'Front-end developer', 'Designer',
       'C-suite executive (CEO, CTO, etc.)', 'Mobile developer',
       'Data scientist or machine learning specialist',
       'Marketing or sales professional', 'Product manager',
       'Embedded applications or devices developer',
       'Educator or academic researcher'], dtype=object)

green-divider

Count how many values of each type we have

In [22]:
all_dev_types
Out[22]:
array(['Full-stack developer', 'Database administrator',
       'DevOps specialist', 'System administrator', 'Engineering manager',
       'Data or business analyst',
       'Desktop or enterprise applications developer',
       'Game or graphics developer', 'QA or test developer', 'Student',
       'Back-end developer', 'Front-end developer', 'Designer',
       'C-suite executive (CEO, CTO, etc.)', 'Mobile developer',
       'Data scientist or machine learning specialist',
       'Marketing or sales professional', 'Product manager',
       'Embedded applications or devices developer',
       'Educator or academic researcher'], dtype=object)
In [23]:
df["DevType"].isna().sum()
Out[23]:
6757
In [24]:
df.loc[:, "DevType"].isna().sum()
Out[24]:
6757
In [25]:
df.loc[:, "DevType"].dropna().str.contains("Database administrator").head(10)
Out[25]:
0    False
1     True
2    False
3    False
4    False
5     True
6    False
7    False
8     True
9    False
Name: DevType, dtype: bool
In [26]:
df.loc[:, "DevType"].dropna().str.contains("Database administrator").sum()
Out[26]:
13216
In [27]:
df.loc[:, 'DevType'].dropna().str.contains(all_dev_types[1]).sum()
Out[27]:
13216
In [28]:
df.loc[:, 'DevType'].dropna().str.contains(all_dev_types[13]).sum()
/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:1: UserWarning: This pattern has match groups. To actually get the groups, use str.extract.
  """Entry point for launching an IPython kernel.
Out[28]:
0
In [29]:
df.loc[:, 'DevType'].dropna().str.contains(all_dev_types[13], regex=False).sum()
Out[29]:
3491
In [30]:
[df.loc[:, 'DevType'].str.contains(devType, regex=False).sum() for devType in all_dev_types]
Out[30]:
[44353,
 13216,
 9549,
 10375,
 5256,
 7559,
 15807,
 4642,
 6194,
 15732,
 53300,
 34822,
 12019,
 3491,
 18804,
 7088,
 1122,
 4316,
 4819,
 3641]
In [31]:
pd.Series([df.loc[:, 'DevType'].str.contains(devType, regex=False).sum() for devType in all_dev_types])
Out[31]:
0     44353
1     13216
2      9549
3     10375
4      5256
5      7559
6     15807
7      4642
8      6194
9     15732
10    53300
11    34822
12    12019
13     3491
14    18804
15     7088
16     1122
17     4316
18     4819
19     3641
dtype: int64
In [32]:
devtype_count = pd.Series([df.loc[:, 'DevType'].str.contains(devType, regex=False).sum()
                           for devType in all_dev_types],
                          index=all_dev_types,
                          name='DevType_count')

devtype_count
Out[32]:
Full-stack developer                             44353
Database administrator                           13216
DevOps specialist                                 9549
System administrator                             10375
Engineering manager                               5256
Data or business analyst                          7559
Desktop or enterprise applications developer     15807
Game or graphics developer                        4642
QA or test developer                              6194
Student                                          15732
Back-end developer                               53300
Front-end developer                              34822
Designer                                         12019
C-suite executive (CEO, CTO, etc.)                3491
Mobile developer                                 18804
Data scientist or machine learning specialist     7088
Marketing or sales professional                   1122
Product manager                                   4316
Embedded applications or devices developer        4819
Educator or academic researcher                   3641
Name: DevType_count, dtype: int64
In [33]:
devtype_count = devtype_count.sort_values(ascending=False)

devtype_count
Out[33]:
Back-end developer                               53300
Full-stack developer                             44353
Front-end developer                              34822
Mobile developer                                 18804
Desktop or enterprise applications developer     15807
Student                                          15732
Database administrator                           13216
Designer                                         12019
System administrator                             10375
DevOps specialist                                 9549
Data or business analyst                          7559
Data scientist or machine learning specialist     7088
QA or test developer                              6194
Engineering manager                               5256
Embedded applications or devices developer        4819
Game or graphics developer                        4642
Product manager                                   4316
Educator or academic researcher                   3641
C-suite executive (CEO, CTO, etc.)                3491
Marketing or sales professional                   1122
Name: DevType_count, dtype: int64
In [34]:
devtype_count.plot(kind='bar', figsize=(12, 6))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f06f8646400>

Now lets calculate the median ConvertedSalary for each DevType:

In [35]:
devtype_index = pd.Series([df.loc[:, 'DevType'].str.contains(devType, regex=False)
                           for devType in all_dev_types],
                          index=all_dev_types,
                          name='DevType_count')

devtype_index.head()
Out[35]:
Full-stack developer      0         True
1         True
2         True
3...
Database administrator    0        False
1         True
2        False
3...
DevOps specialist         0        False
1         True
2        False
3...
System administrator      0        False
1         True
2        False
3...
Engineering manager       0        False
1        False
2         True
3...
Name: DevType_count, dtype: object

Finally, use that indexes to calculate the median salary per developer type:

In [36]:
devtype_salary = pd.Series([df[devtype_index[i].fillna(False)]['ConvertedSalary'].median()
                            for i in np.arange(len(all_dev_types))],
                           index=all_dev_types,
                           name='DevType_ConvertedSalary')

devtype_salary
Out[36]:
Full-stack developer                             59000.0
Database administrator                           51394.0
DevOps specialist                                72469.0
System administrator                             55562.0
Engineering manager                              88573.0
Data or business analyst                         58890.5
Desktop or enterprise applications developer     57480.0
Game or graphics developer                       40000.0
QA or test developer                             55000.0
Student                                          14202.0
Back-end developer                               55562.0
Front-end developer                              51408.0
Designer                                         45992.0
C-suite executive (CEO, CTO, etc.)               69244.0
Mobile developer                                 43224.0
Data scientist or machine learning specialist    60000.0
Marketing or sales professional                  49197.5
Product manager                                  63174.0
Embedded applications or devices developer       59180.0
Educator or academic researcher                  43711.5
Name: DevType_ConvertedSalary, dtype: float64
In [37]:
devtype_salary = devtype_salary.sort_values(ascending=False)

devtype_salary
Out[37]:
Engineering manager                              88573.0
DevOps specialist                                72469.0
C-suite executive (CEO, CTO, etc.)               69244.0
Product manager                                  63174.0
Data scientist or machine learning specialist    60000.0
Embedded applications or devices developer       59180.0
Full-stack developer                             59000.0
Data or business analyst                         58890.5
Desktop or enterprise applications developer     57480.0
System administrator                             55562.0
Back-end developer                               55562.0
QA or test developer                             55000.0
Front-end developer                              51408.0
Database administrator                           51394.0
Marketing or sales professional                  49197.5
Designer                                         45992.0
Educator or academic researcher                  43711.5
Mobile developer                                 43224.0
Game or graphics developer                       40000.0
Student                                          14202.0
Name: DevType_ConvertedSalary, dtype: float64
In [38]:
devtype_salary.median()
Out[38]:
55562.0
In [39]:
devtype_salary.plot(kind='bar', figsize=(12, 6))
Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f06f82964e0>

green-divider

Creating dummies variables

In [40]:
all_dev_types
Out[40]:
array(['Full-stack developer', 'Database administrator',
       'DevOps specialist', 'System administrator', 'Engineering manager',
       'Data or business analyst',
       'Desktop or enterprise applications developer',
       'Game or graphics developer', 'QA or test developer', 'Student',
       'Back-end developer', 'Front-end developer', 'Designer',
       'C-suite executive (CEO, CTO, etc.)', 'Mobile developer',
       'Data scientist or machine learning specialist',
       'Marketing or sales professional', 'Product manager',
       'Embedded applications or devices developer',
       'Educator or academic researcher'], dtype=object)
In [41]:
all_dev_types.shape
Out[41]:
(20,)
In [42]:
list(df['DevType'].head())
Out[42]:
['Full-stack developer',
 'Database administrator;DevOps specialist;Full-stack developer;System administrator',
 'Engineering manager;Full-stack developer',
 'Full-stack developer',
 'Data or business analyst;Desktop or enterprise applications developer;Game or graphics developer;QA or test developer;Student']
In [43]:
df['DevType'].str.split(';', expand=True).stack().head(10).to_frame()
Out[43]:
0
0 0 Full-stack developer
1 0 Database administrator
1 DevOps specialist
2 Full-stack developer
3 System administrator
2 0 Engineering manager
1 Full-stack developer
3 0 Full-stack developer
4 0 Data or business analyst
1 Desktop or enterprise applications developer
In [44]:
dummies_1 = pd.get_dummies(df['DevType'].str.split(';', expand=True), prefix='is_DevType_', prefix_sep='')

print(dummies_1.shape)
dummies_1.head()
(98855, 210)
Out[44]:
is_DevType_Back-end developer is_DevType_C-suite executive (CEO, CTO, etc.) is_DevType_Data or business analyst is_DevType_Data scientist or machine learning specialist is_DevType_Database administrator is_DevType_Designer is_DevType_Desktop or enterprise applications developer is_DevType_DevOps specialist is_DevType_Educator or academic researcher is_DevType_Embedded applications or devices developer ... is_DevType_Product manager is_DevType_QA or test developer is_DevType_Student is_DevType_System administrator is_DevType_QA or test developer is_DevType_Student is_DevType_System administrator is_DevType_Student is_DevType_System administrator is_DevType_System administrator
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 210 columns

In [45]:
dummies_2 = pd.get_dummies(df['DevType'].str.split(';', expand=True).stack(),
                           prefix='is_DevType_', prefix_sep='').sum(level=0)

print(dummies_2.shape)
dummies_2.head()
(92098, 20)
Out[45]:
is_DevType_Back-end developer is_DevType_C-suite executive (CEO, CTO, etc.) is_DevType_Data or business analyst is_DevType_Data scientist or machine learning specialist is_DevType_Database administrator is_DevType_Designer is_DevType_Desktop or enterprise applications developer is_DevType_DevOps specialist is_DevType_Educator or academic researcher is_DevType_Embedded applications or devices developer is_DevType_Engineering manager is_DevType_Front-end developer is_DevType_Full-stack developer is_DevType_Game or graphics developer is_DevType_Marketing or sales professional is_DevType_Mobile developer is_DevType_Product manager is_DevType_QA or test developer is_DevType_Student is_DevType_System administrator
0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1
2 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
4 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0

Removing whitespaces from values

In [46]:
other_df = pd.DataFrame({
    'DevType': ['Full-stack developer;DevOps specialist',
                'Full-stack developer; DevOps specialist']
})

other_df
Out[46]:
DevType
0 Full-stack developer;DevOps specialist
1 Full-stack developer; DevOps specialist
In [47]:
pd.get_dummies(other_df['DevType'].str.split(';', expand=True).stack(),
               prefix='is_DevType_', prefix_sep='').sum(level=0)
Out[47]:
is_DevType_ DevOps specialist is_DevType_DevOps specialist is_DevType_Full-stack developer
0 0 1 1
1 1 0 1
In [48]:
pd.get_dummies(other_df['DevType'].str.split(';', expand=True)
               .apply(lambda s: s.str.strip()).stack(),
               prefix='is_DevType_', prefix_sep='').sum(level=0)
Out[48]:
is_DevType_DevOps specialist is_DevType_Full-stack developer
0 1 1
1 1 1

green-divider

Notebooks AI
Notebooks AI Profile20060