Co-founder @ RMOTR

# ML Data Analysis Workshop

Last updated: October 26th, 2018

# A dummy approach to Machine Learning

We'll start by fitting a simple linear regression without doing any data analysis first:

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

In [2]:
# Load the house-sales CSV. The path is relative, so the file must sit in the
# notebook's working directory.
df = pd.read_csv('kc_house_data.csv')

In [3]:
# Preview the first rows to see which columns are available and how they are formatted.
df.head()

Out[3]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 3 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 3 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 3 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 5 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 3 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503
In [9]:
# Drop the columns this baseline does not use: `id` looks like a record
# identifier, `date` is a raw timestamp string (e.g. 20141013T000000) that
# LinearRegression cannot consume, and `lat`/`long` are raw coordinates.
# Reassign instead of using `inplace=True`, and pass `errors='ignore'`, so the
# cell can be re-run without raising KeyError once the columns are already gone.
df = df.drop(columns=['id', 'date', 'lat', 'long'], errors='ignore')

In [11]:
# Separate the predictors from the target (`price`), then hold out 30% of the
# rows for evaluation; the fixed random_state keeps the split reproducible.
features = df.drop(columns=['price'])
target = df['price']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=10)

In [12]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
model = LinearRegression()

In [13]:
# Fit on the training split only; the fitted estimator is echoed as the cell output.
model.fit(X_train, y_train)

/Users/santiagobasulto/.virtualenvs/ds-classes/lib/python3.6/site-packages/scipy/linalg/basic.py:1226: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
warnings.warn(mesg, RuntimeWarning)

Out[13]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [14]:
# Evaluate on the held-out test split. For a regressor, score() reports the
# coefficient of determination (R^2), so 1.0 would be a perfect fit.
model.score(X_test, y_test)

Out[14]:
0.6607735180297489