Thursday, June 29, 2017

Simple Regression


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'retina'
In [2]:
data = pd.read_csv("Salary_Data.csv")
In [3]:
data
Out[3]:
YearsExperience Salary
0 1.1 39343.0
1 1.3 46205.0
2 1.5 37731.0
3 2.0 43525.0
4 2.2 39891.0
5 2.9 56642.0
6 3.0 60150.0
7 3.2 54445.0
8 3.2 64445.0
9 3.7 57189.0
10 3.9 63218.0
11 4.0 55794.0
12 4.0 56957.0
13 4.1 57081.0
14 4.5 61111.0
15 4.9 67938.0
16 5.1 66029.0
17 5.3 83088.0
18 5.9 81363.0
19 6.0 93940.0
20 6.8 91738.0
21 7.1 98273.0
22 7.9 101302.0
23 8.2 113812.0
24 8.7 109431.0
25 9.0 105582.0
26 9.5 116969.0
27 9.6 112635.0
28 10.3 122391.0
29 10.5 121872.0
In [6]:
# list the pre-defined matplotlib styles available
plt.style.available
Out[6]:
['seaborn-bright',
 'classic',
 'seaborn-colorblind',
 'grayscale',
 'seaborn-talk',
 'seaborn-muted',
 'ggplot',
 'bmh',
 'seaborn-whitegrid',
 'seaborn-paper',
 'seaborn-white',
 'seaborn-dark-palette',
 'fivethirtyeight',
 'dark_background',
 'seaborn-dark',
 'seaborn-darkgrid',
 'seaborn',
 'seaborn-deep',
 'seaborn-poster',
 'seaborn-pastel',
 'seaborn-ticks',
 'seaborn-notebook']
In [7]:
# use the 'fivethirtyeight' style
plt.style.use('fivethirtyeight')
In [8]:
plt.scatter(data.YearsExperience, data.Salary)
Out[8]:
<matplotlib.collections.PathCollection at 0x1132565f8>
In [12]:
sns.lmplot(x = "YearsExperience", y = "Salary", data = data, size = 7)
Out[12]:
<seaborn.axisgrid.FacetGrid at 0x1138ae3c8>
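One version note: on seaborn 0.9 and later the `size` argument to `lmplot` was renamed to `height`, so if the cell above warns or errors on a newer install, the equivalent call would be something like:

# seaborn >= 0.9 renamed lmplot's `size` argument to `height`
sns.lmplot(x="YearsExperience", y="Salary", data=data, height=7)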
In [36]:
# grab the YearsExperience column as a pandas Series and take a look
X = data.iloc[:, 0]
X
Out[36]:
0      1.1
1      1.3
2      1.5
3      2.0
4      2.2
5      2.9
6      3.0
7      3.2
8      3.2
9      3.7
10     3.9
11     4.0
12     4.0
13     4.1
14     4.5
15     4.9
16     5.1
17     5.3
18     5.9
19     6.0
20     6.8
21     7.1
22     7.9
23     8.2
24     8.7
25     9.0
26     9.5
27     9.6
28    10.3
29    10.5
Name: YearsExperience, dtype: float64
In [37]:
X = data.iloc[:, 0].values   # YearsExperience as a 1-D NumPy array
In [38]:
y = data.iloc[:, 1].values   # Salary as a 1-D NumPy array
In [39]:
from sklearn.model_selection import train_test_split
In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)   # hold out a third of the data
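A quick sanity check (not in the original notebook) confirms what the split should give us: with 30 rows and test_size = 1/3, we expect 20 training samples and 10 test samples.

# confirm the 20/10 split of the 30 rows
print(X_train.shape, y_train.shape)   # (20,) (20,)
print(X_test.shape, y_test.shape)     # (10,) (10,)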

Linear Regression

In [53]:
from sklearn.linear_model import LinearRegression
In [54]:
model = LinearRegression()
In [55]:
model.fit(X_train.reshape(-1, 1), y_train)   # reshape to a 2-D column, as scikit-learn expects
Out[55]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
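The fitted model exposes the learned parameters as `intercept_` and `coef_`. Printing them, which the original notebook doesn't do, gives the intercept and slope of the line Salary = intercept_ + coef_[0] * YearsExperience:

# learned parameters of the regression line
print(model.intercept_)   # intercept (predicted salary at 0 years of experience)
print(model.coef_)        # slope (salary increase per year of experience)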
In [56]:
y_pred = model.predict(X_test.reshape(-1, 1))   # predicted salaries for the test set
In [57]:
y_pred
Out[57]:
array([  40835.10590871,  123079.39940819,   65134.55626083,
         63265.36777221,  115602.64545369,  108125.8914992 ,
        116537.23969801,   64199.96201652,   76349.68719258,
        100649.1375447 ])
In [58]:
y_test
Out[58]:
array([  37731.,  122391.,   57081.,   63218.,  116969.,  109431.,
        112635.,   55794.,   83088.,  101302.])
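Eyeballing y_pred against y_test looks reasonable, but it helps to put numbers on the fit. The following is a minimal sketch, not part of the original notebook, using standard scikit-learn metrics:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# how well do the predictions match the held-out salaries?
print("R^2 :", r2_score(y_test, y_pred))
print("MAE :", mean_absolute_error(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred) ** 0.5)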
In [68]:
plt.figure(figsize=(10, 8))
sns.regplot(X_train, y_train, label="Train set")            # scatter plus regression line on the training data
plt.scatter(X_test, y_test, c="orange", label="Test set")   # held-out points for comparison
plt.legend()
Out[68]:
<matplotlib.legend.Legend at 0x10f7537f0>
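Note that `regplot` fits its own line to the training points. As a variation (not in the original notebook), the line learned by our `model` can be drawn directly from its predictions, reusing the imports from the first cell:

plt.figure(figsize=(10, 8))
plt.scatter(X_train, y_train, label="Train set")
plt.scatter(X_test, y_test, c="orange", label="Test set")
# evaluate the fitted model over the full range of experience and draw its line
x_line = np.linspace(X.min(), X.max(), 100)
plt.plot(x_line, model.predict(x_line.reshape(-1, 1)), label="Model fit")
plt.xlabel("YearsExperience")
plt.ylabel("Salary")
plt.legend()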