Sunday, April 9, 2017

k-NN, Linear Regression, and Cross-Validation using scikit-learn



In [72]:
# Imports and notebook configuration -- all third-party dependencies up front.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# NOTE(review): blanket warning suppression hides deprecation warnings
# (e.g. the scikit-learn scorer rename used in the CV section below);
# consider narrowing to specific categories.
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'retina'

kNN

In [4]:
# Load the iris dataset and fit a 1-nearest-neighbour classifier on the
# whole dataset (no train/test split yet -- that comes in the next section).
from sklearn.datasets import load_iris
In [5]:
iris = load_iris()
In [8]:
# X: feature matrix (4 measurements per flower); y: class labels 0/1/2.
X = iris.data
In [10]:
y = iris.target
In [11]:
from sklearn.neighbors import KNeighborsClassifier
In [12]:
# n_neighbors=1: each prediction is the class of the single closest sample.
knn = KNeighborsClassifier(n_neighbors=1)
In [13]:
knn
Out[13]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')
In [14]:
knn.fit(X,y)
Out[14]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')
In [17]:
# Predict two hand-made 4-feature vectors; returns one class label each.
knn.predict([[1,2,3,4],[2,2,2,2]])
Out[17]:
array([2, 0])
In [20]:
from sklearn.model_selection import train_test_split
In [56]:
# Hold-out evaluation: 60/40 train/test split; random_state=4 pins the
# shuffle so the accuracy below is reproducible.
# NOTE(review): `X_Train` breaks the lowercase naming of X_test/y_train/
# y_test -- left unchanged because later cells reference this exact name.
X_Train,X_test,y_train, y_test = train_test_split(X,y,test_size = 0.4, random_state = 4)
In [57]:
knn = KNeighborsClassifier(n_neighbors=1)
In [58]:
# Fit on the training portion only; the test rows stay unseen.
knn.fit(X_Train, y_train)
Out[58]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')
In [59]:
y_predicted = knn.predict(X_test)
In [60]:
y_predicted
Out[60]:
array([2, 0, 2, 2, 2, 1, 2, 0, 0, 1, 0, 0, 0, 1, 2, 0, 1, 0, 0, 2, 0, 2, 1,
       0, 0, 0, 0, 0, 0, 2, 1, 0, 2, 0, 1, 2, 2, 1, 1, 0, 2, 0, 1, 0, 2, 0,
       0, 1, 1, 2, 0, 1, 2, 2, 1, 1, 0, 1, 1, 1])
In [61]:
from sklearn.metrics import accuracy_score
In [62]:
# Fraction of the held-out samples classified correctly (0.95 here).
accuracy_score(y_test,y_predicted)
Out[62]:
0.94999999999999996
In [69]:
k = np.arange(1,25)
In [70]:
# Scan k = 1..24: refit k-NN for each neighbour count and record the
# test-set accuracy, to visualise the accuracy-vs-k trade-off next.
k_result = []
for val in k:
    knn = KNeighborsClassifier(n_neighbors=val)
    knn.fit(X_Train,y_train)
    y_predict = knn.predict(X_test)
    k_result.append(accuracy_score(y_test,y_predict))
In [71]:
plt.plot(k,k_result)
Out[71]:
[<matplotlib.lines.Line2D at 0x11b69d710>]

Linear Regression

In [73]:
# Advertising dataset (James et al., "An Introduction to Statistical
# Learning"): TV/Radio/Newspaper ad spend vs. product Sales per market.
# NOTE(review): this www-bcf.usc.edu URL may no longer resolve -- the ISL
# materials moved; verify the link before re-running.
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
data.head()
Out[73]:
TV Radio Newspaper Sales
1 230.1 37.8 69.2 22.1
2 44.5 39.3 45.1 10.4
3 17.2 45.9 69.3 9.3
4 151.5 41.3 58.5 18.5
5 180.8 10.8 58.4 12.9
In [77]:
# Scatter plots of Sales against each advertising channel.
# NOTE(review): seaborn renamed `size=` to `height=` in 0.9 and later
# removed `size` -- update this call if re-running on a modern seaborn.
sns.pairplot(data, x_vars=["TV","Radio","Newspaper"], y_vars="Sales", size=7, aspect=0.8)
Out[77]:
<seaborn.axisgrid.PairGrid at 0x11f9c76a0>
In [78]:
# Same panels with a fitted least-squares line per channel (kind="reg").
sns.pairplot(data, x_vars=["TV","Radio","Newspaper"], y_vars="Sales", size=7, aspect=0.8, kind="reg")
Out[78]:
<seaborn.axisgrid.PairGrid at 0x11fe50c50>
Out[78]:
<seaborn.axisgrid.PairGrid at 0x11fe50c50>
In [79]:
# Feature matrix: the three ad-spend columns; target: Sales.
# NOTE(review): this reuses the names X and y from the iris section above
# for a different dataset -- fine top-to-bottom, but confusing if cells
# are run out of order.
X = data[["TV","Radio","Newspaper"]]
In [80]:
X.head()
Out[80]:
TV Radio Newspaper
1 230.1 37.8 69.2
2 44.5 39.3 45.1
3 17.2 45.9 69.3
4 151.5 41.3 58.5
5 180.8 10.8 58.4
In [81]:
y = data["Sales"]
In [82]:
y.head()
Out[82]:
1    22.1
2    10.4
3     9.3
4    18.5
5    12.9
Name: Sales, dtype: float64
In [84]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
In [85]:
# 70/30 train/test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 7, test_size = 0.3)
In [86]:
model = LinearRegression()
In [87]:
# Ordinary least-squares fit on the training portion.
model.fit(X_train, y_train)
Out[87]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [89]:
predicted = model.predict(X_test)
In [91]:
# Test-set RMSE (sqrt of the mean squared error), in units of Sales.
np.sqrt(mean_squared_error(y_test, predicted))
Out[91]:
1.6470911556016581
In [92]:
model.intercept_
Out[92]:
2.5971913990213036
In [93]:
# One coefficient per feature, in column order (TV, Radio, Newspaper);
# Newspaper's is essentially zero, i.e. it adds little to this model.
model.coef_
Out[93]:
array([  4.71259657e-02,   1.90987993e-01,  -1.93812266e-05])

k-Fold Cross-Validation

In [94]:
# Reload the data and rebuild X/y so this section stands alone.
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
In [95]:
X = data[["TV","Radio","Newspaper"]]
y = data["Sales"]
In [96]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
In [112]:
model = LinearRegression()
# 10 consecutive (unshuffled) folds -- deterministic, so no random_state.
kfold = KFold(n_splits=10)
score = cross_val_score(model,X,y,cv = kfold,scoring = "mean_squared_error")
In [113]:
score
Out[113]:
array([-3.56038438, -3.29767522, -2.08943356, -2.82474283, -1.3027754 ,
       -1.74163618, -8.17338214, -2.11409746, -3.04273109, -2.45281793])
In [114]:
# Mean of the (negated) per-fold MSEs.
score.mean()
Out[114]:
-3.0599676181185136
In [115]:
# Flip the sign to get a positive mean MSE.
# NOTE(review): this rebinds `score` from the fold array to a scalar, so
# running this cell twice gives a different (wrong) value -- not idempotent.
score = -score.mean()
In [116]:
# Square root of the mean MSE: a cross-validated RMSE in units of Sales.
score = np.sqrt(score)
In [117]:
score
Out[117]:
1.749276312684338

No comments :

Post a Comment