Thursday, June 29, 2017

Machine Learning Data Preprocessing Template



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'retina'

Import the data

In [33]:
# Load the raw dataset; expects Data.csv in the notebook's working directory.
data = pd.read_csv("Data.csv")
In [34]:
data
Out[34]:
Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
4 Germany 40.0 NaN Yes
5 France 35.0 58000.0 Yes
6 Spain NaN 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
In [35]:
X = data.iloc[:,:-1]  # features: every column except the last (Country, Age, Salary)
In [36]:
y = data.iloc[:,-1]   # target: last column (Purchased)
In [37]:
X
Out[37]:
Country Age Salary
0 France 44.0 72000.0
1 Spain 27.0 48000.0
2 Germany 30.0 54000.0
3 Spain 38.0 61000.0
4 Germany 40.0 NaN
5 France 35.0 58000.0
6 Spain NaN 52000.0
7 France 48.0 79000.0
8 Germany 50.0 83000.0
9 France 37.0 67000.0
In [38]:
y
Out[38]:
0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

Imputation for missing data

In [39]:
from sklearn.preprocessing import Imputer
In [40]:
imputer = Imputer(missing_values="NaN", strategy= "mean", axis=0)
In [41]:
imputer = imputer.fit(X.iloc[:,1:3])
In [42]:
X.iloc[:,1:3] = imputer.transform(X.iloc[:,1:3])
In [43]:
X
Out[43]:
Country Age Salary
0 France 44.000000 72000.000000
1 Spain 27.000000 48000.000000
2 Germany 30.000000 54000.000000
3 Spain 38.000000 61000.000000
4 Germany 40.000000 63777.777778
5 France 35.000000 58000.000000
6 Spain 38.777778 52000.000000
7 France 48.000000 79000.000000
8 Germany 50.000000 83000.000000
9 France 37.000000 67000.000000

Encoding categorical data

In [44]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
In [45]:
labelencoder = LabelEncoder()
In [46]:
# Map Country strings to integers (France=0, Germany=1, Spain=2 — alphabetical).
# NOTE(review): this introduces an artificial ordering; it is harmless here only
# because the column is one-hot encoded immediately afterwards. Modern
# OneHotEncoder accepts string columns directly, making this step optional.
X.iloc[:,0] = labelencoder.fit_transform(X.iloc[:,0])
In [47]:
X
Out[47]:
Country Age Salary
0 0 44.000000 72000.000000
1 2 27.000000 48000.000000
2 1 30.000000 54000.000000
3 2 38.000000 61000.000000
4 1 40.000000 63777.777778
5 0 35.000000 58000.000000
6 2 38.777778 52000.000000
7 0 48.000000 79000.000000
8 1 50.000000 83000.000000
9 0 37.000000 67000.000000
In [48]:
onehotencoder = OneHotEncoder(categorical_features=[0])
In [49]:
X = onehotencoder.fit_transform(X).toarray()
In [50]:
X
Out[50]:
array([[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40000000e+01,   7.20000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          2.70000000e+01,   4.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          3.00000000e+01,   5.40000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.80000000e+01,   6.10000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          4.00000000e+01,   6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.50000000e+01,   5.80000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.87777778e+01,   5.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.80000000e+01,   7.90000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          5.00000000e+01,   8.30000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.70000000e+01,   6.70000000e+04]])
In [51]:
# Separate encoder for the target so its fitted classes_ are not mixed up
# with the feature encoder's.
labelencoder_y = LabelEncoder()
In [52]:
# Binary target needs only label encoding (No=0, Yes=1) — no one-hot required.
y = labelencoder_y.fit_transform(y)
In [53]:
y
Out[53]:
array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

Train test split

In [54]:
from sklearn.model_selection import train_test_split
In [57]:
# 80/20 train/test split; fixed random_state makes the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)

Feature Scaling

In [58]:
from sklearn.preprocessing import StandardScaler
In [59]:
sc_X = StandardScaler()
In [60]:
# Fit the scaler on the training data only, then reuse it for the test set —
# this prevents information from the test set leaking into the transform.
X_train = sc_X.fit_transform(X_train)
In [61]:
X_test = sc_X.transform(X_test)
# NOTE(review): the one-hot dummy columns get standardized along with Age and
# Salary here. That is a deliberate simplification in this template; some
# practitioners prefer to scale only the continuous columns.
In [62]:
X_train
Out[62]:
array([[ 1.        , -0.57735027, -0.57735027, -0.7529426 , -0.62603778],
       [ 1.        , -0.57735027, -0.57735027,  1.00845381,  1.01304295],
       [ 1.        , -0.57735027, -0.57735027,  1.79129666,  1.83258331],
       [-1.        ,  1.73205081, -0.57735027, -1.73149616, -1.09434656],
       [ 1.        , -0.57735027, -0.57735027, -0.36152118,  0.42765698],
       [-1.        ,  1.73205081, -0.57735027,  0.22561096,  0.05040824],
       [-1.        , -0.57735027,  1.73205081, -0.16581046, -0.27480619],
       [-1.        , -0.57735027,  1.73205081, -0.01359102, -1.32850095]])

No comments :

Post a Comment