Saturday, March 4, 2017

Data Visualisation and Predictive Model on IRIS dataset


Introduction

This notebook demonstrates data visualisation and several machine-learning classification algorithms on the Iris dataset.

1) Loading Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.basemap import Basemap
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'retina'

2) Data

In [3]:
iris = pd.read_csv("iris.csv")
In [4]:
iris.shape
Out[4]:
(150, 6)
In [5]:
iris.head()
Out[5]:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
4 5 5.0 3.6 1.4 0.2 Iris-setosa
In [6]:
# Drop the "Id" column: in the preview above it is just a running row number,
# not a measurement. errors="ignore" keeps this cell idempotent, so re-running
# it after "Id" has already been removed is a no-op instead of a KeyError.
iris = iris.drop("Id", axis=1, errors="ignore")
In [7]:
iris.head()
Out[7]:
SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
In [8]:
iris.describe()
Out[8]:
SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.054000 3.758667 1.198667
std 0.828066 0.433594 1.764420 0.763161
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000
In [9]:
iris.groupby("Species").size()
Out[9]:
Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64

3) Data Visualisation

In [10]:
iris.plot(kind = "box", subplots=True,layout=(2,2) ,sharex=False, sharey= False, figsize=(10,8))
Out[10]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1156b0588>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1189c6b00>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x118a0a978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x118a50eb8>]], dtype=object)
In [11]:
# One swarm plot per measurement, split by species, on a 2x2 grid.
# The four panels are driven from a single table instead of four copy-pasted
# stanzas, so a styling change only has to be made in one place.
panels = [
    ("SepalLengthCm", "Sepal Length (CM)"),
    ("SepalWidthCm", "Sepal Width (CM)"),
    ("PetalLengthCm", "Petal Length (CM)"),
    ("PetalWidthCm", "Petal Width (CM)"),
]
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Keep the original per-panel names available for any cell that refers to them.
ax1, ax2, ax3, ax4 = axes.flat
for panel_ax, (column, ylabel) in zip(axes.flat, panels):
    # Draw onto the explicit axes rather than relying on the "current" subplot.
    sns.swarmplot(x='Species', y=column, data=iris, ax=panel_ax)
    panel_ax.set_ylabel(ylabel)
Out[11]:
<matplotlib.text.Text at 0x119b38e80>
In [12]:
iris.plot(kind = "density", figsize=(10,8))
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x119b3cc50>
In [13]:
sns.pairplot(iris, hue = "Species")
Out[13]:
<seaborn.axisgrid.PairGrid at 0x11b086a20>
In [14]:
corr=iris[['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']].corr()
In [15]:
sns.heatmap(corr, vmax=1, square=True,annot=True)
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x11c4fb278>

Multi-class classification

In [16]:
from sklearn.model_selection import train_test_split
In [17]:
# Select the features and the target by column name rather than by position.
# Positional slicing would silently pick the wrong columns if the "Id"-dropping
# cell had been skipped or the column order of the CSV changed.
X = iris[["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]]
Y = iris["Species"]
In [18]:
# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=22,
)
In [19]:
X_train.head()
Out[19]:
SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
30 4.8 3.1 1.6 0.2
31 5.4 3.4 1.5 0.4
89 5.5 2.5 4.0 1.3
90 5.5 2.6 4.4 1.2
55 5.7 2.8 4.5 1.3
In [20]:
Y_train.head()
Out[20]:
30        Iris-setosa
31        Iris-setosa
89    Iris-versicolor
90    Iris-versicolor
55    Iris-versicolor
Name: Species, dtype: object
In [21]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
In [22]:
# Candidate classifiers, each paired with the short label used when reporting
# scores. Every estimator is constructed with its default settings.
models = [
    ("LR", LogisticRegression()),
    ("NB", GaussianNB()),
    ("KNN", KNeighborsClassifier()),
    ("DT", DecisionTreeClassifier()),
    ("SVM", SVC()),
]
In [23]:
# 10-fold cross-validation of every candidate model on the training split.
#
# KFold does not shuffle by default, and random_state only has an effect when
# shuffle=True: older scikit-learn releases silently ignore it, newer ones
# raise a ValueError. Leaving it out keeps the folds (and therefore the scores)
# exactly as before while letting the cell run on current releases.
# The splitter is the same for every model, so it is built once up front.
kfold = KFold(n_splits=10)

results = []  # per-model arrays of fold accuracies, in the order of `names`
names = []    # short model labels, parallel to `results`
for name, model in models:
    cv_result = cross_val_score(model, X_train, Y_train, cv=kfold, scoring="accuracy")
    names.append(name)
    results.append(cv_result)

# Report the mean accuracy across the folds for each model.
for name, cv_result in zip(names, results):
    print(name, cv_result.mean())
LR 0.95
NB 0.958333333333
KNN 0.958333333333
DT 0.941666666667
SVM 0.983333333333

Visualising the results

In [24]:
# Distribution of the per-fold accuracies, one violin per model.
ax = sns.violinplot(data=results)
# `results` is a plain list, so label the violins with the model names.
ax.set_xticklabels(names)
Out[24]:
[<matplotlib.text.Text at 0x11dadf588>,
 <matplotlib.text.Text at 0x11dacc080>,
 <matplotlib.text.Text at 0x11db479b0>,
 <matplotlib.text.Text at 0x11db4b4e0>,
 <matplotlib.text.Text at 0x11db4bfd0>]
SVM has the highest mean cross-validation accuracy and the least variance across the folds.

Final Predictions using Test data

In [25]:
# Refit the chosen model (SVM, default settings) on the whole training split,
# then predict the species for the held-out test rows.
svm = SVC().fit(X_train, Y_train)
predictions = svm.predict(X_test)
In [26]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
In [27]:
print(accuracy_score(Y_test,predictions))
0.933333333333
In [28]:
print(classification_report(Y_test,predictions))
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         6
Iris-versicolor       0.83      1.00      0.91        10
 Iris-virginica       1.00      0.86      0.92        14

    avg / total       0.94      0.93      0.93        30

In [29]:
conf = confusion_matrix(Y_test,predictions)
In [30]:
# Heatmap of the confusion matrix. confusion_matrix(y_true, y_pred) puts the
# actual classes on the rows and the predicted classes on the columns, so the
# axes are labelled accordingly; without labels the figure is ambiguous.
# NOTE(review): `label` assumes the matrix rows/columns are ordered
# setosa, versicolor, virginica (the sorted class names) - confirm if the
# class set ever changes.
label = ["Setosa","Versicolor","Virginica"]
conf_ax = sns.heatmap(conf, annot=True, xticklabels=label, yticklabels=label)
conf_ax.set_xlabel("Predicted species")
conf_ax.set_ylabel("Actual species")
# End on the Axes object so the cell's displayed value is the plot axes.
conf_ax
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x11dad6048>

No comments :

Post a Comment