Introduction¶
This notebook demonstrates data visualisation and several machine-learning classification algorithms on the Iris dataset.
1) Loading Libraries¶
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'retina'
2) Data¶
In [3]:
iris = pd.read_csv("iris.csv")
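Note: if the Kaggle iris.csv is not available locally, an equivalent frame can be rebuilt from scikit-learn's bundled copy of the dataset. The snippet below is only a sketch; the column names, the Id column, and the Iris-* species strings are filled in by hand to match the CSV used in this notebook.

# Optional fallback: rebuild an equivalent DataFrame from scikit-learn's bundled Iris data
from sklearn.datasets import load_iris

data = load_iris()
iris = pd.DataFrame(data.data,
                    columns=["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"])
# Map the integer targets (0, 1, 2) to the label strings used in the Kaggle CSV
iris["Species"] = ["Iris-" + data.target_names[t] for t in data.target]
iris.insert(0, "Id", range(1, len(iris) + 1))  # mimic the CSV's Id column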
In [4]:
iris.shape
Out[4]:
In [5]:
iris.head()
Out[5]:
In [6]:
iris = iris.drop("Id",axis = 1)
In [7]:
iris.head()
Out[7]:
In [8]:
iris.describe()
Out[8]:
In [9]:
iris.groupby("Species").size()
Out[9]:
3) Data Visualisation¶
In [10]:
iris.plot(kind="box", subplots=True, layout=(2, 2), sharex=False, sharey=False, figsize=(10, 8))
Out[10]:
In [11]:
plt.figure(figsize=(12,10))
plt.subplot(2,2,1)
ax1 = sns.swarmplot(x='Species', y='SepalLengthCm', data=iris)
ax1.set_ylabel("Sepal Length (CM)")
plt.subplot(2,2,2)
ax2 = sns.swarmplot(x='Species', y='SepalWidthCm', data=iris)
ax2.set_ylabel("Sepal Width (CM)")
plt.subplot(2,2,3)
ax3 = sns.swarmplot(x='Species', y='PetalLengthCm', data=iris)
ax3.set_ylabel("Petal Length (CM)")
plt.subplot(2,2,4)
ax4 = sns.swarmplot(x='Species', y='PetalWidthCm', data=iris)
ax4.set_ylabel("Petal Width (CM)")
Out[11]:
In [12]:
iris.plot(kind = "density", figsize=(10,8))
Out[12]:
In [13]:
sns.pairplot(iris, hue = "Species")
Out[13]:
In [14]:
corr=iris[['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']].corr()
In [15]:
sns.heatmap(corr, vmax=1, square=True,annot=True)
Out[15]:
4) Multi-class Classification¶
In [16]:
from sklearn.model_selection import train_test_split
In [17]:
X = iris.iloc[:,0:4]
Y = iris.iloc[:,4]
In [18]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y, random_state = 22, test_size = 0.2)
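Since each species has exactly 50 samples, the classes are balanced and a plain random split is usually fine. Passing stratify=Y additionally guarantees the same class proportions in the train and test sets; the line below is a minimal variant of the split, not what is used above.

# Stratified variant: preserves the 1/3-1/3-1/3 species mix in both splits
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=22, stratify=Y)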
In [19]:
X_train.head()
Out[19]:
In [20]:
Y_train.head()
Out[20]:
In [21]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
In [22]:
models = []
models.append(("LR",LogisticRegression()))
models.append(("NB",GaussianNB()))
models.append(("KNN",KNeighborsClassifier()))
models.append(("DT",DecisionTreeClassifier()))
models.append(("SVM",SVC()))
In [23]:
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=22)
    cv_result = cross_val_score(model, X_train, Y_train, cv=kfold, scoring="accuracy")
    names.append(name)
    results.append(cv_result)
for i in range(len(names)):
    print(names[i], results[i].mean())
Visualising the results¶
In [24]:
ax = sns.violinplot(data=results)
ax.set_xticklabels(names)
Out[24]:
SVM achieves the highest mean cross-validation accuracy with the least variance across folds, so it is used for the final model.
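The SVC above runs with its default hyperparameters (RBF kernel, C=1.0). A grid search over C and gamma on the training set can often squeeze out a little more accuracy; the parameter grid below is illustrative, not taken from this notebook.

# Optional: tune the SVC before the final fit (illustrative grid)
from sklearn.model_selection import GridSearchCV

param_grid = {"C": [0.1, 1, 10, 100], "gamma": ["scale", 0.01, 0.1, 1]}
grid = GridSearchCV(SVC(), param_grid, cv=10, scoring="accuracy")
grid.fit(X_train, Y_train)
print(grid.best_params_, grid.best_score_)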
Final Predictions using Test data¶
In [25]:
svm = SVC()
svm.fit(X_train,Y_train)
predictions = svm.predict(X_test)
In [26]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
In [27]:
print(accuracy_score(Y_test,predictions))
In [28]:
print(classification_report(Y_test,predictions))
In [29]:
conf = confusion_matrix(Y_test,predictions)
In [30]:
label = ["Setosa","Versicolor","Virginica"]
sns.heatmap(conf, annot=True, xticklabels=label, yticklabels=label)
Out[30]:
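The hard-coded label list above relies on confusion_matrix ordering its rows and columns by sorted class name (Iris-setosa, Iris-versicolor, Iris-virginica). Reading the order from the fitted classifier avoids that assumption; a small sketch:

# Derive tick labels from the classifier instead of hard-coding them
labels = svm.classes_  # sorted class names, the same order confusion_matrix uses by default
sns.heatmap(conf, annot=True, xticklabels=labels, yticklabels=labels)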