In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings('ignore')
In [2]:
pima = pd.read_csv("diabetes.csv")
In [3]:
pima.head()
Out[3]:
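(The head() output is not preserved in this export.) The dataset has eight numeric predictors (Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age) plus the binary Outcome target. A quick sanity check, a minimal sketch that was not part of the original run:

# expect (768, 9): 768 patients, 8 features plus the Outcome column
print(pima.shape)
# the CSV has no NaNs, though zeros stand in for missing values in some columns
print(pima.isnull().sum())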
In [4]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
In [5]:
# features: all columns except the target
X = pima.drop(["Outcome"], axis=1)
In [6]:
# target: Outcome is 1 for diabetic, 0 otherwise
y = pima.Outcome
In [7]:
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
random_state=seed)
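One optional refinement: with a binary target, stratify=y keeps the 0/1 class ratio identical in the two splits. A minimal variant, not what this notebook's run used:

# stratified variant of the same split (optional)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed, stratify=y)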
Building the model
In [8]:
model = XGBClassifier()
In [9]:
model.fit(X_train,y_train)
Out[9]:
In [10]:
y_pred = model.predict(X_test)
In [11]:
y_pred
Out[11]:
In [12]:
# sklearn convention is accuracy_score(y_true, y_pred)
accuracy_score(y_test, y_pred)
Out[12]:
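A single train/test split can be lucky or unlucky; cross-validating over several folds gives a steadier estimate of the same model, which is what the next section does.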
Stratified K-fold
In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
In [14]:
model = XGBClassifier()
In [15]:
# random_state only takes effect (and, in newer scikit-learn, is only accepted) with shuffle=True
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=8)
In [16]:
results = cross_val_score(model, X, y, cv=kfold)
In [17]:
results
Out[17]:
In [18]:
results.mean()
Out[18]:
In [19]:
results.std()
Out[19]:
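These two numbers read more easily as a single line (a minimal sketch):

# cross-validated accuracy as mean +/- standard deviation, in percent
print("Accuracy: %.2f%% (+/- %.2f%%)" % (results.mean() * 100, results.std() * 100))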
Visualisation
In [20]:
from xgboost import plot_tree, plot_importance
In [24]:
# refit on the full dataset for the tree and importance plots
model.fit(X, y)
Out[24]:
In [22]:
# note: plot_tree also needs the Graphviz system binaries, not just the Python bindings
!pip install graphviz
In [38]:
ax = plot_tree(model)
fig = ax.figure
fig.set_size_inches(10, 8)
In [37]:
ax = plot_importance(model)
fig = ax.figure
fig.set_size_inches(10, 8)
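plot_importance ranks by 'weight' (how often a feature is used to split) by default; xgboost's importance_type parameter also accepts 'gain' and 'cover', and gain often gives a more meaningful ranking:

# rank features by the average gain of their splits instead of split count
ax = plot_importance(model, importance_type='gain')
fig = ax.figure
fig.set_size_inches(10, 8)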
Feature Selection
In [40]:
# Fit model using each importance as a threshold
thresholds = sorted(model.feature_importances_)
In [41]:
thresholds
Out[41]:
In [43]:
from sklearn.feature_selection import SelectFromModel
In [46]:
for thresh in thresholds:
    # select features using threshold
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    # train model
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)
    # eval model
    select_X_test = selection.transform(X_test)
    y_pred = selection_model.predict(select_X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1],
                                                   accuracy * 100.0))
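A natural follow-up is to refit one final model at whichever threshold printed the best accuracy. A minimal sketch; chosen_thresh is a placeholder to be read off the printout above, not a result from this run:

# refit at a threshold chosen from the printout (0.1 is illustrative only)
chosen_thresh = 0.1
selector = SelectFromModel(model, threshold=chosen_thresh, prefit=True)
final_model = XGBClassifier()
final_model.fit(selector.transform(X_train), y_train)
print(accuracy_score(y_test, final_model.predict(selector.transform(X_test))))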
Evaluate models using learning curves
In [54]:
eval_set = [(X_train, y_train), (X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set, eval_metric=["logloss", "error"], verbose=True)
Out[54]:
In [51]:
# retrieve performance metrics
results = model.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)
# plot log loss
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
ax.legend()
plt.ylabel('Log Loss')
plt.title('XGBoost Log Loss')
plt.show()
# plot classification error
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Train')
ax.plot(x_axis, results['validation_1']['error'], label='Test')
ax.legend()
plt.ylabel('Classification Error')
plt.title('XGBoost Classification Error')
plt.show()
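If the test curves flatten out or start climbing while the training curves keep falling, the model is overfitting; that is exactly the situation early stopping, below, is designed to catch.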
Early stopping
In [58]:
model.fit(X_train, y_train, eval_metric="logloss", eval_set=eval_set, early_stopping_rounds=10, verbose=True)
Out[58]:
In [55]:
y_pred = model.predict(X_test)
In [56]:
accuracy_score(y_test, y_pred)
Out[56]:
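When early stopping fires, the fitted estimator records the best round. A minimal sketch; these attribute names hold for XGBoost versions where early_stopping_rounds is passed to fit, but they have moved between releases:

# boosting round with the lowest validation logloss, and its score
print(model.best_iteration, model.best_score)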