Wednesday, November 29, 2017

XGBoost Hyperparameter Tuning


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings('ignore')
In [2]:
pima = pd.read_csv("diabetes.csv")
In [3]:
X = pima.drop(["Outcome"], axis = 1)
In [4]:
y = pima.Outcome
In [5]:
pima.head()
Out[5]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [7]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
/Applications/anaconda/anaconda/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
In [8]:
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
    random_state=seed)

Tuning num of estimators

In [9]:
model = XGBClassifier()
In [10]:
n_estimators = range(50,350,50)
In [12]:
list(n_estimators)
Out[12]:
[50, 100, 150, 200, 250, 300]
In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
In [14]:
param_grid = dict(n_estimators = n_estimators)
In [17]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=8)
In [21]:
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold)
In [22]:
grid_result = grid_search.fit(X, y)
In [23]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
  print("%f (%f) with: %r" % (mean, stdev, param))
Best: 0.765625 using {'n_estimators': 100}
0.751302 (0.046158) with: {'n_estimators': 50}
0.765625 (0.048217) with: {'n_estimators': 100}
0.755208 (0.044995) with: {'n_estimators': 150}
0.748698 (0.046475) with: {'n_estimators': 200}
0.755208 (0.044124) with: {'n_estimators': 250}
0.739583 (0.042239) with: {'n_estimators': 300}

Tuning Max Depth

In [24]:
model = XGBClassifier()
In [27]:
max_depth = range(1,10,2)
In [28]:
list(max_depth)
Out[28]:
[1, 3, 5, 7, 9]
In [44]:
param_grid = dict(max_depth = max_depth)
In [45]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=8)
In [46]:
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold)
In [47]:
grid_result = grid_search.fit(X, y)
In [48]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
Best: 0.768229 using {'max_depth': 1}
0.768229 (0.044609) with: {'max_depth': 1}
0.765625 (0.048217) with: {'max_depth': 3}
0.752604 (0.036764) with: {'max_depth': 5}
0.746094 (0.040838) with: {'max_depth': 7}
0.733073 (0.033432) with: {'max_depth': 9}

Tuning num of estimators and max_depth

In [50]:
n_estimators = [10,20,50,100]
max_depth = [1,3,6,9]
In [51]:
param_grid = dict(n_estimators = n_estimators, max_depth = max_depth)
In [52]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=8)
In [53]:
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold)
In [54]:
grid_result = grid_search.fit(X, y)
In [55]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
Best: 0.768229 using {'max_depth': 1, 'n_estimators': 100}
0.752604 (0.043311) with: {'max_depth': 1, 'n_estimators': 10}
0.757812 (0.040937) with: {'max_depth': 1, 'n_estimators': 20}
0.761719 (0.035131) with: {'max_depth': 1, 'n_estimators': 50}
0.768229 (0.044609) with: {'max_depth': 1, 'n_estimators': 100}
0.751302 (0.031781) with: {'max_depth': 3, 'n_estimators': 10}
0.750000 (0.043194) with: {'max_depth': 3, 'n_estimators': 20}
0.751302 (0.046158) with: {'max_depth': 3, 'n_estimators': 50}
0.765625 (0.048217) with: {'max_depth': 3, 'n_estimators': 100}
0.747396 (0.050991) with: {'max_depth': 6, 'n_estimators': 10}
0.757812 (0.049108) with: {'max_depth': 6, 'n_estimators': 20}
0.761719 (0.053966) with: {'max_depth': 6, 'n_estimators': 50}
0.753906 (0.055895) with: {'max_depth': 6, 'n_estimators': 100}
0.747396 (0.054361) with: {'max_depth': 9, 'n_estimators': 10}
0.740885 (0.050823) with: {'max_depth': 9, 'n_estimators': 20}
0.753906 (0.042640) with: {'max_depth': 9, 'n_estimators': 50}
0.733073 (0.033432) with: {'max_depth': 9, 'n_estimators': 100}

Tuning learning rate and num of trees

In [57]:
n_estimators = [100,200,300,500]
learning_rate = [0.1,0.01,0.001,1]
In [58]:
param_grid = dict(n_estimators = n_estimators, learning_rate = learning_rate)
In [59]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=8)
In [60]:
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold)
In [61]:
grid_result = grid_search.fit(X, y)
In [62]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
Best: 0.765625 using {'learning_rate': 0.1, 'n_estimators': 100}
0.765625 (0.048217) with: {'learning_rate': 0.1, 'n_estimators': 100}
0.748698 (0.046475) with: {'learning_rate': 0.1, 'n_estimators': 200}
0.739583 (0.042239) with: {'learning_rate': 0.1, 'n_estimators': 300}
0.739583 (0.039390) with: {'learning_rate': 0.1, 'n_estimators': 500}
0.747396 (0.029346) with: {'learning_rate': 0.01, 'n_estimators': 100}
0.751302 (0.043446) with: {'learning_rate': 0.01, 'n_estimators': 200}
0.752604 (0.043019) with: {'learning_rate': 0.01, 'n_estimators': 300}
0.755208 (0.047917) with: {'learning_rate': 0.01, 'n_estimators': 500}
0.744792 (0.047441) with: {'learning_rate': 0.001, 'n_estimators': 100}
0.748698 (0.043841) with: {'learning_rate': 0.001, 'n_estimators': 200}
0.753906 (0.043081) with: {'learning_rate': 0.001, 'n_estimators': 300}
0.755208 (0.039248) with: {'learning_rate': 0.001, 'n_estimators': 500}
0.716146 (0.033080) with: {'learning_rate': 1, 'n_estimators': 100}
0.717448 (0.028426) with: {'learning_rate': 1, 'n_estimators': 200}
0.714844 (0.030311) with: {'learning_rate': 1, 'n_estimators': 300}
0.708333 (0.030738) with: {'learning_rate': 1, 'n_estimators': 500}

Tuning Row Subsampling

In [73]:
subsample = np.arange(0.1,1.1,0.1)
list(subsample)
Out[73]:
[0.10000000000000001,
 0.20000000000000001,
 0.30000000000000004,
 0.40000000000000002,
 0.5,
 0.59999999999999998,
 0.70000000000000007,
 0.80000000000000004,
 0.90000000000000002,
 1.0]
In [74]:
param_grid = dict(subsample = subsample)
In [75]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=8)
In [76]:
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold)
In [77]:
grid_result = grid_search.fit(X, y)
In [78]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
Best: 0.769531 using {'subsample': 0.10000000000000001}
0.769531 (0.057665) with: {'subsample': 0.10000000000000001}
0.736979 (0.044056) with: {'subsample': 0.20000000000000001}
0.743490 (0.043375) with: {'subsample': 0.30000000000000004}
0.736979 (0.030939) with: {'subsample': 0.40000000000000002}
0.746094 (0.040822) with: {'subsample': 0.5}
0.748698 (0.038646) with: {'subsample': 0.59999999999999998}
0.752604 (0.038250) with: {'subsample': 0.70000000000000007}
0.747396 (0.041794) with: {'subsample': 0.80000000000000004}
0.746094 (0.036924) with: {'subsample': 0.90000000000000002}
0.765625 (0.048217) with: {'subsample': 1.0}

Tuning Column subsampling by tree

In [84]:
colsample_bytree = np.arange(0.4,1.1,0.1)
list(colsample_bytree)
Out[84]:
[0.40000000000000002,
 0.5,
 0.59999999999999998,
 0.69999999999999996,
 0.79999999999999993,
 0.89999999999999991,
 0.99999999999999989]
In [85]:
param_grid = dict(colsample_bytree = colsample_bytree)
In [86]:
kfold = StratifiedKFold(n_splits=10, shuffle = True, random_state = 8)
In [87]:
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold)
In [88]:
grid_result = grid_search.fit(X, y)
In [89]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
Best: 0.765625 using {'colsample_bytree': 0.99999999999999989}
0.753906 (0.049816) with: {'colsample_bytree': 0.40000000000000002}
0.756510 (0.044324) with: {'colsample_bytree': 0.5}
0.756510 (0.044324) with: {'colsample_bytree': 0.59999999999999998}
0.752604 (0.048269) with: {'colsample_bytree': 0.69999999999999996}
0.751302 (0.051559) with: {'colsample_bytree': 0.79999999999999993}
0.756510 (0.048338) with: {'colsample_bytree': 0.89999999999999991}
0.765625 (0.048217) with: {'colsample_bytree': 0.99999999999999989}

Tuning Column subsampling by split

In [90]:
colsample_bylevel = np.arange(0.3,1.1,0.1)
list(colsample_bylevel)
Out[90]:
[0.29999999999999999,
 0.40000000000000002,
 0.5,
 0.60000000000000009,
 0.70000000000000018,
 0.80000000000000027,
 0.90000000000000013,
 1.0000000000000002]
In [91]:
param_grid = dict(colsample_bylevel = colsample_bylevel)
In [92]:
kfold = StratifiedKFold(n_splits=10, shuffle = True, random_state = 8)
In [93]:
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold)
In [94]:
grid_result = grid_search.fit(X, y)
In [95]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
Best: 0.765625 using {'colsample_bylevel': 1.0000000000000002}
0.750000 (0.054784) with: {'colsample_bylevel': 0.29999999999999999}
0.747396 (0.044624) with: {'colsample_bylevel': 0.40000000000000002}
0.753906 (0.047626) with: {'colsample_bylevel': 0.5}
0.753906 (0.047626) with: {'colsample_bylevel': 0.60000000000000009}
0.755208 (0.050961) with: {'colsample_bylevel': 0.70000000000000018}
0.755208 (0.049037) with: {'colsample_bylevel': 0.80000000000000027}
0.747396 (0.043472) with: {'colsample_bylevel': 0.90000000000000013}
0.765625 (0.048217) with: {'colsample_bylevel': 1.0000000000000002}
In [ ]:
 

No comments :

Post a Comment