* fix DeprecationWarning on sklearn.cross_validation * fix syntax * fix kfold n_split issue * fix mistype * fix n_splits multiple value issue * split should pass a iterable * use np.arange instead of xrange, py3 compatibility
81 lines
2.5 KiB
Python
Executable File
81 lines
2.5 KiB
Python
Executable File
#!/usr/bin/python
'''
Created on 1 Apr 2015

@author: Jamie Hall
'''
|
|
import pickle
|
|
import xgboost as xgb
|
|
|
|
import numpy as np
|
|
# scikit-learn 0.18 moved the cross-validation and grid-search helpers into
# ``sklearn.model_selection``; the legacy modules are used only when that
# import is unavailable.  Catch ImportError specifically so that unrelated
# failures (e.g. KeyboardInterrupt) are not silently swallowed.
try:
    from sklearn.model_selection import GridSearchCV, KFold, train_test_split
except ImportError:
    from sklearn.cross_validation import KFold, train_test_split
    from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
|
|
from sklearn.datasets import load_iris, load_digits, load_boston
|
|
|
|
# Seeded random state passed as ``random_state`` to the shuffled K-fold splits
# below, so repeated runs of the script produce the same folds.
rng = np.random.RandomState(31337)
|
|
|
|
print("Zeros and Ones from the Digits dataset: binary classification")
# Restrict the digits dataset to its first two classes (0 and 1).  The
# argument is passed by keyword because newer scikit-learn releases make it
# keyword-only.
digits = load_digits(n_class=2)
y = digits['target']
X = digits['data']
# ``sklearn.model_selection.KFold`` is configured with ``n_splits`` and yields
# index pairs from ``split(X)``; the legacy ``sklearn.cross_validation.KFold``
# takes the sample count plus ``n_folds`` and is iterated directly.  Support
# whichever class the import at the top of the file resolved to.
if hasattr(KFold, 'split'):
    kf = KFold(n_splits=2, shuffle=True, random_state=rng).split(X)
else:
    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
    # Fit on one fold, then report the confusion matrix on the other.
    xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print(confusion_matrix(actuals, predictions))
|
|
|
|
print("Iris: multiclass classification")
iris = load_iris()
y = iris['target']
X = iris['data']
# ``sklearn.model_selection.KFold`` is configured with ``n_splits`` and yields
# index pairs from ``split(X)``; the legacy ``sklearn.cross_validation.KFold``
# takes the sample count plus ``n_folds`` and is iterated directly.  Support
# whichever class the import at the top of the file resolved to.
if hasattr(KFold, 'split'):
    kf = KFold(n_splits=2, shuffle=True, random_state=rng).split(X)
else:
    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
    # Fit on one fold, then report the confusion matrix on the other.
    xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print(confusion_matrix(actuals, predictions))
|
|
|
|
print("Boston Housing: regression")
# NOTE(review): ``load_boston`` is deprecated and later removed in newer
# scikit-learn releases; confirm the scikit-learn version this example targets.
boston = load_boston()
y = boston['target']
X = boston['data']
# ``sklearn.model_selection.KFold`` is configured with ``n_splits`` and yields
# index pairs from ``split(X)``; the legacy ``sklearn.cross_validation.KFold``
# takes the sample count plus ``n_folds`` and is iterated directly.  Support
# whichever class the import at the top of the file resolved to.
if hasattr(KFold, 'split'):
    kf = KFold(n_splits=2, shuffle=True, random_state=rng).split(X)
else:
    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
    # Fit on one fold, then report the mean squared error on the other.
    xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print(mean_squared_error(actuals, predictions))
|
|
|
|
print("Parameter optimization")
X = boston['data']
y = boston['target']
# Candidate tree depths and ensemble sizes; the grid search evaluates every
# combination of the two lists.
param_grid = {'max_depth': [2, 4, 6],
              'n_estimators': [50, 100, 200]}
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model, param_grid, verbose=1)
clf.fit(X, y)
# Report the best score found and the parameter combination that produced it.
print(clf.best_score_)
print(clf.best_params_)
|
|
|
|
# The sklearn API models are picklable
print("Pickling sklearn API models")
# must open in binary format to pickle; the ``with`` blocks guarantee the file
# is flushed and closed after writing, before it is opened again for reading
with open("best_boston.pkl", "wb") as pkl_file:
    pickle.dump(clf, pkl_file)
with open("best_boston.pkl", "rb") as pkl_file:
    clf2 = pickle.load(pkl_file)
# The round-tripped search object should predict the same values as the original.
print(np.allclose(clf.predict(X), clf2.predict(X)))
|
|
|
|
# Early-stopping

# Reuse the two-class digits data loaded earlier in the script.
X = digits['data']
y = digits['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# The held-out split is handed to ``fit`` as the evaluation set, scored with
# AUC, together with an early-stopping window of 10 rounds.
eval_set = [(X_test, y_test)]
clf = xgb.XGBClassifier()
clf.fit(
    X_train,
    y_train,
    early_stopping_rounds=10,
    eval_metric="auc",
    eval_set=eval_set,
)
|
|
|