Added more thorough test for early stopping (+1 squashed commit)
Squashed commits: [4f78cc0] Added test for early stopping (+1 squashed commit)
parent 166e878830
commit 7d297b418f
@@ -2,18 +2,31 @@ import xgboost as xgb
 import numpy as np
 from sklearn.datasets import load_digits
 from sklearn.cross_validation import KFold, train_test_split
+import unittest
 
 rng = np.random.RandomState(1994)
 
-def test_early_stopping_nonparallel():
-    # digits = load_digits(2)
-    # X = digits['data']
-    # y = digits['target']
-    # X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
-    # clf = xgb.XGBClassifier()
-    # clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
-    #         eval_set=[(X_test, y_test)])
-    print("This test will be re-visited later. ")
+class TestEarlyStopping(unittest.TestCase):
+
+    def test_early_stopping_nonparallel(self):
+        digits = load_digits(2)
+        X = digits['data']
+        y = digits['target']
+        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+        clf1 = xgb.XGBClassifier()
+        clf1.fit(X_train, y_train, early_stopping_rounds=5, eval_metric="auc",
+                 eval_set=[(X_test, y_test)])
+        clf2 = xgb.XGBClassifier()
+        clf2.fit(X_train, y_train, early_stopping_rounds=4, eval_metric="auc",
+                 eval_set=[(X_test, y_test)])
+        # should be the same
+        assert clf1.best_score == clf2.best_score
+        assert clf1.best_score != 1
+        # check overfit
+        clf3 = xgb.XGBClassifier()
+        clf3.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
+                 eval_set=[(X_test, y_test)])
+        assert clf3.best_score == 1
 
 # TODO: parallel test for early stopping
 # TODO: comment out for now. Will re-visit later
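For context on what the new assertions exercise (a reading of the test's intent, not something stated in the commit): with only 4 or 5 rounds of patience, validation AUC on this easy two-class digits task stops improving at the same point, so clf1 and clf2 record the same best_score short of 1.0, while 10 rounds of patience lets training continue until the score saturates at 1. A minimal standalone sketch of the attributes the sklearn wrapper sets after fitting with early_stopping_rounds (best_score is what the test asserts on; best_iteration is assumed to be populated by the same code path in this era of the wrapper):

    # Minimal sketch, not part of the commit. Uses the same deprecated
    # sklearn.cross_validation import as the test file itself.
    import xgboost as xgb
    from sklearn.datasets import load_digits
    from sklearn.cross_validation import train_test_split

    digits = load_digits(2)
    X_train, X_test, y_train, y_test = train_test_split(
        digits['data'], digits['target'], random_state=0)

    clf = xgb.XGBClassifier()
    clf.fit(X_train, y_train, early_stopping_rounds=5, eval_metric="auc",
            eval_set=[(X_test, y_test)])

    # best_score holds the best validation AUC seen before stopping;
    # best_iteration (assumed present in this wrapper version) holds the
    # boosting round that achieved it.
    print(clf.best_score, clf.best_iteration)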
@@ -4,65 +4,61 @@ from sklearn.cross_validation import KFold, train_test_split
 from sklearn.metrics import mean_squared_error
 from sklearn.grid_search import GridSearchCV
 from sklearn.datasets import load_iris, load_digits, load_boston
-import unittest
-
 
 rng = np.random.RandomState(1994)
 
-class TestSklearn(unittest.TestCase):
-
 def test_binary_classification():
     digits = load_digits(2)
     y = digits['target']
     X = digits['data']
     kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
     for train_index, test_index in kf:
         xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
         preds = xgb_model.predict(X[test_index])
         labels = y[test_index]
         err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds))
         assert err < 0.1
 
 def test_multiclass_classification():
     iris = load_iris()
     y = iris['target']
     X = iris['data']
     kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
     for train_index, test_index in kf:
         xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
         preds = xgb_model.predict(X[test_index])
         # test other params in XGBClassifier().fit
         preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
         preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
         preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
         labels = y[test_index]
         err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds))
         assert err < 0.4
 
 def test_boston_housing_regression():
     boston = load_boston()
     y = boston['target']
     X = boston['data']
     kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
     for train_index, test_index in kf:
         xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index])
         preds = xgb_model.predict(X[test_index])
         # test other params in XGBRegressor().fit
         preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
         preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
         preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
         labels = y[test_index]
-        assert mean_squared_error(preds, labels) < 15
+        assert mean_squared_error(preds, labels) < 25
 
 def test_parameter_tuning():
     boston = load_boston()
     y = boston['target']
     X = boston['data']
     xgb_model = xgb.XGBRegressor()
     clf = GridSearchCV(xgb_model,
                        {'max_depth': [2,4,6],
                         'n_estimators': [50,100,200]}, verbose=1)
     clf.fit(X,y)
     assert clf.best_score_ < 0.7
     assert clf.best_params_ == {'n_estimators': 100, 'max_depth': 4}
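Aside from dropping the import unittest / class TestSklearn wrapper (the early-stopping test now lives in its own file, per the first hunk), the only behavioral change in this file is relaxing the Boston housing MSE bound from 15 to 25; the motivation is not stated in the commit, presumably to make the check less brittle. A standalone sketch of that regression check under the same setup:

    # Standalone sketch, not part of the commit: the Boston housing
    # check whose bound the diff relaxes from 15 to 25.
    import numpy as np
    import xgboost as xgb
    from sklearn.datasets import load_boston
    from sklearn.cross_validation import KFold
    from sklearn.metrics import mean_squared_error

    rng = np.random.RandomState(1994)
    boston = load_boston()
    X, y = boston['data'], boston['target']
    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf:
        model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
        preds = model.predict(X[test_index])
        # held-out MSE; the updated test requires this to stay under 25
        print(mean_squared_error(preds, y[test_index]))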