xgboost/tests/python/test_callback.py
2020-11-10 07:47:48 +08:00

294 lines
13 KiB
Python

import xgboost as xgb
import unittest
import pytest
import os
import testing as tm
import tempfile
# We use the dataset for tests.
pytestmark = pytest.mark.skipif(**tm.no_sklearn())
class TestCallbacks(unittest.TestCase):
@classmethod
def setUpClass(cls):
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
cls.X = X
cls.y = y
split = int(X.shape[0]*0.8)
cls.X_train = X[: split, ...]
cls.y_train = y[: split, ...]
cls.X_valid = X[split:, ...]
cls.y_valid = y[split:, ...]
def test_evaluation_monitor(self):
D_train = xgb.DMatrix(self.X_train, self.y_train)
D_valid = xgb.DMatrix(self.X_valid, self.y_valid)
evals_result = {}
rounds = 10
xgb.train({'objective': 'binary:logistic',
'eval_metric': 'error'}, D_train,
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
num_boost_round=rounds,
evals_result=evals_result,
verbose_eval=True)
assert len(evals_result['Train']['error']) == rounds
assert len(evals_result['Valid']['error']) == rounds
with tm.captured_output() as (out, err):
xgb.train({'objective': 'binary:logistic',
'eval_metric': 'error'}, D_train,
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
num_boost_round=rounds,
evals_result=evals_result,
verbose_eval=2)
output: str = out.getvalue().strip()
pos = 0
msg = 'Train-error'
for i in range(rounds // 2):
pos = output.find('Train-error', pos)
assert pos != -1
pos += len(msg)
assert output.find('Train-error', pos) == -1
def test_early_stopping(self):
D_train = xgb.DMatrix(self.X_train, self.y_train)
D_valid = xgb.DMatrix(self.X_valid, self.y_valid)
evals_result = {}
rounds = 30
early_stopping_rounds = 5
booster = xgb.train({'objective': 'binary:logistic',
'eval_metric': 'error'}, D_train,
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
num_boost_round=rounds,
evals_result=evals_result,
verbose_eval=True,
early_stopping_rounds=early_stopping_rounds)
dump = booster.get_dump(dump_format='json')
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
def test_early_stopping_custom_eval(self):
D_train = xgb.DMatrix(self.X_train, self.y_train)
D_valid = xgb.DMatrix(self.X_valid, self.y_valid)
early_stopping_rounds = 5
booster = xgb.train({'objective': 'binary:logistic',
'eval_metric': 'error',
'tree_method': 'hist'}, D_train,
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
feval=tm.eval_error_metric,
num_boost_round=1000,
early_stopping_rounds=early_stopping_rounds,
verbose_eval=False)
dump = booster.get_dump(dump_format='json')
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
def test_early_stopping_customize(self):
D_train = xgb.DMatrix(self.X_train, self.y_train)
D_valid = xgb.DMatrix(self.X_valid, self.y_valid)
early_stopping_rounds = 5
early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
metric_name='CustomErr',
data_name='Train')
# Specify which dataset and which metric should be used for early stopping.
booster = xgb.train(
{'objective': 'binary:logistic',
'eval_metric': ['error', 'rmse'],
'tree_method': 'hist'}, D_train,
evals=[(D_train, 'Train'), (D_valid, 'Valid')],
feval=tm.eval_error_metric,
num_boost_round=1000,
callbacks=[early_stop],
verbose_eval=False)
dump = booster.get_dump(dump_format='json')
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
assert len(early_stop.stopping_history['Train']['CustomErr']) == len(dump)
def test_early_stopping_skl(self):
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
cls = xgb.XGBClassifier()
early_stopping_rounds = 5
cls.fit(X, y, eval_set=[(X, y)],
early_stopping_rounds=early_stopping_rounds, eval_metric='error')
booster = cls.get_booster()
dump = booster.get_dump(dump_format='json')
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
def test_early_stopping_custom_eval_skl(self):
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
cls = xgb.XGBClassifier()
early_stopping_rounds = 5
cls.fit(X, y, eval_set=[(X, y)],
early_stopping_rounds=early_stopping_rounds,
eval_metric=tm.eval_error_metric)
booster = cls.get_booster()
dump = booster.get_dump(dump_format='json')
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
def test_early_stopping_save_best_model(self):
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
n_estimators = 100
cls = xgb.XGBClassifier(n_estimators=n_estimators)
early_stopping_rounds = 5
early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
save_best=True)
cls.fit(X, y, eval_set=[(X, y)],
eval_metric=tm.eval_error_metric, callbacks=[early_stop])
booster = cls.get_booster()
dump = booster.get_dump(dump_format='json')
assert len(dump) == booster.best_iteration
early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
save_best=True)
cls = xgb.XGBClassifier(booster='gblinear', n_estimators=10)
self.assertRaises(ValueError, lambda: cls.fit(X, y, eval_set=[(X, y)],
eval_metric=tm.eval_error_metric,
callbacks=[early_stop]))
# No error
early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
save_best=False)
xgb.XGBClassifier(booster='gblinear', n_estimators=10).fit(
X, y, eval_set=[(X, y)],
eval_metric=tm.eval_error_metric,
callbacks=[early_stop])
def run_eta_decay(self, tree_method, deprecated_callback):
if deprecated_callback:
scheduler = xgb.callback.reset_learning_rate
else:
scheduler = xgb.callback.LearningRateScheduler
dpath = os.path.join(tm.PROJECT_ROOT, 'demo/data/')
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 4
# learning_rates as a list
# init eta with 0 to check whether learning_rates work
param = {'max_depth': 2, 'eta': 0, 'verbosity': 0,
'objective': 'binary:logistic', 'eval_metric': 'error',
'tree_method': tree_method}
evals_result = {}
bst = xgb.train(param, dtrain, num_round, watchlist,
callbacks=[scheduler([
0.8, 0.7, 0.6, 0.5
])],
evals_result=evals_result)
eval_errors_0 = list(map(float, evals_result['eval']['error']))
assert isinstance(bst, xgb.core.Booster)
# validation error should decrease, if eta > 0
assert eval_errors_0[0] > eval_errors_0[-1]
# init learning_rate with 0 to check whether learning_rates work
param = {'max_depth': 2, 'learning_rate': 0, 'verbosity': 0,
'objective': 'binary:logistic', 'eval_metric': 'error',
'tree_method': tree_method}
evals_result = {}
bst = xgb.train(param, dtrain, num_round, watchlist,
callbacks=[scheduler(
[0.8, 0.7, 0.6, 0.5])],
evals_result=evals_result)
eval_errors_1 = list(map(float, evals_result['eval']['error']))
assert isinstance(bst, xgb.core.Booster)
# validation error should decrease, if learning_rate > 0
assert eval_errors_1[0] > eval_errors_1[-1]
# check if learning_rates override default value of eta/learning_rate
param = {
'max_depth': 2, 'verbosity': 0, 'objective': 'binary:logistic',
'eval_metric': 'error', 'tree_method': tree_method
}
evals_result = {}
bst = xgb.train(param, dtrain, num_round, watchlist,
callbacks=[scheduler(
[0, 0, 0, 0]
)],
evals_result=evals_result)
eval_errors_2 = list(map(float, evals_result['eval']['error']))
assert isinstance(bst, xgb.core.Booster)
# validation error should not decrease, if eta/learning_rate = 0
assert eval_errors_2[0] == eval_errors_2[-1]
# learning_rates as a customized decay function
def eta_decay(ithround, num_boost_round=num_round):
return num_boost_round / (ithround + 1)
evals_result = {}
bst = xgb.train(param, dtrain, num_round, watchlist,
callbacks=[
scheduler(eta_decay)
],
evals_result=evals_result)
eval_errors_3 = list(map(float, evals_result['eval']['error']))
assert isinstance(bst, xgb.core.Booster)
assert eval_errors_3[0] == eval_errors_2[0]
for i in range(1, len(eval_errors_0)):
assert eval_errors_3[i] != eval_errors_2[i]
def test_eta_decay_hist(self):
with pytest.warns(UserWarning):
self.run_eta_decay('hist', True)
self.run_eta_decay('hist', False)
def test_eta_decay_approx(self):
with pytest.warns(UserWarning):
self.run_eta_decay('approx', True)
self.run_eta_decay('approx', False)
def test_eta_decay_exact(self):
with pytest.warns(UserWarning):
self.run_eta_decay('exact', True)
self.run_eta_decay('exact', False)
def test_check_point(self):
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
m = xgb.DMatrix(X, y)
with tempfile.TemporaryDirectory() as tmpdir:
check_point = xgb.callback.TrainingCheckPoint(directory=tmpdir,
iterations=1,
name='model')
xgb.train({'objective': 'binary:logistic'}, m,
num_boost_round=10,
verbose_eval=False,
callbacks=[check_point])
for i in range(1, 10):
assert os.path.exists(
os.path.join(tmpdir, 'model_' + str(i) + '.json'))
check_point = xgb.callback.TrainingCheckPoint(directory=tmpdir,
iterations=1,
as_pickle=True,
name='model')
xgb.train({'objective': 'binary:logistic'}, m,
num_boost_round=10,
verbose_eval=False,
callbacks=[check_point])
for i in range(1, 10):
assert os.path.exists(
os.path.join(tmpdir, 'model_' + str(i) + '.pkl'))
def test_callback_list(self):
X, y = tm.get_boston()
m = xgb.DMatrix(X, y)
callbacks = [xgb.callback.EarlyStopping(rounds=10)]
for i in range(4):
xgb.train({'objective': 'reg:squarederror',
'eval_metric': 'rmse'}, m,
evals=[(m, 'Train')],
num_boost_round=1,
verbose_eval=True,
callbacks=callbacks)
assert len(callbacks) == 1