xgboost/tests/python/test_updaters.py
Rory Mitchell b47b5ac771
Use hypothesis (#5759)
* Use hypothesis

* Allow int64 array interface for groups

* Add packages to Windows CI

* Add to travis

* Make sure device index is set correctly

* Fix dask-cudf test

* appveyor
2020-06-16 12:45:59 +12:00

130 lines
5.2 KiB
Python

import testing as tm
import unittest
import pytest
import xgboost as xgb
import numpy as np
from hypothesis import given, strategies, settings, note
# Hypothesis search space for the general booster parameters that the
# tree-method tests in this file draw from.
exact_parameter_strategy = strategies.fixed_dictionaries(
    {
        'nthread': strategies.integers(min_value=1, max_value=4),
        'max_depth': strategies.integers(min_value=1, max_value=11),
        'min_child_weight': strategies.floats(min_value=0.5, max_value=2.0),
        'alpha': strategies.floats(min_value=0.0, max_value=2.0),
        'lambda': strategies.floats(min_value=1e-5, max_value=2.0),
        'eta': strategies.floats(min_value=0.01, max_value=0.5),
        'gamma': strategies.floats(min_value=0.0, max_value=2.0),
        'seed': strategies.integers(min_value=0, max_value=10),
        # Row subsampling is deliberately left out: with it enabled the
        # training loss can increase, which the tests below do not allow.
        # 'subsample': strategies.floats(0.5, 1.0),
        'colsample_bytree': strategies.floats(min_value=0.5, max_value=1.0),
        'colsample_bylevel': strategies.floats(min_value=0.5, max_value=1.0),
    }
)
# Hypothesis search space for the parameters specific to the histogram
# tree method.  The filter keeps a draw when max_depth is positive, or
# otherwise when max_leaves is positive and the grow policy is 'lossguide'.
# NOTE(review): max_depth is drawn from 1..11, so every draw currently
# passes; presumably the filter guards against that range being widened
# to include 0.
hist_parameter_strategy = strategies.fixed_dictionaries(
    {
        'max_depth': strategies.integers(min_value=1, max_value=11),
        'max_leaves': strategies.integers(min_value=0, max_value=1024),
        'max_bin': strategies.integers(min_value=2, max_value=512),
        'grow_policy': strategies.sampled_from(['lossguide', 'depthwise']),
    }
).filter(
    lambda cfg: cfg['max_depth'] > 0
    or (cfg['max_leaves'] > 0 and cfg['grow_policy'] == 'lossguide')
)
def train_result(param, dmat, num_rounds):
    """Train a booster on ``dmat`` and return its evaluation history.

    The training matrix is also used as the only evaluation set, under the
    name ``'train'``.

    Parameters
    ----------
    param : dict
        Booster parameters forwarded to ``xgb.train``.
    dmat : xgb.DMatrix
        Data used both for training and for evaluation.
    num_rounds : int
        Number of boosting rounds.

    Returns
    -------
    dict
        The dictionary filled in by ``xgb.train`` through ``evals_result``;
        callers in this file index it as ``result['train'][metric_name]``.
    """
    result = {}
    xgb.train(
        param,
        dmat,
        num_rounds,
        # Passed by keyword: newer XGBoost releases accept everything after
        # ``num_boost_round`` as keyword-only arguments.
        evals=[(dmat, 'train')],
        verbose_eval=False,
        evals_result=result,
    )
    return result
class TestTreeMethod(unittest.TestCase):
    """Tests for the exact, approx and hist tree methods and the pruner."""

    @given(exact_parameter_strategy, strategies.integers(1, 20),
           tm.dataset_strategy)
    @settings(deadline=None)
    def test_exact(self, param, num_rounds, dataset):
        """Check the training metric with ``tree_method='exact'``.

        The per-round training metric must satisfy ``tm.non_increasing``.
        """
        param['tree_method'] = 'exact'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(exact_parameter_strategy, strategies.integers(1, 20),
           tm.dataset_strategy)
    @settings(deadline=None)
    def test_approx(self, param, num_rounds, dataset):
        """Check the training metric with ``tree_method='approx'``.

        Same check as ``test_exact`` but ``tm.non_increasing`` is called with
        an extra ``1e-3`` argument (presumably a tolerance -- confirm against
        the ``testing`` module).
        """
        param['tree_method'] = 'approx'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        assert tm.non_increasing(result['train'][dataset.metric], 1e-3)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_pruner(self):
        """Pruning a grown model must change it once and then be stable."""
        # Import the submodule explicitly: a bare ``import sklearn`` does not
        # guarantee that ``sklearn.datasets`` has been loaded.
        from sklearn.datasets import load_breast_cancer

        params = {'tree_method': 'exact'}
        cancer = load_breast_cancer()
        X = cancer['data']
        y = cancer["target"]

        dtrain = xgb.DMatrix(X, y)
        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10)
        grown = str(booster.get_dump())

        # Re-process the existing trees with the prune updater only.
        params = {'updater': 'prune', 'process_type': 'update', 'gamma': '0.2'}
        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10,
                            xgb_model=booster)
        after_prune = str(booster.get_dump())
        assert grown != after_prune

        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10,
                            xgb_model=booster)
        second_prune = str(booster.get_dump())
        # Second prune should not change the tree
        assert after_prune == second_prune

    @given(exact_parameter_strategy, hist_parameter_strategy,
           strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_hist(self, param, hist_param, num_rounds, dataset):
        """Check the training metric with ``tree_method='hist'``.

        The hist-specific parameters are merged in after the dataset has
        applied its own settings, so they take precedence on key clashes.
        """
        param['tree_method'] = 'hist'
        param = dataset.set_params(param)
        param.update(hist_param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        # Attach the full history to the hypothesis report on failure.
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    def test_hist_categorical(self):
        """Compare hist and exact AUC histories on the agaricus data."""
        # hist must be same as exact on all-categorical data
        dpath = 'demo/data/'
        ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        ag_param = {'max_depth': 2,
                    'tree_method': 'hist',
                    'eta': 1,
                    'verbosity': 0,
                    'objective': 'binary:logistic',
                    'eval_metric': 'auc'}
        hist_res = {}
        exact_res = {}
        watchlist = [(ag_dtrain, 'train'), (ag_dtest, 'test')]

        # ``evals`` is passed by keyword: newer XGBoost releases accept
        # everything after ``num_boost_round`` as keyword-only arguments.
        xgb.train(ag_param, ag_dtrain, 10,
                  evals=watchlist,
                  evals_result=hist_res)
        ag_param["tree_method"] = "exact"
        xgb.train(ag_param, ag_dtrain, 10,
                  evals=watchlist,
                  evals_result=exact_res)
        assert hist_res['train']['auc'] == exact_res['train']['auc']
        assert hist_res['test']['auc'] == exact_res['test']['auc']

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_hist_degenerate_case(self):
        """Fit must complete on the degenerate input from issue #2943.

        There is no assertion; the test only checks that ``fit`` runs
        through without raising.
        """
        # Test a degenerate case where the quantile sketcher won't return any
        # quantile points for a particular feature (the second feature in
        # this example). Source: https://github.com/dmlc/xgboost/issues/2943
        nan = np.nan
        param = {'missing': nan, 'tree_method': 'hist'}
        model = xgb.XGBRegressor(**param)
        X = np.array([[6.18827160e+05, 1.73000000e+02], [6.37345679e+05, nan],
                      [6.38888889e+05, nan], [6.28086420e+05, nan]])
        y = [1000000., 0., 0., 500000.]
        w = [0, 0, 1, 0]
        model.fit(X, y, sample_weight=w)