Use hypothesis (#5759)

* Use hypothesis

* Allow int64 array interface for groups

* Add packages to Windows CI

* Add to travis

* Make sure device index is set correctly

* Fix dask-cudf test

* appveyor
This commit is contained in:
Rory Mitchell
2020-06-16 12:45:59 +12:00
committed by GitHub
parent 02884b08aa
commit b47b5ac771
17 changed files with 411 additions and 439 deletions

View File

@@ -1,87 +1,72 @@
import numpy as np
import testing as tm
import unittest
import pytest
from hypothesis import strategies, given, settings, note
import xgboost as xgb
try:
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import scale
from regression_test_utilities import run_suite, parameter_combinations
except ImportError:
None
parameter_strategy = strategies.fixed_dictionaries({
'booster': strategies.just('gblinear'),
'eta': strategies.floats(0.01, 0.25),
'tolerance': strategies.floats(1e-5, 1e-2),
'nthread': strategies.integers(1, 4),
})
coord_strategy = strategies.fixed_dictionaries({
'feature_selector': strategies.sampled_from(['cyclic', 'shuffle',
'greedy', 'thrifty']),
'top_k': strategies.integers(1, 10),
})
def is_float(s):
try:
float(s)
return 1
except ValueError:
return 0
def train_result(param, dmat, num_rounds):
result = {}
xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
evals_result=result)
return result
def xgb_get_weights(bst):
return np.array([float(s) for s in bst.get_dump()[0].split() if
is_float(s)])
class TestLinear:
@given(parameter_strategy, strategies.integers(10, 50),
tm.dataset_strategy, coord_strategy)
@settings(deadline=None)
def test_coordinate(self, param, num_rounds, dataset, coord_param):
param['updater'] = 'coord_descent'
param.update(coord_param)
param = dataset.set_params(param)
result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
assert tm.non_increasing(result)
# Loss is not guaranteed to always decrease because of regularisation parameters
# We test a weaker condition that the loss has not increased between the first and last
# iteration
@given(parameter_strategy, strategies.integers(10, 50),
tm.dataset_strategy, coord_strategy, strategies.floats(1e-5, 2.0),
strategies.floats(1e-5, 2.0))
@settings(deadline=None)
def test_coordinate_regularised(self, param, num_rounds, dataset, coord_param, alpha, lambd):
param['updater'] = 'coord_descent'
param['alpha'] = alpha
param['lambda'] = lambd
param.update(coord_param)
param = dataset.set_params(param)
result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
assert tm.non_increasing([result[0], result[-1]])
def assert_regression_result(results, tol):
regression_results = [r for r in results if
r["param"]["objective"] == "reg:squarederror"]
for res in regression_results:
X = scale(res["dataset"].X,
with_mean=isinstance(res["dataset"].X, np.ndarray))
y = res["dataset"].y
reg_alpha = res["param"]["alpha"]
reg_lambda = res["param"]["lambda"]
pred = res["bst"].predict(xgb.DMatrix(X))
weights = xgb_get_weights(res["bst"])[1:]
enet = ElasticNet(alpha=reg_alpha + reg_lambda,
l1_ratio=reg_alpha / (reg_alpha + reg_lambda))
enet.fit(X, y)
enet_pred = enet.predict(X)
assert np.isclose(weights, enet.coef_, rtol=tol,
atol=tol).all(), (weights, enet.coef_)
assert np.isclose(enet_pred, pred, rtol=tol, atol=tol).all(), (
res["dataset"].name, enet_pred[:5], pred[:5])
@given(parameter_strategy, strategies.integers(10, 50),
tm.dataset_strategy)
@settings(deadline=None)
def test_shotgun(self, param, num_rounds, dataset):
param['updater'] = 'shotgun'
param = dataset.set_params(param)
result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
assert tm.non_increasing(result)
# TODO: More robust classification tests
def assert_classification_result(results):
classification_results = [r for r in results if
r["param"]["objective"] != "reg:squarederror"]
for res in classification_results:
# Check accuracy is reasonable
assert res["eval"][-1] < 2.0, (res["dataset"].name, res["eval"][-1])
class TestLinear(unittest.TestCase):
datasets = ["Boston", "Digits", "Cancer", "Sparse regression",
"Boston External Memory"]
@pytest.mark.skipif(**tm.no_sklearn())
def test_coordinate(self):
variable_param = {'booster': ['gblinear'], 'updater':
['coord_descent'], 'eta': [0.5], 'top_k':
[10], 'tolerance': [1e-5], 'nthread': [2],
'alpha': [.005, .1], 'lambda': [.005],
'feature_selector': ['cyclic', 'shuffle',
'greedy', 'thrifty']}
for param in parameter_combinations(variable_param):
results = run_suite(param, 150, self.datasets, scale_features=True)
assert_regression_result(results, 1e-2)
assert_classification_result(results)
@pytest.mark.skipif(**tm.no_sklearn())
def test_shotgun(self):
variable_param = {'booster': ['gblinear'], 'updater':
['shotgun'], 'eta': [0.5], 'top_k': [10],
'tolerance': [1e-5], 'nthread': [2],
'alpha': [.005, .1], 'lambda': [.005],
'feature_selector': ['cyclic', 'shuffle']}
for param in parameter_combinations(variable_param):
results = run_suite(param, 200, self.datasets, True)
assert_regression_result(results, 1e-2)
assert_classification_result(results)
@given(parameter_strategy, strategies.integers(10, 50),
tm.dataset_strategy, strategies.floats(1e-5, 2.0),
strategies.floats(1e-5, 2.0))
@settings(deadline=None)
def test_shotgun_regularised(self, param, num_rounds, dataset, alpha, lambd):
param['updater'] = 'shotgun'
param['alpha'] = alpha
param['lambda'] = lambd
param = dataset.set_params(param)
result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
assert tm.non_increasing([result[0], result[-1]])