Use hypothesis (#5759)
* Use hypothesis * Allow int64 array interface for groups * Add packages to Windows CI * Add to travis * Make sure device index is set correctly * Fix dask-cudf test * appveyor
This commit is contained in:
@@ -1,198 +0,0 @@
|
||||
import glob
|
||||
import itertools as it
|
||||
import numpy as np
|
||||
import os
|
||||
import sys
|
||||
import xgboost as xgb
|
||||
from joblib import Memory
|
||||
memory = Memory('./cachedir', verbose=0)
|
||||
|
||||
try:
|
||||
from sklearn import datasets
|
||||
from sklearn.preprocessing import scale
|
||||
except ImportError:
|
||||
None
|
||||
|
||||
|
||||
class Dataset:
    """A named dataset bundled with the objective/metric used to train on it."""

    def __init__(self, name, get_dataset, objective, metric,
                 has_weights=False, use_external_memory=False):
        self.name = name
        self.objective = objective
        self.metric = metric
        # get_dataset returns (X, y) or (X, y, w) depending on has_weights.
        if has_weights:
            self.X, self.y, self.w = get_dataset()
        else:
            self.X, self.y = get_dataset()
            self.w = None
        self.use_external_memory = use_external_memory

    def __str__(self):
        return ('name: {name}\nobjective:{objective}, metric:{metric}, '
                'external memory:{use_external_memory}\n').format(
                    name=self.name,
                    objective=self.objective,
                    metric=self.metric,
                    use_external_memory=self.use_external_memory)

    def __repr__(self):
        return str(self)
|
||||
|
||||
|
||||
@memory.cache
def get_boston():
    """Boston housing regression data as (features, targets)."""
    bunch = datasets.load_boston()
    return bunch.data, bunch.target


@memory.cache
def get_digits():
    """Digits multiclass classification data as (features, targets)."""
    bunch = datasets.load_digits()
    return bunch.data, bunch.target


@memory.cache
def get_cancer():
    """Breast-cancer binary classification data as (features, targets)."""
    bunch = datasets.load_breast_cancer()
    return bunch.data, bunch.target
|
||||
|
||||
|
||||
@memory.cache
def get_sparse():
    """Synthetic regression data with ~75% of entries zeroed, as CSR.

    Returns (X, y) where X is a scipy.sparse.csr_matrix.
    """
    rng = np.random.RandomState(199)
    n = 2000
    sparsity = 0.75
    X, y = datasets.make_regression(n, random_state=rng)
    flag = rng.binomial(1, sparsity, X.shape)
    # Vectorised boolean-mask assignment replaces the original
    # O(rows*cols) pair of Python loops; flag entries are 0/1, so
    # `flag == 1` selects exactly the entries the loops zeroed.
    X[flag == 1] = 0.0
    from scipy import sparse
    return sparse.csr_matrix(X), y
|
||||
|
||||
|
||||
def get_sparse_weights():
    """Sparse regression data with row weights drawn from [1, 10)."""
    return get_weights_regression(1, 10)


def get_small_weights():
    """Sparse regression data with tiny row weights in [1e-6, 1e-5)."""
    return get_weights_regression(1e-6, 1e-5)
|
||||
|
||||
|
||||
@memory.cache
def get_weights_regression(min_weight, max_weight):
    """Regression data with ~25% NaN entries and uniform row weights.

    Returns (X, y, w) with w drawn uniformly from [min_weight, max_weight).
    """
    rng = np.random.RandomState(199)
    n = 2000
    sparsity = 0.25
    X, y = datasets.make_regression(n, random_state=rng)
    flag = rng.binomial(1, sparsity, X.shape)
    # Vectorised boolean-mask assignment replaces the original
    # O(rows*cols) pair of Python loops.
    X[flag == 1] = np.nan
    w = rng.uniform(min_weight, max_weight, n)
    return X, y, w
|
||||
|
||||
|
||||
def train_dataset(dataset, param_in, num_rounds=10, scale_features=False,
                  DMatrixT=xgb.DMatrix, dmatrix_params=None):
    """Train a booster on `dataset` and return a result summary.

    Parameters
    ----------
    dataset : Dataset
    param_in : dict
        Base training parameters; copied, never mutated.
    num_rounds : int
    scale_features : bool
        Standardise features first (centering only for dense input).
    DMatrixT : type
        DMatrix class to construct (e.g. xgb.DeviceQuantileDMatrix).
    dmatrix_params : dict, optional
        Extra keyword arguments for the DMatrix constructor.

    Returns
    -------
    dict with keys "dataset", "bst", "param" and "eval".
    """
    # Fix: the original used a shared mutable default ({}) for
    # dmatrix_params; use the None sentinel instead.
    if dmatrix_params is None:
        dmatrix_params = {}
    param = param_in.copy()
    param["objective"] = dataset.objective
    if dataset.objective == "multi:softmax":
        param["num_class"] = int(np.max(dataset.y) + 1)
    param["eval_metric"] = dataset.metric

    if scale_features:
        # Sparse matrices cannot be centred without densifying.
        X = scale(dataset.X, with_mean=isinstance(dataset.X, np.ndarray))
    else:
        X = dataset.X

    if dataset.use_external_memory:
        np.savetxt('tmptmp_1234.csv',
                   np.hstack((dataset.y.reshape(len(dataset.y), 1), X)),
                   delimiter=',')
        dtrain = DMatrixT('tmptmp_1234.csv?format=csv&label_column=0#tmptmp_',
                          weight=dataset.w)
    elif DMatrixT is xgb.DeviceQuantileDMatrix:
        import cupy as cp
        dtrain = DMatrixT(cp.array(X), cp.array(dataset.y),
                          weight=None if dataset.w is None else cp.array(dataset.w),
                          **dmatrix_params)
    else:
        dtrain = DMatrixT(X, dataset.y, weight=dataset.w, **dmatrix_params)

    print("Training on dataset: " + dataset.name, file=sys.stderr)
    print("Using parameters: " + str(param), file=sys.stderr)
    res = {}
    bst = xgb.train(param, dtrain, num_rounds, [(dtrain, 'train')],
                    evals_result=res, verbose_eval=False)

    # Free the booster and dmatrix so we can delete temporary files
    bst_copy = bst.copy()
    del bst
    del dtrain

    # Cleanup temporary files
    if dataset.use_external_memory:
        for f in glob.glob("tmptmp_*"):
            os.remove(f)

    return {"dataset": dataset, "bst": bst_copy, "param": param.copy(),
            "eval": res['train'][dataset.metric]}
|
||||
|
||||
|
||||
def parameter_combinations(variable_param):
    """Enumerate all possible combinations of parameters.

    Parameters
    ----------
    variable_param : dict
        Maps parameter name -> list of candidate values.

    Returns
    -------
    list of dicts, one per combination, in deterministic
    (sorted-parameter-name) order.
    """
    # The original bound each combination to a variable named `set`,
    # shadowing the builtin; a dict(zip(...)) comprehension avoids that
    # and the manual index loop.
    names = sorted(variable_param)
    return [dict(zip(names, values))
            for values in it.product(*(variable_param[name] for name in names))]
|
||||
|
||||
|
||||
def run_suite(param, num_rounds=10, select_datasets=None, scale_features=False,
              DMatrixT=xgb.DMatrix, dmatrix_params=None):
    """Run the given parameters on a range of datasets.

    Objective and eval metric are set automatically per dataset.
    `select_datasets`, when given, is a collection of dataset names to keep.
    Returns the list of train_dataset() result dicts.
    """
    # Fix: the original used a shared mutable default ({}) for
    # dmatrix_params; use the None sentinel instead.
    if dmatrix_params is None:
        dmatrix_params = {}
    datasets = [
        Dataset("Boston", get_boston, "reg:squarederror", "rmse"),
        Dataset("Digits", get_digits, "multi:softmax", "mlogloss"),
        Dataset("Cancer", get_cancer, "binary:logistic", "logloss"),
        Dataset("Sparse regression", get_sparse, "reg:squarederror", "rmse"),
        Dataset("Sparse regression with weights", get_sparse_weights,
                "reg:squarederror", "rmse", has_weights=True),
        Dataset("Small weights regression", get_small_weights,
                "reg:squarederror", "rmse", has_weights=True),
        Dataset("Boston External Memory", get_boston,
                "reg:squarederror", "rmse",
                use_external_memory=True)
    ]

    results = []
    for d in datasets:
        if select_datasets is None or d.name in select_datasets:
            results.append(
                train_dataset(d, param, num_rounds=num_rounds,
                              scale_features=scale_features,
                              DMatrixT=DMatrixT, dmatrix_params=dmatrix_params))
    return results
|
||||
|
||||
|
||||
def non_increasing(L, tolerance):
    """Return True when no step in L rises by `tolerance` or more."""
    for prev, cur in zip(L, L[1:]):
        if cur - prev >= tolerance:
            return False
    return True
|
||||
|
||||
|
||||
def assert_results_non_increasing(results, tolerance=1e-5):
    """Assert every run's training-loss history is monotone within tolerance."""
    for run in results:
        assert non_increasing(run['eval'], tolerance), run
|
||||
@@ -1,87 +1,72 @@
|
||||
import numpy as np
|
||||
import testing as tm
|
||||
import unittest
|
||||
import pytest
|
||||
|
||||
from hypothesis import strategies, given, settings, note
|
||||
import xgboost as xgb
|
||||
|
||||
try:
|
||||
from sklearn.linear_model import ElasticNet
|
||||
from sklearn.preprocessing import scale
|
||||
from regression_test_utilities import run_suite, parameter_combinations
|
||||
except ImportError:
|
||||
None
|
||||
parameter_strategy = strategies.fixed_dictionaries({
|
||||
'booster': strategies.just('gblinear'),
|
||||
'eta': strategies.floats(0.01, 0.25),
|
||||
'tolerance': strategies.floats(1e-5, 1e-2),
|
||||
'nthread': strategies.integers(1, 4),
|
||||
})
|
||||
|
||||
coord_strategy = strategies.fixed_dictionaries({
|
||||
'feature_selector': strategies.sampled_from(['cyclic', 'shuffle',
|
||||
'greedy', 'thrifty']),
|
||||
'top_k': strategies.integers(1, 10),
|
||||
})
|
||||
|
||||
|
||||
def is_float(s):
    """Return True if `s` parses as a float, else False.

    The original returned the ints 1/0; bool is clearer and remains
    truth-compatible for existing callers (bool subclasses int).
    """
    try:
        float(s)
        return True
    except ValueError:
        return False
|
||||
def train_result(param, dmat, num_rounds):
    """Train for `num_rounds` on `dmat` and return the evals_result dict."""
    history = {}
    xgb.train(param, dmat, num_rounds, [(dmat, 'train')],
              verbose_eval=False, evals_result=history)
    return history
|
||||
|
||||
|
||||
def xgb_get_weights(bst):
    """Extract the linear-model weights from a booster's text dump."""
    tokens = bst.get_dump()[0].split()
    return np.array([float(tok) for tok in tokens if is_float(tok)])
|
||||
class TestLinear:
    """Property-based (hypothesis) tests for the gblinear updaters."""

    @given(parameter_strategy, strategies.integers(10, 50),
           tm.dataset_strategy, coord_strategy)
    @settings(deadline=None)
    def test_coordinate(self, param, num_rounds, dataset, coord_param):
        param['updater'] = 'coord_descent'
        param.update(coord_param)
        param = dataset.set_params(param)
        metric_history = train_result(
            param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
        assert tm.non_increasing(metric_history)

    # Loss is not guaranteed to always decrease because of regularisation
    # parameters. We test a weaker condition that the loss has not increased
    # between the first and last iteration.
    @given(parameter_strategy, strategies.integers(10, 50),
           tm.dataset_strategy, coord_strategy, strategies.floats(1e-5, 2.0),
           strategies.floats(1e-5, 2.0))
    @settings(deadline=None)
    def test_coordinate_regularised(self, param, num_rounds, dataset,
                                    coord_param, alpha, lambd):
        param['updater'] = 'coord_descent'
        param['alpha'] = alpha
        param['lambda'] = lambd
        param.update(coord_param)
        param = dataset.set_params(param)
        metric_history = train_result(
            param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
        assert tm.non_increasing([metric_history[0], metric_history[-1]])
|
||||
|
||||
def assert_regression_result(results, tol):
    """Compare gblinear regression runs against sklearn's ElasticNet."""
    regression_runs = (r for r in results
                       if r["param"]["objective"] == "reg:squarederror")
    for res in regression_runs:
        X = scale(res["dataset"].X,
                  with_mean=isinstance(res["dataset"].X, np.ndarray))
        y = res["dataset"].y
        reg_alpha = res["param"]["alpha"]
        reg_lambda = res["param"]["lambda"]
        pred = res["bst"].predict(xgb.DMatrix(X))
        # Skip index 0 (bias term); compare feature weights only.
        weights = xgb_get_weights(res["bst"])[1:]
        enet = ElasticNet(alpha=reg_alpha + reg_lambda,
                          l1_ratio=reg_alpha / (reg_alpha + reg_lambda))
        enet.fit(X, y)
        enet_pred = enet.predict(X)
        assert np.isclose(weights, enet.coef_, rtol=tol,
                          atol=tol).all(), (weights, enet.coef_)
        assert np.isclose(enet_pred, pred, rtol=tol, atol=tol).all(), (
            res["dataset"].name, enet_pred[:5], pred[:5])
|
||||
@given(parameter_strategy, strategies.integers(10, 50),
|
||||
tm.dataset_strategy)
|
||||
@settings(deadline=None)
|
||||
def test_shotgun(self, param, num_rounds, dataset):
|
||||
param['updater'] = 'shotgun'
|
||||
param = dataset.set_params(param)
|
||||
result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
|
||||
assert tm.non_increasing(result)
|
||||
|
||||
|
||||
# TODO: More robust classification tests
def assert_classification_result(results):
    """Sanity-check the final loss of every classification run."""
    classification_runs = (r for r in results
                           if r["param"]["objective"] != "reg:squarederror")
    for res in classification_runs:
        # Check accuracy is reasonable
        assert res["eval"][-1] < 2.0, (res["dataset"].name, res["eval"][-1])
|
||||
|
||||
|
||||
class TestLinear(unittest.TestCase):
    """Grid-style smoke tests for the gblinear updaters."""

    datasets = ["Boston", "Digits", "Cancer", "Sparse regression",
                "Boston External Memory"]

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_coordinate(self):
        variable_param = {'booster': ['gblinear'],
                          'updater': ['coord_descent'],
                          'eta': [0.5],
                          'top_k': [10],
                          'tolerance': [1e-5],
                          'nthread': [2],
                          'alpha': [.005, .1],
                          'lambda': [.005],
                          'feature_selector': ['cyclic', 'shuffle',
                                               'greedy', 'thrifty']}
        for param in parameter_combinations(variable_param):
            results = run_suite(param, 150, self.datasets, scale_features=True)
            assert_regression_result(results, 1e-2)
            assert_classification_result(results)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_shotgun(self):
        variable_param = {'booster': ['gblinear'],
                          'updater': ['shotgun'],
                          'eta': [0.5],
                          'top_k': [10],
                          'tolerance': [1e-5],
                          'nthread': [2],
                          'alpha': [.005, .1],
                          'lambda': [.005],
                          'feature_selector': ['cyclic', 'shuffle']}
        for param in parameter_combinations(variable_param):
            results = run_suite(param, 200, self.datasets, True)
            assert_regression_result(results, 1e-2)
            assert_classification_result(results)
|
||||
@given(parameter_strategy, strategies.integers(10, 50),
|
||||
tm.dataset_strategy, strategies.floats(1e-5, 2.0),
|
||||
strategies.floats(1e-5, 2.0))
|
||||
@settings(deadline=None)
|
||||
def test_shotgun_regularised(self, param, num_rounds, dataset, alpha, lambd):
|
||||
param['updater'] = 'shotgun'
|
||||
param['alpha'] = alpha
|
||||
param['lambda'] = lambd
|
||||
param = dataset.set_params(param)
|
||||
result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
|
||||
assert tm.non_increasing([result[0], result[-1]])
|
||||
|
||||
@@ -3,28 +3,57 @@ import unittest
|
||||
import pytest
|
||||
import xgboost as xgb
|
||||
import numpy as np
|
||||
from hypothesis import given, strategies, settings, note
|
||||
|
||||
try:
|
||||
from regression_test_utilities import run_suite, parameter_combinations, \
|
||||
assert_results_non_increasing
|
||||
except ImportError:
|
||||
None
|
||||
exact_parameter_strategy = strategies.fixed_dictionaries({
|
||||
'nthread': strategies.integers(1, 4),
|
||||
'max_depth': strategies.integers(1, 11),
|
||||
'min_child_weight': strategies.floats(0.5, 2.0),
|
||||
'alpha': strategies.floats(0.0, 2.0),
|
||||
'lambda': strategies.floats(1e-5, 2.0),
|
||||
'eta': strategies.floats(0.01, 0.5),
|
||||
'gamma': strategies.floats(0.0, 2.0),
|
||||
'seed': strategies.integers(0, 10),
|
||||
# We cannot enable subsampling as the training loss can increase
|
||||
# 'subsample': strategies.floats(0.5, 1.0),
|
||||
'colsample_bytree': strategies.floats(0.5, 1.0),
|
||||
'colsample_bylevel': strategies.floats(0.5, 1.0),
|
||||
})
|
||||
|
||||
hist_parameter_strategy = strategies.fixed_dictionaries({
|
||||
'max_depth': strategies.integers(1, 11),
|
||||
'max_leaves': strategies.integers(0, 1024),
|
||||
'max_bin': strategies.integers(2, 512),
|
||||
'grow_policy': strategies.sampled_from(['lossguide', 'depthwise']),
|
||||
}).filter(lambda x: (x['max_depth'] > 0 or x['max_leaves'] > 0) and (
|
||||
x['max_depth'] > 0 or x['grow_policy'] == 'lossguide'))
|
||||
|
||||
|
||||
class TestUpdaters(unittest.TestCase):
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def test_histmaker(self):
|
||||
variable_param = {'updater': ['grow_histmaker'], 'max_depth': [2, 8]}
|
||||
for param in parameter_combinations(variable_param):
|
||||
result = run_suite(param)
|
||||
assert_results_non_increasing(result, 1e-2)
|
||||
def train_result(param, dmat, num_rounds):
    """Train for `num_rounds` on `dmat` and return the evals_result dict."""
    history = {}
    xgb.train(param, dmat, num_rounds, [(dmat, 'train')],
              verbose_eval=False, evals_result=history)
    return history
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def test_colmaker(self):
|
||||
variable_param = {'updater': ['grow_colmaker'], 'max_depth': [2, 8]}
|
||||
for param in parameter_combinations(variable_param):
|
||||
result = run_suite(param)
|
||||
assert_results_non_increasing(result, 1e-2)
|
||||
|
||||
class TestTreeMethod(unittest.TestCase):
    """Property-based checks that exact/approx training loss is monotone."""

    @given(exact_parameter_strategy, strategies.integers(1, 20),
           tm.dataset_strategy)
    @settings(deadline=None)
    def test_exact(self, param, num_rounds, dataset):
        param['tree_method'] = 'exact'
        param = dataset.set_params(param)
        history = train_result(param, dataset.get_dmat(), num_rounds)
        assert tm.non_increasing(history['train'][dataset.metric])

    @given(exact_parameter_strategy, strategies.integers(1, 20),
           tm.dataset_strategy)
    @settings(deadline=None)
    def test_approx(self, param, num_rounds, dataset):
        param['tree_method'] = 'approx'
        param = dataset.set_params(param)
        history = train_result(param, dataset.get_dmat(), num_rounds)
        # Quantile sketching is noisier than exact; allow a small tolerance.
        assert tm.non_increasing(history['train'][dataset.metric], 1e-3)
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def test_pruner(self):
|
||||
@@ -50,19 +79,18 @@ class TestUpdaters(unittest.TestCase):
|
||||
# Second prune should not change the tree
|
||||
assert after_prune == second_prune
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def test_fast_histmaker(self):
|
||||
variable_param = {'tree_method': ['hist'],
|
||||
'max_depth': [2, 8],
|
||||
'max_bin': [2, 256],
|
||||
'grow_policy': ['depthwise', 'lossguide'],
|
||||
'max_leaves': [64, 0],
|
||||
'verbosity': [0],
|
||||
'single_precision_histogram': [True, False]}
|
||||
for param in parameter_combinations(variable_param):
|
||||
result = run_suite(param)
|
||||
assert_results_non_increasing(result, 1e-2)
|
||||
@given(exact_parameter_strategy, hist_parameter_strategy, strategies.integers(1, 20),
|
||||
tm.dataset_strategy)
|
||||
@settings(deadline=None)
|
||||
def test_hist(self, param, hist_param, num_rounds, dataset):
|
||||
param['tree_method'] = 'hist'
|
||||
param = dataset.set_params(param)
|
||||
param.update(hist_param)
|
||||
result = train_result(param, dataset.get_dmat(), num_rounds)
|
||||
note(result)
|
||||
assert tm.non_increasing(result['train'][dataset.metric])
|
||||
|
||||
def test_hist_categorical(self):
|
||||
# hist must be same as exact on all-categorial data
|
||||
dpath = 'demo/data/'
|
||||
ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
@@ -87,7 +115,7 @@ class TestUpdaters(unittest.TestCase):
|
||||
assert hist_res['test']['auc'] == exact_res['test']['auc']
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def test_fast_histmaker_degenerate_case(self):
|
||||
def test_hist_degenerate_case(self):
|
||||
# Test a degenerate case where the quantile sketcher won't return any
|
||||
# quantile points for a particular feature (the second feature in
|
||||
# this example). Source: https://github.com/dmlc/xgboost/issues/2943
|
||||
|
||||
@@ -1,6 +1,19 @@
|
||||
# coding: utf-8
|
||||
from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
|
||||
from xgboost.compat import DASK_INSTALLED
|
||||
from hypothesis import strategies
|
||||
from hypothesis.extra.numpy import arrays
|
||||
from joblib import Memory
|
||||
from sklearn import datasets
|
||||
import xgboost as xgb
|
||||
import numpy as np
|
||||
|
||||
try:
|
||||
import cupy as cp
|
||||
except ImportError:
|
||||
cp = None
|
||||
|
||||
memory = Memory('./cachedir', verbose=0)
|
||||
|
||||
|
||||
def no_sklearn():
|
||||
@@ -39,7 +52,7 @@ def no_matplotlib():
|
||||
def no_dask_cuda():
    """Return pytest.mark.skipif kwargs: skip when dask_cuda is missing."""
    reason = 'dask_cuda is not installed.'
    try:
        # Fix: the import statement was duplicated.
        import dask_cuda as _  # noqa
        return {'condition': False, 'reason': reason}
    except ImportError:
        return {'condition': True, 'reason': reason}
|
||||
@@ -47,7 +60,7 @@ def no_dask_cuda():
|
||||
|
||||
def no_cudf():
|
||||
try:
|
||||
import cudf # noqa
|
||||
import cudf # noqa
|
||||
CUDF_INSTALLED = True
|
||||
except ImportError:
|
||||
CUDF_INSTALLED = False
|
||||
@@ -59,7 +72,7 @@ def no_cudf():
|
||||
def no_cupy():
    """Return pytest.mark.skipif kwargs: skip when cupy is missing."""
    reason = 'cupy is not installed.'
    try:
        # Fix: the import statement was duplicated.
        import cupy as _  # noqa
        return {'condition': False, 'reason': reason}
    except ImportError:
        return {'condition': True, 'reason': reason}
|
||||
@@ -68,7 +81,7 @@ def no_cupy():
|
||||
def no_dask_cudf():
    """Return pytest.mark.skipif kwargs: skip when dask_cudf is missing."""
    reason = 'dask_cudf is not installed.'
    try:
        # Fix: the import statement was duplicated.
        import dask_cudf as _  # noqa
        return {'condition': False, 'reason': reason}
    except ImportError:
        return {'condition': True, 'reason': reason}
|
||||
@@ -77,7 +90,101 @@ def no_dask_cudf():
|
||||
def no_json_schema():
    """Return pytest.mark.skipif kwargs: skip when jsonschema is missing."""
    reason = 'jsonschema is not installed'
    try:
        # Fix: the import statement was duplicated.
        import jsonschema  # noqa
        return {'condition': False, 'reason': reason}
    except ImportError:
        return {'condition': True, 'reason': reason}
|
||||
|
||||
|
||||
# Contains a dataset in numpy format as well as the relevant objective and metric
class TestDataset:
    """A named (X, y) dataset with its objective and eval metric."""

    def __init__(self, name, get_dataset, objective, metric):
        self.name = name
        self.objective = objective
        self.metric = metric
        self.X, self.y = get_dataset()
        self.w = None

    def set_params(self, params_in):
        """Inject this dataset's objective/metric into a parameter dict."""
        params_in['objective'] = self.objective
        params_in['eval_metric'] = self.metric
        if self.objective == "multi:softmax":
            params_in["num_class"] = int(np.max(self.y) + 1)
        return params_in

    def get_dmat(self):
        """Plain in-memory DMatrix."""
        return xgb.DMatrix(self.X, self.y, self.w)

    def get_device_dmat(self):
        """Device (GPU) quantile DMatrix; requires cupy."""
        w = None if self.w is None else cp.array(self.w)
        X = cp.array(self.X, dtype=np.float32)
        y = cp.array(self.y, dtype=np.float32)
        return xgb.DeviceQuantileDMatrix(X, y, w)

    def get_external_dmat(self):
        """DMatrix backed by an external-memory CSV file."""
        np.savetxt('tmptmp_1234.csv',
                   np.hstack((self.y.reshape(len(self.y), 1), self.X)),
                   delimiter=',')
        return xgb.DMatrix('tmptmp_1234.csv?format=csv&label_column=0#tmptmp_',
                           weight=self.w)

    def __repr__(self):
        return self.name
|
||||
|
||||
|
||||
@memory.cache
def get_boston():
    """Boston housing regression data as (features, targets)."""
    bunch = datasets.load_boston()
    return bunch.data, bunch.target


@memory.cache
def get_digits():
    """Digits multiclass classification data as (features, targets)."""
    bunch = datasets.load_digits()
    return bunch.data, bunch.target


@memory.cache
def get_cancer():
    """Breast-cancer binary classification data as (features, targets)."""
    bunch = datasets.load_breast_cancer()
    return bunch.data, bunch.target
|
||||
|
||||
|
||||
@memory.cache
def get_sparse():
    """Regression data with ~75% of entries replaced by NaN (missing)."""
    rng = np.random.RandomState(199)
    n = 2000
    sparsity = 0.75
    X, y = datasets.make_regression(n, random_state=rng)
    flag = rng.binomial(1, sparsity, X.shape)
    # Vectorised boolean-mask assignment replaces the original
    # O(rows*cols) pair of Python loops.
    X[flag == 1] = np.nan
    return X, y
|
||||
|
||||
|
||||
# Fixed pool of example datasets; "empty" exercises the zero-row edge case.
_unweighted_datasets_strategy = strategies.sampled_from([
    TestDataset('boston', get_boston, 'reg:squarederror', 'rmse'),
    TestDataset('digits', get_digits, 'multi:softmax', 'mlogloss'),
    TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
    TestDataset("sparse", get_sparse, "reg:squarederror", "rmse"),
    TestDataset("empty", lambda: (np.empty((0, 100)), np.empty(0)),
                "reg:squarederror", "rmse"),
])
|
||||
|
||||
|
||||
@strategies.composite
def _dataset_and_weight(draw):
    """Draw a dataset and, half the time, attach random instance weights."""
    data = draw(_unweighted_datasets_strategy)
    if draw(strategies.booleans()):
        data.w = draw(arrays(np.float64, (len(data.y)),
                             elements=strategies.floats(0.1, 2.0)))
    return data


# A strategy for drawing from a set of example datasets
# May add random weights to the dataset
dataset_strategy = _dataset_and_weight()
|
||||
|
||||
|
||||
def non_increasing(L, tolerance=1e-4):
    """Return True when no step in L rises by `tolerance` or more."""
    return not any(cur - prev >= tolerance for prev, cur in zip(L, L[1:]))
|
||||
|
||||
Reference in New Issue
Block a user