xgboost/tests/python-gpu/test_gpu_updaters.py

import numpy as np
import sys
import gc
import pytest
import xgboost as xgb
from hypothesis import given, strategies, assume, settings, note

sys.path.append("tests/python")
import testing as tm

parameter_strategy = strategies.fixed_dictionaries({
'max_depth': strategies.integers(0, 11),
'max_leaves': strategies.integers(0, 256),
'max_bin': strategies.integers(2, 1024),
'grow_policy': strategies.sampled_from(['lossguide', 'depthwise']),
'single_precision_histogram': strategies.booleans(),
'min_child_weight': strategies.floats(0.5, 2.0),
'seed': strategies.integers(0, 10),
# We cannot enable subsampling as the training loss can increase
# 'subsample': strategies.floats(0.5, 1.0),
'colsample_bytree': strategies.floats(0.5, 1.0),
'colsample_bylevel': strategies.floats(0.5, 1.0),
}).filter(lambda x: (x['max_depth'] > 0 or x['max_leaves'] > 0) and (
    x['max_depth'] > 0 or x['grow_policy'] == 'lossguide'))


def train_result(param, dmat, num_rounds):
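    """Train a booster while recording the evaluation history and return it."""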
result = {}
xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
evals_result=result)
    return result


class TestGPUUpdaters:
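    """Tests for the `gpu_hist` tree method and related GPU code paths."""
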
@given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
@settings(deadline=None)
def test_gpu_hist(self, param, num_rounds, dataset):
param["tree_method"] = "gpu_hist"
param = dataset.set_params(param)
result = train_result(param, dataset.get_dmat(), num_rounds)
note(result)
assert tm.non_increasing(result["train"][dataset.metric])
def run_categorical_basic(self, rows, cols, rounds, cats):
onehot, label = tm.make_categorical(rows, cols, cats, True)
cat, _ = tm.make_categorical(rows, cols, cats, False)
by_etl_results = {}
by_builtin_results = {}
parameters = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}
m = xgb.DMatrix(onehot, label, enable_categorical=False)
xgb.train(
parameters,
m,
num_boost_round=rounds,
evals=[(m, "Train")],
evals_result=by_etl_results,
)
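
        # Train on the raw categorical data using the built-in categorical
        # split support for comparison.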
m = xgb.DMatrix(cat, label, enable_categorical=True)
xgb.train(
parameters,
m,
num_boost_round=rounds,
evals=[(m, "Train")],
evals_result=by_builtin_results,
)
        # There are guidelines on how to specify tolerance based on treating the output
        # as random variables.  But here the tree construction is extremely sensitive to
        # floating point errors: a 1e-5 error in a histogram bin can lead to an entirely
        # different tree.  So even though the test is quite lenient, hypothesis can still
        # pick up falsifying examples from time to time.
np.testing.assert_allclose(
np.array(by_etl_results["Train"]["rmse"]),
np.array(by_builtin_results["Train"]["rmse"]),
rtol=1e-3,
)
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

    @given(strategies.integers(10, 400), strategies.integers(3, 8),
strategies.integers(1, 2), strategies.integers(4, 7))
@settings(deadline=None)
@pytest.mark.skipif(**tm.no_pandas())
def test_categorical(self, rows, cols, rounds, cats):
        self.run_categorical_basic(rows, cols, rounds, cats)

    def test_categorical_32_cat(self):
        '''32 categories hit the bound of the integer bitset, so this case gets a special test.'''
rows = 1000
cols = 10
cats = 32
rounds = 4
        self.run_categorical_basic(rows, cols, rounds, cats)

    def test_invalid_categorical(self):
import cupy as cp
rng = np.random.default_rng()
X = rng.normal(loc=0, scale=1, size=1000).reshape(100, 10)
y = rng.normal(loc=0, scale=1, size=100)
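
        # Samples from a normal distribution include negative values, which are
        # invalid for categorical features.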
        # The check is performed during sketching.
Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
with pytest.raises(ValueError):
xgb.train({"tree_method": "gpu_hist"}, Xy)
X, y = cp.array(X), cp.array(y)
with pytest.raises(ValueError):
            Xy = xgb.DeviceQuantileDMatrix(X, y, feature_types=["c"] * 10)

    @pytest.mark.skipif(**tm.no_cupy())
@given(parameter_strategy, strategies.integers(1, 20),
tm.dataset_strategy)
@settings(deadline=None)
def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
        # We cannot handle an empty dataset yet
assume(len(dataset.y) > 0)
param['tree_method'] = 'gpu_hist'
param = dataset.set_params(param)
result = train_result(param, dataset.get_device_dmat(), num_rounds)
note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(parameter_strategy, strategies.integers(1, 20),
tm.dataset_strategy)
@settings(deadline=None)
def test_external_memory(self, param, num_rounds, dataset):
        # We cannot handle an empty dataset yet
assume(len(dataset.y) > 0)
param['tree_method'] = 'gpu_hist'
param = dataset.set_params(param)
m = dataset.get_external_dmat()
external_result = train_result(param, m, num_rounds)
del m
gc.collect()
        assert tm.non_increasing(external_result['train'][dataset.metric])

    def test_empty_dmatrix_prediction(self):
# FIXME(trivialfis): This should be done with all updaters
kRows = 0
kCols = 100
X = np.empty((kRows, kCols))
y = np.empty((kRows))
dtrain = xgb.DMatrix(X, y)
bst = xgb.train({'verbosity': 2,
'tree_method': 'gpu_hist',
'gpu_id': 0},
dtrain,
verbose_eval=True,
num_boost_round=6,
evals=[(dtrain, 'Train')])
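
        # A model trained on zero rows should predict the default base_score
        # (0.5) for every row.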
kRows = 100
X = np.random.randn(kRows, kCols)
dtest = xgb.DMatrix(X)
predictions = bst.predict(dtest)
        np.testing.assert_allclose(predictions, 0.5, 1e-6)

    @pytest.mark.mgpu
@given(tm.dataset_strategy, strategies.integers(0, 10))
@settings(deadline=None, max_examples=10)
def test_specified_gpu_id_gpu_update(self, dataset, gpu_id):
param = {'tree_method': 'gpu_hist', 'gpu_id': gpu_id}
param = dataset.set_params(param)
result = train_result(param, dataset.get_dmat(), 10)
assert tm.non_increasing(result['train'][dataset.metric])