Use hypothesis (#5759)

* Use hypothesis

* Allow int64 array interface for groups

* Add packages to Windows CI

* Add to travis

* Make sure device index is set correctly

* Fix dask-cudf test

* appveyor
Rory Mitchell 2020-06-16 12:45:59 +12:00 committed by GitHub
parent 02884b08aa
commit b47b5ac771
17 changed files with 411 additions and 439 deletions
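
For readers new to hypothesis, here is a minimal, self-contained sketch of the property-based pattern the rewritten suites below follow. The strategy bounds and the toy dataset are illustrative only; the real strategies (parameter_strategy, tm.dataset_strategy) live in the test files changed by this commit.

```python
# Sketch of the hypothesis-based test pattern adopted in this commit.
# Strategy ranges and the synthetic dataset below are illustrative, not
# the suite's actual strategies.
from hypothesis import given, settings, strategies
import numpy as np
import xgboost as xgb

param_strategy = strategies.fixed_dictionaries({
    'eta': strategies.floats(0.01, 0.5),
    'max_depth': strategies.integers(1, 8),
})


@given(param_strategy, strategies.integers(1, 20))
@settings(deadline=None)  # training time varies too much for the default deadline
def test_training_loss_is_non_increasing(param, num_rounds):
    # Tiny synthetic regression problem so the example is self-contained.
    rng = np.random.RandomState(0)
    X, y = rng.randn(64, 4), rng.randn(64)
    dtrain = xgb.DMatrix(X, y)
    param.update({'objective': 'reg:squarederror', 'eval_metric': 'rmse'})
    result = {}
    xgb.train(param, dtrain, num_rounds, [(dtrain, 'train')],
              evals_result=result, verbose_eval=False)
    rmse = result['train']['rmse']
    # Same weak monotonicity check as tm.non_increasing in the suite.
    assert all(later - earlier < 1e-4 for earlier, later in zip(rmse, rmse[1:]))
```
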

View File

@@ -113,7 +113,7 @@ def TestWin64CPU() {
     """
     echo "Installing Python dependencies..."
     bat """
-    conda activate && conda upgrade scikit-learn pandas numpy
+    conda activate && conda install -y hypothesis && conda upgrade scikit-learn pandas numpy hypothesis
     """
     echo "Running Python tests..."
     bat "conda activate && python -m pytest -v -s --fulltrace tests\\python"
@@ -138,7 +138,7 @@ def TestWin64GPU(args) {
     """
    echo "Installing Python dependencies..."
     bat """
-    conda activate && conda upgrade scikit-learn pandas numpy
+    conda activate && conda install -y hypothesis && conda upgrade scikit-learn pandas numpy hypothesis
     """
     echo "Running Python tests..."
     bat """

View File

@ -44,7 +44,7 @@ install:
- if /i "%DO_PYTHON%" == "on" ( - if /i "%DO_PYTHON%" == "on" (
conda config --set always_yes true && conda config --set always_yes true &&
conda update -q conda && conda update -q conda &&
conda install -y numpy scipy pandas matplotlib pytest scikit-learn graphviz python-graphviz conda install -y numpy scipy pandas matplotlib pytest scikit-learn graphviz python-graphviz hypothesis
) )
- set PATH=C:\Miniconda3-x64\Library\bin\graphviz;%PATH% - set PATH=C:\Miniconda3-x64\Library\bin\graphviz;%PATH%
# R: based on https://github.com/krlmlr/r-appveyor # R: based on https://github.com/krlmlr/r-appveyor

View File

@@ -34,6 +34,30 @@ void CopyInfoImpl(ArrayInterface column, HostDeviceVector<float>* out) {
   });
 }
 
+void CopyGroupInfoImpl(ArrayInterface column, std::vector<bst_group_t>* out) {
+  CHECK(column.type[1] == 'i' || column.type[1] == 'u')
+      << "Expected integer metainfo";
+  auto SetDeviceToPtr = [](void* ptr) {
+    cudaPointerAttributes attr;
+    dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
+    int32_t ptr_device = attr.device;
+    dh::safe_cuda(cudaSetDevice(ptr_device));
+    return ptr_device;
+  };
+  auto ptr_device = SetDeviceToPtr(column.data);
+  dh::TemporaryArray<bst_group_t> temp(column.num_rows);
+  auto d_tmp = temp.data();
+  dh::LaunchN(ptr_device, column.num_rows, [=] __device__(size_t idx) {
+    d_tmp[idx] = column.GetElement(idx);
+  });
+  auto length = column.num_rows;
+  out->resize(length + 1);
+  out->at(0) = 0;
+  thrust::copy(temp.data(), temp.data() + length, out->begin() + 1);
+  std::partial_sum(out->begin(), out->end(), out->begin());
+}
+
 void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
   Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()});
   auto const& j_arr = get<Array>(j_interface);
@@ -53,16 +77,7 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
   } else if (key == "base_margin") {
     CopyInfoImpl(array_interface, &base_margin_);
   } else if (key == "group") {
-    // Ranking is not performed on device.
-    thrust::device_ptr<uint32_t> p_src{
-        reinterpret_cast<uint32_t*>(array_interface.data)};
-    auto length = array_interface.num_rows;
-    group_ptr_.resize(length + 1);
-    group_ptr_[0] = 0;
-    thrust::copy(p_src, p_src + length, group_ptr_.begin() + 1);
-    std::partial_sum(group_ptr_.begin(), group_ptr_.end(), group_ptr_.begin());
+    CopyGroupInfoImpl(array_interface, &group_ptr_);
     return;
   } else {
     LOG(FATAL) << "Unknown metainfo: " << key;
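
The new CopyGroupInfoImpl copies the per-query group sizes out of the device column and converts them into the cumulative group_ptr_ offsets: a leading zero followed by a prefix sum. A small NumPy illustration of that conversion, with hypothetical group sizes:

```python
import numpy as np

# Per-query group sizes as they arrive through the array interface
# (hypothetical example values).
group_sizes = np.array([3, 2, 4], dtype=np.int64)

# group_ptr_ layout: leading 0, then the running total of the sizes.
group_ptr = np.zeros(len(group_sizes) + 1, dtype=np.uint32)
group_ptr[1:] = np.cumsum(group_sizes)

print(group_ptr)  # [0 3 5 9] -> rows 0-2 are query 0, rows 3-4 query 1, rows 5-8 query 2
```
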

View File

@@ -22,7 +22,7 @@ ENV GOSU_VERSION 1.10
 # Install Python packages in default env
 RUN \
     pip install pyyaml cpplint pylint astroid sphinx numpy scipy pandas matplotlib sh \
-                recommonmark guzzle_sphinx_theme mock breathe graphviz \
+                recommonmark guzzle_sphinx_theme mock breathe graphviz hypothesis \
                 pytest scikit-learn wheel kubernetes urllib3 jsonschema boto3 && \
     pip install https://h2o-release.s3.amazonaws.com/datatable/stable/datatable-0.7.0/datatable-0.7.0-cp37-cp37m-linux_x86_64.whl && \
     pip install "dask[complete]"

View File

@@ -19,7 +19,7 @@ ENV PATH=/opt/python/bin:$PATH
 RUN \
     conda create -n cudf_test -c rapidsai -c nvidia -c conda-forge -c defaults \
         python=3.7 cudf cudatoolkit=$CUDA_VERSION dask dask-cuda dask-cudf cupy \
-        numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz
+        numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis
 
 ENV GOSU_VERSION 1.10

View File

@@ -18,7 +18,7 @@ ENV PATH=/opt/python/bin:$PATH
 RUN \
     conda create -n gpu_test -c rapidsai -c nvidia -c conda-forge -c defaults \
         python=3.7 dask dask-cuda numpy pytest scipy scikit-learn pandas \
-        matplotlib wheel python-kubernetes urllib3 graphviz
+        matplotlib wheel python-kubernetes urllib3 graphviz hypothesis
 
 ENV GOSU_VERSION 1.10

View File

@@ -21,7 +21,7 @@ std::string PrepareData(std::string typestr, thrust::device_vector<T>* out, cons
   std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
   column["shape"] = Array(j_shape);
-  column["strides"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(4)))});
+  column["strides"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(sizeof(T))))});
   column["version"] = Integer(static_cast<Integer::Int>(1));
   column["typestr"] = String(typestr);
@@ -78,16 +78,32 @@ TEST(MetaInfo, FromInterface) {
 
 TEST(MetaInfo, Group) {
   cudaSetDevice(0);
-  thrust::device_vector<uint32_t> d_data;
-  std::string str = PrepareData<uint32_t>("<u4", &d_data);
 
   MetaInfo info;
-  info.SetInfo("group", str.c_str());
-  auto const& h_group = info.group_ptr_;
-  ASSERT_EQ(h_group.size(), d_data.size() + 1);
+
+  thrust::device_vector<uint32_t> d_uint;
+  std::string uint_str = PrepareData<uint32_t>("<u4", &d_uint);
+  info.SetInfo("group", uint_str.c_str());
+  auto& h_group = info.group_ptr_;
+  ASSERT_EQ(h_group.size(), d_uint.size() + 1);
   for (size_t i = 1; i < h_group.size(); ++i) {
-    ASSERT_EQ(h_group[i], d_data[i-1] + h_group[i-1]) << "i: " << i;
+    ASSERT_EQ(h_group[i], d_uint[i - 1] + h_group[i - 1]) << "i: " << i;
   }
+
+  thrust::device_vector<int64_t> d_int64;
+  std::string int_str = PrepareData<int64_t>("<i8", &d_int64);
+  info = MetaInfo();
+  info.SetInfo("group", int_str.c_str());
+  h_group = info.group_ptr_;
+  ASSERT_EQ(h_group.size(), d_uint.size() + 1);
+  for (size_t i = 1; i < h_group.size(); ++i) {
+    ASSERT_EQ(h_group[i], d_uint[i - 1] + h_group[i - 1]) << "i: " << i;
+  }
+
+  // Incorrect type
+  thrust::device_vector<float> d_float;
+  std::string float_str = PrepareData<float>("<f4", &d_float);
+  info = MetaInfo();
+  EXPECT_ANY_THROW(info.SetInfo("group", float_str.c_str()));
 }
 }  // namespace xgboost
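
The PrepareData helper above serialises a CUDA-array-interface-style JSON document that SetInfo("group", ...) parses; the new cases exercise an int64 typestr and reject a float one. Roughly, the document it builds looks like the sketch below (row count and device pointer are illustrative):

```python
# Illustrative shape of the array interface document consumed by
# MetaInfo::SetInfo("group", ...). Values are hypothetical.
kRows = 16
group_column = {
    "shape": [kRows],
    "strides": [8],                   # sizeof(int64_t); previously hard-coded to 4
    "version": 1,
    "typestr": "<i8",                 # little-endian 64-bit signed integers
    "data": [0x7f2a40000000, False],  # (device pointer, read-only flag) -- illustrative
}
```
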

View File

@@ -1,30 +1,50 @@
 import sys
-import pytest
-import unittest
-sys.path.append('tests/python/')
-import test_linear  # noqa: E402
-import testing as tm  # noqa: E402
-
-
-class TestGPULinear(unittest.TestCase):
-    datasets = ["Boston", "Digits", "Cancer", "Sparse regression"]
-    common_param = {
-        'booster': ['gblinear'],
-        'updater': ['gpu_coord_descent'],
-        'eta': [0.5],
-        'top_k': [10],
-        'tolerance': [1e-5],
-        'alpha': [.1],
-        'lambda': [0.005],
-        'coordinate_selection': ['cyclic', 'random', 'greedy']}
-
-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_gpu_coordinate(self):
-        parameters = self.common_param.copy()
-        parameters['gpu_id'] = [0]
-        for param in test_linear.parameter_combinations(parameters):
-            results = test_linear.run_suite(
-                param, 100, self.datasets, scale_features=True)
-            test_linear.assert_regression_result(results, 1e-2)
-            test_linear.assert_classification_result(results)
+from hypothesis import strategies, given, settings, assume
+import xgboost as xgb
+sys.path.append("tests/python")
+import testing as tm
+
+
+parameter_strategy = strategies.fixed_dictionaries({
+    'booster': strategies.just('gblinear'),
+    'eta': strategies.floats(0.01, 0.25),
+    'tolerance': strategies.floats(1e-5, 1e-2),
+    'nthread': strategies.integers(1, 4),
+    'feature_selector': strategies.sampled_from(['cyclic', 'shuffle',
+                                                 'greedy', 'thrifty']),
+    'top_k': strategies.integers(1, 10),
+})
+
+
+def train_result(param, dmat, num_rounds):
+    result = {}
+    xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
+              evals_result=result)
+    return result
+
+
+class TestGPULinear:
+    @given(parameter_strategy, strategies.integers(10, 50),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_gpu_coordinate(self, param, num_rounds, dataset):
+        assume(len(dataset.y) > 0)
+        param['updater'] = 'gpu_coord_descent'
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        assert tm.non_increasing(result)
+
+    # Loss is not guaranteed to always decrease because of regularisation parameters
+    # We test a weaker condition that the loss has not increased between the first and last
+    # iteration
+    @given(parameter_strategy, strategies.integers(10, 50),
+           tm.dataset_strategy, strategies.floats(1e-5, 2.0),
+           strategies.floats(1e-5, 2.0))
+    @settings(deadline=None)
+    def test_gpu_coordinate_regularised(self, param, num_rounds, dataset, alpha, lambd):
+        assume(len(dataset.y) > 0)
+        param['updater'] = 'gpu_coord_descent'
+        param['alpha'] = alpha
+        param['lambda'] = lambd
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        assert tm.non_increasing([result[0], result[-1]])

View File

@@ -4,9 +4,13 @@ import unittest
 import numpy as np
 import subprocess
 import os
+import sys
 import json
 import pytest
+
+sys.path.append("tests/python")
+import testing as tm
 
 import xgboost as xgb
 from xgboost import XGBClassifier
@@ -90,7 +94,6 @@ class TestPickling(unittest.TestCase):
         )
         status = subprocess.call(args, env=env)
         assert status == 0
-
         os.remove(model_path)
 
     def test_pickled_predictor(self):

View File

@@ -158,10 +158,10 @@ class TestGPUPredict(unittest.TestCase):
         rows = 1000
         cols = 10
         rng = np.random.RandomState(1994)
+        cp.cuda.runtime.setDevice(0)
         X = rng.randn(rows, cols)
         X = pd.DataFrame(X)
         y = rng.randn(rows)
         X = cudf.from_pandas(X)
         dtrain = xgb.DMatrix(X, y)

View File

@@ -1,74 +1,71 @@
 import numpy as np
 import sys
-import unittest
 import pytest
 import xgboost as xgb
+from hypothesis import given, strategies, assume, settings, note
 sys.path.append("tests/python")
 import testing as tm
-from regression_test_utilities import run_suite, parameter_combinations, \
-    assert_results_non_increasing
-
-
-def assert_gpu_results(cpu_results, gpu_results):
-    for cpu_res, gpu_res in zip(cpu_results, gpu_results):
-        # Check final eval result roughly equivalent
-        assert np.allclose(cpu_res["eval"][-1],
-                           gpu_res["eval"][-1], 1e-1, 1e-1)
-
-
-datasets = ["Boston", "Cancer", "Digits", "Sparse regression",
-            "Sparse regression with weights", "Small weights regression"]
-
-test_param = parameter_combinations({
-    'gpu_id': [0],
-    'max_depth': [2, 8],
-    'max_leaves': [255, 4],
-    'max_bin': [4, 256],
-    'grow_policy': ['lossguide'],
-    'single_precision_histogram': [True],
-    'min_child_weight': [0],
-    'lambda': [0]})
-
-
-class TestGPU(unittest.TestCase):
-    def test_gpu_hist(self):
-        for param in test_param:
-            param['tree_method'] = 'gpu_hist'
-            gpu_results = run_suite(param, select_datasets=datasets)
-            assert_results_non_increasing(gpu_results, 1e-2)
-            param['tree_method'] = 'hist'
-            cpu_results = run_suite(param, select_datasets=datasets)
-            assert_gpu_results(cpu_results, gpu_results)
-
-    @pytest.mark.skipif(**tm.no_cupy())
-    def test_gpu_hist_device_dmatrix(self):
-        # DeviceDMatrix does not currently accept sparse formats
-        device_dmatrix_datasets = ["Boston", "Cancer", "Digits"]
-        for param in test_param:
-            param['tree_method'] = 'gpu_hist'
-            gpu_results_device_dmatrix = run_suite(param, select_datasets=device_dmatrix_datasets,
-                                                   DMatrixT=xgb.DeviceQuantileDMatrix,
-                                                   dmatrix_params={'max_bin': param['max_bin']})
-            assert_results_non_increasing(gpu_results_device_dmatrix, 1e-2)
-            gpu_results = run_suite(param, select_datasets=device_dmatrix_datasets)
-            assert_gpu_results(gpu_results, gpu_results_device_dmatrix)
-
-    # NOTE(rongou): Because the `Boston` dataset is too small, this only tests external memory mode
-    # with a single page. To test multiple pages, set DMatrix::kPageSize to, say, 1024.
-    def test_external_memory(self):
-        for param in reversed(test_param):
-            param['tree_method'] = 'gpu_hist'
-            param['gpu_page_size'] = 1024
-            gpu_results = run_suite(param, select_datasets=["Boston"])
-            assert_results_non_increasing(gpu_results, 1e-2)
-            ext_mem_results = run_suite(param, select_datasets=["Boston External Memory"])
-            assert_results_non_increasing(ext_mem_results, 1e-2)
-            assert_gpu_results(gpu_results, ext_mem_results)
-            break
-
-    def test_with_empty_dmatrix(self):
+
+parameter_strategy = strategies.fixed_dictionaries({
+    'max_depth': strategies.integers(0, 11),
+    'max_leaves': strategies.integers(0, 256),
+    'max_bin': strategies.integers(2, 1024),
+    'grow_policy': strategies.sampled_from(['lossguide', 'depthwise']),
+    'single_precision_histogram': strategies.booleans(),
+    'min_child_weight': strategies.floats(0.5, 2.0),
+    'seed': strategies.integers(0, 10),
+    # We cannot enable subsampling as the training loss can increase
+    # 'subsample': strategies.floats(0.5, 1.0),
+    'colsample_bytree': strategies.floats(0.5, 1.0),
+    'colsample_bylevel': strategies.floats(0.5, 1.0),
+}).filter(lambda x: (x['max_depth'] > 0 or x['max_leaves'] > 0) and (
+    x['max_depth'] > 0 or x['grow_policy'] == 'lossguide'))
+
+
+def train_result(param, dmat, num_rounds):
+    result = {}
+    xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
+              evals_result=result)
+    return result
+
+
+class TestGPUUpdaters:
+    @given(parameter_strategy, strategies.integers(1, 20),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_gpu_hist(self, param, num_rounds, dataset):
+        param['tree_method'] = 'gpu_hist'
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)
+        note(result)
+        assert tm.non_increasing(result['train'][dataset.metric])
+
+    @pytest.mark.skipif(**tm.no_cupy())
+    @given(parameter_strategy, strategies.integers(1, 20),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
+        # We cannot handle empty dataset yet
+        assume(len(dataset.y) > 0)
+        param['tree_method'] = 'gpu_hist'
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_device_dmat(), num_rounds)
+        note(result)
+        assert tm.non_increasing(result['train'][dataset.metric])
+
+    @given(parameter_strategy, strategies.integers(1, 20),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_external_memory(self, param, num_rounds, dataset):
+        # We cannot handle empty dataset yet
+        assume(len(dataset.y) > 0)
+        param['tree_method'] = 'gpu_hist'
+        param = dataset.set_params(param)
+        external_result = train_result(param, dataset.get_external_dmat(), num_rounds)
+        assert tm.non_increasing(external_result['train'][dataset.metric])
+
+    def test_empty_dmatrix_prediction(self):
         # FIXME(trivialfis): This should be done with all updaters
         kRows = 0
         kCols = 100
@@ -94,13 +91,10 @@ class TestGPU(unittest.TestCase):
         np.testing.assert_allclose(predictions, 0.5, 1e-6)
 
     @pytest.mark.mgpu
-    def test_specified_gpu_id_gpu_update(self):
-        variable_param = {'gpu_id': [1],
-                          'max_depth': [8],
-                          'max_leaves': [255, 4],
-                          'max_bin': [2, 64],
-                          'grow_policy': ['lossguide'],
-                          'tree_method': ['gpu_hist']}
-        for param in parameter_combinations(variable_param):
-            gpu_results = run_suite(param, select_datasets=datasets)
-            assert_results_non_increasing(gpu_results, 1e-2)
+    @given(tm.dataset_strategy, strategies.integers(0, 10))
+    @settings(deadline=None, max_examples=10)
+    def test_specified_gpu_id_gpu_update(self, dataset, gpu_id):
+        param = {'tree_method': 'gpu_hist', 'gpu_id': gpu_id}
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), 10)
+        assert tm.non_increasing(result['train'][dataset.metric])

View File

@@ -31,7 +31,8 @@ class TestDistributedGPU(unittest.TestCase):
     def test_dask_dataframe(self):
         with LocalCUDACluster() as cluster:
             with Client(cluster) as client:
-                import cupy
+                import cupy as cp
+                cp.cuda.runtime.setDevice(0)
                 X, y = generate_array()
 
                 X = dd.from_dask_array(X)
@@ -59,8 +60,8 @@ class TestDistributedGPU(unittest.TestCase):
                 single_node = out['booster'].predict(
                     xgboost.DMatrix(X.compute()))
-                cupy.testing.assert_allclose(single_node, predictions)
-                cupy.testing.assert_allclose(single_node, series_predictions)
+                cp.testing.assert_allclose(single_node, predictions)
+                np.testing.assert_allclose(single_node, series_predictions.to_array())
 
                 predt = dxgb.predict(client, out, X)
                 assert isinstance(predt, dd.Series)
@@ -73,7 +74,7 @@ class TestDistributedGPU(unittest.TestCase):
                     is_df,
                     meta=dd.utils.make_meta({'prediction': 'f4'}))
-                cupy.testing.assert_allclose(
+                cp.testing.assert_allclose(
                     predt.values.compute(), single_node)
 
     @pytest.mark.skipif(**tm.no_cupy())
@@ -81,11 +82,12 @@ class TestDistributedGPU(unittest.TestCase):
     def test_dask_array(self):
         with LocalCUDACluster() as cluster:
             with Client(cluster) as client:
-                import cupy
+                import cupy as cp
+                cp.cuda.runtime.setDevice(0)
                 X, y = generate_array()
 
-                X = X.map_blocks(cupy.asarray)
-                y = y.map_blocks(cupy.asarray)
+                X = X.map_blocks(cp.asarray)
+                y = y.map_blocks(cp.asarray)
                 dtrain = dxgb.DaskDMatrix(client, X, y)
                 out = dxgb.train(client, {'tree_method': 'gpu_hist'},
                                  dtrain=dtrain,
@@ -97,11 +99,11 @@ class TestDistributedGPU(unittest.TestCase):
                 single_node = out['booster'].predict(
                     xgboost.DMatrix(X.compute()))
                 np.testing.assert_allclose(single_node, from_dmatrix)
-                device = cupy.cuda.runtime.getDevice()
+                device = cp.cuda.runtime.getDevice()
                 assert device == inplace_predictions.device.id
-                single_node = cupy.array(single_node)
+                single_node = cp.array(single_node)
                 assert device == single_node.device.id
-                cupy.testing.assert_allclose(
+                cp.testing.assert_allclose(
                     single_node,
                     inplace_predictions)

View File

@@ -1,198 +0,0 @@
import glob
import itertools as it
import numpy as np
import os
import sys
import xgboost as xgb
from joblib import Memory
memory = Memory('./cachedir', verbose=0)
try:
from sklearn import datasets
from sklearn.preprocessing import scale
except ImportError:
None
class Dataset:
def __init__(self, name, get_dataset, objective, metric,
has_weights=False, use_external_memory=False):
self.name = name
self.objective = objective
self.metric = metric
if has_weights:
self.X, self.y, self.w = get_dataset()
else:
self.X, self.y = get_dataset()
self.w = None
self.use_external_memory = use_external_memory
def __str__(self):
a = 'name: {name}\nobjective:{objective}, metric:{metric}, '.format(
name=self.name,
objective=self.objective,
metric=self.metric)
b = 'external memory:{use_external_memory}\n'.format(
use_external_memory=self.use_external_memory
)
return a + b
def __repr__(self):
return self.__str__()
@memory.cache
def get_boston():
data = datasets.load_boston()
return data.data, data.target
@memory.cache
def get_digits():
data = datasets.load_digits()
return data.data, data.target
@memory.cache
def get_cancer():
data = datasets.load_breast_cancer()
return data.data, data.target
@memory.cache
def get_sparse():
rng = np.random.RandomState(199)
n = 2000
sparsity = 0.75
X, y = datasets.make_regression(n, random_state=rng)
flag = rng.binomial(1, sparsity, X.shape)
for i in range(X.shape[0]):
for j in range(X.shape[1]):
if flag[i, j]:
X[i, j] = 0.0
from scipy import sparse
X = sparse.csr_matrix(X)
return X, y
def get_sparse_weights():
return get_weights_regression(1, 10)
def get_small_weights():
return get_weights_regression(1e-6, 1e-5)
@memory.cache
def get_weights_regression(min_weight, max_weight):
rng = np.random.RandomState(199)
n = 2000
sparsity = 0.25
X, y = datasets.make_regression(n, random_state=rng)
flag = rng.binomial(1, sparsity, X.shape)
for i in range(X.shape[0]):
for j in range(X.shape[1]):
if flag[i, j]:
X[i, j] = np.nan
w = rng.uniform(min_weight, max_weight, n)
return X, y, w
def train_dataset(dataset, param_in, num_rounds=10, scale_features=False, DMatrixT=xgb.DMatrix,
dmatrix_params={}):
param = param_in.copy()
param["objective"] = dataset.objective
if dataset.objective == "multi:softmax":
param["num_class"] = int(np.max(dataset.y) + 1)
param["eval_metric"] = dataset.metric
if scale_features:
X = scale(dataset.X, with_mean=isinstance(dataset.X, np.ndarray))
else:
X = dataset.X
if dataset.use_external_memory:
np.savetxt('tmptmp_1234.csv', np.hstack((dataset.y.reshape(len(dataset.y), 1), X)),
delimiter=',')
dtrain = DMatrixT('tmptmp_1234.csv?format=csv&label_column=0#tmptmp_',
weight=dataset.w)
elif DMatrixT is xgb.DeviceQuantileDMatrix:
import cupy as cp
dtrain = DMatrixT(cp.array(X), cp.array(dataset.y),
weight=None if dataset.w is None else cp.array(dataset.w),
**dmatrix_params)
else:
dtrain = DMatrixT(X, dataset.y, weight=dataset.w, **dmatrix_params)
print("Training on dataset: " + dataset.name, file=sys.stderr)
print("Using parameters: " + str(param), file=sys.stderr)
res = {}
bst = xgb.train(param, dtrain, num_rounds, [(dtrain, 'train')],
evals_result=res, verbose_eval=False)
# Free the booster and dmatrix so we can delete temporary files
bst_copy = bst.copy()
del bst
del dtrain
# Cleanup temporary files
if dataset.use_external_memory:
for f in glob.glob("tmptmp_*"):
os.remove(f)
return {"dataset": dataset, "bst": bst_copy, "param": param.copy(),
"eval": res['train'][dataset.metric]}
def parameter_combinations(variable_param):
"""
Enumerate all possible combinations of parameters
"""
result = []
names = sorted(variable_param)
combinations = it.product(*(variable_param[Name] for Name in names))
for set in combinations:
param = {}
for i, name in enumerate(names):
param[name] = set[i]
result.append(param)
return result
def run_suite(param, num_rounds=10, select_datasets=None, scale_features=False,
DMatrixT=xgb.DMatrix, dmatrix_params={}):
"""
Run the given parameters on a range of datasets. Objective and eval metric will be
automatically set
"""
datasets = [
Dataset("Boston", get_boston, "reg:squarederror", "rmse"),
Dataset("Digits", get_digits, "multi:softmax", "mlogloss"),
Dataset("Cancer", get_cancer, "binary:logistic", "logloss"),
Dataset("Sparse regression", get_sparse, "reg:squarederror", "rmse"),
Dataset("Sparse regression with weights", get_sparse_weights,
"reg:squarederror", "rmse", has_weights=True),
Dataset("Small weights regression", get_small_weights,
"reg:squarederror", "rmse", has_weights=True),
Dataset("Boston External Memory", get_boston,
"reg:squarederror", "rmse",
use_external_memory=True)
]
results = [
]
for d in datasets:
if select_datasets is None or d.name in select_datasets:
results.append(
train_dataset(d, param, num_rounds=num_rounds, scale_features=scale_features,
DMatrixT=DMatrixT, dmatrix_params=dmatrix_params))
return results
def non_increasing(L, tolerance):
return all((y - x) < tolerance for x, y in zip(L, L[1:]))
def assert_results_non_increasing(results, tolerance=1e-5):
for r in results:
assert non_increasing(r['eval'], tolerance), r

View File

@@ -1,87 +1,72 @@
-import numpy as np
 import testing as tm
-import unittest
-import pytest
+from hypothesis import strategies, given, settings, note
 import xgboost as xgb
 
-try:
-    from sklearn.linear_model import ElasticNet
-    from sklearn.preprocessing import scale
-    from regression_test_utilities import run_suite, parameter_combinations
-except ImportError:
-    None
-
-
-def is_float(s):
-    try:
-        float(s)
-        return 1
-    except ValueError:
-        return 0
-
-
-def xgb_get_weights(bst):
-    return np.array([float(s) for s in bst.get_dump()[0].split() if
-                     is_float(s)])
-
-
-def assert_regression_result(results, tol):
-    regression_results = [r for r in results if
-                          r["param"]["objective"] == "reg:squarederror"]
-    for res in regression_results:
-        X = scale(res["dataset"].X,
-                  with_mean=isinstance(res["dataset"].X, np.ndarray))
-        y = res["dataset"].y
-        reg_alpha = res["param"]["alpha"]
-        reg_lambda = res["param"]["lambda"]
-        pred = res["bst"].predict(xgb.DMatrix(X))
-        weights = xgb_get_weights(res["bst"])[1:]
-        enet = ElasticNet(alpha=reg_alpha + reg_lambda,
-                          l1_ratio=reg_alpha / (reg_alpha + reg_lambda))
-        enet.fit(X, y)
-        enet_pred = enet.predict(X)
-        assert np.isclose(weights, enet.coef_, rtol=tol,
-                          atol=tol).all(), (weights, enet.coef_)
-        assert np.isclose(enet_pred, pred, rtol=tol, atol=tol).all(), (
-            res["dataset"].name, enet_pred[:5], pred[:5])
-
-
-# TODO: More robust classification tests
-def assert_classification_result(results):
-    classification_results = [r for r in results if
-                              r["param"]["objective"] != "reg:squarederror"]
-    for res in classification_results:
-        # Check accuracy is reasonable
-        assert res["eval"][-1] < 2.0, (res["dataset"].name, res["eval"][-1])
-
-
-class TestLinear(unittest.TestCase):
-    datasets = ["Boston", "Digits", "Cancer", "Sparse regression",
-                "Boston External Memory"]
-
-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_coordinate(self):
-        variable_param = {'booster': ['gblinear'], 'updater':
-                          ['coord_descent'], 'eta': [0.5], 'top_k':
-                          [10], 'tolerance': [1e-5], 'nthread': [2],
-                          'alpha': [.005, .1], 'lambda': [.005],
-                          'feature_selector': ['cyclic', 'shuffle',
-                                               'greedy', 'thrifty']}
-        for param in parameter_combinations(variable_param):
-            results = run_suite(param, 150, self.datasets, scale_features=True)
-            assert_regression_result(results, 1e-2)
-            assert_classification_result(results)
-
-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_shotgun(self):
-        variable_param = {'booster': ['gblinear'], 'updater':
-                          ['shotgun'], 'eta': [0.5], 'top_k': [10],
-                          'tolerance': [1e-5], 'nthread': [2],
-                          'alpha': [.005, .1], 'lambda': [.005],
-                          'feature_selector': ['cyclic', 'shuffle']}
-        for param in parameter_combinations(variable_param):
-            results = run_suite(param, 200, self.datasets, True)
-            assert_regression_result(results, 1e-2)
-            assert_classification_result(results)
+
+parameter_strategy = strategies.fixed_dictionaries({
+    'booster': strategies.just('gblinear'),
+    'eta': strategies.floats(0.01, 0.25),
+    'tolerance': strategies.floats(1e-5, 1e-2),
+    'nthread': strategies.integers(1, 4),
+})
+
+coord_strategy = strategies.fixed_dictionaries({
+    'feature_selector': strategies.sampled_from(['cyclic', 'shuffle',
+                                                 'greedy', 'thrifty']),
+    'top_k': strategies.integers(1, 10),
+})
+
+
+def train_result(param, dmat, num_rounds):
+    result = {}
+    xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
+              evals_result=result)
+    return result
+
+
+class TestLinear:
+    @given(parameter_strategy, strategies.integers(10, 50),
+           tm.dataset_strategy, coord_strategy)
+    @settings(deadline=None)
+    def test_coordinate(self, param, num_rounds, dataset, coord_param):
+        param['updater'] = 'coord_descent'
+        param.update(coord_param)
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        assert tm.non_increasing(result)
+
+    # Loss is not guaranteed to always decrease because of regularisation parameters
+    # We test a weaker condition that the loss has not increased between the first and last
+    # iteration
+    @given(parameter_strategy, strategies.integers(10, 50),
+           tm.dataset_strategy, coord_strategy, strategies.floats(1e-5, 2.0),
+           strategies.floats(1e-5, 2.0))
+    @settings(deadline=None)
+    def test_coordinate_regularised(self, param, num_rounds, dataset, coord_param, alpha, lambd):
+        param['updater'] = 'coord_descent'
+        param['alpha'] = alpha
+        param['lambda'] = lambd
+        param.update(coord_param)
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        assert tm.non_increasing([result[0], result[-1]])
+
+    @given(parameter_strategy, strategies.integers(10, 50),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_shotgun(self, param, num_rounds, dataset):
+        param['updater'] = 'shotgun'
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        assert tm.non_increasing(result)
+
+    @given(parameter_strategy, strategies.integers(10, 50),
+           tm.dataset_strategy, strategies.floats(1e-5, 2.0),
+           strategies.floats(1e-5, 2.0))
+    @settings(deadline=None)
+    def test_shotgun_regularised(self, param, num_rounds, dataset, alpha, lambd):
+        param['updater'] = 'shotgun'
+        param['alpha'] = alpha
+        param['lambda'] = lambd
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        assert tm.non_increasing([result[0], result[-1]])

View File

@@ -3,28 +3,57 @@ import unittest
 import pytest
 import xgboost as xgb
 import numpy as np
+from hypothesis import given, strategies, settings, note
 
-try:
-    from regression_test_utilities import run_suite, parameter_combinations, \
-        assert_results_non_increasing
-except ImportError:
-    None
+exact_parameter_strategy = strategies.fixed_dictionaries({
+    'nthread': strategies.integers(1, 4),
+    'max_depth': strategies.integers(1, 11),
+    'min_child_weight': strategies.floats(0.5, 2.0),
+    'alpha': strategies.floats(0.0, 2.0),
+    'lambda': strategies.floats(1e-5, 2.0),
+    'eta': strategies.floats(0.01, 0.5),
+    'gamma': strategies.floats(0.0, 2.0),
+    'seed': strategies.integers(0, 10),
+    # We cannot enable subsampling as the training loss can increase
+    # 'subsample': strategies.floats(0.5, 1.0),
+    'colsample_bytree': strategies.floats(0.5, 1.0),
+    'colsample_bylevel': strategies.floats(0.5, 1.0),
+})
+
+hist_parameter_strategy = strategies.fixed_dictionaries({
+    'max_depth': strategies.integers(1, 11),
+    'max_leaves': strategies.integers(0, 1024),
+    'max_bin': strategies.integers(2, 512),
+    'grow_policy': strategies.sampled_from(['lossguide', 'depthwise']),
+}).filter(lambda x: (x['max_depth'] > 0 or x['max_leaves'] > 0) and (
+    x['max_depth'] > 0 or x['grow_policy'] == 'lossguide'))
+
+
+def train_result(param, dmat, num_rounds):
+    result = {}
+    xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
+              evals_result=result)
+    return result
 
-class TestUpdaters(unittest.TestCase):
-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_histmaker(self):
-        variable_param = {'updater': ['grow_histmaker'], 'max_depth': [2, 8]}
-        for param in parameter_combinations(variable_param):
-            result = run_suite(param)
-            assert_results_non_increasing(result, 1e-2)
-
-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_colmaker(self):
-        variable_param = {'updater': ['grow_colmaker'], 'max_depth': [2, 8]}
-        for param in parameter_combinations(variable_param):
-            result = run_suite(param)
-            assert_results_non_increasing(result, 1e-2)
+
+class TestTreeMethod(unittest.TestCase):
+    @given(exact_parameter_strategy, strategies.integers(1, 20),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_exact(self, param, num_rounds, dataset):
+        param['tree_method'] = 'exact'
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)
+        assert tm.non_increasing(result['train'][dataset.metric])
+
+    @given(exact_parameter_strategy, strategies.integers(1, 20),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_approx(self, param, num_rounds, dataset):
+        param['tree_method'] = 'approx'
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)
+        assert tm.non_increasing(result['train'][dataset.metric], 1e-3)
 
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_pruner(self):
@@ -50,19 +79,18 @@ class TestUpdaters(unittest.TestCase):
         # Second prune should not change the tree
         assert after_prune == second_prune
 
-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_fast_histmaker(self):
-        variable_param = {'tree_method': ['hist'],
-                          'max_depth': [2, 8],
-                          'max_bin': [2, 256],
-                          'grow_policy': ['depthwise', 'lossguide'],
-                          'max_leaves': [64, 0],
-                          'verbosity': [0],
-                          'single_precision_histogram': [True, False]}
-        for param in parameter_combinations(variable_param):
-            result = run_suite(param)
-            assert_results_non_increasing(result, 1e-2)
+    @given(exact_parameter_strategy, hist_parameter_strategy, strategies.integers(1, 20),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_hist(self, param, hist_param, num_rounds, dataset):
+        param['tree_method'] = 'hist'
+        param = dataset.set_params(param)
+        param.update(hist_param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)
+        note(result)
+        assert tm.non_increasing(result['train'][dataset.metric])
 
     def test_hist_categorical(self):
         # hist must be same as exact on all-categorial data
         dpath = 'demo/data/'
         ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
@@ -87,7 +115,7 @@ class TestUpdaters(unittest.TestCase):
         assert hist_res['test']['auc'] == exact_res['test']['auc']
 
     @pytest.mark.skipif(**tm.no_sklearn())
-    def test_fast_histmaker_degenerate_case(self):
+    def test_hist_degenerate_case(self):
         # Test a degenerate case where the quantile sketcher won't return any
         # quantile points for a particular feature (the second feature in
         # this example). Source: https://github.com/dmlc/xgboost/issues/2943

View File

@@ -1,6 +1,19 @@
 # coding: utf-8
 from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
 from xgboost.compat import DASK_INSTALLED
+from hypothesis import strategies
+from hypothesis.extra.numpy import arrays
+from joblib import Memory
+from sklearn import datasets
+import xgboost as xgb
+import numpy as np
+
+try:
+    import cupy as cp
+except ImportError:
+    cp = None
+
+memory = Memory('./cachedir', verbose=0)
 
 
 def no_sklearn():
@@ -81,3 +94,97 @@ def no_json_schema():
         return {'condition': False, 'reason': reason}
     except ImportError:
         return {'condition': True, 'reason': reason}
+
+
+# Contains a dataset in numpy format as well as the relevant objective and metric
+class TestDataset:
+    def __init__(self, name, get_dataset, objective, metric
+                 ):
+        self.name = name
+        self.objective = objective
+        self.metric = metric
+        self.X, self.y = get_dataset()
+        self.w = None
+
+    def set_params(self, params_in):
+        params_in['objective'] = self.objective
+        params_in['eval_metric'] = self.metric
+        if self.objective == "multi:softmax":
+            params_in["num_class"] = int(np.max(self.y) + 1)
+        return params_in
+
+    def get_dmat(self):
+        return xgb.DMatrix(self.X, self.y, self.w)
+
+    def get_device_dmat(self):
+        w = None if self.w is None else cp.array(self.w)
+        X = cp.array(self.X, dtype=np.float32)
+        y = cp.array(self.y, dtype=np.float32)
+        return xgb.DeviceQuantileDMatrix(X, y, w)
+
+    def get_external_dmat(self):
+        np.savetxt('tmptmp_1234.csv', np.hstack((self.y.reshape(len(self.y), 1), self.X)),
+                   delimiter=',')
+        return xgb.DMatrix('tmptmp_1234.csv?format=csv&label_column=0#tmptmp_',
+                           weight=self.w)
+
+    def __repr__(self):
+        return self.name
+
+
+@memory.cache
+def get_boston():
+    data = datasets.load_boston()
+    return data.data, data.target
+
+
+@memory.cache
+def get_digits():
+    data = datasets.load_digits()
+    return data.data, data.target
+
+
+@memory.cache
+def get_cancer():
+    data = datasets.load_breast_cancer()
+    return data.data, data.target
+
+
+@memory.cache
+def get_sparse():
+    rng = np.random.RandomState(199)
+    n = 2000
+    sparsity = 0.75
+    X, y = datasets.make_regression(n, random_state=rng)
+    flag = rng.binomial(1, sparsity, X.shape)
+    for i in range(X.shape[0]):
+        for j in range(X.shape[1]):
+            if flag[i, j]:
+                X[i, j] = np.nan
+    return X, y
+
+
+_unweighted_datasets_strategy = strategies.sampled_from(
+    [TestDataset('boston', get_boston, 'reg:squarederror', 'rmse'),
+     TestDataset('digits', get_digits, 'multi:softmax', 'mlogloss'),
+     TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
+     TestDataset
+     ("sparse", get_sparse, "reg:squarederror", "rmse"),
+     TestDataset("empty", lambda: (np.empty((0, 100)), np.empty(0)), "reg:squarederror",
+                 "rmse")])
+
+
+@strategies.composite
+def _dataset_and_weight(draw):
+    data = draw(_unweighted_datasets_strategy)
+    if draw(strategies.booleans()):
+        data.w = draw(arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0)))
+    return data
+
+
+# A strategy for drawing from a set of example datasets
+# May add random weights to the dataset
+dataset_strategy = _dataset_and_weight()
+
+
+def non_increasing(L, tolerance=1e-4):
+    return all((y - x) < tolerance for x, y in zip(L, L[1:]))

View File

@@ -39,7 +39,7 @@ if [ ${TASK} == "python_test" ]; then
     # Run unit tests
     cd ..
     python -m pip install graphviz pytest pytest-cov codecov
-    python -m pip install datatable
+    python -m pip install datatable hypothesis
     python -m pip install numpy scipy pandas matplotlib scikit-learn dask[complete]
     python -m pytest -v --fulltrace -s tests/python --cov=python-package/xgboost || exit -1
     codecov
codecov codecov