Use hypothesis (#5759)
* Use hypothesis
* Allow int64 array interface for groups
* Add packages to Windows CI
* Add to travis
* Make sure device index is set correctly
* Fix dask-cudf test
* appveyor
This commit is contained in:
parent 02884b08aa
commit b47b5ac771
@@ -113,7 +113,7 @@ def TestWin64CPU() {
     """
     echo "Installing Python dependencies..."
     bat """
-    conda activate && conda upgrade scikit-learn pandas numpy
+    conda activate && conda install -y hypothesis && conda upgrade scikit-learn pandas numpy hypothesis
     """
     echo "Running Python tests..."
     bat "conda activate && python -m pytest -v -s --fulltrace tests\\python"
@@ -138,7 +138,7 @@ def TestWin64GPU(args) {
     """
     echo "Installing Python dependencies..."
     bat """
-    conda activate && conda upgrade scikit-learn pandas numpy
+    conda activate && conda install -y hypothesis && conda upgrade scikit-learn pandas numpy hypothesis
     """
     echo "Running Python tests..."
     bat """
@@ -44,7 +44,7 @@ install:
  - if /i "%DO_PYTHON%" == "on" (
      conda config --set always_yes true &&
      conda update -q conda &&
-     conda install -y numpy scipy pandas matplotlib pytest scikit-learn graphviz python-graphviz
+     conda install -y numpy scipy pandas matplotlib pytest scikit-learn graphviz python-graphviz hypothesis
    )
  - set PATH=C:\Miniconda3-x64\Library\bin\graphviz;%PATH%
  # R: based on https://github.com/krlmlr/r-appveyor
@@ -34,6 +34,30 @@ void CopyInfoImpl(ArrayInterface column, HostDeviceVector<float>* out) {
   });
 }
 
+void CopyGroupInfoImpl(ArrayInterface column, std::vector<bst_group_t>* out) {
+  CHECK(column.type[1] == 'i' || column.type[1] == 'u')
+      << "Expected integer metainfo";
+  auto SetDeviceToPtr = [](void* ptr) {
+    cudaPointerAttributes attr;
+    dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
+    int32_t ptr_device = attr.device;
+    dh::safe_cuda(cudaSetDevice(ptr_device));
+    return ptr_device;
+  };
+  auto ptr_device = SetDeviceToPtr(column.data);
+  dh::TemporaryArray<bst_group_t> temp(column.num_rows);
+  auto d_tmp = temp.data();
+
+  dh::LaunchN(ptr_device, column.num_rows, [=] __device__(size_t idx) {
+    d_tmp[idx] = column.GetElement(idx);
+  });
+  auto length = column.num_rows;
+  out->resize(length + 1);
+  out->at(0) = 0;
+  thrust::copy(temp.data(), temp.data() + length, out->begin() + 1);
+  std::partial_sum(out->begin(), out->end(), out->begin());
+}
+
 void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
   Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()});
   auto const& j_arr = get<Array>(j_interface);
@@ -53,16 +77,7 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
   } else if (key == "base_margin") {
     CopyInfoImpl(array_interface, &base_margin_);
   } else if (key == "group") {
-    // Ranking is not performed on device.
-    thrust::device_ptr<uint32_t> p_src{
-        reinterpret_cast<uint32_t*>(array_interface.data)};
-
-    auto length = array_interface.num_rows;
-    group_ptr_.resize(length + 1);
-    group_ptr_[0] = 0;
-    thrust::copy(p_src, p_src + length, group_ptr_.begin() + 1);
-    std::partial_sum(group_ptr_.begin(), group_ptr_.end(), group_ptr_.begin());
-
+    CopyGroupInfoImpl(array_interface, &group_ptr_);
     return;
   } else {
     LOG(FATAL) << "Unknown metainfo: " << key;
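What the new CopyGroupInfoImpl stores is worth spelling out: the array interface hands over per-query group sizes, and the trailing thrust::copy plus std::partial_sum turn them into cumulative row offsets with a leading zero, which is the layout group_ptr_ uses. A minimal NumPy sketch of that transformation (the sizes below are made up for illustration):

    import numpy as np

    group_sizes = np.array([3, 2, 4])        # one size per query group (illustrative)
    group_ptr = np.zeros(len(group_sizes) + 1, dtype=np.uint32)
    group_ptr[1:] = group_sizes              # mirrors the thrust::copy into out[1:]
    group_ptr = np.cumsum(group_ptr)         # mirrors std::partial_sum over out
    print(group_ptr)                         # [0 3 5 9]: group i spans rows ptr[i]..ptr[i+1]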
@@ -22,7 +22,7 @@ ENV GOSU_VERSION 1.10
 # Install Python packages in default env
 RUN \
     pip install pyyaml cpplint pylint astroid sphinx numpy scipy pandas matplotlib sh \
-                recommonmark guzzle_sphinx_theme mock breathe graphviz \
+                recommonmark guzzle_sphinx_theme mock breathe graphviz hypothesis \
                 pytest scikit-learn wheel kubernetes urllib3 jsonschema boto3 && \
     pip install https://h2o-release.s3.amazonaws.com/datatable/stable/datatable-0.7.0/datatable-0.7.0-cp37-cp37m-linux_x86_64.whl && \
     pip install "dask[complete]"
@@ -19,7 +19,7 @@ ENV PATH=/opt/python/bin:$PATH
 RUN \
     conda create -n cudf_test -c rapidsai -c nvidia -c conda-forge -c defaults \
         python=3.7 cudf cudatoolkit=$CUDA_VERSION dask dask-cuda dask-cudf cupy \
-        numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz
+        numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis
 
 ENV GOSU_VERSION 1.10
 
@@ -18,7 +18,7 @@ ENV PATH=/opt/python/bin:$PATH
 RUN \
     conda create -n gpu_test -c rapidsai -c nvidia -c conda-forge -c defaults \
         python=3.7 dask dask-cuda numpy pytest scipy scikit-learn pandas \
-        matplotlib wheel python-kubernetes urllib3 graphviz
+        matplotlib wheel python-kubernetes urllib3 graphviz hypothesis
 
 ENV GOSU_VERSION 1.10
 
@@ -21,7 +21,7 @@ std::string PrepareData(std::string typestr, thrust::device_vector<T>* out, cons
 
   std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
   column["shape"] = Array(j_shape);
-  column["strides"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(4)))});
+  column["strides"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(sizeof(T))))});
   column["version"] = Integer(static_cast<Integer::Int>(1));
   column["typestr"] = String(typestr);
 
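The strides fix above matters because array-interface strides are measured in bytes: the hard-coded 4 was only valid for 4-byte element types and would mis-describe the int64 group arrays this commit starts accepting, while sizeof(T) is correct for every instantiation. NumPy's own array interface shows the same convention:

    import numpy as np

    a = np.arange(5, dtype=np.int64)
    print(a.__array_interface__['typestr'])  # '<i8': little-endian signed integer, 8 bytes
    print(a.strides)                         # (8,): stride in bytes equals sizeof(int64)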
@@ -78,16 +78,32 @@ TEST(MetaInfo, FromInterface) {
 
 TEST(MetaInfo, Group) {
   cudaSetDevice(0);
-  thrust::device_vector<uint32_t> d_data;
-  std::string str = PrepareData<uint32_t>("<u4", &d_data);
 
   MetaInfo info;
 
-  info.SetInfo("group", str.c_str());
-  auto const& h_group = info.group_ptr_;
-  ASSERT_EQ(h_group.size(), d_data.size() + 1);
+  thrust::device_vector<uint32_t> d_uint;
+  std::string uint_str = PrepareData<uint32_t>("<u4", &d_uint);
+  info.SetInfo("group", uint_str.c_str());
+  auto& h_group = info.group_ptr_;
+  ASSERT_EQ(h_group.size(), d_uint.size() + 1);
   for (size_t i = 1; i < h_group.size(); ++i) {
-    ASSERT_EQ(h_group[i], d_data[i-1] + h_group[i-1]) << "i: " << i;
+    ASSERT_EQ(h_group[i], d_uint[i - 1] + h_group[i - 1]) << "i: " << i;
   }
+
+  thrust::device_vector<int64_t> d_int64;
+  std::string int_str = PrepareData<int64_t>("<i8", &d_int64);
+  info = MetaInfo();
+  info.SetInfo("group", int_str.c_str());
+  h_group = info.group_ptr_;
+  ASSERT_EQ(h_group.size(), d_uint.size() + 1);
+  for (size_t i = 1; i < h_group.size(); ++i) {
+    ASSERT_EQ(h_group[i], d_uint[i - 1] + h_group[i - 1]) << "i: " << i;
+  }
+
+  // Incorrect type
+  thrust::device_vector<float> d_float;
+  std::string float_str = PrepareData<float>("<f4", &d_float);
+  info = MetaInfo();
+  EXPECT_ANY_THROW(info.SetInfo("group", float_str.c_str()));
 }
 } // namespace xgboost
@@ -1,30 +1,50 @@
 import sys
-import pytest
-import unittest
-
-sys.path.append('tests/python/')
-import test_linear  # noqa: E402
-import testing as tm  # noqa: E402
-
-
-class TestGPULinear(unittest.TestCase):
-    datasets = ["Boston", "Digits", "Cancer", "Sparse regression"]
-    common_param = {
-        'booster': ['gblinear'],
-        'updater': ['gpu_coord_descent'],
-        'eta': [0.5],
-        'top_k': [10],
-        'tolerance': [1e-5],
-        'alpha': [.1],
-        'lambda': [0.005],
-        'coordinate_selection': ['cyclic', 'random', 'greedy']}
-
-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_gpu_coordinate(self):
-        parameters = self.common_param.copy()
-        parameters['gpu_id'] = [0]
-        for param in test_linear.parameter_combinations(parameters):
-            results = test_linear.run_suite(
-                param, 100, self.datasets, scale_features=True)
-            test_linear.assert_regression_result(results, 1e-2)
-            test_linear.assert_classification_result(results)
+from hypothesis import strategies, given, settings, assume
+import xgboost as xgb
+sys.path.append("tests/python")
+import testing as tm
+
+parameter_strategy = strategies.fixed_dictionaries({
+    'booster': strategies.just('gblinear'),
+    'eta': strategies.floats(0.01, 0.25),
+    'tolerance': strategies.floats(1e-5, 1e-2),
+    'nthread': strategies.integers(1, 4),
+    'feature_selector': strategies.sampled_from(['cyclic', 'shuffle',
+                                                 'greedy', 'thrifty']),
+    'top_k': strategies.integers(1, 10),
+})
+
+
+def train_result(param, dmat, num_rounds):
+    result = {}
+    xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
+              evals_result=result)
+    return result
+
+
+class TestGPULinear:
+    @given(parameter_strategy, strategies.integers(10, 50),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_gpu_coordinate(self, param, num_rounds, dataset):
+        assume(len(dataset.y) > 0)
+        param['updater'] = 'gpu_coord_descent'
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        assert tm.non_increasing(result)
+
+    # Loss is not guaranteed to always decrease because of regularisation parameters
+    # We test a weaker condition that the loss has not increased between the first and last
+    # iteration
+    @given(parameter_strategy, strategies.integers(10, 50),
+           tm.dataset_strategy, strategies.floats(1e-5, 2.0),
+           strategies.floats(1e-5, 2.0))
+    @settings(deadline=None)
+    def test_gpu_coordinate_regularised(self, param, num_rounds, dataset, alpha, lambd):
+        assume(len(dataset.y) > 0)
+        param['updater'] = 'gpu_coord_descent'
+        param['alpha'] = alpha
+        param['lambda'] = lambd
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        assert tm.non_increasing([result[0], result[-1]])
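For readers unfamiliar with Hypothesis: strategies.fixed_dictionaries draws one value per key, @given reruns the test body across many such draws, and assume discards draws that fail a precondition (the empty dataset above). A self-contained sketch of the pattern, with hypothetical bounds:

    from hypothesis import assume, given, settings, strategies

    params = strategies.fixed_dictionaries({
        'eta': strategies.floats(0.01, 0.25),
        'top_k': strategies.integers(1, 10),
    })

    @given(params, strategies.integers(10, 50))
    @settings(max_examples=20)
    def test_draws_are_well_formed(param, num_rounds):
        assume(num_rounds % 2 == 0)      # odd draws are discarded, not failed
        assert 0.01 <= param['eta'] <= 0.25
        assert 1 <= param['top_k'] <= 10

    test_draws_are_well_formed()         # a @given-wrapped test runs the search when called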
@@ -4,9 +4,13 @@ import unittest
 import numpy as np
 import subprocess
 import os
+import sys
 import json
 import pytest
 
+sys.path.append("tests/python")
+import testing as tm
+
 import xgboost as xgb
 from xgboost import XGBClassifier
 
@@ -90,7 +94,6 @@ class TestPickling(unittest.TestCase):
         )
         status = subprocess.call(args, env=env)
         assert status == 0
-
         os.remove(model_path)
 
     def test_pickled_predictor(self):
@@ -158,10 +158,10 @@ class TestGPUPredict(unittest.TestCase):
         rows = 1000
         cols = 10
         rng = np.random.RandomState(1994)
+        cp.cuda.runtime.setDevice(0)
         X = rng.randn(rows, cols)
         X = pd.DataFrame(X)
         y = rng.randn(rows)
-
         X = cudf.from_pandas(X)
 
         dtrain = xgb.DMatrix(X, y)
@@ -1,74 +1,71 @@
 import numpy as np
 import sys
-import unittest
 import pytest
 import xgboost as xgb
+from hypothesis import given, strategies, assume, settings, note
 
 sys.path.append("tests/python")
 import testing as tm
-from regression_test_utilities import run_suite, parameter_combinations, \
-    assert_results_non_increasing
-
-
-def assert_gpu_results(cpu_results, gpu_results):
-    for cpu_res, gpu_res in zip(cpu_results, gpu_results):
-        # Check final eval result roughly equivalent
-        assert np.allclose(cpu_res["eval"][-1],
-                           gpu_res["eval"][-1], 1e-1, 1e-1)
-
-
-datasets = ["Boston", "Cancer", "Digits", "Sparse regression",
-            "Sparse regression with weights", "Small weights regression"]
-
-test_param = parameter_combinations({
-    'gpu_id': [0],
-    'max_depth': [2, 8],
-    'max_leaves': [255, 4],
-    'max_bin': [4, 256],
-    'grow_policy': ['lossguide'],
-    'single_precision_histogram': [True],
-    'min_child_weight': [0],
-    'lambda': [0]})
-
-
-class TestGPU(unittest.TestCase):
-    def test_gpu_hist(self):
-        for param in test_param:
-            param['tree_method'] = 'gpu_hist'
-            gpu_results = run_suite(param, select_datasets=datasets)
-            assert_results_non_increasing(gpu_results, 1e-2)
-            param['tree_method'] = 'hist'
-            cpu_results = run_suite(param, select_datasets=datasets)
-            assert_gpu_results(cpu_results, gpu_results)
+
+parameter_strategy = strategies.fixed_dictionaries({
+    'max_depth': strategies.integers(0, 11),
+    'max_leaves': strategies.integers(0, 256),
+    'max_bin': strategies.integers(2, 1024),
+    'grow_policy': strategies.sampled_from(['lossguide', 'depthwise']),
+    'single_precision_histogram': strategies.booleans(),
+    'min_child_weight': strategies.floats(0.5, 2.0),
+    'seed': strategies.integers(0, 10),
+    # We cannot enable subsampling as the training loss can increase
+    # 'subsample': strategies.floats(0.5, 1.0),
+    'colsample_bytree': strategies.floats(0.5, 1.0),
+    'colsample_bylevel': strategies.floats(0.5, 1.0),
+}).filter(lambda x: (x['max_depth'] > 0 or x['max_leaves'] > 0) and (
+    x['max_depth'] > 0 or x['grow_policy'] == 'lossguide'))
+
+
+def train_result(param, dmat, num_rounds):
+    result = {}
+    xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
+              evals_result=result)
+    return result
+
+
+class TestGPUUpdaters:
+    @given(parameter_strategy, strategies.integers(1, 20),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_gpu_hist(self, param, num_rounds, dataset):
+        param['tree_method'] = 'gpu_hist'
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)
+        note(result)
+        assert tm.non_increasing(result['train'][dataset.metric])
 
     @pytest.mark.skipif(**tm.no_cupy())
-    def test_gpu_hist_device_dmatrix(self):
-        # DeviceDMatrix does not currently accept sparse formats
-        device_dmatrix_datasets = ["Boston", "Cancer", "Digits"]
-        for param in test_param:
-            param['tree_method'] = 'gpu_hist'
-            gpu_results_device_dmatrix = run_suite(param, select_datasets=device_dmatrix_datasets,
-                                                   DMatrixT=xgb.DeviceQuantileDMatrix,
-                                                   dmatrix_params={'max_bin': param['max_bin']})
-            assert_results_non_increasing(gpu_results_device_dmatrix, 1e-2)
-            gpu_results = run_suite(param, select_datasets=device_dmatrix_datasets)
-            assert_gpu_results(gpu_results, gpu_results_device_dmatrix)
-
-    # NOTE(rongou): Because the `Boston` dataset is too small, this only tests external memory mode
-    # with a single page. To test multiple pages, set DMatrix::kPageSize to, say, 1024.
-    def test_external_memory(self):
-        for param in reversed(test_param):
-            param['tree_method'] = 'gpu_hist'
-            param['gpu_page_size'] = 1024
-            gpu_results = run_suite(param, select_datasets=["Boston"])
-            assert_results_non_increasing(gpu_results, 1e-2)
-            ext_mem_results = run_suite(param, select_datasets=["Boston External Memory"])
-            assert_results_non_increasing(ext_mem_results, 1e-2)
-            assert_gpu_results(gpu_results, ext_mem_results)
-            break
-
-    def test_with_empty_dmatrix(self):
+    @given(parameter_strategy, strategies.integers(1, 20),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
+        # We cannot handle empty dataset yet
+        assume(len(dataset.y) > 0)
+        param['tree_method'] = 'gpu_hist'
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_device_dmat(), num_rounds)
+        note(result)
+        assert tm.non_increasing(result['train'][dataset.metric])
+
+    @given(parameter_strategy, strategies.integers(1, 20),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_external_memory(self, param, num_rounds, dataset):
+        # We cannot handle empty dataset yet
+        assume(len(dataset.y) > 0)
+        param['tree_method'] = 'gpu_hist'
+        param = dataset.set_params(param)
+        external_result = train_result(param, dataset.get_external_dmat(), num_rounds)
+        assert tm.non_increasing(external_result['train'][dataset.metric])
+
+    def test_empty_dmatrix_prediction(self):
         # FIXME(trivialfis): This should be done with all updaters
         kRows = 0
         kCols = 100
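The .filter call on the strategy above is load-bearing: reading the predicate, max_depth=0 is only admitted when max_leaves is positive and the growth policy is lossguide, so configurations with no growth bound at all never reach the test body. A trimmed sketch of the same mechanism:

    from hypothesis import strategies

    tree_params = strategies.fixed_dictionaries({
        'max_depth': strategies.integers(0, 11),
        'max_leaves': strategies.integers(0, 256),
        'grow_policy': strategies.sampled_from(['lossguide', 'depthwise']),
    }).filter(lambda x: (x['max_depth'] > 0 or x['max_leaves'] > 0) and (
        x['max_depth'] > 0 or x['grow_policy'] == 'lossguide'))

    # Rejected draws are thrown away and redrawn, so an unbounded depthwise
    # configuration (max_depth=0, grow_policy='depthwise') is never generated.
    print(tree_params.example())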
@@ -94,13 +91,10 @@ class TestGPU(unittest.TestCase):
         np.testing.assert_allclose(predictions, 0.5, 1e-6)
 
     @pytest.mark.mgpu
-    def test_specified_gpu_id_gpu_update(self):
-        variable_param = {'gpu_id': [1],
-                          'max_depth': [8],
-                          'max_leaves': [255, 4],
-                          'max_bin': [2, 64],
-                          'grow_policy': ['lossguide'],
-                          'tree_method': ['gpu_hist']}
-        for param in parameter_combinations(variable_param):
-            gpu_results = run_suite(param, select_datasets=datasets)
-            assert_results_non_increasing(gpu_results, 1e-2)
+    @given(tm.dataset_strategy, strategies.integers(0, 10))
+    @settings(deadline=None, max_examples=10)
+    def test_specified_gpu_id_gpu_update(self, dataset, gpu_id):
+        param = {'tree_method': 'gpu_hist', 'gpu_id': gpu_id}
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), 10)
+        assert tm.non_increasing(result['train'][dataset.metric])
@@ -31,7 +31,8 @@ class TestDistributedGPU(unittest.TestCase):
     def test_dask_dataframe(self):
         with LocalCUDACluster() as cluster:
             with Client(cluster) as client:
-                import cupy
+                import cupy as cp
+                cp.cuda.runtime.setDevice(0)
                 X, y = generate_array()
 
                 X = dd.from_dask_array(X)
@@ -59,8 +60,8 @@ class TestDistributedGPU(unittest.TestCase):
                 single_node = out['booster'].predict(
                     xgboost.DMatrix(X.compute()))
 
-                cupy.testing.assert_allclose(single_node, predictions)
-                cupy.testing.assert_allclose(single_node, series_predictions)
+                cp.testing.assert_allclose(single_node, predictions)
+                np.testing.assert_allclose(single_node, series_predictions.to_array())
 
                 predt = dxgb.predict(client, out, X)
                 assert isinstance(predt, dd.Series)
@@ -73,7 +74,7 @@ class TestDistributedGPU(unittest.TestCase):
                     is_df,
                     meta=dd.utils.make_meta({'prediction': 'f4'}))
 
-                cupy.testing.assert_allclose(
+                cp.testing.assert_allclose(
                     predt.values.compute(), single_node)
 
     @pytest.mark.skipif(**tm.no_cupy())
@@ -81,11 +82,12 @@ class TestDistributedGPU(unittest.TestCase):
     def test_dask_array(self):
         with LocalCUDACluster() as cluster:
             with Client(cluster) as client:
-                import cupy
+                import cupy as cp
+                cp.cuda.runtime.setDevice(0)
                 X, y = generate_array()
 
-                X = X.map_blocks(cupy.asarray)
-                y = y.map_blocks(cupy.asarray)
+                X = X.map_blocks(cp.asarray)
+                y = y.map_blocks(cp.asarray)
                 dtrain = dxgb.DaskDMatrix(client, X, y)
                 out = dxgb.train(client, {'tree_method': 'gpu_hist'},
                                  dtrain=dtrain,
@@ -97,11 +99,11 @@ class TestDistributedGPU(unittest.TestCase):
                 single_node = out['booster'].predict(
                     xgboost.DMatrix(X.compute()))
                 np.testing.assert_allclose(single_node, from_dmatrix)
-                device = cupy.cuda.runtime.getDevice()
+                device = cp.cuda.runtime.getDevice()
                 assert device == inplace_predictions.device.id
-                single_node = cupy.array(single_node)
+                single_node = cp.array(single_node)
                 assert device == single_node.device.id
-                cupy.testing.assert_allclose(
+                cp.testing.assert_allclose(
                     single_node,
                     inplace_predictions)
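The recurring cp.cuda.runtime.setDevice(0) insertions pin the client process to a known device before any CuPy allocation happens, so the later getDevice() comparisons in the test are deterministic. The two runtime calls are symmetric:

    import cupy as cp

    cp.cuda.runtime.setDevice(0)               # make device 0 current in this process
    assert cp.cuda.runtime.getDevice() == 0    # later allocations land on device 0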
@@ -1,198 +0,0 @@
-import glob
-import itertools as it
-import numpy as np
-import os
-import sys
-import xgboost as xgb
-from joblib import Memory
-memory = Memory('./cachedir', verbose=0)
-
-try:
-    from sklearn import datasets
-    from sklearn.preprocessing import scale
-except ImportError:
-    None
-
-
-class Dataset:
-    def __init__(self, name, get_dataset, objective, metric,
-                 has_weights=False, use_external_memory=False):
-        self.name = name
-        self.objective = objective
-        self.metric = metric
-        if has_weights:
-            self.X, self.y, self.w = get_dataset()
-        else:
-            self.X, self.y = get_dataset()
-            self.w = None
-        self.use_external_memory = use_external_memory
-
-    def __str__(self):
-        a = 'name: {name}\nobjective:{objective}, metric:{metric}, '.format(
-            name=self.name,
-            objective=self.objective,
-            metric=self.metric)
-        b = 'external memory:{use_external_memory}\n'.format(
-            use_external_memory=self.use_external_memory
-        )
-        return a + b
-
-    def __repr__(self):
-        return self.__str__()
-
-
-@memory.cache
-def get_boston():
-    data = datasets.load_boston()
-    return data.data, data.target
-
-
-@memory.cache
-def get_digits():
-    data = datasets.load_digits()
-    return data.data, data.target
-
-
-@memory.cache
-def get_cancer():
-    data = datasets.load_breast_cancer()
-    return data.data, data.target
-
-
-@memory.cache
-def get_sparse():
-    rng = np.random.RandomState(199)
-    n = 2000
-    sparsity = 0.75
-    X, y = datasets.make_regression(n, random_state=rng)
-    flag = rng.binomial(1, sparsity, X.shape)
-    for i in range(X.shape[0]):
-        for j in range(X.shape[1]):
-            if flag[i, j]:
-                X[i, j] = 0.0
-    from scipy import sparse
-    X = sparse.csr_matrix(X)
-    return X, y
-
-
-def get_sparse_weights():
-    return get_weights_regression(1, 10)
-
-
-def get_small_weights():
-    return get_weights_regression(1e-6, 1e-5)
-
-
-@memory.cache
-def get_weights_regression(min_weight, max_weight):
-    rng = np.random.RandomState(199)
-    n = 2000
-    sparsity = 0.25
-    X, y = datasets.make_regression(n, random_state=rng)
-    flag = rng.binomial(1, sparsity, X.shape)
-    for i in range(X.shape[0]):
-        for j in range(X.shape[1]):
-            if flag[i, j]:
-                X[i, j] = np.nan
-    w = rng.uniform(min_weight, max_weight, n)
-    return X, y, w
-
-
-def train_dataset(dataset, param_in, num_rounds=10, scale_features=False, DMatrixT=xgb.DMatrix,
-                  dmatrix_params={}):
-    param = param_in.copy()
-    param["objective"] = dataset.objective
-    if dataset.objective == "multi:softmax":
-        param["num_class"] = int(np.max(dataset.y) + 1)
-    param["eval_metric"] = dataset.metric
-
-    if scale_features:
-        X = scale(dataset.X, with_mean=isinstance(dataset.X, np.ndarray))
-    else:
-        X = dataset.X
-
-    if dataset.use_external_memory:
-        np.savetxt('tmptmp_1234.csv', np.hstack((dataset.y.reshape(len(dataset.y), 1), X)),
-                   delimiter=',')
-        dtrain = DMatrixT('tmptmp_1234.csv?format=csv&label_column=0#tmptmp_',
-                          weight=dataset.w)
-    elif DMatrixT is xgb.DeviceQuantileDMatrix:
-        import cupy as cp
-        dtrain = DMatrixT(cp.array(X), cp.array(dataset.y),
-                          weight=None if dataset.w is None else cp.array(dataset.w),
-                          **dmatrix_params)
-    else:
-        dtrain = DMatrixT(X, dataset.y, weight=dataset.w, **dmatrix_params)
-
-    print("Training on dataset: " + dataset.name, file=sys.stderr)
-    print("Using parameters: " + str(param), file=sys.stderr)
-    res = {}
-    bst = xgb.train(param, dtrain, num_rounds, [(dtrain, 'train')],
-                    evals_result=res, verbose_eval=False)
-
-    # Free the booster and dmatrix so we can delete temporary files
-    bst_copy = bst.copy()
-    del bst
-    del dtrain
-
-    # Cleanup temporary files
-    if dataset.use_external_memory:
-        for f in glob.glob("tmptmp_*"):
-            os.remove(f)
-
-    return {"dataset": dataset, "bst": bst_copy, "param": param.copy(),
-            "eval": res['train'][dataset.metric]}
-
-
-def parameter_combinations(variable_param):
-    """
-    Enumerate all possible combinations of parameters
-    """
-    result = []
-    names = sorted(variable_param)
-    combinations = it.product(*(variable_param[Name] for Name in names))
-    for set in combinations:
-        param = {}
-        for i, name in enumerate(names):
-            param[name] = set[i]
-        result.append(param)
-    return result
-
-
-def run_suite(param, num_rounds=10, select_datasets=None, scale_features=False,
-              DMatrixT=xgb.DMatrix, dmatrix_params={}):
-    """
-    Run the given parameters on a range of datasets. Objective and eval metric will be
-    automatically set
-    """
-    datasets = [
-        Dataset("Boston", get_boston, "reg:squarederror", "rmse"),
-        Dataset("Digits", get_digits, "multi:softmax", "mlogloss"),
-        Dataset("Cancer", get_cancer, "binary:logistic", "logloss"),
-        Dataset("Sparse regression", get_sparse, "reg:squarederror", "rmse"),
-        Dataset("Sparse regression with weights", get_sparse_weights,
-                "reg:squarederror", "rmse", has_weights=True),
-        Dataset("Small weights regression", get_small_weights,
-                "reg:squarederror", "rmse", has_weights=True),
-        Dataset("Boston External Memory", get_boston,
-                "reg:squarederror", "rmse",
-                use_external_memory=True)
-    ]
-
-    results = [
-    ]
-    for d in datasets:
-        if select_datasets is None or d.name in select_datasets:
-            results.append(
-                train_dataset(d, param, num_rounds=num_rounds, scale_features=scale_features,
-                              DMatrixT=DMatrixT, dmatrix_params=dmatrix_params))
-    return results
-
-
-def non_increasing(L, tolerance):
-    return all((y - x) < tolerance for x, y in zip(L, L[1:]))
-
-
-def assert_results_non_increasing(results, tolerance=1e-5):
-    for r in results:
-        assert non_increasing(r['eval'], tolerance), r
@@ -1,87 +1,72 @@
-import numpy as np
 import testing as tm
-import unittest
-import pytest
+from hypothesis import strategies, given, settings, note
 
 import xgboost as xgb
 
-try:
-    from sklearn.linear_model import ElasticNet
-    from sklearn.preprocessing import scale
-    from regression_test_utilities import run_suite, parameter_combinations
-except ImportError:
-    None
-
-
-def is_float(s):
-    try:
-        float(s)
-        return 1
-    except ValueError:
-        return 0
-
-
-def xgb_get_weights(bst):
-    return np.array([float(s) for s in bst.get_dump()[0].split() if
-                     is_float(s)])
-
-
-def assert_regression_result(results, tol):
-    regression_results = [r for r in results if
-                          r["param"]["objective"] == "reg:squarederror"]
-    for res in regression_results:
-        X = scale(res["dataset"].X,
-                  with_mean=isinstance(res["dataset"].X, np.ndarray))
-        y = res["dataset"].y
-        reg_alpha = res["param"]["alpha"]
-        reg_lambda = res["param"]["lambda"]
-        pred = res["bst"].predict(xgb.DMatrix(X))
-        weights = xgb_get_weights(res["bst"])[1:]
-        enet = ElasticNet(alpha=reg_alpha + reg_lambda,
-                          l1_ratio=reg_alpha / (reg_alpha + reg_lambda))
-        enet.fit(X, y)
-        enet_pred = enet.predict(X)
-        assert np.isclose(weights, enet.coef_, rtol=tol,
-                          atol=tol).all(), (weights, enet.coef_)
-        assert np.isclose(enet_pred, pred, rtol=tol, atol=tol).all(), (
-            res["dataset"].name, enet_pred[:5], pred[:5])
-
-
-# TODO: More robust classification tests
-def assert_classification_result(results):
-    classification_results = [r for r in results if
-                              r["param"]["objective"] != "reg:squarederror"]
-    for res in classification_results:
-        # Check accuracy is reasonable
-        assert res["eval"][-1] < 2.0, (res["dataset"].name, res["eval"][-1])
-
-
-class TestLinear(unittest.TestCase):
-    datasets = ["Boston", "Digits", "Cancer", "Sparse regression",
-                "Boston External Memory"]
-
-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_coordinate(self):
-        variable_param = {'booster': ['gblinear'], 'updater':
-                          ['coord_descent'], 'eta': [0.5], 'top_k':
-                          [10], 'tolerance': [1e-5], 'nthread': [2],
-                          'alpha': [.005, .1], 'lambda': [.005],
-                          'feature_selector': ['cyclic', 'shuffle',
-                                               'greedy', 'thrifty']}
-        for param in parameter_combinations(variable_param):
-            results = run_suite(param, 150, self.datasets, scale_features=True)
-            assert_regression_result(results, 1e-2)
-            assert_classification_result(results)
-
-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_shotgun(self):
-        variable_param = {'booster': ['gblinear'], 'updater':
-                          ['shotgun'], 'eta': [0.5], 'top_k': [10],
-                          'tolerance': [1e-5], 'nthread': [2],
-                          'alpha': [.005, .1], 'lambda': [.005],
-                          'feature_selector': ['cyclic', 'shuffle']}
-        for param in parameter_combinations(variable_param):
-            results = run_suite(param, 200, self.datasets, True)
-            assert_regression_result(results, 1e-2)
-            assert_classification_result(results)
+parameter_strategy = strategies.fixed_dictionaries({
+    'booster': strategies.just('gblinear'),
+    'eta': strategies.floats(0.01, 0.25),
+    'tolerance': strategies.floats(1e-5, 1e-2),
+    'nthread': strategies.integers(1, 4),
+})
+
+coord_strategy = strategies.fixed_dictionaries({
+    'feature_selector': strategies.sampled_from(['cyclic', 'shuffle',
+                                                 'greedy', 'thrifty']),
+    'top_k': strategies.integers(1, 10),
+})
+
+
+def train_result(param, dmat, num_rounds):
+    result = {}
+    xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
+              evals_result=result)
+    return result
+
+
+class TestLinear:
+    @given(parameter_strategy, strategies.integers(10, 50),
+           tm.dataset_strategy, coord_strategy)
+    @settings(deadline=None)
+    def test_coordinate(self, param, num_rounds, dataset, coord_param):
+        param['updater'] = 'coord_descent'
+        param.update(coord_param)
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        assert tm.non_increasing(result)
+
+    # Loss is not guaranteed to always decrease because of regularisation parameters
+    # We test a weaker condition that the loss has not increased between the first and last
+    # iteration
+    @given(parameter_strategy, strategies.integers(10, 50),
+           tm.dataset_strategy, coord_strategy, strategies.floats(1e-5, 2.0),
+           strategies.floats(1e-5, 2.0))
+    @settings(deadline=None)
+    def test_coordinate_regularised(self, param, num_rounds, dataset, coord_param, alpha, lambd):
+        param['updater'] = 'coord_descent'
+        param['alpha'] = alpha
+        param['lambda'] = lambd
+        param.update(coord_param)
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        assert tm.non_increasing([result[0], result[-1]])
+
+    @given(parameter_strategy, strategies.integers(10, 50),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_shotgun(self, param, num_rounds, dataset):
+        param['updater'] = 'shotgun'
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        assert tm.non_increasing(result)
+
+    @given(parameter_strategy, strategies.integers(10, 50),
+           tm.dataset_strategy, strategies.floats(1e-5, 2.0),
+           strategies.floats(1e-5, 2.0))
+    @settings(deadline=None)
+    def test_shotgun_regularised(self, param, num_rounds, dataset, alpha, lambd):
+        param['updater'] = 'shotgun'
+        param['alpha'] = alpha
+        param['lambda'] = lambd
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        assert tm.non_increasing([result[0], result[-1]])
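Each rewritten test file defines the same small train_result helper; it simply surfaces xgb.train's evals_result mapping, keyed first by eval-set name and then by metric, with one value per boosting round, which is what tm.non_increasing inspects. A runnable sketch on synthetic data (shapes and round count are arbitrary):

    import numpy as np
    import xgboost as xgb

    rng = np.random.RandomState(0)
    dmat = xgb.DMatrix(rng.randn(64, 4), label=rng.randn(64))

    result = {}
    xgb.train({'objective': 'reg:squarederror', 'eval_metric': 'rmse'},
              dmat, 5, [(dmat, 'train')], verbose_eval=False, evals_result=result)
    print(result['train']['rmse'])   # five RMSE values, one per boosting round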
@@ -3,28 +3,57 @@ import unittest
 import pytest
 import xgboost as xgb
 import numpy as np
+from hypothesis import given, strategies, settings, note
 
-try:
-    from regression_test_utilities import run_suite, parameter_combinations, \
-        assert_results_non_increasing
-except ImportError:
-    None
-
-
-class TestUpdaters(unittest.TestCase):
-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_histmaker(self):
-        variable_param = {'updater': ['grow_histmaker'], 'max_depth': [2, 8]}
-        for param in parameter_combinations(variable_param):
-            result = run_suite(param)
-            assert_results_non_increasing(result, 1e-2)
-
-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_colmaker(self):
-        variable_param = {'updater': ['grow_colmaker'], 'max_depth': [2, 8]}
-        for param in parameter_combinations(variable_param):
-            result = run_suite(param)
-            assert_results_non_increasing(result, 1e-2)
+exact_parameter_strategy = strategies.fixed_dictionaries({
+    'nthread': strategies.integers(1, 4),
+    'max_depth': strategies.integers(1, 11),
+    'min_child_weight': strategies.floats(0.5, 2.0),
+    'alpha': strategies.floats(0.0, 2.0),
+    'lambda': strategies.floats(1e-5, 2.0),
+    'eta': strategies.floats(0.01, 0.5),
+    'gamma': strategies.floats(0.0, 2.0),
+    'seed': strategies.integers(0, 10),
+    # We cannot enable subsampling as the training loss can increase
+    # 'subsample': strategies.floats(0.5, 1.0),
+    'colsample_bytree': strategies.floats(0.5, 1.0),
+    'colsample_bylevel': strategies.floats(0.5, 1.0),
+})
+
+hist_parameter_strategy = strategies.fixed_dictionaries({
+    'max_depth': strategies.integers(1, 11),
+    'max_leaves': strategies.integers(0, 1024),
+    'max_bin': strategies.integers(2, 512),
+    'grow_policy': strategies.sampled_from(['lossguide', 'depthwise']),
+}).filter(lambda x: (x['max_depth'] > 0 or x['max_leaves'] > 0) and (
+    x['max_depth'] > 0 or x['grow_policy'] == 'lossguide'))
+
+
+def train_result(param, dmat, num_rounds):
+    result = {}
+    xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
+              evals_result=result)
+    return result
+
+
+class TestTreeMethod(unittest.TestCase):
+    @given(exact_parameter_strategy, strategies.integers(1, 20),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_exact(self, param, num_rounds, dataset):
+        param['tree_method'] = 'exact'
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)
+        assert tm.non_increasing(result['train'][dataset.metric])
+
+    @given(exact_parameter_strategy, strategies.integers(1, 20),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_approx(self, param, num_rounds, dataset):
+        param['tree_method'] = 'approx'
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)
+        assert tm.non_increasing(result['train'][dataset.metric], 1e-3)
 
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_pruner(self):
@@ -50,19 +79,18 @@ class TestUpdaters(unittest.TestCase):
         # Second prune should not change the tree
         assert after_prune == second_prune
 
-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_fast_histmaker(self):
-        variable_param = {'tree_method': ['hist'],
-                          'max_depth': [2, 8],
-                          'max_bin': [2, 256],
-                          'grow_policy': ['depthwise', 'lossguide'],
-                          'max_leaves': [64, 0],
-                          'verbosity': [0],
-                          'single_precision_histogram': [True, False]}
-        for param in parameter_combinations(variable_param):
-            result = run_suite(param)
-            assert_results_non_increasing(result, 1e-2)
+    @given(exact_parameter_strategy, hist_parameter_strategy, strategies.integers(1, 20),
+           tm.dataset_strategy)
+    @settings(deadline=None)
+    def test_hist(self, param, hist_param, num_rounds, dataset):
+        param['tree_method'] = 'hist'
+        param = dataset.set_params(param)
+        param.update(hist_param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)
+        note(result)
+        assert tm.non_increasing(result['train'][dataset.metric])
 
+    def test_hist_categorical(self):
         # hist must be same as exact on all-categorial data
         dpath = 'demo/data/'
         ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
@@ -87,7 +115,7 @@ class TestUpdaters(unittest.TestCase):
         assert hist_res['test']['auc'] == exact_res['test']['auc']
 
     @pytest.mark.skipif(**tm.no_sklearn())
-    def test_fast_histmaker_degenerate_case(self):
+    def test_hist_degenerate_case(self):
         # Test a degenerate case where the quantile sketcher won't return any
         # quantile points for a particular feature (the second feature in
         # this example). Source: https://github.com/dmlc/xgboost/issues/2943
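The note(result) calls in the hist tests deserve a remark: Hypothesis prints notes only alongside the final shrunk failing example, so the full training history is attached exactly where a failure is reported instead of being logged for every passing draw. Minimal usage, as a hypothetical standalone test:

    from hypothesis import given, note, strategies

    @given(strategies.lists(strategies.floats(0.0, 1.0), min_size=2, max_size=8))
    def test_with_notes(xs):
        note('drew xs={}'.format(xs))   # shown only with a failing example
        assert len(xs) >= 2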
@@ -1,6 +1,19 @@
 # coding: utf-8
 from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
 from xgboost.compat import DASK_INSTALLED
+from hypothesis import strategies
+from hypothesis.extra.numpy import arrays
+from joblib import Memory
+from sklearn import datasets
+import xgboost as xgb
+import numpy as np
+
+try:
+    import cupy as cp
+except ImportError:
+    cp = None
+
+memory = Memory('./cachedir', verbose=0)
 
 
 def no_sklearn():
@@ -39,7 +52,7 @@ def no_matplotlib():
 def no_dask_cuda():
     reason = 'dask_cuda is not installed.'
     try:
         import dask_cuda as _  # noqa
         return {'condition': False, 'reason': reason}
     except ImportError:
         return {'condition': True, 'reason': reason}
@@ -47,7 +60,7 @@ def no_dask_cuda():
 
 def no_cudf():
     try:
         import cudf  # noqa
         CUDF_INSTALLED = True
     except ImportError:
         CUDF_INSTALLED = False
@@ -59,7 +72,7 @@ def no_cudf():
 def no_cupy():
     reason = 'cupy is not installed.'
     try:
         import cupy as _  # noqa
         return {'condition': False, 'reason': reason}
     except ImportError:
         return {'condition': True, 'reason': reason}
@@ -68,7 +81,7 @@ def no_cupy():
 def no_dask_cudf():
     reason = 'dask_cudf is not installed.'
     try:
         import dask_cudf as _  # noqa
         return {'condition': False, 'reason': reason}
     except ImportError:
         return {'condition': True, 'reason': reason}
@@ -77,7 +90,101 @@ def no_dask_cudf():
 def no_json_schema():
     reason = 'jsonschema is not installed'
     try:
         import jsonschema  # noqa
         return {'condition': False, 'reason': reason}
     except ImportError:
         return {'condition': True, 'reason': reason}
+
+
+# Contains a dataset in numpy format as well as the relevant objective and metric
+class TestDataset:
+    def __init__(self, name, get_dataset, objective, metric
+                 ):
+        self.name = name
+        self.objective = objective
+        self.metric = metric
+        self.X, self.y = get_dataset()
+        self.w = None
+
+    def set_params(self, params_in):
+        params_in['objective'] = self.objective
+        params_in['eval_metric'] = self.metric
+        if self.objective == "multi:softmax":
+            params_in["num_class"] = int(np.max(self.y) + 1)
+        return params_in
+
+    def get_dmat(self):
+        return xgb.DMatrix(self.X, self.y, self.w)
+
+    def get_device_dmat(self):
+        w = None if self.w is None else cp.array(self.w)
+        X = cp.array(self.X, dtype=np.float32)
+        y = cp.array(self.y, dtype=np.float32)
+        return xgb.DeviceQuantileDMatrix(X, y, w)
+
+    def get_external_dmat(self):
+        np.savetxt('tmptmp_1234.csv', np.hstack((self.y.reshape(len(self.y), 1), self.X)),
+                   delimiter=',')
+        return xgb.DMatrix('tmptmp_1234.csv?format=csv&label_column=0#tmptmp_',
+                           weight=self.w)
+
+    def __repr__(self):
+        return self.name
+
+
+@memory.cache
+def get_boston():
+    data = datasets.load_boston()
+    return data.data, data.target
+
+
+@memory.cache
+def get_digits():
+    data = datasets.load_digits()
+    return data.data, data.target
+
+
+@memory.cache
+def get_cancer():
+    data = datasets.load_breast_cancer()
+    return data.data, data.target
+
+
+@memory.cache
+def get_sparse():
+    rng = np.random.RandomState(199)
+    n = 2000
+    sparsity = 0.75
+    X, y = datasets.make_regression(n, random_state=rng)
+    flag = rng.binomial(1, sparsity, X.shape)
+    for i in range(X.shape[0]):
+        for j in range(X.shape[1]):
+            if flag[i, j]:
+                X[i, j] = np.nan
+    return X, y
+
+
+_unweighted_datasets_strategy = strategies.sampled_from(
+    [TestDataset('boston', get_boston, 'reg:squarederror', 'rmse'),
+     TestDataset('digits', get_digits, 'multi:softmax', 'mlogloss'),
+     TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
+     TestDataset
+     ("sparse", get_sparse, "reg:squarederror", "rmse"),
+     TestDataset("empty", lambda: (np.empty((0, 100)), np.empty(0)), "reg:squarederror",
+                 "rmse")])
+
+
+@strategies.composite
+def _dataset_and_weight(draw):
+    data = draw(_unweighted_datasets_strategy)
+    if draw(strategies.booleans()):
+        data.w = draw(arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0)))
+    return data
+
+# A strategy for drawing from a set of example datasets
+# May add random weights to the dataset
+dataset_strategy = _dataset_and_weight()
+
+
+def non_increasing(L, tolerance=1e-4):
+    return all((y - x) < tolerance for x, y in zip(L, L[1:]))
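tm.non_increasing is the oracle nearly every rewritten test asserts against: each consecutive step may rise by strictly less than the tolerance, so noisy-but-converging loss curves pass while genuine regressions fail. Its boundary behaviour, restated as a standalone check:

    def non_increasing(L, tolerance=1e-4):
        return all((y - x) < tolerance for x, y in zip(L, L[1:]))

    assert non_increasing([1.0, 0.5, 0.2])    # strictly decreasing: passes
    assert non_increasing([1.0, 1.00005])     # rise below tolerance: still passes
    assert not non_increasing([1.0, 1.5])     # genuine increase: fails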
@@ -39,7 +39,7 @@ if [ ${TASK} == "python_test" ]; then
     # Run unit tests
     cd ..
     python -m pip install graphviz pytest pytest-cov codecov
-    python -m pip install datatable
+    python -m pip install datatable hypothesis
    python -m pip install numpy scipy pandas matplotlib scikit-learn dask[complete]
    python -m pytest -v --fulltrace -s tests/python --cov=python-package/xgboost || exit -1
     codecov