Add convergence test for sparse datasets. (#7922)
This commit is contained in:
parent
f6babc814c
commit
474366c020
@ -53,6 +53,23 @@ class TestGPUUpdaters:
|
|||||||
note(result)
|
note(result)
|
||||||
assert tm.non_increasing(result["train"][dataset.metric])
|
assert tm.non_increasing(result["train"][dataset.metric])
|
||||||
|
|
||||||
|
@given(tm.sparse_datasets_strategy)
@settings(deadline=None, print_blob=True)
def test_sparse(self, dataset):
    """Check that ``hist`` and ``gpu_hist`` behave alike on sparse datasets.

    Each tree method is trained through ``train_result`` with ``max_bin=64``
    (the trailing ``16`` is presumably the number of boosting rounds -- confirm
    against ``train_result``).  The training metric history of each run must
    satisfy ``tm.non_increasing`` and the two histories must agree within a
    relative tolerance of ``1e-2``.
    """
    param = {"tree_method": "hist", "max_bin": 64}
    hist_result = train_result(param, dataset.get_dmat(), 16)
    note(hist_result)
    assert tm.non_increasing(hist_result["train"][dataset.metric])

    param = {"tree_method": "gpu_hist", "max_bin": 64}
    gpu_hist_result = train_result(param, dataset.get_dmat(), 16)
    note(gpu_hist_result)
    assert tm.non_increasing(gpu_hist_result["train"][dataset.metric])

    # Compare on the dataset's own metric rather than a hard-coded "rmse" so
    # this check stays consistent with the assertions above and keeps working
    # if a dataset with a different metric is added to the strategy.
    np.testing.assert_allclose(
        hist_result["train"][dataset.metric],
        gpu_hist_result["train"][dataset.metric],
        rtol=1e-2,
    )
|
||||||
|
|
||||||
@given(strategies.integers(10, 400), strategies.integers(3, 8),
|
@given(strategies.integers(10, 400), strategies.integers(3, 8),
|
||||||
strategies.integers(1, 2), strategies.integers(4, 7))
|
strategies.integers(1, 2), strategies.integers(4, 7))
|
||||||
@settings(deadline=None, print_blob=True)
|
@settings(deadline=None, print_blob=True)
|
||||||
|
|||||||
@ -99,6 +99,23 @@ class TestTreeMethod:
|
|||||||
note(result)
|
note(result)
|
||||||
assert tm.non_increasing(result['train'][dataset.metric])
|
assert tm.non_increasing(result['train'][dataset.metric])
|
||||||
|
|
||||||
|
@given(tm.sparse_datasets_strategy)
@settings(deadline=None, print_blob=True)
def test_sparse(self, dataset):
    """Check that ``hist`` and ``approx`` behave alike on sparse datasets.

    Each tree method is trained through ``train_result`` with ``max_bin=64``
    (the trailing ``16`` is presumably the number of boosting rounds -- confirm
    against ``train_result``).  The training metric history of each run must
    satisfy ``tm.non_increasing`` and the two histories must match under the
    default tolerance of ``np.testing.assert_allclose``.
    """
    param = {"tree_method": "hist", "max_bin": 64}
    hist_result = train_result(param, dataset.get_dmat(), 16)
    note(hist_result)
    assert tm.non_increasing(hist_result["train"][dataset.metric])

    param = {"tree_method": "approx", "max_bin": 64}
    approx_result = train_result(param, dataset.get_dmat(), 16)
    note(approx_result)
    assert tm.non_increasing(approx_result["train"][dataset.metric])

    # Compare on the dataset's own metric rather than a hard-coded "rmse" so
    # this check stays consistent with the assertions above and keeps working
    # if a dataset with a different metric is added to the strategy.
    np.testing.assert_allclose(
        hist_result["train"][dataset.metric],
        approx_result["train"][dataset.metric],
    )
|
||||||
|
|
||||||
def test_hist_categorical(self):
|
def test_hist_categorical(self):
|
||||||
# hist must be same as exact on all-categorical data
|
# hist must be same as exact on all-categorical data
|
||||||
dpath = 'demo/data/'
|
dpath = 'demo/data/'
|
||||||
|
|||||||
@ -1,5 +1,7 @@
|
|||||||
# coding: utf-8
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
import os
|
import os
|
||||||
|
import multiprocessing
|
||||||
|
from typing import Tuple, Union
|
||||||
import urllib
|
import urllib
|
||||||
import zipfile
|
import zipfile
|
||||||
import sys
|
import sys
|
||||||
@ -11,6 +13,7 @@ import pytest
|
|||||||
import gc
|
import gc
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from scipy import sparse
|
||||||
import platform
|
import platform
|
||||||
|
|
||||||
hypothesis = pytest.importorskip('hypothesis')
|
hypothesis = pytest.importorskip('hypothesis')
|
||||||
@ -327,6 +330,127 @@ def make_categorical(
|
|||||||
return df, label
|
return df, label
|
||||||
|
|
||||||
|
|
||||||
|
@memory.cache
def make_sparse_regression(
    n_samples: int, n_features: int, sparsity: float, as_dense: bool
) -> Tuple[Union[sparse.csr_matrix, np.ndarray], np.ndarray]:
    """Make a random sparse matrix with regression labels.

    Parameters
    ----------
    n_samples:
        Number of rows of the generated matrix.
    n_features:
        Number of columns of the generated matrix.
    sparsity:
        Fraction of entries that are left out; the matrix is generated with
        density ``1.0 - sparsity``.
    as_dense:
        Return the matrix as np.ndarray with missing values filled by NaN

    Returns
    -------
    The feature matrix (CSR, or a dense array when ``as_dense`` is true) and a
    one-dimensional label array of length ``n_samples``.

    """

    def finalize(
        csr: sparse.csr_matrix, y: np.ndarray
    ) -> Tuple[Union[sparse.csr_matrix, np.ndarray], np.ndarray]:
        # Shared tail of both generation paths so that ``as_dense`` is honoured
        # regardless of which NumPy API is available.
        if not as_dense:
            return csr, y
        arr = csr.toarray()
        arr[arr == 0] = np.nan
        return arr, y

    if not hasattr(np.random, "default_rng"):
        # old version of numpy on s390x
        legacy_rng = np.random.RandomState(1994)
        X = sparse.random(
            m=n_samples,
            n=n_features,
            density=1.0 - sparsity,
            random_state=legacy_rng,
            format="csr",
        )
        y = legacy_rng.normal(loc=0.0, scale=1.0, size=n_samples)
        return finalize(X, y)

    # Use multi-thread to speed up the generation, convenient if you use this function
    # for benchmarking.
    # NOTE: every worker seeds its own generator and handles its own share of
    # the columns, so the generated data depends on the number of workers (and
    # therefore on the CPU count of the machine).
    n_threads = min(multiprocessing.cpu_count(), n_features)

    def random_csc(t_id: int) -> Tuple[sparse.csc_matrix, np.ndarray]:
        """Generate the column slice and label contribution of worker ``t_id``."""
        rng = np.random.default_rng(1994 * t_id)
        thread_size = n_features // n_threads
        if t_id == n_threads - 1:
            # The last worker also takes the remainder columns.
            n_features_tloc = n_features - t_id * thread_size
        else:
            n_features_tloc = thread_size

        X = sparse.random(
            m=n_samples,
            n=n_features_tloc,
            density=1.0 - sparsity,
            random_state=rng,
        ).tocsc()
        y = np.zeros((n_samples, 1))

        for i in range(X.shape[1]):
            size = X.indptr[i + 1] - X.indptr[i]
            # Only non-empty columns contribute to the label (and draw from
            # the generator).
            if size != 0:
                y += X[:, i].toarray() * rng.random((n_samples, 1)) * 0.2

        return X, y

    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        futures = [executor.submit(random_csc, i) for i in range(n_threads)]

    results = [f.result() for f in futures]
    X_results = [X for X, _ in results]
    y_results = [y for _, y in results]

    assert len(y_results) == n_threads

    csr: sparse.csr_matrix = sparse.hstack(X_results, format="csr")
    # Stack the per-worker (n_samples, 1) contributions, then sum them per row.
    y = np.asarray(y_results)
    y = y.reshape((y.shape[0], y.shape[1])).T
    y = np.sum(y, axis=1)

    assert csr.shape[0] == n_samples
    assert csr.shape[1] == n_features
    assert y.shape[0] == n_samples

    return finalize(csr, y)
|
||||||
|
|
||||||
|
|
||||||
|
def _sparse_regression_dataset(name, sparsity, as_dense):
    """Build the ``TestDataset`` entry for one sparse regression configuration.

    The data is produced lazily by ``make_sparse_regression`` with 1e5 rows and
    8 columns; objective and metric are ``reg:squarederror`` and ``rmse``.
    """
    return TestDataset(
        name,
        lambda: make_sparse_regression(int(1e5), 8, sparsity, as_dense),
        "reg:squarederror",
        "rmse",
    )


# Sparse regression datasets, listed as (name, sparsity, as_dense).
sparse_datasets_strategy = strategies.sampled_from(
    [
        _sparse_regression_dataset(name, sparsity, as_dense)
        for name, sparsity, as_dense in (
            ("1e5x8-0.95-csr", 0.95, False),
            ("1e5x8-0.5-csr", 0.5, False),
            ("1e5x8-0.5-dense", 0.5, True),
            ("1e5x8-0.05-csr", 0.05, False),
            ("1e5x8-0.05-dense", 0.05, True),
        )
    ]
)
|
||||||
|
|
||||||
_unweighted_datasets_strategy = strategies.sampled_from(
|
_unweighted_datasets_strategy = strategies.sampled_from(
|
||||||
[
|
[
|
||||||
TestDataset(
|
TestDataset(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user