Add convergence test for sparse datasets. (#7922)

This commit is contained in:
Jiaming Yuan 2022-05-23 18:07:26 +08:00 committed by GitHub
parent f6babc814c
commit 474366c020
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 159 additions and 1 deletions

View File

@ -53,6 +53,23 @@ class TestGPUUpdaters:
note(result) note(result)
assert tm.non_increasing(result["train"][dataset.metric]) assert tm.non_increasing(result["train"][dataset.metric])
@given(tm.sparse_datasets_strategy)
@settings(deadline=None, print_blob=True)
def test_sparse(self, dataset):
    """Check that ``hist`` and ``gpu_hist`` agree on sparse datasets.

    Each tree method is trained with ``max_bin=64`` on a freshly built
    DMatrix, must produce a non-increasing training metric, and the two
    training curves must match within a relative tolerance of 1e-2.
    """
    metric = dataset.metric
    curves = {}
    for tree_method in ("hist", "gpu_hist"):
        param = {"tree_method": tree_method, "max_bin": 64}
        # Request a new DMatrix for every run so the two tree methods do not
        # share one instance.
        result = train_result(param, dataset.get_dmat(), 16)
        note(result)
        assert tm.non_increasing(result["train"][metric])
        curves[tree_method] = result["train"][metric]

    # Compare on the dataset's own metric instead of a hard-coded "rmse" so
    # the test stays valid if a dataset with another metric joins the strategy.
    np.testing.assert_allclose(curves["hist"], curves["gpu_hist"], rtol=1e-2)
@given(strategies.integers(10, 400), strategies.integers(3, 8), @given(strategies.integers(10, 400), strategies.integers(3, 8),
strategies.integers(1, 2), strategies.integers(4, 7)) strategies.integers(1, 2), strategies.integers(4, 7))
@settings(deadline=None, print_blob=True) @settings(deadline=None, print_blob=True)

View File

@ -99,6 +99,23 @@ class TestTreeMethod:
note(result) note(result)
assert tm.non_increasing(result['train'][dataset.metric]) assert tm.non_increasing(result['train'][dataset.metric])
@given(tm.sparse_datasets_strategy)
@settings(deadline=None, print_blob=True)
def test_sparse(self, dataset):
    """Check that ``hist`` and ``approx`` agree on sparse datasets.

    Each tree method is trained with ``max_bin=64`` on a freshly built
    DMatrix, must produce a non-increasing training metric, and the two
    training curves must match under the default tolerance of
    ``np.testing.assert_allclose``.
    """
    metric = dataset.metric
    curves = {}
    for tree_method in ("hist", "approx"):
        param = {"tree_method": tree_method, "max_bin": 64}
        # Request a new DMatrix for every run so the two tree methods do not
        # share one instance.
        result = train_result(param, dataset.get_dmat(), 16)
        note(result)
        assert tm.non_increasing(result["train"][metric])
        curves[tree_method] = result["train"][metric]

    # Compare on the dataset's own metric instead of a hard-coded "rmse" so
    # the test stays valid if a dataset with another metric joins the strategy.
    np.testing.assert_allclose(curves["hist"], curves["approx"])
def test_hist_categorical(self): def test_hist_categorical(self):
# hist must be same as exact on all-categorial data # hist must be same as exact on all-categorial data
dpath = 'demo/data/' dpath = 'demo/data/'

View File

@ -1,5 +1,7 @@
# coding: utf-8 from concurrent.futures import ThreadPoolExecutor
import os import os
import multiprocessing
from typing import Tuple, Union
import urllib import urllib
import zipfile import zipfile
import sys import sys
@ -11,6 +13,7 @@ import pytest
import gc import gc
import xgboost as xgb import xgboost as xgb
import numpy as np import numpy as np
from scipy import sparse
import platform import platform
hypothesis = pytest.importorskip('hypothesis') hypothesis = pytest.importorskip('hypothesis')
@ -327,6 +330,127 @@ def make_categorical(
return df, label return df, label
@memory.cache
def make_sparse_regression(
    n_samples: int, n_features: int, sparsity: float, as_dense: bool
) -> Tuple[Union[sparse.csr_matrix, np.ndarray], np.ndarray]:
    """Make a random sparse regression dataset.

    Parameters
    ----------
    n_samples:
        Number of rows of the generated matrix.
    n_features:
        Number of columns of the generated matrix.
    sparsity:
        Fraction of entries left out; ``1.0 - sparsity`` is passed to
        ``scipy.sparse.random`` as the density.
    as_dense:
        Return the matrix as np.ndarray with missing values filled by NaN

    Returns
    -------
    The feature matrix (CSR, or a dense ndarray when ``as_dense`` is true) and
    a label vector with ``n_samples`` entries.

    Notes
    -----
    On the multi-threaded path every worker seeds its own generator from its
    index and owns a slice of the columns, so the generated values depend on
    ``min(multiprocessing.cpu_count(), n_features)``.
    """

    def finalize(matrix: sparse.csr_matrix) -> Union[sparse.csr_matrix, np.ndarray]:
        # Honour ``as_dense`` on every return path: zeros in the dense copy
        # become NaN, i.e. missing values.
        if not as_dense:
            return matrix
        arr = matrix.toarray()
        arr[arr == 0] = np.nan
        return arr

    if not hasattr(np.random, "default_rng"):
        # old version of numpy on s390x
        rng = np.random.RandomState(1994)
        X = sparse.random(
            m=n_samples,
            n=n_features,
            density=1.0 - sparsity,
            random_state=rng,
            format="csr",
        )
        y = rng.normal(loc=0.0, scale=1.0, size=n_samples)
        # Previously this branch returned CSR even when a dense array was
        # requested.
        return finalize(X), y

    # Use multi-thread to speed up the generation, convenient if you use this function
    # for benchmarking.
    n_threads = min(multiprocessing.cpu_count(), n_features)

    def random_csc(t_id: int) -> Tuple[sparse.csc_matrix, np.ndarray]:
        """Generate the column slice owned by worker ``t_id`` and its label part."""
        rng = np.random.default_rng(1994 * t_id)
        thread_size = n_features // n_threads
        if t_id == n_threads - 1:
            # The last worker also takes the remainder columns.
            n_features_tloc = n_features - t_id * thread_size
        else:
            n_features_tloc = thread_size

        X = sparse.random(
            m=n_samples,
            n=n_features_tloc,
            density=1.0 - sparsity,
            random_state=rng,
        ).tocsc()
        y = np.zeros((n_samples, 1))

        for i in range(X.shape[1]):
            # Only non-empty columns contribute to the label.
            size = X.indptr[i + 1] - X.indptr[i]
            if size != 0:
                y += X[:, i].toarray() * rng.random((n_samples, 1)) * 0.2

        return X, y

    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        # ``map`` yields results in submission order, keeping columns ordered
        # by worker index.
        results = list(executor.map(random_csc, range(n_threads)))

    X_results = [X for X, _ in results]
    y_results = [y for _, y in results]
    assert len(y_results) == n_threads

    csr: sparse.csr_matrix = sparse.hstack(X_results, format="csr")
    # Stack the per-worker (n_samples, 1) partial labels and add them up.
    y = np.asarray(y_results)
    y = y.reshape((y.shape[0], y.shape[1])).T
    y = np.sum(y, axis=1)

    assert csr.shape[0] == n_samples
    assert csr.shape[1] == n_features
    assert y.shape[0] == n_samples

    return finalize(csr), y
# Hypothesis strategy sampling the sparse regression datasets used by the
# convergence tests.  Every dataset has 1e5 rows and 8 features and is trained
# with "reg:squarederror" / "rmse"; the rows below vary only the sparsity and
# whether the matrix is returned as CSR or as a dense array.
sparse_datasets_strategy = strategies.sampled_from(
    [
        TestDataset(
            name,
            # Bind the per-row values as defaults so each lambda keeps its own
            # sparsity / as_dense instead of the comprehension's last values.
            lambda sparsity=sparsity, as_dense=as_dense: make_sparse_regression(
                int(1e5), 8, sparsity, as_dense
            ),
            "reg:squarederror",
            "rmse",
        )
        for name, sparsity, as_dense in (
            ("1e5x8-0.95-csr", 0.95, False),
            ("1e5x8-0.5-csr", 0.5, False),
            ("1e5x8-0.5-dense", 0.5, True),
            ("1e5x8-0.05-csr", 0.05, False),
            ("1e5x8-0.05-dense", 0.05, True),
        )
    ]
)
_unweighted_datasets_strategy = strategies.sampled_from( _unweighted_datasets_strategy = strategies.sampled_from(
[ [
TestDataset( TestDataset(