from typing import List, Tuple

import numpy as np
import pytest
from hypothesis import given, settings, strategies
from scipy.sparse import csr_matrix

import xgboost as xgb
from xgboost.data import SingleBatchInternalIter as SingleBatch

from testing import IteratorForTest


def make_batches(
    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
) -> Tuple[List[np.ndarray], List[np.ndarray]]:
    X = []
    y = []
    if use_cupy:
        import cupy

        rng = cupy.random.RandomState(1994)
    else:
        rng = np.random.RandomState(1994)
    for i in range(n_batches):
        _X = rng.randn(n_samples_per_batch, n_features)
        _y = rng.randn(n_samples_per_batch)
        X.append(_X)
        y.append(_y)
    return X, y


def test_single_batch(tree_method: str = "approx") -> None:
    from sklearn.datasets import load_breast_cancer

    n_rounds = 10
    X, y = load_breast_cancer(return_X_y=True)
    X = X.astype(np.float32)
    y = y.astype(np.float32)

    Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
    from_it = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)

    Xy = xgb.DMatrix(X, y)
    from_dmat = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
    assert from_it.get_dump() == from_dmat.get_dump()

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    X = X.astype(np.float32)
    Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
    from_pd = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
    # Remove the feature info to generate the exact same text representation.
    from_pd.feature_names = None
    from_pd.feature_types = None
    assert from_pd.get_dump() == from_it.get_dump()

    X, y = load_breast_cancer(return_X_y=True)
    X = csr_matrix(X)
    Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
    from_it = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)

    X, y = load_breast_cancer(return_X_y=True)
    # Treating 0 as missing in the dense input should match the CSR input,
    # which drops explicit zeros.
    Xy = xgb.DMatrix(SingleBatch(data=X, label=y), missing=0.0)
    from_np = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
    assert from_np.get_dump() == from_it.get_dump()

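# `IteratorForTest` is provided by the shared `testing` helper module imported
# above; its definition is not shown in this file. For reference, a minimal
# batch iterator of the kind it implements might look like the sketch below.
# The class name `MinimalBatchIter` and the cache prefix are illustrative
# assumptions, not the real helper; it follows the `xgb.DataIter` protocol:
# yield one batch per `next` call and rewind in `reset`.


class MinimalBatchIter(xgb.DataIter):
    """Hypothetical sketch of a batch iterator; `IteratorForTest` plays this
    role in the tests below and additionally offers ``as_arrays``."""

    def __init__(self, X: List[np.ndarray], y: List[np.ndarray]) -> None:
        assert len(X) == len(y)
        self.X = X
        self.y = y
        self.it = 0
        # The cache prefix is only used when XGBoost builds an external-memory
        # DMatrix from the iterator; the path here is an arbitrary choice.
        super().__init__(cache_prefix="./batch-cache")

    def next(self, input_data) -> int:
        if self.it == len(self.X):
            return 0  # signal the end of iteration
        # Feed the current batch to XGBoost through the provided callback,
        # which accepts the same keyword arguments as the DMatrix constructor.
        input_data(data=self.X[self.it], label=self.y[self.it])
        self.it += 1
        return 1  # more batches remain

    def reset(self) -> None:
        # Rewind so XGBoost can make another pass over the data.
        self.it = 0

    def as_arrays(self) -> Tuple[np.ndarray, np.ndarray]:
        # Concatenate all batches for comparison against the iterator path.
        return np.concatenate(self.X, axis=0), np.concatenate(self.y, axis=0)
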
def run_data_iterator(
    n_samples_per_batch: int,
    n_features: int,
    n_batches: int,
    tree_method: str,
    subsample: bool,
    use_cupy: bool,
) -> None:
    n_rounds = 2
    # The test is more difficult to pass with a smaller subsample rate, as the
    # root sum is accumulated in parallel: reductions over different numbers
    # of entries lead to different floating point errors.
    subsample_rate = 0.8 if subsample else 1.0

    it = IteratorForTest(
        *make_batches(n_samples_per_batch, n_features, n_batches, use_cupy)
    )
    if n_batches == 0:
        with pytest.raises(ValueError, match="1 batch"):
            Xy = xgb.DMatrix(it)
        return

    Xy = xgb.DMatrix(it)
    assert Xy.num_row() == n_samples_per_batch * n_batches
    assert Xy.num_col() == n_features

    parameters = {
        "tree_method": tree_method,
        "max_depth": 2,
        "subsample": subsample_rate,
        "seed": 0,
    }
    if tree_method == "gpu_hist":
        parameters["sampling_method"] = "gradient_based"

    results_from_it: xgb.callback.EvaluationMonitor.EvalsLog = {}
    from_it = xgb.train(
        parameters,
        Xy,
        num_boost_round=n_rounds,
        evals=[(Xy, "Train")],
        evals_result=results_from_it,
        verbose_eval=False,
    )
    it_predt = from_it.predict(Xy)

    X, y = it.as_arrays()
    Xy = xgb.DMatrix(X, y)
    assert Xy.num_row() == n_samples_per_batch * n_batches
    assert Xy.num_col() == n_features

    results_from_arrays: xgb.callback.EvaluationMonitor.EvalsLog = {}
    from_arrays = xgb.train(
        parameters,
        Xy,
        num_boost_round=n_rounds,
        evals=[(Xy, "Train")],
        evals_result=results_from_arrays,
        verbose_eval=False,
    )
    arr_predt = from_arrays.predict(Xy)

    if tree_method != "gpu_hist":
        rtol = 1e-1  # flaky
    else:
        # The model can be sensitive to quantiles; use 1e-2 to relax the test.
        np.testing.assert_allclose(it_predt, arr_predt, rtol=1e-2)
        rtol = 1e-6

    np.testing.assert_allclose(
        results_from_it["Train"]["rmse"],
        results_from_arrays["Train"]["rmse"],
        rtol=rtol,
    )


@given(
    strategies.integers(0, 1024),
    strategies.integers(1, 7),
    strategies.integers(0, 13),
    strategies.booleans(),
)
@settings(deadline=None)
def test_data_iterator(
    n_samples_per_batch: int,
    n_features: int,
    n_batches: int,
    subsample: bool,
) -> None:
    run_data_iterator(
        n_samples_per_batch, n_features, n_batches, "approx", subsample, False
    )
    run_data_iterator(
        n_samples_per_batch, n_features, n_batches, "hist", subsample, False
    )
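

# Convenience smoke run for executing this file directly. The
# hypothesis-driven `test_data_iterator` is meant to run under pytest, so a
# single hand-picked configuration (arbitrary small sizes) is exercised here
# instead.
if __name__ == "__main__":
    test_single_batch()
    run_data_iterator(
        n_samples_per_batch=128,
        n_features=4,
        n_batches=8,
        tree_method="hist",
        subsample=False,
        use_cupy=False,
    )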