from typing import Any, Dict, List import numpy as np import pytest from hypothesis import given, settings, strategies from scipy import sparse import xgboost as xgb from xgboost.testing import ( IteratorForTest, make_batches, make_batches_sparse, make_categorical, make_ltr, make_sparse_regression, predictor_equal, ) from xgboost.testing.data import check_inf, np_dtypes from xgboost.testing.data_iter import run_mixed_sparsity class TestQuantileDMatrix: def test_basic(self) -> None: """Checks for np array, list, tuple.""" n_samples = 234 n_features = 8 rng = np.random.default_rng() X = rng.normal(loc=0, scale=3, size=n_samples * n_features).reshape( n_samples, n_features ) y = rng.normal(0, 3, size=n_samples) Xy = xgb.QuantileDMatrix(X, y) assert Xy.num_row() == n_samples assert Xy.num_col() == n_features X = sparse.random(n_samples, n_features, density=0.1, format="csr") Xy = xgb.QuantileDMatrix(X, y) assert Xy.num_row() == n_samples assert Xy.num_col() == n_features X = sparse.random(n_samples, n_features, density=0.8, format="csr") Xy = xgb.QuantileDMatrix(X, y) assert Xy.num_row() == n_samples assert Xy.num_col() == n_features n_samples = 64 data = [] for f in range(n_samples): row = [f] * n_features data.append(row) assert np.array(data).shape == (n_samples, n_features) Xy = xgb.QuantileDMatrix(data, max_bin=256) assert Xy.num_row() == n_samples assert Xy.num_col() == n_features r = np.arange(1.0, n_samples) np.testing.assert_allclose(Xy.get_data().toarray()[1:, 0], r) def test_error(self): from sklearn.model_selection import train_test_split rng = np.random.default_rng(1994) X, y = make_categorical( n_samples=128, n_features=2, n_categories=3, onehot=False ) reg = xgb.XGBRegressor(tree_method="hist", enable_categorical=True) w = rng.uniform(0, 1, size=y.shape[0]) X_train, X_test, y_train, y_test, w_train, w_test = train_test_split( X, y, w, random_state=1994 ) with pytest.raises(ValueError, match="sample weight"): reg.fit( X, y, sample_weight=w_train, eval_set=[(X_test, y_test)], sample_weight_eval_set=[w_test], ) with pytest.raises(ValueError, match="sample weight"): reg.fit( X_train, y_train, sample_weight=w, eval_set=[(X_test, y_test)], sample_weight_eval_set=[w_test], ) @pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.8, 0.9]) def test_with_iterator(self, sparsity: float) -> None: n_samples_per_batch = 317 n_features = 8 n_batches = 7 if sparsity == 0.0: it = IteratorForTest( *make_batches(n_samples_per_batch, n_features, n_batches, False), None ) else: it = IteratorForTest( *make_batches_sparse( n_samples_per_batch, n_features, n_batches, sparsity ), None, ) Xy = xgb.QuantileDMatrix(it) assert Xy.num_row() == n_samples_per_batch * n_batches assert Xy.num_col() == n_features def test_different_size(self) -> None: n_samples_per_batch = 317 n_features = 8 n_batches = 7 it = IteratorForTest( *make_batches( n_samples_per_batch, n_features, n_batches, False, vary_size=True ), cache=None, ) Xy = xgb.QuantileDMatrix(it) assert Xy.num_row() == 2429 X, y, w = it.as_arrays() Xy1 = xgb.QuantileDMatrix(X, y, weight=w) assert predictor_equal(Xy, Xy1) @pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.5, 0.8, 0.9]) def test_training(self, sparsity: float) -> None: n_samples_per_batch = 317 n_features = 8 n_batches = 7 if sparsity == 0.0: it = IteratorForTest( *make_batches(n_samples_per_batch, n_features, n_batches, False), None ) else: it = IteratorForTest( *make_batches_sparse( n_samples_per_batch, n_features, n_batches, sparsity ), None, ) parameters = {"tree_method": "hist", "max_bin": 256} Xy_it = xgb.QuantileDMatrix(it, max_bin=parameters["max_bin"]) from_it = xgb.train(parameters, Xy_it) X, y, w = it.as_arrays() w_it = Xy_it.get_weight() np.testing.assert_allclose(w_it, w) Xy_arr = xgb.DMatrix(X, y, weight=w) from_arr = xgb.train(parameters, Xy_arr) np.testing.assert_allclose(from_arr.predict(Xy_it), from_it.predict(Xy_arr)) y -= y.min() y += 0.01 Xy = xgb.QuantileDMatrix(X, y, weight=w) with pytest.raises(ValueError, match=r"Only.*hist.*"): parameters = { "tree_method": "approx", "max_bin": 256, "objective": "reg:gamma", } xgb.train(parameters, Xy) def run_ref_dmatrix(self, rng: Any, tree_method: str, enable_cat: bool) -> None: n_samples, n_features = 2048, 17 if enable_cat: X, y = make_categorical( n_samples, n_features, n_categories=13, onehot=False ) if tree_method == "gpu_hist": import cudf X = cudf.from_pandas(X) y = cudf.from_pandas(y) else: X = rng.normal(loc=0, scale=3, size=n_samples * n_features).reshape( n_samples, n_features ) y = rng.normal(0, 3, size=n_samples) # Use ref Xy = xgb.QuantileDMatrix(X, y, enable_categorical=enable_cat) Xy_valid = xgb.QuantileDMatrix(X, y, ref=Xy, enable_categorical=enable_cat) qdm_results: Dict[str, Dict[str, List[float]]] = {} xgb.train( {"tree_method": tree_method}, Xy, evals=[(Xy, "Train"), (Xy_valid, "valid")], evals_result=qdm_results, ) np.testing.assert_allclose( qdm_results["Train"]["rmse"], qdm_results["valid"]["rmse"] ) # No ref Xy_valid = xgb.QuantileDMatrix(X, y, enable_categorical=enable_cat) qdm_results = {} xgb.train( {"tree_method": tree_method}, Xy, evals=[(Xy, "Train"), (Xy_valid, "valid")], evals_result=qdm_results, ) np.testing.assert_allclose( qdm_results["Train"]["rmse"], qdm_results["valid"]["rmse"] ) # Different number of features Xy = xgb.QuantileDMatrix(X, y, enable_categorical=enable_cat) dXy = xgb.DMatrix(X, y, enable_categorical=enable_cat) n_samples, n_features = 256, 15 X = rng.normal(loc=0, scale=3, size=n_samples * n_features).reshape( n_samples, n_features ) y = rng.normal(0, 3, size=n_samples) with pytest.raises(ValueError, match=r".*features\."): xgb.QuantileDMatrix(X, y, ref=Xy, enable_categorical=enable_cat) # Compare training results n_samples, n_features = 256, 17 if enable_cat: X, y = make_categorical(n_samples, n_features, 13, onehot=False) if tree_method == "gpu_hist": import cudf X = cudf.from_pandas(X) y = cudf.from_pandas(y) else: X = rng.normal(loc=0, scale=3, size=n_samples * n_features).reshape( n_samples, n_features ) y = rng.normal(0, 3, size=n_samples) Xy_valid = xgb.QuantileDMatrix(X, y, ref=Xy, enable_categorical=enable_cat) # use DMatrix as ref Xy_valid_d = xgb.QuantileDMatrix(X, y, ref=dXy, enable_categorical=enable_cat) dXy_valid = xgb.DMatrix(X, y, enable_categorical=enable_cat) qdm_results = {} xgb.train( {"tree_method": tree_method}, Xy, evals=[(Xy, "Train"), (Xy_valid, "valid")], evals_result=qdm_results, ) dm_results: Dict[str, Dict[str, List[float]]] = {} xgb.train( {"tree_method": tree_method}, dXy, evals=[(dXy, "Train"), (dXy_valid, "valid"), (Xy_valid_d, "dvalid")], evals_result=dm_results, ) np.testing.assert_allclose( dm_results["Train"]["rmse"], qdm_results["Train"]["rmse"] ) np.testing.assert_allclose( dm_results["valid"]["rmse"], qdm_results["valid"]["rmse"] ) np.testing.assert_allclose( dm_results["dvalid"]["rmse"], qdm_results["valid"]["rmse"] ) def test_ref_dmatrix(self) -> None: rng = np.random.RandomState(1994) self.run_ref_dmatrix(rng, "hist", True) self.run_ref_dmatrix(rng, "hist", False) @pytest.mark.parametrize("sparsity", [0.0, 0.5]) def test_predict(self, sparsity: float) -> None: n_samples, n_features = 256, 4 X, y = make_categorical( n_samples, n_features, n_categories=13, onehot=False, sparsity=sparsity ) Xy = xgb.DMatrix(X, y, enable_categorical=True) booster = xgb.train({"tree_method": "hist"}, Xy) Xy = xgb.DMatrix(X, y, enable_categorical=True) a = booster.predict(Xy) qXy = xgb.QuantileDMatrix(X, y, enable_categorical=True) b = booster.predict(qXy) np.testing.assert_allclose(a, b) def test_ltr(self) -> None: X, y, qid, w = make_ltr(100, 3, 3, 5) Xy_qdm = xgb.QuantileDMatrix(X, y, qid=qid, weight=w) Xy = xgb.DMatrix(X, y, qid=qid, weight=w) xgb.train({"tree_method": "hist", "objective": "rank:ndcg"}, Xy) from_qdm = xgb.QuantileDMatrix(X, weight=w, ref=Xy_qdm) from_dm = xgb.QuantileDMatrix(X, weight=w, ref=Xy) assert predictor_equal(from_qdm, from_dm) def test_check_inf(self) -> None: rng = np.random.default_rng(1994) check_inf(rng) # we don't test empty Quantile DMatrix in single node construction. @given( strategies.integers(1, 1000), strategies.integers(1, 100), strategies.fractions(0, 0.99), ) @settings(deadline=None, print_blob=True) def test_to_csr(self, n_samples: int, n_features: int, sparsity: float) -> None: csr, y = make_sparse_regression(n_samples, n_features, sparsity, False) csr = csr.astype(np.float32) qdm = xgb.QuantileDMatrix(data=csr, label=y) ret = qdm.get_data() np.testing.assert_equal(csr.indptr, ret.indptr) np.testing.assert_equal(csr.indices, ret.indices) booster = xgb.train({"tree_method": "hist"}, dtrain=qdm) np.testing.assert_allclose( booster.predict(qdm), booster.predict(xgb.DMatrix(qdm.get_data())) ) def test_dtypes(self) -> None: """Checks for both np array and pd DataFrame.""" n_samples = 128 n_features = 16 for orig, x in np_dtypes(n_samples, n_features): m0 = xgb.QuantileDMatrix(orig) m1 = xgb.QuantileDMatrix(x) assert predictor_equal(m0, m1) # unsupported types for dtype in [ np.bytes_, np.complex64, np.complex128, ]: X: np.ndarray = np.array(orig, dtype=dtype) with pytest.raises(ValueError): xgb.QuantileDMatrix(X) def test_changed_max_bin(self) -> None: n_samples = 128 n_features = 16 csr, y = make_sparse_regression(n_samples, n_features, 0.5, False) Xy = xgb.QuantileDMatrix(csr, y, max_bin=9) booster = xgb.train({"max_bin": 9}, Xy, num_boost_round=2) Xy = xgb.QuantileDMatrix(csr, y, max_bin=11) with pytest.raises(ValueError, match="consistent"): xgb.train({}, Xy, num_boost_round=2, xgb_model=booster) def test_mixed_sparsity(self) -> None: run_mixed_sparsity("cpu")