Quantile DMatrix for CPU. (#8130)

- Add a new `QuantileDMatrix` that works for both CPU and GPU.
- Deprecate `DeviceQuantileDMatrix`.
2022-08-02 15:51:23 +08:00
parent 2cba1d9fcc
commit d87f69215e
14 changed files with 521 additions and 117 deletions
--- a/tests/python/test_data_iterator.py
+++ b/tests/python/test_data_iterator.py
@@ -1,32 +1,12 @@
 import xgboost as xgb
 from xgboost.data import SingleBatchInternalIter as SingleBatch
 import numpy as np
-from testing import IteratorForTest, non_increasing
-from typing import Tuple, List
+from testing import IteratorForTest, non_increasing, make_batches
 import pytest
 from hypothesis import given, strategies, settings
 from scipy.sparse import csr_matrix


-def make_batches(
-    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
-) -> Tuple[List[np.ndarray], List[np.ndarray]]:
-    X = []
-    y = []
-    if use_cupy:
-        import cupy
-
-        rng = cupy.random.RandomState(1994)
-    else:
-        rng = np.random.RandomState(1994)
-    for i in range(n_batches):
-        _X = rng.randn(n_samples_per_batch, n_features)
-        _y = rng.randn(n_samples_per_batch)
-        X.append(_X)
-        y.append(_y)
-    return X, y
-
-
 def test_single_batch(tree_method: str = "approx") -> None:
    from sklearn.datasets import load_breast_cancer

@@ -111,8 +91,8 @@ def run_data_iterator(
    if not subsample:
        assert non_increasing(results_from_it["Train"]["rmse"])

-    X, y = it.as_arrays()
-    Xy = xgb.DMatrix(X, y)
+    X, y, w = it.as_arrays()
+    Xy = xgb.DMatrix(X, y, weight=w)
    assert Xy.num_row() == n_samples_per_batch * n_batches
    assert Xy.num_col() == n_features

--- a/tests/python/test_quantile_dmatrix.py
+++ b/tests/python/test_quantile_dmatrix.py
@@ -0,0 +1,212 @@
+from typing import Dict, List, Any
+
+import numpy as np
+import pytest
+from scipy import sparse
+from testing import IteratorForTest, make_batches, make_batches_sparse, make_categorical
+
+import xgboost as xgb
+
+
+class TestQuantileDMatrix:  # Tests for xgb.QuantileDMatrix: construction, training, ref= and prediction.
+    def test_basic(self) -> None:  # Build from dense and CSR input and check the reported shape.
+        n_samples = 234
+        n_features = 8
+
+        rng = np.random.default_rng()  # no seed given, so the data differs between runs
+        X = rng.normal(loc=0, scale=3, size=n_samples * n_features).reshape(
+            n_samples, n_features
+        )
+        y = rng.normal(0, 3, size=n_samples)
+        Xy = xgb.QuantileDMatrix(X, y)
+        assert Xy.num_row() == n_samples
+        assert Xy.num_col() == n_features
+
+        X = sparse.random(n_samples, n_features, density=0.1, format="csr")  # same checks for a low-density CSR matrix
+        Xy = xgb.QuantileDMatrix(X, y)
+        assert Xy.num_row() == n_samples
+        assert Xy.num_col() == n_features
+
+        X = sparse.random(n_samples, n_features, density=0.8, format="csr")  # ... and for a high-density CSR matrix
+        Xy = xgb.QuantileDMatrix(X, y)
+        assert Xy.num_row() == n_samples
+        assert Xy.num_col() == n_features
+
+    @pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.8, 0.9])  # 0.0 takes the dense branch below
+    def test_with_iterator(self, sparsity: float) -> None:  # Build from a batch iterator and check the total shape.
+        n_samples_per_batch = 317
+        n_features = 8
+        n_batches = 7
+
+        if sparsity == 0.0:
+            it = IteratorForTest(
+                *make_batches(n_samples_per_batch, n_features, n_batches, False), None  # unpacked batches, then None as the next positional argument
+            )
+        else:
+            it = IteratorForTest(
+                *make_batches_sparse(
+                    n_samples_per_batch, n_features, n_batches, sparsity
+                ),
+                None
+            )
+        Xy = xgb.QuantileDMatrix(it)
+        assert Xy.num_row() == n_samples_per_batch * n_batches  # rows of all batches are counted
+        assert Xy.num_col() == n_features
+
+    @pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.5, 0.8, 0.9])
+    def test_training(self, sparsity: float) -> None:  # Iterator-built QuantileDMatrix vs. DMatrix of the concatenated data.
+        n_samples_per_batch = 317
+        n_features = 8
+        n_batches = 7
+        if sparsity == 0.0:
+            it = IteratorForTest(
+                *make_batches(n_samples_per_batch, n_features, n_batches, False), None
+            )
+        else:
+            it = IteratorForTest(
+                *make_batches_sparse(
+                    n_samples_per_batch, n_features, n_batches, sparsity
+                ),
+                None
+            )
+
+        parameters = {"tree_method": "hist", "max_bin": 256}
+        Xy_it = xgb.QuantileDMatrix(it, max_bin=parameters["max_bin"])  # same max_bin as the training parameters
+        from_it = xgb.train(parameters, Xy_it)
+
+        X, y, w = it.as_arrays()  # all batches concatenated
+        w_it = Xy_it.get_weight()
+        np.testing.assert_allclose(w_it, w)  # weights must survive the iterator path
+
+        Xy_arr = xgb.DMatrix(X, y, weight=w)
+        from_arr = xgb.train(parameters, Xy_arr)
+
+        np.testing.assert_allclose(from_arr.predict(Xy_it), from_it.predict(Xy_arr))  # each booster predicts on the other's matrix
+
+        y -= y.min()  # shift labels so the minimum is 0 ...
+        y += 0.01  # ... then strictly positive (presumably required by reg:gamma below -- confirm)
+        Xy = xgb.QuantileDMatrix(X, y, weight=w)
+        with pytest.raises(ValueError, match=r"Only.*hist.*"):  # "approx" training on a QuantileDMatrix is expected to raise
+            parameters = {
+                "tree_method": "approx",
+                "max_bin": 256,
+                "objective": "reg:gamma",
+            }
+            xgb.train(parameters, Xy)
+
+    def run_ref_dmatrix(self, rng: Any, tree_method: str, enable_cat: bool) -> None:  # Shared body: QuantileDMatrix with/without ref= vs. DMatrix.
+        n_samples, n_features = 2048, 17
+        if enable_cat:
+            X, y = make_categorical(
+                n_samples, n_features, n_categories=13, onehot=False
+            )
+            if tree_method == "gpu_hist":  # GPU variant converts the pandas objects to cudf
+                import cudf
+                X = cudf.from_pandas(X)
+                y = cudf.from_pandas(y)
+        else:
+            X = rng.normal(loc=0, scale=3, size=n_samples * n_features).reshape(
+                n_samples, n_features
+            )
+            y = rng.normal(0, 3, size=n_samples)
+
+        # Use ref
+        Xy = xgb.QuantileDMatrix(X, y, enable_categorical=enable_cat)
+        Xy_valid = xgb.QuantileDMatrix(X, y, ref=Xy, enable_categorical=enable_cat)  # training matrix passed as ref
+        qdm_results: Dict[str, Dict[str, List[float]]] = {}
+        xgb.train(
+            {"tree_method": tree_method},
+            Xy,
+            evals=[(Xy, "Train"), (Xy_valid, "valid")],
+            evals_result=qdm_results,
+        )
+        np.testing.assert_allclose(  # Train and valid hold the same data, so RMSE must match
+            qdm_results["Train"]["rmse"], qdm_results["valid"]["rmse"]
+        )
+        # No ref
+        Xy_valid = xgb.QuantileDMatrix(X, y, enable_categorical=enable_cat)
+        qdm_results = {}
+        xgb.train(
+            {"tree_method": tree_method},
+            Xy,
+            evals=[(Xy, "Train"), (Xy_valid, "valid")],
+            evals_result=qdm_results,
+        )
+        np.testing.assert_allclose(
+            qdm_results["Train"]["rmse"], qdm_results["valid"]["rmse"]
+        )
+
+        # Different number of features
+        Xy = xgb.QuantileDMatrix(X, y, enable_categorical=enable_cat)
+        dXy = xgb.DMatrix(X, y, enable_categorical=enable_cat)  # plain DMatrix over the same data, used for comparison below
+
+        n_samples, n_features = 256, 15  # 15 columns, while Xy was built with 17
+        X = rng.normal(loc=0, scale=3, size=n_samples * n_features).reshape(
+            n_samples, n_features
+        )
+        y = rng.normal(0, 3, size=n_samples)
+        with pytest.raises(ValueError, match=r".*features\."):  # feature-count mismatch with ref must raise
+            xgb.QuantileDMatrix(X, y, ref=Xy, enable_categorical=enable_cat)
+
+        # Compare training results
+        n_samples, n_features = 256, 17  # back to 17 features, fewer rows than the training data
+        if enable_cat:
+            X, y = make_categorical(n_samples, n_features, 13, onehot=False)
+            if tree_method == "gpu_hist":
+                import cudf
+                X = cudf.from_pandas(X)
+                y = cudf.from_pandas(y)
+        else:
+            X = rng.normal(loc=0, scale=3, size=n_samples * n_features).reshape(
+                n_samples, n_features
+            )
+            y = rng.normal(0, 3, size=n_samples)
+        Xy_valid = xgb.QuantileDMatrix(X, y, ref=Xy, enable_categorical=enable_cat)
+        # use DMatrix as ref
+        Xy_valid_d = xgb.QuantileDMatrix(X, y, ref=dXy, enable_categorical=enable_cat)
+        dXy_valid = xgb.DMatrix(X, y, enable_categorical=enable_cat)
+
+        qdm_results = {}
+        xgb.train(
+            {"tree_method": tree_method},
+            Xy,
+            evals=[(Xy, "Train"), (Xy_valid, "valid")],
+            evals_result=qdm_results,
+        )
+
+        dm_results: Dict[str, Dict[str, List[float]]] = {}
+        xgb.train(
+            {"tree_method": tree_method},
+            dXy,
+            evals=[(dXy, "Train"), (dXy_valid, "valid"), (Xy_valid_d, "dvalid")],  # "dvalid" is the QuantileDMatrix whose ref is a DMatrix
+            evals_result=dm_results,
+        )
+        np.testing.assert_allclose(
+            dm_results["Train"]["rmse"], qdm_results["Train"]["rmse"]
+        )
+        np.testing.assert_allclose(
+            dm_results["valid"]["rmse"], qdm_results["valid"]["rmse"]
+        )
+        np.testing.assert_allclose(  # DMatrix-ref'd validation set must score like the QuantileDMatrix-ref'd one
+            dm_results["dvalid"]["rmse"], qdm_results["valid"]["rmse"]
+        )
+
+    def test_ref_dmatrix(self) -> None:  # "hist" run of the ref checks, with and without categorical data
+        rng = np.random.RandomState(1994)  # fixed seed, generator shared by both runs
+        self.run_ref_dmatrix(rng, "hist", True)
+        self.run_ref_dmatrix(rng, "hist", False)
+
+    def test_predict(self) -> None:  # Predictions on DMatrix and QuantileDMatrix of the same data must match.
+        n_samples, n_features = 16, 2
+        X, y = make_categorical(
+            n_samples, n_features, n_categories=13, onehot=False
+        )
+        Xy = xgb.DMatrix(X, y, enable_categorical=True)
+
+        booster = xgb.train({"tree_method": "hist"}, Xy)
+
+        Xy = xgb.DMatrix(X, y, enable_categorical=True)  # rebuilt from the same X, y
+        a = booster.predict(Xy)
+        qXy = xgb.QuantileDMatrix(X, y, enable_categorical=True)
+        b = booster.predict(qXy)
+        np.testing.assert_allclose(a, b)
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@@ -1382,6 +1382,42 @@ class TestWithDask:
        num_rounds = 30
        self.run_updater_test(client, params, num_rounds, dataset, 'hist')

+    def test_quantile_dmatrix(self, client: Client) -> None:  # DaskQuantileDMatrix must reproduce the DaskDMatrix eval history.
+        X, y = make_categorical(client, 10000, 30, 13)
+
+        Xy = xgb.dask.DaskDMatrix(client, X, y, enable_categorical=True)  # baseline run on DaskDMatrix
+        valid_Xy = xgb.dask.DaskDMatrix(client, X, y, enable_categorical=True)
+
+        output = xgb.dask.train(
+            client,
+            {"tree_method": "hist"},
+            Xy,
+            num_boost_round=10,
+            evals=[(Xy, "Train"), (valid_Xy, "Valid")]
+        )
+        dmatrix_hist = output["history"]  # per-dataset metric history of the baseline run
+
+        Xy = xgb.dask.DaskQuantileDMatrix(client, X, y, enable_categorical=True)  # same data and settings, quantile matrices
+        valid_Xy = xgb.dask.DaskQuantileDMatrix(
+            client, X, y, enable_categorical=True, ref=Xy  # training matrix passed as ref
+        )
+
+        output = xgb.dask.train(
+            client,
+            {"tree_method": "hist"},
+            Xy,
+            num_boost_round=10,
+            evals=[(Xy, "Train"), (valid_Xy, "Valid")]
+        )
+        quantile_hist = output["history"]
+
+        np.testing.assert_allclose(  # both runs must report the same RMSE history
+            quantile_hist["Train"]["rmse"], dmatrix_hist["Train"]["rmse"]
+        )
+        np.testing.assert_allclose(
+            quantile_hist["Valid"]["rmse"], dmatrix_hist["Valid"]["rmse"]
+        )
+
    @given(params=exact_parameter_strategy,
           dataset=tm.dataset_strategy)
    @settings(deadline=None, suppress_health_check=suppress, print_blob=True)
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -1,11 +1,11 @@
 from concurrent.futures import ThreadPoolExecutor
 import os
 import multiprocessing
-from typing import Tuple, Union
+from typing import Tuple, Union, List, Sequence, Callable
 import urllib
 import zipfile
 import sys
-from typing import Optional
+from typing import Optional, Dict, Any
 from contextlib import contextmanager
 from io import StringIO
 from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
@@ -180,79 +180,148 @@ def skip_s390x():


 class IteratorForTest(xgb.core.DataIter):  # Test iterator that replays pre-built batches of X, y and optional weights.
-    def __init__(self, X, y):
+    def __init__(
+        self,
+        X: Sequence,
+        y: Sequence,
+        w: Optional[Sequence],  # per-batch weights, or None for unweighted data
+        cache: Optional[str] = "./"  # forwarded unchanged to DataIter.__init__
+    ) -> None:
        assert len(X) == len(y)
        self.X = X
        self.y = y
+        self.w = w
        self.it = 0  # index of the next batch to hand out
-        super().__init__("./")
+        super().__init__(cache)

-    def next(self, input_data):
+    def next(self, input_data: Callable) -> int:  # feed one batch; returns 0 when exhausted, else 1
        if self.it == len(self.X):
            return 0
        # Use copy to make sure the iterator doesn't hold a reference to the data.
-        input_data(data=self.X[self.it].copy(), label=self.y[self.it].copy())
-        gc.collect()            # clear up the copy, see if XGBoost access freed memory.
+        input_data(
+            data=self.X[self.it].copy(),
+            label=self.y[self.it].copy(),
+            weight=self.w[self.it].copy() if self.w else None,  # truthiness test: None or an empty sequence means no weight
+        )
+        gc.collect()  # clear up the copy, see if XGBoost access freed memory.
        self.it += 1
        return 1

-    def reset(self):
+    def reset(self) -> None:  # rewind to the first batch
        self.it = 0

-    def as_arrays(self):
-        X = np.concatenate(self.X, axis=0)
+    def as_arrays(
+        self,
+    ) -> Tuple[Union[np.ndarray, sparse.csr_matrix], np.ndarray, np.ndarray]:
+        if isinstance(self.X[0], sparse.csr_matrix):  # the first batch decides how all batches are joined
+            X = sparse.vstack(self.X, format="csr")
+        else:
+            X = np.concatenate(self.X, axis=0)
        y = np.concatenate(self.y, axis=0)
-        return X, y
+        w = np.concatenate(self.w, axis=0)  # NOTE(review): self.w may be None per __init__ -- confirm callers only use this with weights
+        return X, y, w
+
+
+def make_batches(  # Generate n_batches dense random batches; returns the lists (X, y, w).
+    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
+) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
+    X = []
+    y = []
+    w = []
+    if use_cupy:
+        import cupy  # imported only when requested
+
+        rng = cupy.random.RandomState(1994)
+    else:
+        rng = np.random.RandomState(1994)  # fixed seed in both branches
+    for i in range(n_batches):
+        _X = rng.randn(n_samples_per_batch, n_features)
+        _y = rng.randn(n_samples_per_batch)
+        _w = rng.uniform(low=0, high=1, size=n_samples_per_batch)  # one weight per sample, drawn between 0 and 1
+        X.append(_X)
+        y.append(_y)
+        w.append(_w)
+    return X, y, w
+
+
+def make_batches_sparse(  # Generate n_batches random CSR batches; returns the lists (X, y, w).
+    n_samples_per_batch: int, n_features: int, n_batches: int, sparsity: float
+) -> Tuple[List[sparse.csr_matrix], List[np.ndarray], List[np.ndarray]]:
+    X = []
+    y = []
+    w = []
+    rng = np.random.RandomState(1994)  # fixed seed
+    for i in range(n_batches):
+        _X = sparse.random(
+            n_samples_per_batch,
+            n_features,
+            1.0 - sparsity,  # density argument: the complement of the requested sparsity
+            format="csr",
+            dtype=np.float32,
+            random_state=rng,  # the same generator also draws y and w below
+        )
+        _y = rng.randn(n_samples_per_batch)
+        _w = rng.uniform(low=0, high=1, size=n_samples_per_batch)  # one weight per sample, drawn between 0 and 1
+        X.append(_X)
+        y.append(_y)
+        w.append(_w)
+    return X, y, w


 # Contains a dataset in numpy format as well as the relevant objective and metric
 class TestDataset:
-    def __init__(self, name, get_dataset, objective, metric):
+    def __init__(
+        self, name: str, get_dataset: Callable, objective: str, metric: str
+    ) -> None:
        self.name = name
        self.objective = objective
        self.metric = metric
        self.X, self.y = get_dataset()  # the loader is called once, at construction
-        self.w = None
+        self.w: Optional[np.ndarray] = None  # weights stay None unless assigned after construction
        self.margin: Optional[np.ndarray] = None

-    def set_params(self, params_in):
+    def set_params(self, params_in: Dict[str, Any]) -> Dict[str, Any]:  # mutates params_in in place and returns it
        params_in['objective'] = self.objective
        params_in['eval_metric'] = self.metric
        if self.objective == "multi:softmax":
            params_in["num_class"] = int(np.max(self.y) + 1)  # largest label value plus one
        return params_in

-    def get_dmat(self):
+    def get_dmat(self) -> xgb.DMatrix:
        return xgb.DMatrix(
            self.X, self.y, self.w, base_margin=self.margin, enable_categorical=True
        )

-    def get_device_dmat(self):
+    def get_device_dmat(self) -> xgb.DeviceQuantileDMatrix:
        w = None if self.w is None else cp.array(self.w)
        X = cp.array(self.X, dtype=np.float32)
        y = cp.array(self.y, dtype=np.float32)
        return xgb.DeviceQuantileDMatrix(X, y, w, base_margin=self.margin)

-    def get_external_dmat(self):
+    def get_external_dmat(self) -> xgb.DMatrix:  # split the data into batches and build the DMatrix from an iterator
        n_samples = self.X.shape[0]
        n_batches = 10
        per_batch = n_samples // n_batches + 1

        predictor = []
        response = []
+        weight = []
        for i in range(n_batches):
            beg = i * per_batch
            end = min((i + 1) * per_batch, n_samples)  # clamp the last slice to the data size
            assert end != beg
            X = self.X[beg: end, ...]
            y = self.y[beg: end]
+            w = self.w[beg: end] if self.w is not None else None  # slice weights only when the dataset has them
            predictor.append(X)
            response.append(y)
+            if w is not None:
+                weight.append(w)

-        it = IteratorForTest(predictor, response)
+        it = IteratorForTest(predictor, response, weight if weight else None)  # an empty weight list is passed as None
        return xgb.DMatrix(it)

-    def __repr__(self):
+    def __repr__(self) -> str:
        return self.name


--- a/tests/python/with_omp_limit.py
+++ b/tests/python/with_omp_limit.py
@@ -1,4 +1,3 @@
-import os
 import xgboost as xgb
 from sklearn.datasets import make_classification
 from sklearn.metrics import roc_auc_score