Support exporting cut values (#9356)

This commit is contained in:
Jiaming Yuan
2023-07-08 15:32:41 +08:00
committed by GitHub
parent c3124813e8
commit 20c52f07d2
28 changed files with 722 additions and 101 deletions

View File

@@ -3,6 +3,7 @@
"""Core XGBoost Library."""
import copy
import ctypes
import importlib.util
import json
import os
import re
@@ -381,6 +382,54 @@ def c_array(
return (ctype * len(values))(*values)
def from_array_interface(interface: dict) -> NumpyOrCupy:
    """Convert array interface to numpy or cupy array"""

    class _ArrayLike:  # pylint: disable=too-few-public-methods
        """Adapter exposing the numpy/cupy array interface protocols."""

        _interface: Optional[dict] = None

        @property
        def __array_interface__(self) -> Optional[dict]:
            return self._interface

        @__array_interface__.setter
        def __array_interface__(self, interface: dict) -> None:
            new_iface = copy.copy(interface)
            # numpy requires these fields to be tuples, while the JSON
            # decoder produces lists.
            new_iface["shape"] = tuple(new_iface["shape"])
            new_iface["data"] = tuple(new_iface["data"])
            strides = new_iface.get("strides", None)
            if strides is not None:
                new_iface["strides"] = tuple(strides)
            self._interface = new_iface

        @property
        def __cuda_array_interface__(self) -> Optional[dict]:
            return self.__array_interface__

        @__cuda_array_interface__.setter
        def __cuda_array_interface__(self, interface: dict) -> None:
            self.__array_interface__ = interface

    wrapper = _ArrayLike()

    if "stream" not in interface:
        # No CUDA stream field: a plain __array_interface__, copy via numpy.
        wrapper.__array_interface__ = interface
        return np.array(wrapper, copy=True)

    # CUDA stream is presented, this is a __cuda_array_interface__.
    if importlib.util.find_spec("cupy") is None:
        raise ImportError("`cupy` is required for handling CUDA buffer.")
    import cupy as cp  # pylint: disable=import-error

    wrapper.__cuda_array_interface__ = interface
    return cp.array(wrapper, copy=True)
def _prediction_output(
shape: CNumericPtr, dims: c_bst_ulong, predts: CFloatPtr, is_cuda: bool
) -> NumpyOrCupy:
@@ -1060,6 +1109,32 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
)
return ret
def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]:
    """Get quantile cuts for quantization."""
    n_features = self.num_col()

    # Output parameters: JSON strings encoding the array interfaces of
    # the CSC-style pointer array and the flat cut values.
    c_sindptr = ctypes.c_char_p()
    c_sdata = ctypes.c_char_p()
    _check_call(
        _LIB.XGDMatrixGetQuantileCut(
            self.handle,
            make_jcargs(),
            ctypes.byref(c_sindptr),
            ctypes.byref(c_sdata),
        )
    )
    assert c_sindptr.value is not None
    assert c_sdata.value is not None

    # One pointer entry per feature plus a trailing sentinel.
    indptr = from_array_interface(json.loads(c_sindptr.value))
    assert indptr.size == n_features + 1
    assert indptr.dtype == np.uint64

    # Cut values for all features, concatenated; indptr[-1] is the total.
    data = from_array_interface(json.loads(c_sdata.value))
    assert data.size == indptr[-1]
    assert data.dtype == np.float32

    return indptr, data
def num_row(self) -> int:
"""Get the number of rows in the DMatrix."""
ret = c_bst_ulong()

View File

@@ -265,6 +265,14 @@ def make_batches(
return X, y, w
def make_regression(
    n_samples: int, n_features: int, use_cupy: bool
) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
    """Make a simple regression dataset."""
    # Generate a single batch and unwrap it into (X, y, weight).
    batches = make_batches(n_samples, n_features, 1, use_cupy)
    X, y, w = (arrays[0] for arrays in batches)
    return X, y, w
def make_batches_sparse(
n_samples_per_batch: int, n_features: int, n_batches: int, sparsity: float
) -> Tuple[List[sparse.csr_matrix], List[np.ndarray], List[np.ndarray]]:

View File

@@ -1,7 +1,7 @@
"""Tests for updaters."""
import json
from functools import partial, update_wrapper
from typing import Dict
from typing import Any, Dict
import numpy as np
@@ -159,3 +159,100 @@ def check_quantile_loss(tree_method: str, weighted: bool) -> None:
for i in range(alpha.shape[0]):
np.testing.assert_allclose(predts[:, i], predt_multi[:, i])
def check_cut(
    n_entries: int, indptr: np.ndarray, data: np.ndarray, dtypes: Any
) -> None:
    """Check the cut values."""
    from pandas.api.types import is_categorical_dtype

    assert indptr.dtype == np.uint64
    assert data.shape[0] == indptr[-1]
    assert data.shape[0] == n_entries

    for fidx in range(1, indptr.size):
        begin, end = int(indptr[fidx - 1]), int(indptr[fidx])
        for k in range(begin + 1, end):
            # Cut values must be strictly increasing within a feature.
            assert data[k] > data[k - 1]
            if is_categorical_dtype(dtypes[fidx - 1]):
                # Categorical "cuts" are consecutive integer codes.
                assert data[k] == data[k - 1] + 1
def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None:
    """Check with optional cupy.

    Exercises ``get_quantile_cut`` on QuantileDMatrix, trained DMatrix, and
    external-memory DMatrix, for numerical, categorical, and mixed features.
    """
    from pandas.api.types import is_categorical_dtype

    n_samples = 1024
    n_features = 14
    max_bin = 16
    dtypes = [np.float32] * n_features

    # numerical
    X, y, w = tm.make_regression(n_samples, n_features, use_cupy=use_cupy)
    # - qdm
    Xyw: xgb.DMatrix = xgb.QuantileDMatrix(X, y, weight=w, max_bin=max_bin)
    indptr, data = Xyw.get_quantile_cut()
    check_cut((max_bin + 1) * n_features, indptr, data, dtypes)
    # - dm: cuts only exist after training builds the histogram index.
    Xyw = xgb.DMatrix(X, y, weight=w)
    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xyw)
    indptr, data = Xyw.get_quantile_cut()
    check_cut((max_bin + 1) * n_features, indptr, data, dtypes)
    # - ext mem
    n_batches = 3
    n_samples_per_batch = 256
    it = tm.IteratorForTest(
        *tm.make_batches(n_samples_per_batch, n_features, n_batches, use_cupy),
        cache="cache",
    )
    Xy: xgb.DMatrix = xgb.DMatrix(it)
    # Fix: train and query the external-memory matrix `Xy`; previously this
    # reused the in-memory `Xyw`, so the ext-mem path was never tested.
    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy)
    indptr, data = Xy.get_quantile_cut()
    check_cut((max_bin + 1) * n_features, indptr, data, dtypes)

    # categorical
    n_categories = 32
    X, y = tm.make_categorical(n_samples, n_features, n_categories, False, sparsity=0.8)
    if use_cupy:
        import cudf  # pylint: disable=import-error
        import cupy as cp  # pylint: disable=import-error

        X = cudf.from_pandas(X)
        y = cp.array(y)
    # - qdm: one "cut" per category per feature.
    Xy = xgb.QuantileDMatrix(X, y, max_bin=max_bin, enable_categorical=True)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_categories * n_features, indptr, data, X.dtypes)
    # - dm
    Xy = xgb.DMatrix(X, y, enable_categorical=True)
    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_categories * n_features, indptr, data, X.dtypes)

    # mixed
    X, y = tm.make_categorical(
        n_samples, n_features, n_categories, False, sparsity=0.8, cat_ratio=0.5
    )
    n_cat_features = len([0 for dtype in X.dtypes if is_categorical_dtype(dtype)])
    n_num_features = n_features - n_cat_features
    n_entries = n_categories * n_cat_features + (max_bin + 1) * n_num_features
    # - qdm
    Xy = xgb.QuantileDMatrix(X, y, max_bin=max_bin, enable_categorical=True)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_entries, indptr, data, X.dtypes)
    # - dm
    Xy = xgb.DMatrix(X, y, enable_categorical=True)
    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_entries, indptr, data, X.dtypes)
def check_get_quantile_cut(tree_method: str) -> None:
    """Check the quantile cut getter."""
    # Always cover the numpy path; only the GPU tree method also
    # exercises the cupy path.
    check_get_quantile_cut_device(tree_method, False)
    if tree_method == "gpu_hist":
        check_get_quantile_cut_device(tree_method, True)