Support exporting cut values (#9356)

This commit is contained in:
Jiaming Yuan
2023-07-08 15:32:41 +08:00
committed by GitHub
parent c3124813e8
commit 20c52f07d2
28 changed files with 722 additions and 101 deletions

View File

@@ -3,6 +3,7 @@
"""Core XGBoost Library."""
import copy
import ctypes
import importlib.util
import json
import os
import re
@@ -381,6 +382,54 @@ def c_array(
return (ctype * len(values))(*values)
def from_array_interface(interface: dict) -> NumpyOrCupy:
    """Convert array interface to numpy or cupy array"""

    class _ArrayLike:  # pylint: disable=too-few-public-methods
        """Adapter exposing the numpy/cupy array interface protocols."""

        _interface: Optional[dict] = None

        @property
        def __array_interface__(self) -> Optional[dict]:
            return self._interface

        @__array_interface__.setter
        def __array_interface__(self, interface: dict) -> None:
            new_iface = copy.copy(interface)
            # numpy requires these fields to be tuples, while the JSON
            # decoder produces lists.
            new_iface["shape"] = tuple(new_iface["shape"])
            new_iface["data"] = tuple(new_iface["data"])
            strides = new_iface.get("strides", None)
            if strides is not None:
                new_iface["strides"] = tuple(strides)
            self._interface = new_iface

        @property
        def __cuda_array_interface__(self) -> Optional[dict]:
            return self.__array_interface__

        @__cuda_array_interface__.setter
        def __cuda_array_interface__(self, interface: dict) -> None:
            self.__array_interface__ = interface

    wrapper = _ArrayLike()

    if "stream" not in interface:
        # No CUDA stream field: a plain __array_interface__, copy via numpy.
        wrapper.__array_interface__ = interface
        return np.array(wrapper, copy=True)

    # CUDA stream is presented, this is a __cuda_array_interface__.
    if importlib.util.find_spec("cupy") is None:
        raise ImportError("`cupy` is required for handling CUDA buffer.")
    import cupy as cp  # pylint: disable=import-error

    wrapper.__cuda_array_interface__ = interface
    return cp.array(wrapper, copy=True)
def _prediction_output(
shape: CNumericPtr, dims: c_bst_ulong, predts: CFloatPtr, is_cuda: bool
) -> NumpyOrCupy:
@@ -1060,6 +1109,32 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
)
return ret
def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]:
    """Get quantile cuts for quantization."""
    n_features = self.num_col()

    # Output parameters: JSON strings encoding the array interfaces of
    # the CSC-style pointer array and the flat cut values.
    c_sindptr = ctypes.c_char_p()
    c_sdata = ctypes.c_char_p()
    _check_call(
        _LIB.XGDMatrixGetQuantileCut(
            self.handle,
            make_jcargs(),
            ctypes.byref(c_sindptr),
            ctypes.byref(c_sdata),
        )
    )
    assert c_sindptr.value is not None
    assert c_sdata.value is not None

    # One pointer entry per feature plus a trailing sentinel.
    indptr = from_array_interface(json.loads(c_sindptr.value))
    assert indptr.size == n_features + 1
    assert indptr.dtype == np.uint64

    # Cut values for all features, concatenated; indptr[-1] is the total.
    data = from_array_interface(json.loads(c_sdata.value))
    assert data.size == indptr[-1]
    assert data.dtype == np.float32

    return indptr, data
def num_row(self) -> int:
"""Get the number of rows in the DMatrix."""
ret = c_bst_ulong()

View File

@@ -265,6 +265,14 @@ def make_batches(
return X, y, w
def make_regression(
    n_samples: int, n_features: int, use_cupy: bool
) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
    """Make a simple regression dataset."""
    # Generate a single batch and unwrap it into (X, y, weight).
    batches = make_batches(n_samples, n_features, 1, use_cupy)
    X, y, w = (arrays[0] for arrays in batches)
    return X, y, w
def make_batches_sparse(
n_samples_per_batch: int, n_features: int, n_batches: int, sparsity: float
) -> Tuple[List[sparse.csr_matrix], List[np.ndarray], List[np.ndarray]]:

View File

@@ -1,7 +1,7 @@
"""Tests for updaters."""
import json
from functools import partial, update_wrapper
from typing import Dict
from typing import Any, Dict
import numpy as np
@@ -159,3 +159,100 @@ def check_quantile_loss(tree_method: str, weighted: bool) -> None:
for i in range(alpha.shape[0]):
np.testing.assert_allclose(predts[:, i], predt_multi[:, i])
def check_cut(
    n_entries: int, indptr: np.ndarray, data: np.ndarray, dtypes: Any
) -> None:
    """Check the cut values."""
    from pandas.api.types import is_categorical_dtype

    assert indptr.dtype == np.uint64
    assert data.shape[0] == indptr[-1]
    assert data.shape[0] == n_entries

    for fidx in range(1, indptr.size):
        begin, end = int(indptr[fidx - 1]), int(indptr[fidx])
        for k in range(begin + 1, end):
            # Cut values must be strictly increasing within a feature.
            assert data[k] > data[k - 1]
            if is_categorical_dtype(dtypes[fidx - 1]):
                # Categorical "cuts" are consecutive integer codes.
                assert data[k] == data[k - 1] + 1
def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None:
    """Check with optional cupy.

    Exercises ``get_quantile_cut`` on QuantileDMatrix, trained DMatrix, and
    external-memory DMatrix, for numerical, categorical, and mixed features.
    """
    from pandas.api.types import is_categorical_dtype

    n_samples = 1024
    n_features = 14
    max_bin = 16
    dtypes = [np.float32] * n_features

    # numerical
    X, y, w = tm.make_regression(n_samples, n_features, use_cupy=use_cupy)
    # - qdm
    Xyw: xgb.DMatrix = xgb.QuantileDMatrix(X, y, weight=w, max_bin=max_bin)
    indptr, data = Xyw.get_quantile_cut()
    check_cut((max_bin + 1) * n_features, indptr, data, dtypes)
    # - dm: cuts only exist after training builds the histogram index.
    Xyw = xgb.DMatrix(X, y, weight=w)
    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xyw)
    indptr, data = Xyw.get_quantile_cut()
    check_cut((max_bin + 1) * n_features, indptr, data, dtypes)
    # - ext mem
    n_batches = 3
    n_samples_per_batch = 256
    it = tm.IteratorForTest(
        *tm.make_batches(n_samples_per_batch, n_features, n_batches, use_cupy),
        cache="cache",
    )
    Xy: xgb.DMatrix = xgb.DMatrix(it)
    # Fix: train and query the external-memory matrix `Xy`; previously this
    # reused the in-memory `Xyw`, so the ext-mem path was never tested.
    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy)
    indptr, data = Xy.get_quantile_cut()
    check_cut((max_bin + 1) * n_features, indptr, data, dtypes)

    # categorical
    n_categories = 32
    X, y = tm.make_categorical(n_samples, n_features, n_categories, False, sparsity=0.8)
    if use_cupy:
        import cudf  # pylint: disable=import-error
        import cupy as cp  # pylint: disable=import-error

        X = cudf.from_pandas(X)
        y = cp.array(y)
    # - qdm: one "cut" per category per feature.
    Xy = xgb.QuantileDMatrix(X, y, max_bin=max_bin, enable_categorical=True)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_categories * n_features, indptr, data, X.dtypes)
    # - dm
    Xy = xgb.DMatrix(X, y, enable_categorical=True)
    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_categories * n_features, indptr, data, X.dtypes)

    # mixed
    X, y = tm.make_categorical(
        n_samples, n_features, n_categories, False, sparsity=0.8, cat_ratio=0.5
    )
    n_cat_features = len([0 for dtype in X.dtypes if is_categorical_dtype(dtype)])
    n_num_features = n_features - n_cat_features
    n_entries = n_categories * n_cat_features + (max_bin + 1) * n_num_features
    # - qdm
    Xy = xgb.QuantileDMatrix(X, y, max_bin=max_bin, enable_categorical=True)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_entries, indptr, data, X.dtypes)
    # - dm
    Xy = xgb.DMatrix(X, y, enable_categorical=True)
    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_entries, indptr, data, X.dtypes)
def check_get_quantile_cut(tree_method: str) -> None:
    """Check the quantile cut getter."""
    # Always cover the numpy path; only the GPU tree method also
    # exercises the cupy path.
    check_get_quantile_cut_device(tree_method, False)
    if tree_method == "gpu_hist":
        check_get_quantile_cut_device(tree_method, True)