Support exporting cut values (#9356)
This commit is contained in:
@@ -3,6 +3,7 @@
|
||||
"""Core XGBoost Library."""
|
||||
import copy
|
||||
import ctypes
|
||||
import importlib.util
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@@ -381,6 +382,54 @@ def c_array(
|
||||
return (ctype * len(values))(*values)
|
||||
|
||||
|
||||
def from_array_interface(interface: dict) -> NumpyOrCupy:
    """Convert array interface to numpy or cupy array"""

    class Proxy:  # pylint: disable=too-few-public-methods
        """Shim object exposing the (CUDA) array interface protocol."""

        _interface: Optional[dict] = None

        @property
        def __array_interface__(self) -> Optional[dict]:
            return self._interface

        @__array_interface__.setter
        def __array_interface__(self, interface: dict) -> None:
            # numpy requires the shape/data/strides fields to be tuples,
            # while the JSON document encodes them as lists.
            normalized = copy.copy(interface)
            normalized["shape"] = tuple(normalized["shape"])
            normalized["data"] = tuple(normalized["data"])
            if normalized.get("strides", None) is not None:
                normalized["strides"] = tuple(normalized["strides"])
            self._interface = normalized

        @property
        def __cuda_array_interface__(self) -> Optional[dict]:
            return self.__array_interface__

        @__cuda_array_interface__.setter
        def __cuda_array_interface__(self, interface: dict) -> None:
            self.__array_interface__ = interface

    proxy = Proxy()

    if "stream" not in interface:
        # Plain host buffer: hand the proxy straight to numpy.
        proxy.__array_interface__ = interface
        return np.array(proxy, copy=True)

    # A CUDA stream field marks this as a __cuda_array_interface__.
    if importlib.util.find_spec("cupy") is None:
        raise ImportError("`cupy` is required for handling CUDA buffer.")

    import cupy as cp  # pylint: disable=import-error

    proxy.__cuda_array_interface__ = interface
    return cp.array(proxy, copy=True)
|
||||
|
||||
|
||||
def _prediction_output(
|
||||
shape: CNumericPtr, dims: c_bst_ulong, predts: CFloatPtr, is_cuda: bool
|
||||
) -> NumpyOrCupy:
|
||||
@@ -1060,6 +1109,32 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
|
||||
)
|
||||
return ret
|
||||
|
||||
def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]:
    """Get quantile cuts for quantization."""
    n_features = self.num_col()

    # The C API returns two JSON-encoded array interface documents.
    sindptr = ctypes.c_char_p()
    sdata = ctypes.c_char_p()
    config = make_jcargs()
    _check_call(
        _LIB.XGDMatrixGetQuantileCut(
            self.handle, config, ctypes.byref(sindptr), ctypes.byref(sdata)
        )
    )
    assert sindptr.value is not None
    assert sdata.value is not None

    # CSC-style pointer array: one entry per feature, plus one.
    indptr = from_array_interface(json.loads(sindptr.value))
    assert indptr.size == n_features + 1
    assert indptr.dtype == np.uint64

    # Flattened cut values for all features.
    data = from_array_interface(json.loads(sdata.value))
    assert data.size == indptr[-1]
    assert data.dtype == np.float32
    return indptr, data
|
||||
|
||||
def num_row(self) -> int:
|
||||
"""Get the number of rows in the DMatrix."""
|
||||
ret = c_bst_ulong()
|
||||
|
||||
@@ -265,6 +265,14 @@ def make_batches(
|
||||
return X, y, w
|
||||
|
||||
|
||||
def make_regression(
    n_samples: int, n_features: int, use_cupy: bool
) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
    """Make a simple regression dataset."""
    # Generate exactly one batch and unwrap its components.
    batch_X, batch_y, batch_w = make_batches(n_samples, n_features, 1, use_cupy)
    return batch_X[0], batch_y[0], batch_w[0]
|
||||
|
||||
|
||||
def make_batches_sparse(
|
||||
n_samples_per_batch: int, n_features: int, n_batches: int, sparsity: float
|
||||
) -> Tuple[List[sparse.csr_matrix], List[np.ndarray], List[np.ndarray]]:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""Tests for updaters."""
|
||||
import json
|
||||
from functools import partial, update_wrapper
|
||||
from typing import Dict
|
||||
from typing import Any, Dict
|
||||
|
||||
import numpy as np
|
||||
|
||||
@@ -159,3 +159,100 @@ def check_quantile_loss(tree_method: str, weighted: bool) -> None:
|
||||
|
||||
for i in range(alpha.shape[0]):
|
||||
np.testing.assert_allclose(predts[:, i], predt_multi[:, i])
|
||||
|
||||
|
||||
def check_cut(
    n_entries: int, indptr: np.ndarray, data: np.ndarray, dtypes: Any
) -> None:
    """Check the cut values.

    Parameters
    ----------
    n_entries :
        Expected total number of cut values across all features.
    indptr :
        CSC-style pointer into ``data``; entry ``i`` and ``i + 1`` bound the
        cuts of feature ``i``.
    data :
        Flattened cut values for all features.
    dtypes :
        Per-feature dtypes, used to detect categorical features.
    """
    # `pandas.api.types.is_categorical_dtype` is deprecated since pandas 2.1;
    # check against the dtype class instead.
    import pandas as pd

    assert data.shape[0] == indptr[-1]
    assert data.shape[0] == n_entries

    assert indptr.dtype == np.uint64
    for i in range(1, indptr.size):
        beg = int(indptr[i - 1])
        end = int(indptr[i])
        for j in range(beg + 1, end):
            # Cut values within one feature must be strictly increasing.
            assert data[j] > data[j - 1]
            if isinstance(dtypes[i - 1], pd.CategoricalDtype):
                # Categorical "cuts" are consecutive integer codes.
                assert data[j] == data[j - 1] + 1
|
||||
|
||||
|
||||
def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None:
    """Check with optional cupy."""
    # `is_categorical_dtype` is deprecated since pandas 2.1; use the dtype
    # class for the categorical test below.
    import pandas as pd

    n_samples = 1024
    n_features = 14
    max_bin = 16
    dtypes = [np.float32] * n_features

    # numerical
    X, y, w = tm.make_regression(n_samples, n_features, use_cupy=use_cupy)
    # - qdm
    Xyw: xgb.DMatrix = xgb.QuantileDMatrix(X, y, weight=w, max_bin=max_bin)
    indptr, data = Xyw.get_quantile_cut()
    check_cut((max_bin + 1) * n_features, indptr, data, dtypes)
    # - dm
    Xyw = xgb.DMatrix(X, y, weight=w)
    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xyw)
    indptr, data = Xyw.get_quantile_cut()
    check_cut((max_bin + 1) * n_features, indptr, data, dtypes)
    # - ext mem
    n_batches = 3
    n_samples_per_batch = 256
    it = tm.IteratorForTest(
        *tm.make_batches(n_samples_per_batch, n_features, n_batches, use_cupy),
        cache="cache",
    )
    Xy: xgb.DMatrix = xgb.DMatrix(it)
    # Train and query the external-memory DMatrix itself.  (Previously this
    # section trained and queried `Xyw` again, so the external-memory path
    # was never exercised.)
    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy)
    indptr, data = Xy.get_quantile_cut()
    check_cut((max_bin + 1) * n_features, indptr, data, dtypes)

    # categorical
    n_categories = 32
    X, y = tm.make_categorical(n_samples, n_features, n_categories, False, sparsity=0.8)
    if use_cupy:
        import cudf  # pylint: disable=import-error
        import cupy as cp  # pylint: disable=import-error

        X = cudf.from_pandas(X)
        y = cp.array(y)
    # - qdm
    Xy = xgb.QuantileDMatrix(X, y, max_bin=max_bin, enable_categorical=True)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_categories * n_features, indptr, data, X.dtypes)
    # - dm
    Xy = xgb.DMatrix(X, y, enable_categorical=True)
    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_categories * n_features, indptr, data, X.dtypes)

    # mixed
    X, y = tm.make_categorical(
        n_samples, n_features, n_categories, False, sparsity=0.8, cat_ratio=0.5
    )
    n_cat_features = len(
        [0 for dtype in X.dtypes if isinstance(dtype, pd.CategoricalDtype)]
    )
    n_num_features = n_features - n_cat_features
    n_entries = n_categories * n_cat_features + (max_bin + 1) * n_num_features
    # - qdm
    Xy = xgb.QuantileDMatrix(X, y, max_bin=max_bin, enable_categorical=True)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_entries, indptr, data, X.dtypes)
    # - dm
    Xy = xgb.DMatrix(X, y, enable_categorical=True)
    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy)
    indptr, data = Xy.get_quantile_cut()
    check_cut(n_entries, indptr, data, X.dtypes)
|
||||
|
||||
|
||||
def check_get_quantile_cut(tree_method: str) -> None:
    """Check the quantile cut getter."""
    # Always exercise the host (numpy) path; add the cupy path for GPU hist.
    check_get_quantile_cut_device(tree_method, False)
    if tree_method == "gpu_hist":
        check_get_quantile_cut_device(tree_method, True)
|
||||
|
||||
Reference in New Issue
Block a user