[EM] Python wrapper for the ExtMemQuantileDMatrix. (#10762)

Not exposed in the documentation yet.

- Add C API.
- Add Python API.
- Basic CPU tests.
This commit is contained in:
Jiaming Yuan
2024-08-29 04:08:25 +08:00
committed by GitHub
parent 7510a87466
commit 34937fea41
7 changed files with 208 additions and 27 deletions

View File

@@ -5,7 +5,15 @@ Contributors: https://github.com/dmlc/xgboost/blob/master/CONTRIBUTORS.md
from . import tracker # noqa
from . import collective, dask
from .core import Booster, DataIter, DMatrix, QuantileDMatrix, _py_version, build_info
from .core import (
Booster,
DataIter,
DMatrix,
ExtMemQuantileDMatrix,
QuantileDMatrix,
_py_version,
build_info,
)
from .tracker import RabitTracker # noqa
from .training import cv, train
@@ -31,6 +39,7 @@ __all__ = [
# core
"DMatrix",
"QuantileDMatrix",
"ExtMemQuantileDMatrix",
"Booster",
"DataIter",
"train",

View File

@@ -526,8 +526,13 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
on_host :
Whether the data should be cached on host memory instead of harddrive when using
GPU with external memory. If set to true, then the "external memory" would
simply be CPU (host) memory. This is still working in progress, not ready for
test yet.
simply be CPU (host) memory.
.. versionadded:: 3.0.0
.. warning::
This is still a work in progress and is not ready for testing yet.
"""
@@ -927,8 +932,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
if feature_types is not None:
self.feature_types = feature_types
def _init_from_iter(self, iterator: DataIter, enable_categorical: bool) -> None:
it = iterator
def _init_from_iter(self, it: DataIter, enable_categorical: bool) -> None:
args = make_jcargs(
missing=self.missing,
nthread=self.nthread,
@@ -1673,6 +1677,63 @@ class QuantileDMatrix(DMatrix):
self.handle = handle
class ExtMemQuantileDMatrix(DMatrix):
    """The external memory version of the :py:class:`QuantileDMatrix`.

    .. warning::

        This is still a work in progress and is not ready for testing yet.

    .. versionadded:: 3.0.0

    Parameters
    ----------
    data :
        A :py:class:`DataIter` that supplies the data in batches. Its
        ``cache_prefix`` and ``on_host`` attributes are forwarded to the native
        library as part of the configuration.
    missing :
        Value that represents missing entries. ``np.nan`` is used when this is
        not specified.
    nthread :
        Number of threads forwarded to the native library. ``-1`` is used when
        this is not specified.
    max_bin :
        Number of histogram bins. When ``None``, the key is left out of the
        configuration and the choice is left to the native library.
    ref :
        An optional :py:class:`DMatrix` whose handle is passed to the native
        library as the reference.
    enable_categorical :
        Forwarded to :py:meth:`DataIter.get_callbacks`.

    """

    @_deprecate_positional_args
    def __init__(  # pylint: disable=super-init-not-called
        self,
        data: DataIter,
        missing: Optional[float] = None,
        nthread: Optional[int] = None,
        max_bin: Optional[int] = None,
        ref: Optional[DMatrix] = None,
        enable_categorical: bool = False,
    ) -> None:
        self.max_bin = max_bin
        self.missing = missing if missing is not None else np.nan
        self.nthread = nthread if nthread is not None else -1

        self._init(data, ref, enable_categorical)
        assert self.handle is not None

    def _init(
        self, it: DataIter, ref: Optional[DMatrix], enable_categorical: bool
    ) -> None:
        """Create the native handle from the callbacks of the data iterator.

        Parameters
        ----------
        it :
            The iterator supplying the data; provides the proxy handle and the
            reset/next callbacks.
        ref :
            Optional reference matrix; ``None`` is passed through as-is.
        enable_categorical :
            Forwarded to :py:meth:`DataIter.get_callbacks`.

        """
        config = dict(
            missing=self.missing,
            nthread=self.nthread,
            cache_prefix=it.cache_prefix if it.cache_prefix else "",
            on_host=it.on_host,
        )
        # Forward the user-specified number of bins. It used to be stored on the
        # instance but never reached the native constructor. The key is omitted
        # when unset so that the default configuration is unchanged.
        if self.max_bin is not None:
            config["max_bin"] = self.max_bin
        args = make_jcargs(**config)

        handle = ctypes.c_void_p()
        reset_callback, next_callback = it.get_callbacks(enable_categorical)
        # We don't need the iter handle (hence None) in Python as reset,next callbacks
        # are member functions, and ctypes can handle the `self` parameter
        # automatically.
        ret = _LIB.XGExtMemQuantileDMatrixCreateFromCallback(
            None,  # iter
            it.proxy.handle,  # proxy
            ref.handle if ref is not None else ref,  # ref
            reset_callback,  # reset
            next_callback,  # next
            args,  # config
            ctypes.byref(handle),  # out
        )
        # Re-raise any exception captured by the iterator first; the return code
        # is checked only afterwards so the more specific error wins.
        it.reraise()
        _check_call(ret)
        self.handle = handle
# Signature of a callable that takes a NumPy array and a DMatrix and returns a pair
# of NumPy arrays.
# NOTE(review): presumably a custom objective returning (gradient, hessian) -- confirm.
Objective = Callable[[np.ndarray, DMatrix], Tuple[np.ndarray, np.ndarray]]
# Signature of a callable that takes a NumPy array and a DMatrix and returns a
# ``(str, float)`` pair.
# NOTE(review): presumably a custom metric returning (name, value) -- confirm.
Metric = Callable[[np.ndarray, DMatrix], Tuple[str, float]]

View File

@@ -5,6 +5,7 @@ from functools import partial, update_wrapper
from typing import Any, Dict, List
import numpy as np
import pytest
import xgboost as xgb
import xgboost.testing as tm
@@ -194,6 +195,43 @@ def check_quantile_loss_extmem(
np.testing.assert_allclose(predt, predt_it)
def check_extmem_qdm(
    n_samples_per_batch: int,
    n_features: int,
    n_batches: int,
    device: str,
    on_host: bool,
) -> None:
    """Basic test for the `ExtMemQuantileDMatrix`.

    Trains one booster on an `ExtMemQuantileDMatrix` built from a batch iterator
    and another on a `QuantileDMatrix` built from the same data as arrays, then
    checks that the quantile cuts and the predictions agree.  It also checks that
    requesting the `approx` tree method raises a `ValueError`.

    """
    n_rounds = 8
    batches = tm.make_batches(
        n_samples_per_batch, n_features, n_batches, use_cupy=device != "cpu"
    )
    it = tm.IteratorForTest(*batches, cache="cache", on_host=on_host)
    ext_dmat = xgb.ExtMemQuantileDMatrix(it)

    # Training with anything other than `hist` must be rejected.
    with pytest.raises(ValueError, match="Only the `hist`"):
        xgb.train(
            {"device": device, "tree_method": "approx"},
            ext_dmat,
            num_boost_round=n_rounds,
        )

    ext_booster = xgb.train({"device": device}, ext_dmat, num_boost_round=n_rounds)

    # Baseline: the same data held in memory as a single matrix.
    X, y, w = it.as_arrays()
    mem_dmat = xgb.QuantileDMatrix(X, y, weight=w)
    mem_booster = xgb.train({"device": device}, mem_dmat, num_boost_round=n_rounds)

    ext_cut = ext_dmat.get_quantile_cut()
    mem_cut = mem_dmat.get_quantile_cut()
    for idx in (0, 1):
        np.testing.assert_allclose(ext_cut[idx], mem_cut[idx])

    ext_predt = ext_booster.predict(ext_dmat)
    mem_predt = mem_booster.predict(mem_dmat)
    np.testing.assert_allclose(ext_predt, mem_predt)
def check_cut(
n_entries: int, indptr: np.ndarray, data: np.ndarray, dtypes: Any
) -> None: