[EM] Python wrapper for the ExtMemQuantileDMatrix. (#10762)
Not exposed in the documentation yet. - Add the C API. - Add the Python API. - Add basic CPU tests.
This commit is contained in:
@@ -5,7 +5,15 @@ Contributors: https://github.com/dmlc/xgboost/blob/master/CONTRIBUTORS.md
|
||||
|
||||
from . import tracker # noqa
|
||||
from . import collective, dask
|
||||
from .core import Booster, DataIter, DMatrix, QuantileDMatrix, _py_version, build_info
|
||||
from .core import (
|
||||
Booster,
|
||||
DataIter,
|
||||
DMatrix,
|
||||
ExtMemQuantileDMatrix,
|
||||
QuantileDMatrix,
|
||||
_py_version,
|
||||
build_info,
|
||||
)
|
||||
from .tracker import RabitTracker # noqa
|
||||
from .training import cv, train
|
||||
|
||||
@@ -31,6 +39,7 @@ __all__ = [
|
||||
# core
|
||||
"DMatrix",
|
||||
"QuantileDMatrix",
|
||||
"ExtMemQuantileDMatrix",
|
||||
"Booster",
|
||||
"DataIter",
|
||||
"train",
|
||||
|
||||
@@ -526,8 +526,13 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
|
||||
on_host :
|
||||
Whether the data should be cached on host memory instead of harddrive when using
|
||||
GPU with external memory. If set to true, then the "external memory" would
|
||||
simply be CPU (host) memory. This is still working in progress, not ready for
|
||||
test yet.
|
||||
simply be CPU (host) memory.
|
||||
|
||||
.. versionadded:: 3.0.0
|
||||
|
||||
.. warning::
|
||||
|
||||
This is still a work in progress and not ready for testing yet.
|
||||
|
||||
"""
|
||||
|
||||
@@ -927,8 +932,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
|
||||
if feature_types is not None:
|
||||
self.feature_types = feature_types
|
||||
|
||||
def _init_from_iter(self, iterator: DataIter, enable_categorical: bool) -> None:
|
||||
it = iterator
|
||||
def _init_from_iter(self, it: DataIter, enable_categorical: bool) -> None:
|
||||
args = make_jcargs(
|
||||
missing=self.missing,
|
||||
nthread=self.nthread,
|
||||
@@ -1673,6 +1677,63 @@ class QuantileDMatrix(DMatrix):
|
||||
self.handle = handle
|
||||
|
||||
|
||||
class ExtMemQuantileDMatrix(DMatrix):
    """The external memory version of the :py:class:`QuantileDMatrix`.

    .. warning::

        This is still a work in progress and not ready for testing yet.

    .. versionadded:: 3.0.0

    Parameters
    ----------
    data :
        A :py:class:`DataIter` that supplies the batches. Its ``cache_prefix`` and
        ``on_host`` attributes are forwarded to the native constructor.
    missing :
        Value treated as missing. Defaults to ``np.nan`` when not specified.
    nthread :
        Number of threads. Defaults to ``-1`` when not specified.
    max_bin :
        Number of histogram bins. Forwarded to the native constructor only when
        specified; otherwise the choice is left to the native library.
    ref :
        Optional :py:class:`DMatrix` whose handle is passed as the reference
        matrix to the native constructor.
    enable_categorical :
        Passed to :py:meth:`DataIter.get_callbacks` when building the iteration
        callbacks.

    """

    @_deprecate_positional_args
    def __init__(  # pylint: disable=super-init-not-called
        self,
        data: DataIter,
        missing: Optional[float] = None,
        nthread: Optional[int] = None,
        max_bin: Optional[int] = None,
        ref: Optional[DMatrix] = None,
        enable_categorical: bool = False,
    ) -> None:
        self.max_bin = max_bin
        self.missing = missing if missing is not None else np.nan
        self.nthread = nthread if nthread is not None else -1

        self._init(data, ref, enable_categorical)
        # Internal invariant: a successful `_init` always assigns the handle.
        assert self.handle is not None

    def _init(
        self, it: DataIter, ref: Optional[DMatrix], enable_categorical: bool
    ) -> None:
        """Create the native matrix from the iterator and store its handle.

        Any exception captured by the iterator during the native call is re-raised
        before the native return code is checked.
        """
        config = {
            "missing": self.missing,
            "nthread": self.nthread,
            "cache_prefix": it.cache_prefix if it.cache_prefix else "",
            "on_host": it.on_host,
        }
        # Forward the user-specified number of bins. Previously `max_bin` was stored
        # on the instance but never reached the native constructor, so it had no
        # effect. The key is omitted when unset to keep the config unchanged.
        if self.max_bin is not None:
            config["max_bin"] = self.max_bin
        args = make_jcargs(**config)

        handle = ctypes.c_void_p()
        reset_callback, next_callback = it.get_callbacks(enable_categorical)
        # We don't need the iter handle (hence None) in Python as reset,next callbacks
        # are member functions, and ctypes can handle the `self` parameter
        # automatically.
        ret = _LIB.XGExtMemQuantileDMatrixCreateFromCallback(
            None,  # iter
            it.proxy.handle,  # proxy
            ref.handle if ref is not None else ref,  # ref
            reset_callback,  # reset
            next_callback,  # next
            args,  # config
            ctypes.byref(handle),  # out
        )
        it.reraise()
        # delay check_call to throw intermediate exception first
        _check_call(ret)
        self.handle = handle
|
||||
|
||||
|
||||
Objective = Callable[[np.ndarray, DMatrix], Tuple[np.ndarray, np.ndarray]]
|
||||
Metric = Callable[[np.ndarray, DMatrix], Tuple[str, float]]
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ from functools import partial, update_wrapper
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
import xgboost.testing as tm
|
||||
@@ -194,6 +195,43 @@ def check_quantile_loss_extmem(
|
||||
np.testing.assert_allclose(predt, predt_it)
|
||||
|
||||
|
||||
def check_extmem_qdm(
    n_samples_per_batch: int,
    n_features: int,
    n_batches: int,
    device: str,
    on_host: bool,
) -> None:
    """Basic test for the `ExtMemQuantileDMatrix`.

    Trains one booster on an `ExtMemQuantileDMatrix` built from an iterator and
    another on a `QuantileDMatrix` built from the same data as arrays, then checks
    that the quantile cuts and the predictions agree.
    """
    batches = tm.make_batches(
        n_samples_per_batch, n_features, n_batches, use_cupy=device != "cpu"
    )
    it = tm.IteratorForTest(*batches, cache="cache", on_host=on_host)
    Xy_it = xgb.ExtMemQuantileDMatrix(it)

    # Training with a tree method other than `hist` is expected to be rejected.
    with pytest.raises(ValueError, match="Only the `hist`"):
        xgb.train(
            {"device": device, "tree_method": "approx"}, Xy_it, num_boost_round=8
        )

    # Train on the external-memory matrix first, then on the in-core equivalent.
    booster_it = xgb.train({"device": device}, Xy_it, num_boost_round=8)
    X, y, w = it.as_arrays()
    Xy = xgb.QuantileDMatrix(X, y, weight=w)
    booster = xgb.train({"device": device}, Xy, num_boost_round=8)

    # Both matrices must produce the same quantile cuts.
    cut_it = Xy_it.get_quantile_cut()
    cut = Xy.get_quantile_cut()
    for part in (0, 1):
        np.testing.assert_allclose(cut_it[part], cut[part])

    # And the two boosters must predict the same values.
    predt_it = booster_it.predict(Xy_it)
    predt = booster.predict(Xy)
    np.testing.assert_allclose(predt_it, predt)
|
||||
|
||||
|
||||
def check_cut(
|
||||
n_entries: int, indptr: np.ndarray, data: np.ndarray, dtypes: Any
|
||||
) -> None:
|
||||
|
||||
Reference in New Issue
Block a user