[EM] Allow staging ellpack on host for GPU external memory. (#10488)
- New parameter `on_host`. - Abstract format creation and stream creation into policy classes.
This commit is contained in:
@@ -503,18 +503,29 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
|
||||
----------
|
||||
cache_prefix :
|
||||
Prefix to the cache files, only used in external memory.
|
||||
|
||||
release_data :
|
||||
Whether the iterator should release the data during iteration. Set it to True if
|
||||
the data transformation (converting data to np.float32 type) is memory
|
||||
intensive. Otherwise, if the transformation is computation intensive then we can
|
||||
keep the cache.
|
||||
|
||||
on_host :
|
||||
Whether the data should be cached on host memory instead of hard drive when using
|
||||
GPU with external memory. If set to true, then the "external memory" would
|
||||
simply be CPU (host) memory. This is still a work in progress and not ready for
|
||||
testing yet.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, cache_prefix: Optional[str] = None, release_data: bool = True
|
||||
self,
|
||||
cache_prefix: Optional[str] = None,
|
||||
release_data: bool = True,
|
||||
on_host: bool = False,
|
||||
) -> None:
|
||||
self.cache_prefix = cache_prefix
|
||||
self.on_host = on_host
|
||||
|
||||
self._handle = _ProxyDMatrix()
|
||||
self._exception: Optional[Exception] = None
|
||||
@@ -905,12 +916,12 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
|
||||
|
||||
def _init_from_iter(self, iterator: DataIter, enable_categorical: bool) -> None:
|
||||
it = iterator
|
||||
args = {
|
||||
"missing": self.missing,
|
||||
"nthread": self.nthread,
|
||||
"cache_prefix": it.cache_prefix if it.cache_prefix else "",
|
||||
}
|
||||
args_cstr = from_pystr_to_cstr(json.dumps(args))
|
||||
args = make_jcargs(
|
||||
missing=self.missing,
|
||||
nthread=self.nthread,
|
||||
cache_prefix=it.cache_prefix if it.cache_prefix else "",
|
||||
on_host=it.on_host,
|
||||
)
|
||||
handle = ctypes.c_void_p()
|
||||
reset_callback, next_callback = it.get_callbacks(enable_categorical)
|
||||
ret = _LIB.XGDMatrixCreateFromCallback(
|
||||
@@ -918,7 +929,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
|
||||
it.proxy.handle,
|
||||
reset_callback,
|
||||
next_callback,
|
||||
args_cstr,
|
||||
args,
|
||||
ctypes.byref(handle),
|
||||
)
|
||||
it.reraise()
|
||||
|
||||
@@ -198,19 +198,20 @@ def skip_win() -> PytestSkip:
|
||||
class IteratorForTest(xgb.core.DataIter):
|
||||
"""Iterator for testing streaming DMatrix. (external memory, quantile)"""
|
||||
|
||||
def __init__(
|
||||
def __init__( # pylint: disable=too-many-arguments
|
||||
self,
|
||||
X: Sequence,
|
||||
y: Sequence,
|
||||
w: Optional[Sequence],
|
||||
cache: Optional[str],
|
||||
on_host: bool = False,
|
||||
) -> None:
|
||||
assert len(X) == len(y)
|
||||
self.X = X
|
||||
self.y = y
|
||||
self.w = w
|
||||
self.it = 0
|
||||
super().__init__(cache_prefix=cache)
|
||||
super().__init__(cache_prefix=cache, on_host=on_host)
|
||||
|
||||
def next(self, input_data: Callable) -> int:
|
||||
if self.it == len(self.X):
|
||||
@@ -367,7 +368,11 @@ class TestDataset:
|
||||
weight.append(w)
|
||||
|
||||
it = IteratorForTest(
|
||||
predictor, response, weight if weight else None, cache="cache"
|
||||
predictor,
|
||||
response,
|
||||
weight if weight else None,
|
||||
cache="cache",
|
||||
on_host=False,
|
||||
)
|
||||
return xgb.DMatrix(it)
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ def run_mixed_sparsity(device: str) -> None:
|
||||
|
||||
X = [cp.array(batch) for batch in X]
|
||||
|
||||
it = tm.IteratorForTest(X, y, None, None)
|
||||
it = tm.IteratorForTest(X, y, None, None, on_host=False)
|
||||
Xy_0 = xgboost.QuantileDMatrix(it)
|
||||
|
||||
X_1, y_1 = tm.make_sparse_regression(256, 16, 0.1, True)
|
||||
|
||||
@@ -207,6 +207,7 @@ def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None:
|
||||
it = tm.IteratorForTest(
|
||||
*tm.make_batches(n_samples_per_batch, n_features, n_batches, use_cupy),
|
||||
cache="cache",
|
||||
on_host=False,
|
||||
)
|
||||
Xy: xgb.DMatrix = xgb.DMatrix(it)
|
||||
xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xyw)
|
||||
|
||||
Reference in New Issue
Block a user