[EM] Allow staging ellpack on host for GPU external memory. (#10488)

- New parameter `on_host`.
- Abstract format creation and stream creation into policy classes.
This commit is contained in:
Jiaming Yuan
2024-06-28 04:42:18 +08:00
committed by GitHub
parent 824fba783e
commit e8a962575a
36 changed files with 842 additions and 317 deletions

View File

@@ -503,18 +503,29 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
----------
cache_prefix :
Prefix to the cache files, only used in external memory.
release_data :
Whether the iterator should release the data during iteration. Set it to True if
the data transformation (converting data to np.float32 type) is memory
intensive. Otherwise, if the transformation is computation intensive then we can
keep the cache.
on_host :
Whether the data should be cached in host memory instead of on the hard drive when
using GPU with external memory. If set to true, then the "external memory" would
simply be CPU (host) memory. This is still a work in progress and is not yet
ready for testing.
"""
def __init__(
self, cache_prefix: Optional[str] = None, release_data: bool = True
self,
cache_prefix: Optional[str] = None,
release_data: bool = True,
on_host: bool = False,
) -> None:
self.cache_prefix = cache_prefix
self.on_host = on_host
self._handle = _ProxyDMatrix()
self._exception: Optional[Exception] = None
@@ -905,12 +916,12 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
def _init_from_iter(self, iterator: DataIter, enable_categorical: bool) -> None:
it = iterator
args = {
"missing": self.missing,
"nthread": self.nthread,
"cache_prefix": it.cache_prefix if it.cache_prefix else "",
}
args_cstr = from_pystr_to_cstr(json.dumps(args))
args = make_jcargs(
missing=self.missing,
nthread=self.nthread,
cache_prefix=it.cache_prefix if it.cache_prefix else "",
on_host=it.on_host,
)
handle = ctypes.c_void_p()
reset_callback, next_callback = it.get_callbacks(enable_categorical)
ret = _LIB.XGDMatrixCreateFromCallback(
@@ -918,7 +929,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
it.proxy.handle,
reset_callback,
next_callback,
args_cstr,
args,
ctypes.byref(handle),
)
it.reraise()

View File

@@ -198,19 +198,20 @@ def skip_win() -> PytestSkip:
class IteratorForTest(xgb.core.DataIter):
"""Iterator for testing streaming DMatrix. (external memory, quantile)"""
def __init__(
def __init__( # pylint: disable=too-many-arguments
self,
X: Sequence,
y: Sequence,
w: Optional[Sequence],
cache: Optional[str],
on_host: bool = False,
) -> None:
assert len(X) == len(y)
self.X = X
self.y = y
self.w = w
self.it = 0
super().__init__(cache_prefix=cache)
super().__init__(cache_prefix=cache, on_host=on_host)
def next(self, input_data: Callable) -> int:
if self.it == len(self.X):
@@ -367,7 +368,11 @@ class TestDataset:
weight.append(w)
it = IteratorForTest(
predictor, response, weight if weight else None, cache="cache"
predictor,
response,
weight if weight else None,
cache="cache",
on_host=False,
)
return xgb.DMatrix(it)

View File

@@ -22,7 +22,7 @@ def run_mixed_sparsity(device: str) -> None:
X = [cp.array(batch) for batch in X]
it = tm.IteratorForTest(X, y, None, None)
it = tm.IteratorForTest(X, y, None, None, on_host=False)
Xy_0 = xgboost.QuantileDMatrix(it)
X_1, y_1 = tm.make_sparse_regression(256, 16, 0.1, True)

View File

@@ -207,6 +207,7 @@ def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None:
it = tm.IteratorForTest(
*tm.make_batches(n_samples_per_batch, n_features, n_batches, use_cupy),
cache="cache",
on_host=False,
)
Xy: xgb.DMatrix = xgb.DMatrix(it)
xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xyw)