[dask] Update dask demo for using the new dask backend. (#10347)

2024-05-31 08:03:20 +08:00 · 2024-05-31 08:03:20 +08:00 · c2e3d4f3cd
commit c2e3d4f3cd
parent e6eefea5e2
2 changed files with 29 additions and 23 deletions
--- a/demo/dask/gpu_training.py
+++ b/demo/dask/gpu_training.py
@@ -3,7 +3,7 @@ Example of training with Dask on GPU
 ====================================
 """

-import cupy as cp
+import dask
 import dask_cudf
 from dask import array as da
 from dask import dataframe as dd
@@ -24,12 +24,8 @@ def using_dask_matrix(client: Client, X: da.Array, y: da.Array) -> da.Array:
    # history obtained from evaluation metrics.
    output = dxgb.train(
        client,
-        {
-            "verbosity": 2,
-            "tree_method": "hist",
-            # Golden line for GPU training
-            "device": "cuda",
-        },
+        # Make sure the device is set to CUDA.
+        {"tree_method": "hist", "device": "cuda"},
        dtrain,
        num_boost_round=4,
        evals=[(dtrain, "train")],
@@ -50,18 +46,17 @@ def using_quantile_device_dmatrix(client: Client, X: da.Array, y: da.Array) -> d
    .. versionadded:: 1.2.0

    """
-    X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X))
-    y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y))
-
    # `DaskQuantileDMatrix` is used instead of `DaskDMatrix`, be careful that it can not
    # be used for anything else other than training unless a reference is specified. See
    # the `ref` argument of `DaskQuantileDMatrix`.
    dtrain = dxgb.DaskQuantileDMatrix(client, X, y)
    output = dxgb.train(
        client,
-        {"verbosity": 2, "tree_method": "hist", "device": "cuda"},
+        # Make sure the device is set to CUDA.
+        {"tree_method": "hist", "device": "cuda"},
        dtrain,
        num_boost_round=4,
+        evals=[(dtrain, "train")],
    )

    prediction = dxgb.predict(client, output, X)
@@ -72,15 +67,23 @@ if __name__ == "__main__":
    # `LocalCUDACluster` is used for assigning GPU to XGBoost processes.  Here
    # `n_workers` represents the number of GPUs since we use one GPU per worker process.
    with LocalCUDACluster(n_workers=2, threads_per_worker=4) as cluster:
-        with Client(cluster) as client:
-            # generate some random data for demonstration
+        # Create client from cluster, set the backend to GPU array (cupy).
+        with Client(cluster) as client, dask.config.set({"array.backend": "cupy"}):
+            # Generate some random data for demonstration
            rng = da.random.default_rng(1)

-            m = 100000
+            m = 2**18
            n = 100
-            X = rng.normal(size=(m, n))
+            X = rng.uniform(size=(m, n), chunks=(128**2, -1))
            y = X.sum(axis=1)

+            X = dd.from_dask_array(X)
+            y = dd.from_dask_array(y)
+            # XGBoost can take arrays. This is to show that DataFrame uses the GPU
+            # backend as well.
+            assert isinstance(X, dask_cudf.DataFrame)
+            assert isinstance(y, dask_cudf.Series)
+
            print("Using DaskQuantileDMatrix")
            from_ddqdm = using_quantile_device_dmatrix(client, X, y)
            print("Using DMatrix")
--- a/demo/dask/sklearn_gpu_training.py
+++ b/demo/dask/sklearn_gpu_training.py
@@ -3,6 +3,7 @@ Use scikit-learn regressor interface with GPU histogram tree method
 ===================================================================
 """

+import dask
 from dask import array as da
 from dask.distributed import Client

@@ -13,17 +14,18 @@ from xgboost import dask as dxgb


 def main(client: Client) -> dxgb.Booster:
-    # generate some random data for demonstration
+    # Generate some random data for demonstration
+    rng = da.random.default_rng(1)
+
+    m = 2**18
    n = 100
-    m = 1000000
-    partition_size = 10000
-    X = da.random.random((m, n), partition_size)
-    y = da.random.random(m, partition_size)
+    X = rng.uniform(size=(m, n), chunks=(128**2, -1))
+    y = X.sum(axis=1)

    regressor = dxgb.DaskXGBRegressor(verbosity=1)
-    # set the device to CUDA
+    # Set the device to CUDA
    regressor.set_params(tree_method="hist", device="cuda")
-    # assigning client here is optional
+    # Assigning client here is optional
    regressor.client = client

    regressor.fit(X, y, eval_set=[(X, y)])
@@ -42,5 +44,6 @@ if __name__ == "__main__":
    # With dask cuda, one can scale up XGBoost to arbitrary GPU clusters.
    # `LocalCUDACluster` used here is only for demonstration purpose.
    with LocalCUDACluster() as cluster:
-        with Client(cluster) as client:
+        # Create client from cluster, set the backend to GPU array (cupy).
+        with Client(cluster) as client, dask.config.set({"array.backend": "cupy"}):
            main(client)