From c2e3d4f3cd6ab26dc4ee271c4ca9be3bf1a2c037 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Fri, 31 May 2024 08:03:20 +0800
Subject: [PATCH] [dask] Update dask demo for using the new dask backend. (#10347)

---
 demo/dask/gpu_training.py         | 33 ++++++++++++++++++---------------
 demo/dask/sklearn_gpu_training.py | 19 +++++++++++--------
 2 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/demo/dask/gpu_training.py b/demo/dask/gpu_training.py
index f53835ffb..d964d78e2 100644
--- a/demo/dask/gpu_training.py
+++ b/demo/dask/gpu_training.py
@@ -3,7 +3,7 @@ Example of training with Dask on GPU
 ====================================
 
 """
-import cupy as cp
+import dask
 import dask_cudf
 from dask import array as da
 from dask import dataframe as dd
@@ -24,12 +24,8 @@ def using_dask_matrix(client: Client, X: da.Array, y: da.Array) -> da.Array:
     # history obtained from evaluation metrics.
     output = dxgb.train(
         client,
-        {
-            "verbosity": 2,
-            "tree_method": "hist",
-            # Golden line for GPU training
-            "device": "cuda",
-        },
+        # Make sure the device is set to CUDA.
+        {"tree_method": "hist", "device": "cuda"},
         dtrain,
         num_boost_round=4,
         evals=[(dtrain, "train")],
@@ -50,18 +46,17 @@ def using_quantile_device_dmatrix(client: Client, X: da.Array, y: da.Array) -> d
     .. versionadded:: 1.2.0
 
     """
-    X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X))
-    y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y))
-
     # `DaskQuantileDMatrix` is used instead of `DaskDMatrix`, be careful that it can not
     # be used for anything else other than training unless a reference is specified. See
     # the `ref` argument of `DaskQuantileDMatrix`.
     dtrain = dxgb.DaskQuantileDMatrix(client, X, y)
     output = dxgb.train(
         client,
-        {"verbosity": 2, "tree_method": "hist", "device": "cuda"},
+        # Make sure the device is set to CUDA.
+        {"tree_method": "hist", "device": "cuda"},
         dtrain,
         num_boost_round=4,
+        evals=[(dtrain, "train")],
     )
 
     prediction = dxgb.predict(client, output, X)
@@ -72,15 +67,23 @@ if __name__ == "__main__":
     # `LocalCUDACluster` is used for assigning GPU to XGBoost processes. Here
     # `n_workers` represents the number of GPUs since we use one GPU per worker process.
     with LocalCUDACluster(n_workers=2, threads_per_worker=4) as cluster:
-        with Client(cluster) as client:
-            # generate some random data for demonstration
+        # Create client from cluster, set the backend to GPU array (cupy).
+        with Client(cluster) as client, dask.config.set({"array.backend": "cupy"}):
+            # Generate some random data for demonstration
             rng = da.random.default_rng(1)
 
-            m = 100000
+            m = 2**18
             n = 100
-            X = rng.normal(size=(m, n))
+            X = rng.uniform(size=(m, n), chunks=(128**2, -1))
             y = X.sum(axis=1)
 
+            X = dd.from_dask_array(X)
+            y = dd.from_dask_array(y)
+            # XGBoost can take arrays. This is to show that DataFrame uses the GPU
+            # backend as well.
+            assert isinstance(X, dask_cudf.DataFrame)
+            assert isinstance(y, dask_cudf.Series)
+
             print("Using DaskQuantileDMatrix")
             from_ddqdm = using_quantile_device_dmatrix(client, X, y)
             print("Using DMatrix")
diff --git a/demo/dask/sklearn_gpu_training.py b/demo/dask/sklearn_gpu_training.py
index 6161bf9a3..56f1be715 100644
--- a/demo/dask/sklearn_gpu_training.py
+++ b/demo/dask/sklearn_gpu_training.py
@@ -3,6 +3,7 @@ Use scikit-learn regressor interface with GPU histogram tree method
 ===================================================================
 
 """
+import dask
 from dask import array as da
 from dask.distributed import Client
 
@@ -13,17 +14,18 @@ from xgboost import dask as dxgb
 
 
 def main(client: Client) -> dxgb.Booster:
-    # generate some random data for demonstration
+    # Generate some random data for demonstration
+    rng = da.random.default_rng(1)
+
+    m = 2**18
     n = 100
-    m = 1000000
-    partition_size = 10000
-    X = da.random.random((m, n), partition_size)
-    y = da.random.random(m, partition_size)
+    X = rng.uniform(size=(m, n), chunks=(128**2, -1))
+    y = X.sum(axis=1)
 
     regressor = dxgb.DaskXGBRegressor(verbosity=1)
-    # set the device to CUDA
+    # Set the device to CUDA
     regressor.set_params(tree_method="hist", device="cuda")
-    # assigning client here is optional
+    # Assigning client here is optional
     regressor.client = client
 
     regressor.fit(X, y, eval_set=[(X, y)])
@@ -42,5 +44,6 @@ if __name__ == "__main__":
     # With dask cuda, one can scale up XGBoost to arbitrary GPU clusters.
     # `LocalCUDACluster` used here is only for demonstration purpose.
     with LocalCUDACluster() as cluster:
-        with Client(cluster) as client:
+        # Create client from cluster, set the backend to GPU array (cupy).
+        with Client(cluster) as client, dask.config.set({"array.backend": "cupy"}):
             main(client)
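
Note: the following is a minimal, standalone sketch of the workflow the updated demo exercises, i.e. routing dask's array backend to cupy and training through `DaskQuantileDMatrix` on CUDA. It is illustrative rather than part of the patch: it assumes a machine with at least one NVIDIA GPU and the dask-cuda, dask-cudf, cupy, and xgboost packages installed, and the data shape, chunk size, and number of boosting rounds are placeholder values.

import dask
import dask_cudf
from dask import array as da
from dask import dataframe as dd
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

from xgboost import dask as dxgb

if __name__ == "__main__":
    # One worker per GPU; a single-GPU machine is assumed here.
    with LocalCUDACluster(n_workers=1) as cluster:
        # Route dask array creation through cupy so the data lives on the GPU.
        with Client(cluster) as client, dask.config.set({"array.backend": "cupy"}):
            rng = da.random.default_rng(1)
            # Placeholder sizes, smaller than the demo's 2**18 rows.
            X = rng.uniform(size=(2**14, 16), chunks=(2**12, -1))
            y = X.sum(axis=1)
            # With the cupy backend, the conversion yields dask-cudf collections.
            X = dd.from_dask_array(X)
            y = dd.from_dask_array(y)
            assert isinstance(X, dask_cudf.DataFrame)
            assert isinstance(y, dask_cudf.Series)

            # DaskQuantileDMatrix lowers memory usage for hist-based training;
            # it is meant for training only unless the `ref` argument is supplied.
            dtrain = dxgb.DaskQuantileDMatrix(client, X, y)
            output = dxgb.train(
                client,
                {"tree_method": "hist", "device": "cuda"},
                dtrain,
                num_boost_round=4,
                evals=[(dtrain, "train")],
            )
            prediction = dxgb.predict(client, output, X)
            print(prediction.head())

The shapes, chunking, and n_workers=1 above are arbitrary choices for a quick local run; the demo itself uses two workers and 2**18 rows, and its `isinstance` assertions check the same dask-cudf conversion shown here.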