[dask] Update dask demo for using the new dask backend. (#10347)

This commit is contained in:
Jiaming Yuan 2024-05-31 08:03:20 +08:00 committed by GitHub
parent e6eefea5e2
commit c2e3d4f3cd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 29 additions and 23 deletions

View File

@ -3,7 +3,7 @@ Example of training with Dask on GPU
====================================
"""
import cupy as cp
import dask
import dask_cudf
from dask import array as da
from dask import dataframe as dd
@ -24,12 +24,8 @@ def using_dask_matrix(client: Client, X: da.Array, y: da.Array) -> da.Array:
# history obtained from evaluation metrics.
output = dxgb.train(
client,
{
"verbosity": 2,
"tree_method": "hist",
# Golden line for GPU training
"device": "cuda",
},
# Make sure the device is set to CUDA.
{"tree_method": "hist", "device": "cuda"},
dtrain,
num_boost_round=4,
evals=[(dtrain, "train")],
@ -50,18 +46,17 @@ def using_quantile_device_dmatrix(client: Client, X: da.Array, y: da.Array) -> d
.. versionadded:: 1.2.0
"""
X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X))
y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y))
# `DaskQuantileDMatrix` is used instead of `DaskDMatrix`. Be careful: it cannot be
# used for anything other than training unless a reference is specified. See the
# `ref` argument of `DaskQuantileDMatrix`.
dtrain = dxgb.DaskQuantileDMatrix(client, X, y)
output = dxgb.train(
client,
{"verbosity": 2, "tree_method": "hist", "device": "cuda"},
# Make sure the device is set to CUDA.
{"tree_method": "hist", "device": "cuda"},
dtrain,
num_boost_round=4,
evals=[(dtrain, "train")],
)
prediction = dxgb.predict(client, output, X)
@ -72,15 +67,23 @@ if __name__ == "__main__":
# `LocalCUDACluster` is used for assigning GPUs to XGBoost processes. Here
# `n_workers` represents the number of GPUs since we use one GPU per worker process.
with LocalCUDACluster(n_workers=2, threads_per_worker=4) as cluster:
with Client(cluster) as client:
# generate some random data for demonstration
# Create client from cluster, set the backend to GPU array (cupy).
with Client(cluster) as client, dask.config.set({"array.backend": "cupy"}):
# Generate some random data for demonstration
rng = da.random.default_rng(1)
m = 100000
m = 2**18
n = 100
X = rng.normal(size=(m, n))
X = rng.uniform(size=(m, n), chunks=(128**2, -1))
y = X.sum(axis=1)
X = dd.from_dask_array(X)
y = dd.from_dask_array(y)
# XGBoost can take arrays. This is to show that DataFrame uses the GPU
# backend as well.
assert isinstance(X, dask_cudf.DataFrame)
assert isinstance(y, dask_cudf.Series)
print("Using DaskQuantileDMatrix")
from_ddqdm = using_quantile_device_dmatrix(client, X, y)
print("Using DMatrix")

View File

@ -3,6 +3,7 @@ Use scikit-learn regressor interface with GPU histogram tree method
===================================================================
"""
import dask
from dask import array as da
from dask.distributed import Client
@ -13,17 +14,18 @@ from xgboost import dask as dxgb
def main(client: Client) -> dxgb.Booster:
# generate some random data for demonstration
# Generate some random data for demonstration
rng = da.random.default_rng(1)
m = 2**18
n = 100
m = 1000000
partition_size = 10000
X = da.random.random((m, n), partition_size)
y = da.random.random(m, partition_size)
X = rng.uniform(size=(m, n), chunks=(128**2, -1))
y = X.sum(axis=1)
regressor = dxgb.DaskXGBRegressor(verbosity=1)
# set the device to CUDA
# Set the device to CUDA
regressor.set_params(tree_method="hist", device="cuda")
# assigning client here is optional
# Assigning client here is optional
regressor.client = client
regressor.fit(X, y, eval_set=[(X, y)])
@ -42,5 +44,6 @@ if __name__ == "__main__":
# With dask cuda, one can scale up XGBoost to arbitrary GPU clusters.
# `LocalCUDACluster` is used here only for demonstration purposes.
with LocalCUDACluster() as cluster:
with Client(cluster) as client:
# Create client from cluster, set the backend to GPU array (cupy).
with Client(cluster) as client, dask.config.set({"array.backend": "cupy"}):
main(client)