xgboost/demo/dask/sklearn_gpu_training.py

"""
Use scikit-learn regressor interface with GPU histogram tree method
===================================================================
"""

import dask
from dask import array as da
from dask.distributed import Client

# It's recommended to use dask_cuda for GPU assignment
from dask_cuda import LocalCUDACluster

from xgboost import dask as dxgb


def main(client: Client) -> dxgb.Booster:
    # Generate some random data for demonstration
    rng = da.random.default_rng(1)

    m = 2**18
    n = 100
    X = rng.uniform(size=(m, n), chunks=(128**2, -1))
    y = X.sum(axis=1)

    regressor = dxgb.DaskXGBRegressor(verbosity=1)
    # Set the device to CUDA
    regressor.set_params(tree_method="hist", device="cuda")
    # Assigning client here is optional
    regressor.client = client

    regressor.fit(X, y, eval_set=[(X, y)])
    prediction = regressor.predict(X)

    bst = regressor.get_booster()
    history = regressor.evals_result()

    print("Evaluation history:", history)
    # returned prediction is always a dask array.
    assert isinstance(prediction, da.Array)
    return bst  # returning the trained model


if __name__ == "__main__":
    # With dask cuda, one can scale up XGBoost to arbitrary GPU clusters.
    # `LocalCUDACluster` used here is only for demonstration purpose.
    with LocalCUDACluster() as cluster:
        # Create client from cluster, set the backend to GPU array (cupy).
        with Client(cluster) as client, dask.config.set({"array.backend": "cupy"}):
            main(client)