rocm enable for v2.0.1

2023-10-27 18:50:28 -07:00
parent 2e7e9d3b2d a408254c2f
commit 782b73f2bb
447 changed files with 13518 additions and 8719 deletions
--- a/demo/README.md
+++ b/demo/README.md
@@ -106,7 +106,7 @@ Please send pull requests if you find ones that are missing here.
 - Prarthana Bhat, 2nd place winner in [DYD Competition](https://datahack.analyticsvidhya.com/contest/date-your-data/). Link to [Solution](https://github.com/analyticsvidhya/DateYourData/blob/master/Prathna_Bhat_Model.R).

 ## Talks
- [XGBoost: A Scalable Tree Boosting System](http://datascience.la/xgboost-workshop-and-meetup-talk-with-tianqi-chen/) (video+slides) by Tianqi Chen at the Los Angeles Data Science meetup
+- XGBoost: A Scalable Tree Boosting System ([video](https://www.youtube.com/watch?v=Vly8xGnNiWs) + [slides](https://speakerdeck.com/datasciencela/tianqi-chen-xgboost-overview-and-latest-news-la-meetup-talk)) by Tianqi Chen at the Los Angeles Data Science meetup

 ## Tutorials

@@ -145,7 +145,7 @@ Send a PR to add a one sentence description:)
 ## Tools using XGBoost

 - [BayesBoost](https://github.com/mpearmain/BayesBoost) - Bayesian Optimization using xgboost and sklearn API
- [FLAML](https://github.com/microsoft/FLAML) - An open source AutoML library 
+- [FLAML](https://github.com/microsoft/FLAML) - An open source AutoML library
 designed to automatically produce accurate machine learning models with low computational cost. FLAML includes [XGBoost as one of the default learners](https://github.com/microsoft/FLAML/blob/main/flaml/model.py) and can also be used as a fast hyperparameter tuning tool for XGBoost ([code example](https://microsoft.github.io/FLAML/docs/Examples/AutoML-for-XGBoost)).
 - [gp_xgboost_gridsearch](https://github.com/vatsan/gp_xgboost_gridsearch) - In-database parallel grid-search for XGBoost on [Greenplum](https://github.com/greenplum-db/gpdb) using PL/Python
 - [tpot](https://github.com/rhiever/tpot) - A Python tool that automatically creates and optimizes machine learning pipelines using genetic programming.
--- a/demo/aft_survival/aft_survival_viz_demo.py
+++ b/demo/aft_survival/aft_survival_viz_demo.py
@@ -11,33 +11,43 @@ import numpy as np

 import xgboost as xgb

-plt.rcParams.update({'font.size': 13})
+plt.rcParams.update({"font.size": 13})
+

 # Function to visualize censored labels
-def plot_censored_labels(X, y_lower, y_upper):
-    def replace_inf(x, target_value):
+def plot_censored_labels(
+    X: np.ndarray, y_lower: np.ndarray, y_upper: np.ndarray
+) -> None:
+    def replace_inf(x: np.ndarray, target_value: float) -> np.ndarray:
        x[np.isinf(x)] = target_value
        return x
-    plt.plot(X, y_lower, 'o', label='y_lower', color='blue')
-    plt.plot(X, y_upper, 'o', label='y_upper', color='fuchsia')
-    plt.vlines(X, ymin=replace_inf(y_lower, 0.01), ymax=replace_inf(y_upper, 1000),
-               label='Range for y', color='gray')
+
+    plt.plot(X, y_lower, "o", label="y_lower", color="blue")
+    plt.plot(X, y_upper, "o", label="y_upper", color="fuchsia")
+    plt.vlines(
+        X,
+        ymin=replace_inf(y_lower, 0.01),
+        ymax=replace_inf(y_upper, 1000.0),
+        label="Range for y",
+        color="gray",
+    )
+

 # Toy data
 X = np.array([1, 2, 3, 4, 5]).reshape((-1, 1))
 INF = np.inf
-y_lower = np.array([ 10,  15, -INF, 30, 100])
-y_upper = np.array([INF, INF,   20, 50, INF])
+y_lower = np.array([10, 15, -INF, 30, 100])
+y_upper = np.array([INF, INF, 20, 50, INF])

 # Visualize toy data
 plt.figure(figsize=(5, 4))
 plot_censored_labels(X, y_lower, y_upper)
 plt.ylim((6, 200))
-plt.legend(loc='lower right')
-plt.title('Toy data')
-plt.xlabel('Input feature')
-plt.ylabel('Label')
-plt.yscale('log')
+plt.legend(loc="lower right")
+plt.title("Toy data")
+plt.xlabel("Input feature")
+plt.ylabel("Label")
+plt.yscale("log")
 plt.tight_layout()
 plt.show(block=True)

@@ -46,54 +56,83 @@ grid_pts = np.linspace(0.8, 5.2, 1000).reshape((-1, 1))

 # Train AFT model using XGBoost
 dmat = xgb.DMatrix(X)
-dmat.set_float_info('label_lower_bound', y_lower)
-dmat.set_float_info('label_upper_bound', y_upper)
-params = {'max_depth': 3, 'objective':'survival:aft', 'min_child_weight': 0}
+dmat.set_float_info("label_lower_bound", y_lower)
+dmat.set_float_info("label_upper_bound", y_upper)
+params = {"max_depth": 3, "objective": "survival:aft", "min_child_weight": 0}

 accuracy_history = []
-def plot_intermediate_model_callback(env):
-    """Custom callback to plot intermediate models"""
-    # Compute y_pred = prediction using the intermediate model, at current boosting iteration
-    y_pred = env.model.predict(dmat)
-    # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper) includes
-    #              the corresponding predicted label (y_pred)
-    acc = np.sum(np.logical_and(y_pred >= y_lower, y_pred <= y_upper)/len(X) * 100)
-    accuracy_history.append(acc)

-    # Plot ranged labels as well as predictions by the model
-    plt.subplot(5, 3, env.iteration + 1)
-    plot_censored_labels(X, y_lower, y_upper)
-    y_pred_grid_pts = env.model.predict(xgb.DMatrix(grid_pts))
-    plt.plot(grid_pts, y_pred_grid_pts, 'r-', label='XGBoost AFT model', linewidth=4)
-    plt.title('Iteration {}'.format(env.iteration), x=0.5, y=0.8)
-    plt.xlim((0.8, 5.2))
-    plt.ylim((1 if np.min(y_pred) < 6 else 6, 200))
-    plt.yscale('log')

-res = {}
-plt.figure(figsize=(12,13))
-bst = xgb.train(params, dmat, 15, [(dmat, 'train')], evals_result=res,
-                callbacks=[plot_intermediate_model_callback])
+class PlotIntermediateModel(xgb.callback.TrainingCallback):
+    """Custom callback to plot intermediate models."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def after_iteration(
+        self,
+        model: xgb.Booster,
+        epoch: int,
+        evals_log: xgb.callback.TrainingCallback.EvalsLog,
+    ) -> bool:
+        """Run after each boosting iteration."""
+        # Compute y_pred = prediction using the intermediate model, at current boosting
+        # iteration
+        y_pred = model.predict(dmat)
+        # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper)
+        #              includes the corresponding predicted label (y_pred)
+        acc = np.sum(
+            np.logical_and(y_pred >= y_lower, y_pred <= y_upper) / len(X) * 100
+        )
+        accuracy_history.append(acc)
+
+        # Plot ranged labels as well as predictions by the model
+        plt.subplot(5, 3, epoch + 1)
+        plot_censored_labels(X, y_lower, y_upper)
+        y_pred_grid_pts = model.predict(xgb.DMatrix(grid_pts))
+        plt.plot(
+            grid_pts, y_pred_grid_pts, "r-", label="XGBoost AFT model", linewidth=4
+        )
+        plt.title("Iteration {}".format(epoch), x=0.5, y=0.8)
+        plt.xlim((0.8, 5.2))
+        plt.ylim((1 if np.min(y_pred) < 6 else 6, 200))
+        plt.yscale("log")
+        return False
+
+
+res: xgb.callback.TrainingCallback.EvalsLog = {}
+plt.figure(figsize=(12, 13))
+bst = xgb.train(
+    params,
+    dmat,
+    15,
+    [(dmat, "train")],
+    evals_result=res,
+    callbacks=[PlotIntermediateModel()],
+)
 plt.tight_layout()
-plt.legend(loc='lower center', ncol=4,
-           bbox_to_anchor=(0.5, 0),
-           bbox_transform=plt.gcf().transFigure)
+plt.legend(
+    loc="lower center",
+    ncol=4,
+    bbox_to_anchor=(0.5, 0),
+    bbox_transform=plt.gcf().transFigure,
+)
 plt.tight_layout()

 # Plot negative log likelihood over boosting iterations
-plt.figure(figsize=(8,3))
+plt.figure(figsize=(8, 3))
 plt.subplot(1, 2, 1)
-plt.plot(res['train']['aft-nloglik'], 'b-o', label='aft-nloglik')
-plt.xlabel('# Boosting Iterations')
-plt.legend(loc='best')
+plt.plot(res["train"]["aft-nloglik"], "b-o", label="aft-nloglik")
+plt.xlabel("# Boosting Iterations")
+plt.legend(loc="best")

 # Plot "accuracy" over boosting iterations
 # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper) includes
 #              the corresponding predicted label (y_pred)
 plt.subplot(1, 2, 2)
-plt.plot(accuracy_history, 'r-o', label='Accuracy (%)')
-plt.xlabel('# Boosting Iterations')
-plt.legend(loc='best')
+plt.plot(accuracy_history, "r-o", label="Accuracy (%)")
+plt.xlabel("# Boosting Iterations")
+plt.legend(loc="best")
 plt.tight_layout()

 plt.show()
--- a/demo/c-api/basic/c-api-demo.c
+++ b/demo/c-api/basic/c-api-demo.c
@@ -53,15 +53,7 @@ int main() {
  // configure the training
  // available parameters are described here:
  //   https://xgboost.readthedocs.io/en/latest/parameter.html
-  safe_xgboost(XGBoosterSetParam(booster, "tree_method", use_gpu ? "gpu_hist" : "hist"));
-  if (use_gpu) {
-    // set the GPU to use;
-    // this is not necessary, but provided here as an illustration
-    safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "0"));
-  } else {
-    // avoid evaluating objective and metric on a GPU
-    safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "-1"));
-  }
+  safe_xgboost(XGBoosterSetParam(booster, "device", use_gpu ? "cuda" : "cpu"));

  safe_xgboost(XGBoosterSetParam(booster, "objective", "binary:logistic"));
  safe_xgboost(XGBoosterSetParam(booster, "min_child_weight", "1"));
--- a/demo/dask/cpu_survival.py
+++ b/demo/dask/cpu_survival.py
@@ -18,43 +18,45 @@ def main(client):
    # The Veterans' Administration Lung Cancer Trial
    # The Statistical Analysis of Failure Time Data by Kalbfleisch J. and Prentice R (1980)
    CURRENT_DIR = os.path.dirname(__file__)
-    df = dd.read_csv(os.path.join(CURRENT_DIR, os.pardir, 'data', 'veterans_lung_cancer.csv'))
+    df = dd.read_csv(
+        os.path.join(CURRENT_DIR, os.pardir, "data", "veterans_lung_cancer.csv")
+    )

    # DaskDMatrix acts like normal DMatrix, works as a proxy for local
    # DMatrix scatter around workers.
    # For AFT survival, you'd need to extract the lower and upper bounds for the label
    # and pass them as arguments to DaskDMatrix.
-    y_lower_bound = df['Survival_label_lower_bound']
-    y_upper_bound = df['Survival_label_upper_bound']
-    X = df.drop(['Survival_label_lower_bound',
-                 'Survival_label_upper_bound'], axis=1)
-    dtrain = DaskDMatrix(client, X, label_lower_bound=y_lower_bound,
-                         label_upper_bound=y_upper_bound)
+    y_lower_bound = df["Survival_label_lower_bound"]
+    y_upper_bound = df["Survival_label_upper_bound"]
+    X = df.drop(["Survival_label_lower_bound", "Survival_label_upper_bound"], axis=1)
+    dtrain = DaskDMatrix(
+        client, X, label_lower_bound=y_lower_bound, label_upper_bound=y_upper_bound
+    )

    # Use train method from xgboost.dask instead of xgboost.  This
    # distributed version of train returns a dictionary containing the
    # resulting booster and evaluation history obtained from
    # evaluation metrics.
-    params = {'verbosity': 1,
-              'objective': 'survival:aft',
-              'eval_metric': 'aft-nloglik',
-              'learning_rate': 0.05,
-              'aft_loss_distribution_scale': 1.20,
-              'aft_loss_distribution': 'normal',
-              'max_depth': 6,
-              'lambda': 0.01,
-              'alpha': 0.02}
-    output = xgb.dask.train(client,
-                            params,
-                            dtrain,
-                            num_boost_round=100,
-                            evals=[(dtrain, 'train')])
-    bst = output['booster']
-    history = output['history']
+    params = {
+        "verbosity": 1,
+        "objective": "survival:aft",
+        "eval_metric": "aft-nloglik",
+        "learning_rate": 0.05,
+        "aft_loss_distribution_scale": 1.20,
+        "aft_loss_distribution": "normal",
+        "max_depth": 6,
+        "lambda": 0.01,
+        "alpha": 0.02,
+    }
+    output = xgb.dask.train(
+        client, params, dtrain, num_boost_round=100, evals=[(dtrain, "train")]
+    )
+    bst = output["booster"]
+    history = output["history"]

    # you can pass output directly into `predict` too.
    prediction = xgb.dask.predict(client, bst, dtrain)
-    print('Evaluation history: ', history)
+    print("Evaluation history: ", history)

    # Uncomment the following line to save the model to the disk
    # bst.save_model('survival_model.json')
@@ -62,7 +64,7 @@ def main(client):
    return prediction


-if __name__ == '__main__':
+if __name__ == "__main__":
    # or use other clusters for scaling
    with LocalCluster(n_workers=7, threads_per_worker=4) as cluster:
        with Client(cluster) as client:
--- a/demo/dask/cpu_training.py
+++ b/demo/dask/cpu_training.py
@@ -15,7 +15,7 @@ def main(client):
    m = 100000
    n = 100
    X = da.random.random(size=(m, n), chunks=100)
-    y = da.random.random(size=(m, ), chunks=100)
+    y = da.random.random(size=(m,), chunks=100)

    # DaskDMatrix acts like normal DMatrix, works as a proxy for local
    # DMatrix scatter around workers.
@@ -25,21 +25,23 @@ def main(client):
    # distributed version of train returns a dictionary containing the
    # resulting booster and evaluation history obtained from
    # evaluation metrics.
-    output = xgb.dask.train(client,
-                            {'verbosity': 1,
-                             'tree_method': 'hist'},
-                            dtrain,
-                            num_boost_round=4, evals=[(dtrain, 'train')])
-    bst = output['booster']
-    history = output['history']
+    output = xgb.dask.train(
+        client,
+        {"verbosity": 1, "tree_method": "hist"},
+        dtrain,
+        num_boost_round=4,
+        evals=[(dtrain, "train")],
+    )
+    bst = output["booster"]
+    history = output["history"]

    # you can pass output directly into `predict` too.
    prediction = xgb.dask.predict(client, bst, dtrain)
-    print('Evaluation history:', history)
+    print("Evaluation history:", history)
    return prediction


-if __name__ == '__main__':
+if __name__ == "__main__":
    # or use other clusters for scaling
    with LocalCluster(n_workers=7, threads_per_worker=4) as cluster:
        with Client(cluster) as client:
--- a/demo/dask/gpu_training.py
+++ b/demo/dask/gpu_training.py
@@ -13,33 +13,38 @@ from xgboost import dask as dxgb
 from xgboost.dask import DaskDMatrix


-def using_dask_matrix(client: Client, X, y):
-    # DaskDMatrix acts like normal DMatrix, works as a proxy for local
-    # DMatrix scatter around workers.
+def using_dask_matrix(client: Client, X: da.Array, y: da.Array) -> da.Array:
+    # DaskDMatrix acts like normal DMatrix, works as a proxy for local DMatrix scatter
+    # around workers.
    dtrain = DaskDMatrix(client, X, y)

-    # Use train method from xgboost.dask instead of xgboost.  This
-    # distributed version of train returns a dictionary containing the
-    # resulting booster and evaluation history obtained from
-    # evaluation metrics.
-    output = xgb.dask.train(client,
-                            {'verbosity': 2,
-                             # Golden line for GPU training
-                             'tree_method': 'gpu_hist'},
-                            dtrain,
-                            num_boost_round=4, evals=[(dtrain, 'train')])
-    bst = output['booster']
-    history = output['history']
+    # Use train method from xgboost.dask instead of xgboost.  This distributed version
+    # of train returns a dictionary containing the resulting booster and evaluation
+    # history obtained from evaluation metrics.
+    output = xgb.dask.train(
+        client,
+        {
+            "verbosity": 2,
+            "tree_method": "hist",
+            # Golden line for GPU training
+            "device": "cuda",
+        },
+        dtrain,
+        num_boost_round=4,
+        evals=[(dtrain, "train")],
+    )
+    bst = output["booster"]
+    history = output["history"]

    # you can pass output directly into `predict` too.
    prediction = xgb.dask.predict(client, bst, dtrain)
-    print('Evaluation history:', history)
+    print("Evaluation history:", history)
    return prediction


-def using_quantile_device_dmatrix(client: Client, X, y):
-    """`DaskQuantileDMatrix` is a data type specialized for `gpu_hist` and `hist` tree
-     methods for reducing memory usage.
+def using_quantile_device_dmatrix(client: Client, X: da.Array, y: da.Array) -> da.Array:
+    """`DaskQuantileDMatrix` is a data type specialized for the `hist` tree method for
+    reducing memory usage.

    .. versionadded:: 1.2.0

@@ -52,26 +57,28 @@ def using_quantile_device_dmatrix(client: Client, X, y):
    # the `ref` argument of `DaskQuantileDMatrix`.
    dtrain = dxgb.DaskQuantileDMatrix(client, X, y)
    output = xgb.dask.train(
-        client, {"verbosity": 2, "tree_method": "gpu_hist"}, dtrain, num_boost_round=4
+        client,
+        {"verbosity": 2, "tree_method": "hist", "device": "cuda"},
+        dtrain,
+        num_boost_round=4,
    )

    prediction = xgb.dask.predict(client, output, X)
    return prediction


-if __name__ == '__main__':
+if __name__ == "__main__":
    # `LocalCUDACluster` is used for assigning GPU to XGBoost processes.  Here
-    # `n_workers` represents the number of GPUs since we use one GPU per worker
-    # process.
+    # `n_workers` represents the number of GPUs since we use one GPU per worker process.
    with LocalCUDACluster(n_workers=2, threads_per_worker=4) as cluster:
        with Client(cluster) as client:
            # generate some random data for demonstration
            m = 100000
            n = 100
            X = da.random.random(size=(m, n), chunks=10000)
-            y = da.random.random(size=(m, ), chunks=10000)
+            y = da.random.random(size=(m,), chunks=10000)

-            print('Using DaskQuantileDMatrix')
+            print("Using DaskQuantileDMatrix")
            from_ddqdm = using_quantile_device_dmatrix(client, X, y)
-            print('Using DMatrix')
+            print("Using DMatrix")
            from_dmatrix = using_dask_matrix(client, X, y)
--- a/demo/dask/sklearn_gpu_training.py
+++ b/demo/dask/sklearn_gpu_training.py
@@ -21,7 +21,8 @@ def main(client):
    y = da.random.random(m, partition_size)

    regressor = xgboost.dask.DaskXGBRegressor(verbosity=1)
-    regressor.set_params(tree_method='gpu_hist')
+    # set the device to CUDA
+    regressor.set_params(tree_method="hist", device="cuda")
    # assigning client here is optional
    regressor.client = client

@@ -31,13 +32,13 @@ def main(client):
    bst = regressor.get_booster()
    history = regressor.evals_result()

-    print('Evaluation history:', history)
+    print("Evaluation history:", history)
    # returned prediction is always a dask array.
    assert isinstance(prediction, da.Array)
-    return bst                  # returning the trained model
+    return bst  # returning the trained model


-if __name__ == '__main__':
+if __name__ == "__main__":
    # With dask cuda, one can scale up XGBoost to arbitrary GPU clusters.
    # `LocalCUDACluster` used here is only for demonstration purpose.
    with LocalCUDACluster() as cluster:
--- a/demo/gpu_acceleration/README.md
+++ b/demo/gpu_acceleration/README.md
@@ -1,5 +0,0 @@
-# GPU Acceleration Demo
-
-`cover_type.py` shows how to train a model on the [forest cover type](https://archive.ics.uci.edu/ml/datasets/covertype) dataset using GPU acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it time consuming to process. We compare the run-time and accuracy of the GPU and CPU histogram algorithms.
-
-`shap.ipynb` demonstrates using GPU acceleration to compute SHAP values for feature importance.
--- a/demo/gpu_acceleration/README.rst
+++ b/demo/gpu_acceleration/README.rst
@@ -0,0 +1,8 @@
+:orphan:
+
+GPU Acceleration Demo
+=====================
+
+This is a collection of demonstration scripts to showcase the basic usage of GPU. Please
+see :doc:`/gpu/index` for more info. There are other demonstrations for distributed GPU
+training using dask or spark.
--- a/demo/gpu_acceleration/cover_type.py
+++ b/demo/gpu_acceleration/cover_type.py
@@ -1,41 +1,49 @@
+"""
+Using xgboost on GPU devices
+============================
+
+Shows how to train a model on the `forest cover type
+<https://archive.ics.uci.edu/ml/datasets/covertype>`_ dataset using GPU
+acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it
+time consuming to process. We compare the run-time and accuracy of the GPU and CPU
+histogram algorithms.
+
+In addition, the demo showcases using GPU with other GPU-related libraries including
+cupy and cuml. These libraries are not strictly required.
+
+"""
 import time

+import cupy as cp
+from cuml.model_selection import train_test_split
 from sklearn.datasets import fetch_covtype
-from sklearn.model_selection import train_test_split

 import xgboost as xgb

 # Fetch dataset using sklearn
-cov = fetch_covtype()
-X = cov.data
-y = cov.target
+X, y = fetch_covtype(return_X_y=True)
+X = cp.array(X)
+y = cp.array(y)
+y -= y.min()

 # Create 0.75/0.25 train/test split
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75,
-                                                    random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.25, train_size=0.75, random_state=42
+)

 # Specify sufficient boosting iterations to reach a minimum
 num_round = 3000

 # Leave most parameters as default
-param = {'objective': 'multi:softmax', # Specify multiclass classification
-         'num_class': 8, # Number of possible output classes
-         'tree_method': 'gpu_hist' # Use GPU accelerated algorithm
-         }
-
-# Convert input data from numpy to XGBoost format
-dtrain = xgb.DMatrix(X_train, label=y_train)
-dtest = xgb.DMatrix(X_test, label=y_test)
-
-gpu_res = {} # Store accuracy result
-tmp = time.time()
+clf = xgb.XGBClassifier(device="cuda", n_estimators=num_round)
 # Train model
-xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=gpu_res)
-print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))
+start = time.time()
+clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+gpu_res = clf.evals_result()
+print("GPU Training Time: %s seconds" % (str(time.time() - start)))

 # Repeat for CPU algorithm
-tmp = time.time()
-param['tree_method'] = 'hist'
-cpu_res = {}
-xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=cpu_res)
-print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))
+clf = xgb.XGBClassifier(device="cpu", n_estimators=num_round)
+start = time.time()
+cpu_res = clf.fit(X_train, y_train, eval_set=[(X_test, y_test)]).evals_result()
+print("CPU Training Time: %s seconds" % (str(time.time() - start)))
--- a/demo/gpu_acceleration/shap.ipynb
+++ b/demo/gpu_acceleration/shap.ipynb
--- a/demo/gpu_acceleration/tree_shap.py
+++ b/demo/gpu_acceleration/tree_shap.py
@@ -0,0 +1,55 @@
+"""
+Use GPU to speedup SHAP value computation
+=========================================
+
+Demonstrates using GPU acceleration to compute SHAP values for feature importance.
+
+"""
+import shap
+from sklearn.datasets import fetch_california_housing
+
+import xgboost as xgb
+
+# Fetch dataset using sklearn
+data = fetch_california_housing()
+print(data.DESCR)
+X = data.data
+y = data.target
+
+num_round = 500
+
+param = {
+    "eta": 0.05,
+    "max_depth": 10,
+    "tree_method": "hist",
+    "device": "cuda",
+}
+
+# GPU accelerated training
+dtrain = xgb.DMatrix(X, label=y, feature_names=data.feature_names)
+model = xgb.train(param, dtrain, num_round)
+
+# Compute shap values using GPU with xgboost
+model.set_param({"device": "cuda"})
+shap_values = model.predict(dtrain, pred_contribs=True)
+
+# Compute shap interaction values using GPU
+shap_interaction_values = model.predict(dtrain, pred_interactions=True)
+
+
+# shap will call the GPU accelerated version as long as the device parameter is set to
+# "cuda"
+explainer = shap.TreeExplainer(model)
+shap_values = explainer.shap_values(X)
+
+# visualize the first prediction's explanation
+shap.force_plot(
+    explainer.expected_value,
+    shap_values[0, :],
+    X[0, :],
+    feature_names=data.feature_names,
+    matplotlib=True,
+)
+
+# Show a summary of feature importance
+shap.summary_plot(shap_values, X, plot_type="bar", feature_names=data.feature_names)
--- a/demo/guide-python/callbacks.py
+++ b/demo/guide-python/callbacks.py
@@ -1,9 +1,9 @@
-'''
+"""
 Demo for using and defining callback functions
 ==============================================

    .. versionadded:: 1.3.0
-'''
+"""
 import argparse
 import os
 import tempfile
@@ -17,10 +17,11 @@ import xgboost as xgb


 class Plotting(xgb.callback.TrainingCallback):
-    '''Plot evaluation result during training.  Only for demonstration purpose as it's quite
+    """Plot evaluation result during training.  Only for demonstration purpose as it's quite
    slow to draw.

-    '''
+    """
+
    def __init__(self, rounds):
        self.fig = plt.figure()
        self.ax = self.fig.add_subplot(111)
@@ -31,16 +32,16 @@ class Plotting(xgb.callback.TrainingCallback):
        plt.ion()

    def _get_key(self, data, metric):
-        return f'{data}-{metric}'
+        return f"{data}-{metric}"

    def after_iteration(self, model, epoch, evals_log):
-        '''Update the plot.'''
+        """Update the plot."""
        if not self.lines:
            for data, metric in evals_log.items():
                for metric_name, log in metric.items():
                    key = self._get_key(data, metric_name)
                    expanded = log + [0] * (self.rounds - len(log))
-                    self.lines[key],  = self.ax.plot(self.x, expanded, label=key)
+                    (self.lines[key],) = self.ax.plot(self.x, expanded, label=key)
                    self.ax.legend()
        else:
            # https://pythonspot.com/matplotlib-update-plot/
@@ -55,8 +56,8 @@ class Plotting(xgb.callback.TrainingCallback):


 def custom_callback():
-    '''Demo for defining a custom callback function that plots evaluation result during
-    training.'''
+    """Demo for defining a custom callback function that plots evaluation result during
+    training."""
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

@@ -69,14 +70,16 @@ def custom_callback():
    # Pass it to the `callbacks` parameter as a list.
    xgb.train(
        {
-            'objective': 'binary:logistic',
-            'eval_metric': ['error', 'rmse'],
-            'tree_method': 'gpu_hist'
+            "objective": "binary:logistic",
+            "eval_metric": ["error", "rmse"],
+            "tree_method": "hist",
+            "device": "cuda",
        },
        D_train,
-        evals=[(D_train, 'Train'), (D_valid, 'Valid')],
+        evals=[(D_train, "Train"), (D_valid, "Valid")],
        num_boost_round=num_boost_round,
-        callbacks=[plotting])
+        callbacks=[plotting],
+    )


 def check_point_callback():
@@ -89,10 +92,10 @@ def check_point_callback():
            if i == 0:
                continue
            if as_pickle:
-                path = os.path.join(tmpdir, 'model_' + str(i) + '.pkl')
+                path = os.path.join(tmpdir, "model_" + str(i) + ".pkl")
            else:
-                path = os.path.join(tmpdir, 'model_' + str(i) + '.json')
-            assert(os.path.exists(path))
+                path = os.path.join(tmpdir, "model_" + str(i) + ".json")
+            assert os.path.exists(path)

    X, y = load_breast_cancer(return_X_y=True)
    m = xgb.DMatrix(X, y)
@@ -100,31 +103,36 @@ def check_point_callback():
    with tempfile.TemporaryDirectory() as tmpdir:
        # Use callback class from xgboost.callback
        # Feel free to subclass/customize it to suit your need.
-        check_point = xgb.callback.TrainingCheckPoint(directory=tmpdir,
-                                                      iterations=rounds,
-                                                      name='model')
-        xgb.train({'objective': 'binary:logistic'}, m,
-                  num_boost_round=10,
-                  verbose_eval=False,
-                  callbacks=[check_point])
+        check_point = xgb.callback.TrainingCheckPoint(
+            directory=tmpdir, iterations=rounds, name="model"
+        )
+        xgb.train(
+            {"objective": "binary:logistic"},
+            m,
+            num_boost_round=10,
+            verbose_eval=False,
+            callbacks=[check_point],
+        )
        check(False)

        # This version of checkpoint saves everything including parameters and
        # model.  See: doc/tutorials/saving_model.rst
-        check_point = xgb.callback.TrainingCheckPoint(directory=tmpdir,
-                                                      iterations=rounds,
-                                                      as_pickle=True,
-                                                      name='model')
-        xgb.train({'objective': 'binary:logistic'}, m,
-                  num_boost_round=10,
-                  verbose_eval=False,
-                  callbacks=[check_point])
+        check_point = xgb.callback.TrainingCheckPoint(
+            directory=tmpdir, iterations=rounds, as_pickle=True, name="model"
+        )
+        xgb.train(
+            {"objective": "binary:logistic"},
+            m,
+            num_boost_round=10,
+            verbose_eval=False,
+            callbacks=[check_point],
+        )
        check(True)


-if __name__ == '__main__':
+if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument('--plot', default=1, type=int)
+    parser.add_argument("--plot", default=1, type=int)
    args = parser.parse_args()

    check_point_callback()
--- a/demo/guide-python/cat_in_the_dat.py
+++ b/demo/guide-python/cat_in_the_dat.py
@@ -63,7 +63,8 @@ def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]:


 params = {
-    "tree_method": "gpu_hist",
+    "tree_method": "hist",
+    "device": "cuda",
    "n_estimators": 32,
    "colsample_bylevel": 0.7,
 }
--- a/demo/guide-python/categorical.py
+++ b/demo/guide-python/categorical.py
@@ -58,13 +58,13 @@ def main() -> None:
    # Specify `enable_categorical` to True, also we use onehot encoding based split
    # here for demonstration. For details see the document of `max_cat_to_onehot`.
    reg = xgb.XGBRegressor(
-        tree_method="gpu_hist", enable_categorical=True, max_cat_to_onehot=5
+        tree_method="hist", enable_categorical=True, max_cat_to_onehot=5, device="cuda"
    )
    reg.fit(X, y, eval_set=[(X, y)])

    # Pass in already encoded data
    X_enc, y_enc = make_categorical(100, 10, 4, True)
-    reg_enc = xgb.XGBRegressor(tree_method="gpu_hist")
+    reg_enc = xgb.XGBRegressor(tree_method="hist", device="cuda")
    reg_enc.fit(X_enc, y_enc, eval_set=[(X_enc, y_enc)])

    reg_results = np.array(reg.evals_result()["validation_0"]["rmse"])
--- a/demo/guide-python/external_memory.py
+++ b/demo/guide-python/external_memory.py
@@ -22,7 +22,10 @@ import xgboost


 def make_batches(
-    n_samples_per_batch: int, n_features: int, n_batches: int, tmpdir: str,
+    n_samples_per_batch: int,
+    n_features: int,
+    n_batches: int,
+    tmpdir: str,
 ) -> List[Tuple[str, str]]:
    files: List[Tuple[str, str]] = []
    rng = np.random.RandomState(1994)
@@ -38,6 +41,7 @@ def make_batches(

 class Iterator(xgboost.DataIter):
    """A custom iterator for loading files in batches."""
+
    def __init__(self, file_paths: List[Tuple[str, str]]):
        self._file_paths = file_paths
        self._it = 0
@@ -82,10 +86,11 @@ def main(tmpdir: str) -> xgboost.Booster:
    missing = np.NaN
    Xy = xgboost.DMatrix(it, missing=missing, enable_categorical=False)

-    # Other tree methods including ``hist`` and ``gpu_hist`` also work, see tutorial in
+    # ``approx`` is also supported, but less efficient due to sketching. GPU behaves
+    # differently than CPU tree methods as it uses a hybrid approach. See tutorial in
    # doc for details.
    booster = xgboost.train(
-        {"tree_method": "approx", "max_depth": 2},
+        {"tree_method": "hist", "max_depth": 4},
        Xy,
        evals=[(Xy, "Train")],
        num_boost_round=10,
--- a/demo/guide-python/learning_to_rank.py
+++ b/demo/guide-python/learning_to_rank.py
@@ -104,7 +104,8 @@ def ranking_demo(args: argparse.Namespace) -> None:
    qid_test = qid_test[sorted_idx]

    ranker = xgb.XGBRanker(
-        tree_method="gpu_hist",
+        tree_method="hist",
+        device="cuda",
        lambdarank_pair_method="topk",
        lambdarank_num_pair_per_sample=13,
        eval_metric=["ndcg@1", "ndcg@8"],
@@ -161,7 +162,8 @@ def click_data_demo(args: argparse.Namespace) -> None:

    ranker = xgb.XGBRanker(
        n_estimators=512,
-        tree_method="gpu_hist",
+        tree_method="hist",
+        device="cuda",
        learning_rate=0.01,
        reg_lambda=1.5,
        subsample=0.8,
--- a/demo/guide-python/quantile_data_iterator.py
+++ b/demo/guide-python/quantile_data_iterator.py
@@ -23,22 +23,23 @@ import numpy
 import xgboost

 COLS = 64
-ROWS_PER_BATCH = 1000            # data is splited by rows
+ROWS_PER_BATCH = 1000  # data is split by rows
 BATCHES = 32


 class IterForDMatrixDemo(xgboost.core.DataIter):
-    '''A data iterator for XGBoost DMatrix.
+    """A data iterator for XGBoost DMatrix.

    `reset` and `next` are required for any data iterator, other functions here
    are utilites for demonstration's purpose.

-    '''
+    """
+
    def __init__(self):
-        '''Generate some random data for demostration.
+        """Generate some random data for demonstration.

        Actual data can be anything that is currently supported by XGBoost.
-        '''
+        """
        self.rows = ROWS_PER_BATCH
        self.cols = COLS
        rng = cupy.random.RandomState(1994)
@@ -46,7 +47,7 @@ class IterForDMatrixDemo(xgboost.core.DataIter):
        self._labels = [rng.randn(self.rows)] * BATCHES
        self._weights = [rng.uniform(size=self.rows)] * BATCHES

-        self.it = 0             # set iterator to 0
+        self.it = 0  # set iterator to 0
        super().__init__()

    def as_array(self):
@@ -59,27 +60,26 @@ class IterForDMatrixDemo(xgboost.core.DataIter):
        return cupy.concatenate(self._weights)

    def data(self):
-        '''Utility function for obtaining current batch of data.'''
+        """Utility function for obtaining current batch of data."""
        return self._data[self.it]

    def labels(self):
-        '''Utility function for obtaining current batch of label.'''
+        """Utility function for obtaining current batch of label."""
        return self._labels[self.it]

    def weights(self):
        return self._weights[self.it]

    def reset(self):
-        '''Reset the iterator'''
+        """Reset the iterator"""
        self.it = 0

    def next(self, input_data):
-        '''Yield next batch of data.'''
+        """Yield next batch of data."""
        if self.it == len(self._data):
            # Return 0 when there's no more batch.
            return 0
-        input_data(data=self.data(), label=self.labels(),
-                   weight=self.weights())
+        input_data(data=self.data(), label=self.labels(), weight=self.weights())
        self.it += 1
        return 1

@@ -103,18 +103,19 @@ def main():

    assert m_with_it.num_col() == m.num_col()
    assert m_with_it.num_row() == m.num_row()
-    # Tree meethod must be one of the `hist` or `gpu_hist`. We use `gpu_hist` for GPU
-    # input here.
+    # Tree method must be `hist`.
    reg_with_it = xgboost.train(
-        {"tree_method": "gpu_hist"}, m_with_it, num_boost_round=rounds
+        {"tree_method": "hist", "device": "cuda"}, m_with_it, num_boost_round=rounds
    )
    predict_with_it = reg_with_it.predict(m_with_it)

-    reg = xgboost.train({"tree_method": "gpu_hist"}, m, num_boost_round=rounds)
+    reg = xgboost.train(
+        {"tree_method": "hist", "device": "cuda"}, m, num_boost_round=rounds
+    )
    predict = reg.predict(m)

    numpy.testing.assert_allclose(predict_with_it, predict, rtol=1e6)


-if __name__ == '__main__':
+if __name__ == "__main__":
    main()
--- a/demo/guide-python/quantile_regression.py
+++ b/demo/guide-python/quantile_regression.py
@@ -7,6 +7,11 @@ Quantile Regression
 The script is inspired by this awesome example in sklearn:
 https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_quantile.html

+.. note::
+
+    The feature is only supported using the Python package. In addition, quantile
+    crossing can happen due to a limitation in the algorithm.
+
 """
 import argparse
 from typing import Dict
--- a/demo/guide-python/update_process.py
+++ b/demo/guide-python/update_process.py
@@ -24,7 +24,7 @@ def main():
    Xy = xgb.DMatrix(X_train, y_train)
    evals_result: xgb.callback.EvaluationMonitor.EvalsLog = {}
    booster = xgb.train(
-        {"tree_method": "gpu_hist", "max_depth": 6},
+        {"tree_method": "hist", "max_depth": 6, "device": "cuda"},
        Xy,
        num_boost_round=n_rounds,
        evals=[(Xy, "Train")],
@@ -33,8 +33,8 @@ def main():
    SHAP = booster.predict(Xy, pred_contribs=True)

    # Refresh the leaf value and tree statistic
-    X_refresh = X[X.shape[0] // 2:]
-    y_refresh = y[y.shape[0] // 2:]
+    X_refresh = X[X.shape[0] // 2 :]
+    y_refresh = y[y.shape[0] // 2 :]
    Xy_refresh = xgb.DMatrix(X_refresh, y_refresh)
    # The model will adapt to other half of the data by changing leaf value (no change in
    # split condition) with refresh_leaf set to True.
@@ -87,7 +87,7 @@ def main():
    np.testing.assert_allclose(
        np.array(prune_result["Original"]["rmse"]),
        np.array(prune_result["Train"]["rmse"]),
-        atol=1e-5
+        atol=1e-5,
    )


--- a/demo/nvflare/.gitignore
+++ b/demo/nvflare/.gitignore
@@ -0,0 +1 @@
+!config
--- a/demo/nvflare/config/config_fed_client.json
+++ b/demo/nvflare/config/config_fed_client.json
@@ -0,0 +1,23 @@
+{
+  "format_version": 2,
+  "executors": [
+    {
+      "tasks": [
+        "train"
+      ],
+      "executor": {
+        "path": "trainer.XGBoostTrainer",
+        "args": {
+          "server_address": "localhost:9091",
+          "world_size": 2,
+          "server_cert_path": "server-cert.pem",
+          "client_key_path": "client-key.pem",
+          "client_cert_path": "client-cert.pem",
+          "use_gpus": false
+        }
+      }
+    }
+  ],
+  "task_result_filters": [],
+  "task_data_filters": []
+}
--- a/demo/nvflare/config/config_fed_server.json
+++ b/demo/nvflare/config/config_fed_server.json
@@ -0,0 +1,22 @@
+{
+  "format_version": 2,
+  "server": {
+    "heart_beat_timeout": 600
+  },
+  "task_data_filters": [],
+  "task_result_filters": [],
+  "workflows": [
+    {
+      "id": "server_workflow",
+      "path": "controller.XGBoostController",
+      "args": {
+        "port": 9091,
+        "world_size": 2,
+        "server_key_path": "server-key.pem",
+        "server_cert_path": "server-cert.pem",
+        "client_cert_path": "client-cert.pem"
+      }
+    }
+  ],
+  "components": []
+}
--- a/demo/nvflare/horizontal/README.md
+++ b/demo/nvflare/horizontal/README.md
@@ -6,7 +6,7 @@ This directory contains a demo of Horizontal Federated Learning using
 ## Training with CPU only

 To run the demo, first build XGBoost with the federated learning plugin enabled (see the
-[README](../../plugin/federated/README.md)).
+[README](../../../plugin/federated/README.md)).

 Install NVFlare (note that currently NVFlare only supports Python 3.8):
 ```shell
--- a/demo/nvflare/horizontal/custom/trainer.py
+++ b/demo/nvflare/horizontal/custom/trainer.py
@@ -70,8 +70,7 @@ class XGBoostTrainer(Executor):
            param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
            if self._use_gpus:
                self.log_info(fl_ctx, f'Training with GPU {rank}')
-                param['tree_method'] = 'gpu_hist'
-                param['gpu_id'] = rank
+                param['device'] = f"cuda:{rank}"

            # Specify validations set to watch performance
            watchlist = [(dtest, 'eval'), (dtrain, 'train')]
--- a/demo/nvflare/horizontal/prepare_data.sh
+++ b/demo/nvflare/horizontal/prepare_data.sh
@@ -16,7 +16,7 @@ split -n l/${world_size} --numeric-suffixes=1 -a 1 ../../data/agaricus.txt.test

 nvflare poc -n 2 --prepare
 mkdir -p /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
-cp -fr config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
+cp -fr ../config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
 cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
 for (( site=1; site<=world_size; site++ )); do
  cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$site"/
--- a/demo/nvflare/vertical/README.md
+++ b/demo/nvflare/vertical/README.md
@@ -6,7 +6,7 @@ This directory contains a demo of Vertical Federated Learning using
 ## Training with CPU only

 To run the demo, first build XGBoost with the federated learning plugin enabled (see the
-[README](../../plugin/federated/README.md)).
+[README](../../../plugin/federated/README.md)).

 Install NVFlare (note that currently NVFlare only supports Python 3.8):
 ```shell
--- a/demo/nvflare/vertical/custom/trainer.py
+++ b/demo/nvflare/vertical/custom/trainer.py
@@ -16,7 +16,7 @@ class SupportedTasks(object):

 class XGBoostTrainer(Executor):
    def __init__(self, server_address: str, world_size: int, server_cert_path: str,
-                 client_key_path: str, client_cert_path: str):
+                 client_key_path: str, client_cert_path: str, use_gpus: bool):
        """Trainer for federated XGBoost.

        Args:
@@ -32,6 +32,7 @@ class XGBoostTrainer(Executor):
        self._server_cert_path = server_cert_path
        self._client_key_path = client_key_path
        self._client_cert_path = client_cert_path
+        self._use_gpus = use_gpus

    def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
                abort_signal: Signal) -> Shareable:
@@ -81,6 +82,8 @@ class XGBoostTrainer(Executor):
                'objective': 'binary:logistic',
                'eval_metric': 'auc',
            }
+            if self._use_gpus:
+                self.log_info(fl_ctx, 'GPUs are not currently supported by vertical federated XGBoost')

            # specify validations set to watch performance
            watchlist = [(dtest, "eval"), (dtrain, "train")]
--- a/demo/nvflare/vertical/prepare_data.sh
+++ b/demo/nvflare/vertical/prepare_data.sh
@@ -56,7 +56,7 @@ fi

 nvflare poc -n 2 --prepare
 mkdir -p /tmp/nvflare/poc/admin/transfer/vertical-xgboost
-cp -fr config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost
+cp -fr ../config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost
 cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
 for (( site=1; site<=world_size; site++ )); do
  cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"${site}"/
--- a/demo/rmm_plugin/README.md
+++ b/demo/rmm_plugin/README.md
@@ -1,47 +0,0 @@
-Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL)
-====================================================================
-[RAPIDS Memory Manager (RMM)](https://github.com/rapidsai/rmm) library provides a collection of
-efficient memory allocators for NVIDIA GPUs. It is now possible to use XGBoost with memory
-allocators provided by RMM, by enabling the RMM integration plugin.
-
-The demos in this directory highlights one RMM allocator in particular: **the pool sub-allocator**.
-This allocator addresses the slow speed of `cudaMalloc()` by allocating a large chunk of memory
-upfront. Subsequent allocations will draw from the pool of already allocated memory and thus avoid
-the overhead of calling `cudaMalloc()` directly. See
-[this GTC talk slides](https://on-demand.gputechconf.com/gtc/2015/presentation/S5530-Stephen-Jones.pdf)
-for more details.
-
-Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To do this,
-run CMake with option `-DPLUGIN_RMM=ON` (`-DUSE_CUDA=ON` also required):
-```
-cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON
-make -j4
-```
-CMake will attempt to locate the RMM library in your build environment. You may choose to build
-RMM from the source, or install it using the Conda package manager. If CMake cannot find RMM, you
-should specify the location of RMM with the CMake prefix:
-```
-# If using Conda:
-cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
-# If using RMM installed with a custom location
-cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm
-```
-
-# Informing XGBoost about RMM pool
-
-When XGBoost is compiled with RMM, most of the large size allocation will go through RMM
-allocators, but some small allocations in performance critical areas are using a different
-caching allocator so that we can have better control over memory allocation behavior.
-Users can override this behavior and force the use of rmm for all allocations by setting
-the global configuration ``use_rmm``:
-
-``` python
-with xgb.config_context(use_rmm=True):
-    clf = xgb.XGBClassifier(tree_method="gpu_hist")
-```
-
-Depending on the choice of memory pool size or type of allocator, this may have negative
-performance impact.
-
-* [Using RMM with a single GPU](./rmm_singlegpu.py)
-* [Using RMM with a local Dask cluster consisting of multiple GPUs](./rmm_mgpu_with_dask.py)
--- a/demo/rmm_plugin/README.rst
+++ b/demo/rmm_plugin/README.rst
@@ -0,0 +1,51 @@
+Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL)
+====================================================================
+
+`RAPIDS Memory Manager (RMM) <https://github.com/rapidsai/rmm>`__ library provides a
+collection of efficient memory allocators for NVIDIA GPUs. It is now possible to use
+XGBoost with memory allocators provided by RMM, by enabling the RMM integration plugin.
+
+The demos in this directory highlight one RMM allocator in particular: **the pool
+sub-allocator**.  This allocator addresses the slow speed of ``cudaMalloc()`` by
+allocating a large chunk of memory upfront. Subsequent allocations will draw from the pool
+of already allocated memory and thus avoid the overhead of calling ``cudaMalloc()``
+directly. See `these GTC talk slides
+<https://on-demand.gputechconf.com/gtc/2015/presentation/S5530-Stephen-Jones.pdf>`_ for
+more details.
+
+Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To do this,
+run CMake with option ``-DPLUGIN_RMM=ON`` (``-DUSE_CUDA=ON`` also required):
+
+.. code-block:: sh
+
+  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON
+  make -j$(nproc)
+
+CMake will attempt to locate the RMM library in your build environment. You may choose to build
+RMM from the source, or install it using the Conda package manager. If CMake cannot find RMM, you
+should specify the location of RMM with the CMake prefix:
+
+.. code-block:: sh
+
+  # If using Conda:
+  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
+  # If using RMM installed with a custom location
+  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm
+
+********************************
+Informing XGBoost about RMM pool
+********************************
+
+When XGBoost is compiled with RMM, most of the large allocations will go through RMM
+allocators, but some small allocations in performance critical areas are using a different
+caching allocator so that we can have better control over memory allocation behavior.
+Users can override this behavior and force the use of rmm for all allocations by setting
+the global configuration ``use_rmm``:
+
+.. code-block:: python
+
+  with xgb.config_context(use_rmm=True):
+    clf = xgb.XGBClassifier(tree_method="hist", device="cuda")
+
+Depending on the choice of memory pool size or type of allocator, this may have negative
+performance impact.
--- a/demo/rmm_plugin/rmm_mgpu_with_dask.py
+++ b/demo/rmm_plugin/rmm_mgpu_with_dask.py
@@ -1,3 +1,7 @@
+"""
+Using rmm with Dask
+===================
+"""
 import dask
 from dask.distributed import Client
 from dask_cuda import LocalCUDACluster
@@ -11,25 +15,33 @@ def main(client):
    # xgb.set_config(use_rmm=True)

    X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
-    # In pratice one should prefer loading the data with dask collections instead of using
-    # `from_array`.
+    # In practice one should prefer loading the data with dask collections instead of
+    # using `from_array`.
    X = dask.array.from_array(X)
    y = dask.array.from_array(y)
    dtrain = xgb.dask.DaskDMatrix(client, X, label=y)

-    params = {'max_depth': 8, 'eta': 0.01, 'objective': 'multi:softprob', 'num_class': 3,
-              'tree_method': 'gpu_hist', 'eval_metric': 'merror'}
-    output = xgb.dask.train(client, params, dtrain, num_boost_round=100,
-                            evals=[(dtrain, 'train')])
-    bst = output['booster']
-    history = output['history']
-    for i, e in enumerate(history['train']['merror']):
-        print(f'[{i}] train-merror: {e}')
+    params = {
+        "max_depth": 8,
+        "eta": 0.01,
+        "objective": "multi:softprob",
+        "num_class": 3,
+        "tree_method": "hist",
+        "eval_metric": "merror",
+        "device": "cuda",
+    }
+    output = xgb.dask.train(
+        client, params, dtrain, num_boost_round=100, evals=[(dtrain, "train")]
+    )
+    bst = output["booster"]
+    history = output["history"]
+    for i, e in enumerate(history["train"]["merror"]):
+        print(f"[{i}] train-merror: {e}")


-if __name__ == '__main__':
-    # To use RMM pool allocator with a GPU Dask cluster, just add rmm_pool_size option to
-    # LocalCUDACluster constructor.
-    with LocalCUDACluster(rmm_pool_size='2GB') as cluster:
+if __name__ == "__main__":
+    # To use RMM pool allocator with a GPU Dask cluster, just add rmm_pool_size option
+    # to LocalCUDACluster constructor.
+    with LocalCUDACluster(rmm_pool_size="2GB") as cluster:
        with Client(cluster) as client:
            main(client)
--- a/demo/rmm_plugin/rmm_singlegpu.py
+++ b/demo/rmm_plugin/rmm_singlegpu.py
@@ -1,3 +1,7 @@
+"""
+Using rmm on a single node device
+=================================
+"""
 import rmm
 from sklearn.datasets import make_classification

@@ -16,7 +20,8 @@ params = {
    "eta": 0.01,
    "objective": "multi:softprob",
    "num_class": 3,
-    "tree_method": "gpu_hist",
+    "tree_method": "hist",
+    "device": "cuda",
 }
 # XGBoost will automatically use the RMM pool allocator
 bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, "train")])