Document for device ordinal. (#9398)
- Rewrite GPU demos. The notebook is converted to a script to avoid committing additional png plots.
- Add GPU demos into the sphinx gallery.
- Add RMM demos into the sphinx gallery.
- Test for firing threads with different device ordinals.
This commit is contained in: parent 22b0a55a04, commit 275da176ba
@@ -53,15 +53,7 @@ int main() {
  // configure the training
  // available parameters are described here:
  // https://xgboost.readthedocs.io/en/latest/parameter.html
  safe_xgboost(XGBoosterSetParam(booster, "tree_method", use_gpu ? "gpu_hist" : "hist"));
  if (use_gpu) {
    // set the GPU to use;
    // this is not necessary, but provided here as an illustration
    safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "0"));
  } else {
    // avoid evaluating objective and metric on a GPU
    safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "-1"));
  }
  safe_xgboost(XGBoosterSetParam(booster, "device", use_gpu ? "cuda" : "cpu"));

  safe_xgboost(XGBoosterSetParam(booster, "objective", "binary:logistic"));
  safe_xgboost(XGBoosterSetParam(booster, "min_child_weight", "1"));
@@ -1,5 +0,0 @@
# GPU Acceleration Demo

`cover_type.py` shows how to train a model on the [forest cover type](https://archive.ics.uci.edu/ml/datasets/covertype) dataset using GPU acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it time consuming to process. We compare the run-time and accuracy of the GPU and CPU histogram algorithms.

`shap.ipynb` demonstrates using GPU acceleration to compute SHAP values for feature importance.
demo/gpu_acceleration/README.rst | 8 (new file)
@@ -0,0 +1,8 @@
:orphan:

GPU Acceleration Demo
=====================

This is a collection of demonstration scripts to showcase the basic usage of GPU. Please
see :doc:`/gpu/index` for more info. There are other demonstrations for distributed GPU
training using dask or spark.
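A quick, hedged sketch of the basic GPU usage this README points to (not part of the commit; the synthetic data, parameter values, and round count are assumptions for illustration)::

    import numpy as np
    import xgboost as xgb

    # Synthetic stand-in data; any real dataset works the same way.
    rng = np.random.default_rng(0)
    X = rng.standard_normal((1024, 16))
    y = rng.integers(0, 2, size=1024)

    # Since 2.0, GPU training is requested with device="cuda" plus the hist tree method.
    booster = xgb.train(
        {"tree_method": "hist", "device": "cuda", "objective": "binary:logistic"},
        xgb.DMatrix(X, label=y),
        num_boost_round=10,
    )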
@@ -1,41 +1,49 @@
"""
Using xgboost on GPU devices
============================

Shows how to train a model on the `forest cover type
<https://archive.ics.uci.edu/ml/datasets/covertype>`_ dataset using GPU
acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it
time consuming to process. We compare the run-time and accuracy of the GPU and CPU
histogram algorithms.

In addition, the demo showcases using GPU with other GPU-related libraries including
cupy and cuml. These libraries are not strictly required.

"""
import time

import cupy as cp
from cuml.model_selection import train_test_split
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split

import xgboost as xgb

# Fetch dataset using sklearn
cov = fetch_covtype()
X = cov.data
y = cov.target
X, y = fetch_covtype(return_X_y=True)
X = cp.array(X)
y = cp.array(y)
y -= y.min()

# Create 0.75/0.25 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75,
                                                    random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, train_size=0.75, random_state=42
)

# Specify sufficient boosting iterations to reach a minimum
num_round = 3000

# Leave most parameters as default
param = {'objective': 'multi:softmax', # Specify multiclass classification
         'num_class': 8, # Number of possible output classes
         'tree_method': 'gpu_hist' # Use GPU accelerated algorithm
         }

# Convert input data from numpy to XGBoost format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

gpu_res = {} # Store accuracy result
tmp = time.time()
clf = xgb.XGBClassifier(device="cuda", n_estimators=num_round)
# Train model
xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=gpu_res)
print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))
start = time.time()
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
gpu_res = clf.evals_result()
print("GPU Training Time: %s seconds" % (str(time.time() - start)))

# Repeat for CPU algorithm
tmp = time.time()
param['tree_method'] = 'hist'
cpu_res = {}
xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=cpu_res)
print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))
clf = xgb.XGBClassifier(device="cpu", n_estimators=num_round)
start = time.time()
cpu_res = clf.evals_result()
print("CPU Training Time: %s seconds" % (str(time.time() - start)))
File diff suppressed because one or more lines are too long
demo/gpu_acceleration/tree_shap.py | 55 (new file)
@@ -0,0 +1,55 @@
"""
Use GPU to speed up SHAP value computation
==========================================

Demonstrates using GPU acceleration to compute SHAP values for feature importance.

"""
import shap
from sklearn.datasets import fetch_california_housing

import xgboost as xgb

# Fetch dataset using sklearn
data = fetch_california_housing()
print(data.DESCR)
X = data.data
y = data.target

num_round = 500

param = {
    "eta": 0.05,
    "max_depth": 10,
    "tree_method": "hist",
    "device": "cuda",
}

# GPU accelerated training
dtrain = xgb.DMatrix(X, label=y, feature_names=data.feature_names)
model = xgb.train(param, dtrain, num_round)

# Compute shap values using GPU with xgboost
model.set_param({"device": "cuda"})
shap_values = model.predict(dtrain, pred_contribs=True)

# Compute shap interaction values using GPU
shap_interaction_values = model.predict(dtrain, pred_interactions=True)


# shap will call the GPU accelerated version as long as the device parameter is set to
# "cuda"
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# visualize the first prediction's explanation
shap.force_plot(
    explainer.expected_value,
    shap_values[0, :],
    X[0, :],
    feature_names=data.feature_names,
    matplotlib=True,
)

# Show a summary of feature importance
shap.summary_plot(shap_values, X, plot_type="bar", feature_names=data.feature_names)
@@ -70,8 +70,7 @@ class XGBoostTrainer(Executor):
        param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
        if self._use_gpus:
            self.log_info(fl_ctx, f'Training with GPU {rank}')
            param['tree_method'] = 'gpu_hist'
            param['gpu_id'] = rank
            param['device'] = f"cuda:{rank}"

        # Specify validations set to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
@@ -1,47 +0,0 @@
Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL)
====================================================================
[RAPIDS Memory Manager (RMM)](https://github.com/rapidsai/rmm) library provides a collection of
efficient memory allocators for NVIDIA GPUs. It is now possible to use XGBoost with memory
allocators provided by RMM, by enabling the RMM integration plugin.

The demos in this directory highlights one RMM allocator in particular: **the pool sub-allocator**.
This allocator addresses the slow speed of `cudaMalloc()` by allocating a large chunk of memory
upfront. Subsequent allocations will draw from the pool of already allocated memory and thus avoid
the overhead of calling `cudaMalloc()` directly. See
[this GTC talk slides](https://on-demand.gputechconf.com/gtc/2015/presentation/S5530-Stephen-Jones.pdf)
for more details.

Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To do this,
run CMake with option `-DPLUGIN_RMM=ON` (`-DUSE_CUDA=ON` also required):
```
cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON
make -j4
```
CMake will attempt to locate the RMM library in your build environment. You may choose to build
RMM from the source, or install it using the Conda package manager. If CMake cannot find RMM, you
should specify the location of RMM with the CMake prefix:
```
# If using Conda:
cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
# If using RMM installed with a custom location
cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm
```

# Informing XGBoost about RMM pool

When XGBoost is compiled with RMM, most of the large size allocation will go through RMM
allocators, but some small allocations in performance critical areas are using a different
caching allocator so that we can have better control over memory allocation behavior.
Users can override this behavior and force the use of rmm for all allocations by setting
the global configuration ``use_rmm``:

``` python
with xgb.config_context(use_rmm=True):
    clf = xgb.XGBClassifier(tree_method="gpu_hist")
```

Depending on the choice of memory pool size or type of allocator, this may have negative
performance impact.

* [Using RMM with a single GPU](./rmm_singlegpu.py)
* [Using RMM with a local Dask cluster consisting of multiple GPUs](./rmm_mgpu_with_dask.py)
demo/rmm_plugin/README.rst | 51 (new file)
@@ -0,0 +1,51 @@
Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL)
====================================================================

`RAPIDS Memory Manager (RMM) <https://github.com/rapidsai/rmm>`__ library provides a
collection of efficient memory allocators for NVIDIA GPUs. It is now possible to use
XGBoost with memory allocators provided by RMM, by enabling the RMM integration plugin.

The demos in this directory highlight one RMM allocator in particular: **the pool
sub-allocator**. This allocator addresses the slow speed of ``cudaMalloc()`` by
allocating a large chunk of memory upfront. Subsequent allocations will draw from the pool
of already allocated memory and thus avoid the overhead of calling ``cudaMalloc()``
directly. See `this GTC talk slides
<https://on-demand.gputechconf.com/gtc/2015/presentation/S5530-Stephen-Jones.pdf>`_ for
more details.

Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To do this,
run CMake with option ``-DPLUGIN_RMM=ON`` (``-DUSE_CUDA=ON`` also required):

.. code-block:: sh

  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON
  make -j$(nproc)

CMake will attempt to locate the RMM library in your build environment. You may choose to build
RMM from the source, or install it using the Conda package manager. If CMake cannot find RMM, you
should specify the location of RMM with the CMake prefix:

.. code-block:: sh

  # If using Conda:
  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
  # If using RMM installed with a custom location
  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm

********************************
Informing XGBoost about RMM pool
********************************

When XGBoost is compiled with RMM, most of the large memory allocations will go through RMM
allocators, but some small allocations in performance critical areas are using a different
caching allocator so that we can have better control over memory allocation behavior.
Users can override this behavior and force the use of RMM for all allocations by setting
the global configuration ``use_rmm``:

.. code-block:: python

  with xgb.config_context(use_rmm=True):
      clf = xgb.XGBClassifier(tree_method="hist", device="cuda")

Depending on the choice of memory pool size or type of allocator, this may have negative
performance impact.
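To make the pool sub-allocator described above concrete, here is a hedged sketch (not part of the commit) that reserves an RMM pool before training; the pool size and synthetic data are assumptions for illustration::

    import rmm
    import xgboost as xgb
    from sklearn.datasets import make_classification

    # Reserve a memory pool up front so later allocations avoid repeated cudaMalloc calls.
    # The 2 GiB pool size is an arbitrary choice for this sketch.
    rmm.reinitialize(pool_allocator=True, initial_pool_size=2 * 1024**3)

    X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
    dtrain = xgb.DMatrix(X, label=y)

    # Route XGBoost's own allocations through RMM as well.
    with xgb.config_context(use_rmm=True):
        xgb.train(
            {
                "tree_method": "hist",
                "device": "cuda",
                "objective": "multi:softprob",
                "num_class": 3,
            },
            dtrain,
            num_boost_round=10,
        )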
@@ -1,3 +1,7 @@
"""
Using rmm with Dask
===================
"""
import dask
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
@@ -11,25 +15,33 @@ def main(client):
    # xgb.set_config(use_rmm=True)

    X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
    # In practice one should prefer loading the data with dask collections instead of using
    # `from_array`.
    # In practice one should prefer loading the data with dask collections instead of
    # using `from_array`.
    X = dask.array.from_array(X)
    y = dask.array.from_array(y)
    dtrain = xgb.dask.DaskDMatrix(client, X, label=y)

    params = {'max_depth': 8, 'eta': 0.01, 'objective': 'multi:softprob', 'num_class': 3,
              'tree_method': 'gpu_hist', 'eval_metric': 'merror'}
    output = xgb.dask.train(client, params, dtrain, num_boost_round=100,
                            evals=[(dtrain, 'train')])
    bst = output['booster']
    history = output['history']
    for i, e in enumerate(history['train']['merror']):
        print(f'[{i}] train-merror: {e}')
    params = {
        "max_depth": 8,
        "eta": 0.01,
        "objective": "multi:softprob",
        "num_class": 3,
        "tree_method": "hist",
        "eval_metric": "merror",
        "device": "cuda",
    }
    output = xgb.dask.train(
        client, params, dtrain, num_boost_round=100, evals=[(dtrain, "train")]
    )
    bst = output["booster"]
    history = output["history"]
    for i, e in enumerate(history["train"]["merror"]):
        print(f"[{i}] train-merror: {e}")


if __name__ == '__main__':
    # To use RMM pool allocator with a GPU Dask cluster, just add rmm_pool_size option to
    # LocalCUDACluster constructor.
    with LocalCUDACluster(rmm_pool_size='2GB') as cluster:
if __name__ == "__main__":
    # To use RMM pool allocator with a GPU Dask cluster, just add rmm_pool_size option
    # to LocalCUDACluster constructor.
    with LocalCUDACluster(rmm_pool_size="2GB") as cluster:
        with Client(cluster) as client:
            main(client)
@@ -1,3 +1,7 @@
"""
Using rmm on a single node device
=================================
"""
import rmm
from sklearn.datasets import make_classification

@@ -16,7 +20,8 @@ params = {
    "eta": 0.01,
    "objective": "multi:softprob",
    "num_class": 3,
    "tree_method": "gpu_hist",
    "tree_method": "hist",
    "device": "cuda",
}
# XGBoost will automatically use the RMM pool allocator
bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, "train")])
doc/.gitignore (vendored) | 2
@@ -6,3 +6,5 @@ doxygen
parser.py
*.pyc
web-data
# generated by doxygen
tmp
doc/conf.py | 11
@@ -19,7 +19,6 @@ import sys
import tarfile
import urllib.request
import warnings
from subprocess import call
from urllib.error import HTTPError

from sh.contrib import git
@@ -148,12 +147,20 @@ extensions = [

sphinx_gallery_conf = {
    # path to your example scripts
    "examples_dirs": ["../demo/guide-python", "../demo/dask", "../demo/aft_survival"],
    "examples_dirs": [
        "../demo/guide-python",
        "../demo/dask",
        "../demo/aft_survival",
        "../demo/gpu_acceleration",
        "../demo/rmm_plugin"
    ],
    # path to where to save gallery generated output
    "gallery_dirs": [
        "python/examples",
        "python/dask-examples",
        "python/survival-examples",
        "python/gpu-examples",
        "python/rmm-examples",
    ],
    "matplotlib_animations": True,
}
@@ -23,20 +23,19 @@ The GPU algorithms currently work with CLI, Python, R, and JVM packages. See :do
  :caption: Python example

  params = dict()
  params["device"] = "cuda:0"
  params["device"] = "cuda"
  params["tree_method"] = "hist"
  Xy = xgboost.QuantileDMatrix(X, y)
  xgboost.train(params, Xy)

.. code-block:: python
  :caption: With Scikit-Learn interface
  :caption: With the Scikit-Learn interface

  XGBRegressor(tree_method="hist", device="cuda")


GPU-Accelerated SHAP values
=============================
XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as a backend for computing shap values when the GPU predictor is selected.
XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as a backend for computing shap values when the GPU is used.

.. code-block:: python

@@ -44,12 +43,12 @@ XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as
  shap_values = booster.predict(dtrain, pred_contribs=True)
  shap_interaction_values = model.predict(dtrain, pred_interactions=True)

See examples `here <https://github.com/dmlc/xgboost/tree/master/demo/gpu_acceleration>`__.
See :ref:`sphx_glr_python_gpu-examples_tree_shap.py` for a worked example.

Multi-node Multi-GPU Training
=============================

XGBoost supports fully distributed GPU training using `Dask <https://dask.org/>`_, ``Spark`` and ``PySpark``. For getting started with Dask see our tutorial :doc:`/tutorials/dask` and worked examples `here <https://github.com/dmlc/xgboost/tree/master/demo/dask>`__, also Python documentation :ref:`dask_api` for complete reference. For usage with ``Spark`` using Scala see :doc:`/jvm/xgboost4j_spark_gpu_tutorial`. Lastly for distributed GPU training with ``PySpark``, see :doc:`/tutorials/spark_estimator`.
XGBoost supports fully distributed GPU training using `Dask <https://dask.org/>`_, ``Spark`` and ``PySpark``. For getting started with Dask see our tutorial :doc:`/tutorials/dask` and worked examples :doc:`/python/dask-examples/index`, also Python documentation :ref:`dask_api` for complete reference. For usage with ``Spark`` using Scala see :doc:`/jvm/xgboost4j_spark_gpu_tutorial`. Lastly for distributed GPU training with ``PySpark``, see :doc:`/tutorials/spark_estimator`.


Memory usage
@@ -67,7 +66,8 @@ If you are getting out-of-memory errors on a big dataset, try the or :py:class:`

CPU-GPU Interoperability
========================
XGBoost models trained on GPUs can be used on CPU-only systems to generate predictions. For information about how to save and load an XGBoost model, see :doc:`/tutorials/saving_model`.

The model can be used on any device regardless of the one used to train it. For instance, a model trained using GPU can still work on a CPU-only machine and vice versa. For more information about model serialization, see :doc:`/tutorials/saving_model`.


Developer notes
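A hedged sketch of the CPU-GPU interoperability described above (not part of the commit): train with ``device="cuda"``, serialize, then load and predict with ``device="cpu"``; the data and the ``model.json`` file name are assumptions for illustration::

    import numpy as np
    import xgboost as xgb

    X = np.random.default_rng(0).standard_normal((256, 8))
    y = (X[:, 0] > 0).astype(np.float32)

    # Train on the GPU and serialize the model (file name is arbitrary).
    booster = xgb.train(
        {"tree_method": "hist", "device": "cuda", "objective": "binary:logistic"},
        xgb.DMatrix(X, label=y),
        num_boost_round=10,
    )
    booster.save_model("model.json")

    # Later, possibly on a CPU-only machine: load the model and predict on the CPU.
    cpu_booster = xgb.Booster(model_file="model.json")
    cpu_booster.set_param({"device": "cpu"})
    predictions = cpu_booster.predict(xgb.DMatrix(X))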
@@ -189,7 +189,7 @@ This will check out the latest stable version from the Maven Central.

For the latest release version number, please check `release page <https://github.com/dmlc/xgboost/releases>`_.

To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
To enable the GPU algorithm (``device='cuda'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).


.. note:: Windows not supported in the JVM package
@@ -325,4 +325,4 @@ The SNAPSHOT JARs are hosted by the XGBoost project. Every commit in the ``maste

You can browse the file listing of the Maven repository at https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html.

To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
To enable the GPU algorithm (``device='cuda'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
@@ -34,27 +34,6 @@ General Parameters

  - Which booster to use. Can be ``gbtree``, ``gblinear`` or ``dart``; ``gbtree`` and ``dart`` use tree based models while ``gblinear`` uses linear functions.

* ``verbosity`` [default=1]

  - Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3
    (debug). Sometimes XGBoost tries to change configurations based on heuristics, which
    is displayed as warning message. If there's unexpected behaviour, please try to
    increase value of verbosity.

* ``validate_parameters`` [default to ``false``, except for Python, R and CLI interface]

  - When set to True, XGBoost will perform validation of input parameters to check whether
    a parameter is used or not.

* ``nthread`` [default to maximum number of threads available if not set]

  - Number of parallel threads used to run XGBoost. When choosing it, please keep thread
    contention and hyperthreading in mind.

* ``disable_default_eval_metric`` [default= ``false``]

  - Flag to disable default metric. Set to 1 or ``true`` to disable.

* ``device`` [default= ``cpu``]

  .. versionadded:: 2.0.0
@@ -67,6 +46,29 @@ General Parameters
  + ``gpu``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently.
  + ``gpu:<ordinal>``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently.

  For more information about GPU acceleration, see :doc:`/gpu/index`.

* ``verbosity`` [default=1]

  - Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3
    (debug). Sometimes XGBoost tries to change configurations based on heuristics, which
    is displayed as warning message. If there's unexpected behaviour, please try to
    increase value of verbosity.

* ``validate_parameters`` [default to ``false``, except for Python, R and CLI interface]

  - When set to True, XGBoost will perform validation of input parameters to check whether
    a parameter is used or not. A warning is emitted when there's an unknown parameter.

* ``nthread`` [default to maximum number of threads available if not set]

  - Number of parallel threads used to run XGBoost. When choosing it, please keep thread
    contention and hyperthreading in mind.

* ``disable_default_eval_metric`` [default= ``false``]

  - Flag to disable default metric. Set to 1 or ``true`` to disable.

Parameters for Tree Booster
===========================
* ``eta`` [default=0.3, alias: ``learning_rate``]
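A hedged sketch of the ``device`` values documented in the hunk above (not part of the commit); ``cuda:1`` assumes a machine with at least two GPUs::

    import xgboost as xgb

    # "cuda" picks the default GPU ordinal; "cuda:1" requests an explicit ordinal.
    reg_cpu = xgb.XGBRegressor(tree_method="hist", device="cpu")
    reg_gpu = xgb.XGBRegressor(tree_method="hist", device="cuda")
    reg_gpu_1 = xgb.XGBRegressor(tree_method="hist", device="cuda:1")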
@@ -160,7 +162,7 @@ Parameters for Tree Booster
  - ``grow_colmaker``: non-distributed column-based construction of trees.
  - ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting.
  - ``grow_quantile_histmaker``: Grow tree using quantized histogram.
  - ``grow_gpu_hist``: Grow tree with GPU. Same as setting ``tree_method`` to ``hist`` and use ``device=cuda``.
  - ``grow_gpu_hist``: Grow tree with GPU. Enabled when ``tree_method`` is set to ``hist`` along with ``device=cuda``.
  - ``sync``: synchronizes trees in all distributed nodes.
  - ``refresh``: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed.
  - ``prune``: prunes the splits where loss < min_split_loss (or gamma) and nodes that have depth greater than ``max_depth``.
doc/python/.gitignore (vendored) | 4
@@ -1,3 +1,5 @@
examples
dask-examples
survival-examples
survival-examples
gpu-examples
rmm-examples
@@ -17,3 +17,5 @@ Contents
  examples/index
  dask-examples/index
  survival-examples/index
  gpu-examples/index
  rmm-examples/index

@@ -124,7 +124,7 @@ Following table summarizes some differences in supported features between 4 tree
`T` means supported while `F` means unsupported.

+------------------+-----------+---------------------+---------------------+------------------------+
|                  | Exact     | Approx              | Hist                | GPU Hist               |
|                  | Exact     | Approx              | Hist                | Hist (GPU)             |
+==================+===========+=====================+=====================+========================+
| grow_policy      | Depthwise | depthwise/lossguide | depthwise/lossguide | depthwise/lossguide    |
+------------------+-----------+---------------------+---------------------+------------------------+
@@ -141,5 +141,5 @@ Following table summarizes some differences in supported features between 4 tree

Features/parameters that are not mentioned here are universally supported for all 4 tree
methods (for instance, column sampling and constraints). The `P` in external memory means
partially supported. Please note that both categorical data and external memory are
special handling. Please note that both categorical data and external memory are
experimental.

@@ -35,8 +35,8 @@ parameter ``enable_categorical``:

.. code:: python

  # Supported tree methods are `gpu_hist`, `approx`, and `hist`.
  clf = xgb.XGBClassifier(tree_method="gpu_hist", enable_categorical=True)
  # Supported tree methods are `approx` and `hist`.
  clf = xgb.XGBClassifier(tree_method="hist", enable_categorical=True, device="cuda")
  # X is the dataframe we created in previous snippet
  clf.fit(X, y)
  # Must use JSON/UBJSON for serialization, otherwise the information is lost.
@@ -81,7 +81,7 @@ constructor.
  it = Iterator(["file_0.svm", "file_1.svm", "file_2.svm"])
  Xy = xgboost.DMatrix(it)

  # Other tree methods including ``hist`` and ``gpu_hist`` also work, but has some caveats
  # The ``approx`` also work, but with low performance. GPU implementation is different from CPU.
  # as noted in following sections.
  booster = xgboost.train({"tree_method": "hist"}, Xy)

@@ -118,15 +118,15 @@ to reduce the overhead of file reading.
GPU Version (GPU Hist tree method)
**********************************

External memory is supported by GPU algorithms (i.e. when ``tree_method`` is set to
``gpu_hist``). However, the algorithm used for GPU is different from the one used for
External memory is supported by GPU algorithms (i.e. when ``device`` is set to
``cuda``). However, the algorithm used for GPU is different from the one used for
CPU. When training on a CPU, the tree method iterates through all batches from external
memory for each step of the tree construction algorithm. On the other hand, the GPU
algorithm uses a hybrid approach. It iterates through the data during the beginning of
each iteration and concatenates all batches into one in GPU memory. To reduce overall
memory usage, users can utilize subsampling. The GPU hist tree method supports
`gradient-based sampling`, enabling users to set a low sampling rate without compromising
accuracy.
each iteration and concatenates all batches into one in GPU memory for performance
reasons. To reduce overall memory usage, users can utilize subsampling. The GPU hist tree
method supports `gradient-based sampling`, enabling users to set a low sampling rate
without compromising accuracy.

.. code-block:: python

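A hedged sketch of the gradient-based sampling mentioned above (not part of the commit); the in-memory data stands in for an external-memory ``DMatrix``, and the ``subsample`` value is an arbitrary illustration::

    import numpy as np
    import xgboost as xgb

    # In-memory stand-in data; with external memory this DMatrix would be built
    # from a data iterator instead.
    rng = np.random.default_rng(0)
    X = rng.standard_normal((4096, 32))
    y = rng.standard_normal(4096)
    Xy = xgb.DMatrix(X, label=y)

    params = {
        "tree_method": "hist",
        "device": "cuda",
        "objective": "reg:squarederror",
        # Gradient-based sampling lets a low subsample rate work without a large
        # accuracy loss; 0.2 is an arbitrary illustrative value.
        "sampling_method": "gradient_based",
        "subsample": 0.2,
    }
    booster = xgb.train(params, Xy, num_boost_round=10)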
@@ -83,13 +83,14 @@ Some other examples:
- ``(0,-1)``: No constraint on the first predictor and a decreasing constraint on the second.


**Note for the 'hist' tree construction algorithm**.
If ``tree_method`` is set to either ``hist``, ``approx`` or ``gpu_hist``, enabling
monotonic constraints may produce unnecessarily shallow trees. This is because the
``hist`` method reduces the number of candidate splits to be considered at each
split. Monotonic constraints may wipe out all available split candidates, in which case no
split is made. To reduce the effect, you may want to increase the ``max_bin`` parameter to
consider more split candidates.
.. note::

  **Note for the 'hist' tree construction algorithm**. If ``tree_method`` is set to
  either ``hist`` or ``approx``, enabling monotonic constraints may produce unnecessarily
  shallow trees. This is because the ``hist`` method reduces the number of candidate
  splits to be considered at each split. Monotonic constraints may wipe out all available
  split candidates, in which case no split is made. To reduce the effect, you may want to
  increase the ``max_bin`` parameter to consider more split candidates.


*******************
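A hedged sketch combining monotonic constraints with a larger ``max_bin`` as suggested above (not part of the commit); the synthetic data and the value 512 are assumptions for illustration::

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = rng.uniform(size=(2048, 2))
    y = 2.0 * X[:, 0] - 1.0 * X[:, 1] + rng.normal(scale=0.1, size=2048)

    # Increasing constraint on feature 0, decreasing on feature 1.  A larger max_bin
    # keeps more candidate splits available once the constraints prune some of them;
    # 512 is an arbitrary illustrative value.
    reg = xgb.XGBRegressor(
        tree_method="hist",
        monotone_constraints="(1,-1)",
        max_bin=512,
    )
    reg.fit(X, y)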
@@ -38,10 +38,6 @@ There are in general two ways that you can control overfitting in XGBoost:

- This includes ``subsample`` and ``colsample_bytree``.
- You can also reduce stepsize ``eta``. Remember to increase ``num_round`` when you do so.

***************************
Faster training performance
***************************
There's a parameter called ``tree_method``, set it to ``hist`` or ``gpu_hist`` for faster computation.

*************************
Handle Imbalanced Dataset
@@ -50,13 +50,14 @@ Here is a sample parameter dictionary for training a random forest on a GPU usin
xgboost::

  params = {
    'colsample_bynode': 0.8,
    'learning_rate': 1,
    'max_depth': 5,
    'num_parallel_tree': 100,
    'objective': 'binary:logistic',
    'subsample': 0.8,
    'tree_method': 'gpu_hist'
    "colsample_bynode": 0.8,
    "learning_rate": 1,
    "max_depth": 5,
    "num_parallel_tree": 100,
    "objective": "binary:logistic",
    "subsample": 0.8,
    "tree_method": "hist",
    "device": "cuda",
  }

A random forest model can then be trained as follows::

@@ -174,7 +174,7 @@ Will print out something similar to (not actual output as it's too long for demo

  "gbtree_train_param": {
    "num_parallel_tree": "1",
    "process_type": "default",
    "tree_method": "gpu_hist",
    "tree_method": "hist",
    "updater": "grow_gpu_hist",
    "updater_seq": "grow_gpu_hist"
  },
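A hedged sketch showing where a ``gbtree_train_param`` snippet like the one above comes from (not part of the commit): ``Booster.save_config()`` returns the full internal configuration as JSON; the throwaway data is an assumption for illustration::

    import json

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = rng.standard_normal((128, 4))
    y = rng.standard_normal(128)
    booster = xgb.train(
        {"tree_method": "hist", "device": "cuda"},
        xgb.DMatrix(X, label=y),
        num_boost_round=5,
    )

    # save_config returns the configuration as a JSON string; a "gbtree_train_param"
    # section appears inside the parsed structure.
    config = json.loads(booster.save_config())
    print(json.dumps(config, indent=2))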
@@ -278,9 +278,15 @@ __model_doc = f"""
        without bias.

    device : Optional[str]
        Device ordinal.

        .. versionadded:: 2.0.0

        Device ordinal, available options are `cpu`, `cuda`, and `gpu`.

    validate_parameters : Optional[bool]

        Give warnings for unknown parameter.

    enable_categorical : bool

        .. versionadded:: 1.5.0
@@ -144,8 +144,13 @@ class SparkXGBRegressor(_SparkXGBEstimator):
        .. deprecated:: 2.0.0

            Use `device` instead.

    device:

        .. versionadded:: 2.0.0

        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.

    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
@@ -319,8 +324,13 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
        .. deprecated:: 2.0.0

            Use `device` instead.

    device:

        .. versionadded:: 2.0.0

        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.

    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
@@ -497,8 +507,13 @@ class SparkXGBRanker(_SparkXGBEstimator):
        .. deprecated:: 2.0.0

            Use `device` instead.

    device:

        .. versionadded:: 2.0.0

        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.

    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
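A hedged sketch of the new ``device`` argument on the Spark estimators (not part of the commit); the SparkSession setup, column names, and two-row dataset are assumptions for illustration::

    from pyspark.ml.linalg import Vectors
    from pyspark.sql import SparkSession
    from xgboost.spark import SparkXGBRegressor

    spark = SparkSession.builder.getOrCreate()

    # Tiny assumed dataset with a vector feature column and a label column.
    df = spark.createDataFrame(
        [(Vectors.dense(1.0, 2.0), 1.0), (Vectors.dense(3.0, 4.0), 0.0)],
        ["features", "label"],
    )

    # device="cuda" asks each XGBoost worker to train on its assigned GPU.
    regressor = SparkXGBRegressor(features_col="features", label_col="label", device="cuda")
    model = regressor.fit(df)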
@@ -724,11 +724,15 @@ void MetaInfo::SynchronizeNumberOfColumns() {
namespace {
template <typename T>
void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) {
  CHECK(v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device)
      << "Data is resided on a different device than `gpu_id`. "
      << "Device that data is on: " << v.DeviceIdx() << ", "
      << "`gpu_id` for XGBoost: " << device;
  bool valid =
      v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device;
  if (!valid) {
    LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
                  "the booster. The device ordinal of the data is: "
               << v.DeviceIdx() << "; the device ordinal of the Booster is: " << device;
  }
}

template <typename T, std::int32_t D>
void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) {
  CheckDevice(device, *v.Data());
@@ -42,22 +42,22 @@ DMLC_REGISTRY_FILE_TAG(gbtree);

namespace {
/** @brief Map the `tree_method` parameter to the `updater` parameter. */
std::string MapTreeMethodToUpdaters(Context const* ctx_, TreeMethod tree_method) {
std::string MapTreeMethodToUpdaters(Context const* ctx, TreeMethod tree_method) {
  // Choose updaters according to tree_method parameters
  if (ctx->IsCUDA()) {
    common::AssertGPUSupport();
  }
  switch (tree_method) {
    case TreeMethod::kAuto:  // Use hist as default in 2.0
    case TreeMethod::kHist: {
      return ctx_->DispatchDevice([] { return "grow_quantile_histmaker"; },
                                  [] {
                                    common::AssertGPUSupport();
                                    return "grow_gpu_hist";
                                  });
      return ctx->DispatchDevice([] { return "grow_quantile_histmaker"; },
                                 [] { return "grow_gpu_hist"; });
    }
    case TreeMethod::kApprox:
      CHECK(ctx_->IsCPU()) << "The `approx` tree method is not supported on GPU.";
      CHECK(ctx->IsCPU()) << "The `approx` tree method is not supported on GPU.";
      return "grow_histmaker";
    case TreeMethod::kExact:
      CHECK(ctx_->IsCPU()) << "The `exact` tree method is not supported on GPU.";
      CHECK(ctx->IsCPU()) << "The `exact` tree method is not supported on GPU.";
      return "grow_colmaker,prune";
    case TreeMethod::kGPUHist: {
      common::AssertGPUSupport();
@@ -150,6 +150,7 @@ void GBTree::Configure(Args const& cfg) {
    CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
        << "Only the hist tree method is supported for building multi-target trees with vector "
           "leaf.";
    CHECK(ctx_->IsCPU()) << "GPU is not yet supported for vector leaf.";
  }

  LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
@@ -29,10 +29,12 @@ class LintersPaths:
        "tests/python-gpu/load_pickle.py",
        "tests/python-gpu/test_gpu_pickling.py",
        "tests/python-gpu/test_gpu_eval_metrics.py",
        "tests/python-gpu/test_gpu_with_sklearn.py",
        "tests/test_distributed/test_with_spark/",
        "tests/test_distributed/test_gpu_with_spark/",
        # demo
        "demo/dask/",
        "demo/rmm_plugin",
        "demo/json-model/json_parser.py",
        "demo/guide-python/cat_in_the_dat.py",
        "demo/guide-python/categorical.py",
@@ -234,7 +234,7 @@ Arrow specification.'''
    cp.cuda.runtime.setDevice(0)
    dtrain = dmatrix_from_cupy(np.float32, xgb.QuantileDMatrix, np.nan)
    with pytest.raises(
        xgb.core.XGBoostError, match="Data is resided on a different device"
        xgb.core.XGBoostError, match="Invalid device ordinal"
    ):
        xgb.train(
            {'tree_method': 'gpu_hist', 'gpu_id': 1}, dtrain, num_boost_round=10
@@ -2,6 +2,7 @@ import json
import os
import sys
import tempfile
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pytest
@@ -23,18 +24,19 @@ def test_gpu_binary_classification():
    from sklearn.model_selection import KFold

    digits = load_digits(n_class=2)
    y = digits['target']
    X = digits['data']
    y = digits["target"]
    X = digits["data"]
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for cls in (xgb.XGBClassifier, xgb.XGBRFClassifier):
        for train_index, test_index in kf.split(X, y):
            xgb_model = cls(
                random_state=42, tree_method='gpu_hist',
                n_estimators=4, gpu_id='0').fit(X[train_index], y[train_index])
                random_state=42, tree_method="gpu_hist", n_estimators=4, gpu_id="0"
            ).fit(X[train_index], y[train_index])
            preds = xgb_model.predict(X[test_index])
            labels = y[test_index]
            err = sum(1 for i in range(len(preds))
                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
            err = sum(
                1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]
            ) / float(len(preds))
            assert err < 0.1


@@ -133,7 +135,7 @@ def test_classififer():
    X, y = load_digits(return_X_y=True)
    y *= 10

    clf = xgb.XGBClassifier(tree_method="gpu_hist", n_estimators=1)
    clf = xgb.XGBClassifier(tree_method="hist", n_estimators=1, device="cuda")

    # numpy
    with pytest.raises(ValueError, match=r"Invalid classes.*"):
@@ -161,3 +163,46 @@ def test_ranking_qid_df():
    import cudf

    run_ranking_qid_df(cudf, "gpu_hist")


@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.mgpu
def test_device_ordinal() -> None:
    import cupy as cp

    n_devices = 2

    def worker(ordinal: int, correct_ordinal: bool) -> None:
        if correct_ordinal:
            cp.cuda.runtime.setDevice(ordinal)
        else:
            cp.cuda.runtime.setDevice((ordinal + 1) % n_devices)

        X, y, w = tm.make_regression(4096, 12, use_cupy=True)
        reg = xgb.XGBRegressor(device=f"cuda:{ordinal}", tree_method="hist")

        if correct_ordinal:
            reg.fit(
                X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w]
            )
            assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])
            return

        with pytest.raises(ValueError, match="Invalid device ordinal"):
            reg.fit(
                X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w]
            )

    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        futures = []
        n_trials = 32
        for i in range(n_trials):
            fut = executor.submit(
                worker, ordinal=i % n_devices, correct_ordinal=i % 3 != 0
            )
            futures.append(fut)

        for fut in futures:
            fut.result()

    cp.cuda.runtime.setDevice(0)