Document for device ordinal. (#9398)
- Rewrite GPU demos. The notebook is converted to a script to avoid committing additional PNG plots.
- Add GPU demos into the sphinx gallery.
- Add RMM demos into the sphinx gallery.
- Test for firing threads with different device ordinals.
This commit is contained in:
parent 22b0a55a04
commit 275da176ba
@@ -53,15 +53,7 @@ int main() {
   // configure the training
   // available parameters are described here:
   // https://xgboost.readthedocs.io/en/latest/parameter.html
-  safe_xgboost(XGBoosterSetParam(booster, "tree_method", use_gpu ? "gpu_hist" : "hist"));
-  if (use_gpu) {
-    // set the GPU to use;
-    // this is not necessary, but provided here as an illustration
-    safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "0"));
-  } else {
-    // avoid evaluating objective and metric on a GPU
-    safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "-1"));
-  }
+  safe_xgboost(XGBoosterSetParam(booster, "device", use_gpu ? "cuda" : "cpu"));

  safe_xgboost(XGBoosterSetParam(booster, "objective", "binary:logistic"));
  safe_xgboost(XGBoosterSetParam(booster, "min_child_weight", "1"));
@@ -1,5 +0,0 @@
-# GPU Acceleration Demo
-
-`cover_type.py` shows how to train a model on the [forest cover type](https://archive.ics.uci.edu/ml/datasets/covertype) dataset using GPU acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it time consuming to process. We compare the run-time and accuracy of the GPU and CPU histogram algorithms.
-
-`shap.ipynb` demonstrates using GPU acceleration to compute SHAP values for feature importance.
8 demo/gpu_acceleration/README.rst (new file)
@@ -0,0 +1,8 @@
+:orphan:
+
+GPU Acceleration Demo
+=====================
+
+This is a collection of demonstration scripts showcasing the basic usage of GPU
+acceleration. Please see :doc:`/gpu/index` for more info. There are other demonstrations
+for distributed GPU training using Dask or Spark.
@@ -1,41 +1,50 @@
+"""
+Using xgboost on GPU devices
+============================
+
+Shows how to train a model on the `forest cover type
+<https://archive.ics.uci.edu/ml/datasets/covertype>`_ dataset using GPU
+acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it
+time consuming to process. We compare the run-time and accuracy of the GPU and CPU
+histogram algorithms.
+
+In addition, the demo showcases using GPU with other GPU-related libraries including
+cupy and cuml. These libraries are not strictly required.
+
+"""
 import time

+import cupy as cp
+from cuml.model_selection import train_test_split
 from sklearn.datasets import fetch_covtype
-from sklearn.model_selection import train_test_split

 import xgboost as xgb

 # Fetch dataset using sklearn
-cov = fetch_covtype()
-X = cov.data
-y = cov.target
+X, y = fetch_covtype(return_X_y=True)
+X = cp.array(X)
+y = cp.array(y)
+y -= y.min()

 # Create 0.75/0.25 train/test split
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75,
-                                                    random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.25, train_size=0.75, random_state=42
+)

 # Specify sufficient boosting iterations to reach a minimum
 num_round = 3000

 # Leave most parameters as default
-param = {'objective': 'multi:softmax',  # Specify multiclass classification
-         'num_class': 8,                # Number of possible output classes
-         'tree_method': 'gpu_hist'      # Use GPU accelerated algorithm
-         }
-
-# Convert input data from numpy to XGBoost format
-dtrain = xgb.DMatrix(X_train, label=y_train)
-dtest = xgb.DMatrix(X_test, label=y_test)
-
-gpu_res = {}  # Store accuracy result
-tmp = time.time()
+clf = xgb.XGBClassifier(device="cuda", n_estimators=num_round)
 # Train model
-xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=gpu_res)
-print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))
+start = time.time()
+clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+gpu_res = clf.evals_result()
+print("GPU Training Time: %s seconds" % (str(time.time() - start)))

 # Repeat for CPU algorithm
-tmp = time.time()
-param['tree_method'] = 'hist'
-cpu_res = {}
-xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=cpu_res)
-print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))
+clf = xgb.XGBClassifier(device="cpu", n_estimators=num_round)
+start = time.time()
+clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+cpu_res = clf.evals_result()
+print("CPU Training Time: %s seconds" % (str(time.time() - start)))
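A small follow-up one might append to the rewritten script (hypothetical, not part of the committed demo; the metric key assumes the default multi-class metric recorded by the sklearn wrapper under ``validation_0``):

```python
# Hypothetical follow-up to the script above; "mlogloss" is assumed to be the
# default metric logged for this multi-class problem.
print("GPU final mlogloss:", gpu_res["validation_0"]["mlogloss"][-1])
print("CPU final mlogloss:", cpu_res["validation_0"]["mlogloss"][-1])
```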
File diff suppressed because one or more lines are too long
55 demo/gpu_acceleration/tree_shap.py (new file)
@@ -0,0 +1,55 @@
+"""
+Use GPU to speedup SHAP value computation
+=========================================
+
+Demonstrates using GPU acceleration to compute SHAP values for feature importance.
+
+"""
+import shap
+from sklearn.datasets import fetch_california_housing
+
+import xgboost as xgb
+
+# Fetch dataset using sklearn
+data = fetch_california_housing()
+print(data.DESCR)
+X = data.data
+y = data.target
+
+num_round = 500
+
+param = {
+    "eta": 0.05,
+    "max_depth": 10,
+    "tree_method": "hist",
+    "device": "cuda",
+}
+
+# GPU accelerated training
+dtrain = xgb.DMatrix(X, label=y, feature_names=data.feature_names)
+model = xgb.train(param, dtrain, num_round)
+
+# Compute shap values using GPU with xgboost
+model.set_param({"device": "cuda"})
+shap_values = model.predict(dtrain, pred_contribs=True)
+
+# Compute shap interaction values using GPU
+shap_interaction_values = model.predict(dtrain, pred_interactions=True)
+
+
+# shap will call the GPU accelerated version as long as the device parameter is set to
+# "cuda"
+explainer = shap.TreeExplainer(model)
+shap_values = explainer.shap_values(X)
+
+# visualize the first prediction's explanation
+shap.force_plot(
+    explainer.expected_value,
+    shap_values[0, :],
+    X[0, :],
+    feature_names=data.feature_names,
+    matplotlib=True,
+)
+
+# Show a summary of feature importance
+shap.summary_plot(shap_values, X, plot_type="bar", feature_names=data.feature_names)
@@ -70,8 +70,7 @@ class XGBoostTrainer(Executor):
        param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
        if self._use_gpus:
            self.log_info(fl_ctx, f'Training with GPU {rank}')
-            param['tree_method'] = 'gpu_hist'
-            param['gpu_id'] = rank
+            param['device'] = f"cuda:{rank}"

        # Specify validations set to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
@@ -1,47 +0,0 @@
-Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL)
-====================================================================
-[RAPIDS Memory Manager (RMM)](https://github.com/rapidsai/rmm) library provides a collection of
-efficient memory allocators for NVIDIA GPUs. It is now possible to use XGBoost with memory
-allocators provided by RMM, by enabling the RMM integration plugin.
-
-The demos in this directory highlights one RMM allocator in particular: **the pool sub-allocator**.
-This allocator addresses the slow speed of `cudaMalloc()` by allocating a large chunk of memory
-upfront. Subsequent allocations will draw from the pool of already allocated memory and thus avoid
-the overhead of calling `cudaMalloc()` directly. See
-[this GTC talk slides](https://on-demand.gputechconf.com/gtc/2015/presentation/S5530-Stephen-Jones.pdf)
-for more details.
-
-Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To do this,
-run CMake with option `-DPLUGIN_RMM=ON` (`-DUSE_CUDA=ON` also required):
-```
-cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON
-make -j4
-```
-CMake will attempt to locate the RMM library in your build environment. You may choose to build
-RMM from the source, or install it using the Conda package manager. If CMake cannot find RMM, you
-should specify the location of RMM with the CMake prefix:
-```
-# If using Conda:
-cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
-# If using RMM installed with a custom location
-cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm
-```
-
-# Informing XGBoost about RMM pool
-
-When XGBoost is compiled with RMM, most of the large size allocation will go through RMM
-allocators, but some small allocations in performance critical areas are using a different
-caching allocator so that we can have better control over memory allocation behavior.
-Users can override this behavior and force the use of rmm for all allocations by setting
-the global configuration ``use_rmm``:
-
-``` python
-with xgb.config_context(use_rmm=True):
-    clf = xgb.XGBClassifier(tree_method="gpu_hist")
-```
-
-Depending on the choice of memory pool size or type of allocator, this may have negative
-performance impact.
-
-* [Using RMM with a single GPU](./rmm_singlegpu.py)
-* [Using RMM with a local Dask cluster consisting of multiple GPUs](./rmm_mgpu_with_dask.py)
51 demo/rmm_plugin/README.rst (new file)
@@ -0,0 +1,51 @@
+Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL)
+====================================================================
+
+The `RAPIDS Memory Manager (RMM) <https://github.com/rapidsai/rmm>`__ library provides a
+collection of efficient memory allocators for NVIDIA GPUs. It is now possible to use
+XGBoost with memory allocators provided by RMM, by enabling the RMM integration plugin.
+
+The demos in this directory highlight one RMM allocator in particular: **the pool
+sub-allocator**. This allocator addresses the slow speed of ``cudaMalloc()`` by
+allocating a large chunk of memory upfront. Subsequent allocations will draw from the pool
+of already allocated memory and thus avoid the overhead of calling ``cudaMalloc()``
+directly. See `these GTC talk slides
+<https://on-demand.gputechconf.com/gtc/2015/presentation/S5530-Stephen-Jones.pdf>`_ for
+more details.
+
+Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To
+do this, run CMake with the option ``-DPLUGIN_RMM=ON`` (``-DUSE_CUDA=ON`` is also required):
+
+.. code-block:: sh
+
+  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON
+  make -j$(nproc)
+
+CMake will attempt to locate the RMM library in your build environment. You may choose to
+build RMM from source, or install it using the Conda package manager. If CMake cannot find
+RMM, you should specify its location with the CMake prefix:
+
+.. code-block:: sh
+
+  # If using Conda:
+  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
+  # If using RMM installed at a custom location
+  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm
+
+********************************
+Informing XGBoost about RMM pool
+********************************
+
+When XGBoost is compiled with RMM, most of the large allocations will go through the RMM
+allocators, but some small allocations in performance-critical areas use a different
+caching allocator so that we can have better control over memory allocation behavior.
+Users can override this behavior and force the use of RMM for all allocations by setting
+the global configuration ``use_rmm``:
+
+.. code-block:: python
+
+  with xgb.config_context(use_rmm=True):
+      clf = xgb.XGBClassifier(tree_method="hist", device="cuda")
+
+Depending on the choice of memory pool size or type of allocator, this may have a negative
+performance impact.
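To make the pool behavior described above concrete, here is a minimal single-GPU sketch. It is not one of the committed demo files; it assumes an RMM-enabled build of XGBoost and the `rmm` Python package (``rmm.reinitialize`` is RMM's own API for creating a pool allocator):

```python
import rmm
from sklearn.datasets import make_classification

import xgboost as xgb

# Serve device allocations from a pre-allocated 1 GiB pool instead of raw cudaMalloc().
rmm.reinitialize(pool_allocator=True, initial_pool_size=2**30)

X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
dtrain = xgb.DMatrix(X, label=y)

# Route XGBoost's own allocations through RMM as well.
with xgb.config_context(use_rmm=True):
    params = {
        "objective": "multi:softprob",
        "num_class": 3,
        "tree_method": "hist",
        "device": "cuda",
    }
    bst = xgb.train(params, dtrain, num_boost_round=10)
```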
@@ -1,3 +1,7 @@
+"""
+Using rmm with Dask
+===================
+"""
 import dask
 from dask.distributed import Client
 from dask_cuda import LocalCUDACluster
@@ -11,25 +15,33 @@ def main(client):
     # xgb.set_config(use_rmm=True)

     X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
-    # In pratice one should prefer loading the data with dask collections instead of using
-    # `from_array`.
+    # In practice one should prefer loading the data with dask collections instead of
+    # using `from_array`.
     X = dask.array.from_array(X)
     y = dask.array.from_array(y)
     dtrain = xgb.dask.DaskDMatrix(client, X, label=y)

-    params = {'max_depth': 8, 'eta': 0.01, 'objective': 'multi:softprob', 'num_class': 3,
-              'tree_method': 'gpu_hist', 'eval_metric': 'merror'}
-    output = xgb.dask.train(client, params, dtrain, num_boost_round=100,
-                            evals=[(dtrain, 'train')])
-    bst = output['booster']
-    history = output['history']
-    for i, e in enumerate(history['train']['merror']):
-        print(f'[{i}] train-merror: {e}')
+    params = {
+        "max_depth": 8,
+        "eta": 0.01,
+        "objective": "multi:softprob",
+        "num_class": 3,
+        "tree_method": "hist",
+        "eval_metric": "merror",
+        "device": "cuda",
+    }
+    output = xgb.dask.train(
+        client, params, dtrain, num_boost_round=100, evals=[(dtrain, "train")]
+    )
+    bst = output["booster"]
+    history = output["history"]
+    for i, e in enumerate(history["train"]["merror"]):
+        print(f"[{i}] train-merror: {e}")


-if __name__ == '__main__':
-    # To use RMM pool allocator with a GPU Dask cluster, just add rmm_pool_size option to
-    # LocalCUDACluster constructor.
-    with LocalCUDACluster(rmm_pool_size='2GB') as cluster:
+if __name__ == "__main__":
+    # To use the RMM pool allocator with a GPU Dask cluster, just add the rmm_pool_size
+    # option to the LocalCUDACluster constructor.
+    with LocalCUDACluster(rmm_pool_size="2GB") as cluster:
        with Client(cluster) as client:
            main(client)
@@ -1,3 +1,7 @@
+"""
+Using rmm on a single node device
+=================================
+"""
 import rmm
 from sklearn.datasets import make_classification

@@ -16,7 +20,8 @@ params = {
     "eta": 0.01,
     "objective": "multi:softprob",
     "num_class": 3,
-    "tree_method": "gpu_hist",
+    "tree_method": "hist",
+    "device": "cuda",
 }
 # XGBoost will automatically use the RMM pool allocator
 bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, "train")])
2 doc/.gitignore (vendored)
@@ -6,3 +6,5 @@ doxygen
 parser.py
 *.pyc
 web-data
+# generated by doxygen
+tmp
11 doc/conf.py
@@ -19,7 +19,6 @@ import sys
 import tarfile
 import urllib.request
 import warnings
-from subprocess import call
 from urllib.error import HTTPError

 from sh.contrib import git
@@ -148,12 +147,20 @@ extensions = [

 sphinx_gallery_conf = {
     # path to your example scripts
-    "examples_dirs": ["../demo/guide-python", "../demo/dask", "../demo/aft_survival"],
+    "examples_dirs": [
+        "../demo/guide-python",
+        "../demo/dask",
+        "../demo/aft_survival",
+        "../demo/gpu_acceleration",
+        "../demo/rmm_plugin"
+    ],
     # path to where to save gallery generated output
     "gallery_dirs": [
         "python/examples",
         "python/dask-examples",
         "python/survival-examples",
+        "python/gpu-examples",
+        "python/rmm-examples",
     ],
     "matplotlib_animations": True,
 }
@@ -23,20 +23,19 @@ The GPU algorithms currently work with CLI, Python, R, and JVM packages. See :do
   :caption: Python example

   params = dict()
-  params["device"] = "cuda:0"
+  params["device"] = "cuda"
   params["tree_method"] = "hist"
   Xy = xgboost.QuantileDMatrix(X, y)
   xgboost.train(params, Xy)

 .. code-block:: python
-  :caption: With Scikit-Learn interface
+  :caption: With the Scikit-Learn interface

   XGBRegressor(tree_method="hist", device="cuda")


 GPU-Accelerated SHAP values
 =============================
-XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as a backend for computing shap values when the GPU predictor is selected.
+XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as a backend for computing shap values when the GPU is used.

 .. code-block:: python
-
@@ -44,12 +43,12 @@ XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as
   shap_values = booster.predict(dtrain, pred_contribs=True)
   shap_interaction_values = model.predict(dtrain, pred_interactions=True)

-See examples `here <https://github.com/dmlc/xgboost/tree/master/demo/gpu_acceleration>`__.
+See :ref:`sphx_glr_python_gpu-examples_tree_shap.py` for a worked example.

 Multi-node Multi-GPU Training
 =============================

-XGBoost supports fully distributed GPU training using `Dask <https://dask.org/>`_, ``Spark`` and ``PySpark``. For getting started with Dask see our tutorial :doc:`/tutorials/dask` and worked examples `here <https://github.com/dmlc/xgboost/tree/master/demo/dask>`__, also Python documentation :ref:`dask_api` for complete reference. For usage with ``Spark`` using Scala see :doc:`/jvm/xgboost4j_spark_gpu_tutorial`. Lastly for distributed GPU training with ``PySpark``, see :doc:`/tutorials/spark_estimator`.
+XGBoost supports fully distributed GPU training using `Dask <https://dask.org/>`_, ``Spark`` and ``PySpark``. For getting started with Dask see our tutorial :doc:`/tutorials/dask` and worked examples :doc:`/python/dask-examples/index`, also Python documentation :ref:`dask_api` for complete reference. For usage with ``Spark`` using Scala see :doc:`/jvm/xgboost4j_spark_gpu_tutorial`. Lastly for distributed GPU training with ``PySpark``, see :doc:`/tutorials/spark_estimator`.


 Memory usage
@@ -67,7 +66,8 @@ If you are getting out-of-memory errors on a big dataset, try the

 CPU-GPU Interoperability
 ========================
-XGBoost models trained on GPUs can be used on CPU-only systems to generate predictions. For information about how to save and load an XGBoost model, see :doc:`/tutorials/saving_model`.
+The model can be used on any device regardless of the one used to train it. For instance, a model trained using GPU can still work on a CPU-only machine and vice versa. For more information about model serialization, see :doc:`/tutorials/saving_model`.


 Developer notes
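As a concrete illustration of the interoperability paragraph above (a sketch, not taken from the changed files; the data and file name are arbitrary):

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(256, 8))
y = X[:, 0] * 2.0

# Train on a CUDA device ...
booster = xgb.train(
    {"tree_method": "hist", "device": "cuda"}, xgb.DMatrix(X, label=y), num_boost_round=10
)
booster.save_model("model.json")

# ... then load and predict on a CPU-only machine.
cpu_booster = xgb.Booster()
cpu_booster.load_model("model.json")
cpu_booster.set_param({"device": "cpu"})
predt = cpu_booster.predict(xgb.DMatrix(X))
```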
@@ -189,7 +189,7 @@ This will check out the latest stable version from the Maven Central.

 For the latest release version number, please check `release page <https://github.com/dmlc/xgboost/releases>`_.

-To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
+To enable the GPU algorithm (``device='cuda'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).


 .. note:: Windows not supported in the JVM package
@@ -325,4 +325,4 @@ The SNAPSHOT JARs are hosted by the XGBoost project. Every commit in the ``maste

 You can browse the file listing of the Maven repository at https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html.

-To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
+To enable the GPU algorithm (``device='cuda'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
@@ -34,27 +34,6 @@ General Parameters

  - Which booster to use. Can be ``gbtree``, ``gblinear`` or ``dart``; ``gbtree`` and ``dart`` use tree based models while ``gblinear`` uses linear functions.

-* ``verbosity`` [default=1]
-
-  - Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3
-    (debug). Sometimes XGBoost tries to change configurations based on heuristics, which
-    is displayed as warning message. If there's unexpected behaviour, please try to
-    increase value of verbosity.
-
-* ``validate_parameters`` [default to ``false``, except for Python, R and CLI interface]
-
-  - When set to True, XGBoost will perform validation of input parameters to check whether
-    a parameter is used or not.
-
-* ``nthread`` [default to maximum number of threads available if not set]
-
-  - Number of parallel threads used to run XGBoost. When choosing it, please keep thread
-    contention and hyperthreading in mind.
-
-* ``disable_default_eval_metric`` [default= ``false``]
-
-  - Flag to disable default metric. Set to 1 or ``true`` to disable.
-
 * ``device`` [default= ``cpu``]

   .. versionadded:: 2.0.0
@@ -67,6 +46,29 @@ General Parameters
  + ``gpu``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently.
  + ``gpu:<ordinal>``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently.

+  For more information about GPU acceleration, see :doc:`/gpu/index`.
+
+* ``verbosity`` [default=1]
+
+  - Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3
+    (debug). Sometimes XGBoost tries to change configurations based on heuristics, which
+    is displayed as warning message. If there's unexpected behaviour, please try to
+    increase value of verbosity.
+
+* ``validate_parameters`` [default to ``false``, except for Python, R and CLI interface]
+
+  - When set to True, XGBoost will perform validation of input parameters to check whether
+    a parameter is used or not. A warning is emitted when there's an unknown parameter.
+
+* ``nthread`` [default to maximum number of threads available if not set]
+
+  - Number of parallel threads used to run XGBoost. When choosing it, please keep thread
+    contention and hyperthreading in mind.
+
+* ``disable_default_eval_metric`` [default= ``false``]
+
+  - Flag to disable default metric. Set to 1 or ``true`` to disable.
+
 Parameters for Tree Booster
 ===========================
 * ``eta`` [default=0.3, alias: ``learning_rate``]
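To make the listed ``device`` values concrete, a short sketch (not part of the documentation source; it assumes a CUDA build with at least one visible GPU) of how they are passed from Python:

```python
import numpy as np
import xgboost as xgb

X = np.random.default_rng(0).normal(size=(128, 4))
y = X.sum(axis=1)

# Native interface: pick the first CUDA device explicitly.
booster = xgb.train(
    {"tree_method": "hist", "device": "cuda:0"},
    xgb.DMatrix(X, label=y),
    num_boost_round=10,
)

# Scikit-learn interface: let XGBoost choose the default CUDA device.
reg = xgb.XGBRegressor(tree_method="hist", device="cuda").fit(X, y)
```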
@@ -160,7 +162,7 @@ Parameters for Tree Booster
  - ``grow_colmaker``: non-distributed column-based construction of trees.
  - ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting.
  - ``grow_quantile_histmaker``: Grow tree using quantized histogram.
- - ``grow_gpu_hist``: Grow tree with GPU. Same as setting ``tree_method`` to ``hist`` and use ``device=cuda``.
+ - ``grow_gpu_hist``: Grow tree with GPU. Enabled when ``tree_method`` is set to ``hist`` along with ``device=cuda``.
  - ``sync``: synchronizes trees in all distributed nodes.
  - ``refresh``: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed.
  - ``prune``: prunes the splits where loss < min_split_loss (or gamma) and nodes that have depth greater than ``max_depth``.
2 doc/python/.gitignore (vendored)
@@ -1,3 +1,5 @@
 examples
 dask-examples
 survival-examples
+gpu-examples
+rmm-examples
@@ -17,3 +17,5 @@ Contents
  examples/index
  dask-examples/index
  survival-examples/index
+  gpu-examples/index
+  rmm-examples/index
@@ -124,7 +124,7 @@ Following table summarizes some differences in supported features between 4 tree
 `T` means supported while `F` means unsupported.

 +------------------+-----------+---------------------+---------------------+------------------------+
-|                  | Exact     | Approx              | Hist                | GPU Hist               |
+|                  | Exact     | Approx              | Hist                | Hist (GPU)             |
 +==================+===========+=====================+=====================+========================+
 | grow_policy      | Depthwise | depthwise/lossguide | depthwise/lossguide | depthwise/lossguide    |
 +------------------+-----------+---------------------+---------------------+------------------------+
@@ -141,5 +141,5 @@ Following table summarizes some differences in supported features between 4 tree

 Features/parameters that are not mentioned here are universally supported for all 4 tree
 methods (for instance, column sampling and constraints). The `P` in external memory means
-partially supported. Please note that both categorical data and external memory are
+special handling. Please note that both categorical data and external memory are
 experimental.
@@ -35,8 +35,8 @@ parameter ``enable_categorical``:

 .. code:: python

-  # Supported tree methods are `gpu_hist`, `approx`, and `hist`.
-  clf = xgb.XGBClassifier(tree_method="gpu_hist", enable_categorical=True)
+  # Supported tree methods are `approx` and `hist`.
+  clf = xgb.XGBClassifier(tree_method="hist", enable_categorical=True, device="cuda")
  # X is the dataframe we created in previous snippet
  clf.fit(X, y)
  # Must use JSON/UBJSON for serialization, otherwise the information is lost.
@@ -81,7 +81,7 @@ constructor.
  it = Iterator(["file_0.svm", "file_1.svm", "file_2.svm"])
  Xy = xgboost.DMatrix(it)

-  # Other tree methods including ``hist`` and ``gpu_hist`` also work, but has some caveats
+  # The ``approx`` tree method also works, but with lower performance, and the GPU implementation differs from the CPU one,
  # as noted in following sections.
  booster = xgboost.train({"tree_method": "hist"}, Xy)

|||||||
GPU Version (GPU Hist tree method)
|
GPU Version (GPU Hist tree method)
|
||||||
**********************************
|
**********************************
|
||||||
|
|
||||||
External memory is supported by GPU algorithms (i.e. when ``tree_method`` is set to
|
External memory is supported by GPU algorithms (i.e. when ``device`` is set to
|
||||||
``gpu_hist``). However, the algorithm used for GPU is different from the one used for
|
``cuda``). However, the algorithm used for GPU is different from the one used for
|
||||||
CPU. When training on a CPU, the tree method iterates through all batches from external
|
CPU. When training on a CPU, the tree method iterates through all batches from external
|
||||||
memory for each step of the tree construction algorithm. On the other hand, the GPU
|
memory for each step of the tree construction algorithm. On the other hand, the GPU
|
||||||
algorithm uses a hybrid approach. It iterates through the data during the beginning of
|
algorithm uses a hybrid approach. It iterates through the data during the beginning of
|
||||||
each iteration and concatenates all batches into one in GPU memory. To reduce overall
|
each iteration and concatenates all batches into one in GPU memory for performance
|
||||||
memory usage, users can utilize subsampling. The GPU hist tree method supports
|
reasons. To reduce overall memory usage, users can utilize subsampling. The GPU hist tree
|
||||||
`gradient-based sampling`, enabling users to set a low sampling rate without compromising
|
method supports `gradient-based sampling`, enabling users to set a low sampling rate
|
||||||
accuracy.
|
without compromising accuracy.
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
|
|||||||
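The subsampling advice above can be sketched as follows. This is an illustration rather than the tutorial's own code; a small in-memory iterator stands in for on-disk batches, and the ``sampling_method``/``subsample`` values are illustrative:

```python
import numpy as np
import xgboost


class Iterator(xgboost.DataIter):
    """Yield a few in-memory batches to stand in for on-disk shards."""

    def __init__(self) -> None:
        rng = np.random.default_rng(0)
        self._batches = [
            (rng.normal(size=(1024, 8)), rng.normal(size=1024)) for _ in range(4)
        ]
        self._it = 0
        super().__init__(cache_prefix="cache")

    def next(self, input_data) -> int:
        if self._it == len(self._batches):
            return 0  # no more batches
        X, y = self._batches[self._it]
        input_data(data=X, label=y)
        self._it += 1
        return 1

    def reset(self) -> None:
        self._it = 0


Xy = xgboost.DMatrix(Iterator())
params = {
    "tree_method": "hist",
    "device": "cuda",
    # Gradient-based sampling keeps only a fraction of rows in GPU memory per iteration.
    "sampling_method": "gradient_based",
    "subsample": 0.2,
}
booster = xgboost.train(params, Xy, num_boost_round=10)
```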
@@ -83,13 +83,14 @@ Some other examples:
 - ``(0,-1)``: No constraint on the first predictor and a decreasing constraint on the second.


-**Note for the 'hist' tree construction algorithm**.
-If ``tree_method`` is set to either ``hist``, ``approx`` or ``gpu_hist``, enabling
-monotonic constraints may produce unnecessarily shallow trees. This is because the
-``hist`` method reduces the number of candidate splits to be considered at each
-split. Monotonic constraints may wipe out all available split candidates, in which case no
-split is made. To reduce the effect, you may want to increase the ``max_bin`` parameter to
-consider more split candidates.
+.. note::
+
+  **Note for the 'hist' tree construction algorithm**. If ``tree_method`` is set to
+  either ``hist`` or ``approx``, enabling monotonic constraints may produce unnecessarily
+  shallow trees. This is because the ``hist`` method reduces the number of candidate
+  splits to be considered at each split. Monotonic constraints may wipe out all available
+  split candidates, in which case no split is made. To reduce the effect, you may want to
+  increase the ``max_bin`` parameter to consider more split candidates.


 *******************
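A sketch of the advice in the note above (not part of the documentation source; data and parameter values are illustrative):

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.uniform(size=(1000, 2))
y = 5 * X[:, 0] - 3 * X[:, 1] + rng.normal(scale=0.1, size=1000)

params = {
    "tree_method": "hist",
    # increasing in the first feature, decreasing in the second
    "monotone_constraints": "(1,-1)",
    # more histogram bins -> more candidate splits survive the constraint filter
    "max_bin": 512,
}
booster = xgb.train(params, xgb.DMatrix(X, label=y), num_boost_round=50)
```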
@@ -38,10 +38,6 @@ There are in general two ways that you can control overfitting in XGBoost:
  - This includes ``subsample`` and ``colsample_bytree``.
  - You can also reduce stepsize ``eta``. Remember to increase ``num_round`` when you do so.

-***************************
-Faster training performance
-***************************
-There's a parameter called ``tree_method``, set it to ``hist`` or ``gpu_hist`` for faster computation.
-
 *************************
 Handle Imbalanced Dataset
@@ -50,13 +50,14 @@ Here is a sample parameter dictionary for training a random forest on a GPU usin
 xgboost::

   params = {
-    'colsample_bynode': 0.8,
-    'learning_rate': 1,
-    'max_depth': 5,
-    'num_parallel_tree': 100,
-    'objective': 'binary:logistic',
-    'subsample': 0.8,
-    'tree_method': 'gpu_hist'
+    "colsample_bynode": 0.8,
+    "learning_rate": 1,
+    "max_depth": 5,
+    "num_parallel_tree": 100,
+    "objective": "binary:logistic",
+    "subsample": 0.8,
+    "tree_method": "hist",
+    "device": "cuda",
   }

 A random forest model can then be trained as follows::
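The training call referenced by the last context line is not shown in this hunk; a sketch of it, reusing the parameter dictionary from above with a synthetic dataset (the data here is illustrative only):

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(2048, 16))
y = (X[:, 0] > 0).astype(np.float64)
dtrain = xgb.DMatrix(X, label=y)

params = {
    "colsample_bynode": 0.8,
    "learning_rate": 1,
    "max_depth": 5,
    "num_parallel_tree": 100,
    "objective": "binary:logistic",
    "subsample": 0.8,
    "tree_method": "hist",
    "device": "cuda",
}

# A single boosting round suffices: ``num_parallel_tree`` already grows the whole forest.
bst = xgb.train(params, dtrain, num_boost_round=1)
```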
@@ -174,7 +174,7 @@ Will print out something similar to (not actual output as it's too long for demo
      "gbtree_train_param": {
        "num_parallel_tree": "1",
        "process_type": "default",
-        "tree_method": "gpu_hist",
+        "tree_method": "hist",
        "updater": "grow_gpu_hist",
        "updater_seq": "grow_gpu_hist"
      },
@@ -278,9 +278,15 @@ __model_doc = f"""
        without bias.

    device : Optional[str]
-        Device ordinal.
+
+        .. versionadded:: 2.0.0
+
+        Device ordinal, available options are `cpu`, `cuda`, and `gpu`.
+
    validate_parameters : Optional[bool]
        Give warnings for unknown parameter.

    enable_categorical : bool

        .. versionadded:: 1.5.0
@@ -144,8 +144,13 @@ class SparkXGBRegressor(_SparkXGBEstimator):
        .. deprecated:: 2.0.0

        Use `device` instead.

    device:

+        .. versionadded:: 2.0.0
+
        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.

    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
@@ -319,8 +324,13 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
        .. deprecated:: 2.0.0

        Use `device` instead.

    device:

+        .. versionadded:: 2.0.0
+
        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.

    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
@@ -497,8 +507,13 @@ class SparkXGBRanker(_SparkXGBEstimator):
        .. deprecated:: 2.0.0

        Use `device` instead.

    device:

+        .. versionadded:: 2.0.0
+
        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.

    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
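For orientation, a sketch of how the ``device`` argument documented above is passed to the Spark estimators. This is illustrative only: it assumes a running Spark session with GPU-enabled executors and a dataframe ``df`` that already has ``features`` and ``label`` columns:

```python
# Illustrative only; `df` and the Spark session are assumed to exist.
from xgboost.spark import SparkXGBClassifier

clf = SparkXGBClassifier(
    features_col="features",
    label_col="label",
    device="cuda",  # one GPU per XGBoost worker
    num_workers=2,
)
model = clf.fit(df)
```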
@@ -724,11 +724,15 @@ void MetaInfo::SynchronizeNumberOfColumns() {
 namespace {
 template <typename T>
 void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) {
-  CHECK(v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device)
-      << "Data is resided on a different device than `gpu_id`. "
-      << "Device that data is on: " << v.DeviceIdx() << ", "
-      << "`gpu_id` for XGBoost: " << device;
+  bool valid =
+      v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device;
+  if (!valid) {
+    LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
+                  "the booster. The device ordinal of the data is: "
+               << v.DeviceIdx() << "; the device ordinal of the Booster is: " << device;
+  }
 }

 template <typename T, std::int32_t D>
 void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) {
   CheckDevice(device, *v.Data());
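The new message surfaces in Python roughly as follows. This is a sketch, not test code from the commit; it assumes a machine with at least two GPUs and cupy installed:

```python
# Sketch: triggers the "Invalid device ordinal" check above. Requires >= 2 GPUs.
import cupy as cp
import xgboost as xgb

cp.cuda.runtime.setDevice(1)  # data lives on device 1
X = cp.random.standard_normal((128, 4))
y = cp.random.standard_normal(128)

dtrain = xgb.QuantileDMatrix(X, label=y)

try:
    # booster is told to use device 0, mismatching the data
    xgb.train({"tree_method": "hist", "device": "cuda:0"}, dtrain, num_boost_round=2)
except xgb.core.XGBoostError as e:
    assert "Invalid device ordinal" in str(e)
```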
@@ -42,22 +42,22 @@ DMLC_REGISTRY_FILE_TAG(gbtree);

 namespace {
 /** @brief Map the `tree_method` parameter to the `updater` parameter. */
-std::string MapTreeMethodToUpdaters(Context const* ctx_, TreeMethod tree_method) {
+std::string MapTreeMethodToUpdaters(Context const* ctx, TreeMethod tree_method) {
   // Choose updaters according to tree_method parameters
+  if (ctx->IsCUDA()) {
+    common::AssertGPUSupport();
+  }
   switch (tree_method) {
     case TreeMethod::kAuto:  // Use hist as default in 2.0
     case TreeMethod::kHist: {
-      return ctx_->DispatchDevice([] { return "grow_quantile_histmaker"; },
-                                  [] {
-                                    common::AssertGPUSupport();
-                                    return "grow_gpu_hist";
-                                  });
+      return ctx->DispatchDevice([] { return "grow_quantile_histmaker"; },
+                                 [] { return "grow_gpu_hist"; });
     }
     case TreeMethod::kApprox:
-      CHECK(ctx_->IsCPU()) << "The `approx` tree method is not supported on GPU.";
+      CHECK(ctx->IsCPU()) << "The `approx` tree method is not supported on GPU.";
       return "grow_histmaker";
     case TreeMethod::kExact:
-      CHECK(ctx_->IsCPU()) << "The `exact` tree method is not supported on GPU.";
+      CHECK(ctx->IsCPU()) << "The `exact` tree method is not supported on GPU.";
       return "grow_colmaker,prune";
     case TreeMethod::kGPUHist: {
       common::AssertGPUSupport();
@@ -150,6 +150,7 @@ void GBTree::Configure(Args const& cfg) {
     CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
         << "Only the hist tree method is supported for building multi-target trees with vector "
            "leaf.";
+    CHECK(ctx_->IsCPU()) << "GPU is not yet supported for vector leaf.";
   }

   LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
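One can observe which updater this mapping selected from Python. The sketch below is not part of the commit, and the JSON key path is an assumption based on the ``save_config()`` excerpt shown earlier in this diff:

```python
# Sketch: inspect the updater chosen for hist + device="cuda".
import json

import numpy as np
import xgboost as xgb

X = np.random.default_rng(0).normal(size=(64, 4))
y = X[:, 0]

bst = xgb.train(
    {"tree_method": "hist", "device": "cuda"}, xgb.DMatrix(X, label=y), num_boost_round=2
)
conf = json.loads(bst.save_config())
# Assumed layout, following the save_config() excerpt above.
print(conf["learner"]["gradient_booster"]["gbtree_train_param"]["updater"])
```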
@@ -29,10 +29,12 @@ class LintersPaths:
        "tests/python-gpu/load_pickle.py",
        "tests/python-gpu/test_gpu_pickling.py",
        "tests/python-gpu/test_gpu_eval_metrics.py",
+        "tests/python-gpu/test_gpu_with_sklearn.py",
        "tests/test_distributed/test_with_spark/",
        "tests/test_distributed/test_gpu_with_spark/",
        # demo
        "demo/dask/",
+        "demo/rmm_plugin",
        "demo/json-model/json_parser.py",
        "demo/guide-python/cat_in_the_dat.py",
        "demo/guide-python/categorical.py",
@@ -234,7 +234,7 @@ Arrow specification.'''
        cp.cuda.runtime.setDevice(0)
        dtrain = dmatrix_from_cupy(np.float32, xgb.QuantileDMatrix, np.nan)
        with pytest.raises(
-            xgb.core.XGBoostError, match="Data is resided on a different device"
+            xgb.core.XGBoostError, match="Invalid device ordinal"
        ):
            xgb.train(
                {'tree_method': 'gpu_hist', 'gpu_id': 1}, dtrain, num_boost_round=10
@@ -2,6 +2,7 @@ import json
 import os
 import sys
 import tempfile
+from concurrent.futures import ThreadPoolExecutor

 import numpy as np
 import pytest
@@ -23,18 +24,19 @@ def test_gpu_binary_classification():
     from sklearn.model_selection import KFold

     digits = load_digits(n_class=2)
-    y = digits['target']
-    X = digits['data']
+    y = digits["target"]
+    X = digits["data"]
     kf = KFold(n_splits=2, shuffle=True, random_state=rng)
     for cls in (xgb.XGBClassifier, xgb.XGBRFClassifier):
         for train_index, test_index in kf.split(X, y):
             xgb_model = cls(
-                random_state=42, tree_method='gpu_hist',
-                n_estimators=4, gpu_id='0').fit(X[train_index], y[train_index])
+                random_state=42, tree_method="gpu_hist", n_estimators=4, gpu_id="0"
+            ).fit(X[train_index], y[train_index])
             preds = xgb_model.predict(X[test_index])
             labels = y[test_index]
-            err = sum(1 for i in range(len(preds))
-                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
+            err = sum(
+                1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]
+            ) / float(len(preds))
             assert err < 0.1


@@ -133,7 +135,7 @@ def test_classififer():
     X, y = load_digits(return_X_y=True)
     y *= 10

-    clf = xgb.XGBClassifier(tree_method="gpu_hist", n_estimators=1)
+    clf = xgb.XGBClassifier(tree_method="hist", n_estimators=1, device="cuda")

     # numpy
     with pytest.raises(ValueError, match=r"Invalid classes.*"):
@@ -161,3 +163,46 @@ def test_ranking_qid_df():
     import cudf

     run_ranking_qid_df(cudf, "gpu_hist")
+
+
+@pytest.mark.skipif(**tm.no_cupy())
+@pytest.mark.mgpu
+def test_device_ordinal() -> None:
+    import cupy as cp
+
+    n_devices = 2
+
+    def worker(ordinal: int, correct_ordinal: bool) -> None:
+        if correct_ordinal:
+            cp.cuda.runtime.setDevice(ordinal)
+        else:
+            cp.cuda.runtime.setDevice((ordinal + 1) % n_devices)
+
+        X, y, w = tm.make_regression(4096, 12, use_cupy=True)
+        reg = xgb.XGBRegressor(device=f"cuda:{ordinal}", tree_method="hist")
+
+        if correct_ordinal:
+            reg.fit(
+                X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w]
+            )
+            assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])
+            return
+
+        with pytest.raises(ValueError, match="Invalid device ordinal"):
+            reg.fit(
+                X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w]
+            )
+
+    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
+        futures = []
+        n_trials = 32
+        for i in range(n_trials):
+            fut = executor.submit(
+                worker, ordinal=i % n_devices, correct_ordinal=i % 3 != 0
+            )
+            futures.append(fut)
+
+        for fut in futures:
+            fut.result()
+
+    cp.cuda.runtime.setDevice(0)