Document for device ordinal. (#9398)

- Rewrite GPU demos. The notebook is converted to a script to avoid committing additional PNG plots.
- Add GPU demos into the sphinx gallery.
- Add RMM demos into the sphinx gallery.
- Add a test for firing threads with different device ordinals.
Jiaming Yuan 2023-07-22 15:26:29 +08:00 committed by GitHub
parent 22b0a55a04
commit 275da176ba
32 changed files with 351 additions and 398 deletions

View File

@ -53,15 +53,7 @@ int main() {
// configure the training // configure the training
// available parameters are described here: // available parameters are described here:
// https://xgboost.readthedocs.io/en/latest/parameter.html // https://xgboost.readthedocs.io/en/latest/parameter.html
safe_xgboost(XGBoosterSetParam(booster, "tree_method", use_gpu ? "gpu_hist" : "hist")); safe_xgboost(XGBoosterSetParam(booster, "device", use_gpu ? "cuda" : "cpu"));
if (use_gpu) {
// set the GPU to use;
// this is not necessary, but provided here as an illustration
safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "0"));
} else {
// avoid evaluating objective and metric on a GPU
safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "-1"));
}
safe_xgboost(XGBoosterSetParam(booster, "objective", "binary:logistic")); safe_xgboost(XGBoosterSetParam(booster, "objective", "binary:logistic"));
safe_xgboost(XGBoosterSetParam(booster, "min_child_weight", "1")); safe_xgboost(XGBoosterSetParam(booster, "min_child_weight", "1"));

View File

@ -1,5 +0,0 @@
# GPU Acceleration Demo
`cover_type.py` shows how to train a model on the [forest cover type](https://archive.ics.uci.edu/ml/datasets/covertype) dataset using GPU acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it time consuming to process. We compare the run-time and accuracy of the GPU and CPU histogram algorithms.
`shap.ipynb` demonstrates using GPU acceleration to compute SHAP values for feature importance.

View File

@ -0,0 +1,8 @@
:orphan:
GPU Acceleration Demo
=====================
This is a collection of demonstration scripts showcasing the basic usage of GPU acceleration in
XGBoost. Please see :doc:`/gpu/index` for more information. Separate demonstrations cover
distributed GPU training with Dask and Spark.
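As a quick orientation, here is a minimal sketch of the pattern these demos build on: training through the scikit-learn interface with ``device="cuda"``. The data below is synthetic and purely illustrative.

```python
import numpy as np
import xgboost as xgb

# Synthetic data, purely illustrative.
X = np.random.rand(10_000, 20)
y = np.random.randint(0, 2, size=10_000)

# The demos boil down to selecting the hist tree method and a CUDA device.
clf = xgb.XGBClassifier(tree_method="hist", device="cuda", n_estimators=10)
clf.fit(X, y)
print(clf.predict(X[:5]))
```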

View File

@ -1,41 +1,49 @@
"""
Using xgboost on GPU devices
============================
Shows how to train a model on the `forest cover type
<https://archive.ics.uci.edu/ml/datasets/covertype>`_ dataset using GPU
acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it
time consuming to process. We compare the run-time and accuracy of the GPU and CPU
histogram algorithms.
In addition, the demo showcases using XGBoost together with other GPU-enabled libraries,
including cupy and cuml. These libraries are not strictly required.
"""
import time import time
import cupy as cp
from cuml.model_selection import train_test_split
from sklearn.datasets import fetch_covtype from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
import xgboost as xgb import xgboost as xgb
# Fetch dataset using sklearn # Fetch dataset using sklearn
cov = fetch_covtype() X, y = fetch_covtype(return_X_y=True)
X = cov.data X = cp.array(X)
y = cov.target y = cp.array(y)
y -= y.min()
# Create 0.75/0.25 train/test split # Create 0.75/0.25 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75, X_train, X_test, y_train, y_test = train_test_split(
random_state=42) X, y, test_size=0.25, train_size=0.75, random_state=42
)
# Specify sufficient boosting iterations to reach a minimum # Specify sufficient boosting iterations to reach a minimum
num_round = 3000 num_round = 3000
# Leave most parameters as default # Leave most parameters as default
param = {'objective': 'multi:softmax', # Specify multiclass classification clf = xgb.XGBClassifier(device="cuda", n_estimators=num_round)
'num_class': 8, # Number of possible output classes
'tree_method': 'gpu_hist' # Use GPU accelerated algorithm
}
# Convert input data from numpy to XGBoost format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
gpu_res = {} # Store accuracy result
tmp = time.time()
# Train model # Train model
xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=gpu_res) start = time.time()
print("GPU Training Time: %s seconds" % (str(time.time() - tmp))) clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
gpu_res = clf.evals_result()
print("GPU Training Time: %s seconds" % (str(time.time() - start)))
# Repeat for CPU algorithm # Repeat for CPU algorithm
tmp = time.time() clf = xgb.XGBClassifier(device="cpu", n_estimators=num_round)
param['tree_method'] = 'hist' start = time.time()
cpu_res = {} clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=cpu_res) cpu_res = clf.evals_result()
print("CPU Training Time: %s seconds" % (str(time.time() - tmp))) print("CPU Training Time: %s seconds" % (str(time.time() - start)))

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,55 @@
"""
Use GPU to speedup SHAP value computation
=========================================
Demonstrates using GPU acceleration to compute SHAP values for feature importance.
"""
import shap
from sklearn.datasets import fetch_california_housing
import xgboost as xgb
# Fetch dataset using sklearn
data = fetch_california_housing()
print(data.DESCR)
X = data.data
y = data.target
num_round = 500
param = {
"eta": 0.05,
"max_depth": 10,
"tree_method": "hist",
"device": "cuda",
}
# GPU accelerated training
dtrain = xgb.DMatrix(X, label=y, feature_names=data.feature_names)
model = xgb.train(param, dtrain, num_round)
# Compute shap values using GPU with xgboost
model.set_param({"device": "cuda"})
shap_values = model.predict(dtrain, pred_contribs=True)
# Compute shap interaction values using GPU
shap_interaction_values = model.predict(dtrain, pred_interactions=True)
# shap will call the GPU accelerated version as long as the device parameter is set to
# "cuda"
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
# visualize the first prediction's explanation
shap.force_plot(
explainer.expected_value,
shap_values[0, :],
X[0, :],
feature_names=data.feature_names,
matplotlib=True,
)
# Show a summary of feature importance
shap.summary_plot(shap_values, X, plot_type="bar", feature_names=data.feature_names)

View File

@ -70,8 +70,7 @@ class XGBoostTrainer(Executor):
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
if self._use_gpus: if self._use_gpus:
self.log_info(fl_ctx, f'Training with GPU {rank}') self.log_info(fl_ctx, f'Training with GPU {rank}')
param['tree_method'] = 'gpu_hist' param['device'] = f"cuda:{rank}"
param['gpu_id'] = rank
# Specify validations set to watch performance # Specify validations set to watch performance
watchlist = [(dtest, 'eval'), (dtrain, 'train')] watchlist = [(dtest, 'eval'), (dtrain, 'train')]
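For context, a hedged sketch of the pattern this hunk adopts: each worker process binds its booster to its own GPU by ordinal through the single ``device`` parameter instead of the removed ``gpu_id``. The helper below is illustrative and not part of NVFlare's API.

```python
def make_params(rank: int, use_gpus: bool) -> dict:
    # Baseline parameters mirroring the trainer above.
    params = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
    if use_gpus:
        # One process per GPU: the rank doubles as the CUDA ordinal.
        params["device"] = f"cuda:{rank}"
    return params

# e.g. the worker with rank 3 binds its booster to the fourth GPU ("cuda:3").
print(make_params(3, True))
```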

View File

@ -1,47 +0,0 @@
Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL)
====================================================================
[RAPIDS Memory Manager (RMM)](https://github.com/rapidsai/rmm) library provides a collection of
efficient memory allocators for NVIDIA GPUs. It is now possible to use XGBoost with memory
allocators provided by RMM, by enabling the RMM integration plugin.
The demos in this directory highlights one RMM allocator in particular: **the pool sub-allocator**.
This allocator addresses the slow speed of `cudaMalloc()` by allocating a large chunk of memory
upfront. Subsequent allocations will draw from the pool of already allocated memory and thus avoid
the overhead of calling `cudaMalloc()` directly. See
[this GTC talk slides](https://on-demand.gputechconf.com/gtc/2015/presentation/S5530-Stephen-Jones.pdf)
for more details.
Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To do this,
run CMake with option `-DPLUGIN_RMM=ON` (`-DUSE_CUDA=ON` also required):
```
cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON
make -j4
```
CMake will attempt to locate the RMM library in your build environment. You may choose to build
RMM from the source, or install it using the Conda package manager. If CMake cannot find RMM, you
should specify the location of RMM with the CMake prefix:
```
# If using Conda:
cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
# If using RMM installed with a custom location
cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm
```
# Informing XGBoost about RMM pool
When XGBoost is compiled with RMM, most of the large size allocation will go through RMM
allocators, but some small allocations in performance critical areas are using a different
caching allocator so that we can have better control over memory allocation behavior.
Users can override this behavior and force the use of rmm for all allocations by setting
the global configuration ``use_rmm``:
``` python
with xgb.config_context(use_rmm=True):
clf = xgb.XGBClassifier(tree_method="gpu_hist")
```
Depending on the choice of memory pool size or type of allocator, this may have negative
performance impact.
* [Using RMM with a single GPU](./rmm_singlegpu.py)
* [Using RMM with a local Dask cluster consisting of multiple GPUs](./rmm_mgpu_with_dask.py)

View File

@ -0,0 +1,51 @@
Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL)
====================================================================
The `RAPIDS Memory Manager (RMM) <https://github.com/rapidsai/rmm>`__ library provides a
collection of efficient memory allocators for NVIDIA GPUs. It is now possible to use
XGBoost with memory allocators provided by RMM by enabling the RMM integration plugin.
The demos in this directory highlight one RMM allocator in particular: **the pool
sub-allocator**. This allocator addresses the slow speed of ``cudaMalloc()`` by
allocating a large chunk of memory upfront. Subsequent allocations draw from the pool of
already allocated memory and thus avoid the overhead of calling ``cudaMalloc()``
directly. See `this GTC talk
<https://on-demand.gputechconf.com/gtc/2015/presentation/S5530-Stephen-Jones.pdf>`_ for
more details.
Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To do this,
run CMake with option ``-DPLUGIN_RMM=ON`` (``-DUSE_CUDA=ON`` also required):

.. code-block:: sh

  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON
  make -j$(nproc)

CMake will attempt to locate the RMM library in your build environment. You may choose to build
RMM from the source, or install it using the Conda package manager. If CMake cannot find RMM, you
should specify the location of RMM with the CMake prefix:

.. code-block:: sh

  # If using Conda:
  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
  # If using RMM installed with a custom location
  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm
********************************
Informing XGBoost about RMM pool
********************************
When XGBoost is compiled with RMM, most of the large allocations will go through RMM
allocators, but some small allocations in performance-critical areas use a different
caching allocator to retain finer control over memory allocation behavior. Users can
override this behavior and force the use of RMM for all allocations by setting the
global configuration ``use_rmm``:

.. code-block:: python

  with xgb.config_context(use_rmm=True):
      clf = xgb.XGBClassifier(tree_method="hist", device="cuda")

Depending on the choice of memory pool size and the type of allocator, this may have a
negative performance impact.
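A minimal end-to-end sketch of the workflow described on this page, assuming XGBoost was built with ``-DPLUGIN_RMM=ON``; the dataset and iteration count are illustrative and mirror the single-GPU demo in this directory.

```python
import rmm
import xgboost as xgb
from sklearn.datasets import make_classification

# Set up an RMM pool allocator before any GPU memory is touched.
rmm.reinit(pool_allocator=True)

X, y = make_classification(n_samples=10_000, n_informative=5, n_classes=3)
dtrain = xgb.DMatrix(X, label=y)

params = {
    "objective": "multi:softprob",
    "num_class": 3,
    "tree_method": "hist",
    "device": "cuda",
}

# Optionally force all allocations (not just the large ones) through RMM.
with xgb.config_context(use_rmm=True):
    bst = xgb.train(params, dtrain, num_boost_round=10, evals=[(dtrain, "train")])
```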

View File

@ -1,3 +1,7 @@
"""
Using rmm with Dask
===================
"""
import dask import dask
from dask.distributed import Client from dask.distributed import Client
from dask_cuda import LocalCUDACluster from dask_cuda import LocalCUDACluster
@ -11,25 +15,33 @@ def main(client):
# xgb.set_config(use_rmm=True) # xgb.set_config(use_rmm=True)
X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3) X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
# In pratice one should prefer loading the data with dask collections instead of using # In practice one should prefer loading the data with dask collections instead of
# `from_array`. # using `from_array`.
X = dask.array.from_array(X) X = dask.array.from_array(X)
y = dask.array.from_array(y) y = dask.array.from_array(y)
dtrain = xgb.dask.DaskDMatrix(client, X, label=y) dtrain = xgb.dask.DaskDMatrix(client, X, label=y)
params = {'max_depth': 8, 'eta': 0.01, 'objective': 'multi:softprob', 'num_class': 3, params = {
'tree_method': 'gpu_hist', 'eval_metric': 'merror'} "max_depth": 8,
output = xgb.dask.train(client, params, dtrain, num_boost_round=100, "eta": 0.01,
evals=[(dtrain, 'train')]) "objective": "multi:softprob",
bst = output['booster'] "num_class": 3,
history = output['history'] "tree_method": "hist",
for i, e in enumerate(history['train']['merror']): "eval_metric": "merror",
print(f'[{i}] train-merror: {e}') "device": "cuda",
}
output = xgb.dask.train(
client, params, dtrain, num_boost_round=100, evals=[(dtrain, "train")]
)
bst = output["booster"]
history = output["history"]
for i, e in enumerate(history["train"]["merror"]):
print(f"[{i}] train-merror: {e}")
if __name__ == '__main__': if __name__ == "__main__":
# To use RMM pool allocator with a GPU Dask cluster, just add rmm_pool_size option to # To use RMM pool allocator with a GPU Dask cluster, just add rmm_pool_size option
# LocalCUDACluster constructor. # to LocalCUDACluster constructor.
with LocalCUDACluster(rmm_pool_size='2GB') as cluster: with LocalCUDACluster(rmm_pool_size="2GB") as cluster:
with Client(cluster) as client: with Client(cluster) as client:
main(client) main(client)

View File

@ -1,3 +1,7 @@
"""
Using rmm on a single node device
=================================
"""
import rmm import rmm
from sklearn.datasets import make_classification from sklearn.datasets import make_classification
@ -16,7 +20,8 @@ params = {
"eta": 0.01, "eta": 0.01,
"objective": "multi:softprob", "objective": "multi:softprob",
"num_class": 3, "num_class": 3,
"tree_method": "gpu_hist", "tree_method": "hist",
"device": "cuda",
} }
# XGBoost will automatically use the RMM pool allocator # XGBoost will automatically use the RMM pool allocator
bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, "train")]) bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, "train")])

doc/.gitignore
View File

@ -6,3 +6,5 @@ doxygen
parser.py parser.py
*.pyc *.pyc
web-data web-data
# generated by doxygen
tmp

View File

@ -19,7 +19,6 @@ import sys
import tarfile import tarfile
import urllib.request import urllib.request
import warnings import warnings
from subprocess import call
from urllib.error import HTTPError from urllib.error import HTTPError
from sh.contrib import git from sh.contrib import git
@ -148,12 +147,20 @@ extensions = [
sphinx_gallery_conf = { sphinx_gallery_conf = {
# path to your example scripts # path to your example scripts
"examples_dirs": ["../demo/guide-python", "../demo/dask", "../demo/aft_survival"], "examples_dirs": [
"../demo/guide-python",
"../demo/dask",
"../demo/aft_survival",
"../demo/gpu_acceleration",
"../demo/rmm_plugin"
],
# path to where to save gallery generated output # path to where to save gallery generated output
"gallery_dirs": [ "gallery_dirs": [
"python/examples", "python/examples",
"python/dask-examples", "python/dask-examples",
"python/survival-examples", "python/survival-examples",
"python/gpu-examples",
"python/rmm-examples",
], ],
"matplotlib_animations": True, "matplotlib_animations": True,
} }

View File

@ -23,20 +23,19 @@ The GPU algorithms currently work with CLI, Python, R, and JVM packages. See :do
:caption: Python example :caption: Python example
params = dict() params = dict()
params["device"] = "cuda:0" params["device"] = "cuda"
params["tree_method"] = "hist" params["tree_method"] = "hist"
Xy = xgboost.QuantileDMatrix(X, y) Xy = xgboost.QuantileDMatrix(X, y)
xgboost.train(params, Xy) xgboost.train(params, Xy)
.. code-block:: python .. code-block:: python
:caption: With Scikit-Learn interface :caption: With the Scikit-Learn interface
XGBRegressor(tree_method="hist", device="cuda") XGBRegressor(tree_method="hist", device="cuda")
GPU-Accelerated SHAP values GPU-Accelerated SHAP values
============================= =============================
XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as a backend for computing shap values when the GPU predictor is selected. XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as a backend for computing shap values when the GPU is used.
.. code-block:: python .. code-block:: python
@ -44,12 +43,12 @@ XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as
shap_values = booster.predict(dtrain, pred_contribs=True) shap_values = booster.predict(dtrain, pred_contribs=True)
shap_interaction_values = model.predict(dtrain, pred_interactions=True) shap_interaction_values = model.predict(dtrain, pred_interactions=True)
See examples `here <https://github.com/dmlc/xgboost/tree/master/demo/gpu_acceleration>`__. See :ref:`sphx_glr_python_gpu-examples_tree_shap.py` for a worked example.
Multi-node Multi-GPU Training Multi-node Multi-GPU Training
============================= =============================
XGBoost supports fully distributed GPU training using `Dask <https://dask.org/>`_, ``Spark`` and ``PySpark``. For getting started with Dask see our tutorial :doc:`/tutorials/dask` and worked examples `here <https://github.com/dmlc/xgboost/tree/master/demo/dask>`__, also Python documentation :ref:`dask_api` for complete reference. For usage with ``Spark`` using Scala see :doc:`/jvm/xgboost4j_spark_gpu_tutorial`. Lastly for distributed GPU training with ``PySpark``, see :doc:`/tutorials/spark_estimator`. XGBoost supports fully distributed GPU training using `Dask <https://dask.org/>`_, ``Spark`` and ``PySpark``. For getting started with Dask see our tutorial :doc:`/tutorials/dask` and worked examples :doc:`/python/dask-examples/index`, also Python documentation :ref:`dask_api` for complete reference. For usage with ``Spark`` using Scala see :doc:`/jvm/xgboost4j_spark_gpu_tutorial`. Lastly for distributed GPU training with ``PySpark``, see :doc:`/tutorials/spark_estimator`.
Memory usage Memory usage
@ -67,7 +66,8 @@ If you are getting out-of-memory errors on a big dataset, try the or :py:class:`
CPU-GPU Interoperability CPU-GPU Interoperability
======================== ========================
XGBoost models trained on GPUs can be used on CPU-only systems to generate predictions. For information about how to save and load an XGBoost model, see :doc:`/tutorials/saving_model`.
The model can be used on any device regardless of the one used to train it. For instance, a model trained using GPU can still work on a CPU-only machine and vice versa. For more information about model serialization, see :doc:`/tutorials/saving_model`.
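A small sketch of the interoperability described above, assuming a CUDA device is available for the training half; the file name and data are illustrative.

```python
import numpy as np
import xgboost as xgb

X, y = np.random.rand(1_000, 10), np.random.rand(1_000)

# Train with a CUDA device ...
booster = xgb.train({"tree_method": "hist", "device": "cuda"}, xgb.DMatrix(X, label=y))
booster.save_model("model.json")

# ... then load and predict on a CPU-only machine.
cpu_booster = xgb.Booster()
cpu_booster.load_model("model.json")
cpu_booster.set_param({"device": "cpu"})
predictions = cpu_booster.predict(xgb.DMatrix(X))
```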
Developer notes Developer notes

View File

@ -189,7 +189,7 @@ This will check out the latest stable version from the Maven Central.
For the latest release version number, please check `release page <https://github.com/dmlc/xgboost/releases>`_. For the latest release version number, please check `release page <https://github.com/dmlc/xgboost/releases>`_.
To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix). To enable the GPU algorithm (``device='cuda'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
.. note:: Windows not supported in the JVM package .. note:: Windows not supported in the JVM package
@ -325,4 +325,4 @@ The SNAPSHOT JARs are hosted by the XGBoost project. Every commit in the ``maste
You can browse the file listing of the Maven repository at https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html. You can browse the file listing of the Maven repository at https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html.
To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix). To enable the GPU algorithm (``device='cuda'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).

View File

@ -34,27 +34,6 @@ General Parameters
- Which booster to use. Can be ``gbtree``, ``gblinear`` or ``dart``; ``gbtree`` and ``dart`` use tree based models while ``gblinear`` uses linear functions. - Which booster to use. Can be ``gbtree``, ``gblinear`` or ``dart``; ``gbtree`` and ``dart`` use tree based models while ``gblinear`` uses linear functions.
* ``verbosity`` [default=1]
- Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3
(debug). Sometimes XGBoost tries to change configurations based on heuristics, which
is displayed as warning message. If there's unexpected behaviour, please try to
increase value of verbosity.
* ``validate_parameters`` [default to ``false``, except for Python, R and CLI interface]
- When set to True, XGBoost will perform validation of input parameters to check whether
a parameter is used or not.
* ``nthread`` [default to maximum number of threads available if not set]
- Number of parallel threads used to run XGBoost. When choosing it, please keep thread
contention and hyperthreading in mind.
* ``disable_default_eval_metric`` [default= ``false``]
- Flag to disable default metric. Set to 1 or ``true`` to disable.
* ``device`` [default= ``cpu``] * ``device`` [default= ``cpu``]
.. versionadded:: 2.0.0 .. versionadded:: 2.0.0
@ -67,6 +46,29 @@ General Parameters
+ ``gpu``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently. + ``gpu``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently.
+ ``gpu:<ordinal>``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently. + ``gpu:<ordinal>``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently.
For more information about GPU acceleration, see :doc:`/gpu/index`. A brief example of the accepted ``device`` values is sketched after this parameter list.
* ``verbosity`` [default=1]
- Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3
(debug). Sometimes XGBoost tries to change configurations based on heuristics, which
is displayed as a warning message. If there's unexpected behaviour, please try
increasing the verbosity value.
* ``validate_parameters`` [default to ``false``, except for Python, R and CLI interface]
- When set to True, XGBoost will perform validation of input parameters to check whether
a parameter is used or not. A warning is emitted when an unknown parameter is passed.
* ``nthread`` [default to maximum number of threads available if not set]
- Number of parallel threads used to run XGBoost. When choosing it, please keep thread
contention and hyperthreading in mind.
* ``disable_default_eval_metric`` [default= ``false``]
- Flag to disable default metric. Set to 1 or ``true`` to disable.
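Returning to the ``device`` parameter described earlier in this list, a brief illustrative sketch of the accepted values; the ordinal assumes a multi-GPU machine.

```python
import xgboost as xgb

# Default CPU execution.
cpu_clf = xgb.XGBClassifier(tree_method="hist", device="cpu")
# Default CUDA device (ordinal 0).
gpu_clf = xgb.XGBClassifier(tree_method="hist", device="cuda")
# Pin the booster to a specific ordinal, here the second GPU.
gpu1_reg = xgb.XGBRegressor(tree_method="hist", device="cuda:1")
```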
Parameters for Tree Booster Parameters for Tree Booster
=========================== ===========================
* ``eta`` [default=0.3, alias: ``learning_rate``] * ``eta`` [default=0.3, alias: ``learning_rate``]
@ -160,7 +162,7 @@ Parameters for Tree Booster
- ``grow_colmaker``: non-distributed column-based construction of trees. - ``grow_colmaker``: non-distributed column-based construction of trees.
- ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting. - ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting.
- ``grow_quantile_histmaker``: Grow tree using quantized histogram. - ``grow_quantile_histmaker``: Grow tree using quantized histogram.
- ``grow_gpu_hist``: Grow tree with GPU. Same as setting ``tree_method`` to ``hist`` and use ``device=cuda``. - ``grow_gpu_hist``: Grow tree with GPU. Enabled when ``tree_method`` is set to ``hist`` along with ``device=cuda``.
- ``sync``: synchronizes trees in all distributed nodes. - ``sync``: synchronizes trees in all distributed nodes.
- ``refresh``: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed. - ``refresh``: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed.
- ``prune``: prunes the splits where loss < min_split_loss (or gamma) and nodes that have depth greater than ``max_depth``. - ``prune``: prunes the splits where loss < min_split_loss (or gamma) and nodes that have depth greater than ``max_depth``.

View File

@ -1,3 +1,5 @@
examples examples
dask-examples dask-examples
survival-examples survival-examples
gpu-examples
rmm-examples

View File

@ -17,3 +17,5 @@ Contents
examples/index examples/index
dask-examples/index dask-examples/index
survival-examples/index survival-examples/index
gpu-examples/index
rmm-examples/index

View File

@ -124,7 +124,7 @@ Following table summarizes some differences in supported features between 4 tree
`T` means supported while `F` means unsupported. `T` means supported while `F` means unsupported.
+------------------+-----------+---------------------+---------------------+------------------------+ +------------------+-----------+---------------------+---------------------+------------------------+
| | Exact | Approx | Hist | GPU Hist | | | Exact | Approx | Hist | Hist (GPU) |
+==================+===========+=====================+=====================+========================+ +==================+===========+=====================+=====================+========================+
| grow_policy | Depthwise | depthwise/lossguide | depthwise/lossguide | depthwise/lossguide | | grow_policy | Depthwise | depthwise/lossguide | depthwise/lossguide | depthwise/lossguide |
+------------------+-----------+---------------------+---------------------+------------------------+ +------------------+-----------+---------------------+---------------------+------------------------+
@ -141,5 +141,5 @@ Following table summarizes some differences in supported features between 4 tree
Features/parameters that are not mentioned here are universally supported for all 4 tree Features/parameters that are not mentioned here are universally supported for all 4 tree
methods (for instance, column sampling and constraints). The `P` in external memory means methods (for instance, column sampling and constraints). The `P` in external memory means
partially supported. Please note that both categorical data and external memory are special handling. Please note that both categorical data and external memory are
experimental. experimental.

View File

@ -35,8 +35,8 @@ parameter ``enable_categorical``:
.. code:: python .. code:: python
# Supported tree methods are `gpu_hist`, `approx`, and `hist`. # Supported tree methods are `approx` and `hist`.
clf = xgb.XGBClassifier(tree_method="gpu_hist", enable_categorical=True) clf = xgb.XGBClassifier(tree_method="hist", enable_categorical=True, device="cuda")
# X is the dataframe we created in previous snippet # X is the dataframe we created in previous snippet
clf.fit(X, y) clf.fit(X, y)
# Must use JSON/UBJSON for serialization, otherwise the information is lost. # Must use JSON/UBJSON for serialization, otherwise the information is lost.

View File

@ -81,7 +81,7 @@ constructor.
it = Iterator(["file_0.svm", "file_1.svm", "file_2.svm"]) it = Iterator(["file_0.svm", "file_1.svm", "file_2.svm"])
Xy = xgboost.DMatrix(it) Xy = xgboost.DMatrix(it)
# Other tree methods including ``hist`` and ``gpu_hist`` also work, but has some caveats # The ``approx`` tree method also works, but with lower performance. The GPU implementation
# as noted in following sections. # differs from the CPU one, as noted in the following sections.
booster = xgboost.train({"tree_method": "hist"}, Xy) booster = xgboost.train({"tree_method": "hist"}, Xy)
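For readers following this section, a self-contained sketch of the ``DataIter`` protocol that the ``Iterator`` above relies on, using in-memory batches in place of the SVM files for brevity; the class name and data are illustrative.

```python
import os
from typing import Callable, List

import numpy as np
import xgboost


class InMemoryIter(xgboost.DataIter):
    """Illustrative iterator over in-memory batches, standing in for files on disk."""

    def __init__(self, batches: List[np.ndarray], labels: List[np.ndarray]) -> None:
        self._batches = batches
        self._labels = labels
        self._it = 0
        # cache_prefix tells XGBoost where to place the external-memory cache.
        super().__init__(cache_prefix=os.path.join(".", "cache"))

    def next(self, input_data: Callable) -> int:
        if self._it == len(self._batches):
            return 0  # signal the end of the batches
        input_data(data=self._batches[self._it], label=self._labels[self._it])
        self._it += 1
        return 1  # more batches remain

    def reset(self) -> None:
        self._it = 0


rng = np.random.default_rng(0)
it = InMemoryIter(
    [rng.random((256, 8)) for _ in range(3)],
    [rng.random(256) for _ in range(3)],
)
Xy = xgboost.DMatrix(it)
booster = xgboost.train({"tree_method": "hist"}, Xy)
```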
@ -118,15 +118,15 @@ to reduce the overhead of file reading.
GPU Version (GPU Hist tree method) GPU Version (GPU Hist tree method)
********************************** **********************************
External memory is supported by GPU algorithms (i.e. when ``tree_method`` is set to External memory is supported by GPU algorithms (i.e. when ``device`` is set to
``gpu_hist``). However, the algorithm used for GPU is different from the one used for ``cuda``). However, the algorithm used for GPU is different from the one used for
CPU. When training on a CPU, the tree method iterates through all batches from external CPU. When training on a CPU, the tree method iterates through all batches from external
memory for each step of the tree construction algorithm. On the other hand, the GPU memory for each step of the tree construction algorithm. On the other hand, the GPU
algorithm uses a hybrid approach. It iterates through the data during the beginning of algorithm uses a hybrid approach. It iterates through the data during the beginning of
each iteration and concatenates all batches into one in GPU memory. To reduce overall each iteration and concatenates all batches into one in GPU memory for performance
memory usage, users can utilize subsampling. The GPU hist tree method supports reasons. To reduce overall memory usage, users can utilize subsampling. The GPU hist tree
`gradient-based sampling`, enabling users to set a low sampling rate without compromising method supports `gradient-based sampling`, enabling users to set a low sampling rate
accuracy. without compromising accuracy.
.. code-block:: python .. code-block:: python

View File

@ -83,13 +83,14 @@ Some other examples:
- ``(0,-1)``: No constraint on the first predictor and a decreasing constraint on the second. - ``(0,-1)``: No constraint on the first predictor and a decreasing constraint on the second.
**Note for the 'hist' tree construction algorithm**. .. note::
If ``tree_method`` is set to either ``hist``, ``approx`` or ``gpu_hist``, enabling
monotonic constraints may produce unnecessarily shallow trees. This is because the **Note for the 'hist' tree construction algorithm**. If ``tree_method`` is set to
``hist`` method reduces the number of candidate splits to be considered at each either ``hist`` or ``approx``, enabling monotonic constraints may produce unnecessarily
split. Monotonic constraints may wipe out all available split candidates, in which case no shallow trees. This is because the ``hist`` method reduces the number of candidate
split is made. To reduce the effect, you may want to increase the ``max_bin`` parameter to splits to be considered at each split. Monotonic constraints may wipe out all available
consider more split candidates. split candidates, in which case no split is made. To reduce the effect, you may want to
increase the ``max_bin`` parameter to consider more split candidates.
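A small illustration of the mitigation suggested in the note: combining monotonic constraints with a larger ``max_bin``. The data and parameter values are illustrative.

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.random((512, 2))
y = X[:, 0] - X[:, 1] + rng.normal(scale=0.1, size=512)

params = {
    "tree_method": "hist",
    # Increasing constraint on the first feature, decreasing on the second.
    "monotone_constraints": "(1,-1)",
    # Consider more split candidates to offset the pruning effect of the constraints.
    "max_bin": 512,
}
booster = xgb.train(params, xgb.DMatrix(X, label=y), num_boost_round=10)
```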
******************* *******************

View File

@ -38,10 +38,6 @@ There are in general two ways that you can control overfitting in XGBoost:
- This includes ``subsample`` and ``colsample_bytree``. - This includes ``subsample`` and ``colsample_bytree``.
- You can also reduce stepsize ``eta``. Remember to increase ``num_round`` when you do so. - You can also reduce stepsize ``eta``. Remember to increase ``num_round`` when you do so.
***************************
Faster training performance
***************************
There's a parameter called ``tree_method``, set it to ``hist`` or ``gpu_hist`` for faster computation.
************************* *************************
Handle Imbalanced Dataset Handle Imbalanced Dataset

View File

@ -50,13 +50,14 @@ Here is a sample parameter dictionary for training a random forest on a GPU usin
xgboost:: xgboost::
params = { params = {
'colsample_bynode': 0.8, "colsample_bynode": 0.8,
'learning_rate': 1, "learning_rate": 1,
'max_depth': 5, "max_depth": 5,
'num_parallel_tree': 100, "num_parallel_tree": 100,
'objective': 'binary:logistic', "objective": "binary:logistic",
'subsample': 0.8, "subsample": 0.8,
'tree_method': 'gpu_hist' "tree_method": "hist",
"device": "cuda",
} }
A random forest model can then be trained as follows:: A random forest model can then be trained as follows::
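The training call itself is unchanged by this hunk and therefore elided from the diff. As a hedged sketch of how such a parameter dictionary is typically used, a single boosting round with ``num_parallel_tree`` set produces the forest; the data below is illustrative.

```python
import numpy as np
import xgboost as xgb

X = np.random.rand(1_000, 10)
y = np.random.randint(0, 2, size=1_000)
dtrain = xgb.DMatrix(X, label=y)

params = {
    "colsample_bynode": 0.8,
    "learning_rate": 1,
    "max_depth": 5,
    "num_parallel_tree": 100,
    "objective": "binary:logistic",
    "subsample": 0.8,
    "tree_method": "hist",
    "device": "cuda",
}
# One boosting round with num_parallel_tree=100 produces a 100-tree forest.
bst = xgb.train(params, dtrain, num_boost_round=1)
```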

View File

@ -174,7 +174,7 @@ Will print out something similar to (not actual output as it's too long for demo
"gbtree_train_param": { "gbtree_train_param": {
"num_parallel_tree": "1", "num_parallel_tree": "1",
"process_type": "default", "process_type": "default",
"tree_method": "gpu_hist", "tree_method": "hist",
"updater": "grow_gpu_hist", "updater": "grow_gpu_hist",
"updater_seq": "grow_gpu_hist" "updater_seq": "grow_gpu_hist"
}, },
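For reference, such a configuration dump can be obtained from any trained booster; a minimal sketch with illustrative parameters.

```python
import json

import numpy as np
import xgboost as xgb

X, y = np.random.rand(100, 4), np.random.rand(100)
booster = xgb.train({"tree_method": "hist", "device": "cuda"}, xgb.DMatrix(X, label=y))

# save_config() returns the booster's internal configuration as a JSON string,
# including the resolved updater sequence shown above.
config = json.loads(booster.save_config())
print(json.dumps(config, indent=2))
```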

View File

@ -278,9 +278,15 @@ __model_doc = f"""
without bias. without bias.
device : Optional[str] device : Optional[str]
Device ordinal.
.. versionadded:: 2.0.0
Device ordinal, available options are `cpu`, `cuda`, and `gpu`.
validate_parameters : Optional[bool] validate_parameters : Optional[bool]
Give warnings for unknown parameter. Give warnings for unknown parameter.
enable_categorical : bool enable_categorical : bool
.. versionadded:: 1.5.0 .. versionadded:: 1.5.0

View File

@ -144,8 +144,13 @@ class SparkXGBRegressor(_SparkXGBEstimator):
.. deprecated:: 2.0.0 .. deprecated:: 2.0.0
Use `device` instead. Use `device` instead.
device: device:
.. versionadded:: 2.0.0
Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`. Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.
force_repartition: force_repartition:
Boolean value to specify if forcing the input dataset to be repartitioned Boolean value to specify if forcing the input dataset to be repartitioned
before XGBoost training. before XGBoost training.
@ -319,8 +324,13 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
.. deprecated:: 2.0.0 .. deprecated:: 2.0.0
Use `device` instead. Use `device` instead.
device: device:
.. versionadded:: 2.0.0
Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`. Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.
force_repartition: force_repartition:
Boolean value to specify if forcing the input dataset to be repartitioned Boolean value to specify if forcing the input dataset to be repartitioned
before XGBoost training. before XGBoost training.
@ -497,8 +507,13 @@ class SparkXGBRanker(_SparkXGBEstimator):
.. deprecated:: 2.0.0 .. deprecated:: 2.0.0
Use `device` instead. Use `device` instead.
device: device:
.. versionadded:: 2.0.0
Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`. Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.
force_repartition: force_repartition:
Boolean value to specify if forcing the input dataset to be repartitioned Boolean value to specify if forcing the input dataset to be repartitioned
before XGBoost training. before XGBoost training.
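As a hedged illustration of the ``device`` parameter these docstrings describe; the tiny DataFrame and the single worker are illustrative, and a GPU-enabled Spark cluster is assumed.

```python
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from xgboost.spark import SparkXGBClassifier

spark = SparkSession.builder.getOrCreate()
# A tiny illustrative dataset with a vector `features` column and a `label` column.
train_df = spark.createDataFrame(
    [(Vectors.dense([1.0, 2.0]), 0), (Vectors.dense([2.0, 1.0]), 1)],
    ["features", "label"],
)

# Each Spark worker trains on a GPU selected via the `device` parameter.
clf = SparkXGBClassifier(device="cuda", num_workers=1)
model = clf.fit(train_df)
predictions = model.transform(train_df)
```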

View File

@ -724,11 +724,15 @@ void MetaInfo::SynchronizeNumberOfColumns() {
namespace { namespace {
template <typename T> template <typename T>
void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) { void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) {
CHECK(v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device) bool valid =
<< "Data is resided on a different device than `gpu_id`. " v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device;
<< "Device that data is on: " << v.DeviceIdx() << ", " if (!valid) {
<< "`gpu_id` for XGBoost: " << device; LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
"the booster. The device ordinal of the data is: "
<< v.DeviceIdx() << "; the device ordinal of the Booster is: " << device;
} }
}
template <typename T, std::int32_t D> template <typename T, std::int32_t D>
void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) { void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) {
CheckDevice(device, *v.Data()); CheckDevice(device, *v.Data());

View File

@ -42,22 +42,22 @@ DMLC_REGISTRY_FILE_TAG(gbtree);
namespace { namespace {
/** @brief Map the `tree_method` parameter to the `updater` parameter. */ /** @brief Map the `tree_method` parameter to the `updater` parameter. */
std::string MapTreeMethodToUpdaters(Context const* ctx_, TreeMethod tree_method) { std::string MapTreeMethodToUpdaters(Context const* ctx, TreeMethod tree_method) {
// Choose updaters according to tree_method parameters // Choose updaters according to tree_method parameters
if (ctx->IsCUDA()) {
common::AssertGPUSupport();
}
switch (tree_method) { switch (tree_method) {
case TreeMethod::kAuto: // Use hist as default in 2.0 case TreeMethod::kAuto: // Use hist as default in 2.0
case TreeMethod::kHist: { case TreeMethod::kHist: {
return ctx_->DispatchDevice([] { return "grow_quantile_histmaker"; }, return ctx->DispatchDevice([] { return "grow_quantile_histmaker"; },
[] { [] { return "grow_gpu_hist"; });
common::AssertGPUSupport();
return "grow_gpu_hist";
});
} }
case TreeMethod::kApprox: case TreeMethod::kApprox:
CHECK(ctx_->IsCPU()) << "The `approx` tree method is not supported on GPU."; CHECK(ctx->IsCPU()) << "The `approx` tree method is not supported on GPU.";
return "grow_histmaker"; return "grow_histmaker";
case TreeMethod::kExact: case TreeMethod::kExact:
CHECK(ctx_->IsCPU()) << "The `exact` tree method is not supported on GPU."; CHECK(ctx->IsCPU()) << "The `exact` tree method is not supported on GPU.";
return "grow_colmaker,prune"; return "grow_colmaker,prune";
case TreeMethod::kGPUHist: { case TreeMethod::kGPUHist: {
common::AssertGPUSupport(); common::AssertGPUSupport();
@ -150,6 +150,7 @@ void GBTree::Configure(Args const& cfg) {
CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto) CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
<< "Only the hist tree method is supported for building multi-target trees with vector " << "Only the hist tree method is supported for building multi-target trees with vector "
"leaf."; "leaf.";
CHECK(ctx_->IsCPU()) << "GPU is not yet supported for vector leaf.";
} }
LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method); LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);

View File

@ -29,10 +29,12 @@ class LintersPaths:
"tests/python-gpu/load_pickle.py", "tests/python-gpu/load_pickle.py",
"tests/python-gpu/test_gpu_pickling.py", "tests/python-gpu/test_gpu_pickling.py",
"tests/python-gpu/test_gpu_eval_metrics.py", "tests/python-gpu/test_gpu_eval_metrics.py",
"tests/python-gpu/test_gpu_with_sklearn.py",
"tests/test_distributed/test_with_spark/", "tests/test_distributed/test_with_spark/",
"tests/test_distributed/test_gpu_with_spark/", "tests/test_distributed/test_gpu_with_spark/",
# demo # demo
"demo/dask/", "demo/dask/",
"demo/rmm_plugin",
"demo/json-model/json_parser.py", "demo/json-model/json_parser.py",
"demo/guide-python/cat_in_the_dat.py", "demo/guide-python/cat_in_the_dat.py",
"demo/guide-python/categorical.py", "demo/guide-python/categorical.py",

View File

@ -234,7 +234,7 @@ Arrow specification.'''
cp.cuda.runtime.setDevice(0) cp.cuda.runtime.setDevice(0)
dtrain = dmatrix_from_cupy(np.float32, xgb.QuantileDMatrix, np.nan) dtrain = dmatrix_from_cupy(np.float32, xgb.QuantileDMatrix, np.nan)
with pytest.raises( with pytest.raises(
xgb.core.XGBoostError, match="Data is resided on a different device" xgb.core.XGBoostError, match="Invalid device ordinal"
): ):
xgb.train( xgb.train(
{'tree_method': 'gpu_hist', 'gpu_id': 1}, dtrain, num_boost_round=10 {'tree_method': 'gpu_hist', 'gpu_id': 1}, dtrain, num_boost_round=10

View File

@ -2,6 +2,7 @@ import json
import os import os
import sys import sys
import tempfile import tempfile
from concurrent.futures import ThreadPoolExecutor
import numpy as np import numpy as np
import pytest import pytest
@ -23,18 +24,19 @@ def test_gpu_binary_classification():
from sklearn.model_selection import KFold from sklearn.model_selection import KFold
digits = load_digits(n_class=2) digits = load_digits(n_class=2)
y = digits['target'] y = digits["target"]
X = digits['data'] X = digits["data"]
kf = KFold(n_splits=2, shuffle=True, random_state=rng) kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for cls in (xgb.XGBClassifier, xgb.XGBRFClassifier): for cls in (xgb.XGBClassifier, xgb.XGBRFClassifier):
for train_index, test_index in kf.split(X, y): for train_index, test_index in kf.split(X, y):
xgb_model = cls( xgb_model = cls(
random_state=42, tree_method='gpu_hist', random_state=42, tree_method="gpu_hist", n_estimators=4, gpu_id="0"
n_estimators=4, gpu_id='0').fit(X[train_index], y[train_index]) ).fit(X[train_index], y[train_index])
preds = xgb_model.predict(X[test_index]) preds = xgb_model.predict(X[test_index])
labels = y[test_index] labels = y[test_index]
err = sum(1 for i in range(len(preds)) err = sum(
if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]
) / float(len(preds))
assert err < 0.1 assert err < 0.1
@ -133,7 +135,7 @@ def test_classififer():
X, y = load_digits(return_X_y=True) X, y = load_digits(return_X_y=True)
y *= 10 y *= 10
clf = xgb.XGBClassifier(tree_method="gpu_hist", n_estimators=1) clf = xgb.XGBClassifier(tree_method="hist", n_estimators=1, device="cuda")
# numpy # numpy
with pytest.raises(ValueError, match=r"Invalid classes.*"): with pytest.raises(ValueError, match=r"Invalid classes.*"):
@ -161,3 +163,46 @@ def test_ranking_qid_df():
import cudf import cudf
run_ranking_qid_df(cudf, "gpu_hist") run_ranking_qid_df(cudf, "gpu_hist")
@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.mgpu
def test_device_ordinal() -> None:
    import cupy as cp

    n_devices = 2

    def worker(ordinal: int, correct_ordinal: bool) -> None:
        if correct_ordinal:
            cp.cuda.runtime.setDevice(ordinal)
        else:
            cp.cuda.runtime.setDevice((ordinal + 1) % n_devices)

        X, y, w = tm.make_regression(4096, 12, use_cupy=True)
        reg = xgb.XGBRegressor(device=f"cuda:{ordinal}", tree_method="hist")
        if correct_ordinal:
            reg.fit(
                X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w]
            )
            assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])
            return

        with pytest.raises(ValueError, match="Invalid device ordinal"):
            reg.fit(
                X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w]
            )

    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        futures = []
        n_trials = 32
        for i in range(n_trials):
            fut = executor.submit(
                worker, ordinal=i % n_devices, correct_ordinal=i % 3 != 0
            )
            futures.append(fut)
        for fut in futures:
            fut.result()

    cp.cuda.runtime.setDevice(0)