Document for device ordinal. (#9398)
- Rewrite GPU demos. The notebook is converted to a script to avoid committing additional png plots.
- Add GPU demos into the sphinx gallery.
- Add RMM demos into the sphinx gallery.
- Test for firing threads with different device ordinals.
This commit is contained in: parent 22b0a55a04, commit 275da176ba
@@ -53,15 +53,7 @@ int main() {
  // configure the training
  // available parameters are described here:
  // https://xgboost.readthedocs.io/en/latest/parameter.html
  safe_xgboost(XGBoosterSetParam(booster, "tree_method", use_gpu ? "gpu_hist" : "hist"));
  if (use_gpu) {
    // set the GPU to use;
    // this is not necessary, but provided here as an illustration
    safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "0"));
  } else {
    // avoid evaluating objective and metric on a GPU
    safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "-1"));
  }
  safe_xgboost(XGBoosterSetParam(booster, "device", use_gpu ? "cuda" : "cpu"));

  safe_xgboost(XGBoosterSetParam(booster, "objective", "binary:logistic"));
  safe_xgboost(XGBoosterSetParam(booster, "min_child_weight", "1"));
@@ -1,5 +0,0 @@
# GPU Acceleration Demo

`cover_type.py` shows how to train a model on the [forest cover type](https://archive.ics.uci.edu/ml/datasets/covertype) dataset using GPU acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it time consuming to process. We compare the run-time and accuracy of the GPU and CPU histogram algorithms.

`shap.ipynb` demonstrates using GPU acceleration to compute SHAP values for feature importance.
demo/gpu_acceleration/README.rst | 8 (new file)
@@ -0,0 +1,8 @@
:orphan:

GPU Acceleration Demo
=====================

This is a collection of demonstration scripts to showcase the basic usage of GPU. Please
see :doc:`/gpu/index` for more info. There are other demonstrations for distributed GPU
training using dask or spark.
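A quick, hedged sketch of the basic GPU usage this README points to (not part of the commit; the synthetic data, parameter values, and round count are assumptions for illustration)::

    import numpy as np
    import xgboost as xgb

    # Synthetic stand-in data; any real dataset works the same way.
    rng = np.random.default_rng(0)
    X = rng.standard_normal((1024, 16))
    y = rng.integers(0, 2, size=1024)

    # Since 2.0, GPU training is requested with device="cuda" plus the hist tree method.
    booster = xgb.train(
        {"tree_method": "hist", "device": "cuda", "objective": "binary:logistic"},
        xgb.DMatrix(X, label=y),
        num_boost_round=10,
    )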
@@ -1,41 +1,49 @@
"""
Using xgboost on GPU devices
============================

Shows how to train a model on the `forest cover type
<https://archive.ics.uci.edu/ml/datasets/covertype>`_ dataset using GPU
acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it
time consuming to process. We compare the run-time and accuracy of the GPU and CPU
histogram algorithms.

In addition, the demo showcases using GPU with other GPU-related libraries including
cupy and cuml. These libraries are not strictly required.

"""
import time

import cupy as cp
from cuml.model_selection import train_test_split
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split

import xgboost as xgb

# Fetch dataset using sklearn
cov = fetch_covtype()
X = cov.data
y = cov.target
X, y = fetch_covtype(return_X_y=True)
X = cp.array(X)
y = cp.array(y)
y -= y.min()

# Create 0.75/0.25 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75,
                                                    random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, train_size=0.75, random_state=42
)

# Specify sufficient boosting iterations to reach a minimum
num_round = 3000

# Leave most parameters as default
param = {'objective': 'multi:softmax', # Specify multiclass classification
         'num_class': 8, # Number of possible output classes
         'tree_method': 'gpu_hist' # Use GPU accelerated algorithm
         }

# Convert input data from numpy to XGBoost format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

gpu_res = {} # Store accuracy result
tmp = time.time()
clf = xgb.XGBClassifier(device="cuda", n_estimators=num_round)
# Train model
xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=gpu_res)
print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))
start = time.time()
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
gpu_res = clf.evals_result()
print("GPU Training Time: %s seconds" % (str(time.time() - start)))

# Repeat for CPU algorithm
tmp = time.time()
param['tree_method'] = 'hist'
cpu_res = {}
xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=cpu_res)
print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))
clf = xgb.XGBClassifier(device="cpu", n_estimators=num_round)
start = time.time()
cpu_res = clf.evals_result()
print("CPU Training Time: %s seconds" % (str(time.time() - start)))
File diff suppressed because one or more lines are too long
demo/gpu_acceleration/tree_shap.py | 55 (new file)
@@ -0,0 +1,55 @@
"""
Use GPU to speed up SHAP value computation
==========================================

Demonstrates using GPU acceleration to compute SHAP values for feature importance.

"""
import shap
from sklearn.datasets import fetch_california_housing

import xgboost as xgb

# Fetch dataset using sklearn
data = fetch_california_housing()
print(data.DESCR)
X = data.data
y = data.target

num_round = 500

param = {
    "eta": 0.05,
    "max_depth": 10,
    "tree_method": "hist",
    "device": "cuda",
}

# GPU accelerated training
dtrain = xgb.DMatrix(X, label=y, feature_names=data.feature_names)
model = xgb.train(param, dtrain, num_round)

# Compute shap values using GPU with xgboost
model.set_param({"device": "cuda"})
shap_values = model.predict(dtrain, pred_contribs=True)

# Compute shap interaction values using GPU
shap_interaction_values = model.predict(dtrain, pred_interactions=True)


# shap will call the GPU accelerated version as long as the device parameter is set to
# "cuda"
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# visualize the first prediction's explanation
shap.force_plot(
    explainer.expected_value,
    shap_values[0, :],
    X[0, :],
    feature_names=data.feature_names,
    matplotlib=True,
)

# Show a summary of feature importance
shap.summary_plot(shap_values, X, plot_type="bar", feature_names=data.feature_names)
@@ -70,8 +70,7 @@ class XGBoostTrainer(Executor):
        param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
        if self._use_gpus:
            self.log_info(fl_ctx, f'Training with GPU {rank}')
            param['tree_method'] = 'gpu_hist'
            param['gpu_id'] = rank
            param['device'] = f"cuda:{rank}"

        # Specify validations set to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
@@ -1,47 +0,0 @@
Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL)
====================================================================
[RAPIDS Memory Manager (RMM)](https://github.com/rapidsai/rmm) library provides a collection of
efficient memory allocators for NVIDIA GPUs. It is now possible to use XGBoost with memory
allocators provided by RMM, by enabling the RMM integration plugin.

The demos in this directory highlights one RMM allocator in particular: **the pool sub-allocator**.
This allocator addresses the slow speed of `cudaMalloc()` by allocating a large chunk of memory
upfront. Subsequent allocations will draw from the pool of already allocated memory and thus avoid
the overhead of calling `cudaMalloc()` directly. See
[this GTC talk slides](https://on-demand.gputechconf.com/gtc/2015/presentation/S5530-Stephen-Jones.pdf)
for more details.

Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To do this,
run CMake with option `-DPLUGIN_RMM=ON` (`-DUSE_CUDA=ON` also required):
```
cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON
make -j4
```
CMake will attempt to locate the RMM library in your build environment. You may choose to build
RMM from the source, or install it using the Conda package manager. If CMake cannot find RMM, you
should specify the location of RMM with the CMake prefix:
```
# If using Conda:
cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
# If using RMM installed with a custom location
cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm
```

# Informing XGBoost about RMM pool

When XGBoost is compiled with RMM, most of the large size allocation will go through RMM
allocators, but some small allocations in performance critical areas are using a different
caching allocator so that we can have better control over memory allocation behavior.
Users can override this behavior and force the use of rmm for all allocations by setting
the global configuration ``use_rmm``:

``` python
with xgb.config_context(use_rmm=True):
    clf = xgb.XGBClassifier(tree_method="gpu_hist")
```

Depending on the choice of memory pool size or type of allocator, this may have negative
performance impact.

* [Using RMM with a single GPU](./rmm_singlegpu.py)
* [Using RMM with a local Dask cluster consisting of multiple GPUs](./rmm_mgpu_with_dask.py)
demo/rmm_plugin/README.rst | 51 (new file)
@@ -0,0 +1,51 @@
Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL)
====================================================================

`RAPIDS Memory Manager (RMM) <https://github.com/rapidsai/rmm>`__ library provides a
collection of efficient memory allocators for NVIDIA GPUs. It is now possible to use
XGBoost with memory allocators provided by RMM, by enabling the RMM integration plugin.

The demos in this directory highlight one RMM allocator in particular: **the pool
sub-allocator**. This allocator addresses the slow speed of ``cudaMalloc()`` by
allocating a large chunk of memory upfront. Subsequent allocations will draw from the pool
of already allocated memory and thus avoid the overhead of calling ``cudaMalloc()``
directly. See `this GTC talk slides
<https://on-demand.gputechconf.com/gtc/2015/presentation/S5530-Stephen-Jones.pdf>`_ for
more details.

Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To do this,
run CMake with option ``-DPLUGIN_RMM=ON`` (``-DUSE_CUDA=ON`` also required):

.. code-block:: sh

  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON
  make -j$(nproc)

CMake will attempt to locate the RMM library in your build environment. You may choose to build
RMM from the source, or install it using the Conda package manager. If CMake cannot find RMM, you
should specify the location of RMM with the CMake prefix:

.. code-block:: sh

  # If using Conda:
  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
  # If using RMM installed with a custom location
  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm

********************************
Informing XGBoost about RMM pool
********************************

When XGBoost is compiled with RMM, most of the large memory allocations will go through RMM
allocators, but some small allocations in performance critical areas are using a different
caching allocator so that we can have better control over memory allocation behavior.
Users can override this behavior and force the use of RMM for all allocations by setting
the global configuration ``use_rmm``:

.. code-block:: python

  with xgb.config_context(use_rmm=True):
      clf = xgb.XGBClassifier(tree_method="hist", device="cuda")

Depending on the choice of memory pool size or type of allocator, this may have negative
performance impact.
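To make the pool sub-allocator described above concrete, here is a hedged sketch (not part of the commit) that reserves an RMM pool before training; the pool size and synthetic data are assumptions for illustration::

    import rmm
    import xgboost as xgb
    from sklearn.datasets import make_classification

    # Reserve a memory pool up front so later allocations avoid repeated cudaMalloc calls.
    # The 2 GiB pool size is an arbitrary choice for this sketch.
    rmm.reinitialize(pool_allocator=True, initial_pool_size=2 * 1024**3)

    X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
    dtrain = xgb.DMatrix(X, label=y)

    # Route XGBoost's own allocations through RMM as well.
    with xgb.config_context(use_rmm=True):
        xgb.train(
            {
                "tree_method": "hist",
                "device": "cuda",
                "objective": "multi:softprob",
                "num_class": 3,
            },
            dtrain,
            num_boost_round=10,
        )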
@@ -1,3 +1,7 @@
"""
Using rmm with Dask
===================
"""
import dask
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
@@ -11,25 +15,33 @@ def main(client):
    # xgb.set_config(use_rmm=True)

    X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
    # In practice one should prefer loading the data with dask collections instead of using
    # `from_array`.
    # In practice one should prefer loading the data with dask collections instead of
    # using `from_array`.
    X = dask.array.from_array(X)
    y = dask.array.from_array(y)
    dtrain = xgb.dask.DaskDMatrix(client, X, label=y)

    params = {'max_depth': 8, 'eta': 0.01, 'objective': 'multi:softprob', 'num_class': 3,
              'tree_method': 'gpu_hist', 'eval_metric': 'merror'}
    output = xgb.dask.train(client, params, dtrain, num_boost_round=100,
                            evals=[(dtrain, 'train')])
    bst = output['booster']
    history = output['history']
    for i, e in enumerate(history['train']['merror']):
        print(f'[{i}] train-merror: {e}')
    params = {
        "max_depth": 8,
        "eta": 0.01,
        "objective": "multi:softprob",
        "num_class": 3,
        "tree_method": "hist",
        "eval_metric": "merror",
        "device": "cuda",
    }
    output = xgb.dask.train(
        client, params, dtrain, num_boost_round=100, evals=[(dtrain, "train")]
    )
    bst = output["booster"]
    history = output["history"]
    for i, e in enumerate(history["train"]["merror"]):
        print(f"[{i}] train-merror: {e}")


if __name__ == '__main__':
    # To use RMM pool allocator with a GPU Dask cluster, just add rmm_pool_size option to
    # LocalCUDACluster constructor.
    with LocalCUDACluster(rmm_pool_size='2GB') as cluster:
if __name__ == "__main__":
    # To use RMM pool allocator with a GPU Dask cluster, just add rmm_pool_size option
    # to LocalCUDACluster constructor.
    with LocalCUDACluster(rmm_pool_size="2GB") as cluster:
        with Client(cluster) as client:
            main(client)
@@ -1,3 +1,7 @@
"""
Using rmm on a single node device
=================================
"""
import rmm
from sklearn.datasets import make_classification

@@ -16,7 +20,8 @@ params = {
    "eta": 0.01,
    "objective": "multi:softprob",
    "num_class": 3,
    "tree_method": "gpu_hist",
    "tree_method": "hist",
    "device": "cuda",
}
# XGBoost will automatically use the RMM pool allocator
bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, "train")])
doc/.gitignore (vendored) | 2
@@ -6,3 +6,5 @@ doxygen
parser.py
*.pyc
web-data
# generated by doxygen
tmp
doc/conf.py | 11
@@ -19,7 +19,6 @@ import sys
import tarfile
import urllib.request
import warnings
from subprocess import call
from urllib.error import HTTPError

from sh.contrib import git
@@ -148,12 +147,20 @@ extensions = [

sphinx_gallery_conf = {
    # path to your example scripts
    "examples_dirs": ["../demo/guide-python", "../demo/dask", "../demo/aft_survival"],
    "examples_dirs": [
        "../demo/guide-python",
        "../demo/dask",
        "../demo/aft_survival",
        "../demo/gpu_acceleration",
        "../demo/rmm_plugin"
    ],
    # path to where to save gallery generated output
    "gallery_dirs": [
        "python/examples",
        "python/dask-examples",
        "python/survival-examples",
        "python/gpu-examples",
        "python/rmm-examples",
    ],
    "matplotlib_animations": True,
}
@@ -23,20 +23,19 @@ The GPU algorithms currently work with CLI, Python, R, and JVM packages. See :do
  :caption: Python example

  params = dict()
  params["device"] = "cuda:0"
  params["device"] = "cuda"
  params["tree_method"] = "hist"
  Xy = xgboost.QuantileDMatrix(X, y)
  xgboost.train(params, Xy)

.. code-block:: python
  :caption: With Scikit-Learn interface
  :caption: With the Scikit-Learn interface

  XGBRegressor(tree_method="hist", device="cuda")


GPU-Accelerated SHAP values
=============================
XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as a backend for computing shap values when the GPU predictor is selected.
XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as a backend for computing shap values when the GPU is used.

.. code-block:: python

@@ -44,12 +43,12 @@ XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as
  shap_values = booster.predict(dtrain, pred_contribs=True)
  shap_interaction_values = model.predict(dtrain, pred_interactions=True)

See examples `here <https://github.com/dmlc/xgboost/tree/master/demo/gpu_acceleration>`__.
See :ref:`sphx_glr_python_gpu-examples_tree_shap.py` for a worked example.

Multi-node Multi-GPU Training
=============================

XGBoost supports fully distributed GPU training using `Dask <https://dask.org/>`_, ``Spark`` and ``PySpark``. For getting started with Dask see our tutorial :doc:`/tutorials/dask` and worked examples `here <https://github.com/dmlc/xgboost/tree/master/demo/dask>`__, also Python documentation :ref:`dask_api` for complete reference. For usage with ``Spark`` using Scala see :doc:`/jvm/xgboost4j_spark_gpu_tutorial`. Lastly for distributed GPU training with ``PySpark``, see :doc:`/tutorials/spark_estimator`.
XGBoost supports fully distributed GPU training using `Dask <https://dask.org/>`_, ``Spark`` and ``PySpark``. For getting started with Dask see our tutorial :doc:`/tutorials/dask` and worked examples :doc:`/python/dask-examples/index`, also Python documentation :ref:`dask_api` for complete reference. For usage with ``Spark`` using Scala see :doc:`/jvm/xgboost4j_spark_gpu_tutorial`. Lastly for distributed GPU training with ``PySpark``, see :doc:`/tutorials/spark_estimator`.


Memory usage
@@ -67,7 +66,8 @@ If you are getting out-of-memory errors on a big dataset, try the or :py:class:`

CPU-GPU Interoperability
========================
XGBoost models trained on GPUs can be used on CPU-only systems to generate predictions. For information about how to save and load an XGBoost model, see :doc:`/tutorials/saving_model`.

The model can be used on any device regardless of the one used to train it. For instance, a model trained using GPU can still work on a CPU-only machine and vice versa. For more information about model serialization, see :doc:`/tutorials/saving_model`.


Developer notes
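A hedged sketch of the CPU-GPU interoperability described above (not part of the commit): train with ``device="cuda"``, serialize, then load and predict with ``device="cpu"``; the data and the ``model.json`` file name are assumptions for illustration::

    import numpy as np
    import xgboost as xgb

    X = np.random.default_rng(0).standard_normal((256, 8))
    y = (X[:, 0] > 0).astype(np.float32)

    # Train on the GPU and serialize the model (file name is arbitrary).
    booster = xgb.train(
        {"tree_method": "hist", "device": "cuda", "objective": "binary:logistic"},
        xgb.DMatrix(X, label=y),
        num_boost_round=10,
    )
    booster.save_model("model.json")

    # Later, possibly on a CPU-only machine: load the model and predict on the CPU.
    cpu_booster = xgb.Booster(model_file="model.json")
    cpu_booster.set_param({"device": "cpu"})
    predictions = cpu_booster.predict(xgb.DMatrix(X))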
@@ -189,7 +189,7 @@ This will check out the latest stable version from the Maven Central.

For the latest release version number, please check `release page <https://github.com/dmlc/xgboost/releases>`_.

To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
To enable the GPU algorithm (``device='cuda'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).


.. note:: Windows not supported in the JVM package
@@ -325,4 +325,4 @@ The SNAPSHOT JARs are hosted by the XGBoost project. Every commit in the ``maste

You can browse the file listing of the Maven repository at https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html.

To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
To enable the GPU algorithm (``device='cuda'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
@@ -34,27 +34,6 @@ General Parameters

  - Which booster to use. Can be ``gbtree``, ``gblinear`` or ``dart``; ``gbtree`` and ``dart`` use tree based models while ``gblinear`` uses linear functions.

* ``verbosity`` [default=1]

  - Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3
    (debug). Sometimes XGBoost tries to change configurations based on heuristics, which
    is displayed as warning message. If there's unexpected behaviour, please try to
    increase value of verbosity.

* ``validate_parameters`` [default to ``false``, except for Python, R and CLI interface]

  - When set to True, XGBoost will perform validation of input parameters to check whether
    a parameter is used or not.

* ``nthread`` [default to maximum number of threads available if not set]

  - Number of parallel threads used to run XGBoost. When choosing it, please keep thread
    contention and hyperthreading in mind.

* ``disable_default_eval_metric`` [default= ``false``]

  - Flag to disable default metric. Set to 1 or ``true`` to disable.

* ``device`` [default= ``cpu``]

  .. versionadded:: 2.0.0
@@ -67,6 +46,29 @@ General Parameters
  + ``gpu``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently.
  + ``gpu:<ordinal>``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently.

  For more information about GPU acceleration, see :doc:`/gpu/index`.

* ``verbosity`` [default=1]

  - Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3
    (debug). Sometimes XGBoost tries to change configurations based on heuristics, which
    is displayed as warning message. If there's unexpected behaviour, please try to
    increase value of verbosity.

* ``validate_parameters`` [default to ``false``, except for Python, R and CLI interface]

  - When set to True, XGBoost will perform validation of input parameters to check whether
    a parameter is used or not. A warning is emitted when there's an unknown parameter.

* ``nthread`` [default to maximum number of threads available if not set]

  - Number of parallel threads used to run XGBoost. When choosing it, please keep thread
    contention and hyperthreading in mind.

* ``disable_default_eval_metric`` [default= ``false``]

  - Flag to disable default metric. Set to 1 or ``true`` to disable.

Parameters for Tree Booster
===========================
* ``eta`` [default=0.3, alias: ``learning_rate``]
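A hedged sketch of the ``device`` values documented in the hunk above (not part of the commit); ``cuda:1`` assumes a machine with at least two GPUs::

    import xgboost as xgb

    # "cuda" picks the default GPU ordinal; "cuda:1" requests an explicit ordinal.
    reg_cpu = xgb.XGBRegressor(tree_method="hist", device="cpu")
    reg_gpu = xgb.XGBRegressor(tree_method="hist", device="cuda")
    reg_gpu_1 = xgb.XGBRegressor(tree_method="hist", device="cuda:1")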
@@ -160,7 +162,7 @@ Parameters for Tree Booster
  - ``grow_colmaker``: non-distributed column-based construction of trees.
  - ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting.
  - ``grow_quantile_histmaker``: Grow tree using quantized histogram.
  - ``grow_gpu_hist``: Grow tree with GPU. Same as setting ``tree_method`` to ``hist`` and use ``device=cuda``.
  - ``grow_gpu_hist``: Grow tree with GPU. Enabled when ``tree_method`` is set to ``hist`` along with ``device=cuda``.
  - ``sync``: synchronizes trees in all distributed nodes.
  - ``refresh``: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed.
  - ``prune``: prunes the splits where loss < min_split_loss (or gamma) and nodes that have depth greater than ``max_depth``.
doc/python/.gitignore (vendored) | 4
@@ -1,3 +1,5 @@
examples
dask-examples
survival-examples
survival-examples
gpu-examples
rmm-examples
@@ -17,3 +17,5 @@ Contents
  examples/index
  dask-examples/index
  survival-examples/index
  gpu-examples/index
  rmm-examples/index

@@ -124,7 +124,7 @@ Following table summarizes some differences in supported features between 4 tree
`T` means supported while `F` means unsupported.

+------------------+-----------+---------------------+---------------------+------------------------+
|                  | Exact     | Approx              | Hist                | GPU Hist               |
|                  | Exact     | Approx              | Hist                | Hist (GPU)             |
+==================+===========+=====================+=====================+========================+
| grow_policy      | Depthwise | depthwise/lossguide | depthwise/lossguide | depthwise/lossguide    |
+------------------+-----------+---------------------+---------------------+------------------------+
@@ -141,5 +141,5 @@ Following table summarizes some differences in supported features between 4 tree

Features/parameters that are not mentioned here are universally supported for all 4 tree
methods (for instance, column sampling and constraints). The `P` in external memory means
partially supported. Please note that both categorical data and external memory are
special handling. Please note that both categorical data and external memory are
experimental.

@@ -35,8 +35,8 @@ parameter ``enable_categorical``:

.. code:: python

  # Supported tree methods are `gpu_hist`, `approx`, and `hist`.
  clf = xgb.XGBClassifier(tree_method="gpu_hist", enable_categorical=True)
  # Supported tree methods are `approx` and `hist`.
  clf = xgb.XGBClassifier(tree_method="hist", enable_categorical=True, device="cuda")
  # X is the dataframe we created in previous snippet
  clf.fit(X, y)
  # Must use JSON/UBJSON for serialization, otherwise the information is lost.
@@ -81,7 +81,7 @@ constructor.
  it = Iterator(["file_0.svm", "file_1.svm", "file_2.svm"])
  Xy = xgboost.DMatrix(it)

  # Other tree methods including ``hist`` and ``gpu_hist`` also work, but has some caveats
  # The ``approx`` also work, but with low performance. GPU implementation is different from CPU.
  # as noted in following sections.
  booster = xgboost.train({"tree_method": "hist"}, Xy)

@@ -118,15 +118,15 @@ to reduce the overhead of file reading.
GPU Version (GPU Hist tree method)
**********************************

External memory is supported by GPU algorithms (i.e. when ``tree_method`` is set to
``gpu_hist``). However, the algorithm used for GPU is different from the one used for
External memory is supported by GPU algorithms (i.e. when ``device`` is set to
``cuda``). However, the algorithm used for GPU is different from the one used for
CPU. When training on a CPU, the tree method iterates through all batches from external
memory for each step of the tree construction algorithm. On the other hand, the GPU
algorithm uses a hybrid approach. It iterates through the data during the beginning of
each iteration and concatenates all batches into one in GPU memory. To reduce overall
memory usage, users can utilize subsampling. The GPU hist tree method supports
`gradient-based sampling`, enabling users to set a low sampling rate without compromising
accuracy.
each iteration and concatenates all batches into one in GPU memory for performance
reasons. To reduce overall memory usage, users can utilize subsampling. The GPU hist tree
method supports `gradient-based sampling`, enabling users to set a low sampling rate
without compromising accuracy.

.. code-block:: python

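A hedged sketch of the gradient-based sampling mentioned above (not part of the commit); the in-memory data stands in for an external-memory ``DMatrix``, and the ``subsample`` value is an arbitrary illustration::

    import numpy as np
    import xgboost as xgb

    # In-memory stand-in data; with external memory this DMatrix would be built
    # from a data iterator instead.
    rng = np.random.default_rng(0)
    X = rng.standard_normal((4096, 32))
    y = rng.standard_normal(4096)
    Xy = xgb.DMatrix(X, label=y)

    params = {
        "tree_method": "hist",
        "device": "cuda",
        "objective": "reg:squarederror",
        # Gradient-based sampling lets a low subsample rate work without a large
        # accuracy loss; 0.2 is an arbitrary illustrative value.
        "sampling_method": "gradient_based",
        "subsample": 0.2,
    }
    booster = xgb.train(params, Xy, num_boost_round=10)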
@@ -83,13 +83,14 @@ Some other examples:
- ``(0,-1)``: No constraint on the first predictor and a decreasing constraint on the second.


**Note for the 'hist' tree construction algorithm**.
If ``tree_method`` is set to either ``hist``, ``approx`` or ``gpu_hist``, enabling
monotonic constraints may produce unnecessarily shallow trees. This is because the
``hist`` method reduces the number of candidate splits to be considered at each
split. Monotonic constraints may wipe out all available split candidates, in which case no
split is made. To reduce the effect, you may want to increase the ``max_bin`` parameter to
consider more split candidates.
.. note::

  **Note for the 'hist' tree construction algorithm**. If ``tree_method`` is set to
  either ``hist`` or ``approx``, enabling monotonic constraints may produce unnecessarily
  shallow trees. This is because the ``hist`` method reduces the number of candidate
  splits to be considered at each split. Monotonic constraints may wipe out all available
  split candidates, in which case no split is made. To reduce the effect, you may want to
  increase the ``max_bin`` parameter to consider more split candidates.


*******************
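A hedged sketch combining monotonic constraints with a larger ``max_bin`` as suggested above (not part of the commit); the synthetic data and the value 512 are assumptions for illustration::

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = rng.uniform(size=(2048, 2))
    y = 2.0 * X[:, 0] - 1.0 * X[:, 1] + rng.normal(scale=0.1, size=2048)

    # Increasing constraint on feature 0, decreasing on feature 1.  A larger max_bin
    # keeps more candidate splits available once the constraints prune some of them;
    # 512 is an arbitrary illustrative value.
    reg = xgb.XGBRegressor(
        tree_method="hist",
        monotone_constraints="(1,-1)",
        max_bin=512,
    )
    reg.fit(X, y)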
@@ -38,10 +38,6 @@ There are in general two ways that you can control overfitting in XGBoost:

- This includes ``subsample`` and ``colsample_bytree``.
- You can also reduce stepsize ``eta``. Remember to increase ``num_round`` when you do so.

***************************
Faster training performance
***************************
There's a parameter called ``tree_method``, set it to ``hist`` or ``gpu_hist`` for faster computation.

*************************
Handle Imbalanced Dataset
@@ -50,13 +50,14 @@ Here is a sample parameter dictionary for training a random forest on a GPU usin
xgboost::

  params = {
    'colsample_bynode': 0.8,
    'learning_rate': 1,
    'max_depth': 5,
    'num_parallel_tree': 100,
    'objective': 'binary:logistic',
    'subsample': 0.8,
    'tree_method': 'gpu_hist'
    "colsample_bynode": 0.8,
    "learning_rate": 1,
    "max_depth": 5,
    "num_parallel_tree": 100,
    "objective": "binary:logistic",
    "subsample": 0.8,
    "tree_method": "hist",
    "device": "cuda",
  }

A random forest model can then be trained as follows::

@@ -174,7 +174,7 @@ Will print out something similar to (not actual output as it's too long for demo

  "gbtree_train_param": {
    "num_parallel_tree": "1",
    "process_type": "default",
    "tree_method": "gpu_hist",
    "tree_method": "hist",
    "updater": "grow_gpu_hist",
    "updater_seq": "grow_gpu_hist"
  },
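A hedged sketch showing where a ``gbtree_train_param`` snippet like the one above comes from (not part of the commit): ``Booster.save_config()`` returns the full internal configuration as JSON; the throwaway data is an assumption for illustration::

    import json

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = rng.standard_normal((128, 4))
    y = rng.standard_normal(128)
    booster = xgb.train(
        {"tree_method": "hist", "device": "cuda"},
        xgb.DMatrix(X, label=y),
        num_boost_round=5,
    )

    # save_config returns the configuration as a JSON string; a "gbtree_train_param"
    # section appears inside the parsed structure.
    config = json.loads(booster.save_config())
    print(json.dumps(config, indent=2))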
@@ -278,9 +278,15 @@ __model_doc = f"""
        without bias.

    device : Optional[str]
        Device ordinal.

        .. versionadded:: 2.0.0

        Device ordinal, available options are `cpu`, `cuda`, and `gpu`.

    validate_parameters : Optional[bool]

        Give warnings for unknown parameter.

    enable_categorical : bool

        .. versionadded:: 1.5.0
@@ -144,8 +144,13 @@ class SparkXGBRegressor(_SparkXGBEstimator):
        .. deprecated:: 2.0.0

            Use `device` instead.

    device:

        .. versionadded:: 2.0.0

        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.

    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
@@ -319,8 +324,13 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
        .. deprecated:: 2.0.0

            Use `device` instead.

    device:

        .. versionadded:: 2.0.0

        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.

    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
@@ -497,8 +507,13 @@ class SparkXGBRanker(_SparkXGBEstimator):
        .. deprecated:: 2.0.0

            Use `device` instead.

    device:

        .. versionadded:: 2.0.0

        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.

    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
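A hedged sketch of the new ``device`` argument on the Spark estimators (not part of the commit); the SparkSession setup, column names, and two-row dataset are assumptions for illustration::

    from pyspark.ml.linalg import Vectors
    from pyspark.sql import SparkSession
    from xgboost.spark import SparkXGBRegressor

    spark = SparkSession.builder.getOrCreate()

    # Tiny assumed dataset with a vector feature column and a label column.
    df = spark.createDataFrame(
        [(Vectors.dense(1.0, 2.0), 1.0), (Vectors.dense(3.0, 4.0), 0.0)],
        ["features", "label"],
    )

    # device="cuda" asks each XGBoost worker to train on its assigned GPU.
    regressor = SparkXGBRegressor(features_col="features", label_col="label", device="cuda")
    model = regressor.fit(df)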
@@ -724,11 +724,15 @@ void MetaInfo::SynchronizeNumberOfColumns() {
namespace {
template <typename T>
void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) {
  CHECK(v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device)
      << "Data is resided on a different device than `gpu_id`. "
      << "Device that data is on: " << v.DeviceIdx() << ", "
      << "`gpu_id` for XGBoost: " << device;
  bool valid =
      v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device;
  if (!valid) {
    LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
                  "the booster. The device ordinal of the data is: "
               << v.DeviceIdx() << "; the device ordinal of the Booster is: " << device;
  }
}

template <typename T, std::int32_t D>
void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) {
  CheckDevice(device, *v.Data());
@@ -42,22 +42,22 @@ DMLC_REGISTRY_FILE_TAG(gbtree);

namespace {
/** @brief Map the `tree_method` parameter to the `updater` parameter. */
std::string MapTreeMethodToUpdaters(Context const* ctx_, TreeMethod tree_method) {
std::string MapTreeMethodToUpdaters(Context const* ctx, TreeMethod tree_method) {
  // Choose updaters according to tree_method parameters
  if (ctx->IsCUDA()) {
    common::AssertGPUSupport();
  }
  switch (tree_method) {
    case TreeMethod::kAuto:  // Use hist as default in 2.0
    case TreeMethod::kHist: {
      return ctx_->DispatchDevice([] { return "grow_quantile_histmaker"; },
                                  [] {
                                    common::AssertGPUSupport();
                                    return "grow_gpu_hist";
                                  });
      return ctx->DispatchDevice([] { return "grow_quantile_histmaker"; },
                                 [] { return "grow_gpu_hist"; });
    }
    case TreeMethod::kApprox:
      CHECK(ctx_->IsCPU()) << "The `approx` tree method is not supported on GPU.";
      CHECK(ctx->IsCPU()) << "The `approx` tree method is not supported on GPU.";
      return "grow_histmaker";
    case TreeMethod::kExact:
      CHECK(ctx_->IsCPU()) << "The `exact` tree method is not supported on GPU.";
      CHECK(ctx->IsCPU()) << "The `exact` tree method is not supported on GPU.";
      return "grow_colmaker,prune";
    case TreeMethod::kGPUHist: {
      common::AssertGPUSupport();
@@ -150,6 +150,7 @@ void GBTree::Configure(Args const& cfg) {
    CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
        << "Only the hist tree method is supported for building multi-target trees with vector "
           "leaf.";
    CHECK(ctx_->IsCPU()) << "GPU is not yet supported for vector leaf.";
  }

  LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
@@ -29,10 +29,12 @@ class LintersPaths:
        "tests/python-gpu/load_pickle.py",
        "tests/python-gpu/test_gpu_pickling.py",
        "tests/python-gpu/test_gpu_eval_metrics.py",
        "tests/python-gpu/test_gpu_with_sklearn.py",
        "tests/test_distributed/test_with_spark/",
        "tests/test_distributed/test_gpu_with_spark/",
        # demo
        "demo/dask/",
        "demo/rmm_plugin",
        "demo/json-model/json_parser.py",
        "demo/guide-python/cat_in_the_dat.py",
        "demo/guide-python/categorical.py",
@@ -234,7 +234,7 @@ Arrow specification.'''
    cp.cuda.runtime.setDevice(0)
    dtrain = dmatrix_from_cupy(np.float32, xgb.QuantileDMatrix, np.nan)
    with pytest.raises(
        xgb.core.XGBoostError, match="Data is resided on a different device"
        xgb.core.XGBoostError, match="Invalid device ordinal"
    ):
        xgb.train(
            {'tree_method': 'gpu_hist', 'gpu_id': 1}, dtrain, num_boost_round=10
@@ -2,6 +2,7 @@ import json
import os
import sys
import tempfile
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pytest
@@ -23,18 +24,19 @@ def test_gpu_binary_classification():
    from sklearn.model_selection import KFold

    digits = load_digits(n_class=2)
    y = digits['target']
    X = digits['data']
    y = digits["target"]
    X = digits["data"]
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for cls in (xgb.XGBClassifier, xgb.XGBRFClassifier):
        for train_index, test_index in kf.split(X, y):
            xgb_model = cls(
                random_state=42, tree_method='gpu_hist',
                n_estimators=4, gpu_id='0').fit(X[train_index], y[train_index])
                random_state=42, tree_method="gpu_hist", n_estimators=4, gpu_id="0"
            ).fit(X[train_index], y[train_index])
            preds = xgb_model.predict(X[test_index])
            labels = y[test_index]
            err = sum(1 for i in range(len(preds))
                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
            err = sum(
                1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]
            ) / float(len(preds))
            assert err < 0.1


@@ -133,7 +135,7 @@ def test_classififer():
    X, y = load_digits(return_X_y=True)
    y *= 10

    clf = xgb.XGBClassifier(tree_method="gpu_hist", n_estimators=1)
    clf = xgb.XGBClassifier(tree_method="hist", n_estimators=1, device="cuda")

    # numpy
    with pytest.raises(ValueError, match=r"Invalid classes.*"):
@@ -161,3 +163,46 @@ def test_ranking_qid_df():
    import cudf

    run_ranking_qid_df(cudf, "gpu_hist")


@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.mgpu
def test_device_ordinal() -> None:
    import cupy as cp

    n_devices = 2

    def worker(ordinal: int, correct_ordinal: bool) -> None:
        if correct_ordinal:
            cp.cuda.runtime.setDevice(ordinal)
        else:
            cp.cuda.runtime.setDevice((ordinal + 1) % n_devices)

        X, y, w = tm.make_regression(4096, 12, use_cupy=True)
        reg = xgb.XGBRegressor(device=f"cuda:{ordinal}", tree_method="hist")

        if correct_ordinal:
            reg.fit(
                X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w]
            )
            assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])
            return

        with pytest.raises(ValueError, match="Invalid device ordinal"):
            reg.fit(
                X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w]
            )

    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        futures = []
        n_trials = 32
        for i in range(n_trials):
            fut = executor.submit(
                worker, ordinal=i % n_devices, correct_ordinal=i % 3 != 0
            )
            futures.append(fut)

        for fut in futures:
            fut.result()

    cp.cuda.runtime.setDevice(0)