Document for device ordinal. (#9398)
- Rewrite GPU demos. The notebook is converted to a script to avoid committing additional PNG plots.
- Add GPU demos into the sphinx gallery.
- Add RMM demos into the sphinx gallery.
- Test for firing threads with different device ordinals.
This commit is contained in:
parent 22b0a55a04
commit 275da176ba
@@ -53,15 +53,7 @@ int main() {
   // configure the training
   // available parameters are described here:
   // https://xgboost.readthedocs.io/en/latest/parameter.html
-  safe_xgboost(XGBoosterSetParam(booster, "tree_method", use_gpu ? "gpu_hist" : "hist"));
-  if (use_gpu) {
-    // set the GPU to use;
-    // this is not necessary, but provided here as an illustration
-    safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "0"));
-  } else {
-    // avoid evaluating objective and metric on a GPU
-    safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "-1"));
-  }
+  safe_xgboost(XGBoosterSetParam(booster, "device", use_gpu ? "cuda" : "cpu"));

  safe_xgboost(XGBoosterSetParam(booster, "objective", "binary:logistic"));
  safe_xgboost(XGBoosterSetParam(booster, "min_child_weight", "1"));
@@ -1,5 +0,0 @@
-# GPU Acceleration Demo
-
-`cover_type.py` shows how to train a model on the [forest cover type](https://archive.ics.uci.edu/ml/datasets/covertype) dataset using GPU acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it time consuming to process. We compare the run-time and accuracy of the GPU and CPU histogram algorithms.
-
-`shap.ipynb` demonstrates using GPU acceleration to compute SHAP values for feature importance.
8 demo/gpu_acceleration/README.rst (new file)
@@ -0,0 +1,8 @@
+:orphan:
+
+GPU Acceleration Demo
+=====================
+
+This is a collection of demonstration scripts showcasing the basic usage of GPU
+acceleration. Please see :doc:`/gpu/index` for more info. There are other demonstrations
+for distributed GPU training using Dask or Spark.
@@ -1,41 +1,50 @@
+"""
+Using xgboost on GPU devices
+============================
+
+Shows how to train a model on the `forest cover type
+<https://archive.ics.uci.edu/ml/datasets/covertype>`_ dataset using GPU
+acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it
+time consuming to process. We compare the run-time and accuracy of the GPU and CPU
+histogram algorithms.
+
+In addition, the demo showcases using GPU with other GPU-related libraries including
+cupy and cuml. These libraries are not strictly required.
+
+"""
 import time

+import cupy as cp
+from cuml.model_selection import train_test_split
 from sklearn.datasets import fetch_covtype
-from sklearn.model_selection import train_test_split

 import xgboost as xgb

 # Fetch dataset using sklearn
-cov = fetch_covtype()
-X = cov.data
-y = cov.target
+X, y = fetch_covtype(return_X_y=True)
+X = cp.array(X)
+y = cp.array(y)
+y -= y.min()

 # Create 0.75/0.25 train/test split
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75,
-                                                    random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.25, train_size=0.75, random_state=42
+)

 # Specify sufficient boosting iterations to reach a minimum
 num_round = 3000

 # Leave most parameters as default
-param = {'objective': 'multi:softmax',  # Specify multiclass classification
-         'num_class': 8,                # Number of possible output classes
-         'tree_method': 'gpu_hist'      # Use GPU accelerated algorithm
-         }
-
-# Convert input data from numpy to XGBoost format
-dtrain = xgb.DMatrix(X_train, label=y_train)
-dtest = xgb.DMatrix(X_test, label=y_test)
-
-gpu_res = {}  # Store accuracy result
-tmp = time.time()
+clf = xgb.XGBClassifier(device="cuda", n_estimators=num_round)
 # Train model
-xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=gpu_res)
-print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))
+start = time.time()
+clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+gpu_res = clf.evals_result()
+print("GPU Training Time: %s seconds" % (str(time.time() - start)))

 # Repeat for CPU algorithm
-tmp = time.time()
-param['tree_method'] = 'hist'
-cpu_res = {}
-xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=cpu_res)
-print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))
+clf = xgb.XGBClassifier(device="cpu", n_estimators=num_round)
+start = time.time()
+clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+cpu_res = clf.evals_result()
+print("CPU Training Time: %s seconds" % (str(time.time() - start)))
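A small follow-up one might append to the rewritten script (hypothetical, not part of the committed demo; the metric key assumes the default multi-class metric recorded by the sklearn wrapper under ``validation_0``):

```python
# Hypothetical follow-up to the script above; "mlogloss" is assumed to be the
# default metric logged for this multi-class problem.
print("GPU final mlogloss:", gpu_res["validation_0"]["mlogloss"][-1])
print("CPU final mlogloss:", cpu_res["validation_0"]["mlogloss"][-1])
```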
File diff suppressed because one or more lines are too long
55 demo/gpu_acceleration/tree_shap.py (new file)
@@ -0,0 +1,55 @@
+"""
+Use GPU to speedup SHAP value computation
+=========================================
+
+Demonstrates using GPU acceleration to compute SHAP values for feature importance.
+
+"""
+import shap
+from sklearn.datasets import fetch_california_housing
+
+import xgboost as xgb
+
+# Fetch dataset using sklearn
+data = fetch_california_housing()
+print(data.DESCR)
+X = data.data
+y = data.target
+
+num_round = 500
+
+param = {
+    "eta": 0.05,
+    "max_depth": 10,
+    "tree_method": "hist",
+    "device": "cuda",
+}
+
+# GPU accelerated training
+dtrain = xgb.DMatrix(X, label=y, feature_names=data.feature_names)
+model = xgb.train(param, dtrain, num_round)
+
+# Compute shap values using GPU with xgboost
+model.set_param({"device": "cuda"})
+shap_values = model.predict(dtrain, pred_contribs=True)
+
+# Compute shap interaction values using GPU
+shap_interaction_values = model.predict(dtrain, pred_interactions=True)
+
+
+# shap will call the GPU accelerated version as long as the device parameter is set to
+# "cuda"
+explainer = shap.TreeExplainer(model)
+shap_values = explainer.shap_values(X)
+
+# visualize the first prediction's explanation
+shap.force_plot(
+    explainer.expected_value,
+    shap_values[0, :],
+    X[0, :],
+    feature_names=data.feature_names,
+    matplotlib=True,
+)
+
+# Show a summary of feature importance
+shap.summary_plot(shap_values, X, plot_type="bar", feature_names=data.feature_names)
@@ -70,8 +70,7 @@ class XGBoostTrainer(Executor):
        param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
        if self._use_gpus:
            self.log_info(fl_ctx, f'Training with GPU {rank}')
-            param['tree_method'] = 'gpu_hist'
-            param['gpu_id'] = rank
+            param['device'] = f"cuda:{rank}"

        # Specify validations set to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
@@ -1,47 +0,0 @@
-Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL)
-====================================================================
-[RAPIDS Memory Manager (RMM)](https://github.com/rapidsai/rmm) library provides a collection of
-efficient memory allocators for NVIDIA GPUs. It is now possible to use XGBoost with memory
-allocators provided by RMM, by enabling the RMM integration plugin.
-
-The demos in this directory highlights one RMM allocator in particular: **the pool sub-allocator**.
-This allocator addresses the slow speed of `cudaMalloc()` by allocating a large chunk of memory
-upfront. Subsequent allocations will draw from the pool of already allocated memory and thus avoid
-the overhead of calling `cudaMalloc()` directly. See
-[this GTC talk slides](https://on-demand.gputechconf.com/gtc/2015/presentation/S5530-Stephen-Jones.pdf)
-for more details.
-
-Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To do this,
-run CMake with option `-DPLUGIN_RMM=ON` (`-DUSE_CUDA=ON` also required):
-```
-cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON
-make -j4
-```
-CMake will attempt to locate the RMM library in your build environment. You may choose to build
-RMM from the source, or install it using the Conda package manager. If CMake cannot find RMM, you
-should specify the location of RMM with the CMake prefix:
-```
-# If using Conda:
-cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
-# If using RMM installed with a custom location
-cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm
-```
-
-# Informing XGBoost about RMM pool
-
-When XGBoost is compiled with RMM, most of the large size allocation will go through RMM
-allocators, but some small allocations in performance critical areas are using a different
-caching allocator so that we can have better control over memory allocation behavior.
-Users can override this behavior and force the use of rmm for all allocations by setting
-the global configuration ``use_rmm``:
-
-``` python
-with xgb.config_context(use_rmm=True):
-    clf = xgb.XGBClassifier(tree_method="gpu_hist")
-```
-
-Depending on the choice of memory pool size or type of allocator, this may have negative
-performance impact.
-
-* [Using RMM with a single GPU](./rmm_singlegpu.py)
-* [Using RMM with a local Dask cluster consisting of multiple GPUs](./rmm_mgpu_with_dask.py)
51 demo/rmm_plugin/README.rst (new file)
@@ -0,0 +1,51 @@
+Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL)
+====================================================================
+
+The `RAPIDS Memory Manager (RMM) <https://github.com/rapidsai/rmm>`__ library provides a
+collection of efficient memory allocators for NVIDIA GPUs. It is now possible to use
+XGBoost with memory allocators provided by RMM, by enabling the RMM integration plugin.
+
+The demos in this directory highlight one RMM allocator in particular: **the pool
+sub-allocator**. This allocator addresses the slow speed of ``cudaMalloc()`` by
+allocating a large chunk of memory upfront. Subsequent allocations will draw from the pool
+of already allocated memory and thus avoid the overhead of calling ``cudaMalloc()``
+directly. See `these GTC talk slides
+<https://on-demand.gputechconf.com/gtc/2015/presentation/S5530-Stephen-Jones.pdf>`_ for
+more details.
+
+Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To
+do this, run CMake with the option ``-DPLUGIN_RMM=ON`` (``-DUSE_CUDA=ON`` is also required):
+
+.. code-block:: sh
+
+  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON
+  make -j$(nproc)
+
+CMake will attempt to locate the RMM library in your build environment. You may choose to
+build RMM from source, or install it using the Conda package manager. If CMake cannot find
+RMM, you should specify its location with the CMake prefix:
+
+.. code-block:: sh
+
+  # If using Conda:
+  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
+  # If using RMM installed at a custom location
+  cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm
+
+********************************
+Informing XGBoost about RMM pool
+********************************
+
+When XGBoost is compiled with RMM, most of the large allocations will go through the RMM
+allocators, but some small allocations in performance-critical areas use a different
+caching allocator so that we can have better control over memory allocation behavior.
+Users can override this behavior and force the use of RMM for all allocations by setting
+the global configuration ``use_rmm``:
+
+.. code-block:: python
+
+  with xgb.config_context(use_rmm=True):
+      clf = xgb.XGBClassifier(tree_method="hist", device="cuda")
+
+Depending on the choice of memory pool size or type of allocator, this may have a negative
+performance impact.
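To make the pool behavior described above concrete, here is a minimal single-GPU sketch. It is not one of the committed demo files; it assumes an RMM-enabled build of XGBoost and the `rmm` Python package (``rmm.reinitialize`` is RMM's own API for creating a pool allocator):

```python
import rmm
from sklearn.datasets import make_classification

import xgboost as xgb

# Serve device allocations from a pre-allocated 1 GiB pool instead of raw cudaMalloc().
rmm.reinitialize(pool_allocator=True, initial_pool_size=2**30)

X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
dtrain = xgb.DMatrix(X, label=y)

# Route XGBoost's own allocations through RMM as well.
with xgb.config_context(use_rmm=True):
    params = {
        "objective": "multi:softprob",
        "num_class": 3,
        "tree_method": "hist",
        "device": "cuda",
    }
    bst = xgb.train(params, dtrain, num_boost_round=10)
```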
@@ -1,3 +1,7 @@
+"""
+Using rmm with Dask
+===================
+"""
 import dask
 from dask.distributed import Client
 from dask_cuda import LocalCUDACluster
@@ -11,25 +15,33 @@ def main(client):
     # xgb.set_config(use_rmm=True)

     X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
-    # In pratice one should prefer loading the data with dask collections instead of using
-    # `from_array`.
+    # In practice one should prefer loading the data with dask collections instead of
+    # using `from_array`.
     X = dask.array.from_array(X)
     y = dask.array.from_array(y)
     dtrain = xgb.dask.DaskDMatrix(client, X, label=y)

-    params = {'max_depth': 8, 'eta': 0.01, 'objective': 'multi:softprob', 'num_class': 3,
-              'tree_method': 'gpu_hist', 'eval_metric': 'merror'}
-    output = xgb.dask.train(client, params, dtrain, num_boost_round=100,
-                            evals=[(dtrain, 'train')])
-    bst = output['booster']
-    history = output['history']
-    for i, e in enumerate(history['train']['merror']):
-        print(f'[{i}] train-merror: {e}')
+    params = {
+        "max_depth": 8,
+        "eta": 0.01,
+        "objective": "multi:softprob",
+        "num_class": 3,
+        "tree_method": "hist",
+        "eval_metric": "merror",
+        "device": "cuda",
+    }
+    output = xgb.dask.train(
+        client, params, dtrain, num_boost_round=100, evals=[(dtrain, "train")]
+    )
+    bst = output["booster"]
+    history = output["history"]
+    for i, e in enumerate(history["train"]["merror"]):
+        print(f"[{i}] train-merror: {e}")


-if __name__ == '__main__':
-    # To use RMM pool allocator with a GPU Dask cluster, just add rmm_pool_size option to
-    # LocalCUDACluster constructor.
-    with LocalCUDACluster(rmm_pool_size='2GB') as cluster:
+if __name__ == "__main__":
+    # To use the RMM pool allocator with a GPU Dask cluster, just add the rmm_pool_size
+    # option to the LocalCUDACluster constructor.
+    with LocalCUDACluster(rmm_pool_size="2GB") as cluster:
        with Client(cluster) as client:
            main(client)
@@ -1,3 +1,7 @@
+"""
+Using rmm on a single node device
+=================================
+"""
 import rmm
 from sklearn.datasets import make_classification

@@ -16,7 +20,8 @@ params = {
     "eta": 0.01,
     "objective": "multi:softprob",
     "num_class": 3,
-    "tree_method": "gpu_hist",
+    "tree_method": "hist",
+    "device": "cuda",
 }
 # XGBoost will automatically use the RMM pool allocator
 bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, "train")])
2 doc/.gitignore (vendored)
@@ -6,3 +6,5 @@ doxygen
 parser.py
 *.pyc
 web-data
+# generated by doxygen
+tmp
11 doc/conf.py
@@ -19,7 +19,6 @@ import sys
 import tarfile
 import urllib.request
 import warnings
-from subprocess import call
 from urllib.error import HTTPError

 from sh.contrib import git
@@ -148,12 +147,20 @@ extensions = [

 sphinx_gallery_conf = {
     # path to your example scripts
-    "examples_dirs": ["../demo/guide-python", "../demo/dask", "../demo/aft_survival"],
+    "examples_dirs": [
+        "../demo/guide-python",
+        "../demo/dask",
+        "../demo/aft_survival",
+        "../demo/gpu_acceleration",
+        "../demo/rmm_plugin"
+    ],
     # path to where to save gallery generated output
     "gallery_dirs": [
         "python/examples",
         "python/dask-examples",
         "python/survival-examples",
+        "python/gpu-examples",
+        "python/rmm-examples",
     ],
     "matplotlib_animations": True,
 }
@@ -23,20 +23,19 @@ The GPU algorithms currently work with CLI, Python, R, and JVM packages. See :do
   :caption: Python example

   params = dict()
-  params["device"] = "cuda:0"
+  params["device"] = "cuda"
   params["tree_method"] = "hist"
   Xy = xgboost.QuantileDMatrix(X, y)
   xgboost.train(params, Xy)

 .. code-block:: python
-  :caption: With Scikit-Learn interface
+  :caption: With the Scikit-Learn interface

   XGBRegressor(tree_method="hist", device="cuda")


 GPU-Accelerated SHAP values
 =============================
-XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as a backend for computing shap values when the GPU predictor is selected.
+XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as a backend for computing shap values when the GPU is used.

 .. code-block:: python
-
@@ -44,12 +43,12 @@ XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as
   shap_values = booster.predict(dtrain, pred_contribs=True)
   shap_interaction_values = model.predict(dtrain, pred_interactions=True)

-See examples `here <https://github.com/dmlc/xgboost/tree/master/demo/gpu_acceleration>`__.
+See :ref:`sphx_glr_python_gpu-examples_tree_shap.py` for a worked example.

 Multi-node Multi-GPU Training
 =============================

-XGBoost supports fully distributed GPU training using `Dask <https://dask.org/>`_, ``Spark`` and ``PySpark``. For getting started with Dask see our tutorial :doc:`/tutorials/dask` and worked examples `here <https://github.com/dmlc/xgboost/tree/master/demo/dask>`__, also Python documentation :ref:`dask_api` for complete reference. For usage with ``Spark`` using Scala see :doc:`/jvm/xgboost4j_spark_gpu_tutorial`. Lastly for distributed GPU training with ``PySpark``, see :doc:`/tutorials/spark_estimator`.
+XGBoost supports fully distributed GPU training using `Dask <https://dask.org/>`_, ``Spark`` and ``PySpark``. For getting started with Dask see our tutorial :doc:`/tutorials/dask` and worked examples :doc:`/python/dask-examples/index`, also Python documentation :ref:`dask_api` for complete reference. For usage with ``Spark`` using Scala see :doc:`/jvm/xgboost4j_spark_gpu_tutorial`. Lastly for distributed GPU training with ``PySpark``, see :doc:`/tutorials/spark_estimator`.


 Memory usage
@@ -67,7 +66,8 @@ If you are getting out-of-memory errors on a big dataset, try the

 CPU-GPU Interoperability
 ========================
-XGBoost models trained on GPUs can be used on CPU-only systems to generate predictions. For information about how to save and load an XGBoost model, see :doc:`/tutorials/saving_model`.
+The model can be used on any device regardless of the one used to train it. For instance, a model trained using GPU can still work on a CPU-only machine and vice versa. For more information about model serialization, see :doc:`/tutorials/saving_model`.


 Developer notes
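As a concrete illustration of the interoperability paragraph above (a sketch, not taken from the changed files; the data and file name are arbitrary):

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(256, 8))
y = X[:, 0] * 2.0

# Train on a CUDA device ...
booster = xgb.train(
    {"tree_method": "hist", "device": "cuda"}, xgb.DMatrix(X, label=y), num_boost_round=10
)
booster.save_model("model.json")

# ... then load and predict on a CPU-only machine.
cpu_booster = xgb.Booster()
cpu_booster.load_model("model.json")
cpu_booster.set_param({"device": "cpu"})
predt = cpu_booster.predict(xgb.DMatrix(X))
```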
@@ -189,7 +189,7 @@ This will check out the latest stable version from the Maven Central.

 For the latest release version number, please check `release page <https://github.com/dmlc/xgboost/releases>`_.

-To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
+To enable the GPU algorithm (``device='cuda'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).


 .. note:: Windows not supported in the JVM package
@@ -325,4 +325,4 @@ The SNAPSHOT JARs are hosted by the XGBoost project. Every commit in the ``maste

 You can browse the file listing of the Maven repository at https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html.

-To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
+To enable the GPU algorithm (``device='cuda'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
@@ -34,27 +34,6 @@ General Parameters

  - Which booster to use. Can be ``gbtree``, ``gblinear`` or ``dart``; ``gbtree`` and ``dart`` use tree based models while ``gblinear`` uses linear functions.

-* ``verbosity`` [default=1]
-
-  - Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3
-    (debug). Sometimes XGBoost tries to change configurations based on heuristics, which
-    is displayed as warning message. If there's unexpected behaviour, please try to
-    increase value of verbosity.
-
-* ``validate_parameters`` [default to ``false``, except for Python, R and CLI interface]
-
-  - When set to True, XGBoost will perform validation of input parameters to check whether
-    a parameter is used or not.
-
-* ``nthread`` [default to maximum number of threads available if not set]
-
-  - Number of parallel threads used to run XGBoost. When choosing it, please keep thread
-    contention and hyperthreading in mind.
-
-* ``disable_default_eval_metric`` [default= ``false``]
-
-  - Flag to disable default metric. Set to 1 or ``true`` to disable.
-
 * ``device`` [default= ``cpu``]

   .. versionadded:: 2.0.0
@@ -67,6 +46,29 @@ General Parameters
  + ``gpu``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently.
  + ``gpu:<ordinal>``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently.

+  For more information about GPU acceleration, see :doc:`/gpu/index`.
+
+* ``verbosity`` [default=1]
+
+  - Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3
+    (debug). Sometimes XGBoost tries to change configurations based on heuristics, which
+    is displayed as warning message. If there's unexpected behaviour, please try to
+    increase value of verbosity.
+
+* ``validate_parameters`` [default to ``false``, except for Python, R and CLI interface]
+
+  - When set to True, XGBoost will perform validation of input parameters to check whether
+    a parameter is used or not. A warning is emitted when there's an unknown parameter.
+
+* ``nthread`` [default to maximum number of threads available if not set]
+
+  - Number of parallel threads used to run XGBoost. When choosing it, please keep thread
+    contention and hyperthreading in mind.
+
+* ``disable_default_eval_metric`` [default= ``false``]
+
+  - Flag to disable default metric. Set to 1 or ``true`` to disable.
+
 Parameters for Tree Booster
 ===========================
 * ``eta`` [default=0.3, alias: ``learning_rate``]
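To make the listed ``device`` values concrete, a short sketch (not part of the documentation source; it assumes a CUDA build with at least one visible GPU) of how they are passed from Python:

```python
import numpy as np
import xgboost as xgb

X = np.random.default_rng(0).normal(size=(128, 4))
y = X.sum(axis=1)

# Native interface: pick the first CUDA device explicitly.
booster = xgb.train(
    {"tree_method": "hist", "device": "cuda:0"},
    xgb.DMatrix(X, label=y),
    num_boost_round=10,
)

# Scikit-learn interface: let XGBoost choose the default CUDA device.
reg = xgb.XGBRegressor(tree_method="hist", device="cuda").fit(X, y)
```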
@@ -160,7 +162,7 @@ Parameters for Tree Booster
  - ``grow_colmaker``: non-distributed column-based construction of trees.
  - ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting.
  - ``grow_quantile_histmaker``: Grow tree using quantized histogram.
- - ``grow_gpu_hist``: Grow tree with GPU. Same as setting ``tree_method`` to ``hist`` and use ``device=cuda``.
+ - ``grow_gpu_hist``: Grow tree with GPU. Enabled when ``tree_method`` is set to ``hist`` along with ``device=cuda``.
  - ``sync``: synchronizes trees in all distributed nodes.
  - ``refresh``: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed.
  - ``prune``: prunes the splits where loss < min_split_loss (or gamma) and nodes that have depth greater than ``max_depth``.
2 doc/python/.gitignore (vendored)
@@ -1,3 +1,5 @@
 examples
 dask-examples
 survival-examples
+gpu-examples
+rmm-examples
@@ -17,3 +17,5 @@ Contents
  examples/index
  dask-examples/index
  survival-examples/index
+  gpu-examples/index
+  rmm-examples/index
@@ -124,7 +124,7 @@ Following table summarizes some differences in supported features between 4 tree
 `T` means supported while `F` means unsupported.

 +------------------+-----------+---------------------+---------------------+------------------------+
-|                  | Exact     | Approx              | Hist                | GPU Hist               |
+|                  | Exact     | Approx              | Hist                | Hist (GPU)             |
 +==================+===========+=====================+=====================+========================+
 | grow_policy      | Depthwise | depthwise/lossguide | depthwise/lossguide | depthwise/lossguide    |
 +------------------+-----------+---------------------+---------------------+------------------------+
@@ -141,5 +141,5 @@ Following table summarizes some differences in supported features between 4 tree

 Features/parameters that are not mentioned here are universally supported for all 4 tree
 methods (for instance, column sampling and constraints). The `P` in external memory means
-partially supported. Please note that both categorical data and external memory are
+special handling. Please note that both categorical data and external memory are
 experimental.
@@ -35,8 +35,8 @@ parameter ``enable_categorical``:

 .. code:: python

-  # Supported tree methods are `gpu_hist`, `approx`, and `hist`.
-  clf = xgb.XGBClassifier(tree_method="gpu_hist", enable_categorical=True)
+  # Supported tree methods are `approx` and `hist`.
+  clf = xgb.XGBClassifier(tree_method="hist", enable_categorical=True, device="cuda")
  # X is the dataframe we created in previous snippet
  clf.fit(X, y)
  # Must use JSON/UBJSON for serialization, otherwise the information is lost.
@@ -81,7 +81,7 @@ constructor.
  it = Iterator(["file_0.svm", "file_1.svm", "file_2.svm"])
  Xy = xgboost.DMatrix(it)

-  # Other tree methods including ``hist`` and ``gpu_hist`` also work, but has some caveats
+  # The ``approx`` tree method also works, but with lower performance, and the GPU implementation differs from the CPU one,
  # as noted in following sections.
  booster = xgboost.train({"tree_method": "hist"}, Xy)

|||||||
GPU Version (GPU Hist tree method)
|
GPU Version (GPU Hist tree method)
|
||||||
**********************************
|
**********************************
|
||||||
|
|
||||||
External memory is supported by GPU algorithms (i.e. when ``tree_method`` is set to
|
External memory is supported by GPU algorithms (i.e. when ``device`` is set to
|
||||||
``gpu_hist``). However, the algorithm used for GPU is different from the one used for
|
``cuda``). However, the algorithm used for GPU is different from the one used for
|
||||||
CPU. When training on a CPU, the tree method iterates through all batches from external
|
CPU. When training on a CPU, the tree method iterates through all batches from external
|
||||||
memory for each step of the tree construction algorithm. On the other hand, the GPU
|
memory for each step of the tree construction algorithm. On the other hand, the GPU
|
||||||
algorithm uses a hybrid approach. It iterates through the data during the beginning of
|
algorithm uses a hybrid approach. It iterates through the data during the beginning of
|
||||||
each iteration and concatenates all batches into one in GPU memory. To reduce overall
|
each iteration and concatenates all batches into one in GPU memory for performance
|
||||||
memory usage, users can utilize subsampling. The GPU hist tree method supports
|
reasons. To reduce overall memory usage, users can utilize subsampling. The GPU hist tree
|
||||||
`gradient-based sampling`, enabling users to set a low sampling rate without compromising
|
method supports `gradient-based sampling`, enabling users to set a low sampling rate
|
||||||
accuracy.
|
without compromising accuracy.
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
|
|||||||
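The subsampling advice above can be sketched as follows. This is an illustration rather than the tutorial's own code; a small in-memory iterator stands in for on-disk batches, and the ``sampling_method``/``subsample`` values are illustrative:

```python
import numpy as np
import xgboost


class Iterator(xgboost.DataIter):
    """Yield a few in-memory batches to stand in for on-disk shards."""

    def __init__(self) -> None:
        rng = np.random.default_rng(0)
        self._batches = [
            (rng.normal(size=(1024, 8)), rng.normal(size=1024)) for _ in range(4)
        ]
        self._it = 0
        super().__init__(cache_prefix="cache")

    def next(self, input_data) -> int:
        if self._it == len(self._batches):
            return 0  # no more batches
        X, y = self._batches[self._it]
        input_data(data=X, label=y)
        self._it += 1
        return 1

    def reset(self) -> None:
        self._it = 0


Xy = xgboost.DMatrix(Iterator())
params = {
    "tree_method": "hist",
    "device": "cuda",
    # Gradient-based sampling keeps only a fraction of rows in GPU memory per iteration.
    "sampling_method": "gradient_based",
    "subsample": 0.2,
}
booster = xgboost.train(params, Xy, num_boost_round=10)
```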
@@ -83,13 +83,14 @@ Some other examples:
 - ``(0,-1)``: No constraint on the first predictor and a decreasing constraint on the second.


-**Note for the 'hist' tree construction algorithm**.
-If ``tree_method`` is set to either ``hist``, ``approx`` or ``gpu_hist``, enabling
-monotonic constraints may produce unnecessarily shallow trees. This is because the
-``hist`` method reduces the number of candidate splits to be considered at each
-split. Monotonic constraints may wipe out all available split candidates, in which case no
-split is made. To reduce the effect, you may want to increase the ``max_bin`` parameter to
-consider more split candidates.
+.. note::
+
+  **Note for the 'hist' tree construction algorithm**. If ``tree_method`` is set to
+  either ``hist`` or ``approx``, enabling monotonic constraints may produce unnecessarily
+  shallow trees. This is because the ``hist`` method reduces the number of candidate
+  splits to be considered at each split. Monotonic constraints may wipe out all available
+  split candidates, in which case no split is made. To reduce the effect, you may want to
+  increase the ``max_bin`` parameter to consider more split candidates.


 *******************
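A sketch of the advice in the note above (not part of the documentation source; data and parameter values are illustrative):

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.uniform(size=(1000, 2))
y = 5 * X[:, 0] - 3 * X[:, 1] + rng.normal(scale=0.1, size=1000)

params = {
    "tree_method": "hist",
    # increasing in the first feature, decreasing in the second
    "monotone_constraints": "(1,-1)",
    # more histogram bins -> more candidate splits survive the constraint filter
    "max_bin": 512,
}
booster = xgb.train(params, xgb.DMatrix(X, label=y), num_boost_round=50)
```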
@@ -38,10 +38,6 @@ There are in general two ways that you can control overfitting in XGBoost:
  - This includes ``subsample`` and ``colsample_bytree``.
  - You can also reduce stepsize ``eta``. Remember to increase ``num_round`` when you do so.

-***************************
-Faster training performance
-***************************
-There's a parameter called ``tree_method``, set it to ``hist`` or ``gpu_hist`` for faster computation.
-
 *************************
 Handle Imbalanced Dataset
@@ -50,13 +50,14 @@ Here is a sample parameter dictionary for training a random forest on a GPU usin
 xgboost::

   params = {
-    'colsample_bynode': 0.8,
-    'learning_rate': 1,
-    'max_depth': 5,
-    'num_parallel_tree': 100,
-    'objective': 'binary:logistic',
-    'subsample': 0.8,
-    'tree_method': 'gpu_hist'
+    "colsample_bynode": 0.8,
+    "learning_rate": 1,
+    "max_depth": 5,
+    "num_parallel_tree": 100,
+    "objective": "binary:logistic",
+    "subsample": 0.8,
+    "tree_method": "hist",
+    "device": "cuda",
   }

 A random forest model can then be trained as follows::
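The training call referenced by the last context line is not shown in this hunk; a sketch of it, reusing the parameter dictionary from above with a synthetic dataset (the data here is illustrative only):

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(2048, 16))
y = (X[:, 0] > 0).astype(np.float64)
dtrain = xgb.DMatrix(X, label=y)

params = {
    "colsample_bynode": 0.8,
    "learning_rate": 1,
    "max_depth": 5,
    "num_parallel_tree": 100,
    "objective": "binary:logistic",
    "subsample": 0.8,
    "tree_method": "hist",
    "device": "cuda",
}

# A single boosting round suffices: ``num_parallel_tree`` already grows the whole forest.
bst = xgb.train(params, dtrain, num_boost_round=1)
```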
@@ -174,7 +174,7 @@ Will print out something similar to (not actual output as it's too long for demo
      "gbtree_train_param": {
        "num_parallel_tree": "1",
        "process_type": "default",
-        "tree_method": "gpu_hist",
+        "tree_method": "hist",
        "updater": "grow_gpu_hist",
        "updater_seq": "grow_gpu_hist"
      },
@@ -278,9 +278,15 @@ __model_doc = f"""
        without bias.

    device : Optional[str]
-        Device ordinal.
+
+        .. versionadded:: 2.0.0
+
+        Device ordinal, available options are `cpu`, `cuda`, and `gpu`.
+
    validate_parameters : Optional[bool]
        Give warnings for unknown parameter.

    enable_categorical : bool

        .. versionadded:: 1.5.0
@@ -144,8 +144,13 @@ class SparkXGBRegressor(_SparkXGBEstimator):
        .. deprecated:: 2.0.0

        Use `device` instead.

    device:

+        .. versionadded:: 2.0.0
+
        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.

    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
@@ -319,8 +324,13 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
        .. deprecated:: 2.0.0

        Use `device` instead.

    device:

+        .. versionadded:: 2.0.0
+
        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.

    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
@@ -497,8 +507,13 @@ class SparkXGBRanker(_SparkXGBEstimator):
        .. deprecated:: 2.0.0

        Use `device` instead.

    device:

+        .. versionadded:: 2.0.0
+
        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.

    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
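For orientation, a sketch of how the ``device`` argument documented above is passed to the Spark estimators. This is illustrative only: it assumes a running Spark session with GPU-enabled executors and a dataframe ``df`` that already has ``features`` and ``label`` columns:

```python
# Illustrative only; `df` and the Spark session are assumed to exist.
from xgboost.spark import SparkXGBClassifier

clf = SparkXGBClassifier(
    features_col="features",
    label_col="label",
    device="cuda",  # one GPU per XGBoost worker
    num_workers=2,
)
model = clf.fit(df)
```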
@@ -724,11 +724,15 @@ void MetaInfo::SynchronizeNumberOfColumns() {
 namespace {
 template <typename T>
 void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) {
-  CHECK(v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device)
-      << "Data is resided on a different device than `gpu_id`. "
-      << "Device that data is on: " << v.DeviceIdx() << ", "
-      << "`gpu_id` for XGBoost: " << device;
+  bool valid =
+      v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device;
+  if (!valid) {
+    LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
+                  "the booster. The device ordinal of the data is: "
+               << v.DeviceIdx() << "; the device ordinal of the Booster is: " << device;
+  }
 }

 template <typename T, std::int32_t D>
 void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) {
   CheckDevice(device, *v.Data());
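The new message surfaces in Python roughly as follows. This is a sketch, not test code from the commit; it assumes a machine with at least two GPUs and cupy installed:

```python
# Sketch: triggers the "Invalid device ordinal" check above. Requires >= 2 GPUs.
import cupy as cp
import xgboost as xgb

cp.cuda.runtime.setDevice(1)  # data lives on device 1
X = cp.random.standard_normal((128, 4))
y = cp.random.standard_normal(128)

dtrain = xgb.QuantileDMatrix(X, label=y)

try:
    # booster is told to use device 0, mismatching the data
    xgb.train({"tree_method": "hist", "device": "cuda:0"}, dtrain, num_boost_round=2)
except xgb.core.XGBoostError as e:
    assert "Invalid device ordinal" in str(e)
```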
@@ -42,22 +42,22 @@ DMLC_REGISTRY_FILE_TAG(gbtree);

 namespace {
 /** @brief Map the `tree_method` parameter to the `updater` parameter. */
-std::string MapTreeMethodToUpdaters(Context const* ctx_, TreeMethod tree_method) {
+std::string MapTreeMethodToUpdaters(Context const* ctx, TreeMethod tree_method) {
   // Choose updaters according to tree_method parameters
+  if (ctx->IsCUDA()) {
+    common::AssertGPUSupport();
+  }
   switch (tree_method) {
     case TreeMethod::kAuto:  // Use hist as default in 2.0
     case TreeMethod::kHist: {
-      return ctx_->DispatchDevice([] { return "grow_quantile_histmaker"; },
-                                  [] {
-                                    common::AssertGPUSupport();
-                                    return "grow_gpu_hist";
-                                  });
+      return ctx->DispatchDevice([] { return "grow_quantile_histmaker"; },
+                                 [] { return "grow_gpu_hist"; });
     }
     case TreeMethod::kApprox:
-      CHECK(ctx_->IsCPU()) << "The `approx` tree method is not supported on GPU.";
+      CHECK(ctx->IsCPU()) << "The `approx` tree method is not supported on GPU.";
       return "grow_histmaker";
     case TreeMethod::kExact:
-      CHECK(ctx_->IsCPU()) << "The `exact` tree method is not supported on GPU.";
+      CHECK(ctx->IsCPU()) << "The `exact` tree method is not supported on GPU.";
       return "grow_colmaker,prune";
     case TreeMethod::kGPUHist: {
       common::AssertGPUSupport();
@@ -150,6 +150,7 @@ void GBTree::Configure(Args const& cfg) {
     CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
         << "Only the hist tree method is supported for building multi-target trees with vector "
            "leaf.";
+    CHECK(ctx_->IsCPU()) << "GPU is not yet supported for vector leaf.";
   }

   LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
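One can observe which updater this mapping selected from Python. The sketch below is not part of the commit, and the JSON key path is an assumption based on the ``save_config()`` excerpt shown earlier in this diff:

```python
# Sketch: inspect the updater chosen for hist + device="cuda".
import json

import numpy as np
import xgboost as xgb

X = np.random.default_rng(0).normal(size=(64, 4))
y = X[:, 0]

bst = xgb.train(
    {"tree_method": "hist", "device": "cuda"}, xgb.DMatrix(X, label=y), num_boost_round=2
)
conf = json.loads(bst.save_config())
# Assumed layout, following the save_config() excerpt above.
print(conf["learner"]["gradient_booster"]["gbtree_train_param"]["updater"])
```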
@@ -29,10 +29,12 @@ class LintersPaths:
        "tests/python-gpu/load_pickle.py",
        "tests/python-gpu/test_gpu_pickling.py",
        "tests/python-gpu/test_gpu_eval_metrics.py",
+        "tests/python-gpu/test_gpu_with_sklearn.py",
        "tests/test_distributed/test_with_spark/",
        "tests/test_distributed/test_gpu_with_spark/",
        # demo
        "demo/dask/",
+        "demo/rmm_plugin",
        "demo/json-model/json_parser.py",
        "demo/guide-python/cat_in_the_dat.py",
        "demo/guide-python/categorical.py",
@@ -234,7 +234,7 @@ Arrow specification.'''
        cp.cuda.runtime.setDevice(0)
        dtrain = dmatrix_from_cupy(np.float32, xgb.QuantileDMatrix, np.nan)
        with pytest.raises(
-            xgb.core.XGBoostError, match="Data is resided on a different device"
+            xgb.core.XGBoostError, match="Invalid device ordinal"
        ):
            xgb.train(
                {'tree_method': 'gpu_hist', 'gpu_id': 1}, dtrain, num_boost_round=10
@@ -2,6 +2,7 @@ import json
 import os
 import sys
 import tempfile
+from concurrent.futures import ThreadPoolExecutor

 import numpy as np
 import pytest
@@ -23,18 +24,19 @@ def test_gpu_binary_classification():
     from sklearn.model_selection import KFold

     digits = load_digits(n_class=2)
-    y = digits['target']
-    X = digits['data']
+    y = digits["target"]
+    X = digits["data"]
     kf = KFold(n_splits=2, shuffle=True, random_state=rng)
     for cls in (xgb.XGBClassifier, xgb.XGBRFClassifier):
         for train_index, test_index in kf.split(X, y):
             xgb_model = cls(
-                random_state=42, tree_method='gpu_hist',
-                n_estimators=4, gpu_id='0').fit(X[train_index], y[train_index])
+                random_state=42, tree_method="gpu_hist", n_estimators=4, gpu_id="0"
+            ).fit(X[train_index], y[train_index])
             preds = xgb_model.predict(X[test_index])
             labels = y[test_index]
-            err = sum(1 for i in range(len(preds))
-                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
+            err = sum(
+                1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]
+            ) / float(len(preds))
             assert err < 0.1


@@ -133,7 +135,7 @@ def test_classififer():
     X, y = load_digits(return_X_y=True)
     y *= 10

-    clf = xgb.XGBClassifier(tree_method="gpu_hist", n_estimators=1)
+    clf = xgb.XGBClassifier(tree_method="hist", n_estimators=1, device="cuda")

     # numpy
     with pytest.raises(ValueError, match=r"Invalid classes.*"):
@@ -161,3 +163,46 @@ def test_ranking_qid_df():
     import cudf

     run_ranking_qid_df(cudf, "gpu_hist")
+
+
+@pytest.mark.skipif(**tm.no_cupy())
+@pytest.mark.mgpu
+def test_device_ordinal() -> None:
+    import cupy as cp
+
+    n_devices = 2
+
+    def worker(ordinal: int, correct_ordinal: bool) -> None:
+        if correct_ordinal:
+            cp.cuda.runtime.setDevice(ordinal)
+        else:
+            cp.cuda.runtime.setDevice((ordinal + 1) % n_devices)
+
+        X, y, w = tm.make_regression(4096, 12, use_cupy=True)
+        reg = xgb.XGBRegressor(device=f"cuda:{ordinal}", tree_method="hist")
+
+        if correct_ordinal:
+            reg.fit(
+                X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w]
+            )
+            assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])
+            return
+
+        with pytest.raises(ValueError, match="Invalid device ordinal"):
+            reg.fit(
+                X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w]
+            )
+
+    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
+        futures = []
+        n_trials = 32
+        for i in range(n_trials):
+            fut = executor.submit(
+                worker, ordinal=i % n_devices, correct_ordinal=i % 3 != 0
+            )
+            futures.append(fut)
+
+        for fut in futures:
+            fut.result()
+
+    cp.cuda.runtime.setDevice(0)