diff --git a/CMakeLists.txt b/CMakeLists.txt
index a4c5b7a27..2b7ab0817 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,7 @@ option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
 set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
 ## CUDA
 option(USE_CUDA "Build with GPU acceleration" OFF)
-option(USE_NCCL "Build with NCCL to enable multi-GPU support." OFF)
+option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
 option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)
 set(GPU_COMPUTE_VER "" CACHE STRING
   "Semicolon separated list of compute versions to be built against, e.g. '35;61'")
diff --git a/doc/build.rst b/doc/build.rst
index 21d8edc46..76353f00d 100644
--- a/doc/build.rst
+++ b/doc/build.rst
@@ -196,9 +196,9 @@ From the command line on Linux starting from the XGBoost directory:
   cmake .. -DUSE_CUDA=ON
   make -j4
 
-.. note:: Enabling multi-GPU training
+.. note:: Enabling distributed GPU training
 
-  By default, multi-GPU training is disabled and only a single GPU will be used. To enable multi-GPU training, set the option ``USE_NCCL=ON``. Multi-GPU training depends on NCCL2, available at `this link <https://developer.nvidia.com/nccl>`_. Since NCCL2 is only available for Linux machines, **multi-GPU training is available only for Linux**.
+  By default, distributed GPU training is disabled and only a single GPU will be used. To enable distributed GPU training, set the option ``USE_NCCL=ON``. Distributed GPU training depends on NCCL2, available at `this link <https://developer.nvidia.com/nccl>`_. Since NCCL2 is only available for Linux machines, **distributed GPU training is available only for Linux**.
 
 .. code-block:: bash
 
diff --git a/doc/gpu/index.rst b/doc/gpu/index.rst
index 7e72ee41b..9f91affca 100644
--- a/doc/gpu/index.rst
+++ b/doc/gpu/index.rst
@@ -80,6 +80,8 @@ The GPU algorithms currently work with CLI, Python and R packages. See :doc:`/bu
 Single Node Multi-GPU
 =====================
 
+.. note:: Single node multi-GPU training is deprecated. Please use distributed GPU training with one process per GPU.
+
 Multiple GPUs can be used with the ``gpu_hist`` tree method using the ``n_gpus`` parameter. which defaults to 1. If this is set to -1 all available GPUs will be used. If ``gpu_id`` is specified as non-zero, the selected gpu devices will be from ``gpu_id`` to ``gpu_id+n_gpus``, please note that ``gpu_id+n_gpus`` must be less than or equal to the number of available GPUs on your system. As with GPU vs. CPU, multi-GPU will not always be faster than a single GPU due to PCI bus bandwidth that can limit performance.
 
 .. note:: Enabling multi-GPU training
diff --git a/include/xgboost/generic_parameters.h b/include/xgboost/generic_parameters.h
index 8f9836cf9..9c2f6dc08 100644
--- a/include/xgboost/generic_parameters.h
+++ b/include/xgboost/generic_parameters.h
@@ -66,7 +66,9 @@ struct LearnerTrainParam : public dmlc::Parameter<LearnerTrainParam> {
     DMLC_DECLARE_FIELD(n_gpus)
         .set_default(0)
         .set_lower_bound(-1)
-        .describe("Number of GPUs to use for multi-gpu algorithms.");
+        .describe("Deprecated, please use distributed training with one "
+                  "process per GPU. "
+                  "Number of GPUs to use for multi-gpu algorithms.");
     DMLC_DECLARE_FIELD(booster)
         .set_default("gbtree")
         .describe("Gradient booster used for training.");
diff --git a/src/learner.cc b/src/learner.cc
index f928fbfdd..8c84b2434 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -200,8 +200,14 @@ class LearnerImpl : public Learner {
         << " Internal Error: Always call InitModel or Load before any evaluation.";
     this->ValidateDMatrix(dmat);
     CHECK(this->gbm_) << " Internal: GBM is not set";
-    if (this->gbm_->UseGPU() && cfg_.find("n_gpus") == cfg_.cend()) {
-      tparam_.n_gpus = 1;
+    if (this->gbm_->UseGPU()) {
+      if (cfg_.find("n_gpus") == cfg_.cend()) {
+        tparam_.n_gpus = 1;
+      }
+      if (tparam_.n_gpus != 1) {
+        LOG(WARNING) << "Multi-GPU training is deprecated. "
+            "Please use distributed GPU training with one process per GPU.";
+      }
     }
   }
 