From 2200939416af2fd6b7eed334b5067bf483170f5e Mon Sep 17 00:00:00 2001 From: Thejaswi Date: Tue, 10 Jul 2018 13:12:15 +0530 Subject: [PATCH] Upgrading to NCCL2 (#3404) * Upgrading to NCCL2 * Part - II of NCCL2 upgradation - Doc updates to build with nccl2 - Dockerfile.gpu update for a correct CI build with nccl2 - Updated FindNccl package to have env-var NCCL_ROOT to take precedence * Upgrading to v9.2 for CI workflow, since it has the nccl2 binaries available * Added NCCL2 license + copy the nccl binaries into /usr location for the FindNccl module to find * Set LD_LIBRARY_PATH variable to pick nccl2 binary at runtime * Need the nccl2 library download instructions inside Dockerfile.release as well * Use NCCL2 as a static library --- .gitmodules | 3 -- CMakeLists.txt | 10 +++--- cmake/modules/FindNccl.cmake | 58 +++++++++++++++++++++++++++++++ doc/build.md | 8 +++++ nccl | 1 - src/common/device_helpers.cuh | 18 ++++++++++ src/tree/updater_gpu_hist.cu | 2 ++ tests/ci_build/Dockerfile.gpu | 12 ++++++- tests/ci_build/Dockerfile.release | 2 +- 9 files changed, 102 insertions(+), 12 deletions(-) create mode 100644 cmake/modules/FindNccl.cmake delete mode 160000 nccl diff --git a/.gitmodules b/.gitmodules index f271ce442..dbf7ee1a4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,9 +4,6 @@ [submodule "rabit"] path = rabit url = https://github.com/dmlc/rabit -[submodule "nccl"] - path = nccl - url = https://github.com/dmlc/nccl [submodule "cub"] path = cub url = https://github.com/NVlabs/cub diff --git a/CMakeLists.txt b/CMakeLists.txt index f3b879925..24eda97aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -123,7 +123,8 @@ if(USE_CUDA) include_directories(cub) if(USE_NCCL) - include_directories(nccl/src) + find_package(Nccl REQUIRED) + include_directories(${NCCL_INCLUDE_DIR}) add_definitions(-DXGBOOST_USE_NCCL) endif() @@ -136,14 +137,11 @@ if(USE_CUDA) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -Xcompiler -Werror; -std=c++11") endif() - if(USE_NCCL) 
- add_subdirectory(nccl) - endif() - cuda_add_library(gpuxgboost ${CUDA_SOURCES} STATIC) if(USE_NCCL) - target_link_libraries(gpuxgboost nccl) + link_directories(${NCCL_LIBRARY}) + target_link_libraries(gpuxgboost ${NCCL_LIB_NAME}) endif() list(APPEND LINK_LIBRARIES gpuxgboost) endif() diff --git a/cmake/modules/FindNccl.cmake b/cmake/modules/FindNccl.cmake new file mode 100644 index 000000000..2d39abf19 --- /dev/null +++ b/cmake/modules/FindNccl.cmake @@ -0,0 +1,58 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Tries to find NCCL headers and libraries. +# +# Usage of this module as follows: +# +# find_package(Nccl) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# NCCL_ROOT - When set, this path is inspected instead of standard library +# locations as the root of the NCCL installation. +# The environment variable NCCL_ROOT overrides this variable. 
+# +# This module defines +# Nccl_FOUND, whether nccl has been found +# NCCL_INCLUDE_DIR, directory containing header +# NCCL_LIBRARY, directory containing nccl library +# NCCL_LIB_NAME, nccl library name +# +# This module assumes that the user has already called find_package(CUDA) + + +set(NCCL_LIB_NAME nccl_static) + +find_path(NCCL_INCLUDE_DIR + NAMES nccl.h + PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include ${CUDA_INCLUDE_DIRS} /usr/include) + +find_library(NCCL_LIBRARY + NAMES ${NCCL_LIB_NAME} + PATHS $ENV{NCCL_ROOT}/lib ${NCCL_ROOT}/lib ${CUDA_INCLUDE_DIRS}/../lib /usr/lib) + +if (NCCL_INCLUDE_DIR AND NCCL_LIBRARY) + get_filename_component(NCCL_LIBRARY ${NCCL_LIBRARY} PATH) +endif () + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Nccl DEFAULT_MSG + NCCL_INCLUDE_DIR NCCL_LIBRARY) + +mark_as_advanced( + NCCL_INCLUDE_DIR + NCCL_LIBRARY + NCCL_LIB_NAME +) diff --git a/doc/build.md b/doc/build.md index 787846896..28ade8639 100644 --- a/doc/build.md +++ b/doc/build.md @@ -204,6 +204,14 @@ After the build process successfully ends, you will find a `xgboost.dll` library Unofficial windows binaries and instructions on how to use them are hosted on [Guido Tapia's blog](http://www.picnet.com.au/blogs/guido/post/2016/09/22/xgboost-windows-x64-binaries-for-download/) +### Building with Multi-GPU support +Multi-GPU support requires the [NCCL](https://developer.nvidia.com/nccl) library. With NCCL installed, run cmake as: +```bash +cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DNCCL_ROOT="/path/to/nccl2" +export LD_LIBRARY_PATH=/path/to/nccl2/lib:$LD_LIBRARY_PATH +``` +One can also pass NCCL_ROOT as an environment variable, in which case, this takes precedence over the cmake variable NCCL_ROOT. 
+ ### Customized Building The configuration of xgboost can be modified by ```config.mk``` diff --git a/nccl b/nccl deleted file mode 160000 index faeac8333..000000000 --- a/nccl +++ /dev/null @@ -1 +0,0 @@ -Subproject commit faeac8333cec3ed7474dc9f1c35088dcd6eea60b diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 42cd52a41..aa109d9e4 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -946,6 +946,24 @@ class AllReducer { #endif } + /** + * \brief Forwards to ncclGroupStart() through dh::safe_nccl. Use in exactly the same way as ncclGroupStart. The body is empty when XGBOOST_USE_NCCL is not defined. + */ + void GroupStart() { +#ifdef XGBOOST_USE_NCCL + dh::safe_nccl(ncclGroupStart()); +#endif + } + + /** + * \brief Forwards to ncclGroupEnd() through dh::safe_nccl. Use in exactly the same way as ncclGroupEnd. The body is empty when XGBOOST_USE_NCCL is not defined. + */ + void GroupEnd() { +#ifdef XGBOOST_USE_NCCL + dh::safe_nccl(ncclGroupEnd()); +#endif + } + /** * \brief Allreduce. Use in exactly the same way as NCCL but without needing * streams or comms. diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index f1d5249cc..647ce9c3f 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -810,6 +810,7 @@ class GPUHistMaker : public TreeUpdater { } void AllReduceHist(int nidx) { + reducer_.GroupStart(); for (auto& shard : shards_) { auto d_node_hist = shard->hist.GetHistPtr(nidx); reducer_.AllReduceSum( @@ -818,6 +819,7 @@ class GPUHistMaker : public TreeUpdater { reinterpret_cast(d_node_hist), n_bins_ * (sizeof(GradientPairSumT) / sizeof(GradientPairSumT::ValueT))); } + reducer_.GroupEnd(); reducer_.Synchronize(); } diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu index c25dbeebd..adf7668a5 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/tests/ci_build/Dockerfile.gpu @@ -7,7 +7,7 @@ ENV DEBIAN_FRONTEND noninteractive # Install all basic requirements RUN \ yum -y update && \ - yum install -y wget unzip && \ + yum install -y tar unzip wget xz && \ wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo && \ 
yum install -y devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ && \ # Python @@ -19,6 +19,16 @@ RUN \ cd cmake-3.5.2/ && ./configure && make && make install && cd ../ && \ rm -rf cmake-3.5.2/ && rm -rf cmake-3.5.2.tar.gz +# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) +RUN \ + export CUDA_SHORT=`echo $CUDA_VERSION | egrep -o '[0-9]\.[0-9]'` && \ + wget https://developer.download.nvidia.com/compute/redist/nccl/v2.2/nccl_2.2.13-1%2Bcuda${CUDA_SHORT}_x86_64.txz && \ + tar xf "nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64.txz" && \ + cp nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64/include/nccl.h /usr/include && \ + cp nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64/lib/* /usr/lib && \ + rm -f nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64.txz && \ + rm -r nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64 + ENV PATH=/opt/python/bin:$PATH ENV CC=/opt/rh/devtoolset-2/root/usr/bin/gcc ENV CXX=/opt/rh/devtoolset-2/root/usr/bin/c++ diff --git a/tests/ci_build/Dockerfile.release b/tests/ci_build/Dockerfile.release index b976b1c7c..772e5f175 100644 --- a/tests/ci_build/Dockerfile.release +++ b/tests/ci_build/Dockerfile.release @@ -6,7 +6,7 @@ ENV DEBIAN_FRONTEND noninteractive # Install all basic requirements RUN \ yum -y update && \ - yum install -y wget graphviz && \ + yum install -y graphviz tar unzip wget xz && \ # Python wget https://repo.continuum.io/miniconda/Miniconda2-4.3.27-Linux-x86_64.sh && \ bash Miniconda2-4.3.27-Linux-x86_64.sh -b -p /opt/python