Upgrading to NCCL2 (#3404)

* Upgrading to NCCL2

* Part - II of NCCL2 upgradation

 - Doc updates to build with nccl2
 - Dockerfile.gpu update for a correct CI build with nccl2
 - Updated FindNccl package to have env-var NCCL_ROOT to take precedence

* Upgrading to v9.2 for CI workflow, since it has the nccl2 binaries available

* Added NCCL2 license + copy the nccl binaries into /usr location for the FindNccl module to find

* Set LD_LIBRARY_PATH variable to pick nccl2 binary at runtime

* Need the nccl2 library download instructions inside Dockerfile.release as well

* Use NCCL2 as a static library
This commit is contained in:
Thejaswi 2018-07-10 13:12:15 +05:30 committed by Philip Hyunsu Cho
parent a6331925d2
commit 2200939416
9 changed files with 102 additions and 12 deletions

3
.gitmodules vendored
View File

@ -4,9 +4,6 @@
[submodule "rabit"]
path = rabit
url = https://github.com/dmlc/rabit
[submodule "nccl"]
path = nccl
url = https://github.com/dmlc/nccl
[submodule "cub"]
path = cub
url = https://github.com/NVlabs/cub

View File

@ -123,7 +123,8 @@ if(USE_CUDA)
include_directories(cub)
if(USE_NCCL)
include_directories(nccl/src)
find_package(Nccl REQUIRED)
include_directories(${NCCL_INCLUDE_DIR})
add_definitions(-DXGBOOST_USE_NCCL)
endif()
@ -136,14 +137,11 @@ if(USE_CUDA)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -Xcompiler -Werror; -std=c++11")
endif()
if(USE_NCCL)
add_subdirectory(nccl)
endif()
cuda_add_library(gpuxgboost ${CUDA_SOURCES} STATIC)
if(USE_NCCL)
target_link_libraries(gpuxgboost nccl)
link_directories(${NCCL_LIBRARY})
target_link_libraries(gpuxgboost ${NCCL_LIB_NAME})
endif()
list(APPEND LINK_LIBRARIES gpuxgboost)
endif()

View File

@ -0,0 +1,58 @@
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Tries to find NCCL headers and libraries.
#
# Usage of this module as follows:
#
# find_package(NCCL)
#
# Variables used by this module, they can change the default behaviour and need
# to be set before calling find_package:
#
# NCCL_ROOT - When set, this path is inspected instead of standard library
# locations as the root of the NCCL installation.
# The environment variable NCCL_ROOT overrides this veriable.
#
# This module defines
# Nccl_FOUND, whether nccl has been found
# NCCL_INCLUDE_DIR, directory containing header
# NCCL_LIBRARY, directory containing nccl library
# NCCL_LIB_NAME, nccl library name
#
# This module assumes that the user has already called find_package(CUDA)
set(NCCL_LIB_NAME nccl_static)
find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include ${CUDA_INCLUDE_DIRS} /usr/include)
find_library(NCCL_LIBRARY
NAMES ${NCCL_LIB_NAME}
PATHS $ENV{NCCL_ROOT}/lib ${NCCL_ROOT}/lib ${CUDA_INCLUDE_DIRS}/../lib /usr/lib)
if (NCCL_INCLUDE_DIR AND NCCL_LIBRARY)
get_filename_component(NCCL_LIBRARY ${NCCL_LIBRARY} PATH)
endif ()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Nccl DEFAULT_MSG
NCCL_INCLUDE_DIR NCCL_LIBRARY)
mark_as_advanced(
NCCL_INCLUDE_DIR
NCCL_LIBRARY
NCCL_LIB_NAME
)

View File

@ -204,6 +204,14 @@ After the build process successfully ends, you will find a `xgboost.dll` library
Unofficial windows binaries and instructions on how to use them are hosted on [Guido Tapia's blog](http://www.picnet.com.au/blogs/guido/post/2016/09/22/xgboost-windows-x64-binaries-for-download/)
### Building with Multi-GPU support
Multi-GPU support requires the [NCCL](https://developer.nvidia.com/nccl) library. With NCCL installed, run cmake as:
```bash
cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DNCCL_ROOT="<NCCL_DIRECTORY>"
export LD_LIBRARY_PATH=<NCCL_DIRECTORY>/lib:$LD_LIBRARY_PATH
```
One can also pass NCCL_ROOT as an environment variable, in which case, this takes precedence over the cmake variable NCCL_ROOT.
### Customized Building
The configuration of xgboost can be modified by ```config.mk```

1
nccl

@ -1 +0,0 @@
Subproject commit faeac8333cec3ed7474dc9f1c35088dcd6eea60b

View File

@ -946,6 +946,24 @@ class AllReducer {
#endif
}
/**
* \brief Use in exactly the same way as ncclGroupStart
*/
void GroupStart() {
#ifdef XGBOOST_USE_NCCL
dh::safe_nccl(ncclGroupStart());
#endif
}
/**
* \brief Use in exactly the same way as ncclGroupEnd
*/
void GroupEnd() {
#ifdef XGBOOST_USE_NCCL
dh::safe_nccl(ncclGroupEnd());
#endif
}
/**
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
* streams or comms.

View File

@ -810,6 +810,7 @@ class GPUHistMaker : public TreeUpdater {
}
void AllReduceHist(int nidx) {
reducer_.GroupStart();
for (auto& shard : shards_) {
auto d_node_hist = shard->hist.GetHistPtr(nidx);
reducer_.AllReduceSum(
@ -818,6 +819,7 @@ class GPUHistMaker : public TreeUpdater {
reinterpret_cast<GradientPairSumT::ValueT*>(d_node_hist),
n_bins_ * (sizeof(GradientPairSumT) / sizeof(GradientPairSumT::ValueT)));
}
reducer_.GroupEnd();
reducer_.Synchronize();
}

View File

@ -7,7 +7,7 @@ ENV DEBIAN_FRONTEND noninteractive
# Install all basic requirements
RUN \
yum -y update && \
yum install -y wget unzip && \
yum install -y tar unzip wget xz && \
wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo && \
yum install -y devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ && \
# Python
@ -19,6 +19,16 @@ RUN \
cd cmake-3.5.2/ && ./configure && make && make install && cd ../ && \
rm -rf cmake-3.5.2/ && rm -rf cmake-3.5.2.tar.gz
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \
export CUDA_SHORT=`echo $CUDA_VERSION | egrep -o '[0-9]\.[0-9]'` && \
wget https://developer.download.nvidia.com/compute/redist/nccl/v2.2/nccl_2.2.13-1%2Bcuda${CUDA_SHORT}_x86_64.txz && \
tar xf "nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64.txz" && \
cp nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64/include/nccl.h /usr/include && \
cp nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64/lib/* /usr/lib && \
rm -f nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64.txz && \
rm -r nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64
ENV PATH=/opt/python/bin:$PATH
ENV CC=/opt/rh/devtoolset-2/root/usr/bin/gcc
ENV CXX=/opt/rh/devtoolset-2/root/usr/bin/c++

View File

@ -6,7 +6,7 @@ ENV DEBIAN_FRONTEND noninteractive
# Install all basic requirements
RUN \
yum -y update && \
yum install -y wget graphviz && \
yum install -y graphviz tar unzip wget xz && \
# Python
wget https://repo.continuum.io/miniconda/Miniconda2-4.3.27-Linux-x86_64.sh && \
bash Miniconda2-4.3.27-Linux-x86_64.sh -b -p /opt/python