Upgrading to NCCL2 (#3404)
* Upgrading to NCCL2 * Part - II of NCCL2 upgradation - Doc updates to build with nccl2 - Dockerfile.gpu update for a correct CI build with nccl2 - Updated FindNccl package to have env-var NCCL_ROOT to take precedence * Upgrading to v9.2 for CI workflow, since it has the nccl2 binaries available * Added NCCL2 license + copy the nccl binaries into /usr location for the FindNccl module to find * Set LD_LIBRARY_PATH variable to pick nccl2 binary at runtime * Need the nccl2 library download instructions inside Dockerfile.release as well * Use NCCL2 as a static library
This commit is contained in:
parent
a6331925d2
commit
2200939416
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -4,9 +4,6 @@
|
||||
[submodule "rabit"]
|
||||
path = rabit
|
||||
url = https://github.com/dmlc/rabit
|
||||
[submodule "nccl"]
|
||||
path = nccl
|
||||
url = https://github.com/dmlc/nccl
|
||||
[submodule "cub"]
|
||||
path = cub
|
||||
url = https://github.com/NVlabs/cub
|
||||
|
||||
@ -123,7 +123,8 @@ if(USE_CUDA)
|
||||
include_directories(cub)
|
||||
|
||||
if(USE_NCCL)
|
||||
include_directories(nccl/src)
|
||||
find_package(Nccl REQUIRED)
|
||||
include_directories(${NCCL_INCLUDE_DIR})
|
||||
add_definitions(-DXGBOOST_USE_NCCL)
|
||||
endif()
|
||||
|
||||
@ -136,14 +137,11 @@ if(USE_CUDA)
|
||||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -Xcompiler -Werror; -std=c++11")
|
||||
endif()
|
||||
|
||||
if(USE_NCCL)
|
||||
add_subdirectory(nccl)
|
||||
endif()
|
||||
|
||||
cuda_add_library(gpuxgboost ${CUDA_SOURCES} STATIC)
|
||||
|
||||
if(USE_NCCL)
|
||||
target_link_libraries(gpuxgboost nccl)
|
||||
link_directories(${NCCL_LIBRARY})
|
||||
target_link_libraries(gpuxgboost ${NCCL_LIB_NAME})
|
||||
endif()
|
||||
list(APPEND LINK_LIBRARIES gpuxgboost)
|
||||
endif()
|
||||
|
||||
58
cmake/modules/FindNccl.cmake
Normal file
58
cmake/modules/FindNccl.cmake
Normal file
@ -0,0 +1,58 @@
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Tries to find NCCL headers and libraries.
|
||||
#
|
||||
# Usage of this module as follows:
|
||||
#
|
||||
# find_package(NCCL)
|
||||
#
|
||||
# Variables used by this module, they can change the default behaviour and need
|
||||
# to be set before calling find_package:
|
||||
#
|
||||
# NCCL_ROOT - When set, this path is inspected instead of standard library
|
||||
# locations as the root of the NCCL installation.
|
||||
# The environment variable NCCL_ROOT overrides this veriable.
|
||||
#
|
||||
# This module defines
|
||||
# Nccl_FOUND, whether nccl has been found
|
||||
# NCCL_INCLUDE_DIR, directory containing header
|
||||
# NCCL_LIBRARY, directory containing nccl library
|
||||
# NCCL_LIB_NAME, nccl library name
|
||||
#
|
||||
# This module assumes that the user has already called find_package(CUDA)
|
||||
|
||||
|
||||
set(NCCL_LIB_NAME nccl_static)
|
||||
|
||||
find_path(NCCL_INCLUDE_DIR
|
||||
NAMES nccl.h
|
||||
PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include ${CUDA_INCLUDE_DIRS} /usr/include)
|
||||
|
||||
find_library(NCCL_LIBRARY
|
||||
NAMES ${NCCL_LIB_NAME}
|
||||
PATHS $ENV{NCCL_ROOT}/lib ${NCCL_ROOT}/lib ${CUDA_INCLUDE_DIRS}/../lib /usr/lib)
|
||||
|
||||
if (NCCL_INCLUDE_DIR AND NCCL_LIBRARY)
|
||||
get_filename_component(NCCL_LIBRARY ${NCCL_LIBRARY} PATH)
|
||||
endif ()
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(Nccl DEFAULT_MSG
|
||||
NCCL_INCLUDE_DIR NCCL_LIBRARY)
|
||||
|
||||
mark_as_advanced(
|
||||
NCCL_INCLUDE_DIR
|
||||
NCCL_LIBRARY
|
||||
NCCL_LIB_NAME
|
||||
)
|
||||
@ -204,6 +204,14 @@ After the build process successfully ends, you will find a `xgboost.dll` library
|
||||
|
||||
Unofficial windows binaries and instructions on how to use them are hosted on [Guido Tapia's blog](http://www.picnet.com.au/blogs/guido/post/2016/09/22/xgboost-windows-x64-binaries-for-download/)
|
||||
|
||||
### Building with Multi-GPU support
|
||||
Multi-GPU support requires the [NCCL](https://developer.nvidia.com/nccl) library. With NCCL installed, run cmake as:
|
||||
```bash
|
||||
cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DNCCL_ROOT="<NCCL_DIRECTORY>"
|
||||
export LD_LIBRARY_PATH=<NCCL_DIRECTORY>/lib:$LD_LIBRARY_PATH
|
||||
```
|
||||
One can also pass NCCL_ROOT as an environment variable, in which case, this takes precedence over the cmake variable NCCL_ROOT.
|
||||
|
||||
### Customized Building
|
||||
|
||||
The configuration of xgboost can be modified by ```config.mk```
|
||||
|
||||
1
nccl
1
nccl
@ -1 +0,0 @@
|
||||
Subproject commit faeac8333cec3ed7474dc9f1c35088dcd6eea60b
|
||||
@ -946,6 +946,24 @@ class AllReducer {
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Use in exactly the same way as ncclGroupStart
|
||||
*/
|
||||
void GroupStart() {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
dh::safe_nccl(ncclGroupStart());
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Use in exactly the same way as ncclGroupEnd
|
||||
*/
|
||||
void GroupEnd() {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
dh::safe_nccl(ncclGroupEnd());
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
|
||||
@ -810,6 +810,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
}
|
||||
|
||||
void AllReduceHist(int nidx) {
|
||||
reducer_.GroupStart();
|
||||
for (auto& shard : shards_) {
|
||||
auto d_node_hist = shard->hist.GetHistPtr(nidx);
|
||||
reducer_.AllReduceSum(
|
||||
@ -818,6 +819,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
reinterpret_cast<GradientPairSumT::ValueT*>(d_node_hist),
|
||||
n_bins_ * (sizeof(GradientPairSumT) / sizeof(GradientPairSumT::ValueT)));
|
||||
}
|
||||
reducer_.GroupEnd();
|
||||
|
||||
reducer_.Synchronize();
|
||||
}
|
||||
|
||||
@ -7,7 +7,7 @@ ENV DEBIAN_FRONTEND noninteractive
|
||||
# Install all basic requirements
|
||||
RUN \
|
||||
yum -y update && \
|
||||
yum install -y wget unzip && \
|
||||
yum install -y tar unzip wget xz && \
|
||||
wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo && \
|
||||
yum install -y devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ && \
|
||||
# Python
|
||||
@ -19,6 +19,16 @@ RUN \
|
||||
cd cmake-3.5.2/ && ./configure && make && make install && cd ../ && \
|
||||
rm -rf cmake-3.5.2/ && rm -rf cmake-3.5.2.tar.gz
|
||||
|
||||
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
|
||||
RUN \
|
||||
export CUDA_SHORT=`echo $CUDA_VERSION | egrep -o '[0-9]\.[0-9]'` && \
|
||||
wget https://developer.download.nvidia.com/compute/redist/nccl/v2.2/nccl_2.2.13-1%2Bcuda${CUDA_SHORT}_x86_64.txz && \
|
||||
tar xf "nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64.txz" && \
|
||||
cp nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64/include/nccl.h /usr/include && \
|
||||
cp nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64/lib/* /usr/lib && \
|
||||
rm -f nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64.txz && \
|
||||
rm -r nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64
|
||||
|
||||
ENV PATH=/opt/python/bin:$PATH
|
||||
ENV CC=/opt/rh/devtoolset-2/root/usr/bin/gcc
|
||||
ENV CXX=/opt/rh/devtoolset-2/root/usr/bin/c++
|
||||
|
||||
@ -6,7 +6,7 @@ ENV DEBIAN_FRONTEND noninteractive
|
||||
# Install all basic requirements
|
||||
RUN \
|
||||
yum -y update && \
|
||||
yum install -y wget graphviz && \
|
||||
yum install -y graphviz tar unzip wget xz && \
|
||||
# Python
|
||||
wget https://repo.continuum.io/miniconda/Miniconda2-4.3.27-Linux-x86_64.sh && \
|
||||
bash Miniconda2-4.3.27-Linux-x86_64.sh -b -p /opt/python
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user