Upgrading to NCCL2 (#3404)
* Upgrading to NCCL2 * Part - II of NCCL2 upgrade - Doc updates to build with nccl2 - Dockerfile.gpu update for a correct CI build with nccl2 - Updated FindNccl package to have env-var NCCL_ROOT to take precedence * Upgrading to v9.2 for CI workflow, since it has the nccl2 binaries available * Added NCCL2 license + copy the nccl binaries into /usr location for the FindNccl module to find * Set LD_LIBRARY_PATH variable to pick nccl2 binary at runtime * Need the nccl2 library download instructions inside Dockerfile.release as well * Use NCCL2 as a static library
This commit is contained in:
parent
a6331925d2
commit
2200939416
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -4,9 +4,6 @@
|
|||||||
[submodule "rabit"]
|
[submodule "rabit"]
|
||||||
path = rabit
|
path = rabit
|
||||||
url = https://github.com/dmlc/rabit
|
url = https://github.com/dmlc/rabit
|
||||||
[submodule "nccl"]
|
|
||||||
path = nccl
|
|
||||||
url = https://github.com/dmlc/nccl
|
|
||||||
[submodule "cub"]
|
[submodule "cub"]
|
||||||
path = cub
|
path = cub
|
||||||
url = https://github.com/NVlabs/cub
|
url = https://github.com/NVlabs/cub
|
||||||
|
|||||||
@ -123,7 +123,8 @@ if(USE_CUDA)
|
|||||||
include_directories(cub)
|
include_directories(cub)
|
||||||
|
|
||||||
if(USE_NCCL)
|
if(USE_NCCL)
|
||||||
include_directories(nccl/src)
|
find_package(Nccl REQUIRED)
|
||||||
|
include_directories(${NCCL_INCLUDE_DIR})
|
||||||
add_definitions(-DXGBOOST_USE_NCCL)
|
add_definitions(-DXGBOOST_USE_NCCL)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
@ -136,14 +137,11 @@ if(USE_CUDA)
|
|||||||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -Xcompiler -Werror; -std=c++11")
|
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -Xcompiler -Werror; -std=c++11")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(USE_NCCL)
|
|
||||||
add_subdirectory(nccl)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
cuda_add_library(gpuxgboost ${CUDA_SOURCES} STATIC)
|
cuda_add_library(gpuxgboost ${CUDA_SOURCES} STATIC)
|
||||||
|
|
||||||
if(USE_NCCL)
|
if(USE_NCCL)
|
||||||
target_link_libraries(gpuxgboost nccl)
|
link_directories(${NCCL_LIBRARY})
|
||||||
|
target_link_libraries(gpuxgboost ${NCCL_LIB_NAME})
|
||||||
endif()
|
endif()
|
||||||
list(APPEND LINK_LIBRARIES gpuxgboost)
|
list(APPEND LINK_LIBRARIES gpuxgboost)
|
||||||
endif()
|
endif()
|
||||||
|
|||||||
58
cmake/modules/FindNccl.cmake
Normal file
58
cmake/modules/FindNccl.cmake
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# Tries to find NCCL headers and libraries.
|
||||||
|
#
|
||||||
|
# Usage of this module as follows:
|
||||||
|
#
|
||||||
|
# find_package(Nccl)
|
||||||
|
#
|
||||||
|
# Variables used by this module, they can change the default behaviour and need
|
||||||
|
# to be set before calling find_package:
|
||||||
|
#
|
||||||
|
# NCCL_ROOT - When set, this path is inspected instead of standard library
|
||||||
|
# locations as the root of the NCCL installation.
|
||||||
|
# The environment variable NCCL_ROOT overrides this variable.
|
||||||
|
#
|
||||||
|
# This module defines
|
||||||
|
# Nccl_FOUND, whether nccl has been found
|
||||||
|
# NCCL_INCLUDE_DIR, directory containing header
|
||||||
|
# NCCL_LIBRARY, directory containing nccl library
|
||||||
|
# NCCL_LIB_NAME, nccl library name
|
||||||
|
#
|
||||||
|
# This module assumes that the user has already called find_package(CUDA)
|
||||||
|
|
||||||
|
|
||||||
|
set(NCCL_LIB_NAME nccl_static)
|
||||||
|
|
||||||
|
find_path(NCCL_INCLUDE_DIR
|
||||||
|
NAMES nccl.h
|
||||||
|
PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include ${CUDA_INCLUDE_DIRS} /usr/include)
|
||||||
|
|
||||||
|
find_library(NCCL_LIBRARY
|
||||||
|
NAMES ${NCCL_LIB_NAME}
|
||||||
|
PATHS $ENV{NCCL_ROOT}/lib ${NCCL_ROOT}/lib ${CUDA_INCLUDE_DIRS}/../lib /usr/lib)
|
||||||
|
|
||||||
|
if (NCCL_INCLUDE_DIR AND NCCL_LIBRARY)
|
||||||
|
get_filename_component(NCCL_LIBRARY ${NCCL_LIBRARY} PATH)
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
include(FindPackageHandleStandardArgs)
|
||||||
|
find_package_handle_standard_args(Nccl DEFAULT_MSG
|
||||||
|
NCCL_INCLUDE_DIR NCCL_LIBRARY)
|
||||||
|
|
||||||
|
mark_as_advanced(
|
||||||
|
NCCL_INCLUDE_DIR
|
||||||
|
NCCL_LIBRARY
|
||||||
|
NCCL_LIB_NAME
|
||||||
|
)
|
||||||
@ -204,6 +204,14 @@ After the build process successfully ends, you will find a `xgboost.dll` library
|
|||||||
|
|
||||||
Unofficial windows binaries and instructions on how to use them are hosted on [Guido Tapia's blog](http://www.picnet.com.au/blogs/guido/post/2016/09/22/xgboost-windows-x64-binaries-for-download/)
|
Unofficial windows binaries and instructions on how to use them are hosted on [Guido Tapia's blog](http://www.picnet.com.au/blogs/guido/post/2016/09/22/xgboost-windows-x64-binaries-for-download/)
|
||||||
|
|
||||||
|
### Building with Multi-GPU support
|
||||||
|
Multi-GPU support requires the [NCCL](https://developer.nvidia.com/nccl) library. With NCCL installed, run cmake as:
|
||||||
|
```bash
|
||||||
|
cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DNCCL_ROOT="<NCCL_DIRECTORY>"
|
||||||
|
export LD_LIBRARY_PATH=<NCCL_DIRECTORY>/lib:$LD_LIBRARY_PATH
|
||||||
|
```
|
||||||
|
NCCL_ROOT can also be set as an environment variable, in which case it takes precedence over the CMake variable NCCL_ROOT.
|
||||||
|
|
||||||
### Customized Building
|
### Customized Building
|
||||||
|
|
||||||
The configuration of xgboost can be modified by ```config.mk```
|
The configuration of xgboost can be modified by ```config.mk```
|
||||||
|
|||||||
1
nccl
1
nccl
@ -1 +0,0 @@
|
|||||||
Subproject commit faeac8333cec3ed7474dc9f1c35088dcd6eea60b
|
|
||||||
@ -946,6 +946,24 @@ class AllReducer {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Use in exactly the same way as ncclGroupStart
|
||||||
|
*/
|
||||||
|
void GroupStart() {
|
||||||
|
#ifdef XGBOOST_USE_NCCL
|
||||||
|
dh::safe_nccl(ncclGroupStart());
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Use in exactly the same way as ncclGroupEnd
|
||||||
|
*/
|
||||||
|
void GroupEnd() {
|
||||||
|
#ifdef XGBOOST_USE_NCCL
|
||||||
|
dh::safe_nccl(ncclGroupEnd());
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||||
* streams or comms.
|
* streams or comms.
|
||||||
|
|||||||
@ -810,6 +810,7 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void AllReduceHist(int nidx) {
|
void AllReduceHist(int nidx) {
|
||||||
|
reducer_.GroupStart();
|
||||||
for (auto& shard : shards_) {
|
for (auto& shard : shards_) {
|
||||||
auto d_node_hist = shard->hist.GetHistPtr(nidx);
|
auto d_node_hist = shard->hist.GetHistPtr(nidx);
|
||||||
reducer_.AllReduceSum(
|
reducer_.AllReduceSum(
|
||||||
@ -818,6 +819,7 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
reinterpret_cast<GradientPairSumT::ValueT*>(d_node_hist),
|
reinterpret_cast<GradientPairSumT::ValueT*>(d_node_hist),
|
||||||
n_bins_ * (sizeof(GradientPairSumT) / sizeof(GradientPairSumT::ValueT)));
|
n_bins_ * (sizeof(GradientPairSumT) / sizeof(GradientPairSumT::ValueT)));
|
||||||
}
|
}
|
||||||
|
reducer_.GroupEnd();
|
||||||
|
|
||||||
reducer_.Synchronize();
|
reducer_.Synchronize();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -7,7 +7,7 @@ ENV DEBIAN_FRONTEND noninteractive
|
|||||||
# Install all basic requirements
|
# Install all basic requirements
|
||||||
RUN \
|
RUN \
|
||||||
yum -y update && \
|
yum -y update && \
|
||||||
yum install -y wget unzip && \
|
yum install -y tar unzip wget xz && \
|
||||||
wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo && \
|
wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo && \
|
||||||
yum install -y devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ && \
|
yum install -y devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ && \
|
||||||
# Python
|
# Python
|
||||||
@ -19,6 +19,16 @@ RUN \
|
|||||||
cd cmake-3.5.2/ && ./configure && make && make install && cd ../ && \
|
cd cmake-3.5.2/ && ./configure && make && make install && cd ../ && \
|
||||||
rm -rf cmake-3.5.2/ && rm -rf cmake-3.5.2.tar.gz
|
rm -rf cmake-3.5.2/ && rm -rf cmake-3.5.2.tar.gz
|
||||||
|
|
||||||
|
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
|
||||||
|
RUN \
|
||||||
|
export CUDA_SHORT=`echo $CUDA_VERSION | egrep -o '[0-9]\.[0-9]'` && \
|
||||||
|
wget https://developer.download.nvidia.com/compute/redist/nccl/v2.2/nccl_2.2.13-1%2Bcuda${CUDA_SHORT}_x86_64.txz && \
|
||||||
|
tar xf "nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64.txz" && \
|
||||||
|
cp nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64/include/nccl.h /usr/include && \
|
||||||
|
cp nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64/lib/* /usr/lib && \
|
||||||
|
rm -f nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64.txz && \
|
||||||
|
rm -r nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64
|
||||||
|
|
||||||
ENV PATH=/opt/python/bin:$PATH
|
ENV PATH=/opt/python/bin:$PATH
|
||||||
ENV CC=/opt/rh/devtoolset-2/root/usr/bin/gcc
|
ENV CC=/opt/rh/devtoolset-2/root/usr/bin/gcc
|
||||||
ENV CXX=/opt/rh/devtoolset-2/root/usr/bin/c++
|
ENV CXX=/opt/rh/devtoolset-2/root/usr/bin/c++
|
||||||
|
|||||||
@ -6,7 +6,7 @@ ENV DEBIAN_FRONTEND noninteractive
|
|||||||
# Install all basic requirements
|
# Install all basic requirements
|
||||||
RUN \
|
RUN \
|
||||||
yum -y update && \
|
yum -y update && \
|
||||||
yum install -y wget graphviz && \
|
yum install -y graphviz tar unzip wget xz && \
|
||||||
# Python
|
# Python
|
||||||
wget https://repo.continuum.io/miniconda/Miniconda2-4.3.27-Linux-x86_64.sh && \
|
wget https://repo.continuum.io/miniconda/Miniconda2-4.3.27-Linux-x86_64.sh && \
|
||||||
bash Miniconda2-4.3.27-Linux-x86_64.sh -b -p /opt/python
|
bash Miniconda2-4.3.27-Linux-x86_64.sh -b -p /opt/python
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user