Update CUDA docker image and NCCL. (#8139)

This commit is contained in:
Jiaming Yuan 2022-08-07 16:32:41 +08:00 committed by GitHub
parent ff471b3fab
commit bcc8679a05
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 43 additions and 15 deletions

18
Jenkinsfile vendored
View File

@ -7,7 +7,7 @@
dockerRun = 'tests/ci_build/ci_build.sh' dockerRun = 'tests/ci_build/ci_build.sh'
// Which CUDA version to use when building reference distribution wheel // Which CUDA version to use when building reference distribution wheel
ref_cuda_ver = '11.0' ref_cuda_ver = '11.0.3'
import groovy.transform.Field import groovy.transform.Field
@ -61,9 +61,9 @@ pipeline {
'build-cpu-rabit-mock': { BuildCPUMock() }, 'build-cpu-rabit-mock': { BuildCPUMock() },
// Build reference, distribution-ready Python wheel with CUDA 11.0 // Build reference, distribution-ready Python wheel with CUDA 11.0
// using CentOS 7 image // using CentOS 7 image
'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0', build_rmm: true) }, 'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0.3', build_rmm: true) },
'build-gpu-rpkg': { BuildRPackageWithCUDA(cuda_version: '11.0') }, 'build-gpu-rpkg': { BuildRPackageWithCUDA(cuda_version: '11.0.3') },
'build-jvm-packages-gpu-cuda11.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.1', cuda_version: '11.0') }, 'build-jvm-packages-gpu-cuda11.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.1', cuda_version: '11.0.3') },
'build-jvm-packages': { BuildJVMPackages(spark_version: '3.0.1') }, 'build-jvm-packages': { BuildJVMPackages(spark_version: '3.0.1') },
'build-jvm-doc': { BuildJVMDoc() } 'build-jvm-doc': { BuildJVMDoc() }
]) ])
@ -78,9 +78,9 @@ pipeline {
'test-python-cpu': { TestPythonCPU() }, 'test-python-cpu': { TestPythonCPU() },
'test-python-cpu-arm64': { TestPythonCPUARM64() }, 'test-python-cpu-arm64': { TestPythonCPUARM64() },
// artifact_cuda_version doesn't apply to RMM tests; RMM tests will always match CUDA version between artifact and host env // artifact_cuda_version doesn't apply to RMM tests; RMM tests will always match CUDA version between artifact and host env
'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0', test_rmm: true) }, 'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0.3', host_cuda_version: '11.0.3', test_rmm: true) },
'test-python-mgpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0', multi_gpu: true, test_rmm: true) }, 'test-python-mgpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0.3', host_cuda_version: '11.0.3', multi_gpu: true, test_rmm: true) },
'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0', test_rmm: true) }, 'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0.3', host_cuda_version: '11.0.3', test_rmm: true) },
'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '3.0.0') } 'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '3.0.0') }
]) ])
} }
@ -124,7 +124,7 @@ def ClangTidy() {
echo "Running clang-tidy job..." echo "Running clang-tidy job..."
def container_type = "clang_tidy" def container_type = "clang_tidy"
def docker_binary = "docker" def docker_binary = "docker"
def dockerArgs = "--build-arg CUDA_VERSION_ARG=11.0" def dockerArgs = "--build-arg CUDA_VERSION_ARG=11.0.3"
sh """ sh """
${dockerRun} ${container_type} ${docker_binary} ${dockerArgs} python3 tests/ci_build/tidy.py --cuda-archs 75 ${dockerRun} ${container_type} ${docker_binary} ${dockerArgs} python3 tests/ci_build/tidy.py --cuda-archs 75
""" """
@ -446,7 +446,7 @@ def DeployJVMPackages(args) {
if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) { if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
echo 'Deploying to xgboost-maven-repo S3 repo...' echo 'Deploying to xgboost-maven-repo S3 repo...'
sh """ sh """
${dockerRun} jvm_gpu_build docker --build-arg CUDA_VERSION_ARG=11.0 tests/ci_build/deploy_jvm_packages.sh ${args.spark_version} ${dockerRun} jvm_gpu_build docker --build-arg CUDA_VERSION_ARG=11.0.3 tests/ci_build/deploy_jvm_packages.sh ${args.spark_version}
""" """
} }
deleteDir() deleteDir()

View File

@ -33,6 +33,9 @@ std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> uuid) {
void NcclAllReducer::DoInit(int _device_ordinal) { void NcclAllReducer::DoInit(int _device_ordinal) {
int32_t const rank = rabit::GetRank(); int32_t const rank = rabit::GetRank();
int32_t const world = rabit::GetWorldSize(); int32_t const world = rabit::GetWorldSize();
if (world == 1) {
return;
}
std::vector<uint64_t> uuids(world * kUuidLength, 0); std::vector<uint64_t> uuids(world * kUuidLength, 0);
auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()}; auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};

View File

@ -754,6 +754,9 @@ class AllReducerBase : public xgboost::common::Crtp<AllReducer> {
void Init(int _device_ordinal) { void Init(int _device_ordinal) {
device_ordinal_ = _device_ordinal; device_ordinal_ = _device_ordinal;
dh::safe_cuda(cudaSetDevice(device_ordinal_)); dh::safe_cuda(cudaSetDevice(device_ordinal_));
if (rabit::GetWorldSize() == 1) {
return;
}
this->Underlying().DoInit(_device_ordinal); this->Underlying().DoInit(_device_ordinal);
initialised_ = true; initialised_ = true;
} }
@ -769,6 +772,9 @@ class AllReducerBase : public xgboost::common::Crtp<AllReducer> {
*/ */
void AllGather(void const *data, size_t length_bytes, std::vector<size_t> *segments, void AllGather(void const *data, size_t length_bytes, std::vector<size_t> *segments,
dh::caching_device_vector<char> *recvbuf) { dh::caching_device_vector<char> *recvbuf) {
if (rabit::GetWorldSize() == 1) {
return;
}
CHECK(initialised_); CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinal_)); dh::safe_cuda(cudaSetDevice(device_ordinal_));
this->Underlying().DoAllGather(data, length_bytes, segments, recvbuf); this->Underlying().DoAllGather(data, length_bytes, segments, recvbuf);
@ -784,6 +790,9 @@ class AllReducerBase : public xgboost::common::Crtp<AllReducer> {
*/ */
void AllGather(uint32_t const *data, size_t length, void AllGather(uint32_t const *data, size_t length,
dh::caching_device_vector<uint32_t> *recvbuf) { dh::caching_device_vector<uint32_t> *recvbuf) {
if (rabit::GetWorldSize() == 1) {
return;
}
CHECK(initialised_); CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinal_)); dh::safe_cuda(cudaSetDevice(device_ordinal_));
this->Underlying().DoAllGather(data, length, recvbuf); this->Underlying().DoAllGather(data, length, recvbuf);
@ -798,6 +807,9 @@ class AllReducerBase : public xgboost::common::Crtp<AllReducer> {
* \param count Number of elements. * \param count Number of elements.
*/ */
void AllReduceSum(const double *sendbuff, double *recvbuff, int count) { void AllReduceSum(const double *sendbuff, double *recvbuff, int count) {
if (rabit::GetWorldSize() == 1) {
return;
}
CHECK(initialised_); CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinal_)); dh::safe_cuda(cudaSetDevice(device_ordinal_));
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count); this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
@ -814,6 +826,9 @@ class AllReducerBase : public xgboost::common::Crtp<AllReducer> {
* \param count Number of elements. * \param count Number of elements.
*/ */
void AllReduceSum(const float *sendbuff, float *recvbuff, int count) { void AllReduceSum(const float *sendbuff, float *recvbuff, int count) {
if (rabit::GetWorldSize() == 1) {
return;
}
CHECK(initialised_); CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinal_)); dh::safe_cuda(cudaSetDevice(device_ordinal_));
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count); this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
@ -831,6 +846,9 @@ class AllReducerBase : public xgboost::common::Crtp<AllReducer> {
* \param count Number of. * \param count Number of.
*/ */
void AllReduceSum(const int64_t *sendbuff, int64_t *recvbuff, int count) { void AllReduceSum(const int64_t *sendbuff, int64_t *recvbuff, int count) {
if (rabit::GetWorldSize() == 1) {
return;
}
CHECK(initialised_); CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinal_)); dh::safe_cuda(cudaSetDevice(device_ordinal_));
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count); this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
@ -847,6 +865,9 @@ class AllReducerBase : public xgboost::common::Crtp<AllReducer> {
* \param count Number of elements. * \param count Number of elements.
*/ */
void AllReduceSum(const uint32_t *sendbuff, uint32_t *recvbuff, int count) { void AllReduceSum(const uint32_t *sendbuff, uint32_t *recvbuff, int count) {
if (rabit::GetWorldSize() == 1) {
return;
}
CHECK(initialised_); CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinal_)); dh::safe_cuda(cudaSetDevice(device_ordinal_));
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count); this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
@ -863,6 +884,9 @@ class AllReducerBase : public xgboost::common::Crtp<AllReducer> {
* \param count Number of elements. * \param count Number of elements.
*/ */
void AllReduceSum(const uint64_t *sendbuff, uint64_t *recvbuff, int count) { void AllReduceSum(const uint64_t *sendbuff, uint64_t *recvbuff, int count) {
if (rabit::GetWorldSize() == 1) {
return;
}
CHECK(initialised_); CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinal_)); dh::safe_cuda(cudaSetDevice(device_ordinal_));
this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count); this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count);
@ -886,6 +910,9 @@ class AllReducerBase : public xgboost::common::Crtp<AllReducer> {
!std::is_same<size_t, unsigned long long>::value> // NOLINT !std::is_same<size_t, unsigned long long>::value> // NOLINT
* = nullptr> * = nullptr>
void AllReduceSum(const T *sendbuff, T *recvbuff, int count) { // NOLINT void AllReduceSum(const T *sendbuff, T *recvbuff, int count) { // NOLINT
if (rabit::GetWorldSize() == 1) {
return;
}
CHECK(initialised_); CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinal_)); dh::safe_cuda(cudaSetDevice(device_ordinal_));
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); // NOLINT static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); // NOLINT

View File

@ -24,7 +24,7 @@ RUN \
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \ RUN \
export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
export NCCL_VERSION=2.7.5-1 && \ export NCCL_VERSION=2.13.4-1 && \
apt-get update && \ apt-get update && \
apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT} apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT}

View File

@ -4,7 +4,6 @@ ARG CUDA_VERSION_ARG
# Install all basic requirements # Install all basic requirements
RUN \ RUN \
rpm --erase gpg-pubkey-7fa2af80* && \
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \ curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \
> /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
yum install -y epel-release centos-release-scl && \ yum install -y epel-release centos-release-scl && \
@ -22,7 +21,7 @@ RUN \
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \ RUN \
export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
export NCCL_VERSION=2.7.3-1 && \ export NCCL_VERSION=2.13.4-1 && \
wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \ wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \ rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
yum -y update && \ yum -y update && \

View File

@ -4,7 +4,6 @@ ARG CUDA_VERSION_ARG
# Install all basic requirements # Install all basic requirements
RUN \ RUN \
rpm --erase gpg-pubkey-7fa2af80* && \
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \ curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \
> /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
yum install -y epel-release centos-release-scl && \ yum install -y epel-release centos-release-scl && \
@ -25,7 +24,7 @@ RUN \
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \ RUN \
export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
export NCCL_VERSION=2.12.12-1 && \ export NCCL_VERSION=2.13.4-1 && \
yum-config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \ yum-config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
yum -y update && \ yum -y update && \
yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT} yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT}

View File

@ -18,7 +18,7 @@ RUN \
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \ RUN \
export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
export NCCL_VERSION=2.7.5-1 && \ export NCCL_VERSION=2.13.4-1 && \
apt-get update && \ apt-get update && \
apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT} apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT}