[backport] Update CUDA docker image and NCCL. (#8139) (#8162)

* Update CUDA docker image and NCCL. (#8139)
* Rest of the CI.
* CPU test dependencies.
2022-08-12 18:57:42 +08:00
parent a55d3bdde2
commit 39c1488a42
13 changed files with 68 additions and 45 deletions
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -7,7 +7,7 @@
 dockerRun = 'tests/ci_build/ci_build.sh'

 // Which CUDA version to use when building reference distribution wheel
-ref_cuda_ver = '11.0'
+ref_cuda_ver = '11.0.3'

 import groovy.transform.Field

@@ -60,9 +60,9 @@ pipeline {
            'build-cpu-rabit-mock': { BuildCPUMock() },
            // Build reference, distribution-ready Python wheel with CUDA 11.0
            // using CentOS 7 image
-            'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0', build_rmm: true) },
-            'build-gpu-rpkg': { BuildRPackageWithCUDA(cuda_version: '11.0') },
-            'build-jvm-packages-gpu-cuda11.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.1', cuda_version: '11.0') },
+            'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0.3', build_rmm: true) },
+            'build-gpu-rpkg': { BuildRPackageWithCUDA(cuda_version: '11.0.3') },
+            'build-jvm-packages-gpu-cuda11.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.1', cuda_version: '11.0.3') },
            'build-jvm-packages': { BuildJVMPackages(spark_version: '3.0.1') },
            'build-jvm-doc': { BuildJVMDoc() }
          ])
@@ -77,9 +77,9 @@ pipeline {
            'test-python-cpu': { TestPythonCPU() },
            'test-python-cpu-arm64': { TestPythonCPUARM64() },
            // artifact_cuda_version doesn't apply to RMM tests; RMM tests will always match CUDA version between artifact and host env
-            'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0', test_rmm: true) },
-            'test-python-mgpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0', multi_gpu: true, test_rmm: true) },
-            'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0', test_rmm: true) },
+            'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0.3', host_cuda_version: '11.0.3', test_rmm: true) },
+            'test-python-mgpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0.3', host_cuda_version: '11.0.3', multi_gpu: true, test_rmm: true) },
+            'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0.3', host_cuda_version: '11.0.3', test_rmm: true) },
            'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '3.0.0') }
          ])
        }
@@ -123,7 +123,7 @@ def ClangTidy() {
    echo "Running clang-tidy job..."
    def container_type = "clang_tidy"
    def docker_binary = "docker"
-    def dockerArgs = "--build-arg CUDA_VERSION_ARG=11.0"
+    def dockerArgs = "--build-arg CUDA_VERSION_ARG=11.0.3"
    sh """
    ${dockerRun} ${container_type} ${docker_binary} ${dockerArgs} python3 tests/ci_build/tidy.py --cuda-archs 75
    """
@@ -445,7 +445,7 @@ def DeployJVMPackages(args) {
    if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
      echo 'Deploying to xgboost-maven-repo S3 repo...'
      sh """
-      ${dockerRun} jvm_gpu_build docker --build-arg CUDA_VERSION_ARG=11.0 tests/ci_build/deploy_jvm_packages.sh ${args.spark_version}
+      ${dockerRun} jvm_gpu_build docker --build-arg CUDA_VERSION_ARG=11.0.3 tests/ci_build/deploy_jvm_packages.sh ${args.spark_version}
      """
    }
    deleteDir()
--- a/src/common/device_helpers.cu
+++ b/src/common/device_helpers.cu
@@ -38,6 +38,9 @@ void AllReducer::Init(int _device_ordinal) {

  int32_t const rank = rabit::GetRank();
  int32_t const world = rabit::GetWorldSize();
+  if (world == 1) {
+    return;
+  }

  std::vector<uint64_t> uuids(world * kUuidLength, 0);
  auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -775,13 +775,16 @@ class AllReducer {
   */

  void AllReduceSum(const double *sendbuff, double *recvbuff, int count) {
+    if (rabit::GetWorldSize() == 1) {
+      return;
+    }
 #ifdef XGBOOST_USE_NCCL
    CHECK(initialised_);
    dh::safe_cuda(cudaSetDevice(device_ordinal_));
    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum, comm_, stream_));
    allreduce_bytes_ += count * sizeof(double);
    allreduce_calls_ += 1;
-#endif
+#endif  // XGBOOST_USE_NCCL
  }

  /**
@@ -796,9 +799,12 @@ class AllReducer {

  void AllGather(uint32_t const* data, size_t length,
                 dh::caching_device_vector<uint32_t>* recvbuf) {
+    size_t world = rabit::GetWorldSize();
+    if (world == 1) {
+      return;
+    }
 #ifdef XGBOOST_USE_NCCL
    CHECK(initialised_);
-    size_t world = rabit::GetWorldSize();
    recvbuf->resize(length * world);
    safe_nccl(ncclAllGather(data, recvbuf->data().get(), length, ncclUint32,
                            comm_, stream_));
@@ -813,9 +819,11 @@ class AllReducer {
   * \param recvbuff                The recvbuff.
   * \param count                   Number of elements.
   */
-
  void AllReduceSum(const float *sendbuff, float *recvbuff, int count) {
 #ifdef XGBOOST_USE_NCCL
+    if (rabit::GetWorldSize() == 1) {
+      return;
+    }
    CHECK(initialised_);
    dh::safe_cuda(cudaSetDevice(device_ordinal_));
    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm_, stream_));
@@ -836,6 +844,9 @@ class AllReducer {

  void AllReduceSum(const int64_t *sendbuff, int64_t *recvbuff, int count) {
 #ifdef XGBOOST_USE_NCCL
+    if (rabit::GetWorldSize() == 1) {
+      return;
+    }
    CHECK(initialised_);

    dh::safe_cuda(cudaSetDevice(device_ordinal_));
@@ -845,6 +856,9 @@ class AllReducer {

  void AllReduceSum(const uint32_t *sendbuff, uint32_t *recvbuff, int count) {
 #ifdef XGBOOST_USE_NCCL
+    if (rabit::GetWorldSize() == 1) {
+      return;
+    }
    CHECK(initialised_);

    dh::safe_cuda(cudaSetDevice(device_ordinal_));
@@ -853,6 +867,9 @@ class AllReducer {
  }

  void AllReduceSum(const uint64_t *sendbuff, uint64_t *recvbuff, int count) {
+    if (rabit::GetWorldSize() == 1) {
+      return;
+    }
 #ifdef XGBOOST_USE_NCCL
    CHECK(initialised_);

@@ -867,12 +884,15 @@ class AllReducer {
            std::enable_if_t<std::is_same<size_t, T>::value &&
                             !std::is_same<size_t, unsigned long long>::value>  // NOLINT
                * = nullptr>
-  void AllReduceSum(const T *sendbuff, T *recvbuff, int count) { // NOLINT
+  void AllReduceSum(const T *sendbuff, T *recvbuff, int count) {  // NOLINT
 #ifdef XGBOOST_USE_NCCL
+    if (rabit::GetWorldSize() == 1) {
+      return;
+    }
    CHECK(initialised_);

    dh::safe_cuda(cudaSetDevice(device_ordinal_));
-    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); // NOLINT
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");  // NOLINT
    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclUint64, ncclSum, comm_, stream_));
 #endif
  }
--- a/tests/ci_build/Dockerfile.cpu
+++ b/tests/ci_build/Dockerfile.cpu
@@ -10,13 +10,13 @@ RUN \
    apt-get install -y software-properties-common && \
    add-apt-repository ppa:ubuntu-toolchain-r/test && \
    apt-get update && \
-    apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libasan2 libidn11 ninja-build gcc-8 g++-8 && \
+    apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libasan2 libidn11 ninja-build gcc-8 g++-8 openjdk-8-jdk-headless && \
    # CMake
    wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
    bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
    # Python
-    wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
-    bash Miniconda3.sh -b -p /opt/python
+    wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh && \
+    bash Mambaforge-Linux-x86_64.sh -b -p /opt/python

 ENV PATH=/opt/python/bin:$PATH
 ENV CC=gcc-8
@@ -24,10 +24,11 @@ ENV CXX=g++-8
 ENV CPP=cpp-8

 ENV GOSU_VERSION 1.10
+ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/

 # Create new Conda environment
 COPY conda_env/cpu_test.yml /scripts/
-RUN conda env create -n cpu_test --file=/scripts/cpu_test.yml
+RUN mamba env create -n cpu_test --file=/scripts/cpu_test.yml

 # Install lightweight sudo (not bound to TTY)
 RUN set -ex; \
--- a/tests/ci_build/Dockerfile.gpu
+++ b/tests/ci_build/Dockerfile.gpu
@@ -10,7 +10,7 @@ SHELL ["/bin/bash", "-c"]   # Use Bash as shell
 RUN \
    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \
    apt-get update && \
-    apt-get install -y wget unzip bzip2 libgomp1 build-essential && \
+    apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \
    # Python
    wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash Miniconda3.sh -b -p /opt/python
@@ -19,11 +19,14 @@ ENV PATH=/opt/python/bin:$PATH

 # Create new Conda environment with cuDF, Dask, and cuPy
 RUN \
-    conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
-        python=3.8 cudf=21.10* rmm=21.10* cudatoolkit=$CUDA_VERSION_ARG dask dask-cuda=21.10* dask-cudf=21.10* cupy=9.1* \
-        numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis
+    conda install -c conda-forge mamba && \
+    mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
+        python=3.8 cudf=22.04* rmm=22.04* cudatoolkit=$CUDA_VERSION_ARG dask dask-cuda=22.04* dask-cudf=22.04* cupy \
+        numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
+        pyspark cloudpickle cuda-python=11.7.0

 ENV GOSU_VERSION 1.10
+ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/

 # Install lightweight sudo (not bound to TTY)
 RUN set -ex; \
--- a/tests/ci_build/Dockerfile.gpu_build
+++ b/tests/ci_build/Dockerfile.gpu_build
@@ -24,7 +24,7 @@ RUN \
 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
    export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
-    export NCCL_VERSION=2.7.5-1 && \
+    export NCCL_VERSION=2.13.4-1 && \
    apt-get update && \
    apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT}

--- a/tests/ci_build/Dockerfile.gpu_build_centos7
+++ b/tests/ci_build/Dockerfile.gpu_build_centos7
@@ -4,7 +4,6 @@ ARG CUDA_VERSION_ARG

 # Install all basic requirements
 RUN \
-    rpm --erase gpg-pubkey-7fa2af80* && \
    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \
        > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
    yum install -y epel-release centos-release-scl && \
@@ -22,7 +21,7 @@ RUN \
 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
    export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
-    export NCCL_VERSION=2.7.3-1 && \
+    export NCCL_VERSION=2.13.4-1 && \
    wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
    rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
    yum -y update && \
--- a/tests/ci_build/Dockerfile.jvm_gpu_build
+++ b/tests/ci_build/Dockerfile.jvm_gpu_build
@@ -4,7 +4,6 @@ ARG CUDA_VERSION_ARG

 # Install all basic requirements
 RUN \
-    rpm --erase gpg-pubkey-7fa2af80* && \
    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \
        > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
    yum install -y epel-release centos-release-scl && \
@@ -25,12 +24,10 @@ RUN \
 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
    export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
-    export NCCL_VERSION=2.8.3-1 && \
-    wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
-    rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
+    export NCCL_VERSION=2.13.4-1 && \
+    yum-config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
    yum -y update && \
-    yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT} && \
-    rm -f nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm;
+    yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT}

 ENV PATH=/opt/python/bin:/opt/maven/bin:$PATH
 ENV CC=/opt/rh/devtoolset-8/root/usr/bin/gcc
--- a/tests/ci_build/Dockerfile.rmm
+++ b/tests/ci_build/Dockerfile.rmm
@@ -18,7 +18,7 @@ RUN \
 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
    export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
-    export NCCL_VERSION=2.7.5-1 && \
+    export NCCL_VERSION=2.13.4-1 && \
    apt-get update && \
    apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT}

@@ -27,7 +27,7 @@ ENV PATH=/opt/python/bin:$PATH
 # Create new Conda environment with RMM
 RUN \
    conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
-        python=3.9 rmm=22.06* cudatoolkit=$CUDA_VERSION_ARG cmake
+        python=3.9 rmm=22.04* cudatoolkit=$CUDA_VERSION_ARG cmake

 ENV GOSU_VERSION 1.10

--- a/tests/ci_build/build_python_wheels.sh
+++ b/tests/ci_build/build_python_wheels.sh
@@ -29,13 +29,15 @@ if [[ "$platform_id" == macosx_* ]]; then
        setup_env_var='CIBW_TARGET_OSX_ARM64=1'  # extra flag to be passed to setup.py
        export PYTHON_CROSSENV=1
        export MACOSX_DEPLOYMENT_TARGET=12.0
-        OPENMP_URL="https://anaconda.org/conda-forge/llvm-openmp/11.1.0/download/osx-arm64/llvm-openmp-11.1.0-hf3c4609_1.tar.bz2"
+        #OPENMP_URL="https://anaconda.org/conda-forge/llvm-openmp/11.1.0/download/osx-arm64/llvm-openmp-11.1.0-hf3c4609_1.tar.bz2"
+        OPENMP_URL="https://xgboost-ci-jenkins-artifacts.s3.us-west-2.amazonaws.com/llvm-openmp-11.1.0-hf3c4609_1-osx-arm64.tar.bz2"
    elif [[ "$platform_id" == macosx_x86_64 ]]; then
        # MacOS, Intel
        wheel_tag=macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64
        cpython_ver=37
        export MACOSX_DEPLOYMENT_TARGET=10.13
-        OPENMP_URL="https://anaconda.org/conda-forge/llvm-openmp/11.1.0/download/osx-64/llvm-openmp-11.1.0-hda6cdc1_1.tar.bz2"
+        #OPENMP_URL="https://anaconda.org/conda-forge/llvm-openmp/11.1.0/download/osx-64/llvm-openmp-11.1.0-hda6cdc1_1.tar.bz2"
+        OPENMP_URL="https://xgboost-ci-jenkins-artifacts.s3.us-west-2.amazonaws.com/llvm-openmp-11.1.0-hda6cdc1_1-osx-64.tar.bz2"
    else
        echo "Platform not supported: $platform_id"
        exit 3
--- a/tests/ci_build/conda_env/cpu_test.yml
+++ b/tests/ci_build/conda_env/cpu_test.yml
@@ -30,15 +30,13 @@ dependencies:
 - jsonschema
 - boto3
 - awscli
- numba
- llvmlite
 - py-ubjson
 - cffi
 - pyarrow
- protobuf<=3.20
+- protobuf
+- pyspark>=3.3.0
+- cloudpickle
+- shap
+- modin
 - pip:
-  - shap
-  - ipython                     # required by shap at import time.
-  - sphinx_rtd_theme
  - datatable
-  - modin[all]
--- a/tests/ci_build/entrypoint.sh
+++ b/tests/ci_build/entrypoint.sh
@@ -20,9 +20,9 @@ else
 fi

 if [[ -n $CI_BUILD_UID ]] && [[ -n $CI_BUILD_GID ]]; then
-    groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}"
+    groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" || true
    useradd -o -m -g "${CI_BUILD_GID}" -u "${CI_BUILD_UID}" \
-        "${CI_BUILD_USER}"
+        "${CI_BUILD_USER}" || true
    export HOME="/home/${CI_BUILD_USER}"
    shopt -s dotglob
    cp -r /root/* "$HOME/"
--- a/tests/ci_build/rename_whl.py
+++ b/tests/ci_build/rename_whl.py
@@ -42,4 +42,4 @@ with cd(dirname):

    filesize = os.path.getsize(new_name) / 1024 / 1024  # MB
    msg = f"Limit of wheel size set by PyPI is exceeded. {new_name}: {filesize}"
-    assert filesize <= 200, msg
+    assert filesize <= 300, msg