[CI] Build pip wheel with RMM support (#9383)

2023-07-18 01:52:26 -07:00
parent 6e18d3a290
commit e082718c66
8 changed files with 64 additions and 70 deletions
# build-containers.sh: the dedicated `rmm` container is retired. The `gpu` arm now
# passes only the CUDA and RAPIDS versions (the rmm-only NCCL branch is deleted),
# `gpu_build_centos7` additionally receives RAPIDS_VERSION_ARG, and `jvm_gpu_build`
# keeps CUDA + NCCL in an arm of its own.
--- a/tests/buildkite/build-containers.sh
+++ b/tests/buildkite/build-containers.sh
@@ -20,16 +20,18 @@ case "${container}" in
  cpu)
    ;;
-  gpu|rmm)
+  gpu)
    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
    BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
-    if [[ $container == "rmm" ]]
-    then
-      BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
-    fi
    ;;
-  gpu_build_centos7|jvm_gpu_build)
+  gpu_build_centos7)
+    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
+    BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
+    BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
+    ;;
+  jvm_gpu_build)
    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
    BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
    ;;
# build-cuda-with-rmm.sh: build the RMM-enabled wheel inside the `gpu_build_centos7`
# container instead of the `rmm` one. The new flow runs prune_libnccl.sh, configures
# CMake with "/opt/grpc;/opt/rmm" on CMAKE_PREFIX_PATH and -DPLUGIN_RMM=ON, builds and
# renames the wheel, runs `auditwheel repair` for ${WHEEL_TAG}, checks that libgomp is
# listed in the wheel, uploads the wheel as a Buildkite artifact (and to S3 when
# is_pull_request == 0 and is_release_branch == 1), and uploads build/testxgboost.
# NOTE(review): `exit -1` in the libgomp check is outside the 0-255 exit-status range;
# bash reports it as 255 — consider `exit 1`.
--- a/tests/buildkite/build-cuda-with-rmm.sh
+++ b/tests/buildkite/build-cuda-with-rmm.sh
@@ -2,9 +2,11 @@
 set -euo pipefail
 WHEEL_TAG=manylinux2014_x86_64
 source tests/buildkite/conftest.sh
-echo "--- Build with CUDA ${CUDA_VERSION}, RMM enabled"
+echo "--- Build with CUDA ${CUDA_VERSION} with RMM"
 if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
 then
@@ -13,14 +15,40 @@ else
  arch_flag=""
 fi
-command_wrapper="tests/ci_build/ci_build.sh rmm docker --build-arg "`
+command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "`
                `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
-                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "`
+                `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "`
-                `"NCCL_VERSION_ARG=$NCCL_VERSION"
+                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
 echo "--- Build libxgboost from the source"
-$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON \
+$command_wrapper tests/ci_build/prune_libnccl.sh
-  -DUSE_NCCL=ON -DPLUGIN_RMM=ON ${arch_flag}
+$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm" \
  -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \
  -DPLUGIN_RMM=ON -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \
  -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag}
 echo "--- Build binary wheel"
 $command_wrapper bash -c \
  "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/"
 $command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \
  ${BUILDKITE_COMMIT} ${WHEEL_TAG}
 echo "--- Audit binary wheel to ensure it's compliant with manylinux2014 standard"
 tests/ci_build/ci_build.sh auditwheel_x86_64 docker auditwheel repair \
  --plat ${WHEEL_TAG} python-package/dist/*.whl
 $command_wrapper python tests/ci_build/rename_whl.py wheelhouse/*.whl \
  ${BUILDKITE_COMMIT} ${WHEEL_TAG}
 mv -v wheelhouse/*.whl python-package/dist/
 # Make sure that libgomp.so is vendored in the wheel
 tests/ci_build/ci_build.sh auditwheel_x86_64 docker bash -c \
  "unzip -l python-package/dist/*.whl | grep libgomp  || exit -1"
 echo "--- Upload Python wheel"
 buildkite-agent artifact upload python-package/dist/*.whl
 if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
 then
  aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/experimental_build_with_rmm/ \
    --acl public-read --no-progress
 fi
 echo "-- Stash C++ test executable (testxgboost)"
 buildkite-agent artifact upload build/testxgboost
# build-cuda.sh: also pass RAPIDS_VERSION_ARG as a build argument to the
# `gpu_build_centos7` container, and quote the CMAKE_PREFIX_PATH value.
--- a/tests/buildkite/build-cuda.sh
+++ b/tests/buildkite/build-cuda.sh
@@ -17,11 +17,12 @@ fi
 command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "`
                `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
-                `"NCCL_VERSION_ARG=$NCCL_VERSION"
+                `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "`
                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
 echo "--- Build libxgboost from the source"
 $command_wrapper tests/ci_build/prune_libnccl.sh
-$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc \
+$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc" \
  -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \
  -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \
  -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag}
# pipeline.yml: stop building the `rmm` container image in the build-containers step.
--- a/tests/buildkite/pipeline.yml
+++ b/tests/buildkite/pipeline.yml
@@ -16,7 +16,6 @@ steps:
      - "tests/buildkite/build-containers.sh cpu"
      - "tests/buildkite/build-containers.sh gpu"
      - "tests/buildkite/build-containers.sh gpu_build_centos7"
-      - "tests/buildkite/build-containers.sh rmm"
    key: build-containers
    agents:
      queue: linux-amd64-cpu
# test-cpp-gpu.sh: run the RMM-enabled Google Test binary in the `gpu` container instead
# of the `rmm` one, invoking build/testxgboost directly rather than through
# `bash -c "source activate gpu_test && ..."`.
# NOTE(review): NCCL_VERSION_ARG is still passed to the `gpu` container here, while the
# `gpu)` arm shown for build-containers.sh does not pass it — confirm this is intended.
--- a/tests/buildkite/test-cpp-gpu.sh
+++ b/tests/buildkite/test-cpp-gpu.sh
@@ -16,8 +16,8 @@ echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled"
 rm -rfv build/
 buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm
 chmod +x build/testxgboost
-tests/ci_build/ci_build.sh rmm nvidia-docker \
+tests/ci_build/ci_build.sh gpu nvidia-docker \
  --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
  --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
-  --build-arg NCCL_VERSION_ARG=$NCCL_VERSION bash -c \
+  --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \
-  "source activate gpu_test && build/testxgboost --use-rmm-pool"
+  build/testxgboost --use-rmm-pool
# Dockerfile.gpu_build_centos7: declare RAPIDS_VERSION_ARG, switch the CMake installer
# from 3.18.0 to 3.24.0, and build RMM (tag v${RAPIDS_VERSION_ARG}.00) from source with
# Ninja, installing it under /opt/rmm with -DCUDA_STATIC_RUNTIME=ON.
# NOTE(review): the CMake installer is downloaded with --no-check-certificate, so TLS
# certificate verification is disabled for that download.
--- a/tests/ci_build/Dockerfile.gpu_build_centos7
+++ b/tests/ci_build/Dockerfile.gpu_build_centos7
@@ -2,6 +2,7 @@ ARG CUDA_VERSION_ARG
 FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
 ARG CUDA_VERSION_ARG
 ARG NCCL_VERSION_ARG
 ARG RAPIDS_VERSION_ARG
 # Install all basic requirements
 RUN \
@@ -16,8 +17,8 @@ RUN \
    bash conda.sh -b -p /opt/mambaforge && \
    /opt/mambaforge/bin/python -m pip install awscli && \
    # CMake
-    wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
+    wget -nv -nc https://cmake.org/files/v3.24/cmake-3.24.0-linux-x86_64.sh --no-check-certificate && \
-    bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr
+    bash cmake-3.24.0-linux-x86_64.sh --skip-license --prefix=/usr
 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
@@ -33,9 +34,21 @@ ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:$PATH
 ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc
 ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++
 ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp
 ENV CUDAHOSTCXX=/opt/rh/devtoolset-9/root/usr/bin/c++
 ENV GOSU_VERSION 1.10
 # Install RMM
 RUN git clone -b v${RAPIDS_VERSION_ARG}.00 https://github.com/rapidsai/rmm.git --recurse-submodules --depth 1 && \
    pushd rmm && \
    mkdir build && \
    pushd build && \
    cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=/opt/rmm -DCUDA_STATIC_RUNTIME=ON && \
    cmake --build . --target install && \
    popd && \
    popd && \
    rm -rf rmm
 # Install gRPC
 RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \
      --recurse-submodules --depth 1 && \
# Dockerfile.rmm: removed entirely — the hunk's new range is +0,0, so every line below
# is a deletion. The deleted image was based on nvidia/cuda ...-devel-ubuntu20.04,
# installed libnccl2/libnccl-dev via apt, and created a conda environment `gpu_test`
# containing rmm.
--- a/tests/ci_build/Dockerfile.rmm
+++ b/tests/ci_build/Dockerfile.rmm
@@ -1,49 +0,0 @@
 ARG CUDA_VERSION_ARG
 FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu20.04
 ARG CUDA_VERSION_ARG
 ARG RAPIDS_VERSION_ARG
 ARG NCCL_VERSION_ARG
 # Environment
 ENV DEBIAN_FRONTEND noninteractive
 SHELL ["/bin/bash", "-c"]   # Use Bash as shell
 # Install all basic requirements
 RUN \
    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \
    apt-get update && \
    apt-get install -y wget unzip bzip2 libgomp1 build-essential ninja-build git && \
    # Python
    wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \
    bash conda.sh -b -p /opt/mambaforge
 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
    export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
    export NCCL_VERSION=$NCCL_VERSION_ARG && \
    apt-get update && \
    apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT}
 ENV PATH=/opt/mambaforge/bin:$PATH
 # Create new Conda environment with RMM
 RUN \
    conda install -c conda-forge mamba && \
    mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
        python=3.10 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \
    mamba clean --all
 ENV GOSU_VERSION 1.10
 # Install lightweight sudo (not bound to TTY)
 RUN set -ex; \
    wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
    chmod +x /usr/local/bin/gosu && \
    gosu nobody true
 # Default entry-point to use if running locally
 # It will preserve attributes of created files
 COPY entrypoint.sh /scripts/
 WORKDIR /workspace
 ENTRYPOINT ["/scripts/entrypoint.sh"]
# prune_libnccl.sh: pass -DCMAKE_EXPORT_COMPILE_COMMANDS=ON on the cmake command line.
# The script then greps the `--generate-code=` flags out of compile_commands.json and
# hands them to nvprune, which reads /usr/lib64/libnccl_static.a and writes
# ../libnccl_static.a.
--- a/tests/ci_build/prune_libnccl.sh
+++ b/tests/ci_build/prune_libnccl.sh
@@ -26,7 +26,7 @@ set_property(TARGET test PROPERTY CUDA_ARCHITECTURES \${CMAKE_CUDA_ARCHITECTURES
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 EOF
-cmake . -GNinja
+cmake . -GNinja -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 gen_code=$(grep -o -- '--generate-code=\S*' compile_commands.json | paste -sd ' ')
 nvprune ${gen_code} /usr/lib64/libnccl_static.a -o ../libnccl_static.a