diff --git a/tests/buildkite/build-containers.sh b/tests/buildkite/build-containers.sh index 899976a7d..f46e6ccd0 100755 --- a/tests/buildkite/build-containers.sh +++ b/tests/buildkite/build-containers.sh @@ -20,16 +20,18 @@ case "${container}" in cpu) ;; - gpu|rmm) + gpu) BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION" - if [[ $container == "rmm" ]] - then - BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" - fi ;; - gpu_build_centos7|jvm_gpu_build) + gpu_build_centos7) + BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" + BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" + BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION" + ;; + + jvm_gpu_build) BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" ;; diff --git a/tests/buildkite/build-cuda-with-rmm.sh b/tests/buildkite/build-cuda-with-rmm.sh index 2e0b9fe2c..46bc98028 100755 --- a/tests/buildkite/build-cuda-with-rmm.sh +++ b/tests/buildkite/build-cuda-with-rmm.sh @@ -2,9 +2,11 @@ set -euo pipefail +WHEEL_TAG=manylinux2014_x86_64 + source tests/buildkite/conftest.sh -echo "--- Build with CUDA ${CUDA_VERSION}, RMM enabled" +echo "--- Build with CUDA ${CUDA_VERSION} with RMM" if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] then @@ -13,14 +15,40 @@ else arch_flag="" fi -command_wrapper="tests/ci_build/ci_build.sh rmm docker --build-arg "` +command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "` `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION" + `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "` + `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON \ - -DUSE_NCCL=ON -DPLUGIN_RMM=ON ${arch_flag} +$command_wrapper tests/ci_build/prune_libnccl.sh +$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm" \ + -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \ + -DPLUGIN_RMM=ON -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \ + -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag} +echo "--- Build binary wheel" +$command_wrapper bash -c \ + "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/" +$command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \ + ${BUILDKITE_COMMIT} ${WHEEL_TAG} + +echo "--- Audit binary wheel to ensure it's compliant with manylinux2014 standard" +tests/ci_build/ci_build.sh auditwheel_x86_64 docker auditwheel repair \ + --plat ${WHEEL_TAG} python-package/dist/*.whl +$command_wrapper python tests/ci_build/rename_whl.py wheelhouse/*.whl \ + ${BUILDKITE_COMMIT} ${WHEEL_TAG} +mv -v wheelhouse/*.whl python-package/dist/ +# Make sure that libgomp.so is vendored in the wheel +tests/ci_build/ci_build.sh auditwheel_x86_64 docker bash -c \ + "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" + +echo "--- Upload Python wheel" +buildkite-agent artifact upload python-package/dist/*.whl +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/experimental_build_with_rmm/ \ + --acl public-read --no-progress +fi echo "-- Stash C++ test executable (testxgboost)" buildkite-agent artifact upload build/testxgboost diff --git a/tests/buildkite/build-cuda.sh b/tests/buildkite/build-cuda.sh index c180695e8..1926754b8 100755 --- a/tests/buildkite/build-cuda.sh +++ b/tests/buildkite/build-cuda.sh @@ -17,11 +17,12 @@ fi command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "` `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION" + `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "` + `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" echo "--- Build libxgboost from the source" $command_wrapper tests/ci_build/prune_libnccl.sh -$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc \ +$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc" \ -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \ -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \ -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag} diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml index 72e1ec1e5..905535c52 100644 --- a/tests/buildkite/pipeline.yml +++ b/tests/buildkite/pipeline.yml @@ -16,7 +16,6 @@ steps: - "tests/buildkite/build-containers.sh cpu" - "tests/buildkite/build-containers.sh gpu" - "tests/buildkite/build-containers.sh gpu_build_centos7" - - "tests/buildkite/build-containers.sh rmm" key: build-containers agents: queue: linux-amd64-cpu diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh index 7c8f5e505..58d250308 100755 --- a/tests/buildkite/test-cpp-gpu.sh +++ b/tests/buildkite/test-cpp-gpu.sh @@ -16,8 +16,8 @@ echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" rm -rfv build/ buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm chmod +x build/testxgboost -tests/ci_build/ci_build.sh rmm nvidia-docker \ +tests/ci_build/ci_build.sh gpu nvidia-docker \ --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - --build-arg NCCL_VERSION_ARG=$NCCL_VERSION bash -c \ - "source activate gpu_test && build/testxgboost --use-rmm-pool" + --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ + build/testxgboost --use-rmm-pool diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7 index bfc79c216..4f9823baa 100644 --- a/tests/ci_build/Dockerfile.gpu_build_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_centos7 @@ -2,6 +2,7 @@ ARG CUDA_VERSION_ARG FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7 ARG CUDA_VERSION_ARG ARG NCCL_VERSION_ARG +ARG RAPIDS_VERSION_ARG # Install all basic requirements RUN \ @@ -16,8 +17,8 @@ RUN \ bash conda.sh -b -p /opt/mambaforge && \ /opt/mambaforge/bin/python -m pip install awscli && \ # CMake - wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \ - bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr + wget -nv -nc https://cmake.org/files/v3.24/cmake-3.24.0-linux-x86_64.sh --no-check-certificate && \ + bash cmake-3.24.0-linux-x86_64.sh --skip-license --prefix=/usr # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) RUN \ @@ -33,9 +34,21 @@ ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:$PATH ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++ ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp +ENV CUDAHOSTCXX=/opt/rh/devtoolset-9/root/usr/bin/c++ ENV GOSU_VERSION 1.10 +# Install RMM +RUN git clone -b v${RAPIDS_VERSION_ARG}.00 https://github.com/rapidsai/rmm.git --recurse-submodules --depth 1 && \ + pushd rmm && \ + mkdir build && \ + pushd build && \ + cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=/opt/rmm -DCUDA_STATIC_RUNTIME=ON && \ + cmake --build . --target install && \ + popd && \ + popd && \ + rm -rf rmm + # Install gRPC RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \ --recurse-submodules --depth 1 && \ diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm deleted file mode 100644 index 16db377c2..000000000 --- a/tests/ci_build/Dockerfile.rmm +++ /dev/null @@ -1,49 +0,0 @@ -ARG CUDA_VERSION_ARG -FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu20.04 -ARG CUDA_VERSION_ARG -ARG RAPIDS_VERSION_ARG -ARG NCCL_VERSION_ARG - -# Environment -ENV DEBIAN_FRONTEND noninteractive -SHELL ["/bin/bash", "-c"] # Use Bash as shell - -# Install all basic requirements -RUN \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \ - apt-get update && \ - apt-get install -y wget unzip bzip2 libgomp1 build-essential ninja-build git && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/mambaforge - -# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) -RUN \ - export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ - export NCCL_VERSION=$NCCL_VERSION_ARG && \ - apt-get update && \ - apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT} - -ENV PATH=/opt/mambaforge/bin:$PATH - -# Create new Conda environment with RMM -RUN \ - conda install -c conda-forge mamba && \ - mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \ - python=3.10 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \ - mamba clean --all - -ENV GOSU_VERSION 1.10 - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/prune_libnccl.sh b/tests/ci_build/prune_libnccl.sh index 5b6e48ad5..a81d6e4ac 100755 --- a/tests/ci_build/prune_libnccl.sh +++ b/tests/ci_build/prune_libnccl.sh @@ -26,7 +26,7 @@ set_property(TARGET test PROPERTY CUDA_ARCHITECTURES \${CMAKE_CUDA_ARCHITECTURES set(CMAKE_EXPORT_COMPILE_COMMANDS ON) EOF -cmake . -GNinja +cmake . -GNinja -DCMAKE_EXPORT_COMPILE_COMMANDS=ON gen_code=$(grep -o -- '--generate-code=\S*' compile_commands.json | paste -sd ' ') nvprune ${gen_code} /usr/lib64/libnccl_static.a -o ../libnccl_static.a