From ef8bdaa047ebf6504245a08d7046c49528cc1a13 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Fri, 29 Dec 2023 11:15:38 -0800 Subject: [PATCH] [CI] Update machine images (#9932) --- cmake/modules/FindLibR.cmake | 5 +++- doc/contrib/ci.rst | 7 ++--- tests/buildkite/build-containers.sh | 3 +- tests/buildkite/build-cpu-arm64.sh | 2 +- tests/buildkite/build-cpu.sh | 2 +- tests/buildkite/build-cuda-with-rmm.sh | 6 ++-- tests/buildkite/build-cuda.sh | 6 ++-- tests/buildkite/build-gpu-rpkg.sh | 2 +- tests/buildkite/build-jvm-doc.sh | 2 +- tests/buildkite/build-jvm-packages-gpu.sh | 2 +- tests/buildkite/build-jvm-packages.sh | 4 +-- tests/buildkite/deploy-jvm-packages.sh | 2 +- .../aws-stack-creator/create_stack.py | 2 +- .../aws-stack-creator/metadata.py | 28 +++++++++---------- .../linux-amd64-gpu-bootstrap.yml | 12 ++++---- .../windows-gpu-bootstrap.yml | 25 ++++++----------- tests/buildkite/run-clang-tidy.sh | 2 +- tests/buildkite/test-cpp-gpu.sh | 4 +-- tests/buildkite/test-cpp-mgpu.sh | 2 +- .../test-integration-jvm-packages.sh | 2 +- tests/buildkite/test-python-cpu-arm64.sh | 2 +- tests/buildkite/test-python-cpu.sh | 2 +- tests/buildkite/test-python-gpu.sh | 2 +- tests/ci_build/build_r_pkg_with_cuda_win64.sh | 6 ++-- tests/ci_build/ci_build.sh | 28 ++++++++++++------- tests/cpp/common/test_device_helpers.cu | 2 +- 26 files changed, 82 insertions(+), 80 deletions(-) diff --git a/cmake/modules/FindLibR.cmake b/cmake/modules/FindLibR.cmake index 1eb384238..c406ae0d9 100644 --- a/cmake/modules/FindLibR.cmake +++ b/cmake/modules/FindLibR.cmake @@ -65,7 +65,10 @@ function(create_rlib_for_msvc) execute_process(COMMAND ${DLLTOOL_EXE} "--input-def" "${CMAKE_CURRENT_BINARY_DIR}/R.def" - "--output-lib" "${CMAKE_CURRENT_BINARY_DIR}/R.lib") + "--output-lib" "${CMAKE_CURRENT_BINARY_DIR}/R.lib" + "--temp-prefix" "Rlibtemp" + COMMAND_ECHO STDOUT + COMMAND_ERROR_IS_FATAL ANY) endfunction() diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst index 2db6f80bc..af9e65562 100644 --- a/doc/contrib/ci.rst +++ b/doc/contrib/ci.rst @@ -59,7 +59,7 @@ For your convenience, we provide the wrapper script ``tests/ci_build/ci_build.sh .. code-block:: bash - tests/ci_build/ci_build.sh --build-arg \ + tests/ci_build/ci_build.sh --use-gpus --build-arg \ ... where: @@ -68,8 +68,7 @@ where: container definition (Dockerfile) located at ``tests/ci_build/Dockerfile.``. For example, setting the container type to ``gpu`` will cause the script to load the Dockerfile ``tests/ci_build/Dockerfile.gpu``. -* ```` must be either ``docker`` or ``nvidia-docker``. Choose ``nvidia-docker`` - as long as you need to run any GPU code. +* Specify ``--use-gpus`` to run any GPU code. This flag will grant the container access to all NVIDIA GPUs in the base machine. Omit the flag if the access to GPUs is not necessary. * ```` is a build argument to be passed to Docker. Must be of form ``VAR=VALUE``. Example: ``--build-arg CUDA_VERSION_ARG=11.0``. You can pass multiple ``--build-arg``. * ```` is the command to run inside the Docker container. This can be more than one argument. @@ -83,7 +82,7 @@ arguments to Docker. For example: # Allocate extra space in /dev/shm to enable NCCL export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' # Run multi-GPU test suite - tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg CUDA_VERSION_ARG=11.0 \ + tests/ci_build/ci_build.sh gpu --use-gpus --build-arg CUDA_VERSION_ARG=11.0 \ tests/ci_build/test_python.sh mgpu To pass multiple extra arguments: diff --git a/tests/buildkite/build-containers.sh b/tests/buildkite/build-containers.sh index f46e6ccd0..9aec33d1f 100755 --- a/tests/buildkite/build-containers.sh +++ b/tests/buildkite/build-containers.sh @@ -22,6 +22,7 @@ case "${container}" in gpu) BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" + BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION" ;; @@ -43,4 +44,4 @@ case "${container}" in esac # Run a no-op command. This will simply build the container and push it to the private registry -tests/ci_build/ci_build.sh ${container} docker ${BUILD_ARGS} bash +tests/ci_build/ci_build.sh ${container} ${BUILD_ARGS} bash diff --git a/tests/buildkite/build-cpu-arm64.sh b/tests/buildkite/build-cpu-arm64.sh index fd00a7971..3bbc95472 100755 --- a/tests/buildkite/build-cpu-arm64.sh +++ b/tests/buildkite/build-cpu-arm64.sh @@ -8,7 +8,7 @@ echo "--- Build CPU code targeting ARM64" source tests/buildkite/conftest.sh -command_wrapper="tests/ci_build/ci_build.sh aarch64 docker" +command_wrapper="tests/ci_build/ci_build.sh aarch64" echo "--- Build libxgboost from the source" $command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=aarch64_test \ diff --git a/tests/buildkite/build-cpu.sh b/tests/buildkite/build-cpu.sh index 73e88d8aa..11679d644 100755 --- a/tests/buildkite/build-cpu.sh +++ b/tests/buildkite/build-cpu.sh @@ -6,7 +6,7 @@ echo "--- Build CPU code" source tests/buildkite/conftest.sh -command_wrapper="tests/ci_build/ci_build.sh cpu docker" +command_wrapper="tests/ci_build/ci_build.sh cpu" $command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h # This step is not necessary, but here we include it, to ensure that diff --git a/tests/buildkite/build-cuda-with-rmm.sh b/tests/buildkite/build-cuda-with-rmm.sh index 615608249..559bad8a7 100755 --- a/tests/buildkite/build-cuda-with-rmm.sh +++ b/tests/buildkite/build-cuda-with-rmm.sh @@ -15,7 +15,7 @@ else arch_flag="" fi -command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "` +command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 --build-arg "` `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "` `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" @@ -40,13 +40,13 @@ $command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \ ${BUILDKITE_COMMIT} ${WHEEL_TAG} echo "--- Audit binary wheel to ensure it's compliant with manylinux2014 standard" -tests/ci_build/ci_build.sh auditwheel_x86_64 docker auditwheel repair \ +tests/ci_build/ci_build.sh auditwheel_x86_64 auditwheel repair \ --plat ${WHEEL_TAG} python-package/dist/*.whl $command_wrapper python tests/ci_build/rename_whl.py wheelhouse/*.whl \ ${BUILDKITE_COMMIT} ${WHEEL_TAG} mv -v wheelhouse/*.whl python-package/dist/ # Make sure that libgomp.so is vendored in the wheel -tests/ci_build/ci_build.sh auditwheel_x86_64 docker bash -c \ +tests/ci_build/ci_build.sh auditwheel_x86_64 bash -c \ "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" echo "--- Upload Python wheel" diff --git a/tests/buildkite/build-cuda.sh b/tests/buildkite/build-cuda.sh index 7bd3492a2..5abc5ca5a 100755 --- a/tests/buildkite/build-cuda.sh +++ b/tests/buildkite/build-cuda.sh @@ -15,7 +15,7 @@ else arch_flag="" fi -command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "` +command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 --build-arg "` `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "` `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" @@ -39,13 +39,13 @@ $command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \ ${BUILDKITE_COMMIT} ${WHEEL_TAG} echo "--- Audit binary wheel to ensure it's compliant with manylinux2014 standard" -tests/ci_build/ci_build.sh auditwheel_x86_64 docker auditwheel repair \ +tests/ci_build/ci_build.sh auditwheel_x86_64 auditwheel repair \ --plat ${WHEEL_TAG} python-package/dist/*.whl $command_wrapper python tests/ci_build/rename_whl.py wheelhouse/*.whl \ ${BUILDKITE_COMMIT} ${WHEEL_TAG} mv -v wheelhouse/*.whl python-package/dist/ # Make sure that libgomp.so is vendored in the wheel -tests/ci_build/ci_build.sh auditwheel_x86_64 docker bash -c \ +tests/ci_build/ci_build.sh auditwheel_x86_64 bash -c \ "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" echo "--- Upload Python wheel" diff --git a/tests/buildkite/build-gpu-rpkg.sh b/tests/buildkite/build-gpu-rpkg.sh index 585dc79ae..78a534615 100755 --- a/tests/buildkite/build-gpu-rpkg.sh +++ b/tests/buildkite/build-gpu-rpkg.sh @@ -6,7 +6,7 @@ source tests/buildkite/conftest.sh echo "--- Build XGBoost R package with CUDA" -tests/ci_build/ci_build.sh gpu_build_r_centos7 docker \ +tests/ci_build/ci_build.sh gpu_build_r_centos7 \ --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ --build-arg R_VERSION_ARG=${R_VERSION} \ tests/ci_build/build_r_pkg_with_cuda.sh \ diff --git a/tests/buildkite/build-jvm-doc.sh b/tests/buildkite/build-jvm-doc.sh index a2d658e48..d168eb8cc 100755 --- a/tests/buildkite/build-jvm-doc.sh +++ b/tests/buildkite/build-jvm-doc.sh @@ -5,7 +5,7 @@ set -euo pipefail source tests/buildkite/conftest.sh echo "--- Build JVM packages doc" -tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_doc.sh ${BRANCH_NAME} +tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_doc.sh ${BRANCH_NAME} if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then echo "--- Upload JVM packages doc" diff --git a/tests/buildkite/build-jvm-packages-gpu.sh b/tests/buildkite/build-jvm-packages-gpu.sh index 6a9a29cb3..64be7cc0a 100755 --- a/tests/buildkite/build-jvm-packages-gpu.sh +++ b/tests/buildkite/build-jvm-packages-gpu.sh @@ -13,7 +13,7 @@ else arch_flag="" fi -tests/ci_build/ci_build.sh jvm_gpu_build nvidia-docker \ +tests/ci_build/ci_build.sh jvm_gpu_build --use-gpus \ --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ --build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \ tests/ci_build/build_jvm_packages.sh \ diff --git a/tests/buildkite/build-jvm-packages.sh b/tests/buildkite/build-jvm-packages.sh index 33cfffe71..12393c561 100755 --- a/tests/buildkite/build-jvm-packages.sh +++ b/tests/buildkite/build-jvm-packages.sh @@ -5,13 +5,13 @@ set -euo pipefail source tests/buildkite/conftest.sh echo "--- Build XGBoost JVM packages scala 2.12" -tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \ +tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \ ${SPARK_VERSION} echo "--- Build XGBoost JVM packages scala 2.13" -tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \ +tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \ ${SPARK_VERSION} "" "" "true" echo "--- Stash XGBoost4J JARs" diff --git a/tests/buildkite/deploy-jvm-packages.sh b/tests/buildkite/deploy-jvm-packages.sh index a3410b294..812a6c5ca 100755 --- a/tests/buildkite/deploy-jvm-packages.sh +++ b/tests/buildkite/deploy-jvm-packages.sh @@ -7,7 +7,7 @@ source tests/buildkite/conftest.sh if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo" - tests/ci_build/ci_build.sh jvm_gpu_build docker \ + tests/ci_build/ci_build.sh jvm_gpu_build \ --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ --build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \ tests/ci_build/deploy_jvm_packages.sh ${SPARK_VERSION} diff --git a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py b/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py index 4277eed53..8f8db348a 100644 --- a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py +++ b/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py @@ -63,7 +63,7 @@ def format_params(args, *, stack_id, agent_iam_policy): params["BuildkiteAgentToken"] = args.agent_token params["VpcId"] = default_vpc.id params["Subnets"] = ",".join(subnets) - params["ManagedPolicyARN"] = agent_iam_policy + params["ManagedPolicyARNs"] = agent_iam_policy params.update(COMMON_STACK_PARAMS) return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()] diff --git a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py index 36d8595a6..3b56a2d8c 100644 --- a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py +++ b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py @@ -1,34 +1,34 @@ AMI_ID = { # Managed by XGBoost team "linux-amd64-gpu": { - "us-west-2": "ami-094271bed4788ddb5", + "us-west-2": "ami-08c3bc1dd5ec8bc5c", }, "linux-amd64-mgpu": { - "us-west-2": "ami-094271bed4788ddb5", + "us-west-2": "ami-08c3bc1dd5ec8bc5c", }, "windows-gpu": { - "us-west-2": "ami-0839681594a1d7627", + "us-west-2": "ami-03c7f2156f93b22a7", }, "windows-cpu": { - "us-west-2": "ami-0839681594a1d7627", + "us-west-2": "ami-03c7f2156f93b22a7", }, # Managed by BuildKite # from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml "linux-amd64-cpu": { - "us-west-2": "ami-00f2127550cf03658", + "us-west-2": "ami-015e64acb52b3e595", }, "pipeline-loader": { - "us-west-2": "ami-00f2127550cf03658", + "us-west-2": "ami-015e64acb52b3e595", }, "linux-arm64-cpu": { - "us-west-2": "ami-0c5789068f4a2d1b5", + "us-west-2": "ami-0884e9c23a2fa98d0", }, } STACK_PARAMS = { "linux-amd64-gpu": { "InstanceOperatingSystem": "linux", - "InstanceType": "g4dn.xlarge", + "InstanceTypes": "g4dn.xlarge", "AgentsPerInstance": "1", "MinSize": "0", "MaxSize": "8", @@ -38,7 +38,7 @@ STACK_PARAMS = { }, "linux-amd64-mgpu": { "InstanceOperatingSystem": "linux", - "InstanceType": "g4dn.12xlarge", + "InstanceTypes": "g4dn.12xlarge", "AgentsPerInstance": "1", "MinSize": "0", "MaxSize": "1", @@ -48,7 +48,7 @@ STACK_PARAMS = { }, "windows-gpu": { "InstanceOperatingSystem": "windows", - "InstanceType": "g4dn.2xlarge", + "InstanceTypes": "g4dn.2xlarge", "AgentsPerInstance": "1", "MinSize": "0", "MaxSize": "2", @@ -58,7 +58,7 @@ STACK_PARAMS = { }, "windows-cpu": { "InstanceOperatingSystem": "windows", - "InstanceType": "c5a.2xlarge", + "InstanceTypes": "c5a.2xlarge", "AgentsPerInstance": "1", "MinSize": "0", "MaxSize": "2", @@ -68,7 +68,7 @@ STACK_PARAMS = { }, "linux-amd64-cpu": { "InstanceOperatingSystem": "linux", - "InstanceType": "c5a.4xlarge", + "InstanceTypes": "c5a.4xlarge", "AgentsPerInstance": "1", "MinSize": "0", "MaxSize": "16", @@ -78,7 +78,7 @@ STACK_PARAMS = { }, "pipeline-loader": { "InstanceOperatingSystem": "linux", - "InstanceType": "t3a.micro", + "InstanceTypes": "t3a.micro", "AgentsPerInstance": "1", "MinSize": "2", "MaxSize": "2", @@ -88,7 +88,7 @@ STACK_PARAMS = { }, "linux-arm64-cpu": { "InstanceOperatingSystem": "linux", - "InstanceType": "c6g.4xlarge", + "InstanceTypes": "c6g.4xlarge", "AgentsPerInstance": "1", "MinSize": "0", "MaxSize": "8", diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml b/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml index a5c82a7fa..88403911c 100644 --- a/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml +++ b/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml @@ -12,15 +12,13 @@ phases: - | yum groupinstall -y "Development tools" yum install -y kernel-devel-$(uname -r) + dnf install -y kernel-modules-extra aws s3 cp --recursive s3://ec2-linux-nvidia-drivers/latest/ . chmod +x NVIDIA-Linux-x86_64*.run - CC=/usr/bin/gcc10-cc ./NVIDIA-Linux-x86_64*.run --silent + ./NVIDIA-Linux-x86_64*.run --silent - amazon-linux-extras install docker - systemctl --now enable docker - distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \ - && curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.repo \ - | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo + curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | tee /etc/yum.repos.d/nvidia-container-toolkit.repo + yum install -y nvidia-container-toolkit yum clean expire-cache - yum install -y nvidia-docker2 + nvidia-ctk runtime configure --runtime=docker systemctl restart docker diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml b/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml index 03fb105a7..e4d212fda 100644 --- a/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml +++ b/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml @@ -15,9 +15,9 @@ phases: choco --version choco feature enable -n=allowGlobalConfirmation - # CMake 3.25 - Write-Host '>>> Installing CMake 3.25...' - choco install cmake --version 3.25.2 --installargs "ADD_CMAKE_TO_PATH=System" + # CMake 3.27 + Write-Host '>>> Installing CMake 3.27...' + choco install cmake --version 3.27.9 --installargs "ADD_CMAKE_TO_PATH=System" if ($LASTEXITCODE -ne 0) { throw "Last command failed" } # Notepad++ @@ -25,15 +25,14 @@ phases: choco install notepadplusplus if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - # Miniconda - Write-Host '>>> Installing Miniconda...' - choco install miniconda3 /RegisterPython:1 /D:C:\tools\miniconda3 - C:\tools\miniconda3\Scripts\conda.exe init --user --system + # Mambaforge + Write-Host '>>> Installing Mambaforge...' + choco install mambaforge /RegisterPython:1 /D:C:\tools\mambaforge + C:\tools\mambaforge\Scripts\conda.exe init --user --system if ($LASTEXITCODE -ne 0) { throw "Last command failed" } . "C:\Windows\System32\WindowsPowerShell\v1.0\profile.ps1" if ($LASTEXITCODE -ne 0) { throw "Last command failed" } conda config --set auto_activate_base false - conda config --prepend channels conda-forge # Install Java 11 Write-Host '>>> Installing Java 11...' @@ -59,15 +58,9 @@ phases: choco install cuda --version=11.8.0.52206 if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - # Install Python packages - Write-Host '>>> Installing Python packages...' - conda activate - conda install -y mamba - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - # Install R Write-Host '>>> Installing R...' - choco install r.project --version=3.6.3 + choco install r.project --version=4.3.2 if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - choco install rtools --version=3.5.0.4 + choco install rtools --version=4.3.5550 if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/tests/buildkite/run-clang-tidy.sh b/tests/buildkite/run-clang-tidy.sh index 1a664f568..95ff010c2 100755 --- a/tests/buildkite/run-clang-tidy.sh +++ b/tests/buildkite/run-clang-tidy.sh @@ -6,6 +6,6 @@ echo "--- Run clang-tidy" source tests/buildkite/conftest.sh -tests/ci_build/ci_build.sh clang_tidy docker \ +tests/ci_build/ci_build.sh clang_tidy \ --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ python3 tests/ci_build/tidy.py --cuda-archs 75 diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh index 36f54cd3d..d7197db2e 100755 --- a/tests/buildkite/test-cpp-gpu.sh +++ b/tests/buildkite/test-cpp-gpu.sh @@ -7,7 +7,7 @@ source tests/buildkite/conftest.sh echo "--- Run Google Tests with CUDA, using a GPU" buildkite-agent artifact download "build/testxgboost" . --step build-cuda chmod +x build/testxgboost -tests/ci_build/ci_build.sh gpu nvidia-docker \ +tests/ci_build/ci_build.sh gpu --use-gpus \ --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ @@ -17,7 +17,7 @@ echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" rm -rfv build/ buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm chmod +x build/testxgboost -tests/ci_build/ci_build.sh gpu nvidia-docker \ +tests/ci_build/ci_build.sh gpu --use-gpus \ --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ diff --git a/tests/buildkite/test-cpp-mgpu.sh b/tests/buildkite/test-cpp-mgpu.sh index 2aac47407..65614b191 100755 --- a/tests/buildkite/test-cpp-mgpu.sh +++ b/tests/buildkite/test-cpp-mgpu.sh @@ -10,7 +10,7 @@ export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' echo "--- Run Google Tests with CUDA, using multiple GPUs" buildkite-agent artifact download "build/testxgboost" . --step build-cuda chmod +x build/testxgboost -tests/ci_build/ci_build.sh gpu nvidia-docker \ +tests/ci_build/ci_build.sh gpu --use-gpus \ --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ diff --git a/tests/buildkite/test-integration-jvm-packages.sh b/tests/buildkite/test-integration-jvm-packages.sh index 9f477720a..51f74afe9 100755 --- a/tests/buildkite/test-integration-jvm-packages.sh +++ b/tests/buildkite/test-integration-jvm-packages.sh @@ -9,5 +9,5 @@ buildkite-agent artifact download "jvm-packages/xgboost4j/target/*.jar" . --step buildkite-agent artifact download "jvm-packages/xgboost4j-spark/target/*.jar" . --step build-jvm-packages buildkite-agent artifact download "jvm-packages/xgboost4j-example/target/*.jar" . --step build-jvm-packages export CI_DOCKER_EXTRA_PARAMS_INIT='-e RUN_INTEGRATION_TEST=1' -tests/ci_build/ci_build.sh jvm_cross docker --build-arg JDK_VERSION=${JDK_VERSION} \ +tests/ci_build/ci_build.sh jvm_cross --build-arg JDK_VERSION=${JDK_VERSION} \ --build-arg SPARK_VERSION=${SPARK_VERSION} tests/ci_build/test_jvm_cross.sh diff --git a/tests/buildkite/test-python-cpu-arm64.sh b/tests/buildkite/test-python-cpu-arm64.sh index ed1ad101b..68a428034 100755 --- a/tests/buildkite/test-python-cpu-arm64.sh +++ b/tests/buildkite/test-python-cpu-arm64.sh @@ -8,4 +8,4 @@ echo "--- Test Python CPU ARM64" buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cpu-arm64 buildkite-agent artifact download "xgboost" . --step build-cpu-arm64 chmod +x ./xgboost -tests/ci_build/ci_build.sh aarch64 docker tests/ci_build/test_python.sh cpu-arm64 +tests/ci_build/ci_build.sh aarch64 tests/ci_build/test_python.sh cpu-arm64 diff --git a/tests/buildkite/test-python-cpu.sh b/tests/buildkite/test-python-cpu.sh index 938e1184e..6c53dc282 100755 --- a/tests/buildkite/test-python-cpu.sh +++ b/tests/buildkite/test-python-cpu.sh @@ -13,4 +13,4 @@ chmod +x ./xgboost export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/cpu) set_buildkite_env_vars_in_container -tests/ci_build/ci_build.sh cpu docker tests/ci_build/test_python.sh cpu +tests/ci_build/ci_build.sh cpu tests/ci_build/test_python.sh cpu diff --git a/tests/buildkite/test-python-gpu.sh b/tests/buildkite/test-python-gpu.sh index c2376c021..bb61a980d 100755 --- a/tests/buildkite/test-python-gpu.sh +++ b/tests/buildkite/test-python-gpu.sh @@ -22,7 +22,7 @@ chmod +x build/testxgboost # Allocate extra space in /dev/shm to enable NCCL export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' -command_wrapper="tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg "` +command_wrapper="tests/ci_build/ci_build.sh gpu --use-gpus --build-arg "` `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "` `"NCCL_VERSION_ARG=$NCCL_VERSION" diff --git a/tests/ci_build/build_r_pkg_with_cuda_win64.sh b/tests/ci_build/build_r_pkg_with_cuda_win64.sh index d44a418d1..580358883 100644 --- a/tests/ci_build/build_r_pkg_with_cuda_win64.sh +++ b/tests/ci_build/build_r_pkg_with_cuda_win64.sh @@ -18,7 +18,7 @@ mv xgboost/ xgboost_rpack/ mkdir build cd build -cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DR_LIB=ON -DLIBR_HOME="c:\\Program Files\\R\\R-3.6.3" +cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DR_LIB=ON -DLIBR_HOME="c:\\Program Files\\R\\R-4.3.2" -DCMAKE_PREFIX_PATH="C:\\rtools43\\x86_64-w64-mingw32.static.posix\\bin" cmake --build . --config Release --parallel cd .. @@ -32,5 +32,5 @@ cp -v lib/xgboost.dll xgboost_rpack/src/ echo 'all:' > xgboost_rpack/src/Makefile echo 'all:' > xgboost_rpack/src/Makefile.win mv xgboost_rpack/ xgboost/ -/c/Rtools/bin/tar -cvf xgboost_r_gpu_win64_${commit_hash}.tar xgboost/ -/c/Rtools/bin/gzip -9c xgboost_r_gpu_win64_${commit_hash}.tar > xgboost_r_gpu_win64_${commit_hash}.tar.gz +/c/Rtools43/usr/bin/tar -cvf xgboost_r_gpu_win64_${commit_hash}.tar xgboost/ +/c/Rtools43/usr/bin/gzip -9c xgboost_r_gpu_win64_${commit_hash}.tar > xgboost_r_gpu_win64_${commit_hash}.tar.gz diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh index ef0c69183..a2f2d6063 100755 --- a/tests/ci_build/ci_build.sh +++ b/tests/ci_build/ci_build.sh @@ -2,14 +2,14 @@ # # Execute command within a docker container # -# Usage: ci_build.sh +# Usage: ci_build.sh [--use-gpus] # [--dockerfile ] [-it] # [--build-arg ] # # CONTAINER_TYPE: Type of the docker container used the run the build: e.g., # (cpu | gpu) # -# DOCKER_BINARY: Command to invoke docker, e.g. (docker | nvidia-docker). +# --use-gpus: Whether to grant the container access to NVIDIA GPUs. # # DOCKERFILE_PATH: (Optional) Path to the Dockerfile used for docker build. If # this optional value is not supplied (via the --dockerfile @@ -29,9 +29,12 @@ shift 1 DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile.${CONTAINER_TYPE}" DOCKER_CONTEXT_PATH="${SCRIPT_DIR}" -# Get docker binary command (should be either docker or nvidia-docker) -DOCKER_BINARY="$1" -shift 1 +GPU_FLAG='' +if [[ "$1" == "--use-gpus" ]]; then + echo "Using NVIDIA GPUs" + GPU_FLAG='--gpus all' + shift 1 +fi if [[ "$1" == "--dockerfile" ]]; then DOCKERFILE_PATH="$2" @@ -144,21 +147,21 @@ then DOCKER_CACHE_REPO="${DOCKER_CACHE_ECR_ID}.dkr.ecr.${DOCKER_CACHE_ECR_REGION}.amazonaws.com" echo "Using AWS ECR; repo URL = ${DOCKER_CACHE_REPO}" # Login for Docker registry - echo "\$(aws ecr get-login --no-include-email --region ${DOCKER_CACHE_ECR_REGION} --registry-ids ${DOCKER_CACHE_ECR_ID})" - $(aws ecr get-login --no-include-email --region ${DOCKER_CACHE_ECR_REGION} --registry-ids ${DOCKER_CACHE_ECR_ID}) + echo "aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} | docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO}" + aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} | docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO} # Pull pre-build container from Docker build cache, # if one exists for the particular branch or pull request DOCKER_TAG="${BRANCH_NAME//\//-}" # Slashes are not allow in Docker tag echo "docker pull --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}" if time docker pull --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}" then - CACHE_FROM_CMD="--cache-from ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}" + CACHE_FROM_CMD="--cache-from ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG} --build-arg BUILDKIT_INLINE_CACHE=1" else # If the build cache is empty of the particular branch or pull request, # use the build cache associated with the master branch echo "docker pull --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:master" docker pull --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:master" || true - CACHE_FROM_CMD="--cache-from ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:master" + CACHE_FROM_CMD="--cache-from ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:master --build-arg BUILDKIT_INLINE_CACHE=1" fi else CACHE_FROM_CMD='' @@ -166,11 +169,15 @@ fi echo "docker build \ ${CI_DOCKER_BUILD_ARG} \ + --progress=plain \ + --ulimit nofile=1024000:1024000 \ -t ${DOCKER_IMG_NAME} \ -f ${DOCKERFILE_PATH} ${DOCKER_CONTEXT_PATH} \ ${CACHE_FROM_CMD}" docker build \ ${CI_DOCKER_BUILD_ARG} \ + --progress=plain \ + --ulimit nofile=1024000:1024000 \ -t "${DOCKER_IMG_NAME}" \ -f "${DOCKERFILE_PATH}" "${DOCKER_CONTEXT_PATH}" \ ${CACHE_FROM_CMD} @@ -231,7 +238,8 @@ echo "Running '${COMMAND[*]}' inside ${DOCKER_IMG_NAME}..." # and share the PID namespace (--pid=host) so the process inside does not have # pid 1 and SIGKILL is propagated to the process inside (jenkins can kill it). set -x -${DOCKER_BINARY} run --rm --pid=host \ +docker run --rm --pid=host \ + ${GPU_FLAG} \ -v "${WORKSPACE}":/workspace \ -w /workspace \ ${USER_IDS} \ diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index 7ae8faf03..1d10a48ad 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -165,7 +165,7 @@ TEST(SegmentedUnique, Regression) { } } -TEST(Allocator, OOM) { +TEST(Allocator, DISABLED_OOM) { auto size = dh::AvailableMemory(0) * 4; ASSERT_THROW({dh::caching_device_vector vec(size);}, dmlc::Error); ASSERT_THROW({dh::device_vector vec(size);}, dmlc::Error);