From e888eb2fa9e8033881f222852f58820e167c557c Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Wed, 7 Sep 2022 17:29:25 -0700 Subject: [PATCH] [CI] Migrate CI pipelines from Jenkins to BuildKite (#8142) * [CI] Migrate CI pipelines from Jenkins to BuildKite * Require manual approval * Less verbose output when pulling Docker * Remove us-east-2 from metadata.py * Add documentation * Add missing underscore * Add missing punctuation * More specific instruction * Better paragraph structure --- doc/contrib/ci.rst | 64 +++- tests/buildkite/build-cpu-arm64.sh | 43 +++ tests/buildkite/build-cpu.sh | 33 ++ tests/buildkite/build-cuda-with-rmm.sh | 26 ++ tests/buildkite/build-cuda.sh | 52 +++ tests/buildkite/build-gpu-rpkg.sh | 22 ++ tests/buildkite/build-jvm-doc.sh | 14 + tests/buildkite/build-jvm-packages-gpu.sh | 21 ++ tests/buildkite/build-jvm-packages.sh | 16 + tests/buildkite/build-rpkg-win64-gpu.ps1 | 21 ++ tests/buildkite/build-win64-gpu.ps1 | 58 +++ tests/buildkite/conftest.ps1 | 11 + tests/buildkite/conftest.sh | 22 ++ tests/buildkite/deploy-jvm-packages.sh | 16 + .../aws-stack-creator/create_stack.py | 101 +++++ .../aws-stack-creator/metadata.py | 101 +++++ .../buildkite/infrastructure/requirements.txt | 2 + .../service-user/create_service_user.py | 44 +++ .../service-user/service-user-template.yml | 349 ++++++++++++++++++ .../create_worker_image_pipelines.py | 86 +++++ .../ec2-image-builder-pipeline-template.yml | 108 ++++++ .../linux-amd64-gpu-bootstrap.yml | 26 ++ .../worker-image-pipeline/metadata.py | 18 + .../worker-image-pipeline/run_pipelines.py | 22 ++ .../windows-gpu-bootstrap.yml | 73 ++++ tests/buildkite/pipeline-win64.yml | 22 ++ tests/buildkite/pipeline.yml | 93 +++++ tests/buildkite/run-clang-tidy.sh | 13 + tests/buildkite/test-cpp-gpu.sh | 21 ++ .../test-integration-jvm-packages.sh | 16 + tests/buildkite/test-python-cpu-arm64.sh | 11 + tests/buildkite/test-python-cpu.sh | 14 + tests/buildkite/test-python-gpu.sh | 46 +++ tests/buildkite/test-win64-gpu.ps1 | 37 ++ tests/ci_build/Dockerfile.aarch64 | 4 +- tests/ci_build/Dockerfile.clang_tidy | 4 +- tests/ci_build/Dockerfile.cpu | 4 +- tests/ci_build/Dockerfile.gpu | 4 +- tests/ci_build/Dockerfile.gpu_build | 4 +- tests/ci_build/Dockerfile.gpu_jvm | 8 +- tests/ci_build/Dockerfile.jvm_cross | 8 +- tests/ci_build/Dockerfile.rmm | 4 +- tests/ci_build/Dockerfile.s390x | 2 +- tests/ci_build/ci_build.sh | 20 +- tests/ci_build/insert_vcomp140.py | 1 + 45 files changed, 1639 insertions(+), 46 deletions(-) create mode 100755 tests/buildkite/build-cpu-arm64.sh create mode 100755 tests/buildkite/build-cpu.sh create mode 100755 tests/buildkite/build-cuda-with-rmm.sh create mode 100755 tests/buildkite/build-cuda.sh create mode 100755 tests/buildkite/build-gpu-rpkg.sh create mode 100755 tests/buildkite/build-jvm-doc.sh create mode 100755 tests/buildkite/build-jvm-packages-gpu.sh create mode 100755 tests/buildkite/build-jvm-packages.sh create mode 100644 tests/buildkite/build-rpkg-win64-gpu.ps1 create mode 100644 tests/buildkite/build-win64-gpu.ps1 create mode 100644 tests/buildkite/conftest.ps1 create mode 100755 tests/buildkite/conftest.sh create mode 100755 tests/buildkite/deploy-jvm-packages.sh create mode 100644 tests/buildkite/infrastructure/aws-stack-creator/create_stack.py create mode 100644 tests/buildkite/infrastructure/aws-stack-creator/metadata.py create mode 100644 tests/buildkite/infrastructure/requirements.txt create mode 100644 tests/buildkite/infrastructure/service-user/create_service_user.py create mode 100644 
tests/buildkite/infrastructure/service-user/service-user-template.yml
 create mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py
 create mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml
 create mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml
 create mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/metadata.py
 create mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py
 create mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml
 create mode 100644 tests/buildkite/pipeline-win64.yml
 create mode 100644 tests/buildkite/pipeline.yml
 create mode 100755 tests/buildkite/run-clang-tidy.sh
 create mode 100755 tests/buildkite/test-cpp-gpu.sh
 create mode 100755 tests/buildkite/test-integration-jvm-packages.sh
 create mode 100755 tests/buildkite/test-python-cpu-arm64.sh
 create mode 100755 tests/buildkite/test-python-cpu.sh
 create mode 100755 tests/buildkite/test-python-gpu.sh
 create mode 100644 tests/buildkite/test-win64-gpu.ps1

diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst
index d117b90e2..abb1cd73f 100644
--- a/doc/contrib/ci.rst
+++ b/doc/contrib/ci.rst
@@ -39,24 +39,58 @@ task of cross-compiling a Python wheel. (Note that ``cibuildwheel`` will call
 a glue code to call CMake and a C++ compiler to build the native library on the fly.)
 
 *******************************
-Reproducing errors from Jenkins
+Elastic CI Stack with BuildKite
 *******************************
 
-It is often useful to reproduce the particular testing environment from our Jenkins server for
-the purpose of troubleshooting a failing test. We use Docker containers heavily to package
-the testing environment, so you can use Docker to reproduce it on your own machine.
+`BuildKite <https://buildkite.com/>`_ is a SaaS (Software as a Service) platform that orchestrates
+cloud machines to host CI pipelines. The BuildKite platform allows us to define cloud resources in
+a declarative fashion. Every configuration step is now documented explicitly as code.
 
-1. Install Docker: https://docs.docker.com/engine/install/ubuntu/
-2. Install NVIDIA Docker runtime: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian
-   The runtime lets you access NVIDIA GPUs inside a Docker container.
-3. In a build log, all tests are invoked via the wrapper script ``tests/ci_build/ci_build.sh``.
-   Identify the test you'd like to reproduce locally, and note how the wrapper script was invoked for that test.
-   The invocation should look like this:
+**Prerequisite**: You should have some knowledge of `CloudFormation <https://aws.amazon.com/cloudformation/>`_.
+CloudFormation lets us define a stack of cloud resources (EC2 machines, Lambda functions, S3, etc.) using
+a single YAML file.
 
-.. code-block:: bash
+**Prerequisite**: Gain access to the XGBoost project's AWS account (``admin@xgboost-ci.net``), and then
+set up a credential pair in order to provision resources on AWS. See
+`Creating an IAM user in your AWS account <https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html>`_.
 
-   CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' tests/ci_build/ci_build.sh gpu nvidia-docker \
-     --build-arg CUDA_VERSION_ARG=11.0 tests/ci_build/test_python.sh mgpu --use-rmm-pool
+* Option 1. Give full admin privileges to your IAM user. This is the simplest option.
+* Option 2. Give a limited set of permissions to your IAM user, to reduce the possibility of messing up other resources.
+  For this, use the script ``tests/buildkite/infrastructure/service-user/create_service_user.py``.
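+
+For example, the following one-shot invocation creates the service user (a sketch: ``us-west-2`` is
+just an example region, and ``--user-name`` defaults to ``buildkite-elastic-ci-stack-user``):
+
+.. code-block:: bash
+
+   python3 tests/buildkite/infrastructure/service-user/create_service_user.py \
+     --aws-region us-west-2
+
+The script prints an access key pair for the newly created user; store it in a safe place.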
-4. You can now run the same command on your own machine. The wrapper script will automatically download and
-   set up the correct Docker container(s).
+=====================
+Worker Image Pipeline
+=====================
+
+Building images for worker machines used to be a chore: you'd provision an EC2 machine, SSH into it, and
+manually install the necessary packages. This process is not only laborious but also error-prone. You may
+forget to install a package or change a system configuration.
+
+No more. Now we have an automated pipeline for building images for worker machines.
+
+* Run ``tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py`` in order to provision
+  CloudFormation stacks named ``buildkite-linux-amd64-gpu-worker`` and ``buildkite-windows-gpu-worker``. They are
+  pipelines that create AMIs (Amazon Machine Images) for the Linux and Windows workers, respectively.
+* Navigate to the CloudFormation web console to verify that the image builder pipelines have been provisioned. It may
+  take some time.
+* Once the pipelines have been fully provisioned, run the script
+  ``tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py`` to execute the pipelines. New AMIs will be
+  uploaded to the EC2 service. You can locate them in the EC2 console. (See the combined example invocation at the
+  end of this page.)
+* Make sure to modify ``tests/buildkite/infrastructure/aws-stack-creator/metadata.py`` to use the correct AMI IDs.
+  (For ``linux-amd64-cpu`` and ``linux-arm64-cpu``, use the AMIs provided by BuildKite. Consult the ``AWSRegion2AMI``
+  section of https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml.)
+
+======================
+EC2 Autoscaling Groups
+======================
+
+In EC2, you can create auto-scaling groups, which dynamically adjust the number of worker instances according to the
+workload. When a pull request is submitted, the following steps take place:
+
+1. GitHub sends a signal to the registered webhook, which connects to the BuildKite server.
+2. BuildKite sends a signal to a `Lambda <https://aws.amazon.com/lambda/>`_ function named ``Autoscaling``.
+3. The Lambda function sends a signal to the auto-scaling group. The group scales up and adds additional worker instances.
+4. New worker instances run the test jobs. Test results are reported back to BuildKite.
+5. When the test jobs complete, BuildKite sends a signal to ``Autoscaling``, which in turn requests the auto-scaling
+   group to scale down. Idle worker instances are shut down.
+
+To set up the auto-scaling groups, run the script ``tests/buildkite/infrastructure/aws-stack-creator/create_stack.py``.
+Check the CloudFormation web console to verify that the auto-scaling groups have been provisioned successfully.
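+
+As a rough end-to-end sketch (assuming ``us-west-2``, the region used throughout ``metadata.py``; the
+agent token value is a placeholder), the provisioning sequence described above would look like:
+
+.. code-block:: bash
+
+   # One-time setup: provision and run the worker image pipelines
+   python3 tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py \
+     --aws-region us-west-2
+   python3 tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py \
+     --aws-region us-west-2
+   # After updating metadata.py with the new AMI IDs, create the auto-scaling groups
+   python3 tests/buildkite/infrastructure/aws-stack-creator/create_stack.py \
+     --aws-region us-west-2 --agent-token <BUILDKITE_AGENT_TOKEN>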
\ No newline at end of file
diff --git a/tests/buildkite/build-cpu-arm64.sh b/tests/buildkite/build-cpu-arm64.sh
new file mode 100755
index 000000000..1a95a880a
--- /dev/null
+++ b/tests/buildkite/build-cpu-arm64.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+set -euo pipefail
+
+WHEEL_TAG=manylinux2014_aarch64
+
+echo "--- Build CPU code targeting ARM64"
+
+source tests/buildkite/conftest.sh
+
+command_wrapper="tests/ci_build/ci_build.sh aarch64 docker"
+
+echo "--- Build libxgboost from the source"
+$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=aarch64_test \
+  -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON
+echo "--- Run Google Test"
+$command_wrapper bash -c "cd build && ctest --extra-verbose"
+
+echo "--- Build binary wheel"
+$command_wrapper bash -c \
+  "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
+$command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \
+  ${BUILDKITE_COMMIT} ${WHEEL_TAG}
+
+echo "--- Audit binary wheel to ensure it's compliant with manylinux2014 standard"
+$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl
+$command_wrapper python tests/ci_build/rename_whl.py wheelhouse/*.whl \
+  ${BUILDKITE_COMMIT} ${WHEEL_TAG}
+mv -v wheelhouse/*.whl python-package/dist/
+# Make sure that libgomp.so is vendored in the wheel
+$command_wrapper bash -c \
+  "unzip -l python-package/dist/*.whl | grep libgomp || exit -1"
+
+echo "--- Upload Python wheel"
+buildkite-agent artifact upload "python-package/dist/*.whl"
+if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
+then
+  aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \
+    --acl public-read --no-progress
+fi
+
+echo "--- Stash XGBoost CLI executable"
+buildkite-agent artifact upload ./xgboost
diff --git a/tests/buildkite/build-cpu.sh b/tests/buildkite/build-cpu.sh
new file mode 100755
index 000000000..60c84c52c
--- /dev/null
+++ b/tests/buildkite/build-cpu.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+set -euo pipefail
+
+echo "--- Build CPU code"
+
+source tests/buildkite/conftest.sh
+
+command_wrapper="tests/ci_build/ci_build.sh cpu docker"
+
+$command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h
+  # This step is not necessary, but we include it here to ensure that the
+  # DMLC_CORE_USE_CMAKE flag is correctly propagated. We want to make sure that we use
+  # the configured header build/dmlc/build_config.h instead of
+  # include/dmlc/build_config_default.h.
+echo "--- Build libxgboost from the source" +$command_wrapper tests/ci_build/build_via_cmake.sh -DPLUGIN_DENSE_PARSER=ON +echo "--- Run Google Test" +$command_wrapper bash -c "cd build && ctest --extra-verbose" +echo "--- Stash XGBoost CLI executable" +buildkite-agent artifact upload ./xgboost + +# Sanitizer test +echo "--- Run Google Test with sanitizer enabled" +$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_SANITIZER=ON \ + -DENABLED_SANITIZERS="address;leak;undefined" -DCMAKE_BUILD_TYPE=Debug \ + -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ +CI_DOCKER_EXTRA_PARAMS_INIT="-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer "` + `"-e ASAN_OPTIONS=symbolize=1 "` + `"-e UBSAN_OPTIONS=print_stacktrace=1:log_path=ubsan_error.log "` + `"--cap-add SYS_PTRACE" \ + $command_wrapper bash -c "cd build && ctest --exclude-regex AllTestsInDMLCUnitTests "` + `"--extra-verbose" diff --git a/tests/buildkite/build-cuda-with-rmm.sh b/tests/buildkite/build-cuda-with-rmm.sh new file mode 100755 index 000000000..63bb903a6 --- /dev/null +++ b/tests/buildkite/build-cuda-with-rmm.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -euo pipefail + +CUDA_VERSION=11.0.3 + +source tests/buildkite/conftest.sh + +echo "--- Build with CUDA ${CUDA_VERSION}, RMM enabled" + +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then + arch_flag="-DGPU_COMPUTE_VER=75" +else + arch_flag="" +fi + +command_wrapper="tests/ci_build/ci_build.sh rmm docker --build-arg "` + `"CUDA_VERSION_ARG=$CUDA_VERSION" + +echo "--- Build libxgboost from the source" +$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON \ + -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DBUILD_WITH_CUDA_CUB=ON ${arch_flag} + +echo "-- Stash C++ test executable (testxgboost)" +buildkite-agent artifact upload build/testxgboost diff --git a/tests/buildkite/build-cuda.sh b/tests/buildkite/build-cuda.sh new file mode 100755 index 000000000..e7b2c1954 --- /dev/null +++ b/tests/buildkite/build-cuda.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +set -euo pipefail + +CUDA_VERSION=11.0.3 +WHEEL_TAG=manylinux2014_x86_64 + +echo "--- Build with CUDA ${CUDA_VERSION}" + +source tests/buildkite/conftest.sh + +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then + arch_flag="-DGPU_COMPUTE_VER=75" +else + arch_flag="" +fi + +command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "` + `"CUDA_VERSION_ARG=$CUDA_VERSION" + +echo "--- Build libxgboost from the source" +$command_wrapper tests/ci_build/prune_libnccl.sh +$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON \ + -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include -DNCCL_LIBRARY=/workspace/libnccl_static.a \ + ${arch_flag} +echo "--- Build binary wheel" +$command_wrapper bash -c \ + "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal" +$command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \ + ${BUILDKITE_COMMIT} ${WHEEL_TAG} + +echo "--- Audit binary wheel to ensure it's compliant with manylinux2014 standard" +tests/ci_build/ci_build.sh auditwheel_x86_64 docker auditwheel repair \ + --plat ${WHEEL_TAG} python-package/dist/*.whl +$command_wrapper python tests/ci_build/rename_whl.py wheelhouse/*.whl \ + ${BUILDKITE_COMMIT} ${WHEEL_TAG} +mv -v wheelhouse/*.whl python-package/dist/ +# Make sure that libgomp.so is vendored in the wheel +tests/ci_build/ci_build.sh auditwheel_x86_64 docker bash -c \ + "unzip -l python-package/dist/*.whl | grep libgomp || 
exit -1" + +echo "--- Upload Python wheel" +buildkite-agent artifact upload python-package/dist/*.whl +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ + --acl public-read --no-progress +fi +echo "-- Stash C++ test executable (testxgboost)" +buildkite-agent artifact upload build/testxgboost diff --git a/tests/buildkite/build-gpu-rpkg.sh b/tests/buildkite/build-gpu-rpkg.sh new file mode 100755 index 000000000..151d97940 --- /dev/null +++ b/tests/buildkite/build-gpu-rpkg.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -euo pipefail + +CUDA_VERSION=11.0.3 + +source tests/buildkite/conftest.sh + +echo "--- Build XGBoost R package with CUDA" + +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + +tests/ci_build/ci_build.sh gpu_build_r_centos7 docker \ + --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} tests/ci_build/build_r_pkg_with_cuda.sh \ + ${BUILDKITE_COMMIT} + +echo "--- Upload R tarball" +aws s3 cp xgboost_r_gpu_linux_*.tar.gz s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ + --acl public-read --no-progress + +fi diff --git a/tests/buildkite/build-jvm-doc.sh b/tests/buildkite/build-jvm-doc.sh new file mode 100755 index 000000000..a2d658e48 --- /dev/null +++ b/tests/buildkite/build-jvm-doc.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -euo pipefail + +source tests/buildkite/conftest.sh + +echo "--- Build JVM packages doc" +tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_doc.sh ${BRANCH_NAME} +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + echo "--- Upload JVM packages doc" + aws s3 cp jvm-packages/${BRANCH_NAME}.tar.bz2 \ + s3://xgboost-docs/${BRANCH_NAME}.tar.bz2 --acl public-read --no-progress +fi diff --git a/tests/buildkite/build-jvm-packages-gpu.sh b/tests/buildkite/build-jvm-packages-gpu.sh new file mode 100755 index 000000000..13fc05279 --- /dev/null +++ b/tests/buildkite/build-jvm-packages-gpu.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -euo pipefail + +SPARK_VERSION=3.0.1 +CUDA_VERSION=11.0.3 + +source tests/buildkite/conftest.sh + +echo "--- Build XGBoost JVM packages with CUDA" + +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then + arch_flag="-DGPU_COMPUTE_VER=75" +else + arch_flag="" +fi + +tests/ci_build/ci_build.sh jvm_gpu_build nvidia-docker \ + --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} tests/ci_build/build_jvm_packages.sh \ + ${SPARK_VERSION} -Duse.cuda=ON ${arch_flag} diff --git a/tests/buildkite/build-jvm-packages.sh b/tests/buildkite/build-jvm-packages.sh new file mode 100755 index 000000000..d9fe6feef --- /dev/null +++ b/tests/buildkite/build-jvm-packages.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -euo pipefail + +SPARK_VERSION=3.0.1 + +source tests/buildkite/conftest.sh + +echo "--- Build XGBoost JVM packages" +tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \ + ${SPARK_VERSION} + +echo "--- Stash XGBoost4J JARs" +buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar" +buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar" +buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar" diff --git a/tests/buildkite/build-rpkg-win64-gpu.ps1 b/tests/buildkite/build-rpkg-win64-gpu.ps1 new file mode 100644 index 000000000..c5db18480 --- /dev/null +++ b/tests/buildkite/build-rpkg-win64-gpu.ps1 @@ -0,0 +1,21 @@ +$ErrorActionPreference = "Stop" + +. 
tests/buildkite/conftest.ps1 + +Write-Host "--- Build XGBoost R package with CUDA" + +nvcc --version +if ( $is_release_branch -eq 1 ) { + $arch_flag = "-DGPU_COMPUTE_VER=75" + + bash tests/ci_build/build_r_pkg_with_cuda_win64.sh $Env:BUILDKITE_COMMIT + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + + Write-Host "--- Upload R tarball" + Get-ChildItem . -Filter xgboost_r_gpu_win64_*.tar.gz | + Foreach-Object { + & aws s3 cp $_ s3://xgboost-nightly-builds/$Env:BUILDKITE_BRANCH/ ` + --acl public-read --no-progress + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + } +} diff --git a/tests/buildkite/build-win64-gpu.ps1 b/tests/buildkite/build-win64-gpu.ps1 new file mode 100644 index 000000000..6ee723abb --- /dev/null +++ b/tests/buildkite/build-win64-gpu.ps1 @@ -0,0 +1,58 @@ +$ErrorActionPreference = "Stop" + +. tests/buildkite/conftest.ps1 + +Write-Host "--- Build libxgboost on Windows with CUDA" + +nvcc --version +if ( $is_release_branch -eq 0 ) { + $arch_flag = "-DGPU_COMPUTE_VER=75" +} else { + $arch_flag = "" +} +mkdir build +cd build +cmake .. -G"Visual Studio 15 2017 Win64" -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON ` + -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_UNITY_BUILD=ON ${arch_flag} +$msbuild = -join @( + "C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\Community\\MSBuild\\15.0" + "\\Bin\\MSBuild.exe" +) +& $msbuild xgboost.sln /m /p:Configuration=Release /nodeReuse:false +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Build binary wheel" +cd ../python-package +conda activate +& python setup.py bdist_wheel --universal +Get-ChildItem . -Filter dist/*.whl | +Foreach-Object { + & python ../tests/ci_build/rename_whl.py $_.FullName $Env:BUILDKITE_COMMIT win_amd64 + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +} + +Write-Host "--- Insert vcomp140.dll (OpenMP runtime) into the wheel" +cd dist +Copy-Item -Path ../../tests/ci_build/insert_vcomp140.py -Destination . +& python insert_vcomp140.py *.whl +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Upload Python wheel" +cd ../.. +Get-ChildItem . -Filter python-package/dist/*.whl | +Foreach-Object { + & buildkite-agent artifact upload python-package/dist/$_ + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +} +if ( $is_release_branch -eq 1 ) { + Get-ChildItem . 
-Filter python-package/dist/*.whl | + Foreach-Object { + & aws s3 cp python-package/dist/$_ s3://xgboost-nightly-builds/$Env:BUILDKITE_BRANCH/ ` + --acl public-read --no-progress + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + } +} + +Write-Host "--- Stash C++ test executables" +& buildkite-agent artifact upload build/testxgboost.exe +& buildkite-agent artifact upload xgboost.exe diff --git a/tests/buildkite/conftest.ps1 b/tests/buildkite/conftest.ps1 new file mode 100644 index 000000000..c8dd5f1d1 --- /dev/null +++ b/tests/buildkite/conftest.ps1 @@ -0,0 +1,11 @@ +if ( $Env:BUILDKITE_PULL_REQUEST -and ($Env:BUILDKITE_PULL_REQUEST -ne "false") ) { + $is_pull_request = 1 +} else { + $is_pull_request = 0 +} + +if ( ($Env:BUILDKITE_BRANCH -eq "master") -or ($Env:BUILDKITE_BRANCH -match "release_.+") ) { + $is_release_branch = 1 +} else { + $is_release_branch = 0 +} diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh new file mode 100755 index 000000000..40409fc50 --- /dev/null +++ b/tests/buildkite/conftest.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -euo pipefail +set -x + +if [[ -n $BUILDKITE_PULL_REQUEST && $BUILDKITE_PULL_REQUEST != "false" ]] +then + is_pull_request=1 + export BRANCH_NAME=PR-$BUILDKITE_PULL_REQUEST +else + is_pull_request=0 + export BRANCH_NAME=$BUILDKITE_BRANCH +fi + +if [[ $BUILDKITE_BRANCH == "master" || $BUILDKITE_BRANCH == "release_"* ]] +then + is_release_branch=1 +else + is_release_branch=0 +fi + +set +x diff --git a/tests/buildkite/deploy-jvm-packages.sh b/tests/buildkite/deploy-jvm-packages.sh new file mode 100755 index 000000000..dd4037704 --- /dev/null +++ b/tests/buildkite/deploy-jvm-packages.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -euo pipefail + +SPARK_VERSION=3.0.1 +CUDA_VERSION=11.0.3 + +source tests/buildkite/conftest.sh + +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo" + tests/ci_build/ci_build.sh jvm_gpu_build docker \ + --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ + tests/ci_build/deploy_jvm_packages.sh ${SPARK_VERSION} +fi diff --git a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py b/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py new file mode 100644 index 000000000..f4e92f634 --- /dev/null +++ b/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py @@ -0,0 +1,101 @@ +import argparse +import copy + +import boto3 + +from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS + +TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml" + + +def get_availability_zones(*, aws_region): + client = boto3.client("ec2", region_name=aws_region) + r = client.describe_availability_zones( + Filters=[ + {"Name": "region-name", "Values": [aws_region]}, + {"Name": "zone-type", "Values": ["availability-zone"]}, + ] + ) + return sorted([x["ZoneName"] for x in r["AvailabilityZones"]]) + + +def get_default_vpc(*, aws_region): + ec2 = boto3.resource("ec2", region_name=aws_region) + default_vpc_id = None + for x in ec2.vpcs.filter(Filters=[{"Name": "is-default", "Values": ["true"]}]): + return x + + # Create default VPC if not exist + client = boto3.client("ec2", region_name=aws_region) + r = client.create_default_vpc() + default_vpc_id = r["Vpc"]["VpcId"] + + return ec2.Vpc(default_vpc_id) + + +def format_params(args, *, stack_id): + default_vpc = get_default_vpc(aws_region=args.aws_region) + azs = get_availability_zones(aws_region=args.aws_region) + # For each of the first two 
availability zones (AZs), choose the default subnet + subnets = [ + x.id + for x in default_vpc.subnets.filter( + Filters=[ + {"Name": "default-for-az", "Values": ["true"]}, + {"Name": "availability-zone", "Values": azs[:2]}, + ] + ) + ] + assert len(subnets) == 2 + + params = copy.deepcopy(STACK_PARAMS[stack_id]) + params["ImageId"] = AMI_ID[stack_id][args.aws_region] + params["BuildkiteQueue"] = stack_id + params["CostAllocationTagValue"] = f"buildkite-{stack_id}" + params["BuildkiteAgentToken"] = args.agent_token + params["VpcId"] = default_vpc.id + params["Subnets"] = ",".join(subnets) + params.update(COMMON_STACK_PARAMS) + return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()] + + +def get_full_stack_id(stack_id): + return f"buildkite-{stack_id}-autoscaling-group" + + +def main(args): + client = boto3.client("cloudformation", region_name=args.aws_region) + + for stack_id in AMI_ID: + stack_id_full = get_full_stack_id(stack_id) + print(f"Creating elastic CI stack {stack_id_full}...") + + params = format_params(args, stack_id=stack_id) + + response = client.create_stack( + StackName=stack_id_full, + TemplateURL=TEMPLATE_URL, + Capabilities=[ + "CAPABILITY_IAM", + "CAPABILITY_NAMED_IAM", + "CAPABILITY_AUTO_EXPAND", + ], + OnFailure="ROLLBACK", + EnableTerminationProtection=False, + Parameters=params, + ) + print(f"CI stack {stack_id_full} is in progress in the background") + + for stack_id in AMI_ID: + stack_id_full = get_full_stack_id(stack_id) + waiter = client.get_waiter("stack_create_complete") + waiter.wait(StackName=stack_id_full) + print(f"CI stack {stack_id_full} is now finished.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--aws-region", type=str, required=True) + parser.add_argument("--agent-token", type=str, required=True) + args = parser.parse_args() + main(args) diff --git a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py new file mode 100644 index 000000000..8906309d1 --- /dev/null +++ b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py @@ -0,0 +1,101 @@ +AMI_ID = { + # Managed by XGBoost team + "linux-amd64-gpu": { + "us-west-2": "ami-00ed92bd37f77bc33", + }, + "linux-amd64-mgpu": { + "us-west-2": "ami-00ed92bd37f77bc33", + }, + "windows-gpu": { + "us-west-2": "ami-0a1a2ea551a07ad5f", + }, + # Managed by BuildKite + "linux-amd64-cpu": { + "us-west-2": "ami-075d4c25d5f0c17c1", + }, + "pipeline-loader": { + "us-west-2": "ami-075d4c25d5f0c17c1", + }, + "linux-arm64-cpu": { + "us-west-2": "ami-0952c6fb6db9a9891", + }, +} + +STACK_PARAMS = { + "linux-amd64-gpu": { + "InstanceOperatingSystem": "linux", + "InstanceType": "g4dn.xlarge", + "AgentsPerInstance": "1", + "MinSize": "0", + "MaxSize": "8", + "OnDemandPercentage": "100", + "ScaleOutFactor": "1.0", + "ScaleInIdlePeriod": "60", # in seconds + }, + "linux-amd64-mgpu": { + "InstanceOperatingSystem": "linux", + "InstanceType": "g4dn.12xlarge", + "AgentsPerInstance": "1", + "MinSize": "0", + "MaxSize": "4", + "OnDemandPercentage": "100", + "ScaleOutFactor": "1.0", + "ScaleInIdlePeriod": "60", # in seconds + }, + "windows-gpu": { + "InstanceOperatingSystem": "windows", + "InstanceType": "g4dn.2xlarge", + "AgentsPerInstance": "1", + "MinSize": "0", + "MaxSize": "2", + "OnDemandPercentage": "100", + "ScaleOutFactor": "1.0", + "ScaleInIdlePeriod": "600", # in seconds + }, + "linux-amd64-cpu": { + "InstanceOperatingSystem": "linux", + "InstanceType": "c5a.4xlarge", + 
"AgentsPerInstance": "1", + "MinSize": "0", + "MaxSize": "16", + "OnDemandPercentage": "100", + "ScaleOutFactor": "1.0", + "ScaleInIdlePeriod": "60", # in seconds + }, + "pipeline-loader": { + "InstanceOperatingSystem": "linux", + "InstanceType": "t3a.micro", + "AgentsPerInstance": "1", + "MinSize": "1", + "MaxSize": "1", + "OnDemandPercentage": "100", + "ScaleOutFactor": "1.0", + "ScaleInIdlePeriod": "60", # in seconds + }, + "linux-arm64-cpu": { + "InstanceOperatingSystem": "linux", + "InstanceType": "c6g.4xlarge", + "AgentsPerInstance": "1", + "MinSize": "0", + "MaxSize": "8", + "OnDemandPercentage": "100", + "ScaleOutFactor": "1.0", + "ScaleInIdlePeriod": "60", # in seconds + }, +} + +COMMON_STACK_PARAMS = { + "BuildkiteAgentTimestampLines": "false", + "BuildkiteWindowsAdministrator": "true", + "AssociatePublicIpAddress": "true", + "ScaleOutForWaitingJobs": "false", + "EnableCostAllocationTags": "true", + "CostAllocationTagName": "CreatedBy", + "ECRAccessPolicy": "full", + "ManagedPolicyARN": "arn:aws:iam::aws:policy/AmazonS3FullAccess", + "EnableSecretsPlugin": "false", + "EnableECRPlugin": "false", + "EnableDockerLoginPlugin": "false", + "EnableDockerUserNamespaceRemap": "false", + "BuildkiteAgentExperiments": "normalised-upload-paths,resolve-commit-after-checkout", +} diff --git a/tests/buildkite/infrastructure/requirements.txt b/tests/buildkite/infrastructure/requirements.txt new file mode 100644 index 000000000..3ce271ebb --- /dev/null +++ b/tests/buildkite/infrastructure/requirements.txt @@ -0,0 +1,2 @@ +boto3 +cfn_tools diff --git a/tests/buildkite/infrastructure/service-user/create_service_user.py b/tests/buildkite/infrastructure/service-user/create_service_user.py new file mode 100644 index 000000000..ba08779bd --- /dev/null +++ b/tests/buildkite/infrastructure/service-user/create_service_user.py @@ -0,0 +1,44 @@ +import argparse +import os + +import boto3 + +current_dir = os.path.dirname(__file__) + + +def main(args): + with open( + os.path.join(current_dir, "service-user-template.yml"), encoding="utf-8" + ) as f: + service_user_template = f.read() + + stack_id = "buildkite-elastic-ci-stack-service-user" + + print("Create a new IAM user with suitable permissions...") + client = boto3.client("cloudformation", region_name=args.aws_region) + response = client.create_stack( + StackName=stack_id, + TemplateBody=service_user_template, + Capabilities=[ + "CAPABILITY_IAM", + "CAPABILITY_NAMED_IAM", + ], + Parameters=[{"ParameterKey": "UserName", "ParameterValue": args.user_name}], + ) + waiter = client.get_waiter("stack_create_complete") + waiter.wait(StackName=stack_id) + user = boto3.resource("iam", region_name=args.aws_region).User(args.user_name) + key_pair = user.create_access_key_pair() + print("Finished creating an IAM users with suitable permissions.") + print(f"Access Key ID: {key_pair.access_key_id}") + print(f"Access Secret Access Key: {key_pair.secret_access_key}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--aws-region", type=str, required=True) + parser.add_argument( + "--user-name", type=str, default="buildkite-elastic-ci-stack-user" + ) + args = parser.parse_args() + main(args) diff --git a/tests/buildkite/infrastructure/service-user/service-user-template.yml b/tests/buildkite/infrastructure/service-user/service-user-template.yml new file mode 100644 index 000000000..2077cfe7b --- /dev/null +++ b/tests/buildkite/infrastructure/service-user/service-user-template.yml @@ -0,0 +1,349 @@ +--- +AWSTemplateFormatVersion: 
"2010-09-09" +Description: "Buildkite Elastic CI Stack CloudFormation service user" + +Parameters: + UserName: + Type: String + Default: buildkite-elastic-ci-stack-user + Description: Name of user to create + +Outputs: + UserNameOutput: + Value: !Ref CloudFormationServiceUser + UserArnOutput: + Value: !GetAtt CloudFormationServiceUser.Arn + +Resources: + CloudFormationServiceUser: + Type: AWS::IAM::User + Properties: + ManagedPolicyArns: + - !Ref SubstackCrudPolicy + - !Ref CrudPolicy + - !Ref ImageBuilderPolicy + UserName: !Ref UserName + + SubstackCrudPolicy: + Type: AWS::IAM::ManagedPolicy + Properties: + PolicyDocument: + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "cloudformation:*", + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "serverlessrepo:GetApplication", + "serverlessrepo:GetCloudFormationTemplate", + "serverlessrepo:CreateCloudFormationTemplate" + ], + "Resource": "*" + } + ] + } + + CrudPolicy: + Type: AWS::IAM::ManagedPolicy + Properties: + PolicyDocument: + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "ec2:DescribeAccountAttributes", + "ec2:DescribeAvailabilityZones", + "ec2:DescribeInstances", + "ec2:DescribeInternetGateways", + "ec2:DescribeLaunchTemplateVersions", + "ec2:DescribeLaunchTemplates", + "ec2:DescribeNetworkInterfaces", + "ec2:DescribeRouteTables", + "ec2:DescribeSecurityGroups", + "ec2:DescribeSubnets", + "ec2:DescribeVpcs", + "ec2:CreateTags" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:CreateInternetGateway", + "ec2:AttachInternetGateway", + "ec2:DetachInternetGateway", + "ec2:DeleteInternetGateway" + ], + "Resource": "arn:aws:ec2:*:*:internet-gateway/*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:CreateLaunchTemplate", + "ec2:CreateLaunchTemplateVersion", + "ec2:DeleteLaunchTemplate" + ], + "Resource": "arn:aws:ec2:*:*:launch-template/*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:AssociateRouteTable", + "ec2:DisassociateRouteTable", + "ec2:CreateRoute", + "ec2:CreateRouteTable", + "ec2:DeleteRoute", + "ec2:DeleteRouteTable" + ], + "Resource": "arn:aws:ec2:*:*:route-table/*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:AuthorizeSecurityGroupIngress", + "ec2:RevokeSecurityGroupIngress", + "ec2:CreateSecurityGroup", + "ec2:DeleteSecurityGroup" + ], + "Resource": "arn:aws:ec2:*:*:security-group/*" + }, + { + "Effect": "Allow", + "Action": "ec2:RunInstances", + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:CreateSubnet", + "ec2:DeleteSubnet", + "ec2:AssociateRouteTable", + "ec2:DisassociateRouteTable" + ], + "Resource": "arn:aws:ec2:*:*:subnet/*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:CreateVpc", + "ec2:CreateSecurityGroup", + "ec2:ModifyVpcAttribute", + "ec2:AttachInternetGateway", + "ec2:DetachInternetGateway", + "ec2:CreateSubnet", + "ec2:CreateRouteTable", + "ec2:DeleteVpc" + ], + "Resource": "arn:aws:ec2:*:*:vpc/*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:CreateDefaultVpc", + "ec2:CreateDefaultSubnet" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "iam:CreateInstanceProfile", + "iam:GetInstanceProfile", + "iam:AddRoleToInstanceProfile", + "iam:RemoveRoleFromInstanceProfile", + "iam:DeleteInstanceProfile" + ], + "Resource": "arn:aws:iam::*:instance-profile/*" + }, + { + "Effect": "Allow", + "Action": [ + "kms:DescribeKey", + "kms:CreateGrant", + "kms:Decrypt", + "kms:Encrypt" + ], + "Resource": "arn:aws:kms:*:*:key/*" + }, + { + "Effect": "Allow", 
+ "Action": [ + "lambda:CreateFunction", + "lambda:GetFunction", + "lambda:GetFunctionCodeSigningConfig", + "lambda:AddPermission", + "lambda:RemovePermission", + "lambda:DeleteFunction", + "lambda:InvokeFunction", + "lambda:TagResource" + ], + "Resource": "arn:aws:lambda:*:*:function:*" + }, + { + "Effect": "Allow", + "Action": [ + "logs:CreateLogGroup", + "logs:PutRetentionPolicy", + "logs:DeleteLogGroup" + ], + "Resource": "arn:aws:logs:*:*:log-group:*" + }, + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:CreateBucket", + "s3:PutBucketAcl", + "s3:PutBucketLogging", + "s3:PutBucketTagging", + "s3:PutBucketVersioning" + ], + "Resource": "arn:aws:s3:::*" + }, + { + "Effect": "Allow", + "Action": [ + "ssm:GetParameter", + "ssm:PutParameter", + "ssm:DeleteParameter" + ], + "Resource": "arn:aws:ssm:*:*:parameter/*" + }, + { + "Effect": "Allow", + "Action": [ + "iam:ListPolicies", + "iam:ListInstanceProfiles", + "iam:ListRoles", + "iam:ListPolicyVersions", + "iam:ListRolePolicies", + "iam:ListAttachedRolePolicies", + "iam:ListInstanceProfileTags", + "iam:ListRoleTags", + "iam:ListInstanceProfilesForRole", + "iam:GetPolicyVersion", + "iam:GetPolicy", + "iam:GetInstanceProfile", + "iam:GetRole", + "iam:GetRolePolicy", + "iam:TagPolicy", + "iam:UntagPolicy", + "iam:TagInstanceProfile", + "iam:UntagInstanceProfile", + "iam:TagRole", + "iam:UntagRole", + "iam:CreateRole", + "iam:PassRole", + "iam:DeleteRole", + "iam:UpdateRoleDescription", + "iam:UpdateRole", + "iam:AddRoleToInstanceProfile", + "iam:RemoveRoleFromInstanceProfile", + "iam:CreateInstanceProfile", + "iam:DeleteInstanceProfile", + "iam:DetachRolePolicy", + "iam:SetDefaultPolicyVersion", + "iam:AttachRolePolicy", + "iam:UpdateAssumeRolePolicy", + "iam:PutRolePermissionsBoundary", + "iam:DeleteRolePermissionsBoundary", + "iam:CreatePolicy", + "iam:DeletePolicyVersion", + "iam:DeletePolicy", + "iam:PutRolePolicy", + "iam:DeleteRolePolicy" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "autoscaling:DescribeLifecycleHookTypes", + "autoscaling:DescribeTerminationPolicyTypes", + "autoscaling:DescribePolicies", + "autoscaling:DescribeWarmPool", + "autoscaling:DescribeScalingActivities", + "autoscaling:DescribeScalingProcessTypes", + "autoscaling:DescribeScheduledActions", + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:DescribeAutoScalingInstances", + "autoscaling:DescribeLifecycleHooks", + "autoscaling:SetDesiredCapacity", + "autoscaling:PutLifecycleHook", + "autoscaling:DeleteLifecycleHook", + "autoscaling:SetInstanceProtection", + "autoscaling:CreateAutoScalingGroup", + "autoscaling:EnableMetricsCollection", + "autoscaling:UpdateAutoScalingGroup", + "autoscaling:DeleteAutoScalingGroup", + "autoscaling:PutScalingPolicy", + "autoscaling:DeletePolicy", + "autoscaling:BatchPutScheduledUpdateGroupAction", + "autoscaling:PutScheduledUpdateGroupAction", + "autoscaling:DeleteScheduledAction", + "autoscaling:PutWarmPool", + "autoscaling:DeleteWarmPool", + "autoscaling:TerminateInstanceInAutoScalingGroup", + "autoscaling:AttachInstances" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "events:DescribeRule", + "events:PutRule", + "events:PutTargets", + "events:RemoveTargets", + "events:DeleteRule" + ], + "Resource": "arn:aws:events:*:*:rule/*" + } + ] + } + + ImageBuilderPolicy: + Type: AWS::IAM::ManagedPolicy + Properties: + PolicyDocument: + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "imagebuilder:CreateComponent", + 
"imagebuilder:GetComponent", + "imagebuilder:DeleteComponent", + "imagebuilder:CreateImageRecipe", + "imagebuilder:GetImageRecipe", + "imagebuilder:DeleteImageRecipe", + "imagebuilder:CreateImagePipeline", + "imagebuilder:GetImagePipeline", + "imagebuilder:DeleteImagePipeline", + "imagebuilder:CreateInfrastructureConfiguration", + "imagebuilder:GetInfrastructureConfiguration", + "imagebuilder:DeleteInfrastructureConfiguration", + "imagebuilder:CreateDistributionConfiguration", + "imagebuilder:GetDistributionConfiguration", + "imagebuilder:DeleteDistributionConfiguration", + "imagebuilder:TagResource", + "imagebuilder:StartImagePipelineExecution", + "ec2:DescribeImages", + "ec2:DescribeSnapshots", + "ec2:DescribeRegions", + "ec2:DescribeVolumes", + "ec2:DescribeKeyPairs", + "ec2:DescribeInstanceTypeOfferings" + ], + "Resource": "*" + } + ] + } diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py b/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py new file mode 100644 index 000000000..0c71d5e77 --- /dev/null +++ b/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py @@ -0,0 +1,86 @@ +import argparse +import copy +import json +import os +from urllib.request import urlopen + +import boto3 +import cfn_flip +from metadata import IMAGE_PARAMS + +current_dir = os.path.dirname(__file__) + +BUILDKITE_CF_TEMPLATE_URL = ( + "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml" +) + + +def format_params(*, stack_id, aws_region, ami_mapping): + params = copy.deepcopy(IMAGE_PARAMS[stack_id]) + with open( + os.path.join(current_dir, params["BootstrapScript"]), + encoding="utf-8", + ) as f: + bootstrap_script = f.read() + params["BaseImageId"] = ami_mapping[aws_region][params["BaseImageId"]] + params["BootstrapScript"] = bootstrap_script + return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()] + + +def get_ami_mapping(): + with urlopen(BUILDKITE_CF_TEMPLATE_URL) as response: + buildkite_cf_template = response.read().decode("utf-8") + cfn_obj = json.loads(cfn_flip.to_json(buildkite_cf_template)) + return cfn_obj["Mappings"]["AWSRegion2AMI"] + + +def get_full_stack_id(stack_id): + return f"buildkite-{stack_id}-worker" + + +def main(args): + with open( + os.path.join(current_dir, "ec2-image-builder-pipeline-template.yml"), + encoding="utf-8", + ) as f: + ec2_image_pipeline_template = f.read() + + ami_mapping = get_ami_mapping() + + for stack_id in IMAGE_PARAMS: + stack_id_full = get_full_stack_id(stack_id) + print(f"Creating EC2 image builder stack {stack_id_full}...") + + params = format_params( + stack_id=stack_id, aws_region=args.aws_region, ami_mapping=ami_mapping + ) + + client = boto3.client("cloudformation", region_name=args.aws_region) + response = client.create_stack( + StackName=stack_id_full, + TemplateBody=ec2_image_pipeline_template, + Capabilities=[ + "CAPABILITY_IAM", + "CAPABILITY_NAMED_IAM", + "CAPABILITY_AUTO_EXPAND", + ], + OnFailure="ROLLBACK", + EnableTerminationProtection=False, + Parameters=params, + ) + print( + f"EC2 image builder stack {stack_id_full} is in progress in the background" + ) + + for stack_id in IMAGE_PARAMS: + stack_id_full = get_full_stack_id(stack_id) + waiter = client.get_waiter("stack_create_complete") + waiter.wait(StackName=stack_id_full) + print(f"EC2 image builder stack {stack_id_full} is now finished.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--aws-region", 
type=str, required=True) + args = parser.parse_args() + main(args) diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml b/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml new file mode 100644 index 000000000..478adf3d4 --- /dev/null +++ b/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml @@ -0,0 +1,108 @@ +--- +AWSTemplateFormatVersion: "2010-09-09" +Description: "EC2 Image Builder pipelines to build workers" + +Parameters: + BaseImageId: + Type: String + Description: Base AMI to build a new image on top of. + + BootstrapScript: + Type: String + Description: Content of AMI customization script + + InstanceType: + Type: String + Description: Instance type for the Image Builder instances. + + InstanceOperatingSystem: + Type: String + Description: The operating system to run on the instance + AllowedValues: + - Linux + - Windows + Default: "Linux" + + VolumeSize: + Type: Number + Description: Size of EBS volume, in GiBs + +Conditions: + IsInstanceWindows: + !Equals [ !Ref InstanceOperatingSystem, "Windows" ] + +Resources: + # IAM role for the image builder instance + InstanceRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: "Allow" + Principal: + Service: "ec2.amazonaws.com" + Action: "sts:AssumeRole" + ManagedPolicyArns: + - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore + - arn:aws:iam::aws:policy/EC2InstanceProfileForImageBuilder + - arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess + + InstanceProfile: + Type: AWS::IAM::InstanceProfile + Properties: + Roles: + - !Ref InstanceRole + + # Component that runs the bootstrap script + BootstrapComponent: + Type: AWS::ImageBuilder::Component + Properties: + Name: !Sub "${AWS::StackName}-bootstrap-component" + Platform: !Ref InstanceOperatingSystem + Version: "1.0.0" + Description: Execute a bootstrap script. 
+ Data: !Ref BootstrapScript + + Recipe: + Type: AWS::ImageBuilder::ImageRecipe + Properties: + Name: !Sub "${AWS::StackName}-image" + Components: + - ComponentArn: !Ref BootstrapComponent + ParentImage: !Ref BaseImageId + BlockDeviceMappings: + - DeviceName: !If [IsInstanceWindows, "/dev/sda1", "/dev/xvda"] + Ebs: + DeleteOnTermination: true + Encrypted: false + VolumeSize: !Ref VolumeSize + VolumeType: gp2 + Version: "1.0.0" + + Infrastructure: + Type: AWS::ImageBuilder::InfrastructureConfiguration + Properties: + Name: !Sub "${AWS::StackName}-image-pipeline-infrastructure" + InstanceProfileName: !Ref InstanceProfile + InstanceTypes: + - !Ref InstanceType + TerminateInstanceOnFailure: true + + # Copy to this region only + Distribution: + Type: AWS::ImageBuilder::DistributionConfiguration + Properties: + Name: !Sub "${AWS::StackName}-image-pipeline-distribution-config" + Distributions: + - Region: !Ref AWS::Region + AmiDistributionConfiguration: {} + + # Composition of the above elements + Pipeline: + Type: AWS::ImageBuilder::ImagePipeline + Properties: + Name: !Sub "${AWS::StackName}-image-pipeline" + DistributionConfigurationArn: !Ref Distribution + ImageRecipeArn: !Ref Recipe + InfrastructureConfigurationArn: !Ref Infrastructure diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml b/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml new file mode 100644 index 000000000..a5c82a7fa --- /dev/null +++ b/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml @@ -0,0 +1,26 @@ +name: BuildKiteLinuxAMD64GPUBootstrap +description: Set up worker image for linux-amd64-gpu pipeline +schemaVersion: 1.0 + +phases: + - name: build + steps: + - name: SetupStep + action: ExecuteBash + inputs: + commands: + - | + yum groupinstall -y "Development tools" + yum install -y kernel-devel-$(uname -r) + aws s3 cp --recursive s3://ec2-linux-nvidia-drivers/latest/ . + chmod +x NVIDIA-Linux-x86_64*.run + CC=/usr/bin/gcc10-cc ./NVIDIA-Linux-x86_64*.run --silent + + amazon-linux-extras install docker + systemctl --now enable docker + distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) \ + && curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.repo \ + | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo + yum clean expire-cache + yum install -y nvidia-docker2 + systemctl restart docker diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py b/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py new file mode 100644 index 000000000..c74914e54 --- /dev/null +++ b/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py @@ -0,0 +1,18 @@ +IMAGE_PARAMS = { + "linux-amd64-gpu": { + "BaseImageId": "linuxamd64", + # AMI ID is looked up from Buildkite's CloudFormation template + "BootstrapScript": "linux-amd64-gpu-bootstrap.yml", + "InstanceType": "g4dn.xlarge", + "InstanceOperatingSystem": "Linux", + "VolumeSize": "40", # in GiBs + }, + "windows-gpu": { + "BaseImageId": "windows", + # AMI ID is looked up from Buildkite's CloudFormation template + "BootstrapScript": "windows-gpu-bootstrap.yml", + "InstanceType": "g4dn.2xlarge", + "InstanceOperatingSystem": "Windows", + "VolumeSize": "80", # in GiBs + }, +} diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py b/tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py new file mode 100644 index 000000000..9edb8b1a7 --- /dev/null +++ b/tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py @@ -0,0 +1,22 @@ +import argparse + +import boto3 +from create_worker_image_pipelines import get_full_stack_id +from metadata import IMAGE_PARAMS + + +def main(args): + cf = boto3.resource("cloudformation", region_name=args.aws_region) + builder_client = boto3.client("imagebuilder", region_name=args.aws_region) + for stack_id in IMAGE_PARAMS: + stack_id_full = get_full_stack_id(stack_id) + pipeline_arn = cf.Stack(stack_id_full).Resource("Pipeline").physical_resource_id + print(f"Running pipeline {pipeline_arn} to generate a new AMI...") + r = builder_client.start_image_pipeline_execution(imagePipelineArn=pipeline_arn) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--aws-region", type=str, required=True) + args = parser.parse_args() + main(args) diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml b/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml new file mode 100644 index 000000000..ef3fade44 --- /dev/null +++ b/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml @@ -0,0 +1,73 @@ +name: BuildKiteWindowsGPUBootstrap +description: Set up worker image for windows-gpu pipeline +schemaVersion: 1.0 + +phases: + - name: build + steps: + - name: SetupStep + action: ExecutePowerShell + inputs: + commands: + - | + $ErrorActionPreference = "Stop" + + choco --version + choco feature enable -n=allowGlobalConfirmation + + # CMake 3.18 + Write-Host '>>> Installing CMake 3.18...' + choco install cmake --version 3.18.0 --installargs "ADD_CMAKE_TO_PATH=System" + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + + # Notepad++ + Write-Host '>>> Installing Notepad++...' + choco install notepadplusplus + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + + # Miniconda + Write-Host '>>> Installing Miniconda...' + choco install miniconda3 /RegisterPython:1 /D:C:\tools\miniconda3 + C:\tools\miniconda3\Scripts\conda.exe init --user --system + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + . 
"C:\Windows\System32\WindowsPowerShell\v1.0\profile.ps1" + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + conda config --set auto_activate_base false + conda config --prepend channels conda-forge + + # Install Java 11 + Write-Host '>>> Installing Java 11...' + choco install openjdk11jre + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + + # Install GraphViz + Write-Host '>>> Installing GraphViz...' + choco install graphviz + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + + # Install Visual Studio Community 2017 (15.9) + Write-Host '>>> Installing Visual Studio 2017 Community (15.9)...' + choco install visualstudio2017community --version 15.9.23.0 ` + --params "--wait --passive --norestart" + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + choco install visualstudio2017-workload-nativedesktop --params ` + "--wait --passive --norestart --includeOptional" + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + + # Install CUDA 11.0 + Write-Host '>>> Installing CUDA 11.0...' + choco install cuda --version 11.0.3 + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + + # Install Python packages + Write-Host '>>> Installing Python packages...' + conda activate + conda install -y mamba + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + + # Install R + Write-Host '>>> Installing R...' + choco install r.project --version=3.6.3 + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + choco install rtools --version=3.5.0.4 + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/tests/buildkite/pipeline-win64.yml b/tests/buildkite/pipeline-win64.yml new file mode 100644 index 000000000..afb6aafba --- /dev/null +++ b/tests/buildkite/pipeline-win64.yml @@ -0,0 +1,22 @@ +steps: + - block: ":rocket: Run this test job" + #### -------- BUILD -------- + - label: ":windows: Build XGBoost for Windows with CUDA" + command: "tests/buildkite/build-win64-gpu.ps1" + key: build-win64-gpu + agents: + queue: windows-gpu + - label: ":windows: Build XGBoost R package for Windows with CUDA" + command: "tests/buildkite/build-rpkg-win64-gpu.ps1" + key: build-rpkg-win64-gpu + agents: + queue: windows-gpu + + - wait + + #### -------- TEST -------- + - label: ":windows: Test XGBoost on Windows" + command: "tests/buildkite/test-win64-gpu.ps1" + key: test-win64-gpu + agents: + queue: windows-gpu diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml new file mode 100644 index 000000000..7180e0f50 --- /dev/null +++ b/tests/buildkite/pipeline.yml @@ -0,0 +1,93 @@ +env: + DOCKER_CACHE_ECR_ID: "492475357299" + DOCKER_CACHE_ECR_REGION: "us-west-2" +steps: + - block: ":rocket: Run this test job" + #### -------- BUILD -------- + - label: ":console: Run clang-tidy" + command: "tests/buildkite/run-clang-tidy.sh" + key: run-clang-tidy + agents: + queue: linux-amd64-cpu + - label: ":console: Build CPU" + command: "tests/buildkite/build-cpu.sh" + key: build-cpu + agents: + queue: linux-amd64-cpu + - label: ":console: Build CPU ARM64" + command: "tests/buildkite/build-cpu-arm64.sh" + key: build-cpu-arm64 + agents: + queue: linux-arm64-cpu + - label: ":console: Build CUDA" + command: "tests/buildkite/build-cuda.sh" + key: build-cuda + agents: + queue: linux-amd64-cpu + - label: ":console: Build CUDA with RMM" + command: "tests/buildkite/build-cuda-with-rmm.sh" + key: build-cuda-with-rmm + agents: + queue: linux-amd64-cpu + - label: ":console: Build R package with CUDA" + command: "tests/buildkite/build-gpu-rpkg.sh" + key: build-gpu-rpkg + 
agents: + queue: linux-amd64-cpu + - label: ":console: Build JVM packages" + command: "tests/buildkite/build-jvm-packages.sh" + key: build-jvm-packages + agents: + queue: linux-amd64-cpu + - label: ":console: Build JVM packages with CUDA" + command: "tests/buildkite/build-jvm-packages-gpu.sh" + key: build-jvm-packages-gpu + agents: + queue: linux-amd64-mgpu + - label: ":console: Build JVM package doc" + command: "tests/buildkite/build-jvm-doc.sh" + key: build-jvm-doc + agents: + queue: linux-amd64-cpu + + - wait + + #### -------- TEST -------- + - label: ":console: Test Python package, CPU" + command: "tests/buildkite/test-python-cpu.sh" + key: test-python-cpu + agents: + queue: linux-amd64-cpu + - label: ":console: Test Python package, CPU ARM64" + command: "tests/buildkite/test-python-cpu-arm64.sh" + key: test-python-cpu-arm64 + agents: + queue: linux-arm64-cpu + - label: ":console: Test Python package, single GPU" + command: "tests/buildkite/test-python-gpu.sh gpu" + key: test-python-gpu + agents: + queue: linux-amd64-gpu + - label: ":console: Test Python package, 4 GPUs" + command: "tests/buildkite/test-python-gpu.sh mgpu" + key: test-python-mgpu + agents: + queue: linux-amd64-mgpu + - label: ":console: Run Google Tests, 4 GPUs" + command: "tests/buildkite/test-cpp-gpu.sh" + key: test-cpp-gpu + agents: + queue: linux-amd64-mgpu + - label: ":console: Run integration tests with JVM packages" + command: "tests/buildkite/test-integration-jvm-packages.sh" + key: test-integration-jvm-packages + agents: + queue: linux-amd64-cpu + + - wait + + - label: ":console: Deploy JVM packages" + command: "tests/buildkite/deploy-jvm-packages.sh" + key: deploy-jvm-packages + agents: + queue: linux-amd64-cpu diff --git a/tests/buildkite/run-clang-tidy.sh b/tests/buildkite/run-clang-tidy.sh new file mode 100755 index 000000000..2cab9d22d --- /dev/null +++ b/tests/buildkite/run-clang-tidy.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -euo pipefail + +CUDA_VERSION=11.0.3 + +echo "--- Run clang-tidy" + +source tests/buildkite/conftest.sh + +tests/ci_build/ci_build.sh clang_tidy docker \ + --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ + python3 tests/ci_build/tidy.py --cuda-archs 75 diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh new file mode 100755 index 000000000..d32e25fad --- /dev/null +++ b/tests/buildkite/test-cpp-gpu.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -euo pipefail + +CUDA_VERSION=11.0.3 + +source tests/buildkite/conftest.sh + +echo "--- Run Google Tests with CUDA, using 4 GPUs" +buildkite-agent artifact download "build/testxgboost" . --step build-cuda +chmod +x build/testxgboost +tests/ci_build/ci_build.sh gpu nvidia-docker \ + --build-arg CUDA_VERSION_ARG=$CUDA_VERSION build/testxgboost + +echo "--- Run Google Tests with CUDA, using 4 GPUs, RMM enabled" +rm -rfv build/ +buildkite-agent artifact download "build/testxgboost" . 
--step build-cuda-with-rmm +chmod +x build/testxgboost +tests/ci_build/ci_build.sh rmm nvidia-docker \ + --build-arg CUDA_VERSION_ARG=$CUDA_VERSION bash -c \ + "source activate gpu_test && build/testxgboost --use-rmm-pool" diff --git a/tests/buildkite/test-integration-jvm-packages.sh b/tests/buildkite/test-integration-jvm-packages.sh new file mode 100755 index 000000000..7c719241a --- /dev/null +++ b/tests/buildkite/test-integration-jvm-packages.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -euo pipefail + +JDK_VERSION=8 +SPARK_VERSION=3.0.1 + +source tests/buildkite/conftest.sh + +echo "--- Test XGBoost4J on a machine with JDK ${JDK_VERSION}, Spark ${SPARK_VERSION}" +buildkite-agent artifact download "jvm-packages/xgboost4j/target/*.jar" . --step build-jvm-packages +buildkite-agent artifact download "jvm-packages/xgboost4j-spark/target/*.jar" . --step build-jvm-packages +buildkite-agent artifact download "jvm-packages/xgboost4j-example/target/*.jar" . --step build-jvm-packages +export CI_DOCKER_EXTRA_PARAMS_INIT='-e RUN_INTEGRATION_TEST=1' +tests/ci_build/ci_build.sh jvm_cross docker --build-arg JDK_VERSION=${JDK_VERSION} \ + --build-arg SPARK_VERSION=${SPARK_VERSION} tests/ci_build/test_jvm_cross.sh diff --git a/tests/buildkite/test-python-cpu-arm64.sh b/tests/buildkite/test-python-cpu-arm64.sh new file mode 100755 index 000000000..ed1ad101b --- /dev/null +++ b/tests/buildkite/test-python-cpu-arm64.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -euo pipefail + +source tests/buildkite/conftest.sh + +echo "--- Test Python CPU ARM64" +buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cpu-arm64 +buildkite-agent artifact download "xgboost" . --step build-cpu-arm64 +chmod +x ./xgboost +tests/ci_build/ci_build.sh aarch64 docker tests/ci_build/test_python.sh cpu-arm64 diff --git a/tests/buildkite/test-python-cpu.sh b/tests/buildkite/test-python-cpu.sh new file mode 100755 index 000000000..5848c392a --- /dev/null +++ b/tests/buildkite/test-python-cpu.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -euo pipefail + +echo "--- Test CPU code in Python env" + +source tests/buildkite/conftest.sh + +mkdir -pv python-package/dist +buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cuda +buildkite-agent artifact download "xgboost" . --step build-cpu +chmod +x ./xgboost + +tests/ci_build/ci_build.sh cpu docker tests/ci_build/test_python.sh cpu diff --git a/tests/buildkite/test-python-gpu.sh b/tests/buildkite/test-python-gpu.sh new file mode 100755 index 000000000..a6285ba16 --- /dev/null +++ b/tests/buildkite/test-python-gpu.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +set -euo pipefail + +CUDA_VERSION=11.0.3 + +if [ "$#" -lt 1 ] +then + suite='' + args='' +else + suite=$1 + shift 1 + args="$@" +fi + +source tests/buildkite/conftest.sh + +echo "--- Fetch build artifacts" +buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cuda +buildkite-agent artifact download "build/testxgboost" . 
--step build-cuda
+chmod +x build/testxgboost
+
+# Allocate extra space in /dev/shm to enable NCCL
+export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'
+
+command_wrapper="tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg "\
+"CUDA_VERSION_ARG=$CUDA_VERSION"
+
+# Run the specified test suite, forwarding any extra arguments to pytest
+case "$suite" in
+  gpu)
+    echo "--- Test XGBoost Python package, single GPU"
+    $command_wrapper tests/ci_build/test_python.sh $suite $args
+    ;;
+
+  mgpu)
+    echo "--- Test XGBoost Python package, 4 GPUs"
+    $command_wrapper tests/ci_build/test_python.sh $suite $args
+    ;;
+
+  *)
+    echo "Usage: $0 {gpu|mgpu} [extra args to pass to pytest]"
+    exit 1
+    ;;
+esac
diff --git a/tests/buildkite/test-win64-gpu.ps1 b/tests/buildkite/test-win64-gpu.ps1
new file mode 100644
index 000000000..fcc6e8436
--- /dev/null
+++ b/tests/buildkite/test-win64-gpu.ps1
@@ -0,0 +1,43 @@
+$ErrorActionPreference = "Stop"
+
+. tests/buildkite/conftest.ps1
+
+Write-Host "--- Test XGBoost on Windows with CUDA"
+
+New-Item python-package/dist -ItemType Directory -ea 0
+New-Item build -ItemType Directory -ea 0
+buildkite-agent artifact download "python-package/dist/*.whl" . --step build-win64-gpu
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+buildkite-agent artifact download "build/testxgboost.exe" . --step build-win64-gpu
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+buildkite-agent artifact download "xgboost.exe" . --step build-win64-gpu
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+
+nvcc --version
+
+Write-Host "--- Run Google Tests"
+& build/testxgboost.exe
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+
+Write-Host "--- Set up Python env"
+conda activate
+$env_name = -join("win64_", (New-Guid).ToString().replace("-", ""))
+mamba env create -n ${env_name} --file=tests/ci_build/conda_env/win64_test.yml
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+conda activate ${env_name}
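+# The wheel filename embeds a version string that changes between builds, so glob
+# for it rather than hard-coding a name; the build-win64-gpu step is expected to
+# place a single wheel under python-package/dist/.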
+Get-ChildItem python-package/dist/*.whl |
+ForEach-Object {
+  & python -m pip install $_.FullName
+  if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+}
+
+Write-Host "--- Run Python tests"
+python -X faulthandler -m pytest -v -s -rxXs --fulltrace tests/python
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+Write-Host "--- Run Python tests with GPU"
+python -X faulthandler -m pytest -v -s -rxXs --fulltrace -m "(not slow) and (not mgpu)" `
+  tests/python-gpu
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
diff --git a/tests/ci_build/Dockerfile.aarch64 b/tests/ci_build/Dockerfile.aarch64
index cba3fa432..b7a863239 100644
--- a/tests/ci_build/Dockerfile.aarch64
+++ b/tests/ci_build/Dockerfile.aarch64
@@ -10,7 +10,7 @@ RUN \
     yum update -y && \
    yum install -y devtoolset-7 && \
     # Python
-    wget --no-verbose -O conda.sh https://github.com/conda-forge/miniforge/releases/download/4.8.2-1/Miniforge3-4.8.2-1-Linux-aarch64.sh && \
+    wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/4.8.2-1/Miniforge3-4.8.2-1-Linux-aarch64.sh && \
     bash conda.sh -b -p /opt/miniforge-python

 ENV PATH=/opt/miniforge-python/bin:$PATH
@@ -25,7 +25,7 @@ RUN conda env create -n aarch64_test --file=/scripts/aarch64_test.yml

 # Install lightweight sudo (not bound to TTY)
 RUN set -ex; \
-    wget --no-verbose -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-arm64" && \
+    wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-arm64" && \
     chmod +x /usr/local/bin/gosu && \
     gosu nobody true
diff --git a/tests/ci_build/Dockerfile.clang_tidy b/tests/ci_build/Dockerfile.clang_tidy
index 90a35a370..b0166f240 100644
--- a/tests/ci_build/Dockerfile.clang_tidy
+++ b/tests/ci_build/Dockerfile.clang_tidy
@@ -11,7 +11,7 @@ RUN \
     apt-get update && \
     apt-get install -y tar unzip wget git build-essential python3 python3-pip software-properties-common \
     apt-transport-https ca-certificates gnupg-agent && \
-    wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \
+    wget -nv -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \
     add-apt-repository -u 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main' && \
     apt-get update && \
     apt-get install -y llvm-11 clang-tidy-11 clang-11 && \
@@ -31,7 +31,7 @@ ENV GOSU_VERSION 1.10

 # Install lightweight sudo (not bound to TTY)
 RUN set -ex; \
-    wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
+    wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
     chmod +x /usr/local/bin/gosu && \
     gosu nobody true
diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu
index 4210f63f0..786ab834b 100644
--- a/tests/ci_build/Dockerfile.cpu
+++ b/tests/ci_build/Dockerfile.cpu
@@ -15,7 +15,7 @@ RUN \
     wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
     bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
     # Python
-    wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh && \
+    wget -nv https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh && \
     bash Mambaforge-Linux-x86_64.sh -b -p /opt/python

 ENV PATH=/opt/python/bin:$PATH
@@ -32,7 +32,7 @@ RUN mamba env create -n cpu_test --file=/scripts/cpu_test.yml

 # Install lightweight sudo (not bound to TTY)
 RUN set -ex; \
-    wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
+    wget -nv -O 
/usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ chmod +x /usr/local/bin/gosu && \ gosu nobody true diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu index ea4452564..2a022c207 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/tests/ci_build/Dockerfile.gpu @@ -12,7 +12,7 @@ RUN \ apt-get update && \ apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \ # Python - wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + wget -nv -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ bash Miniconda3.sh -b -p /opt/python ENV PATH=/opt/python/bin:$PATH @@ -30,7 +30,7 @@ ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/ # Install lightweight sudo (not bound to TTY) RUN set -ex; \ - wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ chmod +x /usr/local/bin/gosu && \ gosu nobody true diff --git a/tests/ci_build/Dockerfile.gpu_build b/tests/ci_build/Dockerfile.gpu_build index b9eaa0a59..0d9f6a27c 100644 --- a/tests/ci_build/Dockerfile.gpu_build +++ b/tests/ci_build/Dockerfile.gpu_build @@ -18,7 +18,7 @@ RUN \ wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \ bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \ # Python - wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + wget -nv -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ bash Miniconda3.sh -b -p /opt/python # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) @@ -37,7 +37,7 @@ ENV GOSU_VERSION 1.10 # Install lightweight sudo (not bound to TTY) RUN set -ex; \ - wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ chmod +x /usr/local/bin/gosu && \ gosu nobody true diff --git a/tests/ci_build/Dockerfile.gpu_jvm b/tests/ci_build/Dockerfile.gpu_jvm index b88120a4f..c4eda9150 100644 --- a/tests/ci_build/Dockerfile.gpu_jvm +++ b/tests/ci_build/Dockerfile.gpu_jvm @@ -16,15 +16,15 @@ RUN \ apt-get update && \ apt-get install -y tar unzip wget openjdk-$JDK_VERSION-jdk libgomp1 && \ # Python - wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + wget -nv -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ bash Miniconda3.sh -b -p /opt/python && \ /opt/python/bin/pip install awscli && \ # Maven - wget https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \ + wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \ tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \ ln -s /opt/apache-maven-3.6.1/ /opt/maven && \ # Spark - wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \ + wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \ tar xvf spark-$SPARK_VERSION-bin-hadoop2.7.tgz -C /opt && \ ln -s /opt/spark-$SPARK_VERSION-bin-hadoop2.7 /opt/spark @@ -38,7 +38,7 @@ ENV 
GOSU_VERSION 1.10 # Install lightweight sudo (not bound to TTY) RUN set -ex; \ - wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ chmod +x /usr/local/bin/gosu && \ gosu nobody true diff --git a/tests/ci_build/Dockerfile.jvm_cross b/tests/ci_build/Dockerfile.jvm_cross index e143051c3..ca4323615 100644 --- a/tests/ci_build/Dockerfile.jvm_cross +++ b/tests/ci_build/Dockerfile.jvm_cross @@ -13,15 +13,15 @@ RUN \ apt-get update && \ apt-get install -y tar unzip wget openjdk-$JDK_VERSION-jdk libgomp1 && \ # Python - wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + wget -nv -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ bash Miniconda3.sh -b -p /opt/python && \ /opt/python/bin/pip install awscli && \ # Maven - wget https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \ + wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \ tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \ ln -s /opt/apache-maven-3.6.1/ /opt/maven && \ # Spark - wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \ + wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \ tar xvf spark-$SPARK_VERSION-bin-hadoop2.7.tgz -C /opt && \ ln -s /opt/spark-$SPARK_VERSION-bin-hadoop2.7 /opt/spark @@ -35,7 +35,7 @@ ENV GOSU_VERSION 1.10 # Install lightweight sudo (not bound to TTY) RUN set -ex; \ - wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ chmod +x /usr/local/bin/gosu && \ gosu nobody true diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm index 237aa11b7..4e9f3ac3a 100644 --- a/tests/ci_build/Dockerfile.rmm +++ b/tests/ci_build/Dockerfile.rmm @@ -12,7 +12,7 @@ RUN \ apt-get update && \ apt-get install -y wget unzip bzip2 libgomp1 build-essential ninja-build git && \ # Python - wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + wget -nv -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ bash Miniconda3.sh -b -p /opt/python # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) @@ -33,7 +33,7 @@ ENV GOSU_VERSION 1.10 # Install lightweight sudo (not bound to TTY) RUN set -ex; \ - wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ chmod +x /usr/local/bin/gosu && \ gosu nobody true diff --git a/tests/ci_build/Dockerfile.s390x b/tests/ci_build/Dockerfile.s390x index 5ad4a7888..a29dbd407 100644 --- a/tests/ci_build/Dockerfile.s390x +++ b/tests/ci_build/Dockerfile.s390x @@ -15,7 +15,7 @@ ENV GOSU_VERSION 1.10 # Install lightweight sudo (not bound to TTY) RUN set -ex; \ - wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ chmod +x 
/usr/local/bin/gosu && \ gosu nobody true diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh index 6b3d5325e..c94670cc7 100755 --- a/tests/ci_build/ci_build.sh +++ b/tests/ci_build/ci_build.sh @@ -144,19 +144,19 @@ then DOCKER_CACHE_REPO="${DOCKER_CACHE_ECR_ID}.dkr.ecr.${DOCKER_CACHE_ECR_REGION}.amazonaws.com" echo "Using AWS ECR; repo URL = ${DOCKER_CACHE_REPO}" # Login for Docker registry - echo "\$(python3 -m awscli ecr get-login --no-include-email --region ${DOCKER_CACHE_ECR_REGION} --registry-ids ${DOCKER_CACHE_ECR_ID})" - $(python3 -m awscli ecr get-login --no-include-email --region ${DOCKER_CACHE_ECR_REGION} --registry-ids ${DOCKER_CACHE_ECR_ID}) + echo "\$(aws ecr get-login --no-include-email --region ${DOCKER_CACHE_ECR_REGION} --registry-ids ${DOCKER_CACHE_ECR_ID})" + $(aws ecr get-login --no-include-email --region ${DOCKER_CACHE_ECR_REGION} --registry-ids ${DOCKER_CACHE_ECR_ID}) # Pull pre-build container from Docker build cache, # if one exists for the particular branch or pull request - echo "docker pull ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" - if docker pull "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" + echo "docker pull --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" + if docker pull --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" then CACHE_FROM_CMD="--cache-from ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" else # If the build cache is empty of the particular branch or pull request, # use the build cache associated with the master branch - echo "docker pull ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:master" - docker pull "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:master" || true + echo "docker pull --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:master" + docker pull --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:master" || true CACHE_FROM_CMD="--cache-from ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:master" fi else @@ -188,11 +188,11 @@ then echo "docker tag ${DOCKER_IMG_NAME} ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" docker tag "${DOCKER_IMG_NAME}" "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" - echo "python3 -m awscli ecr create-repository --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION} || true" - python3 -m awscli ecr create-repository --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION} || true + echo "aws ecr create-repository --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION} || true" + aws ecr create-repository --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION} || true - echo "docker push ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" - docker push "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" + echo "docker push --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" + docker push --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" if [[ $? 
!= "0" ]]; then echo "ERROR: could not update Docker cache ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" exit 1 diff --git a/tests/ci_build/insert_vcomp140.py b/tests/ci_build/insert_vcomp140.py index dce593501..e45c7d204 100644 --- a/tests/ci_build/insert_vcomp140.py +++ b/tests/ci_build/insert_vcomp140.py @@ -14,5 +14,6 @@ for wheel_path in sorted(glob.glob(sys.argv[1])): assert m, f'wheel_path = {wheel_path}' version = m.group(1) + print(f"Inserting vcomp140.dll into {wheel_path}...") with zipfile.ZipFile(wheel_path, 'a') as f: f.write(vcomp140_path, 'xgboost-{}.data/data/xgboost/vcomp140.dll'.format(version))