[CI] Add nightly CI job to test against dev version of deps (#10351)

* [CI] Add nightly CI job to test against dev version of deps

* Update build-containers.sh

* Add build step

* Wait for build artifact

* Try pinning dask

* Address reviewers' comments

* Fix unbound variable error

* Specify dev version exactly

* Pin dask=2024.1.1
This commit is contained in:
Philip Hyunsu Cho
2024-06-03 19:28:55 -07:00
committed by GitHub
parent eb6622ff7a
commit 4057f861c1
14 changed files with 120 additions and 18 deletions

View File

@@ -20,16 +20,16 @@ case "${container}" in
cpu)
;;
gpu)
gpu|gpu_build_centos7)
BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
;;
gpu_build_centos7)
gpu_dev_ver)
BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$DEV_RAPIDS_VERSION"
;;
jvm_gpu_build)

View File

@@ -25,6 +25,7 @@ set -x
# Pinned versions of the toolchain and dependencies used by the CI scripts.
# CUDA / NCCL / RAPIDS values are forwarded to container builds as the
# CUDA_VERSION_ARG / NCCL_VERSION_ARG / RAPIDS_VERSION_ARG build args.
CUDA_VERSION=11.8.0
NCCL_VERSION=2.16.5-1
# RAPIDS version used for the regular `gpu` container.
# NOTE(review): an updater script appears to rewrite this line and the next
# one with sed patterns anchored on `^RAPIDS_VERSION=` / `^DEV_RAPIDS_VERSION=`
# followed by MAJOR.MINOR digits. Keep the unquoted, line-initial form unless
# that script is updated too - confirm before reformatting.
RAPIDS_VERSION=24.04
# RAPIDS development version, used for the `gpu_dev_ver` container.
DEV_RAPIDS_VERSION=24.06
SPARK_VERSION=3.4.0
JDK_VERSION=8
R_VERSION=4.3.2

View File

@@ -0,0 +1,37 @@
# Nightly CI pipeline, to test against dev versions of dependencies.
# Order of work: build the containers, build CUDA, then run the Python GPU
# test suites (single GPU and 4 GPUs). `wait` entries sit between the stages.
env:
DOCKER_CACHE_ECR_ID: "492475357299"
DOCKER_CACHE_ECR_REGION: "us-west-2"
DISABLE_RELEASE: "1"
# ^ DISABLE_RELEASE: skip uploading artifacts to the S3 bucket.
#   Also, don't build all CUDA archs; just build sm_75.
USE_DEPS_DEV_VER: "1"
# ^ USE_DEPS_DEV_VER: use dev versions of RAPIDS and other dependencies.
steps:
#### -------- CONTAINER BUILD --------
# Build the gpu_build_centos7 and gpu_dev_ver containers.
- label: ":docker: Build containers"
commands:
- "tests/buildkite/build-containers.sh gpu_build_centos7"
- "tests/buildkite/build-containers.sh gpu_dev_ver"
key: build-containers
agents:
queue: linux-amd64-cpu
- wait
# Run the CUDA build script on a CPU-queue agent.
- label: ":console: Build CUDA"
command: "tests/buildkite/build-cuda.sh"
key: build-cuda
agents:
queue: linux-amd64-cpu
- wait
# Python package tests; the script argument selects the suite (gpu / mgpu).
- label: ":console: Test Python package, single GPU"
command: "tests/buildkite/test-python-gpu.sh gpu"
key: test-python-gpu
agents:
queue: linux-amd64-gpu
- label: ":console: Test Python package, 4 GPUs"
command: "tests/buildkite/test-python-gpu.sh mgpu"
key: test-python-mgpu
agents:
queue: linux-amd64-mgpu

View File

@@ -22,10 +22,19 @@ chmod +x build/testxgboost
# Allocate extra space in /dev/shm to enable NCCL
export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'
command_wrapper="tests/ci_build/ci_build.sh gpu --use-gpus --build-arg "`
if [[ -z "${USE_DEPS_DEV_VER-}" ]]
then
container_tag='gpu'
rapids_version=${RAPIDS_VERSION}
else
container_tag='gpu_dev_ver'
rapids_version=${DEV_RAPIDS_VERSION}
fi
command_wrapper="tests/ci_build/ci_build.sh ${container_tag} --use-gpus --build-arg "`
`"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
`"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "`
`"NCCL_VERSION_ARG=$NCCL_VERSION"
`"RAPIDS_VERSION_ARG=${rapids_version} --build-arg "`
`"NCCL_VERSION_ARG=$NCCL_VERSION"
# Run specified test suite
case "$suite" in

View File

@@ -4,7 +4,10 @@ set -euo pipefail
# Latest released RAPIDS version (MAJOR.MINOR), taken from the name of the
# newest rapidsai/cuml GitHub release with the leading "v" and anything after
# MAJOR.MINOR stripped.
LATEST_RAPIDS_VERSION=$(gh api repos/rapidsai/cuml/releases/latest --jq '.name' | sed -e 's/^v\([[:digit:]]\+\.[[:digit:]]\+\).*/\1/')
# The value is spliced into a date expression and a sed replacement below, so
# stop early with a clear message if the release name had an unexpected shape
# (the sed above passes non-matching names through unchanged).
if [[ ! "${LATEST_RAPIDS_VERSION}" =~ ^[0-9]+\.[0-9]+$ ]]; then
  echo "Unexpected RAPIDS release name: '${LATEST_RAPIDS_VERSION}'" >&2
  exit 1
fi
echo "LATEST_RAPIDS_VERSION = $LATEST_RAPIDS_VERSION"
# Development version = latest release + 2 months, e.g. 24.04 -> 24.06.
# The version is expanded to "20YY-MM-01" so GNU date can do the month
# arithmetic; cut/tr then turn "20YY-MM-DD" back into "YY.MM".
DEV_RAPIDS_VERSION=$(date +%Y-%m-%d -d "20${LATEST_RAPIDS_VERSION//./-}-01 + 2 month" | cut -c3-7 | tr - .)
echo "DEV_RAPIDS_VERSION = $DEV_RAPIDS_VERSION"

# Directory containing this script. `&&` (not `;`) so that a failed cd makes
# the substitution fail instead of silently yielding the caller's directory.
PARENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)
# Rewrite the pinned versions in conftest.sh in place (GNU sed -i). The path
# is quoted so a checkout location containing spaces still works.
sed -i "s/^RAPIDS_VERSION=[[:digit:]]\+\.[[:digit:]]\+/RAPIDS_VERSION=${LATEST_RAPIDS_VERSION}/" "$PARENT_PATH/conftest.sh"
sed -i "s/^DEV_RAPIDS_VERSION=[[:digit:]]\+\.[[:digit:]]\+/DEV_RAPIDS_VERSION=${DEV_RAPIDS_VERSION}/" "$PARENT_PATH/conftest.sh"