More CI improvements (#8313)
* Reduce clutter in log of Python test * Set up BuildKite test analytics * Add separate step for building containers * Enable incremental update of CI stack; custom agent IAM policy
This commit is contained in:
parent
bc7a6ec603
commit
50ff8a2623
36
tests/buildkite/build-containers.sh
Executable file
36
tests/buildkite/build-containers.sh
Executable file
@ -0,0 +1,36 @@
|
||||
#!/bin/bash
# Build one CI container image by running a no-op command through
# tests/ci_build/ci_build.sh.
#
# Usage: tests/buildkite/build-containers.sh [container to build]
#   Recognized containers: gpu, rmm, jvm_gpu_build
#
# Exit status:
#   1  no container ID was given
#   2  the container ID is not recognized

set -euo pipefail
set -x

if [ "$#" -lt 1 ]
then
  echo "Usage: $0 [container to build]" >&2
  # This script is executed, not sourced, so it must use `exit`; `return` is
  # only valid inside a function or a sourced script.
  exit 1
fi
container=$1

# Provides shared CI settings such as CUDA_VERSION.
# NOTE(review): RAPIDS_VERSION is presumably defined there as well -- confirm.
source tests/buildkite/conftest.sh

echo "--- Build container ${container}"

BUILD_ARGS=""

case "${container}" in
  gpu|rmm)
    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
    BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
    ;;

  jvm_gpu_build)
    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
    ;;

  *)
    echo "Unrecognized container ID: ${container}" >&2
    exit 2
    ;;
esac

# Run a no-op command. This will simply build the container and push it to the private registry
# BUILD_ARGS is left unquoted on purpose so that it splits into separate arguments.
tests/ci_build/ci_build.sh ${container} docker ${BUILD_ARGS} bash
|
||||
@ -1,6 +1,25 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Print the SecretString of an AWS Secrets Manager secret to stdout.
#   $1: name (secret ID) of the secret to fetch from region us-west-2
# Returns 1 without printing anything to stdout when called with the wrong
# number of arguments.
function get_aws_secret {
  if [[ $# -ne 1 ]]
  then
    # Usage goes to stderr so that callers using $(get_aws_secret ...) never
    # capture the usage text as if it were the secret value.
    echo "Usage: get_aws_secret [Name of secret]" >&2
    return 1
  fi
  # Quote the secret ID to protect it from word splitting and globbing.
  aws secretsmanager get-secret-value --secret-id "$1" --output text --region us-west-2 --query SecretString
}
|
||||
|
||||
# Append `--env` flags for the Buildkite-specific environment variables to
# CI_DOCKER_EXTRA_PARAMS_INIT and export it.
# Intended for use together with tests/ci_build/ci_build.sh.
function set_buildkite_env_vars_in_container {
  # Start from the current value (empty when unset), followed by one space.
  local docker_env="${CI_DOCKER_EXTRA_PARAMS_INIT:-} "
  docker_env+="--env BUILDKITE_ANALYTICS_TOKEN --env BUILDKITE_BUILD_ID --env BUILDKITE_BUILD_NUMBER "
  docker_env+="--env BUILDKITE_JOB_ID --env BUILDKITE_BRANCH --env BUILDKITE_COMMIT "
  docker_env+="--env BUILDKITE_MESSAGE --env BUILDKITE_BUILD_URL"
  export CI_DOCKER_EXTRA_PARAMS_INIT="${docker_env}"
}
|
||||
|
||||
set -x
|
||||
|
||||
CUDA_VERSION=11.0.3
|
||||
|
||||
@ -0,0 +1,32 @@
|
||||
---
|
||||
AWSTemplateFormatVersion: "2010-09-09"
|
||||
Description: "Buildkite agent's IAM policy"
|
||||
|
||||
Resources:
|
||||
BuildkiteAgentManagedPolicy:
|
||||
Type: AWS::IAM::ManagedPolicy
|
||||
Properties:
|
||||
PolicyDocument:
|
||||
{
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:*",
|
||||
"s3-object-lambda:*"
|
||||
],
|
||||
"Resource": "*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": "lambda:InvokeFunction",
|
||||
"Resource": "*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": "secretsmanager:GetSecretValue",
|
||||
"Resource": "*"
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -1,10 +1,14 @@
|
||||
import argparse
|
||||
import copy
|
||||
import os
|
||||
import re
|
||||
|
||||
import boto3
|
||||
|
||||
import botocore
|
||||
from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS
|
||||
|
||||
current_dir = os.path.dirname(__file__)
|
||||
|
||||
TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
|
||||
|
||||
|
||||
@ -33,7 +37,7 @@ def get_default_vpc(*, aws_region):
|
||||
return ec2.Vpc(default_vpc_id)
|
||||
|
||||
|
||||
def format_params(args, *, stack_id):
|
||||
def format_params(args, *, stack_id, agent_iam_policy):
|
||||
default_vpc = get_default_vpc(aws_region=args.aws_region)
|
||||
azs = get_availability_zones(aws_region=args.aws_region)
|
||||
# For each of the first two availability zones (AZs), choose the default subnet
|
||||
@ -55,6 +59,7 @@ def format_params(args, *, stack_id):
|
||||
params["BuildkiteAgentToken"] = args.agent_token
|
||||
params["VpcId"] = default_vpc.id
|
||||
params["Subnets"] = ",".join(subnets)
|
||||
params["ManagedPolicyARN"] = agent_iam_policy
|
||||
params.update(COMMON_STACK_PARAMS)
|
||||
return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()]
|
||||
|
||||
@ -63,34 +68,112 @@ def get_full_stack_id(stack_id):
|
||||
return f"buildkite-{stack_id}-autoscaling-group"
|
||||
|
||||
|
||||
def main(args):
|
||||
def stack_exists(args, *, stack_name):
    """Report whether the CloudFormation stack ``stack_name`` exists.

    Uses the ``stack_exists`` waiter of a CloudFormation client created for
    ``args.aws_region``, limited to a single attempt.

    Returns:
        True if the waiter completes, False if it raises ``WaiterError``.
    """
    cfn = boto3.client("cloudformation", region_name=args.aws_region)
    probe = cfn.get_waiter("stack_exists")
    try:
        # One attempt only: this is an existence check, not a polling wait.
        probe.wait(StackName=stack_name, WaiterConfig={"MaxAttempts": 1})
    except botocore.exceptions.WaiterError:
        return False
    else:
        return True
|
||||
|
||||
|
||||
def create_or_update_stack(
    args, *, stack_name, template_url=None, template_body=None, params=None
):
    """Create the stack ``stack_name``, or update it if it already exists.

    Args:
        args: Parsed arguments; ``args.aws_region`` selects the AWS region.
        stack_name: Name of the CloudFormation stack.
        template_url: Passed as ``TemplateURL`` when truthy.
        template_body: Passed as ``TemplateBody`` when truthy.
        params: Passed as ``Parameters`` when truthy.

    Returns:
        A dict ``{"StackName": ..., "Action": ...}`` where the action is
        ``"create"``, ``"update"`` or ``"noop"`` (the update request was
        rejected because there was nothing to update).

    Raises:
        botocore.exceptions.ClientError: The update failed for any reason
            other than "No updates are to be performed".
    """
    optional = (
        ("TemplateURL", template_url),
        ("TemplateBody", template_body),
        ("Parameters", params),
    )
    request = {
        "StackName": stack_name,
        "Capabilities": [
            "CAPABILITY_IAM",
            "CAPABILITY_NAMED_IAM",
            "CAPABILITY_AUTO_EXPAND",
        ],
        # Only forward the optional settings the caller actually supplied.
        **{key: value for key, value in optional if value},
    }

    cfn = boto3.client("cloudformation", region_name=args.aws_region)

    if not stack_exists(args, stack_name=stack_name):
        cfn.create_stack(
            **request, OnFailure="ROLLBACK", EnableTerminationProtection=False
        )
        return {"StackName": stack_name, "Action": "create"}

    print(f"Stack {stack_name} already exists. Updating...")
    try:
        cfn.update_stack(**request)
    except botocore.exceptions.ClientError as err:
        error = err.response["Error"]
        nothing_to_update = (
            error["Code"] == "ValidationError"
            and "No updates are to be performed" in error["Message"]
        )
        if not nothing_to_update:
            raise
        print(f"No update was made to {stack_name}")
        return {"StackName": stack_name, "Action": "noop"}
    return {"StackName": stack_name, "Action": "update"}
|
||||
|
||||
|
||||
def wait(promise):
    """Block until the stack operation described by ``promise`` has finished.

    Args:
        promise: Dict with ``"StackName"`` and ``"Action"`` keys, as returned
            by ``create_or_update_stack``.

    Raises:
        ValueError: ``promise["Action"]`` is not ``"create"``, ``"update"``
            or ``"noop"``.
    """
    # NOTE(review): `args` is not a parameter here; it is resolved from module
    # scope, so this only works when a global `args` exists -- confirm.
    cfn = boto3.client("cloudformation", region_name=args.aws_region)
    stack_name = promise["StackName"]
    print(f"Waiting for {stack_name}...")

    action = promise["Action"]
    if action == "create":
        waiter_name = "stack_create_complete"
        done_message = f"Finished creating stack {stack_name}"
    elif action == "update":
        waiter_name = "stack_update_complete"
        done_message = f"Finished updating stack {stack_name}"
    elif action == "noop":
        # Nothing was submitted, so there is nothing to wait for.
        return
    else:
        raise ValueError(f"Invalid promise {promise}")

    cfn.get_waiter(waiter_name).wait(StackName=stack_name)
    print(done_message)
|
||||
|
||||
|
||||
def create_agent_iam_policy(args):
    """Create or update the agent IAM policy stack and return its policy ID.

    Reads ``agent-iam-policy-template.yml`` from the directory of this script,
    submits it as the ``buildkite-agent-iam-policy`` stack and waits for the
    operation to finish.

    Returns:
        The physical resource ID of the stack's ``BuildkiteAgentManagedPolicy``
        resource.
    """
    policy_stack_name = "buildkite-agent-iam-policy"
    print(f"Creating stack {policy_stack_name} for agent IAM policy...")

    template_path = os.path.join(current_dir, "agent-iam-policy-template.yml")
    with open(template_path, encoding="utf-8") as template_file:
        template_text = template_file.read()

    # The policy must be fully provisioned before its ID can be looked up.
    wait(
        create_or_update_stack(
            args, stack_name=policy_stack_name, template_body=template_text
        )
    )

    cloudformation = boto3.resource("cloudformation", region_name=args.aws_region)
    policy_resource = cloudformation.StackResource(
        policy_stack_name, "BuildkiteAgentManagedPolicy"
    )
    return policy_resource.physical_resource_id
|
||||
|
||||
|
||||
def main(args):
|
||||
agent_iam_policy = create_agent_iam_policy(args)
|
||||
|
||||
client = boto3.client("cloudformation", region_name=args.aws_region)
|
||||
|
||||
promises = []
|
||||
|
||||
for stack_id in AMI_ID:
|
||||
stack_id_full = get_full_stack_id(stack_id)
|
||||
print(f"Creating elastic CI stack {stack_id_full}...")
|
||||
|
||||
params = format_params(args, stack_id=stack_id)
|
||||
|
||||
response = client.create_stack(
|
||||
StackName=stack_id_full,
|
||||
TemplateURL=TEMPLATE_URL,
|
||||
Capabilities=[
|
||||
"CAPABILITY_IAM",
|
||||
"CAPABILITY_NAMED_IAM",
|
||||
"CAPABILITY_AUTO_EXPAND",
|
||||
],
|
||||
OnFailure="ROLLBACK",
|
||||
EnableTerminationProtection=False,
|
||||
Parameters=params,
|
||||
params = format_params(
|
||||
args, stack_id=stack_id, agent_iam_policy=agent_iam_policy
|
||||
)
|
||||
|
||||
promise = create_or_update_stack(
|
||||
args, stack_name=stack_id_full, template_url=TEMPLATE_URL, params=params
|
||||
)
|
||||
promises.append(promise)
|
||||
print(f"CI stack {stack_id_full} is in progress in the background")
|
||||
|
||||
for stack_id in AMI_ID:
|
||||
stack_id_full = get_full_stack_id(stack_id)
|
||||
waiter = client.get_waiter("stack_create_complete")
|
||||
waiter.wait(StackName=stack_id_full)
|
||||
print(f"CI stack {stack_id_full} is now finished.")
|
||||
for promise in promises:
|
||||
wait(promise)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -13,6 +13,7 @@ AMI_ID = {
|
||||
"us-west-2": "ami-0a1a2ea551a07ad5f",
|
||||
},
|
||||
# Managed by BuildKite
|
||||
# from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml
|
||||
"linux-amd64-cpu": {
|
||||
"us-west-2": "ami-075d4c25d5f0c17c1",
|
||||
},
|
||||
@ -105,7 +106,6 @@ COMMON_STACK_PARAMS = {
|
||||
"EnableCostAllocationTags": "true",
|
||||
"CostAllocationTagName": "CreatedBy",
|
||||
"ECRAccessPolicy": "full",
|
||||
"ManagedPolicyARN": "arn:aws:iam::aws:policy/AmazonS3FullAccess,arn:aws:iam::aws:policy/service-role/AWSLambdaRole",
|
||||
"EnableSecretsPlugin": "false",
|
||||
"EnableECRPlugin": "false",
|
||||
"EnableDockerLoginPlugin": "false",
|
||||
|
||||
@ -13,6 +13,15 @@ steps:
|
||||
- wait
|
||||
- block: ":rocket: Run this test job"
|
||||
if: build.pull_request.repository.fork == true
|
||||
#### -------- CONTAINER BUILD --------
|
||||
- label: ":docker: Build containers"
|
||||
commands:
|
||||
- "tests/buildkite/build-containers.sh gpu"
|
||||
- "tests/buildkite/build-containers.sh jvm_gpu_build"
|
||||
key: build-containers
|
||||
agents:
|
||||
queue: linux-amd64-cpu
|
||||
- wait
|
||||
#### -------- BUILD --------
|
||||
- label: ":console: Build CUDA"
|
||||
command: "tests/buildkite/build-cuda.sh"
|
||||
@ -24,9 +33,7 @@ steps:
|
||||
key: build-jvm-packages-gpu
|
||||
agents:
|
||||
queue: linux-amd64-mgpu
|
||||
|
||||
- wait
|
||||
|
||||
#### -------- TEST --------
|
||||
- label: ":console: Test Python package, 4 GPUs"
|
||||
command: "tests/buildkite/test-python-gpu.sh mgpu"
|
||||
|
||||
@ -10,6 +10,15 @@ steps:
|
||||
- wait
|
||||
- block: ":rocket: Run this test job"
|
||||
if: build.pull_request.repository.fork == true
|
||||
#### -------- CONTAINER BUILD --------
|
||||
- label: ":docker: Build containers"
|
||||
commands:
|
||||
- "tests/buildkite/build-containers.sh gpu"
|
||||
- "tests/buildkite/build-containers.sh rmm"
|
||||
key: build-containers
|
||||
agents:
|
||||
queue: linux-amd64-cpu
|
||||
- wait
|
||||
#### -------- BUILD --------
|
||||
- label: ":console: Run clang-tidy"
|
||||
command: "tests/buildkite/run-clang-tidy.sh"
|
||||
@ -52,9 +61,7 @@ steps:
|
||||
key: build-jvm-doc
|
||||
agents:
|
||||
queue: linux-amd64-cpu
|
||||
|
||||
- wait
|
||||
|
||||
#### -------- TEST --------
|
||||
- label: ":console: Test Python package, CPU"
|
||||
command: "tests/buildkite/test-python-cpu.sh"
|
||||
@ -81,9 +88,8 @@ steps:
|
||||
key: test-integration-jvm-packages
|
||||
agents:
|
||||
queue: linux-amd64-cpu
|
||||
|
||||
- wait
|
||||
|
||||
#### -------- DEPLOY JVM --------
|
||||
- label: ":console: Deploy JVM packages"
|
||||
command: "tests/buildkite/deploy-jvm-packages.sh"
|
||||
key: deploy-jvm-packages
|
||||
|
||||
@ -29,11 +29,15 @@ command_wrapper="tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg "`
|
||||
# Run specified test suite
|
||||
case "$suite" in
|
||||
gpu)
|
||||
export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/gpu)
|
||||
set_buildkite_env_vars_in_container
|
||||
echo "--- Test XGBoost Python package, single GPU"
|
||||
$command_wrapper tests/ci_build/test_python.sh $suite
|
||||
;;
|
||||
|
||||
mgpu)
|
||||
export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/mgpu)
|
||||
set_buildkite_env_vars_in_container
|
||||
echo "--- Test XGBoost Python package, 4 GPUs"
|
||||
$command_wrapper tests/ci_build/test_python.sh $suite
|
||||
;;
|
||||
|
||||
@ -25,7 +25,9 @@ RUN \
|
||||
python=3.9 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
|
||||
dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
|
||||
numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
|
||||
pyspark cloudpickle cuda-python=11.7.0 && mamba clean --all
|
||||
pyspark cloudpickle cuda-python=11.7.0 && \
|
||||
mamba clean --all && \
|
||||
conda run --no-capture-output -n gpu_test pip install buildkite-test-collector
|
||||
|
||||
ENV GOSU_VERSION 1.10
|
||||
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
|
||||
|
||||
@ -27,8 +27,10 @@ ENV PATH=/opt/python/bin:$PATH
|
||||
|
||||
# Create new Conda environment with RMM
|
||||
RUN \
|
||||
conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
|
||||
python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake
|
||||
conda install -c conda-forge mamba && \
|
||||
mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
|
||||
python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \
|
||||
mamba clean --all
|
||||
|
||||
ENV GOSU_VERSION 1.10
|
||||
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
set -x
|
||||
|
||||
if [ "$#" -lt 1 ]
|
||||
then
|
||||
@ -54,39 +53,47 @@ function uninstall_xgboost {
|
||||
case "$suite" in
|
||||
gpu)
|
||||
source activate gpu_test
|
||||
set -x
|
||||
install_xgboost
|
||||
setup_pyspark_envs
|
||||
pytest -v -s -rxXs --fulltrace --durations=0 -m "not mgpu" ${args} tests/python-gpu
|
||||
unset_pyspark_envs
|
||||
uninstall_xgboost
|
||||
set +x
|
||||
;;
|
||||
|
||||
mgpu)
|
||||
source activate gpu_test
|
||||
set -x
|
||||
install_xgboost
|
||||
setup_pyspark_envs
|
||||
pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/python-gpu
|
||||
unset_pyspark_envs
|
||||
uninstall_xgboost
|
||||
set +x
|
||||
;;
|
||||
|
||||
cpu)
|
||||
source activate cpu_test
|
||||
set -x
|
||||
install_xgboost
|
||||
export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1
|
||||
setup_pyspark_envs
|
||||
pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python
|
||||
unset_pyspark_envs
|
||||
uninstall_xgboost
|
||||
set +x
|
||||
;;
|
||||
|
||||
cpu-arm64)
|
||||
source activate aarch64_test
|
||||
set -x
|
||||
install_xgboost
|
||||
setup_pyspark_envs
|
||||
pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python/test_basic.py tests/python/test_basic_models.py tests/python/test_model_compatibility.py
|
||||
unset_pyspark_envs
|
||||
uninstall_xgboost
|
||||
set +x
|
||||
;;
|
||||
|
||||
*)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user