More CI improvements (#8313)

* Reduce clutter in log of Python test

* Set up BuildKite test analytics

* Add separate step for building containers

* Enable incremental update of CI stack; custom agent IAM policy
This commit is contained in:
Philip Hyunsu Cho 2022-10-06 06:33:46 -08:00 committed by GitHub
parent bc7a6ec603
commit 50ff8a2623
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 230 additions and 32 deletions

View File

@ -0,0 +1,36 @@
#!/bin/bash
# Build one CI container via tests/ci_build/ci_build.sh.
#
# Usage: build-containers.sh [container to build]
#   Recognized containers: gpu, rmm, jvm_gpu_build
# Exit status: 1 when no container is given, 2 for an unrecognized container.

set -euo pipefail
set -x

if [ "$#" -lt 1 ]
then
  echo "Usage: $0 [container to build]" >&2
  # `return` is only valid inside a function or a sourced script; this file is
  # run as a command, so `exit` is required to stop with the intended status.
  exit 1
fi
container=$1

# conftest.sh defines CUDA_VERSION; RAPIDS_VERSION is presumably defined there
# too (TODO confirm). With `set -u`, a missing definition aborts the script.
source tests/buildkite/conftest.sh

echo "--- Build container ${container}"

# Extra `--build-arg` options, selected per container.
BUILD_ARGS=""
case "${container}" in
  gpu|rmm)
    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
    BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
    ;;
  jvm_gpu_build)
    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
    ;;
  *)
    echo "Unrecognized container ID: ${container}" >&2
    exit 2
    ;;
esac

# Run a no-op command. This will simply build the container and push it to the private registry
# NOTE: BUILD_ARGS is deliberately left unquoted so that it is split into
# separate command-line arguments.
tests/ci_build/ci_build.sh "${container}" docker ${BUILD_ARGS} bash

View File

@ -1,6 +1,25 @@
#!/bin/bash
set -euo pipefail
# Print the SecretString of an AWS Secrets Manager secret to stdout.
#   $1 -- name (ID) of the secret to fetch (region is fixed to us-west-2)
# Returns 1, with a usage message on stderr, unless exactly one argument is given.
function get_aws_secret {
  if [[ $# -ne 1 ]]
  then
    # The usage text goes to stderr: callers capture this function's stdout
    # with $(...), so anything printed there would be taken as the secret.
    echo "Usage: get_aws_secret [Name of secret]" >&2
    return 1
  fi
  # Quote the secret ID so it is always passed as a single argument.
  aws secretsmanager get-secret-value --secret-id "$1" --output text --region us-west-2 --query SecretString
}
# Append one `--env NAME` option per Buildkite variable to
# CI_DOCKER_EXTRA_PARAMS_INIT and export the result.
# Intended for use together with tests/ci_build/ci_build.sh.
function set_buildkite_env_vars_in_container {
  local name
  # Start from the current value, or from the empty string when it is unset.
  local docker_params="${CI_DOCKER_EXTRA_PARAMS_INIT:-}"
  for name in \
    BUILDKITE_ANALYTICS_TOKEN BUILDKITE_BUILD_ID BUILDKITE_BUILD_NUMBER \
    BUILDKITE_JOB_ID BUILDKITE_BRANCH BUILDKITE_COMMIT \
    BUILDKITE_MESSAGE BUILDKITE_BUILD_URL
  do
    docker_params+=" --env ${name}"
  done
  export CI_DOCKER_EXTRA_PARAMS_INIT="${docker_params}"
}
set -x
CUDA_VERSION=11.0.3

View File

@ -0,0 +1,32 @@
---
# CloudFormation template that declares a single managed IAM policy,
# BuildkiteAgentManagedPolicy, for the Buildkite agents.
AWSTemplateFormatVersion: "2010-09-09"
Description: "Buildkite agent's IAM policy"
Resources:
  BuildkiteAgentManagedPolicy:
    Type: AWS::IAM::ManagedPolicy
    Properties:
      PolicyDocument:
        Version: "2012-10-17"
        Statement:
          # All S3 and S3 Object Lambda actions, on every resource.
          - Effect: "Allow"
            Action:
              - "s3:*"
              - "s3-object-lambda:*"
            Resource: "*"
          # Invoke any Lambda function.
          - Effect: "Allow"
            Action: "lambda:InvokeFunction"
            Resource: "*"
          # Read the value of any Secrets Manager secret.
          # NOTE(review): Resource "*" is broad; consider scoping to the
          # secrets the agents actually need.
          - Effect: "Allow"
            Action: "secretsmanager:GetSecretValue"
            Resource: "*"

View File

@ -1,10 +1,14 @@
import argparse
import copy
import os
import re
import boto3
import botocore
from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS
current_dir = os.path.dirname(__file__)
TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
@ -33,7 +37,7 @@ def get_default_vpc(*, aws_region):
return ec2.Vpc(default_vpc_id)
def format_params(args, *, stack_id):
def format_params(args, *, stack_id, agent_iam_policy):
default_vpc = get_default_vpc(aws_region=args.aws_region)
azs = get_availability_zones(aws_region=args.aws_region)
# For each of the first two availability zones (AZs), choose the default subnet
@ -55,6 +59,7 @@ def format_params(args, *, stack_id):
params["BuildkiteAgentToken"] = args.agent_token
params["VpcId"] = default_vpc.id
params["Subnets"] = ",".join(subnets)
params["ManagedPolicyARN"] = agent_iam_policy
params.update(COMMON_STACK_PARAMS)
return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()]
@ -63,34 +68,112 @@ def get_full_stack_id(stack_id):
return f"buildkite-{stack_id}-autoscaling-group"
def main(args):
def stack_exists(args, *, stack_name):
    """Return True if the CloudFormation stack ``stack_name`` exists.

    The check polls the ``stack_exists`` waiter exactly once in the region
    given by ``args.aws_region``; any ``WaiterError`` is reported as False.
    """
    cf_client = boto3.client("cloudformation", region_name=args.aws_region)
    existence_waiter = cf_client.get_waiter("stack_exists")
    single_attempt = {"MaxAttempts": 1}
    try:
        existence_waiter.wait(StackName=stack_name, WaiterConfig=single_attempt)
    except botocore.exceptions.WaiterError:
        return False
    else:
        return True
def create_or_update_stack(
    args, *, stack_name, template_url=None, template_body=None, params=None
):
    """Create the CloudFormation stack, or update it if it already exists.

    Parameters
    ----------
    args :
        Parsed command-line arguments; ``args.aws_region`` selects the region.
    stack_name :
        Name of the stack to create or update.
    template_url, template_body :
        Template location or inline template text. Each is forwarded to
        CloudFormation only when truthy.
    params :
        Stack parameters, forwarded as ``Parameters`` only when truthy.

    Returns
    -------
    dict
        ``{"StackName": stack_name, "Action": action}`` where ``action`` is
        ``"create"``, ``"update"`` or ``"noop"`` (the update call reported
        that there was nothing to change).

    Raises
    ------
    botocore.exceptions.ClientError
        Any update failure other than the "No updates are to be performed"
        validation error is re-raised unchanged.
    """
    kwargs = {
        "StackName": stack_name,
        "Capabilities": [
            "CAPABILITY_IAM",
            "CAPABILITY_NAMED_IAM",
            "CAPABILITY_AUTO_EXPAND",
        ],
    }
    if template_url:
        kwargs["TemplateURL"] = template_url
    if template_body:
        kwargs["TemplateBody"] = template_body
    if params:
        kwargs["Parameters"] = params

    client = boto3.client("cloudformation", region_name=args.aws_region)

    if not stack_exists(args, stack_name=stack_name):
        # These two options are only passed when creating a stack.
        kwargs.update({"OnFailure": "ROLLBACK", "EnableTerminationProtection": False})
        client.create_stack(**kwargs)
        return {"StackName": stack_name, "Action": "create"}

    print(f"Stack {stack_name} already exists. Updating...")
    try:
        client.update_stack(**kwargs)
    except botocore.exceptions.ClientError as e:
        error = e.response["Error"]
        # An update with no changes is reported as a ValidationError; treat
        # it as a successful no-op instead of a failure.
        if error["Code"] == "ValidationError" and re.search(
            "No updates are to be performed", error["Message"]
        ):
            print(f"No update was made to {stack_name}")
            return {"StackName": stack_name, "Action": "noop"}
        # Bare raise keeps the original traceback intact.
        raise
    return {"StackName": stack_name, "Action": "update"}
def wait(promise, *, aws_region=None):
    """Block until the stack operation described by ``promise`` has finished.

    Parameters
    ----------
    promise :
        Dict with keys ``"StackName"`` and ``"Action"``, as returned by
        ``create_or_update_stack``. ``"Action"`` must be ``"create"``,
        ``"update"`` or ``"noop"``.
    aws_region :
        Region of the stack. When omitted, the region is read from a
        module-level ``args`` object, as the original implementation did.
        NOTE(review): that global is presumably bound in the ``__main__``
        block; pass ``aws_region`` explicitly when calling from elsewhere.

    Raises
    ------
    ValueError
        If ``promise["Action"]`` is not one of the recognized actions.
    """
    if aws_region is None:
        # Backward-compatible fallback on the implicit global.
        aws_region = args.aws_region  # noqa: F821
    client = boto3.client("cloudformation", region_name=aws_region)
    stack_name = promise["StackName"]
    action = promise["Action"]
    print(f"Waiting for {stack_name}...")
    if action == "create":
        client.get_waiter("stack_create_complete").wait(StackName=stack_name)
        print(f"Finished creating stack {stack_name}")
    elif action == "update":
        client.get_waiter("stack_update_complete").wait(StackName=stack_name)
        print(f"Finished updating stack {stack_name}")
    elif action != "noop":
        raise ValueError(f"Invalid promise {promise}")
def create_agent_iam_policy(args):
    """Create or update the agent IAM policy stack and return the policy's ID.

    The template is read from ``agent-iam-policy-template.yml`` next to this
    script. The function waits for the stack operation to finish, then
    returns the physical resource ID of ``BuildkiteAgentManagedPolicy``.
    """
    policy_stack_name = "buildkite-agent-iam-policy"
    print(f"Creating stack {policy_stack_name} for agent IAM policy...")

    template_path = os.path.join(current_dir, "agent-iam-policy-template.yml")
    with open(template_path, encoding="utf-8") as template_file:
        template_text = template_file.read()

    # Submit the stack operation and block until it completes.
    wait(
        create_or_update_stack(
            args, stack_name=policy_stack_name, template_body=template_text
        )
    )

    cloudformation = boto3.resource("cloudformation", region_name=args.aws_region)
    policy_resource = cloudformation.StackResource(
        policy_stack_name, "BuildkiteAgentManagedPolicy"
    )
    return policy_resource.physical_resource_id
def main(args):
agent_iam_policy = create_agent_iam_policy(args)
client = boto3.client("cloudformation", region_name=args.aws_region)
promises = []
for stack_id in AMI_ID:
stack_id_full = get_full_stack_id(stack_id)
print(f"Creating elastic CI stack {stack_id_full}...")
params = format_params(args, stack_id=stack_id)
response = client.create_stack(
StackName=stack_id_full,
TemplateURL=TEMPLATE_URL,
Capabilities=[
"CAPABILITY_IAM",
"CAPABILITY_NAMED_IAM",
"CAPABILITY_AUTO_EXPAND",
],
OnFailure="ROLLBACK",
EnableTerminationProtection=False,
Parameters=params,
params = format_params(
args, stack_id=stack_id, agent_iam_policy=agent_iam_policy
)
promise = create_or_update_stack(
args, stack_name=stack_id_full, template_url=TEMPLATE_URL, params=params
)
promises.append(promise)
print(f"CI stack {stack_id_full} is in progress in the background")
for stack_id in AMI_ID:
stack_id_full = get_full_stack_id(stack_id)
waiter = client.get_waiter("stack_create_complete")
waiter.wait(StackName=stack_id_full)
print(f"CI stack {stack_id_full} is now finished.")
for promise in promises:
wait(promise)
if __name__ == "__main__":

View File

@ -13,6 +13,7 @@ AMI_ID = {
"us-west-2": "ami-0a1a2ea551a07ad5f",
},
# Managed by BuildKite
# from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml
"linux-amd64-cpu": {
"us-west-2": "ami-075d4c25d5f0c17c1",
},
@ -105,7 +106,6 @@ COMMON_STACK_PARAMS = {
"EnableCostAllocationTags": "true",
"CostAllocationTagName": "CreatedBy",
"ECRAccessPolicy": "full",
"ManagedPolicyARN": "arn:aws:iam::aws:policy/AmazonS3FullAccess,arn:aws:iam::aws:policy/service-role/AWSLambdaRole",
"EnableSecretsPlugin": "false",
"EnableECRPlugin": "false",
"EnableDockerLoginPlugin": "false",

View File

@ -13,6 +13,15 @@ steps:
- wait
- block: ":rocket: Run this test job"
if: build.pull_request.repository.fork == true
#### -------- CONTAINER BUILD --------
# Container build step: runs build-containers.sh once for `gpu` and once for
# `jvm_gpu_build`, on an agent from the linux-amd64-cpu queue.
- label: ":docker: Build containers"
commands:
- "tests/buildkite/build-containers.sh gpu"
- "tests/buildkite/build-containers.sh jvm_gpu_build"
key: build-containers
agents:
queue: linux-amd64-cpu
- wait
#### -------- BUILD --------
- label: ":console: Build CUDA"
command: "tests/buildkite/build-cuda.sh"
@ -24,9 +33,7 @@ steps:
key: build-jvm-packages-gpu
agents:
queue: linux-amd64-mgpu
- wait
#### -------- TEST --------
- label: ":console: Test Python package, 4 GPUs"
command: "tests/buildkite/test-python-gpu.sh mgpu"

View File

@ -10,6 +10,15 @@ steps:
- wait
- block: ":rocket: Run this test job"
if: build.pull_request.repository.fork == true
#### -------- CONTAINER BUILD --------
# Container build step: runs build-containers.sh once for `gpu` and once for
# `rmm`, on an agent from the linux-amd64-cpu queue.
- label: ":docker: Build containers"
commands:
- "tests/buildkite/build-containers.sh gpu"
- "tests/buildkite/build-containers.sh rmm"
key: build-containers
agents:
queue: linux-amd64-cpu
- wait
#### -------- BUILD --------
- label: ":console: Run clang-tidy"
command: "tests/buildkite/run-clang-tidy.sh"
@ -52,9 +61,7 @@ steps:
key: build-jvm-doc
agents:
queue: linux-amd64-cpu
- wait
#### -------- TEST --------
- label: ":console: Test Python package, CPU"
command: "tests/buildkite/test-python-cpu.sh"
@ -81,9 +88,8 @@ steps:
key: test-integration-jvm-packages
agents:
queue: linux-amd64-cpu
- wait
#### -------- DEPLOY JVM --------
- label: ":console: Deploy JVM packages"
command: "tests/buildkite/deploy-jvm-packages.sh"
key: deploy-jvm-packages

View File

@ -29,11 +29,15 @@ command_wrapper="tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg "`
# Run specified test suite
case "$suite" in
gpu)
export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/gpu)
set_buildkite_env_vars_in_container
echo "--- Test XGBoost Python package, single GPU"
$command_wrapper tests/ci_build/test_python.sh $suite
;;
mgpu)
export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/mgpu)
set_buildkite_env_vars_in_container
echo "--- Test XGBoost Python package, 4 GPUs"
$command_wrapper tests/ci_build/test_python.sh $suite
;;

View File

@ -25,7 +25,9 @@ RUN \
python=3.9 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
pyspark cloudpickle cuda-python=11.7.0 && mamba clean --all
pyspark cloudpickle cuda-python=11.7.0 && \
mamba clean --all && \
conda run --no-capture-output -n gpu_test pip install buildkite-test-collector
ENV GOSU_VERSION 1.10
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/

View File

@ -27,8 +27,10 @@ ENV PATH=/opt/python/bin:$PATH
# Create new Conda environment with RMM
RUN \
conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake
conda install -c conda-forge mamba && \
mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \
mamba clean --all
ENV GOSU_VERSION 1.10

View File

@ -1,6 +1,5 @@
#!/bin/bash
set -e
set -x
if [ "$#" -lt 1 ]
then
@ -54,39 +53,47 @@ function uninstall_xgboost {
case "$suite" in
gpu)
source activate gpu_test
set -x
install_xgboost
setup_pyspark_envs
pytest -v -s -rxXs --fulltrace --durations=0 -m "not mgpu" ${args} tests/python-gpu
unset_pyspark_envs
uninstall_xgboost
set +x
;;
mgpu)
source activate gpu_test
set -x
install_xgboost
setup_pyspark_envs
pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/python-gpu
unset_pyspark_envs
uninstall_xgboost
set +x
;;
cpu)
source activate cpu_test
set -x
install_xgboost
export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1
setup_pyspark_envs
pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python
unset_pyspark_envs
uninstall_xgboost
set +x
;;
cpu-arm64)
source activate aarch64_test
set -x
install_xgboost
setup_pyspark_envs
pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python/test_basic.py tests/python/test_basic_models.py tests/python/test_model_compatibility.py
unset_pyspark_envs
uninstall_xgboost
set +x
;;
*)