More CI improvements (#8313)

* Reduce clutter in log of Python test
* Set up BuildKite test analytics
* Add separate step for building containers
* Enable incremental update of CI stack; custom agent IAM policy
2022-10-06 06:33:46 -08:00 · 2022-10-06 06:33:46 -08:00 · 50ff8a2623
commit 50ff8a2623
parent bc7a6ec603
11 changed files with 230 additions and 32 deletions
--- a/tests/buildkite/build-containers.sh
+++ b/tests/buildkite/build-containers.sh
@ -0,0 +1,36 @@
+#!/bin/bash
+# Build the CI container named by the first argument (gpu, rmm or jvm_gpu_build).
+set -euo pipefail
+set -x
+
+if [ "$#" -lt 1 ]
+then
+  echo "Usage: $0 [container to build]"
+  exit 1  # 'exit', not 'return': this script is executed, not sourced
+fi
+container=$1
+
+source tests/buildkite/conftest.sh
+
+echo "--- Build container ${container}"
+
+BUILD_ARGS=""  # space-separated docker flags; expanded unquoted below so they split into words
+
+case "${container}" in
+  gpu|rmm)
+    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
+    BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
+    ;;
+
+  jvm_gpu_build)
+    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
+    ;;
+
+  *)
+    echo "Unrecognized container ID: ${container}"
+    exit 2
+    ;;
+esac
+
+# Run a no-op command. This will simply build the container and push it to the private registry
+tests/ci_build/ci_build.sh "${container}" docker ${BUILD_ARGS} bash
--- a/tests/buildkite/conftest.sh
+++ b/tests/buildkite/conftest.sh
@ -1,6 +1,25 @@
 #!/bin/bash

 set -euo pipefail
+
+function get_aws_secret {
+  # Print the SecretString of the AWS Secrets Manager secret named by $1 (region us-west-2).
+  if [[ $# -ne 1 ]]; then
+    echo "Usage: get_aws_secret [Name of secret]" >&2  # stderr: callers capture stdout as the secret
+    return 1
+  fi
+  aws secretsmanager get-secret-value --secret-id "$1" --output text --region us-west-2 --query SecretString
+}
+
+function set_buildkite_env_vars_in_container {
+  # Append "--env <NAME>" flags for the Buildkite variables below to CI_DOCKER_EXTRA_PARAMS_INIT
+  # (for tests/ci_build/ci_build.sh). The `...` pairs are empty command substitutions joining the lines.
+  export CI_DOCKER_EXTRA_PARAMS_INIT="${CI_DOCKER_EXTRA_PARAMS_INIT:-} "`
+    `"--env BUILDKITE_ANALYTICS_TOKEN --env BUILDKITE_BUILD_ID --env BUILDKITE_BUILD_NUMBER "`
+    `"--env BUILDKITE_JOB_ID --env BUILDKITE_BRANCH --env BUILDKITE_COMMIT "`
+    `"--env BUILDKITE_MESSAGE --env BUILDKITE_BUILD_URL"
+}
+
 set -x

 CUDA_VERSION=11.0.3
--- a/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml
+++ b/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml
@ -0,0 +1,32 @@
+---
+AWSTemplateFormatVersion: "2010-09-09"
+Description: "Buildkite agent's IAM policy"
+
+Resources:
+  BuildkiteAgentManagedPolicy:  # logical ID that create_stack.py looks up to get the policy's physical ID
+    Type: AWS::IAM::ManagedPolicy
+    Properties:
+      PolicyDocument:  # allows all S3 / S3 Object Lambda actions, Lambda invocation and secret reads on any resource
+        {
+          "Version": "2012-10-17",
+          "Statement": [
+            {
+              "Effect": "Allow",
+              "Action": [
+                "s3:*",
+                "s3-object-lambda:*"
+              ],
+              "Resource": "*"
+            },
+            {
+              "Effect": "Allow",
+              "Action": "lambda:InvokeFunction",
+              "Resource": "*"
+            },
+            {
+              "Effect": "Allow",
+              "Action": "secretsmanager:GetSecretValue",
+              "Resource": "*"
+            }
+          ]
+        }
--- a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
+++ b/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
@ -1,10 +1,14 @@
 import argparse
 import copy
+import os
+import re

 import boto3
-
+import botocore
 from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS

+current_dir = os.path.dirname(__file__)
+
 TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"


@ -33,7 +37,7 @@ def get_default_vpc(*, aws_region):
    return ec2.Vpc(default_vpc_id)


-def format_params(args, *, stack_id):
+def format_params(args, *, stack_id, agent_iam_policy):
    default_vpc = get_default_vpc(aws_region=args.aws_region)
    azs = get_availability_zones(aws_region=args.aws_region)
    # For each of the first two availability zones (AZs), choose the default subnet
@ -55,6 +59,7 @@ def format_params(args, *, stack_id):
    params["BuildkiteAgentToken"] = args.agent_token
    params["VpcId"] = default_vpc.id
    params["Subnets"] = ",".join(subnets)
+    params["ManagedPolicyARN"] = agent_iam_policy
    params.update(COMMON_STACK_PARAMS)
    return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()]

@ -63,34 +68,112 @@ def get_full_stack_id(stack_id):
    return f"buildkite-{stack_id}-autoscaling-group"


-def main(args):
+def stack_exists(args, *, stack_name):
+    """Return True if a CloudFormation stack named ``stack_name`` exists.
+
+    The ``stack_exists`` waiter is polled exactly once in ``args.aws_region``;
+    a ``WaiterError`` from that single attempt is reported as False.
+    """
    client = boto3.client("cloudformation", region_name=args.aws_region)
+    waiter = client.get_waiter("stack_exists")
+    try:
+        waiter.wait(StackName=stack_name, WaiterConfig={"MaxAttempts": 1})
+        return True
+    except botocore.exceptions.WaiterError:
+        return False
+
+
+def create_or_update_stack(
+    args, *, stack_name, template_url=None, template_body=None, params=None
+):
+    """Create the CloudFormation stack ``stack_name``, or update it if it exists.
+
+    ``template_url``, ``template_body`` and ``params`` are forwarded to
+    CloudFormation only when they are truthy. The stack is looked up and
+    modified in ``args.aws_region``.
+
+    Returns a "promise" dict to hand to ``wait()``, with the keys
+    ``StackName``, ``AwsRegion`` and ``Action``. ``Action`` is ``"create"``,
+    ``"update"``, or ``"noop"`` when CloudFormation reports that there is
+    nothing to update. Any other ``ClientError`` from the update is re-raised.
+    """
+    kwargs = {
+        "StackName": stack_name,
+        "Capabilities": [
+            "CAPABILITY_IAM",
+            "CAPABILITY_NAMED_IAM",
+            "CAPABILITY_AUTO_EXPAND",
+        ],
+    }
+    if template_url:
+        kwargs["TemplateURL"] = template_url
+    if template_body:
+        kwargs["TemplateBody"] = template_body
+    if params:
+        kwargs["Parameters"] = params
+
+    client = boto3.client("cloudformation", region_name=args.aws_region)
+    # The region is recorded in the promise so that wait() can rebuild a
+    # client without reaching for a global variable.
+    promise = {"StackName": stack_name, "AwsRegion": args.aws_region}
+
+    if stack_exists(args, stack_name=stack_name):
+        print(f"Stack {stack_name} already exists. Updating...")
+        try:
+            client.update_stack(**kwargs)
+        except botocore.exceptions.ClientError as e:
+            error = e.response["Error"]
+            if error["Code"] == "ValidationError" and re.search(
+                "No updates are to be performed", error["Message"]
+            ):
+                print(f"No update was made to {stack_name}")
+                return {**promise, "Action": "noop"}
+            # Bare raise keeps the original traceback intact.
+            raise
+        return {**promise, "Action": "update"}
+
+    kwargs.update({"OnFailure": "ROLLBACK", "EnableTerminationProtection": False})
+    client.create_stack(**kwargs)
+    return {**promise, "Action": "create"}
+
+
+def wait(promise):
+    """Block until the stack operation described by ``promise`` has finished.
+
+    ``promise`` is the dict returned by ``create_or_update_stack()``. A
+    ``"create"`` or ``"update"`` action is awaited with the matching
+    CloudFormation waiter, ``"noop"`` needs no waiting, and any other action
+    raises ``ValueError``.
+    """
+    # Use the region stored in the promise; the previous code read a
+    # module-level ``args`` that is not passed to this function.
+    client = boto3.client("cloudformation", region_name=promise["AwsRegion"])
+    stack_name = promise["StackName"]
+    action = promise["Action"]
+    print(f"Waiting for {stack_name}...")
+    if action == "create":
+        client.get_waiter("stack_create_complete").wait(StackName=stack_name)
+        print(f"Finished creating stack {stack_name}")
+    elif action == "update":
+        client.get_waiter("stack_update_complete").wait(StackName=stack_name)
+        print(f"Finished updating stack {stack_name}")
+    elif action != "noop":
+        raise ValueError(f"Invalid promise {promise}")
+
+
+def create_agent_iam_policy(args):
+    """Deploy the agent IAM policy stack and return the policy's physical resource ID.
+
+    The template is read from ``agent-iam-policy-template.yml`` in this
+    script's directory. The stack is created or updated in
+    ``args.aws_region``, and the function blocks until that operation ends.
+    """
+    policy_stack_name = "buildkite-agent-iam-policy"
+    print(f"Creating stack {policy_stack_name} for agent IAM policy...")
+    template_path = os.path.join(current_dir, "agent-iam-policy-template.yml")
+    with open(template_path, encoding="utf-8") as template_file:
+        template_text = template_file.read()
+    wait(
+        create_or_update_stack(
+            args, stack_name=policy_stack_name, template_body=template_text
+        )
+    )
+
+    # Look the policy up by its logical ID in the stack that was just deployed.
+    cloudformation = boto3.resource("cloudformation", region_name=args.aws_region)
+    return cloudformation.StackResource(
+        policy_stack_name, "BuildkiteAgentManagedPolicy"
+    ).physical_resource_id
+
+
+def main(args):
+    """Deploy the agent IAM policy stack, then create or update every CI stack in ``AMI_ID``.
+
+    All CI stack operations are started first and only then awaited one by
+    one, so the stacks are processed in the background while later ones are
+    still being submitted.
+    """
+    agent_iam_policy = create_agent_iam_policy(args)
+
+    promises = []

    for stack_id in AMI_ID:
        stack_id_full = get_full_stack_id(stack_id)
        print(f"Creating elastic CI stack {stack_id_full}...")

-        params = format_params(args, stack_id=stack_id)
-
-        response = client.create_stack(
-            StackName=stack_id_full,
-            TemplateURL=TEMPLATE_URL,
-            Capabilities=[
-                "CAPABILITY_IAM",
-                "CAPABILITY_NAMED_IAM",
-                "CAPABILITY_AUTO_EXPAND",
-            ],
-            OnFailure="ROLLBACK",
-            EnableTerminationProtection=False,
-            Parameters=params,
+        params = format_params(
+            args, stack_id=stack_id, agent_iam_policy=agent_iam_policy
        )
+
+        promise = create_or_update_stack(
+            args, stack_name=stack_id_full, template_url=TEMPLATE_URL, params=params
+        )
+        promises.append(promise)
        print(f"CI stack {stack_id_full} is in progress in the background")

-    for stack_id in AMI_ID:
-        stack_id_full = get_full_stack_id(stack_id)
-        waiter = client.get_waiter("stack_create_complete")
-        waiter.wait(StackName=stack_id_full)
-        print(f"CI stack {stack_id_full} is now finished.")
+    for promise in promises:
+        wait(promise)


 if __name__ == "__main__":
--- a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py
+++ b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py
@ -13,6 +13,7 @@ AMI_ID = {
        "us-west-2": "ami-0a1a2ea551a07ad5f",
    },
    # Managed by BuildKite
+    # from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml
    "linux-amd64-cpu": {
        "us-west-2": "ami-075d4c25d5f0c17c1",
    },
@ -105,7 +106,6 @@ COMMON_STACK_PARAMS = {
    "EnableCostAllocationTags": "true",
    "CostAllocationTagName": "CreatedBy",
    "ECRAccessPolicy": "full",
-    "ManagedPolicyARN": "arn:aws:iam::aws:policy/AmazonS3FullAccess,arn:aws:iam::aws:policy/service-role/AWSLambdaRole",
    "EnableSecretsPlugin": "false",
    "EnableECRPlugin": "false",
    "EnableDockerLoginPlugin": "false",
--- a/tests/buildkite/pipeline-mgpu.yml
+++ b/tests/buildkite/pipeline-mgpu.yml
@ -13,6 +13,15 @@ steps:
  - wait
  - block: ":rocket: Run this test job"
    if: build.pull_request.repository.fork == true
+  #### -------- CONTAINER BUILD --------
+  - label: ":docker: Build containers"
+    commands:
+      - "tests/buildkite/build-containers.sh gpu"
+      - "tests/buildkite/build-containers.sh jvm_gpu_build"
+    key: build-containers
+    agents:
+      queue: linux-amd64-cpu
+  - wait
  #### -------- BUILD --------
  - label: ":console: Build CUDA"
    command: "tests/buildkite/build-cuda.sh"
@ -24,9 +33,7 @@ steps:
    key: build-jvm-packages-gpu
    agents:
      queue: linux-amd64-mgpu
-
  - wait
-
  #### -------- TEST --------
  - label: ":console: Test Python package, 4 GPUs"
    command: "tests/buildkite/test-python-gpu.sh mgpu"
--- a/tests/buildkite/pipeline.yml
+++ b/tests/buildkite/pipeline.yml
@ -10,6 +10,15 @@ steps:
  - wait
  - block: ":rocket: Run this test job"
    if: build.pull_request.repository.fork == true
+  #### -------- CONTAINER BUILD --------
+  - label: ":docker: Build containers"
+    commands:
+      - "tests/buildkite/build-containers.sh gpu"
+      - "tests/buildkite/build-containers.sh rmm"
+    key: build-containers
+    agents:
+      queue: linux-amd64-cpu
+  - wait
  #### -------- BUILD --------
  - label: ":console: Run clang-tidy"
    command: "tests/buildkite/run-clang-tidy.sh"
@ -52,9 +61,7 @@ steps:
    key: build-jvm-doc
    agents:
      queue: linux-amd64-cpu
-
  - wait
-
  #### -------- TEST --------
  - label: ":console: Test Python package, CPU"
    command: "tests/buildkite/test-python-cpu.sh"
@ -81,9 +88,8 @@ steps:
    key: test-integration-jvm-packages
    agents:
      queue: linux-amd64-cpu
-
  - wait
-
+  #### -------- DEPLOY JVM --------
  - label: ":console: Deploy JVM packages"
    command: "tests/buildkite/deploy-jvm-packages.sh"
    key: deploy-jvm-packages
--- a/tests/buildkite/test-python-gpu.sh
+++ b/tests/buildkite/test-python-gpu.sh
@ -29,11 +29,15 @@ command_wrapper="tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg "`
 # Run specified test suite
 case "$suite" in
  gpu)
+    export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/gpu)
+    set_buildkite_env_vars_in_container
    echo "--- Test XGBoost Python package, single GPU"
    $command_wrapper tests/ci_build/test_python.sh $suite
    ;;

  mgpu)
+    export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/mgpu)
+    set_buildkite_env_vars_in_container
    echo "--- Test XGBoost Python package, 4 GPUs"
    $command_wrapper tests/ci_build/test_python.sh $suite
    ;;
--- a/tests/ci_build/Dockerfile.gpu
+++ b/tests/ci_build/Dockerfile.gpu
@ -25,7 +25,9 @@ RUN \
        python=3.9 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
        dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
        numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
-        pyspark cloudpickle cuda-python=11.7.0 && mamba clean --all
+        pyspark cloudpickle cuda-python=11.7.0 && \
+    mamba clean --all && \
+    conda run --no-capture-output -n gpu_test pip install buildkite-test-collector

 ENV GOSU_VERSION 1.10
 ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
--- a/tests/ci_build/Dockerfile.rmm
+++ b/tests/ci_build/Dockerfile.rmm
@ -27,8 +27,10 @@ ENV PATH=/opt/python/bin:$PATH

 # Create new Conda environment with RMM
 RUN \
-    conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
-        python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake
+    conda install -c conda-forge mamba && \
+    mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
+        python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \
+    mamba clean --all

 ENV GOSU_VERSION 1.10

--- a/tests/ci_build/test_python.sh
+++ b/tests/ci_build/test_python.sh
@ -1,6 +1,5 @@
 #!/bin/bash
 set -e
-set -x

 if [ "$#" -lt 1 ]
 then
@ -54,39 +53,47 @@ function uninstall_xgboost {
 case "$suite" in
  gpu)
    source activate gpu_test
+    set -x
    install_xgboost
    setup_pyspark_envs
    pytest -v -s -rxXs --fulltrace --durations=0 -m "not mgpu" ${args} tests/python-gpu
    unset_pyspark_envs
    uninstall_xgboost
+    set +x
    ;;

  mgpu)
    source activate gpu_test
+    set -x
    install_xgboost
    setup_pyspark_envs
    pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/python-gpu
    unset_pyspark_envs
    uninstall_xgboost
+    set +x
    ;;

  cpu)
    source activate cpu_test
+    set -x
    install_xgboost
    export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1
    setup_pyspark_envs
    pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python
    unset_pyspark_envs
    uninstall_xgboost
+    set +x
    ;;

  cpu-arm64)
    source activate aarch64_test
+    set -x
    install_xgboost
    setup_pyspark_envs
    pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python/test_basic.py tests/python/test_basic_models.py tests/python/test_model_compatibility.py
    unset_pyspark_envs
    uninstall_xgboost
+    set +x
    ;;

  *)