From 9af99760d4dcbd7a7aa8321a169a45ed715f6a66 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Fri, 30 Sep 2022 05:42:56 -0700 Subject: [PATCH] Various CI savings (#8291) --- tests/buildkite/enforce_daily_budget.py | 14 ++++++++++ tests/buildkite/enforce_daily_budget.sh | 15 +++++++++++ .../aws-stack-creator/metadata.py | 19 ++++++++++--- tests/buildkite/pipeline-mgpu.yml | 5 ++++ tests/buildkite/pipeline-win64.yml | 9 +++++-- tests/buildkite/pipeline.yml | 5 ++++ tests/ci_build/ci_build.sh | 27 +++++++++++++++++-- 7 files changed, 87 insertions(+), 7 deletions(-) create mode 100644 tests/buildkite/enforce_daily_budget.py create mode 100755 tests/buildkite/enforce_daily_budget.sh diff --git a/tests/buildkite/enforce_daily_budget.py b/tests/buildkite/enforce_daily_budget.py new file mode 100644 index 000000000..af1b1ce48 --- /dev/null +++ b/tests/buildkite/enforce_daily_budget.py @@ -0,0 +1,14 @@ +import json +import argparse + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--response", type=str, required=True) + args = parser.parse_args() + with open(args.response, "r") as f: + payload = f.read() + response = json.loads(payload) + if response["approved"]: + print(f"Testing approved. Reason: {response['reason']}") + else: + raise RuntimeError(f"Testing rejected. Reason: {response['reason']}") diff --git a/tests/buildkite/enforce_daily_budget.sh b/tests/buildkite/enforce_daily_budget.sh new file mode 100755 index 000000000..9be920d5a --- /dev/null +++ b/tests/buildkite/enforce_daily_budget.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -euo pipefail + +echo "--- Enforce daily budget" + +source tests/buildkite/conftest.sh + +if [[ $is_release_branch == 1 ]] +then + echo "Automatically approving all test jobs for release branches" +else + aws lambda invoke --function-name XGBoostCICostWatcher --invocation-type RequestResponse --region us-west-2 response.json + python3 tests/buildkite/enforce_daily_budget.py --response response.json +fi diff --git a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py index 8906309d1..6e94c5108 100644 --- a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py +++ b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py @@ -9,6 +9,9 @@ AMI_ID = { "windows-gpu": { "us-west-2": "ami-0a1a2ea551a07ad5f", }, + "windows-cpu": { + "us-west-2": "ami-0a1a2ea551a07ad5f", + }, # Managed by BuildKite "linux-amd64-cpu": { "us-west-2": "ami-075d4c25d5f0c17c1", @@ -37,7 +40,7 @@ STACK_PARAMS = { "InstanceType": "g4dn.12xlarge", "AgentsPerInstance": "1", "MinSize": "0", - "MaxSize": "4", + "MaxSize": "1", "OnDemandPercentage": "100", "ScaleOutFactor": "1.0", "ScaleInIdlePeriod": "60", # in seconds @@ -50,7 +53,17 @@ STACK_PARAMS = { "MaxSize": "2", "OnDemandPercentage": "100", "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "600", # in seconds + "ScaleInIdlePeriod": "60", # in seconds + }, + "windows-cpu": { + "InstanceOperatingSystem": "windows", + "InstanceType": "c5a.2xlarge", + "AgentsPerInstance": "1", + "MinSize": "0", + "MaxSize": "2", + "OnDemandPercentage": "100", + "ScaleOutFactor": "1.0", + "ScaleInIdlePeriod": "60", # in seconds }, "linux-amd64-cpu": { "InstanceOperatingSystem": "linux", @@ -92,7 +105,7 @@ COMMON_STACK_PARAMS = { "EnableCostAllocationTags": "true", "CostAllocationTagName": "CreatedBy", "ECRAccessPolicy": "full", - "ManagedPolicyARN": "arn:aws:iam::aws:policy/AmazonS3FullAccess", + "ManagedPolicyARN": "arn:aws:iam::aws:policy/AmazonS3FullAccess,arn:aws:iam::aws:policy/service-role/AWSLambdaRole", "EnableSecretsPlugin": "false", "EnableECRPlugin": "false", "EnableDockerLoginPlugin": "false", diff --git a/tests/buildkite/pipeline-mgpu.yml b/tests/buildkite/pipeline-mgpu.yml index 215c1584f..be71033ad 100644 --- a/tests/buildkite/pipeline-mgpu.yml +++ b/tests/buildkite/pipeline-mgpu.yml @@ -5,6 +5,11 @@ env: # Skip uploading artifacts to S3 bucket # Also, don't build all CUDA archs; just build sm_75 steps: + - label: ":moneybag: Enforce daily budget" + command: "tests/buildkite/enforce_daily_budget.sh" + key: enforce-daily-budget + agents: + queue: pipeline-loader - block: ":rocket: Run this test job" if: build.pull_request.repository.fork == true #### -------- BUILD -------- diff --git a/tests/buildkite/pipeline-win64.yml b/tests/buildkite/pipeline-win64.yml index d4d171b02..9b73baeb3 100644 --- a/tests/buildkite/pipeline-win64.yml +++ b/tests/buildkite/pipeline-win64.yml @@ -1,4 +1,9 @@ steps: + - label: ":moneybag: Enforce daily budget" + command: "tests/buildkite/enforce_daily_budget.sh" + key: enforce-daily-budget + agents: + queue: pipeline-loader - block: ":rocket: Run this test job" if: build.pull_request.repository.fork == true #### -------- BUILD -------- @@ -6,12 +11,12 @@ steps: command: "tests/buildkite/build-win64-gpu.ps1" key: build-win64-gpu agents: - queue: windows-gpu + queue: windows-cpu - label: ":windows: Build XGBoost R package for Windows with CUDA" command: "tests/buildkite/build-rpkg-win64-gpu.ps1" key: build-rpkg-win64-gpu agents: - queue: windows-gpu + queue: windows-cpu - wait diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml index 91c6b0144..1cd44db3d 100644 --- a/tests/buildkite/pipeline.yml +++ b/tests/buildkite/pipeline.yml @@ -2,6 +2,11 @@ env: DOCKER_CACHE_ECR_ID: "492475357299" DOCKER_CACHE_ECR_REGION: "us-west-2" steps: + - label: ":moneybag: Enforce daily budget" + command: "tests/buildkite/enforce_daily_budget.sh" + key: enforce-daily-budget + agents: + queue: pipeline-loader - block: ":rocket: Run this test job" if: build.pull_request.repository.fork == true #### -------- BUILD -------- diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh index c94670cc7..9c55cf1d8 100755 --- a/tests/ci_build/ci_build.sh +++ b/tests/ci_build/ci_build.sh @@ -188,8 +188,31 @@ then echo "docker tag ${DOCKER_IMG_NAME} ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" docker tag "${DOCKER_IMG_NAME}" "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" - echo "aws ecr create-repository --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION} || true" - aws ecr create-repository --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION} || true + # Attempt to create Docker repository; it will fail if the repository already exists + echo "aws ecr create-repository --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION}" + if aws ecr create-repository --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION} + then + # Repository was created. Now set expiration policy + echo "aws ecr put-lifecycle-policy --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION} --lifecycle-policy-text file:///dev/stdin" + cat <