Philip Hyunsu Cho 6d8afb2218
[CI] Require C++17 + CMake 3.18; Use CUDA 11.8 in CI (#8853)
* Update to C++17

* Turn off unity build

* Update CMake to 3.18

* Use MSVC 2022 + CUDA 11.8

* Re-create stack for worker images

* Allocate more disk space for Windows

* Temporarily disable clang-tidy

* RAPIDS now requires Python 3.10+

* Unpin cuda-python

* Use latest NCCL

* Use Ubuntu 20.04 in RMM image

* Mark failing mgpu test as xfail
2023-03-01 09:22:24 -08:00

128 lines
4.0 KiB
Python

import argparse
import copy
import os
import re
import sys
import boto3
import botocore
from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS
current_dir = os.path.dirname(__file__)
sys.path.append(os.path.join(current_dir, ".."))
from common_blocks.utils import create_or_update_stack, wait
# URL of the stack template (aws-stack.yml from the buildkite-aws-stack bucket);
# main() passes it as `template_url` for every elastic CI stack it creates.
TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
def get_availability_zones(*, aws_region):
    """Return the sorted names of the availability zones in ``aws_region``.

    Calls EC2 ``describe_availability_zones`` filtered by region name and by
    zone type ``availability-zone``, then sorts the ``ZoneName`` values.
    """
    ec2_client = boto3.client("ec2", region_name=aws_region)
    zone_filters = [
        {"Name": "region-name", "Values": [aws_region]},
        {"Name": "zone-type", "Values": ["availability-zone"]},
    ]
    response = ec2_client.describe_availability_zones(Filters=zone_filters)
    zone_names = [zone["ZoneName"] for zone in response["AvailabilityZones"]]
    zone_names.sort()
    return zone_names
def get_default_vpc(*, aws_region):
    """Return the default VPC resource of ``aws_region``.

    The first VPC matching the ``is-default`` filter is returned. When the
    filter yields nothing, a default VPC is created and returned instead.
    """
    ec2 = boto3.resource("ec2", region_name=aws_region)
    default_only = [{"Name": "is-default", "Values": ["true"]}]
    existing_vpc = next(iter(ec2.vpcs.filter(Filters=default_only)), None)
    if existing_vpc is not None:
        return existing_vpc
    # Create default VPC if not exist
    ec2_client = boto3.client("ec2", region_name=aws_region)
    created = ec2_client.create_default_vpc()
    return ec2.Vpc(created["Vpc"]["VpcId"])
def format_params(args, *, stack_id, agent_iam_policy):
    """Build the stack parameter list for the CI stack ``stack_id``.

    Args:
        args: Parsed command-line arguments; ``aws_region`` and
            ``agent_token`` are read from it.
        stack_id: Key into ``STACK_PARAMS`` and ``AMI_ID``; also used as the
            ``BuildkiteQueue`` value.
        agent_iam_policy: Value stored under ``ManagedPolicyARN``.

    Returns:
        A list of ``{"ParameterKey": ..., "ParameterValue": ...}`` dicts.

    Raises:
        RuntimeError: If the default VPC does not have exactly two default
            subnets across the first two availability zones.
    """
    default_vpc = get_default_vpc(aws_region=args.aws_region)
    azs = get_availability_zones(aws_region=args.aws_region)
    # For each of the first two availability zones (AZs), choose the default subnet
    subnets = [
        x.id
        for x in default_vpc.subnets.filter(
            Filters=[
                {"Name": "default-for-az", "Values": ["true"]},
                {"Name": "availability-zone", "Values": azs[:2]},
            ]
        )
    ]
    # Explicit check instead of `assert`: asserts are removed under `python -O`,
    # and a bare AssertionError gives no hint about what is wrong in the account.
    if len(subnets) != 2:
        raise RuntimeError(
            f"Expected exactly 2 default subnets in VPC {default_vpc.id} for "
            f"availability zones {azs[:2]} in region {args.aws_region}, "
            f"but found {len(subnets)}: {subnets}"
        )
    # Deep copy so the shared STACK_PARAMS entry is never mutated.
    params = copy.deepcopy(STACK_PARAMS[stack_id])
    params["ImageId"] = AMI_ID[stack_id][args.aws_region]
    params["BuildkiteQueue"] = stack_id
    params["CostAllocationTagValue"] = f"buildkite-{stack_id}"
    params["BuildkiteAgentToken"] = args.agent_token
    params["VpcId"] = default_vpc.id
    params["Subnets"] = ",".join(subnets)
    params["ManagedPolicyARN"] = agent_iam_policy
    # Applied last, so COMMON_STACK_PARAMS wins on any key it shares with the above.
    params.update(COMMON_STACK_PARAMS)
    return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()]
def get_full_stack_id(stack_id):
    """Return the full stack name for the short ``stack_id``."""
    full_stack_name = f"buildkite-{stack_id}-autoscaling-group"
    return full_stack_name
def create_agent_iam_policy(args, *, client):
    """Create or update the agent IAM policy stack and return the policy's ID.

    Reads ``agent-iam-policy-template.yml`` from this script's directory,
    submits it through ``create_or_update_stack`` and hands the result to
    ``wait``. The returned value is the physical resource ID of the stack's
    ``BuildkiteAgentManagedPolicy`` resource.
    """
    policy_stack_name = "buildkite-agent-iam-policy"
    print(f"Creating stack {policy_stack_name} for agent IAM policy...")

    template_path = os.path.join(current_dir, "agent-iam-policy-template.yml")
    with open(template_path, encoding="utf-8") as template_file:
        template_body = template_file.read()

    pending = create_or_update_stack(
        args,
        client=client,
        stack_name=policy_stack_name,
        template_body=template_body,
    )
    wait(pending, client=client)

    cloudformation = boto3.resource("cloudformation", region_name=args.aws_region)
    policy_resource = cloudformation.StackResource(
        policy_stack_name, "BuildkiteAgentManagedPolicy"
    )
    return policy_resource.physical_resource_id
def main(args):
    """Create or update the agent IAM policy stack and every elastic CI stack.

    The IAM policy stack is handled first because its policy ID is a parameter
    of each CI stack. One CI stack is then submitted per key of ``AMI_ID``;
    ``wait`` is only called on them after all have been submitted.
    """
    cf_client = boto3.client("cloudformation", region_name=args.aws_region)
    agent_iam_policy = create_agent_iam_policy(args, client=cf_client)

    pending = []
    for stack_id in AMI_ID:
        stack_id_full = get_full_stack_id(stack_id)
        print(f"Creating elastic CI stack {stack_id_full}...")
        stack_params = format_params(
            args, stack_id=stack_id, agent_iam_policy=agent_iam_policy
        )
        pending.append(
            create_or_update_stack(
                args,
                client=cf_client,
                stack_name=stack_id_full,
                template_url=TEMPLATE_URL,
                params=stack_params,
            )
        )
        print(f"CI stack {stack_id_full} is in progress in the background")

    # Wait on the submitted stacks in submission order.
    for in_flight in pending:
        wait(in_flight, client=cf_client)
if __name__ == "__main__":
    # Script entry point: both command-line options are mandatory strings.
    cli_parser = argparse.ArgumentParser()
    for option in ("--aws-region", "--agent-token"):
        cli_parser.add_argument(option, type=str, required=True)
    args = cli_parser.parse_args()
    main(args)