[CI] Migrate CI pipelines from Jenkins to BuildKite (#8142)
* [CI] Migrate CI pipelines from Jenkins to BuildKite * Require manual approval * Less verbose output when pulling Docker * Remove us-east-2 from metadata.py * Add documentation * Add missing underscore * Add missing punctuation * More specific instruction * Better paragraph structure
This commit is contained in:
committed by
GitHub
parent
b397d64c96
commit
e888eb2fa9
101
tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
Normal file
101
tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import argparse
|
||||
import copy
|
||||
|
||||
import boto3
|
||||
|
||||
from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS
|
||||
|
||||
TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
|
||||
|
||||
|
||||
def get_availability_zones(*, aws_region):
    """Return the sorted names of the availability zones in ``aws_region``.

    Calls EC2 ``describe_availability_zones`` filtered on the region name and
    on the ``availability-zone`` zone type, then collects each ``ZoneName``.
    """
    zone_filters = [
        {"Name": "region-name", "Values": [aws_region]},
        {"Name": "zone-type", "Values": ["availability-zone"]},
    ]
    ec2_client = boto3.client("ec2", region_name=aws_region)
    reply = ec2_client.describe_availability_zones(Filters=zone_filters)
    zone_names = [zone["ZoneName"] for zone in reply["AvailabilityZones"]]
    zone_names.sort()
    return zone_names
|
||||
|
||||
|
||||
def get_default_vpc(*, aws_region):
    """Return the default VPC of ``aws_region``, creating it when none exists.

    The first VPC matching the ``is-default`` filter is returned as-is. If the
    filter yields nothing, a default VPC is created through the EC2 client and
    a resource handle for the new VPC ID is returned.
    """
    ec2 = boto3.resource("ec2", region_name=aws_region)
    candidates = ec2.vpcs.filter(
        Filters=[{"Name": "is-default", "Values": ["true"]}]
    )
    existing = next(iter(candidates), None)
    if existing is not None:
        return existing

    # Create default VPC if not exist
    created = boto3.client("ec2", region_name=aws_region).create_default_vpc()
    return ec2.Vpc(created["Vpc"]["VpcId"])
|
||||
|
||||
|
||||
def format_params(args, *, stack_id):
    """Build the CloudFormation parameter list for the stack ``stack_id``.

    Starts from a copy of ``STACK_PARAMS[stack_id]``, adds the AMI for
    ``args.aws_region``, the queue name, the cost-allocation tag value, the
    agent token, the default VPC ID and two default subnets, and finally
    applies ``COMMON_STACK_PARAMS`` (which therefore win on any key clash).

    Args:
        args: Parsed command-line arguments; ``aws_region`` and
            ``agent_token`` are read.
        stack_id: Key into ``STACK_PARAMS`` and ``AMI_ID``.

    Returns:
        A list of ``{"ParameterKey": ..., "ParameterValue": ...}`` dicts.

    Raises:
        RuntimeError: If the default VPC does not expose exactly two default
            subnets in the first two availability zones.
        KeyError: If ``stack_id`` or ``args.aws_region`` is missing from the
            metadata tables.
    """
    default_vpc = get_default_vpc(aws_region=args.aws_region)
    azs = get_availability_zones(aws_region=args.aws_region)
    # For each of the first two availability zones (AZs), choose the default subnet
    subnets = [
        x.id
        for x in default_vpc.subnets.filter(
            Filters=[
                {"Name": "default-for-az", "Values": ["true"]},
                {"Name": "availability-zone", "Values": azs[:2]},
            ]
        )
    ]
    # Explicit check instead of `assert`: assertions are removed under
    # `python -O`, and a bare AssertionError carries no diagnostic.
    if len(subnets) != 2:
        raise RuntimeError(
            f"Expected exactly 2 default subnets in availability zones {azs[:2]} "
            f"of region {args.aws_region}, found {len(subnets)}: {subnets}"
        )

    params = copy.deepcopy(STACK_PARAMS[stack_id])
    params["ImageId"] = AMI_ID[stack_id][args.aws_region]
    params["BuildkiteQueue"] = stack_id
    params["CostAllocationTagValue"] = f"buildkite-{stack_id}"
    params["BuildkiteAgentToken"] = args.agent_token
    params["VpcId"] = default_vpc.id
    params["Subnets"] = ",".join(subnets)
    params.update(COMMON_STACK_PARAMS)
    return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()]
|
||||
|
||||
|
||||
def get_full_stack_id(stack_id):
    """Return the CloudFormation stack name for the short ID ``stack_id``."""
    return "buildkite-{}-autoscaling-group".format(stack_id)
|
||||
|
||||
|
||||
def main(args):
    """Create one elastic CI stack per entry in ``AMI_ID`` and wait for them all.

    All ``create_stack`` requests are issued first so the stacks are created
    in the background; only afterwards does the function wait for each stack
    to reach the ``stack_create_complete`` state.

    Args:
        args: Parsed command-line arguments; ``aws_region`` is read here and
            the whole object is forwarded to ``format_params``.
    """
    client = boto3.client("cloudformation", region_name=args.aws_region)

    for stack_id in AMI_ID:
        stack_id_full = get_full_stack_id(stack_id)
        print(f"Creating elastic CI stack {stack_id_full}...")

        params = format_params(args, stack_id=stack_id)

        # The return value is not needed; the stack is addressed by name below.
        client.create_stack(
            StackName=stack_id_full,
            TemplateURL=TEMPLATE_URL,
            Capabilities=[
                "CAPABILITY_IAM",
                "CAPABILITY_NAMED_IAM",
                "CAPABILITY_AUTO_EXPAND",
            ],
            OnFailure="ROLLBACK",
            EnableTerminationProtection=False,
            Parameters=params,
        )
        print(f"CI stack {stack_id_full} is in progress in the background")

    # The waiter does not depend on the stack, so build it once.
    waiter = client.get_waiter("stack_create_complete")
    for stack_id in AMI_ID:
        stack_id_full = get_full_stack_id(stack_id)
        waiter.wait(StackName=stack_id_full)
        print(f"CI stack {stack_id_full} is now finished.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Both options are mandatory; argparse exits with a usage error otherwise.
    cli = argparse.ArgumentParser()
    for flag in ("--aws-region", "--agent-token"):
        cli.add_argument(flag, type=str, required=True)
    main(cli.parse_args())
|
||||
101
tests/buildkite/infrastructure/aws-stack-creator/metadata.py
Normal file
101
tests/buildkite/infrastructure/aws-stack-creator/metadata.py
Normal file
@@ -0,0 +1,101 @@
|
||||
# AMI IDs, keyed first by stack ID and then by AWS region.
AMI_ID = {
    # Managed by XGBoost team
    "linux-amd64-gpu": {"us-west-2": "ami-00ed92bd37f77bc33"},
    "linux-amd64-mgpu": {"us-west-2": "ami-00ed92bd37f77bc33"},
    "windows-gpu": {"us-west-2": "ami-0a1a2ea551a07ad5f"},
    # Managed by BuildKite
    "linux-amd64-cpu": {"us-west-2": "ami-075d4c25d5f0c17c1"},
    "pipeline-loader": {"us-west-2": "ami-075d4c25d5f0c17c1"},
    "linux-arm64-cpu": {"us-west-2": "ami-0952c6fb6db9a9891"},
}
|
||||
|
||||
def _stack(operating_system, instance_type, max_size, *, min_size="0", idle_period="60"):
    """Build the parameter dict of one stack.

    Only the values that differ between stacks are arguments; the remaining
    entries are identical for every stack. All values are strings.
    """
    return {
        "InstanceOperatingSystem": operating_system,
        "InstanceType": instance_type,
        "AgentsPerInstance": "1",
        "MinSize": min_size,
        "MaxSize": max_size,
        "OnDemandPercentage": "100",
        "ScaleOutFactor": "1.0",
        "ScaleInIdlePeriod": idle_period,  # in seconds
    }


# Per-stack parameters, keyed by stack ID.
STACK_PARAMS = {
    "linux-amd64-gpu": _stack("linux", "g4dn.xlarge", "8"),
    "linux-amd64-mgpu": _stack("linux", "g4dn.12xlarge", "4"),
    "windows-gpu": _stack("windows", "g4dn.2xlarge", "2", idle_period="600"),
    "linux-amd64-cpu": _stack("linux", "c5a.4xlarge", "16"),
    "pipeline-loader": _stack("linux", "t3a.micro", "1", min_size="1"),
    "linux-arm64-cpu": _stack("linux", "c6g.4xlarge", "8"),
}
|
||||
|
||||
# Parameters shared by every stack. All values are strings.
COMMON_STACK_PARAMS = {
    "BuildkiteAgentTimestampLines": "false",
    "BuildkiteWindowsAdministrator": "true",
    "AssociatePublicIpAddress": "true",
    "ScaleOutForWaitingJobs": "false",
    # Cost allocation tagging
    "EnableCostAllocationTags": "true",
    "CostAllocationTagName": "CreatedBy",
    # Access policies
    "ECRAccessPolicy": "full",
    "ManagedPolicyARN": "arn:aws:iam::aws:policy/AmazonS3FullAccess",
    # Plugins and Docker options
    "EnableSecretsPlugin": "false",
    "EnableECRPlugin": "false",
    "EnableDockerLoginPlugin": "false",
    "EnableDockerUserNamespaceRemap": "false",
    # Comma-separated list of agent experiments
    "BuildkiteAgentExperiments": ",".join(
        ["normalised-upload-paths", "resolve-commit-after-checkout"]
    ),
}
|
||||
2
tests/buildkite/infrastructure/requirements.txt
Normal file
2
tests/buildkite/infrastructure/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
boto3
|
||||
cfn_tools
|
||||
@@ -0,0 +1,44 @@
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import boto3
|
||||
|
||||
current_dir = os.path.dirname(__file__)
|
||||
|
||||
|
||||
def main(args):
    """Create the service-user stack and print a fresh access key pair.

    Reads ``service-user-template.yml`` from this script's directory, creates
    the ``buildkite-elastic-ci-stack-service-user`` CloudFormation stack with
    ``args.user_name`` as its ``UserName`` parameter, and waits for creation
    to complete. It then creates an access key pair for that IAM user and
    prints the key ID and the secret key to stdout.
    """
    template_path = os.path.join(current_dir, "service-user-template.yml")
    with open(template_path, encoding="utf-8") as template_file:
        service_user_template = template_file.read()

    stack_id = "buildkite-elastic-ci-stack-service-user"

    print("Create a new IAM user with suitable permissions...")
    cloudformation = boto3.client("cloudformation", region_name=args.aws_region)
    cloudformation.create_stack(
        StackName=stack_id,
        TemplateBody=service_user_template,
        Capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM"],
        Parameters=[{"ParameterKey": "UserName", "ParameterValue": args.user_name}],
    )
    cloudformation.get_waiter("stack_create_complete").wait(StackName=stack_id)

    iam = boto3.resource("iam", region_name=args.aws_region)
    key_pair = iam.User(args.user_name).create_access_key_pair()
    print("Finished creating an IAM users with suitable permissions.")
    print(f"Access Key ID: {key_pair.access_key_id}")
    print(f"Access Secret Access Key: {key_pair.secret_access_key}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # --aws-region is mandatory; --user-name falls back to the default below.
    cli = argparse.ArgumentParser()
    cli.add_argument("--aws-region", required=True, type=str)
    cli.add_argument(
        "--user-name", default="buildkite-elastic-ci-stack-user", type=str
    )
    main(cli.parse_args())
|
||||
@@ -0,0 +1,349 @@
|
||||
---
|
||||
AWSTemplateFormatVersion: "2010-09-09"
|
||||
Description: "Buildkite Elastic CI Stack CloudFormation service user"
|
||||
|
||||
Parameters:
|
||||
UserName:
|
||||
Type: String
|
||||
Default: buildkite-elastic-ci-stack-user
|
||||
Description: Name of user to create
|
||||
|
||||
Outputs:
|
||||
UserNameOutput:
|
||||
Value: !Ref CloudFormationServiceUser
|
||||
UserArnOutput:
|
||||
Value: !GetAtt CloudFormationServiceUser.Arn
|
||||
|
||||
Resources:
|
||||
CloudFormationServiceUser:
|
||||
Type: AWS::IAM::User
|
||||
Properties:
|
||||
ManagedPolicyArns:
|
||||
- !Ref SubstackCrudPolicy
|
||||
- !Ref CrudPolicy
|
||||
- !Ref ImageBuilderPolicy
|
||||
UserName: !Ref UserName
|
||||
|
||||
SubstackCrudPolicy:
|
||||
Type: AWS::IAM::ManagedPolicy
|
||||
Properties:
|
||||
PolicyDocument:
|
||||
{
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": "cloudformation:*",
|
||||
"Resource": "*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"serverlessrepo:GetApplication",
|
||||
"serverlessrepo:GetCloudFormationTemplate",
|
||||
"serverlessrepo:CreateCloudFormationTemplate"
|
||||
],
|
||||
"Resource": "*"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
CrudPolicy:
|
||||
Type: AWS::IAM::ManagedPolicy
|
||||
Properties:
|
||||
PolicyDocument:
|
||||
{
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"ec2:DescribeAccountAttributes",
|
||||
"ec2:DescribeAvailabilityZones",
|
||||
"ec2:DescribeInstances",
|
||||
"ec2:DescribeInternetGateways",
|
||||
"ec2:DescribeLaunchTemplateVersions",
|
||||
"ec2:DescribeLaunchTemplates",
|
||||
"ec2:DescribeNetworkInterfaces",
|
||||
"ec2:DescribeRouteTables",
|
||||
"ec2:DescribeSecurityGroups",
|
||||
"ec2:DescribeSubnets",
|
||||
"ec2:DescribeVpcs",
|
||||
"ec2:CreateTags"
|
||||
],
|
||||
"Resource": "*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"ec2:CreateInternetGateway",
|
||||
"ec2:AttachInternetGateway",
|
||||
"ec2:DetachInternetGateway",
|
||||
"ec2:DeleteInternetGateway"
|
||||
],
|
||||
"Resource": "arn:aws:ec2:*:*:internet-gateway/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"ec2:CreateLaunchTemplate",
|
||||
"ec2:CreateLaunchTemplateVersion",
|
||||
"ec2:DeleteLaunchTemplate"
|
||||
],
|
||||
"Resource": "arn:aws:ec2:*:*:launch-template/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"ec2:AssociateRouteTable",
|
||||
"ec2:DisassociateRouteTable",
|
||||
"ec2:CreateRoute",
|
||||
"ec2:CreateRouteTable",
|
||||
"ec2:DeleteRoute",
|
||||
"ec2:DeleteRouteTable"
|
||||
],
|
||||
"Resource": "arn:aws:ec2:*:*:route-table/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"ec2:AuthorizeSecurityGroupIngress",
|
||||
"ec2:RevokeSecurityGroupIngress",
|
||||
"ec2:CreateSecurityGroup",
|
||||
"ec2:DeleteSecurityGroup"
|
||||
],
|
||||
"Resource": "arn:aws:ec2:*:*:security-group/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": "ec2:RunInstances",
|
||||
"Resource": "*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"ec2:CreateSubnet",
|
||||
"ec2:DeleteSubnet",
|
||||
"ec2:AssociateRouteTable",
|
||||
"ec2:DisassociateRouteTable"
|
||||
],
|
||||
"Resource": "arn:aws:ec2:*:*:subnet/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"ec2:CreateVpc",
|
||||
"ec2:CreateSecurityGroup",
|
||||
"ec2:ModifyVpcAttribute",
|
||||
"ec2:AttachInternetGateway",
|
||||
"ec2:DetachInternetGateway",
|
||||
"ec2:CreateSubnet",
|
||||
"ec2:CreateRouteTable",
|
||||
"ec2:DeleteVpc"
|
||||
],
|
||||
"Resource": "arn:aws:ec2:*:*:vpc/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"ec2:CreateDefaultVpc",
|
||||
"ec2:CreateDefaultSubnet"
|
||||
],
|
||||
"Resource": "*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"iam:CreateInstanceProfile",
|
||||
"iam:GetInstanceProfile",
|
||||
"iam:AddRoleToInstanceProfile",
|
||||
"iam:RemoveRoleFromInstanceProfile",
|
||||
"iam:DeleteInstanceProfile"
|
||||
],
|
||||
"Resource": "arn:aws:iam::*:instance-profile/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"kms:DescribeKey",
|
||||
"kms:CreateGrant",
|
||||
"kms:Decrypt",
|
||||
"kms:Encrypt"
|
||||
],
|
||||
"Resource": "arn:aws:kms:*:*:key/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"lambda:CreateFunction",
|
||||
"lambda:GetFunction",
|
||||
"lambda:GetFunctionCodeSigningConfig",
|
||||
"lambda:AddPermission",
|
||||
"lambda:RemovePermission",
|
||||
"lambda:DeleteFunction",
|
||||
"lambda:InvokeFunction",
|
||||
"lambda:TagResource"
|
||||
],
|
||||
"Resource": "arn:aws:lambda:*:*:function:*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"logs:CreateLogGroup",
|
||||
"logs:PutRetentionPolicy",
|
||||
"logs:DeleteLogGroup"
|
||||
],
|
||||
"Resource": "arn:aws:logs:*:*:log-group:*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:GetObject",
|
||||
"s3:CreateBucket",
|
||||
"s3:PutBucketAcl",
|
||||
"s3:PutBucketLogging",
|
||||
"s3:PutBucketTagging",
|
||||
"s3:PutBucketVersioning"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"ssm:GetParameter",
|
||||
"ssm:PutParameter",
|
||||
"ssm:DeleteParameter"
|
||||
],
|
||||
"Resource": "arn:aws:ssm:*:*:parameter/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"iam:ListPolicies",
|
||||
"iam:ListInstanceProfiles",
|
||||
"iam:ListRoles",
|
||||
"iam:ListPolicyVersions",
|
||||
"iam:ListRolePolicies",
|
||||
"iam:ListAttachedRolePolicies",
|
||||
"iam:ListInstanceProfileTags",
|
||||
"iam:ListRoleTags",
|
||||
"iam:ListInstanceProfilesForRole",
|
||||
"iam:GetPolicyVersion",
|
||||
"iam:GetPolicy",
|
||||
"iam:GetInstanceProfile",
|
||||
"iam:GetRole",
|
||||
"iam:GetRolePolicy",
|
||||
"iam:TagPolicy",
|
||||
"iam:UntagPolicy",
|
||||
"iam:TagInstanceProfile",
|
||||
"iam:UntagInstanceProfile",
|
||||
"iam:TagRole",
|
||||
"iam:UntagRole",
|
||||
"iam:CreateRole",
|
||||
"iam:PassRole",
|
||||
"iam:DeleteRole",
|
||||
"iam:UpdateRoleDescription",
|
||||
"iam:UpdateRole",
|
||||
"iam:AddRoleToInstanceProfile",
|
||||
"iam:RemoveRoleFromInstanceProfile",
|
||||
"iam:CreateInstanceProfile",
|
||||
"iam:DeleteInstanceProfile",
|
||||
"iam:DetachRolePolicy",
|
||||
"iam:SetDefaultPolicyVersion",
|
||||
"iam:AttachRolePolicy",
|
||||
"iam:UpdateAssumeRolePolicy",
|
||||
"iam:PutRolePermissionsBoundary",
|
||||
"iam:DeleteRolePermissionsBoundary",
|
||||
"iam:CreatePolicy",
|
||||
"iam:DeletePolicyVersion",
|
||||
"iam:DeletePolicy",
|
||||
"iam:PutRolePolicy",
|
||||
"iam:DeleteRolePolicy"
|
||||
],
|
||||
"Resource": "*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"autoscaling:DescribeLifecycleHookTypes",
|
||||
"autoscaling:DescribeTerminationPolicyTypes",
|
||||
"autoscaling:DescribePolicies",
|
||||
"autoscaling:DescribeWarmPool",
|
||||
"autoscaling:DescribeScalingActivities",
|
||||
"autoscaling:DescribeScalingProcessTypes",
|
||||
"autoscaling:DescribeScheduledActions",
|
||||
"autoscaling:DescribeAutoScalingGroups",
|
||||
"autoscaling:DescribeAutoScalingInstances",
|
||||
"autoscaling:DescribeLifecycleHooks",
|
||||
"autoscaling:SetDesiredCapacity",
|
||||
"autoscaling:PutLifecycleHook",
|
||||
"autoscaling:DeleteLifecycleHook",
|
||||
"autoscaling:SetInstanceProtection",
|
||||
"autoscaling:CreateAutoScalingGroup",
|
||||
"autoscaling:EnableMetricsCollection",
|
||||
"autoscaling:UpdateAutoScalingGroup",
|
||||
"autoscaling:DeleteAutoScalingGroup",
|
||||
"autoscaling:PutScalingPolicy",
|
||||
"autoscaling:DeletePolicy",
|
||||
"autoscaling:BatchPutScheduledUpdateGroupAction",
|
||||
"autoscaling:PutScheduledUpdateGroupAction",
|
||||
"autoscaling:DeleteScheduledAction",
|
||||
"autoscaling:PutWarmPool",
|
||||
"autoscaling:DeleteWarmPool",
|
||||
"autoscaling:TerminateInstanceInAutoScalingGroup",
|
||||
"autoscaling:AttachInstances"
|
||||
],
|
||||
"Resource": "*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"events:DescribeRule",
|
||||
"events:PutRule",
|
||||
"events:PutTargets",
|
||||
"events:RemoveTargets",
|
||||
"events:DeleteRule"
|
||||
],
|
||||
"Resource": "arn:aws:events:*:*:rule/*"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
ImageBuilderPolicy:
|
||||
Type: AWS::IAM::ManagedPolicy
|
||||
Properties:
|
||||
PolicyDocument:
|
||||
{
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"imagebuilder:CreateComponent",
|
||||
"imagebuilder:GetComponent",
|
||||
"imagebuilder:DeleteComponent",
|
||||
"imagebuilder:CreateImageRecipe",
|
||||
"imagebuilder:GetImageRecipe",
|
||||
"imagebuilder:DeleteImageRecipe",
|
||||
"imagebuilder:CreateImagePipeline",
|
||||
"imagebuilder:GetImagePipeline",
|
||||
"imagebuilder:DeleteImagePipeline",
|
||||
"imagebuilder:CreateInfrastructureConfiguration",
|
||||
"imagebuilder:GetInfrastructureConfiguration",
|
||||
"imagebuilder:DeleteInfrastructureConfiguration",
|
||||
"imagebuilder:CreateDistributionConfiguration",
|
||||
"imagebuilder:GetDistributionConfiguration",
|
||||
"imagebuilder:DeleteDistributionConfiguration",
|
||||
"imagebuilder:TagResource",
|
||||
"imagebuilder:StartImagePipelineExecution",
|
||||
"ec2:DescribeImages",
|
||||
"ec2:DescribeSnapshots",
|
||||
"ec2:DescribeRegions",
|
||||
"ec2:DescribeVolumes",
|
||||
"ec2:DescribeKeyPairs",
|
||||
"ec2:DescribeInstanceTypeOfferings"
|
||||
],
|
||||
"Resource": "*"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,86 @@
|
||||
import argparse
|
||||
import copy
|
||||
import json
|
||||
import os
|
||||
from urllib.request import urlopen
|
||||
|
||||
import boto3
|
||||
import cfn_flip
|
||||
from metadata import IMAGE_PARAMS
|
||||
|
||||
current_dir = os.path.dirname(__file__)
|
||||
|
||||
BUILDKITE_CF_TEMPLATE_URL = (
|
||||
"https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
|
||||
)
|
||||
|
||||
|
||||
def format_params(*, stack_id, aws_region, ami_mapping):
    """Build the CloudFormation parameter list for one image-builder stack.

    Works on a copy of ``IMAGE_PARAMS[stack_id]``. ``BootstrapScript`` starts
    as a file name relative to this script's directory and is replaced by
    that file's content. ``BaseImageId`` starts as a key and is replaced by
    ``ami_mapping[aws_region][key]``.

    Returns:
        A list of ``{"ParameterKey": ..., "ParameterValue": ...}`` dicts.
    """
    params = copy.deepcopy(IMAGE_PARAMS[stack_id])

    script_path = os.path.join(current_dir, params["BootstrapScript"])
    with open(script_path, encoding="utf-8") as script_file:
        script_body = script_file.read()

    params["BaseImageId"] = ami_mapping[aws_region][params["BaseImageId"]]
    params["BootstrapScript"] = script_body
    return [
        {"ParameterKey": name, "ParameterValue": value}
        for name, value in params.items()
    ]
|
||||
|
||||
|
||||
def get_ami_mapping():
    """Fetch Buildkite's CloudFormation template and return its AMI mapping.

    Downloads the template at ``BUILDKITE_CF_TEMPLATE_URL``, converts it to
    JSON with ``cfn_flip`` and returns the ``AWSRegion2AMI`` entry of its
    ``Mappings`` section.
    """
    with urlopen(BUILDKITE_CF_TEMPLATE_URL) as http_reply:
        raw_template = http_reply.read()
    template_json = cfn_flip.to_json(raw_template.decode("utf-8"))
    return json.loads(template_json)["Mappings"]["AWSRegion2AMI"]
|
||||
|
||||
|
||||
def get_full_stack_id(stack_id):
    """Return the CloudFormation stack name for the short ID ``stack_id``."""
    return "buildkite-{}-worker".format(stack_id)
|
||||
|
||||
|
||||
def main(args):
    """Create one EC2 Image Builder stack per entry in ``IMAGE_PARAMS``.

    Reads ``ec2-image-builder-pipeline-template.yml`` from this script's
    directory and looks up the AMI mapping once. It issues every
    ``create_stack`` request first, then waits for each stack to reach the
    ``stack_create_complete`` state.

    Args:
        args: Parsed command-line arguments; ``aws_region`` is read.
    """
    with open(
        os.path.join(current_dir, "ec2-image-builder-pipeline-template.yml"),
        encoding="utf-8",
    ) as f:
        ec2_image_pipeline_template = f.read()

    ami_mapping = get_ami_mapping()

    # Build the client once, in function scope, so both loops share it
    # explicitly instead of the second loop relying on a loop-local binding.
    client = boto3.client("cloudformation", region_name=args.aws_region)

    for stack_id in IMAGE_PARAMS:
        stack_id_full = get_full_stack_id(stack_id)
        print(f"Creating EC2 image builder stack {stack_id_full}...")

        params = format_params(
            stack_id=stack_id, aws_region=args.aws_region, ami_mapping=ami_mapping
        )

        # The return value is not needed; the stack is addressed by name below.
        client.create_stack(
            StackName=stack_id_full,
            TemplateBody=ec2_image_pipeline_template,
            Capabilities=[
                "CAPABILITY_IAM",
                "CAPABILITY_NAMED_IAM",
                "CAPABILITY_AUTO_EXPAND",
            ],
            OnFailure="ROLLBACK",
            EnableTerminationProtection=False,
            Parameters=params,
        )
        print(
            f"EC2 image builder stack {stack_id_full} is in progress in the background"
        )

    # The waiter does not depend on the stack, so build it once.
    waiter = client.get_waiter("stack_create_complete")
    for stack_id in IMAGE_PARAMS:
        stack_id_full = get_full_stack_id(stack_id)
        waiter.wait(StackName=stack_id_full)
        print(f"EC2 image builder stack {stack_id_full} is now finished.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # --aws-region is mandatory; argparse exits with a usage error otherwise.
    cli = argparse.ArgumentParser()
    cli.add_argument("--aws-region", required=True, type=str)
    main(cli.parse_args())
|
||||
@@ -0,0 +1,108 @@
|
||||
---
|
||||
AWSTemplateFormatVersion: "2010-09-09"
|
||||
Description: "EC2 Image Builder pipelines to build workers"
|
||||
|
||||
Parameters:
|
||||
BaseImageId:
|
||||
Type: String
|
||||
Description: Base AMI to build a new image on top of.
|
||||
|
||||
BootstrapScript:
|
||||
Type: String
|
||||
Description: Content of AMI customization script
|
||||
|
||||
InstanceType:
|
||||
Type: String
|
||||
Description: Instance type for the Image Builder instances.
|
||||
|
||||
InstanceOperatingSystem:
|
||||
Type: String
|
||||
Description: The operating system to run on the instance
|
||||
AllowedValues:
|
||||
- Linux
|
||||
- Windows
|
||||
Default: "Linux"
|
||||
|
||||
VolumeSize:
|
||||
Type: Number
|
||||
Description: Size of EBS volume, in GiBs
|
||||
|
||||
Conditions:
|
||||
IsInstanceWindows:
|
||||
!Equals [ !Ref InstanceOperatingSystem, "Windows" ]
|
||||
|
||||
Resources:
|
||||
# IAM role for the image builder instance
|
||||
InstanceRole:
|
||||
Type: AWS::IAM::Role
|
||||
Properties:
|
||||
AssumeRolePolicyDocument:
|
||||
Version: "2012-10-17"
|
||||
Statement:
|
||||
- Effect: "Allow"
|
||||
Principal:
|
||||
Service: "ec2.amazonaws.com"
|
||||
Action: "sts:AssumeRole"
|
||||
ManagedPolicyArns:
|
||||
- arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
|
||||
- arn:aws:iam::aws:policy/EC2InstanceProfileForImageBuilder
|
||||
- arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
|
||||
|
||||
InstanceProfile:
|
||||
Type: AWS::IAM::InstanceProfile
|
||||
Properties:
|
||||
Roles:
|
||||
- !Ref InstanceRole
|
||||
|
||||
# Component that runs the bootstrap script
|
||||
BootstrapComponent:
|
||||
Type: AWS::ImageBuilder::Component
|
||||
Properties:
|
||||
Name: !Sub "${AWS::StackName}-bootstrap-component"
|
||||
Platform: !Ref InstanceOperatingSystem
|
||||
Version: "1.0.0"
|
||||
Description: Execute a bootstrap script.
|
||||
Data: !Ref BootstrapScript
|
||||
|
||||
Recipe:
|
||||
Type: AWS::ImageBuilder::ImageRecipe
|
||||
Properties:
|
||||
Name: !Sub "${AWS::StackName}-image"
|
||||
Components:
|
||||
- ComponentArn: !Ref BootstrapComponent
|
||||
ParentImage: !Ref BaseImageId
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: !If [IsInstanceWindows, "/dev/sda1", "/dev/xvda"]
|
||||
Ebs:
|
||||
DeleteOnTermination: true
|
||||
Encrypted: false
|
||||
VolumeSize: !Ref VolumeSize
|
||||
VolumeType: gp2
|
||||
Version: "1.0.0"
|
||||
|
||||
Infrastructure:
|
||||
Type: AWS::ImageBuilder::InfrastructureConfiguration
|
||||
Properties:
|
||||
Name: !Sub "${AWS::StackName}-image-pipeline-infrastructure"
|
||||
InstanceProfileName: !Ref InstanceProfile
|
||||
InstanceTypes:
|
||||
- !Ref InstanceType
|
||||
TerminateInstanceOnFailure: true
|
||||
|
||||
# Copy to this region only
|
||||
Distribution:
|
||||
Type: AWS::ImageBuilder::DistributionConfiguration
|
||||
Properties:
|
||||
Name: !Sub "${AWS::StackName}-image-pipeline-distribution-config"
|
||||
Distributions:
|
||||
- Region: !Ref AWS::Region
|
||||
AmiDistributionConfiguration: {}
|
||||
|
||||
# Composition of the above elements
|
||||
Pipeline:
|
||||
Type: AWS::ImageBuilder::ImagePipeline
|
||||
Properties:
|
||||
Name: !Sub "${AWS::StackName}-image-pipeline"
|
||||
DistributionConfigurationArn: !Ref Distribution
|
||||
ImageRecipeArn: !Ref Recipe
|
||||
InfrastructureConfigurationArn: !Ref Infrastructure
|
||||
@@ -0,0 +1,26 @@
|
||||
name: BuildKiteLinuxAMD64GPUBootstrap
|
||||
description: Set up worker image for linux-amd64-gpu pipeline
|
||||
schemaVersion: 1.0
|
||||
|
||||
phases:
|
||||
- name: build
|
||||
steps:
|
||||
- name: SetupStep
|
||||
action: ExecuteBash
|
||||
inputs:
|
||||
commands:
|
||||
- |
|
||||
yum groupinstall -y "Development tools"
|
||||
yum install -y kernel-devel-$(uname -r)
|
||||
aws s3 cp --recursive s3://ec2-linux-nvidia-drivers/latest/ .
|
||||
chmod +x NVIDIA-Linux-x86_64*.run
|
||||
CC=/usr/bin/gcc10-cc ./NVIDIA-Linux-x86_64*.run --silent
|
||||
|
||||
amazon-linux-extras install docker
|
||||
systemctl --now enable docker
|
||||
distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
|
||||
&& curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.repo \
|
||||
| sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
|
||||
yum clean expire-cache
|
||||
yum install -y nvidia-docker2
|
||||
systemctl restart docker
|
||||
@@ -0,0 +1,18 @@
|
||||
# Image-builder parameters, keyed by stack ID.
# "BaseImageId" holds a lookup key, not an AMI ID: the AMI ID is looked up
# from Buildkite's CloudFormation template.
# "BootstrapScript" holds the file name of the bootstrap document.
IMAGE_PARAMS = {
    "linux-amd64-gpu": dict(
        BaseImageId="linuxamd64",
        BootstrapScript="linux-amd64-gpu-bootstrap.yml",
        InstanceType="g4dn.xlarge",
        InstanceOperatingSystem="Linux",
        VolumeSize="40",  # in GiBs
    ),
    "windows-gpu": dict(
        BaseImageId="windows",
        BootstrapScript="windows-gpu-bootstrap.yml",
        InstanceType="g4dn.2xlarge",
        InstanceOperatingSystem="Windows",
        VolumeSize="80",  # in GiBs
    ),
}
|
||||
@@ -0,0 +1,22 @@
|
||||
import argparse
|
||||
|
||||
import boto3
|
||||
from create_worker_image_pipelines import get_full_stack_id
|
||||
from metadata import IMAGE_PARAMS
|
||||
|
||||
|
||||
def main(args):
    """Start the image pipeline of every stack listed in ``IMAGE_PARAMS``.

    For each stack, the physical resource ID of its ``Pipeline`` resource is
    read from CloudFormation and passed to Image Builder as the pipeline ARN
    to execute.
    """
    cloudformation = boto3.resource("cloudformation", region_name=args.aws_region)
    image_builder = boto3.client("imagebuilder", region_name=args.aws_region)
    for stack_name in map(get_full_stack_id, IMAGE_PARAMS):
        pipeline = cloudformation.Stack(stack_name).Resource("Pipeline")
        pipeline_arn = pipeline.physical_resource_id
        print(f"Running pipeline {pipeline_arn} to generate a new AMI...")
        image_builder.start_image_pipeline_execution(imagePipelineArn=pipeline_arn)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # --aws-region is mandatory; argparse exits with a usage error otherwise.
    cli = argparse.ArgumentParser()
    cli.add_argument("--aws-region", required=True, type=str)
    main(cli.parse_args())
|
||||
@@ -0,0 +1,73 @@
|
||||
name: BuildKiteWindowsGPUBootstrap
|
||||
description: Set up worker image for windows-gpu pipeline
|
||||
schemaVersion: 1.0
|
||||
|
||||
phases:
|
||||
- name: build
|
||||
steps:
|
||||
- name: SetupStep
|
||||
action: ExecutePowerShell
|
||||
inputs:
|
||||
commands:
|
||||
- |
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
choco --version
|
||||
choco feature enable -n=allowGlobalConfirmation
|
||||
|
||||
# CMake 3.18
|
||||
Write-Host '>>> Installing CMake 3.18...'
|
||||
choco install cmake --version 3.18.0 --installargs "ADD_CMAKE_TO_PATH=System"
|
||||
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||
|
||||
# Notepad++
|
||||
Write-Host '>>> Installing Notepad++...'
|
||||
choco install notepadplusplus
|
||||
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||
|
||||
# Miniconda
|
||||
Write-Host '>>> Installing Miniconda...'
|
||||
choco install miniconda3 /RegisterPython:1 /D:C:\tools\miniconda3
|
||||
C:\tools\miniconda3\Scripts\conda.exe init --user --system
|
||||
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||
. "C:\Windows\System32\WindowsPowerShell\v1.0\profile.ps1"
|
||||
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||
conda config --set auto_activate_base false
|
||||
conda config --prepend channels conda-forge
|
||||
|
||||
# Install Java 11
|
||||
Write-Host '>>> Installing Java 11...'
|
||||
choco install openjdk11jre
|
||||
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||
|
||||
# Install GraphViz
|
||||
Write-Host '>>> Installing GraphViz...'
|
||||
choco install graphviz
|
||||
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||
|
||||
# Install Visual Studio Community 2017 (15.9)
|
||||
Write-Host '>>> Installing Visual Studio 2017 Community (15.9)...'
|
||||
choco install visualstudio2017community --version 15.9.23.0 `
|
||||
--params "--wait --passive --norestart"
|
||||
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||
choco install visualstudio2017-workload-nativedesktop --params `
|
||||
"--wait --passive --norestart --includeOptional"
|
||||
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||
|
||||
# Install CUDA 11.0
|
||||
Write-Host '>>> Installing CUDA 11.0...'
|
||||
choco install cuda --version 11.0.3
|
||||
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||
|
||||
# Install Python packages
|
||||
Write-Host '>>> Installing Python packages...'
|
||||
conda activate
|
||||
conda install -y mamba
|
||||
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||
|
||||
# Install R
|
||||
Write-Host '>>> Installing R...'
|
||||
choco install r.project --version=3.6.3
|
||||
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||
choco install rtools --version=3.5.0.4
|
||||
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||
Reference in New Issue
Block a user