[CI] Migrate CI pipelines from Jenkins to BuildKite (#8142)

* [CI] Migrate CI pipelines from Jenkins to BuildKite

* Require manual approval

* Less verbose output when pulling Docker

* Remove us-east-2 from metadata.py

* Add documentation

* Add missing underscore

* Add missing punctuation

* More specific instruction

* Better paragraph structure
This commit is contained in:
Philip Hyunsu Cho
2022-09-07 17:29:25 -07:00
committed by GitHub
parent b397d64c96
commit e888eb2fa9
45 changed files with 1639 additions and 46 deletions

View File

@@ -0,0 +1,86 @@
import argparse
import copy
import json
import os
from urllib.request import urlopen
import boto3
import cfn_flip
from metadata import IMAGE_PARAMS
# Directory that holds this script. The bootstrap scripts named in
# IMAGE_PARAMS and the pipeline template are opened relative to it.
current_dir = os.path.dirname(__file__)
# URL of the Buildkite AWS stack template. get_ami_mapping() downloads it and
# reads its "AWSRegion2AMI" mapping to resolve base AMI IDs.
BUILDKITE_CF_TEMPLATE_URL = (
"https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
)
def format_params(*, stack_id, aws_region, ami_mapping):
    """Build the CloudFormation ``Parameters`` list for one worker image stack.

    Starts from a private copy of ``IMAGE_PARAMS[stack_id]`` and swaps two
    placeholders for their real values:

    * ``BootstrapScript`` holds a file name relative to this script's
      directory; it is replaced by the file's text content.
    * ``BaseImageId`` holds a symbolic key; it is replaced by
      ``ami_mapping[aws_region][key]``.

    Returns a list of ``{"ParameterKey": ..., "ParameterValue": ...}`` dicts,
    one per entry, in the entry order of the copied dict.
    """
    settings = copy.deepcopy(IMAGE_PARAMS[stack_id])

    script_path = os.path.join(current_dir, settings["BootstrapScript"])
    with open(script_path, encoding="utf-8") as script_file:
        script_text = script_file.read()

    ami_key = settings["BaseImageId"]
    settings["BaseImageId"] = ami_mapping[aws_region][ami_key]
    settings["BootstrapScript"] = script_text

    formatted = []
    for key, value in settings.items():
        formatted.append({"ParameterKey": key, "ParameterValue": value})
    return formatted
def get_ami_mapping():
    """Download the Buildkite stack template and return its AMI lookup table.

    The template at ``BUILDKITE_CF_TEMPLATE_URL`` is fetched, decoded as
    UTF-8, converted to JSON with ``cfn_flip`` and parsed. The value stored
    under ``Mappings -> AWSRegion2AMI`` is returned.
    """
    with urlopen(BUILDKITE_CF_TEMPLATE_URL) as reply:
        raw_template = reply.read()
    template_text = raw_template.decode("utf-8")
    template_json = cfn_flip.to_json(template_text)
    mappings = json.loads(template_json)["Mappings"]
    return mappings["AWSRegion2AMI"]
def get_full_stack_id(stack_id):
    """Return the CloudFormation stack name used for the given short stack ID."""
    return "buildkite-{}-worker".format(stack_id)
def main(args):
    """Create one image-builder CloudFormation stack per entry in IMAGE_PARAMS.

    All stacks are first submitted with ``create_stack``; only afterwards does
    the function block on each one in turn until CloudFormation reports that
    creation has completed.

    Parameters
    ----------
    args
        Parsed command-line namespace; only ``args.aws_region`` is read.
    """
    template_path = os.path.join(
        current_dir, "ec2-image-builder-pipeline-template.yml"
    )
    with open(template_path, encoding="utf-8") as f:
        ec2_image_pipeline_template = f.read()

    ami_mapping = get_ami_mapping()

    # A single client serves every request. Creating it before the loops also
    # means the wait phase no longer relies on a name bound inside the first
    # loop body.
    client = boto3.client("cloudformation", region_name=args.aws_region)

    for stack_id in IMAGE_PARAMS:
        stack_id_full = get_full_stack_id(stack_id)
        print(f"Creating EC2 image builder stack {stack_id_full}...")

        params = format_params(
            stack_id=stack_id, aws_region=args.aws_region, ami_mapping=ami_mapping
        )

        # The call's return value is not used; progress is tracked by the
        # waiter below.
        client.create_stack(
            StackName=stack_id_full,
            TemplateBody=ec2_image_pipeline_template,
            Capabilities=[
                "CAPABILITY_IAM",
                "CAPABILITY_NAMED_IAM",
                "CAPABILITY_AUTO_EXPAND",
            ],
            OnFailure="ROLLBACK",
            EnableTerminationProtection=False,
            Parameters=params,
        )
        print(
            f"EC2 image builder stack {stack_id_full} is in progress in the background"
        )

    # The waiter is the same for every stack, so fetch it once.
    waiter = client.get_waiter("stack_create_complete")
    for stack_id in IMAGE_PARAMS:
        stack_id_full = get_full_stack_id(stack_id)
        waiter.wait(StackName=stack_id_full)
        print(f"EC2 image builder stack {stack_id_full} is now finished.")
# Script entry point: --aws-region is mandatory and reaches main() on the
# parsed namespace.
if __name__ == "__main__":
    cli = argparse.ArgumentParser()
    cli.add_argument("--aws-region", required=True, type=str)
    args = cli.parse_args()
    main(args)

View File

@@ -0,0 +1,108 @@
---
AWSTemplateFormatVersion: "2010-09-09"
Description: "EC2 Image Builder pipelines to build workers"
# Inputs supplied when the stack is created.
Parameters:
# Referenced by the Recipe resource as its ParentImage.
BaseImageId:
Type: String
Description: Base AMI to build a new image on top of.
# Referenced by BootstrapComponent as its Data, i.e. the whole component
# document is passed in as a string.
BootstrapScript:
Type: String
Description: Content of AMI customization script
# Referenced by the Infrastructure resource as its only InstanceTypes entry.
InstanceType:
Type: String
Description: Instance type for the Image Builder instances.
# Used as the component Platform and, through the IsInstanceWindows
# condition, to choose the root device name in the Recipe.
InstanceOperatingSystem:
Type: String
Description: The operating system to run on the instance
AllowedValues:
- Linux
- Windows
Default: "Linux"
# Referenced by the Recipe's block device mapping as the EBS VolumeSize.
VolumeSize:
Type: Number
Description: Size of EBS volume, in GiBs
Conditions:
# True when InstanceOperatingSystem is "Windows"; the Recipe uses it to pick
# the block device name.
IsInstanceWindows:
!Equals [ !Ref InstanceOperatingSystem, "Windows" ]
Resources:
# IAM role for the image builder instance
# The trust policy lets the EC2 service (ec2.amazonaws.com) assume the role.
# Three AWS-managed policies are attached.
# NOTE(review): AmazonS3ReadOnlyAccess is presumably there so bootstrap
# scripts can download from S3 — confirm it is still required.
InstanceRole:
Type: AWS::IAM::Role
Properties:
AssumeRolePolicyDocument:
Version: "2012-10-17"
Statement:
- Effect: "Allow"
Principal:
Service: "ec2.amazonaws.com"
Action: "sts:AssumeRole"
ManagedPolicyArns:
- arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
- arn:aws:iam::aws:policy/EC2InstanceProfileForImageBuilder
- arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
# Instance profile wrapping InstanceRole; the Infrastructure resource refers
# to it through InstanceProfileName.
InstanceProfile:
Type: AWS::IAM::InstanceProfile
Properties:
Roles:
- !Ref InstanceRole
# Component that runs the bootstrap script
# Name is derived from the stack name; Platform follows the
# InstanceOperatingSystem parameter; Data is the BootstrapScript parameter.
# NOTE(review): Version is hard-coded to "1.0.0" — confirm whether a changed
# BootstrapScript needs a version bump to be picked up.
BootstrapComponent:
Type: AWS::ImageBuilder::Component
Properties:
Name: !Sub "${AWS::StackName}-bootstrap-component"
Platform: !Ref InstanceOperatingSystem
Version: "1.0.0"
Description: Execute a bootstrap script.
Data: !Ref BootstrapScript
# Image recipe: BaseImageId as the parent image plus BootstrapComponent as
# the only component.
# The single block device mapping uses /dev/sda1 when IsInstanceWindows is
# true and /dev/xvda otherwise. It is an unencrypted gp2 volume of VolumeSize
# GiB that is deleted on termination.
Recipe:
Type: AWS::ImageBuilder::ImageRecipe
Properties:
Name: !Sub "${AWS::StackName}-image"
Components:
- ComponentArn: !Ref BootstrapComponent
ParentImage: !Ref BaseImageId
BlockDeviceMappings:
- DeviceName: !If [IsInstanceWindows, "/dev/sda1", "/dev/xvda"]
Ebs:
DeleteOnTermination: true
Encrypted: false
VolumeSize: !Ref VolumeSize
VolumeType: gp2
Version: "1.0.0"
# Build infrastructure: uses InstanceProfile and the single instance type
# given by the InstanceType parameter. TerminateInstanceOnFailure is set to
# true.
Infrastructure:
Type: AWS::ImageBuilder::InfrastructureConfiguration
Properties:
Name: !Sub "${AWS::StackName}-image-pipeline-infrastructure"
InstanceProfileName: !Ref InstanceProfile
InstanceTypes:
- !Ref InstanceType
TerminateInstanceOnFailure: true
# Copy to this region only
# The one distribution entry targets the region the stack is deployed in
# (AWS::Region) and leaves AmiDistributionConfiguration empty.
Distribution:
Type: AWS::ImageBuilder::DistributionConfiguration
Properties:
Name: !Sub "${AWS::StackName}-image-pipeline-distribution-config"
Distributions:
- Region: !Ref AWS::Region
AmiDistributionConfiguration: {}
# Composition of the above elements
# Ties together the Distribution, Recipe and Infrastructure resources.
# No Schedule property is set in this definition.
# NOTE(review): executions therefore appear to be started on demand by a
# separate script — confirm.
Pipeline:
Type: AWS::ImageBuilder::ImagePipeline
Properties:
Name: !Sub "${AWS::StackName}-image-pipeline"
DistributionConfigurationArn: !Ref Distribution
ImageRecipeArn: !Ref Recipe
InfrastructureConfigurationArn: !Ref Infrastructure

View File

@@ -0,0 +1,26 @@
# Image Builder component document for the linux-amd64-gpu worker image.
# It has one "build" phase with one ExecuteBash step whose script, in order:
#   1. installs the "Development tools" group and the kernel-devel package
#      matching the running kernel;
#   2. copies the contents of s3://ec2-linux-nvidia-drivers/latest/ into the
#      working directory and runs the NVIDIA .run installer silently with
#      CC=/usr/bin/gcc10-cc;
#   3. installs Docker through amazon-linux-extras and enables and starts it;
#   4. adds the libnvidia-container yum repo for the current distribution,
#      installs nvidia-docker2 and restarts Docker.
# NOTE(review): the script sets no `set -e`, so a failing command may not
# abort the step — confirm how ExecuteBash treats intermediate failures.
# NOTE(review): `amazon-linux-extras install docker` is called without -y,
# unlike the yum commands — confirm it cannot block on a prompt.
# NOTE(review): only the `tee` call uses sudo; the other commands assume they
# already run with sufficient privileges — confirm.
name: BuildKiteLinuxAMD64GPUBootstrap
description: Set up worker image for linux-amd64-gpu pipeline
schemaVersion: 1.0
phases:
- name: build
steps:
- name: SetupStep
action: ExecuteBash
inputs:
commands:
- |
yum groupinstall -y "Development tools"
yum install -y kernel-devel-$(uname -r)
aws s3 cp --recursive s3://ec2-linux-nvidia-drivers/latest/ .
chmod +x NVIDIA-Linux-x86_64*.run
CC=/usr/bin/gcc10-cc ./NVIDIA-Linux-x86_64*.run --silent
amazon-linux-extras install docker
systemctl --now enable docker
distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
&& curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.repo \
| sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
yum clean expire-cache
yum install -y nvidia-docker2
systemctl restart docker

View File

@@ -0,0 +1,18 @@
# Per-image settings, keyed by short stack ID.
#
# Every value is a string. "BaseImageId" is a symbolic key rather than a real
# AMI ID: the AMI ID is looked up from Buildkite's CloudFormation template.
# "BootstrapScript" is the file name of the component document to apply.
# "VolumeSize" is expressed in GiBs.
IMAGE_PARAMS = {
    "linux-amd64-gpu": {
        "BaseImageId": "linuxamd64",
        "BootstrapScript": "linux-amd64-gpu-bootstrap.yml",
        "InstanceType": "g4dn.xlarge",
        "InstanceOperatingSystem": "Linux",
        "VolumeSize": "40",
    },
    "windows-gpu": {
        "BaseImageId": "windows",
        "BootstrapScript": "windows-gpu-bootstrap.yml",
        "InstanceType": "g4dn.2xlarge",
        "InstanceOperatingSystem": "Windows",
        "VolumeSize": "80",
    },
}

View File

@@ -0,0 +1,22 @@
import argparse
import boto3
from create_worker_image_pipelines import get_full_stack_id
from metadata import IMAGE_PARAMS
def main(args):
    """Start one Image Builder pipeline execution per entry in IMAGE_PARAMS.

    For each short stack ID, the matching CloudFormation stack is looked up,
    the physical ID of its ``Pipeline`` resource is read and passed to
    ``start_image_pipeline_execution`` as the pipeline ARN. The function does
    not wait for the executions to finish.

    Parameters
    ----------
    args
        Parsed command-line namespace; only ``args.aws_region`` is read.
    """
    cf = boto3.resource("cloudformation", region_name=args.aws_region)
    builder_client = boto3.client("imagebuilder", region_name=args.aws_region)
    for stack_id in IMAGE_PARAMS:
        stack_id_full = get_full_stack_id(stack_id)
        # "Pipeline" is the logical resource ID looked up inside the stack.
        pipeline_arn = cf.Stack(stack_id_full).Resource("Pipeline").physical_resource_id
        print(f"Running pipeline {pipeline_arn} to generate a new AMI...")
        # The response was never read, so it is no longer bound to a name.
        builder_client.start_image_pipeline_execution(imagePipelineArn=pipeline_arn)
# Script entry point: --aws-region is mandatory and reaches main() on the
# parsed namespace.
if __name__ == "__main__":
    cli = argparse.ArgumentParser()
    cli.add_argument("--aws-region", required=True, type=str)
    args = cli.parse_args()
    main(args)

View File

@@ -0,0 +1,73 @@
name: BuildKiteWindowsGPUBootstrap
description: Set up worker image for windows-gpu pipeline
schemaVersion: 1.0
phases:
- name: build
steps:
- name: SetupStep
action: ExecutePowerShell
inputs:
commands:
- |
$ErrorActionPreference = "Stop"
choco --version
choco feature enable -n=allowGlobalConfirmation
# CMake 3.18
Write-Host '>>> Installing CMake 3.18...'
choco install cmake --version 3.18.0 --installargs "ADD_CMAKE_TO_PATH=System"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Notepad++
Write-Host '>>> Installing Notepad++...'
choco install notepadplusplus
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Miniconda
Write-Host '>>> Installing Miniconda...'
choco install miniconda3 /RegisterPython:1 /D:C:\tools\miniconda3
C:\tools\miniconda3\Scripts\conda.exe init --user --system
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
. "C:\Windows\System32\WindowsPowerShell\v1.0\profile.ps1"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
conda config --set auto_activate_base false
conda config --prepend channels conda-forge
# Install Java 11
Write-Host '>>> Installing Java 11...'
choco install openjdk11jre
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install GraphViz
Write-Host '>>> Installing GraphViz...'
choco install graphviz
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install Visual Studio Community 2017 (15.9)
Write-Host '>>> Installing Visual Studio 2017 Community (15.9)...'
choco install visualstudio2017community --version 15.9.23.0 `
--params "--wait --passive --norestart"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
choco install visualstudio2017-workload-nativedesktop --params `
"--wait --passive --norestart --includeOptional"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install CUDA 11.0
Write-Host '>>> Installing CUDA 11.0...'
choco install cuda --version 11.0.3
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install Python packages
Write-Host '>>> Installing Python packages...'
conda activate
conda install -y mamba
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install R
Write-Host '>>> Installing R...'
choco install r.project --version=3.6.3
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
choco install rtools --version=3.5.0.4
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }