[CI] Migrate CI pipelines from Jenkins to BuildKite (#8142)

* [CI] Migrate CI pipelines from Jenkins to BuildKite

* Require manual approval

* Less verbose output when pulling Docker

* Remove us-east-2 from metadata.py

* Add documentation

* Add missing underscore

* Add missing punctuation

* More specific instruction

* Better paragraph structure
Philip Hyunsu Cho
2022-09-07 17:29:25 -07:00
committed by GitHub
parent b397d64c96
commit e888eb2fa9
45 changed files with 1639 additions and 46 deletions


@@ -0,0 +1,43 @@
#!/bin/bash
set -euo pipefail
WHEEL_TAG=manylinux2014_aarch64
echo "--- Build CPU code targeting ARM64"
source tests/buildkite/conftest.sh
command_wrapper="tests/ci_build/ci_build.sh aarch64 docker"
echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=aarch64_test \
-DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON
echo "--- Run Google Test"
$command_wrapper bash -c "cd build && ctest --extra-verbose"
echo "--- Build binary wheel"
$command_wrapper bash -c \
"cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
$command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \
${BUILDKITE_COMMIT} ${WHEEL_TAG}
echo "--- Audit binary wheel to ensure it's compliant with manylinux2014 standard"
$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl
$command_wrapper python tests/ci_build/rename_whl.py wheelhouse/*.whl \
${BUILDKITE_COMMIT} ${WHEEL_TAG}
mv -v wheelhouse/*.whl python-package/dist/
# Make sure that libgomp.so is vendored in the wheel
$command_wrapper bash -c \
"unzip -l python-package/dist/*.whl | grep libgomp || exit -1"
echo "--- Upload Python wheel"
buildkite-agent artifact upload "python-package/dist/*.whl"
if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
then
aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \
--acl public-read --no-progress
fi
echo "--- Stash XGBoost CLI executable"
buildkite-agent artifact upload ./xgboost

tests/buildkite/build-cpu.sh Executable file

@@ -0,0 +1,33 @@
#!/bin/bash
set -euo pipefail
echo "--- Build CPU code"
source tests/buildkite/conftest.sh
command_wrapper="tests/ci_build/ci_build.sh cpu docker"
$command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h
# This step is not strictly necessary, but we include it to ensure that the
# DMLC_CORE_USE_CMAKE flag is correctly propagated. We want to make sure that we use
# the configured header build/dmlc/build_config.h instead of
# include/dmlc/build_config_default.h.
echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/build_via_cmake.sh -DPLUGIN_DENSE_PARSER=ON
echo "--- Run Google Test"
$command_wrapper bash -c "cd build && ctest --extra-verbose"
echo "--- Stash XGBoost CLI executable"
buildkite-agent artifact upload ./xgboost
# Sanitizer test
echo "--- Run Google Test with sanitizer enabled"
$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_SANITIZER=ON \
-DENABLED_SANITIZERS="address;leak;undefined" -DCMAKE_BUILD_TYPE=Debug \
-DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/
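# Extra docker run flags for the sanitizer run: point ASan at llvm-symbolizer, write
# UBSan stack traces to ubsan_error.log, and add SYS_PTRACE (LeakSanitizer needs ptrace
# inside the container).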
CI_DOCKER_EXTRA_PARAMS_INIT="-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer "`
`"-e ASAN_OPTIONS=symbolize=1 "`
`"-e UBSAN_OPTIONS=print_stacktrace=1:log_path=ubsan_error.log "`
`"--cap-add SYS_PTRACE" \
$command_wrapper bash -c "cd build && ctest --exclude-regex AllTestsInDMLCUnitTests "`
`"--extra-verbose"


@@ -0,0 +1,26 @@
#!/bin/bash
set -euo pipefail
CUDA_VERSION=11.0.3
source tests/buildkite/conftest.sh
echo "--- Build with CUDA ${CUDA_VERSION}, RMM enabled"
if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
then
arch_flag="-DGPU_COMPUTE_VER=75"
else
arch_flag=""
fi
command_wrapper="tests/ci_build/ci_build.sh rmm docker --build-arg "`
`"CUDA_VERSION_ARG=$CUDA_VERSION"
echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON \
-DUSE_NCCL=ON -DPLUGIN_RMM=ON -DBUILD_WITH_CUDA_CUB=ON ${arch_flag}
echo "-- Stash C++ test executable (testxgboost)"
buildkite-agent artifact upload build/testxgboost

tests/buildkite/build-cuda.sh Executable file

@@ -0,0 +1,52 @@
#!/bin/bash
set -euo pipefail
CUDA_VERSION=11.0.3
WHEEL_TAG=manylinux2014_x86_64
echo "--- Build with CUDA ${CUDA_VERSION}"
source tests/buildkite/conftest.sh
if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
then
arch_flag="-DGPU_COMPUTE_VER=75"
else
arch_flag=""
fi
command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "`
`"CUDA_VERSION_ARG=$CUDA_VERSION"
echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/prune_libnccl.sh
$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON \
-DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DUSE_NCCL_LIB_PATH=ON \
-DNCCL_INCLUDE_DIR=/usr/include -DNCCL_LIBRARY=/workspace/libnccl_static.a \
${arch_flag}
echo "--- Build binary wheel"
$command_wrapper bash -c \
"cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
$command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \
${BUILDKITE_COMMIT} ${WHEEL_TAG}
echo "--- Audit binary wheel to ensure it's compliant with manylinux2014 standard"
tests/ci_build/ci_build.sh auditwheel_x86_64 docker auditwheel repair \
--plat ${WHEEL_TAG} python-package/dist/*.whl
$command_wrapper python tests/ci_build/rename_whl.py wheelhouse/*.whl \
${BUILDKITE_COMMIT} ${WHEEL_TAG}
mv -v wheelhouse/*.whl python-package/dist/
# Make sure that libgomp.so is vendored in the wheel
tests/ci_build/ci_build.sh auditwheel_x86_64 docker bash -c \
"unzip -l python-package/dist/*.whl | grep libgomp || exit -1"
echo "--- Upload Python wheel"
buildkite-agent artifact upload python-package/dist/*.whl
if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
then
aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \
--acl public-read --no-progress
fi
echo "-- Stash C++ test executable (testxgboost)"
buildkite-agent artifact upload build/testxgboost


@@ -0,0 +1,22 @@
#!/bin/bash
set -euo pipefail
CUDA_VERSION=11.0.3
source tests/buildkite/conftest.sh
echo "--- Build XGBoost R package with CUDA"
if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
then
tests/ci_build/ci_build.sh gpu_build_r_centos7 docker \
--build-arg CUDA_VERSION_ARG=${CUDA_VERSION} tests/ci_build/build_r_pkg_with_cuda.sh \
${BUILDKITE_COMMIT}
echo "--- Upload R tarball"
aws s3 cp xgboost_r_gpu_linux_*.tar.gz s3://xgboost-nightly-builds/${BRANCH_NAME}/ \
--acl public-read --no-progress
fi


@@ -0,0 +1,14 @@
#!/bin/bash
set -euo pipefail
source tests/buildkite/conftest.sh
echo "--- Build JVM packages doc"
tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_doc.sh ${BRANCH_NAME}
if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
then
echo "--- Upload JVM packages doc"
aws s3 cp jvm-packages/${BRANCH_NAME}.tar.bz2 \
s3://xgboost-docs/${BRANCH_NAME}.tar.bz2 --acl public-read --no-progress
fi


@@ -0,0 +1,21 @@
#!/bin/bash
set -euo pipefail
SPARK_VERSION=3.0.1
CUDA_VERSION=11.0.3
source tests/buildkite/conftest.sh
echo "--- Build XGBoost JVM packages with CUDA"
if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
then
arch_flag="-DGPU_COMPUTE_VER=75"
else
arch_flag=""
fi
tests/ci_build/ci_build.sh jvm_gpu_build nvidia-docker \
--build-arg CUDA_VERSION_ARG=${CUDA_VERSION} tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION} -Duse.cuda=ON ${arch_flag}


@@ -0,0 +1,16 @@
#!/bin/bash
set -euo pipefail
SPARK_VERSION=3.0.1
source tests/buildkite/conftest.sh
echo "--- Build XGBoost JVM packages"
tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION}
echo "--- Stash XGBoost4J JARs"
buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar"


@@ -0,0 +1,21 @@
$ErrorActionPreference = "Stop"
. tests/buildkite/conftest.ps1
Write-Host "--- Build XGBoost R package with CUDA"
nvcc --version
if ( $is_release_branch -eq 1 ) {
$arch_flag = "-DGPU_COMPUTE_VER=75"
bash tests/ci_build/build_r_pkg_with_cuda_win64.sh $Env:BUILDKITE_COMMIT
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
Write-Host "--- Upload R tarball"
Get-ChildItem . -Filter xgboost_r_gpu_win64_*.tar.gz |
Foreach-Object {
& aws s3 cp $_ s3://xgboost-nightly-builds/$Env:BUILDKITE_BRANCH/ `
--acl public-read --no-progress
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
}
}


@@ -0,0 +1,58 @@
$ErrorActionPreference = "Stop"
. tests/buildkite/conftest.ps1
Write-Host "--- Build libxgboost on Windows with CUDA"
nvcc --version
if ( $is_release_branch -eq 0 ) {
$arch_flag = "-DGPU_COMPUTE_VER=75"
} else {
$arch_flag = ""
}
mkdir build
cd build
cmake .. -G"Visual Studio 15 2017 Win64" -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON `
-DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_UNITY_BUILD=ON ${arch_flag}
$msbuild = -join @(
"C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\Community\\MSBuild\\15.0"
"\\Bin\\MSBuild.exe"
)
& $msbuild xgboost.sln /m /p:Configuration=Release /nodeReuse:false
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
Write-Host "--- Build binary wheel"
cd ../python-package
conda activate
& python setup.py bdist_wheel --universal
Get-ChildItem . -Filter dist/*.whl |
Foreach-Object {
& python ../tests/ci_build/rename_whl.py $_.FullName $Env:BUILDKITE_COMMIT win_amd64
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
}
Write-Host "--- Insert vcomp140.dll (OpenMP runtime) into the wheel"
cd dist
Copy-Item -Path ../../tests/ci_build/insert_vcomp140.py -Destination .
& python insert_vcomp140.py *.whl
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
Write-Host "--- Upload Python wheel"
cd ../..
Get-ChildItem . -Filter python-package/dist/*.whl |
Foreach-Object {
& buildkite-agent artifact upload python-package/dist/$_
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
}
if ( $is_release_branch -eq 1 ) {
Get-ChildItem . -Filter python-package/dist/*.whl |
Foreach-Object {
& aws s3 cp python-package/dist/$_ s3://xgboost-nightly-builds/$Env:BUILDKITE_BRANCH/ `
--acl public-read --no-progress
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
}
}
Write-Host "--- Stash C++ test executables"
& buildkite-agent artifact upload build/testxgboost.exe
& buildkite-agent artifact upload xgboost.exe


@@ -0,0 +1,11 @@
if ( $Env:BUILDKITE_PULL_REQUEST -and ($Env:BUILDKITE_PULL_REQUEST -ne "false") ) {
$is_pull_request = 1
} else {
$is_pull_request = 0
}
if ( ($Env:BUILDKITE_BRANCH -eq "master") -or ($Env:BUILDKITE_BRANCH -match "release_.+") ) {
$is_release_branch = 1
} else {
$is_release_branch = 0
}

tests/buildkite/conftest.sh Executable file

@@ -0,0 +1,22 @@
#!/bin/bash
set -euo pipefail
set -x
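# Derive common CI metadata from Buildkite's environment variables:
#   is_pull_request   = 1 when building a pull request, 0 otherwise
#   BRANCH_NAME       = "PR-<number>" for pull requests, the branch name otherwise
#   is_release_branch = 1 on master and release_* branches, 0 otherwise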
if [[ -n $BUILDKITE_PULL_REQUEST && $BUILDKITE_PULL_REQUEST != "false" ]]
then
is_pull_request=1
export BRANCH_NAME=PR-$BUILDKITE_PULL_REQUEST
else
is_pull_request=0
export BRANCH_NAME=$BUILDKITE_BRANCH
fi
if [[ $BUILDKITE_BRANCH == "master" || $BUILDKITE_BRANCH == "release_"* ]]
then
is_release_branch=1
else
is_release_branch=0
fi
set +x


@@ -0,0 +1,16 @@
#!/bin/bash
set -euo pipefail
SPARK_VERSION=3.0.1
CUDA_VERSION=11.0.3
source tests/buildkite/conftest.sh
if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
then
echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo"
tests/ci_build/ci_build.sh jvm_gpu_build docker \
--build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \
tests/ci_build/deploy_jvm_packages.sh ${SPARK_VERSION}
fi


@@ -0,0 +1,101 @@
import argparse
import copy
import boto3
from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS
TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
def get_availability_zones(*, aws_region):
client = boto3.client("ec2", region_name=aws_region)
r = client.describe_availability_zones(
Filters=[
{"Name": "region-name", "Values": [aws_region]},
{"Name": "zone-type", "Values": ["availability-zone"]},
]
)
return sorted([x["ZoneName"] for x in r["AvailabilityZones"]])
def get_default_vpc(*, aws_region):
ec2 = boto3.resource("ec2", region_name=aws_region)
default_vpc_id = None
for x in ec2.vpcs.filter(Filters=[{"Name": "is-default", "Values": ["true"]}]):
return x
# Create a default VPC if one does not exist
client = boto3.client("ec2", region_name=aws_region)
r = client.create_default_vpc()
default_vpc_id = r["Vpc"]["VpcId"]
return ec2.Vpc(default_vpc_id)
def format_params(args, *, stack_id):
default_vpc = get_default_vpc(aws_region=args.aws_region)
azs = get_availability_zones(aws_region=args.aws_region)
# For each of the first two availability zones (AZs), choose the default subnet
subnets = [
x.id
for x in default_vpc.subnets.filter(
Filters=[
{"Name": "default-for-az", "Values": ["true"]},
{"Name": "availability-zone", "Values": azs[:2]},
]
)
]
assert len(subnets) == 2
params = copy.deepcopy(STACK_PARAMS[stack_id])
params["ImageId"] = AMI_ID[stack_id][args.aws_region]
params["BuildkiteQueue"] = stack_id
params["CostAllocationTagValue"] = f"buildkite-{stack_id}"
params["BuildkiteAgentToken"] = args.agent_token
params["VpcId"] = default_vpc.id
params["Subnets"] = ",".join(subnets)
params.update(COMMON_STACK_PARAMS)
return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()]
def get_full_stack_id(stack_id):
return f"buildkite-{stack_id}-autoscaling-group"
def main(args):
client = boto3.client("cloudformation", region_name=args.aws_region)
for stack_id in AMI_ID:
stack_id_full = get_full_stack_id(stack_id)
print(f"Creating elastic CI stack {stack_id_full}...")
params = format_params(args, stack_id=stack_id)
response = client.create_stack(
StackName=stack_id_full,
TemplateURL=TEMPLATE_URL,
Capabilities=[
"CAPABILITY_IAM",
"CAPABILITY_NAMED_IAM",
"CAPABILITY_AUTO_EXPAND",
],
OnFailure="ROLLBACK",
EnableTerminationProtection=False,
Parameters=params,
)
print(f"CI stack {stack_id_full} is in progress in the background")
for stack_id in AMI_ID:
stack_id_full = get_full_stack_id(stack_id)
waiter = client.get_waiter("stack_create_complete")
waiter.wait(StackName=stack_id_full)
print(f"CI stack {stack_id_full} is now finished.")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--aws-region", type=str, required=True)
parser.add_argument("--agent-token", type=str, required=True)
args = parser.parse_args()
main(args)


@@ -0,0 +1,101 @@
AMI_ID = {
# Managed by XGBoost team
"linux-amd64-gpu": {
"us-west-2": "ami-00ed92bd37f77bc33",
},
"linux-amd64-mgpu": {
"us-west-2": "ami-00ed92bd37f77bc33",
},
"windows-gpu": {
"us-west-2": "ami-0a1a2ea551a07ad5f",
},
# Managed by BuildKite
"linux-amd64-cpu": {
"us-west-2": "ami-075d4c25d5f0c17c1",
},
"pipeline-loader": {
"us-west-2": "ami-075d4c25d5f0c17c1",
},
"linux-arm64-cpu": {
"us-west-2": "ami-0952c6fb6db9a9891",
},
}
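# Each key above doubles as the Buildkite queue name (the stack creation script sets
# BuildkiteQueue to the key); pipeline steps select workers via `agents: queue: <key>`.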
STACK_PARAMS = {
"linux-amd64-gpu": {
"InstanceOperatingSystem": "linux",
"InstanceType": "g4dn.xlarge",
"AgentsPerInstance": "1",
"MinSize": "0",
"MaxSize": "8",
"OnDemandPercentage": "100",
"ScaleOutFactor": "1.0",
"ScaleInIdlePeriod": "60", # in seconds
},
"linux-amd64-mgpu": {
"InstanceOperatingSystem": "linux",
"InstanceType": "g4dn.12xlarge",
"AgentsPerInstance": "1",
"MinSize": "0",
"MaxSize": "4",
"OnDemandPercentage": "100",
"ScaleOutFactor": "1.0",
"ScaleInIdlePeriod": "60", # in seconds
},
"windows-gpu": {
"InstanceOperatingSystem": "windows",
"InstanceType": "g4dn.2xlarge",
"AgentsPerInstance": "1",
"MinSize": "0",
"MaxSize": "2",
"OnDemandPercentage": "100",
"ScaleOutFactor": "1.0",
"ScaleInIdlePeriod": "600", # in seconds
},
"linux-amd64-cpu": {
"InstanceOperatingSystem": "linux",
"InstanceType": "c5a.4xlarge",
"AgentsPerInstance": "1",
"MinSize": "0",
"MaxSize": "16",
"OnDemandPercentage": "100",
"ScaleOutFactor": "1.0",
"ScaleInIdlePeriod": "60", # in seconds
},
"pipeline-loader": {
"InstanceOperatingSystem": "linux",
"InstanceType": "t3a.micro",
"AgentsPerInstance": "1",
"MinSize": "1",
"MaxSize": "1",
"OnDemandPercentage": "100",
"ScaleOutFactor": "1.0",
"ScaleInIdlePeriod": "60", # in seconds
},
"linux-arm64-cpu": {
"InstanceOperatingSystem": "linux",
"InstanceType": "c6g.4xlarge",
"AgentsPerInstance": "1",
"MinSize": "0",
"MaxSize": "8",
"OnDemandPercentage": "100",
"ScaleOutFactor": "1.0",
"ScaleInIdlePeriod": "60", # in seconds
},
}
COMMON_STACK_PARAMS = {
"BuildkiteAgentTimestampLines": "false",
"BuildkiteWindowsAdministrator": "true",
"AssociatePublicIpAddress": "true",
"ScaleOutForWaitingJobs": "false",
"EnableCostAllocationTags": "true",
"CostAllocationTagName": "CreatedBy",
"ECRAccessPolicy": "full",
"ManagedPolicyARN": "arn:aws:iam::aws:policy/AmazonS3FullAccess",
"EnableSecretsPlugin": "false",
"EnableECRPlugin": "false",
"EnableDockerLoginPlugin": "false",
"EnableDockerUserNamespaceRemap": "false",
"BuildkiteAgentExperiments": "normalised-upload-paths,resolve-commit-after-checkout",
}


@@ -0,0 +1,2 @@
boto3
cfn_tools


@@ -0,0 +1,44 @@
import argparse
import os
import boto3
current_dir = os.path.dirname(__file__)
def main(args):
with open(
os.path.join(current_dir, "service-user-template.yml"), encoding="utf-8"
) as f:
service_user_template = f.read()
stack_id = "buildkite-elastic-ci-stack-service-user"
print("Create a new IAM user with suitable permissions...")
client = boto3.client("cloudformation", region_name=args.aws_region)
response = client.create_stack(
StackName=stack_id,
TemplateBody=service_user_template,
Capabilities=[
"CAPABILITY_IAM",
"CAPABILITY_NAMED_IAM",
],
Parameters=[{"ParameterKey": "UserName", "ParameterValue": args.user_name}],
)
waiter = client.get_waiter("stack_create_complete")
waiter.wait(StackName=stack_id)
user = boto3.resource("iam", region_name=args.aws_region).User(args.user_name)
key_pair = user.create_access_key_pair()
print("Finished creating an IAM users with suitable permissions.")
print(f"Access Key ID: {key_pair.access_key_id}")
print(f"Access Secret Access Key: {key_pair.secret_access_key}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--aws-region", type=str, required=True)
parser.add_argument(
"--user-name", type=str, default="buildkite-elastic-ci-stack-user"
)
args = parser.parse_args()
main(args)


@@ -0,0 +1,349 @@
---
AWSTemplateFormatVersion: "2010-09-09"
Description: "Buildkite Elastic CI Stack CloudFormation service user"
Parameters:
UserName:
Type: String
Default: buildkite-elastic-ci-stack-user
Description: Name of user to create
Outputs:
UserNameOutput:
Value: !Ref CloudFormationServiceUser
UserArnOutput:
Value: !GetAtt CloudFormationServiceUser.Arn
Resources:
CloudFormationServiceUser:
Type: AWS::IAM::User
Properties:
ManagedPolicyArns:
- !Ref SubstackCrudPolicy
- !Ref CrudPolicy
- !Ref ImageBuilderPolicy
UserName: !Ref UserName
SubstackCrudPolicy:
Type: AWS::IAM::ManagedPolicy
Properties:
PolicyDocument:
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": "cloudformation:*",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": [
"serverlessrepo:GetApplication",
"serverlessrepo:GetCloudFormationTemplate",
"serverlessrepo:CreateCloudFormationTemplate"
],
"Resource": "*"
}
]
}
CrudPolicy:
Type: AWS::IAM::ManagedPolicy
Properties:
PolicyDocument:
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"ec2:DescribeAccountAttributes",
"ec2:DescribeAvailabilityZones",
"ec2:DescribeInstances",
"ec2:DescribeInternetGateways",
"ec2:DescribeLaunchTemplateVersions",
"ec2:DescribeLaunchTemplates",
"ec2:DescribeNetworkInterfaces",
"ec2:DescribeRouteTables",
"ec2:DescribeSecurityGroups",
"ec2:DescribeSubnets",
"ec2:DescribeVpcs",
"ec2:CreateTags"
],
"Resource": "*"
},
{
"Effect": "Allow",
"Action": [
"ec2:CreateInternetGateway",
"ec2:AttachInternetGateway",
"ec2:DetachInternetGateway",
"ec2:DeleteInternetGateway"
],
"Resource": "arn:aws:ec2:*:*:internet-gateway/*"
},
{
"Effect": "Allow",
"Action": [
"ec2:CreateLaunchTemplate",
"ec2:CreateLaunchTemplateVersion",
"ec2:DeleteLaunchTemplate"
],
"Resource": "arn:aws:ec2:*:*:launch-template/*"
},
{
"Effect": "Allow",
"Action": [
"ec2:AssociateRouteTable",
"ec2:DisassociateRouteTable",
"ec2:CreateRoute",
"ec2:CreateRouteTable",
"ec2:DeleteRoute",
"ec2:DeleteRouteTable"
],
"Resource": "arn:aws:ec2:*:*:route-table/*"
},
{
"Effect": "Allow",
"Action": [
"ec2:AuthorizeSecurityGroupIngress",
"ec2:RevokeSecurityGroupIngress",
"ec2:CreateSecurityGroup",
"ec2:DeleteSecurityGroup"
],
"Resource": "arn:aws:ec2:*:*:security-group/*"
},
{
"Effect": "Allow",
"Action": "ec2:RunInstances",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": [
"ec2:CreateSubnet",
"ec2:DeleteSubnet",
"ec2:AssociateRouteTable",
"ec2:DisassociateRouteTable"
],
"Resource": "arn:aws:ec2:*:*:subnet/*"
},
{
"Effect": "Allow",
"Action": [
"ec2:CreateVpc",
"ec2:CreateSecurityGroup",
"ec2:ModifyVpcAttribute",
"ec2:AttachInternetGateway",
"ec2:DetachInternetGateway",
"ec2:CreateSubnet",
"ec2:CreateRouteTable",
"ec2:DeleteVpc"
],
"Resource": "arn:aws:ec2:*:*:vpc/*"
},
{
"Effect": "Allow",
"Action": [
"ec2:CreateDefaultVpc",
"ec2:CreateDefaultSubnet"
],
"Resource": "*"
},
{
"Effect": "Allow",
"Action": [
"iam:CreateInstanceProfile",
"iam:GetInstanceProfile",
"iam:AddRoleToInstanceProfile",
"iam:RemoveRoleFromInstanceProfile",
"iam:DeleteInstanceProfile"
],
"Resource": "arn:aws:iam::*:instance-profile/*"
},
{
"Effect": "Allow",
"Action": [
"kms:DescribeKey",
"kms:CreateGrant",
"kms:Decrypt",
"kms:Encrypt"
],
"Resource": "arn:aws:kms:*:*:key/*"
},
{
"Effect": "Allow",
"Action": [
"lambda:CreateFunction",
"lambda:GetFunction",
"lambda:GetFunctionCodeSigningConfig",
"lambda:AddPermission",
"lambda:RemovePermission",
"lambda:DeleteFunction",
"lambda:InvokeFunction",
"lambda:TagResource"
],
"Resource": "arn:aws:lambda:*:*:function:*"
},
{
"Effect": "Allow",
"Action": [
"logs:CreateLogGroup",
"logs:PutRetentionPolicy",
"logs:DeleteLogGroup"
],
"Resource": "arn:aws:logs:*:*:log-group:*"
},
{
"Effect": "Allow",
"Action": [
"s3:GetObject",
"s3:CreateBucket",
"s3:PutBucketAcl",
"s3:PutBucketLogging",
"s3:PutBucketTagging",
"s3:PutBucketVersioning"
],
"Resource": "arn:aws:s3:::*"
},
{
"Effect": "Allow",
"Action": [
"ssm:GetParameter",
"ssm:PutParameter",
"ssm:DeleteParameter"
],
"Resource": "arn:aws:ssm:*:*:parameter/*"
},
{
"Effect": "Allow",
"Action": [
"iam:ListPolicies",
"iam:ListInstanceProfiles",
"iam:ListRoles",
"iam:ListPolicyVersions",
"iam:ListRolePolicies",
"iam:ListAttachedRolePolicies",
"iam:ListInstanceProfileTags",
"iam:ListRoleTags",
"iam:ListInstanceProfilesForRole",
"iam:GetPolicyVersion",
"iam:GetPolicy",
"iam:GetInstanceProfile",
"iam:GetRole",
"iam:GetRolePolicy",
"iam:TagPolicy",
"iam:UntagPolicy",
"iam:TagInstanceProfile",
"iam:UntagInstanceProfile",
"iam:TagRole",
"iam:UntagRole",
"iam:CreateRole",
"iam:PassRole",
"iam:DeleteRole",
"iam:UpdateRoleDescription",
"iam:UpdateRole",
"iam:AddRoleToInstanceProfile",
"iam:RemoveRoleFromInstanceProfile",
"iam:CreateInstanceProfile",
"iam:DeleteInstanceProfile",
"iam:DetachRolePolicy",
"iam:SetDefaultPolicyVersion",
"iam:AttachRolePolicy",
"iam:UpdateAssumeRolePolicy",
"iam:PutRolePermissionsBoundary",
"iam:DeleteRolePermissionsBoundary",
"iam:CreatePolicy",
"iam:DeletePolicyVersion",
"iam:DeletePolicy",
"iam:PutRolePolicy",
"iam:DeleteRolePolicy"
],
"Resource": "*"
},
{
"Effect": "Allow",
"Action": [
"autoscaling:DescribeLifecycleHookTypes",
"autoscaling:DescribeTerminationPolicyTypes",
"autoscaling:DescribePolicies",
"autoscaling:DescribeWarmPool",
"autoscaling:DescribeScalingActivities",
"autoscaling:DescribeScalingProcessTypes",
"autoscaling:DescribeScheduledActions",
"autoscaling:DescribeAutoScalingGroups",
"autoscaling:DescribeAutoScalingInstances",
"autoscaling:DescribeLifecycleHooks",
"autoscaling:SetDesiredCapacity",
"autoscaling:PutLifecycleHook",
"autoscaling:DeleteLifecycleHook",
"autoscaling:SetInstanceProtection",
"autoscaling:CreateAutoScalingGroup",
"autoscaling:EnableMetricsCollection",
"autoscaling:UpdateAutoScalingGroup",
"autoscaling:DeleteAutoScalingGroup",
"autoscaling:PutScalingPolicy",
"autoscaling:DeletePolicy",
"autoscaling:BatchPutScheduledUpdateGroupAction",
"autoscaling:PutScheduledUpdateGroupAction",
"autoscaling:DeleteScheduledAction",
"autoscaling:PutWarmPool",
"autoscaling:DeleteWarmPool",
"autoscaling:TerminateInstanceInAutoScalingGroup",
"autoscaling:AttachInstances"
],
"Resource": "*"
},
{
"Effect": "Allow",
"Action": [
"events:DescribeRule",
"events:PutRule",
"events:PutTargets",
"events:RemoveTargets",
"events:DeleteRule"
],
"Resource": "arn:aws:events:*:*:rule/*"
}
]
}
ImageBuilderPolicy:
Type: AWS::IAM::ManagedPolicy
Properties:
PolicyDocument:
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"imagebuilder:CreateComponent",
"imagebuilder:GetComponent",
"imagebuilder:DeleteComponent",
"imagebuilder:CreateImageRecipe",
"imagebuilder:GetImageRecipe",
"imagebuilder:DeleteImageRecipe",
"imagebuilder:CreateImagePipeline",
"imagebuilder:GetImagePipeline",
"imagebuilder:DeleteImagePipeline",
"imagebuilder:CreateInfrastructureConfiguration",
"imagebuilder:GetInfrastructureConfiguration",
"imagebuilder:DeleteInfrastructureConfiguration",
"imagebuilder:CreateDistributionConfiguration",
"imagebuilder:GetDistributionConfiguration",
"imagebuilder:DeleteDistributionConfiguration",
"imagebuilder:TagResource",
"imagebuilder:StartImagePipelineExecution",
"ec2:DescribeImages",
"ec2:DescribeSnapshots",
"ec2:DescribeRegions",
"ec2:DescribeVolumes",
"ec2:DescribeKeyPairs",
"ec2:DescribeInstanceTypeOfferings"
],
"Resource": "*"
}
]
}


@@ -0,0 +1,86 @@
import argparse
import copy
import json
import os
from urllib.request import urlopen
import boto3
import cfn_flip
from metadata import IMAGE_PARAMS
current_dir = os.path.dirname(__file__)
BUILDKITE_CF_TEMPLATE_URL = (
"https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
)
def format_params(*, stack_id, aws_region, ami_mapping):
params = copy.deepcopy(IMAGE_PARAMS[stack_id])
with open(
os.path.join(current_dir, params["BootstrapScript"]),
encoding="utf-8",
) as f:
bootstrap_script = f.read()
params["BaseImageId"] = ami_mapping[aws_region][params["BaseImageId"]]
params["BootstrapScript"] = bootstrap_script
return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()]
def get_ami_mapping():
with urlopen(BUILDKITE_CF_TEMPLATE_URL) as response:
buildkite_cf_template = response.read().decode("utf-8")
cfn_obj = json.loads(cfn_flip.to_json(buildkite_cf_template))
return cfn_obj["Mappings"]["AWSRegion2AMI"]
def get_full_stack_id(stack_id):
return f"buildkite-{stack_id}-worker"
def main(args):
with open(
os.path.join(current_dir, "ec2-image-builder-pipeline-template.yml"),
encoding="utf-8",
) as f:
ec2_image_pipeline_template = f.read()
ami_mapping = get_ami_mapping()
for stack_id in IMAGE_PARAMS:
stack_id_full = get_full_stack_id(stack_id)
print(f"Creating EC2 image builder stack {stack_id_full}...")
params = format_params(
stack_id=stack_id, aws_region=args.aws_region, ami_mapping=ami_mapping
)
client = boto3.client("cloudformation", region_name=args.aws_region)
response = client.create_stack(
StackName=stack_id_full,
TemplateBody=ec2_image_pipeline_template,
Capabilities=[
"CAPABILITY_IAM",
"CAPABILITY_NAMED_IAM",
"CAPABILITY_AUTO_EXPAND",
],
OnFailure="ROLLBACK",
EnableTerminationProtection=False,
Parameters=params,
)
print(
f"EC2 image builder stack {stack_id_full} is in progress in the background"
)
for stack_id in IMAGE_PARAMS:
stack_id_full = get_full_stack_id(stack_id)
waiter = client.get_waiter("stack_create_complete")
waiter.wait(StackName=stack_id_full)
print(f"EC2 image builder stack {stack_id_full} is now finished.")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--aws-region", type=str, required=True)
args = parser.parse_args()
main(args)


@@ -0,0 +1,108 @@
---
AWSTemplateFormatVersion: "2010-09-09"
Description: "EC2 Image Builder pipelines to build workers"
Parameters:
BaseImageId:
Type: String
Description: Base AMI to build a new image on top of.
BootstrapScript:
Type: String
Description: Content of AMI customization script
InstanceType:
Type: String
Description: Instance type for the Image Builder instances.
InstanceOperatingSystem:
Type: String
Description: The operating system to run on the instance
AllowedValues:
- Linux
- Windows
Default: "Linux"
VolumeSize:
Type: Number
Description: Size of EBS volume, in GiBs
Conditions:
IsInstanceWindows:
!Equals [ !Ref InstanceOperatingSystem, "Windows" ]
Resources:
# IAM role for the image builder instance
InstanceRole:
Type: AWS::IAM::Role
Properties:
AssumeRolePolicyDocument:
Version: "2012-10-17"
Statement:
- Effect: "Allow"
Principal:
Service: "ec2.amazonaws.com"
Action: "sts:AssumeRole"
ManagedPolicyArns:
- arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
- arn:aws:iam::aws:policy/EC2InstanceProfileForImageBuilder
- arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
InstanceProfile:
Type: AWS::IAM::InstanceProfile
Properties:
Roles:
- !Ref InstanceRole
# Component that runs the bootstrap script
BootstrapComponent:
Type: AWS::ImageBuilder::Component
Properties:
Name: !Sub "${AWS::StackName}-bootstrap-component"
Platform: !Ref InstanceOperatingSystem
Version: "1.0.0"
Description: Execute a bootstrap script.
Data: !Ref BootstrapScript
Recipe:
Type: AWS::ImageBuilder::ImageRecipe
Properties:
Name: !Sub "${AWS::StackName}-image"
Components:
- ComponentArn: !Ref BootstrapComponent
ParentImage: !Ref BaseImageId
BlockDeviceMappings:
- DeviceName: !If [IsInstanceWindows, "/dev/sda1", "/dev/xvda"]
Ebs:
DeleteOnTermination: true
Encrypted: false
VolumeSize: !Ref VolumeSize
VolumeType: gp2
Version: "1.0.0"
Infrastructure:
Type: AWS::ImageBuilder::InfrastructureConfiguration
Properties:
Name: !Sub "${AWS::StackName}-image-pipeline-infrastructure"
InstanceProfileName: !Ref InstanceProfile
InstanceTypes:
- !Ref InstanceType
TerminateInstanceOnFailure: true
# Copy to this region only
Distribution:
Type: AWS::ImageBuilder::DistributionConfiguration
Properties:
Name: !Sub "${AWS::StackName}-image-pipeline-distribution-config"
Distributions:
- Region: !Ref AWS::Region
AmiDistributionConfiguration: {}
# Composition of the above elements
Pipeline:
Type: AWS::ImageBuilder::ImagePipeline
Properties:
Name: !Sub "${AWS::StackName}-image-pipeline"
DistributionConfigurationArn: !Ref Distribution
ImageRecipeArn: !Ref Recipe
InfrastructureConfigurationArn: !Ref Infrastructure


@@ -0,0 +1,26 @@
name: BuildKiteLinuxAMD64GPUBootstrap
description: Set up worker image for linux-amd64-gpu pipeline
schemaVersion: 1.0
phases:
- name: build
steps:
- name: SetupStep
action: ExecuteBash
inputs:
commands:
- |
yum groupinstall -y "Development tools"
yum install -y kernel-devel-$(uname -r)
aws s3 cp --recursive s3://ec2-linux-nvidia-drivers/latest/ .
chmod +x NVIDIA-Linux-x86_64*.run
CC=/usr/bin/gcc10-cc ./NVIDIA-Linux-x86_64*.run --silent
amazon-linux-extras install docker
systemctl --now enable docker
distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
&& curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.repo \
| sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
yum clean expire-cache
yum install -y nvidia-docker2
systemctl restart docker


@@ -0,0 +1,18 @@
IMAGE_PARAMS = {
"linux-amd64-gpu": {
"BaseImageId": "linuxamd64",
# AMI ID is looked up from Buildkite's CloudFormation template
"BootstrapScript": "linux-amd64-gpu-bootstrap.yml",
"InstanceType": "g4dn.xlarge",
"InstanceOperatingSystem": "Linux",
"VolumeSize": "40", # in GiBs
},
"windows-gpu": {
"BaseImageId": "windows",
# AMI ID is looked up from Buildkite's CloudFormation template
"BootstrapScript": "windows-gpu-bootstrap.yml",
"InstanceType": "g4dn.2xlarge",
"InstanceOperatingSystem": "Windows",
"VolumeSize": "80", # in GiBs
},
}


@@ -0,0 +1,22 @@
import argparse
import boto3
from create_worker_image_pipelines import get_full_stack_id
from metadata import IMAGE_PARAMS
def main(args):
cf = boto3.resource("cloudformation", region_name=args.aws_region)
builder_client = boto3.client("imagebuilder", region_name=args.aws_region)
for stack_id in IMAGE_PARAMS:
stack_id_full = get_full_stack_id(stack_id)
pipeline_arn = cf.Stack(stack_id_full).Resource("Pipeline").physical_resource_id
print(f"Running pipeline {pipeline_arn} to generate a new AMI...")
r = builder_client.start_image_pipeline_execution(imagePipelineArn=pipeline_arn)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--aws-region", type=str, required=True)
args = parser.parse_args()
main(args)


@@ -0,0 +1,73 @@
name: BuildKiteWindowsGPUBootstrap
description: Set up worker image for windows-gpu pipeline
schemaVersion: 1.0
phases:
- name: build
steps:
- name: SetupStep
action: ExecutePowerShell
inputs:
commands:
- |
$ErrorActionPreference = "Stop"
choco --version
choco feature enable -n=allowGlobalConfirmation
# CMake 3.18
Write-Host '>>> Installing CMake 3.18...'
choco install cmake --version 3.18.0 --installargs "ADD_CMAKE_TO_PATH=System"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Notepad++
Write-Host '>>> Installing Notepad++...'
choco install notepadplusplus
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Miniconda
Write-Host '>>> Installing Miniconda...'
choco install miniconda3 /RegisterPython:1 /D:C:\tools\miniconda3
C:\tools\miniconda3\Scripts\conda.exe init --user --system
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
. "C:\Windows\System32\WindowsPowerShell\v1.0\profile.ps1"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
conda config --set auto_activate_base false
conda config --prepend channels conda-forge
# Install Java 11
Write-Host '>>> Installing Java 11...'
choco install openjdk11jre
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install GraphViz
Write-Host '>>> Installing GraphViz...'
choco install graphviz
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install Visual Studio Community 2017 (15.9)
Write-Host '>>> Installing Visual Studio 2017 Community (15.9)...'
choco install visualstudio2017community --version 15.9.23.0 `
--params "--wait --passive --norestart"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
choco install visualstudio2017-workload-nativedesktop --params `
"--wait --passive --norestart --includeOptional"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install CUDA 11.0
Write-Host '>>> Installing CUDA 11.0...'
choco install cuda --version 11.0.3
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install Python packages
Write-Host '>>> Installing Python packages...'
conda activate
conda install -y mamba
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install R
Write-Host '>>> Installing R...'
choco install r.project --version=3.6.3
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
choco install rtools --version=3.5.0.4
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }


@@ -0,0 +1,22 @@
steps:
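# Manual approval gate (see "Require manual approval" in the commit message): a user
# must unblock this step in Buildkite before the build and test steps below run.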
- block: ":rocket: Run this test job"
#### -------- BUILD --------
- label: ":windows: Build XGBoost for Windows with CUDA"
command: "tests/buildkite/build-win64-gpu.ps1"
key: build-win64-gpu
agents:
queue: windows-gpu
- label: ":windows: Build XGBoost R package for Windows with CUDA"
command: "tests/buildkite/build-rpkg-win64-gpu.ps1"
key: build-rpkg-win64-gpu
agents:
queue: windows-gpu
- wait
#### -------- TEST --------
- label: ":windows: Test XGBoost on Windows"
command: "tests/buildkite/test-win64-gpu.ps1"
key: test-win64-gpu
agents:
queue: windows-gpu


@@ -0,0 +1,93 @@
env:
DOCKER_CACHE_ECR_ID: "492475357299"
DOCKER_CACHE_ECR_REGION: "us-west-2"
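# ECR registry that tests/ci_build/ci_build.sh uses as a Docker layer cache when
# building the CI container images (see DOCKER_CACHE_REPO in that script).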
steps:
- block: ":rocket: Run this test job"
#### -------- BUILD --------
- label: ":console: Run clang-tidy"
command: "tests/buildkite/run-clang-tidy.sh"
key: run-clang-tidy
agents:
queue: linux-amd64-cpu
- label: ":console: Build CPU"
command: "tests/buildkite/build-cpu.sh"
key: build-cpu
agents:
queue: linux-amd64-cpu
- label: ":console: Build CPU ARM64"
command: "tests/buildkite/build-cpu-arm64.sh"
key: build-cpu-arm64
agents:
queue: linux-arm64-cpu
- label: ":console: Build CUDA"
command: "tests/buildkite/build-cuda.sh"
key: build-cuda
agents:
queue: linux-amd64-cpu
- label: ":console: Build CUDA with RMM"
command: "tests/buildkite/build-cuda-with-rmm.sh"
key: build-cuda-with-rmm
agents:
queue: linux-amd64-cpu
- label: ":console: Build R package with CUDA"
command: "tests/buildkite/build-gpu-rpkg.sh"
key: build-gpu-rpkg
agents:
queue: linux-amd64-cpu
- label: ":console: Build JVM packages"
command: "tests/buildkite/build-jvm-packages.sh"
key: build-jvm-packages
agents:
queue: linux-amd64-cpu
- label: ":console: Build JVM packages with CUDA"
command: "tests/buildkite/build-jvm-packages-gpu.sh"
key: build-jvm-packages-gpu
agents:
queue: linux-amd64-mgpu
- label: ":console: Build JVM package doc"
command: "tests/buildkite/build-jvm-doc.sh"
key: build-jvm-doc
agents:
queue: linux-amd64-cpu
- wait
#### -------- TEST --------
- label: ":console: Test Python package, CPU"
command: "tests/buildkite/test-python-cpu.sh"
key: test-python-cpu
agents:
queue: linux-amd64-cpu
- label: ":console: Test Python package, CPU ARM64"
command: "tests/buildkite/test-python-cpu-arm64.sh"
key: test-python-cpu-arm64
agents:
queue: linux-arm64-cpu
- label: ":console: Test Python package, single GPU"
command: "tests/buildkite/test-python-gpu.sh gpu"
key: test-python-gpu
agents:
queue: linux-amd64-gpu
- label: ":console: Test Python package, 4 GPUs"
command: "tests/buildkite/test-python-gpu.sh mgpu"
key: test-python-mgpu
agents:
queue: linux-amd64-mgpu
- label: ":console: Run Google Tests, 4 GPUs"
command: "tests/buildkite/test-cpp-gpu.sh"
key: test-cpp-gpu
agents:
queue: linux-amd64-mgpu
- label: ":console: Run integration tests with JVM packages"
command: "tests/buildkite/test-integration-jvm-packages.sh"
key: test-integration-jvm-packages
agents:
queue: linux-amd64-cpu
- wait
- label: ":console: Deploy JVM packages"
command: "tests/buildkite/deploy-jvm-packages.sh"
key: deploy-jvm-packages
agents:
queue: linux-amd64-cpu


@@ -0,0 +1,13 @@
#!/bin/bash
set -euo pipefail
CUDA_VERSION=11.0.3
echo "--- Run clang-tidy"
source tests/buildkite/conftest.sh
tests/ci_build/ci_build.sh clang_tidy docker \
--build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \
python3 tests/ci_build/tidy.py --cuda-archs 75

tests/buildkite/test-cpp-gpu.sh Executable file

@@ -0,0 +1,21 @@
#!/bin/bash
set -euo pipefail
CUDA_VERSION=11.0.3
source tests/buildkite/conftest.sh
echo "--- Run Google Tests with CUDA, using 4 GPUs"
buildkite-agent artifact download "build/testxgboost" . --step build-cuda
chmod +x build/testxgboost
tests/ci_build/ci_build.sh gpu nvidia-docker \
--build-arg CUDA_VERSION_ARG=$CUDA_VERSION build/testxgboost
echo "--- Run Google Tests with CUDA, using 4 GPUs, RMM enabled"
rm -rfv build/
buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm
chmod +x build/testxgboost
tests/ci_build/ci_build.sh rmm nvidia-docker \
--build-arg CUDA_VERSION_ARG=$CUDA_VERSION bash -c \
"source activate gpu_test && build/testxgboost --use-rmm-pool"


@@ -0,0 +1,16 @@
#!/bin/bash
set -euo pipefail
JDK_VERSION=8
SPARK_VERSION=3.0.1
source tests/buildkite/conftest.sh
echo "--- Test XGBoost4J on a machine with JDK ${JDK_VERSION}, Spark ${SPARK_VERSION}"
buildkite-agent artifact download "jvm-packages/xgboost4j/target/*.jar" . --step build-jvm-packages
buildkite-agent artifact download "jvm-packages/xgboost4j-spark/target/*.jar" . --step build-jvm-packages
buildkite-agent artifact download "jvm-packages/xgboost4j-example/target/*.jar" . --step build-jvm-packages
export CI_DOCKER_EXTRA_PARAMS_INIT='-e RUN_INTEGRATION_TEST=1'
tests/ci_build/ci_build.sh jvm_cross docker --build-arg JDK_VERSION=${JDK_VERSION} \
--build-arg SPARK_VERSION=${SPARK_VERSION} tests/ci_build/test_jvm_cross.sh


@@ -0,0 +1,11 @@
#!/bin/bash
set -euo pipefail
source tests/buildkite/conftest.sh
echo "--- Test Python CPU ARM64"
buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cpu-arm64
buildkite-agent artifact download "xgboost" . --step build-cpu-arm64
chmod +x ./xgboost
tests/ci_build/ci_build.sh aarch64 docker tests/ci_build/test_python.sh cpu-arm64


@@ -0,0 +1,14 @@
#!/bin/bash
set -euo pipefail
echo "--- Test CPU code in Python env"
source tests/buildkite/conftest.sh
mkdir -pv python-package/dist
buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cuda
buildkite-agent artifact download "xgboost" . --step build-cpu
chmod +x ./xgboost
tests/ci_build/ci_build.sh cpu docker tests/ci_build/test_python.sh cpu


@@ -0,0 +1,46 @@
#!/bin/bash
set -euo pipefail
CUDA_VERSION=11.0.3
if [ "$#" -lt 1 ]
then
suite=''
args=''
else
suite=$1
shift 1
args="$@"
fi
source tests/buildkite/conftest.sh
echo "--- Fetch build artifacts"
buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cuda
buildkite-agent artifact download "build/testxgboost" . --step build-cuda
chmod +x build/testxgboost
# Allocate extra space in /dev/shm to enable NCCL
export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'
command_wrapper="tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg "`
`"CUDA_VERSION_ARG=$CUDA_VERSION"
# Run specified test suite
case "$suite" in
gpu)
echo "--- Test XGBoost Python package, single GPU"
$command_wrapper tests/ci_build/test_python.sh $suite
;;
mgpu)
echo "--- Test XGBoost Python package, 4 GPUs"
$command_wrapper tests/ci_build/test_python.sh $suite
;;
*)
echo "Usage: $0 {gpu|mgpu} [extra args to pass to pytest]"
exit 1
;;
esac


@@ -0,0 +1,37 @@
$ErrorActionPreference = "Stop"
. tests/buildkite/conftest.ps1
Write-Host "--- Test XGBoost on Windows with CUDA"
New-Item python-package/dist -ItemType Directory -ea 0
New-Item build -ItemType Directory -ea 0
buildkite-agent artifact download "python-package/dist/*.whl" . --step build-win64-gpu
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
buildkite-agent artifact download "build/testxgboost.exe" . --step build-win64-gpu
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
buildkite-agent artifact download "xgboost.exe" . --step build-win64-gpu
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
nvcc --version
Write-Host "--- Run Google Tests"
& build/testxgboost.exe
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
Write-Host "--- Set up Python env"
conda activate
$env_name = -join("win64_", (New-Guid).ToString().replace("-", ""))
mamba env create -n ${env_name} --file=tests/ci_build/conda_env/win64_test.yml
conda activate ${env_name}
Get-ChildItem . -Filter python-package/dist/*.whl |
Foreach-Object {
& python -m pip install python-package/dist/$_
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
}
Write-Host "--- Run Python tests"
python -X faulthandler -m pytest -v -s -rxXs --fulltrace tests/python
Write-Host "--- Run Python tests with GPU"
python -X faulthandler -m pytest -v -s -rxXs --fulltrace -m "(not slow) and (not mgpu)" `
tests/python-gpu


@@ -10,7 +10,7 @@ RUN \
yum update -y && \
yum install -y devtoolset-7 && \
# Python
wget --no-verbose -O conda.sh https://github.com/conda-forge/miniforge/releases/download/4.8.2-1/Miniforge3-4.8.2-1-Linux-aarch64.sh && \
wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/4.8.2-1/Miniforge3-4.8.2-1-Linux-aarch64.sh && \
bash conda.sh -b -p /opt/miniforge-python
ENV PATH=/opt/miniforge-python/bin:$PATH
@@ -25,7 +25,7 @@ RUN conda env create -n aarch64_test --file=/scripts/aarch64_test.yml
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget --no-verbose -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-arm64" && \
wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-arm64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true


@@ -11,7 +11,7 @@ RUN \
apt-get update && \
apt-get install -y tar unzip wget git build-essential python3 python3-pip software-properties-common \
apt-transport-https ca-certificates gnupg-agent && \
wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \
wget -nv -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \
add-apt-repository -u 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main' && \
apt-get update && \
apt-get install -y llvm-11 clang-tidy-11 clang-11 && \
@@ -31,7 +31,7 @@ ENV GOSU_VERSION 1.10
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true


@@ -15,7 +15,7 @@ RUN \
wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
# Python
wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh && \
wget -nv https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh && \
bash Mambaforge-Linux-x86_64.sh -b -p /opt/python
ENV PATH=/opt/python/bin:$PATH
@@ -32,7 +32,7 @@ RUN mamba env create -n cpu_test --file=/scripts/cpu_test.yml
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true


@@ -12,7 +12,7 @@ RUN \
apt-get update && \
apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \
# Python
wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
wget -nv -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3.sh -b -p /opt/python
ENV PATH=/opt/python/bin:$PATH
@@ -30,7 +30,7 @@ ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true


@@ -18,7 +18,7 @@ RUN \
wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
# Python
wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
wget -nv -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3.sh -b -p /opt/python
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
@@ -37,7 +37,7 @@ ENV GOSU_VERSION 1.10
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true


@@ -16,15 +16,15 @@ RUN \
apt-get update && \
apt-get install -y tar unzip wget openjdk-$JDK_VERSION-jdk libgomp1 && \
# Python
wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
wget -nv -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3.sh -b -p /opt/python && \
/opt/python/bin/pip install awscli && \
# Maven
wget https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
ln -s /opt/apache-maven-3.6.1/ /opt/maven && \
# Spark
wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \
wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \
tar xvf spark-$SPARK_VERSION-bin-hadoop2.7.tgz -C /opt && \
ln -s /opt/spark-$SPARK_VERSION-bin-hadoop2.7 /opt/spark
@@ -38,7 +38,7 @@ ENV GOSU_VERSION 1.10
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true


@@ -13,15 +13,15 @@ RUN \
apt-get update && \
apt-get install -y tar unzip wget openjdk-$JDK_VERSION-jdk libgomp1 && \
# Python
wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
wget -nv -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3.sh -b -p /opt/python && \
/opt/python/bin/pip install awscli && \
# Maven
wget https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
ln -s /opt/apache-maven-3.6.1/ /opt/maven && \
# Spark
wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \
wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \
tar xvf spark-$SPARK_VERSION-bin-hadoop2.7.tgz -C /opt && \
ln -s /opt/spark-$SPARK_VERSION-bin-hadoop2.7 /opt/spark
@@ -35,7 +35,7 @@ ENV GOSU_VERSION 1.10
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true


@@ -12,7 +12,7 @@ RUN \
apt-get update && \
apt-get install -y wget unzip bzip2 libgomp1 build-essential ninja-build git && \
# Python
wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
wget -nv -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3.sh -b -p /opt/python
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
@@ -33,7 +33,7 @@ ENV GOSU_VERSION 1.10
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true


@@ -15,7 +15,7 @@ ENV GOSU_VERSION 1.10
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true


@@ -144,19 +144,19 @@ then
DOCKER_CACHE_REPO="${DOCKER_CACHE_ECR_ID}.dkr.ecr.${DOCKER_CACHE_ECR_REGION}.amazonaws.com"
echo "Using AWS ECR; repo URL = ${DOCKER_CACHE_REPO}"
# Login for Docker registry
echo "\$(python3 -m awscli ecr get-login --no-include-email --region ${DOCKER_CACHE_ECR_REGION} --registry-ids ${DOCKER_CACHE_ECR_ID})"
$(python3 -m awscli ecr get-login --no-include-email --region ${DOCKER_CACHE_ECR_REGION} --registry-ids ${DOCKER_CACHE_ECR_ID})
echo "\$(aws ecr get-login --no-include-email --region ${DOCKER_CACHE_ECR_REGION} --registry-ids ${DOCKER_CACHE_ECR_ID})"
$(aws ecr get-login --no-include-email --region ${DOCKER_CACHE_ECR_REGION} --registry-ids ${DOCKER_CACHE_ECR_ID})
# Pull pre-build container from Docker build cache,
# if one exists for the particular branch or pull request
echo "docker pull ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
if docker pull "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
echo "docker pull --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
if docker pull --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
then
CACHE_FROM_CMD="--cache-from ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
else
# If the build cache is empty of the particular branch or pull request,
# use the build cache associated with the master branch
echo "docker pull ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:master"
docker pull "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:master" || true
echo "docker pull --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:master"
docker pull --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:master" || true
CACHE_FROM_CMD="--cache-from ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:master"
fi
else
@@ -188,11 +188,11 @@ then
echo "docker tag ${DOCKER_IMG_NAME} ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
docker tag "${DOCKER_IMG_NAME}" "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
echo "python3 -m awscli ecr create-repository --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION} || true"
python3 -m awscli ecr create-repository --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION} || true
echo "aws ecr create-repository --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION} || true"
aws ecr create-repository --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION} || true
echo "docker push ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
docker push "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
echo "docker push --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
docker push --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
if [[ $? != "0" ]]; then
echo "ERROR: could not update Docker cache ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
exit 1


@@ -14,5 +14,6 @@ for wheel_path in sorted(glob.glob(sys.argv[1])):
assert m, f'wheel_path = {wheel_path}'
version = m.group(1)
print(f"Inserting vcomp140.dll into {wheel_path}...")
with zipfile.ZipFile(wheel_path, 'a') as f:
f.write(vcomp140_path, 'xgboost-{}.data/data/xgboost/vcomp140.dll'.format(version))