[CI] Use RAPIDS 22.10 (#8298)

* [CI] Use RAPIDS 22.10

* Store CUDA and RAPIDS versions in one place

* Fix

* Add missing #include

* Update gputreeshap submodule

* Fix

* Remove outdated distributed tests
Philip Hyunsu Cho 2022-10-04 00:18:07 -07:00 committed by GitHub
parent 37886a5dff
commit ca0547bb65
23 changed files with 25 additions and 258 deletions
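The core of the change set is visible in the tests/buildkite/conftest.sh hunk below: rather than each pipeline script hard-coding its own CUDA_VERSION=11.0.3 (and similar pins), the toolchain versions now live in one file that every script already sources. A minimal sketch of the pattern, using only the variables shown in the diff (the rest of conftest.sh is elided):

    # tests/buildkite/conftest.sh -- the single place to bump toolchain versions
    CUDA_VERSION=11.0.3
    RAPIDS_VERSION=22.10
    SPARK_VERSION=3.0.1
    JDK_VERSION=8

    # any pipeline script -- drop its local CUDA_VERSION=... line and inherit the above
    source tests/buildkite/conftest.sh
    echo "--- Build with CUDA ${CUDA_VERSION}, RAPIDS ${RAPIDS_VERSION}"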

@@ -1 +1 @@
Subproject commit c78fe621e429117cbca45e7b23eb5c3b6280fa3a
Subproject commit acb5be3c17e9adae34ac0b176da6ea8e197cb17e
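For context, a submodule bump like the one above is produced by checking out the new commit inside the submodule and committing the updated pointer. A sketch, assuming the submodule is checked out at gputreeshap/ as the commit message suggests:

    cd gputreeshap
    git fetch origin
    git checkout acb5be3c17e9adae34ac0b176da6ea8e197cb17e
    cd ..
    git add gputreeshap
    git commit -m "Update gputreeshap submodule"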


@@ -7,6 +7,8 @@
#include <thrust/device_malloc_allocator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#include <thrust/execution_policy.h>


@@ -4,6 +4,7 @@
#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/transform_scan.h>
#include <thrust/unique.h>


@@ -2,8 +2,6 @@
set -euo pipefail
CUDA_VERSION=11.0.3
source tests/buildkite/conftest.sh
echo "--- Build with CUDA ${CUDA_VERSION}, RMM enabled"
@@ -16,7 +14,8 @@ else
fi
command_wrapper="tests/ci_build/ci_build.sh rmm docker --build-arg "`
`"CUDA_VERSION_ARG=$CUDA_VERSION"
`"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
`"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON \


@@ -2,13 +2,12 @@
set -euo pipefail
CUDA_VERSION=11.0.3
WHEEL_TAG=manylinux2014_x86_64
echo "--- Build with CUDA ${CUDA_VERSION}"
source tests/buildkite/conftest.sh
echo "--- Build with CUDA ${CUDA_VERSION}"
if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
then
arch_flag="-DGPU_COMPUTE_VER=75"


@@ -2,8 +2,6 @@
set -euo pipefail
CUDA_VERSION=11.0.3
source tests/buildkite/conftest.sh
echo "--- Build XGBoost R package with CUDA"


@@ -2,9 +2,6 @@
set -euo pipefail
SPARK_VERSION=3.0.1
CUDA_VERSION=11.0.3
source tests/buildkite/conftest.sh
echo "--- Build XGBoost JVM packages with CUDA"


@@ -2,8 +2,6 @@
set -euo pipefail
SPARK_VERSION=3.0.1
source tests/buildkite/conftest.sh
echo "--- Build XGBoost JVM packages"


@@ -3,6 +3,11 @@
set -euo pipefail
set -x
CUDA_VERSION=11.0.3
RAPIDS_VERSION=22.10
SPARK_VERSION=3.0.1
JDK_VERSION=8
if [[ -z ${BUILDKITE:-} ]]
then
echo "$0 is not meant to run locally; it should run inside BuildKite."


@@ -2,9 +2,6 @@
set -euo pipefail
SPARK_VERSION=3.0.1
CUDA_VERSION=11.0.3
source tests/buildkite/conftest.sh
if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]


@@ -2,8 +2,6 @@
set -euo pipefail
CUDA_VERSION=11.0.3
echo "--- Run clang-tidy"
source tests/buildkite/conftest.sh


@@ -2,20 +2,21 @@
set -euo pipefail
CUDA_VERSION=11.0.3
source tests/buildkite/conftest.sh
echo "--- Run Google Tests with CUDA, using 4 GPUs"
buildkite-agent artifact download "build/testxgboost" . --step build-cuda
chmod +x build/testxgboost
tests/ci_build/ci_build.sh gpu nvidia-docker \
--build-arg CUDA_VERSION_ARG=$CUDA_VERSION build/testxgboost
--build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
--build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
build/testxgboost
echo "--- Run Google Tests with CUDA, using 4 GPUs, RMM enabled"
rm -rfv build/
buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm
chmod +x build/testxgboost
tests/ci_build/ci_build.sh rmm nvidia-docker \
--build-arg CUDA_VERSION_ARG=$CUDA_VERSION bash -c \
--build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
--build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION bash -c \
"source activate gpu_test && build/testxgboost --use-rmm-pool"


@@ -2,9 +2,6 @@
set -euo pipefail
JDK_VERSION=8
SPARK_VERSION=3.0.1
source tests/buildkite/conftest.sh
echo "--- Test XGBoost4J on a machine with JDK ${JDK_VERSION}, Spark ${SPARK_VERSION}"


@@ -2,8 +2,6 @@
set -euo pipefail
CUDA_VERSION=11.0.3
if [ "$#" -lt 1 ]
then
suite=''
@@ -25,7 +23,8 @@ chmod +x build/testxgboost
export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'
command_wrapper="tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg "`
`"CUDA_VERSION_ARG=$CUDA_VERSION"
`"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
`"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
# Run specified test suite
case "$suite" in


@@ -1,6 +1,7 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu18.04
ARG CUDA_VERSION_ARG
ARG RAPIDS_VERSION_ARG
# Environment
ENV DEBIAN_FRONTEND noninteractive
@@ -21,7 +22,8 @@ ENV PATH=/opt/python/bin:$PATH
RUN \
conda install -c conda-forge mamba && \
mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
python=3.8 cudf=22.04* rmm=22.04* cudatoolkit=$CUDA_VERSION_ARG dask dask-cuda=22.04* dask-cudf=22.04* cupy \
python=3.9 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
pyspark cloudpickle cuda-python=11.7.0
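Both RAPIDS-consuming images now receive the RAPIDS version as a build argument, so the cudf/rmm/dask-cuda/dask-cudf conda pins track a single value instead of a hard-coded 22.04. A hypothetical manual build showing how the two arguments reach this Dockerfile (in CI, tests/ci_build/ci_build.sh forwards them, as in the earlier hunks; the image tag and Dockerfile path here are illustrative):

    docker build \
      --build-arg CUDA_VERSION_ARG=11.0.3 \
      --build-arg RAPIDS_VERSION_ARG=22.10 \
      -t xgboost-ci-gpu:local \
      -f tests/ci_build/Dockerfile.gpu .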


@@ -1,6 +1,7 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu18.04
ARG CUDA_VERSION_ARG
ARG RAPIDS_VERSION_ARG
# Environment
ENV DEBIAN_FRONTEND noninteractive
@@ -27,7 +28,7 @@ ENV PATH=/opt/python/bin:$PATH
# Create new Conda environment with RMM
RUN \
conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
python=3.9 rmm=22.04* cudatoolkit=$CUDA_VERSION_ARG cmake
python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake
ENV GOSU_VERSION 1.10


@ -67,9 +67,6 @@ case "$suite" in
setup_pyspark_envs
pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/python-gpu
unset_pyspark_envs
cd tests/distributed
./runtests-gpu.sh
uninstall_xgboost
;;
@ -80,8 +77,6 @@ case "$suite" in
setup_pyspark_envs
pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python
unset_pyspark_envs
cd tests/distributed
./runtests.sh
uninstall_xgboost
;;


@@ -1,84 +0,0 @@
"""Distributed GPU tests."""
import sys
import xgboost as xgb
import os
import numpy as np


def run_test(name, params_fun):
    """Runs a distributed GPU test."""
    # Always call this before using distributed module
    with xgb.rabit.RabitContext():
        rank = xgb.rabit.get_rank()
        world = xgb.rabit.get_world_size()

        # Load file, file will be automatically sharded in distributed mode.
        dtrain = xgb.DMatrix('../../demo/data/agaricus.txt.train')
        dtest = xgb.DMatrix('../../demo/data/agaricus.txt.test')

        params, n_rounds = params_fun(rank)

        # Specify validations set to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]

        # Run training, all the features in training API is available.
        # Currently, this script only support calling train once for fault recovery purpose.
        bst = xgb.train(params, dtrain, n_rounds, watchlist, early_stopping_rounds=2)

        # Have each worker save its model
        model_name = "test.model.%s.%d" % (name, rank)
        bst.dump_model(model_name, with_stats=True)
        xgb.rabit.allreduce(np.ones((1, 1)), xgb.rabit.Op.MAX)  # sync

        xgb.rabit.tracker_print("Finished training\n")

        if (rank == 0):
            for i in range(0, world):
                model_name_root = "test.model.%s.%d" % (name, i)
                for j in range(0, world):
                    if i == j:
                        continue
                    with open(model_name_root, 'r') as model_root:
                        contents_root = model_root.read()
                    model_name_rank = "test.model.%s.%d" % (name, j)
                    with open(model_name_rank, 'r') as model_rank:
                        contents_rank = model_rank.read()
                    if contents_root != contents_rank:
                        raise Exception(
                            ('Worker models diverged: test.model.%s.%d '
                             'differs from test.model.%s.%d') % (name, i, name, j))


base_params = {
    'tree_method': 'gpu_hist',
    'max_depth': 2,
    'eta': 1,
    'verbosity': 0,
    'objective': 'binary:logistic',
    'debug_synchronize': True
}


def params_basic_1x4(rank):
    return dict(base_params, **{
        'gpu_id': rank,
    }), 20


rf_update_params = {
    'subsample': 0.5,
    'colsample_bynode': 0.5
}


def wrap_rf(params_fun):
    def wrapped_params_fun(rank):
        params, n_estimators = params_fun(rank)
        rf_params = dict(rf_update_params, num_parallel_tree=n_estimators)
        return dict(params, **rf_params), 1
    return wrapped_params_fun


params_rf_1x4 = wrap_rf(params_basic_1x4)

test_name = sys.argv[1]
run_test(test_name, globals()['params_%s' % test_name])


@@ -1,14 +0,0 @@
#!/bin/bash
rm -f *.model*
export DMLC_SUBMIT_CLUSTER=local
submit="timeout 30 python ../../dmlc-core/tracker/dmlc-submit"
echo -e "\n ====== 1. Basic distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n"
$submit --num-workers=$(nvidia-smi -L | wc -l) python distributed_gpu.py basic_1x4 || exit 1
rm test.model.*
echo -e "\n ====== 2. RF distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n"
$submit --num-workers=$(nvidia-smi -L | wc -l) python distributed_gpu.py rf_1x4 || exit 1
rm test.model.*


@@ -1,13 +0,0 @@
#!/bin/bash
rm -f *.model*
export DMLC_SUBMIT_CLUSTER=mpi
submit="timeout 5 python ../../dmlc-core/tracker/dmlc-submit"
echo "====== 1. Basic distributed test with Python ======"
$submit --cluster=local --num-workers=3 python test_basic.py
echo "====== 2. Regression test for issue #3402 ======"
$submit --cluster=local --num-workers=2 --worker-cores=1 python test_issue3402.py


@@ -1,13 +0,0 @@
#!/bin/bash
rm -f *.model*
export DMLC_SUBMIT_CLUSTER=local
submit="timeout 30 python ../../dmlc-core/tracker/dmlc-submit"
echo "====== 1. Basic distributed test with Python ======"
$submit --cluster=local --num-workers=3 python test_basic.py
echo "====== 2. Regression test for issue #3402 ======"
$submit --cluster=local --num-workers=2 --worker-cores=1 python test_issue3402.py


@@ -1,24 +0,0 @@
#!/usr/bin/python
import xgboost as xgb

# Always call this before using distributed module
with xgb.rabit.RabitContext():
    # Load file, file will be automatically sharded in distributed mode.
    dtrain = xgb.DMatrix('../../demo/data/agaricus.txt.train')
    dtest = xgb.DMatrix('../../demo/data/agaricus.txt.test')

    # Specify parameters via map, definition are same as c++ version
    param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}

    # Specify validations set to watch performance
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    num_round = 20

    # Run training, all the features in training API is available.
    # Currently, this script only support calling train once for fault recovery purpose.
    bst = xgb.train(param, dtrain, num_round, watchlist, early_stopping_rounds=2)

    # Save the model, only ask process 0 to save the model.
    if xgb.rabit.get_rank() == 0:
        bst.save_model("test.model")

    xgb.rabit.tracker_print("Finished training\n")


@@ -1,74 +0,0 @@
#!/usr/bin/python
import xgboost as xgb
import numpy as np

with xgb.rabit.RabitContext():
    X = [
[15.00,28.90,29.00,3143.70,0.00,0.10,69.90,90.00,13726.07,0.00,2299.70,0.00,0.05,
4327.03,0.00,24.00,0.18,3.00,0.41,3.77,0.00,0.00,4.00,0.00,150.92,0.00,2.00,0.00,
0.01,138.00,1.00,0.02,69.90,0.00,0.83,5.00,0.01,0.12,47.30,0.00,296.00,0.16,0.00,
0.00,27.70,7.00,7.25,4406.16,1.00,0.54,245.28,3.00,0.06,306.50,5143.00,29.00,23.74,
548.00,2.00,68.00,70.90,25.45,0.39,0.00,0.01,497.11,0.00,42.00,83.00,4.00,0.00,1.00,
0.00,104.35,94.12,0.03,79.23,237.69,1.00,0.04,0.01,0.02,2.00,108.81,7.00,12.00,0.46,
31.00,0.00,0.15,74.59,0.00,19.50,0.00,0.75,0.06,0.08,118.00,35.90,0.01,0.07,1.00,
0.03,81.18,13.33,0.00,0.00,0.00,0.00,0.00,0.41,0.00,0.15,57.00,0.00,22.00,449.68,
0.00,0.00,2.00,195.26,51.58,306.50,0.10,1.00,0.00,258.00,21.00,0.43,3.00,16.00,0.00,
0.00,0.00,0.00,1.00,74.51,4.00,0.02,35.90,30.00,8.69,0.00,0.36,5.00,2.00,3.00,0.26,
9.50,8.00,11.00,11918.15,0.00,258.00,13.00,9.04,0.14,604.65,0.92,74.59,0.00,0.00,
72.76,1.00,0.22,64.00,2.00,0.00,0.00,0.02,0.00,305.50,27.70,0.02,0.00,177.00,14.00,
0.00,0.05,90.00,0.03,0.00,1.00,0.43,4.00,0.05,0.09,431.00,0.00,2.00,0.00,0.00,1.00,
0.25,0.17,0.00,0.00,21.00,94.12,0.17,0.00,0.00,0.00,548.00,0.00,68.00,0.00,0.00,9.50,
25.45,1390.31,7.00,0.00,2.00,310.70,0.00,0.01,0.01,0.03,81.40,1.00,0.02,0.00,9.00,
6.00,0.00,175.76,36.00,0.00,20.75,2.00,0.00,0.00,0.00,0.22,74.16,0.10,56.81,0.00,
2197.03,0.00,197.66,0.00,55.00,20.00,367.18,22.00,0.00,0.01,1510.26,0.24,0.00,0.01,
0.00,11.00,278.10,61.70,278.10,0.00,0.08,0.57,1.00,0.65,255.60,0.00,0.86,0.25,70.95,
2299.70,0.23,0.05,92.70,1.00,38.00,0.00,0.00,56.81,21.85,0.00,23.74,0.00,2.00,0.03,
2.00,0.00,347.58,30.00,243.55,109.00,0.00,296.00,6.00,6.00,0.00,0.00,109.00,2299.70,
0.00,0.01,0.08,1.00,4745.09,4.00,0.18,0.00,0.17,0.02,0.00,1.00,147.13,71.07,2115.16,
0.00,0.26,0.00,43.00,604.90,49.44,4327.03,0.68,0.75,0.10,86.36,52.98,0.20,0.00,22.50,
305.50,0.00,1.00,0.00,7.00,0.78,0.00,296.00,22.50,0.00,5.00,2979.54,1.00,14.00,51.00,
0.42,0.11,0.00,1.00,0.00,0.00,70.90,37.84,0.02,548.40,0.00,46.35,5.00,1.66,0.29,0.00,
0.02,2255.69,160.53,790.64,6775.15,0.68,19.50,2299.70,79.87,6.00,0.00,60.00,0.27,
233.77,10.00,0.00,0.00,23.00,82.27,1.00,0.00,1.00,0.42,1.00,0.01,0.40,0.41,9.50,2299.70,
46.30,0.00,0.00,2299.70,3.00,0.00,0.00,83.00,1.00],
[48.00,80.89,69.90,11570.00,26.00,0.40,468.00,0.00,5739.46,0.00,1480.00,90.89,0.00,
14042.09,3600.08,120.00,0.09,31.00,0.25,2.36,0.00,7.00,22.00,0.00,257.59,0.00,6.00,
260.00,0.05,313.00,1.00,0.07,468.00,0.00,0.67,11.00,0.02,0.32,0.00,0.00,1387.61,0.34,
0.00,0.00,158.04,6.00,13.98,12380.05,0.00,0.16,122.74,3.00,0.18,291.33,7517.79,124.00,
45.08,900.00,1.00,0.00,577.25,79.75,0.39,0.00,0.00,244.62,0.00,57.00,178.00,19.00,
0.00,1.00,386.10,103.51,480.00,0.06,129.41,334.31,1.00,0.06,0.00,0.06,3.00,125.55,
0.00,76.00,0.14,30.00,0.00,0.03,411.29,791.33,55.00,0.12,3.80,0.07,0.01,188.00,221.11,
0.01,0.15,1.00,0.18,144.32,15.00,0.00,0.05,0.00,3.00,0.00,0.20,0.00,0.14,62.00,0.06,
55.00,239.35,0.00,0.00,2.00,534.20,747.50,400.57,0.40,0.00,0.00,219.98,30.00,0.25,
1.00,70.00,0.02,0.04,0.00,0.00,7.00,747.50,8.67,0.06,271.01,28.00,5.63,75.39,0.46,
11.00,3.00,19.00,0.38,131.74,23.00,39.00,30249.41,0.00,202.68,2.00,64.94,0.03,2787.68,
0.54,35.00,0.02,106.03,25.00,1.00,0.10,45.00,2.00,0.00,0.00,0.00,0.00,449.27,172.38,
0.05,0.00,550.00,130.00,2006.55,0.07,0.00,0.03,0.00,5.00,0.21,22.00,0.05,0.01,1011.40,
0.00,4.00,3600.08,0.00,1.00,1.00,1.00,0.00,3.00,9.00,270.00,0.12,0.03,0.00,0.00,820.00,
1827.50,0.00,100.33,0.00,131.74,53.16,9557.97,7.00,0.00,11.00,180.81,0.00,0.01,0.04,
0.02,1480.00,0.92,0.05,0.00,15.00,6.00,0.00,161.42,28.00,169.00,35.60,4.00,0.12,0.00,
0.00,0.27,230.56,0.42,171.90,0.00,28407.51,1.00,883.10,0.00,261.00,9.00,1031.67,38.00,
0.00,0.04,1607.68,0.32,791.33,0.04,1403.00,2.00,2260.50,88.08,2260.50,0.00,0.12,0.75,
3.00,0.00,1231.68,0.07,0.60,0.24,0.00,0.00,0.15,0.14,753.50,1.00,95.00,7.00,0.26,
77.63,38.45,0.00,42.65,0.00,14.00,0.07,6.00,0.00,1911.59,43.00,386.77,1324.80,0.00,
518.00,10.00,10.00,0.11,0.00,1324.80,0.00,0.00,0.02,0.16,1.00,10492.12,5.00,0.94,
5.00,0.08,0.10,1.00,0.92,3731.49,105.81,6931.39,0.00,0.43,0.00,118.00,5323.71,81.66,
14042.09,0.08,0.20,0.40,96.64,0.00,0.08,4.00,1028.82,353.00,0.00,2.00,32.00,43.00,
5.16,75.39,900.00,232.10,3.00,5.00,6049.88,1.00,126.00,46.00,0.59,0.15,0.00,8.00,
7.00,0.00,577.25,0.00,0.07,2415.10,0.00,83.72,9.00,1.76,0.20,0.00,0.17,3278.65,155.26,
4415.50,22731.62,1.00,55.00,0.00,499.94,22.00,0.58,67.00,0.21,341.72,16.00,0.00,965.07,
17.00,138.41,0.00,0.00,1.00,0.14,1.00,0.02,0.35,1.69,369.00,1300.00,25.00,0.00,0.01,
0.00,0.00,0.00,0.00,52.00,8.00]]
    X = np.array(X)
    y = [1, 0]
    dtrain = xgb.DMatrix(X, label=y)
    param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
    watchlist = [(dtrain, 'train')]
    num_round = 2
    bst = xgb.train(param, dtrain, num_round, watchlist)
    if xgb.rabit.get_rank() == 0:
        bst.save_model("test_issue3402.model")
    xgb.rabit.tracker_print("Finished training\n")