merge latest change from upstream
This commit is contained in:
@@ -24,7 +24,7 @@ set -x
|
||||
|
||||
CUDA_VERSION=11.8.0
|
||||
NCCL_VERSION=2.16.5-1
|
||||
RAPIDS_VERSION=24.02
|
||||
RAPIDS_VERSION=24.04
|
||||
SPARK_VERSION=3.4.0
|
||||
JDK_VERSION=8
|
||||
R_VERSION=4.3.2
|
||||
@@ -39,13 +39,14 @@ fi
|
||||
if [[ -n $BUILDKITE_PULL_REQUEST && $BUILDKITE_PULL_REQUEST != "false" ]]
|
||||
then
|
||||
is_pull_request=1
|
||||
export BRANCH_NAME=PR-$BUILDKITE_PULL_REQUEST
|
||||
BRANCH_NAME=PR-$BUILDKITE_PULL_REQUEST
|
||||
else
|
||||
is_pull_request=0
|
||||
export BRANCH_NAME=$BUILDKITE_BRANCH
|
||||
BRANCH_NAME=$BUILDKITE_BRANCH
|
||||
fi
|
||||
export BRANCH_NAME=${BRANCH_NAME//\//-}
|
||||
|
||||
if [[ $BUILDKITE_BRANCH == "master" || $BUILDKITE_BRANCH == "release_"* ]]
|
||||
if [[ $BRANCH_NAME == "master" || $BRANCH_NAME == "release_"* ]]
|
||||
then
|
||||
is_release_branch=1
|
||||
enforce_daily_budget=0
|
||||
|
||||
106
tests/buildkite/infrastructure/README.md
Normal file
106
tests/buildkite/infrastructure/README.md
Normal file
@@ -0,0 +1,106 @@
|
||||
BuildKite CI Infrastructure
|
||||
===========================
|
||||
|
||||
# Worker image builder (`worker-image-pipeline/`)
|
||||
|
||||
Use EC2 Image Builder to build machine images in a deterministic fashion.
|
||||
The machine images are used to initialize workers in the CI/CD pipelines.
|
||||
|
||||
## Editing bootstrap scripts
|
||||
|
||||
Currently, we create two pipelines for machine images: one for Linux workers and another
|
||||
for Windows workers.
|
||||
You can edit the bootstrap scripts to change how the worker machines are initialized.
|
||||
|
||||
* `linux-amd64-gpu-bootstrap.yml`: Bootstrap script for Linux worker machines
|
||||
* `windows-gpu-bootstrap.yml`: Bootstrap script for Windows worker machines
|
||||
|
||||
## Creating and running Image Builder pipelines
|
||||
|
||||
Run the following commands to create and run pipelines in EC2 Image Builder service:
|
||||
```bash
|
||||
python worker-image-pipeline/create_worker_image_pipelines.py --aws-region us-west-2
|
||||
python worker-image-pipeline/run_pipelines.py --aws-region us-west-2
|
||||
```
|
||||
Go to the AWS CloudFormation console and verify the existence of two CloudFormation stacks:
|
||||
* `buildkite-windows-gpu-worker`
|
||||
* `buildkite-linux-amd64-gpu-worker`
|
||||
|
||||
Then go to the EC2 Image Builder console to check the status of the image builds. You may
|
||||
want to inspect the log output should a build fail.
|
||||
Once the new machine images are done building, see the next section to deploy the new
|
||||
images to the worker machines.
|
||||
|
||||
# Elastic CI Stack for AWS (`aws-stack-creator/`)
|
||||
|
||||
Use EC2 Autoscaling groups to launch worker machines in EC2. BuildKite periodically sends
|
||||
messages to the Autoscaling groups to increase or decrease the number of workers according
|
||||
to the number of outstanding testing jobs.
|
||||
|
||||
## Deploy an updated CI stack with new machine images
|
||||
|
||||
First, edit `aws-stack-creator/metadata.py` to update the `AMI_ID` fields:
|
||||
```python
|
||||
AMI_ID = {
|
||||
# Managed by XGBoost team
|
||||
"linux-amd64-gpu": {
|
||||
"us-west-2": "...",
|
||||
},
|
||||
"linux-amd64-mgpu": {
|
||||
"us-west-2": "...",
|
||||
},
|
||||
"windows-gpu": {
|
||||
"us-west-2": "...",
|
||||
},
|
||||
"windows-cpu": {
|
||||
"us-west-2": "...",
|
||||
},
|
||||
# Managed by BuildKite
|
||||
# from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml
|
||||
"linux-amd64-cpu": {
|
||||
"us-west-2": "...",
|
||||
},
|
||||
"pipeline-loader": {
|
||||
"us-west-2": "...",
|
||||
},
|
||||
"linux-arm64-cpu": {
|
||||
"us-west-2": "...",
|
||||
},
|
||||
}
|
||||
```
|
||||
AMI IDs uniquely identify the machine images in the EC2 service.
|
||||
Go to the EC2 Image Builder console to find the AMI IDs for the new machine images
|
||||
(see the previous section), and update the following fields:
|
||||
|
||||
* `AMI_ID["linux-amd64-gpu"]["us-west-2"]`:
|
||||
Use the latest output from the `buildkite-linux-amd64-gpu-worker` pipeline
|
||||
* `AMI_ID["linux-amd64-mgpu"]["us-west-2"]`:
|
||||
Should be identical to `AMI_ID["linux-amd64-gpu"]["us-west-2"]`
|
||||
* `AMI_ID["windows-gpu"]["us-west-2"]`:
|
||||
Use the latest output from the `buildkite-windows-gpu-worker` pipeline
|
||||
* `AMI_ID["windows-cpu"]["us-west-2"]`:
|
||||
Should be identical to `AMI_ID["windows-gpu"]["us-west-2"]`
|
||||
|
||||
Next, visit https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml
|
||||
to look up the AMI IDs for the following fields:
|
||||
|
||||
* `AMI_ID["linux-amd64-cpu"]["us-west-2"]`: Copy and paste the AMI ID from the field
|
||||
`Mappings/AWSRegion2AMI/us-west-2/linuxamd64`
|
||||
* `AMI_ID["pipeline-loader"]["us-west-2"]`:
|
||||
Should be identical to `AMI_ID["linux-amd64-cpu"]["us-west-2"]`
|
||||
* `AMI_ID["linux-arm64-cpu"]["us-west-2"]`: Copy and paste the AMI ID from the field
|
||||
`Mappings/AWSRegion2AMI/us-west-2/linuxarm64`
|
||||
|
||||
Finally, run the following commands to deploy the new machine images:
|
||||
```
|
||||
python aws-stack-creator/create_stack.py --aws-region us-west-2 --agent-token AGENT_TOKEN
|
||||
```
|
||||
Go to the AWS CloudFormation console and verify the existence of the following
|
||||
CloudFormation stacks:
|
||||
* `buildkite-pipeline-loader-autoscaling-group`
|
||||
* `buildkite-linux-amd64-cpu-autoscaling-group`
|
||||
* `buildkite-linux-amd64-gpu-autoscaling-group`
|
||||
* `buildkite-linux-amd64-mgpu-autoscaling-group`
|
||||
* `buildkite-linux-arm64-cpu-autoscaling-group`
|
||||
* `buildkite-windows-cpu-autoscaling-group`
|
||||
* `buildkite-windows-gpu-autoscaling-group`
|
||||
@@ -1,27 +1,27 @@
|
||||
AMI_ID = {
|
||||
# Managed by XGBoost team
|
||||
"linux-amd64-gpu": {
|
||||
"us-west-2": "ami-08c3bc1dd5ec8bc5c",
|
||||
"us-west-2": "ami-070080d04e81c5e39",
|
||||
},
|
||||
"linux-amd64-mgpu": {
|
||||
"us-west-2": "ami-08c3bc1dd5ec8bc5c",
|
||||
"us-west-2": "ami-070080d04e81c5e39",
|
||||
},
|
||||
"windows-gpu": {
|
||||
"us-west-2": "ami-03c7f2156f93b22a7",
|
||||
"us-west-2": "ami-07c14abcf529d816a",
|
||||
},
|
||||
"windows-cpu": {
|
||||
"us-west-2": "ami-03c7f2156f93b22a7",
|
||||
"us-west-2": "ami-07c14abcf529d816a",
|
||||
},
|
||||
# Managed by BuildKite
|
||||
# from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml
|
||||
"linux-amd64-cpu": {
|
||||
"us-west-2": "ami-015e64acb52b3e595",
|
||||
"us-west-2": "ami-0180f7fb0f07eb0bc",
|
||||
},
|
||||
"pipeline-loader": {
|
||||
"us-west-2": "ami-015e64acb52b3e595",
|
||||
"us-west-2": "ami-0180f7fb0f07eb0bc",
|
||||
},
|
||||
"linux-arm64-cpu": {
|
||||
"us-west-2": "ami-0884e9c23a2fa98d0",
|
||||
"us-west-2": "ami-00686bdc2043a5505",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -15,9 +15,9 @@ phases:
|
||||
choco --version
|
||||
choco feature enable -n=allowGlobalConfirmation
|
||||
|
||||
# CMake 3.27
|
||||
Write-Host '>>> Installing CMake 3.27...'
|
||||
choco install cmake --version 3.27.9 --installargs "ADD_CMAKE_TO_PATH=System"
|
||||
# CMake 3.29.2
|
||||
Write-Host '>>> Installing CMake 3.29.2...'
|
||||
choco install cmake --version 3.29.2 --installargs "ADD_CMAKE_TO_PATH=System"
|
||||
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||
|
||||
# Notepad++
|
||||
@@ -53,9 +53,9 @@ phases:
|
||||
"--wait --passive --norestart --includeOptional"
|
||||
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||
|
||||
# Install CUDA 11.8
|
||||
Write-Host '>>> Installing CUDA 11.8...'
|
||||
choco install cuda --version=11.8.0.52206
|
||||
# Install CUDA 12.4
|
||||
Write-Host '>>> Installing CUDA 12.4...'
|
||||
choco install cuda --version=12.4.1.551
|
||||
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||
|
||||
# Install R
|
||||
|
||||
@@ -21,14 +21,14 @@ ENV PATH=/opt/mambaforge/bin:$PATH
|
||||
|
||||
# Create new Conda environment with cuDF, Dask, and cuPy
|
||||
RUN \
|
||||
conda install -c conda-forge mamba && \
|
||||
mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
|
||||
export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \
|
||||
mamba create -y -n gpu_test -c rapidsai -c nvidia -c conda-forge \
|
||||
python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
|
||||
nccl>=$(cut -d "-" -f 1 << $NCCL_VERSION_ARG) \
|
||||
"nccl>=${NCCL_SHORT_VER}" \
|
||||
dask=2024.1.1 \
|
||||
dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
|
||||
numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
|
||||
pyspark>=3.4.0 cloudpickle cuda-python && \
|
||||
"pyspark>=3.4.0" cloudpickle cuda-python && \
|
||||
mamba clean --all && \
|
||||
conda run --no-capture-output -n gpu_test pip install buildkite-test-collector
|
||||
|
||||
|
||||
@@ -15,9 +15,9 @@ RUN \
|
||||
wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
|
||||
bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
|
||||
# Maven
|
||||
wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
|
||||
tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
|
||||
ln -s /opt/apache-maven-3.6.1/ /opt/maven
|
||||
wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz && \
|
||||
tar xvf apache-maven-3.6.3-bin.tar.gz -C /opt && \
|
||||
ln -s /opt/apache-maven-3.6.3/ /opt/maven
|
||||
|
||||
ENV PATH=/opt/mambaforge/bin:/opt/maven/bin:$PATH
|
||||
ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc
|
||||
|
||||
@@ -17,9 +17,9 @@ RUN \
|
||||
bash conda.sh -b -p /opt/mambaforge && \
|
||||
/opt/mambaforge/bin/pip install awscli && \
|
||||
# Maven
|
||||
wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
|
||||
tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
|
||||
ln -s /opt/apache-maven-3.6.1/ /opt/maven && \
|
||||
wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz && \
|
||||
tar xvf apache-maven-3.6.3-bin.tar.gz -C /opt && \
|
||||
ln -s /opt/apache-maven-3.6.3/ /opt/maven && \
|
||||
# Spark with scala 2.12
|
||||
mkdir -p /opt/spark-scala-2.12 && \
|
||||
wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz && \
|
||||
|
||||
@@ -18,9 +18,9 @@ RUN \
|
||||
wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
|
||||
bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
|
||||
# Maven
|
||||
wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
|
||||
tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
|
||||
ln -s /opt/apache-maven-3.6.1/ /opt/maven
|
||||
wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz && \
|
||||
tar xvf apache-maven-3.6.3-bin.tar.gz -C /opt && \
|
||||
ln -s /opt/apache-maven-3.6.3/ /opt/maven
|
||||
|
||||
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
|
||||
RUN \
|
||||
|
||||
@@ -81,7 +81,7 @@ target_include_directories(testxgboost
|
||||
${xgboost_SOURCE_DIR}/rabit/include)
|
||||
target_link_libraries(testxgboost
|
||||
PRIVATE
|
||||
${GTEST_LIBRARIES})
|
||||
GTest::gtest GTest::gmock)
|
||||
|
||||
set_output_directory(testxgboost ${xgboost_BINARY_DIR})
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
* Copyright 2023-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h> // for ASSERT_EQ
|
||||
#include <xgboost/span.h> // for Span, oper...
|
||||
@@ -34,8 +34,8 @@ class Worker : public WorkerForTest {
|
||||
std::vector<std::int32_t> data(comm_.World(), 0);
|
||||
data[comm_.Rank()] = comm_.Rank();
|
||||
|
||||
auto rc = RingAllgather(this->comm_, common::Span{data.data(), data.size()}, 1);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
auto rc = RingAllgather(this->comm_, common::Span{data.data(), data.size()});
|
||||
SafeColl(rc);
|
||||
|
||||
for (std::int32_t r = 0; r < comm_.World(); ++r) {
|
||||
ASSERT_EQ(data[r], r);
|
||||
@@ -51,8 +51,8 @@ class Worker : public WorkerForTest {
|
||||
auto seg = s_data.subspan(comm_.Rank() * n, n);
|
||||
std::iota(seg.begin(), seg.end(), comm_.Rank());
|
||||
|
||||
auto rc = RingAllgather(comm_, common::Span{data.data(), data.size()}, n);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
auto rc = RingAllgather(comm_, common::Span{data.data(), data.size()});
|
||||
SafeColl(rc);
|
||||
|
||||
for (std::int32_t r = 0; r < comm_.World(); ++r) {
|
||||
auto seg = s_data.subspan(r * n, n);
|
||||
@@ -81,7 +81,7 @@ class Worker : public WorkerForTest {
|
||||
std::vector<std::int32_t> data(comm_.Rank() + 1, comm_.Rank());
|
||||
std::vector<std::int32_t> result;
|
||||
auto rc = RingAllgatherV(comm_, common::Span{data.data(), data.size()}, &result);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
ASSERT_EQ(result.size(), (1 + comm_.World()) * comm_.World() / 2);
|
||||
CheckV(result);
|
||||
}
|
||||
@@ -91,7 +91,7 @@ class Worker : public WorkerForTest {
|
||||
std::int32_t n{comm_.Rank()};
|
||||
std::vector<std::int32_t> result;
|
||||
auto rc = RingAllgatherV(comm_, common::Span{&n, 1}, &result);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
for (std::int32_t i = 0; i < comm_.World(); ++i) {
|
||||
ASSERT_EQ(result[i], i);
|
||||
}
|
||||
@@ -104,8 +104,8 @@ class Worker : public WorkerForTest {
|
||||
|
||||
std::vector<std::int64_t> sizes(comm_.World(), 0);
|
||||
sizes[comm_.Rank()] = s_data.size_bytes();
|
||||
auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()}, 1);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()});
|
||||
SafeColl(rc);
|
||||
std::shared_ptr<Coll> pcoll{new Coll{}};
|
||||
|
||||
std::vector<std::int64_t> recv_segments(comm_.World() + 1, 0);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
* Copyright 2023-2024, XGBoost Contributors
|
||||
*/
|
||||
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
|
||||
#include <gtest/gtest.h>
|
||||
@@ -33,8 +33,8 @@ class Worker : public NCCLWorkerForTest {
|
||||
// get size
|
||||
std::vector<std::int64_t> sizes(comm_.World(), -1);
|
||||
sizes[comm_.Rank()] = s_data.size_bytes();
|
||||
auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()}, 1);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()});
|
||||
SafeColl(rc);
|
||||
// create result
|
||||
dh::device_vector<std::int32_t> result(comm_.World(), -1);
|
||||
auto s_result = common::EraseType(dh::ToSpan(result));
|
||||
@@ -42,7 +42,7 @@ class Worker : public NCCLWorkerForTest {
|
||||
std::vector<std::int64_t> recv_seg(nccl_comm_->World() + 1, 0);
|
||||
rc = nccl_coll_->AllgatherV(*nccl_comm_, s_data, common::Span{sizes.data(), sizes.size()},
|
||||
common::Span{recv_seg.data(), recv_seg.size()}, s_result, algo);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
|
||||
for (std::int32_t i = 0; i < comm_.World(); ++i) {
|
||||
ASSERT_EQ(result[i], i);
|
||||
@@ -57,8 +57,8 @@ class Worker : public NCCLWorkerForTest {
|
||||
// get size
|
||||
std::vector<std::int64_t> sizes(nccl_comm_->World(), 0);
|
||||
sizes[comm_.Rank()] = dh::ToSpan(data).size_bytes();
|
||||
auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()}, 1);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()});
|
||||
SafeColl(rc);
|
||||
auto n_bytes = std::accumulate(sizes.cbegin(), sizes.cend(), 0);
|
||||
// create result
|
||||
dh::device_vector<std::int32_t> result(n_bytes / sizeof(std::int32_t), -1);
|
||||
@@ -67,7 +67,7 @@ class Worker : public NCCLWorkerForTest {
|
||||
std::vector<std::int64_t> recv_seg(nccl_comm_->World() + 1, 0);
|
||||
rc = nccl_coll_->AllgatherV(*nccl_comm_, s_data, common::Span{sizes.data(), sizes.size()},
|
||||
common::Span{recv_seg.data(), recv_seg.size()}, s_result, algo);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
// check segment size
|
||||
if (algo != AllgatherVAlgo::kBcast) {
|
||||
auto size = recv_seg[nccl_comm_->Rank() + 1] - recv_seg[nccl_comm_->Rank()];
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
* Copyright 2023-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <numeric> // for iota
|
||||
|
||||
#include "../../../src/collective/allreduce.h"
|
||||
#include "../../../src/collective/coll.h" // for Coll
|
||||
#include "../../../src/collective/tracker.h"
|
||||
#include "../../../src/common/type.h" // for EraseType
|
||||
#include "test_worker.h" // for WorkerForTest, TestDistributed
|
||||
|
||||
@@ -58,7 +59,7 @@ class AllreduceWorker : public WorkerForTest {
|
||||
auto pcoll = std::shared_ptr<Coll>{new Coll{}};
|
||||
auto rc = pcoll->Allreduce(comm_, common::EraseType(common::Span{data.data(), data.size()}),
|
||||
ArrayInterfaceHandler::kU4, Op::kBitwiseOR);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
for (auto v : data) {
|
||||
ASSERT_EQ(v, ~std::uint32_t{0});
|
||||
}
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
* Copyright 2023-2024, XGBoost Contributors
|
||||
*/
|
||||
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
|
||||
#include <gtest/gtest.h>
|
||||
#include <thrust/host_vector.h> // for host_vector
|
||||
|
||||
#include "../../../src/common/common.h"
|
||||
#include "../../../src/common/common.h" // for AllVisibleGPUs
|
||||
#include "../../../src/common/device_helpers.cuh" // for ToSpan, device_vector
|
||||
#include "../../../src/common/type.h" // for EraseType
|
||||
#include "test_worker.cuh" // for NCCLWorkerForTest
|
||||
@@ -24,7 +24,7 @@ class Worker : public NCCLWorkerForTest {
|
||||
data[comm_.Rank()] = ~std::uint32_t{0};
|
||||
auto rc = nccl_coll_->Allreduce(*nccl_comm_, common::EraseType(dh::ToSpan(data)),
|
||||
ArrayInterfaceHandler::kU4, Op::kBitwiseOR);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
thrust::host_vector<std::uint32_t> h_data(data.size());
|
||||
thrust::copy(data.cbegin(), data.cend(), h_data.begin());
|
||||
for (auto v : h_data) {
|
||||
@@ -36,7 +36,7 @@ class Worker : public NCCLWorkerForTest {
|
||||
dh::device_vector<double> data(314, 1.5);
|
||||
auto rc = nccl_coll_->Allreduce(*nccl_comm_, common::EraseType(dh::ToSpan(data)),
|
||||
ArrayInterfaceHandler::kF8, Op::kSum);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
for (std::size_t i = 0; i < data.size(); ++i) {
|
||||
auto v = data[i];
|
||||
ASSERT_EQ(v, 1.5 * static_cast<double>(comm_.World())) << i;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
* Copyright 2023-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/collective/socket.h>
|
||||
@@ -10,7 +10,6 @@
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../../../src/collective/broadcast.h" // for Broadcast
|
||||
#include "../../../src/collective/tracker.h" // for GetHostAddress
|
||||
#include "test_worker.h" // for WorkerForTest, TestDistributed
|
||||
|
||||
namespace xgboost::collective {
|
||||
@@ -24,14 +23,14 @@ class Worker : public WorkerForTest {
|
||||
// basic test
|
||||
std::vector<std::int32_t> data(1, comm_.Rank());
|
||||
auto rc = Broadcast(this->comm_, common::Span{data.data(), data.size()}, r);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
ASSERT_EQ(data[0], r);
|
||||
}
|
||||
|
||||
for (std::int32_t r = 0; r < comm_.World(); ++r) {
|
||||
std::vector<std::int32_t> data(1 << 16, comm_.Rank());
|
||||
auto rc = Broadcast(this->comm_, common::Span{data.data(), data.size()}, r);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
ASSERT_EQ(data[0], r);
|
||||
}
|
||||
}
|
||||
@@ -41,11 +40,11 @@ class BroadcastTest : public SocketTest {};
|
||||
} // namespace
|
||||
|
||||
TEST_F(BroadcastTest, Basic) {
|
||||
std::int32_t n_workers = std::min(7u, std::thread::hardware_concurrency());
|
||||
std::int32_t n_workers = std::min(2u, std::thread::hardware_concurrency());
|
||||
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
Worker worker{host, port, timeout, n_workers, r};
|
||||
worker.Run();
|
||||
});
|
||||
} // namespace
|
||||
}
|
||||
} // namespace xgboost::collective
|
||||
|
||||
@@ -25,13 +25,13 @@ TEST_F(TrackerAPITest, CAPI) {
|
||||
auto config_str = Json::Dump(config);
|
||||
auto rc = XGTrackerCreate(config_str.c_str(), &handle);
|
||||
ASSERT_EQ(rc, 0);
|
||||
rc = XGTrackerRun(handle);
|
||||
rc = XGTrackerRun(handle, nullptr);
|
||||
ASSERT_EQ(rc, 0);
|
||||
|
||||
std::thread bg_wait{[&] {
|
||||
Json config{Object{}};
|
||||
auto config_str = Json::Dump(config);
|
||||
auto rc = XGTrackerWait(handle, config_str.c_str());
|
||||
auto rc = XGTrackerWaitFor(handle, config_str.c_str());
|
||||
ASSERT_EQ(rc, 0);
|
||||
}};
|
||||
|
||||
@@ -42,8 +42,8 @@ TEST_F(TrackerAPITest, CAPI) {
|
||||
|
||||
std::string host;
|
||||
ASSERT_TRUE(GetHostAddress(&host).OK());
|
||||
ASSERT_EQ(host, get<String const>(args["DMLC_TRACKER_URI"]));
|
||||
auto port = get<Integer const>(args["DMLC_TRACKER_PORT"]);
|
||||
ASSERT_EQ(host, get<String const>(args["dmlc_tracker_uri"]));
|
||||
auto port = get<Integer const>(args["dmlc_tracker_port"]);
|
||||
ASSERT_NE(port, 0);
|
||||
|
||||
std::vector<std::thread> workers;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
* Copyright 2023-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
@@ -14,7 +14,7 @@ class CommTest : public TrackerTest {};
|
||||
|
||||
TEST_F(CommTest, Channel) {
|
||||
auto n_workers = 4;
|
||||
RabitTracker tracker{host, n_workers, 0, timeout};
|
||||
RabitTracker tracker{MakeTrackerConfig(host, n_workers, timeout)};
|
||||
auto fut = tracker.Run();
|
||||
|
||||
std::vector<std::thread> workers;
|
||||
@@ -29,7 +29,7 @@ TEST_F(CommTest, Channel) {
|
||||
return p_chan->SendAll(
|
||||
EraseType(common::Span<std::int32_t const>{&i, static_cast<std::size_t>(1)}));
|
||||
} << [&] { return p_chan->Block(); };
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
} else {
|
||||
auto p_chan = worker.Comm().Chan(i - 1);
|
||||
std::int32_t r{-1};
|
||||
@@ -37,7 +37,7 @@ TEST_F(CommTest, Channel) {
|
||||
return p_chan->RecvAll(
|
||||
EraseType(common::Span<std::int32_t>{&r, static_cast<std::size_t>(1)}));
|
||||
} << [&] { return p_chan->Block(); };
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
ASSERT_EQ(r, i - 1);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -17,17 +17,6 @@
|
||||
|
||||
namespace xgboost::collective {
|
||||
namespace {
|
||||
auto MakeConfig(std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) {
|
||||
Json config{Object{}};
|
||||
config["dmlc_communicator"] = std::string{"rabit"};
|
||||
config["DMLC_TRACKER_URI"] = host;
|
||||
config["DMLC_TRACKER_PORT"] = port;
|
||||
config["dmlc_timeout_sec"] = static_cast<std::int64_t>(timeout.count());
|
||||
config["DMLC_TASK_ID"] = std::to_string(r);
|
||||
config["dmlc_retry"] = 2;
|
||||
return config;
|
||||
}
|
||||
|
||||
class CommGroupTest : public SocketTest {};
|
||||
} // namespace
|
||||
|
||||
@@ -36,7 +25,7 @@ TEST_F(CommGroupTest, Basic) {
|
||||
TestDistributed(n_workers, [&](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
Context ctx;
|
||||
auto config = MakeConfig(host, port, timeout, r);
|
||||
auto config = MakeDistributedTestConfig(host, port, timeout, r);
|
||||
std::unique_ptr<CommGroup> ptr{CommGroup::Create(config)};
|
||||
ASSERT_TRUE(ptr->IsDistributed());
|
||||
ASSERT_EQ(ptr->World(), n_workers);
|
||||
@@ -52,7 +41,7 @@ TEST_F(CommGroupTest, BasicGPU) {
|
||||
TestDistributed(n_workers, [&](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
auto ctx = MakeCUDACtx(r);
|
||||
auto config = MakeConfig(host, port, timeout, r);
|
||||
auto config = MakeDistributedTestConfig(host, port, timeout, r);
|
||||
std::unique_ptr<CommGroup> ptr{CommGroup::Create(config)};
|
||||
auto const& comm = ptr->Ctx(&ctx, DeviceOrd::CUDA(0));
|
||||
ASSERT_EQ(comm.TaskID(), std::to_string(r));
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
* Copyright 2023-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h> // for ASSERT_TRUE, ASSERT_EQ
|
||||
#include <xgboost/collective/socket.h> // for TCPSocket, Connect, SocketFinalize, SocketStartup
|
||||
@@ -28,18 +28,23 @@ class LoopTest : public ::testing::Test {
|
||||
|
||||
auto domain = SockDomain::kV4;
|
||||
pair_.first = TCPSocket::Create(domain);
|
||||
auto port = pair_.first.BindHost();
|
||||
pair_.first.Listen();
|
||||
std::int32_t port{0};
|
||||
auto rc = Success() << [&] {
|
||||
return pair_.first.BindHost(&port);
|
||||
} << [&] {
|
||||
return pair_.first.Listen();
|
||||
};
|
||||
SafeColl(rc);
|
||||
|
||||
auto const& addr = SockAddrV4::Loopback().Addr();
|
||||
auto rc = Connect(StringView{addr}, port, 1, timeout, &pair_.second);
|
||||
ASSERT_TRUE(rc.OK());
|
||||
rc = Connect(StringView{addr}, port, 1, timeout, &pair_.second);
|
||||
SafeColl(rc);
|
||||
rc = pair_.second.NonBlocking(true);
|
||||
ASSERT_TRUE(rc.OK());
|
||||
SafeColl(rc);
|
||||
|
||||
pair_.first = pair_.first.Accept();
|
||||
rc = pair_.first.NonBlocking(true);
|
||||
ASSERT_TRUE(rc.OK());
|
||||
SafeColl(rc);
|
||||
|
||||
loop_ = std::shared_ptr<Loop>{new Loop{timeout}};
|
||||
}
|
||||
@@ -74,8 +79,26 @@ TEST_F(LoopTest, Op) {
|
||||
loop_->Submit(rop);
|
||||
|
||||
auto rc = loop_->Block();
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
|
||||
ASSERT_EQ(rbuf[0], wbuf[0]);
|
||||
}
|
||||
|
||||
// Checks the timing contract between Loop::Submit and Loop::Block: submitting a sleep
// op must return quickly, while the following Block() call must take at least a second.
TEST_F(LoopTest, Block) {
|
||||
// We need to ensure that a blocking call doesn't go unanswered.
|
||||
// NOTE(review): presumably a 2-second sleep op -- confirm the unit of Loop::Op::Sleep.
auto op = Loop::Op::Sleep(2);
|
||||
|
||||
common::Timer t;
|
||||
t.Start();
|
||||
loop_->Submit(op);
|
||||
t.Stop();
|
||||
// submit is non-blocking
|
||||
ASSERT_LT(t.ElapsedSeconds(), 1);
|
||||
|
||||
// Time the Block() call on its own; the result is validated with SafeColl only after
// the timer is stopped.
t.Start();
|
||||
auto rc = loop_->Block();
|
||||
t.Stop();
|
||||
SafeColl(rc);
|
||||
// The elapsed time must reach at least one second once Block() has been measured.
ASSERT_GE(t.ElapsedSeconds(), 1);
|
||||
}
|
||||
} // namespace xgboost::collective
|
||||
|
||||
31
tests/cpp/collective/test_result.cc
Normal file
31
tests/cpp/collective/test_result.cc
Normal file
@@ -0,0 +1,31 @@
|
||||
/**
|
||||
* Copyright 2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/collective/result.h>
|
||||
|
||||
namespace xgboost::collective {
|
||||
// Checks that messages survive when Result objects are combined: via operator+ on two
// failures, via Fail(msg, prev) nesting, and via operator+ with a Success() on either side.
TEST(Result, Concat) {
|
||||
auto rc0 = Fail("foo");
|
||||
auto rc1 = Fail("bar");
|
||||
// Concatenating two failures: the report must mention both messages.
auto rc = std::move(rc0) + std::move(rc1);
|
||||
ASSERT_NE(rc.Report().find("foo"), std::string::npos);
|
||||
ASSERT_NE(rc.Report().find("bar"), std::string::npos);
|
||||
|
||||
// Wrapping the combined failure in a new failure must keep the earlier messages.
auto rc2 = Fail("Another", std::move(rc));
|
||||
// Shared check: the report contains all three messages.
auto assert_that = [](Result const& rc) {
|
||||
ASSERT_NE(rc.Report().find("Another"), std::string::npos);
|
||||
ASSERT_NE(rc.Report().find("foo"), std::string::npos);
|
||||
ASSERT_NE(rc.Report().find("bar"), std::string::npos);
|
||||
};
|
||||
assert_that(rc2);
|
||||
|
||||
// Success on the left-hand side must not drop the failure messages.
auto empty = Success();
|
||||
auto rc3 = std::move(empty) + std::move(rc2);
|
||||
assert_that(rc3);
|
||||
|
||||
// Success on the right-hand side must not drop them either; `empty` is re-assigned
// because it was moved from above.
empty = Success();
|
||||
auto rc4 = std::move(rc3) + std::move(empty);
|
||||
assert_that(rc4);
|
||||
}
|
||||
} // namespace xgboost::collective
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2022-2023, XGBoost Contributors
|
||||
* Copyright 2022-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/collective/socket.h>
|
||||
@@ -21,14 +21,19 @@ TEST_F(SocketTest, Basic) {
|
||||
auto run_test = [msg](SockDomain domain) {
|
||||
auto server = TCPSocket::Create(domain);
|
||||
ASSERT_EQ(server.Domain(), domain);
|
||||
auto port = server.BindHost();
|
||||
server.Listen();
|
||||
std::int32_t port{0};
|
||||
auto rc = Success() << [&] {
|
||||
return server.BindHost(&port);
|
||||
} << [&] {
|
||||
return server.Listen();
|
||||
};
|
||||
SafeColl(rc);
|
||||
|
||||
TCPSocket client;
|
||||
if (domain == SockDomain::kV4) {
|
||||
auto const& addr = SockAddrV4::Loopback().Addr();
|
||||
auto rc = Connect(StringView{addr}, port, 1, std::chrono::seconds{3}, &client);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
} else {
|
||||
auto const& addr = SockAddrV6::Loopback().Addr();
|
||||
auto rc = Connect(StringView{addr}, port, 1, std::chrono::seconds{3}, &client);
|
||||
@@ -45,7 +50,8 @@ TEST_F(SocketTest, Basic) {
|
||||
accepted.Send(msg);
|
||||
|
||||
std::string str;
|
||||
client.Recv(&str);
|
||||
rc = client.Recv(&str);
|
||||
SafeColl(rc);
|
||||
ASSERT_EQ(StringView{str}, msg);
|
||||
};
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
* Copyright 2023-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gmock/gmock.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <chrono> // for seconds
|
||||
@@ -10,6 +11,7 @@
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../../../src/collective/comm.h"
|
||||
#include "../helpers.h" // for GMockThrow
|
||||
#include "test_worker.h"
|
||||
|
||||
namespace xgboost::collective {
|
||||
@@ -20,13 +22,13 @@ class PrintWorker : public WorkerForTest {
|
||||
|
||||
void Print() {
|
||||
auto rc = comm_.LogTracker("ack:" + std::to_string(this->comm_.Rank()));
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
TEST_F(TrackerTest, Bootstrap) {
|
||||
RabitTracker tracker{host, n_workers, 0, timeout};
|
||||
RabitTracker tracker{MakeTrackerConfig(host, n_workers, timeout)};
|
||||
ASSERT_FALSE(tracker.Ready());
|
||||
auto fut = tracker.Run();
|
||||
|
||||
@@ -34,7 +36,7 @@ TEST_F(TrackerTest, Bootstrap) {
|
||||
|
||||
auto args = tracker.WorkerArgs();
|
||||
ASSERT_TRUE(tracker.Ready());
|
||||
ASSERT_EQ(get<String const>(args["DMLC_TRACKER_URI"]), host);
|
||||
ASSERT_EQ(get<String const>(args["dmlc_tracker_uri"]), host);
|
||||
|
||||
std::int32_t port = tracker.Port();
|
||||
|
||||
@@ -44,12 +46,11 @@ TEST_F(TrackerTest, Bootstrap) {
|
||||
for (auto &w : workers) {
|
||||
w.join();
|
||||
}
|
||||
|
||||
ASSERT_TRUE(fut.get().OK());
|
||||
SafeColl(fut.get());
|
||||
}
|
||||
|
||||
TEST_F(TrackerTest, Print) {
|
||||
RabitTracker tracker{host, n_workers, 0, timeout};
|
||||
RabitTracker tracker{MakeTrackerConfig(host, n_workers, timeout)};
|
||||
auto fut = tracker.Run();
|
||||
|
||||
std::vector<std::thread> workers;
|
||||
@@ -73,4 +74,47 @@ TEST_F(TrackerTest, Print) {
|
||||
}
|
||||
|
||||
TEST_F(TrackerTest, GetHostAddress) { ASSERT_TRUE(host.find("127.") == std::string::npos); }
|
||||
|
||||
/**
|
||||
* Test connecting to the tracker after it has finished. This should not hang the workers.
|
||||
*/
|
||||
// Runs a tracker through one full round of workers, waits for its future to report
// success, then launches a second round of workers against the same host/port and
// expects each of them to fail with a connection error.
TEST_F(TrackerTest, AfterShutdown) {
|
||||
RabitTracker tracker{MakeTrackerConfig(host, n_workers, timeout)};
|
||||
auto fut = tracker.Run();
|
||||
|
||||
std::vector<std::thread> workers;
|
||||
auto rc = tracker.WaitUntilReady();
|
||||
ASSERT_TRUE(rc.OK());
|
||||
|
||||
// Capture the port now so the second round of workers can target the same address.
std::int32_t port = tracker.Port();
|
||||
|
||||
// Launch no-op workers to cause the tracker to shutdown.
|
||||
for (std::int32_t i = 0; i < n_workers; ++i) {
|
||||
workers.emplace_back([=] { WorkerForTest worker{host, port, timeout, n_workers, i}; });
|
||||
}
|
||||
|
||||
for (auto &w : workers) {
|
||||
w.join();
|
||||
}
|
||||
|
||||
// The tracker's future must resolve successfully once all first-round workers are done.
ASSERT_TRUE(fut.get().OK());
|
||||
|
||||
// Launch workers again, they should fail.
|
||||
workers.clear();
|
||||
for (std::int32_t i = 0; i < n_workers; ++i) {
|
||||
// Constructing the worker is the operation under test; it is wrapped in a callable so
// it can be handed to ASSERT_THAT below.
auto assert_that = [=] {
|
||||
WorkerForTest worker{host, port, timeout, n_workers, i};
|
||||
};
|
||||
// On Linux the connection is refused; on Apple platforms this surfaces as an
|
||||
// "operation now in progress" poll failure; on Windows it is a timeout error.
|
||||
// NOTE(review): GMockThrow presumably matches the thrown error's message -- confirm in helpers.h.
#if defined(__linux__)
|
||||
workers.emplace_back([=] { ASSERT_THAT(assert_that, GMockThrow("Connection refused")); });
|
||||
#else
|
||||
workers.emplace_back([=] { ASSERT_THAT(assert_that, GMockThrow("Failed to connect to")); });
|
||||
#endif
|
||||
}
|
||||
for (auto &w : workers) {
|
||||
w.join();
|
||||
}
|
||||
}
|
||||
} // namespace xgboost::collective
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
* Copyright 2023-2024, XGBoost Contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <chrono> // for seconds
|
||||
#include <cstdint> // for int32_t
|
||||
#include <fstream> // for ifstream
|
||||
#include <string> // for string
|
||||
#include <thread> // for thread
|
||||
#include <utility> // for move
|
||||
@@ -36,7 +37,7 @@ class WorkerForTest {
|
||||
comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_, DefaultNcclName()} {
|
||||
CHECK_EQ(world_size_, comm_.World());
|
||||
}
|
||||
virtual ~WorkerForTest() = default;
|
||||
virtual ~WorkerForTest() noexcept(false) { SafeColl(comm_.Shutdown()); }
|
||||
auto& Comm() { return comm_; }
|
||||
|
||||
void LimitSockBuf(std::int32_t n_bytes) {
|
||||
@@ -86,19 +87,30 @@ class TrackerTest : public SocketTest {
|
||||
void SetUp() override {
|
||||
SocketTest::SetUp();
|
||||
auto rc = GetHostAddress(&host);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * @brief Build the JSON configuration used to construct a tracker in tests.
 *
 * @param host      Value stored under the "host" key.
 * @param n_workers Value stored under the "n_workers" key.
 * @param timeout   Timeout; its tick count (seconds) is stored under the "timeout" key.
 *
 * @return A JSON object with the keys "host", "port" (always 0), "n_workers",
 *         "sortby" (Tracker::SortBy::kHost) and "timeout".
 */
inline Json MakeTrackerConfig(std::string host, std::int32_t n_workers,
                              std::chrono::seconds timeout) {
  Json config{Object{}};
  config["host"] = host;
  config["port"] = Integer{0};
  config["n_workers"] = Integer{n_workers};
  config["sortby"] = Integer{static_cast<std::int32_t>(Tracker::SortBy::kHost)};
  // The representation type of std::chrono::seconds is implementation-defined; cast to a
  // fixed-width integer, as MakeDistributedTestConfig does, so the stored JSON integer
  // type does not vary between standard libraries.
  config["timeout"] = static_cast<std::int64_t>(timeout.count());
  return config;
}
|
||||
|
||||
template <typename WorkerFn>
|
||||
void TestDistributed(std::int32_t n_workers, WorkerFn worker_fn) {
|
||||
std::chrono::seconds timeout{2};
|
||||
|
||||
std::string host;
|
||||
auto rc = GetHostAddress(&host);
|
||||
ASSERT_TRUE(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
LOG(INFO) << "Using " << n_workers << " workers for test.";
|
||||
RabitTracker tracker{StringView{host}, n_workers, 0, timeout};
|
||||
RabitTracker tracker{MakeTrackerConfig(host, n_workers, timeout)};
|
||||
auto fut = tracker.Run();
|
||||
|
||||
std::vector<std::thread> workers;
|
||||
@@ -114,4 +126,15 @@ void TestDistributed(std::int32_t n_workers, WorkerFn worker_fn) {
|
||||
|
||||
ASSERT_TRUE(fut.get().OK());
|
||||
}
|
||||
/**
 * @brief Build the JSON configuration handed to a worker in distributed tests.
 *
 * @param host    Stored under "dmlc_tracker_uri".
 * @param port    Stored under "dmlc_tracker_port".
 * @param timeout Its tick count (seconds) is stored under "dmlc_timeout_sec".
 * @param r       Converted to a string and stored under "dmlc_task_id".
 *
 * @return A JSON object that additionally sets "dmlc_communicator" to "rabit" and
 *         "dmlc_retry" to 2.
 */
inline auto MakeDistributedTestConfig(std::string host, std::int32_t port,
                                      std::chrono::seconds timeout, std::int32_t r) {
  // Derive the converted values first so the assignments below read as a plain table.
  auto timeout_sec = static_cast<std::int64_t>(timeout.count());
  auto task_id = std::to_string(r);

  Json config{Object{}};
  // Fixed settings.
  config["dmlc_communicator"] = std::string{"rabit"};
  config["dmlc_retry"] = 2;
  // Tracker endpoint.
  config["dmlc_tracker_uri"] = host;
  config["dmlc_tracker_port"] = port;
  // Per-call settings.
  config["dmlc_timeout_sec"] = timeout_sec;
  config["dmlc_task_id"] = task_id;
  return config;
}
|
||||
} // namespace xgboost::collective
|
||||
|
||||
@@ -1,14 +1,16 @@
|
||||
/*!
|
||||
* Copyright 2017-2021 XGBoost contributors
|
||||
/**
|
||||
* Copyright 2017-2024, XGBoost contributors
|
||||
*/
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/sort.h> // for is_sorted
|
||||
#include <xgboost/base.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <vector>
|
||||
#include <xgboost/base.h>
|
||||
|
||||
#include "../../../src/common/device_helpers.cuh"
|
||||
#include "../../../src/common/quantile.h"
|
||||
#include "../helpers.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
TEST(SumReduce, Test) {
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
/**
|
||||
* Copyright 2019-2023 by XGBoost Contributors
|
||||
* Copyright 2019-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "../../../src/common/hist_util.h"
|
||||
#include "../../../src/data/gradient_index.h"
|
||||
@@ -135,7 +134,7 @@ TEST(CutsBuilder, SearchGroupInd) {
|
||||
group[2] = 7;
|
||||
group[3] = 5;
|
||||
|
||||
p_mat->SetInfo("group", group.data(), DataType::kUInt32, kNumGroups);
|
||||
p_mat->SetInfo("group", Make1dInterfaceTest(group.data(), group.size()));
|
||||
|
||||
HistogramCuts hmat;
|
||||
|
||||
@@ -348,7 +347,8 @@ void TestSketchFromWeights(bool with_group) {
|
||||
for (size_t i = 0; i < kGroups; ++i) {
|
||||
groups[i] = kRows / kGroups;
|
||||
}
|
||||
info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups);
|
||||
auto sg = linalg::Make1dInterface(groups.data(), kGroups);
|
||||
info.SetInfo(ctx, "group", sg.c_str());
|
||||
}
|
||||
|
||||
info.num_row_ = kRows;
|
||||
@@ -356,10 +356,10 @@ void TestSketchFromWeights(bool with_group) {
|
||||
|
||||
// Assign weights.
|
||||
if (with_group) {
|
||||
m->SetInfo("group", groups.data(), DataType::kUInt32, kGroups);
|
||||
m->SetInfo("group", Make1dInterfaceTest(groups.data(), kGroups));
|
||||
}
|
||||
|
||||
m->SetInfo("weight", h_weights.data(), DataType::kFloat32, h_weights.size());
|
||||
m->SetInfo("weight", Make1dInterfaceTest(h_weights.data(), h_weights.size()));
|
||||
m->Info().num_col_ = kCols;
|
||||
m->Info().num_row_ = kRows;
|
||||
ASSERT_EQ(cuts.Ptrs().size(), kCols + 1);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2019-2023 by XGBoost Contributors
|
||||
* Copyright 2019-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <thrust/device_vector.h>
|
||||
@@ -684,7 +684,7 @@ TEST(HistUtil, DeviceSketchFromGroupWeights) {
|
||||
for (size_t i = 0; i < kGroups; ++i) {
|
||||
groups[i] = kRows / kGroups;
|
||||
}
|
||||
m->SetInfo("group", groups.data(), DataType::kUInt32, kGroups);
|
||||
m->SetInfo("group", Make1dInterfaceTest(groups.data(), kGroups));
|
||||
HistogramCuts weighted_cuts = DeviceSketch(&ctx, m.get(), kBins, 0);
|
||||
|
||||
// sketch with no weight
|
||||
@@ -729,7 +729,7 @@ void TestAdapterSketchFromWeights(bool with_group) {
|
||||
for (size_t i = 0; i < kGroups; ++i) {
|
||||
groups[i] = kRows / kGroups;
|
||||
}
|
||||
info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups);
|
||||
info.SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), kGroups));
|
||||
}
|
||||
|
||||
info.weights_.SetDevice(DeviceOrd::CUDA(0));
|
||||
@@ -748,10 +748,10 @@ void TestAdapterSketchFromWeights(bool with_group) {
|
||||
|
||||
auto dmat = GetDMatrixFromData(storage.HostVector(), kRows, kCols);
|
||||
if (with_group) {
|
||||
dmat->Info().SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups);
|
||||
dmat->Info().SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), kGroups));
|
||||
}
|
||||
|
||||
dmat->Info().SetInfo(ctx, "weight", h_weights.data(), DataType::kFloat32, h_weights.size());
|
||||
dmat->Info().SetInfo(ctx, "weight", Make1dInterfaceTest(h_weights.data(), h_weights.size()));
|
||||
dmat->Info().num_col_ = kCols;
|
||||
dmat->Info().num_row_ = kRows;
|
||||
ASSERT_EQ(cuts.Ptrs().size(), kCols + 1);
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost Contributors
|
||||
* Copyright 2019-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
#include <fstream> // for ofstream
|
||||
#include <numeric> // for iota
|
||||
|
||||
#include "../../../src/common/io.h"
|
||||
#include "../filesystem.h" // dmlc::TemporaryDirectory
|
||||
|
||||
@@ -4,10 +4,10 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <fstream>
|
||||
#include <iterator> // for back_inserter
|
||||
#include <limits> // for numeric_limits
|
||||
#include <map>
|
||||
#include <numeric> // for iota
|
||||
|
||||
#include "../../../src/common/charconv.h"
|
||||
#include "../../../src/common/io.h"
|
||||
#include "../../../src/common/json_utils.h"
|
||||
#include "../../../src/common/threading_utils.h" // for ParallelFor
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
/**
|
||||
* Copyright 2018-2023 by XGBoost Contributors
|
||||
* Copyright 2018-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/base.h>
|
||||
#include <xgboost/span.h>
|
||||
#include <xgboost/host_device_vector.h>
|
||||
#include <xgboost/span.h>
|
||||
|
||||
#include <numeric> // for iota
|
||||
#include <vector>
|
||||
|
||||
#include "../../../src/common/transform.h"
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
/**
|
||||
* Copyright 2021-2023, XGBoost Contributors
|
||||
* Copyright 2021-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/host_device_vector.h>
|
||||
#include "../helpers.h"
|
||||
|
||||
#include "../../../src/data/array_interface.h"
|
||||
#include "../helpers.h"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "../../../src/common/version.h"
|
||||
#include "../filesystem.h" // dmlc::TemporaryDirectory
|
||||
#include "../helpers.h" // for GMockThrow
|
||||
#include "xgboost/base.h"
|
||||
@@ -23,23 +22,22 @@ TEST(MetaInfo, GetSet) {
|
||||
double double2[2] = {1.0, 2.0};
|
||||
|
||||
EXPECT_EQ(info.labels.Size(), 0);
|
||||
info.SetInfo(ctx, "label", double2, xgboost::DataType::kFloat32, 2);
|
||||
info.SetInfo(ctx, "label", Make1dInterfaceTest(double2, 2));
|
||||
EXPECT_EQ(info.labels.Size(), 2);
|
||||
|
||||
float float2[2] = {1.0f, 2.0f};
|
||||
EXPECT_EQ(info.GetWeight(1), 1.0f)
|
||||
<< "When no weights are given, was expecting default value 1";
|
||||
info.SetInfo(ctx, "weight", float2, xgboost::DataType::kFloat32, 2);
|
||||
EXPECT_EQ(info.GetWeight(1), 1.0f) << "When no weights are given, was expecting default value 1";
|
||||
info.SetInfo(ctx, "weight", Make1dInterfaceTest(float2, 2));
|
||||
EXPECT_EQ(info.GetWeight(1), 2.0f);
|
||||
|
||||
uint32_t uint32_t2[2] = {1U, 2U};
|
||||
EXPECT_EQ(info.base_margin_.Size(), 0);
|
||||
info.SetInfo(ctx, "base_margin", uint32_t2, xgboost::DataType::kUInt32, 2);
|
||||
info.SetInfo(ctx, "base_margin", Make1dInterfaceTest(uint32_t2, 2));
|
||||
EXPECT_EQ(info.base_margin_.Size(), 2);
|
||||
|
||||
uint64_t uint64_t2[2] = {1U, 2U};
|
||||
EXPECT_EQ(info.group_ptr_.size(), 0);
|
||||
info.SetInfo(ctx, "group", uint64_t2, xgboost::DataType::kUInt64, 2);
|
||||
info.SetInfo(ctx, "group", Make1dInterfaceTest(uint64_t2, 2));
|
||||
ASSERT_EQ(info.group_ptr_.size(), 3);
|
||||
EXPECT_EQ(info.group_ptr_[2], 3);
|
||||
|
||||
@@ -135,9 +133,9 @@ TEST(MetaInfo, SaveLoadBinary) {
|
||||
};
|
||||
std::vector<float> values (kRows);
|
||||
std::generate(values.begin(), values.end(), generator);
|
||||
info.SetInfo(ctx, "label", values.data(), xgboost::DataType::kFloat32, kRows);
|
||||
info.SetInfo(ctx, "weight", values.data(), xgboost::DataType::kFloat32, kRows);
|
||||
info.SetInfo(ctx, "base_margin", values.data(), xgboost::DataType::kFloat32, kRows);
|
||||
info.SetInfo(ctx, "label", Make1dInterfaceTest(values.data(), kRows));
|
||||
info.SetInfo(ctx, "weight", Make1dInterfaceTest(values.data(), kRows));
|
||||
info.SetInfo(ctx, "base_margin", Make1dInterfaceTest(values.data(), kRows));
|
||||
|
||||
info.num_row_ = kRows;
|
||||
info.num_col_ = kCols;
|
||||
@@ -271,7 +269,7 @@ TEST(MetaInfo, CPUQid) {
|
||||
qid[i] = i;
|
||||
}
|
||||
|
||||
info.SetInfo(ctx, "qid", qid.data(), xgboost::DataType::kUInt32, info.num_row_);
|
||||
info.SetInfo(ctx, "qid", Make1dInterfaceTest(qid.data(), info.num_row_));
|
||||
ASSERT_EQ(info.group_ptr_.size(), info.num_row_ + 1);
|
||||
ASSERT_EQ(info.group_ptr_.front(), 0);
|
||||
ASSERT_EQ(info.group_ptr_.back(), info.num_row_);
|
||||
@@ -288,14 +286,12 @@ TEST(MetaInfo, Validate) {
|
||||
info.num_col_ = 3;
|
||||
std::vector<xgboost::bst_group_t> groups (11);
|
||||
Context ctx;
|
||||
info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, 11);
|
||||
info.SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), groups.size()));
|
||||
EXPECT_THROW(info.Validate(FstCU()), dmlc::Error);
|
||||
|
||||
std::vector<float> labels(info.num_row_ + 1);
|
||||
EXPECT_THROW(
|
||||
{
|
||||
info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_ + 1);
|
||||
},
|
||||
{ info.SetInfo(ctx, "label", Make1dInterfaceTest(labels.data(), info.num_row_ + 1)); },
|
||||
dmlc::Error);
|
||||
|
||||
// Make overflow data, which can happen when users pass group structure as int
|
||||
@@ -305,13 +301,13 @@ TEST(MetaInfo, Validate) {
|
||||
groups.push_back(1562500);
|
||||
}
|
||||
groups.push_back(static_cast<xgboost::bst_group_t>(-1));
|
||||
EXPECT_THROW(info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, groups.size()),
|
||||
EXPECT_THROW(info.SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), groups.size())),
|
||||
dmlc::Error);
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
info.group_ptr_.clear();
|
||||
labels.resize(info.num_row_);
|
||||
info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_);
|
||||
info.SetInfo(ctx, "label", Make1dInterfaceTest(labels.data(), info.num_row_));
|
||||
info.labels.SetDevice(FstCU());
|
||||
EXPECT_THROW(info.Validate(DeviceOrd::CUDA(1)), dmlc::Error);
|
||||
|
||||
@@ -340,8 +336,8 @@ TEST(MetaInfo, HostExtend) {
|
||||
for (size_t g = 0; g < kRows / per_group; ++g) {
|
||||
groups.emplace_back(per_group);
|
||||
}
|
||||
lhs.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, groups.size());
|
||||
rhs.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, groups.size());
|
||||
lhs.SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), groups.size()));
|
||||
rhs.SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), groups.size()));
|
||||
|
||||
lhs.Extend(rhs, true, true);
|
||||
ASSERT_EQ(lhs.num_row_, kRows * 2);
|
||||
|
||||
@@ -408,7 +408,7 @@ class Dart : public testing::TestWithParam<char const*> {
|
||||
for (size_t i = 0; i < kRows; ++i) {
|
||||
labels[i] = i % 2;
|
||||
}
|
||||
p_mat->SetInfo("label", labels.data(), DataType::kFloat32, kRows);
|
||||
p_mat->SetInfo("label", Make1dInterfaceTest(labels.data(), kRows));
|
||||
|
||||
auto learner = std::unique_ptr<Learner>(Learner::Create({p_mat}));
|
||||
learner->SetParam("booster", "dart");
|
||||
|
||||
@@ -1,8 +1,11 @@
|
||||
/**
|
||||
* Copyright 2020-2024, XGBoost contributors
|
||||
*/
|
||||
#include <xgboost/c_api.h>
|
||||
|
||||
#include "helpers.h"
|
||||
#include "../../src/data/device_adapter.cuh"
|
||||
#include "../../src/data/iterative_dmatrix.h"
|
||||
#include "helpers.h"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
|
||||
@@ -15,19 +15,18 @@
|
||||
|
||||
#include <cstdint> // std::int32_t
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#include "../../src/collective/communicator-inl.h"
|
||||
#include "../../src/common/common.h"
|
||||
#include "../../src/common/threading_utils.h"
|
||||
#include "../../src/data/array_interface.h"
|
||||
#include "filesystem.h" // dmlc::TemporaryDirectory
|
||||
#include "xgboost/linalg.h"
|
||||
#if !defined(_OPENMP)
|
||||
#include <thread>
|
||||
#endif
|
||||
|
||||
#if defined(__CUDACC__) || defined(__HIPCC__)
|
||||
#define DeclareUnifiedTest(name) GPU ## name
|
||||
@@ -333,7 +332,7 @@ inline std::vector<float> GenerateRandomCategoricalSingleColumn(int n, size_t nu
|
||||
std::vector<float> x(n);
|
||||
std::mt19937 rng(0);
|
||||
std::uniform_int_distribution<size_t> dist(0, num_categories - 1);
|
||||
std::generate(x.begin(), x.end(), [&]() { return dist(rng); });
|
||||
std::generate(x.begin(), x.end(), [&]() { return static_cast<float>(dist(rng)); });
|
||||
// Make sure each category is present
|
||||
for (size_t i = 0; i < num_categories; i++) {
|
||||
x[i] = static_cast<decltype(x)::value_type>(i);
|
||||
@@ -494,6 +493,16 @@ inline int Next(DataIterHandle self) {
|
||||
return static_cast<ArrayIterForTest*>(self)->Next();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Create an array interface for host vector.
|
||||
*/
|
||||
template <typename T>
|
||||
char const* Make1dInterfaceTest(T const* vec, std::size_t len) {
|
||||
static thread_local std::string str;
|
||||
str = linalg::Make1dInterface(vec, len);
|
||||
return str.c_str();
|
||||
}
|
||||
|
||||
class RMMAllocator;
|
||||
using RMMAllocatorPtr = std::unique_ptr<RMMAllocator, void(*)(RMMAllocator*)>;
|
||||
RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv);
|
||||
|
||||
@@ -5,10 +5,9 @@
|
||||
#include <xgboost/json.h>
|
||||
#include <xgboost/metric.h>
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <numeric> // for iota
|
||||
|
||||
#include "../../../src/common/linalg_op.h"
|
||||
#include "../helpers.h"
|
||||
|
||||
namespace xgboost::metric {
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
/*!
|
||||
* Copyright 2018-2023 XGBoost contributors
|
||||
/**
|
||||
* Copyright 2018-2024, XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/context.h>
|
||||
#include <xgboost/objective.h>
|
||||
|
||||
#include "../../../src/objective/adaptive.h"
|
||||
#include "../../../src/tree/param.h" // for TrainParam
|
||||
#include "../helpers.h"
|
||||
#include <numeric> // for iota
|
||||
|
||||
#include "../../../src/objective/adaptive.h"
|
||||
#include "../../../src/tree/param.h" // for TrainParam
|
||||
#include "../helpers.h"
|
||||
#include "test_regression_obj.h"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
@@ -60,8 +60,7 @@ TEST_F(FederatedCollTest, Allgather) {
|
||||
|
||||
std::vector<std::int32_t> buffer(n_workers, 0);
|
||||
buffer[comm->Rank()] = comm->Rank();
|
||||
auto rc = coll.Allgather(*comm, common::EraseType(common::Span{buffer.data(), buffer.size()}),
|
||||
sizeof(int));
|
||||
auto rc = coll.Allgather(*comm, common::EraseType(common::Span{buffer.data(), buffer.size()}));
|
||||
ASSERT_TRUE(rc.OK());
|
||||
for (auto i = 0; i < n_workers; i++) {
|
||||
ASSERT_EQ(buffer[i], i);
|
||||
|
||||
@@ -5,13 +5,13 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/collective/result.h> // for Result
|
||||
|
||||
#include "../../../../src/collective/allreduce.h"
|
||||
#include "../../../../src/common/common.h" // for AllVisibleGPUs
|
||||
#include "../../../../src/common/device_helpers.cuh" // for device_vector
|
||||
#include "../../../../src/common/type.h" // for EraseType
|
||||
#include "../../collective/test_worker.h" // for SocketTest
|
||||
#include "../../helpers.h" // for MakeCUDACtx
|
||||
#include "federated_coll.cuh"
|
||||
#include "federated_comm.cuh"
|
||||
#include "test_worker.h" // for TestFederated
|
||||
|
||||
namespace xgboost::collective {
|
||||
@@ -71,7 +71,7 @@ void TestAllgather(std::shared_ptr<FederatedComm> comm, std::int32_t rank, std::
|
||||
|
||||
dh::device_vector<std::int32_t> buffer(n_workers, 0);
|
||||
buffer[comm->Rank()] = comm->Rank();
|
||||
auto rc = w.coll->Allgather(*w.nccl_comm, common::EraseType(dh::ToSpan(buffer)), sizeof(int));
|
||||
auto rc = w.coll->Allgather(*w.nccl_comm, common::EraseType(dh::ToSpan(buffer)));
|
||||
ASSERT_TRUE(rc.OK());
|
||||
for (auto i = 0; i < n_workers; i++) {
|
||||
ASSERT_EQ(buffer[i], i);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
* Copyright 2023-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
|
||||
#include "../../../../src/collective/tracker.h" // for GetHostAddress
|
||||
#include "federated_tracker.h"
|
||||
#include "test_worker.h"
|
||||
#include "xgboost/json.h" // for Json
|
||||
|
||||
namespace xgboost::collective {
|
||||
@@ -26,7 +25,7 @@ TEST(FederatedTrackerTest, Basic) {
|
||||
ASSERT_GE(tracker->Port(), 1);
|
||||
std::string host;
|
||||
auto rc = GetHostAddress(&host);
|
||||
ASSERT_EQ(get<String const>(args["DMLC_TRACKER_URI"]), host);
|
||||
ASSERT_EQ(get<String const>(args["dmlc_tracker_uri"]), host);
|
||||
|
||||
rc = tracker->Shutdown();
|
||||
ASSERT_TRUE(rc.OK());
|
||||
|
||||
@@ -8,22 +8,23 @@
|
||||
namespace xgboost::sycl {
|
||||
template<typename T, typename Container>
|
||||
void VerifySyclVector(const USMVector<T, MemoryType::shared>& sycl_vector,
|
||||
const Container& host_vector) {
|
||||
const Container& host_vector, T eps = T()) {
|
||||
ASSERT_EQ(sycl_vector.Size(), host_vector.size());
|
||||
|
||||
size_t size = sycl_vector.Size();
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
ASSERT_EQ(sycl_vector[i], host_vector[i]);
|
||||
EXPECT_NEAR(sycl_vector[i], host_vector[i], eps);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T, typename Container>
|
||||
void VerifySyclVector(const std::vector<T>& sycl_vector, const Container& host_vector) {
|
||||
void VerifySyclVector(const std::vector<T>& sycl_vector,
|
||||
const Container& host_vector, T eps = T()) {
|
||||
ASSERT_EQ(sycl_vector.size(), host_vector.size());
|
||||
|
||||
size_t size = sycl_vector.size();
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
ASSERT_EQ(sycl_vector[i], host_vector[i]);
|
||||
EXPECT_NEAR(sycl_vector[i], host_vector[i], eps);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -26,7 +26,6 @@ TEST(FederatedAdapterSimpleTest, ThrowOnInvalidDeviceOrdinal) {
|
||||
namespace {
|
||||
void VerifyAllReduceSum() {
|
||||
auto const world_size = collective::GetWorldSize();
|
||||
auto const rank = collective::GetRank();
|
||||
auto const device = GPUIDX;
|
||||
int count = 3;
|
||||
common::SetDevice(device);
|
||||
|
||||
157
tests/cpp/plugin/test_sycl_ghist_builder.cc
Normal file
157
tests/cpp/plugin/test_sycl_ghist_builder.cc
Normal file
@@ -0,0 +1,157 @@
|
||||
/**
|
||||
* Copyright 2020-2024 by XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
|
||||
#pragma GCC diagnostic ignored "-W#pragma-messages"
|
||||
#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
#include "../../../plugin/sycl/common/hist_util.h"
|
||||
#include "../../../plugin/sycl/device_manager.h"
|
||||
#include "sycl_helpers.h"
|
||||
#include "../helpers.h"
|
||||
|
||||
namespace xgboost::sycl::common {
|
||||
|
||||
template <typename GradientSumT>
|
||||
void GHistBuilderTest(float sparsity, bool force_atomic_use) {
|
||||
const size_t num_rows = 8;
|
||||
const size_t num_columns = 1;
|
||||
const int n_bins = 2;
|
||||
const GradientSumT eps = 1e-6;
|
||||
|
||||
Context ctx;
|
||||
ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
|
||||
|
||||
DeviceManager device_manager;
|
||||
auto qu = device_manager.GetQueue(ctx.Device());
|
||||
|
||||
auto p_fmat = RandomDataGenerator{num_rows, num_columns, sparsity}.GenerateDMatrix();
|
||||
sycl::DeviceMatrix dmat;
|
||||
dmat.Init(qu, p_fmat.get());
|
||||
|
||||
GHistIndexMatrix gmat_sycl;
|
||||
gmat_sycl.Init(qu, &ctx, dmat, n_bins);
|
||||
|
||||
xgboost::GHistIndexMatrix gmat{&ctx, p_fmat.get(), n_bins, 0.3, false};
|
||||
|
||||
RowSetCollection row_set_collection;
|
||||
auto& row_indices = row_set_collection.Data();
|
||||
row_indices.Resize(&qu, num_rows);
|
||||
size_t* p_row_indices = row_indices.Data();
|
||||
|
||||
qu.submit([&](::sycl::handler& cgh) {
|
||||
cgh.parallel_for<>(::sycl::range<1>(num_rows),
|
||||
[p_row_indices](::sycl::item<1> pid) {
|
||||
const size_t idx = pid.get_id(0);
|
||||
p_row_indices[idx] = idx;
|
||||
});
|
||||
}).wait_and_throw();
|
||||
row_set_collection.Init();
|
||||
|
||||
auto builder = GHistBuilder<GradientSumT>(qu, n_bins);
|
||||
|
||||
std::vector<GradientPair> gpair = {
|
||||
{0.1f, 0.2f}, {0.3f, 0.4f}, {0.5f, 0.6f}, {0.7f, 0.8f},
|
||||
{0.9f, 0.1f}, {0.2f, 0.3f}, {0.4f, 0.5f}, {0.6f, 0.7f}};
|
||||
CHECK_EQ(gpair.size(), num_rows);
|
||||
USMVector<GradientPair, MemoryType::on_device> gpair_device(&qu, gpair);
|
||||
|
||||
std::vector<GradientSumT> hist_host(2*n_bins);
|
||||
GHistRow<GradientSumT, MemoryType::on_device> hist(&qu, 2 * n_bins);
|
||||
::sycl::event event;
|
||||
|
||||
const size_t nblocks = 2;
|
||||
GHistRow<GradientSumT, MemoryType::on_device> hist_buffer(&qu, 2 * nblocks * n_bins);
|
||||
|
||||
InitHist(qu, &hist, hist.Size(), &event);
|
||||
InitHist(qu, &hist_buffer, hist_buffer.Size(), &event);
|
||||
|
||||
event = builder.BuildHist(gpair_device, row_set_collection[0], gmat_sycl, &hist,
|
||||
sparsity < eps , &hist_buffer, event, force_atomic_use);
|
||||
qu.memcpy(hist_host.data(), hist.Data(),
|
||||
2 * n_bins * sizeof(GradientSumT), event);
|
||||
qu.wait_and_throw();
|
||||
|
||||
// Build hist on host to compare
|
||||
std::vector<GradientSumT> hist_desired(2*n_bins);
|
||||
for (size_t rid = 0; rid < num_rows; ++rid) {
|
||||
const size_t ibegin = gmat.row_ptr[rid];
|
||||
const size_t iend = gmat.row_ptr[rid + 1];
|
||||
for (size_t i = ibegin; i < iend; ++i) {
|
||||
const size_t bin_idx = gmat.index[i];
|
||||
hist_desired[2*bin_idx] += gpair[rid].GetGrad();
|
||||
hist_desired[2*bin_idx+1] += gpair[rid].GetHess();
|
||||
}
|
||||
}
|
||||
|
||||
VerifySyclVector(hist_host, hist_desired, eps);
|
||||
}
|
||||
|
||||
template <typename GradientSumT>
|
||||
void GHistSubtractionTest() {
|
||||
const size_t n_bins = 4;
|
||||
using GHistType = GHistRow<GradientSumT, MemoryType::on_device>;
|
||||
|
||||
Context ctx;
|
||||
ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
|
||||
|
||||
DeviceManager device_manager;
|
||||
auto qu = device_manager.GetQueue(ctx.Device());
|
||||
|
||||
::sycl::event event;
|
||||
std::vector<GradientSumT> hist1_host = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8};
|
||||
GHistType hist1(&qu, 2 * n_bins);
|
||||
event = qu.memcpy(hist1.Data(), hist1_host.data(),
|
||||
2 * n_bins * sizeof(GradientSumT), event);
|
||||
|
||||
std::vector<GradientSumT> hist2_host = {0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1};
|
||||
GHistType hist2(&qu, 2 * n_bins);
|
||||
event = qu.memcpy(hist2.Data(), hist2_host.data(),
|
||||
2 * n_bins * sizeof(GradientSumT), event);
|
||||
|
||||
std::vector<GradientSumT> hist3_host(2 * n_bins);
|
||||
GHistType hist3(&qu, 2 * n_bins);
|
||||
event = SubtractionHist(qu, &hist3, hist1, hist2, n_bins, event);
|
||||
qu.memcpy(hist3_host.data(), hist3.Data(),
|
||||
2 * n_bins * sizeof(GradientSumT), event);
|
||||
qu.wait_and_throw();
|
||||
|
||||
std::vector<GradientSumT> hist3_desired(2 * n_bins);
|
||||
for (size_t idx = 0; idx < 2 * n_bins; ++idx) {
|
||||
hist3_desired[idx] = hist1_host[idx] - hist2_host[idx];
|
||||
}
|
||||
|
||||
const GradientSumT eps = 1e-6;
|
||||
VerifySyclVector(hist3_host, hist3_desired, eps);
|
||||
}
|
||||
|
||||
// Zero sparsity without forcing atomics, for both histogram precisions.
TEST(SyclGHistBuilder, ByBlockDenseCase) {
  constexpr float kSparsity = 0.0;
  constexpr bool kForceAtomic = false;
  GHistBuilderTest<float>(kSparsity, kForceAtomic);
  GHistBuilderTest<double>(kSparsity, kForceAtomic);
}
|
||||
|
||||
// Sparsity 0.3 without forcing atomics, for both histogram precisions.
TEST(SyclGHistBuilder, ByBlockSparseCase) {
  constexpr float kSparsity = 0.3;
  constexpr bool kForceAtomic = false;
  GHistBuilderTest<float>(kSparsity, kForceAtomic);
  GHistBuilderTest<double>(kSparsity, kForceAtomic);
}
|
||||
|
||||
// Zero sparsity with atomics forced, for both histogram precisions.
TEST(SyclGHistBuilder, ByAtomicDenseCase) {
  constexpr float kSparsity = 0.0;
  constexpr bool kForceAtomic = true;
  GHistBuilderTest<float>(kSparsity, kForceAtomic);
  GHistBuilderTest<double>(kSparsity, kForceAtomic);
}
|
||||
|
||||
// Sparsity 0.3 with atomics forced, for both histogram precisions.
TEST(SyclGHistBuilder, ByAtomicSparseCase) {
  constexpr float kSparsity = 0.3;
  constexpr bool kForceAtomic = true;
  GHistBuilderTest<float>(kSparsity, kForceAtomic);
  GHistBuilderTest<double>(kSparsity, kForceAtomic);
}
|
||||
|
||||
// Histogram subtraction in single precision first, then in double precision.
TEST(SyclGHistBuilder, Subtraction) {
  {
    GHistSubtractionTest<float>();
  }
  {
    GHistSubtractionTest<double>();
  }
}
|
||||
|
||||
} // namespace xgboost::sycl::common
|
||||
55
tests/cpp/plugin/test_sycl_quantile_hist_builder.cc
Normal file
55
tests/cpp/plugin/test_sycl_quantile_hist_builder.cc
Normal file
@@ -0,0 +1,55 @@
|
||||
/**
|
||||
* Copyright 2020-2024 by XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
|
||||
#pragma GCC diagnostic ignored "-W#pragma-messages"
|
||||
#include <xgboost/json.h>
|
||||
#include <xgboost/task.h>
|
||||
#include "../../../plugin/sycl/tree/updater_quantile_hist.h" // for QuantileHistMaker
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
namespace xgboost::sycl::tree {
|
||||
// The SYCL quantile-hist updater can be created through the TreeUpdater registry and
// reports the name it was registered under.
TEST(SyclQuantileHistMaker, Basic) {
  Context ctx;
  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});

  ObjInfo task{ObjInfo::kRegression};
  std::unique_ptr<TreeUpdater> updater{
      TreeUpdater::Create("grow_quantile_histmaker_sycl", &ctx, &task)};

  // Compare by value. If Name() returns a C string, ASSERT_EQ against a literal would
  // compare pointer addresses and only pass when the two literals happen to be merged.
  // NOTE(review): the return type of Name() is not visible here -- wrapping it in
  // std::string is correct for both `char const*` and `std::string`.
  ASSERT_EQ(std::string{updater->Name()}, "grow_quantile_histmaker_sycl");
}
|
||||
|
||||
// Round-trip the updater configuration through SaveConfig / LoadConfig and verify that
// the values set through Configure survive.
TEST(SyclQuantileHistMaker, JsonIO) {
  Context ctx;
  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});

  ObjInfo task{ObjInfo::kRegression};

  // Step 1: configure a fresh updater and capture its configuration.
  Json saved{Object()};
  {
    std::unique_ptr<TreeUpdater> writer{
        TreeUpdater::Create("grow_quantile_histmaker_sycl", &ctx, &task)};
    writer->Configure({{"max_depth", std::to_string(42)}});
    writer->Configure({{"single_precision_histogram", std::to_string(true)}});
    writer->SaveConfig(&saved);
  }

  // Step 2: load that configuration into a second updater and save it again.
  {
    std::unique_ptr<TreeUpdater> reader{
        TreeUpdater::Create("grow_quantile_histmaker_sycl", &ctx, &task)};
    reader->LoadConfig(saved);

    Json reloaded{Object()};
    reader->SaveConfig(&reloaded);

    ASSERT_EQ(saved, reloaded);

    // Parameters are read back as strings and parsed before comparing.
    auto& train_param = reloaded["train_param"];
    auto max_depth = atoi(get<String const>(train_param["max_depth"]).c_str());
    ASSERT_EQ(max_depth, 42);

    auto& hist_param = reloaded["sycl_hist_train_param"];
    auto single_precision_histogram =
        atoi(get<String const>(hist_param["single_precision_histogram"]).c_str());
    ASSERT_EQ(single_precision_histogram, 1);
  }
}
|
||||
} // namespace xgboost::sycl::tree
|
||||
@@ -12,7 +12,6 @@
|
||||
#include <cinttypes> // for int32_t, int64_t, uint32_t
|
||||
#include <cstddef> // for size_t
|
||||
#include <iosfwd> // for ofstream
|
||||
#include <iterator> // for back_insert_iterator, back_inserter
|
||||
#include <limits> // for numeric_limits
|
||||
#include <map> // for map
|
||||
#include <memory> // for unique_ptr, shared_ptr, __shared_ptr_...
|
||||
@@ -30,7 +29,6 @@
|
||||
#include "../../src/common/random.h" // for GlobalRandom
|
||||
#include "dmlc/io.h" // for Stream
|
||||
#include "dmlc/omp.h" // for omp_get_max_threads
|
||||
#include "dmlc/registry.h" // for Registry
|
||||
#include "filesystem.h" // for TemporaryDirectory
|
||||
#include "helpers.h" // for GetBaseScore, RandomDataGenerator
|
||||
#include "objective_helpers.h" // for MakeObjNamesForTest, ObjTestNameGenerator
|
||||
@@ -103,9 +101,9 @@ TEST(Learner, CheckGroup) {
|
||||
labels[i] = i % 2;
|
||||
}
|
||||
|
||||
p_mat->SetInfo("weight", static_cast<void *>(weight.data()), DataType::kFloat32, kNumGroups);
|
||||
p_mat->SetInfo("group", group.data(), DataType::kUInt32, kNumGroups);
|
||||
p_mat->SetInfo("label", labels.data(), DataType::kFloat32, kNumRows);
|
||||
p_mat->SetInfo("weight", Make1dInterfaceTest(weight.data(), kNumGroups));
|
||||
p_mat->SetInfo("group", Make1dInterfaceTest(group.data(), kNumGroups));
|
||||
p_mat->SetInfo("label", Make1dInterfaceTest(labels.data(), kNumRows));
|
||||
|
||||
std::vector<std::shared_ptr<xgboost::DMatrix>> mat = {p_mat};
|
||||
auto learner = std::unique_ptr<Learner>(Learner::Create(mat));
|
||||
@@ -115,7 +113,7 @@ TEST(Learner, CheckGroup) {
|
||||
group.resize(kNumGroups+1);
|
||||
group[3] = 4;
|
||||
group[4] = 1;
|
||||
p_mat->SetInfo("group", group.data(), DataType::kUInt32, kNumGroups+1);
|
||||
p_mat->SetInfo("group", Make1dInterfaceTest(group.data(), kNumGroups+1));
|
||||
EXPECT_ANY_THROW(learner->UpdateOneIter(0, p_mat));
|
||||
}
|
||||
|
||||
@@ -132,7 +130,7 @@ TEST(Learner, SLOW_CheckMultiBatch) { // NOLINT
|
||||
for (size_t i = 0; i < num_row; ++i) {
|
||||
labels[i] = i % 2;
|
||||
}
|
||||
dmat->SetInfo("label", labels.data(), DataType::kFloat32, num_row);
|
||||
dmat->SetInfo("label", Make1dInterfaceTest(labels.data(), num_row));
|
||||
std::vector<std::shared_ptr<DMatrix>> mat{dmat};
|
||||
auto learner = std::unique_ptr<Learner>(Learner::Create(mat));
|
||||
learner->SetParams(Args{{"objective", "binary:logistic"}});
|
||||
|
||||
@@ -239,4 +239,18 @@ void TestAtomicAdd() {
|
||||
// gtest case that runs TestAtomicAdd(); any assertions live in that routine.
TEST(Histogram, AtomicAddInt64) {
  TestAtomicAdd();
}
|
||||
|
||||
// Round-trip check for GradientQuantiser: every gradient pair converted to fixed
// point and back to floating point must compare equal to the original (1.0, 1.0).
TEST(Histogram, Quantiser) {
  auto ctx = MakeCUDACtx(0);

  // All samples share the same gradient/hessian value of 1.0.
  constexpr std::size_t kSamples = 16;
  HostDeviceVector<GradientPair> gradients(kSamples, GradientPair{1.0, 1.0});
  gradients.SetDevice(ctx.Device());

  // The quantiser is built from the device view of the gradients and an empty MetaInfo.
  GradientQuantiser quantiser(&ctx, gradients.DeviceSpan(), MetaInfo());

  auto const& h_gradients = gradients.ConstHostVector();
  for (std::size_t i = 0; i < h_gradients.size(); ++i) {
    auto round_trip = quantiser.ToFloatingPoint(quantiser.ToFixedPoint(h_gradients[i]));
    ASSERT_EQ(round_trip.GetGrad(), 1.0);
    ASSERT_EQ(round_trip.GetHess(), 1.0);
  }
}
|
||||
} // namespace xgboost::tree
|
||||
|
||||
@@ -71,15 +71,6 @@ def _test_from_cudf(DMatrixT):
|
||||
assert dtrain.num_col() == 1
|
||||
assert dtrain.num_row() == 5
|
||||
|
||||
# Boolean is not supported.
|
||||
X_boolean = cudf.DataFrame({"x": cudf.Series([True, False])})
|
||||
with pytest.raises(Exception):
|
||||
dtrain = DMatrixT(X_boolean)
|
||||
|
||||
y_boolean = cudf.DataFrame({"x": cudf.Series([True, False, True, True, True])})
|
||||
with pytest.raises(Exception):
|
||||
dtrain = DMatrixT(X_boolean, label=y_boolean)
|
||||
|
||||
|
||||
def _test_cudf_training(DMatrixT):
|
||||
import pandas as pd
|
||||
|
||||
@@ -929,8 +929,127 @@ class TestPySparkLocal:
|
||||
model_loaded.set_device("cuda")
|
||||
assert model_loaded._run_on_gpu()
|
||||
|
||||
def test_validate_gpu_params(self) -> None:
    """Check ``_validate_gpu_params`` for standalone, Yarn and K8s masters.

    Each scenario builds a ``SparkConf`` and calls ``_validate_gpu_params``
    with a Spark version string, asserting either that the call returns
    normally or that it raises ``ValueError`` with the expected message.
    """
    fractional_msg = "XGBoost doesn't support GPU fractional configurations"
    missing_executor_msg = (
        "The `spark.executor.resource.gpu.amount` is required for training on GPU"
    )
    missing_task_msg = (
        "The `spark.task.resource.gpu.amount` is required for training on GPU"
    )

    def build_conf(master, executor_gpu=None, task_gpu=None):
        """Return a 12-core, 1-cpu-per-task conf; GPU amounts are set only if given."""
        conf = (
            SparkConf()
            .setMaster(master)
            .set("spark.executor.cores", "12")
            .set("spark.task.cpus", "1")
        )
        if executor_gpu is not None:
            conf.set("spark.executor.resource.gpu.amount", executor_gpu)
        if task_gpu is not None:
            conf.set("spark.task.resource.gpu.amount", task_gpu)
        return conf

    def expect_rejected(estimator, versions, conf, message):
        """Assert that validation raises ``ValueError`` matching ``message``."""
        for version in versions:
            with pytest.raises(ValueError, match=message):
                estimator._validate_gpu_params(version, conf)

    def expect_accepted(estimator, versions, conf):
        """Assert that validation returns without raising."""
        for version in versions:
            estimator._validate_gpu_params(version, conf)

    # Standalone
    standalone_conf = build_conf("spark://foo", executor_gpu="1", task_gpu="0.08")
    cpu_estimator = SparkXGBClassifier(use_gpu=False)
    gpu_estimator = SparkXGBClassifier(use_gpu=True)

    # No exception for classifier on CPU
    expect_accepted(cpu_estimator, ["3.4.0"], standalone_conf)

    # A fractional task GPU amount is rejected for 3.3.0 ...
    expect_rejected(gpu_estimator, ["3.3.0"], standalone_conf, fractional_msg)
    # ... and accepted for the later versions.
    expect_accepted(
        gpu_estimator, ["3.4.0", "3.4.1", "3.5.0", "3.5.1"], standalone_conf
    )

    # no spark.executor.resource.gpu.amount
    no_executor_gpu_conf = build_conf("spark://foo", task_gpu="0.08")
    expect_rejected(
        gpu_estimator,
        ["3.3.0", "3.4.0", "3.4.1", "3.5.0", "3.5.1"],
        no_executor_gpu_conf,
        missing_executor_msg,
    )

    # no spark.task.resource.gpu.amount
    no_task_gpu_conf = build_conf("spark://foo", executor_gpu="1")
    expect_rejected(gpu_estimator, ["3.3.0"], no_task_gpu_conf, missing_task_msg)
    expect_accepted(gpu_estimator, ["3.4.0", "3.5.0", "3.5.1"], no_task_gpu_conf)

    # Yarn and K8s mode
    cluster_masters = ["yarn", "k8s://"]
    for master in cluster_masters:
        full_conf = build_conf(master, executor_gpu="1", task_gpu="0.08")
        expect_rejected(
            gpu_estimator,
            ["3.3.0", "3.4.0", "3.4.1", "3.5.0"],
            full_conf,
            fractional_msg,
        )
        expect_accepted(gpu_estimator, ["3.5.1"], full_conf)

    # Yarn and K8s mode without spark.task.resource.gpu.amount
    for master in cluster_masters:
        partial_conf = build_conf(master, executor_gpu="1")
        expect_rejected(
            gpu_estimator,
            ["3.3.0", "3.4.0", "3.5.0"],
            partial_conf,
            missing_task_msg,
        )
        expect_accepted(gpu_estimator, ["3.5.1"], partial_conf)
|
||||
|
||||
def test_skip_stage_level_scheduling(self) -> None:
|
||||
conf = (
|
||||
standalone_conf = (
|
||||
SparkConf()
|
||||
.setMaster("spark://foo")
|
||||
.set("spark.executor.cores", "12")
|
||||
@@ -943,26 +1062,36 @@ class TestPySparkLocal:
|
||||
classifer_on_gpu = SparkXGBClassifier(use_gpu=True)
|
||||
|
||||
# the correct configurations should not skip stage-level scheduling
|
||||
assert not classifer_on_gpu._skip_stage_level_scheduling("3.4.0", conf)
|
||||
assert not classifer_on_gpu._skip_stage_level_scheduling(
|
||||
"3.4.0", standalone_conf
|
||||
)
|
||||
assert not classifer_on_gpu._skip_stage_level_scheduling(
|
||||
"3.4.1", standalone_conf
|
||||
)
|
||||
assert not classifer_on_gpu._skip_stage_level_scheduling(
|
||||
"3.5.0", standalone_conf
|
||||
)
|
||||
assert not classifer_on_gpu._skip_stage_level_scheduling(
|
||||
"3.5.1", standalone_conf
|
||||
)
|
||||
|
||||
# spark version < 3.4.0
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.3.0", conf)
|
||||
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.3.0", standalone_conf)
|
||||
# not run on GPU
|
||||
assert classifer_on_cpu._skip_stage_level_scheduling("3.4.0", conf)
|
||||
assert classifer_on_cpu._skip_stage_level_scheduling("3.4.0", standalone_conf)
|
||||
|
||||
# spark.executor.cores is not set
|
||||
badConf = (
|
||||
bad_conf = (
|
||||
SparkConf()
|
||||
.setMaster("spark://foo")
|
||||
.set("spark.task.cpus", "1")
|
||||
.set("spark.executor.resource.gpu.amount", "1")
|
||||
.set("spark.task.resource.gpu.amount", "0.08")
|
||||
)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", bad_conf)
|
||||
|
||||
# spark.executor.cores=1
|
||||
badConf = (
|
||||
bad_conf = (
|
||||
SparkConf()
|
||||
.setMaster("spark://foo")
|
||||
.set("spark.executor.cores", "1")
|
||||
@@ -970,20 +1099,20 @@ class TestPySparkLocal:
|
||||
.set("spark.executor.resource.gpu.amount", "1")
|
||||
.set("spark.task.resource.gpu.amount", "0.08")
|
||||
)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", bad_conf)
|
||||
|
||||
# spark.executor.resource.gpu.amount is not set
|
||||
badConf = (
|
||||
bad_conf = (
|
||||
SparkConf()
|
||||
.setMaster("spark://foo")
|
||||
.set("spark.executor.cores", "12")
|
||||
.set("spark.task.cpus", "1")
|
||||
.set("spark.task.resource.gpu.amount", "0.08")
|
||||
)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", bad_conf)
|
||||
|
||||
# spark.executor.resource.gpu.amount>1
|
||||
badConf = (
|
||||
bad_conf = (
|
||||
SparkConf()
|
||||
.setMaster("spark://foo")
|
||||
.set("spark.executor.cores", "12")
|
||||
@@ -991,20 +1120,20 @@ class TestPySparkLocal:
|
||||
.set("spark.executor.resource.gpu.amount", "2")
|
||||
.set("spark.task.resource.gpu.amount", "0.08")
|
||||
)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", bad_conf)
|
||||
|
||||
# spark.task.resource.gpu.amount is not set
|
||||
badConf = (
|
||||
bad_conf = (
|
||||
SparkConf()
|
||||
.setMaster("spark://foo")
|
||||
.set("spark.executor.cores", "12")
|
||||
.set("spark.task.cpus", "1")
|
||||
.set("spark.executor.resource.gpu.amount", "1")
|
||||
)
|
||||
assert not classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
|
||||
assert not classifer_on_gpu._skip_stage_level_scheduling("3.4.0", bad_conf)
|
||||
|
||||
# spark.task.resource.gpu.amount=1
|
||||
badConf = (
|
||||
bad_conf = (
|
||||
SparkConf()
|
||||
.setMaster("spark://foo")
|
||||
.set("spark.executor.cores", "12")
|
||||
@@ -1012,29 +1141,32 @@ class TestPySparkLocal:
|
||||
.set("spark.executor.resource.gpu.amount", "1")
|
||||
.set("spark.task.resource.gpu.amount", "1")
|
||||
)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", bad_conf)
|
||||
|
||||
# yarn
|
||||
badConf = (
|
||||
SparkConf()
|
||||
.setMaster("yarn")
|
||||
.set("spark.executor.cores", "12")
|
||||
.set("spark.task.cpus", "1")
|
||||
.set("spark.executor.resource.gpu.amount", "1")
|
||||
.set("spark.task.resource.gpu.amount", "1")
|
||||
)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
|
||||
# For Yarn and K8S
|
||||
for mode in ["yarn", "k8s://"]:
|
||||
for gpu_amount in ["0.08", "0.2", "1.0"]:
|
||||
conf = (
|
||||
SparkConf()
|
||||
.setMaster(mode)
|
||||
.set("spark.executor.cores", "12")
|
||||
.set("spark.task.cpus", "1")
|
||||
.set("spark.executor.resource.gpu.amount", "1")
|
||||
.set("spark.task.resource.gpu.amount", gpu_amount)
|
||||
)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.3.0", conf)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", conf)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.4.1", conf)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.5.0", conf)
|
||||
|
||||
# k8s
|
||||
badConf = (
|
||||
SparkConf()
|
||||
.setMaster("k8s://")
|
||||
.set("spark.executor.cores", "12")
|
||||
.set("spark.task.cpus", "1")
|
||||
.set("spark.executor.resource.gpu.amount", "1")
|
||||
.set("spark.task.resource.gpu.amount", "1")
|
||||
)
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
|
||||
# This will be fixed when spark 4.0.0 is released.
|
||||
if gpu_amount == "1.0":
|
||||
assert classifer_on_gpu._skip_stage_level_scheduling("3.5.1", conf)
|
||||
else:
|
||||
# Starting from 3.5.1+, stage-level scheduling is working for Yarn and K8s
|
||||
assert not classifer_on_gpu._skip_stage_level_scheduling(
|
||||
"3.5.1", conf
|
||||
)
|
||||
|
||||
|
||||
class XgboostLocalTest(SparkTestCase):
|
||||
|
||||
Reference in New Issue
Block a user