Fix quantile tests running on multi-gpus (#8775)

* Fix quantile tests running on multi-gpus

* Run some gtests with multiple GPUs

* Fix mgpu test naming

* Instruct NCCL to print extra logs

* Allocate extra space in /dev/shm to enable NCCL

* Use gtest_skip to skip mgpu tests

---------

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
Rong Ou, 2023-02-12 17:00:26 -08:00 (committed by GitHub)
parent 225b3158f6
commit ed91e775ec
3 changed files with 50 additions and 32 deletions
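For context on the "Use gtest_skip" bullet above: GoogleTest's GTEST_SKIP() macro marks a test as skipped at runtime, so on single-GPU machines the multi-GPU tests report SKIPPED instead of silently passing via an early return (the pattern previously used, visible in the removed lines of the test_quantile.cu hunks below). The sketch below is a minimal self-contained illustration of that guard plus the MGPU naming convention the new CI step filters on; VisibleGpus() is a hypothetical stand-in for xgboost's device-count query, not code from this commit.

#include <gtest/gtest.h>

// Hypothetical stand-in for xgboost's AllVisibleGPUs(); a real build would
// query the device count (e.g. via cudaGetDeviceCount) instead.
int VisibleGpus() { return 1; }

// Multi-GPU tests carry an "MGPU" prefix so a CI step can select them with
// --gtest_filter=*MGPU*, and they skip themselves cleanly on single-GPU boxes.
TEST(Example, MGPUSomething) {
  int const n_gpus = VisibleGpus();
  if (n_gpus <= 1) {
    GTEST_SKIP() << "Requires more than one GPU, found " << n_gpus;
  }
  // ... multi-GPU assertions would go here ...
}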

tests/buildkite/pipeline-mgpu.yml

@@ -36,6 +36,11 @@ steps:
       queue: linux-amd64-mgpu
   - wait
   #### -------- TEST --------
+  - label: ":console: Run Google Tests"
+    command: "tests/buildkite/test-cpp-mgpu.sh"
+    key: test-cpp-mgpu
+    agents:
+      queue: linux-amd64-mgpu
   - label: ":console: Test Python package, 4 GPUs"
     command: "tests/buildkite/test-python-gpu.sh mgpu"
     key: test-python-mgpu

tests/buildkite/test-cpp-mgpu.sh (new file)

@@ -0,0 +1,16 @@
+#!/bin/bash
+
+set -euo pipefail
+
+source tests/buildkite/conftest.sh
+
+# Allocate extra space in /dev/shm to enable NCCL
+export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'
+
+echo "--- Run Google Tests with CUDA, using multiple GPUs"
+buildkite-agent artifact download "build/testxgboost" . --step build-cuda
+chmod +x build/testxgboost
+tests/ci_build/ci_build.sh gpu nvidia-docker \
+  --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
+  --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
+  build/testxgboost --gtest_filter=*MGPU*

tests/cpp/common/test_quantile.cu

@@ -341,30 +341,26 @@ TEST(GPUQuantile, MultiMerge) {
 namespace {
 void TestAllReduceBasic(int32_t n_gpus) {
   auto const world = collective::GetWorldSize();
-  if (world != 1) {
-    ASSERT_EQ(world, n_gpus);
-  } else {
-    return;
-  }
 
   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
+    auto const device = collective::GetRank();
     // Set up single node version;
-    HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0);
+    HostDeviceVector<FeatureType> ft({}, device);
+    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, device);
     size_t intermediate_num_cuts = std::min(
         kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
     std::vector<SketchContainer> containers;
     for (auto rank = 0; rank < world; ++rank) {
-      HostDeviceVector<float> storage;
+      HostDeviceVector<float> storage({}, device);
       std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                      .Device(0)
+                                      .Device(device)
                                       .Seed(rank + seed)
                                       .GenerateArrayInterface(&storage);
       data::CupyAdapter adapter(interface_str);
-      HostDeviceVector<FeatureType> ft;
-      containers.emplace_back(ft, n_bins, kCols, kRows, 0);
+      HostDeviceVector<FeatureType> ft({}, device);
+      containers.emplace_back(ft, n_bins, kCols, kRows, device);
       AdapterDeviceSketch(adapter.Value(), n_bins, info,
                           std::numeric_limits<float>::quiet_NaN(),
                           &containers.back());
@@ -375,16 +371,16 @@ void TestAllReduceBasic(int32_t n_gpus) {
       sketch_on_single_node.FixError();
     }
     sketch_on_single_node.Unique();
-    TestQuantileElemRank(0, sketch_on_single_node.Data(),
+    TestQuantileElemRank(device, sketch_on_single_node.Data(),
                          sketch_on_single_node.ColumnsPtr(), true);
 
     // Set up distributed version.  We rely on using rank as seed to generate
     // the exact same copy of data.
     auto rank = collective::GetRank();
-    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
-    HostDeviceVector<float> storage;
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
+    HostDeviceVector<float> storage({}, device);
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                    .Device(0)
+                                    .Device(device)
                                     .Seed(rank + seed)
                                     .GenerateArrayInterface(&storage);
     data::CupyAdapter adapter(interface_str);
@@ -399,7 +395,7 @@ void TestAllReduceBasic(int32_t n_gpus) {
     ASSERT_EQ(sketch_distributed.Data().size(),
               sketch_on_single_node.Data().size());
-    TestQuantileElemRank(0, sketch_distributed.Data(),
+    TestQuantileElemRank(device, sketch_distributed.Data(),
                          sketch_distributed.ColumnsPtr(), true);
 
     std::vector<SketchEntry> single_node_data(
@@ -420,29 +416,27 @@ void TestAllReduceBasic(int32_t n_gpus) {
   }
 }
 }  // anonymous namespace
 
-TEST(GPUQuantile, AllReduceBasic) {
+TEST(GPUQuantile, MGPUAllReduceBasic) {
   auto const n_gpus = AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUAllReduceBasic test with # GPUs = " << n_gpus;
+  }
   RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus);
 }
 
 namespace {
 void TestSameOnAllWorkers(int32_t n_gpus) {
   auto world = collective::GetWorldSize();
-  if (world != 1) {
-    ASSERT_EQ(world, n_gpus);
-  } else {
-    return;
-  }
   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
                                  MetaInfo const &info) {
-    auto rank = collective::GetRank();
-    HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
-    HostDeviceVector<float> storage;
+    auto const rank = collective::GetRank();
+    auto const device = rank;
+    HostDeviceVector<FeatureType> ft({}, device);
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
+    HostDeviceVector<float> storage({}, device);
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                    .Device(0)
+                                    .Device(device)
                                     .Seed(rank + seed)
                                     .GenerateArrayInterface(&storage);
     data::CupyAdapter adapter(interface_str);
@@ -451,7 +445,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) {
                         &sketch_distributed);
     sketch_distributed.AllReduce();
     sketch_distributed.Unique();
-    TestQuantileElemRank(0, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);
+    TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);
 
     // Test for all workers having the same sketch.
     size_t n_data = sketch_distributed.Data().size();
@@ -468,7 +462,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) {
     thrust::copy(thrust::device, local_data.data(),
                  local_data.data() + local_data.size(),
                  all_workers.begin() + local_data.size() * rank);
-    collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(0);
+    collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(device);
 
     communicator->AllReduceSum(all_workers.data().get(), all_workers.size());
     communicator->Synchronize();
@@ -492,8 +486,11 @@ void TestSameOnAllWorkers(int32_t n_gpus) {
   }
 }
 }  // anonymous namespace
 
-TEST(GPUQuantile, SameOnAllWorkers) {
+TEST(GPUQuantile, MGPUSameOnAllWorkers) {
   auto const n_gpus = AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUSameOnAllWorkers test with # GPUs = " << n_gpus;
+  }
   RunWithInMemoryCommunicator(n_gpus, TestSameOnAllWorkers, n_gpus);
 }
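The recurring change in the test_quantile.cu hunks is replacing the hard-coded device 0 with device = rank (from collective::GetRank()), so that each in-memory worker drives its own GPU rather than all workers piling onto GPU 0. Below is a minimal standalone sketch of that rank-to-device mapping using plain CUDA and std::thread instead of xgboost's RunWithInMemoryCommunicator helper; the Worker function and buffer size are illustrative assumptions, not code from this commit.

#include <cuda_runtime.h>

#include <iostream>
#include <thread>
#include <vector>

// Each "worker" binds to the GPU matching its rank, mirroring the
// device = rank mapping adopted in the diff above. A real collective
// communicator would also be set up per rank; that part is omitted here.
void Worker(int rank) {
  cudaSetDevice(rank);        // subsequent allocations/kernels target GPU `rank`
  void* buf = nullptr;
  cudaMalloc(&buf, 1 << 20);  // per-rank scratch memory on its own device
  // ... per-rank sketching / all-reduce would happen here ...
  cudaFree(buf);
}

int main() {
  int n_gpus = 0;
  cudaGetDeviceCount(&n_gpus);
  std::vector<std::thread> workers;
  for (int rank = 0; rank < n_gpus; ++rank) {
    workers.emplace_back(Worker, rank);
  }
  for (auto& t : workers) {
    t.join();
  }
  std::cout << "ran " << n_gpus << " single-GPU workers\n";
  return 0;
}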