Fix quantile tests running on multi-gpus (#8775)

* Fix quantile tests running on multi-gpus
* Run some gtests with multiple GPUs
* fix mgpu test naming
* Instruct NCCL to print extra logs
* Allocate extra space in /dev/shm to enable NCCL
* use gtest_skip to skip mgpu tests

---------

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
parent 225b3158f6
commit ed91e775ec
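In short: the pipeline hunk below adds a Buildkite step that runs the Google Tests on the multi-GPU queue; the new tests/buildkite/test-cpp-mgpu.sh downloads the testxgboost binary and runs it with --gtest_filter=*MGPU* inside the GPU container, with a 4 GB /dev/shm so NCCL has enough shared memory; and the quantile test hunks give the distributed tests an MGPU prefix, place all data on each worker's own GPU, and skip cleanly on single-GPU machines. That last change swaps the old in-body guard (assert when distributed, silently return when world == 1) for GTEST_SKIP(), so a single-GPU run reports the test as skipped rather than as passed. A minimal sketch of the gating pattern, using the same helpers that appear in the diff below (AllVisibleGPUs, RunWithInMemoryCommunicator):

    // Gate a multi-GPU test: report "skipped" rather than a vacuous "passed"
    // when fewer than two GPUs are visible.
    TEST(GPUQuantile, MGPUAllReduceBasic) {
      auto const n_gpus = AllVisibleGPUs();
      if (n_gpus <= 1) {
        GTEST_SKIP() << "Skipping MGPUAllReduceBasic test with # GPUs = " << n_gpus;
      }
      // Run the worker body under an in-memory communicator with world size
      // n_gpus, i.e. one worker per visible GPU.
      RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus);
    }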
@@ -36,6 +36,11 @@ steps:
       queue: linux-amd64-mgpu
   - wait
   #### -------- TEST --------
+  - label: ":console: Run Google Tests"
+    command: "tests/buildkite/test-cpp-mgpu.sh"
+    key: test-cpp-mgpu
+    agents:
+      queue: linux-amd64-mgpu
   - label: ":console: Test Python package, 4 GPUs"
     command: "tests/buildkite/test-python-gpu.sh mgpu"
     key: test-python-mgpu
tests/buildkite/test-cpp-mgpu.sh  (new executable file, 16 lines)
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+set -euo pipefail
+
+source tests/buildkite/conftest.sh
+
+# Allocate extra space in /dev/shm to enable NCCL
+export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'
+
+echo "--- Run Google Tests with CUDA, using multiple GPUs"
+buildkite-agent artifact download "build/testxgboost" . --step build-cuda
+chmod +x build/testxgboost
+tests/ci_build/ci_build.sh gpu nvidia-docker \
+  --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
+  --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
+  build/testxgboost --gtest_filter=*MGPU*
@@ -341,30 +341,26 @@ TEST(GPUQuantile, MultiMerge) {
 namespace {
 void TestAllReduceBasic(int32_t n_gpus) {
   auto const world = collective::GetWorldSize();
-  if (world != 1) {
-    ASSERT_EQ(world, n_gpus);
-  } else {
-    return;
-  }
-
   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
+    auto const device = collective::GetRank();
+
     // Set up single node version;
-    HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0);
+    HostDeviceVector<FeatureType> ft({}, device);
+    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, device);
 
     size_t intermediate_num_cuts = std::min(
         kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
     std::vector<SketchContainer> containers;
     for (auto rank = 0; rank < world; ++rank) {
-      HostDeviceVector<float> storage;
+      HostDeviceVector<float> storage({}, device);
       std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                      .Device(0)
+                                      .Device(device)
                                       .Seed(rank + seed)
                                       .GenerateArrayInterface(&storage);
       data::CupyAdapter adapter(interface_str);
-      HostDeviceVector<FeatureType> ft;
-      containers.emplace_back(ft, n_bins, kCols, kRows, 0);
+      HostDeviceVector<FeatureType> ft({}, device);
+      containers.emplace_back(ft, n_bins, kCols, kRows, device);
       AdapterDeviceSketch(adapter.Value(), n_bins, info,
                           std::numeric_limits<float>::quiet_NaN(),
                           &containers.back());
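The essence of the hunk above: nothing is pinned to GPU 0 any more. Each worker derives its device ordinal from its communicator rank and allocates its containers and generated data on that device. A condensed, annotated excerpt of the new per-worker setup (a fragment of the test body above, not a standalone program; n_bins, kCols, kRows, rank and seed come from the enclosing test):

    // One worker per GPU: the collective rank doubles as the CUDA device ordinal.
    auto const device = collective::GetRank();

    // Containers that used to be constructed with device 0 now take `device`,
    // so every worker's sketch lives on its own GPU.
    HostDeviceVector<FeatureType> ft({}, device);
    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, device);

    // The generated input matrix is placed on the same device as well.
    HostDeviceVector<float> storage({}, device);
    std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
                                    .Device(device)
                                    .Seed(rank + seed)
                                    .GenerateArrayInterface(&storage);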
@@ -375,16 +371,16 @@ void TestAllReduceBasic(int32_t n_gpus) {
       sketch_on_single_node.FixError();
     }
     sketch_on_single_node.Unique();
-    TestQuantileElemRank(0, sketch_on_single_node.Data(),
+    TestQuantileElemRank(device, sketch_on_single_node.Data(),
                          sketch_on_single_node.ColumnsPtr(), true);
 
     // Set up distributed version. We rely on using rank as seed to generate
     // the exact same copy of data.
     auto rank = collective::GetRank();
-    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
-    HostDeviceVector<float> storage;
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
+    HostDeviceVector<float> storage({}, device);
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                    .Device(0)
+                                    .Device(device)
                                     .Seed(rank + seed)
                                     .GenerateArrayInterface(&storage);
     data::CupyAdapter adapter(interface_str);
@@ -399,7 +395,7 @@ void TestAllReduceBasic(int32_t n_gpus) {
     ASSERT_EQ(sketch_distributed.Data().size(),
               sketch_on_single_node.Data().size());
 
-    TestQuantileElemRank(0, sketch_distributed.Data(),
+    TestQuantileElemRank(device, sketch_distributed.Data(),
                          sketch_distributed.ColumnsPtr(), true);
 
     std::vector<SketchEntry> single_node_data(
@@ -420,29 +416,27 @@ void TestAllReduceBasic(int32_t n_gpus) {
   }
 } // anonymous namespace
 
-TEST(GPUQuantile, AllReduceBasic) {
+TEST(GPUQuantile, MGPUAllReduceBasic) {
   auto const n_gpus = AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUAllReduceBasic test with # GPUs = " << n_gpus;
+  }
   RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus);
 }
 
 namespace {
 void TestSameOnAllWorkers(int32_t n_gpus) {
   auto world = collective::GetWorldSize();
-  if (world != 1) {
-    ASSERT_EQ(world, n_gpus);
-  } else {
-    return;
-  }
-
   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
                                  MetaInfo const &info) {
-    auto rank = collective::GetRank();
-    HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
-    HostDeviceVector<float> storage;
+    auto const rank = collective::GetRank();
+    auto const device = rank;
+    HostDeviceVector<FeatureType> ft({}, device);
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
+    HostDeviceVector<float> storage({}, device);
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                    .Device(0)
+                                    .Device(device)
                                     .Seed(rank + seed)
                                     .GenerateArrayInterface(&storage);
     data::CupyAdapter adapter(interface_str);
@@ -451,7 +445,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) {
                         &sketch_distributed);
     sketch_distributed.AllReduce();
     sketch_distributed.Unique();
-    TestQuantileElemRank(0, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);
+    TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);
 
     // Test for all workers having the same sketch.
     size_t n_data = sketch_distributed.Data().size();
@@ -468,7 +462,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) {
     thrust::copy(thrust::device, local_data.data(),
                  local_data.data() + local_data.size(),
                  all_workers.begin() + local_data.size() * rank);
-    collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(0);
+    collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(device);
 
     communicator->AllReduceSum(all_workers.data().get(), all_workers.size());
     communicator->Synchronize();
@@ -492,8 +486,11 @@ void TestSameOnAllWorkers(int32_t n_gpus) {
   }
 } // anonymous namespace
 
-TEST(GPUQuantile, SameOnAllWorkers) {
+TEST(GPUQuantile, MGPUSameOnAllWorkers) {
   auto const n_gpus = AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUSameOnAllWorkers test with # GPUs = " << n_gpus;
+  }
   RunWithInMemoryCommunicator(n_gpus, TestSameOnAllWorkers, n_gpus);
 }
 
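Because the new CI script invokes the binary with --gtest_filter=*MGPU*, the MGPU prefix in the renamed tests is what selects them for the multi-GPU Buildkite step, and GTEST_SKIP() keeps them harmless anywhere fewer than two GPUs are visible. A further multi-GPU gtest could follow the same recipe; a hypothetical sketch (the test name and worker body are illustrative only, not part of this patch):

    namespace {
    // Hypothetical worker body; the harness runs one copy per visible GPU.
    void TestMyDistributedThing(int32_t n_gpus) {
      ASSERT_EQ(collective::GetWorldSize(), n_gpus);  // one worker per GPU
      auto const device = collective::GetRank();      // rank doubles as the device ordinal
      HostDeviceVector<float> data({}, device);       // allocate on this worker's GPU
      // ... exercise the distributed code path with `data` ...
    }
    } // anonymous namespace

    // The "MGPU" prefix matches the CI filter --gtest_filter=*MGPU*.
    TEST(GPUQuantile, MGPUMyDistributedThing) {
      auto const n_gpus = AllVisibleGPUs();
      if (n_gpus <= 1) {
        GTEST_SKIP() << "Skipping MGPUMyDistributedThing test with # GPUs = " << n_gpus;
      }
      RunWithInMemoryCommunicator(n_gpus, TestMyDistributedThing, n_gpus);
    }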