[Breaking] Switch from rabit to the collective communicator (#8257)
* Switch from rabit to the collective communicator
* fix size_t specialization
* really fix size_t
* try again
* add include
* more include
* fix lint errors
* remove rabit includes
* fix pylint error
* return dict from communicator context
* fix communicator shutdown
* fix dask test
* reset communicator mocklist
* fix distributed tests
* do not save device communicator
* fix jvm gpu tests
* add python test for federated communicator
* Update gputreeshap submodule

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
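For reference, a minimal sketch (not part of the commit) of how host-side calls map from the rabit namespace to the new collective namespace, using only the functions that appear in the hunks below. The xgboost namespace wrapper, the free function, and the placeholder values are assumptions for illustration; the collective header include is omitted because its path depends on the build tree, and initialization is elided because the tests go through a helper (InitCommunicatorContext).

#include <cstddef>
// NOTE: the include for the collective communicator header is assumed to be
// available; it is not spelled out here.

namespace xgboost {

// Sketch: each rabit call noted in a comment is replaced by the collective
// equivalent shown, mirroring the hunks in the diff below.
void CollectiveMigrationSketch() {
  auto world = collective::GetWorldSize();  // was rabit::GetWorldSize()
  auto rank = collective::GetRank();        // was rabit::GetRank()

  // Host-side allreduce: rabit::Allreduce<rabit::op::Max>(&n, 1) becomes a
  // templated Operation enum in the collective namespace.
  std::size_t n = static_cast<std::size_t>(rank);
  collective::Allreduce<collective::Operation::kMax>(&n, 1);

  (void)world;
  collective::Finalize();  // was rabit::Finalize()
}

}  // namespace xgboost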
@@ -1,6 +1,7 @@
 #include <gtest/gtest.h>
 
 #include "test_quantile.h"
 #include "../helpers.h"
+#include "../../../src/collective/device_communicator.cuh"
 #include "../../../src/common/hist_util.cuh"
 #include "../../../src/common/quantile.cuh"
@@ -341,17 +342,14 @@ TEST(GPUQuantile, AllReduceBasic) {
   // This test is supposed to run by a python test that setups the environment.
   std::string msg {"Skipping AllReduce test"};
   auto n_gpus = AllVisibleGPUs();
-  InitRabitContext(msg, n_gpus);
-  auto world = rabit::GetWorldSize();
+  InitCommunicatorContext(msg, n_gpus);
+  auto world = collective::GetWorldSize();
   if (world != 1) {
     ASSERT_EQ(world, n_gpus);
   } else {
     return;
   }
 
-  auto reducer = std::make_shared<dh::AllReducer>();
-  reducer->Init(0);
-
   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
     // Set up single node version;
@@ -385,8 +383,8 @@ TEST(GPUQuantile, AllReduceBasic) {
 
     // Set up distributed version. We rely on using rank as seed to generate
     // the exact same copy of data.
-    auto rank = rabit::GetRank();
-    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0, reducer);
+    auto rank = collective::GetRank();
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
     HostDeviceVector<float> storage;
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
                                     .Device(0)
@@ -422,28 +420,26 @@ TEST(GPUQuantile, AllReduceBasic) {
       ASSERT_NEAR(single_node_data[i].wmin, distributed_data[i].wmin, Eps);
     }
   });
-  rabit::Finalize();
+  collective::Finalize();
 }
 
 TEST(GPUQuantile, SameOnAllWorkers) {
   std::string msg {"Skipping SameOnAllWorkers test"};
   auto n_gpus = AllVisibleGPUs();
-  InitRabitContext(msg, n_gpus);
-  auto world = rabit::GetWorldSize();
+  InitCommunicatorContext(msg, n_gpus);
+  auto world = collective::GetWorldSize();
   if (world != 1) {
     ASSERT_EQ(world, n_gpus);
   } else {
     return;
   }
-  auto reducer = std::make_shared<dh::AllReducer>();
-  reducer->Init(0);
-
   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
                                  MetaInfo const &info) {
-    auto rank = rabit::GetRank();
+    auto rank = collective::GetRank();
     HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0, reducer);
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
     HostDeviceVector<float> storage;
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
                                     .Device(0)
@@ -459,7 +455,7 @@ TEST(GPUQuantile, SameOnAllWorkers) {
 
     // Test for all workers having the same sketch.
    size_t n_data = sketch_distributed.Data().size();
-    rabit::Allreduce<rabit::op::Max>(&n_data, 1);
+    collective::Allreduce<collective::Operation::kMax>(&n_data, 1);
     ASSERT_EQ(n_data, sketch_distributed.Data().size());
     size_t size_as_float =
         sketch_distributed.Data().size_bytes() / sizeof(float);
@@ -472,9 +468,10 @@ TEST(GPUQuantile, SameOnAllWorkers) {
     thrust::copy(thrust::device, local_data.data(),
                  local_data.data() + local_data.size(),
                  all_workers.begin() + local_data.size() * rank);
-    reducer->AllReduceSum(all_workers.data().get(), all_workers.data().get(),
-                          all_workers.size());
-    reducer->Synchronize();
+    collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(0);
+
+    communicator->AllReduceSum(all_workers.data().get(), all_workers.size());
+    communicator->Synchronize();
 
     auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float);
     std::vector<float> h_base_line(base_line.size());
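For context, a minimal sketch (not part of the commit) of the device-side pattern that replaces dh::AllReducer in the last hunk: the per-device communicator is fetched once, the sum is done in place on a single buffer (instead of separate input and output pointers), and Synchronize blocks until the collective completes. The free function and the dh::device_vector parameter type are assumptions based on the surrounding test code.

// Sketch only; assumes the device_communicator.cuh header from the diff
// above (and the dh device-vector helpers used by the test) are available.
void DeviceAllReduceSketch(dh::device_vector<float>* all_workers) {
  // Fetch the communicator for device ordinal 0; this replaces constructing
  // a dh::AllReducer and calling reducer->Init(0).
  collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(0);

  // In-place sum across all workers, then wait for the collective to finish.
  communicator->AllReduceSum(all_workers->data().get(), all_workers->size());
  communicator->Synchronize();
}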