[Breaking] Switch from rabit to the collective communicator (#8257)

* Switch from rabit to the collective communicator

* fix size_t specialization

* really fix size_t

* try again

* add include

* more include

* fix lint errors

* remove rabit includes

* fix pylint error

* return dict from communicator context

* fix communicator shutdown

* fix dask test

* reset communicator mocklist

* fix distributed tests

* do not save device communicator

* fix jvm gpu tests

* add python test for federated communicator

* Update gputreeshap submodule

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Rong Ou
2022-10-05 15:39:01 -07:00
committed by GitHub
parent e47b3a3da3
commit 668b8a0ea4
79 changed files with 805 additions and 2212 deletions

View File

@@ -46,8 +46,8 @@ template <bool use_column>
void TestDistributedQuantile(size_t rows, size_t cols) {
std::string msg {"Skipping AllReduce test"};
int32_t constexpr kWorkers = 4;
InitRabitContext(msg, kWorkers);
auto world = rabit::GetWorldSize();
InitCommunicatorContext(msg, kWorkers);
auto world = collective::GetWorldSize();
if (world != 1) {
ASSERT_EQ(world, kWorkers);
} else {
@@ -65,7 +65,7 @@ void TestDistributedQuantile(size_t rows, size_t cols) {
// Generate cuts for distributed environment.
auto sparsity = 0.5f;
auto rank = rabit::GetRank();
auto rank = collective::GetRank();
std::vector<FeatureType> ft(cols);
for (size_t i = 0; i < ft.size(); ++i) {
ft[i] = (i % 2 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical;
@@ -99,8 +99,8 @@ void TestDistributedQuantile(size_t rows, size_t cols) {
sketch_distributed.MakeCuts(&distributed_cuts);
// Generate cuts for single node environment
rabit::Finalize();
CHECK_EQ(rabit::GetWorldSize(), 1);
collective::Finalize();
CHECK_EQ(collective::GetWorldSize(), 1);
std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; });
m->Info().num_row_ = world * rows;
ContainerType<use_column> sketch_on_single_node(n_bins, m->Info().feature_types.ConstHostSpan(),
@@ -184,8 +184,8 @@ TEST(Quantile, SameOnAllWorkers) {
#if defined(__unix__)
std::string msg{"Skipping Quantile AllreduceBasic test"};
int32_t constexpr kWorkers = 4;
InitRabitContext(msg, kWorkers);
auto world = rabit::GetWorldSize();
InitCommunicatorContext(msg, kWorkers);
auto world = collective::GetWorldSize();
if (world != 1) {
CHECK_EQ(world, kWorkers);
} else {
@@ -196,7 +196,7 @@ TEST(Quantile, SameOnAllWorkers) {
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(
kRows, [=](int32_t seed, size_t n_bins, MetaInfo const&) {
auto rank = rabit::GetRank();
auto rank = collective::GetRank();
HostDeviceVector<float> storage;
std::vector<FeatureType> ft(kCols);
for (size_t i = 0; i < ft.size(); ++i) {
@@ -217,12 +217,12 @@ TEST(Quantile, SameOnAllWorkers) {
std::vector<float> cut_min_values(cuts.MinValues().size() * world, 0);
size_t value_size = cuts.Values().size();
rabit::Allreduce<rabit::op::Max>(&value_size, 1);
collective::Allreduce<collective::Operation::kMax>(&value_size, 1);
size_t ptr_size = cuts.Ptrs().size();
rabit::Allreduce<rabit::op::Max>(&ptr_size, 1);
collective::Allreduce<collective::Operation::kMax>(&ptr_size, 1);
CHECK_EQ(ptr_size, kCols + 1);
size_t min_value_size = cuts.MinValues().size();
rabit::Allreduce<rabit::op::Max>(&min_value_size, 1);
collective::Allreduce<collective::Operation::kMax>(&min_value_size, 1);
CHECK_EQ(min_value_size, kCols);
size_t value_offset = value_size * rank;
@@ -235,9 +235,9 @@ TEST(Quantile, SameOnAllWorkers) {
std::copy(cuts.MinValues().cbegin(), cuts.MinValues().cend(),
cut_min_values.begin() + min_values_offset);
rabit::Allreduce<rabit::op::Sum>(cut_values.data(), cut_values.size());
rabit::Allreduce<rabit::op::Sum>(cut_ptrs.data(), cut_ptrs.size());
rabit::Allreduce<rabit::op::Sum>(cut_min_values.data(), cut_min_values.size());
collective::Allreduce<collective::Operation::kSum>(cut_values.data(), cut_values.size());
collective::Allreduce<collective::Operation::kSum>(cut_ptrs.data(), cut_ptrs.size());
collective::Allreduce<collective::Operation::kSum>(cut_min_values.data(), cut_min_values.size());
for (int32_t i = 0; i < world; i++) {
for (size_t j = 0; j < value_size; ++j) {
@@ -256,7 +256,7 @@ TEST(Quantile, SameOnAllWorkers) {
}
}
});
rabit::Finalize();
collective::Finalize();
#endif // defined(__unix__)
}
} // namespace common

View File

@@ -1,6 +1,7 @@
#include <gtest/gtest.h>
#include "test_quantile.h"
#include "../helpers.h"
#include "../../../src/collective/device_communicator.cuh"
#include "../../../src/common/hist_util.cuh"
#include "../../../src/common/quantile.cuh"
@@ -341,17 +342,14 @@ TEST(GPUQuantile, AllReduceBasic) {
// This test is supposed to run by a python test that setups the environment.
std::string msg {"Skipping AllReduce test"};
auto n_gpus = AllVisibleGPUs();
InitRabitContext(msg, n_gpus);
auto world = rabit::GetWorldSize();
InitCommunicatorContext(msg, n_gpus);
auto world = collective::GetWorldSize();
if (world != 1) {
ASSERT_EQ(world, n_gpus);
} else {
return;
}
auto reducer = std::make_shared<dh::AllReducer>();
reducer->Init(0);
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
// Set up single node version;
@@ -385,8 +383,8 @@ TEST(GPUQuantile, AllReduceBasic) {
// Set up distributed version. We rely on using rank as seed to generate
// the exact same copy of data.
auto rank = rabit::GetRank();
SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0, reducer);
auto rank = collective::GetRank();
SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
@@ -422,28 +420,26 @@ TEST(GPUQuantile, AllReduceBasic) {
ASSERT_NEAR(single_node_data[i].wmin, distributed_data[i].wmin, Eps);
}
});
rabit::Finalize();
collective::Finalize();
}
TEST(GPUQuantile, SameOnAllWorkers) {
std::string msg {"Skipping SameOnAllWorkers test"};
auto n_gpus = AllVisibleGPUs();
InitRabitContext(msg, n_gpus);
auto world = rabit::GetWorldSize();
InitCommunicatorContext(msg, n_gpus);
auto world = collective::GetWorldSize();
if (world != 1) {
ASSERT_EQ(world, n_gpus);
} else {
return;
}
auto reducer = std::make_shared<dh::AllReducer>();
reducer->Init(0);
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
MetaInfo const &info) {
auto rank = rabit::GetRank();
auto rank = collective::GetRank();
HostDeviceVector<FeatureType> ft;
SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0, reducer);
SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
@@ -459,7 +455,7 @@ TEST(GPUQuantile, SameOnAllWorkers) {
// Test for all workers having the same sketch.
size_t n_data = sketch_distributed.Data().size();
rabit::Allreduce<rabit::op::Max>(&n_data, 1);
collective::Allreduce<collective::Operation::kMax>(&n_data, 1);
ASSERT_EQ(n_data, sketch_distributed.Data().size());
size_t size_as_float =
sketch_distributed.Data().size_bytes() / sizeof(float);
@@ -472,9 +468,10 @@ TEST(GPUQuantile, SameOnAllWorkers) {
thrust::copy(thrust::device, local_data.data(),
local_data.data() + local_data.size(),
all_workers.begin() + local_data.size() * rank);
reducer->AllReduceSum(all_workers.data().get(), all_workers.data().get(),
all_workers.size());
reducer->Synchronize();
collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(0);
communicator->AllReduceSum(all_workers.data().get(), all_workers.size());
communicator->Synchronize();
auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float);
std::vector<float> h_base_line(base_line.size());

View File

@@ -1,16 +1,16 @@
#ifndef XGBOOST_TESTS_CPP_COMMON_TEST_QUANTILE_H_
#define XGBOOST_TESTS_CPP_COMMON_TEST_QUANTILE_H_
#include <rabit/rabit.h>
#include <algorithm>
#include <string>
#include <vector>
#include "../helpers.h"
#include "../../src/collective/communicator-inl.h"
namespace xgboost {
namespace common {
inline void InitRabitContext(std::string msg, int32_t n_workers) {
inline void InitCommunicatorContext(std::string msg, int32_t n_workers) {
auto port = std::getenv("DMLC_TRACKER_PORT");
std::string port_str;
if (port) {
@@ -28,12 +28,11 @@ inline void InitRabitContext(std::string msg, int32_t n_workers) {
return;
}
std::vector<std::string> envs{
"DMLC_TRACKER_PORT=" + port_str,
"DMLC_TRACKER_URI=" + uri_str,
"DMLC_NUM_WORKER=" + std::to_string(n_workers)};
char* c_envs[] {&(envs[0][0]), &(envs[1][0]), &(envs[2][0])};
rabit::Init(3, c_envs);
Json config{JsonObject()};
config["DMLC_TRACKER_PORT"] = port_str;
config["DMLC_TRACKER_URI"] = uri_str;
config["DMLC_NUM_WORKER"] = n_workers;
collective::Init(config);
}
template <typename Fn> void RunWithSeedsAndBins(size_t rows, Fn fn) {