Revamp the rabit implementation. (#10112)

This PR replaces the original RABIT implementation with a new one, which has already been partially merged into XGBoost. The new one features:
- Federated learning for both CPU and GPU.
- NCCL.
- More data types.
- A unified interface for all the underlying implementations.
- Improved timeout handling for both tracker and workers.
- Exhaustive tests with metrics (fixed a couple of bugs along the way).
- A reusable tracker for Python and JVM packages.
This commit is contained in:
Jiaming Yuan
2024-05-20 11:56:23 +08:00
committed by GitHub
parent ba9b4cb1ee
commit a5a58102e5
195 changed files with 2768 additions and 9234 deletions

View File

@@ -15,8 +15,8 @@ namespace xgboost::common {
TEST(MemoryFixSizeBuffer, Seek) {
size_t constexpr kSize { 64 };
std::vector<int32_t> memory( kSize );
rabit::utils::MemoryFixSizeBuffer buf(memory.data(), memory.size());
buf.Seek(rabit::utils::MemoryFixSizeBuffer::kSeekEnd);
MemoryFixSizeBuffer buf(memory.data(), memory.size());
buf.Seek(MemoryFixSizeBuffer::kSeekEnd);
size_t end = buf.Tell();
ASSERT_EQ(end, kSize);
}

View File

@@ -1,12 +1,16 @@
/**
* Copyright 2020-2023 by XGBoost Contributors
* Copyright 2020-2024, XGBoost Contributors
*/
#include "test_quantile.h"
#include <gtest/gtest.h>
#include <cstdint> // for int64_t
#include "../../../src/collective/allreduce.h"
#include "../../../src/common/hist_util.h"
#include "../../../src/data/adapter.h"
#include "../collective/test_worker.h" // for TestDistributedGlobal
#include "xgboost/context.h"
namespace xgboost::common {
@@ -90,6 +94,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) {
// Generate cuts for single node environment
collective::Finalize();
CHECK_EQ(collective::GetWorldSize(), 1);
std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; });
m->Info().num_row_ = world * rows;
@@ -145,7 +150,8 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) {
template <bool use_column>
void TestDistributedQuantile(size_t const rows, size_t const cols) {
auto constexpr kWorkers = 4;
RunWithInMemoryCommunicator(kWorkers, DoTestDistributedQuantile<use_column>, rows, cols);
collective::TestDistributedGlobal(
kWorkers, [=] { DoTestDistributedQuantile<use_column>(rows, cols); }, false);
}
} // anonymous namespace
@@ -272,7 +278,8 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) {
template <bool use_column>
void TestColSplitQuantile(size_t rows, size_t cols) {
auto constexpr kWorkers = 4;
RunWithInMemoryCommunicator(kWorkers, DoTestColSplitQuantile<use_column>, rows, cols);
collective::TestDistributedGlobal(kWorkers,
[=] { DoTestColSplitQuantile<use_column>(rows, cols); });
}
} // anonymous namespace
@@ -324,43 +331,56 @@ void TestSameOnAllWorkers() {
cut_ptrs(cuts.Ptrs().size() * world, 0);
std::vector<float> cut_min_values(cuts.MinValues().size() * world, 0);
size_t value_size = cuts.Values().size();
collective::Allreduce<collective::Operation::kMax>(&value_size, 1);
size_t ptr_size = cuts.Ptrs().size();
collective::Allreduce<collective::Operation::kMax>(&ptr_size, 1);
CHECK_EQ(ptr_size, kCols + 1);
size_t min_value_size = cuts.MinValues().size();
collective::Allreduce<collective::Operation::kMax>(&min_value_size, 1);
CHECK_EQ(min_value_size, kCols);
std::int64_t value_size = cuts.Values().size();
std::int64_t ptr_size = cuts.Ptrs().size();
std::int64_t min_value_size = cuts.MinValues().size();
size_t value_offset = value_size * rank;
std::copy(cuts.Values().begin(), cuts.Values().end(),
cut_values.begin() + value_offset);
size_t ptr_offset = ptr_size * rank;
std::copy(cuts.Ptrs().cbegin(), cuts.Ptrs().cend(),
cut_ptrs.begin() + ptr_offset);
size_t min_values_offset = min_value_size * rank;
auto rc = collective::Success() << [&] {
return collective::Allreduce(&ctx, &value_size, collective::Op::kMax);
} << [&] {
return collective::Allreduce(&ctx, &ptr_size, collective::Op::kMax);
} << [&] {
return collective::Allreduce(&ctx, &min_value_size, collective::Op::kMax);
};
collective::SafeColl(rc);
ASSERT_EQ(ptr_size, kCols + 1);
ASSERT_EQ(min_value_size, kCols);
std::size_t value_offset = value_size * rank;
std::copy(cuts.Values().begin(), cuts.Values().end(), cut_values.begin() + value_offset);
std::size_t ptr_offset = ptr_size * rank;
std::copy(cuts.Ptrs().cbegin(), cuts.Ptrs().cend(), cut_ptrs.begin() + ptr_offset);
std::size_t min_values_offset = min_value_size * rank;
std::copy(cuts.MinValues().cbegin(), cuts.MinValues().cend(),
cut_min_values.begin() + min_values_offset);
collective::Allreduce<collective::Operation::kSum>(cut_values.data(), cut_values.size());
collective::Allreduce<collective::Operation::kSum>(cut_ptrs.data(), cut_ptrs.size());
collective::Allreduce<collective::Operation::kSum>(cut_min_values.data(), cut_min_values.size());
rc = std::move(rc) << [&] {
return collective::Allreduce(&ctx, linalg::MakeVec(cut_values.data(), cut_values.size()),
collective::Op::kSum);
} << [&] {
return collective::Allreduce(&ctx, linalg::MakeVec(cut_ptrs.data(), cut_ptrs.size()),
collective::Op::kSum);
} << [&] {
return collective::Allreduce(
&ctx, linalg::MakeVec(cut_min_values.data(), cut_min_values.size()),
collective::Op::kSum);
};
collective::SafeColl(rc);
for (int32_t i = 0; i < world; i++) {
for (size_t j = 0; j < value_size; ++j) {
for (std::int32_t i = 0; i < world; i++) {
for (std::int64_t j = 0; j < value_size; ++j) {
size_t idx = i * value_size + j;
EXPECT_NEAR(cuts.Values().at(j), cut_values.at(idx), kRtEps);
ASSERT_NEAR(cuts.Values().at(j), cut_values.at(idx), kRtEps);
}
for (size_t j = 0; j < ptr_size; ++j) {
for (std::int64_t j = 0; j < ptr_size; ++j) {
size_t idx = i * ptr_size + j;
EXPECT_EQ(cuts.Ptrs().at(j), cut_ptrs.at(idx));
}
for (size_t j = 0; j < min_value_size; ++j) {
for (std::int64_t j = 0; j < min_value_size; ++j) {
size_t idx = i * min_value_size + j;
EXPECT_EQ(cuts.MinValues().at(j), cut_min_values.at(idx));
ASSERT_EQ(cuts.MinValues().at(j), cut_min_values.at(idx));
}
}
});
@@ -369,6 +389,6 @@ void TestSameOnAllWorkers() {
TEST(Quantile, SameOnAllWorkers) {
auto constexpr kWorkers = 4;
RunWithInMemoryCommunicator(kWorkers, TestSameOnAllWorkers);
collective::TestDistributedGlobal(kWorkers, [] { TestSameOnAllWorkers(); });
}
} // namespace xgboost::common

View File

@@ -1,12 +1,13 @@
/**
* Copyright 2020-2023, XGBoost contributors
* Copyright 2020-2024, XGBoost contributors
*/
#include <gtest/gtest.h>
#include "../../../src/collective/communicator-inl.cuh"
#include "../../../src/collective/allreduce.h"
#include "../../../src/common/hist_util.cuh"
#include "../../../src/common/quantile.cuh"
#include "../../../src/data/device_adapter.cuh" // CupyAdapter
#include "../collective/test_worker.h" // for BaseMGPUTest
#include "../helpers.h"
#include "test_quantile.h"
@@ -18,9 +19,9 @@ struct IsSorted {
}
};
}
namespace common {
class MGPUQuantileTest : public BaseMGPUTest {};
namespace common {
class MGPUQuantileTest : public collective::BaseMGPUTest {};
TEST(GPUQuantile, Basic) {
constexpr size_t kRows = 1000, kCols = 100, kBins = 256;
@@ -36,7 +37,8 @@ TEST(GPUQuantile, Basic) {
void TestSketchUnique(float sparsity) {
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](int32_t seed, size_t n_bins, MetaInfo const& info) {
RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](std::int32_t seed, bst_bin_t n_bins,
MetaInfo const& info) {
HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
@@ -121,7 +123,7 @@ void TestQuantileElemRank(DeviceOrd device, Span<SketchEntry const> in,
TEST(GPUQuantile, Prune) {
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) {
HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
@@ -190,7 +192,7 @@ TEST(GPUQuantile, MergeEmpty) {
TEST(GPUQuantile, MergeBasic) {
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) {
RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) {
HostDeviceVector<FeatureType> ft;
SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU());
HostDeviceVector<float> storage_0;
@@ -260,9 +262,9 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
using Tuple = thrust::tuple<size_t, float>;
auto it = thrust::make_zip_iterator(tuple_it);
thrust::transform(thrust::device, it, it + data_1.size(), data_1.data(),
[=] __device__(Tuple const &tuple) {
[=] XGBOOST_DEVICE(Tuple const& tuple) {
auto i = thrust::get<0>(tuple);
if (thrust::get<0>(tuple) % 2 == 0) {
if (i % 2 == 0) {
return 0.0f;
} else {
return thrust::get<1>(tuple);
@@ -306,7 +308,7 @@ TEST(GPUQuantile, MergeDuplicated) {
TEST(GPUQuantile, MultiMerge) {
constexpr size_t kRows = 20, kCols = 1;
int32_t world = 2;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) {
// Set up single node version
HostDeviceVector<FeatureType> ft;
SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, FstCU());
@@ -368,16 +370,18 @@ namespace {
void TestAllReduceBasic() {
auto const world = collective::GetWorldSize();
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) {
auto const device = DeviceOrd::CUDA(GPUIDX);
auto ctx = MakeCUDACtx(device.ordinal);
// Set up single node version;
/**
* Set up single node version.
*/
HostDeviceVector<FeatureType> ft({}, device);
SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, device);
size_t intermediate_num_cuts = std::min(
kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
size_t intermediate_num_cuts =
std::min(kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
std::vector<SketchContainer> containers;
for (auto rank = 0; rank < world; ++rank) {
HostDeviceVector<float> storage({}, device);
@@ -388,21 +392,22 @@ void TestAllReduceBasic() {
data::CupyAdapter adapter(interface_str);
HostDeviceVector<FeatureType> ft({}, device);
containers.emplace_back(ft, n_bins, kCols, kRows, device);
AdapterDeviceSketch(adapter.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(),
AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits<float>::quiet_NaN(),
&containers.back());
}
for (auto &sketch : containers) {
for (auto& sketch : containers) {
sketch.Prune(intermediate_num_cuts);
sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data());
sketch_on_single_node.FixError();
}
sketch_on_single_node.Unique();
TestQuantileElemRank(device, sketch_on_single_node.Data(),
sketch_on_single_node.ColumnsPtr(), true);
TestQuantileElemRank(device, sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr(),
true);
// Set up distributed version. We rely on using rank as seed to generate
// the exact same copy of data.
/**
* Set up distributed version. We rely on using rank as seed to generate
* the exact same copy of data.
*/
auto rank = collective::GetRank();
SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
HostDeviceVector<float> storage({}, device);
@@ -411,22 +416,23 @@ void TestAllReduceBasic() {
.Seed(rank + seed)
.GenerateArrayInterface(&storage);
data::CupyAdapter adapter(interface_str);
AdapterDeviceSketch(adapter.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(),
AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits<float>::quiet_NaN(),
&sketch_distributed);
if (world == 1) {
auto n_samples_global = kRows * world;
intermediate_num_cuts =
std::min(n_samples_global, static_cast<size_t>(n_bins * SketchContainer::kFactor));
sketch_distributed.Prune(intermediate_num_cuts);
}
sketch_distributed.AllReduce(&ctx, false);
sketch_distributed.Unique();
ASSERT_EQ(sketch_distributed.ColumnsPtr().size(),
sketch_on_single_node.ColumnsPtr().size());
ASSERT_EQ(sketch_distributed.Data().size(),
sketch_on_single_node.Data().size());
ASSERT_EQ(sketch_distributed.ColumnsPtr().size(), sketch_on_single_node.ColumnsPtr().size());
ASSERT_EQ(sketch_distributed.Data().size(), sketch_on_single_node.Data().size());
TestQuantileElemRank(device, sketch_distributed.Data(),
sketch_distributed.ColumnsPtr(), true);
TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);
std::vector<SketchEntry> single_node_data(
sketch_on_single_node.Data().size());
std::vector<SketchEntry> single_node_data(sketch_on_single_node.Data().size());
dh::CopyDeviceSpanToVector(&single_node_data, sketch_on_single_node.Data());
std::vector<SketchEntry> distributed_data(sketch_distributed.Data().size());
@@ -444,7 +450,8 @@ void TestAllReduceBasic() {
} // anonymous namespace
TEST_F(MGPUQuantileTest, AllReduceBasic) {
DoTest(TestAllReduceBasic);
this->DoTest([] { TestAllReduceBasic(); }, true);
this->DoTest([] { TestAllReduceBasic(); }, false);
}
namespace {
@@ -490,7 +497,8 @@ void TestColumnSplit(DMatrix* dmat) {
TEST_F(MGPUQuantileTest, ColumnSplitBasic) {
std::size_t constexpr kRows = 1000, kCols = 100;
auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
DoTest(TestColumnSplit, dmat.get());
this->DoTest([&] { TestColumnSplit(dmat.get()); }, true);
this->DoTest([&] { TestColumnSplit(dmat.get()); }, false);
}
TEST_F(MGPUQuantileTest, ColumnSplitCategorical) {
@@ -507,15 +515,15 @@ TEST_F(MGPUQuantileTest, ColumnSplitCategorical) {
.Type(ft)
.MaxCategory(13)
.GenerateDMatrix();
DoTest(TestColumnSplit, dmat.get());
this->DoTest([&] { TestColumnSplit(dmat.get()); }, true);
this->DoTest([&] { TestColumnSplit(dmat.get()); }, false);
}
namespace {
void TestSameOnAllWorkers() {
auto world = collective::GetWorldSize();
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
MetaInfo const &info) {
RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) {
auto const rank = collective::GetRank();
auto const device = DeviceOrd::CUDA(GPUIDX);
Context ctx = MakeCUDACtx(device.ordinal);
@@ -536,7 +544,8 @@ void TestSameOnAllWorkers() {
// Test for all workers having the same sketch.
size_t n_data = sketch_distributed.Data().size();
collective::Allreduce<collective::Operation::kMax>(&n_data, 1);
auto rc = collective::Allreduce(&ctx, linalg::MakeVec(&n_data, 1), collective::Op::kMax);
SafeColl(rc);
ASSERT_EQ(n_data, sketch_distributed.Data().size());
size_t size_as_float =
sketch_distributed.Data().size_bytes() / sizeof(float);
@@ -549,9 +558,10 @@ void TestSameOnAllWorkers() {
thrust::copy(thrust::device, local_data.data(),
local_data.data() + local_data.size(),
all_workers.begin() + local_data.size() * rank);
collective::AllReduce<collective::Operation::kSum>(device.ordinal, all_workers.data().get(),
all_workers.size());
collective::Synchronize(device.ordinal);
rc = collective::Allreduce(
&ctx, linalg::MakeVec(all_workers.data().get(), all_workers.size(), ctx.Device()),
collective::Op::kSum);
SafeColl(rc);
auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float);
std::vector<float> h_base_line(base_line.size());
@@ -573,7 +583,8 @@ void TestSameOnAllWorkers() {
} // anonymous namespace
TEST_F(MGPUQuantileTest, SameOnAllWorkers) {
DoTest(TestSameOnAllWorkers);
this->DoTest([] { TestSameOnAllWorkers(); }, true);
this->DoTest([] { TestSameOnAllWorkers(); }, false);
}
TEST(GPUQuantile, Push) {

View File

@@ -1,21 +1,22 @@
/**
* Copyright 2020-2024, XGBoost Contributors
*/
#ifndef XGBOOST_TESTS_CPP_COMMON_TEST_QUANTILE_H_
#define XGBOOST_TESTS_CPP_COMMON_TEST_QUANTILE_H_
#include <algorithm>
#include <string>
#include <vector>
#include "../helpers.h"
namespace xgboost {
namespace common {
namespace xgboost::common {
template <typename Fn> void RunWithSeedsAndBins(size_t rows, Fn fn) {
std::vector<int32_t> seeds(2);
SimpleLCG lcg;
SimpleRealUniformDistribution<float> dist(3, 1000);
std::generate(seeds.begin(), seeds.end(), [&](){ return dist(&lcg); });
std::vector<size_t> bins(2);
std::vector<bst_bin_t> bins(2);
for (size_t i = 0; i < bins.size() - 1; ++i) {
bins[i] = i * 35 + 2;
}
@@ -36,7 +37,6 @@ template <typename Fn> void RunWithSeedsAndBins(size_t rows, Fn fn) {
}
}
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common
#endif // XGBOOST_TESTS_CPP_COMMON_TEST_QUANTILE_H_