Revamp the rabit implementation. (#10112)

This PR replaces the original RABIT implementation with a new one, which has already been partially merged into XGBoost. The new one features:
- Federated learning for both CPU and GPU.
- NCCL support for multi-GPU collective communication.
- More data types.
- A unified interface for all the underlying implementations.
- Improved timeout handling for both tracker and workers.
- Exhaustive tests with metrics (fixed a couple of bugs along the way).
- A reusable tracker for Python and JVM packages.
This commit is contained in:
Jiaming Yuan
2024-05-20 11:56:23 +08:00
committed by GitHub
parent ba9b4cb1ee
commit a5a58102e5
195 changed files with 2768 additions and 9234 deletions

View File

@@ -1,12 +1,12 @@
/**
* Copyright 2020-2023, XGBoost contributors
* Copyright 2020-2024, XGBoost contributors
*/
#include <gtest/gtest.h>
#include <thrust/host_vector.h>
#include "../../../../src/tree/gpu_hist/evaluate_splits.cuh"
#include "../../collective/test_worker.h" // for BaseMGPUTest
#include "../../helpers.h"
#include "../../histogram_helpers.h"
#include "../test_evaluate_splits.h" // TestPartitionBasedSplit
namespace xgboost::tree {
@@ -17,13 +17,13 @@ auto ZeroParam() {
tparam.UpdateAllowUnknown(args);
return tparam;
}
} // anonymous namespace
inline GradientQuantiser DummyRoundingFactor(Context const* ctx) {
GradientQuantiser DummyRoundingFactor(Context const* ctx) {
thrust::device_vector<GradientPair> gpair(1);
gpair[0] = {1000.f, 1000.f}; // Tests should not exceed sum of 1000
return {ctx, dh::ToSpan(gpair), MetaInfo()};
}
} // anonymous namespace
thrust::device_vector<GradientPairInt64> ConvertToInteger(Context const* ctx,
std::vector<GradientPairPrecise> x) {
@@ -546,7 +546,7 @@ TEST_F(TestPartitionBasedSplit, GpuHist) {
ASSERT_NEAR(split.loss_chg, best_score_, 1e-2);
}
class MGPUHistTest : public BaseMGPUTest {};
class MGPUHistTest : public collective::BaseMGPUTest {};
namespace {
void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
@@ -589,21 +589,29 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, ctx.Device());
DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
EXPECT_EQ(result.findex, 1) << "rank: " << rank;
EXPECT_EQ(result.findex, 1);
if (is_categorical) {
ASSERT_TRUE(std::isnan(result.fvalue));
} else {
EXPECT_EQ(result.fvalue, 11.0) << "rank: " << rank;
EXPECT_EQ(result.fvalue, 11.0);
}
EXPECT_EQ(result.left_sum + result.right_sum, parent_sum) << "rank: " << rank;
EXPECT_EQ(result.left_sum + result.right_sum, parent_sum);
}
} // anonymous namespace
TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
DoTest(VerifyColumnSplitEvaluateSingleSplit, false);
if (common::AllVisibleGPUs() > 1) {
// We can't emulate multiple GPUs with NCCL.
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(false); }, false, true);
}
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(false); }, true, true);
}
TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleCategoricalSplit) {
DoTest(VerifyColumnSplitEvaluateSingleSplit, true);
if (common::AllVisibleGPUs() > 1) {
// We can't emulate multiple GPUs with NCCL.
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(true); }, false, true);
}
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(true); }, true, true);
}
} // namespace xgboost::tree