Revamp the rabit implementation. (#10112)

This PR replaces the original RABIT implementation with a new one, which has already been partially merged into XGBoost. The new one features:
- Federated learning for both CPU and GPU.
- NCCL.
- More data types.
- A unified interface for all the underlying implementations.
- Improved timeout handling for both tracker and workers.
- Exhaustive tests with metrics (fixed a couple of bugs along the way).
- A reusable tracker for Python and JVM packages.
This commit is contained in:
Jiaming Yuan
2024-05-20 11:56:23 +08:00
committed by GitHub
parent ba9b4cb1ee
commit a5a58102e5
195 changed files with 2768 additions and 9234 deletions

View File

@@ -12,6 +12,7 @@
#include "../../../src/data/proxy_dmatrix.h"
#include "../../../src/gbm/gbtree.h"
#include "../../../src/gbm/gbtree_model.h"
#include "../collective/test_worker.h" // for TestDistributedGlobal
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h"
#include "test_predictor.h"
@@ -43,7 +44,7 @@ void TestColumnSplit() {
TEST(CpuPredictor, BasicColumnSplit) {
auto constexpr kWorldSize = 2;
RunWithInMemoryCommunicator(kWorldSize, TestColumnSplit);
collective::TestDistributedGlobal(kWorldSize, TestColumnSplit);
}
TEST(CpuPredictor, IterationRange) {
@@ -157,7 +158,7 @@ TEST(CPUPredictor, CategoricalPrediction) {
TEST(CPUPredictor, CategoricalPredictionColumnSplit) {
auto constexpr kWorldSize = 2;
RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPrediction, false, true);
collective::TestDistributedGlobal(kWorldSize, [] { TestCategoricalPrediction(false, true); });
}
TEST(CPUPredictor, CategoricalPredictLeaf) {
@@ -168,7 +169,7 @@ TEST(CPUPredictor, CategoricalPredictLeaf) {
TEST(CPUPredictor, CategoricalPredictLeafColumnSplit) {
auto constexpr kWorldSize = 2;
Context ctx;
RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPredictLeaf, &ctx, true);
collective::TestDistributedGlobal(kWorldSize, [&] { TestCategoricalPredictLeaf(&ctx, true); });
}
TEST(CpuPredictor, UpdatePredictionCache) {
@@ -183,7 +184,8 @@ TEST(CpuPredictor, LesserFeatures) {
TEST(CpuPredictor, LesserFeaturesColumnSplit) {
auto constexpr kWorldSize = 2;
RunWithInMemoryCommunicator(kWorldSize, TestPredictionWithLesserFeaturesColumnSplit, false);
collective::TestDistributedGlobal(kWorldSize,
[] { TestPredictionWithLesserFeaturesColumnSplit(false); });
}
TEST(CpuPredictor, Sparse) {

View File

@@ -12,6 +12,7 @@
#include "../../../src/data/device_adapter.cuh"
#include "../../../src/data/proxy_dmatrix.h"
#include "../../../src/gbm/gbtree_model.h"
#include "../collective/test_worker.h" // for TestDistributedGlobal, BaseMGPUTest
#include "../helpers.h"
#include "test_predictor.h"
@@ -85,7 +86,7 @@ void VerifyBasicColumnSplit(std::array<std::vector<float>, 32> const& expected_r
}
} // anonymous namespace
class MGPUPredictorTest : public BaseMGPUTest {};
class MGPUPredictorTest : public collective::BaseMGPUTest {};
TEST_F(MGPUPredictorTest, BasicColumnSplit) {
auto ctx = MakeCUDACtx(0);
@@ -111,7 +112,8 @@ TEST_F(MGPUPredictorTest, BasicColumnSplit) {
result[i - 1] = out_predictions_h;
}
DoTest(VerifyBasicColumnSplit, result);
this->DoTest([&] { VerifyBasicColumnSplit(result); }, true);
this->DoTest([&] { VerifyBasicColumnSplit(result); }, false);
}
TEST(GPUPredictor, EllpackBasic) {
@@ -209,7 +211,8 @@ TEST(GpuPredictor, LesserFeatures) {
}
TEST_F(MGPUPredictorTest, LesserFeaturesColumnSplit) {
RunWithInMemoryCommunicator(world_size_, TestPredictionWithLesserFeaturesColumnSplit, true);
this->DoTest([] { TestPredictionWithLesserFeaturesColumnSplit(true); }, true);
this->DoTest([] { TestPredictionWithLesserFeaturesColumnSplit(true); }, false);
}
// Very basic test of empty model
@@ -277,7 +280,7 @@ TEST(GPUPredictor, IterationRange) {
}
TEST_F(MGPUPredictorTest, IterationRangeColumnSplit) {
TestIterationRangeColumnSplit(world_size_, true);
TestIterationRangeColumnSplit(common::AllVisibleGPUs(), true);
}
TEST(GPUPredictor, CategoricalPrediction) {
@@ -285,7 +288,8 @@ TEST(GPUPredictor, CategoricalPrediction) {
}
TEST_F(MGPUPredictorTest, CategoricalPredictionColumnSplit) {
RunWithInMemoryCommunicator(world_size_, TestCategoricalPrediction, true, true);
this->DoTest([] { TestCategoricalPrediction(true, true); }, true);
this->DoTest([] { TestCategoricalPrediction(true, true); }, false);
}
TEST(GPUPredictor, CategoricalPredictLeaf) {
@@ -294,8 +298,18 @@ TEST(GPUPredictor, CategoricalPredictLeaf) {
}
TEST_F(MGPUPredictorTest, CategoricalPredictionLeafColumnSplit) {
auto ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
RunWithInMemoryCommunicator(world_size_, TestCategoricalPredictLeaf, &ctx, true);
this->DoTest(
[&] {
auto ctx = MakeCUDACtx(collective::GetRank());
TestCategoricalPredictLeaf(&ctx, true);
},
true);
this->DoTest(
[&] {
auto ctx = MakeCUDACtx(collective::GetRank());
TestCategoricalPredictLeaf(&ctx, true);
},
false);
}
TEST(GPUPredictor, PredictLeafBasic) {
@@ -325,7 +339,7 @@ TEST(GPUPredictor, Sparse) {
}
TEST_F(MGPUPredictorTest, SparseColumnSplit) {
TestSparsePredictionColumnSplit(world_size_, true, 0.2);
TestSparsePredictionColumnSplit(world_size_, true, 0.8);
TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.2);
TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.8);
}
} // namespace xgboost::predictor

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020-2023 by XGBoost Contributors
* Copyright 2020-2024, XGBoost Contributors
*/
#include "test_predictor.h"
@@ -10,7 +10,6 @@
#include <xgboost/predictor.h> // for PredictionCacheEntry, Predictor, Predic...
#include <xgboost/string_view.h> // for StringView
#include <algorithm> // for max
#include <limits> // for numeric_limits
#include <memory> // for shared_ptr
#include <unordered_map> // for unordered_map
@@ -18,6 +17,7 @@
#include "../../../src/common/bitfield.h" // for LBitField32
#include "../../../src/data/iterative_dmatrix.h" // for IterativeDMatrix
#include "../../../src/data/proxy_dmatrix.h" // for DMatrixProxy
#include "../collective/test_worker.h" // for TestDistributedGlobal
#include "../helpers.h" // for GetDMatrixFromData, RandomDataGenerator
#include "xgboost/json.h" // for Json, Object, get, String
#include "xgboost/linalg.h" // for MakeVec, Tensor, TensorView, Vector
@@ -593,9 +593,23 @@ void TestIterationRangeColumnSplit(int world_size, bool use_gpu) {
Json sliced_model{Object{}};
sliced->SaveModel(&sliced_model);
RunWithInMemoryCommunicator(world_size, VerifyIterationRangeColumnSplit, use_gpu, ranged_model,
sliced_model, kRows, kCols, kClasses, margin_ranged, margin_sliced,
leaf_ranged, leaf_sliced);
#if !defined(XGBOOST_USE_NCCL)
if (use_gpu) {
GTEST_SKIP_("Not compiled with NCCL");
return;
}
#endif // !defined(XGBOOST_USE_NCCL)
collective::TestDistributedGlobal(world_size, [&] {
VerifyIterationRangeColumnSplit(use_gpu, ranged_model, sliced_model, kRows, kCols, kClasses,
margin_ranged, margin_sliced, leaf_ranged, leaf_sliced);
});
#if defined(XGBOOST_USE_FEDERATED)
collective::TestFederatedGlobal(world_size, [&] {
VerifyIterationRangeColumnSplit(use_gpu, ranged_model, sliced_model, kRows, kCols, kClasses,
margin_ranged, margin_sliced, leaf_ranged, leaf_sliced);
});
#endif // defined(XGBOOST_USE_FEDERATED)
}
void TestSparsePrediction(Context const *ctx, float sparsity) {
@@ -701,8 +715,23 @@ void TestSparsePredictionColumnSplit(int world_size, bool use_gpu, float sparsit
learner->SetParam("device", ctx.DeviceName());
learner->Predict(Xy, false, &sparse_predt, 0, 0);
RunWithInMemoryCommunicator(world_size, VerifySparsePredictionColumnSplit, use_gpu, model,
kRows, kCols, sparsity, sparse_predt.HostVector());
#if !defined(XGBOOST_USE_NCCL)
if (use_gpu) {
GTEST_SKIP_("Not compiled with NCCL.");
return;
}
#endif // !defined(XGBOOST_USE_NCCL)
collective::TestDistributedGlobal(world_size, [&] {
VerifySparsePredictionColumnSplit(use_gpu, model, kRows, kCols, sparsity,
sparse_predt.HostVector());
});
#if defined(XGBOOST_USE_FEDERATED)
collective::TestFederatedGlobal(world_size, [&] {
VerifySparsePredictionColumnSplit(use_gpu, model, kRows, kCols, sparsity,
sparse_predt.HostVector());
});
#endif // defined(XGBOOST_USE_FEDERATED)
}
void TestVectorLeafPrediction(Context const *ctx) {