Revamp the rabit implementation. (#10112)

This PR replaces the original RABIT implementation with a new one, which has already been partially merged into XGBoost. The new one features:
- Federated learning for both CPU and GPU.
- NCCL support.
- More data types.
- A unified interface for all the underlying implementations.
- Improved timeout handling for both tracker and workers.
- Exhaustive tests with metrics (fixed a couple of bugs along the way).
- A reusable tracker for Python and JVM packages.
This commit is contained in:
Jiaming Yuan
2024-05-20 11:56:23 +08:00
committed by GitHub
parent ba9b4cb1ee
commit a5a58102e5
195 changed files with 2768 additions and 9234 deletions

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2017-2023 by XGBoost Contributors
* Copyright 2017-2024, XGBoost Contributors
*/
#include <algorithm> // for max, fill, min
#include <any> // for any, any_cast
@@ -12,7 +12,7 @@
#include <vector> // for vector
#include "../collective/communicator-inl.h" // for Allreduce, IsDistributed
#include "../collective/communicator.h" // for Operation
#include "../collective/allreduce.h"
#include "../common/bitfield.h" // for RBitField8
#include "../common/categorical.h" // for IsCat, Decision
#include "../common/common.h" // for DivRoundUp
@@ -461,11 +461,17 @@ class ColumnSplitHelper {
return tree_offsets_[tree_index] * n_rows_ + row_id * tree_sizes_[tree_index] + node_id;
}
// Runs two collective Allreduce operations over the helper's bit storage: bitwise OR over
// decision_storage_ and bitwise AND over missing_storage_.
// NOTE(review): this listing is a diff rendered without +/- markers, so two versions of the
// function are interleaved below. The first signature and its templated calls look like the
// pre-change code and the second signature like its replacement -- confirm against the commit.
void AllreduceBitVectors(Context const*) {
collective::Allreduce<collective::Operation::kBitwiseOR>(decision_storage_.data(),
decision_storage_.size());
collective::Allreduce<collective::Operation::kBitwiseAND>(missing_storage_.data(),
missing_storage_.size());
// This version names the context parameter and passes it to each Allreduce call.
void AllreduceBitVectors(Context const *ctx) {
// Both reductions are chained onto an initial Success() value with operator<<.
// NOTE(review): presumably a later step is skipped once an earlier one fails -- verify
// against the collective result type.
auto rc = collective::Success() << [&] {
// Bitwise OR over the decision bits, wrapped in a vector view of the storage.
return collective::Allreduce(
ctx, linalg::MakeVec(decision_storage_.data(), decision_storage_.size()),
collective::Op::kBitwiseOR);
} << [&] {
// Bitwise AND over the missing bits, wrapped the same way.
return collective::Allreduce(
ctx, linalg::MakeVec(missing_storage_.data(), missing_storage_.size()),
collective::Op::kBitwiseAND);
};
// Hand the chained result to SafeColl.
// NOTE(review): presumably this raises or aborts on failure -- confirm.
collective::SafeColl(rc);
}
void MaskOneTree(RegTree::FVec const &feat, std::size_t tree_id, std::size_t row_id) {

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2017-2023 by XGBoost Contributors
* Copyright 2017-2024, XGBoost Contributors
*/
#include <GPUTreeShap/gpu_treeshap.h>
#include <thrust/copy.h>
@@ -11,7 +11,7 @@
#include <any> // for any, any_cast
#include <memory>
#include "../collective/communicator-inl.cuh"
#include "../collective/allreduce.h"
#include "../common/bitfield.h"
#include "../common/categorical.h"
#include "../common/common.h"
@@ -817,10 +817,18 @@ class ColumnSplitHelper {
// Runs two collective all-reduce operations over the caller-supplied bit vectors: bitwise OR
// over *decision_storage and bitwise AND over *missing_storage.
// NOTE(review): this listing is a diff rendered without +/- markers, so the body below contains
// both the templated AllReduce calls (apparently the pre-change code) and the chained
// Allreduce calls (apparently the replacement) -- confirm against the commit.
void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage,
dh::caching_device_vector<BitType>* missing_storage) const {
// Templated form: takes the context ordinal, a raw pointer into the vector, and the size.
collective::AllReduce<collective::Operation::kBitwiseOR>(
ctx_->Ordinal(), decision_storage->data().get(), decision_storage->size());
collective::AllReduce<collective::Operation::kBitwiseAND>(
ctx_->Ordinal(), missing_storage->data().get(), missing_storage->size());
// Chained form: both reductions are appended to an initial Success() value with operator<<.
// NOTE(review): presumably a later step is skipped once an earlier one fails -- verify
// against the collective result type.
auto rc = collective::Success() << [&] {
// Bitwise OR over the decision bits; the vector view is created with the context's device.
return collective::Allreduce(
ctx_,
linalg::MakeVec(decision_storage->data().get(), decision_storage->size(), ctx_->Device()),
collective::Op::kBitwiseOR);
} << [&] {
// Bitwise AND over the missing bits, using the same device-tagged view.
return collective::Allreduce(
ctx_,
linalg::MakeVec(missing_storage->data().get(), missing_storage->size(), ctx_->Device()),
collective::Op::kBitwiseAND);
};
// Hand the chained result to SafeColl.
// NOTE(review): presumably this raises or aborts on failure -- confirm.
collective::SafeColl(rc);
}
void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage,