Revamp the rabit implementation. (#10112)
This PR replaces the original RABIT implementation with a new one, which has already been partially merged into XGBoost. The new implementation features: - Federated learning for both CPU and GPU. - NCCL support. - More data types. - A unified interface for all the underlying implementations. - Improved timeout handling for both the tracker and the workers. - Exhaustive tests with metrics (a couple of bugs were fixed along the way). - A reusable tracker for the Python and JVM packages.
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2017-2023 by XGBoost Contributors
|
||||
* Copyright 2017-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <algorithm> // for max, fill, min
|
||||
#include <any> // for any, any_cast
|
||||
@@ -12,7 +12,7 @@
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../collective/communicator-inl.h" // for Allreduce, IsDistributed
|
||||
#include "../collective/communicator.h" // for Operation
|
||||
#include "../collective/allreduce.h"
|
||||
#include "../common/bitfield.h" // for RBitField8
|
||||
#include "../common/categorical.h" // for IsCat, Decision
|
||||
#include "../common/common.h" // for DivRoundUp
|
||||
@@ -461,11 +461,17 @@ class ColumnSplitHelper {
|
||||
return tree_offsets_[tree_index] * n_rows_ + row_id * tree_sizes_[tree_index] + node_id;
|
||||
}
|
||||
|
||||
void AllreduceBitVectors(Context const*) {
|
||||
collective::Allreduce<collective::Operation::kBitwiseOR>(decision_storage_.data(),
|
||||
decision_storage_.size());
|
||||
collective::Allreduce<collective::Operation::kBitwiseAND>(missing_storage_.data(),
|
||||
missing_storage_.size());
|
||||
void AllreduceBitVectors(Context const *ctx) {
|
||||
auto rc = collective::Success() << [&] {
|
||||
return collective::Allreduce(
|
||||
ctx, linalg::MakeVec(decision_storage_.data(), decision_storage_.size()),
|
||||
collective::Op::kBitwiseOR);
|
||||
} << [&] {
|
||||
return collective::Allreduce(
|
||||
ctx, linalg::MakeVec(missing_storage_.data(), missing_storage_.size()),
|
||||
collective::Op::kBitwiseAND);
|
||||
};
|
||||
collective::SafeColl(rc);
|
||||
}
|
||||
|
||||
void MaskOneTree(RegTree::FVec const &feat, std::size_t tree_id, std::size_t row_id) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2017-2023 by XGBoost Contributors
|
||||
* Copyright 2017-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <GPUTreeShap/gpu_treeshap.h>
|
||||
#include <thrust/copy.h>
|
||||
@@ -11,7 +11,7 @@
|
||||
#include <any> // for any, any_cast
|
||||
#include <memory>
|
||||
|
||||
#include "../collective/communicator-inl.cuh"
|
||||
#include "../collective/allreduce.h"
|
||||
#include "../common/bitfield.h"
|
||||
#include "../common/categorical.h"
|
||||
#include "../common/common.h"
|
||||
@@ -817,10 +817,18 @@ class ColumnSplitHelper {
|
||||
|
||||
void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage,
|
||||
dh::caching_device_vector<BitType>* missing_storage) const {
|
||||
collective::AllReduce<collective::Operation::kBitwiseOR>(
|
||||
ctx_->Ordinal(), decision_storage->data().get(), decision_storage->size());
|
||||
collective::AllReduce<collective::Operation::kBitwiseAND>(
|
||||
ctx_->Ordinal(), missing_storage->data().get(), missing_storage->size());
|
||||
auto rc = collective::Success() << [&] {
|
||||
return collective::Allreduce(
|
||||
ctx_,
|
||||
linalg::MakeVec(decision_storage->data().get(), decision_storage->size(), ctx_->Device()),
|
||||
collective::Op::kBitwiseOR);
|
||||
} << [&] {
|
||||
return collective::Allreduce(
|
||||
ctx_,
|
||||
linalg::MakeVec(missing_storage->data().get(), missing_storage->size(), ctx_->Device()),
|
||||
collective::Op::kBitwiseAND);
|
||||
};
|
||||
collective::SafeColl(rc);
|
||||
}
|
||||
|
||||
void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage,
|
||||
|
||||
Reference in New Issue
Block a user