[Breaking] Switch from rabit to the collective communicator (#8257)
* Switch from rabit to the collective communicator
* fix size_t specialization
* really fix size_t
* try again
* add include
* more include
* fix lint errors
* remove rabit includes
* fix pylint error
* return dict from communicator context
* fix communicator shutdown
* fix dask test
* reset communicator mocklist
* fix distributed tests
* do not save device communicator
* fix jvm gpu tests
* add python test for federated communicator
* Update gputreeshap submodule

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
This commit is contained in:
@@ -84,11 +84,11 @@ GradientQuantizer::GradientQuantizer(common::Span<GradientPair const> gpair) {
|
||||
// Treat pair as array of 4 primitive types to allreduce
|
||||
using ReduceT = typename decltype(p.first)::ValueT;
|
||||
static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements.");
|
||||
rabit::Allreduce<rabit::op::Sum, ReduceT>(reinterpret_cast<ReduceT*>(&p), 4);
|
||||
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<ReduceT*>(&p), 4);
|
||||
GradientPair positive_sum{p.first}, negative_sum{p.second};
|
||||
|
||||
std::size_t total_rows = gpair.size();
|
||||
rabit::Allreduce<rabit::op::Sum>(&total_rows, 1);
|
||||
collective::Allreduce<collective::Operation::kSum>(&total_rows, 1);
|
||||
|
||||
auto histogram_rounding = GradientSumT{
|
||||
CreateRoundingFactor<T>(std::max(positive_sum.GetGrad(), negative_sum.GetGrad()), total_rows),
|
||||
|
||||
@@ -8,10 +8,10 @@
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
#include "../../collective/communicator-inl.h"
|
||||
#include "../../common/hist_util.h"
|
||||
#include "../../data/gradient_index.h"
|
||||
#include "expand_entry.h"
|
||||
#include "rabit/rabit.h"
|
||||
#include "xgboost/tree_model.h"
|
||||
|
||||
namespace xgboost {
|
||||
@@ -202,8 +202,9 @@ class HistogramBuilder {
|
||||
}
|
||||
});
|
||||
|
||||
rabit::Allreduce<rabit::op::Sum>(reinterpret_cast<double*>(this->hist_[starting_index].data()),
|
||||
builder_.GetNumBins() * sync_count * 2);
|
||||
collective::Allreduce<collective::Operation::kSum>(
|
||||
reinterpret_cast<double *>(this->hist_[starting_index].data()),
|
||||
builder_.GetNumBins() * sync_count * 2);
|
||||
|
||||
ParallelSubtractionHist(space, nodes_for_explicit_hist_build,
|
||||
nodes_for_subtraction_trick, p_tree);
|
||||
|
||||
@@ -74,7 +74,7 @@ class GloablApproxBuilder {
|
||||
}
|
||||
|
||||
histogram_builder_.Reset(n_total_bins, BatchSpec(param_, hess), ctx_->Threads(), n_batches_,
|
||||
rabit::IsDistributed());
|
||||
collective::IsDistributed());
|
||||
monitor_->Stop(__func__);
|
||||
}
|
||||
|
||||
@@ -88,7 +88,7 @@ class GloablApproxBuilder {
|
||||
for (auto const &g : gpair) {
|
||||
root_sum.Add(g);
|
||||
}
|
||||
rabit::Allreduce<rabit::op::Sum, double>(reinterpret_cast<double *>(&root_sum), 2);
|
||||
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&root_sum), 2);
|
||||
std::vector<CPUExpandEntry> nodes{best};
|
||||
size_t i = 0;
|
||||
auto space = ConstructHistSpace(partitioner_, nodes);
|
||||
|
||||
@@ -4,8 +4,6 @@
|
||||
* \brief use columnwise update to construct a tree
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <rabit/rabit.h>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
@@ -100,7 +98,7 @@ class ColMaker: public TreeUpdater {
|
||||
void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
|
||||
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
|
||||
const std::vector<RegTree *> &trees) override {
|
||||
if (rabit::IsDistributed()) {
|
||||
if (collective::IsDistributed()) {
|
||||
LOG(FATAL) << "Updater `grow_colmaker` or `exact` tree method doesn't "
|
||||
"support distributed training.";
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include "xgboost/span.h"
|
||||
#include "xgboost/json.h"
|
||||
|
||||
#include "../collective/device_communicator.cuh"
|
||||
#include "../common/io.h"
|
||||
#include "../common/device_helpers.cuh"
|
||||
#include "../common/hist_util.h"
|
||||
@@ -528,13 +529,12 @@ struct GPUHistMakerDevice {
|
||||
}
|
||||
|
||||
// num histograms is the number of contiguous histograms in memory to reduce over
|
||||
void AllReduceHist(int nidx, dh::AllReducer* reducer, int num_histograms) {
|
||||
void AllReduceHist(int nidx, collective::DeviceCommunicator* communicator, int num_histograms) {
|
||||
monitor.Start("AllReduce");
|
||||
auto d_node_hist = hist.GetNodeHistogram(nidx).data();
|
||||
using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT;
|
||||
reducer->AllReduceSum(reinterpret_cast<ReduceT*>(d_node_hist),
|
||||
reinterpret_cast<ReduceT*>(d_node_hist),
|
||||
page->Cuts().TotalBins() * 2 * num_histograms);
|
||||
communicator->AllReduceSum(reinterpret_cast<ReduceT*>(d_node_hist),
|
||||
page->Cuts().TotalBins() * 2 * num_histograms);
|
||||
|
||||
monitor.Stop("AllReduce");
|
||||
}
|
||||
@@ -542,8 +542,8 @@ struct GPUHistMakerDevice {
|
||||
/**
|
||||
* \brief Build GPU local histograms for the left and right child of some parent node
|
||||
*/
|
||||
void BuildHistLeftRight(std::vector<GPUExpandEntry> const& candidates, dh::AllReducer* reducer,
|
||||
const RegTree& tree) {
|
||||
void BuildHistLeftRight(std::vector<GPUExpandEntry> const& candidates,
|
||||
collective::DeviceCommunicator* communicator, const RegTree& tree) {
|
||||
if (candidates.empty()) return;
|
||||
// Some nodes we will manually compute histograms
|
||||
// others we will do by subtraction
|
||||
@@ -574,7 +574,7 @@ struct GPUHistMakerDevice {
|
||||
// Reduce all in one go
|
||||
// This gives much better latency in a distributed setting
|
||||
// when processing a large batch
|
||||
this->AllReduceHist(hist_nidx.at(0), reducer, hist_nidx.size());
|
||||
this->AllReduceHist(hist_nidx.at(0), communicator, hist_nidx.size());
|
||||
|
||||
for (size_t i = 0; i < subtraction_nidx.size(); i++) {
|
||||
auto build_hist_nidx = hist_nidx.at(i);
|
||||
@@ -584,7 +584,7 @@ struct GPUHistMakerDevice {
|
||||
if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) {
|
||||
// Calculate other histogram manually
|
||||
this->BuildHist(subtraction_trick_nidx);
|
||||
this->AllReduceHist(subtraction_trick_nidx, reducer, 1);
|
||||
this->AllReduceHist(subtraction_trick_nidx, communicator, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -593,7 +593,7 @@ struct GPUHistMakerDevice {
|
||||
RegTree& tree = *p_tree;
|
||||
|
||||
// Sanity check - have we created a leaf with no training instances?
|
||||
if (!rabit::IsDistributed() && row_partitioner) {
|
||||
if (!collective::IsDistributed() && row_partitioner) {
|
||||
CHECK(row_partitioner->GetRows(candidate.nid).size() > 0)
|
||||
<< "No training instances in this leaf!";
|
||||
}
|
||||
@@ -642,7 +642,7 @@ struct GPUHistMakerDevice {
|
||||
parent.RightChild());
|
||||
}
|
||||
|
||||
GPUExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
|
||||
GPUExpandEntry InitRoot(RegTree* p_tree, collective::DeviceCommunicator* communicator) {
|
||||
constexpr bst_node_t kRootNIdx = 0;
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
auto gpair_it = dh::MakeTransformIterator<GradientPairPrecise>(
|
||||
@@ -650,11 +650,11 @@ struct GPUHistMakerDevice {
|
||||
GradientPairPrecise root_sum =
|
||||
dh::Reduce(thrust::cuda::par(alloc), gpair_it, gpair_it + gpair.size(),
|
||||
GradientPairPrecise{}, thrust::plus<GradientPairPrecise>{});
|
||||
rabit::Allreduce<rabit::op::Sum, double>(reinterpret_cast<double*>(&root_sum), 2);
|
||||
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double*>(&root_sum), 2);
|
||||
|
||||
hist.AllocateHistograms({kRootNIdx});
|
||||
this->BuildHist(kRootNIdx);
|
||||
this->AllReduceHist(kRootNIdx, reducer, 1);
|
||||
this->AllReduceHist(kRootNIdx, communicator, 1);
|
||||
|
||||
// Remember root stats
|
||||
node_sum_gradients[kRootNIdx] = root_sum;
|
||||
@@ -669,7 +669,7 @@ struct GPUHistMakerDevice {
|
||||
}
|
||||
|
||||
void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat, ObjInfo task,
|
||||
RegTree* p_tree, dh::AllReducer* reducer,
|
||||
RegTree* p_tree, collective::DeviceCommunicator* communicator,
|
||||
HostDeviceVector<bst_node_t>* p_out_position) {
|
||||
auto& tree = *p_tree;
|
||||
// Process maximum 32 nodes at a time
|
||||
@@ -680,7 +680,7 @@ struct GPUHistMakerDevice {
|
||||
monitor.Stop("Reset");
|
||||
|
||||
monitor.Start("InitRoot");
|
||||
driver.Push({ this->InitRoot(p_tree, reducer) });
|
||||
driver.Push({ this->InitRoot(p_tree, communicator) });
|
||||
monitor.Stop("InitRoot");
|
||||
|
||||
// The set of leaves that can be expanded asynchronously
|
||||
@@ -707,7 +707,7 @@ struct GPUHistMakerDevice {
|
||||
monitor.Stop("UpdatePosition");
|
||||
|
||||
monitor.Start("BuildHist");
|
||||
this->BuildHistLeftRight(filtered_expand_set, reducer, tree);
|
||||
this->BuildHistLeftRight(filtered_expand_set, communicator, tree);
|
||||
monitor.Stop("BuildHist");
|
||||
|
||||
monitor.Start("EvaluateSplits");
|
||||
@@ -789,11 +789,10 @@ class GPUHistMaker : public TreeUpdater {
|
||||
void InitDataOnce(DMatrix* dmat) {
|
||||
CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device";
|
||||
info_ = &dmat->Info();
|
||||
reducer_.Init({ctx_->gpu_id}); // NOLINT
|
||||
|
||||
// Synchronise the column sampling seed
|
||||
uint32_t column_sampling_seed = common::GlobalRandom()();
|
||||
rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
|
||||
collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
|
||||
|
||||
BatchParam batch_param{
|
||||
ctx_->gpu_id,
|
||||
@@ -823,12 +822,12 @@ class GPUHistMaker : public TreeUpdater {
|
||||
void CheckTreesSynchronized(RegTree* local_tree) const {
|
||||
std::string s_model;
|
||||
common::MemoryBufferStream fs(&s_model);
|
||||
int rank = rabit::GetRank();
|
||||
int rank = collective::GetRank();
|
||||
if (rank == 0) {
|
||||
local_tree->Save(&fs);
|
||||
}
|
||||
fs.Seek(0);
|
||||
rabit::Broadcast(&s_model, 0);
|
||||
collective::Broadcast(&s_model, 0);
|
||||
RegTree reference_tree{}; // rank 0 tree
|
||||
reference_tree.Load(&fs);
|
||||
CHECK(*local_tree == reference_tree);
|
||||
@@ -841,7 +840,8 @@ class GPUHistMaker : public TreeUpdater {
|
||||
monitor_.Stop("InitData");
|
||||
|
||||
gpair->SetDevice(ctx_->gpu_id);
|
||||
maker->UpdateTree(gpair, p_fmat, task_, p_tree, &reducer_, p_out_position);
|
||||
auto* communicator = collective::Communicator::GetDevice(ctx_->gpu_id);
|
||||
maker->UpdateTree(gpair, p_fmat, task_, p_tree, communicator, p_out_position);
|
||||
}
|
||||
|
||||
bool UpdatePredictionCache(const DMatrix* data,
|
||||
@@ -867,8 +867,6 @@ class GPUHistMaker : public TreeUpdater {
|
||||
|
||||
GPUHistMakerTrainParam hist_maker_param_;
|
||||
|
||||
dh::AllReducer reducer_;
|
||||
|
||||
DMatrix* p_last_fmat_{nullptr};
|
||||
RegTree const* p_last_tree_{nullptr};
|
||||
ObjInfo task_;
|
||||
|
||||
@@ -4,16 +4,13 @@
|
||||
* \brief prune a tree given the statistics
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <rabit/rabit.h>
|
||||
#include <xgboost/tree_updater.h>
|
||||
|
||||
#include <string>
|
||||
#include <memory>
|
||||
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/json.h"
|
||||
#include "./param.h"
|
||||
#include "../common/io.h"
|
||||
#include "../common/timer.h"
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
|
||||
@@ -6,19 +6,12 @@
|
||||
*/
|
||||
#include "./updater_quantile_hist.h"
|
||||
|
||||
#include <rabit/rabit.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "../common/column_matrix.h"
|
||||
#include "../common/hist_util.h"
|
||||
#include "../common/random.h"
|
||||
#include "../common/threading_utils.h"
|
||||
#include "constraints.h"
|
||||
#include "hist/evaluate_splits.h"
|
||||
#include "param.h"
|
||||
@@ -103,7 +96,7 @@ CPUExpandEntry QuantileHistMaker::Builder::InitRoot(
|
||||
for (auto const &grad : gpair_h) {
|
||||
grad_stat.Add(grad.GetGrad(), grad.GetHess());
|
||||
}
|
||||
rabit::Allreduce<rabit::op::Sum, double>(reinterpret_cast<double *>(&grad_stat), 2);
|
||||
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&grad_stat), 2);
|
||||
}
|
||||
|
||||
auto weight = evaluator_->InitRoot(GradStats{grad_stat});
|
||||
@@ -320,7 +313,7 @@ void QuantileHistMaker::Builder::InitData(DMatrix *fmat, const RegTree &tree,
|
||||
++page_id;
|
||||
}
|
||||
histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
|
||||
rabit::IsDistributed());
|
||||
collective::IsDistributed());
|
||||
|
||||
if (param_.subsample < 1.0f) {
|
||||
CHECK_EQ(param_.sampling_method, TrainParam::kUniform)
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
#ifndef XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
|
||||
#define XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
|
||||
|
||||
#include <rabit/rabit.h>
|
||||
#include <xgboost/tree_updater.h>
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
@@ -4,17 +4,17 @@
|
||||
* \brief refresh the statistics and leaf value on the tree on the dataset
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <rabit/rabit.h>
|
||||
#include <xgboost/tree_updater.h>
|
||||
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
#include "xgboost/json.h"
|
||||
#include "./param.h"
|
||||
#include "../collective/communicator-inl.h"
|
||||
#include "../common/io.h"
|
||||
#include "../common/threading_utils.h"
|
||||
#include "../predictor/predict_fn.h"
|
||||
#include "./param.h"
|
||||
#include "xgboost/json.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
@@ -100,8 +100,9 @@ class TreeRefresher : public TreeUpdater {
|
||||
}
|
||||
});
|
||||
};
|
||||
rabit::Allreduce<rabit::op::Sum>(&dmlc::BeginPtr(stemp[0])->sum_grad, stemp[0].size() * 2,
|
||||
lazy_get_stats);
|
||||
lazy_get_stats();
|
||||
collective::Allreduce<collective::Operation::kSum>(&dmlc::BeginPtr(stemp[0])->sum_grad,
|
||||
stemp[0].size() * 2);
|
||||
// rescale learning rate according to size of trees
|
||||
float lr = param_.learning_rate;
|
||||
param_.learning_rate = lr / trees.size();
|
||||
|
||||
@@ -4,12 +4,14 @@
|
||||
* \brief synchronize the tree in all distributed nodes
|
||||
*/
|
||||
#include <xgboost/tree_updater.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <limits>
|
||||
|
||||
#include "xgboost/json.h"
|
||||
#include <limits>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "../collective/communicator-inl.h"
|
||||
#include "../common/io.h"
|
||||
#include "xgboost/json.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
@@ -35,17 +37,17 @@ class TreeSyncher : public TreeUpdater {
|
||||
void Update(HostDeviceVector<GradientPair>*, DMatrix*,
|
||||
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
|
||||
const std::vector<RegTree*>& trees) override {
|
||||
if (rabit::GetWorldSize() == 1) return;
|
||||
if (collective::GetWorldSize() == 1) return;
|
||||
std::string s_model;
|
||||
common::MemoryBufferStream fs(&s_model);
|
||||
int rank = rabit::GetRank();
|
||||
int rank = collective::GetRank();
|
||||
if (rank == 0) {
|
||||
for (auto tree : trees) {
|
||||
tree->Save(&fs);
|
||||
}
|
||||
}
|
||||
fs.Seek(0);
|
||||
rabit::Broadcast(&s_model, 0);
|
||||
collective::Broadcast(&s_model, 0);
|
||||
for (auto tree : trees) {
|
||||
tree->Load(&fs);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user