[Breaking] Switch from rabit to the collective communicator (#8257)

* Switch from rabit to the collective communicator

* fix size_t specialization

* really fix size_t

* try again

* add include

* more include

* fix lint errors

* remove rabit includes

* fix pylint error

* return dict from communicator context

* fix communicator shutdown

* fix dask test

* reset communicator mocklist

* fix distributed tests

* do not save device communicator

* fix jvm gpu tests

* add python test for federated communicator

* Update gputreeshap submodule

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Rong Ou
2022-10-05 15:39:01 -07:00
committed by GitHub
parent e47b3a3da3
commit 668b8a0ea4
79 changed files with 805 additions and 2212 deletions

View File

@@ -84,11 +84,11 @@ GradientQuantizer::GradientQuantizer(common::Span<GradientPair const> gpair) {
// Treat pair as array of 4 primitive types to allreduce
using ReduceT = typename decltype(p.first)::ValueT;
static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements.");
rabit::Allreduce<rabit::op::Sum, ReduceT>(reinterpret_cast<ReduceT*>(&p), 4);
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<ReduceT*>(&p), 4);
GradientPair positive_sum{p.first}, negative_sum{p.second};
std::size_t total_rows = gpair.size();
rabit::Allreduce<rabit::op::Sum>(&total_rows, 1);
collective::Allreduce<collective::Operation::kSum>(&total_rows, 1);
auto histogram_rounding = GradientSumT{
CreateRoundingFactor<T>(std::max(positive_sum.GetGrad(), negative_sum.GetGrad()), total_rows),

View File

@@ -8,10 +8,10 @@
#include <limits>
#include <vector>
#include "../../collective/communicator-inl.h"
#include "../../common/hist_util.h"
#include "../../data/gradient_index.h"
#include "expand_entry.h"
#include "rabit/rabit.h"
#include "xgboost/tree_model.h"
namespace xgboost {
@@ -202,8 +202,9 @@ class HistogramBuilder {
}
});
rabit::Allreduce<rabit::op::Sum>(reinterpret_cast<double*>(this->hist_[starting_index].data()),
builder_.GetNumBins() * sync_count * 2);
collective::Allreduce<collective::Operation::kSum>(
reinterpret_cast<double *>(this->hist_[starting_index].data()),
builder_.GetNumBins() * sync_count * 2);
ParallelSubtractionHist(space, nodes_for_explicit_hist_build,
nodes_for_subtraction_trick, p_tree);

View File

@@ -74,7 +74,7 @@ class GloablApproxBuilder {
}
histogram_builder_.Reset(n_total_bins, BatchSpec(param_, hess), ctx_->Threads(), n_batches_,
rabit::IsDistributed());
collective::IsDistributed());
monitor_->Stop(__func__);
}
@@ -88,7 +88,7 @@ class GloablApproxBuilder {
for (auto const &g : gpair) {
root_sum.Add(g);
}
rabit::Allreduce<rabit::op::Sum, double>(reinterpret_cast<double *>(&root_sum), 2);
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&root_sum), 2);
std::vector<CPUExpandEntry> nodes{best};
size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes);

View File

@@ -4,8 +4,6 @@
* \brief use columnwise update to construct a tree
* \author Tianqi Chen
*/
#include <rabit/rabit.h>
#include <memory>
#include <vector>
#include <cmath>
#include <algorithm>
@@ -100,7 +98,7 @@ class ColMaker: public TreeUpdater {
void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
const std::vector<RegTree *> &trees) override {
if (rabit::IsDistributed()) {
if (collective::IsDistributed()) {
LOG(FATAL) << "Updater `grow_colmaker` or `exact` tree method doesn't "
"support distributed training.";
}

View File

@@ -19,6 +19,7 @@
#include "xgboost/span.h"
#include "xgboost/json.h"
#include "../collective/device_communicator.cuh"
#include "../common/io.h"
#include "../common/device_helpers.cuh"
#include "../common/hist_util.h"
@@ -528,13 +529,12 @@ struct GPUHistMakerDevice {
}
// num histograms is the number of contiguous histograms in memory to reduce over
void AllReduceHist(int nidx, dh::AllReducer* reducer, int num_histograms) {
void AllReduceHist(int nidx, collective::DeviceCommunicator* communicator, int num_histograms) {
monitor.Start("AllReduce");
auto d_node_hist = hist.GetNodeHistogram(nidx).data();
using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT;
reducer->AllReduceSum(reinterpret_cast<ReduceT*>(d_node_hist),
reinterpret_cast<ReduceT*>(d_node_hist),
page->Cuts().TotalBins() * 2 * num_histograms);
communicator->AllReduceSum(reinterpret_cast<ReduceT*>(d_node_hist),
page->Cuts().TotalBins() * 2 * num_histograms);
monitor.Stop("AllReduce");
}
@@ -542,8 +542,8 @@ struct GPUHistMakerDevice {
/**
* \brief Build GPU local histograms for the left and right child of some parent node
*/
void BuildHistLeftRight(std::vector<GPUExpandEntry> const& candidates, dh::AllReducer* reducer,
const RegTree& tree) {
void BuildHistLeftRight(std::vector<GPUExpandEntry> const& candidates,
collective::DeviceCommunicator* communicator, const RegTree& tree) {
if (candidates.empty()) return;
// For some nodes we compute the histogram explicitly;
// for the others we derive it by subtraction
@@ -574,7 +574,7 @@ struct GPUHistMakerDevice {
// Reduce all in one go
// This gives much better latency in a distributed setting
// when processing a large batch
this->AllReduceHist(hist_nidx.at(0), reducer, hist_nidx.size());
this->AllReduceHist(hist_nidx.at(0), communicator, hist_nidx.size());
for (size_t i = 0; i < subtraction_nidx.size(); i++) {
auto build_hist_nidx = hist_nidx.at(i);
@@ -584,7 +584,7 @@ struct GPUHistMakerDevice {
if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) {
// Calculate other histogram manually
this->BuildHist(subtraction_trick_nidx);
this->AllReduceHist(subtraction_trick_nidx, reducer, 1);
this->AllReduceHist(subtraction_trick_nidx, communicator, 1);
}
}
}
@@ -593,7 +593,7 @@ struct GPUHistMakerDevice {
RegTree& tree = *p_tree;
// Sanity check - have we created a leaf with no training instances?
if (!rabit::IsDistributed() && row_partitioner) {
if (!collective::IsDistributed() && row_partitioner) {
CHECK(row_partitioner->GetRows(candidate.nid).size() > 0)
<< "No training instances in this leaf!";
}
@@ -642,7 +642,7 @@ struct GPUHistMakerDevice {
parent.RightChild());
}
GPUExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
GPUExpandEntry InitRoot(RegTree* p_tree, collective::DeviceCommunicator* communicator) {
constexpr bst_node_t kRootNIdx = 0;
dh::XGBCachingDeviceAllocator<char> alloc;
auto gpair_it = dh::MakeTransformIterator<GradientPairPrecise>(
@@ -650,11 +650,11 @@ struct GPUHistMakerDevice {
GradientPairPrecise root_sum =
dh::Reduce(thrust::cuda::par(alloc), gpair_it, gpair_it + gpair.size(),
GradientPairPrecise{}, thrust::plus<GradientPairPrecise>{});
rabit::Allreduce<rabit::op::Sum, double>(reinterpret_cast<double*>(&root_sum), 2);
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double*>(&root_sum), 2);
hist.AllocateHistograms({kRootNIdx});
this->BuildHist(kRootNIdx);
this->AllReduceHist(kRootNIdx, reducer, 1);
this->AllReduceHist(kRootNIdx, communicator, 1);
// Remember root stats
node_sum_gradients[kRootNIdx] = root_sum;
@@ -669,7 +669,7 @@ struct GPUHistMakerDevice {
}
void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat, ObjInfo task,
RegTree* p_tree, dh::AllReducer* reducer,
RegTree* p_tree, collective::DeviceCommunicator* communicator,
HostDeviceVector<bst_node_t>* p_out_position) {
auto& tree = *p_tree;
// Process maximum 32 nodes at a time
@@ -680,7 +680,7 @@ struct GPUHistMakerDevice {
monitor.Stop("Reset");
monitor.Start("InitRoot");
driver.Push({ this->InitRoot(p_tree, reducer) });
driver.Push({ this->InitRoot(p_tree, communicator) });
monitor.Stop("InitRoot");
// The set of leaves that can be expanded asynchronously
@@ -707,7 +707,7 @@ struct GPUHistMakerDevice {
monitor.Stop("UpdatePosition");
monitor.Start("BuildHist");
this->BuildHistLeftRight(filtered_expand_set, reducer, tree);
this->BuildHistLeftRight(filtered_expand_set, communicator, tree);
monitor.Stop("BuildHist");
monitor.Start("EvaluateSplits");
@@ -789,11 +789,10 @@ class GPUHistMaker : public TreeUpdater {
void InitDataOnce(DMatrix* dmat) {
CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device";
info_ = &dmat->Info();
reducer_.Init({ctx_->gpu_id}); // NOLINT
// Synchronise the column sampling seed
uint32_t column_sampling_seed = common::GlobalRandom()();
rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
BatchParam batch_param{
ctx_->gpu_id,
@@ -823,12 +822,12 @@ class GPUHistMaker : public TreeUpdater {
void CheckTreesSynchronized(RegTree* local_tree) const {
std::string s_model;
common::MemoryBufferStream fs(&s_model);
int rank = rabit::GetRank();
int rank = collective::GetRank();
if (rank == 0) {
local_tree->Save(&fs);
}
fs.Seek(0);
rabit::Broadcast(&s_model, 0);
collective::Broadcast(&s_model, 0);
RegTree reference_tree{}; // rank 0 tree
reference_tree.Load(&fs);
CHECK(*local_tree == reference_tree);
@@ -841,7 +840,8 @@ class GPUHistMaker : public TreeUpdater {
monitor_.Stop("InitData");
gpair->SetDevice(ctx_->gpu_id);
maker->UpdateTree(gpair, p_fmat, task_, p_tree, &reducer_, p_out_position);
auto* communicator = collective::Communicator::GetDevice(ctx_->gpu_id);
maker->UpdateTree(gpair, p_fmat, task_, p_tree, communicator, p_out_position);
}
bool UpdatePredictionCache(const DMatrix* data,
@@ -867,8 +867,6 @@ class GPUHistMaker : public TreeUpdater {
GPUHistMakerTrainParam hist_maker_param_;
dh::AllReducer reducer_;
DMatrix* p_last_fmat_{nullptr};
RegTree const* p_last_tree_{nullptr};
ObjInfo task_;

View File

@@ -4,16 +4,13 @@
* \brief prune a tree given the statistics
* \author Tianqi Chen
*/
#include <rabit/rabit.h>
#include <xgboost/tree_updater.h>
#include <string>
#include <memory>
#include "xgboost/base.h"
#include "xgboost/json.h"
#include "./param.h"
#include "../common/io.h"
#include "../common/timer.h"
namespace xgboost {
namespace tree {

View File

@@ -6,19 +6,12 @@
*/
#include "./updater_quantile_hist.h"
#include <rabit/rabit.h>
#include <algorithm>
#include <memory>
#include <numeric>
#include <string>
#include <utility>
#include <vector>
#include "../common/column_matrix.h"
#include "../common/hist_util.h"
#include "../common/random.h"
#include "../common/threading_utils.h"
#include "constraints.h"
#include "hist/evaluate_splits.h"
#include "param.h"
@@ -103,7 +96,7 @@ CPUExpandEntry QuantileHistMaker::Builder::InitRoot(
for (auto const &grad : gpair_h) {
grad_stat.Add(grad.GetGrad(), grad.GetHess());
}
rabit::Allreduce<rabit::op::Sum, double>(reinterpret_cast<double *>(&grad_stat), 2);
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&grad_stat), 2);
}
auto weight = evaluator_->InitRoot(GradStats{grad_stat});
@@ -320,7 +313,7 @@ void QuantileHistMaker::Builder::InitData(DMatrix *fmat, const RegTree &tree,
++page_id;
}
histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
rabit::IsDistributed());
collective::IsDistributed());
if (param_.subsample < 1.0f) {
CHECK_EQ(param_.sampling_method, TrainParam::kUniform)

View File

@@ -7,7 +7,6 @@
#ifndef XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
#define XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
#include <rabit/rabit.h>
#include <xgboost/tree_updater.h>
#include <algorithm>

View File

@@ -4,17 +4,17 @@
* \brief refresh the statistics and leaf value on the tree on the dataset
* \author Tianqi Chen
*/
#include <rabit/rabit.h>
#include <xgboost/tree_updater.h>
#include <vector>
#include <limits>
#include <vector>
#include "xgboost/json.h"
#include "./param.h"
#include "../collective/communicator-inl.h"
#include "../common/io.h"
#include "../common/threading_utils.h"
#include "../predictor/predict_fn.h"
#include "./param.h"
#include "xgboost/json.h"
namespace xgboost {
namespace tree {
@@ -100,8 +100,9 @@ class TreeRefresher : public TreeUpdater {
}
});
};
rabit::Allreduce<rabit::op::Sum>(&dmlc::BeginPtr(stemp[0])->sum_grad, stemp[0].size() * 2,
lazy_get_stats);
lazy_get_stats();
collective::Allreduce<collective::Operation::kSum>(&dmlc::BeginPtr(stemp[0])->sum_grad,
stemp[0].size() * 2);
// rescale learning rate according to the number of trees
float lr = param_.learning_rate;
param_.learning_rate = lr / trees.size();

View File

@@ -4,12 +4,14 @@
* \brief synchronize the tree in all distributed nodes
*/
#include <xgboost/tree_updater.h>
#include <vector>
#include <string>
#include <limits>
#include "xgboost/json.h"
#include <limits>
#include <string>
#include <vector>
#include "../collective/communicator-inl.h"
#include "../common/io.h"
#include "xgboost/json.h"
namespace xgboost {
namespace tree {
@@ -35,17 +37,17 @@ class TreeSyncher : public TreeUpdater {
void Update(HostDeviceVector<GradientPair>*, DMatrix*,
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
const std::vector<RegTree*>& trees) override {
if (rabit::GetWorldSize() == 1) return;
if (collective::GetWorldSize() == 1) return;
std::string s_model;
common::MemoryBufferStream fs(&s_model);
int rank = rabit::GetRank();
int rank = collective::GetRank();
if (rank == 0) {
for (auto tree : trees) {
tree->Save(&fs);
}
}
fs.Seek(0);
rabit::Broadcast(&s_model, 0);
collective::Broadcast(&s_model, 0);
for (auto tree : trees) {
tree->Load(&fs);
}