[Breaking] Switch from rabit to the collective communicator (#8257)

* Switch from rabit to the collective communicator
* fix size_t specialization
* really fix size_t
* try again
* add include
* more include
* fix lint errors
* remove rabit includes
* fix pylint error
* return dict from communicator context
* fix communicator shutdown
* fix dask test
* reset communicator mocklist
* fix distributed tests
* do not save device communicator
* fix jvm gpu tests
* add python test for federated communicator
* Update gputreeshap submodule

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
2022-10-05 15:39:01 -07:00
parent e47b3a3da3
commit 668b8a0ea4
79 changed files with 805 additions and 2212 deletions
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -84,11 +84,11 @@ GradientQuantizer::GradientQuantizer(common::Span<GradientPair const> gpair) {
  // Treat pair as array of 4 primitive types to allreduce
  using ReduceT = typename decltype(p.first)::ValueT;
  static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements.");
-  rabit::Allreduce<rabit::op::Sum, ReduceT>(reinterpret_cast<ReduceT*>(&p), 4);
+  collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<ReduceT*>(&p), 4);
  GradientPair positive_sum{p.first}, negative_sum{p.second};

  std::size_t total_rows = gpair.size();
-  rabit::Allreduce<rabit::op::Sum>(&total_rows, 1);
+  collective::Allreduce<collective::Operation::kSum>(&total_rows, 1);

  auto histogram_rounding = GradientSumT{
      CreateRoundingFactor<T>(std::max(positive_sum.GetGrad(), negative_sum.GetGrad()), total_rows),
--- a/src/tree/hist/histogram.h
+++ b/src/tree/hist/histogram.h
@@ -8,10 +8,10 @@
 #include <limits>
 #include <vector>

+#include "../../collective/communicator-inl.h"
 #include "../../common/hist_util.h"
 #include "../../data/gradient_index.h"
 #include "expand_entry.h"
-#include "rabit/rabit.h"
 #include "xgboost/tree_model.h"

 namespace xgboost {
@@ -202,8 +202,9 @@ class HistogramBuilder {
          }
        });

-    rabit::Allreduce<rabit::op::Sum>(reinterpret_cast<double*>(this->hist_[starting_index].data()),
-                                     builder_.GetNumBins() * sync_count * 2);
+    collective::Allreduce<collective::Operation::kSum>(
+        reinterpret_cast<double *>(this->hist_[starting_index].data()),
+        builder_.GetNumBins() * sync_count * 2);

    ParallelSubtractionHist(space, nodes_for_explicit_hist_build,
                            nodes_for_subtraction_trick, p_tree);
--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -74,7 +74,7 @@ class GloablApproxBuilder {
    }

    histogram_builder_.Reset(n_total_bins, BatchSpec(param_, hess), ctx_->Threads(), n_batches_,
-                             rabit::IsDistributed());
+                             collective::IsDistributed());
    monitor_->Stop(__func__);
  }

@@ -88,7 +88,7 @@ class GloablApproxBuilder {
    for (auto const &g : gpair) {
      root_sum.Add(g);
    }
-    rabit::Allreduce<rabit::op::Sum, double>(reinterpret_cast<double *>(&root_sum), 2);
+    collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&root_sum), 2);
    std::vector<CPUExpandEntry> nodes{best};
    size_t i = 0;
    auto space = ConstructHistSpace(partitioner_, nodes);
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -4,8 +4,6 @@
 * \brief use columnwise update to construct a tree
 * \author Tianqi Chen
 */
-#include <rabit/rabit.h>
-#include <memory>
 #include <vector>
 #include <cmath>
 #include <algorithm>
@@ -100,7 +98,7 @@ class ColMaker: public TreeUpdater {
  void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
              common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
              const std::vector<RegTree *> &trees) override {
-    if (rabit::IsDistributed()) {
+    if (collective::IsDistributed()) {
      LOG(FATAL) << "Updater `grow_colmaker` or `exact` tree method doesn't "
                    "support distributed training.";
    }
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -19,6 +19,7 @@
 #include "xgboost/span.h"
 #include "xgboost/json.h"

+#include "../collective/device_communicator.cuh"
 #include "../common/io.h"
 #include "../common/device_helpers.cuh"
 #include "../common/hist_util.h"
@@ -528,13 +529,12 @@ struct GPUHistMakerDevice {
  }

  // num histograms is the number of contiguous histograms in memory to reduce over
-  void AllReduceHist(int nidx, dh::AllReducer* reducer, int num_histograms) {
+  void AllReduceHist(int nidx, collective::DeviceCommunicator* communicator, int num_histograms) {
    monitor.Start("AllReduce");
    auto d_node_hist = hist.GetNodeHistogram(nidx).data();
    using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT;
-    reducer->AllReduceSum(reinterpret_cast<ReduceT*>(d_node_hist),
-                          reinterpret_cast<ReduceT*>(d_node_hist),
-                          page->Cuts().TotalBins() * 2 * num_histograms);
+    communicator->AllReduceSum(reinterpret_cast<ReduceT*>(d_node_hist),
+                               page->Cuts().TotalBins() * 2 * num_histograms);

    monitor.Stop("AllReduce");
  }
@@ -542,8 +542,8 @@ struct GPUHistMakerDevice {
  /**
   * \brief Build GPU local histograms for the left and right child of some parent node
   */
-  void BuildHistLeftRight(std::vector<GPUExpandEntry> const& candidates, dh::AllReducer* reducer,
-                          const RegTree& tree) {
+  void BuildHistLeftRight(std::vector<GPUExpandEntry> const& candidates,
+                          collective::DeviceCommunicator* communicator, const RegTree& tree) {
    if (candidates.empty()) return;
    // Some nodes we will manually compute histograms
    // others we will do by subtraction
@@ -574,7 +574,7 @@ struct GPUHistMakerDevice {
    // Reduce all in one go
    // This gives much better latency in a distributed setting
    // when processing a large batch
-    this->AllReduceHist(hist_nidx.at(0), reducer, hist_nidx.size());
+    this->AllReduceHist(hist_nidx.at(0), communicator, hist_nidx.size());

    for (size_t i = 0; i < subtraction_nidx.size(); i++) {
      auto build_hist_nidx = hist_nidx.at(i);
@@ -584,7 +584,7 @@ struct GPUHistMakerDevice {
      if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) {
        // Calculate other histogram manually
        this->BuildHist(subtraction_trick_nidx);
-        this->AllReduceHist(subtraction_trick_nidx, reducer, 1);
+        this->AllReduceHist(subtraction_trick_nidx, communicator, 1);
      }
    }
  }
@@ -593,7 +593,7 @@ struct GPUHistMakerDevice {
    RegTree& tree = *p_tree;

    // Sanity check - have we created a leaf with no training instances?
-    if (!rabit::IsDistributed() && row_partitioner) {
+    if (!collective::IsDistributed() && row_partitioner) {
      CHECK(row_partitioner->GetRows(candidate.nid).size() > 0)
          << "No training instances in this leaf!";
    }
@@ -642,7 +642,7 @@ struct GPUHistMakerDevice {
                                  parent.RightChild());
  }

-  GPUExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
+  GPUExpandEntry InitRoot(RegTree* p_tree, collective::DeviceCommunicator* communicator) {
    constexpr bst_node_t kRootNIdx = 0;
    dh::XGBCachingDeviceAllocator<char> alloc;
    auto gpair_it = dh::MakeTransformIterator<GradientPairPrecise>(
@@ -650,11 +650,11 @@ struct GPUHistMakerDevice {
    GradientPairPrecise root_sum =
        dh::Reduce(thrust::cuda::par(alloc), gpair_it, gpair_it + gpair.size(),
                   GradientPairPrecise{}, thrust::plus<GradientPairPrecise>{});
-    rabit::Allreduce<rabit::op::Sum, double>(reinterpret_cast<double*>(&root_sum), 2);
+    collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double*>(&root_sum), 2);

    hist.AllocateHistograms({kRootNIdx});
    this->BuildHist(kRootNIdx);
-    this->AllReduceHist(kRootNIdx, reducer, 1);
+    this->AllReduceHist(kRootNIdx, communicator, 1);

    // Remember root stats
    node_sum_gradients[kRootNIdx] = root_sum;
@@ -669,7 +669,7 @@ struct GPUHistMakerDevice {
  }

  void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat, ObjInfo task,
-                  RegTree* p_tree, dh::AllReducer* reducer,
+                  RegTree* p_tree, collective::DeviceCommunicator* communicator,
                  HostDeviceVector<bst_node_t>* p_out_position) {
    auto& tree = *p_tree;
    // Process maximum 32 nodes at a time
@@ -680,7 +680,7 @@ struct GPUHistMakerDevice {
    monitor.Stop("Reset");

    monitor.Start("InitRoot");
-    driver.Push({ this->InitRoot(p_tree, reducer) });
+    driver.Push({ this->InitRoot(p_tree, communicator) });
    monitor.Stop("InitRoot");

    // The set of leaves that can be expanded asynchronously
@@ -707,7 +707,7 @@ struct GPUHistMakerDevice {
      monitor.Stop("UpdatePosition");

      monitor.Start("BuildHist");
-      this->BuildHistLeftRight(filtered_expand_set, reducer, tree);
+      this->BuildHistLeftRight(filtered_expand_set, communicator, tree);
      monitor.Stop("BuildHist");

      monitor.Start("EvaluateSplits");
@@ -789,11 +789,10 @@ class GPUHistMaker : public TreeUpdater {
  void InitDataOnce(DMatrix* dmat) {
    CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device";
    info_ = &dmat->Info();
-    reducer_.Init({ctx_->gpu_id});  // NOLINT

    // Synchronise the column sampling seed
    uint32_t column_sampling_seed = common::GlobalRandom()();
-    rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
+    collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);

    BatchParam batch_param{
      ctx_->gpu_id,
@@ -823,12 +822,12 @@ class GPUHistMaker : public TreeUpdater {
  void CheckTreesSynchronized(RegTree* local_tree) const {
    std::string s_model;
    common::MemoryBufferStream fs(&s_model);
-    int rank = rabit::GetRank();
+    int rank = collective::GetRank();
    if (rank == 0) {
      local_tree->Save(&fs);
    }
    fs.Seek(0);
-    rabit::Broadcast(&s_model, 0);
+    collective::Broadcast(&s_model, 0);
    RegTree reference_tree{};  // rank 0 tree
    reference_tree.Load(&fs);
    CHECK(*local_tree == reference_tree);
@@ -841,7 +840,8 @@ class GPUHistMaker : public TreeUpdater {
    monitor_.Stop("InitData");

    gpair->SetDevice(ctx_->gpu_id);
-    maker->UpdateTree(gpair, p_fmat, task_, p_tree, &reducer_, p_out_position);
+    auto* communicator = collective::Communicator::GetDevice(ctx_->gpu_id);
+    maker->UpdateTree(gpair, p_fmat, task_, p_tree, communicator, p_out_position);
  }

  bool UpdatePredictionCache(const DMatrix* data,
@@ -867,8 +867,6 @@ class GPUHistMaker : public TreeUpdater {

  GPUHistMakerTrainParam hist_maker_param_;

-  dh::AllReducer reducer_;
-
  DMatrix* p_last_fmat_{nullptr};
  RegTree const* p_last_tree_{nullptr};
  ObjInfo task_;
--- a/src/tree/updater_prune.cc
+++ b/src/tree/updater_prune.cc
@@ -4,16 +4,13 @@
 * \brief prune a tree given the statistics
 * \author Tianqi Chen
 */
-#include <rabit/rabit.h>
 #include <xgboost/tree_updater.h>

-#include <string>
 #include <memory>

 #include "xgboost/base.h"
 #include "xgboost/json.h"
 #include "./param.h"
-#include "../common/io.h"
 #include "../common/timer.h"
 namespace xgboost {
 namespace tree {
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -6,19 +6,12 @@
 */
 #include "./updater_quantile_hist.h"

-#include <rabit/rabit.h>
-
 #include <algorithm>
 #include <memory>
-#include <numeric>
 #include <string>
 #include <utility>
 #include <vector>

-#include "../common/column_matrix.h"
-#include "../common/hist_util.h"
-#include "../common/random.h"
-#include "../common/threading_utils.h"
 #include "constraints.h"
 #include "hist/evaluate_splits.h"
 #include "param.h"
@@ -103,7 +96,7 @@ CPUExpandEntry QuantileHistMaker::Builder::InitRoot(
      for (auto const &grad : gpair_h) {
        grad_stat.Add(grad.GetGrad(), grad.GetHess());
      }
-      rabit::Allreduce<rabit::op::Sum, double>(reinterpret_cast<double *>(&grad_stat), 2);
+      collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&grad_stat), 2);
    }

    auto weight = evaluator_->InitRoot(GradStats{grad_stat});
@@ -320,7 +313,7 @@ void QuantileHistMaker::Builder::InitData(DMatrix *fmat, const RegTree &tree,
      ++page_id;
    }
    histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
-                              rabit::IsDistributed());
+                              collective::IsDistributed());

    if (param_.subsample < 1.0f) {
      CHECK_EQ(param_.sampling_method, TrainParam::kUniform)
--- a/src/tree/updater_quantile_hist.h
+++ b/src/tree/updater_quantile_hist.h
@@ -7,7 +7,6 @@
 #ifndef XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
 #define XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_

-#include <rabit/rabit.h>
 #include <xgboost/tree_updater.h>

 #include <algorithm>
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -4,17 +4,17 @@
 * \brief refresh the statistics and leaf value on the tree on the dataset
 * \author Tianqi Chen
 */
-#include <rabit/rabit.h>
 #include <xgboost/tree_updater.h>

-#include <vector>
 #include <limits>
+#include <vector>

-#include "xgboost/json.h"
-#include "./param.h"
+#include "../collective/communicator-inl.h"
 #include "../common/io.h"
 #include "../common/threading_utils.h"
 #include "../predictor/predict_fn.h"
+#include "./param.h"
+#include "xgboost/json.h"

 namespace xgboost {
 namespace tree {
@@ -100,8 +100,9 @@ class TreeRefresher : public TreeUpdater {
        }
      });
    };
-    rabit::Allreduce<rabit::op::Sum>(&dmlc::BeginPtr(stemp[0])->sum_grad, stemp[0].size() * 2,
-                                     lazy_get_stats);
+    lazy_get_stats();
+    collective::Allreduce<collective::Operation::kSum>(&dmlc::BeginPtr(stemp[0])->sum_grad,
+                                                       stemp[0].size() * 2);
    // rescale learning rate according to size of trees
    float lr = param_.learning_rate;
    param_.learning_rate = lr / trees.size();
--- a/src/tree/updater_sync.cc
+++ b/src/tree/updater_sync.cc
@@ -4,12 +4,14 @@
 * \brief synchronize the tree in all distributed nodes
 */
 #include <xgboost/tree_updater.h>
-#include <vector>
-#include <string>
-#include <limits>

-#include "xgboost/json.h"
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "../collective/communicator-inl.h"
 #include "../common/io.h"
+#include "xgboost/json.h"

 namespace xgboost {
 namespace tree {
@@ -35,17 +37,17 @@ class TreeSyncher : public TreeUpdater {
  void Update(HostDeviceVector<GradientPair>*, DMatrix*,
              common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
              const std::vector<RegTree*>& trees) override {
-    if (rabit::GetWorldSize() == 1) return;
+    if (collective::GetWorldSize() == 1) return;
    std::string s_model;
    common::MemoryBufferStream fs(&s_model);
-    int rank = rabit::GetRank();
+    int rank = collective::GetRank();
    if (rank == 0) {
      for (auto tree : trees) {
        tree->Save(&fs);
      }
    }
    fs.Seek(0);
-    rabit::Broadcast(&s_model, 0);
+    collective::Broadcast(&s_model, 0);
    for (auto tree : trees) {
      tree->Load(&fs);
    }