[Breaking] Switch from rabit to the collective communicator (#8257)

* Switch from rabit to the collective communicator

* fix size_t specialization

* really fix size_t

* try again

* add include

* more include

* fix lint errors

* remove rabit includes

* fix pylint error

* return dict from communicator context

* fix communicator shutdown

* fix dask test

* reset communicator mocklist

* fix distributed tests

* do not save device communicator

* fix jvm gpu tests

* add python test for federated communicator

* Update gputreeshap submodule

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
Author: Rong Ou
Date: 2022-10-05 15:39:01 -07:00
Committed by: GitHub
Commit: 668b8a0ea4
Parent: e47b3a3da3
79 changed files with 805 additions and 2212 deletions
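The per-call-site translation is mechanical: rank, size, and topology queries move from the `rabit` namespace to `collective`, and reduction operators are now spelled as `collective::Operation` enumerators. A minimal sketch of the mapping used throughout this commit (illustrative only, not part of the diff; assumes it is compiled as a source file under `src/data/` in the xgboost tree so the relative include resolves):

```cpp
#include <cstdint>

#include "../collective/communicator-inl.h"  // path assumed relative to src/data/

namespace xgboost {
void SyncExample(uint64_t *num_col) {
  // Before: rabit::GetRank(), rabit::GetWorldSize(), rabit::IsDistributed()
  int rank = collective::GetRank();
  int world_size = collective::GetWorldSize();
  bool distributed = collective::IsDistributed();

  // Before: rabit::Allreduce<rabit::op::Max>(num_col, 1);
  collective::Allreduce<collective::Operation::kMax>(num_col, 1);

  (void)rank, (void)world_size, (void)distributed;  // silence unused warnings
}
}  // namespace xgboost
```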

src/data/data.cc

@@ -2,36 +2,36 @@
  * Copyright 2015-2022 by XGBoost Contributors
  * \file data.cc
  */
 #include "xgboost/data.h"
 
 #include <dmlc/registry.h>
 
 #include <array>
 #include <cstring>
 
-#include "dmlc/io.h"
-#include "xgboost/data.h"
-#include "xgboost/c_api.h"
-#include "xgboost/host_device_vector.h"
-#include "xgboost/logging.h"
-#include "xgboost/version_config.h"
-#include "xgboost/learner.h"
-#include "xgboost/string_view.h"
-#include "sparse_page_writer.h"
-#include "simple_dmatrix.h"
+#include "../collective/communicator-inl.h"
+#include "../common/group_data.h"
 #include "../common/io.h"
 #include "../common/linalg_op.h"
 #include "../common/math.h"
 #include "../common/numeric.h"
-#include "../common/version.h"
-#include "../common/group_data.h"
 #include "../common/threading_utils.h"
+#include "../common/version.h"
 #include "../data/adapter.h"
 #include "../data/iterative_dmatrix.h"
-#include "file_iterator.h"
-#include "validation.h"
-#include "./sparse_page_source.h"
 #include "./sparse_page_dmatrix.h"
+#include "./sparse_page_source.h"
+#include "dmlc/io.h"
+#include "file_iterator.h"
+#include "simple_dmatrix.h"
+#include "sparse_page_writer.h"
+#include "validation.h"
+#include "xgboost/c_api.h"
+#include "xgboost/host_device_vector.h"
+#include "xgboost/learner.h"
+#include "xgboost/logging.h"
+#include "xgboost/string_view.h"
+#include "xgboost/version_config.h"
 
 namespace dmlc {
 DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SparsePage>);
@@ -793,12 +793,12 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, bool load_row_split,
       size_t pos = cache_shards[i].rfind('.');
       if (pos == std::string::npos) {
         os << cache_shards[i]
-           << ".r" << rabit::GetRank()
-           << "-" << rabit::GetWorldSize();
+           << ".r" << collective::GetRank()
+           << "-" << collective::GetWorldSize();
       } else {
         os << cache_shards[i].substr(0, pos)
-           << ".r" << rabit::GetRank()
-           << "-" << rabit::GetWorldSize()
+           << ".r" << collective::GetRank()
+           << "-" << collective::GetWorldSize()
            << cache_shards[i].substr(pos, cache_shards[i].length());
       }
       if (i + 1 != cache_shards.size()) {
@@ -821,8 +821,8 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, bool load_row_split,
   int partid = 0, npart = 1;
   if (load_row_split) {
-    partid = rabit::GetRank();
-    npart = rabit::GetWorldSize();
+    partid = collective::GetRank();
+    npart = collective::GetWorldSize();
   } else {
     // test option to load in part
     npart = 1;
@@ -877,7 +877,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, bool load_row_split,
   /* sync up number of features after matrix loaded.
    * partitioned data will fail the train/val validation check
    * since partitioned data not knowing the real number of features. */
-  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
+  collective::Allreduce<collective::Operation::kMax>(&dmat->Info().num_col_, 1);
   return dmat;
 }
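The only change in the shard-naming logic above is the namespace; the `.r<rank>-<world_size>` tag itself is untouched. For reference, a self-contained sketch of that naming scheme (hypothetical `ShardName` helper; the real logic lives inline in `DMatrix::Load`):

```cpp
#include <iostream>
#include <sstream>
#include <string>

// Mirrors the suffix logic above: append ".r<rank>-<world>" to the cache
// shard name, inserting it before the file extension when one exists.
std::string ShardName(const std::string &shard, int rank, int world) {
  std::ostringstream os;
  size_t pos = shard.rfind('.');
  if (pos == std::string::npos) {
    os << shard << ".r" << rank << "-" << world;
  } else {
    os << shard.substr(0, pos) << ".r" << rank << "-" << world
       << shard.substr(pos);
  }
  return os.str();
}

int main() {
  std::cout << ShardName("dtrain", 2, 4) << "\n";        // dtrain.r2-4
  std::cout << ShardName("dtrain.cache", 2, 4) << "\n";  // dtrain.r2-4.cache
}
```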

src/data/iterative_dmatrix.cc

@@ -3,13 +3,11 @@
  */
 #include "iterative_dmatrix.h"
 
-#include <rabit/rabit.h>
-
 #include <algorithm>  // std::copy
 
+#include "../collective/communicator-inl.h"
 #include "../common/categorical.h"  // common::IsCat
 #include "../common/column_matrix.h"
 #include "../common/hist_util.h"  // common::HistogramCuts
 #include "../tree/param.h"  // FIXME(jiamingy): Find a better way to share this parameter.
 #include "gradient_index.h"
 #include "proxy_dmatrix.h"
@@ -140,7 +138,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
     // We use do while here as the first batch is fetched in ctor
     if (n_features == 0) {
       n_features = num_cols();
-      rabit::Allreduce<rabit::op::Max>(&n_features, 1);
+      collective::Allreduce<collective::Operation::kMax>(&n_features, 1);
       column_sizes.resize(n_features);
       info_.num_col_ = n_features;
     } else {
@@ -157,7 +155,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   // From here on Info() has the correct data shape
   Info().num_row_ = accumulated_rows;
   Info().num_nonzero_ = nnz;
-  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
+  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
   CHECK(std::none_of(column_sizes.cbegin(), column_sizes.cend(), [&](auto f) {
     return f > accumulated_rows;
   })) << "Something went wrong during iteration.";
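The `kMax` reduction here is what reconciles worker-local views of the schema: a worker whose partition happens to contain no values in the trailing columns would otherwise under-count `num_col_`. A self-contained simulation of that reconciliation (`std::max` stands in for the communicator; the worker counts are made up):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical column counts observed locally by four workers.
  std::vector<uint64_t> local_num_col{100, 98, 100, 97};

  // Element-wise max across workers, as Allreduce<kMax> would compute it.
  uint64_t global_num_col = 0;
  for (uint64_t c : local_num_col) {
    global_num_col = std::max(global_num_col, c);
  }
  std::cout << "num_col_ after max-allreduce: " << global_num_col << "\n";  // 100
}
```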

src/data/iterative_dmatrix.cu

@@ -62,7 +62,7 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
   dh::safe_cuda(cudaSetDevice(get_device()));
   if (cols == 0) {
     cols = num_cols();
-    rabit::Allreduce<rabit::op::Max>(&cols, 1);
+    collective::Allreduce<collective::Operation::kMax>(&cols, 1);
     this->info_.num_col_ = cols;
   } else {
     CHECK_EQ(cols, num_cols()) << "Inconsistent number of columns.";
@@ -166,7 +166,7 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
   iter.Reset();
   // Synchronise worker columns
-  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
+  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
 }
 
 BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(BatchParam const& param) {

src/data/simple_dmatrix.cc

@@ -189,7 +189,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
   // Synchronise worker columns
-  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
+  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
 
   if (adapter->NumRows() == kAdapterUnknownSize) {
     using IteratorAdapterT
@@ -322,7 +322,7 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
   }
   // Synchronise worker columns
   info_.num_col_ = adapter->NumColumns();
-  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
+  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
   info_.num_row_ = total_batch_size;
   info_.num_nonzero_ = data_vec.size();
   CHECK_EQ(offset_vec.back(), info_.num_nonzero_);

src/data/simple_dmatrix.cu

@@ -35,7 +35,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread
   info_.num_col_ = adapter->NumColumns();
   info_.num_row_ = adapter->NumRows();
   // Synchronise worker columns
-  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
+  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
 }
 
 template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,

src/data/sparse_page_dmatrix.cc

@@ -5,6 +5,8 @@
  * \author Tianqi Chen
  */
 #include "./sparse_page_dmatrix.h"
+
+#include "../collective/communicator-inl.h"
 #include "./simple_batch_iterator.h"
 #include "gradient_index.h"
@@ -46,8 +48,8 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
       cache_prefix_{std::move(cache_prefix)} {
   ctx_.nthread = nthreads;
   cache_prefix_ = cache_prefix_.empty() ? "DMatrix" : cache_prefix_;
-  if (rabit::IsDistributed()) {
-    cache_prefix_ += ("-r" + std::to_string(rabit::GetRank()));
+  if (collective::IsDistributed()) {
+    cache_prefix_ += ("-r" + std::to_string(collective::GetRank()));
   }
   DMatrixProxy *proxy = MakeProxy(proxy_);
   auto iter = DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{
@@ -94,7 +96,7 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
   this->info_.num_col_ = n_features;
   this->info_.num_nonzero_ = nnz;
-  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
+  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
   CHECK_NE(info_.num_col_, 0);
 }
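Appending `-r<rank>` keeps workers that share a filesystem from clobbering one another's external-memory page cache. A standalone sketch of the prefix rule above (hypothetical `CachePrefix` helper, not part of the commit):

```cpp
#include <iostream>
#include <string>

// Mirrors the constructor logic above: default the prefix, then make it
// rank-specific when running distributed.
std::string CachePrefix(std::string prefix, bool distributed, int rank) {
  if (prefix.empty()) prefix = "DMatrix";
  if (distributed) prefix += "-r" + std::to_string(rank);
  return prefix;
}

int main() {
  std::cout << CachePrefix("", true, 3) << "\n";       // DMatrix-r3
  std::cout << CachePrefix("cache", false, 0) << "\n"; // cache
}
```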

src/data/sparse_page_dmatrix.h

@@ -14,7 +14,6 @@
 #include <map>
 #include <memory>
 
-#include "rabit/rabit.h"
 #include "xgboost/base.h"
 #include "xgboost/data.h"