[Breaking] Switch from rabit to the collective communicator (#8257)

* Switch from rabit to the collective communicator

* fix size_t specialization

* really fix size_t

* try again

* add include

* more include

* fix lint errors

* remove rabit includes

* fix pylint error

* return dict from communicator context

* fix communicator shutdown

* fix dask test

* reset communicator mocklist

* fix distributed tests

* do not save device communicator

* fix jvm gpu tests

* add python test for federated communicator

* Update gputreeshap submodule

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Rong Ou
2022-10-05 15:39:01 -07:00
committed by GitHub
parent e47b3a3da3
commit 668b8a0ea4
79 changed files with 805 additions and 2212 deletions

View File

@@ -23,6 +23,7 @@
 #include <utility>
 #include <vector>
+#include "collective/communicator-inl.h"
 #include "common/charconv.h"
 #include "common/common.h"
 #include "common/io.h"
@@ -478,7 +479,7 @@ class LearnerConfiguration : public Learner {
 // add additional parameters
 // These are cosntraints that need to be satisfied.
-if (tparam_.dsplit == DataSplitMode::kAuto && rabit::IsDistributed()) {
+if (tparam_.dsplit == DataSplitMode::kAuto && collective::IsDistributed()) {
 tparam_.dsplit = DataSplitMode::kRow;
 }
@@ -757,7 +758,7 @@ class LearnerConfiguration : public Learner {
 num_feature = std::max(num_feature, static_cast<uint32_t>(num_col));
 }
-rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
+collective::Allreduce<collective::Operation::kMax>(&num_feature, 1);
 if (num_feature > mparam_.num_feature) {
 mparam_.num_feature = num_feature;
 }
@@ -1083,7 +1084,7 @@ class LearnerIO : public LearnerConfiguration {
 cfg_.insert(n.cbegin(), n.cend());
 // copy dsplit from config since it will not run again during restore
-if (tparam_.dsplit == DataSplitMode::kAuto && rabit::IsDistributed()) {
+if (tparam_.dsplit == DataSplitMode::kAuto && collective::IsDistributed()) {
 tparam_.dsplit = DataSplitMode::kRow;
 }
@@ -1228,7 +1229,7 @@ class LearnerImpl : public LearnerIO {
 }
 // Configuration before data is known.
 void CheckDataSplitMode() {
-if (rabit::IsDistributed()) {
+if (collective::IsDistributed()) {
 CHECK(tparam_.dsplit != DataSplitMode::kAuto)
 << "Precondition violated; dsplit cannot be 'auto' in distributed mode";
 if (tparam_.dsplit == DataSplitMode::kCol) {
@@ -1488,7 +1489,7 @@ class LearnerImpl : public LearnerIO {
 }
 if (p_fmat->Info().num_row_ == 0) {
-LOG(WARNING) << "Empty dataset at worker: " << rabit::GetRank();
+LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
 }
 }