[Breaking] Switch from rabit to the collective communicator (#8257)
* Switch from rabit to the collective communicator * fix size_t specialization * really fix size_t * try again * add include * more include * fix lint errors * remove rabit includes * fix pylint error * return dict from communicator context * fix communicator shutdown * fix dask test * reset communicator mocklist * fix distributed tests * do not save device communicator * fix jvm gpu tests * add python test for federated communicator * Update gputreeshap submodule Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
This commit is contained in:
@@ -23,6 +23,7 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "collective/communicator-inl.h"
|
||||
#include "common/charconv.h"
|
||||
#include "common/common.h"
|
||||
#include "common/io.h"
|
||||
@@ -478,7 +479,7 @@ class LearnerConfiguration : public Learner {
|
||||
|
||||
// add additional parameters
|
||||
// These are constraints that need to be satisfied.
|
||||
if (tparam_.dsplit == DataSplitMode::kAuto && rabit::IsDistributed()) {
|
||||
if (tparam_.dsplit == DataSplitMode::kAuto && collective::IsDistributed()) {
|
||||
tparam_.dsplit = DataSplitMode::kRow;
|
||||
}
|
||||
|
||||
@@ -757,7 +758,7 @@ class LearnerConfiguration : public Learner {
|
||||
num_feature = std::max(num_feature, static_cast<uint32_t>(num_col));
|
||||
}
|
||||
|
||||
rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
|
||||
collective::Allreduce<collective::Operation::kMax>(&num_feature, 1);
|
||||
if (num_feature > mparam_.num_feature) {
|
||||
mparam_.num_feature = num_feature;
|
||||
}
|
||||
@@ -1083,7 +1084,7 @@ class LearnerIO : public LearnerConfiguration {
|
||||
cfg_.insert(n.cbegin(), n.cend());
|
||||
|
||||
// copy dsplit from config since it will not run again during restore
|
||||
if (tparam_.dsplit == DataSplitMode::kAuto && rabit::IsDistributed()) {
|
||||
if (tparam_.dsplit == DataSplitMode::kAuto && collective::IsDistributed()) {
|
||||
tparam_.dsplit = DataSplitMode::kRow;
|
||||
}
|
||||
|
||||
@@ -1228,7 +1229,7 @@ class LearnerImpl : public LearnerIO {
|
||||
}
|
||||
// Configuration before data is known.
|
||||
void CheckDataSplitMode() {
|
||||
if (rabit::IsDistributed()) {
|
||||
if (collective::IsDistributed()) {
|
||||
CHECK(tparam_.dsplit != DataSplitMode::kAuto)
|
||||
<< "Precondition violated; dsplit cannot be 'auto' in distributed mode";
|
||||
if (tparam_.dsplit == DataSplitMode::kCol) {
|
||||
@@ -1488,7 +1489,7 @@ class LearnerImpl : public LearnerIO {
|
||||
}
|
||||
|
||||
if (p_fmat->Info().num_row_ == 0) {
|
||||
LOG(WARNING) << "Empty dataset at worker: " << rabit::GetRank();
|
||||
LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user