[Breaking] Switch from rabit to the collective communicator (#8257)

* Switch from rabit to the collective communicator

* fix size_t specialization

* really fix size_t

* try again

* add include

* more include

* fix lint errors

* remove rabit includes

* fix pylint error

* return dict from communicator context

* fix communicator shutdown

* fix dask test

* reset communicator mocklist

* fix distributed tests

* do not save device communicator

* fix jvm gpu tests

* add python test for federated communicator

* Update gputreeshap submodule

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Rong Ou
2022-10-05 15:39:01 -07:00
committed by GitHub
parent e47b3a3da3
commit 668b8a0ea4
79 changed files with 805 additions and 2212 deletions

View File

@@ -7,8 +7,8 @@
#include <limits>
#include <vector>
#include "../collective/communicator-inl.h"
#include "../common/common.h"
#include "rabit/rabit.h"
#include "xgboost/generic_parameters.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/tree_model.h"
@@ -39,7 +39,7 @@ inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_no
auto const& h_node_idx = nidx;
size_t n_leaf{h_node_idx.size()};
rabit::Allreduce<rabit::op::Max>(&n_leaf, 1);
collective::Allreduce<collective::Operation::kMax>(&n_leaf, 1);
CHECK(quantiles.empty() || quantiles.size() == n_leaf);
if (quantiles.empty()) {
quantiles.resize(n_leaf, std::numeric_limits<float>::quiet_NaN());
@@ -49,12 +49,12 @@ inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_no
std::vector<int32_t> n_valids(quantiles.size());
std::transform(quantiles.cbegin(), quantiles.cend(), n_valids.begin(),
[](float q) { return static_cast<int32_t>(!std::isnan(q)); });
rabit::Allreduce<rabit::op::Sum>(n_valids.data(), n_valids.size());
collective::Allreduce<collective::Operation::kSum>(n_valids.data(), n_valids.size());
// convert to 0 for all reduce
std::replace_if(
quantiles.begin(), quantiles.end(), [](float q) { return std::isnan(q); }, 0.f);
// use the mean value
rabit::Allreduce<rabit::op::Sum>(quantiles.data(), quantiles.size());
collective::Allreduce<collective::Operation::kSum>(quantiles.data(), quantiles.size());
for (size_t i = 0; i < n_leaf; ++i) {
if (n_valids[i] > 0) {
quantiles[i] /= static_cast<float>(n_valids[i]);

View File

@@ -724,8 +724,8 @@ class MeanAbsoluteError : public ObjFunction {
}
// Weighted average base score across all workers
rabit::Allreduce<rabit::op::Sum>(out.Values().data(), out.Values().size());
rabit::Allreduce<rabit::op::Sum>(&w, 1);
collective::Allreduce<collective::Operation::kSum>(out.Values().data(), out.Values().size());
collective::Allreduce<collective::Operation::kSum>(&w, 1);
std::transform(linalg::cbegin(out), linalg::cend(out), linalg::begin(out),
[w](float v) { return v / w; });