[Breaking] Switch from rabit to the collective communicator (#8257)

* Switch from rabit to the collective communicator

* fix size_t specialization

* really fix size_t

* try again

* add include

* more include

* fix lint errors

* remove rabit includes

* fix pylint error

* return dict from communicator context

* fix communicator shutdown

* fix dask test

* reset communicator mocklist

* fix distributed tests

* do not save device communicator

* fix jvm gpu tests

* add python test for federated communicator

* Update gputreeshap submodule

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Rong Ou
2022-10-05 15:39:01 -07:00
committed by GitHub
parent e47b3a3da3
commit 668b8a0ea4
79 changed files with 805 additions and 2212 deletions

View File

@@ -1,27 +1,23 @@
/*!
* Copyright 2021 by XGBoost Contributors
*/
#include "auc.h"
#include <algorithm>
#include <array>
#include <atomic>
#include <algorithm>
#include <functional>
#include <limits>
#include <memory>
#include <numeric>
#include <utility>
#include <tuple>
#include <utility>
#include <vector>
#include "rabit/rabit.h"
#include "xgboost/linalg.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/metric.h"
#include "auc.h"
#include "../common/common.h"
#include "../common/math.h"
#include "../common/threading_utils.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/linalg.h"
#include "xgboost/metric.h"
namespace xgboost {
namespace metric {
@@ -117,7 +113,8 @@ double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
// we have 2 averages going in here, first is among workers, second is among
// classes. allreduce sums up fp/tp auc for each class.
rabit::Allreduce<rabit::op::Sum>(results.Values().data(), results.Values().size());
collective::Allreduce<collective::Operation::kSum>(results.Values().data(),
results.Values().size());
double auc_sum{0};
double tp_sum{0};
for (size_t c = 0; c < n_classes; ++c) {
@@ -265,7 +262,7 @@ class EvalAUC : public Metric {
}
// We use the global size to handle empty dataset.
std::array<size_t, 2> meta{info.labels.Size(), preds.Size()};
rabit::Allreduce<rabit::op::Max>(meta.data(), meta.size());
collective::Allreduce<collective::Operation::kMax>(meta.data(), meta.size());
if (meta[0] == 0) {
// Empty across all workers, which is not supported.
auc = std::numeric_limits<double>::quiet_NaN();
@@ -287,7 +284,7 @@ class EvalAUC : public Metric {
}
std::array<double, 2> results{auc, static_cast<double>(valid_groups)};
rabit::Allreduce<rabit::op::Sum>(results.data(), results.size());
collective::Allreduce<collective::Operation::kSum>(results.data(), results.size());
auc = results[0];
valid_groups = static_cast<uint32_t>(results[1]);
@@ -316,7 +313,7 @@ class EvalAUC : public Metric {
}
double local_area = fp * tp;
std::array<double, 2> result{auc, local_area};
rabit::Allreduce<rabit::op::Sum>(result.data(), result.size());
collective::Allreduce<collective::Operation::kSum>(result.data(), result.size());
std::tie(auc, local_area) = common::UnpackArr(std::move(result));
if (local_area <= 0) {
// the dataset across all workers has only positive or negative samples

View File

@@ -11,11 +11,10 @@
#include <utility>
#include <tuple>
#include "rabit/rabit.h"
#include "xgboost/span.h"
#include "xgboost/data.h"
#include "auc.h"
#include "../common/device_helpers.cuh"
#include "../collective/device_communicator.cuh"
#include "../common/ranking_utils.cuh"
namespace xgboost {
@@ -46,9 +45,8 @@ struct DeviceAUCCache {
dh::device_vector<size_t> unique_idx;
// p^T: transposed prediction matrix, used by MultiClassAUC
dh::device_vector<float> predts_t;
std::unique_ptr<dh::AllReducer> reducer;
void Init(common::Span<float const> predts, bool is_multi, int32_t device) {
void Init(common::Span<float const> predts, bool is_multi) {
if (sorted_idx.size() != predts.size()) {
sorted_idx.resize(predts.size());
fptp.resize(sorted_idx.size());
@@ -58,10 +56,6 @@ struct DeviceAUCCache {
predts_t.resize(sorted_idx.size());
}
}
if (is_multi && !reducer) {
reducer.reset(new dh::AllReducer);
reducer->Init(device);
}
}
};
@@ -72,7 +66,7 @@ void InitCacheOnce(common::Span<float const> predts, int32_t device,
if (!cache) {
cache.reset(new DeviceAUCCache);
}
cache->Init(predts, is_multi, device);
cache->Init(predts, is_multi);
}
/**
@@ -205,9 +199,11 @@ double ScaleClasses(common::Span<double> results, common::Span<double> local_are
common::Span<double> tp, common::Span<double> auc,
std::shared_ptr<DeviceAUCCache> cache, size_t n_classes) {
dh::XGBDeviceAllocator<char> alloc;
if (rabit::IsDistributed()) {
CHECK_EQ(dh::CudaGetPointerDevice(results.data()), dh::CurrentDevice());
cache->reducer->AllReduceSum(results.data(), results.data(), results.size());
if (collective::IsDistributed()) {
int32_t device = dh::CurrentDevice();
CHECK_EQ(dh::CudaGetPointerDevice(results.data()), device);
auto* communicator = collective::Communicator::GetDevice(device);
communicator->AllReduceSum(results.data(), results.size());
}
auto reduce_in = dh::MakeTransformIterator<Pair>(
thrust::make_counting_iterator(0), [=] XGBOOST_DEVICE(size_t i) {

View File

@@ -10,13 +10,13 @@
#include <tuple>
#include <utility>
#include "rabit/rabit.h"
#include "xgboost/base.h"
#include "xgboost/span.h"
#include "xgboost/data.h"
#include "xgboost/metric.h"
#include "../collective/communicator-inl.h"
#include "../common/common.h"
#include "../common/threading_utils.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/metric.h"
#include "xgboost/span.h"
namespace xgboost {
namespace metric {
@@ -101,7 +101,7 @@ XGBOOST_DEVICE inline double CalcDeltaPRAUC(double fp_prev, double fp,
inline void InvalidGroupAUC() {
LOG(INFO) << "Invalid group with less than 3 samples is found on worker "
<< rabit::GetRank() << ". Calculating AUC value requires at "
<< collective::GetRank() << ". Calculating AUC value requires at "
<< "least 2 pairs of samples.";
}

View File

@@ -7,11 +7,11 @@
* Expressions like wsum == 0 ? esum : esum / wsum are used to handle empty datasets.
*/
#include <dmlc/registry.h>
#include <rabit/rabit.h>
#include <xgboost/metric.h>
#include <cmath>
#include "../collective/communicator-inl.h"
#include "../common/common.h"
#include "../common/math.h"
#include "../common/pseudo_huber.h"
@@ -196,8 +196,8 @@ class PseudoErrorLoss : public Metric {
return std::make_tuple(v, wt);
});
double dat[2]{result.Residue(), result.Weights()};
if (rabit::IsDistributed()) {
rabit::Allreduce<rabit::op::Sum>(dat, 2);
if (collective::IsDistributed()) {
collective::Allreduce<collective::Operation::kSum>(dat, 2);
}
return EvalRowMAPE::GetFinal(dat[0], dat[1]);
}
@@ -365,7 +365,7 @@ struct EvalEWiseBase : public Metric {
});
double dat[2]{result.Residue(), result.Weights()};
rabit::Allreduce<rabit::op::Sum>(dat, 2);
collective::Allreduce<collective::Operation::kSum>(dat, 2);
return Policy::GetFinal(dat[0], dat[1]);
}

View File

@@ -4,15 +4,14 @@
* \brief evaluation metrics for multiclass classification.
* \author Kailong Chen, Tianqi Chen
*/
#include <rabit/rabit.h>
#include <xgboost/metric.h>
#include <atomic>
#include <cmath>
#include "metric_common.h"
#include "../collective/communicator-inl.h"
#include "../common/math.h"
#include "../common/common.h"
#include "../common/threading_utils.h"
#if defined(XGBOOST_USE_CUDA)
@@ -185,7 +184,7 @@ struct EvalMClassBase : public Metric {
dat[0] = result.Residue();
dat[1] = result.Weights();
}
rabit::Allreduce<rabit::op::Sum>(dat, 2);
collective::Allreduce<collective::Operation::kSum>(dat, 2);
return Derived::GetFinal(dat[0], dat[1]);
}
/*!

View File

@@ -20,17 +20,17 @@
// corresponding headers that bring in those function declarations can't be included with CUDA).
// This precludes the CPU and GPU logic from coexisting inside a .cu file
#include <rabit/rabit.h>
#include <xgboost/metric.h>
#include <dmlc/registry.h>
#include <cmath>
#include <xgboost/metric.h>
#include <cmath>
#include <vector>
#include "xgboost/host_device_vector.h"
#include "../collective/communicator-inl.h"
#include "../common/math.h"
#include "../common/threading_utils.h"
#include "metric_common.h"
#include "xgboost/host_device_vector.h"
namespace {
@@ -103,7 +103,7 @@ struct EvalAMS : public Metric {
}
double Eval(const HostDeviceVector<bst_float>& preds, const MetaInfo& info) override {
CHECK(!rabit::IsDistributed()) << "metric AMS do not support distributed evaluation";
CHECK(!collective::IsDistributed()) << "metric AMS do not support distributed evaluation";
using namespace std; // NOLINT(*)
const auto ndata = static_cast<bst_omp_uint>(info.labels.Size());
@@ -216,10 +216,10 @@ struct EvalRank : public Metric, public EvalRankConfig {
exc.Rethrow();
}
if (rabit::IsDistributed()) {
if (collective::IsDistributed()) {
double dat[2]{sum_metric, static_cast<double>(ngroups)};
// approximately estimate the metric using mean
rabit::Allreduce<rabit::op::Sum>(dat, 2);
collective::Allreduce<collective::Operation::kSum>(dat, 2);
return dat[0] / dat[1];
} else {
return sum_metric / ngroups;
@@ -341,7 +341,7 @@ struct EvalCox : public Metric {
public:
EvalCox() = default;
double Eval(const HostDeviceVector<bst_float>& preds, const MetaInfo& info) override {
CHECK(!rabit::IsDistributed()) << "Cox metric does not support distributed evaluation";
CHECK(!collective::IsDistributed()) << "Cox metric does not support distributed evaluation";
using namespace std; // NOLINT(*)
const auto ndata = static_cast<bst_omp_uint>(info.labels.Size());

View File

@@ -4,15 +4,12 @@
* \brief prediction rank based metrics.
* \author Kailong Chen, Tianqi Chen
*/
#include <rabit/rabit.h>
#include <dmlc/registry.h>
#include <xgboost/metric.h>
#include <xgboost/host_device_vector.h>
#include <thrust/iterator/discard_iterator.h>
#include <cmath>
#include <array>
#include <vector>
#include "metric_common.h"

View File

@@ -5,7 +5,6 @@
* \author Avinash Barnwal, Hyunsu Cho and Toby Hocking
*/
#include <rabit/rabit.h>
#include <dmlc/registry.h>
#include <memory>
@@ -16,6 +15,7 @@
#include "xgboost/host_device_vector.h"
#include "metric_common.h"
#include "../collective/communicator-inl.h"
#include "../common/math.h"
#include "../common/survival_util.h"
#include "../common/threading_utils.h"
@@ -214,7 +214,7 @@ template <typename Policy> struct EvalEWiseSurvivalBase : public Metric {
info.labels_upper_bound_, preds);
double dat[2]{result.Residue(), result.Weights()};
rabit::Allreduce<rabit::op::Sum>(dat, 2);
collective::Allreduce<collective::Operation::kSum>(dat, 2);
return Policy::GetFinal(dat[0], dat[1]);
}