[Breaking] Switch from rabit to the collective communicator (#8257)
* Switch from rabit to the collective communicator
* fix size_t specialization
* really fix size_t
* try again
* add include
* more include
* fix lint errors
* remove rabit includes
* fix pylint error
* return dict from communicator context
* fix communicator shutdown
* fix dask test
* reset communicator mocklist
* fix distributed tests
* do not save device communicator
* fix jvm gpu tests
* add python test for federated communicator
* Update gputreeshap submodule

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
This commit is contained in:
@@ -1,27 +1,23 @@
|
||||
/*!
|
||||
* Copyright 2021 by XGBoost Contributors
|
||||
*/
|
||||
#include "auc.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <numeric>
|
||||
#include <utility>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "rabit/rabit.h"
|
||||
#include "xgboost/linalg.h"
|
||||
#include "xgboost/host_device_vector.h"
|
||||
#include "xgboost/metric.h"
|
||||
|
||||
#include "auc.h"
|
||||
|
||||
#include "../common/common.h"
|
||||
#include "../common/math.h"
|
||||
#include "../common/threading_utils.h"
|
||||
#include "xgboost/host_device_vector.h"
|
||||
#include "xgboost/linalg.h"
|
||||
#include "xgboost/metric.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace metric {
|
||||
@@ -117,7 +113,8 @@ double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
|
||||
|
||||
// we have 2 averages going in here, first is among workers, second is among
|
||||
// classes. allreduce sums up fp/tp auc for each class.
|
||||
rabit::Allreduce<rabit::op::Sum>(results.Values().data(), results.Values().size());
|
||||
collective::Allreduce<collective::Operation::kSum>(results.Values().data(),
|
||||
results.Values().size());
|
||||
double auc_sum{0};
|
||||
double tp_sum{0};
|
||||
for (size_t c = 0; c < n_classes; ++c) {
|
||||
@@ -265,7 +262,7 @@ class EvalAUC : public Metric {
|
||||
}
|
||||
// We use the global size to handle an empty dataset.
|
||||
std::array<size_t, 2> meta{info.labels.Size(), preds.Size()};
|
||||
rabit::Allreduce<rabit::op::Max>(meta.data(), meta.size());
|
||||
collective::Allreduce<collective::Operation::kMax>(meta.data(), meta.size());
|
||||
if (meta[0] == 0) {
|
||||
// Empty across all workers, which is not supported.
|
||||
auc = std::numeric_limits<double>::quiet_NaN();
|
||||
@@ -287,7 +284,7 @@ class EvalAUC : public Metric {
|
||||
}
|
||||
|
||||
std::array<double, 2> results{auc, static_cast<double>(valid_groups)};
|
||||
rabit::Allreduce<rabit::op::Sum>(results.data(), results.size());
|
||||
collective::Allreduce<collective::Operation::kSum>(results.data(), results.size());
|
||||
auc = results[0];
|
||||
valid_groups = static_cast<uint32_t>(results[1]);
|
||||
|
||||
@@ -316,7 +313,7 @@ class EvalAUC : public Metric {
|
||||
}
|
||||
double local_area = fp * tp;
|
||||
std::array<double, 2> result{auc, local_area};
|
||||
rabit::Allreduce<rabit::op::Sum>(result.data(), result.size());
|
||||
collective::Allreduce<collective::Operation::kSum>(result.data(), result.size());
|
||||
std::tie(auc, local_area) = common::UnpackArr(std::move(result));
|
||||
if (local_area <= 0) {
|
||||
// the dataset across all workers has only positive or only negative samples
|
||||
|
||||
@@ -11,11 +11,10 @@
|
||||
#include <utility>
|
||||
#include <tuple>
|
||||
|
||||
#include "rabit/rabit.h"
|
||||
#include "xgboost/span.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "auc.h"
|
||||
#include "../common/device_helpers.cuh"
|
||||
#include "../collective/device_communicator.cuh"
|
||||
#include "../common/ranking_utils.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
@@ -46,9 +45,8 @@ struct DeviceAUCCache {
|
||||
dh::device_vector<size_t> unique_idx;
|
||||
// p^T: transposed prediction matrix, used by MultiClassAUC
|
||||
dh::device_vector<float> predts_t;
|
||||
std::unique_ptr<dh::AllReducer> reducer;
|
||||
|
||||
void Init(common::Span<float const> predts, bool is_multi, int32_t device) {
|
||||
void Init(common::Span<float const> predts, bool is_multi) {
|
||||
if (sorted_idx.size() != predts.size()) {
|
||||
sorted_idx.resize(predts.size());
|
||||
fptp.resize(sorted_idx.size());
|
||||
@@ -58,10 +56,6 @@ struct DeviceAUCCache {
|
||||
predts_t.resize(sorted_idx.size());
|
||||
}
|
||||
}
|
||||
if (is_multi && !reducer) {
|
||||
reducer.reset(new dh::AllReducer);
|
||||
reducer->Init(device);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -72,7 +66,7 @@ void InitCacheOnce(common::Span<float const> predts, int32_t device,
|
||||
if (!cache) {
|
||||
cache.reset(new DeviceAUCCache);
|
||||
}
|
||||
cache->Init(predts, is_multi, device);
|
||||
cache->Init(predts, is_multi);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -205,9 +199,11 @@ double ScaleClasses(common::Span<double> results, common::Span<double> local_are
|
||||
common::Span<double> tp, common::Span<double> auc,
|
||||
std::shared_ptr<DeviceAUCCache> cache, size_t n_classes) {
|
||||
dh::XGBDeviceAllocator<char> alloc;
|
||||
if (rabit::IsDistributed()) {
|
||||
CHECK_EQ(dh::CudaGetPointerDevice(results.data()), dh::CurrentDevice());
|
||||
cache->reducer->AllReduceSum(results.data(), results.data(), results.size());
|
||||
if (collective::IsDistributed()) {
|
||||
int32_t device = dh::CurrentDevice();
|
||||
CHECK_EQ(dh::CudaGetPointerDevice(results.data()), device);
|
||||
auto* communicator = collective::Communicator::GetDevice(device);
|
||||
communicator->AllReduceSum(results.data(), results.size());
|
||||
}
|
||||
auto reduce_in = dh::MakeTransformIterator<Pair>(
|
||||
thrust::make_counting_iterator(0), [=] XGBOOST_DEVICE(size_t i) {
|
||||
|
||||
@@ -10,13 +10,13 @@
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
|
||||
#include "rabit/rabit.h"
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/span.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/metric.h"
|
||||
#include "../collective/communicator-inl.h"
|
||||
#include "../common/common.h"
|
||||
#include "../common/threading_utils.h"
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/metric.h"
|
||||
#include "xgboost/span.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace metric {
|
||||
@@ -101,7 +101,7 @@ XGBOOST_DEVICE inline double CalcDeltaPRAUC(double fp_prev, double fp,
|
||||
|
||||
inline void InvalidGroupAUC() {
|
||||
LOG(INFO) << "Invalid group with less than 3 samples is found on worker "
|
||||
<< rabit::GetRank() << ". Calculating AUC value requires at "
|
||||
<< collective::GetRank() << ". Calculating AUC value requires at "
|
||||
<< "least 2 pairs of samples.";
|
||||
}
|
||||
|
||||
|
||||
@@ -7,11 +7,11 @@
|
||||
* Expressions like `wsum == 0 ? esum : esum / wsum` are used to handle an empty dataset.
|
||||
*/
|
||||
#include <dmlc/registry.h>
|
||||
#include <rabit/rabit.h>
|
||||
#include <xgboost/metric.h>
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "../collective/communicator-inl.h"
|
||||
#include "../common/common.h"
|
||||
#include "../common/math.h"
|
||||
#include "../common/pseudo_huber.h"
|
||||
@@ -196,8 +196,8 @@ class PseudoErrorLoss : public Metric {
|
||||
return std::make_tuple(v, wt);
|
||||
});
|
||||
double dat[2]{result.Residue(), result.Weights()};
|
||||
if (rabit::IsDistributed()) {
|
||||
rabit::Allreduce<rabit::op::Sum>(dat, 2);
|
||||
if (collective::IsDistributed()) {
|
||||
collective::Allreduce<collective::Operation::kSum>(dat, 2);
|
||||
}
|
||||
return EvalRowMAPE::GetFinal(dat[0], dat[1]);
|
||||
}
|
||||
@@ -365,7 +365,7 @@ struct EvalEWiseBase : public Metric {
|
||||
});
|
||||
|
||||
double dat[2]{result.Residue(), result.Weights()};
|
||||
rabit::Allreduce<rabit::op::Sum>(dat, 2);
|
||||
collective::Allreduce<collective::Operation::kSum>(dat, 2);
|
||||
return Policy::GetFinal(dat[0], dat[1]);
|
||||
}
|
||||
|
||||
|
||||
@@ -4,15 +4,14 @@
|
||||
* \brief evaluation metrics for multiclass classification.
|
||||
* \author Kailong Chen, Tianqi Chen
|
||||
*/
|
||||
#include <rabit/rabit.h>
|
||||
#include <xgboost/metric.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cmath>
|
||||
|
||||
#include "metric_common.h"
|
||||
#include "../collective/communicator-inl.h"
|
||||
#include "../common/math.h"
|
||||
#include "../common/common.h"
|
||||
#include "../common/threading_utils.h"
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
@@ -185,7 +184,7 @@ struct EvalMClassBase : public Metric {
|
||||
dat[0] = result.Residue();
|
||||
dat[1] = result.Weights();
|
||||
}
|
||||
rabit::Allreduce<rabit::op::Sum>(dat, 2);
|
||||
collective::Allreduce<collective::Operation::kSum>(dat, 2);
|
||||
return Derived::GetFinal(dat[0], dat[1]);
|
||||
}
|
||||
/*!
|
||||
|
||||
@@ -20,17 +20,17 @@
|
||||
// corresponding headers that bring in those function declarations can't be included with CUDA).
|
||||
// This prevents the CPU and GPU logic from coexisting inside a .cu file
|
||||
|
||||
#include <rabit/rabit.h>
|
||||
#include <xgboost/metric.h>
|
||||
#include <dmlc/registry.h>
|
||||
#include <cmath>
|
||||
#include <xgboost/metric.h>
|
||||
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
#include "xgboost/host_device_vector.h"
|
||||
#include "../collective/communicator-inl.h"
|
||||
#include "../common/math.h"
|
||||
#include "../common/threading_utils.h"
|
||||
#include "metric_common.h"
|
||||
#include "xgboost/host_device_vector.h"
|
||||
|
||||
namespace {
|
||||
|
||||
@@ -103,7 +103,7 @@ struct EvalAMS : public Metric {
|
||||
}
|
||||
|
||||
double Eval(const HostDeviceVector<bst_float>& preds, const MetaInfo& info) override {
|
||||
CHECK(!rabit::IsDistributed()) << "metric AMS do not support distributed evaluation";
|
||||
CHECK(!collective::IsDistributed()) << "metric AMS do not support distributed evaluation";
|
||||
using namespace std; // NOLINT(*)
|
||||
|
||||
const auto ndata = static_cast<bst_omp_uint>(info.labels.Size());
|
||||
@@ -216,10 +216,10 @@ struct EvalRank : public Metric, public EvalRankConfig {
|
||||
exc.Rethrow();
|
||||
}
|
||||
|
||||
if (rabit::IsDistributed()) {
|
||||
if (collective::IsDistributed()) {
|
||||
double dat[2]{sum_metric, static_cast<double>(ngroups)};
|
||||
// approximately estimate the metric using mean
|
||||
rabit::Allreduce<rabit::op::Sum>(dat, 2);
|
||||
collective::Allreduce<collective::Operation::kSum>(dat, 2);
|
||||
return dat[0] / dat[1];
|
||||
} else {
|
||||
return sum_metric / ngroups;
|
||||
@@ -341,7 +341,7 @@ struct EvalCox : public Metric {
|
||||
public:
|
||||
EvalCox() = default;
|
||||
double Eval(const HostDeviceVector<bst_float>& preds, const MetaInfo& info) override {
|
||||
CHECK(!rabit::IsDistributed()) << "Cox metric does not support distributed evaluation";
|
||||
CHECK(!collective::IsDistributed()) << "Cox metric does not support distributed evaluation";
|
||||
using namespace std; // NOLINT(*)
|
||||
|
||||
const auto ndata = static_cast<bst_omp_uint>(info.labels.Size());
|
||||
|
||||
@@ -4,15 +4,12 @@
|
||||
* \brief prediction rank based metrics.
|
||||
* \author Kailong Chen, Tianqi Chen
|
||||
*/
|
||||
#include <rabit/rabit.h>
|
||||
#include <dmlc/registry.h>
|
||||
|
||||
#include <xgboost/metric.h>
|
||||
#include <xgboost/host_device_vector.h>
|
||||
#include <thrust/iterator/discard_iterator.h>
|
||||
|
||||
#include <cmath>
|
||||
#include <array>
|
||||
#include <vector>
|
||||
|
||||
#include "metric_common.h"
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
* \author Avinash Barnwal, Hyunsu Cho and Toby Hocking
|
||||
*/
|
||||
|
||||
#include <rabit/rabit.h>
|
||||
#include <dmlc/registry.h>
|
||||
|
||||
#include <memory>
|
||||
@@ -16,6 +15,7 @@
|
||||
#include "xgboost/host_device_vector.h"
|
||||
|
||||
#include "metric_common.h"
|
||||
#include "../collective/communicator-inl.h"
|
||||
#include "../common/math.h"
|
||||
#include "../common/survival_util.h"
|
||||
#include "../common/threading_utils.h"
|
||||
@@ -214,7 +214,7 @@ template <typename Policy> struct EvalEWiseSurvivalBase : public Metric {
|
||||
info.labels_upper_bound_, preds);
|
||||
|
||||
double dat[2]{result.Residue(), result.Weights()};
|
||||
rabit::Allreduce<rabit::op::Sum>(dat, 2);
|
||||
collective::Allreduce<collective::Operation::kSum>(dat, 2);
|
||||
return Policy::GetFinal(dat[0], dat[1]);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user