[Breaking] Switch from rabit to the collective communicator (#8257)
* Switch from rabit to the collective communicator
* fix size_t specialization
* really fix size_t
* try again
* add include
* more include
* fix lint errors
* remove rabit includes
* fix pylint error
* return dict from communicator context
* fix communicator shutdown
* fix dask test
* reset communicator mocklist
* fix distributed tests
* do not save device communicator
* fix jvm gpu tests
* add python test for federated communicator
* Update gputreeshap submodule

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
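The change is mechanical throughout the diff below: rabit::GetRank(), rabit::GetWorldSize(), and rabit::IsDistributed() become collective::GetRank(), collective::GetWorldSize(), and collective::IsDistributed(); the <rabit/rabit.h> include is replaced by "../collective/communicator-inl.h"; and the allreduce reduction moves from a tag type (rabit::op::Max) to an enum value (collective::Operation::kMax). A minimal sketch of the new call shape follows; the wrapper function is illustrative only and not part of the commit:

    #include "../collective/communicator-inl.h"  // replaces <rabit/rabit.h>

    namespace xgboost {
    // Illustrative helper (hypothetical): make every worker agree on the
    // global column count by taking the maximum across the cluster.
    void SyncNumColumns(uint64_t* num_col) {
      // Old: rabit::Allreduce<rabit::op::Max>(num_col, 1);
      collective::Allreduce<collective::Operation::kMax>(num_col, 1);
    }
    }  // namespace xgboost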
@@ -2,36 +2,36 @@
  * Copyright 2015-2022 by XGBoost Contributors
  * \file data.cc
  */
+#include "xgboost/data.h"
+
 #include <dmlc/registry.h>
 
 #include <array>
 #include <cstring>
 
-#include "dmlc/io.h"
-#include "xgboost/data.h"
-#include "xgboost/c_api.h"
-#include "xgboost/host_device_vector.h"
-#include "xgboost/logging.h"
-#include "xgboost/version_config.h"
-#include "xgboost/learner.h"
-#include "xgboost/string_view.h"
-
-#include "sparse_page_writer.h"
-#include "simple_dmatrix.h"
-
+#include "../collective/communicator-inl.h"
 #include "../common/group_data.h"
 #include "../common/io.h"
 #include "../common/linalg_op.h"
 #include "../common/math.h"
 #include "../common/numeric.h"
-#include "../common/version.h"
-#include "../common/group_data.h"
 #include "../common/threading_utils.h"
+#include "../common/version.h"
 #include "../data/adapter.h"
 #include "../data/iterative_dmatrix.h"
-#include "file_iterator.h"
-
-#include "validation.h"
-#include "./sparse_page_source.h"
 #include "./sparse_page_dmatrix.h"
+#include "./sparse_page_source.h"
+#include "dmlc/io.h"
+#include "file_iterator.h"
+#include "simple_dmatrix.h"
+#include "sparse_page_writer.h"
+#include "validation.h"
+#include "xgboost/c_api.h"
+#include "xgboost/host_device_vector.h"
+#include "xgboost/learner.h"
+#include "xgboost/logging.h"
+#include "xgboost/string_view.h"
+#include "xgboost/version_config.h"
 
 namespace dmlc {
 DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SparsePage>);
@@ -793,12 +793,12 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, bool load_row_split,
       size_t pos = cache_shards[i].rfind('.');
       if (pos == std::string::npos) {
         os << cache_shards[i]
-           << ".r" << rabit::GetRank()
-           << "-" << rabit::GetWorldSize();
+           << ".r" << collective::GetRank()
+           << "-" << collective::GetWorldSize();
       } else {
         os << cache_shards[i].substr(0, pos)
-           << ".r" << rabit::GetRank()
-           << "-" << rabit::GetWorldSize()
+           << ".r" << collective::GetRank()
+           << "-" << collective::GetWorldSize()
            << cache_shards[i].substr(pos, cache_shards[i].length());
       }
       if (i + 1 != cache_shards.size()) {
@@ -821,8 +821,8 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, bool load_row_split,
 
   int partid = 0, npart = 1;
   if (load_row_split) {
-    partid = rabit::GetRank();
-    npart = rabit::GetWorldSize();
+    partid = collective::GetRank();
+    npart = collective::GetWorldSize();
   } else {
     // test option to load in part
     npart = 1;
@@ -877,7 +877,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, bool load_row_split,
   /* sync up number of features after matrix loaded.
    * partitioned data will fail the train/val validation check
    * since partitioned data not knowing the real number of features. */
-  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
+  collective::Allreduce<collective::Operation::kMax>(&dmat->Info().num_col_, 1);
   return dmat;
 }
 
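For clarity, the shard-naming logic touched in the first DMatrix::Load hunk above can be read standalone. This sketch is a hypothetical helper, with hard-coded rank and world size standing in for collective::GetRank() and collective::GetWorldSize():

    #include <iostream>
    #include <string>

    // Build the per-worker cache shard name: the ".r<rank>-<world>" suffix is
    // inserted before the file extension when one exists, appended otherwise.
    std::string ShardName(std::string const& shard, int rank, int world) {
      std::string suffix = ".r" + std::to_string(rank) + "-" + std::to_string(world);
      size_t pos = shard.rfind('.');
      if (pos == std::string::npos) {
        return shard + suffix;  // "cache" -> "cache.r2-4"
      }
      return shard.substr(0, pos) + suffix + shard.substr(pos);  // "cache.bin" -> "cache.r2-4.bin"
    }

    int main() {
      std::cout << ShardName("cache.bin", 2, 4) << "\n";  // cache.r2-4.bin
      std::cout << ShardName("cache", 2, 4) << "\n";      // cache.r2-4
    }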
@@ -3,13 +3,11 @@
  */
 #include "iterative_dmatrix.h"
 
-#include <rabit/rabit.h>
-
 #include <algorithm>  // std::copy
 
+#include "../collective/communicator-inl.h"
 #include "../common/categorical.h"  // common::IsCat
 #include "../common/column_matrix.h"
 #include "../common/hist_util.h"  // common::HistogramCuts
 #include "../tree/param.h"  // FIXME(jiamingy): Find a better way to share this parameter.
 #include "gradient_index.h"
 #include "proxy_dmatrix.h"
@@ -140,7 +138,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
     // We use do while here as the first batch is fetched in ctor
     if (n_features == 0) {
       n_features = num_cols();
-      rabit::Allreduce<rabit::op::Max>(&n_features, 1);
+      collective::Allreduce<collective::Operation::kMax>(&n_features, 1);
       column_sizes.resize(n_features);
       info_.num_col_ = n_features;
     } else {
@@ -157,7 +155,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   // From here on Info() has the correct data shape
   Info().num_row_ = accumulated_rows;
   Info().num_nonzero_ = nnz;
-  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
+  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
   CHECK(std::none_of(column_sizes.cbegin(), column_sizes.cend(), [&](auto f) {
     return f > accumulated_rows;
   })) << "Something went wrong during iteration.";
@@ -62,7 +62,7 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
   dh::safe_cuda(cudaSetDevice(get_device()));
   if (cols == 0) {
     cols = num_cols();
-    rabit::Allreduce<rabit::op::Max>(&cols, 1);
+    collective::Allreduce<collective::Operation::kMax>(&cols, 1);
     this->info_.num_col_ = cols;
   } else {
     CHECK_EQ(cols, num_cols()) << "Inconsistent number of columns.";
@@ -166,7 +166,7 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
 
   iter.Reset();
   // Synchronise worker columns
-  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
+  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
 }
 
 BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(BatchParam const& param) {
@@ -189,7 +189,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
 
 
   // Synchronise worker columns
-  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
+  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
 
   if (adapter->NumRows() == kAdapterUnknownSize) {
     using IteratorAdapterT
@@ -322,7 +322,7 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
   }
   // Synchronise worker columns
   info_.num_col_ = adapter->NumColumns();
-  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
+  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
   info_.num_row_ = total_batch_size;
   info_.num_nonzero_ = data_vec.size();
   CHECK_EQ(offset_vec.back(), info_.num_nonzero_);
@@ -35,7 +35,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread
   info_.num_col_ = adapter->NumColumns();
   info_.num_row_ = adapter->NumRows();
   // Synchronise worker columns
-  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
+  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
 }
 
 template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
@@ -5,6 +5,8 @@
  * \author Tianqi Chen
 */
 #include "./sparse_page_dmatrix.h"
 
+#include "../collective/communicator-inl.h"
 #include "./simple_batch_iterator.h"
 #include "gradient_index.h"
@@ -46,8 +48,8 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
       cache_prefix_{std::move(cache_prefix)} {
   ctx_.nthread = nthreads;
   cache_prefix_ = cache_prefix_.empty() ? "DMatrix" : cache_prefix_;
-  if (rabit::IsDistributed()) {
-    cache_prefix_ += ("-r" + std::to_string(rabit::GetRank()));
+  if (collective::IsDistributed()) {
+    cache_prefix_ += ("-r" + std::to_string(collective::GetRank()));
   }
   DMatrixProxy *proxy = MakeProxy(proxy_);
   auto iter = DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{
@@ -94,7 +96,7 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
   this->info_.num_col_ = n_features;
   this->info_.num_nonzero_ = nnz;
 
-  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
+  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
   CHECK_NE(info_.num_col_, 0);
 }
 
@@ -14,7 +14,6 @@
 #include <map>
 #include <memory>
 
-#include "rabit/rabit.h"
 #include "xgboost/base.h"
 #include "xgboost/data.h"
 