[Breaking] Switch from rabit to the collective communicator (#8257)

* Switch from rabit to the collective communicator

* fix size_t specialization

* really fix size_t

* try again

* add include

* more include

* fix lint errors

* remove rabit includes

* fix pylint error

* return dict from communicator context

* fix communicator shutdown

* fix dask test

* reset communicator mocklist

* fix distributed tests

* do not save device communicator

* fix jvm gpu tests

* add python test for federated communicator

* Update gputreeshap submodule

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Rong Ou
2022-10-05 15:39:01 -07:00
committed by GitHub
parent e47b3a3da3
commit 668b8a0ea4
79 changed files with 805 additions and 2212 deletions

View File

@@ -4,12 +4,14 @@
* \brief synchronize the tree in all distributed nodes
*/
#include <xgboost/tree_updater.h>
#include <vector>
#include <string>
#include <limits>
#include "xgboost/json.h"
#include <limits>
#include <string>
#include <vector>
#include "../collective/communicator-inl.h"
#include "../common/io.h"
#include "xgboost/json.h"
namespace xgboost {
namespace tree {
@@ -35,17 +37,17 @@ class TreeSyncher : public TreeUpdater {
void Update(HostDeviceVector<GradientPair>*, DMatrix*,
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
const std::vector<RegTree*>& trees) override {
if (rabit::GetWorldSize() == 1) return;
if (collective::GetWorldSize() == 1) return;
std::string s_model;
common::MemoryBufferStream fs(&s_model);
int rank = rabit::GetRank();
int rank = collective::GetRank();
if (rank == 0) {
for (auto tree : trees) {
tree->Save(&fs);
}
}
fs.Seek(0);
rabit::Broadcast(&s_model, 0);
collective::Broadcast(&s_model, 0);
for (auto tree : trees) {
tree->Load(&fs);
}