[Breaking] Switch from rabit to the collective communicator (#8257)

* Switch from rabit to the collective communicator

* fix size_t specialization

* really fix size_t

* try again

* add include

* more include

* fix lint errors

* remove rabit includes

* fix pylint error

* return dict from communicator context

* fix communicator shutdown

* fix dask test

* reset communicator mocklist

* fix distributed tests

* do not save device communicator

* fix jvm gpu tests

* add python test for federated communicator

* Update gputreeshap submodule

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
Author: Rong Ou
Date: 2022-10-05 15:39:01 -07:00
Committed by: GitHub
Parent: e47b3a3da3
Commit: 668b8a0ea4
79 changed files with 805 additions and 2212 deletions
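
For orientation, the process-level part of the migration is a one-to-one mapping from rabit calls to their collective counterparts. The sketch below is illustrative only: it uses exactly the identifiers that appear in the diff further down, the enclosing function name is hypothetical, and the required collective header is assumed to be on the include path.

// Minimal sketch of the host-side rabit -> collective mapping (identifiers
// taken from the diff below; the enclosing function is hypothetical).
void HostCollectiveSketch(size_t *n_data) {
  auto world = collective::GetWorldSize();  // was rabit::GetWorldSize()
  auto rank = collective::GetRank();        // was rabit::GetRank()
  // Reduction operators move from rabit::op::* to collective::Operation::k*.
  collective::Allreduce<collective::Operation::kMax>(n_data, 1);  // was rabit::Allreduce<rabit::op::Max>
  collective::Finalize();                   // was rabit::Finalize()
  (void)world;
  (void)rank;
}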


@@ -1,6 +1,7 @@
#include <gtest/gtest.h>
#include "test_quantile.h"
#include "../helpers.h"
#include "../../../src/collective/device_communicator.cuh"
#include "../../../src/common/hist_util.cuh"
#include "../../../src/common/quantile.cuh"
@@ -341,17 +342,14 @@ TEST(GPUQuantile, AllReduceBasic) {
// This test is supposed to run by a python test that setups the environment.
std::string msg {"Skipping AllReduce test"};
auto n_gpus = AllVisibleGPUs();
-InitRabitContext(msg, n_gpus);
-auto world = rabit::GetWorldSize();
+InitCommunicatorContext(msg, n_gpus);
+auto world = collective::GetWorldSize();
if (world != 1) {
ASSERT_EQ(world, n_gpus);
} else {
return;
}
-auto reducer = std::make_shared<dh::AllReducer>();
-reducer->Init(0);
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
// Set up single node version;
@@ -385,8 +383,8 @@ TEST(GPUQuantile, AllReduceBasic) {
// Set up distributed version. We rely on using rank as seed to generate
// the exact same copy of data.
-auto rank = rabit::GetRank();
-SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0, reducer);
+auto rank = collective::GetRank();
+SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
@@ -422,28 +420,26 @@ TEST(GPUQuantile, AllReduceBasic) {
ASSERT_NEAR(single_node_data[i].wmin, distributed_data[i].wmin, Eps);
}
});
-rabit::Finalize();
+collective::Finalize();
}
TEST(GPUQuantile, SameOnAllWorkers) {
std::string msg {"Skipping SameOnAllWorkers test"};
auto n_gpus = AllVisibleGPUs();
-InitRabitContext(msg, n_gpus);
-auto world = rabit::GetWorldSize();
+InitCommunicatorContext(msg, n_gpus);
+auto world = collective::GetWorldSize();
if (world != 1) {
ASSERT_EQ(world, n_gpus);
} else {
return;
}
-auto reducer = std::make_shared<dh::AllReducer>();
-reducer->Init(0);
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
MetaInfo const &info) {
-auto rank = rabit::GetRank();
+auto rank = collective::GetRank();
HostDeviceVector<FeatureType> ft;
-SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0, reducer);
+SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
@@ -459,7 +455,7 @@ TEST(GPUQuantile, SameOnAllWorkers) {
// Test for all workers having the same sketch.
size_t n_data = sketch_distributed.Data().size();
-rabit::Allreduce<rabit::op::Max>(&n_data, 1);
+collective::Allreduce<collective::Operation::kMax>(&n_data, 1);
ASSERT_EQ(n_data, sketch_distributed.Data().size());
size_t size_as_float =
sketch_distributed.Data().size_bytes() / sizeof(float);
@@ -472,9 +468,10 @@ TEST(GPUQuantile, SameOnAllWorkers) {
thrust::copy(thrust::device, local_data.data(),
local_data.data() + local_data.size(),
all_workers.begin() + local_data.size() * rank);
-reducer->AllReduceSum(all_workers.data().get(), all_workers.data().get(),
-                      all_workers.size());
-reducer->Synchronize();
+collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(0);
+communicator->AllReduceSum(all_workers.data().get(), all_workers.size());
+communicator->Synchronize();
auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float);
std::vector<float> h_base_line(base_line.size());
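
For comparison, the device-side replacement shown in the last hunk can be isolated as follows: instead of constructing and initializing a per-test dh::AllReducer, the test asks the collective layer for its device communicator and performs an in-place sum. This is only a sketch; it assumes the device_communicator.cuh include added in the first hunk, and the buffer argument is a placeholder.

#include <thrust/device_vector.h>

// Minimal sketch of the device-side pattern after this change (calls as they
// appear in the hunk above; the vector is a placeholder buffer).
void DeviceAllReduceSketch(thrust::device_vector<float> *all_workers) {
  collective::DeviceCommunicator *communicator = collective::Communicator::GetDevice(0);
  // In-place sum across workers; the old reducer->AllReduceSum took separate
  // source/destination pointers and required an explicit reducer->Init(0).
  communicator->AllReduceSum(all_workers->data().get(), all_workers->size());
  communicator->Synchronize();
}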