[Breaking] Switch from rabit to the collective communicator (#8257)
* Switch from rabit to the collective communicator
* fix size_t specialization
* really fix size_t
* try again
* add include
* more include
* fix lint errors
* remove rabit includes
* fix pylint error
* return dict from communicator context
* fix communicator shutdown
* fix dask test
* reset communicator mocklist
* fix distributed tests
* do not save device communicator
* fix jvm gpu tests
* add python test for federated communicator
* Update gputreeshap submodule

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
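For orientation, the shape of the `DeviceCommunicator` interface that `NcclDeviceCommunicator` overrides can be read off the hunks below. The following is a minimal sketch reconstructed from the diff, not the actual header; `AllGatherV` is omitted here because its signature depends on `dh::caching_device_vector`:

```cpp
#include <cstddef>
#include <cstdint>

// Sketch only: reconstructed from the overrides visible in this diff.
class DeviceCommunicator {
 public:
  virtual ~DeviceCommunicator() = default;
  // In-place element-wise sum across all workers, on device memory.
  virtual void AllReduceSum(float *send_receive_buffer, std::size_t count) = 0;
  virtual void AllReduceSum(double *send_receive_buffer, std::size_t count) = 0;
  virtual void AllReduceSum(int64_t *send_receive_buffer, std::size_t count) = 0;
  virtual void AllReduceSum(uint64_t *send_receive_buffer, std::size_t count) = 0;
  // Blocks until pending collective work on this device has finished.
  virtual void Synchronize() = 0;
};
```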
@@ -24,6 +24,10 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
     int32_t const rank = communicator_->GetRank();
     int32_t const world = communicator_->GetWorldSize();
 
+    if (world == 1) {
+      return;
+    }
+
     std::vector<uint64_t> uuids(world * kUuidLength, 0);
     auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
     auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength);
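The hunk above carves a per-rank slot out of one flat UUID buffer, presumably so each worker's device UUID can be gathered and compared against the others. A minimal sketch of the same subspan arithmetic, using plain pointers instead of `xgboost::common::Span` (the `kUuidLength` value and the UUID contents here are stand-ins, not values from the source):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  constexpr int kUuidLength = 2;  // stand-in; the real constant lives in the class
  int const world = 4;            // number of workers
  int const rank = 2;             // this worker's rank

  // One kUuidLength-sized slot per rank, zero-initialized.
  std::vector<uint64_t> uuids(world * kUuidLength, 0);

  // Equivalent of s_uuid.subspan(rank * kUuidLength, kUuidLength):
  uint64_t *this_uuid = uuids.data() + rank * kUuidLength;

  // Each rank fills only its own slot; an all-gather then makes every
  // rank's UUID visible to every worker.
  this_uuid[0] = 0xDEADBEEF;
  this_uuid[1] = static_cast<uint64_t>(rank);

  std::printf("slot for rank %d starts at index %d\n", rank, rank * kUuidLength);
}
```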
@@ -52,8 +56,15 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
   }
 
   ~NcclDeviceCommunicator() override {
-    dh::safe_cuda(cudaStreamDestroy(cuda_stream_));
-    ncclCommDestroy(nccl_comm_);
+    if (communicator_->GetWorldSize() == 1) {
+      return;
+    }
+    if (cuda_stream_) {
+      dh::safe_cuda(cudaStreamDestroy(cuda_stream_));
+    }
+    if (nccl_comm_) {
+      dh::safe_nccl(ncclCommDestroy(nccl_comm_));
+    }
     if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
       LOG(CONSOLE) << "======== NCCL Statistics========";
       LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
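The destructor change above replaces unconditional teardown with guarded teardown: nothing is released when the world size is 1 (the NCCL resources were never created), and each handle is checked before destruction. A minimal sketch of the same pattern with dummy handles (the `Handle` type and `Comm` class are illustrative, not from the source):

```cpp
#include <cstdio>

struct Handle { /* stands in for cudaStream_t / ncclComm_t */ };

class Comm {
 public:
  explicit Comm(int world_size) : world_size_{world_size} {
    if (world_size_ == 1) return;  // single worker: never allocate
    stream_ = new Handle{};
    comm_ = new Handle{};
  }

  ~Comm() {
    if (world_size_ == 1) return;  // mirror of the early return in the diff
    if (stream_) {                 // guard: only destroy what actually exists
      delete stream_;
      std::puts("stream destroyed");
    }
    if (comm_) {
      delete comm_;
      std::puts("comm destroyed");
    }
  }

 private:
  int world_size_;
  Handle *stream_{nullptr};
  Handle *comm_{nullptr};
};

int main() {
  Comm single{1};  // destructor releases nothing
  Comm multi{4};   // destructor releases both handles
}
```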
@@ -61,16 +72,28 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
     }
   }
 
-  void AllReduceSum(double *send_receive_buffer, int count) override {
-    dh::safe_cuda(cudaSetDevice(device_ordinal_));
-    dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count, ncclDouble,
-                                ncclSum, nccl_comm_, cuda_stream_));
-    allreduce_bytes_ += count * sizeof(double);
-    allreduce_calls_ += 1;
-  }
+  void AllReduceSum(float *send_receive_buffer, size_t count) override {
+    DoAllReduceSum<ncclFloat>(send_receive_buffer, count);
+  }
+
+  void AllReduceSum(double *send_receive_buffer, size_t count) override {
+    DoAllReduceSum<ncclDouble>(send_receive_buffer, count);
+  }
+
+  void AllReduceSum(int64_t *send_receive_buffer, size_t count) override {
+    DoAllReduceSum<ncclInt64>(send_receive_buffer, count);
+  }
+
+  void AllReduceSum(uint64_t *send_receive_buffer, size_t count) override {
+    DoAllReduceSum<ncclUint64>(send_receive_buffer, count);
+  }
 
   void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
                   dh::caching_device_vector<char> *receive_buffer) override {
+    if (communicator_->GetWorldSize() == 1) {
+      return;
+    }
+
     dh::safe_cuda(cudaSetDevice(device_ordinal_));
     int const world_size = communicator_->GetWorldSize();
     int const rank = communicator_->GetRank();
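Note the breaking signature change in this hunk: `count` goes from `int` to `size_t`, so downstream implementations of `DeviceCommunicator` must update their overrides, and callers can pass container sizes without narrowing. A hypothetical call site (the `SumGradients` helper is illustrative, not from the source):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Minimal stand-in for the interface; see the sketch near the top of this page.
struct DeviceCommunicator {
  virtual ~DeviceCommunicator() = default;
  virtual void AllReduceSum(double *send_receive_buffer, std::size_t count) = 0;
};

// With `size_t count`, a vector's size flows through unmodified; the old
// `int count` signature forced a narrowing static_cast<int>(grad->size()).
void SumGradients(DeviceCommunicator *comm, std::vector<double> *grad) {
  comm->AllReduceSum(grad->data(), grad->size());
}
```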
@@ -95,6 +118,9 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
   }
 
   void Synchronize() override {
+    if (communicator_->GetWorldSize() == 1) {
+      return;
+    }
     dh::safe_cuda(cudaSetDevice(device_ordinal_));
     dh::safe_cuda(cudaStreamSynchronize(cuda_stream_));
   }
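The same single-worker guard recurs in every collective entry point of this diff: with a world size of 1 there are no peers to wait for, and the NCCL state may not even have been created, so each method becomes a no-op. A tiny sketch of the pattern outside any CUDA context (the function and flag are illustrative):

```cpp
#include <cstdio>

// Single-worker fast path: bail out before touching any communicator state,
// so the same call sites work with or without distributed training.
void Synchronize(int world_size, bool *stream_synced) {
  if (world_size == 1) {
    return;  // no peers, nothing to wait for
  }
  *stream_synced = true;  // stands in for cudaStreamSynchronize(cuda_stream_)
}

int main() {
  bool synced = false;
  Synchronize(/*world_size=*/1, &synced);
  std::printf("single worker synced: %d\n", synced);  // 0: early return taken
  Synchronize(/*world_size=*/4, &synced);
  std::printf("multi worker synced: %d\n", synced);   // 1: real synchronization
}
```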
@@ -136,6 +162,19 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
     return id;
   }
 
+  template <ncclDataType_t data_type, typename T>
+  void DoAllReduceSum(T *send_receive_buffer, size_t count) {
+    if (communicator_->GetWorldSize() == 1) {
+      return;
+    }
+
+    dh::safe_cuda(cudaSetDevice(device_ordinal_));
+    dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count, data_type, ncclSum,
+                                nccl_comm_, cuda_stream_));
+    allreduce_bytes_ += count * sizeof(T);
+    allreduce_calls_ += 1;
+  }
+
   int const device_ordinal_;
   Communicator *communicator_;
   ncclComm_t nccl_comm_{};
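The helper above pins the NCCL dtype as a non-type template parameter so the four public overloads share one body, including the `allreduce_bytes_` accounting via `sizeof(T)`. A self-contained mock of the same dispatch pattern, with a stand-in enum instead of `ncclDataType_t` and a counter instead of a real collective:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Stand-in for ncclDataType_t.
enum class DataType { kFloat, kDouble, kInt64, kUint64 };

class MockCommunicator {
 public:
  // Each overload picks the enum value at compile time; one shared body,
  // mirroring the DoAllReduceSum helper in the diff above.
  void AllReduceSum(float *buf, std::size_t n) { Do<DataType::kFloat>(buf, n); }
  void AllReduceSum(double *buf, std::size_t n) { Do<DataType::kDouble>(buf, n); }
  void AllReduceSum(int64_t *buf, std::size_t n) { Do<DataType::kInt64>(buf, n); }
  void AllReduceSum(uint64_t *buf, std::size_t n) { Do<DataType::kUint64>(buf, n); }

  std::size_t bytes() const { return bytes_; }
  int calls() const { return calls_; }

 private:
  template <DataType data_type, typename T>
  void Do(T *buf, std::size_t count) {
    (void)buf;                    // a real implementation launches the collective here
    bytes_ += count * sizeof(T);  // same accounting as allreduce_bytes_
    calls_ += 1;                  // same accounting as allreduce_calls_
  }

  std::size_t bytes_{0};
  int calls_{0};
};

int main() {
  MockCommunicator comm;
  double grad[8] = {0};
  comm.AllReduceSum(grad, 8);
  std::printf("calls=%d bytes=%zu\n", comm.calls(), comm.bytes());  // calls=1 bytes=64
}
```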