sync Jun 1

2023-06-01 15:55:06 -07:00
parent c5b575e00e fa2ab1f021
commit 42867a4805
76 changed files with 1424 additions and 595 deletions
--- a/src/collective/communicator-inl.h
+++ b/src/collective/communicator-inl.h
@@ -3,6 +3,7 @@
 */
 #pragma once
 #include <string>
+#include <vector>

 #include "communicator.h"

@@ -224,5 +225,69 @@ inline void Allreduce(double *send_receive_buffer, size_t count) {
  Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kDouble, op);
 }

+/**
+ * @brief Result of AllgatherV(): the inputs of every worker, flattened into one buffer.
+ *
+ * Gathered input `i` starts at `result[offsets[i]]` and is `sizes[i]` elements long. Inputs are
+ * ordered by worker rank first, then by their position in that worker's `sizes` argument.
+ */
+template <typename T>
+struct AllgatherVResult {
+  std::vector<std::size_t> offsets;  // Start of each gathered input inside `result`.
+  std::vector<std::size_t> sizes;    // Length of each gathered input.
+  std::vector<T> result;             // All gathered inputs, stored back to back.
+};
+
+/**
+ * @brief Gathers variable-length data from all processes and distributes it to all processes.
+ *
+ * We assume each worker has the same number of inputs, but each input may be of a different size.
+ *
+ * NOTE(review): the values are exchanged by writing the local inputs into a value-initialized
+ * buffer and all-reducing it with `Operation::kMax`. Presumably that is an element-wise maximum,
+ * in which case elements that compare less than `T{}` (e.g. negative numbers) are not preserved.
+ *
+ * @tparam T Element type of the inputs.
+ *
+ * @param inputs All the inputs from the local worker.
+ * @param sizes  Sizes of each input.
+ *
+ * @return Offsets, sizes and values of the inputs of all workers. All three vectors are empty
+ *         when `sizes` is empty.
+ */
+template <typename T>
+inline AllgatherVResult<T> AllgatherV(std::vector<T> const &inputs,
+                                      std::vector<std::size_t> const &sizes) {
+  // Built in place and returned by name, so the gathered vectors are never copied.
+  AllgatherVResult<T> out;
+  auto const num_inputs = sizes.size();
+  if (num_inputs == 0) {
+    // Nothing to gather, and back() on the empty vectors below would be undefined behaviour.
+    // Every worker has the same number of inputs, so all of them skip the collective calls.
+    return out;
+  }
+  // Index of the first local input among the gathered inputs.
+  auto const local_begin = num_inputs * GetRank();
+
+  // Gather the sizes across all workers.
+  out.sizes.resize(num_inputs * GetWorldSize());
+  std::copy_n(sizes.cbegin(), num_inputs, out.sizes.begin() + local_begin);
+  collective::Allgather(out.sizes.data(), out.sizes.size() * sizeof(std::size_t));
+
+  // Calculate input offsets (std::exclusive_scan).
+  out.offsets.resize(out.sizes.size());
+  for (std::size_t i = 1; i < out.offsets.size(); i++) {
+    out.offsets[i] = out.offsets[i - 1] + out.sizes[i - 1];
+  }
+
+  // Gather all the inputs.
+  out.result.resize(out.offsets.back() + out.sizes.back());
+  std::copy_n(inputs.cbegin(), inputs.size(), out.result.begin() + out.offsets[local_begin]);
+  // We cannot use allgather here, since each worker might have a different size.
+  Allreduce<Operation::kMax>(out.result.data(), out.result.size());
+
+  return out;
+}
+
 }  // namespace collective
 }  // namespace xgboost
--- a/src/collective/communicator.cu
+++ b/src/collective/communicator.cu
@@ -12,19 +12,22 @@
 namespace xgboost {
 namespace collective {

-thread_local int Communicator::device_ordinal_{-1};
 thread_local std::unique_ptr<DeviceCommunicator> Communicator::device_communicator_{};

 void Communicator::Finalize() {
  communicator_->Shutdown();
  communicator_.reset(new NoOpCommunicator());
-  device_ordinal_ = -1;
  device_communicator_.reset(nullptr);
 }

 DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
-  if (!device_communicator_ || device_ordinal_ != device_ordinal) {
-    device_ordinal_ = device_ordinal;
+  thread_local auto old_device_ordinal = -1;
+  // If the number of GPUs changes, we need to re-initialize NCCL.
+  thread_local auto old_world_size = -1;
+  if (!device_communicator_ || device_ordinal != old_device_ordinal ||
+      communicator_->GetWorldSize() != old_world_size) {
+    old_device_ordinal = device_ordinal;
+    old_world_size = communicator_->GetWorldSize();
 #if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
    if (type_ != CommunicatorType::kFederated) {
      device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, Get()));
--- a/src/collective/communicator.h
+++ b/src/collective/communicator.h
@@ -229,7 +229,6 @@ class Communicator {
  static thread_local std::unique_ptr<Communicator> communicator_;
  static thread_local CommunicatorType type_;
 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-  static thread_local int device_ordinal_;
  static thread_local std::unique_ptr<DeviceCommunicator> device_communicator_;
 #endif