Support column split in GPU evaluate splits (#9511)

This commit is contained in:
Rong Ou
2023-08-23 01:33:43 -07:00
committed by GitHub
parent 8c10af45a0
commit 6103dca0bb
11 changed files with 240 additions and 113 deletions

View File

@@ -57,6 +57,20 @@ inline void AllReduce(int device, double *send_receive_buffer, size_t count) {
Communicator::GetDevice(device)->AllReduce(send_receive_buffer, count, DataType::kDouble, op);
}
/**
 * @brief Gather values from all processes.
 *
 * This assumes all ranks have the same size.
 *
 * @param device ID of the device.
 * @param send_buffer Buffer storing the data to be sent.
 * @param receive_buffer Buffer storing the gathered data.
 * @param send_size Size of the sent data in bytes.
 */
inline void AllGather(int device, void const *send_buffer, void *receive_buffer,
std::size_t send_size) {
// Thin forwarding wrapper: look up the per-device communicator and delegate.
Communicator::GetDevice(device)->AllGather(send_buffer, receive_buffer, send_size);
}
/**
* @brief Gather variable-length values from all processes.
* @param device ID of the device.

View File

@@ -27,6 +27,17 @@ class DeviceCommunicator {
virtual void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
Operation op) = 0;
/**
 * @brief Gather values from all processes.
 *
 * This assumes all ranks have the same size.
 *
 * @param send_buffer Buffer storing the data to be sent.
 * @param receive_buffer Buffer storing the gathered data; must hold send_size * world_size bytes.
 * @param send_size Size of the sent data in bytes.
 */
virtual void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) = 0;
/**
* @brief Gather variable-length values from all processes.
* @param send_buffer Buffer storing the input data.

View File

@@ -28,12 +28,26 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
dh::safe_cuda(cudaSetDevice(device_ordinal_));
auto size = count * GetTypeSize(data_type);
host_buffer_.reserve(size);
host_buffer_.resize(size);
dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault));
Allreduce(host_buffer_.data(), count, data_type, op);
dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault));
}
// Fixed-size all-gather implemented by staging through a host buffer, since the
// underlying (rabit-style) Allgather operates on host memory.
void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) override {
// Single worker: result equals input; nothing to exchange.
// NOTE(review): receive_buffer is left untouched in this case — callers presumably
// pass send data already in place or handle world_size == 1 themselves; confirm.
if (world_size_ == 1) {
return;
}
dh::safe_cuda(cudaSetDevice(device_ordinal_));
// Host staging area sized for every rank's contribution (same size per rank).
host_buffer_.resize(send_size * world_size_);
// Copy this rank's chunk into its slot of the staging buffer; the in-place
// gather below fills in the other ranks' slots.
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
cudaMemcpyDefault));
Allgather(host_buffer_.data(), host_buffer_.size());
// Copy the fully gathered buffer back to the caller's (device) memory.
dh::safe_cuda(
cudaMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), cudaMemcpyDefault));
}
void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
dh::caching_device_vector<char> *receive_buffer) override {
if (world_size_ == 1) {
@@ -49,7 +63,7 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
receive_buffer->resize(total_bytes);
host_buffer_.reserve(total_bytes);
host_buffer_.resize(total_bytes);
size_t offset = 0;
for (int32_t i = 0; i < world_size_; ++i) {
size_t as_bytes = segments->at(i);

View File

@@ -178,6 +178,17 @@ void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t co
allreduce_calls_ += 1;
}
// Fixed-size all-gather via NCCL. Each rank contributes send_size bytes;
// receive_buffer must hold send_size * world_size bytes, ordered by rank.
void NcclDeviceCommunicator::AllGather(void const *send_buffer, void *receive_buffer,
std::size_t send_size) {
// Single worker: nothing to exchange.
// NOTE(review): receive_buffer is not populated from send_buffer here — confirm
// callers handle the world_size == 1 case.
if (world_size_ == 1) {
return;
}
dh::safe_cuda(cudaSetDevice(device_ordinal_));
// Byte-wise gather (ncclInt8 with send_size as the element count), enqueued on
// the default stream; completion is asynchronous with respect to the host.
dh::safe_nccl(ncclAllGather(send_buffer, receive_buffer, send_size, ncclInt8, nccl_comm_,
dh::DefaultStream()));
}
void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes,
std::vector<std::size_t> *segments,
dh::caching_device_vector<char> *receive_buffer) {

View File

@@ -29,6 +29,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
~NcclDeviceCommunicator() override;
void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
Operation op) override;
void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) override;
void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
dh::caching_device_vector<char> *receive_buffer) override;
void Synchronize() override;

View File

@@ -5,8 +5,8 @@
#include <vector>
#include <limits>
#include "../../collective/communicator-inl.cuh"
#include "../../common/categorical.h"
#include "../../common/device_helpers.cuh"
#include "../../data/ellpack_page.cuh"
#include "evaluate_splits.cuh"
#include "expand_entry.cuh"
@@ -409,6 +409,23 @@ void GPUHistEvaluator::EvaluateSplits(
this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs,
evaluator, out_splits);
if (is_column_split_) {
// With column-wise data split, we gather the split candidates from all the workers and find the
// global best candidates.
auto const world_size = collective::GetWorldSize();
dh::TemporaryArray<DeviceSplitCandidate> all_candidate_storage(out_splits.size() * world_size);
auto all_candidates = dh::ToSpan(all_candidate_storage);
collective::AllGather(device_, out_splits.data(), all_candidates.data(),
out_splits.size() * sizeof(DeviceSplitCandidate));
// Reduce to get the best candidate from all workers.
dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) {
for (auto rank = 0; rank < world_size; rank++) {
out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
}
});
}
auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
auto d_entries = out_entries;
auto device_cats_accessor = this->DeviceCatStorage(nidx);

View File

@@ -83,6 +83,9 @@ class GPUHistEvaluator {
// Number of elements of categorical storage type
// needed to hold categoricals for a single mode
std::size_t node_categorical_storage_size_ = 0;
// Is the data split column-wise?
bool is_column_split_ = false;
int32_t device_;
// Copy the categories from device to host asynchronously.
void CopyToHost( const std::vector<bst_node_t>& nidx);
@@ -136,7 +139,8 @@ class GPUHistEvaluator {
* \brief Reset the evaluator, should be called before any use.
*/
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param, int32_t device);
bst_feature_t n_features, TrainParam const &param, bool is_column_split,
int32_t device);
/**
* \brief Get host category storage for nidx. Different from the internal version, this

View File

@@ -14,10 +14,9 @@
namespace xgboost {
namespace tree {
void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts,
common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param,
int32_t device) {
void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param,
bool is_column_split, int32_t device) {
param_ = param;
tree_evaluator_ = TreeEvaluator{param, n_features, device};
has_categoricals_ = cuts.HasCategorical();
@@ -65,6 +64,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts,
return fidx;
});
}
is_column_split_ = is_column_split;
device_ = device;
}
common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(

View File

@@ -242,7 +242,8 @@ struct GPUHistMakerDevice {
page = sample.page;
gpair = sample.gpair;
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, ctx_->gpu_id);
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
dmat->Info().IsColumnSplit(), ctx_->gpu_id);
quantiser.reset(new GradientQuantiser(this->gpair));