[coll] Pass context to various functions. (#9772)

* [coll] Pass context to various functions. In the future, the `Context` object will be required for collective operations; this PR passes the context object to some of the functions that will need it, in preparation for swapping out the implementation.
2023-11-08 09:54:05 +08:00
parent 6c0a190f6d
commit 06bdc15e9b
45 changed files with 275 additions and 255 deletions
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -395,11 +395,11 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
  }
 }

-void GPUHistEvaluator::EvaluateSplits(
-    const std::vector<bst_node_t> &nidx, bst_feature_t max_active_features,
-    common::Span<const EvaluateSplitInputs> d_inputs,
-    EvaluateSplitSharedInputs shared_inputs,
-    common::Span<GPUExpandEntry> out_entries) {
+void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector<bst_node_t> &nidx,
+                                      bst_feature_t max_active_features,
+                                      common::Span<const EvaluateSplitInputs> d_inputs,
+                                      EvaluateSplitSharedInputs shared_inputs,
+                                      common::Span<GPUExpandEntry> out_entries) {
  auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();

  dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
@@ -417,19 +417,20 @@ void GPUHistEvaluator::EvaluateSplits(
                          out_splits.size() * sizeof(DeviceSplitCandidate));

    // Reduce to get the best candidate from all workers.
-    dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) {
-      out_splits[i] = all_candidates[i];
-      for (auto rank = 1; rank < world_size; rank++) {
-        out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
-      }
-    });
+    dh::LaunchN(out_splits.size(), ctx->CUDACtx()->Stream(),
+                [world_size, all_candidates, out_splits] __device__(size_t i) {
+                  out_splits[i] = all_candidates[i];
+                  for (auto rank = 1; rank < world_size; rank++) {
+                    out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
+                  }
+                });
  }

  auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
  auto d_entries = out_entries;
  auto device_cats_accessor = this->DeviceCatStorage(nidx);
  // turn candidate into entry, along with handling sort based split.
-  dh::LaunchN(d_inputs.size(), [=] __device__(size_t i) mutable {
+  dh::LaunchN(d_inputs.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t i) mutable {
    auto const input = d_inputs[i];
    auto &split = out_splits[i];
    // Subtract parent gain here
@@ -464,12 +465,12 @@ void GPUHistEvaluator::EvaluateSplits(
  this->CopyToHost(nidx);
 }

-GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
-    EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs) {
+GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
+                                                     EvaluateSplitSharedInputs shared_inputs) {
  dh::device_vector<EvaluateSplitInputs> inputs = std::vector<EvaluateSplitInputs>{input};
  dh::TemporaryArray<GPUExpandEntry> out_entries(1);
-  this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
-                       dh::ToSpan(out_entries));
+  this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs),
+                       shared_inputs, dh::ToSpan(out_entries));
  GPUExpandEntry root_entry;
  dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
                                cudaMemcpyDeviceToHost));
--- a/src/tree/gpu_hist/evaluate_splits.cuh
+++ b/src/tree/gpu_hist/evaluate_splits.cuh
@@ -193,7 +193,7 @@ class GPUHistEvaluator {
  /**
   * \brief Evaluate splits for left and right nodes.
   */
-  void EvaluateSplits(const std::vector<bst_node_t> &nidx,
+  void EvaluateSplits(Context const* ctx, const std::vector<bst_node_t> &nidx,
                      bst_feature_t max_active_features,
                      common::Span<const EvaluateSplitInputs> d_inputs,
                      EvaluateSplitSharedInputs shared_inputs,
@@ -201,7 +201,7 @@ class GPUHistEvaluator {
  /**
   * \brief Evaluate splits for root node.
   */
-  GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs input,
+  GPUExpandEntry EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
                                     EvaluateSplitSharedInputs shared_inputs);
 };
 }  // namespace tree
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -16,8 +16,7 @@
 #include "row_partitioner.cuh"
 #include "xgboost/base.h"

-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 namespace {
 struct Pair {
  GradientPair first;
@@ -53,7 +52,8 @@ struct Clip : public thrust::unary_function<GradientPair, Pair> {
 *
 * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
 */
-GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info) {
+GradientQuantiser::GradientQuantiser(Context const*, common::Span<GradientPair const> gpair,
+                                     MetaInfo const& info) {
  using GradientSumT = GradientPairPrecise;
  using T = typename GradientSumT::ValueT;
  dh::XGBCachingDeviceAllocator<char> alloc;
@@ -99,7 +99,6 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair, Met
                                 static_cast<T>(1) / to_floating_point_.GetHess());
 }

-
 XGBOOST_DEV_INLINE void
 AtomicAddGpairShared(xgboost::GradientPairInt64 *dest,
               xgboost::GradientPairInt64 const &gpair) {
@@ -314,6 +313,4 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&

  dh::safe_cuda(cudaGetLastError());
 }
-
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
--- a/src/tree/gpu_hist/histogram.cuh
+++ b/src/tree/gpu_hist/histogram.cuh
@@ -39,18 +39,20 @@ private:
  GradientPairPrecise to_floating_point_;

 public:
-  GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info);
-  XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
+  GradientQuantiser(Context const* ctx, common::Span<GradientPair const> gpair, MetaInfo const& info);
+  [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
    auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
-                               gpair.GetHess() * to_fixed_point_.GetHess());
+                                      gpair.GetHess() * to_fixed_point_.GetHess());
    return adjusted;
  }
-  XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPairPrecise const& gpair) const {
+  [[nodiscard]] XGBOOST_DEVICE GradientPairInt64
+  ToFixedPoint(GradientPairPrecise const& gpair) const {
    auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
-                               gpair.GetHess() * to_fixed_point_.GetHess());
+                                      gpair.GetHess() * to_fixed_point_.GetHess());
    return adjusted;
  }
-  XGBOOST_DEVICE GradientPairPrecise ToFloatingPoint(const GradientPairInt64&gpair) const {
+  [[nodiscard]] XGBOOST_DEVICE GradientPairPrecise
+  ToFloatingPoint(const GradientPairInt64& gpair) const {
    auto g = gpair.GetQuantisedGrad() * to_floating_point_.GetGrad();
    auto h = gpair.GetQuantisedHess() * to_floating_point_.GetHess();
    return {g,h};