[EM] Avoid synchronous calls and unnecessary ATS access. (#10811)

- Pass context into various functions.
- Factor out some CUDA algorithms.
- Use ATS only for updating the position.
This commit is contained in:
Jiaming Yuan
2024-09-10 14:33:14 +08:00
committed by GitHub
parent ed5f33df16
commit d94f6679fc
16 changed files with 161 additions and 201 deletions

View File

@@ -1,16 +1,17 @@
/**
* Copyright 2021-2023 by XGBoost Contributors
* Copyright 2021-2024, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <thrust/copy.h> // thrust::copy
#include "../../../src/common/device_helpers.cuh"
#include "../../../src/common/threading_utils.cuh"
#include "../helpers.h" // for MakeCUDACtx
namespace xgboost {
namespace common {
namespace xgboost::common {
TEST(SegmentedTrapezoidThreads, Basic) {
size_t constexpr kElements = 24, kGroups = 3;
auto ctx = MakeCUDACtx(0);
dh::device_vector<size_t> offset_ptr(kGroups + 1, 0);
offset_ptr[0] = 0;
offset_ptr[1] = 8;
@@ -19,11 +20,11 @@ TEST(SegmentedTrapezoidThreads, Basic) {
size_t h = 1;
dh::device_vector<size_t> thread_ptr(kGroups + 1, 0);
size_t total = SegmentedTrapezoidThreads(dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
size_t total = SegmentedTrapezoidThreads(&ctx, dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
ASSERT_EQ(total, kElements - kGroups);
h = 2;
SegmentedTrapezoidThreads(dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
SegmentedTrapezoidThreads(&ctx, dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
std::vector<size_t> h_thread_ptr(thread_ptr.size());
thrust::copy(thread_ptr.cbegin(), thread_ptr.cend(), h_thread_ptr.begin());
for (size_t i = 1; i < h_thread_ptr.size(); ++i) {
@@ -31,7 +32,7 @@ TEST(SegmentedTrapezoidThreads, Basic) {
}
h = 7;
SegmentedTrapezoidThreads(dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
SegmentedTrapezoidThreads(&ctx, dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
thrust::copy(thread_ptr.cbegin(), thread_ptr.cend(), h_thread_ptr.begin());
for (size_t i = 1; i < h_thread_ptr.size(); ++i) {
ASSERT_EQ(h_thread_ptr[i] - h_thread_ptr[i - 1], 28);
@@ -66,5 +67,4 @@ TEST(SegmentedTrapezoidThreads, Unravel) {
ASSERT_EQ(i, 6);
ASSERT_EQ(j, 7);
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -60,8 +60,7 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {
GPUHistEvaluator evaluator{param_, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false,
ctx.Device());
evaluator.Reset(&ctx, cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false);
DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
ASSERT_EQ(result.thresh, 1);
@@ -104,7 +103,7 @@ TEST(GpuHist, PartitionBasic) {
};
GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device());
evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
{
// -1.0s go right
@@ -217,7 +216,7 @@ TEST(GpuHist, PartitionTwoFeatures) {
false};
GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device());
evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
{
auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
@@ -277,10 +276,8 @@ TEST(GpuHist, PartitionTwoNodes) {
cuts.min_vals_.ConstDeviceSpan(),
false};
GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()),
ctx.Device()};
evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false,
ctx.Device());
GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
{
auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
@@ -336,10 +333,8 @@ void TestEvaluateSingleSplit(bool is_categorical) {
cuts.min_vals_.ConstDeviceSpan(),
false};
GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()),
ctx.Device()};
evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false,
ctx.Device());
GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
EXPECT_EQ(result.findex, 1);
@@ -522,7 +517,7 @@ TEST_F(TestPartitionBasedSplit, GpuHist) {
cuts_.cut_values_.SetDevice(ctx.Device());
cuts_.min_vals_.SetDevice(ctx.Device());
evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, ctx.Device());
evaluator.Reset(&ctx, cuts_, dh::ToSpan(ft), info_.num_col_, param_, false);
// Convert the sample histogram to fixed point
auto quantiser = DummyRoundingFactor(&ctx);
@@ -586,7 +581,7 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
false};
GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, ctx.Device());
evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true);
DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
EXPECT_EQ(result.findex, 1);