Implement column sampler in CUDA. (#9785)

- Add the CUDA implementation of the column sampler.
- Extract the broadcasting logic; we will need the context parameter after revamping the collective implementation.
- Change the event loop to fix a deadlock in CI.
- Move argsort into algorithms.cuh and add support for CUDA streams.
2023-11-17 04:29:08 +08:00
parent 178cfe70a8
commit fedd9674c8
20 changed files with 447 additions and 232 deletions
--- a/tests/cpp/common/test_algorithm.cu
+++ b/tests/cpp/common/test_algorithm.cu
@@ -57,13 +57,13 @@ TEST(Algorithm, GpuArgSort) {
  auto ctx = MakeCUDACtx(0);

  dh::device_vector<float> values(20);
-  dh::Iota(dh::ToSpan(values));                                    // accending
+  dh::Iota(dh::ToSpan(values), ctx.CUDACtx()->Stream());  // ascending
  dh::device_vector<size_t> sorted_idx(20);
-  dh::ArgSort<false>(dh::ToSpan(values), dh::ToSpan(sorted_idx));  // sort to descending
-  ASSERT_TRUE(thrust::is_sorted(thrust::device, sorted_idx.begin(), sorted_idx.end(),
+  ArgSort<false>(&ctx, dh::ToSpan(values), dh::ToSpan(sorted_idx));  // sort to descending
+  ASSERT_TRUE(thrust::is_sorted(ctx.CUDACtx()->CTP(), sorted_idx.begin(), sorted_idx.end(),
                                thrust::greater<size_t>{}));

-  dh::Iota(dh::ToSpan(values));
+  dh::Iota(dh::ToSpan(values), ctx.CUDACtx()->Stream());
  dh::device_vector<size_t> groups(3);
  groups[0] = 0;
  groups[1] = 10;
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -16,6 +16,7 @@
 #include <vector>     // for vector

 #include "../../../include/xgboost/logging.h"
+#include "../../../src/common/cuda_context.cuh"
 #include "../../../src/common/device_helpers.cuh"
 #include "../../../src/common/hist_util.cuh"
 #include "../../../src/common/hist_util.h"
@@ -211,7 +212,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
  cuts_ptr.SetDevice(DeviceOrd::CUDA(0));

  dh::device_vector<float> weight(n_samples * n_features, 0);
-  dh::Iota(dh::ToSpan(weight));
+  dh::Iota(dh::ToSpan(weight), ctx.CUDACtx()->Stream());

  dh::caching_device_vector<bst_row_t> columns_ptr(4);
  for (std::size_t i = 0; i < columns_ptr.size(); ++i) {
--- a/tests/cpp/common/test_random.cc
+++ b/tests/cpp/common/test_random.cc
@@ -1,19 +1,20 @@
-#include <valarray>
+/**
+ * Copyright 2018-2023, XGBoost Contributors
+ */
 #include "../../../src/common/random.h"
 #include "../helpers.h"
 #include "gtest/gtest.h"
-#include "xgboost/context.h"  // Context
+#include "xgboost/context.h"  // for Context

-namespace xgboost {
-namespace common {
-TEST(ColumnSampler, Test) {
-  Context ctx;
+namespace xgboost::common {
+namespace {
+void TestBasic(Context const* ctx) {
  int n = 128;
-  ColumnSampler cs;
+  ColumnSampler cs{1u};
  std::vector<float> feature_weights;

  // No node sampling
-  cs.Init(&ctx, n, feature_weights, 1.0f, 0.5f, 0.5f);
+  cs.Init(ctx, n, feature_weights, 1.0f, 0.5f, 0.5f);
  auto set0 = cs.GetFeatureSet(0);
  ASSERT_EQ(set0->Size(), 32);

@@ -26,7 +27,7 @@ TEST(ColumnSampler, Test) {
  ASSERT_EQ(set2->Size(), 32);

  // Node sampling
-  cs.Init(&ctx, n, feature_weights, 0.5f, 1.0f, 0.5f);
+  cs.Init(ctx, n, feature_weights, 0.5f, 1.0f, 0.5f);
  auto set3 = cs.GetFeatureSet(0);
  ASSERT_EQ(set3->Size(), 32);

@@ -36,21 +37,33 @@ TEST(ColumnSampler, Test) {
  ASSERT_EQ(set4->Size(), 32);

  // No level or node sampling, should be the same at different depth
-  cs.Init(&ctx, n, feature_weights, 1.0f, 1.0f, 0.5f);
-  ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(),
-            cs.GetFeatureSet(1)->HostVector());
+  cs.Init(ctx, n, feature_weights, 1.0f, 1.0f, 0.5f);
+  ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(), cs.GetFeatureSet(1)->HostVector());

-  cs.Init(&ctx, n, feature_weights, 1.0f, 1.0f, 1.0f);
+  cs.Init(ctx, n, feature_weights, 1.0f, 1.0f, 1.0f);
  auto set5 = cs.GetFeatureSet(0);
  ASSERT_EQ(set5->Size(), n);
-  cs.Init(&ctx, n, feature_weights, 1.0f, 1.0f, 1.0f);
+  cs.Init(ctx, n, feature_weights, 1.0f, 1.0f, 1.0f);
  auto set6 = cs.GetFeatureSet(0);
  ASSERT_EQ(set5->HostVector(), set6->HostVector());

  // Should always be a minimum of one feature
-  cs.Init(&ctx, n, feature_weights, 1e-16f, 1e-16f, 1e-16f);
+  cs.Init(ctx, n, feature_weights, 1e-16f, 1e-16f, 1e-16f);
  ASSERT_EQ(cs.GetFeatureSet(0)->Size(), 1);
 }
+}  // namespace
+
+TEST(ColumnSampler, Test) {
+  Context ctx;
+  TestBasic(&ctx);
+}
+
+#if defined(XGBOOST_USE_CUDA)
+TEST(ColumnSampler, GPUTest) {
+  auto ctx = MakeCUDACtx(0);
+  TestBasic(&ctx);
+}
+#endif  // defined(XGBOOST_USE_CUDA)

 // Test if different threads using the same seed produce the same result
 TEST(ColumnSampler, ThreadSynchronisation) {
@@ -81,16 +94,16 @@ TEST(ColumnSampler, ThreadSynchronisation) {
  ASSERT_TRUE(success);
 }

-TEST(ColumnSampler, WeightedSampling) {
-  auto test_basic = [](int first) {
-    Context ctx;
+namespace {
+void TestWeightedSampling(Context const* ctx) {
+  auto test_basic = [ctx](int first) {
    std::vector<float> feature_weights(2);
    feature_weights[0] = std::abs(first - 1.0f);
    feature_weights[1] = first - 0.0f;
    ColumnSampler cs{0};
-    cs.Init(&ctx, 2, feature_weights, 1.0, 1.0, 0.5);
+    cs.Init(ctx, 2, feature_weights, 1.0, 1.0, 0.5);
    auto feature_sets = cs.GetFeatureSet(0);
-    auto const &h_feat_set = feature_sets->HostVector();
+    auto const& h_feat_set = feature_sets->HostVector();
    ASSERT_EQ(h_feat_set.size(), 1);
    ASSERT_EQ(h_feat_set[0], first - 0);
  };
@@ -104,8 +117,7 @@ TEST(ColumnSampler, WeightedSampling) {
  SimpleRealUniformDistribution<float> dist(.0f, 12.0f);
  std::generate(feature_weights.begin(), feature_weights.end(), [&]() { return dist(&rng); });
  ColumnSampler cs{0};
-  Context ctx;
-  cs.Init(&ctx, kCols, feature_weights, 0.5f, 1.0f, 1.0f);
+  cs.Init(ctx, kCols, feature_weights, 0.5f, 1.0f, 1.0f);
  std::vector<bst_feature_t> features(kCols);
  std::iota(features.begin(), features.end(), 0);
  std::vector<float> freq(kCols, 0);
@@ -131,8 +143,22 @@ TEST(ColumnSampler, WeightedSampling) {
    EXPECT_NEAR(freq[i], feature_weights[i], 1e-2);
  }
 }
+}  // namespace

-TEST(ColumnSampler, WeightedMultiSampling) {
+TEST(ColumnSampler, WeightedSampling) {
+  Context ctx;
+  TestWeightedSampling(&ctx);
+}
+
+#if defined(XGBOOST_USE_CUDA)
+TEST(ColumnSampler, GPUWeightedSampling) {
+  auto ctx = MakeCUDACtx(0);
+  TestWeightedSampling(&ctx);
+}
+#endif  // defined(XGBOOST_USE_CUDA)
+
+namespace {
+void TestWeightedMultiSampling(Context const* ctx) {
  size_t constexpr kCols = 32;
  std::vector<float> feature_weights(kCols, 0);
  for (size_t i = 0; i < feature_weights.size(); ++i) {
@@ -140,13 +166,24 @@ TEST(ColumnSampler, WeightedMultiSampling) {
  }
  ColumnSampler cs{0};
  float bytree{0.5}, bylevel{0.5}, bynode{0.5};
-  Context ctx;
-  cs.Init(&ctx, feature_weights.size(), feature_weights, bytree, bylevel, bynode);
+  cs.Init(ctx, feature_weights.size(), feature_weights, bytree, bylevel, bynode);
  auto feature_set = cs.GetFeatureSet(0);
  size_t n_sampled = kCols * bytree * bylevel * bynode;
  ASSERT_EQ(feature_set->Size(), n_sampled);
  feature_set = cs.GetFeatureSet(1);
  ASSERT_EQ(feature_set->Size(), n_sampled);
 }
-}  // namespace common
-}  // namespace xgboost
+}  // namespace
+
+TEST(ColumnSampler, WeightedMultiSampling) {
+  Context ctx;
+  TestWeightedMultiSampling(&ctx);
+}
+
+#if defined(XGBOOST_USE_CUDA)
+TEST(ColumnSampler, GPUWeightedMultiSampling) {
+  auto ctx = MakeCUDACtx(0);
+  TestWeightedMultiSampling(&ctx);
+}
+#endif  // defined(XGBOOST_USE_CUDA)
+}  // namespace xgboost::common
--- a/tests/cpp/tree/hist/test_evaluate_splits.cc
+++ b/tests/cpp/tree/hist/test_evaluate_splits.cc
@@ -28,7 +28,7 @@ void TestEvaluateSplits(bool force_read_by_column) {
  Context ctx;
  ctx.nthread = 4;
  int static constexpr kRows = 8, kCols = 16;
-  auto sampler = std::make_shared<common::ColumnSampler>();
+  auto sampler = std::make_shared<common::ColumnSampler>(1u);

  TrainParam param;
  param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}});
@@ -102,7 +102,7 @@ TEST(HistMultiEvaluator, Evaluate) {

  TrainParam param;
  param.Init(Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}});
-  auto sampler = std::make_shared<common::ColumnSampler>();
+  auto sampler = std::make_shared<common::ColumnSampler>(1u);

  std::size_t n_samples = 3;
  bst_feature_t n_features = 2;
@@ -166,7 +166,7 @@ TEST(HistEvaluator, Apply) {
  TrainParam param;
  param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0.0"}});
  auto dmat = RandomDataGenerator(kNRows, kNCols, 0).Seed(3).GenerateDMatrix();
-  auto sampler = std::make_shared<common::ColumnSampler>();
+  auto sampler = std::make_shared<common::ColumnSampler>(1u);
  auto evaluator_ = HistEvaluator{&ctx, &param, dmat->Info(), sampler};

  CPUExpandEntry entry{0, 0};
@@ -194,7 +194,7 @@ TEST_F(TestPartitionBasedSplit, CPUHist) {
  Context ctx;
  // check the evaluator is returning the optimal split
  std::vector<FeatureType> ft{FeatureType::kCategorical};
-  auto sampler = std::make_shared<common::ColumnSampler>();
+  auto sampler = std::make_shared<common::ColumnSampler>(1u);
  HistEvaluator evaluator{&ctx, &param_, info_, sampler};
  evaluator.InitRoot(GradStats{total_gpair_});
  RegTree tree;
@@ -224,7 +224,7 @@ auto CompareOneHotAndPartition(bool onehot) {
  auto dmat =
      RandomDataGenerator(kRows, kCols, 0).Seed(3).Type(ft).MaxCategory(n_cats).GenerateDMatrix();

-  auto sampler = std::make_shared<common::ColumnSampler>();
+  auto sampler = std::make_shared<common::ColumnSampler>(1u);
  auto evaluator = HistEvaluator{&ctx, &param, dmat->Info(), sampler};
  std::vector<CPUExpandEntry> entries(1);
  HistMakerTrainParam hist_param;
@@ -271,7 +271,7 @@ TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
  ASSERT_EQ(node_hist.size(), feature_histogram_.size());
  std::copy(feature_histogram_.cbegin(), feature_histogram_.cend(), node_hist.begin());

-  auto sampler = std::make_shared<common::ColumnSampler>();
+  auto sampler = std::make_shared<common::ColumnSampler>(1u);
  MetaInfo info;
  info.num_col_ = 1;
  info.feature_types = {FeatureType::kCategorical};
--- a/tests/cpp/tree/test_constraints.cc
+++ b/tests/cpp/tree/test_constraints.cc
@@ -1,3 +1,6 @@
+/**
+ * Copyright 2019-2023, XGBoost Contributors
+ */
 #include <gtest/gtest.h>
 #include <xgboost/base.h>
 #include <xgboost/logging.h>
@@ -9,9 +12,7 @@
 #include "../../../src/tree/hist/evaluate_splits.h"
 #include "../helpers.h"

-namespace xgboost {
-namespace tree {
-
+namespace xgboost::tree {
 TEST(CPUFeatureInteractionConstraint, Empty) {
  TrainParam param;
  param.UpdateAllowUnknown(Args{});
@@ -77,7 +78,7 @@ TEST(CPUMonoConstraint, Basic) {
  param.UpdateAllowUnknown(Args{{"monotone_constraints", str_mono}});

  auto Xy = RandomDataGenerator{kRows, kCols, 0.0}.GenerateDMatrix(true);
-  auto sampler = std::make_shared<common::ColumnSampler>();
+  auto sampler = std::make_shared<common::ColumnSampler>(1u);

  HistEvaluator evalutor{&ctx, &param, Xy->Info(), sampler};
  evalutor.InitRoot(GradStats{2.0, 2.0});
@@ -90,5 +91,4 @@ TEST(CPUMonoConstraint, Basic) {

  ASSERT_TRUE(evalutor.Evaluator().has_constraint);
 }
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree