[EM] Refactor GPU histogram builder. (#10764)

- Expose the maximum number of cached nodes to be consistent with the CPU implementation. This also makes testing easier.
- Extract the subtraction trick for easier testing.
- Split up the `GradientQuantiser` to avoid a circular dependency.
2024-08-30 02:39:14 +08:00
parent 34937fea41
commit 61dd854a52
17 changed files with 394 additions and 187 deletions
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -9,6 +9,7 @@

 #include "../../../../src/tree/gpu_hist/histogram.cuh"
 #include "../../../../src/tree/gpu_hist/row_partitioner.cuh"  // for RowPartitioner
+#include "../../../../src/tree/hist/param.h"                  // for HistMakerTrainParam
 #include "../../../../src/tree/param.h"                       // for TrainParam
 #include "../../categorical_helpers.h"                        // for OneHotEncodeFeature
 #include "../../helpers.h"
@@ -21,13 +22,13 @@ TEST(Histogram, DeviceHistogramStorage) {
  constexpr size_t kNBins = 128;
  constexpr int kNNodes = 4;
  constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
-  DeviceHistogramStorage<kStopGrowing> histogram;
-  histogram.Init(FstCU(), kNBins);
+  DeviceHistogramStorage histogram{};
+  histogram.Reset(&ctx, kNBins, kNNodes);
  for (int i = 0; i < kNNodes; ++i) {
    histogram.AllocateHistograms(&ctx, {i});
  }
-  histogram.Reset(&ctx);
  ASSERT_EQ(histogram.Data().size(), kStopGrowing);
+  histogram.Reset(&ctx, kNBins, kNNodes);

  // Use allocated memory but do not erase nidx_map.
  for (int i = 0; i < kNNodes; ++i) {
@@ -55,6 +56,35 @@ TEST(Histogram, DeviceHistogramStorage) {
  EXPECT_ANY_THROW(histogram.AllocateHistograms(&ctx, {kNNodes + 1}););
 }

+TEST(Histogram, SubtractionTrack) {  // NOTE(review): "Track" looks like a typo of "Trick".
+  auto ctx = MakeCUDACtx(0);
+  // Build an ELLPACK page and the feature-group accessor that is handed to the builder.
+  auto page = BuildEllpackPage(&ctx, 64, 4);
+  auto cuts = page->CutsShared();
+  FeatureGroups fg{*cuts, true, std::numeric_limits<std::size_t>::max(),
+                   sizeof(GradientPairPrecise)};
+  auto fg_acc = fg.DeviceAccessor(ctx.Device());
+  auto n_total_bins = cuts->TotalBins();
+
+  // Cap the number of cached node histograms at 2 (passed as the second argument of Reset).
+  auto max_cached_hist_nodes = 2ull;
+  DeviceHistogramBuilder histogram;
+  histogram.Reset(&ctx, max_cached_hist_nodes, fg_acc, n_total_bins, false);
+  histogram.AllocateHistograms(&ctx, {0, 1, 2});
+  GPUExpandEntry root;
+  root.nid = 0;
+  auto need_build = histogram.SubtractHist({root}, {0}, {1});  // overwritten before any assert
+  // Candidate entries for nodes 1 and 2.
+  std::vector<GPUExpandEntry> candidates(2);
+  candidates[0].nid = 1;
+  candidates[1].nid = 2;
+  // Expects {4, 6} back as nodes to build; presumably the parents' histograms are not cached.
+  need_build = histogram.SubtractHist(candidates, {3, 5}, {4, 6});
+  ASSERT_EQ(need_build.size(), 2);
+  ASSERT_EQ(need_build[0], 4);
+  ASSERT_EQ(need_build[1], 6);
+}
+
 std::vector<GradientPairPrecise> GetHostHistGpair() {
  // 24 bins, 3 bins for each feature (column).
  std::vector<GradientPairPrecise> hist_gpair = {
@@ -101,17 +131,16 @@ void TestBuildHist(bool use_shared_memory_histograms) {
  auto shm_size = use_shared_memory_histograms ? dh::MaxSharedMemoryOptin(ctx.Ordinal()) : 0;
  FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64));

-  DeviceHistogramStorage hist;
-  hist.Init(ctx.Device(), page->Cuts().TotalBins());
-  hist.AllocateHistograms(&ctx, {0});
-
  DeviceHistogramBuilder builder;
-  builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), !use_shared_memory_histograms);
+  builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(),
+                feature_groups.DeviceAccessor(ctx.Device()), page->Cuts().TotalBins(),
+                !use_shared_memory_histograms);
+  builder.AllocateHistograms(&ctx, {0});
  builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
                         feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(),
-                         row_partitioner->GetRows(0), hist.GetNodeHistogram(0), *quantiser);
+                         row_partitioner->GetRows(0), builder.GetNodeHistogram(0), *quantiser);

-  auto node_histogram = hist.GetNodeHistogram(0);
+  auto node_histogram = builder.GetNodeHistogram(0);

  std::vector<GradientPairInt64> h_result(node_histogram.size());
  dh::CopyDeviceSpanToVector(&h_result, node_histogram);
@@ -158,7 +187,8 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global)

    auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo());
    DeviceHistogramBuilder builder;
-    builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), force_global);
+    builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(),
+                  feature_groups.DeviceAccessor(ctx.Device()), num_bins, force_global);
    builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
                           feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx,
                           d_histogram, quantiser);
@@ -173,7 +203,8 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global)

      auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo());
      DeviceHistogramBuilder builder;
-      builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), force_global);
+      builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(),
+                    feature_groups.DeviceAccessor(ctx.Device()), num_bins, force_global);
      builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
                             feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx,
                             d_new_histogram, quantiser);
@@ -197,7 +228,8 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global)

      dh::device_vector<GradientPairInt64> baseline(num_bins);
      DeviceHistogramBuilder builder;
-      builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), force_global);
+      builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(),
+                    single_group.DeviceAccessor(ctx.Device()), num_bins, force_global);
      builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
                             single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx,
                             dh::ToSpan(baseline), quantiser);
@@ -264,7 +296,8 @@ void TestGPUHistogramCategorical(size_t num_categories) {
    auto* page = batch.Impl();
    FeatureGroups single_group(page->Cuts());
    DeviceHistogramBuilder builder;
-    builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), false);
+    builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(),
+                  single_group.DeviceAccessor(ctx.Device()), num_categories, false);
    builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
                           single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx,
                           dh::ToSpan(cat_hist), quantiser);
@@ -280,7 +313,8 @@ void TestGPUHistogramCategorical(size_t num_categories) {
    auto* page = batch.Impl();
    FeatureGroups single_group(page->Cuts());
    DeviceHistogramBuilder builder;
-    builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), false);
+    builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(),
+                  single_group.DeviceAccessor(ctx.Device()), encode_hist.size(), false);
    builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
                           single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx,
                           dh::ToSpan(encode_hist), quantiser);
@@ -429,7 +463,8 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParam<std::tuple<f
        auto ridx = partitioners.at(k)->GetRows(0);
        auto d_histogram = dh::ToSpan(multi_hist);
        DeviceHistogramBuilder builder;
-        builder.Reset(&ctx, fg->DeviceAccessor(ctx.Device()), force_global);
+        builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(),
+                      fg->DeviceAccessor(ctx.Device()), d_histogram.size(), force_global);
        builder.BuildHistogram(ctx.CUDACtx(), impl->GetDeviceAccessor(ctx.Device()),
                               fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx,
                               d_histogram, quantiser);
@@ -454,7 +489,8 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParam<std::tuple<f
      auto ridx = partitioner.GetRows(0);
      auto d_histogram = dh::ToSpan(single_hist);
      DeviceHistogramBuilder builder;
-      builder.Reset(&ctx, fg->DeviceAccessor(ctx.Device()), force_global);
+      builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), fg->DeviceAccessor(ctx.Device()),
+                    d_histogram.size(), force_global);
      builder.BuildHistogram(ctx.CUDACtx(), page.GetDeviceAccessor(ctx.Device()),
                             fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx,
                             d_histogram, quantiser);
--- a/tests/cpp/tree/hist/test_evaluate_splits.cc
+++ b/tests/cpp/tree/hist/test_evaluate_splits.cc
@@ -51,7 +51,7 @@ void TestEvaluateSplits(bool force_read_by_column) {
  row_set_collection.Init();

  HistMakerTrainParam hist_param;
-  hist.Reset(gmat.cut.Ptrs().back(), hist_param.max_cached_hist_node);
+  hist.Reset(gmat.cut.Ptrs().back(), hist_param.MaxCachedHistNodes(ctx.Device()));
  hist.AllocateHistograms({0});
  auto const &elem = row_set_collection[0];
  common::BuildHist<false>(row_gpairs, common::Span{elem.begin(), elem.end()}, gmat, hist[0],
@@ -120,7 +120,7 @@ TEST(HistMultiEvaluator, Evaluate) {
  linalg::Vector<GradientPairPrecise> root_sum({2}, DeviceOrd::CPU());
  for (bst_target_t t{0}; t < n_targets; ++t) {
    auto &hist = histogram[t];
-    hist.Reset(n_bins * n_features, hist_param.max_cached_hist_node);
+    hist.Reset(n_bins * n_features, hist_param.MaxCachedHistNodes(ctx.Device()));
    hist.AllocateHistograms({0});
    auto node_hist = hist[0];
    node_hist[0] = {-0.5, 0.5};
@@ -237,7 +237,7 @@ auto CompareOneHotAndPartition(bool onehot) {
    entries.front().nid = 0;
    entries.front().depth = 0;

-    hist.Reset(gmat.cut.TotalBins(), hist_param.max_cached_hist_node);
+    hist.Reset(gmat.cut.TotalBins(), hist_param.MaxCachedHistNodes(ctx.Device()));
    hist.AllocateHistograms({0});
    auto node_hist = hist[0];

@@ -265,9 +265,10 @@ TEST(HistEvaluator, Categorical) {
 }

 TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
+  Context ctx;
  BoundedHistCollection hist;
  HistMakerTrainParam hist_param;
-  hist.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node);
+  hist.Reset(cuts_.TotalBins(), hist_param.MaxCachedHistNodes(ctx.Device()));
  hist.AllocateHistograms({0});
  auto node_hist = hist[0];
  ASSERT_EQ(node_hist.size(), feature_histogram_.size());
@@ -277,10 +278,9 @@ TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
  MetaInfo info;
  info.num_col_ = 1;
  info.feature_types = {FeatureType::kCategorical};
-  Context ctx;
+
  auto evaluator = HistEvaluator{&ctx, &param_, info, sampler};
  evaluator.InitRoot(GradStats{parent_sum_});
-
  std::vector<CPUExpandEntry> entries(1);
  RegTree tree;
  evaluator.EvaluateSplits(hist, cuts_, info.feature_types.ConstHostSpan(), tree, &entries);
--- a/tests/cpp/tree/test_evaluate_splits.h
+++ b/tests/cpp/tree/test_evaluate_splits.h
@@ -56,8 +56,9 @@ class TestPartitionBasedSplit : public ::testing::Test {

    cuts_.min_vals_.Resize(1);

+    Context ctx;
    HistMakerTrainParam hist_param;
-    hist_.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node);
+    hist_.Reset(cuts_.TotalBins(), hist_param.MaxCachedHistNodes(ctx.Device()));
    hist_.AllocateHistograms({0});
    auto node_hist = hist_[0];

--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -216,7 +216,7 @@ TEST(GpuHist, ConfigIO) {
 }

 TEST(GpuHist, MaxDepth) {
-  Context ctx(MakeCUDACtx(0));
+  auto ctx = MakeCUDACtx(0);
  size_t constexpr kRows = 16;
  size_t constexpr kCols = 4;
  auto p_mat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -10,6 +10,7 @@ from xgboost import testing as tm
 from xgboost.testing.params import (
    cat_parameter_strategy,
    exact_parameter_strategy,
+    hist_cache_strategy,
    hist_parameter_strategy,
 )
 from xgboost.testing.updater import (
@@ -46,6 +47,7 @@ class TestGPUUpdaters:
    @given(
        exact_parameter_strategy,
        hist_parameter_strategy,
+        hist_cache_strategy,
        strategies.integers(1, 20),
        tm.make_dataset_strategy(),
    )
@@ -54,19 +56,44 @@ class TestGPUUpdaters:
        self,
        param: Dict[str, Any],
        hist_param: Dict[str, Any],
+        cache_param: Dict[str, Any],
        num_rounds: int,
        dataset: tm.TestDataset,
    ) -> None:
        param.update({"tree_method": "hist", "device": "cuda"})
        param.update(hist_param)
+        param.update(cache_param)
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(str(result))
        assert tm.non_increasing(result["train"][dataset.metric])

+    @pytest.mark.parametrize("tree_method", ["approx", "hist"])
+    def test_cache_size(self, tree_method: str) -> None:
+        from sklearn.datasets import make_regression
+        # Fixed-seed synthetic regression data, shared by every cache size below.
+        X, y = make_regression(n_samples=4096, n_features=64, random_state=1994)
+        Xy = xgb.DMatrix(X, y)
+        results = []  # one training-RMSE history per cache size
+        for cache_size in [1, 3, 2048]:
+            params: Dict[str, Any] = {"tree_method": tree_method, "device": "cuda"}
+            params["max_cached_hist_node"] = cache_size
+            evals_result: Dict[str, Dict[str, list]] = {}
+            xgb.train(
+                params,
+                Xy,
+                num_boost_round=4,
+                evals=[(Xy, "Train")],
+                evals_result=evals_result,
+            )
+            results.append(evals_result["Train"]["rmse"])
+        for i in range(1, len(results)):  # every run is compared against the first one
+            np.testing.assert_allclose(results[0], results[i])
+
    @given(
        exact_parameter_strategy,
        hist_parameter_strategy,
+        hist_cache_strategy,
        strategies.integers(1, 20),
        tm.make_dataset_strategy(),
    )
@@ -75,11 +102,13 @@ class TestGPUUpdaters:
        self,
        param: Dict[str, Any],
        hist_param: Dict[str, Any],
+        cache_param: Dict[str, Any],
        num_rounds: int,
        dataset: tm.TestDataset,
    ) -> None:
        param.update({"tree_method": "approx", "device": "cuda"})
        param.update(hist_param)
+        param.update(cache_param)
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(str(result))