temp merge, disable 1 line, SetValid

2023-10-12 16:16:44 -07:00
parent 2e7e9d3b2d 85d3017ca5
commit ea19555474
492 changed files with 15533 additions and 9376 deletions
--- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
+++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -2,6 +2,7 @@
 * Copyright 2020-2022 by XGBoost contributors
 */
 #include <gtest/gtest.h>
+#include <thrust/host_vector.h>

 #if defined(XGBOOST_USE_CUDA)
 #include "../../../../src/tree/gpu_hist/evaluate_splits.cuh"
@@ -11,25 +12,23 @@
 #include "../../helpers.h"
 #include "../../histogram_helpers.h"
 #include "../test_evaluate_splits.h"  // TestPartitionBasedSplit
-#include <thrust/host_vector.h>

 namespace xgboost {
 namespace tree {
+
 namespace {
 auto ZeroParam() {
-  auto args = Args{{"min_child_weight", "0"},
-                   {"lambda", "0"}};
+  auto args = Args{{"min_child_weight", "0"}, {"lambda", "0"}};
  TrainParam tparam;
  tparam.UpdateAllowUnknown(args);
  return tparam;
 }
-
 }  // anonymous namespace

 inline GradientQuantiser DummyRoundingFactor() {
  thrust::device_vector<GradientPair> gpair(1);
  gpair[0] = {1000.f, 1000.f};  // Tests should not exceed sum of 1000
-  return GradientQuantiser(dh::ToSpan(gpair));
+  return {dh::ToSpan(gpair), MetaInfo()};
 }

 thrust::device_vector<GradientPairInt64> ConvertToInteger(std::vector<GradientPairPrecise> x) {
@@ -41,7 +40,6 @@ thrust::device_vector<GradientPairInt64> ConvertToInteger(std::vector<GradientPa
  return y;
 }

-
 TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {
  thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0};
  GPUTrainingParam param{param_};
@@ -65,12 +63,13 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {

  GPUHistEvaluator evaluator{param_, static_cast<bst_feature_t>(feature_set.size()), 0};

-  evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, 0);
+  evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false, 0);
  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;

  ASSERT_EQ(result.thresh, 1);
  this->CheckResult(result.loss_chg, result.findex, result.fvalue, result.is_cat,
-                    result.dir == kLeftDir, quantiser.ToFloatingPoint(result.left_sum), quantiser.ToFloatingPoint(result.right_sum));
+                    result.dir == kLeftDir, quantiser.ToFloatingPoint(result.left_sum),
+                    quantiser.ToFloatingPoint(result.right_sum));
 }

 TEST(GpuHist, PartitionBasic) {
@@ -106,7 +105,7 @@ TEST(GpuHist, PartitionBasic) {
  };

  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0);
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);

  {
    // -1.0s go right
@@ -147,7 +146,8 @@ TEST(GpuHist, PartitionBasic) {
    EXPECT_EQ(result.left_sum + result.right_sum, parent_sum);
  }
  // With 3.0/3.0 missing values
-  // Forward, first 2 categories are selected, while the last one go to left along with missing value
+  // Forward, the first 2 categories are selected, while the last one goes to the left along with
+  // the missing values
  {
    auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 6.0});
    auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}});
@@ -217,11 +217,12 @@ TEST(GpuHist, PartitionTwoFeatures) {
                                          false};

  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0);
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);

  {
    auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
-    auto feature_histogram = ConvertToInteger({        {-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
+    auto feature_histogram = ConvertToInteger(
+        {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
    EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set),
                              dh::ToSpan(feature_histogram)};
    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
@@ -233,7 +234,8 @@ TEST(GpuHist, PartitionTwoFeatures) {

  {
    auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
-    auto feature_histogram = ConvertToInteger({        {-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}});
+    auto feature_histogram = ConvertToInteger(
+        {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}});
    EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
                              dh::ToSpan(feature_histogram)};
    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
@@ -275,12 +277,12 @@ TEST(GpuHist, PartitionTwoNodes) {
                                          false};

  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0);
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);

  {
    auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
-    auto feature_histogram_a = ConvertToInteger({{-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0},
-                                         {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
+    auto feature_histogram_a = ConvertToInteger(
+        {{-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
    thrust::device_vector<EvaluateSplitInputs> inputs(2);
    inputs[0] = EvaluateSplitInputs{0, 0, parent_sum, dh::ToSpan(feature_set),
                                    dh::ToSpan(feature_histogram_a)};
@@ -289,8 +291,6 @@ TEST(GpuHist, PartitionTwoNodes) {
                                    dh::ToSpan(feature_histogram_b)};
    thrust::device_vector<GPUExpandEntry> results(2);
    evaluator.EvaluateSplits({0, 1}, 1, dh::ToSpan(inputs), shared_inputs, dh::ToSpan(results));
-    GPUExpandEntry result_a = results[0];
-    GPUExpandEntry result_b = results[1];
    EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(0)[0]),
              std::bitset<32>("10000000000000000000000000000000"));
    EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(1)[0]),
@@ -310,8 +310,7 @@ void TestEvaluateSingleSplit(bool is_categorical) {
  // Setup gradients so that second feature gets higher gain
  auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});

-  dh::device_vector<FeatureType> feature_types(feature_set.size(),
-                                               FeatureType::kCategorical);
+  dh::device_vector<FeatureType> feature_types(feature_set.size(), FeatureType::kCategorical);
  common::Span<FeatureType> d_feature_types;
  if (is_categorical) {
    auto max_cat = *std::max_element(cuts.cut_values_.HostVector().begin(),
@@ -330,9 +329,8 @@ void TestEvaluateSingleSplit(bool is_categorical) {
                                          cuts.min_vals_.ConstDeviceSpan(),
                                          false};

-  GPUHistEvaluator evaluator{
-      tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0);
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);
  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;

  EXPECT_EQ(result.findex, 1);
@@ -344,31 +342,23 @@ void TestEvaluateSingleSplit(bool is_categorical) {
  EXPECT_EQ(result.left_sum + result.right_sum, parent_sum);
 }

-TEST(GpuHist, EvaluateSingleSplit) {
-  TestEvaluateSingleSplit(false);
-}
+TEST(GpuHist, EvaluateSingleSplit) { TestEvaluateSingleSplit(false); }

-TEST(GpuHist, EvaluateSingleCategoricalSplit) {
-  TestEvaluateSingleSplit(true);
-}
+TEST(GpuHist, EvaluateSingleCategoricalSplit) { TestEvaluateSingleSplit(true); }

 TEST(GpuHist, EvaluateSingleSplitMissing) {
  auto quantiser = DummyRoundingFactor();
-    auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{1.0, 1.5});
+  auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{1.0, 1.5});
  TrainParam tparam = ZeroParam();
  GPUTrainingParam param{tparam};

-  thrust::device_vector<bst_feature_t> feature_set =
-      std::vector<bst_feature_t>{0};
-  thrust::device_vector<uint32_t> feature_segments =
-      std::vector<bst_row_t>{0, 2};
+  thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0};
+  thrust::device_vector<uint32_t> feature_segments = std::vector<bst_row_t>{0, 2};
  thrust::device_vector<float> feature_values = std::vector<float>{1.0, 2.0};
  thrust::device_vector<float> feature_min_values = std::vector<float>{0.0};
  auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}});
-  EvaluateSplitInputs input{1,0,
-                                          parent_sum,
-                                          dh::ToSpan(feature_set),
-                                          dh::ToSpan(feature_histogram)};
+  EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
+                            dh::ToSpan(feature_histogram)};
  EvaluateSplitSharedInputs shared_inputs{param,
                                          quantiser,
                                          {},
@@ -383,7 +373,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
  EXPECT_EQ(result.findex, 0);
  EXPECT_EQ(result.fvalue, 1.0);
  EXPECT_EQ(result.dir, kRightDir);
-  EXPECT_EQ(result.left_sum,quantiser.ToFixedPoint(GradientPairPrecise(-0.5, 0.5)));
+  EXPECT_EQ(result.left_sum, quantiser.ToFixedPoint(GradientPairPrecise(-0.5, 0.5)));
  EXPECT_EQ(result.right_sum, quantiser.ToFixedPoint(GradientPairPrecise(1.5, 1.0)));
 }

@@ -404,24 +394,18 @@ TEST(GpuHist, EvaluateSingleSplitEmpty) {
 // Feature 0 has a better split, but the algorithm must select feature 1
 TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
  auto quantiser = DummyRoundingFactor();
-    auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
+  auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
  TrainParam tparam = ZeroParam();
  tparam.UpdateAllowUnknown(Args{});
  GPUTrainingParam param{tparam};

-  thrust::device_vector<bst_feature_t> feature_set =
-      std::vector<bst_feature_t>{1};
-  thrust::device_vector<uint32_t> feature_segments =
-      std::vector<bst_row_t>{0, 2, 4};
-  thrust::device_vector<float> feature_values =
-      std::vector<float>{1.0, 2.0, 11.0, 12.0};
-  thrust::device_vector<float> feature_min_values =
-      std::vector<float>{0.0, 10.0};
-  auto feature_histogram = ConvertToInteger({          {-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
-  EvaluateSplitInputs input{1,0,
-                                          parent_sum,
-                                          dh::ToSpan(feature_set),
-                                          dh::ToSpan(feature_histogram)};
+  thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{1};
+  thrust::device_vector<uint32_t> feature_segments = std::vector<bst_row_t>{0, 2, 4};
+  thrust::device_vector<float> feature_values = std::vector<float>{1.0, 2.0, 11.0, 12.0};
+  thrust::device_vector<float> feature_min_values = std::vector<float>{0.0, 10.0};
+  auto feature_histogram = ConvertToInteger({{-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
+  EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
+                            dh::ToSpan(feature_histogram)};
  EvaluateSplitSharedInputs shared_inputs{param,
                                          quantiser,
                                          {},
@@ -435,31 +419,25 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {

  EXPECT_EQ(result.findex, 1);
  EXPECT_EQ(result.fvalue, 11.0);
-  EXPECT_EQ(result.left_sum,quantiser.ToFixedPoint(GradientPairPrecise(-0.5, 0.5)));
+  EXPECT_EQ(result.left_sum, quantiser.ToFixedPoint(GradientPairPrecise(-0.5, 0.5)));
  EXPECT_EQ(result.right_sum, quantiser.ToFixedPoint(GradientPairPrecise(0.5, 0.5)));
 }

 // Features 0 and 1 have identical gain, the algorithm must select 0
 TEST(GpuHist, EvaluateSingleSplitBreakTies) {
  auto quantiser = DummyRoundingFactor();
-    auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
+  auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
  TrainParam tparam = ZeroParam();
  tparam.UpdateAllowUnknown(Args{});
  GPUTrainingParam param{tparam};

-  thrust::device_vector<bst_feature_t> feature_set =
-      std::vector<bst_feature_t>{0, 1};
-  thrust::device_vector<uint32_t> feature_segments =
-      std::vector<bst_row_t>{0, 2, 4};
-  thrust::device_vector<float> feature_values =
-      std::vector<float>{1.0, 2.0, 11.0, 12.0};
-  thrust::device_vector<float> feature_min_values =
-      std::vector<float>{0.0, 10.0};
-  auto feature_histogram = ConvertToInteger({          {-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
-  EvaluateSplitInputs input{1,0,
-                                          parent_sum,
-                                          dh::ToSpan(feature_set),
-                                          dh::ToSpan(feature_histogram)};
+  thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0, 1};
+  thrust::device_vector<uint32_t> feature_segments = std::vector<bst_row_t>{0, 2, 4};
+  thrust::device_vector<float> feature_values = std::vector<float>{1.0, 2.0, 11.0, 12.0};
+  thrust::device_vector<float> feature_min_values = std::vector<float>{0.0, 10.0};
+  auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
+  EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
+                            dh::ToSpan(feature_histogram)};
  EvaluateSplitSharedInputs shared_inputs{param,
                                          quantiser,
                                          {},
@@ -469,7 +447,7 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) {
                                          false};

  GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0);
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input,shared_inputs).split;
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;

  EXPECT_EQ(result.findex, 0);
  EXPECT_EQ(result.fvalue, 1.0);
@@ -483,41 +461,31 @@ TEST(GpuHist, EvaluateSplits) {
  tparam.UpdateAllowUnknown(Args{});
  GPUTrainingParam param{tparam};

-  thrust::device_vector<bst_feature_t> feature_set =
-      std::vector<bst_feature_t>{0, 1};
-  thrust::device_vector<uint32_t> feature_segments =
-      std::vector<bst_row_t>{0, 2, 4};
-  thrust::device_vector<float> feature_values =
-      std::vector<float>{1.0, 2.0, 11.0, 12.0};
-  thrust::device_vector<float> feature_min_values =
-      std::vector<float>{0.0, 0.0};
-  auto feature_histogram_left = ConvertToInteger({          {-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
-  auto feature_histogram_right = ConvertToInteger({          {-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
-  EvaluateSplitInputs input_left{
-      1,0,
-      parent_sum,
-      dh::ToSpan(feature_set),
-      dh::ToSpan(feature_histogram_left)};
-  EvaluateSplitInputs input_right{
-      2,0,
-      parent_sum,
-      dh::ToSpan(feature_set),
-      dh::ToSpan(feature_histogram_right)};
-  EvaluateSplitSharedInputs shared_inputs{
-      param,
-      quantiser,
-      {},
-      dh::ToSpan(feature_segments),
-      dh::ToSpan(feature_values),
-      dh::ToSpan(feature_min_values),
-      false
-  };
+  thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0, 1};
+  thrust::device_vector<uint32_t> feature_segments = std::vector<bst_row_t>{0, 2, 4};
+  thrust::device_vector<float> feature_values = std::vector<float>{1.0, 2.0, 11.0, 12.0};
+  thrust::device_vector<float> feature_min_values = std::vector<float>{0.0, 0.0};
+  auto feature_histogram_left =
+      ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
+  auto feature_histogram_right =
+      ConvertToInteger({{-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
+  EvaluateSplitInputs input_left{1, 0, parent_sum, dh::ToSpan(feature_set),
+                                 dh::ToSpan(feature_histogram_left)};
+  EvaluateSplitInputs input_right{2, 0, parent_sum, dh::ToSpan(feature_set),
+                                  dh::ToSpan(feature_histogram_right)};
+  EvaluateSplitSharedInputs shared_inputs{param,
+                                          quantiser,
+                                          {},
+                                          dh::ToSpan(feature_segments),
+                                          dh::ToSpan(feature_values),
+                                          dh::ToSpan(feature_min_values),
+                                          false};

-  GPUHistEvaluator evaluator{
-      tparam, static_cast<bst_feature_t>(feature_min_values.size()), 0};
-  dh::device_vector<EvaluateSplitInputs> inputs = std::vector<EvaluateSplitInputs>{input_left,input_right};
-  evaluator.LaunchEvaluateSplits(input_left.feature_set.size(),dh::ToSpan(inputs),shared_inputs, evaluator.GetEvaluator(),
-                           dh::ToSpan(out_splits));
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_min_values.size()), 0};
+  dh::device_vector<EvaluateSplitInputs> inputs =
+      std::vector<EvaluateSplitInputs>{input_left, input_right};
+  evaluator.LaunchEvaluateSplits(input_left.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
+                                 evaluator.GetEvaluator(), dh::ToSpan(out_splits));

  DeviceSplitCandidate result_left = out_splits[0];
  EXPECT_EQ(result_left.findex, 1);
@@ -536,18 +504,19 @@ TEST_F(TestPartitionBasedSplit, GpuHist) {
  cuts_.cut_values_.SetDevice(0);
  cuts_.min_vals_.SetDevice(0);

-  evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, 0);
+  evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, 0);

  // Convert the sample histogram to fixed point
  auto quantiser = DummyRoundingFactor();
  thrust::host_vector<GradientPairInt64> h_hist;
-  for(auto e: hist_[0]){
+  for (auto e : hist_[0]) {
    h_hist.push_back(quantiser.ToFixedPoint(e));
  }
  dh::device_vector<GradientPairInt64> d_hist = h_hist;
  dh::device_vector<bst_feature_t> feature_set{std::vector<bst_feature_t>{0}};

-  EvaluateSplitInputs input{0, 0, quantiser.ToFixedPoint(total_gpair_), dh::ToSpan(feature_set), dh::ToSpan(d_hist)};
+  EvaluateSplitInputs input{0, 0, quantiser.ToFixedPoint(total_gpair_), dh::ToSpan(feature_set),
+                            dh::ToSpan(d_hist)};
  EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param_},
                                          quantiser,
                                          dh::ToSpan(ft),
@@ -558,5 +527,65 @@ TEST_F(TestPartitionBasedSplit, GpuHist) {
  auto split = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
  ASSERT_NEAR(split.loss_chg, best_score_, 1e-2);
 }
+
+class MGPUHistTest : public BaseMGPUTest {};
+
+namespace {
+void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
+  auto rank = collective::GetRank();
+  auto quantiser = DummyRoundingFactor();
+  auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
+  TrainParam tparam = ZeroParam();
+  GPUTrainingParam param{tparam};
+
+  common::HistogramCuts cuts{rank == 0
+                                 ? MakeCutsForTest({1.0, 2.0}, {0, 2, 2}, {0.0, 0.0}, GPUIDX)
+                                 : MakeCutsForTest({11.0, 12.0}, {0, 0, 2}, {0.0, 0.0}, GPUIDX)};
+  thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0, 1};
+
+  // Set up gradients so that the second feature gets the higher gain
+  auto feature_histogram = rank == 0 ? ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}})
+                                     : ConvertToInteger({{-1.0, 0.5}, {1.0, 0.5}});
+
+  dh::device_vector<FeatureType> feature_types(feature_set.size(), FeatureType::kCategorical);
+  common::Span<FeatureType> d_feature_types;
+  if (is_categorical) {
+    auto max_cat = *std::max_element(cuts.cut_values_.HostVector().begin(),
+                                     cuts.cut_values_.HostVector().end());
+    cuts.SetCategorical(true, max_cat);
+    d_feature_types = dh::ToSpan(feature_types);
+  }
+
+  EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
+                            dh::ToSpan(feature_histogram)};
+  EvaluateSplitSharedInputs shared_inputs{param,
+                                          quantiser,
+                                          d_feature_types,
+                                          cuts.cut_ptrs_.ConstDeviceSpan(),
+                                          cuts.cut_values_.ConstDeviceSpan(),
+                                          cuts.min_vals_.ConstDeviceSpan(),
+                                          false};
+
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), GPUIDX};
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, GPUIDX);
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+
+  EXPECT_EQ(result.findex, 1) << "rank: " << rank;
+  if (is_categorical) {
+    ASSERT_TRUE(std::isnan(result.fvalue));
+  } else {
+    EXPECT_EQ(result.fvalue, 11.0) << "rank: " << rank;
+  }
+  EXPECT_EQ(result.left_sum + result.right_sum, parent_sum) << "rank: " << rank;
+}
+}  // anonymous namespace
+
+TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
+  DoTest(VerifyColumnSplitEvaluateSingleSplit, false);
+}
+
+TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleCategoricalSplit) {
+  DoTest(VerifyColumnSplitEvaluateSingleSplit, true);
+}
 }  // namespace tree
 }  // namespace xgboost
--- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
+++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
@@ -43,7 +43,8 @@ void VerifySampling(size_t page_size,
    EXPECT_NE(page->n_rows, kRows);
  }

-  GradientBasedSampler sampler(&ctx, page, kRows, param, subsample, sampling_method);
+  GradientBasedSampler sampler(&ctx, kRows, param, subsample, sampling_method,
+                               !fixed_size_sampling);
  auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get());

  if (fixed_size_sampling) {
@@ -97,7 +98,7 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
  auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
  EXPECT_NE(page->n_rows, kRows);

-  GradientBasedSampler sampler(&ctx, page, kRows, param, kSubsample, TrainParam::kUniform);
+  GradientBasedSampler sampler(&ctx, kRows, param, kSubsample, TrainParam::kUniform, true);
  auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get());
  auto sampled_page = sample.page;
  EXPECT_EQ(sample.sample_rows, kRows);
@@ -145,7 +146,8 @@ TEST(GradientBasedSampler, GradientBasedSampling) {
  constexpr size_t kPageSize = 0;
  constexpr float kSubsample = 0.8;
  constexpr int kSamplingMethod = TrainParam::kGradientBased;
-  VerifySampling(kPageSize, kSubsample, kSamplingMethod);
+  constexpr bool kFixedSizeSampling = true;
+  VerifySampling(kPageSize, kSubsample, kSamplingMethod, kFixedSizeSampling);
 }

 TEST(GradientBasedSampler, GradientBasedSamplingExternalMemory) {
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -44,7 +44,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
    FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size,
                                 sizeof(GradientPairInt64));

-    auto quantiser = GradientQuantiser(gpair.DeviceSpan());
+    auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
    BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
                           feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx, d_histogram,
                           quantiser);
@@ -64,7 +64,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
      dh::device_vector<GradientPairInt64> new_histogram(num_bins);
      auto d_new_histogram = dh::ToSpan(new_histogram);

-      auto quantiser = GradientQuantiser(gpair.DeviceSpan());
+      auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
      BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
                             feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx,
                             d_new_histogram, quantiser);
@@ -154,7 +154,7 @@ void TestGPUHistogramCategorical(size_t num_categories) {
  dh::device_vector<GradientPairInt64> cat_hist(num_categories);
  auto gpair = GenerateRandomGradients(kRows, 0, 2);
  gpair.SetDevice(0);
-  auto quantiser = GradientQuantiser(gpair.DeviceSpan());
+  auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
  /**
   * Generate hist with cat data.
   */
--- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
+++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
@@ -34,7 +34,7 @@ void TestUpdatePositionBatch() {
  std::vector<int> extra_data = {0};
  // Send the first five training instances to the right node
  // and the second 5 to the left node
-  rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int) {
+  rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int, int) {
    return ridx > 4;
  });
  rows = rp.GetRowsHost(1);
@@ -47,7 +47,7 @@ void TestUpdatePositionBatch() {
  }

  // Split the left node again
-  rp.UpdatePositionBatch({1}, {3}, {4}, extra_data,[=] __device__(RowPartitioner::RowIndexT ridx, int) {
+  rp.UpdatePositionBatch({1}, {3}, {4}, extra_data,[=] __device__(RowPartitioner::RowIndexT ridx, int, int) {
    return ridx < 7;
  });
  EXPECT_EQ(rp.GetRows(3).size(), 2);
@@ -61,7 +61,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
  thrust::device_vector<uint32_t> ridx_tmp(ridx_in.size());
  thrust::device_vector<bst_uint> counts(segments.size());

-  auto op = [=] __device__(auto ridx, int data) { return ridx % 2 == 0; };
+  auto op = [=] __device__(auto ridx, int split_index, int data) { return ridx % 2 == 0; };
  std::vector<int> op_data(segments.size());
  std::vector<PerNodeData<int>> h_batch_info(segments.size());
  dh::TemporaryArray<PerNodeData<int>> d_batch_info(segments.size());
@@ -84,7 +84,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
  dh::device_vector<int8_t> tmp;
  SortPositionBatch<uint32_t, decltype(op), int>(dh::ToSpan(d_batch_info), dh::ToSpan(ridx),
                                                 dh::ToSpan(ridx_tmp), dh::ToSpan(counts),
-                                                 total_rows, op, &tmp, nullptr);
+                                                 total_rows, op, &tmp);

  auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; };
  for (size_t i = 0; i < segments.size(); i++) {
--- a/tests/cpp/tree/hist/test_evaluate_splits.cc
+++ b/tests/cpp/tree/hist/test_evaluate_splits.cc
@@ -4,13 +4,13 @@
 #include "../test_evaluate_splits.h"

 #include <gtest/gtest.h>
-#include <xgboost/base.h>                               // for GradientPairPrecise, Args, Gradie...
-#include <xgboost/context.h>                            // for Context
-#include <xgboost/data.h>                               // for FeatureType, DMatrix, MetaInfo
-#include <xgboost/logging.h>                            // for CHECK_EQ
-#include <xgboost/tree_model.h>                         // for RegTree, RTreeNodeStat
+#include <xgboost/base.h>        // for GradientPairPrecise, Args, Gradie...
+#include <xgboost/context.h>     // for Context
+#include <xgboost/data.h>        // for FeatureType, DMatrix, MetaInfo
+#include <xgboost/logging.h>     // for CHECK_EQ
+#include <xgboost/tree_model.h>  // for RegTree, RTreeNodeStat

-#include <memory>                                       // for make_shared, shared_ptr, addressof
+#include <memory>  // for make_shared, shared_ptr, addressof

 #include "../../../../src/common/hist_util.h"           // for HistCollection, HistogramCuts
 #include "../../../../src/common/random.h"              // for ColumnSampler
@@ -18,6 +18,8 @@
 #include "../../../../src/data/gradient_index.h"        // for GHistIndexMatrix
 #include "../../../../src/tree/hist/evaluate_splits.h"  // for HistEvaluator
 #include "../../../../src/tree/hist/expand_entry.h"     // for CPUExpandEntry
+#include "../../../../src/tree/hist/hist_cache.h"       // for BoundedHistCollection
+#include "../../../../src/tree/hist/param.h"            // for HistMakerTrainParam
 #include "../../../../src/tree/param.h"                 // for GradStats, TrainParam
 #include "../../helpers.h"                              // for RandomDataGenerator, AllThreadsFo...

@@ -34,7 +36,7 @@ void TestEvaluateSplits(bool force_read_by_column) {
  auto dmat = RandomDataGenerator(kRows, kCols, 0).Seed(3).GenerateDMatrix();

  auto evaluator = HistEvaluator{&ctx, &param, dmat->Info(), sampler};
-  common::HistCollection hist;
+  BoundedHistCollection hist;
  std::vector<GradientPair> row_gpairs = {
      {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f},  {2.27f, 0.28f},
      {0.27f, 0.29f}, {0.37f, 0.39f}, {-0.47f, 0.49f}, {0.57f, 0.59f}};
@@ -48,12 +50,10 @@ void TestEvaluateSplits(bool force_read_by_column) {
  std::iota(row_indices.begin(), row_indices.end(), 0);
  row_set_collection.Init();

-  auto hist_builder = common::GHistBuilder(gmat.cut.Ptrs().back());
-  hist.Init(gmat.cut.Ptrs().back());
-  hist.AddHistRow(0);
-  hist.AllocateAllData();
-  hist_builder.template BuildHist<false>(row_gpairs, row_set_collection[0],
-                                         gmat, hist[0], force_read_by_column);
+  HistMakerTrainParam hist_param;
+  hist.Reset(gmat.cut.Ptrs().back(), hist_param.max_cached_hist_node);
+  hist.AllocateHistograms({0});
+  common::BuildHist<false>(row_gpairs, row_set_collection[0], gmat, hist[0], force_read_by_column);

  // Compute total gradient for all data points
  GradientPairPrecise total_gpair;
@@ -113,13 +113,13 @@ TEST(HistMultiEvaluator, Evaluate) {
      RandomDataGenerator{n_samples, n_features, 0.5}.Targets(n_targets).GenerateDMatrix(true);

  HistMultiEvaluator evaluator{&ctx, p_fmat->Info(), &param, sampler};
-  std::vector<common::HistCollection> histogram(n_targets);
+  HistMakerTrainParam hist_param;
+  std::vector<BoundedHistCollection> histogram(n_targets);
  linalg::Vector<GradientPairPrecise> root_sum({2}, Context::kCpuId);
  for (bst_target_t t{0}; t < n_targets; ++t) {
    auto &hist = histogram[t];
-    hist.Init(n_bins * n_features);
-    hist.AddHistRow(0);
-    hist.AllocateAllData();
+    hist.Reset(n_bins * n_features, hist_param.max_cached_hist_node);
+    hist.AllocateHistograms({0});
    auto node_hist = hist[0];
    node_hist[0] = {-0.5, 0.5};
    node_hist[1] = {2.0, 0.5};
@@ -145,7 +145,7 @@ TEST(HistMultiEvaluator, Evaluate) {

  std::vector<MultiExpandEntry> entries(1, {/*nidx=*/0, /*depth=*/0});

-  std::vector<common::HistCollection const *> ptrs;
+  std::vector<BoundedHistCollection const *> ptrs;
  std::transform(histogram.cbegin(), histogram.cend(), std::back_inserter(ptrs),
                 [](auto const &h) { return std::addressof(h); });

@@ -227,16 +227,16 @@ auto CompareOneHotAndPartition(bool onehot) {
  auto sampler = std::make_shared<common::ColumnSampler>();
  auto evaluator = HistEvaluator{&ctx, &param, dmat->Info(), sampler};
  std::vector<CPUExpandEntry> entries(1);
+  HistMakerTrainParam hist_param;

  for (auto const &gmat : dmat->GetBatches<GHistIndexMatrix>(&ctx, {32, param.sparse_threshold})) {
-    common::HistCollection hist;
+    BoundedHistCollection hist;

    entries.front().nid = 0;
    entries.front().depth = 0;

-    hist.Init(gmat.cut.TotalBins());
-    hist.AddHistRow(0);
-    hist.AllocateAllData();
+    hist.Reset(gmat.cut.TotalBins(), hist_param.max_cached_hist_node);
+    hist.AllocateHistograms({0});
    auto node_hist = hist[0];

    CHECK_EQ(node_hist.size(), n_cats);
@@ -263,10 +263,10 @@ TEST(HistEvaluator, Categorical) {
 }

 TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
-  common::HistCollection hist;
-  hist.Init(cuts_.TotalBins());
-  hist.AddHistRow(0);
-  hist.AllocateAllData();
+  BoundedHistCollection hist;
+  HistMakerTrainParam hist_param;
+  hist.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node);
+  hist.AllocateHistograms({0});
  auto node_hist = hist[0];
  ASSERT_EQ(node_hist.size(), feature_histogram_.size());
  std::copy(feature_histogram_.cbegin(), feature_histogram_.cend(), node_hist.begin());
--- a/tests/cpp/tree/hist/test_histogram.cc
+++ b/tests/cpp/tree/hist/test_histogram.cc
@@ -2,19 +2,40 @@
 * Copyright 2018-2023 by Contributors
 */
 #include <gtest/gtest.h>
-#include <xgboost/context.h>  // Context
+#include <xgboost/base.h>                // for bst_node_t, bst_bin_t, Gradient...
+#include <xgboost/context.h>             // for Context
+#include <xgboost/data.h>                // for BatchIterator, BatchSet, DMatrix
+#include <xgboost/host_device_vector.h>  // for HostDeviceVector
+#include <xgboost/linalg.h>              // for MakeTensorView
+#include <xgboost/logging.h>             // for Error, LogCheck_EQ, LogCheck_LT
+#include <xgboost/span.h>                // for Span, operator!=
+#include <xgboost/tree_model.h>          // for RegTree

-#include <limits>
+#include <algorithm>   // for max
+#include <cstddef>     // for size_t
+#include <cstdint>     // for int32_t, uint32_t
+#include <functional>  // for function
+#include <iterator>    // for back_inserter
+#include <limits>      // for numeric_limits
+#include <memory>      // for shared_ptr, allocator, unique_ptr
+#include <numeric>     // for iota, accumulate
+#include <vector>      // for vector

-#include "../../../../src/common/categorical.h"
-#include "../../../../src/common/row_set.h"
-#include "../../../../src/tree/hist/expand_entry.h"
-#include "../../../../src/tree/hist/histogram.h"
-#include "../../categorical_helpers.h"
-#include "../../helpers.h"
+#include "../../../../src/collective/communicator-inl.h"  // for GetRank, GetWorldSize
+#include "../../../../src/common/hist_util.h"             // for GHistRow, HistogramCuts, Sketch...
+#include "../../../../src/common/ref_resource_view.h"     // for RefResourceView
+#include "../../../../src/common/row_set.h"               // for RowSetCollection
+#include "../../../../src/common/threading_utils.h"       // for BlockedSpace2d
+#include "../../../../src/data/gradient_index.h"          // for GHistIndexMatrix
+#include "../../../../src/tree/common_row_partitioner.h"  // for CommonRowPartitioner
+#include "../../../../src/tree/hist/expand_entry.h"       // for CPUExpandEntry
+#include "../../../../src/tree/hist/hist_cache.h"         // for BoundedHistCollection
+#include "../../../../src/tree/hist/histogram.h"          // for HistogramBuilder
+#include "../../../../src/tree/hist/param.h"              // for HistMakerTrainParam
+#include "../../categorical_helpers.h"                    // for OneHotEncodeFeature
+#include "../../helpers.h"                                // for RandomDataGenerator, GenerateRa...

-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 namespace {
 void InitRowPartitionForTest(common::RowSetCollection *row_set, size_t n_samples, size_t base_rowid = 0) {
  auto &row_indices = *row_set->Data();
@@ -26,10 +47,8 @@ void InitRowPartitionForTest(common::RowSetCollection *row_set, size_t n_samples

 void TestAddHistRows(bool is_distributed) {
  Context ctx;
-  std::vector<CPUExpandEntry> nodes_for_explicit_hist_build_;
-  std::vector<CPUExpandEntry> nodes_for_subtraction_trick_;
-  int starting_index = std::numeric_limits<int>::max();
-  int sync_count = 0;
+  std::vector<bst_node_t> nodes_to_build;
+  std::vector<bst_node_t> nodes_to_sub;

  size_t constexpr kNRows = 8, kNCols = 16;
  int32_t constexpr kMaxBins = 4;
@@ -42,26 +61,22 @@ void TestAddHistRows(bool is_distributed) {
  tree.ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
  tree.ExpandNode(tree[0].LeftChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
  tree.ExpandNode(tree[0].RightChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
-  nodes_for_explicit_hist_build_.emplace_back(3, tree.GetDepth(3));
-  nodes_for_explicit_hist_build_.emplace_back(4, tree.GetDepth(4));
-  nodes_for_subtraction_trick_.emplace_back(5, tree.GetDepth(5));
-  nodes_for_subtraction_trick_.emplace_back(6, tree.GetDepth(6));
+  nodes_to_build.emplace_back(3);
+  nodes_to_build.emplace_back(4);
+  nodes_to_sub.emplace_back(5);
+  nodes_to_sub.emplace_back(6);

-  HistogramBuilder<CPUExpandEntry> histogram_builder;
-  histogram_builder.Reset(gmat.cut.TotalBins(), {kMaxBins, 0.5}, omp_get_max_threads(), 1,
-                          is_distributed, false);
-  histogram_builder.AddHistRows(&starting_index, &sync_count,
-                                nodes_for_explicit_hist_build_,
-                                nodes_for_subtraction_trick_, &tree);
+  HistMakerTrainParam hist_param;
+  HistogramBuilder histogram_builder;
+  histogram_builder.Reset(&ctx, gmat.cut.TotalBins(), {kMaxBins, 0.5}, is_distributed, false,
+                          &hist_param);
+  histogram_builder.AddHistRows(&tree, &nodes_to_build, &nodes_to_sub, false);

-  ASSERT_EQ(sync_count, 2);
-  ASSERT_EQ(starting_index, 3);
-
-  for (const CPUExpandEntry &node : nodes_for_explicit_hist_build_) {
-    ASSERT_EQ(histogram_builder.Histogram().RowExists(node.nid), true);
+  for (bst_node_t const &nidx : nodes_to_build) {
+    ASSERT_TRUE(histogram_builder.Histogram().HistogramExists(nidx));
  }
-  for (const CPUExpandEntry &node : nodes_for_subtraction_trick_) {
-    ASSERT_EQ(histogram_builder.Histogram().RowExists(node.nid), true);
+  for (bst_node_t const &nidx : nodes_to_sub) {
+    ASSERT_TRUE(histogram_builder.Histogram().HistogramExists(nidx));
  }
 }

@@ -72,87 +87,77 @@ TEST(CPUHistogram, AddRows) {
 }

 void TestSyncHist(bool is_distributed) {
-  size_t constexpr kNRows = 8, kNCols = 16;
-  int32_t constexpr kMaxBins = 4;
+  std::size_t constexpr kNRows = 8, kNCols = 16;
+  bst_bin_t constexpr kMaxBins = 4;
  Context ctx;

-  std::vector<CPUExpandEntry> nodes_for_explicit_hist_build_;
-  std::vector<CPUExpandEntry> nodes_for_subtraction_trick_;
-  int starting_index = std::numeric_limits<int>::max();
-  int sync_count = 0;
+  std::vector<bst_node_t> nodes_for_explicit_hist_build;
+  std::vector<bst_node_t> nodes_for_subtraction_trick;
  RegTree tree;

  auto p_fmat = RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
  auto const &gmat =
      *(p_fmat->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{kMaxBins, 0.5}).begin());

-  HistogramBuilder<CPUExpandEntry> histogram;
+  HistogramBuilder histogram;
  uint32_t total_bins = gmat.cut.Ptrs().back();
-  histogram.Reset(total_bins, {kMaxBins, 0.5}, omp_get_max_threads(), 1, is_distributed, false);
+  HistMakerTrainParam hist_param;
+  histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, false, &hist_param);

-  common::RowSetCollection row_set_collection_;
+  common::RowSetCollection row_set_collection;
  {
-    row_set_collection_.Clear();
-    std::vector<size_t> &row_indices = *row_set_collection_.Data();
+    row_set_collection.Clear();
+    std::vector<size_t> &row_indices = *row_set_collection.Data();
    row_indices.resize(kNRows);
    std::iota(row_indices.begin(), row_indices.end(), 0);
-    row_set_collection_.Init();
+    row_set_collection.Init();
  }

  // level 0
-  nodes_for_explicit_hist_build_.emplace_back(0, tree.GetDepth(0));
-  histogram.AddHistRows(&starting_index, &sync_count,
-                        nodes_for_explicit_hist_build_,
-                        nodes_for_subtraction_trick_, &tree);
+  nodes_for_explicit_hist_build.emplace_back(0);
+  histogram.AddHistRows(&tree, &nodes_for_explicit_hist_build, &nodes_for_subtraction_trick, false);

  tree.ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
-  nodes_for_explicit_hist_build_.clear();
-  nodes_for_subtraction_trick_.clear();
+  nodes_for_explicit_hist_build.clear();
+  nodes_for_subtraction_trick.clear();

  // level 1
-  nodes_for_explicit_hist_build_.emplace_back(tree[0].LeftChild(), tree.GetDepth(1));
-  nodes_for_subtraction_trick_.emplace_back(tree[0].RightChild(), tree.GetDepth(2));
+  nodes_for_explicit_hist_build.emplace_back(tree[0].LeftChild());
+  nodes_for_subtraction_trick.emplace_back(tree[0].RightChild());

-  histogram.AddHistRows(&starting_index, &sync_count,
-                        nodes_for_explicit_hist_build_,
-                        nodes_for_subtraction_trick_, &tree);
+  histogram.AddHistRows(&tree, &nodes_for_explicit_hist_build, &nodes_for_subtraction_trick, false);

  tree.ExpandNode(tree[0].LeftChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
  tree.ExpandNode(tree[0].RightChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);

-  nodes_for_explicit_hist_build_.clear();
-  nodes_for_subtraction_trick_.clear();
+  nodes_for_explicit_hist_build.clear();
+  nodes_for_subtraction_trick.clear();
  // level 2
-  nodes_for_explicit_hist_build_.emplace_back(3, tree.GetDepth(3));
-  nodes_for_subtraction_trick_.emplace_back(4, tree.GetDepth(4));
-  nodes_for_explicit_hist_build_.emplace_back(5, tree.GetDepth(5));
-  nodes_for_subtraction_trick_.emplace_back(6, tree.GetDepth(6));
+  nodes_for_explicit_hist_build.emplace_back(3);
+  nodes_for_subtraction_trick.emplace_back(4);
+  nodes_for_explicit_hist_build.emplace_back(5);
+  nodes_for_subtraction_trick.emplace_back(6);

-  histogram.AddHistRows(&starting_index, &sync_count,
-                        nodes_for_explicit_hist_build_,
-                        nodes_for_subtraction_trick_, &tree);
+  histogram.AddHistRows(&tree, &nodes_for_explicit_hist_build, &nodes_for_subtraction_trick, false);

-  const size_t n_nodes = nodes_for_explicit_hist_build_.size();
+  const size_t n_nodes = nodes_for_explicit_hist_build.size();
  ASSERT_EQ(n_nodes, 2ul);
-  row_set_collection_.AddSplit(0, tree[0].LeftChild(), tree[0].RightChild(), 4,
-                               4);
-  row_set_collection_.AddSplit(1, tree[1].LeftChild(), tree[1].RightChild(), 2,
-                               2);
-  row_set_collection_.AddSplit(2, tree[2].LeftChild(), tree[2].RightChild(), 2,
-                               2);
+  row_set_collection.AddSplit(0, tree[0].LeftChild(), tree[0].RightChild(), 4, 4);
+  row_set_collection.AddSplit(1, tree[1].LeftChild(), tree[1].RightChild(), 2, 2);
+  row_set_collection.AddSplit(2, tree[2].LeftChild(), tree[2].RightChild(), 2, 2);

  common::BlockedSpace2d space(
      n_nodes,
-      [&](size_t node) {
-        const int32_t nid = nodes_for_explicit_hist_build_[node].nid;
-        return row_set_collection_[nid].Size();
+      [&](std::size_t nidx_in_set) {
+        bst_node_t nidx = nodes_for_explicit_hist_build[nidx_in_set];
+        return row_set_collection[nidx].Size();
      },
      256);

  std::vector<common::GHistRow> target_hists(n_nodes);
-  for (size_t i = 0; i < nodes_for_explicit_hist_build_.size(); ++i) {
-    const int32_t nid = nodes_for_explicit_hist_build_[i].nid;
-    target_hists[i] = histogram.Histogram()[nid];
+  for (size_t i = 0; i < nodes_for_explicit_hist_build.size(); ++i) {
+    bst_node_t nidx = nodes_for_explicit_hist_build[i];
+    target_hists[i] = histogram.Histogram()[nidx];
  }

  // set values to specific nodes hist
@@ -176,14 +181,7 @@ void TestSyncHist(bool is_distributed) {

  histogram.Buffer().Reset(1, n_nodes, space, target_hists);
  // sync hist
-  if (is_distributed) {
-    histogram.SyncHistogramDistributed(&tree, nodes_for_explicit_hist_build_,
-                                       nodes_for_subtraction_trick_,
-                                       starting_index, sync_count);
-  } else {
-    histogram.SyncHistogramLocal(&tree, nodes_for_explicit_hist_build_,
-                                 nodes_for_subtraction_trick_);
-  }
+  histogram.SyncHistogram(&tree, nodes_for_explicit_hist_build, nodes_for_subtraction_trick);

  using GHistRowT = common::GHistRow;
  auto check_hist = [](const GHistRowT parent, const GHistRowT left, const GHistRowT right,
@@ -196,11 +194,10 @@ void TestSyncHist(bool is_distributed) {
    }
  };
  size_t node_id = 0;
-  for (const CPUExpandEntry &node : nodes_for_explicit_hist_build_) {
-    auto this_hist = histogram.Histogram()[node.nid];
-    const size_t parent_id = tree[node.nid].Parent();
-    const size_t subtraction_node_id =
-        nodes_for_subtraction_trick_[node_id].nid;
+  for (auto const &nidx : nodes_for_explicit_hist_build) {
+    auto this_hist = histogram.Histogram()[nidx];
+    const size_t parent_id = tree[nidx].Parent();
+    const size_t subtraction_node_id = nodes_for_subtraction_trick[node_id];
    auto parent_hist = histogram.Histogram()[parent_id];
    auto sibling_hist = histogram.Histogram()[subtraction_node_id];

@@ -208,11 +205,10 @@ void TestSyncHist(bool is_distributed) {
    ++node_id;
  }
  node_id = 0;
-  for (const CPUExpandEntry &node : nodes_for_subtraction_trick_) {
-    auto this_hist = histogram.Histogram()[node.nid];
-    const size_t parent_id = tree[node.nid].Parent();
-    const size_t subtraction_node_id =
-        nodes_for_explicit_hist_build_[node_id].nid;
+  for (auto const &nidx : nodes_for_subtraction_trick) {
+    auto this_hist = histogram.Histogram()[nidx];
+    const size_t parent_id = tree[nidx].Parent();
+    const size_t subtraction_node_id = nodes_for_explicit_hist_build[node_id];
    auto parent_hist = histogram.Histogram()[parent_id];
    auto sibling_hist = histogram.Histogram()[subtraction_node_id];

@@ -246,9 +242,9 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_
      {0.27f, 0.29f}, {0.37f, 0.39f}, {0.47f, 0.49f}, {0.57f, 0.59f}};

  bst_node_t nid = 0;
-  HistogramBuilder<CPUExpandEntry> histogram;
-  histogram.Reset(total_bins, {kMaxBins, 0.5}, omp_get_max_threads(), 1, is_distributed,
-                  is_col_split);
+  HistogramBuilder histogram;
+  HistMakerTrainParam hist_param;
+  histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, is_col_split, &hist_param);

  RegTree tree;

@@ -260,12 +256,17 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_
  row_set_collection.Init();

  CPUExpandEntry node{RegTree::kRoot, tree.GetDepth(0)};
-  std::vector<CPUExpandEntry> nodes_for_explicit_hist_build;
-  nodes_for_explicit_hist_build.push_back(node);
+  std::vector<bst_node_t> nodes_to_build{node.nid};
+  std::vector<bst_node_t> dummy_sub;
+
+  histogram.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false);
+  common::BlockedSpace2d space{
+      1, [&](std::size_t nidx_in_set) { return row_set_collection[nidx_in_set].Size(); }, 256};
  for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(&ctx, {kMaxBins, 0.5})) {
-    histogram.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
-                        gpair, force_read_by_column);
+    histogram.BuildHist(0, space, gidx, row_set_collection, nodes_to_build,
+                        linalg::MakeTensorView(&ctx, gpair, gpair.size()), force_read_by_column);
  }
+  histogram.SyncHistogram(&tree, nodes_to_build, {});

  // Check if number of histogram bins is correct
  ASSERT_EQ(histogram.Histogram()[nid].size(), gmat.cut.Ptrs().back());
@@ -326,18 +327,18 @@ void ValidateCategoricalHistogram(size_t n_categories,

 void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
  size_t constexpr kRows = 340;
-  int32_t constexpr kBins = 256;
+  bst_bin_t constexpr kBins = 256;
  auto x = GenerateRandomCategoricalSingleColumn(kRows, n_categories);
  auto cat_m = GetDMatrixFromData(x, kRows, 1);
  cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
  Context ctx;

-  BatchParam batch_param{0, static_cast<int32_t>(kBins)};
+  BatchParam batch_param{0, kBins};

  RegTree tree;
-  CPUExpandEntry node{RegTree::kRoot, tree.GetDepth(0)};
-  std::vector<CPUExpandEntry> nodes_for_explicit_hist_build;
-  nodes_for_explicit_hist_build.push_back(node);
+  CPUExpandEntry node{RegTree::kRoot, tree.GetDepth(RegTree::kRoot)};
+  std::vector<bst_node_t> nodes_to_build;
+  nodes_to_build.push_back(node.nid);

  auto gpair = GenerateRandomGradients(kRows, 0, 2);

@@ -347,30 +348,41 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
  row_indices.resize(kRows);
  std::iota(row_indices.begin(), row_indices.end(), 0);
  row_set_collection.Init();
+  HistMakerTrainParam hist_param;
+  std::vector<bst_node_t> dummy_sub;
+
+  common::BlockedSpace2d space{
+      1, [&](std::size_t nidx_in_set) { return row_set_collection[nidx_in_set].Size(); }, 256};

  /**
   * Generate hist with cat data.
   */
-  HistogramBuilder<CPUExpandEntry> cat_hist;
+  HistogramBuilder cat_hist;
  for (auto const &gidx : cat_m->GetBatches<GHistIndexMatrix>(&ctx, {kBins, 0.5})) {
    auto total_bins = gidx.cut.TotalBins();
-    cat_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false, false);
-    cat_hist.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
-                       gpair.HostVector(), force_read_by_column);
+    cat_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, &hist_param);
+    cat_hist.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false);
+    cat_hist.BuildHist(0, space, gidx, row_set_collection, nodes_to_build,
+                       linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()),
+                       force_read_by_column);
  }
+  cat_hist.SyncHistogram(&tree, nodes_to_build, {});

  /**
   * Generate hist with one hot encoded data.
   */
  auto x_encoded = OneHotEncodeFeature(x, n_categories);
  auto encode_m = GetDMatrixFromData(x_encoded, kRows, n_categories);
-  HistogramBuilder<CPUExpandEntry> onehot_hist;
+  HistogramBuilder onehot_hist;
  for (auto const &gidx : encode_m->GetBatches<GHistIndexMatrix>(&ctx, {kBins, 0.5})) {
    auto total_bins = gidx.cut.TotalBins();
-    onehot_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false, false);
-    onehot_hist.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
-                          gpair.HostVector(), force_read_by_column);
+    onehot_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, &hist_param);
+    onehot_hist.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false);
+    onehot_hist.BuildHist(0, space, gidx, row_set_collection, nodes_to_build,
+                          linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()),
+                          force_read_by_column);
  }
+  onehot_hist.SyncHistogram(&tree, nodes_to_build, {});

  auto cat = cat_hist.Histogram()[0];
  auto onehot = onehot_hist.Histogram()[0];
@@ -397,19 +409,22 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo
    batch_param.hess = hess;
  }

-  std::vector<size_t> partition_size(1, 0);
-  size_t total_bins{0};
-  size_t n_samples{0};
+  std::vector<std::size_t> partition_size(1, 0);
+  bst_bin_t total_bins{0};
+  bst_row_t n_samples{0};

  auto gpair = GenerateRandomGradients(m->Info().num_row_, 0.0, 1.0);
  auto const &h_gpair = gpair.HostVector();

  RegTree tree;
-  std::vector<CPUExpandEntry> nodes;
-  nodes.emplace_back(0, tree.GetDepth(0));
+  std::vector<bst_node_t> nodes{RegTree::kRoot};
+  common::BlockedSpace2d space{
+      1, [&](std::size_t nidx_in_set) { return partition_size.at(nidx_in_set); }, 256};

  common::GHistRow multi_page;
-  HistogramBuilder<CPUExpandEntry> multi_build;
+  HistogramBuilder multi_build;
+  HistMakerTrainParam hist_param;
+  std::vector<bst_node_t> dummy_sub;
  {
    /**
     * Multi page
@@ -427,23 +442,21 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo
    }
    ASSERT_EQ(n_samples, m->Info().num_row_);

-    common::BlockedSpace2d space{
-        1, [&](size_t nidx_in_set) { return partition_size.at(nidx_in_set); },
-        256};
-
-    multi_build.Reset(total_bins, batch_param, ctx->Threads(), rows_set.size(), false, false);
-
-    size_t page_idx{0};
+    multi_build.Reset(ctx, total_bins, batch_param, false, false, &hist_param);
+    multi_build.AddHistRows(&tree, &nodes, &dummy_sub, false);
+    std::size_t page_idx{0};
    for (auto const &page : m->GetBatches<GHistIndexMatrix>(ctx, batch_param)) {
-      multi_build.BuildHist(page_idx, space, page, &tree, rows_set.at(page_idx), nodes, {}, h_gpair,
+      multi_build.BuildHist(page_idx, space, page, rows_set[page_idx], nodes,
+                            linalg::MakeTensorView(ctx, h_gpair, h_gpair.size()),
                            force_read_by_column);
      ++page_idx;
    }
-    ASSERT_EQ(page_idx, 2);
-    multi_page = multi_build.Histogram()[0];
+    multi_build.SyncHistogram(&tree, nodes, {});
+
+    multi_page = multi_build.Histogram()[RegTree::kRoot];
  }

-  HistogramBuilder<CPUExpandEntry> single_build;
+  HistogramBuilder single_build;
  common::GHistRow single_page;
  {
    /**
@@ -452,18 +465,24 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo
    common::RowSetCollection row_set_collection;
    InitRowPartitionForTest(&row_set_collection, n_samples);

-    single_build.Reset(total_bins, batch_param, ctx->Threads(), 1, false, false);
+    single_build.Reset(ctx, total_bins, batch_param, false, false, &hist_param);
    SparsePage concat;
    std::vector<float> hess(m->Info().num_row_, 1.0f);
-    for (auto const& page : m->GetBatches<SparsePage>()) {
+    for (auto const &page : m->GetBatches<SparsePage>()) {
      concat.Push(page);
    }

    auto cut = common::SketchOnDMatrix(ctx, m.get(), batch_param.max_bin, false, hess);
    GHistIndexMatrix gmat(concat, {}, cut, batch_param.max_bin, false,
                          std::numeric_limits<double>::quiet_NaN(), ctx->Threads());
-    single_build.BuildHist(0, gmat, &tree, row_set_collection, nodes, {}, h_gpair, force_read_by_column);
-    single_page = single_build.Histogram()[0];
+
+    single_build.AddHistRows(&tree, &nodes, &dummy_sub, false);
+    single_build.BuildHist(0, space, gmat, row_set_collection, nodes,
+                           linalg::MakeTensorView(ctx, h_gpair, h_gpair.size()),
+                           force_read_by_column);
+    single_build.SyncHistogram(&tree, nodes, {});
+
+    single_page = single_build.Histogram()[RegTree::kRoot];
  }

  for (size_t i = 0; i < single_page.size(); ++i) {
@@ -487,5 +506,108 @@ TEST(CPUHistogram, ExternalMemory) {
  TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, false);
  TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, true);
 }
-}  // namespace tree
-}  // namespace xgboost
+
+namespace {
+class OverflowTest : public ::testing::TestWithParam<std::tuple<bool, bool>> {
+ public:
+  std::vector<GradientPairPrecise> TestOverflow(bool limit, bool is_distributed,
+                                                bool is_col_split) {
+    bst_bin_t constexpr kBins = 256;
+    Context ctx;
+    HistMakerTrainParam hist_param;
+    if (limit) {
+      hist_param.Init(Args{{"max_cached_hist_node", "1"}});
+    }
+
+    std::shared_ptr<DMatrix> Xy =
+        is_col_split ? RandomDataGenerator{8192, 16, 0.5}.GenerateDMatrix(true)
+                     : RandomDataGenerator{8192, 16, 0.5}.Bins(kBins).GenerateQuantileDMatrix(true);
+    if (is_col_split) {
+      Xy =
+          std::shared_ptr<DMatrix>{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
+    }
+
+    double sparse_thresh{TrainParam::DftSparseThreshold()};
+    auto batch = BatchParam{kBins, sparse_thresh};
+    bst_bin_t n_total_bins{0};
+    float split_cond{0};
+    for (auto const &page : Xy->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
+      n_total_bins = page.cut.TotalBins();
+      // use a cut point in the second column for split
+      split_cond = page.cut.Values()[kBins + kBins / 2];
+    }
+
+    RegTree tree;
+    MultiHistogramBuilder hist_builder;
+    CHECK_EQ(Xy->Info().IsColumnSplit(), is_col_split);
+
+    hist_builder.Reset(&ctx, n_total_bins, tree.NumTargets(), batch, is_distributed,
+                       Xy->Info().IsColumnSplit(), &hist_param);
+
+    std::vector<CommonRowPartitioner> partitioners;
+    partitioners.emplace_back(&ctx, Xy->Info().num_row_, /*base_rowid=*/0,
+                              Xy->Info().IsColumnSplit());
+
+    auto gpair = GenerateRandomGradients(Xy->Info().num_row_, 0.0, 1.0);
+
+    CPUExpandEntry best;
+    hist_builder.BuildRootHist(Xy.get(), &tree, partitioners,
+                               linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size(), 1),
+                               best, batch);
+
+    best.split.Update(1.0f, 1, split_cond, false, false, GradStats{1.0, 1.0}, GradStats{1.0, 1.0});
+    tree.ExpandNode(best.nid, best.split.SplitIndex(), best.split.split_value, false,
+                    /*base_weight=*/2.0f,
+                    /*left_leaf_weight=*/1.0f, /*right_leaf_weight=*/1.0f, best.GetLossChange(),
+                    /*sum_hess=*/2.0f, best.split.left_sum.GetHess(),
+                    best.split.right_sum.GetHess());
+
+    std::vector<CPUExpandEntry> valid_candidates{best};
+    for (auto const &page : Xy->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
+      partitioners.front().UpdatePosition(&ctx, page, valid_candidates, &tree);
+    }
+    CHECK_NE(partitioners.front()[tree.LeftChild(best.nid)].Size(), 0);
+    CHECK_NE(partitioners.front()[tree.RightChild(best.nid)].Size(), 0);
+
+    hist_builder.BuildHistLeftRight(
+        Xy.get(), &tree, partitioners, valid_candidates,
+        linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size(), 1), batch);
+
+    if (limit) {
+      CHECK(!hist_builder.Histogram(0).HistogramExists(best.nid));
+    } else {
+      CHECK(hist_builder.Histogram(0).HistogramExists(best.nid));
+    }
+
+    std::vector<GradientPairPrecise> result;
+    auto hist = hist_builder.Histogram(0)[tree.LeftChild(best.nid)];
+    std::copy(hist.cbegin(), hist.cend(), std::back_inserter(result));
+    hist = hist_builder.Histogram(0)[tree.RightChild(best.nid)];
+    std::copy(hist.cbegin(), hist.cend(), std::back_inserter(result));
+
+    return result;
+  }
+
+  void RunTest() {
+    auto param = GetParam();
+    auto res0 = this->TestOverflow(false, std::get<0>(param), std::get<1>(param));
+    auto res1 = this->TestOverflow(true, std::get<0>(param), std::get<1>(param));
+    ASSERT_EQ(res0, res1);
+  }
+};
+
+auto MakeParamsForTest() {
+  std::vector<std::tuple<bool, bool>> configs;
+  for (auto i : {true, false}) {
+    for (auto j : {true, false}) {
+      configs.emplace_back(i, j);
+    }
+  }
+  return configs;
+}
+}  // anonymous namespace
+
+TEST_P(OverflowTest, Overflow) { this->RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(CPUHistogram, OverflowTest, ::testing::ValuesIn(MakeParamsForTest()));
+}  // namespace xgboost::tree
--- a/tests/cpp/tree/test_constraints.cu
+++ b/tests/cpp/tree/test_constraints.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
 */
 #include <gtest/gtest.h>
 #include <thrust/copy.h>
@@ -59,7 +59,7 @@ void CompareBitField(LBitField64 d_field, std::set<uint32_t> positions) {
  LBitField64 h_field{ {h_field_storage.data(),
                        h_field_storage.data() + h_field_storage.size()} };

-  for (size_t i = 0; i < h_field.Size(); ++i) {
+  for (size_t i = 0; i < h_field.Capacity(); ++i) {
    if (positions.find(i) != positions.cend()) {
      ASSERT_TRUE(h_field.Check(i));
    } else {
@@ -88,7 +88,7 @@ TEST(GPUFeatureInteractionConstraint, Init) {
        {h_node_storage.data(), h_node_storage.data() +  h_node_storage.size()}
      };
      // no feature is attached to node.
-      for (size_t i = 0; i < h_node.Size(); ++i) {
+      for (size_t i = 0; i < h_node.Capacity(); ++i) {
        ASSERT_FALSE(h_node.Check(i));
      }
    }
--- a/tests/cpp/tree/test_evaluate_splits.h
+++ b/tests/cpp/tree/test_evaluate_splits.h
@@ -2,22 +2,24 @@
 * Copyright 2022-2023 by XGBoost Contributors
 */
 #include <gtest/gtest.h>
-#include <xgboost/base.h>                       // for GradientPairInternal, GradientPairPrecise
-#include <xgboost/data.h>                       // for MetaInfo
-#include <xgboost/host_device_vector.h>         // for HostDeviceVector
-#include <xgboost/span.h>                       // for operator!=, Span, SpanIterator
+#include <xgboost/base.h>                // for GradientPairInternal, GradientPairPrecise
+#include <xgboost/data.h>                // for MetaInfo
+#include <xgboost/host_device_vector.h>  // for HostDeviceVector
+#include <xgboost/span.h>                // for operator!=, Span, SpanIterator

-#include <algorithm>                            // for max, max_element, next_permutation, copy
-#include <cmath>                                // for isnan
-#include <cstddef>                              // for size_t
-#include <cstdint>                              // for int32_t, uint64_t, uint32_t
-#include <limits>                               // for numeric_limits
-#include <numeric>                              // for iota
-#include <tuple>                                // for make_tuple, tie, tuple
-#include <utility>                              // for pair
-#include <vector>                               // for vector
+#include <algorithm>  // for max, max_element, next_permutation, copy
+#include <cmath>      // for isnan
+#include <cstddef>    // for size_t
+#include <cstdint>    // for int32_t, uint64_t, uint32_t
+#include <limits>     // for numeric_limits
+#include <numeric>    // for iota
+#include <tuple>      // for make_tuple, tie, tuple
+#include <utility>    // for pair
+#include <vector>     // for vector

 #include "../../../src/common/hist_util.h"      // for HistogramCuts, HistCollection, GHistRow
+#include "../../../src/tree/hist/hist_cache.h"  // for BoundedHistCollection
+#include "../../../src/tree/hist/param.h"       // for HistMakerTrainParam
 #include "../../../src/tree/param.h"            // for TrainParam, GradStats
 #include "../../../src/tree/split_evaluator.h"  // for TreeEvaluator
 #include "../helpers.h"                         // for SimpleLCG, SimpleRealUniformDistribution
@@ -35,7 +37,7 @@ class TestPartitionBasedSplit : public ::testing::Test {
  MetaInfo info_;
  float best_score_{-std::numeric_limits<float>::infinity()};
  common::HistogramCuts cuts_;
-  common::HistCollection hist_;
+  BoundedHistCollection hist_;
  GradientPairPrecise total_gpair_;

  void SetUp() override {
@@ -56,9 +58,9 @@ class TestPartitionBasedSplit : public ::testing::Test {

    cuts_.min_vals_.Resize(1);

-    hist_.Init(cuts_.TotalBins());
-    hist_.AddHistRow(0);
-    hist_.AllocateAllData();
+    HistMakerTrainParam hist_param;
+    hist_.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node);
+    hist_.AllocateHistograms({0});
    auto node_hist = hist_[0];

    SimpleLCG lcg;
--- a/tests/cpp/tree/test_fit_stump.cc
+++ b/tests/cpp/tree/test_fit_stump.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2022 by XGBoost Contributors
+ * Copyright 2022-2023, XGBoost Contributors
 */
 #include <gtest/gtest.h>
 #include <xgboost/linalg.h>
@@ -8,17 +8,17 @@
 #include "../../src/tree/fit_stump.h"
 #include "../helpers.h"

-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 namespace {
 void TestFitStump(Context const *ctx, DataSplitMode split = DataSplitMode::kRow) {
  std::size_t constexpr kRows = 16, kTargets = 2;
-  HostDeviceVector<GradientPair> gpair;
-  auto &h_gpair = gpair.HostVector();
-  h_gpair.resize(kRows * kTargets);
+  linalg::Matrix<GradientPair> gpair;
+  gpair.SetDevice(ctx->Device());
+  gpair.Reshape(kRows, kTargets);
+  auto h_gpair = gpair.HostView();
  for (std::size_t i = 0; i < kRows; ++i) {
    for (std::size_t t = 0; t < kTargets; ++t) {
-      h_gpair.at(i * kTargets + t) = GradientPair{static_cast<float>(i), 1};
+      h_gpair(i, t) = GradientPair{static_cast<float>(i), 1};
    }
  }
  linalg::Vector<float> out;
@@ -53,6 +53,4 @@ TEST(InitEstimation, FitStumpColumnSplit) {
  auto constexpr kWorldSize{3};
  RunWithInMemoryCommunicator(kWorldSize, &TestFitStump, &ctx, DataSplitMode::kCol);
 }
-
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -11,16 +11,15 @@
 #include <vector>

 #include "../../../src/common/common.h"
-#include "../../../src/data/sparse_page_source.h"
 #if defined(XGBOOST_USE_CUDA)
-#include "../../../src/tree/constraints.cuh"
+#include "../../../src/data/ellpack_page.cuh"  // for EllpackPageImpl
+#include "../../../src/data/ellpack_page.h"    // for EllpackPage
 #include "../../../src/tree/param.h"  // for TrainParam
-#include "../../../src/tree/updater_gpu_common.cuh"
 #include "../../../src/tree/updater_gpu_hist.cu"
 #elif defined(XGBOOST_USE_HIP)
-#include "../../../src/tree/constraints.hip.h"
+#include "../../../src/data/ellpack_page.hip.h"  // for EllpackPageImpl
+#include "../../../src/data/ellpack_page.h"    // for EllpackPage
 #include "../../../src/tree/param.h"  // for TrainParam
-#include "../../../src/tree/updater_gpu_common.hip.h"
 #include "../../../src/tree/updater_gpu_hist.hip"
 #endif
 #include "../filesystem.h"  // dmlc::TemporaryDirectory
@@ -103,8 +102,9 @@ void TestBuildHist(bool use_shared_memory_histograms) {
  auto page = BuildEllpackPage(kNRows, kNCols);
  BatchParam batch_param{};
  Context ctx{MakeCUDACtx(0)};
-  GPUHistMakerDevice<GradientSumT> maker(&ctx, page.get(), {}, kNRows, param, kNCols, kNCols,
-                                         batch_param);
+  auto cs = std::make_shared<common::ColumnSampler>(0);
+  GPUHistMakerDevice maker(&ctx, /*is_external_memory=*/false, {}, kNRows, param, cs, kNCols,
+                           batch_param, MetaInfo());
  xgboost::SimpleLCG gen;
  xgboost::SimpleRealUniformDistribution<bst_float> dist(0.0f, 1.0f);
  HostDeviceVector<GradientPair> gpair(kNRows);
@@ -116,10 +116,16 @@ void TestBuildHist(bool use_shared_memory_histograms) {
  gpair.SetDevice(0);

  thrust::host_vector<common::CompressedByteT> h_gidx_buffer (page->gidx_buffer.HostVector());
-  maker.row_partitioner.reset(new RowPartitioner(0, kNRows));
+  maker.row_partitioner = std::make_unique<RowPartitioner>(0, kNRows);
+
+  maker.hist.Init(0, page->Cuts().TotalBins());
  maker.hist.AllocateHistograms({0});
+
  maker.gpair = gpair.DeviceSpan();
-  maker.quantiser.reset(new GradientQuantiser(maker.gpair));
+  maker.quantiser = std::make_unique<GradientQuantiser>(maker.gpair, MetaInfo());
+  maker.page = page.get();
+
+  maker.InitFeatureGroupsOnce();

  BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
                         maker.feature_groups->DeviceAccessor(0), gpair.DeviceSpan(),
@@ -143,8 +149,8 @@ void TestBuildHist(bool use_shared_memory_histograms) {
  std::vector<GradientPairPrecise> solution = GetHostHistGpair();
  for (size_t i = 0; i < h_result.size(); ++i) {
    auto result = maker.quantiser->ToFloatingPoint(h_result[i]);
-    EXPECT_NEAR(result.GetGrad(), solution[i].GetGrad(), 0.01f);
-    EXPECT_NEAR(result.GetHess(), solution[i].GetHess(), 0.01f);
+    ASSERT_NEAR(result.GetGrad(), solution[i].GetGrad(), 0.01f);
+    ASSERT_NEAR(result.GetHess(), solution[i].GetHess(), 0.01f);
  }
 }

@@ -176,7 +182,7 @@ HistogramCutsWrapper GetHostCutMatrix () {
 inline GradientQuantiser DummyRoundingFactor() {
  thrust::device_vector<GradientPair> gpair(1);
  gpair[0] = {1000.f, 1000.f};  // Tests should not exceed sum of 1000
-  return GradientQuantiser(dh::ToSpan(gpair));
+  return {dh::ToSpan(gpair), MetaInfo()};
 }

 void TestHistogramIndexImpl() {
@@ -225,7 +231,7 @@ TEST(GpuHist, TestHistogramIndex) {
  TestHistogramIndexImpl();
 }

-void UpdateTree(Context const* ctx, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
+void UpdateTree(Context const* ctx, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat,
                size_t gpu_page_size, RegTree* tree, HostDeviceVector<bst_float>* preds,
                float subsample = 1.0f, const std::string& sampling_method = "uniform",
                int max_bin = 2) {
@@ -257,6 +263,7 @@ void UpdateTree(Context const* ctx, HostDeviceVector<GradientPair>* gpair, DMatr

  ObjInfo task{ObjInfo::kRegression};
  tree::GPUHistMaker hist_maker{ctx, &task};
+  hist_maker.Configure(Args{});

  std::vector<HostDeviceVector<bst_node_t>> position(1);
  hist_maker.Update(&param, gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
@@ -274,7 +281,8 @@ TEST(GpuHist, UniformSampling) {
  // Create an in-memory DMatrix.
  std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));

-  auto gpair = GenerateRandomGradients(kRows);
+  linalg::Matrix<GradientPair> gpair({kRows}, Context{}.MakeCUDA().Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(kRows));

  // Build a tree using the in-memory DMatrix.
  RegTree tree;
@@ -304,7 +312,8 @@ TEST(GpuHist, GradientBasedSampling) {
  // Create an in-memory DMatrix.
  std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));

-  auto gpair = GenerateRandomGradients(kRows);
+  linalg::Matrix<GradientPair> gpair({kRows}, MakeCUDACtx(0).Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(kRows));

  // Build a tree using the in-memory DMatrix.
  RegTree tree;
@@ -340,11 +349,12 @@ TEST(GpuHist, ExternalMemory) {
  // Create a single batch DMatrix.
  std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrix(kRows, kCols, 1, tmpdir.path + "/cache"));

-  auto gpair = GenerateRandomGradients(kRows);
+  Context ctx(MakeCUDACtx(0));
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(kRows));

  // Build a tree using the in-memory DMatrix.
  RegTree tree;
-  Context ctx(MakeCUDACtx(0));
  HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
  // Build another tree using multiple ELLPACK pages.
@@ -377,12 +387,13 @@ TEST(GpuHist, ExternalMemoryWithSampling) {
  std::unique_ptr<DMatrix> dmat_ext(
      CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache"));

-  auto gpair = GenerateRandomGradients(kRows);
+  Context ctx(MakeCUDACtx(0));
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(kRows));

  // Build a tree using the in-memory DMatrix.
  auto rng = common::GlobalRandom();

-  Context ctx(MakeCUDACtx(0));
  RegTree tree;
  HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, kSubsample, kSamplingMethod, kRows);
@@ -408,14 +419,14 @@ TEST(GpuHist, ConfigIO) {
  std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_gpu_hist", &ctx, &task)};
  updater->Configure(Args{});

-  Json j_updater { Object() };
+  Json j_updater{Object{}};
  updater->SaveConfig(&j_updater);
-  ASSERT_TRUE(IsA<Object>(j_updater["gpu_hist_train_param"]));
+  ASSERT_TRUE(IsA<Object>(j_updater["hist_train_param"]));
  updater->LoadConfig(j_updater);

-  Json j_updater_roundtrip { Object() };
+  Json j_updater_roundtrip{Object{}};
  updater->SaveConfig(&j_updater_roundtrip);
-  ASSERT_TRUE(IsA<Object>(j_updater_roundtrip["gpu_hist_train_param"]));
+  ASSERT_TRUE(IsA<Object>(j_updater_roundtrip["hist_train_param"]));

  ASSERT_EQ(j_updater, j_updater_roundtrip);
 }
@@ -432,4 +443,54 @@ TEST(GpuHist, MaxDepth) {

  ASSERT_THROW({learner->UpdateOneIter(0, p_mat);}, dmlc::Error);
 }
+
+namespace {
+RegTree GetUpdatedTree(Context const* ctx, DMatrix* dmat) {
+  ObjInfo task{ObjInfo::kRegression};
+  GPUHistMaker hist_maker{ctx, &task};
+  hist_maker.Configure(Args{});
+
+  TrainParam param;
+  param.UpdateAllowUnknown(Args{});
+
+  linalg::Matrix<GradientPair> gpair({dmat->Info().num_row_}, ctx->Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_));
+
+  std::vector<HostDeviceVector<bst_node_t>> position(1);
+  RegTree tree;
+  hist_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
+                    {&tree});
+  return tree;
+}
+
+void VerifyColumnSplit(bst_row_t rows, bst_feature_t cols, RegTree const& expected_tree) {
+  Context ctx(MakeCUDACtx(GPUIDX));
+
+  auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  std::unique_ptr<DMatrix> sliced{Xy->SliceCol(world_size, rank)};
+
+  RegTree tree = GetUpdatedTree(&ctx, sliced.get());
+
+  Json json{Object{}};
+  tree.SaveModel(&json);
+  Json expected_json{Object{}};
+  expected_tree.SaveModel(&expected_json);
+  ASSERT_EQ(json, expected_json);
+}
+}  // anonymous namespace
+
+class MGPUHistTest : public BaseMGPUTest {};
+
+TEST_F(MGPUHistTest, GPUHistColumnSplit) {
+  auto constexpr kRows = 32;
+  auto constexpr kCols = 16;
+
+  Context ctx(MakeCUDACtx(0));
+  auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
+  RegTree expected_tree = GetUpdatedTree(&ctx, dmat.get());
+
+  DoTest(VerifyColumnSplit, kRows, kCols, expected_tree);
+}
 }  // namespace xgboost::tree
--- a/tests/cpp/tree/test_histmaker.cc
+++ b/tests/cpp/tree/test_histmaker.cc
@@ -26,9 +26,11 @@ TEST(GrowHistMaker, InteractionConstraint) {
  auto constexpr kRows = 32;
  auto constexpr kCols = 16;
  auto p_dmat = GenerateDMatrix(kRows, kCols);
-  auto p_gradients = GenerateGradients(kRows);
-
  Context ctx;
+
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(kRows));
+
  ObjInfo task{ObjInfo::kRegression};
  {
    // With constraints
@@ -39,7 +41,8 @@ TEST(GrowHistMaker, InteractionConstraint) {
    param.UpdateAllowUnknown(
        Args{{"interaction_constraints", "[[0, 1]]"}, {"num_feature", std::to_string(kCols)}});
    std::vector<HostDeviceVector<bst_node_t>> position(1);
-    updater->Update(&param, p_gradients.get(), p_dmat.get(), position, {&tree});
+    updater->Configure(Args{});
+    updater->Update(&param, &gpair, p_dmat.get(), position, {&tree});

    ASSERT_EQ(tree.NumExtraNodes(), 4);
    ASSERT_EQ(tree[0].SplitIndex(), 1);
@@ -55,7 +58,8 @@ TEST(GrowHistMaker, InteractionConstraint) {
    std::vector<HostDeviceVector<bst_node_t>> position(1);
    TrainParam param;
    param.Init(Args{});
-    updater->Update(&param, p_gradients.get(), p_dmat.get(), position, {&tree});
+    updater->Configure(Args{});
+    updater->Update(&param, &gpair, p_dmat.get(), position, {&tree});

    ASSERT_EQ(tree.NumExtraNodes(), 10);
    ASSERT_EQ(tree[0].SplitIndex(), 1);
@@ -68,9 +72,12 @@ TEST(GrowHistMaker, InteractionConstraint) {
 namespace {
 void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical,
                       RegTree const& expected_tree) {
-  auto p_dmat = GenerateDMatrix(rows, cols, categorical);
-  auto p_gradients = GenerateGradients(rows);
  Context ctx;
+  auto p_dmat = GenerateDMatrix(rows, cols, categorical);
+  linalg::Matrix<GradientPair> gpair({rows}, ctx.Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(rows));
+
+
  ObjInfo task{ObjInfo::kRegression};
  std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
  std::vector<HostDeviceVector<bst_node_t>> position(1);
@@ -81,7 +88,8 @@ void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical,
  RegTree tree{1u, cols};
  TrainParam param;
  param.Init(Args{});
-  updater->Update(&param, p_gradients.get(), sliced.get(), position, {&tree});
+  updater->Configure(Args{});
+  updater->Update(&param, &gpair, sliced.get(), position, {&tree});

  Json json{Object{}};
  tree.SaveModel(&json);
@@ -97,14 +105,16 @@ void TestColumnSplit(bool categorical) {
  RegTree expected_tree{1u, kCols};
  ObjInfo task{ObjInfo::kRegression};
  {
-    auto p_dmat = GenerateDMatrix(kRows, kCols, categorical);
-    auto p_gradients = GenerateGradients(kRows);
    Context ctx;
+    auto p_dmat = GenerateDMatrix(kRows, kCols, categorical);
+    linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+    gpair.Data()->Copy(GenerateRandomGradients(kRows));
    std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
    std::vector<HostDeviceVector<bst_node_t>> position(1);
    TrainParam param;
    param.Init(Args{});
-    updater->Update(&param, p_gradients.get(), p_dmat.get(), position, {&expected_tree});
+    updater->Configure(Args{});
+    updater->Update(&param, &gpair, p_dmat.get(), position, {&expected_tree});
  }

  auto constexpr kWorldSize = 2;
--- a/tests/cpp/tree/test_node_partition.cc
+++ b/tests/cpp/tree/test_node_partition.cc
@@ -6,7 +6,9 @@
 #include <xgboost/task.h>          // for ObjInfo
 #include <xgboost/tree_updater.h>  // for TreeUpdater

-#include <memory>                  // for unique_ptr
+#include <memory>  // for unique_ptr
+
+#include "../helpers.h"

 namespace xgboost {
 TEST(Updater, HasNodePosition) {
@@ -19,7 +21,7 @@ TEST(Updater, HasNodePosition) {
  ASSERT_TRUE(up->HasNodePosition());

 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-  ctx.gpu_id = 0;
+  ctx = MakeCUDACtx(0);
  up.reset(TreeUpdater::Create("grow_gpu_hist", &ctx, &task));
  ASSERT_TRUE(up->HasNodePosition());
 #endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
--- a/tests/cpp/tree/test_prediction_cache.cc
+++ b/tests/cpp/tree/test_prediction_cache.cc
@@ -24,15 +24,11 @@ class TestPredictionCache : public ::testing::Test {
    Xy_ = RandomDataGenerator{n_samples_, n_features, 0}.Targets(n_targets).GenerateDMatrix(true);
  }

-  void RunLearnerTest(std::string updater_name, float subsample, std::string const& grow_policy,
-                      std::string const& strategy) {
+  void RunLearnerTest(Context const* ctx, std::string updater_name, float subsample,
+                      std::string const& grow_policy, std::string const& strategy) {
    std::unique_ptr<Learner> learner{Learner::Create({Xy_})};
-    if (updater_name == "grow_gpu_hist") {
-      // gpu_id setup
-      learner->SetParam("tree_method", "gpu_hist");
-    } else {
-      learner->SetParam("updater", updater_name);
-    }
+    learner->SetParam("device", ctx->DeviceName());
+    learner->SetParam("updater", updater_name);
    learner->SetParam("multi_strategy", strategy);
    learner->SetParam("grow_policy", grow_policy);
    learner->SetParam("subsample", std::to_string(subsample));
@@ -65,54 +61,62 @@ class TestPredictionCache : public ::testing::Test {
    }
  }

-  void RunTest(std::string const& updater_name, std::string const& strategy) {
+  void RunTest(Context* ctx, std::string const& updater_name, std::string const& strategy) {
    {
-      Context ctx;
-      ctx.InitAllowUnknown(Args{{"nthread", "8"}});
-      if (updater_name == "grow_gpu_hist") {
-        ctx.gpu_id = 0;
-      } else {
-        ctx.gpu_id = Context::kCpuId;
-      }
+      ctx->InitAllowUnknown(Args{{"nthread", "8"}});

      ObjInfo task{ObjInfo::kRegression};
-      std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create(updater_name, &ctx, &task)};
+      std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create(updater_name, ctx, &task)};
      RegTree tree;
-      std::vector<RegTree *> trees{&tree};
-      auto gpair = GenerateRandomGradients(n_samples_);
+      std::vector<RegTree*> trees{&tree};
+      auto gpair = GenerateRandomGradients(ctx, n_samples_, 1);
      tree::TrainParam param;
      param.UpdateAllowUnknown(Args{{"max_bin", "64"}});

+      updater->Configure(Args{});
      std::vector<HostDeviceVector<bst_node_t>> position(1);
      updater->Update(&param, &gpair, Xy_.get(), position, trees);
      HostDeviceVector<float> out_prediction_cached;
-      out_prediction_cached.SetDevice(ctx.gpu_id);
+      out_prediction_cached.SetDevice(ctx->Device());
      out_prediction_cached.Resize(n_samples_);
      auto cache =
-          linalg::MakeTensorView(&ctx, &out_prediction_cached, out_prediction_cached.Size(), 1);
+          linalg::MakeTensorView(ctx, &out_prediction_cached, out_prediction_cached.Size(), 1);
      ASSERT_TRUE(updater->UpdatePredictionCache(Xy_.get(), cache));
    }

    for (auto policy : {"depthwise", "lossguide"}) {
      for (auto subsample : {1.0f, 0.4f}) {
-        this->RunLearnerTest(updater_name, subsample, policy, strategy);
-        this->RunLearnerTest(updater_name, subsample, policy, strategy);
+        this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy);
+        this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy);
      }
    }
  }
 };

-TEST_F(TestPredictionCache, Approx) { this->RunTest("grow_histmaker", "one_output_per_tree"); }
+TEST_F(TestPredictionCache, Approx) {
+  Context ctx;
+  this->RunTest(&ctx, "grow_histmaker", "one_output_per_tree");
+}

 TEST_F(TestPredictionCache, Hist) {
-  this->RunTest("grow_quantile_histmaker", "one_output_per_tree");
+  Context ctx;
+  this->RunTest(&ctx, "grow_quantile_histmaker", "one_output_per_tree");
 }

 TEST_F(TestPredictionCache, HistMulti) {
-  this->RunTest("grow_quantile_histmaker", "multi_output_tree");
+  Context ctx;
+  this->RunTest(&ctx, "grow_quantile_histmaker", "multi_output_tree");
 }

 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-TEST_F(TestPredictionCache, GpuHist) { this->RunTest("grow_gpu_hist", "one_output_per_tree"); }
-#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+TEST_F(TestPredictionCache, GpuHist) {
+  auto ctx = MakeCUDACtx(0);
+  this->RunTest(&ctx, "grow_gpu_hist", "one_output_per_tree");
+}
+
+TEST_F(TestPredictionCache, GpuApprox) {
+  auto ctx = MakeCUDACtx(0);
+  this->RunTest(&ctx, "grow_gpu_approx", "one_output_per_tree");
+}
+#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 }  // namespace xgboost
--- a/tests/cpp/tree/test_prune.cc
+++ b/tests/cpp/tree/test_prune.cc
@@ -21,15 +21,13 @@ TEST(Updater, Prune) {
  std::vector<std::pair<std::string, std::string>> cfg;
  cfg.emplace_back("num_feature", std::to_string(kCols));
  cfg.emplace_back("min_split_loss", "10");
+  Context ctx;

  // These data are just place holders.
-  HostDeviceVector<GradientPair> gpair =
-      { {0.50f, 0.25f}, {0.50f, 0.25f}, {0.50f, 0.25f}, {0.50f, 0.25f},
-        {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f} };
-  std::shared_ptr<DMatrix> p_dmat {
-    RandomDataGenerator{32, 10, 0}.GenerateDMatrix() };
-
-  Context ctx;
+  linalg::Matrix<GradientPair> gpair
+      {{ {0.50f, 0.25f}, {0.50f, 0.25f}, {0.50f, 0.25f}, {0.50f, 0.25f},
+         {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f} }, {8, 1}, ctx.Device()};
+  std::shared_ptr<DMatrix> p_dmat{RandomDataGenerator{32, 10, 0}.GenerateDMatrix()};

  // prepare tree
  RegTree tree = RegTree{1u, kCols};
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -13,7 +13,6 @@
 #include "../../../src/tree/common_row_partitioner.h"
 #include "../../../src/tree/hist/expand_entry.h"  // for MultiExpandEntry, CPUExpandEntry
 #include "../../../src/tree/param.h"
-#include "../../../src/tree/split_evaluator.h"
 #include "../helpers.h"
 #include "test_partitioner.h"
 #include "xgboost/data.h"
@@ -49,7 +48,7 @@ void TestPartitioner(bst_target_t n_targets) {
      auto min_value = gmat.cut.MinValues()[split_ind];
      RegTree tree{n_targets, n_features};
      CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
-      if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
+      if constexpr (std::is_same_v<ExpandEntry, CPUExpandEntry>) {
        GetSplit(&tree, min_value, &candidates);
      } else {
        GetMultiSplitForTest(&tree, min_value, &candidates);
@@ -203,13 +202,13 @@ TEST(QuantileHist, PartitionerColSplit) { TestColumnSplitPartitioner<CPUExpandEn
 TEST(QuantileHist, MultiPartitionerColSplit) { TestColumnSplitPartitioner<MultiExpandEntry>(3); }

 namespace {
-void VerifyColumnSplit(bst_row_t rows, bst_feature_t cols, bst_target_t n_targets,
+void VerifyColumnSplit(Context const* ctx, bst_row_t rows, bst_feature_t cols, bst_target_t n_targets,
                       RegTree const& expected_tree) {
  auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
-  auto p_gradients = GenerateGradients(rows, n_targets);
-  Context ctx;
+  linalg::Matrix<GradientPair> gpair = GenerateRandomGradients(ctx, rows, n_targets);
+
  ObjInfo task{ObjInfo::kRegression};
-  std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_quantile_histmaker", &ctx, &task)};
+  std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_quantile_histmaker", ctx, &task)};
  std::vector<HostDeviceVector<bst_node_t>> position(1);

  std::unique_ptr<DMatrix> sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
@@ -217,7 +216,8 @@ void VerifyColumnSplit(bst_row_t rows, bst_feature_t cols, bst_target_t n_target
  RegTree tree{n_targets, cols};
  TrainParam param;
  param.Init(Args{});
-  updater->Update(&param, p_gradients.get(), sliced.get(), position, {&tree});
+  updater->Configure(Args{});
+  updater->Update(&param, &gpair, sliced.get(), position, {&tree});

  Json json{Object{}};
  tree.SaveModel(&json);
@@ -232,20 +232,21 @@ void TestColumnSplit(bst_target_t n_targets) {

  RegTree expected_tree{n_targets, kCols};
  ObjInfo task{ObjInfo::kRegression};
+  Context ctx;
  {
    auto Xy = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
-    auto p_gradients = GenerateGradients(kRows, n_targets);
-    Context ctx;
+    auto gpair = GenerateRandomGradients(&ctx, kRows, n_targets);
    std::unique_ptr<TreeUpdater> updater{
        TreeUpdater::Create("grow_quantile_histmaker", &ctx, &task)};
    std::vector<HostDeviceVector<bst_node_t>> position(1);
    TrainParam param;
    param.Init(Args{});
-    updater->Update(&param, p_gradients.get(), Xy.get(), position, {&expected_tree});
+    updater->Configure(Args{});
+    updater->Update(&param, &gpair, Xy.get(), position, {&expected_tree});
  }

  auto constexpr kWorldSize = 2;
-  RunWithInMemoryCommunicator(kWorldSize, VerifyColumnSplit, kRows, kCols, n_targets,
+  RunWithInMemoryCommunicator(kWorldSize, VerifyColumnSplit, &ctx, kRows, kCols, n_targets,
                              std::cref(expected_tree));
 }
 }  // anonymous namespace
--- a/tests/cpp/tree/test_refresh.cc
+++ b/tests/cpp/tree/test_refresh.cc
@@ -17,10 +17,11 @@ namespace xgboost::tree {
 TEST(Updater, Refresh) {
  bst_row_t constexpr kRows = 8;
  bst_feature_t constexpr kCols = 16;
+  Context ctx;

-  HostDeviceVector<GradientPair> gpair =
-      { {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f},
-        {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} };
+  linalg::Matrix<GradientPair> gpair
+      {{ {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f},
+         {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} }, {8, 1}, ctx.Device()};
  std::shared_ptr<DMatrix> p_dmat{
    RandomDataGenerator{kRows, kCols, 0.4f}.Seed(3).GenerateDMatrix()};
  std::vector<std::pair<std::string, std::string>> cfg{
@@ -29,7 +30,6 @@ TEST(Updater, Refresh) {
      {"reg_lambda", "1"}};

  RegTree tree = RegTree{1u, kCols};
-  Context ctx;
  std::vector<RegTree*> trees{&tree};

  ObjInfo task{ObjInfo::kRegression};
--- a/tests/cpp/tree/test_regen.cc
+++ b/tests/cpp/tree/test_regen.cc
@@ -62,8 +62,10 @@ class RegenTest : public ::testing::Test {
  auto constexpr Iter() const { return 4; }

  template <typename Page>
-  size_t TestTreeMethod(std::string tree_method, std::string obj, bool reset = true) const {
+  size_t TestTreeMethod(Context const* ctx, std::string tree_method, std::string obj,
+                        bool reset = true) const {
    auto learner = std::unique_ptr<Learner>{Learner::Create({p_fmat_})};
+    learner->SetParam("device", ctx->DeviceName());
    learner->SetParam("tree_method", tree_method);
    learner->SetParam("objective", obj);
    learner->Configure();
@@ -87,40 +89,71 @@ class RegenTest : public ::testing::Test {
 }  // anonymous namespace

 TEST_F(RegenTest, Approx) {
-  auto n = this->TestTreeMethod<GHistIndexMatrix>("approx", "reg:squarederror");
+  Context ctx;
+  auto n = this->TestTreeMethod<GHistIndexMatrix>(&ctx, "approx", "reg:squarederror");
  ASSERT_EQ(n, 1);
-  n = this->TestTreeMethod<GHistIndexMatrix>("approx", "reg:logistic");
+  n = this->TestTreeMethod<GHistIndexMatrix>(&ctx, "approx", "reg:logistic");
  ASSERT_EQ(n, this->Iter());
 }

 TEST_F(RegenTest, Hist) {
-  auto n = this->TestTreeMethod<GHistIndexMatrix>("hist", "reg:squarederror");
+  Context ctx;
+  auto n = this->TestTreeMethod<GHistIndexMatrix>(&ctx, "hist", "reg:squarederror");
  ASSERT_EQ(n, 1);
-  n = this->TestTreeMethod<GHistIndexMatrix>("hist", "reg:logistic");
+  n = this->TestTreeMethod<GHistIndexMatrix>(&ctx, "hist", "reg:logistic");
  ASSERT_EQ(n, 1);
 }

 TEST_F(RegenTest, Mixed) {
-  auto n = this->TestTreeMethod<GHistIndexMatrix>("hist", "reg:squarederror", false);
+  Context ctx;
+  auto n = this->TestTreeMethod<GHistIndexMatrix>(&ctx, "hist", "reg:squarederror", false);
  ASSERT_EQ(n, 1);
-  n = this->TestTreeMethod<GHistIndexMatrix>("approx", "reg:logistic", true);
+  n = this->TestTreeMethod<GHistIndexMatrix>(&ctx, "approx", "reg:logistic", true);
  ASSERT_EQ(n, this->Iter() + 1);

-  n = this->TestTreeMethod<GHistIndexMatrix>("approx", "reg:logistic", false);
+  n = this->TestTreeMethod<GHistIndexMatrix>(&ctx, "approx", "reg:logistic", false);
  ASSERT_EQ(n, this->Iter());
-  n = this->TestTreeMethod<GHistIndexMatrix>("hist", "reg:squarederror", true);
+  n = this->TestTreeMethod<GHistIndexMatrix>(&ctx, "hist", "reg:squarederror", true);
  ASSERT_EQ(n, this->Iter() + 1);
 }

 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-TEST_F(RegenTest, GpuHist) {
-  auto n = this->TestTreeMethod<EllpackPage>("gpu_hist", "reg:squarederror");
+TEST_F(RegenTest, GpuApprox) {
+  auto ctx = MakeCUDACtx(0);
+  auto n = this->TestTreeMethod<EllpackPage>(&ctx, "approx", "reg:squarederror", true);
  ASSERT_EQ(n, 1);
-  n = this->TestTreeMethod<EllpackPage>("gpu_hist", "reg:logistic", false);
+  n = this->TestTreeMethod<EllpackPage>(&ctx, "approx", "reg:logistic", false);
+  ASSERT_EQ(n, this->Iter());
+
+  n = this->TestTreeMethod<EllpackPage>(&ctx, "approx", "reg:logistic", true);
+  ASSERT_EQ(n, this->Iter() * 2);
+}
+
+TEST_F(RegenTest, GpuHist) {
+  auto ctx = MakeCUDACtx(0);
+  auto n = this->TestTreeMethod<EllpackPage>(&ctx, "hist", "reg:squarederror", true);
+  ASSERT_EQ(n, 1);
+  n = this->TestTreeMethod<EllpackPage>(&ctx, "hist", "reg:logistic", false);
  ASSERT_EQ(n, 1);

-  n = this->TestTreeMethod<EllpackPage>("hist", "reg:logistic");
-  ASSERT_EQ(n, 2);
+  {
+    Context ctx;
+    n = this->TestTreeMethod<EllpackPage>(&ctx, "hist", "reg:logistic");
+    ASSERT_EQ(n, 2);
+  }
+}
+
+TEST_F(RegenTest, GpuMixed) {
+  auto ctx = MakeCUDACtx(0);
+  auto n = this->TestTreeMethod<EllpackPage>(&ctx, "hist", "reg:squarederror", false);
+  ASSERT_EQ(n, 1);
+  n = this->TestTreeMethod<EllpackPage>(&ctx, "approx", "reg:logistic", true);
+  ASSERT_EQ(n, this->Iter() + 1);
+
+  n = this->TestTreeMethod<EllpackPage>(&ctx, "approx", "reg:logistic", false);
+  ASSERT_EQ(n, this->Iter());
+  n = this->TestTreeMethod<EllpackPage>(&ctx, "hist", "reg:squarederror", true);
+  ASSERT_EQ(n, this->Iter() + 1);
 }
 #endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 }  // namespace xgboost
--- a/tests/cpp/tree/test_tree_policy.cc
+++ b/tests/cpp/tree/test_tree_policy.cc
@@ -20,10 +20,11 @@ class TestGrowPolicy : public ::testing::Test {
            true);
  }

-  std::unique_ptr<Learner> TrainOneIter(std::string tree_method, std::string policy,
-                                        int32_t max_leaves, int32_t max_depth) {
+  std::unique_ptr<Learner> TrainOneIter(Context const* ctx, std::string tree_method,
+                                        std::string policy, int32_t max_leaves, int32_t max_depth) {
    std::unique_ptr<Learner> learner{Learner::Create({this->Xy_})};
    learner->SetParam("tree_method", tree_method);
+    learner->SetParam("device", ctx->DeviceName());
    if (max_leaves >= 0) {
      learner->SetParam("max_leaves", std::to_string(max_leaves));
    }
@@ -63,7 +64,7 @@ class TestGrowPolicy : public ::testing::Test {

    if (max_leaves == 0 && max_depth == 0) {
      // unconstrainted
-      if (tree_method != "gpu_hist") {
+      if (ctx->IsCPU()) {
        // GPU pre-allocates for all nodes.
        learner->UpdateOneIter(0, Xy_);
      }
@@ -86,23 +87,23 @@ class TestGrowPolicy : public ::testing::Test {
    return learner;
  }

-  void TestCombination(std::string tree_method) {
+  void TestCombination(Context const* ctx, std::string tree_method) {
    for (auto policy : {"depthwise", "lossguide"}) {
      // -1 means default
      for (auto leaves : {-1, 0, 3}) {
        for (auto depth : {-1, 0, 3}) {
-          this->TrainOneIter(tree_method, policy, leaves, depth);
+          this->TrainOneIter(ctx, tree_method, policy, leaves, depth);
        }
      }
    }
  }

-  void TestTreeGrowPolicy(std::string tree_method, std::string policy) {
+  void TestTreeGrowPolicy(Context const* ctx, std::string tree_method, std::string policy) {
    {
      /**
       *  max_leaves
       */
-      auto learner = this->TrainOneIter(tree_method, policy, 16, -1);
+      auto learner = this->TrainOneIter(ctx, tree_method, policy, 16, -1);
      Json model{Object{}};
      learner->SaveModel(&model);

@@ -115,7 +116,7 @@ class TestGrowPolicy : public ::testing::Test {
      /**
       *  max_depth
       */
-      auto learner = this->TrainOneIter(tree_method, policy, -1, 3);
+      auto learner = this->TrainOneIter(ctx, tree_method, policy, -1, 3);
      Json model{Object{}};
      learner->SaveModel(&model);

@@ -133,25 +134,36 @@ class TestGrowPolicy : public ::testing::Test {
 };

 TEST_F(TestGrowPolicy, Approx) {
-  this->TestTreeGrowPolicy("approx", "depthwise");
-  this->TestTreeGrowPolicy("approx", "lossguide");
+  Context ctx;
+  this->TestTreeGrowPolicy(&ctx, "approx", "depthwise");
+  this->TestTreeGrowPolicy(&ctx, "approx", "lossguide");

-  this->TestCombination("approx");
+  this->TestCombination(&ctx, "approx");
 }

 TEST_F(TestGrowPolicy, Hist) {
-  this->TestTreeGrowPolicy("hist", "depthwise");
-  this->TestTreeGrowPolicy("hist", "lossguide");
+  Context ctx;
+  this->TestTreeGrowPolicy(&ctx, "hist", "depthwise");
+  this->TestTreeGrowPolicy(&ctx, "hist", "lossguide");

-  this->TestCombination("hist");
+  this->TestCombination(&ctx, "hist");
 }

 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 TEST_F(TestGrowPolicy, GpuHist) {
-  this->TestTreeGrowPolicy("gpu_hist", "depthwise");
-  this->TestTreeGrowPolicy("gpu_hist", "lossguide");
+  auto ctx = MakeCUDACtx(0);
+  this->TestTreeGrowPolicy(&ctx, "hist", "depthwise");
+  this->TestTreeGrowPolicy(&ctx, "hist", "lossguide");

-  this->TestCombination("gpu_hist");
+  this->TestCombination(&ctx, "hist");
+}
+
+TEST_F(TestGrowPolicy, GpuApprox) {
+  auto ctx = MakeCUDACtx(0);
+  this->TestTreeGrowPolicy(&ctx, "approx", "depthwise");
+  this->TestTreeGrowPolicy(&ctx, "approx", "lossguide");
+
+  this->TestCombination(&ctx, "approx");
 }
 #endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 }  // namespace xgboost
--- a/tests/cpp/tree/test_tree_stat.cc
+++ b/tests/cpp/tree/test_tree_stat.cc
@@ -16,7 +16,7 @@ namespace xgboost {
 class UpdaterTreeStatTest : public ::testing::Test {
 protected:
  std::shared_ptr<DMatrix> p_dmat_;
-  HostDeviceVector<GradientPair> gpairs_;
+  linalg::Matrix<GradientPair> gpairs_;
  size_t constexpr static kRows = 10;
  size_t constexpr static kCols = 10;

@@ -24,8 +24,8 @@ class UpdaterTreeStatTest : public ::testing::Test {
  void SetUp() override {
    p_dmat_ = RandomDataGenerator(kRows, kCols, .5f).GenerateDMatrix(true);
    auto g = GenerateRandomGradients(kRows);
-    gpairs_.Resize(kRows);
-    gpairs_.Copy(g);
+    gpairs_.Reshape(kRows, 1);
+    gpairs_.Data()->Copy(g);
  }

  void RunTest(std::string updater) {
@@ -63,7 +63,7 @@ TEST_F(UpdaterTreeStatTest, Approx) { this->RunTest("grow_histmaker"); }
 class UpdaterEtaTest : public ::testing::Test {
 protected:
  std::shared_ptr<DMatrix> p_dmat_;
-  HostDeviceVector<GradientPair> gpairs_;
+  linalg::Matrix<GradientPair> gpairs_;
  size_t constexpr static kRows = 10;
  size_t constexpr static kCols = 10;
  size_t constexpr static kClasses = 10;
@@ -71,8 +71,8 @@ class UpdaterEtaTest : public ::testing::Test {
  void SetUp() override {
    p_dmat_ = RandomDataGenerator(kRows, kCols, .5f).GenerateDMatrix(true, false, kClasses);
    auto g = GenerateRandomGradients(kRows);
-    gpairs_.Resize(kRows);
-    gpairs_.Copy(g);
+    gpairs_.Reshape(kRows, 1);
+    gpairs_.Data()->Copy(g);
  }

  void RunTest(std::string updater) {
@@ -125,17 +125,18 @@ TEST_F(UpdaterEtaTest, GpuHist) { this->RunTest("grow_gpu_hist"); }

 class TestMinSplitLoss : public ::testing::Test {
  std::shared_ptr<DMatrix> dmat_;
-  HostDeviceVector<GradientPair> gpair_;
+  linalg::Matrix<GradientPair> gpair_;

  void SetUp() override {
    constexpr size_t kRows = 32;
    constexpr size_t kCols = 16;
    constexpr float kSparsity = 0.6;
    dmat_ = RandomDataGenerator(kRows, kCols, kSparsity).Seed(3).GenerateDMatrix();
-    gpair_ = GenerateRandomGradients(kRows);
+    gpair_.Reshape(kRows, 1);
+    gpair_.Data()->Copy(GenerateRandomGradients(kRows));
  }

-  std::int32_t Update(std::string updater, float gamma) {
+  std::int32_t Update(Context const* ctx, std::string updater, float gamma) {
    Args args{{"max_depth", "1"},
              {"max_leaves", "0"},

@@ -154,8 +155,7 @@ class TestMinSplitLoss : public ::testing::Test {
    param.UpdateAllowUnknown(args);
    ObjInfo task{ObjInfo::kRegression};

-    Context ctx{MakeCUDACtx(updater == "grow_gpu_hist" ? 0 : Context::kCpuId)};
-    auto up = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
+    auto up = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, ctx, &task)};
    up->Configure({});

    RegTree tree;
@@ -167,16 +167,16 @@ class TestMinSplitLoss : public ::testing::Test {
  }

 public:
-  void RunTest(std::string updater) {
+  void RunTest(Context const* ctx, std::string updater) {
    {
-      int32_t n_nodes = Update(updater, 0.01);
+      int32_t n_nodes = Update(ctx, updater, 0.01);
      // This is not strictly verified, meaning the number `2` is whatever GPU_Hist returned
      // when writing this test; it is only used to check that a larger gamma (below) does
      // prevent building the tree.
      ASSERT_EQ(n_nodes, 2);
    }
    {
-      int32_t n_nodes = Update(updater, 100.0);
+      int32_t n_nodes = Update(ctx, updater, 100.0);
      // No new nodes with gamma == 100.
      ASSERT_EQ(n_nodes, static_cast<decltype(n_nodes)>(0));
    }
@@ -185,10 +185,25 @@ class TestMinSplitLoss : public ::testing::Test {

 /* Exact tree method requires a pruner as an additional updater, so not tested here. */

-TEST_F(TestMinSplitLoss, Approx) { this->RunTest("grow_histmaker"); }
+TEST_F(TestMinSplitLoss, Approx) {
+  Context ctx;
+  this->RunTest(&ctx, "grow_histmaker");
+}
+
+TEST_F(TestMinSplitLoss, Hist) {
+  Context ctx;
+  this->RunTest(&ctx, "grow_quantile_histmaker");
+}

-TEST_F(TestMinSplitLoss, Hist) { this->RunTest("grow_quantile_histmaker"); }
 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-TEST_F(TestMinSplitLoss, GpuHist) { this->RunTest("grow_gpu_hist"); }
+TEST_F(TestMinSplitLoss, GpuHist) {
+  auto ctx = MakeCUDACtx(0);
+  this->RunTest(&ctx, "grow_gpu_hist");
+}
+
+TEST_F(TestMinSplitLoss, GpuApprox) {
+  auto ctx = MakeCUDACtx(0);
+  this->RunTest(&ctx, "grow_gpu_approx");
+}
 #endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 }  // namespace xgboost