temp merge, disable 1 line, SetValid

2023-10-12 16:16:44 -07:00
parent 2e7e9d3b2d 85d3017ca5
commit ea19555474
492 changed files with 15533 additions and 9376 deletions
--- a/tests/cpp/common/test_algorithm.cu
+++ b/tests/cpp/common/test_algorithm.cu
@@ -21,8 +21,7 @@
 namespace xgboost {
 namespace common {
 void TestSegmentedArgSort() {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);

  size_t constexpr kElements = 100, kGroups = 3;
  dh::device_vector<size_t> sorted_idx(kElements, 0);
@@ -60,8 +59,7 @@ void TestSegmentedArgSort() {
 TEST(Algorithm, SegmentedArgSort) { TestSegmentedArgSort(); }

 TEST(Algorithm, GpuArgSort) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);

  dh::device_vector<float> values(20);
  dh::Iota(dh::ToSpan(values));                                    // accending
--- a/tests/cpp/common/test_bitfield.cc
+++ b/tests/cpp/common/test_bitfield.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
 */
 #include <gtest/gtest.h>
 #include "../../../src/common/bitfield.h"
@@ -14,7 +14,7 @@ TEST(BitField, Check) {
                static_cast<typename common::Span<LBitField64::value_type>::index_type>(
                    storage.size())});
    size_t true_bit = 190;
-    for (size_t i = true_bit + 1; i < bits.Size(); ++i) {
+    for (size_t i = true_bit + 1; i < bits.Capacity(); ++i) {
      ASSERT_FALSE(bits.Check(i));
    }
    ASSERT_TRUE(bits.Check(true_bit));
@@ -34,7 +34,7 @@ TEST(BitField, Check) {
      ASSERT_FALSE(bits.Check(i));
    }
    ASSERT_TRUE(bits.Check(true_bit));
-    for (size_t i = true_bit + 1; i < bits.Size(); ++i) {
+    for (size_t i = true_bit + 1; i < bits.Capacity(); ++i) {
      ASSERT_FALSE(bits.Check(i));
    }
  }
--- a/tests/cpp/common/test_bitfield.cu
+++ b/tests/cpp/common/test_bitfield.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
 */
 #include <gtest/gtest.h>
 #include <thrust/copy.h>
@@ -16,7 +16,7 @@ namespace xgboost {

 __global__ void TestSetKernel(LBitField64 bits) {
  auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  if (tid < bits.Size()) {
+  if (tid < bits.Capacity()) {
    bits.Set(tid);
  }
 }
@@ -40,20 +40,16 @@ TEST(BitField, GPUSet) {

  std::vector<LBitField64::value_type> h_storage(storage.size());
  thrust::copy(storage.begin(), storage.end(), h_storage.begin());
-
-  LBitField64 outputs {
-    common::Span<LBitField64::value_type>{h_storage.data(),
-                                       h_storage.data() + h_storage.size()}};
+  LBitField64 outputs{
+      common::Span<LBitField64::value_type>{h_storage.data(), h_storage.data() + h_storage.size()}};
  for (size_t i = 0; i < kBits; ++i) {
    ASSERT_TRUE(outputs.Check(i));
  }
 }

-__global__ void TestOrKernel(LBitField64 lhs, LBitField64 rhs) {
-  lhs |= rhs;
-}
-
-TEST(BitField, GPUAnd) {
+namespace {
+template <bool is_and, typename Op>
+void TestGPULogic(Op op) {
  uint32_t constexpr kBits = 128;
  dh::device_vector<LBitField64::value_type> lhs_storage(kBits);
  dh::device_vector<LBitField64::value_type> rhs_storage(kBits);
@@ -61,13 +57,32 @@ TEST(BitField, GPUAnd) {
  auto rhs = LBitField64(dh::ToSpan(rhs_storage));
  thrust::fill(lhs_storage.begin(), lhs_storage.end(), 0UL);
  thrust::fill(rhs_storage.begin(), rhs_storage.end(), ~static_cast<LBitField64::value_type>(0UL));
-  TestOrKernel<<<1, kBits>>>(lhs, rhs);
+  dh::LaunchN(kBits, [=] __device__(auto) mutable { op(lhs, rhs); });

  std::vector<LBitField64::value_type> h_storage(lhs_storage.size());
  thrust::copy(lhs_storage.begin(), lhs_storage.end(), h_storage.begin());
-  LBitField64 outputs {{h_storage.data(), h_storage.data() + h_storage.size()}};
-  for (size_t i = 0; i < kBits; ++i) {
-    ASSERT_TRUE(outputs.Check(i));
+  LBitField64 outputs{{h_storage.data(), h_storage.data() + h_storage.size()}};
+  if (is_and) {
+    for (size_t i = 0; i < kBits; ++i) {
+      ASSERT_FALSE(outputs.Check(i));
+    }
+  } else {
+    for (size_t i = 0; i < kBits; ++i) {
+      ASSERT_TRUE(outputs.Check(i));
+    }
  }
 }
+
+void TestGPUAnd() {
+  TestGPULogic<true>([] XGBOOST_DEVICE(LBitField64 & lhs, LBitField64 const& rhs) { lhs &= rhs; });
+}
+
+void TestGPUOr() {
+  TestGPULogic<false>([] XGBOOST_DEVICE(LBitField64 & lhs, LBitField64 const& rhs) { lhs |= rhs; });
+}
+}  // namespace
+
+TEST(BitField, GPUAnd) { TestGPUAnd(); }
+
+TEST(BitField, GPUOr) { TestGPUOr(); }
 }  // namespace xgboost
--- a/tests/cpp/common/test_column_matrix.cc
+++ b/tests/cpp/common/test_column_matrix.cc
@@ -2,15 +2,26 @@
 * Copyright 2018-2023 by XGBoost Contributors
 */
 #include <gtest/gtest.h>
+#include <xgboost/base.h>     // for bst_bin_t
+#include <xgboost/context.h>  // for Context
+#include <xgboost/data.h>     // for BatchIterator, BatchSet, DMatrix, Met...

-#include "../../../src/common/column_matrix.h"
-#include "../helpers.h"
+#include <cstddef>      // for size_t
+#include <cstdint>      // for int32_t, uint16_t, uint8_t
+#include <limits>       // for numeric_limits
+#include <memory>       // for shared_ptr, __shared_ptr_access, allo...
+#include <type_traits>  // for remove_reference_t

+#include "../../../src/common/column_matrix.h"      // for ColumnMatrix, Column, DenseColumnIter
+#include "../../../src/common/hist_util.h"          // for DispatchBinType, BinTypeSize, Index
+#include "../../../src/common/ref_resource_view.h"  // for RefResourceView
+#include "../../../src/data/gradient_index.h"       // for GHistIndexMatrix
+#include "../../../src/data/iterative_dmatrix.h"    // for IterativeDMatrix
+#include "../../../src/tree/param.h"                // for TrainParam
+#include "../helpers.h"                             // for RandomDataGenerator, NumpyArrayIterFo...

-namespace xgboost {
-namespace common {
-
-TEST(DenseColumn, Test) {
+namespace xgboost::common {
+TEST(ColumnMatrix, Basic) {
  int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
                            static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
                            static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
@@ -22,7 +33,7 @@ TEST(DenseColumn, Test) {
    GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, sparse_thresh, false};
    ColumnMatrix column_matrix;
    for (auto const& page : dmat->GetBatches<SparsePage>()) {
-      column_matrix.InitFromSparse(page, gmat, sparse_thresh, AllThreadsForTest());
+      column_matrix.InitFromSparse(page, gmat, sparse_thresh, ctx.Threads());
    }
    ASSERT_GE(column_matrix.GetTypeSize(), last);
    ASSERT_LE(column_matrix.GetTypeSize(), kUint32BinsTypeSize);
@@ -59,7 +70,7 @@ void CheckSparseColumn(SparseColumnIter<BinIdxType>* p_col, const GHistIndexMatr
  }
 }

-TEST(SparseColumn, Test) {
+TEST(ColumnMatrix, SparseColumn) {
  int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
                            static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
                            static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
@@ -69,7 +80,7 @@ TEST(SparseColumn, Test) {
    GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, 0.5f, false};
    ColumnMatrix column_matrix;
    for (auto const& page : dmat->GetBatches<SparsePage>()) {
-      column_matrix.InitFromSparse(page, gmat, 1.0, AllThreadsForTest());
+      column_matrix.InitFromSparse(page, gmat, 1.0, ctx.Threads());
    }
    common::DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) {
      using T = decltype(dtype);
@@ -83,12 +94,14 @@ template <typename BinIdxType>
 void CheckColumWithMissingValue(const DenseColumnIter<BinIdxType, true>& col,
                                const GHistIndexMatrix& gmat) {
  for (auto i = 0ull; i < col.Size(); i++) {
-    if (col.IsMissing(i)) continue;
+    if (col.IsMissing(i)) {
+      continue;
+    }
    EXPECT_EQ(gmat.index[gmat.row_ptr[i]], col.GetGlobalBinIdx(i));
  }
 }

-TEST(DenseColumnWithMissing, Test) {
+TEST(ColumnMatrix, DenseColumnWithMissing) {
  int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
                            static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
                            static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
@@ -98,7 +111,7 @@ TEST(DenseColumnWithMissing, Test) {
    GHistIndexMatrix gmat(&ctx, dmat.get(), max_num_bin, 0.2, false);
    ColumnMatrix column_matrix;
    for (auto const& page : dmat->GetBatches<SparsePage>()) {
-      column_matrix.InitFromSparse(page, gmat, 0.2, AllThreadsForTest());
+      column_matrix.InitFromSparse(page, gmat, 0.2, ctx.Threads());
    }
    ASSERT_TRUE(column_matrix.AnyMissing());
    DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) {
@@ -108,5 +121,29 @@ TEST(DenseColumnWithMissing, Test) {
    });
  }
 }
-}  // namespace common
-}  // namespace xgboost
+
+TEST(ColumnMatrix, GrowMissing) {
+  float sparsity = 0.5;
+  NumpyArrayIterForTest iter(sparsity);
+  auto n_threads = 0;
+  bst_bin_t n_bins = 16;
+  BatchParam batch{n_bins, tree::TrainParam::DftSparseThreshold()};
+  Context ctx;
+  auto m = std::make_shared<data::IterativeDMatrix>(&iter, iter.Proxy(), nullptr, Reset, Next,
+                                                    std::numeric_limits<float>::quiet_NaN(),
+                                                    n_threads, n_bins);
+  for (auto const& page : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
+    auto const& column_matrix = page.Transpose();
+    auto const& missing = column_matrix.Missing();
+    auto n = NumpyArrayIterForTest::Rows() * NumpyArrayIterForTest::Cols();
+    auto expected = std::remove_reference_t<decltype(missing)>::BitFieldT::ComputeStorageSize(n);
+    auto got = missing.storage.size();
+    ASSERT_EQ(expected, got);
+    DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) {
+      using T = decltype(dtype);
+      auto col = column_matrix.DenseColumn<T, true>(0);
+      CheckColumWithMissingValue(col, page);
+    });
+  }
+}
+}  // namespace xgboost::common
--- a/tests/cpp/common/test_hist_util.cc
+++ b/tests/cpp/common/test_hist_util.cc
@@ -27,8 +27,8 @@ void ParallelGHistBuilderReset() {

  for(size_t inode = 0; inode < kNodesExtended; inode++) {
    collection.AddHistRow(inode);
+    collection.AllocateData(inode);
  }
-  collection.AllocateAllData();
  ParallelGHistBuilder hist_builder;
  hist_builder.Init(kBins);
  std::vector<GHistRow> target_hist(kNodes);
@@ -83,8 +83,8 @@ void ParallelGHistBuilderReduceHist(){

  for(size_t inode = 0; inode < kNodes; inode++) {
    collection.AddHistRow(inode);
+    collection.AllocateData(inode);
  }
-  collection.AllocateAllData();
  ParallelGHistBuilder hist_builder;
  hist_builder.Init(kBins);
  std::vector<GHistRow> target_hist(kNodes);
@@ -129,7 +129,7 @@ TEST(CutsBuilder, SearchGroupInd) {

  auto p_mat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();

-  std::vector<bst_int> group(kNumGroups);
+  std::vector<bst_group_t> group(kNumGroups);
  group[0] = 2;
  group[1] = 3;
  group[2] = 7;
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -3,18 +3,23 @@
 */
 #include <gtest/gtest.h>
 #include <thrust/device_vector.h>
+#include <xgboost/base.h>  // for bst_bin_t
 #include <xgboost/c_api.h>
 #include <xgboost/data.h>

-#include <algorithm>
-#include <cmath>
+#include <algorithm>  // for transform
+#include <cmath>      // for floor
+#include <cstddef>    // for size_t
+#include <limits>     // for numeric_limits
+#include <string>     // for string, to_string
+#include <tuple>      // for tuple, make_tuple
+#include <vector>     // for vector

 #include "../../../include/xgboost/logging.h"
 #if defined(XGBOOST_USE_CUDA)
 #include "../../../src/common/device_helpers.cuh"
 #include "../../../src/common/hist_util.cuh"
 #include "../../../src/common/hist_util.h"
-#include "../../../src/common/math.h"
 #include "../../../src/data/device_adapter.cuh"
 #elif defined(XGBOOST_USE_HIP)
 #include "../../../src/common/device_helpers.hip.h"
@@ -29,8 +34,7 @@
 #include "../helpers.h"
 #include "test_hist_util.h"

-namespace xgboost {
-namespace common {
+namespace xgboost::common {

 template <typename AdapterT>
 HistogramCuts GetHostCuts(Context const* ctx, AdapterT* adapter, int num_bins, float missing) {
@@ -40,16 +44,17 @@ HistogramCuts GetHostCuts(Context const* ctx, AdapterT* adapter, int num_bins, f
 }

 TEST(HistUtil, DeviceSketch) {
+  auto ctx = MakeCUDACtx(0);
  int num_columns = 1;
  int num_bins = 4;
  std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, -1.0f};
  int num_rows = x.size();
  auto dmat = GetDMatrixFromData(x, num_rows, num_columns);

-  auto device_cuts = DeviceSketch(0, dmat.get(), num_bins);
+  auto device_cuts = DeviceSketch(&ctx, dmat.get(), num_bins);

-  Context ctx;
-  HistogramCuts host_cuts = SketchOnDMatrix(&ctx, dmat.get(), num_bins);
+  Context cpu_ctx;
+  HistogramCuts host_cuts = SketchOnDMatrix(&cpu_ctx, dmat.get(), num_bins);

  EXPECT_EQ(device_cuts.Values(), host_cuts.Values());
  EXPECT_EQ(device_cuts.Ptrs(), host_cuts.Ptrs());
@@ -79,6 +84,7 @@ TEST(HistUtil, SketchBatchNumElements) {
 }

 TEST(HistUtil, DeviceSketchMemory) {
+  auto ctx = MakeCUDACtx(0);
  int num_columns = 100;
  int num_rows = 1000;
  int num_bins = 256;
@@ -87,7 +93,7 @@ TEST(HistUtil, DeviceSketchMemory) {

  dh::GlobalMemoryLogger().Clear();
  ConsoleLogger::Configure({{"verbosity", "3"}});
-  auto device_cuts = DeviceSketch(0, dmat.get(), num_bins);
+  auto device_cuts = DeviceSketch(&ctx, dmat.get(), num_bins);

  size_t bytes_required = detail::RequiredMemory(
      num_rows, num_columns, num_rows * num_columns, num_bins, false);
@@ -97,6 +103,7 @@ TEST(HistUtil, DeviceSketchMemory) {
 }

 TEST(HistUtil, DeviceSketchWeightsMemory) {
+  auto ctx = MakeCUDACtx(0);
  int num_columns = 100;
  int num_rows = 1000;
  int num_bins = 256;
@@ -106,7 +113,7 @@ TEST(HistUtil, DeviceSketchWeightsMemory) {

  dh::GlobalMemoryLogger().Clear();
  ConsoleLogger::Configure({{"verbosity", "3"}});
-  auto device_cuts = DeviceSketch(0, dmat.get(), num_bins);
+  auto device_cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
  ConsoleLogger::Configure({{"verbosity", "0"}});

  size_t bytes_required = detail::RequiredMemory(
@@ -116,52 +123,56 @@ TEST(HistUtil, DeviceSketchWeightsMemory) {
 }

 TEST(HistUtil, DeviceSketchDeterminism) {
+  auto ctx = MakeCUDACtx(0);
  int num_rows = 500;
  int num_columns = 5;
  int num_bins = 256;
  auto x = GenerateRandom(num_rows, num_columns);
  auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
-  auto reference_sketch = DeviceSketch(0, dmat.get(), num_bins);
+  auto reference_sketch = DeviceSketch(&ctx, dmat.get(), num_bins);
  size_t constexpr kRounds{ 100 };
  for (size_t r = 0; r < kRounds; ++r) {
-    auto new_sketch = DeviceSketch(0, dmat.get(), num_bins);
+    auto new_sketch = DeviceSketch(&ctx, dmat.get(), num_bins);
    ASSERT_EQ(reference_sketch.Values(), new_sketch.Values());
    ASSERT_EQ(reference_sketch.MinValues(), new_sketch.MinValues());
  }
 }

 TEST(HistUtil, DeviceSketchCategoricalAsNumeric) {
-  int categorical_sizes[] = {2, 6, 8, 12};
+  auto ctx = MakeCUDACtx(0);
+  auto categorical_sizes = {2, 6, 8, 12};
  int num_bins = 256;
-  int sizes[] = {25, 100, 1000};
+  auto sizes = {25, 100, 1000};
  for (auto n : sizes) {
    for (auto num_categories : categorical_sizes) {
      auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
      auto dmat = GetDMatrixFromData(x, n, 1);
-      auto cuts = DeviceSketch(0, dmat.get(), num_bins);
+      auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
      ValidateCuts(cuts, dmat.get(), num_bins);
    }
  }
 }

 TEST(HistUtil, DeviceSketchCategoricalFeatures) {
-  TestCategoricalSketch(1000, 256, 32, false,
-                        [](DMatrix *p_fmat, int32_t num_bins) {
-                          return DeviceSketch(0, p_fmat, num_bins);
-                        });
-  TestCategoricalSketch(1000, 256, 32, true,
-                        [](DMatrix *p_fmat, int32_t num_bins) {
-                          return DeviceSketch(0, p_fmat, num_bins);
-                        });
+  auto ctx = MakeCUDACtx(0);
+  TestCategoricalSketch(1000, 256, 32, false, [ctx](DMatrix* p_fmat, int32_t num_bins) {
+    return DeviceSketch(&ctx, p_fmat, num_bins);
+  });
+  TestCategoricalSketch(1000, 256, 32, true, [ctx](DMatrix* p_fmat, int32_t num_bins) {
+    return DeviceSketch(&ctx, p_fmat, num_bins);
+  });
 }

 void TestMixedSketch() {
  size_t n_samples = 1000, n_features = 2, n_categories = 3;
+  bst_bin_t n_bins = 64;
+
  std::vector<float> data(n_samples * n_features);
  SimpleLCG gen;
  SimpleRealUniformDistribution<float> cat_d{0.0f, static_cast<float>(n_categories)};
  SimpleRealUniformDistribution<float> num_d{0.0f, 3.0f};
  for (size_t i = 0; i < n_samples * n_features; ++i) {
+    // two features, row major. The first column is categorical and the second is numeric.
    if (i % 2 == 0) {
      data[i] = std::floor(cat_d(&gen));
    } else {
@@ -173,46 +184,113 @@ void TestMixedSketch() {
  m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
  m->Info().feature_types.HostVector().push_back(FeatureType::kNumerical);

-  auto cuts = DeviceSketch(0, m.get(), 64);
-  ASSERT_EQ(cuts.Values().size(), 64 + n_categories);
+  auto ctx = MakeCUDACtx(0);
+  auto cuts = DeviceSketch(&ctx, m.get(), n_bins);
+  ASSERT_EQ(cuts.Values().size(), n_bins + n_categories);
 }

-TEST(HistUtil, DeviceSketchMixedFeatures) {
-  TestMixedSketch();
+TEST(HistUtil, DeviceSketchMixedFeatures) { TestMixedSketch(); }
+
+TEST(HistUtil, RemoveDuplicatedCategories) {
+  bst_row_t n_samples = 512;
+  bst_feature_t n_features = 3;
+  bst_cat_t n_categories = 5;
+
+  auto ctx = MakeCUDACtx(0);
+  SimpleLCG rng;
+  SimpleRealUniformDistribution<float> cat_d{0.0f, static_cast<float>(n_categories)};
+
+  dh::device_vector<Entry> sorted_entries(n_samples * n_features);
+  for (std::size_t i = 0; i < n_samples; ++i) {
+    for (bst_feature_t j = 0; j < n_features; ++j) {
+      float fvalue{0.0f};
+      // The second column is categorical
+      if (j == 1) {
+        fvalue = std::floor(cat_d(&rng));
+      } else {
+        fvalue = i;
+      }
+      sorted_entries[i * n_features + j] = Entry{j, fvalue};
+    }
+  }
+
+  MetaInfo info;
+  info.num_col_ = n_features;
+  info.num_row_ = n_samples;
+  info.feature_types.HostVector() = std::vector<FeatureType>{
+      FeatureType::kNumerical, FeatureType::kCategorical, FeatureType::kNumerical};
+  ASSERT_EQ(info.feature_types.Size(), n_features);
+
+  HostDeviceVector<bst_row_t> cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3};
+  cuts_ptr.SetDevice(0);
+
+  dh::device_vector<float> weight(n_samples * n_features, 0);
+  dh::Iota(dh::ToSpan(weight));
+
+  dh::caching_device_vector<bst_row_t> columns_ptr(4);
+  for (std::size_t i = 0; i < columns_ptr.size(); ++i) {
+    columns_ptr[i] = i * n_samples;
+  }
+  // sort into column major
+  thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(),
+                      detail::EntryCompareOp());
+
+  detail::RemoveDuplicatedCategories(ctx.gpu_id, info, cuts_ptr.DeviceSpan(), &sorted_entries,
+                                     &weight, &columns_ptr);
+
+  auto const& h_cptr = cuts_ptr.ConstHostVector();
+  ASSERT_EQ(h_cptr.back(), n_samples * 2 + n_categories);
+  // check numerical
+  for (std::size_t i = 0; i < n_samples; ++i) {
+    ASSERT_EQ(weight[i], i * 3);
+  }
+  auto beg = n_samples + n_categories;
+  for (std::size_t i = 0; i < n_samples; ++i) {
+    ASSERT_EQ(weight[i + beg], i * 3 + 2);
+  }
+  // check categorical
+  beg = n_samples;
+  for (bst_cat_t i = 0; i < n_categories; ++i) {
+    // all from the second column
+    ASSERT_EQ(static_cast<bst_feature_t>(weight[i + beg]) % n_features, 1);
+  }
 }

 TEST(HistUtil, DeviceSketchMultipleColumns) {
-  int bin_sizes[] = {2, 16, 256, 512};
-  int sizes[] = {100, 1000, 1500};
+  auto ctx = MakeCUDACtx(0);
+  auto bin_sizes = {2, 16, 256, 512};
+  auto sizes = {100, 1000, 1500};
  int num_columns = 5;
  for (auto num_rows : sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
    auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
    for (auto num_bins : bin_sizes) {
-      auto cuts = DeviceSketch(0, dmat.get(), num_bins);
+      auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
      ValidateCuts(cuts, dmat.get(), num_bins);
    }
  }
 }

 TEST(HistUtil, DeviceSketchMultipleColumnsWeights) {
-  int bin_sizes[] = {2, 16, 256, 512};
-  int sizes[] = {100, 1000, 1500};
+  auto ctx = MakeCUDACtx(0);
+  auto bin_sizes = {2, 16, 256, 512};
+  auto sizes = {100, 1000, 1500};
  int num_columns = 5;
  for (auto num_rows : sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
    auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
    dmat->Info().weights_.HostVector() = GenerateRandomWeights(num_rows);
    for (auto num_bins : bin_sizes) {
-      auto cuts = DeviceSketch(0, dmat.get(), num_bins);
+      auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
      ValidateCuts(cuts, dmat.get(), num_bins);
    }
  }
 }

 TEST(HistUitl, DeviceSketchWeights) {
-  int bin_sizes[] = {2, 16, 256, 512};
-  int sizes[] = {100, 1000, 1500};
+  auto ctx = MakeCUDACtx(0);
+  auto bin_sizes = {2, 16, 256, 512};
+  auto sizes = {100, 1000, 1500};
  int num_columns = 5;
  for (auto num_rows : sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
@@ -222,8 +300,8 @@ TEST(HistUitl, DeviceSketchWeights) {
    h_weights.resize(num_rows);
    std::fill(h_weights.begin(), h_weights.end(), 1.0f);
    for (auto num_bins : bin_sizes) {
-      auto cuts = DeviceSketch(0, dmat.get(), num_bins);
-      auto wcuts = DeviceSketch(0, weighted_dmat.get(), num_bins);
+      auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
+      auto wcuts = DeviceSketch(&ctx, weighted_dmat.get(), num_bins);
      ASSERT_EQ(cuts.MinValues(), wcuts.MinValues());
      ASSERT_EQ(cuts.Ptrs(), wcuts.Ptrs());
      ASSERT_EQ(cuts.Values(), wcuts.Values());
@@ -234,14 +312,15 @@ TEST(HistUitl, DeviceSketchWeights) {
 }

 TEST(HistUtil, DeviceSketchBatches) {
+  auto ctx = MakeCUDACtx(0);
  int num_bins = 256;
  int num_rows = 5000;
-  int batch_sizes[] = {0, 100, 1500, 6000};
+  auto batch_sizes = {0, 100, 1500, 6000};
  int num_columns = 5;
  for (auto batch_size : batch_sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
    auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
-    auto cuts = DeviceSketch(0, dmat.get(), num_bins, batch_size);
+    auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins, batch_size);
    ValidateCuts(cuts, dmat.get(), num_bins);
  }

@@ -249,8 +328,8 @@ TEST(HistUtil, DeviceSketchBatches) {
  size_t batches = 16;
  auto x = GenerateRandom(num_rows * batches, num_columns);
  auto dmat = GetDMatrixFromData(x, num_rows * batches, num_columns);
-  auto cuts_with_batches = DeviceSketch(0, dmat.get(), num_bins, num_rows);
-  auto cuts = DeviceSketch(0, dmat.get(), num_bins, 0);
+  auto cuts_with_batches = DeviceSketch(&ctx, dmat.get(), num_bins, num_rows);
+  auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins, 0);

  auto const& cut_values_batched = cuts_with_batches.Values();
  auto const& cut_values = cuts.Values();
@@ -261,15 +340,16 @@ TEST(HistUtil, DeviceSketchBatches) {
 }

 TEST(HistUtil, DeviceSketchMultipleColumnsExternal) {
-  int bin_sizes[] = {2, 16, 256, 512};
-  int sizes[] = {100, 1000, 1500};
+  auto ctx = MakeCUDACtx(0);
+  auto bin_sizes = {2, 16, 256, 512};
+  auto sizes = {100, 1000, 1500};
  int num_columns =5;
  for (auto num_rows : sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
    dmlc::TemporaryDirectory temp;
    auto dmat = GetExternalMemoryDMatrixFromData(x, num_rows, num_columns, temp);
    for (auto num_bins : bin_sizes) {
-      auto cuts = DeviceSketch(0, dmat.get(), num_bins);
+      auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
      ValidateCuts(cuts, dmat.get(), num_bins);
    }
  }
@@ -277,8 +357,9 @@ TEST(HistUtil, DeviceSketchMultipleColumnsExternal) {

 // See https://github.com/dmlc/xgboost/issues/5866.
 TEST(HistUtil, DeviceSketchExternalMemoryWithWeights) {
-  int bin_sizes[] = {2, 16, 256, 512};
-  int sizes[] = {100, 1000, 1500};
+  auto ctx = MakeCUDACtx(0);
+  auto bin_sizes = {2, 16, 256, 512};
+  auto sizes = {100, 1000, 1500};
  int num_columns = 5;
  dmlc::TemporaryDirectory temp;
  for (auto num_rows : sizes) {
@@ -286,7 +367,7 @@ TEST(HistUtil, DeviceSketchExternalMemoryWithWeights) {
    auto dmat = GetExternalMemoryDMatrixFromData(x, num_rows, num_columns, temp);
    dmat->Info().weights_.HostVector() = GenerateRandomWeights(num_rows);
    for (auto num_bins : bin_sizes) {
-      auto cuts = DeviceSketch(0, dmat.get(), num_bins);
+      auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
      ValidateCuts(cuts, dmat.get(), num_bins);
    }
  }
@@ -299,7 +380,7 @@ auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing,
  SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), 0);
  MetaInfo info;
  AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size);
-  sketch_container.MakeCuts(&batched_cuts);
+  sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit());
  return batched_cuts;
 }

@@ -367,7 +448,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) {
  AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits<float>::quiet_NaN(),
                      &sketch_container);
  HistogramCuts cuts;
-  sketch_container.MakeCuts(&cuts);
+  sketch_container.MakeCuts(&cuts, info.IsColumnSplit());
  size_t bytes_required = detail::RequiredMemory(
      num_rows, num_columns, num_rows * num_columns, num_bins, false);
  EXPECT_LE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required * 1.05);
@@ -397,7 +478,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
                      &sketch_container);

  HistogramCuts cuts;
-  sketch_container.MakeCuts(&cuts);
+  sketch_container.MakeCuts(&cuts, info.IsColumnSplit());
  ConsoleLogger::Configure({{"verbosity", "0"}});
  size_t bytes_required = detail::RequiredMemory(
      num_rows, num_columns, num_rows * num_columns, num_bins, true);
@@ -430,7 +511,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
  AdapterDeviceSketch(adapter.Value(), num_bins, info,
                      std::numeric_limits<float>::quiet_NaN(), &container);
  HistogramCuts cuts;
-  container.MakeCuts(&cuts);
+  container.MakeCuts(&cuts, info.IsColumnSplit());

  thrust::sort(x.begin(), x.end());
  auto n_uniques = thrust::unique(x.begin(), x.end()) - x.begin();
@@ -452,9 +533,9 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
 }

 TEST(HistUtil, AdapterDeviceSketchCategorical) {
-  int categorical_sizes[] = {2, 6, 8, 12};
+  auto categorical_sizes = {2, 6, 8, 12};
  int num_bins = 256;
-  int sizes[] = {25, 100, 1000};
+  auto sizes = {25, 100, 1000};
  for (auto n : sizes) {
    for (auto num_categories : categorical_sizes) {
      auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
@@ -469,8 +550,8 @@ TEST(HistUtil, AdapterDeviceSketchCategorical) {
 }

 TEST(HistUtil, AdapterDeviceSketchMultipleColumns) {
-  int bin_sizes[] = {2, 16, 256, 512};
-  int sizes[] = {100, 1000, 1500};
+  auto bin_sizes = {2, 16, 256, 512};
+  auto sizes = {100, 1000, 1500};
  int num_columns = 5;
  for (auto num_rows : sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
@@ -486,7 +567,7 @@ TEST(HistUtil, AdapterDeviceSketchMultipleColumns) {
 TEST(HistUtil, AdapterDeviceSketchBatches) {
  int num_bins = 256;
  int num_rows = 5000;
-  int batch_sizes[] = {0, 100, 1500, 6000};
+  auto batch_sizes = {0, 100, 1500, 6000};
  int num_columns = 5;
  for (auto batch_size : batch_sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
@@ -571,14 +652,15 @@ TEST(HistUtil, GetColumnSize) {
 // Check sketching from adapter or DMatrix results in the same answer
 // Consistency here is useful for testing and user experience
 TEST(HistUtil, SketchingEquivalent) {
-  int bin_sizes[] = {2, 16, 256, 512};
-  int sizes[] = {100, 1000, 1500};
+  auto ctx = MakeCUDACtx(0);
+  auto bin_sizes = {2, 16, 256, 512};
+  auto sizes = {100, 1000, 1500};
  int num_columns = 5;
  for (auto num_rows : sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
    auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
    for (auto num_bins : bin_sizes) {
-      auto dmat_cuts = DeviceSketch(0, dmat.get(), num_bins);
+      auto dmat_cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
      auto x_device = thrust::device_vector<float>(x);
      auto adapter = AdapterFromData(x_device, num_rows, num_columns);
      common::HistogramCuts adapter_cuts = MakeUnweightedCutsForTest(
@@ -593,21 +675,25 @@ TEST(HistUtil, SketchingEquivalent) {
 }

 TEST(HistUtil, DeviceSketchFromGroupWeights) {
+  auto ctx = MakeCUDACtx(0);
  size_t constexpr kRows = 3000, kCols = 200, kBins = 256;
  size_t constexpr kGroups = 10;
  auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
+
+  // sketch with group weight
  auto& h_weights = m->Info().weights_.HostVector();
-  h_weights.resize(kRows);
+  h_weights.resize(kGroups);
  std::fill(h_weights.begin(), h_weights.end(), 1.0f);
  std::vector<bst_group_t> groups(kGroups);
  for (size_t i = 0; i < kGroups; ++i) {
    groups[i] = kRows / kGroups;
  }
  m->SetInfo("group", groups.data(), DataType::kUInt32, kGroups);
-  HistogramCuts weighted_cuts = DeviceSketch(0, m.get(), kBins, 0);
+  HistogramCuts weighted_cuts = DeviceSketch(&ctx, m.get(), kBins, 0);

+  // sketch with no weight
  h_weights.clear();
-  HistogramCuts cuts = DeviceSketch(0, m.get(), kBins, 0);
+  HistogramCuts cuts = DeviceSketch(&ctx, m.get(), kBins, 0);

  ASSERT_EQ(cuts.Values().size(), weighted_cuts.Values().size());
  ASSERT_EQ(cuts.MinValues().size(), weighted_cuts.MinValues().size());
@@ -662,7 +748,7 @@ void TestAdapterSketchFromWeights(bool with_group) {
                      &sketch_container);

  common::HistogramCuts cuts;
-  sketch_container.MakeCuts(&cuts);
+  sketch_container.MakeCuts(&cuts, info.IsColumnSplit());

  auto dmat = GetDMatrixFromData(storage.HostVector(), kRows, kCols);
  if (with_group) {
@@ -675,9 +761,10 @@ void TestAdapterSketchFromWeights(bool with_group) {
  ASSERT_EQ(cuts.Ptrs().size(), kCols + 1);
  ValidateCuts(cuts, dmat.get(), kBins);

+  auto cuda_ctx = MakeCUDACtx(0);
  if (with_group) {
    dmat->Info().weights_ = decltype(dmat->Info().weights_)();  // remove weight
-    HistogramCuts non_weighted = DeviceSketch(0, dmat.get(), kBins, 0);
+    HistogramCuts non_weighted = DeviceSketch(&cuda_ctx, dmat.get(), kBins, 0);
    for (size_t i = 0; i < cuts.Values().size(); ++i) {
      ASSERT_EQ(cuts.Values()[i], non_weighted.Values()[i]);
    }
@@ -703,7 +790,7 @@ void TestAdapterSketchFromWeights(bool with_group) {
    SketchContainer sketch_container(ft, kBins, kCols, kRows, 0);
    AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
                        &sketch_container);
-    sketch_container.MakeCuts(&weighted);
+    sketch_container.MakeCuts(&weighted, info.IsColumnSplit());
    ValidateCuts(weighted, dmat.get(), kBins);
  }
 }
@@ -712,5 +799,156 @@ TEST(HistUtil, AdapterSketchFromWeights) {
  TestAdapterSketchFromWeights(false);
  TestAdapterSketchFromWeights(true);
 }
-}  // namespace common
-}  // namespace xgboost
+
+namespace {
+class DeviceSketchWithHessianTest  // param: (true = LTR / false = regression, n_samples, n_bins)
+    : public ::testing::TestWithParam<std::tuple<bool, bst_row_t, bst_bin_t>> {
+  bst_feature_t n_features_ = 5;  // number of columns in the generated data
+  bst_group_t n_groups_{3};       // number of query groups built by TestLTR
+  // Returns shuffled weights from GenerateRandomWeights with the device set to ctx->Device().
+  auto GenerateHessian(Context const* ctx, bst_row_t n_samples) const {
+    HostDeviceVector<float> hessian;
+    auto& h_hess = hessian.HostVector();
+    h_hess = GenerateRandomWeights(n_samples);
+    std::mt19937 rng(0);  // fixed seed keeps the shuffle reproducible
+    std::shuffle(h_hess.begin(), h_hess.end(), rng);
+    hessian.SetDevice(ctx->Device());
+    return hessian;
+  }
+  // Checks that sketching with a hessian matches sketching with weights w[i] * hess[i].
+  void CheckReg(Context const* ctx, std::shared_ptr<DMatrix> p_fmat, bst_bin_t n_bins,
+                HostDeviceVector<float> const& hessian, std::vector<float> const& w,
+                std::size_t n_elements) const {
+    auto const& h_hess = hessian.ConstHostVector();
+    {
+      auto& h_weight = p_fmat->Info().weights_.HostVector();
+      h_weight = w;  // start from the plain sample weights
+    }
+
+    HistogramCuts cuts_hess =
+        DeviceSketchWithHessian(ctx, p_fmat.get(), n_bins, hessian.ConstDeviceSpan(), n_elements);
+    ValidateCuts(cuts_hess, p_fmat.get(), n_bins);
+
+    // merge the hessian into the sample weights: weight[i] = w[i] * hess[i]
+    {
+      auto& h_weight = p_fmat->Info().weights_.HostVector();
+      ASSERT_EQ(h_weight.size(), h_hess.size());
+      for (std::size_t i = 0; i < h_weight.size(); ++i) {
+        h_weight[i] = w[i] * h_hess[i];
+      }
+    }
+
+    HistogramCuts cuts_wh = DeviceSketch(ctx, p_fmat.get(), n_bins, n_elements);
+    ValidateCuts(cuts_wh, p_fmat.get(), n_bins);
+    ASSERT_EQ(cuts_hess.Values().size(), cuts_wh.Values().size());
+    for (std::size_t i = 0; i < cuts_hess.Values().size(); ++i) {
+      ASSERT_NEAR(cuts_wh.Values()[i], cuts_hess.Values()[i], kRtEps);
+    }
+
+    p_fmat->Info().weights_.HostVector() = w;  // restore the original weights
+  }
+
+ protected:
+  Context ctx_ = MakeCUDACtx(0);  // context handed to the test bodies by TEST_P
+  // LTR case: group weights combined with a per-sample hessian.
+  void TestLTR(Context const* ctx, bst_row_t n_samples, bst_bin_t n_bins,
+               std::size_t n_elements) const {
+    auto x = GenerateRandom(n_samples, n_features_);
+    // group pointer; indices 1 and 2 are written directly, which relies on n_groups_ == 3
+    std::vector<bst_group_t> gptr;
+    gptr.resize(n_groups_ + 1, 0);
+    gptr[1] = n_samples / n_groups_;
+    gptr[2] = n_samples / n_groups_ + gptr[1];
+    gptr.back() = n_samples;
+
+    auto hessian = this->GenerateHessian(ctx, n_samples);
+    auto const& h_hess = hessian.ConstHostVector();
+    auto p_fmat = GetDMatrixFromData(x, n_samples, n_features_);
+    p_fmat->Info().group_ptr_ = gptr;
+
+    // test with constant group weight
+    std::vector<float> w(n_groups_, 1.0f);
+    p_fmat->Info().weights_.HostVector() = w;
+    HistogramCuts cuts_hess =
+        DeviceSketchWithHessian(ctx, p_fmat.get(), n_bins, hessian.ConstDeviceSpan(), n_elements);
+    // make validation easier by converting it into sample weight.
+    p_fmat->Info().weights_.HostVector() = h_hess;
+    p_fmat->Info().group_ptr_.clear();
+    ValidateCuts(cuts_hess, p_fmat.get(), n_bins);
+    // restore ltr properties
+    p_fmat->Info().weights_.HostVector() = w;
+    p_fmat->Info().group_ptr_ = gptr;
+
+    // test with random group weight
+    w = GenerateRandomWeights(n_groups_);
+    p_fmat->Info().weights_.HostVector() = w;
+    cuts_hess =
+        DeviceSketchWithHessian(ctx, p_fmat.get(), n_bins, hessian.ConstDeviceSpan(), n_elements);
+    // make validation easier by converting it into sample weight.
+    p_fmat->Info().weights_.HostVector() = h_hess;
+    p_fmat->Info().group_ptr_.clear();
+    ValidateCuts(cuts_hess, p_fmat.get(), n_bins);
+
+    // merge hessian with sample weight: weight[i] = w[group of i] * hess[i]
+    p_fmat->Info().weights_.Resize(n_samples);
+    p_fmat->Info().group_ptr_.clear();
+    for (std::size_t i = 0; i < h_hess.size(); ++i) {
+      auto gidx = dh::SegmentId(Span{gptr.data(), gptr.size()}, i);
+      p_fmat->Info().weights_.HostVector()[i] = w[gidx] * h_hess[i];
+    }
+    auto cuts = DeviceSketch(ctx, p_fmat.get(), n_bins, n_elements);
+    ValidateCuts(cuts, p_fmat.get(), n_bins);
+    ASSERT_EQ(cuts.Values().size(), cuts_hess.Values().size());
+    for (std::size_t i = 0; i < cuts.Values().size(); ++i) {
+      EXPECT_NEAR(cuts.Values()[i], cuts_hess.Values()[i], 1e-4f);
+    }
+  }
+  // Regression case: random sample weights and a random hessian, compared through CheckReg.
+  void TestRegression(Context const* ctx, bst_row_t n_samples, bst_bin_t n_bins,
+                      std::size_t n_elements) const {
+    auto x = GenerateRandom(n_samples, n_features_);
+    auto p_fmat = GetDMatrixFromData(x, n_samples, n_features_);
+    std::vector<float> w = GenerateRandomWeights(n_samples);
+
+    auto hessian = this->GenerateHessian(ctx, n_samples);
+
+    this->CheckReg(ctx, p_fmat, n_bins, hessian, w, n_elements);
+  }
+};
+
+auto MakeParamsForTest() {  // cross product of task flag x sample count x bin count
+  std::vector<bst_row_t> sizes = {1, 2, 256, 512, 1000, 1500};
+  std::vector<bst_bin_t> bin_sizes = {2, 16, 256, 512};
+  std::vector<std::tuple<bool, bst_row_t, bst_bin_t>> configs;
+  for (auto n_samples : sizes) {
+    for (auto n_bins : bin_sizes) {
+      configs.emplace_back(true, n_samples, n_bins);   // LTR variant
+      configs.emplace_back(false, n_samples, n_bins);  // regression variant
+    }
+  }
+  return configs;
+}
+}  // namespace
+
+TEST_P(DeviceSketchWithHessianTest, DeviceSketchWithHessian) {
+  auto param = GetParam();
+  auto n_samples = std::get<1>(param);
+  auto n_bins = std::get<2>(param);
+  if (std::get<0>(param)) {  // true selects the LTR path
+    this->TestLTR(&ctx_, n_samples, n_bins, 0);  // last argument is n_elements
+    this->TestLTR(&ctx_, n_samples, n_bins, 512);
+  } else {
+    this->TestRegression(&ctx_, n_samples, n_bins, 0);
+    this->TestRegression(&ctx_, n_samples, n_bins, 512);
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(  // test names have the form <ltr|reg>_<n_samples>_<n_bins>
+    HistUtil, DeviceSketchWithHessianTest, ::testing::ValuesIn(MakeParamsForTest()),
+    [](::testing::TestParamInfo<DeviceSketchWithHessianTest::ParamType> const& info) {
+      auto task = std::get<0>(info.param) ? "ltr" : "reg";
+      auto n_samples = std::to_string(std::get<1>(info.param));
+      auto n_bins = std::to_string(std::get<2>(info.param));
+      return std::string{task} + "_" + n_samples + "_" + n_bins;
+    });
+}  // namespace xgboost::common
--- a/tests/cpp/common/test_io.cc
+++ b/tests/cpp/common/test_io.cc
@@ -1,16 +1,16 @@
-/*!
- * Copyright (c) by XGBoost Contributors 2019
+/**
+ * Copyright 2019-2023, XGBoost Contributors
 */
 #include <gtest/gtest.h>

-#include <fstream>
+#include <cstddef>  // for size_t
+#include <fstream>  // for ofstream

 #include "../../../src/common/io.h"
-#include "../helpers.h"
 #include "../filesystem.h"  // dmlc::TemporaryDirectory
+#include "../helpers.h"

-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 TEST(MemoryFixSizeBuffer, Seek) {
  size_t constexpr kSize { 64 };
  std::vector<int32_t> memory( kSize );
@@ -63,31 +63,159 @@ TEST(IO, LoadSequentialFile) {

  // Generate a JSON file.
  size_t constexpr kRows = 1000, kCols = 100;
-  std::shared_ptr<DMatrix> p_dmat{
-    RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)};
-  std::unique_ptr<Learner> learner { Learner::Create({p_dmat}) };
+  std::shared_ptr<DMatrix> p_dmat{RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)};
+  std::unique_ptr<Learner> learner{Learner::Create({p_dmat})};
  learner->SetParam("tree_method", "hist");
  learner->Configure();

  for (int32_t iter = 0; iter < 10; ++iter) {
    learner->UpdateOneIter(iter, p_dmat);
  }
-  Json out { Object() };
+  Json out{Object()};
  learner->SaveModel(&out);
-  std::string str;
+  std::vector<char> str;
  Json::Dump(out, &str);

  std::string tmpfile = tempdir.path + "/model.json";
  {
-    std::unique_ptr<dmlc::Stream> fo(
-        dmlc::Stream::Create(tmpfile.c_str(), "w"));
-    fo->Write(str.c_str(), str.size());
+    std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(tmpfile.c_str(), "w"));
+    fo->Write(str.data(), str.size());
  }

-  auto loaded = LoadSequentialFile(tmpfile, true);
+  auto loaded = LoadSequentialFile(tmpfile);
  ASSERT_EQ(loaded, str);
-
-  ASSERT_THROW(LoadSequentialFile("non-exist", true), dmlc::Error);
 }
-}  // namespace common
-}  // namespace xgboost
+
+TEST(IO, Resource) {  // covers MallocResource (size, type, resize) and MmapResource
+  {
+    // test malloc basic
+    std::size_t n = 128;
+    std::shared_ptr<ResourceHandler> resource = std::make_shared<MallocResource>(n);
+    ASSERT_EQ(resource->Size(), n);
+    ASSERT_EQ(resource->Type(), ResourceHandler::kMalloc);
+  }
+
+  // test malloc resize; force_malloc selects Resize<true> vs. Resize<false>
+  auto test_malloc_resize = [](bool force_malloc) {
+    std::size_t n = 64;
+    std::shared_ptr<ResourceHandler> resource = std::make_shared<MallocResource>(n);
+    auto ptr = reinterpret_cast<std::uint8_t *>(resource->Data());
+    std::iota(ptr, ptr + n, 0);  // bytes 0..n-1 so preserved content can be recognised
+
+    auto malloc_resource = std::dynamic_pointer_cast<MallocResource>(resource);
+    ASSERT_TRUE(malloc_resource);
+    if (force_malloc) {
+      malloc_resource->Resize<true>(n * 2);
+    } else {
+      malloc_resource->Resize<false>(n * 2);
+    }
+    for (std::size_t i = 0; i < n; ++i) {  // the old content must survive the resize
+      ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], i) << force_malloc;
+    }
+    for (std::size_t i = n; i < 2 * n; ++i) {  // the new tail is expected to read as 0
+      ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], 0);
+    }
+
+    ptr = malloc_resource->DataAs<std::uint8_t>();
+    std::fill_n(ptr, malloc_resource->Size(), 7);
+    if (force_malloc) {
+      malloc_resource->Resize<true>(n * 3, std::byte{3});
+    } else {
+      malloc_resource->Resize<false>(n * 3, std::byte{3});
+    }
+    for (std::size_t i = 0; i < n * 2; ++i) {  // previously written 7s are kept
+      ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], 7);
+    }
+    for (std::size_t i = n * 2; i < n * 3; ++i) {  // the new tail carries the fill byte 3
+      ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], 3);
+    }
+  };
+  test_malloc_resize(true);
+  test_malloc_resize(false);
+
+  {
+    // test mmap
+    dmlc::TemporaryDirectory tmpdir;
+    auto path = tmpdir.path + "/testfile";
+
+    std::ofstream fout(path, std::ios::binary);
+    double val{1.0};
+    fout.write(reinterpret_cast<char const *>(&val), sizeof(val));  // raw bytes of the double
+    fout << 1.0 << std::endl;  // extra text after the double; the mapping below excludes it
+    fout.close();
+
+    auto resource = std::make_shared<MmapResource>(path, 0, sizeof(double));
+    ASSERT_EQ(resource->Size(), sizeof(double));
+    ASSERT_EQ(resource->Type(), ResourceHandler::kMmap);
+    ASSERT_EQ(resource->DataAs<double>()[0], val);
+  }
+}
+
+TEST(IO, PrivateMmapStream) {  // writes size-prefixed batches, then reads each back by offset
+  dmlc::TemporaryDirectory tempdir;
+  auto path = tempdir.path + "/testfile";
+
+  // The page size on Linux is usually set to 4096, while the allocation granularity on
+  // the Windows machine where this test is written is 65536. We span the test to cover
+  // all of them.
+  std::size_t n_batches{64};
+  std::size_t multiplier{2048};
+
+  std::vector<std::vector<std::int32_t>> batches;
+  std::vector<std::size_t> offset{0ul};  // holds per-batch byte sizes until the partial sum
+
+  using T = std::int32_t;
+
+  {
+    std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
+    for (std::size_t i = 0; i < n_batches; ++i) {
+      std::size_t size = (i + 1) * multiplier;  // batch i has (i + 1) * multiplier elements
+      std::vector<T> data(size, 0);
+      std::iota(data.begin(), data.end(), i * i);
+
+      fo->Write(static_cast<std::uint64_t>(data.size()));  // element count prefix
+      fo->Write(data.data(), data.size() * sizeof(T));
+
+      std::size_t bytes = sizeof(std::uint64_t) + data.size() * sizeof(T);
+      offset.push_back(bytes);
+
+      batches.emplace_back(std::move(data));
+    }
+  }
+
+  // Turn sizes into offsets
+  std::partial_sum(offset.begin(), offset.end(), offset.begin());
+
+  // Test read
+  for (std::size_t i = 0; i < n_batches; ++i) {
+    std::size_t off = offset[i];
+    std::size_t n = offset.at(i + 1) - offset[i];  // byte length of batch i including prefix
+    auto fi{std::make_unique<PrivateMmapConstStream>(path, off, n)};
+    std::vector<T> data;
+
+    std::uint64_t size{0};
+    ASSERT_TRUE(fi->Read(&size));
+    ASSERT_EQ(fi->Tell(), sizeof(size));
+    data.resize(size);
+
+    ASSERT_EQ(fi->Read(data.data(), size * sizeof(T)), size * sizeof(T));
+    ASSERT_EQ(data, batches[i]);
+  }
+
+  // Test consume; same as above, but the size prefix is fetched with Consume instead of Read
+  for (std::size_t i = 0; i < n_batches; ++i) {
+    std::size_t off = offset[i];
+    std::size_t n = offset.at(i + 1) - offset[i];
+    std::unique_ptr<AlignedResourceReadStream> fi{std::make_unique<PrivateMmapConstStream>(path, off, n)};
+    std::vector<T> data;
+
+    std::uint64_t size{0};
+    ASSERT_TRUE(fi->Consume(&size));
+    ASSERT_EQ(fi->Tell(), sizeof(size));
+    data.resize(size);
+
+    ASSERT_EQ(fi->Read(data.data(), size * sizeof(T)), sizeof(T) * size);
+    ASSERT_EQ(data, batches[i]);
+  }
+}
+}  // namespace xgboost::common
--- a/tests/cpp/common/test_json.cc
+++ b/tests/cpp/common/test_json.cc
@@ -41,7 +41,6 @@ std::string GetModelStr() {
    "num_class": "0",
    "num_feature": "10",
    "objective": "reg:linear",
-    "predictor": "gpu_predictor",
    "tree_method": "gpu_hist",
    "updater": "grow_gpu_hist"
  },
@@ -419,7 +418,7 @@ TEST(Json, AssigningString) {

 TEST(Json, LoadDump) {
  std::string ori_buffer = GetModelStr();
-  Json origin {Json::Load(StringView{ori_buffer.c_str(), ori_buffer.size()})};
+  Json origin{Json::Load(StringView{ori_buffer.c_str(), ori_buffer.size()})};

  dmlc::TemporaryDirectory tempdir;
  auto const& path = tempdir.path + "test_model_dump";
@@ -431,9 +430,9 @@ TEST(Json, LoadDump) {
  ASSERT_TRUE(fout);
  fout << out << std::flush;

-  std::string new_buffer = common::LoadSequentialFile(path);
+  std::vector<char> new_buffer = common::LoadSequentialFile(path);

-  Json load_back {Json::Load(StringView(new_buffer.c_str(), new_buffer.size()))};
+  Json load_back{Json::Load(StringView(new_buffer.data(), new_buffer.size()))};
  ASSERT_EQ(load_back, origin);
 }

@@ -652,7 +651,7 @@ TEST(UBJson, Basic) {
    }

    auto data = common::LoadSequentialFile("test.ubj");
-    UBJReader reader{StringView{data}};
+    UBJReader reader{StringView{data.data(), data.size()}};
    json = reader.Load();
    return json;
  };
--- a/tests/cpp/common/test_linalg.cc
+++ b/tests/cpp/common/test_linalg.cc
@@ -3,7 +3,7 @@
 */
 #include <gtest/gtest.h>
 #include <xgboost/context.h>
-#include <xgboost/host_device_vector.h>
+#include <xgboost/host_device_vector.h>  // for HostDeviceVector
 #include <xgboost/linalg.h>

 #include <cstddef>  // size_t
@@ -14,8 +14,8 @@

 namespace xgboost::linalg {
 namespace {
-auto kCpuId = Context::kCpuId;
-}
+DeviceOrd CPU() { return DeviceOrd::CPU(); }  // shorthand for the CPU device used by these tests
+}  // namespace

 auto MakeMatrixFromTest(HostDeviceVector<float> *storage, std::size_t n_rows, std::size_t n_cols) {
  storage->Resize(n_rows * n_cols);
@@ -23,7 +23,7 @@ auto MakeMatrixFromTest(HostDeviceVector<float> *storage, std::size_t n_rows, st

  std::iota(h_storage.begin(), h_storage.end(), 0);

-  auto m = linalg::TensorView<float, 2>{h_storage, {n_rows, static_cast<size_t>(n_cols)}, -1};
+  auto m = linalg::TensorView<float, 2>{h_storage, {n_rows, static_cast<size_t>(n_cols)}, CPU()};
  return m;
 }

@@ -31,7 +31,7 @@ TEST(Linalg, MatrixView) {
  size_t kRows = 31, kCols = 77;
  HostDeviceVector<float> storage;
  auto m = MakeMatrixFromTest(&storage, kRows, kCols);
-  ASSERT_EQ(m.DeviceIdx(), kCpuId);
+  ASSERT_EQ(m.Device(), CPU());
  ASSERT_EQ(m(0, 0), 0);
  ASSERT_EQ(m(kRows - 1, kCols - 1), storage.Size() - 1);
 }
@@ -76,7 +76,7 @@ TEST(Linalg, TensorView) {

  {
    // as vector
-    TensorView<double, 1> vec{data, {data.size()}, -1};
+    TensorView<double, 1> vec{data, {data.size()}, CPU()};
    ASSERT_EQ(vec.Size(), data.size());
    ASSERT_EQ(vec.Shape(0), data.size());
    ASSERT_EQ(vec.Shape().size(), 1);
@@ -87,7 +87,7 @@ TEST(Linalg, TensorView) {

  {
    // as matrix
-    TensorView<double, 2> mat(data, {6, 4}, -1);
+    TensorView<double, 2> mat(data, {6, 4}, CPU());
    auto s = mat.Slice(2, All());
    ASSERT_EQ(s.Shape().size(), 1);
    s = mat.Slice(All(), 1);
@@ -96,7 +96,7 @@ TEST(Linalg, TensorView) {

  {
    // assignment
-    TensorView<double, 3> t{data, {2, 3, 4}, 0};
+    TensorView<double, 3> t{data, {2, 3, 4}, CPU()};
    double pi = 3.14159;
    auto old = t(1, 2, 3);
    t(1, 2, 3) = pi;
@@ -201,7 +201,7 @@ TEST(Linalg, TensorView) {
  }
  {
    // f-contiguous
-    TensorView<double, 3> t{data, {4, 3, 2}, {1, 4, 12}, kCpuId};
+    TensorView<double, 3> t{data, {4, 3, 2}, {1, 4, 12}, CPU()};
    ASSERT_TRUE(t.Contiguous());
    ASSERT_TRUE(t.FContiguous());
    ASSERT_FALSE(t.CContiguous());
@@ -210,11 +210,11 @@ TEST(Linalg, TensorView) {

 TEST(Linalg, Tensor) {
  {
-    Tensor<float, 3> t{{2, 3, 4}, kCpuId, Order::kC};
-    auto view = t.View(kCpuId);
+    Tensor<float, 3> t{{2, 3, 4}, CPU(), Order::kC};
+    auto view = t.View(CPU());

    auto const &as_const = t;
-    auto k_view = as_const.View(kCpuId);
+    auto k_view = as_const.View(CPU());

    size_t n = 2 * 3 * 4;
    ASSERT_EQ(t.Size(), n);
@@ -229,7 +229,7 @@ TEST(Linalg, Tensor) {
  }
  {
    // Reshape
-    Tensor<float, 3> t{{2, 3, 4}, kCpuId, Order::kC};
+    Tensor<float, 3> t{{2, 3, 4}, CPU(), Order::kC};
    t.Reshape(4, 3, 2);
    ASSERT_EQ(t.Size(), 24);
    ASSERT_EQ(t.Shape(2), 2);
@@ -247,7 +247,7 @@ TEST(Linalg, Tensor) {

 TEST(Linalg, Empty) {
  {
-    auto t = TensorView<double, 2>{{}, {0, 3}, kCpuId, Order::kC};
+    auto t = TensorView<double, 2>{{}, {0, 3}, CPU(), Order::kC};
    for (int32_t i : {0, 1, 2}) {
      auto s = t.Slice(All(), i);
      ASSERT_EQ(s.Size(), 0);
@@ -256,9 +256,9 @@ TEST(Linalg, Empty) {
    }
  }
  {
-    auto t = Tensor<double, 2>{{0, 3}, kCpuId, Order::kC};
+    auto t = Tensor<double, 2>{{0, 3}, CPU(), Order::kC};
    ASSERT_EQ(t.Size(), 0);
-    auto view = t.View(kCpuId);
+    auto view = t.View(CPU());

    for (int32_t i : {0, 1, 2}) {
      auto s = view.Slice(All(), i);
@@ -270,7 +270,7 @@ TEST(Linalg, Empty) {
 }

 TEST(Linalg, ArrayInterface) {
-  auto cpu = kCpuId;
+  auto cpu = CPU();
  auto t = Tensor<double, 2>{{3, 3}, cpu, Order::kC};
  auto v = t.View(cpu);
  std::iota(v.Values().begin(), v.Values().end(), 0);
@@ -315,16 +315,16 @@ TEST(Linalg, Popc) {
 }

 TEST(Linalg, Stack) {
-  Tensor<float, 3> l{{2, 3, 4}, kCpuId, Order::kC};
-  ElementWiseTransformHost(l.View(kCpuId), omp_get_max_threads(),
+  Tensor<float, 3> l{{2, 3, 4}, CPU(), Order::kC};
+  ElementWiseTransformHost(l.View(CPU()), omp_get_max_threads(),
                           [=](size_t i, float) { return i; });
-  Tensor<float, 3> r_0{{2, 3, 4}, kCpuId, Order::kC};
-  ElementWiseTransformHost(r_0.View(kCpuId), omp_get_max_threads(),
+  Tensor<float, 3> r_0{{2, 3, 4}, CPU(), Order::kC};
+  ElementWiseTransformHost(r_0.View(CPU()), omp_get_max_threads(),
                           [=](size_t i, float) { return i; });

  Stack(&l, r_0);

-  Tensor<float, 3> r_1{{0, 3, 4}, kCpuId, Order::kC};
+  Tensor<float, 3> r_1{{0, 3, 4}, CPU(), Order::kC};
  Stack(&l, r_1);
  ASSERT_EQ(l.Shape(0), 4);

@@ -335,7 +335,7 @@ TEST(Linalg, Stack) {
 TEST(Linalg, FOrder) {
  std::size_t constexpr kRows = 16, kCols = 3;
  std::vector<float> data(kRows * kCols);
-  MatrixView<float> mat{data, {kRows, kCols}, Context::kCpuId, Order::kF};
+  MatrixView<float> mat{data, {kRows, kCols}, CPU(), Order::kF};
  float k{0};
  for (std::size_t i = 0; i < kRows; ++i) {
    for (std::size_t j = 0; j < kCols; ++j) {
--- a/tests/cpp/common/test_linalg.cu
+++ b/tests/cpp/common/test_linalg.cu
@@ -8,23 +8,25 @@
 #elif defined(XGBOOST_USE_HIP)
 #include "../../../src/common/linalg_op.hip.h"
 #endif
+#include "../helpers.h"
 #include "xgboost/context.h"
 #include "xgboost/linalg.h"

 namespace xgboost::linalg {
 namespace {
 void TestElementWiseKernel() {
+  auto device = DeviceOrd::CUDA(0);
  Tensor<float, 3> l{{2, 3, 4}, 0};
  {
    /**
     * Non-contiguous
     */
    // GPU view
-    auto t = l.View(0).Slice(linalg::All(), 1, linalg::All());
+    auto t = l.View(device).Slice(linalg::All(), 1, linalg::All());
    ASSERT_FALSE(t.CContiguous());
    ElementWiseTransformDevice(t, [] __device__(size_t i, float) { return i; });
    // CPU view
-    t = l.View(Context::kCpuId).Slice(linalg::All(), 1, linalg::All());
+    t = l.View(DeviceOrd::CPU()).Slice(linalg::All(), 1, linalg::All());
    size_t k = 0;
    for (size_t i = 0; i < l.Shape(0); ++i) {
      for (size_t j = 0; j < l.Shape(2); ++j) {
@@ -32,7 +34,7 @@ void TestElementWiseKernel() {
      }
    }

-    t = l.View(0).Slice(linalg::All(), 1, linalg::All());
+    t = l.View(device).Slice(linalg::All(), 1, linalg::All());
    ElementWiseKernelDevice(t, [] XGBOOST_DEVICE(size_t i, float v) { SPAN_CHECK(v == i); });
  }

@@ -40,11 +42,11 @@ void TestElementWiseKernel() {
    /**
     * Contiguous
     */
-    auto t = l.View(0);
+    auto t = l.View(device);
    ElementWiseTransformDevice(t, [] XGBOOST_DEVICE(size_t i, float) { return i; });
    ASSERT_TRUE(t.CContiguous());
    // CPU view
-    t = l.View(Context::kCpuId);
+    t = l.View(DeviceOrd::CPU());

    size_t ind = 0;
    for (size_t i = 0; i < l.Shape(0); ++i) {
@@ -58,8 +60,7 @@ void TestElementWiseKernel() {
 }

 void TestSlice() {
-  Context ctx;
-  ctx.gpu_id = 1;
+  auto ctx = MakeCUDACtx(1);
  thrust::device_vector<double> data(2 * 3 * 4);
  auto t = MakeTensorView(&ctx, dh::ToSpan(data), 2, 3, 4);
  dh::LaunchN(1, [=] __device__(size_t) {
--- a/tests/cpp/common/test_quantile.cu
+++ b/tests/cpp/common/test_quantile.cu
@@ -1,15 +1,21 @@
+/**
+ * Copyright 2020-2023, XGBoost contributors
+ */
 #include <gtest/gtest.h>
-#include "test_quantile.h"
-#include "../helpers.h"
+
 #if defined(XGBOOST_USE_CUDA)
 #include "../../../src/collective/communicator-inl.cuh"
 #include "../../../src/common/hist_util.cuh"
 #include "../../../src/common/quantile.cuh"
+#include "../../../src/data/device_adapter.cuh"  // CupyAdapter
 #elif defined(XGBOOST_USE_HIP)
 #include "../../../src/collective/communicator-inl.hip.h"
 #include "../../../src/common/hist_util.hip.h"
 #include "../../../src/common/quantile.hip.h"
+#include "../../../src/data/device_adapter.hip.h"  // CupyAdapter
 #endif
+#include "../helpers.h"
+#include "test_quantile.h"

 namespace xgboost {
 namespace {
@@ -20,6 +26,9 @@ struct IsSorted {
 };
 }
 namespace common {
+
+class MGPUQuantileTest : public BaseMGPUTest {};  // fixture for the TEST_F cases in this file
+
 TEST(GPUQuantile, Basic) {
  constexpr size_t kRows = 1000, kCols = 100, kBins = 256;
  HostDeviceVector<FeatureType> ft;
@@ -349,12 +358,11 @@ TEST(GPUQuantile, MultiMerge) {
 }

 namespace {
-void TestAllReduceBasic(int32_t n_gpus) {
+void TestAllReduceBasic() {
  auto const world = collective::GetWorldSize();
-  CHECK_EQ(world, n_gpus);
  constexpr size_t kRows = 1000, kCols = 100;
  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
-    auto const device = collective::GetRank();
+    auto const device = GPUIDX;

    // Set up single node version;
    HostDeviceVector<FeatureType> ft({}, device);
@@ -398,7 +406,7 @@ void TestAllReduceBasic(int32_t n_gpus) {
    AdapterDeviceSketch(adapter.Value(), n_bins, info,
                        std::numeric_limits<float>::quiet_NaN(),
                        &sketch_distributed);
-    sketch_distributed.AllReduce();
+    sketch_distributed.AllReduce(false);
    sketch_distributed.Unique();

    ASSERT_EQ(sketch_distributed.ColumnsPtr().size(),
@@ -427,23 +435,66 @@ void TestAllReduceBasic(int32_t n_gpus) {
 }
 }  // anonymous namespace

-TEST(GPUQuantile, MGPUAllReduceBasic) {
-  auto const n_gpus = AllVisibleGPUs();
-  if (n_gpus <= 1) {
-    GTEST_SKIP() << "Skipping MGPUAllReduceBasic test with # GPUs = " << n_gpus;
-  }
-  RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus);
+TEST_F(MGPUQuantileTest, AllReduceBasic) {
+  DoTest(TestAllReduceBasic);
 }

 namespace {
-void TestSameOnAllWorkers(std::int32_t n_gpus) {
+void TestColumnSplitBasic() {  // compares cuts sketched with and without the communicator
+  auto const world = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  std::size_t constexpr kRows = 1000, kCols = 100, kBins = 64;
+
+  auto m = std::unique_ptr<DMatrix>{[=]() {
+    auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
+    return dmat->SliceCol(world, rank);  // this worker's column slice
+  }()};
+
+  // Generate cuts for distributed environment.
+  auto ctx = MakeCUDACtx(GPUIDX);
+  HistogramCuts distributed_cuts = common::DeviceSketch(&ctx, m.get(), kBins);
+
+  // Generate cuts for single node environment
+  collective::Finalize();
+  CHECK_EQ(collective::GetWorldSize(), 1);  // after Finalize the world size must be 1
+  HistogramCuts single_node_cuts = common::DeviceSketch(&ctx, m.get(), kBins);
+
+  auto const& sptrs = single_node_cuts.Ptrs();
+  auto const& dptrs = distributed_cuts.Ptrs();
+  auto const& svals = single_node_cuts.Values();
+  auto const& dvals = distributed_cuts.Values();
+  auto const& smins = single_node_cuts.MinValues();
+  auto const& dmins = distributed_cuts.MinValues();
+
+  EXPECT_EQ(sptrs.size(), dptrs.size());
+  for (size_t i = 0; i < sptrs.size(); ++i) {
+    EXPECT_EQ(sptrs[i], dptrs[i]) << "rank: " << rank << ", i: " << i;
+  }
+
+  EXPECT_EQ(svals.size(), dvals.size());
+  for (size_t i = 0; i < svals.size(); ++i) {  // cut values only need to agree within 2e-2
+    EXPECT_NEAR(svals[i], dvals[i], 2e-2f) << "rank: " << rank << ", i: " << i;
+  }
+
+  EXPECT_EQ(smins.size(), dmins.size());
+  for (size_t i = 0; i < smins.size(); ++i) {
+    EXPECT_FLOAT_EQ(smins[i], dmins[i]) << "rank: " << rank << ", i: " << i;
+  }
+}
+}  // anonymous namespace
+
+TEST_F(MGPUQuantileTest, ColumnSplitBasic) {
+  DoTest(TestColumnSplitBasic);  // presumably runs the function on every worker -- TODO confirm
+}
+
+namespace {
+void TestSameOnAllWorkers() {
  auto world = collective::GetWorldSize();
-  CHECK_EQ(world, n_gpus);
  constexpr size_t kRows = 1000, kCols = 100;
  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
                                 MetaInfo const &info) {
    auto const rank = collective::GetRank();
-    auto const device = rank;
+    auto const device = GPUIDX;
    HostDeviceVector<FeatureType> ft({}, device);
    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
    HostDeviceVector<float> storage({}, device);
@@ -455,7 +506,7 @@ void TestSameOnAllWorkers(std::int32_t n_gpus) {
    AdapterDeviceSketch(adapter.Value(), n_bins, info,
                        std::numeric_limits<float>::quiet_NaN(),
                        &sketch_distributed);
-    sketch_distributed.AllReduce();
+    sketch_distributed.AllReduce(false);
    sketch_distributed.Unique();
    TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);

@@ -497,12 +548,8 @@ void TestSameOnAllWorkers(std::int32_t n_gpus) {
 }
 }  // anonymous namespace

-TEST(GPUQuantile, MGPUSameOnAllWorkers) {
-  auto const n_gpus = AllVisibleGPUs();
-  if (n_gpus <= 1) {
-    GTEST_SKIP() << "Skipping MGPUSameOnAllWorkers test with # GPUs = " << n_gpus;
-  }
-  RunWithInMemoryCommunicator(n_gpus, TestSameOnAllWorkers, n_gpus);
+TEST_F(MGPUQuantileTest, SameOnAllWorkers) {
+  DoTest(TestSameOnAllWorkers);
 }

 TEST(GPUQuantile, Push) {
--- a/tests/cpp/common/test_ranking_utils.cu
+++ b/tests/cpp/common/test_ranking_utils.cu
@@ -30,8 +30,7 @@

 namespace xgboost::ltr {
 void TestCalcQueriesInvIDCG() {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
  std::size_t n_groups = 5, n_samples_per_group = 32;

  dh::device_vector<float> scores(n_samples_per_group * n_groups);
@@ -49,7 +48,7 @@ void TestCalcQueriesInvIDCG() {
  p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}});

  cuda_impl::CalcQueriesInvIDCG(&ctx, linalg::MakeTensorView(&ctx, d_scores, d_scores.size()),
-                                dh::ToSpan(group_ptr), inv_IDCG.View(ctx.gpu_id), p);
+                                dh::ToSpan(group_ptr), inv_IDCG.View(ctx.Device()), p);
  for (std::size_t i = 0; i < n_groups; ++i) {
    double inv_idcg = inv_IDCG(i);
    ASSERT_NEAR(inv_idcg, 0.00551782, kRtEps);
@@ -92,20 +91,17 @@ void TestRankingCache(Context const* ctx) {
 }  // namespace

 TEST(RankingCache, InitFromGPU) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
  TestRankingCache(&ctx);
 }

 TEST(NDCGCache, InitFromGPU) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
  TestNDCGCache(&ctx);
 }

 TEST(MAPCache, InitFromGPU) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
  TestMAPCache(&ctx);
 }
 }  // namespace xgboost::ltr
--- a/tests/cpp/common/test_ref_resource_view.cc
+++ b/tests/cpp/common/test_ref_resource_view.cc
@@ -0,0 +1,108 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+
+#include <cstddef>  // for size_t
+#include <memory>   // for make_shared, make_unique
+#include <numeric>  // for iota
+#include <vector>   // for vector
+
+#include "../../../src/common/ref_resource_view.h"
+#include "dmlc/filesystem.h"  // for TemporaryDirectory
+
+namespace xgboost::common {
+TEST(RefResourceView, Basic) {  // construction, reference counting and element access
+  std::size_t n_bytes = 1024;
+  auto mem = std::make_shared<MallocResource>(n_bytes);
+  {
+    RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem};
+
+    RefResourceView kview{reinterpret_cast<float const*>(mem->Data()), mem->Size() / sizeof(float),
+                          mem};
+    ASSERT_EQ(mem.use_count(), 3);  // `mem` itself plus the two views
+    ASSERT_EQ(view.size(), n_bytes / sizeof(float));  // element count is in floats, not ints
+    ASSERT_EQ(kview.size(), n_bytes / sizeof(float));
+  }
+  {
+    RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem,
+                         1.5f};
+    for (auto v : view) {  // the trailing constructor argument is the initial value
+      ASSERT_EQ(v, 1.5f);
+    }
+    std::iota(view.begin(), view.end(), 0.0f);
+    ASSERT_EQ(view.front(), 0.0f);
+    ASSERT_EQ(view.back(), static_cast<float>(view.size() - 1));
+
+    view.front() = 1.0f;  // front() and back() must return writable references
+    view.back() = 2.0f;
+    ASSERT_EQ(view.front(), 1.0f);
+    ASSERT_EQ(view.back(), 2.0f);
+  }
+  ASSERT_EQ(mem.use_count(), 1);  // all views released their reference
+}
+
+TEST(RefResourceView, IO) {  // round trip of a size_t vector through WriteVec / ReadVec
+  dmlc::TemporaryDirectory tmpdir;
+  auto path = tmpdir.path + "/testfile";
+  auto data = MakeFixedVecWithMalloc(123, std::size_t{1});
+
+  {
+    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
+    ASSERT_EQ(fo->Write(data.data(), data.size_bytes()), data.size_bytes());  // raw write
+  }
+  {  // open the same path again and write through WriteVec this time
+    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
+    ASSERT_EQ(WriteVec(fo.get(), data),  // payload bytes plus one size_type value
+              data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
+  }
+  {
+    auto fi = std::make_unique<PrivateMmapConstStream>(
+        path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
+    auto read = MakeFixedVecWithMalloc(123, std::size_t{1});
+    ASSERT_TRUE(ReadVec(fi.get(), &read));
+    for (auto v : read) {
+      ASSERT_EQ(v, 1ul);
+    }
+  }
+}
+
+TEST(RefResourceView, IOAligned) {  // float payload: write size includes alignment padding
+  dmlc::TemporaryDirectory tmpdir;
+  auto path = tmpdir.path + "/testfile";
+  auto data = MakeFixedVecWithMalloc(123, 1.0f);
+
+  {
+    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
+    // + sizeof(float) for alignment
+    ASSERT_EQ(WriteVec(fo.get(), data),
+              data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type) + sizeof(float));
+  }
+  {
+    auto fi = std::make_unique<PrivateMmapConstStream>(
+        path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
+    // wrong type, float vs. double
+    auto read = MakeFixedVecWithMalloc(123, 2.0);
+    ASSERT_FALSE(ReadVec(fi.get(), &read));
+  }
+  {
+    auto fi = std::make_unique<PrivateMmapConstStream>(
+        path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
+    auto read = MakeFixedVecWithMalloc(123, 2.0f);  // pre-filled with 2.0f, file holds 1.0f
+    ASSERT_TRUE(ReadVec(fi.get(), &read));
+    for (auto v : read) {
+      ASSERT_EQ(v, 1.0f);  // compare against the float value that was written
+    }
+  }
+  {
+    // Test std::vector
+    std::vector<float> data(123);
+    std::iota(data.begin(), data.end(), 0.0f);
+    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
+    // + sizeof(float) for alignment
+    ASSERT_EQ(WriteVec(fo.get(), data), data.size() * sizeof(float) +
+                                            sizeof(RefResourceView<std::size_t>::size_type) +
+                                            sizeof(float));
+  }
+}
+}  // namespace xgboost::common
--- a/tests/cpp/common/test_stats.cc
+++ b/tests/cpp/common/test_stats.cc
@@ -7,6 +7,7 @@

 #include "../../../src/common/stats.h"
 #include "../../../src/common/transform_iterator.h"  // common::MakeIndexTransformIter
+#include "../helpers.h"

 namespace xgboost {
 namespace common {
@@ -71,7 +72,7 @@ TEST(Stats, Median) {
    ASSERT_EQ(m, .5f);

 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-    ctx.gpu_id = 0;
+    ctx = ctx.MakeCUDA(0);
    ASSERT_FALSE(ctx.IsCPU());
    Median(&ctx, values, weights, &out);
    m = out(0);
@@ -80,7 +81,7 @@ TEST(Stats, Median) {
  }

  {
-    ctx.gpu_id = Context::kCpuId;
+    ctx = ctx.MakeCPU();
    // 4x2 matrix
    linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.gpu_id};
    HostDeviceVector<float> weights;
@@ -90,7 +91,7 @@ TEST(Stats, Median) {
    ASSERT_EQ(out(1), .5f);

 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-    ctx.gpu_id = 0;
+    ctx = ctx.MakeCUDA(0);
    Median(&ctx, values, weights, &out);
    ASSERT_EQ(out(0), .5f);
    ASSERT_EQ(out(1), .5f);
@@ -123,8 +124,7 @@ TEST(Stats, Mean) {

 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 TEST(Stats, GPUMean) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
  TestMean(&ctx);
 }
 #endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
--- a/tests/cpp/common/test_stats.cu
+++ b/tests/cpp/common/test_stats.cu
@@ -3,9 +3,9 @@
 */
 #include <gtest/gtest.h>

-#include <cstddef>                            // std::size_t
-#include <utility>                            // std::pair
-#include <vector>                             // std::vector
+#include <cstddef>  // std::size_t
+#include <utility>  // std::pair
+#include <vector>   // std::vector

 #if defined(XGBOOST_USE_CUDA)
 #include "../../../src/common/linalg_op.cuh"  // ElementWiseTransformDevice
@@ -14,10 +14,11 @@
 #include "../../../src/common/linalg_op.hip.h"  // ElementWiseTransformDevice
 #include "../../../src/common/stats.hip.h"
 #endif
-#include "xgboost/base.h"                     // XGBOOST_DEVICE
-#include "xgboost/context.h"                  // Context
-#include "xgboost/host_device_vector.h"       // HostDeviceVector
-#include "xgboost/linalg.h"                   // Tensor
+#include "../helpers.h"
+#include "xgboost/base.h"                // XGBOOST_DEVICE
+#include "xgboost/context.h"             // Context
+#include "xgboost/host_device_vector.h"  // HostDeviceVector
+#include "xgboost/linalg.h"              // Tensor

 namespace xgboost {
 namespace common {
@@ -38,7 +39,7 @@ class StatsGPU : public ::testing::Test {
  }

 public:
-  void SetUp() override { ctx_.gpu_id = 0; }
+  void SetUp() override { ctx_ = MakeCUDACtx(0); }  // fixture context targets GPU 0 for each test

  void WeightedMulti() {
    // data for one segment
@@ -51,7 +52,7 @@ class StatsGPU : public ::testing::Test {
    data.insert(data.cend(), seg.begin(), seg.end());
    data.insert(data.cend(), seg.begin(), seg.end());
    linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0};
-    auto d_arr = arr.View(0);
+    auto d_arr = arr.View(DeviceOrd::CUDA(0));

    auto key_it = dh::MakeTransformIterator<std::size_t>(
        thrust::make_counting_iterator(0ul),
@@ -75,8 +76,8 @@ class StatsGPU : public ::testing::Test {
  }

  void Weighted() {
-    auto d_arr = arr_.View(0);
-    auto d_key = indptr_.View(0);
+    auto d_arr = arr_.View(DeviceOrd::CUDA(0));
+    auto d_key = indptr_.View(DeviceOrd::CUDA(0));

    auto key_it = dh::MakeTransformIterator<std::size_t>(
        thrust::make_counting_iterator(0ul),
@@ -85,7 +86,7 @@ class StatsGPU : public ::testing::Test {
        dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                         [=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); });
    linalg::Tensor<float, 1> weights{{10}, 0};
-    linalg::ElementWiseTransformDevice(weights.View(0),
+    linalg::ElementWiseTransformDevice(weights.View(DeviceOrd::CUDA(0)),
                                       [=] XGBOOST_DEVICE(std::size_t, float) { return 1.0; });
    auto w_it = weights.Data()->ConstDevicePointer();
    for (auto const& pair : TestSet{{0.0f, 1.0f}, {0.5f, 3.0f}, {1.0f, 5.0f}}) {
@@ -106,7 +107,7 @@ class StatsGPU : public ::testing::Test {
    data.insert(data.cend(), seg.begin(), seg.end());
    data.insert(data.cend(), seg.begin(), seg.end());
    linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0};
-    auto d_arr = arr.View(0);
+    auto d_arr = arr.View(DeviceOrd::CUDA(0));

    auto key_it = dh::MakeTransformIterator<std::size_t>(
        thrust::make_counting_iterator(0ul),
@@ -129,8 +130,8 @@ class StatsGPU : public ::testing::Test {
  }

  void NonWeighted() {
-    auto d_arr = arr_.View(0);
-    auto d_key = indptr_.View(0);
+    auto d_arr = arr_.View(DeviceOrd::CUDA(0));
+    auto d_key = indptr_.View(DeviceOrd::CUDA(0));

    auto key_it = dh::MakeTransformIterator<std::size_t>(
        thrust::make_counting_iterator(0ul), [=] __device__(std::size_t i) { return d_key(i); });