[EM] Support mmap backed ellpack. (#10602)

- Support resource view in ellpack. - Define the CUDA version of MMAP resource. - Define the CUDA version of malloc resource. - Refactor CUDA runtime API wrappers, and add memory-access-related wrappers. - Gather Windows macros into a single header.
2024-07-18 08:20:21 +08:00
parent e9fbce9791
commit 292bb677e5
59 changed files with 889 additions and 646 deletions
--- a/tests/cpp/collective/test_worker.h
+++ b/tests/cpp/collective/test_worker.h
@@ -15,7 +15,7 @@
 #include "../../../src/collective/comm.h"
 #include "../../../src/collective/communicator-inl.h"  // for Init, Finalize
 #include "../../../src/collective/tracker.h"           // for GetHostAddress
-#include "../../../src/common/common.h"                // for AllVisibleGPUs
+#include "../../../src/common/cuda_rt_utils.h"         // for AllVisibleGPUs
 #include "../helpers.h"                                // for FileExists

 #if defined(XGBOOST_USE_FEDERATED)
--- a/tests/cpp/common/test_host_device_vector.cu
+++ b/tests/cpp/common/test_host_device_vector.cu
@@ -4,10 +4,11 @@
 #include <gtest/gtest.h>
 #include <thrust/equal.h>
 #include <thrust/iterator/counting_iterator.h>
-
-#include "../../../src/common/device_helpers.cuh"
 #include <xgboost/host_device_vector.h>

+#include "../../../src/common/cuda_rt_utils.h"  // for SetDevice
+#include "../../../src/common/device_helpers.cuh"
+
 namespace xgboost::common {
 namespace {
 void SetDeviceForTest(DeviceOrd device) {
--- a/tests/cpp/common/test_ref_resource_view.cc
+++ b/tests/cpp/common/test_ref_resource_view.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2023, XGBoost Contributors
+ * Copyright 2023-2024, XGBoost Contributors
 */
 #include <gtest/gtest.h>

@@ -16,17 +16,16 @@ TEST(RefResourceView, Basic) {
  std::size_t n_bytes = 1024;
  auto mem = std::make_shared<MallocResource>(n_bytes);
  {
-    RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem};
+    RefResourceView view{static_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem};

-    RefResourceView kview{reinterpret_cast<float const*>(mem->Data()), mem->Size() / sizeof(float),
-                          mem};
+    RefResourceView kview{static_cast<float const*>(mem->Data()), mem->Size() / sizeof(float), mem};
    ASSERT_EQ(mem.use_count(), 3);
    ASSERT_EQ(view.size(), n_bytes / sizeof(1024));
    ASSERT_EQ(kview.size(), n_bytes / sizeof(1024));
  }
  {
-    RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem,
-                         1.5f};
+    RefResourceView view{static_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem};
+    std::fill_n(static_cast<float*>(mem->Data()), mem->Size() / sizeof(float), 1.5f);
    for (auto v : view) {
      ASSERT_EQ(v, 1.5f);
    }
--- a/tests/cpp/data/test_ellpack_page.cu
+++ b/tests/cpp/data/test_ellpack_page.cu
@@ -27,15 +27,15 @@ TEST(EllpackPage, EmptyDMatrix) {
  auto impl = page.Impl();
  ASSERT_EQ(impl->row_stride, 0);
  ASSERT_EQ(impl->Cuts().TotalBins(), 0);
-  ASSERT_EQ(impl->gidx_buffer.Size(), 4);
+  ASSERT_EQ(impl->gidx_buffer.size(), 4);
 }

 TEST(EllpackPage, BuildGidxDense) {
  int constexpr kNRows = 16, kNCols = 8;
-  auto page = BuildEllpackPage(kNRows, kNCols);
-
-  std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector());
-  common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), page->NumSymbols());
+  auto ctx = MakeCUDACtx(0);
+  auto page = BuildEllpackPage(&ctx, kNRows, kNCols);
+  std::vector<common::CompressedByteT> h_gidx_buffer;
+  auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer);

  ASSERT_EQ(page->row_stride, kNCols);

@@ -58,16 +58,17 @@ TEST(EllpackPage, BuildGidxDense) {
    1, 4, 7, 10, 14, 16, 19, 21,
  };
  for (size_t i = 0; i < kNRows * kNCols; ++i) {
-    ASSERT_EQ(solution[i], gidx[i]);
+    ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]);
  }
 }

 TEST(EllpackPage, BuildGidxSparse) {
  int constexpr kNRows = 16, kNCols = 8;
-  auto page = BuildEllpackPage(kNRows, kNCols, 0.9f);
+  auto ctx = MakeCUDACtx(0);
+  auto page = BuildEllpackPage(&ctx, kNRows, kNCols, 0.9f);

-  std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector());
-  common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
+  std::vector<common::CompressedByteT> h_gidx_buffer;
+  auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer);

  ASSERT_LE(page->row_stride, 3);

@@ -78,7 +79,7 @@ TEST(EllpackPage, BuildGidxSparse) {
    24,  7, 14, 16,  4, 24, 24, 24, 24, 24,  9, 24, 24,  1, 24, 24
  };
  for (size_t i = 0; i < kNRows * page->row_stride; ++i) {
-    ASSERT_EQ(solution[i], gidx[i]);
+    ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]);
  }
 }

@@ -94,7 +95,7 @@ TEST(EllpackPage, FromCategoricalBasic) {
  Context ctx{MakeCUDACtx(0)};
  auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()};
  auto ellpack = EllpackPage(&ctx, m.get(), p);
-  auto accessor = ellpack.Impl()->GetDeviceAccessor(FstCU());
+  auto accessor = ellpack.Impl()->GetDeviceAccessor(ctx.Device());
  ASSERT_EQ(kCats, accessor.NumBins());

  auto x_copy = x;
@@ -110,13 +111,11 @@ TEST(EllpackPage, FromCategoricalBasic) {
  ASSERT_EQ(h_cuts_ptr.size(), 2);
  ASSERT_EQ(h_cuts_values.size(), kCats);

-  std::vector<common::CompressedByteT> const &h_gidx_buffer =
-      ellpack.Impl()->gidx_buffer.HostVector();
-  auto h_gidx_iter = common::CompressedIterator<uint32_t>(
-      h_gidx_buffer.data(), accessor.NumSymbols());
+  std::vector<common::CompressedByteT> h_gidx_buffer;
+  auto h_accessor = ellpack.Impl()->GetHostAccessor(&ctx, &h_gidx_buffer);

  for (size_t i = 0; i < x.size(); ++i) {
-    auto bin = h_gidx_iter[i];
+    auto bin = h_accessor.gidx_iter[i];
    auto bin_value = h_cuts_values.at(bin);
    ASSERT_EQ(AsCat(x[i]), AsCat(bin_value));
  }
@@ -152,12 +151,12 @@ TEST(EllpackPage, Copy) {
  auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();

  // Create an empty result page.
-  EllpackPageImpl result(FstCU(), page->CutsShared(), page->is_dense, page->row_stride, kRows);
+  EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride, kRows);

  // Copy batch pages into the result page.
  size_t offset = 0;
  for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
-    size_t num_elements = result.Copy(FstCU(), batch.Impl(), offset);
+    size_t num_elements = result.Copy(&ctx, batch.Impl(), offset);
    offset += num_elements;
  }

@@ -171,11 +170,11 @@ TEST(EllpackPage, Copy) {
    EXPECT_EQ(impl->base_rowid, current_row);

    for (size_t i = 0; i < impl->Size(); i++) {
-      dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), current_row,
+      dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row,
                                         row_d.data().get()));
      thrust::copy(row_d.begin(), row_d.end(), row.begin());

-      dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(FstCU()), current_row,
+      dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), current_row,
                                         row_result_d.data().get()));
      thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());

@@ -200,7 +199,7 @@ TEST(EllpackPage, Compact) {
  auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();

  // Create an empty result page.
-  EllpackPageImpl result(ctx.Device(), page->CutsShared(), page->is_dense, page->row_stride,
+  EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride,
                         kCompactedRows);

  // Compact batch pages into the result page.
@@ -229,14 +228,13 @@ TEST(EllpackPage, Compact) {
        continue;
      }

-      dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()),
-                                         current_row, row_d.data().get()));
+      dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row,
+                                         row_d.data().get()));
      dh::safe_cuda(cudaDeviceSynchronize());
      thrust::copy(row_d.begin(), row_d.end(), row.begin());

-      dh::LaunchN(kCols,
-                  ReadRowFunction(result.GetDeviceAccessor(FstCU()), compacted_row,
-                                  row_result_d.data().get()));
+      dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), compacted_row,
+                                         row_result_d.data().get()));
      thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());

      EXPECT_EQ(row, row_result);
@@ -269,16 +267,13 @@ class EllpackPageTest : public testing::TestWithParam<float> {
      ASSERT_EQ(from_sparse_page->base_rowid, 0);
      ASSERT_EQ(from_sparse_page->base_rowid, from_ghist->base_rowid);
      ASSERT_EQ(from_sparse_page->n_rows, from_ghist->n_rows);
-      ASSERT_EQ(from_sparse_page->gidx_buffer.Size(), from_ghist->gidx_buffer.Size());
-      auto const& h_gidx_from_sparse = from_sparse_page->gidx_buffer.HostVector();
-      auto const& h_gidx_from_ghist = from_ghist->gidx_buffer.HostVector();
+      ASSERT_EQ(from_sparse_page->gidx_buffer.size(), from_ghist->gidx_buffer.size());
+      std::vector<common::CompressedByteT> h_gidx_from_sparse, h_gidx_from_ghist;
+      auto from_ghist_acc = from_ghist->GetHostAccessor(&gpu_ctx, &h_gidx_from_ghist);
+      auto from_sparse_acc = from_sparse_page->GetHostAccessor(&gpu_ctx, &h_gidx_from_sparse);
      ASSERT_EQ(from_sparse_page->NumSymbols(), from_ghist->NumSymbols());
-      common::CompressedIterator<uint32_t> from_ghist_it(h_gidx_from_ghist.data(),
-                                                         from_ghist->NumSymbols());
-      common::CompressedIterator<uint32_t> from_sparse_it(h_gidx_from_sparse.data(),
-                                                          from_sparse_page->NumSymbols());
      for (size_t i = 0; i < from_ghist->n_rows * from_ghist->row_stride; ++i) {
-        EXPECT_EQ(from_ghist_it[i], from_sparse_it[i]);
+        EXPECT_EQ(from_ghist_acc.gidx_iter[i], from_sparse_acc.gidx_iter[i]);
      }
    }
  }
--- a/tests/cpp/data/test_ellpack_page_raw_format.cu
+++ b/tests/cpp/data/test_ellpack_page_raw_format.cu
@@ -14,9 +14,8 @@
 namespace xgboost::data {
 namespace {
 template <typename FormatStreamPolicy>
-void TestEllpackPageRawFormat() {
-  FormatStreamPolicy policy;
-
+void TestEllpackPageRawFormat(FormatStreamPolicy *p_policy) {
+  auto &policy = *p_policy;
  Context ctx{MakeCUDACtx(0)};
  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};

@@ -55,16 +54,30 @@ void TestEllpackPageRawFormat() {
    ASSERT_EQ(loaded->Cuts().Values(), orig->Cuts().Values());
    ASSERT_EQ(loaded->base_rowid, orig->base_rowid);
    ASSERT_EQ(loaded->row_stride, orig->row_stride);
-    ASSERT_EQ(loaded->gidx_buffer.HostVector(), orig->gidx_buffer.HostVector());
+    std::vector<common::CompressedByteT> h_loaded, h_orig;
+    [[maybe_unused]] auto h_loaded_acc = loaded->GetHostAccessor(&ctx, &h_loaded);
+    [[maybe_unused]] auto h_orig_acc = orig->GetHostAccessor(&ctx, &h_orig);
+    ASSERT_EQ(h_loaded, h_orig);
  }
 }
 }  // anonymous namespace

 TEST(EllpackPageRawFormat, DiskIO) {
-  TestEllpackPageRawFormat<DefaultFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>();
+  EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy> policy{false};
+  TestEllpackPageRawFormat(&policy);
+}
+
+TEST(EllpackPageRawFormat, DiskIOHmm) {
+  if (common::SupportsPageableMem()) {
+    EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy> policy{true};
+    TestEllpackPageRawFormat(&policy);
+  } else {
+    GTEST_SKIP_("HMM is not supported.");
+  }
 }

 TEST(EllpackPageRawFormat, HostIO) {
-  TestEllpackPageRawFormat<EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>();
+  EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy> policy;
+  TestEllpackPageRawFormat(&policy);
 }
 }  // namespace xgboost::data
--- a/tests/cpp/data/test_iterative_dmatrix.cu
+++ b/tests/cpp/data/test_iterative_dmatrix.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2023, XGBoost contributors
+ * Copyright 2020-2024, XGBoost contributors
 */
 #include <gtest/gtest.h>

@@ -21,10 +21,10 @@ void TestEquivalent(float sparsity) {
  std::size_t offset = 0;
  auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl();
  std::unique_ptr<EllpackPageImpl> page_concatenated{new EllpackPageImpl(
-      ctx.Device(), first->CutsShared(), first->is_dense, first->row_stride, 1000 * 100)};
+      &ctx, first->CutsShared(), first->is_dense, first->row_stride, 1000 * 100)};
  for (auto& batch : m.GetBatches<EllpackPage>(&ctx, {})) {
    auto page = batch.Impl();
-    size_t num_elements = page_concatenated->Copy(ctx.Device(), page, offset);
+    size_t num_elements = page_concatenated->Copy(&ctx, page, offset);
    offset += num_elements;
  }
  auto from_iter = page_concatenated->GetDeviceAccessor(ctx.Device());
@@ -66,18 +66,15 @@ void TestEquivalent(float sparsity) {
      ASSERT_EQ(cut_ptrs_iter[i], cut_ptrs_data[i]);
    }

-    auto const& buffer_from_iter = page_concatenated->gidx_buffer;
-    auto const& buffer_from_data = ellpack.Impl()->gidx_buffer;
-    ASSERT_NE(buffer_from_data.Size(), 0);
-
-    common::CompressedIterator<uint32_t> data_buf{
-        buffer_from_data.ConstHostPointer(), from_data.NumSymbols()};
-    common::CompressedIterator<uint32_t> data_iter{
-        buffer_from_iter.ConstHostPointer(), from_iter.NumSymbols()};
+    std::vector<common::CompressedByteT> buffer_from_iter, buffer_from_data;
+    auto data_iter = page_concatenated->GetHostAccessor(&ctx, &buffer_from_iter);
+    auto data_buf = ellpack.Impl()->GetHostAccessor(&ctx, &buffer_from_data);
+    ASSERT_NE(buffer_from_data.size(), 0);
+    ASSERT_NE(buffer_from_iter.size(), 0);
    CHECK_EQ(from_data.NumSymbols(), from_iter.NumSymbols());
    CHECK_EQ(from_data.n_rows * from_data.row_stride, from_data.n_rows * from_iter.row_stride);
    for (size_t i = 0; i < from_data.n_rows * from_data.row_stride; ++i) {
-      CHECK_EQ(data_buf[i], data_iter[i]);
+      CHECK_EQ(data_buf.gidx_iter[i], data_iter.gidx_iter[i]);
    }
  }
 }
@@ -97,8 +94,8 @@ TEST(IterativeDeviceDMatrix, RowMajor) {
  for (auto& ellpack : m.GetBatches<EllpackPage>(&ctx, {})) {
    n_batches ++;
    auto impl = ellpack.Impl();
-    common::CompressedIterator<uint32_t> iterator(
-        impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
+    std::vector<common::CompressedByteT> h_gidx;
+    auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx);
    auto cols = CudaArrayIterForTest::Cols();
    auto rows = CudaArrayIterForTest::Rows();

@@ -111,7 +108,7 @@ TEST(IterativeDeviceDMatrix, RowMajor) {

    for(auto i = 0ull; i < rows * cols; i++) {
      int column_idx = i % cols;
-      EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), iterator[i]);
+      EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), h_accessor.gidx_iter[i]);
    }
    EXPECT_EQ(m.Info().num_col_, cols);
    EXPECT_EQ(m.Info().num_row_, rows);
@@ -147,12 +144,12 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) {
      *m.GetBatches<EllpackPage>(&ctx, BatchParam{256, tree::TrainParam::DftSparseThreshold()})
           .begin();
  auto impl = ellpack.Impl();
-  common::CompressedIterator<uint32_t> iterator(
-      impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
-  EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(ctx.Device()).NullValue());
-  EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(ctx.Device()).NullValue());
+  std::vector<common::CompressedByteT> h_gidx;
+  auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx);
+  EXPECT_EQ(h_accessor.gidx_iter[1], impl->GetDeviceAccessor(ctx.Device()).NullValue());
+  EXPECT_EQ(h_accessor.gidx_iter[5], impl->GetDeviceAccessor(ctx.Device()).NullValue());
  // null values get placed after valid values in a row
-  EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(ctx.Device()).NullValue());
+  EXPECT_EQ(h_accessor.gidx_iter[7], impl->GetDeviceAccessor(ctx.Device()).NullValue());
  EXPECT_EQ(m.Info().num_col_, cols);
  EXPECT_EQ(m.Info().num_row_, rows);
  EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3);
--- a/tests/cpp/data/test_sparse_page_dmatrix.cu
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cu
@@ -154,13 +154,18 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
  for (auto it = begin; it != end; ++it) {
    iterators.push_back(it.Page());
    gidx_buffers.emplace_back();
-    gidx_buffers.back().Resize((*it).Impl()->gidx_buffer.Size());
-    gidx_buffers.back().Copy((*it).Impl()->gidx_buffer);
+    gidx_buffers.back().SetDevice(ctx.Device());
+    gidx_buffers.back().Resize((*it).Impl()->gidx_buffer.size());
+    auto d_dst = gidx_buffers.back().DevicePointer();
+    auto const& d_src = (*it).Impl()->gidx_buffer;
+    dh::safe_cuda(cudaMemcpyAsync(d_dst, d_src.data(), d_src.size_bytes(), cudaMemcpyDefault));
  }
  ASSERT_GE(iterators.size(), 2);

  for (size_t i = 0; i < iterators.size(); ++i) {
-    ASSERT_EQ((*iterators[i]).Impl()->gidx_buffer.HostVector(), gidx_buffers.at(i).HostVector());
+    std::vector<common::CompressedByteT> h_buf;
+    [[maybe_unused]] auto h_acc = (*iterators[i]).Impl()->GetHostAccessor(&ctx, &h_buf);
+    ASSERT_EQ(h_buf, gidx_buffers.at(i).HostVector());
    ASSERT_EQ(iterators[i].use_count(), 1);
  }

@@ -210,11 +215,11 @@ class TestEllpackPageExt : public ::testing::TestWithParam<std::tuple<bool, bool
    size_t offset = 0;
    for (auto& batch : p_ext_fmat->GetBatches<EllpackPage>(&ctx, param)) {
      if (!impl_ext) {
-        impl_ext = std::make_unique<EllpackPageImpl>(
-            batch.Impl()->gidx_buffer.Device(), batch.Impl()->CutsShared(), batch.Impl()->is_dense,
-            batch.Impl()->row_stride, kRows);
+        impl_ext = std::make_unique<EllpackPageImpl>(&ctx, batch.Impl()->CutsShared(),
+                                                     batch.Impl()->is_dense,
+                                                     batch.Impl()->row_stride, kRows);
      }
-      auto n_elems = impl_ext->Copy(ctx.Device(), batch.Impl(), offset);
+      auto n_elems = impl_ext->Copy(&ctx, batch.Impl(), offset);
      offset += n_elems;
    }
    ASSERT_EQ(impl_ext->base_rowid, 0);
@@ -223,8 +228,10 @@ class TestEllpackPageExt : public ::testing::TestWithParam<std::tuple<bool, bool
    ASSERT_EQ(impl_ext->row_stride, 2);
    ASSERT_EQ(impl_ext->Cuts().TotalBins(), 4);

-    std::vector<common::CompressedByteT> buffer(impl->gidx_buffer.HostVector());
-    std::vector<common::CompressedByteT> buffer_ext(impl_ext->gidx_buffer.HostVector());
+    std::vector<common::CompressedByteT> buffer;
+    [[maybe_unused]] auto h_acc = impl->GetHostAccessor(&ctx, &buffer);
+    std::vector<common::CompressedByteT> buffer_ext;
+    [[maybe_unused]] auto h_ext_acc = impl_ext->GetHostAccessor(&ctx, &buffer_ext);
    ASSERT_EQ(buffer, buffer_ext);
  }
 };
--- a/tests/cpp/filesystem.h
+++ b/tests/cpp/filesystem.h
@@ -1,13 +1,10 @@
-/*!
- * Copyright (c) 2022 by XGBoost Contributors
+/**
+ * Copyright 2022-2024, XGBoost Contributors
 */
 #ifndef XGBOOST_TESTS_CPP_FILESYSTEM_H
 #define XGBOOST_TESTS_CPP_FILESYSTEM_H

-// A macro used inside `windows.h` to avoid conflicts with `winsock2.h`
-#ifndef WIN32_LEAN_AND_MEAN
-#define WIN32_LEAN_AND_MEAN
-#endif  // WIN32_LEAN_AND_MEAN
+#include <xgboost/windefs.h>

 #include "dmlc/filesystem.h"

--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -21,14 +21,11 @@

 #if defined(__CUDACC__)
 #include "../../src/collective/communicator-inl.h"  // for GetRank
-#include "../../src/common/common.h"                // for AllVisibleGPUs
+#include "../../src/common/cuda_rt_utils.h"         // for AllVisibleGPUs
 #endif  // defined(__CUDACC__)

 #include "filesystem.h"  // dmlc::TemporaryDirectory
 #include "xgboost/linalg.h"
-#if !defined(_OPENMP)
-#include <thread>
-#endif

 #if defined(__CUDACC__)
 #define DeclareUnifiedTest(name) GPU ## name
--- a/tests/cpp/histogram_helpers.h
+++ b/tests/cpp/histogram_helpers.h
@@ -23,7 +23,7 @@ class HistogramCutsWrapper : public common::HistogramCuts {
 };
 }  // namespace detail

-inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(int n_rows, int n_cols,
+inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(Context const* ctx, int n_rows, int n_cols,
                                                         bst_float sparsity = 0) {
  auto dmat = RandomDataGenerator(n_rows, n_cols, sparsity).Seed(3).GenerateDMatrix();
  const SparsePage& batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
@@ -48,7 +48,7 @@ inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(int n_rows, int n_cols,
  }

  auto page = std::unique_ptr<EllpackPageImpl>(
-      new EllpackPageImpl(DeviceOrd::CUDA(0), cmat, batch, dmat->IsDense(), row_stride, {}));
+      new EllpackPageImpl(ctx, cmat, batch, dmat->IsDense(), row_stride, {}));

  return page;
 }
--- a/tests/cpp/objective/test_aft_obj.cc
+++ b/tests/cpp/objective/test_aft_obj.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2023, XGBoost Contributors 
+ * Copyright 2020-2024, XGBoost Contributors
 */
 #include <gtest/gtest.h>
 #include <memory>
@@ -10,7 +10,6 @@
 #include "xgboost/objective.h"
 #include "xgboost/logging.h"
 #include "../helpers.h"
-#include "../../../src/common/survival_util.h"

 namespace xgboost::common {
 TEST(Objective, DeclareUnifiedTest(AFTObjConfiguration)) {
--- a/tests/cpp/plugin/federated/test_federated_coll.cu
+++ b/tests/cpp/plugin/federated/test_federated_coll.cu
@@ -6,7 +6,7 @@
 #include <xgboost/collective/result.h>  // for Result

 #include "../../../../src/collective/allreduce.h"
-#include "../../../../src/common/common.h"            // for AllVisibleGPUs
+#include "../../../../src/common/cuda_rt_utils.h"     // for AllVisibleGPUs
 #include "../../../../src/common/device_helpers.cuh"  // for device_vector
 #include "../../../../src/common/type.h"              // for EraseType
 #include "../../collective/test_worker.h"             // for SocketTest
--- a/tests/cpp/plugin/federated/test_federated_comm_group.cc
+++ b/tests/cpp/plugin/federated/test_federated_comm_group.cc
@@ -1,11 +1,11 @@
 /**
- * Copyright 2023, XGBoost Contributors
+ * Copyright 2023-2024, XGBoost Contributors
 */
 #include <gtest/gtest.h>
 #include <xgboost/json.h>  // for Json

 #include "../../../../src/collective/comm_group.h"
-#include "../../helpers.h"
+#include "../../../../src/common/cuda_rt_utils.h"  // for AllVisibleGPUs
 #include "test_worker.h"

 namespace xgboost::collective {
--- a/tests/cpp/plugin/federated/test_federated_comm_group.cu
+++ b/tests/cpp/plugin/federated/test_federated_comm_group.cu
@@ -1,10 +1,11 @@
 /**
- * Copyright 2023, XGBoost Contributors
+ * Copyright 2023-2024, XGBoost Contributors
 */
 #include <gtest/gtest.h>
 #include <xgboost/json.h>  // for Json

 #include "../../../../src/collective/comm_group.h"
+#include "../../../../src/common/cuda_rt_utils.h"  // for AllVisibleGPUs
 #include "../../helpers.h"
 #include "test_worker.h"

--- a/tests/cpp/test_context.cu
+++ b/tests/cpp/test_context.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2023, XGBoost Contributors
+ * Copyright 2023-2024, XGBoost Contributors
 */
 #include <gtest/gtest.h>
 #include <xgboost/base.h>  // for Args
@@ -8,7 +8,7 @@

 #include <string>  // for string, to_string

-#include "../../src/common/common.h"  // for AllVisibleGPUs
+#include "../../src/common/cuda_rt_utils.h"  // for AllVisibleGPUs

 namespace xgboost {
 namespace {
--- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
+++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2023, XGBoost Contributors
+ * Copyright 2020-2024, XGBoost Contributors
 */
 #include <gtest/gtest.h>

@@ -102,19 +102,17 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
  EXPECT_EQ(sample.gpair.data(), gpair.DevicePointer());
  EXPECT_EQ(sampled_page->n_rows, kRows);

-  std::vector<common::CompressedByteT> buffer(sampled_page->gidx_buffer.HostVector());
-  common::CompressedIterator<common::CompressedByteT>
-      ci(buffer.data(), sampled_page->NumSymbols());
+  std::vector<common::CompressedByteT> h_gidx_buffer;
+  auto h_accessor = sampled_page->GetHostAccessor(&ctx, &h_gidx_buffer);

-  size_t offset = 0;
+  std::size_t offset = 0;
  for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
    auto page = batch.Impl();
-    std::vector<common::CompressedByteT> page_buffer(page->gidx_buffer.HostVector());
-    common::CompressedIterator<common::CompressedByteT>
-        page_ci(page_buffer.data(), page->NumSymbols());
+    std::vector<common::CompressedByteT> h_page_gidx_buffer;
+    auto page_accessor = page->GetHostAccessor(&ctx, &h_page_gidx_buffer);
    size_t num_elements = page->n_rows * page->row_stride;
    for (size_t i = 0; i < num_elements; i++) {
-      EXPECT_EQ(ci[i + offset], page_ci[i]);
+      EXPECT_EQ(h_accessor.gidx_iter[i + offset], page_accessor.gidx_iter[i]);
    }
    offset += num_elements;
  }
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -328,8 +328,7 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParam<std::tuple<f
      for (auto const& page : p_fmat->GetBatches<SparsePage>()) {
        concat.Push(page);
      }
-      EllpackPageImpl page{
-          ctx.Device(), cuts, concat, p_fmat->IsDense(), p_fmat->Info().num_col_, {}};
+      EllpackPageImpl page{&ctx, cuts, concat, p_fmat->IsDense(), p_fmat->Info().num_col_, {}};
      auto ridx = partitioner.GetRows(0);
      auto d_histogram = dh::ToSpan(single_hist);
      DeviceHistogramBuilder builder;
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -81,6 +81,7 @@ std::vector<GradientPairPrecise> GetHostHistGpair() {
 template <typename GradientSumT>
 void TestBuildHist(bool use_shared_memory_histograms) {
  int const kNRows = 16, kNCols = 8;
+  Context ctx{MakeCUDACtx(0)};

  TrainParam param;
  Args args{
@@ -89,9 +90,8 @@ void TestBuildHist(bool use_shared_memory_histograms) {
  };
  param.Init(args);

-  auto page = BuildEllpackPage(kNRows, kNCols);
+  auto page = BuildEllpackPage(&ctx, kNRows, kNCols);
  BatchParam batch_param{};
-  Context ctx{MakeCUDACtx(0)};
  auto cs = std::make_shared<common::ColumnSampler>(0);
  GPUHistMakerDevice maker(&ctx, /*is_external_memory=*/false, {}, kNRows, param, cs, kNCols,
                           batch_param, MetaInfo());
@@ -105,7 +105,6 @@ void TestBuildHist(bool use_shared_memory_histograms) {
  }
  gpair.SetDevice(ctx.Device());

-  thrust::host_vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector());
  maker.row_partitioner = std::make_unique<RowPartitioner>(&ctx, kNRows, 0);

  maker.hist.Init(ctx.Device(), page->Cuts().TotalBins());
@@ -198,14 +197,12 @@ void TestHistogramIndexImpl() {
  auto grad = GenerateRandomGradients(kNRows);
  grad.SetDevice(DeviceOrd::CUDA(0));
  maker->Reset(&grad, hist_maker_dmat.get(), kNCols);
-  std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.HostVector());

  const auto &maker_ext = hist_maker_ext.maker;
  maker_ext->Reset(&grad, hist_maker_ext_dmat.get(), kNCols);
-  std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.HostVector());

  ASSERT_EQ(maker->page->Cuts().TotalBins(), maker_ext->page->Cuts().TotalBins());
-  ASSERT_EQ(maker->page->gidx_buffer.Size(), maker_ext->page->gidx_buffer.Size());
+  ASSERT_EQ(maker->page->gidx_buffer.size(), maker_ext->page->gidx_buffer.size());
 }

 TEST(GpuHist, TestHistogramIndex) {