temp merge, disable 1 line, SetValid

This commit is contained in:
Your Name
2023-10-12 16:16:44 -07:00
492 changed files with 15533 additions and 9376 deletions

View File

@@ -3,7 +3,7 @@
*/
#include "xgboost/c_api.h"
#include <algorithm> // for copy
#include <algorithm> // for copy, transform
#include <cinttypes> // for strtoimax
#include <cmath> // for nan
#include <cstring> // for strcmp
@@ -20,9 +20,12 @@
#include "../collective/communicator-inl.h" // for Allreduce, Broadcast, Finalize, GetProcessor...
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../common/charconv.h" // for from_chars, to_chars, NumericLimits, from_ch...
#include "../common/hist_util.h" // for HistogramCuts
#include "../common/io.h" // for FileExtension, LoadSequentialFile, MemoryBuf...
#include "../common/linalg_op.h" // for ElementWiseTransformHost
#include "../common/threading_utils.h" // for OmpGetNumThreads, ParallelFor
#include "../data/adapter.h" // for ArrayAdapter, DenseAdapter, RecordBatchesIte...
#include "../data/ellpack_page.h" // for EllpackPage
#include "../data/proxy_dmatrix.h" // for DMatrixProxy
#include "../data/simple_dmatrix.h" // for SimpleDMatrix
#include "c_api_error.h" // for xgboost_CHECK_C_ARG_PTR, API_END, API_BEGIN
@@ -66,6 +69,7 @@ XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch) {
}
}
static_assert(DMLC_CXX11_THREAD_LOCAL, "XGBoost depends on thread-local storage.");
using GlobalConfigAPIThreadLocalStore = dmlc::ThreadLocalStore<XGBAPIThreadLocalEntry>;
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
@@ -81,13 +85,6 @@ void XGBBuildInfoDevice(Json *p_info) {
} // namespace xgboost
#endif
namespace {
// Emit a deprecation warning pointing callers at the replacement API.
//
// @param old         Name of the deprecated C API function.
// @param since       Version in which it was deprecated.
// @param replacement Name of the function callers should use instead.
void DeprecatedFunc(StringView old, StringView since, StringView replacement) {
  // Trailing space after "since" so the version number is not glued to the
  // preceding word (previously emitted e.g. "deprecated since1.7.0").
  LOG(WARNING) << "`" << old << "` is deprecated since " << since << ", use `" << replacement
               << "` instead.";
}
} // anonymous namespace
XGB_DLL int XGBuildInfo(char const **out) {
API_BEGIN();
xgboost_CHECK_C_ARG_PTR(out);
@@ -328,7 +325,7 @@ XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatr
int nthread, int max_bin,
DMatrixHandle *out) {
API_BEGIN();
DeprecatedFunc(__func__, "1.7.0", "XGQuantileDMatrixCreateFromCallback");
LOG(WARNING) << error::DeprecatedFunc(__func__, "1.7.0", "XGQuantileDMatrixCreateFromCallback");
*out = new std::shared_ptr<xgboost::DMatrix>{
xgboost::DMatrix::Create(iter, proxy, nullptr, reset, next, missing, nthread, max_bin)};
API_END();
@@ -432,7 +429,7 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t *indptr, const unsigned *indic
const bst_float *data, size_t nindptr, size_t nelem,
size_t num_col, DMatrixHandle *out) {
API_BEGIN();
DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSR");
LOG(WARNING) << error::DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSR");
data::CSRAdapter adapter(indptr, indices, data, nindptr - 1, nelem, num_col);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, std::nan(""), 1));
API_END();
@@ -465,8 +462,11 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data,
auto config = Json::Load(StringView{c_json_config});
float missing = GetMissing(config);
auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", 0);
auto data_split_mode =
static_cast<DataSplitMode>(OptionalArg<Integer, int64_t>(config, "data_split_mode", 0));
xgboost_CHECK_C_ARG_PTR(out);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
*out = new std::shared_ptr<DMatrix>(
DMatrix::Create(&adapter, missing, n_threads, "", data_split_mode));
API_END();
}
@@ -493,7 +493,7 @@ XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t *col_ptr, const unsigned *indi
const bst_float *data, size_t nindptr, size_t, size_t num_row,
DMatrixHandle *out) {
API_BEGIN();
DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSC");
LOG(WARNING) << error::DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSC");
data::CSCAdapter adapter(col_ptr, indices, data, nindptr - 1, num_row);
xgboost_CHECK_C_ARG_PTR(out);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, std::nan(""), 1));
@@ -721,8 +721,7 @@ XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
API_END();
}
XGB_DLL int XGDMatrixNumRow(const DMatrixHandle handle,
xgboost::bst_ulong *out) {
XGB_DLL int XGDMatrixNumRow(DMatrixHandle handle, xgboost::bst_ulong *out) {
API_BEGIN();
CHECK_HANDLE();
auto p_m = CastDMatrixHandle(handle);
@@ -731,8 +730,7 @@ XGB_DLL int XGDMatrixNumRow(const DMatrixHandle handle,
API_END();
}
XGB_DLL int XGDMatrixNumCol(const DMatrixHandle handle,
xgboost::bst_ulong *out) {
XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle, xgboost::bst_ulong *out) {
API_BEGIN();
CHECK_HANDLE();
auto p_m = CastDMatrixHandle(handle);
@@ -784,6 +782,104 @@ XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config
API_END();
}
namespace {
// Copy the quantile cut values of `p_m` into a per-feature (indptr, data) pair
// for the C API.  For each numeric feature the feature's minimum value is
// prepended to its cut values; categorical features store only their cut
// values.  Only the first batch is read — the cuts appear to be shared by all
// pages of the matrix (hence the unconditional `break`) — TODO confirm.
template <typename Page>
void GetCutImpl(Context const *ctx, std::shared_ptr<DMatrix> p_m,
                std::vector<std::uint64_t> *p_indptr, std::vector<float> *p_data) {
  auto &indptr = *p_indptr;
  auto &data = *p_data;
  for (auto const &page : p_m->GetBatches<Page>(ctx, {})) {
    auto const &cut = page.Cuts();
    auto const &ptrs = cut.Ptrs();
    indptr.resize(ptrs.size());  // one entry per feature, plus the final sentinel
    auto const &vals = cut.Values();
    auto const &mins = cut.MinValues();
    bst_feature_t n_features = p_m->Info().num_col_;
    auto ft = p_m->Info().feature_types.ConstHostSpan();
    // Categorical features contribute no min-value entry to `data`.
    std::size_t n_categories = std::count_if(ft.cbegin(), ft.cend(),
                                             [](auto t) { return t == FeatureType::kCategorical; });
    data.resize(vals.size() + n_features - n_categories);  // |vals| + |mins|
    // i: write cursor into `data`; n_numeric: numeric features seen so far.
    std::size_t i{0}, n_numeric{0};
    for (bst_feature_t fidx = 0; fidx < n_features; ++fidx) {
      CHECK_LT(i, data.size());
      bool is_numeric = !common::IsCat(ft, fidx);
      if (is_numeric) {
        // Prepend the feature's minimum value ahead of its cut values.
        data[i] = mins[fidx];
        i++;
      }
      auto beg = ptrs[fidx];
      auto end = ptrs[fidx + 1];
      CHECK_LE(end, data.size());
      std::copy(vals.cbegin() + beg, vals.cbegin() + end, data.begin() + i);
      i += (end - beg);
      // shift by min values.
      indptr[fidx] = ptrs[fidx] + n_numeric;
      if (is_numeric) {
        n_numeric++;
      }
    }
    CHECK_EQ(n_numeric, n_features - n_categories);
    indptr.back() = data.size();
    CHECK_EQ(indptr.back(), vals.size() + mins.size() - n_categories);
    break;
  }
}
}  // namespace
// C API entry: export the quantile cuts of a DMatrix as two JSON
// array-interface strings — `out_indptr` (per-feature offsets, uint64) and
// `out_data` (cut values, float).  Both point into thread-local buffers owned
// by the DMatrix, so they stay valid only until the next such call on the
// same thread.
XGB_DLL int XGDMatrixGetQuantileCut(DMatrixHandle const handle, char const *config,
                                    char const **out_indptr, char const **out_data) {
  API_BEGIN();
  CHECK_HANDLE();
  auto p_m = CastDMatrixHandle(handle);
  xgboost_CHECK_C_ARG_PTR(config);
  xgboost_CHECK_C_ARG_PTR(out_indptr);
  xgboost_CHECK_C_ARG_PTR(out_data);
  // Parsed for validation; no keys are read here as far as visible — presumably
  // reserved for future options.
  auto jconfig = Json::Load(StringView{config});
  // Cuts only exist once computed: at construction for a QuantileDMatrix, or
  // during training for a regular DMatrix.
  if (!p_m->PageExists<GHistIndexMatrix>() && !p_m->PageExists<EllpackPage>()) {
    LOG(FATAL) << "The quantile cut hasn't been generated yet. Unless this is a `QuantileDMatrix`, "
                  "quantile cut is generated during training.";
  }
  // Get return buffer (thread-local, reused across calls).
  auto &data = p_m->GetThreadLocal().ret_vec_float;
  auto &indptr = p_m->GetThreadLocal().ret_vec_u64;
  if (p_m->PageExists<GHistIndexMatrix>()) {
    // CPU histogram page: make sure the context targets the CPU.
    auto ctx = p_m->Ctx()->IsCPU() ? *p_m->Ctx() : p_m->Ctx()->MakeCPU();
    GetCutImpl<GHistIndexMatrix>(&ctx, p_m, &indptr, &data);
  } else {
    // GPU (ellpack) page: make sure the context targets a CUDA device.
    auto ctx = p_m->Ctx()->IsCUDA() ? *p_m->Ctx() : p_m->Ctx()->MakeCUDA(0);
    GetCutImpl<EllpackPage>(&ctx, p_m, &indptr, &data);
  }
  // Create a CPU context for serializing the host-side result vectors.
  Context ctx;
  // Get return buffer for the serialized strings.
  auto &ret_vec_str = p_m->GetThreadLocal().ret_vec_str;
  ret_vec_str.clear();
  ret_vec_str.emplace_back(linalg::ArrayInterfaceStr(
      linalg::MakeTensorView(&ctx, common::Span{indptr.data(), indptr.size()}, indptr.size())));
  ret_vec_str.emplace_back(linalg::ArrayInterfaceStr(
      linalg::MakeTensorView(&ctx, common::Span{data.data(), data.size()}, data.size())));
  // Expose stable char pointers to the caller.
  auto &charp_vecs = p_m->GetThreadLocal().ret_vec_charp;
  charp_vecs.resize(ret_vec_str.size());
  std::transform(ret_vec_str.cbegin(), ret_vec_str.cend(), charp_vecs.begin(),
                 [](auto const &str) { return str.c_str(); });
  *out_indptr = charp_vecs[0];
  *out_data = charp_vecs[1];
  API_END();
}
// xgboost implementation
XGB_DLL int XGBoosterCreate(const DMatrixHandle dmats[],
xgboost::bst_ulong len,
@@ -876,28 +972,71 @@ XGB_DLL int XGBoosterUpdateOneIter(BoosterHandle handle,
API_END();
}
XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle,
DMatrixHandle dtrain,
bst_float *grad,
bst_float *hess,
xgboost::bst_ulong len) {
XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, DMatrixHandle dtrain, bst_float *grad,
bst_float *hess, xgboost::bst_ulong len) {
API_BEGIN();
CHECK_HANDLE();
HostDeviceVector<GradientPair> tmp_gpair;
auto* bst = static_cast<Learner*>(handle);
auto* dtr =
static_cast<std::shared_ptr<DMatrix>*>(dtrain);
tmp_gpair.Resize(len);
std::vector<GradientPair>& tmp_gpair_h = tmp_gpair.HostVector();
if (len > 0) {
xgboost_CHECK_C_ARG_PTR(grad);
xgboost_CHECK_C_ARG_PTR(hess);
}
for (xgboost::bst_ulong i = 0; i < len; ++i) {
tmp_gpair_h[i] = GradientPair(grad[i], hess[i]);
}
error::DeprecatedFunc(__func__, "2.1.0", "XGBoosterTrainOneIter");
auto *learner = static_cast<Learner *>(handle);
auto ctx = learner->Ctx()->MakeCPU();
bst->BoostOneIter(0, *dtr, &tmp_gpair);
auto t_grad = linalg::MakeTensorView(&ctx, common::Span{grad, len}, len);
auto t_hess = linalg::MakeTensorView(&ctx, common::Span{hess, len}, len);
auto s_grad = linalg::ArrayInterfaceStr(t_grad);
auto s_hess = linalg::ArrayInterfaceStr(t_hess);
return XGBoosterTrainOneIter(handle, dtrain, 0, s_grad.c_str(), s_hess.c_str());
API_END();
}
namespace xgboost {
// Copy user-supplied CUDA gradient/hessian arrays into a GradientPair matrix.
// Without CUDA/HIP support this body fails with AssertGPUSupport(); with GPU
// support this is only a declaration and the definition lives in the CUDA
// translation unit.
void CopyGradientFromCUDAArrays(Context const *, ArrayInterface<2, false> const &,
                                ArrayInterface<2, false> const &, linalg::Matrix<GradientPair> *)
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
{
  common::AssertGPUSupport();
}
#else
    ;  // NOLINT
#endif
}  // namespace xgboost
// Train one boosting iteration with a caller-supplied custom objective: the
// gradient and hessian arrive as JSON array-interface strings, which may
// describe either CPU or CUDA buffers (but both must be on the same device).
XGB_DLL int XGBoosterTrainOneIter(BoosterHandle handle, DMatrixHandle dtrain, int iter,
                                  char const *grad, char const *hess) {
  API_BEGIN();
  CHECK_HANDLE();
  xgboost_CHECK_C_ARG_PTR(grad);
  xgboost_CHECK_C_ARG_PTR(hess);
  auto p_fmat = CastDMatrixHandle(dtrain);
  ArrayInterface<2, false> i_grad{StringView{grad}};
  ArrayInterface<2, false> i_hess{StringView{hess}};
  StringView msg{"Mismatched shape between the gradient and hessian."};
  CHECK_EQ(i_grad.Shape(0), i_hess.Shape(0)) << msg;
  CHECK_EQ(i_grad.Shape(1), i_hess.Shape(1)) << msg;
  linalg::Matrix<GradientPair> gpair;
  auto grad_is_cuda = ArrayInterfaceHandler::IsCudaPtr(i_grad.data);
  auto hess_is_cuda = ArrayInterfaceHandler::IsCudaPtr(i_hess.data);
  // Rows of the gradient must match the training data.
  CHECK_EQ(i_grad.Shape(0), p_fmat->Info().num_row_)
      << "Mismatched size between the gradient and training data.";
  CHECK_EQ(grad_is_cuda, hess_is_cuda) << "gradient and hessian should be on the same device.";
  auto *learner = static_cast<Learner *>(handle);
  auto ctx = learner->Ctx();
  if (!grad_is_cuda) {
    // CPU path: interleave grad/hess into GradientPair elements in parallel,
    // dispatching on the runtime dtype of each input array.
    gpair.Reshape(i_grad.Shape(0), i_grad.Shape(1));
    auto const shape = gpair.Shape();  // NOTE(review): appears unused here — confirm
    auto h_gpair = gpair.HostView();
    DispatchDType(i_grad, DeviceOrd::CPU(), [&](auto &&t_grad) {
      DispatchDType(i_hess, DeviceOrd::CPU(), [&](auto &&t_hess) {
        common::ParallelFor(h_gpair.Size(), ctx->Threads(),
                            detail::CustomGradHessOp{t_grad, t_hess, h_gpair});
      });
    });
  } else {
    // CUDA path: definition lives in the CUDA translation unit.
    CopyGradientFromCUDAArrays(ctx, i_grad, i_hess, &gpair);
  }
  learner->BoostOneIter(iter, p_fmat, &gpair);
  API_END();
}
@@ -1025,7 +1164,6 @@ void InplacePredictImpl(std::shared_ptr<DMatrix> p_m, char const *c_json_config,
const float **out_result) {
xgboost_CHECK_C_ARG_PTR(c_json_config);
auto config = Json::Load(StringView{c_json_config});
CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";
HostDeviceVector<float> *p_predt{nullptr};
auto type = PredictionType(RequiredArg<Integer>(config, "type", __func__));
@@ -1044,6 +1182,7 @@ void InplacePredictImpl(std::shared_ptr<DMatrix> p_m, char const *c_json_config,
xgboost_CHECK_C_ARG_PTR(out_dim);
CalcPredictShape(strict_shape, type, n_samples, n_features, chunksize, learner->Groups(),
learner->BoostedRounds(), &shape, out_dim);
CHECK_GE(p_predt->Size(), n_samples);
xgboost_CHECK_C_ARG_PTR(out_result);
xgboost_CHECK_C_ARG_PTR(out_shape);
@@ -1126,12 +1265,12 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char* fname) {
return str;
};
if (common::FileExtension(fname) == "json") {
auto str = read_file();
Json in{Json::Load(StringView{str})};
auto buffer = read_file();
Json in{Json::Load(StringView{buffer.data(), buffer.size()})};
static_cast<Learner*>(handle)->LoadModel(in);
} else if (common::FileExtension(fname) == "ubj") {
auto str = read_file();
Json in = Json::Load(StringView{str}, std::ios::binary);
auto buffer = read_file();
Json in = Json::Load(StringView{buffer.data(), buffer.size()}, std::ios::binary);
static_cast<Learner *>(handle)->LoadModel(in);
} else {
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname, "r"));
@@ -1246,7 +1385,7 @@ XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, xgboost::bst_ulong *out_l
raw_str.resize(0);
common::MemoryBufferStream fo(&raw_str);
DeprecatedFunc(__func__, "1.6.0", "XGBoosterSaveModelToBuffer");
LOG(WARNING) << error::DeprecatedFunc(__func__, "1.6.0", "XGBoosterSaveModelToBuffer");
learner->Configure();
learner->SaveModel(&fo);

View File

@@ -1,8 +1,12 @@
/**
* Copyright 2019-2023 by XGBoost Contributors
*/
#include "../common/api_entry.h" // XGBAPIThreadLocalEntry
#include <thrust/transform.h> // for transform
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../common/cuda_context.cuh" // for CUDAContext
#include "../common/threading_utils.h"
#include "../data/array_interface.h" // for DispatchDType, ArrayInterface
#include "../data/device_adapter.cuh"
#include "../data/proxy_dmatrix.h"
#include "c_api_error.h"
@@ -13,7 +17,6 @@
#include "xgboost/learner.h"
namespace xgboost {
void XGBBuildInfoDevice(Json *p_info) {
auto &info = *p_info;
@@ -72,6 +75,27 @@ void XGBoostAPIGuard::RestoreGPUAttribute() {
hipSetDevice(device_id_);
#endif
}
// Interleave user-supplied CUDA gradient/hessian arrays into a single
// GradientPair matrix, allocated on the device that owns the input buffers.
void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> const &grad,
                                ArrayInterface<2, false> const &hess,
                                linalg::Matrix<GradientPair> *out_gpair) {
  auto grad_dev = dh::CudaGetPointerDevice(grad.data);
  auto hess_dev = dh::CudaGetPointerDevice(hess.data);
  CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device.";
  auto &gpair = *out_gpair;
  // Place the output on the same device as the inputs.
  gpair.SetDevice(grad_dev);
  gpair.Reshape(grad.Shape(0), grad.Shape(1));
  auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev));
  auto cuctx = ctx->CUDACtx();
  // Dispatch on the runtime dtypes of both arrays, then fuse them elementwise
  // on the device stream.
  DispatchDType(grad, DeviceOrd::CUDA(grad_dev), [&](auto &&t_grad) {
    DispatchDType(hess, DeviceOrd::CUDA(hess_dev), [&](auto &&t_hess) {
      CHECK_EQ(t_grad.Size(), t_hess.Size());
      thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), t_grad.Size(),
                         detail::CustomGradHessOp{t_grad, t_hess, d_gpair});
    });
  });
}
} // namespace xgboost
using namespace xgboost; // NOLINT
@@ -109,7 +133,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
API_END();
}
int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
char const *c_json_config, std::shared_ptr<DMatrix> p_m,
xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim,
const float **out_result) {
@@ -124,7 +148,6 @@ int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
proxy->SetCUDAArray(c_array_interface);
auto config = Json::Load(StringView{c_json_config});
CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";
auto *learner = static_cast<Learner *>(handle);
HostDeviceVector<float> *p_predt{nullptr};
@@ -135,7 +158,10 @@ int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
RequiredArg<Integer>(config, "iteration_begin", __func__),
RequiredArg<Integer>(config, "iteration_end", __func__));
CHECK(p_predt);
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
if (learner->Ctx()->IsCUDA()) {
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
}
p_predt->SetDevice(proxy->DeviceIdx());
auto &shape = learner->GetThreadLocal().prediction_shape;
size_t n_samples = p_m->Info().num_row_;
@@ -163,7 +189,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *c
if (m) {
p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
}
return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
out_result);
}
@@ -176,6 +202,6 @@ XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *c_js
p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
}
xgboost_CHECK_C_ARG_PTR(out_result);
return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
out_result);
}

View File

@@ -1,5 +1,5 @@
/*!
* Copyright (c) 2015-2022 by Contributors
/**
* Copyright 2015-2023, XGBoost Contributors
* \file c_api_error.h
* \brief Error handling for C API.
*/
@@ -35,8 +35,8 @@
} \
return 0; // NOLINT(*)
#define CHECK_HANDLE() if (handle == nullptr) \
LOG(FATAL) << "DMatrix/Booster has not been initialized or has already been disposed.";
#define CHECK_HANDLE() \
if (handle == nullptr) ::xgboost::detail::EmptyHandle();
/*!
* \brief Set the last error message needed by C API

View File

@@ -7,8 +7,10 @@
#include <algorithm>
#include <cstddef>
#include <functional>
#include <memory> // std::shared_ptr
#include <string>
#include <memory> // for shared_ptr
#include <string> // for string
#include <tuple> // for make_tuple
#include <utility> // for move
#include <vector>
#include "xgboost/c_api.h"
@@ -16,7 +18,7 @@
#include "xgboost/feature_map.h" // for FeatureMap
#include "xgboost/json.h"
#include "xgboost/learner.h"
#include "xgboost/linalg.h" // ArrayInterfaceHandler
#include "xgboost/linalg.h" // ArrayInterfaceHandler, MakeTensorView, ArrayInterfaceStr
#include "xgboost/logging.h"
#include "xgboost/string_view.h" // StringView
@@ -287,6 +289,19 @@ inline std::shared_ptr<DMatrix> CastDMatrixHandle(DMatrixHandle const handle) {
}
namespace detail {
// Abort with a uniform error message for C API calls made with a null handle.
inline void EmptyHandle() {
  LOG(FATAL) << "DMatrix/Booster has not been initialized or has already been disposed.";
}
// Fetch the learner context behind a C API booster handle, failing loudly on a
// null (uninitialized or disposed) handle.
inline xgboost::Context const *BoosterCtx(BoosterHandle handle) {
  if (handle == nullptr) {
    EmptyHandle();
  }
  auto *learner = static_cast<xgboost::Learner *>(handle);
  CHECK(learner);
  return learner->Ctx();
}
template <typename PtrT, typename I, typename T>
void MakeSparseFromPtr(PtrT const *p_indptr, I const *p_indices, T const *p_data,
std::size_t nindptr, std::string *indptr_str, std::string *indices_str,
@@ -334,6 +349,40 @@ void MakeSparseFromPtr(PtrT const *p_indptr, I const *p_indices, T const *p_data
Json::Dump(jindices, indices_str);
Json::Dump(jdata, data_str);
}
/**
 * @brief Make array interface for other language bindings.
 *
 * Wraps the caller's gradient and hessian buffers in (n_samples, n_targets)
 * tensor views and serializes each as a JSON array-interface string.
 *
 * @return Tuple of (gradient interface string, hessian interface string).
 */
template <typename G, typename H>
auto MakeGradientInterface(Context const *ctx, G const *grad, H const *hess, linalg::Order order,
                           std::size_t n_samples, std::size_t n_targets) {
  std::size_t n_total = n_samples * n_targets;
  auto grad_view =
      linalg::MakeTensorView(ctx, order, common::Span{grad, n_total}, n_samples, n_targets);
  auto grad_str = linalg::ArrayInterfaceStr(grad_view);
  auto hess_view =
      linalg::MakeTensorView(ctx, order, common::Span{hess, n_total}, n_samples, n_targets);
  auto hess_str = linalg::ArrayInterfaceStr(hess_view);
  return std::make_tuple(grad_str, hess_str);
}
// Functor that interleaves separate gradient/hessian matrices into a single
// matrix of GradientPair.  operator() is XGBOOST_DEVICE, so the same functor
// is used from both common::ParallelFor (CPU) and thrust kernels (GPU).
template <typename G, typename H>
struct CustomGradHessOp {
  linalg::MatrixView<G> t_grad;
  linalg::MatrixView<H> t_hess;
  linalg::MatrixView<GradientPair> d_gpair;
  CustomGradHessOp(linalg::MatrixView<G> t_grad, linalg::MatrixView<H> t_hess,
                   linalg::MatrixView<GradientPair> d_gpair)
      : t_grad{std::move(t_grad)}, t_hess{std::move(t_hess)}, d_gpair{std::move(d_gpair)} {}
  // i is a flattened element index, decomposed into (row, column) via
  // linalg::UnravelIndex against the gradient's shape.
  XGBOOST_DEVICE void operator()(std::size_t i) {
    auto [m, n] = linalg::UnravelIndex(i, t_grad.Shape(0), t_grad.Shape(1));
    auto g = t_grad(m, n);
    auto h = t_hess(m, n);
    // from struct of arrays to array of structs.
    d_gpair(m, n) = GradientPair{static_cast<float>(g), static_cast<float>(h)};
  }
};
} // namespace detail
} // namespace xgboost
#endif // XGBOOST_C_API_C_API_UTILS_H_

View File

@@ -345,10 +345,10 @@ class CLI {
void LoadModel(std::string const& path, Learner* learner) const {
if (common::FileExtension(path) == "json") {
auto str = common::LoadSequentialFile(path);
CHECK_GT(str.size(), 2);
CHECK_EQ(str[0], '{');
Json in{Json::Load({str.c_str(), str.size()})};
auto buffer = common::LoadSequentialFile(path);
CHECK_GT(buffer.size(), 2);
CHECK_EQ(buffer[0], '{');
Json in{Json::Load({buffer.data(), buffer.size()})};
learner->LoadModel(in);
} else {
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(path.c_str(), "r"));
@@ -514,7 +514,9 @@ class CLI {
};
} // namespace xgboost
int main(int argc, char *argv[]) {
int main(int argc, char* argv[]) {
LOG(WARNING)
<< "The command line interface is deprecated and will be removed in future releases.";
try {
xgboost::CLI cli(argc, argv);
return cli.Run();

View File

@@ -0,0 +1,40 @@
/**
* Copyright 2023 by XGBoost contributors
*
* Higher level functions built on top the Communicator API, taking care of behavioral differences
* between row-split vs column-split distributed training, and horizontal vs vertical federated
* learning.
*/
#pragma once
#include <xgboost/data.h>
#include <limits>
#include <string>
#include <utility>
#include <vector>
#include "communicator-inl.cuh"
namespace xgboost {
namespace collective {
/**
 * @brief Find the global sum of the given values across all workers.
 *
 * Only meaningful when the data is split row-wise (horizontally).  With a
 * column-wise (vertical) split the inputs are left untouched.
 *
 * @tparam T The type of the values.
 * @param info MetaInfo about the DMatrix.
 * @param device The device id.
 * @param values Pointer to the inputs to sum.
 * @param size Number of values to sum.
 */
template <typename T>
void GlobalSum(MetaInfo const& info, int device, T* values, size_t size) {
  if (!info.IsRowSplit()) {
    return;  // vertical split: every worker keeps its local values
  }
  collective::AllReduce<collective::Operation::kSum>(device, values, size);
}
} // namespace collective
} // namespace xgboost

View File

@@ -26,7 +26,6 @@ namespace collective {
* applied there, with the results broadcast to other workers.
*
* @tparam Function The function used to calculate the results.
* @tparam Args Arguments to the function.
* @param info MetaInfo about the DMatrix.
* @param buffer The buffer storing the results.
* @param size The size of the buffer.
@@ -57,6 +56,52 @@ void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&&
}
}
/**
 * @brief Apply the given function where the labels are.
 *
 * Normally all the workers have access to the labels, so the function is just applied locally. In
 * vertical federated learning, we assume labels are only available on worker 0, so the function is
 * applied there, with the results broadcast to other workers.
 *
 * @tparam T Type of the HostDeviceVector storing the results.
 * @tparam Function The function used to calculate the results.
 * @param info MetaInfo about the DMatrix.
 * @param result The HostDeviceVector storing the results.
 * @param function The function used to calculate the results.
 */
template <typename T, typename Function>
void ApplyWithLabels(MetaInfo const& info, HostDeviceVector<T>* result, Function&& function) {
  if (info.IsVerticalFederated()) {
    // We assume labels are only available on worker 0, so the calculation is done there and result
    // broadcast to other workers.
    std::string message;
    if (collective::GetRank() == 0) {
      try {
        std::forward<Function>(function)();
      } catch (dmlc::Error& e) {
        // Capture the error text so the failure can be re-raised everywhere.
        message = e.what();
      }
    }
    // Broadcast the (possibly empty) error message so all workers fail together.
    collective::Broadcast(&message, 0);
    if (!message.empty()) {
      LOG(FATAL) << &message[0];
      return;  // defensive; LOG(FATAL) is presumably non-returning — confirm
    }
    // Worker 0 knows the result size; share it, then the payload itself.
    std::size_t size{};
    if (collective::GetRank() == 0) {
      size = result->Size();
    }
    collective::Broadcast(&size, sizeof(std::size_t), 0);
    result->Resize(size);
    collective::Broadcast(result->HostPointer(), size * sizeof(T), 0);
  } else {
    // Non-federated (or horizontal) case: labels are available locally.
    std::forward<Function>(function)();
  }
}
/**
* @brief Find the global max of the given value across all workers.
*

View File

@@ -57,6 +57,20 @@ inline void AllReduce(int device, double *send_receive_buffer, size_t count) {
Communicator::GetDevice(device)->AllReduce(send_receive_buffer, count, DataType::kDouble, op);
}
/**
 * @brief Gather values from all processes.
 *
 * This assumes all ranks have the same size.
 *
 * @param device ID of the device.
 * @param send_buffer Buffer storing the data to be sent.
 * @param receive_buffer Buffer storing the gathered data.
 * @param send_size Size of the sent data in bytes.
 */
inline void AllGather(int device, void const *send_buffer, void *receive_buffer,
                      std::size_t send_size) {
  Communicator::GetDevice(device)->AllGather(send_buffer, receive_buffer, send_size);
}
/**
* @brief Gather variable-length values from all processes.
* @param device ID of the device.

View File

@@ -41,7 +41,8 @@ void Communicator::Init(Json const& config) {
#endif
break;
}
case CommunicatorType::kInMemory: {
case CommunicatorType::kInMemory:
case CommunicatorType::kInMemoryNccl: {
communicator_.reset(InMemoryCommunicator::Create(config));
break;
}

View File

@@ -29,13 +29,22 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
old_device_ordinal = device_ordinal;
old_world_size = communicator_->GetWorldSize();
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
if (type_ != CommunicatorType::kFederated) {
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, Get()));
} else {
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal, Get()));
switch (type_) {
case CommunicatorType::kRabit:
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
break;
case CommunicatorType::kFederated:
case CommunicatorType::kInMemory:
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
break;
case CommunicatorType::kInMemoryNccl:
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true));
break;
default:
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
}
#else
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal, Get()));
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
#endif
}
return device_communicator_.get();

View File

@@ -69,7 +69,7 @@ enum class Operation {
class DeviceCommunicator;
enum class CommunicatorType { kUnknown, kRabit, kFederated, kInMemory };
enum class CommunicatorType { kUnknown, kRabit, kFederated, kInMemory, kInMemoryNccl };
/** \brief Case-insensitive string comparison. */
inline int CompareStringsCaseInsensitive(const char *s1, const char *s2) {
@@ -220,6 +220,8 @@ class Communicator {
result = CommunicatorType::kFederated;
} else if (!CompareStringsCaseInsensitive("in-memory", str)) {
result = CommunicatorType::kInMemory;
} else if (!CompareStringsCaseInsensitive("in-memory-nccl", str)) {
result = CommunicatorType::kInMemoryNccl;
} else {
LOG(FATAL) << "Unknown communicator type " << str;
}

View File

@@ -27,6 +27,17 @@ class DeviceCommunicator {
virtual void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
Operation op) = 0;
/**
 * @brief Gather values from all processes.
 *
 * This assumes all ranks have the same size.
 *
 * @param send_buffer Buffer storing the data to be sent.
 * @param receive_buffer Buffer storing the gathered data.
 * @param send_size Size of the sent data in bytes.
 */
virtual void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) = 0;
/**
* @brief Gather variable-length values from all processes.
* @param send_buffer Buffer storing the input data.

View File

@@ -11,21 +11,18 @@ namespace collective {
class DeviceCommunicatorAdapter : public DeviceCommunicator {
public:
DeviceCommunicatorAdapter(int device_ordinal, Communicator *communicator)
: device_ordinal_{device_ordinal}, communicator_{communicator} {
explicit DeviceCommunicatorAdapter(int device_ordinal)
: device_ordinal_{device_ordinal}, world_size_{GetWorldSize()}, rank_{GetRank()} {
if (device_ordinal_ < 0) {
LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
}
if (communicator_ == nullptr) {
LOG(FATAL) << "Communicator cannot be null.";
}
}
~DeviceCommunicatorAdapter() override = default;
void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
Operation op) override {
if (communicator_->GetWorldSize() == 1) {
if (world_size_ == 1) {
return;
}
@@ -35,62 +32,82 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
dh::safe_cuda(hipSetDevice(device_ordinal_));
#endif
auto size = count * GetTypeSize(data_type);
host_buffer_.reserve(size);
host_buffer_.resize(size);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault));
communicator_->AllReduce(host_buffer_.data(), count, data_type, op);
Allreduce(host_buffer_.data(), count, data_type, op);
dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(host_buffer_.data(), send_receive_buffer, size, hipMemcpyDefault));
communicator_->AllReduce(host_buffer_.data(), count, data_type, op);
AllReduce(host_buffer_.data(), count, data_type, op);
dh::safe_cuda(hipMemcpy(send_receive_buffer, host_buffer_.data(), size, hipMemcpyDefault));
#endif
}
// Device AllGather staged through a host buffer: copy the local slice into
// this rank's offset, run the host-side Allgather, then copy the fully
// gathered buffer back to the device.
void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) override {
  if (world_size_ == 1) {
    // NOTE(review): with a single worker receive_buffer is not filled from
    // send_buffer — confirm callers handle this.
    return;
  }
#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_ordinal_));
  host_buffer_.resize(send_size * world_size_);
  dh::safe_cuda(cudaMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
                           cudaMemcpyDefault));
  Allgather(host_buffer_.data(), host_buffer_.size());
  dh::safe_cuda(
      cudaMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
  dh::safe_cuda(hipSetDevice(device_ordinal_));
  host_buffer_.resize(send_size * world_size_);
  dh::safe_cuda(hipMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
                          hipMemcpyDefault));
  Allgather(host_buffer_.data(), host_buffer_.size());
  dh::safe_cuda(
      hipMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), hipMemcpyDefault));
#endif
}
void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
dh::caching_device_vector<char> *receive_buffer) override {
if (communicator_->GetWorldSize() == 1) {
if (world_size_ == 1) {
return;
}
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_ordinal_));
#else
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_ordinal_));
#endif
int const world_size = communicator_->GetWorldSize();
int const rank = communicator_->GetRank();
segments->clear();
segments->resize(world_size, 0);
segments->at(rank) = length_bytes;
communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64,
Operation::kMax);
segments->resize(world_size_, 0);
segments->at(rank_) = length_bytes;
Allreduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax);
auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
receive_buffer->resize(total_bytes);
host_buffer_.reserve(total_bytes);
host_buffer_.resize(total_bytes);
size_t offset = 0;
for (int32_t i = 0; i < world_size; ++i) {
for (int32_t i = 0; i < world_size_; ++i) {
size_t as_bytes = segments->at(i);
if (i == rank) {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank),
hipMemcpyDefault));
#else
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank),
if (i == rank_) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
hipMemcpyDefault));
#endif
}
communicator_->Broadcast(host_buffer_.data() + offset, as_bytes, i);
Broadcast(host_buffer_.data() + offset, as_bytes, i);
offset += as_bytes;
}
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
hipMemcpyDefault));
#else
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
cudaMemcpyDefault));
#endif
@@ -102,7 +119,8 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
private:
int const device_ordinal_;
Communicator *communicator_;
int const world_size_;
int const rank_;
/// Host buffer used to call communicator functions.
std::vector<char> host_buffer_{};
};

View File

@@ -0,0 +1,229 @@
/*!
* Copyright 2023 XGBoost contributors
*/
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
#include "nccl_device_communicator.cuh"
namespace xgboost {
namespace collective {
/**
 * @brief Construct an NCCL communicator bound to a single GPU.
 *
 * Validates the device ordinal, verifies that no two ranks share the same GPU,
 * and initializes the NCCL communicator across all ranks. For a single-worker
 * run no NCCL state is created at all.
 *
 * @param device_ordinal CUDA device id this rank runs on; must be >= 0.
 * @param needs_sync     Whether extra stream synchronization is required
 *                       (used later by BitwiseAllReduce).
 */
NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync)
    : device_ordinal_{device_ordinal},
      needs_sync_{needs_sync},
      world_size_{GetWorldSize()},
      rank_{GetRank()} {
  if (device_ordinal_ < 0) {
    LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
  }
  if (world_size_ == 1) {
    // Nothing to communicate with; skip NCCL initialization entirely.
    return;
  }

  // Gather every rank's GPU UUID so we can detect ranks sharing one device.
  std::vector<uint64_t> uuids(world_size_ * kUuidLength, 0);
  auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
  auto s_this_uuid = s_uuid.subspan(rank_ * kUuidLength, kUuidLength);
  GetCudaUUID(s_this_uuid);

  // TODO(rongou): replace this with allgather.
  // Each rank writes only its own slot (rest are zero), so a sum acts as a gather.
  Allreduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum);

  std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world_size_);
  size_t j = 0;
  for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
    converted[j] = xgboost::common::Span<uint64_t, kUuidLength>{uuids.data() + i, kUuidLength};
    j++;
  }

  // NOTE(review): std::unique only collapses *adjacent* duplicates — this
  // presumably assumes ranks on the same device occupy consecutive slots;
  // confirm against the launcher's rank assignment.
  auto iter = std::unique(converted.begin(), converted.end());
  auto n_uniques = std::distance(converted.begin(), iter);
  CHECK_EQ(n_uniques, world_size_)
      << "Multiple processes within communication group running on same CUDA "
      << "device is not supported. " << PrintUUID(s_this_uuid) << "\n";

  nccl_unique_id_ = GetUniqueId();
  dh::safe_cuda(cudaSetDevice(device_ordinal_));
  dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_));
}
/**
 * @brief Tear down the NCCL communicator and optionally report statistics.
 *
 * In a single-worker run the constructor created no NCCL state, so there is
 * nothing to destroy. Statistics are only printed at debug verbosity.
 */
NcclDeviceCommunicator::~NcclDeviceCommunicator() {
  if (world_size_ != 1) {
    if (nccl_comm_) {
      dh::safe_nccl(ncclCommDestroy(nccl_comm_));
    }
    if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
      LOG(CONSOLE) << "======== NCCL Statistics========";
      LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
      LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_ / 1048576;
    }
  }
}
namespace {
/**
 * @brief Map an xgboost collective DataType onto the matching ncclDataType_t.
 *
 * Aborts via LOG(FATAL) on an unrecognized data type.
 */
ncclDataType_t GetNcclDataType(DataType const &data_type) {
  switch (data_type) {
    case DataType::kInt8:
      return ncclInt8;
    case DataType::kUInt8:
      return ncclUint8;
    case DataType::kInt32:
      return ncclInt32;
    case DataType::kUInt32:
      return ncclUint32;
    case DataType::kInt64:
      return ncclInt64;
    case DataType::kUInt64:
      return ncclUint64;
    case DataType::kFloat:
      return ncclFloat;
    case DataType::kDouble:
      return ncclDouble;
    default:
      LOG(FATAL) << "Unknown data type.";
  }
  return ncclInt8;  // Unreachable: LOG(FATAL) aborts; keeps the compiler quiet.
}
/// @brief True iff @p op is one of the bitwise reductions (AND/OR/XOR),
/// which NCCL does not support natively.
bool IsBitwiseOp(Operation const &op) {
  switch (op) {
    case Operation::kBitwiseAND:
    case Operation::kBitwiseOR:
    case Operation::kBitwiseXOR:
      return true;
    default:
      return false;
  }
}
/**
 * @brief Map an arithmetic reduce Operation onto the matching ncclRedOp_t.
 *
 * Bitwise operations are intentionally absent here — they are routed through
 * BitwiseAllReduce instead. Aborts via LOG(FATAL) on anything unsupported.
 */
ncclRedOp_t GetNcclRedOp(Operation const &op) {
  switch (op) {
    case Operation::kMax:
      return ncclMax;
    case Operation::kMin:
      return ncclMin;
    case Operation::kSum:
      return ncclSum;
    default:
      LOG(FATAL) << "Unsupported reduce operation.";
  }
  return ncclMax;  // Unreachable: LOG(FATAL) aborts; keeps the compiler quiet.
}
/**
 * @brief Locally reduce per-rank byte buffers (already gathered on this device)
 *        with a bitwise functor.
 *
 * @param out_buffer    Device output, `size` bytes.
 * @param device_buffer Device input laid out as world_size contiguous segments
 *                      of `size` bytes each (rank r's bytes start at r * size).
 * @param func          Binary bitwise functor applied byte-wise (e.g. bit_and).
 * @param world_size    Number of ranks / segments in device_buffer.
 * @param size          Bytes per segment; one GPU thread handles one byte index.
 */
template <typename Func>
void RunBitwiseAllreduce(char *out_buffer, char const *device_buffer, Func func, int world_size,
                         std::size_t size) {
  dh::LaunchN(size, [=] __device__(std::size_t idx) {
    // Fold rank 0's byte with the byte at the same offset from every other rank.
    auto result = device_buffer[idx];
    for (auto rank = 1; rank < world_size; rank++) {
      result = func(result, device_buffer[rank * size + idx]);
    }
    out_buffer[idx] = result;
  });
}
} // anonymous namespace
void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::size_t count,
DataType data_type, Operation op) {
auto const size = count * GetTypeSize(data_type);
dh::caching_device_vector<char> buffer(size * world_size_);
auto *device_buffer = buffer.data().get();
// First gather data from all the workers.
dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
nccl_comm_, dh::DefaultStream()));
if (needs_sync_) {
dh::DefaultStream().Sync();
}
// Then reduce locally.
auto *out_buffer = static_cast<char *>(send_receive_buffer);
switch (op) {
case Operation::kBitwiseAND:
RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and<char>(), world_size_, size);
break;
case Operation::kBitwiseOR:
RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or<char>(), world_size_, size);
break;
case Operation::kBitwiseXOR:
RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor<char>(), world_size_, size);
break;
default:
LOG(FATAL) << "Not a bitwise reduce operation.";
}
}
/**
 * @brief Allreduce @p count elements of @p data_type in place with @p op.
 *
 * Arithmetic reductions go straight to NCCL; bitwise ones take the emulated
 * BitwiseAllReduce path. A single-worker run is a no-op.
 */
void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t count,
                                       DataType data_type, Operation op) {
  if (world_size_ == 1) {
    return;  // Nothing to reduce against.
  }
  dh::safe_cuda(cudaSetDevice(device_ordinal_));
  if (!IsBitwiseOp(op)) {
    dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count,
                                GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
                                dh::DefaultStream()));
  } else {
    // NCCL has no bitwise reductions; emulate with allgather + local reduce.
    BitwiseAllReduce(send_receive_buffer, count, data_type, op);
  }
  // Bookkeeping for the debug statistics printed by the destructor.
  allreduce_bytes_ += count * GetTypeSize(data_type);
  allreduce_calls_ += 1;
}
/**
 * @brief Gather @p send_size bytes from every rank into @p receive_buffer.
 *
 * The payload is treated as raw bytes (ncclInt8); @p receive_buffer must hold
 * world_size * send_size bytes. A single-worker run is a no-op.
 */
void NcclDeviceCommunicator::AllGather(void const *send_buffer, void *receive_buffer,
                                       std::size_t send_size) {
  if (world_size_ != 1) {
    dh::safe_cuda(cudaSetDevice(device_ordinal_));
    dh::safe_nccl(ncclAllGather(send_buffer, receive_buffer, send_size, ncclInt8, nccl_comm_,
                                dh::DefaultStream()));
  }
}
/**
 * @brief Variable-length allgather over NCCL.
 *
 * First exchanges the per-rank byte counts (each rank fills only its own slot,
 * so a max-reduction yields everyone's length), then broadcasts each rank's
 * payload into its slice of @p receive_buffer inside one NCCL group.
 *
 * @param send_buffer    Device buffer with this rank's payload.
 * @param length_bytes   Number of bytes this rank contributes.
 * @param segments       Out: per-rank byte counts (resized to world size).
 * @param receive_buffer Out: device buffer with all payloads concatenated.
 */
void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes,
                                        std::vector<std::size_t> *segments,
                                        dh::caching_device_vector<char> *receive_buffer) {
  if (world_size_ == 1) {
    return;
  }
  dh::safe_cuda(cudaSetDevice(device_ordinal_));

  segments->clear();
  segments->resize(world_size_, 0);
  segments->at(rank_) = length_bytes;
  Allreduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax);
  // Accumulate in std::size_t: the previous 0UL init is only 32 bits on LLP64
  // platforms (e.g. 64-bit Windows) and could overflow the total byte count.
  auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), std::size_t{0});
  receive_buffer->resize(total_bytes);

  size_t offset = 0;
  dh::safe_nccl(ncclGroupStart());
  for (int32_t i = 0; i < world_size_; ++i) {
    size_t as_bytes = segments->at(i);
    // For rank i's broadcast, only rank i's send_buffer is read; the other
    // ranks just receive into their slice.
    dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
                                ncclChar, i, nccl_comm_, dh::DefaultStream()));
    offset += as_bytes;
  }
  dh::safe_nccl(ncclGroupEnd());
}
/// @brief Block until all NCCL work queued on the default stream has finished.
/// No-op for a single-worker run.
void NcclDeviceCommunicator::Synchronize() {
  if (world_size_ != 1) {
    dh::safe_cuda(cudaSetDevice(device_ordinal_));
    dh::DefaultStream().Sync();
  }
}
} // namespace collective
} // namespace xgboost
#endif

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2022 XGBoost contributors
* Copyright 2022-2023 XGBoost contributors
*/
#pragma once
@@ -12,136 +12,27 @@ namespace collective {
class NcclDeviceCommunicator : public DeviceCommunicator {
public:
NcclDeviceCommunicator(int device_ordinal, Communicator *communicator)
: device_ordinal_{device_ordinal}, communicator_{communicator} {
if (device_ordinal_ < 0) {
LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
}
if (communicator_ == nullptr) {
LOG(FATAL) << "Communicator cannot be null.";
}
int32_t const rank = communicator_->GetRank();
int32_t const world = communicator_->GetWorldSize();
if (world == 1) {
return;
}
std::vector<uint64_t> uuids(world * kUuidLength, 0);
auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength);
GetCudaUUID(s_this_uuid);
// TODO(rongou): replace this with allgather.
communicator_->AllReduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum);
std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world);
size_t j = 0;
for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
converted[j] = xgboost::common::Span<uint64_t, kUuidLength>{uuids.data() + i, kUuidLength};
j++;
}
auto iter = std::unique(converted.begin(), converted.end());
auto n_uniques = std::distance(converted.begin(), iter);
CHECK_EQ(n_uniques, world)
<< "Multiple processes within communication group running on same CUDA "
<< "device is not supported. " << PrintUUID(s_this_uuid) << "\n";
nccl_unique_id_ = GetUniqueId();
dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world, nccl_unique_id_, rank));
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipStreamCreate(&cuda_stream_));
#else
dh::safe_cuda(cudaStreamCreate(&cuda_stream_));
#endif
}
~NcclDeviceCommunicator() override {
if (communicator_->GetWorldSize() == 1) {
return;
}
if (cuda_stream_) {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipStreamDestroy(cuda_stream_));
#else
dh::safe_cuda(cudaStreamDestroy(cuda_stream_));
#endif
}
if (nccl_comm_) {
dh::safe_nccl(ncclCommDestroy(nccl_comm_));
}
if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
LOG(CONSOLE) << "======== NCCL Statistics========";
LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_ / 1048576;
}
}
/**
* @brief Construct a new NCCL communicator.
* @param device_ordinal The GPU device id.
* @param needs_sync Whether extra CUDA stream synchronization is needed.
*
* In multi-GPU tests when multiple NCCL communicators are created in the same process, sometimes
* a deadlock happens because NCCL kernels are blocking. The extra CUDA stream synchronization
* makes sure that the NCCL kernels are caught up, thus avoiding the deadlock.
*
* The Rabit communicator runs with one process per GPU, so the additional synchronization is not
* needed. The in-memory communicator is used in tests with multiple threads, each thread
* representing a rank/worker, so the additional synchronization is needed to avoid deadlocks.
*/
explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync);
~NcclDeviceCommunicator() override;
void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
Operation op) override {
if (communicator_->GetWorldSize() == 1) {
return;
}
dh::safe_cuda(cudaSetDevice(device_ordinal_));
dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count,
GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
cuda_stream_));
allreduce_bytes_ += count * GetTypeSize(data_type);
allreduce_calls_ += 1;
}
Operation op) override;
void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) override;
void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
dh::caching_device_vector<char> *receive_buffer) override {
if (communicator_->GetWorldSize() == 1) {
return;
}
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_ordinal_));
#else
dh::safe_cuda(cudaSetDevice(device_ordinal_));
#endif
int const world_size = communicator_->GetWorldSize();
int const rank = communicator_->GetRank();
segments->clear();
segments->resize(world_size, 0);
segments->at(rank) = length_bytes;
communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64,
Operation::kMax);
auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
receive_buffer->resize(total_bytes);
size_t offset = 0;
dh::safe_nccl(ncclGroupStart());
for (int32_t i = 0; i < world_size; ++i) {
size_t as_bytes = segments->at(i);
dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
ncclChar, i, nccl_comm_, cuda_stream_));
offset += as_bytes;
}
dh::safe_nccl(ncclGroupEnd());
}
void Synchronize() override {
if (communicator_->GetWorldSize() == 1) {
return;
}
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_ordinal_));
dh::safe_cuda(hipStreamSynchronize(cuda_stream_));
#else
dh::safe_cuda(cudaSetDevice(device_ordinal_));
dh::safe_cuda(cudaStreamSynchronize(cuda_stream_));
#endif
}
dh::caching_device_vector<char> *receive_buffer) override;
void Synchronize() override;
private:
static constexpr std::size_t kUuidLength =
@@ -182,79 +73,21 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
/**
 * @brief Create an NCCL unique id on the root rank and share it with all ranks.
 *
 * This block was a corrupted merge with old (`communicator_->`) and new
 * (`rank_` member / free `Broadcast`) lines interleaved; this is the
 * reconstructed merged version. Every rank must call this, since the id is
 * distributed via a collective broadcast.
 */
ncclUniqueId GetUniqueId() {
  static const int kRootRank = 0;
  ncclUniqueId id;
  if (rank_ == kRootRank) {
    dh::safe_nccl(ncclGetUniqueId(&id));
  }
  Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId), static_cast<int>(kRootRank));
  return id;
}
static ncclDataType_t GetNcclDataType(DataType const &data_type) {
ncclDataType_t result;
switch (data_type) {
case DataType::kInt8:
result = ncclInt8;
break;
case DataType::kUInt8:
result = ncclUint8;
break;
case DataType::kInt32:
result = ncclInt32;
break;
case DataType::kUInt32:
result = ncclUint32;
break;
case DataType::kInt64:
result = ncclInt64;
break;
case DataType::kUInt64:
result = ncclUint64;
break;
case DataType::kFloat:
result = ncclFloat;
break;
case DataType::kDouble:
result = ncclDouble;
break;
default:
LOG(FATAL) << "Unknown data type.";
}
return result;
}
static ncclRedOp_t GetNcclRedOp(Operation const &op) {
ncclRedOp_t result;
switch (op) {
case Operation::kMax:
result = ncclMax;
break;
case Operation::kMin:
result = ncclMin;
break;
case Operation::kSum:
result = ncclSum;
break;
case Operation::kBitwiseAND:
case Operation::kBitwiseOR:
case Operation::kBitwiseXOR:
LOG(FATAL) << "Not implemented yet.";
default:
LOG(FATAL) << "Unknown reduce operation.";
}
return result;
}
void BitwiseAllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
Operation op);
int const device_ordinal_;
Communicator *communicator_;
bool const needs_sync_;
int const world_size_;
int const rank_;
ncclComm_t nccl_comm_{};
#if defined(XGBOOST_USE_HIP)
hipStream_t cuda_stream_{};
#else
cudaStream_t cuda_stream_{};
#endif
ncclUniqueId nccl_unique_id_{};
size_t allreduce_bytes_{0}; // Keep statistics of the number of bytes communicated.
size_t allreduce_calls_{0}; // Keep statistics of the number of reduce calls.

View File

@@ -1,19 +1,22 @@
/*!
* Copyright (c) 2022 by XGBoost Contributors
/**
* Copyright 2022-2023 by XGBoost Contributors
*/
#include "xgboost/collective/socket.h"
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <cstring> // std::memcpy, std::memset
#include <filesystem> // for path
#include <system_error> // std::error_code, std::system_category
#include "rabit/internal/socket.h" // for PollHelper
#include "xgboost/collective/result.h" // for Result
#if defined(__unix__) || defined(__APPLE__)
#include <netdb.h> // getaddrinfo, freeaddrinfo
#endif // defined(__unix__) || defined(__APPLE__)
namespace xgboost {
namespace collective {
namespace xgboost::collective {
SockAddress MakeSockAddress(StringView host, in_port_t port) {
struct addrinfo hints;
std::memset(&hints, 0, sizeof(hints));
@@ -71,7 +74,12 @@ std::size_t TCPSocket::Recv(std::string *p_str) {
return bytes;
}
std::error_code Connect(SockAddress const &addr, TCPSocket *out) {
[[nodiscard]] Result Connect(xgboost::StringView host, std::int32_t port, std::int32_t retry,
std::chrono::seconds timeout,
xgboost::collective::TCPSocket *out_conn) {
auto addr = MakeSockAddress(xgboost::StringView{host}, port);
auto &conn = *out_conn;
sockaddr const *addr_handle{nullptr};
socklen_t addr_len{0};
if (addr.IsV4()) {
@@ -81,14 +89,67 @@ std::error_code Connect(SockAddress const &addr, TCPSocket *out) {
addr_handle = reinterpret_cast<const sockaddr *>(&addr.V6().Handle());
addr_len = sizeof(addr.V6().Handle());
}
auto socket = TCPSocket::Create(addr.Domain());
CHECK_EQ(static_cast<std::int32_t>(socket.Domain()), static_cast<std::int32_t>(addr.Domain()));
auto rc = connect(socket.Handle(), addr_handle, addr_len);
if (rc != 0) {
return std::error_code{errno, std::system_category()};
conn = TCPSocket::Create(addr.Domain());
CHECK_EQ(static_cast<std::int32_t>(conn.Domain()), static_cast<std::int32_t>(addr.Domain()));
conn.SetNonBlock(true);
Result last_error;
auto log_failure = [&host, &last_error](Result err, char const *file, std::int32_t line) {
last_error = std::move(err);
LOG(WARNING) << std::filesystem::path{file}.filename().string() << "(" << line
<< "): Failed to connect to:" << host << " Error:" << last_error.Report();
};
for (std::int32_t attempt = 0; attempt < std::max(retry, 1); ++attempt) {
if (attempt > 0) {
LOG(WARNING) << "Retrying connection to " << host << " for the " << attempt << " time.";
#if defined(_MSC_VER) || defined(__MINGW32__)
Sleep(attempt << 1);
#else
sleep(attempt << 1);
#endif
}
auto rc = connect(conn.Handle(), addr_handle, addr_len);
if (rc != 0) {
auto errcode = system::LastError();
if (!system::ErrorWouldBlock(errcode)) {
log_failure(Fail("connect failed.", std::error_code{errcode, std::system_category()}),
__FILE__, __LINE__);
continue;
}
rabit::utils::PollHelper poll;
poll.WatchWrite(conn);
auto result = poll.Poll(timeout);
if (!result.OK()) {
log_failure(std::move(result), __FILE__, __LINE__);
continue;
}
if (!poll.CheckWrite(conn)) {
log_failure(Fail("poll failed.", std::error_code{errcode, std::system_category()}),
__FILE__, __LINE__);
continue;
}
result = conn.GetSockError();
if (!result.OK()) {
log_failure(std::move(result), __FILE__, __LINE__);
continue;
}
conn.SetNonBlock(false);
return Success();
} else {
conn.SetNonBlock(false);
return Success();
}
}
*out = std::move(socket);
return std::make_error_code(std::errc{});
std::stringstream ss;
ss << "Failed to connect to " << host << ":" << port;
conn.Close();
return Fail(ss.str(), std::move(last_error));
}
} // namespace collective
} // namespace xgboost
} // namespace xgboost::collective

View File

@@ -188,7 +188,7 @@ void SegmentedArgSort(Context const *ctx, Span<U> values, Span<V> group_ptr,
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
sorted_idx.size_bytes(), hipMemcpyDeviceToDevice));
#else
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
#endif

View File

@@ -24,6 +24,8 @@ struct XGBAPIThreadLocalEntry {
std::vector<const char *> ret_vec_charp;
/*! \brief returning float vector. */
std::vector<float> ret_vec_float;
/*! \brief returning uint vector. */
std::vector<std::uint64_t> ret_vec_u64;
/*! \brief temp variable of gradient pairs. */
std::vector<GradientPair> tmp_gpair;
/*! \brief Temp variable for returning prediction result. */

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2019 by Contributors
/**
* Copyright 2019-2023, XGBoost Contributors
* \file bitfield.h
*/
#ifndef XGBOOST_COMMON_BITFIELD_H_
@@ -54,14 +54,17 @@ __forceinline__ __device__ BitFieldAtomicType AtomicAnd(BitFieldAtomicType* addr
}
#endif // defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
/*!
* \brief A non-owning type with auxiliary methods defined for manipulating bits.
/**
* @brief A non-owning type with auxiliary methods defined for manipulating bits.
*
* \tparam Direction Whether the bits start from left or from right.
* @tparam VT Underlying value type, must be an unsigned integer.
* @tparam Direction Whether the bits start from left or from right.
* @tparam IsConst Whether the view is const.
*/
template <typename VT, typename Direction, bool IsConst = false>
struct BitFieldContainer {
using value_type = std::conditional_t<IsConst, VT const, VT>; // NOLINT
using size_type = size_t; // NOLINT
using index_type = size_t; // NOLINT
using pointer = value_type*; // NOLINT
@@ -74,8 +77,9 @@ struct BitFieldContainer {
};
private:
common::Span<value_type> bits_;
static_assert(!std::is_signed<VT>::value, "Must use unsiged type as underlying storage.");
value_type* bits_{nullptr};
size_type n_values_{0};
static_assert(!std::is_signed<VT>::value, "Must use an unsiged type as the underlying storage.");
public:
XGBOOST_DEVICE static Pos ToBitPos(index_type pos) {
@@ -90,13 +94,15 @@ struct BitFieldContainer {
public:
BitFieldContainer() = default;
XGBOOST_DEVICE explicit BitFieldContainer(common::Span<value_type> bits) : bits_{bits} {}
XGBOOST_DEVICE BitFieldContainer(BitFieldContainer const& other) : bits_{other.bits_} {}
XGBOOST_DEVICE explicit BitFieldContainer(common::Span<value_type> bits)
: bits_{bits.data()}, n_values_{bits.size()} {}
BitFieldContainer(BitFieldContainer const& other) = default;
BitFieldContainer(BitFieldContainer&& other) = default;
BitFieldContainer &operator=(BitFieldContainer const &that) = default;
BitFieldContainer &operator=(BitFieldContainer &&that) = default;
XGBOOST_DEVICE common::Span<value_type> Bits() { return bits_; }
XGBOOST_DEVICE common::Span<value_type const> Bits() const { return bits_; }
XGBOOST_DEVICE auto Bits() { return common::Span<value_type>{bits_, NumValues()}; }
XGBOOST_DEVICE auto Bits() const { return common::Span<value_type const>{bits_, NumValues()}; }
/*\brief Compute the size of needed memory allocation. The returned value is in terms
* of number of elements with `BitFieldContainer::value_type'.
@@ -107,17 +113,17 @@ struct BitFieldContainer {
#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
__device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
size_t min_size = min(bits_.size(), rhs.bits_.size());
size_t min_size = min(NumValues(), rhs.NumValues());
if (tid < min_size) {
bits_[tid] |= rhs.bits_[tid];
Data()[tid] |= rhs.Data()[tid];
}
return *this;
}
#else
BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
size_t min_size = std::min(bits_.size(), rhs.bits_.size());
size_t min_size = std::min(NumValues(), rhs.NumValues());
for (size_t i = 0; i < min_size; ++i) {
bits_[i] |= rhs.bits_[i];
Data()[i] |= rhs.Data()[i];
}
return *this;
}
@@ -125,75 +131,85 @@ struct BitFieldContainer {
#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
__device__ BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
size_t min_size = min(bits_.size(), rhs.bits_.size());
size_t min_size = min(NumValues(), rhs.NumValues());
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < min_size) {
bits_[tid] &= rhs.bits_[tid];
Data()[tid] &= rhs.Data()[tid];
}
return *this;
}
#else
BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
size_t min_size = std::min(bits_.size(), rhs.bits_.size());
size_t min_size = std::min(NumValues(), rhs.NumValues());
for (size_t i = 0; i < min_size; ++i) {
bits_[i] &= rhs.bits_[i];
Data()[i] &= rhs.Data()[i];
}
return *this;
}
#endif // defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
__device__ auto Set(index_type pos) {
__device__ auto Set(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos));
value_type& value = bits_[pos_v.int_pos];
value_type& value = Data()[pos_v.int_pos];
value_type set_bit = kOne << pos_v.bit_pos;
using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
atomicOr(reinterpret_cast<Type *>(&value), set_bit);
}
__device__ void Clear(index_type pos) {
__device__ void Clear(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos));
value_type& value = bits_[pos_v.int_pos];
value_type& value = Data()[pos_v.int_pos];
value_type clear_bit = ~(kOne << pos_v.bit_pos);
using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
atomicAnd(reinterpret_cast<Type *>(&value), clear_bit);
}
#else
void Set(index_type pos) {
void Set(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos));
value_type& value = bits_[pos_v.int_pos];
value_type& value = Data()[pos_v.int_pos];
value_type set_bit = kOne << pos_v.bit_pos;
value |= set_bit;
}
void Clear(index_type pos) {
void Clear(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos));
value_type& value = bits_[pos_v.int_pos];
value_type& value = Data()[pos_v.int_pos];
value_type clear_bit = ~(kOne << pos_v.bit_pos);
value &= clear_bit;
}
#endif // defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
XGBOOST_DEVICE bool Check(Pos pos_v) const {
XGBOOST_DEVICE bool Check(Pos pos_v) const noexcept(true) {
pos_v = Direction::Shift(pos_v);
SPAN_LT(pos_v.int_pos, bits_.size());
value_type const value = bits_[pos_v.int_pos];
assert(pos_v.int_pos < NumValues());
value_type const value = Data()[pos_v.int_pos];
value_type const test_bit = kOne << pos_v.bit_pos;
value_type result = test_bit & value;
return static_cast<bool>(result);
}
XGBOOST_DEVICE bool Check(index_type pos) const {
[[nodiscard]] XGBOOST_DEVICE bool Check(index_type pos) const noexcept(true) {
Pos pos_v = ToBitPos(pos);
return Check(pos_v);
}
/**
* @brief Returns the total number of bits that can be viewed. This is equal to or
* larger than the acutal number of valid bits.
*/
[[nodiscard]] XGBOOST_DEVICE size_type Capacity() const noexcept(true) {
return kValueSize * NumValues();
}
/**
* @brief Number of storage unit used in this bit field.
*/
[[nodiscard]] XGBOOST_DEVICE size_type NumValues() const noexcept(true) { return n_values_; }
XGBOOST_DEVICE size_t Size() const { return kValueSize * bits_.size(); }
XGBOOST_DEVICE pointer Data() const noexcept(true) { return bits_; }
XGBOOST_DEVICE pointer Data() const { return bits_.data(); }
inline friend std::ostream &
operator<<(std::ostream &os, BitFieldContainer<VT, Direction, IsConst> field) {
os << "Bits " << "storage size: " << field.bits_.size() << "\n";
for (typename common::Span<value_type>::index_type i = 0; i < field.bits_.size(); ++i) {
std::bitset<BitFieldContainer<VT, Direction, IsConst>::kValueSize> bset(field.bits_[i]);
inline friend std::ostream& operator<<(std::ostream& os,
BitFieldContainer<VT, Direction, IsConst> field) {
os << "Bits "
<< "storage size: " << field.NumValues() << "\n";
for (typename common::Span<value_type>::index_type i = 0; i < field.NumValues(); ++i) {
std::bitset<BitFieldContainer<VT, Direction, IsConst>::kValueSize> bset(field.Data()[i]);
os << bset << "\n";
}
return os;

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2020-2022 by XGBoost Contributors
/**
* Copyright 2020-2023, XGBoost Contributors
* \file categorical.h
*/
#ifndef XGBOOST_COMMON_CATEGORICAL_H_
@@ -10,7 +10,6 @@
#include "bitfield.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/parameter.h"
#include "xgboost/span.h"
namespace xgboost {
@@ -53,7 +52,7 @@ inline XGBOOST_DEVICE bool InvalidCat(float cat) {
*
* Go to left if it's NOT the matching category, which matches one-hot encoding.
*/
inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat) {
inline XGBOOST_DEVICE bool Decision(common::Span<CatBitField::value_type const> cats, float cat) {
KCatBitField const s_cats(cats);
if (XGBOOST_EXPECT(InvalidCat(cat), false)) {
return true;

View File

@@ -1,16 +1,27 @@
/*!
* Copyright 2017-2022 by XGBoost Contributors
/**
* Copyright 2017-2023, XGBoost Contributors
* \brief Utility for fast column-wise access
*/
#include "column_matrix.h"
namespace xgboost {
namespace common {
#include <algorithm> // for transform
#include <cstddef> // for size_t
#include <cstdint> // for uint64_t, uint8_t
#include <limits> // for numeric_limits
#include <type_traits> // for remove_reference_t
#include <vector> // for vector
#include "../data/gradient_index.h" // for GHistIndexMatrix
#include "io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
#include "xgboost/base.h" // for bst_feaature_t
#include "xgboost/span.h" // for Span
namespace xgboost::common {
void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold) {
auto const nfeature = gmat.Features();
const size_t nrow = gmat.Size();
// identify type of each column
type_.resize(nfeature);
type_ = common::MakeFixedVecWithMalloc(nfeature, ColumnType{});
uint32_t max_val = std::numeric_limits<uint32_t>::max();
for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
@@ -34,7 +45,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
// want to compute storage boundary for each feature
// using variants of prefix sum scan
feature_offsets_.resize(nfeature + 1);
feature_offsets_ = common::MakeFixedVecWithMalloc(nfeature + 1, std::size_t{0});
size_t accum_index = 0;
feature_offsets_[0] = accum_index;
for (bst_feature_t fid = 1; fid < nfeature + 1; ++fid) {
@@ -49,9 +60,11 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
SetTypeSize(gmat.MaxNumBinPerFeat());
auto storage_size =
feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
index_.resize(storage_size, 0);
index_ = common::MakeFixedVecWithMalloc(storage_size, std::uint8_t{0});
if (!all_dense_column) {
row_ind_.resize(feature_offsets_[nfeature]);
row_ind_ = common::MakeFixedVecWithMalloc(feature_offsets_[nfeature], std::size_t{0});
}
// store least bin id for each feature
@@ -59,7 +72,51 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
any_missing_ = !gmat.IsDense();
missing_flags_.clear();
missing_ = MissingIndicator{0, false};
}
} // namespace common
} // namespace xgboost
// IO procedures for external memory.
/**
 * @brief Load a ColumnMatrix from an aligned stream (external-memory path).
 *
 * Field order is the on-disk format and must mirror ColumnMatrix::Write
 * exactly: index, type, row_ind, feature_offsets, missing bits, then the
 * scalar bin-type size and the any-missing flag.
 *
 * @param fi         Stream to read from.
 * @param index_base Per-feature starting bin ids, owned by the caller; only
 *                   the pointer is stored here.
 * @return true on success, false as soon as any field fails to read.
 */
bool ColumnMatrix::Read(AlignedResourceReadStream* fi, uint32_t const* index_base) {
  if (!common::ReadVec(fi, &index_)) {
    return false;
  }
  if (!common::ReadVec(fi, &type_)) {
    return false;
  }
  if (!common::ReadVec(fi, &row_ind_)) {
    return false;
  }
  if (!common::ReadVec(fi, &feature_offsets_)) {
    return false;
  }
  if (!common::ReadVec(fi, &missing_.storage)) {
    return false;
  }
  // Rebuild the bit-field view over the freshly loaded storage.
  missing_.InitView();

  index_base_ = index_base;

  if (!fi->Read(&bins_type_size_)) {
    return false;
  }
  if (!fi->Read(&any_missing_)) {
    return false;
  }
  return true;
}
/**
 * @brief Serialize this ColumnMatrix to an aligned stream.
 *
 * Field order defines the on-disk format and must mirror ColumnMatrix::Read
 * exactly. Note that index_base_ is not written: Read takes it from the
 * caller instead.
 *
 * @param fo Stream to write to.
 * @return Total number of bytes written.
 */
std::size_t ColumnMatrix::Write(AlignedFileWriteStream* fo) const {
  std::size_t bytes{0};
  bytes += common::WriteVec(fo, index_);
  bytes += common::WriteVec(fo, type_);
  bytes += common::WriteVec(fo, row_ind_);
  bytes += common::WriteVec(fo, feature_offsets_);
  bytes += common::WriteVec(fo, missing_.storage);
  bytes += fo->Write(bins_type_size_);
  bytes += fo->Write(any_missing_);
  return bytes;
}
} // namespace xgboost::common

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2017-2022 by Contributors
/**
* Copyright 2017-2023, XGBoost Contributors
* \file column_matrix.h
* \brief Utility for fast column-wise access
* \author Philip Cho
@@ -8,25 +8,30 @@
#ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
#define XGBOOST_COMMON_COLUMN_MATRIX_H_
#include <dmlc/endian.h>
#include <algorithm>
#include <cstddef> // for size_t, byte
#include <cstdint> // for uint8_t
#include <limits>
#include <memory>
#include <utility> // std::move
#include <vector>
#include <type_traits> // for enable_if_t, is_same_v, is_signed_v
#include <utility> // for move
#include "../data/adapter.h"
#include "../data/gradient_index.h"
#include "algorithm.h"
#include "bitfield.h" // for RBitField8
#include "hist_util.h"
#include "ref_resource_view.h" // for RefResourceView
#include "xgboost/base.h" // for bst_bin_t
#include "xgboost/span.h" // for Span
namespace xgboost {
namespace common {
namespace xgboost::common {
class ColumnMatrix;
class AlignedFileWriteStream;
class AlignedResourceReadStream;
/*! \brief column type */
enum ColumnType : uint8_t { kDenseColumn, kSparseColumn };
enum ColumnType : std::uint8_t { kDenseColumn, kSparseColumn };
/*! \brief a column storage, to be used with ApplySplit. Note that each
bin id is stored as index[i] + index_base.
@@ -41,12 +46,12 @@ class Column {
: index_(index), index_base_(least_bin_idx) {}
virtual ~Column() = default;
bst_bin_t GetGlobalBinIdx(size_t idx) const {
[[nodiscard]] bst_bin_t GetGlobalBinIdx(size_t idx) const {
return index_base_ + static_cast<bst_bin_t>(index_[idx]);
}
/* returns number of elements in column */
size_t Size() const { return index_.size(); }
[[nodiscard]] size_t Size() const { return index_.size(); }
private:
/* bin indexes in range [0, max_bins - 1] */
@@ -63,7 +68,7 @@ class SparseColumnIter : public Column<BinIdxT> {
common::Span<const size_t> row_ind_;
size_t idx_;
size_t const* RowIndices() const { return row_ind_.data(); }
[[nodiscard]] size_t const* RowIndices() const { return row_ind_.data(); }
public:
SparseColumnIter(common::Span<const BinIdxT> index, bst_bin_t least_bin_idx,
@@ -81,7 +86,7 @@ class SparseColumnIter : public Column<BinIdxT> {
SparseColumnIter(SparseColumnIter const&) = delete;
SparseColumnIter(SparseColumnIter&&) = default;
size_t GetRowIdx(size_t idx) const { return RowIndices()[idx]; }
[[nodiscard]] size_t GetRowIdx(size_t idx) const { return RowIndices()[idx]; }
bst_bin_t operator[](size_t rid) {
const size_t column_size = this->Size();
if (!((idx_) < column_size)) {
@@ -101,25 +106,28 @@ class SparseColumnIter : public Column<BinIdxT> {
}
};
/**
* @brief Column stored as a dense vector. It might still contain missing values as
* indicated by the missing flags.
*/
template <typename BinIdxT, bool any_missing>
class DenseColumnIter : public Column<BinIdxT> {
public:
using ByteType = bool;
private:
using Base = Column<BinIdxT>;
/* flags for missing values in dense columns */
std::vector<ByteType> const& missing_flags_;
LBitField32 missing_flags_;
size_t feature_offset_;
public:
explicit DenseColumnIter(common::Span<const BinIdxT> index, bst_bin_t index_base,
std::vector<ByteType> const& missing_flags, size_t feature_offset)
LBitField32 missing_flags, size_t feature_offset)
: Base{index, index_base}, missing_flags_{missing_flags}, feature_offset_{feature_offset} {}
DenseColumnIter(DenseColumnIter const&) = delete;
DenseColumnIter(DenseColumnIter&&) = default;
bool IsMissing(size_t ridx) const { return missing_flags_[feature_offset_ + ridx]; }
[[nodiscard]] bool IsMissing(size_t ridx) const {
return missing_flags_.Check(feature_offset_ + ridx);
}
bst_bin_t operator[](size_t ridx) const {
if (any_missing) {
@@ -131,12 +139,64 @@ class DenseColumnIter : public Column<BinIdxT> {
};
/**
* \brief Column major matrix for gradient index. This matrix contains both dense column
* and sparse column, the type of the column is controlled by sparse threshold. When the
* number of missing values in a column is below the threshold it's classified as dense
* column.
* @brief Column major matrix for gradient index on CPU.
*
* This matrix contains both dense columns and sparse columns, the type of the column
* is controlled by the sparse threshold parameter. When the number of missing values
* in a column is below the threshold it's classified as dense column.
*/
class ColumnMatrix {
/**
* @brief A bit set for indicating whether an element in a dense column is missing.
*/
struct MissingIndicator {
  using BitFieldT = LBitField32;
  using T = typename BitFieldT::value_type;

  /** @brief Non-owning bit-field view over @ref storage; a set bit marks a missing element. */
  BitFieldT missing;
  /** @brief Owning buffer that backs the @ref missing view. */
  RefResourceView<T> storage;

  static_assert(std::is_same_v<T, std::uint32_t>);

  /** @brief Fill word for the storage: all ones (everything missing) or all zeros. */
  template <typename U>
  [[nodiscard]] std::enable_if_t<!std::is_signed_v<U>, U> static InitValue(bool init) {
    return init ? ~U{0} : U{0};
  }

  MissingIndicator() = default;
  /**
   * @param n_elements Size of the bit set
   * @param init Initialize the indicator to true or false.
   */
  MissingIndicator(std::size_t n_elements, bool init) {
    auto m_size = missing.ComputeStorageSize(n_elements);
    storage = common::MakeFixedVecWithMalloc(m_size, InitValue<T>(init));
    this->InitView();
  }
  /** @brief Set the i^th element to be a valid element (instead of missing). */
  void SetValid(typename LBitField32::index_type i) {
    // Re-enabled (was commented out as a temporary hack).  Leaving this a no-op
    // keeps every dense entry flagged as missing after it is written, which
    // breaks DenseColumnIter::IsMissing for valid elements.
    missing.Clear(i);
  }
  /** @brief assign the storage to the view. */
  void InitView() {
    missing = LBitField32{Span{storage.data(), storage.size()}};
  }

  /**
   * @brief Grow the bit set so it can hold at least `n_elements` bits.
   *
   * Only valid for in-memory (malloc-backed) storage; newly appended words are
   * filled according to `init`.
   */
  void GrowTo(std::size_t n_elements, bool init) {
    CHECK(storage.Resource()->Type() == ResourceHandler::kMalloc)
        << "[Internal Error]: Cannot grow the vector when external memory is used.";
    auto m_size = missing.ComputeStorageSize(n_elements);
    CHECK_GE(m_size, storage.size());
    if (m_size == storage.size()) {
      return;
    }
    // Grow the underlying resource, then rebuild the view: the buffer may have
    // been reallocated to a different address.
    auto resource = std::dynamic_pointer_cast<common::MallocResource>(storage.Resource());
    CHECK(resource);
    resource->Resize(m_size * sizeof(T), InitValue<std::byte>(init));
    storage = RefResourceView<T>{resource->DataAs<T>(), m_size, resource};
    this->InitView();
  }
};
void InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold);
template <typename ColumnBinT, typename BinT, typename RIdx>
@@ -144,9 +204,10 @@ class ColumnMatrix {
if (type_[fid] == kDenseColumn) {
ColumnBinT* begin = &local_index[feature_offsets_[fid]];
begin[rid] = bin_id - index_base_[fid];
// not thread-safe with bool vector. FIXME(jiamingy): We can directly assign
// kMissingId to the index to avoid missing flags.
missing_flags_[feature_offsets_[fid] + rid] = false;
// not thread-safe with bit field.
// FIXME(jiamingy): We can directly assign kMissingId to the index to avoid missing
// flags.
missing_.SetValid(feature_offsets_[fid] + rid);
} else {
ColumnBinT* begin = &local_index[feature_offsets_[fid]];
begin[num_nonzeros_[fid]] = bin_id - index_base_[fid];
@@ -156,9 +217,10 @@ class ColumnMatrix {
}
public:
using ByteType = bool;
// get number of features
bst_feature_t GetNumFeature() const { return static_cast<bst_feature_t>(type_.size()); }
[[nodiscard]] bst_feature_t GetNumFeature() const {
return static_cast<bst_feature_t>(type_.size());
}
ColumnMatrix() = default;
ColumnMatrix(GHistIndexMatrix const& gmat, double sparse_threshold) {
@@ -166,7 +228,7 @@ class ColumnMatrix {
}
/**
* \brief Initialize ColumnMatrix from GHistIndexMatrix with reference to the original
* @brief Initialize ColumnMatrix from GHistIndexMatrix with reference to the original
* SparsePage.
*/
void InitFromSparse(SparsePage const& page, const GHistIndexMatrix& gmat, double sparse_threshold,
@@ -178,8 +240,8 @@ class ColumnMatrix {
}
/**
* \brief Initialize ColumnMatrix from GHistIndexMatrix without reference to actual
* data.
* @brief Initialize ColumnMatrix from GHistIndexMatrix without reference to actual
* data.
*
* This function requires a binary search for each bin to get back the feature index
* for those bins.
@@ -199,7 +261,7 @@ class ColumnMatrix {
}
}
bool IsInitialized() const { return !type_.empty(); }
[[nodiscard]] bool IsInitialized() const { return !type_.empty(); }
/**
* \brief Push batch of data for Quantile DMatrix support.
@@ -257,7 +319,7 @@ class ColumnMatrix {
reinterpret_cast<const BinIdxType*>(&index_[feature_offset * bins_type_size_]),
column_size};
return std::move(DenseColumnIter<BinIdxType, any_missing>{
bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_flags_, feature_offset});
bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_.missing, feature_offset});
}
// all columns are dense column and has no missing value
@@ -265,7 +327,8 @@ class ColumnMatrix {
template <typename RowBinIdxT>
void SetIndexNoMissing(bst_row_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples,
const size_t n_features, int32_t n_threads) {
missing_flags_.resize(feature_offsets_[n_features], false);
missing_.GrowTo(feature_offsets_[n_features], false);
DispatchBinType(bins_type_size_, [&](auto t) {
using ColumnBinT = decltype(t);
auto column_index = Span<ColumnBinT>{reinterpret_cast<ColumnBinT*>(index_.data()),
@@ -290,9 +353,15 @@ class ColumnMatrix {
void SetIndexMixedColumns(size_t base_rowid, Batch const& batch, const GHistIndexMatrix& gmat,
float missing) {
auto n_features = gmat.Features();
missing_flags_.resize(feature_offsets_[n_features], true);
auto const* row_index = gmat.index.data<uint32_t>() + gmat.row_ptr[base_rowid];
num_nonzeros_.resize(n_features, 0);
missing_.GrowTo(feature_offsets_[n_features], true);
auto const* row_index = gmat.index.data<std::uint32_t>() + gmat.row_ptr[base_rowid];
if (num_nonzeros_.empty()) {
num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0});
} else {
CHECK_EQ(num_nonzeros_.size(), n_features);
}
auto is_valid = data::IsValidFunctor{missing};
DispatchBinType(bins_type_size_, [&](auto t) {
@@ -321,8 +390,9 @@ class ColumnMatrix {
*/
void SetIndexMixedColumns(const GHistIndexMatrix& gmat) {
auto n_features = gmat.Features();
missing_flags_.resize(feature_offsets_[n_features], true);
num_nonzeros_.resize(n_features, 0);
missing_ = MissingIndicator{feature_offsets_[n_features], true};
num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0});
DispatchBinType(bins_type_size_, [&](auto t) {
using ColumnBinT = decltype(t);
@@ -335,106 +405,35 @@ class ColumnMatrix {
});
}
BinTypeSize GetTypeSize() const { return bins_type_size_; }
auto GetColumnType(bst_feature_t fidx) const { return type_[fidx]; }
[[nodiscard]] BinTypeSize GetTypeSize() const { return bins_type_size_; }
[[nodiscard]] auto GetColumnType(bst_feature_t fidx) const { return type_[fidx]; }
// And this returns part of state
bool AnyMissing() const { return any_missing_; }
[[nodiscard]] bool AnyMissing() const { return any_missing_; }
// IO procedures for external memory.
bool Read(dmlc::SeekStream* fi, uint32_t const* index_base) {
fi->Read(&index_);
#if !DMLC_LITTLE_ENDIAN
// s390x
std::vector<std::underlying_type<ColumnType>::type> int_types;
fi->Read(&int_types);
type_.resize(int_types.size());
std::transform(
int_types.begin(), int_types.end(), type_.begin(),
[](std::underlying_type<ColumnType>::type i) { return static_cast<ColumnType>(i); });
#else
fi->Read(&type_);
#endif // !DMLC_LITTLE_ENDIAN
fi->Read(&row_ind_);
fi->Read(&feature_offsets_);
std::vector<std::uint8_t> missing;
fi->Read(&missing);
missing_flags_.resize(missing.size());
std::transform(missing.cbegin(), missing.cend(), missing_flags_.begin(),
[](std::uint8_t flag) { return !!flag; });
index_base_ = index_base;
#if !DMLC_LITTLE_ENDIAN
std::underlying_type<BinTypeSize>::type v;
fi->Read(&v);
bins_type_size_ = static_cast<BinTypeSize>(v);
#else
fi->Read(&bins_type_size_);
#endif
fi->Read(&any_missing_);
return true;
}
size_t Write(dmlc::Stream* fo) const {
size_t bytes{0};
auto write_vec = [&](auto const& vec) {
fo->Write(vec);
bytes += vec.size() * sizeof(typename std::remove_reference_t<decltype(vec)>::value_type) +
sizeof(uint64_t);
};
write_vec(index_);
#if !DMLC_LITTLE_ENDIAN
// s390x
std::vector<std::underlying_type<ColumnType>::type> int_types(type_.size());
std::transform(type_.begin(), type_.end(), int_types.begin(), [](ColumnType t) {
return static_cast<std::underlying_type<ColumnType>::type>(t);
});
write_vec(int_types);
#else
write_vec(type_);
#endif // !DMLC_LITTLE_ENDIAN
write_vec(row_ind_);
write_vec(feature_offsets_);
// dmlc can not handle bool vector
std::vector<std::uint8_t> missing(missing_flags_.size());
std::transform(missing_flags_.cbegin(), missing_flags_.cend(), missing.begin(),
[](bool flag) { return static_cast<std::uint8_t>(flag); });
write_vec(missing);
#if !DMLC_LITTLE_ENDIAN
auto v = static_cast<std::underlying_type<BinTypeSize>::type>(bins_type_size_);
fo->Write(v);
#else
fo->Write(bins_type_size_);
#endif // DMLC_LITTLE_ENDIAN
bytes += sizeof(bins_type_size_);
fo->Write(any_missing_);
bytes += sizeof(any_missing_);
return bytes;
}
[[nodiscard]] bool Read(AlignedResourceReadStream* fi, uint32_t const* index_base);
[[nodiscard]] std::size_t Write(AlignedFileWriteStream* fo) const;
[[nodiscard]] MissingIndicator const& Missing() const { return missing_; }
private:
std::vector<uint8_t> index_;
RefResourceView<std::uint8_t> index_;
std::vector<ColumnType> type_;
/* indptr of a CSC matrix. */
std::vector<size_t> row_ind_;
/* indicate where each column's index and row_ind is stored. */
std::vector<size_t> feature_offsets_;
/* The number of nnz of each column. */
std::vector<size_t> num_nonzeros_;
RefResourceView<ColumnType> type_;
/** @brief indptr of a CSC matrix. */
RefResourceView<std::size_t> row_ind_;
/** @brief indicate where each column's index and row_ind is stored. */
RefResourceView<std::size_t> feature_offsets_;
/** @brief The number of nnz of each column. */
RefResourceView<std::size_t> num_nonzeros_;
// index_base_[fid]: least bin id for feature fid
uint32_t const* index_base_;
std::vector<ByteType> missing_flags_;
std::uint32_t const* index_base_;
MissingIndicator missing_;
BinTypeSize bins_type_size_;
bool any_missing_;
};
} // namespace common
} // namespace xgboost
} // namespace xgboost::common
#endif // XGBOOST_COMMON_COLUMN_MATRIX_H_

View File

@@ -1,16 +1,17 @@
/*!
* Copyright 2015-2019 by Contributors
* \file common.cc
* \brief Enable all kinds of global variables in common.
/**
* Copyright 2015-2023 by Contributors
*/
#include <dmlc/thread_local.h>
#include <xgboost/logging.h>
#include "common.h"
#include "./random.h"
namespace xgboost {
namespace common {
#include <dmlc/thread_local.h> // for ThreadLocalStore
#include <cstdint> // for uint8_t
#include <cstdio> // for snprintf, size_t
#include <string> // for string
#include "./random.h" // for GlobalRandomEngine, GlobalRandom
namespace xgboost::common {
/*! \brief thread local entry for random. */
struct RandomThreadLocalEntry {
/*! \brief the random engine instance. */
@@ -19,15 +20,43 @@ struct RandomThreadLocalEntry {
using RandomThreadLocalStore = dmlc::ThreadLocalStore<RandomThreadLocalEntry>;
GlobalRandomEngine& GlobalRandom() {
return RandomThreadLocalStore::Get()->engine;
GlobalRandomEngine &GlobalRandom() { return RandomThreadLocalStore::Get()->engine; }
/**
 * @brief JSON-style escaping of a string.
 *
 * Appends the escaped representation of @p string to @p p_buffer (the buffer is
 * not cleared first).  A backslash immediately followed by 'u' is copied through
 * un-doubled so already-escaped "\uXXXX" sequences are preserved; all other
 * backslashes, quotes, and control characters are escaped.
 *
 * @param string   Input text to escape.
 * @param p_buffer Output buffer the escaped text is appended to.
 */
void EscapeU8(std::string const &string, std::string *p_buffer) {
  auto &buffer = *p_buffer;
  for (size_t i = 0; i < string.length(); i++) {
    const auto ch = string[i];
    if (ch == '\\') {
      // Bounds fix: the previous check (`i < string.size()`) is always true
      // inside the loop, so the lookahead indexed one past the last character
      // when the string ends with a backslash.
      if (i + 1 < string.size() && string[i + 1] == 'u') {
        buffer += "\\";
      } else {
        buffer += "\\\\";
      }
    } else if (ch == '"') {
      buffer += "\\\"";
    } else if (ch == '\b') {
      buffer += "\\b";
    } else if (ch == '\f') {
      buffer += "\\f";
    } else if (ch == '\n') {
      buffer += "\\n";
    } else if (ch == '\r') {
      buffer += "\\r";
    } else if (ch == '\t') {
      buffer += "\\t";
    } else if (static_cast<uint8_t>(ch) <= 0x1f) {
      // Remaining control characters (e.g. unit separator) become \u00XX.
      // Cast through uint8_t so a negative `char` never reaches %x as a
      // negative int ( %x requires an unsigned argument ).
      char buf[8];
      snprintf(buf, sizeof buf, "\\u%04x", static_cast<unsigned>(static_cast<uint8_t>(ch)));
      buffer += buf;
    } else {
      buffer += ch;
    }
  }
}
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
int AllVisibleGPUs() {
return 0;
}
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
int AllVisibleGPUs() { return 0; }
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -6,20 +6,19 @@
#ifndef XGBOOST_COMMON_COMMON_H_
#define XGBOOST_COMMON_COMMON_H_
#include <xgboost/base.h>
#include <xgboost/logging.h>
#include <xgboost/span.h>
#include <algorithm> // for max
#include <array> // for array
#include <cmath> // for ceil
#include <cstddef> // for size_t
#include <cstdint> // for int32_t, int64_t
#include <sstream> // for basic_istream, operator<<, istringstream
#include <string> // for string, basic_string, getline, char_traits
#include <tuple> // for make_tuple
#include <utility> // for forward, index_sequence, make_index_sequence
#include <vector> // for vector
#include <algorithm>
#include <exception>
#include <functional>
#include <limits>
#include <numeric>
#include <sstream>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
#include "xgboost/base.h" // for XGBOOST_DEVICE
#include "xgboost/logging.h" // for LOG, LOG_FATAL, LogMessageFatal
#if defined(__CUDACC__)
#include <thrust/system/cuda/error.h>
@@ -74,8 +73,7 @@ inline hipError_t ThrowOnCudaError(hipError_t code, const char *file, int line)
#endif
} // namespace dh
namespace xgboost {
namespace common {
namespace xgboost::common {
/*!
* \brief Split a string by delimiter
* \param s String to be split.
@@ -91,19 +89,13 @@ inline std::vector<std::string> Split(const std::string& s, char delim) {
return ret;
}
void EscapeU8(std::string const &string, std::string *p_buffer);
template <typename T>
XGBOOST_DEVICE T Max(T a, T b) {
return a < b ? b : a;
}
// simple routine to convert any data to string
template<typename T>
inline std::string ToString(const T& data) {
std::ostringstream os;
os << data;
return os.str();
}
template <typename T1, typename T2>
XGBOOST_DEVICE T1 DivRoundUp(const T1 a, const T2 b) {
return static_cast<T1>(std::ceil(static_cast<double>(a) / b));
@@ -217,6 +209,5 @@ template <typename Indexable>
XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
return indptr[group + 1] - 1;
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common
#endif // XGBOOST_COMMON_COMMON_H_

View File

@@ -482,7 +482,7 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
cub::CachingDeviceAllocator& GetGlobalCachingAllocator() {
// Configure allocator with maximum cached bin size of ~1GB and no limit on
// maximum cached bytes
static cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29);
thread_local cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29);
return *allocator;
}
pointer allocate(size_t n) { // NOLINT
@@ -1178,7 +1178,13 @@ inline void CUDAEvent::Record(CUDAStreamView stream) { // NOLINT
dh::safe_cuda(cudaEventRecord(event_, cudaStream_t{stream}));
}
inline CUDAStreamView DefaultStream() { return CUDAStreamView{cudaStreamLegacy}; }
inline CUDAStreamView DefaultStream() {
#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
return CUDAStreamView{cudaStreamPerThread};
#else
return CUDAStreamView{cudaStreamLegacy};
#endif
}
class CUDAStream {
cudaStream_t stream_;

70
src/common/error_msg.cc Normal file
View File

@@ -0,0 +1,70 @@
/**
* Copyright 2023 by XGBoost contributors
*/
#include "error_msg.h"
#include <mutex> // for call_once, once_flag
#include <sstream> // for stringstream
#include "../collective/communicator-inl.h" // for GetRank
#include "xgboost/context.h" // for Context
#include "xgboost/logging.h"
namespace xgboost::error {
/**
 * @brief Build a standard deprecation message.
 *
 * @param old         Name of the deprecated parameter/function.
 * @param since       Version in which it was deprecated.
 * @param replacement Name of the replacement the user should switch to.
 * @return The formatted warning text.
 */
std::string DeprecatedFunc(StringView old, StringView since, StringView replacement) {
  std::stringstream ss;
  // Keep the space after "since": without it the message rendered as
  // "is deprecated since2.0.0".
  ss << "`" << old << "` is deprecated since " << since << ", use `" << replacement
     << "` instead.";
  return ss.str();
}
// Warn that the `gpu_hist` tree method is deprecated and GPU training should be
// requested through the `device` parameter.  NOTE(review): unlike the other
// warnings in this file this one is not rate-limited with std::call_once, so it
// fires on every call -- confirm that is intended.
void WarnDeprecatedGPUHist() {
  // The message mixes an ordinary literal with a raw string so the example line
  // can contain unescaped double quotes.
  auto msg =
      "The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` "
      R"(parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
)";
  LOG(WARNING) << msg;
}
// Warn, at most once per process, that a manually specified `updater` overrides
// `tree_method`.
void WarnManualUpdater() {
  static std::once_flag warned;
  auto emit = [] {
    LOG(WARNING) << "You have manually specified the `updater` parameter. The `tree_method` "
                    "parameter will be ignored. Incorrect sequence of updaters will produce "
                    "undefined behavior. For common uses, we recommend using `tree_method` "
                    "parameter instead.";
  };
  std::call_once(warned, emit);
}
// Warn, at most once per process, that `gpu_id` is deprecated in favour of the
// `device` parameter.
void WarnDeprecatedGPUId() {
  static std::once_flag warned;
  std::call_once(warned, [] {
    std::string message = DeprecatedFunc("gpu_id", "2.0.0", "device");
    message.append(" E.g. device=cpu/cuda/cuda:0");
    LOG(WARNING) << message;
  });
}
// Warn, at most once per process, that this worker received an empty dataset.
// The rank identifies which worker saw no data.
void WarnEmptyDataset() {
  static std::once_flag warned;
  auto log_empty = [] { LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank(); };
  std::call_once(warned, log_empty);
}
// Warn (once per process) that prediction fell back to DMatrix because the
// booster and the input data live on different devices.
void MismatchedDevices(Context const* booster, Context const* data) {
  // std::call_once: inplace prediction can be invoked per batch, and repeating
  // this message would flood the logs.
  static std::once_flag flag;
  std::call_once(flag, [&] {
    LOG(WARNING)
        << "Falling back to prediction using DMatrix due to mismatched devices. This might "
           "lead to higher memory usage and slower performance. XGBoost is running on: "
        << booster->DeviceName() << ", while the input data is on: " << data->DeviceName() << ".\n"
        << R"(Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.
This warning will only be shown once.
)";
  });
}
} // namespace xgboost::error

View File

@@ -6,6 +6,13 @@
#ifndef XGBOOST_COMMON_ERROR_MSG_H_
#define XGBOOST_COMMON_ERROR_MSG_H_
#include <cinttypes> // for uint64_t
#include <limits> // for numeric_limits
#include <string> // for string
#include "xgboost/base.h" // for bst_feature_t
#include "xgboost/context.h" // for Context
#include "xgboost/logging.h"
#include "xgboost/string_view.h" // for StringView
namespace xgboost::error {
@@ -33,5 +40,62 @@ constexpr StringView InconsistentMaxBin() {
return "Inconsistent `max_bin`. `max_bin` should be the same across different QuantileDMatrix, "
"and consistent with the Booster being trained.";
}
constexpr StringView UnknownDevice() { return "Unknown device type."; }
/**
 * @brief Validate that the number of features is representable by `bst_feature_t`.
 *
 * Aborts via CHECK when @p n_features exceeds the supported maximum.
 */
inline void MaxFeatureSize(std::uint64_t n_features) {
  // Hoist the limit into a local and reuse it in the message so the check and
  // the reported number cannot drift apart.
  auto max_n_features = std::numeric_limits<bst_feature_t>::max();
  CHECK_LE(n_features, max_n_features)
      << "Unfortunately, XGBoost does not support data matrices with " << max_n_features
      << " features or greater";
}
constexpr StringView InplacePredictProxy() {
return "Inplace predict accepts only DMatrixProxy as input.";
}
/**
 * @brief Abort with a helpful message when the sample size exceeds what the
 *        current updater supports.
 *
 * @param n Maximum number of samples the updater can handle.
 */
inline void MaxSampleSize(std::size_t n) {
  // Space after the colon so the message reads "samples: 123", not "samples:123".
  LOG(FATAL) << "Sample size too large for the current updater. Maximum number of samples: " << n
             << ". Consider using a different updater or tree_method.";
}
constexpr StringView OldSerialization() {
return R"doc(If you are loading a serialized model (like pickle in Python, RDS in R) or
configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:
https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html
for more details about differences between saving model and serializing.
)doc";
}
// Warn about loading an old-style serialized model.
inline void WarnOldSerialization() {
  // Emitting the notice once per thread is enough; repeating it would be very
  // verbose in distributed environments.
  static thread_local bool already_warned{false};
  if (!already_warned) {
    LOG(WARNING) << OldSerialization();
    already_warned = true;
  }
}
void WarnDeprecatedGPUHist();
void WarnManualUpdater();
void WarnDeprecatedGPUId();
void WarnEmptyDataset();
std::string DeprecatedFunc(StringView old, StringView since, StringView replacement);
constexpr StringView InvalidCUDAOrdinal() {
return "Invalid device. `device` is required to be CUDA and there must be at least one GPU "
"available for using GPU.";
}
void MismatchedDevices(Context const* booster, Context const* data);
} // namespace xgboost::error
#endif // XGBOOST_COMMON_ERROR_MSG_H_

View File

@@ -8,12 +8,12 @@
#include <vector>
#include "../common/common.h"
#include "column_matrix.h"
#include "../data/adapter.h" // for SparsePageAdapterBatch
#include "../data/gradient_index.h" // for GHistIndexMatrix
#include "quantile.h"
#include "xgboost/base.h"
#include "xgboost/context.h" // Context
#include "xgboost/data.h" // SparsePage, SortedCSCPage
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for SparsePage, SortedCSCPage
#if defined(XGBOOST_MM_PREFETCH_PRESENT)
#include <xmmintrin.h>
@@ -24,15 +24,13 @@
#define PREFETCH_READ_T0(addr) do {} while (0)
#endif // defined(XGBOOST_MM_PREFETCH_PRESENT)
namespace xgboost {
namespace common {
namespace xgboost::common {
HistogramCuts::HistogramCuts() {
cut_ptrs_.HostVector().emplace_back(0);
}
HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins, bool use_sorted,
Span<float> const hessian) {
Span<float const> hessian) {
HistogramCuts out;
auto const &info = m->Info();
auto n_threads = ctx->Threads();
@@ -69,25 +67,14 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins
return out;
}
/*!
* \brief fill a histogram by zeros in range [begin, end)
*/
void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end) {
#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
std::fill(hist.begin() + begin, hist.begin() + end, xgboost::GradientPairPrecise());
#else // defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
memset(hist.data() + begin, '\0', (end - begin) * sizeof(xgboost::GradientPairPrecise));
#endif // defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
}
/*!
* \brief Increment hist as dst += add in range [begin, end)
*/
void IncrementHist(GHistRow dst, const GHistRow add, size_t begin, size_t end) {
double* pdst = reinterpret_cast<double*>(dst.data());
void IncrementHist(GHistRow dst, ConstGHistRow add, std::size_t begin, std::size_t end) {
double *pdst = reinterpret_cast<double *>(dst.data());
const double *padd = reinterpret_cast<const double *>(add.data());
for (size_t i = 2 * begin; i < 2 * end; ++i) {
for (std::size_t i = 2 * begin; i < 2 * end; ++i) {
pdst[i] += padd[i];
}
}
@@ -209,18 +196,23 @@ void RowsWiseBuildHistKernel(Span<GradientPair const> gpair,
const size_t size = row_indices.Size();
const size_t *rid = row_indices.begin;
auto const *pgh = reinterpret_cast<const float *>(gpair.data());
auto const *p_gpair = reinterpret_cast<const float *>(gpair.data());
const BinIdxType *gradient_index = gmat.index.data<BinIdxType>();
auto const &row_ptr = gmat.row_ptr.data();
auto base_rowid = gmat.base_rowid;
const uint32_t *offsets = gmat.index.Offset();
auto get_row_ptr = [&](size_t ridx) {
uint32_t const *offsets = gmat.index.Offset();
// There's no feature-based compression if missing value is present.
if (kAnyMissing) {
CHECK(!offsets);
} else {
CHECK(offsets);
}
auto get_row_ptr = [&](bst_row_t ridx) {
return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
};
auto get_rid = [&](size_t ridx) {
return kFirstPage ? ridx : (ridx - base_rowid);
};
auto get_rid = [&](bst_row_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };
const size_t n_features =
get_row_ptr(row_indices.begin[0] + 1) - get_row_ptr(row_indices.begin[0]);
@@ -230,7 +222,7 @@ void RowsWiseBuildHistKernel(Span<GradientPair const> gpair,
// So we need to multiply each row-index/bin-index by 2
// to work with gradient pairs as a singe row FP array
for (size_t i = 0; i < size; ++i) {
for (std::size_t i = 0; i < size; ++i) {
const size_t icol_start =
kAnyMissing ? get_row_ptr(rid[i]) : get_rid(rid[i]) * n_features;
const size_t icol_end =
@@ -248,7 +240,7 @@ void RowsWiseBuildHistKernel(Span<GradientPair const> gpair,
kAnyMissing ? get_row_ptr(rid[i + Prefetch::kPrefetchOffset] + 1)
: icol_start_prefetch + n_features;
PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
PREFETCH_READ_T0(p_gpair + two * rid[i + Prefetch::kPrefetchOffset]);
for (size_t j = icol_start_prefetch; j < icol_end_prefetch;
j += Prefetch::GetPrefetchStep<uint32_t>()) {
PREFETCH_READ_T0(gradient_index + j);
@@ -257,12 +249,12 @@ void RowsWiseBuildHistKernel(Span<GradientPair const> gpair,
const BinIdxType *gr_index_local = gradient_index + icol_start;
// The trick with pgh_t buffer helps the compiler to generate faster binary.
const float pgh_t[] = {pgh[idx_gh], pgh[idx_gh + 1]};
const float pgh_t[] = {p_gpair[idx_gh], p_gpair[idx_gh + 1]};
for (size_t j = 0; j < row_size; ++j) {
const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[j]) +
(kAnyMissing ? 0 : offsets[j]));
const uint32_t idx_bin =
two * (static_cast<uint32_t>(gr_index_local[j]) + (kAnyMissing ? 0 : offsets[j]));
auto hist_local = hist_data + idx_bin;
*(hist_local) += pgh_t[0];
*(hist_local) += pgh_t[0];
*(hist_local + 1) += pgh_t[1];
}
}
@@ -283,12 +275,10 @@ void ColsWiseBuildHistKernel(Span<GradientPair const> gpair,
auto const &row_ptr = gmat.row_ptr.data();
auto base_rowid = gmat.base_rowid;
const uint32_t *offsets = gmat.index.Offset();
auto get_row_ptr = [&](size_t ridx) {
auto get_row_ptr = [&](bst_row_t ridx) {
return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
};
auto get_rid = [&](size_t ridx) {
return kFirstPage ? ridx : (ridx - base_rowid);
};
auto get_rid = [&](bst_row_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };
const size_t n_features = gmat.cut.Ptrs().size() - 1;
const size_t n_columns = n_features;
@@ -350,9 +340,8 @@ void BuildHistDispatch(Span<GradientPair const> gpair, const RowSetCollection::E
}
template <bool any_missing>
void GHistBuilder::BuildHist(Span<GradientPair const> gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist, bool force_read_by_column) const {
void BuildHist(Span<GradientPair const> gpair, const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat, GHistRow hist, bool force_read_by_column) {
/* force_read_by_column is used for testing the columnwise building of histograms.
* default force_read_by_column = false
*/
@@ -369,14 +358,13 @@ void GHistBuilder::BuildHist(Span<GradientPair const> gpair,
});
}
template void GHistBuilder::BuildHist<true>(Span<GradientPair const> gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat, GHistRow hist,
bool force_read_by_column) const;
template void BuildHist<true>(Span<GradientPair const> gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat, GHistRow hist,
bool force_read_by_column);
template void GHistBuilder::BuildHist<false>(Span<GradientPair const> gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat, GHistRow hist,
bool force_read_by_column) const;
} // namespace common
} // namespace xgboost
template void BuildHist<false>(Span<GradientPair const> gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat, GHistRow hist,
bool force_read_by_column);
} // namespace xgboost::common

View File

@@ -19,16 +19,14 @@
#include <vector>
#include "categorical.h"
#include "cuda_context.cuh" // for CUDAContext
#include "device_helpers.cuh"
#include "hist_util.cuh"
#include "hist_util.h"
#include "math.h" // NOLINT
#include "quantile.h"
#include "xgboost/host_device_vector.h"
namespace xgboost {
namespace common {
namespace xgboost::common {
constexpr float SketchContainer::kFactor;
namespace detail {
@@ -87,13 +85,13 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
return peak;
}
size_t SketchBatchNumElements(size_t sketch_batch_num_elements,
bst_row_t num_rows, bst_feature_t columns,
size_t nnz, int device,
size_t num_cuts, bool has_weight) {
size_t SketchBatchNumElements(size_t sketch_batch_num_elements, bst_row_t num_rows,
bst_feature_t columns, size_t nnz, int device, size_t num_cuts,
bool has_weight) {
auto constexpr kIntMax = static_cast<std::size_t>(std::numeric_limits<std::int32_t>::max());
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
// device available memory is not accurate when rmm is used.
return nnz;
return std::min(nnz, kIntMax);
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
if (sketch_batch_num_elements == 0) {
@@ -106,256 +104,279 @@ size_t SketchBatchNumElements(size_t sketch_batch_num_elements,
sketch_batch_num_elements = std::min(num_rows * static_cast<size_t>(columns), nnz);
}
}
return sketch_batch_num_elements;
return std::min(sketch_batch_num_elements, kIntMax);
}
void SortByWeight(dh::device_vector<float>* weights,
dh::device_vector<Entry>* sorted_entries) {
void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* sorted_entries) {
// Sort both entries and wegihts.
dh::XGBDeviceAllocator<char> alloc;
CHECK_EQ(weights->size(), sorted_entries->size());
#if defined(XGBOOST_USE_CUDA)
thrust::sort_by_key(thrust::cuda::par(alloc), sorted_entries->begin(),
sorted_entries->end(), weights->begin(),
detail::EntryCompareOp());
#elif defined(XGBOOST_USE_HIP)
thrust::sort_by_key(thrust::hip::par(alloc), sorted_entries->begin(),
sorted_entries->end(), weights->begin(),
detail::EntryCompareOp());
#endif
thrust::sort_by_key(thrust::cuda::par(alloc), sorted_entries->begin(), sorted_entries->end(),
weights->begin(), detail::EntryCompareOp());
// Scan weights
dh::XGBCachingDeviceAllocator<char> caching;
#if defined(XGBOOST_USE_CUDA)
thrust::inclusive_scan_by_key(thrust::cuda::par(caching),
sorted_entries->begin(), sorted_entries->end(),
weights->begin(), weights->begin(),
[=] __device__(const Entry& a, const Entry& b) {
return a.index == b.index;
});
thrust::inclusive_scan_by_key(
thrust::cuda::par(caching), sorted_entries->begin(), sorted_entries->end(), weights->begin(),
weights->begin(),
[=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
#elif defined(XGBOOST_USE_HIP)
thrust::inclusive_scan_by_key(thrust::hip::par(caching),
sorted_entries->begin(), sorted_entries->end(),
weights->begin(), weights->begin(),
[=] __device__(const Entry& a, const Entry& b) {
return a.index == b.index;
});
thrust::sort_by_key(thrust::hip::par(alloc), sorted_entries->begin(), sorted_entries->end(),
weights->begin(), detail::EntryCompareOp());
// Scan weights
dh::XGBCachingDeviceAllocator<char> caching;
thrust::inclusive_scan_by_key(
thrust::hip::par(caching), sorted_entries->begin(), sorted_entries->end(), weights->begin(),
weights->begin(),
[=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
#endif
}
void RemoveDuplicatedCategories(
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry> *p_sorted_entries,
dh::caching_device_vector<size_t> *p_column_sizes_scan) {
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan) {
info.feature_types.SetDevice(device);
auto d_feature_types = info.feature_types.ConstDeviceSpan();
CHECK(!d_feature_types.empty());
auto &column_sizes_scan = *p_column_sizes_scan;
auto &sorted_entries = *p_sorted_entries;
auto& column_sizes_scan = *p_column_sizes_scan;
auto& sorted_entries = *p_sorted_entries;
// Removing duplicated entries in categorical features.
// We don't need to accumulate weight for duplicated entries as there's no weighted
// sketching for categorical features, the categories are the cut values.
dh::caching_device_vector<size_t> new_column_scan(column_sizes_scan.size());
dh::SegmentedUnique(column_sizes_scan.data().get(),
column_sizes_scan.data().get() + column_sizes_scan.size(),
sorted_entries.begin(), sorted_entries.end(),
new_column_scan.data().get(), sorted_entries.begin(),
[=] __device__(Entry const &l, Entry const &r) {
if (l.index == r.index) {
if (IsCat(d_feature_types, l.index)) {
return l.fvalue == r.fvalue;
}
}
return false;
});
std::size_t n_uniques{0};
if (p_sorted_weights) {
using Pair = thrust::tuple<Entry, float>;
auto d_sorted_entries = dh::ToSpan(sorted_entries);
auto d_sorted_weights = dh::ToSpan(*p_sorted_weights);
auto val_in_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data());
auto val_out_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data());
n_uniques = dh::SegmentedUnique(
column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(),
val_in_it, val_in_it + sorted_entries.size(), new_column_scan.data().get(), val_out_it,
[=] __device__(Pair const& l, Pair const& r) {
Entry const& le = thrust::get<0>(l);
Entry const& re = thrust::get<0>(r);
if (le.index == re.index && IsCat(d_feature_types, le.index)) {
return le.fvalue == re.fvalue;
}
return false;
});
p_sorted_weights->resize(n_uniques);
} else {
n_uniques = dh::SegmentedUnique(
column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(),
sorted_entries.begin(), sorted_entries.end(), new_column_scan.data().get(),
sorted_entries.begin(), [=] __device__(Entry const& l, Entry const& r) {
if (l.index == r.index) {
if (IsCat(d_feature_types, l.index)) {
return l.fvalue == r.fvalue;
}
}
return false;
});
}
sorted_entries.resize(n_uniques);
// Renew the column scan and cut scan based on categorical data.
auto d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan);
dh::caching_device_vector<SketchContainer::OffsetT> new_cuts_size(
info.num_col_ + 1);
dh::caching_device_vector<SketchContainer::OffsetT> new_cuts_size(info.num_col_ + 1);
CHECK_EQ(new_column_scan.size(), new_cuts_size.size());
dh::LaunchN(
new_column_scan.size(),
[=, d_new_cuts_size = dh::ToSpan(new_cuts_size),
d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan),
d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) {
d_old_column_sizes_scan[idx] = d_new_columns_ptr[idx];
if (idx == d_new_columns_ptr.size() - 1) {
return;
}
if (IsCat(d_feature_types, idx)) {
// Cut size is the same as number of categories in input.
d_new_cuts_size[idx] =
d_new_columns_ptr[idx + 1] - d_new_columns_ptr[idx];
} else {
d_new_cuts_size[idx] = d_cuts_ptr[idx + 1] - d_cuts_ptr[idx];
}
});
dh::LaunchN(new_column_scan.size(),
[=, d_new_cuts_size = dh::ToSpan(new_cuts_size),
d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan),
d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) {
d_old_column_sizes_scan[idx] = d_new_columns_ptr[idx];
if (idx == d_new_columns_ptr.size() - 1) {
return;
}
if (IsCat(d_feature_types, idx)) {
// Cut size is the same as number of categories in input.
d_new_cuts_size[idx] = d_new_columns_ptr[idx + 1] - d_new_columns_ptr[idx];
} else {
d_new_cuts_size[idx] = d_cuts_ptr[idx + 1] - d_cuts_ptr[idx];
}
});
// Turn size into ptr.
thrust::exclusive_scan(thrust::device, new_cuts_size.cbegin(),
new_cuts_size.cend(), d_cuts_ptr.data());
thrust::exclusive_scan(thrust::device, new_cuts_size.cbegin(), new_cuts_size.cend(),
d_cuts_ptr.data());
}
} // namespace detail
void ProcessBatch(int device, MetaInfo const &info, const SparsePage &page,
size_t begin, size_t end, SketchContainer *sketch_container,
int num_cuts_per_feature, size_t num_columns) {
dh::XGBCachingDeviceAllocator<char> alloc;
void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo const& info,
std::size_t begin, std::size_t end,
SketchContainer* sketch_container, // <- output sketch
int num_cuts_per_feature, common::Span<float const> sample_weight) {
dh::device_vector<Entry> sorted_entries;
if (page.data.DeviceCanRead()) {
const auto& device_data = page.data.ConstDevicePointer();
sorted_entries = dh::device_vector<Entry>(device_data + begin, device_data + end);
// direct copy if data is already on device
auto const& d_data = page.data.ConstDevicePointer();
sorted_entries = dh::device_vector<Entry>(d_data + begin, d_data + end);
} else {
const auto& host_data = page.data.ConstHostVector();
sorted_entries = dh::device_vector<Entry>(host_data.begin() + begin,
host_data.begin() + end);
const auto& h_data = page.data.ConstHostVector();
sorted_entries = dh::device_vector<Entry>(h_data.begin() + begin, h_data.begin() + end);
}
#if defined(XGBOOST_USE_CUDA)
thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
sorted_entries.end(), detail::EntryCompareOp());
#elif defined(XGBOOST_USE_HIP)
thrust::sort(thrust::hip::par(alloc), sorted_entries.begin(),
sorted_entries.end(), detail::EntryCompareOp());
#endif
bst_row_t base_rowid = page.base_rowid;
dh::device_vector<float> entry_weight;
auto cuctx = ctx->CUDACtx();
if (!sample_weight.empty()) {
// Expand sample weight into entry weight.
CHECK_EQ(sample_weight.size(), info.num_row_);
entry_weight.resize(sorted_entries.size());
auto d_temp_weight = dh::ToSpan(entry_weight);
page.offset.SetDevice(ctx->Device());
auto row_ptrs = page.offset.ConstDeviceSpan();
thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), entry_weight.size(),
[=] __device__(std::size_t idx) {
std::size_t element_idx = idx + begin;
std::size_t ridx = dh::SegmentId(row_ptrs, element_idx);
d_temp_weight[idx] = sample_weight[ridx + base_rowid];
});
detail::SortByWeight(&entry_weight, &sorted_entries);
} else {
thrust::sort(cuctx->CTP(), sorted_entries.begin(), sorted_entries.end(),
detail::EntryCompareOp());
}
HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
dh::caching_device_vector<size_t> column_sizes_scan;
data::IsValidFunctor dummy_is_valid(std::numeric_limits<float>::quiet_NaN());
auto batch_it = dh::MakeTransformIterator<data::COOTuple>(
sorted_entries.data().get(),
[] __device__(Entry const &e) -> data::COOTuple {
return {0, e.index, e.fvalue}; // row_idx is not needed for scanning column size.
sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple {
return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size.
});
detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature,
detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature,
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
if (sketch_container->HasCategorical()) {
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
&sorted_entries, &column_sizes_scan);
auto p_weight = entry_weight.empty() ? nullptr : &entry_weight;
detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight,
&column_sizes_scan);
}
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
CHECK_EQ(d_cuts_ptr.size(), column_sizes_scan.size());
// add cuts into sketches
sketch_container->Push(dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan),
d_cuts_ptr, h_cuts_ptr.back());
// Add cuts into sketches
sketch_container->Push(dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr,
h_cuts_ptr.back(), dh::ToSpan(entry_weight));
sorted_entries.clear();
sorted_entries.shrink_to_fit();
CHECK_EQ(sorted_entries.capacity(), 0);
CHECK_NE(cuts_ptr.Size(), 0);
}
void ProcessWeightedBatch(int device, const SparsePage& page,
MetaInfo const& info, size_t begin, size_t end,
SketchContainer* sketch_container, int num_cuts_per_feature,
size_t num_columns,
bool is_ranking, Span<bst_group_t const> d_group_ptr) {
auto weights = info.weights_.ConstDeviceSpan();
// Unify group weight, Hessian, and sample weight into sample weight.
[[nodiscard]] Span<float const> UnifyWeight(CUDAContext const* cuctx, MetaInfo const& info,
common::Span<float const> hessian,
HostDeviceVector<float>* p_out_weight) {
if (hessian.empty()) {
if (info.IsRanking() && !info.weights_.Empty()) {
common::Span<float const> group_weight = info.weights_.ConstDeviceSpan();
dh::device_vector<bst_group_t> group_ptr(info.group_ptr_);
auto d_group_ptr = dh::ToSpan(group_ptr);
CHECK_GE(d_group_ptr.size(), 2) << "Must have at least 1 group for ranking.";
auto d_weight = info.weights_.ConstDeviceSpan();
CHECK_EQ(d_weight.size(), d_group_ptr.size() - 1)
<< "Weight size should equal to number of groups.";
p_out_weight->Resize(info.num_row_);
auto d_weight_out = p_out_weight->DeviceSpan();
dh::XGBCachingDeviceAllocator<char> alloc;
const auto& host_data = page.data.ConstHostVector();
dh::device_vector<Entry> sorted_entries(host_data.begin() + begin,
host_data.begin() + end);
// Binary search to assign weights to each element
dh::device_vector<float> temp_weights(sorted_entries.size());
auto d_temp_weights = temp_weights.data().get();
page.offset.SetDevice(device);
auto row_ptrs = page.offset.ConstDeviceSpan();
size_t base_rowid = page.base_rowid;
if (is_ranking) {
CHECK_GE(d_group_ptr.size(), 2)
<< "Must have at least 1 group for ranking.";
CHECK_EQ(weights.size(), d_group_ptr.size() - 1)
<< "Weight size should equal to number of groups.";
dh::LaunchN(temp_weights.size(), [=] __device__(size_t idx) {
size_t element_idx = idx + begin;
size_t ridx = dh::SegmentId(row_ptrs, element_idx);
bst_group_t group_idx = dh::SegmentId(d_group_ptr, ridx + base_rowid);
d_temp_weights[idx] = weights[group_idx];
});
} else {
dh::LaunchN(temp_weights.size(), [=] __device__(size_t idx) {
size_t element_idx = idx + begin;
size_t ridx = dh::SegmentId(row_ptrs, element_idx);
d_temp_weights[idx] = weights[ridx + base_rowid];
});
}
detail::SortByWeight(&temp_weights, &sorted_entries);
HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
dh::caching_device_vector<size_t> column_sizes_scan;
data::IsValidFunctor dummy_is_valid(std::numeric_limits<float>::quiet_NaN());
auto batch_it = dh::MakeTransformIterator<data::COOTuple>(
sorted_entries.data().get(),
[] __device__(Entry const &e) -> data::COOTuple {
return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size.
});
detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature,
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
if (sketch_container->HasCategorical()) {
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
&sorted_entries, &column_sizes_scan);
}
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
// Extract cuts
sketch_container->Push(dh::ToSpan(sorted_entries),
dh::ToSpan(column_sizes_scan), d_cuts_ptr,
h_cuts_ptr.back(), dh::ToSpan(temp_weights));
sorted_entries.clear();
sorted_entries.shrink_to_fit();
}
HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
size_t sketch_batch_num_elements) {
dmat->Info().feature_types.SetDevice(device);
dmat->Info().feature_types.ConstDevicePointer(); // pull to device early
// Configure batch size based on available memory
bool has_weights = dmat->Info().weights_.Size() > 0;
size_t num_cuts_per_feature =
detail::RequiredSampleCutsPerColumn(max_bins, dmat->Info().num_row_);
sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements,
dmat->Info().num_row_,
dmat->Info().num_col_,
dmat->Info().num_nonzero_,
device, num_cuts_per_feature, has_weights);
HistogramCuts cuts;
SketchContainer sketch_container(dmat->Info().feature_types, max_bins, dmat->Info().num_col_,
dmat->Info().num_row_, device);
dmat->Info().weights_.SetDevice(device);
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
size_t batch_nnz = batch.data.Size();
auto const& info = dmat->Info();
for (auto begin = 0ull; begin < batch_nnz; begin += sketch_batch_num_elements) {
size_t end = std::min(batch_nnz, static_cast<std::size_t>(begin + sketch_batch_num_elements));
if (has_weights) {
bool is_ranking = HostSketchContainer::UseGroup(dmat->Info());
dh::caching_device_vector<uint32_t> groups(info.group_ptr_.cbegin(),
info.group_ptr_.cend());
ProcessWeightedBatch(
device, batch, dmat->Info(), begin, end,
&sketch_container,
num_cuts_per_feature,
dmat->Info().num_col_,
is_ranking, dh::ToSpan(groups));
} else {
ProcessBatch(device, dmat->Info(), batch, begin, end, &sketch_container,
num_cuts_per_feature, dmat->Info().num_col_);
}
thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), d_weight_out.size(),
[=] XGBOOST_DEVICE(std::size_t i) {
auto gidx = dh::SegmentId(d_group_ptr, i);
d_weight_out[i] = d_weight[gidx];
});
return p_out_weight->ConstDeviceSpan();
} else {
return info.weights_.ConstDeviceSpan();
}
}
sketch_container.MakeCuts(&cuts);
// sketch with hessian as weight
p_out_weight->Resize(info.num_row_);
auto d_weight_out = p_out_weight->DeviceSpan();
if (!info.weights_.Empty()) {
// merge sample weight with hessian
auto d_weight = info.weights_.ConstDeviceSpan();
if (info.IsRanking()) {
dh::device_vector<bst_group_t> group_ptr(info.group_ptr_);
CHECK_EQ(hessian.size(), d_weight_out.size());
auto d_group_ptr = dh::ToSpan(group_ptr);
CHECK_GE(d_group_ptr.size(), 2) << "Must have at least 1 group for ranking.";
CHECK_EQ(d_weight.size(), d_group_ptr.size() - 1)
<< "Weight size should equal to number of groups.";
thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), hessian.size(),
[=] XGBOOST_DEVICE(std::size_t i) {
d_weight_out[i] = d_weight[dh::SegmentId(d_group_ptr, i)] * hessian(i);
});
} else {
CHECK_EQ(hessian.size(), info.num_row_);
CHECK_EQ(hessian.size(), d_weight.size());
CHECK_EQ(hessian.size(), d_weight_out.size());
thrust::for_each_n(
cuctx->CTP(), thrust::make_counting_iterator(0ul), hessian.size(),
[=] XGBOOST_DEVICE(std::size_t i) { d_weight_out[i] = d_weight[i] * hessian(i); });
}
} else {
// copy hessian as weight
CHECK_EQ(d_weight_out.size(), hessian.size());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
hipMemcpyDefault));
#endif
}
return d_weight_out;
}
HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin,
Span<float const> hessian,
std::size_t sketch_batch_num_elements) {
auto const& info = p_fmat->Info();
bool has_weight = !info.weights_.Empty();
info.feature_types.SetDevice(ctx->Device());
HostDeviceVector<float> weight;
weight.SetDevice(ctx->Device());
// Configure batch size based on available memory
std::size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(max_bin, info.num_row_);
sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements, info.num_row_, info.num_col_, info.num_nonzero_, ctx->Ordinal(),
num_cuts_per_feature, has_weight);
CUDAContext const* cuctx = ctx->CUDACtx();
info.weights_.SetDevice(ctx->Device());
auto d_weight = UnifyWeight(cuctx, info, hessian, &weight);
HistogramCuts cuts;
SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_,
ctx->Ordinal());
CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty());
for (const auto& page : p_fmat->GetBatches<SparsePage>()) {
std::size_t page_nnz = page.data.Size();
for (auto begin = 0ull; begin < page_nnz; begin += sketch_batch_num_elements) {
std::size_t end =
std::min(page_nnz, static_cast<std::size_t>(begin + sketch_batch_num_elements));
ProcessWeightedBatch(ctx, page, info, begin, end, &sketch_container, num_cuts_per_feature,
d_weight);
}
}
sketch_container.MakeCuts(&cuts, p_fmat->Info().IsColumnSplit());
return cuts;
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -11,18 +11,17 @@
#include <cstddef> // for size_t
#include "../data/device_adapter.cuh"
#include "../data/adapter.h" // for IsValidFunctor
#include "device_helpers.cuh"
#include "hist_util.h"
#include "quantile.cuh"
#include "timer.h"
#include "xgboost/span.h" // for IterSpan
#if defined(XGBOOST_USE_HIP)
namespace cub = hipcub;
#endif
namespace xgboost {
namespace common {
namespace xgboost::common {
namespace cuda {
/**
* copy and paste of the host version, we can't make it a __host__ __device__ function as
@@ -148,12 +147,12 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter
CHECK(!force_use_u64);
auto kernel = GetColumnSizeSharedMemKernel<kBlockThreads, std::uint32_t, BatchIt>;
auto grid_size = EstimateGridSize<kBlockThreads>(device, kernel, required_shared_memory);
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}(
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}(
kernel, batch_iter, is_valid, out_column_size);
} else {
auto kernel = GetColumnSizeSharedMemKernel<kBlockThreads, std::size_t, BatchIt>;
auto grid_size = EstimateGridSize<kBlockThreads>(device, kernel, required_shared_memory);
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}(
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}(
kernel, batch_iter, is_valid, out_column_size);
}
} else {
@@ -262,16 +261,41 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
void SortByWeight(dh::device_vector<float>* weights,
dh::device_vector<Entry>* sorted_entries);
void RemoveDuplicatedCategories(
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry> *p_sorted_entries,
dh::caching_device_vector<size_t> *p_column_sizes_scan);
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan);
} // namespace detail
// Compute sketch on DMatrix.
// sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
size_t sketch_batch_num_elements = 0);
/**
* @brief Compute sketch on DMatrix with GPU and Hessian as weight.
*
* @param ctx Runtime context
* @param p_fmat Training feature matrix
* @param max_bin Maximum number of bins for each feature
* @param hessian Hessian vector.
* @param sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
*
* @return Quantile cuts
*/
HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin,
Span<float const> hessian,
std::size_t sketch_batch_num_elements = 0);
/**
* @brief Compute sketch on DMatrix with GPU.
*
* @param ctx Runtime context
* @param p_fmat Training feature matrix
* @param max_bin Maximum number of bins for each feature
* @param sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
*
* @return Quantile cuts
*/
inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin,
std::size_t sketch_batch_num_elements = 0) {
return DeviceSketchWithHessian(ctx, p_fmat, max_bin, {}, sketch_batch_num_elements);
}
template <typename AdapterBatch>
void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
@@ -303,8 +327,8 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
if (sketch_container->HasCategorical()) {
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
&sorted_entries, &column_sizes_scan);
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr,
&column_sizes_scan);
}
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
@@ -408,8 +432,8 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
if (sketch_container->HasCategorical()) {
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
&sorted_entries, &column_sizes_scan);
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights,
&column_sizes_scan);
}
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
@@ -471,7 +495,5 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
}
}
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common
#endif // COMMON_HIST_UTIL_CUH_

View File

@@ -16,11 +16,9 @@
#include <vector>
#include "categorical.h"
#include "common.h"
#include "quantile.h"
#include "row_set.h"
#include "threading_utils.h"
#include "timer.h"
#include "xgboost/base.h" // for bst_feature_t, bst_bin_t
#include "xgboost/data.h"
@@ -84,7 +82,7 @@ class HistogramCuts {
return *this;
}
uint32_t FeatureBins(bst_feature_t feature) const {
[[nodiscard]] bst_bin_t FeatureBins(bst_feature_t feature) const {
return cut_ptrs_.ConstHostVector().at(feature + 1) - cut_ptrs_.ConstHostVector()[feature];
}
@@ -92,8 +90,8 @@ class HistogramCuts {
std::vector<float> const& Values() const { return cut_values_.ConstHostVector(); }
std::vector<float> const& MinValues() const { return min_vals_.ConstHostVector(); }
bool HasCategorical() const { return has_categorical_; }
float MaxCategory() const { return max_cat_; }
[[nodiscard]] bool HasCategorical() const { return has_categorical_; }
[[nodiscard]] float MaxCategory() const { return max_cat_; }
/**
* \brief Set meta info about categorical features.
*
@@ -105,12 +103,13 @@ class HistogramCuts {
max_cat_ = max_cat;
}
size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
[[nodiscard]] bst_bin_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
// Return the index of a cut point that is strictly greater than the input
// value, or the last available index if none exists
bst_bin_t SearchBin(float value, bst_feature_t column_id, std::vector<uint32_t> const& ptrs,
std::vector<float> const& values) const {
[[nodiscard]] bst_bin_t SearchBin(float value, bst_feature_t column_id,
std::vector<uint32_t> const& ptrs,
std::vector<float> const& values) const {
auto end = ptrs[column_id + 1];
auto beg = ptrs[column_id];
auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
@@ -119,20 +118,20 @@ class HistogramCuts {
return idx;
}
bst_bin_t SearchBin(float value, bst_feature_t column_id) const {
[[nodiscard]] bst_bin_t SearchBin(float value, bst_feature_t column_id) const {
return this->SearchBin(value, column_id, Ptrs(), Values());
}
/**
* \brief Search the bin index for numerical feature.
*/
bst_bin_t SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); }
[[nodiscard]] bst_bin_t SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); }
/**
* \brief Search the bin index for categorical feature.
*/
bst_bin_t SearchCatBin(float value, bst_feature_t fidx, std::vector<uint32_t> const& ptrs,
std::vector<float> const& vals) const {
[[nodiscard]] bst_bin_t SearchCatBin(float value, bst_feature_t fidx,
std::vector<uint32_t> const& ptrs,
std::vector<float> const& vals) const {
auto end = ptrs.at(fidx + 1) + vals.cbegin();
auto beg = ptrs[fidx] + vals.cbegin();
// Truncates the value in case it's not perfectly rounded.
@@ -143,12 +142,14 @@ class HistogramCuts {
}
return bin_idx;
}
bst_bin_t SearchCatBin(float value, bst_feature_t fidx) const {
[[nodiscard]] bst_bin_t SearchCatBin(float value, bst_feature_t fidx) const {
auto const& ptrs = this->Ptrs();
auto const& vals = this->Values();
return this->SearchCatBin(value, fidx, ptrs, vals);
}
bst_bin_t SearchCatBin(Entry const& e) const { return SearchCatBin(e.fvalue, e.index); }
[[nodiscard]] bst_bin_t SearchCatBin(Entry const& e) const {
return SearchCatBin(e.fvalue, e.index);
}
/**
* \brief Return numerical bin value given bin index.
@@ -171,7 +172,7 @@ class HistogramCuts {
* but consumes more memory.
*/
HistogramCuts SketchOnDMatrix(Context const* ctx, DMatrix* m, bst_bin_t max_bins,
bool use_sorted = false, Span<float> const hessian = {});
bool use_sorted = false, Span<float const> hessian = {});
enum BinTypeSize : uint8_t {
kUint8BinsTypeSize = 1,
@@ -200,13 +201,33 @@ auto DispatchBinType(BinTypeSize type, Fn&& fn) {
}
/**
* \brief Optionally compressed gradient index. The compression works only with dense
* @brief Optionally compressed gradient index. The compression works only with dense
* data.
*
* The main body of construction code is in gradient_index.cc, this struct is only a
* storage class.
* view class.
*/
struct Index {
class Index {
private:
void SetBinTypeSize(BinTypeSize binTypeSize) {
binTypeSize_ = binTypeSize;
switch (binTypeSize) {
case kUint8BinsTypeSize:
func_ = &GetValueFromUint8;
break;
case kUint16BinsTypeSize:
func_ = &GetValueFromUint16;
break;
case kUint32BinsTypeSize:
func_ = &GetValueFromUint32;
break;
default:
CHECK(binTypeSize == kUint8BinsTypeSize || binTypeSize == kUint16BinsTypeSize ||
binTypeSize == kUint32BinsTypeSize);
}
}
public:
// Inside the compressor, bin_idx is the index for cut value across all features. By
// subtracting it with starting pointer of each feature, we can reduce it to smaller
// value and store it with smaller types. Usable only with dense data.
@@ -230,10 +251,24 @@ struct Index {
}
Index() { SetBinTypeSize(binTypeSize_); }
Index(const Index& i) = delete;
Index& operator=(Index i) = delete;
Index(Index const& i) = delete;
Index& operator=(Index const& i) = delete;
Index(Index&& i) = delete;
Index& operator=(Index&& i) = delete;
/** @brief Move assignment for lazy initialization. */
Index& operator=(Index&& i) = default;
/**
* @brief Construct the index from data.
*
* @param data Storage for compressed histogram bin.
* @param bin_size Number of bytes for each bin.
*/
Index(Span<std::uint8_t> data, BinTypeSize bin_size) : data_{data} {
this->SetBinTypeSize(bin_size);
}
uint32_t operator[](size_t i) const {
if (!bin_offset_.empty()) {
// dense, compressed
@@ -244,26 +279,7 @@ struct Index {
return func_(data_.data(), i);
}
}
void SetBinTypeSize(BinTypeSize binTypeSize) {
binTypeSize_ = binTypeSize;
switch (binTypeSize) {
case kUint8BinsTypeSize:
func_ = &GetValueFromUint8;
break;
case kUint16BinsTypeSize:
func_ = &GetValueFromUint16;
break;
case kUint32BinsTypeSize:
func_ = &GetValueFromUint32;
break;
default:
CHECK(binTypeSize == kUint8BinsTypeSize || binTypeSize == kUint16BinsTypeSize ||
binTypeSize == kUint32BinsTypeSize);
}
}
BinTypeSize GetBinTypeSize() const {
return binTypeSize_;
}
[[nodiscard]] BinTypeSize GetBinTypeSize() const { return binTypeSize_; }
template <typename T>
T const* data() const { // NOLINT
return reinterpret_cast<T const*>(data_.data());
@@ -272,30 +288,27 @@ struct Index {
T* data() { // NOLINT
return reinterpret_cast<T*>(data_.data());
}
uint32_t const* Offset() const { return bin_offset_.data(); }
size_t OffsetSize() const { return bin_offset_.size(); }
size_t Size() const { return data_.size() / (binTypeSize_); }
[[nodiscard]] std::uint32_t const* Offset() const { return bin_offset_.data(); }
[[nodiscard]] std::size_t OffsetSize() const { return bin_offset_.size(); }
[[nodiscard]] std::size_t Size() const { return data_.size() / (binTypeSize_); }
void Resize(const size_t n_bytes) {
data_.resize(n_bytes);
}
// set the offset used in compression, cut_ptrs is the CSC indptr in HistogramCuts
void SetBinOffset(std::vector<uint32_t> const& cut_ptrs) {
bin_offset_.resize(cut_ptrs.size() - 1); // resize to number of features.
std::copy_n(cut_ptrs.begin(), bin_offset_.size(), bin_offset_.begin());
}
std::vector<uint8_t>::const_iterator begin() const { // NOLINT
return data_.begin();
auto begin() const { // NOLINT
return data_.data();
}
std::vector<uint8_t>::const_iterator end() const { // NOLINT
return data_.end();
auto end() const { // NOLINT
return data_.data() + data_.size();
}
std::vector<uint8_t>::iterator begin() { // NOLINT
return data_.begin();
auto begin() { // NOLINT
return data_.data();
}
std::vector<uint8_t>::iterator end() { // NOLINT
return data_.end();
auto end() { // NOLINT
return data_.data() + data_.size();
}
private:
@@ -310,12 +323,12 @@ struct Index {
using Func = uint32_t (*)(uint8_t const*, size_t);
std::vector<uint8_t> data_;
Span<std::uint8_t> data_;
// starting position of each feature inside the cut values (the indptr of the CSC cut matrix
// HistogramCuts without the last entry.) Used for bin compression.
std::vector<uint32_t> bin_offset_;
BinTypeSize binTypeSize_ {kUint8BinsTypeSize};
BinTypeSize binTypeSize_{kUint8BinsTypeSize};
Func func_;
};
@@ -349,16 +362,12 @@ bst_bin_t XGBOOST_HOST_DEV_INLINE BinarySearchBin(std::size_t begin, std::size_t
}
using GHistRow = Span<xgboost::GradientPairPrecise>;
/*!
* \brief fill a histogram by zeros
*/
void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end);
using ConstGHistRow = Span<xgboost::GradientPairPrecise const>;
/*!
* \brief Increment hist as dst += add in range [begin, end)
*/
void IncrementHist(GHistRow dst, const GHistRow add, size_t begin, size_t end);
void IncrementHist(GHistRow dst, ConstGHistRow add, std::size_t begin, std::size_t end);
/*!
* \brief Copy hist from src to dst in range [begin, end)
@@ -381,12 +390,7 @@ class HistCollection {
constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
const size_t id = row_ptr_.at(nid);
CHECK_NE(id, kMax);
GradientPairPrecise* ptr = nullptr;
if (contiguous_allocation_) {
ptr = const_cast<GradientPairPrecise*>(data_[0].data() + nbins_*id);
} else {
ptr = const_cast<GradientPairPrecise*>(data_[id].data());
}
GradientPairPrecise* ptr = const_cast<GradientPairPrecise*>(data_[id].data());
return {ptr, nbins_};
}
@@ -431,23 +435,12 @@ class HistCollection {
data_[row_ptr_[nid]].resize(nbins_, {0, 0});
}
}
// allocate common buffer contiguously for all nodes, need for single Allreduce call
void AllocateAllData() {
const size_t new_size = nbins_*data_.size();
contiguous_allocation_ = true;
if (data_[0].size() != new_size) {
data_[0].resize(new_size);
}
}
private:
/*! \brief number of all bins over all features */
uint32_t nbins_ = 0;
/*! \brief amount of active nodes in hist collection */
uint32_t n_nodes_added_ = 0;
/*! \brief flag to identify contiguous memory allocation */
bool contiguous_allocation_ = false;
std::vector<std::vector<GradientPairPrecise>> data_;
/*! \brief row_ptr_[nid] locates bin for histogram of node nid */
@@ -503,7 +496,7 @@ class ParallelGHistBuilder {
GHistRow hist = idx == -1 ? targeted_hists_[nid] : hist_buffer_[idx];
if (!hist_was_used_[tid * nodes_ + nid]) {
InitilizeHistByZeroes(hist, 0, hist.size());
std::fill_n(hist.data(), hist.size(), GradientPairPrecise{});
hist_was_used_[tid * nodes_ + nid] = static_cast<int>(true);
}
@@ -533,7 +526,7 @@ class ParallelGHistBuilder {
if (!is_updated) {
// In distributed mode - some tree nodes can be empty on local machines,
// So we need just set local hist by zeros in this case
InitilizeHistByZeroes(dst, begin, end);
std::fill(dst.data() + begin, dst.data() + end, GradientPairPrecise{});
}
}
@@ -583,6 +576,8 @@ class ParallelGHistBuilder {
}
}
[[nodiscard]] bst_bin_t TotalBins() const { return nbins_; }
private:
void MatchNodeNidPairToHist() {
size_t hist_allocated_additionally = 0;
@@ -628,27 +623,10 @@ class ParallelGHistBuilder {
std::map<std::pair<size_t, size_t>, int> tid_nid_to_hist_;
};
/*!
* \brief builder for histograms of gradient statistics
*/
class GHistBuilder {
public:
GHistBuilder() = default;
explicit GHistBuilder(uint32_t nbins): nbins_{nbins} {}
// construct a histogram via histogram aggregation
template <bool any_missing>
void BuildHist(Span<GradientPair const> gpair, const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat, GHistRow hist,
bool force_read_by_column = false) const;
uint32_t GetNumBins() const {
return nbins_;
}
private:
/*! \brief number of all bins over all features */
uint32_t nbins_ { 0 };
};
// construct a histogram via histogram aggregation
template <bool any_missing>
void BuildHist(Span<GradientPair const> gpair, const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat, GHistRow hist, bool force_read_by_column = false);
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_HIST_UTIL_H_

View File

@@ -168,6 +168,9 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
template <typename T>
void HostDeviceVector<T>::SetDevice(int) const {}
template <typename T>
void HostDeviceVector<T>::SetDevice(DeviceOrd) const {}
// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<double>;

View File

@@ -434,6 +434,11 @@ void HostDeviceVector<T>::SetDevice(int device) const {
impl_->SetDevice(device);
}
template <typename T>
void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
impl_->SetDevice(device.ordinal);
}
template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
impl_->Resize(new_size, v);

View File

@@ -1,24 +1,48 @@
/*!
* Copyright (c) by XGBoost Contributors 2019-2022
/**
* Copyright 2019-2023, by XGBoost Contributors
*/
#if defined(__unix__)
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#if !defined(NOMINMAX) && defined(_WIN32)
#define NOMINMAX
#endif // !defined(NOMINMAX)
#if !defined(xgboost_IS_WIN)
#if defined(_MSC_VER) || defined(__MINGW32__)
#define xgboost_IS_WIN 1
#endif // defined(_MSC_VER) || defined(__MINGW32__)
#endif // !defined(xgboost_IS_WIN)
#if defined(__unix__) || defined(__APPLE__)
#include <fcntl.h> // for open, O_RDONLY
#include <sys/mman.h> // for mmap, mmap64, munmap
#include <unistd.h> // for close, getpagesize
#elif defined(xgboost_IS_WIN)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif // defined(__unix__)
#include <algorithm>
#include <fstream>
#include <string>
#include <memory>
#include <utility>
#include <cstdio>
#include "xgboost/logging.h"
#include <algorithm> // for copy, transform
#include <cctype> // for tolower
#include <cerrno> // for errno
#include <cstddef> // for size_t
#include <cstdint> // for int32_t, uint32_t
#include <cstring> // for memcpy
#include <filesystem> // for filesystem, weakly_canonical
#include <fstream> // for ifstream
#include <iterator> // for distance
#include <limits> // for numeric_limits
#include <memory> // for unique_ptr
#include <string> // for string
#include <system_error> // for error_code, system_category
#include <utility> // for move
#include <vector> // for vector
#include "io.h"
#include "xgboost/collective/socket.h" // for LastError
#include "xgboost/logging.h"
namespace xgboost {
namespace common {
namespace xgboost::common {
size_t PeekableInStream::Read(void* dptr, size_t size) {
size_t nbuffer = buffer_.length() - buffer_ptr_;
if (nbuffer == 0) return strm_->Read(dptr, size);
@@ -94,52 +118,50 @@ void FixedSizeStream::Take(std::string* out) {
*out = std::move(buffer_);
}
std::string LoadSequentialFile(std::string uri, bool stream) {
namespace {
// Get system alignment value for IO with mmap.
std::size_t GetMmapAlignment() {
#if defined(xgboost_IS_WIN)
SYSTEM_INFO sys_info;
GetSystemInfo(&sys_info);
// During testing, `sys_info.dwPageSize` is of size 4096 while `dwAllocationGranularity` is of
// size 65536.
return sys_info.dwAllocationGranularity;
#else
return getpagesize();
#endif
}
auto SystemErrorMsg() {
std::int32_t errsv = system::LastError();
auto err = std::error_code{errsv, std::system_category()};
return err.message();
}
} // anonymous namespace
std::vector<char> LoadSequentialFile(std::string uri) {
auto OpenErr = [&uri]() {
std::string msg;
msg = "Opening " + uri + " failed: ";
msg += strerror(errno);
msg += SystemErrorMsg();
LOG(FATAL) << msg;
};
auto parsed = dmlc::io::URI(uri.c_str());
CHECK((parsed.protocol == "file://" || parsed.protocol.length() == 0))
<< "Only local file is supported.";
// Read from file.
if ((parsed.protocol == "file://" || parsed.protocol.length() == 0) && !stream) {
std::string buffer;
// Open in binary mode so that correct file size can be computed with
// seekg(). This accommodates Windows platform:
// https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg
std::ifstream ifs(uri, std::ios_base::binary | std::ios_base::in);
if (!ifs) {
// https://stackoverflow.com/a/17338934
OpenErr();
}
ifs.seekg(0, std::ios_base::end);
const size_t file_size = static_cast<size_t>(ifs.tellg());
ifs.seekg(0, std::ios_base::beg);
buffer.resize(file_size + 1);
ifs.read(&buffer[0], file_size);
buffer.back() = '\0';
return buffer;
auto path = std::filesystem::weakly_canonical(std::filesystem::u8path(uri));
std::ifstream ifs(path, std::ios_base::binary | std::ios_base::in);
if (!ifs) {
// https://stackoverflow.com/a/17338934
OpenErr();
}
// Read from remote.
std::unique_ptr<dmlc::Stream> fs{dmlc::Stream::Create(uri.c_str(), "r")};
std::string buffer;
size_t constexpr kInitialSize = 4096;
size_t size {kInitialSize}, total {0};
while (true) {
buffer.resize(total + size);
size_t read = fs->Read(&buffer[total], size);
total += read;
if (read < size) {
break;
}
size *= 2;
}
buffer.resize(total);
auto file_size = std::filesystem::file_size(path);
std::vector<char> buffer(file_size);
ifs.read(&buffer[0], file_size);
return buffer;
}
@@ -155,5 +177,159 @@ std::string FileExtension(std::string fname, bool lower) {
return "";
}
}
} // namespace common
} // namespace xgboost
// For some reason, NVCC 12.1 marks the function deleted if we expose it in the header.
// NVCC 11.8 doesn't allow `noexcept(false) = default` altogether.
ResourceHandler::~ResourceHandler() noexcept(false) {} // NOLINT
struct MMAPFile {
#if defined(xgboost_IS_WIN)
HANDLE fd{INVALID_HANDLE_VALUE};
HANDLE file_map{INVALID_HANDLE_VALUE};
#else
std::int32_t fd{0};
#endif
std::byte* base_ptr{nullptr};
std::size_t base_size{0};
std::size_t delta{0};
std::string path;
MMAPFile() = default;
#if defined(xgboost_IS_WIN)
MMAPFile(HANDLE fd, HANDLE fm, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
std::string path)
: fd{fd},
file_map{fm},
base_ptr{base_ptr},
base_size{base_size},
delta{delta},
path{std::move(path)} {}
#else
MMAPFile(std::int32_t fd, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
std::string path)
: fd{fd}, base_ptr{base_ptr}, base_size{base_size}, delta{delta}, path{std::move(path)} {}
#endif
};
std::unique_ptr<MMAPFile> Open(std::string path, std::size_t offset, std::size_t length) {
if (length == 0) {
return std::make_unique<MMAPFile>();
}
#if defined(xgboost_IS_WIN)
HANDLE fd = CreateFile(path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL | FILE_FLAG_OVERLAPPED, nullptr);
CHECK_NE(fd, INVALID_HANDLE_VALUE) << "Failed to open:" << path << ". " << SystemErrorMsg();
#else
auto fd = open(path.c_str(), O_RDONLY);
CHECK_GE(fd, 0) << "Failed to open:" << path << ". " << SystemErrorMsg();
#endif
std::byte* ptr{nullptr};
// Round down for alignment.
auto view_start = offset / GetMmapAlignment() * GetMmapAlignment();
auto view_size = length + (offset - view_start);
#if defined(__linux__) || defined(__GLIBC__)
int prot{PROT_READ};
ptr = reinterpret_cast<std::byte*>(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
madvise(ptr, view_size, MADV_WILLNEED);
CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
auto handle =
std::make_unique<MMAPFile>(fd, ptr, view_size, offset - view_start, std::move(path));
#elif defined(xgboost_IS_WIN)
auto file_size = GetFileSize(fd, nullptr);
DWORD access = PAGE_READONLY;
auto map_file = CreateFileMapping(fd, nullptr, access, 0, file_size, nullptr);
access = FILE_MAP_READ;
std::uint32_t loff = static_cast<std::uint32_t>(view_start);
std::uint32_t hoff = view_start >> 32;
CHECK(map_file) << "Failed to map: " << path << ". " << SystemErrorMsg();
ptr = reinterpret_cast<std::byte*>(MapViewOfFile(map_file, access, hoff, loff, view_size));
CHECK_NE(ptr, nullptr) << "Failed to map: " << path << ". " << SystemErrorMsg();
auto handle = std::make_unique<MMAPFile>(fd, map_file, ptr, view_size, offset - view_start,
std::move(path));
#else
CHECK_LE(offset, std::numeric_limits<off_t>::max())
<< "File size has exceeded the limit on the current system.";
int prot{PROT_READ};
ptr = reinterpret_cast<std::byte*>(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
auto handle =
std::make_unique<MMAPFile>(fd, ptr, view_size, offset - view_start, std::move(path));
#endif // defined(__linux__)
return handle;
}
MmapResource::MmapResource(std::string path, std::size_t offset, std::size_t length)
: ResourceHandler{kMmap}, handle_{Open(std::move(path), offset, length)}, n_{length} {}
MmapResource::~MmapResource() noexcept(false) {
if (!handle_) {
return;
}
#if defined(xgboost_IS_WIN)
if (handle_->base_ptr) {
CHECK(UnmapViewOfFile(handle_->base_ptr)) "Faled to call munmap: " << SystemErrorMsg();
}
if (handle_->fd != INVALID_HANDLE_VALUE) {
CHECK(CloseHandle(handle_->fd)) << "Failed to close handle: " << SystemErrorMsg();
}
if (handle_->file_map != INVALID_HANDLE_VALUE) {
CHECK(CloseHandle(handle_->file_map)) << "Failed to close mapping object: " << SystemErrorMsg();
}
#else
if (handle_->base_ptr) {
CHECK_NE(munmap(handle_->base_ptr, handle_->base_size), -1)
<< "Faled to call munmap: " << handle_->path << ". " << SystemErrorMsg();
}
if (handle_->fd != 0) {
CHECK_NE(close(handle_->fd), -1)
<< "Faled to close: " << handle_->path << ". " << SystemErrorMsg();
}
#endif
}
[[nodiscard]] void* MmapResource::Data() {
if (!handle_) {
return nullptr;
}
return handle_->base_ptr + handle_->delta;
}
[[nodiscard]] std::size_t MmapResource::Size() const { return n_; }
// For some reason, NVCC 12.1 marks the function deleted if we expose it in the header.
// NVCC 11.8 doesn't allow `noexcept(false) = default` altogether.
AlignedResourceReadStream::~AlignedResourceReadStream() noexcept(false) {} // NOLINT
PrivateMmapConstStream::~PrivateMmapConstStream() noexcept(false) {} // NOLINT
AlignedFileWriteStream::AlignedFileWriteStream(StringView path, StringView flags)
: pimpl_{dmlc::Stream::Create(path.c_str(), flags.c_str())} {}
[[nodiscard]] std::size_t AlignedFileWriteStream::DoWrite(const void* ptr,
std::size_t n_bytes) noexcept(true) {
pimpl_->Write(ptr, n_bytes);
return n_bytes;
}
AlignedMemWriteStream::AlignedMemWriteStream(std::string* p_buf)
: pimpl_{std::make_unique<MemoryBufferStream>(p_buf)} {}
AlignedMemWriteStream::~AlignedMemWriteStream() = default;
[[nodiscard]] std::size_t AlignedMemWriteStream::DoWrite(const void* ptr,
std::size_t n_bytes) noexcept(true) {
this->pimpl_->Write(ptr, n_bytes);
return n_bytes;
}
[[nodiscard]] std::size_t AlignedMemWriteStream::Tell() const noexcept(true) {
return this->pimpl_->Tell();
}
} // namespace xgboost::common
#if defined(xgboost_IS_WIN)
#undef xgboost_IS_WIN
#endif // defined(xgboost_IS_WIN)

View File

@@ -1,23 +1,32 @@
/*!
* Copyright by XGBoost Contributors 2014-2022
/**
* Copyright 2014-2023, XGBoost Contributors
* \file io.h
* \brief general stream interface for serialization, I/O
* \author Tianqi Chen
*/
#ifndef XGBOOST_COMMON_IO_H_
#define XGBOOST_COMMON_IO_H_
#include <dmlc/io.h>
#include <rabit/rabit.h>
#include <string>
#include <cstring>
#include <fstream>
#include <algorithm> // for min, fill_n, copy_n
#include <array> // for array
#include <cstddef> // for byte, size_t
#include <cstdlib> // for malloc, realloc, free
#include <cstring> // for memcpy
#include <fstream> // for ifstream
#include <limits> // for numeric_limits
#include <memory> // for unique_ptr
#include <string> // for string
#include <type_traits> // for alignment_of_v, enable_if_t
#include <utility> // for move
#include <vector> // for vector
#include "common.h"
#include "xgboost/string_view.h" // for StringView
namespace xgboost {
namespace common {
namespace xgboost::common {
using MemoryFixSizeBuffer = rabit::utils::MemoryFixSizeBuffer;
using MemoryBufferStream = rabit::utils::MemoryBufferStream;
@@ -56,8 +65,8 @@ class FixedSizeStream : public PeekableInStream {
size_t Read(void* dptr, size_t size) override;
size_t PeekRead(void* dptr, size_t size) override;
size_t Size() const { return buffer_.size(); }
size_t Tell() const { return pointer_; }
[[nodiscard]] std::size_t Size() const { return buffer_.size(); }
[[nodiscard]] std::size_t Tell() const { return pointer_; }
void Seek(size_t pos);
void Write(const void*, size_t) override {
@@ -75,16 +84,14 @@ class FixedSizeStream : public PeekableInStream {
std::string buffer_;
};
/*!
* \brief Helper function for loading consecutive file to avoid dmlc Stream when possible.
/**
* @brief Helper function for loading consecutive file.
*
* \param uri URI or file name to file.
* \param stream Use dmlc Stream unconditionally if set to true. Used for running test
* without remote filesystem.
* @param uri URI or file name to file.
*
* \return File content.
* @return File content.
*/
std::string LoadSequentialFile(std::string uri, bool stream = false);
std::vector<char> LoadSequentialFile(std::string uri);
/**
* \brief Get file extension from file name.
@@ -127,6 +134,318 @@ inline std::string ReadAll(std::string const &path) {
return content;
}
} // namespace common
} // namespace xgboost
struct MMAPFile;
/**
* @brief Handler for one-shot resource. Unlike `std::pmr::*`, the resource handler is
* fixed once it's constructed. Users cannot use mutable operations like resize
* without acquiring the specific resource first.
*/
class ResourceHandler {
public:
// RTTI
enum Kind : std::uint8_t {
kMalloc = 0,
kMmap = 1,
};
private:
Kind kind_{kMalloc};
public:
virtual void* Data() = 0;
template <typename T>
[[nodiscard]] T* DataAs() {
return reinterpret_cast<T*>(this->Data());
}
[[nodiscard]] virtual std::size_t Size() const = 0;
[[nodiscard]] auto Type() const { return kind_; }
// Allow exceptions for cleaning up resource.
virtual ~ResourceHandler() noexcept(false);
explicit ResourceHandler(Kind kind) : kind_{kind} {}
// Use shared_ptr to manage a pool like resource handler. All copy and assignment
// operators are disabled.
ResourceHandler(ResourceHandler const& that) = delete;
ResourceHandler& operator=(ResourceHandler const& that) = delete;
ResourceHandler(ResourceHandler&& that) = delete;
ResourceHandler& operator=(ResourceHandler&& that) = delete;
/**
* @brief Wether two resources have the same type. (both malloc or both mmap).
*/
[[nodiscard]] bool IsSameType(ResourceHandler const& that) const {
return this->Type() == that.Type();
}
};
class MallocResource : public ResourceHandler {
void* ptr_{nullptr};
std::size_t n_{0};
void Clear() noexcept(true) {
std::free(ptr_);
ptr_ = nullptr;
n_ = 0;
}
public:
explicit MallocResource(std::size_t n_bytes) : ResourceHandler{kMalloc} { this->Resize(n_bytes); }
~MallocResource() noexcept(true) override { this->Clear(); }
void* Data() override { return ptr_; }
[[nodiscard]] std::size_t Size() const override { return n_; }
/**
* @brief Resize the resource to n_bytes. Unlike std::vector::resize, it prefers realloc
* over malloc.
*
* @tparam force_malloc Force the use of malloc over realloc. Used for testing.
*
* @param n_bytes The new size.
*/
template <bool force_malloc = false>
void Resize(std::size_t n_bytes, std::byte init = std::byte{0}) {
// realloc(ptr, 0) works, but is deprecated.
if (n_bytes == 0) {
this->Clear();
return;
}
// If realloc fails, we need to copy the data ourselves.
bool need_copy{false};
void* new_ptr{nullptr};
// use realloc first, it can handle nullptr.
if constexpr (!force_malloc) {
new_ptr = std::realloc(ptr_, n_bytes);
}
// retry with malloc if realloc fails
if (!new_ptr) {
// ptr_ is preserved if realloc fails
new_ptr = std::malloc(n_bytes);
need_copy = true;
}
if (!new_ptr) {
// malloc fails
LOG(FATAL) << "bad_malloc: Failed to allocate " << n_bytes << " bytes.";
}
if (need_copy) {
std::copy_n(reinterpret_cast<std::byte*>(ptr_), n_, reinterpret_cast<std::byte*>(new_ptr));
}
// default initialize
std::fill_n(reinterpret_cast<std::byte*>(new_ptr) + n_, n_bytes - n_, init);
// free the old ptr if malloc is used.
if (need_copy) {
this->Clear();
}
ptr_ = new_ptr;
n_ = n_bytes;
}
};
/**
* @brief A class for wrapping mmap as a resource for RAII.
*/
class MmapResource : public ResourceHandler {
std::unique_ptr<MMAPFile> handle_;
std::size_t n_;
public:
MmapResource(std::string path, std::size_t offset, std::size_t length);
~MmapResource() noexcept(false) override;
[[nodiscard]] void* Data() override;
[[nodiscard]] std::size_t Size() const override;
};
/**
* @param Alignment for resource read stream and aligned write stream.
*/
constexpr std::size_t IOAlignment() {
// For most of the pod types in XGBoost, 8 byte is sufficient.
return 8;
}
/**
* @brief Wrap resource into a dmlc stream.
*
* This class is to facilitate the use of mmap. Caller can optionally use the `Read()`
* method or the `Consume()` method. The former copies data into output, while the latter
* makes copy only if it's a primitive type.
*
* Input is required to be aligned to IOAlignment().
*/
class AlignedResourceReadStream {
std::shared_ptr<ResourceHandler> resource_;
std::size_t curr_ptr_{0};
// Similar to SEEK_END in libc
static std::size_t constexpr kSeekEnd = std::numeric_limits<std::size_t>::max();
public:
explicit AlignedResourceReadStream(std::shared_ptr<ResourceHandler> resource)
: resource_{std::move(resource)} {}
[[nodiscard]] std::shared_ptr<ResourceHandler> Share() noexcept(true) { return resource_; }
/**
* @brief Consume n_bytes of data, no copying is performed.
*
* @return A pair with the beginning pointer and the number of available bytes, which
* may be smaller than requested.
*/
[[nodiscard]] auto Consume(std::size_t n_bytes) noexcept(true) {
auto res_size = resource_->Size();
auto data = reinterpret_cast<std::byte*>(resource_->Data());
auto ptr = data + curr_ptr_;
// Move the cursor
auto aligned_n_bytes = DivRoundUp(n_bytes, IOAlignment()) * IOAlignment();
auto aligned_forward = std::min(res_size - curr_ptr_, aligned_n_bytes);
std::size_t forward = std::min(res_size - curr_ptr_, n_bytes);
curr_ptr_ += aligned_forward;
return std::pair{ptr, forward};
}
template <typename T>
[[nodiscard]] auto Consume(T* out) noexcept(false) -> std::enable_if_t<std::is_pod_v<T>, bool> {
auto [ptr, size] = this->Consume(sizeof(T));
if (size != sizeof(T)) {
return false;
}
CHECK_EQ(reinterpret_cast<std::uintptr_t>(ptr) % std::alignment_of_v<T>, 0);
*out = *reinterpret_cast<T*>(ptr);
return true;
}
[[nodiscard]] virtual std::size_t Tell() noexcept(true) { return curr_ptr_; }
/**
* @brief Read n_bytes of data, output is copied into ptr.
*/
[[nodiscard]] std::size_t Read(void* ptr, std::size_t n_bytes) noexcept(true) {
auto [res_ptr, forward] = this->Consume(n_bytes);
if (forward != 0) {
std::memcpy(ptr, res_ptr, forward);
}
return forward;
}
/**
* @brief Read a primitive type.
*
* @return Whether the read is successful.
*/
template <typename T>
[[nodiscard]] auto Read(T* out) noexcept(false) -> std::enable_if_t<std::is_pod_v<T>, bool> {
return this->Consume(out);
}
/**
* @brief Read a vector.
*
* @return Whether the read is successful.
*/
template <typename T>
[[nodiscard]] bool Read(std::vector<T>* out) noexcept(true) {
std::uint64_t n{0};
if (!this->Consume(&n)) {
return false;
}
out->resize(n);
auto n_bytes = sizeof(T) * n;
if (this->Read(out->data(), n_bytes) != n_bytes) {
return false;
}
return true;
}
virtual ~AlignedResourceReadStream() noexcept(false);
};
/**
* @brief Private mmap file as a read-only stream.
*
* It can calculate alignment automatically based on system page size (or allocation
* granularity on Windows).
*
* The file is required to be aligned by IOAlignment().
*/
class PrivateMmapConstStream : public AlignedResourceReadStream {
public:
/**
* @brief Construct a private mmap stream.
*
* @param path File path.
* @param offset See the `offset` parameter of `mmap` for details.
* @param length See the `length` parameter of `mmap` for details.
*/
explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
: AlignedResourceReadStream{std::make_shared<MmapResource>(path, offset, length)} {}
~PrivateMmapConstStream() noexcept(false) override;
};
/**
* @brief Base class for write stream with alignment defined by IOAlignment().
*/
class AlignedWriteStream {
protected:
[[nodiscard]] virtual std::size_t DoWrite(const void* ptr,
std::size_t n_bytes) noexcept(true) = 0;
public:
virtual ~AlignedWriteStream() = default;
[[nodiscard]] std::size_t Write(const void* ptr, std::size_t n_bytes) noexcept(false) {
auto aligned_n_bytes = DivRoundUp(n_bytes, IOAlignment()) * IOAlignment();
auto w_n_bytes = this->DoWrite(ptr, n_bytes);
CHECK_EQ(w_n_bytes, n_bytes);
auto remaining = aligned_n_bytes - n_bytes;
if (remaining > 0) {
std::array<std::uint8_t, IOAlignment()> padding;
std::memset(padding.data(), '\0', padding.size());
w_n_bytes = this->DoWrite(padding.data(), remaining);
CHECK_EQ(w_n_bytes, remaining);
}
return aligned_n_bytes;
}
template <typename T>
[[nodiscard]] std::enable_if_t<std::is_pod_v<T>, std::size_t> Write(T const& v) {
return this->Write(&v, sizeof(T));
}
};
/**
* @brief Output stream backed by a file. Aligned to IOAlignment() bytes.
*/
class AlignedFileWriteStream : public AlignedWriteStream {
std::unique_ptr<dmlc::Stream> pimpl_;
protected:
[[nodiscard]] std::size_t DoWrite(const void* ptr, std::size_t n_bytes) noexcept(true) override;
public:
AlignedFileWriteStream() = default;
AlignedFileWriteStream(StringView path, StringView flags);
~AlignedFileWriteStream() override = default;
};
/**
* @brief Output stream backed by memory buffer. Aligned to IOAlignment() bytes.
*/
class AlignedMemWriteStream : public AlignedFileWriteStream {
std::unique_ptr<MemoryBufferStream> pimpl_;
protected:
[[nodiscard]] std::size_t DoWrite(const void* ptr, std::size_t n_bytes) noexcept(true) override;
public:
explicit AlignedMemWriteStream(std::string* p_buf);
~AlignedMemWriteStream() override;
[[nodiscard]] std::size_t Tell() const noexcept(true);
};
} // namespace xgboost::common
#endif // XGBOOST_COMMON_IO_H_

View File

@@ -1,23 +1,29 @@
/*!
* Copyright (c) by Contributors 2019-2022
/**
* Copyright 2019-2023, XGBoost Contributors
*/
#include "xgboost/json.h"
#include <dmlc/endian.h>
#include <array> // for array
#include <cctype> // for isdigit
#include <cmath> // for isinf, isnan
#include <cstdio> // for EOF
#include <cstdlib> // for size_t, strtof
#include <cstring> // for memcpy
#include <initializer_list> // for initializer_list
#include <iterator> // for distance
#include <limits> // for numeric_limits
#include <memory> // for allocator
#include <sstream> // for operator<<, basic_ostream, operator&, ios, stringstream
#include <system_error> // for errc
#include <cctype>
#include <cmath>
#include <cstddef>
#include <iterator>
#include <limits>
#include <sstream>
#include "./math.h"
#include "charconv.h"
#include "xgboost/base.h"
#include "xgboost/json_io.h"
#include "xgboost/logging.h"
#include "xgboost/string_view.h"
#include "./math.h" // for CheckNAN
#include "charconv.h" // for to_chars, NumericLimits, from_chars, to_chars_result
#include "common.h" // for EscapeU8
#include "xgboost/base.h" // for XGBOOST_EXPECT
#include "xgboost/intrusive_ptr.h" // for IntrusivePtr
#include "xgboost/json_io.h" // for JsonReader, UBJReader, UBJWriter, JsonWriter, ToBigEn...
#include "xgboost/logging.h" // for LOG, LOG_FATAL, LogMessageFatal, LogCheck_NE, CHECK
#include "xgboost/string_view.h" // for StringView, operator<<
namespace xgboost {
@@ -57,12 +63,12 @@ void JsonWriter::Visit(JsonObject const* obj) {
}
void JsonWriter::Visit(JsonNumber const* num) {
char number[NumericLimits<float>::kToCharsSize];
auto res = to_chars(number, number + sizeof(number), num->GetNumber());
std::array<char, NumericLimits<float>::kToCharsSize> number;
auto res = to_chars(number.data(), number.data() + number.size(), num->GetNumber());
auto end = res.ptr;
auto ori_size = stream_->size();
stream_->resize(stream_->size() + end - number);
std::memcpy(stream_->data() + ori_size, number, end - number);
stream_->resize(stream_->size() + end - number.data());
std::memcpy(stream_->data() + ori_size, number.data(), end - number.data());
}
void JsonWriter::Visit(JsonInteger const* num) {
@@ -88,43 +94,15 @@ void JsonWriter::Visit(JsonNull const* ) {
}
void JsonWriter::Visit(JsonString const* str) {
std::string buffer;
buffer += '"';
auto const& string = str->GetString();
for (size_t i = 0; i < string.length(); i++) {
const char ch = string[i];
if (ch == '\\') {
if (i < string.size() && string[i+1] == 'u') {
buffer += "\\";
} else {
buffer += "\\\\";
}
} else if (ch == '"') {
buffer += "\\\"";
} else if (ch == '\b') {
buffer += "\\b";
} else if (ch == '\f') {
buffer += "\\f";
} else if (ch == '\n') {
buffer += "\\n";
} else if (ch == '\r') {
buffer += "\\r";
} else if (ch == '\t') {
buffer += "\\t";
} else if (static_cast<uint8_t>(ch) <= 0x1f) {
// Unit separator
char buf[8];
snprintf(buf, sizeof buf, "\\u%04x", ch);
buffer += buf;
} else {
buffer += ch;
}
}
buffer += '"';
std::string buffer;
buffer += '"';
auto const& string = str->GetString();
common::EscapeU8(string, &buffer);
buffer += '"';
auto s = stream_->size();
stream_->resize(s + buffer.size());
std::memcpy(stream_->data() + s, buffer.data(), buffer.size());
auto s = stream_->size();
stream_->resize(s + buffer.size());
std::memcpy(stream_->data() + s, buffer.data(), buffer.size());
}
void JsonWriter::Visit(JsonBoolean const* boolean) {

View File

@@ -12,18 +12,17 @@
namespace xgboost {
namespace linalg {
template <typename T, int32_t D, typename Fn>
#if defined(XGBOOST_USE_HIP)
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s = nullptr)
#elif defined(XGBOOST_USE_CUDA)
#if defined(XGBOOST_USE_CUDA)
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
#elif defined(XGBOOST_USE_HIP)
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s = nullptr)
#endif
{
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(t.DeviceIdx()));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(t.DeviceIdx()));
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(t.Device().ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(t.Device().ordinal));
#endif
static_assert(std::is_void<std::result_of_t<Fn(size_t, T&)>>::value,
"For function with return, use transform instead.");
if (t.Contiguous()) {

View File

@@ -134,12 +134,6 @@ inline float LogSum(Iterator begin, Iterator end) {
return mx + std::log(sum);
}
// comparator functions for sorting pairs in descending order
inline static bool CmpFirst(const std::pair<float, unsigned> &a,
const std::pair<float, unsigned> &b) {
return a.first > b.first;
}
// Redefined here to workaround a VC bug that doesn't support overloading for integer
// types.
template <typename T>

View File

@@ -10,6 +10,7 @@
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <iterator> // for iterator_traits
#include <numeric> // for accumulate
#include <vector>
#include "common.h" // AssertGPUSupport

View File

@@ -587,14 +587,14 @@ void SketchContainer::FixError() {
});
}
void SketchContainer::AllReduce() {
void SketchContainer::AllReduce(bool is_column_split) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
auto world = collective::GetWorldSize();
if (world == 1) {
if (world == 1 || is_column_split) {
return;
}
@@ -672,7 +672,7 @@ struct InvalidCatOp {
};
} // anonymous namespace
void SketchContainer::MakeCuts(HistogramCuts* p_cuts) {
void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
timer_.Start(__func__);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
@@ -682,7 +682,7 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts) {
p_cuts->min_vals_.Resize(num_columns_);
// Sync between workers.
this->AllReduce();
this->AllReduce(is_column_split);
// Prune to final number of bins.
this->Prune(num_bins_ + 1);

View File

@@ -154,9 +154,9 @@ class SketchContainer {
Span<SketchEntry const> that);
/* \brief Merge quantiles from other GPU workers. */
void AllReduce();
void AllReduce(bool is_column_split);
/* \brief Create the final histogram cut values. */
void MakeCuts(HistogramCuts* cuts);
void MakeCuts(HistogramCuts* cuts, bool is_column_split);
Span<SketchEntry const> Data() const {
return {this->Current().data().get(), this->Current().size()};

View File

@@ -139,7 +139,7 @@ struct WeightOp {
void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
CUDAContext const* cuctx = ctx->CUDACtx();
group_ptr_.SetDevice(ctx->gpu_id);
group_ptr_.SetDevice(ctx->Device());
if (info.group_ptr_.empty()) {
group_ptr_.Resize(2, 0);
group_ptr_.HostVector()[1] = info.num_row_;
@@ -164,7 +164,7 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
max_group_size_ =
thrust::reduce(cuctx->CTP(), it, it + n_groups, 0ul, thrust::maximum<std::size_t>{});
threads_group_ptr_.SetDevice(ctx->gpu_id);
threads_group_ptr_.SetDevice(ctx->Device());
threads_group_ptr_.Resize(n_groups + 1, 0);
auto d_threads_group_ptr = threads_group_ptr_.DeviceSpan();
if (param_.HasTruncation()) {
@@ -179,7 +179,7 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
n_cuda_threads_ = info.num_row_ * param_.NumPair();
}
sorted_idx_cache_.SetDevice(ctx->gpu_id);
sorted_idx_cache_.SetDevice(ctx->Device());
sorted_idx_cache_.Resize(info.labels.Size(), 0);
auto weight = common::MakeOptionalWeights(ctx, info.weights_);
@@ -198,18 +198,18 @@ common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const* ctx,
void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
CUDAContext const* cuctx = ctx->CUDACtx();
auto labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
auto labels = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
CheckNDCGLabels(this->Param(), labels, CheckNDCGOp{cuctx});
auto d_group_ptr = this->DataGroupPtr(ctx);
std::size_t n_groups = d_group_ptr.size() - 1;
inv_idcg_ = linalg::Zeros<double>(ctx, n_groups);
auto d_inv_idcg = inv_idcg_.View(ctx->gpu_id);
auto d_inv_idcg = inv_idcg_.View(ctx->Device());
cuda_impl::CalcQueriesInvIDCG(ctx, labels, d_group_ptr, d_inv_idcg, this->Param());
CHECK_GE(this->Param().NumPair(), 1ul);
discounts_.SetDevice(ctx->gpu_id);
discounts_.SetDevice(ctx->Device());
discounts_.Resize(MaxGroupSize());
auto d_discount = discounts_.DeviceSpan();
dh::LaunchN(MaxGroupSize(), cuctx->Stream(),
@@ -217,12 +217,12 @@ void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
}
void PreCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
auto const d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
CheckPreLabels("pre", d_label, CheckMAPOp{ctx->CUDACtx()});
}
void MAPCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
auto const d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
CheckPreLabels("map", d_label, CheckMAPOp{ctx->CUDACtx()});
}
} // namespace xgboost::ltr

View File

@@ -12,7 +12,7 @@
#include <vector> // for vector
#include "dmlc/parameter.h" // for FieldEntry, DMLC_DECLARE_FIELD
#include "error_msg.h" // for GroupWeight, GroupSize
#include "error_msg.h" // for GroupWeight, GroupSize, InvalidCUDAOrdinal
#include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo
@@ -217,7 +217,7 @@ class RankingCache {
}
// Constructed as [1, n_samples] if group ptr is not supplied by the user
common::Span<bst_group_t const> DataGroupPtr(Context const* ctx) const {
group_ptr_.SetDevice(ctx->gpu_id);
group_ptr_.SetDevice(ctx->Device());
return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan();
}
@@ -228,7 +228,7 @@ class RankingCache {
// Create a rank list by model prediction
common::Span<std::size_t const> SortedIdx(Context const* ctx, common::Span<float const> predt) {
if (sorted_idx_cache_.Empty()) {
sorted_idx_cache_.SetDevice(ctx->gpu_id);
sorted_idx_cache_.SetDevice(ctx->Device());
sorted_idx_cache_.Resize(predt.size());
}
if (ctx->IsCPU()) {
@@ -240,17 +240,17 @@ class RankingCache {
// The function simply returns a uninitialized buffer as this is only used by the
// objective for creating pairs.
common::Span<std::size_t> SortedIdxY(Context const* ctx, std::size_t n_samples) {
CHECK(ctx->IsCUDA());
CHECK(ctx->IsCUDA()) << error::InvalidCUDAOrdinal();
if (y_sorted_idx_cache_.Empty()) {
y_sorted_idx_cache_.SetDevice(ctx->gpu_id);
y_sorted_idx_cache_.SetDevice(ctx->Device());
y_sorted_idx_cache_.Resize(n_samples);
}
return y_sorted_idx_cache_.DeviceSpan();
}
common::Span<float> RankedY(Context const* ctx, std::size_t n_samples) {
CHECK(ctx->IsCUDA());
CHECK(ctx->IsCUDA()) << error::InvalidCUDAOrdinal();
if (y_ranked_by_model_.Empty()) {
y_ranked_by_model_.SetDevice(ctx->gpu_id);
y_ranked_by_model_.SetDevice(ctx->Device());
y_ranked_by_model_.Resize(n_samples);
}
return y_ranked_by_model_.DeviceSpan();
@@ -266,21 +266,21 @@ class RankingCache {
linalg::VectorView<GradientPair> CUDARounding(Context const* ctx) {
if (roundings_.Size() == 0) {
roundings_.SetDevice(ctx->gpu_id);
roundings_.SetDevice(ctx->Device());
roundings_.Reshape(Groups());
}
return roundings_.View(ctx->gpu_id);
return roundings_.View(ctx->Device());
}
common::Span<double> CUDACostRounding(Context const* ctx) {
if (cost_rounding_.Size() == 0) {
cost_rounding_.SetDevice(ctx->gpu_id);
cost_rounding_.SetDevice(ctx->Device());
cost_rounding_.Resize(1);
}
return cost_rounding_.DeviceSpan();
}
template <typename Type>
common::Span<Type> MaxLambdas(Context const* ctx, std::size_t n) {
max_lambdas_.SetDevice(ctx->gpu_id);
max_lambdas_.SetDevice(ctx->Device());
std::size_t bytes = n * sizeof(Type);
if (bytes != max_lambdas_.Size()) {
max_lambdas_.Resize(bytes);
@@ -315,17 +315,17 @@ class NDCGCache : public RankingCache {
}
linalg::VectorView<double const> InvIDCG(Context const* ctx) const {
return inv_idcg_.View(ctx->gpu_id);
return inv_idcg_.View(ctx->Device());
}
common::Span<double const> Discount(Context const* ctx) const {
return ctx->IsCPU() ? discounts_.ConstHostSpan() : discounts_.ConstDeviceSpan();
}
linalg::VectorView<double> Dcg(Context const* ctx) {
if (dcg_.Size() == 0) {
dcg_.SetDevice(ctx->gpu_id);
dcg_.SetDevice(ctx->Device());
dcg_.Reshape(this->Groups());
}
return dcg_.View(ctx->gpu_id);
return dcg_.View(ctx->Device());
}
};
@@ -396,7 +396,7 @@ class PreCache : public RankingCache {
common::Span<double> Pre(Context const* ctx) {
if (pre_.Empty()) {
pre_.SetDevice(ctx->gpu_id);
pre_.SetDevice(ctx->Device());
pre_.Resize(this->Groups());
}
return ctx->IsCPU() ? pre_.HostSpan() : pre_.DeviceSpan();
@@ -427,21 +427,21 @@ class MAPCache : public RankingCache {
common::Span<double> NumRelevant(Context const* ctx) {
if (n_rel_.Empty()) {
n_rel_.SetDevice(ctx->gpu_id);
n_rel_.SetDevice(ctx->Device());
n_rel_.Resize(n_samples_);
}
return ctx->IsCPU() ? n_rel_.HostSpan() : n_rel_.DeviceSpan();
}
common::Span<double> Acc(Context const* ctx) {
if (acc_.Empty()) {
acc_.SetDevice(ctx->gpu_id);
acc_.SetDevice(ctx->Device());
acc_.Resize(n_samples_);
}
return ctx->IsCPU() ? acc_.HostSpan() : acc_.DeviceSpan();
}
common::Span<double> Map(Context const* ctx) {
if (map_.Empty()) {
map_.SetDevice(ctx->gpu_id);
map_.SetDevice(ctx->Device());
map_.Resize(this->Groups());
}
return ctx->IsCPU() ? map_.HostSpan() : map_.DeviceSpan();

View File

@@ -0,0 +1,193 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
#define XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
#include <algorithm> // for fill_n
#include <cstdint> // for uint64_t
#include <cstring> // for memcpy
#include <memory> // for shared_ptr, make_shared
#include <type_traits> // for is_reference_v, remove_reference_t, is_same_v
#include <utility> // for swap, move
#include "io.h" // for ResourceHandler, AlignedResourceReadStream, MallocResource
#include "xgboost/logging.h"
#include "xgboost/span.h" // for Span
namespace xgboost::common {
/**
* @brief A vector-like type that holds a reference counted resource.
*
* The vector size is immutable after construction. This way we can swap the underlying
* resource when needed.
*/
template <typename T>
class RefResourceView {
static_assert(!std::is_reference_v<T>);
public:
using value_type = T; // NOLINT
using size_type = std::uint64_t; // NOLINT
private:
value_type* ptr_{nullptr};
size_type size_{0};
std::shared_ptr<common::ResourceHandler> mem_{nullptr};
protected:
void Init(value_type* ptr, size_type size, std::shared_ptr<common::ResourceHandler> mem) {
ptr_ = ptr;
size_ = size;
mem_ = std::move(mem);
}
public:
RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem)
: ptr_{ptr}, size_{n}, mem_{std::move(mem)} {
CHECK_GE(mem_->Size(), n);
}
/**
* @brief Construct a view on ptr with length n. The ptr is held by the mem resource.
*
* @param ptr The pointer to view.
* @param n The length of the view.
* @param mem The owner of the pointer.
* @param init Initialize the view with this value.
*/
RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem,
T const& init)
: RefResourceView{ptr, n, mem} {
if (n != 0) {
std::fill_n(ptr_, n, init);
}
}
~RefResourceView() = default;
RefResourceView() = default;
RefResourceView(RefResourceView const& that) = delete;
RefResourceView& operator=(RefResourceView const& that) = delete;
/**
* @brief We allow move assignment for lazy initialization.
*/
RefResourceView(RefResourceView&& that) = default;
RefResourceView& operator=(RefResourceView&& that) = default;
[[nodiscard]] size_type size() const { return size_; } // NOLINT
[[nodiscard]] size_type size_bytes() const { // NOLINT
return Span{data(), size()}.size_bytes();
}
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT
[[nodiscard]] bool empty() const { return size() == 0; } // NOLINT
[[nodiscard]] auto cbegin() const { return data(); } // NOLINT
[[nodiscard]] auto begin() { return data(); } // NOLINT
[[nodiscard]] auto begin() const { return cbegin(); } // NOLINT
[[nodiscard]] auto cend() const { return data() + size(); } // NOLINT
[[nodiscard]] auto end() { return data() + size(); } // NOLINT
[[nodiscard]] auto end() const { return cend(); } // NOLINT
[[nodiscard]] auto const& front() const { return data()[0]; } // NOLINT
[[nodiscard]] auto& front() { return data()[0]; } // NOLINT
[[nodiscard]] auto const& back() const { return data()[size() - 1]; } // NOLINT
[[nodiscard]] auto& back() { return data()[size() - 1]; } // NOLINT
[[nodiscard]] value_type& operator[](size_type i) { return ptr_[i]; }
[[nodiscard]] value_type const& operator[](size_type i) const { return ptr_[i]; }
/**
* @brief Get the underlying resource.
*/
auto Resource() const { return mem_; }
};
/**
* @brief Read a vector from stream. Accepts both `std::vector` and `RefResourceView`.
*
* If the output vector is a referenced counted view, no copying occur.
*/
template <typename Vec>
[[nodiscard]] bool ReadVec(common::AlignedResourceReadStream* fi, Vec* vec) {
std::uint64_t n{0};
if (!fi->Read(&n)) {
return false;
}
if (n == 0) {
return true;
}
using T = typename Vec::value_type;
auto expected_bytes = sizeof(T) * n;
auto [ptr, n_bytes] = fi->Consume(expected_bytes);
if (n_bytes != expected_bytes) {
return false;
}
if constexpr (std::is_same_v<Vec, RefResourceView<T>>) {
*vec = RefResourceView<T>{reinterpret_cast<T*>(ptr), n, fi->Share()};
} else {
vec->resize(n);
std::memcpy(vec->data(), ptr, n_bytes);
}
return true;
}
/**
* @brief Write a vector to stream. Accepts both `std::vector` and `RefResourceView`.
*/
template <typename Vec>
[[nodiscard]] std::size_t WriteVec(AlignedFileWriteStream* fo, Vec const& vec) {
std::size_t bytes{0};
auto n = static_cast<std::uint64_t>(vec.size());
bytes += fo->Write(n);
if (n == 0) {
return sizeof(n);
}
using T = typename std::remove_reference_t<decltype(vec)>::value_type;
bytes += fo->Write(vec.data(), vec.size() * sizeof(T));
return bytes;
}
/**
* @brief Make a fixed size `RefResourceView` with malloc resource.
*/
template <typename T>
[[nodiscard]] RefResourceView<T> MakeFixedVecWithMalloc(std::size_t n_elements, T const& init) {
auto resource = std::make_shared<common::MallocResource>(n_elements * sizeof(T));
return RefResourceView{resource->DataAs<T>(), n_elements, resource, init};
}
template <typename T>
class ReallocVector : public RefResourceView<T> {
static_assert(!std::is_reference_v<T>);
static_assert(!std::is_const_v<T>);
static_assert(std::is_trivially_copyable_v<T>);
using Upper = RefResourceView<T>;
using size_type = typename Upper::size_type; // NOLINT
using value_type = typename Upper::value_type; // NOLINT
public:
ReallocVector() : RefResourceView<T>{MakeFixedVecWithMalloc(0, T{})} {}
ReallocVector(size_type n, value_type const& init)
: RefResourceView<T>{MakeFixedVecWithMalloc(n, init)} {}
ReallocVector(ReallocVector const& that) = delete;
ReallocVector(ReallocVector&& that) = delete;
ReallocVector& operator=(ReallocVector const& that) = delete;
ReallocVector& operator=(ReallocVector&& that) = delete;
void Resize(typename Upper::size_type new_size) {
auto resource = std::dynamic_pointer_cast<common::MallocResource>(this->Resource());
CHECK(resource);
resource->Resize(new_size * sizeof(T));
this->Init(resource->template DataAs<T>(), new_size, resource);
}
};
} // namespace xgboost::common
#endif // XGBOOST_COMMON_REF_RESOURCE_VIEW_H_

View File

@@ -20,9 +20,9 @@ namespace common {
void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out) {
if (!ctx->IsCPU()) {
weights.SetDevice(ctx->gpu_id);
weights.SetDevice(ctx->Device());
auto opt_weights = OptionalWeights(weights.ConstDeviceSpan());
auto t_v = t.View(ctx->gpu_id);
auto t_v = t.View(ctx->Device());
cuda_impl::Median(ctx, t_v, opt_weights, out);
}
@@ -59,7 +59,7 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
auto ret = std::accumulate(tloc.cbegin(), tloc.cend(), .0f);
out->HostView()(0) = ret;
} else {
cuda_impl::Mean(ctx, v.View(ctx->gpu_id), out->View(ctx->gpu_id));
cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
}
}
} // namespace common

View File

@@ -7,13 +7,14 @@
#include <dmlc/common.h>
#include <dmlc/omp.h>
#include <algorithm>
#include <cstdint> // for int32_t
#include <cstdlib> // for malloc, free
#include <limits>
#include <algorithm> // for min
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <cstdlib> // for malloc, free
#include <functional> // for function
#include <new> // for bad_alloc
#include <type_traits> // for is_signed
#include <vector>
#include <type_traits> // for is_signed, conditional_t, is_integral_v, invoke_result_t
#include <vector> // for vector
#include "xgboost/logging.h"
@@ -25,14 +26,14 @@ inline int32_t omp_get_thread_limit() __GOMP_NOTHROW { return 1; } // NOLINT
// MSVC doesn't implement the thread limit.
#if defined(_OPENMP) && defined(_MSC_VER)
#include <limits>
extern "C" {
inline int32_t omp_get_thread_limit() { return std::numeric_limits<int32_t>::max(); } // NOLINT
}
#endif // defined(_MSC_VER)
namespace xgboost {
namespace common {
namespace xgboost::common {
// Represent simple range of indexes [begin, end)
// Inspired by tbb::blocked_range
class Range1d {
@@ -69,7 +70,7 @@ class Range1d {
// [1,2], [3,4], [5,6], [7,8], [9]
// The class helps to process data in several tree nodes (non-balanced usually) in parallel
// Using nested parallelism (by nodes and by data in each node)
// it helps to improve CPU resources utilization
// it helps to improve CPU resources utilization
class BlockedSpace2d {
public:
// Example of space:
@@ -86,63 +87,72 @@ class BlockedSpace2d {
// dim1 - size of the first dimension in the space
// getter_size_dim2 - functor to get the second dimensions for each 'row' by row-index
// grain_size - max size of produced blocks
template<typename Func>
BlockedSpace2d(size_t dim1, Func getter_size_dim2, size_t grain_size) {
for (size_t i = 0; i < dim1; ++i) {
const size_t size = getter_size_dim2(i);
const size_t n_blocks = size/grain_size + !!(size % grain_size);
for (size_t iblock = 0; iblock < n_blocks; ++iblock) {
const size_t begin = iblock * grain_size;
const size_t end = std::min(begin + grain_size, size);
template <typename Getter>
BlockedSpace2d(std::size_t dim1, Getter&& getter_size_dim2, std::size_t grain_size) {
static_assert(std::is_integral_v<std::invoke_result_t<Getter, std::size_t>>);
for (std::size_t i = 0; i < dim1; ++i) {
std::size_t size = getter_size_dim2(i);
// Each row (second dim) is divided into n_blocks
std::size_t n_blocks = size / grain_size + !!(size % grain_size);
for (std::size_t iblock = 0; iblock < n_blocks; ++iblock) {
std::size_t begin = iblock * grain_size;
std::size_t end = std::min(begin + grain_size, size);
AddBlock(i, begin, end);
}
}
}
// Amount of blocks(tasks) in a space
size_t Size() const {
[[nodiscard]] std::size_t Size() const {
return ranges_.size();
}
// get index of the first dimension of i-th block(task)
size_t GetFirstDimension(size_t i) const {
[[nodiscard]] std::size_t GetFirstDimension(std::size_t i) const {
CHECK_LT(i, first_dimension_.size());
return first_dimension_[i];
}
// get a range of indexes for the second dimension of i-th block(task)
Range1d GetRange(size_t i) const {
[[nodiscard]] Range1d GetRange(std::size_t i) const {
CHECK_LT(i, ranges_.size());
return ranges_[i];
}
private:
void AddBlock(size_t first_dimension, size_t begin, size_t end) {
first_dimension_.push_back(first_dimension);
/**
* @brief Add a parallel block.
*
* @param first_dim The row index.
* @param begin The begin of the second dimension.
* @param end The end of the second dimension.
*/
void AddBlock(std::size_t first_dim, std::size_t begin, std::size_t end) {
first_dimension_.push_back(first_dim);
ranges_.emplace_back(begin, end);
}
std::vector<Range1d> ranges_;
std::vector<size_t> first_dimension_;
std::vector<std::size_t> first_dimension_;
};
// Wrapper to implement nested parallelism with simple omp parallel for
template <typename Func>
void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) {
const size_t num_blocks_in_space = space.Size();
CHECK_GE(nthreads, 1);
void ParallelFor2d(const BlockedSpace2d& space, int n_threads, Func&& func) {
static_assert(std::is_void_v<std::invoke_result_t<Func, std::size_t, Range1d>>);
std::size_t n_blocks_in_space = space.Size();
CHECK_GE(n_threads, 1);
dmlc::OMPException exc;
#pragma omp parallel num_threads(nthreads)
#pragma omp parallel num_threads(n_threads)
{
exc.Run([&]() {
size_t tid = omp_get_thread_num();
size_t chunck_size =
num_blocks_in_space / nthreads + !!(num_blocks_in_space % nthreads);
std::size_t tid = omp_get_thread_num();
std::size_t chunck_size = n_blocks_in_space / n_threads + !!(n_blocks_in_space % n_threads);
size_t begin = chunck_size * tid;
size_t end = std::min(begin + chunck_size, num_blocks_in_space);
std::size_t begin = chunck_size * tid;
std::size_t end = std::min(begin + chunck_size, n_blocks_in_space);
for (auto i = begin; i < end; i++) {
func(space.GetFirstDimension(i), space.GetRange(i));
}
@@ -303,7 +313,6 @@ class MemStackAllocator {
* \brief Constant that can be used for initializing static thread local memory.
*/
std::int32_t constexpr DefaultMaxThreads() { return 128; }
} // namespace common
} // namespace xgboost
} // namespace xgboost::common
#endif // XGBOOST_COMMON_THREADING_UTILS_H_

View File

@@ -3,53 +3,201 @@
*
* \brief Context object used for controlling runtime parameters.
*/
#include <xgboost/context.h>
#include "xgboost/context.h"
#include "common/common.h" // AssertGPUSupport
#include <algorithm> // for find_if
#include <charconv> // for from_chars
#include <iterator> // for distance
#include <optional> // for optional
#include <regex> // for regex_replace, regex_match
#include "common/common.h" // AssertGPUSupport
#include "common/error_msg.h" // WarnDeprecatedGPUId
#include "common/threading_utils.h"
#include "xgboost/string_view.h"
namespace xgboost {
DMLC_REGISTER_PARAMETER(Context);
std::int32_t constexpr Context::kCpuId;
bst_d_ordinal_t constexpr Context::kCpuId;
std::int64_t constexpr Context::kDefaultSeed;
Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}
void Context::ConfigureGpuId(bool require_gpu) {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
if (gpu_id == kCpuId) { // 0. User didn't specify the `gpu_id'
if (require_gpu) { // 1. `tree_method' or `predictor' or both are using
// GPU.
// 2. Use device 0 as default.
this->UpdateAllowUnknown(Args{{"gpu_id", "0"}});
}
}
namespace {
inline constexpr char const* kDevice = "device";
// 3. When booster is loaded from a memory image (Python pickle or R
// raw model), number of available GPUs could be different. Wrap around it.
int32_t n_gpus = common::AllVisibleGPUs();
if (n_gpus == 0) {
if (gpu_id != kCpuId) {
LOG(WARNING) << "No visible GPU is found, setting `gpu_id` to -1";
}
this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}});
} else if (fail_on_invalid_gpu_id) {
CHECK(gpu_id == kCpuId || gpu_id < n_gpus)
<< "Only " << n_gpus << " GPUs are visible, gpu_id " << gpu_id << " is invalid.";
} else if (gpu_id != kCpuId && gpu_id >= n_gpus) {
LOG(WARNING) << "Only " << n_gpus << " GPUs are visible, setting `gpu_id` to "
<< gpu_id % n_gpus;
this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(gpu_id % n_gpus)}});
}
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
device = DeviceOrd::CPU();
return device;
}
#else
// Just set it to CPU, don't think about it.
this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}});
(void)(require_gpu);
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
// Check CUDA on the current device, wrap the ordinal if necessary.
[[nodiscard]] DeviceOrd CUDAOrdinal(DeviceOrd device, bool fail_on_invalid) {
// When booster is loaded from a memory image (Python pickle or R raw model), number of
// available GPUs could be different. Wrap around it.
std::int32_t n_visible = common::AllVisibleGPUs();
if (n_visible == 0) {
if (device.IsCUDA()) {
LOG(WARNING) << "No visible GPU is found, setting device to CPU.";
}
device = DeviceOrd::CPU();
} else if (fail_on_invalid) {
CHECK(device.IsCPU() || device.ordinal < n_visible)
<< "Only " << n_visible << " GPUs are visible, ordinal " << device.ordinal
<< " is invalid.";
} else if (device.IsCUDA() && device.ordinal >= n_visible) {
device.ordinal = device.ordinal % n_visible;
LOG(WARNING) << "Only " << n_visible << " GPUs are visible, setting device ordinal to "
<< device.ordinal;
}
common::SetDevice(this->gpu_id);
if (device.IsCUDA()) {
common::SetDevice(device.ordinal);
}
return device;
}
#endif // !defined(XGBOOST_USE_CUDA)
[[nodiscard]] std::optional<std::int32_t> ParseInt(StringView ordinal) {
// Some basic checks to ensure valid `gpu_id` and device ordinal instead of directly parsing and
// letting go of unknown characters.
if (ordinal.empty()) {
return std::nullopt;
}
std::size_t offset{0};
if (ordinal[0] == '-') {
offset = 1;
}
if (ordinal.size() <= offset) {
return std::nullopt;
}
bool valid = std::all_of(ordinal.cbegin() + offset, ordinal.cend(),
[](auto c) { return std::isdigit(c); });
if (!valid) {
return std::nullopt;
}
std::int32_t parsed_id{Context::kCpuId};
auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id);
if (res.ec != std::errc()) {
return std::nullopt;
}
return parsed_id;
}
[[nodiscard]] DeviceOrd MakeDeviceOrd(std::string const& input, bool fail_on_invalid_gpu_id) {
StringView msg{R"(Invalid argument for `device`. Expected to be one of the following:
- cpu
- cuda
- cuda:<device ordinal> # e.g. cuda:0
- gpu
- gpu:<device ordinal> # e.g. gpu:0
)"};
auto fatal = [&] { LOG(FATAL) << msg << "Got: `" << input << "`."; };
#if defined(__MINGW32__)
// mingw hangs on regex using rtools 430. Basic checks only.
CHECK_GE(input.size(), 3) << msg;
auto substr = input.substr(0, 3);
bool valid = substr == "cpu" || substr == "cud" || substr == "gpu";
CHECK(valid) << msg;
#else
std::regex pattern{"gpu(:[0-9]+)?|cuda(:[0-9]+)?|cpu"};
if (!std::regex_match(input, pattern)) {
fatal();
}
#endif // defined(__MINGW32__)
// handle alias
std::string s_device = std::regex_replace(input, std::regex{"gpu"}, DeviceSym::CUDA());
auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':');
DeviceOrd device;
device.ordinal = Context::InvalidOrdinal(); // mark it invalid for check.
if (split_it == s_device.cend()) {
// no ordinal.
if (s_device == DeviceSym::CPU()) {
device = DeviceOrd::CPU();
} else if (s_device == DeviceSym::CUDA()) {
device = DeviceOrd::CUDA(0); // use 0 as default;
} else {
fatal();
}
} else {
// must be CUDA when ordinal is specifed.
// +1 for colon
std::size_t offset = std::distance(s_device.cbegin(), split_it) + 1;
// substr
StringView s_ordinal = {s_device.data() + offset, s_device.size() - offset};
if (s_ordinal.empty()) {
fatal();
}
auto opt_id = ParseInt(s_ordinal);
if (!opt_id.has_value()) {
fatal();
}
CHECK_LE(opt_id.value(), std::numeric_limits<bst_d_ordinal_t>::max())
<< "Ordinal value too large.";
device = DeviceOrd::CUDA(opt_id.value());
}
if (device.ordinal < Context::kCpuId) {
fatal();
}
device = CUDAOrdinal(device, fail_on_invalid_gpu_id);
return device;
}
} // namespace
void Context::ConfigureGpuId(bool require_gpu) {
if (this->IsCPU() && require_gpu) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}});
}
}
void Context::SetDeviceOrdinal(Args const& kwargs) {
auto gpu_id_it = std::find_if(kwargs.cbegin(), kwargs.cend(),
[](auto const& p) { return p.first == "gpu_id"; });
auto has_gpu_id = gpu_id_it != kwargs.cend();
auto device_it = std::find_if(kwargs.cbegin(), kwargs.cend(),
[](auto const& p) { return p.first == kDevice; });
auto has_device = device_it != kwargs.cend();
if (has_device && has_gpu_id) {
LOG(FATAL) << "Both `device` and `gpu_id` are specified. Use `device` instead.";
}
if (has_gpu_id) {
// Compatible with XGBoost < 2.0.0
error::WarnDeprecatedGPUId();
auto opt_id = ParseInt(StringView{gpu_id_it->second});
CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second;
if (opt_id.value() > Context::kCpuId) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}});
} else {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}});
}
return;
}
auto new_d = MakeDeviceOrd(this->device, this->fail_on_invalid_gpu_id);
if (!has_device) {
CHECK_EQ(new_d.ordinal, this->device_.ordinal); // unchanged
}
this->SetDevice(new_d);
if (this->IsCPU()) {
CHECK_EQ(this->device_.ordinal, kCpuId);
} else {
CHECK_GT(this->device_.ordinal, kCpuId);
}
}
std::int32_t Context::Threads() const {

View File

@@ -7,7 +7,7 @@
#include <dmlc/data.h>
#include <algorithm>
#include <cstddef> // std::size_t
#include <cstddef> // for size_t
#include <functional>
#include <limits>
#include <map>
@@ -17,6 +17,7 @@
#include <vector>
#include "../c_api/c_api_error.h"
#include "../common/error_msg.h" // for MaxFeatureSize
#include "../common/math.h"
#include "array_interface.h"
#include "arrow-cdi.h"
@@ -300,9 +301,9 @@ class ArrayAdapter : public detail::SingleBatchDataIter<ArrayAdapterBatch> {
array_interface_ = ArrayInterface<2>(get<Object const>(j));
batch_ = ArrayAdapterBatch{array_interface_};
}
ArrayAdapterBatch const& Value() const override { return batch_; }
size_t NumRows() const { return array_interface_.Shape(0); }
size_t NumColumns() const { return array_interface_.Shape(1); }
[[nodiscard]] ArrayAdapterBatch const& Value() const override { return batch_; }
[[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
[[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }
private:
ArrayAdapterBatch batch_;
@@ -476,7 +477,6 @@ class CSCArrayAdapterBatch : public detail::NoMetaInfo {
ArrayInterface<1> indptr_;
ArrayInterface<1> indices_;
ArrayInterface<1> values_;
bst_row_t n_rows_;
class Line {
std::size_t column_idx_;
@@ -502,11 +502,8 @@ class CSCArrayAdapterBatch : public detail::NoMetaInfo {
static constexpr bool kIsRowMajor = false;
CSCArrayAdapterBatch(ArrayInterface<1> indptr, ArrayInterface<1> indices,
ArrayInterface<1> values, bst_row_t n_rows)
: indptr_{std::move(indptr)},
indices_{std::move(indices)},
values_{std::move(values)},
n_rows_{n_rows} {}
ArrayInterface<1> values)
: indptr_{std::move(indptr)}, indices_{std::move(indices)}, values_{std::move(values)} {}
std::size_t Size() const { return indptr_.n - 1; }
Line GetLine(std::size_t idx) const {
@@ -541,8 +538,7 @@ class CSCArrayAdapter : public detail::SingleBatchDataIter<CSCArrayAdapterBatch>
indices_{indices},
values_{values},
num_rows_{num_rows},
batch_{
CSCArrayAdapterBatch{indptr_, indices_, values_, static_cast<bst_row_t>(num_rows_)}} {}
batch_{CSCArrayAdapterBatch{indptr_, indices_, values_}} {}
// JVM package sends 0 as unknown
size_t NumRows() const { return num_rows_ == 0 ? kAdapterUnknownSize : num_rows_; }

View File

@@ -386,7 +386,7 @@ inline bool ArrayInterfaceHandler::IsCudaPtr(void const *) { return false; }
* numpy has the proper support even though it's in the __cuda_array_interface__
* protocol defined by numba.
*/
template <int32_t D, bool allow_mask = (D == 1)>
template <std::int32_t D, bool allow_mask = (D == 1)>
class ArrayInterface {
static_assert(D > 0, "Invalid dimension for array interface.");
@@ -457,7 +457,7 @@ class ArrayInterface {
explicit ArrayInterface(std::string const &str) : ArrayInterface{StringView{str}} {}
explicit ArrayInterface(StringView str) : ArrayInterface<D>{Json::Load(str)} {}
explicit ArrayInterface(StringView str) : ArrayInterface{Json::Load(str)} {}
void AssignType(StringView typestr) {
using T = ArrayInterfaceHandler::Type;
@@ -590,9 +590,9 @@ class ArrayInterface {
};
template <std::int32_t D, typename Fn>
void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
void DispatchDType(ArrayInterface<D> const array, DeviceOrd device, Fn fn) {
// Only used for cuDF at the moment.
CHECK_EQ(array.valid.Size(), 0);
CHECK_EQ(array.valid.Capacity(), 0);
auto dispatch = [&](auto t) {
using T = std::remove_const_t<decltype(t)> const;
// Set the data size to max as we don't know the original size of a sliced array:

View File

@@ -4,42 +4,57 @@
*/
#include "xgboost/data.h"
#include <dmlc/registry.h>
#include <dmlc/registry.h> // for DMLC_REGISTRY_ENABLE, DMLC_REGISTRY_LINK_TAG
#include <array>
#include <cstddef>
#include <cstring>
#include <algorithm> // for copy, max, none_of, min
#include <atomic> // for atomic
#include <cmath> // for abs
#include <cstdint> // for uint64_t, int32_t, uint8_t, uint32_t
#include <cstring> // for size_t, strcmp, memcpy
#include <exception> // for exception
#include <iostream> // for operator<<, basic_ostream, basic_ostream::op...
#include <map> // for map, operator!=
#include <numeric> // for accumulate, partial_sum
#include <tuple> // for get, apply
#include <type_traits> // for remove_pointer_t, remove_reference
#include "../collective/communicator-inl.h"
#include "../collective/communicator.h"
#include "../common/algorithm.h" // for StableSort
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../common/common.h"
#include "../common/error_msg.h" // for InfInData, GroupWeight, GroupSize
#include "../common/group_data.h"
#include "../common/io.h"
#include "../common/linalg_op.h"
#include "../common/math.h"
#include "../common/numeric.h" // for Iota
#include "../common/threading_utils.h"
#include "../common/version.h"
#include "../data/adapter.h"
#include "../data/iterative_dmatrix.h"
#include "./sparse_page_dmatrix.h"
#include "./sparse_page_source.h"
#include "dmlc/io.h"
#include "file_iterator.h"
#include "simple_dmatrix.h"
#include "sparse_page_writer.h"
#include "validation.h"
#include "xgboost/c_api.h"
#include "xgboost/context.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/learner.h"
#include "xgboost/linalg.h" // Vector
#include "xgboost/logging.h"
#include "xgboost/string_view.h"
#include "xgboost/version_config.h"
#include "../collective/communicator-inl.h" // for GetRank, GetWorldSize, Allreduce, IsFederated
#include "../collective/communicator.h" // for Operation
#include "../common/algorithm.h" // for StableSort
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../common/common.h" // for Split
#include "../common/error_msg.h" // for GroupSize, GroupWeight, InfInData
#include "../common/group_data.h" // for ParallelGroupBuilder
#include "../common/io.h" // for PeekableInStream
#include "../common/linalg_op.h" // for ElementWiseTransformHost
#include "../common/math.h" // for CheckNAN
#include "../common/numeric.h" // for Iota, RunLengthEncode
#include "../common/threading_utils.h" // for ParallelFor
#include "../common/version.h" // for Version
#include "../data/adapter.h" // for COOTuple, FileAdapter, IsValidFunctor
#include "../data/iterative_dmatrix.h" // for IterativeDMatrix
#include "./sparse_page_dmatrix.h" // for SparsePageDMatrix
#include "array_interface.h" // for ArrayInterfaceHandler, ArrayInterface, Dispa...
#include "dmlc/base.h" // for BeginPtr
#include "dmlc/common.h" // for OMPException
#include "dmlc/data.h" // for Parser
#include "dmlc/endian.h" // for ByteSwap, DMLC_IO_NO_ENDIAN_SWAP
#include "dmlc/io.h" // for Stream
#include "dmlc/thread_local.h" // for ThreadLocalStore
#include "ellpack_page.h" // for EllpackPage
#include "file_iterator.h" // for ValidateFileFormat, FileIterator, Next, Reset
#include "gradient_index.h" // for GHistIndexMatrix
#include "simple_dmatrix.h" // for SimpleDMatrix
#include "sparse_page_writer.h" // for SparsePageFormatReg
#include "validation.h" // for LabelsCheck, WeightsCheck, ValidateQueryGroup
#include "xgboost/base.h" // for bst_group_t, bst_row_t, bst_float, bst_ulong
#include "xgboost/context.h" // for Context
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/learner.h" // for HostDeviceVector
#include "xgboost/linalg.h" // for Tensor, Stack, TensorView, Vector, ArrayInte...
#include "xgboost/logging.h" // for Error, LogCheck_EQ, CHECK, CHECK_EQ, LOG
#include "xgboost/span.h" // for Span, operator!=, SpanIterator
#include "xgboost/string_view.h" // for operator==, operator<<, StringView
namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SparsePage>);
@@ -351,7 +366,7 @@ MetaInfo MetaInfo::Slice(common::Span<int32_t const> ridxs) const {
// Groups is maintained by a higher level Python function. We should aim at deprecating
// the slice function.
if (this->labels.Size() != this->num_row_) {
auto t_labels = this->labels.View(this->labels.Data()->DeviceIdx());
auto t_labels = this->labels.View(this->labels.Data()->Device());
out.labels.Reshape(ridxs.size(), labels.Shape(1));
out.labels.Data()->HostVector() =
Gather(this->labels.Data()->HostVector(), ridxs, t_labels.Stride(0));
@@ -379,7 +394,7 @@ MetaInfo MetaInfo::Slice(common::Span<int32_t const> ridxs) const {
if (this->base_margin_.Size() != this->num_row_) {
CHECK_EQ(this->base_margin_.Size() % this->num_row_, 0)
<< "Incorrect size of base margin vector.";
auto t_margin = this->base_margin_.View(this->base_margin_.Data()->DeviceIdx());
auto t_margin = this->base_margin_.View(this->base_margin_.Data()->Device());
out.base_margin_.Reshape(ridxs.size(), t_margin.Shape(1));
out.base_margin_.Data()->HostVector() =
Gather(this->base_margin_.Data()->HostVector(), ridxs, t_margin.Stride(0));
@@ -416,7 +431,8 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<T
p_out->Reshape(array.shape);
return;
}
CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
CHECK_EQ(array.valid.Capacity(), 0)
<< "Meta info like label or weight can not have missing value.";
if (array.is_contiguous && array.type == ToDType<T>::kType) {
// Handle contigious
p_out->ModifyInplace([&](HostDeviceVector<T>* data, common::Span<size_t, D> shape) {
@@ -429,10 +445,10 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<T
return;
}
p_out->Reshape(array.shape);
auto t_out = p_out->View(Context::kCpuId);
auto t_out = p_out->View(DeviceOrd::CPU());
CHECK(t_out.CContiguous());
auto const shape = t_out.Shape();
DispatchDType(array, Context::kCpuId, [&](auto&& in) {
DispatchDType(array, DeviceOrd::CPU(), [&](auto&& in) {
linalg::ElementWiseTransformHost(t_out, ctx.Threads(), [&](auto i, auto) {
return std::apply(in, linalg::UnravelIndex<D>(i, shape));
});
@@ -548,7 +564,7 @@ void MetaInfo::SetInfo(Context const& ctx, const char* key, const void* dptr, Da
CHECK(key);
auto proc = [&](auto cast_d_ptr) {
using T = std::remove_pointer_t<decltype(cast_d_ptr)>;
auto t = linalg::TensorView<T, 1>(common::Span<T>{cast_d_ptr, num}, {num}, Context::kCpuId);
auto t = linalg::TensorView<T, 1>(common::Span<T>{cast_d_ptr, num}, {num}, DeviceOrd::CPU());
CHECK(t.CContiguous());
Json interface {
linalg::ArrayInterface(t)
@@ -723,11 +739,14 @@ void MetaInfo::SynchronizeNumberOfColumns() {
namespace {
template <typename T>
void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) {
CHECK(v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device)
<< "Data is resided on a different device than `gpu_id`. "
<< "Device that data is on: " << v.DeviceIdx() << ", "
<< "`gpu_id` for XGBoost: " << device;
bool valid = v.Device().IsCPU() || device == Context::kCpuId || v.DeviceIdx() == device;
if (!valid) {
LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
"the booster. The device ordinal of the data is: "
<< v.DeviceIdx() << "; the device ordinal of the Booster is: " << device;
}
}
template <typename T, std::int32_t D>
void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) {
CheckDevice(device, *v.Data());
@@ -806,10 +825,10 @@ DMatrix::~DMatrix() {
}
}
DMatrix *TryLoadBinary(std::string fname, bool silent) {
int magic;
std::unique_ptr<dmlc::Stream> fi(
dmlc::Stream::Create(fname.c_str(), "r", true));
namespace {
DMatrix* TryLoadBinary(std::string fname, bool silent) {
std::int32_t magic;
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r", true));
if (fi != nullptr) {
common::PeekableInStream is(fi.get());
if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic)) {
@@ -817,11 +836,10 @@ DMatrix *TryLoadBinary(std::string fname, bool silent) {
dmlc::ByteSwap(&magic, sizeof(magic), 1);
}
if (magic == data::SimpleDMatrix::kMagic) {
DMatrix *dmat = new data::SimpleDMatrix(&is);
DMatrix* dmat = new data::SimpleDMatrix(&is);
if (!silent) {
LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_
<< " matrix with " << dmat->Info().num_nonzero_
<< " entries loaded from " << fname;
LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with "
<< dmat->Info().num_nonzero_ << " entries loaded from " << fname;
}
return dmat;
}
@@ -829,6 +847,7 @@ DMatrix *TryLoadBinary(std::string fname, bool silent) {
}
return nullptr;
}
} // namespace
DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode) {
auto need_split = false;
@@ -840,7 +859,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
}
std::string fname, cache_file;
size_t dlm_pos = uri.find('#');
auto dlm_pos = uri.find('#');
if (dlm_pos != std::string::npos) {
cache_file = uri.substr(dlm_pos + 1, uri.length());
fname = uri.substr(0, dlm_pos);
@@ -852,14 +871,11 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
for (size_t i = 0; i < cache_shards.size(); ++i) {
size_t pos = cache_shards[i].rfind('.');
if (pos == std::string::npos) {
os << cache_shards[i]
<< ".r" << collective::GetRank()
<< "-" << collective::GetWorldSize();
os << cache_shards[i] << ".r" << collective::GetRank() << "-"
<< collective::GetWorldSize();
} else {
os << cache_shards[i].substr(0, pos)
<< ".r" << collective::GetRank()
<< "-" << collective::GetWorldSize()
<< cache_shards[i].substr(pos, cache_shards[i].length());
os << cache_shards[i].substr(0, pos) << ".r" << collective::GetRank() << "-"
<< collective::GetWorldSize() << cache_shards[i].substr(pos, cache_shards[i].length());
}
if (i + 1 != cache_shards.size()) {
os << ':';
@@ -890,12 +906,12 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
LOG(CONSOLE) << "Load part of data " << partid << " of " << npart << " parts";
}
data::ValidateFileFormat(fname);
DMatrix* dmat {nullptr};
DMatrix* dmat{nullptr};
if (cache_file.empty()) {
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, "auto"));
fname = data::ValidateFileFormat(fname);
std::unique_ptr<dmlc::Parser<std::uint32_t>> parser(
dmlc::Parser<std::uint32_t>::Create(fname.c_str(), partid, npart, "auto"));
data::FileAdapter adapter(parser.get());
dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
cache_file, data_split_mode);

View File

@@ -45,7 +45,8 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
p_out->Reshape(array.shape);
return;
}
CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
CHECK_EQ(array.valid.Capacity(), 0)
<< "Meta info like label or weight can not have missing value.";
auto ptr_device = SetDeviceToPtr(array.data);
p_out->SetDevice(ptr_device);
@@ -67,7 +68,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
return;
}
p_out->Reshape(array.shape);
auto t = p_out->View(ptr_device);
auto t = p_out->View(DeviceOrd::CUDA(ptr_device));
linalg::ElementWiseTransformDevice(
t,
[=] __device__(size_t i, T) {

View File

@@ -3,12 +3,20 @@
*/
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
#include "ellpack_page.h"
#include <xgboost/data.h>
// dummy implementation of EllpackPage in case CUDA is not used
namespace xgboost {
class EllpackPageImpl {};
// Dummy CPU-only implementation: holds only the histogram cuts so that
// EllpackPage::Cuts() can return a valid reference when CUDA is disabled.
class EllpackPageImpl {
  common::HistogramCuts cuts_;

 public:
  // Accessors mirror the CUDA implementation's interface.
  [[nodiscard]] common::HistogramCuts& Cuts() { return cuts_; }
  [[nodiscard]] common::HistogramCuts const& Cuts() const { return cuts_; }
};
EllpackPage::EllpackPage() = default;
@@ -32,5 +40,16 @@ size_t EllpackPage::Size() const {
return 0;
}
// CPU-only stub: EllpackPage requires CUDA; calling this aborts the process.
[[nodiscard]] common::HistogramCuts& EllpackPage::Cuts() {
  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
                "EllpackPage is required";
  // Unreachable: LOG(FATAL) terminates; the return only satisfies the compiler.
  return impl_->Cuts();
}
// CPU-only stub (const overload): EllpackPage requires CUDA; calling this aborts.
[[nodiscard]] common::HistogramCuts const& EllpackPage::Cuts() const {
  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
                "EllpackPage is required";
  // Unreachable: LOG(FATAL) terminates; the return only satisfies the compiler.
  return impl_->Cuts();
}
} // namespace xgboost
#endif // XGBOOST_USE_CUDA || XGBOOST_USE_HIP

View File

@@ -4,12 +4,17 @@
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <algorithm> // for copy
#include <utility> // for move
#include <vector> // for vector
#include "../common/categorical.h"
#include "../common/cuda_context.cuh"
#include "../common/hist_util.cuh"
#include "../common/random.h"
#include "../common/transform_iterator.h" // MakeIndexTransformIter
#include "./ellpack_page.cuh"
#include "device_adapter.cuh" // for HasInfInData
#include "ellpack_page.h"
#include "gradient_index.h"
#include "xgboost/data.h"
@@ -32,6 +37,16 @@ size_t EllpackPage::Size() const { return impl_->Size(); }
void EllpackPage::SetBaseRowId(std::size_t row_id) { impl_->SetBaseRowId(row_id); }
[[nodiscard]] common::HistogramCuts& EllpackPage::Cuts() {
CHECK(impl_);
return impl_->Cuts();
}
[[nodiscard]] common::HistogramCuts const& EllpackPage::Cuts() const {
CHECK(impl_);
return impl_->Cuts();
}
// Bin each input data entry, store the bin indices in compressed form.
__global__ void CompressBinEllpackKernel(
common::CompressedBufferWriter wr,
@@ -128,7 +143,11 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
monitor_.Start("Quantiles");
// Create the quantile sketches for the dmatrix and initialize HistogramCuts.
row_stride = GetRowStride(dmat);
cuts_ = common::DeviceSketch(ctx->gpu_id, dmat, param.max_bin);
if (!param.hess.empty()) {
cuts_ = common::DeviceSketchWithHessian(ctx, dmat, param.max_bin, param.hess);
} else {
cuts_ = common::DeviceSketch(ctx, dmat, param.max_bin);
}
monitor_.Stop("Quantiles");
monitor_.Start("InitCompressedData");
@@ -343,7 +362,8 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
auto d_csc_indptr = dh::ToSpan(csc_indptr);
auto bin_type = page.index.GetBinTypeSize();
common::CompressedBufferWriter writer{page.cut.TotalBins() + 1}; // +1 for null value
common::CompressedBufferWriter writer{page.cut.TotalBins() +
static_cast<std::size_t>(1)}; // +1 for null value
dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable {
auto ridx = idx / row_stride;
@@ -387,8 +407,15 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
// copy gidx
common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
dh::device_vector<size_t> row_ptr(page.row_ptr);
dh::device_vector<size_t> row_ptr(page.row_ptr.size());
auto d_row_ptr = dh::ToSpan(row_ptr);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
hipMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
#endif
auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
auto null = accessor.NullValue();

View File

@@ -1,17 +1,18 @@
/*!
* Copyright 2019 by XGBoost Contributors
/**
* Copyright 2019-2023, XGBoost Contributors
*/
#ifndef XGBOOST_DATA_ELLPACK_PAGE_H_
#define XGBOOST_DATA_ELLPACK_PAGE_H_
#ifndef XGBOOST_DATA_ELLPACK_PAGE_CUH_
#define XGBOOST_DATA_ELLPACK_PAGE_CUH_
#include <thrust/binary_search.h>
#include <xgboost/data.h>
#include "../common/categorical.h"
#include "../common/compressed_iterator.h"
#include "../common/device_helpers.cuh"
#include "../common/hist_util.h"
#include "../common/categorical.h"
#include <thrust/binary_search.h>
#include "ellpack_page.h"
namespace xgboost {
/** \brief Struct for accessing and manipulating an ELLPACK matrix on the
@@ -194,8 +195,8 @@ class EllpackPageImpl {
base_rowid = row_id;
}
common::HistogramCuts& Cuts() { return cuts_; }
common::HistogramCuts const& Cuts() const { return cuts_; }
[[nodiscard]] common::HistogramCuts& Cuts() { return cuts_; }
[[nodiscard]] common::HistogramCuts const& Cuts() const { return cuts_; }
/*! \return Estimation of memory cost of this page. */
static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ;
@@ -256,4 +257,4 @@ inline size_t GetRowStride(DMatrix* dmat) {
}
} // namespace xgboost
#endif // XGBOOST_DATA_ELLPACK_PAGE_H_
#endif // XGBOOST_DATA_ELLPACK_PAGE_CUH_

59
src/data/ellpack_page.h Normal file
View File

@@ -0,0 +1,59 @@
/**
* Copyright 2017-2023 by XGBoost Contributors
*/
#ifndef XGBOOST_DATA_ELLPACK_PAGE_H_
#define XGBOOST_DATA_ELLPACK_PAGE_H_
#include <memory> // for unique_ptr
#include "../common/hist_util.h" // for HistogramCuts
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for DMatrix, BatchParam
namespace xgboost {
class EllpackPageImpl;
/**
* @brief A page stored in ELLPACK format.
*
* This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid
* including CUDA-specific implementation details in the header.
*/
class EllpackPage {
 public:
  /**
   * @brief Default constructor.
   *
   * This is used in the external memory case. An empty ELLPACK page is constructed with its content
   * set later by the reader.
   */
  EllpackPage();
  /**
   * @brief Constructor from an existing DMatrix.
   *
   * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
   * in CSR format.
   */
  explicit EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param);
  /*! \brief Destructor. */
  ~EllpackPage();
  /*! \brief Move constructor; ownership of the implementation is transferred. */
  EllpackPage(EllpackPage&& that);
  /*! \return Number of instances in the page. */
  [[nodiscard]] size_t Size() const;
  /*! \brief Set the base row id for this page. */
  void SetBaseRowId(std::size_t row_id);
  /*! \brief Access the underlying implementation (CUDA-specific in GPU builds). */
  [[nodiscard]] const EllpackPageImpl* Impl() const { return impl_.get(); }
  /*! \brief Mutable access to the underlying implementation. */
  EllpackPageImpl* Impl() { return impl_.get(); }
  /*! \brief Histogram cut points used to bin this page. */
  [[nodiscard]] common::HistogramCuts& Cuts();
  /*! \brief Histogram cut points used to bin this page (const overload). */
  [[nodiscard]] common::HistogramCuts const& Cuts() const;

 private:
  // PImpl pointer keeping CUDA-specific details out of this header.
  std::unique_ptr<EllpackPageImpl> impl_;
};
} // namespace xgboost
#endif // XGBOOST_DATA_ELLPACK_PAGE_H_

View File

@@ -1,60 +1,59 @@
/*!
* Copyright 2019-2021 XGBoost contributors
/**
* Copyright 2019-2023, XGBoost contributors
*/
#include <xgboost/data.h>
#include <dmlc/registry.h>
#include <cstddef> // for size_t
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
#include "../common/ref_resource_view.h" // for ReadVec, WriteVec
#include "ellpack_page.cuh"
#include "sparse_page_writer.h"
#include "histogram_cut_format.h"
namespace xgboost {
namespace data {
#include "histogram_cut_format.h" // for ReadHistogramCuts, WriteHistogramCuts
#include "sparse_page_writer.h" // for SparsePageFormat
namespace xgboost::data {
DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format);
class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
public:
bool Read(EllpackPage* page, dmlc::SeekStream* fi) override {
bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override {
auto* impl = page->Impl();
if (!ReadHistogramCuts(&impl->Cuts(), fi)) {
return false;
}
fi->Read(&impl->n_rows);
fi->Read(&impl->is_dense);
fi->Read(&impl->row_stride);
fi->Read(&impl->gidx_buffer.HostVector());
if (!fi->Read(&impl->n_rows)) {
return false;
}
if (!fi->Read(&impl->is_dense)) {
return false;
}
if (!fi->Read(&impl->row_stride)) {
return false;
}
if (!common::ReadVec(fi, &impl->gidx_buffer.HostVector())) {
return false;
}
if (!fi->Read(&impl->base_rowid)) {
return false;
}
return true;
}
size_t Write(const EllpackPage& page, dmlc::Stream* fo) override {
size_t bytes = 0;
size_t Write(const EllpackPage& page, common::AlignedFileWriteStream* fo) override {
std::size_t bytes{0};
auto* impl = page.Impl();
bytes += WriteHistogramCuts(impl->Cuts(), fo);
fo->Write(impl->n_rows);
bytes += sizeof(impl->n_rows);
fo->Write(impl->is_dense);
bytes += sizeof(impl->is_dense);
fo->Write(impl->row_stride);
bytes += sizeof(impl->row_stride);
bytes += fo->Write(impl->n_rows);
bytes += fo->Write(impl->is_dense);
bytes += fo->Write(impl->row_stride);
CHECK(!impl->gidx_buffer.ConstHostVector().empty());
fo->Write(impl->gidx_buffer.HostVector());
bytes += impl->gidx_buffer.ConstHostSpan().size_bytes() + sizeof(uint64_t);
fo->Write(impl->base_rowid);
bytes += sizeof(impl->base_rowid);
bytes += common::WriteVec(fo, impl->gidx_buffer.HostVector());
bytes += fo->Write(impl->base_rowid);
return bytes;
}
};
XGBOOST_REGISTER_ELLPACK_PAGE_FORMAT(raw)
.describe("Raw ELLPACK binary data format.")
.set_body([]() {
return new EllpackPageRawFormat();
});
} // namespace data
} // namespace xgboost
.set_body([]() { return new EllpackPageRawFormat(); });
} // namespace xgboost::data

View File

@@ -5,10 +5,10 @@
#include <utility>
#include "ellpack_page.cuh"
#include "ellpack_page.h" // for EllpackPage
#include "ellpack_page_source.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
void EllpackPageSource::Fetch() {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
@@ -31,5 +31,4 @@ void EllpackPageSource::Fetch() {
this->WriteCache();
}
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -6,17 +6,17 @@
#define XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_
#include <xgboost/data.h>
#include <memory>
#include <string>
#include <utility>
#include "../common/common.h"
#include "../common/hist_util.h"
#include "ellpack_page.h" // for EllpackPage
#include "sparse_page_source.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
bool is_dense_;
size_t row_stride_;
@@ -52,8 +52,7 @@ inline void EllpackPageSource::Fetch() {
(void)(is_dense_);
common::AssertGPUSupport();
}
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
} // namespace data
} // namespace xgboost
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace xgboost::data
#endif // XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_

51
src/data/file_iterator.cc Normal file
View File

@@ -0,0 +1,51 @@
/**
* Copyright 2021-2023, XGBoost contributors
*/
#include "file_iterator.h"
#include <xgboost/logging.h> // for LogCheck_EQ, LogCheck_LE, CHECK_EQ, CHECK_LE, LOG, LOG_...
#include <filesystem> // for weakly_canonical, path, u8path
#include <map> // for map, operator==
#include <ostream> // for operator<<, basic_ostream, istringstream
#include <vector> // for vector
#include "../common/common.h" // for Split
#include "xgboost/string_view.h" // for operator<<, StringView
namespace xgboost::data {
/**
 * @brief Validate and normalize a data-file URI of the form `path?format=...[#cachefile]`.
 *
 * Splits off an optional `#cachefile` suffix, requires a `?format=` query component,
 * parses the query arguments, canonicalizes the file path, and reassembles the URI.
 */
std::string ValidateFileFormat(std::string const& uri) {
  auto cache_split = common::Split(uri, '#');
  CHECK_LE(cache_split.size(), 2)
      << "Only one `#` is allowed in file path for cachefile specification";

  auto query_split = common::Split(cache_split[0], '?');
  StringView msg{"URI parameter `format` is required for loading text data: filename?format=csv"};
  CHECK_EQ(query_split.size(), 2) << msg;

  // Parse `key=value` pairs separated by `&`.
  std::map<std::string, std::string> params;
  auto kv_list = common::Split(query_split[1], '&');
  for (size_t i = 0; i < kv_list.size(); ++i) {
    std::istringstream is(kv_list[i]);
    std::pair<std::string, std::string> kv;
    CHECK(std::getline(is, kv.first, '=')) << "Invalid uri argument format"
                                           << " for key in arg " << i + 1;
    CHECK(std::getline(is, kv.second)) << "Invalid uri argument format"
                                       << " for value in arg " << i + 1;
    params.insert(kv);
  }
  if (params.find("format") == params.cend()) {
    LOG(FATAL) << msg;
  }

  // Canonicalize the path component (resolves `.`/`..`, normalizes separators).
  auto raw_path = common::Split(uri, '?')[0];
  namespace fs = std::filesystem;
  query_split[0] = fs::weakly_canonical(fs::u8path(raw_path)).string();

  std::string result = query_split[0] + "?" + query_split[1];
  if (cache_split.size() == 2) {
    result += '#' + cache_split[1];
  }
  return result;
}
}  // namespace xgboost::data

View File

@@ -4,46 +4,20 @@
#ifndef XGBOOST_DATA_FILE_ITERATOR_H_
#define XGBOOST_DATA_FILE_ITERATOR_H_
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <algorithm> // for max_element
#include <cstddef> // for size_t
#include <cstdint> // for uint32_t
#include <memory> // for unique_ptr
#include <string> // for string
#include <utility> // for move
#include "array_interface.h"
#include "dmlc/data.h"
#include "xgboost/c_api.h"
#include "xgboost/json.h"
#include "xgboost/linalg.h"
#include "dmlc/data.h" // for RowBlock, Parser
#include "xgboost/c_api.h" // for XGDMatrixSetDenseInfo, XGDMatrixFree, XGProxyDMatrixCreate
#include "xgboost/linalg.h" // for ArrayInterfaceStr, MakeVec
#include "xgboost/logging.h" // for CHECK
namespace xgboost {
namespace data {
inline void ValidateFileFormat(std::string const& uri) {
std::vector<std::string> name_cache = common::Split(uri, '#');
CHECK_LE(name_cache.size(), 2)
<< "Only one `#` is allowed in file path for cachefile specification";
std::vector<std::string> name_args = common::Split(name_cache[0], '?');
CHECK_LE(name_args.size(), 2) << "only one `?` is allowed in file path.";
StringView msg{"URI parameter `format` is required for loading text data: filename?format=csv"};
CHECK_EQ(name_args.size(), 2) << msg;
std::map<std::string, std::string> args;
std::vector<std::string> arg_list = common::Split(name_args[1], '&');
for (size_t i = 0; i < arg_list.size(); ++i) {
std::istringstream is(arg_list[i]);
std::pair<std::string, std::string> kv;
CHECK(std::getline(is, kv.first, '=')) << "Invalid uri argument format"
<< " for key in arg " << i + 1;
CHECK(std::getline(is, kv.second)) << "Invalid uri argument format"
<< " for value in arg " << i + 1;
args.insert(kv);
}
if (args.find("format") == args.cend()) {
LOG(FATAL) << msg;
}
}
namespace xgboost::data {
[[nodiscard]] std::string ValidateFileFormat(std::string const& uri);
/**
* An iterator for implementing external memory support with file inputs. Users of
@@ -72,8 +46,7 @@ class FileIterator {
public:
FileIterator(std::string uri, unsigned part_index, unsigned num_parts)
: uri_{std::move(uri)}, part_idx_{part_index}, n_parts_{num_parts} {
ValidateFileFormat(uri_);
: uri_{ValidateFileFormat(std::move(uri))}, part_idx_{part_index}, n_parts_{num_parts} {
XGProxyDMatrixCreate(&proxy_);
}
~FileIterator() {
@@ -132,6 +105,5 @@ inline int Next(DataIterHandle self) {
return static_cast<FileIterator*>(self)->Next();
}
} // namespace fileiter
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_FILE_ITERATOR_H_

View File

@@ -7,13 +7,12 @@
#include <algorithm>
#include <limits>
#include <memory>
#include <utility> // std::forward
#include <utility> // for forward
#include "../common/column_matrix.h"
#include "../common/hist_util.h"
#include "../common/numeric.h"
#include "../common/threading_utils.h"
#include "../common/transform_iterator.h" // MakeIndexTransformIter
#include "../common/transform_iterator.h" // for MakeIndexTransformIter
namespace xgboost {
@@ -21,7 +20,7 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM
GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
double sparse_thresh, bool sorted_sketch,
common::Span<float> hess)
common::Span<float const> hess)
: max_numeric_bins_per_feat{max_bins_per_feat} {
CHECK(p_fmat->SingleColBlock());
// We use sorted sketching for approx tree method since it's more efficient in
@@ -29,7 +28,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
cut = common::SketchOnDMatrix(ctx, p_fmat, max_bins_per_feat, sorted_sketch, hess);
const uint32_t nbins = cut.Ptrs().back();
hit_count.resize(nbins, 0);
hit_count = common::MakeFixedVecWithMalloc(nbins, std::size_t{0});
hit_count_tloc_.resize(ctx->Threads() * nbins, 0);
size_t new_size = 1;
@@ -37,8 +36,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
new_size += batch.Size();
}
row_ptr.resize(new_size);
row_ptr[0] = 0;
row_ptr = common::MakeFixedVecWithMalloc(new_size, std::size_t{0});
const bool isDense = p_fmat->IsDense();
this->isDense_ = isDense;
@@ -61,8 +59,8 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &&cuts,
bst_bin_t max_bin_per_feat)
: row_ptr(info.num_row_ + 1, 0),
hit_count(cuts.TotalBins(), 0),
: row_ptr{common::MakeFixedVecWithMalloc(info.num_row_ + 1, std::size_t{0})},
hit_count{common::MakeFixedVecWithMalloc(cuts.TotalBins(), std::size_t{0})},
cut{std::forward<common::HistogramCuts>(cuts)},
max_numeric_bins_per_feat(max_bin_per_feat),
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
@@ -95,12 +93,10 @@ GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<Feature
isDense_{isDense} {
CHECK_GE(n_threads, 1);
CHECK_EQ(row_ptr.size(), 0);
// The number of threads is pegged to the batch size. If the OMP
// block is parallelized on anything other than the batch/block size,
// it should be reassigned
row_ptr.resize(batch.Size() + 1, 0);
row_ptr = common::MakeFixedVecWithMalloc(batch.Size() + 1, std::size_t{0});
const uint32_t nbins = cut.Ptrs().back();
hit_count.resize(nbins, 0);
hit_count = common::MakeFixedVecWithMalloc(nbins, std::size_t{0});
hit_count_tloc_.resize(n_threads * nbins, 0);
this->PushBatch(batch, ft, n_threads);
@@ -128,20 +124,45 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
#undef INSTANTIATION_PUSH
void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
auto make_index = [this, n_index](auto t, common::BinTypeSize t_size) {
// Must resize instead of allocating a new one. This function is called everytime a
// new batch is pushed, and we grow the size accordingly without loosing the data the
// previous batches.
using T = decltype(t);
std::size_t n_bytes = sizeof(T) * n_index;
CHECK_GE(n_bytes, this->data.size());
auto resource = this->data.Resource();
decltype(this->data) new_vec;
if (!resource) {
CHECK(this->data.empty());
new_vec = common::MakeFixedVecWithMalloc(n_bytes, std::uint8_t{0});
} else {
CHECK(resource->Type() == common::ResourceHandler::kMalloc);
auto malloc_resource = std::dynamic_pointer_cast<common::MallocResource>(resource);
CHECK(malloc_resource);
malloc_resource->Resize(n_bytes);
// gcc-11.3 doesn't work if DataAs is used.
std::uint8_t *new_ptr = reinterpret_cast<std::uint8_t *>(malloc_resource->Data());
new_vec = {new_ptr, n_bytes / sizeof(std::uint8_t), malloc_resource};
}
this->data = std::move(new_vec);
this->index = common::Index{common::Span{data.data(), data.size()}, t_size};
};
if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
isDense) {
// compress dense index to uint8
index.SetBinTypeSize(common::kUint8BinsTypeSize);
index.Resize((sizeof(uint8_t)) * n_index);
make_index(std::uint8_t{}, common::kUint8BinsTypeSize);
} else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
isDense) {
// compress dense index to uint16
index.SetBinTypeSize(common::kUint16BinsTypeSize);
index.Resize((sizeof(uint16_t)) * n_index);
make_index(std::uint16_t{}, common::kUint16BinsTypeSize);
} else {
index.SetBinTypeSize(common::kUint32BinsTypeSize);
index.Resize((sizeof(uint32_t)) * n_index);
// no compression
make_index(std::uint32_t{}, common::kUint32BinsTypeSize);
}
}
@@ -214,11 +235,11 @@ float GHistIndexMatrix::GetFvalue(std::vector<std::uint32_t> const &ptrs,
return std::numeric_limits<float>::quiet_NaN();
}
bool GHistIndexMatrix::ReadColumnPage(dmlc::SeekStream *fi) {
bool GHistIndexMatrix::ReadColumnPage(common::AlignedResourceReadStream *fi) {
return this->columns_->Read(fi, this->cut.Ptrs().data());
}
size_t GHistIndexMatrix::WriteColumnPage(dmlc::Stream *fo) const {
std::size_t GHistIndexMatrix::WriteColumnPage(common::AlignedFileWriteStream *fo) const {
return this->columns_->Write(fo);
}
} // namespace xgboost

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2022 by XGBoost Contributors
/**
* Copyright 2022-2023, XGBoost Contributors
*/
#include <memory> // std::unique_ptr
@@ -41,9 +41,9 @@ void SetIndexData(Context const* ctx, EllpackPageImpl const* page,
}
void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
std::vector<size_t>* p_out) {
common::RefResourceView<std::size_t>* p_out) {
auto& row_ptr = *p_out;
row_ptr.resize(page->Size() + 1, 0);
row_ptr = common::MakeFixedVecWithMalloc(page->Size() + 1, std::size_t{0});
if (page->is_dense) {
std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride);
} else {
@@ -95,7 +95,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
ctx, page, &hit_count_tloc_, [&](auto bin_idx, auto) { return bin_idx; }, this);
}
this->hit_count.resize(n_bins_total, 0);
this->hit_count = common::MakeFixedVecWithMalloc(n_bins_total, std::size_t{0});
this->GatherHitCount(ctx->Threads(), n_bins_total);
// sanity checks

View File

@@ -9,13 +9,14 @@
#include <atomic> // for atomic
#include <cinttypes> // for uint32_t
#include <cstddef> // for size_t
#include <memory>
#include <memory> // for make_unique
#include <vector>
#include "../common/categorical.h"
#include "../common/error_msg.h" // for InfInData
#include "../common/hist_util.h"
#include "../common/numeric.h"
#include "../common/ref_resource_view.h" // for RefResourceView
#include "../common/threading_utils.h"
#include "../common/transform_iterator.h" // for MakeIndexTransformIter
#include "adapter.h"
@@ -25,9 +26,11 @@
namespace xgboost {
namespace common {
class ColumnMatrix;
class AlignedFileWriteStream;
} // namespace common
/*!
* \brief preprocessed global index matrix, in CSR format
/**
* @brief preprocessed global index matrix, in CSR format.
*
* Transform floating values to integer index in histogram This is a global histogram
* index for CPU histogram. On GPU ellpack page is used.
@@ -133,20 +136,22 @@ class GHistIndexMatrix {
}
public:
/*! \brief row pointer to rows by element position */
std::vector<size_t> row_ptr;
/*! \brief The index data */
/** @brief row pointer to rows by element position */
common::RefResourceView<std::size_t> row_ptr;
/** @brief data storage for index. */
common::RefResourceView<std::uint8_t> data;
/** @brief The histogram index. */
common::Index index;
/*! \brief hit count of each index, used for constructing the ColumnMatrix */
std::vector<size_t> hit_count;
/*! \brief The corresponding cuts */
/** @brief hit count of each index, used for constructing the ColumnMatrix */
common::RefResourceView<std::size_t> hit_count;
/** @brief The corresponding cuts */
common::HistogramCuts cut;
/** \brief max_bin for each feature. */
/** @brief max_bin for each feature. */
bst_bin_t max_numeric_bins_per_feat;
/*! \brief base row index for current page (used by external memory) */
size_t base_rowid{0};
/** @brief base row index for current page (used by external memory) */
bst_row_t base_rowid{0};
bst_bin_t MaxNumBinPerFeat() const {
[[nodiscard]] bst_bin_t MaxNumBinPerFeat() const {
return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
}
@@ -155,7 +160,7 @@ class GHistIndexMatrix {
 * \brief Constructor for SimpleDMatrix.
*/
GHistIndexMatrix(Context const* ctx, DMatrix* x, bst_bin_t max_bins_per_feat,
double sparse_thresh, bool sorted_sketch, common::Span<float> hess = {});
double sparse_thresh, bool sorted_sketch, common::Span<float const> hess = {});
/**
* \brief Constructor for Iterative DMatrix. Initialize basic information and prepare
* for push batch.
@@ -218,29 +223,30 @@ class GHistIndexMatrix {
}
}
bool IsDense() const {
return isDense_;
}
[[nodiscard]] bool IsDense() const { return isDense_; }
void SetDense(bool is_dense) { isDense_ = is_dense; }
/**
* \brief Get the local row index.
* @brief Get the local row index.
*/
size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; }
[[nodiscard]] std::size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; }
bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }
bst_feature_t Features() const { return cut.Ptrs().size() - 1; }
[[nodiscard]] bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }
[[nodiscard]] bst_feature_t Features() const { return cut.Ptrs().size() - 1; }
bool ReadColumnPage(dmlc::SeekStream* fi);
size_t WriteColumnPage(dmlc::Stream* fo) const;
[[nodiscard]] bool ReadColumnPage(common::AlignedResourceReadStream* fi);
[[nodiscard]] std::size_t WriteColumnPage(common::AlignedFileWriteStream* fo) const;
common::ColumnMatrix const& Transpose() const;
[[nodiscard]] common::ColumnMatrix const& Transpose() const;
bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
[[nodiscard]] bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
float GetFvalue(std::vector<std::uint32_t> const& ptrs, std::vector<float> const& values,
std::vector<float> const& mins, bst_row_t ridx, bst_feature_t fidx,
bool is_cat) const;
[[nodiscard]] float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
[[nodiscard]] float GetFvalue(std::vector<std::uint32_t> const& ptrs,
std::vector<float> const& values, std::vector<float> const& mins,
bst_row_t ridx, bst_feature_t fidx, bool is_cat) const;
[[nodiscard]] common::HistogramCuts& Cuts() { return cut; }
[[nodiscard]] common::HistogramCuts const& Cuts() const { return cut; }
private:
std::unique_ptr<common::ColumnMatrix> columns_;
@@ -294,5 +300,5 @@ void AssignColumnBinIndex(GHistIndexMatrix const& page, Fn&& assign) {
}
});
}
} // namespace xgboost
} // namespace xgboost
#endif // XGBOOST_DATA_GRADIENT_INDEX_H_

View File

@@ -1,38 +1,49 @@
/*!
* Copyright 2021-2022 XGBoost contributors
/**
* Copyright 2021-2023 XGBoost contributors
*/
#include "sparse_page_writer.h"
#include "gradient_index.h"
#include "histogram_cut_format.h"
#include <cstddef> // for size_t
#include <cstdint> // for uint8_t
#include <type_traits> // for underlying_type_t
#include <vector> // for vector
namespace xgboost {
namespace data {
#include "../common/io.h" // for AlignedResourceReadStream
#include "../common/ref_resource_view.h" // for ReadVec, WriteVec
#include "gradient_index.h" // for GHistIndexMatrix
#include "histogram_cut_format.h" // for ReadHistogramCuts
#include "sparse_page_writer.h" // for SparsePageFormat
namespace xgboost::data {
class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
public:
bool Read(GHistIndexMatrix* page, dmlc::SeekStream* fi) override {
bool Read(GHistIndexMatrix* page, common::AlignedResourceReadStream* fi) override {
CHECK(fi);
if (!ReadHistogramCuts(&page->cut, fi)) {
return false;
}
// indptr
fi->Read(&page->row_ptr);
// data
std::vector<uint8_t> data;
if (!fi->Read(&data)) {
if (!common::ReadVec(fi, &page->row_ptr)) {
return false;
}
page->index.Resize(data.size());
std::copy(data.cbegin(), data.cend(), page->index.begin());
// bin type
// data
// - bin type
// Old gcc doesn't support reading from enum.
std::underlying_type_t<common::BinTypeSize> uint_bin_type{0};
if (!fi->Read(&uint_bin_type)) {
return false;
}
common::BinTypeSize size_type =
static_cast<common::BinTypeSize>(uint_bin_type);
page->index.SetBinTypeSize(size_type);
common::BinTypeSize size_type = static_cast<common::BinTypeSize>(uint_bin_type);
// - index buffer
if (!common::ReadVec(fi, &page->data)) {
return false;
}
// - index
page->index = common::Index{common::Span{page->data.data(), page->data.size()}, size_type};
// hit count
if (!fi->Read(&page->hit_count)) {
if (!common::ReadVec(fi, &page->hit_count)) {
return false;
}
if (!fi->Read(&page->max_numeric_bins_per_feat)) {
@@ -50,38 +61,35 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
page->index.SetBinOffset(page->cut.Ptrs());
}
page->ReadColumnPage(fi);
if (!page->ReadColumnPage(fi)) {
return false;
}
return true;
}
size_t Write(GHistIndexMatrix const &page, dmlc::Stream *fo) override {
size_t bytes = 0;
std::size_t Write(GHistIndexMatrix const& page, common::AlignedFileWriteStream* fo) override {
std::size_t bytes = 0;
bytes += WriteHistogramCuts(page.cut, fo);
// indptr
fo->Write(page.row_ptr);
bytes += page.row_ptr.size() * sizeof(decltype(page.row_ptr)::value_type) +
sizeof(uint64_t);
bytes += common::WriteVec(fo, page.row_ptr);
// data
std::vector<uint8_t> data(page.index.begin(), page.index.end());
fo->Write(data);
bytes += data.size() * sizeof(decltype(data)::value_type) + sizeof(uint64_t);
// bin type
std::underlying_type_t<common::BinTypeSize> uint_bin_type =
page.index.GetBinTypeSize();
fo->Write(uint_bin_type);
bytes += sizeof(page.index.GetBinTypeSize());
// - bin type
std::underlying_type_t<common::BinTypeSize> uint_bin_type = page.index.GetBinTypeSize();
bytes += fo->Write(uint_bin_type);
// - index buffer
std::vector<std::uint8_t> data(page.index.begin(), page.index.end());
bytes += fo->Write(static_cast<std::uint64_t>(data.size()));
if (!data.empty()) {
bytes += fo->Write(data.data(), data.size());
}
// hit count
fo->Write(page.hit_count);
bytes +=
page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
sizeof(uint64_t);
bytes += common::WriteVec(fo, page.hit_count);
// max_bins, base row, is_dense
fo->Write(page.max_numeric_bins_per_feat);
bytes += sizeof(page.max_numeric_bins_per_feat);
fo->Write(page.base_rowid);
bytes += sizeof(page.base_rowid);
fo->Write(page.IsDense());
bytes += sizeof(page.IsDense());
bytes += fo->Write(page.max_numeric_bins_per_feat);
bytes += fo->Write(page.base_rowid);
bytes += fo->Write(page.IsDense());
bytes += page.WriteColumnPage(fo);
return bytes;
@@ -93,6 +101,4 @@ DMLC_REGISTRY_FILE_TAG(gradient_index_format);
XGBOOST_REGISTER_GHIST_INDEX_PAGE_FORMAT(raw)
.describe("Raw GHistIndex binary data format.")
.set_body([]() { return new GHistIndexRawFormat(); });
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -1,10 +1,9 @@
/*!
* Copyright 2021-2022 by XGBoost Contributors
/**
* Copyright 2021-2023, XGBoost Contributors
*/
#include "gradient_index_page_source.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
void GradientIndexPageSource::Fetch() {
if (!this->ReadCache()) {
if (count_ != 0 && !sync_) {
@@ -21,5 +20,4 @@ void GradientIndexPageSource::Fetch() {
this->WriteCache();
}
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -1,36 +1,38 @@
/*!
* Copyright 2021 XGBoost contributors
/**
* Copyright 2021-2023, XGBoost contributors
*/
#ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
#define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
#include "../common/hist_util.h"
#include <dmlc/io.h> // for Stream
namespace xgboost {
namespace data {
inline bool ReadHistogramCuts(common::HistogramCuts *cuts, dmlc::SeekStream *fi) {
if (!fi->Read(&cuts->cut_values_.HostVector())) {
#include <cstddef> // for size_t
#include "../common/hist_util.h" // for HistogramCuts
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
#include "../common/ref_resource_view.h" // for WriteVec, ReadVec
namespace xgboost::data {
inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResourceReadStream *fi) {
if (!common::ReadVec(fi, &cuts->cut_values_.HostVector())) {
return false;
}
if (!fi->Read(&cuts->cut_ptrs_.HostVector())) {
if (!common::ReadVec(fi, &cuts->cut_ptrs_.HostVector())) {
return false;
}
if (!fi->Read(&cuts->min_vals_.HostVector())) {
if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) {
return false;
}
return true;
}
inline size_t WriteHistogramCuts(common::HistogramCuts const &cuts, dmlc::Stream *fo) {
size_t bytes = 0;
fo->Write(cuts.cut_values_.ConstHostVector());
bytes += cuts.cut_values_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
fo->Write(cuts.cut_ptrs_.ConstHostVector());
bytes += cuts.cut_ptrs_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
fo->Write(cuts.min_vals_.ConstHostVector());
bytes += cuts.min_vals_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts,
common::AlignedFileWriteStream *fo) {
std::size_t bytes = 0;
bytes += common::WriteVec(fo, cuts.Values());
bytes += common::WriteVec(fo, cuts.Ptrs());
bytes += common::WriteVec(fo, cuts.MinValues());
return bytes;
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_

View File

@@ -33,10 +33,11 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
bool valid = iter.Next();
CHECK(valid) << "Iterative DMatrix must have at least 1 batch.";
auto d = MakeProxy(proxy_)->DeviceIdx();
auto pctx = MakeProxy(proxy_)->Ctx();
Context ctx;
ctx.UpdateAllowUnknown(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}});
ctx.UpdateAllowUnknown(
Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
// hardcoded parameter.
BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};
@@ -240,9 +241,9 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
* Generate gradient index.
*/
this->ghist_ = std::make_unique<GHistIndexMatrix>(Info(), std::move(cuts), p.max_bin);
size_t rbegin = 0;
size_t prev_sum = 0;
size_t i = 0;
std::size_t rbegin = 0;
std::size_t prev_sum = 0;
std::size_t i = 0;
while (iter.Next()) {
HostAdapterDispatch(proxy, [&](auto const& batch) {
proxy->Info().num_nonzero_ = batch_nnz[i];

View File

@@ -31,10 +31,10 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
dh::XGBCachingDeviceAllocator<char> alloc;
auto num_rows = [&]() {
return Dispatch(proxy, [](auto const& value) { return value.NumRows(); });
return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumRows(); });
};
auto num_cols = [&]() {
return Dispatch(proxy, [](auto const& value) { return value.NumCols(); });
return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumCols(); });
};
size_t row_stride = 0;
@@ -86,7 +86,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
get_device());
auto* p_sketch = &sketch_containers.back();
proxy->Info().weights_.SetDevice(get_device());
Dispatch(proxy, [&](auto const& value) {
cuda_impl::Dispatch(proxy, [&](auto const& value) {
common::AdapterDeviceSketch(value, p.max_bin, proxy->Info(), missing, p_sketch);
});
}
@@ -94,7 +94,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
accumulated_rows += batch_rows;
dh::device_vector<size_t> row_counts(batch_rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
row_stride = std::max(row_stride, Dispatch(proxy, [=](auto const& value) {
row_stride = std::max(row_stride, cuda_impl::Dispatch(proxy, [=](auto const& value) {
return GetRowCounts(value, row_counts_span, get_device(), missing);
}));
@@ -129,7 +129,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
sketch_containers.clear();
sketch_containers.shrink_to_fit();
final_sketch.MakeCuts(&cuts);
final_sketch.MakeCuts(&cuts, this->info_.IsColumnSplit());
} else {
GetCutsFromRef(ctx, ref, Info().num_col_, p, &cuts);
}
@@ -137,7 +137,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
this->info_.num_row_ = accumulated_rows;
this->info_.num_nonzero_ = nnz;
auto init_page = [this, &proxy, &cuts, row_stride, accumulated_rows, get_device]() {
auto init_page = [this, &cuts, row_stride, accumulated_rows, get_device]() {
if (!ellpack_) {
// Should be put inside the while loop to protect against empty batch. In
// that case device id is invalid.
@@ -165,14 +165,14 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
auto rows = num_rows();
dh::device_vector<size_t> row_counts(rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
Dispatch(proxy, [=](auto const& value) {
cuda_impl::Dispatch(proxy, [=](auto const& value) {
return GetRowCounts(value, row_counts_span, get_device(), missing);
});
auto is_dense = this->IsDense();
proxy->Info().feature_types.SetDevice(get_device());
auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan();
auto new_impl = Dispatch(proxy, [&](auto const& value) {
auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) {
return EllpackPageImpl(value, missing, get_device(), is_dense, row_counts_span,
d_feature_types, row_stride, rows, cuts);
});

View File

@@ -1,14 +1,13 @@
/*!
* Copyright 2021 by Contributors
/**
* Copyright 2021-2023, XGBoost Contributors
* \file proxy_dmatrix.cc
*/
#include "proxy_dmatrix.h"
namespace xgboost {
namespace data {
void DMatrixProxy::SetArrayData(char const *c_interface) {
std::shared_ptr<ArrayAdapter> adapter{new ArrayAdapter(StringView{c_interface})};
namespace xgboost::data {
void DMatrixProxy::SetArrayData(StringView interface_str) {
std::shared_ptr<ArrayAdapter> adapter{new ArrayAdapter{interface_str}};
this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
@@ -25,5 +24,38 @@ void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices,
this->Info().num_row_ = adapter->NumRows();
this->ctx_.gpu_id = Context::kCpuId;
}
} // namespace data
} // namespace xgboost
namespace cuda_impl {
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
std::shared_ptr<DMatrixProxy> proxy, float missing);
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *, std::shared_ptr<DMatrixProxy>,
float) {
return nullptr;
}
#endif // XGBOOST_USE_CUDA
} // namespace cuda_impl
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
std::shared_ptr<DMatrixProxy> proxy,
float missing) {
bool type_error{false};
std::shared_ptr<DMatrix> p_fmat{nullptr};
if (proxy->Ctx()->IsCPU()) {
p_fmat = data::HostAdapterDispatch<false>(
proxy.get(),
[&](auto const &adapter) {
auto p_fmat =
std::shared_ptr<DMatrix>(DMatrix::Create(adapter.get(), missing, ctx->Threads()));
return p_fmat;
},
&type_error);
} else {
p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing);
}
CHECK(p_fmat) << "Failed to fallback.";
p_fmat->Info() = proxy->Info().Copy();
return p_fmat;
}
} // namespace xgboost::data

View File

@@ -1,35 +1,47 @@
/*!
* Copyright 2020-2022, XGBoost contributors
/**
* Copyright 2020-2023, XGBoost contributors
*/
#include "proxy_dmatrix.h"
#include "device_adapter.cuh"
#include "proxy_dmatrix.cuh"
#include "proxy_dmatrix.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
std::shared_ptr<data::CudfAdapter> adapter{new CudfAdapter{interface_str}};
auto const& value = adapter->Value();
auto adapter{std::make_shared<CudfAdapter>(interface_str)};
this->batch_ = adapter;
ctx_.gpu_id = adapter->DeviceIdx();
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
if (ctx_.gpu_id < 0) {
if (adapter->DeviceIdx() < 0) {
// empty data
CHECK_EQ(this->Info().num_row_, 0);
ctx_.gpu_id = dh::CurrentDevice();
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return;
}
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
}
void DMatrixProxy::FromCudaArray(StringView interface_str) {
std::shared_ptr<CupyAdapter> adapter(new CupyAdapter{StringView{interface_str}});
auto adapter(std::make_shared<CupyAdapter>(StringView{interface_str}));
this->batch_ = adapter;
ctx_.gpu_id = adapter->DeviceIdx();
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
if (ctx_.gpu_id < 0) {
if (adapter->DeviceIdx() < 0) {
// empty data
CHECK_EQ(this->Info().num_row_, 0);
ctx_.gpu_id = dh::CurrentDevice();
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return;
}
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
}
} // namespace data
} // namespace xgboost
namespace cuda_impl {
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const* ctx,
std::shared_ptr<DMatrixProxy> proxy,
float missing) {
return Dispatch<false>(proxy.get(), [&](auto const& adapter) {
auto p_fmat = std::shared_ptr<DMatrix>{DMatrix::Create(adapter.get(), missing, ctx->Threads())};
return p_fmat;
});
}
} // namespace cuda_impl
} // namespace xgboost::data

View File

@@ -6,19 +6,34 @@
#include "device_adapter.cuh"
#include "proxy_dmatrix.h"
namespace xgboost::data {
template <typename Fn>
namespace xgboost::data::cuda_impl {
template <bool get_value = true, typename Fn>
decltype(auto) Dispatch(DMatrixProxy const* proxy, Fn fn) {
if (proxy->Adapter().type() == typeid(std::shared_ptr<CupyAdapter>)) {
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
return fn(value);
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter());
return fn(value);
}
} else if (proxy->Adapter().type() == typeid(std::shared_ptr<CudfAdapter>)) {
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
return fn(value);
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter());
return fn(value);
}
} else {
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
return fn(value);
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter());
return fn(value);
}
}
}
} // namespace xgboost::data
} // namespace xgboost::data::cuda_impl

View File

@@ -62,7 +62,7 @@ class DMatrixProxy : public DMatrix {
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
}
void SetArrayData(char const* c_interface);
void SetArrayData(StringView interface_str);
void SetCSRData(char const* c_indptr, char const* c_indices, char const* c_values,
bst_feature_t n_features, bool on_host);
@@ -114,28 +114,62 @@ inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
return typed;
}
template <typename Fn>
/**
* @brief Dispatch function call based on input type.
*
 * @tparam get_value Whether the function Fn accepts an adapter batch or the adapter itself.
* @tparam Fn The type of the function to be dispatched.
*
* @param proxy The proxy object holding the reference to the input.
* @param fn The function to be dispatched.
 * @param[out] type_error Set to true if it's not null and the input data is not recognized by
 *                        the host.
*
* @return The return value of the function being dispatched.
*/
template <bool get_value = true, typename Fn>
decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_error = nullptr) {
if (proxy->Adapter().type() == typeid(std::shared_ptr<CSRArrayAdapter>)) {
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter());
return fn(value);
}
if (type_error) {
*type_error = false;
}
return fn(value);
} else if (proxy->Adapter().type() == typeid(std::shared_ptr<ArrayAdapter>)) {
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter());
return fn(value);
}
if (type_error) {
*type_error = false;
}
return fn(value);
} else {
if (type_error) {
*type_error = true;
} else {
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
}
return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
if constexpr (get_value) {
return std::result_of_t<Fn(
decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
} else {
return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()))>();
}
}
}
/**
* @brief Create a `SimpleDMatrix` instance from a `DMatrixProxy`.
*/
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const* ctx,
std::shared_ptr<DMatrixProxy> proxy, float missing);
} // namespace xgboost::data
#endif // XGBOOST_DATA_PROXY_DMATRIX_H_

View File

@@ -8,21 +8,21 @@
#include <algorithm>
#include <limits>
#include <numeric> // for accumulate
#include <type_traits>
#include <vector>
#include "../common/error_msg.h" // for InconsistentMaxBin
#include "../common/random.h"
#include "../common/threading_utils.h"
#include "../collective/communicator-inl.h" // for GetWorldSize, GetRank, Allgather
#include "../common/error_msg.h" // for InconsistentMaxBin
#include "./simple_batch_iterator.h"
#include "adapter.h"
#include "batch_utils.h" // for CheckEmpty, RegenGHist
#include "batch_utils.h" // for CheckEmpty, RegenGHist
#include "ellpack_page.h" // for EllpackPage
#include "gradient_index.h"
#include "xgboost/c_api.h"
#include "xgboost/data.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
MetaInfo& SimpleDMatrix::Info() { return info_; }
const MetaInfo& SimpleDMatrix::Info() const { return info_; }
@@ -97,6 +97,10 @@ BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches(Context const* ctx) {
// column page doesn't exist, generate it
if (!column_page_) {
auto n = std::numeric_limits<decltype(Entry::index)>::max();
if (this->sparse_page_->Size() > n) {
error::MaxSampleSize(n);
}
column_page_.reset(new CSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
}
auto begin_iter = BatchIterator<CSCPage>(new SimpleBatchIteratorImpl<CSCPage>(column_page_));
@@ -106,6 +110,10 @@ BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches(Context const* ctx) {
BatchSet<SortedCSCPage> SimpleDMatrix::GetSortedColumnBatches(Context const* ctx) {
// Sorted column page doesn't exist, generate it
if (!sorted_column_page_) {
auto n = std::numeric_limits<decltype(Entry::index)>::max();
if (this->sparse_page_->Size() > n) {
error::MaxSampleSize(n);
}
sorted_column_page_.reset(
new SortedCSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
sorted_column_page_->SortRows(ctx->Threads());
@@ -427,5 +435,4 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
fmat_ctx_ = ctx;
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -32,7 +32,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
#endif
Context ctx;
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(device)}});
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});
CHECK(adapter->NumRows() != kAdapterUnknownSize);
CHECK(adapter->NumColumns() != kAdapterUnknownSize);

View File

@@ -8,7 +8,6 @@
#include "./sparse_page_dmatrix.h"
#include "../collective/communicator-inl.h"
#include "./simple_batch_iterator.h"
#include "batch_utils.h" // for RegenGHist
#include "gradient_index.h"
@@ -165,7 +164,10 @@ BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches(Context const
BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(Context const *ctx,
const BatchParam &param) {
CHECK_GE(param.max_bin, 2);
if (param.Initialized()) {
CHECK_GE(param.max_bin, 2);
}
detail::CheckEmpty(batch_param_, param);
auto id = MakeCache(this, ".gradient_index.page", cache_prefix_, &cache_info_);
this->InitializeSparsePage(ctx);
if (!cache_info_.at(id)->written || detail::RegenGHist(batch_param_, param)) {

View File

@@ -1,17 +1,23 @@
/**
* Copyright 2021-2023 by XGBoost contributors
*/
#include <memory> // for unique_ptr
#include "../common/hist_util.cuh"
#include "batch_utils.h" // for CheckEmpty, RegenGHist
#include "../common/hist_util.h" // for HistogramCuts
#include "batch_utils.h" // for CheckEmpty, RegenGHist
#include "ellpack_page.cuh"
#include "sparse_page_dmatrix.h"
#include "sparse_page_source.h"
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for BatchParam
namespace xgboost::data {
BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
const BatchParam& param) {
CHECK(ctx->IsCUDA());
CHECK_GE(param.max_bin, 2);
if (param.Initialized()) {
CHECK_GE(param.max_bin, 2);
}
detail::CheckEmpty(batch_param_, param);
auto id = MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
size_t row_stride = 0;
@@ -21,8 +27,13 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
cache_info_.erase(id);
MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
std::unique_ptr<common::HistogramCuts> cuts;
cuts.reset(
new common::HistogramCuts{common::DeviceSketch(ctx->gpu_id, this, param.max_bin, 0)});
if (!param.hess.empty()) {
cuts = std::make_unique<common::HistogramCuts>(
common::DeviceSketchWithHessian(ctx, this, param.max_bin, param.hess));
} else {
cuts =
std::make_unique<common::HistogramCuts>(common::DeviceSketch(ctx, this, param.max_bin));
}
this->InitializeSparsePage(ctx); // reset after use.
row_stride = GetRowStride(this);
@@ -31,10 +42,10 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
batch_param_ = param;
auto ft = this->info_.feature_types.ConstDeviceSpan();
ellpack_page_source_.reset(); // release resources.
ellpack_page_source_.reset(new EllpackPageSource(
ellpack_page_source_.reset(); // make sure resource is released before making new ones.
ellpack_page_source_ = std::make_shared<EllpackPageSource>(
this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id));
param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id);
} else {
CHECK(sparse_page_source_);
ellpack_page_source_->Reset();

View File

@@ -7,9 +7,6 @@
#ifndef XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
#define XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
#include <xgboost/data.h>
#include <xgboost/logging.h>
#include <algorithm>
#include <map>
#include <memory>
@@ -20,35 +17,33 @@
#include "ellpack_page_source.h"
#include "gradient_index_page_source.h"
#include "sparse_page_source.h"
#include "xgboost/data.h"
#include "xgboost/logging.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
/**
* \brief DMatrix used for external memory.
*
* The external memory is created for controlling memory usage by splitting up data into
* multiple batches. However that doesn't mean we will actually process exact 1 batch at
* a time, which would be terribly slow considering that we have to loop through the
* whole dataset for every tree split. So we use async pre-fetch and let caller to decide
* how many batches it wants to process by returning data as shared pointer. The caller
* can use async function to process the data or just stage those batches, making the
* decision is out of the scope for sparse page dmatrix. These 2 optimizations might
* defeat the purpose of splitting up dataset since if you load all the batches then the
* memory usage is even worse than using a single batch. Essentially we need to control
* how many batches can be in memory at the same time.
* multiple batches. However that doesn't mean we will actually process exactly 1 batch
* at a time, which would be terribly slow considering that we have to loop through the
* whole dataset for every tree split. So we use async to pre-fetch pages and let the
* caller to decide how many batches it wants to process by returning data as a shared
* pointer. The caller can use async function to process the data or just stage those
* batches based on its use cases. These two optimizations might defeat the purpose of
* splitting up dataset since if you stage all the batches then the memory usage might be
* even worse than using a single batch. As a result, we must control how many batches can
* be in memory at any given time.
*
* Right now the write to the cache is sequential operation and is blocking, reading from
* cache is async but with a hard coded limit of 4 pages as an heuristic. So by sparse
* dmatrix itself there can be only 9 pages in main memory (might be of different types)
* at the same time: 1 page pending for write, 4 pre-fetched sparse pages, 4 pre-fetched
* dependent pages. If the caller stops iteration at the middle and start again, then the
* number of pages in memory can hit 16 due to pre-fetching, but this should be a bug in
* caller's code (XGBoost doesn't discard a large portion of data at the end, there's not
* sampling algo that samples only the first portion of data).
* Right now the write to the cache is a sequential operation and is blocking. Reading
 * from cache, on the other hand, is async but with a hard-coded limit of 3 pages as a
 * heuristic. So by sparse dmatrix itself there can be only 7 pages in main memory (might
* be of different types) at the same time: 1 page pending for write, 3 pre-fetched sparse
* pages, 3 pre-fetched dependent pages.
*
* Of course if the caller decides to retain some batches to perform parallel processing,
* then we might load all pages in memory, which is also considered as a bug in caller's
* code. So if the algo supports external memory, it must be careful that queue for async
* code. So if the algo supports external memory, it must be careful that queue for async
* call must have an upper limit.
*
* Another assumption we make is that the data must be immutable so caller should never
@@ -101,7 +96,7 @@ class SparsePageDMatrix : public DMatrix {
MetaInfo &Info() override;
const MetaInfo &Info() const override;
Context const *Ctx() const override { return &fmat_ctx_; }
// The only DMatrix implementation that returns false.
bool SingleColBlock() const override { return false; }
DMatrix *Slice(common::Span<int32_t const>) override {
LOG(FATAL) << "Slicing DMatrix is not supported for external memory.";
@@ -153,6 +148,5 @@ inline std::string MakeCache(SparsePageDMatrix *ptr, std::string format, std::st
}
return id;
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_

View File

@@ -1,59 +1,57 @@
/*!
* Copyright (c) 2015-2021 by Contributors
/**
* Copyright 2015-2023, XGBoost Contributors
* \file sparse_page_raw_format.cc
* Raw binary format of sparse page.
*/
#include <xgboost/data.h>
#include <dmlc/registry.h>
#include "xgboost/logging.h"
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
#include "../common/ref_resource_view.h" // for WriteVec
#include "./sparse_page_writer.h"
#include "xgboost/data.h"
#include "xgboost/logging.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
DMLC_REGISTRY_FILE_TAG(sparse_page_raw_format);
template<typename T>
template <typename T>
class SparsePageRawFormat : public SparsePageFormat<T> {
public:
bool Read(T* page, dmlc::SeekStream* fi) override {
bool Read(T* page, common::AlignedResourceReadStream* fi) override {
auto& offset_vec = page->offset.HostVector();
if (!fi->Read(&offset_vec)) {
if (!common::ReadVec(fi, &offset_vec)) {
return false;
}
auto& data_vec = page->data.HostVector();
CHECK_NE(page->offset.Size(), 0U) << "Invalid SparsePage file";
data_vec.resize(offset_vec.back());
if (page->data.Size() != 0) {
size_t n_bytes = fi->Read(dmlc::BeginPtr(data_vec),
(page->data).Size() * sizeof(Entry));
CHECK_EQ(n_bytes, (page->data).Size() * sizeof(Entry))
<< "Invalid SparsePage file";
if (!common::ReadVec(fi, &data_vec)) {
return false;
}
}
if (!fi->Read(&page->base_rowid, sizeof(page->base_rowid))) {
return false;
}
fi->Read(&page->base_rowid, sizeof(page->base_rowid));
return true;
}
size_t Write(const T& page, dmlc::Stream* fo) override {
std::size_t Write(const T& page, common::AlignedFileWriteStream* fo) override {
const auto& offset_vec = page.offset.HostVector();
const auto& data_vec = page.data.HostVector();
CHECK(page.offset.Size() != 0 && offset_vec[0] == 0);
CHECK_EQ(offset_vec.back(), page.data.Size());
fo->Write(offset_vec);
auto bytes = page.MemCostBytes();
bytes += sizeof(uint64_t);
std::size_t bytes{0};
bytes += common::WriteVec(fo, offset_vec);
if (page.data.Size() != 0) {
fo->Write(dmlc::BeginPtr(data_vec), page.data.Size() * sizeof(Entry));
bytes += common::WriteVec(fo, data_vec);
}
fo->Write(&page.base_rowid, sizeof(page.base_rowid));
bytes += sizeof(page.base_rowid);
bytes += fo->Write(&page.base_rowid, sizeof(page.base_rowid));
return bytes;
}
private:
/*! \brief external memory column offset */
std::vector<size_t> disk_offset_;
};
XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw)
@@ -74,5 +72,4 @@ XGBOOST_REGISTER_SORTED_CSC_PAGE_FORMAT(raw)
return new SparsePageRawFormat<SortedCSCPage>();
});
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -1,33 +1,31 @@
/*!
* Copyright 2021 XGBoost contributors
/**
* Copyright 2021-2023, XGBoost contributors
*/
#include "../common/device_helpers.cuh" // for CurrentDevice
#include "proxy_dmatrix.cuh" // for Dispatch, DMatrixProxy
#include "simple_dmatrix.cuh" // for CopyToSparsePage
#include "sparse_page_source.h"
#include "proxy_dmatrix.cuh"
#include "simple_dmatrix.cuh"
namespace xgboost {
namespace data {
#include "xgboost/data.h" // for SparsePage
namespace xgboost::data {
namespace detail {
std::size_t NSamplesDevice(DMatrixProxy *proxy) {
return Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
}
std::size_t NFeaturesDevice(DMatrixProxy *proxy) {
return Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
}
} // namespace detail
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) {
void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
auto device = proxy->DeviceIdx();
if (device < 0) {
device = dh::CurrentDevice();
}
CHECK_GE(device, 0);
Dispatch(proxy, [&](auto const &value) {
CopyToSparsePage(value, device, missing, page);
});
cuda_impl::Dispatch(proxy,
[&](auto const &value) { CopyToSparsePage(value, device, missing, page); });
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -1,45 +1,49 @@
/*!
* Copyright 2014-2022 by XGBoost Contributors
/**
* Copyright 2014-2023, XGBoost Contributors
* \file sparse_page_source.h
*/
#ifndef XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
#define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
#include <algorithm> // std::min
#include <string>
#include <utility>
#include <vector>
#include <future>
#include <thread>
#include <algorithm> // for min
#include <atomic> // for atomic
#include <future> // for async
#include <map>
#include <memory>
#include <mutex> // for mutex
#include <string>
#include <thread>
#include <utility> // for pair, move
#include <vector>
#include "../common/common.h"
#include "../common/io.h" // for PrivateMmapConstStream
#include "../common/timer.h" // for Monitor, Timer
#include "adapter.h"
#include "proxy_dmatrix.h" // for DMatrixProxy
#include "sparse_page_writer.h" // for SparsePageFormat
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "adapter.h"
#include "sparse_page_writer.h"
#include "proxy_dmatrix.h"
#include "../common/common.h"
#include "../common/timer.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
inline void TryDeleteCacheFile(const std::string& file) {
if (std::remove(file.c_str()) != 0) {
// Don't throw, this is called in a destructor.
LOG(WARNING) << "Couldn't remove external memory cache file " << file
<< "; you may want to remove it manually";
<< "; you may want to remove it manually";
}
}
/**
* @brief Information about the cache including path and page offsets.
*/
struct Cache {
// whether the write to the cache is complete
bool written;
std::string name;
std::string format;
// offset into binary cache file.
std::vector<size_t> offset;
std::vector<std::uint64_t> offset;
Cache(bool w, std::string n, std::string fmt)
: written{w}, name{std::move(n)}, format{std::move(fmt)} {
@@ -51,11 +55,24 @@ struct Cache {
return name + format;
}
std::string ShardName() {
[[nodiscard]] std::string ShardName() const {
return ShardName(this->name, this->format);
}
// The write is completed.
/**
* @brief Record a page with size of n_bytes.
*/
void Push(std::size_t n_bytes) { offset.push_back(n_bytes); }
/**
* @brief Returns the view start and length for the i^th page.
*/
[[nodiscard]] auto View(std::size_t i) const {
std::uint64_t off = offset.at(i);
std::uint64_t len = offset.at(i + 1) - offset[i];
return std::pair{off, len};
}
/**
* @brief Call this once the write for the cache is complete.
*/
void Commit() {
if (!written) {
std::partial_sum(offset.begin(), offset.end(), offset.begin());
@@ -64,7 +81,7 @@ struct Cache {
}
};
// Prevents multi-threaded call.
// Prevents multi-threaded call to `GetBatches`.
class TryLockGuard {
std::mutex& lock_;
@@ -77,74 +94,128 @@ class TryLockGuard {
}
};
// Similar to `dmlc::OMPException`, but doesn't need the threads to be joined before rethrow
class ExceHandler {
  std::mutex mutex_;
  // Set once any exception has been captured; `Rethrow` checks it cheaply without locking.
  std::atomic<bool> flag_{false};
  // The first captured exception; later ones are intentionally dropped.
  std::exception_ptr curr_exce_{nullptr};

  // Record the in-flight exception if none has been stored yet. Called from worker threads.
  void Capture() noexcept {
    std::lock_guard<std::mutex> guard{mutex_};
    if (!curr_exce_) {
      curr_exce_ = std::current_exception();
    }
    flag_ = true;
  }

 public:
  /**
   * @brief Invoke `fn`, capturing any exception instead of letting it propagate.
   *
   * @return The result of `fn()`, or a value-initialized result if `fn` threw.
   */
  template <typename Fn>
  decltype(auto) Run(Fn&& fn) noexcept(true) {
    try {
      return fn();
    } catch (...) {
      // The original handlers for dmlc::Error, std::exception and `...` were identical,
      // so a single catch-all preserves behavior while removing the triplicated bodies.
      this->Capture();
    }
    return std::invoke_result_t<Fn>();
  }

  /**
   * @brief Rethrow the first captured exception (if any) on the calling thread.
   */
  void Rethrow() noexcept(false) {
    if (flag_) {
      CHECK(curr_exce_);
      std::rethrow_exception(curr_exce_);
    }
  }
};
/**
* @brief Base class for all page sources. Handles fetching, writing, and iteration.
*/
template <typename S>
class SparsePageSourceImpl : public BatchIteratorImpl<S> {
protected:
// Prevents calling this iterator from multiple places(or threads).
std::mutex single_threaded_;
// The current page.
std::shared_ptr<S> page_;
bool at_end_ {false};
float missing_;
int nthreads_;
std::int32_t nthreads_;
bst_feature_t n_features_;
uint32_t count_{0};
uint32_t n_batches_ {0};
// Index to the current page.
std::uint32_t count_{0};
// Total number of batches.
std::uint32_t n_batches_{0};
std::shared_ptr<Cache> cache_info_;
std::unique_ptr<dmlc::Stream> fo_;
using Ring = std::vector<std::future<std::shared_ptr<S>>>;
// A ring storing futures to data. Since the DMatrix iterator is forward only, so we
// can pre-fetch data in a ring.
std::unique_ptr<Ring> ring_{new Ring};
// Catching exception in pre-fetch threads to prevent segfault. Not always work though,
// OOM error can be delayed due to lazy commit. On the bright side, if mmap is used then
// OOM error should be rare.
ExceHandler exce_;
common::Monitor monitor_;
bool ReadCache() {
CHECK(!at_end_);
if (!cache_info_->written) {
return false;
}
if (fo_) {
fo_.reset(); // flush the data to disk.
if (ring_->empty()) {
ring_->resize(n_batches_);
}
    // A heuristic for the number of pre-fetched batches. We can make it part of BatchParam
    // to let user adjust number of pre-fetched batches when needed.
uint32_t constexpr kPreFetch = 4;
uint32_t constexpr kPreFetch = 3;
size_t n_prefetch_batches = std::min(kPreFetch, n_batches_);
CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_;
size_t fetch_it = count_;
std::size_t fetch_it = count_;
for (size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
exce_.Rethrow();
for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
fetch_it %= n_batches_; // ring
if (ring_->at(fetch_it).valid()) {
continue;
}
auto const *self = this; // make sure it's const
auto const* self = this; // make sure it's const
CHECK_LT(fetch_it, cache_info_->offset.size());
ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self]() {
common::Timer timer;
timer.Start();
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
auto n = self->cache_info_->ShardName();
size_t offset = self->cache_info_->offset.at(fetch_it);
std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(n.c_str())};
fi->Seek(offset);
CHECK_EQ(fi->Tell(), offset);
ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self, this]() {
auto page = std::make_shared<S>();
CHECK(fmt->Read(page.get(), fi.get()));
LOG(INFO) << "Read a page in " << timer.ElapsedSeconds() << " seconds.";
this->exce_.Run([&] {
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
auto name = self->cache_info_->ShardName();
auto [offset, length] = self->cache_info_->View(fetch_it);
auto fi = std::make_unique<common::PrivateMmapConstStream>(name, offset, length);
CHECK(fmt->Read(page.get(), fi.get()));
});
return page;
});
}
CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(), [](auto const& f) { return f.valid(); }),
n_prefetch_batches)
<< "Sparse DMatrix assumes forward iteration.";
monitor_.Start("Wait");
page_ = (*ring_)[count_].get();
CHECK(!(*ring_)[count_].valid());
monitor_.Stop("Wait");
exce_.Rethrow();
return true;
}
@@ -153,29 +224,41 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
common::Timer timer;
timer.Start();
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
if (!fo_) {
auto n = cache_info_->ShardName();
fo_.reset(dmlc::Stream::Create(n.c_str(), "w"));
}
auto bytes = fmt->Write(*page_, fo_.get());
timer.Stop();
auto name = cache_info_->ShardName();
std::unique_ptr<common::AlignedFileWriteStream> fo;
if (this->Iter() == 0) {
fo = std::make_unique<common::AlignedFileWriteStream>(StringView{name}, "wb");
} else {
fo = std::make_unique<common::AlignedFileWriteStream>(StringView{name}, "ab");
}
auto bytes = fmt->Write(*page_, fo.get());
timer.Stop();
// Not entirely accurate, the kernels doesn't have to flush the data.
LOG(INFO) << static_cast<double>(bytes) / 1024.0 / 1024.0 << " MB written in "
<< timer.ElapsedSeconds() << " seconds.";
cache_info_->offset.push_back(bytes);
cache_info_->Push(bytes);
}
virtual void Fetch() = 0;
public:
SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features,
uint32_t n_batches, std::shared_ptr<Cache> cache)
: missing_{missing}, nthreads_{nthreads}, n_features_{n_features},
n_batches_{n_batches}, cache_info_{std::move(cache)} {}
SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features, uint32_t n_batches,
std::shared_ptr<Cache> cache)
: missing_{missing},
nthreads_{nthreads},
n_features_{n_features},
n_batches_{n_batches},
cache_info_{std::move(cache)} {
monitor_.Init(typeid(S).name()); // not pretty, but works for basic profiling
}
SparsePageSourceImpl(SparsePageSourceImpl const &that) = delete;
~SparsePageSourceImpl() override {
// Don't orphan the threads.
for (auto& fu : *ring_) {
if (fu.valid()) {
fu.get();
@@ -183,18 +266,18 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
}
}
uint32_t Iter() const { return count_; }
[[nodiscard]] uint32_t Iter() const { return count_; }
const S &operator*() const override {
CHECK(page_);
return *page_;
}
std::shared_ptr<S const> Page() const override {
[[nodiscard]] std::shared_ptr<S const> Page() const override {
return page_;
}
bool AtEnd() const override {
[[nodiscard]] bool AtEnd() const override {
return at_end_;
}
@@ -202,20 +285,23 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
TryLockGuard guard{single_threaded_};
at_end_ = false;
count_ = 0;
// Pre-fetch for the next round of iterations.
this->Fetch();
}
};
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
// Push data from CUDA.
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page);
#else
inline void DevicePush(DMatrixProxy*, float, SparsePage*) { common::AssertGPUSupport(); }
#endif
class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
// This is the source from the user.
DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext> iter_;
DMatrixProxy* proxy_;
size_t base_row_id_ {0};
std::size_t base_row_id_{0};
void Fetch() final {
page_ = std::make_shared<SparsePage>();
@@ -244,7 +330,7 @@ class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
iter_{iter}, proxy_{proxy} {
if (!cache_info_->written) {
iter_.Reset();
CHECK_EQ(iter_.Next(), 1) << "Must have at least 1 batch.";
CHECK(iter_.Next()) << "Must have at least 1 batch.";
}
this->Fetch();
}
@@ -259,6 +345,7 @@ class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
}
if (at_end_) {
CHECK_EQ(cache_info_->offset.size(), n_batches_ + 1);
cache_info_->Commit();
if (n_batches_ != 0) {
CHECK_EQ(count_, n_batches_);
@@ -371,6 +458,5 @@ class SortedCSCPageSource : public PageSourceIncMixIn<SortedCSCPage> {
this->Fetch();
}
};
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_

View File

@@ -1,52 +1,44 @@
/*!
* Copyright (c) 2014-2019 by Contributors
/**
* Copyright 2014-2023, XGBoost Contributors
* \file sparse_page_writer.h
* \author Tianqi Chen
*/
#ifndef XGBOOST_DATA_SPARSE_PAGE_WRITER_H_
#define XGBOOST_DATA_SPARSE_PAGE_WRITER_H_
#include <xgboost/data.h>
#include <dmlc/io.h>
#include <vector>
#include <algorithm>
#include <cstring>
#include <string>
#include <utility>
#include <memory>
#include <functional>
#include <functional> // for function
#include <string> // for string
#if DMLC_ENABLE_STD_THREAD
#include <dmlc/concurrency.h>
#include <thread>
#endif // DMLC_ENABLE_STD_THREAD
namespace xgboost {
namespace data {
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
#include "dmlc/io.h" // for Stream
#include "dmlc/registry.h" // for Registry, FunctionRegEntryBase
#include "xgboost/data.h" // for SparsePage,CSCPage,SortedCSCPage,EllpackPage ...
namespace xgboost::data {
template<typename T>
struct SparsePageFormatReg;
/*!
* \brief Format specification of SparsePage.
/**
* @brief Format specification of various data formats like SparsePage.
*/
template<typename T>
template <typename T>
class SparsePageFormat {
public:
/*! \brief virtual destructor */
virtual ~SparsePageFormat() = default;
/*!
* \brief Load all the segments into page, advance fi to end of the block.
* \param page The data to read page into.
* \param fi the input stream of the file
* \return true of the loading as successful, false if end of file was reached
/**
* @brief Load all the segments into page, advance fi to end of the block.
*
* @param page The data to read page into.
* @param fi the input stream of the file
   * @return true if the loading was successful, false if end of file was reached
*/
virtual bool Read(T* page, dmlc::SeekStream* fi) = 0;
/*!
* \brief save the data to fo, when a page was written.
* \param fo output stream
virtual bool Read(T* page, common::AlignedResourceReadStream* fi) = 0;
/**
* @brief save the data to fo, when a page was written.
*
* @param fo output stream
*/
virtual size_t Write(const T& page, dmlc::Stream* fo) = 0;
virtual size_t Write(const T& page, common::AlignedFileWriteStream* fo) = 0;
};
/*!
@@ -105,6 +97,5 @@ struct SparsePageFormatReg
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<GHistIndexMatrix>, \
GHistIndexPageFmt, Name)
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_SPARSE_PAGE_WRITER_H_

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2022 by XGBoost Contributors
/**
* Copyright 2014-2023, XGBoost Contributors
* \file gblinear.cc
* \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net
* the update rule is parallel coordinate descent (shotgun)
@@ -26,10 +26,9 @@
#include "../common/timer.h"
#include "../common/common.h"
#include "../common/threading_utils.h"
#include "../common/error_msg.h"
namespace xgboost {
namespace gbm {
namespace xgboost::gbm {
DMLC_REGISTRY_FILE_TAG(gblinear);
// training parameters
@@ -83,7 +82,16 @@ class GBLinear : public GradientBooster {
}
param_.UpdateAllowUnknown(cfg);
param_.CheckGPUSupport();
updater_.reset(LinearUpdater::Create(param_.updater, ctx_));
if (param_.updater == "gpu_coord_descent") {
LOG(WARNING) << error::DeprecatedFunc("gpu_coord_descent", "2.0.0",
R"(device="cuda", updater="coord_descent")");
}
if (param_.updater == "coord_descent" && ctx_->IsCUDA()) {
updater_.reset(LinearUpdater::Create("gpu_coord_descent", ctx_));
} else {
updater_.reset(LinearUpdater::Create(param_.updater, ctx_));
}
updater_->Configure(cfg);
monitor_.Init("GBLinear");
}
@@ -133,7 +141,7 @@ class GBLinear : public GradientBooster {
this->updater_->SaveConfig(&j_updater);
}
void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair, PredictionCacheEntry*,
void DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair, PredictionCacheEntry*,
ObjFunction const*) override {
monitor_.Start("DoBoost");
@@ -172,11 +180,10 @@ class GBLinear : public GradientBooster {
}
void PredictContribution(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
uint32_t layer_begin, uint32_t /*layer_end*/, bool, int,
unsigned) override {
bst_layer_t layer_begin, bst_layer_t /*layer_end*/, bool) override {
model_.LazyInitModel();
LinearCheckLayer(layer_begin);
auto base_margin = p_fmat->Info().base_margin_.View(Context::kCpuId);
auto base_margin = p_fmat->Info().base_margin_.View(DeviceOrd::CPU());
const int ngroup = model_.learner_model_param->num_output_group;
const size_t ncolumns = model_.learner_model_param->num_feature + 1;
// allocate space for (#features + bias) times #groups times #rows
@@ -210,8 +217,8 @@ class GBLinear : public GradientBooster {
}
}
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
unsigned layer_begin, unsigned /*layer_end*/,
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
bst_layer_t layer_begin, bst_layer_t /*layer_end*/,
bool) override {
LinearCheckLayer(layer_begin);
std::vector<bst_float>& contribs = out_contribs->HostVector();
@@ -224,9 +231,8 @@ class GBLinear : public GradientBooster {
std::fill(contribs.begin(), contribs.end(), 0);
}
std::vector<std::string> DumpModel(const FeatureMap& fmap,
bool with_stats,
std::string format) const override {
[[nodiscard]] std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
std::string format) const override {
return model_.DumpModel(fmap, with_stats, format);
}
@@ -244,10 +250,9 @@ class GBLinear : public GradientBooster {
// The bias is the last weight
out_scores->resize(model_.weight.size() - learner_model_param_->num_output_group, 0);
auto n_groups = learner_model_param_->num_output_group;
linalg::TensorView<float, 2> scores{
*out_scores,
{learner_model_param_->num_feature, n_groups},
Context::kCpuId};
auto scores = linalg::MakeTensorView(DeviceOrd::CPU(),
common::Span{out_scores->data(), out_scores->size()},
learner_model_param_->num_feature, n_groups);
for (size_t i = 0; i < learner_model_param_->num_feature; ++i) {
for (bst_group_t g = 0; g < n_groups; ++g) {
scores(i, g) = model_[i][g];
@@ -255,7 +260,7 @@ class GBLinear : public GradientBooster {
}
}
bool UseGPU() const override {
[[nodiscard]] bool UseGPU() const override {
if (param_.updater == "gpu_coord_descent") {
return true;
} else {
@@ -269,12 +274,12 @@ class GBLinear : public GradientBooster {
monitor_.Start("PredictBatchInternal");
model_.LazyInitModel();
std::vector<bst_float> &preds = *out_preds;
auto base_margin = p_fmat->Info().base_margin_.View(Context::kCpuId);
auto base_margin = p_fmat->Info().base_margin_.View(DeviceOrd::CPU());
// start collecting the prediction
const int ngroup = model_.learner_model_param->num_output_group;
preds.resize(p_fmat->Info().num_row_ * ngroup);
auto base_score = learner_model_param_->BaseScore(Context::kCpuId);
auto base_score = learner_model_param_->BaseScore(DeviceOrd::CPU());
for (const auto &page : p_fmat->GetBatches<SparsePage>()) {
auto const& batch = page.GetView();
// output convention: nrow * k, where nrow is number of rows
@@ -355,5 +360,4 @@ XGBOOST_REGISTER_GBM(GBLinear, "gblinear")
.set_body([](LearnerModelParam const* booster_config, Context const* ctx) {
return new GBLinear(booster_config, ctx);
});
} // namespace gbm
} // namespace xgboost
} // namespace xgboost::gbm

View File

@@ -9,7 +9,7 @@
#include <dmlc/omp.h>
#include <dmlc/parameter.h>
#include <algorithm>
#include <algorithm> // for equal
#include <cinttypes> // for uint32_t
#include <limits>
#include <memory>
@@ -18,9 +18,11 @@
#include <vector>
#include "../common/common.h"
#include "../common/error_msg.h" // for UnknownDevice, WarnOldSerialization, InplacePredictProxy
#include "../common/random.h"
#include "../common/threading_utils.h"
#include "../common/timer.h"
#include "../data/proxy_dmatrix.h" // for DMatrixProxy, HostAdapterDispatch
#include "gbtree_model.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
@@ -38,9 +40,54 @@
namespace xgboost::gbm {
DMLC_REGISTRY_FILE_TAG(gbtree);
namespace {
/**
 * @brief Map the `tree_method` parameter to the `updater` parameter.
 *
 * @param ctx         Runtime context; decides between CPU and CUDA updaters.
 * @param tree_method The user-configured tree construction algorithm.
 * @return Comma-separated updater sequence for the chosen method and device.
 */
std::string MapTreeMethodToUpdaters(Context const* ctx, TreeMethod tree_method) {
  // Fail early when a CUDA device is requested but this is a CPU-only build.
  if (ctx->IsCUDA()) {
    common::AssertGPUSupport();
  }

  // `auto` maps to hist (the default since 2.0).
  if (tree_method == TreeMethod::kAuto || tree_method == TreeMethod::kHist) {
    return ctx->DispatchDevice([] { return "grow_quantile_histmaker"; },
                               [] { return "grow_gpu_hist"; });
  }
  if (tree_method == TreeMethod::kApprox) {
    return ctx->DispatchDevice([] { return "grow_histmaker"; }, [] { return "grow_gpu_approx"; });
  }
  if (tree_method == TreeMethod::kExact) {
    CHECK(ctx->IsCPU()) << "The `exact` tree method is not supported on GPU.";
    return "grow_colmaker,prune";
  }
  if (tree_method == TreeMethod::kGPUHist) {
    // Deprecated alias for hist on a CUDA device.
    common::AssertGPUSupport();
    error::WarnDeprecatedGPUHist();
    return "grow_gpu_hist";
  }

  auto method_id = static_cast<std::underlying_type_t<TreeMethod>>(tree_method);
  LOG(FATAL) << "Unknown tree_method: `" << method_id << "`.";
  LOG(FATAL) << "unreachable";
  return "";
}
/**
 * @brief Check whether the configured updater names match the instantiated updaters, so
 *        re-configuration can skip re-creating them.
 *
 * @param updater_seq Updater names parsed from the `updater` parameter, in order.
 * @param updaters    The updater instances currently held.
 * @return true iff both sequences have the same length and pairwise-equal names.
 */
bool UpdatersMatched(std::vector<std::string> const& updater_seq,
                     std::vector<std::unique_ptr<TreeUpdater>> const& updaters) {
  // Taking the name list by const reference avoids copying every string on each call
  // (the caller's rvalue from common::Split binds to const& just the same).
  if (updater_seq.size() != updaters.size()) {
    return false;
  }
  return std::equal(updater_seq.cbegin(), updater_seq.cend(), updaters.cbegin(),
                    [](std::string const& name, std::unique_ptr<TreeUpdater> const& up) {
                      return name == up->Name();
                    });
}
} // namespace
void GBTree::Configure(Args const& cfg) {
this->cfg_ = cfg;
std::string updater_seq = tparam_.updater_seq;
tparam_.UpdateAllowUnknown(cfg);
tree_param_.UpdateAllowUnknown(cfg);
@@ -53,15 +100,13 @@ void GBTree::Configure(Args const& cfg) {
// configure predictors
if (!cpu_predictor_) {
cpu_predictor_ = std::unique_ptr<Predictor>(
Predictor::Create("cpu_predictor", this->ctx_));
cpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", this->ctx_));
}
cpu_predictor_->Configure(cfg);
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
auto n_gpus = common::AllVisibleGPUs();
if (!gpu_predictor_ && n_gpus != 0) {
gpu_predictor_ = std::unique_ptr<Predictor>(
Predictor::Create("gpu_predictor", this->ctx_));
if (!gpu_predictor_) {
gpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", this->ctx_));
}
if (n_gpus != 0) {
gpu_predictor_->Configure(cfg);
@@ -70,139 +115,41 @@ void GBTree::Configure(Args const& cfg) {
#if defined(XGBOOST_USE_ONEAPI)
if (!oneapi_predictor_) {
oneapi_predictor_ = std::unique_ptr<Predictor>(
Predictor::Create("oneapi_predictor", this->ctx_));
oneapi_predictor_ =
std::unique_ptr<Predictor>(Predictor::Create("oneapi_predictor", this->ctx_));
}
oneapi_predictor_->Configure(cfg);
#endif // defined(XGBOOST_USE_ONEAPI)
monitor_.Init("GBTree");
specified_updater_ = std::any_of(cfg.cbegin(), cfg.cend(),
[](std::pair<std::string, std::string> const& arg) {
return arg.first == "updater";
});
if (specified_updater_ && !showed_updater_warning_) {
LOG(WARNING) << "DANGER AHEAD: You have manually specified `updater` "
"parameter. The `tree_method` parameter will be ignored. "
"Incorrect sequence of updaters will produce undefined "
"behavior. For common uses, we recommend using "
"`tree_method` parameter instead.";
    // Don't drive users to silent XGBoost.
showed_updater_warning_ = true;
}
this->ConfigureUpdaters();
if (updater_seq != tparam_.updater_seq) {
updaters_.clear();
this->InitUpdater(cfg);
} else {
for (auto &up : updaters_) {
up->Configure(cfg);
}
}
configured_ = true;
}
// FIXME(trivialfis): This handles updaters. Because the choice of updaters depends on
// whether external memory is used and how large is dataset. We can remove the dependency
// on DMatrix once `hist` tree method can handle external memory so that we can make it
// default.
void GBTree::ConfigureWithKnownData(Args const& cfg, DMatrix* fmat) {
CHECK(this->configured_);
std::string updater_seq = tparam_.updater_seq;
CHECK(tparam_.GetInitialised());
tparam_.UpdateAllowUnknown(cfg);
this->PerformTreeMethodHeuristic(fmat);
this->ConfigureUpdaters();
// initialize the updaters only when needed.
if (updater_seq != tparam_.updater_seq) {
LOG(DEBUG) << "Using updaters: " << tparam_.updater_seq;
this->updaters_.clear();
this->InitUpdater(cfg);
}
}
void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
// `updater` parameter was manually specified
specified_updater_ =
std::any_of(cfg.cbegin(), cfg.cend(), [](auto const& arg) { return arg.first == "updater"; });
if (specified_updater_) {
// This method is disabled when `updater` parameter is explicitly
// set, since only experts are expected to do so.
return;
}
if (model_.learner_model_param->IsVectorLeaf()) {
CHECK(tparam_.tree_method == TreeMethod::kHist)
<< "Only the hist tree method is supported for building multi-target trees with vector "
"leaf.";
}
// tparam_ is set before calling this function.
if (tparam_.tree_method != TreeMethod::kAuto) {
return;
}
if (collective::IsDistributed()) {
LOG(INFO) << "Tree method is automatically selected to be 'approx' "
"for distributed training.";
tparam_.tree_method = TreeMethod::kApprox;
} else if (!fmat->SingleColBlock()) {
LOG(INFO) << "Tree method is automatically set to 'approx' "
"since external-memory data matrix is used.";
tparam_.tree_method = TreeMethod::kApprox;
} else if (fmat->Info().num_row_ >= (4UL << 20UL)) {
/* Choose tree_method='approx' automatically for large data matrix */
LOG(INFO) << "Tree method is automatically selected to be "
"'approx' for faster speed. To use old behavior "
"(exact greedy algorithm on single machine), "
"set tree_method to 'exact'.";
tparam_.tree_method = TreeMethod::kApprox;
} else {
tparam_.tree_method = TreeMethod::kExact;
error::WarnManualUpdater();
}
LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
}
void GBTree::ConfigureUpdaters() {
if (specified_updater_) {
return;
if (!specified_updater_) {
this->tparam_.updater_seq = MapTreeMethodToUpdaters(ctx_, tparam_.tree_method);
}
// `updater` parameter was manually specified
/* Choose updaters according to tree_method parameters */
switch (tparam_.tree_method) {
case TreeMethod::kAuto:
// Use heuristic to choose between 'exact' and 'approx' This
// choice is carried out in PerformTreeMethodHeuristic() before
// calling this function.
break;
case TreeMethod::kApprox:
tparam_.updater_seq = "grow_histmaker";
break;
case TreeMethod::kExact:
tparam_.updater_seq = "grow_colmaker,prune";
break;
case TreeMethod::kHist: {
LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater "
"grow_quantile_histmaker.";
tparam_.updater_seq = "grow_quantile_histmaker";
break;
auto up_names = common::Split(tparam_.updater_seq, ',');
if (!UpdatersMatched(up_names, updaters_)) {
updaters_.clear();
for (auto const& name : up_names) {
std::unique_ptr<TreeUpdater> up(
TreeUpdater::Create(name.c_str(), ctx_, &model_.learner_model_param->task));
updaters_.push_back(std::move(up));
}
case TreeMethod::kGPUHist: {
common::AssertGPUSupport();
tparam_.updater_seq = "grow_gpu_hist";
break;
}
default:
LOG(FATAL) << "Unknown tree_method ("
<< static_cast<int>(tparam_.tree_method) << ") detected";
}
for (auto& up : updaters_) {
up->Configure(cfg);
}
}
void GPUCopyGradient(HostDeviceVector<GradientPair> const*, bst_group_t, bst_group_t,
HostDeviceVector<GradientPair>*)
void GPUCopyGradient(Context const*, linalg::Matrix<GradientPair> const*, bst_group_t,
linalg::Matrix<GradientPair>*)
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
; // NOLINT
#else
@@ -211,16 +158,19 @@ void GPUCopyGradient(HostDeviceVector<GradientPair> const*, bst_group_t, bst_gro
}
#endif
void CopyGradient(HostDeviceVector<GradientPair> const* in_gpair, int32_t n_threads,
bst_group_t n_groups, bst_group_t group_id,
HostDeviceVector<GradientPair>* out_gpair) {
if (in_gpair->DeviceIdx() != Context::kCpuId) {
GPUCopyGradient(in_gpair, n_groups, group_id, out_gpair);
void CopyGradient(Context const* ctx, linalg::Matrix<GradientPair> const* in_gpair,
bst_group_t group_id, linalg::Matrix<GradientPair>* out_gpair) {
out_gpair->SetDevice(ctx->Device());
out_gpair->Reshape(in_gpair->Shape(0), 1);
if (ctx->IsCUDA()) {
GPUCopyGradient(ctx, in_gpair, group_id, out_gpair);
} else {
std::vector<GradientPair> &tmp_h = out_gpair->HostVector();
const auto& gpair_h = in_gpair->ConstHostVector();
common::ParallelFor(out_gpair->Size(), n_threads,
[&](auto i) { tmp_h[i] = gpair_h[i * n_groups + group_id]; });
auto const& in = *in_gpair;
auto target_gpair = in.Slice(linalg::All(), group_id);
auto h_tmp = out_gpair->HostView();
auto h_in = in.HostView().Slice(linalg::All(), group_id);
CHECK_EQ(h_tmp.Size(), h_in.Size());
common::ParallelFor(h_in.Size(), ctx->Threads(), [&](auto i) { h_tmp(i) = h_in(i); });
}
}
@@ -249,21 +199,22 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
}
}
void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
PredictionCacheEntry* predt, ObjFunction const* obj) {
if (model_.learner_model_param->IsVectorLeaf()) {
CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
<< "Only the hist tree method is supported for building multi-target trees with vector "
"leaf.";
CHECK(ctx_->IsCPU()) << "GPU is not yet supported for vector leaf.";
}
TreesOneIter new_trees;
bst_target_t const n_groups = model_.learner_model_param->OutputLength();
ConfigureWithKnownData(this->cfg_, p_fmat);
monitor_.Start("BoostNewTrees");
// Weird case that tree method is cpu-based but gpu_id is set. Ideally we should let
// `gpu_id` be the single source of determining what algorithms to run, but that will
// break a lots of existing code.
auto device = tparam_.tree_method != TreeMethod::kGPUHist ? Context::kCpuId : ctx_->gpu_id;
auto out = linalg::MakeTensorView(
device,
device == Context::kCpuId ? predt->predictions.HostSpan() : predt->predictions.DeviceSpan(),
p_fmat->Info().num_row_, model_.learner_model_param->OutputLength());
predt->predictions.SetDevice(ctx_->Ordinal());
auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_,
model_.learner_model_param->OutputLength());
CHECK_NE(n_groups, 0);
if (!p_fmat->SingleColBlock() && obj->Task().UpdateTreeLeaf()) {
@@ -296,12 +247,12 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
}
} else {
CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs";
HostDeviceVector<GradientPair> tmp(in_gpair->Size() / n_groups, GradientPair(),
in_gpair->DeviceIdx());
linalg::Matrix<GradientPair> tmp{{in_gpair->Shape(0), static_cast<std::size_t>(1ul)},
ctx_->Ordinal()};
bool update_predict = true;
for (bst_target_t gid = 0; gid < n_groups; ++gid) {
node_position.clear();
CopyGradient(in_gpair, ctx_->Threads(), n_groups, gid, &tmp);
CopyGradient(ctx_, in_gpair, gid, &tmp);
TreesOneGroup ret;
BoostNewTrees(&tmp, p_fmat, gid, &node_position, &ret);
UpdateTreeLeaf(p_fmat, predt->predictions, obj, gid, node_position, &ret);
@@ -322,48 +273,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
this->CommitModel(std::move(new_trees));
}
void GBTree::InitUpdater(Args const& cfg) {
std::string tval = tparam_.updater_seq;
std::vector<std::string> ups = common::Split(tval, ',');
if (updaters_.size() != 0) {
// Assert we have a valid set of updaters.
CHECK_EQ(ups.size(), updaters_.size());
for (auto const& up : updaters_) {
bool contains = std::any_of(ups.cbegin(), ups.cend(),
[&up](std::string const& name) {
return name == up->Name();
});
if (!contains) {
std::stringstream ss;
ss << "Internal Error: " << " mismatched updater sequence.\n";
ss << "Specified updaters: ";
std::for_each(ups.cbegin(), ups.cend(),
[&ss](std::string const& name){
ss << name << " ";
});
ss << "\n" << "Actual updaters: ";
std::for_each(updaters_.cbegin(), updaters_.cend(),
[&ss](std::unique_ptr<TreeUpdater> const& updater){
ss << updater->Name() << " ";
});
LOG(FATAL) << ss.str();
}
}
// Do not push new updater in.
return;
}
// create new updaters
for (const std::string& pstr : ups) {
std::unique_ptr<TreeUpdater> up(
TreeUpdater::Create(pstr.c_str(), ctx_, &model_.learner_model_param->task));
up->Configure(cfg);
updaters_.push_back(std::move(up));
}
}
void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
void GBTree::BoostNewTrees(linalg::Matrix<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
std::vector<HostDeviceVector<bst_node_t>>* out_position,
TreesOneGroup* ret) {
std::vector<RegTree*> new_trees;
@@ -371,6 +281,7 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
// create the trees
for (int i = 0; i < model_.param.num_parallel_tree; ++i) {
if (tparam_.process_type == TreeProcessType::kDefault) {
CHECK(!updaters_.empty());
CHECK(!updaters_.front()->CanModifyTree())
<< "Updater: `" << updaters_.front()->Name() << "` "
<< "can not be used to create new trees. "
@@ -436,12 +347,7 @@ void GBTree::LoadConfig(Json const& in) {
// This would cause all trees to be pushed to trees_to_update
// e.g. updating a model, then saving and loading it would result in an empty model
tparam_.process_type = TreeProcessType::kDefault;
int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) {
LOG(WARNING) << "Loading from a raw memory buffer on CPU only machine. "
"Changing predictor to auto.";
tparam_.UpdateAllowUnknown(Args{{"predictor", "auto"}});
}
std::int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
auto msg = StringView{
R"(
@@ -457,19 +363,32 @@ void GBTree::LoadConfig(Json const& in) {
LOG(WARNING) << msg << " Changing `tree_method` to `hist`.";
}
auto const& j_updaters = get<Object const>(in["updater"]);
std::vector<Json> updater_seq;
if (IsA<Object>(in["updater"])) {
// before 2.0
error::WarnOldSerialization();
for (auto const& kv : get<Object const>(in["updater"])) {
auto name = kv.first;
auto config = kv.second;
config["name"] = name;
updater_seq.push_back(config);
}
} else {
// after 2.0
auto const& j_updaters = get<Array const>(in["updater"]);
updater_seq = j_updaters;
}
updaters_.clear();
for (auto const& kv : j_updaters) {
auto name = kv.first;
for (auto const& config : updater_seq) {
auto name = get<String>(config["name"]);
if (n_gpus == 0 && name == "grow_gpu_hist") {
name = "grow_quantile_histmaker";
LOG(WARNING) << "Changing updater from `grow_gpu_hist` to `grow_quantile_histmaker`.";
}
std::unique_ptr<TreeUpdater> up{
TreeUpdater::Create(name, ctx_, &model_.learner_model_param->task)};
up->LoadConfig(kv.second);
updaters_.push_back(std::move(up));
updaters_.emplace_back(TreeUpdater::Create(name, ctx_, &model_.learner_model_param->task));
updaters_.back()->LoadConfig(config);
}
specified_updater_ = get<Boolean>(in["specified_updater"]);
@@ -491,13 +410,14 @@ void GBTree::SaveConfig(Json* p_out) const {
// language binding doesn't need to know about the forest size.
out["gbtree_model_param"] = ToJson(model_.param);
out["updater"] = Object();
out["updater"] = Array{};
auto& j_updaters = get<Array>(out["updater"]);
auto& j_updaters = out["updater"];
for (auto const& up : updaters_) {
j_updaters[up->Name()] = Object();
auto& j_up = j_updaters[up->Name()];
up->SaveConfig(&j_up);
for (auto const& up : this->updaters_) {
Json up_config{Object{}};
up_config["name"] = String{up->Name()};
up->SaveConfig(&up_config);
j_updaters.emplace_back(up_config);
}
out["specified_updater"] = Boolean{specified_updater_};
}
@@ -517,7 +437,6 @@ void GBTree::SaveModel(Json* p_out) const {
void GBTree::Slice(bst_layer_t begin, bst_layer_t end, bst_layer_t step, GradientBooster* out,
bool* out_of_bound) const {
CHECK(configured_);
CHECK(out);
auto p_gbtree = dynamic_cast<GBTree*>(out);
@@ -567,9 +486,8 @@ void GBTree::Slice(bst_layer_t begin, bst_layer_t end, bst_layer_t step, Gradien
out_model.param.num_parallel_tree = model_.param.num_parallel_tree;
}
void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool,
bst_layer_t layer_begin, bst_layer_t layer_end) {
CHECK(configured_);
void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
bst_layer_t layer_begin, bst_layer_t layer_end) const {
if (layer_end == 0) {
layer_end = this->BoostedRounds();
}
@@ -588,7 +506,7 @@ void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool
CHECK_EQ(out_preds->version, 0);
}
auto const& predictor = GetPredictor(&out_preds->predictions, p_fmat);
auto const& predictor = GetPredictor(is_training, &out_preds->predictions, p_fmat);
if (out_preds->version == 0) {
// out_preds->Size() can be non-zero as it's initialized here before any
// tree is built at the 0^th iterator.
@@ -608,52 +526,68 @@ void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool
}
}
std::unique_ptr<Predictor> const &
GBTree::GetPredictor(HostDeviceVector<float> const *out_pred,
DMatrix *f_dmat) const {
CHECK(configured_);
if (tparam_.predictor != PredictorType::kAuto) {
if (tparam_.predictor == PredictorType::kGPUPredictor) {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
bst_layer_t layer_begin, bst_layer_t layer_end) {
// dispatch to const function.
this->PredictBatchImpl(p_fmat, out_preds, is_training, layer_begin, layer_end);
}
void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
PredictionCacheEntry* out_preds, bst_layer_t layer_begin,
bst_layer_t layer_end) const {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
if (p_m->Ctx()->Device() != this->ctx_->Device()) {
error::MismatchedDevices(this->ctx_, p_m->Ctx());
CHECK_EQ(out_preds->version, 0);
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
CHECK(proxy) << error::InplacePredictProxy();
auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
this->PredictBatchImpl(p_fmat.get(), out_preds, false, layer_begin, layer_end);
return;
}
bool known_type = this->ctx_->DispatchDevice(
[&, begin = tree_begin, end = tree_end] {
return this->cpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
},
[&, begin = tree_begin, end = tree_end] {
return this->gpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
});
if (!known_type) {
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
CHECK(proxy) << error::InplacePredictProxy();
LOG(FATAL) << "Unknown data type for inplace prediction:" << proxy->Adapter().type().name();
}
}
[[nodiscard]] std::unique_ptr<Predictor> const& GBTree::GetPredictor(
bool is_training, HostDeviceVector<float> const* out_pred, DMatrix* f_dmat) const {
// Data comes from SparsePageDMatrix. Since we are loading data in pages, no need to
// prevent data copy.
if (f_dmat && !f_dmat->SingleColBlock()) {
if (ctx_->IsCPU()) {
return cpu_predictor_;
} else {
common::AssertGPUSupport();
CHECK(gpu_predictor_);
return gpu_predictor_;
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
}
if (tparam_.predictor == PredictorType::kOneAPIPredictor) {
#if defined(XGBOOST_USE_ONEAPI)
CHECK(oneapi_predictor_);
return oneapi_predictor_;
#else
common::AssertOneAPISupport();
#endif // defined(XGBOOST_USE_ONEAPI)
}
CHECK(cpu_predictor_);
return cpu_predictor_;
}
// Data comes from Device DMatrix.
auto is_ellpack = f_dmat && f_dmat->PageExists<EllpackPage>() &&
!f_dmat->PageExists<SparsePage>();
auto is_ellpack =
f_dmat && f_dmat->PageExists<EllpackPage>() && !f_dmat->PageExists<SparsePage>();
// Data comes from device memory, like CuDF or CuPy.
auto is_from_device =
f_dmat && f_dmat->PageExists<SparsePage>() &&
(*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
auto is_from_device = f_dmat && f_dmat->PageExists<SparsePage>() &&
(*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
auto on_device = is_ellpack || is_from_device;
// Use GPU Predictor if data is already on device and gpu_id is set.
if (on_device && ctx_->gpu_id >= 0) {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
if (on_device && ctx_->IsCUDA()) {
common::AssertGPUSupport();
CHECK(gpu_predictor_);
return gpu_predictor_;
#else
LOG(FATAL) << "Data is on CUDA device, but XGBoost is not compiled with "
"CUDA/HIP support.";
return cpu_predictor_;
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
}
// GPU_Hist by default has prediction cache calculated from quantile values,
@@ -665,23 +599,19 @@ GBTree::GetPredictor(HostDeviceVector<float> const *out_pred,
if ((out_pred && out_pred->Size() == 0) && (model_.param.num_trees != 0) &&
// FIXME(trivialfis): Implement a better method for testing whether data
// is on device after DMatrix refactoring is done.
!on_device) {
!on_device && is_training) {
CHECK(cpu_predictor_);
return cpu_predictor_;
}
if (tparam_.tree_method == TreeMethod::kGPUHist) {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
if (ctx_->IsCPU()) {
return cpu_predictor_;
} else {
common::AssertGPUSupport();
CHECK(gpu_predictor_);
return gpu_predictor_;
#else
common::AssertGPUSupport();
return cpu_predictor_;
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
}
CHECK(cpu_predictor_);
return cpu_predictor_;
}
@@ -796,7 +726,7 @@ class Dart : public GBTree {
bool training, unsigned layer_begin,
unsigned layer_end) const {
CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented();
auto &predictor = this->GetPredictor(&p_out_preds->predictions, p_fmat);
auto& predictor = this->GetPredictor(training, &p_out_preds->predictions, p_fmat);
CHECK(predictor);
predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
model_);
@@ -805,7 +735,7 @@ class Dart : public GBTree {
auto n_groups = model_.learner_model_param->num_output_group;
PredictionCacheEntry predts; // temporary storage for prediction
if (ctx_->gpu_id != Context::kCpuId) {
if (ctx_->IsCUDA()) {
predts.predictions.SetDevice(ctx_->gpu_id);
}
predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
@@ -860,15 +790,16 @@ class Dart : public GBTree {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
auto n_groups = model_.learner_model_param->num_output_group;
std::vector<Predictor const*> predictors {
cpu_predictor_.get(),
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
gpu_predictor_.get()
#endif // defined(XGBOOST_USE_CUDA)
};
Predictor const* predictor{nullptr};
StringView msg{"Unsupported data type for inplace predict."};
if (ctx_->Device() != p_fmat->Ctx()->Device()) {
error::MismatchedDevices(ctx_, p_fmat->Ctx());
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_fmat);
CHECK(proxy) << error::InplacePredictProxy();
auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
this->PredictBatchImpl(p_fmat.get(), p_out_preds, false, layer_begin, layer_end);
return;
}
StringView msg{"Unsupported data type for inplace predict."};
PredictionCacheEntry predts;
if (ctx_->gpu_id != Context::kCpuId) {
predts.predictions.SetDevice(ctx_->gpu_id);
@@ -877,32 +808,29 @@ class Dart : public GBTree {
auto predict_impl = [&](size_t i) {
predts.predictions.Fill(0);
if (tparam_.predictor == PredictorType::kAuto) {
// Try both predictor implementations
bool success = false;
for (auto const& p : predictors) {
if (p && p->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1)) {
success = true;
predictor = p;
break;
}
}
CHECK(success) << msg;
} else {
predictor = this->GetPredictor().get();
bool success = predictor->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
CHECK(success) << msg << std::endl
<< "Current Predictor: "
<< (tparam_.predictor == PredictorType::kCPUPredictor ? "cpu_predictor"
: "gpu_predictor");
}
bool success = this->ctx_->DispatchDevice(
[&] {
return cpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
},
[&] {
return gpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
});
CHECK(success) << msg;
};
// Inplace predict is not used for training, so no need to drop tree.
for (bst_tree_t i = tree_begin; i < tree_end; ++i) {
predict_impl(i);
if (i == tree_begin) {
predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, model_);
this->ctx_->DispatchDevice(
[&] {
this->cpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
model_);
},
[&] {
this->gpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
model_);
});
}
// Multiple the tree weight
auto w = this->weight_drop_.at(i);
@@ -912,12 +840,12 @@ class Dart : public GBTree {
size_t n_rows = p_fmat->Info().num_row_;
if (predts.predictions.DeviceIdx() != Context::kCpuId) {
p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx());
auto base_score = model_.learner_model_param->BaseScore(predts.predictions.DeviceIdx());
auto base_score = model_.learner_model_param->BaseScore(predts.predictions.Device());
GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(),
predts.predictions.DeviceSpan(), w, n_rows, base_score, n_groups,
group);
} else {
auto base_score = model_.learner_model_param->BaseScore(Context::kCpuId);
auto base_score = model_.learner_model_param->BaseScore(DeviceOrd::CPU());
auto& h_predts = predts.predictions.HostVector();
auto& h_out_predts = p_out_preds->predictions.HostVector();
common::ParallelFor(n_rows, ctx_->Threads(), [&](auto ridx) {
@@ -932,26 +860,23 @@ class Dart : public GBTree {
std::vector<bst_float> *out_preds,
unsigned layer_begin, unsigned layer_end) override {
DropTrees(false);
auto &predictor = this->GetPredictor();
auto &predictor = this->GetPredictor(false);
uint32_t _, tree_end;
std::tie(_, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
predictor->PredictInstance(inst, out_preds, model_, tree_end);
}
void PredictContribution(DMatrix* p_fmat,
HostDeviceVector<bst_float>* out_contribs,
unsigned layer_begin, unsigned layer_end, bool approximate, int,
unsigned) override {
CHECK(configured_);
void PredictContribution(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
bst_layer_t layer_begin, bst_layer_t layer_end,
bool approximate) override {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
cpu_predictor_->PredictContribution(p_fmat, out_contribs, model_, tree_end, &weight_drop_,
approximate);
}
void PredictInteractionContributions(
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
unsigned layer_begin, unsigned layer_end, bool approximate) override {
CHECK(configured_);
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
bst_layer_t layer_begin, bst_layer_t layer_end,
bool approximate) override {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
cpu_predictor_->PredictInteractionContributions(p_fmat, out_contribs, model_, tree_end,
&weight_drop_, approximate);

View File

@@ -1,26 +1,24 @@
/*!
* Copyright 2021 by Contributors
/**
* Copyright 2021-2023, XGBoost Contributors
*/
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include "../common/device_helpers.cuh"
#include "xgboost/context.h"
#include "xgboost/linalg.h"
#include "xgboost/span.h"
#include "../common/cuda_context.cuh"
#include "../common/device_helpers.cuh" // for MakeTransformIterator
#include "xgboost/base.h" // for GradientPair
#include "xgboost/linalg.h" // for Matrix
namespace xgboost {
namespace gbm {
void GPUCopyGradient(HostDeviceVector<GradientPair> const *in_gpair,
bst_group_t n_groups, bst_group_t group_id,
HostDeviceVector<GradientPair> *out_gpair) {
auto mat = linalg::TensorView<GradientPair const, 2>(
in_gpair->ConstDeviceSpan(),
{in_gpair->Size() / n_groups, static_cast<size_t>(n_groups)},
in_gpair->DeviceIdx());
auto v_in = mat.Slice(linalg::All(), group_id);
out_gpair->Resize(v_in.Size());
auto d_out = out_gpair->DeviceSpan();
dh::LaunchN(v_in.Size(), [=] __device__(size_t i) { d_out[i] = v_in(i); });
namespace xgboost::gbm {
void GPUCopyGradient(Context const *ctx, linalg::Matrix<GradientPair> const *in_gpair,
bst_group_t group_id, linalg::Matrix<GradientPair> *out_gpair) {
auto v_in = in_gpair->View(ctx->Device()).Slice(linalg::All(), group_id);
out_gpair->SetDevice(ctx->Device());
out_gpair->Reshape(v_in.Size(), 1);
auto d_out = out_gpair->View(ctx->Device());
auto cuctx = ctx->CUDACtx();
auto it = dh::MakeTransformIterator<GradientPair>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { return v_in(i); });
thrust::copy(cuctx->CTP(), it, it + v_in.Size(), d_out.Values().data());
}
void GPUDartPredictInc(common::Span<float> out_predts,
@@ -42,5 +40,4 @@ void GPUDartInplacePredictInc(common::Span<float> out_predts, common::Span<float
out_predts[offset] += (predts[offset] - base_score(0)) * tree_w;
});
}
} // namespace gbm
} // namespace xgboost
} // namespace xgboost::gbm

View File

@@ -43,37 +43,23 @@ enum class TreeProcessType : int {
kDefault = 0,
kUpdate = 1
};
enum class PredictorType : int {
kAuto = 0,
kCPUPredictor,
kGPUPredictor,
kOneAPIPredictor
};
} // namespace xgboost
DECLARE_FIELD_ENUM_CLASS(xgboost::TreeMethod);
DECLARE_FIELD_ENUM_CLASS(xgboost::TreeProcessType);
DECLARE_FIELD_ENUM_CLASS(xgboost::PredictorType);
namespace xgboost {
namespace gbm {
namespace xgboost::gbm {
/*! \brief training parameters */
struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
/*! \brief tree updater sequence */
std::string updater_seq;
/*! \brief type of boosting process to run */
TreeProcessType process_type;
// predictor type
PredictorType predictor;
// tree construction method
TreeMethod tree_method;
// declare parameters
DMLC_DECLARE_PARAMETER(GBTreeTrainParam) {
DMLC_DECLARE_FIELD(updater_seq)
.set_default("grow_colmaker,prune")
.describe("Tree updater sequence.");
DMLC_DECLARE_FIELD(updater_seq).describe("Tree updater sequence.").set_default("");
DMLC_DECLARE_FIELD(process_type)
.set_default(TreeProcessType::kDefault)
.add_enum("default", TreeProcessType::kDefault)
@@ -81,13 +67,6 @@ struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
.describe("Whether to run the normal boosting process that creates new trees,"\
" or to update the trees in an existing model.");
DMLC_DECLARE_ALIAS(updater_seq, updater);
DMLC_DECLARE_FIELD(predictor)
.set_default(PredictorType::kAuto)
.add_enum("auto", PredictorType::kAuto)
.add_enum("cpu_predictor", PredictorType::kCPUPredictor)
.add_enum("gpu_predictor", PredictorType::kGPUPredictor)
.add_enum("oneapi_predictor", PredictorType::kOneAPIPredictor)
.describe("Predictor algorithm type");
DMLC_DECLARE_FIELD(tree_method)
.set_default(TreeMethod::kAuto)
.add_enum("auto", TreeMethod::kAuto)
@@ -189,44 +168,29 @@ bool SliceTrees(bst_layer_t begin, bst_layer_t end, bst_layer_t step, GBTreeMode
class GBTree : public GradientBooster {
public:
explicit GBTree(LearnerModelParam const* booster_config, Context const* ctx)
: GradientBooster{ctx}, model_(booster_config, ctx_) {}
void Configure(const Args& cfg) override;
// Revise `tree_method` and `updater` parameters after seeing the training
// data matrix, only useful when tree_method is auto.
void PerformTreeMethodHeuristic(DMatrix* fmat);
/*! \brief Map `tree_method` parameter to `updater` parameter */
void ConfigureUpdaters();
void ConfigureWithKnownData(Args const& cfg, DMatrix* fmat);
: GradientBooster{ctx}, model_(booster_config, ctx_) {
monitor_.Init(__func__);
}
void Configure(Args const& cfg) override;
/**
* \brief Optionally update the leaf value.
* @brief Optionally update the leaf value.
*/
void UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const& predictions,
ObjFunction const* obj,
std::int32_t group_idx,
ObjFunction const* obj, std::int32_t group_idx,
std::vector<HostDeviceVector<bst_node_t>> const& node_position,
std::vector<std::unique_ptr<RegTree>>* p_trees);
/**
* @brief Carry out one iteration of boosting.
*/
void DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair, PredictionCacheEntry* predt,
ObjFunction const* obj) override;
/*! \brief Carry out one iteration of boosting */
void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
PredictionCacheEntry* predt, ObjFunction const* obj) override;
[[nodiscard]] bool UseGPU() const override { return tparam_.tree_method == TreeMethod::kGPUHist; }
bool UseGPU() const override {
return
tparam_.predictor == PredictorType::kGPUPredictor ||
tparam_.tree_method == TreeMethod::kGPUHist;
}
GBTreeTrainParam const& GetTrainParam() const {
return tparam_;
}
void Load(dmlc::Stream* fi) override {
model_.Load(fi);
this->cfg_.clear();
}
[[nodiscard]] GBTreeTrainParam const& GetTrainParam() const { return tparam_; }
void Load(dmlc::Stream* fi) override { model_.Load(fi); }
void Save(dmlc::Stream* fo) const override {
model_.Save(fo);
}
@@ -246,39 +210,14 @@ class GBTree : public GradientBooster {
return !model_.trees.empty() || !model_.trees_to_update.empty();
}
void PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
bst_layer_t layer_begin, bst_layer_t layer_end) const;
void PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool training,
bst_layer_t layer_begin, bst_layer_t layer_end) override;
void InplacePredict(std::shared_ptr<DMatrix> p_m, float missing, PredictionCacheEntry* out_preds,
bst_layer_t layer_begin, bst_layer_t layer_end) const override {
CHECK(configured_);
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
std::vector<Predictor const *> predictors{
cpu_predictor_.get(),
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
gpu_predictor_.get()
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
};
StringView msg{"Unsupported data type for inplace predict."};
if (tparam_.predictor == PredictorType::kAuto) {
// Try both predictor implementations
for (auto const &p : predictors) {
if (p && p->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end)) {
return;
}
}
LOG(FATAL) << msg;
} else {
bool success = this->GetPredictor()->InplacePredict(p_m, model_, missing, out_preds,
tree_begin, tree_end);
CHECK(success) << msg << std::endl
<< "Current Predictor: "
<< (tparam_.predictor == PredictorType::kCPUPredictor
? "cpu_predictor"
: "gpu_predictor");
}
}
bst_layer_t layer_begin, bst_layer_t layer_end) const override;
void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
std::vector<bst_feature_t>* features,
@@ -347,7 +286,6 @@ class GBTree : public GradientBooster {
void PredictInstance(const SparsePage::Inst& inst, std::vector<bst_float>* out_preds,
uint32_t layer_begin, uint32_t layer_end) override {
CHECK(configured_);
std::uint32_t _, tree_end;
std::tie(_, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
cpu_predictor_->PredictInstance(inst, out_preds, model_, tree_end);
@@ -359,32 +297,27 @@ class GBTree : public GradientBooster {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_EQ(tree_begin, 0) << "Predict leaf supports only iteration end: (0, "
"n_iteration), use model slicing instead.";
this->GetPredictor()->PredictLeaf(p_fmat, out_preds, model_, tree_end);
this->GetPredictor(false)->PredictLeaf(p_fmat, out_preds, model_, tree_end);
}
void PredictContribution(DMatrix* p_fmat,
HostDeviceVector<bst_float>* out_contribs,
uint32_t layer_begin, uint32_t layer_end, bool approximate,
int, unsigned) override {
CHECK(configured_);
void PredictContribution(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
bst_layer_t layer_begin, bst_layer_t layer_end,
bool approximate) override {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_EQ(tree_begin, 0)
<< "Predict contribution supports only iteration end: (0, "
"n_iteration), using model slicing instead.";
this->GetPredictor()->PredictContribution(
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
CHECK_EQ(tree_begin, 0) << "Predict contribution supports only iteration end: (0, "
"n_iteration), using model slicing instead.";
this->GetPredictor(false)->PredictContribution(p_fmat, out_contribs, model_, tree_end, nullptr,
approximate);
}
void PredictInteractionContributions(
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
uint32_t layer_begin, uint32_t layer_end, bool approximate) override {
CHECK(configured_);
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
bst_layer_t layer_begin, bst_layer_t layer_end,
bool approximate) override {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_EQ(tree_begin, 0)
<< "Predict interaction contribution supports only iteration end: (0, "
"n_iteration), using model slicing instead.";
this->GetPredictor()->PredictInteractionContributions(
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
CHECK_EQ(tree_begin, 0) << "Predict interaction contribution supports only iteration end: (0, "
"n_iteration), using model slicing instead.";
this->GetPredictor(false)->PredictInteractionContributions(p_fmat, out_contribs, model_,
tree_end, nullptr, approximate);
}
[[nodiscard]] std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
@@ -393,15 +326,13 @@ class GBTree : public GradientBooster {
}
protected:
// initialize updater before using them
void InitUpdater(Args const& cfg);
void BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
void BoostNewTrees(linalg::Matrix<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
std::vector<HostDeviceVector<bst_node_t>>* out_position,
std::vector<std::unique_ptr<RegTree>>* ret);
std::unique_ptr<Predictor> const& GetPredictor(HostDeviceVector<float> const* out_pred = nullptr,
DMatrix* f_dmat = nullptr) const;
[[nodiscard]] std::unique_ptr<Predictor> const& GetPredictor(
bool is_training, HostDeviceVector<float> const* out_pred = nullptr,
DMatrix* f_dmat = nullptr) const;
// commit new trees all at once
virtual void CommitModel(TreesOneIter&& new_trees);
@@ -412,26 +343,18 @@ class GBTree : public GradientBooster {
GBTreeTrainParam tparam_;
// Tree training parameter
tree::TrainParam tree_param_;
// ----training fields----
bool showed_updater_warning_ {false};
bool specified_updater_ {false};
bool configured_ {false};
// configurations for tree
Args cfg_;
// the updaters that can be applied to each of tree
std::vector<std::unique_ptr<TreeUpdater>> updaters_;
// Predictors
std::unique_ptr<Predictor> cpu_predictor_;
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
std::unique_ptr<Predictor> gpu_predictor_;
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
std::unique_ptr<Predictor> gpu_predictor_{nullptr};
#if defined(XGBOOST_USE_ONEAPI)
std::unique_ptr<Predictor> oneapi_predictor_;
#endif // defined(XGBOOST_USE_ONEAPI)
common::Monitor monitor_;
};
} // namespace gbm
} // namespace xgboost
} // namespace xgboost::gbm
#endif // XGBOOST_GBM_GBTREE_H_

View File

@@ -40,6 +40,7 @@
#include "common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "common/charconv.h" // for to_chars, to_chars_result, NumericLimits, from_...
#include "common/common.h" // for ToString, Split
#include "common/error_msg.h" // for MaxFeatureSize, WarnOldSerialization, ...
#include "common/io.h" // for PeekableInStream, ReadAll, FixedSizeStream, Mem...
#include "common/observer.h" // for TrainingObserver
#include "common/random.h" // for GlobalRandom
@@ -278,15 +279,15 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
// Make sure read access everywhere for thread-safe prediction.
std::as_const(base_score_).HostView();
if (!ctx->IsCPU()) {
std::as_const(base_score_).View(ctx->gpu_id);
std::as_const(base_score_).View(ctx->Device());
}
CHECK(std::as_const(base_score_).Data()->HostCanRead());
}
linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(int32_t device) const {
linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(DeviceOrd device) const {
// multi-class is not yet supported.
CHECK_EQ(base_score_.Size(), 1) << ModelNotFitted();
if (device == Context::kCpuId) {
if (device.IsCPU()) {
// Make sure that we won't run into race condition.
CHECK(base_score_.Data()->HostCanRead());
return base_score_.HostView();
@@ -299,7 +300,7 @@ linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(int32_t device)
}
linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(Context const* ctx) const {
return this->BaseScore(ctx->gpu_id);
return this->BaseScore(ctx->Device());
}
void LearnerModelParam::Copy(LearnerModelParam const& that) {
@@ -308,7 +309,7 @@ void LearnerModelParam::Copy(LearnerModelParam const& that) {
base_score_.Data()->Copy(*that.base_score_.Data());
std::as_const(base_score_).HostView();
if (that.base_score_.DeviceIdx() != Context::kCpuId) {
std::as_const(base_score_).View(that.base_score_.DeviceIdx());
std::as_const(base_score_).View(that.base_score_.Device());
}
CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead());
CHECK(base_score_.Data()->HostCanRead());
@@ -356,21 +357,6 @@ DMLC_REGISTER_PARAMETER(LearnerTrainParam);
using LearnerAPIThreadLocalStore =
dmlc::ThreadLocalStore<std::map<Learner const *, XGBAPIThreadLocalEntry>>;
namespace {
StringView ModelMsg() {
return StringView{
R"doc(
If you are loading a serialized model (like pickle in Python, RDS in R) generated by
older XGBoost, please export the model by calling `Booster.save_model` from that version
first, then load it back in current version. See:
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
for more details about differences between saving model and serializing.
)doc"};
}
} // anonymous namespace
class LearnerConfiguration : public Learner {
private:
std::mutex config_lock_;
@@ -402,7 +388,7 @@ class LearnerConfiguration : public Learner {
this->ConfigureTargets();
auto task = UsePtr(obj_)->Task();
linalg::Tensor<float, 1> base_score({1}, Ctx()->gpu_id);
linalg::Tensor<float, 1> base_score({1}, Ctx()->Device());
auto h_base_score = base_score.HostView();
// transform to margin
@@ -438,7 +424,7 @@ class LearnerConfiguration : public Learner {
if (mparam_.boost_from_average && !UsePtr(gbm_)->ModelFitted()) {
if (p_fmat) {
auto const& info = p_fmat->Info();
info.Validate(Ctx()->gpu_id);
info.Validate(Ctx()->Ordinal());
// We estimate it from input data.
linalg::Tensor<float, 1> base_score;
InitEstimation(info, &base_score);
@@ -530,7 +516,7 @@ class LearnerConfiguration : public Learner {
}
if (!Version::Same(origin_version)) {
LOG(WARNING) << ModelMsg();
error::WarnOldSerialization();
return; // skip configuration if version is not matched
}
@@ -561,7 +547,7 @@ class LearnerConfiguration : public Learner {
for (size_t i = 0; i < n_metrics; ++i) {
auto old_serialization = IsA<String>(j_metrics[i]);
if (old_serialization) {
LOG(WARNING) << ModelMsg();
error::WarnOldSerialization();
metric_names_[i] = get<String>(j_metrics[i]);
} else {
metric_names_[i] = get<String>(j_metrics[i]["name"]);
@@ -597,8 +583,9 @@ class LearnerConfiguration : public Learner {
auto& objective_fn = learner_parameters["objective"];
obj_->SaveConfig(&objective_fn);
std::vector<Json> metrics(metrics_.size(), Json{Object{}});
std::vector<Json> metrics(metrics_.size());
for (size_t i = 0; i < metrics_.size(); ++i) {
metrics[i] = Object{};
metrics_[i]->SaveConfig(&metrics[i]);
}
learner_parameters["metrics"] = Array(std::move(metrics));
@@ -704,19 +691,20 @@ class LearnerConfiguration : public Learner {
stack.pop();
auto const &obj = get<Object const>(j_obj);
for (auto const &kv : obj) {
for (auto const& kv : obj) {
if (is_parameter(kv.first)) {
auto parameter = get<Object const>(kv.second);
std::transform(parameter.begin(), parameter.end(), std::back_inserter(keys),
[](std::pair<std::string const&, Json const&> const& kv) {
return kv.first;
});
std::transform(
parameter.begin(), parameter.end(), std::back_inserter(keys),
[](std::pair<std::string const&, Json const&> const& kv) { return kv.first; });
} else if (IsA<Object>(kv.second)) {
stack.push(kv.second);
} else if (kv.first == "metrics") {
} else if (IsA<Array>(kv.second)) {
auto const& array = get<Array const>(kv.second);
for (auto const& v : array) {
stack.push(v);
if (IsA<Object>(v) || IsA<Array>(v)) {
stack.push(v);
}
}
}
}
@@ -725,6 +713,7 @@ class LearnerConfiguration : public Learner {
// FIXME(trivialfis): Make eval_metric a training parameter.
keys.emplace_back(kEvalMetric);
keys.emplace_back("num_output_group");
keys.emplace_back("gpu_id"); // deprecated param.
std::sort(keys.begin(), keys.end());
@@ -763,9 +752,7 @@ class LearnerConfiguration : public Learner {
CHECK(matrix.first.ptr);
CHECK(!matrix.second.ref.expired());
const uint64_t num_col = matrix.first.ptr->Info().num_col_;
CHECK_LE(num_col, static_cast<uint64_t>(std::numeric_limits<unsigned>::max()))
<< "Unfortunately, XGBoost does not support data matrices with "
<< std::numeric_limits<unsigned>::max() << " features or greater";
error::MaxFeatureSize(num_col);
num_feature = std::max(num_feature, static_cast<uint32_t>(num_col));
}
@@ -810,7 +797,7 @@ class LearnerConfiguration : public Learner {
bool has_nc {cfg_.find("num_class") != cfg_.cend()};
// Inject num_class into configuration.
// FIXME(jiamingy): Remove the duplicated parameter in softmax
cfg_["num_class"] = common::ToString(mparam_.num_class);
cfg_["num_class"] = std::to_string(mparam_.num_class);
auto& args = *p_args;
args = {cfg_.cbegin(), cfg_.cend()}; // renew
obj_->Configure(args);
@@ -821,14 +808,13 @@ class LearnerConfiguration : public Learner {
void ConfigureMetrics(Args const& args) {
for (auto const& name : metric_names_) {
auto DupCheck = [&name](std::unique_ptr<Metric> const& m) {
return m->Name() != name;
};
auto DupCheck = [&name](std::unique_ptr<Metric> const& m) { return m->Name() != name; };
if (std::all_of(metrics_.begin(), metrics_.end(), DupCheck)) {
metrics_.emplace_back(std::unique_ptr<Metric>(Metric::Create(name, &ctx_)));
mparam_.contain_eval_metrics = 1;
}
}
for (auto& p_metric : metrics_) {
p_metric->Configure(args);
}
@@ -862,8 +848,7 @@ class LearnerConfiguration : public Learner {
void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) {
#ifndef XGBOOST_USE_HIP
base_score->Reshape(1);
collective::ApplyWithLabels(info, base_score->Data()->HostPointer(),
sizeof(bst_float) * base_score->Size(),
collective::ApplyWithLabels(info, base_score->Data(),
[&] { UsePtr(obj_)->InitEstimation(info, base_score); });
#else
if (info.IsVerticalFederated()) {
@@ -1101,7 +1086,7 @@ class LearnerIO : public LearnerConfiguration {
mparam_.major_version = std::get<0>(Version::Self());
mparam_.minor_version = std::get<1>(Version::Self());
cfg_["num_feature"] = common::ToString(mparam_.num_feature);
cfg_["num_feature"] = std::to_string(mparam_.num_feature);
auto n = tparam_.__DICT__();
cfg_.insert(n.cbegin(), n.cend());
@@ -1185,7 +1170,7 @@ class LearnerIO : public LearnerConfiguration {
Json memory_snapshot;
if (header[1] == '"') {
memory_snapshot = Json::Load(StringView{buffer});
LOG(WARNING) << ModelMsg();
error::WarnOldSerialization();
} else if (std::isalpha(header[1])) {
memory_snapshot = Json::Load(StringView{buffer}, std::ios::binary);
} else {
@@ -1204,7 +1189,7 @@ class LearnerIO : public LearnerConfiguration {
header.resize(serialisation_header_.size());
CHECK_EQ(fp.Read(&header[0], header.size()), serialisation_header_.size());
// Avoid printing the content in loaded header, which might be random binary code.
CHECK(header == serialisation_header_) << ModelMsg();
CHECK(header == serialisation_header_) << error::OldSerialization();
int64_t sz {-1};
CHECK_EQ(fp.Read(&sz, sizeof(sz)), sizeof(sz));
if (!DMLC_IO_NO_ENDIAN_SWAP) {
@@ -1307,14 +1292,14 @@ class LearnerImpl : public LearnerIO {
monitor_.Start("GetGradient");
GetGradient(predt.predictions, train->Info(), iter, &gpair_);
monitor_.Stop("GetGradient");
TrainingObserver::Instance().Observe(gpair_, "Gradients");
TrainingObserver::Instance().Observe(*gpair_.Data(), "Gradients");
gbm_->DoBoost(train.get(), &gpair_, &predt, obj_.get());
monitor_.Stop("UpdateOneIter");
}
void BoostOneIter(int iter, std::shared_ptr<DMatrix> train,
HostDeviceVector<GradientPair>* in_gpair) override {
linalg::Matrix<GradientPair>* in_gpair) override {
monitor_.Start("BoostOneIter");
this->Configure();
@@ -1324,6 +1309,9 @@ class LearnerImpl : public LearnerIO {
this->ValidateDMatrix(train.get(), true);
CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1))
<< "The number of columns in gradient should be equal to the number of targets/classes in "
"the model.";
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get());
monitor_.Stop("BoostOneIter");
@@ -1367,10 +1355,9 @@ class LearnerImpl : public LearnerIO {
}
void Predict(std::shared_ptr<DMatrix> data, bool output_margin,
HostDeviceVector<bst_float> *out_preds, unsigned layer_begin,
unsigned layer_end, bool training,
bool pred_leaf, bool pred_contribs, bool approx_contribs,
bool pred_interactions) override {
HostDeviceVector<bst_float>* out_preds, bst_layer_t layer_begin,
bst_layer_t layer_end, bool training, bool pred_leaf, bool pred_contribs,
bool approx_contribs, bool pred_interactions) override {
int multiple_predictions = static_cast<int>(pred_leaf) +
static_cast<int>(pred_interactions) +
static_cast<int>(pred_contribs);
@@ -1392,7 +1379,7 @@ class LearnerImpl : public LearnerIO {
auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id);
this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end);
// Copy the prediction cache to output prediction. out_preds comes from C API
out_preds->SetDevice(ctx_.gpu_id);
out_preds->SetDevice(ctx_.Device());
out_preds->Resize(prediction.predictions.Size());
out_preds->Copy(prediction.predictions);
if (!output_margin) {
@@ -1418,13 +1405,16 @@ class LearnerImpl : public LearnerIO {
}
void InplacePredict(std::shared_ptr<DMatrix> p_m, PredictionType type, float missing,
HostDeviceVector<bst_float>** out_preds, uint32_t iteration_begin,
uint32_t iteration_end) override {
HostDeviceVector<float>** out_preds, bst_layer_t iteration_begin,
bst_layer_t iteration_end) override {
this->Configure();
this->CheckModelInitialized();
auto& out_predictions = this->GetThreadLocal().prediction_entry;
out_predictions.Reset();
this->gbm_->InplacePredict(p_m, missing, &out_predictions, iteration_begin, iteration_end);
if (type == PredictionType::kValue) {
obj_->PredTransform(&out_predictions.predictions);
} else if (type == PredictionType::kMargin) {
@@ -1479,26 +1469,25 @@ class LearnerImpl : public LearnerIO {
}
if (p_fmat->Info().num_row_ == 0) {
LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
error::WarnEmptyDataset();
}
}
private:
void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info, int iteration,
HostDeviceVector<GradientPair>* out_gpair) {
#ifndef XGBOOST_USE_HIP
out_gpair->Resize(preds.Size());
collective::ApplyWithLabels(info, out_gpair->HostPointer(),
out_gpair->Size() * sizeof(GradientPair),
[&] { obj_->GetGradient(preds, info, iteration, out_gpair); });
#else
void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info,
std::int32_t iter, linalg::Matrix<GradientPair>* out_gpair) {
#if defined(XGBOOST_USE_CUDA)
out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength());
collective::ApplyWithLabels(info, out_gpair->Data(),
[&] { obj_->GetGradient(preds, info, iter, out_gpair); });
#elif defined(XGBOOST_USE_HIP)
if (info.IsVerticalFederated()) {
out_gpair->Resize(preds.Size());
collective::ApplyWithLabels(info, out_gpair->HostPointer(),
out_gpair->Size() * sizeof(GradientPair),
[&] { obj_->GetGradient(preds, info, iteration, out_gpair); });
} else {
obj_->GetGradient(preds, info, iteration, out_gpair);
out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength());
collective::ApplyWithLabels(info, out_gpair->Data(),
[&] { obj_->GetGradient(preds, info, iter, out_gpair); });
}
else {
obj_->GetGradient(preds, info, iter, out_gpair);
}
#endif
}
@@ -1506,7 +1495,7 @@ class LearnerImpl : public LearnerIO {
/*! \brief random number transformation seed. */
static int32_t constexpr kRandSeedMagic = 127;
// gradient pairs
HostDeviceVector<GradientPair> gpair_;
linalg::Matrix<GradientPair> gpair_;
/*! \brief Temporary storage to prediction. Useful for storing data transformed by
* objective function */
PredictionContainer output_predictions_;

View File

@@ -9,8 +9,7 @@
#include "coordinate_common.h"
#include "xgboost/json.h"
namespace xgboost {
namespace linear {
namespace xgboost::linear {
DMLC_REGISTER_PARAMETER(CoordinateParam);
DMLC_REGISTRY_FILE_TAG(updater_coordinate);
@@ -39,36 +38,38 @@ class CoordinateUpdater : public LinearUpdater {
FromJson(config.at("linear_train_param"), &tparam_);
FromJson(config.at("coordinate_param"), &cparam_);
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
void SaveConfig(Json *p_out) const override {
LOG(DEBUG) << "Save config for CPU updater.";
auto &out = *p_out;
out["linear_train_param"] = ToJson(tparam_);
out["coordinate_param"] = ToJson(cparam_);
}
void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
void Update(linalg::Matrix<GradientPair> *in_gpair, DMatrix *p_fmat, gbm::GBLinearModel *model,
double sum_instance_weight) override {
auto gpair = in_gpair->Data();
tparam_.DenormalizePenalties(sum_instance_weight);
const int ngroup = model->learner_model_param->num_output_group;
// update bias
for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
auto grad = GetBiasGradientParallel(group_idx, ngroup, in_gpair->ConstHostVector(), p_fmat,
auto grad = GetBiasGradientParallel(group_idx, ngroup, gpair->ConstHostVector(), p_fmat,
ctx_->Threads());
auto dbias = static_cast<float>(tparam_.learning_rate *
CoordinateDeltaBias(grad.first, grad.second));
model->Bias()[group_idx] += dbias;
UpdateBiasResidualParallel(ctx_, group_idx, ngroup, dbias, &in_gpair->HostVector(), p_fmat);
UpdateBiasResidualParallel(ctx_, group_idx, ngroup, dbias, &gpair->HostVector(), p_fmat);
}
// prepare for updating the weights
selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm,
selector_->Setup(ctx_, *model, gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm, cparam_.top_k);
// update weights
for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
for (unsigned i = 0U; i < model->learner_model_param->num_feature; i++) {
int fidx =
selector_->NextFeature(ctx_, i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
selector_->NextFeature(ctx_, i, *model, group_idx, gpair->ConstHostVector(), p_fmat,
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
if (fidx < 0) break;
this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), p_fmat, model);
this->UpdateFeature(fidx, group_idx, &gpair->HostVector(), p_fmat, model);
}
}
monitor_.Stop("UpdateFeature");
@@ -99,5 +100,4 @@ class CoordinateUpdater : public LinearUpdater {
XGBOOST_REGISTER_LINEAR_UPDATER(CoordinateUpdater, "coord_descent")
.describe("Update linear model according to coordinate descent algorithm.")
.set_body([]() { return new CoordinateUpdater(); });
} // namespace linear
} // namespace xgboost
} // namespace xgboost::linear

View File

@@ -15,8 +15,7 @@
#include "../common/timer.h"
#include "./param.h"
namespace xgboost {
namespace linear {
namespace xgboost::linear {
DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate);
@@ -29,7 +28,7 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate);
class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
public:
// set training parameter
void Configure(Args const& args) override {
void Configure(Args const &args) override {
tparam_.UpdateAllowUnknown(args);
coord_param_.UpdateAllowUnknown(args);
selector_.reset(FeatureSelector::Create(tparam_.feature_selector));
@@ -41,8 +40,9 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
FromJson(config.at("linear_train_param"), &tparam_);
FromJson(config.at("coordinate_param"), &coord_param_);
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
void SaveConfig(Json *p_out) const override {
LOG(DEBUG) << "Save config for GPU updater.";
auto &out = *p_out;
out["linear_train_param"] = ToJson(tparam_);
out["coordinate_param"] = ToJson(coord_param_);
}
@@ -106,18 +106,18 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
}
}
void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
void Update(linalg::Matrix<GradientPair> *in_gpair, DMatrix *p_fmat, gbm::GBLinearModel *model,
double sum_instance_weight) override {
tparam_.DenormalizePenalties(sum_instance_weight);
monitor_.Start("LazyInitDevice");
this->LazyInitDevice(p_fmat, *(model->learner_model_param));
monitor_.Stop("LazyInitDevice");
monitor_.Start("UpdateGpair");
auto &in_gpair_host = in_gpair->ConstHostVector();
// Update gpair
if (ctx_->gpu_id >= 0) {
this->UpdateGpair(in_gpair_host);
if (ctx_->IsCUDA()) {
this->UpdateGpair(in_gpair->Data()->ConstHostVector());
}
monitor_.Stop("UpdateGpair");
@@ -125,15 +125,15 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
this->UpdateBias(model);
monitor_.Stop("UpdateBias");
// prepare for updating the weights
selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm, coord_param_.top_k);
selector_->Setup(ctx_, *model, in_gpair->Data()->ConstHostVector(), p_fmat,
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm, coord_param_.top_k);
monitor_.Start("UpdateFeature");
for (uint32_t group_idx = 0; group_idx < model->learner_model_param->num_output_group;
++group_idx) {
for (auto i = 0U; i < model->learner_model_param->num_feature; i++) {
auto fidx =
selector_->NextFeature(ctx_, i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
selector_->NextFeature(ctx_, i, *model, group_idx, in_gpair->Data()->ConstHostVector(),
p_fmat, tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
if (fidx < 0) break;
this->UpdateFeature(fidx, group_idx, model);
}
@@ -279,5 +279,4 @@ XGBOOST_REGISTER_LINEAR_UPDATER(GPUCoordinateUpdater, "gpu_coord_descent")
"Update linear model according to coordinate descent algorithm. GPU "
"accelerated.")
.set_body([]() { return new GPUCoordinateUpdater(); });
} // namespace linear
} // namespace xgboost
} // namespace xgboost::linear

View File

@@ -6,8 +6,7 @@
#include <xgboost/linear_updater.h>
#include "coordinate_common.h"
namespace xgboost {
namespace linear {
namespace xgboost::linear {
DMLC_REGISTRY_FILE_TAG(updater_shotgun);
@@ -32,30 +31,31 @@ class ShotgunUpdater : public LinearUpdater {
out["linear_train_param"] = ToJson(param_);
}
void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
auto &gpair = in_gpair->HostVector();
void Update(linalg::Matrix<GradientPair> *in_gpair, DMatrix *p_fmat, gbm::GBLinearModel *model,
double sum_instance_weight) override {
auto gpair = in_gpair->Data();
param_.DenormalizePenalties(sum_instance_weight);
const int ngroup = model->learner_model_param->num_output_group;
// update bias
for (int gid = 0; gid < ngroup; ++gid) {
auto grad = GetBiasGradientParallel(gid, ngroup, in_gpair->ConstHostVector(), p_fmat,
auto grad = GetBiasGradientParallel(gid, ngroup, gpair->ConstHostVector(), p_fmat,
ctx_->Threads());
auto dbias = static_cast<bst_float>(param_.learning_rate *
CoordinateDeltaBias(grad.first, grad.second));
model->Bias()[gid] += dbias;
UpdateBiasResidualParallel(ctx_, gid, ngroup, dbias, &in_gpair->HostVector(), p_fmat);
UpdateBiasResidualParallel(ctx_, gid, ngroup, dbias, &gpair->HostVector(), p_fmat);
}
// lock-free parallel updates of weights
selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, param_.reg_alpha_denorm,
selector_->Setup(ctx_, *model, gpair->ConstHostVector(), p_fmat, param_.reg_alpha_denorm,
param_.reg_lambda_denorm, 0);
auto &h_gpair = gpair->HostVector();
for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx_)) {
auto page = batch.GetView();
const auto nfeat = static_cast<bst_omp_uint>(batch.Size());
common::ParallelFor(nfeat, ctx_->Threads(), [&](auto i) {
int ii = selector_->NextFeature(ctx_, i, *model, 0, in_gpair->ConstHostVector(), p_fmat,
int ii = selector_->NextFeature(ctx_, i, *model, 0, gpair->ConstHostVector(), p_fmat,
param_.reg_alpha_denorm, param_.reg_lambda_denorm);
if (ii < 0) return;
const bst_uint fid = ii;
@@ -63,7 +63,7 @@ class ShotgunUpdater : public LinearUpdater {
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
for (auto &c : col) {
const GradientPair &p = gpair[c.index * ngroup + gid];
const GradientPair &p = h_gpair[c.index * ngroup + gid];
if (p.GetHess() < 0.0f) continue;
const bst_float v = c.fvalue;
sum_grad += p.GetGrad() * v;
@@ -77,7 +77,7 @@ class ShotgunUpdater : public LinearUpdater {
w += dw;
// update grad values
for (auto &c : col) {
GradientPair &p = gpair[c.index * ngroup + gid];
GradientPair &p = h_gpair[c.index * ngroup + gid];
if (p.GetHess() < 0.0f) continue;
p += GradientPair(p.GetHess() * c.fvalue * dw, 0);
}
@@ -98,5 +98,4 @@ XGBOOST_REGISTER_LINEAR_UPDATER(ShotgunUpdater, "shotgun")
"Update linear model according to shotgun coordinate descent "
"algorithm.")
.set_body([]() { return new ShotgunUpdater(); });
} // namespace linear
} // namespace xgboost
} // namespace xgboost::linear

View File

@@ -82,22 +82,19 @@ template <typename BinaryAUC>
double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaInfo const &info,
size_t n_classes, int32_t n_threads, BinaryAUC &&binary_auc) {
CHECK_NE(n_classes, 0);
auto const labels = info.labels.View(Context::kCpuId);
auto const labels = info.labels.HostView();
if (labels.Shape(0) != 0) {
CHECK_EQ(labels.Shape(1), 1) << "AUC doesn't support multi-target model.";
}
std::vector<double> results_storage(n_classes * 3, 0);
linalg::TensorView<double, 2> results(results_storage, {n_classes, static_cast<size_t>(3)},
Context::kCpuId);
auto results = linalg::MakeTensorView(ctx, results_storage, n_classes, 3);
auto local_area = results.Slice(linalg::All(), 0);
auto tp = results.Slice(linalg::All(), 1);
auto auc = results.Slice(linalg::All(), 2);
auto weights = common::OptionalWeights{info.weights_.ConstHostSpan()};
auto predts_t = linalg::TensorView<float const, 2>(
predts, {static_cast<size_t>(info.num_row_), n_classes},
Context::kCpuId);
auto predts_t = linalg::MakeTensorView(ctx, predts, info.num_row_, n_classes);
if (info.labels.Size() != 0) {
common::ParallelFor(n_classes, n_threads, [&](auto c) {
@@ -108,8 +105,8 @@ double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaI
response[i] = labels(i) == c ? 1.0f : 0.0;
}
double fp;
std::tie(fp, tp(c), auc(c)) =
binary_auc(ctx, proba, linalg::MakeVec(response.data(), response.size(), -1), weights);
std::tie(fp, tp(c), auc(c)) = binary_auc(
ctx, proba, linalg::MakeVec(response.data(), response.size(), ctx->Device()), weights);
local_area(c) = fp * tp(c);
});
}
@@ -220,7 +217,7 @@ std::pair<double, uint32_t> RankingAUC(Context const *ctx, std::vector<float> co
CHECK_GE(info.group_ptr_.size(), 2);
uint32_t n_groups = info.group_ptr_.size() - 1;
auto s_predts = common::Span<float const>{predts};
auto labels = info.labels.View(Context::kCpuId);
auto labels = info.labels.View(ctx->Device());
auto s_weights = info.weights_.ConstHostSpan();
std::atomic<uint32_t> invalid_groups{0};
@@ -363,8 +360,8 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
} else {
std::tie(fp, tp, auc) = GPUBinaryROCAUC(predts.ConstDeviceSpan(), info,
ctx_->gpu_id, &this->d_cache_);
std::tie(fp, tp, auc) =
GPUBinaryROCAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_);
}
return std::make_tuple(fp, tp, auc);
}
@@ -381,8 +378,7 @@ XGBOOST_REGISTER_METRIC(EvalAUC, "auc")
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const>, MetaInfo const &,
std::int32_t,
std::shared_ptr<DeviceAUCCache> *) {
DeviceOrd, std::shared_ptr<DeviceAUCCache> *) {
common::AssertGPUSupport();
return {};
}
@@ -414,8 +410,8 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
} else {
std::tie(pr, re, auc) = GPUBinaryPRAUC(predts.ConstDeviceSpan(), info,
ctx_->gpu_id, &this->d_cache_);
std::tie(pr, re, auc) =
GPUBinaryPRAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_);
}
return std::make_tuple(pr, re, auc);
}
@@ -459,7 +455,7 @@ XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const>, MetaInfo const &,
std::int32_t, std::shared_ptr<DeviceAUCCache> *) {
DeviceOrd, std::shared_ptr<DeviceAUCCache> *) {
common::AssertGPUSupport();
return {};
}

View File

@@ -91,15 +91,14 @@ void InitCacheOnce(common::Span<float const> predts, std::shared_ptr<DeviceAUCCa
template <typename Fn>
std::tuple<double, double, double>
GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
int32_t device, common::Span<size_t const> d_sorted_idx,
DeviceOrd device, common::Span<size_t const> d_sorted_idx,
Fn area_fn, std::shared_ptr<DeviceAUCCache> cache) {
auto labels = info.labels.View(device);
auto weights = info.weights_.ConstDeviceSpan();
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device.ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device.ordinal));
#endif
CHECK_NE(labels.Size(), 0);
@@ -194,7 +193,7 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
}
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const> predts,
MetaInfo const &info, std::int32_t device,
MetaInfo const &info, DeviceOrd device,
std::shared_ptr<DeviceAUCCache> *p_cache) {
auto &cache = *p_cache;
InitCacheOnce<false>(predts, p_cache);
@@ -350,14 +349,14 @@ void SegmentedReduceAUC(common::Span<size_t const> d_unique_idx,
* up each class in all kernels.
*/
template <bool scale, typename Fn>
double GPUMultiClassAUCOVR(MetaInfo const &info, int32_t device, common::Span<uint32_t> d_class_ptr,
size_t n_classes, std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
common::Span<uint32_t> d_class_ptr, size_t n_classes,
std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device.ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device.ordinal));
#endif
/**
* Sorted idx
*/
@@ -528,11 +527,12 @@ double GPUMultiClassROCAUC(Context const *ctx, common::Span<float const> predts,
dh::TemporaryArray<uint32_t> class_ptr(n_classes + 1, 0);
MultiClassSortedIdx(ctx, predts, dh::ToSpan(class_ptr), cache);
auto fn = [] XGBOOST_DEVICE(double fp_prev, double fp, double tp_prev,
double tp, size_t /*class_id*/) {
auto fn = [] XGBOOST_DEVICE(double fp_prev, double fp, double tp_prev, double tp,
size_t /*class_id*/) {
return TrapezoidArea(fp_prev, fp, tp_prev, tp);
};
return GPUMultiClassAUCOVR<true>(info, ctx->gpu_id, dh::ToSpan(class_ptr), n_classes, cache, fn);
return GPUMultiClassAUCOVR<true>(info, ctx->Device(), dh::ToSpan(class_ptr), n_classes, cache,
fn);
}
namespace {
@@ -581,7 +581,7 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
/**
* Sort the labels
*/
auto d_labels = info.labels.View(ctx->gpu_id);
auto d_labels = info.labels.View(ctx->Device());
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
common::SegmentedArgSort<false, false>(ctx, d_labels.Values(), d_group_ptr, d_sorted_idx);
@@ -679,7 +679,7 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
}
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> predts,
MetaInfo const &info, std::int32_t device,
MetaInfo const &info, DeviceOrd device,
std::shared_ptr<DeviceAUCCache> *p_cache) {
auto& cache = *p_cache;
InitCacheOnce<false>(predts, p_cache);
@@ -744,7 +744,7 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
/**
* Get total positive/negative
*/
auto labels = info.labels.View(ctx->gpu_id);
auto labels = info.labels.View(ctx->Device());
auto n_samples = info.num_row_;
dh::caching_device_vector<Pair> totals(n_classes);
auto key_it =
@@ -785,13 +785,13 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp,
d_totals[class_id].first);
};
return GPUMultiClassAUCOVR<false>(info, ctx->gpu_id, d_class_ptr, n_classes, cache, fn);
return GPUMultiClassAUCOVR<false>(info, ctx->Device(), d_class_ptr, n_classes, cache, fn);
}
template <typename Fn>
std::pair<double, uint32_t>
GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
common::Span<uint32_t> d_group_ptr, int32_t device,
common::Span<uint32_t> d_group_ptr, DeviceOrd device,
std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
/**
* Sorted idx
@@ -960,7 +960,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
common::SegmentedArgSort<false, false>(ctx, predts, d_group_ptr, d_sorted_idx);
dh::XGBDeviceAllocator<char> alloc;
auto labels = info.labels.View(ctx->gpu_id);
auto labels = info.labels.View(ctx->Device());
#if defined(XGBOOST_USE_HIP)
if (thrust::any_of(thrust::hip::par(alloc), dh::tbegin(labels.Values()),
@@ -1016,7 +1016,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp,
d_totals[group_id].first);
};
return GPURankingPRAUCImpl(predts, info, d_group_ptr, ctx->gpu_id, cache, fn);
return GPURankingPRAUCImpl(predts, info, d_group_ptr, ctx->Device(), cache, fn);
}
} // namespace metric
} // namespace xgboost

View File

@@ -30,7 +30,7 @@ XGBOOST_DEVICE inline double TrapezoidArea(double x0, double x1, double y0, doub
struct DeviceAUCCache;
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const> predts,
MetaInfo const &info, std::int32_t device,
MetaInfo const &info, DeviceOrd,
std::shared_ptr<DeviceAUCCache> *p_cache);
double GPUMultiClassROCAUC(Context const *ctx, common::Span<float const> predts,
@@ -45,7 +45,7 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
* PR AUC *
**********/
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> predts,
MetaInfo const &info, std::int32_t device,
MetaInfo const &info, DeviceOrd,
std::shared_ptr<DeviceAUCCache> *p_cache);
double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,

View File

@@ -45,7 +45,7 @@ namespace {
template <typename Fn>
PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
PackedReduceResult result;
auto labels = info.labels.View(ctx->gpu_id);
auto labels = info.labels.View(ctx->Device());
if (ctx->IsCPU()) {
auto n_threads = ctx->Threads();
std::vector<double> score_tloc(n_threads, 0.0);
@@ -199,10 +199,10 @@ class PseudoErrorLoss : public MetricNoCache {
double Eval(const HostDeviceVector<bst_float>& preds, const MetaInfo& info) override {
CHECK_EQ(info.labels.Shape(0), info.num_row_);
auto labels = info.labels.View(ctx_->gpu_id);
preds.SetDevice(ctx_->gpu_id);
auto labels = info.labels.View(ctx_->Device());
preds.SetDevice(ctx_->Device());
auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
info.weights_.SetDevice(ctx_->gpu_id);
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan());
float slope = this->param_.huber_slope;
@@ -365,11 +365,11 @@ struct EvalEWiseBase : public MetricNoCache {
if (info.labels.Size() != 0) {
CHECK_NE(info.labels.Shape(1), 0);
}
auto labels = info.labels.View(ctx_->gpu_id);
info.weights_.SetDevice(ctx_->gpu_id);
auto labels = info.labels.View(ctx_->Device());
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan());
preds.SetDevice(ctx_->gpu_id);
preds.SetDevice(ctx_->Device());
auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
auto d_policy = policy_;
@@ -460,16 +460,16 @@ class QuantileError : public MetricNoCache {
}
auto const* ctx = ctx_;
auto y_true = info.labels.View(ctx->gpu_id);
preds.SetDevice(ctx->gpu_id);
alpha_.SetDevice(ctx->gpu_id);
auto y_true = info.labels.View(ctx->Device());
preds.SetDevice(ctx->Device());
alpha_.SetDevice(ctx->Device());
auto alpha = ctx->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
std::size_t n_targets = preds.Size() / info.num_row_ / alpha_.Size();
CHECK_NE(n_targets, 0);
auto y_predt = linalg::MakeTensorView(ctx, &preds, static_cast<std::size_t>(info.num_row_),
alpha_.Size(), n_targets);
info.weights_.SetDevice(ctx->gpu_id);
info.weights_.SetDevice(ctx->Device());
common::OptionalWeights weight{ctx->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()};

View File

@@ -68,13 +68,14 @@ struct EvalAMS : public MetricNoCache {
const auto &h_preds = preds.ConstHostVector();
common::ParallelFor(ndata, ctx_->Threads(),
[&](bst_omp_uint i) { rec[i] = std::make_pair(h_preds[i], i); });
common::Sort(ctx_, rec.begin(), rec.end(), common::CmpFirst);
common::Sort(ctx_, rec.begin(), rec.end(),
[](auto const& l, auto const& r) { return l.first > r.first; });
auto ntop = static_cast<unsigned>(ratio_ * ndata);
if (ntop == 0) ntop = ndata;
const double br = 10.0;
unsigned thresindex = 0;
double s_tp = 0.0, b_fp = 0.0, tams = 0.0;
const auto& labels = info.labels.View(Context::kCpuId);
const auto& labels = info.labels.View(DeviceOrd::CPU());
for (unsigned i = 0; i < static_cast<unsigned>(ndata-1) && i < ntop; ++i) {
const unsigned ridx = rec[i].second;
const bst_float wt = info.GetWeight(ridx);
@@ -133,7 +134,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig {
std::vector<double> sum_tloc(ctx_->Threads(), 0.0);
{
const auto& labels = info.labels.View(Context::kCpuId);
const auto& labels = info.labels.HostView();
const auto &h_preds = preds.ConstHostVector();
dmlc::OMPException exc;

View File

@@ -39,7 +39,7 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt,
std::shared_ptr<ltr::PreCache> p_cache) {
auto d_gptr = p_cache->DataGroupPtr(ctx);
auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
@@ -95,7 +95,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
if (!d_weight.Empty()) {
CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
}
auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());
@@ -125,9 +125,9 @@ PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt, bool minus,
std::shared_ptr<ltr::MAPCache> p_cache) {
auto d_group_ptr = p_cache->DataGroupPtr(ctx);
auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
predt.SetDevice(ctx->Device());
auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
auto key_it = dh::MakeTransformIterator<std::size_t>(
thrust::make_counting_iterator(0ul),

View File

@@ -11,6 +11,7 @@
#include <hipcub/hipcub.hpp> // NOLINT
#endif
#include "../collective/aggregator.h"
#include "../common/cuda_context.cuh" // CUDAContext
#include "../common/device_helpers.cuh"
#include "../common/stats.cuh"
@@ -30,11 +31,10 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
HostDeviceVector<bst_node_t>* p_nidx, RegTree const& tree) {
// copy position to buffer
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx->gpu_id));
dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
#endif
auto cuctx = ctx->CUDACtx();
size_t n_samples = position.size();
dh::device_vector<bst_node_t> sorted_position(position.size());
@@ -115,11 +115,11 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
*/
auto& nidx = *p_nidx;
auto& nptr = *p_nptr;
nidx.SetDevice(ctx->gpu_id);
nidx.SetDevice(ctx->Device());
nidx.Resize(n_leaf);
auto d_node_idx = nidx.DeviceSpan();
nptr.SetDevice(ctx->gpu_id);
nptr.SetDevice(ctx->Device());
nptr.Resize(n_leaf + 1, 0);
auto d_node_ptr = nptr.DeviceSpan();
@@ -172,11 +172,10 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx->gpu_id));
dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
#endif
dh::device_vector<size_t> ridx;
HostDeviceVector<size_t> nptr;
HostDeviceVector<bst_node_t> nidx;
@@ -188,38 +187,39 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
UpdateLeafValues(&quantiles, nidx.ConstHostVector(), info, learning_rate, p_tree);
}
HostDeviceVector<float> quantiles;
predt.SetDevice(ctx->gpu_id);
predt.SetDevice(ctx->Device());
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), info.num_row_,
predt.Size() / info.num_row_);
CHECK_LT(group_idx, d_predt.Shape(1));
auto t_predt = d_predt.Slice(linalg::All(), group_idx);
auto d_labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), IdxY(info, group_idx));
auto d_row_index = dh::ToSpan(ridx);
auto seg_beg = nptr.DevicePointer();
auto seg_end = seg_beg + nptr.Size();
auto val_beg = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(size_t i) {
float p = t_predt(d_row_index[i]);
auto y = d_labels(d_row_index[i]);
return y - p;
});
CHECK_EQ(d_labels.Shape(0), position.size());
auto val_end = val_beg + d_labels.Shape(0);
CHECK_EQ(nidx.Size() + 1, nptr.Size());
if (info.weights_.Empty()) {
common::SegmentedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, &quantiles);
} else {
info.weights_.SetDevice(ctx->gpu_id);
auto d_weights = info.weights_.ConstDeviceSpan();
CHECK_EQ(d_weights.size(), d_row_index.size());
auto w_it = thrust::make_permutation_iterator(dh::tcbegin(d_weights), dh::tcbegin(d_row_index));
common::SegmentedWeightedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, w_it,
w_it + d_weights.size(), &quantiles);
}
HostDeviceVector<float> quantiles;
collective::ApplyWithLabels(info, &quantiles, [&] {
auto d_labels = info.labels.View(ctx->Device()).Slice(linalg::All(), IdxY(info, group_idx));
auto d_row_index = dh::ToSpan(ridx);
auto seg_beg = nptr.DevicePointer();
auto seg_end = seg_beg + nptr.Size();
auto val_beg = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(size_t i) {
float p = t_predt(d_row_index[i]);
auto y = d_labels(d_row_index[i]);
return y - p;
});
CHECK_EQ(d_labels.Shape(0), position.size());
auto val_end = val_beg + d_labels.Shape(0);
CHECK_EQ(nidx.Size() + 1, nptr.Size());
if (info.weights_.Empty()) {
common::SegmentedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, &quantiles);
} else {
info.weights_.SetDevice(ctx->Device());
auto d_weights = info.weights_.ConstDeviceSpan();
CHECK_EQ(d_weights.size(), d_row_index.size());
auto w_it =
thrust::make_permutation_iterator(dh::tcbegin(d_weights), dh::tcbegin(d_row_index));
common::SegmentedWeightedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, w_it,
w_it + d_weights.size(), &quantiles);
}
});
UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate, p_tree);
}
} // namespace detail

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2019-2022 by Contributors
/**
* Copyright 2019-2023, XGBoost Contributors
* \file aft_obj.cu
* \brief Definition of AFT loss for survival analysis.
* \author Avinash Barnwal, Hyunsu Cho and Toby Hocking
@@ -41,11 +41,9 @@ class AFTObj : public ObjFunction {
ObjInfo Task() const override { return ObjInfo::kSurvival; }
template <typename Distribution>
void GetGradientImpl(const HostDeviceVector<bst_float> &preds,
const MetaInfo &info,
HostDeviceVector<GradientPair> *out_gpair,
size_t ndata, int device, bool is_null_weight,
float aft_loss_distribution_scale) {
void GetGradientImpl(const HostDeviceVector<bst_float>& preds, const MetaInfo& info,
linalg::Matrix<GradientPair>* out_gpair, size_t ndata, int device,
bool is_null_weight, float aft_loss_distribution_scale) {
common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx,
common::Span<GradientPair> _out_gpair,
@@ -66,16 +64,17 @@ class AFTObj : public ObjFunction {
_out_gpair[_idx] = GradientPair(grad * w, hess * w);
},
common::Range{0, static_cast<int64_t>(ndata)}, this->ctx_->Threads(), device).Eval(
out_gpair, &preds, &info.labels_lower_bound_, &info.labels_upper_bound_,
out_gpair->Data(), &preds, &info.labels_lower_bound_, &info.labels_upper_bound_,
&info.weights_);
}
void GetGradient(const HostDeviceVector<bst_float>& preds, const MetaInfo& info, int /*iter*/,
HostDeviceVector<GradientPair>* out_gpair) override {
linalg::Matrix<GradientPair>* out_gpair) override {
const size_t ndata = preds.Size();
CHECK_EQ(info.labels_lower_bound_.Size(), ndata);
CHECK_EQ(info.labels_upper_bound_.Size(), ndata);
out_gpair->Resize(ndata);
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(ndata, 1);
const int device = ctx_->gpu_id;
const float aft_loss_distribution_scale = param_.aft_loss_distribution_scale;
const bool is_null_weight = info.weights_.Size() == 0;

View File

@@ -27,8 +27,8 @@ class HingeObj : public ObjFunction {
void Configure(Args const&) override {}
ObjInfo Task() const override { return ObjInfo::kRegression; }
void GetGradient(const HostDeviceVector<bst_float> &preds, const MetaInfo &info, int /*iter*/,
HostDeviceVector<GradientPair> *out_gpair) override {
void GetGradient(const HostDeviceVector<bst_float> &preds, const MetaInfo &info,
std::int32_t /*iter*/, linalg::Matrix<GradientPair> *out_gpair) override {
CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels.Size())
<< "labels are not correctly provided"
@@ -41,7 +41,8 @@ class HingeObj : public ObjFunction {
CHECK_EQ(info.weights_.Size(), ndata)
<< "Number of weights should be equal to number of data points.";
}
out_gpair->Resize(ndata);
CHECK_EQ(info.labels.Shape(1), 1) << "Multi-target for `binary:hinge` is not yet supported.";
out_gpair->Reshape(ndata, 1);
common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx,
common::Span<GradientPair> _out_gpair,
@@ -63,7 +64,7 @@ class HingeObj : public ObjFunction {
},
common::Range{0, static_cast<int64_t>(ndata)}, this->ctx_->Threads(),
ctx_->gpu_id).Eval(
out_gpair, &preds, info.labels.Data(), &info.weights_);
out_gpair->Data(), &preds, info.labels.Data(), &info.weights_);
}
void PredTransform(HostDeviceVector<bst_float> *io_preds) const override {

View File

@@ -21,7 +21,7 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* b
}
// Avoid altering any state in child objective.
HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
HostDeviceVector<GradientPair> gpair(info.labels.Size(), GradientPair{}, this->ctx_->gpu_id);
linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->gpu_id);
Json config{Object{}};
this->SaveConfig(&config);

View File

@@ -109,12 +109,12 @@ class LambdaRankObj : public FitIntercept {
lj_.SetDevice(ctx_->gpu_id);
if (ctx_->IsCPU()) {
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->gpu_id),
lj_full_.View(ctx_->gpu_id), &ti_plus_, &tj_minus_,
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
&li_, &lj_, p_cache_);
} else {
cuda_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->gpu_id),
lj_full_.View(ctx_->gpu_id), &ti_plus_, &tj_minus_,
cuda_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
&li_, &lj_, p_cache_);
}
@@ -165,9 +165,8 @@ class LambdaRankObj : public FitIntercept {
void CalcLambdaForGroup(std::int32_t iter, common::Span<float const> g_predt,
linalg::VectorView<float const> g_label, float w,
common::Span<std::size_t const> g_rank, bst_group_t g, Delta delta,
common::Span<GradientPair> g_gpair) {
std::fill_n(g_gpair.data(), g_gpair.size(), GradientPair{});
auto p_gpair = g_gpair.data();
linalg::VectorView<GradientPair> g_gpair) {
std::fill_n(g_gpair.Values().data(), g_gpair.Size(), GradientPair{});
auto ti_plus = ti_plus_.HostView();
auto tj_minus = tj_minus_.HostView();
@@ -198,8 +197,8 @@ class LambdaRankObj : public FitIntercept {
std::size_t idx_high = g_rank[rank_high];
std::size_t idx_low = g_rank[rank_low];
p_gpair[idx_high] += pg;
p_gpair[idx_low] += ng;
g_gpair(idx_high) += pg;
g_gpair(idx_low) += ng;
if (unbiased) {
auto k = ti_plus.Size();
@@ -225,12 +224,13 @@ class LambdaRankObj : public FitIntercept {
MakePairs(ctx_, iter, p_cache_, g, g_label, g_rank, loop);
if (sum_lambda > 0.0) {
double norm = std::log2(1.0 + sum_lambda) / sum_lambda;
std::transform(g_gpair.data(), g_gpair.data() + g_gpair.size(), g_gpair.data(),
[norm](GradientPair const& g) { return g * norm; });
std::transform(g_gpair.Values().data(), g_gpair.Values().data() + g_gpair.Size(),
g_gpair.Values().data(), [norm](GradientPair const& g) { return g * norm; });
}
auto w_norm = p_cache_->WeightNorm();
std::transform(g_gpair.begin(), g_gpair.end(), g_gpair.begin(),
std::transform(g_gpair.Values().data(), g_gpair.Values().data() + g_gpair.Size(),
g_gpair.Values().data(),
[&](GradientPair const& gpair) { return gpair * w * w_norm; });
}
@@ -301,7 +301,7 @@ class LambdaRankObj : public FitIntercept {
}
void GetGradient(HostDeviceVector<float> const& predt, MetaInfo const& info, std::int32_t iter,
HostDeviceVector<GradientPair>* out_gpair) override {
linalg::Matrix<GradientPair>* out_gpair) override {
CHECK_EQ(info.labels.Size(), predt.Size()) << error::LabelScoreSize();
// init/renew cache
@@ -339,7 +339,7 @@ class LambdaRankNDCG : public LambdaRankObj<LambdaRankNDCG, ltr::NDCGCache> {
void CalcLambdaForGroupNDCG(std::int32_t iter, common::Span<float const> g_predt,
linalg::VectorView<float const> g_label, float w,
common::Span<std::size_t const> g_rank,
common::Span<GradientPair> g_gpair,
linalg::VectorView<GradientPair> g_gpair,
linalg::VectorView<double const> inv_IDCG,
common::Span<double const> discount, bst_group_t g) {
auto delta = [&](auto y_high, auto y_low, std::size_t rank_high, std::size_t rank_low,
@@ -351,20 +351,22 @@ class LambdaRankNDCG : public LambdaRankObj<LambdaRankNDCG, ltr::NDCGCache> {
}
void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
const MetaInfo& info, HostDeviceVector<GradientPair>* out_gpair) {
const MetaInfo& info, linalg::Matrix<GradientPair>* out_gpair) {
if (ctx_->IsCUDA()) {
cuda_impl::LambdaRankGetGradientNDCG(
ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id),
tj_minus_.View(ctx_->gpu_id), li_full_.View(ctx_->gpu_id), lj_full_.View(ctx_->gpu_id),
out_gpair);
ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->Device()),
tj_minus_.View(ctx_->Device()), li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), out_gpair);
return;
}
bst_group_t n_groups = p_cache_->Groups();
auto gptr = p_cache_->DataGroupPtr(ctx_);
out_gpair->Resize(info.num_row_);
auto h_gpair = out_gpair->HostSpan();
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, 1);
auto h_gpair = out_gpair->HostView();
auto h_predt = predt.ConstHostSpan();
auto h_label = info.labels.HostView();
auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);
@@ -378,7 +380,8 @@ class LambdaRankNDCG : public LambdaRankObj<LambdaRankNDCG, ltr::NDCGCache> {
std::size_t cnt = gptr[g + 1] - gptr[g];
auto w = h_weight[g];
auto g_predt = h_predt.subspan(gptr[g], cnt);
auto g_gpair = h_gpair.subspan(gptr[g], cnt);
auto g_gpair =
h_gpair.Slice(linalg::Range(static_cast<std::size_t>(gptr[g]), gptr[g] + cnt), 0);
auto g_label = h_label.Slice(make_range(g), 0);
auto g_rank = rank_idx.subspan(gptr[g], cnt);
@@ -420,7 +423,7 @@ void LambdaRankGetGradientNDCG(Context const*, std::int32_t, HostDeviceVector<fl
linalg::VectorView<double const>, // input bias ratio
linalg::VectorView<double const>, // input bias ratio
linalg::VectorView<double>, linalg::VectorView<double>,
HostDeviceVector<GradientPair>*) {
linalg::Matrix<GradientPair>*) {
common::AssertGPUSupport();
}
@@ -470,20 +473,23 @@ void MAPStat(Context const* ctx, linalg::VectorView<float const> label,
class LambdaRankMAP : public LambdaRankObj<LambdaRankMAP, ltr::MAPCache> {
public:
void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
const MetaInfo& info, HostDeviceVector<GradientPair>* out_gpair) {
const MetaInfo& info, linalg::Matrix<GradientPair>* out_gpair) {
CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the MAP objective.";
if (ctx_->IsCUDA()) {
return cuda_impl::LambdaRankGetGradientMAP(
ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id),
tj_minus_.View(ctx_->gpu_id), li_full_.View(ctx_->gpu_id), lj_full_.View(ctx_->gpu_id),
out_gpair);
ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->Device()),
tj_minus_.View(ctx_->Device()), li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), out_gpair);
}
auto gptr = p_cache_->DataGroupPtr(ctx_).data();
bst_group_t n_groups = p_cache_->Groups();
out_gpair->Resize(info.num_row_);
auto h_gpair = out_gpair->HostSpan();
CHECK_EQ(info.labels.Shape(1), 1) << "multi-target for learning to rank is not yet supported.";
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info));
auto h_gpair = out_gpair->HostView();
auto h_label = info.labels.HostView().Slice(linalg::All(), 0);
auto h_predt = predt.ConstHostSpan();
auto rank_idx = p_cache_->SortedIdx(ctx_, h_predt);
@@ -514,7 +520,7 @@ class LambdaRankMAP : public LambdaRankObj<LambdaRankMAP, ltr::MAPCache> {
auto cnt = gptr[g + 1] - gptr[g];
auto w = h_weight[g];
auto g_predt = h_predt.subspan(gptr[g], cnt);
auto g_gpair = h_gpair.subspan(gptr[g], cnt);
auto g_gpair = h_gpair.Slice(linalg::Range(gptr[g], gptr[g] + cnt), 0);
auto g_label = h_label.Slice(make_range(g));
auto g_rank = rank_idx.subspan(gptr[g], cnt);
@@ -545,7 +551,7 @@ void LambdaRankGetGradientMAP(Context const*, std::int32_t, HostDeviceVector<flo
linalg::VectorView<double const>, // input bias ratio
linalg::VectorView<double const>, // input bias ratio
linalg::VectorView<double>, linalg::VectorView<double>,
HostDeviceVector<GradientPair>*) {
linalg::Matrix<GradientPair>*) {
common::AssertGPUSupport();
}
} // namespace cuda_impl
@@ -557,20 +563,22 @@ void LambdaRankGetGradientMAP(Context const*, std::int32_t, HostDeviceVector<flo
class LambdaRankPairwise : public LambdaRankObj<LambdaRankPairwise, ltr::RankingCache> {
public:
void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
const MetaInfo& info, HostDeviceVector<GradientPair>* out_gpair) {
const MetaInfo& info, linalg::Matrix<GradientPair>* out_gpair) {
CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the pairwise objective.";
if (ctx_->IsCUDA()) {
return cuda_impl::LambdaRankGetGradientPairwise(
ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id),
tj_minus_.View(ctx_->gpu_id), li_full_.View(ctx_->gpu_id), lj_full_.View(ctx_->gpu_id),
out_gpair);
ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->Device()),
tj_minus_.View(ctx_->Device()), li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), out_gpair);
}
auto gptr = p_cache_->DataGroupPtr(ctx_);
bst_group_t n_groups = p_cache_->Groups();
out_gpair->Resize(info.num_row_);
auto h_gpair = out_gpair->HostSpan();
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info));
auto h_gpair = out_gpair->HostView();
auto h_label = info.labels.HostView().Slice(linalg::All(), 0);
auto h_predt = predt.ConstHostSpan();
auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);
@@ -585,7 +593,7 @@ class LambdaRankPairwise : public LambdaRankObj<LambdaRankPairwise, ltr::Ranking
auto cnt = gptr[g + 1] - gptr[g];
auto w = h_weight[g];
auto g_predt = h_predt.subspan(gptr[g], cnt);
auto g_gpair = h_gpair.subspan(gptr[g], cnt);
auto g_gpair = h_gpair.Slice(linalg::Range(gptr[g], gptr[g] + cnt), 0);
auto g_label = h_label.Slice(make_range(g));
auto g_rank = rank_idx.subspan(gptr[g], cnt);
@@ -611,7 +619,7 @@ void LambdaRankGetGradientPairwise(Context const*, std::int32_t, HostDeviceVecto
linalg::VectorView<double const>, // input bias ratio
linalg::VectorView<double const>, // input bias ratio
linalg::VectorView<double>, linalg::VectorView<double>,
HostDeviceVector<GradientPair>*) {
linalg::Matrix<GradientPair>*) {
common::AssertGPUSupport();
}
} // namespace cuda_impl

Some files were not shown because too many files have changed in this diff Show More