enable ROCm on latest XGBoost

This commit is contained in:
Hui Liu
2023-10-23 11:07:08 -07:00
328 changed files with 8028 additions and 3642 deletions

View File

@@ -11,10 +11,10 @@ set_source_files_properties(
PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON)
target_sources(objxgboost PRIVATE ${RABIT_SOURCES})
if (USE_CUDA)
if(USE_CUDA)
file(GLOB_RECURSE CUDA_SOURCES *.cu *.cuh)
target_sources(objxgboost PRIVATE ${CUDA_SOURCES})
endif (USE_CUDA)
endif()
if (USE_HIP)
file(GLOB_RECURSE HIP_SOURCES *.hip *.hip.h)
@@ -27,9 +27,9 @@ target_include_directories(objxgboost
${xgboost_SOURCE_DIR}/dmlc-core/include
${xgboost_SOURCE_DIR}/rabit/include)
if (LOG_CAPI_INVOCATION)
if(LOG_CAPI_INVOCATION)
target_compile_definitions(objxgboost PRIVATE -DLOG_CAPI_INVOCATION=1)
endif (LOG_CAPI_INVOCATION)
endif()
# For MSVC: Call msvc_use_static_runtime() once again to completely
# replace /MD with /MT. See https://github.com/dmlc/xgboost/issues/4462

View File

@@ -271,8 +271,8 @@ XGB_DLL int XGDMatrixCreateFromDataIter(
if (cache_info != nullptr) {
scache = cache_info;
}
xgboost::data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext,
XGBoostBatchCSR> adapter(data_handle, callback);
xgboost::data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR> adapter(
data_handle, callback);
xgboost_CHECK_C_ARG_PTR(out);
*out = new std::shared_ptr<DMatrix> {
DMatrix::Create(
@@ -447,8 +447,11 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, char const *indices, char
auto config = Json::Load(StringView{c_json_config});
float missing = GetMissing(config);
auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", 0);
auto data_split_mode =
static_cast<DataSplitMode>(OptionalArg<Integer, int64_t>(config, "data_split_mode", 0));
xgboost_CHECK_C_ARG_PTR(out);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
*out = new std::shared_ptr<DMatrix>(
DMatrix::Create(&adapter, missing, n_threads, "", data_split_mode));
API_END();
}
@@ -483,8 +486,11 @@ XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char
auto config = Json::Load(StringView{c_json_config});
float missing = GetMissing(config);
auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", common::OmpGetNumThreads(0));
auto data_split_mode =
static_cast<DataSplitMode>(OptionalArg<Integer, int64_t>(config, "data_split_mode", 0));
xgboost_CHECK_C_ARG_PTR(out);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
*out = new std::shared_ptr<DMatrix>(
DMatrix::Create(&adapter, missing, n_threads, "", data_split_mode));
API_END();
}
@@ -534,33 +540,8 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data, const char** feature_stypes,
API_END();
}
XGB_DLL int XGImportArrowRecordBatch(DataIterHandle data_handle, void *ptr_array,
void *ptr_schema) {
API_BEGIN();
static_cast<data::RecordBatchesIterAdapter *>(data_handle)
->SetData(static_cast<struct ArrowArray *>(ptr_array),
static_cast<struct ArrowSchema *>(ptr_schema));
API_END();
}
XGB_DLL int XGDMatrixCreateFromArrowCallback(XGDMatrixCallbackNext *next, char const *config,
DMatrixHandle *out) {
API_BEGIN();
xgboost_CHECK_C_ARG_PTR(config);
auto jconfig = Json::Load(StringView{config});
auto missing = GetMissing(jconfig);
auto n_batches = RequiredArg<Integer>(jconfig, "nbatch", __func__);
auto n_threads = OptionalArg<Integer, std::int64_t>(jconfig, "nthread", 0);
data::RecordBatchesIterAdapter adapter(next, n_batches);
xgboost_CHECK_C_ARG_PTR(out);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
API_END();
}
XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle,
const int* idxset,
xgboost::bst_ulong len,
DMatrixHandle* out) {
XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle, const int *idxset, xgboost::bst_ulong len,
DMatrixHandle *out) {
xgboost_CHECK_C_ARG_PTR(out);
return XGDMatrixSliceDMatrixEx(handle, idxset, len, out, 0);
}
@@ -749,6 +730,15 @@ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle const handle, xgboost::bst_ulon
API_END();
}
// C API accessor: report the data split mode (row/column split) of a DMatrix.
// `out` receives the numeric value of the DMatrix's `data_split_mode` field.
XGB_DLL int XGDMatrixDataSplitMode(DMatrixHandle handle, bst_ulong *out) {
  API_BEGIN();
  CHECK_HANDLE();
  auto p_m = CastDMatrixHandle(handle);
  xgboost_CHECK_C_ARG_PTR(out);
  // The enum value is widened to bst_ulong for the C ABI.
  *out = static_cast<xgboost::bst_ulong>(p_m->Info().data_split_mode);
  API_END();
}
XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config,
xgboost::bst_ulong *out_indptr, unsigned *out_indices,
float *out_data) {
@@ -1375,29 +1365,6 @@ XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *json_co
API_END();
}
XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, xgboost::bst_ulong *out_len,
const char **out_dptr) {
API_BEGIN();
CHECK_HANDLE();
auto *learner = static_cast<Learner*>(handle);
std::string& raw_str = learner->GetThreadLocal().ret_str;
raw_str.resize(0);
common::MemoryBufferStream fo(&raw_str);
LOG(WARNING) << error::DeprecatedFunc(__func__, "1.6.0", "XGBoosterSaveModelToBuffer");
learner->Configure();
learner->SaveModel(&fo);
xgboost_CHECK_C_ARG_PTR(out_dptr);
xgboost_CHECK_C_ARG_PTR(out_len);
*out_dptr = dmlc::BeginPtr(raw_str);
*out_len = static_cast<xgboost::bst_ulong>(raw_str.length());
API_END();
}
// The following two functions are `Load` and `Save` for memory based
// serialization methods. E.g. Python pickle.
XGB_DLL int XGBoosterSerializeToBuffer(BoosterHandle handle, xgboost::bst_ulong *out_len,
@@ -1432,36 +1399,13 @@ XGB_DLL int XGBoosterUnserializeFromBuffer(BoosterHandle handle,
API_END();
}
XGB_DLL int XGBoosterLoadRabitCheckpoint(BoosterHandle handle,
int* version) {
API_BEGIN();
CHECK_HANDLE();
auto* bst = static_cast<Learner*>(handle);
xgboost_CHECK_C_ARG_PTR(version);
*version = rabit::LoadCheckPoint();
if (*version != 0) {
bst->Configure();
}
API_END();
}
XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle) {
API_BEGIN();
CHECK_HANDLE();
auto *learner = static_cast<Learner *>(handle);
learner->Configure();
rabit::CheckPoint();
API_END();
}
XGB_DLL int XGBoosterSlice(BoosterHandle handle, int begin_layer,
int end_layer, int step,
XGB_DLL int XGBoosterSlice(BoosterHandle handle, int begin_layer, int end_layer, int step,
BoosterHandle *out) {
API_BEGIN();
CHECK_HANDLE();
xgboost_CHECK_C_ARG_PTR(out);
auto* learner = static_cast<Learner*>(handle);
auto *learner = static_cast<Learner *>(handle);
bool out_of_bound = false;
auto p_out = learner->Slice(begin_layer, end_layer, step, &out_of_bound);
if (out_of_bound) {
@@ -1797,7 +1741,7 @@ XGB_DLL int XGCommunicatorAllreduce(void *send_receive_buffer, size_t count, int
}
#if defined(XGBOOST_USE_FEDERATED)
XGB_DLL int XGBRunFederatedServer(int port, int world_size, char const *server_key_path,
XGB_DLL int XGBRunFederatedServer(int port, std::size_t world_size, char const *server_key_path,
char const *server_cert_path, char const *client_cert_path) {
API_BEGIN();
federated::RunServer(port, world_size, server_key_path, server_cert_path, client_cert_path);
@@ -1805,7 +1749,7 @@ XGB_DLL int XGBRunFederatedServer(int port, int world_size, char const *server_k
}
// Run a server without SSL for local testing.
XGB_DLL int XGBRunInsecureFederatedServer(int port, int world_size) {
XGB_DLL int XGBRunInsecureFederatedServer(int port, std::size_t world_size) {
API_BEGIN();
federated::RunInsecureServer(port, world_size);
API_END();

View File

@@ -75,7 +75,7 @@ void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> con
auto hess_dev = dh::CudaGetPointerDevice(hess.data);
CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device.";
auto &gpair = *out_gpair;
gpair.SetDevice(grad_dev);
gpair.SetDevice(DeviceOrd::CUDA(grad_dev));
gpair.Reshape(grad.Shape(0), grad.Shape(1));
auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev));
auto cuctx = ctx->CUDACtx();
@@ -153,7 +153,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
if (learner->Ctx()->IsCUDA()) {
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
}
p_predt->SetDevice(proxy->DeviceIdx());
p_predt->SetDevice(proxy->Device());
auto &shape = learner->GetThreadLocal().prediction_shape;
size_t n_samples = p_m->Info().num_row_;

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2021-2023 by XGBoost Contributors
* Copyright 2021-2023, XGBoost Contributors
*/
#ifndef XGBOOST_C_API_C_API_UTILS_H_
#define XGBOOST_C_API_C_API_UTILS_H_
@@ -13,6 +13,7 @@
#include <utility> // for move
#include <vector>
#include "../common/json_utils.h" // for TypeCheck
#include "xgboost/c_api.h"
#include "xgboost/data.h" // DMatrix
#include "xgboost/feature_map.h" // for FeatureMap
@@ -254,28 +255,6 @@ inline void GenerateFeatureMap(Learner const *learner,
void XGBBuildInfoDevice(Json* p_info);
template <typename JT>
auto const &RequiredArg(Json const &in, StringView key, StringView func) {
auto const &obj = get<Object const>(in);
auto it = obj.find(key);
if (it == obj.cend() || IsA<Null>(it->second)) {
LOG(FATAL) << "Argument `" << key << "` is required for `" << func << "`.";
}
TypeCheck<JT>(it->second, StringView{key});
return get<std::remove_const_t<JT> const>(it->second);
}
template <typename JT, typename T>
auto const &OptionalArg(Json const &in, StringView key, T const &dft) {
auto const &obj = get<Object const>(in);
auto it = obj.find(key);
if (it != obj.cend() && !IsA<Null>(it->second)) {
TypeCheck<JT>(it->second, key);
return get<std::remove_const_t<JT> const>(it->second);
}
return dft;
}
/**
* \brief Get shared ptr from DMatrix C handle with additional checks.
*/

View File

@@ -15,8 +15,7 @@
#include "communicator-inl.cuh"
namespace xgboost {
namespace collective {
namespace xgboost::collective {
/**
* @brief Find the global sum of the given values across all workers.
@@ -31,10 +30,9 @@ namespace collective {
* @param size Number of values to sum.
*/
template <typename T>
void GlobalSum(MetaInfo const& info, int device, T* values, size_t size) {
void GlobalSum(MetaInfo const& info, DeviceOrd device, T* values, size_t size) {
if (info.IsRowSplit()) {
collective::AllReduce<collective::Operation::kSum>(device, values, size);
collective::AllReduce<collective::Operation::kSum>(device.ordinal, values, size);
}
}
} // namespace collective
} // namespace xgboost
} // namespace xgboost::collective

View File

@@ -0,0 +1,88 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include "allgather.h"
#include <algorithm> // for min, copy_n
#include <cstddef> // for size_t
#include <cstdint> // for int8_t, int32_t, int64_t
#include <memory> // for shared_ptr
#include <numeric> // for partial_sum
#include <vector> // for vector
#include "comm.h" // for Comm, Channel
#include "xgboost/collective/result.h" // for Result
#include "xgboost/span.h" // for Span
namespace xgboost::collective::cpu_impl {
// Fixed-size ring allgather: in each round every worker forwards one segment to its
// ring successor while receiving the preceding segment from its ring predecessor.
// `data` holds world-many segments of `segment_size` bytes (the last one may be
// shorter); on return every worker holds all segments.
Result RingAllgather(Comm const& comm, common::Span<std::int8_t> data, std::size_t segment_size,
                     std::int32_t worker_off, std::shared_ptr<Channel> prev_ch,
                     std::shared_ptr<Channel> next_ch) {
  auto world = comm.World();
  auto rank = comm.Rank();
  CHECK_LT(worker_off, world);

  for (std::int32_t r = 0; r < world; ++r) {
    // Segment forwarded this round. `worker_off` shifts which segment each rank is
    // considered to own (the allreduce pass uses worker_off = 1, since after
    // reduce-scatter a rank holds the reduced segment one slot ahead of its own).
    auto send_rank = (rank + world - r + worker_off) % world;
    auto send_off = send_rank * segment_size;
    // Clamp offset and length so the trailing (possibly short) segment stays in bounds.
    send_off = std::min(send_off, data.size_bytes());
    auto send_seg = data.subspan(send_off, std::min(segment_size, data.size_bytes() - send_off));
    next_ch->SendAll(send_seg.data(), send_seg.size_bytes());

    // Receive the segment our predecessor forwards in the same round.
    auto recv_rank = (rank + world - r - 1 + worker_off) % world;
    auto recv_off = recv_rank * segment_size;
    recv_off = std::min(recv_off, data.size_bytes());
    auto recv_seg = data.subspan(recv_off, std::min(segment_size, data.size_bytes() - recv_off));
    prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes());
    // Flush this round's queued send/recv before touching the buffers again.
    auto rc = prev_ch->Block();
    if (!rc.OK()) {
      return rc;
    }
  }
  return Success();
}
// Variable-length ring allgather: `sizes[r]` is the byte count contributed by rank r,
// `data` is this worker's contribution, and `erased_result` must be large enough to
// hold the concatenation of all contributions in rank order.
[[nodiscard]] Result RingAllgatherV(Comm const& comm, common::Span<std::int64_t const> sizes,
                                    common::Span<std::int8_t const> data,
                                    common::Span<std::int8_t> erased_result) {
  auto world = comm.World();
  auto rank = comm.Rank();
  auto prev = BootstrapPrev(rank, comm.World());
  auto next = BootstrapNext(rank, comm.World());
  auto prev_ch = comm.Chan(prev);
  auto next_ch = comm.Chan(next);

  // get worker offset
  // Exclusive prefix sum of `sizes` yields each rank's byte offset into the result.
  std::vector<std::int64_t> offset(world + 1, 0);
  std::partial_sum(sizes.cbegin(), sizes.cend(), offset.begin() + 1);
  CHECK_EQ(*offset.cbegin(), 0);

  // copy data
  // Seed the result with our own contribution before the ring rounds start.
  auto current = erased_result.subspan(offset[rank], data.size_bytes());
  auto erased_data = EraseType(data);
  std::copy_n(erased_data.data(), erased_data.size(), current.data());

  for (std::int32_t r = 0; r < world; ++r) {
    // Forward the chunk received last round (own chunk in round 0) to the successor,
    // receive the predecessor's chunk for this round.
    auto send_rank = (rank + world - r) % world;
    auto send_off = offset[send_rank];
    auto send_size = sizes[send_rank];
    auto send_seg = erased_result.subspan(send_off, send_size);
    next_ch->SendAll(send_seg);

    auto recv_rank = (rank + world - r - 1) % world;
    auto recv_off = offset[recv_rank];
    auto recv_size = sizes[recv_rank];
    auto recv_seg = erased_result.subspan(recv_off, recv_size);
    prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes());
    // Wait for this round's traffic to complete before the next round reuses buffers.
    auto rc = prev_ch->Block();
    if (!rc.OK()) {
      return rc;
    }
  }
  return comm.Block();
}
} // namespace xgboost::collective::cpu_impl

View File

@@ -0,0 +1,72 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <memory> // for shared_ptr
#include <numeric> // for accumulate
#include <type_traits> // for remove_cv_t
#include <vector> // for vector
#include "comm.h" // for Comm, Channel, EraseType
#include "xgboost/collective/result.h" // for Result
#include "xgboost/span.h" // for Span
namespace xgboost::collective {
namespace cpu_impl {
/**
* @param worker_off Segment offset. For example, if the rank 2 worker specifies worker_off
* = 1, then it owns the third segment.
*/
[[nodiscard]] Result RingAllgather(Comm const& comm, common::Span<std::int8_t> data,
std::size_t segment_size, std::int32_t worker_off,
std::shared_ptr<Channel> prev_ch,
std::shared_ptr<Channel> next_ch);
[[nodiscard]] Result RingAllgatherV(Comm const& comm, common::Span<std::int64_t const> sizes,
common::Span<std::int8_t const> data,
common::Span<std::int8_t> erased_result);
} // namespace cpu_impl
/**
 * @brief Typed front end to the byte-level ring allgather.
 *
 * @param data  Buffer holding world-many segments; each worker contributes the segment
 *              it owns and receives everyone else's.
 * @param size  Number of elements (of type T) per segment.
 */
template <typename T>
[[nodiscard]] Result RingAllgather(Comm const& comm, common::Span<T> data, std::size_t size) {
  auto const self = comm.Rank();
  auto const segment_bytes = sizeof(T) * size;
  // Ring neighbours of this worker.
  auto prev_ch = comm.Chan(BootstrapPrev(self, comm.World()));
  auto next_ch = comm.Chan(BootstrapNext(self, comm.World()));
  // Run the type-erased implementation, then drain any remaining channel work.
  if (auto rc = cpu_impl::RingAllgather(comm, EraseType(data), segment_bytes, 0, prev_ch, next_ch);
      !rc.OK()) {
    return rc;
  }
  return comm.Block();
}
/**
 * @brief Typed variable-length allgather: concatenate every worker's `data` into
 *        `*p_out` in rank order.
 *
 * First exchanges the per-worker byte counts so each worker can size the result and
 * compute offsets, then runs the byte-level variable-length ring allgather.
 */
template <typename T>
[[nodiscard]] Result RingAllgatherV(Comm const& comm, common::Span<T> data,
                                    std::vector<std::remove_cv_t<T>>* p_out) {
  auto world = comm.World();
  auto rank = comm.Rank();

  // Share each worker's contribution size (in bytes) with every other worker.
  std::vector<std::int64_t> sizes(world, 0);
  sizes[rank] = data.size_bytes();
  auto rc = RingAllgather(comm, common::Span{sizes.data(), sizes.size()}, 1);
  if (!rc.OK()) {
    return rc;
  }

  std::vector<T>& result = *p_out;
  // Use an int64_t init value: with the literal `0` the accumulator is `int`, which
  // silently truncates once the combined payload exceeds INT_MAX bytes.
  auto n_total_bytes = std::accumulate(sizes.cbegin(), sizes.cend(), std::int64_t{0});
  result.resize(n_total_bytes / sizeof(T));
  auto h_result = common::Span{result.data(), result.size()};
  auto erased_result = EraseType(h_result);
  auto erased_data = EraseType(data);
  return cpu_impl::RingAllgatherV(comm, sizes, erased_data, erased_result);
}
} // namespace xgboost::collective

View File

@@ -0,0 +1,90 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include "allreduce.h"
#include <algorithm> // for min
#include <cstddef> // for size_t
#include <cstdint> // for int32_t, int8_t
#include <vector> // for vector
#include "../data/array_interface.h" // for Type, DispatchDType
#include "allgather.h" // for RingAllgather
#include "comm.h" // for Comm
#include "xgboost/collective/result.h" // for Result
#include "xgboost/span.h" // for Span
namespace xgboost::collective::cpu_impl {
// Reduce-scatter phase of the ring allreduce: after world - 1 rounds each worker holds
// the fully reduced values for exactly one segment of `data` (segment (rank + 1) % world).
// `op(lhs, out)` accumulates `lhs` into `out`.
template <typename T>
Result RingScatterReduceTyped(Comm const& comm, common::Span<std::int8_t> data,
                              std::size_t n_bytes_in_seg, Func const& op) {
  auto rank = comm.Rank();
  auto world = comm.World();
  auto dst_rank = BootstrapNext(rank, world);
  auto src_rank = BootstrapPrev(rank, world);
  auto next_ch = comm.Chan(dst_rank);
  auto prev_ch = comm.Chan(src_rank);

  // Scratch buffer for the incoming segment; the reduction reads from here and writes
  // into the corresponding slice of `data`.
  std::vector<std::int8_t> buffer(n_bytes_in_seg, 0);
  auto s_buf = common::Span{buffer.data(), buffer.size()};

  for (std::int32_t r = 0; r < world - 1; ++r) {
    // send to ring next
    // Offsets are clamped so the trailing (possibly short) segment stays in bounds.
    auto send_off = ((rank + world - r) % world) * n_bytes_in_seg;
    send_off = std::min(send_off, data.size_bytes());
    auto seg_nbytes = std::min(data.size_bytes() - send_off, n_bytes_in_seg);
    auto send_seg = data.subspan(send_off, seg_nbytes);
    next_ch->SendAll(send_seg);

    // receive from ring prev
    auto recv_off = ((rank + world - r - 1) % world) * n_bytes_in_seg;
    recv_off = std::min(recv_off, data.size_bytes());
    seg_nbytes = std::min(data.size_bytes() - recv_off, n_bytes_in_seg);
    // Partial elements cannot be reduced; segment boundaries must respect sizeof(T).
    CHECK_EQ(seg_nbytes % sizeof(T), 0);
    auto recv_seg = data.subspan(recv_off, seg_nbytes);
    auto seg = s_buf.subspan(0, recv_seg.size());
    prev_ch->RecvAll(seg);
    // Wait for this round's traffic before reducing / reusing the scratch buffer.
    auto rc = prev_ch->Block();
    if (!rc.OK()) {
      return rc;
    }
    // accumulate to recv_seg
    CHECK_EQ(seg.size(), recv_seg.size());
    op(seg, recv_seg);
  }
  return Success();
}
// Ring allreduce over a type-erased buffer: dispatch on the runtime dtype, run the
// reduce-scatter phase, then allgather the reduced segments around the ring.
Result RingAllreduce(Comm const& comm, common::Span<std::int8_t> data, Func const& op,
                     ArrayInterfaceHandler::Type type) {
  return DispatchDType(type, [&](auto t) {
    using T = decltype(t);
    // Divide the data into segments according to the number of workers.
    auto n_bytes_elem = sizeof(T);
    CHECK_EQ(data.size_bytes() % n_bytes_elem, 0);
    auto n = data.size_bytes() / n_bytes_elem;
    auto world = comm.World();
    // Round up so segment boundaries land on whole elements.
    auto n_bytes_in_seg = common::DivRoundUp(n, world) * sizeof(T);
    auto rc = RingScatterReduceTyped<T>(comm, data, n_bytes_in_seg, op);
    if (!rc.OK()) {
      return rc;
    }

    auto prev = BootstrapPrev(comm.Rank(), comm.World());
    auto next = BootstrapNext(comm.Rank(), comm.World());
    auto prev_ch = comm.Chan(prev);
    auto next_ch = comm.Chan(next);

    // After reduce-scatter, rank holds the reduced segment (rank + 1) % world, hence
    // worker_off = 1 for the allgather phase.
    rc = RingAllgather(comm, data, n_bytes_in_seg, 1, prev_ch, next_ch);
    if (!rc.OK()) {
      return rc;
    }
    return comm.Block();
  });
}
} // namespace xgboost::collective::cpu_impl

View File

@@ -0,0 +1,39 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <cstdint> // for int8_t
#include <functional> // for function
#include <type_traits> // for is_invocable_v
#include "../data/array_interface.h" // for ArrayInterfaceHandler
#include "comm.h" // for Comm, RestoreType
#include "xgboost/collective/result.h" // for Result
#include "xgboost/span.h" // for Span
namespace xgboost::collective {
namespace cpu_impl {
using Func =
std::function<void(common::Span<std::int8_t const> lhs, common::Span<std::int8_t> out)>;
Result RingAllreduce(Comm const& comm, common::Span<std::int8_t> data, Func const& op,
ArrayInterfaceHandler::Type type);
} // namespace cpu_impl
/**
 * @brief Typed allreduce: erase the element type, run the byte-level ring allreduce,
 *        and restore the type inside the reduction callback.
 *
 * @param redop  Reduction `redop(lhs, out)` accumulating `lhs` into `out`.
 */
template <typename T, typename Fn>
std::enable_if_t<std::is_invocable_v<Fn, common::Span<T const>, common::Span<T>>, Result> Allreduce(
    Comm const& comm, common::Span<T> data, Fn redop) {
  // Byte-level adaptor around the typed reduction.
  auto erased_op = [redop](common::Span<std::int8_t const> raw_lhs,
                           common::Span<std::int8_t> raw_out) {
    CHECK_EQ(raw_lhs.size(), raw_out.size()) << "Invalid input for reduction.";
    redop(RestoreType<T const>(raw_lhs), RestoreType<T>(raw_out));
  };
  return cpu_impl::RingAllreduce(comm, EraseType(data), erased_op, ToDType<T>::kType);
}
} // namespace xgboost::collective

View File

@@ -0,0 +1,84 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include "broadcast.h"
#include <cmath> // for ceil, log2
#include <cstdint> // for int32_t, int8_t
#include <utility> // for move
#include "../common/bitfield.h" // for TrailingZeroBits, RBitField32
#include "comm.h" // for Comm
#include "xgboost/collective/result.h" // for Result
#include "xgboost/span.h" // for Span
namespace xgboost::collective::cpu_impl {
namespace {
// Parent of `shifted_rank` in the binomial broadcast tree (root shifted to rank 0):
// the parent is found by clearing the lowest set bit among the low `depth + 1` bits,
// i.e. shifted_parent = shifted_rank - 2^k where k is the lowest set-bit position.
std::int32_t ShiftedParentRank(std::int32_t shifted_rank, std::int32_t depth) {
  std::uint32_t mask{std::uint32_t{0} - 1};  // 0xff... (all bits set)
  RBitField32 maskbits{common::Span<std::uint32_t>{&mask, 1}};
  RBitField32 rankbits{
      common::Span<std::uint32_t>{reinterpret_cast<std::uint32_t*>(&shifted_rank), 1}};
  // prepare for counting trailing zeros.
  // Copy the low depth+1 bits of the rank into the mask; the remaining high bits stay
  // set so TrailingZeroBits never scans past them.
  for (std::int32_t i = 0; i < depth + 1; ++i) {
    if (rankbits.Check(i)) {
      maskbits.Set(i);
    } else {
      maskbits.Clear(i);
    }
  }
  CHECK_NE(mask, 0);
  auto k = TrailingZeroBits(mask);
  auto shifted_parent = shifted_rank - (1 << k);
  return shifted_parent;
}
// Map `rank` into the coordinate system where `root` becomes rank 0; `world` is added
// before the modulo to keep the result non-negative.
std::int32_t ShiftLeft(std::int32_t rank, std::int32_t world, std::int32_t root) {
  return (rank + world - root) % world;
}
// Inverse of ShiftLeft: map a shifted rank back to the original ring coordinates.
std::int32_t ShiftRight(std::int32_t rank, std::int32_t world, std::int32_t root) {
  return (rank + root) % world;
}
} // namespace
// Broadcast `data` from `root` to every worker using a binomial tree.
Result Broadcast(Comm const& comm, common::Span<std::int8_t> data, std::int32_t root) {
  // Binomial tree broadcast
  // * Wiki
  // https://en.wikipedia.org/wiki/Broadcast_(parallel_pattern)#Binomial_Tree_Broadcast
  // * Impl
  // https://people.mpi-inf.mpg.de/~mehlhorn/ftp/NewToolbox/collective.pdf
  auto rank = comm.Rank();
  auto world = comm.World();

  // shift root to rank 0
  auto shifted_rank = ShiftLeft(rank, world, root);
  // Tree depth minus one; for world == 1 this is -1 and both phases below are no-ops.
  std::int32_t depth = std::ceil(std::log2(static_cast<double>(world))) - 1;

  if (shifted_rank != 0) {  // not root
    // Every non-root worker first receives the payload from its tree parent.
    auto parent = ShiftRight(ShiftedParentRank(shifted_rank, depth), world, root);
    comm.Chan(parent)->RecvAll(data);
    auto rc = comm.Chan(parent)->Block();
    if (!rc.OK()) {
      return Fail("broadcast failed.", std::move(rc));
    }
  }

  // Then forward the payload down the subtree, halving the step each level.
  for (std::int32_t i = depth; i >= 0; --i) {
    CHECK_GE((i + 1), 0);  // weird clang-tidy error that i might be negative
    if (shifted_rank % (1 << (i + 1)) == 0 && shifted_rank + (1 << i) < world) {
      auto sft_peer = shifted_rank + (1 << i);
      auto peer = ShiftRight(sft_peer, world, root);
      CHECK_NE(peer, root);
      comm.Chan(peer)->SendAll(data);
    }
  }
  return comm.Block();
}
} // namespace xgboost::collective::cpu_impl

View File

@@ -0,0 +1,26 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <cstdint> // for int32_t, int8_t
#include "comm.h" // for Comm
#include "xgboost/collective/result.h" // for
#include "xgboost/span.h" // for Span
namespace xgboost::collective {
namespace cpu_impl {
Result Broadcast(Comm const& comm, common::Span<std::int8_t> data, std::int32_t root);
}
/**
* @brief binomial tree broadcast is used on CPU with the default implementation.
*/
/**
 * @brief binomial tree broadcast is used on CPU with the default implementation.
 */
template <typename T>
[[nodiscard]] Result Broadcast(Comm const& comm, common::Span<T> data, std::int32_t root) {
  // View the payload as raw bytes and delegate to the byte-level implementation.
  common::Span<std::int8_t> raw{reinterpret_cast<std::int8_t*>(data.data()), data.size_bytes()};
  return cpu_impl::Broadcast(comm, raw, root);
}
} // namespace xgboost::collective

304
src/collective/comm.cc Normal file
View File

@@ -0,0 +1,304 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include "comm.h"
#include <algorithm> // for copy
#include <chrono> // for seconds
#include <memory> // for shared_ptr
#include <string> // for string
#include <utility> // for move, forward
#include "allgather.h"
#include "protocol.h" // for kMagic
#include "xgboost/base.h" // for XGBOOST_STRICT_R_MODE
#include "xgboost/collective/socket.h" // for TCPSocket
#include "xgboost/json.h" // for Json, Object
#include "xgboost/string_view.h" // for StringView
namespace xgboost::collective {
// Store tracker coordinates and retry/timeout policy; no network activity happens here.
Comm::Comm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
           std::int32_t retry, std::string task_id)
    : timeout_{timeout},
      retry_{retry},
      tracker_{host, port, -1},  // rank is unknown until bootstrap (-1)
      task_id_{std::move(task_id)},
      loop_{std::make_shared<Loop>(timeout)} {}
// Open a blocking connection to the tracker and run the initial handshake: magic-number
// verification followed by worker identification (world size, rank, task id).
Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, std::int32_t retry,
                          std::string const& task_id, TCPSocket* out, std::int32_t rank,
                          std::int32_t world) {
  // get information from tracker
  CHECK(!info.host.empty());
  auto rc = Connect(info.host, info.port, retry, timeout, out);
  if (!rc.OK()) {
    return Fail("Failed to connect to the tracker.", std::move(rc));
  }

  TCPSocket& tracker = *out;
  // NOTE(review): the `<<` chain presumably stops at the first failing step — confirm
  // against Result::operator<< semantics.
  return std::move(rc)
      << [&] { return tracker.NonBlocking(false); }
      << [&] { return tracker.RecvTimeout(timeout); }
      << [&] { return proto::Magic{}.Verify(&tracker); }
      << [&] { return proto::Connect{}.WorkerSend(&tracker, world, rank, task_id); };
}
// Connect to the tracker using this communicator's stored coordinates and identity.
[[nodiscard]] Result Comm::ConnectTracker(TCPSocket* out) const {
  return ConnectTrackerImpl(this->TrackerInfo(), this->Timeout(), this->retry_, this->task_id_, out,
                            this->Rank(), this->World());
}
// Establish a direct TCP connection to every other worker. First the ring neighbours
// are connected (next via `ninfo`, prev via `listener`), then host names and ports are
// exchanged over the ring, and finally the full mesh is built: each worker dials every
// higher rank and accepts connections from every lower rank.
[[nodiscard]] Result ConnectWorkers(Comm const& comm, TCPSocket* listener, std::int32_t lport,
                                    proto::PeerInfo ninfo, std::chrono::seconds timeout,
                                    std::int32_t retry,
                                    std::vector<std::shared_ptr<TCPSocket>>* out_workers) {
  auto next = std::make_shared<TCPSocket>();
  auto prev = std::make_shared<TCPSocket>();

  // Dial the ring successor while accepting the ring predecessor's connection.
  auto rc = Success() << [&] {
    auto rc = Connect(ninfo.host, ninfo.port, retry, timeout, next.get());
    if (!rc.OK()) {
      return Fail("Bootstrap failed to connect to ring next.", std::move(rc));
    }
    return rc;
  } << [&] {
    return next->NonBlocking(true);
  } << [&] {
    SockAddrV4 addr;
    return listener->Accept(prev.get(), &addr);
  } << [&] { return prev->NonBlocking(true); };
  if (!rc.OK()) {
    return rc;
  }

  // exchange host name and port
  // Fixed-width slots of HOST_NAME_MAX bytes per worker; each worker writes the host of
  // its ring successor into its own slot before the allgather.
  std::vector<std::int8_t> buffer(HOST_NAME_MAX * comm.World(), 0);
  auto s_buffer = common::Span{buffer.data(), buffer.size()};
  auto next_host = s_buffer.subspan(HOST_NAME_MAX * comm.Rank(), HOST_NAME_MAX);
  if (next_host.size() < ninfo.host.size()) {
    return Fail("Got an invalid host name.");
  }
  std::copy(ninfo.host.cbegin(), ninfo.host.cend(), next_host.begin());

  auto prev_ch = std::make_shared<Channel>(comm, prev);
  auto next_ch = std::make_shared<Channel>(comm, next);
  // Flush pending traffic on both ring channels.
  auto block = [&] {
    for (auto ch : {prev_ch, next_ch}) {
      auto rc = ch->Block();
      if (!rc.OK()) {
        return rc;
      }
    }
    return Success();
  };

  rc = std::move(rc) << [&] {
    return cpu_impl::RingAllgather(comm, s_buffer, HOST_NAME_MAX, 0, prev_ch, next_ch);
  } << [&] { return block(); };
  if (!rc.OK()) {
    return Fail("Failed to get host names from peers.", std::move(rc));
  }

  // Same exchange for the listening ports (each worker contributes its successor's port).
  std::vector<std::int32_t> peers_port(comm.World(), -1);
  peers_port[comm.Rank()] = ninfo.port;
  rc = std::move(rc) << [&] {
    auto s_ports = common::Span{reinterpret_cast<std::int8_t*>(peers_port.data()),
                                peers_port.size() * sizeof(ninfo.port)};
    return cpu_impl::RingAllgather(comm, s_ports, sizeof(ninfo.port), 0, prev_ch, next_ch);
  } << [&] { return block(); };
  if (!rc.OK()) {
    return Fail("Failed to get the port from peers.", std::move(rc));
  }

  // Slot r holds the endpoint of rank r's ring successor, so the info in slot r belongs
  // to rank BootstrapNext(r).
  std::vector<proto::PeerInfo> peers(comm.World());
  for (auto r = 0; r < comm.World(); ++r) {
    auto nhost = s_buffer.subspan(HOST_NAME_MAX * r, HOST_NAME_MAX);
    auto nport = peers_port[r];
    auto nrank = BootstrapNext(r, comm.World());
    peers[nrank] = {std::string{reinterpret_cast<char const*>(nhost.data())}, nport, nrank};
  }
  // Sanity check: what the ring reports for us must match our own listening port.
  CHECK_EQ(peers[comm.Rank()].port, lport);
  for (auto const& p : peers) {
    CHECK_NE(p.port, -1);
  }

  std::vector<std::shared_ptr<TCPSocket>>& workers = *out_workers;
  workers.resize(comm.World());

  // Dial every higher rank, announcing our rank after connecting.
  for (std::int32_t r = (comm.Rank() + 1); r < comm.World(); ++r) {
    auto const& peer = peers[r];
    std::shared_ptr<TCPSocket> worker{TCPSocket::CreatePtr(comm.Domain())};
    rc = std::move(rc)
         << [&] { return Connect(peer.host, peer.port, retry, timeout, worker.get()); }
         << [&] { return worker->RecvTimeout(timeout); };
    if (!rc.OK()) {
      return rc;
    }
    auto rank = comm.Rank();
    auto n_bytes = worker->SendAll(&rank, sizeof(comm.Rank()));
    if (n_bytes != sizeof(comm.Rank())) {
      return Fail("Failed to send rank.");
    }
    workers[r] = std::move(worker);
  }

  // Accept a connection from every lower rank; the peer sends its rank first so we know
  // which slot the socket belongs in.
  for (std::int32_t r = 0; r < comm.Rank(); ++r) {
    SockAddrV4 addr;
    auto peer = std::shared_ptr<TCPSocket>(TCPSocket::CreatePtr(comm.Domain()));
    rc = std::move(rc) << [&] { return listener->Accept(peer.get(), &addr); }
                       << [&] { return peer->RecvTimeout(timeout); };
    if (!rc.OK()) {
      return rc;
    }
    std::int32_t rank{-1};
    auto n_bytes = peer->RecvAll(&rank, sizeof(rank));
    if (n_bytes != sizeof(comm.Rank())) {
      return Fail("Failed to recv rank.");
    }
    workers[rank] = std::move(peer);
  }

  // Every slot except our own must now hold a live socket.
  for (std::int32_t r = 0; r < comm.World(); ++r) {
    if (r == comm.Rank()) {
      continue;
    }
    CHECK(workers[r]);
  }

  return Success();
}
// Construct and eagerly bootstrap the ring topology; construction fails hard if the
// tracker or the ring neighbours cannot be reached.
RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
                     std::int32_t retry, std::string task_id)
    // `host` is a const reference, so the previous `std::move(host)` was a no-op copy
    // (clang-tidy: performance-move-const-arg); pass it through directly. `task_id` is
    // a by-value sink parameter and is genuinely moved.
    : Comm{host, port, timeout, retry, std::move(task_id)} {
  auto rc = this->Bootstrap(timeout_, retry_, task_id_);
  CHECK(rc.OK()) << rc.Report();
}
// Join the cluster: handshake with the tracker, set up an error-notification listener,
// learn the world size and ring neighbours, then build direct connections to all peers.
[[nodiscard]] Result RabitComm::Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
                                          std::string task_id) {
  TCPSocket tracker;
  // World size is not known yet; -1 tells the tracker to fill it in.
  std::int32_t world{-1};
  auto rc = ConnectTrackerImpl(this->TrackerInfo(), timeout, retry, task_id, &tracker, this->Rank(),
                               world);
  if (!rc.OK()) {
    return Fail("Bootstrap failed.", std::move(rc));
  }

  this->domain_ = tracker.Domain();

  // Start command
  // Listener on an OS-chosen port; peers connect here during ConnectWorkers.
  TCPSocket listener = TCPSocket::Create(tracker.Domain());
  std::int32_t lport = listener.BindHost();
  listener.Listen();

  // create worker for listening to error notice.
  // The tracker connects to this socket when another worker fails; on notification we
  // shut down and terminate the process (exit, unless built for R).
  auto domain = tracker.Domain();
  std::shared_ptr<TCPSocket> error_sock{TCPSocket::CreatePtr(domain)};
  auto eport = error_sock->BindHost();
  error_sock->Listen();
  error_worker_ = std::thread{[this, error_sock = std::move(error_sock)] {
    auto conn = error_sock->Accept();
    // On Windows accept returns an invalid socket after network is shutdown.
    if (conn.IsClosed()) {
      return;
    }
    LOG(WARNING) << "Another worker is running into error.";
    std::string scmd;
    conn.Recv(&scmd);
    auto jcmd = Json::Load(scmd);

    auto rc = this->Shutdown();
    if (!rc.OK()) {
      LOG(WARNING) << "Fail to shutdown worker:" << rc.Report();
    }

#if !defined(XGBOOST_STRICT_R_MODE) || XGBOOST_STRICT_R_MODE == 0
    exit(-1);
#else
    LOG(FATAL) << rc.Report();
#endif
  }};
  error_worker_.detach();

  // Announce our listening/error ports and receive the world size.
  proto::Start start;
  rc = std::move(rc) << [&] { return start.WorkerSend(lport, &tracker, eport); }
                     << [&] { return start.WorkerRecv(&tracker, &world); };
  if (!rc.OK()) {
    return rc;
  }
  this->world_ = world;

  // get ring neighbors
  std::string snext;
  tracker.Recv(&snext);
  auto jnext = Json::Load(StringView{snext});
  proto::PeerInfo ninfo{jnext};

  // get the rank of this worker
  // The tracker sends our ring successor's info, so our rank is its predecessor.
  this->rank_ = BootstrapPrev(ninfo.rank, world);
  this->tracker_.rank = rank_;

  std::vector<std::shared_ptr<TCPSocket>> workers;
  rc = ConnectWorkers(*this, &listener, lport, ninfo, timeout, retry, &workers);
  if (!rc.OK()) {
    return rc;
  }

  // Wrap each peer socket in a non-blocking, no-delay channel; the slot for our own
  // rank is a null socket.
  CHECK(this->channels_.empty());
  for (auto& w : workers) {
    if (w) {
      w->SetNoDelay();
      rc = w->NonBlocking(true);
    }
    if (!rc.OK()) {
      return rc;
    }
    this->channels_.emplace_back(std::make_shared<Channel>(*this, w));
  }
  return rc;
}
// Best-effort teardown: notify the tracker on destruction. Failures are logged rather
// than thrown, even though the destructor is declared noexcept(false).
RabitComm::~RabitComm() noexcept(false) {
  // Single-process (non-distributed) communicators have nothing to tear down.
  if (!IsDistributed()) {
    return;
  }
  auto rc = this->Shutdown();
  if (!rc.OK()) {
    LOG(WARNING) << rc.Report();
  }
}
// Gracefully leave the cluster: reconnect to the tracker, flush pending collective
// work, then announce shutdown.
[[nodiscard]] Result RabitComm::Shutdown() {
  TCPSocket tracker;
  return Success() << [&] {
    return ConnectTrackerImpl(tracker_, timeout_, retry_, task_id_, &tracker, Rank(), World());
  } << [&] {
    // Drain queued channel work before telling the tracker we are gone.
    return this->Block();
  } << [&] {
    Json jcmd{Object{}};
    jcmd["cmd"] = Integer{static_cast<std::int32_t>(proto::CMD::kShutdown)};
    auto scmd = Json::Dump(jcmd);
    auto n_bytes = tracker.Send(scmd);
    if (n_bytes != scmd.size()) {
      // Fixed misspelled error message ("Faled" -> "Failed").
      return Fail("Failed to send cmd.");
    }
    return Success();
  };
}
// Send a log message to the tracker over a fresh, short-lived connection.
[[nodiscard]] Result RabitComm::LogTracker(std::string msg) const {
  TCPSocket out;
  proto::Print print;
  return Success() << [&] { return this->ConnectTracker(&out); }
                   << [&] { return print.WorkerSend(&out, msg); };
}
// Report a local failure to the tracker so it can notify the other workers.
[[nodiscard]] Result RabitComm::SignalError(Result const& res) {
  TCPSocket out;
  return Success() << [&] { return this->ConnectTracker(&out); }
                   << [&] { return proto::ErrorCMD{}.WorkerSend(&out, res); };
}
} // namespace xgboost::collective

156
src/collective/comm.h Normal file
View File

@@ -0,0 +1,156 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <chrono> // for seconds
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <memory> // for shared_ptr
#include <string> // for string
#include <thread> // for thread
#include <type_traits> // for remove_const_t
#include <utility> // for move
#include <vector> // for vector
#include "loop.h" // for Loop
#include "protocol.h" // for PeerInfo
#include "xgboost/collective/result.h" // for Result
#include "xgboost/collective/socket.h" // for TCPSocket
#include "xgboost/span.h" // for Span
namespace xgboost::collective {
// Default socket-operation timeout: 5 minutes.
inline constexpr std::int32_t DefaultTimeoutSec() { return 300; }
// Default number of connection retries.
inline constexpr std::int32_t DefaultRetry() { return 3; }
// indexing into the ring
// Next rank on the bootstrap ring; wraps back to 0 after the last worker.
inline std::int32_t BootstrapNext(std::int32_t r, std::int32_t world) {
  // `world` is added before the modulus to keep the left operand non-negative.
  return (r + world + 1) % world;
}
// Previous rank on the bootstrap ring; wraps to world - 1 below rank 0.
inline std::int32_t BootstrapPrev(std::int32_t r, std::int32_t world) {
  return (r + world - 1) % world;
}
class Channel;
/**
 * @brief Base communicator storing info about the tracker and other communicators.
 */
class Comm {
 protected:
  std::int32_t world_{1};  // Number of workers; 1 means non-distributed.
  std::int32_t rank_{0};   // Rank of this worker, in [0, world_).
  std::chrono::seconds timeout_{DefaultTimeoutSec()};  // Socket operation timeout.
  std::int32_t retry_{DefaultRetry()};                 // Connection retry budget.
  proto::PeerInfo tracker_;             // Host/port/rank of the tracker.
  SockDomain domain_{SockDomain::kV4};  // Socket address family.
  std::thread error_worker_;  // NOTE(review): not referenced in this header; presumably an
                              // error-monitoring thread — confirm usage in the .cc file.
  std::string task_id_;       // Identifier sent to the tracker for this task.
  std::vector<std::shared_ptr<Channel>> channels_;  // One channel per peer worker.
  std::shared_ptr<Loop> loop_{new Loop{std::chrono::seconds{
      DefaultTimeoutSec()}}};  // fixme: require federated comm to have a timeout

 public:
  Comm() = default;
  Comm(std::string const& host, std::int32_t port, std::chrono::seconds timeout, std::int32_t retry,
       std::string task_id);
  virtual ~Comm() noexcept(false) {}  // NOLINT
  // Non-copyable and non-movable: each Channel stores a `Comm const&` back to
  // this object, so relocation would dangle those references.
  Comm(Comm const& that) = delete;
  Comm& operator=(Comm const& that) = delete;
  Comm(Comm&& that) = delete;
  Comm& operator=(Comm&& that) = delete;

  [[nodiscard]] auto TrackerInfo() const { return tracker_; }
  [[nodiscard]] Result ConnectTracker(TCPSocket* out) const;
  [[nodiscard]] auto Domain() const { return domain_; }
  [[nodiscard]] auto Timeout() const { return timeout_; }

  [[nodiscard]] auto Rank() const { return rank_; }
  [[nodiscard]] auto World() const { return world_; }
  [[nodiscard]] bool IsDistributed() const { return World() > 1; }
  // Queue an asynchronous socket operation on the shared event loop.
  void Submit(Loop::Op op) const { loop_->Submit(op); }
  // Wait until all queued operations finish; returns the loop's status.
  [[nodiscard]] Result Block() const { return loop_->Block(); }

  // Channel connected to the given peer rank (throws if out of range).
  [[nodiscard]] virtual std::shared_ptr<Channel> Chan(std::int32_t rank) const {
    return channels_.at(rank);
  }
  [[nodiscard]] virtual bool IsFederated() const = 0;
  // Forward a log message to the tracker.
  [[nodiscard]] virtual Result LogTracker(std::string msg) const = 0;
  // Notify the tracker of a local error; default implementation is a no-op.
  [[nodiscard]] virtual Result SignalError(Result const&) { return Success(); }
};
/**
 * @brief Non-federated communicator implemented over plain TCP sockets.
 */
class RabitComm : public Comm {
  // Connect to the tracker and the ring neighbours to establish the topology.
  [[nodiscard]] Result Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
                                 std::string task_id);
  // Notify the tracker that this worker is leaving.
  [[nodiscard]] Result Shutdown();

 public:
  // bootstrapping construction.
  RabitComm() = default;
  // ctor for testing where environment is known.
  RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
            std::int32_t retry, std::string task_id);
  // May throw if Shutdown fails during destruction; hence noexcept(false).
  ~RabitComm() noexcept(false) override;

  [[nodiscard]] bool IsFederated() const override { return false; }
  [[nodiscard]] Result LogTracker(std::string msg) const override;
  [[nodiscard]] Result SignalError(Result const&) override;
};
/**
 * @brief Communication channel between workers.
 *
 * Send/recv calls only *queue* operations on the communicator's event loop;
 * call Block() to wait for completion and obtain the status.
 */
class Channel {
  std::shared_ptr<TCPSocket> sock_{nullptr};  // Socket connected to the peer.
  Result rc_;  // NOTE(review): never written within this header — confirm intended use.
  Comm const& comm_;  // Owning communicator; provides the event loop.

 public:
  explicit Channel(Comm const& comm, std::shared_ptr<TCPSocket> sock)
      : sock_{std::move(sock)}, comm_{comm} {}

  // Queue an asynchronous send of n bytes starting at ptr.
  void SendAll(std::int8_t const* ptr, std::size_t n) {
    // const_cast is needed because Loop::Op stores a single mutable pointer
    // for both reads and writes; write ops only read from the buffer.
    Loop::Op op{Loop::Op::kWrite, comm_.Rank(), const_cast<std::int8_t*>(ptr), n, sock_.get(), 0};
    CHECK(sock_.get());
    comm_.Submit(std::move(op));
  }
  void SendAll(common::Span<std::int8_t const> data) {
    this->SendAll(data.data(), data.size_bytes());
  }

  // Queue an asynchronous receive of n bytes into ptr.
  void RecvAll(std::int8_t* ptr, std::size_t n) {
    Loop::Op op{Loop::Op::kRead, comm_.Rank(), ptr, n, sock_.get(), 0};
    CHECK(sock_.get());
    comm_.Submit(std::move(op));
  }
  void RecvAll(common::Span<std::int8_t> data) { this->RecvAll(data.data(), data.size_bytes()); }

  [[nodiscard]] auto Socket() const { return sock_; }
  // Wait for all submitted operations on the communicator's loop.
  [[nodiscard]] Result Block() { return comm_.Block(); }
};
// Reduce operations supported by allreduce.
enum class Op { kMax = 0, kMin = 1, kSum = 2, kBitwiseAND = 3, kBitwiseOR = 4, kBitwiseXOR = 5 };

/**
 * @brief View a typed span as raw bytes; const-ness of T is propagated to the
 *        byte type U.
 */
template <typename T, typename U = std::conditional_t<std::is_const_v<T>,
                                                      std::add_const_t<std::int8_t>, std::int8_t>>
common::Span<U> EraseType(common::Span<T> data) {
  auto n_total_bytes = data.size_bytes();
  auto erased = common::Span{reinterpret_cast<std::add_pointer_t<U>>(data.data()), n_total_bytes};
  return erased;
}

/**
 * @brief Reinterpret a byte span as a span of T.  Trailing bytes that do not
 *        fill a whole T are dropped by the integer division.
 */
template <typename T, typename U>
common::Span<T> RestoreType(common::Span<U> data) {
  // Only byte spans (as produced by EraseType) may be restored.
  static_assert(std::is_same_v<std::remove_const_t<U>, std::int8_t>);
  auto n_total_bytes = data.size_bytes();
  auto restored = common::Span{reinterpret_cast<T*>(data.data()), n_total_bytes / sizeof(T)};
  return restored;
}
} // namespace xgboost::collective

View File

@@ -57,9 +57,7 @@ namespace collective {
* - federated_client_key: Client key file path. Only needed for the SSL mode.
* - federated_client_cert: Client certificate file path. Only needed for the SSL mode.
*/
inline void Init(Json const& config) {
Communicator::Init(config);
}
inline void Init(Json const &config) { Communicator::Init(config); }
/*!
* \brief Finalize the collective communicator.
@@ -141,17 +139,89 @@ inline void Broadcast(std::string *sendrecv_data, int root) {
}
}
/**
* @brief Gathers a single value all processes and distributes the result to all processes.
*
* @param input The single value.
*/
template <typename T>
inline std::vector<T> Allgather(T const &input) {
std::string_view str_input{reinterpret_cast<char const *>(&input), sizeof(T)};
auto const output = Communicator::Get()->AllGather(str_input);
CHECK_EQ(output.size() % sizeof(T), 0);
std::vector<T> result(output.size() / sizeof(T));
std::memcpy(reinterpret_cast<void *>(result.data()), output.data(), output.size());
return result;
}
/**
* @brief Gathers data from all processes and distributes it to all processes.
*
* This assumes all ranks have the same size, and input data has been sliced into the
* corresponding position.
* This assumes all ranks have the same size.
*
* @param send_receive_buffer Buffer storing the data.
* @param size Size of the data in bytes.
* @param input Buffer storing the data.
*/
inline void Allgather(void *send_receive_buffer, std::size_t size) {
Communicator::Get()->AllGather(send_receive_buffer, size);
template <typename T>
inline std::vector<T> Allgather(std::vector<T> const &input) {
if (input.empty()) {
return input;
}
std::string_view str_input{reinterpret_cast<char const *>(input.data()),
input.size() * sizeof(T)};
auto const output = Communicator::Get()->AllGather(str_input);
CHECK_EQ(output.size() % sizeof(T), 0);
std::vector<T> result(output.size() / sizeof(T));
std::memcpy(reinterpret_cast<void *>(result.data()), output.data(), output.size());
return result;
}
/**
* @brief Gathers variable-length data from all processes and distributes it to all processes.
* @param input Buffer storing the data.
*/
template <typename T>
inline std::vector<T> AllgatherV(std::vector<T> const &input) {
std::string_view str_input{reinterpret_cast<char const *>(input.data()),
input.size() * sizeof(T)};
auto const output = Communicator::Get()->AllGatherV(str_input);
CHECK_EQ(output.size() % sizeof(T), 0);
std::vector<T> result(output.size() / sizeof(T));
if (!output.empty()) {
std::memcpy(reinterpret_cast<void *>(result.data()), output.data(), output.size());
}
return result;
}
/**
* @brief Gathers variable-length strings from all processes and distributes them to all processes.
* @param input Variable-length list of variable-length strings.
*/
inline std::vector<std::string> AllgatherStrings(std::vector<std::string> const &input) {
std::size_t total_size{0};
for (auto const &s : input) {
total_size += s.length() + 1; // +1 for null-terminators
}
std::string flat_string;
flat_string.reserve(total_size);
for (auto const &s : input) {
flat_string.append(s);
flat_string.push_back('\0'); // Append a null-terminator after each string
}
auto const output = Communicator::Get()->AllGatherV(flat_string);
std::vector<std::string> result;
std::size_t start_index = 0;
// Iterate through the output, find each null-terminated substring.
for (std::size_t i = 0; i < output.size(); i++) {
if (output[i] == '\0') {
// Construct a std::string from the char* substring
result.emplace_back(&output[start_index]);
// Move to the next substring
start_index = i + 1;
}
}
return result;
}
/*!
@@ -226,7 +296,7 @@ inline void Allreduce(double *send_receive_buffer, size_t count) {
}
template <typename T>
struct AllgatherVResult {
struct SpecialAllgatherVResult {
std::vector<std::size_t> offsets;
std::vector<std::size_t> sizes;
std::vector<T> result;
@@ -241,14 +311,10 @@ struct AllgatherVResult {
* @param sizes Sizes of each input.
*/
template <typename T>
inline AllgatherVResult<T> AllgatherV(std::vector<T> const &inputs,
std::vector<std::size_t> const &sizes) {
auto num_inputs = sizes.size();
inline SpecialAllgatherVResult<T> SpecialAllgatherV(std::vector<T> const &inputs,
std::vector<std::size_t> const &sizes) {
// Gather the sizes across all workers.
std::vector<std::size_t> all_sizes(num_inputs * GetWorldSize());
std::copy_n(sizes.cbegin(), sizes.size(), all_sizes.begin() + num_inputs * GetRank());
collective::Allgather(all_sizes.data(), all_sizes.size() * sizeof(std::size_t));
auto const all_sizes = Allgather(sizes);
// Calculate input offsets (std::exclusive_scan).
std::vector<std::size_t> offsets(all_sizes.size());
@@ -257,11 +323,7 @@ inline AllgatherVResult<T> AllgatherV(std::vector<T> const &inputs,
}
// Gather all the inputs.
auto total_input_size = offsets.back() + all_sizes.back();
std::vector<T> all_inputs(total_input_size);
std::copy_n(inputs.cbegin(), inputs.size(), all_inputs.begin() + offsets[num_inputs * GetRank()]);
// We cannot use allgather here, since each worker might have a different size.
Allreduce<Operation::kMax>(all_inputs.data(), all_inputs.size());
auto const all_inputs = AllgatherV(inputs);
return {offsets, all_sizes, all_inputs};
}

View File

@@ -11,9 +11,7 @@
#include "../../plugin/federated/federated_communicator.h"
#endif
namespace xgboost {
namespace collective {
namespace xgboost::collective {
thread_local std::unique_ptr<Communicator> Communicator::communicator_{new NoOpCommunicator()};
thread_local CommunicatorType Communicator::type_{};
@@ -57,6 +55,4 @@ void Communicator::Finalize() {
communicator_.reset(new NoOpCommunicator());
}
#endif
} // namespace collective
} // namespace xgboost
} // namespace xgboost::collective

View File

@@ -125,13 +125,17 @@ class Communicator {
/**
* @brief Gathers data from all processes and distributes it to all processes.
*
* This assumes all ranks have the same size, and input data has been sliced into the
* corresponding position.
* This assumes all ranks have the same size.
*
* @param send_receive_buffer Buffer storing the data.
* @param size Size of the data in bytes.
* @param input Buffer storing the data.
*/
virtual void AllGather(void *send_receive_buffer, std::size_t size) = 0;
virtual std::string AllGather(std::string_view input) = 0;
/**
* @brief Gathers variable-length data from all processes and distributes it to all processes.
* @param input Buffer storing the data.
*/
virtual std::string AllGatherV(std::string_view input) = 0;
/**
* @brief Combines values from all processes and distributes the result back to all processes.

View File

@@ -40,12 +40,10 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
}
dh::safe_cuda(cudaSetDevice(device_ordinal_));
host_buffer_.resize(send_size * world_size_);
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
cudaMemcpyDefault));
Allgather(host_buffer_.data(), host_buffer_.size());
dh::safe_cuda(
cudaMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), cudaMemcpyDefault));
host_buffer_.resize(send_size);
dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_buffer, send_size, cudaMemcpyDefault));
auto const output = Allgather(host_buffer_);
dh::safe_cuda(cudaMemcpy(receive_buffer, output.data(), output.size(), cudaMemcpyDefault));
}
void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,

View File

@@ -60,11 +60,16 @@ class InMemoryCommunicator : public Communicator {
bool IsDistributed() const override { return true; }
bool IsFederated() const override { return false; }
void AllGather(void* in_out, std::size_t size) override {
std::string AllGather(std::string_view input) override {
std::string output;
handler_.Allgather(static_cast<const char*>(in_out), size, &output, sequence_number_++,
GetRank());
output.copy(static_cast<char*>(in_out), size);
handler_.Allgather(input.data(), input.size(), &output, sequence_number_++, GetRank());
return output;
}
std::string AllGatherV(std::string_view input) override {
std::string output;
handler_.AllgatherV(input.data(), input.size(), &output, sequence_number_++, GetRank());
return output;
}
void AllReduce(void* in_out, std::size_t size, DataType data_type, Operation operation) override {

View File

@@ -16,23 +16,49 @@ class AllgatherFunctor {
public:
std::string const name{"Allgather"};
AllgatherFunctor(int world_size, int rank) : world_size_{world_size}, rank_{rank} {}
AllgatherFunctor(std::size_t world_size, std::size_t rank)
: world_size_{world_size}, rank_{rank} {}
void operator()(char const* input, std::size_t bytes, std::string* buffer) const {
if (buffer->empty()) {
// Copy the input if this is the first request.
buffer->assign(input, bytes);
} else {
// Splice the input into the common buffer.
auto const per_rank = bytes / world_size_;
auto const index = rank_ * per_rank;
buffer->replace(index, per_rank, input + index, per_rank);
// Resize the buffer if this is the first request.
buffer->resize(bytes * world_size_);
}
// Splice the input into the common buffer.
buffer->replace(rank_ * bytes, bytes, input, bytes);
}
private:
std::size_t world_size_;
std::size_t rank_;
};
/**
* @brief Functor for variable-length allgather.
*/
class AllgatherVFunctor {
public:
std::string const name{"AllgatherV"};
AllgatherVFunctor(std::size_t world_size, std::size_t rank,
std::map<std::size_t, std::string_view>* data)
: world_size_{world_size}, rank_{rank}, data_{data} {}
void operator()(char const* input, std::size_t bytes, std::string* buffer) const {
data_->emplace(rank_, std::string_view{input, bytes});
if (data_->size() == world_size_) {
for (auto const& kv : *data_) {
buffer->append(kv.second);
}
data_->clear();
}
}
private:
int world_size_;
int rank_;
std::size_t world_size_;
std::size_t rank_;
std::map<std::size_t, std::string_view>* data_;
};
/**
@@ -154,7 +180,7 @@ class BroadcastFunctor {
public:
std::string const name{"Broadcast"};
BroadcastFunctor(int rank, int root) : rank_{rank}, root_{root} {}
BroadcastFunctor(std::size_t rank, std::size_t root) : rank_{rank}, root_{root} {}
void operator()(char const* input, std::size_t bytes, std::string* buffer) const {
if (rank_ == root_) {
@@ -164,11 +190,11 @@ class BroadcastFunctor {
}
private:
int rank_;
int root_;
std::size_t rank_;
std::size_t root_;
};
void InMemoryHandler::Init(int world_size, int) {
void InMemoryHandler::Init(std::size_t world_size, std::size_t) {
CHECK(world_size_ < world_size) << "In memory handler already initialized.";
std::unique_lock<std::mutex> lock(mutex_);
@@ -178,7 +204,7 @@ void InMemoryHandler::Init(int world_size, int) {
cv_.notify_all();
}
void InMemoryHandler::Shutdown(uint64_t sequence_number, int) {
void InMemoryHandler::Shutdown(uint64_t sequence_number, std::size_t) {
CHECK(world_size_ > 0) << "In memory handler already shutdown.";
std::unique_lock<std::mutex> lock(mutex_);
@@ -194,24 +220,30 @@ void InMemoryHandler::Shutdown(uint64_t sequence_number, int) {
}
void InMemoryHandler::Allgather(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, int rank) {
std::size_t sequence_number, std::size_t rank) {
Handle(input, bytes, output, sequence_number, rank, AllgatherFunctor{world_size_, rank});
}
void InMemoryHandler::AllgatherV(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, std::size_t rank) {
Handle(input, bytes, output, sequence_number, rank, AllgatherVFunctor{world_size_, rank, &aux_});
}
void InMemoryHandler::Allreduce(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, int rank, DataType data_type,
std::size_t sequence_number, std::size_t rank, DataType data_type,
Operation op) {
Handle(input, bytes, output, sequence_number, rank, AllreduceFunctor{data_type, op});
}
void InMemoryHandler::Broadcast(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, int rank, int root) {
std::size_t sequence_number, std::size_t rank, std::size_t root) {
Handle(input, bytes, output, sequence_number, rank, BroadcastFunctor{rank, root});
}
template <class HandlerFunctor>
void InMemoryHandler::Handle(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, int rank, HandlerFunctor const& functor) {
std::size_t sequence_number, std::size_t rank,
HandlerFunctor const& functor) {
// Pass through if there is only 1 client.
if (world_size_ == 1) {
if (input != output->data()) {

View File

@@ -3,6 +3,7 @@
*/
#pragma once
#include <condition_variable>
#include <map>
#include <string>
#include "communicator.h"
@@ -31,7 +32,7 @@ class InMemoryHandler {
*
* This is used when the handler only needs to be initialized once with a known world size.
*/
explicit InMemoryHandler(int worldSize) : world_size_{worldSize} {}
explicit InMemoryHandler(std::size_t worldSize) : world_size_{worldSize} {}
/**
* @brief Initialize the handler with the world size and rank.
@@ -41,7 +42,7 @@ class InMemoryHandler {
* This is used when multiple objects/threads are accessing the same handler and need to
* initialize it collectively.
*/
void Init(int world_size, int rank);
void Init(std::size_t world_size, std::size_t rank);
/**
* @brief Shut down the handler.
@@ -51,7 +52,7 @@ class InMemoryHandler {
* This is used when multiple objects/threads are accessing the same handler and need to
* shut it down collectively.
*/
void Shutdown(uint64_t sequence_number, int rank);
void Shutdown(uint64_t sequence_number, std::size_t rank);
/**
* @brief Perform allgather.
@@ -62,7 +63,18 @@ class InMemoryHandler {
* @param rank Index of the worker.
*/
void Allgather(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, int rank);
std::size_t sequence_number, std::size_t rank);
/**
* @brief Perform variable-length allgather.
* @param input The input buffer.
* @param bytes Number of bytes in the input buffer.
* @param output The output buffer.
* @param sequence_number Call sequence number.
* @param rank Index of the worker.
*/
void AllgatherV(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, std::size_t rank);
/**
* @brief Perform allreduce.
@@ -75,7 +87,7 @@ class InMemoryHandler {
* @param op The reduce operation.
*/
void Allreduce(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, int rank, DataType data_type, Operation op);
std::size_t sequence_number, std::size_t rank, DataType data_type, Operation op);
/**
* @brief Perform broadcast.
@@ -87,7 +99,7 @@ class InMemoryHandler {
* @param root Index of the worker to broadcast from.
*/
void Broadcast(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, int rank, int root);
std::size_t sequence_number, std::size_t rank, std::size_t root);
private:
/**
@@ -102,15 +114,16 @@ class InMemoryHandler {
*/
template <class HandlerFunctor>
void Handle(char const* input, std::size_t size, std::string* output, std::size_t sequence_number,
int rank, HandlerFunctor const& functor);
std::size_t rank, HandlerFunctor const& functor);
int world_size_{}; /// Number of workers.
int received_{}; /// Number of calls received with the current sequence.
int sent_{}; /// Number of calls completed with the current sequence.
std::string buffer_{}; /// A shared common buffer.
uint64_t sequence_number_{}; /// Call sequence number.
mutable std::mutex mutex_; /// Lock.
mutable std::condition_variable cv_; /// Conditional variable to wait on.
std::size_t world_size_{}; /// Number of workers.
std::size_t received_{}; /// Number of calls received with the current sequence.
std::size_t sent_{}; /// Number of calls completed with the current sequence.
std::string buffer_{}; /// A shared common buffer.
std::map<std::size_t, std::string_view> aux_{}; /// A shared auxiliary map.
uint64_t sequence_number_{}; /// Call sequence number.
mutable std::mutex mutex_; /// Lock.
mutable std::condition_variable cv_; /// Conditional variable to wait on.
};
} // namespace collective

167
src/collective/loop.cc Normal file
View File

@@ -0,0 +1,167 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include "loop.h"
#include <queue> // for queue
#include "rabit/internal/socket.h" // for PollHelper
#include "xgboost/collective/socket.h" // for FailWithCode
#include "xgboost/logging.h" // for CHECK
namespace xgboost::collective {
/**
 * @brief Drain the operation queue: poll all pending sockets, perform the
 *        ready reads/writes, and requeue partially-completed operations.
 *
 * Fix: previously, when a non-blocking Recv/Send returned -1 with
 * EAGAIN/EWOULDBLOCK, the -1 was added to op.off, moving the transfer cursor
 * backwards (and wrapping the unsigned offset when off == 0).  Such a result
 * now counts as zero progress.
 *
 * @return Success once the queue is empty, or the first failure encountered.
 */
Result Loop::EmptyQueue() {
  timer_.Start(__func__);
  // Any failure must set stop_ so the worker thread exits its loop.
  auto error = [this] {
    this->stop_ = true;
    timer_.Stop(__func__);
  };

  while (!queue_.empty() && !stop_) {
    std::queue<Op> qcopy;
    rabit::utils::PollHelper poll;
    // Register every pending op with the poll helper.
    while (!queue_.empty()) {
      auto op = queue_.front();
      queue_.pop();
      switch (op.code) {
        case Op::kRead: {
          poll.WatchRead(*op.sock);
          break;
        }
        case Op::kWrite: {
          poll.WatchWrite(*op.sock);
          break;
        }
        default: {
          error();
          return Fail("Invalid socket operation.");
        }
      }
      qcopy.push(op);
    }

    // poll, work on fds that are ready.
    timer_.Start("poll");
    auto rc = poll.Poll(timeout_);
    timer_.Stop("poll");
    if (!rc.OK()) {
      error();
      return rc;
    }
    // We wouldn't be here if the queue were empty.
    CHECK(!qcopy.empty());

    while (!qcopy.empty() && !stop_) {
      auto op = qcopy.front();
      qcopy.pop();
      std::int32_t n_bytes_done{0};
      CHECK(op.sock->NonBlocking());
      switch (op.code) {
        case Op::kRead: {
          if (poll.CheckRead(*op.sock)) {
            n_bytes_done = op.sock->Recv(op.ptr + op.off, op.n - op.off);
          }
          break;
        }
        case Op::kWrite: {
          if (poll.CheckWrite(*op.sock)) {
            n_bytes_done = op.sock->Send(op.ptr + op.off, op.n - op.off);
          }
          break;
        }
        default: {
          error();
          return Fail("Invalid socket operation.");
        }
      }

      if (n_bytes_done == -1) {
        if (!system::LastErrorWouldBlock()) {
          stop_ = true;
          auto rc = system::FailWithCode("Invalid socket output.");
          error();
          return rc;
        }
        // EAGAIN/EWOULDBLOCK: no progress this round.  Clamp to zero so the
        // offset arithmetic below cannot move the cursor backwards.
        n_bytes_done = 0;
      }
      op.off += n_bytes_done;
      CHECK_LE(op.off, op.n);
      if (op.off != op.n) {
        // Not yet finished; push back to the queue for the next round.
        queue_.push(op);
      }
    }
  }
  timer_.Stop(__func__);
  return Success();
}
// Consumer loop run on the worker thread: wake whenever the producer pushes
// work or requests a stop, drain the queue, then notify any waiter in Block().
void Loop::Process() {
  // consumer
  while (true) {
    std::unique_lock lock{mu_};
    cv_.wait(lock, [this] { return !this->queue_.empty() || stop_; });
    if (stop_) {
      break;
    }
    // NOTE(review): try_lock on a std::mutex already held by this thread is
    // undefined behaviour per the standard; this assertion is best-effort.
    CHECK(!mu_.try_lock());
    // EmptyQueue runs with the lock held, so Submit() blocks meanwhile.
    this->rc_ = this->EmptyQueue();
    if (!rc_.OK()) {
      stop_ = true;
      cv_.notify_one();
      break;
    }
    CHECK(queue_.empty());
    CHECK(!mu_.try_lock());
    // Wake a waiter in Block(); the queue is now empty.
    cv_.notify_one();
  }

  if (rc_.OK()) {
    CHECK(queue_.empty());
  }
}
// Request the worker thread to exit, wait for pending work to drain, and
// re-raise any exception captured on the worker thread.
Result Loop::Stop() {
  std::unique_lock lock{mu_};
  stop_ = true;
  lock.unlock();

  // Block() notifies the worker and waits; its OK-ness must match rc_.
  CHECK_EQ(this->Block().OK(), this->rc_.OK());

  if (curr_exce_) {
    // Propagate the worker-side exception to the caller's thread.
    std::rethrow_exception(curr_exce_);
  }

  return Success();
}
/**
 * @brief Start the event loop's worker thread.
 *
 * Exceptions thrown on the worker are captured (first one wins) and re-thrown
 * on the caller's thread by Stop().
 *
 * Fix: the two catch handlers duplicated the same body, and the caught
 * std::exception's message was discarded.  The capture logic is factored into
 * one helper and the message is now included in the stored Result.
 *
 * @param timeout Timeout applied to each socket poll.
 */
Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} {
  timer_.Init(__func__);
  worker_ = std::thread{[this] {
    // Record the in-flight exception (only the first) and wake any waiter.
    auto capture = [this](std::string msg) {
      std::lock_guard<std::mutex> guard{mu_};
      if (!curr_exce_) {
        curr_exce_ = std::current_exception();
        rc_ = Fail(std::move(msg));
      }
      stop_ = true;
      cv_.notify_all();
    };
    try {
      this->Process();
    } catch (std::exception const& e) {
      capture(std::string{"Exception was thrown: "} + e.what());
    } catch (...) {
      capture("Exception was thrown");
    }
  }};
}
} // namespace xgboost::collective

83
src/collective/loop.h Normal file
View File

@@ -0,0 +1,83 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <chrono> // for seconds
#include <condition_variable> // for condition_variable
#include <cstddef> // for size_t
#include <cstdint> // for int8_t, int32_t
#include <exception> // for exception_ptr
#include <mutex> // for unique_lock, mutex
#include <queue> // for queue
#include <thread> // for thread
#include <utility> // for move
#include "../common/timer.h" // for Monitor
#include "xgboost/collective/result.h" // for Result
#include "xgboost/collective/socket.h" // for TCPSocket
namespace xgboost::collective {
/**
 * @brief Single-threaded event loop that executes queued socket reads/writes
 *        asynchronously on a dedicated worker thread.
 */
class Loop {
 public:
  /**
   * @brief One queued socket operation (a read or a write of n bytes).
   */
  struct Op {
    enum Code : std::int8_t { kRead = 0, kWrite = 1 } code;
    std::int32_t rank{-1};      // Peer rank this op communicates with.
    std::int8_t* ptr{nullptr};  // Buffer; mutable even for writes (see Channel::SendAll).
    std::size_t n{0};           // Total number of bytes to transfer.
    TCPSocket* sock{nullptr};   // Non-owning socket pointer.
    std::size_t off{0};         // Bytes transferred so far.

    Op(Code c, std::int32_t rank, std::int8_t* ptr, std::size_t n, TCPSocket* sock, std::size_t off)
        : code{c}, rank{rank}, ptr{ptr}, n{n}, sock{sock}, off{off} {}
    Op(Op const&) = default;
    Op& operator=(Op const&) = default;
    Op(Op&&) = default;
    Op& operator=(Op&&) = default;
  };

 private:
  std::thread worker_;  // Consumer thread running Process().
  std::condition_variable cv_;
  std::mutex mu_;
  std::queue<Op> queue_;  // Pending operations, produced by Submit().
  std::chrono::seconds timeout_;  // Per-poll timeout.
  Result rc_;             // Status of the most recent queue drain.
  bool stop_{false};      // Set to request worker exit or on failure.
  std::exception_ptr curr_exce_{nullptr};  // First exception thrown on the worker.
  common::Monitor timer_;

  Result EmptyQueue();
  void Process();

 public:
  // Stop the worker and rethrow any captured worker-side exception.
  Result Stop();

  // Producer side: enqueue an op and wake the worker thread.
  void Submit(Op op) {
    // producer
    std::unique_lock lock{mu_};
    queue_.push(op);
    lock.unlock();
    cv_.notify_one();
  }

  // Wait until the queue is drained (or the loop stopped) and return status.
  // NOTE(review): rc_ is moved out here, so a second call after a failure
  // observes a default Result — confirm callers rely on first-call semantics.
  [[nodiscard]] Result Block() {
    {
      std::unique_lock lock{mu_};
      cv_.notify_all();
    }
    std::unique_lock lock{mu_};
    cv_.wait(lock, [this] { return this->queue_.empty() || stop_; });
    return std::move(rc_);
  }

  explicit Loop(std::chrono::seconds timeout);

  // Stop() may rethrow a worker exception; hence noexcept(false).
  ~Loop() noexcept(false) {
    this->Stop();

    if (worker_.joinable()) {
      worker_.join();
    }
  }
};
} // namespace xgboost::collective

View File

@@ -17,10 +17,11 @@ class NoOpCommunicator : public Communicator {
NoOpCommunicator() : Communicator(1, 0) {}
bool IsDistributed() const override { return false; }
bool IsFederated() const override { return false; }
void AllGather(void *, std::size_t) override {}
std::string AllGather(std::string_view) override { return {}; }
std::string AllGatherV(std::string_view) override { return {}; }
void AllReduce(void *, std::size_t, DataType, Operation) override {}
void Broadcast(void *, std::size_t, int) override {}
std::string GetProcessorName() override { return ""; }
std::string GetProcessorName() override { return {}; }
void Print(const std::string &message) override { LOG(CONSOLE) << message; }
protected:

214
src/collective/protocol.h Normal file
View File

@@ -0,0 +1,214 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <cstdint> // for int32_t
#include <string> // for string
#include <utility> // for move
#include "xgboost/collective/result.h" // for Result
#include "xgboost/collective/socket.h" // for TCPSocket
#include "xgboost/json.h" // for Json
namespace xgboost::collective::proto {
/**
 * @brief Host/port/rank triplet identifying a worker or the tracker.
 */
struct PeerInfo {
  std::string host;
  std::int32_t port{-1};  // -1 marks an uninitialized peer.
  std::int32_t rank{-1};

  PeerInfo() = default;
  PeerInfo(std::string host, std::int32_t port, std::int32_t rank)
      : host{std::move(host)}, port{port}, rank{rank} {}
  // Deserialize from a JSON object with "host", "port" and "rank" keys.
  explicit PeerInfo(Json const& peer)
      : host{get<String>(peer["host"])},
        port{static_cast<std::int32_t>(get<Integer const>(peer["port"]))},
        rank{static_cast<std::int32_t>(get<Integer const>(peer["rank"]))} {}
  // Serialize to the same JSON shape accepted by the Json constructor above.
  [[nodiscard]] Json ToJson() const {
    Json info{Object{}};
    info["rank"] = rank;
    info["host"] = String{host};
    info["port"] = Integer{port};
    return info;
  }
  // "host:port" string, e.g. for logging.
  [[nodiscard]] auto HostPort() const { return host + ":" + std::to_string(this->port); }
};
/**
 * @brief Connection handshake: exchange a fixed magic number with the peer to
 *        validate that both ends speak this protocol.
 */
struct Magic {
  static constexpr std::int32_t kMagic = 0xff99;

  // Send our magic number, then expect the same value back from the peer.
  [[nodiscard]] Result Verify(xgboost::collective::TCPSocket* p_sock) {
    std::int32_t magic{kMagic};
    auto n_bytes = p_sock->SendAll(&magic, sizeof(magic));
    if (n_bytes != sizeof(magic)) {
      return Fail("Failed to verify.");
    }
    // Reset before receiving so a short read cannot pass the check below.
    magic = 0;
    n_bytes = p_sock->RecvAll(&magic, sizeof(magic));
    if (n_bytes != sizeof(magic)) {
      return Fail("Failed to verify.");
    }

    if (magic != kMagic) {
      return xgboost::collective::Fail("Invalid verification number.");
    }
    return Success();
  }
};
// Commands a worker can send to the tracker.
enum class CMD : std::int32_t {
  kInvalid = 0,
  kStart = 1,     // Report listening ports; tracker replies with world size.
  kShutdown = 2,  // Worker is leaving.
  kError = 3,     // Worker hit an error; payload carries message and code.
  kPrint = 4,     // Forward a log message for centralized printing.
};
/**
 * @brief Initial connect message exchanged between a worker and the tracker.
 */
struct Connect {
  // Worker side: announce world size, rank and task id to the tracker.
  [[nodiscard]] Result WorkerSend(TCPSocket* tracker, std::int32_t world, std::int32_t rank,
                                  std::string task_id) const {
    Json jinit{Object{}};
    jinit["world_size"] = Integer{world};
    jinit["rank"] = Integer{rank};
    jinit["task_id"] = String{task_id};
    std::string msg;
    Json::Dump(jinit, &msg);
    auto n_bytes = tracker->Send(msg);
    if (n_bytes != msg.size()) {
      return Fail("Failed to send init command from worker.");
    }
    return Success();
  }
  // Tracker side: parse the worker's announcement.
  // NOTE(review): unlike the send path, the Recv result is not checked here;
  // a failed read would surface only as a JSON parse failure — confirm.
  [[nodiscard]] Result TrackerRecv(TCPSocket* sock, std::int32_t* world, std::int32_t* rank,
                                   std::string* task_id) const {
    std::string init;
    sock->Recv(&init);
    auto jinit = Json::Load(StringView{init});
    *world = get<Integer const>(jinit["world_size"]);
    *rank = get<Integer const>(jinit["rank"]);
    *task_id = get<String const>(jinit["task_id"]);
    return Success();
  }
};
/**
 * @brief Start command: the worker reports its listening ports, and the
 *        tracker acknowledges with the world size.
 */
class Start {
 private:
  // Tracker -> worker: reply with the world size.
  [[nodiscard]] Result TrackerSend(std::int32_t world, TCPSocket* worker) const {
    Json jcmd{Object{}};
    jcmd["world_size"] = Integer{world};
    auto scmd = Json::Dump(jcmd);
    auto n_bytes = worker->Send(scmd);
    if (n_bytes != scmd.size()) {
      return Fail("Failed to send init command from tracker.");
    }
    return Success();
  }

 public:
  // Worker -> tracker: report the data port and the error port.
  [[nodiscard]] Result WorkerSend(std::int32_t lport, TCPSocket* tracker,
                                  std::int32_t eport) const {
    Json jcmd{Object{}};
    jcmd["cmd"] = Integer{static_cast<std::int32_t>(CMD::kStart)};
    jcmd["port"] = Integer{lport};
    jcmd["error_port"] = Integer{eport};
    auto scmd = Json::Dump(jcmd);
    auto n_bytes = tracker->Send(scmd);
    if (n_bytes != scmd.size()) {
      return Fail("Failed to send init command from worker.");
    }
    return Success();
  }
  // Worker side: receive and validate the world size from the tracker.
  [[nodiscard]] Result WorkerRecv(TCPSocket* tracker, std::int32_t* p_world) const {
    std::string scmd;
    auto n_bytes = tracker->Recv(&scmd);
    if (n_bytes <= 0) {
      return Fail("Failed to recv init command from tracker.");
    }
    auto jcmd = Json::Load(scmd);
    auto world = get<Integer const>(jcmd["world_size"]);
    if (world <= 0) {
      return Fail("Invalid world size.");
    }
    *p_world = world;
    return Success();
  }
  // Tracker side: record the worker's ports and acknowledge with world size.
  // *recv_world must still be -1 (i.e. not yet initialized for this worker).
  [[nodiscard]] Result TrackerHandle(Json jcmd, std::int32_t* recv_world, std::int32_t world,
                                     std::int32_t* p_port, TCPSocket* p_sock,
                                     std::int32_t* eport) const {
    *p_port = get<Integer const>(jcmd["port"]);
    if (*p_port <= 0) {
      return Fail("Invalid port.");
    }
    if (*recv_world != -1) {
      return Fail("Invalid initialization sequence.");
    }
    *recv_world = world;

    *eport = get<Integer const>(jcmd["error_port"]);
    return TrackerSend(world, p_sock);
  }
};
/**
 * @brief Print command: forward a log message to the tracker for display.
 */
struct Print {
  // Worker -> tracker: wrap the message in a kPrint command.
  [[nodiscard]] Result WorkerSend(TCPSocket* tracker, std::string msg) const {
    Json jcmd{Object{}};
    jcmd["cmd"] = Integer{static_cast<std::int32_t>(CMD::kPrint)};
    jcmd["msg"] = String{std::move(msg)};
    auto scmd = Json::Dump(jcmd);
    auto n_bytes = tracker->Send(scmd);
    if (n_bytes != scmd.size()) {
      return Fail("Failed to send print command from worker.");
    }
    return Success();
  }
  // Tracker side: extract the message from a received kPrint command.
  [[nodiscard]] Result TrackerHandle(Json jcmd, std::string* p_msg) const {
    if (!IsA<String>(jcmd["msg"])) {
      return Fail("Invalid print command.");
    }
    auto msg = get<String const>(jcmd["msg"]);
    *p_msg = msg;
    return Success();
  }
};
/**
 * @brief Error command: report a worker-side failure (message + errno-style
 *        code) to the tracker.
 */
struct ErrorCMD {
  // Worker -> tracker: serialize the Result's report and error code.
  [[nodiscard]] Result WorkerSend(TCPSocket* tracker, Result const& res) const {
    auto msg = res.Report();
    auto code = res.Code().value();
    Json jcmd{Object{}};
    jcmd["msg"] = String{std::move(msg)};
    jcmd["code"] = Integer{code};
    jcmd["cmd"] = Integer{static_cast<std::int32_t>(CMD::kError)};
    auto scmd = Json::Dump(jcmd);
    auto n_bytes = tracker->Send(scmd);
    if (n_bytes != scmd.size()) {
      return Fail("Failed to send error command from worker.");
    }
    return Success();
  }
  // Tracker side: extract the message and code from a received kError command.
  [[nodiscard]] Result TrackerHandle(Json jcmd, std::string* p_msg, int* p_code) const {
    if (!IsA<String>(jcmd["msg"]) || !IsA<Integer>(jcmd["code"])) {
      return Fail("Invalid error command.");
    }
    auto msg = get<String const>(jcmd["msg"]);
    auto code = get<Integer const>(jcmd["code"]);
    *p_msg = msg;
    *p_code = code;
    return Success();
  }
};
// Protocol handler for the `shutdown` command, sent to tell a peer to stop.
struct ShutdownCMD {
  [[nodiscard]] Result Send(TCPSocket* peer) const {
    Json request{Object{}};
    request["cmd"] = Integer{static_cast<std::int32_t>(proto::CMD::kShutdown)};
    auto const payload = Json::Dump(request);
    if (peer->Send(payload) != payload.size()) {
      return Fail("Failed to send shutdown command from worker.");
    }
    return Success();
  }
};
} // namespace xgboost::collective::proto

View File

@@ -7,6 +7,7 @@
#include <string>
#include <vector>
#include "communicator-inl.h"
#include "communicator.h"
#include "xgboost/json.h"
@@ -55,10 +56,29 @@ class RabitCommunicator : public Communicator {
bool IsFederated() const override { return false; }
void AllGather(void *send_receive_buffer, std::size_t size) override {
auto const per_rank = size / GetWorldSize();
std::string AllGather(std::string_view input) override {
auto const per_rank = input.size();
auto const total_size = per_rank * GetWorldSize();
auto const index = per_rank * GetRank();
rabit::Allgather(static_cast<char *>(send_receive_buffer), size, index, per_rank, per_rank);
std::string result(total_size, '\0');
result.replace(index, per_rank, input);
rabit::Allgather(result.data(), total_size, index, per_rank, per_rank);
return result;
}
std::string AllGatherV(std::string_view input) override {
auto const size_node_slice = input.size();
auto const all_sizes = collective::Allgather(size_node_slice);
auto const total_size = std::accumulate(all_sizes.cbegin(), all_sizes.cend(), 0ul);
auto const begin_index =
std::accumulate(all_sizes.cbegin(), all_sizes.cbegin() + GetRank(), 0ul);
auto const size_prev_slice =
GetRank() == 0 ? all_sizes[GetWorldSize() - 1] : all_sizes[GetRank() - 1];
std::string result(total_size, '\0');
result.replace(begin_index, size_node_slice, input);
rabit::Allgather(result.data(), total_size, begin_index, size_node_slice, size_prev_slice);
return result;
}
void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,

View File

@@ -3,6 +3,7 @@
*/
#include "xgboost/collective/socket.h"
#include <array> // for array
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <cstring> // std::memcpy, std::memset
@@ -92,13 +93,18 @@ std::size_t TCPSocket::Recv(std::string *p_str) {
conn = TCPSocket::Create(addr.Domain());
CHECK_EQ(static_cast<std::int32_t>(conn.Domain()), static_cast<std::int32_t>(addr.Domain()));
conn.SetNonBlock(true);
auto non_blocking = conn.NonBlocking();
auto rc = conn.NonBlocking(true);
if (!rc.OK()) {
return Fail("Failed to set socket option.", std::move(rc));
}
Result last_error;
auto log_failure = [&host, &last_error](Result err, char const *file, std::int32_t line) {
auto log_failure = [&host, &last_error, port](Result err, char const *file, std::int32_t line) {
last_error = std::move(err);
LOG(WARNING) << std::filesystem::path{file}.filename().string() << "(" << line
<< "): Failed to connect to:" << host << " Error:" << last_error.Report();
<< "): Failed to connect to:" << host << ":" << port
<< " Error:" << last_error.Report();
};
for (std::int32_t attempt = 0; attempt < std::max(retry, 1); ++attempt) {
@@ -112,39 +118,42 @@ std::size_t TCPSocket::Recv(std::string *p_str) {
}
auto rc = connect(conn.Handle(), addr_handle, addr_len);
if (rc != 0) {
auto errcode = system::LastError();
if (!system::ErrorWouldBlock(errcode)) {
log_failure(Fail("connect failed.", std::error_code{errcode, std::system_category()}),
__FILE__, __LINE__);
continue;
}
rabit::utils::PollHelper poll;
poll.WatchWrite(conn);
auto result = poll.Poll(timeout);
if (!result.OK()) {
log_failure(std::move(result), __FILE__, __LINE__);
continue;
}
if (!poll.CheckWrite(conn)) {
log_failure(Fail("poll failed.", std::error_code{errcode, std::system_category()}),
__FILE__, __LINE__);
continue;
}
result = conn.GetSockError();
if (!result.OK()) {
log_failure(std::move(result), __FILE__, __LINE__);
continue;
}
conn.SetNonBlock(false);
return Success();
} else {
conn.SetNonBlock(false);
return Success();
if (rc == 0) {
return conn.NonBlocking(non_blocking);
}
auto errcode = system::LastError();
if (!system::ErrorWouldBlock(errcode)) {
log_failure(Fail("connect failed.", std::error_code{errcode, std::system_category()}),
__FILE__, __LINE__);
continue;
}
rabit::utils::PollHelper poll;
poll.WatchWrite(conn);
auto result = poll.Poll(timeout);
if (!result.OK()) {
// poll would fail if there's a socket error, we log the root cause instead of the
// poll failure.
auto sockerr = conn.GetSockError();
if (!sockerr.OK()) {
result = std::move(sockerr);
}
log_failure(std::move(result), __FILE__, __LINE__);
continue;
}
if (!poll.CheckWrite(conn)) {
log_failure(Fail("poll failed.", std::error_code{errcode, std::system_category()}), __FILE__,
__LINE__);
continue;
}
result = conn.GetSockError();
if (!result.OK()) {
log_failure(std::move(result), __FILE__, __LINE__);
continue;
}
return conn.NonBlocking(non_blocking);
}
std::stringstream ss;
@@ -152,4 +161,13 @@ std::size_t TCPSocket::Recv(std::string *p_str) {
conn.Close();
return Fail(ss.str(), std::move(last_error));
}
/**
 * @brief Obtain the name of the local host.
 *
 * POSIX leaves it unspecified whether the result of gethostname() is
 * null-terminated when the host name does not fit into the buffer, so the
 * buffer is zero-initialized and the last byte is forced to NUL to guarantee
 * that a valid C string is read back.
 *
 * @param p_out Output: the host name on success.
 */
[[nodiscard]] Result GetHostName(std::string *p_out) {
  std::array<char, HOST_NAME_MAX> buf{};  // zero-init: guards against unterminated result
  if (gethostname(buf.data(), HOST_NAME_MAX) != 0) {
    return system::FailWithCode("Failed to get host name.");
  }
  buf.back() = '\0';  // defend against truncation without termination
  *p_out = buf.data();
  return Success();
}
} // namespace xgboost::collective

296
src/collective/tracker.cc Normal file
View File

@@ -0,0 +1,296 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#if defined(__unix__) || defined(__APPLE__)
#include <netdb.h> // gethostbyname
#include <sys/socket.h> // socket, AF_INET6, AF_INET, connect, getsockname
#endif // defined(__unix__) || defined(__APPLE__)
#if !defined(NOMINMAX) && defined(_WIN32)
#define NOMINMAX
#endif // !defined(NOMINMAX)
#if defined(_WIN32)
#include <winsock2.h>
#include <ws2tcpip.h>
#endif // defined(_WIN32)
#include <algorithm> // for sort
#include <chrono> // for seconds
#include <cstdint> // for int32_t
#include <string> // for string
#include <utility> // for move, forward
#include "../common/json_utils.h"
#include "comm.h"
#include "protocol.h" // for kMagic, PeerInfo
#include "tracker.h"
#include "xgboost/collective/result.h" // for Result, Fail, Success
#include "xgboost/collective/socket.h" // for GetHostName, FailWithCode, MakeSockAddress, ...
#include "xgboost/json.h"
namespace xgboost::collective {
// Construct a tracker from a JSON configuration.
//
// Required key: `n_workers` -- number of workers expected to connect.
// Optional keys: `port` (default 0 -- presumably lets the OS pick a free port
// at bind time; confirm with TCPSocket::Bind) and `timeout` in seconds
// (defaults to collective::DefaultTimeoutSec()).
Tracker::Tracker(Json const& config)
: n_workers_{static_cast<std::int32_t>(
RequiredArg<Integer const>(config, "n_workers", __func__))},
port_{static_cast<std::int32_t>(OptionalArg<Integer const>(config, "port", Integer::Int{0}))},
timeout_{std::chrono::seconds{OptionalArg<Integer const>(
config, "timeout", static_cast<std::int64_t>(collective::DefaultTimeoutSec()))}} {}
// Accept-side handshake with a newly connected worker.
//
// Verifies the protocol magic, receives the connect information (world size,
// rank, task id), then reads exactly one command (start/print/error) from the
// worker and dispatches it to the matching protocol handler.  The outcome is
// stored in rc_; callers must check Status() before using the proxy.
RabitTracker::WorkerProxy::WorkerProxy(std::int32_t world, TCPSocket sock, SockAddrV4 addr)
: sock_{std::move(sock)} {
auto host = addr.Addr();
std::int32_t rank{0};
rc_ = Success()
<< [&] { return proto::Magic{}.Verify(&sock_); }
<< [&] { return proto::Connect{}.TrackerRecv(&sock_, &world_, &rank, &task_id_); };
if (!rc_.OK()) {
return;
}
std::string cmd;
// NOTE(review): the return value of Recv is ignored; a short or failed read
// would only surface as a JSON parse failure below -- confirm this is intended.
sock_.Recv(&cmd);
auto jcmd = Json::Load(StringView{cmd});
cmd_ = static_cast<proto::CMD>(get<Integer const>(jcmd["cmd"]));
std::int32_t port{0};
if (cmd_ == proto::CMD::kStart) {
proto::Start start;
rc_ = start.TrackerHandle(jcmd, &world_, world, &port, &sock_, &eport_);
} else if (cmd_ == proto::CMD::kPrint) {
proto::Print print;
rc_ = print.TrackerHandle(jcmd, &msg_);
} else if (cmd_ == proto::CMD::kError) {
proto::ErrorCMD error;
rc_ = error.TrackerHandle(jcmd, &msg_, &code_);
}
if (!rc_.OK()) {
return;
}
// Peer info advertised to the other workers during bootstrap.
info_ = proto::PeerInfo{host, port, rank};
}
/**
 * @brief Construct the tracker from a JSON configuration and start listening.
 *
 * Resolves the local host address (unless overridden by the optional `host`
 * key), binds the listening socket, and starts listening for workers.
 *
 * Fixes two defects in the previous version: the result of GetHostAddress was
 * silently discarded, and the resolved host was never stored in host_, so
 * WorkerArgs() would advertise an empty DMLC_TRACKER_URI.
 */
RabitTracker::RabitTracker(Json const& config) : Tracker{config} {
  std::string self;
  auto rc = collective::GetHostAddress(&self);
  // Fail fast: ignoring this error would surface later as a confusing
  // bind/connect failure.
  CHECK(rc.OK()) << rc.Report();
  // Store the host so that WorkerArgs() reports the address workers must use.
  host_ = OptionalArg<String>(config, "host", self);

  listener_ = TCPSocket::Create(SockDomain::kV4);
  rc = listener_.Bind(host_, &this->port_);
  CHECK(rc.OK()) << rc.Report();
  listener_.Listen();
}
// Share peer addresses among all connected workers.
//
// Workers are first sorted (WorkerCmp) so the assigned topology is
// deterministic across runs.  For each rank r, the address of its successor
// (BootstrapNext -- presumably a ring-style next-peer mapping; confirm with
// its definition) is serialized and sent on a dedicated thread.  The lambdas
// capture references into `workers`; this is safe because the vector is not
// resized while the threads run and all threads are joined before return.
Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
auto& workers = *p_workers;
std::sort(workers.begin(), workers.end(), WorkerCmp{});
std::vector<std::thread> bootstrap_threads;
for (std::int32_t r = 0; r < n_workers_; ++r) {
auto& worker = workers[r];
auto next = BootstrapNext(r, n_workers_);
auto const& next_w = workers[next];
bootstrap_threads.emplace_back([next, &worker, &next_w] {
auto jnext = proto::PeerInfo{next_w.Host(), next_w.Port(), next}.ToJson();
std::string str;
Json::Dump(jnext, &str);
worker.Send(StringView{str});
});
}
for (auto& t : bootstrap_threads) {
t.join();
}
// Remember each worker's error port so we can signal them if a peer fails.
for (auto const& w : workers) {
worker_error_handles_.emplace_back(w.Host(), w.ErrorPort());
}
return Success();
}
/**
 * @brief Main tracker loop, run asynchronously.
 *
 * Accepts worker connections and drives a small state machine:
 *  - `start`    : collect workers; once all have connected, bootstrap them.
 *  - `shutdown` : count workers going offline; the loop exits when all are
 *                 down and no restart is pending.
 *  - `error`    : a worker failed; notify all other workers (best effort) and
 *                 mark a restart as pending.
 *  - `print`    : echo a worker's log message on the tracker console.
 *
 * Fix over the previous version: the result of Connect() to a peer's error
 * port was discarded, so a dead peer caused Send() on an unconnected socket
 * and aborted the whole tracker.  Per the stated intent ("if they haven't
 * aborted already"), an unreachable peer is now simply skipped.
 *
 * @return A future resolving to the terminal status of the tracker.
 */
[[nodiscard]] std::future<Result> RabitTracker::Run() {
  // a state machine to keep track of consistency.
  struct State {
    std::int32_t const n_workers;
    std::int32_t n_shutdown{0};
    bool during_restart{false};
    std::vector<WorkerProxy> pending;  // workers waiting for bootstrap

    explicit State(std::int32_t world) : n_workers{world} {}
    State(State const& that) = delete;
    State& operator=(State&& that) = delete;

    // A worker sent `start`; queue it for bootstrap.
    void Start(WorkerProxy&& worker) {
      CHECK_LT(pending.size(), n_workers);
      CHECK_LE(n_shutdown, n_workers);

      pending.emplace_back(std::forward<WorkerProxy>(worker));

      CHECK_LE(pending.size(), n_workers);
    }
    // A worker went offline cleanly.
    void Shutdown() {
      CHECK_GE(n_shutdown, 0);
      CHECK_LT(n_shutdown, n_workers);

      ++n_shutdown;

      CHECK_LE(n_shutdown, n_workers);
    }
    // A worker reported an error; expect the job to restart.
    void Error() {
      CHECK_LE(pending.size(), n_workers);
      CHECK_LE(n_shutdown, n_workers);

      during_restart = true;
    }
    // All workers have connected and are ready for bootstrap.
    [[nodiscard]] bool Ready() const {
      CHECK_LE(pending.size(), n_workers);
      return static_cast<std::int32_t>(pending.size()) == n_workers;
    }
    // Reset counters after a successful bootstrap.
    void Bootstrap() {
      CHECK_EQ(pending.size(), n_workers);
      CHECK_LE(n_shutdown, n_workers);

      // A reset.
      n_shutdown = 0;
      during_restart = false;
      pending.clear();
    }
    [[nodiscard]] bool ShouldContinue() const {
      CHECK_LE(pending.size(), n_workers);
      CHECK_LE(n_shutdown, n_workers);

      // - Without error, we should shutdown after all workers are offline.
      // - With error, all workers are offline, and we have during_restart as true.
      return n_shutdown != n_workers || during_restart;
    }
  };

  return std::async(std::launch::async, [this] {
    State state{this->n_workers_};
    while (state.ShouldContinue()) {
      TCPSocket sock;
      SockAddrV4 addr;
      auto rc = listener_.Accept(&sock, &addr);
      if (!rc.OK()) {
        return Fail("Failed to accept connection.", std::move(rc));
      }

      auto worker = WorkerProxy{n_workers_, std::move(sock), std::move(addr)};
      if (!worker.Status().OK()) {
        return Fail("Failed to initialize worker proxy.", std::move(worker.Status()));
      }
      switch (worker.Command()) {
        case proto::CMD::kStart: {
          state.Start(std::move(worker));
          if (state.Ready()) {
            rc = this->Bootstrap(&state.pending);
            state.Bootstrap();
          }
          if (!rc.OK()) {
            return rc;
          }
          continue;
        }
        case proto::CMD::kShutdown: {
          state.Shutdown();
          continue;
        }
        case proto::CMD::kError: {
          if (state.during_restart) {
            // Already signaling a restart; swallow subsequent errors.
            continue;
          }
          state.Error();
          auto msg = worker.Msg();
          auto code = worker.Code();
          LOG(WARNING) << "Received error from [" << worker.Host() << ":" << worker.Rank()
                       << "]: " << msg << " code:" << code;
          auto host = worker.Host();
          // We signal all workers for the error, if they haven't aborted already.
          for (auto& w : worker_error_handles_) {
            if (w.first == host) {
              continue;  // the reporting worker already knows
            }
            TCPSocket out;
            // retry is set to 1, just let the worker timeout or error. Otherwise the
            // tracker and the worker might be waiting for each other.
            auto conn_rc = Connect(w.first, w.second, 1, timeout_, &out);
            if (!conn_rc.OK()) {
              // Best effort: the worker may have aborted already; skip it.
              continue;
            }
            // send signal to stop the worker.
            proto::ShutdownCMD shutdown;
            conn_rc = shutdown.Send(&out);
            if (!conn_rc.OK()) {
              return Fail("Failed to inform workers to stop.");
            }
          }
          continue;
        }
        case proto::CMD::kPrint: {
          LOG(CONSOLE) << worker.Msg();
          continue;
        }
        case proto::CMD::kInvalid:
        default: {
          return Fail("Invalid command received.");
        }
      }
    }
    return Success();
  });
}
/**
 * @brief Best-effort discovery of the host's public IP address.
 *
 * Resolves the host name first; if it maps to a loopback (127.x) address, a
 * UDP socket is "connected" to a reserved address to let the kernel pick the
 * outgoing interface, whose address is then read back with getsockname.
 *
 * Fixes over the previous version: the gethostbyname() result is checked for
 * nullptr before use, and the probe socket is closed on every error path
 * instead of leaking the descriptor.
 *
 * @param out In: buffer for the host name. Out: the selected IP address.
 */
[[nodiscard]] Result GetHostAddress(std::string* out) {
  auto rc = GetHostName(out);
  if (!rc.OK()) {
    return rc;
  }
  auto host = gethostbyname(out->c_str());
  if (host == nullptr) {
    // Resolution can legitimately fail (e.g. no DNS entry for the host name).
    return Fail("Failed to resolve the host name.");
  }
  // get ip address from host
  std::string ip;
  rc = INetNToP(host, &ip);
  if (!rc.OK()) {
    return rc;
  }

  if (!(ip.size() >= 4 && ip.substr(0, 4) == "127.")) {
    // return if this is a public IP address.
    // not entirely accurate, we have other reserved IPs
    *out = ip;
    return Success();
  }

  // Create an UDP socket to probe the public IP address, it's fine even if it's
  // unreachable.
  auto sock = socket(AF_INET, SOCK_DGRAM, 0);
  if (sock == -1) {
    return Fail("Failed to create socket.");
  }
  auto paddr = MakeSockAddress(StringView{"10.255.255.255"}, 1);
  sockaddr const* addr_handle = reinterpret_cast<const sockaddr*>(&paddr.V4().Handle());
  socklen_t addr_len{sizeof(paddr.V4().Handle())};
  auto err = connect(sock, addr_handle, addr_len);
  if (err != 0) {
    system::CloseSocket(sock);  // don't leak the probe socket on failure
    return system::FailWithCode("Failed to find IP address.");
  }

  // get the IP address from the socket descriptor
  struct sockaddr_in addr;
  socklen_t len = sizeof(addr);
  if (getsockname(sock, reinterpret_cast<struct sockaddr*>(&addr), &len) == -1) {
    system::CloseSocket(sock);  // don't leak the probe socket on failure
    return Fail("Failed to get sock name.");
  }
  ip = inet_ntoa(addr.sin_addr);

  err = system::CloseSocket(sock);
  if (err != 0) {
    return system::FailWithCode("Failed to close socket.");
  }

  *out = ip;
  return Success();
}
} // namespace xgboost::collective

141
src/collective/tracker.h Normal file
View File

@@ -0,0 +1,141 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <chrono> // for seconds
#include <cstdint> // for int32_t
#include <future> // for future
#include <string> // for string
#include <utility> // for pair
#include <vector> // for vector
#include "protocol.h"
#include "xgboost/collective/result.h" // for Result
#include "xgboost/collective/socket.h" // for TCPSocket
#include "xgboost/json.h" // for Json
namespace xgboost::collective {
/**
*
* @brief Implementation of RABIT tracker.
*
* * What is a tracker
*
 * The implementation of collective follows what RABIT did in the past. It requires a
 * tracker to coordinate initialization and error recovery of workers. The
 * original implementation attempted to attain error resilience inside the collective
 * module, which turned out to be too challenging due to the large amount of external
 * state. The new implementation here differs from RABIT in that neither state
 * recovery nor resilience is handled inside the collective module; it merely provides
 * the mechanism to signal errors to other workers through a centralized tracker.
*
 * There are three major functionalities provided by the tracker, namely:
* - Initialization. Share the node addresses among all workers.
* - Logging.
* - Signal error. If an exception is thrown in one (or many) of the workers, it can
* signal an error to the tracker and the tracker will notify other workers.
*/
class Tracker {
protected:
// Number of workers expected to join the job.
std::int32_t n_workers_{0};
// Listening port; -1 until configured/bound.
std::int32_t port_{-1};
// How long to wait on blocking operations before giving up.
std::chrono::seconds timeout_{0};
public:
// Construct from a JSON configuration (see the derived class for keys).
explicit Tracker(Json const& config);
Tracker(std::int32_t n_worders, std::int32_t port, std::chrono::seconds timeout)
: n_workers_{n_worders}, port_{port}, timeout_{timeout} {}
// noexcept(false): implementations may report fatal errors from cleanup.
virtual ~Tracker() noexcept(false){}; // NOLINT
// Start serving workers; the future resolves with the tracker's final status.
[[nodiscard]] virtual std::future<Result> Run() = 0;
// Arguments (e.g. tracker URI/port) each worker needs to connect.
[[nodiscard]] virtual Json WorkerArgs() const = 0;
[[nodiscard]] std::chrono::seconds Timeout() const { return timeout_; }
};
class RabitTracker : public Tracker {
// a wrapper for connected worker socket.
class WorkerProxy {
TCPSocket sock_;
// Peer info (host, port, rank) advertised to other workers.
proto::PeerInfo info_;
// Port the worker listens on for error/shutdown notifications.
std::int32_t eport_{0};
// World size reported by the worker; -1 until the start command is handled.
std::int32_t world_{-1};
std::string task_id_;
// The single command this connection delivered.
proto::CMD cmd_{proto::CMD::kInvalid};
// Message/code carried by print/error commands.
std::string msg_;
std::int32_t code_{0};
// Handshake outcome; check Status() before using any accessor.
Result rc_;
public:
explicit WorkerProxy(std::int32_t world, TCPSocket sock, SockAddrV4 addr);
WorkerProxy(WorkerProxy const& that) = delete;
WorkerProxy(WorkerProxy&& that) = default;
WorkerProxy& operator=(WorkerProxy const&) = delete;
WorkerProxy& operator=(WorkerProxy&&) = default;
[[nodiscard]] auto Host() const { return info_.host; }
[[nodiscard]] auto TaskID() const { return task_id_; }
[[nodiscard]] auto Port() const { return info_.port; }
[[nodiscard]] auto Rank() const { return info_.rank; }
[[nodiscard]] auto ErrorPort() const { return eport_; }
[[nodiscard]] auto Command() const { return cmd_; }
[[nodiscard]] auto Msg() const { return msg_; }
[[nodiscard]] auto Code() const { return code_; }
[[nodiscard]] Result const& Status() const { return rc_; }
[[nodiscard]] Result& Status() { return rc_; }
void Send(StringView value) { this->sock_.Send(value); }
};
// provide an ordering for workers, this helps us get deterministic topology.
struct WorkerCmp {
// Order by host first, then by task id to break ties.
[[nodiscard]] bool operator()(WorkerProxy const& lhs, WorkerProxy const& rhs) {
auto const& lh = lhs.Host();
auto const& rh = rhs.Host();
if (lh != rh) {
return lh < rh;
}
return lhs.TaskID() < rhs.TaskID();
}
};
private:
std::string host_;
// record for how to reach out to workers if error happens.
std::vector<std::pair<std::string, std::int32_t>> worker_error_handles_;
// listening socket for incoming workers.
TCPSocket listener_;
// Share peer addresses among all connected workers.
Result Bootstrap(std::vector<WorkerProxy>* p_workers);
public:
explicit RabitTracker(StringView host, std::int32_t n_worders, std::int32_t port,
std::chrono::seconds timeout)
: Tracker{n_worders, port, timeout}, host_{host.c_str(), host.size()} {
listener_ = TCPSocket::Create(SockDomain::kV4);
auto rc = listener_.Bind(host, &this->port_);
CHECK(rc.OK()) << rc.Report();
listener_.Listen();
}
explicit RabitTracker(Json const& config);
~RabitTracker() noexcept(false) override = default;
std::future<Result> Run() override;
// Port actually bound (useful when 0 was requested).
[[nodiscard]] std::int32_t Port() const { return port_; }
// Connection arguments each worker passes to the communicator.
[[nodiscard]] Json WorkerArgs() const override {
Json args{Object{}};
args["DMLC_TRACKER_URI"] = String{host_};
args["DMLC_TRACKER_PORT"] = this->Port();
return args;
}
};
// Probe the public IP address of the host; a better method is needed.
//
// This is directly translated from the previous Python implementation. We should find a
// more rigorous approach, which may require some expertise in network programming.
[[nodiscard]] Result GetHostAddress(std::string* out);
} // namespace xgboost::collective

View File

@@ -5,17 +5,16 @@
#ifndef XGBOOST_COMMON_BITFIELD_H_
#define XGBOOST_COMMON_BITFIELD_H_
#include <algorithm>
#include <bitset>
#include <cinttypes>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <algorithm> // for min
#include <bitset> // for bitset
#include <cstdint> // for uint32_t, uint64_t, uint8_t
#include <ostream> // for ostream
#include <type_traits> // for conditional_t, is_signed_v
#if defined(__CUDACC__)
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include "device_helpers.cuh"
#elif defined(__HIP_PLATFORM_AMD__)
#include <thrust/copy.h>
@@ -23,8 +22,8 @@
#include "device_helpers.hip.h"
#endif // defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
#include "xgboost/span.h"
#include "common.h"
#include "xgboost/span.h" // for Span
namespace xgboost {
@@ -79,7 +78,7 @@ struct BitFieldContainer {
private:
value_type* bits_{nullptr};
size_type n_values_{0};
static_assert(!std::is_signed<VT>::value, "Must use an unsiged type as the underlying storage.");
static_assert(!std::is_signed_v<VT>, "Must use an unsiged type as the underlying storage.");
public:
XGBOOST_DEVICE static Pos ToBitPos(index_type pos) {
@@ -244,11 +243,39 @@ struct RBitsPolicy : public BitFieldContainer<VT, RBitsPolicy<VT>> {
// Format: <Const><Direction>BitField<size of underlying type in bits>, underlying type
// must be unsigned.
using LBitField64 = BitFieldContainer<uint64_t, LBitsPolicy<uint64_t>>;
using RBitField8 = BitFieldContainer<uint8_t, RBitsPolicy<unsigned char>>;
using LBitField64 = BitFieldContainer<std::uint64_t, LBitsPolicy<std::uint64_t>>;
using RBitField8 = BitFieldContainer<std::uint8_t, RBitsPolicy<unsigned char>>;
using LBitField32 = BitFieldContainer<uint32_t, LBitsPolicy<uint32_t>>;
using CLBitField32 = BitFieldContainer<uint32_t, LBitsPolicy<uint32_t, true>, true>;
using LBitField32 = BitFieldContainer<std::uint32_t, LBitsPolicy<std::uint32_t>>;
using CLBitField32 = BitFieldContainer<std::uint32_t, LBitsPolicy<std::uint32_t, true>, true>;
using RBitField32 = BitFieldContainer<std::uint32_t, RBitsPolicy<std::uint32_t>>;
namespace detail {
inline std::uint32_t TrailingZeroBitsImpl(std::uint32_t value) {
auto n = sizeof(value) * 8;
std::uint32_t cnt{0};
for (decltype(n) i = 0; i < n; i++) {
if ((value >> i) & 1) {
break;
}
cnt++;
}
return cnt;
}
} // namespace detail
inline std::uint32_t TrailingZeroBits(std::uint32_t value) {
if (value == 0) {
return sizeof(value) * 8;
}
#if defined(__GNUC__)
return __builtin_ctz(value);
#elif defined(_MSC_VER)
return _tzcnt_u32(value);
#else
return detail::TrailingZeroBitsImpl(value);
#endif // __GNUC__
}
} // namespace xgboost
#endif // XGBOOST_COMMON_BITFIELD_H_

View File

@@ -6,7 +6,6 @@
#ifndef XGBOOST_COMMON_COMMON_H_
#define XGBOOST_COMMON_COMMON_H_
#include <algorithm> // for max
#include <array> // for array
#include <cmath> // for ceil
#include <cstddef> // for size_t
@@ -203,7 +202,7 @@ inline void SetDevice(std::int32_t device) {
#endif
/**
* Last index of a group in a CSR style of index pointer.
* @brief Last index of a group in a CSR style of index pointer.
*/
template <typename Indexable>
XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {

View File

@@ -135,7 +135,7 @@ void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* s
#endif
}
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan) {
@@ -252,13 +252,13 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple {
return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size.
});
detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature,
detail::GetColumnSizesScan(ctx->Device(), info.num_col_, num_cuts_per_feature,
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
if (sketch_container->HasCategorical()) {
auto p_weight = entry_weight.empty() ? nullptr : &entry_weight;
detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight,
detail::RemoveDuplicatedCategories(ctx->Device(), info, d_cuts_ptr, &sorted_entries, p_weight,
&column_sizes_scan);
}
@@ -359,7 +359,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b
HistogramCuts cuts;
SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_,
ctx->Ordinal());
ctx->Device());
CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty());
for (const auto& page : p_fmat->GetBatches<SparsePage>()) {
std::size_t page_nnz = page.data.Size();

View File

@@ -86,9 +86,9 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
}
template <std::uint32_t kBlockThreads, typename Kernel>
std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shared_mem) {
int n_mps = 0;
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device.ordinal));
int n_blocks_per_mp = 0;
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, shared_mem));
@@ -110,11 +110,11 @@ std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t s
* \param out_column_size Output buffer for the size of each column.
*/
template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false>
void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter,
void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan<BatchIt> batch_iter,
data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) {
thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0);
std::size_t max_shared_memory = dh::MaxSharedMemory(device);
std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal);
// Not strictly correct as we should use number of samples to determine the type of
// counter. However, the sample size is not known due to sliding window on number of
// elements.
@@ -158,7 +158,7 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter
}
template <typename BatchIt>
void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature,
void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature,
IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
dh::caching_device_vector<size_t>* column_sizes_scan) {
@@ -228,7 +228,8 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
// Count the valid entries in each column and copy them out.
template <typename AdapterBatch, typename BatchIter>
void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range,
float missing, size_t columns, size_t cuts_per_feature, int device,
float missing, size_t columns, size_t cuts_per_feature,
DeviceOrd device,
HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
dh::caching_device_vector<size_t>* column_sizes_scan,
dh::device_vector<Entry>* sorted_entries) {
@@ -252,7 +253,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
void SortByWeight(dh::device_vector<float>* weights,
dh::device_vector<Entry>* sorted_entries);
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan);
@@ -290,7 +291,7 @@ inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t
template <typename AdapterBatch>
void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
int device, size_t columns, size_t begin, size_t end,
DeviceOrd device, size_t columns, size_t begin, size_t end,
float missing, SketchContainer *sketch_container,
int num_cuts) {
// Copy current subset of valid elements into temporary storage and sort
@@ -335,11 +336,11 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
template <typename Batch>
void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
int num_cuts_per_feature,
bool is_ranking, float missing, int device,
bool is_ranking, float missing, DeviceOrd device,
size_t columns, size_t begin, size_t end,
SketchContainer *sketch_container) {
dh::XGBCachingDeviceAllocator<char> alloc;
dh::safe_cuda(cudaSetDevice(device));
dh::safe_cuda(cudaSetDevice(device.ordinal));
info.weights_.SetDevice(device);
auto weights = info.weights_.ConstDeviceSpan();
@@ -451,14 +452,14 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
size_t num_rows = batch.NumRows();
size_t num_cols = batch.NumCols();
size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows);
int32_t device = sketch_container->DeviceIdx();
auto device = sketch_container->DeviceIdx();
bool weighted = !info.weights_.Empty();
if (weighted) {
sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements,
num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, true);
device.ordinal, num_cuts_per_feature, true);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
@@ -471,7 +472,7 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements,
num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, false);
device.ordinal, num_cuts_per_feature, false);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));

View File

@@ -33,19 +33,19 @@ struct HostDeviceVectorImpl {
};
template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int)
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int)
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int)
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
@@ -81,7 +81,7 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
template <typename T>
int HostDeviceVector<T>::DeviceIdx() const { return -1; }
DeviceOrd HostDeviceVector<T>::Device() const { return DeviceOrd::CPU(); }
template <typename T>
T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
@@ -165,9 +165,6 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
return false;
}
template <typename T>
void HostDeviceVector<T>::SetDevice(int) const {}
template <typename T>
void HostDeviceVector<T>::SetDevice(DeviceOrd) const {}
@@ -178,6 +175,7 @@ template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<GradientPairPrecise>;
template class HostDeviceVector<int32_t>; // bst_node_t
template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<int8_t>;
template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>; // bst_row_t

View File

@@ -25,8 +25,8 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) {
template <typename T>
class HostDeviceVectorImpl {
public:
HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) {
if (device >= 0) {
HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) {
if (device.IsCUDA()) {
gpu_access_ = GPUAccess::kWrite;
SetDevice();
data_d_->resize(size, v);
@@ -37,8 +37,8 @@ class HostDeviceVectorImpl {
// Initializer can be std::vector<T> or std::initializer_list<T>
template <class Initializer>
HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) {
if (device >= 0) {
HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) {
if (device.IsCUDA()) {
gpu_access_ = GPUAccess::kWrite;
LazyResizeDevice(init.size());
Copy(init);
@@ -54,16 +54,16 @@ class HostDeviceVectorImpl {
gpu_access_{that.gpu_access_} {}
~HostDeviceVectorImpl() {
if (device_ >= 0) {
if (device_.IsCUDA()) {
SetDevice();
}
}
size_t Size() const {
[[nodiscard]] size_t Size() const {
return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->size() : 0;
}
int DeviceIdx() const { return device_; }
[[nodiscard]] DeviceOrd Device() const { return device_; }
T* DevicePointer() {
LazySyncDevice(GPUAccess::kWrite);
@@ -138,8 +138,7 @@ class HostDeviceVectorImpl {
} else {
auto ptr = other->ConstDevicePointer();
SetDevice();
CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
CHECK_EQ(this->Device(), other->Device());
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
ptr,
other->Size() * sizeof(T),
@@ -157,24 +156,25 @@ class HostDeviceVectorImpl {
return data_h_;
}
void SetDevice(int device) {
void SetDevice(DeviceOrd device) {
if (device_ == device) { return; }
if (device_ >= 0) {
if (device_.IsCUDA()) {
LazySyncHost(GPUAccess::kNone);
}
if (device_ >= 0 && device >= 0) {
CHECK_EQ(device_, device) << "New device ordinal is different from previous one.";
if (device_.IsCUDA() && device.IsCUDA()) {
CHECK_EQ(device_.ordinal, device.ordinal)
<< "New device ordinal is different from previous one.";
}
device_ = device;
if (device_ >= 0) {
if (device_.IsCUDA()) {
LazyResizeDevice(data_h_.size());
}
}
void Resize(size_t new_size, T v) {
if (new_size == Size()) { return; }
if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) {
if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
// fast on-device resize
gpu_access_ = GPUAccess::kWrite;
SetDevice();
@@ -221,16 +221,16 @@ class HostDeviceVectorImpl {
gpu_access_ = access;
}
bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
GPUAccess Access() const { return gpu_access_; }
[[nodiscard]] bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
[[nodiscard]] bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
[[nodiscard]] bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
[[nodiscard]] bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
[[nodiscard]] bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
[[nodiscard]] bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
[[nodiscard]] GPUAccess Access() const { return gpu_access_; }
private:
int device_{-1};
DeviceOrd device_{DeviceOrd::CPU()};
std::vector<T> data_h_{};
std::unique_ptr<dh::device_vector<T>> data_d_{};
GPUAccess gpu_access_{GPUAccess::kNone};
@@ -264,11 +264,11 @@ class HostDeviceVectorImpl {
}
void SetDevice() {
CHECK_GE(device_, 0);
CHECK_GE(device_.ordinal, 0);
if (cudaSetDeviceHandler == nullptr) {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
} else {
(*cudaSetDeviceHandler)(device_);
(*cudaSetDeviceHandler)(device_.ordinal);
}
if (!data_d_) {
@@ -278,15 +278,15 @@ class HostDeviceVectorImpl {
};
template<typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd device)
: impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T>
@@ -314,7 +314,9 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
template <typename T>
int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
DeviceOrd HostDeviceVector<T>::Device() const {
return impl_->Device();
}
template <typename T>
T* HostDeviceVector<T>::DevicePointer() {
@@ -394,14 +396,9 @@ GPUAccess HostDeviceVector<T>::DeviceAccess() const {
return impl_->Access();
}
template <typename T>
void HostDeviceVector<T>::SetDevice(int device) const {
impl_->SetDevice(device);
}
template <typename T>
void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
impl_->SetDevice(device.ordinal);
impl_->SetDevice(device);
}
template <typename T>
@@ -416,6 +413,7 @@ template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<GradientPairPrecise>;
template class HostDeviceVector<int32_t>; // bst_node_t
template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<int8_t>;
template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>; // bst_row_t

View File

@@ -8,7 +8,7 @@
#define XGBOOST_COMMON_IO_H_
#include <dmlc/io.h>
#include <rabit/rabit.h>
#include <rabit/internal/io.h> // for MemoryFixSizeBuffer, MemoryBufferStream
#include <algorithm> // for min, fill_n, copy_n
#include <array> // for array
@@ -382,7 +382,8 @@ class PrivateMmapConstStream : public AlignedResourceReadStream {
* @param length See the `length` parameter of `mmap` for details.
*/
explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
: AlignedResourceReadStream{std::make_shared<MmapResource>(path, offset, length)} {}
: AlignedResourceReadStream{std::shared_ptr<MmapResource>{ // NOLINT
new MmapResource{std::move(path), offset, length}}} {}
~PrivateMmapConstStream() noexcept(false) override;
};

74
src/common/json_utils.h Normal file
View File

@@ -0,0 +1,74 @@
/**
* Copyright 2023, XGBoost Contributors
*
* @brief Utils tailored for XGBoost.
*/
#pragma once
#include <string> // for string
#include <type_traits> // for enable_if_t, remove_const_t
#include "xgboost/json.h"
#include "xgboost/string_view.h" // for StringView
namespace xgboost {
namespace detail {
// Base case: does the JSON value match the single candidate type `Head`?
template <typename Head>
bool TypeCheckImpl(Json const &value) {
  return IsA<Head>(value);
}

// Recursive case: true if the value matches `Head` or any of the remaining
// candidate types in `JT...` (short-circuits on the first match).
template <typename Head, typename... JT>
std::enable_if_t<sizeof...(JT) != 0, bool> TypeCheckImpl(Json const &value) {
  return IsA<Head>(value) || TypeCheckImpl<JT...>(value);
}
// Base case: render the last candidate type as "`TypeName`" for error messages.
// Uses a default-constructed `Head{}` only to reach its TypeStr() method.
template <typename Head>
std::string TypeCheckError() {
  return "`" + Head{}.TypeStr() + "`";
}

// Recursive case: comma-joined list of all candidate type names,
// e.g. "`Number`, `Integer`". Mirrors the recursion in TypeCheckImpl.
template <typename Head, typename... JT>
std::enable_if_t<sizeof...(JT) != 0, std::string> TypeCheckError() {
  return "`" + Head{}.TypeStr() + "`, " + TypeCheckError<JT...>();
}
} // namespace detail
/**
 * @brief Type check for JSON-based parameters
 *
 * Aborts via LOG(FATAL) when `value` matches none of the expected types,
 * listing the accepted types and the actual type in the error message.
 *
 * @tparam JT    Expected JSON types.
 * @param value  Value to be checked.
 * @param name   Parameter name, used only for the error message.
 */
template <typename... JT>
void TypeCheck(Json const &value, StringView name) {
  if (!detail::TypeCheckImpl<JT...>(value)) {
    LOG(FATAL) << "Invalid type for: `" << name << "`, expecting one of the: {`"
               << detail::TypeCheckError<JT...>() << "}, got: `" << value.GetValue().TypeStr()
               << "`";
  }
}
template <typename JT>
auto const &RequiredArg(Json const &in, StringView key, StringView func) {
auto const &obj = get<Object const>(in);
auto it = obj.find(key);
if (it == obj.cend() || IsA<Null>(it->second)) {
LOG(FATAL) << "Argument `" << key << "` is required for `" << func << "`.";
}
TypeCheck<JT>(it->second, StringView{key});
return get<std::remove_const_t<JT> const>(it->second);
}
/**
 * @brief Fetch an optional argument from a JSON object, with a default.
 *
 * When the key is absent or null the caller-supplied default is returned;
 * otherwise the value is type-checked against `JT` and returned.
 *
 * @tparam JT   Expected JSON type of the argument.
 * @tparam T    Type of the default value.
 * @param in    JSON object holding the arguments.
 * @param key   Argument name to look up.
 * @param dft   Default returned when the argument is missing. Note: a
 *              reference to `dft` itself is returned in that case, so the
 *              argument must outlive the use of the result.
 * @return Const reference to the stored value or to `dft`.
 */
template <typename JT, typename T>
auto const &OptionalArg(Json const &in, StringView key, T const &dft) {
  auto const &dict = get<Object const>(in);
  auto pos = dict.find(key);
  // Guard clause: fall back to the default when the argument is not supplied.
  if (pos == dict.cend() || IsA<Null>(pos->second)) {
    return dft;
  }
  TypeCheck<JT>(pos->second, key);
  return get<std::remove_const_t<JT> const>(pos->second);
}
} // namespace xgboost

View File

@@ -44,7 +44,7 @@ void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_
template <typename T, int32_t D, typename Fn>
void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn) {
ctx->IsCPU() ? ElementWiseKernelHost(t, ctx->Threads(), fn) : ElementWiseKernelDevice(t, fn);
ctx->IsCUDA() ? ElementWiseKernelDevice(t, fn) : ElementWiseKernelHost(t, ctx->Threads(), fn);
}
} // namespace linalg
} // namespace xgboost

View File

@@ -55,7 +55,7 @@ void ElementWiseTransformDevice(linalg::TensorView<T, D>, Fn&&, void* = nullptr)
template <typename T, int32_t D, typename Fn>
void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn) {
if (!ctx->IsCPU()) {
if (ctx->IsCUDA()) {
common::AssertGPUSupport();
}
ElementWiseKernelHost(t, ctx->Threads(), fn);

View File

@@ -11,13 +11,14 @@
namespace xgboost {
namespace common {
double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
if (ctx->IsCPU()) {
if (ctx->IsCUDA()) {
return cuda_impl::Reduce(ctx, values);
} else {
auto const& h_values = values.ConstHostVector();
auto result = cpu_impl::Reduce(ctx, h_values.cbegin(), h_values.cend(), 0.0);
static_assert(std::is_same<decltype(result), double>::value);
return result;
}
return cuda_impl::Reduce(ctx, values);
}
} // namespace common
} // namespace xgboost

View File

@@ -8,11 +8,9 @@
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h" // HostDeviceVector
namespace xgboost {
namespace common {
namespace cuda_impl {
namespace xgboost::common::cuda_impl {
double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
values.SetDevice(ctx->gpu_id);
values.SetDevice(ctx->Device());
auto const d_values = values.ConstDeviceSpan();
dh::XGBCachingDeviceAllocator<char> alloc;
@@ -24,6 +22,4 @@ double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
thrust::plus<float>{});
#endif
}
} // namespace cuda_impl
} // namespace common
} // namespace xgboost
} // namespace xgboost::common::cuda_impl

View File

@@ -24,9 +24,9 @@ struct OptionalWeights {
inline OptionalWeights MakeOptionalWeights(Context const* ctx,
HostDeviceVector<float> const& weights) {
if (ctx->IsCUDA()) {
weights.SetDevice(ctx->gpu_id);
weights.SetDevice(ctx->Device());
}
return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()};
return OptionalWeights{ctx->IsCUDA() ? weights.ConstDeviceSpan() : weights.ConstHostSpan()};
}
} // namespace xgboost::common
#endif // XGBOOST_COMMON_OPTIONAL_WEIGHT_H_

View File

@@ -242,11 +242,10 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
// summary does the output element come from) result by definition of merged rank. So we
// run it in 2 passes to obtain the merge path and then customize the standard merge
// algorithm.
void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
void MergeImpl(DeviceOrd device, Span<SketchEntry const> const &d_x,
Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
dh::safe_cuda(cudaSetDevice(device));
dh::safe_cuda(cudaSetDevice(device.ordinal));
CHECK_EQ(d_x.size() + d_y.size(), out.size());
CHECK_EQ(x_ptr.size(), out_ptr.size());
CHECK_EQ(y_ptr.size(), out_ptr.size());
@@ -344,8 +343,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
common::Span<OffsetT> cuts_ptr,
size_t total_cuts, Span<float> weights) {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
Span<SketchEntry> out;
dh::device_vector<SketchEntry> cuts;
bool first_window = this->Current().empty();
@@ -404,7 +402,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
* pruning or merging. We preserve the first type and remove the second type.
*/
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
dh::XGBCachingDeviceAllocator<char> alloc;
@@ -461,7 +459,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
void SketchContainer::Prune(size_t to) {
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
OffsetT to_total = 0;
auto& h_columns_ptr = columns_ptr_b_.HostVector();
@@ -496,8 +494,7 @@ void SketchContainer::Prune(size_t to) {
void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
Span<SketchEntry const> that) {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
timer_.Start(__func__);
if (this->Current().size() == 0) {
CHECK_EQ(this->columns_ptr_.HostVector().back(), 0);
@@ -532,8 +529,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
}
void SketchContainer::FixError() {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
auto in = dh::ToSpan(this->Current());
dh::LaunchN(in.size(), [=] __device__(size_t idx) {
@@ -558,7 +554,7 @@ void SketchContainer::FixError() {
}
void SketchContainer::AllReduce(bool is_column_split) {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
auto world = collective::GetWorldSize();
if (world == 1 || is_column_split) {
return;
@@ -585,15 +581,15 @@ void SketchContainer::AllReduce(bool is_column_split) {
auto offset = rank * d_columns_ptr.size();
thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(),
gathered_ptrs.begin() + offset);
collective::AllReduce<collective::Operation::kSum>(device_, gathered_ptrs.data().get(),
collective::AllReduce<collective::Operation::kSum>(device_.ordinal, gathered_ptrs.data().get(),
gathered_ptrs.size());
// Get the data from all workers.
std::vector<size_t> recv_lengths;
dh::caching_device_vector<char> recvbuf;
collective::AllGatherV(device_, this->Current().data().get(),
collective::AllGatherV(device_.ordinal, this->Current().data().get(),
dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf);
collective::Synchronize(device_);
collective::Synchronize(device_.ordinal);
// Segment the received data.
auto s_recvbuf = dh::ToSpan(recvbuf);
@@ -640,7 +636,7 @@ struct InvalidCatOp {
void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
p_cuts->min_vals_.Resize(num_columns_);
// Sync between workers.
@@ -690,21 +686,41 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
});
CHECK_EQ(num_columns_, d_in_columns_ptr.size() - 1);
max_values.resize(d_in_columns_ptr.size() - 1);
// In some cases (e.g. column-wise data split), we may have empty columns, so we need to keep
// track of the unique keys (feature indices) after the thrust::reduce_by_key` call.
dh::caching_device_vector<size_t> d_max_keys(d_in_columns_ptr.size() - 1);
dh::caching_device_vector<SketchEntry> d_max_values(d_in_columns_ptr.size() - 1);
#if defined(XGBOOST_USE_CUDA)
thrust::reduce_by_key(thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it,
thrust::make_discard_iterator(), d_max_values.begin(),
thrust::equal_to<bst_feature_t>{},
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
#elif defined(XGBOOST_USE_HIP)
thrust::reduce_by_key(thrust::hip::par(alloc), key_it, key_it + in_cut_values.size(), val_it,
thrust::make_discard_iterator(), d_max_values.begin(),
thrust::equal_to<bst_feature_t>{},
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
#endif
auto new_end = thrust::reduce_by_key(
thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(),
d_max_values.begin(), thrust::equal_to<bst_feature_t>{},
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
d_max_keys.erase(new_end.first, d_max_keys.end());
d_max_values.erase(new_end.second, d_max_values.end());
dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_values));
// The device vector needs to be initialized explicitly since we may have some missing columns.
SketchEntry default_entry{};
dh::caching_device_vector<SketchEntry> d_max_results(d_in_columns_ptr.size() - 1,
default_entry);
thrust::scatter(thrust::cuda::par(alloc), d_max_values.begin(), d_max_values.end(),
d_max_keys.begin(), d_max_results.begin());
#elif defined(XGBOOST_USE_HIP)
auto new_end = thrust::reduce_by_key(
thrust::hip::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(),
d_max_values.begin(), thrust::equal_to<bst_feature_t>{},
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
d_max_keys.erase(new_end.first, d_max_keys.end());
d_max_values.erase(new_end.second, d_max_values.end());
// The device vector needs to be initialized explicitly since we may have some missing columns.
SketchEntry default_entry{};
dh::caching_device_vector<SketchEntry> d_max_results(d_in_columns_ptr.size() - 1,
default_entry);
thrust::scatter(thrust::hip::par(alloc), d_max_values.begin(), d_max_values.end(),
d_max_keys.begin(), d_max_results.begin());
#endif
dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_results));
auto max_it = MakeIndexTransformIter([&](auto i) {
if (IsCat(h_feature_types, i)) {
return max_values[i].value;

View File

@@ -41,7 +41,7 @@ class SketchContainer {
bst_row_t num_rows_;
bst_feature_t num_columns_;
int32_t num_bins_;
int32_t device_;
DeviceOrd device_;
// Double buffer as neither prune nor merge can be performed inplace.
dh::device_vector<SketchEntry> entries_a_;
@@ -93,35 +93,32 @@ class SketchContainer {
* \param num_rows Total number of rows in known dataset (typically the rows in current worker).
* \param device GPU ID.
*/
SketchContainer(HostDeviceVector<FeatureType> const &feature_types,
int32_t max_bin, bst_feature_t num_columns,
bst_row_t num_rows, int32_t device)
: num_rows_{num_rows},
num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
CHECK_GE(device, 0);
// Initialize Sketches for this dmatrix
this->columns_ptr_.SetDevice(device_);
this->columns_ptr_.Resize(num_columns + 1);
this->columns_ptr_b_.SetDevice(device_);
this->columns_ptr_b_.Resize(num_columns + 1);
SketchContainer(HostDeviceVector<FeatureType> const& feature_types, int32_t max_bin,
bst_feature_t num_columns, bst_row_t num_rows, DeviceOrd device)
: num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
CHECK(device.IsCUDA());
// Initialize Sketches for this dmatrix
this->columns_ptr_.SetDevice(device_);
this->columns_ptr_.Resize(num_columns + 1);
this->columns_ptr_b_.SetDevice(device_);
this->columns_ptr_b_.Resize(num_columns + 1);
this->feature_types_.Resize(feature_types.Size());
this->feature_types_.Copy(feature_types);
// Pull to device.
this->feature_types_.SetDevice(device);
this->feature_types_.ConstDeviceSpan();
this->feature_types_.ConstHostSpan();
this->feature_types_.Resize(feature_types.Size());
this->feature_types_.Copy(feature_types);
// Pull to device.
this->feature_types_.SetDevice(device);
this->feature_types_.ConstDeviceSpan();
this->feature_types_.ConstHostSpan();
auto d_feature_types = feature_types_.ConstDeviceSpan();
has_categorical_ =
!d_feature_types.empty() &&
thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types),
common::IsCatOp{});
auto d_feature_types = feature_types_.ConstDeviceSpan();
has_categorical_ =
!d_feature_types.empty() &&
thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{});
timer_.Init(__func__);
}
timer_.Init(__func__);
}
/* \brief Return GPU ID for this container. */
int32_t DeviceIdx() const { return device_; }
[[nodiscard]] DeviceOrd DeviceIdx() const { return device_; }
/* \brief Whether the predictor matrix contains categorical features. */
bool HasCategorical() const { return has_categorical_; }
/* \brief Accumulate weights of duplicated entries in input. */
@@ -175,9 +172,7 @@ class SketchContainer {
template <typename KeyComp = thrust::equal_to<size_t>>
size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
this->columns_ptr_.SetDevice(device_);
Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
CHECK_EQ(d_column_scan.size(), num_columns_ + 1);
@@ -195,7 +190,7 @@ class SketchContainer {
d_column_scan.data() + d_column_scan.size(), entries.data(),
entries.data() + entries.size(), scan_out.DevicePointer(),
entries.data(), detail::SketchUnique{}, key_comp);
#else
#elif defined(XGBOOST_USE_CUDA)
size_t n_uniques = dh::SegmentedUnique(
thrust::cuda::par(alloc), d_column_scan.data(),
d_column_scan.data() + d_column_scan.size(), entries.data(),

View File

@@ -35,13 +35,13 @@ struct WQSummary {
/*! \brief an entry in the sketch summary */
struct Entry {
/*! \brief minimum rank */
RType rmin;
RType rmin{};
/*! \brief maximum rank */
RType rmax;
RType rmax{};
/*! \brief maximum weight */
RType wmin;
RType wmin{};
/*! \brief the value of data */
DType value;
DType value{};
// constructor
XGBOOST_DEVICE Entry() {} // NOLINT
// constructor

View File

@@ -1,19 +1,19 @@
/**
* Copyright 2023 by XGBoost contributors
* Copyright 2023, XGBoost contributors
*/
#include "quantile_loss_utils.h"
#include <cctype> // std::isspace
#include <istream> // std::istream
#include <ostream> // std::ostream
#include <string> // std::string
#include <vector> // std::vector
#include <cctype> // for isspace
#include <istream> // for istream
#include <ostream> // for ostream
#include <string> // for string
#include <vector> // for vector
#include "xgboost/json.h" // F32Array,TypeCheck,get,Number
#include "xgboost/json_io.h" // JsonWriter
#include "../common/json_utils.h" // for TypeCheck
#include "xgboost/json.h" // for F32Array, get, Number
#include "xgboost/json_io.h" // for JsonWriter
namespace xgboost {
namespace common {
namespace xgboost::common {
std::ostream& operator<<(std::ostream& os, const ParamFloatArray& array) {
auto const& t = array.Get();
xgboost::F32Array arr{t.size()};
@@ -70,5 +70,4 @@ std::istream& operator>>(std::istream& is, ParamFloatArray& array) {
}
DMLC_REGISTER_PARAMETER(QuantileLossParam);
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -197,10 +197,10 @@ class RankingCache {
CHECK_EQ(info.group_ptr_.back(), info.labels.Size())
<< error::GroupSize() << "the size of label.";
}
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
if (ctx->IsCUDA()) {
this->InitOnCUDA(ctx, info);
} else {
this->InitOnCPU(ctx, info);
}
if (!info.weights_.Empty()) {
CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight();
@@ -218,7 +218,7 @@ class RankingCache {
// Constructed as [1, n_samples] if group ptr is not supplied by the user
common::Span<bst_group_t const> DataGroupPtr(Context const* ctx) const {
group_ptr_.SetDevice(ctx->Device());
return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan();
return ctx->IsCUDA() ? group_ptr_.ConstDeviceSpan() : group_ptr_.ConstHostSpan();
}
[[nodiscard]] auto const& Param() const { return param_; }
@@ -231,10 +231,10 @@ class RankingCache {
sorted_idx_cache_.SetDevice(ctx->Device());
sorted_idx_cache_.Resize(predt.size());
}
if (ctx->IsCPU()) {
return this->MakeRankOnCPU(ctx, predt);
} else {
if (ctx->IsCUDA()) {
return this->MakeRankOnCUDA(ctx, predt);
} else {
return this->MakeRankOnCPU(ctx, predt);
}
}
// The function simply returns a uninitialized buffer as this is only used by the
@@ -307,10 +307,10 @@ class NDCGCache : public RankingCache {
public:
NDCGCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
: RankingCache{ctx, info, p} {
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
if (ctx->IsCUDA()) {
this->InitOnCUDA(ctx, info);
} else {
this->InitOnCPU(ctx, info);
}
}
@@ -318,7 +318,7 @@ class NDCGCache : public RankingCache {
return inv_idcg_.View(ctx->Device());
}
common::Span<double const> Discount(Context const* ctx) const {
return ctx->IsCPU() ? discounts_.ConstHostSpan() : discounts_.ConstDeviceSpan();
return ctx->IsCUDA() ? discounts_.ConstDeviceSpan() : discounts_.ConstHostSpan();
}
linalg::VectorView<double> Dcg(Context const* ctx) {
if (dcg_.Size() == 0) {
@@ -387,10 +387,10 @@ class PreCache : public RankingCache {
public:
PreCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
: RankingCache{ctx, info, p} {
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
if (ctx->IsCUDA()) {
this->InitOnCUDA(ctx, info);
} else {
this->InitOnCPU(ctx, info);
}
}
@@ -399,7 +399,7 @@ class PreCache : public RankingCache {
pre_.SetDevice(ctx->Device());
pre_.Resize(this->Groups());
}
return ctx->IsCPU() ? pre_.HostSpan() : pre_.DeviceSpan();
return ctx->IsCUDA() ? pre_.DeviceSpan() : pre_.HostSpan();
}
};
@@ -418,10 +418,10 @@ class MAPCache : public RankingCache {
public:
MAPCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
: RankingCache{ctx, info, p}, n_samples_{static_cast<std::size_t>(info.num_row_)} {
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
if (ctx->IsCUDA()) {
this->InitOnCUDA(ctx, info);
} else {
this->InitOnCPU(ctx, info);
}
}
@@ -430,21 +430,21 @@ class MAPCache : public RankingCache {
n_rel_.SetDevice(ctx->Device());
n_rel_.Resize(n_samples_);
}
return ctx->IsCPU() ? n_rel_.HostSpan() : n_rel_.DeviceSpan();
return ctx->IsCUDA() ? n_rel_.DeviceSpan() : n_rel_.HostSpan();
}
common::Span<double> Acc(Context const* ctx) {
if (acc_.Empty()) {
acc_.SetDevice(ctx->Device());
acc_.Resize(n_samples_);
}
return ctx->IsCPU() ? acc_.HostSpan() : acc_.DeviceSpan();
return ctx->IsCUDA() ? acc_.DeviceSpan() : acc_.HostSpan();
}
common::Span<double> Map(Context const* ctx) {
if (map_.Empty()) {
map_.SetDevice(ctx->Device());
map_.Resize(this->Groups());
}
return ctx->IsCPU() ? map_.HostSpan() : map_.DeviceSpan();
return ctx->IsCUDA() ? map_.DeviceSpan() : map_.HostSpan();
}
};

View File

@@ -76,7 +76,7 @@ class RefResourceView {
[[nodiscard]] size_type size() const { return size_; } // NOLINT
[[nodiscard]] size_type size_bytes() const { // NOLINT
return Span{data(), size()}.size_bytes();
return Span<const value_type>{data(), size()}.size_bytes();
}
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT

View File

@@ -15,8 +15,7 @@
#include "xgboost/linalg.h" // Tensor, UnravelIndex, Apply
#include "xgboost/logging.h" // CHECK_EQ
namespace xgboost {
namespace common {
namespace xgboost::common {
void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out) {
if (!ctx->IsCPU()) {
@@ -46,11 +45,13 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
}
void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out) {
v.SetDevice(ctx->gpu_id);
out->SetDevice(ctx->gpu_id);
v.SetDevice(ctx->Device());
out->SetDevice(ctx->Device());
out->Reshape(1);
if (ctx->IsCPU()) {
if (ctx->IsCUDA()) {
cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
} else {
auto h_v = v.HostView();
float n = v.Size();
MemStackAllocator<float, DefaultMaxThreads()> tloc(ctx->Threads(), 0.0f);
@@ -58,9 +59,6 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
[&](auto i) { tloc[omp_get_thread_num()] += h_v(i) / n; });
auto ret = std::accumulate(tloc.cbegin(), tloc.cend(), .0f);
out->HostView()(0) = ret;
} else {
cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
}
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -15,19 +15,16 @@
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/linalg.h" // linalg::TensorView, UnravelIndex, Apply
namespace xgboost {
namespace common {
namespace cuda_impl {
#if defined(XGBOOST_USE_HIP)
namespace cub = hipcub;
#endif
namespace xgboost::common::cuda_impl {
void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
common::OptionalWeights weights, linalg::Tensor<float, 1>* out) {
CHECK_GE(t.Shape(1), 1);
HostDeviceVector<std::size_t> segments(t.Shape(1) + 1, 0);
segments.SetDevice(ctx->gpu_id);
segments.SetDevice(ctx->Device());
auto d_segments = segments.DeviceSpan();
dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(),
[=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; });
@@ -36,7 +33,7 @@ void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
});
out->SetDevice(ctx->gpu_id);
out->SetDevice(ctx->Device());
out->Reshape(t.Shape(1));
if (weights.Empty()) {
common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it,
@@ -65,6 +62,4 @@ void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorV
dh::TemporaryArray<char> temp{bytes};
cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s);
}
} // namespace cuda_impl
} // namespace common
} // namespace xgboost
} // namespace xgboost::common::cuda_impl

View File

@@ -160,7 +160,7 @@ void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, Se
auto d_sorted_idx = dh::ToSpan(sorted_idx);
auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx));
quantiles->SetDevice(ctx->gpu_id);
quantiles->SetDevice(ctx->Device());
quantiles->Resize(n_segments);
auto d_results = quantiles->DeviceSpan();
@@ -226,7 +226,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
#endif
auto n_segments = std::distance(seg_beg, seg_end) - 1;
quantiles->SetDevice(ctx->gpu_id);
quantiles->SetDevice(ctx->Device());
quantiles->Resize(n_segments);
auto d_results = quantiles->DeviceSpan();
auto d_weight_cdf = dh::ToSpan(weights_cdf);

View File

@@ -3,14 +3,23 @@
*/
#include "threading_utils.h"
#include <fstream>
#include <string>
#include <algorithm> // for max
#include <exception> // for exception
#include <filesystem> // for path, exists
#include <fstream> // for ifstream
#include <string> // for string
#include "xgboost/logging.h"
#include "common.h" // for DivRoundUp
namespace xgboost {
namespace common {
int32_t GetCfsCPUCount() noexcept {
namespace xgboost::common {
/**
* Modified from
* github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
*
* MIT License: Copyright (c) 2016 Domagoj Šarić
*/
std::int32_t GetCGroupV1Count(std::filesystem::path const& quota_path,
std::filesystem::path const& peroid_path) {
#if defined(__linux__)
// https://bugs.openjdk.java.net/browse/JDK-8146115
// http://hg.openjdk.java.net/jdk/hs/rev/7f22774a5f42
@@ -31,8 +40,8 @@ int32_t GetCfsCPUCount() noexcept {
}
};
// complete fair scheduler from Linux
auto const cfs_quota(read_int("/sys/fs/cgroup/cpu/cpu.cfs_quota_us"));
auto const cfs_period(read_int("/sys/fs/cgroup/cpu/cpu.cfs_period_us"));
auto const cfs_quota(read_int(quota_path.c_str()));
auto const cfs_period(read_int(peroid_path.c_str()));
if ((cfs_quota > 0) && (cfs_period > 0)) {
return std::max(cfs_quota / cfs_period, 1);
}
@@ -40,6 +49,47 @@ int32_t GetCfsCPUCount() noexcept {
return -1;
}
/**
 * @brief Read the CPU quota from a cgroup v2 bandwidth file (`cpu.max`).
 *
 * The file holds two fields: `$MAX $PERIOD`. When the quota is unrestricted
 * the first field is the literal string "max"; integer extraction then fails
 * and leaves `a` at 0 (stream extraction zeroes the target on failure), so
 * the function falls through to return -1.
 *
 * @param bandwidth_path Path to the cgroup v2 `cpu.max` interface file.
 * @return Effective CPU count (at least 1), or -1 when no quota applies or
 *         the platform is not Linux.
 */
std::int32_t GetCGroupV2Count(std::filesystem::path const& bandwidth_path) noexcept(true) {
  std::int32_t cnt{-1};
#if defined(__linux__)
  std::int32_t a{0}, b{0};

  auto warn = [] { LOG(WARNING) << "Invalid cgroupv2 file."; };
  try {
    std::ifstream fin{bandwidth_path, std::ios::in};
    fin >> a;
    fin >> b;
  } catch (std::exception const&) {
    warn();
    return cnt;
  }
  // Round up so a fractional quota (e.g. 0.5 CPU) still yields one thread.
  if (a > 0 && b > 0) {
    cnt = std::max(common::DivRoundUp(a, b), 1);
  }
#endif  // defined(__linux__)
  return cnt;
}
/**
 * @brief Obtain the CPU count allowed by the completely fair scheduler (CFS).
 *
 * Prefers the cgroup v2 interface (`cpu.max`) and falls back to the two
 * cgroup v1 files when v2 is not mounted.
 *
 * @return Effective CPU count, or -1 when no CFS limit can be determined.
 */
std::int32_t GetCfsCPUCount() noexcept {
  namespace fs = std::filesystem;
  // cgroup v2 exposes quota and period through a single bandwidth file.
  if (fs::path const v2_bandwidth{"/sys/fs/cgroup/cpu.max"}; fs::exists(v2_bandwidth)) {
    return GetCGroupV2Count(v2_bandwidth);
  }
  // cgroup v1 splits the same information across two files.
  fs::path const v1_quota{"/sys/fs/cgroup/cpu/cpu.cfs_quota_us"};
  fs::path const v1_period{"/sys/fs/cgroup/cpu/cpu.cfs_period_us"};
  if (fs::exists(v1_quota) && fs::exists(v1_period)) {
    return GetCGroupV1Count(v1_quota, v1_period);
  }
  return -1;
}
std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
// Don't use parallel if we are in a parallel region.
if (omp_in_parallel()) {
@@ -54,5 +104,4 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
n_threads = std::max(n_threads, 1);
return n_threads;
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -253,11 +253,6 @@ inline std::int32_t OmpGetThreadLimit() {
* \brief Get thread limit from CFS.
*
* This function has non-trivial overhead and should not be called repeatedly.
*
* Modified from
* github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
*
* MIT License: Copyright (c) 2016 Domagoj Šarić
*/
std::int32_t GetCfsCPUCount() noexcept;

View File

@@ -62,8 +62,8 @@ class Transform {
template <typename Functor>
struct Evaluator {
public:
Evaluator(Functor func, Range range, int32_t n_threads, int32_t device_idx)
: func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device_idx} {}
Evaluator(Functor func, Range range, int32_t n_threads, DeviceOrd device)
: func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device} {}
/*!
* \brief Evaluate the functor with input pointers to HostDeviceVector.
@@ -73,7 +73,7 @@ class Transform {
*/
template <typename... HDV>
void Eval(HDV... vectors) const {
bool on_device = device_ >= 0;
bool on_device = device_.IsCUDA();
if (on_device) {
LaunchCUDA(func_, vectors...);
@@ -118,11 +118,11 @@ class Transform {
}
// Recursive unpack for Shard.
template <typename T>
void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
void UnpackShard(DeviceOrd device, const HostDeviceVector<T> *vector) const {
vector->SetDevice(device);
}
template <typename Head, typename... Rest>
void UnpackShard(int device,
void UnpackShard(DeviceOrd device,
const HostDeviceVector<Head> *_vector,
const HostDeviceVector<Rest> *... _vectors) const {
_vector->SetDevice(device);
@@ -142,13 +142,7 @@ class Transform {
// granularity is used in data vector.
size_t shard_size = range_size;
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#endif
dh::safe_cuda(cudaSetDevice(device_.ordinal));
const int kGrids =
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
if (kGrids == 0) {
@@ -182,7 +176,7 @@ class Transform {
/*! \brief Range object specifying parallel threads index range. */
Range range_;
int32_t n_threads_;
int32_t device_;
DeviceOrd device_;
};
public:
@@ -200,8 +194,8 @@ class Transform {
*/
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range, int32_t n_threads,
int32_t device_idx) {
return Evaluator<Functor>{func, std::move(range), n_threads, device_idx};
DeviceOrd device) {
return Evaluator<Functor>{func, std::move(range), n_threads, device};
}
};

View File

@@ -20,7 +20,6 @@ namespace xgboost {
DMLC_REGISTER_PARAMETER(Context);
bst_d_ordinal_t constexpr Context::kCpuId;
std::int64_t constexpr Context::kDefaultSeed;
Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}
@@ -82,7 +81,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
return std::nullopt;
}
std::int32_t parsed_id{Context::kCpuId};
std::int32_t parsed_id{DeviceOrd::CPUOrdinal()};
auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id);
if (res.ec != std::errc()) {
return std::nullopt;
@@ -119,7 +118,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':');
DeviceOrd device;
device.ordinal = Context::InvalidOrdinal(); // mark it invalid for check.
device.ordinal = DeviceOrd::InvalidOrdinal(); // mark it invalid for check.
if (split_it == s_device.cend()) {
// no ordinal.
if (s_device == DeviceSym::CPU()) {
@@ -147,7 +146,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
device = DeviceOrd::CUDA(opt_id.value());
}
if (device.ordinal < Context::kCpuId) {
if (device.ordinal < DeviceOrd::CPUOrdinal()) {
fatal();
}
device = CUDAOrdinal(device, fail_on_invalid_gpu_id);
@@ -156,6 +155,28 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
}
} // namespace
// Stream insertion for a device ordinal; prints the device's name.
std::ostream& operator<<(std::ostream& os, DeviceOrd ord) {
  return os << ord.Name();
}
// Initialize the context from keyword arguments; any key not recognized by
// the parameter definition is reported as a fatal internal error.
void Context::Init(Args const& kwargs) {
  auto unknown = this->UpdateAllowUnknown(kwargs);
  if (unknown.empty()) {
    return;
  }
  // Collect every unrecognized key into a single message before aborting.
  std::stringstream ss;
  ss << "[Internal Error] Unknown parameters passed to the Context {";
  bool first = true;
  for (auto const& [k, _] : unknown) {
    if (!first) {
      ss << ", ";
    }
    first = false;
    ss << '"' << k << '"';
  }
  ss << "}\n";
  LOG(FATAL) << ss.str();
}
void Context::ConfigureGpuId(bool require_gpu) {
if (this->IsCPU() && require_gpu) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}});
@@ -178,7 +199,7 @@ void Context::SetDeviceOrdinal(Args const& kwargs) {
error::WarnDeprecatedGPUId();
auto opt_id = ParseInt(StringView{gpu_id_it->second});
CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second;
if (opt_id.value() > Context::kCpuId) {
if (opt_id.value() > DeviceOrd::CPUOrdinal()) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}});
} else {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}});
@@ -194,9 +215,9 @@ void Context::SetDeviceOrdinal(Args const& kwargs) {
this->SetDevice(new_d);
if (this->IsCPU()) {
CHECK_EQ(this->device_.ordinal, kCpuId);
CHECK_EQ(this->device_.ordinal, DeviceOrd::CPUOrdinal());
} else {
CHECK_GT(this->device_.ordinal, kCpuId);
CHECK_GT(this->device_.ordinal, DeviceOrd::CPUOrdinal());
}
}

28
src/data/adapter.cc Normal file
View File

@@ -0,0 +1,28 @@
/**
* Copyright 2019-2023, XGBoost Contributors
*/
#include "adapter.h"
#include "../c_api/c_api_error.h" // for API_BEGIN, API_END
#include "xgboost/c_api.h"
namespace xgboost::data {
template <typename DataIterHandle, typename XGBCallbackDataIterNext, typename XGBoostBatchCSR>
bool IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>::Next() {
  // Advance the external (e.g. JVM-provided) iterator by one CSR batch.
  // `next_callback_` invokes the lambda below with the next batch and returns
  // non-zero while more data is available.
  if ((*next_callback_)(
          data_handle_,
          [](void *handle, XGBoostBatchCSR batch) -> int {
            // C-API boundary: translate any exception into an error code.
            API_BEGIN();
            static_cast<IteratorAdapter *>(handle)->SetData(batch);
            API_END();
          },
          this) != 0) {
    at_first_ = false;  // at least one batch consumed; BeforeFirst() is now illegal
    return true;
  } else {
    return false;
  }
}

// Explicit instantiation for the concrete callback types used by the C API.
template class IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>;
} // namespace xgboost::data

View File

@@ -1,5 +1,5 @@
/*!
* Copyright (c) 2019~2021 by Contributors
/**
* Copyright 2019-2023, XGBoost Contributors
* \file adapter.h
*/
#ifndef XGBOOST_DATA_ADAPTER_H_
@@ -16,11 +16,9 @@
#include <utility> // std::move
#include <vector>
#include "../c_api/c_api_error.h"
#include "../common/error_msg.h" // for MaxFeatureSize
#include "../common/math.h"
#include "array_interface.h"
#include "arrow-cdi.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/logging.h"
@@ -743,8 +741,10 @@ class FileAdapter : dmlc::DataIter<FileAdapterBatch> {
dmlc::Parser<uint32_t>* parser_;
};
/*! \brief Data iterator that takes callback to return data, used in JVM package for
* accepting data iterator. */
/**
* @brief Data iterator that takes callback to return data, used in JVM package for accepting data
* iterator.
*/
template <typename DataIterHandle, typename XGBCallbackDataIterNext, typename XGBoostBatchCSR>
class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
public:
@@ -758,23 +758,9 @@ class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
CHECK(at_first_) << "Cannot reset IteratorAdapter";
}
bool Next() override {
if ((*next_callback_)(
data_handle_,
[](void *handle, XGBoostBatchCSR batch) -> int {
API_BEGIN();
static_cast<IteratorAdapter *>(handle)->SetData(batch);
API_END();
},
this) != 0) {
at_first_ = false;
return true;
} else {
return false;
}
}
[[nodiscard]] bool Next() override;
FileAdapterBatch const& Value() const override {
[[nodiscard]] FileAdapterBatch const& Value() const override {
return *batch_.get();
}
@@ -822,12 +808,12 @@ class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
block_.index = dmlc::BeginPtr(index_);
block_.value = dmlc::BeginPtr(value_);
batch_.reset(new FileAdapterBatch(&block_, row_offset_));
batch_ = std::make_unique<FileAdapterBatch>(&block_, row_offset_);
row_offset_ += offset_.size() - 1;
}
size_t NumColumns() const { return columns_; }
size_t NumRows() const { return kAdapterUnknownSize; }
[[nodiscard]] std::size_t NumColumns() const { return columns_; }
[[nodiscard]] std::size_t NumRows() const { return kAdapterUnknownSize; }
private:
std::vector<size_t> offset_;
@@ -849,356 +835,6 @@ class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
std::unique_ptr<FileAdapterBatch> batch_;
};
// Primitive element types an Arrow column may hold; values are mapped from
// Arrow format strings by ArrowSchemaImporter::FormatMap.
enum ColumnDType : uint8_t {
  kUnknown,
  kInt8,
  kUInt8,
  kInt16,
  kUInt16,
  kInt32,
  kUInt32,
  kInt64,
  kUInt64,
  kFloat,
  kDouble
};
// Abstract base for a single Arrow column: stores the column's index, length,
// null count, and validity bitmap.  Typed element access is implemented by
// the PrimitiveColumn<T> subclasses.
class Column {
 public:
  Column() = default;
  Column(size_t col_idx, size_t length, size_t null_count, const uint8_t* bitmap)
      : col_idx_{col_idx}, length_{length}, null_count_{null_count}, bitmap_{bitmap} {}

  virtual ~Column() = default;

  // Columns reference externally-owned Arrow buffers; copying/moving is
  // disallowed to avoid aliasing those buffers.
  Column(const Column&) = delete;
  Column& operator=(const Column&) = delete;
  Column(Column&&) = delete;
  Column& operator=(Column&&) = delete;

  // whether the valid bit is set for this element; a null bitmap means every
  // element is treated as valid.
  bool IsValid(size_t row_idx) const {
    return (!bitmap_ || (bitmap_[row_idx/8] & (1 << (row_idx%8))));
  }

  // Typed accessors implemented per concrete value type.
  virtual COOTuple GetElement(size_t row_idx) const = 0;
  virtual bool IsValidElement(size_t row_idx) const = 0;
  virtual std::vector<float> AsFloatVector() const = 0;
  virtual std::vector<uint64_t> AsUint64Vector() const = 0;

  // Number of rows in this column.
  size_t Length() const { return length_; }

 protected:
  size_t col_idx_;
  size_t length_;
  size_t null_count_;
  const uint8_t* bitmap_;  // validity bitmap, 1 bit per row; may be null
};
// Only columns of primitive types are supported. An ArrowColumnarBatch is a
// collection of std::shared_ptr<PrimitiveColumn>. These columns can be of different data types.
// Hence, PrimitiveColumn is a class template; and all concrete PrimitiveColumns
// derive from the abstract class Column.
// A column whose elements are a single primitive type T.  Invalid or missing
// elements are surfaced as NaN through GetElement.
template <typename T>
class PrimitiveColumn : public Column {
  static constexpr float kNaN = std::numeric_limits<float>::quiet_NaN();

 public:
  PrimitiveColumn(size_t idx, size_t length, size_t null_count,
                  const uint8_t* bitmap, const T* data, float missing)
      : Column{idx, length, null_count, bitmap}, data_{data}, missing_{missing} {}

  COOTuple GetElement(size_t row_idx) const override {
    CHECK(data_ && row_idx < length_) << "Column is empty or out-of-bound index of the column";
    float value = IsValidElement(row_idx) ? static_cast<float>(data_[row_idx]) : kNaN;
    return {row_idx, col_idx_, value};
  }

  bool IsValidElement(size_t row_idx) const override {
    if (!IsValid(row_idx)) {
      return false;
    }
    T const raw = data_[row_idx];
    // std::isfinite needs to cast to double to prevent msvc report error
    return std::isfinite(static_cast<double>(raw)) && static_cast<float>(raw) != missing_;
  }

  std::vector<float> AsFloatVector() const override {
    CHECK(data_) << "Column is empty";
    std::vector<float> converted;
    converted.reserve(length_);
    for (size_t i = 0; i < length_; ++i) {
      converted.push_back(static_cast<float>(data_[i]));
    }
    return converted;
  }

  std::vector<uint64_t> AsUint64Vector() const override {
    CHECK(data_) << "Column is empty";
    std::vector<uint64_t> converted;
    converted.reserve(length_);
    for (size_t i = 0; i < length_; ++i) {
      converted.push_back(static_cast<uint64_t>(data_[i]));
    }
    return converted;
  }

 private:
  const T* data_;
  float missing_;  // user specified missing value
};
// Per-column metadata gathered while importing an Arrow schema.
struct ColumnarMetaInfo {
  // data type of the column
  ColumnDType type{ColumnDType::kUnknown};
  // location of the column in an Arrow record batch; -1 means not present
  // (CreateColumn skips such columns).
  int64_t loc{-1};
};
// Imports an Arrow schema (Arrow C data interface) once and records, for each
// child column, its element type and position in subsequent record batches.
struct ArrowSchemaImporter {
  std::vector<ColumnarMetaInfo> columns;

  // map Arrow format strings to types; only the leading character matters for
  // the primitive formats supported here.
  static ColumnDType FormatMap(char const* format_str) {
    CHECK(format_str) << "Format string cannot be empty";
    switch (format_str[0]) {
      case 'c':
        return ColumnDType::kInt8;
      case 'C':
        return ColumnDType::kUInt8;
      case 's':
        return ColumnDType::kInt16;
      case 'S':
        return ColumnDType::kUInt16;
      case 'i':
        return ColumnDType::kInt32;
      case 'I':
        return ColumnDType::kUInt32;
      case 'l':
        return ColumnDType::kInt64;
      case 'L':
        return ColumnDType::kUInt64;
      case 'f':
        return ColumnDType::kFloat;
      case 'g':
        return ColumnDType::kDouble;
      default:
        CHECK(false) << "Column data type not supported by XGBoost";
        // Unreachable after CHECK(false); keeps compilers happy.
        return ColumnDType::kUnknown;
    }
  }

  // Import the top-level schema (no-op when `schema` is null).  Releases the
  // exported schema after copying what is needed, per the Arrow C data
  // interface ownership contract.
  void Import(struct ArrowSchema *schema) {
    if (schema) {
      // "+s" is Arrow's format string for a struct type, i.e. a record batch
      // of named child columns.
      CHECK(std::string(schema->format) == "+s");  // NOLINT
      CHECK(columns.empty());
      for (auto i = 0; i < schema->n_children; ++i) {
        std::string name{schema->children[i]->name};
        ColumnDType type = FormatMap(schema->children[i]->format);
        ColumnarMetaInfo col_info{type, i};
        columns.push_back(col_info);
      }
      if (schema->release) {
        schema->release(schema);
      }
    }
  }
};
// Wraps one Arrow record batch (Arrow C data interface) and materializes its
// children as typed PrimitiveColumn objects.  Owns the release callback of
// the imported ArrowArray (released exactly once, in the destructor).
class ArrowColumnarBatch {
 public:
  ArrowColumnarBatch(struct ArrowArray *rb, struct ArrowSchemaImporter* schema)
      : rb_{rb}, schema_{schema} {
    CHECK(rb_) << "Cannot import non-existent record batch";
    CHECK(!schema_->columns.empty()) << "Cannot import record batch without a schema";
  }

  // Build typed columns and the per-row element offsets; returns the number
  // of valid elements in the batch.
  size_t Import(float missing) {
    auto& infov = schema_->columns;
    for (size_t i = 0; i < infov.size(); ++i) {
      columns_.push_back(CreateColumn(i, infov[i], missing));
    }

    // Compute the starting location for every row in this batch
    auto batch_size = rb_->length;
    auto num_columns = columns_.size();
    row_offsets_.resize(batch_size + 1, 0);
    for (auto i = 0; i < batch_size; ++i) {
      row_offsets_[i+1] = row_offsets_[i];
      for (size_t j = 0; j < num_columns; ++j) {
        // Only valid (non-null, finite, non-missing) elements are counted.
        if (GetColumn(j).IsValidElement(i)) {
          row_offsets_[i+1]++;
        }
      }
    }
    // return number of elements in the batch
    return row_offsets_.back();
  }

  // Non-copyable/non-movable: the destructor must release rb_ exactly once.
  ArrowColumnarBatch(const ArrowColumnarBatch&) = delete;
  ArrowColumnarBatch& operator=(const ArrowColumnarBatch&) = delete;
  ArrowColumnarBatch(ArrowColumnarBatch&&) = delete;
  ArrowColumnarBatch& operator=(ArrowColumnarBatch&&) = delete;

  virtual ~ArrowColumnarBatch() {
    // Honor the Arrow C data interface: invoke the producer's release
    // callback and null the pointer so it can't be released twice.
    if (rb_ && rb_->release) {
      rb_->release(rb_);
      rb_ = nullptr;
    }
    columns_.clear();
  }

  // Number of rows in the batch.
  size_t Size() const { return rb_ ? rb_->length : 0; }
  size_t NumColumns() const { return columns_.size(); }
  // Number of valid elements; only meaningful after Import().
  size_t NumElements() const { return row_offsets_.back(); }

  const Column& GetColumn(size_t col_idx) const {
    return *columns_[col_idx];
  }

  // Offset this batch's row offsets by the element count of preceding batches.
  void ShiftRowOffsets(size_t batch_offset) {
    std::transform(row_offsets_.begin(), row_offsets_.end(), row_offsets_.begin(),
                   [=](size_t c) { return c + batch_offset; });
  }

  const std::vector<size_t>& RowOffsets() const { return row_offsets_; }

 private:
  // Construct a typed column view over child `info.loc` of the record batch.
  // Returns nullptr for absent columns (loc < 0) or unsupported types.
  std::shared_ptr<Column> CreateColumn(size_t idx,
                                       ColumnarMetaInfo info,
                                       float missing) const {
    if (info.loc < 0) {
      return nullptr;
    }
    auto loc_in_batch = info.loc;
    auto length = rb_->length;
    auto null_count = rb_->null_count;
    // Arrow primitive layout: buffer 0 is the validity bitmap, buffer 1 the
    // values; either may be null.
    auto buffers0 = rb_->children[loc_in_batch]->buffers[0];
    auto buffers1 = rb_->children[loc_in_batch]->buffers[1];
    const uint8_t* bitmap = buffers0 ? reinterpret_cast<const uint8_t*>(buffers0) : nullptr;
    const uint8_t* data = buffers1 ? reinterpret_cast<const uint8_t*>(buffers1) : nullptr;

    // if null_count is not computed (Arrow uses -1 for "unknown"), compute it
    // here by counting cleared validity bits.
    if (null_count < 0) {
      if (!bitmap) {
        null_count = 0;
      } else {
        null_count = length;
        for (auto i = 0; i < length; ++i) {
          if (bitmap[i/8] & (1 << (i%8))) {
            null_count--;
          }
        }
      }
    }

    switch (info.type) {
      case ColumnDType::kInt8:
        return std::make_shared<PrimitiveColumn<int8_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const int8_t*>(data), missing);
      case ColumnDType::kUInt8:
        return std::make_shared<PrimitiveColumn<uint8_t>>(
            idx, length, null_count, bitmap, data, missing);
      case ColumnDType::kInt16:
        return std::make_shared<PrimitiveColumn<int16_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const int16_t*>(data), missing);
      case ColumnDType::kUInt16:
        return std::make_shared<PrimitiveColumn<uint16_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const uint16_t*>(data), missing);
      case ColumnDType::kInt32:
        return std::make_shared<PrimitiveColumn<int32_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const int32_t*>(data), missing);
      case ColumnDType::kUInt32:
        return std::make_shared<PrimitiveColumn<uint32_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const uint32_t*>(data), missing);
      case ColumnDType::kInt64:
        return std::make_shared<PrimitiveColumn<int64_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const int64_t*>(data), missing);
      case ColumnDType::kUInt64:
        return std::make_shared<PrimitiveColumn<uint64_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const uint64_t*>(data), missing);
      case ColumnDType::kFloat:
        return std::make_shared<PrimitiveColumn<float>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const float*>(data), missing);
      case ColumnDType::kDouble:
        return std::make_shared<PrimitiveColumn<double>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const double*>(data), missing);
      default:
        return nullptr;
    }
  }

  struct ArrowArray* rb_;            // imported record batch; owned until release
  struct ArrowSchemaImporter* schema_;  // non-owning; shared by all batches
  std::vector<std::shared_ptr<Column>> columns_;
  std::vector<size_t> row_offsets_;  // cumulative valid-element count per row
};
using ArrowColumnarBatchVec = std::vector<std::unique_ptr<ArrowColumnarBatch>>;

// Adapter that pulls Arrow record batches through an XGDMatrixCallbackNext
// callback, accumulating up to `nbatches_` of them per call to Next().
class RecordBatchesIterAdapter: public dmlc::DataIter<ArrowColumnarBatchVec> {
 public:
  RecordBatchesIterAdapter(XGDMatrixCallbackNext* next_callback, int nbatch)
      : next_callback_{next_callback}, nbatches_{nbatch} {}

  // Single-pass iterator: rewinding after the first batch is an error.
  void BeforeFirst() override {
    CHECK(at_first_) << "Cannot reset RecordBatchesIterAdapter";
  }

  // Collect up to nbatches_ record batches; returns false once the producer
  // is exhausted and nothing was collected.
  bool Next() override {
    batches_.clear();
    while (batches_.size() < static_cast<size_t>(nbatches_) && (*next_callback_)(this) != 0) {
      at_first_ = false;
    }
    if (batches_.size() > 0) {
      return true;
    } else {
      return false;
    }
  }

  // Called back by the producer with the next record batch and (possibly) a schema.
  void SetData(struct ArrowArray* rb, struct ArrowSchema* schema) {
    // Schema is only imported once at the beginning, regardless of how many
    // batches are coming.
    // But even if the schema is not imported, we still need to release its C
    // data exported from Arrow.
    if (at_first_ && schema) {
      schema_.Import(schema);
    } else {
      if (schema && schema->release) {
        schema->release(schema);
      }
    }
    if (rb) {
      batches_.push_back(std::make_unique<ArrowColumnarBatch>(rb, &schema_));
    }
  }

  const ArrowColumnarBatchVec& Value() const override {
    return batches_;
  }

  size_t NumColumns() const { return schema_.columns.size(); }
  // Total row count is unknown until the stream is fully consumed.
  size_t NumRows() const { return kAdapterUnknownSize; }

 private:
  XGDMatrixCallbackNext *next_callback_;
  bool at_first_{true};  // still before the first batch; guards schema import and reset
  int nbatches_;
  struct ArrowSchemaImporter schema_;
  ArrowColumnarBatchVec batches_;
};
class SparsePageAdapterBatch {
HostSparsePageView page_;

View File

@@ -16,7 +16,7 @@
#include <utility>
#include <vector>
#include "../common/bitfield.h"
#include "../common/bitfield.h" // for RBitField8
#include "../common/common.h"
#include "../common/error_msg.h" // for NoF128
#include "xgboost/base.h"
@@ -106,7 +106,20 @@ struct ArrayInterfaceErrors {
*/
class ArrayInterfaceHandler {
public:
enum Type : std::int8_t { kF2, kF4, kF8, kF16, kI1, kI2, kI4, kI8, kU1, kU2, kU4, kU8 };
enum Type : std::int8_t {
kF2 = 0,
kF4 = 1,
kF8 = 2,
kF16 = 3,
kI1 = 4,
kI2 = 5,
kI4 = 6,
kI8 = 7,
kU1 = 8,
kU2 = 9,
kU4 = 10,
kU8 = 11,
};
template <typename PtrType>
static PtrType GetPtrFromArrayData(Object::Map const &obj) {
@@ -589,6 +602,57 @@ class ArrayInterface {
ArrayInterfaceHandler::Type type{ArrayInterfaceHandler::kF16};
};
/**
 * @brief Invoke `dispatch` with a default-constructed value of the C++ type
 *        matching a runtime array dtype.
 *
 * @param dtype    Runtime type tag from the array interface.
 * @param dispatch Callable invocable with any of the supported element types.
 * @return Whatever `dispatch` returns; a value-initialized result for the
 *         unreachable fall-through path.
 */
template <typename Fn>
auto DispatchDType(ArrayInterfaceHandler::Type dtype, Fn dispatch) {
  switch (dtype) {
    case ArrayInterfaceHandler::kF2: {
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
      return dispatch(__half{});
#else
      // No host-side half type available without a GPU toolchain.
      LOG(FATAL) << "half type is only supported for CUDA input.";
      break;
#endif
    }
    case ArrayInterfaceHandler::kF4: {
      return dispatch(float{});
    }
    case ArrayInterfaceHandler::kF8: {
      return dispatch(double{});
    }
    case ArrayInterfaceHandler::kF16: {
      using T = long double;
      // long double must actually be 128-bit to represent f16-widened data.
      CHECK(sizeof(T) == 16) << error::NoF128();
      return dispatch(T{});
    }
    case ArrayInterfaceHandler::kI1: {
      return dispatch(std::int8_t{});
    }
    case ArrayInterfaceHandler::kI2: {
      return dispatch(std::int16_t{});
    }
    case ArrayInterfaceHandler::kI4: {
      return dispatch(std::int32_t{});
    }
    case ArrayInterfaceHandler::kI8: {
      return dispatch(std::int64_t{});
    }
    case ArrayInterfaceHandler::kU1: {
      return dispatch(std::uint8_t{});
    }
    case ArrayInterfaceHandler::kU2: {
      return dispatch(std::uint16_t{});
    }
    case ArrayInterfaceHandler::kU4: {
      return dispatch(std::uint32_t{});
    }
    case ArrayInterfaceHandler::kU8: {
      return dispatch(std::uint64_t{});
    }
  }
  // Unreachable for valid enum values.  std::result_of_t is deprecated in
  // C++17 and removed in C++20; use std::invoke_result_t instead.
  return std::invoke_result_t<Fn, std::int8_t>();
}
template <std::int32_t D, typename Fn>
void DispatchDType(ArrayInterface<D> const array, DeviceOrd device, Fn fn) {
// Only used for cuDF at the moment.
@@ -604,60 +668,7 @@ void DispatchDType(ArrayInterface<D> const array, DeviceOrd device, Fn fn) {
std::numeric_limits<std::size_t>::max()},
array.shape, array.strides, device});
};
switch (array.type) {
case ArrayInterfaceHandler::kF2: {
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
dispatch(__half{});
#endif
break;
}
case ArrayInterfaceHandler::kF4: {
dispatch(float{});
break;
}
case ArrayInterfaceHandler::kF8: {
dispatch(double{});
break;
}
case ArrayInterfaceHandler::kF16: {
using T = long double;
CHECK(sizeof(long double) == 16) << error::NoF128();
dispatch(T{});
break;
}
case ArrayInterfaceHandler::kI1: {
dispatch(std::int8_t{});
break;
}
case ArrayInterfaceHandler::kI2: {
dispatch(std::int16_t{});
break;
}
case ArrayInterfaceHandler::kI4: {
dispatch(std::int32_t{});
break;
}
case ArrayInterfaceHandler::kI8: {
dispatch(std::int64_t{});
break;
}
case ArrayInterfaceHandler::kU1: {
dispatch(std::uint8_t{});
break;
}
case ArrayInterfaceHandler::kU2: {
dispatch(std::uint16_t{});
break;
}
case ArrayInterfaceHandler::kU4: {
dispatch(std::uint32_t{});
break;
}
case ArrayInterfaceHandler::kU8: {
dispatch(std::uint64_t{});
break;
}
}
DispatchDType(array.type, dispatch);
}
/**

View File

@@ -1,66 +0,0 @@
/* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#pragma once
#include <cstdint>
#ifdef __cplusplus
extern "C" {
#endif
#define ARROW_FLAG_DICTIONARY_ORDERED 1
#define ARROW_FLAG_NULLABLE 2
#define ARROW_FLAG_MAP_KEYS_SORTED 4
// Arrow C data interface ABI struct.  The field layout is fixed by the Arrow
// specification and must not be altered.
struct ArrowSchema {
  // Array type description
  const char* format;
  const char* name;
  const char* metadata;
  int64_t flags;
  int64_t n_children;
  struct ArrowSchema** children;
  struct ArrowSchema* dictionary;
  // Release callback; the consumer must call it exactly once when done.
  void (*release)(struct ArrowSchema*);
  // Opaque producer-specific data
  void* private_data;
};
// Arrow C data interface ABI struct for array data.  Layout is fixed by the
// Arrow specification and must not be altered.
struct ArrowArray {
  // Array data description
  int64_t length;
  // null_count may be -1 meaning "not computed yet".
  int64_t null_count;
  int64_t offset;
  int64_t n_buffers;
  int64_t n_children;
  const void** buffers;
  struct ArrowArray** children;
  struct ArrowArray* dictionary;
  // Release callback; the consumer must call it exactly once when done.
  void (*release)(struct ArrowArray*);
  // Opaque producer-specific data
  void* private_data;
};
#ifdef __cplusplus
}
#endif

View File

@@ -635,22 +635,39 @@ void MetaInfo::GetInfo(char const* key, bst_ulong* out_len, DataType dtype,
}
void MetaInfo::SetFeatureInfo(const char* key, const char **info, const bst_ulong size) {
if (size != 0 && this->num_col_ != 0) {
if (size != 0 && this->num_col_ != 0 && !IsColumnSplit()) {
CHECK_EQ(size, this->num_col_) << "Length of " << key << " must be equal to number of columns.";
CHECK(info);
}
if (!std::strcmp(key, "feature_type")) {
feature_type_names.clear();
auto& h_feature_types = feature_types.HostVector();
for (size_t i = 0; i < size; ++i) {
auto elem = info[i];
feature_type_names.emplace_back(elem);
}
if (IsColumnSplit()) {
feature_type_names = collective::AllgatherStrings(feature_type_names);
CHECK_EQ(feature_type_names.size(), num_col_)
<< "Length of " << key << " must be equal to number of columns.";
}
auto& h_feature_types = feature_types.HostVector();
LoadFeatureType(feature_type_names, &h_feature_types);
} else if (!std::strcmp(key, "feature_name")) {
feature_names.clear();
for (size_t i = 0; i < size; ++i) {
feature_names.emplace_back(info[i]);
if (IsColumnSplit()) {
std::vector<std::string> local_feature_names{};
auto const rank = collective::GetRank();
for (std::size_t i = 0; i < size; ++i) {
auto elem = std::to_string(rank) + "." + info[i];
local_feature_names.emplace_back(elem);
}
feature_names = collective::AllgatherStrings(local_feature_names);
CHECK_EQ(feature_names.size(), num_col_)
<< "Length of " << key << " must be equal to number of columns.";
} else {
feature_names.clear();
for (size_t i = 0; i < size; ++i) {
feature_names.emplace_back(info[i]);
}
}
} else {
LOG(FATAL) << "Unknown feature info name: " << key;
@@ -687,13 +704,13 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
linalg::Stack(&this->labels, that.labels);
this->weights_.SetDevice(that.weights_.DeviceIdx());
this->weights_.SetDevice(that.weights_.Device());
this->weights_.Extend(that.weights_);
this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.DeviceIdx());
this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.Device());
this->labels_lower_bound_.Extend(that.labels_lower_bound_);
this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.DeviceIdx());
this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.Device());
this->labels_upper_bound_.Extend(that.labels_upper_bound_);
linalg::Stack(&this->base_margin_, that.base_margin_);
@@ -723,13 +740,13 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
}
if (!that.feature_weights.Empty()) {
this->feature_weights.Resize(that.feature_weights.Size());
this->feature_weights.SetDevice(that.feature_weights.DeviceIdx());
this->feature_weights.SetDevice(that.feature_weights.Device());
this->feature_weights.Copy(that.feature_weights);
}
}
void MetaInfo::SynchronizeNumberOfColumns() {
if (IsVerticalFederated()) {
if (IsColumnSplit()) {
collective::Allreduce<collective::Operation::kSum>(&num_col_, 1);
} else {
collective::Allreduce<collective::Operation::kMax>(&num_col_, 1);
@@ -738,22 +755,22 @@ void MetaInfo::SynchronizeNumberOfColumns() {
namespace {
template <typename T>
void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) {
bool valid = v.Device().IsCPU() || device == Context::kCpuId || v.DeviceIdx() == device;
void CheckDevice(DeviceOrd device, HostDeviceVector<T> const& v) {
bool valid = v.Device().IsCPU() || device.IsCPU() || v.Device() == device;
if (!valid) {
LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
"the booster. The device ordinal of the data is: "
<< v.DeviceIdx() << "; the device ordinal of the Booster is: " << device;
<< v.Device() << "; the device ordinal of the Booster is: " << device;
}
}
template <typename T, std::int32_t D>
void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) {
void CheckDevice(DeviceOrd device, linalg::Tensor<T, D> const& v) {
CheckDevice(device, *v.Data());
}
} // anonymous namespace
void MetaInfo::Validate(std::int32_t device) const {
void MetaInfo::Validate(DeviceOrd device) const {
if (group_ptr_.size() != 0 && weights_.Size() != 0) {
CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << error::GroupWeight();
return;
@@ -850,14 +867,6 @@ DMatrix* TryLoadBinary(std::string fname, bool silent) {
} // namespace
DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode) {
auto need_split = false;
if (collective::IsFederated()) {
LOG(CONSOLE) << "XGBoost federated mode detected, not splitting data among workers";
} else if (collective::IsDistributed()) {
LOG(CONSOLE) << "XGBoost distributed mode detected, will split data among workers";
need_split = true;
}
std::string fname, cache_file;
auto dlm_pos = uri.find('#');
if (dlm_pos != std::string::npos) {
@@ -865,24 +874,6 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
fname = uri.substr(0, dlm_pos);
CHECK_EQ(cache_file.find('#'), std::string::npos)
<< "Only one `#` is allowed in file path for cache file specification.";
if (need_split && data_split_mode == DataSplitMode::kRow) {
std::ostringstream os;
std::vector<std::string> cache_shards = common::Split(cache_file, ':');
for (size_t i = 0; i < cache_shards.size(); ++i) {
size_t pos = cache_shards[i].rfind('.');
if (pos == std::string::npos) {
os << cache_shards[i] << ".r" << collective::GetRank() << "-"
<< collective::GetWorldSize();
} else {
os << cache_shards[i].substr(0, pos) << ".r" << collective::GetRank() << "-"
<< collective::GetWorldSize() << cache_shards[i].substr(pos, cache_shards[i].length());
}
if (i + 1 != cache_shards.size()) {
os << ':';
}
}
cache_file = os.str();
}
} else {
fname = uri;
}
@@ -894,19 +885,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
}
int partid = 0, npart = 1;
if (need_split && data_split_mode == DataSplitMode::kRow) {
partid = collective::GetRank();
npart = collective::GetWorldSize();
} else {
// test option to load in part
npart = 1;
}
if (npart != 1) {
LOG(CONSOLE) << "Load part of data " << partid << " of " << npart << " parts";
}
DMatrix* dmat{nullptr};
DMatrix* dmat{};
if (cache_file.empty()) {
fname = data::ValidateFileFormat(fname);
@@ -916,6 +895,8 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
cache_file, data_split_mode);
} else {
CHECK(data_split_mode != DataSplitMode::kCol)
<< "Column-wise data split is not supported for external memory.";
data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart)};
dmat = new data::SparsePageDMatrix{&iter,
iter.Proxy(),
@@ -926,17 +907,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
cache_file};
}
if (need_split && data_split_mode == DataSplitMode::kCol) {
if (!cache_file.empty()) {
LOG(FATAL) << "Column-wise data split is not support for external memory.";
}
LOG(CONSOLE) << "Splitting data by column";
auto* sliced = dmat->SliceCol(npart, partid);
delete dmat;
return sliced;
} else {
return dmat;
}
return dmat;
}
template <typename DataIterHandle, typename DMatrixHandle, typename DataIterResetCallback,
@@ -1011,9 +982,6 @@ template DMatrix* DMatrix::Create<data::CSCArrayAdapter>(data::CSCArrayAdapter*
template DMatrix* DMatrix::Create(
data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
float missing, int nthread, const std::string& cache_prefix, DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::RecordBatchesIterAdapter>(
data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&,
DataSplitMode data_split_mode);
SparsePage SparsePage::GetTranspose(int num_columns, int32_t n_threads) const {
SparsePage transpose;

View File

@@ -33,13 +33,13 @@ template <typename T, int32_t D>
void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tensor<T, D>* p_out) {
ArrayInterface<D> array(arr_interface);
if (array.n == 0) {
p_out->SetDevice(0);
p_out->SetDevice(DeviceOrd::CUDA(0));
p_out->Reshape(array.shape);
return;
}
CHECK_EQ(array.valid.Capacity(), 0)
<< "Meta info like label or weight can not have missing value.";
auto ptr_device = SetDeviceToPtr(array.data);
auto ptr_device = DeviceOrd::CUDA(SetDeviceToPtr(array.data));
p_out->SetDevice(ptr_device);
if (array.is_contiguous && array.type == ToDType<T>::kType) {
@@ -55,7 +55,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
return;
}
p_out->Reshape(array.shape);
auto t = p_out->View(DeviceOrd::CUDA(ptr_device));
auto t = p_out->View(ptr_device);
linalg::ElementWiseTransformDevice(
t,
[=] __device__(size_t i, T) {
@@ -91,7 +91,7 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
});
dh::caching_device_vector<bool> flag(1);
auto d_flag = dh::ToSpan(flag);
auto d = SetDeviceToPtr(array_interface.data);
auto d = DeviceOrd::CUDA(SetDeviceToPtr(array_interface.data));
dh::LaunchN(1, [=] __device__(size_t) { d_flag[0] = true; });
dh::LaunchN(array_interface.Shape(0) - 1, [=] __device__(size_t i) {
auto typed = TypedIndex<uint32_t, 1>{array_interface};

View File

@@ -28,8 +28,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
CudfAdapterBatch(common::Span<ArrayInterface<1>> columns, size_t num_rows)
: columns_(columns),
num_rows_(num_rows) {}
size_t Size() const { return num_rows_ * columns_.size(); }
__device__ __forceinline__ COOTuple GetElement(size_t idx) const {
[[nodiscard]] std::size_t Size() const { return num_rows_ * columns_.size(); }
[[nodiscard]] __device__ __forceinline__ COOTuple GetElement(size_t idx) const {
size_t column_idx = idx % columns_.size();
size_t row_idx = idx / columns_.size();
auto const& column = columns_[column_idx];
@@ -39,7 +39,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
return {row_idx, column_idx, value};
}
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
[[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
auto const& column = columns_[fidx];
float value = column.valid.Data() == nullptr || column.valid.Check(ridx)
? column(ridx)
@@ -47,8 +47,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
return value;
}
XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }
private:
common::Span<ArrayInterface<1>> columns_;
@@ -120,16 +120,14 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
return;
}
device_idx_ = dh::CudaGetPointerDevice(first_column.data);
CHECK_NE(device_idx_, Context::kCpuId);
dh::safe_cuda(cudaSetDevice(device_idx_));
device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(first_column.data));
CHECK(device_.IsCUDA());
dh::safe_cuda(cudaSetDevice(device_.ordinal));
for (auto& json_col : json_columns) {
auto column = ArrayInterface<1>(get<Object const>(json_col));
columns.push_back(column);
num_rows_ = std::max(num_rows_, column.Shape(0));
CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data))
CHECK_EQ(device_.ordinal, dh::CudaGetPointerDevice(column.data))
<< "All columns should use the same device.";
CHECK_EQ(num_rows_, column.Shape(0))
<< "All columns should have same number of rows.";
@@ -145,15 +143,15 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
return batch_;
}
size_t NumRows() const { return num_rows_; }
size_t NumColumns() const { return columns_.size(); }
int32_t DeviceIdx() const { return device_idx_; }
[[nodiscard]] std::size_t NumRows() const { return num_rows_; }
[[nodiscard]] std::size_t NumColumns() const { return columns_.size(); }
[[nodiscard]] DeviceOrd Device() const { return device_; }
private:
CudfAdapterBatch batch_;
dh::device_vector<ArrayInterface<1>> columns_;
size_t num_rows_{0};
int32_t device_idx_{Context::kCpuId};
DeviceOrd device_{DeviceOrd::CPU()};
};
class CupyAdapterBatch : public detail::NoMetaInfo {
@@ -161,22 +159,22 @@ class CupyAdapterBatch : public detail::NoMetaInfo {
CupyAdapterBatch() = default;
explicit CupyAdapterBatch(ArrayInterface<2> array_interface)
: array_interface_(std::move(array_interface)) {}
size_t Size() const {
[[nodiscard]] std::size_t Size() const {
return array_interface_.Shape(0) * array_interface_.Shape(1);
}
__device__ COOTuple GetElement(size_t idx) const {
[[nodiscard]]__device__ COOTuple GetElement(size_t idx) const {
size_t column_idx = idx % array_interface_.Shape(1);
size_t row_idx = idx / array_interface_.Shape(1);
float value = array_interface_(row_idx, column_idx);
return {row_idx, column_idx, value};
}
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
[[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
float value = array_interface_(ridx, fidx);
return value;
}
XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
private:
ArrayInterface<2> array_interface_;
@@ -191,29 +189,28 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
if (array_interface_.Shape(0) == 0) {
return;
}
device_idx_ = dh::CudaGetPointerDevice(array_interface_.data);
CHECK_NE(device_idx_, Context::kCpuId);
device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(array_interface_.data));
CHECK(device_.IsCUDA());
}
explicit CupyAdapter(std::string cuda_interface_str)
: CupyAdapter{StringView{cuda_interface_str}} {}
const CupyAdapterBatch& Value() const override { return batch_; }
[[nodiscard]] const CupyAdapterBatch& Value() const override { return batch_; }
size_t NumRows() const { return array_interface_.Shape(0); }
size_t NumColumns() const { return array_interface_.Shape(1); }
int32_t DeviceIdx() const { return device_idx_; }
[[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
[[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }
[[nodiscard]] DeviceOrd Device() const { return device_; }
private:
ArrayInterface<2> array_interface_;
CupyAdapterBatch batch_;
int32_t device_idx_ {Context::kCpuId};
DeviceOrd device_{DeviceOrd::CPU()};
};
// Returns maximum row length
template <typename AdapterBatchT>
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, DeviceOrd device,
float missing) {
dh::safe_cuda(cudaSetDevice(device_idx));
dh::safe_cuda(cudaSetDevice(device.ordinal));
IsValidFunctor is_valid(missing);
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));

View File

@@ -98,23 +98,18 @@ __global__ void CompressBinEllpackKernel(
}
// Construct an ELLPACK matrix with the given number of empty rows.
EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
bool is_dense, size_t row_stride,
size_t n_rows)
: is_dense(is_dense),
cuts_(std::move(cuts)),
row_stride(row_stride),
n_rows(n_rows) {
EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense,
size_t row_stride, size_t n_rows)
: is_dense(is_dense), cuts_(std::move(cuts)), row_stride(row_stride), n_rows(n_rows) {
monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(device));
dh::safe_cuda(cudaSetDevice(device.ordinal));
monitor_.Start("InitCompressedData");
InitCompressedData(device);
monitor_.Stop("InitCompressedData");
}
EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts,
const SparsePage &page, bool is_dense,
size_t row_stride,
common::Span<FeatureType const> feature_types)
@@ -128,7 +123,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
: is_dense(dmat->IsDense()) {
monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
n_rows = dmat->Info().num_row_;
@@ -143,15 +138,15 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
monitor_.Stop("Quantiles");
monitor_.Start("InitCompressedData");
this->InitCompressedData(ctx->gpu_id);
this->InitCompressedData(ctx->Device());
monitor_.Stop("InitCompressedData");
dmat->Info().feature_types.SetDevice(ctx->gpu_id);
dmat->Info().feature_types.SetDevice(ctx->Device());
auto ft = dmat->Info().feature_types.ConstDeviceSpan();
monitor_.Start("BinningCompression");
CHECK(dmat->SingleColBlock());
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
CreateHistIndices(ctx->gpu_id, batch, ft);
CreateHistIndices(ctx->Device(), batch, ft);
}
monitor_.Stop("BinningCompression");
}
@@ -214,7 +209,7 @@ struct TupleScanOp {
// to remove missing data
template <typename AdapterBatchT>
void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType const> feature_types,
EllpackPageImpl* dst, int device_idx, float missing) {
EllpackPageImpl* dst, DeviceOrd device, float missing) {
// Some witchcraft happens here
// The goal is to copy valid elements out of the input to an ELLPACK matrix
// with a given row stride, using no extra working memory Standard stream
@@ -246,7 +241,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
// Tuple[2] = The index in the input data
using Tuple = thrust::tuple<size_t, size_t, size_t>;
auto device_accessor = dst->GetDeviceAccessor(device_idx);
auto device_accessor = dst->GetDeviceAccessor(device);
common::CompressedBufferWriter writer(device_accessor.NumSymbols());
auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
@@ -298,10 +293,9 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
#endif
}
void WriteNullValues(EllpackPageImpl* dst, int device_idx,
common::Span<size_t> row_counts) {
void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span<size_t> row_counts) {
// Write the null values
auto device_accessor = dst->GetDeviceAccessor(device_idx);
auto device_accessor = dst->GetDeviceAccessor(device);
common::CompressedBufferWriter writer(device_accessor.NumSymbols());
auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
auto row_stride = dst->row_stride;
@@ -318,11 +312,11 @@ void WriteNullValues(EllpackPageImpl* dst, int device_idx,
}
template <typename AdapterBatch>
EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense,
EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, common::HistogramCuts const& cuts) {
dh::safe_cuda(cudaSetDevice(device));
dh::safe_cuda(cudaSetDevice(device.ordinal));
*this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
CopyDataToEllpack(batch, feature_types, this, device, missing);
@@ -331,7 +325,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
#define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \
template EllpackPageImpl::EllpackPageImpl( \
__BATCH_T batch, float missing, int device, bool is_dense, \
__BATCH_T batch, float missing, DeviceOrd device, bool is_dense, \
common::Span<size_t> row_counts_span, common::Span<FeatureType const> feature_types, \
size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts);
@@ -388,9 +382,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
[&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; });
row_stride = *std::max_element(it, it + page.Size());
CHECK_GE(ctx->gpu_id, 0);
CHECK(ctx->IsCUDA());
monitor_.Start("InitCompressedData");
InitCompressedData(ctx->gpu_id);
InitCompressedData(ctx->Device());
monitor_.Stop("InitCompressedData");
// copy gidx
@@ -400,7 +394,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
auto accessor = this->GetDeviceAccessor(ctx->Device(), ft);
auto null = accessor.NullValue();
CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null);
}
@@ -425,8 +419,7 @@ struct CopyPage {
};
// Copy the data from the given EllpackPage to the current page.
size_t EllpackPageImpl::Copy(int device, EllpackPageImpl const *page,
size_t offset) {
size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) {
monitor_.Start("Copy");
size_t num_elements = page->n_rows * page->row_stride;
CHECK_EQ(row_stride, page->row_stride);
@@ -486,7 +479,7 @@ struct CompactPage {
};
// Compacts the data from the given EllpackPage into the current page.
void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
void EllpackPageImpl::Compact(DeviceOrd device, EllpackPageImpl const* page,
common::Span<size_t> row_indexes) {
monitor_.Start("Compact");
CHECK_EQ(row_stride, page->row_stride);
@@ -499,13 +492,12 @@ void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
}
// Initialize the buffer to stored compressed features.
void EllpackPageImpl::InitCompressedData(int device) {
void EllpackPageImpl::InitCompressedData(DeviceOrd device) {
size_t num_symbols = NumSymbols();
// Required buffer size for storing data matrix in ELLPack format.
size_t compressed_size_bytes =
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows,
num_symbols);
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols);
gidx_buffer.SetDevice(device);
// Don't call fill unnecessarily
if (gidx_buffer.Size() == 0) {
@@ -517,7 +509,7 @@ void EllpackPageImpl::InitCompressedData(int device) {
}
// Compress a CSR page into ELLPACK.
void EllpackPageImpl::CreateHistIndices(int device,
void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
const SparsePage& row_batch,
common::Span<FeatureType const> feature_types) {
if (row_batch.Size() == 0) return;
@@ -527,7 +519,7 @@ void EllpackPageImpl::CreateHistIndices(int device,
// bin and compress entries in batches of rows
size_t gpu_batch_nrows =
std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
std::min(dh::TotalMemory(device.ordinal) / (16 * row_stride * sizeof(Entry)),
static_cast<size_t>(row_batch.Size()));
size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows);
@@ -592,7 +584,7 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,
}
EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
int device, common::Span<FeatureType const> feature_types) const {
DeviceOrd device, common::Span<FeatureType const> feature_types) const {
gidx_buffer.SetDevice(device);
return {device,
cuts_,
@@ -606,7 +598,7 @@ EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
}
EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor(
common::Span<FeatureType const> feature_types) const {
return {Context::kCpuId,
return {DeviceOrd::CPU(),
cuts_,
is_dense,
row_stride,

View File

@@ -35,16 +35,17 @@ struct EllpackDeviceAccessor {
common::Span<const FeatureType> feature_types;
EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts,
bool is_dense, size_t row_stride, size_t base_rowid,
size_t n_rows,common::CompressedIterator<uint32_t> gidx_iter,
EllpackDeviceAccessor(DeviceOrd device, const common::HistogramCuts& cuts, bool is_dense,
size_t row_stride, size_t base_rowid, size_t n_rows,
common::CompressedIterator<uint32_t> gidx_iter,
common::Span<FeatureType const> feature_types)
: is_dense(is_dense),
row_stride(row_stride),
base_rowid(base_rowid),
n_rows(n_rows) ,gidx_iter(gidx_iter),
n_rows(n_rows),
gidx_iter(gidx_iter),
feature_types{feature_types} {
if (device == Context::kCpuId) {
if (device.IsCPU()) {
gidx_fvalue_map = cuts.cut_values_.ConstHostSpan();
feature_segments = cuts.cut_ptrs_.ConstHostSpan();
min_fvalue = cuts.min_vals_.ConstHostSpan();
@@ -59,7 +60,7 @@ struct EllpackDeviceAccessor {
}
// Get a matrix element, uses binary search for look up Return NaN if missing
// Given a row index and a feature index, returns the corresponding cut value
__device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
[[nodiscard]] __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
ridx -= base_rowid;
auto row_begin = row_stride * ridx;
auto row_end = row_begin + row_stride;
@@ -77,7 +78,7 @@ struct EllpackDeviceAccessor {
}
template <bool is_cat>
__device__ uint32_t SearchBin(float value, size_t column_id) const {
[[nodiscard]] __device__ uint32_t SearchBin(float value, size_t column_id) const {
auto beg = feature_segments[column_id];
auto end = feature_segments[column_id + 1];
uint32_t idx = 0;
@@ -99,7 +100,7 @@ struct EllpackDeviceAccessor {
return idx;
}
__device__ bst_float GetFvalue(size_t ridx, size_t fidx) const {
[[nodiscard]] __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const {
auto gidx = GetBinIndex(ridx, fidx);
if (gidx == -1) {
return nan("");
@@ -108,18 +109,18 @@ struct EllpackDeviceAccessor {
}
// Check if the row id is within range of the current batch.
__device__ bool IsInRange(size_t row_id) const {
[[nodiscard]] __device__ bool IsInRange(size_t row_id) const {
return row_id >= base_rowid && row_id < base_rowid + n_rows;
}
/*! \brief Return the total number of symbols (total number of bins plus 1 for
* not found). */
XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
[[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); }
[[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); }
XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
[[nodiscard]] XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
[[nodiscard]] XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
};
@@ -141,14 +142,13 @@ class EllpackPageImpl {
* This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
* and the given number of rows.
*/
EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense,
size_t row_stride, size_t n_rows);
EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense, size_t row_stride,
size_t n_rows);
/*!
* \brief Constructor used for external memory.
*/
EllpackPageImpl(int device, common::HistogramCuts cuts,
const SparsePage &page, bool is_dense, size_t row_stride,
common::Span<FeatureType const> feature_types);
EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, const SparsePage& page,
bool is_dense, size_t row_stride, common::Span<FeatureType const> feature_types);
/*!
* \brief Constructor from an existing DMatrix.
@@ -159,7 +159,7 @@ class EllpackPageImpl {
explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);
template <typename AdapterBatch>
explicit EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense,
explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, common::HistogramCuts const& cuts);
@@ -176,7 +176,7 @@ class EllpackPageImpl {
* @param offset The number of elements to skip before copying.
* @returns The number of elements copied.
*/
size_t Copy(int device, EllpackPageImpl const *page, size_t offset);
size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset);
/*! \brief Compact the given ELLPACK page into the current page.
*
@@ -184,11 +184,10 @@ class EllpackPageImpl {
* @param page The ELLPACK page to compact from.
* @param row_indexes Row indexes for the compacted page.
*/
void Compact(int device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);
void Compact(DeviceOrd device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);
/*! \return Number of instances in the page. */
size_t Size() const;
[[nodiscard]] size_t Size() const;
/*! \brief Set the base row id for this page. */
void SetBaseRowId(std::size_t row_id) {
@@ -204,12 +203,12 @@ class EllpackPageImpl {
/*! \brief Return the total number of symbols (total number of bins plus 1 for
* not found). */
size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
[[nodiscard]] std::size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
EllpackDeviceAccessor
GetDeviceAccessor(int device,
common::Span<FeatureType const> feature_types = {}) const;
EllpackDeviceAccessor GetHostAccessor(common::Span<FeatureType const> feature_types = {}) const;
[[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
[[nodiscard]] EllpackDeviceAccessor GetHostAccessor(
common::Span<FeatureType const> feature_types = {}) const;
private:
/*!
@@ -218,13 +217,13 @@ class EllpackPageImpl {
* @param device The GPU device to use.
* @param row_batch The CSR page.
*/
void CreateHistIndices(int device,
void CreateHistIndices(DeviceOrd device,
const SparsePage& row_batch,
common::Span<FeatureType const> feature_types);
/*!
* \brief Initialize the buffer to store compressed features.
*/
void InitCompressedData(int device);
void InitCompressedData(DeviceOrd device);
public:

View File

@@ -10,7 +10,7 @@
namespace xgboost::data {
void EllpackPageSource::Fetch() {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
if (!this->ReadCache()) {
if (count_ != 0 && !sync_) {
// source is initialized to be the 0th page during construction, so when count_ is 0

View File

@@ -23,14 +23,14 @@ class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
BatchParam param_;
common::Span<FeatureType const> feature_types_;
std::unique_ptr<common::HistogramCuts> cuts_;
std::int32_t device_;
DeviceOrd device_;
public:
EllpackPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches,
std::shared_ptr<Cache> cache, BatchParam param,
std::unique_ptr<common::HistogramCuts> cuts, bool is_dense, size_t row_stride,
common::Span<FeatureType const> feature_types,
std::shared_ptr<SparsePageSource> source, std::int32_t device)
std::shared_ptr<SparsePageSource> source, DeviceOrd device)
: PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache, false),
is_dense_{is_dense},
row_stride_{row_stride},
@@ -50,6 +50,7 @@ inline void EllpackPageSource::Fetch() {
// silent the warning about unused variables.
(void)(row_stride_);
(void)(is_dense_);
(void)(device_);
common::AssertGPUSupport();
}
#endif // !defined(XGBOOST_USE_CUDA)

View File

@@ -36,8 +36,7 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
auto pctx = MakeProxy(proxy_)->Ctx();
Context ctx;
ctx.UpdateAllowUnknown(
Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
// hardcoded parameter.
BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};
@@ -139,7 +138,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
return HostAdapterDispatch(proxy, [&](auto const& value) {
size_t n_threads = ctx->Threads();
size_t n_features = column_sizes.size();
linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, Context::kCpuId);
linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, DeviceOrd::CPU());
column_sizes_tloc.Data()->Fill(0ul);
auto view = column_sizes_tloc.HostView();
common::ParallelFor(value.Size(), n_threads, common::Sched::Static(256), [&](auto i) {

View File

@@ -48,10 +48,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
int32_t current_device;
dh::safe_cuda(cudaGetDevice(&current_device));
auto get_device = [&]() -> int32_t {
std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
CHECK_NE(d, Context::kCpuId);
auto get_device = [&]() {
auto d = (ctx->IsCPU()) ? DeviceOrd::CUDA(current_device) : ctx->Device();
CHECK(!d.IsCPU());
return d;
};
@@ -61,11 +60,8 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
common::HistogramCuts cuts;
do {
// We use do while here as the first batch is fetched in ctor
// ctx_.gpu_id = proxy->DeviceIdx();
CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
dh::safe_cuda(cudaSetDevice(get_device()));
CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
dh::safe_cuda(cudaSetDevice(get_device().ordinal));
if (cols == 0) {
cols = num_cols();
collective::Allreduce<collective::Operation::kMax>(&cols, 1);
@@ -103,8 +99,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
auto n_features = cols;
CHECK_GE(n_features, 1) << "Data must have at least 1 column.";
dh::safe_cuda(cudaSetDevice(get_device()));
dh::safe_cuda(cudaSetDevice(get_device().ordinal));
if (!ref) {
HostDeviceVector<FeatureType> ft;
common::SketchContainer final_sketch(
@@ -143,9 +138,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
size_t n_batches_for_verification = 0;
while (iter.Next()) {
init_page();
dh::safe_cuda(cudaSetDevice(get_device()));
dh::safe_cuda(cudaSetDevice(get_device().ordinal));
auto rows = num_rows();
dh::device_vector<size_t> row_counts(rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
@@ -197,18 +190,18 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,
if (!ellpack_) {
ellpack_.reset(new EllpackPage());
if (ctx->IsCUDA()) {
this->Info().feature_types.SetDevice(ctx->gpu_id);
this->Info().feature_types.SetDevice(ctx->Device());
*ellpack_->Impl() =
EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
} else if (fmat_ctx_.IsCUDA()) {
this->Info().feature_types.SetDevice(fmat_ctx_.gpu_id);
this->Info().feature_types.SetDevice(fmat_ctx_.Device());
*ellpack_->Impl() =
EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
} else {
// Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM
// for cut reference.
auto cuda_ctx = ctx->MakeCUDA();
this->Info().feature_types.SetDevice(cuda_ctx.gpu_id);
this->Info().feature_types.SetDevice(cuda_ctx.Device());
*ellpack_->Impl() =
EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
}

View File

@@ -11,18 +11,18 @@ void DMatrixProxy::SetArrayData(StringView interface_str) {
this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
this->ctx_.gpu_id = Context::kCpuId;
this->ctx_.Init(Args{{"device", "cpu"}});
}
void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices,
char const *c_values, bst_feature_t n_features, bool on_host) {
void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, char const *c_values,
bst_feature_t n_features, bool on_host) {
CHECK(on_host) << "Not implemented on device.";
std::shared_ptr<CSRArrayAdapter> adapter{new CSRArrayAdapter(
StringView{c_indptr}, StringView{c_indices}, StringView{c_values}, n_features)};
this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
this->ctx_.gpu_id = Context::kCpuId;
this->ctx_.Init(Args{{"device", "cpu"}});
}
namespace cuda_impl {

View File

@@ -11,13 +11,13 @@ void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
if (adapter->DeviceIdx() < 0) {
if (adapter->Device().IsCPU()) {
// empty data
CHECK_EQ(this->Info().num_row_, 0);
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return;
}
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
}
void DMatrixProxy::FromCudaArray(StringView interface_str) {
@@ -25,13 +25,13 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) {
this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
if (adapter->DeviceIdx() < 0) {
if (adapter->Device().IsCPU()) {
// empty data
CHECK_EQ(this->Info().num_row_, 0);
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return;
}
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
}
namespace cuda_impl {

View File

@@ -46,7 +46,7 @@ class DMatrixProxy : public DMatrix {
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
public:
int DeviceIdx() const { return ctx_.gpu_id; }
DeviceOrd Device() const { return ctx_.Device(); }
void SetCUDAArray(char const* c_interface) {
common::AssertGPUSupport();

View File

@@ -75,11 +75,9 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
}
void SimpleDMatrix::ReindexFeatures(Context const* ctx) {
if (info_.IsVerticalFederated()) {
std::vector<uint64_t> buffer(collective::GetWorldSize());
buffer[collective::GetRank()] = info_.num_col_;
collective::Allgather(buffer.data(), buffer.size() * sizeof(uint64_t));
auto offset = std::accumulate(buffer.cbegin(), buffer.cbegin() + collective::GetRank(), 0);
if (info_.IsColumnSplit() && collective::GetWorldSize() > 1) {
auto const cols = collective::Allgather(info_.num_col_);
auto const offset = std::accumulate(cols.cbegin(), cols.cbegin() + collective::GetRank(), 0ul);
if (offset == 0) {
return;
}
@@ -253,7 +251,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
}
if (batch.BaseMargin() != nullptr) {
info_.base_margin_ = decltype(info_.base_margin_){
batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, Context::kCpuId};
batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, DeviceOrd::CPU()};
}
if (batch.Qid() != nullptr) {
qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size());
@@ -361,78 +359,4 @@ template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int n
template SimpleDMatrix::SimpleDMatrix(
IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
float missing, int nthread, DataSplitMode data_split_mode);
// Construct an in-memory SimpleDMatrix from an iterator over groups of Arrow
// RecordBatches.  Each call to adapter->Next() yields a vector of batches that
// are imported in parallel and appended to the single SparsePage owned by this
// DMatrix.  `missing` marks values to be dropped; `nthread` bounds the OpenMP
// thread count; `data_split_mode` is recorded in the MetaInfo for distributed
// training.
template <>
SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread,
                             DataSplitMode data_split_mode) {
  Context ctx;
  ctx.nthread = nthread;

  auto& offset_vec = sparse_page_->offset.HostVector();
  auto& data_vec = sparse_page_->data.HostVector();
  // Running totals across all adapter iterations: number of rows and number of
  // (kept) elements accumulated so far.
  uint64_t total_batch_size = 0;
  uint64_t total_elements = 0;

  adapter->BeforeFirst();
  // Iterate over batches of input data
  while (adapter->Next()) {
    auto& batches = adapter->Value();
    size_t num_elements = 0;
    size_t num_rows = 0;
    // Import Arrow RecordBatches.  Import() presumably materializes the batch
    // and returns its element count after filtering `missing` -- each batch is
    // independent, so this loop is a simple parallel reduction.
#pragma omp parallel for reduction(+ : num_elements, num_rows) num_threads(ctx.Threads())
    for (int i = 0; i < static_cast<int>(batches.size()); ++i) {  // NOLINT
      num_elements += batches[i]->Import(missing);
      num_rows += batches[i]->Size();
    }
    total_elements += num_elements;
    total_batch_size += num_rows;
    // Compute global offset for every row and starting row for every batch.
    // batch_offsets[i] is the global row index where batch i begins;
    // ShiftRowOffsets rebases each batch's local row offsets so they index
    // into the global data_vec.
    std::vector<uint64_t> batch_offsets(batches.size());
    for (size_t i = 0; i < batches.size(); ++i) {
      if (i == 0) {
        // First batch of this iteration starts where the previous iteration
        // ended (totals were already advanced above, hence the subtraction).
        batch_offsets[i] = total_batch_size - num_rows;
        batches[i]->ShiftRowOffsets(total_elements - num_elements);
      } else {
        batch_offsets[i] = batch_offsets[i - 1] + batches[i - 1]->Size();
        batches[i]->ShiftRowOffsets(batches[i - 1]->RowOffsets().back());
      }
    }
    // Pre-allocate DMatrix memory.  Growing to the running totals each
    // iteration keeps previously copied data intact.
    data_vec.resize(total_elements);
    offset_vec.resize(total_batch_size + 1);
    // Copy data into DMatrix.  The two `omp for nowait` loops are independent:
    // one fills entries, the other fills the CSR row-offset array, each batch
    // writing only its own disjoint region.
#pragma omp parallel num_threads(ctx.Threads())
    {
#pragma omp for nowait
      for (int i = 0; i < static_cast<int>(batches.size()); ++i) {  // NOLINT
        // RowOffsets() was rebased above, so [0] is this batch's first global
        // element index.
        size_t begin = batches[i]->RowOffsets()[0];
        for (size_t k = 0; k < batches[i]->Size(); ++k) {
          for (size_t j = 0; j < batches[i]->NumColumns(); ++j) {
            auto element = batches[i]->GetColumn(j).GetElement(k);
            // NaN encodes a missing/filtered value; skip it so data_vec stays
            // dense within each row.
            if (!std::isnan(element.value)) {
              data_vec[begin++] = Entry(element.column_idx, element.value);
            }
          }
        }
      }
#pragma omp for nowait
      for (int i = 0; i < static_cast<int>(batches.size()); ++i) {
        auto& offsets = batches[i]->RowOffsets();
        // Skip offsets[0]: the first entry of the global offset_vec is 0 from
        // value-initialization, and each batch writes only rows it owns.
        std::copy(offsets.begin() + 1, offsets.end(), offset_vec.begin() + batch_offsets[i] + 1);
      }
    }
  }
  // Synchronise worker columns
  info_.num_col_ = adapter->NumColumns();
  info_.data_split_mode = data_split_mode;
  // Rebase local feature indices to the global view (needed for column-wise
  // splits / vertical federated learning) before syncing column counts.
  ReindexFeatures(&ctx);
  info_.SynchronizeNumberOfColumns();

  info_.num_row_ = total_batch_size;
  info_.num_nonzero_ = data_vec.size();
  CHECK_EQ(offset_vec.back(), info_.num_nonzero_);
  fmat_ctx_ = ctx;
}
} // namespace xgboost::data

View File

@@ -10,9 +10,7 @@
#include "xgboost/context.h" // for Context
#include "xgboost/data.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
// Does not currently support metainfo as no on-device data source contains this
// Current implementation assumes a single batch. More batches can
// be supported in future. Does not currently support inferring row/column size
@@ -21,14 +19,14 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
DataSplitMode data_split_mode) {
CHECK(data_split_mode != DataSplitMode::kCol)
<< "Column-wise data split is currently not supported on the GPU.";
auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice()
: adapter->DeviceIdx();
CHECK_GE(device, 0);
dh::safe_cuda(cudaSetDevice(device));
auto device = (adapter->Device().IsCPU() || adapter->NumRows() == 0)
? DeviceOrd::CUDA(dh::CurrentDevice())
: adapter->Device();
CHECK(device.IsCUDA());
dh::safe_cuda(cudaSetDevice(device.ordinal));
Context ctx;
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", device.Name()}});
CHECK(adapter->NumRows() != kAdapterUnknownSize);
CHECK(adapter->NumColumns() != kAdapterUnknownSize);
@@ -53,5 +51,4 @@ template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
int nthread, DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing,
int nthread, DataSplitMode data_split_mode);
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -54,11 +54,9 @@ void CopyDataToDMatrix(AdapterBatchT batch, common::Span<Entry> data,
}
template <typename AdapterBatchT>
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
int device_idx, float missing) {
dh::safe_cuda(cudaSetDevice(device_idx));
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset, DeviceOrd device,
float missing) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
IsValidFunctor is_valid(missing);
// Count elements per row
dh::LaunchN(batch.Size(), [=] __device__(size_t idx) {
@@ -71,22 +69,19 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
});
dh::XGBCachingDeviceAllocator<char> alloc;
#if defined(XGBOOST_USE_HIP)
thrust::exclusive_scan(thrust::hip::par(alloc),
thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data() + offset.size()),
thrust::device_pointer_cast(offset.data()));
#elif defined(XGBOOST_USE_CUDA)
thrust::exclusive_scan(thrust::cuda::par(alloc),
thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data() + offset.size()),
thrust::device_pointer_cast(offset.data()));
#if defined(XGBOOST_USE_CUDA)
thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data() + offset.size()),
thrust::device_pointer_cast(offset.data()));
#elif defined(XGBOOST_USE_HIP)
thrust::exclusive_scan(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data() + offset.size()),
thrust::device_pointer_cast(offset.data()));
#endif
}
template <typename AdapterBatchT>
size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing,
size_t CopyToSparsePage(AdapterBatchT const& batch, DeviceOrd device, float missing,
SparsePage* page) {
bool valid = NoInfInData(batch, IsValidFunctor{missing});
CHECK(valid) << error::InfInData();

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2015-2022 by XGBoost Contributors
/**
* Copyright 2015-2023, XGBoost Contributors
* \file simple_dmatrix.h
* \brief In-memory version of DMatrix.
* \author Tianqi Chen
@@ -15,8 +15,7 @@
#include "gradient_index.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
// Used for single batch data.
class SimpleDMatrix : public DMatrix {
public:
@@ -65,9 +64,10 @@ class SimpleDMatrix : public DMatrix {
/**
* \brief Reindex the features based on a global view.
*
* In some cases (e.g. vertical federated learning), features are loaded locally with indices
* starting from 0. However, all the algorithms assume the features are globally indexed, so we
* reindex the features based on the offset needed to obtain the global view.
* In some cases (e.g. column-wise data split and vertical federated learning), features are
* loaded locally with indices starting from 0. However, all the algorithms assume the features
* are globally indexed, so we reindex the features based on the offset needed to obtain the
* global view.
*/
void ReindexFeatures(Context const* ctx);
@@ -75,6 +75,5 @@ class SimpleDMatrix : public DMatrix {
// Context used only for DMatrix initialization.
Context fmat_ctx_;
};
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_SIMPLE_DMATRIX_H_

View File

@@ -45,7 +45,8 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
ellpack_page_source_.reset(); // make sure resource is released before making new ones.
ellpack_page_source_ = std::make_shared<EllpackPageSource>(
this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id);
param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_,
ctx->Device());
} else {
CHECK(sparse_page_source_);
ellpack_page_source_->Reset();

View File

@@ -19,11 +19,11 @@ std::size_t NFeaturesDevice(DMatrixProxy *proxy) {
} // namespace detail
void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
auto device = proxy->DeviceIdx();
if (device < 0) {
device = dh::CurrentDevice();
auto device = proxy->Device();
if (device.IsCPU()) {
device = DeviceOrd::CUDA(dh::CurrentDevice());
}
CHECK_GE(device, 0);
CHECK(device.IsCUDA());
cuda_impl::Dispatch(proxy,
[&](auto const &value) { CopyToSparsePage(value, device, missing, page); });

View File

@@ -177,15 +177,15 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
}
// An heuristic for number of pre-fetched batches. We can make it part of BatchParam
// to let user adjust number of pre-fetched batches when needed.
uint32_t constexpr kPreFetch = 3;
size_t n_prefetch_batches = std::min(kPreFetch, n_batches_);
std::int32_t n_prefetches = std::max(nthreads_, 3);
std::int32_t n_prefetch_batches =
std::min(static_cast<std::uint32_t>(n_prefetches), n_batches_);
CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_;
std::size_t fetch_it = count_;
exce_.Rethrow();
for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
for (std::int32_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
fetch_it %= n_batches_; // ring
if (ring_->at(fetch_it).valid()) {
continue;

View File

@@ -212,7 +212,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
bst_target_t const n_groups = model_.learner_model_param->OutputLength();
monitor_.Start("BoostNewTrees");
predt->predictions.SetDevice(ctx_->Ordinal());
predt->predictions.SetDevice(ctx_->Device());
auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_,
model_.learner_model_param->OutputLength());
CHECK_NE(n_groups, 0);
@@ -248,7 +248,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
} else {
CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs";
linalg::Matrix<GradientPair> tmp{{in_gpair->Shape(0), static_cast<std::size_t>(1ul)},
ctx_->Ordinal()};
ctx_->Device()};
bool update_predict = true;
for (bst_target_t gid = 0; gid < n_groups; ++gid) {
node_position.clear();
@@ -736,7 +736,7 @@ class Dart : public GBTree {
PredictionCacheEntry predts; // temporary storage for prediction
if (ctx_->IsCUDA()) {
predts.predictions.SetDevice(ctx_->gpu_id);
predts.predictions.SetDevice(ctx_->Device());
}
predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
// multi-target is not yet supported.
@@ -761,8 +761,8 @@ class Dart : public GBTree {
CHECK_EQ(p_out_preds->predictions.Size(), predts.predictions.Size());
size_t n_rows = p_fmat->Info().num_row_;
if (predts.predictions.DeviceIdx() != Context::kCpuId) {
p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx());
if (predts.predictions.Device().IsCUDA()) {
p_out_preds->predictions.SetDevice(predts.predictions.Device());
GPUDartPredictInc(p_out_preds->predictions.DeviceSpan(),
predts.predictions.DeviceSpan(), w, n_rows, n_groups,
group);
@@ -801,8 +801,8 @@ class Dart : public GBTree {
StringView msg{"Unsupported data type for inplace predict."};
PredictionCacheEntry predts;
if (ctx_->gpu_id != Context::kCpuId) {
predts.predictions.SetDevice(ctx_->gpu_id);
if (ctx_->IsCUDA()) {
predts.predictions.SetDevice(ctx_->Device());
}
predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
@@ -838,8 +838,8 @@ class Dart : public GBTree {
CHECK_EQ(predts.predictions.Size(), p_out_preds->predictions.Size());
size_t n_rows = p_fmat->Info().num_row_;
if (predts.predictions.DeviceIdx() != Context::kCpuId) {
p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx());
if (predts.predictions.Device().IsCUDA()) {
p_out_preds->predictions.SetDevice(predts.predictions.Device());
auto base_score = model_.learner_model_param->BaseScore(predts.predictions.Device());
GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(),
predts.predictions.DeviceSpan(), w, n_rows, base_score, n_groups,

View File

@@ -106,14 +106,30 @@ void GBTreeModel::Load(dmlc::Stream* fi) {
Validate(*this);
}
namespace {
// Number of threads to use while serializing/unserializing tree models.
//
// nthread is a booster parameter that only takes effect after booster
// initialization, so the context might not yet carry the right value during
// model I/O.  CRAN additionally checks the CPU-time to user-time ratio of R
// examples (threshold 2.5), so strict R builds cap the count at 2 threads.
std::int32_t IOThreads(Context const* ctx) {
  CHECK(ctx);
  std::int32_t n = ctx->Threads();
#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
  n = std::min<std::int32_t>(n, 2);
#endif
  return n;
}
}  // namespace
void GBTreeModel::SaveModel(Json* p_out) const {
auto& out = *p_out;
CHECK_EQ(param.num_trees, static_cast<int>(trees.size()));
out["gbtree_model_param"] = ToJson(param);
std::vector<Json> trees_json(trees.size());
CHECK(ctx_);
common::ParallelFor(trees.size(), ctx_->Threads(), [&](auto t) {
common::ParallelFor(trees.size(), IOThreads(ctx_), [&](auto t) {
auto const& tree = trees[t];
Json jtree{Object{}};
tree->SaveModel(&jtree);
@@ -151,9 +167,7 @@ void GBTreeModel::LoadModel(Json const& in) {
CHECK_EQ(tree_info_json.size(), param.num_trees);
tree_info.resize(param.num_trees);
CHECK(ctx_);
common::ParallelFor(param.num_trees, ctx_->Threads(), [&](auto t) {
common::ParallelFor(param.num_trees, IOThreads(ctx_), [&](auto t) {
auto tree_id = get<Integer const>(trees_json[t]["id"]);
trees.at(tree_id).reset(new RegTree{});
trees[tree_id]->LoadModel(trees_json[t]);

View File

@@ -278,7 +278,7 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
std::swap(base_score_, base_margin);
// Make sure read access everywhere for thread-safe prediction.
std::as_const(base_score_).HostView();
if (!ctx->IsCPU()) {
if (ctx->IsCUDA()) {
std::as_const(base_score_).View(ctx->Device());
}
CHECK(std::as_const(base_score_).Data()->HostCanRead());
@@ -287,7 +287,7 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(DeviceOrd device) const {
// multi-class is not yet supported.
CHECK_EQ(base_score_.Size(), 1) << ModelNotFitted();
if (device.IsCPU()) {
if (!device.IsCUDA()) {
// Make sure that we won't run into race condition.
CHECK(base_score_.Data()->HostCanRead());
return base_score_.HostView();
@@ -305,10 +305,10 @@ linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(Context const* c
void LearnerModelParam::Copy(LearnerModelParam const& that) {
base_score_.Reshape(that.base_score_.Shape());
base_score_.Data()->SetDevice(that.base_score_.DeviceIdx());
base_score_.Data()->SetDevice(that.base_score_.Device());
base_score_.Data()->Copy(*that.base_score_.Data());
std::as_const(base_score_).HostView();
if (that.base_score_.DeviceIdx() != Context::kCpuId) {
if (!that.base_score_.Device().IsCPU()) {
std::as_const(base_score_).View(that.base_score_.Device());
}
CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead());
@@ -424,7 +424,7 @@ class LearnerConfiguration : public Learner {
if (mparam_.boost_from_average && !UsePtr(gbm_)->ModelFitted()) {
if (p_fmat) {
auto const& info = p_fmat->Info();
info.Validate(Ctx()->Ordinal());
info.Validate(Ctx()->Device());
// We estimate it from input data.
linalg::Tensor<float, 1> base_score;
InitEstimation(info, &base_score);
@@ -446,7 +446,7 @@ class LearnerConfiguration : public Learner {
monitor_.Init("Learner");
for (std::shared_ptr<DMatrix> const& d : cache) {
if (d) {
prediction_container_.Cache(d, Context::kCpuId);
prediction_container_.Cache(d, DeviceOrd::CPU());
}
}
}
@@ -1057,7 +1057,7 @@ class LearnerIO : public LearnerConfiguration {
? std::numeric_limits<float>::quiet_NaN()
: obj_->ProbToMargin(mparam_.base_score)},
{1},
Context::kCpuId},
DeviceOrd::CPU()},
obj_->Task(), tparam_.multi_strategy);
if (attributes_.find("objective") != attributes_.cend()) {
@@ -1282,7 +1282,7 @@ class LearnerImpl : public LearnerIO {
this->ValidateDMatrix(train.get(), true);
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
auto& predt = prediction_container_.Cache(train, ctx_.Device());
monitor_.Start("PredictRaw");
this->PredictRaw(train.get(), &predt, true, 0, 0);
@@ -1312,7 +1312,7 @@ class LearnerImpl : public LearnerIO {
CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1))
<< "The number of columns in gradient should be equal to the number of targets/classes in "
"the model.";
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
auto& predt = prediction_container_.Cache(train, ctx_.Device());
gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get());
monitor_.Stop("BoostOneIter");
}
@@ -1330,17 +1330,19 @@ class LearnerImpl : public LearnerIO {
if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) {
metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_));
auto config = obj_->DefaultMetricConfig();
metrics_.back()->LoadConfig(config);
if (!IsA<Null>(config)) {
metrics_.back()->LoadConfig(config);
}
metrics_.back()->Configure({cfg_.begin(), cfg_.end()});
}
for (size_t i = 0; i < data_sets.size(); ++i) {
std::shared_ptr<DMatrix> m = data_sets[i];
auto &predt = prediction_container_.Cache(m, ctx_.gpu_id);
auto &predt = prediction_container_.Cache(m, ctx_.Device());
this->ValidateDMatrix(m.get(), false);
this->PredictRaw(m.get(), &predt, false, 0, 0);
auto &out = output_predictions_.Cache(m, ctx_.gpu_id).predictions;
auto &out = output_predictions_.Cache(m, ctx_.Device()).predictions;
out.Resize(predt.predictions.Size());
out.Copy(predt.predictions);
@@ -1376,7 +1378,7 @@ class LearnerImpl : public LearnerIO {
} else if (pred_leaf) {
gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end);
} else {
auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id);
auto& prediction = prediction_container_.Cache(data, ctx_.Device());
this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end);
// Copy the prediction cache to output prediction. out_preds comes from C API
out_preds->SetDevice(ctx_.Device());
@@ -1456,7 +1458,7 @@ class LearnerImpl : public LearnerIO {
void ValidateDMatrix(DMatrix* p_fmat, bool is_training) const {
MetaInfo const& info = p_fmat->Info();
info.Validate(ctx_.gpu_id);
info.Validate(ctx_.Device());
if (is_training) {
CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_)

View File

@@ -48,7 +48,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
}
void LazyInitDevice(DMatrix *p_fmat, const LearnerModelParam &model_param) {
if (ctx_->gpu_id < 0) return;
if (ctx_->IsCPU()) return;
num_row_ = static_cast<size_t>(p_fmat->Info().num_row_);
@@ -60,8 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
return;
}
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
// The begin and end indices for the section of each column associated with
// this device
std::vector<std::pair<bst_uint, bst_uint>> column_segments;
@@ -135,7 +134,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
++group_idx) {
// Get gradient
auto grad = GradientPair(0, 0);
if (ctx_->gpu_id >= 0) {
if (ctx_->IsCUDA()) {
grad = GetBiasGradient(group_idx, model->learner_model_param->num_output_group);
}
auto dbias = static_cast<float>(
@@ -144,7 +143,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
model->Bias()[group_idx] += dbias;
// Update residual
if (ctx_->gpu_id >= 0) {
if (ctx_->IsCUDA()) {
UpdateBiasResidual(dbias, group_idx, model->learner_model_param->num_output_group);
}
}
@@ -155,7 +154,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
bst_float &w = (*model)[fidx][group_idx];
// Get gradient
auto grad = GradientPair(0, 0);
if (ctx_->gpu_id >= 0) {
if (ctx_->IsCUDA()) {
grad = GetGradient(group_idx, model->learner_model_param->num_output_group, fidx);
}
auto dw = static_cast<float>(tparam_.learning_rate *
@@ -164,15 +163,14 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
tparam_.reg_lambda_denorm));
w += dw;
if (ctx_->gpu_id >= 0) {
if (ctx_->IsCUDA()) {
UpdateResidual(dw, group_idx, model->learner_model_param->num_output_group, fidx);
}
}
// This needs to be public because of the __device__ lambda.
GradientPair GetBiasGradient(int group_idx, int num_group) {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
auto counting = thrust::make_counting_iterator(0ull);
auto f = [=] __device__(size_t idx) {
return idx * num_group + group_idx;
@@ -196,8 +194,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
// This needs to be public because of the __device__ lambda.
GradientPair GetGradient(int group_idx, int num_group, int fidx) {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
common::Span<xgboost::Entry> d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]);
size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];
common::Span<GradientPair> d_gpair = dh::ToSpan(gpair_);

View File

@@ -23,8 +23,7 @@
#include "xgboost/linalg.h"
#include "xgboost/metric.h"
namespace xgboost {
namespace metric {
namespace xgboost::metric {
// tag the this file, used by force static link later.
DMLC_REGISTRY_FILE_TAG(auc);
/**
@@ -257,10 +256,10 @@ template <typename Curve>
class EvalAUC : public MetricNoCache {
double Eval(const HostDeviceVector<bst_float> &preds, const MetaInfo &info) override {
double auc {0};
if (ctx_->gpu_id != Context::kCpuId) {
preds.SetDevice(ctx_->gpu_id);
info.labels.SetDevice(ctx_->gpu_id);
info.weights_.SetDevice(ctx_->gpu_id);
if (ctx_->Device().IsCUDA()) {
preds.SetDevice(ctx_->Device());
info.labels.SetDevice(ctx_->Device());
info.weights_.SetDevice(ctx_->Device());
}
// We use the global size to handle empty dataset.
std::array<size_t, 2> meta{info.labels.Size(), preds.Size()};
@@ -329,7 +328,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
double auc{0};
uint32_t valid_groups = 0;
auto n_threads = ctx_->Threads();
if (ctx_->gpu_id == Context::kCpuId) {
if (ctx_->IsCPU()) {
std::tie(auc, valid_groups) =
RankingAUC<true>(ctx_, predts.ConstHostVector(), info, n_threads);
} else {
@@ -344,7 +343,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
double auc{0};
auto n_threads = ctx_->Threads();
CHECK_NE(n_classes, 0);
if (ctx_->gpu_id == Context::kCpuId) {
if (ctx_->IsCPU()) {
auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC);
} else {
auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes);
@@ -355,7 +354,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
std::tuple<double, double, double>
EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
double fp, tp, auc;
if (ctx_->gpu_id == Context::kCpuId) {
if (ctx_->IsCPU()) {
std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(),
info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
@@ -367,7 +366,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
}
public:
char const* Name() const override {
[[nodiscard]] char const* Name() const override {
return "auc";
}
};
@@ -405,7 +404,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
std::tuple<double, double, double>
EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
double pr, re, auc;
if (ctx_->gpu_id == Context::kCpuId) {
if (ctx_->IsCPU()) {
std::tie(pr, re, auc) =
BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
@@ -418,7 +417,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
double EvalMultiClass(HostDeviceVector<float> const &predts, MetaInfo const &info,
size_t n_classes) {
if (ctx_->gpu_id == Context::kCpuId) {
if (ctx_->IsCPU()) {
auto n_threads = this->ctx_->Threads();
return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC);
} else {
@@ -431,7 +430,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
double auc{0};
uint32_t valid_groups = 0;
auto n_threads = ctx_->Threads();
if (ctx_->gpu_id == Context::kCpuId) {
if (ctx_->IsCPU()) {
auto labels = info.labels.Data()->ConstHostSpan();
if (std::any_of(labels.cbegin(), labels.cend(), PRAUCLabelInvalid{})) {
InvalidLabels();
@@ -446,7 +445,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
}
public:
const char *Name() const override { return "aucpr"; }
[[nodiscard]] const char *Name() const override { return "aucpr"; }
};
XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")
@@ -473,5 +472,4 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *, common::Span<f
return {};
}
#endif
} // namespace metric
} // namespace xgboost
} // namespace xgboost::metric

View File

@@ -926,8 +926,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
common::Span<float const> predts,
MetaInfo const &info,
std::shared_ptr<DeviceAUCCache> *p_cache) {
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
if (predts.empty()) {
return std::make_pair(0.0, static_cast<uint32_t>(0));
}

View File

@@ -46,7 +46,26 @@ template <typename Fn>
PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
PackedReduceResult result;
auto labels = info.labels.View(ctx->Device());
if (ctx->IsCPU()) {
if (ctx->IsCUDA()) {
#if defined(XGBOOST_USE_CUDA)
dh::XGBCachingDeviceAllocator<char> alloc;
thrust::counting_iterator<size_t> begin(0);
thrust::counting_iterator<size_t> end = begin + labels.Size();
result = thrust::transform_reduce(
thrust::cuda::par(alloc), begin, end,
[=] XGBOOST_DEVICE(size_t i) {
auto idx = linalg::UnravelIndex(i, labels.Shape());
auto sample_id = std::get<0>(idx);
auto target_id = std::get<1>(idx);
auto res = loss(i, sample_id, target_id);
float v{std::get<0>(res)}, wt{std::get<1>(res)};
return PackedReduceResult{v, wt};
},
PackedReduceResult{}, thrust::plus<PackedReduceResult>());
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA)
} else {
auto n_threads = ctx->Threads();
std::vector<double> score_tloc(n_threads, 0.0);
std::vector<double> weight_tloc(n_threads, 0.0);
@@ -69,41 +88,6 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
double residue_sum = std::accumulate(score_tloc.cbegin(), score_tloc.cend(), 0.0);
double weights_sum = std::accumulate(weight_tloc.cbegin(), weight_tloc.cend(), 0.0);
result = PackedReduceResult{residue_sum, weights_sum};
} else {
#if defined(XGBOOST_USE_CUDA)
dh::XGBCachingDeviceAllocator<char> alloc;
thrust::counting_iterator<size_t> begin(0);
thrust::counting_iterator<size_t> end = begin + labels.Size();
result = thrust::transform_reduce(
thrust::cuda::par(alloc), begin, end,
[=] XGBOOST_DEVICE(size_t i) {
auto idx = linalg::UnravelIndex(i, labels.Shape());
auto sample_id = std::get<0>(idx);
auto target_id = std::get<1>(idx);
auto res = loss(i, sample_id, target_id);
float v{std::get<0>(res)}, wt{std::get<1>(res)};
return PackedReduceResult{v, wt};
},
PackedReduceResult{}, thrust::plus<PackedReduceResult>());
#elif defined(XGBOOST_USE_HIP)
dh::XGBCachingDeviceAllocator<char> alloc;
thrust::counting_iterator<size_t> begin(0);
thrust::counting_iterator<size_t> end = begin + labels.Size();
result = thrust::transform_reduce(
thrust::hip::par(alloc), begin, end,
[=] XGBOOST_DEVICE(size_t i) {
auto idx = linalg::UnravelIndex(i, labels.Shape());
auto sample_id = std::get<0>(idx);
auto target_id = std::get<1>(idx);
auto res = loss(i, sample_id, target_id);
float v{std::get<0>(res)}, wt{std::get<1>(res)};
return PackedReduceResult{v, wt};
},
PackedReduceResult{}, thrust::plus<PackedReduceResult>());
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA)
}
return result;
}
@@ -201,10 +185,10 @@ class PseudoErrorLoss : public MetricNoCache {
CHECK_EQ(info.labels.Shape(0), info.num_row_);
auto labels = info.labels.View(ctx_->Device());
preds.SetDevice(ctx_->Device());
auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan();
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan());
common::OptionalWeights weights(ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
: info.weights_.ConstHostSpan());
float slope = this->param_.huber_slope;
CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0.";
PackedReduceResult result =
@@ -367,10 +351,10 @@ struct EvalEWiseBase : public MetricNoCache {
}
auto labels = info.labels.View(ctx_->Device());
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan());
common::OptionalWeights weights(ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
: info.weights_.ConstHostSpan());
preds.SetDevice(ctx_->Device());
auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan();
auto d_policy = policy_;
auto result =

View File

@@ -149,24 +149,24 @@ class MultiClassMetricsReduction {
#endif // XGBOOST_USE_CUDA || defined(XGBOOST_USE_HIP)
PackedReduceResult Reduce(const Context& tparam, int device, size_t n_class,
PackedReduceResult Reduce(const Context& ctx, DeviceOrd device, size_t n_class,
const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels,
const HostDeviceVector<bst_float>& preds) {
PackedReduceResult result;
if (device < 0) {
if (device.IsCPU()) {
result =
CpuReduceMetrics(weights, labels, preds, n_class, tparam.Threads());
CpuReduceMetrics(weights, labels, preds, n_class, ctx.Threads());
}
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
else { // NOLINT
device_ = tparam.gpu_id;
device_ = ctx.Device();
preds.SetDevice(device_);
labels.SetDevice(device_);
weights.SetDevice(device_);
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
result = DeviceReduceMetrics(weights, labels, preds, n_class);
}
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
@@ -176,8 +176,8 @@ class MultiClassMetricsReduction {
private:
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
dh::PinnedMemory label_error_;
int device_{-1};
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
DeviceOrd device_{DeviceOrd::CPU()};
#endif // defined(XGBOOST_USE_CUDA)
};
/*!
@@ -198,7 +198,7 @@ struct EvalMClassBase : public MetricNoCache {
CHECK_GE(nclass, 1U)
<< "mlogloss and merror are only used for multi-class classification,"
<< " use logloss for binary classification";
int device = ctx_->gpu_id;
auto device = ctx_->Device();
auto result =
reducer_.Reduce(*ctx_, device, nclass, info.weights_, *info.labels.Data(), preds);
dat[0] = result.Residue();

View File

@@ -41,7 +41,7 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info,
auto d_gptr = p_cache->DataGroupPtr(ctx);
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
predt.SetDevice(ctx->Device());
auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
auto topk = p_cache->Param().TopK();
auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
@@ -96,7 +96,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
}
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
predt.SetDevice(ctx->Device());
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());
auto d_group_ptr = p_cache->DataGroupPtr(ctx);

View File

@@ -148,19 +148,18 @@ class ElementWiseSurvivalMetricsReduction {
const HostDeviceVector<bst_float>& preds) {
PackedReduceResult result;
if (ctx.gpu_id < 0) {
if (ctx.IsCPU()) {
result = CpuReduceMetrics(weights, labels_lower_bound, labels_upper_bound,
preds, ctx.Threads());
}
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
else { // NOLINT
preds.SetDevice(ctx.gpu_id);
labels_lower_bound.SetDevice(ctx.gpu_id);
labels_upper_bound.SetDevice(ctx.gpu_id);
weights.SetDevice(ctx.gpu_id);
dh::safe_cuda(cudaSetDevice(ctx.gpu_id));
preds.SetDevice(ctx.Device());
labels_lower_bound.SetDevice(ctx.Device());
labels_upper_bound.SetDevice(ctx.Device());
weights.SetDevice(ctx.Device());
dh::safe_cuda(cudaSetDevice(ctx.Ordinal()));
result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds);
}
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)

View File

@@ -96,13 +96,13 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> const& position,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
if (ctx->IsCPU()) {
detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
predt, alpha, p_tree);
} else {
position.SetDevice(ctx->gpu_id);
if (ctx->IsCUDA()) {
position.SetDevice(ctx->Device());
detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate,
predt, alpha, p_tree);
} else {
detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
predt, alpha, p_tree);
}
}
} // namespace obj

View File

@@ -42,7 +42,7 @@ class AFTObj : public ObjFunction {
template <typename Distribution>
void GetGradientImpl(const HostDeviceVector<bst_float>& preds, const MetaInfo& info,
linalg::Matrix<GradientPair>* out_gpair, size_t ndata, int device,
linalg::Matrix<GradientPair>* out_gpair, size_t ndata, DeviceOrd device,
bool is_null_weight, float aft_loss_distribution_scale) {
common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx,
@@ -75,7 +75,7 @@ class AFTObj : public ObjFunction {
CHECK_EQ(info.labels_upper_bound_.Size(), ndata);
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(ndata, 1);
const int device = ctx_->gpu_id;
const auto device = ctx_->Device();
const float aft_loss_distribution_scale = param_.aft_loss_distribution_scale;
const bool is_null_weight = info.weights_.Size() == 0;
if (!is_null_weight) {
@@ -108,7 +108,7 @@ class AFTObj : public ObjFunction {
_preds[_idx] = exp(_preds[_idx]);
},
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx())
io_preds->Device())
.Eval(io_preds);
}

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2018-2022 by XGBoost Contributors
/**
* Copyright 2018-2023, XGBoost Contributors
* \file hinge.cc
* \brief Provides an implementation of the hinge loss function
* \author Henry Gouk
@@ -13,8 +13,7 @@
#include "../common/transform.h"
#include "../common/common.h"
namespace xgboost {
namespace obj {
namespace xgboost::obj {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
DMLC_REGISTRY_FILE_TAG(hinge_obj_gpu);
@@ -63,7 +62,7 @@ class HingeObj : public ObjFunction {
_out_gpair[_idx] = GradientPair(g, h);
},
common::Range{0, static_cast<int64_t>(ndata)}, this->ctx_->Threads(),
ctx_->gpu_id).Eval(
ctx_->Device()).Eval(
out_gpair->Data(), &preds, info.labels.Data(), &info.weights_);
}
@@ -73,11 +72,11 @@ class HingeObj : public ObjFunction {
_preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0;
},
common::Range{0, static_cast<int64_t>(io_preds->Size()), 1}, this->ctx_->Threads(),
io_preds->DeviceIdx())
io_preds->Device())
.Eval(io_preds);
}
const char* DefaultEvalMetric() const override {
[[nodiscard]] const char* DefaultEvalMetric() const override {
return "error";
}
@@ -93,5 +92,4 @@ XGBOOST_REGISTER_OBJECTIVE(HingeObj, "binary:hinge")
.describe("Hinge loss. Expects labels to be in [0,1f]")
.set_body([]() { return new HingeObj(); });
} // namespace obj
} // namespace xgboost
} // namespace xgboost::obj

View File

@@ -20,8 +20,8 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* b
CheckInitInputs(info);
}
// Avoid altering any state in child objective.
HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->gpu_id);
HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->Device());
linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->Device());
Json config{Object{}};
this->SaveConfig(&config);

View File

@@ -103,19 +103,19 @@ class LambdaRankObj : public FitIntercept {
// Update position biased for unbiased click data
void UpdatePositionBias() {
li_full_.SetDevice(ctx_->gpu_id);
lj_full_.SetDevice(ctx_->gpu_id);
li_.SetDevice(ctx_->gpu_id);
lj_.SetDevice(ctx_->gpu_id);
li_full_.SetDevice(ctx_->Device());
lj_full_.SetDevice(ctx_->Device());
li_.SetDevice(ctx_->Device());
lj_.SetDevice(ctx_->Device());
if (ctx_->IsCPU()) {
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
&li_, &lj_, p_cache_);
} else {
if (ctx_->IsCUDA()) {
cuda_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
&li_, &lj_, p_cache_);
} else {
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
&li_, &lj_, p_cache_);
}
li_full_.Data()->Fill(0.0);

View File

@@ -296,12 +296,12 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const
linalg::VectorView<double> li, linalg::VectorView<double> lj,
linalg::Matrix<GradientPair>* out_gpair) {
// boilerplate
std::int32_t device_id = ctx->gpu_id;
dh::safe_cuda(cudaSetDevice(device_id));
auto device = ctx->Device();
dh::safe_cuda(cudaSetDevice(device.ordinal));
auto n_groups = p_cache->Groups();
info.labels.SetDevice(device_id);
preds.SetDevice(device_id);
info.labels.SetDevice(device);
preds.SetDevice(device);
out_gpair->SetDevice(ctx->Device());
out_gpair->Reshape(preds.Size(), 1);

View File

@@ -63,7 +63,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
const int nclass = param_.num_class;
const auto ndata = static_cast<int64_t>(preds.Size() / nclass);
auto device = ctx_->gpu_id;
auto device = ctx_->Device();
out_gpair->SetDevice(device);
info.labels.SetDevice(device);
info.weights_.SetDevice(device);
@@ -133,7 +133,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
const int nclass = param_.num_class;
const auto ndata = static_cast<int64_t>(io_preds->Size() / nclass);
auto device = io_preds->DeviceIdx();
auto device = io_preds->Device();
if (prob) {
common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {

View File

@@ -70,16 +70,16 @@ class QuantileRegression : public ObjFunction {
out_gpair->Reshape(info.num_row_, n_targets);
auto gpair = out_gpair->View(ctx_->Device());
info.weights_.SetDevice(ctx_->gpu_id);
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()};
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
: info.weights_.ConstHostSpan()};
preds.SetDevice(ctx_->gpu_id);
preds.SetDevice(ctx_->Device());
auto predt = linalg::MakeVec(&preds);
auto n_samples = info.num_row_;
alpha_.SetDevice(ctx_->gpu_id);
auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
alpha_.SetDevice(ctx_->Device());
auto alpha = ctx_->IsCUDA() ? alpha_.ConstDeviceSpan() : alpha_.ConstHostSpan();
linalg::ElementWiseKernel(
ctx_, gpair, [=] XGBOOST_DEVICE(std::size_t i, GradientPair const&) mutable {
@@ -103,11 +103,48 @@ class QuantileRegression : public ObjFunction {
CHECK(!alpha_.Empty());
auto n_targets = this->Targets(info);
base_score->SetDevice(ctx_->gpu_id);
base_score->SetDevice(ctx_->Device());
base_score->Reshape(n_targets);
double sw{0};
if (ctx_->IsCPU()) {
if (ctx_->IsCUDA()) {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
alpha_.SetDevice(ctx_->Device());
auto d_alpha = alpha_.ConstDeviceSpan();
auto d_labels = info.labels.View(ctx_->Device());
auto seg_it = dh::MakeTransformIterator<std::size_t>(
thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) { return i * d_labels.Shape(0); });
CHECK_EQ(d_labels.Shape(1), 1);
auto val_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) {
auto sample_idx = i % d_labels.Shape(0);
return d_labels(sample_idx, 0);
});
auto n = d_labels.Size() * d_alpha.size();
CHECK_EQ(base_score->Size(), d_alpha.size());
if (info.weights_.Empty()) {
common::SegmentedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1, val_it,
val_it + n, base_score->Data());
sw = info.num_row_;
} else {
info.weights_.SetDevice(ctx_->Device());
auto d_weights = info.weights_.ConstDeviceSpan();
auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) {
auto sample_idx = i % d_labels.Shape(0);
return d_weights[sample_idx];
});
common::SegmentedWeightedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1,
val_it, val_it + n, weight_it, weight_it + n,
base_score->Data());
sw = dh::Reduce(ctx_->CUDACtx()->CTP(), dh::tcbegin(d_weights), dh::tcend(d_weights), 0.0,
thrust::plus<double>{});
}
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA)
} else {
auto quantiles = base_score->HostView();
auto h_weights = info.weights_.ConstHostVector();
if (info.weights_.Empty()) {
@@ -127,43 +164,6 @@ class QuantileRegression : public ObjFunction {
linalg::cend(h_labels), std::cbegin(h_weights));
}
}
} else {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
alpha_.SetDevice(ctx_->gpu_id);
auto d_alpha = alpha_.ConstDeviceSpan();
auto d_labels = info.labels.View(ctx_->Device());
auto seg_it = dh::MakeTransformIterator<std::size_t>(
thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) { return i * d_labels.Shape(0); });
CHECK_EQ(d_labels.Shape(1), 1);
auto val_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) {
auto sample_idx = i % d_labels.Shape(0);
return d_labels(sample_idx, 0);
});
auto n = d_labels.Size() * d_alpha.size();
CHECK_EQ(base_score->Size(), d_alpha.size());
if (info.weights_.Empty()) {
common::SegmentedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1, val_it,
val_it + n, base_score->Data());
sw = info.num_row_;
} else {
info.weights_.SetDevice(ctx_->gpu_id);
auto d_weights = info.weights_.ConstDeviceSpan();
auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) {
auto sample_idx = i % d_labels.Shape(0);
return d_weights[sample_idx];
});
common::SegmentedWeightedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1,
val_it, val_it + n, weight_it, weight_it + n,
base_score->Data());
sw = dh::Reduce(ctx_->CUDACtx()->CTP(), dh::tcbegin(d_weights), dh::tcend(d_weights), 0.0,
thrust::plus<double>{});
}
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
}
// For multiple quantiles, we should extend the base score to a vector instead of

View File

@@ -116,7 +116,7 @@ class RegLossObj : public FitIntercept {
size_t const ndata = preds.Size();
out_gpair->SetDevice(ctx_->Device());
auto device = ctx_->gpu_id;
auto device = ctx_->Device();
bool is_null_weight = info.weights_.Size() == 0;
auto scale_pos_weight = param_.scale_pos_weight;
@@ -124,7 +124,7 @@ class RegLossObj : public FitIntercept {
additional_input_.HostVector().begin()[1] = is_null_weight;
const size_t nthreads = ctx_->Threads();
bool on_device = device >= 0;
bool on_device = device.IsCUDA();
// On CPU we run the transformation each thread processing a contigious block of data
// for better performance.
const size_t n_data_blocks = std::max(static_cast<size_t>(1), (on_device ? ndata : nthreads));
@@ -175,7 +175,7 @@ class RegLossObj : public FitIntercept {
_preds[_idx] = Loss::PredTransform(_preds[_idx]);
},
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx())
io_preds->Device())
.Eval(io_preds);
}
@@ -246,16 +246,16 @@ class PseudoHuberRegression : public FitIntercept {
CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0.";
auto labels = info.labels.View(ctx_->Device());
out_gpair->SetDevice(ctx_->gpu_id);
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info));
auto gpair = out_gpair->View(ctx_->Device());
preds.SetDevice(ctx_->gpu_id);
preds.SetDevice(ctx_->Device());
auto predt = linalg::MakeVec(&preds);
info.weights_.SetDevice(ctx_->gpu_id);
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()};
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
: info.weights_.ConstHostSpan()};
linalg::ElementWiseKernel(ctx_, labels, [=] XGBOOST_DEVICE(size_t i, float const y) mutable {
auto sample_id = std::get<0>(linalg::UnravelIndex(i, labels.Shape()));
@@ -287,6 +287,13 @@ class PseudoHuberRegression : public FitIntercept {
}
FromJson(in["pseudo_huber_param"], &param_);
}
[[nodiscard]] Json DefaultMetricConfig() const override {
CHECK(param_.GetInitialised());
Json config{Object{}};
config["name"] = String{this->DefaultEvalMetric()};
config["pseudo_huber_param"] = ToJson(param_);
return config;
}
};
XGBOOST_REGISTER_OBJECTIVE(PseudoHuberRegression, "reg:pseudohubererror")
@@ -320,7 +327,7 @@ class PoissonRegression : public FitIntercept {
size_t const ndata = preds.Size();
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info));
auto device = ctx_->gpu_id;
auto device = ctx_->Device();
label_correct_.Resize(1);
label_correct_.Fill(1);
@@ -362,7 +369,7 @@ class PoissonRegression : public FitIntercept {
_preds[_idx] = expf(_preds[_idx]);
},
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx())
io_preds->Device())
.Eval(io_preds);
}
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@@ -505,7 +512,7 @@ class GammaRegression : public FitIntercept {
CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided";
const size_t ndata = preds.Size();
auto device = ctx_->gpu_id;
auto device = ctx_->Device();
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info));
label_correct_.Resize(1);
@@ -548,7 +555,7 @@ class GammaRegression : public FitIntercept {
_preds[_idx] = expf(_preds[_idx]);
},
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx())
io_preds->Device())
.Eval(io_preds);
}
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@@ -606,7 +613,7 @@ class TweedieRegression : public FitIntercept {
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info));
auto device = ctx_->gpu_id;
auto device = ctx_->Device();
label_correct_.Resize(1);
label_correct_.Fill(1);
@@ -653,7 +660,7 @@ class TweedieRegression : public FitIntercept {
_preds[_idx] = expf(_preds[_idx]);
},
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx())
io_preds->Device())
.Eval(io_preds);
}
@@ -704,11 +711,11 @@ class MeanAbsoluteError : public ObjFunction {
out_gpair->Reshape(info.num_row_, this->Targets(info));
auto gpair = out_gpair->View(ctx_->Device());
preds.SetDevice(ctx_->gpu_id);
preds.SetDevice(ctx_->Device());
auto predt = linalg::MakeVec(&preds);
info.weights_.SetDevice(ctx_->gpu_id);
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()};
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
: info.weights_.ConstHostSpan()};
linalg::ElementWiseKernel(ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, float y) mutable {
auto sign = [](auto x) {

View File

@@ -180,33 +180,30 @@ struct DeviceAdapterLoader {
XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared,
bst_feature_t num_features, bst_row_t num_rows,
size_t entry_start, float missing) :
batch{batch},
columns{num_features},
use_shared{use_shared},
is_valid{missing} {
extern __shared__ float _smem[];
smem = _smem;
if (use_shared) {
uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
size_t shared_elements = blockDim.x * num_features;
dh::BlockFill(smem, shared_elements, nanf(""));
__syncthreads();
if (global_idx < num_rows) {
auto beg = global_idx * columns;
auto end = (global_idx + 1) * columns;
for (size_t i = beg; i < end; ++i) {
auto value = batch.GetElement(i).value;
if (is_valid(value)) {
smem[threadIdx.x * num_features + (i - beg)] = value;
}
size_t entry_start, float missing)
: batch{batch}, columns{num_features}, use_shared{use_shared}, is_valid{missing} {
extern __shared__ float _smem[];
smem = _smem;
if (use_shared) {
uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
size_t shared_elements = blockDim.x * num_features;
dh::BlockFill(smem, shared_elements, nanf(""));
__syncthreads();
if (global_idx < num_rows) {
auto beg = global_idx * columns;
auto end = (global_idx + 1) * columns;
for (size_t i = beg; i < end; ++i) {
auto value = batch.GetElement(i).value;
if (is_valid(value)) {
smem[threadIdx.x * num_features + (i - beg)] = value;
}
}
}
__syncthreads();
}
__syncthreads();
}
XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
[[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
if (use_shared) {
return smem[threadIdx.x * columns + fidx];
}
@@ -340,11 +337,11 @@ class DeviceModel {
size_t tree_end_; // NOLINT
int num_group;
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
dh::safe_cuda(cudaSetDevice(gpu_id));
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, DeviceOrd device) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
// Copy decision trees to device
tree_segments = HostDeviceVector<size_t>({}, gpu_id);
tree_segments = HostDeviceVector<size_t>({}, device);
auto& h_tree_segments = tree_segments.HostVector();
h_tree_segments.reserve((tree_end - tree_begin) + 1);
size_t sum = 0;
@@ -354,8 +351,8 @@ class DeviceModel {
h_tree_segments.push_back(sum);
}
nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), gpu_id);
stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), gpu_id);
nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), device);
stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), device);
auto d_nodes = nodes.DevicePointer();
auto d_stats = stats.DevicePointer();
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
@@ -370,12 +367,12 @@ class DeviceModel {
sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault));
}
tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id);
tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, device);
auto& h_tree_group = tree_group.HostVector();
std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size());
// Initialize categorical splits.
split_types.SetDevice(gpu_id);
split_types.SetDevice(device);
std::vector<FeatureType>& h_split_types = split_types.HostVector();
h_split_types.resize(h_tree_segments.back());
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@@ -384,8 +381,8 @@ class DeviceModel {
h_split_types.begin() + h_tree_segments[tree_idx - tree_begin]);
}
categories = HostDeviceVector<uint32_t>({}, gpu_id);
categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, gpu_id);
categories = HostDeviceVector<uint32_t>({}, device);
categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, device);
std::vector<uint32_t> &h_categories = categories.HostVector();
std::vector<uint32_t> &h_split_cat_segments = categories_tree_segments.HostVector();
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@@ -398,7 +395,7 @@ class DeviceModel {
}
categories_node_segments = HostDeviceVector<RegTree::CategoricalSplitMatrix::Segment>(
h_tree_segments.back(), {}, gpu_id);
h_tree_segments.back(), {}, device);
std::vector<RegTree::CategoricalSplitMatrix::Segment>& h_categories_node_segments =
categories_node_segments.HostVector();
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@@ -490,8 +487,8 @@ struct PathInfo {
void ExtractPaths(
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
int gpu_id) {
dh::safe_cuda(cudaSetDevice(gpu_id));
DeviceOrd device) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
auto& device_model = *model;
dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
@@ -654,11 +651,12 @@ __global__ void MaskBitVectorKernel(
common::Span<std::uint32_t const> d_categories, BitVector decision_bits, BitVector missing_bits,
std::size_t tree_begin, std::size_t tree_end, std::size_t num_features, std::size_t num_rows,
std::size_t entry_start, std::size_t num_nodes, bool use_shared, float missing) {
// This needs to be always instantiated since the data is loaded cooperatively by all threads.
SparsePageLoader loader(data, use_shared, num_features, num_rows, entry_start, missing);
auto const row_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (row_idx >= num_rows) {
return;
}
SparsePageLoader loader(data, use_shared, num_features, num_rows, entry_start, missing);
std::size_t tree_offset = 0;
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
@@ -689,10 +687,10 @@ __global__ void MaskBitVectorKernel(
}
}
__device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree,
BitVector const& decision_bits,
BitVector const& missing_bits, std::size_t num_nodes,
std::size_t tree_offset) {
__device__ bst_node_t GetLeafIndexByBitVector(bst_row_t ridx, TreeView const& tree,
BitVector const& decision_bits,
BitVector const& missing_bits, std::size_t num_nodes,
std::size_t tree_offset) {
bst_node_t nidx = 0;
RegTree::Node n = tree.d_tree[nidx];
while (!n.IsLeaf()) {
@@ -704,9 +702,19 @@ __device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree,
}
n = tree.d_tree[nidx];
}
return nidx;
}
__device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree,
BitVector const& decision_bits,
BitVector const& missing_bits, std::size_t num_nodes,
std::size_t tree_offset) {
auto const nidx =
GetLeafIndexByBitVector(ridx, tree, decision_bits, missing_bits, num_nodes, tree_offset);
return tree.d_tree[nidx].LeafValue();
}
template <bool predict_leaf>
__global__ void PredictByBitVectorKernel(
common::Span<RegTree::Node const> d_nodes, common::Span<float> d_out_predictions,
common::Span<std::size_t const> d_tree_segments, common::Span<int const> d_tree_group,
@@ -722,27 +730,39 @@ __global__ void PredictByBitVectorKernel(
}
std::size_t tree_offset = 0;
if (num_group == 1) {
float sum = 0;
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
if constexpr (predict_leaf) {
for (size_t tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
TreeView d_tree{tree_begin, tree_idx, d_nodes,
d_tree_segments, d_tree_split_types, d_cat_tree_segments,
d_cat_node_segments, d_categories};
sum += GetLeafWeightByBitVector(row_idx, d_tree, decision_bits, missing_bits, num_nodes,
tree_offset);
auto const leaf = GetLeafIndexByBitVector(row_idx, d_tree, decision_bits, missing_bits,
num_nodes, tree_offset);
d_out_predictions[row_idx * (tree_end - tree_begin) + tree_idx] = static_cast<float>(leaf);
tree_offset += d_tree.d_tree.size();
}
d_out_predictions[row_idx] += sum;
} else {
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
auto const tree_group = d_tree_group[tree_idx];
TreeView d_tree{tree_begin, tree_idx, d_nodes,
d_tree_segments, d_tree_split_types, d_cat_tree_segments,
d_cat_node_segments, d_categories};
bst_uint out_prediction_idx = row_idx * num_group + tree_group;
d_out_predictions[out_prediction_idx] += GetLeafWeightByBitVector(
row_idx, d_tree, decision_bits, missing_bits, num_nodes, tree_offset);
tree_offset += d_tree.d_tree.size();
if (num_group == 1) {
float sum = 0;
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
TreeView d_tree{tree_begin, tree_idx, d_nodes,
d_tree_segments, d_tree_split_types, d_cat_tree_segments,
d_cat_node_segments, d_categories};
sum += GetLeafWeightByBitVector(row_idx, d_tree, decision_bits, missing_bits, num_nodes,
tree_offset);
tree_offset += d_tree.d_tree.size();
}
d_out_predictions[row_idx] += sum;
} else {
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
auto const tree_group = d_tree_group[tree_idx];
TreeView d_tree{tree_begin, tree_idx, d_nodes,
d_tree_segments, d_tree_split_types, d_cat_tree_segments,
d_cat_node_segments, d_categories};
bst_uint out_prediction_idx = row_idx * num_group + tree_group;
d_out_predictions[out_prediction_idx] += GetLeafWeightByBitVector(
row_idx, d_tree, decision_bits, missing_bits, num_nodes, tree_offset);
tree_offset += d_tree.d_tree.size();
}
}
}
}
@@ -754,21 +774,29 @@ class ColumnSplitHelper {
void PredictBatch(DMatrix* dmat, HostDeviceVector<float>* out_preds,
gbm::GBTreeModel const& model, DeviceModel const& d_model) const {
CHECK(dmat->PageExists<SparsePage>()) << "Column split for external memory is not support.";
PredictDMatrix(dmat, out_preds, d_model, model.learner_model_param->num_feature,
model.learner_model_param->num_output_group);
PredictDMatrix<false>(dmat, out_preds, d_model, model.learner_model_param->num_feature,
model.learner_model_param->num_output_group);
}
void PredictLeaf(DMatrix* dmat, HostDeviceVector<float>* out_preds, gbm::GBTreeModel const& model,
DeviceModel const& d_model) const {
CHECK(dmat->PageExists<SparsePage>()) << "Column split for external memory is not support.";
PredictDMatrix<true>(dmat, out_preds, d_model, model.learner_model_param->num_feature,
model.learner_model_param->num_output_group);
}
private:
using BitType = BitVector::value_type;
template <bool predict_leaf>
void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
bst_feature_t num_features, std::uint32_t num_group) const {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
dh::caching_device_vector<BitType> decision_storage{};
dh::caching_device_vector<BitType> missing_storage{};
auto constexpr kBlockThreads = 128;
auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->gpu_id);
auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->Ordinal());
auto const shared_memory_bytes =
SharedMemoryBytes<kBlockThreads>(num_features, max_shared_memory_bytes);
auto const use_shared = shared_memory_bytes != 0;
@@ -781,8 +809,8 @@ class ColumnSplitHelper {
BitVector decision_bits{dh::ToSpan(decision_storage)};
BitVector missing_bits{dh::ToSpan(missing_storage)};
batch.offset.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->Device());
batch.data.SetDevice(ctx_->Device());
std::size_t entry_start = 0;
SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features);
@@ -798,7 +826,7 @@ class ColumnSplitHelper {
AllReduceBitVectors(&decision_storage, &missing_storage);
dh::LaunchKernel {grid, kBlockThreads, 0, ctx_->CUDACtx()->Stream()} (
PredictByBitVectorKernel, model.nodes.ConstDeviceSpan(),
PredictByBitVectorKernel<predict_leaf>, model.nodes.ConstDeviceSpan(),
out_preds->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(),
model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(),
model.categories_tree_segments.ConstDeviceSpan(),
@@ -813,15 +841,14 @@ class ColumnSplitHelper {
void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage,
dh::caching_device_vector<BitType>* missing_storage) const {
collective::AllReduce<collective::Operation::kBitwiseOR>(
ctx_->gpu_id, decision_storage->data().get(), decision_storage->size());
ctx_->Ordinal(), decision_storage->data().get(), decision_storage->size());
collective::AllReduce<collective::Operation::kBitwiseAND>(
ctx_->gpu_id, missing_storage->data().get(), missing_storage->size());
collective::Synchronize(ctx_->gpu_id);
ctx_->Ordinal(), missing_storage->data().get(), missing_storage->size());
}
void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage,
dh::caching_device_vector<BitType>* missing_storage,
std::size_t total_bits) const {
dh::caching_device_vector<BitType>* missing_storage,
std::size_t total_bits) const {
auto const size = BitVector::ComputeStorageSize(total_bits);
if (decision_storage->size() < size) {
decision_storage->resize(size);
@@ -844,12 +871,12 @@ class GPUPredictor : public xgboost::Predictor {
size_t num_features,
HostDeviceVector<bst_float>* predictions,
size_t batch_offset, bool is_dense) const {
batch.offset.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->Device());
batch.data.SetDevice(ctx_->Device());
const uint32_t BLOCK_THREADS = 128;
size_t num_rows = batch.Size();
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS));
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());
size_t shared_memory_bytes =
SharedMemoryBytes<BLOCK_THREADS>(num_features, max_shared_memory_bytes);
bool use_shared = shared_memory_bytes != 0;
@@ -905,12 +932,12 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_end - tree_begin == 0) {
return;
}
out_preds->SetDevice(ctx_->gpu_id);
out_preds->SetDevice(ctx_->Device());
auto const& info = dmat->Info();
DeviceModel d_model;
d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id);
d_model.Init(model, tree_begin, tree_end, ctx_->Device());
if (dmat->Info().IsColumnSplit()) {
if (info.IsColumnSplit()) {
column_split_helper_.PredictBatch(dmat, out_preds, model, d_model);
return;
}
@@ -925,10 +952,10 @@ class GPUPredictor : public xgboost::Predictor {
} else {
size_t batch_offset = 0;
for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
dmat->Info().feature_types.SetDevice(ctx_->gpu_id);
dmat->Info().feature_types.SetDevice(ctx_->Device());
auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
this->PredictInternal(
page.Impl()->GetDeviceAccessor(ctx_->gpu_id, feature_types),
page.Impl()->GetDeviceAccessor(ctx_->Device(), feature_types),
d_model,
out_preds,
batch_offset);
@@ -942,16 +969,15 @@ class GPUPredictor : public xgboost::Predictor {
: Predictor::Predictor{ctx}, column_split_helper_{ctx} {}
~GPUPredictor() override {
if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) {
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
}
}
void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts,
const gbm::GBTreeModel& model, uint32_t tree_begin,
uint32_t tree_end = 0) const override {
int device = ctx_->gpu_id;
CHECK_GE(device, 0) << "Set `gpu_id' to positive value for processing GPU data.";
CHECK(ctx_->Device().IsCUDA()) << "Set `device' to `cuda` for processing GPU data.";
auto* out_preds = &predts->predictions;
if (tree_end == 0) {
tree_end = model.trees.size();
@@ -969,9 +995,9 @@ class GPUPredictor : public xgboost::Predictor {
auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
<< "Number of columns in data must equal to trained model.";
CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx())
<< "XGBoost is running on device: " << this->ctx_->gpu_id << ", "
<< "but data is on: " << m->DeviceIdx();
CHECK_EQ(dh::CurrentDevice(), m->Device().ordinal)
<< "XGBoost is running on device: " << this->ctx_->Device().Name() << ", "
<< "but data is on: " << m->Device().Name();
if (p_m) {
p_m->Info().num_row_ = m->NumRows();
this->InitOutPredictions(p_m->Info(), &(out_preds->predictions), model);
@@ -980,16 +1006,16 @@ class GPUPredictor : public xgboost::Predictor {
info.num_row_ = m->NumRows();
this->InitOutPredictions(info, &(out_preds->predictions), model);
}
out_preds->predictions.SetDevice(m->DeviceIdx());
out_preds->predictions.SetDevice(m->Device());
const uint32_t BLOCK_THREADS = 128;
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(m->NumRows(), BLOCK_THREADS));
auto max_shared_memory_bytes = dh::MaxSharedMemory(m->DeviceIdx());
auto max_shared_memory_bytes = dh::MaxSharedMemory(m->Device().ordinal);
size_t shared_memory_bytes =
SharedMemoryBytes<BLOCK_THREADS>(m->NumColumns(), max_shared_memory_bytes);
DeviceModel d_model;
d_model.Init(model, tree_begin, tree_end, m->DeviceIdx());
d_model.Init(model, tree_begin, tree_end, m->Device());
bool use_shared = shared_memory_bytes != 0;
size_t entry_start = 0;
@@ -1039,10 +1065,10 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_weights != nullptr) {
LOG(FATAL) << "Dart booster feature " << not_implemented;
}
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
out_contribs->SetDevice(ctx_->gpu_id);
CHECK(!p_fmat->Info().IsColumnSplit())
<< "Predict contribution support for column-wise data split is not yet implemented.";
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
out_contribs->SetDevice(ctx_->Device());
if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size());
}
@@ -1060,12 +1086,12 @@ class GPUPredictor : public xgboost::Predictor {
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
device_paths;
DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->gpu_id);
d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id);
ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->Device());
SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature);
auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
@@ -1074,7 +1100,7 @@ class GPUPredictor : public xgboost::Predictor {
dh::tend(phis));
}
// Add the base margin term to last column
p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id);
p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
auto base_score = model.learner_model_param->BaseScore(ctx_);
@@ -1099,10 +1125,8 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_weights != nullptr) {
LOG(FATAL) << "Dart booster feature " << not_implemented;
}
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
out_contribs->SetDevice(ctx_->gpu_id);
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
out_contribs->SetDevice(ctx_->Device());
if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size());
}
@@ -1121,12 +1145,12 @@ class GPUPredictor : public xgboost::Predictor {
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
device_paths;
DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->gpu_id);
d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id);
ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->Device());
SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature);
auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
@@ -1135,7 +1159,7 @@ class GPUPredictor : public xgboost::Predictor {
dh::tend(phis));
}
// Add the base margin term to last column
p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id);
p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
auto base_score = model.learner_model_param->BaseScore(ctx_);
@@ -1160,30 +1184,35 @@ class GPUPredictor : public xgboost::Predictor {
void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions,
const gbm::GBTreeModel &model,
unsigned tree_end) const override {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());
const MetaInfo& info = p_fmat->Info();
bst_row_t num_rows = info.num_row_;
if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size());
}
predictions->SetDevice(ctx_->Device());
predictions->Resize(num_rows * tree_end);
DeviceModel d_model;
d_model.Init(model, 0, tree_end, this->ctx_->Device());
if (info.IsColumnSplit()) {
column_split_helper_.PredictLeaf(p_fmat, predictions, model, d_model);
return;
}
constexpr uint32_t kBlockThreads = 128;
size_t shared_memory_bytes = SharedMemoryBytes<kBlockThreads>(
info.num_col_, max_shared_memory_bytes);
bool use_shared = shared_memory_bytes != 0;
bst_feature_t num_features = info.num_col_;
bst_row_t num_rows = info.num_row_;
size_t entry_start = 0;
if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size());
}
predictions->SetDevice(ctx_->gpu_id);
predictions->Resize(num_rows * tree_end);
DeviceModel d_model;
d_model.Init(model, 0, tree_end, this->ctx_->gpu_id);
if (p_fmat->PageExists<SparsePage>()) {
for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->Device());
bst_row_t batch_offset = 0;
SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature};
@@ -1208,7 +1237,7 @@ class GPUPredictor : public xgboost::Predictor {
} else {
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
bst_row_t batch_offset = 0;
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->gpu_id)};
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
size_t num_rows = batch.Size();
auto grid =
static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
@@ -1236,9 +1265,9 @@ class GPUPredictor : public xgboost::Predictor {
private:
/*! \brief Reconfigure the device when GPU is changed. */
static size_t ConfigureDevice(int device) {
if (device >= 0) {
return dh::MaxSharedMemory(device);
static size_t ConfigureDevice(DeviceOrd device) {
if (device.IsCUDA()) {
return dh::MaxSharedMemory(device.ordinal);
}
return 0;
}

View File

@@ -49,8 +49,8 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_fl
std::size_t n{model.learner_model_param->OutputLength() * info.num_row_};
const HostDeviceVector<bst_float>* base_margin = info.base_margin_.Data();
if (ctx_->gpu_id >= 0) {
out_preds->SetDevice(ctx_->gpu_id);
if (ctx_->Device().IsCUDA()) {
out_preds->SetDevice(ctx_->Device());
}
if (!base_margin->Empty()) {
out_preds->Resize(n);

View File

@@ -19,8 +19,7 @@
#include "xgboost/linalg.h" // TensorView, Tensor, Constant
#include "xgboost/logging.h" // CHECK_EQ
namespace xgboost {
namespace tree {
namespace xgboost::tree {
namespace cpu_impl {
void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair,
@@ -68,13 +67,12 @@ inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView<Gradien
void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair,
bst_target_t n_targets, linalg::Vector<float>* out) {
out->SetDevice(ctx->gpu_id);
out->SetDevice(ctx->Device());
out->Reshape(n_targets);
gpair.SetDevice(ctx->Device());
auto gpair_t = gpair.View(ctx->Device());
ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
: cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()));
ctx->IsCUDA() ? cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()))
: cpu_impl::FitStump(ctx, info, gpair_t, out->HostView());
}
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -21,9 +21,7 @@
#include "xgboost/logging.h" // CHECK_EQ
#include "xgboost/span.h" // span
namespace xgboost {
namespace tree {
namespace cuda_impl {
namespace xgboost::tree::cuda_impl {
void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out) {
auto n_targets = out.Size();
@@ -56,7 +54,7 @@ void FitStump(Context const* ctx, MetaInfo const& info,
thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));
collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast<double*>(d_sum.Values().data()),
collective::GlobalSum(info, ctx->Device(), reinterpret_cast<double*>(d_sum.Values().data()),
d_sum.Size() * 2);
thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
@@ -65,6 +63,4 @@ void FitStump(Context const* ctx, MetaInfo const& info,
CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess()));
});
}
} // namespace cuda_impl
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree::cuda_impl

Some files were not shown because too many files have changed in this diff Show More