Remove internal use of gpu_id. (#9568)

Jiaming Yuan 2023-09-20 23:29:51 +08:00 committed by GitHub
parent 38ac52dd87
commit 8c676c889d
121 changed files with 1012 additions and 1044 deletions


@ -190,7 +190,7 @@ jobs:
run: | run: |
mkdir build_msvc mkdir build_msvc
cd build_msvc cd build_msvc
cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON
cmake --build . --config Release --parallel $(nproc) cmake --build . --config Release --parallel $(nproc)
- name: Install Python package - name: Install Python package


@ -29,31 +29,37 @@ struct DeviceSym {
* viewing types like `linalg::TensorView`. * viewing types like `linalg::TensorView`.
*/ */
struct DeviceOrd { struct DeviceOrd {
// Constant representing the device ID of CPU.
static bst_d_ordinal_t constexpr CPUOrdinal() { return -1; }
static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; }
enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU}; enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU};
// CUDA device ordinal. // CUDA device ordinal.
bst_d_ordinal_t ordinal{-1}; bst_d_ordinal_t ordinal{CPUOrdinal()};
[[nodiscard]] bool IsCUDA() const { return device == kCUDA; } [[nodiscard]] bool IsCUDA() const { return device == kCUDA; }
[[nodiscard]] bool IsCPU() const { return device == kCPU; } [[nodiscard]] bool IsCPU() const { return device == kCPU; }
DeviceOrd() = default; constexpr DeviceOrd() = default;
constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {} constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {}
DeviceOrd(DeviceOrd const& that) = default; constexpr DeviceOrd(DeviceOrd const& that) = default;
DeviceOrd& operator=(DeviceOrd const& that) = default; constexpr DeviceOrd& operator=(DeviceOrd const& that) = default;
DeviceOrd(DeviceOrd&& that) = default; constexpr DeviceOrd(DeviceOrd&& that) = default;
DeviceOrd& operator=(DeviceOrd&& that) = default; constexpr DeviceOrd& operator=(DeviceOrd&& that) = default;
/** /**
* @brief Constructor for CPU. * @brief Constructor for CPU.
*/ */
[[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, -1}; } [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, CPUOrdinal()}; }
/** /**
* @brief Constructor for CUDA device. * @brief Constructor for CUDA device.
* *
* @param ordinal CUDA device ordinal. * @param ordinal CUDA device ordinal.
*/ */
[[nodiscard]] static auto CUDA(bst_d_ordinal_t ordinal) { return DeviceOrd{kCUDA, ordinal}; } [[nodiscard]] static constexpr auto CUDA(bst_d_ordinal_t ordinal) {
return DeviceOrd{kCUDA, ordinal};
}
[[nodiscard]] bool operator==(DeviceOrd const& that) const { [[nodiscard]] bool operator==(DeviceOrd const& that) const {
return device == that.device && ordinal == that.ordinal; return device == that.device && ordinal == that.ordinal;
@ -78,25 +84,26 @@ struct DeviceOrd {
static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t)); static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t));
std::ostream& operator<<(std::ostream& os, DeviceOrd ord);
/** /**
* @brief Runtime context for XGBoost. Contains information like threads and device. * @brief Runtime context for XGBoost. Contains information like threads and device.
*/ */
struct Context : public XGBoostParameter<Context> { struct Context : public XGBoostParameter<Context> {
private: private:
// User interfacing parameter for device ordinal
std::string device{DeviceSym::CPU()}; // NOLINT std::string device{DeviceSym::CPU()}; // NOLINT
// The device object for the current context. We are in the middle of replacing the // The device ordinal set by user
// `gpu_id` with this device field.
DeviceOrd device_{DeviceOrd::CPU()}; DeviceOrd device_{DeviceOrd::CPU()};
public: public:
// Constant representing the device ID of CPU.
static bst_d_ordinal_t constexpr kCpuId = -1;
static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; }
static std::int64_t constexpr kDefaultSeed = 0; static std::int64_t constexpr kDefaultSeed = 0;
public: public:
Context(); Context();
void Init(Args const& kwargs);
template <typename Container> template <typename Container>
Args UpdateAllowUnknown(Container const& kwargs) { Args UpdateAllowUnknown(Container const& kwargs) {
auto args = XGBoostParameter<Context>::UpdateAllowUnknown(kwargs); auto args = XGBoostParameter<Context>::UpdateAllowUnknown(kwargs);
@ -104,7 +111,6 @@ struct Context : public XGBoostParameter<Context> {
return args; return args;
} }
std::int32_t gpu_id{kCpuId};
// The number of threads to use if OpenMP is enabled. If equals 0, use the system default. // The number of threads to use if OpenMP is enabled. If equals 0, use the system default.
std::int32_t nthread{0}; // NOLINT std::int32_t nthread{0}; // NOLINT
// stored random seed // stored random seed
@ -116,7 +122,8 @@ struct Context : public XGBoostParameter<Context> {
bool validate_parameters{false}; bool validate_parameters{false};
/** /**
* @brief Configure the parameter `gpu_id'. * @brief Configure the parameter `device'. Deprecated, will remove once `gpu_id` is
* removed.
* *
* @param require_gpu Whether GPU is explicitly required by the user through other * @param require_gpu Whether GPU is explicitly required by the user through other
* configurations. * configurations.
@ -212,9 +219,7 @@ struct Context : public XGBoostParameter<Context> {
private: private:
void SetDeviceOrdinal(Args const& kwargs); void SetDeviceOrdinal(Args const& kwargs);
Context& SetDevice(DeviceOrd d) { Context& SetDevice(DeviceOrd d) {
this->device_ = d; this->device = (this->device_ = d).Name();
this->gpu_id = d.ordinal; // this can be removed once we move away from `gpu_id`.
this->device = d.Name();
return *this; return *this;
} }
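Taken together, the header now exposes the device exclusively through `DeviceOrd`: the CPU/invalid ordinals live on `DeviceOrd`, and `SetDevice` keeps the string `device` parameter and the `device_` object in sync. A minimal sketch of what a call site looks like after this change; the helper below and its name are illustrative only, not part of the patch:

```cpp
#include "xgboost/context.h"  // Context, DeviceOrd

namespace xgboost {
// Illustrative helper: branch on the device object instead of comparing the
// removed gpu_id field against -1.
void ConfigureForDevice(Context const* ctx) {
  if (ctx->Device().IsCUDA()) {
    // The raw CUDA ordinal is still reachable where an integer is required.
    bst_d_ordinal_t ordinal = ctx->Device().ordinal;
    (void)ordinal;
  } else {
    // The CPU is an explicit device rather than the magic value -1.
    bool on_cpu = ctx->Device() == DeviceOrd::CPU();
    (void)on_cpu;
  }
}
}  // namespace xgboost
```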


@ -106,10 +106,10 @@ class MetaInfo {
MetaInfo& operator=(MetaInfo&& that) = default; MetaInfo& operator=(MetaInfo&& that) = default;
MetaInfo& operator=(MetaInfo const& that) = delete; MetaInfo& operator=(MetaInfo const& that) = delete;
/*! /**
* \brief Validate all metainfo. * @brief Validate all metainfo.
*/ */
void Validate(int32_t device) const; void Validate(DeviceOrd device) const;
MetaInfo Slice(common::Span<int32_t const> ridxs) const; MetaInfo Slice(common::Span<int32_t const> ridxs) const;
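Call sites pass the device object through; a one-line sketch, assuming a `DMatrix* p_fmat` and a configured `Context* ctx` in scope:

```cpp
p_fmat->Info().Validate(ctx->Device());  // the overload taking a raw int ordinal is gone
```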


@ -88,9 +88,9 @@ class HostDeviceVector {
static_assert(std::is_standard_layout<T>::value, "HostDeviceVector admits only POD types"); static_assert(std::is_standard_layout<T>::value, "HostDeviceVector admits only POD types");
public: public:
explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1); explicit HostDeviceVector(size_t size = 0, T v = T(), DeviceOrd device = DeviceOrd::CPU());
HostDeviceVector(std::initializer_list<T> init, int device = -1); HostDeviceVector(std::initializer_list<T> init, DeviceOrd device = DeviceOrd::CPU());
explicit HostDeviceVector(const std::vector<T>& init, int device = -1); explicit HostDeviceVector(const std::vector<T>& init, DeviceOrd device = DeviceOrd::CPU());
~HostDeviceVector(); ~HostDeviceVector();
HostDeviceVector(const HostDeviceVector<T>&) = delete; HostDeviceVector(const HostDeviceVector<T>&) = delete;
@ -99,17 +99,9 @@ class HostDeviceVector {
HostDeviceVector<T>& operator=(const HostDeviceVector<T>&) = delete; HostDeviceVector<T>& operator=(const HostDeviceVector<T>&) = delete;
HostDeviceVector<T>& operator=(HostDeviceVector<T>&&); HostDeviceVector<T>& operator=(HostDeviceVector<T>&&);
bool Empty() const { return Size() == 0; } [[nodiscard]] bool Empty() const { return Size() == 0; }
size_t Size() const; [[nodiscard]] std::size_t Size() const;
int DeviceIdx() const; [[nodiscard]] DeviceOrd Device() const;
DeviceOrd Device() const {
auto idx = this->DeviceIdx();
if (idx == DeviceOrd::CPU().ordinal) {
return DeviceOrd::CPU();
} else {
return DeviceOrd::CUDA(idx);
}
}
common::Span<T> DeviceSpan(); common::Span<T> DeviceSpan();
common::Span<const T> ConstDeviceSpan() const; common::Span<const T> ConstDeviceSpan() const;
common::Span<const T> DeviceSpan() const { return ConstDeviceSpan(); } common::Span<const T> DeviceSpan() const { return ConstDeviceSpan(); }
@ -135,13 +127,12 @@ class HostDeviceVector {
const std::vector<T>& ConstHostVector() const; const std::vector<T>& ConstHostVector() const;
const std::vector<T>& HostVector() const {return ConstHostVector(); } const std::vector<T>& HostVector() const {return ConstHostVector(); }
bool HostCanRead() const; [[nodiscard]] bool HostCanRead() const;
bool HostCanWrite() const; [[nodiscard]] bool HostCanWrite() const;
bool DeviceCanRead() const; [[nodiscard]] bool DeviceCanRead() const;
bool DeviceCanWrite() const; [[nodiscard]] bool DeviceCanWrite() const;
GPUAccess DeviceAccess() const; [[nodiscard]] GPUAccess DeviceAccess() const;
void SetDevice(int device) const;
void SetDevice(DeviceOrd device) const; void SetDevice(DeviceOrd device) const;
void Resize(size_t new_size, T v = T()); void Resize(size_t new_size, T v = T());
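On `HostDeviceVector`, the integer-based constructors, `DeviceIdx()`, and `SetDevice(int)` are gone; `DeviceOrd` is the only placement type. A small usage sketch (the function name is illustrative):

```cpp
#include "xgboost/context.h"             // DeviceOrd
#include "xgboost/host_device_vector.h"  // HostDeviceVector

void PlaceOnDevice() {
  using xgboost::DeviceOrd;
  // Construction takes a DeviceOrd instead of an int ordinal.
  xgboost::HostDeviceVector<float> values(/*size=*/16, /*v=*/0.0f, DeviceOrd::CUDA(0));
  values.SetDevice(DeviceOrd::CUDA(0));  // SetDevice(int) no longer exists
  if (values.Device().IsCUDA()) {        // replaces DeviceIdx() == -1 checks
    auto d_span = values.ConstDeviceSpan();
    (void)d_span;
  }
}
```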


@ -659,13 +659,13 @@ auto MakeVec(T *ptr, size_t s, DeviceOrd device = DeviceOrd::CPU()) {
template <typename T> template <typename T>
auto MakeVec(HostDeviceVector<T> *data) { auto MakeVec(HostDeviceVector<T> *data) {
return MakeVec(data->DeviceIdx() == -1 ? data->HostPointer() : data->DevicePointer(), return MakeVec(data->Device().IsCPU() ? data->HostPointer() : data->DevicePointer(), data->Size(),
data->Size(), data->Device()); data->Device());
} }
template <typename T> template <typename T>
auto MakeVec(HostDeviceVector<T> const *data) { auto MakeVec(HostDeviceVector<T> const *data) {
return MakeVec(data->DeviceIdx() == -1 ? data->ConstHostPointer() : data->ConstDevicePointer(), return MakeVec(data->Device().IsCPU() ? data->ConstHostPointer() : data->ConstDevicePointer(),
data->Size(), data->Device()); data->Size(), data->Device());
} }
@ -757,13 +757,13 @@ class Tensor {
Order order_{Order::kC}; Order order_{Order::kC};
template <typename I, std::int32_t D> template <typename I, std::int32_t D>
void Initialize(I const (&shape)[D], std::int32_t device) { void Initialize(I const (&shape)[D], DeviceOrd device) {
static_assert(D <= kDim, "Invalid shape."); static_assert(D <= kDim, "Invalid shape.");
std::copy(shape, shape + D, shape_); std::copy(shape, shape + D, shape_);
for (auto i = D; i < kDim; ++i) { for (auto i = D; i < kDim; ++i) {
shape_[i] = 1; shape_[i] = 1;
} }
if (device >= 0) { if (device.IsCUDA()) {
data_.SetDevice(device); data_.SetDevice(device);
data_.ConstDevicePointer(); // Pull to device; data_.ConstDevicePointer(); // Pull to device;
} }
@ -780,14 +780,11 @@ class Tensor {
* See \ref TensorView for parameters of this constructor. * See \ref TensorView for parameters of this constructor.
*/ */
template <typename I, int32_t D> template <typename I, int32_t D>
explicit Tensor(I const (&shape)[D], std::int32_t device, Order order = kC)
: Tensor{common::Span<I const, D>{shape}, device, order} {}
template <typename I, int32_t D>
explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC) explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC)
: Tensor{common::Span<I const, D>{shape}, device.ordinal, order} {} : Tensor{common::Span<I const, D>{shape}, device, order} {}
template <typename I, size_t D> template <typename I, size_t D>
explicit Tensor(common::Span<I const, D> shape, std::int32_t device, Order order = kC) explicit Tensor(common::Span<I const, D> shape, DeviceOrd device, Order order = kC)
: order_{order} { : order_{order} {
// No device unroll as this is a host only function. // No device unroll as this is a host only function.
std::copy(shape.data(), shape.data() + D, shape_); std::copy(shape.data(), shape.data() + D, shape_);
@ -795,11 +792,11 @@ class Tensor {
shape_[i] = 1; shape_[i] = 1;
} }
auto size = detail::CalcSize(shape_); auto size = detail::CalcSize(shape_);
if (device >= 0) { if (device.IsCUDA()) {
data_.SetDevice(device); data_.SetDevice(device);
} }
data_.Resize(size); data_.Resize(size);
if (device >= 0) { if (device.IsCUDA()) {
data_.DevicePointer(); // Pull to device data_.DevicePointer(); // Pull to device
} }
} }
@ -807,7 +804,7 @@ class Tensor {
* Initialize from 2 host iterators. * Initialize from 2 host iterators.
*/ */
template <typename It, typename I, int32_t D> template <typename It, typename I, int32_t D>
explicit Tensor(It begin, It end, I const (&shape)[D], std::int32_t device, Order order = kC) explicit Tensor(It begin, It end, I const (&shape)[D], DeviceOrd device, Order order = kC)
: order_{order} { : order_{order} {
auto &h_vec = data_.HostVector(); auto &h_vec = data_.HostVector();
h_vec.insert(h_vec.begin(), begin, end); h_vec.insert(h_vec.begin(), begin, end);
@ -816,7 +813,7 @@ class Tensor {
} }
template <typename I, int32_t D> template <typename I, int32_t D>
explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], std::int32_t device, explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], DeviceOrd device,
Order order = kC) Order order = kC)
: order_{order} { : order_{order} {
auto &h_vec = data_.HostVector(); auto &h_vec = data_.HostVector();
@ -824,10 +821,6 @@ class Tensor {
// shape // shape
this->Initialize(shape, device); this->Initialize(shape, device);
} }
template <typename I, int32_t D>
explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], DeviceOrd device,
Order order = kC)
: Tensor{data, shape, device.ordinal, order} {}
/** /**
* \brief Index operator. Not thread safe, should not be used in performance critical * \brief Index operator. Not thread safe, should not be used in performance critical
* region. For more efficient indexing, consider getting a view first. * region. For more efficient indexing, consider getting a view first.
@ -944,9 +937,7 @@ class Tensor {
/** /**
* \brief Set device ordinal for this tensor. * \brief Set device ordinal for this tensor.
*/ */
void SetDevice(int32_t device) const { data_.SetDevice(device); }
void SetDevice(DeviceOrd device) const { data_.SetDevice(device); } void SetDevice(DeviceOrd device) const { data_.SetDevice(device); }
[[nodiscard]] int32_t DeviceIdx() const { return data_.DeviceIdx(); }
[[nodiscard]] DeviceOrd Device() const { return data_.Device(); } [[nodiscard]] DeviceOrd Device() const { return data_.Device(); }
}; };
@ -962,7 +953,7 @@ using Vector = Tensor<T, 1>;
template <typename T, typename... Index> template <typename T, typename... Index>
auto Empty(Context const *ctx, Index &&...index) { auto Empty(Context const *ctx, Index &&...index) {
Tensor<T, sizeof...(Index)> t; Tensor<T, sizeof...(Index)> t;
t.SetDevice(ctx->gpu_id); t.SetDevice(ctx->Device());
t.Reshape(index...); t.Reshape(index...);
return t; return t;
} }
@ -973,7 +964,7 @@ auto Empty(Context const *ctx, Index &&...index) {
template <typename T, typename... Index> template <typename T, typename... Index>
auto Constant(Context const *ctx, T v, Index &&...index) { auto Constant(Context const *ctx, T v, Index &&...index) {
Tensor<T, sizeof...(Index)> t; Tensor<T, sizeof...(Index)> t;
t.SetDevice(ctx->gpu_id); t.SetDevice(ctx->Device());
t.Reshape(index...); t.Reshape(index...);
t.Data()->Fill(std::move(v)); t.Data()->Fill(std::move(v));
return t; return t;
@ -990,8 +981,8 @@ auto Zeros(Context const *ctx, Index &&...index) {
// Only first axis is supported for now. // Only first axis is supported for now.
template <typename T, int32_t D> template <typename T, int32_t D>
void Stack(Tensor<T, D> *l, Tensor<T, D> const &r) { void Stack(Tensor<T, D> *l, Tensor<T, D> const &r) {
if (r.DeviceIdx() >= 0) { if (r.Device().IsCUDA()) {
l->SetDevice(r.DeviceIdx()); l->SetDevice(r.Device());
} }
l->ModifyInplace([&](HostDeviceVector<T> *data, common::Span<size_t, D> shape) { l->ModifyInplace([&](HostDeviceVector<T> *data, common::Span<size_t, D> shape) {
for (size_t i = 1; i < D; ++i) { for (size_t i = 1; i < D; ++i) {
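With the `std::int32_t` overloads dropped from `linalg`, tensors are created and placed through the context's `DeviceOrd`. A sketch of the updated factory and constructor calls; local names are illustrative:

```cpp
#include "xgboost/context.h"  // Context, DeviceOrd
#include "xgboost/linalg.h"   // Tensor, Empty, Constant

void MakeTensors(xgboost::Context const* ctx) {
  // Factories read the placement from the context instead of ctx->gpu_id.
  auto t = xgboost::linalg::Empty<float>(ctx, 4, 3);
  auto c = xgboost::linalg::Constant<float>(ctx, 1.0f, 4);
  // Direct construction now takes a DeviceOrd.
  std::size_t shape[] = {4, 3};
  xgboost::linalg::Tensor<float, 2> u{shape, ctx->Device()};
  u.SetDevice(ctx->Device());  // the SetDevice(int32_t) overload is removed
  (void)t;
  (void)c;
}
```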


@ -52,9 +52,9 @@ class PredictionContainer : public DMatrixCache<PredictionCacheEntry> {
public: public:
PredictionContainer() : DMatrixCache<PredictionCacheEntry>{DefaultSize()} {} PredictionContainer() : DMatrixCache<PredictionCacheEntry>{DefaultSize()} {}
PredictionCacheEntry& Cache(std::shared_ptr<DMatrix> m, std::int32_t device) { PredictionCacheEntry& Cache(std::shared_ptr<DMatrix> m, DeviceOrd device) {
auto p_cache = this->CacheItem(m); auto p_cache = this->CacheItem(m);
if (device != Context::kCpuId) { if (device.IsCUDA()) {
p_cache->predictions.SetDevice(device); p_cache->predictions.SetDevice(device);
} }
return *p_cache; return *p_cache;
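A sketch of the updated cache lookup, assuming a `PredictionContainer prediction_container`, a `std::shared_ptr<DMatrix> p_fmat`, and a `Context* ctx` in scope:

```cpp
// The entry's predictions vector is placed on the device only when it is CUDA.
auto& entry = prediction_container.Cache(p_fmat, ctx->Device());
entry.predictions.Resize(p_fmat->Info().num_row_);
```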


@ -66,7 +66,7 @@ void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> con
auto hess_dev = dh::CudaGetPointerDevice(hess.data); auto hess_dev = dh::CudaGetPointerDevice(hess.data);
CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device."; CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device.";
auto &gpair = *out_gpair; auto &gpair = *out_gpair;
gpair.SetDevice(grad_dev); gpair.SetDevice(DeviceOrd::CUDA(grad_dev));
gpair.Reshape(grad.Shape(0), grad.Shape(1)); gpair.Reshape(grad.Shape(0), grad.Shape(1));
auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev)); auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev));
auto cuctx = ctx->CUDACtx(); auto cuctx = ctx->CUDACtx();
@ -144,7 +144,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
if (learner->Ctx()->IsCUDA()) { if (learner->Ctx()->IsCUDA()) {
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead()); CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
} }
p_predt->SetDevice(proxy->DeviceIdx()); p_predt->SetDevice(proxy->Device());
auto &shape = learner->GetThreadLocal().prediction_shape; auto &shape = learner->GetThreadLocal().prediction_shape;
size_t n_samples = p_m->Info().num_row_; size_t n_samples = p_m->Info().num_row_;


@ -15,8 +15,7 @@
#include "communicator-inl.cuh" #include "communicator-inl.cuh"
namespace xgboost { namespace xgboost::collective {
namespace collective {
/** /**
* @brief Find the global sum of the given values across all workers. * @brief Find the global sum of the given values across all workers.
@ -31,10 +30,9 @@ namespace collective {
* @param size Number of values to sum. * @param size Number of values to sum.
*/ */
template <typename T> template <typename T>
void GlobalSum(MetaInfo const& info, int device, T* values, size_t size) { void GlobalSum(MetaInfo const& info, DeviceOrd device, T* values, size_t size) {
if (info.IsRowSplit()) { if (info.IsRowSplit()) {
collective::AllReduce<collective::Operation::kSum>(device, values, size); collective::AllReduce<collective::Operation::kSum>(device.ordinal, values, size);
} }
} }
} // namespace collective } // namespace xgboost::collective
} // namespace xgboost


@ -123,7 +123,7 @@ void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* s
[=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; }); [=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
} }
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr, void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries, dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights, dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan) { dh::caching_device_vector<size_t>* p_column_sizes_scan) {
@ -240,13 +240,13 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple { sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple {
return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size. return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size.
}); });
detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature, detail::GetColumnSizesScan(ctx->Device(), info.num_col_, num_cuts_per_feature,
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr, IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan); &column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan(); auto d_cuts_ptr = cuts_ptr.DeviceSpan();
if (sketch_container->HasCategorical()) { if (sketch_container->HasCategorical()) {
auto p_weight = entry_weight.empty() ? nullptr : &entry_weight; auto p_weight = entry_weight.empty() ? nullptr : &entry_weight;
detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight, detail::RemoveDuplicatedCategories(ctx->Device(), info, d_cuts_ptr, &sorted_entries, p_weight,
&column_sizes_scan); &column_sizes_scan);
} }
@ -347,7 +347,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b
HistogramCuts cuts; HistogramCuts cuts;
SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_, SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_,
ctx->Ordinal()); ctx->Device());
CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty()); CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty());
for (const auto& page : p_fmat->GetBatches<SparsePage>()) { for (const auto& page : p_fmat->GetBatches<SparsePage>()) {
std::size_t page_nnz = page.data.Size(); std::size_t page_nnz = page.data.Size();


@ -82,9 +82,9 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
} }
template <std::uint32_t kBlockThreads, typename Kernel> template <std::uint32_t kBlockThreads, typename Kernel>
std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) { std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shared_mem) {
int n_mps = 0; int n_mps = 0;
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device)); dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device.ordinal));
int n_blocks_per_mp = 0; int n_blocks_per_mp = 0;
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel, dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, shared_mem)); kBlockThreads, shared_mem));
@ -106,11 +106,11 @@ std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t s
* \param out_column_size Output buffer for the size of each column. * \param out_column_size Output buffer for the size of each column.
*/ */
template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false> template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false>
void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter, void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan<BatchIt> batch_iter,
data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) { data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) {
thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0); thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0);
std::size_t max_shared_memory = dh::MaxSharedMemory(device); std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal);
// Not strictly correct as we should use number of samples to determine the type of // Not strictly correct as we should use number of samples to determine the type of
// counter. However, the sample size is not known due to sliding window on number of // counter. However, the sample size is not known due to sliding window on number of
// elements. // elements.
@ -154,7 +154,7 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter
} }
template <typename BatchIt> template <typename BatchIt>
void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature, void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature,
IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid, IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr, HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
dh::caching_device_vector<size_t>* column_sizes_scan) { dh::caching_device_vector<size_t>* column_sizes_scan) {
@ -215,7 +215,8 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
// Count the valid entries in each column and copy them out. // Count the valid entries in each column and copy them out.
template <typename AdapterBatch, typename BatchIter> template <typename AdapterBatch, typename BatchIter>
void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range, void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range,
float missing, size_t columns, size_t cuts_per_feature, int device, float missing, size_t columns, size_t cuts_per_feature,
DeviceOrd device,
HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan, HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
dh::caching_device_vector<size_t>* column_sizes_scan, dh::caching_device_vector<size_t>* column_sizes_scan,
dh::device_vector<Entry>* sorted_entries) { dh::device_vector<Entry>* sorted_entries) {
@ -239,7 +240,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
void SortByWeight(dh::device_vector<float>* weights, void SortByWeight(dh::device_vector<float>* weights,
dh::device_vector<Entry>* sorted_entries); dh::device_vector<Entry>* sorted_entries);
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr, void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries, dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights, dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan); dh::caching_device_vector<size_t>* p_column_sizes_scan);
@ -277,7 +278,7 @@ inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t
template <typename AdapterBatch> template <typename AdapterBatch>
void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info, void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
int device, size_t columns, size_t begin, size_t end, DeviceOrd device, size_t columns, size_t begin, size_t end,
float missing, SketchContainer *sketch_container, float missing, SketchContainer *sketch_container,
int num_cuts) { int num_cuts) {
// Copy current subset of valid elements into temporary storage and sort // Copy current subset of valid elements into temporary storage and sort
@ -316,11 +317,11 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
template <typename Batch> template <typename Batch>
void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info, void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
int num_cuts_per_feature, int num_cuts_per_feature,
bool is_ranking, float missing, int device, bool is_ranking, float missing, DeviceOrd device,
size_t columns, size_t begin, size_t end, size_t columns, size_t begin, size_t end,
SketchContainer *sketch_container) { SketchContainer *sketch_container) {
dh::XGBCachingDeviceAllocator<char> alloc; dh::XGBCachingDeviceAllocator<char> alloc;
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device.ordinal));
info.weights_.SetDevice(device); info.weights_.SetDevice(device);
auto weights = info.weights_.ConstDeviceSpan(); auto weights = info.weights_.ConstDeviceSpan();
@ -412,14 +413,14 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
size_t num_rows = batch.NumRows(); size_t num_rows = batch.NumRows();
size_t num_cols = batch.NumCols(); size_t num_cols = batch.NumCols();
size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows); size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows);
int32_t device = sketch_container->DeviceIdx(); auto device = sketch_container->DeviceIdx();
bool weighted = !info.weights_.Empty(); bool weighted = !info.weights_.Empty();
if (weighted) { if (weighted) {
sketch_batch_num_elements = detail::SketchBatchNumElements( sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements, sketch_batch_num_elements,
num_rows, num_cols, std::numeric_limits<size_t>::max(), num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, true); device.ordinal, num_cuts_per_feature, true);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end = size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements)); std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
@ -432,7 +433,7 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
sketch_batch_num_elements = detail::SketchBatchNumElements( sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements, sketch_batch_num_elements,
num_rows, num_cols, std::numeric_limits<size_t>::max(), num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, false); device.ordinal, num_cuts_per_feature, false);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end = size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements)); std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));


@ -33,19 +33,19 @@ struct HostDeviceVectorImpl {
}; };
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int) HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd)
: impl_(nullptr) { : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v); impl_ = new HostDeviceVectorImpl<T>(size, v);
} }
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int) HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd)
: impl_(nullptr) { : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init); impl_ = new HostDeviceVectorImpl<T>(init);
} }
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int) HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd)
: impl_(nullptr) { : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init); impl_ = new HostDeviceVectorImpl<T>(init);
} }
@ -81,7 +81,7 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); } size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
template <typename T> template <typename T>
int HostDeviceVector<T>::DeviceIdx() const { return -1; } DeviceOrd HostDeviceVector<T>::Device() const { return DeviceOrd::CPU(); }
template <typename T> template <typename T>
T* HostDeviceVector<T>::DevicePointer() { return nullptr; } T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
@ -165,9 +165,6 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
return false; return false;
} }
template <typename T>
void HostDeviceVector<T>::SetDevice(int) const {}
template <typename T> template <typename T>
void HostDeviceVector<T>::SetDevice(DeviceOrd) const {} void HostDeviceVector<T>::SetDevice(DeviceOrd) const {}


@ -25,8 +25,8 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) {
template <typename T> template <typename T>
class HostDeviceVectorImpl { class HostDeviceVectorImpl {
public: public:
HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) { HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) {
if (device >= 0) { if (device.IsCUDA()) {
gpu_access_ = GPUAccess::kWrite; gpu_access_ = GPUAccess::kWrite;
SetDevice(); SetDevice();
data_d_->resize(size, v); data_d_->resize(size, v);
@ -37,8 +37,8 @@ class HostDeviceVectorImpl {
// Initializer can be std::vector<T> or std::initializer_list<T> // Initializer can be std::vector<T> or std::initializer_list<T>
template <class Initializer> template <class Initializer>
HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) { HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) {
if (device >= 0) { if (device.IsCUDA()) {
gpu_access_ = GPUAccess::kWrite; gpu_access_ = GPUAccess::kWrite;
LazyResizeDevice(init.size()); LazyResizeDevice(init.size());
Copy(init); Copy(init);
@ -54,16 +54,16 @@ class HostDeviceVectorImpl {
gpu_access_{that.gpu_access_} {} gpu_access_{that.gpu_access_} {}
~HostDeviceVectorImpl() { ~HostDeviceVectorImpl() {
if (device_ >= 0) { if (device_.IsCUDA()) {
SetDevice(); SetDevice();
} }
} }
size_t Size() const { [[nodiscard]] size_t Size() const {
return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->size() : 0; return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->size() : 0;
} }
int DeviceIdx() const { return device_; } [[nodiscard]] DeviceOrd Device() const { return device_; }
T* DevicePointer() { T* DevicePointer() {
LazySyncDevice(GPUAccess::kWrite); LazySyncDevice(GPUAccess::kWrite);
@ -138,7 +138,7 @@ class HostDeviceVectorImpl {
} else { } else {
auto ptr = other->ConstDevicePointer(); auto ptr = other->ConstDevicePointer();
SetDevice(); SetDevice();
CHECK_EQ(this->DeviceIdx(), other->DeviceIdx()); CHECK_EQ(this->Device(), other->Device());
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
ptr, ptr,
other->Size() * sizeof(T), other->Size() * sizeof(T),
@ -156,24 +156,25 @@ class HostDeviceVectorImpl {
return data_h_; return data_h_;
} }
void SetDevice(int device) { void SetDevice(DeviceOrd device) {
if (device_ == device) { return; } if (device_ == device) { return; }
if (device_ >= 0) { if (device_.IsCUDA()) {
LazySyncHost(GPUAccess::kNone); LazySyncHost(GPUAccess::kNone);
} }
if (device_ >= 0 && device >= 0) { if (device_.IsCUDA() && device.IsCUDA()) {
CHECK_EQ(device_, device) << "New device ordinal is different from previous one."; CHECK_EQ(device_.ordinal, device.ordinal)
<< "New device ordinal is different from previous one.";
} }
device_ = device; device_ = device;
if (device_ >= 0) { if (device_.IsCUDA()) {
LazyResizeDevice(data_h_.size()); LazyResizeDevice(data_h_.size());
} }
} }
void Resize(size_t new_size, T v) { void Resize(size_t new_size, T v) {
if (new_size == Size()) { return; } if (new_size == Size()) { return; }
if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) { if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
// fast on-device resize // fast on-device resize
gpu_access_ = GPUAccess::kWrite; gpu_access_ = GPUAccess::kWrite;
SetDevice(); SetDevice();
@ -218,16 +219,16 @@ class HostDeviceVectorImpl {
gpu_access_ = access; gpu_access_ = access;
} }
bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; } [[nodiscard]] bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); } [[nodiscard]] bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); } [[nodiscard]] bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; } [[nodiscard]] bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); } [[nodiscard]] bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); } [[nodiscard]] bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
GPUAccess Access() const { return gpu_access_; } [[nodiscard]] GPUAccess Access() const { return gpu_access_; }
private: private:
int device_{-1}; DeviceOrd device_{DeviceOrd::CPU()};
std::vector<T> data_h_{}; std::vector<T> data_h_{};
std::unique_ptr<dh::device_vector<T>> data_d_{}; std::unique_ptr<dh::device_vector<T>> data_d_{};
GPUAccess gpu_access_{GPUAccess::kNone}; GPUAccess gpu_access_{GPUAccess::kNone};
@ -259,11 +260,11 @@ class HostDeviceVectorImpl {
} }
void SetDevice() { void SetDevice() {
CHECK_GE(device_, 0); CHECK_GE(device_.ordinal, 0);
if (cudaSetDeviceHandler == nullptr) { if (cudaSetDeviceHandler == nullptr) {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
} else { } else {
(*cudaSetDeviceHandler)(device_); (*cudaSetDeviceHandler)(device_.ordinal);
} }
if (!data_d_) { if (!data_d_) {
@ -273,15 +274,15 @@ class HostDeviceVectorImpl {
}; };
template<typename T> template<typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device) HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd device)
: impl_(new HostDeviceVectorImpl<T>(size, v, device)) {} : impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device) HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {} : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device) HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {} : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T> template <typename T>
@ -309,7 +310,9 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); } size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
template <typename T> template <typename T>
int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); } DeviceOrd HostDeviceVector<T>::Device() const {
return impl_->Device();
}
template <typename T> template <typename T>
T* HostDeviceVector<T>::DevicePointer() { T* HostDeviceVector<T>::DevicePointer() {
@ -389,14 +392,9 @@ GPUAccess HostDeviceVector<T>::DeviceAccess() const {
return impl_->Access(); return impl_->Access();
} }
template <typename T>
void HostDeviceVector<T>::SetDevice(int device) const {
impl_->SetDevice(device);
}
template <typename T> template <typename T>
void HostDeviceVector<T>::SetDevice(DeviceOrd device) const { void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
impl_->SetDevice(device.ordinal); impl_->SetDevice(device);
} }
template <typename T> template <typename T>
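The placement rules enforced by `SetDevice` are unchanged, only expressed in terms of `DeviceOrd`: moving between the host and a CUDA device is fine, while switching directly between two different CUDA ordinals still hits the mismatch check. A sketch, assuming a CUDA-enabled build (the function name is illustrative):

```cpp
#include "xgboost/context.h"             // DeviceOrd
#include "xgboost/host_device_vector.h"  // HostDeviceVector

void PlacementRules() {
  using xgboost::DeviceOrd;
  xgboost::HostDeviceVector<float> v(8, 0.0f, DeviceOrd::CPU());
  v.SetDevice(DeviceOrd::CUDA(0));    // host -> cuda:0 is allowed
  // v.SetDevice(DeviceOrd::CUDA(1)); // cuda:0 -> cuda:1 would trip the ordinal check above
  v.SetDevice(DeviceOrd::CPU());      // the data is synced back to the host
}
```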


@ -8,16 +8,12 @@
#include "xgboost/context.h" // Context #include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h" // HostDeviceVector #include "xgboost/host_device_vector.h" // HostDeviceVector
namespace xgboost { namespace xgboost::common::cuda_impl {
namespace common {
namespace cuda_impl {
double Reduce(Context const* ctx, HostDeviceVector<float> const& values) { double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
values.SetDevice(ctx->gpu_id); values.SetDevice(ctx->Device());
auto const d_values = values.ConstDeviceSpan(); auto const d_values = values.ConstDeviceSpan();
dh::XGBCachingDeviceAllocator<char> alloc; dh::XGBCachingDeviceAllocator<char> alloc;
return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0, return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
thrust::plus<float>{}); thrust::plus<float>{});
} }
} // namespace cuda_impl } // namespace xgboost::common::cuda_impl
} // namespace common
} // namespace xgboost


@ -24,7 +24,7 @@ struct OptionalWeights {
inline OptionalWeights MakeOptionalWeights(Context const* ctx, inline OptionalWeights MakeOptionalWeights(Context const* ctx,
HostDeviceVector<float> const& weights) { HostDeviceVector<float> const& weights) {
if (ctx->IsCUDA()) { if (ctx->IsCUDA()) {
weights.SetDevice(ctx->gpu_id); weights.SetDevice(ctx->Device());
} }
return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()}; return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()};
} }
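A one-line sketch of the helper in use, assuming a `Context* ctx` and a `MetaInfo info` in scope:

```cpp
// Weights are moved to ctx->Device() before the device span is taken.
auto weights = xgboost::common::MakeOptionalWeights(ctx, info.weights_);
```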


@ -207,10 +207,10 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
// summary does the output element come from) result by definition of merged rank. So we // summary does the output element come from) result by definition of merged rank. So we
// run it in 2 passes to obtain the merge path and then customize the standard merge // run it in 2 passes to obtain the merge path and then customize the standard merge
// algorithm. // algorithm.
void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x, void MergeImpl(DeviceOrd device, Span<SketchEntry const> const &d_x,
Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y, Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) { Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device.ordinal));
CHECK_EQ(d_x.size() + d_y.size(), out.size()); CHECK_EQ(d_x.size() + d_y.size(), out.size());
CHECK_EQ(x_ptr.size(), out_ptr.size()); CHECK_EQ(x_ptr.size(), out_ptr.size());
CHECK_EQ(y_ptr.size(), out_ptr.size()); CHECK_EQ(y_ptr.size(), out_ptr.size());
@ -308,7 +308,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr, void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
common::Span<OffsetT> cuts_ptr, common::Span<OffsetT> cuts_ptr,
size_t total_cuts, Span<float> weights) { size_t total_cuts, Span<float> weights) {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
Span<SketchEntry> out; Span<SketchEntry> out;
dh::device_vector<SketchEntry> cuts; dh::device_vector<SketchEntry> cuts;
bool first_window = this->Current().empty(); bool first_window = this->Current().empty();
@ -367,7 +367,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
* pruning or merging. We preserve the first type and remove the second type. * pruning or merging. We preserve the first type and remove the second type.
*/ */
timer_.Start(__func__); timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1); CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
dh::XGBCachingDeviceAllocator<char> alloc; dh::XGBCachingDeviceAllocator<char> alloc;
@ -407,7 +407,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
void SketchContainer::Prune(size_t to) { void SketchContainer::Prune(size_t to) {
timer_.Start(__func__); timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
OffsetT to_total = 0; OffsetT to_total = 0;
auto& h_columns_ptr = columns_ptr_b_.HostVector(); auto& h_columns_ptr = columns_ptr_b_.HostVector();
@ -442,7 +442,7 @@ void SketchContainer::Prune(size_t to) {
void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr, void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
Span<SketchEntry const> that) { Span<SketchEntry const> that) {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
timer_.Start(__func__); timer_.Start(__func__);
if (this->Current().size() == 0) { if (this->Current().size() == 0) {
CHECK_EQ(this->columns_ptr_.HostVector().back(), 0); CHECK_EQ(this->columns_ptr_.HostVector().back(), 0);
@ -477,7 +477,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
} }
void SketchContainer::FixError() { void SketchContainer::FixError() {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan(); auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
auto in = dh::ToSpan(this->Current()); auto in = dh::ToSpan(this->Current());
dh::LaunchN(in.size(), [=] __device__(size_t idx) { dh::LaunchN(in.size(), [=] __device__(size_t idx) {
@ -502,7 +502,7 @@ void SketchContainer::FixError() {
} }
void SketchContainer::AllReduce(bool is_column_split) { void SketchContainer::AllReduce(bool is_column_split) {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
auto world = collective::GetWorldSize(); auto world = collective::GetWorldSize();
if (world == 1 || is_column_split) { if (world == 1 || is_column_split) {
return; return;
@ -529,15 +529,15 @@ void SketchContainer::AllReduce(bool is_column_split) {
auto offset = rank * d_columns_ptr.size(); auto offset = rank * d_columns_ptr.size();
thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(), thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(),
gathered_ptrs.begin() + offset); gathered_ptrs.begin() + offset);
collective::AllReduce<collective::Operation::kSum>(device_, gathered_ptrs.data().get(), collective::AllReduce<collective::Operation::kSum>(device_.ordinal, gathered_ptrs.data().get(),
gathered_ptrs.size()); gathered_ptrs.size());
// Get the data from all workers. // Get the data from all workers.
std::vector<size_t> recv_lengths; std::vector<size_t> recv_lengths;
dh::caching_device_vector<char> recvbuf; dh::caching_device_vector<char> recvbuf;
collective::AllGatherV(device_, this->Current().data().get(), collective::AllGatherV(device_.ordinal, this->Current().data().get(),
dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf); dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf);
collective::Synchronize(device_); collective::Synchronize(device_.ordinal);
// Segment the received data. // Segment the received data.
auto s_recvbuf = dh::ToSpan(recvbuf); auto s_recvbuf = dh::ToSpan(recvbuf);
@ -584,7 +584,7 @@ struct InvalidCatOp {
void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) { void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
timer_.Start(__func__); timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
p_cuts->min_vals_.Resize(num_columns_); p_cuts->min_vals_.Resize(num_columns_);
// Sync between workers. // Sync between workers.


@ -41,7 +41,7 @@ class SketchContainer {
bst_row_t num_rows_; bst_row_t num_rows_;
bst_feature_t num_columns_; bst_feature_t num_columns_;
int32_t num_bins_; int32_t num_bins_;
int32_t device_; DeviceOrd device_;
// Double buffer as neither prune nor merge can be performed inplace. // Double buffer as neither prune nor merge can be performed inplace.
dh::device_vector<SketchEntry> entries_a_; dh::device_vector<SketchEntry> entries_a_;
@ -93,35 +93,32 @@ class SketchContainer {
* \param num_rows Total number of rows in known dataset (typically the rows in current worker). * \param num_rows Total number of rows in known dataset (typically the rows in current worker).
* \param device GPU ID. * \param device GPU ID.
*/ */
SketchContainer(HostDeviceVector<FeatureType> const &feature_types, SketchContainer(HostDeviceVector<FeatureType> const& feature_types, int32_t max_bin,
int32_t max_bin, bst_feature_t num_columns, bst_feature_t num_columns, bst_row_t num_rows, DeviceOrd device)
bst_row_t num_rows, int32_t device) : num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
: num_rows_{num_rows}, CHECK(device.IsCUDA());
num_columns_{num_columns}, num_bins_{max_bin}, device_{device} { // Initialize Sketches for this dmatrix
CHECK_GE(device, 0); this->columns_ptr_.SetDevice(device_);
// Initialize Sketches for this dmatrix this->columns_ptr_.Resize(num_columns + 1);
this->columns_ptr_.SetDevice(device_); this->columns_ptr_b_.SetDevice(device_);
this->columns_ptr_.Resize(num_columns + 1); this->columns_ptr_b_.Resize(num_columns + 1);
this->columns_ptr_b_.SetDevice(device_);
this->columns_ptr_b_.Resize(num_columns + 1);
this->feature_types_.Resize(feature_types.Size()); this->feature_types_.Resize(feature_types.Size());
this->feature_types_.Copy(feature_types); this->feature_types_.Copy(feature_types);
// Pull to device. // Pull to device.
this->feature_types_.SetDevice(device); this->feature_types_.SetDevice(device);
this->feature_types_.ConstDeviceSpan(); this->feature_types_.ConstDeviceSpan();
this->feature_types_.ConstHostSpan(); this->feature_types_.ConstHostSpan();
auto d_feature_types = feature_types_.ConstDeviceSpan(); auto d_feature_types = feature_types_.ConstDeviceSpan();
has_categorical_ = has_categorical_ =
!d_feature_types.empty() && !d_feature_types.empty() &&
thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{});
common::IsCatOp{});
timer_.Init(__func__); timer_.Init(__func__);
} }
/* \brief Return GPU ID for this container. */ /* \brief Return GPU ID for this container. */
int32_t DeviceIdx() const { return device_; } [[nodiscard]] DeviceOrd DeviceIdx() const { return device_; }
/* \brief Whether the predictor matrix contains categorical features. */ /* \brief Whether the predictor matrix contains categorical features. */
bool HasCategorical() const { return has_categorical_; } bool HasCategorical() const { return has_categorical_; }
/* \brief Accumulate weights of duplicated entries in input. */ /* \brief Accumulate weights of duplicated entries in input. */
@ -175,7 +172,7 @@ class SketchContainer {
template <typename KeyComp = thrust::equal_to<size_t>> template <typename KeyComp = thrust::equal_to<size_t>>
size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) { size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
timer_.Start(__func__); timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
this->columns_ptr_.SetDevice(device_); this->columns_ptr_.SetDevice(device_);
Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan(); Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
CHECK_EQ(d_column_scan.size(), num_columns_ + 1); CHECK_EQ(d_column_scan.size(), num_columns_ + 1);
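The container now requires a CUDA device up front (`CHECK(device.IsCUDA())`) instead of a non-negative integer. A construction sketch, assuming a populated `HostDeviceVector<FeatureType> feature_types` and at least one visible GPU:

```cpp
using xgboost::DeviceOrd;
xgboost::common::SketchContainer sketch(feature_types, /*max_bin=*/256,
                                        /*num_columns=*/16, /*num_rows=*/1024,
                                        DeviceOrd::CUDA(0));
// DeviceIdx() now reports a DeviceOrd rather than an int ordinal.
CHECK(sketch.DeviceIdx().IsCUDA());
```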


@ -15,8 +15,7 @@
#include "xgboost/linalg.h" // Tensor, UnravelIndex, Apply #include "xgboost/linalg.h" // Tensor, UnravelIndex, Apply
#include "xgboost/logging.h" // CHECK_EQ #include "xgboost/logging.h" // CHECK_EQ
namespace xgboost { namespace xgboost::common {
namespace common {
void Median(Context const* ctx, linalg::Tensor<float, 2> const& t, void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out) { HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out) {
if (!ctx->IsCPU()) { if (!ctx->IsCPU()) {
@ -46,8 +45,8 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
} }
void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out) { void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out) {
v.SetDevice(ctx->gpu_id); v.SetDevice(ctx->Device());
out->SetDevice(ctx->gpu_id); out->SetDevice(ctx->Device());
out->Reshape(1); out->Reshape(1);
if (ctx->IsCPU()) { if (ctx->IsCPU()) {
@ -62,5 +61,4 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device())); cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
} }
} }
} // namespace common } // namespace xgboost::common
} // namespace xgboost


@ -15,14 +15,12 @@
#include "xgboost/host_device_vector.h" // HostDeviceVector #include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/linalg.h" // linalg::TensorView, UnravelIndex, Apply #include "xgboost/linalg.h" // linalg::TensorView, UnravelIndex, Apply
namespace xgboost { namespace xgboost::common::cuda_impl {
namespace common {
namespace cuda_impl {
void Median(Context const* ctx, linalg::TensorView<float const, 2> t, void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
common::OptionalWeights weights, linalg::Tensor<float, 1>* out) { common::OptionalWeights weights, linalg::Tensor<float, 1>* out) {
CHECK_GE(t.Shape(1), 1); CHECK_GE(t.Shape(1), 1);
HostDeviceVector<std::size_t> segments(t.Shape(1) + 1, 0); HostDeviceVector<std::size_t> segments(t.Shape(1) + 1, 0);
segments.SetDevice(ctx->gpu_id); segments.SetDevice(ctx->Device());
auto d_segments = segments.DeviceSpan(); auto d_segments = segments.DeviceSpan();
dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(), dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(),
[=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; }); [=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; });
@ -31,7 +29,7 @@ void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape())); return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
}); });
out->SetDevice(ctx->gpu_id); out->SetDevice(ctx->Device());
out->Reshape(t.Shape(1)); out->Reshape(t.Shape(1));
if (weights.Empty()) { if (weights.Empty()) {
common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it, common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it,
@ -60,6 +58,4 @@ void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorV
dh::TemporaryArray<char> temp{bytes}; dh::TemporaryArray<char> temp{bytes};
cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s); cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s);
} }
} // namespace cuda_impl } // namespace xgboost::common::cuda_impl
} // namespace common
} // namespace xgboost


@ -160,7 +160,7 @@ void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, Se
auto d_sorted_idx = dh::ToSpan(sorted_idx); auto d_sorted_idx = dh::ToSpan(sorted_idx);
auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx)); auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx));
quantiles->SetDevice(ctx->gpu_id); quantiles->SetDevice(ctx->Device());
quantiles->Resize(n_segments); quantiles->Resize(n_segments);
auto d_results = quantiles->DeviceSpan(); auto d_results = quantiles->DeviceSpan();
@ -220,7 +220,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
scan_val, weights_cdf.begin()); scan_val, weights_cdf.begin());
auto n_segments = std::distance(seg_beg, seg_end) - 1; auto n_segments = std::distance(seg_beg, seg_end) - 1;
quantiles->SetDevice(ctx->gpu_id); quantiles->SetDevice(ctx->Device());
quantiles->Resize(n_segments); quantiles->Resize(n_segments);
auto d_results = quantiles->DeviceSpan(); auto d_results = quantiles->DeviceSpan();
auto d_weight_cdf = dh::ToSpan(weights_cdf); auto d_weight_cdf = dh::ToSpan(weights_cdf);


@ -60,8 +60,8 @@ class Transform {
template <typename Functor> template <typename Functor>
struct Evaluator { struct Evaluator {
public: public:
Evaluator(Functor func, Range range, int32_t n_threads, int32_t device_idx) Evaluator(Functor func, Range range, int32_t n_threads, DeviceOrd device)
: func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device_idx} {} : func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device} {}
/*! /*!
* \brief Evaluate the functor with input pointers to HostDeviceVector. * \brief Evaluate the functor with input pointers to HostDeviceVector.
@ -71,7 +71,7 @@ class Transform {
*/ */
template <typename... HDV> template <typename... HDV>
void Eval(HDV... vectors) const { void Eval(HDV... vectors) const {
bool on_device = device_ >= 0; bool on_device = device_.IsCUDA();
if (on_device) { if (on_device) {
LaunchCUDA(func_, vectors...); LaunchCUDA(func_, vectors...);
@ -116,11 +116,11 @@ class Transform {
} }
// Recursive unpack for Shard. // Recursive unpack for Shard.
template <typename T> template <typename T>
void UnpackShard(int device, const HostDeviceVector<T> *vector) const { void UnpackShard(DeviceOrd device, const HostDeviceVector<T> *vector) const {
vector->SetDevice(device); vector->SetDevice(device);
} }
template <typename Head, typename... Rest> template <typename Head, typename... Rest>
void UnpackShard(int device, void UnpackShard(DeviceOrd device,
const HostDeviceVector<Head> *_vector, const HostDeviceVector<Head> *_vector,
const HostDeviceVector<Rest> *... _vectors) const { const HostDeviceVector<Rest> *... _vectors) const {
_vector->SetDevice(device); _vector->SetDevice(device);
@ -140,7 +140,7 @@ class Transform {
// granularity is used in data vector. // granularity is used in data vector.
size_t shard_size = range_size; size_t shard_size = range_size;
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)}; Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
const int kGrids = const int kGrids =
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads)); static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
if (kGrids == 0) { if (kGrids == 0) {
@ -174,7 +174,7 @@ class Transform {
/*! \brief Range object specifying parallel threads index range. */ /*! \brief Range object specifying parallel threads index range. */
Range range_; Range range_;
int32_t n_threads_; int32_t n_threads_;
int32_t device_; DeviceOrd device_;
}; };
public: public:
@ -192,8 +192,8 @@ class Transform {
*/ */
template <typename Functor> template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range, int32_t n_threads, static Evaluator<Functor> Init(Functor func, Range const range, int32_t n_threads,
int32_t device_idx) { DeviceOrd device) {
return Evaluator<Functor>{func, std::move(range), n_threads, device_idx}; return Evaluator<Functor>{func, std::move(range), n_threads, device};
} }
}; };
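For callers of common::Transform, only the last argument of Init changes. A hedged sketch of a call under the new signature, assuming this code lives inside namespace xgboost like the surrounding sources, with a Context const* ctx and a HostDeviceVector<float> out_vec in scope (all illustrative):

// Sketch only: assumes common/transform.h and common/span.h from this repository.
common::Transform<>::Init(
    [=] XGBOOST_DEVICE(std::size_t idx, common::Span<float> out) { out[idx] = 0.0f; },
    common::Range{0, static_cast<common::Range::DifferenceType>(out_vec.Size())},
    ctx->Threads(),
    ctx->Device())  // previously an int32_t device ordinal, with -1 selecting the CPU path
    .Eval(&out_vec);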
@ -20,7 +20,6 @@ namespace xgboost {
DMLC_REGISTER_PARAMETER(Context); DMLC_REGISTER_PARAMETER(Context);
bst_d_ordinal_t constexpr Context::kCpuId;
std::int64_t constexpr Context::kDefaultSeed; std::int64_t constexpr Context::kDefaultSeed;
Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {} Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}
@ -82,7 +81,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
return std::nullopt; return std::nullopt;
} }
std::int32_t parsed_id{Context::kCpuId}; std::int32_t parsed_id{DeviceOrd::CPUOrdinal()};
auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id); auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id);
if (res.ec != std::errc()) { if (res.ec != std::errc()) {
return std::nullopt; return std::nullopt;
@ -119,7 +118,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':'); auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':');
DeviceOrd device; DeviceOrd device;
device.ordinal = Context::InvalidOrdinal(); // mark it invalid for check. device.ordinal = DeviceOrd::InvalidOrdinal(); // mark it invalid for check.
if (split_it == s_device.cend()) { if (split_it == s_device.cend()) {
// no ordinal. // no ordinal.
if (s_device == DeviceSym::CPU()) { if (s_device == DeviceSym::CPU()) {
@ -147,7 +146,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
device = DeviceOrd::CUDA(opt_id.value()); device = DeviceOrd::CUDA(opt_id.value());
} }
if (device.ordinal < Context::kCpuId) { if (device.ordinal < DeviceOrd::CPUOrdinal()) {
fatal(); fatal();
} }
device = CUDAOrdinal(device, fail_on_invalid_gpu_id); device = CUDAOrdinal(device, fail_on_invalid_gpu_id);
@ -156,6 +155,28 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
} }
} // namespace } // namespace
std::ostream& operator<<(std::ostream& os, DeviceOrd ord) {
os << ord.Name();
return os;
}
void Context::Init(Args const& kwargs) {
auto unknown = this->UpdateAllowUnknown(kwargs);
if (!unknown.empty()) {
std::stringstream ss;
std::size_t i = 0;
ss << "[Internal Error] Unknown parameters passed to the Context {";
for (auto const& [k, _] : unknown) {
ss << '"' << k << '"';
if (++i != unknown.size()) {
ss << ", ";
}
}
ss << "}\n";
LOG(FATAL) << ss.str();
}
}
void Context::ConfigureGpuId(bool require_gpu) { void Context::ConfigureGpuId(bool require_gpu) {
if (this->IsCPU() && require_gpu) { if (this->IsCPU() && require_gpu) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}}); this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}});
@ -178,7 +199,7 @@ void Context::SetDeviceOrdinal(Args const& kwargs) {
error::WarnDeprecatedGPUId(); error::WarnDeprecatedGPUId();
auto opt_id = ParseInt(StringView{gpu_id_it->second}); auto opt_id = ParseInt(StringView{gpu_id_it->second});
CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second; CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second;
if (opt_id.value() > Context::kCpuId) { if (opt_id.value() > DeviceOrd::CPUOrdinal()) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}}); this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}});
} else { } else {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}}); this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}});
@ -194,9 +215,9 @@ void Context::SetDeviceOrdinal(Args const& kwargs) {
this->SetDevice(new_d); this->SetDevice(new_d);
if (this->IsCPU()) { if (this->IsCPU()) {
CHECK_EQ(this->device_.ordinal, kCpuId); CHECK_EQ(this->device_.ordinal, DeviceOrd::CPUOrdinal());
} else { } else {
CHECK_GT(this->device_.ordinal, kCpuId); CHECK_GT(this->device_.ordinal, DeviceOrd::CPUOrdinal());
} }
} }
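With kCpuId gone from Context, the sentinel values live on DeviceOrd and user-facing configuration goes through the device string parsed above. A small usage sketch, assuming xgboost/context.h; the chosen ordinal is illustrative:

#include "xgboost/context.h"

xgboost::Context MakeCudaContext() {
  xgboost::Context ctx;
  ctx.Init(xgboost::Args{{"device", "cuda:1"}});  // parsed into DeviceOrd::CUDA(1)
  // ctx.Device().IsCUDA() is true and ctx.Ordinal() == 1; passing "cpu" instead
  // yields DeviceOrd::CPU(), whose ordinal equals DeviceOrd::CPUOrdinal().
  return ctx;
}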
@ -687,13 +687,13 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
linalg::Stack(&this->labels, that.labels); linalg::Stack(&this->labels, that.labels);
this->weights_.SetDevice(that.weights_.DeviceIdx()); this->weights_.SetDevice(that.weights_.Device());
this->weights_.Extend(that.weights_); this->weights_.Extend(that.weights_);
this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.DeviceIdx()); this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.Device());
this->labels_lower_bound_.Extend(that.labels_lower_bound_); this->labels_lower_bound_.Extend(that.labels_lower_bound_);
this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.DeviceIdx()); this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.Device());
this->labels_upper_bound_.Extend(that.labels_upper_bound_); this->labels_upper_bound_.Extend(that.labels_upper_bound_);
linalg::Stack(&this->base_margin_, that.base_margin_); linalg::Stack(&this->base_margin_, that.base_margin_);
@ -723,7 +723,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
} }
if (!that.feature_weights.Empty()) { if (!that.feature_weights.Empty()) {
this->feature_weights.Resize(that.feature_weights.Size()); this->feature_weights.Resize(that.feature_weights.Size());
this->feature_weights.SetDevice(that.feature_weights.DeviceIdx()); this->feature_weights.SetDevice(that.feature_weights.Device());
this->feature_weights.Copy(that.feature_weights); this->feature_weights.Copy(that.feature_weights);
} }
} }
@ -738,22 +738,22 @@ void MetaInfo::SynchronizeNumberOfColumns() {
namespace { namespace {
template <typename T> template <typename T>
void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) { void CheckDevice(DeviceOrd device, HostDeviceVector<T> const& v) {
bool valid = v.Device().IsCPU() || device == Context::kCpuId || v.DeviceIdx() == device; bool valid = v.Device().IsCPU() || device.IsCPU() || v.Device() == device;
if (!valid) { if (!valid) {
LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than " LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
"the booster. The device ordinal of the data is: " "the booster. The device ordinal of the data is: "
<< v.DeviceIdx() << "; the device ordinal of the Booster is: " << device; << v.Device() << "; the device ordinal of the Booster is: " << device;
} }
} }
template <typename T, std::int32_t D> template <typename T, std::int32_t D>
void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) { void CheckDevice(DeviceOrd device, linalg::Tensor<T, D> const& v) {
CheckDevice(device, *v.Data()); CheckDevice(device, *v.Data());
} }
} // anonymous namespace } // anonymous namespace
void MetaInfo::Validate(std::int32_t device) const { void MetaInfo::Validate(DeviceOrd device) const {
if (group_ptr_.size() != 0 && weights_.Size() != 0) { if (group_ptr_.size() != 0 && weights_.Size() != 0) {
CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << error::GroupWeight(); CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << error::GroupWeight();
return; return;
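Callers of MetaInfo::Validate now hand over the booster's typed device rather than an integer ordinal. A one-function sketch, assuming xgboost/context.h and xgboost/data.h; the wrapper name is illustrative:

#include "xgboost/context.h"
#include "xgboost/data.h"

// Check that the data lives on the same device as the booster (or on the CPU).
void ValidateAgainstBooster(xgboost::MetaInfo const& info, xgboost::Context const* ctx) {
  info.Validate(ctx->Device());  // previously info.Validate(ctx->gpu_id)
}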
@ -29,13 +29,13 @@ template <typename T, int32_t D>
void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tensor<T, D>* p_out) { void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tensor<T, D>* p_out) {
ArrayInterface<D> array(arr_interface); ArrayInterface<D> array(arr_interface);
if (array.n == 0) { if (array.n == 0) {
p_out->SetDevice(0); p_out->SetDevice(DeviceOrd::CUDA(0));
p_out->Reshape(array.shape); p_out->Reshape(array.shape);
return; return;
} }
CHECK_EQ(array.valid.Capacity(), 0) CHECK_EQ(array.valid.Capacity(), 0)
<< "Meta info like label or weight can not have missing value."; << "Meta info like label or weight can not have missing value.";
auto ptr_device = SetDeviceToPtr(array.data); auto ptr_device = DeviceOrd::CUDA(SetDeviceToPtr(array.data));
p_out->SetDevice(ptr_device); p_out->SetDevice(ptr_device);
if (array.is_contiguous && array.type == ToDType<T>::kType) { if (array.is_contiguous && array.type == ToDType<T>::kType) {
@ -50,7 +50,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
return; return;
} }
p_out->Reshape(array.shape); p_out->Reshape(array.shape);
auto t = p_out->View(DeviceOrd::CUDA(ptr_device)); auto t = p_out->View(ptr_device);
linalg::ElementWiseTransformDevice( linalg::ElementWiseTransformDevice(
t, t,
[=] __device__(size_t i, T) { [=] __device__(size_t i, T) {
@ -86,7 +86,7 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
}); });
dh::caching_device_vector<bool> flag(1); dh::caching_device_vector<bool> flag(1);
auto d_flag = dh::ToSpan(flag); auto d_flag = dh::ToSpan(flag);
auto d = SetDeviceToPtr(array_interface.data); auto d = DeviceOrd::CUDA(SetDeviceToPtr(array_interface.data));
dh::LaunchN(1, [=] __device__(size_t) { d_flag[0] = true; }); dh::LaunchN(1, [=] __device__(size_t) { d_flag[0] = true; });
dh::LaunchN(array_interface.Shape(0) - 1, [=] __device__(size_t i) { dh::LaunchN(array_interface.Shape(0) - 1, [=] __device__(size_t i) {
auto typed = TypedIndex<uint32_t, 1>{array_interface}; auto typed = TypedIndex<uint32_t, 1>{array_interface};
@ -28,8 +28,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
CudfAdapterBatch(common::Span<ArrayInterface<1>> columns, size_t num_rows) CudfAdapterBatch(common::Span<ArrayInterface<1>> columns, size_t num_rows)
: columns_(columns), : columns_(columns),
num_rows_(num_rows) {} num_rows_(num_rows) {}
size_t Size() const { return num_rows_ * columns_.size(); } [[nodiscard]] std::size_t Size() const { return num_rows_ * columns_.size(); }
__device__ __forceinline__ COOTuple GetElement(size_t idx) const { [[nodiscard]] __device__ __forceinline__ COOTuple GetElement(size_t idx) const {
size_t column_idx = idx % columns_.size(); size_t column_idx = idx % columns_.size();
size_t row_idx = idx / columns_.size(); size_t row_idx = idx / columns_.size();
auto const& column = columns_[column_idx]; auto const& column = columns_[column_idx];
@ -39,7 +39,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
return {row_idx, column_idx, value}; return {row_idx, column_idx, value};
} }
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
auto const& column = columns_[fidx]; auto const& column = columns_[fidx];
float value = column.valid.Data() == nullptr || column.valid.Check(ridx) float value = column.valid.Data() == nullptr || column.valid.Check(ridx)
? column(ridx) ? column(ridx)
@ -47,8 +47,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
return value; return value;
} }
XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; } [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); } [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }
private: private:
common::Span<ArrayInterface<1>> columns_; common::Span<ArrayInterface<1>> columns_;
@ -120,14 +120,14 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
return; return;
} }
device_idx_ = dh::CudaGetPointerDevice(first_column.data); device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(first_column.data));
CHECK_NE(device_idx_, Context::kCpuId); CHECK(device_.IsCUDA());
dh::safe_cuda(cudaSetDevice(device_idx_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
for (auto& json_col : json_columns) { for (auto& json_col : json_columns) {
auto column = ArrayInterface<1>(get<Object const>(json_col)); auto column = ArrayInterface<1>(get<Object const>(json_col));
columns.push_back(column); columns.push_back(column);
num_rows_ = std::max(num_rows_, column.Shape(0)); num_rows_ = std::max(num_rows_, column.Shape(0));
CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data)) CHECK_EQ(device_.ordinal, dh::CudaGetPointerDevice(column.data))
<< "All columns should use the same device."; << "All columns should use the same device.";
CHECK_EQ(num_rows_, column.Shape(0)) CHECK_EQ(num_rows_, column.Shape(0))
<< "All columns should have same number of rows."; << "All columns should have same number of rows.";
@ -143,15 +143,15 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
return batch_; return batch_;
} }
size_t NumRows() const { return num_rows_; } [[nodiscard]] std::size_t NumRows() const { return num_rows_; }
size_t NumColumns() const { return columns_.size(); } [[nodiscard]] std::size_t NumColumns() const { return columns_.size(); }
int32_t DeviceIdx() const { return device_idx_; } [[nodiscard]] DeviceOrd Device() const { return device_; }
private: private:
CudfAdapterBatch batch_; CudfAdapterBatch batch_;
dh::device_vector<ArrayInterface<1>> columns_; dh::device_vector<ArrayInterface<1>> columns_;
size_t num_rows_{0}; size_t num_rows_{0};
int32_t device_idx_{Context::kCpuId}; DeviceOrd device_{DeviceOrd::CPU()};
}; };
class CupyAdapterBatch : public detail::NoMetaInfo { class CupyAdapterBatch : public detail::NoMetaInfo {
@ -159,22 +159,22 @@ class CupyAdapterBatch : public detail::NoMetaInfo {
CupyAdapterBatch() = default; CupyAdapterBatch() = default;
explicit CupyAdapterBatch(ArrayInterface<2> array_interface) explicit CupyAdapterBatch(ArrayInterface<2> array_interface)
: array_interface_(std::move(array_interface)) {} : array_interface_(std::move(array_interface)) {}
size_t Size() const { [[nodiscard]] std::size_t Size() const {
return array_interface_.Shape(0) * array_interface_.Shape(1); return array_interface_.Shape(0) * array_interface_.Shape(1);
} }
__device__ COOTuple GetElement(size_t idx) const { [[nodiscard]]__device__ COOTuple GetElement(size_t idx) const {
size_t column_idx = idx % array_interface_.Shape(1); size_t column_idx = idx % array_interface_.Shape(1);
size_t row_idx = idx / array_interface_.Shape(1); size_t row_idx = idx / array_interface_.Shape(1);
float value = array_interface_(row_idx, column_idx); float value = array_interface_(row_idx, column_idx);
return {row_idx, column_idx, value}; return {row_idx, column_idx, value};
} }
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
float value = array_interface_(ridx, fidx); float value = array_interface_(ridx, fidx);
return value; return value;
} }
XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); } [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); } [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
private: private:
ArrayInterface<2> array_interface_; ArrayInterface<2> array_interface_;
@ -189,28 +189,28 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
if (array_interface_.Shape(0) == 0) { if (array_interface_.Shape(0) == 0) {
return; return;
} }
device_idx_ = dh::CudaGetPointerDevice(array_interface_.data); device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(array_interface_.data));
CHECK_NE(device_idx_, Context::kCpuId); CHECK(device_.IsCUDA());
} }
explicit CupyAdapter(std::string cuda_interface_str) explicit CupyAdapter(std::string cuda_interface_str)
: CupyAdapter{StringView{cuda_interface_str}} {} : CupyAdapter{StringView{cuda_interface_str}} {}
const CupyAdapterBatch& Value() const override { return batch_; } [[nodiscard]] const CupyAdapterBatch& Value() const override { return batch_; }
size_t NumRows() const { return array_interface_.Shape(0); } [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
size_t NumColumns() const { return array_interface_.Shape(1); } [[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }
int32_t DeviceIdx() const { return device_idx_; } [[nodiscard]] DeviceOrd Device() const { return device_; }
private: private:
ArrayInterface<2> array_interface_; ArrayInterface<2> array_interface_;
CupyAdapterBatch batch_; CupyAdapterBatch batch_;
int32_t device_idx_ {Context::kCpuId}; DeviceOrd device_{DeviceOrd::CPU()};
}; };
// Returns maximum row length // Returns maximum row length
template <typename AdapterBatchT> template <typename AdapterBatchT>
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx, std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, DeviceOrd device,
float missing) { float missing) {
dh::safe_cuda(cudaSetDevice(device_idx)); dh::safe_cuda(cudaSetDevice(device.ordinal));
IsValidFunctor is_valid(missing); IsValidFunctor is_valid(missing);
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes())); dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
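The device adapters now report a DeviceOrd instead of an integer, with the default CPU ordinal doubling as the marker for empty input. A hedged sketch of consuming that API, assuming the repository's device_adapter.cuh and a __cuda_array_interface__ JSON string arr (hypothetical variable):

xgboost::data::CupyAdapter adapter{arr};
xgboost::DeviceOrd device = adapter.Device();  // replaces the old int DeviceIdx()
if (device.IsCPU()) {
  // Default-constructed ordinal: the array was empty, so the adapter owns no GPU data.
}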
@ -94,22 +94,18 @@ __global__ void CompressBinEllpackKernel(
} }
// Construct an ELLPACK matrix with the given number of empty rows. // Construct an ELLPACK matrix with the given number of empty rows.
EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense,
bool is_dense, size_t row_stride, size_t row_stride, size_t n_rows)
size_t n_rows) : is_dense(is_dense), cuts_(std::move(cuts)), row_stride(row_stride), n_rows(n_rows) {
: is_dense(is_dense),
cuts_(std::move(cuts)),
row_stride(row_stride),
n_rows(n_rows) {
monitor_.Init("ellpack_page"); monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device.ordinal));
monitor_.Start("InitCompressedData"); monitor_.Start("InitCompressedData");
InitCompressedData(device); InitCompressedData(device);
monitor_.Stop("InitCompressedData"); monitor_.Stop("InitCompressedData");
} }
EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts,
const SparsePage &page, bool is_dense, const SparsePage &page, bool is_dense,
size_t row_stride, size_t row_stride,
common::Span<FeatureType const> feature_types) common::Span<FeatureType const> feature_types)
@ -123,7 +119,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param) EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
: is_dense(dmat->IsDense()) { : is_dense(dmat->IsDense()) {
monitor_.Init("ellpack_page"); monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
n_rows = dmat->Info().num_row_; n_rows = dmat->Info().num_row_;
@ -138,15 +134,15 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
monitor_.Stop("Quantiles"); monitor_.Stop("Quantiles");
monitor_.Start("InitCompressedData"); monitor_.Start("InitCompressedData");
this->InitCompressedData(ctx->gpu_id); this->InitCompressedData(ctx->Device());
monitor_.Stop("InitCompressedData"); monitor_.Stop("InitCompressedData");
dmat->Info().feature_types.SetDevice(ctx->gpu_id); dmat->Info().feature_types.SetDevice(ctx->Device());
auto ft = dmat->Info().feature_types.ConstDeviceSpan(); auto ft = dmat->Info().feature_types.ConstDeviceSpan();
monitor_.Start("BinningCompression"); monitor_.Start("BinningCompression");
CHECK(dmat->SingleColBlock()); CHECK(dmat->SingleColBlock());
for (const auto& batch : dmat->GetBatches<SparsePage>()) { for (const auto& batch : dmat->GetBatches<SparsePage>()) {
CreateHistIndices(ctx->gpu_id, batch, ft); CreateHistIndices(ctx->Device(), batch, ft);
} }
monitor_.Stop("BinningCompression"); monitor_.Stop("BinningCompression");
} }
@ -209,7 +205,7 @@ struct TupleScanOp {
// to remove missing data // to remove missing data
template <typename AdapterBatchT> template <typename AdapterBatchT>
void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType const> feature_types, void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType const> feature_types,
EllpackPageImpl* dst, int device_idx, float missing) { EllpackPageImpl* dst, DeviceOrd device, float missing) {
// Some witchcraft happens here // Some witchcraft happens here
// The goal is to copy valid elements out of the input to an ELLPACK matrix // The goal is to copy valid elements out of the input to an ELLPACK matrix
// with a given row stride, using no extra working memory Standard stream // with a given row stride, using no extra working memory Standard stream
@ -241,7 +237,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
// Tuple[2] = The index in the input data // Tuple[2] = The index in the input data
using Tuple = thrust::tuple<size_t, size_t, size_t>; using Tuple = thrust::tuple<size_t, size_t, size_t>;
auto device_accessor = dst->GetDeviceAccessor(device_idx); auto device_accessor = dst->GetDeviceAccessor(device);
common::CompressedBufferWriter writer(device_accessor.NumSymbols()); common::CompressedBufferWriter writer(device_accessor.NumSymbols());
auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
@ -280,10 +276,9 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
#endif #endif
} }
void WriteNullValues(EllpackPageImpl* dst, int device_idx, void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span<size_t> row_counts) {
common::Span<size_t> row_counts) {
// Write the null values // Write the null values
auto device_accessor = dst->GetDeviceAccessor(device_idx); auto device_accessor = dst->GetDeviceAccessor(device);
common::CompressedBufferWriter writer(device_accessor.NumSymbols()); common::CompressedBufferWriter writer(device_accessor.NumSymbols());
auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
auto row_stride = dst->row_stride; auto row_stride = dst->row_stride;
@ -300,11 +295,11 @@ void WriteNullValues(EllpackPageImpl* dst, int device_idx,
} }
template <typename AdapterBatch> template <typename AdapterBatch>
EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense, EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
common::Span<size_t> row_counts_span, common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride, common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, common::HistogramCuts const& cuts) { size_t n_rows, common::HistogramCuts const& cuts) {
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device.ordinal));
*this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows); *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
CopyDataToEllpack(batch, feature_types, this, device, missing); CopyDataToEllpack(batch, feature_types, this, device, missing);
@ -313,7 +308,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
#define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \ #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \
template EllpackPageImpl::EllpackPageImpl( \ template EllpackPageImpl::EllpackPageImpl( \
__BATCH_T batch, float missing, int device, bool is_dense, \ __BATCH_T batch, float missing, DeviceOrd device, bool is_dense, \
common::Span<size_t> row_counts_span, common::Span<FeatureType const> feature_types, \ common::Span<size_t> row_counts_span, common::Span<FeatureType const> feature_types, \
size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts); size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts);
@ -370,9 +365,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
[&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); [&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; });
row_stride = *std::max_element(it, it + page.Size()); row_stride = *std::max_element(it, it + page.Size());
CHECK_GE(ctx->gpu_id, 0); CHECK(ctx->IsCUDA());
monitor_.Start("InitCompressedData"); monitor_.Start("InitCompressedData");
InitCompressedData(ctx->gpu_id); InitCompressedData(ctx->Device());
monitor_.Stop("InitCompressedData"); monitor_.Stop("InitCompressedData");
// copy gidx // copy gidx
@ -382,7 +377,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(), dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream())); cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft); auto accessor = this->GetDeviceAccessor(ctx->Device(), ft);
auto null = accessor.NullValue(); auto null = accessor.NullValue();
CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null); CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null);
} }
@ -407,8 +402,7 @@ struct CopyPage {
}; };
// Copy the data from the given EllpackPage to the current page. // Copy the data from the given EllpackPage to the current page.
size_t EllpackPageImpl::Copy(int device, EllpackPageImpl const *page, size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) {
size_t offset) {
monitor_.Start("Copy"); monitor_.Start("Copy");
size_t num_elements = page->n_rows * page->row_stride; size_t num_elements = page->n_rows * page->row_stride;
CHECK_EQ(row_stride, page->row_stride); CHECK_EQ(row_stride, page->row_stride);
@ -468,7 +462,7 @@ struct CompactPage {
}; };
// Compacts the data from the given EllpackPage into the current page. // Compacts the data from the given EllpackPage into the current page.
void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page, void EllpackPageImpl::Compact(DeviceOrd device, EllpackPageImpl const* page,
common::Span<size_t> row_indexes) { common::Span<size_t> row_indexes) {
monitor_.Start("Compact"); monitor_.Start("Compact");
CHECK_EQ(row_stride, page->row_stride); CHECK_EQ(row_stride, page->row_stride);
@ -481,13 +475,12 @@ void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
} }
// Initialize the buffer to stored compressed features. // Initialize the buffer to stored compressed features.
void EllpackPageImpl::InitCompressedData(int device) { void EllpackPageImpl::InitCompressedData(DeviceOrd device) {
size_t num_symbols = NumSymbols(); size_t num_symbols = NumSymbols();
// Required buffer size for storing data matrix in ELLPack format. // Required buffer size for storing data matrix in ELLPack format.
size_t compressed_size_bytes = size_t compressed_size_bytes =
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols);
num_symbols);
gidx_buffer.SetDevice(device); gidx_buffer.SetDevice(device);
// Don't call fill unnecessarily // Don't call fill unnecessarily
if (gidx_buffer.Size() == 0) { if (gidx_buffer.Size() == 0) {
@ -499,7 +492,7 @@ void EllpackPageImpl::InitCompressedData(int device) {
} }
// Compress a CSR page into ELLPACK. // Compress a CSR page into ELLPACK.
void EllpackPageImpl::CreateHistIndices(int device, void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
const SparsePage& row_batch, const SparsePage& row_batch,
common::Span<FeatureType const> feature_types) { common::Span<FeatureType const> feature_types) {
if (row_batch.Size() == 0) return; if (row_batch.Size() == 0) return;
@ -509,7 +502,7 @@ void EllpackPageImpl::CreateHistIndices(int device,
// bin and compress entries in batches of rows // bin and compress entries in batches of rows
size_t gpu_batch_nrows = size_t gpu_batch_nrows =
std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)), std::min(dh::TotalMemory(device.ordinal) / (16 * row_stride * sizeof(Entry)),
static_cast<size_t>(row_batch.Size())); static_cast<size_t>(row_batch.Size()));
size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows); size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows);
@ -572,7 +565,7 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,
} }
EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor( EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
int device, common::Span<FeatureType const> feature_types) const { DeviceOrd device, common::Span<FeatureType const> feature_types) const {
gidx_buffer.SetDevice(device); gidx_buffer.SetDevice(device);
return {device, return {device,
cuts_, cuts_,
@ -586,7 +579,7 @@ EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
} }
EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor( EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor(
common::Span<FeatureType const> feature_types) const { common::Span<FeatureType const> feature_types) const {
return {Context::kCpuId, return {DeviceOrd::CPU(),
cuts_, cuts_,
is_dense, is_dense,
row_stride, row_stride,
@ -35,16 +35,17 @@ struct EllpackDeviceAccessor {
common::Span<const FeatureType> feature_types; common::Span<const FeatureType> feature_types;
EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts, EllpackDeviceAccessor(DeviceOrd device, const common::HistogramCuts& cuts, bool is_dense,
bool is_dense, size_t row_stride, size_t base_rowid, size_t row_stride, size_t base_rowid, size_t n_rows,
size_t n_rows,common::CompressedIterator<uint32_t> gidx_iter, common::CompressedIterator<uint32_t> gidx_iter,
common::Span<FeatureType const> feature_types) common::Span<FeatureType const> feature_types)
: is_dense(is_dense), : is_dense(is_dense),
row_stride(row_stride), row_stride(row_stride),
base_rowid(base_rowid), base_rowid(base_rowid),
n_rows(n_rows) ,gidx_iter(gidx_iter), n_rows(n_rows),
gidx_iter(gidx_iter),
feature_types{feature_types} { feature_types{feature_types} {
if (device == Context::kCpuId) { if (device.IsCPU()) {
gidx_fvalue_map = cuts.cut_values_.ConstHostSpan(); gidx_fvalue_map = cuts.cut_values_.ConstHostSpan();
feature_segments = cuts.cut_ptrs_.ConstHostSpan(); feature_segments = cuts.cut_ptrs_.ConstHostSpan();
min_fvalue = cuts.min_vals_.ConstHostSpan(); min_fvalue = cuts.min_vals_.ConstHostSpan();
@ -59,7 +60,7 @@ struct EllpackDeviceAccessor {
} }
// Get a matrix element, uses binary search for look up Return NaN if missing // Get a matrix element, uses binary search for look up Return NaN if missing
// Given a row index and a feature index, returns the corresponding cut value // Given a row index and a feature index, returns the corresponding cut value
__device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const { [[nodiscard]] __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
ridx -= base_rowid; ridx -= base_rowid;
auto row_begin = row_stride * ridx; auto row_begin = row_stride * ridx;
auto row_end = row_begin + row_stride; auto row_end = row_begin + row_stride;
@ -77,7 +78,7 @@ struct EllpackDeviceAccessor {
} }
template <bool is_cat> template <bool is_cat>
__device__ uint32_t SearchBin(float value, size_t column_id) const { [[nodiscard]] __device__ uint32_t SearchBin(float value, size_t column_id) const {
auto beg = feature_segments[column_id]; auto beg = feature_segments[column_id];
auto end = feature_segments[column_id + 1]; auto end = feature_segments[column_id + 1];
uint32_t idx = 0; uint32_t idx = 0;
@ -99,7 +100,7 @@ struct EllpackDeviceAccessor {
return idx; return idx;
} }
__device__ bst_float GetFvalue(size_t ridx, size_t fidx) const { [[nodiscard]] __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const {
auto gidx = GetBinIndex(ridx, fidx); auto gidx = GetBinIndex(ridx, fidx);
if (gidx == -1) { if (gidx == -1) {
return nan(""); return nan("");
@ -108,18 +109,18 @@ struct EllpackDeviceAccessor {
} }
// Check if the row id is within range of the current batch. // Check if the row id is within range of the current batch.
__device__ bool IsInRange(size_t row_id) const { [[nodiscard]] __device__ bool IsInRange(size_t row_id) const {
return row_id >= base_rowid && row_id < base_rowid + n_rows; return row_id >= base_rowid && row_id < base_rowid + n_rows;
} }
/*! \brief Return the total number of symbols (total number of bins plus 1 for /*! \brief Return the total number of symbols (total number of bins plus 1 for
* not found). */ * not found). */
XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; } [[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); } [[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); }
XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); } [[nodiscard]] XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); } [[nodiscard]] XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
}; };
@ -141,14 +142,13 @@ class EllpackPageImpl {
* This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
* and the given number of rows. * and the given number of rows.
*/ */
EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense, EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense, size_t row_stride,
size_t row_stride, size_t n_rows); size_t n_rows);
/*! /*!
* \brief Constructor used for external memory. * \brief Constructor used for external memory.
*/ */
EllpackPageImpl(int device, common::HistogramCuts cuts, EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, const SparsePage& page,
const SparsePage &page, bool is_dense, size_t row_stride, bool is_dense, size_t row_stride, common::Span<FeatureType const> feature_types);
common::Span<FeatureType const> feature_types);
/*! /*!
* \brief Constructor from an existing DMatrix. * \brief Constructor from an existing DMatrix.
@ -159,7 +159,7 @@ class EllpackPageImpl {
explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm); explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);
template <typename AdapterBatch> template <typename AdapterBatch>
explicit EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense, explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
common::Span<size_t> row_counts_span, common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride, common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, common::HistogramCuts const& cuts); size_t n_rows, common::HistogramCuts const& cuts);
@ -176,7 +176,7 @@ class EllpackPageImpl {
* @param offset The number of elements to skip before copying. * @param offset The number of elements to skip before copying.
* @returns The number of elements copied. * @returns The number of elements copied.
*/ */
size_t Copy(int device, EllpackPageImpl const *page, size_t offset); size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset);
/*! \brief Compact the given ELLPACK page into the current page. /*! \brief Compact the given ELLPACK page into the current page.
* *
@ -184,11 +184,10 @@ class EllpackPageImpl {
* @param page The ELLPACK page to compact from. * @param page The ELLPACK page to compact from.
* @param row_indexes Row indexes for the compacted page. * @param row_indexes Row indexes for the compacted page.
*/ */
void Compact(int device, EllpackPageImpl const* page, common::Span<size_t> row_indexes); void Compact(DeviceOrd device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);
/*! \return Number of instances in the page. */ /*! \return Number of instances in the page. */
size_t Size() const; [[nodiscard]] size_t Size() const;
/*! \brief Set the base row id for this page. */ /*! \brief Set the base row id for this page. */
void SetBaseRowId(std::size_t row_id) { void SetBaseRowId(std::size_t row_id) {
@ -204,12 +203,12 @@ class EllpackPageImpl {
/*! \brief Return the total number of symbols (total number of bins plus 1 for /*! \brief Return the total number of symbols (total number of bins plus 1 for
* not found). */ * not found). */
size_t NumSymbols() const { return cuts_.TotalBins() + 1; } [[nodiscard]] std::size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
EllpackDeviceAccessor [[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
GetDeviceAccessor(int device, DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
common::Span<FeatureType const> feature_types = {}) const; [[nodiscard]] EllpackDeviceAccessor GetHostAccessor(
EllpackDeviceAccessor GetHostAccessor(common::Span<FeatureType const> feature_types = {}) const; common::Span<FeatureType const> feature_types = {}) const;
private: private:
/*! /*!
@ -218,13 +217,13 @@ class EllpackPageImpl {
* @param device The GPU device to use. * @param device The GPU device to use.
* @param row_batch The CSR page. * @param row_batch The CSR page.
*/ */
void CreateHistIndices(int device, void CreateHistIndices(DeviceOrd device,
const SparsePage& row_batch, const SparsePage& row_batch,
common::Span<FeatureType const> feature_types); common::Span<FeatureType const> feature_types);
/*! /*!
* \brief Initialize the buffer to store compressed features. * \brief Initialize the buffer to store compressed features.
*/ */
void InitCompressedData(int device); void InitCompressedData(DeviceOrd device);
public: public:
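Accessor construction follows the same scheme: the device accessor is requested with the context's typed device, while the host accessor keeps its signature and is now built internally from DeviceOrd::CPU() rather than kCpuId. A short sketch, assuming an EllpackPageImpl* page, a Context const* ctx, and a feature_types span (illustrative names):

auto d_acc = page->GetDeviceAccessor(ctx->Device(), feature_types);  // was an int ordinal
auto h_acc = page->GetHostAccessor(feature_types);                   // unchanged call site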
@ -10,7 +10,7 @@
namespace xgboost::data { namespace xgboost::data {
void EllpackPageSource::Fetch() { void EllpackPageSource::Fetch() {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
if (!this->ReadCache()) { if (!this->ReadCache()) {
if (count_ != 0 && !sync_) { if (count_ != 0 && !sync_) {
// source is initialized to be the 0th page during construction, so when count_ is 0 // source is initialized to be the 0th page during construction, so when count_ is 0
@ -23,14 +23,14 @@ class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
BatchParam param_; BatchParam param_;
common::Span<FeatureType const> feature_types_; common::Span<FeatureType const> feature_types_;
std::unique_ptr<common::HistogramCuts> cuts_; std::unique_ptr<common::HistogramCuts> cuts_;
std::int32_t device_; DeviceOrd device_;
public: public:
EllpackPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches, EllpackPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches,
std::shared_ptr<Cache> cache, BatchParam param, std::shared_ptr<Cache> cache, BatchParam param,
std::unique_ptr<common::HistogramCuts> cuts, bool is_dense, size_t row_stride, std::unique_ptr<common::HistogramCuts> cuts, bool is_dense, size_t row_stride,
common::Span<FeatureType const> feature_types, common::Span<FeatureType const> feature_types,
std::shared_ptr<SparsePageSource> source, std::int32_t device) std::shared_ptr<SparsePageSource> source, DeviceOrd device)
: PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache, false), : PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache, false),
is_dense_{is_dense}, is_dense_{is_dense},
row_stride_{row_stride}, row_stride_{row_stride},
@ -36,8 +36,7 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
auto pctx = MakeProxy(proxy_)->Ctx(); auto pctx = MakeProxy(proxy_)->Ctx();
Context ctx; Context ctx;
ctx.UpdateAllowUnknown( ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
// hardcoded parameter. // hardcoded parameter.
BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()}; BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};
@ -139,7 +138,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
return HostAdapterDispatch(proxy, [&](auto const& value) { return HostAdapterDispatch(proxy, [&](auto const& value) {
size_t n_threads = ctx->Threads(); size_t n_threads = ctx->Threads();
size_t n_features = column_sizes.size(); size_t n_features = column_sizes.size();
linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, Context::kCpuId); linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, DeviceOrd::CPU());
column_sizes_tloc.Data()->Fill(0ul); column_sizes_tloc.Data()->Fill(0ul);
auto view = column_sizes_tloc.HostView(); auto view = column_sizes_tloc.HostView();
common::ParallelFor(value.Size(), n_threads, common::Sched::Static(256), [&](auto i) { common::ParallelFor(value.Size(), n_threads, common::Sched::Static(256), [&](auto i) {
@ -47,9 +47,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
int32_t current_device; int32_t current_device;
dh::safe_cuda(cudaGetDevice(&current_device)); dh::safe_cuda(cudaGetDevice(&current_device));
auto get_device = [&]() -> int32_t { auto get_device = [&]() {
std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id; auto d = (ctx->IsCPU()) ? DeviceOrd::CUDA(current_device) : ctx->Device();
CHECK_NE(d, Context::kCpuId); CHECK(!d.IsCPU());
return d; return d;
}; };
@ -59,9 +59,8 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
common::HistogramCuts cuts; common::HistogramCuts cuts;
do { do {
// We use do while here as the first batch is fetched in ctor // We use do while here as the first batch is fetched in ctor
// ctx_.gpu_id = proxy->DeviceIdx(); CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs()); dh::safe_cuda(cudaSetDevice(get_device().ordinal));
dh::safe_cuda(cudaSetDevice(get_device()));
if (cols == 0) { if (cols == 0) {
cols = num_cols(); cols = num_cols();
collective::Allreduce<collective::Operation::kMax>(&cols, 1); collective::Allreduce<collective::Operation::kMax>(&cols, 1);
@ -93,7 +92,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
auto n_features = cols; auto n_features = cols;
CHECK_GE(n_features, 1) << "Data must has at least 1 column."; CHECK_GE(n_features, 1) << "Data must has at least 1 column.";
dh::safe_cuda(cudaSetDevice(get_device())); dh::safe_cuda(cudaSetDevice(get_device().ordinal));
if (!ref) { if (!ref) {
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
common::SketchContainer final_sketch( common::SketchContainer final_sketch(
@ -132,7 +131,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
size_t n_batches_for_verification = 0; size_t n_batches_for_verification = 0;
while (iter.Next()) { while (iter.Next()) {
init_page(); init_page();
dh::safe_cuda(cudaSetDevice(get_device())); dh::safe_cuda(cudaSetDevice(get_device().ordinal));
auto rows = num_rows(); auto rows = num_rows();
dh::device_vector<size_t> row_counts(rows + 1, 0); dh::device_vector<size_t> row_counts(rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size()); common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
@ -184,18 +183,18 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,
if (!ellpack_) { if (!ellpack_) {
ellpack_.reset(new EllpackPage()); ellpack_.reset(new EllpackPage());
if (ctx->IsCUDA()) { if (ctx->IsCUDA()) {
this->Info().feature_types.SetDevice(ctx->gpu_id); this->Info().feature_types.SetDevice(ctx->Device());
*ellpack_->Impl() = *ellpack_->Impl() =
EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
} else if (fmat_ctx_.IsCUDA()) { } else if (fmat_ctx_.IsCUDA()) {
this->Info().feature_types.SetDevice(fmat_ctx_.gpu_id); this->Info().feature_types.SetDevice(fmat_ctx_.Device());
*ellpack_->Impl() = *ellpack_->Impl() =
EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
} else { } else {
// Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM // Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM
// for cut reference. // for cut reference.
auto cuda_ctx = ctx->MakeCUDA(); auto cuda_ctx = ctx->MakeCUDA();
this->Info().feature_types.SetDevice(cuda_ctx.gpu_id); this->Info().feature_types.SetDevice(cuda_ctx.Device());
*ellpack_->Impl() = *ellpack_->Impl() =
EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
} }
@ -11,18 +11,18 @@ void DMatrixProxy::SetArrayData(StringView interface_str) {
this->batch_ = adapter; this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns(); this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows(); this->Info().num_row_ = adapter->NumRows();
this->ctx_.gpu_id = Context::kCpuId; this->ctx_.Init(Args{{"device", "cpu"}});
} }
void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, char const *c_values,
char const *c_values, bst_feature_t n_features, bool on_host) { bst_feature_t n_features, bool on_host) {
CHECK(on_host) << "Not implemented on device."; CHECK(on_host) << "Not implemented on device.";
std::shared_ptr<CSRArrayAdapter> adapter{new CSRArrayAdapter( std::shared_ptr<CSRArrayAdapter> adapter{new CSRArrayAdapter(
StringView{c_indptr}, StringView{c_indices}, StringView{c_values}, n_features)}; StringView{c_indptr}, StringView{c_indices}, StringView{c_values}, n_features)};
this->batch_ = adapter; this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns(); this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows(); this->Info().num_row_ = adapter->NumRows();
this->ctx_.gpu_id = Context::kCpuId; this->ctx_.Init(Args{{"device", "cpu"}});
} }
namespace cuda_impl { namespace cuda_impl {
@ -11,13 +11,13 @@ void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
this->batch_ = adapter; this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns(); this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows(); this->Info().num_row_ = adapter->NumRows();
if (adapter->DeviceIdx() < 0) { if (adapter->Device().IsCPU()) {
// empty data // empty data
CHECK_EQ(this->Info().num_row_, 0); CHECK_EQ(this->Info().num_row_, 0);
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return; return;
} }
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx()); ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
} }
void DMatrixProxy::FromCudaArray(StringView interface_str) { void DMatrixProxy::FromCudaArray(StringView interface_str) {
@ -25,13 +25,13 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) {
this->batch_ = adapter; this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns(); this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows(); this->Info().num_row_ = adapter->NumRows();
if (adapter->DeviceIdx() < 0) { if (adapter->Device().IsCPU()) {
// empty data // empty data
CHECK_EQ(this->Info().num_row_, 0); CHECK_EQ(this->Info().num_row_, 0);
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return; return;
} }
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx()); ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
} }
namespace cuda_impl { namespace cuda_impl {
@ -46,7 +46,7 @@ class DMatrixProxy : public DMatrix {
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA)
public: public:
int DeviceIdx() const { return ctx_.gpu_id; } DeviceOrd Device() const { return ctx_.Device(); }
void SetCUDAArray(char const* c_interface) { void SetCUDAArray(char const* c_interface) {
common::AssertGPUSupport(); common::AssertGPUSupport();
@ -253,7 +253,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
} }
if (batch.BaseMargin() != nullptr) { if (batch.BaseMargin() != nullptr) {
info_.base_margin_ = decltype(info_.base_margin_){ info_.base_margin_ = decltype(info_.base_margin_){
batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, Context::kCpuId}; batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, DeviceOrd::CPU()};
} }
if (batch.Qid() != nullptr) { if (batch.Qid() != nullptr) {
qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size()); qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size());
@ -10,9 +10,7 @@
#include "xgboost/context.h" // for Context #include "xgboost/context.h" // for Context
#include "xgboost/data.h" #include "xgboost/data.h"
namespace xgboost { namespace xgboost::data {
namespace data {
// Does not currently support metainfo as no on-device data source contains this // Does not currently support metainfo as no on-device data source contains this
// Current implementation assumes a single batch. More batches can // Current implementation assumes a single batch. More batches can
// be supported in future. Does not currently support inferring row/column size // be supported in future. Does not currently support inferring row/column size
@ -21,13 +19,14 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
DataSplitMode data_split_mode) { DataSplitMode data_split_mode) {
CHECK(data_split_mode != DataSplitMode::kCol) CHECK(data_split_mode != DataSplitMode::kCol)
<< "Column-wise data split is currently not supported on the GPU."; << "Column-wise data split is currently not supported on the GPU.";
auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice() auto device = (adapter->Device().IsCPU() || adapter->NumRows() == 0)
: adapter->DeviceIdx(); ? DeviceOrd::CUDA(dh::CurrentDevice())
CHECK_GE(device, 0); : adapter->Device();
dh::safe_cuda(cudaSetDevice(device)); CHECK(device.IsCUDA());
dh::safe_cuda(cudaSetDevice(device.ordinal));
Context ctx; Context ctx;
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}}); ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", device.Name()}});
CHECK(adapter->NumRows() != kAdapterUnknownSize); CHECK(adapter->NumRows() != kAdapterUnknownSize);
CHECK(adapter->NumColumns() != kAdapterUnknownSize); CHECK(adapter->NumColumns() != kAdapterUnknownSize);
@ -52,5 +51,4 @@ template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
int nthread, DataSplitMode data_split_mode); int nthread, DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing, template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing,
int nthread, DataSplitMode data_split_mode); int nthread, DataSplitMode data_split_mode);
} // namespace data } // namespace xgboost::data
} // namespace xgboost
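The fallback above (and the similar one in the sparse-page path further down) boils down to: prefer the adapter's device, and only when the adapter holds no device data use the current CUDA device. A standalone restatement, assuming the repository's device_helpers.cuh for dh::CurrentDevice(); the function name is illustrative:

xgboost::DeviceOrd ChooseBuildDevice(xgboost::DeviceOrd adapter_device, std::size_t n_rows) {
  if (adapter_device.IsCPU() || n_rows == 0) {
    return xgboost::DeviceOrd::CUDA(dh::CurrentDevice());  // adapter carries no GPU data
  }
  return adapter_device;
}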
@ -40,9 +40,9 @@ void CopyDataToDMatrix(AdapterBatchT batch, common::Span<Entry> data,
} }
template <typename AdapterBatchT> template <typename AdapterBatchT>
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset, void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset, DeviceOrd device,
int device_idx, float missing) { float missing) {
dh::safe_cuda(cudaSetDevice(device_idx)); dh::safe_cuda(cudaSetDevice(device.ordinal));
IsValidFunctor is_valid(missing); IsValidFunctor is_valid(missing);
// Count elements per row // Count elements per row
dh::LaunchN(batch.Size(), [=] __device__(size_t idx) { dh::LaunchN(batch.Size(), [=] __device__(size_t idx) {
@ -55,14 +55,13 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
}); });
dh::XGBCachingDeviceAllocator<char> alloc; dh::XGBCachingDeviceAllocator<char> alloc;
thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()), thrust::device_pointer_cast(offset.data() + offset.size()),
thrust::device_pointer_cast(offset.data() + offset.size()), thrust::device_pointer_cast(offset.data()));
thrust::device_pointer_cast(offset.data()));
} }
template <typename AdapterBatchT> template <typename AdapterBatchT>
size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing, size_t CopyToSparsePage(AdapterBatchT const& batch, DeviceOrd device, float missing,
SparsePage* page) { SparsePage* page) {
bool valid = NoInfInData(batch, IsValidFunctor{missing}); bool valid = NoInfInData(batch, IsValidFunctor{missing});
CHECK(valid) << error::InfInData(); CHECK(valid) << error::InfInData();
@ -45,7 +45,8 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
ellpack_page_source_.reset(); // make sure resource is released before making new ones. ellpack_page_source_.reset(); // make sure resource is released before making new ones.
ellpack_page_source_ = std::make_shared<EllpackPageSource>( ellpack_page_source_ = std::make_shared<EllpackPageSource>(
this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id), this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id); param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_,
ctx->Device());
} else { } else {
CHECK(sparse_page_source_); CHECK(sparse_page_source_);
ellpack_page_source_->Reset(); ellpack_page_source_->Reset();
@ -19,11 +19,11 @@ std::size_t NFeaturesDevice(DMatrixProxy *proxy) {
} // namespace detail } // namespace detail
void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) { void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
auto device = proxy->DeviceIdx(); auto device = proxy->Device();
if (device < 0) { if (device.IsCPU()) {
device = dh::CurrentDevice(); device = DeviceOrd::CUDA(dh::CurrentDevice());
} }
CHECK_GE(device, 0); CHECK(device.IsCUDA());
cuda_impl::Dispatch(proxy, cuda_impl::Dispatch(proxy,
[&](auto const &value) { CopyToSparsePage(value, device, missing, page); }); [&](auto const &value) { CopyToSparsePage(value, device, missing, page); });
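
DevicePush above replaces the old `device < 0` test with DeviceOrd::IsCPU() and falls back to the current CUDA ordinal before dispatching. A sketch of that fallback under the same interface (ResolveDevice and its ordinal argument are illustrative; the commit itself queries dh::CurrentDevice() directly):

    #include <cstdint>
    #include "xgboost/context.h"

    xgboost::DeviceOrd ResolveDevice(xgboost::DeviceOrd device, std::int32_t current_cuda_ordinal) {
      if (device.IsCPU()) {
        // Data handed to the GPU path without an explicit device adopts the
        // currently active CUDA device, mirroring DevicePush above.
        device = xgboost::DeviceOrd::CUDA(current_cuda_ordinal);
      }
      return device;
    }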

View File

@ -212,7 +212,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
bst_target_t const n_groups = model_.learner_model_param->OutputLength(); bst_target_t const n_groups = model_.learner_model_param->OutputLength();
monitor_.Start("BoostNewTrees"); monitor_.Start("BoostNewTrees");
predt->predictions.SetDevice(ctx_->Ordinal()); predt->predictions.SetDevice(ctx_->Device());
auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_, auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_,
model_.learner_model_param->OutputLength()); model_.learner_model_param->OutputLength());
CHECK_NE(n_groups, 0); CHECK_NE(n_groups, 0);
@ -248,7 +248,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
} else { } else {
CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs"; CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs";
linalg::Matrix<GradientPair> tmp{{in_gpair->Shape(0), static_cast<std::size_t>(1ul)}, linalg::Matrix<GradientPair> tmp{{in_gpair->Shape(0), static_cast<std::size_t>(1ul)},
ctx_->Ordinal()}; ctx_->Device()};
bool update_predict = true; bool update_predict = true;
for (bst_target_t gid = 0; gid < n_groups; ++gid) { for (bst_target_t gid = 0; gid < n_groups; ++gid) {
node_position.clear(); node_position.clear();
@ -736,7 +736,7 @@ class Dart : public GBTree {
PredictionCacheEntry predts; // temporary storage for prediction PredictionCacheEntry predts; // temporary storage for prediction
if (ctx_->IsCUDA()) { if (ctx_->IsCUDA()) {
predts.predictions.SetDevice(ctx_->gpu_id); predts.predictions.SetDevice(ctx_->Device());
} }
predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0); predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
// multi-target is not yet supported. // multi-target is not yet supported.
@ -761,8 +761,8 @@ class Dart : public GBTree {
CHECK_EQ(p_out_preds->predictions.Size(), predts.predictions.Size()); CHECK_EQ(p_out_preds->predictions.Size(), predts.predictions.Size());
size_t n_rows = p_fmat->Info().num_row_; size_t n_rows = p_fmat->Info().num_row_;
if (predts.predictions.DeviceIdx() != Context::kCpuId) { if (predts.predictions.Device().IsCUDA()) {
p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx()); p_out_preds->predictions.SetDevice(predts.predictions.Device());
GPUDartPredictInc(p_out_preds->predictions.DeviceSpan(), GPUDartPredictInc(p_out_preds->predictions.DeviceSpan(),
predts.predictions.DeviceSpan(), w, n_rows, n_groups, predts.predictions.DeviceSpan(), w, n_rows, n_groups,
group); group);
@ -801,8 +801,8 @@ class Dart : public GBTree {
StringView msg{"Unsupported data type for inplace predict."}; StringView msg{"Unsupported data type for inplace predict."};
PredictionCacheEntry predts; PredictionCacheEntry predts;
if (ctx_->gpu_id != Context::kCpuId) { if (ctx_->IsCUDA()) {
predts.predictions.SetDevice(ctx_->gpu_id); predts.predictions.SetDevice(ctx_->Device());
} }
predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0); predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
@ -838,8 +838,8 @@ class Dart : public GBTree {
CHECK_EQ(predts.predictions.Size(), p_out_preds->predictions.Size()); CHECK_EQ(predts.predictions.Size(), p_out_preds->predictions.Size());
size_t n_rows = p_fmat->Info().num_row_; size_t n_rows = p_fmat->Info().num_row_;
if (predts.predictions.DeviceIdx() != Context::kCpuId) { if (predts.predictions.Device().IsCUDA()) {
p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx()); p_out_preds->predictions.SetDevice(predts.predictions.Device());
auto base_score = model_.learner_model_param->BaseScore(predts.predictions.Device()); auto base_score = model_.learner_model_param->BaseScore(predts.predictions.Device());
GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(), GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(),
predts.predictions.DeviceSpan(), w, n_rows, base_score, n_groups, predts.predictions.DeviceSpan(), w, n_rows, base_score, n_groups,

View File

@ -305,10 +305,10 @@ linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(Context const* c
void LearnerModelParam::Copy(LearnerModelParam const& that) { void LearnerModelParam::Copy(LearnerModelParam const& that) {
base_score_.Reshape(that.base_score_.Shape()); base_score_.Reshape(that.base_score_.Shape());
base_score_.Data()->SetDevice(that.base_score_.DeviceIdx()); base_score_.Data()->SetDevice(that.base_score_.Device());
base_score_.Data()->Copy(*that.base_score_.Data()); base_score_.Data()->Copy(*that.base_score_.Data());
std::as_const(base_score_).HostView(); std::as_const(base_score_).HostView();
if (that.base_score_.DeviceIdx() != Context::kCpuId) { if (!that.base_score_.Device().IsCPU()) {
std::as_const(base_score_).View(that.base_score_.Device()); std::as_const(base_score_).View(that.base_score_.Device());
} }
CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead()); CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead());
@ -424,7 +424,7 @@ class LearnerConfiguration : public Learner {
if (mparam_.boost_from_average && !UsePtr(gbm_)->ModelFitted()) { if (mparam_.boost_from_average && !UsePtr(gbm_)->ModelFitted()) {
if (p_fmat) { if (p_fmat) {
auto const& info = p_fmat->Info(); auto const& info = p_fmat->Info();
info.Validate(Ctx()->Ordinal()); info.Validate(Ctx()->Device());
// We estimate it from input data. // We estimate it from input data.
linalg::Tensor<float, 1> base_score; linalg::Tensor<float, 1> base_score;
InitEstimation(info, &base_score); InitEstimation(info, &base_score);
@ -446,7 +446,7 @@ class LearnerConfiguration : public Learner {
monitor_.Init("Learner"); monitor_.Init("Learner");
for (std::shared_ptr<DMatrix> const& d : cache) { for (std::shared_ptr<DMatrix> const& d : cache) {
if (d) { if (d) {
prediction_container_.Cache(d, Context::kCpuId); prediction_container_.Cache(d, DeviceOrd::CPU());
} }
} }
} }
@ -1046,7 +1046,7 @@ class LearnerIO : public LearnerConfiguration {
? std::numeric_limits<float>::quiet_NaN() ? std::numeric_limits<float>::quiet_NaN()
: obj_->ProbToMargin(mparam_.base_score)}, : obj_->ProbToMargin(mparam_.base_score)},
{1}, {1},
Context::kCpuId}, DeviceOrd::CPU()},
obj_->Task(), tparam_.multi_strategy); obj_->Task(), tparam_.multi_strategy);
if (attributes_.find("objective") != attributes_.cend()) { if (attributes_.find("objective") != attributes_.cend()) {
@ -1271,7 +1271,7 @@ class LearnerImpl : public LearnerIO {
this->ValidateDMatrix(train.get(), true); this->ValidateDMatrix(train.get(), true);
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id); auto& predt = prediction_container_.Cache(train, ctx_.Device());
monitor_.Start("PredictRaw"); monitor_.Start("PredictRaw");
this->PredictRaw(train.get(), &predt, true, 0, 0); this->PredictRaw(train.get(), &predt, true, 0, 0);
@ -1301,7 +1301,7 @@ class LearnerImpl : public LearnerIO {
CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1)) CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1))
<< "The number of columns in gradient should be equal to the number of targets/classes in " << "The number of columns in gradient should be equal to the number of targets/classes in "
"the model."; "the model.";
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id); auto& predt = prediction_container_.Cache(train, ctx_.Device());
gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get()); gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get());
monitor_.Stop("BoostOneIter"); monitor_.Stop("BoostOneIter");
} }
@ -1327,11 +1327,11 @@ class LearnerImpl : public LearnerIO {
for (size_t i = 0; i < data_sets.size(); ++i) { for (size_t i = 0; i < data_sets.size(); ++i) {
std::shared_ptr<DMatrix> m = data_sets[i]; std::shared_ptr<DMatrix> m = data_sets[i];
auto &predt = prediction_container_.Cache(m, ctx_.gpu_id); auto &predt = prediction_container_.Cache(m, ctx_.Device());
this->ValidateDMatrix(m.get(), false); this->ValidateDMatrix(m.get(), false);
this->PredictRaw(m.get(), &predt, false, 0, 0); this->PredictRaw(m.get(), &predt, false, 0, 0);
auto &out = output_predictions_.Cache(m, ctx_.gpu_id).predictions; auto &out = output_predictions_.Cache(m, ctx_.Device()).predictions;
out.Resize(predt.predictions.Size()); out.Resize(predt.predictions.Size());
out.Copy(predt.predictions); out.Copy(predt.predictions);
@ -1367,7 +1367,7 @@ class LearnerImpl : public LearnerIO {
} else if (pred_leaf) { } else if (pred_leaf) {
gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end); gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end);
} else { } else {
auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id); auto& prediction = prediction_container_.Cache(data, ctx_.Device());
this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end); this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end);
// Copy the prediction cache to output prediction. out_preds comes from C API // Copy the prediction cache to output prediction. out_preds comes from C API
out_preds->SetDevice(ctx_.Device()); out_preds->SetDevice(ctx_.Device());
@ -1447,7 +1447,7 @@ class LearnerImpl : public LearnerIO {
void ValidateDMatrix(DMatrix* p_fmat, bool is_training) const { void ValidateDMatrix(DMatrix* p_fmat, bool is_training) const {
MetaInfo const& info = p_fmat->Info(); MetaInfo const& info = p_fmat->Info();
info.Validate(ctx_.gpu_id); info.Validate(ctx_.Device());
if (is_training) { if (is_training) {
CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_) CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_)

View File

@ -48,7 +48,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
} }
void LazyInitDevice(DMatrix *p_fmat, const LearnerModelParam &model_param) { void LazyInitDevice(DMatrix *p_fmat, const LearnerModelParam &model_param) {
if (ctx_->gpu_id < 0) return; if (ctx_->IsCPU()) return;
num_row_ = static_cast<size_t>(p_fmat->Info().num_row_); num_row_ = static_cast<size_t>(p_fmat->Info().num_row_);
@ -60,7 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
return; return;
} }
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
// The begin and end indices for the section of each column associated with // The begin and end indices for the section of each column associated with
// this device // this device
std::vector<std::pair<bst_uint, bst_uint>> column_segments; std::vector<std::pair<bst_uint, bst_uint>> column_segments;
@ -133,7 +133,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
++group_idx) { ++group_idx) {
// Get gradient // Get gradient
auto grad = GradientPair(0, 0); auto grad = GradientPair(0, 0);
if (ctx_->gpu_id >= 0) { if (ctx_->IsCUDA()) {
grad = GetBiasGradient(group_idx, model->learner_model_param->num_output_group); grad = GetBiasGradient(group_idx, model->learner_model_param->num_output_group);
} }
auto dbias = static_cast<float>( auto dbias = static_cast<float>(
@ -142,7 +142,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
model->Bias()[group_idx] += dbias; model->Bias()[group_idx] += dbias;
// Update residual // Update residual
if (ctx_->gpu_id >= 0) { if (ctx_->IsCUDA()) {
UpdateBiasResidual(dbias, group_idx, model->learner_model_param->num_output_group); UpdateBiasResidual(dbias, group_idx, model->learner_model_param->num_output_group);
} }
} }
@ -153,7 +153,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
bst_float &w = (*model)[fidx][group_idx]; bst_float &w = (*model)[fidx][group_idx];
// Get gradient // Get gradient
auto grad = GradientPair(0, 0); auto grad = GradientPair(0, 0);
if (ctx_->gpu_id >= 0) { if (ctx_->IsCUDA()) {
grad = GetGradient(group_idx, model->learner_model_param->num_output_group, fidx); grad = GetGradient(group_idx, model->learner_model_param->num_output_group, fidx);
} }
auto dw = static_cast<float>(tparam_.learning_rate * auto dw = static_cast<float>(tparam_.learning_rate *
@ -162,14 +162,14 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
tparam_.reg_lambda_denorm)); tparam_.reg_lambda_denorm));
w += dw; w += dw;
if (ctx_->gpu_id >= 0) { if (ctx_->IsCUDA()) {
UpdateResidual(dw, group_idx, model->learner_model_param->num_output_group, fidx); UpdateResidual(dw, group_idx, model->learner_model_param->num_output_group, fidx);
} }
} }
// This needs to be public because of the __device__ lambda. // This needs to be public because of the __device__ lambda.
GradientPair GetBiasGradient(int group_idx, int num_group) { GradientPair GetBiasGradient(int group_idx, int num_group) {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
auto counting = thrust::make_counting_iterator(0ull); auto counting = thrust::make_counting_iterator(0ull);
auto f = [=] __device__(size_t idx) { auto f = [=] __device__(size_t idx) {
return idx * num_group + group_idx; return idx * num_group + group_idx;
@ -193,7 +193,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
// This needs to be public because of the __device__ lambda. // This needs to be public because of the __device__ lambda.
GradientPair GetGradient(int group_idx, int num_group, int fidx) { GradientPair GetGradient(int group_idx, int num_group, int fidx) {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
common::Span<xgboost::Entry> d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]); common::Span<xgboost::Entry> d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]);
size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx]; size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];
common::Span<GradientPair> d_gpair = dh::ToSpan(gpair_); common::Span<GradientPair> d_gpair = dh::ToSpan(gpair_);

View File

@ -23,8 +23,7 @@
#include "xgboost/linalg.h" #include "xgboost/linalg.h"
#include "xgboost/metric.h" #include "xgboost/metric.h"
namespace xgboost { namespace xgboost::metric {
namespace metric {
// tag the this file, used by force static link later. // tag the this file, used by force static link later.
DMLC_REGISTRY_FILE_TAG(auc); DMLC_REGISTRY_FILE_TAG(auc);
/** /**
@ -257,10 +256,10 @@ template <typename Curve>
class EvalAUC : public MetricNoCache { class EvalAUC : public MetricNoCache {
double Eval(const HostDeviceVector<bst_float> &preds, const MetaInfo &info) override { double Eval(const HostDeviceVector<bst_float> &preds, const MetaInfo &info) override {
double auc {0}; double auc {0};
if (ctx_->gpu_id != Context::kCpuId) { if (ctx_->Device().IsCUDA()) {
preds.SetDevice(ctx_->gpu_id); preds.SetDevice(ctx_->Device());
info.labels.SetDevice(ctx_->gpu_id); info.labels.SetDevice(ctx_->Device());
info.weights_.SetDevice(ctx_->gpu_id); info.weights_.SetDevice(ctx_->Device());
} }
// We use the global size to handle empty dataset. // We use the global size to handle empty dataset.
std::array<size_t, 2> meta{info.labels.Size(), preds.Size()}; std::array<size_t, 2> meta{info.labels.Size(), preds.Size()};
@ -329,7 +328,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
double auc{0}; double auc{0};
uint32_t valid_groups = 0; uint32_t valid_groups = 0;
auto n_threads = ctx_->Threads(); auto n_threads = ctx_->Threads();
if (ctx_->gpu_id == Context::kCpuId) { if (ctx_->IsCPU()) {
std::tie(auc, valid_groups) = std::tie(auc, valid_groups) =
RankingAUC<true>(ctx_, predts.ConstHostVector(), info, n_threads); RankingAUC<true>(ctx_, predts.ConstHostVector(), info, n_threads);
} else { } else {
@ -344,7 +343,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
double auc{0}; double auc{0};
auto n_threads = ctx_->Threads(); auto n_threads = ctx_->Threads();
CHECK_NE(n_classes, 0); CHECK_NE(n_classes, 0);
if (ctx_->gpu_id == Context::kCpuId) { if (ctx_->IsCPU()) {
auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC); auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC);
} else { } else {
auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes); auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes);
@ -355,7 +354,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
std::tuple<double, double, double> std::tuple<double, double, double>
EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) { EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
double fp, tp, auc; double fp, tp, auc;
if (ctx_->gpu_id == Context::kCpuId) { if (ctx_->IsCPU()) {
std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(), std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(),
info.labels.HostView().Slice(linalg::All(), 0), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()}); common::OptionalWeights{info.weights_.ConstHostSpan()});
@ -367,7 +366,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
} }
public: public:
char const* Name() const override { [[nodiscard]] char const* Name() const override {
return "auc"; return "auc";
} }
}; };
@ -405,7 +404,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
std::tuple<double, double, double> std::tuple<double, double, double>
EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) { EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
double pr, re, auc; double pr, re, auc;
if (ctx_->gpu_id == Context::kCpuId) { if (ctx_->IsCPU()) {
std::tie(pr, re, auc) = std::tie(pr, re, auc) =
BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0), BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()}); common::OptionalWeights{info.weights_.ConstHostSpan()});
@ -418,7 +417,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
double EvalMultiClass(HostDeviceVector<float> const &predts, MetaInfo const &info, double EvalMultiClass(HostDeviceVector<float> const &predts, MetaInfo const &info,
size_t n_classes) { size_t n_classes) {
if (ctx_->gpu_id == Context::kCpuId) { if (ctx_->IsCPU()) {
auto n_threads = this->ctx_->Threads(); auto n_threads = this->ctx_->Threads();
return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC); return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC);
} else { } else {
@ -431,7 +430,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
double auc{0}; double auc{0};
uint32_t valid_groups = 0; uint32_t valid_groups = 0;
auto n_threads = ctx_->Threads(); auto n_threads = ctx_->Threads();
if (ctx_->gpu_id == Context::kCpuId) { if (ctx_->IsCPU()) {
auto labels = info.labels.Data()->ConstHostSpan(); auto labels = info.labels.Data()->ConstHostSpan();
if (std::any_of(labels.cbegin(), labels.cend(), PRAUCLabelInvalid{})) { if (std::any_of(labels.cbegin(), labels.cend(), PRAUCLabelInvalid{})) {
InvalidLabels(); InvalidLabels();
@ -446,7 +445,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
} }
public: public:
const char *Name() const override { return "aucpr"; } [[nodiscard]] const char *Name() const override { return "aucpr"; }
}; };
XGBOOST_REGISTER_METRIC(AUCPR, "aucpr") XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")
@ -473,5 +472,4 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *, common::Span<f
return {}; return {};
} }
#endif #endif
} // namespace metric } // namespace xgboost::metric
} // namespace xgboost

View File

@ -824,7 +824,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
common::Span<float const> predts, common::Span<float const> predts,
MetaInfo const &info, MetaInfo const &info,
std::shared_ptr<DeviceAUCCache> *p_cache) { std::shared_ptr<DeviceAUCCache> *p_cache) {
dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
if (predts.empty()) { if (predts.empty()) {
return std::make_pair(0.0, static_cast<uint32_t>(0)); return std::make_pair(0.0, static_cast<uint32_t>(0));
} }

View File

@ -127,24 +127,24 @@ class MultiClassMetricsReduction {
#endif // XGBOOST_USE_CUDA #endif // XGBOOST_USE_CUDA
PackedReduceResult Reduce(const Context& tparam, int device, size_t n_class, PackedReduceResult Reduce(const Context& ctx, DeviceOrd device, size_t n_class,
const HostDeviceVector<bst_float>& weights, const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels, const HostDeviceVector<bst_float>& labels,
const HostDeviceVector<bst_float>& preds) { const HostDeviceVector<bst_float>& preds) {
PackedReduceResult result; PackedReduceResult result;
if (device < 0) { if (device.IsCPU()) {
result = result =
CpuReduceMetrics(weights, labels, preds, n_class, tparam.Threads()); CpuReduceMetrics(weights, labels, preds, n_class, ctx.Threads());
} }
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
else { // NOLINT else { // NOLINT
device_ = tparam.gpu_id; device_ = ctx.Device();
preds.SetDevice(device_); preds.SetDevice(device_);
labels.SetDevice(device_); labels.SetDevice(device_);
weights.SetDevice(device_); weights.SetDevice(device_);
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
result = DeviceReduceMetrics(weights, labels, preds, n_class); result = DeviceReduceMetrics(weights, labels, preds, n_class);
} }
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA)
@ -154,7 +154,7 @@ class MultiClassMetricsReduction {
private: private:
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
dh::PinnedMemory label_error_; dh::PinnedMemory label_error_;
int device_{-1}; DeviceOrd device_{DeviceOrd::CPU()};
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA)
}; };
@ -176,7 +176,7 @@ struct EvalMClassBase : public MetricNoCache {
CHECK_GE(nclass, 1U) CHECK_GE(nclass, 1U)
<< "mlogloss and merror are only used for multi-class classification," << "mlogloss and merror are only used for multi-class classification,"
<< " use logloss for binary classification"; << " use logloss for binary classification";
int device = ctx_->gpu_id; auto device = ctx_->Device();
auto result = auto result =
reducer_.Reduce(*ctx_, device, nclass, info.weights_, *info.labels.Data(), preds); reducer_.Reduce(*ctx_, device, nclass, info.weights_, *info.labels.Data(), preds);
dat[0] = result.Residue(); dat[0] = result.Residue();
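
The reduction helper above also changes its stored device from a raw `int device_{-1}` to a DeviceOrd defaulting to CPU, so the -1 sentinel disappears. A one-line sketch of that member migration (ReductionState is an illustrative name):

    #include "xgboost/context.h"

    struct ReductionState {
      // Was: int device_{-1};  the typed default makes the CPU case explicit.
      xgboost::DeviceOrd device_{xgboost::DeviceOrd::CPU()};
    };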

View File

@ -35,7 +35,7 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info,
auto d_gptr = p_cache->DataGroupPtr(ctx); auto d_gptr = p_cache->DataGroupPtr(ctx);
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id); predt.SetDevice(ctx->Device());
auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan()); auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
auto topk = p_cache->Param().TopK(); auto topk = p_cache->Param().TopK();
auto d_weight = common::MakeOptionalWeights(ctx, info.weights_); auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
@ -90,7 +90,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
CHECK_EQ(d_weight.weights.size(), p_cache->Groups()); CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
} }
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id); predt.SetDevice(ctx->Device());
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size()); auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());
auto d_group_ptr = p_cache->DataGroupPtr(ctx); auto d_group_ptr = p_cache->DataGroupPtr(ctx);

View File

@ -130,18 +130,18 @@ class ElementWiseSurvivalMetricsReduction {
const HostDeviceVector<bst_float>& preds) { const HostDeviceVector<bst_float>& preds) {
PackedReduceResult result; PackedReduceResult result;
if (ctx.gpu_id < 0) { if (ctx.IsCPU()) {
result = CpuReduceMetrics(weights, labels_lower_bound, labels_upper_bound, result = CpuReduceMetrics(weights, labels_lower_bound, labels_upper_bound,
preds, ctx.Threads()); preds, ctx.Threads());
} }
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
else { // NOLINT else { // NOLINT
preds.SetDevice(ctx.gpu_id); preds.SetDevice(ctx.Device());
labels_lower_bound.SetDevice(ctx.gpu_id); labels_lower_bound.SetDevice(ctx.Device());
labels_upper_bound.SetDevice(ctx.gpu_id); labels_upper_bound.SetDevice(ctx.Device());
weights.SetDevice(ctx.gpu_id); weights.SetDevice(ctx.Device());
dh::safe_cuda(cudaSetDevice(ctx.gpu_id)); dh::safe_cuda(cudaSetDevice(ctx.Ordinal()));
result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds); result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds);
} }
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA)

View File

@ -100,7 +100,7 @@ inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> cons
detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate, detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
predt, alpha, p_tree); predt, alpha, p_tree);
} else { } else {
position.SetDevice(ctx->gpu_id); position.SetDevice(ctx->Device());
detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate, detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate,
predt, alpha, p_tree); predt, alpha, p_tree);
} }

View File

@ -42,7 +42,7 @@ class AFTObj : public ObjFunction {
template <typename Distribution> template <typename Distribution>
void GetGradientImpl(const HostDeviceVector<bst_float>& preds, const MetaInfo& info, void GetGradientImpl(const HostDeviceVector<bst_float>& preds, const MetaInfo& info,
linalg::Matrix<GradientPair>* out_gpair, size_t ndata, int device, linalg::Matrix<GradientPair>* out_gpair, size_t ndata, DeviceOrd device,
bool is_null_weight, float aft_loss_distribution_scale) { bool is_null_weight, float aft_loss_distribution_scale) {
common::Transform<>::Init( common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx, [=] XGBOOST_DEVICE(size_t _idx,
@ -75,7 +75,7 @@ class AFTObj : public ObjFunction {
CHECK_EQ(info.labels_upper_bound_.Size(), ndata); CHECK_EQ(info.labels_upper_bound_.Size(), ndata);
out_gpair->SetDevice(ctx_->Device()); out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(ndata, 1); out_gpair->Reshape(ndata, 1);
const int device = ctx_->gpu_id; const auto device = ctx_->Device();
const float aft_loss_distribution_scale = param_.aft_loss_distribution_scale; const float aft_loss_distribution_scale = param_.aft_loss_distribution_scale;
const bool is_null_weight = info.weights_.Size() == 0; const bool is_null_weight = info.weights_.Size() == 0;
if (!is_null_weight) { if (!is_null_weight) {
@ -108,7 +108,7 @@ class AFTObj : public ObjFunction {
_preds[_idx] = exp(_preds[_idx]); _preds[_idx] = exp(_preds[_idx]);
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(), common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx()) io_preds->Device())
.Eval(io_preds); .Eval(io_preds);
} }

View File

@ -1,5 +1,5 @@
/*! /**
* Copyright 2018-2022 by XGBoost Contributors * Copyright 2018-2023, XGBoost Contributors
* \file hinge.cc * \file hinge.cc
* \brief Provides an implementation of the hinge loss function * \brief Provides an implementation of the hinge loss function
* \author Henry Gouk * \author Henry Gouk
@ -13,8 +13,7 @@
#include "../common/transform.h" #include "../common/transform.h"
#include "../common/common.h" #include "../common/common.h"
namespace xgboost { namespace xgboost::obj {
namespace obj {
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
DMLC_REGISTRY_FILE_TAG(hinge_obj_gpu); DMLC_REGISTRY_FILE_TAG(hinge_obj_gpu);
@ -63,7 +62,7 @@ class HingeObj : public ObjFunction {
_out_gpair[_idx] = GradientPair(g, h); _out_gpair[_idx] = GradientPair(g, h);
}, },
common::Range{0, static_cast<int64_t>(ndata)}, this->ctx_->Threads(), common::Range{0, static_cast<int64_t>(ndata)}, this->ctx_->Threads(),
ctx_->gpu_id).Eval( ctx_->Device()).Eval(
out_gpair->Data(), &preds, info.labels.Data(), &info.weights_); out_gpair->Data(), &preds, info.labels.Data(), &info.weights_);
} }
@ -73,11 +72,11 @@ class HingeObj : public ObjFunction {
_preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0; _preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0;
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size()), 1}, this->ctx_->Threads(), common::Range{0, static_cast<int64_t>(io_preds->Size()), 1}, this->ctx_->Threads(),
io_preds->DeviceIdx()) io_preds->Device())
.Eval(io_preds); .Eval(io_preds);
} }
const char* DefaultEvalMetric() const override { [[nodiscard]] const char* DefaultEvalMetric() const override {
return "error"; return "error";
} }
@ -93,5 +92,4 @@ XGBOOST_REGISTER_OBJECTIVE(HingeObj, "binary:hinge")
.describe("Hinge loss. Expects labels to be in [0,1f]") .describe("Hinge loss. Expects labels to be in [0,1f]")
.set_body([]() { return new HingeObj(); }); .set_body([]() { return new HingeObj(); });
} // namespace obj } // namespace xgboost::obj
} // namespace xgboost

View File

@ -20,8 +20,8 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* b
CheckInitInputs(info); CheckInitInputs(info);
} }
// Avoid altering any state in child objective. // Avoid altering any state in child objective.
HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id); HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->Device());
linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->gpu_id); linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->Device());
Json config{Object{}}; Json config{Object{}};
this->SaveConfig(&config); this->SaveConfig(&config);

View File

@ -103,10 +103,10 @@ class LambdaRankObj : public FitIntercept {
// Update position biased for unbiased click data // Update position biased for unbiased click data
void UpdatePositionBias() { void UpdatePositionBias() {
li_full_.SetDevice(ctx_->gpu_id); li_full_.SetDevice(ctx_->Device());
lj_full_.SetDevice(ctx_->gpu_id); lj_full_.SetDevice(ctx_->Device());
li_.SetDevice(ctx_->gpu_id); li_.SetDevice(ctx_->Device());
lj_.SetDevice(ctx_->gpu_id); lj_.SetDevice(ctx_->Device());
if (ctx_->IsCPU()) { if (ctx_->IsCPU()) {
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()), cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),

View File

@ -290,12 +290,12 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const
linalg::VectorView<double> li, linalg::VectorView<double> lj, linalg::VectorView<double> li, linalg::VectorView<double> lj,
linalg::Matrix<GradientPair>* out_gpair) { linalg::Matrix<GradientPair>* out_gpair) {
// boilerplate // boilerplate
std::int32_t device_id = ctx->gpu_id; auto device = ctx->Device();
dh::safe_cuda(cudaSetDevice(device_id)); dh::safe_cuda(cudaSetDevice(device.ordinal));
auto n_groups = p_cache->Groups(); auto n_groups = p_cache->Groups();
info.labels.SetDevice(device_id); info.labels.SetDevice(device);
preds.SetDevice(device_id); preds.SetDevice(device);
out_gpair->SetDevice(ctx->Device()); out_gpair->SetDevice(ctx->Device());
out_gpair->Reshape(preds.Size(), 1); out_gpair->Reshape(preds.Size(), 1);

View File

@ -63,7 +63,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
const int nclass = param_.num_class; const int nclass = param_.num_class;
const auto ndata = static_cast<int64_t>(preds.Size() / nclass); const auto ndata = static_cast<int64_t>(preds.Size() / nclass);
auto device = ctx_->gpu_id; auto device = ctx_->Device();
out_gpair->SetDevice(device); out_gpair->SetDevice(device);
info.labels.SetDevice(device); info.labels.SetDevice(device);
info.weights_.SetDevice(device); info.weights_.SetDevice(device);
@ -133,7 +133,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
const int nclass = param_.num_class; const int nclass = param_.num_class;
const auto ndata = static_cast<int64_t>(io_preds->Size() / nclass); const auto ndata = static_cast<int64_t>(io_preds->Size() / nclass);
auto device = io_preds->DeviceIdx(); auto device = io_preds->Device();
if (prob) { if (prob) {
common::Transform<>::Init( common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) { [=] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {

View File

@ -70,15 +70,15 @@ class QuantileRegression : public ObjFunction {
out_gpair->Reshape(info.num_row_, n_targets); out_gpair->Reshape(info.num_row_, n_targets);
auto gpair = out_gpair->View(ctx_->Device()); auto gpair = out_gpair->View(ctx_->Device());
info.weights_.SetDevice(ctx_->gpu_id); info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan() common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()}; : info.weights_.ConstDeviceSpan()};
preds.SetDevice(ctx_->gpu_id); preds.SetDevice(ctx_->Device());
auto predt = linalg::MakeVec(&preds); auto predt = linalg::MakeVec(&preds);
auto n_samples = info.num_row_; auto n_samples = info.num_row_;
alpha_.SetDevice(ctx_->gpu_id); alpha_.SetDevice(ctx_->Device());
auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan(); auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
linalg::ElementWiseKernel( linalg::ElementWiseKernel(
@ -103,7 +103,7 @@ class QuantileRegression : public ObjFunction {
CHECK(!alpha_.Empty()); CHECK(!alpha_.Empty());
auto n_targets = this->Targets(info); auto n_targets = this->Targets(info);
base_score->SetDevice(ctx_->gpu_id); base_score->SetDevice(ctx_->Device());
base_score->Reshape(n_targets); base_score->Reshape(n_targets);
double sw{0}; double sw{0};
@ -129,7 +129,7 @@ class QuantileRegression : public ObjFunction {
} }
} else { } else {
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
alpha_.SetDevice(ctx_->gpu_id); alpha_.SetDevice(ctx_->Device());
auto d_alpha = alpha_.ConstDeviceSpan(); auto d_alpha = alpha_.ConstDeviceSpan();
auto d_labels = info.labels.View(ctx_->Device()); auto d_labels = info.labels.View(ctx_->Device());
auto seg_it = dh::MakeTransformIterator<std::size_t>( auto seg_it = dh::MakeTransformIterator<std::size_t>(
@ -148,7 +148,7 @@ class QuantileRegression : public ObjFunction {
val_it + n, base_score->Data()); val_it + n, base_score->Data());
sw = info.num_row_; sw = info.num_row_;
} else { } else {
info.weights_.SetDevice(ctx_->gpu_id); info.weights_.SetDevice(ctx_->Device());
auto d_weights = info.weights_.ConstDeviceSpan(); auto d_weights = info.weights_.ConstDeviceSpan();
auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul), auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) { [=] XGBOOST_DEVICE(std::size_t i) {

View File

@ -116,7 +116,7 @@ class RegLossObj : public FitIntercept {
size_t const ndata = preds.Size(); size_t const ndata = preds.Size();
out_gpair->SetDevice(ctx_->Device()); out_gpair->SetDevice(ctx_->Device());
auto device = ctx_->gpu_id; auto device = ctx_->Device();
bool is_null_weight = info.weights_.Size() == 0; bool is_null_weight = info.weights_.Size() == 0;
auto scale_pos_weight = param_.scale_pos_weight; auto scale_pos_weight = param_.scale_pos_weight;
@ -124,7 +124,7 @@ class RegLossObj : public FitIntercept {
additional_input_.HostVector().begin()[1] = is_null_weight; additional_input_.HostVector().begin()[1] = is_null_weight;
const size_t nthreads = ctx_->Threads(); const size_t nthreads = ctx_->Threads();
bool on_device = device >= 0; bool on_device = device.IsCUDA();
// On CPU we run the transformation each thread processing a contigious block of data // On CPU we run the transformation each thread processing a contigious block of data
// for better performance. // for better performance.
const size_t n_data_blocks = std::max(static_cast<size_t>(1), (on_device ? ndata : nthreads)); const size_t n_data_blocks = std::max(static_cast<size_t>(1), (on_device ? ndata : nthreads));
@ -175,7 +175,7 @@ class RegLossObj : public FitIntercept {
_preds[_idx] = Loss::PredTransform(_preds[_idx]); _preds[_idx] = Loss::PredTransform(_preds[_idx]);
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(), common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx()) io_preds->Device())
.Eval(io_preds); .Eval(io_preds);
} }
@ -246,14 +246,14 @@ class PseudoHuberRegression : public FitIntercept {
CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0."; CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0.";
auto labels = info.labels.View(ctx_->Device()); auto labels = info.labels.View(ctx_->Device());
out_gpair->SetDevice(ctx_->gpu_id); out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info)); out_gpair->Reshape(info.num_row_, this->Targets(info));
auto gpair = out_gpair->View(ctx_->Device()); auto gpair = out_gpair->View(ctx_->Device());
preds.SetDevice(ctx_->gpu_id); preds.SetDevice(ctx_->Device());
auto predt = linalg::MakeVec(&preds); auto predt = linalg::MakeVec(&preds);
info.weights_.SetDevice(ctx_->gpu_id); info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan() common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()}; : info.weights_.ConstDeviceSpan()};
@ -327,7 +327,7 @@ class PoissonRegression : public FitIntercept {
size_t const ndata = preds.Size(); size_t const ndata = preds.Size();
out_gpair->SetDevice(ctx_->Device()); out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info)); out_gpair->Reshape(info.num_row_, this->Targets(info));
auto device = ctx_->gpu_id; auto device = ctx_->Device();
label_correct_.Resize(1); label_correct_.Resize(1);
label_correct_.Fill(1); label_correct_.Fill(1);
@ -369,7 +369,7 @@ class PoissonRegression : public FitIntercept {
_preds[_idx] = expf(_preds[_idx]); _preds[_idx] = expf(_preds[_idx]);
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(), common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx()) io_preds->Device())
.Eval(io_preds); .Eval(io_preds);
} }
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override { void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@ -512,7 +512,7 @@ class GammaRegression : public FitIntercept {
CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty"; CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided"; CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided";
const size_t ndata = preds.Size(); const size_t ndata = preds.Size();
auto device = ctx_->gpu_id; auto device = ctx_->Device();
out_gpair->SetDevice(ctx_->Device()); out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info)); out_gpair->Reshape(info.num_row_, this->Targets(info));
label_correct_.Resize(1); label_correct_.Resize(1);
@ -555,7 +555,7 @@ class GammaRegression : public FitIntercept {
_preds[_idx] = expf(_preds[_idx]); _preds[_idx] = expf(_preds[_idx]);
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(), common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx()) io_preds->Device())
.Eval(io_preds); .Eval(io_preds);
} }
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override { void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@ -613,7 +613,7 @@ class TweedieRegression : public FitIntercept {
out_gpair->SetDevice(ctx_->Device()); out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info)); out_gpair->Reshape(info.num_row_, this->Targets(info));
auto device = ctx_->gpu_id; auto device = ctx_->Device();
label_correct_.Resize(1); label_correct_.Resize(1);
label_correct_.Fill(1); label_correct_.Fill(1);
@ -660,7 +660,7 @@ class TweedieRegression : public FitIntercept {
_preds[_idx] = expf(_preds[_idx]); _preds[_idx] = expf(_preds[_idx]);
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(), common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx()) io_preds->Device())
.Eval(io_preds); .Eval(io_preds);
} }
@ -711,9 +711,9 @@ class MeanAbsoluteError : public ObjFunction {
out_gpair->Reshape(info.num_row_, this->Targets(info)); out_gpair->Reshape(info.num_row_, this->Targets(info));
auto gpair = out_gpair->View(ctx_->Device()); auto gpair = out_gpair->View(ctx_->Device());
preds.SetDevice(ctx_->gpu_id); preds.SetDevice(ctx_->Device());
auto predt = linalg::MakeVec(&preds); auto predt = linalg::MakeVec(&preds);
info.weights_.SetDevice(ctx_->gpu_id); info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan() common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()}; : info.weights_.ConstDeviceSpan()};
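
The objective hunks repeat one idiom: pin a HostDeviceVector to ctx_->Device() and then pick the host or device span based on the context. A sketch of that idiom, assuming the HostDeviceVector interface used above (AccessWeights is illustrative):

    #include "xgboost/context.h"
    #include "xgboost/host_device_vector.h"
    #include "xgboost/span.h"

    xgboost::common::Span<float const> AccessWeights(
        xgboost::Context const* ctx, xgboost::HostDeviceVector<float> const& weights) {
      weights.SetDevice(ctx->Device());  // pin to the context's device before taking a span
      return ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan();
    }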

View File

@ -180,33 +180,30 @@ struct DeviceAdapterLoader {
XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared, XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared,
bst_feature_t num_features, bst_row_t num_rows, bst_feature_t num_features, bst_row_t num_rows,
size_t entry_start, float missing) : size_t entry_start, float missing)
batch{batch}, : batch{batch}, columns{num_features}, use_shared{use_shared}, is_valid{missing} {
columns{num_features}, extern __shared__ float _smem[];
use_shared{use_shared}, smem = _smem;
is_valid{missing} { if (use_shared) {
extern __shared__ float _smem[]; uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
smem = _smem; size_t shared_elements = blockDim.x * num_features;
if (use_shared) { dh::BlockFill(smem, shared_elements, nanf(""));
uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; __syncthreads();
size_t shared_elements = blockDim.x * num_features; if (global_idx < num_rows) {
dh::BlockFill(smem, shared_elements, nanf("")); auto beg = global_idx * columns;
__syncthreads(); auto end = (global_idx + 1) * columns;
if (global_idx < num_rows) { for (size_t i = beg; i < end; ++i) {
auto beg = global_idx * columns; auto value = batch.GetElement(i).value;
auto end = (global_idx + 1) * columns; if (is_valid(value)) {
for (size_t i = beg; i < end; ++i) { smem[threadIdx.x * num_features + (i - beg)] = value;
auto value = batch.GetElement(i).value;
if (is_valid(value)) {
smem[threadIdx.x * num_features + (i - beg)] = value;
}
} }
} }
} }
__syncthreads();
} }
__syncthreads();
}
XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const { [[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
if (use_shared) { if (use_shared) {
return smem[threadIdx.x * columns + fidx]; return smem[threadIdx.x * columns + fidx];
} }
@ -340,11 +337,11 @@ class DeviceModel {
size_t tree_end_; // NOLINT size_t tree_end_; // NOLINT
int num_group; int num_group;
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) { void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, DeviceOrd device) {
dh::safe_cuda(cudaSetDevice(gpu_id)); dh::safe_cuda(cudaSetDevice(device.ordinal));
// Copy decision trees to device // Copy decision trees to device
tree_segments = HostDeviceVector<size_t>({}, gpu_id); tree_segments = HostDeviceVector<size_t>({}, device);
auto& h_tree_segments = tree_segments.HostVector(); auto& h_tree_segments = tree_segments.HostVector();
h_tree_segments.reserve((tree_end - tree_begin) + 1); h_tree_segments.reserve((tree_end - tree_begin) + 1);
size_t sum = 0; size_t sum = 0;
@ -354,8 +351,8 @@ class DeviceModel {
h_tree_segments.push_back(sum); h_tree_segments.push_back(sum);
} }
nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), gpu_id); nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), device);
stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), gpu_id); stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), device);
auto d_nodes = nodes.DevicePointer(); auto d_nodes = nodes.DevicePointer();
auto d_stats = stats.DevicePointer(); auto d_stats = stats.DevicePointer();
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
@ -369,12 +366,12 @@ class DeviceModel {
sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault)); sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault));
} }
tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id); tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, device);
auto& h_tree_group = tree_group.HostVector(); auto& h_tree_group = tree_group.HostVector();
std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size()); std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size());
// Initialize categorical splits. // Initialize categorical splits.
split_types.SetDevice(gpu_id); split_types.SetDevice(device);
std::vector<FeatureType>& h_split_types = split_types.HostVector(); std::vector<FeatureType>& h_split_types = split_types.HostVector();
h_split_types.resize(h_tree_segments.back()); h_split_types.resize(h_tree_segments.back());
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@ -383,8 +380,8 @@ class DeviceModel {
h_split_types.begin() + h_tree_segments[tree_idx - tree_begin]); h_split_types.begin() + h_tree_segments[tree_idx - tree_begin]);
} }
categories = HostDeviceVector<uint32_t>({}, gpu_id); categories = HostDeviceVector<uint32_t>({}, device);
categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, gpu_id); categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, device);
std::vector<uint32_t> &h_categories = categories.HostVector(); std::vector<uint32_t> &h_categories = categories.HostVector();
std::vector<uint32_t> &h_split_cat_segments = categories_tree_segments.HostVector(); std::vector<uint32_t> &h_split_cat_segments = categories_tree_segments.HostVector();
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@ -397,7 +394,7 @@ class DeviceModel {
} }
categories_node_segments = HostDeviceVector<RegTree::CategoricalSplitMatrix::Segment>( categories_node_segments = HostDeviceVector<RegTree::CategoricalSplitMatrix::Segment>(
h_tree_segments.back(), {}, gpu_id); h_tree_segments.back(), {}, device);
std::vector<RegTree::CategoricalSplitMatrix::Segment>& h_categories_node_segments = std::vector<RegTree::CategoricalSplitMatrix::Segment>& h_categories_node_segments =
categories_node_segments.HostVector(); categories_node_segments.HostVector();
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@ -485,8 +482,8 @@ struct PathInfo {
void ExtractPaths( void ExtractPaths(
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths, dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
DeviceModel *model, dh::device_vector<uint32_t> *path_categories, DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
int gpu_id) { DeviceOrd device) {
dh::safe_cuda(cudaSetDevice(gpu_id)); dh::safe_cuda(cudaSetDevice(device.ordinal));
auto& device_model = *model; auto& device_model = *model;
dh::caching_device_vector<PathInfo> info(device_model.nodes.Size()); dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
@ -773,12 +770,12 @@ class ColumnSplitHelper {
template <bool predict_leaf> template <bool predict_leaf>
void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model, void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
bst_feature_t num_features, std::uint32_t num_group) const { bst_feature_t num_features, std::uint32_t num_group) const {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
dh::caching_device_vector<BitType> decision_storage{}; dh::caching_device_vector<BitType> decision_storage{};
dh::caching_device_vector<BitType> missing_storage{}; dh::caching_device_vector<BitType> missing_storage{};
auto constexpr kBlockThreads = 128; auto constexpr kBlockThreads = 128;
auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->gpu_id); auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->Ordinal());
auto const shared_memory_bytes = auto const shared_memory_bytes =
SharedMemoryBytes<kBlockThreads>(num_features, max_shared_memory_bytes); SharedMemoryBytes<kBlockThreads>(num_features, max_shared_memory_bytes);
auto const use_shared = shared_memory_bytes != 0; auto const use_shared = shared_memory_bytes != 0;
@ -791,8 +788,8 @@ class ColumnSplitHelper {
BitVector decision_bits{dh::ToSpan(decision_storage)}; BitVector decision_bits{dh::ToSpan(decision_storage)};
BitVector missing_bits{dh::ToSpan(missing_storage)}; BitVector missing_bits{dh::ToSpan(missing_storage)};
batch.offset.SetDevice(ctx_->gpu_id); batch.offset.SetDevice(ctx_->Device());
batch.data.SetDevice(ctx_->gpu_id); batch.data.SetDevice(ctx_->Device());
std::size_t entry_start = 0; std::size_t entry_start = 0;
SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features); SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features);
@ -823,9 +820,9 @@ class ColumnSplitHelper {
void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage, void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage,
dh::caching_device_vector<BitType>* missing_storage) const { dh::caching_device_vector<BitType>* missing_storage) const {
collective::AllReduce<collective::Operation::kBitwiseOR>( collective::AllReduce<collective::Operation::kBitwiseOR>(
ctx_->gpu_id, decision_storage->data().get(), decision_storage->size()); ctx_->Ordinal(), decision_storage->data().get(), decision_storage->size());
collective::AllReduce<collective::Operation::kBitwiseAND>( collective::AllReduce<collective::Operation::kBitwiseAND>(
ctx_->gpu_id, missing_storage->data().get(), missing_storage->size()); ctx_->Ordinal(), missing_storage->data().get(), missing_storage->size());
} }
void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage, void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage,
@ -853,12 +850,12 @@ class GPUPredictor : public xgboost::Predictor {
size_t num_features, size_t num_features,
HostDeviceVector<bst_float>* predictions, HostDeviceVector<bst_float>* predictions,
size_t batch_offset, bool is_dense) const { size_t batch_offset, bool is_dense) const {
batch.offset.SetDevice(ctx_->gpu_id); batch.offset.SetDevice(ctx_->Device());
batch.data.SetDevice(ctx_->gpu_id); batch.data.SetDevice(ctx_->Device());
const uint32_t BLOCK_THREADS = 128; const uint32_t BLOCK_THREADS = 128;
size_t num_rows = batch.Size(); size_t num_rows = batch.Size();
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS)); auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS));
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id); auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());
size_t shared_memory_bytes = size_t shared_memory_bytes =
SharedMemoryBytes<BLOCK_THREADS>(num_features, max_shared_memory_bytes); SharedMemoryBytes<BLOCK_THREADS>(num_features, max_shared_memory_bytes);
bool use_shared = shared_memory_bytes != 0; bool use_shared = shared_memory_bytes != 0;
@ -914,10 +911,10 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_end - tree_begin == 0) { if (tree_end - tree_begin == 0) {
return; return;
} }
out_preds->SetDevice(ctx_->gpu_id); out_preds->SetDevice(ctx_->Device());
auto const& info = dmat->Info(); auto const& info = dmat->Info();
DeviceModel d_model; DeviceModel d_model;
d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id); d_model.Init(model, tree_begin, tree_end, ctx_->Device());
if (info.IsColumnSplit()) { if (info.IsColumnSplit()) {
column_split_helper_.PredictBatch(dmat, out_preds, model, d_model); column_split_helper_.PredictBatch(dmat, out_preds, model, d_model);
@ -934,10 +931,10 @@ class GPUPredictor : public xgboost::Predictor {
} else { } else {
size_t batch_offset = 0; size_t batch_offset = 0;
for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) { for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
dmat->Info().feature_types.SetDevice(ctx_->gpu_id); dmat->Info().feature_types.SetDevice(ctx_->Device());
auto feature_types = dmat->Info().feature_types.ConstDeviceSpan(); auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
this->PredictInternal( this->PredictInternal(
page.Impl()->GetDeviceAccessor(ctx_->gpu_id, feature_types), page.Impl()->GetDeviceAccessor(ctx_->Device(), feature_types),
d_model, d_model,
out_preds, out_preds,
batch_offset); batch_offset);
@ -951,16 +948,15 @@ class GPUPredictor : public xgboost::Predictor {
: Predictor::Predictor{ctx}, column_split_helper_{ctx} {} : Predictor::Predictor{ctx}, column_split_helper_{ctx} {}
~GPUPredictor() override { ~GPUPredictor() override {
if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) { if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
} }
} }
void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts, void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts,
const gbm::GBTreeModel& model, uint32_t tree_begin, const gbm::GBTreeModel& model, uint32_t tree_begin,
uint32_t tree_end = 0) const override { uint32_t tree_end = 0) const override {
int device = ctx_->gpu_id; CHECK(ctx_->Device().IsCUDA()) << "Set `device' to `cuda` for processing GPU data.";
CHECK_GE(device, 0) << "Set `gpu_id' to positive value for processing GPU data.";
auto* out_preds = &predts->predictions; auto* out_preds = &predts->predictions;
if (tree_end == 0) { if (tree_end == 0) {
tree_end = model.trees.size(); tree_end = model.trees.size();
@ -978,9 +974,9 @@ class GPUPredictor : public xgboost::Predictor {
auto m = std::any_cast<std::shared_ptr<Adapter>>(x); auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature) CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
<< "Number of columns in data must equal to trained model."; << "Number of columns in data must equal to trained model.";
CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx()) CHECK_EQ(dh::CurrentDevice(), m->Device().ordinal)
<< "XGBoost is running on device: " << this->ctx_->gpu_id << ", " << "XGBoost is running on device: " << this->ctx_->Device().Name() << ", "
<< "but data is on: " << m->DeviceIdx(); << "but data is on: " << m->Device().Name();
if (p_m) { if (p_m) {
p_m->Info().num_row_ = m->NumRows(); p_m->Info().num_row_ = m->NumRows();
this->InitOutPredictions(p_m->Info(), &(out_preds->predictions), model); this->InitOutPredictions(p_m->Info(), &(out_preds->predictions), model);
@ -989,16 +985,16 @@ class GPUPredictor : public xgboost::Predictor {
info.num_row_ = m->NumRows(); info.num_row_ = m->NumRows();
this->InitOutPredictions(info, &(out_preds->predictions), model); this->InitOutPredictions(info, &(out_preds->predictions), model);
} }
out_preds->predictions.SetDevice(m->DeviceIdx()); out_preds->predictions.SetDevice(m->Device());
const uint32_t BLOCK_THREADS = 128; const uint32_t BLOCK_THREADS = 128;
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(m->NumRows(), BLOCK_THREADS)); auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(m->NumRows(), BLOCK_THREADS));
auto max_shared_memory_bytes = dh::MaxSharedMemory(m->DeviceIdx()); auto max_shared_memory_bytes = dh::MaxSharedMemory(m->Device().ordinal);
size_t shared_memory_bytes = size_t shared_memory_bytes =
SharedMemoryBytes<BLOCK_THREADS>(m->NumColumns(), max_shared_memory_bytes); SharedMemoryBytes<BLOCK_THREADS>(m->NumColumns(), max_shared_memory_bytes);
DeviceModel d_model; DeviceModel d_model;
d_model.Init(model, tree_begin, tree_end, m->DeviceIdx()); d_model.Init(model, tree_begin, tree_end, m->Device());
bool use_shared = shared_memory_bytes != 0; bool use_shared = shared_memory_bytes != 0;
size_t entry_start = 0; size_t entry_start = 0;
@ -1050,9 +1046,8 @@ class GPUPredictor : public xgboost::Predictor {
} }
CHECK(!p_fmat->Info().IsColumnSplit()) CHECK(!p_fmat->Info().IsColumnSplit())
<< "Predict contribution support for column-wise data split is not yet implemented."; << "Predict contribution support for column-wise data split is not yet implemented.";
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
out_contribs->SetDevice(ctx_->gpu_id); out_contribs->SetDevice(ctx_->Device());
if (tree_end == 0 || tree_end > model.trees.size()) { if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size()); tree_end = static_cast<uint32_t>(model.trees.size());
} }
@ -1070,12 +1065,12 @@ class GPUPredictor : public xgboost::Predictor {
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
device_paths; device_paths;
DeviceModel d_model; DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->gpu_id); d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories; dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id); ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
for (auto& batch : p_fmat->GetBatches<SparsePage>()) { for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id); batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->gpu_id); batch.offset.SetDevice(ctx_->Device());
SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature); model.learner_model_param->num_feature);
auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns; auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
@ -1084,7 +1079,7 @@ class GPUPredictor : public xgboost::Predictor {
dh::tend(phis)); dh::tend(phis));
} }
// Add the base margin term to last column // Add the base margin term to last column
p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id); p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan(); const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
auto base_score = model.learner_model_param->BaseScore(ctx_); auto base_score = model.learner_model_param->BaseScore(ctx_);
@ -1109,8 +1104,8 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_weights != nullptr) { if (tree_weights != nullptr) {
LOG(FATAL) << "Dart booster feature " << not_implemented; LOG(FATAL) << "Dart booster feature " << not_implemented;
} }
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
out_contribs->SetDevice(ctx_->gpu_id); out_contribs->SetDevice(ctx_->Device());
if (tree_end == 0 || tree_end > model.trees.size()) { if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size()); tree_end = static_cast<uint32_t>(model.trees.size());
} }
@ -1129,12 +1124,12 @@ class GPUPredictor : public xgboost::Predictor {
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
device_paths; device_paths;
DeviceModel d_model; DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->gpu_id); d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories; dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id); ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
for (auto& batch : p_fmat->GetBatches<SparsePage>()) { for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id); batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->gpu_id); batch.offset.SetDevice(ctx_->Device());
SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature); model.learner_model_param->num_feature);
auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns; auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
@ -1143,7 +1138,7 @@ class GPUPredictor : public xgboost::Predictor {
dh::tend(phis)); dh::tend(phis));
} }
// Add the base margin term to last column // Add the base margin term to last column
p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id); p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan(); const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
auto base_score = model.learner_model_param->BaseScore(ctx_); auto base_score = model.learner_model_param->BaseScore(ctx_);
@ -1168,24 +1163,24 @@ class GPUPredictor : public xgboost::Predictor {
void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions, void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions,
const gbm::GBTreeModel &model, const gbm::GBTreeModel &model,
unsigned tree_end) const override { unsigned tree_end) const override {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());
const MetaInfo& info = p_fmat->Info(); const MetaInfo& info = p_fmat->Info();
bst_row_t num_rows = info.num_row_; bst_row_t num_rows = info.num_row_;
if (tree_end == 0 || tree_end > model.trees.size()) { if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size()); tree_end = static_cast<uint32_t>(model.trees.size());
} }
predictions->SetDevice(ctx_->gpu_id); predictions->SetDevice(ctx_->Device());
predictions->Resize(num_rows * tree_end); predictions->Resize(num_rows * tree_end);
DeviceModel d_model; DeviceModel d_model;
d_model.Init(model, 0, tree_end, this->ctx_->gpu_id); d_model.Init(model, 0, tree_end, this->ctx_->Device());
if (info.IsColumnSplit()) { if (info.IsColumnSplit()) {
column_split_helper_.PredictLeaf(p_fmat, predictions, model, d_model); column_split_helper_.PredictLeaf(p_fmat, predictions, model, d_model);
return; return;
} }
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
constexpr uint32_t kBlockThreads = 128; constexpr uint32_t kBlockThreads = 128;
size_t shared_memory_bytes = SharedMemoryBytes<kBlockThreads>( size_t shared_memory_bytes = SharedMemoryBytes<kBlockThreads>(
info.num_col_, max_shared_memory_bytes); info.num_col_, max_shared_memory_bytes);
@ -1195,8 +1190,8 @@ class GPUPredictor : public xgboost::Predictor {
if (p_fmat->PageExists<SparsePage>()) { if (p_fmat->PageExists<SparsePage>()) {
for (auto const& batch : p_fmat->GetBatches<SparsePage>()) { for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id); batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->gpu_id); batch.offset.SetDevice(ctx_->Device());
bst_row_t batch_offset = 0; bst_row_t batch_offset = 0;
SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(), SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature}; model.learner_model_param->num_feature};
@ -1221,7 +1216,7 @@ class GPUPredictor : public xgboost::Predictor {
} else { } else {
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) { for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
bst_row_t batch_offset = 0; bst_row_t batch_offset = 0;
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->gpu_id)}; EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
size_t num_rows = batch.Size(); size_t num_rows = batch.Size();
auto grid = auto grid =
static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads)); static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
@ -1249,9 +1244,9 @@ class GPUPredictor : public xgboost::Predictor {
private: private:
/*! \brief Reconfigure the device when GPU is changed. */ /*! \brief Reconfigure the device when GPU is changed. */
static size_t ConfigureDevice(int device) { static size_t ConfigureDevice(DeviceOrd device) {
if (device >= 0) { if (device.IsCUDA()) {
return dh::MaxSharedMemory(device); return dh::MaxSharedMemory(device.ordinal);
} }
return 0; return 0;
} }
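
As a rough illustration of the pattern applied throughout this predictor file, helpers that used to take a raw int ordinal and test device >= 0 now take a typed device descriptor and branch on its kind. The sketch below is a minimal, self-contained approximation of that shape; Dev and MaxSharedMemoryFor are hypothetical stand-ins invented for this example, not the real XGBoost types.

    #include <cstddef>
    #include <cstdint>

    // Hypothetical typed device descriptor used only for this sketch.
    struct Dev {
      enum Kind : std::int16_t { kCPU, kCUDA } kind{kCPU};
      std::int16_t ordinal{-1};
      bool IsCUDA() const { return kind == kCUDA; }
    };

    // Hypothetical query; a real build would ask the CUDA runtime for this value.
    std::size_t MaxSharedMemoryFor(std::int16_t /*ordinal*/) { return 48 * 1024; }

    // Old shape: static size_t ConfigureDevice(int device) { if (device >= 0) ... }
    // New shape: branch on the descriptor instead of comparing against a sentinel.
    std::size_t ConfigureDevice(Dev device) {
      return device.IsCUDA() ? MaxSharedMemoryFor(device.ordinal) : 0;
    }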

View File

@ -49,8 +49,8 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_fl
std::size_t n{model.learner_model_param->OutputLength() * info.num_row_}; std::size_t n{model.learner_model_param->OutputLength() * info.num_row_};
const HostDeviceVector<bst_float>* base_margin = info.base_margin_.Data(); const HostDeviceVector<bst_float>* base_margin = info.base_margin_.Data();
if (ctx_->gpu_id >= 0) { if (ctx_->Device().IsCUDA()) {
out_preds->SetDevice(ctx_->gpu_id); out_preds->SetDevice(ctx_->Device());
} }
if (!base_margin->Empty()) { if (!base_margin->Empty()) {
out_preds->Resize(n); out_preds->Resize(n);

View File

@ -19,8 +19,7 @@
#include "xgboost/linalg.h" // TensorView, Tensor, Constant #include "xgboost/linalg.h" // TensorView, Tensor, Constant
#include "xgboost/logging.h" // CHECK_EQ #include "xgboost/logging.h" // CHECK_EQ
namespace xgboost { namespace xgboost::tree {
namespace tree {
namespace cpu_impl { namespace cpu_impl {
void FitStump(Context const* ctx, MetaInfo const& info, void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair, linalg::TensorView<GradientPair const, 2> gpair,
@ -68,7 +67,7 @@ inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView<Gradien
void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair, void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair,
bst_target_t n_targets, linalg::Vector<float>* out) { bst_target_t n_targets, linalg::Vector<float>* out) {
out->SetDevice(ctx->gpu_id); out->SetDevice(ctx->Device());
out->Reshape(n_targets); out->Reshape(n_targets);
gpair.SetDevice(ctx->Device()); gpair.SetDevice(ctx->Device());
@ -76,5 +75,4 @@ void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientP
ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView()) ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
: cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device())); : cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()));
} }
} // namespace tree } // namespace xgboost::tree
} // namespace xgboost

View File

@ -21,9 +21,7 @@
#include "xgboost/logging.h" // CHECK_EQ #include "xgboost/logging.h" // CHECK_EQ
#include "xgboost/span.h" // span #include "xgboost/span.h" // span
namespace xgboost { namespace xgboost::tree::cuda_impl {
namespace tree {
namespace cuda_impl {
void FitStump(Context const* ctx, MetaInfo const& info, void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out) { linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out) {
auto n_targets = out.Size(); auto n_targets = out.Size();
@ -50,7 +48,7 @@ void FitStump(Context const* ctx, MetaInfo const& info,
thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it, thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
thrust::make_discard_iterator(), dh::tbegin(d_sum.Values())); thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));
collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast<double*>(d_sum.Values().data()), collective::GlobalSum(info, ctx->Device(), reinterpret_cast<double*>(d_sum.Values().data()),
d_sum.Size() * 2); d_sum.Size() * 2);
thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets, thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
@ -59,6 +57,4 @@ void FitStump(Context const* ctx, MetaInfo const& info,
CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess())); CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess()));
}); });
} }
} // namespace cuda_impl } // namespace xgboost::tree::cuda_impl
} // namespace tree
} // namespace xgboost
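
Several of the files touched here also collapse the old two-level namespace blocks into a single C++17 nested-namespace definition, which is what the changed closing comments above reflect. A generic before/after sketch with placeholder names:

    // Before: two nested blocks, two closing braces to keep in sync.
    namespace outer { namespace inner {
    int BeforeStyle() { return 1; }
    }  // namespace inner
    }  // namespace outer

    // After (C++17): one nested-namespace definition, one closing brace.
    namespace outer::inner {
    int AfterStyle() { return 2; }
    }  // namespace outer::inner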

View File

@ -413,7 +413,7 @@ void GPUHistEvaluator::EvaluateSplits(
auto const world_size = collective::GetWorldSize(); auto const world_size = collective::GetWorldSize();
dh::TemporaryArray<DeviceSplitCandidate> all_candidate_storage(out_splits.size() * world_size); dh::TemporaryArray<DeviceSplitCandidate> all_candidate_storage(out_splits.size() * world_size);
auto all_candidates = dh::ToSpan(all_candidate_storage); auto all_candidates = dh::ToSpan(all_candidate_storage);
collective::AllGather(device_, out_splits.data(), all_candidates.data(), collective::AllGather(device_.ordinal, out_splits.data(), all_candidates.data(),
out_splits.size() * sizeof(DeviceSplitCandidate)); out_splits.size() * sizeof(DeviceSplitCandidate));
// Reduce to get the best candidate from all workers. // Reduce to get the best candidate from all workers.

View File

@ -85,7 +85,7 @@ class GPUHistEvaluator {
std::size_t node_categorical_storage_size_ = 0; std::size_t node_categorical_storage_size_ = 0;
// Is the data split column-wise? // Is the data split column-wise?
bool is_column_split_ = false; bool is_column_split_ = false;
int32_t device_; DeviceOrd device_;
// Copy the categories from device to host asynchronously. // Copy the categories from device to host asynchronously.
void CopyToHost( const std::vector<bst_node_t>& nidx); void CopyToHost( const std::vector<bst_node_t>& nidx);
@ -133,14 +133,14 @@ class GPUHistEvaluator {
} }
public: public:
GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, int32_t device) GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, DeviceOrd device)
: tree_evaluator_{param, n_features, device}, param_{param} {} : tree_evaluator_{param, n_features, device}, param_{param} {}
/** /**
* \brief Reset the evaluator, should be called before any use. * \brief Reset the evaluator, should be called before any use.
*/ */
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft, void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param, bool is_column_split, bst_feature_t n_features, TrainParam const &param, bool is_column_split,
int32_t device); DeviceOrd device);
/** /**
* \brief Get host category storage for nidx. Different from the internal version, this * \brief Get host category storage for nidx. Different from the internal version, this

View File

@ -1,5 +1,5 @@
/*! /*!
* Copyright 2022 by XGBoost Contributors * Copyright 2022-2023 by XGBoost Contributors
* *
* \brief Some components of GPU Hist evaluator, this file only exist to reduce nvcc * \brief Some components of GPU Hist evaluator, this file only exist to reduce nvcc
* compilation time. * compilation time.
@ -12,11 +12,10 @@
#include "evaluate_splits.cuh" #include "evaluate_splits.cuh"
#include "xgboost/data.h" #include "xgboost/data.h"
namespace xgboost { namespace xgboost::tree {
namespace tree {
void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft, void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param, bst_feature_t n_features, TrainParam const &param,
bool is_column_split, int32_t device) { bool is_column_split, DeviceOrd device) {
param_ = param; param_ = param;
tree_evaluator_ = TreeEvaluator{param, n_features, device}; tree_evaluator_ = TreeEvaluator{param, n_features, device};
has_categoricals_ = cuts.HasCategorical(); has_categoricals_ = cuts.HasCategorical();
@ -127,6 +126,4 @@ common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
}); });
return dh::ToSpan(cat_sorted_idx_); return dh::ToSpan(cat_sorted_idx_);
} }
} // namespace xgboost::tree
} // namespace tree
} // namespace xgboost

View File

@ -1,5 +1,5 @@
/*! /**
* Copyright 2020 by XGBoost Contributors * Copyright 2020-2023 by XGBoost Contributors
*/ */
#ifndef FEATURE_GROUPS_CUH_ #ifndef FEATURE_GROUPS_CUH_
#define FEATURE_GROUPS_CUH_ #define FEATURE_GROUPS_CUH_
@ -102,11 +102,10 @@ struct FeatureGroups {
InitSingle(cuts); InitSingle(cuts);
} }
FeatureGroupsAccessor DeviceAccessor(int device) const { [[nodiscard]] FeatureGroupsAccessor DeviceAccessor(DeviceOrd device) const {
feature_segments.SetDevice(device); feature_segments.SetDevice(device);
bin_segments.SetDevice(device); bin_segments.SetDevice(device);
return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(), return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(), max_group_bins};
max_group_bins};
} }
private: private:
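
The DeviceAccessor change above also adds [[nodiscard]], so a caller that silently drops the returned view gets a compiler warning. A small generic illustration; Accessor and Table are made-up names for this sketch:

    #include <cstddef>
    #include <vector>

    struct Accessor {
      const int* data{nullptr};
      std::size_t size{0};
    };

    class Table {
     public:
      // Discarding the returned view is almost certainly a bug, so flag it.
      [[nodiscard]] Accessor DeviceAccessor() const { return {values_.data(), values_.size()}; }

     private:
      std::vector<int> values_{1, 2, 3};
    };

    // Table{}.DeviceAccessor();              // warns: result of nodiscard call ignored
    // auto view = Table{}.DeviceAccessor();  // intended usage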

View File

@ -167,10 +167,10 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) { for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
auto page = batch.Impl(); auto page = batch.Impl();
if (!page_) { if (!page_) {
page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense, page_ = std::make_unique<EllpackPageImpl>(ctx->Device(), page->Cuts(), page->is_dense,
page->row_stride, dmat->Info().num_row_); page->row_stride, dmat->Info().num_row_);
} }
size_t num_elements = page_->Copy(ctx->gpu_id, page, offset); size_t num_elements = page_->Copy(ctx->Device(), page, offset);
offset += num_elements; offset += num_elements;
} }
page_concatenated_ = true; page_concatenated_ = true;
@ -228,13 +228,13 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
auto first_page = (*batch_iterator.begin()).Impl(); auto first_page = (*batch_iterator.begin()).Impl();
// Create a new ELLPACK page with empty rows. // Create a new ELLPACK page with empty rows.
page_.reset(); // Release the device memory first before reallocating page_.reset(); // Release the device memory first before reallocating
page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense, page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense,
first_page->row_stride, sample_rows)); first_page->row_stride, sample_rows));
// Compact the ELLPACK pages into the single sample page. // Compact the ELLPACK pages into the single sample page.
thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
for (auto& batch : batch_iterator) { for (auto& batch : batch_iterator) {
page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_)); page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_));
} }
return {sample_rows, page_.get(), dh::ToSpan(gpair_)}; return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
@ -306,13 +306,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
auto first_page = (*batch_iterator.begin()).Impl(); auto first_page = (*batch_iterator.begin()).Impl();
// Create a new ELLPACK page with empty rows. // Create a new ELLPACK page with empty rows.
page_.reset(); // Release the device memory first before reallocating page_.reset(); // Release the device memory first before reallocating
page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense, page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense,
first_page->row_stride, sample_rows)); first_page->row_stride, sample_rows));
// Compact the ELLPACK pages into the single sample page. // Compact the ELLPACK pages into the single sample page.
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
for (auto& batch : batch_iterator) { for (auto& batch : batch_iterator) {
page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_)); page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_));
} }
return {sample_rows, page_.get(), dh::ToSpan(gpair_)}; return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
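
Both sampling paths above keep the comment about releasing the old ELLPACK page before allocating its replacement; on the GPU this matters because holding the old and new pages at the same time can double peak device memory. A minimal host-side sketch of the same idiom with std::unique_ptr, where Page is a hypothetical placeholder for a large buffer:

    #include <cstddef>
    #include <memory>
    #include <vector>

    // Hypothetical large buffer standing in for an ELLPACK page.
    struct Page {
      explicit Page(std::size_t n) : storage(n) {}
      std::vector<float> storage;
    };

    void Rebuild(std::unique_ptr<Page>* page, std::size_t new_size) {
      page->reset();                    // free the old buffer first ...
      page->reset(new Page(new_size));  // ... so only one allocation is live at a time
    }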

View File

@ -13,15 +13,15 @@
namespace xgboost { namespace xgboost {
namespace tree { namespace tree {
RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) RowPartitioner::RowPartitioner(DeviceOrd device_idx, size_t num_rows)
: device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) { : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {
dh::safe_cuda(cudaSetDevice(device_idx_)); dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
} }
RowPartitioner::~RowPartitioner() { RowPartitioner::~RowPartitioner() {
dh::safe_cuda(cudaSetDevice(device_idx_)); dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
} }
common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) { common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) {
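
The constructor above puts every partition into the same initial state: one root segment covering all rows and a row-index buffer holding the identity permutation (filled with thrust::sequence on the device). A host-side sketch of that initial state, using hypothetical stand-ins for Segment and NodePositionInfo:

    #include <cstddef>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    struct Segment { std::size_t begin{0}, end{0}; };
    struct NodePositionInfo { Segment segment; };

    struct RowPartitionSketch {
      std::vector<NodePositionInfo> ridx_segments;
      std::vector<std::uint32_t> ridx;

      explicit RowPartitionSketch(std::size_t num_rows) : ridx(num_rows) {
        // A single root node owns every row to start with.
        ridx_segments.push_back(NodePositionInfo{Segment{0, num_rows}});
        // Identity permutation: row i initially sits at position i.
        std::iota(ridx.begin(), ridx.end(), 0u);
      }
    };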

View File

@ -199,7 +199,7 @@ class RowPartitioner {
static constexpr bst_node_t kIgnoredTreePosition = -1; static constexpr bst_node_t kIgnoredTreePosition = -1;
private: private:
int device_idx_; DeviceOrd device_idx_;
/*! \brief In here if you want to find the rows belong to a node nid, first you need to /*! \brief In here if you want to find the rows belong to a node nid, first you need to
* get the indices segment from ridx_segments[nid], then get the row index that * get the indices segment from ridx_segments[nid], then get the row index that
* represents position of row in input data X. `RowPartitioner::GetRows` would be a * represents position of row in input data X. `RowPartitioner::GetRows` would be a
@ -223,7 +223,7 @@ class RowPartitioner {
dh::PinnedMemory pinned2_; dh::PinnedMemory pinned2_;
public: public:
RowPartitioner(int device_idx, size_t num_rows); RowPartitioner(DeviceOrd device_idx, size_t num_rows);
~RowPartitioner(); ~RowPartitioner();
RowPartitioner(const RowPartitioner&) = delete; RowPartitioner(const RowPartitioner&) = delete;
RowPartitioner& operator=(const RowPartitioner&) = delete; RowPartitioner& operator=(const RowPartitioner&) = delete;

View File

@ -477,7 +477,7 @@ class HistEvaluator {
: ctx_{ctx}, : ctx_{ctx},
param_{param}, param_{param},
column_sampler_{std::move(sampler)}, column_sampler_{std::move(sampler)},
tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId}, tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), DeviceOrd::CPU()},
is_col_split_{info.IsColumnSplit()} { is_col_split_{info.IsColumnSplit()} {
interaction_constraints_.Configure(*param, info.num_col_); interaction_constraints_.Configure(*param, info.num_col_);
column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(), column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
@ -696,7 +696,7 @@ class HistMultiEvaluator {
stats_ = linalg::Constant(ctx_, GradientPairPrecise{}, 1, n_targets); stats_ = linalg::Constant(ctx_, GradientPairPrecise{}, 1, n_targets);
gain_.resize(1); gain_.resize(1);
linalg::Vector<float> weight({n_targets}, ctx_->gpu_id); linalg::Vector<float> weight({n_targets}, ctx_->Device());
CalcWeight(*param_, root_sum, weight.HostView()); CalcWeight(*param_, root_sum, weight.HostView());
auto root_gain = CalcGainGivenWeight(*param_, root_sum, weight.HostView()); auto root_gain = CalcGainGivenWeight(*param_, root_sum, weight.HostView());
gain_.front() = root_gain; gain_.front() = root_gain;

View File

@ -1,5 +1,5 @@
/*! /**
* Copyright 2018-2020 by Contributors * Copyright 2018-2023 by Contributors
* \file split_evaluator.h * \file split_evaluator.h
* \brief Used for implementing a loss term specific to decision trees. Useful for custom regularisation. * \brief Used for implementing a loss term specific to decision trees. Useful for custom regularisation.
* \author Henry Gouk * \author Henry Gouk
@ -23,8 +23,7 @@
#include "xgboost/host_device_vector.h" #include "xgboost/host_device_vector.h"
#include "xgboost/tree_model.h" #include "xgboost/tree_model.h"
namespace xgboost { namespace xgboost::tree {
namespace tree {
class TreeEvaluator { class TreeEvaluator {
// hist and exact use parent id to calculate constraints. // hist and exact use parent id to calculate constraints.
static constexpr bst_node_t kRootParentId = static constexpr bst_node_t kRootParentId =
@ -33,13 +32,13 @@ class TreeEvaluator {
HostDeviceVector<float> lower_bounds_; HostDeviceVector<float> lower_bounds_;
HostDeviceVector<float> upper_bounds_; HostDeviceVector<float> upper_bounds_;
HostDeviceVector<int32_t> monotone_; HostDeviceVector<int32_t> monotone_;
int32_t device_; DeviceOrd device_;
bool has_constraint_; bool has_constraint_;
public: public:
TreeEvaluator(TrainParam const& p, bst_feature_t n_features, int32_t device) { TreeEvaluator(TrainParam const& p, bst_feature_t n_features, DeviceOrd device) {
device_ = device; device_ = device;
if (device != Context::kCpuId) { if (device.IsCUDA()) {
lower_bounds_.SetDevice(device); lower_bounds_.SetDevice(device);
upper_bounds_.SetDevice(device); upper_bounds_.SetDevice(device);
monotone_.SetDevice(device); monotone_.SetDevice(device);
@ -59,7 +58,7 @@ class TreeEvaluator {
has_constraint_ = true; has_constraint_ = true;
} }
if (device_ != Context::kCpuId) { if (device_.IsCUDA()) {
// Pull to device early. // Pull to device early.
lower_bounds_.ConstDeviceSpan(); lower_bounds_.ConstDeviceSpan();
upper_bounds_.ConstDeviceSpan(); upper_bounds_.ConstDeviceSpan();
@ -122,7 +121,7 @@ class TreeEvaluator {
} }
// Fast floating point division instruction on device // Fast floating point division instruction on device
XGBOOST_DEVICE float Divide(float a, float b) const { [[nodiscard]] XGBOOST_DEVICE float Divide(float a, float b) const {
#ifdef __CUDA_ARCH__ #ifdef __CUDA_ARCH__
return __fdividef(a, b); return __fdividef(a, b);
#else #else
@ -154,7 +153,7 @@ class TreeEvaluator {
public: public:
/* Get a view to the evaluator that can be passed down to device. */ /* Get a view to the evaluator that can be passed down to device. */
template <typename ParamT = TrainParam> auto GetEvaluator() const { template <typename ParamT = TrainParam> auto GetEvaluator() const {
if (device_ != Context::kCpuId) { if (device_.IsCUDA()) {
auto constraints = monotone_.ConstDevicePointer(); auto constraints = monotone_.ConstDevicePointer();
return SplitEvaluator<ParamT>{constraints, lower_bounds_.ConstDevicePointer(), return SplitEvaluator<ParamT>{constraints, lower_bounds_.ConstDevicePointer(),
upper_bounds_.ConstDevicePointer(), has_constraint_}; upper_bounds_.ConstDevicePointer(), has_constraint_};
@ -215,7 +214,6 @@ enum SplitType {
// partition-based categorical split // partition-based categorical split
kPart = 2 kPart = 2
}; };
} // namespace tree } // namespace xgboost::tree
} // namespace xgboost
#endif // XGBOOST_TREE_SPLIT_EVALUATOR_H_ #endif // XGBOOST_TREE_SPLIT_EVALUATOR_H_
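
The Divide helper shown above is a host/device function: when nvcc compiles the device pass, __CUDA_ARCH__ is defined and the fast (lower-precision) __fdividef intrinsic is used, while a host compiler falls through to ordinary division. A simplified stand-alone sketch of that dispatch; MY_HOST_DEVICE is a stand-in macro for this example, not the project's own qualifier:

    #if defined(__CUDACC__)
    #define MY_HOST_DEVICE __host__ __device__
    #else
    #define MY_HOST_DEVICE
    #endif

    // The same source compiles for host and device; only the device pass takes the intrinsic.
    MY_HOST_DEVICE inline float FastDivide(float a, float b) {
    #if defined(__CUDA_ARCH__)
      return __fdividef(a, b);  // fast device intrinsic, reduced precision
    #else
      return a / b;
    #endif
    }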

View File

@ -154,7 +154,7 @@ class ColMaker: public TreeUpdater {
: param_(param), : param_(param),
colmaker_train_param_{colmaker_train_param}, colmaker_train_param_{colmaker_train_param},
ctx_{ctx}, ctx_{ctx},
tree_evaluator_(param_, column_densities.size(), Context::kCpuId), tree_evaluator_(param_, column_densities.size(), DeviceOrd::CPU()),
interaction_constraints_{std::move(_interaction_constraints)}, interaction_constraints_{std::move(_interaction_constraints)},
column_densities_(column_densities) {} column_densities_(column_densities) {}
// update one tree, growing // update one tree, growing

View File

@ -74,7 +74,7 @@ class DeviceHistogramStorage {
dh::device_vector<typename GradientSumT::ValueT> overflow_; dh::device_vector<typename GradientSumT::ValueT> overflow_;
std::map<int, size_t> overflow_nidx_map_; std::map<int, size_t> overflow_nidx_map_;
int n_bins_; int n_bins_;
int device_id_; DeviceOrd device_id_;
static constexpr size_t kNumItemsInGradientSum = static constexpr size_t kNumItemsInGradientSum =
sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT); sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2."); static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
@ -82,7 +82,7 @@ class DeviceHistogramStorage {
public: public:
// Start with about 16mb // Start with about 16mb
DeviceHistogramStorage() { data_.reserve(1 << 22); } DeviceHistogramStorage() { data_.reserve(1 << 22); }
void Init(int device_id, int n_bins) { void Init(DeviceOrd device_id, int n_bins) {
this->n_bins_ = n_bins; this->n_bins_ = n_bins;
this->device_id_ = device_id; this->device_id_ = device_id;
} }
@ -196,7 +196,7 @@ struct GPUHistMakerDevice {
common::Span<FeatureType const> _feature_types, bst_row_t _n_rows, common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
TrainParam _param, std::shared_ptr<common::ColumnSampler> column_sampler, TrainParam _param, std::shared_ptr<common::ColumnSampler> column_sampler,
uint32_t n_features, BatchParam batch_param, MetaInfo const& info) uint32_t n_features, BatchParam batch_param, MetaInfo const& info)
: evaluator_{_param, n_features, ctx->gpu_id}, : evaluator_{_param, n_features, ctx->Device()},
ctx_(ctx), ctx_(ctx),
feature_types{_feature_types}, feature_types{_feature_types},
param(std::move(_param)), param(std::move(_param)),
@ -211,7 +211,7 @@ struct GPUHistMakerDevice {
} }
CHECK(column_sampler_); CHECK(column_sampler_);
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id)); monitor.Init(std::string("GPUHistMakerDevice") + ctx_->Device().Name());
} }
~GPUHistMakerDevice() = default; ~GPUHistMakerDevice() = default;
@ -220,7 +220,7 @@ struct GPUHistMakerDevice {
if (!feature_groups) { if (!feature_groups) {
CHECK(page); CHECK(page);
feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense, feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense,
dh::MaxSharedMemoryOptin(ctx_->gpu_id), dh::MaxSharedMemoryOptin(ctx_->Ordinal()),
sizeof(GradientPairPrecise)); sizeof(GradientPairPrecise));
} }
} }
@ -231,7 +231,7 @@ struct GPUHistMakerDevice {
this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(), this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel, param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree); param.colsample_bytree);
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
this->interaction_constraints.Reset(); this->interaction_constraints.Reset();
@ -246,15 +246,15 @@ struct GPUHistMakerDevice {
gpair = sample.gpair; gpair = sample.gpair;
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
dmat->Info().IsColumnSplit(), ctx_->gpu_id); dmat->Info().IsColumnSplit(), ctx_->Device());
quantiser = std::make_unique<GradientQuantiser>(this->gpair, dmat->Info()); quantiser = std::make_unique<GradientQuantiser>(this->gpair, dmat->Info());
row_partitioner.reset(); // Release the device memory first before reallocating row_partitioner.reset(); // Release the device memory first before reallocating
row_partitioner = std::make_unique<RowPartitioner>(ctx_->gpu_id, sample.sample_rows); row_partitioner = std::make_unique<RowPartitioner>(ctx_->Device(), sample.sample_rows);
// Init histogram // Init histogram
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins()); hist.Init(ctx_->Device(), page->Cuts().TotalBins());
hist.Reset(); hist.Reset();
this->InitFeatureGroupsOnce(); this->InitFeatureGroupsOnce();
@ -267,7 +267,7 @@ struct GPUHistMakerDevice {
sampled_features->SetDevice(ctx_->Device()); sampled_features->SetDevice(ctx_->Device());
common::Span<bst_feature_t> feature_set = common::Span<bst_feature_t> feature_set =
interaction_constraints.Query(sampled_features->DeviceSpan(), nidx); interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto matrix = page->GetDeviceAccessor(ctx_->Device());
EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)}; EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)};
EvaluateSplitSharedInputs shared_inputs{ EvaluateSplitSharedInputs shared_inputs{
gpu_param, gpu_param,
@ -289,7 +289,7 @@ struct GPUHistMakerDevice {
dh::TemporaryArray<DeviceSplitCandidate> splits_out(2 * candidates.size()); dh::TemporaryArray<DeviceSplitCandidate> splits_out(2 * candidates.size());
std::vector<bst_node_t> nidx(2 * candidates.size()); std::vector<bst_node_t> nidx(2 * candidates.size());
auto h_node_inputs = pinned2.GetSpan<EvaluateSplitInputs>(2 * candidates.size()); auto h_node_inputs = pinned2.GetSpan<EvaluateSplitInputs>(2 * candidates.size());
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto matrix = page->GetDeviceAccessor(ctx_->Device());
EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param}, *quantiser, feature_types, EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param}, *quantiser, feature_types,
matrix.feature_segments, matrix.gidx_fvalue_map, matrix.feature_segments, matrix.gidx_fvalue_map,
matrix.min_fvalue, matrix.min_fvalue,
@ -342,9 +342,9 @@ struct GPUHistMakerDevice {
void BuildHist(int nidx) { void BuildHist(int nidx) {
auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_node_hist = hist.GetNodeHistogram(nidx);
auto d_ridx = row_partitioner->GetRows(nidx); auto d_ridx = row_partitioner->GetRows(nidx);
BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->gpu_id), BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()),
feature_groups->DeviceAccessor(ctx_->gpu_id), gpair, d_ridx, d_node_hist, feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx,
*quantiser); d_node_hist, *quantiser);
} }
// Attempt to do subtraction trick // Attempt to do subtraction trick
@ -413,10 +413,10 @@ struct GPUHistMakerDevice {
}); });
collective::AllReduce<collective::Operation::kBitwiseOR>( collective::AllReduce<collective::Operation::kBitwiseOR>(
ctx_->gpu_id, decision_storage.data().get(), decision_storage.size()); ctx_->Ordinal(), decision_storage.data().get(), decision_storage.size());
collective::AllReduce<collective::Operation::kBitwiseAND>( collective::AllReduce<collective::Operation::kBitwiseAND>(
ctx_->gpu_id, missing_storage.data().get(), missing_storage.size()); ctx_->Ordinal(), missing_storage.data().get(), missing_storage.size());
collective::Synchronize(ctx_->gpu_id); collective::Synchronize(ctx_->Ordinal());
row_partitioner->UpdatePositionBatch( row_partitioner->UpdatePositionBatch(
nidx, left_nidx, right_nidx, split_data, nidx, left_nidx, right_nidx, split_data,
@ -454,7 +454,7 @@ struct GPUHistMakerDevice {
CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat); CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
} }
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto d_matrix = page->GetDeviceAccessor(ctx_->Device());
if (info_.IsColumnSplit()) { if (info_.IsColumnSplit()) {
UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx); UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx);
@ -524,9 +524,9 @@ struct GPUHistMakerDevice {
common::Span<FeatureType const> d_feature_types, common::Span<uint32_t const> categories, common::Span<FeatureType const> d_feature_types, common::Span<uint32_t const> categories,
common::Span<RegTree::CategoricalSplitMatrix::Segment> categories_segments, common::Span<RegTree::CategoricalSplitMatrix::Segment> categories_segments,
HostDeviceVector<bst_node_t>* p_out_position) { HostDeviceVector<bst_node_t>* p_out_position) {
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto d_matrix = page->GetDeviceAccessor(ctx_->Device());
auto d_gpair = this->gpair; auto d_gpair = this->gpair;
p_out_position->SetDevice(ctx_->gpu_id); p_out_position->SetDevice(ctx_->Device());
p_out_position->Resize(row_partitioner->GetRows().size()); p_out_position->Resize(row_partitioner->GetRows().size());
auto new_position_op = [=] __device__(size_t row_id, int position) { auto new_position_op = [=] __device__(size_t row_id, int position) {
@ -613,7 +613,7 @@ struct GPUHistMakerDevice {
monitor.Start("AllReduce"); monitor.Start("AllReduce");
auto d_node_hist = hist.GetNodeHistogram(nidx).data(); auto d_node_hist = hist.GetNodeHistogram(nidx).data();
using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT; using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT;
collective::GlobalSum(info_, ctx_->gpu_id, reinterpret_cast<ReduceT*>(d_node_hist), collective::GlobalSum(info_, ctx_->Device(), reinterpret_cast<ReduceT*>(d_node_hist),
page->Cuts().TotalBins() * 2 * num_histograms); page->Cuts().TotalBins() * 2 * num_histograms);
monitor.Stop("AllReduce"); monitor.Stop("AllReduce");
@ -855,7 +855,7 @@ class GPUHistMaker : public TreeUpdater {
} }
void InitDataOnce(TrainParam const* param, DMatrix* dmat) { void InitDataOnce(TrainParam const* param, DMatrix* dmat) {
CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device"; CHECK_GE(ctx_->Ordinal(), 0) << "Must have at least one device";
info_ = &dmat->Info(); info_ = &dmat->Info();
// Synchronise the column sampling seed // Synchronise the column sampling seed
@ -864,8 +864,8 @@ class GPUHistMaker : public TreeUpdater {
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed); this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()}; auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
info_->feature_types.SetDevice(ctx_->gpu_id); info_->feature_types.SetDevice(ctx_->Device());
maker = std::make_unique<GPUHistMakerDevice>( maker = std::make_unique<GPUHistMakerDevice>(
ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_, ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
*param, column_sampler_, info_->num_col_, batch_param, dmat->Info()); *param, column_sampler_, info_->num_col_, batch_param, dmat->Info());
@ -890,7 +890,7 @@ class GPUHistMaker : public TreeUpdater {
this->InitData(param, p_fmat, p_tree); this->InitData(param, p_fmat, p_tree);
monitor_.Stop("InitData"); monitor_.Stop("InitData");
gpair->SetDevice(ctx_->gpu_id); gpair->SetDevice(ctx_->Device());
maker->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position); maker->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
} }
@ -1023,7 +1023,7 @@ class GPUGlobalApproxMaker : public TreeUpdater {
this->InitData(p_fmat, p_tree); this->InitData(p_fmat, p_tree);
monitor_.Stop("InitData"); monitor_.Stop("InitData");
gpair->SetDevice(ctx_->gpu_id); gpair->SetDevice(ctx_->Device());
maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position); maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
} }

View File

@ -518,7 +518,7 @@ class QuantileHistMaker : public TreeUpdater {
auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; }; auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; };
if (need_copy()) { if (need_copy()) {
// allocate buffer // allocate buffer
sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->gpu_id, linalg::Order::kF}; sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->Device(), linalg::Order::kF};
h_sample_out = sample_out.HostView(); h_sample_out = sample_out.HostView();
} }

View File

@ -34,7 +34,7 @@ void VerifyAllReduceBitwiseAND() {
auto const rank = collective::GetRank(); auto const rank = collective::GetRank();
std::bitset<64> original{}; std::bitset<64> original{};
original[rank] = true; original[rank] = true;
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank); HostDeviceVector<uint64_t> buffer({original.to_ullong()}, DeviceOrd::CUDA(rank));
collective::AllReduce<collective::Operation::kBitwiseAND>(rank, buffer.DevicePointer(), 1); collective::AllReduce<collective::Operation::kBitwiseAND>(rank, buffer.DevicePointer(), 1);
collective::Synchronize(rank); collective::Synchronize(rank);
EXPECT_EQ(buffer.HostVector()[0], 0ULL); EXPECT_EQ(buffer.HostVector()[0], 0ULL);
@ -56,7 +56,7 @@ void VerifyAllReduceBitwiseOR() {
auto const rank = collective::GetRank(); auto const rank = collective::GetRank();
std::bitset<64> original{}; std::bitset<64> original{};
original[rank] = true; original[rank] = true;
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank); HostDeviceVector<uint64_t> buffer({original.to_ullong()}, DeviceOrd::CUDA(rank));
collective::AllReduce<collective::Operation::kBitwiseOR>(rank, buffer.DevicePointer(), 1); collective::AllReduce<collective::Operation::kBitwiseOR>(rank, buffer.DevicePointer(), 1);
collective::Synchronize(rank); collective::Synchronize(rank);
EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1); EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
@ -78,7 +78,7 @@ void VerifyAllReduceBitwiseXOR() {
auto const rank = collective::GetRank(); auto const rank = collective::GetRank();
std::bitset<64> original{~0ULL}; std::bitset<64> original{~0ULL};
original[rank] = false; original[rank] = false;
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank); HostDeviceVector<uint64_t> buffer({original.to_ullong()}, DeviceOrd::CUDA(rank));
collective::AllReduce<collective::Operation::kBitwiseXOR>(rank, buffer.DevicePointer(), 1); collective::AllReduce<collective::Operation::kBitwiseXOR>(rank, buffer.DevicePointer(), 1);
collective::Synchronize(rank); collective::Synchronize(rank);
EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1); EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
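
The expected values in these bitwise all-reduce tests follow directly from the operator semantics: each rank contributes a 64-bit word with only its own bit set, so OR across world_size ranks sets exactly the low world_size bits, i.e. (1ULL << world_size) - 1, while AND across them clears everything. A tiny host-side simulation of just that arithmetic, with no collective library involved:

    #include <cassert>
    #include <cstdint>

    int main() {
      const int world_size = 4;
      std::uint64_t or_acc = 0;
      std::uint64_t and_acc = ~0ULL;
      for (int rank = 0; rank < world_size; ++rank) {
        std::uint64_t contribution = 1ULL << rank;  // only this rank's bit is set
        or_acc |= contribution;
        and_acc &= contribution;
      }
      assert(or_acc == (1ULL << world_size) - 1);  // the low world_size bits are all set
      assert(and_acc == 0ULL);                     // no bit is set on every rank
      return 0;
    }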

View File

@ -147,7 +147,7 @@ TEST(CutsBuilder, SearchGroupInd) {
EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17)); EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17));
p_mat->Info().Validate(-1); p_mat->Info().Validate(DeviceOrd::CPU());
EXPECT_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17), EXPECT_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17),
dmlc::Error); dmlc::Error);
@ -330,7 +330,7 @@ TEST(HistUtil, IndexBinData) {
void TestSketchFromWeights(bool with_group) { void TestSketchFromWeights(bool with_group) {
size_t constexpr kRows = 300, kCols = 20, kBins = 256; size_t constexpr kRows = 300, kCols = 20, kBins = 256;
size_t constexpr kGroups = 10; size_t constexpr kGroups = 10;
auto m = RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateDMatrix(); auto m = RandomDataGenerator{kRows, kCols, 0}.Device(DeviceOrd::CUDA(0)).GenerateDMatrix();
Context ctx; Context ctx;
common::HistogramCuts cuts = SketchOnDMatrix(&ctx, m.get(), kBins); common::HistogramCuts cuts = SketchOnDMatrix(&ctx, m.get(), kBins);

View File

@ -208,7 +208,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
ASSERT_EQ(info.feature_types.Size(), n_features); ASSERT_EQ(info.feature_types.Size(), n_features);
HostDeviceVector<bst_row_t> cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3}; HostDeviceVector<bst_row_t> cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3};
cuts_ptr.SetDevice(0); cuts_ptr.SetDevice(DeviceOrd::CUDA(0));
dh::device_vector<float> weight(n_samples * n_features, 0); dh::device_vector<float> weight(n_samples * n_features, 0);
dh::Iota(dh::ToSpan(weight)); dh::Iota(dh::ToSpan(weight));
@ -221,7 +221,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(), thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(),
detail::EntryCompareOp()); detail::EntryCompareOp());
detail::RemoveDuplicatedCategories(ctx.gpu_id, info, cuts_ptr.DeviceSpan(), &sorted_entries, detail::RemoveDuplicatedCategories(ctx.Device(), info, cuts_ptr.DeviceSpan(), &sorted_entries,
&weight, &columns_ptr); &weight, &columns_ptr);
auto const& h_cptr = cuts_ptr.ConstHostVector(); auto const& h_cptr = cuts_ptr.ConstHostVector();
@ -363,7 +363,8 @@ template <typename Adapter>
auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) { auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) {
common::HistogramCuts batched_cuts; common::HistogramCuts batched_cuts;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), 0); SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(),
DeviceOrd::CUDA(0));
MetaInfo info; MetaInfo info;
AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size); AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size);
sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit()); sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit());
@ -430,7 +431,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) {
ConsoleLogger::Configure({{"verbosity", "3"}}); ConsoleLogger::Configure({{"verbosity", "3"}});
common::HistogramCuts batched_cuts; common::HistogramCuts batched_cuts;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0); SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0));
AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits<float>::quiet_NaN(), AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits<float>::quiet_NaN(),
&sketch_container); &sketch_container);
HistogramCuts cuts; HistogramCuts cuts;
@ -458,7 +459,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
ConsoleLogger::Configure({{"verbosity", "3"}}); ConsoleLogger::Configure({{"verbosity", "3"}});
common::HistogramCuts batched_cuts; common::HistogramCuts batched_cuts;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0); SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0));
AdapterDeviceSketch(adapter.Value(), num_bins, info, AdapterDeviceSketch(adapter.Value(), num_bins, info,
std::numeric_limits<float>::quiet_NaN(), std::numeric_limits<float>::quiet_NaN(),
&sketch_container); &sketch_container);
@ -493,7 +494,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
} }
ASSERT_EQ(info.feature_types.Size(), 1); ASSERT_EQ(info.feature_types.Size(), 1);
SketchContainer container(info.feature_types, num_bins, 1, n, 0); SketchContainer container(info.feature_types, num_bins, 1, n, DeviceOrd::CUDA(0));
AdapterDeviceSketch(adapter.Value(), num_bins, info, AdapterDeviceSketch(adapter.Value(), num_bins, info,
std::numeric_limits<float>::quiet_NaN(), &container); std::numeric_limits<float>::quiet_NaN(), &container);
HistogramCuts cuts; HistogramCuts cuts;
@ -566,7 +567,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {
namespace { namespace {
auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) { auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
auto n = n_samples * n_features; auto n = n_samples * n_features;
std::vector<float> x; std::vector<float> x;
x.resize(n); x.resize(n);
@ -606,21 +607,21 @@ void TestGetColumnSize(std::size_t n_samples) {
std::vector<std::size_t> h_column_size_1(column_sizes_scan.size()); std::vector<std::size_t> h_column_size_1(column_sizes_scan.size());
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, true>( detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, true>(
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size.begin()); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size.begin());
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, false>( detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, false>(
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
ASSERT_EQ(h_column_size, h_column_size_1); ASSERT_EQ(h_column_size, h_column_size_1);
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, true>( detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, true>(
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
ASSERT_EQ(h_column_size, h_column_size_1); ASSERT_EQ(h_column_size, h_column_size_1);
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, false>( detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, false>(
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
ASSERT_EQ(h_column_size, h_column_size_1); ASSERT_EQ(h_column_size, h_column_size_1);
} }
@ -697,9 +698,9 @@ void TestAdapterSketchFromWeights(bool with_group) {
size_t constexpr kRows = 300, kCols = 20, kBins = 256; size_t constexpr kRows = 300, kCols = 20, kBins = 256;
size_t constexpr kGroups = 10; size_t constexpr kGroups = 10;
HostDeviceVector<float> storage; HostDeviceVector<float> storage;
std::string m = std::string m = RandomDataGenerator{kRows, kCols, 0}
RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface( .Device(DeviceOrd::CUDA(0))
&storage); .GenerateArrayInterface(&storage);
MetaInfo info; MetaInfo info;
Context ctx; Context ctx;
auto& h_weights = info.weights_.HostVector(); auto& h_weights = info.weights_.HostVector();
@ -718,14 +719,14 @@ void TestAdapterSketchFromWeights(bool with_group) {
info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups); info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups);
} }
info.weights_.SetDevice(0); info.weights_.SetDevice(DeviceOrd::CUDA(0));
info.num_row_ = kRows; info.num_row_ = kRows;
info.num_col_ = kCols; info.num_col_ = kCols;
data::CupyAdapter adapter(m); data::CupyAdapter adapter(m);
auto const& batch = adapter.Value(); auto const& batch = adapter.Value();
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_container(ft, kBins, kCols, kRows, 0); SketchContainer sketch_container(ft, kBins, kCols, kRows, DeviceOrd::CUDA(0));
AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(), AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
&sketch_container); &sketch_container);
@ -769,7 +770,7 @@ void TestAdapterSketchFromWeights(bool with_group) {
// https://github.com/dmlc/xgboost/issues/7946 // https://github.com/dmlc/xgboost/issues/7946
h_weights[i] = (i % 2 == 0 ? 1 : 2) / static_cast<float>(kGroups); h_weights[i] = (i % 2 == 0 ? 1 : 2) / static_cast<float>(kGroups);
} }
SketchContainer sketch_container(ft, kBins, kCols, kRows, 0); SketchContainer sketch_container{ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)};
AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(), AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
&sketch_container); &sketch_container);
sketch_container.MakeCuts(&weighted, info.IsColumnSplit()); sketch_container.MakeCuts(&weighted, info.IsColumnSplit());

View File

@ -1,7 +1,6 @@
/*! /**
* Copyright 2018 XGBoost contributors * Copyright 2018-2023 XGBoost contributors
*/ */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <thrust/equal.h> #include <thrust/equal.h>
#include <thrust/iterator/counting_iterator.h> #include <thrust/iterator/counting_iterator.h>
@ -9,14 +8,13 @@
#include "../../../src/common/device_helpers.cuh" #include "../../../src/common/device_helpers.cuh"
#include <xgboost/host_device_vector.h> #include <xgboost/host_device_vector.h>
namespace xgboost { namespace xgboost::common {
namespace common {
namespace { namespace {
void SetDeviceForTest(int device) { void SetDeviceForTest(DeviceOrd device) {
int n_devices; int n_devices;
dh::safe_cuda(cudaGetDeviceCount(&n_devices)); dh::safe_cuda(cudaGetDeviceCount(&n_devices));
device %= n_devices; device.ordinal %= n_devices;
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device.ordinal));
} }
} // namespace } // namespace
@ -31,13 +29,13 @@ struct HostDeviceVectorSetDeviceHandler {
} }
}; };
void InitHostDeviceVector(size_t n, int device, HostDeviceVector<int> *v) { void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector<int> *v) {
// create the vector // create the vector
v->SetDevice(device); v->SetDevice(device);
v->Resize(n); v->Resize(n);
ASSERT_EQ(v->Size(), n); ASSERT_EQ(v->Size(), n);
ASSERT_EQ(v->DeviceIdx(), device); ASSERT_EQ(v->Device(), device);
// ensure that the device have read-write access // ensure that the device have read-write access
ASSERT_TRUE(v->DeviceCanRead()); ASSERT_TRUE(v->DeviceCanRead());
ASSERT_TRUE(v->DeviceCanWrite()); ASSERT_TRUE(v->DeviceCanWrite());
@ -57,7 +55,7 @@ void InitHostDeviceVector(size_t n, int device, HostDeviceVector<int> *v) {
} }
void PlusOne(HostDeviceVector<int> *v) { void PlusOne(HostDeviceVector<int> *v) {
int device = v->DeviceIdx(); auto device = v->Device();
SetDeviceForTest(device); SetDeviceForTest(device);
thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v), thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v),
[=]__device__(unsigned int a){ return a + 1; }); [=]__device__(unsigned int a){ return a + 1; });
@ -69,7 +67,7 @@ void CheckDevice(HostDeviceVector<int>* v,
unsigned int first, unsigned int first,
GPUAccess access) { GPUAccess access) {
ASSERT_EQ(v->Size(), size); ASSERT_EQ(v->Size(), size);
SetDeviceForTest(v->DeviceIdx()); SetDeviceForTest(v->Device());
ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v), ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v),
thrust::make_counting_iterator(first))); thrust::make_counting_iterator(first)));
@ -100,7 +98,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
ASSERT_FALSE(v->DeviceCanWrite()); ASSERT_FALSE(v->DeviceCanWrite());
} }
void TestHostDeviceVector(size_t n, int device) { void TestHostDeviceVector(size_t n, DeviceOrd device) {
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVector<int> v; HostDeviceVector<int> v;
InitHostDeviceVector(n, device, &v); InitHostDeviceVector(n, device, &v);
@ -113,13 +111,13 @@ void TestHostDeviceVector(size_t n, int device) {
TEST(HostDeviceVector, Basic) { TEST(HostDeviceVector, Basic) {
size_t n = 1001; size_t n = 1001;
int device = 0; DeviceOrd device = DeviceOrd::CUDA(0);
TestHostDeviceVector(n, device); TestHostDeviceVector(n, device);
} }
TEST(HostDeviceVector, Copy) { TEST(HostDeviceVector, Copy) {
size_t n = 1001; size_t n = 1001;
int device = 0; auto device = DeviceOrd::CUDA(0);
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVector<int> v; HostDeviceVector<int> v;
@ -143,15 +141,15 @@ TEST(HostDeviceVector, SetDevice) {
h_vec[i] = i; h_vec[i] = i;
} }
HostDeviceVector<int> vec (h_vec); HostDeviceVector<int> vec (h_vec);
auto device = 0; auto device = DeviceOrd::CUDA(0);
vec.SetDevice(device); vec.SetDevice(device);
ASSERT_EQ(vec.Size(), h_vec.size()); ASSERT_EQ(vec.Size(), h_vec.size());
auto span = vec.DeviceSpan(); // sync to device auto span = vec.DeviceSpan(); // sync to device
vec.SetDevice(-1); // pull back to cpu. vec.SetDevice(DeviceOrd::CPU()); // pull back to cpu.
ASSERT_EQ(vec.Size(), h_vec.size()); ASSERT_EQ(vec.Size(), h_vec.size());
ASSERT_EQ(vec.DeviceIdx(), -1); ASSERT_EQ(vec.Device(), DeviceOrd::CPU());
auto h_vec_1 = vec.HostVector(); auto h_vec_1 = vec.HostVector();
ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin())); ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
@ -159,7 +157,7 @@ TEST(HostDeviceVector, SetDevice) {
TEST(HostDeviceVector, Span) { TEST(HostDeviceVector, Span) {
HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f}; HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
vec.SetDevice(0); vec.SetDevice(DeviceOrd::CUDA(0));
auto span = vec.DeviceSpan(); auto span = vec.DeviceSpan();
ASSERT_EQ(vec.Size(), span.size()); ASSERT_EQ(vec.Size(), span.size());
ASSERT_EQ(vec.DevicePointer(), span.data()); ASSERT_EQ(vec.DevicePointer(), span.data());
@ -183,5 +181,4 @@ TEST(HostDeviceVector, Empty) {
ASSERT_FALSE(another.Empty()); ASSERT_FALSE(another.Empty());
ASSERT_TRUE(vec.Empty()); ASSERT_TRUE(vec.Empty());
} }
} // namespace common } // namespace xgboost::common
} // namespace xgboost
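As the hunks above show, HostDeviceVector now reports placement through Device(), which returns a DeviceOrd, replacing the integer DeviceIdx(). A minimal sketch of the new accessor usage, restricted to calls that appear in this diff:

#include <xgboost/context.h>
#include <xgboost/host_device_vector.h>

void DeviceRoundTrip() {
  xgboost::HostDeviceVector<int> v;
  v.SetDevice(xgboost::DeviceOrd::CUDA(0));  // place the data on the first CUDA device
  v.Resize(16);
  bool on_gpu = v.Device().IsCUDA();         // Device() replaces the old DeviceIdx()
  v.SetDevice(xgboost::DeviceOrd::CPU());    // pull the vector back to the host
  (void)on_gpu;
}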

View File

@ -12,7 +12,7 @@ namespace xgboost::linalg {
namespace { namespace {
void TestElementWiseKernel() { void TestElementWiseKernel() {
auto device = DeviceOrd::CUDA(0); auto device = DeviceOrd::CUDA(0);
Tensor<float, 3> l{{2, 3, 4}, 0}; Tensor<float, 3> l{{2, 3, 4}, device};
{ {
/** /**
* Non-contiguous * Non-contiguous

View File

@ -9,9 +9,7 @@
#include "../../../src/data/adapter.h" #include "../../../src/data/adapter.h"
#include "xgboost/context.h" #include "xgboost/context.h"
namespace xgboost { namespace xgboost::common {
namespace common {
TEST(Quantile, LoadBalance) { TEST(Quantile, LoadBalance) {
size_t constexpr kRows = 1000, kCols = 100; size_t constexpr kRows = 1000, kCols = 100;
auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
@ -314,7 +312,7 @@ void TestSameOnAllWorkers() {
} }
auto m = RandomDataGenerator{kRows, kCols, 0} auto m = RandomDataGenerator{kRows, kCols, 0}
.Device(Context::kCpuId) .Device(DeviceOrd::CPU())
.Type(ft) .Type(ft)
.MaxCategory(17) .MaxCategory(17)
.Seed(rank + seed) .Seed(rank + seed)
@ -373,6 +371,4 @@ TEST(Quantile, SameOnAllWorkers) {
auto constexpr kWorkers = 4; auto constexpr kWorkers = 4;
RunWithInMemoryCommunicator(kWorkers, TestSameOnAllWorkers); RunWithInMemoryCommunicator(kWorkers, TestSameOnAllWorkers);
} }
} // namespace xgboost::common
} // namespace common
} // namespace xgboost

View File

@ -25,7 +25,7 @@ class MGPUQuantileTest : public BaseMGPUTest {};
TEST(GPUQuantile, Basic) { TEST(GPUQuantile, Basic) {
constexpr size_t kRows = 1000, kCols = 100, kBins = 256; constexpr size_t kRows = 1000, kCols = 100, kBins = 256;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, kBins, kCols, kRows, 0); SketchContainer sketch(ft, kBins, kCols, kRows, FstCU());
dh::caching_device_vector<Entry> entries; dh::caching_device_vector<Entry> entries;
dh::device_vector<bst_row_t> cuts_ptr(kCols+1); dh::device_vector<bst_row_t> cuts_ptr(kCols+1);
thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0); thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0);
@ -38,12 +38,12 @@ void TestSketchUnique(float sparsity) {
constexpr size_t kRows = 1000, kCols = 100; constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](int32_t seed, size_t n_bins, MetaInfo const& info) { RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](int32_t seed, size_t n_bins, MetaInfo const& info) {
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, 0); SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
HostDeviceVector<float> storage; HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, sparsity} std::string interface_str = RandomDataGenerator{kRows, kCols, sparsity}
.Seed(seed) .Seed(seed)
.Device(0) .Device(FstCU())
.GenerateArrayInterface(&storage); .GenerateArrayInterface(&storage);
data::CupyAdapter adapter(interface_str); data::CupyAdapter adapter(interface_str);
AdapterDeviceSketch(adapter.Value(), n_bins, info, AdapterDeviceSketch(adapter.Value(), n_bins, info,
@ -58,7 +58,7 @@ void TestSketchUnique(float sparsity) {
thrust::make_counting_iterator(0llu), thrust::make_counting_iterator(0llu),
[=] __device__(size_t idx) { return batch.GetElement(idx); }); [=] __device__(size_t idx) { return batch.GetElement(idx); });
auto end = kCols * kRows; auto end = kCols * kRows;
detail::GetColumnSizesScan(0, kCols, n_cuts, IterSpan{batch_iter, end}, is_valid, detail::GetColumnSizesScan(FstCU(), kCols, n_cuts, IterSpan{batch_iter, end}, is_valid,
&cut_sizes_scan, &column_sizes_scan); &cut_sizes_scan, &column_sizes_scan);
auto const& cut_sizes = cut_sizes_scan.HostVector(); auto const& cut_sizes = cut_sizes_scan.HostVector();
ASSERT_LE(sketch.Data().size(), cut_sizes.back()); ASSERT_LE(sketch.Data().size(), cut_sizes.back());
@ -86,9 +86,9 @@ TEST(GPUQuantile, Unique) {
} }
// if with_error is true, the test tolerates floating point error // if with_error is true, the test tolerates floating point error
void TestQuantileElemRank(int32_t device, Span<SketchEntry const> in, void TestQuantileElemRank(DeviceOrd device, Span<SketchEntry const> in,
Span<bst_row_t const> d_columns_ptr, bool with_error = false) { Span<bst_row_t const> d_columns_ptr, bool with_error = false) {
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device.ordinal));
std::vector<SketchEntry> h_in(in.size()); std::vector<SketchEntry> h_in(in.size());
dh::CopyDeviceSpanToVector(&h_in, in); dh::CopyDeviceSpanToVector(&h_in, in);
std::vector<bst_row_t> h_columns_ptr(d_columns_ptr.size()); std::vector<bst_row_t> h_columns_ptr(d_columns_ptr.size());
@ -123,13 +123,12 @@ TEST(GPUQuantile, Prune) {
constexpr size_t kRows = 1000, kCols = 100; constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, 0); SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
HostDeviceVector<float> storage; HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0} std::string interface_str =
.Device(0) RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
.Seed(seed) &storage);
.GenerateArrayInterface(&storage);
data::CupyAdapter adapter(interface_str); data::CupyAdapter adapter(interface_str);
AdapterDeviceSketch(adapter.Value(), n_bins, info, AdapterDeviceSketch(adapter.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(), &sketch); std::numeric_limits<float>::quiet_NaN(), &sketch);
@ -145,7 +144,7 @@ TEST(GPUQuantile, Prune) {
ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(), ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(),
sketch.Data().data() + sketch.Data().size(), sketch.Data().data() + sketch.Data().size(),
detail::SketchUnique{})); detail::SketchUnique{}));
TestQuantileElemRank(0, sketch.Data(), sketch.ColumnsPtr()); TestQuantileElemRank(FstCU(), sketch.Data(), sketch.ColumnsPtr());
}); });
} }
@ -153,10 +152,10 @@ TEST(GPUQuantile, MergeEmpty) {
constexpr size_t kRows = 1000, kCols = 100; constexpr size_t kRows = 1000, kCols = 100;
size_t n_bins = 10; size_t n_bins = 10;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0); SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU());
HostDeviceVector<float> storage_0; HostDeviceVector<float> storage_0;
std::string interface_str_0 = std::string interface_str_0 =
RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface( RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).GenerateArrayInterface(
&storage_0); &storage_0);
data::CupyAdapter adapter_0(interface_str_0); data::CupyAdapter adapter_0(interface_str_0);
MetaInfo info; MetaInfo info;
@ -193,34 +192,33 @@ TEST(GPUQuantile, MergeBasic) {
constexpr size_t kRows = 1000, kCols = 100; constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) { RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) {
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0); SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU());
HostDeviceVector<float> storage_0; HostDeviceVector<float> storage_0;
std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0} std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0}
.Device(0) .Device(FstCU())
.Seed(seed) .Seed(seed)
.GenerateArrayInterface(&storage_0); .GenerateArrayInterface(&storage_0);
data::CupyAdapter adapter_0(interface_str_0); data::CupyAdapter adapter_0(interface_str_0);
AdapterDeviceSketch(adapter_0.Value(), n_bins, info, AdapterDeviceSketch(adapter_0.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(), &sketch_0); std::numeric_limits<float>::quiet_NaN(), &sketch_0);
SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, 0); SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, FstCU());
HostDeviceVector<float> storage_1; HostDeviceVector<float> storage_1;
std::string interface_str_1 = RandomDataGenerator{kRows, kCols, 0} std::string interface_str_1 =
.Device(0) RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
.Seed(seed) &storage_1);
.GenerateArrayInterface(&storage_1);
data::CupyAdapter adapter_1(interface_str_1); data::CupyAdapter adapter_1(interface_str_1);
AdapterDeviceSketch(adapter_1.Value(), n_bins, info, AdapterDeviceSketch(adapter_1.Value(), n_bins, info, std::numeric_limits<float>::quiet_NaN(),
std::numeric_limits<float>::quiet_NaN(), &sketch_1); &sketch_1);
size_t size_before_merge = sketch_0.Data().size(); size_t size_before_merge = sketch_0.Data().size();
sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data()); sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
if (info.weights_.Size() != 0) { if (info.weights_.Size() != 0) {
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), true); TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), true);
sketch_0.FixError(); sketch_0.FixError();
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), false); TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), false);
} else { } else {
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr()); TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr());
} }
auto columns_ptr = sketch_0.ColumnsPtr(); auto columns_ptr = sketch_0.ColumnsPtr();
@ -240,24 +238,22 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
MetaInfo info; MetaInfo info;
int32_t seed = 0; int32_t seed = 0;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_0(ft, n_bins, cols, rows, 0); SketchContainer sketch_0(ft, n_bins, cols, rows, FstCU());
HostDeviceVector<float> storage_0; HostDeviceVector<float> storage_0;
std::string interface_str_0 = RandomDataGenerator{rows, cols, 0} std::string interface_str_0 =
.Device(0) RandomDataGenerator{rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
.Seed(seed) &storage_0);
.GenerateArrayInterface(&storage_0);
data::CupyAdapter adapter_0(interface_str_0); data::CupyAdapter adapter_0(interface_str_0);
AdapterDeviceSketch(adapter_0.Value(), n_bins, info, AdapterDeviceSketch(adapter_0.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(), std::numeric_limits<float>::quiet_NaN(),
&sketch_0); &sketch_0);
size_t f_rows = rows * frac; size_t f_rows = rows * frac;
SketchContainer sketch_1(ft, n_bins, cols, f_rows, 0); SketchContainer sketch_1(ft, n_bins, cols, f_rows, FstCU());
HostDeviceVector<float> storage_1; HostDeviceVector<float> storage_1;
std::string interface_str_1 = RandomDataGenerator{f_rows, cols, 0} std::string interface_str_1 =
.Device(0) RandomDataGenerator{f_rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
.Seed(seed) &storage_1);
.GenerateArrayInterface(&storage_1);
auto data_1 = storage_1.DeviceSpan(); auto data_1 = storage_1.DeviceSpan();
auto tuple_it = thrust::make_tuple( auto tuple_it = thrust::make_tuple(
thrust::make_counting_iterator<size_t>(0ul), data_1.data()); thrust::make_counting_iterator<size_t>(0ul), data_1.data());
@ -279,7 +275,7 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
size_t size_before_merge = sketch_0.Data().size(); size_t size_before_merge = sketch_0.Data().size();
sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data()); sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr()); TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr());
auto columns_ptr = sketch_0.ColumnsPtr(); auto columns_ptr = sketch_0.ColumnsPtr();
std::vector<bst_row_t> h_columns_ptr(columns_ptr.size()); std::vector<bst_row_t> h_columns_ptr(columns_ptr.size());
@ -310,11 +306,10 @@ TEST(GPUQuantile, MergeDuplicated) {
TEST(GPUQuantile, MultiMerge) { TEST(GPUQuantile, MultiMerge) {
constexpr size_t kRows = 20, kCols = 1; constexpr size_t kRows = 20, kCols = 1;
int32_t world = 2; int32_t world = 2;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
MetaInfo const &info) {
// Set up single node version // Set up single node version
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0); SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, FstCU());
size_t intermediate_num_cuts = std::min( size_t intermediate_num_cuts = std::min(
kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor)); kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
@ -322,12 +317,12 @@ TEST(GPUQuantile, MultiMerge) {
for (auto rank = 0; rank < world; ++rank) { for (auto rank = 0; rank < world; ++rank) {
HostDeviceVector<float> storage; HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0} std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
.Device(0) .Device(FstCU())
.Seed(rank + seed) .Seed(rank + seed)
.GenerateArrayInterface(&storage); .GenerateArrayInterface(&storage);
data::CupyAdapter adapter(interface_str); data::CupyAdapter adapter(interface_str);
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
containers.emplace_back(ft, n_bins, kCols, kRows, 0); containers.emplace_back(ft, n_bins, kCols, kRows, FstCU());
AdapterDeviceSketch(adapter.Value(), n_bins, info, AdapterDeviceSketch(adapter.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(), std::numeric_limits<float>::quiet_NaN(),
&containers.back()); &containers.back());
@ -337,12 +332,10 @@ TEST(GPUQuantile, MultiMerge) {
sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data()); sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data());
sketch_on_single_node.FixError(); sketch_on_single_node.FixError();
} }
TestQuantileElemRank(0, sketch_on_single_node.Data(), TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr());
sketch_on_single_node.ColumnsPtr());
sketch_on_single_node.Unique(); sketch_on_single_node.Unique();
TestQuantileElemRank(0, sketch_on_single_node.Data(), TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr());
sketch_on_single_node.ColumnsPtr());
}); });
} }
@ -351,7 +344,7 @@ void TestAllReduceBasic() {
auto const world = collective::GetWorldSize(); auto const world = collective::GetWorldSize();
constexpr size_t kRows = 1000, kCols = 100; constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
auto const device = GPUIDX; auto const device = DeviceOrd::CUDA(GPUIDX);
// Set up single node version; // Set up single node version;
HostDeviceVector<FeatureType> ft({}, device); HostDeviceVector<FeatureType> ft({}, device);
@ -483,7 +476,7 @@ void TestSameOnAllWorkers() {
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
MetaInfo const &info) { MetaInfo const &info) {
auto const rank = collective::GetRank(); auto const rank = collective::GetRank();
auto const device = GPUIDX; auto const device = DeviceOrd::CUDA(GPUIDX);
HostDeviceVector<FeatureType> ft({}, device); HostDeviceVector<FeatureType> ft({}, device);
SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
HostDeviceVector<float> storage({}, device); HostDeviceVector<float> storage({}, device);
@ -514,9 +507,9 @@ void TestSameOnAllWorkers() {
thrust::copy(thrust::device, local_data.data(), thrust::copy(thrust::device, local_data.data(),
local_data.data() + local_data.size(), local_data.data() + local_data.size(),
all_workers.begin() + local_data.size() * rank); all_workers.begin() + local_data.size() * rank);
collective::AllReduce<collective::Operation::kSum>(device, all_workers.data().get(), collective::AllReduce<collective::Operation::kSum>(device.ordinal, all_workers.data().get(),
all_workers.size()); all_workers.size());
collective::Synchronize(device); collective::Synchronize(device.ordinal);
auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float); auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float);
std::vector<float> h_base_line(base_line.size()); std::vector<float> h_base_line(base_line.size());
@ -562,7 +555,7 @@ TEST(GPUQuantile, Push) {
columns_ptr[1] = kRows; columns_ptr[1] = kRows;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, 0); SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
sketch.Push(dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows, {}); sketch.Push(dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows, {});
auto sketch_data = sketch.Data(); auto sketch_data = sketch.Data();
@ -602,7 +595,7 @@ TEST(GPUQuantile, MultiColPush) {
int32_t n_bins = 16; int32_t n_bins = 16;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, 0); SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
dh::device_vector<Entry> d_entries {entries}; dh::device_vector<Entry> d_entries {entries};
dh::device_vector<size_t> columns_ptr(kCols + 1, 0); dh::device_vector<size_t> columns_ptr(kCols + 1, 0);
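FstCU() in the hunks above is a shared test helper; judging from its call sites it names the first CUDA device, roughly equivalent to the sketch below (the real definition lives in the test helpers and is not part of this diff):

#include <xgboost/context.h>

inline xgboost::DeviceOrd FstCU() { return xgboost::DeviceOrd::CUDA(0); }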

View File

@ -95,7 +95,7 @@ void TestRankingCache(Context const* ctx) {
HostDeviceVector<float> predt(info.num_row_, 0); HostDeviceVector<float> predt(info.num_row_, 0);
auto& h_predt = predt.HostVector(); auto& h_predt = predt.HostVector();
std::iota(h_predt.begin(), h_predt.end(), 0.0f); std::iota(h_predt.begin(), h_predt.end(), 0.0f);
predt.SetDevice(ctx->gpu_id); predt.SetDevice(ctx->Device());
auto rank_idx = auto rank_idx =
cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan()); cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
@ -129,7 +129,7 @@ void TestNDCGCache(Context const* ctx) {
auto fail = [&]() { NDCGCache cache{ctx, info, param}; }; auto fail = [&]() { NDCGCache cache{ctx, info, param}; };
// empty label // empty label
ASSERT_THROW(fail(), dmlc::Error); ASSERT_THROW(fail(), dmlc::Error);
info.labels = linalg::Matrix<float>{{0.0f, 0.1f, 0.2f}, {3}, Context::kCpuId}; info.labels = linalg::Matrix<float>{{0.0f, 0.1f, 0.2f}, {3}, DeviceOrd::CPU()};
// invalid label // invalid label
ASSERT_THROW(fail(), dmlc::Error); ASSERT_THROW(fail(), dmlc::Error);
auto h_labels = info.labels.HostView(); auto h_labels = info.labels.HostView();

View File

@ -35,7 +35,7 @@ void TestCalcQueriesInvIDCG() {
auto d_scores = dh::ToSpan(scores); auto d_scores = dh::ToSpan(scores);
common::SegmentedSequence(&ctx, d_group_ptr, d_scores); common::SegmentedSequence(&ctx, d_group_ptr, d_scores);
linalg::Vector<double> inv_IDCG({n_groups}, ctx.gpu_id); linalg::Vector<double> inv_IDCG({n_groups}, ctx.Device());
ltr::LambdaRankParam p; ltr::LambdaRankParam p;
p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}}); p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}});
@ -70,7 +70,7 @@ void TestRankingCache(Context const* ctx) {
HostDeviceVector<float> predt(info.num_row_, 0); HostDeviceVector<float> predt(info.num_row_, 0);
auto& h_predt = predt.HostVector(); auto& h_predt = predt.HostVector();
std::iota(h_predt.begin(), h_predt.end(), 0.0f); std::iota(h_predt.begin(), h_predt.end(), 0.0f);
predt.SetDevice(ctx->gpu_id); predt.SetDevice(ctx->Device());
auto rank_idx = auto rank_idx =
cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan()); cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());

View File

@ -9,12 +9,11 @@
#include "../../../src/common/transform_iterator.h" // common::MakeIndexTransformIter #include "../../../src/common/transform_iterator.h" // common::MakeIndexTransformIter
#include "../helpers.h" #include "../helpers.h"
namespace xgboost { namespace xgboost::common {
namespace common {
TEST(Stats, Quantile) { TEST(Stats, Quantile) {
Context ctx; Context ctx;
{ {
linalg::Tensor<float, 1> arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, Context::kCpuId); linalg::Tensor<float, 1> arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, DeviceOrd::CPU());
std::vector<size_t> index{0, 2, 3, 4, 6}; std::vector<size_t> index{0, 2, 3, 4, 6};
auto h_arr = arr.HostView(); auto h_arr = arr.HostView();
auto beg = MakeIndexTransformIter([&](size_t i) { return h_arr(index[i]); }); auto beg = MakeIndexTransformIter([&](size_t i) { return h_arr(index[i]); });
@ -40,8 +39,8 @@ TEST(Stats, Quantile) {
TEST(Stats, WeightedQuantile) { TEST(Stats, WeightedQuantile) {
Context ctx; Context ctx;
linalg::Tensor<float, 1> arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, Context::kCpuId); linalg::Tensor<float, 1> arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, DeviceOrd::CPU());
linalg::Tensor<float, 1> weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, Context::kCpuId); linalg::Tensor<float, 1> weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, DeviceOrd::CPU());
auto h_arr = arr.HostView(); auto h_arr = arr.HostView();
auto h_weight = weight.HostView(); auto h_weight = weight.HostView();
@ -64,7 +63,7 @@ TEST(Stats, Median) {
Context ctx; Context ctx;
{ {
linalg::Tensor<float, 2> values{{.0f, .0f, 1.f, 2.f}, {4}, Context::kCpuId}; linalg::Tensor<float, 2> values{{.0f, .0f, 1.f, 2.f}, {4}, DeviceOrd::CPU()};
HostDeviceVector<float> weights; HostDeviceVector<float> weights;
linalg::Tensor<float, 1> out; linalg::Tensor<float, 1> out;
Median(&ctx, values, weights, &out); Median(&ctx, values, weights, &out);
@ -83,7 +82,7 @@ TEST(Stats, Median) {
{ {
ctx = ctx.MakeCPU(); ctx = ctx.MakeCPU();
// 4x2 matrix // 4x2 matrix
linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.gpu_id}; linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.Device()};
HostDeviceVector<float> weights; HostDeviceVector<float> weights;
linalg::Tensor<float, 1> out; linalg::Tensor<float, 1> out;
Median(&ctx, values, weights, &out); Median(&ctx, values, weights, &out);
@ -102,14 +101,14 @@ TEST(Stats, Median) {
namespace { namespace {
void TestMean(Context const* ctx) { void TestMean(Context const* ctx) {
std::size_t n{128}; std::size_t n{128};
linalg::Vector<float> data({n}, ctx->gpu_id); linalg::Vector<float> data({n}, ctx->Device());
auto h_v = data.HostView().Values(); auto h_v = data.HostView().Values();
std::iota(h_v.begin(), h_v.end(), .0f); std::iota(h_v.begin(), h_v.end(), .0f);
auto nf = static_cast<float>(n); auto nf = static_cast<float>(n);
float mean = nf * (nf - 1) / 2 / n; float mean = nf * (nf - 1) / 2 / n;
linalg::Vector<float> res{{1}, ctx->gpu_id}; linalg::Vector<float> res{{1}, ctx->Device()};
Mean(ctx, data, &res); Mean(ctx, data, &res);
auto h_res = res.HostView(); auto h_res = res.HostView();
ASSERT_EQ(h_res.Size(), 1); ASSERT_EQ(h_res.Size(), 1);
@ -128,5 +127,4 @@ TEST(Stats, GPUMean) {
TestMean(&ctx); TestMean(&ctx);
} }
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA)
} // namespace common } // namespace xgboost::common
} // namespace xgboost
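The stats tests above show the companion change for linalg containers: the trailing device argument is now a DeviceOrd, typically DeviceOrd::CPU() or ctx->Device(), instead of Context::kCpuId or a gpu_id. A hedged sketch of that construction pattern (the xgboost/linalg.h header path is an assumption):

#include <xgboost/context.h>
#include <xgboost/linalg.h>

void MakeTensors(xgboost::Context const* ctx) {
  // Placed on whichever device the runtime context selects.
  xgboost::linalg::Vector<float> on_ctx({4}, ctx->Device());
  // Explicit CPU placement replaces the old Context::kCpuId sentinel.
  xgboost::linalg::Tensor<float, 1> on_cpu({1.f, 2.f, 3.f}, {3}, xgboost::DeviceOrd::CPU());
  (void)on_ctx;
  (void)on_cpu;
}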

View File

@ -20,8 +20,8 @@ namespace common {
namespace { namespace {
class StatsGPU : public ::testing::Test { class StatsGPU : public ::testing::Test {
private: private:
linalg::Tensor<float, 1> arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, 0}; linalg::Tensor<float, 1> arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, FstCU()};
linalg::Tensor<std::size_t, 1> indptr_{{0, 5, 10}, {3}, 0}; linalg::Tensor<std::size_t, 1> indptr_{{0, 5, 10}, {3}, FstCU()};
HostDeviceVector<float> results_; HostDeviceVector<float> results_;
using TestSet = std::vector<std::pair<float, float>>; using TestSet = std::vector<std::pair<float, float>>;
Context ctx_; Context ctx_;
@ -46,7 +46,7 @@ class StatsGPU : public ::testing::Test {
data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end());
linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0}; linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, FstCU()};
auto d_arr = arr.View(DeviceOrd::CUDA(0)); auto d_arr = arr.View(DeviceOrd::CUDA(0));
auto key_it = dh::MakeTransformIterator<std::size_t>( auto key_it = dh::MakeTransformIterator<std::size_t>(
@ -58,7 +58,7 @@ class StatsGPU : public ::testing::Test {
// one alpha for each segment // one alpha for each segment
HostDeviceVector<float> alphas{0.0f, 0.5f, 1.0f}; HostDeviceVector<float> alphas{0.0f, 0.5f, 1.0f};
alphas.SetDevice(0); alphas.SetDevice(FstCU());
auto d_alphas = alphas.ConstDeviceSpan(); auto d_alphas = alphas.ConstDeviceSpan();
auto w_it = thrust::make_constant_iterator(0.1f); auto w_it = thrust::make_constant_iterator(0.1f);
SegmentedWeightedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it, SegmentedWeightedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it,
@ -80,7 +80,7 @@ class StatsGPU : public ::testing::Test {
auto val_it = auto val_it =
dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul), dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); }); [=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); });
linalg::Tensor<float, 1> weights{{10}, 0}; linalg::Tensor<float, 1> weights{{10}, FstCU()};
linalg::ElementWiseTransformDevice(weights.View(DeviceOrd::CUDA(0)), linalg::ElementWiseTransformDevice(weights.View(DeviceOrd::CUDA(0)),
[=] XGBOOST_DEVICE(std::size_t, float) { return 1.0; }); [=] XGBOOST_DEVICE(std::size_t, float) { return 1.0; });
auto w_it = weights.Data()->ConstDevicePointer(); auto w_it = weights.Data()->ConstDevicePointer();
@ -101,7 +101,7 @@ class StatsGPU : public ::testing::Test {
data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end());
linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0}; linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, FstCU()};
auto d_arr = arr.View(DeviceOrd::CUDA(0)); auto d_arr = arr.View(DeviceOrd::CUDA(0));
auto key_it = dh::MakeTransformIterator<std::size_t>( auto key_it = dh::MakeTransformIterator<std::size_t>(
@ -113,7 +113,7 @@ class StatsGPU : public ::testing::Test {
// one alpha for each segment // one alpha for each segment
HostDeviceVector<float> alphas{0.1f, 0.2f, 0.4f}; HostDeviceVector<float> alphas{0.1f, 0.2f, 0.4f};
alphas.SetDevice(0); alphas.SetDevice(FstCU());
auto d_alphas = alphas.ConstDeviceSpan(); auto d_alphas = alphas.ConstDeviceSpan();
SegmentedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it, SegmentedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it,
val_it + d_arr.Size(), &results_); val_it + d_arr.Size(), &results_);

View File

@ -11,63 +11,59 @@
#include "../../../src/common/transform.h" #include "../../../src/common/transform.h"
#include "../helpers.h" #include "../helpers.h"
namespace xgboost::common {
namespace {
constexpr DeviceOrd TransformDevice() {
#if defined(__CUDACC__) #if defined(__CUDACC__)
return DeviceOrd::CUDA(0);
#define TRANSFORM_GPU 0
#else #else
return DeviceOrd::CPU();
#define TRANSFORM_GPU -1
#endif #endif
}
namespace xgboost { } // namespace
namespace common {
template <typename T> template <typename T>
struct TestTransformRange { struct TestTransformRange {
void XGBOOST_DEVICE operator()(size_t _idx, void XGBOOST_DEVICE operator()(std::size_t _idx, Span<float> _out, Span<const float> _in) {
Span<bst_float> _out, Span<const bst_float> _in) {
_out[_idx] = _in[_idx]; _out[_idx] = _in[_idx];
} }
}; };
TEST(Transform, DeclareUnifiedTest(Basic)) { TEST(Transform, DeclareUnifiedTest(Basic)) {
const size_t size {256}; const size_t size{256};
std::vector<bst_float> h_in(size); std::vector<float> h_in(size);
std::vector<bst_float> h_out(size); std::vector<float> h_out(size);
std::iota(h_in.begin(), h_in.end(), 0); std::iota(h_in.begin(), h_in.end(), 0);
std::vector<bst_float> h_sol(size); std::vector<float> h_sol(size);
std::iota(h_sol.begin(), h_sol.end(), 0); std::iota(h_sol.begin(), h_sol.end(), 0);
const HostDeviceVector<bst_float> in_vec{h_in, TRANSFORM_GPU}; auto device = TransformDevice();
HostDeviceVector<bst_float> out_vec{h_out, TRANSFORM_GPU}; HostDeviceVector<float> const in_vec{h_in, device};
HostDeviceVector<float> out_vec{h_out, device};
out_vec.Fill(0); out_vec.Fill(0);
Transform<>::Init(TestTransformRange<bst_float>{}, Transform<>::Init(TestTransformRange<float>{},
Range{0, static_cast<Range::DifferenceType>(size)}, AllThreadsForTest(), Range{0, static_cast<Range::DifferenceType>(size)}, AllThreadsForTest(),
TRANSFORM_GPU) TransformDevice())
.Eval(&out_vec, &in_vec); .Eval(&out_vec, &in_vec);
std::vector<bst_float> res = out_vec.HostVector(); std::vector<float> res = out_vec.HostVector();
ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin())); ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin()));
} }
#if !defined(__CUDACC__) #if !defined(__CUDACC__)
TEST(TransformDeathTest, Exception) { TEST(TransformDeathTest, Exception) {
size_t const kSize {16}; size_t const kSize{16};
std::vector<bst_float> h_in(kSize); std::vector<float> h_in(kSize);
const HostDeviceVector<bst_float> in_vec{h_in, -1}; const HostDeviceVector<float> in_vec{h_in, DeviceOrd::CPU()};
EXPECT_DEATH( EXPECT_DEATH(
{ {
Transform<>::Init([](size_t idx, common::Span<float const> _in) { _in[idx + 1]; }, Transform<>::Init([](size_t idx, common::Span<float const> _in) { _in[idx + 1]; },
Range(0, static_cast<Range::DifferenceType>(kSize)), AllThreadsForTest(), Range(0, static_cast<Range::DifferenceType>(kSize)), AllThreadsForTest(),
-1) DeviceOrd::CPU())
.Eval(&in_vec); .Eval(&in_vec);
}, },
""); "");
} }
#endif #endif
} // namespace xgboost::common
} // namespace common
} // namespace xgboost

View File

@ -0,0 +1,5 @@
/**
* Copyright 2023 XGBoost contributors
*/
// Dummy file to keep the CUDA tests.
#include "test_transform_range.cc"

View File

@ -59,12 +59,12 @@ TEST(DeviceAdapter, GetRowCounts) {
for (bst_feature_t n_features : {1, 2, 4, 64, 128, 256}) { for (bst_feature_t n_features : {1, 2, 4, 64, 128, 256}) {
HostDeviceVector<float> storage; HostDeviceVector<float> storage;
auto str_arr = RandomDataGenerator{8192, n_features, 0.0} auto str_arr = RandomDataGenerator{8192, n_features, 0.0}
.Device(ctx.gpu_id) .Device(ctx.Device())
.GenerateArrayInterface(&storage); .GenerateArrayInterface(&storage);
auto adapter = CupyAdapter{str_arr}; auto adapter = CupyAdapter{str_arr};
HostDeviceVector<bst_row_t> offset(adapter.NumRows() + 1, 0); HostDeviceVector<bst_row_t> offset(adapter.NumRows() + 1, 0);
offset.SetDevice(ctx.gpu_id); offset.SetDevice(ctx.Device());
auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.gpu_id, auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.Device(),
std::numeric_limits<float>::quiet_NaN()); std::numeric_limits<float>::quiet_NaN());
ASSERT_EQ(rstride, n_features); ASSERT_EQ(rstride, n_features);
} }

View File

@ -94,7 +94,7 @@ TEST(EllpackPage, FromCategoricalBasic) {
Context ctx{MakeCUDACtx(0)}; Context ctx{MakeCUDACtx(0)};
auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()}; auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()};
auto ellpack = EllpackPage(&ctx, m.get(), p); auto ellpack = EllpackPage(&ctx, m.get(), p);
auto accessor = ellpack.Impl()->GetDeviceAccessor(0); auto accessor = ellpack.Impl()->GetDeviceAccessor(FstCU());
ASSERT_EQ(kCats, accessor.NumBins()); ASSERT_EQ(kCats, accessor.NumBins());
auto x_copy = x; auto x_copy = x;
@ -152,13 +152,12 @@ TEST(EllpackPage, Copy) {
auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl(); auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
// Create an empty result page. // Create an empty result page.
EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride, EllpackPageImpl result(FstCU(), page->Cuts(), page->is_dense, page->row_stride, kRows);
kRows);
// Copy batch pages into the result page. // Copy batch pages into the result page.
size_t offset = 0; size_t offset = 0;
for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) { for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
size_t num_elements = result.Copy(0, batch.Impl(), offset); size_t num_elements = result.Copy(FstCU(), batch.Impl(), offset);
offset += num_elements; offset += num_elements;
} }
@ -172,10 +171,12 @@ TEST(EllpackPage, Copy) {
EXPECT_EQ(impl->base_rowid, current_row); EXPECT_EQ(impl->base_rowid, current_row);
for (size_t i = 0; i < impl->Size(); i++) { for (size_t i = 0; i < impl->Size(); i++) {
dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get())); dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), current_row,
row_d.data().get()));
thrust::copy(row_d.begin(), row_d.end(), row.begin()); thrust::copy(row_d.begin(), row_d.end(), row.begin());
dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(0), current_row, row_result_d.data().get())); dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(FstCU()), current_row,
row_result_d.data().get()));
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
EXPECT_EQ(row, row_result); EXPECT_EQ(row, row_result);
@ -199,8 +200,7 @@ TEST(EllpackPage, Compact) {
auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl(); auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
// Create an empty result page. // Create an empty result page.
EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride, EllpackPageImpl result(FstCU(), page->Cuts(), page->is_dense, page->row_stride, kCompactedRows);
kCompactedRows);
// Compact batch pages into the result page. // Compact batch pages into the result page.
std::vector<size_t> row_indexes_h { std::vector<size_t> row_indexes_h {
@ -209,7 +209,7 @@ TEST(EllpackPage, Compact) {
thrust::device_vector<size_t> row_indexes_d = row_indexes_h; thrust::device_vector<size_t> row_indexes_d = row_indexes_h;
common::Span<size_t> row_indexes_span(row_indexes_d.data().get(), kRows); common::Span<size_t> row_indexes_span(row_indexes_d.data().get(), kRows);
for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) { for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
result.Compact(0, batch.Impl(), row_indexes_span); result.Compact(FstCU(), batch.Impl(), row_indexes_span);
} }
size_t current_row = 0; size_t current_row = 0;
@ -228,13 +228,13 @@ TEST(EllpackPage, Compact) {
continue; continue;
} }
dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()),
current_row, row_d.data().get())); current_row, row_d.data().get()));
dh::safe_cuda(cudaDeviceSynchronize()); dh::safe_cuda(cudaDeviceSynchronize());
thrust::copy(row_d.begin(), row_d.end(), row.begin()); thrust::copy(row_d.begin(), row_d.end(), row.begin());
dh::LaunchN(kCols, dh::LaunchN(kCols,
ReadRowFunction(result.GetDeviceAccessor(0), compacted_row, ReadRowFunction(result.GetDeviceAccessor(FstCU()), compacted_row,
row_result_d.data().get())); row_result_d.data().get()));
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());

View File

@ -30,7 +30,7 @@ namespace xgboost::data {
TEST(GradientIndex, ExternalMemoryBaseRowID) { TEST(GradientIndex, ExternalMemoryBaseRowID) {
Context ctx; Context ctx;
auto p_fmat = RandomDataGenerator{4096, 256, 0.5} auto p_fmat = RandomDataGenerator{4096, 256, 0.5}
.Device(ctx.gpu_id) .Device(ctx.Device())
.Batches(8) .Batches(8)
.GenerateSparsePageDMatrix("cache", true); .GenerateSparsePageDMatrix("cache", true);

View File

@ -11,9 +11,7 @@
#include "../helpers.h" #include "../helpers.h"
#include "test_iterative_dmatrix.h" #include "test_iterative_dmatrix.h"
namespace xgboost { namespace xgboost::data {
namespace data {
void TestEquivalent(float sparsity) { void TestEquivalent(float sparsity) {
Context ctx{MakeCUDACtx(0)}; Context ctx{MakeCUDACtx(0)};
@ -23,14 +21,14 @@ void TestEquivalent(float sparsity) {
std::size_t offset = 0; std::size_t offset = 0;
auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl(); auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl();
std::unique_ptr<EllpackPageImpl> page_concatenated { std::unique_ptr<EllpackPageImpl> page_concatenated {
new EllpackPageImpl(0, first->Cuts(), first->is_dense, new EllpackPageImpl(ctx.Device(), first->Cuts(), first->is_dense,
first->row_stride, 1000 * 100)}; first->row_stride, 1000 * 100)};
for (auto& batch : m.GetBatches<EllpackPage>(&ctx, {})) { for (auto& batch : m.GetBatches<EllpackPage>(&ctx, {})) {
auto page = batch.Impl(); auto page = batch.Impl();
size_t num_elements = page_concatenated->Copy(0, page, offset); size_t num_elements = page_concatenated->Copy(ctx.Device(), page, offset);
offset += num_elements; offset += num_elements;
} }
auto from_iter = page_concatenated->GetDeviceAccessor(0); auto from_iter = page_concatenated->GetDeviceAccessor(ctx.Device());
ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::Cols()); ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::Cols());
ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::Rows()); ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::Rows());
@ -40,7 +38,7 @@ void TestEquivalent(float sparsity) {
DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 0)}; DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 0)};
auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
for (auto& ellpack : dm->GetBatches<EllpackPage>(&ctx, bp)) { for (auto& ellpack : dm->GetBatches<EllpackPage>(&ctx, bp)) {
auto from_data = ellpack.Impl()->GetDeviceAccessor(0); auto from_data = ellpack.Impl()->GetDeviceAccessor(ctx.Device());
std::vector<float> cuts_from_iter(from_iter.gidx_fvalue_map.size()); std::vector<float> cuts_from_iter(from_iter.gidx_fvalue_map.size());
std::vector<float> min_fvalues_iter(from_iter.min_fvalue.size()); std::vector<float> min_fvalues_iter(from_iter.min_fvalue.size());
@ -152,10 +150,10 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) {
auto impl = ellpack.Impl(); auto impl = ellpack.Impl();
common::CompressedIterator<uint32_t> iterator( common::CompressedIterator<uint32_t> iterator(
impl->gidx_buffer.HostVector().data(), impl->NumSymbols()); impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(0).NullValue()); EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(ctx.Device()).NullValue());
EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(0).NullValue()); EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(ctx.Device()).NullValue());
// null values get placed after valid values in a row // null values get placed after valid values in a row
EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(0).NullValue()); EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(ctx.Device()).NullValue());
EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_col_, cols);
EXPECT_EQ(m.Info().num_row_, rows); EXPECT_EQ(m.Info().num_row_, rows);
EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3); EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3);
@ -183,5 +181,4 @@ TEST(IterativeDeviceDMatrix, Ref) {
TestRefDMatrix<EllpackPage, CudaArrayIterForTest>( TestRefDMatrix<EllpackPage, CudaArrayIterForTest>(
&ctx, [](EllpackPage const& page) { return page.Impl()->Cuts(); }); &ctx, [](EllpackPage const& page) { return page.Impl()->Cuts(); });
} }
} // namespace data } // namespace xgboost::data
} // namespace xgboost

View File

@ -12,6 +12,7 @@
#include "../helpers.h" #include "../helpers.h"
#include "xgboost/base.h" #include "xgboost/base.h"
namespace xgboost {
TEST(MetaInfo, GetSet) { TEST(MetaInfo, GetSet) {
xgboost::Context ctx; xgboost::Context ctx;
xgboost::MetaInfo info; xgboost::MetaInfo info;
@ -236,9 +237,9 @@ TEST(MetaInfo, Validate) {
info.num_nonzero_ = 12; info.num_nonzero_ = 12;
info.num_col_ = 3; info.num_col_ = 3;
std::vector<xgboost::bst_group_t> groups (11); std::vector<xgboost::bst_group_t> groups (11);
xgboost::Context ctx; Context ctx;
info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, 11); info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, 11);
EXPECT_THROW(info.Validate(0), dmlc::Error); EXPECT_THROW(info.Validate(FstCU()), dmlc::Error);
std::vector<float> labels(info.num_row_ + 1); std::vector<float> labels(info.num_row_ + 1);
EXPECT_THROW( EXPECT_THROW(
@ -261,11 +262,11 @@ TEST(MetaInfo, Validate) {
info.group_ptr_.clear(); info.group_ptr_.clear();
labels.resize(info.num_row_); labels.resize(info.num_row_);
info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_); info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_);
info.labels.SetDevice(0); info.labels.SetDevice(FstCU());
EXPECT_THROW(info.Validate(1), dmlc::Error); EXPECT_THROW(info.Validate(DeviceOrd::CUDA(1)), dmlc::Error);
xgboost::HostDeviceVector<xgboost::bst_group_t> d_groups{groups}; xgboost::HostDeviceVector<xgboost::bst_group_t> d_groups{groups};
d_groups.SetDevice(0); d_groups.SetDevice(FstCU());
d_groups.DevicePointer(); // pull to device d_groups.DevicePointer(); // pull to device
std::string arr_interface_str{ArrayInterfaceStr(xgboost::linalg::MakeVec( std::string arr_interface_str{ArrayInterfaceStr(xgboost::linalg::MakeVec(
d_groups.ConstDevicePointer(), d_groups.Size(), xgboost::DeviceOrd::CUDA(0)))}; d_groups.ConstDevicePointer(), d_groups.Size(), xgboost::DeviceOrd::CUDA(0)))};
@ -306,6 +307,5 @@ TEST(MetaInfo, HostExtend) {
} }
} }
namespace xgboost {
TEST(MetaInfo, CPUStridedData) { TestMetaInfoStridedData(DeviceOrd::CPU()); } TEST(MetaInfo, CPUStridedData) { TestMetaInfoStridedData(DeviceOrd::CPU()); }
} // namespace xgboost } // namespace xgboost

View File

@ -1,31 +1,27 @@
/*! /**
* Copyright 2021 XGBoost contributors * Copyright 2021-2023, XGBoost contributors
*/ */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "../helpers.h"
#include "../../../src/data/proxy_dmatrix.h"
#include "../../../src/data/adapter.h"
namespace xgboost { #include "../../../src/data/adapter.h"
namespace data { #include "../../../src/data/proxy_dmatrix.h"
#include "../helpers.h"
namespace xgboost::data {
TEST(ProxyDMatrix, HostData) { TEST(ProxyDMatrix, HostData) {
DMatrixProxy proxy; DMatrixProxy proxy;
size_t constexpr kRows = 100, kCols = 10; size_t constexpr kRows = 100, kCols = 10;
std::vector<HostDeviceVector<float>> label_storage(1); std::vector<HostDeviceVector<float>> label_storage(1);
HostDeviceVector<float> storage; HostDeviceVector<float> storage;
auto data = RandomDataGenerator(kRows, kCols, 0.5) auto data =
.Device(0) RandomDataGenerator(kRows, kCols, 0.5).Device(FstCU()).GenerateArrayInterface(&storage);
.GenerateArrayInterface(&storage);
proxy.SetArrayData(data.c_str()); proxy.SetArrayData(data.c_str());
auto n_samples = HostAdapterDispatch( auto n_samples = HostAdapterDispatch(&proxy, [](auto const &value) { return value.Size(); });
&proxy, [](auto const &value) { return value.Size(); });
ASSERT_EQ(n_samples, kRows); ASSERT_EQ(n_samples, kRows);
auto n_features = HostAdapterDispatch( auto n_features = HostAdapterDispatch(&proxy, [](auto const &value) { return value.NumCols(); });
&proxy, [](auto const &value) { return value.NumCols(); });
ASSERT_EQ(n_features, kCols); ASSERT_EQ(n_features, kCols);
} }
} // namespace data } // namespace xgboost::data
} // namespace xgboost

View File

@ -15,10 +15,12 @@ namespace xgboost::data {
TEST(ProxyDMatrix, DeviceData) { TEST(ProxyDMatrix, DeviceData) {
constexpr size_t kRows{100}, kCols{100}; constexpr size_t kRows{100}, kCols{100};
HostDeviceVector<float> storage; HostDeviceVector<float> storage;
auto data = RandomDataGenerator(kRows, kCols, 0.5).Device(0).GenerateArrayInterface(&storage); auto data =
RandomDataGenerator(kRows, kCols, 0.5).Device(FstCU()).GenerateArrayInterface(&storage);
std::vector<HostDeviceVector<float>> label_storage(1); std::vector<HostDeviceVector<float>> label_storage(1);
auto labels = auto labels = RandomDataGenerator(kRows, 1, 0)
RandomDataGenerator(kRows, 1, 0).Device(0).GenerateColumnarArrayInterface(&label_storage); .Device(FstCU())
.GenerateColumnarArrayInterface(&label_storage);
DMatrixProxy proxy; DMatrixProxy proxy;
proxy.SetCUDAArray(data.c_str()); proxy.SetCUDAArray(data.c_str());
@ -31,7 +33,7 @@ TEST(ProxyDMatrix, DeviceData) {
std::vector<HostDeviceVector<float>> columnar_storage(kCols); std::vector<HostDeviceVector<float>> columnar_storage(kCols);
data = RandomDataGenerator(kRows, kCols, 0) data = RandomDataGenerator(kRows, kCols, 0)
.Device(0) .Device(FstCU())
.GenerateColumnarArrayInterface(&columnar_storage); .GenerateColumnarArrayInterface(&columnar_storage);
proxy.SetCUDAArray(data.c_str()); proxy.SetCUDAArray(data.c_str());
ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr<CudfAdapter>)); ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr<CudfAdapter>));

View File

@ -268,7 +268,7 @@ TEST(SimpleDMatrix, Slice) {
std::iota(upper.begin(), upper.end(), 1.0f); std::iota(upper.begin(), upper.end(), 1.0f);
auto& margin = p_m->Info().base_margin_; auto& margin = p_m->Info().base_margin_;
margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, Context::kCpuId}; margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, DeviceOrd::CPU()};
std::array<int32_t, 3> ridxs {1, 3, 5}; std::array<int32_t, 3> ridxs {1, 3, 5};
std::unique_ptr<DMatrix> out { p_m->Slice(ridxs) }; std::unique_ptr<DMatrix> out { p_m->Slice(ridxs) };
@ -341,7 +341,7 @@ TEST(SimpleDMatrix, SliceCol) {
std::iota(upper.begin(), upper.end(), 1.0f); std::iota(upper.begin(), upper.end(), 1.0f);
auto& margin = p_m->Info().base_margin_; auto& margin = p_m->Info().base_margin_;
margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, Context::kCpuId}; margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, DeviceOrd::CPU()};
auto constexpr kSlices {2}; auto constexpr kSlices {2};
auto constexpr kSliceSize {4}; auto constexpr kSliceSize {4};

View File

@ -134,11 +134,11 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
size_t offset = 0; size_t offset = 0;
for (auto& batch : dmat_ext->GetBatches<EllpackPage>(&ctx, param)) { for (auto& batch : dmat_ext->GetBatches<EllpackPage>(&ctx, param)) {
if (!impl_ext) { if (!impl_ext) {
impl_ext.reset(new EllpackPageImpl( impl_ext = std::make_unique<EllpackPageImpl>(batch.Impl()->gidx_buffer.Device(),
batch.Impl()->gidx_buffer.DeviceIdx(), batch.Impl()->Cuts(), batch.Impl()->Cuts(), batch.Impl()->is_dense,
batch.Impl()->is_dense, batch.Impl()->row_stride, kRows)); batch.Impl()->row_stride, kRows);
} }
auto n_elems = impl_ext->Copy(0, batch.Impl(), offset); auto n_elems = impl_ext->Copy(ctx.Device(), batch.Impl(), offset);
offset += n_elems; offset += n_elems;
} }
EXPECT_EQ(impl_ext->base_rowid, 0); EXPECT_EQ(impl_ext->base_rowid, 0);
@ -198,10 +198,12 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
EXPECT_EQ(impl_ext->base_rowid, current_row); EXPECT_EQ(impl_ext->base_rowid, current_row);
for (size_t i = 0; i < impl_ext->Size(); i++) { for (size_t i = 0; i < impl_ext->Size(); i++) {
dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get())); dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row,
row_d.data().get()));
thrust::copy(row_d.begin(), row_d.end(), row.begin()); thrust::copy(row_d.begin(), row_d.end(), row.begin());
dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(0), current_row, row_ext_d.data().get())); dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(ctx.Device()), current_row,
row_ext_d.data().get()));
thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin()); thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin());
EXPECT_EQ(row, row_ext); EXPECT_EQ(row, row_ext);

View File

@ -65,7 +65,7 @@ TEST(GBTree, PredictionCache) {
gbtree.Configure({{"tree_method", "hist"}}); gbtree.Configure({{"tree_method", "hist"}});
auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal()); linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
gpair.Data()->Copy(GenerateRandomGradients(kRows)); gpair.Data()->Copy(GenerateRandomGradients(kRows));
PredictionCacheEntry out_predictions; PredictionCacheEntry out_predictions;
@ -156,7 +156,7 @@ TEST(GBTree, ChoosePredictor) {
// pull data into device. // pull data into device.
data.HostVector(); data.HostVector();
data.SetDevice(0); data.SetDevice(DeviceOrd::CUDA(0));
data.DeviceSpan(); data.DeviceSpan();
ASSERT_FALSE(data.HostCanWrite()); ASSERT_FALSE(data.HostCanWrite());
@ -215,7 +215,7 @@ TEST(GBTree, ChooseTreeMethod) {
} }
learner->Configure(); learner->Configure();
for (std::int32_t i = 0; i < 3; ++i) { for (std::int32_t i = 0; i < 3; ++i) {
linalg::Matrix<GradientPair> gpair{{Xy->Info().num_row_}, Context::kCpuId}; linalg::Matrix<GradientPair> gpair{{Xy->Info().num_row_}, DeviceOrd::CPU()};
gpair.Data()->Copy(GenerateRandomGradients(Xy->Info().num_row_)); gpair.Data()->Copy(GenerateRandomGradients(Xy->Info().num_row_));
learner->BoostOneIter(0, Xy, &gpair); learner->BoostOneIter(0, Xy, &gpair);
} }
@ -400,7 +400,7 @@ class Dart : public testing::TestWithParam<char const*> {
if (device == "GPU") { if (device == "GPU") {
ctx = MakeCUDACtx(0); ctx = MakeCUDACtx(0);
} }
auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.gpu_id); auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.Device());
auto array_str = rng.GenerateArrayInterface(&data); auto array_str = rng.GenerateArrayInterface(&data);
auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols); auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols);
@ -710,7 +710,7 @@ TEST(GBTree, InplacePredictionError) {
auto test_qdm_err = [&](std::string booster, Context const* ctx) { auto test_qdm_err = [&](std::string booster, Context const* ctx) {
std::shared_ptr<DMatrix> p_fmat; std::shared_ptr<DMatrix> p_fmat;
bst_bin_t max_bins = 16; bst_bin_t max_bins = 16;
auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->gpu_id).Bins(max_bins); auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->Device()).Bins(max_bins);
if (ctx->IsCPU()) { if (ctx->IsCPU()) {
p_fmat = rng.GenerateQuantileDMatrix(true); p_fmat = rng.GenerateQuantileDMatrix(true);
} else { } else {

View File

@@ -22,7 +22,7 @@ void TestInplaceFallback(Context const* ctx) {
bst_feature_t n_features{32};
HostDeviceVector<float> X_storage;
// use a different device than the learner
-   std::int32_t data_ordinal = ctx->IsCPU() ? 0 : -1;
+   auto data_ordinal = ctx->IsCPU() ? DeviceOrd::CUDA(0) : DeviceOrd::CPU();
auto X = RandomDataGenerator{n_samples, n_features, 0.0}
.Device(data_ordinal)
.GenerateArrayInterface(&X_storage);
@@ -30,7 +30,7 @@ void TestInplaceFallback(Context const* ctx) {
auto y = RandomDataGenerator{n_samples, 1u, 0.0}.GenerateArrayInterface(&y_storage);
std::shared_ptr<DMatrix> Xy;
-   if (data_ordinal == Context::kCpuId) {
+   if (data_ordinal.IsCPU()) {
auto X_adapter = data::ArrayAdapter{StringView{X}};
Xy.reset(DMatrix::Create(&X_adapter, std::numeric_limits<float>::quiet_NaN(), ctx->Threads()));
} else {
@@ -49,7 +49,7 @@ void TestInplaceFallback(Context const* ctx) {
std::shared_ptr<DMatrix> p_m{new data::DMatrixProxy};
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
-   if (data_ordinal == Context::kCpuId) {
+   if (data_ordinal.IsCPU()) {
proxy->SetArrayData(StringView{X});
} else {
proxy->SetCUDAArray(X.c_str());
@@ -64,7 +64,7 @@ void TestInplaceFallback(Context const* ctx) {
// test when the contexts match
Context new_ctx = *proxy->Ctx();
-   ASSERT_NE(new_ctx.gpu_id, ctx->gpu_id);
+   ASSERT_NE(new_ctx.Ordinal(), ctx->Ordinal());
learner->SetParam("device", new_ctx.DeviceName());
HostDeviceVector<float>* out_predt_1{nullptr};
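
The hunks above all make the same substitution: comparisons against raw integer ordinals, where -1 (Context::kCpuId) stood for the CPU, become queries on a typed DeviceOrd value. A minimal sketch of the pattern follows; the header path is an assumption, and the snippet is illustrative only, not part of the diff:

    #include <xgboost/context.h>  // assumed location of Context and DeviceOrd

    // Sketch: pick a data device different from the learner's and branch on it.
    void DevicePickSketch(xgboost::Context const* ctx) {
      using xgboost::DeviceOrd;
      // Old style, for contrast:
      //   std::int32_t ord = ctx->IsCPU() ? 0 : -1;
      //   if (ord == xgboost::Context::kCpuId) { ... }
      auto data_ordinal = ctx->IsCPU() ? DeviceOrd::CUDA(0) : DeviceOrd::CPU();
      if (data_ordinal.IsCPU()) {
        // host path: build the DMatrix from an ArrayAdapter over the array interface
      } else {
        // device path: hand the CUDA array interface to the DMatrixProxy
      }
    }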


@@ -119,8 +119,10 @@ void CheckObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
std::vector<xgboost::bst_float> out_hess) {
xgboost::MetaInfo info;
info.num_row_ = labels.size();
-   info.labels = xgboost::linalg::Tensor<float, 2>{
-       labels.cbegin(), labels.cend(), {labels.size(), static_cast<std::size_t>(1)}, -1};
+   info.labels = xgboost::linalg::Tensor<float, 2>{labels.cbegin(),
+                                                   labels.cend(),
+                                                   {labels.size(), static_cast<std::size_t>(1)},
+                                                   xgboost::DeviceOrd::CPU()};
info.weights_.HostVector() = weights;
CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess);
@@ -155,8 +157,10 @@ void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
std::vector<xgboost::bst_float> out_hess) {
xgboost::MetaInfo info;
info.num_row_ = labels.size();
-   info.labels = xgboost::linalg::Matrix<float>{
-       labels.cbegin(), labels.cend(), {labels.size(), static_cast<std::size_t>(1)}, -1};
+   info.labels = xgboost::linalg::Matrix<float>{labels.cbegin(),
+                                                labels.cend(),
+                                                {labels.size(), static_cast<std::size_t>(1)},
+                                                xgboost::DeviceOrd::CPU()};
info.weights_.HostVector() = weights;
info.group_ptr_ = groups;
@@ -171,8 +175,9 @@ xgboost::bst_float GetMetricEval(xgboost::Metric* metric,
xgboost::DataSplitMode data_split_mode) {
return GetMultiMetricEval(
metric, preds,
-   xgboost::linalg::Tensor<float, 2>{labels.begin(), labels.end(), {labels.size()}, -1}, weights,
-   groups, data_split_mode);
+   xgboost::linalg::Tensor<float, 2>{
+       labels.begin(), labels.end(), {labels.size()}, xgboost::DeviceOrd::CPU()},
+   weights, groups, data_split_mode);
}
double GetMultiMetricEval(xgboost::Metric* metric,
@@ -215,7 +220,7 @@ void RandomDataGenerator::GenerateLabels(std::shared_ptr<DMatrix> p_fmat) const
p_fmat->Info().labels.Data());
CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_);
p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_);
-   if (device_ != Context::kCpuId) {
+   if (device_.IsCUDA()) {
p_fmat->Info().labels.SetDevice(device_);
}
}
@@ -236,7 +241,7 @@ void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
v = dist(&lcg);
}
}
-   if (device_ >= 0) {
+   if (device_.IsCUDA()) {
out->SetDevice(device_);
out->DeviceSpan();
}
@@ -258,7 +263,7 @@ std::string RandomDataGenerator::GenerateArrayInterface(
std::pair<std::vector<std::string>, std::string> MakeArrayInterfaceBatch(
HostDeviceVector<float> const* storage, std::size_t n_samples, bst_feature_t n_features,
-   std::size_t batches, std::int32_t device) {
+   std::size_t batches, DeviceOrd device) {
std::vector<std::string> result(batches);
std::vector<Json> objects;
@@ -267,7 +272,7 @@ std::pair<std::vector<std::string>, std::string> MakeArrayInterfaceBatch(
auto make_interface = [storage, device, n_features](std::size_t offset, std::size_t rows) {
Json array_interface{Object()};
array_interface["data"] = std::vector<Json>(2);
-   if (device >= 0) {
+   if (device.IsCUDA()) {
array_interface["data"][0] =
Integer(reinterpret_cast<int64_t>(storage->DevicePointer() + offset));
array_interface["stream"] = Null{};
@@ -359,7 +364,7 @@ void RandomDataGenerator::GenerateCSR(
h_rptr.emplace_back(rptr);
}
-   if (device_ >= 0) {
+   if (device_.IsCUDA()) {
value->SetDevice(device_);
value->DeviceSpan();
row_ptr->SetDevice(device_);
@@ -400,7 +405,7 @@ void RandomDataGenerator::GenerateCSR(
out->Info().labels.Reshape(this->rows_, this->n_targets_);
}
}
-   if (device_ >= 0) {
+   if (device_.IsCUDA()) {
out->Info().labels.SetDevice(device_);
out->Info().feature_types.SetDevice(device_);
for (auto const& page : out->GetBatches<SparsePage>()) {
@@ -423,7 +428,7 @@ void RandomDataGenerator::GenerateCSR(
CHECK_GE(this->n_batches_, 1)
<< "Must set the n_batches before generating an external memory DMatrix.";
std::unique_ptr<ArrayIterForTest> iter;
-   if (device_ == Context::kCpuId) {
+   if (device_.IsCPU()) {
iter = std::make_unique<NumpyArrayIterForTest>(this->sparsity_, rows_, cols_, n_batches_);
} else {
#if defined(XGBOOST_USE_CUDA)
@@ -487,7 +492,7 @@ int CudaArrayIterForTest::Next() {
NumpyArrayIterForTest::NumpyArrayIterForTest(float sparsity, size_t rows, size_t cols,
size_t batches)
: ArrayIterForTest{sparsity, rows, cols, batches} {
-   rng_->Device(Context::kCpuId);
+   rng_->Device(DeviceOrd::CPU());
std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
this->Reset();
}
@@ -644,8 +649,8 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(std::string name, Args kwargs,
labels[i] = i;
}
p_dmat->Info().labels =
-   linalg::Tensor<float, 2>{labels.cbegin(), labels.cend(), {labels.size()}, -1};
-   linalg::Matrix<GradientPair> gpair({kRows}, ctx->Ordinal());
+   linalg::Tensor<float, 2>{labels.cbegin(), labels.cend(), {labels.size()}, DeviceOrd::CPU()};
+   linalg::Matrix<GradientPair> gpair({kRows}, ctx->Device());
auto h_gpair = gpair.HostView();
for (size_t i = 0; i < kRows; ++i) {
h_gpair(i) = GradientPair{static_cast<float>(i), 1};
@@ -674,7 +679,7 @@ ArrayIterForTest::ArrayIterForTest(Context const* ctx, HostDeviceVector<float> c
CHECK_EQ(this->data_.Size(), rows_ * cols_ * n_batches);
this->data_.Copy(data);
std::tie(batches_, interface_) =
-   MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, ctx->gpu_id);
+   MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, ctx->Device());
}
ArrayIterForTest::~ArrayIterForTest() { XGDMatrixFree(proxy_); }


@@ -9,7 +9,7 @@ namespace xgboost {
CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows,
size_t cols, size_t batches)
: ArrayIterForTest{sparsity, rows, cols, batches} {
-   rng_->Device(0);
+   rng_->Device(FstCU());
std::tie(batches_, interface_) =
rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
this->Reset();


@@ -231,7 +231,7 @@ class RandomDataGenerator {
bst_target_t n_targets_{1};
-   std::int32_t device_{Context::kCpuId};
+   DeviceOrd device_{DeviceOrd::CPU()};
std::size_t n_batches_{0};
std::uint64_t seed_{0};
SimpleLCG lcg_;
@@ -256,7 +256,7 @@ class RandomDataGenerator {
upper_ = v;
return *this;
}
-   RandomDataGenerator& Device(int32_t d) {
+   RandomDataGenerator& Device(DeviceOrd d) {
device_ = d;
return *this;
}
@@ -391,7 +391,7 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(std::string name, Args kwargs,
* \brief Make a context that uses CUDA if device >= 0.
*/
inline Context MakeCUDACtx(std::int32_t device) {
-   if (device == Context::kCpuId) {
+   if (device == DeviceOrd::CPUOrdinal()) {
return Context{};
}
return Context{}.MakeCUDA(device);
@@ -501,7 +501,7 @@ RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv);
* \brief Make learner model param
*/
inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint32_t n_groups,
-   int32_t device = Context::kCpuId) {
+   DeviceOrd device = DeviceOrd::CPU()) {
size_t shape[1]{1};
LearnerModelParam mparam(n_features, linalg::Tensor<float, 1>{{base_score}, shape, device},
n_groups, 1, MultiStrategy::kOneOutputPerTree);
@@ -571,4 +571,5 @@ class BaseMGPUTest : public ::testing::Test {
class DeclareUnifiedDistributedTest(MetricTest) : public BaseMGPUTest{};
+   inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); }
} // namespace xgboost
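
For orientation, a short usage sketch of the updated test helpers; the header path and the concrete values are assumptions, and the snippet is not part of the diff:

    #include "helpers.h"  // assumed path of the test-helper header shown above

    namespace xgboost {
    inline void ExampleHelperUsage() {
      Context cuda_ctx = MakeCUDACtx(0);   // ordinal >= 0 selects CUDA device 0
      Context cpu_ctx = MakeCUDACtx(-1);   // DeviceOrd::CPUOrdinal() keeps the default CPU context
      // FstCU() is shorthand for DeviceOrd::CUDA(0), as defined above.
      auto rng = RandomDataGenerator{256, 16, 0.5f}.Device(FstCU()).Bins(16);
      LearnerModelParam mparam = MakeMP(16, 0.5, 1);  // device defaults to DeviceOrd::CPU()
      (void)cuda_ctx; (void)cpu_ctx; (void)rng; (void)mparam;
    }
    }  // namespace xgboost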


@@ -1,3 +1,8 @@
+   /**
+    * Copyright 2020-2023, XGBoost contributors
+    */
+   #pragma once
#if defined(__CUDACC__)
#include "../../src/data/ellpack_page.cuh"
#endif
@@ -24,8 +29,8 @@ class HistogramCutsWrapper : public common::HistogramCuts {
};
} // anonymous namespace
-   inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(
-       int n_rows, int n_cols, bst_float sparsity= 0) {
+   inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(int n_rows, int n_cols,
+                                                            bst_float sparsity = 0) {
auto dmat = RandomDataGenerator(n_rows, n_cols, sparsity).Seed(3).GenerateDMatrix();
const SparsePage& batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
@@ -49,7 +54,7 @@ inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(
}
auto page = std::unique_ptr<EllpackPageImpl>(
-   new EllpackPageImpl(0, cmat, batch, dmat->IsDense(), row_stride, {}));
+   new EllpackPageImpl(DeviceOrd::CUDA(0), cmat, batch, dmat->IsDense(), row_stride, {}));
return page;
}


@@ -28,7 +28,7 @@ inline void VerifyBinaryAUC(DataSplitMode data_split_mode = DataSplitMode::kRow)
// Invalid dataset
auto p_fmat = EmptyDMatrix();
MetaInfo& info = p_fmat->Info();
-   info.labels = linalg::Tensor<float, 2>{{0.0f, 0.0f}, {2}, -1};
+   info.labels = linalg::Tensor<float, 2>{{0.0f, 0.0f}, {2}, DeviceOrd::CPU()};
float auc = metric->Evaluate({1, 1}, p_fmat);
ASSERT_TRUE(std::isnan(auc));
*info.labels.Data() = HostDeviceVector<float>{};

Some files were not shown because too many files have changed in this diff.