Remove internal use of gpu_id. (#9568)

2023-09-20 23:29:51 +08:00
parent 38ac52dd87
commit 8c676c889d
121 changed files with 1012 additions and 1044 deletions
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -687,13 +687,13 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col

  linalg::Stack(&this->labels, that.labels);

-  this->weights_.SetDevice(that.weights_.DeviceIdx());
+  this->weights_.SetDevice(that.weights_.Device());
  this->weights_.Extend(that.weights_);

-  this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.DeviceIdx());
+  this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.Device());
  this->labels_lower_bound_.Extend(that.labels_lower_bound_);

-  this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.DeviceIdx());
+  this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.Device());
  this->labels_upper_bound_.Extend(that.labels_upper_bound_);

  linalg::Stack(&this->base_margin_, that.base_margin_);
@@ -723,7 +723,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
  }
  if (!that.feature_weights.Empty()) {
    this->feature_weights.Resize(that.feature_weights.Size());
-    this->feature_weights.SetDevice(that.feature_weights.DeviceIdx());
+    this->feature_weights.SetDevice(that.feature_weights.Device());
    this->feature_weights.Copy(that.feature_weights);
  }
 }
@@ -738,22 +738,22 @@ void MetaInfo::SynchronizeNumberOfColumns() {

 namespace {
 template <typename T>
-void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) {
-  bool valid = v.Device().IsCPU() || device == Context::kCpuId || v.DeviceIdx() == device;
+void CheckDevice(DeviceOrd device, HostDeviceVector<T> const& v) {
+  bool valid = v.Device().IsCPU() || device.IsCPU() || v.Device() == device;
  if (!valid) {
    LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
                  "the booster. The device ordinal of the data is: "
-               << v.DeviceIdx() << "; the device ordinal of the Booster is: " << device;
+               << v.Device() << "; the device ordinal of the Booster is: " << device;
  }
 }

 template <typename T, std::int32_t D>
-void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) {
+void CheckDevice(DeviceOrd device, linalg::Tensor<T, D> const& v) {
  CheckDevice(device, *v.Data());
 }
 }  // anonymous namespace

-void MetaInfo::Validate(std::int32_t device) const {
+void MetaInfo::Validate(DeviceOrd device) const {
  if (group_ptr_.size() != 0 && weights_.Size() != 0) {
    CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << error::GroupWeight();
    return;
--- a/src/data/data.cu
+++ b/src/data/data.cu
@@ -29,13 +29,13 @@ template <typename T, int32_t D>
 void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tensor<T, D>* p_out) {
  ArrayInterface<D> array(arr_interface);
  if (array.n == 0) {
-    p_out->SetDevice(0);
+    p_out->SetDevice(DeviceOrd::CUDA(0));
    p_out->Reshape(array.shape);
    return;
  }
  CHECK_EQ(array.valid.Capacity(), 0)
      << "Meta info like label or weight can not have missing value.";
-  auto ptr_device = SetDeviceToPtr(array.data);
+  auto ptr_device = DeviceOrd::CUDA(SetDeviceToPtr(array.data));
  p_out->SetDevice(ptr_device);

  if (array.is_contiguous && array.type == ToDType<T>::kType) {
@@ -50,7 +50,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
    return;
  }
  p_out->Reshape(array.shape);
-  auto t = p_out->View(DeviceOrd::CUDA(ptr_device));
+  auto t = p_out->View(ptr_device);
  linalg::ElementWiseTransformDevice(
      t,
      [=] __device__(size_t i, T) {
@@ -86,7 +86,7 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
      });
  dh::caching_device_vector<bool> flag(1);
  auto d_flag = dh::ToSpan(flag);
-  auto d = SetDeviceToPtr(array_interface.data);
+  auto d = DeviceOrd::CUDA(SetDeviceToPtr(array_interface.data));
  dh::LaunchN(1, [=] __device__(size_t) { d_flag[0] = true; });
  dh::LaunchN(array_interface.Shape(0) - 1, [=] __device__(size_t i) {
    auto typed = TypedIndex<uint32_t, 1>{array_interface};
--- a/src/data/device_adapter.cuh
+++ b/src/data/device_adapter.cuh
@@ -28,8 +28,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
  CudfAdapterBatch(common::Span<ArrayInterface<1>> columns, size_t num_rows)
      : columns_(columns),
        num_rows_(num_rows) {}
-  size_t Size() const { return num_rows_ * columns_.size(); }
-  __device__ __forceinline__ COOTuple GetElement(size_t idx) const {
+  [[nodiscard]] std::size_t Size() const { return num_rows_ * columns_.size(); }
+  [[nodiscard]] __device__ __forceinline__ COOTuple GetElement(size_t idx) const {
    size_t column_idx = idx % columns_.size();
    size_t row_idx = idx / columns_.size();
    auto const& column = columns_[column_idx];
@@ -39,7 +39,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
    return {row_idx, column_idx, value};
  }

-  __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
+  [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
    auto const& column = columns_[fidx];
    float value = column.valid.Data() == nullptr || column.valid.Check(ridx)
                      ? column(ridx)
@@ -47,8 +47,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
    return value;
  }

-  XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
-  XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }
+  [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
+  [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }

 private:
  common::Span<ArrayInterface<1>> columns_;
@@ -120,14 +120,14 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
      return;
    }

-    device_idx_ = dh::CudaGetPointerDevice(first_column.data);
-    CHECK_NE(device_idx_, Context::kCpuId);
-    dh::safe_cuda(cudaSetDevice(device_idx_));
+    device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(first_column.data));
+    CHECK(device_.IsCUDA());
+    dh::safe_cuda(cudaSetDevice(device_.ordinal));
    for (auto& json_col : json_columns) {
      auto column = ArrayInterface<1>(get<Object const>(json_col));
      columns.push_back(column);
      num_rows_ = std::max(num_rows_, column.Shape(0));
-      CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data))
+      CHECK_EQ(device_.ordinal, dh::CudaGetPointerDevice(column.data))
          << "All columns should use the same device.";
      CHECK_EQ(num_rows_, column.Shape(0))
          << "All columns should have same number of rows.";
@@ -143,15 +143,15 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
    return batch_;
  }

-  size_t NumRows() const { return num_rows_; }
-  size_t NumColumns() const { return columns_.size(); }
-  int32_t DeviceIdx() const { return device_idx_; }
+  [[nodiscard]] std::size_t NumRows() const { return num_rows_; }
+  [[nodiscard]] std::size_t NumColumns() const { return columns_.size(); }
+  [[nodiscard]] DeviceOrd Device() const { return device_; }

 private:
  CudfAdapterBatch batch_;
  dh::device_vector<ArrayInterface<1>> columns_;
  size_t num_rows_{0};
-  int32_t device_idx_{Context::kCpuId};
+  DeviceOrd device_{DeviceOrd::CPU()};
 };

 class CupyAdapterBatch : public detail::NoMetaInfo {
@@ -159,22 +159,22 @@ class CupyAdapterBatch : public detail::NoMetaInfo {
  CupyAdapterBatch() = default;
  explicit CupyAdapterBatch(ArrayInterface<2> array_interface)
    : array_interface_(std::move(array_interface)) {}
-  size_t Size() const {
+  [[nodiscard]] std::size_t Size() const {
    return array_interface_.Shape(0) * array_interface_.Shape(1);
  }
-  __device__ COOTuple GetElement(size_t idx) const {
+  [[nodiscard]]__device__ COOTuple GetElement(size_t idx) const {
    size_t column_idx = idx % array_interface_.Shape(1);
    size_t row_idx = idx / array_interface_.Shape(1);
    float value = array_interface_(row_idx, column_idx);
    return {row_idx, column_idx, value};
  }
-  __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
+  [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
    float value = array_interface_(ridx, fidx);
    return value;
  }

-  XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
-  XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
+  [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
+  [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }

 private:
  ArrayInterface<2> array_interface_;
@@ -189,28 +189,28 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
    if (array_interface_.Shape(0) == 0) {
      return;
    }
-    device_idx_ = dh::CudaGetPointerDevice(array_interface_.data);
-    CHECK_NE(device_idx_, Context::kCpuId);
+    device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(array_interface_.data));
+    CHECK(device_.IsCUDA());
  }
  explicit CupyAdapter(std::string cuda_interface_str)
      : CupyAdapter{StringView{cuda_interface_str}} {}
-  const CupyAdapterBatch& Value() const override { return batch_; }
+  [[nodiscard]] const CupyAdapterBatch& Value() const override { return batch_; }

-  size_t NumRows() const { return array_interface_.Shape(0); }
-  size_t NumColumns() const { return array_interface_.Shape(1); }
-  int32_t DeviceIdx() const { return device_idx_; }
+  [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
+  [[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }
+  [[nodiscard]] DeviceOrd Device() const { return device_; }

 private:
  ArrayInterface<2> array_interface_;
  CupyAdapterBatch batch_;
-  int32_t device_idx_ {Context::kCpuId};
+  DeviceOrd device_{DeviceOrd::CPU()};
 };

 // Returns maximum row length
 template <typename AdapterBatchT>
-std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
+std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, DeviceOrd device,
                         float missing) {
-  dh::safe_cuda(cudaSetDevice(device_idx));
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
  IsValidFunctor is_valid(missing);
  dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));

--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -94,22 +94,18 @@ __global__ void CompressBinEllpackKernel(
 }

 // Construct an ELLPACK matrix with the given number of empty rows.
-EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
-                                 bool is_dense, size_t row_stride,
-                                 size_t n_rows)
-    : is_dense(is_dense),
-      cuts_(std::move(cuts)),
-      row_stride(row_stride),
-      n_rows(n_rows) {
+EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense,
+                                 size_t row_stride, size_t n_rows)
+    : is_dense(is_dense), cuts_(std::move(cuts)), row_stride(row_stride), n_rows(n_rows) {
  monitor_.Init("ellpack_page");
-  dh::safe_cuda(cudaSetDevice(device));
+  dh::safe_cuda(cudaSetDevice(device.ordinal));

  monitor_.Start("InitCompressedData");
  InitCompressedData(device);
  monitor_.Stop("InitCompressedData");
 }

-EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
+EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts,
                                 const SparsePage &page, bool is_dense,
                                 size_t row_stride,
                                 common::Span<FeatureType const> feature_types)
@@ -123,7 +119,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
 EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
    : is_dense(dmat->IsDense()) {
  monitor_.Init("ellpack_page");
-  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
+  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));

  n_rows = dmat->Info().num_row_;

@@ -138,15 +134,15 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
  monitor_.Stop("Quantiles");

  monitor_.Start("InitCompressedData");
-  this->InitCompressedData(ctx->gpu_id);
+  this->InitCompressedData(ctx->Device());
  monitor_.Stop("InitCompressedData");

-  dmat->Info().feature_types.SetDevice(ctx->gpu_id);
+  dmat->Info().feature_types.SetDevice(ctx->Device());
  auto ft = dmat->Info().feature_types.ConstDeviceSpan();
  monitor_.Start("BinningCompression");
  CHECK(dmat->SingleColBlock());
  for (const auto& batch : dmat->GetBatches<SparsePage>()) {
-    CreateHistIndices(ctx->gpu_id, batch, ft);
+    CreateHistIndices(ctx->Device(), batch, ft);
  }
  monitor_.Stop("BinningCompression");
 }
@@ -209,7 +205,7 @@ struct TupleScanOp {
 // to remove missing data
 template <typename AdapterBatchT>
 void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType const> feature_types,
-                       EllpackPageImpl* dst, int device_idx, float missing) {
+                       EllpackPageImpl* dst, DeviceOrd device, float missing) {
  // Some witchcraft happens here
  // The goal is to copy valid elements out of the input to an ELLPACK matrix
  // with a given row stride, using no extra working memory Standard stream
@@ -241,7 +237,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
  // Tuple[2] = The index in the input data
  using Tuple = thrust::tuple<size_t, size_t, size_t>;

-  auto device_accessor = dst->GetDeviceAccessor(device_idx);
+  auto device_accessor = dst->GetDeviceAccessor(device);
  common::CompressedBufferWriter writer(device_accessor.NumSymbols());
  auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();

@@ -280,10 +276,9 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
 #endif
 }

-void WriteNullValues(EllpackPageImpl* dst, int device_idx,
-                     common::Span<size_t> row_counts) {
+void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span<size_t> row_counts) {
  // Write the null values
-  auto device_accessor = dst->GetDeviceAccessor(device_idx);
+  auto device_accessor = dst->GetDeviceAccessor(device);
  common::CompressedBufferWriter writer(device_accessor.NumSymbols());
  auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
  auto row_stride = dst->row_stride;
@@ -300,11 +295,11 @@ void WriteNullValues(EllpackPageImpl* dst, int device_idx,
 }

 template <typename AdapterBatch>
-EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense,
+EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
                                 common::Span<size_t> row_counts_span,
                                 common::Span<FeatureType const> feature_types, size_t row_stride,
                                 size_t n_rows, common::HistogramCuts const& cuts) {
-  dh::safe_cuda(cudaSetDevice(device));
+  dh::safe_cuda(cudaSetDevice(device.ordinal));

  *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
  CopyDataToEllpack(batch, feature_types, this, device, missing);
@@ -313,7 +308,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,

 #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T)                                                \
  template EllpackPageImpl::EllpackPageImpl(                                               \
-      __BATCH_T batch, float missing, int device, bool is_dense,                           \
+      __BATCH_T batch, float missing, DeviceOrd device, bool is_dense,                     \
      common::Span<size_t> row_counts_span, common::Span<FeatureType const> feature_types, \
      size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts);

@@ -370,9 +365,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
      [&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; });
  row_stride = *std::max_element(it, it + page.Size());

-  CHECK_GE(ctx->gpu_id, 0);
+  CHECK(ctx->IsCUDA());
  monitor_.Start("InitCompressedData");
-  InitCompressedData(ctx->gpu_id);
+  InitCompressedData(ctx->Device());
  monitor_.Stop("InitCompressedData");

  // copy gidx
@@ -382,7 +377,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
  dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
                                cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));

-  auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
+  auto accessor = this->GetDeviceAccessor(ctx->Device(), ft);
  auto null = accessor.NullValue();
  CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null);
 }
@@ -407,8 +402,7 @@ struct CopyPage {
 };

 // Copy the data from the given EllpackPage to the current page.
-size_t EllpackPageImpl::Copy(int device, EllpackPageImpl const *page,
-                             size_t offset) {
+size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) {
  monitor_.Start("Copy");
  size_t num_elements = page->n_rows * page->row_stride;
  CHECK_EQ(row_stride, page->row_stride);
@@ -468,7 +462,7 @@ struct CompactPage {
 };

 // Compacts the data from the given EllpackPage into the current page.
-void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
+void EllpackPageImpl::Compact(DeviceOrd device, EllpackPageImpl const* page,
                              common::Span<size_t> row_indexes) {
  monitor_.Start("Compact");
  CHECK_EQ(row_stride, page->row_stride);
@@ -481,13 +475,12 @@ void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
 }

 // Initialize the buffer to stored compressed features.
-void EllpackPageImpl::InitCompressedData(int device) {
+void EllpackPageImpl::InitCompressedData(DeviceOrd device) {
  size_t num_symbols = NumSymbols();

  // Required buffer size for storing data matrix in ELLPack format.
  size_t compressed_size_bytes =
-    common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows,
-      num_symbols);
+      common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols);
  gidx_buffer.SetDevice(device);
  // Don't call fill unnecessarily
  if (gidx_buffer.Size() == 0) {
@@ -499,7 +492,7 @@ void EllpackPageImpl::InitCompressedData(int device) {
 }

 // Compress a CSR page into ELLPACK.
-void EllpackPageImpl::CreateHistIndices(int device,
+void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
                                        const SparsePage& row_batch,
                                        common::Span<FeatureType const> feature_types) {
  if (row_batch.Size() == 0) return;
@@ -509,7 +502,7 @@ void EllpackPageImpl::CreateHistIndices(int device,

  // bin and compress entries in batches of rows
  size_t gpu_batch_nrows =
-      std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
+      std::min(dh::TotalMemory(device.ordinal) / (16 * row_stride * sizeof(Entry)),
               static_cast<size_t>(row_batch.Size()));

  size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows);
@@ -572,7 +565,7 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,
 }

 EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
-    int device, common::Span<FeatureType const> feature_types) const {
+    DeviceOrd device, common::Span<FeatureType const> feature_types) const {
  gidx_buffer.SetDevice(device);
  return {device,
          cuts_,
@@ -586,7 +579,7 @@ EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
 }
 EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor(
    common::Span<FeatureType const> feature_types) const {
-  return {Context::kCpuId,
+  return {DeviceOrd::CPU(),
          cuts_,
          is_dense,
          row_stride,
--- a/src/data/ellpack_page.cuh
+++ b/src/data/ellpack_page.cuh
@@ -35,16 +35,17 @@ struct EllpackDeviceAccessor {

  common::Span<const FeatureType> feature_types;

-  EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts,
-                        bool is_dense, size_t row_stride, size_t base_rowid,
-                        size_t n_rows,common::CompressedIterator<uint32_t> gidx_iter,
+  EllpackDeviceAccessor(DeviceOrd device, const common::HistogramCuts& cuts, bool is_dense,
+                        size_t row_stride, size_t base_rowid, size_t n_rows,
+                        common::CompressedIterator<uint32_t> gidx_iter,
                        common::Span<FeatureType const> feature_types)
      : is_dense(is_dense),
        row_stride(row_stride),
        base_rowid(base_rowid),
-        n_rows(n_rows) ,gidx_iter(gidx_iter),
+        n_rows(n_rows),
+        gidx_iter(gidx_iter),
        feature_types{feature_types} {
-    if (device == Context::kCpuId) {
+    if (device.IsCPU()) {
      gidx_fvalue_map = cuts.cut_values_.ConstHostSpan();
      feature_segments = cuts.cut_ptrs_.ConstHostSpan();
      min_fvalue = cuts.min_vals_.ConstHostSpan();
@@ -59,7 +60,7 @@ struct EllpackDeviceAccessor {
  }
  // Get a matrix element, uses binary search for look up Return NaN if missing
  // Given a row index and a feature index, returns the corresponding cut value
-  __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
+  [[nodiscard]] __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
    ridx -= base_rowid;
    auto row_begin = row_stride * ridx;
    auto row_end = row_begin + row_stride;
@@ -77,7 +78,7 @@ struct EllpackDeviceAccessor {
  }

  template <bool is_cat>
-  __device__ uint32_t SearchBin(float value, size_t column_id) const {
+  [[nodiscard]] __device__ uint32_t SearchBin(float value, size_t column_id) const {
    auto beg = feature_segments[column_id];
    auto end = feature_segments[column_id + 1];
    uint32_t idx = 0;
@@ -99,7 +100,7 @@ struct EllpackDeviceAccessor {
    return idx;
  }

-  __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const {
+  [[nodiscard]] __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const {
    auto gidx = GetBinIndex(ridx, fidx);
    if (gidx == -1) {
      return nan("");
@@ -108,18 +109,18 @@ struct EllpackDeviceAccessor {
  }

  // Check if the row id is withing range of the current batch.
-  __device__ bool IsInRange(size_t row_id) const {
+  [[nodiscard]] __device__ bool IsInRange(size_t row_id) const {
    return row_id >= base_rowid && row_id < base_rowid + n_rows;
  }
  /*! \brief Return the total number of symbols (total number of bins plus 1 for
   * not found). */
-  XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
+  [[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }

-  XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); }
+  [[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); }

-  XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
+  [[nodiscard]] XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }

-  XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
+  [[nodiscard]] XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
 };


@@ -141,14 +142,13 @@ class EllpackPageImpl {
   * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
   * and the given number of rows.
   */
-  EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense,
-                  size_t row_stride, size_t n_rows);
+  EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense, size_t row_stride,
+                  size_t n_rows);
  /*!
   * \brief Constructor used for external memory.
   */
-  EllpackPageImpl(int device, common::HistogramCuts cuts,
-                  const SparsePage &page, bool is_dense, size_t row_stride,
-                  common::Span<FeatureType const> feature_types);
+  EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, const SparsePage& page,
+                  bool is_dense, size_t row_stride, common::Span<FeatureType const> feature_types);

  /*!
   * \brief Constructor from an existing DMatrix.
@@ -159,7 +159,7 @@ class EllpackPageImpl {
  explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);

  template <typename AdapterBatch>
-  explicit EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense,
+  explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
                           common::Span<size_t> row_counts_span,
                           common::Span<FeatureType const> feature_types, size_t row_stride,
                           size_t n_rows, common::HistogramCuts const& cuts);
@@ -176,7 +176,7 @@ class EllpackPageImpl {
   * @param offset The number of elements to skip before copying.
   * @returns The number of elements copied.
   */
-  size_t Copy(int device, EllpackPageImpl const *page, size_t offset);
+  size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset);

  /*! \brief Compact the given ELLPACK page into the current page.
   *
@@ -184,11 +184,10 @@ class EllpackPageImpl {
   * @param page The ELLPACK page to compact from.
   * @param row_indexes Row indexes for the compacted page.
   */
-  void Compact(int device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);
-
+  void Compact(DeviceOrd device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);

  /*! \return Number of instances in the page. */
-  size_t Size() const;
+  [[nodiscard]] size_t Size() const;

  /*! \brief Set the base row id for this page. */
  void SetBaseRowId(std::size_t row_id) {
@@ -204,12 +203,12 @@ class EllpackPageImpl {

  /*! \brief Return the total number of symbols (total number of bins plus 1 for
   * not found). */
-  size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
+  [[nodiscard]] std::size_t NumSymbols() const { return cuts_.TotalBins() + 1; }

-  EllpackDeviceAccessor
-  GetDeviceAccessor(int device,
-                    common::Span<FeatureType const> feature_types = {}) const;
-  EllpackDeviceAccessor GetHostAccessor(common::Span<FeatureType const> feature_types = {}) const;
+  [[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
+      DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
+  [[nodiscard]] EllpackDeviceAccessor GetHostAccessor(
+      common::Span<FeatureType const> feature_types = {}) const;

 private:
  /*!
@@ -218,13 +217,13 @@ class EllpackPageImpl {
   * @param device The GPU device to use.
   * @param row_batch The CSR page.
   */
-  void CreateHistIndices(int device,
+  void CreateHistIndices(DeviceOrd device,
                         const SparsePage& row_batch,
                         common::Span<FeatureType const> feature_types);
  /*!
   * \brief Initialize the buffer to store compressed features.
   */
-  void InitCompressedData(int device);
+  void InitCompressedData(DeviceOrd device);


 public:
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -10,7 +10,7 @@

 namespace xgboost::data {
 void EllpackPageSource::Fetch() {
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
  if (!this->ReadCache()) {
    if (count_ != 0 && !sync_) {
      // source is initialized to be the 0th page during construction, so when count_ is 0
--- a/src/data/ellpack_page_source.h
+++ b/src/data/ellpack_page_source.h
@@ -23,14 +23,14 @@ class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
  BatchParam param_;
  common::Span<FeatureType const> feature_types_;
  std::unique_ptr<common::HistogramCuts> cuts_;
-  std::int32_t device_;
+  DeviceOrd device_;

 public:
  EllpackPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches,
                    std::shared_ptr<Cache> cache, BatchParam param,
                    std::unique_ptr<common::HistogramCuts> cuts, bool is_dense, size_t row_stride,
                    common::Span<FeatureType const> feature_types,
-                    std::shared_ptr<SparsePageSource> source, std::int32_t device)
+                    std::shared_ptr<SparsePageSource> source, DeviceOrd device)
      : PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache, false),
        is_dense_{is_dense},
        row_stride_{row_stride},
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -36,8 +36,7 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
  auto pctx = MakeProxy(proxy_)->Ctx();

  Context ctx;
-  ctx.UpdateAllowUnknown(
-      Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
+  ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
  // hardcoded parameter.
  BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};

@@ -139,7 +138,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
    return HostAdapterDispatch(proxy, [&](auto const& value) {
      size_t n_threads = ctx->Threads();
      size_t n_features = column_sizes.size();
-      linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, Context::kCpuId);
+      linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, DeviceOrd::CPU());
      column_sizes_tloc.Data()->Fill(0ul);
      auto view = column_sizes_tloc.HostView();
      common::ParallelFor(value.Size(), n_threads, common::Sched::Static(256), [&](auto i) {
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -47,9 +47,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,

  int32_t current_device;
  dh::safe_cuda(cudaGetDevice(&current_device));
-  auto get_device = [&]() -> int32_t {
-    std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
-    CHECK_NE(d, Context::kCpuId);
+  auto get_device = [&]() {
+    auto d = (ctx->IsCPU()) ? DeviceOrd::CUDA(current_device) : ctx->Device();
+    CHECK(!d.IsCPU());
    return d;
  };

@@ -59,9 +59,8 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
  common::HistogramCuts cuts;
  do {
    // We use do while here as the first batch is fetched in ctor
-    // ctx_.gpu_id = proxy->DeviceIdx();
-    CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
-    dh::safe_cuda(cudaSetDevice(get_device()));
+    CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
+    dh::safe_cuda(cudaSetDevice(get_device().ordinal));
    if (cols == 0) {
      cols = num_cols();
      collective::Allreduce<collective::Operation::kMax>(&cols, 1);
@@ -93,7 +92,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
  auto n_features = cols;
  CHECK_GE(n_features, 1) << "Data must has at least 1 column.";

-  dh::safe_cuda(cudaSetDevice(get_device()));
+  dh::safe_cuda(cudaSetDevice(get_device().ordinal));
  if (!ref) {
    HostDeviceVector<FeatureType> ft;
    common::SketchContainer final_sketch(
@@ -132,7 +131,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
  size_t n_batches_for_verification = 0;
  while (iter.Next()) {
    init_page();
-    dh::safe_cuda(cudaSetDevice(get_device()));
+    dh::safe_cuda(cudaSetDevice(get_device().ordinal));
    auto rows = num_rows();
    dh::device_vector<size_t> row_counts(rows + 1, 0);
    common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
@@ -184,18 +183,18 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,
  if (!ellpack_) {
    ellpack_.reset(new EllpackPage());
    if (ctx->IsCUDA()) {
-      this->Info().feature_types.SetDevice(ctx->gpu_id);
+      this->Info().feature_types.SetDevice(ctx->Device());
      *ellpack_->Impl() =
          EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
    } else if (fmat_ctx_.IsCUDA()) {
-      this->Info().feature_types.SetDevice(fmat_ctx_.gpu_id);
+      this->Info().feature_types.SetDevice(fmat_ctx_.Device());
      *ellpack_->Impl() =
          EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
    } else {
      // Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM
      // for cut reference.
      auto cuda_ctx = ctx->MakeCUDA();
-      this->Info().feature_types.SetDevice(cuda_ctx.gpu_id);
+      this->Info().feature_types.SetDevice(cuda_ctx.Device());
      *ellpack_->Impl() =
          EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
    }
--- a/src/data/proxy_dmatrix.cc
+++ b/src/data/proxy_dmatrix.cc
@@ -11,18 +11,18 @@ void DMatrixProxy::SetArrayData(StringView interface_str) {
  this->batch_ = adapter;
  this->Info().num_col_ = adapter->NumColumns();
  this->Info().num_row_ = adapter->NumRows();
-  this->ctx_.gpu_id = Context::kCpuId;
+  this->ctx_.Init(Args{{"device", "cpu"}});
 }

-void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices,
-                              char const *c_values, bst_feature_t n_features, bool on_host) {
+void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, char const *c_values,
+                              bst_feature_t n_features, bool on_host) {
  CHECK(on_host) << "Not implemented on device.";
  std::shared_ptr<CSRArrayAdapter> adapter{new CSRArrayAdapter(
      StringView{c_indptr}, StringView{c_indices}, StringView{c_values}, n_features)};
  this->batch_ = adapter;
  this->Info().num_col_ = adapter->NumColumns();
  this->Info().num_row_ = adapter->NumRows();
-  this->ctx_.gpu_id = Context::kCpuId;
+  this->ctx_.Init(Args{{"device", "cpu"}});
 }

 namespace cuda_impl {
--- a/src/data/proxy_dmatrix.cu
+++ b/src/data/proxy_dmatrix.cu
@@ -11,13 +11,13 @@ void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
  this->batch_ = adapter;
  this->Info().num_col_ = adapter->NumColumns();
  this->Info().num_row_ = adapter->NumRows();
-  if (adapter->DeviceIdx() < 0) {
+  if (adapter->Device().IsCPU()) {
    // empty data
    CHECK_EQ(this->Info().num_row_, 0);
    ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
    return;
  }
-  ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
+  ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
 }

 void DMatrixProxy::FromCudaArray(StringView interface_str) {
@@ -25,13 +25,13 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) {
  this->batch_ = adapter;
  this->Info().num_col_ = adapter->NumColumns();
  this->Info().num_row_ = adapter->NumRows();
-  if (adapter->DeviceIdx() < 0) {
+  if (adapter->Device().IsCPU()) {
    // empty data
    CHECK_EQ(this->Info().num_row_, 0);
    ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
    return;
  }
-  ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
+  ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
 }

 namespace cuda_impl {
--- a/src/data/proxy_dmatrix.h
+++ b/src/data/proxy_dmatrix.h
@@ -46,7 +46,7 @@ class DMatrixProxy : public DMatrix {
 #endif  // defined(XGBOOST_USE_CUDA)

 public:
-  int DeviceIdx() const { return ctx_.gpu_id; }
+  DeviceOrd Device() const { return ctx_.Device(); }

  void SetCUDAArray(char const* c_interface) {
    common::AssertGPUSupport();
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -253,7 +253,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
    }
    if (batch.BaseMargin() != nullptr) {
      info_.base_margin_ = decltype(info_.base_margin_){
-          batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, Context::kCpuId};
+          batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, DeviceOrd::CPU()};
    }
    if (batch.Qid() != nullptr) {
      qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size());
--- a/src/data/simple_dmatrix.cu
+++ b/src/data/simple_dmatrix.cu
@@ -10,9 +10,7 @@
 #include "xgboost/context.h"  // for Context
 #include "xgboost/data.h"

-namespace xgboost {
-namespace data {
-
+namespace xgboost::data {
 // Does not currently support metainfo as no on-device data source contains this
 // Current implementation assumes a single batch. More batches can
 // be supported in future. Does not currently support inferring row/column size
@@ -21,13 +19,14 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
                             DataSplitMode data_split_mode) {
  CHECK(data_split_mode != DataSplitMode::kCol)
      << "Column-wise data split is currently not supported on the GPU.";
-  auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice()
-                                                                      : adapter->DeviceIdx();
-  CHECK_GE(device, 0);
-  dh::safe_cuda(cudaSetDevice(device));
+  auto device = (adapter->Device().IsCPU() || adapter->NumRows() == 0)
+                    ? DeviceOrd::CUDA(dh::CurrentDevice())
+                    : adapter->Device();
+  CHECK(device.IsCUDA());
+  dh::safe_cuda(cudaSetDevice(device.ordinal));

  Context ctx;
-  ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});
+  ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", device.Name()}});

  CHECK(adapter->NumRows() != kAdapterUnknownSize);
  CHECK(adapter->NumColumns() != kAdapterUnknownSize);
@@ -52,5 +51,4 @@ template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
                                      int nthread, DataSplitMode data_split_mode);
 template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing,
                                      int nthread, DataSplitMode data_split_mode);
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
--- a/src/data/simple_dmatrix.cuh
+++ b/src/data/simple_dmatrix.cuh
@@ -40,9 +40,9 @@ void CopyDataToDMatrix(AdapterBatchT batch, common::Span<Entry> data,
 }

 template <typename AdapterBatchT>
-void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
-                     int device_idx, float missing) {
-  dh::safe_cuda(cudaSetDevice(device_idx));
+void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset, DeviceOrd device,
+                     float missing) {
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
  IsValidFunctor is_valid(missing);
  // Count elements per row
  dh::LaunchN(batch.Size(), [=] __device__(size_t idx) {
@@ -55,14 +55,13 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
  });

  dh::XGBCachingDeviceAllocator<char> alloc;
-  thrust::exclusive_scan(thrust::cuda::par(alloc),
-      thrust::device_pointer_cast(offset.data()),
-      thrust::device_pointer_cast(offset.data() + offset.size()),
-      thrust::device_pointer_cast(offset.data()));
+  thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
+                         thrust::device_pointer_cast(offset.data() + offset.size()),
+                         thrust::device_pointer_cast(offset.data()));
 }

 template <typename AdapterBatchT>
-size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing,
+size_t CopyToSparsePage(AdapterBatchT const& batch, DeviceOrd device, float missing,
                        SparsePage* page) {
  bool valid = NoInfInData(batch, IsValidFunctor{missing});
  CHECK(valid) << error::InfInData();
--- a/src/data/sparse_page_dmatrix.cu
+++ b/src/data/sparse_page_dmatrix.cu
@@ -45,7 +45,8 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
    ellpack_page_source_.reset();  // make sure resource is released before making new ones.
    ellpack_page_source_ = std::make_shared<EllpackPageSource>(
        this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
-        param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id);
+        param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_,
+        ctx->Device());
  } else {
    CHECK(sparse_page_source_);
    ellpack_page_source_->Reset();
--- a/src/data/sparse_page_source.cu
+++ b/src/data/sparse_page_source.cu
@@ -19,11 +19,11 @@ std::size_t NFeaturesDevice(DMatrixProxy *proxy) {
 }  // namespace detail

 void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
-  auto device = proxy->DeviceIdx();
-  if (device < 0) {
-    device = dh::CurrentDevice();
+  auto device = proxy->Device();
+  if (device.IsCPU()) {
+    device = DeviceOrd::CUDA(dh::CurrentDevice());
  }
-  CHECK_GE(device, 0);
+  CHECK(device.IsCUDA());

  cuda_impl::Dispatch(proxy,
                      [&](auto const &value) { CopyToSparsePage(value, device, missing, page); });