Remove internal use of gpu_id. (#9568)

2023-09-20 23:29:51 +08:00
parent 38ac52dd87
commit 8c676c889d
121 changed files with 1012 additions and 1044 deletions
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -123,7 +123,7 @@ void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* s
      [=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
 }

-void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
+void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
                                dh::device_vector<Entry>* p_sorted_entries,
                                dh::device_vector<float>* p_sorted_weights,
                                dh::caching_device_vector<size_t>* p_column_sizes_scan) {
@@ -240,13 +240,13 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
      sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple {
        return {0, e.index, e.fvalue};  // row_idx is not needed for scaning column size.
      });
-  detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature,
+  detail::GetColumnSizesScan(ctx->Device(), info.num_col_, num_cuts_per_feature,
                             IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
                             &column_sizes_scan);
  auto d_cuts_ptr = cuts_ptr.DeviceSpan();
  if (sketch_container->HasCategorical()) {
    auto p_weight = entry_weight.empty() ? nullptr : &entry_weight;
-    detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight,
+    detail::RemoveDuplicatedCategories(ctx->Device(), info, d_cuts_ptr, &sorted_entries, p_weight,
                                       &column_sizes_scan);
  }

@@ -347,7 +347,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b

  HistogramCuts cuts;
  SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_,
-                                   ctx->Ordinal());
+                                   ctx->Device());
  CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty());
  for (const auto& page : p_fmat->GetBatches<SparsePage>()) {
    std::size_t page_nnz = page.data.Size();
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@@ -82,9 +82,9 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
 }

 template <std::uint32_t kBlockThreads, typename Kernel>
-std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
+std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shared_mem) {
  int n_mps = 0;
-  dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
+  dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device.ordinal));
  int n_blocks_per_mp = 0;
  dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
                                                              kBlockThreads, shared_mem));
@@ -106,11 +106,11 @@ std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t s
 * \param out_column_size Output buffer for the size of each column.
 */
 template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false>
-void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter,
+void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan<BatchIt> batch_iter,
                               data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) {
  thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0);

-  std::size_t max_shared_memory = dh::MaxSharedMemory(device);
+  std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal);
  // Not strictly correct as we should use number of samples to determine the type of
  // counter. However, the sample size is not known due to sliding window on number of
  // elements.
@@ -154,7 +154,7 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter
 }

 template <typename BatchIt>
-void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature,
+void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature,
                        IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
                        HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
                        dh::caching_device_vector<size_t>* column_sizes_scan) {
@@ -215,7 +215,8 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
 // Count the valid entries in each column and copy them out.
 template <typename AdapterBatch, typename BatchIter>
 void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range,
-                            float missing, size_t columns, size_t cuts_per_feature, int device,
+                            float missing, size_t columns, size_t cuts_per_feature,
+                            DeviceOrd device,
                            HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
                            dh::caching_device_vector<size_t>* column_sizes_scan,
                            dh::device_vector<Entry>* sorted_entries) {
@@ -239,7 +240,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
 void SortByWeight(dh::device_vector<float>* weights,
                  dh::device_vector<Entry>* sorted_entries);

-void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
+void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
                                dh::device_vector<Entry>* p_sorted_entries,
                                dh::device_vector<float>* p_sorted_weights,
                                dh::caching_device_vector<size_t>* p_column_sizes_scan);
@@ -277,7 +278,7 @@ inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t

 template <typename AdapterBatch>
 void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
-                          int device, size_t columns, size_t begin, size_t end,
+                          DeviceOrd device, size_t columns, size_t begin, size_t end,
                          float missing, SketchContainer *sketch_container,
                          int num_cuts) {
  // Copy current subset of valid elements into temporary storage and sort
@@ -316,11 +317,11 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
 template <typename Batch>
 void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
                                  int num_cuts_per_feature,
-                                  bool is_ranking, float missing, int device,
+                                  bool is_ranking, float missing, DeviceOrd device,
                                  size_t columns, size_t begin, size_t end,
                                  SketchContainer *sketch_container) {
  dh::XGBCachingDeviceAllocator<char> alloc;
-  dh::safe_cuda(cudaSetDevice(device));
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
  info.weights_.SetDevice(device);
  auto weights = info.weights_.ConstDeviceSpan();

@@ -412,14 +413,14 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
  size_t num_rows = batch.NumRows();
  size_t num_cols = batch.NumCols();
  size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows);
-  int32_t device = sketch_container->DeviceIdx();
+  auto device = sketch_container->DeviceIdx();
  bool weighted = !info.weights_.Empty();

  if (weighted) {
    sketch_batch_num_elements = detail::SketchBatchNumElements(
        sketch_batch_num_elements,
        num_rows, num_cols, std::numeric_limits<size_t>::max(),
-        device, num_cuts_per_feature, true);
+        device.ordinal, num_cuts_per_feature, true);
    for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
      size_t end =
          std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
@@ -432,7 +433,7 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
    sketch_batch_num_elements = detail::SketchBatchNumElements(
        sketch_batch_num_elements,
        num_rows, num_cols, std::numeric_limits<size_t>::max(),
-        device, num_cuts_per_feature, false);
+        device.ordinal, num_cuts_per_feature, false);
    for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
      size_t end =
          std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
--- a/src/common/host_device_vector.cc
+++ b/src/common/host_device_vector.cc
@@ -33,19 +33,19 @@ struct HostDeviceVectorImpl {
 };

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int)
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd)
  : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(size, v);
 }

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int)
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd)
  : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(init);
 }

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int)
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd)
  : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(init);
 }
@@ -81,7 +81,7 @@ template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }

 template <typename T>
-int HostDeviceVector<T>::DeviceIdx() const { return -1; }
+DeviceOrd HostDeviceVector<T>::Device() const { return DeviceOrd::CPU(); }

 template <typename T>
 T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
@@ -165,9 +165,6 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
  return false;
 }

-template <typename T>
-void HostDeviceVector<T>::SetDevice(int) const {}
-
 template <typename T>
 void HostDeviceVector<T>::SetDevice(DeviceOrd) const {}

--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -25,8 +25,8 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) {
 template <typename T>
 class HostDeviceVectorImpl {
 public:
-  HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) {
-    if (device >= 0) {
+  HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) {
+    if (device.IsCUDA()) {
      gpu_access_ = GPUAccess::kWrite;
      SetDevice();
      data_d_->resize(size, v);
@@ -37,8 +37,8 @@ class HostDeviceVectorImpl {

  // Initializer can be std::vector<T> or std::initializer_list<T>
  template <class Initializer>
-  HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) {
-    if (device >= 0) {
+  HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) {
+    if (device.IsCUDA()) {
      gpu_access_ = GPUAccess::kWrite;
      LazyResizeDevice(init.size());
      Copy(init);
@@ -54,16 +54,16 @@ class HostDeviceVectorImpl {
    gpu_access_{that.gpu_access_} {}

  ~HostDeviceVectorImpl() {
-    if (device_ >= 0) {
+    if (device_.IsCUDA()) {
      SetDevice();
    }
  }

-  size_t Size() const {
+  [[nodiscard]] size_t Size() const {
    return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->size() : 0;
  }

-  int DeviceIdx() const { return device_; }
+  [[nodiscard]] DeviceOrd Device() const { return device_; }

  T* DevicePointer() {
    LazySyncDevice(GPUAccess::kWrite);
@@ -138,7 +138,7 @@ class HostDeviceVectorImpl {
    } else {
      auto ptr = other->ConstDevicePointer();
      SetDevice();
-      CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
+      CHECK_EQ(this->Device(), other->Device());
      dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
                                    ptr,
                                    other->Size() * sizeof(T),
@@ -156,24 +156,25 @@ class HostDeviceVectorImpl {
    return data_h_;
  }

-  void SetDevice(int device) {
+  void SetDevice(DeviceOrd device) {
    if (device_ == device) { return; }
-    if (device_ >= 0) {
+    if (device_.IsCUDA()) {
      LazySyncHost(GPUAccess::kNone);
    }

-    if (device_ >= 0 && device >= 0) {
-      CHECK_EQ(device_, device) << "New device ordinal is different from previous one.";
+    if (device_.IsCUDA() && device.IsCUDA()) {
+      CHECK_EQ(device_.ordinal, device.ordinal)
+          << "New device ordinal is different from previous one.";
    }
    device_ = device;
-    if (device_ >= 0) {
+    if (device_.IsCUDA()) {
      LazyResizeDevice(data_h_.size());
    }
  }

  void Resize(size_t new_size, T v) {
    if (new_size == Size()) { return; }
-    if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) {
+    if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
      // fast on-device resize
      gpu_access_ = GPUAccess::kWrite;
      SetDevice();
@@ -218,16 +219,16 @@ class HostDeviceVectorImpl {
    gpu_access_ = access;
  }

-  bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
-  bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
-  bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
-  bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
-  bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
-  bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
-  GPUAccess Access() const { return gpu_access_; }
+  [[nodiscard]] bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
+  [[nodiscard]] bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
+  [[nodiscard]] bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
+  [[nodiscard]] bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
+  [[nodiscard]] bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
+  [[nodiscard]] bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
+  [[nodiscard]] GPUAccess Access() const { return gpu_access_; }

 private:
-  int device_{-1};
+  DeviceOrd device_{DeviceOrd::CPU()};
  std::vector<T> data_h_{};
  std::unique_ptr<dh::device_vector<T>> data_d_{};
  GPUAccess gpu_access_{GPUAccess::kNone};
@@ -259,11 +260,11 @@ class HostDeviceVectorImpl {
  }

  void SetDevice() {
-    CHECK_GE(device_, 0);
+    CHECK_GE(device_.ordinal, 0);
    if (cudaSetDeviceHandler == nullptr) {
-      dh::safe_cuda(cudaSetDevice(device_));
+      dh::safe_cuda(cudaSetDevice(device_.ordinal));
    } else {
-      (*cudaSetDeviceHandler)(device_);
+      (*cudaSetDeviceHandler)(device_.ordinal);
    }

    if (!data_d_) {
@@ -273,15 +274,15 @@ class HostDeviceVectorImpl {
 };

 template<typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd device)
    : impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd device)
    : impl_(new HostDeviceVectorImpl<T>(init, device)) {}

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd device)
    : impl_(new HostDeviceVectorImpl<T>(init, device)) {}

 template <typename T>
@@ -309,7 +310,9 @@ template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }

 template <typename T>
-int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
+DeviceOrd HostDeviceVector<T>::Device() const {
+  return impl_->Device();
+}

 template <typename T>
 T* HostDeviceVector<T>::DevicePointer() {
@@ -389,14 +392,9 @@ GPUAccess HostDeviceVector<T>::DeviceAccess() const {
  return impl_->Access();
 }

-template <typename T>
-void HostDeviceVector<T>::SetDevice(int device) const {
-  impl_->SetDevice(device);
-}
-
 template <typename T>
 void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
-  impl_->SetDevice(device.ordinal);
+  impl_->SetDevice(device);
 }

 template <typename T>
--- a/src/common/numeric.cu
+++ b/src/common/numeric.cu
@@ -8,16 +8,12 @@
 #include "xgboost/context.h"             // Context
 #include "xgboost/host_device_vector.h"  // HostDeviceVector

-namespace xgboost {
-namespace common {
-namespace cuda_impl {
+namespace xgboost::common::cuda_impl {
 double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
-  values.SetDevice(ctx->gpu_id);
+  values.SetDevice(ctx->Device());
  auto const d_values = values.ConstDeviceSpan();
  dh::XGBCachingDeviceAllocator<char> alloc;
  return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
                    thrust::plus<float>{});
 }
-}  // namespace cuda_impl
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common::cuda_impl
--- a/src/common/optional_weight.h
+++ b/src/common/optional_weight.h
@@ -24,7 +24,7 @@ struct OptionalWeights {
 inline OptionalWeights MakeOptionalWeights(Context const* ctx,
                                           HostDeviceVector<float> const& weights) {
  if (ctx->IsCUDA()) {
-    weights.SetDevice(ctx->gpu_id);
+    weights.SetDevice(ctx->Device());
  }
  return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()};
 }
--- a/src/common/quantile.cu
+++ b/src/common/quantile.cu
@@ -207,10 +207,10 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
 // summary does the output element come from) result by definition of merged rank.  So we
 // run it in 2 passes to obtain the merge path and then customize the standard merge
 // algorithm.
-void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
+void MergeImpl(DeviceOrd device, Span<SketchEntry const> const &d_x,
               Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
               Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
-  dh::safe_cuda(cudaSetDevice(device));
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
  CHECK_EQ(d_x.size() + d_y.size(), out.size());
  CHECK_EQ(x_ptr.size(), out_ptr.size());
  CHECK_EQ(y_ptr.size(), out_ptr.size());
@@ -308,7 +308,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
 void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
                           common::Span<OffsetT> cuts_ptr,
                           size_t total_cuts, Span<float> weights) {
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
  Span<SketchEntry> out;
  dh::device_vector<SketchEntry> cuts;
  bool first_window = this->Current().empty();
@@ -367,7 +367,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
   * pruning or merging. We preserve the first type and remove the second type.
   */
  timer_.Start(__func__);
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
  CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
  dh::XGBCachingDeviceAllocator<char> alloc;

@@ -407,7 +407,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col

 void SketchContainer::Prune(size_t to) {
  timer_.Start(__func__);
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));

  OffsetT to_total = 0;
  auto& h_columns_ptr = columns_ptr_b_.HostVector();
@@ -442,7 +442,7 @@ void SketchContainer::Prune(size_t to) {

 void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
                            Span<SketchEntry const> that) {
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
  timer_.Start(__func__);
  if (this->Current().size() == 0) {
    CHECK_EQ(this->columns_ptr_.HostVector().back(), 0);
@@ -477,7 +477,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
 }

 void SketchContainer::FixError() {
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
  auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
  auto in = dh::ToSpan(this->Current());
  dh::LaunchN(in.size(), [=] __device__(size_t idx) {
@@ -502,7 +502,7 @@ void SketchContainer::FixError() {
 }

 void SketchContainer::AllReduce(bool is_column_split) {
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
  auto world = collective::GetWorldSize();
  if (world == 1 || is_column_split) {
    return;
@@ -529,15 +529,15 @@ void SketchContainer::AllReduce(bool is_column_split) {
  auto offset = rank * d_columns_ptr.size();
  thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(),
               gathered_ptrs.begin() + offset);
-  collective::AllReduce<collective::Operation::kSum>(device_, gathered_ptrs.data().get(),
+  collective::AllReduce<collective::Operation::kSum>(device_.ordinal, gathered_ptrs.data().get(),
                                                     gathered_ptrs.size());

  // Get the data from all workers.
  std::vector<size_t> recv_lengths;
  dh::caching_device_vector<char> recvbuf;
-  collective::AllGatherV(device_, this->Current().data().get(),
+  collective::AllGatherV(device_.ordinal, this->Current().data().get(),
                         dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf);
-  collective::Synchronize(device_);
+  collective::Synchronize(device_.ordinal);

  // Segment the received data.
  auto s_recvbuf = dh::ToSpan(recvbuf);
@@ -584,7 +584,7 @@ struct InvalidCatOp {

 void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
  timer_.Start(__func__);
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
  p_cuts->min_vals_.Resize(num_columns_);

  // Sync between workers.
--- a/src/common/quantile.cuh
+++ b/src/common/quantile.cuh
@@ -41,7 +41,7 @@ class SketchContainer {
  bst_row_t num_rows_;
  bst_feature_t num_columns_;
  int32_t num_bins_;
-  int32_t device_;
+  DeviceOrd device_;

  // Double buffer as neither prune nor merge can be performed inplace.
  dh::device_vector<SketchEntry> entries_a_;
@@ -93,35 +93,32 @@ class SketchContainer {
   * \param num_rows    Total number of rows in known dataset (typically the rows in current worker).
   * \param device      GPU ID.
   */
-   SketchContainer(HostDeviceVector<FeatureType> const &feature_types,
-                   int32_t max_bin, bst_feature_t num_columns,
-                   bst_row_t num_rows, int32_t device)
-       : num_rows_{num_rows},
-         num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
-     CHECK_GE(device, 0);
-     // Initialize Sketches for this dmatrix
-     this->columns_ptr_.SetDevice(device_);
-     this->columns_ptr_.Resize(num_columns + 1);
-     this->columns_ptr_b_.SetDevice(device_);
-     this->columns_ptr_b_.Resize(num_columns + 1);
+  SketchContainer(HostDeviceVector<FeatureType> const& feature_types, int32_t max_bin,
+                  bst_feature_t num_columns, bst_row_t num_rows, DeviceOrd device)
+      : num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
+    CHECK(device.IsCUDA());
+    // Initialize Sketches for this dmatrix
+    this->columns_ptr_.SetDevice(device_);
+    this->columns_ptr_.Resize(num_columns + 1);
+    this->columns_ptr_b_.SetDevice(device_);
+    this->columns_ptr_b_.Resize(num_columns + 1);

-     this->feature_types_.Resize(feature_types.Size());
-     this->feature_types_.Copy(feature_types);
-     // Pull to device.
-     this->feature_types_.SetDevice(device);
-     this->feature_types_.ConstDeviceSpan();
-     this->feature_types_.ConstHostSpan();
+    this->feature_types_.Resize(feature_types.Size());
+    this->feature_types_.Copy(feature_types);
+    // Pull to device.
+    this->feature_types_.SetDevice(device);
+    this->feature_types_.ConstDeviceSpan();
+    this->feature_types_.ConstHostSpan();

-     auto d_feature_types = feature_types_.ConstDeviceSpan();
-     has_categorical_ =
-         !d_feature_types.empty() &&
-         thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types),
-                        common::IsCatOp{});
+    auto d_feature_types = feature_types_.ConstDeviceSpan();
+    has_categorical_ =
+        !d_feature_types.empty() &&
+        thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{});

-     timer_.Init(__func__);
-   }
+    timer_.Init(__func__);
+  }
  /* \brief Return GPU ID for this container. */
-  int32_t DeviceIdx() const { return device_; }
+  [[nodiscard]] DeviceOrd DeviceIdx() const { return device_; }
  /* \brief Whether the predictor matrix contains categorical features. */
  bool HasCategorical() const { return has_categorical_; }
  /* \brief Accumulate weights of duplicated entries in input. */
@@ -175,7 +172,7 @@ class SketchContainer {
  template <typename KeyComp = thrust::equal_to<size_t>>
  size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
    timer_.Start(__func__);
-    dh::safe_cuda(cudaSetDevice(device_));
+    dh::safe_cuda(cudaSetDevice(device_.ordinal));
    this->columns_ptr_.SetDevice(device_);
    Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
    CHECK_EQ(d_column_scan.size(), num_columns_ + 1);
--- a/src/common/stats.cc
+++ b/src/common/stats.cc
@@ -15,8 +15,7 @@
 #include "xgboost/linalg.h"              // Tensor, UnravelIndex, Apply
 #include "xgboost/logging.h"             // CHECK_EQ

-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
            HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out) {
  if (!ctx->IsCPU()) {
@@ -46,8 +45,8 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
 }

 void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out) {
-  v.SetDevice(ctx->gpu_id);
-  out->SetDevice(ctx->gpu_id);
+  v.SetDevice(ctx->Device());
+  out->SetDevice(ctx->Device());
  out->Reshape(1);

  if (ctx->IsCPU()) {
@@ -62,5 +61,4 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
    cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
  }
 }
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
--- a/src/common/stats.cu
+++ b/src/common/stats.cu
@@ -15,14 +15,12 @@
 #include "xgboost/host_device_vector.h"  // HostDeviceVector
 #include "xgboost/linalg.h"              // linalg::TensorView, UnravelIndex, Apply

-namespace xgboost {
-namespace common {
-namespace cuda_impl {
+namespace xgboost::common::cuda_impl {
 void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
            common::OptionalWeights weights, linalg::Tensor<float, 1>* out) {
  CHECK_GE(t.Shape(1), 1);
  HostDeviceVector<std::size_t> segments(t.Shape(1) + 1, 0);
-  segments.SetDevice(ctx->gpu_id);
+  segments.SetDevice(ctx->Device());
  auto d_segments = segments.DeviceSpan();
  dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(),
              [=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; });
@@ -31,7 +29,7 @@ void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
        return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
      });

-  out->SetDevice(ctx->gpu_id);
+  out->SetDevice(ctx->Device());
  out->Reshape(t.Shape(1));
  if (weights.Empty()) {
    common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it,
@@ -60,6 +58,4 @@ void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorV
  dh::TemporaryArray<char> temp{bytes};
  cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s);
 }
-}  // namespace cuda_impl
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common::cuda_impl
--- a/src/common/stats.cuh
+++ b/src/common/stats.cuh
@@ -160,7 +160,7 @@ void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, Se
  auto d_sorted_idx = dh::ToSpan(sorted_idx);
  auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx));

-  quantiles->SetDevice(ctx->gpu_id);
+  quantiles->SetDevice(ctx->Device());
  quantiles->Resize(n_segments);
  auto d_results = quantiles->DeviceSpan();

@@ -220,7 +220,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
                                scan_val, weights_cdf.begin());

  auto n_segments = std::distance(seg_beg, seg_end) - 1;
-  quantiles->SetDevice(ctx->gpu_id);
+  quantiles->SetDevice(ctx->Device());
  quantiles->Resize(n_segments);
  auto d_results = quantiles->DeviceSpan();
  auto d_weight_cdf = dh::ToSpan(weights_cdf);
--- a/src/common/transform.h
+++ b/src/common/transform.h
@@ -60,8 +60,8 @@ class Transform {
  template <typename Functor>
  struct Evaluator {
   public:
-    Evaluator(Functor func, Range range, int32_t n_threads, int32_t device_idx)
-        : func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device_idx} {}
+    Evaluator(Functor func, Range range, int32_t n_threads, DeviceOrd device)
+        : func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device} {}

    /*!
     * \brief Evaluate the functor with input pointers to HostDeviceVector.
@@ -71,7 +71,7 @@ class Transform {
     */
    template <typename... HDV>
    void Eval(HDV... vectors) const {
-      bool on_device = device_ >= 0;
+      bool on_device = device_.IsCUDA();

      if (on_device) {
        LaunchCUDA(func_, vectors...);
@@ -116,11 +116,11 @@ class Transform {
    }
    // Recursive unpack for Shard.
    template <typename T>
-    void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
+    void UnpackShard(DeviceOrd device, const HostDeviceVector<T> *vector) const {
      vector->SetDevice(device);
    }
    template <typename Head, typename... Rest>
-    void UnpackShard(int device,
+    void UnpackShard(DeviceOrd device,
                     const HostDeviceVector<Head> *_vector,
                     const HostDeviceVector<Rest> *... _vectors) const {
      _vector->SetDevice(device);
@@ -140,7 +140,7 @@ class Transform {
      // granularity is used in data vector.
      size_t shard_size = range_size;
      Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
-      dh::safe_cuda(cudaSetDevice(device_));
+      dh::safe_cuda(cudaSetDevice(device_.ordinal));
      const int kGrids =
          static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
      if (kGrids == 0) {
@@ -174,7 +174,7 @@ class Transform {
    /*! \brief Range object specifying parallel threads index range. */
    Range range_;
    int32_t n_threads_;
-    int32_t device_;
+    DeviceOrd device_;
  };

 public:
@@ -192,8 +192,8 @@ class Transform {
   */
  template <typename Functor>
  static Evaluator<Functor> Init(Functor func, Range const range, int32_t n_threads,
-                                 int32_t device_idx) {
-    return Evaluator<Functor>{func, std::move(range), n_threads, device_idx};
+                                 DeviceOrd device) {
+    return Evaluator<Functor>{func, std::move(range), n_threads, device};
  }
 };