Enable ROCm on latest XGBoost

This commit is contained in:
Hui Liu
2023-10-23 11:07:08 -07:00
328 changed files with 8028 additions and 3642 deletions

View File

@@ -19,8 +19,7 @@
#include "xgboost/linalg.h" // TensorView, Tensor, Constant
#include "xgboost/logging.h" // CHECK_EQ
namespace xgboost {
namespace tree {
namespace xgboost::tree {
namespace cpu_impl {
void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair,
@@ -68,13 +67,12 @@ inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView<Gradien
void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair,
bst_target_t n_targets, linalg::Vector<float>* out) {
out->SetDevice(ctx->gpu_id);
out->SetDevice(ctx->Device());
out->Reshape(n_targets);
gpair.SetDevice(ctx->Device());
auto gpair_t = gpair.View(ctx->Device());
ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
: cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()));
ctx->IsCUDA() ? cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()))
: cpu_impl::FitStump(ctx, info, gpair_t, out->HostView());
}
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -21,9 +21,7 @@
#include "xgboost/logging.h" // CHECK_EQ
#include "xgboost/span.h" // span
namespace xgboost {
namespace tree {
namespace cuda_impl {
namespace xgboost::tree::cuda_impl {
void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out) {
auto n_targets = out.Size();
@@ -56,7 +54,7 @@ void FitStump(Context const* ctx, MetaInfo const& info,
thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));
collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast<double*>(d_sum.Values().data()),
collective::GlobalSum(info, ctx->Device(), reinterpret_cast<double*>(d_sum.Values().data()),
d_sum.Size() * 2);
thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
@@ -65,6 +63,4 @@ void FitStump(Context const* ctx, MetaInfo const& info,
CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess()));
});
}
} // namespace cuda_impl
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree::cuda_impl

View File

@@ -451,7 +451,7 @@ void GPUHistEvaluator::EvaluateSplits(
auto const world_size = collective::GetWorldSize();
dh::TemporaryArray<DeviceSplitCandidate> all_candidate_storage(out_splits.size() * world_size);
auto all_candidates = dh::ToSpan(all_candidate_storage);
collective::AllGather(device_, out_splits.data(), all_candidates.data(),
collective::AllGather(device_.ordinal, out_splits.data(), all_candidates.data(),
out_splits.size() * sizeof(DeviceSplitCandidate));
// Reduce to get the best candidate from all workers.

View File

@@ -85,7 +85,7 @@ class GPUHistEvaluator {
std::size_t node_categorical_storage_size_ = 0;
// Is the data split column-wise?
bool is_column_split_ = false;
int32_t device_;
DeviceOrd device_;
// Copy the categories from device to host asynchronously.
void CopyToHost( const std::vector<bst_node_t>& nidx);
@@ -133,14 +133,14 @@ class GPUHistEvaluator {
}
public:
GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, int32_t device)
GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, DeviceOrd device)
: tree_evaluator_{param, n_features, device}, param_{param} {}
/**
* \brief Reset the evaluator, should be called before any use.
*/
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param, bool is_column_split,
int32_t device);
DeviceOrd device);
/**
* \brief Get host category storage for nidx. Different from the internal version, this

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2022 by XGBoost Contributors
* Copyright 2022-2023 by XGBoost Contributors
*
* \brief Some components of GPU Hist evaluator, this file only exist to reduce nvcc
* compilation time.
@@ -12,11 +12,10 @@
#include "evaluate_splits.cuh"
#include "xgboost/data.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param,
bool is_column_split, int32_t device) {
bool is_column_split, DeviceOrd device) {
param_ = param;
tree_evaluator_ = TreeEvaluator{param, n_features, device};
has_categoricals_ = cuts.HasCategorical();
@@ -201,6 +200,4 @@ common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
#endif
return dh::ToSpan(cat_sorted_idx_);
}
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2020 by XGBoost Contributors
/**
* Copyright 2020-2023 by XGBoost Contributors
*/
#ifndef FEATURE_GROUPS_CUH_
#define FEATURE_GROUPS_CUH_
@@ -102,11 +102,10 @@ struct FeatureGroups {
InitSingle(cuts);
}
FeatureGroupsAccessor DeviceAccessor(int device) const {
[[nodiscard]] FeatureGroupsAccessor DeviceAccessor(DeviceOrd device) const {
feature_segments.SetDevice(device);
bin_segments.SetDevice(device);
return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(),
max_group_bins};
return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(), max_group_bins};
}
private:

View File

@@ -167,10 +167,10 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
auto page = batch.Impl();
if (!page_) {
page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense,
page_ = std::make_unique<EllpackPageImpl>(ctx->Device(), page->Cuts(), page->is_dense,
page->row_stride, dmat->Info().num_row_);
}
size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
size_t num_elements = page_->Copy(ctx->Device(), page, offset);
offset += num_elements;
}
page_concatenated_ = true;
@@ -228,13 +228,13 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
auto first_page = (*batch_iterator.begin()).Impl();
// Create a new ELLPACK page with empty rows.
page_.reset(); // Release the device memory first before reallocating
page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense,
page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense,
first_page->row_stride, sample_rows));
// Compact the ELLPACK pages into the single sample page.
thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
for (auto& batch : batch_iterator) {
page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_));
}
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
@@ -306,13 +306,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
auto first_page = (*batch_iterator.begin()).Impl();
// Create a new ELLPACK page with empty rows.
page_.reset(); // Release the device memory first before reallocating
page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense,
page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense,
first_page->row_stride, sample_rows));
// Compact the ELLPACK pages into the single sample page.
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
for (auto& batch : batch_iterator) {
page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_));
}
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};

View File

@@ -13,17 +13,15 @@
namespace xgboost {
namespace tree {
RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
RowPartitioner::RowPartitioner(DeviceOrd device_idx, size_t num_rows)
: device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {
dh::safe_cuda(cudaSetDevice(device_idx_));
dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
}
RowPartitioner::~RowPartitioner() {
dh::safe_cuda(cudaSetDevice(device_idx_));
dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
}
common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) {

View File

@@ -210,7 +210,7 @@ class RowPartitioner {
static constexpr bst_node_t kIgnoredTreePosition = -1;
private:
int device_idx_;
DeviceOrd device_idx_;
/*! \brief In here if you want to find the rows belong to a node nid, first you need to
* get the indices segment from ridx_segments[nid], then get the row index that
* represents position of row in input data X. `RowPartitioner::GetRows` would be a
@@ -234,7 +234,7 @@ class RowPartitioner {
dh::PinnedMemory pinned2_;
public:
RowPartitioner(int device_idx, size_t num_rows);
RowPartitioner(DeviceOrd device_idx, size_t num_rows);
~RowPartitioner();
RowPartitioner(const RowPartitioner&) = delete;
RowPartitioner& operator=(const RowPartitioner&) = delete;

View File

@@ -292,20 +292,19 @@ class HistEvaluator {
*/
std::vector<CPUExpandEntry> Allgather(std::vector<CPUExpandEntry> const &entries) {
auto const world = collective::GetWorldSize();
auto const rank = collective::GetRank();
auto const num_entries = entries.size();
// First, gather all the primitive fields.
std::vector<CPUExpandEntry> all_entries(num_entries * world);
std::vector<CPUExpandEntry> local_entries(num_entries);
std::vector<uint32_t> cat_bits;
std::vector<std::size_t> cat_bits_sizes;
for (std::size_t i = 0; i < num_entries; i++) {
all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes);
local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes);
}
collective::Allgather(all_entries.data(), all_entries.size() * sizeof(CPUExpandEntry));
auto all_entries = collective::Allgather(local_entries);
// Gather all the cat_bits.
auto gathered = collective::AllgatherV(cat_bits, cat_bits_sizes);
auto gathered = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes);
common::ParallelFor(num_entries * world, ctx_->Threads(), [&] (auto i) {
// Copy the cat_bits back into all expand entries.
@@ -477,7 +476,7 @@ class HistEvaluator {
: ctx_{ctx},
param_{param},
column_sampler_{std::move(sampler)},
tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId},
tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), DeviceOrd::CPU()},
is_col_split_{info.IsColumnSplit()} {
interaction_constraints_.Configure(*param, info.num_col_);
column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
@@ -579,28 +578,24 @@ class HistMultiEvaluator {
*/
std::vector<MultiExpandEntry> Allgather(std::vector<MultiExpandEntry> const &entries) {
auto const world = collective::GetWorldSize();
auto const rank = collective::GetRank();
auto const num_entries = entries.size();
// First, gather all the primitive fields.
std::vector<MultiExpandEntry> all_entries(num_entries * world);
std::vector<MultiExpandEntry> local_entries(num_entries);
std::vector<uint32_t> cat_bits;
std::vector<std::size_t> cat_bits_sizes;
std::vector<GradientPairPrecise> gradients;
for (std::size_t i = 0; i < num_entries; i++) {
all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes,
&gradients);
local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes, &gradients);
}
collective::Allgather(all_entries.data(), all_entries.size() * sizeof(MultiExpandEntry));
auto all_entries = collective::Allgather(local_entries);
// Gather all the cat_bits.
auto gathered_cat_bits = collective::AllgatherV(cat_bits, cat_bits_sizes);
auto gathered_cat_bits = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes);
// Gather all the gradients.
auto const num_gradients = gradients.size();
std::vector<GradientPairPrecise> all_gradients(num_gradients * world);
std::copy_n(gradients.cbegin(), num_gradients, all_gradients.begin() + num_gradients * rank);
collective::Allgather(all_gradients.data(), all_gradients.size() * sizeof(GradientPairPrecise));
auto const all_gradients = collective::Allgather(gradients);
auto const total_entries = num_entries * world;
auto const gradients_per_entry = num_gradients / num_entries;
@@ -696,7 +691,7 @@ class HistMultiEvaluator {
stats_ = linalg::Constant(ctx_, GradientPairPrecise{}, 1, n_targets);
gain_.resize(1);
linalg::Vector<float> weight({n_targets}, ctx_->gpu_id);
linalg::Vector<float> weight({n_targets}, ctx_->Device());
CalcWeight(*param_, root_sum, weight.HostView());
auto root_gain = CalcGainGivenWeight(*param_, root_sum, weight.HostView());
gain_.front() = root_gain;

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2018-2020 by Contributors
/**
* Copyright 2018-2023 by Contributors
* \file split_evaluator.h
* \brief Used for implementing a loss term specific to decision trees. Useful for custom regularisation.
* \author Henry Gouk
@@ -23,8 +23,7 @@
#include "xgboost/host_device_vector.h"
#include "xgboost/tree_model.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
class TreeEvaluator {
// hist and exact use parent id to calculate constraints.
static constexpr bst_node_t kRootParentId =
@@ -33,13 +32,13 @@ class TreeEvaluator {
HostDeviceVector<float> lower_bounds_;
HostDeviceVector<float> upper_bounds_;
HostDeviceVector<int32_t> monotone_;
int32_t device_;
DeviceOrd device_;
bool has_constraint_;
public:
TreeEvaluator(TrainParam const& p, bst_feature_t n_features, int32_t device) {
TreeEvaluator(TrainParam const& p, bst_feature_t n_features, DeviceOrd device) {
device_ = device;
if (device != Context::kCpuId) {
if (device.IsCUDA()) {
lower_bounds_.SetDevice(device);
upper_bounds_.SetDevice(device);
monotone_.SetDevice(device);
@@ -59,7 +58,7 @@ class TreeEvaluator {
has_constraint_ = true;
}
if (device_ != Context::kCpuId) {
if (device_.IsCUDA()) {
// Pull to device early.
lower_bounds_.ConstDeviceSpan();
upper_bounds_.ConstDeviceSpan();
@@ -122,8 +121,8 @@ class TreeEvaluator {
}
// Fast floating point division instruction on device
XGBOOST_DEVICE float Divide(float a, float b) const {
#if defined(__CUDA_ARCH__)
[[nodiscard]] XGBOOST_DEVICE float Divide(float a, float b) const {
#ifdef __CUDA_ARCH__
return __fdividef(a, b);
#elif defined(__HIP_PLATFORM_AMD__)
return a / b;
@@ -156,7 +155,7 @@ class TreeEvaluator {
public:
/* Get a view to the evaluator that can be passed down to device. */
template <typename ParamT = TrainParam> auto GetEvaluator() const {
if (device_ != Context::kCpuId) {
if (device_.IsCUDA()) {
auto constraints = monotone_.ConstDevicePointer();
return SplitEvaluator<ParamT>{constraints, lower_bounds_.ConstDevicePointer(),
upper_bounds_.ConstDevicePointer(), has_constraint_};
@@ -217,7 +216,6 @@ enum SplitType {
// partition-based categorical split
kPart = 2
};
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree
#endif // XGBOOST_TREE_SPLIT_EVALUATOR_H_

View File

@@ -154,7 +154,7 @@ class ColMaker: public TreeUpdater {
: param_(param),
colmaker_train_param_{colmaker_train_param},
ctx_{ctx},
tree_evaluator_(param_, column_densities.size(), Context::kCpuId),
tree_evaluator_(param_, column_densities.size(), DeviceOrd::CPU()),
interaction_constraints_{std::move(_interaction_constraints)},
column_densities_(column_densities) {}
// update one tree, growing

View File

@@ -75,7 +75,7 @@ class DeviceHistogramStorage {
dh::device_vector<typename GradientSumT::ValueT> overflow_;
std::map<int, size_t> overflow_nidx_map_;
int n_bins_;
int device_id_;
DeviceOrd device_id_;
static constexpr size_t kNumItemsInGradientSum =
sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
@@ -83,7 +83,7 @@ class DeviceHistogramStorage {
public:
// Start with about 16mb
DeviceHistogramStorage() { data_.reserve(1 << 22); }
void Init(int device_id, int n_bins) {
void Init(DeviceOrd device_id, int n_bins) {
this->n_bins_ = n_bins;
this->device_id_ = device_id;
}
@@ -197,7 +197,7 @@ struct GPUHistMakerDevice {
common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
TrainParam _param, std::shared_ptr<common::ColumnSampler> column_sampler,
uint32_t n_features, BatchParam batch_param, MetaInfo const& info)
: evaluator_{_param, n_features, ctx->gpu_id},
: evaluator_{_param, n_features, ctx->Device()},
ctx_(ctx),
feature_types{_feature_types},
param(std::move(_param)),
@@ -212,7 +212,7 @@ struct GPUHistMakerDevice {
}
CHECK(column_sampler_);
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
monitor.Init(std::string("GPUHistMakerDevice") + ctx_->Device().Name());
}
~GPUHistMakerDevice() = default;
@@ -221,7 +221,7 @@ struct GPUHistMakerDevice {
if (!feature_groups) {
CHECK(page);
feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense,
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
dh::MaxSharedMemoryOptin(ctx_->Ordinal()),
sizeof(GradientPairPrecise));
}
}
@@ -232,7 +232,7 @@ struct GPUHistMakerDevice {
this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree);
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
this->interaction_constraints.Reset();
@@ -247,15 +247,15 @@ struct GPUHistMakerDevice {
gpair = sample.gpair;
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
dmat->Info().IsColumnSplit(), ctx_->gpu_id);
dmat->Info().IsColumnSplit(), ctx_->Device());
quantiser = std::make_unique<GradientQuantiser>(this->gpair, dmat->Info());
row_partitioner.reset(); // Release the device memory first before reallocating
row_partitioner = std::make_unique<RowPartitioner>(ctx_->gpu_id, sample.sample_rows);
row_partitioner = std::make_unique<RowPartitioner>(ctx_->Device(), sample.sample_rows);
// Init histogram
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
hist.Init(ctx_->Device(), page->Cuts().TotalBins());
hist.Reset();
this->InitFeatureGroupsOnce();
@@ -268,7 +268,7 @@ struct GPUHistMakerDevice {
sampled_features->SetDevice(ctx_->Device());
common::Span<bst_feature_t> feature_set =
interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
auto matrix = page->GetDeviceAccessor(ctx_->Device());
EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)};
EvaluateSplitSharedInputs shared_inputs{
gpu_param,
@@ -290,7 +290,7 @@ struct GPUHistMakerDevice {
dh::TemporaryArray<DeviceSplitCandidate> splits_out(2 * candidates.size());
std::vector<bst_node_t> nidx(2 * candidates.size());
auto h_node_inputs = pinned2.GetSpan<EvaluateSplitInputs>(2 * candidates.size());
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
auto matrix = page->GetDeviceAccessor(ctx_->Device());
EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param}, *quantiser, feature_types,
matrix.feature_segments, matrix.gidx_fvalue_map,
matrix.min_fvalue,
@@ -343,9 +343,9 @@ struct GPUHistMakerDevice {
void BuildHist(int nidx) {
auto d_node_hist = hist.GetNodeHistogram(nidx);
auto d_ridx = row_partitioner->GetRows(nidx);
BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->gpu_id),
feature_groups->DeviceAccessor(ctx_->gpu_id), gpair, d_ridx, d_node_hist,
*quantiser);
BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()),
feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx,
d_node_hist, *quantiser);
}
// Attempt to do subtraction trick
@@ -414,10 +414,10 @@ struct GPUHistMakerDevice {
});
collective::AllReduce<collective::Operation::kBitwiseOR>(
ctx_->gpu_id, decision_storage.data().get(), decision_storage.size());
ctx_->Ordinal(), decision_storage.data().get(), decision_storage.size());
collective::AllReduce<collective::Operation::kBitwiseAND>(
ctx_->gpu_id, missing_storage.data().get(), missing_storage.size());
collective::Synchronize(ctx_->gpu_id);
ctx_->Ordinal(), missing_storage.data().get(), missing_storage.size());
collective::Synchronize(ctx_->Ordinal());
row_partitioner->UpdatePositionBatch(
nidx, left_nidx, right_nidx, split_data,
@@ -455,7 +455,7 @@ struct GPUHistMakerDevice {
CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
}
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
auto d_matrix = page->GetDeviceAccessor(ctx_->Device());
if (info_.IsColumnSplit()) {
UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx);
@@ -527,9 +527,9 @@ struct GPUHistMakerDevice {
common::Span<FeatureType const> d_feature_types, common::Span<uint32_t const> categories,
common::Span<RegTree::CategoricalSplitMatrix::Segment> categories_segments,
HostDeviceVector<bst_node_t>* p_out_position) {
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
auto d_matrix = page->GetDeviceAccessor(ctx_->Device());
auto d_gpair = this->gpair;
p_out_position->SetDevice(ctx_->gpu_id);
p_out_position->SetDevice(ctx_->Device());
p_out_position->Resize(row_partitioner->GetRows().size());
auto new_position_op = [=] __device__(size_t row_id, int position) {
@@ -619,7 +619,7 @@ struct GPUHistMakerDevice {
monitor.Start("AllReduce");
auto d_node_hist = hist.GetNodeHistogram(nidx).data();
using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT;
collective::GlobalSum(info_, ctx_->gpu_id, reinterpret_cast<ReduceT*>(d_node_hist),
collective::GlobalSum(info_, ctx_->Device(), reinterpret_cast<ReduceT*>(d_node_hist),
page->Cuts().TotalBins() * 2 * num_histograms);
monitor.Stop("AllReduce");
@@ -862,7 +862,7 @@ class GPUHistMaker : public TreeUpdater {
}
void InitDataOnce(TrainParam const* param, DMatrix* dmat) {
CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device";
CHECK_GE(ctx_->Ordinal(), 0) << "Must have at least one device";
info_ = &dmat->Info();
// Synchronise the column sampling seed
@@ -871,9 +871,8 @@ class GPUHistMaker : public TreeUpdater {
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
info_->feature_types.SetDevice(ctx_->gpu_id);
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
info_->feature_types.SetDevice(ctx_->Device());
maker = std::make_unique<GPUHistMakerDevice>(
ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
*param, column_sampler_, info_->num_col_, batch_param, dmat->Info());
@@ -898,7 +897,7 @@ class GPUHistMaker : public TreeUpdater {
this->InitData(param, p_fmat, p_tree);
monitor_.Stop("InitData");
gpair->SetDevice(ctx_->gpu_id);
gpair->SetDevice(ctx_->Device());
maker->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
}
@@ -1031,7 +1030,7 @@ class GPUGlobalApproxMaker : public TreeUpdater {
this->InitData(p_fmat, p_tree);
monitor_.Stop("InitData");
gpair->SetDevice(ctx_->gpu_id);
gpair->SetDevice(ctx_->Device());
maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
}

View File

@@ -518,7 +518,7 @@ class QuantileHistMaker : public TreeUpdater {
auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; };
if (need_copy()) {
// allocate buffer
sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->gpu_id, linalg::Order::kF};
sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->Device(), linalg::Order::kF};
h_sample_out = sample_out.HostView();
}