Remove internal use of gpu_id. (#9568)

2023-09-20 23:29:51 +08:00
parent 38ac52dd87
commit 8c676c889d
121 changed files with 1012 additions and 1044 deletions
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -413,7 +413,7 @@ void GPUHistEvaluator::EvaluateSplits(
    auto const world_size = collective::GetWorldSize();
    dh::TemporaryArray<DeviceSplitCandidate> all_candidate_storage(out_splits.size() * world_size);
    auto all_candidates = dh::ToSpan(all_candidate_storage);
-    collective::AllGather(device_, out_splits.data(), all_candidates.data(),
+    collective::AllGather(device_.ordinal, out_splits.data(), all_candidates.data(),
                          out_splits.size() * sizeof(DeviceSplitCandidate));

    // Reduce to get the best candidate from all workers.
--- a/src/tree/gpu_hist/evaluate_splits.cuh
+++ b/src/tree/gpu_hist/evaluate_splits.cuh
@@ -85,7 +85,7 @@ class GPUHistEvaluator {
  std::size_t node_categorical_storage_size_ = 0;
  // Is the data split column-wise?
  bool is_column_split_ = false;
-  int32_t device_;
+  DeviceOrd device_;

  // Copy the categories from device to host asynchronously.
  void CopyToHost( const std::vector<bst_node_t>& nidx);
@@ -133,14 +133,14 @@ class GPUHistEvaluator {
  }

 public:
-  GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, int32_t device)
+  GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, DeviceOrd device)
      : tree_evaluator_{param, n_features, device}, param_{param} {}
  /**
   * \brief Reset the evaluator, should be called before any use.
   */
  void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
             bst_feature_t n_features, TrainParam const &param, bool is_column_split,
-             int32_t device);
+             DeviceOrd device);

  /**
   * \brief Get host category storage for nidx.  Different from the internal version, this
--- a/src/tree/gpu_hist/evaluator.cu
+++ b/src/tree/gpu_hist/evaluator.cu
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2022 by XGBoost Contributors
+ * Copyright 2022-2023 by XGBoost Contributors
 *
 * \brief Some components of GPU Hist evaluator, this file only exist to reduce nvcc
 *        compilation time.
@@ -12,11 +12,10 @@
 #include "evaluate_splits.cuh"
 #include "xgboost/data.h"

-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
                             bst_feature_t n_features, TrainParam const &param,
-                             bool is_column_split, int32_t device) {
+                             bool is_column_split, DeviceOrd device) {
  param_ = param;
  tree_evaluator_ = TreeEvaluator{param, n_features, device};
  has_categoricals_ = cuts.HasCategorical();
@@ -127,6 +126,4 @@ common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
                             });
  return dh::ToSpan(cat_sorted_idx_);
 }
-
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
--- a/src/tree/gpu_hist/feature_groups.cuh
+++ b/src/tree/gpu_hist/feature_groups.cuh
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020 by XGBoost Contributors
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
 */
 #ifndef FEATURE_GROUPS_CUH_
 #define FEATURE_GROUPS_CUH_
@@ -102,11 +102,10 @@ struct FeatureGroups {
    InitSingle(cuts);
  }

-  FeatureGroupsAccessor DeviceAccessor(int device) const {
+  [[nodiscard]] FeatureGroupsAccessor DeviceAccessor(DeviceOrd device) const {
    feature_segments.SetDevice(device);
    bin_segments.SetDevice(device);
-    return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(),
-        max_group_bins};
+    return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(), max_group_bins};
  }

 private:
--- a/src/tree/gpu_hist/gradient_based_sampler.cu
+++ b/src/tree/gpu_hist/gradient_based_sampler.cu
@@ -167,10 +167,10 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
    for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
      auto page = batch.Impl();
      if (!page_) {
-        page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense,
+        page_ = std::make_unique<EllpackPageImpl>(ctx->Device(), page->Cuts(), page->is_dense,
                                                  page->row_stride, dmat->Info().num_row_);
      }
-      size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
+      size_t num_elements = page_->Copy(ctx->Device(), page, offset);
      offset += num_elements;
    }
    page_concatenated_ = true;
@@ -228,13 +228,13 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
  auto first_page = (*batch_iterator.begin()).Impl();
  // Create a new ELLPACK page with empty rows.
  page_.reset();  // Release the device memory first before reallocating
-  page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense,
+  page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense,
                                  first_page->row_stride, sample_rows));

  // Compact the ELLPACK pages into the single sample page.
  thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
  for (auto& batch : batch_iterator) {
-    page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
+    page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_));
  }

  return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
@@ -306,13 +306,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
  auto first_page = (*batch_iterator.begin()).Impl();
  // Create a new ELLPACK page with empty rows.
  page_.reset();  // Release the device memory first before reallocating
-  page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense,
+  page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense,
                                  first_page->row_stride, sample_rows));

  // Compact the ELLPACK pages into the single sample page.
  thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
  for (auto& batch : batch_iterator) {
-    page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
+    page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_));
  }

  return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
--- a/src/tree/gpu_hist/row_partitioner.cu
+++ b/src/tree/gpu_hist/row_partitioner.cu
@@ -13,15 +13,15 @@
 namespace xgboost {
 namespace tree {

-RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
+RowPartitioner::RowPartitioner(DeviceOrd device_idx, size_t num_rows)
    : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {
-  dh::safe_cuda(cudaSetDevice(device_idx_));
+  dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
  ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
  thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
 }

 RowPartitioner::~RowPartitioner() {
-  dh::safe_cuda(cudaSetDevice(device_idx_));
+  dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
 }

 common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) {
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -199,7 +199,7 @@ class RowPartitioner {
  static constexpr bst_node_t kIgnoredTreePosition = -1;

 private:
-  int device_idx_;
+  DeviceOrd device_idx_;
  /*! \brief In here if you want to find the rows belong to a node nid, first you need to
   * get the indices segment from ridx_segments[nid], then get the row index that
   * represents position of row in input data X.  `RowPartitioner::GetRows` would be a
@@ -223,7 +223,7 @@ class RowPartitioner {
  dh::PinnedMemory pinned2_;

 public:
-  RowPartitioner(int device_idx, size_t num_rows);
+  RowPartitioner(DeviceOrd device_idx, size_t num_rows);
  ~RowPartitioner();
  RowPartitioner(const RowPartitioner&) = delete;
  RowPartitioner& operator=(const RowPartitioner&) = delete;