Use the new DeviceOrd in the linalg module. (#9527)

2023-08-29 13:37:29 +08:00
parent 942b957eef
commit ddf2e68821
43 changed files with 252 additions and 273 deletions
--- a/src/metric/auc.cc
+++ b/src/metric/auc.cc
@@ -82,22 +82,19 @@ template <typename BinaryAUC>
 double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaInfo const &info,
                     size_t n_classes, int32_t n_threads, BinaryAUC &&binary_auc) {
  CHECK_NE(n_classes, 0);
-  auto const labels = info.labels.View(Context::kCpuId);
+  auto const labels = info.labels.HostView();
  if (labels.Shape(0) != 0) {
    CHECK_EQ(labels.Shape(1), 1) << "AUC doesn't support multi-target model.";
  }

  std::vector<double> results_storage(n_classes * 3, 0);
-  linalg::TensorView<double, 2> results(results_storage, {n_classes, static_cast<size_t>(3)},
-                                        Context::kCpuId);
+  auto results = linalg::MakeTensorView(ctx, results_storage, n_classes, 3);
  auto local_area = results.Slice(linalg::All(), 0);
  auto tp = results.Slice(linalg::All(), 1);
  auto auc = results.Slice(linalg::All(), 2);

  auto weights = common::OptionalWeights{info.weights_.ConstHostSpan()};
-  auto predts_t = linalg::TensorView<float const, 2>(
-      predts, {static_cast<size_t>(info.num_row_), n_classes},
-      Context::kCpuId);
+  auto predts_t = linalg::MakeTensorView(ctx, predts, info.num_row_, n_classes);

  if (info.labels.Size() != 0) {
    common::ParallelFor(n_classes, n_threads, [&](auto c) {
@@ -108,8 +105,8 @@ double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaI
        response[i] = labels(i) == c ? 1.0f : 0.0;
      }
      double fp;
-      std::tie(fp, tp(c), auc(c)) =
-          binary_auc(ctx, proba, linalg::MakeVec(response.data(), response.size(), -1), weights);
+      std::tie(fp, tp(c), auc(c)) = binary_auc(
+          ctx, proba, linalg::MakeVec(response.data(), response.size(), ctx->Device()), weights);
      local_area(c) = fp * tp(c);
    });
  }
@@ -220,7 +217,7 @@ std::pair<double, uint32_t> RankingAUC(Context const *ctx, std::vector<float> co
  CHECK_GE(info.group_ptr_.size(), 2);
  uint32_t n_groups = info.group_ptr_.size() - 1;
  auto s_predts = common::Span<float const>{predts};
-  auto labels = info.labels.View(Context::kCpuId);
+  auto labels = info.labels.View(ctx->Device());
  auto s_weights = info.weights_.ConstHostSpan();

  std::atomic<uint32_t> invalid_groups{0};
@@ -363,8 +360,8 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
                                           info.labels.HostView().Slice(linalg::All(), 0),
                                           common::OptionalWeights{info.weights_.ConstHostSpan()});
    } else {
-      std::tie(fp, tp, auc) = GPUBinaryROCAUC(predts.ConstDeviceSpan(), info,
-                                              ctx_->gpu_id, &this->d_cache_);
+      std::tie(fp, tp, auc) =
+          GPUBinaryROCAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_);
    }
    return std::make_tuple(fp, tp, auc);
  }
@@ -381,8 +378,7 @@ XGBOOST_REGISTER_METRIC(EvalAUC, "auc")

 #if !defined(XGBOOST_USE_CUDA)
 std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const>, MetaInfo const &,
-                                                   std::int32_t,
-                                                   std::shared_ptr<DeviceAUCCache> *) {
+                                                   DeviceOrd, std::shared_ptr<DeviceAUCCache> *) {
  common::AssertGPUSupport();
  return {};
 }
@@ -414,8 +410,8 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
          BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
                      common::OptionalWeights{info.weights_.ConstHostSpan()});
    } else {
-      std::tie(pr, re, auc) = GPUBinaryPRAUC(predts.ConstDeviceSpan(), info,
-                                             ctx_->gpu_id, &this->d_cache_);
+      std::tie(pr, re, auc) =
+          GPUBinaryPRAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_);
    }
    return std::make_tuple(pr, re, auc);
  }
@@ -459,7 +455,7 @@ XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")

 #if !defined(XGBOOST_USE_CUDA)
 std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const>, MetaInfo const &,
-                                                  std::int32_t, std::shared_ptr<DeviceAUCCache> *) {
+                                                  DeviceOrd, std::shared_ptr<DeviceAUCCache> *) {
  common::AssertGPUSupport();
  return {};
 }
--- a/src/metric/auc.cu
+++ b/src/metric/auc.cu
@@ -85,11 +85,11 @@ void InitCacheOnce(common::Span<float const> predts, std::shared_ptr<DeviceAUCCa
 template <typename Fn>
 std::tuple<double, double, double>
 GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
-             int32_t device, common::Span<size_t const> d_sorted_idx,
+             DeviceOrd device, common::Span<size_t const> d_sorted_idx,
             Fn area_fn, std::shared_ptr<DeviceAUCCache> cache) {
  auto labels = info.labels.View(device);
  auto weights = info.weights_.ConstDeviceSpan();
-  dh::safe_cuda(cudaSetDevice(device));
+  dh::safe_cuda(cudaSetDevice(device.ordinal));

  CHECK_NE(labels.Size(), 0);
  CHECK_EQ(labels.Size(), predts.size());
@@ -168,7 +168,7 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
 }

 std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const> predts,
-                                                   MetaInfo const &info, std::int32_t device,
+                                                   MetaInfo const &info, DeviceOrd device,
                                                   std::shared_ptr<DeviceAUCCache> *p_cache) {
  auto &cache = *p_cache;
  InitCacheOnce<false>(predts, p_cache);
@@ -309,9 +309,10 @@ void SegmentedReduceAUC(common::Span<size_t const> d_unique_idx,
 * up each class in all kernels.
 */
 template <bool scale, typename Fn>
-double GPUMultiClassAUCOVR(MetaInfo const &info, int32_t device, common::Span<uint32_t> d_class_ptr,
-                           size_t n_classes, std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
-  dh::safe_cuda(cudaSetDevice(device));
+double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
+                           common::Span<uint32_t> d_class_ptr, size_t n_classes,
+                           std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
  /**
   * Sorted idx
   */
@@ -467,11 +468,12 @@ double GPUMultiClassROCAUC(Context const *ctx, common::Span<float const> predts,
  dh::TemporaryArray<uint32_t> class_ptr(n_classes + 1, 0);
  MultiClassSortedIdx(ctx, predts, dh::ToSpan(class_ptr), cache);

-  auto fn = [] XGBOOST_DEVICE(double fp_prev, double fp, double tp_prev,
-                              double tp, size_t /*class_id*/) {
+  auto fn = [] XGBOOST_DEVICE(double fp_prev, double fp, double tp_prev, double tp,
+                              size_t /*class_id*/) {
    return TrapezoidArea(fp_prev, fp, tp_prev, tp);
  };
-  return GPUMultiClassAUCOVR<true>(info, ctx->gpu_id, dh::ToSpan(class_ptr), n_classes, cache, fn);
+  return GPUMultiClassAUCOVR<true>(info, ctx->Device(), dh::ToSpan(class_ptr), n_classes, cache,
+                                   fn);
 }

 namespace {
@@ -512,7 +514,7 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
  /**
   * Sort the labels
   */
-  auto d_labels = info.labels.View(ctx->gpu_id);
+  auto d_labels = info.labels.View(ctx->Device());

  auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
  common::SegmentedArgSort<false, false>(ctx, d_labels.Values(), d_group_ptr, d_sorted_idx);
@@ -604,7 +606,7 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
 }

 std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> predts,
-                                                  MetaInfo const &info, std::int32_t device,
+                                                  MetaInfo const &info, DeviceOrd device,
                                                  std::shared_ptr<DeviceAUCCache> *p_cache) {
  auto& cache = *p_cache;
  InitCacheOnce<false>(predts, p_cache);
@@ -662,7 +664,7 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
  /**
   * Get total positive/negative
   */
-  auto labels = info.labels.View(ctx->gpu_id);
+  auto labels = info.labels.View(ctx->Device());
  auto n_samples = info.num_row_;
  dh::caching_device_vector<Pair> totals(n_classes);
  auto key_it =
@@ -695,13 +697,13 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
    return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp,
                                  d_totals[class_id].first);
  };
-  return GPUMultiClassAUCOVR<false>(info, ctx->gpu_id, d_class_ptr, n_classes, cache, fn);
+  return GPUMultiClassAUCOVR<false>(info, ctx->Device(), d_class_ptr, n_classes, cache, fn);
 }

 template <typename Fn>
 std::pair<double, uint32_t>
 GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
-                    common::Span<uint32_t> d_group_ptr, int32_t device,
+                    common::Span<uint32_t> d_group_ptr, DeviceOrd device,
                    std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
  /**
   * Sorted idx
@@ -843,7 +845,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
  common::SegmentedArgSort<false, false>(ctx, predts, d_group_ptr, d_sorted_idx);

  dh::XGBDeviceAllocator<char> alloc;
-  auto labels = info.labels.View(ctx->gpu_id);
+  auto labels = info.labels.View(ctx->Device());
  if (thrust::any_of(thrust::cuda::par(alloc), dh::tbegin(labels.Values()),
                     dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
    InvalidLabels();
@@ -882,7 +884,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
    return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp,
                                  d_totals[group_id].first);
  };
-  return GPURankingPRAUCImpl(predts, info, d_group_ptr, ctx->gpu_id, cache, fn);
+  return GPURankingPRAUCImpl(predts, info, d_group_ptr, ctx->Device(), cache, fn);
 }
 }  // namespace metric
 }  // namespace xgboost
--- a/src/metric/auc.h
+++ b/src/metric/auc.h
@@ -30,7 +30,7 @@ XGBOOST_DEVICE inline double TrapezoidArea(double x0, double x1, double y0, doub
 struct DeviceAUCCache;

 std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const> predts,
-                                                   MetaInfo const &info, std::int32_t device,
+                                                   MetaInfo const &info, DeviceOrd,
                                                   std::shared_ptr<DeviceAUCCache> *p_cache);

 double GPUMultiClassROCAUC(Context const *ctx, common::Span<float const> predts,
@@ -45,7 +45,7 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
 * PR AUC *
 **********/
 std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> predts,
-                                                  MetaInfo const &info, std::int32_t device,
+                                                  MetaInfo const &info, DeviceOrd,
                                                  std::shared_ptr<DeviceAUCCache> *p_cache);

 double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -45,7 +45,7 @@ namespace {
 template <typename Fn>
 PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
  PackedReduceResult result;
-  auto labels = info.labels.View(ctx->gpu_id);
+  auto labels = info.labels.View(ctx->Device());
  if (ctx->IsCPU()) {
    auto n_threads = ctx->Threads();
    std::vector<double> score_tloc(n_threads, 0.0);
@@ -183,10 +183,10 @@ class PseudoErrorLoss : public MetricNoCache {

  double Eval(const HostDeviceVector<bst_float>& preds, const MetaInfo& info) override {
    CHECK_EQ(info.labels.Shape(0), info.num_row_);
-    auto labels = info.labels.View(ctx_->gpu_id);
-    preds.SetDevice(ctx_->gpu_id);
+    auto labels = info.labels.View(ctx_->Device());
+    preds.SetDevice(ctx_->Device());
    auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
-    info.weights_.SetDevice(ctx_->gpu_id);
+    info.weights_.SetDevice(ctx_->Device());
    common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
                                                     : info.weights_.ConstDeviceSpan());
    float slope = this->param_.huber_slope;
@@ -349,11 +349,11 @@ struct EvalEWiseBase : public MetricNoCache {
    if (info.labels.Size() != 0) {
      CHECK_NE(info.labels.Shape(1), 0);
    }
-    auto labels = info.labels.View(ctx_->gpu_id);
-    info.weights_.SetDevice(ctx_->gpu_id);
+    auto labels = info.labels.View(ctx_->Device());
+    info.weights_.SetDevice(ctx_->Device());
    common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
                                                     : info.weights_.ConstDeviceSpan());
-    preds.SetDevice(ctx_->gpu_id);
+    preds.SetDevice(ctx_->Device());
    auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();

    auto d_policy = policy_;
@@ -444,16 +444,16 @@ class QuantileError : public MetricNoCache {
    }

    auto const* ctx = ctx_;
-    auto y_true = info.labels.View(ctx->gpu_id);
-    preds.SetDevice(ctx->gpu_id);
-    alpha_.SetDevice(ctx->gpu_id);
+    auto y_true = info.labels.View(ctx->Device());
+    preds.SetDevice(ctx->Device());
+    alpha_.SetDevice(ctx->Device());
    auto alpha = ctx->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
    std::size_t n_targets = preds.Size() / info.num_row_ / alpha_.Size();
    CHECK_NE(n_targets, 0);
    auto y_predt = linalg::MakeTensorView(ctx, &preds, static_cast<std::size_t>(info.num_row_),
                                          alpha_.Size(), n_targets);

-    info.weights_.SetDevice(ctx->gpu_id);
+    info.weights_.SetDevice(ctx->Device());
    common::OptionalWeights weight{ctx->IsCPU() ? info.weights_.ConstHostSpan()
                                                : info.weights_.ConstDeviceSpan()};

--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -75,7 +75,7 @@ struct EvalAMS : public MetricNoCache {
    const double br = 10.0;
    unsigned thresindex = 0;
    double s_tp = 0.0, b_fp = 0.0, tams = 0.0;
-    const auto& labels = info.labels.View(Context::kCpuId);
+    const auto& labels = info.labels.View(DeviceOrd::CPU());
    for (unsigned i = 0; i < static_cast<unsigned>(ndata-1) && i < ntop; ++i) {
      const unsigned ridx = rec[i].second;
      const bst_float wt = info.GetWeight(ridx);
@@ -134,7 +134,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig {
    std::vector<double> sum_tloc(ctx_->Threads(), 0.0);

    {
-      const auto& labels = info.labels.View(Context::kCpuId);
+      const auto& labels = info.labels.HostView();
      const auto &h_preds = preds.ConstHostVector();

      dmlc::OMPException exc;
--- a/src/metric/rank_metric.cu
+++ b/src/metric/rank_metric.cu
@@ -33,7 +33,7 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info,
                            HostDeviceVector<float> const &predt,
                            std::shared_ptr<ltr::PreCache> p_cache) {
  auto d_gptr = p_cache->DataGroupPtr(ctx);
-  auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
+  auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);

  predt.SetDevice(ctx->gpu_id);
  auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
@@ -89,7 +89,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
  if (!d_weight.Empty()) {
    CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
  }
-  auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
+  auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
  predt.SetDevice(ctx->gpu_id);
  auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());

@@ -119,9 +119,9 @@ PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
                            HostDeviceVector<float> const &predt, bool minus,
                            std::shared_ptr<ltr::MAPCache> p_cache) {
  auto d_group_ptr = p_cache->DataGroupPtr(ctx);
-  auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
+  auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);

-  predt.SetDevice(ctx->gpu_id);
+  predt.SetDevice(ctx->Device());
  auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
  auto key_it = dh::MakeTransformIterator<std::size_t>(
      thrust::make_counting_iterator(0ul),