initial merge

2023-03-25 04:31:55 +01:00
parent d97be6f396 cff50fe3ef
commit 7fbc561e17
146 changed files with 6730 additions and 4082 deletions
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -20,23 +20,51 @@
 //   corresponding headers that brings in those function declaration can't be included with CUDA).
 //   This precludes the CPU and GPU logic to coexist inside a .cu file

+#include "rank_metric.h"
+
+#include <dmlc/omp.h>
 #include <dmlc/registry.h>
-#include <xgboost/metric.h>

-#include <cmath>
-#include <vector>
+#include <algorithm>                         // for stable_sort, copy, fill_n, min, max
+#include <array>                             // for array
+#include <cmath>                             // for log, sqrt
+#include <cstddef>                           // for size_t, std
+#include <cstdint>                           // for uint32_t
+#include <functional>                        // for less, greater
+#include <map>                               // for operator!=, _Rb_tree_const_iterator
+#include <memory>                            // for allocator, unique_ptr, shared_ptr, __shared_...
+#include <numeric>                           // for accumulate
+#include <ostream>                           // for operator<<, basic_ostream, ostringstream
+#include <string>                            // for char_traits, operator<, basic_string, to_string
+#include <utility>                           // for pair, make_pair
+#include <vector>                            // for vector

-#include "../collective/communicator-inl.h"
-#include "../common/algorithm.h"  // Sort
-#include "../common/math.h"
-#include "../common/ranking_utils.h"  // MakeMetricName
-#include "../common/threading_utils.h"
-#include "metric_common.h"
-#include "xgboost/host_device_vector.h"
+#include "../collective/communicator-inl.h"  // for IsDistributed, Allreduce
+#include "../collective/communicator.h"      // for Operation
+#include "../common/algorithm.h"             // for ArgSort, Sort
+#include "../common/linalg_op.h"             // for cbegin, cend
+#include "../common/math.h"                  // for CmpFirst
+#include "../common/optional_weight.h"       // for OptionalWeights, MakeOptionalWeights
+#include "../common/ranking_utils.h"         // for LambdaRankParam, NDCGCache, ParseMetricName
+#include "../common/threading_utils.h"       // for ParallelFor
+#include "../common/transform_iterator.h"    // for IndexTransformIter
+#include "dmlc/common.h"                     // for OMPException
+#include "metric_common.h"                   // for MetricNoCache, GPUMetric, PackedReduceResult
+#include "xgboost/base.h"                    // for bst_float, bst_omp_uint, bst_group_t, Args
+#include "xgboost/cache.h"                   // for DMatrixCache
+#include "xgboost/context.h"                 // for Context
+#include "xgboost/data.h"                    // for MetaInfo, DMatrix
+#include "xgboost/host_device_vector.h"      // for HostDeviceVector
+#include "xgboost/json.h"                    // for Json, FromJson, IsA, ToJson, get, Null, Object
+#include "xgboost/linalg.h"                  // for Tensor, TensorView, Range, VectorView, MakeT...
+#include "xgboost/logging.h"                 // for CHECK, ConsoleLogger, LOG_INFO, CHECK_EQ
+#include "xgboost/metric.h"                  // for MetricReg, XGBOOST_REGISTER_METRIC, Metric
+#include "xgboost/span.h"                    // for Span, operator!=
+#include "xgboost/string_view.h"             // for StringView

 namespace {

-using PredIndPair = std::pair<xgboost::bst_float, uint32_t>;
+using PredIndPair = std::pair<xgboost::bst_float, xgboost::ltr::rel_degree_t>;
 using PredIndPairContainer = std::vector<PredIndPair>;

 /*
@@ -87,8 +115,7 @@ class PerGroupWeightPolicy {

 }  // anonymous namespace

-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {
 // tag the this file, used by force static link later.
 DMLC_REGISTRY_FILE_TAG(rank_metric);

@@ -257,71 +284,6 @@ struct EvalPrecision : public EvalRank {
  }
 };

-/*! \brief NDCG: Normalized Discounted Cumulative Gain at N */
-struct EvalNDCG : public EvalRank {
- private:
-  double CalcDCG(const PredIndPairContainer &rec) const {
-    double sumdcg = 0.0;
-    for (size_t i = 0; i < rec.size() && i < this->topn; ++i) {
-      const unsigned rel = rec[i].second;
-      if (rel != 0) {
-        sumdcg += ((1 << rel) - 1) / std::log2(i + 2.0);
-      }
-    }
-    return sumdcg;
-  }
-
- public:
-  explicit EvalNDCG(const char* name, const char* param) : EvalRank(name, param) {}
-
-  double EvalGroup(PredIndPairContainer *recptr) const override {
-    PredIndPairContainer &rec(*recptr);
-    std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
-    double dcg = CalcDCG(rec);
-    std::stable_sort(rec.begin(), rec.end(), common::CmpSecond);
-    double idcg = CalcDCG(rec);
-    if (idcg == 0.0f) {
-      if (this->minus) {
-        return 0.0f;
-      } else {
-        return 1.0f;
-      }
-    }
-    return dcg/idcg;
-  }
-};
-
-/*! \brief Mean Average Precision at N, for both classification and rank */
-struct EvalMAP : public EvalRank {
- public:
-  explicit EvalMAP(const char* name, const char* param) : EvalRank(name, param) {}
-
-  double EvalGroup(PredIndPairContainer *recptr) const override {
-    PredIndPairContainer &rec(*recptr);
-    std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
-    unsigned nhits = 0;
-    double sumap = 0.0;
-    for (size_t i = 0; i < rec.size(); ++i) {
-      if (rec[i].second != 0) {
-        nhits += 1;
-        if (i < this->topn) {
-          sumap += static_cast<double>(nhits) / (i + 1);
-        }
-      }
-    }
-    if (nhits != 0) {
-      sumap /= nhits;
-      return sumap;
-    } else {
-      if (this->minus) {
-        return 0.0;
-      } else {
-        return 1.0;
-      }
-    }
-  }
-};
-
 /*! \brief Cox: Partial likelihood of the Cox proportional hazards model */
 struct EvalCox : public MetricNoCache {
 public:
@@ -377,16 +339,213 @@ XGBOOST_REGISTER_METRIC(Precision, "pre")
 .describe("precision@k for rank.")
 .set_body([](const char* param) { return new EvalPrecision("pre", param); });

-XGBOOST_REGISTER_METRIC(NDCG, "ndcg")
-.describe("ndcg@k for rank.")
-.set_body([](const char* param) { return new EvalNDCG("ndcg", param); });
-
-XGBOOST_REGISTER_METRIC(MAP, "map")
-.describe("map@k for rank.")
-.set_body([](const char* param) { return new EvalMAP("map", param); });
-
 XGBOOST_REGISTER_METRIC(Cox, "cox-nloglik")
 .describe("Negative log partial likelihood of Cox proportional hazards model.")
 .set_body([](const char*) { return new EvalCox(); });
-}  // namespace metric
-}  // namespace xgboost
+
+// Base class for ranking metrics whose evaluation relies on a per-DMatrix cache.
+template <typename Cache>
+class EvalRankWithCache : public Metric {
+ protected:
+  // Ranking parameters owned by the metric itself, see Configure().
+  ltr::LambdaRankParam param_;
+  // Output flag of ltr::ParseMetricName, consumed by the derived metrics.
+  bool minus_{false};
+  // Metric name as returned by ltr::ParseMetricName.
+  std::string name_;
+
+  // Cache entries looked up by the DMatrix being evaluated.
+  DMatrixCache<Cache> cache_{DMatrixCache<Cache>::DefaultSize()};
+
+ public:
+  /**
+   * \brief Parse the metric name and, when a truncation level is given, switch the ranking
+   *        parameter to the top-k pair method using that level.
+   *
+   * \param name  Base name of the metric.
+   * \param param Parameter string of the metric, forwarded to ltr::ParseMetricName.
+   */
+  EvalRankWithCache(StringView name, const char* param) {
+    auto constexpr kNotSet = ltr::LambdaRankParam::NotSet();
+    std::uint32_t top_k{kNotSet};
+    name_ = ltr::ParseMetricName(name, param, &top_k, &minus_);
+    bool const has_truncation = top_k != kNotSet;
+    if (has_truncation) {
+      Args const topk_args{{"lambdarank_num_pair_per_sample", std::to_string(top_k)},
+                           {"lambdarank_pair_method", "topk"}};
+      param_.UpdateAllowUnknown(topk_args);
+    }
+    // Always run one update with empty arguments, even when no truncation level was parsed.
+    param_.UpdateAllowUnknown(Args{});
+  }
+  void Configure(Args const&) override {
+    // Intentionally empty: configuring here would force the ndcg param to be the same as the
+    // one used by the objective.
+  }
+  /**
+   * \brief Restore the ranking parameter from the "lambdarank_param" entry, if there is one.
+   */
+  void LoadConfig(Json const& in) override {
+    if (IsA<Null>(in)) {
+      return;
+    }
+    auto const& config = get<Object const>(in);
+    if (auto iter = config.find("lambdarank_param"); iter != config.cend()) {
+      FromJson(iter->second, &param_);
+    }
+  }
+
+  /**
+   * \brief Store the metric name and the ranking parameter in the output JSON object.
+   */
+  void SaveConfig(Json* p_out) const override {
+    Json& config = *p_out;
+    config["name"] = String{this->Name()};
+    config["lambdarank_param"] = ToJson(param_);
+  }
+
+  /**
+   * \brief Fetch the cache entry for the input DMatrix and dispatch to Eval().
+   *
+   *     The entry is reset when it was created with a parameter different from the one held
+   *     by this metric.
+   */
+  double Evaluate(HostDeviceVector<float> const& preds, std::shared_ptr<DMatrix> p_fmat) override {
+    auto const& meta = p_fmat->Info();
+    auto cached = cache_.CacheItem(p_fmat, ctx_, meta, param_);
+    if (cached->Param() != param_) {
+      cached = cache_.ResetItem(p_fmat, ctx_, meta, param_);
+    }
+    CHECK(cached->Param() == param_);
+    CHECK_EQ(preds.Size(), meta.labels.Size());
+
+    return this->Eval(preds, meta, cached);
+  }
+
+  /**
+   * \brief Metric specific evaluation, implemented by the derived classes.
+   */
+  virtual double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info,
+                      std::shared_ptr<Cache> p_cache) = 0;
+};
+
+namespace {
+/**
+ * \brief Sum the score and the weight over all workers, then normalize the score.
+ *
+ * \param score Sum of the (weighted) per-group scores accumulated by this worker.
+ * \param sw    Sum of the group weights accumulated by this worker.
+ *
+ * \return The reduced score divided by the reduced weight when that weight is positive,
+ *         clipped to at most 1.0.
+ */
+double Finalize(double score, double sw) {
+  std::array<double, 2> dat{score, sw};
+  collective::Allreduce<collective::Operation::kSum>(dat.data(), dat.size());
+  // The reduction is written into `dat`. Copy the result back, otherwise the division below
+  // would only see the values of the local worker.
+  score = dat[0];
+  sw = dat[1];
+  if (sw > 0.0) {
+    score = score / sw;
+  }
+
+  CHECK_LE(score, 1.0 + kRtEps)
+      << "Invalid output score, might be caused by invalid query group weight.";
+  // Values within the tolerance accepted by the check above are clipped to 1.0.
+  score = std::min(1.0, score);
+
+  return score;
+}
+}  // namespace
+
+/**
+ * \brief Implement the NDCG score function for learning to rank.
+ *
+ *     Ties are ignored, which can lead to results that differ from other implementations.
+ */
+class EvalNDCG : public EvalRankWithCache<ltr::NDCGCache> {
+ public:
+  using EvalRankWithCache::EvalRankWithCache;
+  const char* Name() const override { return name_.c_str(); }
+
+  /**
+   * \brief Compute the weighted NDCG sum over all query groups and normalize it with Finalize.
+   *
+   * \param preds   Prediction scores.
+   * \param info    Meta info providing the labels and the optional query group weights.
+   * \param p_cache Cache entry providing the group pointers, the inverse IDCG, the discounts
+   *                and the per-group output buffer.
+   *
+   * \return The value produced by Finalize from the accumulated score and weight.
+   */
+  double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info,
+              std::shared_ptr<ltr::NDCGCache> p_cache) override {
+    if (ctx_->IsCUDA()) {
+      auto ndcg = cuda_impl::NDCGScore(ctx_, info, preds, minus_, p_cache);
+      return Finalize(ndcg.Residue(), ndcg.Weights());
+    }
+
+    // group local ndcg
+    auto group_ptr = p_cache->DataGroupPtr(ctx_);
+    bst_group_t n_groups = group_ptr.size() - 1;
+    auto ndcg_gloc = p_cache->Dcg(ctx_);
+    // The output buffer lives in the cache, clear the values of the previous evaluation.
+    std::fill_n(ndcg_gloc.Values().data(), ndcg_gloc.Size(), 0.0);
+
+    auto h_inv_idcg = p_cache->InvIDCG(ctx_);
+    auto p_discount = p_cache->Discount(ctx_).data();
+
+    auto h_label = info.labels.HostView();
+    auto h_predt = linalg::MakeTensorView(ctx_, &preds, preds.Size());
+    auto weights = common::MakeOptionalWeights(ctx_, info.weights_);
+    // Weights are indexed by query group below. Validate the count like the CUDA
+    // implementation and the MAP metric do.
+    if (!weights.Empty()) {
+      CHECK_EQ(weights.weights.size(), p_cache->Groups());
+    }
+
+    common::ParallelFor(n_groups, ctx_->Threads(), [&](auto g) {
+      auto g_predt = h_predt.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]));
+      auto g_labels = h_label.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]), 0);
+      // Positions within the group, ordered with std::greater on the prediction.
+      auto sorted_idx = common::ArgSort<std::size_t>(ctx_, linalg::cbegin(g_predt),
+                                                     linalg::cend(g_predt), std::greater<>{});
+      double ndcg{.0};
+      double inv_idcg = h_inv_idcg(g);
+      if (inv_idcg <= 0.0) {
+        // No usable ideal DCG for this group, fall back to the constant selected by `minus_`.
+        ndcg_gloc(g) = minus_ ? 0.0 : 1.0;
+        return;
+      }
+      // Only the first TopK() positions contribute to the score.
+      std::size_t n{std::min(sorted_idx.size(), static_cast<std::size_t>(param_.TopK()))};
+      if (param_.ndcg_exp_gain) {
+        for (std::size_t i = 0; i < n; ++i) {
+          ndcg += p_discount[i] * ltr::CalcDCGGain(g_labels(sorted_idx[i])) * inv_idcg;
+        }
+      } else {
+        // The label itself is used as the gain.
+        for (std::size_t i = 0; i < n; ++i) {
+          ndcg += p_discount[i] * g_labels(sorted_idx[i]) * inv_idcg;
+        }
+      }
+      ndcg_gloc(g) += ndcg * weights[g];
+    });
+    double sum_w{0};
+    if (weights.Empty()) {
+      // Without explicit weights the number of groups is used as the total weight.
+      sum_w = n_groups;
+    } else {
+      sum_w = std::accumulate(weights.weights.cbegin(), weights.weights.cend(), 0.0);
+    }
+    auto ndcg = std::accumulate(linalg::cbegin(ndcg_gloc), linalg::cend(ndcg_gloc), 0.0);
+    return Finalize(ndcg, sum_w);
+  }
+};
+
+/**
+ * \brief Mean average precision (MAP) metric for learning to rank.
+ *
+ *     The CUDA path delegates to cuda_impl::MAPScore. The CPU path computes one score per
+ *     query group and combines the groups using the optional query group weights.
+ */
+class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
+ public:
+  using EvalRankWithCache::EvalRankWithCache;
+  const char* Name() const override { return name_.c_str(); }
+
+  double Eval(HostDeviceVector<float> const& predt, MetaInfo const& info,
+              std::shared_ptr<ltr::MAPCache> p_cache) override {
+    if (ctx_->IsCUDA()) {
+      auto device_result = cuda_impl::MAPScore(ctx_, info, predt, minus_, p_cache);
+      return Finalize(device_result.Residue(), device_result.Weights());
+    }
+
+    auto group_ptr = p_cache->DataGroupPtr(ctx_);
+    auto labels = info.labels.HostView().Slice(linalg::All(), 0);
+    auto scores = linalg::MakeTensorView(ctx_, &predt, predt.Size());
+
+    // Per-group output buffer owned by the cache, cleared before accumulation.
+    auto group_map = p_cache->Map(ctx_);
+    std::fill_n(group_map.data(), group_map.size(), 0.0);
+    auto sorted_idx = p_cache->SortedIdx(ctx_, predt.ConstHostSpan());
+
+    common::ParallelFor(p_cache->Groups(), ctx_->Threads(), [&](auto g) {
+      auto g_scores = scores.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]));
+      auto g_labels = labels.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]));
+      auto g_sorted = sorted_idx.subspan(group_ptr[g]);
+
+      auto const n_top = std::min(static_cast<std::size_t>(param_.TopK()), g_labels.Size());
+      // Running sum of the labels in ranked order. Only the first `n_top` positions add to
+      // the score, but every position adds to the hit count.
+      double hits{0.0};
+      for (std::size_t pos = 0; pos < g_labels.Size(); ++pos) {
+        auto rel = g_labels(g_sorted[pos]);
+        hits += rel;
+        if (pos < n_top) {
+          group_map[g] += hits / static_cast<double>((pos + 1)) * rel;
+        }
+      }
+      if (!(hits > 0.0)) {
+        // No hits in this group, use the constant selected by `minus_`.
+        group_map[g] = minus_ ? 0.0 : 1.0;
+        return;
+      }
+      group_map[g] /= std::min(hits, static_cast<double>(param_.TopK()));
+    });
+
+    auto group_weights = common::MakeOptionalWeights(ctx_, info.weights_);
+    if (!group_weights.Empty()) {
+      CHECK_EQ(group_weights.weights.size(), p_cache->Groups());
+    }
+    // Scale every group score by its weight in place and collect the total weight.
+    auto total_weight = 0.0;
+    for (std::size_t g = 0; g < group_map.size(); ++g) {
+      group_map[g] *= group_weights[g];
+      total_weight += group_weights[g];
+    }
+    auto total_score = std::accumulate(group_map.cbegin(), group_map.cend(), 0.0);
+    return Finalize(total_score, total_weight);
+  }
+};
+
+// Factory for the cache based MAP metric, registered under the name "map".
+XGBOOST_REGISTER_METRIC(EvalMAP, "map")
+    .describe("map@k for ranking.")
+    .set_body([](char const* param) { return new EvalMAPScore{"map", param}; });
+
+// Factory for the cache based NDCG metric, registered under the name "ndcg".
+XGBOOST_REGISTER_METRIC(EvalNDCG, "ndcg")
+    .describe("ndcg@k for ranking.")
+    .set_body([](char const* param) { return new EvalNDCG{"ndcg", param}; });
+}  // namespace xgboost::metric
--- a/src/metric/rank_metric.cu
+++ b/src/metric/rank_metric.cu
@@ -2,22 +2,29 @@
 * Copyright 2020-2023 by XGBoost Contributors
 */
 #include <dmlc/registry.h>
-#include <thrust/iterator/counting_iterator.h>  // make_counting_iterator
-#include <thrust/reduce.h>                      // reduce
-#include <xgboost/metric.h>
+#include <thrust/iterator/counting_iterator.h>  // for make_counting_iterator
+#include <thrust/reduce.h>                      // for reduce

-#include <cstddef>                       // std::size_t
-#include <memory>                        // std::shared_ptr
+#include <algorithm>                            // for transform
+#include <cstddef>                              // for size_t
+#include <memory>                               // for shared_ptr
+#include <vector>                               // for vector

-#include "../common/cuda_context.cuh"    // CUDAContext
+#include "../common/cuda_context.cuh"           // for CUDAContext
+#include "../common/device_helpers.cuh"         // for MakeTransformIterator
+#include "../common/optional_weight.h"          // for MakeOptionalWeights
+#include "../common/ranking_utils.cuh"          // for CalcQueriesDCG, NDCGCache
 #include "metric_common.h"
-#include "xgboost/base.h"                // XGBOOST_DEVICE
-#include "xgboost/context.h"             // Context
-#include "xgboost/data.h"                // MetaInfo
-#include "xgboost/host_device_vector.h"  // HostDeviceVector
+#include "rank_metric.h"
+#include "xgboost/base.h"                // for XGBOOST_DEVICE
+#include "xgboost/context.h"             // for Context
+#include "xgboost/data.h"                // for MetaInfo
+#include "xgboost/host_device_vector.h"  // for HostDeviceVector
+#include "xgboost/linalg.h"              // for MakeTensorView
+#include "xgboost/logging.h"             // for CHECK
+#include "xgboost/metric.h"

-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {
 // tag the this file, used by force static link later.
 DMLC_REGISTRY_FILE_TAG(rank_metric_gpu);

@@ -134,200 +141,125 @@ struct EvalPrecisionGpu {
  }
 };

-/*! \brief NDCG: Normalized Discounted Cumulative Gain at N */
-struct EvalNDCGGpu {
- public:
-  static void ComputeDCG(const dh::SegmentSorter<float> &pred_sorter,
-                         const float *dlabels,
-                         const EvalRankConfig &ecfg,
-                         // The order in which labels have to be accessed. The order is determined
-                         // by sorting the predictions or the labels for the entire dataset
-                         const xgboost::common::Span<const uint32_t> &dlabels_sort_order,
-                         dh::caching_device_vector<double> *dcgptr) {
-    dh::caching_device_vector<double> &dcgs(*dcgptr);
-    // Group info on device
-    const auto &dgroups = pred_sorter.GetGroupsSpan();
-    const auto &dgroup_idx = pred_sorter.GetGroupSegmentsSpan();
-
-    // First, determine non zero labels in the dataset individually
-    auto DetermineNonTrivialLabelLambda = [=] __device__(uint32_t idx) {
-      return (static_cast<unsigned>(dlabels[dlabels_sort_order[idx]]));
-    };  // NOLINT
-
-    // Find each group's DCG value
-    const auto nitems = pred_sorter.GetNumItems();
-    auto *ddcgs = dcgs.data().get();
-
-    int device_id = -1;
-
-#if defined(XGBOOST_USE_CUDA)
-    dh::safe_cuda(cudaGetDevice(&device_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipGetDevice(&device_id));
-#endif
-
-    // For each group item compute the aggregated precision
-    dh::LaunchN(nitems, nullptr, [=] __device__(uint32_t idx) {
-      const auto group_idx = dgroup_idx[idx];
-      const auto group_begin = dgroups[group_idx];
-      const auto ridx = idx - group_begin;
-      auto label = DetermineNonTrivialLabelLambda(idx);
-      if (ridx < ecfg.topn && label) {
-        atomicAdd(&ddcgs[group_idx], ((1 << label) - 1) / std::log2(ridx + 2.0));
-      }
-    });
-  }
-
-  static double EvalMetric(const dh::SegmentSorter<float> &pred_sorter,
-                           const float *dlabels,
-                           const EvalRankConfig &ecfg) {
-    // Sort the labels and compute IDCG
-    dh::SegmentSorter<float> segment_label_sorter;
-    segment_label_sorter.SortItems(dlabels, pred_sorter.GetNumItems(),
-                                   pred_sorter.GetGroupSegmentsSpan());
-
-    uint32_t ngroups = pred_sorter.GetNumGroups();
-
-    dh::caching_device_vector<double> idcg(ngroups, 0);
-    ComputeDCG(pred_sorter, dlabels, ecfg, segment_label_sorter.GetOriginalPositionsSpan(), &idcg);
-
-    // Compute the DCG values next
-    dh::caching_device_vector<double> dcg(ngroups, 0);
-    ComputeDCG(pred_sorter, dlabels, ecfg, pred_sorter.GetOriginalPositionsSpan(), &dcg);
-
-    double *ddcg = dcg.data().get();
-    double *didcg = idcg.data().get();
-
-    int device_id = -1;
-
-#if defined(XGBOOST_USE_CUDA)
-    dh::safe_cuda(cudaGetDevice(&device_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipGetDevice(&device_id));
-#endif
-
-    // Compute the group's DCG and reduce it across all groups
-    dh::LaunchN(ngroups, nullptr, [=] __device__(uint32_t gidx) {
-      if (didcg[gidx] == 0.0f) {
-        ddcg[gidx] = (ecfg.minus) ? 0.0f : 1.0f;
-      } else {
-        ddcg[gidx] /= didcg[gidx];
-      }
-    });
-
-    // Allocator to be used for managing space overhead while performing reductions
-    dh::XGBCachingDeviceAllocator<char> alloc;
-
-#if defined(XGBOOST_USE_CUDA)
-    return thrust::reduce(thrust::cuda::par(alloc), dcg.begin(), dcg.end());
-#elif defined(XGBOOST_USE_HIP)
-    return thrust::reduce(thrust::hip::par(alloc), dcg.begin(), dcg.end());
-#endif
-  }
-};
-
-/*! \brief Mean Average Precision at N, for both classification and rank */
-struct EvalMAPGpu {
- public:
-  static double EvalMetric(const dh::SegmentSorter<float> &pred_sorter,
-                           const float *dlabels,
-                           const EvalRankConfig &ecfg) {
-    // Group info on device
-    const auto &dgroups = pred_sorter.GetGroupsSpan();
-    const auto ngroups = pred_sorter.GetNumGroups();
-    const auto &dgroup_idx = pred_sorter.GetGroupSegmentsSpan();
-
-    // Original positions of the predictions after they have been sorted
-    const auto &dpreds_orig_pos = pred_sorter.GetOriginalPositionsSpan();
-
-    // First, determine non zero labels in the dataset individually
-    const auto nitems = pred_sorter.GetNumItems();
-    dh::caching_device_vector<uint32_t> hits(nitems, 0);
-    auto DetermineNonTrivialLabelLambda = [=] __device__(uint32_t idx) {
-      return (static_cast<unsigned>(dlabels[dpreds_orig_pos[idx]]) != 0) ? 1 : 0;
-    };  // NOLINT
-
-    thrust::transform(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
-                      thrust::make_counting_iterator(nitems),
-                      hits.begin(),
-                      DetermineNonTrivialLabelLambda);
-
-    // Allocator to be used by sort for managing space overhead while performing prefix scans
-    dh::XGBCachingDeviceAllocator<char> alloc;
-
-    // Next, prefix scan the nontrivial labels that are segmented to accumulate them.
-    // This is required for computing the metric sum
-    // Data segmented into different groups...
-#if defined(XGBOOST_USE_CUDA)
-    thrust::inclusive_scan_by_key(thrust::cuda::par(alloc),
-                                  dh::tcbegin(dgroup_idx), dh::tcend(dgroup_idx),
-                                  hits.begin(),  // Input value
-                                  hits.begin());  // In-place scan
-#elif defined(XGBOOST_USE_HIP)
-    thrust::inclusive_scan_by_key(thrust::hip::par(alloc),
-                                  dh::tcbegin(dgroup_idx), dh::tcend(dgroup_idx),
-                                  hits.begin(),  // Input value
-                                  hits.begin());  // In-place scan
-#endif
-
-    // Find each group's metric sum
-    dh::caching_device_vector<double> sumap(ngroups, 0);
-    auto *dsumap = sumap.data().get();
-    const auto *dhits = hits.data().get();
-
-    int device_id = -1;
-
-#if defined(XGBOOST_USE_CUDA)
-    dh::safe_cuda(cudaGetDevice(&device_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipGetDevice(&device_id));
-#endif
-
-    // For each group item compute the aggregated precision
-    dh::LaunchN(nitems, nullptr, [=] __device__(uint32_t idx) {
-      if (DetermineNonTrivialLabelLambda(idx)) {
-        const auto group_idx = dgroup_idx[idx];
-        const auto group_begin = dgroups[group_idx];
-        const auto ridx = idx - group_begin;
-        if (ridx < ecfg.topn) {
-          atomicAdd(&dsumap[group_idx],
-                    static_cast<double>(dhits[idx]) / (ridx + 1));
-        }
-      }
-    });
-
-    // Aggregate the group's item precisions
-    dh::LaunchN(ngroups, nullptr, [=] __device__(uint32_t gidx) {
-      auto nhits = dgroups[gidx + 1] ? dhits[dgroups[gidx + 1] - 1] : 0;
-      if (nhits != 0) {
-        dsumap[gidx] /= nhits;
-      } else {
-        if (ecfg.minus) {
-          dsumap[gidx] = 0;
-        } else {
-          dsumap[gidx] = 1;
-        }
-      }
-    });
-
-#if defined(XGBOOST_USE_CUDA)
-    return thrust::reduce(thrust::cuda::par(alloc), sumap.begin(), sumap.end());
-#elif defined(XGBOOST_USE_HIP)
-    return thrust::reduce(thrust::hip::par(alloc), sumap.begin(), sumap.end());
-#endif
-  }
-};
-
 XGBOOST_REGISTER_GPU_METRIC(PrecisionGpu, "pre")
 .describe("precision@k for rank computed on GPU.")
 .set_body([](const char* param) { return new EvalRankGpu<EvalPrecisionGpu>("pre", param); });

-XGBOOST_REGISTER_GPU_METRIC(NDCGGpu, "ndcg")
-.describe("ndcg@k for rank computed on GPU.")
-.set_body([](const char* param) { return new EvalRankGpu<EvalNDCGGpu>("ndcg", param); });
+namespace cuda_impl {
+/**
+ * \brief Device implementation of the NDCG metric.
+ *
+ * \return The reduction of the per-group (score, weight) pairs. Normalization is left to
+ *         the caller.
+ */
+PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
+                             HostDeviceVector<float> const &predt, bool minus,
+                             std::shared_ptr<ltr::NDCGCache> p_cache) {
+  CHECK(p_cache);

-XGBOOST_REGISTER_GPU_METRIC(MAPGpu, "map")
-.describe("map@k for rank computed on GPU.")
-.set_body([](const char* param) { return new EvalRankGpu<EvalMAPGpu>("map", param); });
-}  // namespace metric
-}  // namespace xgboost
+  auto const &param = p_cache->Param();
+  auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
+  if (!d_weight.Empty()) {
+    CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
+  }
+  auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
+  predt.SetDevice(ctx->gpu_id);
+  auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());
+
+  auto d_group_ptr = p_cache->DataGroupPtr(ctx);
+
+  auto d_inv_idcg = p_cache->InvIDCG(ctx);
+  auto d_sorted_idx = p_cache->SortedIdx(ctx, d_predt.Values());
+  auto d_out_dcg = p_cache->Dcg(ctx);
+
+  // Write the DCG of every query group into the cache owned buffer.
+  ltr::cuda_impl::CalcQueriesDCG(ctx, d_label, d_sorted_idx, param.ndcg_exp_gain, d_group_ptr,
+                                 param.TopK(), d_out_dcg);
+  // Turn every group into a (score, weight) pair that can be summed by the reduction.
+  auto score_it = dh::MakeTransformIterator<PackedReduceResult>(
+      thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t g) {
+        auto const weight = static_cast<double>(d_weight[g]);
+        if (d_inv_idcg(g) <= 0.0) {
+          // No usable ideal DCG, fall back to the constant selected by `minus`.
+          return PackedReduceResult{minus ? 0.0 : 1.0, weight};
+        }
+        return PackedReduceResult{d_out_dcg(g) * d_inv_idcg(g) * d_weight[g], weight};
+      });
+  return thrust::reduce(ctx->CUDACtx()->CTP(), score_it, score_it + d_out_dcg.Size(),
+                        PackedReduceResult{0.0, 0.0});
+}
+
+/**
+ * \brief Device implementation of the MAP metric.
+ *
+ *     Runs in three steps: a per-group inclusive scan of the labels in ranked order, a
+ *     segmented sum of the per-position contributions, and a final reduction of the
+ *     per-group (score, weight) pairs.
+ *
+ * \return The reduction of the per-group (score, weight) pairs. Normalization is left to
+ *         the caller.
+ */
+PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
+                            HostDeviceVector<float> const &predt, bool minus,
+                            std::shared_ptr<ltr::MAPCache> p_cache) {
+  auto d_gptr = p_cache->DataGroupPtr(ctx);
+  auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
+
+  predt.SetDevice(ctx->gpu_id);
+  auto d_sorted_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
+  // Maps a sample index to the index of the query group containing it.
+  auto group_it = dh::MakeTransformIterator<std::size_t>(
+      thrust::make_counting_iterator(0ul),
+      [=] XGBOOST_DEVICE(std::size_t i) { return dh::SegmentId(d_gptr, i); });
+
+  // Label of the sample found at ranked position `i`, where `i` is a global index.
+  auto ranked_label = [=] XGBOOST_DEVICE(std::size_t i) {
+    auto g = group_it[i];
+    auto g_begin = d_gptr[g];
+    auto g_end = d_gptr[g + 1];
+    std::size_t const pos = i - g_begin;
+    auto g_label = d_label.Slice(linalg::Range(g_begin, g_end));
+    auto g_sorted = d_sorted_idx.subspan(g_begin, g_end - g_begin);
+    return g_label(g_sorted[pos]);
+  };
+  auto label_it =
+      dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), ranked_label);
+
+  auto cuctx = ctx->CUDACtx();
+  // Step 1: running sum of the ranked labels, restarted at every group boundary.
+  auto d_n_rel = p_cache->NumRelevant(ctx);
+  thrust::inclusive_scan_by_key(cuctx->CTP(), group_it, group_it + d_label.Size(), label_it,
+                                d_n_rel.data());
+
+  double top_k = p_cache->Param().TopK();
+  auto d_map = p_cache->Map(ctx);
+  thrust::fill_n(cuctx->CTP(), d_map.data(), d_map.size(), 0.0);
+  {
+    // Step 2: contribution of every ranked position, zero beyond the truncation level.
+    auto contrib_it = dh::MakeTransformIterator<double>(
+        thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) {
+          auto g = group_it[i];
+          auto g_begin = d_gptr[g];
+          auto g_end = d_gptr[g + 1];
+          std::size_t const pos = i - g_begin;
+          if (pos >= top_k) {
+            return 0.0;
+          }
+
+          auto g_label = d_label.Slice(linalg::Range(g_begin, g_end));
+          auto g_sorted = d_sorted_idx.subspan(g_begin, g_end - g_begin);
+          auto label = g_label(g_sorted[pos]);
+
+          auto g_n_rel = d_n_rel.subspan(g_begin, g_end - g_begin);
+          auto n_hits = g_n_rel[pos];
+          return n_hits / static_cast<double>(pos + 1) * label;
+        });
+
+    // First call only queries the temporary storage size, second call runs the reduction.
+    std::size_t temp_bytes;
+    cub::DeviceSegmentedReduce::Sum(nullptr, temp_bytes, contrib_it, d_map.data(),
+                                    p_cache->Groups(), d_gptr.data(), d_gptr.data() + 1,
+                                    cuctx->Stream());
+    dh::TemporaryArray<char> temp_storage(temp_bytes);
+    cub::DeviceSegmentedReduce::Sum(temp_storage.data().get(), temp_bytes, contrib_it,
+                                    d_map.data(), p_cache->Groups(), d_gptr.data(),
+                                    d_gptr.data() + 1, cuctx->Stream());
+  }
+
+  {
+    // Step 3: normalize every group, apply its weight and sum the (score, weight) pairs.
+    auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
+    if (!d_weight.Empty()) {
+      CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
+    }
+    auto pair_it = dh::MakeTransformIterator<PackedReduceResult>(
+        thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t g) {
+          auto g_begin = d_gptr[g];
+          auto g_end = d_gptr[g + 1];
+          auto g_n_rel = d_n_rel.subspan(g_begin, g_end - g_begin);
+          if (g_n_rel.empty() || !(g_n_rel.back() > 0.0)) {
+            // No hits in this group, use the constant selected by `minus`.
+            return PackedReduceResult{minus ? 0.0 : 1.0, static_cast<double>(d_weight[g])};
+          }
+          return PackedReduceResult{d_map[g] * d_weight[g] / std::min(g_n_rel.back(), top_k),
+                                    static_cast<double>(d_weight[g])};
+        });
+    return thrust::reduce(cuctx->CTP(), pair_it, pair_it + d_map.size(),
+                          PackedReduceResult{0.0, 0.0});
+  }
+}
+}  // namespace cuda_impl
+}  // namespace xgboost::metric
--- a/src/metric/rank_metric.h
+++ b/src/metric/rank_metric.h
@@ -0,0 +1,44 @@
+#ifndef XGBOOST_METRIC_RANK_METRIC_H_
+#define XGBOOST_METRIC_RANK_METRIC_H_
+/**
+ * Copyright 2023 by XGBoost Contributors
+ */
+#include <memory>                        // for shared_ptr
+
+#include "../common/common.h"            // for AssertGPUSupport
+#include "../common/ranking_utils.h"     // for NDCGCache, MAPCache
+#include "metric_common.h"               // for PackedReduceResult
+#include "xgboost/context.h"             // for Context
+#include "xgboost/data.h"                // for MetaInfo
+#include "xgboost/host_device_vector.h"  // for HostDeviceVector
+
+namespace xgboost {
+namespace metric {
+namespace cuda_impl {
+/**
+ * \brief Device implementation of the NDCG metric.
+ *
+ * \return The packed result whose Residue() and Weights() are passed to the final
+ *         normalization by the caller.
+ */
+PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
+                             HostDeviceVector<float> const &predt, bool minus,
+                             std::shared_ptr<ltr::NDCGCache> p_cache);
+
+/**
+ * \brief Device implementation of the MAP metric.
+ *
+ * \return The packed result whose Residue() and Weights() are passed to the final
+ *         normalization by the caller.
+ */
+PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
+                            HostDeviceVector<float> const &predt, bool minus,
+                            std::shared_ptr<ltr::MAPCache> p_cache);
+
+// Stubs for builds without any device backend. A HIP build provides the real definitions
+// just like a CUDA build, so the stubs must not be emitted for it either. Otherwise they
+// would clash with the device definitions.
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
+inline PackedReduceResult NDCGScore(Context const *, MetaInfo const &,
+                                    HostDeviceVector<float> const &, bool,
+                                    std::shared_ptr<ltr::NDCGCache>) {
+  common::AssertGPUSupport();
+  return {};
+}
+
+inline PackedReduceResult MAPScore(Context const *, MetaInfo const &,
+                                   HostDeviceVector<float> const &, bool,
+                                   std::shared_ptr<ltr::MAPCache>) {
+  common::AssertGPUSupport();
+  return {};
+}
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
+}  // namespace cuda_impl
+}  // namespace metric
+}  // namespace xgboost
+#endif  // XGBOOST_METRIC_RANK_METRIC_H_