Implement NDCG cache. (#8893)

2023-03-13 22:16:31 +08:00
parent 9bade7203a
commit 8be6095ece
7 changed files with 798 additions and 11 deletions
--- a/src/common/ranking_utils.h
+++ b/src/common/ranking_utils.h
@@ -11,7 +11,6 @@
 #include <string>                        // for char_traits, string
 #include <vector>                        // for vector

-#include "./math.h"                      // for CloseTo
 #include "dmlc/parameter.h"              // for FieldEntry, DMLC_DECLARE_FIELD
 #include "error_msg.h"                   // for GroupWeight, GroupSize
 #include "xgboost/base.h"                // for XGBOOST_DEVICE, bst_group_t
@@ -19,7 +18,7 @@
 #include "xgboost/data.h"                // for MetaInfo
 #include "xgboost/host_device_vector.h"  // for HostDeviceVector
 #include "xgboost/linalg.h"              // for Vector, VectorView, Tensor
-#include "xgboost/logging.h"             // for LogCheck_EQ, CHECK_EQ, CHECK
+#include "xgboost/logging.h"             // for CHECK_EQ, CHECK
 #include "xgboost/parameter.h"           // for XGBoostParameter
 #include "xgboost/span.h"                // for Span
 #include "xgboost/string_view.h"         // for StringView
@@ -34,6 +33,25 @@ using rel_degree_t = std::uint32_t;  // NOLINT
 */
 using position_t = std::uint32_t;  // NOLINT

+/**
+ * \brief Largest relevance degree supported for NDCG.
+ *
+ * Evaluates to the number of bits in rel_degree_t (counting 8 bits per byte) minus one.
+ */
+constexpr std::size_t MaxRel() {
+  constexpr std::size_t kBitsPerByte = 8;
+  return sizeof(rel_degree_t) * kBitsPerByte - 1;
+}
+static_assert(MaxRel() == 31);
+
+/**
+ * \brief Exponential DCG gain of a relevance degree, i.e. 2^label - 1.
+ *
+ * \param label Relevance degree. The shift operates on an `unsigned int` literal, hence
+ *              label has to be smaller than the bit width of that type.
+ *              NOTE(review): presumably guaranteed by validating labels against MaxRel()
+ *              before calling - confirm at the call sites.
+ */
+XGBOOST_DEVICE inline double CalcDCGGain(rel_degree_t label) {
+  auto const gain = (1u << label) - 1;
+  return static_cast<double>(gain);
+}
+
+/**
+ * \brief DCG discount for position idx, i.e. 1 / log2(idx + 2).
+ *
+ * idx = 0 yields a discount of exactly 1.
+ */
+XGBOOST_DEVICE inline double CalcDCGDiscount(std::size_t idx) {
+  double const shifted_pos = static_cast<double>(idx) + 2.0;
+  return 1.0 / std::log2(shifted_pos);
+}
+
+/**
+ * \brief Reciprocal of the ideal DCG.
+ *
+ * \return 0 when idcg compares equal to 0 (handle irrelevant document), 1 / idcg otherwise.
+ */
+XGBOOST_DEVICE inline double CalcInvIDCG(double idcg) {
+  if (idcg == 0.0) {
+    // Avoid the division by zero, the result is defined as 0 instead.
+    return 0.0;
+  }
+  return 1.0 / idcg;
+}
+
 enum class PairMethod : std::int32_t {
  kTopK = 0,
  kMean = 1,
@@ -115,7 +133,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
        .describe("Number of pairs for each sample in the list.");
    DMLC_DECLARE_FIELD(lambdarank_unbiased)
        .set_default(false)
-        .describe("Unbiased lambda mart. Use IPW to debias click position");
+        .describe("Unbiased lambda mart. Use extended IPW to debias click position");
    DMLC_DECLARE_FIELD(lambdarank_bias_norm)
        .set_default(2.0)
        .set_lower_bound(0.0)
@@ -126,6 +144,220 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
  }
 };

+/**
+ * \brief Common cached items for ranking tasks.
+ *
+ * Holds a copy of the ranking parameter, the group offsets and a set of lazily allocated
+ * buffers. The constructor dispatches to a CPU or CUDA initialisation routine based on the
+ * context. Several accessors allocate their buffer on first use.
+ */
+class RankingCache {
+ private:
+  // Backend specific initialisation, invoked by the constructor. Only declared here, the
+  // definitions are not part of this header.
+  void InitOnCPU(Context const* ctx, MetaInfo const& info);
+  void InitOnCUDA(Context const* ctx, MetaInfo const& info);
+  // Cached parameter
+  LambdaRankParam param_;
+  // offset to data groups.
+  HostDeviceVector<bst_group_t> group_ptr_;
+  // store the sorted index of prediction.
+  HostDeviceVector<std::size_t> sorted_idx_cache_;
+  // Maximum size of group
+  std::size_t max_group_size_{0};
+  // Normalization for weight
+  double weight_norm_{1.0};
+  /**
+   * CUDA cache
+   */
+  // offset to threads assigned to each group for gradient calculation
+  HostDeviceVector<std::size_t> threads_group_ptr_;
+  // Sorted index of label for finding buckets.
+  HostDeviceVector<std::size_t> y_sorted_idx_cache_;
+  // Cached labels sorted by the model
+  HostDeviceVector<float> y_ranked_by_model_;
+  // store rounding factor for objective for each group
+  linalg::Vector<GradientPair> roundings_;
+  // rounding factor for cost
+  HostDeviceVector<double> cost_rounding_;
+  // temporary storage for creating rounding factors. Stored as byte to avoid having cuda
+  // data structure in here.
+  HostDeviceVector<std::uint8_t> max_lambdas_;
+  // total number of cuda threads used for gradient calculation
+  std::size_t n_cuda_threads_{0};
+
+  // Create model rank list on GPU
+  common::Span<std::size_t const> MakeRankOnCUDA(Context const* ctx,
+                                                 common::Span<float const> predt);
+  // Create model rank list on CPU
+  common::Span<std::size_t const> MakeRankOnCPU(Context const* ctx,
+                                                common::Span<float const> predt);
+
+ protected:
+  // Size of the largest group, exposed to derived caches only.
+  [[nodiscard]] std::size_t MaxGroupSize() const { return max_group_size_; }
+
+ public:
+  /**
+   * \brief Validate the inputs and initialise the cache for the device given by ctx.
+   *
+   * \param ctx  Context used to choose between the CPU and the CUDA initialisation.
+   * \param info Meta info. When its group pointer is not empty, the last offset must be
+   *             equal to the number of labels. When weights are present, there must be
+   *             exactly one weight per group.
+   * \param p    Ranking parameter, must already be initialised. A copy is stored.
+   */
+  RankingCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p) : param_{p} {
+    CHECK(param_.GetInitialised());
+    if (!info.group_ptr_.empty()) {
+      CHECK_EQ(info.group_ptr_.back(), info.labels.Size())
+          << error::GroupSize() << "the size of label.";
+    }
+    if (ctx->IsCPU()) {
+      this->InitOnCPU(ctx, info);
+    } else {
+      this->InitOnCUDA(ctx, info);
+    }
+    // Checked after the initialisation since Groups() reads the cached group pointer.
+    if (!info.weights_.Empty()) {
+      CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight();
+    }
+  }
+  // Upper bound on the number of positions to track.
+  [[nodiscard]] std::size_t MaxPositionSize() const {
+    // Use truncation level as bound.
+    if (param_.HasTruncation()) {
+      return param_.NumPair();
+    }
+    // Hardcoded maximum size of positions to track. We don't need too many of them as the
+    // bias decreases exponentially.
+    return std::min(max_group_size_, static_cast<std::size_t>(32));
+  }
+  // Offsets to the data groups, returned as a host span for a CPU context and as a device
+  // span otherwise.
+  // Constructed as [1, n_samples] if group ptr is not supplied by the user
+  // NOTE(review): the construction itself lives in InitOnCPU/InitOnCUDA, which are not
+  // visible here. An offset array would be expected to start at 0, i.e. [0, n_samples] -
+  // confirm the wording above against the implementation.
+  common::Span<bst_group_t const> DataGroupPtr(Context const* ctx) const {
+    group_ptr_.SetDevice(ctx->gpu_id);
+    return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan();
+  }
+
+  // The cached copy of the ranking parameter.
+  [[nodiscard]] auto const& Param() const { return param_; }
+  // Number of groups, derived from the size of the group pointer minus one.
+  // NOTE(review): wraps around if group_ptr_ is empty; presumably the Init* routines
+  // always populate it - confirm.
+  [[nodiscard]] std::size_t Groups() const { return group_ptr_.Size() - 1; }
+  // Normalization for weight, 1.0 unless changed by the initialisation.
+  [[nodiscard]] double WeightNorm() const { return weight_norm_; }
+
+  // Create a rank list by model prediction
+  // The index buffer is allocated with predt.size() elements on the first call only.
+  // NOTE(review): it is not resized afterward, so this assumes the number of predictions
+  // stays the same across calls - confirm.
+  common::Span<std::size_t const> SortedIdx(Context const* ctx, common::Span<float const> predt) {
+    if (sorted_idx_cache_.Empty()) {
+      sorted_idx_cache_.SetDevice(ctx->gpu_id);
+      sorted_idx_cache_.Resize(predt.size());
+    }
+    if (ctx->IsCPU()) {
+      return this->MakeRankOnCPU(ctx, predt);
+    } else {
+      return this->MakeRankOnCUDA(ctx, predt);
+    }
+  }
+  // The function simply returns a uninitialized buffer as this is only used by the
+  // objective for creating pairs.
+  // CUDA only. The buffer is allocated with n_samples elements on first use.
+  common::Span<std::size_t> SortedIdxY(Context const* ctx, std::size_t n_samples) {
+    CHECK(ctx->IsCUDA());
+    if (y_sorted_idx_cache_.Empty()) {
+      y_sorted_idx_cache_.SetDevice(ctx->gpu_id);
+      y_sorted_idx_cache_.Resize(n_samples);
+    }
+    return y_sorted_idx_cache_.DeviceSpan();
+  }
+  // Device buffer for the labels ranked by the model. CUDA only. Allocated with n_samples
+  // elements on first use, the content is not filled in by this function.
+  common::Span<float> RankedY(Context const* ctx, std::size_t n_samples) {
+    CHECK(ctx->IsCUDA());
+    if (y_ranked_by_model_.Empty()) {
+      y_ranked_by_model_.SetDevice(ctx->gpu_id);
+      y_ranked_by_model_.Resize(n_samples);
+    }
+    return y_ranked_by_model_.DeviceSpan();
+  }
+
+  // CUDA cache getters, the cache is shared between metric and objective, some of these
+  // fields are lazy initialized to avoid unnecessary allocation.
+  // The thread offsets are not created lazily here: the getter fails if they are empty.
+  [[nodiscard]] common::Span<std::size_t const> CUDAThreadsGroupPtr() const {
+    CHECK(!threads_group_ptr_.Empty());
+    return threads_group_ptr_.ConstDeviceSpan();
+  }
+  // Total number of cuda threads used for gradient calculation.
+  [[nodiscard]] std::size_t CUDAThreads() const { return n_cuda_threads_; }
+
+  // Rounding factor of the objective, one entry per group. Allocated on first use.
+  linalg::VectorView<GradientPair> CUDARounding(Context const* ctx) {
+    if (roundings_.Size() == 0) {
+      roundings_.SetDevice(ctx->gpu_id);
+      roundings_.Reshape(Groups());
+    }
+    return roundings_.View(ctx->gpu_id);
+  }
+  // Rounding factor of the cost, a single element buffer allocated on first use.
+  common::Span<double> CUDACostRounding(Context const* ctx) {
+    if (cost_rounding_.Size() == 0) {
+      cost_rounding_.SetDevice(ctx->gpu_id);
+      cost_rounding_.Resize(1);
+    }
+    return cost_rounding_.DeviceSpan();
+  }
+  // Temporary storage for n elements of Type, backed by the byte buffer max_lambdas_.
+  // The buffer is resized whenever the requested number of bytes differs from its current
+  // size, and the device pointer is reinterpreted as a pointer to Type.
+  template <typename Type>
+  common::Span<Type> MaxLambdas(Context const* ctx, std::size_t n) {
+    max_lambdas_.SetDevice(ctx->gpu_id);
+    std::size_t bytes = n * sizeof(Type);
+    if (bytes != max_lambdas_.Size()) {
+      max_lambdas_.Resize(bytes);
+    }
+    return common::Span<Type>{reinterpret_cast<Type*>(max_lambdas_.DevicePointer()), n};
+  }
+};
+
+/**
+ * \brief Ranking cache extended with the items required by NDCG: the discounts, the
+ *        inverse IDCG and an intermediate DCG buffer.
+ */
+class NDCGCache : public RankingCache {
+  // Discount used by NDCG.
+  HostDeviceVector<double> discounts_;
+  // Inverse of IDCG, 1.0 / IDCG.
+  linalg::Vector<double> inv_idcg_;
+  /**
+   * CUDA cache
+   */
+  // Intermediate result of the DCG calculation for the metric.
+  linalg::Vector<double> dcg_;
+
+ public:
+  // Backend specific initialisation, only declared in this header.
+  void InitOnCPU(Context const* ctx, MetaInfo const& info);
+  void InitOnCUDA(Context const* ctx, MetaInfo const& info);
+
+  /**
+   * \brief Construct the base cache first, then run the NDCG initialisation for the
+   *        backend selected by ctx.
+   */
+  NDCGCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
+      : RankingCache{ctx, info, p} {
+    bool const use_cuda = !ctx->IsCPU();
+    if (use_cuda) {
+      this->InitOnCUDA(ctx, info);
+    } else {
+      this->InitOnCPU(ctx, info);
+    }
+  }
+
+  // View of the inverse IDCG for the device of ctx.
+  linalg::VectorView<double const> InvIDCG(Context const* ctx) const {
+    auto const device = ctx->gpu_id;
+    return inv_idcg_.View(device);
+  }
+  // Discounts, as a host span for a CPU context and as a device span otherwise.
+  common::Span<double const> Discount(Context const* ctx) const {
+    if (ctx->IsCPU()) {
+      return discounts_.ConstHostSpan();
+    }
+    return discounts_.ConstDeviceSpan();
+  }
+  // Buffer for the intermediate DCG with one entry per group, allocated on first use.
+  linalg::VectorView<double> Dcg(Context const* ctx) {
+    bool const needs_alloc = (dcg_.Size() == 0);
+    if (needs_alloc) {
+      dcg_.SetDevice(ctx->gpu_id);
+      dcg_.Reshape(this->Groups());
+    }
+    return dcg_.View(ctx->gpu_id);
+  }
+};
+
+/**
+ * \brief Validate label for NDCG
+ *
+ * Labels are only checked when the exponential gain (`ndcg_exp_gain`) is used. In that
+ * case every label must be a non-negative integer that is not greater than MaxRel(). The
+ * function fails through CHECK otherwise.
+ *
+ * \tparam NoneOf Implementation of std::none_of. Specified as a parameter to reuse the
+ *                check for both CPU and GPU.
+ *
+ * \param p       Ranking parameter, only `ndcg_exp_gain` is read.
+ * \param labels  Labels to be validated.
+ * \param none_of Callable invoked as none_of(first, last, predicate) on the label range.
+ */
+template <typename NoneOf>
+void CheckNDCGLabels(ltr::LambdaRankParam const& p, linalg::VectorView<float const> labels,
+                     NoneOf none_of) {
+  auto d_labels = labels.Values();
+  if (p.ndcg_exp_gain) {
+    auto label_is_integer =
+        none_of(d_labels.data(), d_labels.data() + d_labels.size(), [] XGBOOST_DEVICE(float v) {
+          auto l = std::floor(v);
+          return std::fabs(l - v) > kRtEps || v < 0.0f;
+        });
+    CHECK(label_is_integer)
+        << "When using relevance degree as target, label must be either 0 or positive integer.";
+
+    // Compare in floating point. Converting a float that does not fit into rel_degree_t
+    // (e.g. 2^32) to an unsigned integer is undefined behaviour and can wrap around to a
+    // small value, which would let an out-of-range label pass the validation.
+    auto label_is_valid =
+        none_of(d_labels.data(), d_labels.data() + d_labels.size(),
+                [] XGBOOST_DEVICE(float v) { return v > static_cast<float>(MaxRel()); });
+    CHECK(label_is_valid) << "Relevance degress must be lesser than or equal to " << MaxRel()
+                          << " when the exponential NDCG gain function is used. "
+                          << "Set `ndcg_exp_gain` to false to use custom DCG gain.";
+  }
+}
+
 /**
 * \brief Parse name for ranking metric given parameters.
 *