diff --git a/src/common/error_msg.h b/src/common/error_msg.h new file mode 100644 index 000000000..48a2c92a4 --- /dev/null +++ b/src/common/error_msg.h @@ -0,0 +1,24 @@ +/** + * Copyright 2023 by XGBoost contributors + * + * \brief Common error message for various checks. + */ +#ifndef XGBOOST_COMMON_ERROR_MSG_H_ +#define XGBOOST_COMMON_ERROR_MSG_H_ + +#include "xgboost/string_view.h" // for StringView + +namespace xgboost::error { +constexpr StringView GroupWeight() { + return "Size of weight must equal to the number of query groups when ranking group is used."; +} + +constexpr StringView GroupSize() { + return "Invalid query group structure. The number of rows obtained from group doesn't equal to "; +} + +constexpr StringView LabelScoreSize() { + return "The size of label doesn't match the size of prediction."; +} +} // namespace xgboost::error +#endif // XGBOOST_COMMON_ERROR_MSG_H_ diff --git a/src/common/ranking_utils.cc b/src/common/ranking_utils.cc index f0b1c1a5e..8fad9a206 100644 --- a/src/common/ranking_utils.cc +++ b/src/common/ranking_utils.cc @@ -3,15 +3,28 @@ */ #include "ranking_utils.h" -#include // std::uint32_t -#include // std::ostringstream -#include // std::string,std::sscanf +#include // for copy_n, max, min, none_of, all_of +#include // for size_t +#include // for sscanf +#include // for exception +#include // for greater +#include // for reverse_iterator +#include // for char_traits, string -#include "xgboost/string_view.h" // StringView +#include "algorithm.h" // for ArgSort +#include "linalg_op.h" // for cbegin, cend +#include "optional_weight.h" // for MakeOptionalWeights +#include "threading_utils.h" // for ParallelFor +#include "xgboost/base.h" // for bst_group_t +#include "xgboost/context.h" // for Context +#include "xgboost/data.h" // for MetaInfo +#include "xgboost/linalg.h" // for All, TensorView, Range, Tensor, Vector +#include "xgboost/logging.h" // for Error, LogCheck_EQ, CHECK_EQ -namespace xgboost { -namespace ltr { -std::string 
MakeMetricName(StringView name, StringView param, std::uint32_t* topn, bool* minus) { +namespace xgboost::ltr { +DMLC_REGISTER_PARAMETER(LambdaRankParam); + +std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus) { std::string out_name; if (!param.empty()) { std::ostringstream os; @@ -30,5 +43,18 @@ std::string MakeMetricName(StringView name, StringView param, std::uint32_t* top } return out_name; } -} // namespace ltr -} // namespace xgboost + +std::string MakeMetricName(StringView name, position_t topn, bool minus) { + std::ostringstream ss; + if (topn == LambdaRankParam::NotSet()) { + ss << name; + } else { + ss << name << "@" << topn; + } + if (minus) { + ss << "-"; + } + std::string out_name = ss.str(); + return out_name; +} +} // namespace xgboost::ltr diff --git a/src/common/ranking_utils.h b/src/common/ranking_utils.h index 35ee36c21..631de4d70 100644 --- a/src/common/ranking_utils.h +++ b/src/common/ranking_utils.h @@ -3,17 +3,131 @@ */ #ifndef XGBOOST_COMMON_RANKING_UTILS_H_ #define XGBOOST_COMMON_RANKING_UTILS_H_ +#include // for min +#include // for log2, fabs, floor +#include // for size_t +#include // for uint32_t, uint8_t, int32_t +#include // for numeric_limits +#include // for char_traits, string +#include // for vector -#include // std::size_t -#include // std::uint32_t -#include // std::string +#include "./math.h" // for CloseTo +#include "dmlc/parameter.h" // for FieldEntry, DMLC_DECLARE_FIELD +#include "error_msg.h" // for GroupWeight, GroupSize +#include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t +#include "xgboost/context.h" // for Context +#include "xgboost/data.h" // for MetaInfo +#include "xgboost/host_device_vector.h" // for HostDeviceVector +#include "xgboost/linalg.h" // for Vector, VectorView, Tensor +#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK +#include "xgboost/parameter.h" // for XGBoostParameter +#include "xgboost/span.h" // for Span +#include 
"xgboost/string_view.h" // for StringView -#include "xgboost/string_view.h" // StringView - -namespace xgboost { -namespace ltr { +namespace xgboost::ltr { /** - * \brief Construct name for ranking metric given parameters. + * \brief Relevance degree + */ +using rel_degree_t = std::uint32_t; // NOLINT +/** + * \brief top-k position + */ +using position_t = std::uint32_t; // NOLINT + +enum class PairMethod : std::int32_t { + kTopK = 0, + kMean = 1, +}; +} // namespace xgboost::ltr + +DECLARE_FIELD_ENUM_CLASS(xgboost::ltr::PairMethod); + +namespace xgboost::ltr { +struct LambdaRankParam : public XGBoostParameter { + private: + static constexpr position_t DefaultK() { return 32; } + static constexpr position_t DefaultSamplePairs() { return 1; } + + protected: + // pairs + // should be accessed by getter for auto configuration. + // nolint so that we can keep the string name. + PairMethod lambdarank_pair_method{PairMethod::kMean}; // NOLINT + std::size_t lambdarank_num_pair_per_sample{NotSet()}; // NOLINT + + public: + static constexpr position_t NotSet() { return std::numeric_limits::max(); } + + // unbiased + bool lambdarank_unbiased{false}; + double lambdarank_bias_norm{2.0}; + // ndcg + bool ndcg_exp_gain{true}; + + bool operator==(LambdaRankParam const& that) const { + return lambdarank_pair_method == that.lambdarank_pair_method && + lambdarank_num_pair_per_sample == that.lambdarank_num_pair_per_sample && + lambdarank_unbiased == that.lambdarank_unbiased && + lambdarank_bias_norm == that.lambdarank_bias_norm && ndcg_exp_gain == that.ndcg_exp_gain; + } + bool operator!=(LambdaRankParam const& that) const { return !(*this == that); } + + [[nodiscard]] double Regularizer() const { return 1.0 / (1.0 + this->lambdarank_bias_norm); } + + /** + * \brief Get number of pairs for each sample + */ + [[nodiscard]] position_t NumPair() const { + if (lambdarank_num_pair_per_sample == NotSet()) { + switch (lambdarank_pair_method) { + case PairMethod::kMean: + return 
DefaultSamplePairs(); + case PairMethod::kTopK: + return DefaultK(); + } + } else { + return lambdarank_num_pair_per_sample; + } + LOG(FATAL) << "Unreachable."; + return 0; + } + + [[nodiscard]] bool HasTruncation() const { return lambdarank_pair_method == PairMethod::kTopK; } + + // Used for evaluation metric and cache initialization, iterate through top-k or the whole list + [[nodiscard]] auto TopK() const { + if (HasTruncation()) { + return NumPair(); + } else { + return NotSet(); + } + } + + DMLC_DECLARE_PARAMETER(LambdaRankParam) { + DMLC_DECLARE_FIELD(lambdarank_pair_method) + .set_default(PairMethod::kMean) + .add_enum("mean", PairMethod::kMean) + .add_enum("topk", PairMethod::kTopK) + .describe("Method for constructing pairs."); + DMLC_DECLARE_FIELD(lambdarank_num_pair_per_sample) + .set_default(NotSet()) + .set_lower_bound(1) + .describe("Number of pairs for each sample in the list."); + DMLC_DECLARE_FIELD(lambdarank_unbiased) + .set_default(false) + .describe("Unbiased lambda mart. Use IPW to debias click position"); + DMLC_DECLARE_FIELD(lambdarank_bias_norm) + .set_default(2.0) + .set_lower_bound(0.0) + .describe("Lp regularization for unbiased lambdarank."); + DMLC_DECLARE_FIELD(ndcg_exp_gain) + .set_default(true) + .describe("When set to true, the label gain is 2^rel - 1, otherwise it's rel."); + } +}; + +/** + * \brief Parse name for ranking metric given parameters. * * \param [in] name Null terminated string for metric name * \param [in] param Null terminated string for parameter like the `3-` in `ndcg@3-`. @@ -23,7 +137,11 @@ namespace ltr { * * \return The name of the metric. */ -std::string MakeMetricName(StringView name, StringView param, std::uint32_t* topn, bool* minus); -} // namespace ltr -} // namespace xgboost +std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus); + +/** + * \brief Construct the name of a ranking metric from its components, e.g. `ndcg@3-`.
+ */ +std::string MakeMetricName(StringView name, position_t topn, bool minus); +} // namespace xgboost::ltr #endif // XGBOOST_COMMON_RANKING_UTILS_H_ diff --git a/src/common/threading_utils.cuh b/src/common/threading_utils.cuh index c21d312d2..db5fe82f9 100644 --- a/src/common/threading_utils.cuh +++ b/src/common/threading_utils.cuh @@ -43,36 +43,33 @@ XGBOOST_DEVICE inline std::size_t DiscreteTrapezoidArea(std::size_t n, std::size * with h <= n */ template -inline size_t -SegmentedTrapezoidThreads(xgboost::common::Span group_ptr, - xgboost::common::Span out_group_threads_ptr, - size_t h) { +std::size_t SegmentedTrapezoidThreads(xgboost::common::Span group_ptr, + xgboost::common::Span out_group_threads_ptr, + std::size_t h) { CHECK_GE(group_ptr.size(), 1); CHECK_EQ(group_ptr.size(), out_group_threads_ptr.size()); - dh::LaunchN( - group_ptr.size(), [=] XGBOOST_DEVICE(size_t idx) { - if (idx == 0) { - out_group_threads_ptr[0] = 0; - return; - } + dh::LaunchN(group_ptr.size(), [=] XGBOOST_DEVICE(std::size_t idx) { + if (idx == 0) { + out_group_threads_ptr[0] = 0; + return; + } - size_t cnt = static_cast(group_ptr[idx] - group_ptr[idx - 1]); - out_group_threads_ptr[idx] = DiscreteTrapezoidArea(cnt, h); - }); + std::size_t cnt = static_cast(group_ptr[idx] - group_ptr[idx - 1]); + out_group_threads_ptr[idx] = DiscreteTrapezoidArea(cnt, h); + }); dh::InclusiveSum(out_group_threads_ptr.data(), out_group_threads_ptr.data(), out_group_threads_ptr.size()); - size_t total = 0; - dh::safe_cuda(cudaMemcpy( - &total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1, - sizeof(total), cudaMemcpyDeviceToHost)); + std::size_t total = 0; + dh::safe_cuda(cudaMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1, + sizeof(total), cudaMemcpyDeviceToHost)); return total; } /** * Called inside kernel to obtain coordinate from trapezoid grid. 
*/ -XGBOOST_DEVICE inline void UnravelTrapeziodIdx(size_t i_idx, size_t n, - size_t *out_i, size_t *out_j) { +XGBOOST_DEVICE inline void UnravelTrapeziodIdx(std::size_t i_idx, std::size_t n, std::size_t *out_i, + std::size_t *out_j) { auto &i = *out_i; auto &j = *out_j; double idx = static_cast(i_idx); diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc index d39c7302a..69e6e24cd 100644 --- a/src/metric/rank_metric.cc +++ b/src/metric/rank_metric.cc @@ -234,7 +234,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig { protected: explicit EvalRank(const char* name, const char* param) { - this->name = ltr::MakeMetricName(name, param, &topn, &minus); + this->name = ltr::ParseMetricName(name, param, &topn, &minus); } virtual double EvalGroup(PredIndPairContainer *recptr) const = 0; diff --git a/tests/cpp/common/test_ranking_utils.cc b/tests/cpp/common/test_ranking_utils.cc index ea72edd9f..c73cffed7 100644 --- a/tests/cpp/common/test_ranking_utils.cc +++ b/tests/cpp/common/test_ranking_utils.cc @@ -1,38 +1,69 @@ /** * Copyright 2023 by XGBoost Contributors */ -#include +#include // for Test, AssertionResult, Message, TestPartR... +#include // for ASSERT_NEAR, ASSERT_T... +#include // for Args +#include // for Context +#include // for StringView -#include // std::uint32_t +#include // for uint32_t +#include // for pair -#include "../../../src/common/ranking_utils.h" +#include "../../../src/common/ranking_utils.h" // for LambdaRankParam, ParseMetricName, MakeMet... -namespace xgboost { -namespace ltr { -TEST(RankingUtils, MakeMetricName) { +namespace xgboost::ltr { +TEST(RankingUtils, LambdaRankParam) { + // make sure no memory is shared in dmlc parameter. 
+ LambdaRankParam p0; + p0.UpdateAllowUnknown(Args{{"lambdarank_num_pair_per_sample", "3"}}); + ASSERT_EQ(p0.NumPair(), 3); + + LambdaRankParam p1; + p1.UpdateAllowUnknown(Args{{"lambdarank_num_pair_per_sample", "8"}}); + + ASSERT_EQ(p0.NumPair(), 3); + ASSERT_EQ(p1.NumPair(), 8); + + p0.UpdateAllowUnknown(Args{{"lambdarank_num_pair_per_sample", "17"}}); + ASSERT_EQ(p0.NumPair(), 17); + ASSERT_EQ(p1.NumPair(), 8); +} + +TEST(RankingUtils, ParseMetricName) { std::uint32_t topn{32}; bool minus{false}; - auto name = MakeMetricName("ndcg", "3-", &topn, &minus); + auto name = ParseMetricName("ndcg", "3-", &topn, &minus); ASSERT_EQ(name, "ndcg@3-"); ASSERT_EQ(topn, 3); ASSERT_TRUE(minus); - name = MakeMetricName("ndcg", "6", &topn, &minus); + name = ParseMetricName("ndcg", "6", &topn, &minus); ASSERT_EQ(topn, 6); ASSERT_TRUE(minus); // unchanged minus = false; - name = MakeMetricName("ndcg", "-", &topn, &minus); + name = ParseMetricName("ndcg", "-", &topn, &minus); ASSERT_EQ(topn, 6); // unchanged ASSERT_TRUE(minus); - name = MakeMetricName("ndcg", nullptr, &topn, &minus); + name = ParseMetricName("ndcg", nullptr, &topn, &minus); ASSERT_EQ(topn, 6); // unchanged ASSERT_TRUE(minus); // unchanged - name = MakeMetricName("ndcg", StringView{}, &topn, &minus); + name = ParseMetricName("ndcg", StringView{}, &topn, &minus); ASSERT_EQ(topn, 6); // unchanged ASSERT_TRUE(minus); // unchanged } -} // namespace ltr -} // namespace xgboost + +TEST(RankingUtils, MakeMetricName) { + auto name = MakeMetricName("map", LambdaRankParam::NotSet(), true); + ASSERT_EQ(name, "map-"); + name = MakeMetricName("map", LambdaRankParam::NotSet(), false); + ASSERT_EQ(name, "map"); + name = MakeMetricName("map", 2, true); + ASSERT_EQ(name, "map@2-"); + name = MakeMetricName("map", 2, false); + ASSERT_EQ(name, "map@2"); +} +} // namespace xgboost::ltr