From 8d1098a983e36479a7b3dc354425154a9ff35ea9 Mon Sep 17 00:00:00 2001
From: Xin Yin
Date: Sat, 4 May 2019 03:53:05 -0400
Subject: [PATCH] In AUC and AUCPR metrics, detect whether weights are per-instance or per-group (#4216)

* In AUC and AUCPR metrics, detect whether weights are per-instance or per-group

* Fix C++ style check

* Add a test for weighted AUC
---
Reviewer notes (this section is ignored by `git am`):

 - What the patch does: EvalAuc::Eval and EvalAucPR::Eval become private
   templates parameterised on a WeightPolicy. The public Eval override picks
   PerGroupWeightPolicy when `info.group_ptr_` is non-empty and
   `info.weights_.Size() != info.num_row_`, and PerInstanceWeightPolicy
   otherwise.
 - PerGroupWeightPolicy looks weights up by group index; the
   PerInstanceWeightPolicy looks them up by row index (directly, or through
   `rec[record_id].second` for sorted records).
 - NOTE(review): this copy of the patch had lost its line breaks and every
   angle-bracketed span. Template argument lists (for example
   `HostDeviceVector<bst_float>`, `std::vector<unsigned>`,
   `Eval<PerGroupWeightPolicy>`) were restored from how the values are used
   in the surrounding hunks, and line structure was restored to agree with
   the hunk headers and the diffstat. Whitespace inside context lines is
   reconstructed -- confirm against the upstream commit before applying.
 - NOTE(review): the author's e-mail address is missing from the From: line
   and has not been reconstructed.

 src/metric/rank_metric.cc    | 120 +++++++++++++++++++++++++++++++----
 tests/python/test_ranking.py |  38 +++++++++++
 2 files changed, 144 insertions(+), 14 deletions(-)

diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index 0f3d77936..6e4832109 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -14,6 +14,59 @@
 #include "../common/host_device_vector.h"
 #include "../common/math.h"
 
+namespace {
+
+/*
+ * Adapter to access instance weights.
+ *
+ *  - For ranking task, weights are per-group
+ *  - For binary classification task, weights are per-instance
+ *
+ * WeightPolicy::GetWeightOfInstance() :
+ *   get weight associated with an individual instance, using index into
+ *   `info.weights`
+ * WeightPolicy::GetWeightOfSortedRecord() :
+ *   get weight associated with an individual instance, using index into
+ *   sorted records `rec` (in ascending order of predicted labels). `rec` is
+ *   of type PredIndPairContainer
+ */
+
+using PredIndPairContainer
+  = std::vector<std::pair<xgboost::bst_float, unsigned>>;
+
+class PerInstanceWeightPolicy {
+ public:
+  inline static xgboost::bst_float
+  GetWeightOfInstance(const xgboost::MetaInfo& info,
+                      unsigned instance_id, unsigned group_id) {
+    return info.GetWeight(instance_id);
+  }
+  inline static xgboost::bst_float
+  GetWeightOfSortedRecord(const xgboost::MetaInfo& info,
+                          const PredIndPairContainer& rec,
+                          unsigned record_id, unsigned group_id) {
+    return info.GetWeight(rec[record_id].second);
+  }
+};
+
+class PerGroupWeightPolicy {
+ public:
+  inline static xgboost::bst_float
+  GetWeightOfInstance(const xgboost::MetaInfo& info,
+                      unsigned instance_id, unsigned group_id) {
+    return info.GetWeight(group_id);
+  }
+
+  inline static xgboost::bst_float
+  GetWeightOfSortedRecord(const xgboost::MetaInfo& info,
+                          const PredIndPairContainer& rec,
+                          unsigned record_id, unsigned group_id) {
+    return info.GetWeight(group_id);
+  }
+};
+
+}  // anonymous namespace
+
 namespace xgboost {
 namespace metric {
 // tag the this file, used by force static link later.
@@ -88,16 +141,18 @@ struct EvalAMS : public Metric {
 
 /*! \brief Area Under Curve, for both classification and rank */
 struct EvalAuc : public Metric {
+ private:
+  template <typename WeightPolicy>
   bst_float Eval(const HostDeviceVector<bst_float> &preds,
                  const MetaInfo &info,
-                 bool distributed) override {
+                 bool distributed) {
     CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
     CHECK_EQ(preds.Size(), info.labels_.Size())
         << "label size predict size not match";
     std::vector<unsigned> tgptr(2, 0);
     tgptr[1] = static_cast<unsigned>(info.labels_.Size());
 
-    const std::vector<unsigned> &gptr = info.group_ptr_.size() == 0 ? tgptr : info.group_ptr_;
+    const std::vector<unsigned> &gptr = info.group_ptr_.empty() ? tgptr : info.group_ptr_;
     CHECK_EQ(gptr.back(), info.labels_.Size())
         << "EvalAuc: group structure must match number of prediction";
     const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
@@ -108,9 +163,9 @@ struct EvalAuc : public Metric {
     std::vector<std::pair<bst_float, unsigned>> rec;
     const auto& labels = info.labels_.HostVector();
     const std::vector<bst_float>& h_preds = preds.HostVector();
-    for (bst_omp_uint k = 0; k < ngroup; ++k) {
+    for (bst_omp_uint group_id = 0; group_id < ngroup; ++group_id) {
       rec.clear();
-      for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
+      for (unsigned j = gptr[group_id]; j < gptr[group_id + 1]; ++j) {
         rec.emplace_back(h_preds[j], j);
       }
       XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
@@ -118,7 +173,8 @@ struct EvalAuc : public Metric {
       double sum_pospair = 0.0;
       double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
       for (size_t j = 0; j < rec.size(); ++j) {
-        const bst_float wt = info.GetWeight(rec[j].second);
+        const bst_float wt
+          = WeightPolicy::GetWeightOfSortedRecord(info, rec, j, group_id);
         const bst_float ctr = labels[rec[j].second];
         // keep bucketing predictions in same bucket
         if (j != 0 && rec[j].first != rec[j - 1].first) {
@@ -154,6 +210,21 @@ struct EvalAuc : public Metric {
       return static_cast<bst_float>(sum_auc) / ngroup;
     }
   }
+
+ public:
+  bst_float Eval(const HostDeviceVector<bst_float> &preds,
+                 const MetaInfo &info,
+                 bool distributed) override {
+    // For ranking task, weights are per-group
+    // For binary classification task, weights are per-instance
+    const bool is_ranking_task =
+      !info.group_ptr_.empty() && info.weights_.Size() != info.num_row_;
+    if (is_ranking_task) {
+      return Eval<PerGroupWeightPolicy>(preds, info, distributed);
+    } else {
+      return Eval<PerInstanceWeightPolicy>(preds, info, distributed);
+    }
+  }
   const char* Name() const override {
     return "auc";
   }
@@ -370,9 +441,11 @@ struct EvalAucPR : public Metric {
   // implementation of AUC-PR for weighted data
   // translated from PRROC R Package
   // see https://doi.org/10.1371/journal.pone.0092209
-
-  bst_float Eval(const HostDeviceVector<bst_float> &preds, const MetaInfo &info,
-                 bool distributed) override {
+ private:
+  template <typename WeightPolicy>
+  bst_float Eval(const HostDeviceVector<bst_float> &preds,
+                 const MetaInfo &info,
+                 bool distributed) {
     CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
     CHECK_EQ(preds.Size(), info.labels_.Size())
         << "label size predict size not match";
@@ -391,13 +464,15 @@ struct EvalAucPR : public Metric {
     const auto& h_labels = info.labels_.HostVector();
     const std::vector<bst_float>& h_preds = preds.HostVector();
 
-    for (bst_omp_uint k = 0; k < ngroup; ++k) {
+    for (bst_omp_uint group_id = 0; group_id < ngroup; ++group_id) {
       double total_pos = 0.0;
       double total_neg = 0.0;
       rec.clear();
-      for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
-        total_pos += info.GetWeight(j) * h_labels[j];
-        total_neg += info.GetWeight(j) * (1.0f - h_labels[j]);
+      for (unsigned j = gptr[group_id]; j < gptr[group_id + 1]; ++j) {
+        const bst_float wt
+          = WeightPolicy::GetWeightOfInstance(info, j, group_id);
+        total_pos += wt * h_labels[j];
+        total_neg += wt * (1.0f - h_labels[j]);
         rec.emplace_back(h_preds[j], j);
       }
       XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
@@ -408,8 +483,10 @@ struct EvalAucPR : public Metric {
       // calculate AUC
       double tp = 0.0, prevtp = 0.0, fp = 0.0, prevfp = 0.0, h = 0.0, a = 0.0, b = 0.0;
       for (size_t j = 0; j < rec.size(); ++j) {
-        tp += info.GetWeight(rec[j].second) * h_labels[rec[j].second];
-        fp += info.GetWeight(rec[j].second) * (1.0f - h_labels[rec[j].second]);
+        const bst_float wt
+          = WeightPolicy::GetWeightOfSortedRecord(info, rec, j, group_id);
+        tp += wt * h_labels[rec[j].second];
+        fp += wt * (1.0f - h_labels[rec[j].second]);
         if ((j < rec.size() - 1 && rec[j].first != rec[j + 1].first) || j == rec.size() - 1) {
           if (tp == prevtp) {
             a = 1.0;
@@ -449,6 +526,21 @@ struct EvalAucPR : public Metric {
       return static_cast<bst_float>(sum_auc) / ngroup;
     }
   }
+
+ public:
+  bst_float Eval(const HostDeviceVector<bst_float> &preds,
+                 const MetaInfo &info,
+                 bool distributed) override {
+    // For ranking task, weights are per-group
+    // For binary classification task, weights are per-instance
+    const bool is_ranking_task =
+      !info.group_ptr_.empty() && info.weights_.Size() != info.num_row_;
+    if (is_ranking_task) {
+      return Eval<PerGroupWeightPolicy>(preds, info, distributed);
+    } else {
+      return Eval<PerInstanceWeightPolicy>(preds, info, distributed);
+    }
+  }
   const char *Name() const override { return "aucpr"; }
 };
 
diff --git a/tests/python/test_ranking.py b/tests/python/test_ranking.py
index 50c1dbfbc..d42d67f14 100644
--- a/tests/python/test_ranking.py
+++ b/tests/python/test_ranking.py
@@ -25,3 +25,41 @@ def test_ranking_with_unweighted_data():
     assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))
     auc_rec = evals_result['train']['aucpr']
     assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))
+
+def test_ranking_with_weighted_data():
+    Xrow = np.array([1, 2, 6, 8, 11, 14, 16, 17])
+    Xcol = np.array([0, 0, 1, 1, 2, 2, 3, 3])
+    X = csr_matrix((np.ones(shape=8), (Xrow, Xcol)), shape=(20, 4))
+    y = np.array([0.0, 1.0, 1.0, 0.0, 0.0,
+                  0.0, 1.0, 0.0, 1.0, 0.0,
+                  0.0, 1.0, 0.0, 0.0, 1.0,
+                  0.0, 1.0, 1.0, 0.0, 0.0])
+    weights = np.array([1.0, 2.0, 3.0, 4.0])
+
+    group = np.array([5, 5, 5, 5], dtype=np.uint)
+    dtrain = xgboost.DMatrix(X, label=y, weight=weights)
+    dtrain.set_group(group)
+
+    params = {'eta': 1, 'tree_method': 'exact',
+              'objective': 'rank:pairwise', 'eval_metric': ['auc', 'aucpr'],
+              'max_depth': 1}
+    evals_result = {}
+    bst = xgboost.train(params, dtrain, 10, evals=[(dtrain, 'train')],
+                        evals_result=evals_result)
+    auc_rec = evals_result['train']['auc']
+    assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))
+    auc_rec = evals_result['train']['aucpr']
+    assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))
+
+    for i in range(1, 11):
+        pred = bst.predict(dtrain, ntree_limit=i)
+        # is_sorted[i]: is i-th group correctly sorted by the ranking predictor?
+        is_sorted = []
+        for k in range(0, 20, 5):
+            ind = np.argsort(-pred[k:k+5])
+            z = y[ind+k]
+            is_sorted.append(all(i >= j for i, j in zip(z, z[1:])))
+        # Since we give weights 1, 2, 3, 4 to the four query groups,
+        # the ranking predictor will first try to correctly sort the last query group
+        # before correctly sorting other groups.
+        assert all(p <= q for p, q in zip(is_sorted, is_sorted[1:]))