From 5a472145de531692916c2ba937c444fa05bebf0b Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 17 Aug 2014 20:32:02 -0700 Subject: [PATCH] check in rank loss --- src/learner/evaluation-inl.hpp | 4 +- src/learner/objective-inl.hpp | 294 +++++++++++++++++++++++++++++++++ src/learner/objective.h | 3 + 3 files changed, 300 insertions(+), 1 deletion(-) diff --git a/src/learner/evaluation-inl.hpp b/src/learner/evaluation-inl.hpp index 184197d45..43fe48726 100644 --- a/src/learner/evaluation-inl.hpp +++ b/src/learner/evaluation-inl.hpp @@ -216,7 +216,9 @@ struct EvalRankList : public IEvaluator { const MetaInfo &info) const { utils::Check(preds.size() == info.labels.size(), "label size predict size not match"); - const std::vector &gptr = info.group_ptr; + // quick consistency when group is not available + std::vector tgptr(2, 0); tgptr[1] = preds.size(); + const std::vector &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr; utils::Assert(gptr.size() != 0, "must specify group when constructing rank file"); utils::Assert(gptr.back() == preds.size(), "EvalRanklist: group structure must match number of prediction"); diff --git a/src/learner/objective-inl.hpp b/src/learner/objective-inl.hpp index 5f23e3b00..137349975 100644 --- a/src/learner/objective-inl.hpp +++ b/src/learner/objective-inl.hpp @@ -7,9 +7,13 @@ */ #include #include +#include +#include #include "../data.h" #include "./objective.h" #include "./helper_utils.h" +#include "../utils/random.h" +#include "../utils/omp.h" namespace xgboost { namespace learner { @@ -223,6 +227,296 @@ class SoftmaxMultiClassObj : public IObjFunction { int output_prob; }; +/*! \brief objective for lambda rank */ +class LambdaRankObj : public IObjFunction { + public: + LambdaRankObj(void) { + loss.loss_type = LossType::kLogisticRaw; + fix_list_weight = 0.0f; + num_pairsample = 1; + } + virtual ~LambdaRankObj(void) {} + virtual void SetParam(const char *name, const char *val) { + if (!strcmp( "loss_type", name )) loss.loss_type = atoi(val); + if (!strcmp( "fix_list_weight", name)) fix_list_weight = static_cast(atof(val)); + if (!strcmp( "num_pairsample", name)) num_pairsample = atoi(val); + } + virtual void GetGradient(const std::vector& preds, + const MetaInfo &info, + int iter, + std::vector *out_gpair) { + utils::Assert(preds.size() == info.labels.size(), "label size predict size not match"); + std::vector &gpair = *out_gpair; + gpair.resize(preds.size()); + // quick consistency when group is not available + std::vector tgptr(2, 0); tgptr[1] = preds.size(); + const std::vector &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr; + utils::Check(gptr.size() != 0 && gptr.back() == preds.size(), + "group structure not consistent with #rows"); + const unsigned ngroup = static_cast(gptr.size() - 1); + + #pragma omp parallel + { + // parall construct, declare random number generator here, so that each + // thread use its own random number generator, seed by thread id and current iteration + random::Random rnd; rnd.Seed(iter* 1111 + omp_get_thread_num()); + std::vector pairs; + std::vector lst; + std::vector< std::pair > rec; + + #pragma omp for schedule(static) + for (unsigned k = 0; k < ngroup; ++k) { + lst.clear(); pairs.clear(); + for (unsigned j = gptr[k]; j < gptr[k+1]; ++j) { + lst.push_back(ListEntry(preds[j], info.labels[j], j)); + gpair[j] = bst_gpair(0.0f, 0.0f); + } + std::sort(lst.begin(), lst.end(), ListEntry::CmpPred); + rec.resize(lst.size()); + for (unsigned i = 0; i < lst.size(); ++i) { + rec[i] = std::make_pair(lst[i].label, i); + } + std::sort(rec.begin(), rec.end(), CmpFirst); + // enumerate buckets with same label, for each item in the lst, grab another sample randomly + for (unsigned i = 0; i < rec.size(); ) { + unsigned j = i + 1; + while (j < rec.size() && rec[j].first == rec[i].first) ++j; + // bucket in [i,j), get a sample outside bucket + unsigned nleft = i, nright = rec.size() - j; + if (nleft + nright != 0) { + int nsample = num_pairsample; + while (nsample --) { + for (unsigned pid = i; pid < j; ++pid) { + unsigned ridx = static_cast(rnd.RandDouble() * (nleft+nright)); + if (ridx < nleft) { + pairs.push_back(LambdaPair(rec[ridx].second, rec[pid].second)); + } else { + pairs.push_back(LambdaPair(rec[pid].second, rec[ridx+j-i].second)); + } + } + } + } + i = j; + } + // get lambda weight for the pairs + this->GetLambdaWeight(lst, &pairs); + // rescale each gradient and hessian so that the lst have constant weighted + float scale = 1.0f / num_pairsample; + if (fix_list_weight != 0.0f) { + scale *= fix_list_weight / (gptr[k+1] - gptr[k]); + } + for (size_t i = 0; i < pairs.size(); ++i) { + const ListEntry &pos = lst[pairs[i].pos_index]; + const ListEntry &neg = lst[pairs[i].neg_index]; + const float w = pairs[i].weight * scale; + float p = loss.PredTransform(pos.pred - neg.pred); + float g = loss.FirstOrderGradient(p, 1.0f); + float h = loss.SecondOrderGradient(p, 1.0f); + // accumulate gradient and hessian in both pid, and nid + gpair[pos.rindex].grad += g * w; + gpair[pos.rindex].hess += 2.0f * h; + gpair[neg.rindex].grad -= g * w; + gpair[neg.rindex].hess += 2.0f * h; + } + } + } + } + virtual const char* DefaultEvalMetric(void) { + return "map"; + } + + protected: + /*! \brief helper information in a list */ + struct ListEntry { + /*! \brief the predict score we in the data */ + float pred; + /*! \brief the actual label of the entry */ + float label; + /*! \brief row index in the data matrix */ + unsigned rindex; + // constructor + ListEntry(float pred, float label, unsigned rindex) + : pred(pred), label(label), rindex(rindex) {} + // comparator by prediction + inline static bool CmpPred(const ListEntry &a, const ListEntry &b) { + return a.pred > b.pred; + } + // comparator by label + inline static bool CmpLabel(const ListEntry &a, const ListEntry &b) { + return a.label > b.label; + } + }; + /*! \brief a pair in the lambda rank */ + struct LambdaPair { + /*! \brief positive index: this is a position in the list */ + unsigned pos_index; + /*! \brief negative index: this is a position in the list */ + unsigned neg_index; + /*! \brief weight to be filled in */ + float weight; + // constructor + LambdaPair(unsigned pos_index, unsigned neg_index) + : pos_index(pos_index), neg_index(neg_index), weight(1.0f) {} + }; + /*! + * \brief get lambda weight for existing pairs + * \param list a list that is sorted by pred score + * \param io_pairs record of pairs, containing the pairs to fill in weights + */ + virtual void GetLambdaWeight(const std::vector &sorted_list, + std::vector *io_pairs) = 0; + + private: + // loss function + LossType loss; + // number of samples peformed for each instance + int num_pairsample; + // fix weight of each elements in list + float fix_list_weight; +}; + +class PairwiseRankObj: public LambdaRankObj{ + public: + virtual ~PairwiseRankObj(void){} + + protected: + virtual void GetLambdaWeight(const std::vector &sorted_list, + std::vector *io_pairs) {} +}; + +// beta version: NDCG lambda rank +class LambdaRankObjNDCG : public LambdaRankObj { + public: + virtual ~LambdaRankObjNDCG(void) {} + + protected: + virtual void GetLambdaWeight(const std::vector &sorted_list, + std::vector *io_pairs) { + std::vector &pairs = *io_pairs; + float IDCG; + { + std::vector labels(sorted_list.size()); + for (size_t i = 0; i < sorted_list.size(); ++i) { + labels[i] = sorted_list[i].label; + } + std::sort(labels.begin(), labels.end(), std::greater()); + IDCG = CalcDCG(labels); + } + + if (IDCG == 0.0) { + for (size_t i = 0; i < pairs.size(); ++i) { + pairs[i].weight = 0.0f; + } + } else { + IDCG = 1.0f / IDCG; + for (size_t i = 0; i < pairs.size(); ++i) { + unsigned pos_idx = pairs[i].pos_index; + unsigned neg_idx = pairs[i].neg_index; + float pos_loginv = 1.0f / logf(pos_idx+2.0f); + float neg_loginv = 1.0f / logf(neg_idx+2.0f); + int pos_label = static_cast(sorted_list[pos_idx].label); + int neg_label = static_cast(sorted_list[neg_idx].label); + float original = ((1< &labels) { + double sumdcg = 0.0; + for (size_t i = 0; i < labels.size(); ++i) { + const unsigned rel = labels[i]; + if (rel != 0) { + sumdcg += ((1<(sumdcg); + } +}; + +class LambdaRankObjMAP : public LambdaRankObj { + public: + virtual ~LambdaRankObjMAP(void) {} + + protected: + struct MAPStats { + /* \brief the accumulated precision */ + float ap_acc; + /* \brief the accumulated precision assuming a positive instance is missing */ + float ap_acc_miss; + /* \brief the accumulated precision assuming that one more positive instance is inserted ahead*/ + float ap_acc_add; + /* \brief the accumulated positive instance count */ + float hits; + MAPStats(void) {} + MAPStats(float ap_acc, float ap_acc_miss, float ap_acc_add, float hits) + : ap_acc(ap_acc), ap_acc_miss(ap_acc_miss), ap_acc_add(ap_acc_add), hits(hits) {} + }; + /* + * \brief Obtain the delta MAP if trying to switch the positions of instances in index1 or index2 + * in sorted triples + * \param sorted_list the list containing entry information + * \param index1,index2 the instances switched + * \param map_stats a vector containing the accumulated precisions for each position in a list + */ + inline float GetLambdaMAP(const std::vector &sorted_list, + int index1, int index2, + std::vector &map_stats){ + if (index1 == index2 || map_stats[map_stats.size() - 1].hits == 0) { + return 0.0f; + } + if (index1 > index2) std::swap(index1, index2); + float original = map_stats[index2].ap_acc; + if (index1 != 0) original -= map_stats[index1 - 1].ap_acc; + float changed = 0; + float label1 = sorted_list[index1].label > 0.0f ? 1.0f : 0.0f; + float label2 = sorted_list[index2].label > 0.0f ? 1.0f : 0.0f; + if (label1 == label2) { + return 0.0; + } else if (label1 < label2) { + changed += map_stats[index2 - 1].ap_acc_add - map_stats[index1].ap_acc_add; + changed += (map_stats[index1].hits + 1.0f) / (index1 + 1); + } else { + changed += map_stats[index2 - 1].ap_acc_miss - map_stats[index1].ap_acc_miss; + changed += map_stats[index2].hits / (index2 + 1); + } + + float ans = (changed - original) / (map_stats[map_stats.size() - 1].hits); + if (ans < 0) ans = -ans; + return ans; + } + /* + * \brief obtain preprocessing results for calculating delta MAP + * \param sorted_list the list containing entry information + * \param map_stats a vector containing the accumulated precisions for each position in a list + */ + inline void GetMAPStats(const std::vector &sorted_list, + std::vector &map_acc){ + map_acc.resize(sorted_list.size()); + float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0; + for (size_t i = 1; i <= sorted_list.size(); ++i) { + if (sorted_list[i - 1].label > 0.0f) { + hit++; + acc1 += hit / i; + acc2 += (hit - 1) / i; + acc3 += (hit + 1) / i; + } + map_acc[i - 1] = MAPStats(acc1,acc2,acc3,hit); + } + } + virtual void GetLambdaWeight(const std::vector &sorted_list, std::vector *io_pairs) { + std::vector &pairs = *io_pairs; + std::vector map_stats; + GetMAPStats(sorted_list, map_stats); + for (size_t i = 0; i < pairs.size(); ++i) { + pairs[i].weight = + GetLambdaMAP(sorted_list, pairs[i].pos_index, pairs[i].neg_index, map_stats); + } + } +}; + } // namespace learner } // namespace xgboost #endif // XGBOOST_LEARNER_OBJECTIVE_INL_HPP_ diff --git a/src/learner/objective.h b/src/learner/objective.h index bca035854..ff870c034 100644 --- a/src/learner/objective.h +++ b/src/learner/objective.h @@ -73,6 +73,9 @@ inline IObjFunction* CreateObjFunction(const char *name) { if (!strcmp("binary:logitraw", name)) return new RegLossObj(LossType::kLogisticRaw); if (!strcmp("multi:softmax", name)) return new SoftmaxMultiClassObj(0); if (!strcmp("multi:softprob", name)) return new SoftmaxMultiClassObj(1); + if (!strcmp("rank:pairwise", name )) return new PairwiseRankObj(); + if (!strcmp("rank:ndcg", name)) return new LambdaRankObjNDCG(); + if (!strcmp("rank:map", name)) return new LambdaRankObjMAP(); utils::Error("unknown objective function type: %s", name); return NULL; }