diff --git a/demo/rank/mq2008.conf b/demo/rank/mq2008.conf index 0d26580c2..65ad19b8e 100644 --- a/demo/rank/mq2008.conf +++ b/demo/rank/mq2008.conf @@ -2,7 +2,7 @@ # choose the tree booster, 0: tree, 1: linear booster_type = 0 -# so far, we have pairwise rank +# specify objective objective="rank:pairwise" # Tree Booster Parameters diff --git a/demo/rank/runexp.sh b/demo/rank/runexp.sh index c17ebee05..3867047f3 100755 --- a/demo/rank/runexp.sh +++ b/demo/rank/runexp.sh @@ -1,14 +1,8 @@ -#Download the dataset from web site -wget http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2008.rar +python trans_data.py train.txt mq2008.train mq2008.train.group -#please first install the unrar package -unrar x MQ2008 +python trans_data.py test.txt mq2008.test mq2008.test.group -python trans_data.py MQ2008/Fold1/train.txt mq2008.train mq2008.train.group - -python trans_data.py MQ2008/Fold1/test.txt mq2008.test mq2008.test.group - -python trans_data.py MQ2008/Fold1/vali.txt mq2008.vali mq2008.vali.group +python trans_data.py vali.txt mq2008.vali mq2008.vali.group ../../xgboost mq2008.conf diff --git a/regrank/xgboost_regrank_obj.h b/regrank/xgboost_regrank_obj.h index 24396101b..a33b828d9 100644 --- a/regrank/xgboost_regrank_obj.h +++ b/regrank/xgboost_regrank_obj.h @@ -116,7 +116,9 @@ namespace xgboost{ if( !strcmp("multi:softmax", name ) ) return new SoftmaxMultiClassObj(); if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj(); if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj(); - utils::Error("unknown objective function type"); + if( !strcmp("rank:map", name ) ) return new LambdaRankObj_MAP(); + if( !strcmp("rank:ndcg", name ) ) return new LambdaRankObj_NDCG(); + utils::Error("unknown objective function type"); return NULL; } }; diff --git a/regrank/xgboost_regrank_obj.hpp
b/regrank/xgboost_regrank_obj.hpp
index 71ebec0ab..70d4347f7 100644
--- a/regrank/xgboost_regrank_obj.hpp
+++ b/regrank/xgboost_regrank_obj.hpp
@@ -330,6 +330,189 @@ namespace xgboost{
             virtual ~PairwiseRankObj(void){}
             virtual void GetLambdaWeight( const std::vector<ListEntry> &sorted_list, std::vector<LambdaPair> &pairs ){}
         };
+
+        /*!
+         * \brief LambdaRank objective that weights each pair by the |delta NDCG| of swapping it
+         * NOTE(review): the element types ListEntry and LambdaPair are presumed to be the ones
+         *  declared by LambdaRankObj; confirm against the base class
+         */
+        class LambdaRankObj_NDCG : public LambdaRankObj{
+        public:
+            virtual ~LambdaRankObj_NDCG(void){}
+
+            /*!
+             * \brief DCG of the labels taken in the given order,
+             *  calculated as sigma_i (2^rel_i - 1) / log2(i + 2) with i the 0-based position
+             * \param labels relevance labels; each one is truncated to an unsigned grade
+             * \return the DCG, 0 when labels is empty or holds no positive grade
+             */
+            inline float CalcDCG( const std::vector<float> &labels ){
+                double sumdcg = 0.0;
+                for( size_t i = 0; i < labels.size(); i ++ ){
+                    const unsigned rel = static_cast<unsigned>( labels[i] );
+                    if( rel != 0 ){
+                        sumdcg += ( ( 1 << rel ) - 1 ) * Discount( i );
+                    }
+                }
+                return static_cast<float>( sumdcg );
+            }
+
+            /*!
+             * \brief DCG of the list after reordering it by descending label
+             * \param sorted_list the list containing entry information
+             * \return the ideal DCG of the list
+             */
+            inline float GetIDCG( const std::vector<ListEntry> &sorted_list ){
+                std::vector<float> labels;
+                labels.reserve( sorted_list.size() );
+                for( size_t i = 0; i < sorted_list.size(); i ++ ){
+                    labels.push_back( sorted_list[i].label );
+                }
+                std::sort( labels.begin(), labels.end(), std::greater<float>() );
+                return CalcDCG( labels );
+            }
+
+            /*!
+             * \brief Obtain the absolute change of NDCG when the instances at index1 and index2
+             *  of the sorted list switch positions. The gain 2^rel is used instead of 2^rel - 1
+             *  because the constant cancels in the difference; the discount is the same
+             *  1 / log2(i + 2) that CalcDCG uses, so the result is on the scale of IDCG
+             * \param sorted_list the list containing entry information
+             * \param index1,index2 the instances switched
+             * \param IDCG the IDCG of the list
+             * \return |delta NDCG|, 0 when IDCG is not positive
+             */
+            inline float GetLambdaNDCG( const std::vector<ListEntry> &sorted_list,
+                                        int index1,
+                                        int index2, float IDCG ){
+                // no relevant instance in the list: NDCG is undefined, so give the pair no weight
+                if( !( IDCG > 0.0f ) ) return 0.0f;
+                const double gain1 = 1 << static_cast<int>( sorted_list[index1].label );
+                const double gain2 = 1 << static_cast<int>( sorted_list[index2].label );
+                const double original = gain1 * Discount( index1 ) + gain2 * Discount( index2 );
+                const double changed = gain2 * Discount( index1 ) + gain1 * Discount( index2 );
+                double ans = ( original - changed ) / IDCG;
+                if( ans < 0 ) ans = -ans;
+                return static_cast<float>( ans );
+            }
+
+            virtual void GetLambdaWeight( const std::vector<ListEntry> &sorted_list, std::vector<LambdaPair> &pairs ){
+                const float IDCG = GetIDCG( sorted_list );
+                for( size_t i = 0; i < pairs.size(); i ++ ){
+                    pairs[i].weight = GetLambdaNDCG( sorted_list,
+                                                     pairs[i].pos_index, pairs[i].neg_index, IDCG );
+                }
+            }
+
+        private:
+            /*! \brief position discount 1 / log2(pos + 2) shared by CalcDCG and GetLambdaNDCG */
+            static inline double Discount( double pos ){
+                return log( 2.0 ) / log( pos + 2.0 );
+            }
+        };
+
+        /*!
+         * \brief LambdaRank objective that weights each pair by the |delta MAP| of swapping it;
+         *  an instance counts as positive when its label truncated to int is greater than 0
+         */
+        class LambdaRankObj_MAP : public LambdaRankObj{
+            /*! \brief statistics of a ranked list accumulated up to and including one position */
+            class Quadruple{
+            public:
+                /*! \brief the accumulated precision */
+                float ap_acc_;
+                /*! \brief the accumulated precision assuming a positive instance is missing */
+                float ap_acc_miss_;
+                /*! \brief the accumulated precision assuming that one more positive instance is inserted ahead */
+                float ap_acc_add_;
+                /*! \brief the accumulated positive instance count */
+                float hits_;
+
+                Quadruple( void )
+                    : ap_acc_( 0.0f ), ap_acc_miss_( 0.0f ), ap_acc_add_( 0.0f ), hits_( 0.0f ){}
+
+                Quadruple( float ap_acc, float ap_acc_miss, float ap_acc_add, float hits )
+                    : ap_acc_( ap_acc ), ap_acc_miss_( ap_acc_miss ), ap_acc_add_( ap_acc_add ), hits_( hits ){}
+            };
+
+            /*! \brief whether a label counts as a positive instance */
+            static inline bool IsPositive( float label ){
+                return static_cast<int>( label ) > 0;
+            }
+
+        public:
+            virtual ~LambdaRankObj_MAP(void){}
+
+            /*!
+             * \brief Obtain the absolute change of MAP when the instances at index1 and index2
+             *  of the sorted list switch positions
+             * \param sorted_list the list containing entry information
+             * \param index1,index2 the instances switched
+             * \param map_acc the accumulated statistics of each position, as filled by GetMAPAcc
+             * \return |delta MAP|, 0 when the swap cannot change MAP
+             */
+            inline float GetLambdaMAP( const std::vector<ListEntry> &sorted_list,
+                                       int index1, int index2,
+                                       const std::vector< Quadruple > &map_acc ){
+                if( index1 == index2 || map_acc.empty() ) return 0.0f;
+                const float total_hits = map_acc.back().hits_;
+                if( total_hits == 0 ) return 0.0f;
+                if( index1 > index2 ) std::swap( index1, index2 );
+                const bool upper_hit = IsPositive( sorted_list[index1].label );
+                const bool lower_hit = IsPositive( sorted_list[index2].label );
+                // swapping two positives or two negatives leaves every precision unchanged
+                if( upper_hit == lower_hit ) return 0.0f;
+                // the accumulated precision in the interval [index1,index2]
+                float original = map_acc[index2].ap_acc_;
+                if( index1 != 0 ) original -= map_acc[index1 - 1].ap_acc_;
+                float changed = 0.0f;
+                if( lower_hit ){
+                    // the positive moves up: positions in between see one more hit ahead
+                    changed += map_acc[index2 - 1].ap_acc_add_ - map_acc[index1].ap_acc_add_;
+                    changed += ( map_acc[index1].hits_ + 1.0f ) / ( index1 + 1 );
+                }
+                else{
+                    // the positive moves down: positions in between see one less hit ahead
+                    changed += map_acc[index2 - 1].ap_acc_miss_ - map_acc[index1].ap_acc_miss_;
+                    changed += map_acc[index2].hits_ / ( index2 + 1 );
+                }
+                float ans = ( changed - original ) / total_hits;
+                if( ans < 0 ) ans = -ans;
+                return ans;
+            }
+
+            /*!
+             * \brief preprocessing results for calculating delta MAP
+             * \param sorted_list the list containing entry information
+             * \param map_acc output, resized to the list length; entry i holds, for positions 0..i:
+             *  the accumulated precision, the accumulated precision assuming a positive instance
+             *  is missing, the accumulated precision assuming that one more positive instance
+             *  is inserted, and the accumulated positive instance count
+             */
+            inline void GetMAPAcc( const std::vector<ListEntry> &sorted_list,
+                                   std::vector< Quadruple > &map_acc ){
+                map_acc.resize( sorted_list.size() );
+                float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0;
+                for( size_t i = 1; i <= sorted_list.size(); i ++ ){
+                    if( IsPositive( sorted_list[i - 1].label ) ){
+                        hit ++;
+                        acc1 += hit / i;
+                        acc2 += ( hit - 1 ) / i;
+                        acc3 += ( hit + 1 ) / i;
+                    }
+                    map_acc[i - 1] = Quadruple( acc1, acc2, acc3, hit );
+                }
+            }
+
+            virtual void GetLambdaWeight( const std::vector<ListEntry> &sorted_list, std::vector<LambdaPair> &pairs ){
+                std::vector< Quadruple > map_acc;
+                GetMAPAcc( sorted_list, map_acc );
+                for( size_t i = 0; i < pairs.size(); i ++ ){
+                    pairs[i].weight = GetLambdaMAP( sorted_list, pairs[i].pos_index, pairs[i].neg_index, map_acc );
+                }
+            }
+        };
+
     };
 };
 #endif