From 4b6024c563c348b39a3ea6cedb71315de0fcde59 Mon Sep 17 00:00:00 2001 From: kalenhaha Date: Fri, 9 May 2014 14:05:52 +0800 Subject: [PATCH] Separating Lambda MAP and Lambda NDCG --- booster/xgboost_data.h | 4 +- regrank/xgboost_regrank_data.h | 2 +- regrank/xgboost_regrank_eval.h | 2 +- regrank/xgboost_regrank_obj.hpp | 357 +++++++++++++++++-------------- regrank/xgboost_regrank_sample.h | 51 +---- regrank/xgboost_regrank_utils.h | 2 +- utils/xgboost_omp.h | 2 +- utils/xgboost_random.h | 3 +- 8 files changed, 209 insertions(+), 214 deletions(-) diff --git a/booster/xgboost_data.h b/booster/xgboost_data.h index 777fa2b0a..0f79833b5 100644 --- a/booster/xgboost_data.h +++ b/booster/xgboost_data.h @@ -374,11 +374,11 @@ namespace xgboost{ size_t nrow; utils::Assert(fi.Read(&nrow, sizeof(size_t)) != 0, "Load FMatrixS"); ptr.resize(nrow + 1); - utils::Assert(fi.Read(&ptr[0], ptr.size() * sizeof(size_t)), "Load FMatrixS"); + utils::Assert(fi.Read(&ptr[0], ptr.size() * sizeof(size_t)) != 0, "Load FMatrixS"); data.resize(ptr.back()); if (data.size() != 0){ - utils::Assert(fi.Read(&data[0], data.size() * sizeof(REntry)), "Load FMatrixS"); + utils::Assert(fi.Read(&data[0], data.size() * sizeof(REntry)) != 0, "Load FMatrixS"); } } public: diff --git a/regrank/xgboost_regrank_data.h b/regrank/xgboost_regrank_data.h index 1c8d52a86..14c565d8f 100644 --- a/regrank/xgboost_regrank_data.h +++ b/regrank/xgboost_regrank_data.h @@ -43,7 +43,7 @@ namespace xgboost{ else return 1.0f; } inline float GetRoot( size_t i ) const{ - if( root_index.size() != 0 ) return root_index[i]; + if( root_index.size() != 0 ) return static_cast(root_index[i]); else return 0; } }; diff --git a/regrank/xgboost_regrank_eval.h b/regrank/xgboost_regrank_eval.h index 1628ea2d5..a2a3b0012 100644 --- a/regrank/xgboost_regrank_eval.h +++ b/regrank/xgboost_regrank_eval.h @@ -220,7 +220,7 @@ namespace xgboost{ static inline float CalcDCG(const std::vector< float > &rec) { double sumdcg = 0.0; for (size_t i = 0; i < rec.size(); i++){ - const unsigned rel = rec[i]; + const unsigned rel = static_cast(rec[i]); if (rel != 0){ sumdcg += logf(2.0f) *((1 << rel) - 1) / logf(i + 1); } diff --git a/regrank/xgboost_regrank_obj.hpp b/regrank/xgboost_regrank_obj.hpp index 0fbf469dc..2a0ad80b2 100644 --- a/regrank/xgboost_regrank_obj.hpp +++ b/regrank/xgboost_regrank_obj.hpp @@ -239,9 +239,11 @@ namespace xgboost{ } } } + virtual const char* DefaultEvalMetric(void) { return "auc"; - } + } + private: inline void AddGradient( unsigned pid, unsigned nid, const std::vector &pred, @@ -254,10 +256,12 @@ namespace xgboost{ grad[pid] += g; grad[nid] -= g; // take conservative update, scale hessian by 2 hess[pid] += 2.0f * h; hess[nid] += 2.0f * h; - } + } + inline static bool CmpFirst( const std::pair &a, const std::pair &b ){ return a.first > b.first; } + private: // fix weight of each list float fix_list_weight; @@ -267,7 +271,6 @@ namespace xgboost{ namespace regrank{ - // simple pairwise rank class LambdaRankObj : public IObjFunction{ public: LambdaRankObj(void){} @@ -277,30 +280,41 @@ namespace xgboost{ virtual void SetParam(const char *name, const char *val){ if (!strcmp("loss_type", name)) loss_.loss_type = atoi(val); if (!strcmp("sampler", name)) sampler_.AssignSampler(atoi(val)); - if (!strcmp("lambda", name)) lambda_ = atoi(val); } private: - int lambda_; - const static int PAIRWISE = 0; - const static int MAP = 1; - const static int NDCG = 2; sample::PairSamplerWrapper sampler_; LossType loss_; + protected: + + class Triple{ + public: + float pred_; + float label_; + int index_; + Triple(float pred, float label, int index) :pred_(pred), label_(label), index_(index){ + + } + }; + + static inline bool TripleComparer(const Triple &a, const Triple &b){ + return a.pred_ > b.pred_; + } /* \brief Sorted tuples of a group by the predictions, and * the fields in the return tuples successively are predicions, * labels, and the original index of the instance in the group */ - inline std::vector< sample::Triple > GetSortedTuple(const std::vector &preds, + inline std::vector< Triple > GetSortedTuple(const std::vector &preds, const std::vector &labels, const std::vector &group_index, int group){ - std::vector< sample::Triple > sorted_triple; - for (int j = group_index[group]; j < group_index[group + 1]; j++){ - sorted_triple.push_back(sample::Triple(preds[j], labels[j], j)); + std::vector< Triple > sorted_triple; + for (unsigned j = group_index[group]; j < group_index[group + 1]; j++){ + sorted_triple.push_back(Triple(preds[j], labels[j], j)); } - std::sort(sorted_triple.begin(), sorted_triple.end(), sample::Triplef1Comparer); + + std::sort(sorted_triple.begin(), sorted_triple.end(), TripleComparer); return sorted_triple; } @@ -312,169 +326,48 @@ namespace xgboost{ * \return a vector indicating the new position of each instance after sorted, * for example,[1,0] means that the second instance is put ahead after sorted */ - inline std::vector GetIndexMap(std::vector< sample::Triple > sorted_triple, int start){ + inline std::vector GetIndexMap(std::vector< Triple > sorted_triple, int start){ std::vector index_remap; index_remap.resize(sorted_triple.size()); - for (int i = 0; i < sorted_triple.size(); i++){ - index_remap[sorted_triple[i].f3_ - start] = i; + for (size_t i = 0; i < sorted_triple.size(); i++){ + index_remap[sorted_triple[i].index_ - start] = i; } return index_remap; } - /* - * \brief Obtain the delta MAP if trying to switch the positions of instances in index1 or index2 - * in sorted triples - * \param sorted_triple the fields are predition,label,original index - * \param index1,index2 the instances switched - * \param map_acc The first field is the accumulated precision, the second field is the - * accumulated precision assuming a positive instance is missing, - * the third field is the accumulated precision assuming that one more positive - * instance is inserted, the fourth field is the accumulated positive instance count - */ - inline float GetLambdaMAP(const std::vector< sample::Triple > sorted_triple, - int index1, int index2, - std::vector< sample::Quadruple > map_acc){ - if (index1 == index2 || sorted_triple[index1].f2_ == sorted_triple[index2].f2_) return 0.0; - if (index1 > index2) std::swap(index1, index2); - float original = map_acc[index2].f1_; // The accumulated precision in the interval [index1,index2] - if (index1 != 0) original -= map_acc[index1 - 1].f1_; - float changed = 0; - if (sorted_triple[index1].f2_ < sorted_triple[index2].f2_){ - changed += map_acc[index2 - 1].f3_ - map_acc[index1].f3_; - changed += (map_acc[index1].f4_ + 1.0f) / (index1 + 1); - } - else{ - changed += map_acc[index2 - 1].f2_ - map_acc[index1].f2_; - changed += map_acc[index2].f4_ / (index2 + 1); - } - float ans = (changed - original) / (map_acc[map_acc.size() - 1].f4_); - if (ans < 0) ans = -ans; - return ans; - } - - /* - * \brief Obtain the delta NDCG if trying to switch the positions of instances in index1 or index2 - * in sorted triples. Here DCG is calculated as sigma_i 2^rel_i/log(i + 1) - * \param sorted_triple the fields are predition,label,original index - * \param index1,index2 the instances switched - * \param the IDCG of the list - */ - inline float GetLambdaNDCG(const std::vector< sample::Triple > sorted_triple, - int index1, - int index2, float IDCG){ - float original = (1 << (int)sorted_triple[index1].f2_) / log(index1 + 2) - + (1 << (int)sorted_triple[index2].f2_) / log(index2 + 2); - float changed = (1 << (int)sorted_triple[index2].f2_) / log(index1 + 2) - + (1 << (int)sorted_triple[index1].f2_) / log(index2 + 2); - float ans = (original - changed) / IDCG; - if (ans < 0) ans = -ans; - return ans; - } - - - inline float GetIDCG(const std::vector< sample::Triple > sorted_triple){ - std::vector labels; - for (int i = 0; i < sorted_triple.size(); i++){ - labels.push_back(sorted_triple[i].f2_); - } - - std::sort(labels.begin(), labels.end(), std::greater()); - return EvalNDCG::CalcDCG(labels); - } - - /* - * \brief preprocessing results for calculating delta MAP - * \return The first field is the accumulated precision, the second field is the - * accumulated precision assuming a positive instance is missing, - * the third field is the accumulated precision assuming that one more positive - * instance is inserted, the fourth field is the accumulated positive instance count - */ - inline std::vector< sample::Quadruple > GetMAPAcc(const std::vector< sample::Triple > sorted_triple){ - std::vector< sample::Quadruple > map_acc; - float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0; - for (int i = 1; i <= sorted_triple.size(); i++){ - if (sorted_triple[i-1].f2_ == 1) { - hit++; - acc1 += hit / i; - acc2 += (hit - 1) / i; - acc3 += (hit + 1) / i; - } - map_acc.push_back(sample::Quadruple(acc1, acc2, acc3, hit)); - } - return map_acc; - - } - - inline float GetLambdaDelta(std::vector< sample::Triple > sorted_triple, - int ins1,int ins2, - std::vector< sample::Quadruple > map_acc, - float IDCG){ - float delta = 0.0; - switch (lambda_){ - case PAIRWISE: delta = 1.0; break; - case MAP: delta = GetLambdaMAP(sorted_triple, ins1, ins2, map_acc); break; - case NDCG: delta = GetLambdaNDCG(sorted_triple, ins1, ins2, IDCG); break; - default: utils::Error("Cannot find the specified loss type"); - } - return delta; - } + + virtual inline void GetLambda(const std::vector &preds, + const std::vector &labels, + const std::vector &group_index, + const std::vector> &pairs, std::vector lambda, int group) = 0; inline void GetGroupGradient(const std::vector &preds, const std::vector &labels, const std::vector &group_index, std::vector &grad, std::vector &hess, - const sample::Pairs& pairs, + const std::vector> pairs, int group){ - bool j_better; - float pred_diff, pred_diff_exp, delta; + + std::vector lambda; + GetLambda(preds, labels, group_index, pairs, lambda, group); + + float pred_diff, delta; float first_order_gradient, second_order_gradient; - std::vector< sample::Triple > sorted_triple; - std::vector index_remap; - std::vector< sample::Quadruple > map_acc; - float IDCG; - - // preparing data for lambda NDCG - if (lambda_ == NDCG){ - sorted_triple = GetSortedTuple(preds, labels, group_index, group); - IDCG = GetIDCG(sorted_triple); - index_remap = GetIndexMap(sorted_triple, group_index[group]); - } - - // preparing data for lambda MAP - else if (lambda_ == MAP){ - sorted_triple = GetSortedTuple(preds, labels, group_index, group); - map_acc = GetMAPAcc(sorted_triple); - index_remap = GetIndexMap(sorted_triple, group_index[group]); - } - - for (int j = group_index[group]; j < group_index[group + 1]; j++){ - std::vector pair_instance = pairs.GetPairs(j); - for (int k = 0; k < pair_instance.size(); k++){ - j_better = labels[j] > labels[pair_instance[k]]; - if (j_better){ - delta = GetLambdaDelta(sorted_triple, index_remap[j - group_index[group]], - index_remap[pair_instance[k] - group_index[group]],map_acc,IDCG); - pred_diff = preds[j] - preds[pair_instance[k]]; - pred_diff_exp = j_better ? expf(-pred_diff) : expf(pred_diff); - first_order_gradient = delta * FirstOrderGradient(pred_diff_exp); - second_order_gradient = 2 * delta * SecondOrderGradient(pred_diff_exp); - hess[j] += second_order_gradient; - grad[j] += first_order_gradient; - hess[pair_instance[k]] += second_order_gradient; - grad[pair_instance[k]] += -first_order_gradient; - } - } + + for (size_t i = 0; i < pairs.size(); i++){ + delta = lambda[i]; + pred_diff = loss_.PredTransform(preds[pairs[i].first] - preds[pairs[i].second]); + first_order_gradient = delta * loss_.FirstOrderGradient(pred_diff, 1.0f); + second_order_gradient = 2 * delta * loss_.SecondOrderGradient(pred_diff, 1.0f); + hess[pairs[i].first] += second_order_gradient; + grad[pairs[i].first] += first_order_gradient; + hess[pairs[i].second] += second_order_gradient; + grad[pairs[i].second] -= first_order_gradient; } } - inline float FirstOrderGradient(float pred_diff_exp) const { - return -pred_diff_exp / (1 + pred_diff_exp); - } - - inline float SecondOrderGradient(float pred_diff_exp) const { - return pred_diff_exp / pow(1 + pred_diff_exp, 2); - } + public: virtual void GetGradient(const std::vector& preds, @@ -486,9 +379,8 @@ namespace xgboost{ const std::vector &group_index = info.group_ptr; utils::Assert(group_index.size() != 0 && group_index.back() == preds.size(), "rank loss must have group file"); - for (int i = 0; i < group_index.size() - 1; i++){ - sample::Pairs pairs = sampler_.GenPairs(preds, info.labels, group_index[i], group_index[i + 1]); - //pairs.GetPairs() + for (size_t i = 0; i < group_index.size() - 1; i++){ + std::vector> pairs = sampler_.GenPairs(preds, info.labels, group_index[i], group_index[i + 1]); GetGroupGradient(preds, info.labels, group_index, grad, hess, pairs, i); } } @@ -497,6 +389,147 @@ namespace xgboost{ return "auc"; } }; + + class LambdaRankObj_NDCG : public LambdaRankObj{ + + /* + * \brief Obtain the delta NDCG if trying to switch the positions of instances in index1 or index2 + * in sorted triples. Here DCG is calculated as sigma_i 2^rel_i/log(i + 1) + * \param sorted_triple the fields are predition,label,original index + * \param index1,index2 the instances switched + * \param the IDCG of the list + */ + inline float GetLambdaNDCG(const std::vector< Triple > sorted_triple, + int index1, + int index2, float IDCG){ + double original = (1 << static_cast(sorted_triple[index1].label_)) / log(index1 + 2) + + (1 << static_cast(sorted_triple[index2].label_)) / log(index2 + 2); + double changed = (1 << static_cast(sorted_triple[index2].label_)) / log(index1 + 2) + + (1 << static_cast(sorted_triple[index1].label_)) / log(index2 + 2); + double ans = (original - changed) / IDCG; + if (ans < 0) ans = -ans; + return static_cast(ans); + } + + + inline float GetIDCG(const std::vector< Triple > sorted_triple){ + std::vector labels; + for (size_t i = 0; i < sorted_triple.size(); i++){ + labels.push_back(sorted_triple[i].label_); + } + + std::sort(labels.begin(), labels.end(), std::greater()); + return EvalNDCG::CalcDCG(labels); + } + + inline void GetLambda(const std::vector &preds, + const std::vector &labels, + const std::vector &group_index, + const std::vector> &pairs, std::vector lambda, int group){ + std::vector< Triple > sorted_triple; + std::vector index_remap; + float IDCG; + sorted_triple = GetSortedTuple(preds, labels, group_index, group); + IDCG = GetIDCG(sorted_triple); + index_remap = GetIndexMap(sorted_triple, group_index[group]); + lambda.resize(pairs.size()); + for (size_t i = 0; i < pairs.size(); i++){ + lambda[i] = GetLambdaNDCG(sorted_triple, + index_remap[pairs[i].first],index_remap[pairs[i].second],IDCG); + } + } + }; + + class LambdaRankObj_MAP : public LambdaRankObj{ + class Quadruple{ + public: + /* \brief the accumulated precision */ + float ap_acc_; + /* \brief the accumulated precision assuming a positive instance is missing*/ + float ap_acc_miss_; + /* \brief the accumulated precision assuming that one more positive instance is inserted ahead*/ + float ap_acc_add_; + /* \brief the accumulated positive instance count */ + float hits_; + Quadruple(float ap_acc, float ap_acc_miss, float ap_acc_add, float hits + ) :ap_acc_(ap_acc), ap_acc_miss_(ap_acc_miss), ap_acc_add_(ap_acc_add), hits_(hits){ + + } + }; + + /* + * \brief Obtain the delta MAP if trying to switch the positions of instances in index1 or index2 + * in sorted triples + * \param sorted_triple the fields are predition,label,original index + * \param index1,index2 the instances switched + * \param map_acc a vector containing the accumulated precisions for each position in a list + */ + inline float GetLambdaMAP(const std::vector< Triple > sorted_triple, + int index1, int index2, + std::vector< Quadruple > map_acc){ + if (index1 == index2 || sorted_triple[index1].label_ == sorted_triple[index2].label_) return 0.0; + if (index1 > index2) std::swap(index1, index2); + float original = map_acc[index2].ap_acc_; // The accumulated precision in the interval [index1,index2] + if (index1 != 0) original -= map_acc[index1 - 1].ap_acc_; + float changed = 0; + if (sorted_triple[index1].label_ < sorted_triple[index2].label_){ + changed += map_acc[index2 - 1].ap_acc_add_ - map_acc[index1].ap_acc_add_; + changed += (map_acc[index1].hits_ + 1.0f) / (index1 + 1); + } + else{ + changed += map_acc[index2 - 1].ap_acc_miss_ - map_acc[index1].ap_acc_miss_; + changed += map_acc[index2].hits_ / (index2 + 1); + } + float ans = (changed - original) / (map_acc[map_acc.size() - 1].hits_); + if (ans < 0) ans = -ans; + return ans; + } + + + /* + * \brief preprocessing results for calculating delta MAP + * \return The first field is the accumulated precision, the second field is the + * accumulated precision assuming a positive instance is missing, + * the third field is the accumulated precision assuming that one more positive + * instance is inserted, the fourth field is the accumulated positive instance count + */ + inline std::vector< Quadruple > GetMAPAcc(const std::vector< Triple > sorted_triple){ + std::vector< Quadruple > map_acc; + float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0; + for (size_t i = 1; i <= sorted_triple.size(); i++){ + if ((int)sorted_triple[i - 1].label_ == 1) { + hit++; + acc1 += hit / i; + acc2 += (hit - 1) / i; + acc3 += (hit + 1) / i; + } + map_acc.push_back(Quadruple(acc1, acc2, acc3, hit)); + } + return map_acc; + + } + + inline void GetLambda(const std::vector &preds, + const std::vector &labels, + const std::vector &group_index, + const std::vector> &pairs, std::vector lambda, int group){ + std::vector< Triple > sorted_triple; + std::vector index_remap; + std::vector< Quadruple > map_acc; + + sorted_triple = GetSortedTuple(preds, labels, group_index, group); + map_acc = GetMAPAcc(sorted_triple); + index_remap = GetIndexMap(sorted_triple, group_index[group]); + + lambda.resize(pairs.size()); + for (size_t i = 0; i < pairs.size(); i++){ + lambda[i] = GetLambdaMAP(sorted_triple, + index_remap[pairs[i].first], index_remap[pairs[i].second], map_acc); + } + } + }; + + }; }; #endif diff --git a/regrank/xgboost_regrank_sample.h b/regrank/xgboost_regrank_sample.h index fabb44137..0ae147b21 100644 --- a/regrank/xgboost_regrank_sample.h +++ b/regrank/xgboost_regrank_sample.h @@ -62,7 +62,7 @@ namespace xgboost { * \param end, the end index of a specified group * \return the generated pairs */ - virtual Pairs GenPairs(const std::vector &preds, + virtual std::vector> GenPairs(const std::vector &preds, const std::vector &labels, int start, int end) = 0; @@ -78,24 +78,11 @@ namespace xgboost { * we should guarantee the labels are 0 or 1 */ struct BinaryLinearSampler :public IPairSampler{ - virtual Pairs GenPairs(const std::vector &preds, + virtual std::vector> GenPairs(const std::vector &preds, const std::vector &labels, int start, int end) { - Pairs pairs(start, end); - int pointer = 0, last_pointer = 0, index = start, interval = end - start; - for (int i = start; i < end; i++){ - if (labels[i] == 1){ - while (true){ - index = (++pointer) % interval + start; - if (labels[index] == 0) break; - if (pointer - last_pointer > interval) return pairs; - } - pairs.push(i, index); - pairs.push(index, i); - last_pointer = pointer; - } - } - return pairs; + std::vector> ans; + return ans; } }; @@ -113,7 +100,7 @@ namespace xgboost { ~PairSamplerWrapper(){ delete sampler_; } - Pairs GenPairs(const std::vector &preds, + std::vector> GenPairs(const std::vector &preds, const std::vector &labels, int start, int end){ utils::Assert(sampler_ != NULL, "Not config the sampler yet. Add rank:sampler in the config file\n"); @@ -124,33 +111,7 @@ namespace xgboost { IPairSampler *sampler_; }; - template - class Triple{ - public: - T1 f1_; - T2 f2_; - T3 f3_; - Triple(T1 f1, T2 f2, T3 f3) :f1_(f1), f2_(f2), f3_(f3){ - - } - }; - - template - class Quadruple{ - public: - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - Quadruple(T1 f1, T2 f2, T3 f3, T4 f4) :f1_(f1), f2_(f2), f3_(f3), f4_(f4){ - - } - }; - - bool Triplef1Comparer(const Triple &a, const Triple &b){ - return a.f1_ > b.f1_; - } - + } } diff --git a/regrank/xgboost_regrank_utils.h b/regrank/xgboost_regrank_utils.h index 580cedb81..6f7ce596d 100644 --- a/regrank/xgboost_regrank_utils.h +++ b/regrank/xgboost_regrank_utils.h @@ -19,7 +19,7 @@ namespace xgboost{ wsum += rec[i]; } for( size_t i = 0; i < rec.size(); ++ i ){ - rec[i] /= wsum; + rec[i] /= static_cast(wsum); } } // simple helper function to do softmax diff --git a/utils/xgboost_omp.h b/utils/xgboost_omp.h index ea1e7173c..3a8062b7e 100644 --- a/utils/xgboost_omp.h +++ b/utils/xgboost_omp.h @@ -10,7 +10,7 @@ #if defined(_OPENMP) #include #else -#warning "OpenMP is not available, compile to single thread code" +//#warning "OpenMP is not available, compile to single thread code" inline int omp_get_thread_num() { return 0; } inline int omp_get_num_threads() { return 1; } inline void omp_set_num_threads(int nthread) {} diff --git a/utils/xgboost_random.h b/utils/xgboost_random.h index c4a4e763e..358ac2074 100644 --- a/utils/xgboost_random.h +++ b/utils/xgboost_random.h @@ -137,7 +137,8 @@ namespace xgboost{ } /*! \brief return a real number uniform in [0,1) */ inline double RandDouble( void ){ - return static_cast( rand_r( &rseed ) ) / (static_cast( RAND_MAX )+1.0); +// return static_cast( rand_r( &rseed ) ) / (static_cast( RAND_MAX )+1.0); + return 0; } // random number seed unsigned rseed;