check in rank loss

2014-08-17 20:32:02 -07:00
parent 9df8bb1397
commit 5a472145de
3 changed files with 300 additions and 1 deletions
--- a/src/learner/evaluation-inl.hpp
+++ b/src/learner/evaluation-inl.hpp
@@ -216,7 +216,9 @@ struct EvalRankList : public IEvaluator {
                     const MetaInfo &info) const {
    utils::Check(preds.size() == info.labels.size(),
                  "label size predict size not match");
-    const std::vector<unsigned> &gptr = info.group_ptr;
+    // fall back to a single group covering all predictions when no group info is available
+    std::vector<unsigned> tgptr(2, 0); tgptr[1] = preds.size();
+    const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
    utils::Assert(gptr.size() != 0, "must specify group when constructing rank file");
    utils::Assert(gptr.back() == preds.size(),
                   "EvalRanklist: group structure must match number of prediction");
--- a/src/learner/objective-inl.hpp
+++ b/src/learner/objective-inl.hpp
@@ -7,9 +7,13 @@
 */
 #include <vector>
 #include <cmath>
+#include <algorithm>
+#include <functional>
 #include "../data.h"
 #include "./objective.h"
 #include "./helper_utils.h"
+#include "../utils/random.h"
+#include "../utils/omp.h"

 namespace xgboost {
 namespace learner {
@@ -223,6 +227,296 @@ class SoftmaxMultiClassObj : public IObjFunction {
  int output_prob;
 };

+/*! \brief base objective for pairwise/lambda ranking; subclasses set pair weights in GetLambdaWeight */
+class LambdaRankObj : public IObjFunction {
+ public:
+  LambdaRankObj(void) {
+    loss.loss_type = LossType::kLogisticRaw;
+    fix_list_weight = 0.0f;
+    num_pairsample = 1;
+  }
+  virtual ~LambdaRankObj(void) {}
+  /*! \brief set loss_type, fix_list_weight or num_pairsample from text; other names are ignored */
+  virtual void SetParam(const char *name, const char *val) {
+    if (!strcmp( "loss_type", name )) loss.loss_type = atoi(val);
+    if (!strcmp( "fix_list_weight", name)) fix_list_weight = static_cast<float>(atof(val));
+    if (!strcmp( "num_pairsample", name)) num_pairsample = atoi(val);
+  }
+  /*! \brief fill out_gpair with per-row gradient/hessian summed over pairs sampled inside each group */
+  virtual void GetGradient(const std::vector<float>& preds,
+                           const MetaInfo &info,
+                           int iter,
+                           std::vector<bst_gpair> *out_gpair) {
+    utils::Assert(preds.size() == info.labels.size(), "label size predict size not match");
+    std::vector<bst_gpair> &gpair = *out_gpair;
+    gpair.resize(preds.size());
+    // when no group structure is given, treat all rows as one single group
+    std::vector<unsigned> tgptr(2, 0); tgptr[1] = preds.size();
+    const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
+    utils::Check(gptr.size() != 0 && gptr.back() == preds.size(),
+                 "group structure not consistent with #rows");
+    const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
+    #pragma omp parallel
+    {
+      // declared inside the parallel region so that each thread owns its random number
+      // generator and scratch buffers; the generator is seeded by iteration and thread id
+      random::Random rnd; rnd.Seed(iter* 1111 + omp_get_thread_num());
+      std::vector<LambdaPair> pairs;
+      std::vector<ListEntry>  lst;
+      std::vector< std::pair<float,unsigned> > rec;
+      #pragma omp for schedule(static)
+      for (unsigned k = 0; k < ngroup; ++k) {
+        lst.clear(); pairs.clear();
+        for (unsigned j = gptr[k]; j < gptr[k+1]; ++j) {
+          lst.push_back(ListEntry(preds[j], info.labels[j], j));
+          gpair[j] = bst_gpair(0.0f, 0.0f);
+        }
+        std::sort(lst.begin(), lst.end(), ListEntry::CmpPred);
+        rec.resize(lst.size());
+        for (unsigned i = 0; i < lst.size(); ++i) {
+          rec[i] = std::make_pair(lst[i].label, i);
+        }
+        std::sort(rec.begin(), rec.end(), CmpFirst);
+        // walk buckets of equal label; pair every bucket item with a random item outside the bucket
+        for (unsigned i = 0; i < rec.size(); ) {
+          unsigned j = i + 1;
+          while (j < rec.size() && rec[j].first == rec[i].first) ++j;
+          // the bucket is rec[i,j); nleft / nright count the entries before / after it
+          unsigned nleft = i, nright = rec.size() - j;
+          if (nleft + nright != 0) {
+            int nsample = num_pairsample;
+            while (nsample --) {
+              for (unsigned pid = i; pid < j; ++pid) {
+                unsigned ridx = static_cast<unsigned>(rnd.RandDouble() * (nleft+nright));
+                if (ridx < nleft) {
+                  pairs.push_back(LambdaPair(rec[ridx].second, rec[pid].second));
+                } else {
+                  pairs.push_back(LambdaPair(rec[pid].second, rec[ridx+j-i].second));
+                }
+              }
+            }
+          }
+          i = j;
+        }
+        // let the subclass fill in the weight of every sampled pair
+        this->GetLambdaWeight(lst, &pairs);
+        // average over sampling rounds; if fix_list_weight is set, also scale by it / list length
+        float scale = 1.0f / num_pairsample;
+        if (fix_list_weight != 0.0f) {
+          scale *= fix_list_weight / (gptr[k+1] - gptr[k]);
+        }
+        for (size_t i = 0; i < pairs.size(); ++i) {
+          const ListEntry &pos = lst[pairs[i].pos_index];
+          const ListEntry &neg = lst[pairs[i].neg_index];
+          const float w = pairs[i].weight * scale;
+          float p = loss.PredTransform(pos.pred - neg.pred);
+          float g = loss.FirstOrderGradient(p, 1.0f);
+          float h = loss.SecondOrderGradient(p, 1.0f);
+          // accumulate on both rows; the hessian must carry the same pair weight w as the gradient
+          gpair[pos.rindex].grad += g * w;
+          gpair[pos.rindex].hess += 2.0f * w * h;
+          gpair[neg.rindex].grad -= g * w;
+          gpair[neg.rindex].hess += 2.0f * w * h;
+        }
+      }
+    }
+  }
+  virtual const char* DefaultEvalMetric(void) {
+    return "map";
+  }
+
+ protected:
+  /*! \brief one entry of a group, as stored in the per-group working list */
+  struct ListEntry {
+    /*! \brief the predicted score of the entry */
+    float pred;
+    /*! \brief the actual label of the entry */
+    float label;
+    /*! \brief row index in the data matrix */
+    unsigned rindex;
+    // constructor
+    ListEntry(float pred, float label, unsigned rindex)
+        : pred(pred), label(label), rindex(rindex) {}
+    // comparator by prediction, larger prediction first
+    inline static bool CmpPred(const ListEntry &a, const ListEntry &b) {
+      return a.pred > b.pred;
+    }
+    // comparator by label, larger label first
+    inline static bool CmpLabel(const ListEntry &a, const ListEntry &b) {
+      return a.label > b.label;
+    }
+  };
+  /*! \brief a sampled (positive, negative) pair in the lambda rank */
+  struct LambdaPair {
+    /*! \brief positive index: this is a position in the list */
+    unsigned pos_index;
+    /*! \brief negative index: this is a position in the list */
+    unsigned neg_index;
+    /*! \brief weight to be filled in, 1.0f until GetLambdaWeight changes it */
+    float weight;
+    // constructor
+    LambdaPair(unsigned pos_index, unsigned neg_index)
+        : pos_index(pos_index), neg_index(neg_index), weight(1.0f) {}
+  };
+  /*!
+   * \brief get lambda weight for existing pairs
+   * \param sorted_list the entries of one group, sorted by pred score in descending order
+   * \param io_pairs record of pairs, containing the pairs to fill in weights
+   */
+  virtual void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,
+                               std::vector<LambdaPair> *io_pairs) = 0;
+
+ private:
+  // loss function applied to the score difference of a pair
+  LossType loss;
+  // number of sampling rounds performed for each instance
+  int num_pairsample;
+  // if non-zero, pair scales are additionally multiplied by this value / list length
+  float fix_list_weight;
+};
+
+class PairwiseRankObj: public LambdaRankObj{  // pairwise ranking objective built on LambdaRankObj
+ public:
+  virtual ~PairwiseRankObj(void){}
+
+ protected:
+  virtual void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,
+                               std::vector<LambdaPair> *io_pairs) {}  // no-op: pair weights are left unchanged
+};
+
+/*! \brief beta version of NDCG lambda rank: each pair is weighted by |delta NDCG| of swapping its two entries */
+class LambdaRankObjNDCG : public LambdaRankObj {
+ public:
+  virtual ~LambdaRankObjNDCG(void) {}
+
+ protected:
+  virtual void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,
+                               std::vector<LambdaPair> *io_pairs) {
+    std::vector<LambdaPair> &pairs = *io_pairs;
+    float IDCG;  // DCG of the labels in descending order; reused below as its reciprocal
+    {
+      std::vector<float> labels(sorted_list.size());
+      for (size_t i = 0; i < sorted_list.size(); ++i) {
+        labels[i] = sorted_list[i].label;
+      }
+      std::sort(labels.begin(), labels.end(), std::greater<float>());
+      IDCG = CalcDCG(labels);
+    }
+    // a zero IDCG cannot serve as a normaliser, so every pair gets weight 0 in that case
+    if (IDCG == 0.0) {
+      for (size_t i = 0; i < pairs.size(); ++i) {
+        pairs[i].weight = 0.0f;
+      }
+    } else {
+      IDCG = 1.0f / IDCG;  // from here on IDCG holds 1 / (ideal DCG)
+      for (size_t i = 0; i < pairs.size(); ++i) {
+        unsigned pos_idx = pairs[i].pos_index;
+        unsigned neg_idx = pairs[i].neg_index;
+        float pos_loginv = 1.0f / logf(pos_idx+2.0f);  // position discount (natural log)
+        float neg_loginv = 1.0f / logf(neg_idx+2.0f);
+        int pos_label = static_cast<int>(sorted_list[pos_idx].label);
+        int neg_label = static_cast<int>(sorted_list[neg_idx].label);  // gains below are 2^label - 1
+        float original = ((1<<pos_label)-1) * pos_loginv + ((1<<neg_label)-1) * neg_loginv;
+        float changed  = ((1<<neg_label)-1) * pos_loginv + ((1<<pos_label)-1) * neg_loginv;
+        float delta = (original-changed) * IDCG;  // DCG change of the swap, normalised by IDCG
+        if (delta < 0.0f) delta = - delta;  // only the magnitude is used as weight
+        pairs[i].weight = delta;
+      }
+    }
+  }
+  inline static float CalcDCG(const std::vector<float> &labels) {  // DCG of labels in the given order
+    double sumdcg = 0.0;
+    for (size_t i = 0; i < labels.size(); ++i) {
+      const unsigned rel = labels[i];  // NOTE(review): implicit float -> unsigned; assumes small non-negative labels - confirm
+      if (rel != 0) { 
+        sumdcg += ((1<<rel)-1) / logf(i + 2);  // gain 2^rel - 1, discounted by log(position + 2)
+      }
+    }
+    return static_cast<float>(sumdcg);
+  }
+};
+
+class LambdaRankObjMAP : public LambdaRankObj {  // lambda rank objective weighting pairs by |delta MAP|
+ public:
+  virtual ~LambdaRankObjMAP(void) {}
+
+ protected:
+  struct MAPStats {
+    /*! \brief sum of hits / rank over the positive entries up to this position */
+    float ap_acc;
+    /*! \brief the same sum assuming one positive instance is missing: (hits - 1) / rank */
+    float ap_acc_miss;
+    /*! \brief the same sum assuming one more positive instance is inserted ahead: (hits + 1) / rank */
+    float ap_acc_add;
+    /*! \brief the number of positive instances up to and including this position */
+    float hits;
+    MAPStats(void) {}  // leaves all fields uninitialised
+    MAPStats(float ap_acc, float ap_acc_miss, float ap_acc_add, float hits)
+        : ap_acc(ap_acc), ap_acc_miss(ap_acc_miss), ap_acc_add(ap_acc_add), hits(hits) {}
+  };
+  /*!
+   * \brief absolute change of average precision when the entries at index1 and index2 swap places
+   * \param sorted_list the list containing entry information; labels > 0 count as positive
+   * \param index1,index2 positions in sorted_list of the two instances switched
+   * \param map_stats a vector containing the accumulated precisions for each position in a list
+   * \return the magnitude of the change; 0 if positions or binarised labels coincide or there is no hit
+   */
+  inline float GetLambdaMAP(const std::vector<ListEntry> &sorted_list,
+                            int index1, int index2,
+                            std::vector<MAPStats> &map_stats){
+    if (index1 == index2 || map_stats[map_stats.size() - 1].hits == 0) {
+      return 0.0f;
+    }
+    if (index1 > index2) std::swap(index1, index2);  // from here on index1 < index2
+    float original = map_stats[index2].ap_acc;  // accumulated precision over positions [index1, index2]
+    if (index1 != 0) original -= map_stats[index1 - 1].ap_acc;
+    float changed = 0;
+    float label1 = sorted_list[index1].label > 0.0f ? 1.0f : 0.0f;
+    float label2 = sorted_list[index2].label > 0.0f ? 1.0f : 0.0f;
+    if (label1 == label2) {
+      return 0.0;
+    } else if (label1 < label2) {  // the positive entry moves up from index2 to index1
+      changed += map_stats[index2 - 1].ap_acc_add - map_stats[index1].ap_acc_add;
+      changed += (map_stats[index1].hits + 1.0f) / (index1 + 1);
+    } else {  // the positive entry moves down from index1 to index2
+      changed += map_stats[index2 - 1].ap_acc_miss - map_stats[index1].ap_acc_miss;
+      changed += map_stats[index2].hits / (index2 + 1);
+    }
+    // normalise by the total number of positive entries and keep only the magnitude
+    float ans = (changed - original) / (map_stats[map_stats.size() - 1].hits);
+    if (ans < 0) ans = -ans;
+    return ans;
+  }  
+  /*!
+   * \brief obtain preprocessing results for calculating delta MAP
+   * \param sorted_list the list containing entry information
+   * \param map_acc output, resized to sorted_list.size(); entry i holds the statistics of positions 0..i
+   */
+  inline void GetMAPStats(const std::vector<ListEntry> &sorted_list,
+                          std::vector<MAPStats> &map_acc){
+    map_acc.resize(sorted_list.size());
+    float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0;
+    for (size_t i = 1; i <= sorted_list.size(); ++i) {  // i is the 1-based rank
+      if (sorted_list[i - 1].label > 0.0f) {
+        hit++;
+        acc1 += hit / i;
+        acc2 += (hit - 1) / i;
+        acc3 += (hit + 1) / i;
+      }
+      map_acc[i - 1] = MAPStats(acc1,acc2,acc3,hit);
+    }
+  }  
+  virtual void GetLambdaWeight(const std::vector<ListEntry> &sorted_list, std::vector<LambdaPair> *io_pairs) {
+    std::vector<LambdaPair> &pairs = *io_pairs;
+    std::vector<MAPStats> map_stats;
+    GetMAPStats(sorted_list, map_stats);  // computed once, then shared by all pairs of this list
+    for (size_t i = 0; i < pairs.size(); ++i) {
+      pairs[i].weight = 
+          GetLambdaMAP(sorted_list, pairs[i].pos_index, pairs[i].neg_index, map_stats);
+    }
+  }
+};
+
 }  // namespace learner
 }  // namespace xgboost
 #endif  // XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
--- a/src/learner/objective.h
+++ b/src/learner/objective.h
@@ -73,6 +73,9 @@ inline IObjFunction* CreateObjFunction(const char *name) {
  if (!strcmp("binary:logitraw", name)) return new RegLossObj(LossType::kLogisticRaw);
  if (!strcmp("multi:softmax", name)) return new SoftmaxMultiClassObj(0);
  if (!strcmp("multi:softprob", name)) return new SoftmaxMultiClassObj(1);
+  if (!strcmp("rank:pairwise", name )) return new PairwiseRankObj();
+  if (!strcmp("rank:ndcg", name)) return new LambdaRankObjNDCG();
+  if (!strcmp("rank:map", name)) return new LambdaRankObjMAP();  
  utils::Error("unknown objective function type: %s", name);
  return NULL;
 }