pairwise ranking implemented

2014-04-05 00:14:55 +08:00 · 2014-04-05 00:14:55 +08:00 · c62dea8325
commit c62dea8325
parent 0b1e584d73
3 changed files with 115 additions and 114 deletions
--- a/rank/xgboost_rank.h
+++ b/rank/xgboost_rank.h
@ -94,20 +94,20 @@ public:
    inline void InitTrainer( void ) {
        base_gbm.InitTrainer();
        if( mparam.loss_type == PAIRWISE) {
-            evaluator_.AddEval( "pairwise" );
+            evaluator_.AddEval( "PAIR" );
        } else if( mparam.loss_type == MAP) {
            evaluator_.AddEval( "MAP" );
        } else {
            evaluator_.AddEval( "NDCG" );
        }
        evaluator_.Init();
 	sampler.AssignSampler(mparam.sampler_type);
    }
    /*!
    * \brief initialize the current data storage for model, if the model is used first time, call this function
    *        Runs base_gbm.InitModel() first and then mparam.AdjustBase().
    */
    inline void InitModel( void ) {
        base_gbm.InitModel();
        mparam.AdjustBase();
    }
    /*!
    * \brief load model from stream
@ -134,6 +134,7 @@ public:
    inline void DumpPath( FILE *fo, const RMatrix &data ) {
        // pure delegation: forwards the stream and the wrapped data.data to the booster
        base_gbm.DumpPath( fo, data.data );
    }
    /*!
    * \brief save model to stream
    * \param fo output stream
@ -142,6 +143,7 @@ public:
        base_gbm.SaveModel( fo );
        fo.Write( &mparam, sizeof(ModelParam) );
    }
    /*!
     * \brief update the model for one iteration
     * \param iteration iteration number
@ -177,7 +179,7 @@ public:
        const unsigned ndata = static_cast<unsigned>( data.Size() );
        #pragma omp parallel for schedule( static )
        for( unsigned j = 0; j < ndata; ++ j ) {
-            preds[j] = mparam.base_score + base_gbm.Predict( data.data, j, -1 );
+            preds[j] = base_gbm.Predict( data.data, j, -1 );
        }
    }
@ -219,7 +221,7 @@ private:
        const unsigned ndata = static_cast<unsigned>( data.Size() );
        #pragma omp parallel for schedule( static )
        for( unsigned j = 0; j < ndata; ++ j ) {
-            preds[j] = mparam.base_score + base_gbm.InteractPredict( data.data, j, buffer_offset + j );
+            preds[j] = base_gbm.InteractPredict( data.data, j, buffer_offset + j );
        }
    }
    /*! \brief repredict trial */
@ -238,7 +240,7 @@ private:
        const unsigned ndata = static_cast<unsigned>( data.Size() );
        #pragma omp parallel for schedule( static )
        for( unsigned j = 0; j < ndata; ++ j ) {
-            preds[j] = mparam.base_score + base_gbm.Predict( data.data, j, buffer_offset + j );
+            preds[j] = base_gbm.Predict( data.data, j, buffer_offset + j );
 	}
    }
@ -254,23 +256,23 @@ private:
 	bool j_better;
 	float pred_diff,pred_diff_exp,first_order_gradient,second_order_gradient;
 	for(int i = 0; i < group_index.size() - 1; i++){
-	  xgboost::rank::sample::PairSamplerSet sampler;
+	  
-	  xgboost::rank::sample::Pairs pairs = sampler.GenPairs(preds,labels,group_index[i],group_index[i+1]);
+	  sample::Pairs pairs = sampler.GenPairs(preds,labels,group_index[i],group_index[i+1]);
-	  for(int j = group_index[i]; j < group_index[i+1]; j++){
+	  for(int j = group_index[i]; j < group_index[i + 1]; j++){
-	    std::vector<int> pair_instance = pairs.GetPairs(j);
+	      std::vector<int> pair_instance = pairs.GetPairs(j);
 	    j_better =  labels[j] > labels[pair_instance[k]];
 	    if(j_better){
 	      for(int k = 0; k < pair_instance.size(); k++){
-	         pred_diff = preds[preds[j] - pair_instance[k]];
+		 j_better =  labels[j] > labels[pair_instance[k]];
-	         pred_diff_exp =  j_better? expf(-pred_diff):expf(pred_diff);
+	         if(j_better){
-                 first_order_gradient = mparam.FirstOrderGradient(pred_diff_exp);	    
+	   	     pred_diff = preds[preds[j] - pair_instance[k]];
-	         second_order_gradient = 2 * mparam.SecondOrderGradient(pred_diff_exp);	    
+		     pred_diff_exp =  j_better? expf(-pred_diff):expf(pred_diff);
-	         hess[j] += second_order_gradient;
+                     first_order_gradient = mparam.FirstOrderGradient(pred_diff_exp);	    
-	         grad[j] += first_order_gradient;
+	             second_order_gradient = 2 * mparam.SecondOrderGradient(pred_diff_exp);	    
-	         hess[pair_instance[k]] += second_order_gradient;
+	             hess[j] += second_order_gradient;
-		 grad[pair_instance[k]] += -first_order_gradient;
+	             grad[j] += first_order_gradient;
 	             hess[pair_instance[k]] += second_order_gradient;
 		     grad[pair_instance[k]] += -first_order_gradient;
 	         }
 	      }
 	    }
 	  }
 	}
@ -285,17 +287,16 @@ private:
    /*! \brief training parameter for regression */
    struct ModelParam {
        /* \brief global bias */
        float base_score;
        /* \brief type of loss function */
        int loss_type;
        /* \brief number of features  */
        int num_feature;
        /*! \brief reserved field */
        int reserved[ 16 ];
 	/*! \brief sampler type */
 	int sampler_type;
        /*! \brief constructor */
        ModelParam( void ) {
            base_score = 0.5f;
            loss_type  = 0;
            num_feature = 0;
            memset( reserved, 0, sizeof( reserved ) );
@ -306,36 +307,11 @@ private:
        * \param val  value of the parameter
        */
        inline void SetParam( const char *name, const char *val ) {
            if( !strcmp("base_score", name ) )  base_score = (float)atof( val );
            if( !strcmp("loss_type", name ) )   loss_type = atoi( val );
            if( !strcmp("bst:num_feature", name ) ) num_feature = atoi( val );
-        }
+	    if( !strcmp("rank:sampler",name)) sampler = atoi( val );
        /*!
        * \brief adjust base_score: for loss_type 1 or 2 the score must lie
        *        strictly inside (0,1) and is replaced by -log(1/base_score - 1);
        *        for any other loss_type base_score is left untouched
        */
        inline void AdjustBase( void ) {
            const bool needs_logit = ( loss_type == 1 || loss_type == 2 );
            if( !needs_logit ) return;
            utils::Assert( base_score > 0.0f && base_score < 1.0f, "sigmoid range constrain" );
            base_score = - logf( 1.0f / base_score - 1.0f );
        }
        /*!
        * \brief transform the linear sum to prediction
        * \param x linear sum of boosting ensemble
        * \return 1/(1+exp(-x)) for PAIRWISE, MAP and NDCG; for any other
        *         loss_type utils::Error is raised and 0 is returned
        */
        inline float PredTransform( float x ) {
            const bool known_loss = ( loss_type == PAIRWISE ||
                                      loss_type == MAP ||
                                      loss_type == NDCG );
            if( !known_loss ) {
                utils::Error("unknown loss_type");
                return 0.0f;
            }
            // all three ranking losses share the same logistic squashing
            return 1.0f/(1.0f + expf(-x));
        }
        /*!
        * \brief calculate first order gradient of pairwise loss function(f(x) = ln(1+exp(-x)), 
@ -358,57 +334,11 @@ private:
        inline float SecondOrderGradient( float pred_diff_exp ) const {
            // Evaluates e/(1+e)^2 with e = pred_diff_exp, in double precision.
            const double e = pred_diff_exp;
            const double inv = 1.0 / ( 1.0 + e );
            // e/(1+e)^2 == e*inv*inv == inv*(1-inv).
            // For e < 1 the product form keeps full precision for tiny e.
            if( pred_diff_exp < 1.0f ) return static_cast<float>( e * inv * inv );
            // For e >= 1 this form has no cancellation and yields 0 (the true
            // limit) when e has overflowed to +inf, where e/(1+e)^2 would be
            // inf/inf = NaN. A NaN input still propagates as NaN.
            return static_cast<float>( inv * ( 1.0 - inv ) );
        }
        /*!
         * \brief dispatch to the loss implementation selected by loss_type
         * \param preds the given predictions
         * \param labels the given labels
         * \return SquareLoss for kLinearSquare, NegLoglikelihoodLoss for
         *         kLogisticNeglik / kLogisticClassify; otherwise utils::Error
         *         is raised and 0 is returned
         */
        inline float Loss(const std::vector<float> &preds, const std::vector<float> &labels) const {
            if( loss_type == kLinearSquare ) {
                return SquareLoss(preds,labels);
            }
            if( loss_type == kLogisticNeglik || loss_type == kLogisticClassify ) {
                return NegLoglikelihoodLoss(preds,labels);
            }
            utils::Error("unknown loss_type");
            return 0.0f;
        }
        /*!
         * \brief sum of squared differences between predictions and labels
         * \param preds the given predictions
         * \param labels the given labels; read at every index of preds
         * \return the summation of square loss
         */
        inline float SquareLoss(const std::vector<float> &preds, const std::vector<float> &labels) const {
            float total = 0.0;
            const size_t count = preds.size();
            for(size_t idx = 0; idx < count; ++idx) {
                const float residual = preds[idx] - labels[idx];
                total += residual * residual;
            }
            return total;
        }
        /*!
         * \brief calculating the negative log-likelihood, given the predictions and labels
         * \param preds the given predictions
         * \param labels the given labels; read at every index of preds
         * \return the summation of -( y*log(p) + (1-y)*log(1-p) ) over all instances
         */
        inline float NegLoglikelihoodLoss(const std::vector<float> &preds, const std::vector<float> &labels) const {
            float total = 0.0;
            const size_t count = preds.size();
            for(size_t idx = 0; idx < count; ++idx) {
                total -= labels[idx] * logf(preds[idx]) + ( 1 - labels[idx] ) * logf(1 - preds[idx]);
            }
            return total;
        }
    };
 private:
    int silent;
    RankEvalSet evaluator_;
    sample::PairSamplerWrapper sampler;
    booster::GBMBase base_gbm;
    ModelParam   mparam;
    const RMatrix *train_;
--- a/rank/xgboost_rank_eval.h
+++ b/rank/xgboost_rank_eval.h
@ -46,6 +46,14 @@ bool PairValueComparer(const Pair &a, const Pair &b){
    return a.value_ < b.value_;
 }
 struct EvalPair : public IRankEvaluator{
    // Placeholder evaluator: all three arguments are ignored and 0 is always
    // returned, so no pairwise metric is actually computed here.
    // NOTE(review): no Name() override is visible in this struct, although
    // RankEvalSet calls evals_[i]->Name() — confirm the base class supplies one.
    virtual float Eval( const std::vector<float> &preds,
                        const std::vector<float> &labels,
                        const std::vector<int> &group_index  ) const {
 	return 0;
    }  
 };
 /*! \brief Mean Average Precision */
 struct EvalMAP : public IRankEvaluator {
    virtual float Eval( const std::vector<float> &preds,
@ -130,6 +138,7 @@ namespace rank {
 struct RankEvalSet {
 public:
    inline void AddEval( const char *name ) {
        // The three tags are distinct strings, so at most one branch can match;
        // a name matching none of them registers nothing.
        if( strcmp( name, "PAIR" ) == 0 ) {
            evals_.push_back( &pair_ );
            return;
        }
        if( strcmp( name, "MAP" ) == 0 ) {
            evals_.push_back( &map_ );
            return;
        }
        if( strcmp( name, "NDCG" ) == 0 ) {
            evals_.push_back( &ndcg_ );
        }
    }
@ -144,12 +153,13 @@ public:
                      const std::vector<float> &labels,
                      const std::vector<int> &group_index ) const {
        for( size_t i = 0; i < evals_.size(); ++ i ) {
-            float res = evals_[i]->Eval( preds, labels );
+            float res = evals_[i]->Eval( preds, labels,group_index );
            fprintf( fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res );
        }
    }
 private:
    EvalPair pair_;
    EvalMAP map_;
    EvalNDCG ndcg_;
    std::vector<const IRankEvaluator*> evals_;
--- a/rank/xgboost_sample.h
+++ b/rank/xgboost_sample.h
@ -1,21 +1,47 @@
 #ifndef _XGBOOST_SAMPLE_H_
 #define _XGBOOST_SAMPLE_H_
 #include<vector>
 #include"../utils/xgboost_utils.h"
 namespace xgboost {
  namespace rank {
    namespace sample {
      /*
       * \brief the data structure to maintain the sample pairs
       */
      struct Pairs {
      /*
       * \brief constructor given the start and end offset of the sampling group
       *        in overall instances; creates one empty partner list for every
       *        instance index in [start, end)
       * \param start the begin index of the group (inclusive)
       * \param end the end index of the group (exclusive)
       */
      Pairs(int start,int end):start_(start),end_(end){
        // end_ is taken from the parameter `end`; initialising it from itself
        // would leave it indeterminate and break the bound check in GetPairs.
        for(int i = start; i < end; i++){
          pairs_.push_back(std::vector<int>());
        }
      }
      /*
       * \brief retrieve the related pair information of an data instances
       * \param index, the index of retrieved instance
       * \return the index of instances paired
       */
      std::vector<int> GetPairs(int index) {
-        utils::assert(index >= start_ && index < end_, "The query index out of sampling bound");
+        utils::assert(index >= start_ && index < end_,"The query index out of sampling bound");
        return pairs_[index-start_];
      }
      /*
       * \brief record a sampled pair by appending paired_index to the
       *        partner list kept for index
       * \param index the index of the instance that receives a partner
       * \param paired_index the index of the instance sampled as its partner
       */
      void push(int index,int paired_index){
        // lists are stored relative to the group start
        std::vector<int> &partners = pairs_[index - start_];
        partners.push_back(paired_index);
      }
      std::vector<std::vector<int>> pairs_;
@ -23,6 +49,9 @@ namespace xgboost {
      int end_;
    };
      /*
       * \brief the interface of pair sampler
       */
      struct IPairSampler {
 	  /*
 	   * \brief Generate sample pairs given the predictions, labels, the start and the end index
@ -34,31 +63,63 @@ namespace xgboost {
 	   * \return the generated pairs
 	   */
 	  virtual Pairs GenPairs(const std::vector<float> &preds,
-				const std::vector<float> &labels,
+				 const std::vector<float> &labels,
-			      int start,int end) = 0;
+			         int start,int end) = 0;
      };
-      /*! \brief a set of evaluators */
+      enum{
-        struct PairSamplerSet{
+	  BINARY_LINEAR_SAMPLER
-        public:
+      };
            inline void AssignSampler( const char *name ){                
                if( !strcmp( name, "rmse") ) evals_.push_back( &rmse_ );
                if( !strcmp( name, "error") ) evals_.push_back( &error_ );
                if( !strcmp( name, "logloss") ) evals_.push_back( &logloss_ );
            }
      /*! \brief Pair sampler for binary relevance labels.
       *         Walks the group once; every instance labelled 1 is matched with
       *         the next instance labelled 0 found by a cursor that advances
       *         cyclically through the group, and the pair is recorded in both
       *         directions. Labels are compared against exactly 0 and 1, so the
       *         caller should guarantee the labels are 0 or 1.
       */
      struct BinaryLinearSampler:public IPairSampler{
        virtual Pairs GenPairs(const std::vector<float> &preds,
                               const std::vector<float> &labels,
                               int start,int end) {
          Pairs result(start,end);
          const int group_size = end - start;
          int cursor = 0;                 // total steps taken so far; never reset
          int cursor_at_last_match = 0;   // cursor value when the last pair was made
          for(int pos = start; pos < end; ++pos){
            if(labels[pos] != 1) continue;
            int candidate = start;
            for(;;){
              ++cursor;
              candidate = cursor % group_size + start;
              if(labels[candidate] == 0) break;
              // more than a full lap without meeting a 0 label: stop sampling
              if(cursor - cursor_at_last_match > group_size) return result;
            }
            result.push(pos,candidate);
            result.push(candidate,pos);
            cursor_at_last_match = cursor;
          }
          return result;
        }
      };
      /*! \brief Pair Sampler Wrapper*/
        struct PairSamplerWrapper{
        public:
            inline void AssignSampler( int sampler_index ){                
 	      switch(sampler_index){
 		case BINARY_LINEAR_SAMPLER:sampler_ = &binary_linear_sampler;break;
 		default:utils::Error("Cannot find the specified sampler");
 	      }
 	    }
            Pairs GenPairs(const std::vector<float> &preds,
 			   const std::vector<float> &labels,
 			   int start,int end){
-			
+	      return sampler_.GenPairs(preds,labels,start,end);
 	    }
        private:
-            EvalRMSE  rmse_;
+	    BinaryLinearSampler binary_linear_sampler;
-            EvalError error_;
+	    IPairSampler *sampler_; 
            EvalLogLoss logloss_;
            std::vector<const IEvaluator*> evals_;  
        };
    }
  }