From c62dea83254b2b71f559f7c51e30a033b751f3d4 Mon Sep 17 00:00:00 2001 From: kalenhaha Date: Sat, 5 Apr 2014 00:14:55 +0800 Subject: [PATCH] pairwise ranking implemented --- rank/xgboost_rank.h | 124 +++++++++------------------------------ rank/xgboost_rank_eval.h | 12 +++- rank/xgboost_sample.h | 93 ++++++++++++++++++++++++----- 3 files changed, 115 insertions(+), 114 deletions(-) diff --git a/rank/xgboost_rank.h b/rank/xgboost_rank.h index f5e032a5d..3f0986e25 100644 --- a/rank/xgboost_rank.h +++ b/rank/xgboost_rank.h @@ -94,20 +94,20 @@ public: inline void InitTrainer( void ) { base_gbm.InitTrainer(); if( mparam.loss_type == PAIRWISE) { - evaluator_.AddEval( "pairwise" ); + evaluator_.AddEval( "PAIR" ); } else if( mparam.loss_type == MAP) { evaluator_.AddEval( "MAP" ); } else { evaluator_.AddEval( "NDCG" ); } evaluator_.Init(); + sampler.AssignSampler(mparam.sampler_type); } /*! * \brief initialize the current data storage for model, if the model is used first time, call this function */ inline void InitModel( void ) { base_gbm.InitModel(); - mparam.AdjustBase(); } /*! * \brief load model from stream @@ -134,6 +134,7 @@ public: inline void DumpPath( FILE *fo, const RMatrix &data ) { base_gbm.DumpPath( fo, data.data ); } + /*! * \brief save model to stream * \param fo output stream @@ -142,6 +143,7 @@ public: base_gbm.SaveModel( fo ); fo.Write( &mparam, sizeof(ModelParam) ); } + /*! * \brief update the model for one iteration * \param iteration iteration number @@ -177,7 +179,7 @@ public: const unsigned ndata = static_cast( data.Size() ); #pragma omp parallel for schedule( static ) for( unsigned j = 0; j < ndata; ++ j ) { - preds[j] = mparam.base_score + base_gbm.Predict( data.data, j, -1 ); + preds[j] = base_gbm.Predict( data.data, j, -1 ); } } @@ -219,7 +221,7 @@ private: const unsigned ndata = static_cast( data.Size() ); #pragma omp parallel for schedule( static ) for( unsigned j = 0; j < ndata; ++ j ) { - preds[j] = mparam.base_score + base_gbm.InteractPredict( data.data, j, buffer_offset + j ); + preds[j] = base_gbm.InteractPredict( data.data, j, buffer_offset + j ); } } /*! \brief repredict trial */ @@ -238,7 +240,7 @@ private: const unsigned ndata = static_cast( data.Size() ); #pragma omp parallel for schedule( static ) for( unsigned j = 0; j < ndata; ++ j ) { - preds[j] = mparam.base_score + base_gbm.Predict( data.data, j, buffer_offset + j ); + preds[j] = base_gbm.Predict( data.data, j, buffer_offset + j ); } } @@ -254,23 +256,23 @@ private: bool j_better; float pred_diff,pred_diff_exp,first_order_gradient,second_order_gradient; for(int i = 0; i < group_index.size() - 1; i++){ - xgboost::rank::sample::PairSamplerSet sampler; - xgboost::rank::sample::Pairs pairs = sampler.GenPairs(preds,labels,group_index[i],group_index[i+1]); - for(int j = group_index[i]; j < group_index[i+1]; j++){ - std::vector pair_instance = pairs.GetPairs(j); - j_better = labels[j] > labels[pair_instance[k]]; - if(j_better){ + + sample::Pairs pairs = sampler.GenPairs(preds,labels,group_index[i],group_index[i+1]); + for(int j = group_index[i]; j < group_index[i + 1]; j++){ + std::vector pair_instance = pairs.GetPairs(j); for(int k = 0; k < pair_instance.size(); k++){ - pred_diff = preds[preds[j] - pair_instance[k]]; - pred_diff_exp = j_better? expf(-pred_diff):expf(pred_diff); - first_order_gradient = mparam.FirstOrderGradient(pred_diff_exp); - second_order_gradient = 2 * mparam.SecondOrderGradient(pred_diff_exp); - hess[j] += second_order_gradient; - grad[j] += first_order_gradient; - hess[pair_instance[k]] += second_order_gradient; - grad[pair_instance[k]] += -first_order_gradient; + j_better = labels[j] > labels[pair_instance[k]]; + if(j_better){ + pred_diff = preds[preds[j] - pair_instance[k]]; + pred_diff_exp = j_better? expf(-pred_diff):expf(pred_diff); + first_order_gradient = mparam.FirstOrderGradient(pred_diff_exp); + second_order_gradient = 2 * mparam.SecondOrderGradient(pred_diff_exp); + hess[j] += second_order_gradient; + grad[j] += first_order_gradient; + hess[pair_instance[k]] += second_order_gradient; + grad[pair_instance[k]] += -first_order_gradient; + } } - } } } @@ -282,20 +284,19 @@ private: MAP = 1, NDCG = 2 }; - + /*! \brief training parameter for regression */ struct ModelParam { - /* \brief global bias */ - float base_score; /* \brief type of loss function */ int loss_type; /* \brief number of features */ int num_feature; /*! \brief reserved field */ int reserved[ 16 ]; + /*! \brief sampler type */ + int sampler_type; /*! \brief constructor */ ModelParam( void ) { - base_score = 0.5f; loss_type = 0; num_feature = 0; memset( reserved, 0, sizeof( reserved ) ); @@ -306,36 +307,11 @@ private: * \param val value of the parameter */ inline void SetParam( const char *name, const char *val ) { - if( !strcmp("base_score", name ) ) base_score = (float)atof( val ); if( !strcmp("loss_type", name ) ) loss_type = atoi( val ); if( !strcmp("bst:num_feature", name ) ) num_feature = atoi( val ); - } - /*! - * \brief adjust base_score - */ - inline void AdjustBase( void ) { - if( loss_type == 1 || loss_type == 2 ) { - utils::Assert( base_score > 0.0f && base_score < 1.0f, "sigmoid range constrain" ); - base_score = - logf( 1.0f / base_score - 1.0f ); - } + if( !strcmp("rank:sampler",name)) sampler = atoi( val ); } - /*! - * \brief transform the linear sum to prediction - * \param x linear sum of boosting ensemble - * \return transformed prediction - */ - inline float PredTransform( float x ) { - switch( loss_type ) { - case PAIRWISE: - case MAP: - case NDCG: - return 1.0f/(1.0f + expf(-x)); - default: - utils::Error("unknown loss_type"); - return 0.0f; - } - } /*! * \brief calculate first order gradient of pairwise loss function(f(x) = ln(1+exp(-x)), @@ -358,57 +334,11 @@ private: inline float SecondOrderGradient( float pred_diff_exp ) const { return pred_diff_exp/pow(1 + pred_diff_exp,2); } - - /*! - * \brief calculating the loss, given the predictions, labels and the loss type - * \param preds the given predictions - * \param labels the given labels - * \return the specified loss - */ - inline float Loss(const std::vector &preds, const std::vector &labels) const { - switch( loss_type ) { - case kLinearSquare: - return SquareLoss(preds,labels); - case kLogisticNeglik: - case kLogisticClassify: - return NegLoglikelihoodLoss(preds,labels); - default: - utils::Error("unknown loss_type"); - return 0.0f; - } - } - - /*! - * \brief calculating the square loss, given the predictions and labels - * \param preds the given predictions - * \param labels the given labels - * \return the summation of square loss - */ - inline float SquareLoss(const std::vector &preds, const std::vector &labels) const { - float ans = 0.0; - for(size_t i = 0; i < preds.size(); i++) { - float dif = preds[i] - labels[i]; - ans += dif * dif; - } - return ans; - } - - /*! - * \brief calculating the square loss, given the predictions and labels - * \param preds the given predictions - * \param labels the given labels - * \return the summation of square loss - */ - inline float NegLoglikelihoodLoss(const std::vector &preds, const std::vector &labels) const { - float ans = 0.0; - for(size_t i = 0; i < preds.size(); i++) - ans -= labels[i] * logf(preds[i]) + ( 1 - labels[i] ) * logf(1 - preds[i]); - return ans; - } }; private: int silent; RankEvalSet evaluator_; + sample::PairSamplerWrapper sampler; booster::GBMBase base_gbm; ModelParam mparam; const RMatrix *train_; diff --git a/rank/xgboost_rank_eval.h b/rank/xgboost_rank_eval.h index 4e9a24638..ab36cbafd 100644 --- a/rank/xgboost_rank_eval.h +++ b/rank/xgboost_rank_eval.h @@ -46,6 +46,14 @@ bool PairValueComparer(const Pair &a, const Pair &b){ return a.value_ < b.value_; } +struct EvalPair : public IRankEvaluator{ + virtual float Eval( const std::vector &preds, + const std::vector &labels, + const std::vector &group_index ) const { + return 0; + } +}; + /*! \brief Mean Average Precision */ struct EvalMAP : public IRankEvaluator { virtual float Eval( const std::vector &preds, @@ -130,6 +138,7 @@ namespace rank { struct RankEvalSet { public: inline void AddEval( const char *name ) { + if( !strcmp( name, "PAIR" )) evals_.push_back( &pair_); if( !strcmp( name, "MAP") ) evals_.push_back( &map_ ); if( !strcmp( name, "NDCG") ) evals_.push_back( &ndcg_ ); } @@ -144,12 +153,13 @@ public: const std::vector &labels, const std::vector &group_index ) const { for( size_t i = 0; i < evals_.size(); ++ i ) { - float res = evals_[i]->Eval( preds, labels ); + float res = evals_[i]->Eval( preds, labels,group_index ); fprintf( fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res ); } } private: + EvalPair pair_; EvalMAP map_; EvalNDCG ndcg_; std::vector evals_; diff --git a/rank/xgboost_sample.h b/rank/xgboost_sample.h index bd3bb30d8..81f2fd454 100644 --- a/rank/xgboost_sample.h +++ b/rank/xgboost_sample.h @@ -1,28 +1,57 @@ #ifndef _XGBOOST_SAMPLE_H_ #define _XGBOOST_SAMPLE_H_ +#include #include"../utils/xgboost_utils.h" namespace xgboost { namespace rank { namespace sample { + /* + * \brief the data structure to maintain the sample pairs + */ struct Pairs { + /* + * \brief constructor given the start and end offset of the sampling group + * in overall instances + * \param start the begin index of the group + * \param end the end index of the group + */ + Pairs(int start,int end):start_(start),end_(end_){ + for(int i = start; i < end; i++){ + vector v; + pairs_.push_back(v); + } + } /* * \brief retrieve the related pair information of an data instances * \param index, the index of retrieved instance * \return the index of instances paired */ std::vector GetPairs(int index) { - utils::assert(index >= start_ && index < end_, "The query index out of sampling bound"); + utils::assert(index >= start_ && index < end_,"The query index out of sampling bound"); + return pairs_[index-start_]; } + /* + * \brief add in a sampled pair + * \param index the index of the instance to sample a friend + * \param paired_index the index of the instance sampled as a friend + */ + void push(int index,int paired_index){ + pairs_[index - start_].push_back(paired_index); + } + std::vector> pairs_; int start_; int end_; }; + /* + * \brief the interface of pair sampler + */ struct IPairSampler { /* * \brief Generate sample pairs given the predcions, labels, the start and the end index @@ -34,31 +63,63 @@ namespace xgboost { * \return the generated pairs */ virtual Pairs GenPairs(const std::vector &preds, - const std::vector &labels, - int start,int end) = 0; + const std::vector &labels, + int start,int end) = 0; + }; - /*! \brief a set of evaluators */ - struct PairSamplerSet{ + enum{ + BINARY_LINEAR_SAMPLER + }; + + /*! \brief A simple pair sampler when the rank relevence scale is binary + * for each positive instance, we will pick a negative + * instance and add in a pair. When using binary linear sampler, + * we should guarantee the labels are 0 or 1 + */ + struct BinaryLinearSampler:public IPairSampler{ + virtual Pairs GenPairs(const std::vector &preds, + const std::vector &labels, + int start,int end) { + Pairs pairs(start,end); + int pointer = 0, last_pointer = 0,index = start, interval = end - start; + for(int i = start; i < end; i++){ + if(labels[i] == 1){ + while(true){ + index = (++pointer) % interval + start; + if(labels[index] == 0) break; + if(pointer - last_pointer > interval) return pairs; + } + pairs.push(i,index); + pairs.push(index,i); + last_pointer = pointer; + } + } + return pairs; + } + }; + + + /*! \brief Pair Sampler Wrapper*/ + struct PairSamplerWrapper{ public: - inline void AssignSampler( const char *name ){ - if( !strcmp( name, "rmse") ) evals_.push_back( &rmse_ ); - if( !strcmp( name, "error") ) evals_.push_back( &error_ ); - if( !strcmp( name, "logloss") ) evals_.push_back( &logloss_ ); - } + inline void AssignSampler( int sampler_index ){ + switch(sampler_index){ + case BINARY_LINEAR_SAMPLER:sampler_ = &binary_linear_sampler;break; + + default:utils::Error("Cannot find the specified sampler"); + } + } Pairs GenPairs(const std::vector &preds, const std::vector &labels, int start,int end){ - - + return sampler_.GenPairs(preds,labels,start,end); } private: - EvalRMSE rmse_; - EvalError error_; - EvalLogLoss logloss_; - std::vector evals_; + BinaryLinearSampler binary_linear_sampler; + IPairSampler *sampler_; }; } }