bug fix in pairwise rank

This commit is contained in:
tqchen 2014-05-15 15:37:58 -07:00
parent 37e1473cea
commit e2d13db24e
3 changed files with 45 additions and 36 deletions

View File

@ -2,7 +2,7 @@
# choose the tree booster, 0: tree, 1: linear
booster_type = 0
#objective="rank:pairwise"
objective="rank:pairwise"
#objective="rank:softmax"
#objective="lambdarank:map"
#objective="lambdarank:ndcg"
@ -15,8 +15,9 @@ bst:gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
bst:min_child_weight = 1
# maximum depth of a tree
bst:max_depth = 3
eval_metric='ndcg'
bst:max_depth = 6
eval_metric = "ndcg"
eval_metric = "map"
# Task parameters
# the number of rounds to do boosting
num_round = 4

View File

@ -174,7 +174,6 @@ namespace xgboost{
}
};
/*! \brief Area under curve, for both classification and rank */
struct EvalAuc : public IEvaluator{
virtual float Eval(const std::vector<float> &preds,
@ -292,17 +291,6 @@ namespace xgboost{
struct EvalNDCG : public EvalRankList{
public:
EvalNDCG(const char *name):EvalRankList(name){}
/*!
 * \brief Discounted cumulative gain of a list of relevance labels given in rank order.
 *
 * The entry at 0-based position i contributes (2^rel - 1) * log(2) / log(i + 2),
 * i.e. (2^rel - 1) / log2(i + 2). Entries whose label truncates to 0 contribute nothing.
 *
 * \param rec relevance labels, one per ranked position, best-ranked first;
 *            each label is truncated to an unsigned integer before use
 * \return the DCG of the list; 0 for an empty list
 */
static inline float CalcDCG(const std::vector< float > &rec) {
    double sumdcg = 0.0;
    for (size_t i = 0; i < rec.size(); i++){
        const unsigned rel = static_cast<unsigned>(rec[i]);
        if (rel != 0){
            // i is 0-based, so the discount must be log(i + 2): with log(i + 1)
            // the top position would divide by log(1) == 0 and produce inf
            sumdcg += logf(2.0f) *((1 << rel) - 1) / logf(static_cast<float>(i + 2));
        }
    }
    return static_cast<float>(sumdcg);
}
protected:
inline float CalcDCG( const std::vector< std::pair<float,unsigned> > &rec ) const {
double sumdcg = 0.0;
@ -315,9 +303,9 @@ namespace xgboost{
return static_cast<float>(sumdcg);
}
virtual float EvalMetric( std::vector< std::pair<float, unsigned> > &rec ) const {
std::sort(rec.begin(), rec.end(), CmpFirst);
float idcg = this->CalcDCG(rec);
std::sort(rec.begin(), rec.end(), CmpSecond);
float idcg = this->CalcDCG(rec);
std::sort(rec.begin(), rec.end(), CmpFirst);
float dcg = this->CalcDCG(rec);
if( idcg == 0.0f ) return 0.0f;
else return dcg/idcg;

View File

@ -185,13 +185,15 @@ namespace xgboost{
class PairwiseRankObj : public IObjFunction{
public:
PairwiseRankObj(void){
loss.loss_type = LossType::kLinearSquare;
loss.loss_type = LossType::kLogisticRaw;
fix_list_weight = 0.0f;
num_pairsample = 1;
}
virtual ~PairwiseRankObj(){}
/*!
 * \brief set one named parameter of this objective from its string value
 *
 * Recognised names are "loss_type", "fix_list_weight" and "num_pairsample";
 * any other name is silently ignored.
 * \param name parameter name
 * \param val  parameter value as text, parsed with atoi/atof
 */
virtual void SetParam(const char *name, const char *val){
    if( strcmp( name, "loss_type" ) == 0 ){
        loss.loss_type = atoi( val );
    }else if( strcmp( name, "fix_list_weight" ) == 0 ){
        fix_list_weight = static_cast<float>( atof( val ) );
    }else if( strcmp( name, "num_pairsample" ) == 0 ){
        num_pairsample = atoi( val );
    }
}
virtual void GetGradient(const std::vector<float>& preds,
const DMatrix::Info &info,
@ -224,21 +226,33 @@ namespace xgboost{
while( j < rec.size() && rec[j].first == rec[i].first ) ++ j;
// bucket in [i,j), get a sample outside bucket
unsigned nleft = i, nright = rec.size() - j;
for( unsigned pid = i; pid < j; ++ pid ){
unsigned ridx = static_cast<int>( rnd.RandDouble() * (nleft+nright) );
if( ridx < nleft ){
// get the samples in left side, ridx is pos sample
this->AddGradient( rec[ridx].second, rec[pid].second, preds, grad, hess );
}else{
// get samples in right side, ridx is negsample
this->AddGradient( rec[pid].second, rec[ridx+j-i].second, preds, grad, hess );
if( nleft + nright != 0 ){
int nsample = num_pairsample;
while( nsample -- ){
for( unsigned pid = i; pid < j; ++ pid ){
unsigned ridx = static_cast<unsigned>( rnd.RandDouble() * (nleft+nright) );
if( ridx < nleft ){
// get the samples in left side, ridx is pos sample
this->AddGradient( rec[ridx].second, rec[pid].second, preds, grad, hess );
}else{
// get samples in right side, ridx is negsample
this->AddGradient( rec[pid].second, rec[ridx+j-i].second, preds, grad, hess );
}
}
}
}
}else{
for( unsigned pid = i; pid < j; ++ pid ){
utils::Assert( rec[pid].first == 0.0f );
}
}
i = j;
}
// rescale each gradient and hessian so that the list has constant weight
float scale = 1.0f / num_pairsample;
if( fix_list_weight != 0.0f ){
float scale = fix_list_weight / (gptr[k+1] - gptr[k]);
scale *= fix_list_weight / (gptr[k+1] - gptr[k]);
}
if( scale != 1.0f ){
for(unsigned j = gptr[k]; j < gptr[k+1]; ++j ){
grad[j] *= scale; hess[j] *= scale;
}
@ -246,11 +260,9 @@ namespace xgboost{
}
}
}
/*! \brief name of the default evaluation metric for this objective; always the literal "ndcg" */
virtual const char* DefaultEvalMetric(void) {
return "ndcg";
}
private:
inline void AddGradient( unsigned pid, unsigned nid,
const std::vector<float> &pred,
@ -263,13 +275,10 @@ namespace xgboost{
grad[pid] += g; grad[nid] -= g;
// take conservative update, scale hessian by 2
hess[pid] += 2.0f * h; hess[nid] += 2.0f * h;
}
/*!
 * \brief comparator that orders pairs by descending first element
 * \return true when a.first is strictly greater than b.first; the second element is ignored
 */
inline static bool CmpFirst( const std::pair<float,unsigned> &a, const std::pair<float,unsigned> &b ){
    return b.first < a.first;
}
private:
// number of samples performed for each instance
int num_pairsample;
// fix weight of each list
float fix_list_weight;
LossType loss;
@ -448,6 +457,17 @@ namespace xgboost{
class LambdaRankObj_NDCG : public LambdaRankObj{
/*!
 * \brief Discounted cumulative gain of relevance labels listed in rank order.
 *
 * The label at 0-based position p adds log(2) * (2^rel - 1) / log(p + 2) to the sum,
 * where rel is the label truncated to an unsigned integer; labels that truncate
 * to 0 add nothing.
 * \param rec relevance labels, one per ranked position
 * \return the accumulated DCG narrowed to float; 0 for an empty list
 */
static inline float CalcDCG(const std::vector< float > &rec) {
    double total = 0.0;
    size_t pos = 0;
    for (std::vector< float >::const_iterator it = rec.begin(); it != rec.end(); ++it, ++pos){
        const unsigned gain_level = static_cast<unsigned>(*it);
        // zero-relevance entries have zero gain, so skip them outright
        if (gain_level == 0) continue;
        total += logf(2.0f) * ((1 << gain_level) - 1) / logf(pos + 2);
    }
    return static_cast<float>(total);
}
/*
* \brief Obtain the delta NDCG if trying to switch the positions of instances in index1 or index2
* in sorted triples. Here DCG is calculated as sigma_i 2^rel_i/log(i + 1)
@ -475,7 +495,7 @@ namespace xgboost{
}
std::sort(labels.begin(), labels.end(), std::greater<float>());
return EvalNDCG::CalcDCG(labels);
return CalcDCG(labels);
}
inline void GetLambda(const std::vector<float> &preds,