new AUC evaluator, now compatible with weighted loss

This commit is contained in:
tqchen 2014-04-29 17:03:34 -07:00
parent 31edfda03c
commit 87a9c22795

View File

@ -94,34 +94,54 @@ namespace xgboost{
}
};
/*! \brief Area under curve */
/*! \brief Area under curve, for both classification and rank */
struct EvalAuc : public IEvaluator{
inline static bool CmpFirst( const std::pair<float,float> &a, const std::pair<float,float> &b ){
inline static bool CmpFirst( const std::pair<float,unsigned> &a, const std::pair<float,unsigned> &b ){
return a.first > b.first;
}
virtual float Eval( const std::vector<float> &preds,
const DMatrix::Info &info ) const {
const std::vector<float> &labels = info.labels;
const unsigned ndata = static_cast<unsigned>( preds.size() );
std::vector< std::pair<float, float> > rec;
for( unsigned i = 0; i < ndata; ++ i ){
rec.push_back( std::make_pair( preds[i], labels[i]) );
}
random::Shuffle( rec );
std::sort( rec.begin(), rec.end(), CmpFirst );
long npos = 0, nhit = 0;
for( unsigned i = 0; i < ndata; ++ i ){
if( rec[i].second > 0.5f ) {
++ npos;
}else{
// this is the number of correct pairs
nhit += npos;
std::vector<unsigned> tgptr(2,0); tgptr[1] = preds.size();
const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
utils::Assert( gptr.back() == preds.size(), "EvalAuc: group structure must match number of prediction" );
const unsigned ngroup = static_cast<unsigned>( gptr.size() );
double sum_auc = 0.0f;
#pragma omp parallel reduction(+:sum_auc)
{
// each thread takes a local rec
std::vector< std::pair<float,unsigned> > rec;
#pragma omp for schedule(static)
for( unsigned k = 0; k < ngroup; ++ k ){
rec.clear();
for( unsigned j = gptr[k]; j < gptr[k+1]; ++ j ){
rec.push_back( std::make_pair( preds[j], j ) );
}
std::sort( rec.begin(), rec.end(), CmpFirst );
// calculate AUC
double sum_pospair = 0.0;
double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
for( size_t j = 0; j < rec.size(); ++ j ){
const float wt = info.GetWeight( rec[j].second );
const float ctr = info.labels[ rec[j].second ];
// keep bucketing predictions in same bucket
if( j != 0 && rec[j].first != rec[j-1].first ){
sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
sum_npos += buf_pos; sum_nneg += buf_neg;
buf_neg = buf_pos = 0.0f;
}
buf_pos += ctr * wt; buf_neg += (1.0f-ctr) * wt;
}
sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
sum_npos += buf_pos; sum_nneg += buf_neg;
//
utils::Assert( sum_npos > 0.0 && sum_nneg > 0.0, "the dataset only contains pos or neg samples" );
// this is the AUC
sum_auc += sum_pospair / (sum_npos*sum_nneg);
}
}
long nneg = ndata - npos;
utils::Assert( nneg > 0, "the dataset only contains pos samples" );
return static_cast<float>(nhit) / nneg / npos;
}
// return average AUC over list
return static_cast<float>(sum_auc) / ngroup;
}
virtual const char *Name( void ) const{
return "auc";