From ea354683b4cbb047317e3f8d1a1595fb746b5642 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 24 Apr 2014 22:20:40 -0700 Subject: [PATCH] add auc evaluation metric --- regression/xgboost_reg.h | 16 +++++------ regression/xgboost_reg_eval.h | 51 +++++++++++++++++++++++++++++------ 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/regression/xgboost_reg.h b/regression/xgboost_reg.h index 01cf0d2f3..cf0967076 100644 --- a/regression/xgboost_reg.h +++ b/regression/xgboost_reg.h @@ -172,7 +172,7 @@ namespace xgboost{ preds.resize(data.Size()); const unsigned ndata = static_cast(data.Size()); -#pragma omp parallel for schedule( static ) + #pragma omp parallel for schedule( static ) for (unsigned j = 0; j < ndata; ++j){ preds[j] = mparam.PredTransform (mparam.base_score + base_gbm.Predict(data.data, j, -1)); @@ -213,7 +213,7 @@ namespace xgboost{ inline void InteractPredict(std::vector &preds, const DMatrix &data, unsigned buffer_offset){ preds.resize(data.Size()); const unsigned ndata = static_cast(data.Size()); - #pragma omp parallel for schedule( static ) + #pragma omp parallel for schedule( static ) for (unsigned j = 0; j < ndata; ++j){ preds[j] = mparam.PredTransform (mparam.base_score + base_gbm.InteractPredict(data.data, j, buffer_offset + j)); @@ -222,7 +222,7 @@ namespace xgboost{ /*! 
\brief repredict trial */ inline void InteractRePredict(const DMatrix &data, unsigned buffer_offset){ const unsigned ndata = static_cast(data.Size()); - #pragma omp parallel for schedule( static ) + #pragma omp parallel for schedule( static ) for (unsigned j = 0; j < ndata; ++j){ base_gbm.InteractRePredict(data.data, j, buffer_offset + j); } @@ -233,7 +233,7 @@ namespace xgboost{ preds.resize(data.Size()); const unsigned ndata = static_cast(data.Size()); -#pragma omp parallel for schedule( static ) + #pragma omp parallel for schedule( static ) for (unsigned j = 0; j < ndata; ++j){ preds[j] = mparam.PredTransform (mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j)); @@ -242,13 +242,13 @@ namespace xgboost{ /*! \brief get the first order and second order gradient, given the transformed predictions and labels */ inline void GetGradient(const std::vector &preds, - const std::vector &labels, - std::vector &grad, - std::vector &hess){ + const std::vector &labels, + std::vector &grad, + std::vector &hess){ grad.resize(preds.size()); hess.resize(preds.size()); const unsigned ndata = static_cast(preds.size()); -#pragma omp parallel for schedule( static ) + #pragma omp parallel for schedule( static ) for (unsigned j = 0; j < ndata; ++j){ grad[j] = mparam.FirstOrderGradient(preds[j], labels[j]); hess[j] = mparam.SecondOrderGradient(preds[j], labels[j]); diff --git a/regression/xgboost_reg_eval.h b/regression/xgboost_reg_eval.h index ff24ca69b..137f44192 100644 --- a/regression/xgboost_reg_eval.h +++ b/regression/xgboost_reg_eval.h @@ -11,6 +11,7 @@ #include #include "../utils/xgboost_utils.h" #include "../utils/xgboost_omp.h" +#include "../utils/xgboost_random.h" namespace xgboost{ namespace regression{ @@ -30,10 +31,10 @@ namespace xgboost{ /*! 
\brief RMSE */ struct EvalRMSE : public IEvaluator{ virtual float Eval(const std::vector &preds, - const std::vector &labels) const{ + const std::vector &labels) const{ const unsigned ndata = static_cast(preds.size()); float sum = 0.0; -#pragma omp parallel for reduction(+:sum) schedule( static ) + #pragma omp parallel for reduction(+:sum) schedule( static ) for (unsigned i = 0; i < ndata; ++i){ float diff = preds[i] - labels[i]; sum += diff * diff; @@ -48,10 +49,10 @@ namespace xgboost{ /*! \brief Error */ struct EvalError : public IEvaluator{ virtual float Eval(const std::vector &preds, - const std::vector &labels) const{ + const std::vector &labels) const{ const unsigned ndata = static_cast(preds.size()); unsigned nerr = 0; -#pragma omp parallel for reduction(+:nerr) schedule( static ) + #pragma omp parallel for reduction(+:nerr) schedule( static ) for (unsigned i = 0; i < ndata; ++i){ if (preds[i] > 0.5f){ if (labels[i] < 0.5f) nerr += 1; @@ -67,14 +68,46 @@ namespace xgboost{ } }; + /*! \brief Area under the ROC curve (AUC). Eval pairs each prediction with its label, shuffles the pairs with random::Shuffle, sorts them by descending prediction (CmpFirst), and treats labels above 0.5 as positive. For every non-positive record it adds the number of positives ranked ahead of it to nhit, then returns nhit / nneg / npos, i.e. the fraction of (positive, negative) pairs in which the positive is ranked first. The shuffle presumably randomizes the order of tied predictions - TODO confirm. NOTE(review): only nneg > 0 is asserted; npos == 0 (no positive labels) is not checked before the division by npos. */ + struct EvalAuc : public IEvaluator{ + inline static bool CmpFirst( const std::pair &a, const std::pair &b ){ + return a.first > b.first; + } + virtual float Eval( const std::vector &preds, + const std::vector &labels ) const{ + const unsigned ndata = static_cast( preds.size() ); + std::vector< std::pair > rec; + for( unsigned i = 0; i < ndata; ++ i ){ + rec.push_back( std::make_pair( preds[i], labels[i]) ); + } + random::Shuffle( rec ); + std::sort( rec.begin(), rec.end(), CmpFirst ); + + long npos = 0, nhit = 0; + for( unsigned i = 0; i < ndata; ++ i ){ + if( rec[i].second > 0.5f ) { + ++ npos; + }else{ + // this is the number of correct pairs + nhit += npos; + } + } + long nneg = ndata - npos; + utils::Assert( nneg > 0, "the dataset only contains pos samples" ); + return static_cast(nhit) / nneg / npos; + } + virtual const char *Name( void ) const{ + return "auc"; + } + }; /*! 
\brief Error */ struct EvalLogLoss : public IEvaluator{ virtual float Eval(const std::vector &preds, - const std::vector &labels) const{ + const std::vector &labels) const{ const unsigned ndata = static_cast(preds.size()); unsigned nerr = 0; -#pragma omp parallel for reduction(+:nerr) schedule( static ) + #pragma omp parallel for reduction(+:nerr) schedule( static ) for (unsigned i = 0; i < ndata; ++i){ const float y = labels[i]; const float py = preds[i]; @@ -96,14 +129,15 @@ namespace xgboost{ if (!strcmp(name, "rmse")) evals_.push_back(&rmse_); if (!strcmp(name, "error")) evals_.push_back(&error_); if (!strcmp(name, "logloss")) evals_.push_back(&logloss_); + if (!strcmp( name, "auc")) evals_.push_back( &auc_ ); } inline void Init(void){ std::sort(evals_.begin(), evals_.end()); evals_.resize(std::unique(evals_.begin(), evals_.end()) - evals_.begin()); } inline void Eval(FILE *fo, const char *evname, - const std::vector &preds, - const std::vector &labels) const{ + const std::vector &preds, + const std::vector &labels) const{ for (size_t i = 0; i < evals_.size(); ++i){ float res = evals_[i]->Eval(preds, labels); fprintf(fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res); @@ -112,6 +146,7 @@ namespace xgboost{ private: EvalRMSE rmse_; EvalError error_; + EvalAuc auc_; EvalLogLoss logloss_; std::vector evals_; };