From 0bf6261961a68711c66b28aabbee61c686379c1a Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 7 May 2014 11:52:12 -0700 Subject: [PATCH 1/4] fix omp for bug in obj --- regrank/xgboost_regrank.h | 9 +++++++++ regrank/xgboost_regrank_eval.h | 25 ++++++++++++++++++++++++- regrank/xgboost_regrank_obj.hpp | 12 ++++++------ regrank/xgboost_regrank_utils.h | 4 +++- 4 files changed, 42 insertions(+), 8 deletions(-) diff --git a/regrank/xgboost_regrank.h b/regrank/xgboost_regrank.h index b06280b2c..7163f2ed9 100644 --- a/regrank/xgboost_regrank.h +++ b/regrank/xgboost_regrank.h @@ -200,6 +200,11 @@ namespace xgboost{ fprintf(fo, "[%d]", iter); for (size_t i = 0; i < evals.size(); ++i){ this->PredictRaw(preds_, *evals[i]); + for( size_t j = 0 ; j < preds_.size(); ++ j){ + if( fabsf(preds_[j]- 0.5f)>1e-6f){ + printf("p[%lu]=%f\n", j,preds_[j]); + } + } obj_->PredTransform(preds_); evaluator_.Eval(fo, evname[i].c_str(), preds_, evals[i]->info); } @@ -283,6 +288,10 @@ namespace xgboost{ #pragma omp parallel for schedule( static ) for (unsigned j = 0; j < ndata; ++j){ preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j, data.info.GetRoot(j), bst_group ); + if( preds[j] != 0.5f ){ + printf("pred[%d:%u]=%f\n", bst_group, j, preds[j]); + } + utils::Assert( preds[j] == 0.5f, "BUG"); } }else #pragma omp parallel for schedule( static ) diff --git a/regrank/xgboost_regrank_eval.h b/regrank/xgboost_regrank_eval.h index 0d67f2a58..41544b55b 100644 --- a/regrank/xgboost_regrank_eval.h +++ b/regrank/xgboost_regrank_eval.h @@ -83,7 +83,7 @@ namespace xgboost{ float sum = 0.0f, wsum = 0.0f; #pragma omp parallel for reduction(+:sum,wsum) schedule( static ) for (unsigned i = 0; i < ndata; ++i){ - const float wt = info.GetWeight(i); + const float wt = info.GetWeight(i); if (preds[i] > 0.5f){ if (info.labels[i] < 0.5f) sum += wt; } @@ -99,6 +99,28 @@ namespace xgboost{ } }; + + /*! \brief Error */ + struct EvalMatchError : public IEvaluator{ + virtual float Eval(const std::vector &preds, + const DMatrix::Info &info) const { + const unsigned ndata = static_cast(preds.size()); + float sum = 0.0f, wsum = 0.0f; + #pragma omp parallel for reduction(+:sum,wsum) schedule( static ) + for (unsigned i = 0; i < ndata; ++i){ + const float wt = info.GetWeight(i); + if (static_cast(preds[i]) != static_cast(info.labels[i]) ){ + sum += wt; + } + wsum += wt; + } + return sum / wsum; + } + virtual const char *Name(void) const{ + return "merror"; + } + }; + /*! \brief Area under curve, for both classification and rank */ struct EvalAuc : public IEvaluator{ virtual float Eval(const std::vector &preds, @@ -270,6 +292,7 @@ namespace xgboost{ } if (!strcmp(name, "rmse")) evals_.push_back(new EvalRMSE()); if (!strcmp(name, "error")) evals_.push_back(new EvalError()); + if (!strcmp(name, "merror")) evals_.push_back(new EvalMatchError()); if (!strcmp(name, "logloss")) evals_.push_back(new EvalLogLoss()); if (!strcmp(name, "auc")) evals_.push_back(new EvalAuc()); if (!strncmp(name, "pre@", 4)) evals_.push_back(new EvalPrecision(name)); diff --git a/regrank/xgboost_regrank_obj.hpp b/regrank/xgboost_regrank_obj.hpp index 6a1ed7741..f36cee4ad 100644 --- a/regrank/xgboost_regrank_obj.hpp +++ b/regrank/xgboost_regrank_obj.hpp @@ -75,7 +75,7 @@ namespace xgboost{ #pragma omp parallel { std::vector< float > rec; - #pragma for schedule(static) + #pragma omp for schedule(static) for (unsigned k = 0; k < ngroup; ++k){ rec.clear(); int nhit = 0; @@ -125,7 +125,7 @@ namespace xgboost{ #pragma omp parallel { std::vector rec(nclass); - #pragma for schedule(static) + #pragma omp for schedule(static) for (unsigned j = 0; j < ndata; ++j){ for( int k = 0; k < nclass; ++ k ){ rec[k] = preds[j + k * ndata]; @@ -149,22 +149,22 @@ namespace xgboost{ utils::Assert( nclass != 0, "must set num_class to use softmax" ); utils::Assert( preds.size() % nclass == 0, "SoftmaxMultiClassObj: label size and pred size does not match" ); const unsigned ndata = static_cast(preds.size()/nclass); + #pragma omp parallel { std::vector rec(nclass); - #pragma for schedule(static) + #pragma omp for schedule(static) for (unsigned j = 0; j < ndata; ++j){ for( int k = 0; k < nclass; ++ k ){ rec[k] = preds[j + k * ndata]; } - Softmax( rec ); preds[j] = FindMaxIndex( rec ); } } preds.resize( ndata ); } virtual const char* DefaultEvalMetric(void) { - return "error"; + return "merror"; } private: int nclass; @@ -201,7 +201,7 @@ namespace xgboost{ // thread use its own random number generator, seed by thread id and current iteration random::Random rnd; rnd.Seed( iter * 1111 + omp_get_thread_num() ); std::vector< std::pair > rec; - #pragma for schedule(static) + #pragma omp for schedule(static) for (unsigned k = 0; k < ngroup; ++k){ rec.clear(); for(unsigned j = gptr[k]; j < gptr[k+1]; ++j ){ diff --git a/regrank/xgboost_regrank_utils.h b/regrank/xgboost_regrank_utils.h index 580cedb81..c040b40d8 100644 --- a/regrank/xgboost_regrank_utils.h +++ b/regrank/xgboost_regrank_utils.h @@ -26,7 +26,9 @@ namespace xgboost{ inline static int FindMaxIndex( std::vector& rec ){ size_t mxid = 0; for( size_t i = 1; i < rec.size(); ++ i ){ - if( rec[i] > rec[mxid] ) mxid = i; + if( rec[i] > rec[mxid]+1e-6f ){ + mxid = i; + } } return (int)mxid; } From a0c0fbbb6193b2f755f84935e62a28340d3c501f Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 8 May 2014 19:31:32 -0700 Subject: [PATCH 2/4] commit the fix --- regrank/xgboost_regrank.h | 10 +--------- regrank/xgboost_regrank_eval.h | 20 ++++++++++++++++---- regrank/xgboost_regrank_obj.hpp | 3 +++ 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/regrank/xgboost_regrank.h b/regrank/xgboost_regrank.h index 7163f2ed9..d0148ab1a 100644 --- a/regrank/xgboost_regrank.h +++ b/regrank/xgboost_regrank.h @@ -200,11 +200,6 @@ namespace xgboost{ fprintf(fo, "[%d]", iter); for (size_t i = 0; i < evals.size(); ++i){ this->PredictRaw(preds_, *evals[i]); - for( size_t j = 0 ; j < preds_.size(); ++ j){ - if( fabsf(preds_[j]- 0.5f)>1e-6f){ - printf("p[%lu]=%f\n", j,preds_[j]); - } - } obj_->PredTransform(preds_); evaluator_.Eval(fo, evname[i].c_str(), preds_, evals[i]->info); } @@ -288,10 +283,7 @@ namespace xgboost{ #pragma omp parallel for schedule( static ) for (unsigned j = 0; j < ndata; ++j){ preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j, data.info.GetRoot(j), bst_group ); - if( preds[j] != 0.5f ){ - printf("pred[%d:%u]=%f\n", bst_group, j, preds[j]); - } - utils::Assert( preds[j] == 0.5f, "BUG"); + } }else #pragma omp parallel for schedule( static ) diff --git a/regrank/xgboost_regrank_eval.h b/regrank/xgboost_regrank_eval.h index 497d32921..24e514933 100644 --- a/regrank/xgboost_regrank_eval.h +++ b/regrank/xgboost_regrank_eval.h @@ -102,14 +102,22 @@ namespace xgboost{ /*! \brief Error */ struct EvalMatchError : public IEvaluator{ + public: + EvalMatchError(const char *name){ + name_ = name; + abs_ = 0; + if(!strcmp("mabserror", name)) abs_ =1; + } virtual float Eval(const std::vector &preds, const DMatrix::Info &info) const { const unsigned ndata = static_cast(preds.size()); float sum = 0.0f, wsum = 0.0f; #pragma omp parallel for reduction(+:sum,wsum) schedule( static ) for (unsigned i = 0; i < ndata; ++i){ - const float wt = info.GetWeight(i); - if (static_cast(preds[i]) != static_cast(info.labels[i]) ){ + const float wt = info.GetWeight(i); + int label = static_cast(info.labels[i]); + if( label < 0 && abs_ != 0 ) label = -label-1; + if (static_cast(preds[i]) != label ){ sum += wt; } wsum += wt; @@ -117,10 +125,13 @@ namespace xgboost{ return sum / wsum; } virtual const char *Name(void) const{ - return "merror"; + return name_.c_str(); } + int abs_; + std::string name_; }; + /*! \brief Area under curve, for both classification and rank */ struct EvalAuc : public IEvaluator{ virtual float Eval(const std::vector &preds, @@ -303,7 +314,8 @@ namespace xgboost{ } if (!strcmp(name, "rmse")) evals_.push_back(new EvalRMSE()); if (!strcmp(name, "error")) evals_.push_back(new EvalError()); - if (!strcmp(name, "merror")) evals_.push_back(new EvalMatchError()); + if (!strcmp(name, "merror")) evals_.push_back(new EvalMatchError("merror")); + if (!strcmp(name, "mabserror")) evals_.push_back(new EvalMatchError("mabserror")); if (!strcmp(name, "logloss")) evals_.push_back(new EvalLogLoss()); if (!strcmp(name, "auc")) evals_.push_back(new EvalAuc()); if (!strncmp(name, "pre@", 4)) evals_.push_back(new EvalPrecision(name)); diff --git a/regrank/xgboost_regrank_obj.hpp b/regrank/xgboost_regrank_obj.hpp index 6f367744c..f3fd4010b 100644 --- a/regrank/xgboost_regrank_obj.hpp +++ b/regrank/xgboost_regrank_obj.hpp @@ -134,6 +134,9 @@ namespace xgboost{ } Softmax( rec ); int label = static_cast(info.labels[j]); + if( label < 0 ){ + label = -label - 1; + } utils::Assert( label < nclass, "SoftmaxMultiClassObj: label exceed num_class" ); for( int k = 0; k < nclass; ++ k ){ float p = rec[ k ]; From 2ccd28339edc87447d0b9ea23fe3c0aad8dd4d05 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 8 May 2014 19:35:06 -0700 Subject: [PATCH 3/4] faster convert to numpy array --- python/xgboost.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/python/xgboost.py b/python/xgboost.py index 922ca085d..d37566065 100644 --- a/python/xgboost.py +++ b/python/xgboost.py @@ -22,6 +22,13 @@ xglib.XGDMatrixGetLabel.restype = ctypes.POINTER( ctypes.c_float ) xglib.XGDMatrixGetRow.restype = ctypes.POINTER( REntry ) xglib.XGBoosterPredict.restype = ctypes.POINTER( ctypes.c_float ) +def ctypes2numpy( cptr, length ): + # convert a ctypes pointer array to numpy + assert isinstance( cptr, ctypes.POINTER( ctypes.c_float ) ) + res = numpy.zeros( length, dtype='float32' ) + assert ctypes.memmove( res.ctypes.data, cptr, length * res.strides[0] ) + return res + # data matrix used in xgboost class DMatrix: # constructor @@ -73,7 +80,7 @@ class DMatrix: def get_label(self): length = ctypes.c_ulong() labels = xglib.XGDMatrixGetLabel(self.handle, ctypes.byref(length)) - return numpy.array( [labels[i] for i in xrange(length.value)] ) + return ctypes2numpy( labels, length.value ); # clear everything def clear(self): xglib.XGDMatrixClear(self.handle) @@ -138,7 +145,7 @@ class Booster: def predict(self, data, bst_group = -1): length = ctypes.c_ulong() preds = xglib.XGBoosterPredict( self.handle, data.handle, ctypes.byref(length), bst_group) - return numpy.array( [ preds[i] for i in xrange(length.value)]) + return ctypes2numpy( preds, length.value ) def save_model(self, fname): """ save model to file """ xglib.XGBoosterSaveModel( self.handle, ctypes.c_char_p(fname) ) From 41edad7b3d39d48ed7dfdca04e8516f35795f727 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 8 May 2014 20:15:23 -0700 Subject: [PATCH 4/4] add python o3 --- python/Makefile | 2 +- python/xgboost_python.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/Makefile b/python/Makefile index 0db0a1ed0..f21957a2e 100644 --- a/python/Makefile +++ b/python/Makefile @@ -1,6 +1,6 @@ export CC = gcc export CXX = g++ -export CFLAGS = -Wall -msse2 -Wno-unknown-pragmas -fopenmp +export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp # specify tensor path SLIB = libxgboostpy.so diff --git a/python/xgboost_python.cpp b/python/xgboost_python.cpp index ee80429f8..d5442c8c3 100644 --- a/python/xgboost_python.cpp +++ b/python/xgboost_python.cpp @@ -75,6 +75,7 @@ namespace xgboost{ inline void CheckInit(void){ if(!init_col_){ this->data.InitData(); + init_col_ = true; } utils::Assert( this->data.NumRow() == this->info.labels.size(), "DMatrix: number of labels must match number of rows in matrix"); }