From 0bf6261961a68711c66b28aabbee61c686379c1a Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 7 May 2014 11:52:12 -0700
Subject: [PATCH 1/4] fix omp for bug in obj

---
 regrank/xgboost_regrank.h       |  9 +++++++++
 regrank/xgboost_regrank_eval.h  | 25 ++++++++++++++++++++++++-
 regrank/xgboost_regrank_obj.hpp | 12 ++++++------
 regrank/xgboost_regrank_utils.h |  4 +++-
 4 files changed, 42 insertions(+), 8 deletions(-)
diff --git a/regrank/xgboost_regrank.h b/regrank/xgboost_regrank.h
index b06280b2c..7163f2ed9 100644
--- a/regrank/xgboost_regrank.h
+++ b/regrank/xgboost_regrank.h
@@ -200,6 +200,11 @@ namespace xgboost{
                 fprintf(fo, "[%d]", iter);
                 for (size_t i = 0; i < evals.size(); ++i){
                     this->PredictRaw(preds_, *evals[i]);
+                    for( size_t j = 0 ; j < preds_.size(); ++ j){
+                        if( fabsf(preds_[j]- 0.5f)>1e-6f){
+                            printf("p[%lu]=%f\n", j,preds_[j]);
+                        }
+                    }
                     obj_->PredTransform(preds_);
                     evaluator_.Eval(fo, evname[i].c_str(), preds_, evals[i]->info);
                 }
@@ -283,6 +288,10 @@ namespace xgboost{
                     #pragma omp parallel for schedule( static )
                     for (unsigned j = 0; j < ndata; ++j){
                         preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j, data.info.GetRoot(j), bst_group );
+                        if( preds[j] != 0.5f ){
+                            printf("pred[%d:%u]=%f\n", bst_group, j, preds[j]);
+                        }
+                        utils::Assert( preds[j] == 0.5f, "BUG");
                     }
                 }else
                     #pragma omp parallel for schedule( static )
diff --git a/regrank/xgboost_regrank_eval.h b/regrank/xgboost_regrank_eval.h
index 0d67f2a58..41544b55b 100644
--- a/regrank/xgboost_regrank_eval.h
+++ b/regrank/xgboost_regrank_eval.h
@@ -83,7 +83,7 @@ namespace xgboost{
                 float sum = 0.0f, wsum = 0.0f;
                 #pragma omp parallel for reduction(+:sum,wsum) schedule( static )
                 for (unsigned i = 0; i < ndata; ++i){
-                    const float wt = info.GetWeight(i);
+                    const float wt = info.GetWeight(i);                    
                     if (preds[i] > 0.5f){
                         if (info.labels[i] < 0.5f) sum += wt;
                     }
@@ -99,6 +99,28 @@ namespace xgboost{
             }
         };
 
+
+        /*! \brief Error */
+        struct EvalMatchError : public IEvaluator{
+            virtual float Eval(const std::vector<float> &preds,
+                               const DMatrix::Info &info) const {
+                const unsigned ndata = static_cast<unsigned>(preds.size());
+                float sum = 0.0f, wsum = 0.0f;
+                #pragma omp parallel for reduction(+:sum,wsum) schedule( static )
+                for (unsigned i = 0; i < ndata; ++i){
+                    const float wt = info.GetWeight(i);                    
+                    if (static_cast<int>(preds[i]) != static_cast<int>(info.labels[i]) ){
+                        sum += wt;
+                    }
+                    wsum += wt;
+                }
+                return sum / wsum;
+            }
+            virtual const char *Name(void) const{
+                return "merror";
+            }
+        };
+
         /*! \brief Area under curve, for both classification and rank */
         struct EvalAuc : public IEvaluator{
             virtual float Eval(const std::vector<float> &preds,
@@ -270,6 +292,7 @@ namespace xgboost{
                 }
                 if (!strcmp(name, "rmse"))    evals_.push_back(new EvalRMSE());
                 if (!strcmp(name, "error"))   evals_.push_back(new EvalError());
+                if (!strcmp(name, "merror"))   evals_.push_back(new EvalMatchError());
                 if (!strcmp(name, "logloss")) evals_.push_back(new EvalLogLoss());
                 if (!strcmp(name, "auc"))    evals_.push_back(new EvalAuc());
                 if (!strncmp(name, "pre@", 4)) evals_.push_back(new EvalPrecision(name));
diff --git a/regrank/xgboost_regrank_obj.hpp b/regrank/xgboost_regrank_obj.hpp
index 6a1ed7741..f36cee4ad 100644
--- a/regrank/xgboost_regrank_obj.hpp
+++ b/regrank/xgboost_regrank_obj.hpp
@@ -75,7 +75,7 @@ namespace xgboost{
                 #pragma omp parallel
                 {
                     std::vector< float > rec;                    
-                    #pragma for schedule(static)
+                    #pragma omp for schedule(static)
                     for (unsigned k = 0; k < ngroup; ++k){
                         rec.clear();
                         int nhit = 0;
@@ -125,7 +125,7 @@ namespace xgboost{
                 #pragma omp parallel
                 {
                     std::vector<float> rec(nclass);
-                    #pragma for schedule(static)
+                    #pragma omp for schedule(static)
                     for (unsigned j = 0; j < ndata; ++j){
                         for( int k = 0; k < nclass; ++ k ){
                             rec[k] = preds[j + k * ndata];
@@ -149,22 +149,22 @@ namespace xgboost{
                 utils::Assert( nclass != 0, "must set num_class to use softmax" );
                 utils::Assert( preds.size() % nclass == 0, "SoftmaxMultiClassObj: label size and pred size does not match" );                
                 const unsigned ndata = static_cast<unsigned>(preds.size()/nclass);
+                
                 #pragma omp parallel
                 {
                     std::vector<float> rec(nclass);
-                    #pragma for schedule(static)
+                    #pragma omp for schedule(static)
                     for (unsigned j = 0; j < ndata; ++j){
                         for( int k = 0; k < nclass; ++ k ){
                             rec[k] = preds[j + k * ndata];
                         }
-                        Softmax( rec );
                         preds[j] = FindMaxIndex( rec );
                     }
                 }
                 preds.resize( ndata );
             }
             virtual const char* DefaultEvalMetric(void) {
-                return "error";
+                return "merror";
             }
         private:
             int nclass;
@@ -201,7 +201,7 @@ namespace xgboost{
                     // thread use its own random number generator, seed by thread id and current iteration
                     random::Random rnd; rnd.Seed( iter * 1111 + omp_get_thread_num() );
                     std::vector< std::pair<float,unsigned> > rec;
-                    #pragma for schedule(static)
+                    #pragma omp for schedule(static)
                     for (unsigned k = 0; k < ngroup; ++k){
                         rec.clear();
                         for(unsigned j = gptr[k]; j < gptr[k+1]; ++j ){
diff --git a/regrank/xgboost_regrank_utils.h b/regrank/xgboost_regrank_utils.h
index 580cedb81..c040b40d8 100644
--- a/regrank/xgboost_regrank_utils.h
+++ b/regrank/xgboost_regrank_utils.h
@@ -26,7 +26,9 @@ namespace xgboost{
         inline static int FindMaxIndex( std::vector<float>& rec ){
             size_t mxid = 0;
             for( size_t i = 1; i < rec.size(); ++ i ){
-                if( rec[i] > rec[mxid] ) mxid = i;
+                if( rec[i] > rec[mxid]+1e-6f ){
+                    mxid = i;
+                }
             }
             return (int)mxid;
         }        

From a0c0fbbb6193b2f755f84935e62a28340d3c501f Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 8 May 2014 19:31:32 -0700
Subject: [PATCH 2/4] commit the fix

---
 regrank/xgboost_regrank.h       | 10 +---------
 regrank/xgboost_regrank_eval.h  | 20 ++++++++++++++++----
 regrank/xgboost_regrank_obj.hpp |  3 +++
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/regrank/xgboost_regrank.h b/regrank/xgboost_regrank.h
index 7163f2ed9..d0148ab1a 100644
--- a/regrank/xgboost_regrank.h
+++ b/regrank/xgboost_regrank.h
@@ -200,11 +200,6 @@ namespace xgboost{
                 fprintf(fo, "[%d]", iter);
                 for (size_t i = 0; i < evals.size(); ++i){
                     this->PredictRaw(preds_, *evals[i]);
-                    for( size_t j = 0 ; j < preds_.size(); ++ j){
-                        if( fabsf(preds_[j]- 0.5f)>1e-6f){
-                            printf("p[%lu]=%f\n", j,preds_[j]);
-                        }
-                    }
                     obj_->PredTransform(preds_);
                     evaluator_.Eval(fo, evname[i].c_str(), preds_, evals[i]->info);
                 }
@@ -288,10 +283,7 @@ namespace xgboost{
                     #pragma omp parallel for schedule( static )
                     for (unsigned j = 0; j < ndata; ++j){
                         preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j, data.info.GetRoot(j), bst_group );
-                        if( preds[j] != 0.5f ){
-                            printf("pred[%d:%u]=%f\n", bst_group, j, preds[j]);
-                        }
-                        utils::Assert( preds[j] == 0.5f, "BUG");
+
                     }
                 }else
                     #pragma omp parallel for schedule( static )
diff --git a/regrank/xgboost_regrank_eval.h b/regrank/xgboost_regrank_eval.h
index 497d32921..24e514933 100644
--- a/regrank/xgboost_regrank_eval.h
+++ b/regrank/xgboost_regrank_eval.h
@@ -102,14 +102,22 @@ namespace xgboost{
 
         /*! \brief Error */
         struct EvalMatchError : public IEvaluator{
+        public:
+            /*! \brief name selects the mode: "mabserror" maps a negative label to -label-1 before matching */
+            explicit EvalMatchError(const char *name)
+                : abs_(strcmp("mabserror", name) == 0 ? 1 : 0),
+                  name_(name){
+            }
             virtual float Eval(const std::vector<float> &preds,
                                const DMatrix::Info &info) const {
                 const unsigned ndata = static_cast<unsigned>(preds.size());
                 float sum = 0.0f, wsum = 0.0f;
                 #pragma omp parallel for reduction(+:sum,wsum) schedule( static )
                 for (unsigned i = 0; i < ndata; ++i){
-                    const float wt = info.GetWeight(i);                    
-                    if (static_cast<int>(preds[i]) != static_cast<int>(info.labels[i]) ){
+                    const float wt = info.GetWeight(i);
+                    int label = static_cast<int>(info.labels[i]);
+                    if( label < 0 && abs_ != 0 ) label = -label-1;
+                    if (static_cast<int>(preds[i]) != label ){
                         sum += wt;
                     }
                     wsum += wt;
@@ -117,10 +125,13 @@ namespace xgboost{
                 return sum / wsum;
             }
             virtual const char *Name(void) const{
-                return "merror";
+                return name_.c_str();
             }
+            int abs_;
+            std::string name_;
         };
 
+
         /*! \brief Area under curve, for both classification and rank */
         struct EvalAuc : public IEvaluator{
             virtual float Eval(const std::vector<float> &preds,
@@ -303,7 +314,8 @@ namespace xgboost{
                 }
                 if (!strcmp(name, "rmse"))    evals_.push_back(new EvalRMSE());
                 if (!strcmp(name, "error"))   evals_.push_back(new EvalError());
-                if (!strcmp(name, "merror"))   evals_.push_back(new EvalMatchError());
+                if (!strcmp(name, "merror"))   evals_.push_back(new EvalMatchError("merror"));
+                if (!strcmp(name, "mabserror"))   evals_.push_back(new EvalMatchError("mabserror"));
                 if (!strcmp(name, "logloss")) evals_.push_back(new EvalLogLoss());
                 if (!strcmp(name, "auc"))    evals_.push_back(new EvalAuc());
                 if (!strncmp(name, "pre@", 4)) evals_.push_back(new EvalPrecision(name));
diff --git a/regrank/xgboost_regrank_obj.hpp b/regrank/xgboost_regrank_obj.hpp
index 6f367744c..f3fd4010b 100644
--- a/regrank/xgboost_regrank_obj.hpp
+++ b/regrank/xgboost_regrank_obj.hpp
@@ -134,6 +134,9 @@ namespace xgboost{
                         }
                         Softmax( rec );
                         int label = static_cast<int>(info.labels[j]);
+                        if( label < 0 ){
+                            label = -label - 1;
+                        }
                         utils::Assert( label < nclass, "SoftmaxMultiClassObj: label exceed num_class" );
                         for( int k = 0; k < nclass; ++ k ){
                             float p = rec[ k ];

From 2ccd28339edc87447d0b9ea23fe3c0aad8dd4d05 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 8 May 2014 19:35:06 -0700
Subject: [PATCH 3/4] faster convert to numpy array

---
 python/xgboost.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/python/xgboost.py b/python/xgboost.py
index 922ca085d..d37566065 100644
--- a/python/xgboost.py
+++ b/python/xgboost.py
@@ -22,6 +22,13 @@ xglib.XGDMatrixGetLabel.restype =  ctypes.POINTER( ctypes.c_float )
 xglib.XGDMatrixGetRow.restype = ctypes.POINTER( REntry )
 xglib.XGBoosterPredict.restype = ctypes.POINTER( ctypes.c_float ) 
 
+def ctypes2numpy( cptr, length ):
+    """ copy `length` floats from a ctypes float pointer into a new float32 numpy array """
+    assert isinstance( cptr, ctypes.POINTER( ctypes.c_float ) )
+    res = numpy.zeros( length, dtype='float32' )
+    ctypes.memmove( res.ctypes.data, cptr, res.nbytes )  # must not sit inside assert: python -O strips asserts
+    return res
+
 # data matrix used in xgboost
 class DMatrix:
     # constructor
@@ -73,7 +80,7 @@ class DMatrix:
     def get_label(self):
         length = ctypes.c_ulong()
         labels = xglib.XGDMatrixGetLabel(self.handle, ctypes.byref(length))
-        return numpy.array( [labels[i] for i in xrange(length.value)] )
+        return ctypes2numpy( labels, length.value )  # bulk copy instead of per-element indexing
     # clear everything
     def clear(self):
         xglib.XGDMatrixClear(self.handle)
@@ -138,7 +145,7 @@ class Booster:
     def predict(self, data, bst_group = -1):
         length = ctypes.c_ulong()
         preds = xglib.XGBoosterPredict( self.handle, data.handle, ctypes.byref(length), bst_group)
-        return numpy.array( [ preds[i] for i in xrange(length.value)])
+        return ctypes2numpy( preds, length.value )
     def save_model(self, fname):
         """ save model to file """
         xglib.XGBoosterSaveModel( self.handle, ctypes.c_char_p(fname) )

From 41edad7b3d39d48ed7dfdca04e8516f35795f727 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 8 May 2014 20:15:23 -0700
Subject: [PATCH 4/4] add python o3

---
 python/Makefile           | 2 +-
 python/xgboost_python.cpp | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/Makefile b/python/Makefile
index 0db0a1ed0..f21957a2e 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -1,6 +1,6 @@
 export CC  = gcc
 export CXX = g++
-export CFLAGS = -Wall -msse2  -Wno-unknown-pragmas -fopenmp
+export CFLAGS = -Wall -O3 -msse2  -Wno-unknown-pragmas -fopenmp
 
 # specify tensor path
 SLIB = libxgboostpy.so
diff --git a/python/xgboost_python.cpp b/python/xgboost_python.cpp
index ee80429f8..d5442c8c3 100644
--- a/python/xgboost_python.cpp
+++ b/python/xgboost_python.cpp
@@ -75,6 +75,7 @@ namespace xgboost{
             inline void CheckInit(void){
                 if(!init_col_){
                     this->data.InitData();
+                    init_col_ = true;
                 }
                 utils::Assert( this->data.NumRow() == this->info.labels.size(), "DMatrix: number of labels must match number of rows in matrix");
             }