make regression module compatible with rank loss; now supports weighted loss

This commit is contained in:
tqchen 2014-04-29 16:16:02 -07:00
parent 7a79c009ce
commit 31edfda03c
8 changed files with 478 additions and 346 deletions

View File: Makefile

@ -10,9 +10,8 @@ OBJ =
all: $(BIN) $(OBJ)
export LDFLAGS= -pthread -lm
xgboost: regression/xgboost_reg_main.cpp regression/*.h booster/*.h booster/*/*.hpp booster/*.hpp
xgboost: regrank/xgboost_regrank_main.cpp regrank/*.h booster/*.h booster/*/*.hpp booster/*.hpp
#xgboost: rank/xgboost_rank_main.cpp base/*.h rank/*.h booster/*.h booster/*/*.hpp booster/*.hpp
$(BIN) :
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)

View File: regrank/xgboost_regrank.h

@ -1,28 +1,31 @@
#ifndef XGBOOST_REG_H
#define XGBOOST_REG_H
#ifndef XGBOOST_REGRANK_H
#define XGBOOST_REGRANK_H
/*!
* \file xgboost_reg.h
* \brief class for gradient boosted regression
* \file xgboost_regrank.h
* \brief class for gradient boosted regression and ranking
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cmath>
#include <cstdlib>
#include <cstring>
#include "xgboost_reg_data.h"
#include "xgboost_reg_eval.h"
#include "xgboost_regrank_data.h"
#include "xgboost_regrank_eval.h"
#include "xgboost_regrank_obj.h"
#include "../utils/xgboost_omp.h"
#include "../booster/xgboost_gbmbase.h"
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_stream.h"
namespace xgboost{
namespace regression{
/*! \brief class for gradient boosted regression */
class RegBoostLearner{
namespace regrank{
/*! \brief class for gradient boosted regression and ranking */
class RegRankBoostLearner{
public:
/*! \brief constructor */
RegBoostLearner(void){
RegRankBoostLearner(void){
silent = 0;
obj_ = NULL;
name_obj_ = "reg";
}
/*!
* \brief a regression booster associated with training and evaluating data
@ -30,7 +33,7 @@ namespace xgboost{
* \param evals array of evaluating data
* \param evname name of evaluation data, used to print statistics
*/
RegBoostLearner(const DMatrix *train,
RegRankBoostLearner(const DMatrix *train,
const std::vector<DMatrix *> &evals,
const std::vector<std::string> &evname){
silent = 0;
@ -83,8 +86,10 @@ namespace xgboost{
inline void SetParam(const char *name, const char *val){
if (!strcmp(name, "silent")) silent = atoi(val);
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
if (!strcmp(name, "objective") ) name_obj_ = val;
mparam.SetParam(name, val);
base_gbm.SetParam(name, val);
cfg_.push_back( std::make_pair( std::string(name), std::string(val) ) );
}
/*!
* \brief initialize solver before training, called before training
@ -92,13 +97,11 @@ namespace xgboost{
*/
inline void InitTrainer(void){
base_gbm.InitTrainer();
if (mparam.loss_type == kLogisticClassify){
evaluator_.AddEval("error");
obj_ = CreateObjFunction( name_obj_.c_str() );
for( size_t i = 0; i < cfg_.size(); ++ i ){
obj_->SetParam( cfg_[i].first.c_str(), cfg_[i].second.c_str() );
}
else{
evaluator_.AddEval("rmse");
}
evaluator_.Init();
evaluator_.AddEval( obj_->DefaultEvalMetric() );
}
/*!
* \brief initialize the current data storage for the model; call this function when the model is used for the first time
@ -146,7 +149,7 @@ namespace xgboost{
*/
inline void UpdateOneIter(int iter){
this->PredictBuffer(preds_, *train_, 0);
this->GetGradient(preds_, train_->labels, grad_, hess_);
obj_->GetGradient(preds_, train_->info, grad_, hess_);
std::vector<unsigned> root_index;
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
}
@ -162,7 +165,8 @@ namespace xgboost{
for (size_t i = 0; i < evals_.size(); ++i){
std::vector<float> &preds = this->eval_preds_[i];
this->PredictBuffer(preds, *evals_[i], buffer_offset);
evaluator_.Eval(fo, evname_[i].c_str(), preds, (*evals_[i]).labels);
obj_->PredTransform(preds);
evaluator_.Eval(fo, evname_[i].c_str(), preds, evals_[i]->info);
buffer_offset += static_cast<int>(evals_[i]->Size());
}
fprintf(fo, "\n");
@ -171,18 +175,17 @@ namespace xgboost{
/*! \brief get prediction, without buffering */
inline void Predict(std::vector<float> &preds, const DMatrix &data){
preds.resize(data.Size());
const unsigned ndata = static_cast<unsigned>(data.Size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
preds[j] = mparam.PredTransform
(mparam.base_score + base_gbm.Predict(data.data, j, -1));
preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, -1);
}
obj_->PredTransform( preds );
}
public:
/*!
* \brief update the model for one iteration
* \param iteration iteration number
* \brief interactive update
* \param action action type
*/
inline void UpdateInteract(std::string action){
this->InteractPredict(preds_, *train_, 0);
@ -198,7 +201,7 @@ namespace xgboost{
base_gbm.DelteBooster(); return;
}
this->GetGradient(preds_, train_->labels, grad_, hess_);
obj_->GetGradient(preds_, train_->info, grad_, hess_);
std::vector<unsigned> root_index;
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
@ -216,9 +219,9 @@ namespace xgboost{
const unsigned ndata = static_cast<unsigned>(data.Size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
preds[j] = mparam.PredTransform
(mparam.base_score + base_gbm.InteractPredict(data.data, j, buffer_offset + j));
preds[j] = mparam.base_score + base_gbm.InteractPredict(data.data, j, buffer_offset + j);
}
obj_->PredTransform( preds );
}
/*! \brief repredict trial */
inline void InteractRePredict(const DMatrix &data, unsigned buffer_offset){
@ -232,37 +235,13 @@ namespace xgboost{
/*! \brief get the predictions into buffer (before objective transform), given data */
inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset){
preds.resize(data.Size());
const unsigned ndata = static_cast<unsigned>(data.Size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
preds[j] = mparam.PredTransform
(mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j));
preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j);
}
}
/*! \brief get the first order and second order gradient, given the transformed predictions and labels */
inline void GetGradient(const std::vector<float> &preds,
const std::vector<float> &labels,
std::vector<float> &grad,
std::vector<float> &hess){
grad.resize(preds.size()); hess.resize(preds.size());
const unsigned ndata = static_cast<unsigned>(preds.size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
grad[j] = mparam.FirstOrderGradient(preds[j], labels[j]);
hess[j] = mparam.SecondOrderGradient(preds[j], labels[j]);
}
}
private:
enum LossType{
kLinearSquare = 0,
kLogisticNeglik = 1,
kLogisticClassify = 2
};
/*! \brief training parameter for regression */
struct ModelParam{
/* \brief global bias */
@ -277,7 +256,6 @@ namespace xgboost{
ModelParam(void){
base_score = 0.5f;
loss_type = 0;
num_feature = 0;
memset(reserved, 0, sizeof(reserved));
}
/*!
@ -299,92 +277,6 @@ namespace xgboost{
base_score = -logf(1.0f / base_score - 1.0f);
}
}
/*!
* \brief transform the linear sum to prediction
* \param x linear sum of boosting ensemble
* \return transformed prediction
*/
inline float PredTransform(float x){
switch (loss_type){
case kLinearSquare: return x;
case kLogisticClassify:
case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculate first order gradient of loss, given transformed prediction
* \param predt transformed prediction
* \param label true label
* \return first order gradient
*/
inline float FirstOrderGradient(float predt, float label) const{
switch (loss_type){
case kLinearSquare: return predt - label;
case kLogisticClassify:
case kLogisticNeglik: return predt - label;
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculate second order gradient of loss, given transformed prediction
* \param predt transformed prediction
* \param label true label
* \return second order gradient
*/
inline float SecondOrderGradient(float predt, float label) const{
switch (loss_type){
case kLinearSquare: return 1.0f;
case kLogisticClassify:
case kLogisticNeglik: return predt * (1 - predt);
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculating the loss, given the predictions, labels and the loss type
* \param preds the given predictions
* \param labels the given labels
* \return the specified loss
*/
inline float Loss(const std::vector<float> &preds, const std::vector<float> &labels) const{
switch (loss_type){
case kLinearSquare: return SquareLoss(preds, labels);
case kLogisticNeglik:
case kLogisticClassify: return NegLoglikelihoodLoss(preds, labels);
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculating the square loss, given the predictions and labels
* \param preds the given predictions
* \param labels the given labels
* \return the summation of square loss
*/
inline float SquareLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
float ans = 0.0;
for (size_t i = 0; i < preds.size(); i++){
float dif = preds[i] - labels[i];
ans += dif * dif;
}
return ans;
}
/*!
* \brief calculating the negative log-likelihood loss, given the predictions and labels
* \param preds the given predictions
* \param labels the given labels
* \return the summation of negative log-likelihood loss
*/
inline float NegLoglikelihoodLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
float ans = 0.0;
for (size_t i = 0; i < preds.size(); i++)
ans -= labels[i] * logf(preds[i]) + (1 - labels[i]) * logf(1 - preds[i]);
return ans;
}
};
private:
int silent;
@ -395,6 +287,11 @@ namespace xgboost{
std::vector<DMatrix *> evals_;
std::vector<std::string> evname_;
std::vector<unsigned> buffer_index_;
// objective function
IObjFunction *obj_;
// name of objective function
std::string name_obj_;
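// parameter settings buffered by SetParam, replayed into obj_ when InitTrainer creates it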
std::vector< std::pair<std::string, std::string> > cfg_;
private:
std::vector<float> grad_, hess_, preds_;
std::vector< std::vector<float> > eval_preds_;
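A minimal usage sketch of the new objective plumbing (not part of this commit; data loading and the training loop are elided, and the parameter values are illustrative). SetParam buffers every setting into cfg_; InitTrainer then creates obj_ through CreateObjFunction, replays the buffered settings into it, and registers its default evaluation metric. A training loop would afterwards call UpdateOneIter, which now delegates gradient computation to obj_->GetGradient.

#include "xgboost_regrank.h"

int main(void){
    xgboost::regrank::RegRankBoostLearner learner;
    learner.SetParam("objective", "reg"); // picks RegressionObj in CreateObjFunction
    learner.SetParam("loss_type", "2");   // LossType::kLogisticClassify, forwarded via cfg_
    learner.InitTrainer();                // creates obj_, adds DefaultEvalMetric() ("error")
    return 0;
}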

View File: regrank/xgboost_regrank_data.h

@ -0,0 +1,205 @@
#ifndef XGBOOST_REGRANK_DATA_H
#define XGBOOST_REGRANK_DATA_H
/*!
* \file xgboost_regrank_data.h
* \brief input data structure for regression, binary classification, and ranking.
* Format:
* The data should contain each data instance in each line.
* The format of line data is as below:
* label [feature index:feature value]+
* When using ranking, an additional group file with suffix .group must be provided, giving the number of instances in each group
* When using weight-aware classification (regression), an additional weight file with suffix .weight must be provided, giving the weight of each instance
*
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cstdio>
#include <vector>
#include "../booster/xgboost_data.h"
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_stream.h"
namespace xgboost{
/*! \brief namespace to handle regression and rank */
namespace regrank{
/*! \brief data matrix for regression content */
struct DMatrix{
public:
/*! \brief data information besides the features */
struct Info{
/*! \brief label of each instance */
std::vector<float> labels;
/*! \brief the index of the beginning and end of each group; needed when the learning task is ranking */
std::vector<unsigned> group_ptr;
/*! \brief weights of each instance, optional */
std::vector<float> weights;
/*! \brief get the weight of the i-th instance */
inline float GetWeight( size_t i ) const{
if( weights.size() != 0 ) return weights[i];
else return 1.0f;
}
};
public:
/*! \brief feature data content */
booster::FMatrixS data;
/*! \brief information fields */
Info info;
public:
/*! \brief default constructor */
DMatrix(void){}
/*! \brief get the number of instances */
inline size_t Size() const{
return info.labels.size();
}
/*!
* \brief load from text file
* \param fname name of text data
* \param silent whether to print information
*/
inline void LoadText(const char* fname, bool silent = false){
data.Clear();
FILE* file = utils::FopenCheck(fname, "r");
float label; bool init = true;
char tmp[1024];
std::vector<booster::bst_uint> findex;
std::vector<booster::bst_float> fvalue;
while (fscanf(file, "%s", tmp) == 1){
unsigned index; float value;
if (sscanf(tmp, "%u:%f", &index, &value) == 2){
findex.push_back(index); fvalue.push_back(value);
}
else{
if (!init){
info.labels.push_back(label);
data.AddRow(findex, fvalue);
}
findex.clear(); fvalue.clear();
utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format");
init = false;
}
}
info.labels.push_back(label);
data.AddRow(findex, fvalue);
// initialize column support as well
data.InitData();
if (!silent){
printf("%ux%u matrix with %lu entries is loaded from %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
}
fclose(file);
this->TryLoadGroup(fname, silent);
this->TryLoadWeight(fname, silent);
}
/*!
* \brief load from binary file
* \param fname name of binary data
* \param silent whether to print information
* \return whether loading succeeded
*/
inline bool LoadBinary(const char* fname, bool silent = false){
FILE *fp = fopen64(fname, "rb");
if (fp == NULL) return false;
utils::FileStream fs(fp);
data.LoadBinary(fs);
info.labels.resize(data.NumRow());
utils::Assert(fs.Read(&info.labels[0], sizeof(float)* data.NumRow()) != 0, "DMatrix LoadBinary");
fs.Close();
// initialize column support as well
data.InitData();
if (!silent){
printf("%ux%u matrix with %lu entries is loaded from %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
}
this->TryLoadGroup(fname, silent);
this->TryLoadWeight(fname, silent);
return true;
}
/*!
* \brief save to binary file
* \param fname name of binary data
* \param silent whether to print information
*/
inline void SaveBinary(const char* fname, bool silent = false){
// initialize column support as well
data.InitData();
utils::FileStream fs(utils::FopenCheck(fname, "wb"));
data.SaveBinary(fs);
fs.Write(&info.labels[0], sizeof(float)* data.NumRow());
fs.Close();
if (!silent){
printf("%ux%u matrix with %lu entries is saved to %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
}
}
/*!
* \brief cache-load data given a file name: if the filename ends with .buffer, load the binary directly;
* otherwise the function first checks whether fname + '.buffer' exists;
* if the binary buffer exists it reads from the binary buffer, otherwise it loads from the text file
* and tries to create a buffer file
* \param fname name of the data file
* \param silent whether to print information
* \param savebuffer whether to save a binary buffer if the input is text
*/
inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){
int len = strlen(fname);
if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
this->LoadBinary(fname, silent); return;
}
char bname[1024];
sprintf(bname, "%s.buffer", fname);
if (!this->LoadBinary(bname, silent)){
this->LoadText(fname, silent);
if (savebuffer) this->SaveBinary(bname, silent);
}
}
private:
inline bool TryLoadGroup(const char* fname, bool silent = false){
std::string name = fname;
if (name.length() > 8 && !strcmp(fname + name.length() - 7, ".buffer")){
name.resize( name.length() - 7 );
}
name += ".group";
// if group data exists, load it in
FILE *fi = fopen64(name.c_str(), "r");
if (fi == NULL) return false;
info.group_ptr.push_back(0);
unsigned nline;
while (fscanf(fi, "%u", &nline) == 1){
info.group_ptr.push_back(info.group_ptr.back()+nline);
}
if(!silent){
printf("%lu groups are loaded from %s\n", info.group_ptr.size()-1, name.c_str());
}
fclose(fi);
utils::Assert( info.group_ptr.back() == data.NumRow(), "DMatrix: group data does not match the number of rows in feature matrix" );
return true;
}
inline bool TryLoadWeight(const char* fname, bool silent = false){
std::string name = fname;
if (name.length() > 8 && !strcmp(fname + name.length() - 7, ".buffer")){
name.resize( name.length() - 7 );
}
name += ".weight";
// if weight data exists, load it in
FILE *fi = fopen64(name.c_str(), "r");
if (fi == NULL) return false;
float wt;
while (fscanf(fi, "%f", &wt) == 1){
info.weights.push_back( wt );
}
if(!silent){
printf("loading weight from %s\n", name.c_str());
}
fclose(fi);
utils::Assert( info.weights.size() == data.NumRow(), "DMatrix: weight data does not match the number of rows in feature matrix" );
return true;
}
};
};
};
#endif
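As an illustration of the format described in the header comment above, a hypothetical three-instance training set could consist of train.txt plus two optional side files (all filenames here are examples, not part of this commit):

train.txt (label followed by sparse index:value pairs):
    1 0:0.5 3:1.2
    0 1:2.0
    1 0:0.1 2:0.7

train.txt.weight (one weight per instance):
    1.0
    0.5
    2.0

train.txt.group (instances per group, summing to the number of rows):
    2
    1

Calling CacheLoad("train.txt") on a DMatrix then parses the text, picks up both side files via TryLoadGroup and TryLoadWeight, and writes train.txt.buffer so later runs can load the binary directly.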

View File: regrank/xgboost_regrank_eval.h

@ -1,8 +1,8 @@
#ifndef XGBOOST_REG_EVAL_H
#define XGBOOST_REG_EVAL_H
#ifndef XGBOOST_REGRANK_EVAL_H
#define XGBOOST_REGRANK_EVAL_H
/*!
* \file xgboost_reg_eval.h
* \brief evaluation metrics for regression and classification
* \file xgboost_regrank_eval.h
* \brief evaluation metrics for regression, classification, and ranking
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
@ -12,18 +12,19 @@
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_omp.h"
#include "../utils/xgboost_random.h"
#include "xgboost_regrank_data.h"
namespace xgboost{
namespace regression{
namespace regrank{
/*! \brief evaluator that evaluates the loss metrics */
struct IEvaluator{
/*!
* \brief evaluate a specific metric
* \param preds prediction
* \param labels label
* \param info information, including label etc.
*/
virtual float Eval(const std::vector<float> &preds,
const std::vector<float> &labels) const = 0;
const DMatrix::Info &info ) const = 0;
/*! \return name of metric */
virtual const char *Name(void) const = 0;
};
@ -31,37 +32,62 @@ namespace xgboost{
/*! \brief RMSE */
struct EvalRMSE : public IEvaluator{
virtual float Eval(const std::vector<float> &preds,
const std::vector<float> &labels) const{
const DMatrix::Info &info ) const {
const unsigned ndata = static_cast<unsigned>(preds.size());
float sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule( static )
float sum = 0.0, wsum = 0.0;
#pragma omp parallel for reduction(+:sum,wsum) schedule( static )
for (unsigned i = 0; i < ndata; ++i){
float diff = preds[i] - labels[i];
sum += diff * diff;
const float wt = info.GetWeight(i);
const float diff = info.labels[i] - preds[i];
sum += diff*diff * wt;
wsum += wt;
}
return sqrtf(sum / ndata);
return sqrtf(sum / wsum);
}
virtual const char *Name(void) const{
return "rmse";
}
};
/*! \brief negative log-likelihood loss */
struct EvalLogLoss : public IEvaluator{
virtual float Eval(const std::vector<float> &preds,
const DMatrix::Info &info ) const {
const unsigned ndata = static_cast<unsigned>(preds.size());
float sum = 0.0f, wsum = 0.0f;
#pragma omp parallel for reduction(+:sum,wsum) schedule( static )
for (unsigned i = 0; i < ndata; ++i){
const float y = info.labels[i];
const float py = preds[i];
const float wt = info.GetWeight(i);
sum -= wt * ( y * std::log(py) + (1.0f - y)*std::log(1 - py) );
wsum+= wt;
}
return sum / wsum;
}
virtual const char *Name(void) const{
return "negllik";
}
};
/*! \brief Error */
struct EvalError : public IEvaluator{
virtual float Eval(const std::vector<float> &preds,
const std::vector<float> &labels) const{
const DMatrix::Info &info ) const {
const unsigned ndata = static_cast<unsigned>(preds.size());
unsigned nerr = 0;
#pragma omp parallel for reduction(+:nerr) schedule( static )
float sum = 0.0f, wsum = 0.0f;
#pragma omp parallel for reduction(+:sum,wsum) schedule( static )
for (unsigned i = 0; i < ndata; ++i){
const float wt = info.GetWeight(i);
if (preds[i] > 0.5f){
if (labels[i] < 0.5f) nerr += 1;
if (info.labels[i] < 0.5f) sum += wt;
}
else{
if (labels[i] > 0.5f) nerr += 1;
if (info.labels[i] >= 0.5f) sum += wt;
}
wsum += wt;
}
return static_cast<float>(nerr) / ndata;
return sum / wsum;
}
virtual const char *Name(void) const{
return "error";
@ -74,7 +100,8 @@ namespace xgboost{
return a.first > b.first;
}
virtual float Eval( const std::vector<float> &preds,
const std::vector<float> &labels ) const{
const DMatrix::Info &info ) const {
const std::vector<float> &labels = info.labels;
const unsigned ndata = static_cast<unsigned>( preds.size() );
std::vector< std::pair<float, float> > rec;
for( unsigned i = 0; i < ndata; ++ i ){
@ -100,54 +127,35 @@ namespace xgboost{
return "auc";
}
};
/*! \brief Error */
struct EvalLogLoss : public IEvaluator{
virtual float Eval(const std::vector<float> &preds,
const std::vector<float> &labels) const{
const unsigned ndata = static_cast<unsigned>(preds.size());
unsigned nerr = 0;
#pragma omp parallel for reduction(+:nerr) schedule( static )
for (unsigned i = 0; i < ndata; ++i){
const float y = labels[i];
const float py = preds[i];
nerr -= y * std::log(py) + (1.0f - y)*std::log(1 - py);
}
return static_cast<float>(nerr) / ndata;
}
virtual const char *Name(void) const{
return "negllik";
}
};
};
namespace regression{
namespace regrank{
/*! \brief a set of evaluators */
struct EvalSet{
public:
inline void AddEval(const char *name){
if (!strcmp(name, "rmse")) evals_.push_back(&rmse_);
if (!strcmp(name, "error")) evals_.push_back(&error_);
if (!strcmp(name, "logloss")) evals_.push_back(&logloss_);
if (!strcmp( name, "auc")) evals_.push_back( &auc_ );
for( size_t i = 0; i < evals_.size(); ++ i ){
if(!strcmp(name, evals_[i]->Name())) return;
}
if (!strcmp(name, "rmse")) evals_.push_back( new EvalRMSE() );
if (!strcmp(name, "error")) evals_.push_back( new EvalError() );
if (!strcmp(name, "logloss")) evals_.push_back( new EvalLogLoss() );
if (!strcmp( name, "auc")) evals_.push_back( new EvalAuc() );
}
~EvalSet(){
for( size_t i = 0; i < evals_.size(); ++ i ){
delete evals_[i];
}
inline void Init(void){
std::sort(evals_.begin(), evals_.end());
evals_.resize(std::unique(evals_.begin(), evals_.end()) - evals_.begin());
}
inline void Eval(FILE *fo, const char *evname,
const std::vector<float> &preds,
const std::vector<float> &labels) const{
const DMatrix::Info &info ) const{
for (size_t i = 0; i < evals_.size(); ++i){
float res = evals_[i]->Eval(preds, labels);
float res = evals_[i]->Eval(preds, info);
fprintf(fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res);
}
}
private:
EvalRMSE rmse_;
EvalError error_;
EvalAuc auc_;
EvalLogLoss logloss_;
std::vector<const IEvaluator*> evals_;
};
};
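Restated outside the evaluator framework, the weighted RMSE that EvalRMSE above computes is sqrt( sum_i w_i * (y_i - p_i)^2 / sum_i w_i ), where w_i falls back to 1 when no weights were loaded. A self-contained sketch:

#include <cmath>
#include <vector>

// weighted RMSE as in EvalRMSE; an empty weight vector means w_i = 1 for all i
inline float WeightedRMSE(const std::vector<float> &preds,
                          const std::vector<float> &labels,
                          const std::vector<float> &weights){
    float sum = 0.0f, wsum = 0.0f;
    for (size_t i = 0; i < preds.size(); ++i){
        const float wt = weights.empty() ? 1.0f : weights[i];
        const float diff = labels[i] - preds[i];
        sum += diff * diff * wt;
        wsum += wt;
    }
    return std::sqrt(sum / wsum);
}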

View File: regrank/xgboost_regrank_main.cpp

@ -4,13 +4,13 @@
#include <ctime>
#include <string>
#include <cstring>
#include "xgboost_reg.h"
#include "xgboost_regrank.h"
#include "../utils/xgboost_fmap.h"
#include "../utils/xgboost_random.h"
#include "../utils/xgboost_config.h"
namespace xgboost{
namespace regression{
namespace regrank{
/*!
* \brief wrapping the training process of the gradient boosting regression model,
* given the configuration
@ -273,13 +273,13 @@ namespace xgboost{
DMatrix data;
std::vector<DMatrix*> deval;
utils::FeatMap fmap;
RegBoostLearner learner;
RegRankBoostLearner learner;
};
};
};
int main( int argc, char *argv[] ){
xgboost::random::Seed( 0 );
xgboost::regression::RegBoostTask tsk;
xgboost::regrank::RegBoostTask tsk;
return tsk.Run( argc, argv );
}

View File: regrank/xgboost_regrank_obj.h

@ -0,0 +1,112 @@
#ifndef XGBOOST_REGRANK_OBJ_H
#define XGBOOST_REGRANK_OBJ_H
/*!
* \file xgboost_regrank_obj.h
* \brief defines objective function interface used in xgboost for regression and rank
* \author Tianqi Chen, Kailong Chen
*/
#include "xgboost_regrank_data.h"
namespace xgboost{
namespace regrank{
/*! \brief interface of objective function */
class IObjFunction{
public:
/*! \brief virtual destructor */
virtual ~IObjFunction(void){}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
virtual void SetParam(const char *name, const char *val) = 0;
/*!
* \brief get gradient over each of predictions, given existing information
* \param preds prediction of current round
* \param info information about labels, weights, and groups (used in ranking)
* \param grad gradient over each preds
* \param hess second order gradient over each preds
*/
virtual void GetGradient(const std::vector<float>& preds,
const DMatrix::Info &info,
std::vector<float> &grad,
std::vector<float> &hess ) = 0;
/*! \return the default evaluation metric for the problem */
virtual const char* DefaultEvalMetric(void) = 0;
/*!
* \brief transform prediction values; this is only called when prediction is performed
* \param preds prediction values, saves to this vector as well
*/
virtual void PredTransform(std::vector<float> &preds){}
};
};
namespace regrank{
/*! \brief defines commonly used loss functions and their gradients */
struct LossType{
public:
const static int kLinearSquare = 0;
const static int kLogisticNeglik = 1;
const static int kLogisticClassify = 2;
public:
/*! \brief indicate which type we are using */
int loss_type;
public:
/*!
* \brief transform the linear sum to prediction
* \param x linear sum of boosting ensemble
* \return transformed prediction
*/
inline float PredTransform(float x){
switch (loss_type){
case kLinearSquare: return x;
case kLogisticClassify:
case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculate first order gradient of loss, given transformed prediction
* \param predt transformed prediction
* \param label true label
* \return first order gradient
*/
inline float FirstOrderGradient(float predt, float label) const{
switch (loss_type){
case kLinearSquare: return predt - label;
case kLogisticClassify:
case kLogisticNeglik: return predt - label;
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculate second order gradient of loss, given transformed prediction
* \param predt transformed prediction
* \param label true label
* \return second order gradient
*/
inline float SecondOrderGradient(float predt, float label) const{
switch (loss_type){
case kLinearSquare: return 1.0f;
case kLogisticClassify:
case kLogisticNeglik: return predt * (1 - predt);
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
};
};
};
#include "xgboost_regrank_obj.hpp"
namespace xgboost{
namespace regrank{
IObjFunction* CreateObjFunction( const char *name ){
if( !strcmp("reg", name ) ) return new RegressionObj();
utils::Error("unknown objective function type");
return NULL;
}
};
};
#endif
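To show what the interface buys, here is a hypothetical additional objective (MyAbsError and the "abserror" name are illustrative, not part of this commit): implement IObjFunction, weight the gradients the same way RegressionObj does, and add one branch to CreateObjFunction.

namespace xgboost{
    namespace regrank{
        // hypothetical objective: weighted absolute loss with a constant hessian
        class MyAbsError : public IObjFunction{
        public:
            virtual ~MyAbsError(void){}
            virtual void SetParam(const char *name, const char *val){}
            virtual void GetGradient(const std::vector<float>& preds,
                                     const DMatrix::Info &info,
                                     std::vector<float> &grad,
                                     std::vector<float> &hess){
                grad.resize(preds.size()); hess.resize(preds.size());
                for (size_t j = 0; j < preds.size(); ++j){
                    const float wt = info.GetWeight(j);
                    grad[j] = (preds[j] > info.labels[j] ? 1.0f : -1.0f) * wt;
                    hess[j] = wt; // |x| has no curvature; a unit hessian keeps DoBoost stable
                }
            }
            virtual const char* DefaultEvalMetric(void){ return "rmse"; }
        };
    };
};

CreateObjFunction would then gain one line:
    if( !strcmp("abserror", name ) ) return new MyAbsError();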

View File: regrank/xgboost_regrank_obj.hpp

@ -0,0 +1,52 @@
#ifndef XGBOOST_REGRANK_OBJ_HPP
#define XGBOOST_REGRANK_OBJ_HPP
/*!
* \file xgboost_regrank_obj.hpp
* \brief implementation of objective functions
* \author Tianqi Chen, Kailong Chen
*/
namespace xgboost{
namespace regrank{
class RegressionObj : public IObjFunction{
public:
RegressionObj(void){
loss.loss_type = LossType::kLinearSquare;
}
virtual ~RegressionObj(){}
virtual void SetParam(const char *name, const char *val){
if( !strcmp( "loss_type", name ) ) loss.loss_type = atoi( val );
}
virtual void GetGradient(const std::vector<float>& preds,
const DMatrix::Info &info,
std::vector<float> &grad,
std::vector<float> &hess ) {
grad.resize(preds.size()); hess.resize(preds.size());
const unsigned ndata = static_cast<unsigned>(preds.size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
grad[j] = loss.FirstOrderGradient(preds[j], info.labels[j]) * info.GetWeight(j);
hess[j] = loss.SecondOrderGradient(preds[j], info.labels[j]) * info.GetWeight(j);
}
}
virtual const char* DefaultEvalMetric(void) {
if( loss.loss_type == LossType::kLogisticClassify ) return "error";
else return "rmse";
}
virtual void PredTransform(std::vector<float> &preds){
const unsigned ndata = static_cast<unsigned>(preds.size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
preds[j] = loss.PredTransform( preds[j] );
}
}
private:
LossType loss;
};
};
namespace regrank{
// TODO rank objective
};
};
#endif
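For reference, the logistic branches of LossType follow from differentiating the negative log-likelihood with respect to the linear sum x, writing p for the transformed prediction:

\ell(y, x) = -\bigl(y \log p + (1 - y)\log(1 - p)\bigr), \qquad p = \frac{1}{1 + e^{-x}}

\frac{\partial \ell}{\partial x} = p - y, \qquad \frac{\partial^2 \ell}{\partial x^2} = p\,(1 - p)

RegressionObj::GetGradient scales both expressions by info.GetWeight(j), which is the single point where a .weight file enters the boosting update.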

View File: regression/xgboost_reg_data.h (deleted)

@ -1,141 +0,0 @@
#ifndef XGBOOST_REG_DATA_H
#define XGBOOST_REG_DATA_H
/*!
* \file xgboost_reg_data.h
* \brief input data structure for regression and binary classification task.
* Format:
* The data should contain each data instance in each line.
* The format of line data is as below:
* label [feature index:feature value]+
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cstdio>
#include <vector>
#include "../booster/xgboost_data.h"
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_stream.h"
namespace xgboost{
namespace regression{
/*! \brief data matrix for regression content */
struct DMatrix{
public:
/*! \brief feature data content */
booster::FMatrixS data;
/*! \brief label of each instance */
std::vector<float> labels;
public:
/*! \brief default constructor */
DMatrix(void){}
/*! \brief get the number of instances */
inline size_t Size() const{
return labels.size();
}
/*!
* \brief load from text file
* \param fname name of text data
* \param silent whether to print information
*/
inline void LoadText(const char* fname, bool silent = false){
data.Clear();
FILE* file = utils::FopenCheck(fname, "r");
float label; bool init = true;
char tmp[1024];
std::vector<booster::bst_uint> findex;
std::vector<booster::bst_float> fvalue;
while (fscanf(file, "%s", tmp) == 1){
unsigned index; float value;
if (sscanf(tmp, "%u:%f", &index, &value) == 2){
findex.push_back(index); fvalue.push_back(value);
}
else{
if (!init){
labels.push_back(label);
data.AddRow(findex, fvalue);
}
findex.clear(); fvalue.clear();
utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format");
init = false;
}
}
labels.push_back(label);
data.AddRow(findex, fvalue);
// initialize column support as well
data.InitData();
if (!silent){
printf("%ux%u matrix with %lu entries is loaded from %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
}
fclose(file);
}
/*!
* \brief load from binary file
* \param fname name of binary data
* \param silent whether to print information
* \return whether loading succeeded
*/
inline bool LoadBinary(const char* fname, bool silent = false){
FILE *fp = fopen64(fname, "rb");
if (fp == NULL) return false;
utils::FileStream fs(fp);
data.LoadBinary(fs);
labels.resize(data.NumRow());
utils::Assert(fs.Read(&labels[0], sizeof(float)* data.NumRow()) != 0, "DMatrix LoadBinary");
fs.Close();
// initialize column support as well
data.InitData();
if (!silent){
printf("%ux%u matrix with %lu entries is loaded from %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
}
return true;
}
/*!
* \brief save to binary file
* \param fname name of binary data
* \param silent whether to print information
*/
inline void SaveBinary(const char* fname, bool silent = false){
// initialize column support as well
data.InitData();
utils::FileStream fs(utils::FopenCheck(fname, "wb"));
data.SaveBinary(fs);
fs.Write(&labels[0], sizeof(float)* data.NumRow());
fs.Close();
if (!silent){
printf("%ux%u matrix with %lu entries is saved to %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
}
}
/*!
* \brief cache-load data given a file name: if the filename ends with .buffer, load the binary directly;
* otherwise the function first checks whether fname + '.buffer' exists;
* if the binary buffer exists it reads from the binary buffer, otherwise it loads from the text file
* and tries to create a buffer file
* \param fname name of the data file
* \param silent whether to print information
* \param savebuffer whether to save a binary buffer if the input is text
*/
inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){
int len = strlen(fname);
if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
this->LoadBinary(fname, silent); return;
}
char bname[1024];
sprintf(bname, "%s.buffer", fname);
if (!this->LoadBinary(bname, silent)){
this->LoadText(fname, silent);
if (savebuffer) this->SaveBinary(bname, silent);
}
}
};
};
};
#endif