diff --git a/Makefile b/Makefile
index 75baf5662..f23ce1085 100644
--- a/Makefile
+++ b/Makefile
@@ -10,9 +10,8 @@ OBJ =
 all: $(BIN) $(OBJ)
 export LDFLAGS= -pthread -lm
 
-xgboost: regression/xgboost_reg_main.cpp regression/*.h booster/*.h booster/*/*.hpp booster/*.hpp
+xgboost: regrank/xgboost_regrank_main.cpp regrank/*.h booster/*.h booster/*/*.hpp booster/*.hpp
-#xgboost: rank/xgboost_rank_main.cpp base/*.h rank/*.h booster/*.h booster/*/*.hpp booster/*.hpp
 
 $(BIN) :
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
diff --git a/regression/xgboost_reg.h b/regrank/xgboost_regrank.h
similarity index 61%
rename from regression/xgboost_reg.h
rename to regrank/xgboost_regrank.h
index d676de3e2..9482ce11e 100644
--- a/regression/xgboost_reg.h
+++ b/regrank/xgboost_regrank.h
@@ -1,28 +1,31 @@
-#ifndef XGBOOST_REG_H
-#define XGBOOST_REG_H
+#ifndef XGBOOST_REGRANK_H
+#define XGBOOST_REGRANK_H
 /*!
-* \file xgboost_reg.h
-* \brief class for gradient boosted regression
+* \file xgboost_regrank.h
+* \brief class for gradient boosted regression and ranking
 * \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
 */
 #include <vector>
 #include <string>
 #include <cstring>
-#include "xgboost_reg_data.h"
-#include "xgboost_reg_eval.h"
+#include "xgboost_regrank_data.h"
+#include "xgboost_regrank_eval.h"
+#include "xgboost_regrank_obj.h"
 #include "../utils/xgboost_omp.h"
 #include "../booster/xgboost_gbmbase.h"
 #include "../utils/xgboost_utils.h"
 #include "../utils/xgboost_stream.h"
 
 namespace xgboost{
-    namespace regression{
-        /*! \brief class for gradient boosted regression */
-        class RegBoostLearner{
+    namespace regrank{
+        /*! \brief class for gradient boosted regression and ranking */
+        class RegRankBoostLearner{
         public:
             /*! \brief constructor */
-            RegBoostLearner(void){
+            RegRankBoostLearner(void){
                 silent = 0;
+                obj_ = NULL;
+                name_obj_ = "reg";
             }
             /*!
             * \brief a regression booster associated with training and evaluating data
@@ -30,9 +33,9 @@ namespace xgboost{
             * \param evals array of evaluating data
             * \param evname name of evaluation data, used to print statistics
             */
-            RegBoostLearner(const DMatrix *train,
-                            const std::vector<const DMatrix*> &evals,
-                            const std::vector<std::string> &evname){
+            RegRankBoostLearner(const DMatrix *train,
+                                const std::vector<const DMatrix*> &evals,
+                                const std::vector<std::string> &evname){
                 silent = 0;
                 this->SetData(train, evals, evname);
             }
@@ -44,8 +47,8 @@ namespace xgboost{
             * \param evname name of evaluation data, used to print statistics
             */
             inline void SetData(const DMatrix *train,
-                const std::vector<const DMatrix*> &evals,
-                const std::vector<std::string> &evname){
+                const std::vector<const DMatrix*> &evals,
+                const std::vector<std::string> &evname){
                 this->train_ = train;
                 this->evals_ = evals;
                 this->evname_ = evname;
@@ -83,8 +86,10 @@ namespace xgboost{
             inline void SetParam(const char *name, const char *val){
                 if (!strcmp(name, "silent")) silent = atoi(val);
                 if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
+                if (!strcmp(name, "objective") ) name_obj_ = val;
                 mparam.SetParam(name, val);
                 base_gbm.SetParam(name, val);
+                cfg_.push_back( std::make_pair( std::string(name), std::string(val) ) );
             }
             /*!
             * \brief initialize solver before training, called before training
@@ -92,13 +97,11 @@ namespace xgboost{
             */
             inline void InitTrainer(void){
                 base_gbm.InitTrainer();
-                if (mparam.loss_type == kLogisticClassify){
-                    evaluator_.AddEval("error");
+                obj_ = CreateObjFunction( name_obj_.c_str() );
+                for( size_t i = 0; i < cfg_.size(); ++ i ){
+                    obj_->SetParam( cfg_[i].first.c_str(), cfg_[i].second.c_str() );
                 }
-                else{
-                    evaluator_.AddEval("rmse");
-                }
-                evaluator_.Init();
+                evaluator_.AddEval( obj_->DefaultEvalMetric() );
             }
             /*!
             * \brief initialize the current data storage for model, if the model is used first time, call this function
@@ -146,7 +149,7 @@ namespace xgboost{
             */
             inline void UpdateOneIter(int iter){
                 this->PredictBuffer(preds_, *train_, 0);
-                this->GetGradient(preds_, train_->labels, grad_, hess_);
+                obj_->GetGradient(preds_, train_->info, grad_, hess_);
                 std::vector<unsigned> root_index;
                 base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
             }
@@ -162,7 +165,8 @@ namespace xgboost{
                 for (size_t i = 0; i < evals_.size(); ++i){
                     std::vector<float> &preds = this->eval_preds_[i];
                     this->PredictBuffer(preds, *evals_[i], buffer_offset);
-                    evaluator_.Eval(fo, evname_[i].c_str(), preds, (*evals_[i]).labels);
+                    obj_->PredTransform(preds);
+                    evaluator_.Eval(fo, evname_[i].c_str(), preds, evals_[i]->info);
                     buffer_offset += static_cast<unsigned>(evals_[i]->Size());
                 }
                 fprintf(fo, "\n");
@@ -171,18 +175,17 @@ namespace xgboost{
             /*! \brief get prediction, without buffering */
             inline void Predict(std::vector<float> &preds, const DMatrix &data){
                 preds.resize(data.Size());
                 const unsigned ndata = static_cast<unsigned>(data.Size());
                 #pragma omp parallel for schedule( static )
                 for (unsigned j = 0; j < ndata; ++j){
-                    preds[j] = mparam.PredTransform
-                        (mparam.base_score + base_gbm.Predict(data.data, j, -1));
+                    preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, -1);
                 }
-            }
+                obj_->PredTransform( preds );
+            }
         public:
             /*!
-            * \brief update the model for one iteration
-            * \param iteration iteration number
+            * \brief interactive update
+            * \param action action type
             */
             inline void UpdateInteract(std::string action){
                 this->InteractPredict(preds_, *train_, 0);
@@ -198,7 +201,7 @@ namespace xgboost{
                     base_gbm.DelteBooster();
                     return;
                 }
-                this->GetGradient(preds_, train_->labels, grad_, hess_);
+                obj_->GetGradient(preds_, train_->info, grad_, hess_);
                 std::vector<unsigned> root_index;
                 base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
 
@@ -216,9 +219,9 @@ namespace xgboost{
                 const unsigned ndata = static_cast<unsigned>(data.Size());
                 #pragma omp parallel for schedule( static )
                 for (unsigned j = 0; j < ndata; ++j){
-                    preds[j] = mparam.PredTransform
-                        (mparam.base_score + base_gbm.InteractPredict(data.data, j, buffer_offset + j));
+                    preds[j] = mparam.base_score + base_gbm.InteractPredict(data.data, j, buffer_offset + j);
                 }
+                obj_->PredTransform( preds );
             }
             /*! \brief repredict trial */
             inline void InteractRePredict(const DMatrix &data, unsigned buffer_offset){
@@ -232,37 +235,13 @@ namespace xgboost{
             /*! \brief get the transformed predictions, given data */
             inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset){
                 preds.resize(data.Size());
                 const unsigned ndata = static_cast<unsigned>(data.Size());
                 #pragma omp parallel for schedule( static )
                 for (unsigned j = 0; j < ndata; ++j){
-                    preds[j] = mparam.PredTransform
-                        (mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j));
+                    preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j);
                 }
             }
-
-            /*! \brief get the first order and second order gradient, given the transformed predictions and labels */
-            inline void GetGradient(const std::vector<float> &preds,
-                                    const std::vector<float> &labels,
-                                    std::vector<float> &grad,
-                                    std::vector<float> &hess){
-                grad.resize(preds.size()); hess.resize(preds.size());
-
-                const unsigned ndata = static_cast<unsigned>(preds.size());
-                #pragma omp parallel for schedule( static )
-                for (unsigned j = 0; j < ndata; ++j){
-                    grad[j] = mparam.FirstOrderGradient(preds[j], labels[j]);
-                    hess[j] = mparam.SecondOrderGradient(preds[j], labels[j]);
-                }
-            }
-        private:
-            enum LossType{
-                kLinearSquare = 0,
-                kLogisticNeglik = 1,
-                kLogisticClassify = 2
-            };
-
             /*! \brief training parameter for regression */
             struct ModelParam{
                 /* \brief global bias */
                 float base_score;
                 /* \brief type of loss function */
                 int loss_type;
                 /* \brief number of features */
-                int num_feature;
+                int num_feature;
                 /*! \brief reserved field */
                 int reserved[16];
                 /*! \brief constructor */
                 ModelParam(void){
                     base_score = 0.5f;
                     loss_type = 0;
-                    num_feature = 0;
                     memset(reserved, 0, sizeof(reserved));
                 }
                 /*!
@@ -299,92 +277,6 @@ namespace xgboost{
                         base_score = -logf(1.0f / base_score - 1.0f);
                     }
                 }
-
-                /*!
-                * \brief transform the linear sum to prediction
-                * \param x linear sum of boosting ensemble
-                * \return transformed prediction
-                */
-                inline float PredTransform(float x){
-                    switch (loss_type){
-                    case kLinearSquare: return x;
-                    case kLogisticClassify:
-                    case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
-                    default: utils::Error("unknown loss_type"); return 0.0f;
-                    }
-                }
-
-                /*!
-                * \brief calculate first order gradient of loss, given transformed prediction
-                * \param predt transformed prediction
-                * \param label true label
-                * \return first order gradient
-                */
-                inline float FirstOrderGradient(float predt, float label) const{
-                    switch (loss_type){
-                    case kLinearSquare: return predt - label;
-                    case kLogisticClassify:
-                    case kLogisticNeglik: return predt - label;
-                    default: utils::Error("unknown loss_type"); return 0.0f;
-                    }
-                }
-                /*!
-                * \brief calculate second order gradient of loss, given transformed prediction
-                * \param predt transformed prediction
-                * \param label true label
-                * \return second order gradient
-                */
-                inline float SecondOrderGradient(float predt, float label) const{
-                    switch (loss_type){
-                    case kLinearSquare: return 1.0f;
-                    case kLogisticClassify:
-                    case kLogisticNeglik: return predt * (1 - predt);
-                    default: utils::Error("unknown loss_type"); return 0.0f;
-                    }
-                }
-
-                /*!
-                * \brief calculate the loss, given the predictions, labels and the loss type
-                * \param preds the given predictions
-                * \param labels the given labels
-                * \return the specified loss
-                */
-                inline float Loss(const std::vector<float> &preds, const std::vector<float> &labels) const{
-                    switch (loss_type){
-                    case kLinearSquare: return SquareLoss(preds, labels);
-                    case kLogisticNeglik:
-                    case kLogisticClassify: return NegLoglikelihoodLoss(preds, labels);
-                    default: utils::Error("unknown loss_type"); return 0.0f;
-                    }
-                }
-
-                /*!
-                * \brief calculate the square loss, given the predictions and labels
-                * \param preds the given predictions
-                * \param labels the given labels
-                * \return the summation of square loss
-                */
-                inline float SquareLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
-                    float ans = 0.0;
-                    for (size_t i = 0; i < preds.size(); i++){
-                        float dif = preds[i] - labels[i];
-                        ans += dif * dif;
-                    }
-                    return ans;
-                }
-
-                /*!
-                * \brief calculate the negative log-likelihood loss, given the predictions and labels
-                * \param preds the given predictions
-                * \param labels the given labels
-                * \return the summation of negative log-likelihood loss
-                */
-                inline float NegLoglikelihoodLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
-                    float ans = 0.0;
-                    for (size_t i = 0; i < preds.size(); i++)
-                        ans -= labels[i] * logf(preds[i]) + (1 - labels[i]) * logf(1 - preds[i]);
-                    return ans;
-                }
             };
         private:
             int silent;
@@ -395,6 +287,11 @@ namespace xgboost{
             std::vector<const DMatrix*> evals_;
             std::vector<std::string> evname_;
             std::vector<unsigned> buffer_index_;
+            // objective function
+            IObjFunction *obj_;
+            // name of objective function
+            std::string name_obj_;
+            std::vector< std::pair<std::string, std::string> > cfg_;
         private:
             std::vector<float> grad_, hess_, preds_;
             std::vector< std::vector<float> > eval_preds_;
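The core behavioral change in `RegRankBoostLearner` is the indirection through `obj_`: every `SetParam` call is buffered in `cfg_` and replayed in `InitTrainer()` once the objective named by `objective` has been constructed, so loss-specific settings reach the objective even when they arrive before it exists. A minimal sketch of the intended driving sequence, using only the methods introduced above; the parameter values are illustrative, and model/buffer initialization (not shown in this hunk) is elided:

```cpp
// Sketch only: driving the refactored learner.
#include <string>
#include <vector>
#include "regrank/xgboost_regrank.h"

void train_sketch(const xgboost::regrank::DMatrix &train) {
    xgboost::regrank::RegRankBoostLearner learner;
    // Buffered in cfg_, so they can be replayed into the objective later.
    learner.SetParam("objective", "reg");   // selects the IObjFunction by name
    learner.SetParam("loss_type", "2");     // forwarded to RegressionObj
    std::vector<const xgboost::regrank::DMatrix*> evals;
    std::vector<std::string> evname;
    learner.SetData(&train, evals, evname);
    // Creates obj_ via CreateObjFunction(name_obj_), replays cfg_ into it,
    // and registers obj_->DefaultEvalMetric() with the evaluator.
    learner.InitTrainer();
    // (model initialization, not shown in this hunk, would happen here)
    for (int iter = 0; iter < 10; ++iter) {
        learner.UpdateOneIter(iter);        // gradients now come from obj_
    }
}
```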
diff --git a/regrank/xgboost_regrank_data.h b/regrank/xgboost_regrank_data.h
new file mode 100644
index 000000000..344c13115
--- /dev/null
+++ b/regrank/xgboost_regrank_data.h
@@ -0,0 +1,205 @@
+#ifndef XGBOOST_REGRANK_DATA_H
+#define XGBOOST_REGRANK_DATA_H
+
+/*!
+ * \file xgboost_regrank_data.h
+ * \brief input data structure for regression, binary classification, and ranking.
+ *     Format: the data file contains one instance per line, each line formatted as
+ *       label [feature index:feature value]+
+ *     When used for ranking, an additional group file with suffix .group must be provided, giving the number of instances in each group.
+ *     When used for weight-aware classification (or regression), an additional weight file must be provided, giving the weight of each instance.
+ * \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
+ */
+#include <vector>
+#include <string>
+#include "../booster/xgboost_data.h"
+#include "../utils/xgboost_utils.h"
+#include "../utils/xgboost_stream.h"
+
+namespace xgboost{
+    /*! \brief namespace to handle regression and ranking */
+    namespace regrank{
+        /*! \brief data matrix for regression and ranking content */
+        struct DMatrix{
+        public:
+            /*! \brief data information besides the features */
+            struct Info{
+                /*! \brief label of each instance */
+                std::vector<float> labels;
+                /*! \brief the index of begin and end of each group; needed when the learning task is ranking */
+                std::vector<unsigned> group_ptr;
+                /*! \brief weight of each instance, optional */
+                std::vector<float> weights;
+                /*! \brief get the weight of the i-th instance */
+                inline float GetWeight( size_t i ) const{
+                    if( weights.size() != 0 ) return weights[i];
+                    else return 1.0f;
+                }
+            };
+        public:
+            /*! \brief feature data content */
+            booster::FMatrixS data;
+            /*! \brief information fields */
+            Info info;
+        public:
+            /*! \brief default constructor */
+            DMatrix(void){}
+            /*! \brief get the number of instances */
+            inline size_t Size() const{
+                return info.labels.size();
+            }
+            /*!
+             * \brief load from text file
+             * \param fname name of text data
+             * \param silent whether to print information or not
+             */
+            inline void LoadText(const char* fname, bool silent = false){
+                data.Clear();
+                FILE* file = utils::FopenCheck(fname, "r");
+                float label; bool init = true;
+                char tmp[1024];
+                std::vector<booster::bst_uint> findex;
+                std::vector<booster::bst_float> fvalue;
+
+                while (fscanf(file, "%s", tmp) == 1){
+                    unsigned index; float value;
+                    if (sscanf(tmp, "%u:%f", &index, &value) == 2){
+                        findex.push_back(index); fvalue.push_back(value);
+                    }
+                    else{
+                        if (!init){
+                            info.labels.push_back(label);
+                            data.AddRow(findex, fvalue);
+                        }
+                        findex.clear(); fvalue.clear();
+                        utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format");
+                        init = false;
+                    }
+                }
+
+                info.labels.push_back(label);
+                data.AddRow(findex, fvalue);
+                // initialize column support as well
+                data.InitData();
+
+                if (!silent){
+                    printf("%ux%u matrix with %lu entries is loaded from %s\n",
+                           (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
+                }
+                fclose(file);
+                this->TryLoadGroup(fname, silent);
+                this->TryLoadWeight(fname, silent);
+            }
+            /*!
+             * \brief load from binary file
+             * \param fname name of binary data
+             * \param silent whether to print information or not
+             * \return whether loading succeeded
+             */
+            inline bool LoadBinary(const char* fname, bool silent = false){
+                FILE *fp = fopen64(fname, "rb");
+                if (fp == NULL) return false;
+                utils::FileStream fs(fp);
+                data.LoadBinary(fs);
+                info.labels.resize(data.NumRow());
+                utils::Assert(fs.Read(&info.labels[0], sizeof(float)* data.NumRow()) != 0, "DMatrix LoadBinary");
+                fs.Close();
+                // initialize column support as well
+                data.InitData();
+
+                if (!silent){
+                    printf("%ux%u matrix with %lu entries is loaded from %s\n",
+                           (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
+                }
+                this->TryLoadGroup(fname, silent);
+                this->TryLoadWeight(fname, silent);
+                return true;
+            }
+            /*!
+             * \brief save to binary file
+             * \param fname name of binary data
+             * \param silent whether to print information or not
+             */
+            inline void SaveBinary(const char* fname, bool silent = false){
+                // initialize column support as well
+                data.InitData();
+
+                utils::FileStream fs(utils::FopenCheck(fname, "wb"));
+                data.SaveBinary(fs);
+                fs.Write(&info.labels[0], sizeof(float)* data.NumRow());
+                fs.Close();
+                if (!silent){
+                    printf("%ux%u matrix with %lu entries is saved to %s\n",
+                           (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
+                }
+            }
+            /*!
+             * \brief cache-load data given a file name: if the file name ends with .buffer, directly load binary;
+             *     otherwise the function first checks whether fname + '.buffer' exists;
+             *     if the binary buffer exists it reads from the binary buffer, otherwise it loads from the text file
+             *     and tries to create a buffer file
+             * \param fname name of the data
+             * \param silent whether to print information or not
+             * \param savebuffer whether to save a binary buffer if the input is text
+             */
+            inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){
+                int len = strlen(fname);
+                if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
+                    this->LoadBinary(fname, silent); return;
+                }
+                char bname[1024];
+                sprintf(bname, "%s.buffer", fname);
+                if (!this->LoadBinary(bname, silent)){
+                    this->LoadText(fname, silent);
+                    if (savebuffer) this->SaveBinary(bname, silent);
+                }
+            }
+        private:
+            inline bool TryLoadGroup(const char* fname, bool silent = false){
+                std::string name = fname;
+                if (name.length() > 8 && !strcmp(fname + name.length() - 7, ".buffer")){
+                    name.resize( name.length() - 7 );
+                }
+                name += ".group";
+                // if group data exists, load it in
+                FILE *fi = fopen64(name.c_str(), "r");
+                if (fi == NULL) return false;
+                info.group_ptr.push_back(0);
+                unsigned nline;
+                while (fscanf(fi, "%u", &nline) == 1){
+                    info.group_ptr.push_back(info.group_ptr.back()+nline);
+                }
+                if(!silent){
+                    printf("%lu groups are loaded from %s\n", info.group_ptr.size()-1, name.c_str());
+                }
+                fclose(fi);
+                utils::Assert( info.group_ptr.back() == data.NumRow(), "DMatrix: group data does not match the number of rows in feature matrix" );
+                return true;
+            }
+            inline bool TryLoadWeight(const char* fname, bool silent = false){
+                std::string name = fname;
+                if (name.length() > 8 && !strcmp(fname + name.length() - 7, ".buffer")){
+                    name.resize( name.length() - 7 );
+                }
+                name += ".weight";
+                // if weight data exists, load it in
+                FILE *fi = fopen64(name.c_str(), "r");
+                if (fi == NULL) return false;
+                float wt;
+                while (fscanf(fi, "%f", &wt) == 1){
+                    info.weights.push_back( wt );
+                }
+                if(!silent){
+                    printf("loading weight from %s\n", name.c_str());
+                }
+                fclose(fi);
+                utils::Assert( info.weights.size() == data.NumRow(), "DMatrix: weight data does not match the number of rows in feature matrix" );
+                return true;
+            }
+        };
+    };
+};
+#endif
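For reference, this is the on-disk layout the loader above expects. A small sketch, with illustrative file names; the `.group` and `.weight` side files are optional and are discovered automatically by `TryLoadGroup`/`TryLoadWeight`:

```cpp
// Sketch: expected file layout (names illustrative).
//
// train.txt         one "label feature:value ..." instance per line, e.g.
//                     1 0:1.5 3:2.0
//                     0 1:0.4 2:1.1
// train.txt.group   one group size per line; sizes must sum to the number
//                   of rows (checked by utils::Assert in TryLoadGroup)
// train.txt.weight  one float weight per line, one per instance
#include <cstdio>
#include "regrank/xgboost_regrank_data.h"

int main(void) {
    xgboost::regrank::DMatrix dmat;
    // Loads train.txt.buffer if present; otherwise parses the text file
    // (plus optional side files) and writes the binary buffer.
    dmat.CacheLoad("train.txt");
    printf("%lu instances, %lu groups\n",
           (unsigned long)dmat.Size(),
           (unsigned long)(dmat.info.group_ptr.size() == 0
                               ? 0 : dmat.info.group_ptr.size() - 1));
    return 0;
}
```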
diff --git a/regression/xgboost_reg_eval.h b/regrank/xgboost_regrank_eval.h
similarity index 59%
rename from regression/xgboost_reg_eval.h
rename to regrank/xgboost_regrank_eval.h
index 137f44192..fa0694bf2 100644
--- a/regression/xgboost_reg_eval.h
+++ b/regrank/xgboost_regrank_eval.h
@@ -1,8 +1,8 @@
-#ifndef XGBOOST_REG_EVAL_H
-#define XGBOOST_REG_EVAL_H
+#ifndef XGBOOST_REGRANK_EVAL_H
+#define XGBOOST_REGRANK_EVAL_H
 /*!
-* \file xgboost_reg_eval.h
-* \brief evaluation metrics for regression and classification
+* \file xgboost_regrank_eval.h
+* \brief evaluation metrics for regression, classification and ranking
 * \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
 */
@@ -12,18 +12,19 @@
 #include "../utils/xgboost_utils.h"
 #include "../utils/xgboost_omp.h"
 #include "../utils/xgboost_random.h"
+#include "xgboost_regrank_data.h"
 
 namespace xgboost{
-    namespace regression{
+    namespace regrank{
         /*! \brief evaluator that evaluates the loss metrics */
         struct IEvaluator{
             /*!
             * \brief evaluate a specific metric
             * \param preds prediction
-            * \param labels label
+            * \param info information, including label etc.
             */
             virtual float Eval(const std::vector<float> &preds,
-                               const std::vector<float> &labels) const = 0;
+                               const DMatrix::Info &info ) const = 0;
             /*! \return name of metric */
             virtual const char *Name(void) const = 0;
         };
@@ -31,37 +32,62 @@ namespace xgboost{
         /*! \brief RMSE */
         struct EvalRMSE : public IEvaluator{
             virtual float Eval(const std::vector<float> &preds,
-                               const std::vector<float> &labels) const{
+                               const DMatrix::Info &info ) const {
                 const unsigned ndata = static_cast<unsigned>(preds.size());
-                float sum = 0.0;
-                #pragma omp parallel for reduction(+:sum) schedule( static )
+                float sum = 0.0, wsum = 0.0;
+                #pragma omp parallel for reduction(+:sum,wsum) schedule( static )
                 for (unsigned i = 0; i < ndata; ++i){
-                    float diff = preds[i] - labels[i];
-                    sum += diff * diff;
+                    const float wt = info.GetWeight(i);
+                    const float diff = info.labels[i] - preds[i];
+                    sum += diff*diff * wt;
+                    wsum += wt;
                 }
-                return sqrtf(sum / ndata);
+                return sqrtf(sum / wsum);
             }
             virtual const char *Name(void) const{
                 return "rmse";
             }
         };
 
+        /*! \brief negative log-likelihood */
+        struct EvalLogLoss : public IEvaluator{
+            virtual float Eval(const std::vector<float> &preds,
+                               const DMatrix::Info &info ) const {
+                const unsigned ndata = static_cast<unsigned>(preds.size());
+                float sum = 0.0f, wsum = 0.0f;
+                #pragma omp parallel for reduction(+:sum,wsum) schedule( static )
+                for (unsigned i = 0; i < ndata; ++i){
+                    const float y = info.labels[i];
+                    const float py = preds[i];
+                    const float wt = info.GetWeight(i);
+                    sum -= wt * ( y * std::log(py) + (1.0f - y)*std::log(1 - py) );
+                    wsum += wt;
+                }
+                return sum / wsum;
+            }
+            virtual const char *Name(void) const{
+                return "negllik";
+            }
+        };
+
         /*! \brief Error */
         struct EvalError : public IEvaluator{
             virtual float Eval(const std::vector<float> &preds,
-                               const std::vector<float> &labels) const{
+                               const DMatrix::Info &info ) const {
                 const unsigned ndata = static_cast<unsigned>(preds.size());
-                unsigned nerr = 0;
-                #pragma omp parallel for reduction(+:nerr) schedule( static )
+                float sum = 0.0f, wsum = 0.0f;
+                #pragma omp parallel for reduction(+:sum,wsum) schedule( static )
                 for (unsigned i = 0; i < ndata; ++i){
+                    const float wt = info.GetWeight(i);
                     if (preds[i] > 0.5f){
-                        if (labels[i] < 0.5f) nerr += 1;
-                    }
+                        if (info.labels[i] < 0.5f) sum += wt;
+                    }
                     else{
-                        if (labels[i] > 0.5f) nerr += 1;
+                        if (info.labels[i] >= 0.5f) sum += wt;
                     }
+                    wsum += wt;
                 }
-                return static_cast<float>(nerr) / ndata;
+                return sum / wsum;
             }
             virtual const char *Name(void) const{
                 return "error";
@@ -74,7 +100,8 @@ namespace xgboost{
                 return a.first > b.first;
             }
             virtual float Eval( const std::vector<float> &preds,
-                                const std::vector<float> &labels ) const{
+                                const DMatrix::Info &info ) const {
+                const std::vector<float> &labels = info.labels;
                 const unsigned ndata = static_cast<unsigned>( preds.size() );
                 std::vector< std::pair<float, float> > rec;
                 for( unsigned i = 0; i < ndata; ++ i ){
@@ -100,54 +127,35 @@ namespace xgboost{
                 return "auc";
             }
         };
-
-        /*! \brief negative log-likelihood */
-        struct EvalLogLoss : public IEvaluator{
-            virtual float Eval(const std::vector<float> &preds,
-                               const std::vector<float> &labels) const{
-                const unsigned ndata = static_cast<unsigned>(preds.size());
-                unsigned nerr = 0;
-                #pragma omp parallel for reduction(+:nerr) schedule( static )
-                for (unsigned i = 0; i < ndata; ++i){
-                    const float y = labels[i];
-                    const float py = preds[i];
-                    nerr -= y * std::log(py) + (1.0f - y)*std::log(1 - py);
-                }
-                return static_cast<float>(nerr) / ndata;
-            }
-            virtual const char *Name(void) const{
-                return "negllik";
-            }
-        };
     };
 
-    namespace regression{
+    namespace regrank{
         /*! \brief a set of evaluators */
         struct EvalSet{
         public:
             inline void AddEval(const char *name){
-                if (!strcmp(name, "rmse")) evals_.push_back(&rmse_);
-                if (!strcmp(name, "error")) evals_.push_back(&error_);
-                if (!strcmp(name, "logloss")) evals_.push_back(&logloss_);
-                if (!strcmp( name, "auc")) evals_.push_back( &auc_ );
+                for( size_t i = 0; i < evals_.size(); ++ i ){
+                    if(!strcmp(name, evals_[i]->Name())) return;
+                }
+                if (!strcmp(name, "rmse")) evals_.push_back( new EvalRMSE() );
+                if (!strcmp(name, "error")) evals_.push_back( new EvalError() );
+                if (!strcmp(name, "logloss")) evals_.push_back( new EvalLogLoss() );
+                if (!strcmp( name, "auc")) evals_.push_back( new EvalAuc() );
             }
-            inline void Init(void){
-                std::sort(evals_.begin(), evals_.end());
-                evals_.resize(std::unique(evals_.begin(), evals_.end()) - evals_.begin());
+            ~EvalSet(){
+                for( size_t i = 0; i < evals_.size(); ++ i ){
+                    delete evals_[i];
+                }
             }
             inline void Eval(FILE *fo, const char *evname,
                              const std::vector<float> &preds,
-                             const std::vector<float> &labels) const{
+                             const DMatrix::Info &info ) const{
                 for (size_t i = 0; i < evals_.size(); ++i){
                     float res = evals_[i]->Eval(preds, info);
                     fprintf(fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res);
                 }
             }
         private:
-            EvalRMSE rmse_;
-            EvalError error_;
-            EvalAuc auc_;
-            EvalLogLoss logloss_;
             std::vector<const IEvaluator*> evals_;
         };
     };
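Every metric is now computed against `DMatrix::Info` and weighted by `info.GetWeight(i)`, and `EvalSet::AddEval` deduplicates by metric name instead of sorting pointers (the old `Init()` is gone; evaluators are heap-allocated and freed in the destructor). A brief usage sketch of the reworked `EvalSet`; the function name and output stream are illustrative:

```cpp
// Sketch: using EvalSet directly (names from this patch).
#include <cstdio>
#include <vector>
#include "regrank/xgboost_regrank_eval.h"

void report(const std::vector<float> &preds,
            const xgboost::regrank::DMatrix::Info &info) {
    xgboost::regrank::EvalSet eval_set;
    eval_set.AddEval("rmse");
    eval_set.AddEval("rmse");   // ignored: a metric is only added once
    eval_set.AddEval("error");
    // Each metric is weighted by info.GetWeight(i) when weights exist.
    eval_set.Eval(stdout, "train", preds, info);
    fprintf(stdout, "\n");
}
```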
diff --git a/regression/xgboost_reg_main.cpp b/regrank/xgboost_regrank_main.cpp
similarity index 98%
rename from regression/xgboost_reg_main.cpp
rename to regrank/xgboost_regrank_main.cpp
index c780caf36..862837c97 100644
--- a/regression/xgboost_reg_main.cpp
+++ b/regrank/xgboost_regrank_main.cpp
@@ -4,13 +4,13 @@
 #include <ctime>
 #include <string>
 #include <cstring>
-#include "xgboost_reg.h"
+#include "xgboost_regrank.h"
 #include "../utils/xgboost_fmap.h"
 #include "../utils/xgboost_random.h"
 #include "../utils/xgboost_config.h"
 
 namespace xgboost{
-    namespace regression{
+    namespace regrank{
         /*!
         * \brief wrapping the training process of the gradient boosting regression model,
         *   given the configuration
@@ -273,13 +273,13 @@ namespace xgboost{
             DMatrix data;
             std::vector<DMatrix*> deval;
             utils::FeatMap fmap;
-            RegBoostLearner learner;
+            RegRankBoostLearner learner;
         };
     };
 };
 
 int main( int argc, char *argv[] ){
     xgboost::random::Seed( 0 );
-    xgboost::regression::RegBoostTask tsk;
+    xgboost::regrank::RegBoostTask tsk;
     return tsk.Run( argc, argv );
 }
diff --git a/regrank/xgboost_regrank_obj.h b/regrank/xgboost_regrank_obj.h
new file mode 100644
index 000000000..541b20b16
--- /dev/null
+++ b/regrank/xgboost_regrank_obj.h
@@ -0,0 +1,112 @@
+#ifndef XGBOOST_REGRANK_OBJ_H
+#define XGBOOST_REGRANK_OBJ_H
+/*!
+ * \file xgboost_regrank_obj.h
+ * \brief defines the objective function interface used in xgboost for regression and ranking
+ * \author Tianqi Chen, Kailong Chen
+ */
+#include "xgboost_regrank_data.h"
+
+namespace xgboost{
+    namespace regrank{
+        /*! \brief interface of objective function */
+        class IObjFunction{
+        public:
+            /*! \brief virtual destructor */
+            virtual ~IObjFunction(void){}
+            /*!
+             * \brief set parameters from outside
+             * \param name name of the parameter
+             * \param val value of the parameter
+             */
+            virtual void SetParam(const char *name, const char *val) = 0;
+            /*!
+             * \brief get gradient over each of the predictions, given existing information
+             * \param preds prediction of current round
+             * \param info information about labels, weights, and groups in ranking
+             * \param grad first order gradient over each of preds
+             * \param hess second order gradient over each of preds
+             */
+            virtual void GetGradient(const std::vector<float>& preds,
+                                     const DMatrix::Info &info,
+                                     std::vector<float> &grad,
+                                     std::vector<float> &hess ) = 0;
+            /*! \return the default evaluation metric for the problem */
+            virtual const char* DefaultEvalMetric(void) = 0;
+            /*!
+             * \brief transform prediction values, only called when prediction is requested
+             * \param preds prediction values, the result is saved back to this vector
+             */
+            virtual void PredTransform(std::vector<float> &preds){}
+        };
+    };
+
+    namespace regrank{
+        /*! \brief defines commonly used loss functions and their gradients */
+        struct LossType{
+        public:
+            const static int kLinearSquare = 0;
+            const static int kLogisticNeglik = 1;
+            const static int kLogisticClassify = 2;
+        public:
+            /*! \brief indicate which type we are using */
+            int loss_type;
+        public:
+            /*!
+             * \brief transform the linear sum to prediction
+             * \param x linear sum of boosting ensemble
+             * \return transformed prediction
+             */
+            inline float PredTransform(float x){
+                switch (loss_type){
+                case kLinearSquare: return x;
+                case kLogisticClassify:
+                case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
+                default: utils::Error("unknown loss_type"); return 0.0f;
+                }
+            }
+            /*!
+             * \brief calculate first order gradient of loss, given transformed prediction
+             * \param predt transformed prediction
+             * \param label true label
+             * \return first order gradient
+             */
+            inline float FirstOrderGradient(float predt, float label) const{
+                switch (loss_type){
+                case kLinearSquare: return predt - label;
+                case kLogisticClassify:
+                case kLogisticNeglik: return predt - label;
+                default: utils::Error("unknown loss_type"); return 0.0f;
+                }
+            }
+            /*!
+             * \brief calculate second order gradient of loss, given transformed prediction
+             * \param predt transformed prediction
+             * \param label true label
+             * \return second order gradient
+             */
+            inline float SecondOrderGradient(float predt, float label) const{
+                switch (loss_type){
+                case kLinearSquare: return 1.0f;
+                case kLogisticClassify:
+                case kLogisticNeglik: return predt * (1 - predt);
+                default: utils::Error("unknown loss_type"); return 0.0f;
+                }
+            }
+        };
+    };
+};
+
+#include "xgboost_regrank_obj.hpp"
+
+namespace xgboost{
+    namespace regrank{
+        IObjFunction* CreateObjFunction( const char *name ){
+            if( !strcmp("reg", name ) ) return new RegressionObj();
+            utils::Error("unknown objective function type");
+            return NULL;
+        }
+    };
+};
+#endif
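Implementing a new objective against this interface only requires the three pure virtuals plus an optional `PredTransform`. Below is a hypothetical `MyWeightedSquareObj` sketching the contract — it is not part of this patch, and a real ranking objective would also consult `info.group_ptr`:

```cpp
// Sketch only: a hypothetical objective against the IObjFunction contract.
#include <vector>
#include "regrank/xgboost_regrank_obj.h"

namespace xgboost{
    namespace regrank{
        class MyWeightedSquareObj : public IObjFunction{
        public:
            virtual void SetParam(const char *name, const char *val){}
            virtual void GetGradient(const std::vector<float>& preds,
                                     const DMatrix::Info &info,
                                     std::vector<float> &grad,
                                     std::vector<float> &hess){
                grad.resize(preds.size()); hess.resize(preds.size());
                for (size_t j = 0; j < preds.size(); ++j){
                    // squared error, scaled by the optional instance weight
                    grad[j] = (preds[j] - info.labels[j]) * info.GetWeight(j);
                    hess[j] = info.GetWeight(j);
                }
            }
            virtual const char* DefaultEvalMetric(void){ return "rmse"; }
        };
    };
};
// To make it reachable, CreateObjFunction would gain one more branch:
//   if( !strcmp("myobj", name) ) return new MyWeightedSquareObj();
```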
+ * \file xgboost_regrank_obj.hpp
+ * \brief implementation of objective functions
+ * \author Tianqi Chen, Kailong Chen
+ */
+namespace xgboost{
+    namespace regrank{
+        class RegressionObj : public IObjFunction{
+        public:
+            RegressionObj(void){
+                loss.loss_type = LossType::kLinearSquare;
+            }
+            virtual ~RegressionObj(){}
+            virtual void SetParam(const char *name, const char *val){
+                if( !strcmp( "loss_type", name ) ) loss.loss_type = atoi( val );
+            }
+            virtual void GetGradient(const std::vector<float>& preds,
+                                     const DMatrix::Info &info,
+                                     std::vector<float> &grad,
+                                     std::vector<float> &hess ) {
+                grad.resize(preds.size()); hess.resize(preds.size());
+
+                const unsigned ndata = static_cast<unsigned>(preds.size());
+                #pragma omp parallel for schedule( static )
+                for (unsigned j = 0; j < ndata; ++j){
+                    grad[j] = loss.FirstOrderGradient(preds[j], info.labels[j]) * info.GetWeight(j);
+                    hess[j] = loss.SecondOrderGradient(preds[j], info.labels[j]) * info.GetWeight(j);
+                }
+            }
+            virtual const char* DefaultEvalMetric(void) {
+                if( loss.loss_type == LossType::kLogisticClassify ) return "error";
+                else return "rmse";
+            }
+            virtual void PredTransform(std::vector<float> &preds){
+                const unsigned ndata = static_cast<unsigned>(preds.size());
+                #pragma omp parallel for schedule( static )
+                for (unsigned j = 0; j < ndata; ++j){
+                    preds[j] = loss.PredTransform( preds[j] );
+                }
+            }
+        private:
+            LossType loss;
+        };
+    };
+
+    namespace regrank{
+        // TODO: rank objectives
+    };
+};
+#endif
diff --git a/regression/xgboost_reg_data.h b/regression/xgboost_reg_data.h
deleted file mode 100644
index b00eb1c94..000000000
--- a/regression/xgboost_reg_data.h
+++ /dev/null
@@ -1,141 +0,0 @@
-#ifndef XGBOOST_REG_DATA_H
-#define XGBOOST_REG_DATA_H
-
-/*!
- * \file xgboost_reg_data.h
- * \brief input data structure for regression and binary classification task.
- *     Format: the data file contains one instance per line, each line formatted as
- *       label [feature index:feature value]+
- * \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
- */
-#include <vector>
-#include <string>
-#include "../booster/xgboost_data.h"
-#include "../utils/xgboost_utils.h"
-#include "../utils/xgboost_stream.h"
-
-namespace xgboost{
-    namespace regression{
-        /*! \brief data matrix for regression content */
-        struct DMatrix{
-        public:
-            /*! \brief feature data content */
-            booster::FMatrixS data;
-            /*! \brief label of each instance */
-            std::vector<float> labels;
-        public:
-            /*! \brief default constructor */
-            DMatrix(void){}
-
-            /*! \brief get the number of instances */
-            inline size_t Size() const{
-                return labels.size();
-            }
-            /*!
-             * \brief load from text file
-             * \param fname name of text data
-             * \param silent whether to print information or not
-             */
-            inline void LoadText(const char* fname, bool silent = false){
-                data.Clear();
-                FILE* file = utils::FopenCheck(fname, "r");
-                float label; bool init = true;
-                char tmp[1024];
-                std::vector<booster::bst_uint> findex;
-                std::vector<booster::bst_float> fvalue;
-
-                while (fscanf(file, "%s", tmp) == 1){
-                    unsigned index; float value;
-                    if (sscanf(tmp, "%u:%f", &index, &value) == 2){
-                        findex.push_back(index); fvalue.push_back(value);
-                    }
-                    else{
-                        if (!init){
-                            labels.push_back(label);
-                            data.AddRow(findex, fvalue);
-                        }
-                        findex.clear(); fvalue.clear();
-                        utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format");
-                        init = false;
-                    }
-                }
-
-                labels.push_back(label);
-                data.AddRow(findex, fvalue);
-                // initialize column support as well
-                data.InitData();
-
-                if (!silent){
-                    printf("%ux%u matrix with %lu entries is loaded from %s\n",
-                           (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
-                }
-                fclose(file);
-            }
-            /*!
-             * \brief load from binary file
-             * \param fname name of binary data
-             * \param silent whether to print information or not
-             * \return whether loading succeeded
-             */
-            inline bool LoadBinary(const char* fname, bool silent = false){
-                FILE *fp = fopen64(fname, "rb");
-                if (fp == NULL) return false;
-                utils::FileStream fs(fp);
-                data.LoadBinary(fs);
-                labels.resize(data.NumRow());
-                utils::Assert(fs.Read(&labels[0], sizeof(float)* data.NumRow()) != 0, "DMatrix LoadBinary");
-                fs.Close();
-                // initialize column support as well
-                data.InitData();
-
-                if (!silent){
-                    printf("%ux%u matrix with %lu entries is loaded from %s\n",
-                           (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
-                }
-                return true;
-            }
-            /*!
-             * \brief save to binary file
-             * \param fname name of binary data
-             * \param silent whether to print information or not
-             */
-            inline void SaveBinary(const char* fname, bool silent = false){
-                // initialize column support as well
-                data.InitData();
-
-                utils::FileStream fs(utils::FopenCheck(fname, "wb"));
-                data.SaveBinary(fs);
-                fs.Write(&labels[0], sizeof(float)* data.NumRow());
-                fs.Close();
-                if (!silent){
-                    printf("%ux%u matrix with %lu entries is saved to %s\n",
-                           (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
-                }
-            }
-            /*!
-             * \brief cache-load data given a file name: if the file name ends with .buffer, directly load binary;
-             *     otherwise the function first checks whether fname + '.buffer' exists;
-             *     if the binary buffer exists it reads from the binary buffer, otherwise it loads from the text file
-             *     and tries to create a buffer file
-             * \param fname name of the data
-             * \param silent whether to print information or not
-             * \param savebuffer whether to save a binary buffer if the input is text
-             */
-            inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){
-                int len = strlen(fname);
-                if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
-                    this->LoadBinary(fname, silent); return;
-                }
-                char bname[1024];
-                sprintf(bname, "%s.buffer", fname);
-                if (!this->LoadBinary(bname, silent)){
-                    this->LoadText(fname, silent);
-                    if (savebuffer) this->SaveBinary(bname, silent);
-                }
-            }
-        };
-    };
-};
-#endif
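A closing note on the `LossType` helpers introduced in `xgboost_regrank_obj.h`: the gradients are taken with respect to the raw margin but evaluated at the transformed prediction, which is why `FirstOrderGradient(p, y) = p - y` and `SecondOrderGradient(p, y) = p (1 - p)` for the logistic losses. A stand-alone finite-difference sketch checking this (file paths and tolerances illustrative):

```cpp
// Sketch: verify LossType's analytic gradients against finite differences
// of the logistic negative log-likelihood as a function of the margin x.
#include <cstdio>
#include <cmath>
#include "regrank/xgboost_regrank_obj.h"

// l(x) = -[ y*log(p) + (1-y)*log(1-p) ],  p = sigmoid(x)
static float nll(float x, float y){
    const float p = 1.0f / (1.0f + std::exp(-x));
    return -(y * std::log(p) + (1.0f - y) * std::log(1.0f - p));
}

int main(void){
    xgboost::regrank::LossType loss;
    loss.loss_type = xgboost::regrank::LossType::kLogisticNeglik;
    const float y = 1.0f, x = 0.3f, eps = 1e-3f;
    const float p = loss.PredTransform(x);   // transformed prediction
    const float g_fd = (nll(x + eps, y) - nll(x - eps, y)) / (2 * eps);
    const float h_fd = (nll(x + eps, y) - 2 * nll(x, y) + nll(x - eps, y)) / (eps * eps);
    printf("grad: analytic=%f fd=%f\n", loss.FirstOrderGradient(p, y), g_fd);
    printf("hess: analytic=%f fd=%f\n", loss.SecondOrderGradient(p, y), h_fd);
    return 0;
}
```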