make regression module compatible with rank loss; now supports weighted loss

This commit is contained in:
tqchen 2014-04-29 16:16:02 -07:00
parent 7a79c009ce
commit 31edfda03c
8 changed files with 478 additions and 346 deletions

View File: Makefile

@ -10,9 +10,8 @@ OBJ =
all: $(BIN) $(OBJ)
export LDFLAGS= -pthread -lm
xgboost: regression/xgboost_reg_main.cpp regression/*.h booster/*.h booster/*/*.hpp booster/*.hpp
xgboost: regrank/xgboost_regrank_main.cpp regrank/*.h booster/*.h booster/*/*.hpp booster/*.hpp
#xgboost: rank/xgboost_rank_main.cpp base/*.h rank/*.h booster/*.h booster/*/*.hpp booster/*.hpp
$(BIN) :
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)

View File: regrank/xgboost_regrank.h

@ -1,28 +1,31 @@
#ifndef XGBOOST_REG_H
#define XGBOOST_REG_H
#ifndef XGBOOST_REGRANK_H
#define XGBOOST_REGRANK_H
/*!
* \file xgboost_reg.h
* \brief class for gradient boosted regression
* \file xgboost_regrank.h
* \brief class for gradient boosted regression and ranking
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cmath>
#include <cstdlib>
#include <cstring>
#include "xgboost_reg_data.h"
#include "xgboost_reg_eval.h"
#include "xgboost_regrank_data.h"
#include "xgboost_regrank_eval.h"
#include "xgboost_regrank_obj.h"
#include "../utils/xgboost_omp.h"
#include "../booster/xgboost_gbmbase.h"
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_stream.h"
namespace xgboost{
namespace regression{
/*! \brief class for gradient boosted regression */
class RegBoostLearner{
namespace regrank{
/*! \brief class for gradient boosted regression and ranking */
class RegRankBoostLearner{
public:
/*! \brief constructor */
RegBoostLearner(void){
RegRankBoostLearner(void){
silent = 0;
obj_ = NULL;
name_obj_ = "reg";
}
/*!
* \brief a regression booster associated with training and evaluating data
@ -30,7 +33,7 @@ namespace xgboost{
* \param evals array of evaluating data
* \param evname name of evaluation data, used to print statistics
*/
RegBoostLearner(const DMatrix *train,
RegRankBoostLearner(const DMatrix *train,
const std::vector<DMatrix *> &evals,
const std::vector<std::string> &evname){
silent = 0;
@ -83,8 +86,10 @@ namespace xgboost{
inline void SetParam(const char *name, const char *val){
if (!strcmp(name, "silent")) silent = atoi(val);
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
if (!strcmp(name, "objective") ) name_obj_ = val;
mparam.SetParam(name, val);
base_gbm.SetParam(name, val);
cfg_.push_back( std::make_pair( std::string(name), std::string(val) ) );
}
/*!
* \brief initialize solver before training, called before training
@ -92,13 +97,11 @@ namespace xgboost{
*/
inline void InitTrainer(void){
base_gbm.InitTrainer();
if (mparam.loss_type == kLogisticClassify){
evaluator_.AddEval("error");
obj_ = CreateObjFunction( name_obj_.c_str() );
for( size_t i = 0; i < cfg_.size(); ++ i ){
obj_->SetParam( cfg_[i].first.c_str(), cfg_[i].second.c_str() );
}
else{
evaluator_.AddEval("rmse");
}
evaluator_.Init();
evaluator_.AddEval( obj_->DefaultEvalMetric() );
}
/*!
* \brief initialize the current data storage for the model; call this function when the model is used for the first time
@ -146,7 +149,7 @@ namespace xgboost{
*/
inline void UpdateOneIter(int iter){
this->PredictBuffer(preds_, *train_, 0);
this->GetGradient(preds_, train_->labels, grad_, hess_);
obj_->GetGradient(preds_, train_->info, grad_, hess_);
std::vector<unsigned> root_index;
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
}
@ -162,7 +165,8 @@ namespace xgboost{
for (size_t i = 0; i < evals_.size(); ++i){
std::vector<float> &preds = this->eval_preds_[i];
this->PredictBuffer(preds, *evals_[i], buffer_offset);
evaluator_.Eval(fo, evname_[i].c_str(), preds, (*evals_[i]).labels);
obj_->PredTransform(preds);
evaluator_.Eval(fo, evname_[i].c_str(), preds, evals_[i]->info);
buffer_offset += static_cast<int>(evals_[i]->Size());
}
fprintf(fo, "\n");
@ -171,18 +175,17 @@ namespace xgboost{
/*! \brief get prediction, without buffering */
inline void Predict(std::vector<float> &preds, const DMatrix &data){
preds.resize(data.Size());
const unsigned ndata = static_cast<unsigned>(data.Size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
preds[j] = mparam.PredTransform
(mparam.base_score + base_gbm.Predict(data.data, j, -1));
preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, -1);
}
obj_->PredTransform( preds );
}
public:
/*!
* \brief update the model for one iteration
* \param iteration iteration number
* \brief interactive update
* \param action action type
*/
inline void UpdateInteract(std::string action){
this->InteractPredict(preds_, *train_, 0);
@ -198,7 +201,7 @@ namespace xgboost{
base_gbm.DelteBooster(); return;
}
this->GetGradient(preds_, train_->labels, grad_, hess_);
obj_->GetGradient(preds_, train_->info, grad_, hess_);
std::vector<unsigned> root_index;
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
@ -216,9 +219,9 @@ namespace xgboost{
const unsigned ndata = static_cast<unsigned>(data.Size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
preds[j] = mparam.PredTransform
(mparam.base_score + base_gbm.InteractPredict(data.data, j, buffer_offset + j));
preds[j] = mparam.base_score + base_gbm.InteractPredict(data.data, j, buffer_offset + j);
}
obj_->PredTransform( preds );
}
/*! \brief repredict trial */
inline void InteractRePredict(const DMatrix &data, unsigned buffer_offset){
@ -232,37 +235,13 @@ namespace xgboost{
/*! \brief get the predictions into buffer (before objective transform), given data */
inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset){
preds.resize(data.Size());
const unsigned ndata = static_cast<unsigned>(data.Size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
preds[j] = mparam.PredTransform
(mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j));
preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j);
}
}
/*! \brief get the first order and second order gradient, given the transformed predictions and labels */
inline void GetGradient(const std::vector<float> &preds,
const std::vector<float> &labels,
std::vector<float> &grad,
std::vector<float> &hess){
grad.resize(preds.size()); hess.resize(preds.size());
const unsigned ndata = static_cast<unsigned>(preds.size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
grad[j] = mparam.FirstOrderGradient(preds[j], labels[j]);
hess[j] = mparam.SecondOrderGradient(preds[j], labels[j]);
}
}
private:
enum LossType{
kLinearSquare = 0,
kLogisticNeglik = 1,
kLogisticClassify = 2
};
/*! \brief training parameter for regression */
struct ModelParam{
/* \brief global bias */
@ -277,7 +256,6 @@ namespace xgboost{
ModelParam(void){
base_score = 0.5f;
loss_type = 0;
num_feature = 0;
memset(reserved, 0, sizeof(reserved));
}
/*!
@ -299,92 +277,6 @@ namespace xgboost{
base_score = -logf(1.0f / base_score - 1.0f);
}
}
/*!
* \brief transform the linear sum to prediction
* \param x linear sum of boosting ensemble
* \return transformed prediction
*/
inline float PredTransform(float x){
switch (loss_type){
case kLinearSquare: return x;
case kLogisticClassify:
case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculate first order gradient of loss, given transformed prediction
* \param predt transformed prediction
* \param label true label
* \return first order gradient
*/
inline float FirstOrderGradient(float predt, float label) const{
switch (loss_type){
case kLinearSquare: return predt - label;
case kLogisticClassify:
case kLogisticNeglik: return predt - label;
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculate second order gradient of loss, given transformed prediction
* \param predt transformed prediction
* \param label true label
* \return second order gradient
*/
inline float SecondOrderGradient(float predt, float label) const{
switch (loss_type){
case kLinearSquare: return 1.0f;
case kLogisticClassify:
case kLogisticNeglik: return predt * (1 - predt);
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculating the loss, given the predictions, labels and the loss type
* \param preds the given predictions
* \param labels the given labels
* \return the specified loss
*/
inline float Loss(const std::vector<float> &preds, const std::vector<float> &labels) const{
switch (loss_type){
case kLinearSquare: return SquareLoss(preds, labels);
case kLogisticNeglik:
case kLogisticClassify: return NegLoglikelihoodLoss(preds, labels);
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculating the square loss, given the predictions and labels
* \param preds the given predictions
* \param labels the given labels
* \return the summation of square loss
*/
inline float SquareLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
float ans = 0.0;
for (size_t i = 0; i < preds.size(); i++){
float dif = preds[i] - labels[i];
ans += dif * dif;
}
return ans;
}
/*!
* \brief calculating the negative log-likelihood loss, given the predictions and labels
* \param preds the given predictions
* \param labels the given labels
* \return the summation of negative log-likelihood loss
*/
inline float NegLoglikelihoodLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
float ans = 0.0;
for (size_t i = 0; i < preds.size(); i++)
ans -= labels[i] * logf(preds[i]) + (1 - labels[i]) * logf(1 - preds[i]);
return ans;
}
};
private:
int silent;
@ -395,6 +287,11 @@ namespace xgboost{
std::vector<DMatrix *> evals_;
std::vector<std::string> evname_;
std::vector<unsigned> buffer_index_;
// objective function
IObjFunction *obj_;
// name of objective function
std::string name_obj_;
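// parameter settings buffered by SetParam, replayed into obj_ when InitTrainer creates it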
std::vector< std::pair<std::string, std::string> > cfg_;
private:
std::vector<float> grad_, hess_, preds_;
std::vector< std::vector<float> > eval_preds_;
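A minimal usage sketch of the new objective plumbing (not part of this commit; data loading and the training loop are elided, and the parameter values are illustrative). SetParam buffers every setting into cfg_; InitTrainer then creates obj_ through CreateObjFunction, replays the buffered settings into it, and registers its default evaluation metric. A training loop would afterwards call UpdateOneIter, which now delegates gradient computation to obj_->GetGradient.

#include "xgboost_regrank.h"

int main(void){
    xgboost::regrank::RegRankBoostLearner learner;
    learner.SetParam("objective", "reg"); // picks RegressionObj in CreateObjFunction
    learner.SetParam("loss_type", "2");   // LossType::kLogisticClassify, forwarded via cfg_
    learner.InitTrainer();                // creates obj_, adds DefaultEvalMetric() ("error")
    return 0;
}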

View File: regrank/xgboost_regrank_data.h

@ -0,0 +1,205 @@
#ifndef XGBOOST_REGRANK_DATA_H
#define XGBOOST_REGRANK_DATA_H
/*!
* \file xgboost_regrank_data.h
* \brief input data structure for regression, binary classification, and ranking.
* Format:
* The data should contain each data instance in each line.
* The format of line data is as below:
* label [feature index:feature value]+
* When using ranking, an additional group file with suffix .group must be provided, giving the number of instances in each group
* When using weight-aware classification (regression), an additional weight file with suffix .weight must be provided, giving the weight of each instance
*
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cstdio>
#include <vector>
#include "../booster/xgboost_data.h"
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_stream.h"
namespace xgboost{
/*! \brief namespace to handle regression and rank */
namespace regrank{
/*! \brief data matrix for regression content */
struct DMatrix{
public:
/*! \brief data information besides the features */
struct Info{
/*! \brief label of each instance */
std::vector<float> labels;
/*! \brief the index of the beginning and end of each group; needed when the learning task is ranking */
std::vector<unsigned> group_ptr;
/*! \brief weights of each instance, optional */
std::vector<float> weights;
/*! \brief get the weight of the i-th instance */
inline float GetWeight( size_t i ) const{
if( weights.size() != 0 ) return weights[i];
else return 1.0f;
}
};
public:
/*! \brief feature data content */
booster::FMatrixS data;
/*! \brief information fields */
Info info;
public:
/*! \brief default constructor */
DMatrix(void){}
/*! \brief get the number of instances */
inline size_t Size() const{
return info.labels.size();
}
/*!
* \brief load from text file
* \param fname name of text data
* \param silent whether to print information
*/
inline void LoadText(const char* fname, bool silent = false){
data.Clear();
FILE* file = utils::FopenCheck(fname, "r");
float label; bool init = true;
char tmp[1024];
std::vector<booster::bst_uint> findex;
std::vector<booster::bst_float> fvalue;
while (fscanf(file, "%s", tmp) == 1){
unsigned index; float value;
if (sscanf(tmp, "%u:%f", &index, &value) == 2){
findex.push_back(index); fvalue.push_back(value);
}
else{
if (!init){
info.labels.push_back(label);
data.AddRow(findex, fvalue);
}
findex.clear(); fvalue.clear();
utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format");
init = false;
}
}
info.labels.push_back(label);
data.AddRow(findex, fvalue);
// initialize column support as well
data.InitData();
if (!silent){
printf("%ux%u matrix with %lu entries is loaded from %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
}
fclose(file);
this->TryLoadGroup(fname, silent);
this->TryLoadWeight(fname, silent);
}
/*!
* \brief load from binary file
* \param fname name of binary data
* \param silent whether to print information
* \return whether loading succeeded
*/
inline bool LoadBinary(const char* fname, bool silent = false){
FILE *fp = fopen64(fname, "rb");
if (fp == NULL) return false;
utils::FileStream fs(fp);
data.LoadBinary(fs);
info.labels.resize(data.NumRow());
utils::Assert(fs.Read(&info.labels[0], sizeof(float)* data.NumRow()) != 0, "DMatrix LoadBinary");
fs.Close();
// initialize column support as well
data.InitData();
if (!silent){
printf("%ux%u matrix with %lu entries is loaded from %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
}
this->TryLoadGroup(fname, silent);
this->TryLoadWeight(fname, silent);
return true;
}
/*!
* \brief save to binary file
* \param fname name of binary data
* \param silent whether to print information
*/
inline void SaveBinary(const char* fname, bool silent = false){
// initialize column support as well
data.InitData();
utils::FileStream fs(utils::FopenCheck(fname, "wb"));
data.SaveBinary(fs);
fs.Write(&info.labels[0], sizeof(float)* data.NumRow());
fs.Close();
if (!silent){
printf("%ux%u matrix with %lu entries is saved to %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
}
}
/*!
* \brief cache-load data given a file name: if the filename ends with .buffer, load the binary directly;
* otherwise the function first checks whether fname + '.buffer' exists;
* if the binary buffer exists it reads from the binary buffer, otherwise it loads from the text file
* and tries to create a buffer file
* \param fname name of the data file
* \param silent whether to print information
* \param savebuffer whether to save a binary buffer if the input is text
*/
inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){
int len = strlen(fname);
if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
this->LoadBinary(fname, silent); return;
}
char bname[1024];
sprintf(bname, "%s.buffer", fname);
if (!this->LoadBinary(bname, silent)){
this->LoadText(fname, silent);
if (savebuffer) this->SaveBinary(bname, silent);
}
}
private:
inline bool TryLoadGroup(const char* fname, bool silent = false){
std::string name = fname;
if (name.length() > 8 && !strcmp(fname + name.length() - 7, ".buffer")){
name.resize( name.length() - 7 );
}
name += ".group";
// if group data exists, load it in
FILE *fi = fopen64(name.c_str(), "r");
if (fi == NULL) return false;
info.group_ptr.push_back(0);
unsigned nline;
while (fscanf(fi, "%u", &nline) == 1){
info.group_ptr.push_back(info.group_ptr.back()+nline);
}
if(!silent){
printf("%lu groups are loaded from %s\n", info.group_ptr.size()-1, name.c_str());
}
fclose(fi);
utils::Assert( info.group_ptr.back() == data.NumRow(), "DMatrix: group data does not match the number of rows in feature matrix" );
return true;
}
inline bool TryLoadWeight(const char* fname, bool silent = false){
std::string name = fname;
if (name.length() > 8 && !strcmp(fname + name.length() - 7, ".buffer")){
name.resize( name.length() - 7 );
}
name += ".weight";
// if weight data exists, load it in
FILE *fi = fopen64(name.c_str(), "r");
if (fi == NULL) return false;
float wt;
while (fscanf(fi, "%f", &wt) == 1){
info.weights.push_back( wt );
}
if(!silent){
printf("loading weight from %s\n", name.c_str());
}
fclose(fi);
utils::Assert( info.weights.size() == data.NumRow(), "DMatrix: weight data does not match the number of rows in feature matrix" );
return true;
}
};
};
};
#endif
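As an illustration of the format described in the header comment above, a hypothetical three-instance training set could consist of train.txt plus two optional side files (all filenames here are examples, not part of this commit):

train.txt (label followed by sparse index:value pairs):
    1 0:0.5 3:1.2
    0 1:2.0
    1 0:0.1 2:0.7

train.txt.weight (one weight per instance):
    1.0
    0.5
    2.0

train.txt.group (instances per group, summing to the number of rows):
    2
    1

Calling CacheLoad("train.txt") on a DMatrix then parses the text, picks up both side files via TryLoadGroup and TryLoadWeight, and writes train.txt.buffer so later runs can load the binary directly.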

View File: regrank/xgboost_regrank_eval.h

@ -1,8 +1,8 @@
#ifndef XGBOOST_REG_EVAL_H
#define XGBOOST_REG_EVAL_H
#ifndef XGBOOST_REGRANK_EVAL_H
#define XGBOOST_REGRANK_EVAL_H
/*!
* \file xgboost_reg_eval.h
* \brief evaluation metrics for regression and classification
* \file xgboost_regrank_eval.h
* \brief evaluation metrics for regression, classification, and ranking
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
@ -12,18 +12,19 @@
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_omp.h"
#include "../utils/xgboost_random.h"
#include "xgboost_regrank_data.h"
namespace xgboost{
namespace regression{
namespace regrank{
/*! \brief evaluator that evaluates the loss metrics */
struct IEvaluator{
/*!
* \brief evaluate a specific metric
* \param preds prediction
* \param labels label
* \param info information, including label etc.
*/
virtual float Eval(const std::vector<float> &preds,
const std::vector<float> &labels) const = 0;
const DMatrix::Info &info ) const = 0;
/*! \return name of metric */
virtual const char *Name(void) const = 0;
};
@ -31,37 +32,62 @@ namespace xgboost{
/*! \brief RMSE */
struct EvalRMSE : public IEvaluator{
virtual float Eval(const std::vector<float> &preds,
const std::vector<float> &labels) const{
const DMatrix::Info &info ) const {
const unsigned ndata = static_cast<unsigned>(preds.size());
float sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule( static )
float sum = 0.0, wsum = 0.0;
#pragma omp parallel for reduction(+:sum,wsum) schedule( static )
for (unsigned i = 0; i < ndata; ++i){
float diff = preds[i] - labels[i];
sum += diff * diff;
const float wt = info.GetWeight(i);
const float diff = info.labels[i] - preds[i];
sum += diff*diff * wt;
wsum += wt;
}
return sqrtf(sum / ndata);
return sqrtf(sum / wsum);
}
virtual const char *Name(void) const{
return "rmse";
}
};
/*! \brief negative log-likelihood loss */
struct EvalLogLoss : public IEvaluator{
virtual float Eval(const std::vector<float> &preds,
const DMatrix::Info &info ) const {
const unsigned ndata = static_cast<unsigned>(preds.size());
float sum = 0.0f, wsum = 0.0f;
#pragma omp parallel for reduction(+:sum,wsum) schedule( static )
for (unsigned i = 0; i < ndata; ++i){
const float y = info.labels[i];
const float py = preds[i];
const float wt = info.GetWeight(i);
sum -= wt * ( y * std::log(py) + (1.0f - y)*std::log(1 - py) );
wsum+= wt;
}
return sum / wsum;
}
virtual const char *Name(void) const{
return "negllik";
}
};
/*! \brief Error */
struct EvalError : public IEvaluator{
virtual float Eval(const std::vector<float> &preds,
const std::vector<float> &labels) const{
const DMatrix::Info &info ) const {
const unsigned ndata = static_cast<unsigned>(preds.size());
unsigned nerr = 0;
#pragma omp parallel for reduction(+:nerr) schedule( static )
float sum = 0.0f, wsum = 0.0f;
#pragma omp parallel for reduction(+:sum,wsum) schedule( static )
for (unsigned i = 0; i < ndata; ++i){
const float wt = info.GetWeight(i);
if (preds[i] > 0.5f){
if (labels[i] < 0.5f) nerr += 1;
if (info.labels[i] < 0.5f) sum += wt;
}
else{
if (labels[i] > 0.5f) nerr += 1;
if (info.labels[i] >= 0.5f) sum += wt;
}
wsum += wt;
}
return static_cast<float>(nerr) / ndata;
return sum / wsum;
}
virtual const char *Name(void) const{
return "error";
@ -74,7 +100,8 @@ namespace xgboost{
return a.first > b.first;
}
virtual float Eval( const std::vector<float> &preds,
const std::vector<float> &labels ) const{
const DMatrix::Info &info ) const {
const std::vector<float> &labels = info.labels;
const unsigned ndata = static_cast<unsigned>( preds.size() );
std::vector< std::pair<float, float> > rec;
for( unsigned i = 0; i < ndata; ++ i ){
@ -100,54 +127,35 @@ namespace xgboost{
return "auc";
}
};
/*! \brief Error */
struct EvalLogLoss : public IEvaluator{
virtual float Eval(const std::vector<float> &preds,
const std::vector<float> &labels) const{
const unsigned ndata = static_cast<unsigned>(preds.size());
unsigned nerr = 0;
#pragma omp parallel for reduction(+:nerr) schedule( static )
for (unsigned i = 0; i < ndata; ++i){
const float y = labels[i];
const float py = preds[i];
nerr -= y * std::log(py) + (1.0f - y)*std::log(1 - py);
}
return static_cast<float>(nerr) / ndata;
}
virtual const char *Name(void) const{
return "negllik";
}
};
};
namespace regression{
namespace regrank{
/*! \brief a set of evaluators */
struct EvalSet{
public:
inline void AddEval(const char *name){
if (!strcmp(name, "rmse")) evals_.push_back(&rmse_);
if (!strcmp(name, "error")) evals_.push_back(&error_);
if (!strcmp(name, "logloss")) evals_.push_back(&logloss_);
if (!strcmp( name, "auc")) evals_.push_back( &auc_ );
for( size_t i = 0; i < evals_.size(); ++ i ){
if(!strcmp(name, evals_[i]->Name())) return;
}
if (!strcmp(name, "rmse")) evals_.push_back( new EvalRMSE() );
if (!strcmp(name, "error")) evals_.push_back( new EvalError() );
if (!strcmp(name, "logloss")) evals_.push_back( new EvalLogLoss() );
if (!strcmp( name, "auc")) evals_.push_back( new EvalAuc() );
}
~EvalSet(){
for( size_t i = 0; i < evals_.size(); ++ i ){
delete evals_[i];
}
inline void Init(void){
std::sort(evals_.begin(), evals_.end());
evals_.resize(std::unique(evals_.begin(), evals_.end()) - evals_.begin());
}
inline void Eval(FILE *fo, const char *evname,
const std::vector<float> &preds,
const std::vector<float> &labels) const{
const DMatrix::Info &info ) const{
for (size_t i = 0; i < evals_.size(); ++i){
float res = evals_[i]->Eval(preds, labels);
float res = evals_[i]->Eval(preds, info);
fprintf(fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res);
}
}
private:
EvalRMSE rmse_;
EvalError error_;
EvalAuc auc_;
EvalLogLoss logloss_;
std::vector<const IEvaluator*> evals_;
};
};
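Restated outside the evaluator framework, the weighted RMSE that EvalRMSE above computes is sqrt( sum_i w_i * (y_i - p_i)^2 / sum_i w_i ), where w_i falls back to 1 when no weights were loaded. A self-contained sketch:

#include <cmath>
#include <vector>

// weighted RMSE as in EvalRMSE; an empty weight vector means w_i = 1 for all i
inline float WeightedRMSE(const std::vector<float> &preds,
                          const std::vector<float> &labels,
                          const std::vector<float> &weights){
    float sum = 0.0f, wsum = 0.0f;
    for (size_t i = 0; i < preds.size(); ++i){
        const float wt = weights.empty() ? 1.0f : weights[i];
        const float diff = labels[i] - preds[i];
        sum += diff * diff * wt;
        wsum += wt;
    }
    return std::sqrt(sum / wsum);
}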

View File: regrank/xgboost_regrank_main.cpp

@ -4,13 +4,13 @@
#include <ctime>
#include <string>
#include <cstring>
#include "xgboost_reg.h"
#include "xgboost_regrank.h"
#include "../utils/xgboost_fmap.h"
#include "../utils/xgboost_random.h"
#include "../utils/xgboost_config.h"
namespace xgboost{
namespace regression{
namespace regrank{
/*!
* \brief wrapping the training process of the gradient boosting regression model,
* given the configuration
@ -273,13 +273,13 @@ namespace xgboost{
DMatrix data;
std::vector<DMatrix*> deval;
utils::FeatMap fmap;
RegBoostLearner learner;
RegRankBoostLearner learner;
};
};
};
int main( int argc, char *argv[] ){
xgboost::random::Seed( 0 );
xgboost::regression::RegBoostTask tsk;
xgboost::regrank::RegBoostTask tsk;
return tsk.Run( argc, argv );
}

View File: regrank/xgboost_regrank_obj.h

@ -0,0 +1,112 @@
#ifndef XGBOOST_REGRANK_OBJ_H
#define XGBOOST_REGRANK_OBJ_H
/*!
* \file xgboost_regrank_obj.h
* \brief defines objective function interface used in xgboost for regression and rank
* \author Tianqi Chen, Kailong Chen
*/
#include "xgboost_regrank_data.h"
namespace xgboost{
namespace regrank{
/*! \brief interface of objective function */
class IObjFunction{
public:
/*! \brief virtual destructor */
virtual ~IObjFunction(void){}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
virtual void SetParam(const char *name, const char *val) = 0;
/*!
* \brief get gradient over each of predictions, given existing information
* \param preds prediction of current round
* \param info information about labels, weights, and groups (used in ranking)
* \param grad gradient over each preds
* \param hess second order gradient over each preds
*/
virtual void GetGradient(const std::vector<float>& preds,
const DMatrix::Info &info,
std::vector<float> &grad,
std::vector<float> &hess ) = 0;
/*! \return the default evaluation metric for the problem */
virtual const char* DefaultEvalMetric(void) = 0;
/*!
* \brief transform prediction values; this is only called when prediction is performed
* \param preds prediction values, saves to this vector as well
*/
virtual void PredTransform(std::vector<float> &preds){}
};
};
namespace regrank{
/*! \brief defines commonly used loss functions and their gradients */
struct LossType{
public:
const static int kLinearSquare = 0;
const static int kLogisticNeglik = 1;
const static int kLogisticClassify = 2;
public:
/*! \brief indicate which type we are using */
int loss_type;
public:
/*!
* \brief transform the linear sum to prediction
* \param x linear sum of boosting ensemble
* \return transformed prediction
*/
inline float PredTransform(float x){
switch (loss_type){
case kLinearSquare: return x;
case kLogisticClassify:
case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculate first order gradient of loss, given transformed prediction
* \param predt transformed prediction
* \param label true label
* \return first order gradient
*/
inline float FirstOrderGradient(float predt, float label) const{
switch (loss_type){
case kLinearSquare: return predt - label;
case kLogisticClassify:
case kLogisticNeglik: return predt - label;
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculate second order gradient of loss, given transformed prediction
* \param predt transformed prediction
* \param label true label
* \return second order gradient
*/
inline float SecondOrderGradient(float predt, float label) const{
switch (loss_type){
case kLinearSquare: return 1.0f;
case kLogisticClassify:
case kLogisticNeglik: return predt * (1 - predt);
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
};
};
};
#include "xgboost_regrank_obj.hpp"
namespace xgboost{
namespace regrank{
IObjFunction* CreateObjFunction( const char *name ){
if( !strcmp("reg", name ) ) return new RegressionObj();
utils::Error("unknown objective function type");
return NULL;
}
};
};
#endif
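To show what the interface buys, here is a hypothetical additional objective (MyAbsError and the "abserror" name are illustrative, not part of this commit): implement IObjFunction, weight the gradients the same way RegressionObj does, and add one branch to CreateObjFunction.

namespace xgboost{
    namespace regrank{
        // hypothetical objective: weighted absolute loss with a constant hessian
        class MyAbsError : public IObjFunction{
        public:
            virtual ~MyAbsError(void){}
            virtual void SetParam(const char *name, const char *val){}
            virtual void GetGradient(const std::vector<float>& preds,
                                     const DMatrix::Info &info,
                                     std::vector<float> &grad,
                                     std::vector<float> &hess){
                grad.resize(preds.size()); hess.resize(preds.size());
                for (size_t j = 0; j < preds.size(); ++j){
                    const float wt = info.GetWeight(j);
                    grad[j] = (preds[j] > info.labels[j] ? 1.0f : -1.0f) * wt;
                    hess[j] = wt; // |x| has no curvature; a unit hessian keeps DoBoost stable
                }
            }
            virtual const char* DefaultEvalMetric(void){ return "rmse"; }
        };
    };
};

CreateObjFunction would then gain one line:
    if( !strcmp("abserror", name ) ) return new MyAbsError();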

View File: regrank/xgboost_regrank_obj.hpp

@ -0,0 +1,52 @@
#ifndef XGBOOST_REGRANK_OBJ_HPP
#define XGBOOST_REGRANK_OBJ_HPP
/*!
* \file xgboost_regrank_obj.hpp
* \brief implementation of objective functions
* \author Tianqi Chen, Kailong Chen
*/
namespace xgboost{
namespace regrank{
class RegressionObj : public IObjFunction{
public:
RegressionObj(void){
loss.loss_type = LossType::kLinearSquare;
}
virtual ~RegressionObj(){}
virtual void SetParam(const char *name, const char *val){
if( !strcmp( "loss_type", name ) ) loss.loss_type = atoi( val );
}
virtual void GetGradient(const std::vector<float>& preds,
const DMatrix::Info &info,
std::vector<float> &grad,
std::vector<float> &hess ) {
grad.resize(preds.size()); hess.resize(preds.size());
const unsigned ndata = static_cast<unsigned>(preds.size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
grad[j] = loss.FirstOrderGradient(preds[j], info.labels[j]) * info.GetWeight(j);
hess[j] = loss.SecondOrderGradient(preds[j], info.labels[j]) * info.GetWeight(j);
}
}
virtual const char* DefaultEvalMetric(void) {
if( loss.loss_type == LossType::kLogisticClassify ) return "error";
else return "rmse";
}
virtual void PredTransform(std::vector<float> &preds){
const unsigned ndata = static_cast<unsigned>(preds.size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
preds[j] = loss.PredTransform( preds[j] );
}
}
private:
LossType loss;
};
};
namespace regrank{
// TODO rank objective
};
};
#endif
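For reference, the logistic branches of LossType follow from differentiating the negative log-likelihood with respect to the linear sum x, writing p for the transformed prediction:

\ell(y, x) = -\bigl(y \log p + (1 - y)\log(1 - p)\bigr), \qquad p = \frac{1}{1 + e^{-x}}

\frac{\partial \ell}{\partial x} = p - y, \qquad \frac{\partial^2 \ell}{\partial x^2} = p\,(1 - p)

RegressionObj::GetGradient scales both expressions by info.GetWeight(j), which is the single point where a .weight file enters the boosting update.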

View File: regression/xgboost_reg_data.h (deleted)

@ -1,141 +0,0 @@
#ifndef XGBOOST_REG_DATA_H
#define XGBOOST_REG_DATA_H
/*!
* \file xgboost_reg_data.h
* \brief input data structure for regression and binary classification task.
* Format:
* The data should contain each data instance in each line.
* The format of line data is as below:
* label [feature index:feature value]+
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cstdio>
#include <vector>
#include "../booster/xgboost_data.h"
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_stream.h"
namespace xgboost{
namespace regression{
/*! \brief data matrix for regression content */
struct DMatrix{
public:
/*! \brief feature data content */
booster::FMatrixS data;
/*! \brief label of each instance */
std::vector<float> labels;
public:
/*! \brief default constructor */
DMatrix(void){}
/*! \brief get the number of instances */
inline size_t Size() const{
return labels.size();
}
/*!
* \brief load from text file
* \param fname name of text data
* \param silent whether to print information
*/
inline void LoadText(const char* fname, bool silent = false){
data.Clear();
FILE* file = utils::FopenCheck(fname, "r");
float label; bool init = true;
char tmp[1024];
std::vector<booster::bst_uint> findex;
std::vector<booster::bst_float> fvalue;
while (fscanf(file, "%s", tmp) == 1){
unsigned index; float value;
if (sscanf(tmp, "%u:%f", &index, &value) == 2){
findex.push_back(index); fvalue.push_back(value);
}
else{
if (!init){
labels.push_back(label);
data.AddRow(findex, fvalue);
}
findex.clear(); fvalue.clear();
utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format");
init = false;
}
}
labels.push_back(label);
data.AddRow(findex, fvalue);
// initialize column support as well
data.InitData();
if (!silent){
printf("%ux%u matrix with %lu entries is loaded from %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
}
fclose(file);
}
/*!
* \brief load from binary file
* \param fname name of binary data
* \param silent whether to print information
* \return whether loading succeeded
*/
inline bool LoadBinary(const char* fname, bool silent = false){
FILE *fp = fopen64(fname, "rb");
if (fp == NULL) return false;
utils::FileStream fs(fp);
data.LoadBinary(fs);
labels.resize(data.NumRow());
utils::Assert(fs.Read(&labels[0], sizeof(float)* data.NumRow()) != 0, "DMatrix LoadBinary");
fs.Close();
// initialize column support as well
data.InitData();
if (!silent){
printf("%ux%u matrix with %lu entries is loaded from %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
}
return true;
}
/*!
* \brief save to binary file
* \param fname name of binary data
* \param silent whether to print information
*/
inline void SaveBinary(const char* fname, bool silent = false){
// initialize column support as well
data.InitData();
utils::FileStream fs(utils::FopenCheck(fname, "wb"));
data.SaveBinary(fs);
fs.Write(&labels[0], sizeof(float)* data.NumRow());
fs.Close();
if (!silent){
printf("%ux%u matrix with %lu entries is saved to %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
}
}
/*!
* \brief cache-load data given a file name: if the filename ends with .buffer, load the binary directly;
* otherwise the function first checks whether fname + '.buffer' exists;
* if the binary buffer exists it reads from the binary buffer, otherwise it loads from the text file
* and tries to create a buffer file
* \param fname name of the data file
* \param silent whether to print information
* \param savebuffer whether to save a binary buffer if the input is text
*/
inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){
int len = strlen(fname);
if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
this->LoadBinary(fname, silent); return;
}
char bname[1024];
sprintf(bname, "%s.buffer", fname);
if (!this->LoadBinary(bname, silent)){
this->LoadText(fname, silent);
if (savebuffer) this->SaveBinary(bname, silent);
}
}
};
};
};
#endif