make regression module compatible with rank loss, now support weighted loss
This commit is contained in:
parent
7a79c009ce
commit
31edfda03c
3
Makefile
3
Makefile
@ -10,9 +10,8 @@ OBJ =
|
|||||||
all: $(BIN) $(OBJ)
|
all: $(BIN) $(OBJ)
|
||||||
export LDFLAGS= -pthread -lm
|
export LDFLAGS= -pthread -lm
|
||||||
|
|
||||||
xgboost: regression/xgboost_reg_main.cpp regression/*.h booster/*.h booster/*/*.hpp booster/*.hpp
|
xgboost: regrank/xgboost_regrank_main.cpp regrank/*.h booster/*.h booster/*/*.hpp booster/*.hpp
|
||||||
|
|
||||||
#xgboost: rank/xgboost_rank_main.cpp base/*.h rank/*.h booster/*.h booster/*/*.hpp booster/*.hpp
|
|
||||||
|
|
||||||
$(BIN) :
|
$(BIN) :
|
||||||
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
|
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
|
||||||
|
|||||||
@ -1,28 +1,31 @@
|
|||||||
#ifndef XGBOOST_REG_H
|
#ifndef XGBOOST_REGRANK_H
|
||||||
#define XGBOOST_REG_H
|
#define XGBOOST_REGRANK_H
|
||||||
/*!
|
/*!
|
||||||
* \file xgboost_reg.h
|
* \file xgboost_regrank.h
|
||||||
* \brief class for gradient boosted regression
|
* \brief class for gradient boosted regression and ranking
|
||||||
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
||||||
*/
|
*/
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include "xgboost_reg_data.h"
|
#include "xgboost_regrank_data.h"
|
||||||
#include "xgboost_reg_eval.h"
|
#include "xgboost_regrank_eval.h"
|
||||||
|
#include "xgboost_regrank_obj.h"
|
||||||
#include "../utils/xgboost_omp.h"
|
#include "../utils/xgboost_omp.h"
|
||||||
#include "../booster/xgboost_gbmbase.h"
|
#include "../booster/xgboost_gbmbase.h"
|
||||||
#include "../utils/xgboost_utils.h"
|
#include "../utils/xgboost_utils.h"
|
||||||
#include "../utils/xgboost_stream.h"
|
#include "../utils/xgboost_stream.h"
|
||||||
|
|
||||||
namespace xgboost{
|
namespace xgboost{
|
||||||
namespace regression{
|
namespace regrank{
|
||||||
/*! \brief class for gradient boosted regression */
|
/*! \brief class for gradient boosted regression and ranking */
|
||||||
class RegBoostLearner{
|
class RegRankBoostLearner{
|
||||||
public:
|
public:
|
||||||
/*! \brief constructor */
|
/*! \brief constructor */
|
||||||
RegBoostLearner(void){
|
RegRankBoostLearner(void){
|
||||||
silent = 0;
|
silent = 0;
|
||||||
|
obj_ = NULL;
|
||||||
|
name_obj_ = "reg";
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief a regression booster associated with training and evaluating data
|
* \brief a regression booster associated with training and evaluating data
|
||||||
@ -30,7 +33,7 @@ namespace xgboost{
|
|||||||
* \param evals array of evaluating data
|
* \param evals array of evaluating data
|
||||||
* \param evname name of evaluation data, used to print statistics
|
* \param evname name of evaluation data, used to print statistics
|
||||||
*/
|
*/
|
||||||
RegBoostLearner(const DMatrix *train,
|
RegRankBoostLearner(const DMatrix *train,
|
||||||
const std::vector<DMatrix *> &evals,
|
const std::vector<DMatrix *> &evals,
|
||||||
const std::vector<std::string> &evname){
|
const std::vector<std::string> &evname){
|
||||||
silent = 0;
|
silent = 0;
|
||||||
@ -83,8 +86,10 @@ namespace xgboost{
|
|||||||
inline void SetParam(const char *name, const char *val){
|
inline void SetParam(const char *name, const char *val){
|
||||||
if (!strcmp(name, "silent")) silent = atoi(val);
|
if (!strcmp(name, "silent")) silent = atoi(val);
|
||||||
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
|
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
|
||||||
|
if (!strcmp(name, "objective") ) name_obj_ = val;
|
||||||
mparam.SetParam(name, val);
|
mparam.SetParam(name, val);
|
||||||
base_gbm.SetParam(name, val);
|
base_gbm.SetParam(name, val);
|
||||||
|
cfg_.push_back( std::make_pair( std::string(name), std::string(val) ) );
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief initialize solver before training, called before training
|
* \brief initialize solver before training, called before training
|
||||||
@ -92,13 +97,11 @@ namespace xgboost{
|
|||||||
*/
|
*/
|
||||||
inline void InitTrainer(void){
|
inline void InitTrainer(void){
|
||||||
base_gbm.InitTrainer();
|
base_gbm.InitTrainer();
|
||||||
if (mparam.loss_type == kLogisticClassify){
|
obj_ = CreateObjFunction( name_obj_.c_str() );
|
||||||
evaluator_.AddEval("error");
|
for( size_t i = 0; i < cfg_.size(); ++ i ){
|
||||||
|
obj_->SetParam( cfg_[i].first.c_str(), cfg_[i].second.c_str() );
|
||||||
}
|
}
|
||||||
else{
|
evaluator_.AddEval( obj_->DefaultEvalMetric() );
|
||||||
evaluator_.AddEval("rmse");
|
|
||||||
}
|
|
||||||
evaluator_.Init();
|
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief initialize the current data storage for model, if the model is used first time, call this function
|
* \brief initialize the current data storage for model, if the model is used first time, call this function
|
||||||
@ -146,7 +149,7 @@ namespace xgboost{
|
|||||||
*/
|
*/
|
||||||
inline void UpdateOneIter(int iter){
|
inline void UpdateOneIter(int iter){
|
||||||
this->PredictBuffer(preds_, *train_, 0);
|
this->PredictBuffer(preds_, *train_, 0);
|
||||||
this->GetGradient(preds_, train_->labels, grad_, hess_);
|
obj_->GetGradient(preds_, train_->info, grad_, hess_);
|
||||||
std::vector<unsigned> root_index;
|
std::vector<unsigned> root_index;
|
||||||
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
|
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
|
||||||
}
|
}
|
||||||
@ -162,7 +165,8 @@ namespace xgboost{
|
|||||||
for (size_t i = 0; i < evals_.size(); ++i){
|
for (size_t i = 0; i < evals_.size(); ++i){
|
||||||
std::vector<float> &preds = this->eval_preds_[i];
|
std::vector<float> &preds = this->eval_preds_[i];
|
||||||
this->PredictBuffer(preds, *evals_[i], buffer_offset);
|
this->PredictBuffer(preds, *evals_[i], buffer_offset);
|
||||||
evaluator_.Eval(fo, evname_[i].c_str(), preds, (*evals_[i]).labels);
|
obj_->PredTransform(preds);
|
||||||
|
evaluator_.Eval(fo, evname_[i].c_str(), preds, evals_[i]->info);
|
||||||
buffer_offset += static_cast<int>(evals_[i]->Size());
|
buffer_offset += static_cast<int>(evals_[i]->Size());
|
||||||
}
|
}
|
||||||
fprintf(fo, "\n");
|
fprintf(fo, "\n");
|
||||||
@ -171,18 +175,17 @@ namespace xgboost{
|
|||||||
/*! \brief get prediction, without buffering */
|
/*! \brief get prediction, without buffering */
|
||||||
inline void Predict(std::vector<float> &preds, const DMatrix &data){
|
inline void Predict(std::vector<float> &preds, const DMatrix &data){
|
||||||
preds.resize(data.Size());
|
preds.resize(data.Size());
|
||||||
|
|
||||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||||
#pragma omp parallel for schedule( static )
|
#pragma omp parallel for schedule( static )
|
||||||
for (unsigned j = 0; j < ndata; ++j){
|
for (unsigned j = 0; j < ndata; ++j){
|
||||||
preds[j] = mparam.PredTransform
|
preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, -1);
|
||||||
(mparam.base_score + base_gbm.Predict(data.data, j, -1));
|
|
||||||
}
|
}
|
||||||
|
obj_->PredTransform( preds );
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
/*!
|
/*!
|
||||||
* \brief update the model for one iteration
|
* \brief interactive update
|
||||||
* \param iteration iteration number
|
* \param action action type
|
||||||
*/
|
*/
|
||||||
inline void UpdateInteract(std::string action){
|
inline void UpdateInteract(std::string action){
|
||||||
this->InteractPredict(preds_, *train_, 0);
|
this->InteractPredict(preds_, *train_, 0);
|
||||||
@ -198,7 +201,7 @@ namespace xgboost{
|
|||||||
base_gbm.DelteBooster(); return;
|
base_gbm.DelteBooster(); return;
|
||||||
}
|
}
|
||||||
|
|
||||||
this->GetGradient(preds_, train_->labels, grad_, hess_);
|
obj_->GetGradient(preds_, train_->info, grad_, hess_);
|
||||||
std::vector<unsigned> root_index;
|
std::vector<unsigned> root_index;
|
||||||
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
|
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
|
||||||
|
|
||||||
@ -216,9 +219,9 @@ namespace xgboost{
|
|||||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||||
#pragma omp parallel for schedule( static )
|
#pragma omp parallel for schedule( static )
|
||||||
for (unsigned j = 0; j < ndata; ++j){
|
for (unsigned j = 0; j < ndata; ++j){
|
||||||
preds[j] = mparam.PredTransform
|
preds[j] = mparam.base_score + base_gbm.InteractPredict(data.data, j, buffer_offset + j);
|
||||||
(mparam.base_score + base_gbm.InteractPredict(data.data, j, buffer_offset + j));
|
|
||||||
}
|
}
|
||||||
|
obj_->PredTransform( preds );
|
||||||
}
|
}
|
||||||
/*! \brief repredict trial */
|
/*! \brief repredict trial */
|
||||||
inline void InteractRePredict(const DMatrix &data, unsigned buffer_offset){
|
inline void InteractRePredict(const DMatrix &data, unsigned buffer_offset){
|
||||||
@ -232,37 +235,13 @@ namespace xgboost{
|
|||||||
/*! \brief get the transformed predictions, given data */
|
/*! \brief get the transformed predictions, given data */
|
||||||
inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset){
|
inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset){
|
||||||
preds.resize(data.Size());
|
preds.resize(data.Size());
|
||||||
|
|
||||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||||
#pragma omp parallel for schedule( static )
|
#pragma omp parallel for schedule( static )
|
||||||
for (unsigned j = 0; j < ndata; ++j){
|
for (unsigned j = 0; j < ndata; ++j){
|
||||||
preds[j] = mparam.PredTransform
|
preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j);
|
||||||
(mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*! \brief get the first order and second order gradient, given the transformed predictions and labels */
|
|
||||||
inline void GetGradient(const std::vector<float> &preds,
|
|
||||||
const std::vector<float> &labels,
|
|
||||||
std::vector<float> &grad,
|
|
||||||
std::vector<float> &hess){
|
|
||||||
grad.resize(preds.size()); hess.resize(preds.size());
|
|
||||||
|
|
||||||
const unsigned ndata = static_cast<unsigned>(preds.size());
|
|
||||||
#pragma omp parallel for schedule( static )
|
|
||||||
for (unsigned j = 0; j < ndata; ++j){
|
|
||||||
grad[j] = mparam.FirstOrderGradient(preds[j], labels[j]);
|
|
||||||
hess[j] = mparam.SecondOrderGradient(preds[j], labels[j]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
enum LossType{
|
|
||||||
kLinearSquare = 0,
|
|
||||||
kLogisticNeglik = 1,
|
|
||||||
kLogisticClassify = 2
|
|
||||||
};
|
|
||||||
|
|
||||||
/*! \brief training parameter for regression */
|
/*! \brief training parameter for regression */
|
||||||
struct ModelParam{
|
struct ModelParam{
|
||||||
/* \brief global bias */
|
/* \brief global bias */
|
||||||
@ -277,7 +256,6 @@ namespace xgboost{
|
|||||||
ModelParam(void){
|
ModelParam(void){
|
||||||
base_score = 0.5f;
|
base_score = 0.5f;
|
||||||
loss_type = 0;
|
loss_type = 0;
|
||||||
num_feature = 0;
|
|
||||||
memset(reserved, 0, sizeof(reserved));
|
memset(reserved, 0, sizeof(reserved));
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
@ -299,92 +277,6 @@ namespace xgboost{
|
|||||||
base_score = -logf(1.0f / base_score - 1.0f);
|
base_score = -logf(1.0f / base_score - 1.0f);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief transform the linear sum to prediction
|
|
||||||
* \param x linear sum of boosting ensemble
|
|
||||||
* \return transformed prediction
|
|
||||||
*/
|
|
||||||
inline float PredTransform(float x){
|
|
||||||
switch (loss_type){
|
|
||||||
case kLinearSquare: return x;
|
|
||||||
case kLogisticClassify:
|
|
||||||
case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
|
|
||||||
default: utils::Error("unknown loss_type"); return 0.0f;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief calculate first order gradient of loss, given transformed prediction
|
|
||||||
* \param predt transformed prediction
|
|
||||||
* \param label true label
|
|
||||||
* \return first order gradient
|
|
||||||
*/
|
|
||||||
inline float FirstOrderGradient(float predt, float label) const{
|
|
||||||
switch (loss_type){
|
|
||||||
case kLinearSquare: return predt - label;
|
|
||||||
case kLogisticClassify:
|
|
||||||
case kLogisticNeglik: return predt - label;
|
|
||||||
default: utils::Error("unknown loss_type"); return 0.0f;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/*!
|
|
||||||
* \brief calculate second order gradient of loss, given transformed prediction
|
|
||||||
* \param predt transformed prediction
|
|
||||||
* \param label true label
|
|
||||||
* \return second order gradient
|
|
||||||
*/
|
|
||||||
inline float SecondOrderGradient(float predt, float label) const{
|
|
||||||
switch (loss_type){
|
|
||||||
case kLinearSquare: return 1.0f;
|
|
||||||
case kLogisticClassify:
|
|
||||||
case kLogisticNeglik: return predt * (1 - predt);
|
|
||||||
default: utils::Error("unknown loss_type"); return 0.0f;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief calculating the loss, given the predictions, labels and the loss type
|
|
||||||
* \param preds the given predictions
|
|
||||||
* \param labels the given labels
|
|
||||||
* \return the specified loss
|
|
||||||
*/
|
|
||||||
inline float Loss(const std::vector<float> &preds, const std::vector<float> &labels) const{
|
|
||||||
switch (loss_type){
|
|
||||||
case kLinearSquare: return SquareLoss(preds, labels);
|
|
||||||
case kLogisticNeglik:
|
|
||||||
case kLogisticClassify: return NegLoglikelihoodLoss(preds, labels);
|
|
||||||
default: utils::Error("unknown loss_type"); return 0.0f;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief calculating the square loss, given the predictions and labels
|
|
||||||
* \param preds the given predictions
|
|
||||||
* \param labels the given labels
|
|
||||||
* \return the summation of square loss
|
|
||||||
*/
|
|
||||||
inline float SquareLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
|
|
||||||
float ans = 0.0;
|
|
||||||
for (size_t i = 0; i < preds.size(); i++){
|
|
||||||
float dif = preds[i] - labels[i];
|
|
||||||
ans += dif * dif;
|
|
||||||
}
|
|
||||||
return ans;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief calculating the square loss, given the predictions and labels
|
|
||||||
* \param preds the given predictions
|
|
||||||
* \param labels the given labels
|
|
||||||
* \return the summation of square loss
|
|
||||||
*/
|
|
||||||
inline float NegLoglikelihoodLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
|
|
||||||
float ans = 0.0;
|
|
||||||
for (size_t i = 0; i < preds.size(); i++)
|
|
||||||
ans -= labels[i] * logf(preds[i]) + (1 - labels[i]) * logf(1 - preds[i]);
|
|
||||||
return ans;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
private:
|
private:
|
||||||
int silent;
|
int silent;
|
||||||
@ -395,6 +287,11 @@ namespace xgboost{
|
|||||||
std::vector<DMatrix *> evals_;
|
std::vector<DMatrix *> evals_;
|
||||||
std::vector<std::string> evname_;
|
std::vector<std::string> evname_;
|
||||||
std::vector<unsigned> buffer_index_;
|
std::vector<unsigned> buffer_index_;
|
||||||
|
// objective function
|
||||||
|
IObjFunction *obj_;
|
||||||
|
// name of objective function
|
||||||
|
std::string name_obj_;
|
||||||
|
std::vector< std::pair<std::string, std::string> > cfg_;
|
||||||
private:
|
private:
|
||||||
std::vector<float> grad_, hess_, preds_;
|
std::vector<float> grad_, hess_, preds_;
|
||||||
std::vector< std::vector<float> > eval_preds_;
|
std::vector< std::vector<float> > eval_preds_;
|
||||||
205
regrank/xgboost_regrank_data.h
Normal file
205
regrank/xgboost_regrank_data.h
Normal file
@ -0,0 +1,205 @@
|
|||||||
|
#ifndef XGBOOST_REGRANK_DATA_H
|
||||||
|
#define XGBOOST_REGRANK_DATA_H
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \file xgboost_regrank_data.h
|
||||||
|
* \brief input data structure for regression, binary classification, and ranking.
|
||||||
|
* Format:
|
||||||
|
* The data should contain each data instance in each line.
|
||||||
|
* The format of line data is as below:
|
||||||
|
* label <nonzero feature dimension> [feature index:feature value]+
|
||||||
|
* When using rank, an additional group file with suffix group must be provided, giving the number of instances in each group
|
||||||
|
* When using weight-aware classification (regression), an additional weight file must be provided, giving the weight of each instance
|
||||||
|
*
|
||||||
|
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
||||||
|
*/
|
||||||
|
#include <cstdio>
|
||||||
|
#include <vector>
|
||||||
|
#include "../booster/xgboost_data.h"
|
||||||
|
#include "../utils/xgboost_utils.h"
|
||||||
|
#include "../utils/xgboost_stream.h"
|
||||||
|
|
||||||
|
namespace xgboost{
|
||||||
|
/*! \brief namespace to handle regression and rank */
|
||||||
|
namespace regrank{
|
||||||
|
/*! \brief data matrix for regression content */
|
||||||
|
struct DMatrix{
|
||||||
|
public:
|
||||||
|
/*! \brief data information besides the features */
|
||||||
|
struct Info{
|
||||||
|
/*! \brief label of each instance */
|
||||||
|
std::vector<float> labels;
|
||||||
|
/*! \brief the begin and end index of each group, needed when the learning task is ranking */
|
||||||
|
std::vector<unsigned> group_ptr;
|
||||||
|
/*! \brief weights of each instance, optional */
|
||||||
|
std::vector<float> weights;
|
||||||
|
/*! \brief get the weight of instance i; 1.0 when no weights are loaded */
|
||||||
|
inline float GetWeight( size_t i ) const{
|
||||||
|
if( weights.size() != 0 ) return weights[i];
|
||||||
|
else return 1.0f;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
public:
|
||||||
|
/*! \brief feature data content */
|
||||||
|
booster::FMatrixS data;
|
||||||
|
/*! \brief information fields */
|
||||||
|
Info info;
|
||||||
|
public:
|
||||||
|
/*! \brief default constructor */
|
||||||
|
DMatrix(void){}
|
||||||
|
/*! \brief get the number of instances */
|
||||||
|
inline size_t Size() const{
|
||||||
|
return info.labels.size();
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief load from text file
|
||||||
|
* \param fname name of text data
|
||||||
|
* \param silent whether print information or not
|
||||||
|
*/
|
||||||
|
inline void LoadText(const char* fname, bool silent = false){
|
||||||
|
data.Clear();
|
||||||
|
FILE* file = utils::FopenCheck(fname, "r");
|
||||||
|
float label; bool init = true;
|
||||||
|
char tmp[1024];
|
||||||
|
std::vector<booster::bst_uint> findex;
|
||||||
|
std::vector<booster::bst_float> fvalue;
|
||||||
|
|
||||||
|
while (fscanf(file, "%s", tmp) == 1){
|
||||||
|
unsigned index; float value;
|
||||||
|
if (sscanf(tmp, "%u:%f", &index, &value) == 2){
|
||||||
|
findex.push_back(index); fvalue.push_back(value);
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
if (!init){
|
||||||
|
info.labels.push_back(label);
|
||||||
|
data.AddRow(findex, fvalue);
|
||||||
|
}
|
||||||
|
findex.clear(); fvalue.clear();
|
||||||
|
utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format");
|
||||||
|
init = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
info.labels.push_back(label);
|
||||||
|
data.AddRow(findex, fvalue);
|
||||||
|
// initialize column support as well
|
||||||
|
data.InitData();
|
||||||
|
|
||||||
|
if (!silent){
|
||||||
|
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||||
|
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||||
|
}
|
||||||
|
fclose(file);
|
||||||
|
this->TryLoadGroup(fname, silent);
|
||||||
|
this->TryLoadWeight(fname, silent);
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief load from binary file
|
||||||
|
* \param fname name of binary data
|
||||||
|
* \param silent whether print information or not
|
||||||
|
* \return whether loading is success
|
||||||
|
*/
|
||||||
|
inline bool LoadBinary(const char* fname, bool silent = false){
|
||||||
|
FILE *fp = fopen64(fname, "rb");
|
||||||
|
if (fp == NULL) return false;
|
||||||
|
utils::FileStream fs(fp);
|
||||||
|
data.LoadBinary(fs);
|
||||||
|
info.labels.resize(data.NumRow());
|
||||||
|
utils::Assert(fs.Read(&info.labels[0], sizeof(float)* data.NumRow()) != 0, "DMatrix LoadBinary");
|
||||||
|
fs.Close();
|
||||||
|
// initialize column support as well
|
||||||
|
data.InitData();
|
||||||
|
|
||||||
|
if (!silent){
|
||||||
|
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||||
|
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||||
|
}
|
||||||
|
this->TryLoadGroup(fname, silent);
|
||||||
|
this->TryLoadWeight(fname, silent);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief save to binary file
|
||||||
|
* \param fname name of binary data
|
||||||
|
* \param silent whether print information or not
|
||||||
|
*/
|
||||||
|
inline void SaveBinary(const char* fname, bool silent = false){
|
||||||
|
// initialize column support as well
|
||||||
|
data.InitData();
|
||||||
|
|
||||||
|
utils::FileStream fs(utils::FopenCheck(fname, "wb"));
|
||||||
|
data.SaveBinary(fs);
|
||||||
|
fs.Write(&info.labels[0], sizeof(float)* data.NumRow());
|
||||||
|
fs.Close();
|
||||||
|
if (!silent){
|
||||||
|
printf("%ux%u matrix with %lu entries is saved to %s\n",
|
||||||
|
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief cache load data given a file name, if filename ends with .buffer, direct load binary
|
||||||
|
* otherwise the function will first check if fname + '.buffer' exists,
|
||||||
|
* if the binary buffer exists, it will read from the binary buffer; otherwise, it will load from the text file,
|
||||||
|
* and try to create a buffer file
|
||||||
|
* \param fname name of binary data
|
||||||
|
* \param silent whether print information or not
|
||||||
|
* \param savebuffer whether to save a binary buffer when the data was loaded from text
|
||||||
|
*/
|
||||||
|
inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){
|
||||||
|
int len = strlen(fname);
|
||||||
|
if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
|
||||||
|
this->LoadBinary(fname, silent); return;
|
||||||
|
}
|
||||||
|
char bname[1024];
|
||||||
|
sprintf(bname, "%s.buffer", fname);
|
||||||
|
if (!this->LoadBinary(bname, silent)){
|
||||||
|
this->LoadText(fname, silent);
|
||||||
|
if (savebuffer) this->SaveBinary(bname, silent);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
inline bool TryLoadGroup(const char* fname, bool silent = false){
|
||||||
|
std::string name = fname;
|
||||||
|
if (name.length() > 8 && !strcmp(fname + name.length() - 7, ".buffer")){
|
||||||
|
name.resize( name.length() - 7 );
|
||||||
|
}
|
||||||
|
name += ".group";
|
||||||
|
// if group data exists, load it in
|
||||||
|
FILE *fi = fopen64(name.c_str(), "r");
|
||||||
|
if (fi == NULL) return false;
|
||||||
|
info.group_ptr.push_back(0);
|
||||||
|
unsigned nline;
|
||||||
|
while (fscanf(fi, "%u", &nline) == 1){
|
||||||
|
info.group_ptr.push_back(info.group_ptr.back()+nline);
|
||||||
|
}
|
||||||
|
if(!silent){
|
||||||
|
printf("%lu groups are loaded from %s\n", info.group_ptr.size()-1, name.c_str());
|
||||||
|
}
|
||||||
|
fclose(fi);
|
||||||
|
utils::Assert( info.group_ptr.back() == data.NumRow(), "DMatrix: group data does not match the number of rows in feature matrix" );
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
inline bool TryLoadWeight(const char* fname, bool silent = false){
|
||||||
|
std::string name = fname;
|
||||||
|
if (name.length() > 8 && !strcmp(fname + name.length() - 7, ".buffer")){
|
||||||
|
name.resize( name.length() - 7 );
|
||||||
|
}
|
||||||
|
name += ".weight";
|
||||||
|
// if weight data exists, load it in
|
||||||
|
FILE *fi = fopen64(name.c_str(), "r");
|
||||||
|
if (fi == NULL) return false;
|
||||||
|
float wt;
|
||||||
|
while (fscanf(fi, "%f", &wt) == 1){
|
||||||
|
info.weights.push_back( wt );
|
||||||
|
}
|
||||||
|
if(!silent){
|
||||||
|
printf("loading weight from %s\n", name.c_str());
|
||||||
|
}
|
||||||
|
fclose(fi);
|
||||||
|
utils::Assert( info.weights.size() == data.NumRow(), "DMatrix: weight data does not match the number of rows in feature matrix" );
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
#endif
|
||||||
@ -1,8 +1,8 @@
|
|||||||
#ifndef XGBOOST_REG_EVAL_H
|
#ifndef XGBOOST_REGRANK_EVAL_H
|
||||||
#define XGBOOST_REG_EVAL_H
|
#define XGBOOST_REGRANK_EVAL_H
|
||||||
/*!
|
/*!
|
||||||
* \file xgboost_reg_eval.h
|
* \file xgboost_regrank_eval.h
|
||||||
* \brief evaluation metrics for regression and classification
|
* \brief evaluation metrics for regression, classification, and ranking
|
||||||
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -12,18 +12,19 @@
|
|||||||
#include "../utils/xgboost_utils.h"
|
#include "../utils/xgboost_utils.h"
|
||||||
#include "../utils/xgboost_omp.h"
|
#include "../utils/xgboost_omp.h"
|
||||||
#include "../utils/xgboost_random.h"
|
#include "../utils/xgboost_random.h"
|
||||||
|
#include "xgboost_regrank_data.h"
|
||||||
|
|
||||||
namespace xgboost{
|
namespace xgboost{
|
||||||
namespace regression{
|
namespace regrank{
|
||||||
/*! \brief evaluator that evaluates the loss metrics */
|
/*! \brief evaluator that evaluates the loss metrics */
|
||||||
struct IEvaluator{
|
struct IEvaluator{
|
||||||
/*!
|
/*!
|
||||||
* \brief evaluate a specific metric
|
* \brief evaluate a specific metric
|
||||||
* \param preds prediction
|
* \param preds prediction
|
||||||
* \param labels label
|
* \param info information, including label etc.
|
||||||
*/
|
*/
|
||||||
virtual float Eval(const std::vector<float> &preds,
|
virtual float Eval(const std::vector<float> &preds,
|
||||||
const std::vector<float> &labels) const = 0;
|
const DMatrix::Info &info ) const = 0;
|
||||||
/*! \return name of metric */
|
/*! \return name of metric */
|
||||||
virtual const char *Name(void) const = 0;
|
virtual const char *Name(void) const = 0;
|
||||||
};
|
};
|
||||||
@ -31,37 +32,62 @@ namespace xgboost{
|
|||||||
/*! \brief RMSE */
|
/*! \brief RMSE */
|
||||||
struct EvalRMSE : public IEvaluator{
|
struct EvalRMSE : public IEvaluator{
|
||||||
virtual float Eval(const std::vector<float> &preds,
|
virtual float Eval(const std::vector<float> &preds,
|
||||||
const std::vector<float> &labels) const{
|
const DMatrix::Info &info ) const {
|
||||||
const unsigned ndata = static_cast<unsigned>(preds.size());
|
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||||
float sum = 0.0;
|
float sum = 0.0, wsum = 0.0;
|
||||||
#pragma omp parallel for reduction(+:sum) schedule( static )
|
#pragma omp parallel for reduction(+:sum,wsum) schedule( static )
|
||||||
for (unsigned i = 0; i < ndata; ++i){
|
for (unsigned i = 0; i < ndata; ++i){
|
||||||
float diff = preds[i] - labels[i];
|
const float wt = info.GetWeight(i);
|
||||||
sum += diff * diff;
|
const float diff = info.labels[i] - preds[i];
|
||||||
|
sum += diff*diff * wt;
|
||||||
|
wsum += wt;
|
||||||
}
|
}
|
||||||
return sqrtf(sum / ndata);
|
return sqrtf(sum / wsum);
|
||||||
}
|
}
|
||||||
virtual const char *Name(void) const{
|
virtual const char *Name(void) const{
|
||||||
return "rmse";
|
return "rmse";
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*! \brief weighted average negative log-likelihood */
|
||||||
|
struct EvalLogLoss : public IEvaluator{
|
||||||
|
virtual float Eval(const std::vector<float> &preds,
|
||||||
|
const DMatrix::Info &info ) const {
|
||||||
|
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||||
|
float sum = 0.0f, wsum = 0.0f;
|
||||||
|
#pragma omp parallel for reduction(+:sum,wsum) schedule( static )
|
||||||
|
for (unsigned i = 0; i < ndata; ++i){
|
||||||
|
const float y = info.labels[i];
|
||||||
|
const float py = preds[i];
|
||||||
|
const float wt = info.GetWeight(i);
|
||||||
|
sum -= wt * ( y * std::log(py) + (1.0f - y)*std::log(1 - py) );
|
||||||
|
wsum+= wt;
|
||||||
|
}
|
||||||
|
return sum / wsum;
|
||||||
|
}
|
||||||
|
virtual const char *Name(void) const{
|
||||||
|
return "negllik";
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
/*! \brief Error */
|
/*! \brief Error */
|
||||||
struct EvalError : public IEvaluator{
|
struct EvalError : public IEvaluator{
|
||||||
virtual float Eval(const std::vector<float> &preds,
|
virtual float Eval(const std::vector<float> &preds,
|
||||||
const std::vector<float> &labels) const{
|
const DMatrix::Info &info ) const {
|
||||||
const unsigned ndata = static_cast<unsigned>(preds.size());
|
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||||
unsigned nerr = 0;
|
float sum = 0.0f, wsum = 0.0f;
|
||||||
#pragma omp parallel for reduction(+:nerr) schedule( static )
|
#pragma omp parallel for reduction(+:sum,wsum) schedule( static )
|
||||||
for (unsigned i = 0; i < ndata; ++i){
|
for (unsigned i = 0; i < ndata; ++i){
|
||||||
|
const float wt = info.GetWeight(i);
|
||||||
if (preds[i] > 0.5f){
|
if (preds[i] > 0.5f){
|
||||||
if (labels[i] < 0.5f) nerr += 1;
|
if (info.labels[i] < 0.5f) sum += wt;
|
||||||
}
|
}
|
||||||
else{
|
else{
|
||||||
if (labels[i] > 0.5f) nerr += 1;
|
if (info.labels[i] >= 0.5f) sum += wt;
|
||||||
}
|
}
|
||||||
|
wsum += wt;
|
||||||
}
|
}
|
||||||
return static_cast<float>(nerr) / ndata;
|
return sum / wsum;
|
||||||
}
|
}
|
||||||
virtual const char *Name(void) const{
|
virtual const char *Name(void) const{
|
||||||
return "error";
|
return "error";
|
||||||
@ -74,7 +100,8 @@ namespace xgboost{
|
|||||||
return a.first > b.first;
|
return a.first > b.first;
|
||||||
}
|
}
|
||||||
virtual float Eval( const std::vector<float> &preds,
|
virtual float Eval( const std::vector<float> &preds,
|
||||||
const std::vector<float> &labels ) const{
|
const DMatrix::Info &info ) const {
|
||||||
|
const std::vector<float> &labels = info.labels;
|
||||||
const unsigned ndata = static_cast<unsigned>( preds.size() );
|
const unsigned ndata = static_cast<unsigned>( preds.size() );
|
||||||
std::vector< std::pair<float, float> > rec;
|
std::vector< std::pair<float, float> > rec;
|
||||||
for( unsigned i = 0; i < ndata; ++ i ){
|
for( unsigned i = 0; i < ndata; ++ i ){
|
||||||
@ -100,54 +127,35 @@ namespace xgboost{
|
|||||||
return "auc";
|
return "auc";
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
/*! \brief Error */
|
|
||||||
struct EvalLogLoss : public IEvaluator{
|
|
||||||
virtual float Eval(const std::vector<float> &preds,
|
|
||||||
const std::vector<float> &labels) const{
|
|
||||||
const unsigned ndata = static_cast<unsigned>(preds.size());
|
|
||||||
unsigned nerr = 0;
|
|
||||||
#pragma omp parallel for reduction(+:nerr) schedule( static )
|
|
||||||
for (unsigned i = 0; i < ndata; ++i){
|
|
||||||
const float y = labels[i];
|
|
||||||
const float py = preds[i];
|
|
||||||
nerr -= y * std::log(py) + (1.0f - y)*std::log(1 - py);
|
|
||||||
}
|
|
||||||
return static_cast<float>(nerr) / ndata;
|
|
||||||
}
|
|
||||||
virtual const char *Name(void) const{
|
|
||||||
return "negllik";
|
|
||||||
}
|
|
||||||
};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
namespace regression{
|
namespace regrank{
|
||||||
/*! \brief a set of evaluators */
|
/*! \brief a set of evaluators */
|
||||||
struct EvalSet{
|
struct EvalSet{
|
||||||
public:
|
public:
|
||||||
inline void AddEval(const char *name){
|
inline void AddEval(const char *name){
|
||||||
if (!strcmp(name, "rmse")) evals_.push_back(&rmse_);
|
for( size_t i = 0; i < evals_.size(); ++ i ){
|
||||||
if (!strcmp(name, "error")) evals_.push_back(&error_);
|
if(!strcmp(name, evals_[i]->Name())) return;
|
||||||
if (!strcmp(name, "logloss")) evals_.push_back(&logloss_);
|
}
|
||||||
if (!strcmp( name, "auc")) evals_.push_back( &auc_ );
|
if (!strcmp(name, "rmse")) evals_.push_back( new EvalRMSE() );
|
||||||
|
if (!strcmp(name, "error")) evals_.push_back( new EvalError() );
|
||||||
|
if (!strcmp(name, "logloss")) evals_.push_back( new EvalLogLoss() );
|
||||||
|
if (!strcmp( name, "auc")) evals_.push_back( new EvalAuc() );
|
||||||
|
}
|
||||||
|
~EvalSet(){
|
||||||
|
for( size_t i = 0; i < evals_.size(); ++ i ){
|
||||||
|
delete evals_[i];
|
||||||
}
|
}
|
||||||
inline void Init(void){
|
|
||||||
std::sort(evals_.begin(), evals_.end());
|
|
||||||
evals_.resize(std::unique(evals_.begin(), evals_.end()) - evals_.begin());
|
|
||||||
}
|
}
|
||||||
inline void Eval(FILE *fo, const char *evname,
|
inline void Eval(FILE *fo, const char *evname,
|
||||||
const std::vector<float> &preds,
|
const std::vector<float> &preds,
|
||||||
const std::vector<float> &labels) const{
|
const DMatrix::Info &info ) const{
|
||||||
for (size_t i = 0; i < evals_.size(); ++i){
|
for (size_t i = 0; i < evals_.size(); ++i){
|
||||||
float res = evals_[i]->Eval(preds, labels);
|
float res = evals_[i]->Eval(preds, info);
|
||||||
fprintf(fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res);
|
fprintf(fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
EvalRMSE rmse_;
|
|
||||||
EvalError error_;
|
|
||||||
EvalAuc auc_;
|
|
||||||
EvalLogLoss logloss_;
|
|
||||||
std::vector<const IEvaluator*> evals_;
|
std::vector<const IEvaluator*> evals_;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
@ -4,13 +4,13 @@
|
|||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include "xgboost_reg.h"
|
#include "xgboost_regrank.h"
|
||||||
#include "../utils/xgboost_fmap.h"
|
#include "../utils/xgboost_fmap.h"
|
||||||
#include "../utils/xgboost_random.h"
|
#include "../utils/xgboost_random.h"
|
||||||
#include "../utils/xgboost_config.h"
|
#include "../utils/xgboost_config.h"
|
||||||
|
|
||||||
namespace xgboost{
|
namespace xgboost{
|
||||||
namespace regression{
|
namespace regrank{
|
||||||
/*!
|
/*!
|
||||||
* \brief wrapping the training process of the gradient boosting regression model,
|
* \brief wrapping the training process of the gradient boosting regression model,
|
||||||
* given the configuration
|
* given the configuration
|
||||||
@ -273,13 +273,13 @@ namespace xgboost{
|
|||||||
DMatrix data;
|
DMatrix data;
|
||||||
std::vector<DMatrix*> deval;
|
std::vector<DMatrix*> deval;
|
||||||
utils::FeatMap fmap;
|
utils::FeatMap fmap;
|
||||||
RegBoostLearner learner;
|
RegRankBoostLearner learner;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
int main( int argc, char *argv[] ){
|
int main( int argc, char *argv[] ){
|
||||||
xgboost::random::Seed( 0 );
|
xgboost::random::Seed( 0 );
|
||||||
xgboost::regression::RegBoostTask tsk;
|
xgboost::regrank::RegBoostTask tsk;
|
||||||
return tsk.Run( argc, argv );
|
return tsk.Run( argc, argv );
|
||||||
}
|
}
|
||||||
112
regrank/xgboost_regrank_obj.h
Normal file
112
regrank/xgboost_regrank_obj.h
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
#ifndef XGBOOST_REGRANK_OBJ_H
|
||||||
|
#define XGBOOST_REGRANK_OBJ_H
|
||||||
|
/*!
|
||||||
|
* \file xgboost_regrank_obj.h
|
||||||
|
* \brief defines objective function interface used in xgboost for regression and rank
|
||||||
|
* \author Tianqi Chen, Kailong Chen
|
||||||
|
*/
|
||||||
|
#include "xgboost_regrank_data.h"
|
||||||
|
|
||||||
|
namespace xgboost{
|
||||||
|
namespace regrank{
|
||||||
|
/*! \brief interface of objective function */
|
||||||
|
class IObjFunction{
|
||||||
|
public:
|
||||||
|
/*! \brief virtual destructor */
|
||||||
|
virtual ~IObjFunction(void){}
|
||||||
|
/*!
|
||||||
|
* \brief set parameters from outside
|
||||||
|
* \param name name of the parameter
|
||||||
|
* \param val value of the parameter
|
||||||
|
*/
|
||||||
|
virtual void SetParam(const char *name, const char *val) = 0;
|
||||||
|
/*!
|
||||||
|
* \brief get gradient over each of predictions, given existing information
|
||||||
|
* \param preds prediction of current round
|
||||||
|
* \param info information about labels, weights, groups in rank
|
||||||
|
* \param grad gradient over each preds
|
||||||
|
* \param hess second order gradient over each preds
|
||||||
|
*/
|
||||||
|
virtual void GetGradient(const std::vector<float>& preds,
|
||||||
|
const DMatrix::Info &info,
|
||||||
|
std::vector<float> &grad,
|
||||||
|
std::vector<float> &hess ) = 0;
|
||||||
|
/*! \return the default evaluation metric for the problem */
|
||||||
|
virtual const char* DefaultEvalMetric(void) = 0;
|
||||||
|
/*!
|
||||||
|
* \brief transform prediction values, this is only called when Prediction is called
|
||||||
|
* \param preds prediction values, saves to this vector as well
|
||||||
|
*/
|
||||||
|
virtual void PredTransform(std::vector<float> &preds){}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace regrank{
|
||||||
|
/*! \brief defines functions to calculate some commonly used functions */
|
||||||
|
struct LossType{
|
||||||
|
public:
|
||||||
|
const static int kLinearSquare = 0;
|
||||||
|
const static int kLogisticNeglik = 1;
|
||||||
|
const static int kLogisticClassify = 2;
|
||||||
|
public:
|
||||||
|
/*! \brief indicate which type we are using */
|
||||||
|
int loss_type;
|
||||||
|
public:
|
||||||
|
/*!
|
||||||
|
* \brief transform the linear sum to prediction
|
||||||
|
* \param x linear sum of boosting ensemble
|
||||||
|
* \return transformed prediction
|
||||||
|
*/
|
||||||
|
inline float PredTransform(float x){
|
||||||
|
switch (loss_type){
|
||||||
|
case kLinearSquare: return x;
|
||||||
|
case kLogisticClassify:
|
||||||
|
case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
|
||||||
|
default: utils::Error("unknown loss_type"); return 0.0f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief calculate first order gradient of loss, given transformed prediction
|
||||||
|
* \param predt transformed prediction
|
||||||
|
* \param label true label
|
||||||
|
* \return first order gradient
|
||||||
|
*/
|
||||||
|
inline float FirstOrderGradient(float predt, float label) const{
|
||||||
|
switch (loss_type){
|
||||||
|
case kLinearSquare: return predt - label;
|
||||||
|
case kLogisticClassify:
|
||||||
|
case kLogisticNeglik: return predt - label;
|
||||||
|
default: utils::Error("unknown loss_type"); return 0.0f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief calculate second order gradient of loss, given transformed prediction
|
||||||
|
* \param predt transformed prediction
|
||||||
|
* \param label true label
|
||||||
|
* \return second order gradient
|
||||||
|
*/
|
||||||
|
inline float SecondOrderGradient(float predt, float label) const{
|
||||||
|
switch (loss_type){
|
||||||
|
case kLinearSquare: return 1.0f;
|
||||||
|
case kLogisticClassify:
|
||||||
|
case kLogisticNeglik: return predt * (1 - predt);
|
||||||
|
default: utils::Error("unknown loss_type"); return 0.0f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
#include "xgboost_regrank_obj.hpp"
|
||||||
|
|
||||||
|
namespace xgboost{
|
||||||
|
namespace regrank{
|
||||||
|
/*!
 * \brief factory function, creates an objective function from its name
 *  Marked inline because the definition lives in a header: without it every
 *  translation unit including the header emits its own definition and linking fails.
 * \param name name of the objective function, only "reg" is known here
 * \return objective function allocated with new, nothing in this function releases it;
 *         for any other name utils::Error is called, and NULL is returned if that call comes back
 */
inline IObjFunction* CreateObjFunction( const char *name ){
    if( !strcmp("reg", name ) ) return new RegressionObj();
    utils::Error("unknown objective function type");
    return NULL;
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
#endif
|
||||||
52
regrank/xgboost_regrank_obj.hpp
Normal file
52
regrank/xgboost_regrank_obj.hpp
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
#ifndef XGBOOST_REGRANK_OBJ_HPP
|
||||||
|
#define XGBOOST_REGRANK_OBJ_HPP
|
||||||
|
/*!
|
||||||
|
* \file xgboost_regrank_obj.hpp
|
||||||
|
* \brief implementation of objective functions
|
||||||
|
* \author Tianqi Chen, Kailong Chen
|
||||||
|
*/
|
||||||
|
namespace xgboost{
|
||||||
|
namespace regrank{
|
||||||
|
class RegressionObj : public IObjFunction{
|
||||||
|
public:
|
||||||
|
RegressionObj(void){
|
||||||
|
loss.loss_type = LossType::kLinearSquare;
|
||||||
|
}
|
||||||
|
virtual ~RegressionObj(){}
|
||||||
|
virtual void SetParam(const char *name, const char *val){
|
||||||
|
if( !strcmp( "loss_type", name ) ) loss.loss_type = atoi( val );
|
||||||
|
}
|
||||||
|
virtual void GetGradient(const std::vector<float>& preds,
|
||||||
|
const DMatrix::Info &info,
|
||||||
|
std::vector<float> &grad,
|
||||||
|
std::vector<float> &hess ) {
|
||||||
|
grad.resize(preds.size()); hess.resize(preds.size());
|
||||||
|
|
||||||
|
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||||
|
#pragma omp parallel for schedule( static )
|
||||||
|
for (unsigned j = 0; j < ndata; ++j){
|
||||||
|
grad[j] = loss.FirstOrderGradient(preds[j], info.labels[j]) * info.GetWeight(j);
|
||||||
|
hess[j] = loss.SecondOrderGradient(preds[j], info.labels[j]) * info.GetWeight(j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
virtual const char* DefaultEvalMetric(void) {
|
||||||
|
if( loss.loss_type == LossType::kLogisticClassify ) return "error";
|
||||||
|
else return "rmse";
|
||||||
|
}
|
||||||
|
virtual void PredTransform(std::vector<float> &preds){
|
||||||
|
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||||
|
#pragma omp parallel for schedule( static )
|
||||||
|
for (unsigned j = 0; j < ndata; ++j){
|
||||||
|
preds[j] = loss.PredTransform( preds[j] );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
LossType loss;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace regrank{
|
||||||
|
// TODO rank objective
|
||||||
|
};
|
||||||
|
};
|
||||||
|
#endif
|
||||||
@ -1,141 +0,0 @@
|
|||||||
#ifndef XGBOOST_REG_DATA_H
|
|
||||||
#define XGBOOST_REG_DATA_H
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* \file xgboost_reg_data.h
|
|
||||||
* \brief input data structure for regression and binary classification task.
|
|
||||||
* Format:
|
|
||||||
* The data should contain each data instance in each line.
|
|
||||||
* The format of line data is as below:
|
|
||||||
* label <nonzero feature dimension> [feature index:feature value]+
|
|
||||||
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
|
||||||
*/
|
|
||||||
#include <cstdio>
|
|
||||||
#include <vector>
|
|
||||||
#include "../booster/xgboost_data.h"
|
|
||||||
#include "../utils/xgboost_utils.h"
|
|
||||||
#include "../utils/xgboost_stream.h"
|
|
||||||
|
|
||||||
namespace xgboost{
|
|
||||||
namespace regression{
|
|
||||||
/*! \brief data matrix for regression content */
|
|
||||||
struct DMatrix{
|
|
||||||
public:
|
|
||||||
/*! \brief feature data content */
|
|
||||||
booster::FMatrixS data;
|
|
||||||
/*! \brief label of each instance */
|
|
||||||
std::vector<float> labels;
|
|
||||||
public:
|
|
||||||
/*! \brief default constructor */
|
|
||||||
DMatrix(void){}
|
|
||||||
|
|
||||||
/*! \brief get the number of instances */
|
|
||||||
inline size_t Size() const{
|
|
||||||
return labels.size();
|
|
||||||
}
|
|
||||||
/*!
|
|
||||||
* \brief load from text file
|
|
||||||
* \param fname name of text data
|
|
||||||
* \param silent whether print information or not
|
|
||||||
*/
|
|
||||||
inline void LoadText(const char* fname, bool silent = false){
|
|
||||||
data.Clear();
|
|
||||||
FILE* file = utils::FopenCheck(fname, "r");
|
|
||||||
float label; bool init = true;
|
|
||||||
char tmp[1024];
|
|
||||||
std::vector<booster::bst_uint> findex;
|
|
||||||
std::vector<booster::bst_float> fvalue;
|
|
||||||
|
|
||||||
while (fscanf(file, "%s", tmp) == 1){
|
|
||||||
unsigned index; float value;
|
|
||||||
if (sscanf(tmp, "%u:%f", &index, &value) == 2){
|
|
||||||
findex.push_back(index); fvalue.push_back(value);
|
|
||||||
}
|
|
||||||
else{
|
|
||||||
if (!init){
|
|
||||||
labels.push_back(label);
|
|
||||||
data.AddRow(findex, fvalue);
|
|
||||||
}
|
|
||||||
findex.clear(); fvalue.clear();
|
|
||||||
utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format");
|
|
||||||
init = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
labels.push_back(label);
|
|
||||||
data.AddRow(findex, fvalue);
|
|
||||||
// initialize column support as well
|
|
||||||
data.InitData();
|
|
||||||
|
|
||||||
if (!silent){
|
|
||||||
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
|
||||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
|
||||||
}
|
|
||||||
fclose(file);
|
|
||||||
}
|
|
||||||
/*!
|
|
||||||
* \brief load from binary file
|
|
||||||
* \param fname name of binary data
|
|
||||||
* \param silent whether print information or not
|
|
||||||
* \return whether loading is success
|
|
||||||
*/
|
|
||||||
inline bool LoadBinary(const char* fname, bool silent = false){
|
|
||||||
FILE *fp = fopen64(fname, "rb");
|
|
||||||
if (fp == NULL) return false;
|
|
||||||
utils::FileStream fs(fp);
|
|
||||||
data.LoadBinary(fs);
|
|
||||||
labels.resize(data.NumRow());
|
|
||||||
utils::Assert(fs.Read(&labels[0], sizeof(float)* data.NumRow()) != 0, "DMatrix LoadBinary");
|
|
||||||
fs.Close();
|
|
||||||
// initialize column support as well
|
|
||||||
data.InitData();
|
|
||||||
|
|
||||||
if (!silent){
|
|
||||||
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
|
||||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
/*!
|
|
||||||
* \brief save to binary file
|
|
||||||
* \param fname name of binary data
|
|
||||||
* \param silent whether print information or not
|
|
||||||
*/
|
|
||||||
inline void SaveBinary(const char* fname, bool silent = false){
|
|
||||||
// initialize column support as well
|
|
||||||
data.InitData();
|
|
||||||
|
|
||||||
utils::FileStream fs(utils::FopenCheck(fname, "wb"));
|
|
||||||
data.SaveBinary(fs);
|
|
||||||
fs.Write(&labels[0], sizeof(float)* data.NumRow());
|
|
||||||
fs.Close();
|
|
||||||
if (!silent){
|
|
||||||
printf("%ux%u matrix with %lu entries is saved to %s\n",
|
|
||||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/*!
|
|
||||||
* \brief cache load data given a file name, if filename ends with .buffer, direct load binary
|
|
||||||
* otherwise the function will first check if fname + '.buffer' exists,
|
|
||||||
* if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file,
|
|
||||||
* and try to create a buffer file
|
|
||||||
* \param fname name of binary data
|
|
||||||
* \param silent whether print information or not
|
|
||||||
* \param savebuffer whether do save binary buffer if it is text
|
|
||||||
*/
|
|
||||||
inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){
|
|
||||||
int len = strlen(fname);
|
|
||||||
if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
|
|
||||||
this->LoadBinary(fname, silent); return;
|
|
||||||
}
|
|
||||||
char bname[1024];
|
|
||||||
sprintf(bname, "%s.buffer", fname);
|
|
||||||
if (!this->LoadBinary(bname, silent)){
|
|
||||||
this->LoadText(fname, silent);
|
|
||||||
if (savebuffer) this->SaveBinary(bname, silent);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
Loading…
x
Reference in New Issue
Block a user