diff --git a/demo/regression/reg.conf b/demo/regression/reg.conf
index e9ec9e722..da213f466 100644
--- a/demo/regression/reg.conf
+++ b/demo/regression/reg.conf
@@ -18,8 +18,6 @@ booster_type=1
 do_reboost=0
 
-bst:num_roots=0
-
 bst:num_feature=3
 
 learning_rate=0.01
diff --git a/regression/xgboost_reg.h b/regression/xgboost_reg.h
index f067f5e3c..3fe576213 100644
--- a/regression/xgboost_reg.h
+++ b/regression/xgboost_reg.h
@@ -12,267 +12,267 @@
 #include "../utils/xgboost_stream.h"
 
 namespace xgboost{
-  namespace regression{
-    /*! \brief class for gradient boosted regression */
-    class RegBoostLearner{
-    public:
-      RegBoostLearner(bool silent = false){
-        this->silent = silent;
-      }
+    namespace regression{
+        /*! \brief class for gradient boosted regression */
+        class RegBoostLearner{
+        public:
+            RegBoostLearner(bool silent = false){
+                this->silent = silent;
+            }
 
-      /*!
-       * \brief a regression booter associated with training and evaluating data
-       * \param train pointer to the training data
-       * \param evals array of evaluating data
-       * \param evname name of evaluation data, used print statistics
-       */
-      RegBoostLearner( const DMatrix *train,
-        std::vector<const DMatrix *> evals,
-        std::vector<std::string> evname, bool silent = false ){
-        this->silent = silent;
-        SetData(train,evals,evname);
-      }
+            /*!
+             * \brief a regression booster associated with training and evaluating data
+             * \param train pointer to the training data
+             * \param evals array of evaluating data
+             * \param evname name of evaluation data, used to print statistics
+             */
+            RegBoostLearner( const DMatrix *train,
+                std::vector<const DMatrix *> evals,
+                std::vector<std::string> evname, bool silent = false ){
+                this->silent = silent;
+                SetData(train,evals,evname);
+            }
 
-      /*!
-       * \brief associate regression booster with training and evaluating data
-       * \param train pointer to the training data
-       * \param evals array of evaluating data
-       * \param evname name of evaluation data, used print statistics
-       */
-      inline void SetData(const DMatrix *train,
-        std::vector<const DMatrix *> evals,
-        std::vector<std::string> evname){
-        this->train_ = train;
-        this->evals_ = evals;
-        this->evname_ = evname;
-        //assign buffer index
-        int buffer_size = (*train).size();
-        for(int i = 0; i < evals.size(); i++){
-          buffer_size += (*evals[i]).size();
-        }
-        char str[25];
-        _itoa(buffer_size,str,10);
-        base_model.SetParam("num_pbuffer",str);
-        base_model.SetParam("num_pbuffer",str);
-      }
+            /*!
+             * \brief associate regression booster with training and evaluating data
+             * \param train pointer to the training data
+             * \param evals array of evaluating data
+             * \param evname name of evaluation data, used to print statistics
+             */
+            inline void SetData(const DMatrix *train,
+                std::vector<const DMatrix *> evals,
+                std::vector<std::string> evname){
+                this->train_ = train;
+                this->evals_ = evals;
+                this->evname_ = evname;
+                //assign buffer index: train rows first, then each evaluation set in order
+                int buffer_size = (*train).size();
+                for(int i = 0; i < evals.size(); i++){
+                    buffer_size += (*evals[i]).size();
+                }
+                char str[25];
+                sprintf(str,"%d",buffer_size); //portable replacement for the Windows-only _itoa
+                base_model.SetParam("num_pbuffer",str); //was set twice in the original
+            }
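For orientation, the buffer sizing above just counts rows: the training matrix occupies the first slots of the booster's shared prediction buffer, and each evaluation set follows it in order. A minimal standalone sketch of that layout, with made-up sizes:

#include <cstdio>
#include <vector>

int main(){
    int n_train = 100;                  //rows in the training matrix (made up)
    std::vector<int> eval_sizes;        //rows of each evaluation set (made up)
    eval_sizes.push_back(30);
    eval_sizes.push_back(20);
    int offset = n_train;
    printf("train rows use buffer slots [0, %d)\n", n_train);
    for(size_t i = 0; i < eval_sizes.size(); i++){
        printf("eval set %d uses slots [%d, %d)\n", (int)i, offset, offset + eval_sizes[i]);
        offset += eval_sizes[i];
    }
    printf("num_pbuffer = %d\n", offset);   //the total handed to the booster
    return 0;
}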
-      /*!
-       * \brief set parameters from outside
-       * \param name name of the parameter
-       * \param val value of the parameter
-       */
-      inline void SetParam( const char *name, const char *val ){
-        mparam.SetParam( name, val );
-        base_model.SetParam( name, val );
-      }
-      /*!
-       * \brief initialize solver before training, called before training
-       * this function is reserved for solver to allocate necessary space and do other preparation
-       */
-      inline void InitTrainer( void ){
-        base_model.InitTrainer();
-        InitModel();
-        mparam.AdjustBase();
-      }
+            /*!
+             * \brief set parameters from outside
+             * \param name name of the parameter
+             * \param val value of the parameter
+             */
+            inline void SetParam( const char *name, const char *val ){
+                mparam.SetParam( name, val );
+                base_model.SetParam( name, val );
+            }
+            /*!
+             * \brief initialize the solver before training;
+             * this function is reserved for the solver to allocate necessary space and do other preparation
+             */
+            inline void InitTrainer( void ){
+                base_model.InitTrainer();
+                InitModel();
+                mparam.AdjustBase();
+            }
 
-      /*!
-       * \brief initialize the current data storage for model, if the model is used first time, call this function
-       */
-      inline void InitModel( void ){
-        base_model.InitModel();
-      }
+            /*!
+             * \brief initialize the current data storage for the model; call this function when the model is used for the first time
+             */
+            inline void InitModel( void ){
+                base_model.InitModel();
+            }
 
-      /*!
-       * \brief load model from stream
-       * \param fi input stream
-       */
-      inline void LoadModel( utils::IStream &fi ){
-        utils::Assert( fi.Read( &mparam, sizeof(ModelParam) ) != 0 );
-        base_model.LoadModel( fi );
-      }
-      /*!
-       * \brief save model to stream
-       * \param fo output stream
-       */
-      inline void SaveModel( utils::IStream &fo ) const{
-        fo.Write( &mparam, sizeof(ModelParam) );
-        base_model.SaveModel( fo );
-      }
+            /*!
+             * \brief load model from stream
+             * \param fi input stream
+             */
+            inline void LoadModel( utils::IStream &fi ){
+                utils::Assert( fi.Read( &mparam, sizeof(ModelParam) ) != 0 );
+                base_model.LoadModel( fi );
+            }
+            /*!
+             * \brief save model to stream
+             * \param fo output stream
+             */
+            inline void SaveModel( utils::IStream &fo ) const{
+                fo.Write( &mparam, sizeof(ModelParam) );
+                base_model.SaveModel( fo );
+            }
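LoadModel reads back exactly what SaveModel wrote: the raw ModelParam header first, then the booster. A minimal sketch of that round trip using plain stdio, assuming utils::FileStream is a thin FILE* wrapper; ModelParamStub is a stand-in, not the real struct:

#include <cstdio>
#include <cassert>

struct ModelParamStub{ float base_score; int loss_type; };

int main(){
    ModelParamStub out = {0.5f, 1}, in;
    FILE *fp = fopen("final.model", "wb");      //SaveModel: header first
    fwrite(&out, sizeof(out), 1, fp);
    fclose(fp);
    fp = fopen("final.model", "rb");            //LoadModel: read back in the same order
    assert(fread(&in, sizeof(in), 1, fp) == 1); //mirrors the Assert in LoadModel
    fclose(fp);
    printf("base_score=%f loss_type=%d\n", in.base_score, in.loss_type);
    return 0;
}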
-      /*!
-       * \brief update the model for one iteration
-       * \param iteration the number of updating iteration
-       */
-      inline void UpdateOneIter( int iteration ){
-        std::vector<float> grad,hess,preds;
-        std::vector<unsigned> root_index;
-        booster::FMatrixS::Image train_image((*train_).data);
-        Predict(preds,*train_,0);
-        Gradient(preds,(*train_).labels,grad,hess);
-        base_model.DoBoost(grad,hess,train_image,root_index);
-        int buffer_index_offset = (*train_).size();
-        float loss = 0.0;
-        for(int i = 0; i < evals_.size();i++){
-          Predict(preds, *evals_[i], buffer_index_offset);
-          loss = mparam.Loss(preds,(*evals_[i]).labels);
-          if(!silent){
-            printf("The loss of %s data set in %d the \
-            iteration is %f",evname_[i].c_str(),&iteration,&loss);
-          }
-          buffer_index_offset += (*evals_[i]).size();
-        }
-      }
+            /*!
+             * \brief update the model for one iteration
+             * \param iteration the index of the current updating iteration
+             */
+            inline void UpdateOneIter( int iteration ){
+                std::vector<float> grad,hess,preds;
+                std::vector<unsigned> root_index;
+                booster::FMatrixS::Image train_image((*train_).data);
+                Predict(preds,*train_,0);
+                Gradient(preds,(*train_).labels,grad,hess);
+                base_model.DoBoost(grad,hess,train_image,root_index);
+                int buffer_index_offset = (*train_).size();
+                float loss = 0.0;
+                for(int i = 0; i < evals_.size();i++){
+                    Predict(preds, *evals_[i], buffer_index_offset);
+                    loss = mparam.Loss(preds,(*evals_[i]).labels);
+                    if(!silent){
+                        //pass the values themselves; the original passed &iteration and &loss to printf
+                        printf("The loss of the %s data set in iteration %d is %f\n",
+                            evname_[i].c_str(),iteration,loss);
+                    }
+                    buffer_index_offset += (*evals_[i]).size();
+                }
+            }
 
-      /*! \brief get the transformed predictions, given data */
-      inline void Predict( std::vector<float> &preds, const DMatrix &data,int buffer_index_offset = 0 ){
-        int data_size = data.size();
-        preds.resize(data_size);
-        for(int j = 0; j < data_size; j++){
-          preds[j] = mparam.PredTransform(mparam.base_score +
-            base_model.Predict(data.data[j],buffer_index_offset + j));
-        }
-      }
+            /*! \brief get the transformed predictions, given data */
+            inline void Predict( std::vector<float> &preds, const DMatrix &data, int buffer_index_offset = 0 ){
+                int data_size = data.size();
+                preds.resize(data_size);
+                for(int j = 0; j < data_size; j++){
+                    preds[j] = mparam.PredTransform(mparam.base_score +
+                        base_model.Predict(data.data[j],buffer_index_offset + j));
+                }
+            }
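For intuition about what DoBoost receives: with square loss, the per-row gradient statistics reduce each candidate leaf to a Newton step, weight = -sum(grad)/sum(hess), ignoring regularization. A toy check under that assumption, with a single constant leaf and made-up data:

#include <cstdio>
#include <vector>

int main(){
    std::vector<float> preds;  std::vector<float> labels;   //made-up data
    preds.push_back(0.5f); preds.push_back(0.5f); preds.push_back(0.5f);
    labels.push_back(1.0f); labels.push_back(0.0f); labels.push_back(1.0f);
    float sum_grad = 0.0f, sum_hess = 0.0f;
    for(size_t j = 0; j < preds.size(); j++){
        sum_grad += preds[j] - labels[j];   //first order gradient: predt - label
        sum_hess += 1.0f;                   //second order gradient for square loss
    }
    float leaf_weight = -sum_grad / sum_hess;    //the Newton step a leaf would take
    printf("optimal constant update = %f\n", leaf_weight);  //prints 0.166667
    return 0;
}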
-    private:
-      /*! \brief get the first order and second order gradient, given the transformed predictions and labels*/
-      inline void Gradient(const std::vector<float> &preds, const std::vector<float> &labels, std::vector<float> &grad,
-        std::vector<float> &hess){
-        grad.clear();
-        hess.clear();
-        for(int j = 0; j < preds.size(); j++){
-          grad.push_back(mparam.FirstOrderGradient(preds[j],labels[j]));
-          hess.push_back(mparam.SecondOrderGradient(preds[j],labels[j]));
-        }
-      }
-
-      enum LOSS_TYPE_LIST{
-        LINEAR_SQUARE,
-        LOGISTIC_NEGLOGLIKELIHOOD,
-      };
-
-      /*! \brief training parameter for regression */
-      struct ModelParam{
-        /* \brief global bias */
-        float base_score;
-        /* \brief type of loss function */
-        int loss_type;
-
-        ModelParam( void ){
-          base_score = 0.5f;
-          loss_type = 0;
-        }
-        /*!
-         * \brief set parameters from outside
-         * \param name name of the parameter
-         * \param val value of the parameter
-         */
-        inline void SetParam( const char *name, const char *val ){
-          if( !strcmp("base_score", name ) ) base_score = (float)atof( val );
-          if( !strcmp("loss_type", name ) ) loss_type = atoi( val );
-        }
-        /*!
-         * \brief adjust base_score
-         */
-        inline void AdjustBase( void ){
-          if( loss_type == 1 ){
-            utils::Assert( base_score > 0.0f && base_score < 1.0f, "sigmoid range constrain" );
-            base_score = - logf( 1.0f / base_score - 1.0f );
-          }
-        }
-        /*!
-         * \brief calculate first order gradient of loss, given transformed prediction
-         * \param predt transformed prediction
-         * \param label true label
-         * \return first order gradient
-         */
-        inline float FirstOrderGradient( float predt, float label ) const{
-          switch( loss_type ){
-          case LINEAR_SQUARE: return predt - label;
-          case 1: return predt - label;
-          default: utils::Error("unknown loss_type"); return 0.0f;
-          }
-        }
-        /*!
-         * \brief calculate second order gradient of loss, given transformed prediction
-         * \param predt transformed prediction
-         * \param label true label
-         * \return second order gradient
-         */
-        inline float SecondOrderGradient( float predt, float label ) const{
-          switch( loss_type ){
-          case LINEAR_SQUARE: return 1.0f;
-          case LOGISTIC_NEGLOGLIKELIHOOD: return predt * ( 1 - predt );
-          default: utils::Error("unknown loss_type"); return 0.0f;
-          }
-        }
+        private:
+            /*! \brief get the first order and second order gradient, given the transformed predictions and labels */
+            inline void Gradient(const std::vector<float> &preds, const std::vector<float> &labels, std::vector<float> &grad,
+                std::vector<float> &hess){
+                grad.clear();
+                hess.clear();
+                for(int j = 0; j < preds.size(); j++){
+                    grad.push_back(mparam.FirstOrderGradient(preds[j],labels[j]));
+                    hess.push_back(mparam.SecondOrderGradient(preds[j],labels[j]));
+                }
+            }
+
+            enum LOSS_TYPE_LIST{
+                LINEAR_SQUARE,
+                LOGISTIC_NEGLOGLIKELIHOOD,
+            };
+
+            /*! \brief training parameter for regression */
+            struct ModelParam{
+                /* \brief global bias */
+                float base_score;
+                /* \brief type of loss function */
+                int loss_type;
+
+                ModelParam( void ){
+                    base_score = 0.5f;
+                    loss_type = 0;
+                }
+                /*!
+                 * \brief set parameters from outside
+                 * \param name name of the parameter
+                 * \param val value of the parameter
+                 */
+                inline void SetParam( const char *name, const char *val ){
+                    if( !strcmp("base_score", name ) ) base_score = (float)atof( val );
+                    if( !strcmp("loss_type", name ) ) loss_type = atoi( val );
+                }
+                /*!
+                 * \brief adjust base_score: map it into the raw score domain for the logistic loss
+                 */
+                inline void AdjustBase( void ){
+                    if( loss_type == LOGISTIC_NEGLOGLIKELIHOOD ){
+                        utils::Assert( base_score > 0.0f && base_score < 1.0f, "sigmoid range constraint" );
+                        base_score = - logf( 1.0f / base_score - 1.0f );
+                    }
+                }
+                /*!
+                 * \brief calculate first order gradient of loss, given transformed prediction
+                 * \param predt transformed prediction
+                 * \param label true label
+                 * \return first order gradient
+                 */
+                inline float FirstOrderGradient( float predt, float label ) const{
+                    switch( loss_type ){
+                    case LINEAR_SQUARE: return predt - label;
+                    case LOGISTIC_NEGLOGLIKELIHOOD: return predt - label; //was the bare literal `case 1:`
+                    default: utils::Error("unknown loss_type"); return 0.0f;
+                    }
+                }
+                /*!
+                 * \brief calculate second order gradient of loss, given transformed prediction
+                 * \param predt transformed prediction
+                 * \param label true label
+                 * \return second order gradient
+                 */
+                inline float SecondOrderGradient( float predt, float label ) const{
+                    switch( loss_type ){
+                    case LINEAR_SQUARE: return 1.0f;
+                    case LOGISTIC_NEGLOGLIKELIHOOD: return predt * ( 1 - predt );
+                    default: utils::Error("unknown loss_type"); return 0.0f;
+                    }
+                }
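A quick way to sanity-check the logistic branch is to compare FirstOrderGradient against a finite difference of the negative log-likelihood taken with respect to the raw score: both should give p - y, and the second order term is p*(1-p). A self-contained sketch; sigmoid and nll are local stand-ins, not xgboost functions:

#include <cstdio>
#include <cmath>

static float sigmoid(float x){ return 1.0f / (1.0f + expf(-x)); }
static float nll(float x, float y){     //loss as a function of the raw score x
    float p = sigmoid(x);
    return -(y * logf(p) + (1.0f - y) * logf(1.0f - p));
}

int main(){
    float x = 0.3f, y = 1.0f, eps = 1e-3f;
    float p = sigmoid(x);
    float g_analytic = p - y;           //FirstOrderGradient for the logistic loss
    float g_numeric  = (nll(x + eps, y) - nll(x - eps, y)) / (2.0f * eps);
    printf("analytic %f vs numeric %f\n", g_analytic, g_numeric);
    printf("second order p*(1-p) = %f\n", p * (1.0f - p));
    return 0;
}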
-        /*!
-         * \brief calculating the loss, given the predictions, labels and the loss type
-         * \param preds the given predictions
-         * \param labels the given labels
-         * \return the specified loss
-         */
-        inline float Loss(const std::vector<float> &preds, const std::vector<float> &labels) const{
-          switch( loss_type ){
-          case LINEAR_SQUARE: return SquareLoss(preds,labels);
-          case LOGISTIC_NEGLOGLIKELIHOOD: return NegLoglikelihoodLoss(preds,labels);
-          default: utils::Error("unknown loss_type"); return 0.0f;
-          }
-        }
-
-        /*!
-         * \brief calculating the square loss, given the predictions and labels
-         * \param preds the given predictions
-         * \param labels the given labels
-         * \return the summation of square loss
-         */
-        inline float SquareLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
-          float ans = 0.0;
-          for(int i = 0; i < preds.size(); i++)
-            ans += pow(preds[i] - labels[i], 2);
-          return ans;
-        }
-
-        /*!
-         * \brief calculating the square loss, given the predictions and labels
-         * \param preds the given predictions
-         * \param labels the given labels
-         * \return the summation of square loss
-         */
-        inline float NegLoglikelihoodLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
-          float ans = 0.0;
-          for(int i = 0; i < preds.size(); i++)
-            ans -= labels[i] * log(preds[i]) + ( 1 - labels[i] ) * log(1 - preds[i]);
-          return ans;
-        }
+                /*!
+                 * \brief calculating the loss, given the predictions, labels and the loss type
+                 * \param preds the given predictions
+                 * \param labels the given labels
+                 * \return the specified loss
+                 */
+                inline float Loss(const std::vector<float> &preds, const std::vector<float> &labels) const{
+                    switch( loss_type ){
+                    case LINEAR_SQUARE: return SquareLoss(preds,labels);
+                    case LOGISTIC_NEGLOGLIKELIHOOD: return NegLoglikelihoodLoss(preds,labels);
+                    default: utils::Error("unknown loss_type"); return 0.0f;
+                    }
+                }
+
+                /*!
+                 * \brief calculating the square loss, given the predictions and labels
+                 * \param preds the given predictions
+                 * \param labels the given labels
+                 * \return the summation of square loss
+                 */
+                inline float SquareLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
+                    float ans = 0.0;
+                    for(int i = 0; i < preds.size(); i++)
+                        ans += pow(preds[i] - labels[i], 2);
+                    return ans;
+                }
+
+                /*!
+                 * \brief calculating the negative log-likelihood loss, given the predictions and labels
+                 * \param preds the given predictions
+                 * \param labels the given labels
+                 * \return the summation of negative log-likelihood loss
+                 */
+                inline float NegLoglikelihoodLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
+                    float ans = 0.0;
+                    for(int i = 0; i < preds.size(); i++)
+                        ans -= labels[i] * log(preds[i]) + ( 1 - labels[i] ) * log(1 - preds[i]);
+                    return ans;
+                }
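NegLoglikelihoodLoss evaluates log(preds[i]) directly, so a prediction of exactly 0 or 1 yields -inf. A hedged variant with clamping, shown as a standalone stand-in rather than a drop-in replacement; the epsilon value is an assumption:

#include <cstdio>
#include <cmath>
#include <vector>
#include <algorithm>

float SafeNegLoglikelihood(const std::vector<float> &preds, const std::vector<float> &labels){
    float ans = 0.0f;
    for(size_t i = 0; i < preds.size(); i++){
        //clamp away from 0 and 1 so logf never sees an exact boundary value
        float p = std::min(std::max(preds[i], 1e-7f), 1.0f - 1e-7f);
        ans -= labels[i] * logf(p) + (1.0f - labels[i]) * logf(1.0f - p);
    }
    return ans;
}

int main(){
    std::vector<float> preds;  std::vector<float> labels;
    preds.push_back(1.0f); preds.push_back(0.5f);   //1.0 would blow up the raw version
    labels.push_back(0.0f); labels.push_back(1.0f);
    printf("loss = %f\n", SafeNegLoglikelihood(preds, labels));
    return 0;
}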
-        /*!
-         * \brief transform the linear sum to prediction
-         * \param x linear sum of boosting ensemble
-         * \return transformed prediction
-         */
-        inline float PredTransform( float x ){
-          switch( loss_type ){
-          case LINEAR_SQUARE: return x;
-          case LOGISTIC_NEGLOGLIKELIHOOD: return 1.0f/(1.0f + expf(-x));
-          default: utils::Error("unknown loss_type"); return 0.0f;
-          }
-        }
-      };
-
-    private:
-      booster::GBMBaseModel base_model;
-      ModelParam mparam;
-      const DMatrix *train_;
-      std::vector<const DMatrix *> evals_;
-      std::vector<std::string> evname_;
-      bool silent;
-    };
-  }
+                /*!
+                 * \brief transform the linear sum to prediction
+                 * \param x linear sum of the boosting ensemble
+                 * \return transformed prediction
+                 */
+                inline float PredTransform( float x ){
+                    switch( loss_type ){
+                    case LINEAR_SQUARE: return x;
+                    case LOGISTIC_NEGLOGLIKELIHOOD: return 1.0f/(1.0f + expf(-x));
+                    default: utils::Error("unknown loss_type"); return 0.0f;
+                    }
+                }
+            };
+
+        private:
+            booster::GBMBaseModel base_model;
+            ModelParam mparam;
+            const DMatrix *train_;
+            std::vector<const DMatrix *> evals_;
+            std::vector<std::string> evname_;
+            bool silent;
+        };
+    }
 };
 #endif
diff --git a/regression/xgboost_reg_main.cpp b/regression/xgboost_reg_main.cpp
index 594069ca2..16ef5f486 100644
--- a/regression/xgboost_reg_main.cpp
+++ b/regression/xgboost_reg_main.cpp
@@ -3,13 +3,13 @@
 using namespace xgboost::regression;
 
 int main(int argc, char *argv[]){
-//  char* config_path = argv[1];
-//  bool silent = ( atoi(argv[2]) == 1 );
-  char* config_path = "c:\\cygwin64\\home\\chen\\github\\xgboost\\demo\\regression\\reg.conf";
-  bool silent = false;
-  RegBoostTrain train;
-  train.train(config_path,false);
+    //char* config_path = argv[1];
+    //bool silent = ( atoi(argv[2]) == 1 );
+    char* config_path = "c:\\cygwin64\\home\\chen\\github\\xgboost\\demo\\regression\\reg.conf";
+    bool silent = false;
+    RegBoostTrain train;
+    train.train(config_path,silent); //pass the declared flag instead of a literal false
 
-  RegBoostTest test;
-  test.test(config_path,false);
+    RegBoostTest test;
+    test.test(config_path,silent);
 }
\ No newline at end of file
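The commented-out lines in main suggest the intended argv-based entry point once the hardcoded debug path is removed. A sketch of what that might look like; the usage string and flag layout are assumptions, and the train/test calls are stubbed out as comments so the sketch compiles on its own:

#include <cstdio>
#include <cstdlib>

int main(int argc, char *argv[]){
    if(argc < 2){
        fprintf(stderr, "usage: xgboost_reg <config_path> [silent]\n");
        return 1;
    }
    char *config_path = argv[1];
    bool silent = (argc > 2 && atoi(argv[2]) == 1);
    //RegBoostTrain train; train.train(config_path, silent);
    //RegBoostTest  test;  test.test(config_path, silent);
    printf("config=%s silent=%d\n", config_path, (int)silent);
    return 0;
}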
diff --git a/regression/xgboost_reg_test.h b/regression/xgboost_reg_test.h
index 2a9020ed6..cccdca5fa 100644
--- a/regression/xgboost_reg_test.h
+++ b/regression/xgboost_reg_test.h
@@ -11,89 +11,89 @@
 using namespace xgboost::utils;
 
 namespace xgboost{
-  namespace regression{
-    /*!
-     * \brief wrapping the testing process of the gradient
-        boosting regression model,given the configuation
-     * \author Kailong Chen: chenkl198812@gmail.com
-     */
-    class RegBoostTest{
-    public:
-      /*!
-       * \brief to start the testing process of gradient boosting regression
-       *  model given the configuation, and finally save the prediction
-       *  results to the specified paths.
-       * \param config_path the location of the configuration
-       * \param silent whether to print feedback messages
-       */
-      void test(char* config_path,bool silent = false){
-        reg_boost_learner = new xgboost::regression::RegBoostLearner(silent);
-        ConfigIterator config_itr(config_path);
-        //Get the training data and validation data paths, config the Learner
-        while (config_itr.Next()){
-          reg_boost_learner->SetParam(config_itr.name(),config_itr.val());
-          test_param.SetParam(config_itr.name(),config_itr.val());
-        }
-
-        Assert(test_param.test_paths.size() == test_param.test_names.size(),
-          "The number of test data set paths is not the same as the number of test data set data set names");
-
-        //begin testing
-        reg_boost_learner->InitModel();
-        char model_path[256];
-        std::vector<float> preds;
-        for(int i = 0; i < test_param.test_paths.size(); i++){
-          xgboost::regression::DMatrix test_data;
-          test_data.LoadText(test_param.test_paths[i].c_str());
-          sprintf(model_path,"%s/final.model",test_param.model_dir_path);
-          FileStream fin(fopen(model_path,"r"));
-          reg_boost_learner->LoadModel(fin);
-          fin.Close();
-          reg_boost_learner->Predict(preds,test_data);
-        }
-      }
+    namespace regression{
+        /*!
+         * \brief wrapping the testing process of the gradient
+         *  boosting regression model, given the configuration
+         * \author Kailong Chen: chenkl198812@gmail.com
+         */
+        class RegBoostTest{
+        public:
+            /*!
+             * \brief start the testing process of the gradient boosting regression
+             *  model given the configuration, and finally save the prediction
+             *  results to the specified paths
+             * \param config_path the location of the configuration
+             * \param silent whether to print feedback messages
+             */
+            void test(char* config_path,bool silent = false){
+                reg_boost_learner = new xgboost::regression::RegBoostLearner(silent);
+                ConfigIterator config_itr(config_path);
+                //Get the test data paths and configure the Learner
+                while (config_itr.Next()){
+                    reg_boost_learner->SetParam(config_itr.name(),config_itr.val());
+                    test_param.SetParam(config_itr.name(),config_itr.val());
+                }
+
+                Assert(test_param.test_paths.size() == test_param.test_names.size(),
+                    "The number of test data set paths is not the same as the number of test data set names");
+
+                //begin testing: load the final model once per data set and predict
+                reg_boost_learner->InitModel();
+                char model_path[256];
+                std::vector<float> preds;
+                for(int i = 0; i < test_param.test_paths.size(); i++){
+                    xgboost::regression::DMatrix test_data;
+                    test_data.LoadText(test_param.test_paths[i].c_str());
+                    sprintf(model_path,"%s/final.model",test_param.model_dir_path);
+                    FileStream fin(fopen(model_path,"rb")); //binary mode; the model is a binary dump
+                    reg_boost_learner->LoadModel(fin);
+                    fin.Close();
+                    reg_boost_learner->Predict(preds,test_data);
+                }
+            }
 
-    private:
-      struct TestParam{
-        /* \brief upperbound of the number of boosters */
-        int boost_iterations;
-
-        /* \brief the period to save the model, -1 means only save the final round model */
-        int save_period;
-
-        /* \brief the path of directory containing the saved models */
-        char model_dir_path[256];
-
-        /* \brief the path of directory containing the output prediction results */
-        char pred_dir_path[256];
-
-        /* \brief the paths of test data sets */
-        std::vector<std::string> test_paths;
-
-        /* \brief the names of the test data sets */
-        std::vector<std::string> test_names;
-
-        /*!
-         * \brief set parameters from outside
-         * \param name name of the parameter
-         * \param val value of the parameter
-         */
-        inline void SetParam(const char *name,const char *val ){
-          if( !strcmp("model_dir_path", name ) ) strcpy(model_dir_path,val);
-          if( !strcmp("pred_dir_path", name ) ) strcpy(pred_dir_path,val);
-          if( !strcmp("test_paths", name) ) {
-            test_paths = StringProcessing::split(val,';');
-          }
-          if( !strcmp("test_names", name) ) {
-            test_names = StringProcessing::split(val,';');
-          }
-        }
-      };
-
-      TestParam test_param;
-      xgboost::regression::RegBoostLearner* reg_boost_learner;
-    };
-  }
+        private:
+            struct TestParam{
+                /* \brief upper bound of the number of boosters */
+                int boost_iterations;
+
+                /* \brief the period to save the model, -1 means only save the final round model */
+                int save_period;
+
+                /* \brief the path of the directory containing the saved models */
+                char model_dir_path[256];
+
+                /* \brief the path of the directory containing the output prediction results */
+                char pred_dir_path[256];
+
+                /* \brief the paths of the test data sets */
+                std::vector<std::string> test_paths;
+
+                /* \brief the names of the test data sets */
+                std::vector<std::string> test_names;
+
+                /*!
+                 * \brief set parameters from outside
+                 * \param name name of the parameter
+                 * \param val value of the parameter
+                 */
+                inline void SetParam(const char *name,const char *val ){
+                    if( !strcmp("model_dir_path", name ) ) strcpy(model_dir_path,val);
+                    if( !strcmp("pred_dir_path", name ) ) strcpy(pred_dir_path,val);
+                    if( !strcmp("test_paths", name) ) {
+                        test_paths = StringProcessing::split(val,';');
+                    }
+                    if( !strcmp("test_names", name) ) {
+                        test_names = StringProcessing::split(val,';');
+                    }
+                }
+            };
+
+            TestParam test_param;
+            xgboost::regression::RegBoostLearner* reg_boost_learner;
+        };
+    }
 }
 #endif
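test() fills preds but never writes it out, even though TestParam carries a pred_dir_path. One plausible way to persist the results, sketched with a hypothetical SavePreds helper; the file layout and naming convention are assumptions, not part of this diff:

#include <cstdio>
#include <string>
#include <vector>

void SavePreds(const std::vector<float> &preds, const char *pred_dir_path,
               const std::string &name){
    char path[256];
    sprintf(path, "%s/%s.pred", pred_dir_path, name.c_str());  //assumed layout
    FILE *fp = fopen(path, "w");
    if(fp == NULL) return;                 //the directory may not exist
    for(size_t i = 0; i < preds.size(); i++){
        fprintf(fp, "%f\n", preds[i]);     //one prediction per line
    }
    fclose(fp);
}

int main(){
    std::vector<float> preds;
    preds.push_back(0.1f); preds.push_back(0.9f);
    SavePreds(preds, ".", "demo");
    return 0;
}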
diff --git a/regression/xgboost_reg_train.h b/regression/xgboost_reg_train.h
index eb6a6198a..c994df566 100644
--- a/regression/xgboost_reg_train.h
+++ b/regression/xgboost_reg_train.h
@@ -12,120 +12,120 @@
 using namespace xgboost::utils;
 
 namespace xgboost{
-  namespace regression{
-    /*!
-     * \brief wrapping the training process of the gradient
-        boosting regression model,given the configuation
-     * \author Kailong Chen: chenkl198812@gmail.com
-     */
-    class RegBoostTrain{
-    public:
-      /*!
-       * \brief to start the training process of gradient boosting regression
-       *  model given the configuation, and finally saved the models
-       *  to the specified model directory
-       * \param config_path the location of the configuration
-       * \param silent whether to print feedback messages
-       */
-      void train(char* config_path,bool silent = false){
-        reg_boost_learner = new xgboost::regression::RegBoostLearner(silent);
-        ConfigIterator config_itr(config_path);
-        //Get the training data and validation data paths, config the Learner
-        while (config_itr.Next()){
-          printf("%s %s\n",config_itr.name(),config_itr.val());
-          reg_boost_learner->SetParam(config_itr.name(),config_itr.val());
-          train_param.SetParam(config_itr.name(),config_itr.val());
-        }
-
-        Assert(train_param.validation_data_paths.size() == train_param.validation_data_names.size(),
-          "The number of validation paths is not the same as the number of validation data set names");
-
-        //Load Data
-        xgboost::regression::DMatrix train;
-        printf("%s",train_param.train_path);
-        train.LoadText(train_param.train_path);
-        std::vector<const xgboost::regression::DMatrix *> evals;
-        for(int i = 0; i < train_param.validation_data_paths.size(); i++){
-          xgboost::regression::DMatrix eval;
-          eval.LoadText(train_param.validation_data_paths[i].c_str());
-          evals.push_back(&eval);
-        }
-        reg_boost_learner->SetData(&train,evals,train_param.validation_data_names);
-
-        //begin training
-        reg_boost_learner->InitTrainer();
-        char suffix[256];
-        for(int i = 1; i <= train_param.boost_iterations; i++){
-          reg_boost_learner->UpdateOneIter(i);
-          if(train_param.save_period != 0 && i % train_param.save_period == 0){
-            sscanf(suffix,"%d.model",i);
-            SaveModel(suffix);
-          }
-        }
-
-        //save the final round model
-        SaveModel("final.model");
-      }
+    namespace regression{
+        /*!
+         * \brief wrapping the training process of the gradient
+         *  boosting regression model, given the configuration
+         * \author Kailong Chen: chenkl198812@gmail.com
+         */
+        class RegBoostTrain{
+        public:
+            /*!
+             * \brief start the training process of the gradient boosting regression
+             *  model given the configuration, and finally save the models
+             *  to the specified model directory
+             * \param config_path the location of the configuration
+             * \param silent whether to print feedback messages
+             */
+            void train(char* config_path,bool silent = false){
+                reg_boost_learner = new xgboost::regression::RegBoostLearner(silent);
+                ConfigIterator config_itr(config_path);
+                //Get the training data and validation data paths, configure the Learner
+                while (config_itr.Next()){
+                    printf("%s %s\n",config_itr.name(),config_itr.val());
+                    reg_boost_learner->SetParam(config_itr.name(),config_itr.val());
+                    train_param.SetParam(config_itr.name(),config_itr.val());
+                }
+
+                Assert(train_param.validation_data_paths.size() == train_param.validation_data_names.size(),
+                    "The number of validation paths is not the same as the number of validation data set names");
+
+                //Load Data
+                xgboost::regression::DMatrix train;
+                printf("%s\n",train_param.train_path);
+                train.LoadText(train_param.train_path);
+                std::vector<const xgboost::regression::DMatrix *> evals;
+                for(int i = 0; i < train_param.validation_data_paths.size(); i++){
+                    //allocate on the heap: the original pushed the address of a
+                    //loop-local DMatrix, which dangles once the iteration ends
+                    xgboost::regression::DMatrix *eval = new xgboost::regression::DMatrix();
+                    eval->LoadText(train_param.validation_data_paths[i].c_str());
+                    evals.push_back(eval);
+                }
+                reg_boost_learner->SetData(&train,evals,train_param.validation_data_names);
+
+                //begin training
+                reg_boost_learner->InitTrainer();
+                char suffix[256];
+                for(int i = 1; i <= train_param.boost_iterations; i++){
+                    reg_boost_learner->UpdateOneIter(i);
+                    if(train_param.save_period != 0 && i % train_param.save_period == 0){
+                        sprintf(suffix,"%d.model",i); //sprintf, not sscanf: format the checkpoint name
+                        SaveModel(suffix);
+                    }
+                }
+
+                //save the final round model
+                SaveModel("final.model");
+            }
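Both TestParam and TrainParam rely on StringProcessing::split to break semicolon-separated config values apart. The utility itself is not shown in this diff; a stand-in with the behavior the call sites assume:

#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

//assumed semantics of StringProcessing::split: split on a single delimiter char
std::vector<std::string> split(const std::string &s, char delim){
    std::vector<std::string> out;
    std::stringstream ss(s);
    std::string item;
    while(std::getline(ss, item, delim)) out.push_back(item);
    return out;
}

int main(){
    std::vector<std::string> paths = split("a.txt;b.txt", ';');  //as in validation_paths
    for(size_t i = 0; i < paths.size(); i++) printf("%s\n", paths[i].c_str());
    return 0;
}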
-    private:
-      /*! \brief save model in the model directory with specified suffix*/
-      void SaveModel(const char* suffix){
-        char model_path[256];
-        //save the final round model
-        sprintf(model_path,"%s/%s",train_param.model_dir_path,suffix);
-        FILE* file = fopen(model_path,"w");
-        FileStream fin(file);
-        reg_boost_learner->SaveModel(fin);
-        fin.Close();
-      }
+        private:
+            /*! \brief save the model in the model directory with the specified suffix */
+            void SaveModel(const char* suffix){
+                char model_path[256];
+                sprintf(model_path,"%s/%s",train_param.model_dir_path,suffix);
+                FILE* file = fopen(model_path,"wb"); //binary mode for the binary model dump
+                FileStream fout(file);
+                reg_boost_learner->SaveModel(fout);
+                fout.Close();
+            }
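With the sscanf to sprintf fix above, the checkpoint loop names intermediate models after their iteration number and SaveModel prefixes the model directory. A tiny dry run of that naming scheme; the iteration counts are made up:

#include <cstdio>

int main(){
    int boost_iterations = 6, save_period = 2;   //made-up config values
    char suffix[256];
    for(int i = 1; i <= boost_iterations; i++){
        if(save_period != 0 && i % save_period == 0){
            sprintf(suffix, "%d.model", i);      //2.model, 4.model, 6.model
            printf("would save %s\n", suffix);
        }
    }
    printf("would save final.model\n");          //always written after the loop
    return 0;
}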
-      struct TrainParam{
-        /* \brief upperbound of the number of boosters */
-        int boost_iterations;
-
-        /* \brief the period to save the model, -1 means only save the final round model */
-        int save_period;
-
-        /* \brief the path of training data set */
-        char train_path[256];
-
-        /* \brief the path of directory containing the saved models */
-        char model_dir_path[256];
-
-        /* \brief the paths of validation data sets */
-        std::vector<std::string> validation_data_paths;
-
-        /* \brief the names of the validation data sets */
-        std::vector<std::string> validation_data_names;
-
-        /*!
-         * \brief set parameters from outside
-         * \param name name of the parameter
-         * \param val value of the parameter
-         */
-        inline void SetParam(const char *name,const char *val ){
-          if( !strcmp("boost_iterations", name ) ) boost_iterations = atoi( val );
-          if( !strcmp("save_period", name ) ) save_period = atoi( val );
-          if( !strcmp("train_path", name ) ) strcpy(train_path,val);
-          if( !strcmp("model_dir_path", name ) ) {
-            strcpy(model_dir_path,val);
-          }
-          if( !strcmp("validation_paths", name) ) {
-            validation_data_paths = StringProcessing::split(val,';');
-          }
-          if( !strcmp("validation_names", name) ) {
-            validation_data_names = StringProcessing::split(val,';');
-          }
-        }
-      };
-
-      /*! \brief the parameters of the training process*/
-      TrainParam train_param;
-
-      /*! \brief the gradient boosting regression tree model*/
-      xgboost::regression::RegBoostLearner* reg_boost_learner;
-    };
-  }
+            struct TrainParam{
+                /* \brief upper bound of the number of boosters */
+                int boost_iterations;
+
+                /* \brief the period to save the model, -1 means only save the final round model */
+                int save_period;
+
+                /* \brief the path of the training data set */
+                char train_path[256];
+
+                /* \brief the path of the directory containing the saved models */
+                char model_dir_path[256];
+
+                /* \brief the paths of the validation data sets */
+                std::vector<std::string> validation_data_paths;
+
+                /* \brief the names of the validation data sets */
+                std::vector<std::string> validation_data_names;
+
+                /*!
+                 * \brief set parameters from outside
+                 * \param name name of the parameter
+                 * \param val value of the parameter
+                 */
+                inline void SetParam(const char *name,const char *val ){
+                    if( !strcmp("boost_iterations", name ) ) boost_iterations = atoi( val );
+                    if( !strcmp("save_period", name ) ) save_period = atoi( val );
+                    if( !strcmp("train_path", name ) ) strcpy(train_path,val);
+                    if( !strcmp("model_dir_path", name ) ) {
+                        strcpy(model_dir_path,val);
+                    }
+                    if( !strcmp("validation_paths", name) ) {
+                        validation_data_paths = StringProcessing::split(val,';');
+                    }
+                    if( !strcmp("validation_names", name) ) {
+                        validation_data_names = StringProcessing::split(val,';');
+                    }
+                }
+            };
+
+            /*! \brief the parameters of the training process */
+            TrainParam train_param;
+
+            /*! \brief the gradient boosting regression tree model */
+            xgboost::regression::RegBoostLearner* reg_boost_learner;
+        };
+    }
 }
 #endif
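One caveat with TrainParam: boost_iterations and save_period are assigned only if the config provides them, and neither is default-initialized, so a sparse config leaves them holding garbage. A sketch of the defaulting pattern a constructor could add; the default values here are assumptions, not xgboost's:

#include <cstdio>

//stand-in illustrating default-initialized training parameters; the real
//TrainParam above reads fields uninitialized when the config omits them
struct TrainParamStub{
    int boost_iterations;
    int save_period;
    TrainParamStub(){
        boost_iterations = 10;   //assumed default
        save_period = -1;        //assumed default: only save final.model
    }
};

int main(){
    TrainParamStub p;            //safe to read even if the config set nothing
    printf("boost_iterations=%d save_period=%d\n", p.boost_iterations, p.save_period);
    return 0;
}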
diff --git a/regression/xgboost_regdata.h b/regression/xgboost_regdata.h
index a5003e3f9..d400b74d7 100644
--- a/regression/xgboost_regdata.h
+++ b/regression/xgboost_regdata.h
@@ -2,14 +2,14 @@
 #define _XGBOOST_REGDATA_H_
 
 /*!
- * \file xgboost_regdata.h
- * \brief input data structure for regression and binary classification task.
- *   Format:
- *   The data should contain each data instance in each line.
- *   The format of line data is as below:
- *   label [feature index:feature value]+
- * \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
- */
+* \file xgboost_regdata.h
+* \brief input data structure for regression and binary classification tasks.
+*   Format:
+*   The data should contain one data instance per line, laid out as:
+*   label [feature index:feature value]+
+* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
+*/
 #include <cstdio>
 #include <vector>
 #include "../booster/xgboost_data.h"
@@ -31,17 +31,17 @@ namespace xgboost{
             /*! \brief default constructor */
             DMatrix( void ){}
-
-      /*! \brief get the number of instances */
-      inline int size() const{
-        return labels.size();
-      }
+
+            /*! \brief get the number of instances */
+            inline int size() const{
+                return labels.size();
+            }
 
             /*!
-      * \brief load from text file
-      * \param fname name of text data
-      * \param silent whether print information or not
-      */
+             * \brief load from text file
+             * \param fname name of text data
+             * \param silent whether to print information or not
+             */
             inline void LoadText( const char* fname, bool silent = false ){
                 data.Clear();
                 FILE* file = utils::FopenCheck( fname, "r" );
@@ -49,7 +49,7 @@
                 char tmp[ 1024 ];
                 std::vector<unsigned> findex;
                 std::vector<float> fvalue;
-
+
                 while( fscanf( file, "%s", tmp ) == 1 ){
                     unsigned index; float value;
                     if( sscanf( tmp, "%u:%f", &index, &value ) == 2 ){
@@ -64,23 +64,23 @@
                         init = false;
                     }
                 }
-
-        labels.push_back( label );
+
+                labels.push_back( label );
                 data.AddRow( findex, fvalue );
-
+
                 this->UpdateInfo();
                 if( !silent ){
                     printf("%ux%u matrix with %lu entries is loaded from %s\n",
-          (unsigned)labels.size(), num_feature, (unsigned long)data.NumEntry(), fname );
+                        (unsigned)labels.size(), num_feature, (unsigned long)data.NumEntry(), fname );
                 }
                 fclose(file);
             }
             /*!
-      * \brief load from binary file
-      * \param fname name of binary data
-      * \param silent whether print information or not
-      * \return whether loading is success
-      */
+             * \brief load from binary file
+             * \param fname name of binary data
+             * \param silent whether to print information or not
+             * \return whether the loading succeeded
+             */
             inline bool LoadBinary( const char* fname, bool silent = false ){
                 FILE *fp = fopen64( fname, "rb" );
                 if( fp == NULL ) return false;
@@ -92,15 +92,15 @@
                 this->UpdateInfo();
                 if( !silent ){
                     printf("%ux%u matrix with %lu entries is loaded from %s\n",
-          (unsigned)labels.size(), num_feature, (unsigned long)data.NumEntry(), fname );
+                        (unsigned)labels.size(), num_feature, (unsigned long)data.NumEntry(), fname );
                 }
                 return true;
             }
             /*!
-      * \brief save to binary file
-      * \param fname name of binary data
-      * \param silent whether print information or not
-      */
+             * \brief save to binary file
+             * \param fname name of binary data
+             * \param silent whether to print information or not
+             */
             inline void SaveBinary( const char* fname, bool silent = false ){
                 utils::FileStream fs( utils::FopenCheck( fname, "wb" ) );
                 data.SaveBinary( fs );
@@ -108,17 +108,17 @@
                 fs.Close();
                 if( !silent ){
                     printf("%ux%u matrix with %lu entries is saved to %s\n",
-          (unsigned)labels.size(), num_feature, (unsigned long)data.NumEntry(), fname );
+                        (unsigned)labels.size(), num_feature, (unsigned long)data.NumEntry(), fname );
                }
            }
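LoadText above distinguishes "findex:fvalue" tokens from bare labels by whether sscanf matches the %u:%f pattern. A standalone sketch of that dispatch on a hand-written instance:

#include <cstdio>

int main(){
    //one data instance in the documented format: label, then index:value pairs
    const char *tokens[] = {"1.0", "0:0.5", "2:1.25"};
    for(int i = 0; i < 3; i++){
        unsigned index; float value;
        if( sscanf( tokens[i], "%u:%f", &index, &value ) == 2 ){
            printf("feature %u = %f\n", index, value);   //a feature entry
        }else{
            float label;
            sscanf( tokens[i], "%f", &label );           //otherwise it is the label
            printf("label = %f\n", label);
        }
    }
    return 0;
}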
             /*!
-      * \brief cache load data given a file name, the function will first check if fname + '.xgbuffer' exists,
-      *   if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file,
-      *   and try to create a buffer file
-      * \param fname name of binary data
-      * \param silent whether print information or not
-      * \return whether loading is success
-      */
+             * \brief cache load data given a file name; the function first checks whether fname + '.buffer' exists,
+             *   and if the binary buffer exists it will read from it, otherwise it will load from the text file
+             *   and try to create the buffer file
+             * \param fname name of the data file
+             * \param silent whether to print information or not
+             */
             inline void CacheLoad( const char *fname, bool silent = false ){
                 char bname[ 1024 ];
                 sprintf( bname, "%s.buffer", fname );
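The hunk is cut off here, but the comment describes the CacheLoad flow: try the ".buffer" binary file first, and otherwise parse the text file and write the buffer for next time. A sketch of that pattern with stub loaders; the stubs are placeholders, not the DMatrix API:

#include <cstdio>

//stubs standing in for the real DMatrix loaders
bool LoadBinaryStub(const char *fname){
    FILE *fp = fopen(fname, "rb");
    if(fp == NULL) return false;
    fclose(fp);
    return true;
}
void LoadTextStub(const char *fname){ printf("parsing text file %s\n", fname); }
void SaveBinaryStub(const char *fname){ printf("writing buffer file %s\n", fname); }

int main(){
    const char *fname = "train.txt";
    char bname[1024];
    sprintf(bname, "%s.buffer", fname);   //same naming scheme as CacheLoad
    if(!LoadBinaryStub(bname)){           //no buffer yet: fall back to the text file
        LoadTextStub(fname);
        SaveBinaryStub(bname);            //cache the parsed matrix for next time
    }
    return 0;
}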