From c5ada79be574f8e9cce91f0c099134b5156d6ad4 Mon Sep 17 00:00:00 2001
From: kalenhaha
Date: Mon, 10 Feb 2014 23:40:38 +0800
Subject: [PATCH] gbrt implemented

---
 Review note: the hunk sizes and the diffstat below were recomputed for the
 revised content; the blob ids on the index lines are still those of the
 original submission.

 booster/gbrt.h                           | 143 ++++++++++++++++++++++++++++++++++
 booster/xgboost_regression_data_reader.h |  77 ++++++++++++++++++
 2 files changed, 220 insertions(+)
 create mode 100644 booster/gbrt.h
 create mode 100644 booster/xgboost_regression_data_reader.h

diff --git a/booster/gbrt.h b/booster/gbrt.h
new file mode 100644
index 000000000..6046364ea
--- /dev/null
+++ b/booster/gbrt.h
@@ -0,0 +1,143 @@
+#ifndef _GBRT_H_
+#define _GBRT_H_
+
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "../utils/xgboost_config.h"
+#include "xgboost_regression_data_reader.h"
+#include "xgboost_gbmbase.h"
+
+using namespace xgboost::utils;
+using namespace xgboost::booster;
+
+/*!
+ * \brief driver that boosts a GBMBaseModel with the first and second order
+ *        statistics of the logistic loss
+ */
+class gbrt{
+
+public:
+    /*!
+     * \brief read a configuration file and forward every name/value pair
+     *        to both the gbrt parameters and the base model
+     * \param config_path path of the configuration file
+     */
+    gbrt(const char* config_path){
+        ConfigIterator config_itr(config_path);
+        while(config_itr.Next()){
+            SetParam(config_itr.name,config_itr.val);
+            base_model.SetParam(config_itr.name,config_itr.val);
+        }
+    }
+
+    /*!
+     * \brief set a parameter of the gbrt driver itself
+     * \param name name of the parameter
+     * \param val value of the parameter
+     */
+    void SetParam( const char *name, const char *val ){
+        param.SetParam(name, val);
+    }
+
+    /*!
+     * \brief load the training data and run the boosting rounds
+     * \param num_round number of boosting rounds; the default of 100 is the
+     *        count that used to be hard-coded
+     */
+    void train( int num_round = 100 ){
+        xgboost_regression_data_reader data_reader(param.train_file_path);
+        base_model.InitModel();
+        base_model.InitTrainer();
+        const int instance_num = data_reader.InsNum();
+        if( instance_num <= 0 ){
+            std::printf("no training instance was loaded, training is skipped\n");
+            return;
+        }
+        std::vector<float> grad,hess;
+        // NOTE(review): never filled here; the element type is assumed to be
+        // the one DoBoost takes - confirm against xgboost_gbmbase.h
+        std::vector<unsigned> root_index;
+        // clear() keeps the capacity, so the buffers are allocated only once
+        grad.reserve(instance_num); hess.reserve(instance_num);
+        for(int round = 0; round < num_round; round++){
+            grad.clear();hess.clear();
+            for(int j = 0; j < instance_num; j++){
+                const float label = data_reader.GetLabel(j);
+                const float pred_transform = Logistic(base_model.Predict(data_reader.GetLine(j)));
+                grad.push_back(FirstOrderGradient(pred_transform,label));
+                hess.push_back(SecondOrderGradient(pred_transform));
+            }
+            base_model.DoBoost(grad,hess,data_reader.GetImage(),root_index );
+        }
+    }
+
+    /*! \brief parameters of the gbrt driver itself */
+    struct GBRTParam{
+        /*!
+         * \brief path of input training data; points into path_buf and is
+         *        the empty string until "train_file_path" has been set
+         */
+        const char* train_file_path;
+
+        GBRTParam( void ) : train_file_path(""){
+        }
+        /*! \brief deep copy, the copy never points into the buffer of other */
+        GBRTParam( const GBRTParam &other ) : train_file_path(""){
+            Assign( other.train_file_path );
+        }
+        /*! \brief deep copy assignment, same ownership rule as the copy constructor */
+        GBRTParam &operator=( const GBRTParam &other ){
+            if( this != &other ) Assign( other.train_file_path );
+            return *this;
+        }
+        /*!
+         * \brief set parameters from outside
+         * \param name name of the parameter
+         * \param val value of the parameter; the text is copied because the
+         *        caller's buffer need not outlive this call
+         */
+        inline void SetParam( const char *name, const char *val ){
+            if( !std::strcmp("train_file_path", name ) ) Assign( val );
+        }
+    private:
+        /*! \brief keep a private copy of path and point train_file_path at it */
+        inline void Assign( const char *path ){
+            // the copy is built first so that path may alias path_buf
+            std::string copy( path == NULL ? "" : path );
+            path_buf.swap( copy );
+            train_file_path = path_buf.c_str();
+        }
+        /*! \brief owns the characters train_file_path points to */
+        std::string path_buf;
+    };
+
+private:
+    /*!
+     * \brief first order statistic passed to DoBoost
+     *  NOTE(review): label - p is the negative of the derivative of the
+     *  logistic loss w.r.t. the raw score; confirm the sign DoBoost expects
+     */
+    inline float FirstOrderGradient(float pred_transform,float label){
+        return label - pred_transform;
+    }
+
+    /*! \brief second order statistic p * (1 - p) passed to DoBoost */
+    inline float SecondOrderGradient(float pred_transform){
+        return pred_transform * ( 1 - pred_transform );
+    }
+
+    /*! \brief logistic (sigmoid) transform of a raw prediction */
+    inline float Logistic(float x){
+        return 1.0f/(1.0f + std::exp(-x));
+    }
+
+    GBMBaseModel base_model;
+    GBRTParam param;
+
+};
+
+#endif
diff --git a/booster/xgboost_regression_data_reader.h b/booster/xgboost_regression_data_reader.h
new file mode 100644
index 000000000..13c7b0d8d
--- /dev/null
+++ b/booster/xgboost_regression_data_reader.h
@@ -0,0 +1,77 @@
+#pragma once
+#include"xgboost_data.h"
+#include <stdio.h>
+#include <vector>
+
+using namespace xgboost::booster;
+/*!
+ * \file xgboost_regression_data_reader.h
+ * \brief A reader to read the data for regression task from a specified file
+ *        The data should contain each data instance in each line.
+ *        The format of line data is as below:
+ *        label nonzero_feature_dimension[ feature_index:feature_value]+
+ * \author Kailong Chen: chenkl198812@gmail.com
+ */
+class xgboost_regression_data_reader{
+public:
+    /*! \brief construct the reader and load file_path right away */
+    xgboost_regression_data_reader(const char* file_path){
+        Load(file_path);
+    }
+    /*!
+     * \brief replace the content of the reader with the rows of file_path;
+     *        rows parsed before a malformed one are kept
+     * \param file_path path of the text file to read
+     */
+    void Load(const char* file_path){
+        data_matrix.Clear();
+        labels.clear();
+        FILE* file = fopen(file_path,"r");
+        if(file == NULL){
+            printf("The file is missing at %s\n",file_path);
+            return;
+        }
+        float label = 0.0f, value = 0.0f;
+        int nonzero_dimension = 0, num_row = 0;
+        unsigned index = 0;
+        // NOTE(review): element types assumed from FMatrixS::AddRow - confirm
+        std::vector<bst_uint> findex;
+        std::vector<bst_float> fvalue;
+        bool malformed = false;
+        while(!malformed){
+            // fscanf reports EOF, which is non-zero, once the input is used up
+            const int num_field = fscanf(file,"%f %d",&label,&nonzero_dimension);
+            if(num_field == EOF) break;
+            malformed = (num_field != 2 || nonzero_dimension < 0);
+            findex.clear();
+            fvalue.clear();
+            for(int i = 0; !malformed && i < nonzero_dimension; i++){
+                malformed = (fscanf(file," %u:%f",&index,&value) != 2);
+                if(malformed) break;
+                findex.push_back(index);
+                fvalue.push_back(value);
+            }
+            if(malformed) break;
+            data_matrix.AddRow(findex, fvalue);
+            labels.push_back(label);
+            num_row++;
+        }
+        fclose(file);  // closed on every path, the early return used to leak it
+        if(malformed){
+            printf("Malformed data after %i rows: the feature dimension is not coincident with the indicated one\n",num_row);
+            return;
+        }
+        printf("%i rows of data is loaded from %s\n",num_row,file_path);
+    }
+    /*! \brief label of the index-th loaded instance, index is not range checked */
+    float GetLabel(int index){ return labels[index]; }
+    /*! \brief feature row of the index-th loaded instance */
+    FMatrixS::Line GetLine(int index){ return data_matrix[index]; }
+    /*! \brief number of loaded instances */
+    int InsNum(){ return static_cast<int>(labels.size()); }
+    /*! \brief image built from the feature matrix */
+    FMatrixS::Image GetImage(){ return FMatrixS::Image(data_matrix); }
+private:
+    FMatrixS data_matrix;
+    std::vector<float> labels;
+};