diff --git a/booster/xgboost_data.h b/booster/xgboost_data.h index c879c2d53..777fa2b0a 100644 --- a/booster/xgboost_data.h +++ b/booster/xgboost_data.h @@ -321,6 +321,8 @@ namespace xgboost{ fi.Read(&col_access, sizeof(int)); if (col_access != 0){ FMatrixS::LoadBinary(fi, col_ptr_, col_data_); + }else{ + this->InitData(); } } /*! diff --git a/python/Makefile b/python/Makefile index 4b90f7017..0db0a1ed0 100644 --- a/python/Makefile +++ b/python/Makefile @@ -1,19 +1,18 @@ export CC = gcc export CXX = g++ -export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp +export CFLAGS = -Wall -msse2 -Wno-unknown-pragmas -fopenmp # specify tensor path -SLIB = xgboostpy.so -OBJ = xgboost_python.o +SLIB = libxgboostpy.so .PHONY: clean all all: $(SLIB) export LDFLAGS= -pthread -lm -xgboostpy.so: xgboost_python.cpp ../regrank/*.h ../booster/*.h ../booster/*/*.hpp ../booster/*.hpp +libxgboostpy.so: xgboost_python.cpp ../regrank/*.h ../booster/*.h ../booster/*/*.hpp ../booster/*.hpp $(SLIB) : - $(CXX) $(CFLAGS) $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^) + $(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^) $(BIN) : $(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) diff --git a/python/xgboost.py b/python/xgboost.py index ab7024d4d..580b80306 100644 --- a/python/xgboost.py +++ b/python/xgboost.py @@ -2,7 +2,7 @@ import ctypes # load in xgboost library -#xglib = ctypes.cdll.LoadLibrary('./libxgboostpy.so') +xglib = ctypes.cdll.LoadLibrary('./libxgboostpy.so') # entry type of sparse matrix class REntry(ctypes.Structure): @@ -10,6 +10,13 @@ class REntry(ctypes.Structure): class DMatrix: - def __init__(fname = None): - self.__handle = xglib. 
- + def __init__(self,fname = None): + self.__handle = xglib.XGDMatrixCreate(); + if fname != None: + xglib.XGDMatrixLoad(self.__handle, ctypes.c_char_p(fname), 0) + def __del__(self): + xglib.XGDMatrixFree(self.__handle) + +dmata = DMatrix('xx.buffer') + + diff --git a/python/xgboost_python.cpp b/python/xgboost_python.cpp index 5d020066f..ee97c68d3 100644 --- a/python/xgboost_python.cpp +++ b/python/xgboost_python.cpp @@ -1,12 +1,41 @@ #include "xgboost_python.h" +#include "../regrank/xgboost_regrank.h" +#include "../regrank/xgboost_regrank_data.h" -void* XGDMatrixCreate(void){ - return NULL; -} -void XGDMatrixFree(void *handle){ -} -void XGDMatrixLoad(void *handle, const char *fname){ -} -void XGDMatrixSaveBinary( void *handle, const char *fname ){ -} +namespace xgboost{ + namespace python{ + class DMatrix: public regrank::DMatrix{ + public: + // whether column is initialized + bool init_col_; + public: + DMatrix(void){ + init_col_ = false; + } + ~DMatrix(void){} + public: + inline void Load(const char *fname, bool silent){ + this->CacheLoad(fname, silent); + init_col_ = this->data.HaveColAccess(); + } + }; + }; +}; + +using namespace xgboost::python; + +extern "C"{ + void* XGDMatrixCreate(void){ + return new DMatrix(); + } + void XGDMatrixFree(void *handle){ + delete static_cast<DMatrix*>(handle); + } + void XGDMatrixLoad(void *handle, const char *fname, int silent){ + static_cast<DMatrix*>(handle)->Load(fname, silent!=0); + } + void XGDMatrixSaveBinary(void *handle, const char *fname, int silent){ + static_cast<DMatrix*>(handle)->SaveBinary(fname, silent!=0); + } +}; diff --git a/python/xgboost_python.h b/python/xgboost_python.h index 56af7a095..ead07200d 100644 --- a/python/xgboost_python.h +++ b/python/xgboost_python.h @@ -7,37 +7,51 @@ * use c style interface */ #include "../booster/xgboost_data.h" -/*! \brief type of row entry */ -typedef xgboost::booster::FMatrixS::REntry XGEntry; - -/*!
- * \brief create a data matrix - * \return a new data matrix - */ -void* XGDMatrixCreate(void); -/*! - * \brief free space in data matrix - */ -void XGDMatrixFree(void *handle); -/*! - * \brief load a data matrix from text file or buffer(if exists) - * \param handle a instance of data matrix - * \param fname file name - */ -void XGDMatrixLoad(void *handle, const char *fname); -/*! - * \brief load a data matrix into binary file - * \param handle a instance of data matrix - * \param fname file name - */ -void XGDMatrixSaveBinary( void *handle, const char *fname ); -/*! - * \brief add row - * \param handle a instance of data matrix - * \param fname file name - * \return a new data matrix - */ -//void XGDMatrixPush( void *handle, const std::pair ); +extern "C"{ + /*! \brief type of row entry */ + typedef xgboost::booster::FMatrixS::REntry XGEntry; + + /*! + * \brief create a data matrix + * \return a new data matrix + */ + void* XGDMatrixCreate(void); + /*! + * \brief free space in data matrix + */ + void XGDMatrixFree(void *handle); + /*! + * \brief load a data matrix from text file or buffer(if exists) + * \param handle a instance of data matrix + * \param fname file name + * \param silent print statistics when loading + */ + void XGDMatrixLoad(void *handle, const char *fname, int silent); + /*! + * \brief load a data matrix into binary file + * \param handle a instance of data matrix + * \param fname file name + * \param silent print statistics when saving + */ + void XGDMatrixSaveBinary(void *handle, const char *fname, int silent); + /*! + * \brief add row + * \param handle a instance of data matrix + * \param fname file name + * \return a new data matrix + */ + void XGDMatrixPush(void *handle, const XGEntry *data, int len); + + /*! + * \brief create a booster + */ + void* XGBoostCreate(void); + /*! 
+ * \brief create a booster + */ + void* XGBoost(void); + +}; #endif diff --git a/regrank/xgboost_regrank.h b/regrank/xgboost_regrank.h index ea8f3dd3d..c99241a4a 100644 --- a/regrank/xgboost_regrank.h +++ b/regrank/xgboost_regrank.h @@ -28,40 +28,36 @@ namespace xgboost{ name_obj_ = "reg"; } /*! - * \brief a regression booter associated with training and evaluating data - * \param train pointer to the training data - * \param evals array of evaluating data - * \param evname name of evaluation data, used print statistics - */ - RegRankBoostLearner(const DMatrix *train, - const std::vector<const DMatrix *> &evals, - const std::vector<std::string> &evname){ + * \brief a regression booter associated with training and evaluating data + * \param mats array of pointers to matrix whose prediction result need to be cached + */ + RegRankBoostLearner(const std::vector<const DMatrix *> mats){ silent = 0; - this->SetData(train, evals, evname); - } - + obj_ = NULL; + name_obj_ = "reg"; + this->SetCacheData(mats); + } /*! - * \brief associate regression booster with training and evaluating data - * \param train pointer to the training data - * \param evals array of evaluating data - * \param evname name of evaluation data, used print statistics - */ - inline void SetData(const DMatrix *train, - const std::vector<const DMatrix *> &evals, - const std::vector<std::string> &evname){ - this->train_ = train; - this->evals_ = evals; - this->evname_ = evname; + * \brief add internal cache space for mat, this can speedup prediction for matrix, + * please cache prediction for training and eval data + * warning: if the model is loaded from file from some previous training history + * set cache data must be called with exactly SAME + * data matrices to continue training otherwise it will cause error + * \param mats array of pointers to matrix whose prediction result need to be cached + */ + inline void SetCacheData(const std::vector<const DMatrix *> mats){ // estimate feature bound - int num_feature = (int)(train->data.NumCol()); + int num_feature = 0; // assign buffer index -
unsigned buffer_size = static_cast<unsigned>(train->Size()); + unsigned buffer_size = 0; - for (size_t i = 0; i < evals.size(); ++i){ - buffer_size += static_cast<unsigned>(evals[i]->Size()); - num_feature = std::max(num_feature, (int)(evals[i]->data.NumCol())); + utils::Assert( cache_.size() == 0, "can only call cache data once" ); + for( size_t i = 0; i < mats.size(); ++i ){ + cache_.push_back( CacheEntry( mats[i], buffer_size ) ); + buffer_size += static_cast<unsigned>(mats[i]->Size()); + num_feature = std::max(num_feature, (int)(mats[i]->data.NumCol())); } - + char str_temp[25]; if (num_feature > mparam.num_feature){ mparam.num_feature = num_feature; @@ -74,15 +70,13 @@ namespace xgboost{ if (!silent){ printf("buffer_size=%u\n", buffer_size); } - - // set eval_preds tmp sapce - this->eval_preds_.resize(evals.size(), std::vector<float>()); } + /*! - * \brief set parameters from outside - * \param name name of the parameter - * \param val value of the parameter - */ + * \brief set parameters from outside + * \param name name of the parameter + * \param val value of the parameter + */ inline void SetParam(const char *name, const char *val){ if (!strcmp(name, "silent")) silent = atoi(val); if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val); @@ -104,8 +98,8 @@ evaluator_.AddEval( obj_->DefaultEvalMetric() ); } /*!
- * \brief initialize the current data storage for model, if the model is used first time, call this function - */ + * \brief initialize the current data storage for model, if the model is used first time, call this function + */ inline void InitModel(void){ base_gbm.InitModel(); mparam.AdjustBase(); @@ -147,74 +141,66 @@ * \brief update the model for one iteration * \param iteration iteration number */ - inline void UpdateOneIter(int iter){ - this->PredictBuffer(preds_, *train_, 0); - obj_->GetGradient(preds_, train_->info, base_gbm.NumBoosters(), grad_, hess_); + inline void UpdateOneIter(int iter, const DMatrix &train){ + this->PredictRaw(preds_, train); + obj_->GetGradient(preds_, train.info, base_gbm.NumBoosters(), grad_, hess_); + // do boost std::vector<unsigned> root_index; - base_gbm.DoBoost(grad_, hess_, train_->data, root_index); + base_gbm.DoBoost(grad_, hess_, train.data, root_index); } /*! * \brief evaluate the model for specific iteration * \param iter iteration number + * \param evals datas i want to evaluate + * \param evname name of each dataset * \param fo file to output log */ - inline void EvalOneIter(int iter, FILE *fo = stderr){ + inline void EvalOneIter(int iter, + const std::vector<const DMatrix*> &evals, + const std::vector<std::string> &evname, + FILE *fo=stderr ){ fprintf(fo, "[%d]", iter); - int buffer_offset = static_cast<int>(train_->Size()); - - for (size_t i = 0; i < evals_.size(); ++i){ - std::vector<float> &preds = this->eval_preds_[i]; - this->PredictBuffer(preds, *evals_[i], buffer_offset); - obj_->PredTransform(preds); - evaluator_.Eval(fo, evname_[i].c_str(), preds, evals_[i]->info); - buffer_offset += static_cast<int>(evals_[i]->Size()); + for (size_t i = 0; i < evals.size(); ++i){ + this->PredictRaw(preds_, *evals[i]); + obj_->PredTransform(preds_); + evaluator_.Eval(fo, evname[i].c_str(), preds_, evals[i]->info); } fprintf(fo, "\n"); fflush(fo); } /*!
\brief get prediction, without buffering */ inline void Predict(std::vector<float> &preds, const DMatrix &data){ - preds.resize(data.Size()); - const unsigned ndata = static_cast<unsigned>(data.Size()); - #pragma omp parallel for schedule( static ) - for (unsigned j = 0; j < ndata; ++j){ - preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, -1); - } + this->PredictRaw(preds,data); obj_->PredTransform( preds ); } public: /*! * \brief interactive update * \param action action type + * \param train training data */ - inline void UpdateInteract(std::string action){ - this->InteractPredict(preds_, *train_, 0); - - int buffer_offset = static_cast<int>(train_->Size()); - for (size_t i = 0; i < evals_.size(); ++i){ - std::vector<float> &preds = this->eval_preds_[i]; - this->InteractPredict(preds, *evals_[i], buffer_offset); - buffer_offset += static_cast<int>(evals_[i]->Size()); + inline void UpdateInteract(std::string action, const DMatrix& train){ + for(size_t i = 0; i < cache_.size(); ++i){ + this->InteractPredict(preds_, *cache_[i].mat_); } if (action == "remove"){ base_gbm.DelteBooster(); return; } - obj_->GetGradient(preds_, train_->info, base_gbm.NumBoosters(), grad_, hess_); + obj_->GetGradient(preds_, train.info, base_gbm.NumBoosters(), grad_, hess_); std::vector<unsigned> root_index; - base_gbm.DoBoost(grad_, hess_, train_->data, root_index); + base_gbm.DoBoost(grad_, hess_, train.data, root_index); - this->InteractRePredict(*train_, 0); - buffer_offset = static_cast<int>(train_->Size()); - for (size_t i = 0; i < evals_.size(); ++i){ - this->InteractRePredict(*evals_[i], buffer_offset); - buffer_offset += static_cast<int>(evals_[i]->Size()); + for(size_t i = 0; i < cache_.size(); ++i){ + this->InteractRePredict(*cache_[i].mat_); } } private: /*!
\brief get the transformed predictions, given data */ - inline void InteractPredict(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset){ + inline void InteractPredict(std::vector<float> &preds, const DMatrix &data){ + int buffer_offset = this->FindBufferOffset(data); + utils::Assert( buffer_offset >=0, "interact mode must cache training data" ); preds.resize(data.Size()); const unsigned ndata = static_cast<unsigned>(data.Size()); #pragma omp parallel for schedule( static ) @@ -224,7 +210,9 @@ obj_->PredTransform( preds ); } /*! \brief repredict trial */ - inline void InteractRePredict(const DMatrix &data, unsigned buffer_offset){ + inline void InteractRePredict(const DMatrix &data){ + int buffer_offset = this->FindBufferOffset(data); + utils::Assert( buffer_offset >=0, "interact mode must cache training data" ); const unsigned ndata = static_cast<unsigned>(data.Size()); #pragma omp parallel for schedule( static ) for (unsigned j = 0; j < ndata; ++j){ @@ -232,13 +220,24 @@ } } private: - /*! \brief get the transformed predictions, given data */ - inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset){ + /*! \brief get un-transformed prediction*/ + inline void PredictRaw(std::vector<float> &preds, const DMatrix &data){ + this->PredictBuffer(preds, data, this->FindBufferOffset(data) ); + } + /*!
\brief get the un-transformed predictions, given data */ + inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, int buffer_offset){ preds.resize(data.Size()); const unsigned ndata = static_cast<unsigned>(data.Size()); - #pragma omp parallel for schedule( static ) - for (unsigned j = 0; j < ndata; ++j){ - preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j); + if( buffer_offset >= 0 ){ + #pragma omp parallel for schedule( static ) + for (unsigned j = 0; j < ndata; ++j){ + preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j); + } + }else{ + #pragma omp parallel for schedule( static ) + for (unsigned j = 0; j < ndata; ++j){ + preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, -1); + } + } } } private: @@ -260,10 +259,10 @@ memset(reserved, 0, sizeof(reserved)); } /*! - * \brief set parameters from outside - * \param name name of the parameter - * \param val value of the parameter - */ + * \brief set parameters from outside + * \param name name of the parameter + * \param val value of the parameter + */ inline void SetParam(const char *name, const char *val){ if (!strcmp("base_score", name)) base_score = (float)atof(val); if (!strcmp("loss_type", name)) loss_type = atoi(val); @@ -279,15 +278,28 @@ } } }; + private: + struct CacheEntry{ + const DMatrix *mat_; + int buffer_offset_; + CacheEntry(const DMatrix *mat, int buffer_offset) + :mat_(mat), buffer_offset_(buffer_offset){} + }; + /*!
\brief the entries indicates that we have internal prediction cache */ + std::vector<CacheEntry> cache_; + private: + // find internal bufer offset for certain matrix, if not exist, return -1 + inline int FindBufferOffset(const DMatrix &mat){ + for(size_t i = 0; i < cache_.size(); ++i){ + if( cache_[i].mat_ == &mat ) return cache_[i].buffer_offset_; + } + return -1; + } private: int silent; EvalSet evaluator_; booster::GBMBase base_gbm; - ModelParam mparam; - const DMatrix *train_; - std::vector<const DMatrix *> evals_; - std::vector<std::string> evname_; - std::vector<unsigned> buffer_index_; + ModelParam mparam; // objective fnction IObjFunction *obj_; // name of objective function @@ -295,7 +307,6 @@ std::vector< std::pair<std::string, std::string> > cfg_; private: std::vector<float> grad_, hess_, preds_; - std::vector< std::vector<float> > eval_preds_; }; } }; diff --git a/regrank/xgboost_regrank_data.h b/regrank/xgboost_regrank_data.h index 3653021c5..458ad60f0 100644 --- a/regrank/xgboost_regrank_data.h +++ b/regrank/xgboost_regrank_data.h @@ -116,8 +116,6 @@ namespace xgboost{ } } fs.Close(); - // initialize column support as well - data.InitData(); if (!silent){ printf("%ux%u matrix with %lu entries is loaded from %s\n", diff --git a/regrank/xgboost_regrank_main.cpp b/regrank/xgboost_regrank_main.cpp index 862837c97..165ff2636 100644 --- a/regrank/xgboost_regrank_main.cpp +++ b/regrank/xgboost_regrank_main.cpp @@ -62,6 +62,7 @@ namespace xgboost{ if (!strcmp("seed", name)) random::Seed(atoi(val)); if (!strcmp("num_round", name)) num_round = atoi(val); if (!strcmp("save_period", name)) save_period = atoi(val); + if (!strcmp("eval_train", name)) eval_train = atoi(val); if (!strcmp("task", name)) task = val; if (!strcmp("data", name)) train_path = val; if (!strcmp("test:data", name)) test_path = val; @@ -92,6 +93,7 @@ use_buffer = 1; num_round = 10; save_period = 0; + eval_train = 0; dump_model_stats = 0; task = "train"; model_in = "NULL"; @@ -122,9 +124,22 @@ for (size_t i = 0; i <
eval_data_names.size(); ++i){ deval.push_back(new DMatrix()); deval.back()->CacheLoad(eval_data_paths[i].c_str(), silent != 0, use_buffer != 0); + devalall.push_back(deval.back()); } + std::vector<const DMatrix*> dcache(1, &data); + for( size_t i = 0; i < deval.size(); ++ i){ + dcache.push_back( deval[i] ); + } + // set cache data to be all training and evaluation data + learner.SetCacheData(dcache); + + // add training set to evaluation set if needed + if( eval_train != 0 ){ + devalall.push_back( &data ); + eval_data_names.push_back( std::string("train") ); + } + } - learner.SetData(&data, deval, eval_data_names); } inline void InitLearner(void){ cfg.BeforeFirst(); @@ -148,8 +163,8 @@ for (int i = 0; i < num_round; ++i){ elapsed = (unsigned long)(time(NULL) - start); if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed); - learner.UpdateOneIter(i); - learner.EvalOneIter(i); + learner.UpdateOneIter(i, data); + learner.EvalOneIter(i, devalall, eval_data_names); if (save_period != 0 && (i + 1) % save_period == 0){ this->SaveModel(i); } @@ -169,7 +184,7 @@ } } inline void TaskEval(void){ - learner.EvalOneIter(0); + learner.EvalOneIter(0, devalall, eval_data_names); } inline void TaskInteractive(void){ const time_t start = time(NULL); @@ -179,7 +194,7 @@ cfg_batch.BeforeFirst(); while (cfg_batch.Next()){ if (!strcmp(cfg_batch.name(), "run")){ - learner.UpdateInteract(interact_action); + learner.UpdateInteract(interact_action, data); batch_action += 1; } else{ @@ -188,7 +203,7 @@ } if (batch_action == 0){ - learner.UpdateInteract(interact_action); + learner.UpdateInteract(interact_action, data); } utils::Assert(model_out != "NULL", "interactive mode must specify model_out"); this->SaveModel(model_out.c_str()); @@ -235,6 +250,8 @@ namespace xgboost{ int silent; /* \brief whether use auto binary buffer */ int use_buffer; + /* \brief whether evaluate training statistics */ + int eval_train; /*
\brief number of boosting iterations */ int num_round; /* \brief the period to save the model, 0 means only save the final round model */ @@ -272,6 +289,7 @@ namespace xgboost{ private: DMatrix data; std::vector<DMatrix*> deval; + std::vector<const DMatrix*> devalall; utils::FeatMap fmap; RegRankBoostLearner learner; };