From adc94007369b4e003b60a1bb70634878b260bd54 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 3 May 2014 22:18:25 -0700 Subject: [PATCH] finish python lib --- python/xgboost.py | 76 +++++++++++++++++++---- python/xgboost_python.cpp | 102 ++++++++++++++++++++++++++++++- python/xgboost_python.h | 27 +++++++- regrank/xgboost_regrank.h | 44 +++++++++---- regrank/xgboost_regrank_data.h | 6 +- regrank/xgboost_regrank_main.cpp | 2 +- utils/xgboost_fmap.h | 2 +- utils/xgboost_utils.h | 4 +- 8 files changed, 231 insertions(+), 32 deletions(-) diff --git a/python/xgboost.py b/python/xgboost.py index 8f64d5b0d..959b8bc1e 100644 --- a/python/xgboost.py +++ b/python/xgboost.py @@ -1,30 +1,35 @@ # module for xgboost import ctypes # optinally have scipy sparse, though not necessary +import numpy as np import scipy.sparse as scp +# set this line correctly +XGBOOST_PATH = './libxgboostpy.so' # entry type of sparse matrix class REntry(ctypes.Structure): _fields_ = [("findex", ctypes.c_uint), ("fvalue", ctypes.c_float) ] # load in xgboost library -xglib = ctypes.cdll.LoadLibrary('./libxgboostpy.so') +xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH) xglib.XGDMatrixCreate.restype = ctypes.c_void_p xglib.XGDMatrixNumRow.restype = ctypes.c_ulong xglib.XGDMatrixGetLabel.restype = ctypes.POINTER( ctypes.c_float ) xglib.XGDMatrixGetRow.restype = ctypes.POINTER( REntry ) +xglib.XGBoosterPredict.restype = ctypes.POINTER( ctypes.c_float ) # data matrix used in xgboost class DMatrix: # constructor def __init__(self, data=None, label=None): - self.handle = xglib.XGDMatrixCreate(); + self.handle = xglib.XGDMatrixCreate() if data == None: - return - if type(data) is str: + return + if isinstance(data,str): xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(data), 1) - elif type(data) is scp.csr_matrix: + + elif isinstance(data,scp.csr_matrix): self.__init_from_csr(data) else: try: @@ -54,7 +59,13 @@ class DMatrix: xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname), int(silent)) # set label 
of dmatrix def set_label(self, label): - xglib.XGDMatrixSetLabel(self.handle, (ctypes.c_float*len(label))(*label), len(label) ); + xglib.XGDMatrixSetLabel(self.handle, (ctypes.c_float*len(label))(*label), len(label) ) + # set group size of dmatrix, used for rank + def set_group(self, group): + xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group) ) + # set weight of each instances + def set_weight(self, weight): + xglib.XGDMatrixSetWeight(self.handle, (ctypes.c_uint*len(weight))(*weight), len(weight) ) # get label from dmatrix def get_label(self): length = ctypes.c_ulong() @@ -66,16 +77,57 @@ class DMatrix: def num_row(self): return xglib.XGDMatrixNumRow(self.handle) # append a row to DMatrix - def add_row(self, row, label): - xglib.XGDMatrixAddRow(self.handle, (REntry*len(row))(*row), len(row), label ) + def add_row(self, row): + xglib.XGDMatrixAddRow(self.handle, (REntry*len(row))(*row), len(row) ) # get n-throw from DMatrix def __getitem__(self, ridx): length = ctypes.c_ulong() row = xglib.XGDMatrixGetRow(self.handle, ridx, ctypes.byref(length) ); return [ (int(row[i].findex),row[i].fvalue) for i in xrange(length.value) ] +class Booster: + """learner class """ + def __init__(self, params, cache=[]): + """ constructor, param: """ + for d in cache: + assert isinstance(d,DMatrix) + dmats = ( ctypes.c_void_p * len(cache) )(*[ ctypes.c_void_p(d.handle) for d in cache]) + self.handle = xglib.XGBoosterCreate( dmats, len(cache) ) + for k, v in params.iteritems(): + xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(k), ctypes.c_char_p(str(v)) ) + def update(self, dtrain): + """ update """ + assert isinstance(dtrain, DMatrix) + xglib.XGBoosterUpdateOneIter( self.handle, dtrain.handle ) + def eval_set(self, evals, it = 0): + for d in evals: + assert isinstance(d[0], DMatrix) + assert isinstance(d[1], str) + dmats = ( ctypes.c_void_p * len(evals) )(*[ ctypes.c_void_p(d[0].handle) for d in evals]) + evnames = ( ctypes.c_char_p * len(evals) 
)(*[ ctypes.c_char_p(d[1]) for d in evals]) + xglib.XGBoosterEvalOneIter( self.handle, it, dmats, evnames, len(evals) ) + def eval(self, mat, name = 'eval', it = 0 ): + self.eval_set( [(mat,name)], it) + def predict(self, data): + length = ctypes.c_ulong() + preds = xglib.XGBoosterPredict( self.handle, data.handle, ctypes.byref(length)) + return [ preds[i] for i in xrange(length.value) ] + def save_model(self, fname): + """ save model to file """ + xglib.XGBoosterSaveModel( self.handle, ctypes.c_char_p(fname) ) + def load_model(self, fname): + """load model from file""" + xglib.XGBoosterLoadModel( self.handle, ctypes.c_char_p(fname) ) + def dump_model(self, fname, fmap=''): + """dump model into text file""" + xglib.XGBoosterDumpModel( self.handle, ctypes.c_char_p(fname), ctypes.c_char_p(fmap) ) - -mat = DMatrix('xx.buffer') -print mat.num_row() -mat.clear() +def train(params, dtrain, num_boost_round = 10, evals = []): + """ train a booster with given paramaters """ + bst = Booster(params, [dtrain] ) + for i in xrange(num_boost_round): + bst.update( dtrain ) + if len(evals) != 0: + bst.eval_set( evals, i ) + return bst + diff --git a/python/xgboost_python.cpp b/python/xgboost_python.cpp index 8fb664417..93bb4dc9d 100644 --- a/python/xgboost_python.cpp +++ b/python/xgboost_python.cpp @@ -56,20 +56,63 @@ namespace xgboost{ this->info.labels.resize( len ); memcpy( &(this->info).labels[0], label, sizeof(float)*len ); } + inline void SetGroup( const unsigned *group, size_t len ){ + this->info.group_ptr.resize( len + 1 ); + this->info.group_ptr[0] = 0; + for( size_t i = 0; i < len; ++ i ){ + this->info.group_ptr[i+1] = this->info.group_ptr[i]+group[i]; + } + } + inline void SetWeight( const float *weight, size_t len ){ + this->info.weights.resize( len ); + memcpy( &(this->info).weights[0], weight, sizeof(float)*len ); + } inline const float* GetLabel( size_t* len ) const{ *len = this->info.labels.size(); return &(this->info.labels[0]); } - inline void InitTrain(void){ - 
if(!this->data.HaveColAccess()) this->data.InitData(); + inline void CheckInit(void){ + if(!this->data.HaveColAccess()){ + this->data.InitData(); + } utils::Assert( this->data.NumRow() == this->info.labels.size(), "DMatrix: number of labels must match number of rows in matrix"); } }; + + class Booster: public xgboost::regrank::RegRankBoostLearner{ + private: + bool init_trainer, init_model; + public: + Booster(const std::vector mats){ + silent = 1; + init_trainer = false; + init_model = false; + this->SetCacheData(mats); + } + inline void CheckInit(void){ + if( !init_trainer ){ + this->InitTrainer(); init_trainer = true; + } + if( !init_model ){ + this->InitModel(); init_model = true; + } + } + inline void LoadModel( const char *fname ){ + xgboost::regrank::RegRankBoostLearner::LoadModel(fname); + this->init_model = true; + } + const float *Pred( const DMatrix &dmat, size_t *len ){ + this->Predict( this->preds_, dmat ); + *len = this->preds_.size(); + return &this->preds_[0]; + } + }; }; }; using namespace xgboost::python; + extern "C"{ void* XGDMatrixCreate( void ){ return new DMatrix(); @@ -94,6 +137,12 @@ extern "C"{ void XGDMatrixSetLabel( void *handle, const float *label, size_t len ){ static_cast(handle)->SetLabel(label,len); } + void XGDMatrixSetWeight( void *handle, const float *weight, size_t len ){ + static_cast(handle)->SetWeight(weight,len); + } + void XGDMatrixSetGroup( void *handle, const unsigned *group, size_t len ){ + static_cast(handle)->SetGroup(group,len); + } const float* XGDMatrixGetLabel( const void *handle, size_t* len ){ return static_cast(handle)->GetLabel(len); } @@ -109,5 +158,54 @@ extern "C"{ const XGEntry* XGDMatrixGetRow(void *handle, unsigned ridx, size_t* len){ return static_cast(handle)->GetRow(ridx, len); } + + // xgboost implementation + void *XGBoosterCreate( void *dmats[], size_t len ){ + std::vector mats; + for( size_t i = 0; i < len; ++i ){ + mats.push_back( static_cast(dmats[i]) ); + } + return new Booster( mats ); + } + 
void XGBoosterSetParam( void *handle, const char *name, const char *value ){ + static_cast(handle)->SetParam( name, value ); + } + void XGBoosterUpdateOneIter( void *handle, void *dtrain ){ + Booster *bst = static_cast(handle); + DMatrix *dtr = static_cast(dtrain); + bst->CheckInit(); dtr->CheckInit(); + bst->UpdateOneIter( *dtr ); + } + void XGBoosterEvalOneIter( void *handle, int iter, void *dmats[], const char *evnames[], size_t len ){ + Booster *bst = static_cast(handle); + bst->CheckInit(); + + std::vector names; + std::vector mats; + for( size_t i = 0; i < len; ++i ){ + mats.push_back( static_cast(dmats[i]) ); + names.push_back( std::string( evnames[i]) ); + } + bst->EvalOneIter( iter, mats, names, stdout ); + } + const float *XGBoosterPredict( void *handle, void *dmat, size_t *len ){ + return static_cast(handle)->Pred( *static_cast(dmat), len ); + } + void XGBoosterLoadModel( void *handle, const char *fname ){ + static_cast(handle)->LoadModel( fname ); + } + void XGBoosterSaveModel( const void *handle, const char *fname ){ + static_cast(handle)->SaveModel( fname ); + } + void XGBoosterDumpModel( void *handle, const char *fname, const char *fmap ){ + using namespace xgboost::utils; + FILE *fo = FopenCheck( fname, "w" ); + FeatMap featmap; + if( strlen(fmap) != 0 ){ + featmap.LoadText( fmap ); + } + static_cast(handle)->DumpModel( fo, featmap, false ); + fclose( fo ); + } }; diff --git a/python/xgboost_python.h b/python/xgboost_python.h index 313b4d817..9088f37d6 100644 --- a/python/xgboost_python.h +++ b/python/xgboost_python.h @@ -52,10 +52,24 @@ extern "C"{ /*! * \brief set label of the training matrix * \param handle a instance of data matrix - * \param data array of row content + * \param label pointer to label * \param len length of array */ void XGDMatrixSetLabel( void *handle, const float *label, size_t len ); + /*! 
+     * \brief set group size of the training matrix, used for ranking
+     * \param handle an instance of data matrix
+     * \param group pointer to group size
+     * \param len length of array
+     */
+    void XGDMatrixSetGroup( void *handle, const unsigned *group, size_t len );
+    /*!
+     * \brief set weight of each instance
+     * \param handle an instance of data matrix
+     * \param weight pointer to weights
+     * \param len length of array
+     */
+    void XGDMatrixSetWeight( void *handle, const float *weight, size_t len );
     /*!
      * \brief get label set from matrix
      * \param handle a instance of data matrix
@@ -94,7 +108,7 @@ extern "C"{
      * \param dmats matrices that are set to be cached
      * \param create a booster
      */
-    void *CreateXGBooster( void**dmats, size_t len );
+    void *XGBoosterCreate( void* dmats[], size_t len );
     /*!
      * \brief set parameters
      * \param handle handle
@@ -135,7 +149,14 @@ extern "C"{
      * \param handle handle
      * \param fname file name
      */
-    void XGBoosterSaveModel( void *handle, const char *fname );
+    void XGBoosterSaveModel( const void *handle, const char *fname );
+    /*!
+ * \brief dump model into text file + * \param handle handle + * \param fname file name + * \param fmap name to fmap can be empty string + */ + void XGBoosterDumpModel( void *handle, const char *fname, const char *fmap ); }; #endif diff --git a/regrank/xgboost_regrank.h b/regrank/xgboost_regrank.h index c99241a4a..2363b5eae 100644 --- a/regrank/xgboost_regrank.h +++ b/regrank/xgboost_regrank.h @@ -31,7 +31,7 @@ namespace xgboost{ * \brief a regression booter associated with training and evaluating data * \param mats array of pointers to matrix whose prediction result need to be cached */ - RegRankBoostLearner(const std::vector mats){ + RegRankBoostLearner(const std::vector& mats){ silent = 0; obj_ = NULL; name_obj_ = "reg"; @@ -45,14 +45,19 @@ namespace xgboost{ * data matrices to continue training otherwise it will cause error * \param mats array of pointers to matrix whose prediction result need to be cached */ - inline void SetCacheData(const std::vector mats){ + inline void SetCacheData(const std::vector& mats){ // estimate feature bound int num_feature = 0; // assign buffer index unsigned buffer_size = 0; - + utils::Assert( cache_.size() == 0, "can only call cache data once" ); for( size_t i = 0; i < mats.size(); ++i ){ + bool dupilicate = false; + for( size_t j = 0; j < i; ++ j ){ + if( mats[i] == mats[j] ) dupilicate = true; + } + if( dupilicate ) continue; cache_.push_back( CacheEntry( mats[i], buffer_size ) ); buffer_size += static_cast(mats[i]->Size()); num_feature = std::max(num_feature, (int)(mats[i]->data.NumCol())); @@ -105,9 +110,18 @@ namespace xgboost{ mparam.AdjustBase(); } /*! - * \brief load model from stream - * \param fi input stream - */ + * \brief load model from file + * \param fname file name + */ + inline void LoadModel(const char *fname){ + utils::FileStream fi(utils::FopenCheck(fname, "rb")); + this->LoadModel(fi); + fi.Close(); + } + /*! 
+ * \brief load model from stream + * \param fi input stream + */ inline void LoadModel(utils::IStream &fi){ base_gbm.LoadModel(fi); utils::Assert(fi.Read(&mparam, sizeof(ModelParam)) != 0); @@ -138,10 +152,18 @@ namespace xgboost{ fo.Write(&mparam, sizeof(ModelParam)); } /*! - * \brief update the model for one iteration - * \param iteration iteration number + * \brief save model into file + * \param fname file name */ - inline void UpdateOneIter(int iter, const DMatrix &train){ + inline void SaveModel(const char *fname) const{ + utils::FileStream fo(utils::FopenCheck(fname, "wb")); + this->SaveModel(fo); + fo.Close(); + } + /*! + * \brief update the model for one iteration + */ + inline void UpdateOneIter(const DMatrix &train){ this->PredictRaw(preds_, train); obj_->GetGradient(preds_, train.info, base_gbm.NumBoosters(), grad_, hess_); // do boost @@ -295,7 +317,7 @@ namespace xgboost{ } return -1; } - private: + protected: int silent; EvalSet evaluator_; booster::GBMBase base_gbm; @@ -305,7 +327,7 @@ namespace xgboost{ // name of objective function std::string name_obj_; std::vector< std::pair > cfg_; - private: + protected: std::vector grad_, hess_, preds_; }; } diff --git a/regrank/xgboost_regrank_data.h b/regrank/xgboost_regrank_data.h index 458ad60f0..0c1b7ff69 100644 --- a/regrank/xgboost_regrank_data.h +++ b/regrank/xgboost_regrank_data.h @@ -166,7 +166,11 @@ namespace xgboost{ inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){ int len = strlen(fname); if (len > 8 && !strcmp(fname + len - 7, ".buffer")){ - this->LoadBinary(fname, silent); return; + if( !this->LoadBinary(fname, silent) ){ + fprintf(stderr,"can not open file \"%s\"", fname); + utils::Error("DMatrix::CacheLoad failed"); + } + return; } char bname[1024]; sprintf(bname, "%s.buffer", fname); diff --git a/regrank/xgboost_regrank_main.cpp b/regrank/xgboost_regrank_main.cpp index 165ff2636..be7bbbb35 100644 --- a/regrank/xgboost_regrank_main.cpp +++ 
b/regrank/xgboost_regrank_main.cpp @@ -163,7 +163,7 @@ namespace xgboost{ for (int i = 0; i < num_round; ++i){ elapsed = (unsigned long)(time(NULL) - start); if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed); - learner.UpdateOneIter(i, data); + learner.UpdateOneIter(data); learner.EvalOneIter(i, devalall, eval_data_names); if (save_period != 0 && (i + 1) % save_period == 0){ this->SaveModel(i); diff --git a/utils/xgboost_fmap.h b/utils/xgboost_fmap.h index fcd9d7756..e549c4d7f 100644 --- a/utils/xgboost_fmap.h +++ b/utils/xgboost_fmap.h @@ -31,7 +31,7 @@ namespace xgboost{ /*! \brief load feature map from text format */ inline void LoadText(FILE *fi){ int fid; - char fname[256], ftype[256]; + char fname[1256], ftype[1256]; while (fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3){ utils::Assert(fid == (int)names_.size(), "invalid fmap format"); names_.push_back(std::string(fname)); diff --git a/utils/xgboost_utils.h b/utils/xgboost_utils.h index 7c0e53f2e..e7746a881 100644 --- a/utils/xgboost_utils.h +++ b/utils/xgboost_utils.h @@ -38,6 +38,7 @@ namespace xgboost{ namespace utils{ inline void Error(const char *msg){ fprintf(stderr, "Error:%s\n", msg); + fflush(stderr); exit(-1); } @@ -57,7 +58,8 @@ namespace xgboost{ inline FILE *FopenCheck(const char *fname, const char *flag){ FILE *fp = fopen64(fname, flag); if (fp == NULL){ - fprintf(stderr, "can not open file \"%s\"\n", fname); + fprintf(stderr, "can not open file \"%s\" \n", fname); + fflush(stderr); exit(-1); } return fp;