From 301685e0a431e11fdf05286c95d7394733c651c3 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 17 Aug 2014 18:43:25 -0700 Subject: [PATCH] python module pass basic test --- python/example/demo.py | 49 +++----- python/xgboost.py | 212 ++++++++++++++++++++-------------- python/xgboost_wrapper.cpp | 17 +++ src/data.h | 2 +- src/io/io.cpp | 7 +- src/io/simple_dmatrix-inl.hpp | 2 +- src/learner/dmatrix.h | 2 +- 7 files changed, 170 insertions(+), 121 deletions(-) diff --git a/python/example/demo.py b/python/example/demo.py index 389f139ff..e14c806aa 100755 --- a/python/example/demo.py +++ b/python/example/demo.py @@ -17,36 +17,17 @@ param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logisti # specify validations set to watch performance evallist = [(dtest,'eval'), (dtrain,'train')] num_round = 2 -bst = xgb.train( param, dtrain, num_round, evallist ) +bst = xgb.train(param, dtrain, num_round, evallist) # this is prediction -preds = bst.predict( dtest ) +preds = bst.predict(dtest) labels = dtest.get_label() print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds)))) bst.save_model('0001.model') # dump model bst.dump_model('dump.raw.txt') # dump model with feature map -bst.dump_model('dump.raw.txt','featmap.txt') - -### -# build dmatrix in python iteratively -# -print ('start running example of build DMatrix in python') -dtrain = xgb.DMatrix() -labels = [] -for l in open('agaricus.txt.train'): - arr = l.split() - labels.append( int(arr[0])) - feats = [] - for it in arr[1:]: - k,v = it.split(':') - feats.append( (int(k), float(v)) ) - dtrain.add_row( feats ) -dtrain.set_label( labels ) -evallist = [(dtest,'eval'), (dtrain,'train')] - -bst = xgb.train( param, dtrain, num_round, evallist ) +bst.dump_model('dump.nice.txt','featmap.txt') ### # build dmatrix from scipy.sparse @@ -61,7 +42,6 @@ for l in open('agaricus.txt.train'): k,v = it.split(':') row.append(i); col.append(int(k)); dat.append(float(v)) i += 1 - csr = scipy.sparse.csr_matrix( (dat, (row,col)) ) dtrain = xgb.DMatrix( csr ) dtrain.set_label(labels) @@ -71,7 +51,7 @@ bst = xgb.train( param, dtrain, num_round, evallist ) print ('start running example of build DMatrix from numpy array') # NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation,then convert to DMatrix npymat = csr.todense() -dtrain = xgb.DMatrix( npymat ) +dtrain = xgb.DMatrix( npymat) dtrain.set_label(labels) evallist = [(dtest,'eval'), (dtrain,'train')] bst = xgb.train( param, dtrain, num_round, evallist ) @@ -81,16 +61,25 @@ bst = xgb.train( param, dtrain, num_round, evallist ) # print ('start running example to used cutomized objective function') -# note: set objective= binary:logistic means the prediction will get logistic transformed -# in most case, we may want to leave it as default -param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' } +# note: for customized objective function, we leave objective as default +# note: what we are getting is margin value in prediction +# you must know what you are doing +param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1 } # user define objective function, given prediction, return gradient and second order gradient -def logregobj( preds, dtrain ): +# this is loglikelihood loss +def logregobj(preds, dtrain): labels = dtrain.get_label() + preds = 1.0 / (1.0 + np.exp(-preds)) grad = preds - labels hess = preds * (1.0-preds) return grad, hess -# training with customized objective, we can also do step by step training, simply look at xgboost.py's implementation of train -bst = xgb.train( param, dtrain, num_round, evallist, logregobj ) +# user defined evaluation function, return a pair metric_name, result +def evalerror(preds, dtrain): + labels = dtrain.get_label() + return 'error', float(sum(labels != (preds > 0.0))) / len(labels) + +# training with customized objective, we can also do step by step training +# simply look at xgboost.py's implementation of train +bst = xgb.train(param, dtrain, num_round, evallist, logregobj, evalerror) diff --git a/python/xgboost.py b/python/xgboost.py index 2e8deefa8..c7a04d4c3 100644 --- a/python/xgboost.py +++ b/python/xgboost.py @@ -4,6 +4,7 @@ import ctypes import os # optinally have scipy sparse, though not necessary import numpy +import sys import numpy.ctypeslib import scipy.sparse as scp @@ -13,33 +14,39 @@ XGBOOST_PATH = os.path.dirname(__file__)+'/libxgboostwrapper.so' # load in xgboost library xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH) -xglib.XGDMatrixCreate.restype = ctypes.c_void_p +xglib.XGDMatrixCreateFromFile.restype = ctypes.c_void_p +xglib.XGDMatrixCreateFromCSR.restype = ctypes.c_void_p +xglib.XGDMatrixCreateFromMat.restype = ctypes.c_void_p +xglib.XGDMatrixSliceDMatrix.restype = ctypes.c_void_p +xglib.XGDMatrixGetLabel.restype = ctypes.POINTER(ctypes.c_float) +xglib.XGDMatrixGetWeight.restype = ctypes.POINTER(ctypes.c_float) xglib.XGDMatrixNumRow.restype = ctypes.c_ulong -xglib.XGDMatrixGetLabel.restype = ctypes.POINTER( ctypes.c_float ) -xglib.XGDMatrixGetWeight.restype = ctypes.POINTER( ctypes.c_float ) -xglib.XGDMatrixGetRow.restype = ctypes.POINTER( REntry ) -xglib.XGBoosterCreate.restype = ctypes.c_void_p -xglib.XGBoosterPredict.restype = ctypes.POINTER( ctypes.c_float ) -def ctypes2numpy( cptr, length ): +xglib.XGBoosterCreate.restype = ctypes.c_void_p +xglib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float) +xglib.XGBoosterEvalOneIter.restype = ctypes.c_char_p +xglib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p) + + +def ctypes2numpy(cptr, length): # convert a ctypes pointer array to numpy - assert isinstance( cptr, ctypes.POINTER( ctypes.c_float ) ) - res = numpy.zeros( length, dtype='float32' ) - assert ctypes.memmove( res.ctypes.data, cptr, length * res.strides[0] ) + assert isinstance(cptr, ctypes.POINTER(ctypes.c_float)) + res = numpy.zeros(length, dtype='float32') + assert ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]) return res # data matrix used in xgboost class DMatrix: # constructor - def __init__(self, data=None, label=None, missing=0.0, weight = None): + def __init__(self, data, label=None, missing=0.0, weight = None): # force into void_p, mac need to pass things in as void_p - self.handle = ctypes.c_void_p( xglib.XGDMatrixCreate() ) if data == None: + self.handle = None return - if isinstance(data,str): - xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(data.encode('utf-8')), 1) - elif isinstance(data,scp.csr_matrix): - self.__init_from_csr(data) + if isinstance(data, str): + self.handle = xglib.XGDMatrixCreateFromFile(ctypes.c_char_p(data.encode('utf-8')), 1) + elif isinstance(data, scp.csr_matrix): + self.__init_from_csr(data) elif isinstance(data, numpy.ndarray) and len(data.shape) == 2: self.__init_from_npy2d(data, missing) else: @@ -52,77 +59,68 @@ class DMatrix: self.set_label(label) if weight !=None: self.set_weight(weight) - # convert data from csr matrix - def __init_from_csr(self,csr): + def __init_from_csr(self, csr): assert len(csr.indices) == len(csr.data) - xglib.XGDMatrixParseCSR( self.handle, - ( ctypes.c_ulong * len(csr.indptr) )(*csr.indptr), - ( ctypes.c_uint * len(csr.indices) )(*csr.indices), - ( ctypes.c_float * len(csr.data) )(*csr.data), - len(csr.indptr), len(csr.data) ) + self.handle = xglib.XGDMatrixCreateFromCSR( + (ctypes.c_ulong * len(csr.indptr))(*csr.indptr), + (ctypes.c_uint * len(csr.indices))(*csr.indices), + (ctypes.c_float * len(csr.data))(*csr.data), + len(csr.indptr), len(csr.data)) # convert data from numpy matrix def __init_from_npy2d(self,mat,missing): - data = numpy.array( mat.reshape(mat.size), dtype='float32' ) - xglib.XGDMatrixParseMat( self.handle, - data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), - mat.shape[0], mat.shape[1], ctypes.c_float(missing) ) + data = numpy.array(mat.reshape(mat.size), dtype='float32') + self.handle = xglib.XGDMatrixCreateFromMat( + data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), + mat.shape[0], mat.shape[1], ctypes.c_float(missing)) # destructor def __del__(self): - xglib.XGDMatrixFree(self.handle) - # load data from file - def load(self, fname, silent=True): - xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent)) + xglib.XGDMatrixFree(self.handle) # load data from file def save_binary(self, fname, silent=True): xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent)) # set label of dmatrix def set_label(self, label): - xglib.XGDMatrixSetLabel(self.handle, (ctypes.c_float*len(label))(*label), len(label) ) + xglib.XGDMatrixSetLabel(self.handle, (ctypes.c_float*len(label))(*label), len(label)) # set group size of dmatrix, used for rank def set_group(self, group): - xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group) ) + xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group)) # set weight of each instances def set_weight(self, weight): - xglib.XGDMatrixSetWeight(self.handle, (ctypes.c_float*len(weight))(*weight), len(weight) ) + xglib.XGDMatrixSetWeight(self.handle, (ctypes.c_float*len(weight))(*weight), len(weight)) # get label from dmatrix def get_label(self): length = ctypes.c_ulong() labels = xglib.XGDMatrixGetLabel(self.handle, ctypes.byref(length)) - return ctypes2numpy( labels, length.value ); + return ctypes2numpy(labels, length.value) # get weight from dmatrix def get_weight(self): length = ctypes.c_ulong() weights = xglib.XGDMatrixGetWeight(self.handle, ctypes.byref(length)) - return ctypes2numpy( weights, length.value ); - # clear everything - def clear(self): - xglib.XGDMatrixClear(self.handle) + return ctypes2numpy(weights, length.value) def num_row(self): return xglib.XGDMatrixNumRow(self.handle) - # append a row to DMatrix - def add_row(self, row): - xglib.XGDMatrixAddRow(self.handle, (REntry*len(row))(*row), len(row) ) - # get n-throw from DMatrix - def __getitem__(self, ridx): - length = ctypes.c_ulong() - row = xglib.XGDMatrixGetRow(self.handle, ridx, ctypes.byref(length) ); - return [ (int(row[i].findex),row[i].fvalue) for i in range(length.value) ] + # slice the DMatrix to return a new DMatrix that only contains rindex + def slice(self, rindex): + res = DMatrix(None) + res.handle = xglib.XGDMatrixSliceDMatrix( + self.handle, (ctypes.c_int*len(rindex))(*rindex), len(rindex)) + return res class Booster: """learner class """ def __init__(self, params={}, cache=[]): """ constructor, param: """ for d in cache: - assert isinstance(d,DMatrix) - dmats = ( ctypes.c_void_p * len(cache) )(*[ d.handle for d in cache]) - self.handle = ctypes.c_void_p( xglib.XGBoosterCreate( dmats, len(cache) ) ) - self.set_param( {'seed':0} ) - self.set_param( params ) + assert isinstance(d, DMatrix) + dmats = (ctypes.c_void_p * len(cache))(*[ d.handle for d in cache]) + self.handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, len(cache))) + self.set_param({'seed':0}) + self.set_param(params) def __del__(self): xglib.XGBoosterFree(self.handle) def set_param(self, params, pv=None): - if isinstance(params,dict): + if isinstance(params, dict): for k, v in params.items(): xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(k.encode('utf-8')), @@ -130,72 +128,112 @@ class Booster: elif isinstance(params,str) and pv != None: xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(params.encode('utf-8')), - ctypes.c_char_p(str(pv).encode('utf-8')) ) + ctypes.c_char_p(str(pv).encode('utf-8'))) else: for k, v in params: xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(k.encode('utf-8')), - ctypes.c_char_p(str(v).encode('utf-8')) ) - def update(self, dtrain): + ctypes.c_char_p(str(v).encode('utf-8'))) + def update(self, dtrain, it): """ update """ assert isinstance(dtrain, DMatrix) - xglib.XGBoosterUpdateOneIter( self.handle, dtrain.handle ) - def boost(self, dtrain, grad, hess, bst_group = -1): + xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle) + def boost(self, dtrain, grad, hess): """ update """ assert len(grad) == len(hess) assert isinstance(dtrain, DMatrix) - xglib.XGBoosterBoostOneIter( self.handle, dtrain.handle, - (ctypes.c_float*len(grad))(*grad), - (ctypes.c_float*len(hess))(*hess), - len(grad), bst_group ) - def update_interact(self, dtrain, action, booster_index=None): - """ beta: update with specified action""" - assert isinstance(dtrain, DMatrix) - if booster_index != None: - self.set_param('interact:booster_index', str(booster_index)) - xglib.XGBoosterUpdateInteract( - self.handle, dtrain.handle, ctypes.c_char_p(str(action)) ) + xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle, + (ctypes.c_float*len(grad))(*grad), + (ctypes.c_float*len(hess))(*hess), + len(grad)) def eval_set(self, evals, it = 0): for d in evals: assert isinstance(d[0], DMatrix) assert isinstance(d[1], str) - dmats = ( ctypes.c_void_p * len(evals) )(*[ d[0].handle for d in evals]) - evnames = ( ctypes.c_char_p * len(evals) )( - *[ctypes.c_char_p(d[1].encode('utf-8')) for d in evals]) - xglib.XGBoosterEvalOneIter( self.handle, it, dmats, evnames, len(evals) ) - def eval(self, mat, name = 'eval', it = 0 ): - self.eval_set( [(mat,name)], it) - def predict(self, data, bst_group = -1): + dmats = (ctypes.c_void_p * len(evals) )(*[ d[0].handle for d in evals]) + evnames = (ctypes.c_char_p * len(evals))( + * [ctypes.c_char_p(d[1].encode('utf-8')) for d in evals]) + return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals)) + def eval(self, mat, name = 'eval', it = 0): + return self.eval_set( [(mat,name)], it) + def predict(self, data): length = ctypes.c_ulong() - preds = xglib.XGBoosterPredict( self.handle, data.handle, ctypes.byref(length), bst_group) - return ctypes2numpy( preds, length.value ) + preds = xglib.XGBoosterPredict(self.handle, data.handle, ctypes.byref(length)) + return ctypes2numpy(preds, length.value) def save_model(self, fname): """ save model to file """ xglib.XGBoosterSaveModel(self.handle, ctypes.c_char_p(fname.encode('utf-8'))) def load_model(self, fname): """load model from file""" xglib.XGBoosterLoadModel( self.handle, ctypes.c_char_p(fname.encode('utf-8')) ) - def dump_model(self, fname, fmap=''): + def dump_model(self, fo, fmap=''): """dump model into text file""" - xglib.XGBoosterDumpModel( - self.handle, ctypes.c_char_p(fname.encode('utf-8')), - ctypes.c_char_p(fmap.encode('utf-8'))) + if isinstance(fo,str): + fo = open(fo,'w') + need_close = True + else: + need_close = False + ret = self.get_dump(fmap) + for i in range(len(ret)): + fo.write('booster[%d]:\n' %i) + fo.write( ret[i] ) + if need_close: + fo.close() + def get_dump(self, fmap=''): + """get dump of model as list of strings """ + length = ctypes.c_ulong() + sarr = xglib.XGBoosterDumpModel(self.handle, ctypes.c_char_p(fmap.encode('utf-8')), ctypes.byref(length)) + res = [] + for i in range(length.value): + res.append( str(sarr[i]) ) + return res + def get_fscore(self, fmap=''): + """ get feature importance of each feature """ + trees = self.get_dump(fmap) + fmap = {} + for tree in trees: + print tree + for l in tree.split('\n'): + arr = l.split('[') + if len(arr) == 1: + continue + fid = arr[1].split(']')[0] + fid = fid.split('<')[0] + if fid not in fmap: + fmap[fid] = 1 + else: + fmap[fid]+= 1 + return fmap -def train(params, dtrain, num_boost_round = 10, evals = [], obj=None): +def evaluate(bst, evals, it, feval = None): + """evaluation on eval set""" + if feval != None: + res = '[%d]' % it + for dm, evname in evals: + name, val = feval(bst.predict(dm), dm) + res += '\t%s-%s:%f' % (evname, name, val) + else: + res = bst.eval_set(evals, it) + + return res + +def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None): """ train a booster with given paramaters """ bst = Booster(params, [dtrain]+[ d[0] for d in evals ] ) if obj == None: for i in range(num_boost_round): - bst.update( dtrain ) + bst.update( dtrain, i ) if len(evals) != 0: - bst.eval_set( evals, i ) + sys.stderr.write(evaluate(bst, evals, i, feval)+'\n') else: + if len(evals) != 0 and feval == None: + print 'you need to provide your own evaluation function' + # try customized objective function for i in range(num_boost_round): pred = bst.predict( dtrain ) grad, hess = obj( pred, dtrain ) bst.boost( dtrain, grad, hess ) if len(evals) != 0: - bst.eval_set( evals, i ) + sys.stderr.write(evaluate(bst, evals, i, feval)+'\n') return bst - diff --git a/python/xgboost_wrapper.cpp b/python/xgboost_wrapper.cpp index e43095920..478d74936 100644 --- a/python/xgboost_wrapper.cpp +++ b/python/xgboost_wrapper.cpp @@ -20,9 +20,11 @@ class Booster: public learner::BoostLearner { public: explicit Booster(const std::vector& mats) { this->silent = 1; + this->init_model = false; this->SetCacheData(mats); } const float *Pred(const DataMatrix &dmat, size_t *len) { + this->CheckInitModel(); this->Predict(dmat, &this->preds_); *len = this->preds_.size(); return &this->preds_[0]; @@ -37,6 +39,15 @@ class Booster: public learner::BoostLearner { } gbm_->DoBoost(gpair_, train.fmat, train.info.root_index); } + inline void CheckInitModel(void) { + if (!init_model) { + this->InitModel(); init_model = true; + } + } + inline void LoadModel(const char *fname) { + learner::BoostLearner::LoadModel(fname); + this->init_model = true; + } inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, size_t *len) { model_dump = this->DumpModel(fmap, with_stats); model_dump_cptr.resize(model_dump.size()); @@ -52,6 +63,9 @@ class Booster: public learner::BoostLearner { // temporal space to save model dump std::vector model_dump; std::vector model_dump_cptr; + + private: + bool init_model; }; } // namespace wrapper } // namespace xgboost @@ -199,6 +213,7 @@ extern "C"{ void XGBoosterUpdateOneIter(void *handle, int iter, void *dtrain) { Booster *bst = static_cast(handle); DataMatrix *dtr = static_cast(dtrain); + bst->CheckInitModel(); bst->CheckInit(dtr); bst->UpdateOneIter(iter, *dtr); } @@ -206,6 +221,7 @@ extern "C"{ float *grad, float *hess, size_t len) { Booster *bst = static_cast(handle); DataMatrix *dtr = static_cast(dtrain); + bst->CheckInitModel(); bst->CheckInit(dtr); bst->BoostOneIter(*dtr, grad, hess, len); } @@ -217,6 +233,7 @@ extern "C"{ mats.push_back(static_cast(dmats[i])); names.push_back(std::string(evnames[i])); } + bst->CheckInitModel(); bst->eval_str = bst->EvalOneIter(iter, mats, names); return bst->eval_str.c_str(); } diff --git a/src/data.h b/src/data.h index c60b58b8a..df43551e3 100644 --- a/src/data.h +++ b/src/data.h @@ -242,7 +242,7 @@ class FMatrixS : public FMatrixInterface{ * \brief save column access data into stream * \param fo output stream to save to */ - inline void SaveColAccess(utils::IStream &fo) { + inline void SaveColAccess(utils::IStream &fo) const { fo.Write(&num_buffered_row_, sizeof(num_buffered_row_)); if (num_buffered_row_ != 0) { SaveBinary(fo, col_ptr_, col_data_); diff --git a/src/io/io.cpp b/src/io/io.cpp index 4ddf61eb0..7689a4560 100644 --- a/src/io/io.cpp +++ b/src/io/io.cpp @@ -15,7 +15,12 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) { } void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) { - utils::Error("not implemented"); + if (dmat.magic == DMatrixSimple::kMagic){ + const DMatrixSimple *p_dmat = static_cast(&dmat); + p_dmat->SaveBinary(fname, silent); + } else { + utils::Error("not implemented"); + } } } // namespace io diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp index f996b8d8c..b8b15adce 100644 --- a/src/io/simple_dmatrix-inl.hpp +++ b/src/io/simple_dmatrix-inl.hpp @@ -148,7 +148,7 @@ class DMatrixSimple : public DataMatrix { * \param fname name of binary data * \param silent whether print information or not */ - inline void SaveBinary(const char* fname, bool silent = false) { + inline void SaveBinary(const char* fname, bool silent = false) const { utils::FileStream fs(utils::FopenCheck(fname, "wb")); int magic = kMagic; fs.Write(&magic, sizeof(magic)); diff --git a/src/learner/dmatrix.h b/src/learner/dmatrix.h index b558b070b..144b1a44e 100644 --- a/src/learner/dmatrix.h +++ b/src/learner/dmatrix.h @@ -58,7 +58,7 @@ struct MetaInfo { return 0; } } - inline void SaveBinary(utils::IStream &fo) { + inline void SaveBinary(utils::IStream &fo) const { fo.Write(&num_row, sizeof(num_row)); fo.Write(&num_col, sizeof(num_col)); fo.Write(labels);