python module pass basic test

2014-08-17 18:43:25 -07:00 · 2014-08-17 18:43:25 -07:00 · 301685e0a4
commit 301685e0a4
parent af100dd869
7 changed files with 170 additions and 121 deletions
--- a/python/example/demo.py
+++ b/python/example/demo.py
@ -17,36 +17,17 @@ param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logisti
 # specify validations set to watch performance
 evallist  = [(dtest,'eval'), (dtrain,'train')]
 num_round = 2
-bst = xgb.train( param, dtrain, num_round, evallist )
+bst = xgb.train(param, dtrain, num_round, evallist)
 # this is prediction
-preds = bst.predict( dtest )
+preds = bst.predict(dtest)
 labels = dtest.get_label()
 print ('error=%f' % (  sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
 bst.save_model('0001.model')
 # dump model
 bst.dump_model('dump.raw.txt')
 # dump model with feature map
-bst.dump_model('dump.raw.txt','featmap.txt')
+bst.dump_model('dump.nice.txt','featmap.txt')
 ###
 # build dmatrix in python iteratively
 #
 print ('start running example of build DMatrix in python')
 dtrain = xgb.DMatrix()
 labels = []
 for l in open('agaricus.txt.train'):
    arr = l.split()
    labels.append( int(arr[0]))
    feats = []
    for it in arr[1:]:
        k,v = it.split(':')
        feats.append( (int(k), float(v)) )
    dtrain.add_row( feats )
 dtrain.set_label( labels )
 evallist  = [(dtest,'eval'), (dtrain,'train')]
 bst = xgb.train( param, dtrain, num_round, evallist )
 ###
 # build dmatrix from scipy.sparse
@ -61,7 +42,6 @@ for l in open('agaricus.txt.train'):
        k,v = it.split(':')
        row.append(i); col.append(int(k)); dat.append(float(v))
    i += 1
 csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
 dtrain = xgb.DMatrix( csr )
 dtrain.set_label(labels)
@ -71,7 +51,7 @@ bst = xgb.train( param, dtrain, num_round, evallist )
 print ('start running example of build DMatrix from numpy array')
 # NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation,then convert to DMatrix
 npymat = csr.todense()
-dtrain = xgb.DMatrix( npymat )
+dtrain = xgb.DMatrix( npymat)
 dtrain.set_label(labels)
 evallist  = [(dtest,'eval'), (dtrain,'train')]
 bst = xgb.train( param, dtrain, num_round, evallist )
@ -81,16 +61,25 @@ bst = xgb.train( param, dtrain, num_round, evallist )
 # 
 print ('start running example to used cutomized objective function')
-# note: set objective= binary:logistic means the prediction will get logistic transformed
+# note: for customized objective function, we leave objective as default
-#       in most case, we may want to leave it as default
+# note: what we are getting is margin value in prediction
-param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
+# you must know what you are doing
 param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1 }
 # user define objective function, given prediction, return gradient and second order gradient
-def logregobj( preds, dtrain ):
+# this is loglikelihood loss
 def logregobj(preds, dtrain):
    """Customized objective: log-likelihood loss.

    preds is squashed through the sigmoid here before it is compared with
    the labels read from dtrain. Returns the pair (grad, hess): the
    gradient and the second order gradient.
    """
    y = dtrain.get_label()
    prob = 1.0 / (1.0 + np.exp(-preds))
    return prob - y, prob * (1.0 - prob)
-# training with customized objective, we can also do step by step training, simply look at xgboost.py's implementation of train
+# user defined evaluation function, return a pair metric_name, result
-bst = xgb.train( param, dtrain, num_round, evallist, logregobj )
+def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
 # training with customized objective, we can also do step by step training
 # simply look at xgboost.py's implementation of train
 bst = xgb.train(param, dtrain, num_round, evallist, logregobj, evalerror)
--- a/python/xgboost.py
+++ b/python/xgboost.py
@ -4,6 +4,7 @@ import ctypes
 import os
 # optionally have scipy sparse, though not necessary
 import numpy
 import sys
 import numpy.ctypeslib 
 import scipy.sparse as scp
@ -13,32 +14,38 @@ XGBOOST_PATH = os.path.dirname(__file__)+'/libxgboostwrapper.so'
 # load in xgboost library
 xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH)
-xglib.XGDMatrixCreate.restype = ctypes.c_void_p
+xglib.XGDMatrixCreateFromFile.restype = ctypes.c_void_p
 xglib.XGDMatrixCreateFromCSR.restype = ctypes.c_void_p
 xglib.XGDMatrixCreateFromMat.restype = ctypes.c_void_p
 xglib.XGDMatrixSliceDMatrix.restype = ctypes.c_void_p
 xglib.XGDMatrixGetLabel.restype = ctypes.POINTER(ctypes.c_float)
 xglib.XGDMatrixGetWeight.restype = ctypes.POINTER(ctypes.c_float)
 xglib.XGDMatrixNumRow.restype = ctypes.c_ulong
 xglib.XGDMatrixGetLabel.restype =  ctypes.POINTER( ctypes.c_float )
 xglib.XGDMatrixGetWeight.restype =  ctypes.POINTER( ctypes.c_float )
 xglib.XGDMatrixGetRow.restype = ctypes.POINTER( REntry )
 xglib.XGBoosterCreate.restype = ctypes.c_void_p
 xglib.XGBoosterPredict.restype = ctypes.POINTER( ctypes.c_float ) 
-def ctypes2numpy( cptr, length ):
+xglib.XGBoosterCreate.restype = ctypes.c_void_p
 xglib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float)
 xglib.XGBoosterEvalOneIter.restype = ctypes.c_char_p
 xglib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p)
 def ctypes2numpy(cptr, length):
    # convert a ctypes pointer array to numpy
-    assert isinstance( cptr, ctypes.POINTER( ctypes.c_float ) )
+    assert isinstance(cptr, ctypes.POINTER(ctypes.c_float))
-    res = numpy.zeros( length, dtype='float32' )
+    res = numpy.zeros(length, dtype='float32')
-    assert ctypes.memmove( res.ctypes.data, cptr, length * res.strides[0] )
+    assert ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0])
    return res
 # data matrix used in xgboost
 class DMatrix:
    # constructor
-    def __init__(self, data=None, label=None, missing=0.0, weight = None):
+    def __init__(self, data, label=None, missing=0.0, weight = None):
        # force into void_p, mac need to pass things in as void_p
        self.handle = ctypes.c_void_p( xglib.XGDMatrixCreate() )
        if data == None:
            self.handle = None
            return
-        if isinstance(data,str):
+        if isinstance(data, str):
-            xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(data.encode('utf-8')), 1)             
+            self.handle = xglib.XGDMatrixCreateFromFile(ctypes.c_char_p(data.encode('utf-8')), 1)
-        elif isinstance(data,scp.csr_matrix):
+        elif isinstance(data, scp.csr_matrix):
            self.__init_from_csr(data)            
        elif isinstance(data, numpy.ndarray) and len(data.shape) == 2:
            self.__init_from_npy2d(data, missing)
@ -52,77 +59,68 @@ class DMatrix:
            self.set_label(label)
        if weight !=None:
            self.set_weight(weight)
    # convert data from csr matrix
-    def __init_from_csr(self,csr):
+    def __init_from_csr(self, csr):
        assert len(csr.indices) == len(csr.data)
-        xglib.XGDMatrixParseCSR( self.handle, 
+        self.handle = xglib.XGDMatrixCreateFromCSR(
-                                 ( ctypes.c_ulong  * len(csr.indptr) )(*csr.indptr),
+            (ctypes.c_ulong  * len(csr.indptr))(*csr.indptr),
-                                 ( ctypes.c_uint  * len(csr.indices) )(*csr.indices),
+            (ctypes.c_uint  * len(csr.indices))(*csr.indices),
-                                 ( ctypes.c_float * len(csr.data) )(*csr.data),
+            (ctypes.c_float * len(csr.data))(*csr.data),
-                                 len(csr.indptr), len(csr.data) )
+            len(csr.indptr), len(csr.data))
    # convert data from numpy matrix
    def __init_from_npy2d(self,mat,missing):
-        data = numpy.array( mat.reshape(mat.size), dtype='float32' )
+        data = numpy.array(mat.reshape(mat.size), dtype='float32')
-        xglib.XGDMatrixParseMat( self.handle, 
+        self.handle = xglib.XGDMatrixCreateFromMat(
-                                 data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), 
+            data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
-                                 mat.shape[0], mat.shape[1], ctypes.c_float(missing) )
+            mat.shape[0], mat.shape[1], ctypes.c_float(missing))
    # destructor
    def __del__(self):
        """ hand self.handle to XGDMatrixFree when the object is destroyed """
        handle = self.handle
        xglib.XGDMatrixFree(handle)
    # load data from file 
    def load(self, fname, silent=True):
        """ load data from file fname into this matrix; silent is passed on as an int flag """
        path = ctypes.c_char_p(fname.encode('utf-8'))
        quiet = int(silent)
        xglib.XGDMatrixLoad(self.handle, path, quiet)
    # save data into a binary file
    def save_binary(self, fname, silent=True):
        """ save this matrix to file fname via XGDMatrixSaveBinary; silent is passed on as an int flag """
        path = ctypes.c_char_p(fname.encode('utf-8'))
        quiet = int(silent)
        xglib.XGDMatrixSaveBinary(self.handle, path, quiet)
    # set label of dmatrix
    def set_label(self, label):
-        xglib.XGDMatrixSetLabel(self.handle, (ctypes.c_float*len(label))(*label), len(label) )
+        xglib.XGDMatrixSetLabel(self.handle, (ctypes.c_float*len(label))(*label), len(label))
    # set group size of dmatrix, used for rank
    def set_group(self, group):
-        xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group) )
+        xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group))
    # set weight of each instances
    def set_weight(self, weight):
-        xglib.XGDMatrixSetWeight(self.handle, (ctypes.c_float*len(weight))(*weight), len(weight) )
+        xglib.XGDMatrixSetWeight(self.handle, (ctypes.c_float*len(weight))(*weight), len(weight))
    # get label from dmatrix
    def get_label(self):
        length = ctypes.c_ulong()
        labels = xglib.XGDMatrixGetLabel(self.handle, ctypes.byref(length))
-        return ctypes2numpy( labels, length.value );
+        return ctypes2numpy(labels, length.value)
    # get weight from dmatrix
    def get_weight(self):
        length = ctypes.c_ulong()
        weights = xglib.XGDMatrixGetWeight(self.handle, ctypes.byref(length))
-        return ctypes2numpy( weights, length.value );
+        return ctypes2numpy(weights, length.value)
    # clear everything
    def clear(self):
        """ clear everything: pass self.handle to XGDMatrixClear """
        handle = self.handle
        xglib.XGDMatrixClear(handle)
    def num_row(self):
        """ return the value reported by XGDMatrixNumRow for this matrix """
        nrow = xglib.XGDMatrixNumRow(self.handle)
        return nrow
-    # append a row to DMatrix
+    # slice the DMatrix to return a new DMatrix that only contains rindex
-    def add_row(self, row):
+    def slice(self, rindex):
-        xglib.XGDMatrixAddRow(self.handle, (REntry*len(row))(*row), len(row) )
+        res = DMatrix(None)
-    # get n-throw from DMatrix
+        res.handle = xglib.XGDMatrixSliceDMatrix(
-    def __getitem__(self, ridx):
+            self.handle, (ctypes.c_int*len(rindex))(*rindex), len(rindex))
-        length = ctypes.c_ulong()
+        return res
        row = xglib.XGDMatrixGetRow(self.handle, ridx, ctypes.byref(length) );
        return [ (int(row[i].findex),row[i].fvalue) for i in range(length.value) ]
 class Booster:
    """learner class """
    def __init__(self, params={}, cache=[]):
        """ constructor, param: """    
        for d in cache:
-            assert isinstance(d,DMatrix)
+            assert isinstance(d, DMatrix)
-        dmats = ( ctypes.c_void_p  * len(cache) )(*[ d.handle for d in cache])
+        dmats = (ctypes.c_void_p  * len(cache))(*[ d.handle for d in cache])
-        self.handle = ctypes.c_void_p( xglib.XGBoosterCreate( dmats, len(cache) ) )
+        self.handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, len(cache)))
-        self.set_param( {'seed':0} )
+        self.set_param({'seed':0})
-        self.set_param( params )
+        self.set_param(params)
    def __del__(self):
        """ hand self.handle to XGBoosterFree when the object is destroyed """
        handle = self.handle
        xglib.XGBoosterFree(handle)
    def set_param(self, params, pv=None):
-        if isinstance(params,dict):
+        if isinstance(params, dict):
            for k, v in params.items():
                xglib.XGBoosterSetParam(
                    self.handle, ctypes.c_char_p(k.encode('utf-8')), 
@ -130,72 +128,112 @@ class Booster:
        elif isinstance(params,str) and pv != None:
            xglib.XGBoosterSetParam(
                self.handle, ctypes.c_char_p(params.encode('utf-8')),
-                ctypes.c_char_p(str(pv).encode('utf-8')) )
+                ctypes.c_char_p(str(pv).encode('utf-8')))
        else:
            for k, v in params:
                xglib.XGBoosterSetParam(
                    self.handle, ctypes.c_char_p(k.encode('utf-8')),
-                    ctypes.c_char_p(str(v).encode('utf-8')) )             
+                    ctypes.c_char_p(str(v).encode('utf-8')))
-    def update(self, dtrain):
+    def update(self, dtrain, it):
        """ update """
        assert isinstance(dtrain, DMatrix)
-        xglib.XGBoosterUpdateOneIter( self.handle, dtrain.handle )
+        xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle)
-    def boost(self, dtrain, grad, hess, bst_group = -1):
+    def boost(self, dtrain, grad, hess):
        """ update """
        assert len(grad) == len(hess)
        assert isinstance(dtrain, DMatrix)
-        xglib.XGBoosterBoostOneIter( self.handle, dtrain.handle,
+        xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle,
-                                     (ctypes.c_float*len(grad))(*grad),
+                                    (ctypes.c_float*len(grad))(*grad),
-                                     (ctypes.c_float*len(hess))(*hess),
+                                    (ctypes.c_float*len(hess))(*hess),
-                                     len(grad), bst_group )
+                                    len(grad))
    def update_interact(self, dtrain, action, booster_index=None):
        """ beta: update with specified action

        dtrain must be a DMatrix. When booster_index is given it is first
        stored through set_param under 'interact:booster_index'. action is
        converted with str() and passed to XGBoosterUpdateInteract.
        """
        assert isinstance(dtrain, DMatrix)
        if booster_index is not None:
            self.set_param('interact:booster_index', str(booster_index))
        # encode the same way as every other string handed to the library;
        # c_char_p does not accept an un-encoded text string on Python 3
        action_arg = ctypes.c_char_p(str(action).encode('utf-8'))
        xglib.XGBoosterUpdateInteract(self.handle, dtrain.handle, action_arg)
    def eval_set(self, evals, it = 0):
        for d in evals:
            assert isinstance(d[0], DMatrix)
            assert isinstance(d[1], str)
-        dmats = ( ctypes.c_void_p * len(evals) )(*[ d[0].handle for d in evals])
+        dmats = (ctypes.c_void_p * len(evals) )(*[ d[0].handle for d in evals])
-        evnames = ( ctypes.c_char_p * len(evals) )(
+        evnames = (ctypes.c_char_p * len(evals))(
-            *[ctypes.c_char_p(d[1].encode('utf-8')) for d in evals])
+            * [ctypes.c_char_p(d[1].encode('utf-8')) for d in evals])
-        xglib.XGBoosterEvalOneIter( self.handle, it, dmats, evnames, len(evals) )
+        return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals))        
-    def eval(self, mat, name = 'eval', it = 0 ):
+    def eval(self, mat, name = 'eval', it = 0):
-        self.eval_set( [(mat,name)], it)
+        return self.eval_set( [(mat,name)], it)
-    def predict(self, data, bst_group = -1):
+    def predict(self, data):
        length = ctypes.c_ulong()
-        preds = xglib.XGBoosterPredict( self.handle, data.handle, ctypes.byref(length), bst_group)
+        preds = xglib.XGBoosterPredict(self.handle, data.handle, ctypes.byref(length))
-        return ctypes2numpy( preds, length.value )
+        return ctypes2numpy(preds, length.value)
    def save_model(self, fname):
        """ save model to file fname """
        path = ctypes.c_char_p(fname.encode('utf-8'))
        xglib.XGBoosterSaveModel(self.handle, path)
    def load_model(self, fname):
        """ load model from file fname """
        path = ctypes.c_char_p(fname.encode('utf-8'))
        xglib.XGBoosterLoadModel(self.handle, path)
-    def dump_model(self, fname, fmap=''):
+    def dump_model(self, fo, fmap=''):
        """dump model into text file"""
-        xglib.XGBoosterDumpModel(
+        if isinstance(fo,str):            
-            self.handle, ctypes.c_char_p(fname.encode('utf-8')), 
+            fo = open(fo,'w')
-            ctypes.c_char_p(fmap.encode('utf-8')))
+            need_close = True
        else:
            need_close = False
        ret = self.get_dump(fmap)
        for i in range(len(ret)):
            fo.write('booster[%d]:\n' %i)
            fo.write( ret[i] )
        if need_close:
            fo.close()
    def get_dump(self, fmap=''):
        """ get dump of model as list of strings

        fmap is encoded as utf-8 and passed to XGBoosterDumpModel; the
        library reports how many entries it returned through an out
        parameter, and each entry is converted with str().
        """
        count = ctypes.c_ulong()
        fmap_arg = ctypes.c_char_p(fmap.encode('utf-8'))
        dumps = xglib.XGBoosterDumpModel(self.handle, fmap_arg, ctypes.byref(count))
        return [str(dumps[idx]) for idx in range(count.value)]
    def get_fscore(self, fmap=''):
        """ get feature importance of each feature

        Counts, over every tree returned by get_dump(fmap), how many dump
        lines contain a bracketed split condition for each feature.
        Returns a dict mapping the feature id (the text between '[' and
        the first '<' or ']') to that count. Lines without '[' are skipped.
        """
        fscore = {}
        for tree in self.get_dump(fmap):
            for line in tree.split('\n'):
                parts = line.split('[')
                if len(parts) == 1:
                    # no bracketed condition on this line, nothing to count
                    continue
                fid = parts[1].split(']')[0].split('<')[0]
                fscore[fid] = fscore.get(fid, 0) + 1
        return fscore
-def train(params, dtrain, num_boost_round = 10, evals = [], obj=None):
+def evaluate(bst, evals, it, feval = None):
    """evaluation on eval set"""
    if feval != None:
        res = '[%d]' % it
        for dm, evname in evals:
            name, val = feval(bst.predict(dm), dm)
            res += '\t%s-%s:%f' % (evname, name, val)
    else:
        res = bst.eval_set(evals, it)
    return res
 def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None):
    """ train a booster with given paramaters """
    bst = Booster(params, [dtrain]+[ d[0] for d in evals ] )
    if obj == None:
        for i in range(num_boost_round):
-            bst.update( dtrain )
+            bst.update( dtrain, i )
            if len(evals) != 0:
-                bst.eval_set( evals, i )
+                sys.stderr.write(evaluate(bst, evals, i, feval)+'\n')
    else:
        if len(evals) != 0 and feval == None:
            print 'you need to provide your own evaluation function'
        # try customized objective function
        for i in range(num_boost_round):
            pred = bst.predict( dtrain )
            grad, hess = obj( pred, dtrain )
            bst.boost( dtrain, grad, hess )
            if len(evals) != 0:
-                bst.eval_set( evals, i )        
+                sys.stderr.write(evaluate(bst, evals, i, feval)+'\n')
    return bst
--- a/python/xgboost_wrapper.cpp
+++ b/python/xgboost_wrapper.cpp
@ -20,9 +20,11 @@ class Booster: public learner::BoostLearner<FMatrixS> {
 public:
  explicit Booster(const std::vector<DataMatrix*>& mats) {
    this->silent = 1;
    this->init_model = false;
    this->SetCacheData(mats);
  }
  const float *Pred(const DataMatrix &dmat, size_t *len) {
    this->CheckInitModel();
    this->Predict(dmat, &this->preds_);
    *len = this->preds_.size();
    return &this->preds_[0];
@ -37,6 +39,15 @@ class Booster: public learner::BoostLearner<FMatrixS> {
    }
    gbm_->DoBoost(gpair_, train.fmat, train.info.root_index);
  }
  inline void CheckInitModel(void) {
    // call InitModel() only the first time; init_model records that it was done
    if (init_model) return;
    this->InitModel();
    init_model = true;
  }
  inline void LoadModel(const char *fname) {
    learner::BoostLearner<FMatrixS>::LoadModel(fname);
    this->init_model = true;
  }
  inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, size_t *len) {
    model_dump = this->DumpModel(fmap, with_stats);
    model_dump_cptr.resize(model_dump.size()); 
@ -52,6 +63,9 @@ class Booster: public learner::BoostLearner<FMatrixS> {
  // temporal space to save model dump
  std::vector<std::string> model_dump;
  std::vector<const char*> model_dump_cptr;
 private:
  bool init_model;
 };
 }  // namespace wrapper
 }  // namespace xgboost
@ -199,6 +213,7 @@ extern "C"{
  void XGBoosterUpdateOneIter(void *handle, int iter, void *dtrain) {
    Booster *bst = static_cast<Booster*>(handle);
    DataMatrix *dtr = static_cast<DataMatrix*>(dtrain);
    bst->CheckInitModel();
    bst->CheckInit(dtr);
    bst->UpdateOneIter(iter, *dtr);
  }
@ -206,6 +221,7 @@ extern "C"{
                             float *grad, float *hess, size_t len) {
    Booster *bst = static_cast<Booster*>(handle);
    DataMatrix *dtr = static_cast<DataMatrix*>(dtrain);
    bst->CheckInitModel();
    bst->CheckInit(dtr);
    bst->BoostOneIter(*dtr, grad, hess, len);
  }
@ -217,6 +233,7 @@ extern "C"{
      mats.push_back(static_cast<DataMatrix*>(dmats[i]));
      names.push_back(std::string(evnames[i]));
    }
    bst->CheckInitModel();
    bst->eval_str = bst->EvalOneIter(iter, mats, names);
    return bst->eval_str.c_str();
  }
--- a/src/data.h
+++ b/src/data.h
@ -242,7 +242,7 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
   * \brief save column access data into stream
   * \param fo output stream to save to
   */
-  inline void SaveColAccess(utils::IStream &fo) {
+  inline void SaveColAccess(utils::IStream &fo) const {
    fo.Write(&num_buffered_row_, sizeof(num_buffered_row_));
    if (num_buffered_row_ != 0) {
      SaveBinary(fo, col_ptr_, col_data_);
--- a/src/io/io.cpp
+++ b/src/io/io.cpp
@ -15,7 +15,12 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
 }
 void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
-  utils::Error("not implemented");
+  if (dmat.magic == DMatrixSimple::kMagic){
    const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat);
    p_dmat->SaveBinary(fname, silent);
  } else {
    utils::Error("not implemented");
  }
 }
 }  // namespace io
--- a/src/io/simple_dmatrix-inl.hpp
+++ b/src/io/simple_dmatrix-inl.hpp
@ -148,7 +148,7 @@ class DMatrixSimple : public DataMatrix {
   * \param fname name of binary data
   * \param silent whether print information or not
   */
-  inline void SaveBinary(const char* fname, bool silent = false) {
+  inline void SaveBinary(const char* fname, bool silent = false) const {
    utils::FileStream fs(utils::FopenCheck(fname, "wb"));
    int magic = kMagic;
    fs.Write(&magic, sizeof(magic));
--- a/src/learner/dmatrix.h
+++ b/src/learner/dmatrix.h
@ -58,7 +58,7 @@ struct MetaInfo {
      return 0;
    }
  }
-  inline void SaveBinary(utils::IStream &fo) {
+  inline void SaveBinary(utils::IStream &fo) const {
    fo.Write(&num_row, sizeof(num_row));
    fo.Write(&num_col, sizeof(num_col));
    fo.Write(labels);