finish python lib

This commit is contained in:
tqchen 2014-05-03 22:18:25 -07:00
parent 20de7f8f97
commit adc9400736
8 changed files with 231 additions and 32 deletions

View File

@ -1,30 +1,35 @@
# module for xgboost
import ctypes
# optionally have scipy sparse, though it is not necessary
import numpy as np
import scipy.sparse as scp
# set this path to the compiled xgboost shared library
XGBOOST_PATH = './libxgboostpy.so'
# entry type of sparse matrix
class REntry(ctypes.Structure):
_fields_ = [("findex", ctypes.c_uint), ("fvalue", ctypes.c_float) ]
# load in xgboost library
xglib = ctypes.cdll.LoadLibrary('./libxgboostpy.so')
xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH)
xglib.XGDMatrixCreate.restype = ctypes.c_void_p
xglib.XGDMatrixNumRow.restype = ctypes.c_ulong
xglib.XGDMatrixGetLabel.restype = ctypes.POINTER( ctypes.c_float )
xglib.XGDMatrixGetRow.restype = ctypes.POINTER( REntry )
xglib.XGBoosterPredict.restype = ctypes.POINTER( ctypes.c_float )
# data matrix used in xgboost
class DMatrix:
# constructor
def __init__(self, data=None, label=None):
self.handle = xglib.XGDMatrixCreate();
self.handle = xglib.XGDMatrixCreate()
if data == None:
return
if type(data) is str:
return
if isinstance(data,str):
xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(data), 1)
elif type(data) is scp.csr_matrix:
elif isinstance(data,scp.csr_matrix):
self.__init_from_csr(data)
else:
try:
@ -54,7 +59,13 @@ class DMatrix:
xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname), int(silent))
# set label of dmatrix
def set_label(self, label):
xglib.XGDMatrixSetLabel(self.handle, (ctypes.c_float*len(label))(*label), len(label) );
xglib.XGDMatrixSetLabel(self.handle, (ctypes.c_float*len(label))(*label), len(label) )
# set group sizes of dmatrix, used for ranking
def set_group(self, group):
xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group) )
# set weight of each instance
def set_weight(self, weight):
xglib.XGDMatrixSetWeight(self.handle, (ctypes.c_float*len(weight))(*weight), len(weight) )
# get label from dmatrix
def get_label(self):
length = ctypes.c_ulong()
@ -66,16 +77,57 @@ class DMatrix:
def num_row(self):
return xglib.XGDMatrixNumRow(self.handle)
# append a row to DMatrix
def add_row(self, row, label):
xglib.XGDMatrixAddRow(self.handle, (REntry*len(row))(*row), len(row), label )
def add_row(self, row):
xglib.XGDMatrixAddRow(self.handle, (REntry*len(row))(*row), len(row) )
# get the n-th row from DMatrix
def __getitem__(self, ridx):
length = ctypes.c_ulong()
row = xglib.XGDMatrixGetRow(self.handle, ridx, ctypes.byref(length) );
return [ (int(row[i].findex),row[i].fvalue) for i in xrange(length.value) ]
class Booster:
"""learner class """
def __init__(self, params, cache=[]):
""" constructor, param: """
for d in cache:
assert isinstance(d,DMatrix)
dmats = ( ctypes.c_void_p * len(cache) )(*[ ctypes.c_void_p(d.handle) for d in cache])
self.handle = xglib.XGBoosterCreate( dmats, len(cache) )
for k, v in params.iteritems():
xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(k), ctypes.c_char_p(str(v)) )
def update(self, dtrain):
""" update """
assert isinstance(dtrain, DMatrix)
xglib.XGBoosterUpdateOneIter( self.handle, dtrain.handle )
def eval_set(self, evals, it = 0):
for d in evals:
assert isinstance(d[0], DMatrix)
assert isinstance(d[1], str)
dmats = ( ctypes.c_void_p * len(evals) )(*[ ctypes.c_void_p(d[0].handle) for d in evals])
evnames = ( ctypes.c_char_p * len(evals) )(*[ ctypes.c_char_p(d[1]) for d in evals])
xglib.XGBoosterEvalOneIter( self.handle, it, dmats, evnames, len(evals) )
def eval(self, mat, name = 'eval', it = 0 ):
self.eval_set( [(mat,name)], it)
def predict(self, data):
length = ctypes.c_ulong()
preds = xglib.XGBoosterPredict( self.handle, data.handle, ctypes.byref(length))
return [ preds[i] for i in xrange(length.value) ]
def save_model(self, fname):
""" save model to file """
xglib.XGBoosterSaveModel( self.handle, ctypes.c_char_p(fname) )
def load_model(self, fname):
"""load model from file"""
xglib.XGBoosterLoadModel( self.handle, ctypes.c_char_p(fname) )
def dump_model(self, fname, fmap=''):
"""dump model into text file"""
xglib.XGBoosterDumpModel( self.handle, ctypes.c_char_p(fname), ctypes.c_char_p(fmap) )
mat = DMatrix('xx.buffer')
print mat.num_row()
mat.clear()
def train(params, dtrain, num_boost_round = 10, evals = []):
""" train a booster with given paramaters """
bst = Booster(params, [dtrain] )
for i in xrange(num_boost_round):
bst.update( dtrain )
if len(evals) != 0:
bst.eval_set( evals, i )
return bst
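A minimal usage sketch of the wrapper above; the module name, file names, and parameter values are assumptions for illustration, not part of this commit.
# hedged sketch: module name, file names and parameter values are placeholders
import xgboost
dtrain = xgboost.DMatrix('train.svm')      # loads text data, may also write a .buffer cache
dtest = xgboost.DMatrix('test.svm')
param = {'bst:eta': 0.3, 'bst:max_depth': 3, 'silent': 1}
bst = xgboost.train(param, dtrain, num_boost_round=10, evals=[(dtrain, 'train'), (dtest, 'test')])
preds = bst.predict(dtest)
bst.save_model('model.bin')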

View File

@ -56,20 +56,63 @@ namespace xgboost{
this->info.labels.resize( len );
memcpy( &(this->info).labels[0], label, sizeof(float)*len );
}
inline void SetGroup( const unsigned *group, size_t len ){
this->info.group_ptr.resize( len + 1 );
this->info.group_ptr[0] = 0;
for( size_t i = 0; i < len; ++ i ){
this->info.group_ptr[i+1] = this->info.group_ptr[i]+group[i];
}
}
inline void SetWeight( const float *weight, size_t len ){
this->info.weights.resize( len );
memcpy( &(this->info).weights[0], weight, sizeof(float)*len );
}
inline const float* GetLabel( size_t* len ) const{
*len = this->info.labels.size();
return &(this->info.labels[0]);
}
inline void InitTrain(void){
if(!this->data.HaveColAccess()) this->data.InitData();
inline void CheckInit(void){
if(!this->data.HaveColAccess()){
this->data.InitData();
}
utils::Assert( this->data.NumRow() == this->info.labels.size(), "DMatrix: number of labels must match number of rows in matrix");
}
};
class Booster: public xgboost::regrank::RegRankBoostLearner{
private:
bool init_trainer, init_model;
public:
Booster(const std::vector<const regrank::DMatrix *> mats){
silent = 1;
init_trainer = false;
init_model = false;
this->SetCacheData(mats);
}
inline void CheckInit(void){
if( !init_trainer ){
this->InitTrainer(); init_trainer = true;
}
if( !init_model ){
this->InitModel(); init_model = true;
}
}
inline void LoadModel( const char *fname ){
xgboost::regrank::RegRankBoostLearner::LoadModel(fname);
this->init_model = true;
}
const float *Pred( const DMatrix &dmat, size_t *len ){
this->Predict( this->preds_, dmat );
*len = this->preds_.size();
return &this->preds_[0];
}
};
};
};
using namespace xgboost::python;
extern "C"{
void* XGDMatrixCreate( void ){
return new DMatrix();
@ -94,6 +137,12 @@ extern "C"{
void XGDMatrixSetLabel( void *handle, const float *label, size_t len ){
static_cast<DMatrix*>(handle)->SetLabel(label,len);
}
void XGDMatrixSetWeight( void *handle, const float *weight, size_t len ){
static_cast<DMatrix*>(handle)->SetWeight(weight,len);
}
void XGDMatrixSetGroup( void *handle, const unsigned *group, size_t len ){
static_cast<DMatrix*>(handle)->SetGroup(group,len);
}
const float* XGDMatrixGetLabel( const void *handle, size_t* len ){
return static_cast<const DMatrix*>(handle)->GetLabel(len);
}
@ -109,5 +158,54 @@ extern "C"{
const XGEntry* XGDMatrixGetRow(void *handle, unsigned ridx, size_t* len){
return static_cast<DMatrix*>(handle)->GetRow(ridx, len);
}
// xgboost implementation
void *XGBoosterCreate( void *dmats[], size_t len ){
std::vector<const xgboost::regrank::DMatrix*> mats;
for( size_t i = 0; i < len; ++i ){
mats.push_back( static_cast<DMatrix*>(dmats[i]) );
}
return new Booster( mats );
}
void XGBoosterSetParam( void *handle, const char *name, const char *value ){
static_cast<Booster*>(handle)->SetParam( name, value );
}
void XGBoosterUpdateOneIter( void *handle, void *dtrain ){
Booster *bst = static_cast<Booster*>(handle);
DMatrix *dtr = static_cast<DMatrix*>(dtrain);
bst->CheckInit(); dtr->CheckInit();
bst->UpdateOneIter( *dtr );
}
void XGBoosterEvalOneIter( void *handle, int iter, void *dmats[], const char *evnames[], size_t len ){
Booster *bst = static_cast<Booster*>(handle);
bst->CheckInit();
std::vector<std::string> names;
std::vector<const xgboost::regrank::DMatrix*> mats;
for( size_t i = 0; i < len; ++i ){
mats.push_back( static_cast<DMatrix*>(dmats[i]) );
names.push_back( std::string( evnames[i]) );
}
bst->EvalOneIter( iter, mats, names, stdout );
}
const float *XGBoosterPredict( void *handle, void *dmat, size_t *len ){
return static_cast<Booster*>(handle)->Pred( *static_cast<DMatrix*>(dmat), len );
}
void XGBoosterLoadModel( void *handle, const char *fname ){
static_cast<Booster*>(handle)->LoadModel( fname );
}
void XGBoosterSaveModel( const void *handle, const char *fname ){
static_cast<const Booster*>(handle)->SaveModel( fname );
}
void XGBoosterDumpModel( void *handle, const char *fname, const char *fmap ){
using namespace xgboost::utils;
FILE *fo = FopenCheck( fname, "w" );
FeatMap featmap;
if( strlen(fmap) != 0 ){
featmap.LoadText( fmap );
}
static_cast<Booster*>(handle)->DumpModel( fo, featmap, false );
fclose( fo );
}
};
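XGDMatrixSetGroup above turns per-query group sizes into cumulative group_ptr offsets for ranking; a small hedged sketch from the Python side, with illustrative file name and group sizes.
# hedged sketch: file name and group sizes are illustrative
import xgboost
dtrain = xgboost.DMatrix('rank.train')
dtrain.set_group([2, 3, 4])   # three queries with 2, 3 and 4 instances
# SetGroup builds the prefix sums internally: group_ptr = [0, 2, 5, 9]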

View File

@ -52,10 +52,24 @@ extern "C"{
/*!
* \brief set label of the training matrix
* \param handle an instance of data matrix
* \param data array of row content
* \param label pointer to label
* \param len length of array
*/
void XGDMatrixSetLabel( void *handle, const float *label, size_t len );
/*!
* \brief set group sizes of the training matrix, used for ranking
* \param handle an instance of data matrix
* \param group pointer to group sizes
* \param len length of array
*/
void XGDMatrixSetGroup( void *handle, const unsigned *group, size_t len );
/*!
* \brief set weight of each instance
* \param handle an instance of data matrix
* \param weight pointer to weights
* \param len length of array
*/
void XGDMatrixSetWeight( void *handle, const float *weight, size_t len );
/*!
* \brief get label set from matrix
* \param handle an instance of data matrix
@ -94,7 +108,7 @@ extern "C"{
* \param dmats matrices that are set to be cached
* \param len length of dmats
* \return handle to the created booster
*/
void *CreateXGBooster( void**dmats, size_t len );
void *XGBoosterCreate( void* dmats[], size_t len );
/*!
* \brief set parameters
* \param handle handle
@ -135,7 +149,14 @@ extern "C"{
* \param handle handle
* \param fname file name
*/
void XGBoosterSaveModel( void *handle, const char *fname );
void XGBoosterSaveModel( const void *handle, const char *fname );
/*!
* \brief dump model into text file
* \param handle handle
* \param fname file name
* \param fmap name of the feature map file, can be an empty string
*/
void XGBoosterDumpModel( void *handle, const char *fname, const char *fmap );
};
#endif
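The Python wrapper above only sets restype for a few entry points and leaves argtypes to ctypes defaults; a hedged sketch of declaring the new booster prototypes explicitly, mirroring this header (not part of the commit).
# hedged sketch: explicit ctypes declarations mirroring the header above
import ctypes
xglib = ctypes.cdll.LoadLibrary('./libxgboostpy.so')
xglib.XGBoosterCreate.restype = ctypes.c_void_p
xglib.XGBoosterCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]
xglib.XGBoosterSetParam.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_char_p]
xglib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float)
xglib.XGBoosterPredict.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.POINTER(ctypes.c_size_t)]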

View File

@ -31,7 +31,7 @@ namespace xgboost{
* \brief a regression booster associated with training and evaluating data
* \param mats array of pointers to matrices whose prediction results need to be cached
*/
RegRankBoostLearner(const std::vector<const DMatrix *> mats){
RegRankBoostLearner(const std::vector<const DMatrix *>& mats){
silent = 0;
obj_ = NULL;
name_obj_ = "reg";
@ -45,14 +45,19 @@ namespace xgboost{
* data matrices to continue training otherwise it will cause error
* \param mats array of pointers to matrices whose prediction results need to be cached
*/
inline void SetCacheData(const std::vector<const DMatrix *> mats){
inline void SetCacheData(const std::vector<const DMatrix *>& mats){
// estimate feature bound
int num_feature = 0;
// assign buffer index
unsigned buffer_size = 0;
utils::Assert( cache_.size() == 0, "can only call cache data once" );
for( size_t i = 0; i < mats.size(); ++i ){
bool dupilicate = false;
for( size_t j = 0; j < i; ++ j ){
if( mats[i] == mats[j] ) dupilicate = true;
}
if( dupilicate ) continue;
cache_.push_back( CacheEntry( mats[i], buffer_size ) );
buffer_size += static_cast<unsigned>(mats[i]->Size());
num_feature = std::max(num_feature, (int)(mats[i]->data.NumCol()));
@ -105,9 +110,18 @@ namespace xgboost{
mparam.AdjustBase();
}
/*!
* \brief load model from stream
* \param fi input stream
*/
* \brief load model from file
* \param fname file name
*/
inline void LoadModel(const char *fname){
utils::FileStream fi(utils::FopenCheck(fname, "rb"));
this->LoadModel(fi);
fi.Close();
}
/*!
* \brief load model from stream
* \param fi input stream
*/
inline void LoadModel(utils::IStream &fi){
base_gbm.LoadModel(fi);
utils::Assert(fi.Read(&mparam, sizeof(ModelParam)) != 0);
@ -138,10 +152,18 @@ namespace xgboost{
fo.Write(&mparam, sizeof(ModelParam));
}
/*!
* \brief update the model for one iteration
* \param iteration iteration number
* \brief save model into file
* \param fname file name
*/
inline void UpdateOneIter(int iter, const DMatrix &train){
inline void SaveModel(const char *fname) const{
utils::FileStream fo(utils::FopenCheck(fname, "wb"));
this->SaveModel(fo);
fo.Close();
}
/*!
* \brief update the model for one iteration
*/
inline void UpdateOneIter(const DMatrix &train){
this->PredictRaw(preds_, train);
obj_->GetGradient(preds_, train.info, base_gbm.NumBoosters(), grad_, hess_);
// do boost
@ -295,7 +317,7 @@ namespace xgboost{
}
return -1;
}
private:
protected:
int silent;
EvalSet evaluator_;
booster::GBMBase base_gbm;
@ -305,7 +327,7 @@ namespace xgboost{
// name of objective function
std::string name_obj_;
std::vector< std::pair<std::string, std::string> > cfg_;
private:
protected:
std::vector<float> grad_, hess_, preds_;
};
}
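The file-based LoadModel/SaveModel overloads added here back save_model/load_model in the Python Booster; a short hedged round-trip sketch (file name and parameters are placeholders).
# hedged sketch: file name and parameters are placeholders
import xgboost
dtrain = xgboost.DMatrix('train.svm')
param = {'bst:eta': 0.3, 'silent': 1}
bst = xgboost.train(param, dtrain, num_boost_round=10)
bst.save_model('0010.model')              # SaveModel(fname) writes via utils::FopenCheck
bst2 = xgboost.Booster(param, [dtrain])
bst2.load_model('0010.model')             # LoadModel(fname) also marks init_model in the wrapper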

View File

@ -166,7 +166,11 @@ namespace xgboost{
inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){
int len = strlen(fname);
if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
this->LoadBinary(fname, silent); return;
if( !this->LoadBinary(fname, silent) ){
fprintf(stderr,"can not open file \"%s\"", fname);
utils::Error("DMatrix::CacheLoad failed");
}
return;
}
char bname[1024];
sprintf(bname, "%s.buffer", fname);
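CacheLoad above treats a file ending in .buffer as a pre-built binary cache and otherwise derives a companion <fname>.buffer path; a hedged sketch of how this can surface through the Python DMatrix constructor, assuming XGDMatrixLoad routes through CacheLoad (file names are placeholders).
# hedged sketch: file names are placeholders
import xgboost
dtrain = xgboost.DMatrix('train.svm')            # text load; a 'train.svm.buffer' cache may be saved
dtrain2 = xgboost.DMatrix('train.svm.buffer')    # a .buffer name loads the binary cache directly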

View File

@ -163,7 +163,7 @@ namespace xgboost{
for (int i = 0; i < num_round; ++i){
elapsed = (unsigned long)(time(NULL) - start);
if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
learner.UpdateOneIter(i, data);
learner.UpdateOneIter(data);
learner.EvalOneIter(i, devalall, eval_data_names);
if (save_period != 0 && (i + 1) % save_period == 0){
this->SaveModel(i);

View File

@ -31,7 +31,7 @@ namespace xgboost{
/*! \brief load feature map from text format */
inline void LoadText(FILE *fi){
int fid;
char fname[256], ftype[256];
char fname[1256], ftype[1256];
while (fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3){
utils::Assert(fid == (int)names_.size(), "invalid fmap format");
names_.push_back(std::string(fname));
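LoadText above reads one tab-separated fid&lt;TAB&gt;name&lt;TAB&gt;type line per feature; a hedged example of such a feature map file and how the Python wrapper would pass it to dump_model (feature names and types are illustrative).
# featmap.txt, one tab-separated line per feature (illustrative):
#   0<TAB>age<TAB>q
#   1<TAB>gender<TAB>i
#   2<TAB>income<TAB>q
# feature indices must be consecutive from 0, as checked by the Assert above
# bst: a trained Booster, e.g. from the sketch after the python module above
bst.dump_model('dump.txt', fmap='featmap.txt')   # forwarded to XGBoosterDumpModel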

View File

@ -38,6 +38,7 @@ namespace xgboost{
namespace utils{
inline void Error(const char *msg){
fprintf(stderr, "Error:%s\n", msg);
fflush(stderr);
exit(-1);
}
@ -57,7 +58,8 @@ namespace xgboost{
inline FILE *FopenCheck(const char *fname, const char *flag){
FILE *fp = fopen64(fname, flag);
if (fp == NULL){
fprintf(stderr, "can not open file \"%s\"\n", fname);
fprintf(stderr, "can not open file \"%s\" \n", fname);
fflush(stderr);
exit(-1);
}
return fp;