finish python lib
This commit is contained in:
parent
20de7f8f97
commit
adc9400736
@ -1,30 +1,35 @@
|
||||
# module for xgboost
|
||||
import ctypes
|
||||
# optionally have scipy sparse, though not necessary
|
||||
import numpy as np
|
||||
import scipy.sparse as scp
|
||||
# set this line correctly
|
||||
XGBOOST_PATH = './libxgboostpy.so'
|
||||
|
||||
# entry type of sparse matrix
|
||||
class REntry(ctypes.Structure):
    """One sparse-matrix entry: an unsigned feature index and a float value."""
    _fields_ = [
        ("findex", ctypes.c_uint),
        ("fvalue", ctypes.c_float),
    ]
|
||||
|
||||
# load in xgboost library
|
||||
xglib = ctypes.cdll.LoadLibrary('./libxgboostpy.so')
|
||||
xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH)
|
||||
|
||||
# Declare the return type of every C entry point that does not return int.
# ctypes assumes a C int return value by default, which truncates pointers
# on 64-bit platforms, so each handle/pointer-returning function is listed.
xglib.XGDMatrixCreate.restype = ctypes.c_void_p
xglib.XGDMatrixNumRow.restype = ctypes.c_ulong
xglib.XGDMatrixGetLabel.restype = ctypes.POINTER(ctypes.c_float)
xglib.XGDMatrixGetRow.restype = ctypes.POINTER(REntry)
# XGBoosterCreate returns a void* booster handle, just like XGDMatrixCreate.
xglib.XGBoosterCreate.restype = ctypes.c_void_p
xglib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float)
|
||||
|
||||
# data matrix used in xgboost
|
||||
class DMatrix:
|
||||
# constructor
|
||||
def __init__(self, data=None, label=None):
|
||||
self.handle = xglib.XGDMatrixCreate();
|
||||
self.handle = xglib.XGDMatrixCreate()
|
||||
if data == None:
|
||||
return
|
||||
if type(data) is str:
|
||||
return
|
||||
if isinstance(data,str):
|
||||
xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(data), 1)
|
||||
elif type(data) is scp.csr_matrix:
|
||||
|
||||
elif isinstance(data,scp.csr_matrix):
|
||||
self.__init_from_csr(data)
|
||||
else:
|
||||
try:
|
||||
@ -54,7 +59,13 @@ class DMatrix:
|
||||
xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname), int(silent))
|
||||
# set label of dmatrix
|
||||
def set_label(self, label):
|
||||
xglib.XGDMatrixSetLabel(self.handle, (ctypes.c_float*len(label))(*label), len(label) );
|
||||
xglib.XGDMatrixSetLabel(self.handle, (ctypes.c_float*len(label))(*label), len(label) )
|
||||
# set group size of dmatrix, used for rank
|
||||
def set_group(self, group):
    """Hand the group sizes in `group` to the native matrix as unsigned ints."""
    count = len(group)
    sizes = (ctypes.c_uint * count)(*group)
    xglib.XGDMatrixSetGroup(self.handle, sizes, count)
|
||||
# set weight of each instances
|
||||
def set_weight(self, weight):
    """Set the weight of each instance.

    `weight` is a sequence of numbers with one entry per instance.
    """
    count = len(weight)
    # The C entry point takes `const float *weight`, so the buffer must be a
    # c_float array; a c_uint array would be reinterpreted as garbage floats.
    weights = (ctypes.c_float * count)(*weight)
    xglib.XGDMatrixSetWeight(self.handle, weights, count)
|
||||
# get label from dmatrix
|
||||
def get_label(self):
|
||||
length = ctypes.c_ulong()
|
||||
@ -66,16 +77,57 @@ class DMatrix:
|
||||
def num_row(self):
    """Return the row count the native library reports for this matrix."""
    n_rows = xglib.XGDMatrixNumRow(self.handle)
    return n_rows
|
||||
# append a row to DMatrix
|
||||
def add_row(self, row, label):
|
||||
xglib.XGDMatrixAddRow(self.handle, (REntry*len(row))(*row), len(row), label )
|
||||
def add_row(self, row):
|
||||
xglib.XGDMatrixAddRow(self.handle, (REntry*len(row))(*row), len(row) )
|
||||
# get n-th row from DMatrix
|
||||
def __getitem__(self, ridx):
    """Return row `ridx` as a list of (feature index, feature value) tuples."""
    # XGDMatrixGetRow writes the row length through a `size_t*`, so the
    # out-parameter must be c_size_t (c_ulong is narrower on some platforms).
    length = ctypes.c_size_t()
    row = xglib.XGDMatrixGetRow(self.handle, ridx, ctypes.byref(length))
    return [(int(row[i].findex), row[i].fvalue) for i in range(length.value)]
|
||||
|
||||
class Booster:
    """Learner class wrapping a native xgboost booster handle."""

    def __init__(self, params, cache=None):
        """Create a native booster and apply `params` to it.

        params: dict mapping parameter name to value; each value is passed
                to the library as str(value).
        cache:  optional list of DMatrix objects handed to XGBoosterCreate.
        """
        # A None default avoids sharing one mutable list between all calls.
        cache = [] if cache is None else cache
        for d in cache:
            assert isinstance(d, DMatrix)
        dmats = (ctypes.c_void_p * len(cache))(*[ctypes.c_void_p(d.handle) for d in cache])
        self.handle = xglib.XGBoosterCreate(dmats, len(cache))
        for k, v in params.items():
            xglib.XGBoosterSetParam(self.handle, ctypes.c_char_p(k), ctypes.c_char_p(str(v)))

    def update(self, dtrain):
        """Run one boosting iteration on the DMatrix `dtrain`."""
        assert isinstance(dtrain, DMatrix)
        xglib.XGBoosterUpdateOneIter(self.handle, dtrain.handle)

    def eval_set(self, evals, it=0):
        """Evaluate a list of (DMatrix, name) pairs for iteration number `it`."""
        for d in evals:
            assert isinstance(d[0], DMatrix)
            assert isinstance(d[1], str)
        dmats = (ctypes.c_void_p * len(evals))(*[ctypes.c_void_p(d[0].handle) for d in evals])
        evnames = (ctypes.c_char_p * len(evals))(*[ctypes.c_char_p(d[1]) for d in evals])
        xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals))

    def eval(self, mat, name='eval', it=0):
        """Evaluate the single DMatrix `mat` under the label `name`."""
        self.eval_set([(mat, name)], it)

    def predict(self, data):
        """Return the predictions for DMatrix `data` as a list of floats."""
        # XGBoosterPredict writes the result length through a `size_t*`.
        length = ctypes.c_size_t()
        preds = xglib.XGBoosterPredict(self.handle, data.handle, ctypes.byref(length))
        return [preds[i] for i in range(length.value)]

    def save_model(self, fname):
        """Save the model to the file `fname`."""
        xglib.XGBoosterSaveModel(self.handle, ctypes.c_char_p(fname))

    def load_model(self, fname):
        """Load the model from the file `fname`."""
        xglib.XGBoosterLoadModel(self.handle, ctypes.c_char_p(fname))

    def dump_model(self, fname, fmap=''):
        """Dump the model into the text file `fname`.

        fmap: path of a feature map file; may be the empty string.
        """
        xglib.XGBoosterDumpModel(self.handle, ctypes.c_char_p(fname), ctypes.c_char_p(fmap))
|
||||
|
||||
|
||||
mat = DMatrix('xx.buffer')
|
||||
print mat.num_row()
|
||||
mat.clear()
|
||||
def train(params, dtrain, num_boost_round=10, evals=None):
    """Train a booster with the given parameters.

    params:          dict of booster parameters, passed to Booster.
    dtrain:          DMatrix used for training (also given as cache data).
    num_boost_round: number of boosting iterations to run.
    evals:           optional list of (DMatrix, name) pairs evaluated after
                     every iteration; None or empty skips evaluation.

    Returns the trained Booster.
    """
    bst = Booster(params, [dtrain])
    for i in range(num_boost_round):
        bst.update(dtrain)
        # A None default avoids a shared mutable default argument; both None
        # and an empty list mean "nothing to evaluate".
        if evals:
            bst.eval_set(evals, i)
    return bst
|
||||
|
||||
|
||||
@ -56,20 +56,63 @@ namespace xgboost{
|
||||
this->info.labels.resize( len );
|
||||
memcpy( &(this->info).labels[0], label, sizeof(float)*len );
|
||||
}
|
||||
/*! \brief fill group_ptr with the running totals of the sizes in group[0..len) */
inline void SetGroup( const unsigned *group, size_t len ){
    this->info.group_ptr.resize( len + 1 );
    this->info.group_ptr[0] = 0;
    // entry g holds the sum of the first g group sizes
    for( size_t g = 1; g <= len; ++ g ){
        this->info.group_ptr[g] = this->info.group_ptr[g-1] + group[g-1];
    }
}
|
||||
/*!
 * \brief copy len instance weights into info.weights
 * \param weight pointer to the weights
 * \param len number of weights
 */
inline void SetWeight( const float *weight, size_t len ){
    this->info.weights.resize( len );
    // weights[0] must not be touched on an empty vector, so skip the copy
    // when there is nothing to copy
    if( len != 0 ){
        memcpy( &(this->info).weights[0], weight, sizeof(float)*len );
    }
}
|
||||
/*!
 * \brief get the stored labels
 * \param len output: number of labels
 * \return pointer to the first label, or NULL when no label is stored
 */
inline const float* GetLabel( size_t* len ) const{
    *len = this->info.labels.size();
    // labels[0] is not valid on an empty vector
    if( this->info.labels.empty() ) return NULL;
    return &(this->info.labels[0]);
}
|
||||
inline void InitTrain(void){
|
||||
if(!this->data.HaveColAccess()) this->data.InitData();
|
||||
inline void CheckInit(void){
|
||||
if(!this->data.HaveColAccess()){
|
||||
this->data.InitData();
|
||||
}
|
||||
utils::Assert( this->data.NumRow() == this->info.labels.size(), "DMatrix: number of labels must match number of rows in matrix");
|
||||
}
|
||||
};
|
||||
|
||||
class Booster: public xgboost::regrank::RegRankBoostLearner{
|
||||
private:
|
||||
bool init_trainer, init_model;
|
||||
public:
|
||||
Booster(const std::vector<const regrank::DMatrix *> mats){
|
||||
silent = 1;
|
||||
init_trainer = false;
|
||||
init_model = false;
|
||||
this->SetCacheData(mats);
|
||||
}
|
||||
inline void CheckInit(void){
|
||||
if( !init_trainer ){
|
||||
this->InitTrainer(); init_trainer = true;
|
||||
}
|
||||
if( !init_model ){
|
||||
this->InitModel(); init_model = true;
|
||||
}
|
||||
}
|
||||
inline void LoadModel( const char *fname ){
|
||||
xgboost::regrank::RegRankBoostLearner::LoadModel(fname);
|
||||
this->init_model = true;
|
||||
}
|
||||
const float *Pred( const DMatrix &dmat, size_t *len ){
|
||||
this->Predict( this->preds_, dmat );
|
||||
*len = this->preds_.size();
|
||||
return &this->preds_[0];
|
||||
}
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
using namespace xgboost::python;
|
||||
|
||||
|
||||
extern "C"{
|
||||
void* XGDMatrixCreate( void ){
|
||||
return new DMatrix();
|
||||
@ -94,6 +137,12 @@ extern "C"{
|
||||
void XGDMatrixSetLabel( void *handle, const float *label, size_t len ){
|
||||
static_cast<DMatrix*>(handle)->SetLabel(label,len);
|
||||
}
|
||||
void XGDMatrixSetWeight( void *handle, const float *weight, size_t len ){
|
||||
static_cast<DMatrix*>(handle)->SetWeight(weight,len);
|
||||
}
|
||||
void XGDMatrixSetGroup( void *handle, const unsigned *group, size_t len ){
|
||||
static_cast<DMatrix*>(handle)->SetGroup(group,len);
|
||||
}
|
||||
const float* XGDMatrixGetLabel( const void *handle, size_t* len ){
|
||||
return static_cast<const DMatrix*>(handle)->GetLabel(len);
|
||||
}
|
||||
@ -109,5 +158,54 @@ extern "C"{
|
||||
const XGEntry* XGDMatrixGetRow(void *handle, unsigned ridx, size_t* len){
    // the opaque handle is a DMatrix created by this library
    DMatrix *dmat = static_cast<DMatrix*>(handle);
    return dmat->GetRow( ridx, len );
}
|
||||
|
||||
// xgboost implementation
|
||||
void *XGBoosterCreate( void *dmats[], size_t len ){
|
||||
std::vector<const xgboost::regrank::DMatrix*> mats;
|
||||
for( size_t i = 0; i < len; ++i ){
|
||||
mats.push_back( static_cast<DMatrix*>(dmats[i]) );
|
||||
}
|
||||
return new Booster( mats );
|
||||
}
|
||||
void XGBoosterSetParam( void *handle, const char *name, const char *value ){
|
||||
static_cast<Booster*>(handle)->SetParam( name, value );
|
||||
}
|
||||
void XGBoosterUpdateOneIter( void *handle, void *dtrain ){
|
||||
Booster *bst = static_cast<Booster*>(handle);
|
||||
DMatrix *dtr = static_cast<DMatrix*>(dtrain);
|
||||
bst->CheckInit(); dtr->CheckInit();
|
||||
bst->UpdateOneIter( *dtr );
|
||||
}
|
||||
void XGBoosterEvalOneIter( void *handle, int iter, void *dmats[], const char *evnames[], size_t len ){
|
||||
Booster *bst = static_cast<Booster*>(handle);
|
||||
bst->CheckInit();
|
||||
|
||||
std::vector<std::string> names;
|
||||
std::vector<const xgboost::regrank::DMatrix*> mats;
|
||||
for( size_t i = 0; i < len; ++i ){
|
||||
mats.push_back( static_cast<DMatrix*>(dmats[i]) );
|
||||
names.push_back( std::string( evnames[i]) );
|
||||
}
|
||||
bst->EvalOneIter( iter, mats, names, stdout );
|
||||
}
|
||||
const float *XGBoosterPredict( void *handle, void *dmat, size_t *len ){
|
||||
return static_cast<Booster*>(handle)->Pred( *static_cast<DMatrix*>(dmat), len );
|
||||
}
|
||||
void XGBoosterLoadModel( void *handle, const char *fname ){
|
||||
static_cast<Booster*>(handle)->LoadModel( fname );
|
||||
}
|
||||
void XGBoosterSaveModel( const void *handle, const char *fname ){
|
||||
static_cast<const Booster*>(handle)->SaveModel( fname );
|
||||
}
|
||||
void XGBoosterDumpModel( void *handle, const char *fname, const char *fmap ){
|
||||
using namespace xgboost::utils;
|
||||
FILE *fo = FopenCheck( fname, "w" );
|
||||
FeatMap featmap;
|
||||
if( strlen(fmap) != 0 ){
|
||||
featmap.LoadText( fmap );
|
||||
}
|
||||
static_cast<Booster*>(handle)->DumpModel( fo, featmap, false );
|
||||
fclose( fo );
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -52,10 +52,24 @@ extern "C"{
|
||||
/*!
|
||||
* \brief set label of the training matrix
|
||||
* \param handle a instance of data matrix
|
||||
* \param data array of row content
|
||||
* \param label pointer to label
|
||||
* \param len length of array
|
||||
*/
|
||||
void XGDMatrixSetLabel( void *handle, const float *label, size_t len );
|
||||
/*!
|
||||
* \brief set group sizes of the data matrix, used for ranking
|
||||
* \param handle a instance of data matrix
|
||||
* \param group pointer to group size
|
||||
* \param len length of array
|
||||
*/
|
||||
void XGDMatrixSetGroup( void *handle, const unsigned *group, size_t len );
|
||||
/*!
|
||||
* \brief set weight of each instance
|
||||
* \param handle a instance of data matrix
|
||||
* \param weight data pointer to weights
|
||||
* \param len length of array
|
||||
*/
|
||||
void XGDMatrixSetWeight( void *handle, const float *weight, size_t len );
|
||||
/*!
|
||||
* \brief get label set from matrix
|
||||
* \param handle a instance of data matrix
|
||||
@ -94,7 +108,7 @@ extern "C"{
|
||||
* \param dmats matrices that are set to be cached
|
||||
* \return handle of the created booster
|
||||
*/
|
||||
void *CreateXGBooster( void**dmats, size_t len );
|
||||
void *XGBoosterCreate( void* dmats[], size_t len );
|
||||
/*!
|
||||
* \brief set parameters
|
||||
* \param handle handle
|
||||
@ -135,7 +149,14 @@ extern "C"{
|
||||
* \param handle handle
|
||||
* \param fname file name
|
||||
*/
|
||||
void XGBoosterSaveModel( void *handle, const char *fname );
|
||||
void XGBoosterSaveModel( const void *handle, const char *fname );
|
||||
/*!
|
||||
* \brief dump model into text file
|
||||
* \param handle handle
|
||||
* \param fname file name
|
||||
* \param fmap name to fmap can be empty string
|
||||
*/
|
||||
void XGBoosterDumpModel( void *handle, const char *fname, const char *fmap );
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
@ -31,7 +31,7 @@ namespace xgboost{
|
||||
* \brief a regression booster associated with training and evaluating data
|
||||
* \param mats array of pointers to matrix whose prediction result need to be cached
|
||||
*/
|
||||
RegRankBoostLearner(const std::vector<const DMatrix *> mats){
|
||||
RegRankBoostLearner(const std::vector<const DMatrix *>& mats){
|
||||
silent = 0;
|
||||
obj_ = NULL;
|
||||
name_obj_ = "reg";
|
||||
@ -45,14 +45,19 @@ namespace xgboost{
|
||||
* data matrices to continue training otherwise it will cause error
|
||||
* \param mats array of pointers to matrix whose prediction result need to be cached
|
||||
*/
|
||||
inline void SetCacheData(const std::vector<const DMatrix *> mats){
|
||||
inline void SetCacheData(const std::vector<const DMatrix *>& mats){
|
||||
// estimate feature bound
|
||||
int num_feature = 0;
|
||||
// assign buffer index
|
||||
unsigned buffer_size = 0;
|
||||
|
||||
|
||||
utils::Assert( cache_.size() == 0, "can only call cache data once" );
|
||||
for( size_t i = 0; i < mats.size(); ++i ){
|
||||
bool dupilicate = false;
|
||||
for( size_t j = 0; j < i; ++ j ){
|
||||
if( mats[i] == mats[j] ) dupilicate = true;
|
||||
}
|
||||
if( dupilicate ) continue;
|
||||
cache_.push_back( CacheEntry( mats[i], buffer_size ) );
|
||||
buffer_size += static_cast<unsigned>(mats[i]->Size());
|
||||
num_feature = std::max(num_feature, (int)(mats[i]->data.NumCol()));
|
||||
@ -105,9 +110,18 @@ namespace xgboost{
|
||||
mparam.AdjustBase();
|
||||
}
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
* \brief load model from file
|
||||
* \param fname file name
|
||||
*/
|
||||
inline void LoadModel(const char *fname){
|
||||
utils::FileStream fi(utils::FopenCheck(fname, "rb"));
|
||||
this->LoadModel(fi);
|
||||
fi.Close();
|
||||
}
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
inline void LoadModel(utils::IStream &fi){
|
||||
base_gbm.LoadModel(fi);
|
||||
utils::Assert(fi.Read(&mparam, sizeof(ModelParam)) != 0);
|
||||
@ -138,10 +152,18 @@ namespace xgboost{
|
||||
fo.Write(&mparam, sizeof(ModelParam));
|
||||
}
|
||||
/*!
|
||||
* \brief update the model for one iteration
|
||||
* \param iteration iteration number
|
||||
* \brief save model into file
|
||||
* \param fname file name
|
||||
*/
|
||||
inline void UpdateOneIter(int iter, const DMatrix &train){
|
||||
inline void SaveModel(const char *fname) const{
|
||||
utils::FileStream fo(utils::FopenCheck(fname, "wb"));
|
||||
this->SaveModel(fo);
|
||||
fo.Close();
|
||||
}
|
||||
/*!
|
||||
* \brief update the model for one iteration
|
||||
*/
|
||||
inline void UpdateOneIter(const DMatrix &train){
|
||||
this->PredictRaw(preds_, train);
|
||||
obj_->GetGradient(preds_, train.info, base_gbm.NumBoosters(), grad_, hess_);
|
||||
// do boost
|
||||
@ -295,7 +317,7 @@ namespace xgboost{
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
private:
|
||||
protected:
|
||||
int silent;
|
||||
EvalSet evaluator_;
|
||||
booster::GBMBase base_gbm;
|
||||
@ -305,7 +327,7 @@ namespace xgboost{
|
||||
// name of objective function
|
||||
std::string name_obj_;
|
||||
std::vector< std::pair<std::string, std::string> > cfg_;
|
||||
private:
|
||||
protected:
|
||||
std::vector<float> grad_, hess_, preds_;
|
||||
};
|
||||
}
|
||||
|
||||
@ -166,7 +166,11 @@ namespace xgboost{
|
||||
inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){
|
||||
int len = strlen(fname);
|
||||
if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
|
||||
this->LoadBinary(fname, silent); return;
|
||||
if( !this->LoadBinary(fname, silent) ){
|
||||
fprintf(stderr,"can not open file \"%s\"", fname);
|
||||
utils::Error("DMatrix::CacheLoad failed");
|
||||
}
|
||||
return;
|
||||
}
|
||||
char bname[1024];
|
||||
sprintf(bname, "%s.buffer", fname);
|
||||
|
||||
@ -163,7 +163,7 @@ namespace xgboost{
|
||||
for (int i = 0; i < num_round; ++i){
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
|
||||
learner.UpdateOneIter(i, data);
|
||||
learner.UpdateOneIter(data);
|
||||
learner.EvalOneIter(i, devalall, eval_data_names);
|
||||
if (save_period != 0 && (i + 1) % save_period == 0){
|
||||
this->SaveModel(i);
|
||||
|
||||
@ -31,7 +31,7 @@ namespace xgboost{
|
||||
/*! \brief load feature map from text format */
|
||||
inline void LoadText(FILE *fi){
|
||||
int fid;
|
||||
char fname[256], ftype[256];
|
||||
char fname[1256], ftype[1256];
|
||||
while (fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3){
|
||||
utils::Assert(fid == (int)names_.size(), "invalid fmap format");
|
||||
names_.push_back(std::string(fname));
|
||||
|
||||
@ -38,6 +38,7 @@ namespace xgboost{
|
||||
namespace utils{
|
||||
/*! \brief print "Error:" followed by msg to stderr, flush it, and exit with code -1 */
inline void Error(const char *msg){
    FILE *err = stderr;
    fprintf(err, "Error:%s\n", msg);
    // flush explicitly so the message is written before the process exits
    fflush(err);
    exit(-1);
}
|
||||
|
||||
@ -57,7 +58,8 @@ namespace xgboost{
|
||||
inline FILE *FopenCheck(const char *fname, const char *flag){
|
||||
FILE *fp = fopen64(fname, flag);
|
||||
if (fp == NULL){
|
||||
fprintf(stderr, "can not open file \"%s\"\n", fname);
|
||||
fprintf(stderr, "can not open file \"%s\" \n", fname);
|
||||
fflush(stderr);
|
||||
exit(-1);
|
||||
}
|
||||
return fp;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user