194 lines
8.4 KiB
Python
194 lines
8.4 KiB
Python
# Author: Tianqi Chen, Bing Xu
|
|
# module for xgboost
|
|
import ctypes
|
|
import os
|
|
# optionally have scipy sparse, though not necessary
|
|
import numpy
|
|
import numpy.ctypeslib
|
|
import scipy.sparse as scp
|
|
|
|
# Path of the compiled xgboost wrapper library, expected next to this module.
# The module path is made absolute first: with a relative __file__ the
# directory part can be empty, and plain string concatenation would then
# yield '/libxgboostpy.so' (the filesystem root) instead of a file beside
# this module.  Set this line correctly if the library lives elsewhere.
XGBOOST_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'libxgboostpy.so')
|
|
|
|
# entry type of sparse matrix
|
|
class REntry(ctypes.Structure):
    """One sparse-matrix entry: an unsigned feature index and a float value."""

    _fields_ = [
        ("findex", ctypes.c_uint),
        ("fvalue", ctypes.c_float),
    ]
|
|
|
|
# load in xgboost library
xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH)

# ctypes assumes every foreign function returns a C int unless told otherwise;
# the table below declares the actual return type of each wrapper function
# whose result is a handle, an unsigned long, or a pointer to an array.
for _func_name, _result_type in (
    ('XGDMatrixCreate', ctypes.c_void_p),
    ('XGDMatrixNumRow', ctypes.c_ulong),
    ('XGDMatrixGetLabel', ctypes.POINTER(ctypes.c_float)),
    ('XGDMatrixGetWeight', ctypes.POINTER(ctypes.c_float)),
    ('XGDMatrixGetRow', ctypes.POINTER(REntry)),
    ('XGBoosterCreate', ctypes.c_void_p),
    ('XGBoosterPredict', ctypes.POINTER(ctypes.c_float)),
):
    getattr(xglib, _func_name).restype = _result_type
del _func_name, _result_type
|
|
|
|
def ctypes2numpy(cptr, length):
    """Copy `length` floats from a ctypes float pointer into a new numpy array.

    cptr:   instance of ctypes.POINTER(ctypes.c_float)
    length: number of float elements to copy
    Returns a freshly allocated 1-d float32 numpy array; the memory behind
    `cptr` is copied, not referenced.
    Raises TypeError if `cptr` is not a float pointer, and ValueError if it is
    a NULL pointer while `length` is positive.
    """
    if not isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
        raise TypeError('expected a ctypes float pointer, got ' + str(type(cptr)))
    res = numpy.zeros(length, dtype='float32')
    if length > 0:
        if not cptr:
            raise ValueError('can not copy %d floats from a NULL pointer' % length)
        # The copy must not live inside an assert statement: asserts are
        # removed under `python -O`, which would silently return all zeros.
        ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0])
    return res
|
|
|
|
# data matrix used in xgboost
|
|
class DMatrix:
    """Data matrix used in xgboost.

    Wraps a matrix object created by the xgboost library.  `self.handle` holds
    the value returned by XGDMatrixCreate (a plain Python integer address, or
    None for a NULL pointer) and is released in __del__.
    """

    def __init__(self, data=None, label=None, missing=0.0, weight=None):
        """Create a matrix and optionally fill it with data.

        data:    None (leave the matrix empty), a file name (str), a
                 scipy.sparse.csr_matrix, a 2-d numpy.ndarray, or any object
                 that scipy.sparse.csr_matrix() can convert
        label:   optional sequence of labels, applied with set_label
        missing: forwarded to XGDMatrixParseMat for 2-d numpy input
                 (presumably the value that marks a missing entry -- confirm
                 against the library)
        weight:  optional sequence of weights, applied with set_weight
        Raises Exception when `data` can not be converted.  Note that label
        and weight are ignored when `data` is None.
        """
        self.handle = xglib.XGDMatrixCreate()
        # Identity tests are required here: `== None` on a numpy array or a
        # sparse matrix compares element-wise and has no single truth value.
        if data is None:
            return
        if isinstance(data, str):
            xglib.XGDMatrixLoad(self._c_handle(), ctypes.c_char_p(data), 1)
        elif isinstance(data, scp.csr_matrix):
            self.__init_from_csr(data)
        elif isinstance(data, numpy.ndarray) and len(data.shape) == 2:
            self.__init_from_npy2d(data, missing)
        else:
            try:
                csr = scp.csr_matrix(data)
                self.__init_from_csr(csr)
            except Exception:
                # `except Exception` lets KeyboardInterrupt/SystemExit through
                raise Exception("can not intialize DMatrix from"+str(type(data)))
        if label is not None:
            self.set_label(label)
        if weight is not None:
            self.set_weight(weight)

    def _c_handle(self):
        """Return the handle wrapped as a pointer-sized ctypes argument.

        ctypes passes a bare Python integer as a C int, which would cut a
        64-bit address down to 32 bits, so the handle is wrapped in c_void_p
        every time it is handed to the library.
        """
        return ctypes.c_void_p(self.handle)

    # convert data from csr matrix
    def __init_from_csr(self, csr):
        """Fill the matrix from the indptr/indices/data arrays of `csr`."""
        assert len(csr.indices) == len(csr.data)
        xglib.XGDMatrixParseCSR(self._c_handle(),
                                (ctypes.c_ulong * len(csr.indptr))(*csr.indptr),
                                (ctypes.c_uint * len(csr.indices))(*csr.indices),
                                (ctypes.c_float * len(csr.data))(*csr.data),
                                len(csr.indptr), len(csr.data))

    # convert data from numpy matrix
    def __init_from_npy2d(self, mat, missing):
        """Fill the matrix from a 2-d numpy array, flattened to float32."""
        data = numpy.array(mat.reshape(mat.size), dtype='float32')
        xglib.XGDMatrixParseMat(self._c_handle(),
                                data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
                                mat.shape[0], mat.shape[1], ctypes.c_float(missing))

    # destructor
    def __del__(self):
        """Release the library-side matrix, if one was created."""
        # __init__ may have failed before the handle was stored, so the
        # attribute is not guaranteed to exist.
        handle = getattr(self, 'handle', None)
        if handle is not None:
            # forget the handle first so it can never be freed twice
            self.handle = None
            xglib.XGDMatrixFree(ctypes.c_void_p(handle))

    # load data from file
    def load(self, fname, silent=True):
        """Load matrix content from file `fname`; `silent` is passed on as an int flag."""
        xglib.XGDMatrixLoad(self._c_handle(), ctypes.c_char_p(fname), int(silent))

    # save data to binary file
    def save_binary(self, fname, silent=True):
        """Save the matrix to file `fname`; `silent` is passed on as an int flag."""
        xglib.XGDMatrixSaveBinary(self._c_handle(), ctypes.c_char_p(fname), int(silent))

    # set label of dmatrix
    def set_label(self, label):
        """Set the labels from a sequence of numbers (copied into a C float array)."""
        xglib.XGDMatrixSetLabel(self._c_handle(), (ctypes.c_float * len(label))(*label), len(label))

    # set group size of dmatrix, used for rank
    def set_group(self, group):
        """Set the group sizes from a sequence of non-negative integers."""
        xglib.XGDMatrixSetGroup(self._c_handle(), (ctypes.c_uint * len(group))(*group), len(group))

    # set weight of each instances
    def set_weight(self, weight):
        """Set the instance weights from a sequence of numbers."""
        xglib.XGDMatrixSetWeight(self._c_handle(), (ctypes.c_float * len(weight))(*weight), len(weight))

    # get label from dmatrix
    def get_label(self):
        """Return the labels as a numpy float32 array (a copy)."""
        length = ctypes.c_ulong()
        labels = xglib.XGDMatrixGetLabel(self._c_handle(), ctypes.byref(length))
        return ctypes2numpy(labels, length.value)

    # get weight from dmatrix
    def get_weight(self):
        """Return the weights as a numpy float32 array (a copy)."""
        length = ctypes.c_ulong()
        weights = xglib.XGDMatrixGetWeight(self._c_handle(), ctypes.byref(length))
        return ctypes2numpy(weights, length.value)

    # clear everything
    def clear(self):
        """Clear the content of the matrix."""
        xglib.XGDMatrixClear(self._c_handle())

    def num_row(self):
        """Return the number of rows reported by the library."""
        return xglib.XGDMatrixNumRow(self._c_handle())

    # append a row to DMatrix
    def add_row(self, row):
        """Append one row, given as a sequence of REntry values (or
        (findex, fvalue) tuples, which ctypes converts to REntry)."""
        xglib.XGDMatrixAddRow(self._c_handle(), (REntry * len(row))(*row), len(row))

    # get n-th row from DMatrix
    def __getitem__(self, ridx):
        """Return row `ridx` as a list of (feature index, feature value) tuples.

        Raises IndexError when `ridx` is outside [0, num_row()).  Without this
        check the library would be asked for a row that does not exist, and
        `for row in dmat` would never terminate.
        """
        if ridx < 0 or ridx >= self.num_row():
            raise IndexError('row index out of range: ' + str(ridx))
        length = ctypes.c_ulong()
        row = xglib.XGDMatrixGetRow(self._c_handle(), ridx, ctypes.byref(length))
        return [(int(row[i].findex), row[i].fvalue) for i in range(length.value)]
|
|
|
|
class Booster:
    """Learner class.

    Wraps a booster object created by the xgboost library.  `self.handle`
    holds the value returned by XGBoosterCreate (a plain Python integer
    address, or None for a NULL pointer) and is released in __del__.
    """

    def __init__(self, params=None, cache=None):
        """Create a booster.

        params: parameters applied through set_param (a dict or an iterable
                of (name, value) pairs); defaults to no parameters
        cache:  list of DMatrix objects handed to XGBoosterCreate; defaults
                to an empty list
        """
        if params is None:
            params = {}
        if cache is None:
            cache = []
        for d in cache:
            assert isinstance(d, DMatrix)
        dmats = (ctypes.c_void_p * len(cache))(*[ctypes.c_void_p(d.handle) for d in cache])
        self.handle = xglib.XGBoosterCreate(dmats, len(cache))
        self.set_param(params)

    def _c_handle(self):
        """Return the handle wrapped as a pointer-sized ctypes argument.

        ctypes passes a bare Python integer as a C int, which would cut a
        64-bit address down to 32 bits, so the handle is wrapped in c_void_p
        every time it is handed to the library.
        """
        return ctypes.c_void_p(self.handle)

    def __del__(self):
        """Release the library-side booster, if one was created."""
        # __init__ may have failed before the handle was stored, so the
        # attribute is not guaranteed to exist.
        handle = getattr(self, 'handle', None)
        if handle is not None:
            # forget the handle first so it can never be freed twice
            self.handle = None
            xglib.XGBoosterFree(ctypes.c_void_p(handle))

    def set_param(self, params, pv=None):
        """Set one or more parameters; values are converted with str().

        Accepted forms:
          set_param({name: value, ...})
          set_param(name, value)            -- `params` is a str, `pv` the value
          set_param([(name, value), ...])   -- any iterable of pairs
        """
        if isinstance(params, dict):
            pairs = params.items()
        elif isinstance(params, str) and pv is not None:
            pairs = [(params, pv)]
        else:
            pairs = params
        for k, v in pairs:
            xglib.XGBoosterSetParam(self._c_handle(), ctypes.c_char_p(k), ctypes.c_char_p(str(v)))

    def update(self, dtrain):
        """Run one update iteration on the DMatrix `dtrain`."""
        assert isinstance(dtrain, DMatrix)
        xglib.XGBoosterUpdateOneIter(self._c_handle(), ctypes.c_void_p(dtrain.handle))

    def boost(self, dtrain, grad, hess, bst_group=-1):
        """Run one boosting iteration with caller-supplied statistics.

        dtrain:    DMatrix to train on
        grad/hess: equally long sequences of numbers, copied into C float
                   arrays (named after gradient and hessian)
        bst_group: forwarded unchanged to XGBoosterBoostOneIter
        """
        assert len(grad) == len(hess)
        assert isinstance(dtrain, DMatrix)
        xglib.XGBoosterBoostOneIter(self._c_handle(), ctypes.c_void_p(dtrain.handle),
                                    (ctypes.c_float * len(grad))(*grad),
                                    (ctypes.c_float * len(hess))(*hess),
                                    len(grad), bst_group)

    def update_interact(self, dtrain, action, booster_index=None):
        """beta: update with specified action.

        When `booster_index` is given it is first stored in the
        'interact:booster_index' parameter.
        """
        assert isinstance(dtrain, DMatrix)
        if booster_index is not None:
            self.set_param('interact:booster_index', str(booster_index))
        xglib.XGBoosterUpdateInteract(self._c_handle(), ctypes.c_void_p(dtrain.handle),
                                      ctypes.c_char_p(str(action)))

    def eval_set(self, evals, it=0):
        """Evaluate a list of (DMatrix, name) pairs for iteration number `it`."""
        for d in evals:
            assert isinstance(d[0], DMatrix)
            assert isinstance(d[1], str)
        dmats = (ctypes.c_void_p * len(evals))(*[ctypes.c_void_p(d[0].handle) for d in evals])
        evnames = (ctypes.c_char_p * len(evals))(*[ctypes.c_char_p(d[1]) for d in evals])
        xglib.XGBoosterEvalOneIter(self._c_handle(), it, dmats, evnames, len(evals))

    def eval(self, mat, name='eval', it=0):
        """Evaluate the single DMatrix `mat` under the label `name`."""
        self.eval_set([(mat, name)], it)

    def predict(self, data, bst_group=-1):
        """Return predictions for the DMatrix `data` as a numpy float32 array.

        `bst_group` is forwarded unchanged to XGBoosterPredict.
        """
        length = ctypes.c_ulong()
        preds = xglib.XGBoosterPredict(self._c_handle(), ctypes.c_void_p(data.handle),
                                       ctypes.byref(length), bst_group)
        return ctypes2numpy(preds, length.value)

    def save_model(self, fname):
        """Save model to file `fname`."""
        xglib.XGBoosterSaveModel(self._c_handle(), ctypes.c_char_p(fname))

    def load_model(self, fname):
        """Load model from file `fname`."""
        xglib.XGBoosterLoadModel(self._c_handle(), ctypes.c_char_p(fname))

    def dump_model(self, fname, fmap=''):
        """Dump model into text file `fname`; `fmap` is passed on to the library."""
        xglib.XGBoosterDumpModel(self._c_handle(), ctypes.c_char_p(fname), ctypes.c_char_p(fmap))
|
|
|
|
def train(params, dtrain, num_boost_round=10, evals=None, obj=None):
    """Train a booster with given parameters.

    params:          parameters passed to the Booster constructor
    dtrain:          DMatrix to train on (also given to the booster as cache)
    num_boost_round: number of boosting iterations
    evals:           optional list of (DMatrix, name) pairs evaluated after
                     every iteration; nothing is evaluated when empty or None
    obj:             optional customized objective, called as
                     obj(pred, dtrain) and expected to return (grad, hess)
    Returns the trained Booster.
    """
    if evals is None:
        evals = []
    bst = Booster(params, [dtrain])
    for i in range(num_boost_round):
        # identity test: `obj == None` would invoke a custom __eq__ on obj
        if obj is None:
            bst.update(dtrain)
        else:
            # customized objective function
            pred = bst.predict(dtrain)
            grad, hess = obj(pred, dtrain)
            bst.boost(dtrain, grad, hess)
        if len(evals) != 0:
            bst.eval_set(evals, i)
    return bst
|
|
|