Conflicts: .gitignore R-package/src/xgboost_R.cpp src/gbm/gblinear-inl.hpp tools/xgcombine_buffer.cpp
560 lines
21 KiB
Python
560 lines
21 KiB
Python
"""
|
|
xgboost: eXtreme Gradient Boosting library
|
|
Author: Tianqi Chen, Bing Xu
|
|
|
|
"""
|
|
import ctypes
|
|
import os
|
|
# optionally have scipy sparse, though not necessary
|
|
import numpy as np
|
|
import sys
|
|
import numpy.ctypeslib
|
|
import scipy.sparse as scp
|
|
|
|
# set this line correctly
XGBOOST_PATH = os.path.dirname(__file__) + (
    '/../windows/x64/Release/xgboost_wrapper.dll' if os.name == 'nt'
    else '/libxgboostwrapper.so')

# load in xgboost library
xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH)

# ctypes treats every foreign function as returning a C int unless restype
# is set, so each wrapper function that hands back a handle, pointer, string
# or unsigned long declares its return type here.
for _func_name, _return_type in (
        # DMatrix functions
        ('XGDMatrixCreateFromFile', ctypes.c_void_p),
        ('XGDMatrixCreateFromCSR', ctypes.c_void_p),
        ('XGDMatrixCreateFromCSC', ctypes.c_void_p),
        ('XGDMatrixCreateFromMat', ctypes.c_void_p),
        ('XGDMatrixSliceDMatrix', ctypes.c_void_p),
        ('XGDMatrixGetFloatInfo', ctypes.POINTER(ctypes.c_float)),
        ('XGDMatrixGetUIntInfo', ctypes.POINTER(ctypes.c_uint)),
        ('XGDMatrixNumRow', ctypes.c_ulong),
        # booster functions
        ('XGBoosterCreate', ctypes.c_void_p),
        ('XGBoosterPredict', ctypes.POINTER(ctypes.c_float)),
        ('XGBoosterEvalOneIter', ctypes.c_char_p),
        ('XGBoosterDumpModel', ctypes.POINTER(ctypes.c_char_p)),
):
    getattr(xglib, _func_name).restype = _return_type
# do not leave the loop variables behind in the module namespace
del _func_name, _return_type
|
|
|
|
def ctypes2numpy(cptr, length, dtype):
    """Copy a ctypes pointer array into a freshly allocated numpy array.

    Args:
        cptr: ctypes.POINTER(ctypes.c_float) or ctypes.POINTER(ctypes.c_uint)
            pointer to the first element of the source buffer
        length: int
            number of elements to copy
        dtype: numpy dtype or dtype string
            element type of the result; its item size decides how many
            bytes are copied per element

    Returns:
        numpy 1d array of `length` elements that owns its own memory

    Raises:
        TypeError: if cptr is not one of the supported pointer types
    """
    supported = (ctypes.POINTER(ctypes.c_float), ctypes.POINTER(ctypes.c_uint))
    if not isinstance(cptr, supported):
        raise TypeError('expected a float or uint ctypes pointer, got ' + str(type(cptr)))
    res = numpy.zeros(length, dtype=dtype)
    # Nothing to copy for an empty result; skipping also avoids touching a
    # pointer that may be NULL when the library reports zero elements.
    if length == 0:
        return res
    # The copy must be a plain statement: wrapped in `assert` it would be
    # stripped under `python -O` and the zero-filled array returned as-is.
    ctypes.memmove(res.ctypes.data, cptr, res.nbytes)
    return res
|
|
|
|
class DMatrix:
    """Data matrix used in xgboost.

    Wraps a handle (ctypes.c_void_p) to a matrix created by the native
    wrapper library; the handle is released in __del__.
    """

    def __init__(self, data, label=None, missing=0.0, weight=None):
        """Constructor of DMatrix.

        Args:
            data: string/numpy array/scipy.sparse
                data source, string type is the path of svmlight format txt file or xgb buffer;
                None creates a wrapper without a native handle (used by slice)
            label: list or numpy 1d array, optional
                label of training data
            missing: float
                value in data which need to be present as missing value
                (only used when data is a 2d numpy array)
            weight: list or numpy 1d array, optional
                weight for each instances

        Raises:
            Exception: if data is of an unsupported type and cannot be
                converted to a scipy CSR matrix
        """
        # Assigned first so that __del__ stays safe if construction fails.
        self.handle = None
        if data is None:
            return
        # force into void_p, mac need to pass things in as void_p
        if isinstance(data, str):
            self.handle = ctypes.c_void_p(
                xglib.XGDMatrixCreateFromFile(ctypes.c_char_p(data.encode('utf-8')), 0))
        elif isinstance(data, scp.csr_matrix):
            self.__init_from_csr(data)
        elif isinstance(data, scp.csc_matrix):
            self.__init_from_csc(data)
        elif isinstance(data, numpy.ndarray) and len(data.shape) == 2:
            self.__init_from_npy2d(data, missing)
        else:
            # Last resort: let scipy try to build a CSR matrix. Only the
            # conversion is guarded, so errors from the native call below
            # are not hidden behind the generic message.
            try:
                csr = scp.csr_matrix(data)
            except Exception:
                raise Exception("can not intialize DMatrix from"+str(type(data)))
            self.__init_from_csr(csr)
        # `is not None` rather than `!= None`: comparing a numpy array with
        # `!=` is element-wise and its truth value is ambiguous.
        if label is not None:
            self.set_label(label)
        if weight is not None:
            self.set_weight(weight)

    def __init_from_csr(self, csr):
        """convert data from csr matrix"""
        if len(csr.indices) != len(csr.data):
            raise ValueError('csr.indices and csr.data must have the same length')
        self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSR(
            (ctypes.c_ulong * len(csr.indptr))(*csr.indptr),
            (ctypes.c_uint * len(csr.indices))(*csr.indices),
            (ctypes.c_float * len(csr.data))(*csr.data),
            len(csr.indptr), len(csr.data)))

    def __init_from_csc(self, csc):
        """convert data from csc matrix"""
        if len(csc.indices) != len(csc.data):
            raise ValueError('csc.indices and csc.data must have the same length')
        self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSC(
            (ctypes.c_ulong * len(csc.indptr))(*csc.indptr),
            (ctypes.c_uint * len(csc.indices))(*csc.indices),
            (ctypes.c_float * len(csc.data))(*csc.data),
            len(csc.indptr), len(csc.data)))

    def __init_from_npy2d(self, mat, missing):
        """convert data from numpy matrix"""
        # Flattened float32 copy; the native call receives a pointer to it
        # together with the original row and column counts.
        data = numpy.array(mat.reshape(mat.size), dtype='float32')
        self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromMat(
            data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            mat.shape[0], mat.shape[1], ctypes.c_float(missing)))

    def __del__(self):
        """destructor: release the native matrix, if one was created"""
        handle = getattr(self, 'handle', None)
        if handle is not None:
            # Clear first so a repeated call cannot free the handle twice.
            self.handle = None
            xglib.XGDMatrixFree(handle)

    def get_float_info(self, field):
        """get a float field of the matrix as a float32 numpy array

        Args:
            field: string
                name of the field to read
        Returns:
            numpy 1d array, dtype float32
        """
        length = ctypes.c_ulong()
        ret = xglib.XGDMatrixGetFloatInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
                                          ctypes.byref(length))
        return ctypes2numpy(ret, length.value, 'float32')

    def get_uint_info(self, field):
        """get an unsigned int field of the matrix as a uint32 numpy array

        Args:
            field: string
                name of the field to read
        Returns:
            numpy 1d array, dtype uint32
        """
        length = ctypes.c_ulong()
        ret = xglib.XGDMatrixGetUIntInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
                                         ctypes.byref(length))
        return ctypes2numpy(ret, length.value, 'uint32')

    def set_float_info(self, field, data):
        """set a float field of the matrix

        Args:
            field: string
                name of the field to write
            data: list or numpy 1d array
                values, converted to a C float array
        Returns:
            None
        """
        xglib.XGDMatrixSetFloatInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
                                    (ctypes.c_float*len(data))(*data), len(data))

    def set_uint_info(self, field, data):
        """set an unsigned int field of the matrix

        Args:
            field: string
                name of the field to write
            data: list or numpy 1d array
                values, converted to a C unsigned int array
        Returns:
            None
        """
        xglib.XGDMatrixSetUIntInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
                                   (ctypes.c_uint*len(data))(*data), len(data))

    def save_binary(self, fname, silent=True):
        """save DMatrix to XGBoost buffer

        Args:
            fname: string
                name of buffer file
            silent: bool, optional
                whether print info
        Returns:
            None
        """
        xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent))

    def set_label(self, label):
        """set label of dmatrix

        Args:
            label: list or numpy 1d array
                label for DMatrix
        Returns:
            None
        """
        self.set_float_info('label', label)

    def set_weight(self, weight):
        """set weight of each instances

        Args:
            weight: list or numpy 1d array
                weight for each instance
        Returns:
            None
        """
        self.set_float_info('weight', weight)

    def set_base_margin(self, margin):
        """
        set base margin of booster to start from
        this can be used to specify a prediction value of
        existing model to be base_margin
        However, remember margin is needed, instead of transformed prediction
        e.g. for logistic regression: need to put in value before logistic transformation
        see also example/demo.py
        """
        self.set_float_info('base_margin', margin)

    def set_group(self, group):
        """set group size of dmatrix, used for rank

        Args:
            group: list or numpy 1d array
                group sizes, converted to a C unsigned int array
        Returns:
            None
        """
        xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group))

    def get_label(self):
        """get label from dmatrix

        Args:
            None
        Returns:
            numpy 1d array (float32), label of data
        """
        return self.get_float_info('label')

    def get_weight(self):
        """get weight from dmatrix

        Args:
            None
        Returns:
            numpy 1d array (float32), weight
        """
        return self.get_float_info('weight')

    def get_base_margin(self):
        """get base_margin from dmatrix

        Args:
            None
        Returns:
            numpy 1d array (float32), base margin
        """
        return self.get_float_info('base_margin')

    def num_row(self):
        """get number of rows

        Args:
            None
        Returns:
            int, num rows
        """
        return xglib.XGDMatrixNumRow(self.handle)

    def slice(self, rindex):
        """slice the DMatrix to return a new DMatrix that only contains rindex

        Args:
            rindex: list
                list of index to be chosen
        Returns:
            res: DMatrix
                new DMatrix with chosen index
        """
        # Start from an empty wrapper and attach the sliced native handle.
        res = DMatrix(None)
        res.handle = ctypes.c_void_p(xglib.XGDMatrixSliceDMatrix(
            self.handle, (ctypes.c_int*len(rindex))(*rindex), len(rindex)))
        return res
|
|
|
|
class Booster:
    """learner class

    Wraps a handle (ctypes.c_void_p) to a booster created by the native
    wrapper library; the handle is released in __del__.
    """

    def __init__(self, params=None, cache=None, model_file=None):
        """ constructor

        Args:
            params: dict or list of (name, value) pairs, optional
                params for boosters
            cache: list of DMatrix, optional
                list of cache item
            model_file: string, optional
                path of model file
        Returns:
            None
        Raises:
            TypeError: if an item of cache is not a DMatrix
        """
        # Assigned first so that __del__ stays safe if construction fails.
        self.handle = None
        params = {} if params is None else params
        cache = [] if cache is None else cache
        for d in cache:
            if not isinstance(d, DMatrix):
                raise TypeError('cache items must be DMatrix, got ' + str(type(d)))
        dmats = (ctypes.c_void_p * len(cache))(*[d.handle for d in cache])
        self.handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, len(cache)))
        # Default seed first, so a 'seed' entry in params overrides it.
        self.set_param({'seed': 0})
        self.set_param(params)
        if model_file is not None:
            self.load_model(model_file)

    def __del__(self):
        """destructor: release the native booster, if one was created"""
        handle = getattr(self, 'handle', None)
        if handle is not None:
            # Clear first so a repeated call cannot free the handle twice.
            self.handle = None
            xglib.XGBoosterFree(handle)

    def set_param(self, params, pv=None):
        """set parameters of the booster

        Args:
            params: dict, string, or iterable of (name, value) pairs
                a dict sets every entry; a string is a single parameter
                name whose value is pv; anything else is iterated as pairs
            pv: optional
                value used when params is a single name
        Returns:
            None
        """
        if isinstance(params, dict):
            pairs = params.items()
        elif isinstance(params, str) and pv is not None:
            pairs = [(params, pv)]
        else:
            pairs = params
        for k, v in pairs:
            # Values are always passed to the library as strings.
            xglib.XGBoosterSetParam(
                self.handle, ctypes.c_char_p(k.encode('utf-8')),
                ctypes.c_char_p(str(v).encode('utf-8')))

    def update(self, dtrain, it, fobj=None):
        """
        update
        Args:
            dtrain: DMatrix
                the training DMatrix
            it: int
                current iteration number
            fobj: function
                customized objective function, called as fobj(pred, dtrain)
                and expected to return (grad, hess)
        Returns:
            None
        Raises:
            TypeError: if dtrain is not a DMatrix
        """
        if not isinstance(dtrain, DMatrix):
            raise TypeError('dtrain must be a DMatrix, got ' + str(type(dtrain)))
        if fobj is None:
            xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle)
        else:
            pred = self.predict(dtrain)
            grad, hess = fobj(pred, dtrain)
            self.boost(dtrain, grad, hess)

    def boost(self, dtrain, grad, hess):
        """ update
        Args:
            dtrain: DMatrix
                the training DMatrix
            grad: list
                the first order of gradient
            hess: list
                the second order of gradient
        Raises:
            ValueError: if grad and hess differ in length
            TypeError: if dtrain is not a DMatrix
        """
        if len(grad) != len(hess):
            raise ValueError('grad and hess must have the same length')
        if not isinstance(dtrain, DMatrix):
            raise TypeError('dtrain must be a DMatrix, got ' + str(type(dtrain)))
        xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle,
                                    (ctypes.c_float*len(grad))(*grad),
                                    (ctypes.c_float*len(hess))(*hess),
                                    len(grad))

    def eval_set(self, evals, it=0, feval=None):
        """evaluates by metric
        Args:
            evals: list of tuple (DMatrix, string)
                lists of items to be evaluated
            it: int
                current iteration
            feval: function
                custom evaluation function, called as feval(pred, dmatrix)
                and expected to return (name, value)
        Returns:
            evals result: the value returned by the library call when feval
            is None (a C string, see the restype of XGBoosterEvalOneIter),
            otherwise a str built here
        Raises:
            TypeError: if feval is None and an item of evals is not a
                (DMatrix, str) pair
        """
        if feval is None:
            for d in evals:
                if not isinstance(d[0], DMatrix):
                    raise TypeError('evals items must start with a DMatrix')
                if not isinstance(d[1], str):
                    raise TypeError('evals names must be str')
            dmats = (ctypes.c_void_p * len(evals))(*[d[0].handle for d in evals])
            evnames = (ctypes.c_char_p * len(evals))(
                *[ctypes.c_char_p(d[1].encode('utf-8')) for d in evals])
            return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals))
        else:
            res = '[%d]' % it
            for dm, evname in evals:
                name, val = feval(self.predict(dm), dm)
                res += '\t%s-%s:%f' % (evname, name, val)
            return res

    def eval(self, mat, name='eval', it=0):
        """evaluate a single DMatrix under the given name, see eval_set"""
        return self.eval_set([(mat, name)], it)

    def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False):
        """
        predict with data
        Args:
            data: DMatrix
                the dmatrix storing the input
            output_margin: bool
                whether output raw margin value that is untransformed
            ntree_limit: int
                limit number of trees in prediction, default to 0, 0 means using all the trees
            pred_leaf: bool
                when this option is on, the output will be a matrix of (nsample, ntrees)
                with each record indicate the predicted leaf index of each sample in each tree
                Note that the leaf index of tree is unique per tree, so you may find leaf 1 in both tree 1 and tree 0
        Returns:
            numpy array of prediction
        """
        # Bit 0 requests the raw margin, bit 1 requests leaf indices.
        option_mask = 0
        if output_margin:
            option_mask |= 1
        if pred_leaf:
            option_mask |= 2
        length = ctypes.c_ulong()
        preds = xglib.XGBoosterPredict(self.handle, data.handle,
                                       option_mask, ntree_limit, ctypes.byref(length))
        preds = ctypes2numpy(preds, length.value, 'float32')
        if pred_leaf:
            preds = preds.astype('int32')
        nrow = data.num_row()
        # More than one value per row: reshape to (nrow, values per row).
        # Floor division keeps the column count an int; `/` yields a float
        # on Python 3, which reshape rejects.
        if nrow != 0 and preds.size != nrow and preds.size % nrow == 0:
            preds = preds.reshape(nrow, preds.size // nrow)
        return preds

    def save_model(self, fname):
        """ save model to file
        Args:
            fname: string
                file name of saving model
        Returns:
            None
        """
        xglib.XGBoosterSaveModel(self.handle, ctypes.c_char_p(fname.encode('utf-8')))

    def load_model(self, fname):
        """load model from file
        Args:
            fname: string
                file name of saving model
        Returns:
            None
        """
        xglib.XGBoosterLoadModel(self.handle, ctypes.c_char_p(fname.encode('utf-8')))

    def dump_model(self, fo, fmap='', with_stats=False):
        """dump model into text file
        Args:
            fo: string or file-like object
                file name to be dumped, or an object with a write method
            fmap: string, optional
                file name of feature map names
            with_stats: bool, optional
                whether output statistics of the split
        Returns:
            None
        """
        if isinstance(fo, str):
            fo = open(fo, 'w')
            need_close = True
        else:
            need_close = False
        try:
            ret = self.get_dump(fmap, with_stats)
            for i, tree in enumerate(ret):
                fo.write('booster[%d]:\n' % i)
                fo.write(tree)
        finally:
            # Only close what was opened here, and do so even on error.
            if need_close:
                fo.close()

    def get_dump(self, fmap='', with_stats=False):
        """get dump of model as list of strings """
        length = ctypes.c_ulong()
        sarr = xglib.XGBoosterDumpModel(self.handle,
                                        ctypes.c_char_p(fmap.encode('utf-8')),
                                        int(with_stats), ctypes.byref(length))
        res = []
        for i in range(length.value):
            text = sarr[i]
            # On Python 3 a c_char_p element is bytes; str() on it would
            # produce "b'...'", so decode instead.
            if not isinstance(text, str):
                text = text.decode()
            res.append(text)
        return res

    def get_fscore(self, fmap=''):
        """ get feature importance of each feature

        Counts, for every feature, how many bracketed split conditions in
        the text dump name it.

        Args:
            fmap: string, optional
                file name of feature map names
        Returns:
            dict mapping feature name to the number of splits using it
        """
        trees = self.get_dump(fmap)
        fscore = {}
        for tree in trees:
            for l in tree.split('\n'):
                arr = l.split('[')
                # Lines without '[' carry no split condition.
                if len(arr) == 1:
                    continue
                # The feature id is the text between '[' and '<'.
                fid = arr[1].split(']')[0]
                fid = fid.split('<')[0]
                fscore[fid] = fscore.get(fid, 0) + 1
        return fscore
|
|
|
|
def train(params, dtrain, num_boost_round=10, evals=[], obj=None, feval=None):
    """ train a booster with given paramaters
    Args:
        params: dict
            params of booster
        dtrain: DMatrix
            data to be trained
        num_boost_round: int
            num of round to be boosted
        evals: list of pairs (DMatrix, string)
            list of items to be evaluated during training, this allows user to watch performance on validation set
        obj: function
            customized objective function
        feval: function
            customized evaluation function
    Returns: Booster model trained
    """
    # The training matrix and every watched matrix are handed to the booster
    # as its cache.
    cached = [dtrain] + [pair[0] for pair in evals]
    bst = Booster(params, cached)
    for rnd in range(num_boost_round):
        bst.update(dtrain, rnd, obj)
        if len(evals) == 0:
            continue
        report = bst.eval_set(evals, rnd, feval)
        # eval_set may hand back a non-str value; decode it before writing.
        if not isinstance(report, str):
            report = report.decode()
        sys.stderr.write(report+'\n')
    return bst
|
|
|
|
class CVPack:
    """One cross validation fold: its train/test matrices and its booster."""

    def __init__(self, dtrain, dtest, param):
        """create the fold

        Args:
            dtrain: DMatrix
                training part of the fold
            dtest: DMatrix
                held-out part of the fold
            param: dict or list of (name, value) pairs
                params passed to the Booster of this fold
        """
        self.dtrain = dtrain
        self.dtest = dtest
        # Both parts are evaluated each round, under these fixed names.
        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
        self.bst = Booster(param, [dtrain, dtest])

    def update(self, r, fobj):
        """run boosting round r on the training part, with optional objective fobj"""
        self.bst.update(self.dtrain, r, fobj)

    def eval(self, r, feval):
        """evaluate round r on the train and test parts, with optional metric feval"""
        return self.bst.eval_set(self.watchlist, r, feval)
|
|
|
|
def mknfold(dall, nfold, param, seed, evals=[], fpreproc=None):
    """
    mk nfold list of cvpack from randidx

    Args:
        dall: DMatrix
            the full data set to be split
        nfold: int
            number of folds
        param: dict
            params of booster
        seed: int
            seed passed to numpy.random.seed before shuffling the rows
        evals: list of strings
            each one is appended to the params as an 'eval_metric' entry
        fpreproc: function, optional
            called as fpreproc(dtrain, dtest, param.copy()) and expected to
            return the (dtrain, dtest, param) to use for the fold
    Returns:
        list of CVPack, one per fold
    """
    np.random.seed(seed)
    randidx = np.random.permutation(dall.num_row())
    # Floor division: the fold size is used as a slice bound and must be an
    # int (`/` yields a float on Python 3). Rows beyond nfold * kstep are
    # not assigned to any fold.
    kstep = len(randidx) // nfold
    idset = [randidx[(i*kstep): min(len(randidx), (i+1)*kstep)] for i in range(nfold)]
    ret = []
    for k in range(nfold):
        # Fold k is the test part; all other folds together are the train part.
        dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i]))
        dtest = dall.slice(idset[k])
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
        else:
            tparam = param
        plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals]
        ret.append(CVPack(dtrain, dtest, plst))
    return ret
|
|
|
|
def aggcv(rlist, show_stdv=True):
    """
    aggregate cross validation results

    Each item of rlist is one fold's evaluation line: an iteration tag
    followed by whitespace-separated 'name:value' entries. Values with the
    same name are averaged across the folds.

    Args:
        rlist: list of strings (str or bytes)
            evaluation lines of all folds for the same iteration
        show_stdv: bool
            whether to append '+<standard deviation>' after each mean
    Returns:
        str, the iteration tag followed by one tab-separated
        'cv-<name>:<mean>' entry per metric, sorted by name
    Raises:
        ValueError: if rlist is empty or the lines carry different
            iteration tags
    """
    # Decode once up front so the result is a str whatever the input type,
    # including when the lines contain no metrics at all.
    lines = [ln if isinstance(ln, str) else ln.decode() for ln in rlist]
    if not lines:
        raise ValueError('aggcv needs at least one evaluation line')
    tag = lines[0].split()[0]
    cvmap = {}
    for line in lines:
        arr = line.split()
        if arr[0] != tag:
            raise ValueError('evaluation lines come from different iterations: '
                             + tag + ' vs ' + arr[0])
        for it in arr[1:]:
            k, v = it.split(':')
            cvmap.setdefault(k, []).append(float(v))
    parts = [tag]
    for k in sorted(cvmap):
        v = np.array(cvmap[k])
        if show_stdv:
            parts.append('\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v)))
        else:
            parts.append('\tcv-%s:%f' % (k, np.mean(v)))
    return ''.join(parts)
|
|
|
|
def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=[],
       obj=None, feval=None, fpreproc=None, show_stdv=True, seed=0):
    """ cross validation with given paramaters
    Args:
        params: dict
            params of booster
        dtrain: DMatrix
            data to be trained
        num_boost_round: int
            num of round to be boosted
        nfold: int
            number of folds to do cv
        metrics: list of strings
            evaluation metrics to be watched in cv
        obj: function
            custom objective function
        feval: function
            custom evaluation function
        fpreproc: function
            preprocessing function that takes dtrain, dtest,
            param and return transformed version of dtrain, dtest, param
        show_stdv: bool
            whether display standard deviation
        seed: int
            seed used to generate the folds, this is passed to numpy.random.seed

    Returns: list(string) of evaluation history
    """
    folds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
    history = []
    for rnd in range(num_boost_round):
        # Advance every fold by one round before evaluating any of them.
        for fold in folds:
            fold.update(rnd, obj)
        summary = aggcv([fold.eval(rnd, feval) for fold in folds], show_stdv)
        sys.stderr.write(summary+'\n')
        history.append(summary)
    return history
|
|
|