move custom obj build in into booster

This commit is contained in:
tqchen 2014-09-02 23:07:50 -07:00
parent 10648a1ca7
commit ac8958b284

View File

@ -213,77 +213,6 @@ class DMatrix:
self.handle, (ctypes.c_int*len(rindex))(*rindex), len(rindex))) self.handle, (ctypes.c_int*len(rindex))(*rindex), len(rindex)))
return res return res
class CVPack:
def __init__(self, dtrain, dtest, param):
self.dtrain = dtrain
self.dtest = dtest
self.watchlist = watchlist = [ (dtrain,'train'), (dtest, 'test') ]
self.bst = Booster(param, [dtrain,dtest])
def update(self,r):
self.bst.update(self.dtrain, r)
def eval(self,r):
return self.bst.eval_set(self.watchlist, r)
def mknfold(dall, nfold, param, seed, weightscale=None, evals=[], set_pos_weight=None):
"""
mk nfold list of cvpack from randidx
"""
randidx = range(dall.num_row())
random.seed(seed)
random.shuffle(randidx)
idxset = []
kstep = len(randidx) / nfold
for i in range(nfold):
idxset.append(randidx[ (i*kstep) : min(len(randidx),(i+1)*kstep) ])
ret = []
for k in range(nfold):
trainlst = []
for j in range(nfold):
if j == k:
testlst = idxset[j]
else:
trainlst += idxset[j]
dtrain = dall.slice(trainlst)
dtest = dall.slice(testlst)
# rescale weight of dtrain and dtest
if weightscale != None:
dtrain.set_weight( dtrain.get_weight() * weightscale * dall.num_row() / dtrain.num_row() )
dtest.set_weight( dtest.get_weight() * weightscale * dall.num_row() / dtest.num_row() )
if set_pos_weight != None:
label = dtrain.get_label()
weight = dtrain.get_weight()
sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0 )
sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0 )
param['scale_pos_weight'] = sum_wneg/sum_wpos
plst = param.items() + [('eval_metric', itm) for itm in evals]
ret.append(CVPack(dtrain, dtest, plst))
return ret
def aggcv(rlist):
"""
aggregate cross validation results
"""
cvmap = {}
arr = rlist[0].split()
ret = arr[0]
for it in arr[1:]:
k, v = it.split(':')
cvmap[k] = [float(v)]
for line in rlist[1:]:
arr = line.split()
assert ret == arr[0]
for it in arr[1:]:
k, v = it.split(':')
cvmap[k].append(float(v))
for k, v in sorted(cvmap.items(), key = lambda x:x[0]):
v = np.array(v)
ret += '\t%s:%f+%f' % (k, np.mean(v), np.std(v))
return ret
class Booster: class Booster:
"""learner class """ """learner class """
def __init__(self, params={}, cache=[], model_file = None): def __init__(self, params={}, cache=[], model_file = None):
@ -324,7 +253,7 @@ class Booster:
self.handle, ctypes.c_char_p(k.encode('utf-8')), self.handle, ctypes.c_char_p(k.encode('utf-8')),
ctypes.c_char_p(str(v).encode('utf-8'))) ctypes.c_char_p(str(v).encode('utf-8')))
def update(self, dtrain, it): def update(self, dtrain, it, fobj=None):
""" """
update update
Args: Args:
@ -332,11 +261,19 @@ class Booster:
the training DMatrix the training DMatrix
it: int it: int
current iteration number current iteration number
fobj: function
cutomzied objective function
Returns: Returns:
None None
""" """
assert isinstance(dtrain, DMatrix) assert isinstance(dtrain, DMatrix)
xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle) if fobj is None:
xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle)
else:
pred = self.predict( dtrain )
grad, hess = fobj( pred, dtrain )
self.boost( dtrain, grad, hess )
def boost(self, dtrain, grad, hess): def boost(self, dtrain, grad, hess):
""" update """ update
Args: Args:
@ -353,22 +290,31 @@ class Booster:
(ctypes.c_float*len(grad))(*grad), (ctypes.c_float*len(grad))(*grad),
(ctypes.c_float*len(hess))(*hess), (ctypes.c_float*len(hess))(*hess),
len(grad)) len(grad))
def eval_set(self, evals, it = 0): def eval_set(self, evals, it = 0, feval = None):
"""evaluates by metric """evaluates by metric
Args: Args:
evals: list of tuple (DMatrix, string) evals: list of tuple (DMatrix, string)
lists of items to be evaluated lists of items to be evaluated
it: int it: int
feval: function
custom evaluation function
Returns: Returns:
evals result evals result
""" """
for d in evals: if feval is None:
assert isinstance(d[0], DMatrix) for d in evals:
assert isinstance(d[1], str) assert isinstance(d[0], DMatrix)
dmats = (ctypes.c_void_p * len(evals) )(*[ d[0].handle for d in evals]) assert isinstance(d[1], str)
evnames = (ctypes.c_char_p * len(evals))( dmats = (ctypes.c_void_p * len(evals) )(*[ d[0].handle for d in evals])
* [ctypes.c_char_p(d[1].encode('utf-8')) for d in evals]) evnames = (ctypes.c_char_p * len(evals))(
return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals)) * [ctypes.c_char_p(d[1].encode('utf-8')) for d in evals])
return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals))
else:
res = '[%d]' % it
for dm, evname in evals:
name, val = feval(self.predict(dm), dm)
res += '\t%s-%s:%f' % (evname, name, val)
return res
def eval(self, mat, name = 'eval', it = 0): def eval(self, mat, name = 'eval', it = 0):
return self.eval_set( [(mat,name)], it) return self.eval_set( [(mat,name)], it)
def predict(self, data, output_margin=False, ntree_limit=0): def predict(self, data, output_margin=False, ntree_limit=0):
@ -453,31 +399,7 @@ class Booster:
fmap[fid]+= 1 fmap[fid]+= 1
return fmap return fmap
def evaluate(bst, evals, it, feval = None): def train(params, dtrain, num_boost_round = 10, evals = [], fobj=None, feval=None):
"""evaluation on eval set
Args:
bst: XGBoost object
object of XGBoost model
evals: list of tuple (DMatrix, string)
obj need to be evaluated
it: int
feval: optional
Returns:
eval result
"""
if feval != None:
res = '[%d]' % it
for dm, evname in evals:
name, val = feval(bst.predict(dm), dm)
res += '\t%s-%s:%f' % (evname, name, val)
else:
res = bst.eval_set(evals, it)
return res
def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None):
""" train a booster with given paramaters """ train a booster with given paramaters
Args: Args:
params: dict params: dict
@ -488,27 +410,84 @@ def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None
num of round to be boosted num of round to be boosted
evals: list evals: list
list of items to be evaluated list of items to be evaluated
obj: fobj: function
feval: cutomized objective function
feval: function
cutomized evaluation function
""" """
bst = Booster(params, [dtrain]+[ d[0] for d in evals ] ) bst = Booster(params, [dtrain]+[ d[0] for d in evals ] )
if obj is None: for i in range(num_boost_round):
for i in range(num_boost_round): bst.update( dtrain, i, fobj )
bst.update( dtrain, i ) if len(evals) != 0:
if len(evals) != 0: sys.stderr.write(bst.eval_set(evals, i, feval).decode()+'\n')
sys.stderr.write(evaluate(bst, evals, i, feval).decode()+'\n')
else:
# try customized objective function
for i in range(num_boost_round):
pred = bst.predict( dtrain )
grad, hess = obj( pred, dtrain )
bst.boost( dtrain, grad, hess )
if len(evals) != 0:
sys.stderr.write(evaluate(bst, evals, i, feval)+'\n')
return bst return bst
def cv(params, dtrain, num_boost_round = 10, nfold=3, evals = [], \ class CVPack:
weightscale=None, obj=None, feval=None, set_pos_weight=None): def __init__(self, dtrain, dtest, param):
self.dtrain = dtrain
self.dtest = dtest
self.watchlist = watchlist = [ (dtrain,'train'), (dtest, 'test') ]
self.bst = Booster(param, [dtrain,dtest])
def update(self, r, fobj):
self.bst.update(self.dtrain, r, fobj)
def eval(self, r, fval):
return self.bst.eval_set(self.watchlist, r, feval)
def mknfold(dall, nfold, param, seed, weightscale=None, evals=[]):
"""
mk nfold list of cvpack from randidx
"""
randidx = range(dall.num_row())
random.seed(seed)
random.shuffle(randidx)
idxset = []
kstep = len(randidx) / nfold
for i in range(nfold):
idxset.append(randidx[ (i*kstep) : min(len(randidx),(i+1)*kstep) ])
ret = []
for k in range(nfold):
trainlst = []
for j in range(nfold):
if j == k:
testlst = idxset[j]
else:
trainlst += idxset[j]
dtrain = dall.slice(trainlst)
dtest = dall.slice(testlst)
# rescale weight of dtrain and dtest
if weightscale != None:
dtrain.set_weight( dtrain.get_weight() * weightscale * dall.num_row() / dtrain.num_row() )
dtest.set_weight( dtest.get_weight() * weightscale * dall.num_row() / dtest.num_row() )
plst = param.items() + [('eval_metric', itm) for itm in evals]
ret.append(CVPack(dtrain, dtest, plst))
return ret
def aggcv(rlist):
"""
aggregate cross validation results
"""
cvmap = {}
arr = rlist[0].split()
ret = arr[0]
for it in arr[1:]:
k, v = it.split(':')
cvmap[k] = [float(v)]
for line in rlist[1:]:
arr = line.split()
assert ret == arr[0]
for it in arr[1:]:
k, v = it.split(':')
cvmap[k].append(float(v))
for k, v in sorted(cvmap.items(), key = lambda x:x[0]):
v = np.array(v)
ret += '\t%s:%f+%f' % (k, np.mean(v), np.std(v))
return ret
def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metrics = [], \
weightscale=None, fobj=None, feval=None):
""" cross validation with given paramaters """ cross validation with given paramaters
Args: Args:
params: dict params: dict
@ -521,14 +500,12 @@ def cv(params, dtrain, num_boost_round = 10, nfold=3, evals = [], \
folds to do cv folds to do cv
evals: list evals: list
list of items to be evaluated list of items to be evaluated
obj: fobj:
feval: feval:
set_pos_weight: bool, optional
Adjust pos weight by number
""" """
cvfolds = mknfold(dtrain, nfold, params, 0, weightscale, evals) cvfolds = mknfold(dtrain, nfold, params, 0, weightscale, evals_metrics)
for i in range(num_boost_round): for i in range(num_boost_round):
for f in cvfolds: for f in cvfolds:
f.update(i) f.update(i, fobj)
res = aggcv([f.eval(i) for f in cvfolds]) res = aggcv([f.eval(i, fval) for f in cvfolds])
sys.stderr.write(res+'\n') sys.stderr.write(res+'\n')