# coding: utf-8
# pylint: disable=too-many-locals, too-many-arguments, invalid-name
# pylint: disable=too-many-branches
"""Training Library containing training routines."""
from __future__ import absolute_import

import sys
import re

import numpy as np
from .core import Booster, STRING_TYPES


def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
          maximize=False, early_stopping_rounds=None, evals_result=None,
          verbose_eval=True, learning_rates=None, xgb_model=None):
    # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
"""Train a booster with given parameters.
|
|
|
|
Parameters
|
|
----------
|
|
params : dict
|
|
Booster params.
|
|
dtrain : DMatrix
|
|
Data to be trained.
|
|
num_boost_round: int
|
|
Number of boosting iterations.
|
|
watchlist (evals): list of pairs (DMatrix, string)
|
|
List of items to be evaluated during training, this allows user to watch
|
|
performance on the validation set.
|
|
obj : function
|
|
Customized objective function.
|
|
feval : function
|
|
Customized evaluation function.
|
|
maximize : bool
|
|
Whether to maximize feval.
|
|
early_stopping_rounds: int
|
|
Activates early stopping. Validation error needs to decrease at least
|
|
every <early_stopping_rounds> round(s) to continue training.
|
|
Requires at least one item in evals.
|
|
If there's more than one, will use the last.
|
|
Returns the model from the last iteration (not the best one).
|
|
If early stopping occurs, the model will have three additional fields:
|
|
bst.best_score, bst.best_iteration and bst.best_ntree_limit.
|
|
(Use bst.best_ntree_limit to get the correct value if num_parallel_tree
|
|
and/or num_class appears in the parameters)
|
|
evals_result: dict
|
|
This dictionary stores the evaluation results of all the items in watchlist.
|
|
Example: with a watchlist containing [(dtest,'eval'), (dtrain,'train')] and
|
|
and a paramater containing ('eval_metric', 'logloss')
|
|
Returns: {'train': {'logloss': ['0.48253', '0.35953']},
|
|
'eval': {'logloss': ['0.480385', '0.357756']}}
|
|
verbose_eval : bool or int
|
|
Requires at least one item in evals.
|
|
If `verbose_eval` is True then the evaluation metric on the validation set is
|
|
printed at each boosting stage.
|
|
If `verbose_eval` is an integer then the evaluation metric on the validation set
|
|
is printed at every given `verbose_eval` boosting stage. The last boosting stage
|
|
/ the boosting stage found by using `early_stopping_rounds` is also printed.
|
|
Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
|
|
is printed every 4 boosting stages, instead of every boosting stage.
|
|
learning_rates: list or function
|
|
List of learning rate for each boosting round
|
|
or a customized function that calculates eta in terms of
|
|
current number of round and the total number of boosting round (e.g. yields
|
|
learning rate decay)
|
|
- list l: eta = l[boosting round]
|
|
- function f: eta = f(boosting round, num_boost_round)
|
|
xgb_model : file name of stored xgb model or 'Booster' instance
|
|
Xgb model to be loaded before training (allows training continuation).
|
|
|
|
Returns
|
|
-------
|
|
booster : a trained booster model
|
|
"""
    evals = list(evals)
    if isinstance(params, dict) \
            and 'eval_metric' in params \
            and isinstance(params['eval_metric'], list):
        # a list-valued 'eval_metric' cannot live in a dict (duplicate key),
        # so expand it into repeated ('eval_metric', name) tuples
        params = dict((k, v) for k, v in params.items())
        eval_metrics = params['eval_metric']
        params.pop("eval_metric", None)
        params = list(params.items())
        for eval_metric in eval_metrics:
            params += [('eval_metric', eval_metric)]

    bst = Booster(params, [dtrain] + [d[0] for d in evals])
    nboost = 0
    num_parallel_tree = 1

    if isinstance(verbose_eval, bool):
        verbose_eval_every_line = False
    else:
        if isinstance(verbose_eval, int):
            verbose_eval_every_line = verbose_eval
            verbose_eval = True if verbose_eval_every_line > 0 else False

    if xgb_model is not None:
        # continue training from a previously saved model
        if not isinstance(xgb_model, STRING_TYPES):
            xgb_model = xgb_model.save_raw()
        bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model)
        nboost = len(bst.get_dump())
    else:
        bst = Booster(params, [dtrain] + [d[0] for d in evals])

    # nboost counts boosting rounds, not individual trees
    _params = dict(params) if isinstance(params, list) else params
    if 'num_parallel_tree' in _params:
        num_parallel_tree = _params['num_parallel_tree']
        nboost //= num_parallel_tree
    if 'num_class' in _params:
        nboost //= _params['num_class']

    if evals_result is not None:
        if not isinstance(evals_result, dict):
            raise TypeError('evals_result has to be a dictionary')
        else:
            evals_name = [d[1] for d in evals]
            evals_result.clear()
            evals_result.update(dict([(key, {}) for key in evals_name]))

    if not early_stopping_rounds:
        for i in range(nboost, nboost + num_boost_round):
            bst.update(dtrain, i, obj)
            nboost += 1
            if len(evals) != 0:
                bst_eval_set = bst.eval_set(evals, i, feval)
                if isinstance(bst_eval_set, STRING_TYPES):
                    msg = bst_eval_set
                else:
                    msg = bst_eval_set.decode()

                if verbose_eval:
                    if verbose_eval_every_line:
                        if i % verbose_eval_every_line == 0 or i == num_boost_round - 1:
                            sys.stderr.write(msg + '\n')
                    else:
                        sys.stderr.write(msg + '\n')

                if evals_result is not None:
                    # parse "name:value" pairs out of the evaluation message
                    res = re.findall("([0-9a-zA-Z@]+[-]*):-?([0-9.]+).", msg)
                    for key in evals_name:
                        evals_idx = evals_name.index(key)
                        res_per_eval = len(res) // len(evals_name)
                        for r in range(res_per_eval):
                            res_item = res[(evals_idx * res_per_eval) + r]
                            res_key = res_item[0]
                            res_val = res_item[1]
                            if res_key in evals_result[key]:
                                evals_result[key][res_key].append(res_val)
                            else:
                                evals_result[key][res_key] = [res_val]
        bst.best_iteration = (nboost - 1)
        bst.best_ntree_limit = nboost * num_parallel_tree
        return bst

    else:
        # early stopping
        if len(evals) < 1:
            raise ValueError('For early stopping you need at least one set in evals.')

        if verbose_eval:
            sys.stderr.write("Will train until {} error hasn't decreased in "
                             "{} rounds.\n".format(evals[-1][1], early_stopping_rounds))

        # is params a list of tuples? are we using multiple eval metrics?
        if isinstance(params, list):
            if len(params) != len(dict(params).items()):
                params = dict(params)
                sys.stderr.write("Multiple eval metrics have been passed: "
                                 "'{0}' will be used for early stopping.\n\n".format(
                                     params['eval_metric']))
            else:
                params = dict(params)

        # either minimize loss or maximize AUC/MAP/NDCG
        maximize_score = False
        if 'eval_metric' in params:
            maximize_metrics = ('auc', 'map', 'ndcg')
            if any(params['eval_metric'].startswith(x) for x in maximize_metrics):
                maximize_score = True
        if feval is not None:
            maximize_score = maximize

        if maximize_score:
            best_score = 0.0
        else:
            best_score = float('inf')

        best_msg = ''
        best_score_i = (nboost - 1)

        if isinstance(learning_rates, list) and len(learning_rates) != num_boost_round:
            raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.")

        for i in range(nboost, nboost + num_boost_round):
            if learning_rates is not None:
                if isinstance(learning_rates, list):
                    bst.set_param({'eta': learning_rates[i]})
                else:
                    bst.set_param({'eta': learning_rates(i, num_boost_round)})
            bst.update(dtrain, i, obj)
            nboost += 1
            bst_eval_set = bst.eval_set(evals, i, feval)

            if isinstance(bst_eval_set, STRING_TYPES):
                msg = bst_eval_set
            else:
                msg = bst_eval_set.decode()

            if verbose_eval:
                if verbose_eval_every_line:
                    if i % verbose_eval_every_line == 0 or i == num_boost_round - 1:
                        sys.stderr.write(msg + '\n')
                else:
                    sys.stderr.write(msg + '\n')

            if evals_result is not None:
                # parse "name:value" pairs out of the evaluation message
                res = re.findall("([0-9a-zA-Z@]+[-]*):-?([0-9.]+).", msg)
                for key in evals_name:
                    evals_idx = evals_name.index(key)
                    res_per_eval = len(res) // len(evals_name)
                    for r in range(res_per_eval):
                        res_item = res[(evals_idx * res_per_eval) + r]
                        res_key = res_item[0]
                        res_val = res_item[1]
                        if res_key in evals_result[key]:
                            evals_result[key][res_key].append(res_val)
                        else:
                            evals_result[key][res_key] = [res_val]

            # the last metric in the message (i.e. the last eval set) drives early stopping
            score = float(msg.rsplit(':', 1)[1])
            if (maximize_score and score > best_score) or \
                    (not maximize_score and score < best_score):
                best_score = score
                best_score_i = (nboost - 1)
                best_msg = msg
            elif i - best_score_i >= early_stopping_rounds:
                if verbose_eval:
                    sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg))
                bst.best_score = best_score
                bst.best_iteration = best_score_i
                break
        bst.best_score = best_score
        bst.best_iteration = best_score_i
        bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree
        return bst
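

# Illustrative usage of ``train`` (a minimal sketch; the file names, parameter
# values and the decay schedule below are hypothetical, not part of this module):
#
#     import xgboost as xgb
#     dtrain = xgb.DMatrix('train.libsvm')
#     dvalid = xgb.DMatrix('valid.libsvm')
#     history = {}
#     bst = xgb.train({'objective': 'binary:logistic', 'eval_metric': 'logloss'},
#                     dtrain, num_boost_round=50,
#                     evals=[(dtrain, 'train'), (dvalid, 'valid')],
#                     evals_result=history,
#                     early_stopping_rounds=5,
#                     learning_rates=lambda i, n: 0.3 * (0.99 ** i))
#     # history == {'train': {'logloss': [...]}, 'valid': {'logloss': [...]}}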


class CVPack(object):
    """Auxiliary datastruct to hold one fold of CV."""

    def __init__(self, dtrain, dtest, param):
        """Initialize the CVPack."""
        self.dtrain = dtrain
        self.dtest = dtest
        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
        self.bst = Booster(param, [dtrain, dtest])

    def update(self, iteration, fobj):
        """Update the booster for one iteration."""
        self.bst.update(self.dtrain, iteration, fobj)

    def eval(self, iteration, feval):
        """Evaluate the CVPack for one iteration."""
        return self.bst.eval_set(self.watchlist, iteration, feval)


def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None):
    """
    Make an n-fold list of CVPack from random indices.
    """
    evals = list(evals)
    np.random.seed(seed)
    randidx = np.random.permutation(dall.num_row())
    # integer division so the fold boundaries are valid slice indices
    kstep = len(randidx) // nfold
    idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
    ret = []
    for k in range(nfold):
        dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i]))
        dtest = dall.slice(idset[k])
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
        else:
            tparam = param
        plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals]
        ret.append(CVPack(dtrain, dtest, plst))
    return ret


def aggcv(rlist, show_stdv=True, show_progress=None, as_pandas=True, trial=0):
    # pylint: disable=invalid-name
    """
    Aggregate cross-validation results.

    If show_progress is true, progress is displayed on every call. If
    show_progress is an integer, progress will only be displayed every
    `show_progress` trees, tracked via trial.
    """
    cvmap = {}
    idx = rlist[0].split()[0]
    for line in rlist:
        arr = line.split()
        assert idx == arr[0]
        for it in arr[1:]:
            if not isinstance(it, STRING_TYPES):
                it = it.decode()
            k, v = it.split(':')
            if k not in cvmap:
                cvmap[k] = []
            cvmap[k].append(float(v))

    msg = idx

    if show_stdv:
        fmt = '\tcv-{0}:{1}+{2}'
    else:
        fmt = '\tcv-{0}:{1}'

    index = []
    results = []
    for k, v in sorted(cvmap.items(), key=lambda x: x[0]):
        v = np.array(v)
        if not isinstance(msg, STRING_TYPES):
            msg = msg.decode()
        mean, std = np.mean(v), np.std(v)
        msg += fmt.format(k, mean, std)

        index.extend([k + '-mean', k + '-std'])
        results.extend([mean, std])

    if as_pandas:
        try:
            import pandas as pd
            results = pd.Series(results, index=index)
        except ImportError:
            if show_progress is None:
                show_progress = True
    else:
        # if show_progress is the default (None), the result will be an
        # np.ndarray, which can't hold column names, so show progress
        if show_progress is None:
            show_progress = True

    if (isinstance(show_progress, int) and show_progress > 0 and trial % show_progress == 0) or \
            (isinstance(show_progress, bool) and show_progress):
        sys.stderr.write(msg + '\n')
        sys.stderr.flush()

    return results
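

# Example of what ``aggcv`` consumes (illustrative values only; one string per
# fold, as produced by CVPack.eval):
#
#     rlist = ['[0]\ttrain-error:0.04\ttest-error:0.11',
#              '[0]\ttrain-error:0.05\ttest-error:0.09']
#     aggcv(rlist, show_stdv=True, as_pandas=False)
#     # -> [mean, std] pairs ordered by sorted metric name:
#     #    [0.10, 0.01, 0.045, 0.005]  # test-error mean/std, train-error mean/std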


def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
       obj=None, feval=None, maximize=False, early_stopping_rounds=None,
       fpreproc=None, as_pandas=True, show_progress=None, show_stdv=True, seed=0):
    # pylint: disable = invalid-name
    """Cross-validation with given parameters.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : DMatrix
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    nfold : int
        Number of folds in CV.
    metrics : list of strings
        Evaluation metrics to be watched in CV.
    obj : function
        Custom objective function.
    feval : function
        Custom evaluation function.
    maximize : bool
        Whether to maximize feval.
    early_stopping_rounds: int
        Activates early stopping. CV error needs to decrease at least
        every <early_stopping_rounds> round(s) to continue.
        Last entry in the evaluation history is the one from the best iteration.
    fpreproc : function
        Preprocessing function that takes (dtrain, dtest, param) and returns
        transformed versions of those.
    as_pandas : bool, default True
        Return pd.DataFrame when pandas is installed.
        If False or pandas is not installed, return np.ndarray.
    show_progress : bool, int, or None, default None
        Whether to display the progress. If None, progress will be displayed
        when np.ndarray is returned. If True, progress will be displayed at
        every boosting stage. If an integer is given, progress will be displayed
        at every given `show_progress` boosting stage.
    show_stdv : bool, default True
        Whether to display the standard deviation in progress.
        Results are not affected, and always contain std.
    seed : int
        Seed used to generate the folds (passed to numpy.random.seed).

    Returns
    -------
    evaluation history : list(string)
    """
    if early_stopping_rounds is not None:
        if len(metrics) > 1:
            raise ValueError('Check your params. '
                             'Early stopping works with single eval metric only.')

        sys.stderr.write("Will train until cv error hasn't decreased in "
                         "{} rounds.\n".format(early_stopping_rounds))

        maximize_score = False
        if len(metrics) == 1:
            maximize_metrics = ('auc', 'map', 'ndcg')
            if any(metrics[0].startswith(x) for x in maximize_metrics):
                maximize_score = True
        if feval is not None:
            maximize_score = maximize

        if maximize_score:
            best_score = 0.0
        else:
            best_score = float('inf')

        best_score_i = 0

    results = []
    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
    for i in range(num_boost_round):
        for fold in cvfolds:
            fold.update(i, obj)
        res = aggcv([f.eval(i, feval) for f in cvfolds],
                    show_stdv=show_stdv, show_progress=show_progress,
                    as_pandas=as_pandas, trial=i)
        results.append(res)

        if early_stopping_rounds is not None:
            score = res[0]
            if (maximize_score and score > best_score) or \
                    (not maximize_score and score < best_score):
                best_score = score
                best_score_i = i
            elif i - best_score_i >= early_stopping_rounds:
                results = results[:best_score_i + 1]
                sys.stderr.write("Stopping. Best iteration: {} (mean: {}, std: {})\n".format(
                    best_score_i, results[-1][0], results[-1][1]))
                break

    if as_pandas:
        try:
            import pandas as pd
            results = pd.DataFrame(results)
        except ImportError:
            results = np.array(results)
    else:
        results = np.array(results)

    return results
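

# Illustrative usage of ``cv`` (a minimal sketch; the file name and parameter
# values are hypothetical):
#
#     import xgboost as xgb
#     dtrain = xgb.DMatrix('train.libsvm')
#     history = xgb.cv({'max_depth': 3, 'eta': 0.1, 'objective': 'binary:logistic'},
#                      dtrain, num_boost_round=30, nfold=5,
#                      metrics=['error'], early_stopping_rounds=5, seed=42)
#     # with pandas installed, ``history`` is a DataFrame with columns
#     # 'test-error-mean', 'test-error-std', 'train-error-mean', 'train-error-std'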