diff --git a/python-package/training.py b/python-package/training.py
deleted file mode 100644
index 709b0aa5a..000000000
--- a/python-package/training.py
+++ /dev/null
@@ -1,490 +0,0 @@
-# coding: utf-8
-# pylint: disable=too-many-locals, too-many-arguments, invalid-name
-# pylint: disable=too-many-branches
-"""Training Library containing training routines."""
-from __future__ import absolute_import
-
-import sys
-import re
-import numpy as np
-from .core import Booster, STRING_TYPES, XGBoostError
-from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold, XGBKFold)
-
-def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
-          maximize=False, early_stopping_rounds=None, evals_result=None,
-          verbose_eval=True, learning_rates=None, xgb_model=None):
-    # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
-    """Train a booster with given parameters.
-
-    Parameters
-    ----------
-    params : dict
-        Booster params.
-    dtrain : DMatrix
-        Data to be trained.
-    num_boost_round: int
-        Number of boosting iterations.
-    evals: list of pairs (DMatrix, string)
-        List of items to be evaluated during training; this allows the user to watch
-        performance on the validation set.
-    obj : function
-        Customized objective function.
-    feval : function
-        Customized evaluation function.
-    maximize : bool
-        Whether to maximize feval.
-    early_stopping_rounds: int
-        Activates early stopping. Validation error needs to decrease at least
-        every <early_stopping_rounds> round(s) to continue training.
-        Requires at least one item in evals.
-        If there's more than one, will use the last.
-        Returns the model from the last iteration (not the best one).
-        If early stopping occurs, the model will have three additional fields:
-        bst.best_score, bst.best_iteration and bst.best_ntree_limit.
-        (Use bst.best_ntree_limit to get the correct value if num_parallel_tree
-        and/or num_class appears in the parameters.)
-    evals_result: dict
-        This dictionary stores the evaluation results of all the items in evals.
-        Example: with an evals list containing [(dtest,'eval'), (dtrain,'train')] and
-        a parameter containing ('eval_metric', 'logloss'), this returns
-        {'train': {'logloss': ['0.48253', '0.35953']},
-         'eval': {'logloss': ['0.480385', '0.357756']}}
-    verbose_eval : bool or int
-        Requires at least one item in evals.
-        If `verbose_eval` is True then the evaluation metric on the validation set is
-        printed at each boosting stage.
-        If `verbose_eval` is an integer then the evaluation metric on the validation set
-        is printed at every given `verbose_eval` boosting stage. The last boosting stage
-        / the boosting stage found by using `early_stopping_rounds` is also printed.
-        Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
-        is printed every 4 boosting stages, instead of every boosting stage.
-    learning_rates: list or function
-        List of learning rates for each boosting round,
-        or a customized function that calculates eta in terms of the
-        current round number and the total number of boosting rounds (e.g. to yield
-        learning rate decay)
-        - list l: eta = l[boosting round]
-        - function f: eta = f(boosting round, num_boost_round)
-    xgb_model : file name of stored xgb model or 'Booster' instance
-        Xgb model to be loaded before training (allows training continuation).
-
-    Returns
-    -------
-    booster : a trained booster model
-    """
-    evals = list(evals)
-    if isinstance(params, dict) \
-            and 'eval_metric' in params \
-            and isinstance(params['eval_metric'], list):
-        params = dict((k, v) for k, v in params.items())
-        eval_metrics = params['eval_metric']
-        params.pop("eval_metric", None)
-        params = list(params.items())
-        for eval_metric in eval_metrics:
-            params += [('eval_metric', eval_metric)]
-
-    bst = Booster(params, [dtrain] + [d[0] for d in evals])
-    nboost = 0
-    num_parallel_tree = 1
-
-    if isinstance(verbose_eval, bool):
-        verbose_eval_every_line = False
-    else:
-        if isinstance(verbose_eval, int):
-            verbose_eval_every_line = verbose_eval
-            verbose_eval = True if verbose_eval_every_line > 0 else False
-
-    if xgb_model is not None:
-        if not isinstance(xgb_model, STRING_TYPES):
-            xgb_model = xgb_model.save_raw()
-        bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model)
-        nboost = len(bst.get_dump())
-    else:
-        bst = Booster(params, [dtrain] + [d[0] for d in evals])
-
-    _params = dict(params) if isinstance(params, list) else params
-    if 'num_parallel_tree' in _params:
-        num_parallel_tree = _params['num_parallel_tree']
-        nboost //= num_parallel_tree
-    if 'num_class' in _params:
-        nboost //= _params['num_class']
-
-    if evals_result is not None:
-        if not isinstance(evals_result, dict):
-            raise TypeError('evals_result has to be a dictionary')
-        else:
-            evals_name = [d[1] for d in evals]
-            evals_result.clear()
-            evals_result.update(dict([(key, {}) for key in evals_name]))
-
-    if not early_stopping_rounds:
-        for i in range(nboost, nboost + num_boost_round):
-            bst.update(dtrain, i, obj)
-            nboost += 1
-            if len(evals) != 0:
-                bst_eval_set = bst.eval_set(evals, i, feval)
-                if isinstance(bst_eval_set, STRING_TYPES):
-                    msg = bst_eval_set
-                else:
-                    msg = bst_eval_set.decode()
-
-                if verbose_eval:
-                    if verbose_eval_every_line:
-                        if i % verbose_eval_every_line == 0 or i == num_boost_round - 1:
-                            sys.stderr.write(msg + '\n')
-                    else:
-                        sys.stderr.write(msg + '\n')
-
-                if evals_result is not None:
-                    res = re.findall("([0-9a-zA-Z@]+[-]*):-?([0-9.]+).", msg)
-                    for key in evals_name:
-                        evals_idx = evals_name.index(key)
-                        res_per_eval = len(res) // len(evals_name)
-                        for r in range(res_per_eval):
-                            res_item = res[(evals_idx*res_per_eval) + r]
-                            res_key = res_item[0]
-                            res_val = res_item[1]
-                            if res_key in evals_result[key]:
-                                evals_result[key][res_key].append(res_val)
-                            else:
-                                evals_result[key][res_key] = [res_val]
-        bst.best_iteration = (nboost - 1)
-        bst.best_ntree_limit = nboost * num_parallel_tree
-        return bst
-
-    else:
-        # early stopping
-        if len(evals) < 1:
-            raise ValueError('For early stopping you need at least one set in evals.')
-
-        if verbose_eval:
-            sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\
-                evals[-1][1], early_stopping_rounds))
-
-        # is params a list of tuples? are we using multiple eval metrics?
-        if isinstance(params, list):
-            if len(params) != len(dict(params).items()):
-                params = dict(params)
-                sys.stderr.write("Multiple eval metrics have been passed: " \
-                                 "'{0}' will be used for early stopping.\n\n".format(params['eval_metric']))
-            else:
-                params = dict(params)
-
-        # either minimize loss or maximize AUC/MAP/NDCG
-        maximize_score = False
-        if 'eval_metric' in params:
-            maximize_metrics = ('auc', 'map', 'ndcg')
-            if any(params['eval_metric'].startswith(x) for x in maximize_metrics):
-                maximize_score = True
-        if feval is not None:
-            maximize_score = maximize
-
-        if maximize_score:
-            best_score = 0.0
-        else:
-            best_score = float('inf')
-
-        best_msg = ''
-        best_score_i = (nboost - 1)
-
-        if isinstance(learning_rates, list) and len(learning_rates) != num_boost_round:
-            raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.")
-
-        for i in range(nboost, nboost + num_boost_round):
-            if learning_rates is not None:
-                if isinstance(learning_rates, list):
-                    bst.set_param({'eta': learning_rates[i]})
-                else:
-                    bst.set_param({'eta': learning_rates(i, num_boost_round)})
-            bst.update(dtrain, i, obj)
-            nboost += 1
-            bst_eval_set = bst.eval_set(evals, i, feval)
-
-            if isinstance(bst_eval_set, STRING_TYPES):
-                msg = bst_eval_set
-            else:
-                msg = bst_eval_set.decode()
-
-            if verbose_eval:
-                if verbose_eval_every_line:
-                    if i % verbose_eval_every_line == 0 or i == num_boost_round - 1:
-                        sys.stderr.write(msg + '\n')
-                else:
-                    sys.stderr.write(msg + '\n')
-
-            if evals_result is not None:
-                res = re.findall("([0-9a-zA-Z@]+[-]*):-?([0-9.]+).", msg)
-                for key in evals_name:
-                    evals_idx = evals_name.index(key)
-                    res_per_eval = len(res) // len(evals_name)
-                    for r in range(res_per_eval):
-                        res_item = res[(evals_idx*res_per_eval) + r]
-                        res_key = res_item[0]
-                        res_val = res_item[1]
-                        if res_key in evals_result[key]:
-                            evals_result[key][res_key].append(res_val)
-                        else:
-                            evals_result[key][res_key] = [res_val]
-
-            score = float(msg.rsplit(':', 1)[1])
-            if (maximize_score and score > best_score) or \
-                    (not maximize_score and score < best_score):
-                best_score = score
-                best_score_i = (nboost - 1)
-                best_msg = msg
-            elif i - best_score_i >= early_stopping_rounds:
-                if verbose_eval:
-                    sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg))
-                bst.best_score = best_score
-                bst.best_iteration = best_score_i
-                break
-        bst.best_score = best_score
-        bst.best_iteration = best_score_i
-        bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree
-        return bst
-
-
-class CVPack(object):
-    """Auxiliary data structure to hold one fold of CV."""
-    def __init__(self, dtrain, dtest, param):
-        """Initialize the CVPack."""
-        self.dtrain = dtrain
-        self.dtest = dtest
-        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
-        self.bst = Booster(param, [dtrain, dtest])
-
-    def update(self, iteration, fobj):
-        """Update the boosters for one iteration."""
-        self.bst.update(self.dtrain, iteration, fobj)
-
-    def eval(self, iteration, feval):
-        """Evaluate the CVPack for one iteration."""
-        return self.bst.eval_set(self.watchlist, iteration, feval)
-
-
-def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False, folds=None):
-    """
-    Make an n-fold list of CVPack from random indices.
- """ - evals = list(evals) - np.random.seed(seed) - - if stratified is False and folds is None: - randidx = np.random.permutation(dall.num_row()) - kstep = len(randidx) / nfold - idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)] - elif folds is not None: - idset = [x[1] for x in folds] - nfold = len(idset) - else: - idset = [x[1] for x in XGBStratifiedKFold(dall.get_label(), - n_folds=nfold, - shuffle=True, - random_state=seed)] - - ret = [] - for k in range(nfold): - dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i])) - dtest = dall.slice(idset[k]) - # run preprocessing on the data set if needed - if fpreproc is not None: - dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy()) - else: - tparam = param - plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals] - ret.append(CVPack(dtrain, dtest, plst)) - return ret - -def aggcv(rlist, show_stdv=True, verbose_eval=None, as_pandas=True, trial=0): - # pylint: disable=invalid-name - """ - Aggregate cross-validation results. - - If verbose_eval is true, progress is displayed in every call. If - verbose_eval is an integer, progress will only be displayed every - `verbose_eval` trees, tracked via trial. - """ - cvmap = {} - idx = rlist[0].split()[0] - for line in rlist: - arr = line.split() - assert idx == arr[0] - for it in arr[1:]: - if not isinstance(it, STRING_TYPES): - it = it.decode() - k, v = it.split(':') - if k not in cvmap: - cvmap[k] = [] - cvmap[k].append(float(v)) - - msg = idx - - if show_stdv: - fmt = '\tcv-{0}:{1}+{2}' - else: - fmt = '\tcv-{0}:{1}' - - index = [] - results = [] - for k, v in sorted(cvmap.items(), key=lambda x: x[0]): - v = np.array(v) - if not isinstance(msg, STRING_TYPES): - msg = msg.decode() - mean, std = np.mean(v), np.std(v) - msg += fmt.format(k, mean, std) - - index.extend([k + '-mean', k + '-std']) - results.extend([mean, std]) - - if as_pandas: - try: - import pandas as pd - results = pd.Series(results, index=index) - except ImportError: - if verbose_eval is None: - verbose_eval = True - else: - # if show_progress is default (None), - # result will be np.ndarray as it can't hold column name - if verbose_eval is None: - verbose_eval = True - - if (isinstance(verbose_eval, int) and verbose_eval > 0 and trial % verbose_eval == 0) or \ - (isinstance(verbose_eval, bool) and verbose_eval): - sys.stderr.write(msg + '\n') - sys.stderr.flush() - - return results - - -def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None, - metrics=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None, - fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True, seed=0): - # pylint: disable = invalid-name - """Cross-validation with given paramaters. - - Parameters - ---------- - params : dict - Booster params. - dtrain : DMatrix - Data to be trained. - num_boost_round : int - Number of boosting iterations. - nfold : int - Number of folds in CV. - stratified : bool - Perform stratified sampling. - folds : KFold or StratifiedKFold - Sklearn KFolds or StratifiedKFolds. - metrics : string or list of strings - Evaluation metrics to be watched in CV. - obj : function - Custom objective function. - feval : function - Custom evaluation function. - maximize : bool - Whether to maximize feval. - early_stopping_rounds: int - Activates early stopping. CV error needs to decrease at least - every round(s) to continue. - Last entry in evaluation history is the one from best iteration. 
-    fpreproc : function
-        Preprocessing function that takes (dtrain, dtest, param) and returns
-        transformed versions of those.
-    as_pandas : bool, default True
-        Return pd.DataFrame when pandas is installed.
-        If False or pandas is not installed, return np.ndarray.
-    verbose_eval : bool, int, or None, default None
-        Whether to display the progress. If None, progress will be displayed
-        when np.ndarray is returned. If True, progress will be displayed at
-        every boosting stage. If an integer is given, progress will be displayed
-        at every given `verbose_eval` boosting stage.
-    show_stdv : bool, default True
-        Whether to display the standard deviation in progress.
-        Results are not affected, and always contain the std.
-    seed : int
-        Seed used to generate the folds (passed to numpy.random.seed).
-
-    Returns
-    -------
-    evaluation history : list(string)
-    """
-    if stratified is True and not SKLEARN_INSTALLED:
-        raise XGBoostError('sklearn needs to be installed in order to use stratified cv')
-
-    if isinstance(metrics, str):
-        metrics = [metrics]
-
-    if isinstance(params, list):
-        _metrics = [x[1] for x in params if x[0] == 'eval_metric']
-        params = dict(params)
-        if 'eval_metric' in params:
-            params['eval_metric'] = _metrics
-    else:
-        params = dict((k, v) for k, v in params.items())
-
-    if len(metrics) == 0 and 'eval_metric' in params:
-        if isinstance(params['eval_metric'], list):
-            metrics = params['eval_metric']
-        else:
-            metrics = [params['eval_metric']]
-
-    params.pop("eval_metric", None)
-
-    if early_stopping_rounds is not None:
-        if len(metrics) > 1:
-            raise ValueError('Check your params. '\
-                             'Early stopping works with single eval metric only.')
-
-        if verbose_eval:
-            sys.stderr.write("Will train until cv error hasn't decreased in {} rounds.\n".format(\
-                early_stopping_rounds))
-
-        maximize_score = False
-        if len(metrics) == 1:
-            maximize_metrics = ('auc', 'map', 'ndcg')
-            if any(metrics[0].startswith(x) for x in maximize_metrics):
-                maximize_score = True
-        if feval is not None:
-            maximize_score = maximize
-
-        if maximize_score:
-            best_score = 0.0
-        else:
-            best_score = float('inf')
-
-        best_score_i = 0
-
-    results = []
-    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc, stratified, folds)
-    for i in range(num_boost_round):
-        for fold in cvfolds:
-            fold.update(i, obj)
-        res = aggcv([f.eval(i, feval) for f in cvfolds],
-                    show_stdv=show_stdv, verbose_eval=verbose_eval,
-                    as_pandas=as_pandas, trial=i)
-        results.append(res)
-
-        if early_stopping_rounds is not None:
-            score = res[0]
-            if (maximize_score and score > best_score) or \
-                    (not maximize_score and score < best_score):
-                best_score = score
-                best_score_i = i
-            elif i - best_score_i >= early_stopping_rounds:
-                results = results[:best_score_i+1]
-                if verbose_eval:
-                    sys.stderr.write("Stopping. Best iteration:\n[{}] cv-mean:{}\tcv-std:{}\n".
-                                     format(best_score_i, results[-1][0], results[-1][1]))
-                break
-    if as_pandas:
-        try:
-            import pandas as pd
-            results = pd.DataFrame(results)
-        except ImportError:
-            results = np.array(results)
-    else:
-        results = np.array(results)
-
-    return results
-
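
Note on the removed interface: the entry points defined in this file, train() and cv(), were exposed at the package level as xgboost.train and xgboost.cv. The sketch below shows a typical call against the interface documented above (early stopping on a validation set, collecting evals_result, and a cross-validation run). The synthetic data, parameter values, and variable names are illustrative only and are not part of this change.

# Minimal usage sketch of the train()/cv() interface implemented by the deleted module.
# Data, parameter values, and names below are illustrative.
import numpy as np
import xgboost as xgb

X = np.random.rand(500, 10)
y = (X[:, 0] > 0.5).astype(int)
dtrain = xgb.DMatrix(X[:400], label=y[:400])
dvalid = xgb.DMatrix(X[400:], label=y[400:])

params = {'objective': 'binary:logistic', 'eval_metric': 'logloss', 'eta': 0.1}
evals_result = {}
bst = xgb.train(params, dtrain, num_boost_round=50,
                evals=[(dtrain, 'train'), (dvalid, 'eval')],
                early_stopping_rounds=5, evals_result=evals_result,
                verbose_eval=10)
print(bst.best_iteration, evals_result['eval']['logloss'][-1])

history = xgb.cv(params, dtrain, num_boost_round=50, nfold=3,
                 metrics='logloss', early_stopping_rounds=5, seed=0)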