Support slicing tree model (#6302)

This PR is meant to end the confusion around best_ntree_limit and unify model slicing. With multi-class models and random forests, asking users to understand how to set ntree_limit is difficult and error-prone.

* Implement the save_best option in early stopping.

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Jiaming Yuan
2020-11-03 02:27:39 -05:00
committed by GitHub
parent 29745c6df2
commit 2cc9662005
19 changed files with 550 additions and 37 deletions

View File

@@ -10,7 +10,7 @@ from typing import Callable, List
import numpy
from . import rabit
from .core import EarlyStopException, CallbackEnv
from .core import EarlyStopException, CallbackEnv, Booster, XGBoostError
from .compat import STRING_TYPES
@@ -279,9 +279,11 @@ class TrainingCallback(ABC):
def before_training(self, model):
    '''Hook invoked once before the first boosting round; must return the model.'''
    return model
def after_training(self, model):
    '''Hook invoked once after the last boosting round; must return the model.'''
    return model
def before_iteration(self, model, epoch, evals_log):
'''Run before each iteration. Return True when training should stop.'''
@@ -362,12 +364,24 @@ class CallbackContainer:
def before_training(self, model):
    '''Function called before training.

    Runs every registered callback's ``before_training`` hook in order,
    threading the (possibly replaced) model through each call, then
    sanity-checks that the hooks returned a model of the expected kind.

    Parameters
    ----------
    model :
        The booster being trained, or a cv-fold wrapper when ``self.is_cv``.

    Returns
    -------
    The model as returned by the last callback.
    '''
    for c in self.callbacks:
        # Each hook must return the model; a hook may substitute a new one.
        # (A stale duplicate call that invoked each hook twice and dropped
        # the first result has been removed here.)
        model = c.before_training(model=model)
    msg = 'before_training should return the model'
    if self.is_cv:
        # In cross-validation the "model" is a wrapper exposing cvfolds.
        assert isinstance(model.cvfolds, list), msg
    else:
        assert isinstance(model, Booster), msg
    return model
def after_training(self, model):
    '''Function called after training.

    Runs every registered callback's ``after_training`` hook in order,
    threading the (possibly replaced) model through each call, then
    sanity-checks that the hooks returned a model of the expected kind.

    Parameters
    ----------
    model :
        The trained booster, or a cv-fold wrapper when ``self.is_cv``.

    Returns
    -------
    The model as returned by the last callback (e.g. a sliced booster when
    an early-stopping callback applies ``save_best``).
    '''
    for c in self.callbacks:
        # Each hook must return the model; a hook may substitute a new one.
        # (A stale duplicate call that invoked each hook twice and ignored
        # its return value has been removed here.)
        model = c.after_training(model=model)
    msg = 'after_training should return the model'
    if self.is_cv:
        # In cross-validation the "model" is a wrapper exposing cvfolds.
        assert isinstance(model.cvfolds, list), msg
    else:
        assert isinstance(model, Booster), msg
    return model
def before_iteration(self, model, epoch, dtrain, evals):
'''Function called before training iteration.'''
@@ -461,7 +475,7 @@ class EarlyStopping(TrainingCallback):
maximize : bool
Whether to maximize evaluation metric. None means auto (discouraged).
save_best : bool
Placeholder, the feature is not yet supported.
Whether training should return the best model or the last model.
'''
def __init__(self,
rounds,
@@ -473,9 +487,6 @@ class EarlyStopping(TrainingCallback):
self.metric_name = metric_name
self.rounds = rounds
self.save_best = save_best
# https://github.com/dmlc/xgboost/issues/5531
assert self.save_best is False, 'save best is not yet supported.'
self.maximize = maximize
self.stopping_history = {}
@@ -525,7 +536,7 @@ class EarlyStopping(TrainingCallback):
return True
return False
def after_iteration(self, model, epoch, evals_log):
def after_iteration(self, model: Booster, epoch, evals_log):
msg = 'Must have at least 1 validation dataset for early stopping.'
assert len(evals_log.keys()) >= 1, msg
data_name = ''
@@ -551,6 +562,14 @@ class EarlyStopping(TrainingCallback):
score = data_log[metric_name][-1]
return self._update_rounds(score, data_name, metric_name, model, epoch)
def after_training(self, model: Booster):
    '''Slice the model down to the best iteration when ``save_best`` is set.

    Parameters
    ----------
    model : Booster
        The fully trained booster.

    Returns
    -------
    Booster
        The original model, or — when ``save_best`` is enabled — a sliced
        copy containing only the boosting rounds up to and including the
        best iteration recorded by early stopping.

    Raises
    ------
    XGBoostError
        If the booster does not support slicing (e.g. non-tree boosters).
    '''
    if self.save_best:
        try:
            best = int(model.attr('best_iteration'))
            # Slicing is end-exclusive, so include the best iteration
            # itself with ``best + 1``; slicing only up to ``best`` would
            # silently drop the best round and return a suboptimal model.
            model = model[: best + 1]
        except XGBoostError as e:
            raise XGBoostError(
                '`save_best` is not applicable to current booster') from e
    return model
class EvaluationMonitor(TrainingCallback):
'''Print the evaluation result at each iteration.
@@ -684,9 +703,11 @@ class LegacyCallbacks:
def before_training(self, model):
    '''Legacy callbacks have no pre-training hook; hand the model back untouched.'''
    return model
def after_training(self, model):
    '''Legacy callbacks have no post-training hook; hand the model back untouched.'''
    return model
def before_iteration(self, model, epoch, dtrain, evals):
'''Called before each iteration.'''

View File

@@ -944,8 +944,8 @@ class Booster(object):
Parameters for boosters.
cache : list
List of cache items.
model_file : string or os.PathLike
Path to the model file.
model_file : string/os.PathLike/Booster/bytearray
Path to the model file if it's string or PathLike.
"""
for d in cache:
if not isinstance(d, DMatrix):
@@ -1021,6 +1021,43 @@ class Booster(object):
state['handle'] = handle
self.__dict__.update(state)
def __getitem__(self, val):
    '''Slice the booster by boosting rounds (layers): ``bst[start:stop:step]``.

    Returns a new :class:`Booster` containing only the selected layers.
    Only 1-dimensional int/slice indexing is supported; negative indices
    are not handled here (the native library receives the values as-is).
    '''
    if isinstance(val, int):
        # A single index selects exactly one layer: [val, val+1).
        val = slice(val, val+1)
    if isinstance(val, tuple):
        raise ValueError('Only supports slicing through 1 dimension.')
    if not isinstance(val, slice):
        msg = _expect((int, slice), type(val))
        raise TypeError(msg)
    # Normalize omitted bounds. ``...`` (Ellipsis) is treated like None.
    if isinstance(val.start, type(Ellipsis)) or val.start is None:
        start = 0
    else:
        start = val.start
    if isinstance(val.stop, type(Ellipsis)) or val.stop is None:
        # NOTE(review): 0 appears to be the native sentinel for "up to the
        # end" — confirm against the XGBoosterSlice C API.
        stop = 0
    else:
        stop = val.stop
    if stop < start:
        raise ValueError('Invalid slice', val)
    step = val.step if val.step is not None else 1
    start = ctypes.c_int(start)
    stop = ctypes.c_int(stop)
    step = ctypes.c_int(step)
    sliced_handle = ctypes.c_void_p()
    status = _LIB.XGBoosterSlice(self.handle, start, stop, step,
                                 ctypes.byref(sliced_handle))
    # Status -2 is mapped to IndexError before the generic error check;
    # presumably the native side reserves it for out-of-range layers.
    if status == -2:
        raise IndexError('Layer index out of range')
    _check_call(status)
    # Build an empty Booster, free its freshly allocated native handle,
    # then adopt the sliced handle so the wrapper owns the sliced model.
    sliced = Booster()
    _check_call(_LIB.XGBoosterFree(sliced.handle))
    sliced.handle = sliced_handle
    return sliced
def save_config(self):
'''Output internal parameter configuration of Booster as a JSON
string.

View File

@@ -103,7 +103,7 @@ def _train_internal(params, dtrain,
num_boost_round, feval, evals_result, callbacks,
show_stdv=False, cvfolds=None)
callbacks.before_training(bst)
bst = callbacks.before_training(bst)
for i in range(start_iteration, num_boost_round):
if callbacks.before_iteration(bst, i, dtrain, evals):
break
@@ -125,7 +125,7 @@ def _train_internal(params, dtrain,
bst.save_rabit_checkpoint()
version += 1
callbacks.after_training(bst)
bst = callbacks.after_training(bst)
if evals_result is not None and is_new_callback:
evals_result.update(callbacks.history)
@@ -495,9 +495,8 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
verbose_eval, early_stopping_rounds, maximize, 0,
num_boost_round, feval, None, callbacks,
show_stdv=show_stdv, cvfolds=cvfolds)
callbacks.before_training(cvfolds)
booster = _PackedBooster(cvfolds)
callbacks.before_training(booster)
for i in range(num_boost_round):
if callbacks.before_iteration(booster, i, dtrain, None):
@@ -524,4 +523,7 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
results = pd.DataFrame.from_dict(results)
except ImportError:
pass
callbacks.after_training(booster)
return results