Support slicing tree model (#6302)

This PR is meant to end the confusion around best_ntree_limit and unify model slicing. With multi-class models and random forests, asking users to understand how to set ntree_limit is difficult and error-prone.

* Implement the save_best option in early stopping.

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Jiaming Yuan
2020-11-03 02:27:39 -05:00
committed by GitHub
parent 29745c6df2
commit 2cc9662005
19 changed files with 550 additions and 37 deletions

View File

@@ -10,7 +10,7 @@ from typing import Callable, List
import numpy
from . import rabit
from .core import EarlyStopException, CallbackEnv
from .core import EarlyStopException, CallbackEnv, Booster, XGBoostError
from .compat import STRING_TYPES
@@ -279,9 +279,11 @@ class TrainingCallback(ABC):
def before_training(self, model):
    '''Hook invoked once before the first boosting round; must return the model.'''
    return model
def after_training(self, model):
    '''Hook invoked once after the last boosting round; must return the model.'''
    return model
def before_iteration(self, model, epoch, evals_log):
'''Run before each iteration. Return True when training should stop.'''
@@ -362,12 +364,24 @@ class CallbackContainer:
def before_training(self, model):
    '''Function called before training.

    Runs every registered callback's ``before_training`` hook in order,
    threading the (possibly replaced) model through each call, then
    sanity-checks that the hooks returned a model of the expected kind.

    Parameters
    ----------
    model :
        The booster being trained, or a cv-fold wrapper when ``self.is_cv``.

    Returns
    -------
    The model as returned by the last callback.
    '''
    for c in self.callbacks:
        # Each hook must return the model; a hook may substitute a new one.
        # (A stale duplicate call that invoked each hook twice and dropped
        # the first result has been removed here.)
        model = c.before_training(model=model)
    msg = 'before_training should return the model'
    if self.is_cv:
        # In cross-validation the "model" is a wrapper exposing cvfolds.
        assert isinstance(model.cvfolds, list), msg
    else:
        assert isinstance(model, Booster), msg
    return model
def after_training(self, model):
    '''Function called after training.

    Runs every registered callback's ``after_training`` hook in order,
    threading the (possibly replaced) model through each call, then
    sanity-checks that the hooks returned a model of the expected kind.

    Parameters
    ----------
    model :
        The trained booster, or a cv-fold wrapper when ``self.is_cv``.

    Returns
    -------
    The model as returned by the last callback (e.g. a sliced booster when
    an early-stopping callback applies ``save_best``).
    '''
    for c in self.callbacks:
        # Each hook must return the model; a hook may substitute a new one.
        # (A stale duplicate call that invoked each hook twice and ignored
        # its return value has been removed here.)
        model = c.after_training(model=model)
    msg = 'after_training should return the model'
    if self.is_cv:
        # In cross-validation the "model" is a wrapper exposing cvfolds.
        assert isinstance(model.cvfolds, list), msg
    else:
        assert isinstance(model, Booster), msg
    return model
def before_iteration(self, model, epoch, dtrain, evals):
'''Function called before training iteration.'''
@@ -461,7 +475,7 @@ class EarlyStopping(TrainingCallback):
maximize : bool
Whether to maximize evaluation metric. None means auto (discouraged).
save_best : bool
Placeholder, the feature is not yet supported.
Whether training should return the best model or the last model.
'''
def __init__(self,
rounds,
@@ -473,9 +487,6 @@ class EarlyStopping(TrainingCallback):
self.metric_name = metric_name
self.rounds = rounds
self.save_best = save_best
# https://github.com/dmlc/xgboost/issues/5531
assert self.save_best is False, 'save best is not yet supported.'
self.maximize = maximize
self.stopping_history = {}
@@ -525,7 +536,7 @@ class EarlyStopping(TrainingCallback):
return True
return False
def after_iteration(self, model, epoch, evals_log):
def after_iteration(self, model: Booster, epoch, evals_log):
msg = 'Must have at least 1 validation dataset for early stopping.'
assert len(evals_log.keys()) >= 1, msg
data_name = ''
@@ -551,6 +562,14 @@ class EarlyStopping(TrainingCallback):
score = data_log[metric_name][-1]
return self._update_rounds(score, data_name, metric_name, model, epoch)
def after_training(self, model: Booster):
    '''Slice the model down to the best iteration when ``save_best`` is set.

    Parameters
    ----------
    model : Booster
        The fully trained booster.

    Returns
    -------
    Booster
        The original model, or — when ``save_best`` is enabled — a sliced
        copy containing only the boosting rounds up to and including the
        best iteration recorded by early stopping.

    Raises
    ------
    XGBoostError
        If the booster does not support slicing (e.g. non-tree boosters).
    '''
    if self.save_best:
        try:
            best = int(model.attr('best_iteration'))
            # Slicing is end-exclusive, so include the best iteration
            # itself with ``best + 1``; slicing only up to ``best`` would
            # silently drop the best round and return a suboptimal model.
            model = model[: best + 1]
        except XGBoostError as e:
            raise XGBoostError(
                '`save_best` is not applicable to current booster') from e
    return model
class EvaluationMonitor(TrainingCallback):
'''Print the evaluation result at each iteration.
@@ -684,9 +703,11 @@ class LegacyCallbacks:
def before_training(self, model):
    '''Legacy callbacks have no pre-training hook; hand the model back untouched.'''
    return model
def after_training(self, model):
    '''Legacy callbacks have no post-training hook; hand the model back untouched.'''
    return model
def before_iteration(self, model, epoch, dtrain, evals):
'''Called before each iteration.'''

View File

@@ -944,8 +944,8 @@ class Booster(object):
Parameters for boosters.
cache : list
List of cache items.
model_file : string or os.PathLike
Path to the model file.
model_file : string/os.PathLike/Booster/bytearray
Path to the model file if it's string or PathLike.
"""
for d in cache:
if not isinstance(d, DMatrix):
@@ -1021,6 +1021,43 @@ class Booster(object):
state['handle'] = handle
self.__dict__.update(state)
def __getitem__(self, val):
    '''Slice the booster by boosting rounds (layers): ``bst[start:stop:step]``.

    Returns a new :class:`Booster` containing only the selected layers.
    Only 1-dimensional int/slice indexing is supported; negative indices
    are not handled here (the native library receives the values as-is).
    '''
    if isinstance(val, int):
        # A single index selects exactly one layer: [val, val+1).
        val = slice(val, val+1)
    if isinstance(val, tuple):
        raise ValueError('Only supports slicing through 1 dimension.')
    if not isinstance(val, slice):
        msg = _expect((int, slice), type(val))
        raise TypeError(msg)
    # Normalize omitted bounds. ``...`` (Ellipsis) is treated like None.
    if isinstance(val.start, type(Ellipsis)) or val.start is None:
        start = 0
    else:
        start = val.start
    if isinstance(val.stop, type(Ellipsis)) or val.stop is None:
        # NOTE(review): 0 appears to be the native sentinel for "up to the
        # end" — confirm against the XGBoosterSlice C API.
        stop = 0
    else:
        stop = val.stop
    if stop < start:
        raise ValueError('Invalid slice', val)
    step = val.step if val.step is not None else 1
    start = ctypes.c_int(start)
    stop = ctypes.c_int(stop)
    step = ctypes.c_int(step)
    sliced_handle = ctypes.c_void_p()
    status = _LIB.XGBoosterSlice(self.handle, start, stop, step,
                                 ctypes.byref(sliced_handle))
    # Status -2 is mapped to IndexError before the generic error check;
    # presumably the native side reserves it for out-of-range layers.
    if status == -2:
        raise IndexError('Layer index out of range')
    _check_call(status)
    # Build an empty Booster, free its freshly allocated native handle,
    # then adopt the sliced handle so the wrapper owns the sliced model.
    sliced = Booster()
    _check_call(_LIB.XGBoosterFree(sliced.handle))
    sliced.handle = sliced_handle
    return sliced
def save_config(self):
'''Output internal parameter configuration of Booster as a JSON
string.

View File

@@ -103,7 +103,7 @@ def _train_internal(params, dtrain,
num_boost_round, feval, evals_result, callbacks,
show_stdv=False, cvfolds=None)
callbacks.before_training(bst)
bst = callbacks.before_training(bst)
for i in range(start_iteration, num_boost_round):
if callbacks.before_iteration(bst, i, dtrain, evals):
break
@@ -125,7 +125,7 @@ def _train_internal(params, dtrain,
bst.save_rabit_checkpoint()
version += 1
callbacks.after_training(bst)
bst = callbacks.after_training(bst)
if evals_result is not None and is_new_callback:
evals_result.update(callbacks.history)
@@ -495,9 +495,8 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
verbose_eval, early_stopping_rounds, maximize, 0,
num_boost_round, feval, None, callbacks,
show_stdv=show_stdv, cvfolds=cvfolds)
callbacks.before_training(cvfolds)
booster = _PackedBooster(cvfolds)
callbacks.before_training(booster)
for i in range(num_boost_round):
if callbacks.before_iteration(booster, i, dtrain, None):
@@ -524,4 +523,7 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
results = pd.DataFrame.from_dict(results)
except ImportError:
pass
callbacks.after_training(booster)
return results