Fix early stopping in the Python package (#4638)

* Fix #4630, #4421: Preserve correct ordering between metrics, and always use last metric for early stopping * Clarify semantics of early stopping in presence of multiple valid sets and metrics * Add a test * Fix lint
2019-07-07 01:01:03 -07:00
parent 562d9ae963
commit 1aaf4a679d
3 changed files with 106 additions and 115 deletions
--- a/python-package/xgboost/training.py
+++ b/python-package/xgboost/training.py
@@ -127,8 +127,8 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
    num_boost_round: int
        Number of boosting iterations.
    evals: list of pairs (DMatrix, string)
-        List of items to be evaluated during training, this allows user to watch
-        performance on the validation set.
+        List of validation sets for which metrics will evaluated during training.
+        Validation metrics will help us track the performance of the model.
    obj : function
        Customized objective function.
    feval : function
@@ -136,11 +136,14 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
    maximize : bool
        Whether to maximize feval.
    early_stopping_rounds: int
-        Activates early stopping. Validation error needs to decrease at least
+        Activates early stopping. Validation metric needs to improve at least once in
        every **early_stopping_rounds** round(s) to continue training.
        Requires at least one item in **evals**.
-        If there's more than one, will use the last.
-        Returns the model from the last iteration (not the best one).
+        The method returns the model from the last iteration (not the best one).
+        If there's more than one item in **evals**, the last entry will be used
+        for early stopping.
+        If there's more than one metric in the **eval_metric** parameter given in
+        **params**, the last metric will be used for early stopping.
        If early stopping occurs, the model will have three additional fields:
        ``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``.
        (Use ``bst.best_ntree_limit`` to get the correct value if
@@ -352,16 +355,16 @@ def aggcv(rlist):
    for line in rlist:
        arr = line.split()
        assert idx == arr[0]
-        for it in arr[1:]:
+        for metric_idx, it in enumerate(arr[1:]):
            if not isinstance(it, STRING_TYPES):
                it = it.decode()
            k, v = it.split(':')
-            if k not in cvmap:
-                cvmap[k] = []
-            cvmap[k].append(float(v))
+            if (metric_idx, k) not in cvmap:
+                cvmap[(metric_idx, k)] = []
+            cvmap[(metric_idx, k)].append(float(v))
    msg = idx
    results = []
-    for k, v in sorted(cvmap.items(), key=lambda x: (x[0].startswith('test'), x[0])):
+    for (metric_idx, k), v in sorted(cvmap.items(), key=lambda x: x[0][0]):
        v = np.array(v)
        if not isinstance(msg, STRING_TYPES):
            msg = msg.decode()
@@ -405,9 +408,12 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
    maximize : bool
        Whether to maximize feval.
    early_stopping_rounds: int
-        Activates early stopping. CV error needs to decrease at least
-        every <early_stopping_rounds> round(s) to continue.
-        Last entry in evaluation history is the one from best iteration.
+        Activates early stopping. Cross-Validation metric (average of validation
+        metric computed over CV folds) needs to improve at least once in
+        every **early_stopping_rounds** round(s) to continue training.
+        The last entry in the evaluation history will represent the best iteration.
+        If there's more than one metric in the **eval_metric** parameter given in
+        **params**, the last metric will be used for early stopping.
    fpreproc : function
        Preprocessing function that takes (dtrain, dtest, param) and returns
        transformed versions of those.