[breaking] Remove duplicated predict functions, Fix attributes IO. (#6593)

* Fix attributes not being restored.
* Rename all `data` to `X`. [breaking]
Jiaming Yuan 2021-01-13 16:56:49 +08:00 committed by GitHub
parent 7f4d3a91b9
commit 0027220aa0
4 changed files with 91 additions and 87 deletions
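The `data` -> `X` rename changes the keyword name of the first argument of `predict` in the sklearn wrappers, which is what makes this commit breaking. A minimal sketch of the user-facing effect with a small synthetic dataset (variable names below are illustrative, not part of the diff):

    import numpy as np
    import xgboost as xgb

    X_train = np.random.rand(50, 4)
    y_train = np.random.randint(0, 2, size=50)
    clf = xgb.XGBClassifier(n_estimators=2).fit(X_train, y_train)

    clf.predict(X_train)         # positional call: unchanged
    clf.predict(X=X_train)       # keyword is now ``X``
    # clf.predict(data=X_train)  # would now raise TypeError (the [breaking] part)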


@@ -1741,6 +1741,13 @@ class Booster(object):
else:
raise TypeError('Unknown file type: ', fname)
if self.attr("best_iteration") is not None:
self.best_iteration = int(self.attr("best_iteration"))
if self.attr("best_score") is not None:
self.best_score = float(self.attr("best_score"))
if self.attr("best_ntree_limit") is not None:
self.best_ntree_limit = int(self.attr("best_ntree_limit"))
def num_boosted_rounds(self) -> int:
'''Get number of boosted rounds. For gblinear this is reset to 0 after
serializing the model.
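The hunk above makes `Booster.load_model` restore `best_iteration`, `best_score` and `best_ntree_limit` as Python attributes from the stored booster attributes, so early-stopping information is no longer lost on reload. A rough sketch of the behaviour with synthetic data (file name and parameters are placeholders):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(100, 4)
    y = np.random.randint(0, 2, size=100)
    dtrain = xgb.DMatrix(X, label=y)

    bst = xgb.train({"objective": "binary:logistic"}, dtrain,
                    num_boost_round=10, evals=[(dtrain, "train")],
                    early_stopping_rounds=2)
    bst.save_model("model.json")

    loaded = xgb.Booster()
    loaded.load_model("model.json")
    # With this fix the stopping information survives the round trip.
    print(loaded.best_iteration, loaded.best_score, loaded.best_ntree_limit)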


@@ -682,10 +682,16 @@ class XGBModel(XGBModelBase):
self._set_evaluation_result(evals_result)
return self
def predict(self, data, output_margin=False, ntree_limit=None,
validate_features=True, base_margin=None):
def predict(
self,
X,
output_margin=False,
ntree_limit=None,
validate_features=True,
base_margin=None
):
"""
Predict with `data`.
Predict with `X`.
.. note:: This function is not thread safe.
@@ -699,7 +705,7 @@ class XGBModel(XGBModelBase):
Parameters
----------
data : array_like
X : array_like
Data to predict with
output_margin : bool
Whether to output the raw untransformed margin value.
@@ -718,16 +724,21 @@ class XGBModel(XGBModelBase):
prediction : numpy array
"""
# pylint: disable=missing-docstring,invalid-name
test_dmatrix = DMatrix(data, base_margin=base_margin,
test_dmatrix = DMatrix(X, base_margin=base_margin,
missing=self.missing, nthread=self.n_jobs)
# get ntree_limit to use - if none specified, default to
# best_ntree_limit if defined, otherwise 0.
if ntree_limit is None:
ntree_limit = getattr(self, "best_ntree_limit", 0)
return self.get_booster().predict(test_dmatrix,
output_margin=output_margin,
ntree_limit=ntree_limit,
validate_features=validate_features)
try:
ntree_limit = self.best_ntree_limit
except AttributeError:
ntree_limit = 0
return self.get_booster().predict(
test_dmatrix,
output_margin=output_margin,
ntree_limit=ntree_limit,
validate_features=validate_features
)
def apply(self, X, ntree_limit=0):
"""Return the predicted leaf every tree for each sample.
@@ -1037,50 +1048,21 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
'Fit gradient boosting model',
'Fit gradient boosting classifier', 1)
def predict(self, data, output_margin=False, ntree_limit=None,
validate_features=True, base_margin=None):
"""
Predict with `data`.
.. note:: This function is not thread safe.
For each booster object, predict can only be called from one thread.
If you want to run prediction using multiple thread, call
``xgb.copy()`` to make copies of model object and then call
``predict()``.
.. code-block:: python
preds = bst.predict(dtest, ntree_limit=num_round)
Parameters
----------
data : array_like
Feature matrix.
output_margin : bool
Whether to output the raw untransformed margin value.
ntree_limit : int
Limit number of trees in the prediction; defaults to
best_ntree_limit if defined (i.e. it has been trained with early
stopping), otherwise 0 (use all trees).
validate_features : bool
When this is True, validate that the Booster's and data's
feature_names are identical. Otherwise, it is assumed that the
feature_names are the same.
Returns
-------
prediction : numpy array
"""
test_dmatrix = DMatrix(data, base_margin=base_margin,
missing=self.missing, nthread=self.n_jobs)
if ntree_limit is None:
ntree_limit = getattr(self, "best_ntree_limit", 0)
class_probs = self.get_booster().predict(
test_dmatrix,
def predict(
self,
X,
output_margin=False,
ntree_limit=None,
validate_features=True,
base_margin=None
):
class_probs = super().predict(
X=X,
output_margin=output_margin,
ntree_limit=ntree_limit,
validate_features=validate_features)
validate_features=validate_features,
base_margin=base_margin
)
if output_margin:
# If output_margin is active, simply return the scores
return class_probs
@@ -1125,13 +1107,13 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
a numpy array of shape array-like of shape (n_samples, n_classes) with the
probability of each data example being of a given class.
"""
test_dmatrix = DMatrix(X, base_margin=base_margin,
missing=self.missing, nthread=self.n_jobs)
if ntree_limit is None:
ntree_limit = getattr(self, "best_ntree_limit", 0)
class_probs = self.get_booster().predict(test_dmatrix,
ntree_limit=ntree_limit,
validate_features=validate_features)
class_probs = super().predict(
X=X,
output_margin=False,
ntree_limit=ntree_limit,
validate_features=validate_features,
base_margin=base_margin
)
return _cls_predict_proba(self.objective, class_probs, np.vstack)
def evals_result(self):
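Both `XGBClassifier.predict` and `predict_proba` now delegate to `XGBModel.predict`, so the DMatrix construction and the `ntree_limit`/`base_margin` handling live in one place instead of several near-identical copies. A hedged usage sketch with synthetic data (names are illustrative):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(150, 4)
    y = np.random.randint(0, 3, size=150)

    clf = xgb.XGBClassifier(n_estimators=5).fit(X, y)
    labels = clf.predict(X)        # class labels, routed through XGBModel.predict
    proba = clf.predict_proba(X)   # per-class probabilities, same shared code path
    assert proba.shape == (150, 3)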
@@ -1493,18 +1475,3 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
self.objective = params["objective"]
self._set_evaluation_result(evals_result)
return self
def predict(self, data, output_margin=False,
ntree_limit=0, validate_features=True, base_margin=None):
test_dmatrix = DMatrix(data, base_margin=base_margin,
missing=self.missing)
if ntree_limit is None:
ntree_limit = getattr(self, "best_ntree_limit", 0)
return self.get_booster().predict(test_dmatrix,
output_margin=output_margin,
ntree_limit=ntree_limit,
validate_features=validate_features)
predict.__doc__ = XGBModel.predict.__doc__
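With the duplicated body removed, `XGBRanker` simply inherits `predict` from `XGBModel`, so the explicit `__doc__` assignment is no longer needed either. A rough sketch of the call after this change (toy data with two query groups; names are illustrative):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(20, 3)
    y = np.random.randint(0, 5, size=20)

    ranker = xgb.XGBRanker(n_estimators=4)
    ranker.fit(X, y, group=[10, 10])   # two query groups of ten rows each
    scores = ranker.predict(X)         # inherited XGBModel.predict; argument is now ``X``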


@@ -88,12 +88,9 @@ def _train_internal(params, dtrain,
if evals_result is not None and is_new_callback:
evals_result.update(callbacks.history)
if bst.attr('best_score') is not None:
bst.best_score = float(bst.attr('best_score'))
bst.best_iteration = int(bst.attr('best_iteration'))
else:
bst.best_iteration = bst.num_boosted_rounds() - 1
# These should be moved into callback functions `after_training`, but until old
# callbacks are removed, the train function is the only place for setting the
# attributes.
config = json.loads(bst.save_config())
booster = config['learner']['gradient_booster']['name']
if booster == 'gblinear':
@@ -114,7 +111,20 @@
num_groups = int(config['learner']['learner_model_param']['num_class'])
num_groups = 1 if num_groups == 0 else num_groups
bst.best_ntree_limit = ((bst.best_iteration + 1) * num_parallel_tree * num_groups)
if bst.attr('best_score') is not None:
bst.best_score = float(bst.attr('best_score'))
bst.best_iteration = int(bst.attr('best_iteration'))
bst.set_attr(
best_ntree_limit=str(
(bst.best_iteration + 1) * num_parallel_tree * num_groups
)
)
bst.best_ntree_limit = int(bst.attr("best_ntree_limit"))
else:
# Due to compatibility with version older than 1.4, these attributes are added
# to Python object even if early stopping is not used.
bst.best_iteration = bst.num_boosted_rounds() - 1
bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree * num_groups
# Copy to serialise and unserialise booster to reset state and free
# training memory
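Both branches of the relocated block compute `best_ntree_limit` as `(best_iteration + 1) * num_parallel_tree * num_groups`, where `num_groups` is the number of classes (1 for non-multiclass objectives). Plugging in the numbers from the new `test_attributes` case further down (2 boosting rounds on iris, default `num_parallel_tree`, no rounds discarded by early stopping):

    best_iteration = 1       # last round, zero-based
    num_parallel_tree = 1    # default
    num_groups = 3           # iris has three classes
    best_ntree_limit = (best_iteration + 1) * num_parallel_tree * num_groups
    assert best_ntree_limit == 6   # == 2 * n_classes_, as the test asserts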
@@ -148,15 +158,16 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
Activates early stopping. Validation metric needs to improve at least once in
every **early_stopping_rounds** round(s) to continue training.
Requires at least one item in **evals**.
The method returns the model from the last iteration (not the best one).
If there's more than one item in **evals**, the last entry will be used
for early stopping.
The method returns the model from the last iteration (not the best one). Use
custom callback or model slicing if the best model is desired.
If there's more than one item in **evals**, the last entry will be used for early
stopping.
If there's more than one metric in the **eval_metric** parameter given in
**params**, the last metric will be used for early stopping.
If early stopping occurs, the model will have three additional fields:
``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``.
(Use ``bst.best_ntree_limit`` to get the correct value if
``num_parallel_tree`` and/or ``num_class`` appears in the parameters)
``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``. (Use
``bst.best_ntree_limit`` to get the correct value if ``num_parallel_tree`` and/or
``num_class`` appears in the parameters)
evals_result: dict
This dictionary stores the evaluation results of all the items in watchlist.
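The reworded docstring points to model slicing as the way to recover the best model, since `train` returns the booster from the last iteration. A hedged sketch of that pattern with synthetic data (`bst[begin:end]` is the booster slicing the docstring refers to):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(100, 4)
    y = np.random.randint(0, 2, size=100)
    dtrain = xgb.DMatrix(X, label=y)

    bst = xgb.train({"objective": "binary:logistic"}, dtrain,
                    num_boost_round=100, evals=[(dtrain, "train")],
                    early_stopping_rounds=5)
    # Keep only the rounds up to and including the best one.
    best = bst[: bst.best_iteration + 1]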


@@ -341,6 +341,25 @@ class TestModels:
'objective': 'multi:softmax'}
validate_model(parameters)
@pytest.mark.skipif(**tm.no_sklearn())
def test_attributes(self):
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
cls = xgb.XGBClassifier(n_estimators=2)
cls.fit(X, y, early_stopping_rounds=1, eval_set=[(X, y)])
assert cls.get_booster().best_ntree_limit == 2 * cls.n_classes_
assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit
with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, "cls.json")
cls.save_model(path)
cls = xgb.XGBClassifier(n_estimators=2)
cls.load_model(path)
assert cls.get_booster().best_ntree_limit == 2 * cls.n_classes_
assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit
@pytest.mark.skipif(**tm.no_sklearn())
@pytest.mark.parametrize('booster', ['gbtree', 'dart'])
def test_slice(self, booster):
from sklearn.datasets import make_classification