[breaking] Remove duplicated predict functions, Fix attributes IO. (#6593)

* Fix attributes not being restored.
* Rename all `data` to `X`. [breaking]
Jiaming Yuan 2021-01-13 16:56:49 +08:00 committed by GitHub
parent 7f4d3a91b9
commit 0027220aa0
4 changed files with 91 additions and 87 deletions

View File

@@ -1741,6 +1741,13 @@ class Booster(object):
         else:
             raise TypeError('Unknown file type: ', fname)

+        if self.attr("best_iteration") is not None:
+            self.best_iteration = int(self.attr("best_iteration"))
+        if self.attr("best_score") is not None:
+            self.best_score = float(self.attr("best_score"))
+        if self.attr("best_ntree_limit") is not None:
+            self.best_ntree_limit = int(self.attr("best_ntree_limit"))
+
     def num_boosted_rounds(self) -> int:
         '''Get number of boosted rounds. For gblinear this is reset to 0 after
         serializing the model.
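For context, a minimal sketch of the behaviour this restores; it is not taken from the diff, and the file name and toy data are made up. After `load_model`, the `best_iteration`, `best_score` and `best_ntree_limit` Python attributes are rebuilt from the booster attributes stored with the model instead of being silently dropped:

    import numpy as np
    import xgboost as xgb

    X, y = np.random.rand(100, 4), np.random.rand(100)
    dtrain = xgb.DMatrix(X, y)
    bst = xgb.train({"objective": "reg:squarederror"}, dtrain,
                    num_boost_round=20, evals=[(dtrain, "train")],
                    early_stopping_rounds=3)
    bst.save_model("model.json")

    loaded = xgb.Booster()
    loaded.load_model("model.json")
    # With this patch the Python-side attributes come back whenever the
    # corresponding booster attributes were saved with the model.
    if loaded.attr("best_iteration") is not None:
        print(loaded.best_iteration, loaded.best_score)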

View File

@@ -682,10 +682,16 @@ class XGBModel(XGBModelBase):
         self._set_evaluation_result(evals_result)
         return self

-    def predict(self, data, output_margin=False, ntree_limit=None,
-                validate_features=True, base_margin=None):
+    def predict(
+        self,
+        X,
+        output_margin=False,
+        ntree_limit=None,
+        validate_features=True,
+        base_margin=None
+    ):
         """
-        Predict with `data`.
+        Predict with `X`.

         .. note:: This function is not thread safe.
@@ -699,7 +705,7 @@ class XGBModel(XGBModelBase):

         Parameters
         ----------
-        data : array_like
+        X : array_like
             Data to predict with
         output_margin : bool
             Whether to output the raw untransformed margin value.
@@ -718,16 +724,21 @@ class XGBModel(XGBModelBase):
         prediction : numpy array
         """
         # pylint: disable=missing-docstring,invalid-name
-        test_dmatrix = DMatrix(data, base_margin=base_margin,
+        test_dmatrix = DMatrix(X, base_margin=base_margin,
                                missing=self.missing, nthread=self.n_jobs)
         # get ntree_limit to use - if none specified, default to
         # best_ntree_limit if defined, otherwise 0.
         if ntree_limit is None:
-            ntree_limit = getattr(self, "best_ntree_limit", 0)
-        return self.get_booster().predict(test_dmatrix,
-                                          output_margin=output_margin,
-                                          ntree_limit=ntree_limit,
-                                          validate_features=validate_features)
+            try:
+                ntree_limit = self.best_ntree_limit
+            except AttributeError:
+                ntree_limit = 0
+        return self.get_booster().predict(
+            test_dmatrix,
+            output_margin=output_margin,
+            ntree_limit=ntree_limit,
+            validate_features=validate_features
+        )

     def apply(self, X, ntree_limit=0):
         """Return the predicted leaf every tree for each sample.
@@ -1037,50 +1048,21 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         'Fit gradient boosting model',
         'Fit gradient boosting classifier', 1)

-    def predict(self, data, output_margin=False, ntree_limit=None,
-                validate_features=True, base_margin=None):
-        """
-        Predict with `data`.
-
-        .. note:: This function is not thread safe.
-
-        For each booster object, predict can only be called from one thread.
-        If you want to run prediction using multiple thread, call
-        ``xgb.copy()`` to make copies of model object and then call
-        ``predict()``.
-
-        .. code-block:: python
-
-            preds = bst.predict(dtest, ntree_limit=num_round)
-
-        Parameters
-        ----------
-        data : array_like
-            Feature matrix.
-        output_margin : bool
-            Whether to output the raw untransformed margin value.
-        ntree_limit : int
-            Limit number of trees in the prediction; defaults to
-            best_ntree_limit if defined (i.e. it has been trained with early
-            stopping), otherwise 0 (use all trees).
-        validate_features : bool
-            When this is True, validate that the Booster's and data's
-            feature_names are identical. Otherwise, it is assumed that the
-            feature_names are the same.
-
-        Returns
-        -------
-        prediction : numpy array
-        """
-        test_dmatrix = DMatrix(data, base_margin=base_margin,
-                               missing=self.missing, nthread=self.n_jobs)
-        if ntree_limit is None:
-            ntree_limit = getattr(self, "best_ntree_limit", 0)
-        class_probs = self.get_booster().predict(
-            test_dmatrix,
-            output_margin=output_margin,
-            ntree_limit=ntree_limit,
-            validate_features=validate_features)
+    def predict(
+        self,
+        X,
+        output_margin=False,
+        ntree_limit=None,
+        validate_features=True,
+        base_margin=None
+    ):
+        class_probs = super().predict(
+            X=X,
+            output_margin=output_margin,
+            ntree_limit=ntree_limit,
+            validate_features=validate_features,
+            base_margin=base_margin
+        )
         if output_margin:
             # If output_margin is active, simply return the scores
             return class_probs
@@ -1125,13 +1107,13 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
             a numpy array of shape array-like of shape (n_samples, n_classes) with the
             probability of each data example being of a given class.
         """
-        test_dmatrix = DMatrix(X, base_margin=base_margin,
-                               missing=self.missing, nthread=self.n_jobs)
-        if ntree_limit is None:
-            ntree_limit = getattr(self, "best_ntree_limit", 0)
-        class_probs = self.get_booster().predict(test_dmatrix,
-                                                 ntree_limit=ntree_limit,
-                                                 validate_features=validate_features)
+        class_probs = super().predict(
+            X=X,
+            output_margin=False,
+            ntree_limit=ntree_limit,
+            validate_features=validate_features,
+            base_margin=base_margin
+        )
         return _cls_predict_proba(self.objective, class_probs, np.vstack)

     def evals_result(self):
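Both classifier entry points now route through `XGBModel.predict`, so DMatrix construction, the `ntree_limit` fallback and `base_margin` forwarding live in one place. A small sketch of the unchanged caller-facing behaviour (assumes scikit-learn is available for the toy dataset; not part of the diff):

    import xgboost as xgb
    from sklearn.datasets import load_iris

    X, y = load_iris(return_X_y=True)
    clf = xgb.XGBClassifier(n_estimators=4).fit(X, y)

    labels = clf.predict(X)                        # class labels
    margins = clf.predict(X, output_margin=True)   # raw scores, early return above
    proba = clf.predict_proba(X)                   # (n_samples, n_classes) probabilities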
@@ -1493,18 +1475,3 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
         self.objective = params["objective"]
         self._set_evaluation_result(evals_result)
         return self
-
-    def predict(self, data, output_margin=False,
-                ntree_limit=0, validate_features=True, base_margin=None):
-
-        test_dmatrix = DMatrix(data, base_margin=base_margin,
-                               missing=self.missing)
-        if ntree_limit is None:
-            ntree_limit = getattr(self, "best_ntree_limit", 0)
-
-        return self.get_booster().predict(test_dmatrix,
-                                          output_margin=output_margin,
-                                          ntree_limit=ntree_limit,
-                                          validate_features=validate_features)
-
-    predict.__doc__ = XGBModel.predict.__doc__
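With its own copy deleted, `XGBRanker` now inherits `XGBModel.predict`; one visible difference is that the shared implementation passes `nthread=self.n_jobs` to the `DMatrix`, which the removed version did not. A toy sketch (group sizes and data are made up, not from the diff):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(20, 4)
    y = np.random.randint(0, 3, size=20)

    ranker = xgb.XGBRanker(n_estimators=4)
    ranker.fit(X, y, group=[10, 10])   # two query groups of 10 rows each

    scores = ranker.predict(X)         # the inherited XGBModel.predict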

View File

@@ -88,12 +88,9 @@ def _train_internal(params, dtrain,
     if evals_result is not None and is_new_callback:
         evals_result.update(callbacks.history)

-    if bst.attr('best_score') is not None:
-        bst.best_score = float(bst.attr('best_score'))
-        bst.best_iteration = int(bst.attr('best_iteration'))
-    else:
-        bst.best_iteration = bst.num_boosted_rounds() - 1
-
+    # These should be moved into callback functions `after_training`, but until old
+    # callbacks are removed, the train function is the only place for setting the
+    # attributes.
     config = json.loads(bst.save_config())
     booster = config['learner']['gradient_booster']['name']
     if booster == 'gblinear':
@@ -114,7 +111,20 @@ def _train_internal(params, dtrain,
     num_groups = int(config['learner']['learner_model_param']['num_class'])
     num_groups = 1 if num_groups == 0 else num_groups
-    bst.best_ntree_limit = ((bst.best_iteration + 1) * num_parallel_tree * num_groups)
+
+    if bst.attr('best_score') is not None:
+        bst.best_score = float(bst.attr('best_score'))
+        bst.best_iteration = int(bst.attr('best_iteration'))
+        bst.set_attr(
+            best_ntree_limit=str(
+                (bst.best_iteration + 1) * num_parallel_tree * num_groups
+            )
+        )
+        bst.best_ntree_limit = int(bst.attr("best_ntree_limit"))
+    else:
+        # Due to compatibility with version older than 1.4, these attributes are added
+        # to Python object even if early stopping is not used.
+        bst.best_iteration = bst.num_boosted_rounds() - 1
+        bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree * num_groups

     # Copy to serialise and unserialise booster to reset state and free
     # training memory
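The formula is the same in both branches: `best_ntree_limit = (best_iteration + 1) * num_parallel_tree * num_groups`. A worked example with illustrative numbers (not from the diff):

    best_iteration = 9       # best score observed at the 10th boosting round
    num_parallel_tree = 1    # plain gbtree, no boosted random forest
    num_groups = 3           # e.g. a 3-class softprob model builds 3 trees per round

    best_ntree_limit = (best_iteration + 1) * num_parallel_tree * num_groups
    assert best_ntree_limit == 30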
@@ -148,15 +158,16 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
         Activates early stopping. Validation metric needs to improve at least once in
         every **early_stopping_rounds** round(s) to continue training.
         Requires at least one item in **evals**.
-        The method returns the model from the last iteration (not the best one).
-        If there's more than one item in **evals**, the last entry will be used
-        for early stopping.
+        The method returns the model from the last iteration (not the best one). Use
+        custom callback or model slicing if the best model is desired.
+        If there's more than one item in **evals**, the last entry will be used for early
+        stopping.
         If there's more than one metric in the **eval_metric** parameter given in
         **params**, the last metric will be used for early stopping.
         If early stopping occurs, the model will have three additional fields:
-        ``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``.
-        (Use ``bst.best_ntree_limit`` to get the correct value if
-        ``num_parallel_tree`` and/or ``num_class`` appears in the parameters)
+        ``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``. (Use
+        ``bst.best_ntree_limit`` to get the correct value if ``num_parallel_tree`` and/or
+        ``num_class`` appears in the parameters)
     evals_result: dict
         This dictionary stores the evaluation results of all the items in watchlist.
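A sketch of the workflow this docstring describes, including the model slicing it now points to as an alternative to relying on `best_ntree_limit`; the data is synthetic and the snippet is illustrative, not part of the diff:

    import numpy as np
    import xgboost as xgb

    X, y = np.random.rand(200, 5), np.random.rand(200)
    dtrain = xgb.DMatrix(X[:150], y[:150])
    dvalid = xgb.DMatrix(X[150:], y[150:])

    bst = xgb.train({"objective": "reg:squarederror"}, dtrain,
                    num_boost_round=100, evals=[(dvalid, "valid")],
                    early_stopping_rounds=5)

    # The returned booster keeps every round; these fields mark the best one.
    print(bst.best_score, bst.best_iteration, bst.best_ntree_limit)

    # Model slicing: keep only the trees up to and including the best iteration.
    best = bst[: bst.best_iteration + 1]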

View File

@@ -341,6 +341,25 @@ class TestModels:
                       'objective': 'multi:softmax'}
         validate_model(parameters)

+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_attributes(self):
+        from sklearn.datasets import load_iris
+        X, y = load_iris(return_X_y=True)
+        cls = xgb.XGBClassifier(n_estimators=2)
+        cls.fit(X, y, early_stopping_rounds=1, eval_set=[(X, y)])
+        assert cls.get_booster().best_ntree_limit == 2 * cls.n_classes_
+        assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path = os.path.join(tmpdir, "cls.json")
+            cls.save_model(path)
+            cls = xgb.XGBClassifier(n_estimators=2)
+            cls.load_model(path)
+            assert cls.get_booster().best_ntree_limit == 2 * cls.n_classes_
+            assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit
+
+    @pytest.mark.skipif(**tm.no_sklearn())
     @pytest.mark.parametrize('booster', ['gbtree', 'dart'])
     def test_slice(self, booster):
         from sklearn.datasets import make_classification