[breaking] Remove duplicated predict functions, Fix attributes IO. (#6593)

* Fix attributes not being restored.
* Rename all `data` to `X`. [breaking]
Jiaming Yuan 2021-01-13 16:56:49 +08:00 committed by GitHub
parent 7f4d3a91b9
commit 0027220aa0
4 changed files with 91 additions and 87 deletions
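The `data` -> `X` rename changes the keyword name of the first argument of `predict` in the sklearn wrappers, which is what makes this commit breaking. A minimal sketch of the user-facing effect with a small synthetic dataset (variable names below are illustrative, not part of the diff):

    import numpy as np
    import xgboost as xgb

    X_train = np.random.rand(50, 4)
    y_train = np.random.randint(0, 2, size=50)
    clf = xgb.XGBClassifier(n_estimators=2).fit(X_train, y_train)

    clf.predict(X_train)         # positional call: unchanged
    clf.predict(X=X_train)       # keyword is now ``X``
    # clf.predict(data=X_train)  # would now raise TypeError (the [breaking] part)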


@@ -1741,6 +1741,13 @@ class Booster(object):
else:
raise TypeError('Unknown file type: ', fname)
if self.attr("best_iteration") is not None:
self.best_iteration = int(self.attr("best_iteration"))
if self.attr("best_score") is not None:
self.best_score = float(self.attr("best_score"))
if self.attr("best_ntree_limit") is not None:
self.best_ntree_limit = int(self.attr("best_ntree_limit"))
def num_boosted_rounds(self) -> int:
'''Get number of boosted rounds. For gblinear this is reset to 0 after
serializing the model.
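The hunk above makes `Booster.load_model` restore `best_iteration`, `best_score` and `best_ntree_limit` as Python attributes from the stored booster attributes, so early-stopping information is no longer lost on reload. A rough sketch of the behaviour with synthetic data (file name and parameters are placeholders):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(100, 4)
    y = np.random.randint(0, 2, size=100)
    dtrain = xgb.DMatrix(X, label=y)

    bst = xgb.train({"objective": "binary:logistic"}, dtrain,
                    num_boost_round=10, evals=[(dtrain, "train")],
                    early_stopping_rounds=2)
    bst.save_model("model.json")

    loaded = xgb.Booster()
    loaded.load_model("model.json")
    # With this fix the stopping information survives the round trip.
    print(loaded.best_iteration, loaded.best_score, loaded.best_ntree_limit)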


@@ -682,10 +682,16 @@ class XGBModel(XGBModelBase):
self._set_evaluation_result(evals_result)
return self
def predict(self, data, output_margin=False, ntree_limit=None,
validate_features=True, base_margin=None):
def predict(
self,
X,
output_margin=False,
ntree_limit=None,
validate_features=True,
base_margin=None
):
"""
Predict with `data`.
Predict with `X`.
.. note:: This function is not thread safe.
@@ -699,7 +705,7 @@ class XGBModel(XGBModelBase):
Parameters
----------
data : array_like
X : array_like
Data to predict with
output_margin : bool
Whether to output the raw untransformed margin value.
@@ -718,16 +724,21 @@ class XGBModel(XGBModelBase):
prediction : numpy array
"""
# pylint: disable=missing-docstring,invalid-name
test_dmatrix = DMatrix(data, base_margin=base_margin,
test_dmatrix = DMatrix(X, base_margin=base_margin,
missing=self.missing, nthread=self.n_jobs)
# get ntree_limit to use - if none specified, default to
# best_ntree_limit if defined, otherwise 0.
if ntree_limit is None:
ntree_limit = getattr(self, "best_ntree_limit", 0)
return self.get_booster().predict(test_dmatrix,
output_margin=output_margin,
ntree_limit=ntree_limit,
validate_features=validate_features)
try:
ntree_limit = self.best_ntree_limit
except AttributeError:
ntree_limit = 0
return self.get_booster().predict(
test_dmatrix,
output_margin=output_margin,
ntree_limit=ntree_limit,
validate_features=validate_features
)
def apply(self, X, ntree_limit=0):
"""Return the predicted leaf every tree for each sample.
@@ -1037,50 +1048,21 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
'Fit gradient boosting model',
'Fit gradient boosting classifier', 1)
def predict(self, data, output_margin=False, ntree_limit=None,
validate_features=True, base_margin=None):
"""
Predict with `data`.
.. note:: This function is not thread safe.
For each booster object, predict can only be called from one thread.
If you want to run prediction using multiple thread, call
``xgb.copy()`` to make copies of model object and then call
``predict()``.
.. code-block:: python
preds = bst.predict(dtest, ntree_limit=num_round)
Parameters
----------
data : array_like
Feature matrix.
output_margin : bool
Whether to output the raw untransformed margin value.
ntree_limit : int
Limit number of trees in the prediction; defaults to
best_ntree_limit if defined (i.e. it has been trained with early
stopping), otherwise 0 (use all trees).
validate_features : bool
When this is True, validate that the Booster's and data's
feature_names are identical. Otherwise, it is assumed that the
feature_names are the same.
Returns
-------
prediction : numpy array
"""
test_dmatrix = DMatrix(data, base_margin=base_margin,
missing=self.missing, nthread=self.n_jobs)
if ntree_limit is None:
ntree_limit = getattr(self, "best_ntree_limit", 0)
class_probs = self.get_booster().predict(
test_dmatrix,
def predict(
self,
X,
output_margin=False,
ntree_limit=None,
validate_features=True,
base_margin=None
):
class_probs = super().predict(
X=X,
output_margin=output_margin,
ntree_limit=ntree_limit,
validate_features=validate_features)
validate_features=validate_features,
base_margin=base_margin
)
if output_margin:
# If output_margin is active, simply return the scores
return class_probs
@@ -1125,13 +1107,13 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
a numpy array of shape array-like of shape (n_samples, n_classes) with the
probability of each data example being of a given class.
"""
test_dmatrix = DMatrix(X, base_margin=base_margin,
missing=self.missing, nthread=self.n_jobs)
if ntree_limit is None:
ntree_limit = getattr(self, "best_ntree_limit", 0)
class_probs = self.get_booster().predict(test_dmatrix,
ntree_limit=ntree_limit,
validate_features=validate_features)
class_probs = super().predict(
X=X,
output_margin=False,
ntree_limit=ntree_limit,
validate_features=validate_features,
base_margin=base_margin
)
return _cls_predict_proba(self.objective, class_probs, np.vstack)
def evals_result(self):
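Both `XGBClassifier.predict` and `predict_proba` now delegate to `XGBModel.predict`, so the DMatrix construction and the `ntree_limit`/`base_margin` handling live in one place instead of several near-identical copies. A hedged usage sketch with synthetic data (names are illustrative):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(150, 4)
    y = np.random.randint(0, 3, size=150)

    clf = xgb.XGBClassifier(n_estimators=5).fit(X, y)
    labels = clf.predict(X)        # class labels, routed through XGBModel.predict
    proba = clf.predict_proba(X)   # per-class probabilities, same shared code path
    assert proba.shape == (150, 3)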
@@ -1493,18 +1475,3 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
self.objective = params["objective"]
self._set_evaluation_result(evals_result)
return self
def predict(self, data, output_margin=False,
ntree_limit=0, validate_features=True, base_margin=None):
test_dmatrix = DMatrix(data, base_margin=base_margin,
missing=self.missing)
if ntree_limit is None:
ntree_limit = getattr(self, "best_ntree_limit", 0)
return self.get_booster().predict(test_dmatrix,
output_margin=output_margin,
ntree_limit=ntree_limit,
validate_features=validate_features)
predict.__doc__ = XGBModel.predict.__doc__
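With the duplicated body removed, `XGBRanker` simply inherits `predict` from `XGBModel`, so the explicit `__doc__` assignment is no longer needed either. A rough sketch of the call after this change (toy data with two query groups; names are illustrative):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(20, 3)
    y = np.random.randint(0, 5, size=20)

    ranker = xgb.XGBRanker(n_estimators=4)
    ranker.fit(X, y, group=[10, 10])   # two query groups of ten rows each
    scores = ranker.predict(X)         # inherited XGBModel.predict; argument is now ``X``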


@@ -88,12 +88,9 @@ def _train_internal(params, dtrain,
if evals_result is not None and is_new_callback:
evals_result.update(callbacks.history)
if bst.attr('best_score') is not None:
bst.best_score = float(bst.attr('best_score'))
bst.best_iteration = int(bst.attr('best_iteration'))
else:
bst.best_iteration = bst.num_boosted_rounds() - 1
# These should be moved into callback functions `after_training`, but until old
# callbacks are removed, the train function is the only place for setting the
# attributes.
config = json.loads(bst.save_config())
booster = config['learner']['gradient_booster']['name']
if booster == 'gblinear':
@@ -114,7 +111,20 @@
num_groups = int(config['learner']['learner_model_param']['num_class'])
num_groups = 1 if num_groups == 0 else num_groups
bst.best_ntree_limit = ((bst.best_iteration + 1) * num_parallel_tree * num_groups)
if bst.attr('best_score') is not None:
bst.best_score = float(bst.attr('best_score'))
bst.best_iteration = int(bst.attr('best_iteration'))
bst.set_attr(
best_ntree_limit=str(
(bst.best_iteration + 1) * num_parallel_tree * num_groups
)
)
bst.best_ntree_limit = int(bst.attr("best_ntree_limit"))
else:
# Due to compatibility with version older than 1.4, these attributes are added
# to Python object even if early stopping is not used.
bst.best_iteration = bst.num_boosted_rounds() - 1
bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree * num_groups
# Copy to serialise and unserialise booster to reset state and free
# training memory
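Both branches of the relocated block compute `best_ntree_limit` as `(best_iteration + 1) * num_parallel_tree * num_groups`, where `num_groups` is the number of classes (1 for non-multiclass objectives). Plugging in the numbers from the new `test_attributes` case further down (2 boosting rounds on iris, default `num_parallel_tree`, no rounds discarded by early stopping):

    best_iteration = 1       # last round, zero-based
    num_parallel_tree = 1    # default
    num_groups = 3           # iris has three classes
    best_ntree_limit = (best_iteration + 1) * num_parallel_tree * num_groups
    assert best_ntree_limit == 6   # == 2 * n_classes_, as the test asserts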
@@ -148,15 +158,16 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
Activates early stopping. Validation metric needs to improve at least once in
every **early_stopping_rounds** round(s) to continue training.
Requires at least one item in **evals**.
The method returns the model from the last iteration (not the best one).
If there's more than one item in **evals**, the last entry will be used
for early stopping.
The method returns the model from the last iteration (not the best one). Use
custom callback or model slicing if the best model is desired.
If there's more than one item in **evals**, the last entry will be used for early
stopping.
If there's more than one metric in the **eval_metric** parameter given in
**params**, the last metric will be used for early stopping.
If early stopping occurs, the model will have three additional fields:
``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``.
(Use ``bst.best_ntree_limit`` to get the correct value if
``num_parallel_tree`` and/or ``num_class`` appears in the parameters)
``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``. (Use
``bst.best_ntree_limit`` to get the correct value if ``num_parallel_tree`` and/or
``num_class`` appears in the parameters)
evals_result: dict
This dictionary stores the evaluation results of all the items in watchlist.
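The reworded docstring points to model slicing as the way to recover the best model, since `train` returns the booster from the last iteration. A hedged sketch of that pattern with synthetic data (`bst[begin:end]` is the booster slicing the docstring refers to):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(100, 4)
    y = np.random.randint(0, 2, size=100)
    dtrain = xgb.DMatrix(X, label=y)

    bst = xgb.train({"objective": "binary:logistic"}, dtrain,
                    num_boost_round=100, evals=[(dtrain, "train")],
                    early_stopping_rounds=5)
    # Keep only the rounds up to and including the best one.
    best = bst[: bst.best_iteration + 1]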


@@ -341,6 +341,25 @@ class TestModels:
'objective': 'multi:softmax'}
validate_model(parameters)
@pytest.mark.skipif(**tm.no_sklearn())
def test_attributes(self):
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
cls = xgb.XGBClassifier(n_estimators=2)
cls.fit(X, y, early_stopping_rounds=1, eval_set=[(X, y)])
assert cls.get_booster().best_ntree_limit == 2 * cls.n_classes_
assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit
with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, "cls.json")
cls.save_model(path)
cls = xgb.XGBClassifier(n_estimators=2)
cls.load_model(path)
assert cls.get_booster().best_ntree_limit == 2 * cls.n_classes_
assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit
@pytest.mark.skipif(**tm.no_sklearn())
@pytest.mark.parametrize('booster', ['gbtree', 'dart'])
def test_slice(self, booster):
from sklearn.datasets import make_classification