Save Scikit-Learn attributes into learner attributes. (#5245)

* Remove the recommendation for pickle.

* Save skl attributes in booster.attr

* Test loading scikit-learn model with native booster.
This commit is contained in:
Jiaming Yuan
2020-01-30 16:00:18 +08:00
committed by GitHub
parent c67163250e
commit 472ded549d
8 changed files with 194 additions and 57 deletions

View File

@@ -4,9 +4,10 @@
import abc
import os
import sys
from pathlib import PurePath
import numpy as np
assert (sys.version_info[0] == 3), 'Python 2 is no longer supported.'
# pylint: disable=invalid-name, redefined-builtin
@@ -148,7 +149,29 @@ try:
XGBKFold = KFold
XGBStratifiedKFold = StratifiedKFold
XGBLabelEncoder = LabelEncoder
class XGBoostLabelEncoder(LabelEncoder):
'''Label encoder with JSON serialization methods.'''
def to_json(self):
'''Returns a JSON compatible dictionary'''
meta = dict()
for k, v in self.__dict__.items():
if isinstance(v, np.ndarray):
meta[k] = v.tolist()
else:
meta[k] = v
return meta
def from_json(self, doc):
# pylint: disable=attribute-defined-outside-init
'''Load the encoder back from a JSON compatible dict.'''
meta = dict()
for k, v in doc.items():
if k == 'classes_':
self.classes_ = np.array(v)
continue
meta[k] = v
self.__dict__.update(meta)
except ImportError:
SKLEARN_INSTALLED = False
@@ -159,7 +182,7 @@ except ImportError:
XGBKFold = None
XGBStratifiedKFold = None
XGBLabelEncoder = None
XGBoostLabelEncoder = None
# dask

View File

@@ -11,7 +11,7 @@ from .training import train
# Do not use class names on scikit-learn directly. Re-define the classes on
# .compat to guarantee the behavior without scikit-learn
from .compat import (SKLEARN_INSTALLED, XGBModelBase,
XGBClassifierBase, XGBRegressorBase, XGBLabelEncoder)
XGBClassifierBase, XGBRegressorBase, XGBoostLabelEncoder)
def _objective_decorator(func):
@@ -330,54 +330,96 @@ class XGBModel(XGBModelBase):
"""Gets the number of xgboost boosting rounds."""
return self.n_estimators
def save_model(self, fname):
"""
Save the model to a file.
def save_model(self, fname: str):
"""Save the model to a file.
The model is saved in an XGBoost internal binary format which is
universal among the various XGBoost interfaces. Auxiliary attributes of
the Python Booster object (such as feature names) will not be loaded.
Label encodings (text labels to numeric labels) will be also lost.
**If you are using only the Python interface, we recommend pickling the
model object for best results.**
The model is saved in an XGBoost internal format which is universal
among the various XGBoost interfaces. Auxiliary attributes of the
Python Booster object (such as feature names) will not be saved.
.. note::
See:
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
Parameters
----------
fname : string
Output file name
"""
warnings.warn("save_model: Useful attributes in the Python " +
"object {} will be lost. ".format(type(self).__name__) +
"If you did not mean to export the model to " +
"a non-Python binding of XGBoost, consider " +
"using `pickle` or `joblib` to save your model.",
Warning)
meta = dict()
for k, v in self.__dict__.items():
if k == '_le':
meta['_le'] = self._le.to_json()
continue
if k == '_Booster':
continue
if k == 'classes_':
# numpy array is not JSON serializable
meta['classes_'] = self.classes_.tolist()
continue
try:
json.dumps({k: v})
meta[k] = v
except TypeError:
warnings.warn(str(k) + ' is not saved in Scikit-Learn meta.')
meta['type'] = type(self).__name__
meta = json.dumps(meta)
self.get_booster().set_attr(scikit_learn=meta)
self.get_booster().save_model(fname)
# Delete the attribute after save
self.get_booster().set_attr(scikit_learn=None)
def load_model(self, fname):
"""
Load the model from a file.
# pylint: disable=attribute-defined-outside-init
"""Load the model from a file.
The model is loaded from an XGBoost internal binary format which is
universal among the various XGBoost interfaces. Auxiliary attributes of
the Python Booster object (such as feature names) will not be loaded.
Label encodings (text labels to numeric labels) will be also lost.
**If you are using only the Python interface, we recommend pickling the
model object for best results.**
The model is loaded from an XGBoost internal format which is universal
among the various XGBoost interfaces. Auxiliary attributes of the
Python Booster object (such as feature names) will not be loaded.
Parameters
----------
fname : string or a memory buffer
Input file name or memory buffer(see also save_raw)
fname : string
Input file name.
"""
if self._Booster is None:
self._Booster = Booster({'n_jobs': self.n_jobs})
self._Booster.load_model(fname)
meta = self._Booster.attr('scikit_learn')
if meta is None:
warnings.warn(
'Loading a native XGBoost model with Scikit-Learn interface.')
return
meta = json.loads(meta)
states = dict()
for k, v in meta.items():
if k == '_le':
self._le = XGBoostLabelEncoder()
self._le.from_json(v)
continue
if k == 'classes_':
self.classes_ = np.array(v)
continue
if k == 'type' and type(self).__name__ != v:
msg = f'Current model type: {type(self).__name__}, ' + \
f'type of model in file: {v}'
raise TypeError(msg)
if k == 'type':
continue
states[k] = v
self.__dict__.update(states)
# Delete the attribute after load
self.get_booster().set_attr(scikit_learn=None)
def fit(self, X, y, sample_weight=None, base_margin=None,
eval_set=None, eval_metric=None, early_stopping_rounds=None,
verbose=True, xgb_model=None, sample_weight_eval_set=None, callbacks=None):
# pylint: disable=missing-docstring,invalid-name,attribute-defined-outside-init
verbose=True, xgb_model=None, sample_weight_eval_set=None,
callbacks=None):
# pylint: disable=invalid-name,attribute-defined-outside-init
"""Fit gradient boosting model
Parameters
@@ -678,7 +720,7 @@ class XGBModel(XGBModelBase):
"Implementation of the scikit-learn API for XGBoost classification.",
['model', 'objective'])
class XGBClassifier(XGBModel, XGBClassifierBase):
# pylint: disable=missing-docstring,too-many-arguments,invalid-name,too-many-instance-attributes
# pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes
def __init__(self, objective="binary:logistic", **kwargs):
super().__init__(objective=objective, **kwargs)
@@ -714,7 +756,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
else:
xgb_options.update({"eval_metric": eval_metric})
self._le = XGBLabelEncoder().fit(y)
self._le = XGBoostLabelEncoder().fit(y)
training_labels = self._le.transform(y)
if eval_set is not None:
@@ -809,10 +851,11 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
missing=self.missing, nthread=self.n_jobs)
if ntree_limit is None:
ntree_limit = getattr(self, "best_ntree_limit", 0)
class_probs = self.get_booster().predict(test_dmatrix,
output_margin=output_margin,
ntree_limit=ntree_limit,
validate_features=validate_features)
class_probs = self.get_booster().predict(
test_dmatrix,
output_margin=output_margin,
ntree_limit=ntree_limit,
validate_features=validate_features)
if output_margin:
# If output_margin is active, simply return the scores
return class_probs
@@ -822,7 +865,12 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
else:
column_indexes = np.repeat(0, class_probs.shape[0])
column_indexes[class_probs > 0.5] = 1
return self._le.inverse_transform(column_indexes)
if hasattr(self, '_le'):
return self._le.inverse_transform(column_indexes)
warnings.warn(
'Label encoder is not defined. Returning class probability.')
return class_probs
def predict_proba(self, data, ntree_limit=None, validate_features=True,
base_margin=None):