[breaking] Save booster feature info in JSON, remove feature name generation. (#6605)

* Save the feature info in the booster when using the JSON model format.
* [breaking] Remove automatic feature name generation in `DMatrix`.

This PR enables reliable feature validation in the Python package.
This commit is contained in:
Jiaming Yuan
2021-02-25 18:54:16 +08:00
committed by GitHub
parent b6167cd2ff
commit 9da2287ab8
12 changed files with 363 additions and 36 deletions

View File

@@ -77,7 +77,7 @@ def from_pystr_to_cstr(data: Union[str, List[str]]):
raise TypeError()
def from_cstr_to_pystr(data, length):
def from_cstr_to_pystr(data, length) -> List[str]:
"""Revert C pointer to Python str
Parameters
@@ -869,7 +869,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
)
feature_names = from_cstr_to_pystr(sarr, length)
if not feature_names:
feature_names = ["f{0}".format(i) for i in range(self.num_col())]
return None
return feature_names
@feature_names.setter
@@ -1167,9 +1167,6 @@ class Booster(object):
training, prediction and evaluation.
"""
feature_names = None
feature_types = None
def __init__(self, params=None, cache=(), model_file=None):
# pylint: disable=invalid-name
"""
@@ -1185,12 +1182,15 @@ class Booster(object):
for d in cache:
if not isinstance(d, DMatrix):
raise TypeError('invalid cache item: {}'.format(type(d).__name__), cache)
self._validate_features(d)
dmats = c_array(ctypes.c_void_p, [d.handle for d in cache])
self.handle = ctypes.c_void_p()
_check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)),
ctypes.byref(self.handle)))
for d in cache:
# Validate feature only after the feature names are saved into booster.
self._validate_features(d)
params = params or {}
params = self._configure_metrics(params.copy())
if isinstance(params, list):
@@ -1400,6 +1400,60 @@ class Booster(object):
_check_call(_LIB.XGBoosterSetAttr(
self.handle, c_str(key), value))
def _get_feature_info(self, field: str):
    """Fetch a string feature-info field (e.g. ``"feature_name"`` or
    ``"feature_type"``) from the underlying booster via the C API.

    Parameters
    ----------
    field : str
        Name of the feature-info field to query.

    Returns
    -------
    Optional[List[str]]
        The stored strings, or ``None`` when the booster handle does not
        exist yet or the field is empty.
    """
    # Guard first: nothing to query before the native booster exists,
    # and there is no point allocating ctypes buffers in that case.
    if not hasattr(self, "handle") or self.handle is None:
        return None
    length = c_bst_ulong()
    sarr = ctypes.POINTER(ctypes.c_char_p)()
    _check_call(
        _LIB.XGBoosterGetStrFeatureInfo(
            self.handle, c_str(field), ctypes.byref(length), ctypes.byref(sarr),
        )
    )
    feature_info = from_cstr_to_pystr(sarr, length)
    # The C API yields an empty array when the field is unset; normalize to None.
    return feature_info if feature_info else None
@property
def feature_types(self) -> Optional[List[str]]:
    """The feature types stored in this booster.

    Populated from the input data, or set explicitly by assigning to
    this property.
    """
    info = self._get_feature_info("feature_type")
    return info
@property
def feature_names(self) -> Optional[List[str]]:
    """The feature names stored in this booster.

    Populated from the input data, or set explicitly by assigning to
    this property.
    """
    info = self._get_feature_info("feature_name")
    return info
def _set_feature_info(self, features: Optional[List[str]], field: str) -> None:
    """Store a string feature-info field in the underlying booster.

    Parameters
    ----------
    features : Optional[List[str]]
        Strings to store.  ``None`` clears the field.
    field : str
        Name of the feature-info field (e.g. ``"feature_name"``).

    Raises
    ------
    TypeError
        If ``features`` is neither ``None`` nor a list.
    """
    if features is None:
        # A null pointer with length 0 clears the stored info.
        _check_call(
            _LIB.XGBoosterSetStrFeatureInfo(
                self.handle, c_str(field), None, c_bst_ulong(0)
            )
        )
        return
    # Raise explicitly instead of `assert`, which is stripped under -O.
    if not isinstance(features, list):
        raise TypeError("Expecting a list of strings for: " + field)
    c_feature_info = [bytes(f, encoding="utf-8") for f in features]
    c_feature_info = (ctypes.c_char_p * len(c_feature_info))(*c_feature_info)
    _check_call(
        _LIB.XGBoosterSetStrFeatureInfo(
            self.handle, c_str(field), c_feature_info, c_bst_ulong(len(features))
        )
    )
@feature_names.setter
def feature_names(self, features: Optional[List[str]]) -> None:
    # Delegates to the C API so the names are stored inside the booster
    # itself (and therefore survive JSON (de)serialization).
    self._set_feature_info(features, "feature_name")
@feature_types.setter
def feature_types(self, features: Optional[List[str]]) -> None:
    # Delegates to the C API so the types are stored inside the booster
    # itself (and therefore survive JSON (de)serialization).
    self._set_feature_info(features, "feature_type")
def set_param(self, params, value=None):
"""Set parameters into the Booster.
@@ -1859,9 +1913,10 @@ class Booster(object):
def save_model(self, fname):
"""Save the model to a file.
The model is saved in an XGBoost internal format which is universal
among the various XGBoost interfaces. Auxiliary attributes of the
Python Booster object (such as feature_names) will not be saved. See:
The model is saved in an XGBoost internal format which is universal among the
various XGBoost interfaces. Auxiliary attributes of the Python Booster object
(such as feature_names) will not be saved when using binary format. To save those
attributes, use JSON instead. See:
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
@@ -1898,9 +1953,10 @@ class Booster(object):
"""Load the model from a file or bytearray. Path to file can be local
or as an URI.
The model is loaded from XGBoost format which is universal among the
various XGBoost interfaces. Auxiliary attributes of the Python Booster
object (such as feature_names) will not be loaded. See:
The model is loaded from XGBoost format which is universal among the various
XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
feature_names) will not be loaded when using binary format. To save those
attributes, use JSON instead. See:
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
@@ -2249,7 +2305,7 @@ class Booster(object):
# pylint: disable=no-member
return df.sort(['Tree', 'Node']).reset_index(drop=True)
def _validate_features(self, data):
def _validate_features(self, data: DMatrix):
"""
Validate Booster and data's feature_names are identical.
Set feature_names and feature_types from DMatrix
@@ -2260,24 +2316,27 @@ class Booster(object):
if self.feature_names is None:
self.feature_names = data.feature_names
self.feature_types = data.feature_types
else:
# Booster can't accept data with different feature names
if self.feature_names != data.feature_names:
dat_missing = set(self.feature_names) - set(data.feature_names)
my_missing = set(data.feature_names) - set(self.feature_names)
if data.feature_names is None and self.feature_names is not None:
raise ValueError(
"training data did not have the following fields: " +
", ".join(self.feature_names)
)
# Booster can't accept data with different feature names
if self.feature_names != data.feature_names:
dat_missing = set(self.feature_names) - set(data.feature_names)
my_missing = set(data.feature_names) - set(self.feature_names)
msg = 'feature_names mismatch: {0} {1}'
msg = 'feature_names mismatch: {0} {1}'
if dat_missing:
msg += ('\nexpected ' + ', '.join(
str(s) for s in dat_missing) + ' in input data')
if dat_missing:
msg += ('\nexpected ' + ', '.join(
str(s) for s in dat_missing) + ' in input data')
if my_missing:
msg += ('\ntraining data did not have the following fields: ' +
', '.join(str(s) for s in my_missing))
if my_missing:
msg += ('\ntraining data did not have the following fields: ' +
', '.join(str(s) for s in my_missing))
raise ValueError(msg.format(self.feature_names,
data.feature_names))
raise ValueError(msg.format(self.feature_names, data.feature_names))
def get_split_value_histogram(self, feature, fmap='', bins=None,
as_pandas=True):

View File

@@ -958,9 +958,13 @@ class XGBModel(XGBModelBase):
raise AttributeError(
'Feature importance is not defined for Booster type {}'
.format(self.booster))
b = self.get_booster()
b: Booster = self.get_booster()
score = b.get_score(importance_type=self.importance_type)
all_features = [score.get(f, 0.) for f in b.feature_names]
if b.feature_names is None:
feature_names = ["f{0}".format(i) for i in range(self.n_features_in_)]
else:
feature_names = b.feature_names
all_features = [score.get(f, 0.) for f in feature_names]
all_features = np.array(all_features, dtype=np.float32)
total = all_features.sum()
if total == 0: