[breaking] Save booster feature info in JSON, remove feature name generation. (#6605)

* Save the feature info in the booster when using the JSON model format.
* [breaking] Remove automatic feature name generation in `DMatrix`.

This PR enables reliable feature validation in the Python package.
This commit is contained in:
Jiaming Yuan
2021-02-25 18:54:16 +08:00
committed by GitHub
parent b6167cd2ff
commit 9da2287ab8
12 changed files with 363 additions and 36 deletions

View File

@@ -77,7 +77,7 @@ def from_pystr_to_cstr(data: Union[str, List[str]]):
raise TypeError()
def from_cstr_to_pystr(data, length):
def from_cstr_to_pystr(data, length) -> List[str]:
"""Revert C pointer to Python str
Parameters
@@ -869,7 +869,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
)
feature_names = from_cstr_to_pystr(sarr, length)
if not feature_names:
feature_names = ["f{0}".format(i) for i in range(self.num_col())]
return None
return feature_names
@feature_names.setter
@@ -1167,9 +1167,6 @@ class Booster(object):
training, prediction and evaluation.
"""
feature_names = None
feature_types = None
def __init__(self, params=None, cache=(), model_file=None):
# pylint: disable=invalid-name
"""
@@ -1185,12 +1182,15 @@ class Booster(object):
for d in cache:
if not isinstance(d, DMatrix):
raise TypeError('invalid cache item: {}'.format(type(d).__name__), cache)
self._validate_features(d)
dmats = c_array(ctypes.c_void_p, [d.handle for d in cache])
self.handle = ctypes.c_void_p()
_check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)),
ctypes.byref(self.handle)))
for d in cache:
# Validate feature only after the feature names are saved into booster.
self._validate_features(d)
params = params or {}
params = self._configure_metrics(params.copy())
if isinstance(params, list):
@@ -1400,6 +1400,60 @@ class Booster(object):
_check_call(_LIB.XGBoosterSetAttr(
self.handle, c_str(key), value))
def _get_feature_info(self, field: str):
    """Fetch a string feature-info field (e.g. ``"feature_name"`` or
    ``"feature_type"``) from the underlying booster via the C API.

    Parameters
    ----------
    field : str
        Name of the feature-info field to query.

    Returns
    -------
    Optional[List[str]]
        The stored strings, or ``None`` when the booster handle does not
        exist yet or the field is empty.
    """
    # Guard first: nothing to query before the native booster exists,
    # and there is no point allocating ctypes buffers in that case.
    if not hasattr(self, "handle") or self.handle is None:
        return None
    length = c_bst_ulong()
    sarr = ctypes.POINTER(ctypes.c_char_p)()
    _check_call(
        _LIB.XGBoosterGetStrFeatureInfo(
            self.handle, c_str(field), ctypes.byref(length), ctypes.byref(sarr),
        )
    )
    feature_info = from_cstr_to_pystr(sarr, length)
    # The C API yields an empty array when the field is unset; normalize to None.
    return feature_info if feature_info else None
@property
def feature_types(self) -> Optional[List[str]]:
    """The feature types stored in this booster.

    Populated from the input data, or set explicitly by assigning to
    this property.
    """
    info = self._get_feature_info("feature_type")
    return info
@property
def feature_names(self) -> Optional[List[str]]:
    """The feature names stored in this booster.

    Populated from the input data, or set explicitly by assigning to
    this property.
    """
    info = self._get_feature_info("feature_name")
    return info
def _set_feature_info(self, features: Optional[List[str]], field: str) -> None:
    """Store a string feature-info field in the underlying booster.

    Parameters
    ----------
    features : Optional[List[str]]
        Strings to store.  ``None`` clears the field.
    field : str
        Name of the feature-info field (e.g. ``"feature_name"``).

    Raises
    ------
    TypeError
        If ``features`` is neither ``None`` nor a list.
    """
    if features is None:
        # A null pointer with length 0 clears the stored info.
        _check_call(
            _LIB.XGBoosterSetStrFeatureInfo(
                self.handle, c_str(field), None, c_bst_ulong(0)
            )
        )
        return
    # Raise explicitly instead of `assert`, which is stripped under -O.
    if not isinstance(features, list):
        raise TypeError("Expecting a list of strings for: " + field)
    c_feature_info = [bytes(f, encoding="utf-8") for f in features]
    c_feature_info = (ctypes.c_char_p * len(c_feature_info))(*c_feature_info)
    _check_call(
        _LIB.XGBoosterSetStrFeatureInfo(
            self.handle, c_str(field), c_feature_info, c_bst_ulong(len(features))
        )
    )
@feature_names.setter
def feature_names(self, features: Optional[List[str]]) -> None:
    # Delegates to the C API so the names are stored inside the booster
    # itself (and therefore survive JSON (de)serialization).
    self._set_feature_info(features, "feature_name")
@feature_types.setter
def feature_types(self, features: Optional[List[str]]) -> None:
    # Delegates to the C API so the types are stored inside the booster
    # itself (and therefore survive JSON (de)serialization).
    self._set_feature_info(features, "feature_type")
def set_param(self, params, value=None):
"""Set parameters into the Booster.
@@ -1859,9 +1913,10 @@ class Booster(object):
def save_model(self, fname):
"""Save the model to a file.
The model is saved in an XGBoost internal format which is universal
among the various XGBoost interfaces. Auxiliary attributes of the
Python Booster object (such as feature_names) will not be saved. See:
The model is saved in an XGBoost internal format which is universal among the
various XGBoost interfaces. Auxiliary attributes of the Python Booster object
(such as feature_names) will not be saved when using binary format. To save those
attributes, use JSON instead. See:
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
@@ -1898,9 +1953,10 @@ class Booster(object):
"""Load the model from a file or bytearray. Path to file can be local
or as an URI.
The model is loaded from XGBoost format which is universal among the
various XGBoost interfaces. Auxiliary attributes of the Python Booster
object (such as feature_names) will not be loaded. See:
The model is loaded from XGBoost format which is universal among the various
XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
feature_names) will not be loaded when using binary format. To save those
attributes, use JSON instead. See:
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
@@ -2249,7 +2305,7 @@ class Booster(object):
# pylint: disable=no-member
return df.sort(['Tree', 'Node']).reset_index(drop=True)
def _validate_features(self, data):
def _validate_features(self, data: DMatrix):
"""
Validate Booster and data's feature_names are identical.
Set feature_names and feature_types from DMatrix
@@ -2260,24 +2316,27 @@ class Booster(object):
if self.feature_names is None:
self.feature_names = data.feature_names
self.feature_types = data.feature_types
else:
# Booster can't accept data with different feature names
if self.feature_names != data.feature_names:
dat_missing = set(self.feature_names) - set(data.feature_names)
my_missing = set(data.feature_names) - set(self.feature_names)
if data.feature_names is None and self.feature_names is not None:
raise ValueError(
"training data did not have the following fields: " +
", ".join(self.feature_names)
)
# Booster can't accept data with different feature names
if self.feature_names != data.feature_names:
dat_missing = set(self.feature_names) - set(data.feature_names)
my_missing = set(data.feature_names) - set(self.feature_names)
msg = 'feature_names mismatch: {0} {1}'
msg = 'feature_names mismatch: {0} {1}'
if dat_missing:
msg += ('\nexpected ' + ', '.join(
str(s) for s in dat_missing) + ' in input data')
if dat_missing:
msg += ('\nexpected ' + ', '.join(
str(s) for s in dat_missing) + ' in input data')
if my_missing:
msg += ('\ntraining data did not have the following fields: ' +
', '.join(str(s) for s in my_missing))
if my_missing:
msg += ('\ntraining data did not have the following fields: ' +
', '.join(str(s) for s in my_missing))
raise ValueError(msg.format(self.feature_names,
data.feature_names))
raise ValueError(msg.format(self.feature_names, data.feature_names))
def get_split_value_histogram(self, feature, fmap='', bins=None,
as_pandas=True):

View File

@@ -958,9 +958,13 @@ class XGBModel(XGBModelBase):
raise AttributeError(
'Feature importance is not defined for Booster type {}'
.format(self.booster))
b = self.get_booster()
b: Booster = self.get_booster()
score = b.get_score(importance_type=self.importance_type)
all_features = [score.get(f, 0.) for f in b.feature_names]
if b.feature_names is None:
feature_names = ["f{0}".format(i) for i in range(self.n_features_in_)]
else:
feature_names = b.feature_names
all_features = [score.get(f, 0.) for f in feature_names]
all_features = np.array(all_features, dtype=np.float32)
total = all_features.sum()
if total == 0: