[breaking] Save booster feature info in JSON, remove feature name generation. (#6605)

* Save feature info in booster in JSON model. * [breaking] Remove automatic feature name generation in `DMatrix`. This PR is to enable reliable feature validation in Python package.
2021-02-25 18:54:16 +08:00 · 2021-02-25 18:54:16 +08:00 · 9da2287ab8
commit 9da2287ab8
parent b6167cd2ff
12 changed files with 363 additions and 36 deletions
--- a/doc/model.schema
+++ b/doc/model.schema
@ -88,6 +88,12 @@
                      "type": "number"
                    }
                  },
+                  "split_type": {
+                    "type": "array",
+                    "items": {
+                      "type": "integer"
+                    }
+                  },
                  "default_left": {
                    "type": "array",
                    "items": {
@ -247,6 +253,18 @@
    "learner": {
      "type": "object",
      "properties": {
+        "feature_names": {
+          "type": "array",
+          "items": {
+              "type": "string"
+          }
+        },
+        "feature_types": {
+          "type": "array",
+          "items": {
+              "type": "string"
+          }
+        },
        "gradient_booster": {
          "oneOf": [
            {
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@ -1132,4 +1132,46 @@ XGB_DLL int XGBoosterSetAttr(BoosterHandle handle,
 XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle,
                                  bst_ulong* out_len,
                                  const char*** out);
+
+/*!
+ * \brief Set string encoded feature info in Booster, similar to the feature
+ *        info in DMatrix.
+ *
+ * Accepted fields are:
+ *   - feature_name
+ *   - feature_type
+ *
+ * \param handle    An instance of Booster
+ * \param field     Field name
+ * \param features  Pointer to array of strings.
+ * \param size      Size of `features` pointer (number of strings passed in).
+ *
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGBoosterSetStrFeatureInfo(BoosterHandle handle, const char *field,
+                                       const char **features,
+                                       const bst_ulong size);
+
+/*!
+ * \brief Get string encoded feature info from Booster, similar to feature info
+ *        in DMatrix.
+ *
+ * Accepted fields are:
+ *   - feature_name
+ *   - feature_type
+ *
+ * Caller is responsible for copying out the data, before next call to any API
+ * function of XGBoost.
+ *
+ * \param handle       An instance of Booster
+ * \param field        Field name
+ * \param len          Size of output pointer `out_features` (number of strings returned).
+ * \param out_features Address of a pointer to array of strings. Result is stored in
+ *        thread local memory.
+ *
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field,
+                                       bst_ulong *len,
+                                       const char ***out_features);
 #endif  // XGBOOST_C_API_H_
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@ -213,6 +213,27 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
   * \return vector of attribute name strings.
   */
  virtual std::vector<std::string> GetAttrNames() const = 0;
+  /*!
+   * \brief Set the feature names for current booster.
+   * \param fn Input feature names
+   */
+  virtual  void SetFeatureNames(std::vector<std::string> const& fn) = 0;
+  /*!
+   * \brief Get the feature names for current booster.
+   * \param fn Output feature names
+   */
+  virtual void GetFeatureNames(std::vector<std::string>* fn) const = 0;
+  /*!
+   * \brief Set the feature types for current booster.
+   * \param ft Input feature types.
+   */
+  virtual void SetFeatureTypes(std::vector<std::string> const& ft) = 0;
+  /*!
+   * \brief Get the feature types for current booster.
+   * \param ft Output feature types
+   */
+  virtual void GetFeatureTypes(std::vector<std::string>* ft) const = 0;
+
  /*!
   * \return whether the model allow lazy checkpoint in rabit.
   */
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@ -77,7 +77,7 @@ def from_pystr_to_cstr(data: Union[str, List[str]]):
    raise TypeError()


-def from_cstr_to_pystr(data, length):
+def from_cstr_to_pystr(data, length) -> List[str]:
    """Revert C pointer to Python str

    Parameters
@ -869,7 +869,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
        )
        feature_names = from_cstr_to_pystr(sarr, length)
        if not feature_names:
-            feature_names = ["f{0}".format(i) for i in range(self.num_col())]
+            return None
        return feature_names

    @feature_names.setter
@ -1167,9 +1167,6 @@ class Booster(object):
    training, prediction and evaluation.
    """

-    feature_names = None
-    feature_types = None
-
    def __init__(self, params=None, cache=(), model_file=None):
        # pylint: disable=invalid-name
        """
@ -1185,12 +1182,15 @@ class Booster(object):
        for d in cache:
            if not isinstance(d, DMatrix):
                raise TypeError('invalid cache item: {}'.format(type(d).__name__), cache)
-            self._validate_features(d)

        dmats = c_array(ctypes.c_void_p, [d.handle for d in cache])
        self.handle = ctypes.c_void_p()
        _check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)),
                                         ctypes.byref(self.handle)))
+        for d in cache:
+            # Validate feature only after the feature names are saved into booster.
+            self._validate_features(d)
+
        params = params or {}
        params = self._configure_metrics(params.copy())
        if isinstance(params, list):
@ -1400,6 +1400,60 @@ class Booster(object):
            _check_call(_LIB.XGBoosterSetAttr(
                self.handle, c_str(key), value))

+    def _get_feature_info(self, field: str):
+        """Fetch a string-encoded feature info field from the underlying booster.
+
+        Returns a list of strings, or ``None`` when the booster has no handle or
+        the C API returns an empty array for ``field``.
+
+        """
+        if getattr(self, "handle", None) is None:
+            return None
+        n_items = c_bst_ulong()
+        c_strs = ctypes.POINTER(ctypes.c_char_p)()
+        _check_call(
+            _LIB.XGBoosterGetStrFeatureInfo(
+                self.handle, c_str(field), ctypes.byref(n_items), ctypes.byref(c_strs),
+            )
+        )
+        # An empty result is reported as ``None`` rather than ``[]``.
+        return from_cstr_to_pystr(c_strs, n_items) or None
+
+    @property
+    def feature_types(self) -> Optional[List[str]]:
+        """Feature types for this booster.  Can be directly set by input data or by
+        assignment.
+
+        Returns ``None`` when the booster has no handle or no feature types are
+        stored in it.
+
+        """
+        return self._get_feature_info("feature_type")
+
+    @property
+    def feature_names(self) -> Optional[List[str]]:
+        """Feature names for this booster.  Can be directly set by input data or by
+        assignment.
+
+        Returns ``None`` when the booster has no handle or no feature names are
+        stored in it.
+
+        """
+        return self._get_feature_info("feature_name")
+
+    def _set_feature_info(self, features: Optional[List[str]], field: str) -> None:
+        """Store a string-encoded feature info field in the underlying booster.
+
+        ``features`` must be a list of strings, or ``None`` to pass an empty
+        array (null pointer, size 0) to the C API.
+
+        """
+        if features is None:
+            c_features = None
+            n_features = 0
+        else:
+            assert isinstance(features, list)
+            encoded = [bytes(f, encoding="utf-8") for f in features]
+            c_features = (ctypes.c_char_p * len(encoded))(*encoded)
+            n_features = len(features)
+        _check_call(
+            _LIB.XGBoosterSetStrFeatureInfo(
+                self.handle, c_str(field), c_features, c_bst_ulong(n_features)
+            )
+        )
+
+    @feature_names.setter
+    def feature_names(self, features: Optional[List[str]]) -> None:
+        # ``None`` is forwarded to the C API as an empty array (size 0).
+        self._set_feature_info(features, "feature_name")
+
+    @feature_types.setter
+    def feature_types(self, features: Optional[List[str]]) -> None:
+        # ``None`` is forwarded to the C API as an empty array (size 0).
+        self._set_feature_info(features, "feature_type")
+
    def set_param(self, params, value=None):
        """Set parameters into the Booster.

@ -1859,9 +1913,10 @@ class Booster(object):
    def save_model(self, fname):
        """Save the model to a file.

-        The model is saved in an XGBoost internal format which is universal
-        among the various XGBoost interfaces. Auxiliary attributes of the
-        Python Booster object (such as feature_names) will not be saved.  See:
+        The model is saved in an XGBoost internal format which is universal among the
+        various XGBoost interfaces. Auxiliary attributes of the Python Booster object
+        (such as feature_names) will not be saved when using binary format.  To save those
+        attributes, use JSON instead. See:

          https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

@ -1898,9 +1953,10 @@ class Booster(object):
        """Load the model from a file or bytearray. Path to file can be local
        or as an URI.

-        The model is loaded from XGBoost format which is universal among the
-        various XGBoost interfaces. Auxiliary attributes of the Python Booster
-        object (such as feature_names) will not be loaded.  See:
+        The model is loaded from XGBoost format which is universal among the various
+        XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
+        feature_names) will not be loaded when using binary format.  To keep those
+        attributes, use JSON instead.  See:

          https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

@ -2249,7 +2305,7 @@ class Booster(object):
        # pylint: disable=no-member
        return df.sort(['Tree', 'Node']).reset_index(drop=True)

-    def _validate_features(self, data):
+    def _validate_features(self, data: DMatrix):
        """
        Validate Booster and data's feature_names are identical.
        Set feature_names and feature_types from DMatrix
@ -2260,24 +2316,27 @@ class Booster(object):
        if self.feature_names is None:
            self.feature_names = data.feature_names
            self.feature_types = data.feature_types
-        else:
-            # Booster can't accept data with different feature names
-            if self.feature_names != data.feature_names:
-                dat_missing = set(self.feature_names) - set(data.feature_names)
-                my_missing = set(data.feature_names) - set(self.feature_names)
+        if data.feature_names is None and self.feature_names is not None:
+            raise ValueError(
+                "training data did not have the following fields: " +
+                ", ".join(self.feature_names)
+            )
+        # Booster can't accept data with different feature names
+        if self.feature_names != data.feature_names:
+            dat_missing = set(self.feature_names) - set(data.feature_names)
+            my_missing = set(data.feature_names) - set(self.feature_names)

-                msg = 'feature_names mismatch: {0} {1}'
+            msg = 'feature_names mismatch: {0} {1}'

-                if dat_missing:
-                    msg += ('\nexpected ' + ', '.join(
-                        str(s) for s in dat_missing) + ' in input data')
+            if dat_missing:
+                msg += ('\nexpected ' + ', '.join(
+                    str(s) for s in dat_missing) + ' in input data')

-                if my_missing:
-                    msg += ('\ntraining data did not have the following fields: ' +
-                            ', '.join(str(s) for s in my_missing))
+            if my_missing:
+                msg += ('\ntraining data did not have the following fields: ' +
+                        ', '.join(str(s) for s in my_missing))

-                raise ValueError(msg.format(self.feature_names,
-                                            data.feature_names))
+            raise ValueError(msg.format(self.feature_names, data.feature_names))

    def get_split_value_histogram(self, feature, fmap='', bins=None,
                                  as_pandas=True):
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@ -958,9 +958,13 @@ class XGBModel(XGBModelBase):
            raise AttributeError(
                'Feature importance is not defined for Booster type {}'
                .format(self.booster))
-        b = self.get_booster()
+        b: Booster = self.get_booster()
        score = b.get_score(importance_type=self.importance_type)
-        all_features = [score.get(f, 0.) for f in b.feature_names]
+        if b.feature_names is None:
+            feature_names = ["f{0}".format(i) for i in range(self.n_features_in_)]
+        else:
+            feature_names = b.feature_names
+        all_features = [score.get(f, 0.) for f in feature_names]
        all_features = np.array(all_features, dtype=np.float32)
        total = all_features.sum()
        if total == 0:
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@ -1022,5 +1022,50 @@ XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle,
  API_END();
 }

+// Store string-encoded feature info (`feature_name` or `feature_type`) in the
+// booster.  `features` points to `size` C strings; it may be null only when
+// `size` is 0, which clears the field.
+XGB_DLL int XGBoosterSetStrFeatureInfo(BoosterHandle handle, const char *field,
+                                       const char **features,
+                                       const xgboost::bst_ulong size) {
+  API_BEGIN();
+  CHECK_HANDLE();
+  // Reject null arguments up front: `std::strcmp` and the `std::string`
+  // constructor both have undefined behaviour on a null pointer.
+  CHECK(field) << "Invalid pointer argument: field";
+  CHECK(size == 0 || features) << "Invalid pointer argument: features";
+  auto *learner = static_cast<Learner *>(handle);
+  std::vector<std::string> feature_info;
+  feature_info.reserve(size);
+  for (size_t i = 0; i < size; ++i) {
+    CHECK(features[i]) << "Invalid pointer argument: features[" << i << "]";
+    feature_info.emplace_back(features[i]);
+  }
+  if (!std::strcmp(field, "feature_name")) {
+    learner->SetFeatureNames(feature_info);
+  } else if (!std::strcmp(field, "feature_type")) {
+    learner->SetFeatureTypes(feature_info);
+  } else {
+    LOG(FATAL) << "Unknown field for Booster feature info:" << field;
+  }
+  API_END();
+}
+
+// Read string-encoded feature info (`feature_name` or `feature_type`) from the
+// booster.  On return `*out_features` points to `*len` C strings held in the
+// learner's thread-local return buffers.
+XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field,
+                                       xgboost::bst_ulong *len,
+                                       const char ***out_features) {
+  API_BEGIN();
+  CHECK_HANDLE();
+  // Reject null arguments up front: `field` is passed to `std::strcmp` and the
+  // two output pointers are dereferenced below.
+  CHECK(field) << "Invalid pointer argument: field";
+  CHECK(len) << "Invalid pointer argument: len";
+  CHECK(out_features) << "Invalid pointer argument: out_features";
+  auto const *learner = static_cast<Learner const *>(handle);
+  std::vector<const char *> &charp_vecs =
+      learner->GetThreadLocal().ret_vec_charp;
+  std::vector<std::string> &str_vecs = learner->GetThreadLocal().ret_vec_str;
+  if (!std::strcmp(field, "feature_name")) {
+    learner->GetFeatureNames(&str_vecs);
+  } else if (!std::strcmp(field, "feature_type")) {
+    learner->GetFeatureTypes(&str_vecs);
+  } else {
+    LOG(FATAL) << "Unknown field for Booster feature info:" << field;
+  }
+  // The returned pointers alias the strings in `str_vecs`.
+  charp_vecs.resize(str_vecs.size());
+  for (size_t i = 0; i < str_vecs.size(); ++i) {
+    charp_vecs[i] = str_vecs[i].c_str();
+  }
+  *out_features = dmlc::BeginPtr(charp_vecs);
+  *len = static_cast<xgboost::bst_ulong>(charp_vecs.size());
+  API_END();
+}
+
 // force link rabit
 static DMLC_ATTRIBUTE_UNUSED int XGBOOST_LINK_RABIT_C_API_ = RabitLinkTag();
--- a/src/learner.cc
+++ b/src/learner.cc
@ -256,6 +256,11 @@ class LearnerConfiguration : public Learner {
  std::map<std::string, std::string> cfg_;
  // Stores information like best-iteration for early stopping.
  std::map<std::string, std::string> attributes_;
+  // Name of each feature, usually set from DMatrix.
+  std::vector<std::string> feature_names_;
+  // Type of each feature, usually set from DMatrix.
+  std::vector<std::string> feature_types_;
+
  common::Monitor monitor_;
  LearnerModelParamLegacy mparam_;
  LearnerModelParam learner_model_param_;
@ -460,6 +465,23 @@ class LearnerConfiguration : public Learner {
    return true;
  }

+  // Replace the stored feature names with a copy of `fn`.
+  void SetFeatureNames(std::vector<std::string> const& fn) override {
+    this->feature_names_ = fn;
+  }
+
+  // Copy the stored feature names into `*fn`, overwriting its content.
+  void GetFeatureNames(std::vector<std::string>* fn) const override {
+    auto& out = *fn;
+    out = this->feature_names_;
+  }
+
+  // Replace the stored feature types with a copy of `ft`.
+  void SetFeatureTypes(std::vector<std::string> const& ft) override {
+    feature_types_ = ft;
+  }
+
+  // Copy the stored feature types into `*p_ft`, overwriting its content.
+  void GetFeatureTypes(std::vector<std::string>* p_ft) const override {
+    *p_ft = feature_types_;
+  }
+
  std::vector<std::string> GetAttrNames() const override {
    std::vector<std::string> out;
    for (auto const& kv : attributes_) {
@ -666,6 +688,25 @@ class LearnerIO : public LearnerConfiguration {
      attributes_[kv.first] = get<String const>(kv.second);
    }

+    // feature names and types are saved in xgboost 1.4
+    auto it = learner.find("feature_names");
+    if (it != learner.cend()) {
+      auto const &feature_names = get<Array const>(it->second);
+      feature_names_.clear();
+      for (auto const &name : feature_names) {
+        feature_names_.emplace_back(get<String const>(name));
+      }
+    }
+    it = learner.find("feature_types");
+    if (it != learner.cend()) {
+      auto const &feature_types = get<Array const>(it->second);
+      feature_types_.clear();
+      for (auto const &name : feature_types) {
+        auto type = get<String const>(name);
+        feature_types_.emplace_back(type);
+      }
+    }
+
    this->need_configuration_ = true;
  }

@ -691,6 +732,17 @@ class LearnerIO : public LearnerConfiguration {
    for (auto const& kv : attributes_) {
      learner["attributes"][kv.first] = String(kv.second);
    }
+
+    learner["feature_names"] = Array();
+    auto& feature_names = get<Array>(learner["feature_names"]);
+    for (auto const& name : feature_names_) {
+      feature_names.emplace_back(name);
+    }
+    learner["feature_types"] = Array();
+    auto& feature_types = get<Array>(learner["feature_types"]);
+    for (auto const& type : feature_types_) {
+      feature_types.emplace_back(type);
+    }
  }
  // About to be deprecated by JSON format
  void LoadModel(dmlc::Stream* fi) override {
--- a/src/tree/tree_model.cc
+++ b/src/tree/tree_model.cc
@ -385,7 +385,7 @@ class JsonGenerator : public TreeGenerator {
  std::string PlainNode(RegTree const& tree, int32_t nid, uint32_t depth) const override {
    auto cond = tree[nid].SplitCond();
    static std::string const kNodeTemplate =
-        R"I( "nodeid": {nid}, "depth": {depth}, "split": {fname}, )I"
+        R"I( "nodeid": {nid}, "depth": {depth}, "split": "{fname}", )I"
        R"I("split_condition": {cond}, "yes": {left}, "no": {right}, )I"
        R"I("missing": {missing})I";
    return SplitNodeImpl(tree, nid, kNodeTemplate, SuperT::ToStr(cond), depth);
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@ -360,4 +360,60 @@ TEST(Learner, ConstantSeed) {
    CHECK_EQ(v_0, v_2);
  }
 }
+
+// Feature names and types set on a Learner must be readable back unchanged,
+// both directly and after a JSON save/load round trip.
+TEST(Learner, FeatureInfo) {
+  size_t constexpr kCols = 10;
+  auto m = RandomDataGenerator{10, kCols, 0}.GenerateDMatrix(true);
+  std::vector<std::string> names(kCols);
+  for (size_t i = 0; i < kCols; ++i) {
+    names[i] = ("f" + std::to_string(i));
+  }
+
+  std::vector<std::string> types(kCols);
+  for (size_t i = 0; i < kCols; ++i) {
+    types[i] = "q";
+  }
+  types[8] = "f";
+  types[0] = "int";
+  types[3] = "i";
+  types[7] = "i";
+
+  std::vector<std::string> out_names;
+  std::vector<std::string> out_types;
+
+  Json model{Object()};
+  {
+    std::unique_ptr<Learner> learner{Learner::Create({m})};
+    learner->Configure();
+    learner->SetFeatureNames(names);
+    learner->GetFeatureNames(&out_names);
+
+    learner->SetFeatureTypes(types);
+    learner->GetFeatureTypes(&out_types);
+
+    // Compare whole vectors so that an empty or truncated result fails the
+    // test instead of passing vacuously.
+    ASSERT_EQ(out_names, names);
+    ASSERT_EQ(out_types, types);
+
+    learner->SaveModel(&model);
+  }
+
+  {
+    std::unique_ptr<Learner> learner{Learner::Create({m})};
+    learner->LoadModel(model);
+
+    learner->GetFeatureNames(&out_names);
+    learner->GetFeatureTypes(&out_types);
+    ASSERT_EQ(out_names, names);
+    ASSERT_EQ(out_types, types);
+  }
+}
 }  // namespace xgboost
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@ -217,8 +217,8 @@ class TestModels:
        X = np.random.random((10, 3))
        y = np.random.randint(2, size=(10,))

-        dm1 = xgb.DMatrix(X, y)
-        dm2 = xgb.DMatrix(X, y, feature_names=("a", "b", "c"))
+        dm1 = xgb.DMatrix(X, y, feature_names=("a", "b", "c"))
+        dm2 = xgb.DMatrix(X, y)

        bst = xgb.train([], dm1)
        bst.predict(dm1)  # success
@ -228,9 +228,6 @@ class TestModels:

        bst = xgb.train([], dm2)
        bst.predict(dm2)  # success
-        with pytest.raises(ValueError):
-            bst.predict(dm1)
-        bst.predict(dm2)  # success

    def test_model_binary_io(self):
        model_path = 'test_model_binary_io.bin'
@ -458,3 +455,31 @@ class TestModels:
        merged = predt_0 + predt_1 - 0.5
        single = booster[1:7].predict(dtrain, output_margin=True)
        np.testing.assert_allclose(merged, single, atol=1e-6)
+
+    @pytest.mark.skipif(**tm.no_pandas())
+    def test_feature_info(self):
+        """Feature names and types propagate from the training DMatrix to the
+        Booster and survive a JSON save/load round trip."""
+        import os
+        import pandas as pd
+        rows = 100
+        cols = 10
+        X = rng.randn(rows, cols)
+        y = rng.randn(rows)
+        feature_names = ["test_feature_" + str(i) for i in range(cols)]
+        X_pd = pd.DataFrame(X, columns=feature_names)
+        # Use the builtin `int`; `np.int` is a deprecated alias of it.
+        X_pd.iloc[:, 3] = X_pd.iloc[:, 3].astype(int)
+
+        Xy = xgb.DMatrix(X_pd, y)
+        assert Xy.feature_types[3] == "int"
+        booster = xgb.train({}, dtrain=Xy, num_boost_round=1)
+
+        assert booster.feature_names == Xy.feature_names
+        assert booster.feature_names == feature_names
+        assert booster.feature_types == Xy.feature_types
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Join with a path separator so the model file is created inside
+            # `tmpdir` and removed together with it.
+            path = os.path.join(tmpdir, "model.json")
+            booster.save_model(path)
+            booster = xgb.Booster()
+            booster.load_model(path)
+
+            assert booster.feature_names == Xy.feature_names
+            assert booster.feature_types == Xy.feature_types
--- a/tests/python/test_cli.py
+++ b/tests/python/test_cli.py
@ -95,6 +95,11 @@ eval[test] = {data_path}
            }
            data = xgboost.DMatrix(data_path)
            booster = xgboost.train(parameters, data, num_boost_round=10)
+
+            # CLI model doesn't contain feature info.
+            booster.feature_names = None
+            booster.feature_types = None
+
            booster.save_model(model_out_py)
            py_predt = booster.predict(data)

--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@ -180,7 +180,7 @@ class TestDMatrix:

        # reset
        dm.feature_names = None
-        assert dm.feature_names == ['f0', 'f1', 'f2', 'f3', 'f4']
+        assert dm.feature_names is None
        assert dm.feature_types is None

    def test_feature_names(self):