[breaking] Save booster feature info in JSON, remove feature name generation. (#6605)

* Save feature info in booster in JSON model.
* [breaking] Remove automatic feature name generation in `DMatrix`.

This PR is to enable reliable feature validation in Python package.
This commit is contained in:
Jiaming Yuan 2021-02-25 18:54:16 +08:00 committed by GitHub
parent b6167cd2ff
commit 9da2287ab8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 363 additions and 36 deletions

View File

@ -88,6 +88,12 @@
"type": "number"
}
},
"split_type": {
"type": "array",
"items": {
"type": "integer"
}
},
"default_left": {
"type": "array",
"items": {
@ -247,6 +253,18 @@
"learner": {
"type": "object",
"properties": {
"feature_names": {
"type": "array",
"items": {
"type": "string"
}
},
"feature_types": {
"type": "array",
"items": {
"type": "string"
}
},
"gradient_booster": {
"oneOf": [
{

View File

@ -1132,4 +1132,46 @@ XGB_DLL int XGBoosterSetAttr(BoosterHandle handle,
XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle,
bst_ulong* out_len,
const char*** out);
/*!
* \brief Set string encoded feature info in Booster, similar to the feature
* info in DMatrix.
*
* Accepted fields are:
* - feature_name
* - feature_type
*
* \param handle An instance of Booster
* \param field Feild name
* \param features Pointer to array of strings.
* \param size Size of `features` pointer (number of strings passed in).
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterSetStrFeatureInfo(BoosterHandle handle, const char *field,
const char **features,
const bst_ulong size);
/*!
* \brief Get string encoded feature info from Booster, similar to feature info
* in DMatrix.
*
* Accepted fields are:
* - feature_name
* - feature_type
*
* Caller is responsible for copying out the data, before next call to any API
* function of XGBoost.
*
* \param handle An instance of Booster
 * \param field Field name
 * \param len Size of output pointer `features` (number of strings returned).
* \param out_features Address of a pointer to array of strings. Result is stored in
* thread local memory.
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field,
bst_ulong *len,
const char ***out_features);
#endif // XGBOOST_C_API_H_

View File

@ -213,6 +213,27 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
* \return vector of attribute name strings.
*/
virtual std::vector<std::string> GetAttrNames() const = 0;
/*!
* \brief Set the feature names for current booster.
* \param fn Input feature names
*/
virtual void SetFeatureNames(std::vector<std::string> const& fn) = 0;
/*!
* \brief Get the feature names for current booster.
* \param fn Output feature names
*/
virtual void GetFeatureNames(std::vector<std::string>* fn) const = 0;
/*!
* \brief Set the feature types for current booster.
* \param ft Input feature types.
*/
virtual void SetFeatureTypes(std::vector<std::string> const& ft) = 0;
/*!
* \brief Get the feature types for current booster.
 * \param ft Output feature types
*/
virtual void GetFeatureTypes(std::vector<std::string>* ft) const = 0;
/*!
* \return whether the model allow lazy checkpoint in rabit.
*/

View File

@ -77,7 +77,7 @@ def from_pystr_to_cstr(data: Union[str, List[str]]):
raise TypeError()
def from_cstr_to_pystr(data, length):
def from_cstr_to_pystr(data, length) -> List[str]:
"""Revert C pointer to Python str
Parameters
@ -869,7 +869,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
)
feature_names = from_cstr_to_pystr(sarr, length)
if not feature_names:
feature_names = ["f{0}".format(i) for i in range(self.num_col())]
return None
return feature_names
@feature_names.setter
@ -1167,9 +1167,6 @@ class Booster(object):
training, prediction and evaluation.
"""
feature_names = None
feature_types = None
def __init__(self, params=None, cache=(), model_file=None):
# pylint: disable=invalid-name
"""
@ -1185,12 +1182,15 @@ class Booster(object):
for d in cache:
if not isinstance(d, DMatrix):
raise TypeError('invalid cache item: {}'.format(type(d).__name__), cache)
self._validate_features(d)
dmats = c_array(ctypes.c_void_p, [d.handle for d in cache])
self.handle = ctypes.c_void_p()
_check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)),
ctypes.byref(self.handle)))
for d in cache:
# Validate feature only after the feature names are saved into booster.
self._validate_features(d)
params = params or {}
params = self._configure_metrics(params.copy())
if isinstance(params, list):
@ -1400,6 +1400,60 @@ class Booster(object):
_check_call(_LIB.XGBoosterSetAttr(
self.handle, c_str(key), value))
def _get_feature_info(self, field: str) -> Optional[List[str]]:
    """Fetch string feature info from the C booster.

    ``field`` is either ``"feature_name"`` or ``"feature_type"``.  Returns
    ``None`` when the booster has no handle yet or the field is unset.
    """
    # Guard first: a Booster that was never created (or already freed) has
    # no handle to query.
    if getattr(self, "handle", None) is None:
        return None
    n_features = c_bst_ulong()
    out_arr = ctypes.POINTER(ctypes.c_char_p)()
    _check_call(
        _LIB.XGBoosterGetStrFeatureInfo(
            self.handle, c_str(field), ctypes.byref(n_features),
            ctypes.byref(out_arr),
        )
    )
    values = from_cstr_to_pystr(out_arr, n_features)
    # Normalize the empty list to None so "unset" is unambiguous.
    if not values:
        return None
    return values
@property
def feature_types(self) -> Optional[List[str]]:
    """Feature types for this booster.

    Can be directly set by input data or by assignment.
    """
    info = self._get_feature_info("feature_type")
    return info
@property
def feature_names(self) -> Optional[List[str]]:
    """Feature names for this booster.

    Can be directly set by input data or by assignment.
    """
    info = self._get_feature_info("feature_name")
    return info
def _set_feature_info(self, features: Optional[List[str]], field: str) -> None:
    """Store feature names/types in the C booster.

    ``field`` is either ``"feature_name"`` or ``"feature_type"``.  Passing
    ``None`` clears the field on the C side.
    """
    if features is None:
        # Clear: NULL pointer with size 0.
        _check_call(
            _LIB.XGBoosterSetStrFeatureInfo(
                self.handle, c_str(field), None, c_bst_ulong(0)
            )
        )
        return
    assert isinstance(features, list)
    encoded = [f.encode("utf-8") for f in features]
    c_arr = (ctypes.c_char_p * len(encoded))(*encoded)
    _check_call(
        _LIB.XGBoosterSetStrFeatureInfo(
            self.handle, c_str(field), c_arr, c_bst_ulong(len(encoded))
        )
    )
@feature_names.setter
def feature_names(self, features: Optional[List[str]]) -> None:
    # Delegate to the shared helper; None clears the stored names.
    self._set_feature_info(features, "feature_name")
@feature_types.setter
def feature_types(self, features: Optional[List[str]]) -> None:
    # Delegate to the shared helper; None clears the stored types.
    self._set_feature_info(features, "feature_type")
def set_param(self, params, value=None):
"""Set parameters into the Booster.
@ -1859,9 +1913,10 @@ class Booster(object):
def save_model(self, fname):
"""Save the model to a file.
The model is saved in an XGBoost internal format which is universal
among the various XGBoost interfaces. Auxiliary attributes of the
Python Booster object (such as feature_names) will not be saved. See:
The model is saved in an XGBoost internal format which is universal among the
various XGBoost interfaces. Auxiliary attributes of the Python Booster object
(such as feature_names) will not be saved when using binary format. To save those
attributes, use JSON instead. See:
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
@ -1898,9 +1953,10 @@ class Booster(object):
"""Load the model from a file or bytearray. Path to file can be local
or as an URI.
The model is loaded from XGBoost format which is universal among the
various XGBoost interfaces. Auxiliary attributes of the Python Booster
object (such as feature_names) will not be loaded. See:
The model is loaded from XGBoost format which is universal among the various
XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
feature_names) will not be loaded when using binary format. To save those
attributes, use JSON instead. See:
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
@ -2249,7 +2305,7 @@ class Booster(object):
# pylint: disable=no-member
return df.sort(['Tree', 'Node']).reset_index(drop=True)
def _validate_features(self, data):
def _validate_features(self, data: DMatrix):
"""
Validate Booster and data's feature_names are identical.
Set feature_names and feature_types from DMatrix
@ -2260,24 +2316,27 @@ class Booster(object):
if self.feature_names is None:
self.feature_names = data.feature_names
self.feature_types = data.feature_types
else:
# Booster can't accept data with different feature names
if self.feature_names != data.feature_names:
dat_missing = set(self.feature_names) - set(data.feature_names)
my_missing = set(data.feature_names) - set(self.feature_names)
if data.feature_names is None and self.feature_names is not None:
raise ValueError(
"training data did not have the following fields: " +
", ".join(self.feature_names)
)
# Booster can't accept data with different feature names
if self.feature_names != data.feature_names:
dat_missing = set(self.feature_names) - set(data.feature_names)
my_missing = set(data.feature_names) - set(self.feature_names)
msg = 'feature_names mismatch: {0} {1}'
msg = 'feature_names mismatch: {0} {1}'
if dat_missing:
msg += ('\nexpected ' + ', '.join(
str(s) for s in dat_missing) + ' in input data')
if dat_missing:
msg += ('\nexpected ' + ', '.join(
str(s) for s in dat_missing) + ' in input data')
if my_missing:
msg += ('\ntraining data did not have the following fields: ' +
', '.join(str(s) for s in my_missing))
if my_missing:
msg += ('\ntraining data did not have the following fields: ' +
', '.join(str(s) for s in my_missing))
raise ValueError(msg.format(self.feature_names,
data.feature_names))
raise ValueError(msg.format(self.feature_names, data.feature_names))
def get_split_value_histogram(self, feature, fmap='', bins=None,
as_pandas=True):

View File

@ -958,9 +958,13 @@ class XGBModel(XGBModelBase):
raise AttributeError(
'Feature importance is not defined for Booster type {}'
.format(self.booster))
b = self.get_booster()
b: Booster = self.get_booster()
score = b.get_score(importance_type=self.importance_type)
all_features = [score.get(f, 0.) for f in b.feature_names]
if b.feature_names is None:
feature_names = ["f{0}".format(i) for i in range(self.n_features_in_)]
else:
feature_names = b.feature_names
all_features = [score.get(f, 0.) for f in feature_names]
all_features = np.array(all_features, dtype=np.float32)
total = all_features.sum()
if total == 0:

View File

@ -1022,5 +1022,50 @@ XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle,
API_END();
}
/*!
 * C API entry point: store string-encoded feature info (names or types)
 * in the Booster referenced by `handle`.
 */
XGB_DLL int XGBoosterSetStrFeatureInfo(BoosterHandle handle, const char *field,
                                       const char **features,
                                       const xgboost::bst_ulong size) {
  API_BEGIN();
  CHECK_HANDLE();
  auto *learner = static_cast<Learner *>(handle);
  // Copy the caller's C strings into owned std::string storage.
  std::vector<std::string> info;
  info.reserve(size);
  for (xgboost::bst_ulong i = 0; i < size; ++i) {
    info.emplace_back(features[i]);
  }
  if (std::strcmp(field, "feature_name") == 0) {
    learner->SetFeatureNames(info);
  } else if (std::strcmp(field, "feature_type") == 0) {
    learner->SetFeatureTypes(info);
  } else {
    LOG(FATAL) << "Unknown field for Booster feature info:" << field;
  }
  API_END();
}
/*!
 * C API entry point: read string-encoded feature info (names or types)
 * from the Booster referenced by `handle`.
 */
XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field,
                                       xgboost::bst_ulong *len,
                                       const char ***out_features) {
  API_BEGIN();
  CHECK_HANDLE();
  auto const *learner = static_cast<Learner const *>(handle);
  // Thread-local buffers keep the returned pointers valid until the next
  // XGBoost API call on this thread.
  auto &strings = learner->GetThreadLocal().ret_vec_str;
  auto &pointers = learner->GetThreadLocal().ret_vec_charp;
  if (std::strcmp(field, "feature_name") == 0) {
    learner->GetFeatureNames(&strings);
  } else if (std::strcmp(field, "feature_type") == 0) {
    learner->GetFeatureTypes(&strings);
  } else {
    LOG(FATAL) << "Unknown field for Booster feature info:" << field;
  }
  pointers.clear();
  pointers.reserve(strings.size());
  for (auto const &s : strings) {
    pointers.push_back(s.c_str());
  }
  *out_features = dmlc::BeginPtr(pointers);
  *len = static_cast<xgboost::bst_ulong>(pointers.size());
  API_END();
}
// force link rabit
static DMLC_ATTRIBUTE_UNUSED int XGBOOST_LINK_RABIT_C_API_ = RabitLinkTag();

View File

@ -256,6 +256,11 @@ class LearnerConfiguration : public Learner {
std::map<std::string, std::string> cfg_;
// Stores information like best-iteration for early stopping.
std::map<std::string, std::string> attributes_;
// Name of each feature, usually set from DMatrix.
std::vector<std::string> feature_names_;
// Type of each feature, usually set from DMatrix.
std::vector<std::string> feature_types_;
common::Monitor monitor_;
LearnerModelParamLegacy mparam_;
LearnerModelParam learner_model_param_;
@ -460,6 +465,23 @@ class LearnerConfiguration : public Learner {
return true;
}
void SetFeatureNames(std::vector<std::string> const& fn) override {
feature_names_ = fn;
}
// Copy the stored feature names into *fn, replacing its contents.
void GetFeatureNames(std::vector<std::string>* fn) const override {
  fn->assign(feature_names_.cbegin(), feature_names_.cend());
}
void SetFeatureTypes(std::vector<std::string> const& ft) override {
this->feature_types_ = ft;
}
// Copy the stored feature types into *p_ft, replacing its contents.
void GetFeatureTypes(std::vector<std::string>* p_ft) const override {
  *p_ft = this->feature_types_;
}
std::vector<std::string> GetAttrNames() const override {
std::vector<std::string> out;
for (auto const& kv : attributes_) {
@ -666,6 +688,25 @@ class LearnerIO : public LearnerConfiguration {
attributes_[kv.first] = get<String const>(kv.second);
}
// feature names and types are saved in xgboost 1.4
auto it = learner.find("feature_names");
if (it != learner.cend()) {
auto const &feature_names = get<Array const>(it->second);
feature_names_.clear();
for (auto const &name : feature_names) {
feature_names_.emplace_back(get<String const>(name));
}
}
it = learner.find("feature_types");
if (it != learner.cend()) {
auto const &feature_types = get<Array const>(it->second);
feature_types_.clear();
for (auto const &name : feature_types) {
auto type = get<String const>(name);
feature_types_.emplace_back(type);
}
}
this->need_configuration_ = true;
}
@ -691,6 +732,17 @@ class LearnerIO : public LearnerConfiguration {
for (auto const& kv : attributes_) {
learner["attributes"][kv.first] = String(kv.second);
}
learner["feature_names"] = Array();
auto& feature_names = get<Array>(learner["feature_names"]);
for (auto const& name : feature_names_) {
feature_names.emplace_back(name);
}
learner["feature_types"] = Array();
auto& feature_types = get<Array>(learner["feature_types"]);
for (auto const& type : feature_types_) {
feature_types.emplace_back(type);
}
}
// About to be deprecated by JSON format
void LoadModel(dmlc::Stream* fi) override {

View File

@ -385,7 +385,7 @@ class JsonGenerator : public TreeGenerator {
std::string PlainNode(RegTree const& tree, int32_t nid, uint32_t depth) const override {
auto cond = tree[nid].SplitCond();
static std::string const kNodeTemplate =
R"I( "nodeid": {nid}, "depth": {depth}, "split": {fname}, )I"
R"I( "nodeid": {nid}, "depth": {depth}, "split": "{fname}", )I"
R"I("split_condition": {cond}, "yes": {left}, "no": {right}, )I"
R"I("missing": {missing})I";
return SplitNodeImpl(tree, nid, kNodeTemplate, SuperT::ToStr(cond), depth);

View File

@ -360,4 +360,60 @@ TEST(Learner, ConstantSeed) {
CHECK_EQ(v_0, v_2);
}
}
// Round-trip feature names/types through the Learner and a JSON model.
TEST(Learner, FeatureInfo) {
  size_t constexpr kCols = 10;
  auto m = RandomDataGenerator{10, kCols, 0}.GenerateDMatrix(true);

  // Feature names f0 .. f9.
  std::vector<std::string> names(kCols);
  for (size_t i = 0; i < kCols; ++i) {
    names[i] = ("f" + std::to_string(i));
  }

  // Mostly quantitative features with a few overrides.
  std::vector<std::string> types(kCols);
  for (size_t i = 0; i < kCols; ++i) {
    types[i] = "q";
  }
  types[8] = "f";
  types[0] = "int";
  types[3] = "i";
  types[7] = "i";

  std::vector<char const*> c_names(kCols);
  for (size_t i = 0; i < names.size(); ++i) {
    c_names[i] = names[i].c_str();
  }
  std::vector<char const*> c_types(kCols);
  for (size_t i = 0; i < types.size(); ++i) {
    // Bug fix: this previously copied from `names` instead of `types`.
    c_types[i] = types[i].c_str();
  }

  std::vector<std::string> out_names;
  std::vector<std::string> out_types;

  Json model{Object()};
  {
    std::unique_ptr<Learner> learner{Learner::Create({m})};
    learner->Configure();
    learner->SetFeatureNames(names);
    learner->GetFeatureNames(&out_names);
    learner->SetFeatureTypes(types);
    learner->GetFeatureTypes(&out_types);
    ASSERT_TRUE(std::equal(out_names.begin(), out_names.end(), names.begin()));
    ASSERT_TRUE(std::equal(out_types.begin(), out_types.end(), types.begin()));
    learner->SaveModel(&model);
  }

  {
    // A fresh learner must recover the info from the serialized model.
    std::unique_ptr<Learner> learner{Learner::Create({m})};
    learner->LoadModel(model);
    learner->GetFeatureNames(&out_names);
    learner->GetFeatureTypes(&out_types);
    ASSERT_TRUE(std::equal(out_names.begin(), out_names.end(), names.begin()));
    ASSERT_TRUE(std::equal(out_types.begin(), out_types.end(), types.begin()));
  }
}
} // namespace xgboost

View File

@ -217,8 +217,8 @@ class TestModels:
X = np.random.random((10, 3))
y = np.random.randint(2, size=(10,))
dm1 = xgb.DMatrix(X, y)
dm2 = xgb.DMatrix(X, y, feature_names=("a", "b", "c"))
dm1 = xgb.DMatrix(X, y, feature_names=("a", "b", "c"))
dm2 = xgb.DMatrix(X, y)
bst = xgb.train([], dm1)
bst.predict(dm1) # success
@ -228,9 +228,6 @@ class TestModels:
bst = xgb.train([], dm2)
bst.predict(dm2) # success
with pytest.raises(ValueError):
bst.predict(dm1)
bst.predict(dm2) # success
def test_model_binary_io(self):
model_path = 'test_model_binary_io.bin'
@ -458,3 +455,31 @@ class TestModels:
merged = predt_0 + predt_1 - 0.5
single = booster[1:7].predict(dtrain, output_margin=True)
np.testing.assert_allclose(merged, single, atol=1e-6)
@pytest.mark.skipif(**tm.no_pandas())
def test_feature_info(self):
    """Feature names/types flow from DataFrame to DMatrix to Booster and
    survive a JSON model round trip."""
    import os
    import pandas as pd

    rows = 100
    cols = 10
    X = rng.randn(rows, cols)
    y = rng.randn(rows)
    feature_names = ["test_feature_" + str(i) for i in range(cols)]
    X_pd = pd.DataFrame(X, columns=feature_names)
    # Bug fix: `np.int` is deprecated and removed in recent NumPy releases;
    # use the concrete dtype instead.
    X_pd.iloc[:, 3] = X_pd.iloc[:, 3].astype(np.int64)

    Xy = xgb.DMatrix(X_pd, y)
    assert Xy.feature_types[3] == "int"
    booster = xgb.train({}, dtrain=Xy, num_boost_round=1)

    assert booster.feature_names == Xy.feature_names
    assert booster.feature_names == feature_names
    assert booster.feature_types == Xy.feature_types

    with tempfile.TemporaryDirectory() as tmpdir:
        # Bug fix: `tmpdir + "model.json"` lacked a path separator and wrote
        # a sibling file outside the temporary directory.
        path = os.path.join(tmpdir, "model.json")
        booster.save_model(path)
        booster = xgb.Booster()
        booster.load_model(path)
        assert booster.feature_names == Xy.feature_names
        assert booster.feature_types == Xy.feature_types

View File

@ -95,6 +95,11 @@ eval[test] = {data_path}
}
data = xgboost.DMatrix(data_path)
booster = xgboost.train(parameters, data, num_boost_round=10)
# CLI model doesn't contain feature info.
booster.feature_names = None
booster.feature_types = None
booster.save_model(model_out_py)
py_predt = booster.predict(data)

View File

@ -180,7 +180,7 @@ class TestDMatrix:
# reset
dm.feature_names = None
assert dm.feature_names == ['f0', 'f1', 'f2', 'f3', 'f4']
assert dm.feature_names is None
assert dm.feature_types is None
def test_feature_names(self):