[breaking] Save booster feature info in JSON, remove feature name generation. (#6605)
* Save feature info in booster in JSON model. * [breaking] Remove automatic feature name generation in `DMatrix`. This PR is to enable reliable feature validation in Python package.
This commit is contained in:
parent
b6167cd2ff
commit
9da2287ab8
@ -88,6 +88,12 @@
|
|||||||
"type": "number"
|
"type": "number"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"split_type": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
},
|
||||||
"default_left": {
|
"default_left": {
|
||||||
"type": "array",
|
"type": "array",
|
||||||
"items": {
|
"items": {
|
||||||
@ -247,6 +253,18 @@
|
|||||||
"learner": {
|
"learner": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
"feature_names": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"feature_types": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
"gradient_booster": {
|
"gradient_booster": {
|
||||||
"oneOf": [
|
"oneOf": [
|
||||||
{
|
{
|
||||||
|
|||||||
@ -1132,4 +1132,46 @@ XGB_DLL int XGBoosterSetAttr(BoosterHandle handle,
|
|||||||
XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle,
|
XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle,
|
||||||
bst_ulong* out_len,
|
bst_ulong* out_len,
|
||||||
const char*** out);
|
const char*** out);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Set string encoded feature info in Booster, similar to the feature
|
||||||
|
* info in DMatrix.
|
||||||
|
*
|
||||||
|
* Accepted fields are:
|
||||||
|
* - feature_name
|
||||||
|
* - feature_type
|
||||||
|
*
|
||||||
|
* \param handle An instance of Booster
|
||||||
|
* \param field Field name
|
||||||
|
* \param features Pointer to array of strings.
|
||||||
|
* \param size Size of `features` pointer (number of strings passed in).
|
||||||
|
*
|
||||||
|
* \return 0 when success, -1 when failure happens
|
||||||
|
*/
|
||||||
|
XGB_DLL int XGBoosterSetStrFeatureInfo(BoosterHandle handle, const char *field,
|
||||||
|
const char **features,
|
||||||
|
const bst_ulong size);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Get string encoded feature info from Booster, similar to feature info
|
||||||
|
* in DMatrix.
|
||||||
|
*
|
||||||
|
* Accepted fields are:
|
||||||
|
* - feature_name
|
||||||
|
* - feature_type
|
||||||
|
*
|
||||||
|
* Caller is responsible for copying out the data, before next call to any API
|
||||||
|
* function of XGBoost.
|
||||||
|
*
|
||||||
|
* \param handle An instance of Booster
|
||||||
|
* \param field Field name
|
||||||
|
* \param len Size of output pointer `out_features` (number of strings returned).
|
||||||
|
* \param out_features Address of a pointer to array of strings. Result is stored in
|
||||||
|
* thread local memory.
|
||||||
|
*
|
||||||
|
* \return 0 when success, -1 when failure happens
|
||||||
|
*/
|
||||||
|
XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field,
|
||||||
|
bst_ulong *len,
|
||||||
|
const char ***out_features);
|
||||||
#endif // XGBOOST_C_API_H_
|
#endif // XGBOOST_C_API_H_
|
||||||
|
|||||||
@ -213,6 +213,27 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
|
|||||||
* \return vector of attribute name strings.
|
* \return vector of attribute name strings.
|
||||||
*/
|
*/
|
||||||
virtual std::vector<std::string> GetAttrNames() const = 0;
|
virtual std::vector<std::string> GetAttrNames() const = 0;
|
||||||
|
/*!
|
||||||
|
* \brief Set the feature names for current booster.
|
||||||
|
* \param fn Input feature names
|
||||||
|
*/
|
||||||
|
virtual void SetFeatureNames(std::vector<std::string> const& fn) = 0;
|
||||||
|
/*!
|
||||||
|
* \brief Get the feature names for current booster.
|
||||||
|
* \param fn Output feature names
|
||||||
|
*/
|
||||||
|
virtual void GetFeatureNames(std::vector<std::string>* fn) const = 0;
|
||||||
|
/*!
|
||||||
|
* \brief Set the feature types for current booster.
|
||||||
|
* \param ft Input feature types.
|
||||||
|
*/
|
||||||
|
virtual void SetFeatureTypes(std::vector<std::string> const& ft) = 0;
|
||||||
|
/*!
|
||||||
|
* \brief Get the feature types for current booster.
|
||||||
|
* \param ft Output feature types
|
||||||
|
*/
|
||||||
|
virtual void GetFeatureTypes(std::vector<std::string>* ft) const = 0;
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \return whether the model allow lazy checkpoint in rabit.
|
* \return whether the model allow lazy checkpoint in rabit.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@ -77,7 +77,7 @@ def from_pystr_to_cstr(data: Union[str, List[str]]):
|
|||||||
raise TypeError()
|
raise TypeError()
|
||||||
|
|
||||||
|
|
||||||
def from_cstr_to_pystr(data, length):
|
def from_cstr_to_pystr(data, length) -> List[str]:
|
||||||
"""Revert C pointer to Python str
|
"""Revert C pointer to Python str
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
@ -869,7 +869,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
|
|||||||
)
|
)
|
||||||
feature_names = from_cstr_to_pystr(sarr, length)
|
feature_names = from_cstr_to_pystr(sarr, length)
|
||||||
if not feature_names:
|
if not feature_names:
|
||||||
feature_names = ["f{0}".format(i) for i in range(self.num_col())]
|
return None
|
||||||
return feature_names
|
return feature_names
|
||||||
|
|
||||||
@feature_names.setter
|
@feature_names.setter
|
||||||
@ -1167,9 +1167,6 @@ class Booster(object):
|
|||||||
training, prediction and evaluation.
|
training, prediction and evaluation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
feature_names = None
|
|
||||||
feature_types = None
|
|
||||||
|
|
||||||
def __init__(self, params=None, cache=(), model_file=None):
|
def __init__(self, params=None, cache=(), model_file=None):
|
||||||
# pylint: disable=invalid-name
|
# pylint: disable=invalid-name
|
||||||
"""
|
"""
|
||||||
@ -1185,12 +1182,15 @@ class Booster(object):
|
|||||||
for d in cache:
|
for d in cache:
|
||||||
if not isinstance(d, DMatrix):
|
if not isinstance(d, DMatrix):
|
||||||
raise TypeError('invalid cache item: {}'.format(type(d).__name__), cache)
|
raise TypeError('invalid cache item: {}'.format(type(d).__name__), cache)
|
||||||
self._validate_features(d)
|
|
||||||
|
|
||||||
dmats = c_array(ctypes.c_void_p, [d.handle for d in cache])
|
dmats = c_array(ctypes.c_void_p, [d.handle for d in cache])
|
||||||
self.handle = ctypes.c_void_p()
|
self.handle = ctypes.c_void_p()
|
||||||
_check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)),
|
_check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)),
|
||||||
ctypes.byref(self.handle)))
|
ctypes.byref(self.handle)))
|
||||||
|
for d in cache:
|
||||||
|
# Validate feature only after the feature names are saved into booster.
|
||||||
|
self._validate_features(d)
|
||||||
|
|
||||||
params = params or {}
|
params = params or {}
|
||||||
params = self._configure_metrics(params.copy())
|
params = self._configure_metrics(params.copy())
|
||||||
if isinstance(params, list):
|
if isinstance(params, list):
|
||||||
@ -1400,6 +1400,60 @@ class Booster(object):
|
|||||||
_check_call(_LIB.XGBoosterSetAttr(
|
_check_call(_LIB.XGBoosterSetAttr(
|
||||||
self.handle, c_str(key), value))
|
self.handle, c_str(key), value))
|
||||||
|
|
||||||
|
def _get_feature_info(self, field: str):
|
||||||
|
length = c_bst_ulong()
|
||||||
|
sarr = ctypes.POINTER(ctypes.c_char_p)()
|
||||||
|
if not hasattr(self, "handle") or self.handle is None:
|
||||||
|
return None
|
||||||
|
_check_call(
|
||||||
|
_LIB.XGBoosterGetStrFeatureInfo(
|
||||||
|
self.handle, c_str(field), ctypes.byref(length), ctypes.byref(sarr),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
feature_info = from_cstr_to_pystr(sarr, length)
|
||||||
|
return feature_info if feature_info else None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_types(self) -> Optional[List[str]]:
|
||||||
|
"""Feature types for this booster. Can be directly set by input data or by
|
||||||
|
assignment.
|
||||||
|
|
||||||
|
"""
|
||||||
|
return self._get_feature_info("feature_type")
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_names(self) -> Optional[List[str]]:
|
||||||
|
"""Feature names for this booster. Can be directly set by input data or by
|
||||||
|
assignment.
|
||||||
|
|
||||||
|
"""
|
||||||
|
return self._get_feature_info("feature_name")
|
||||||
|
|
||||||
|
def _set_feature_info(self, features: Optional[List[str]], field: str) -> None:
|
||||||
|
if features is not None:
|
||||||
|
assert isinstance(features, list)
|
||||||
|
c_feature_info = [bytes(f, encoding="utf-8") for f in features]
|
||||||
|
c_feature_info = (ctypes.c_char_p * len(c_feature_info))(*c_feature_info)
|
||||||
|
_check_call(
|
||||||
|
_LIB.XGBoosterSetStrFeatureInfo(
|
||||||
|
self.handle, c_str(field), c_feature_info, c_bst_ulong(len(features))
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
_check_call(
|
||||||
|
_LIB.XGBoosterSetStrFeatureInfo(
|
||||||
|
self.handle, c_str(field), None, c_bst_ulong(0)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
@feature_names.setter
|
||||||
|
def feature_names(self, features: Optional[List[str]]) -> None:
|
||||||
|
self._set_feature_info(features, "feature_name")
|
||||||
|
|
||||||
|
@feature_types.setter
|
||||||
|
def feature_types(self, features: Optional[List[str]]) -> None:
|
||||||
|
self._set_feature_info(features, "feature_type")
|
||||||
|
|
||||||
def set_param(self, params, value=None):
|
def set_param(self, params, value=None):
|
||||||
"""Set parameters into the Booster.
|
"""Set parameters into the Booster.
|
||||||
|
|
||||||
@ -1859,9 +1913,10 @@ class Booster(object):
|
|||||||
def save_model(self, fname):
|
def save_model(self, fname):
|
||||||
"""Save the model to a file.
|
"""Save the model to a file.
|
||||||
|
|
||||||
The model is saved in an XGBoost internal format which is universal
|
The model is saved in an XGBoost internal format which is universal among the
|
||||||
among the various XGBoost interfaces. Auxiliary attributes of the
|
various XGBoost interfaces. Auxiliary attributes of the Python Booster object
|
||||||
Python Booster object (such as feature_names) will not be saved. See:
|
(such as feature_names) will not be saved when using binary format. To save those
|
||||||
|
attributes, use JSON instead. See:
|
||||||
|
|
||||||
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
|
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
|
||||||
|
|
||||||
@ -1898,9 +1953,10 @@ class Booster(object):
|
|||||||
"""Load the model from a file or bytearray. Path to file can be local
|
"""Load the model from a file or bytearray. Path to file can be local
|
||||||
or as an URI.
|
or as an URI.
|
||||||
|
|
||||||
The model is loaded from XGBoost format which is universal among the
|
The model is loaded from XGBoost format which is universal among the various
|
||||||
various XGBoost interfaces. Auxiliary attributes of the Python Booster
|
XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
|
||||||
object (such as feature_names) will not be loaded. See:
|
feature_names) will not be loaded when using binary format. To preserve those
|
||||||
|
attributes, use JSON instead. See:
|
||||||
|
|
||||||
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
|
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
|
||||||
|
|
||||||
@ -2249,7 +2305,7 @@ class Booster(object):
|
|||||||
# pylint: disable=no-member
|
# pylint: disable=no-member
|
||||||
return df.sort(['Tree', 'Node']).reset_index(drop=True)
|
return df.sort(['Tree', 'Node']).reset_index(drop=True)
|
||||||
|
|
||||||
def _validate_features(self, data):
|
def _validate_features(self, data: DMatrix):
|
||||||
"""
|
"""
|
||||||
Validate Booster and data's feature_names are identical.
|
Validate Booster and data's feature_names are identical.
|
||||||
Set feature_names and feature_types from DMatrix
|
Set feature_names and feature_types from DMatrix
|
||||||
@ -2260,24 +2316,27 @@ class Booster(object):
|
|||||||
if self.feature_names is None:
|
if self.feature_names is None:
|
||||||
self.feature_names = data.feature_names
|
self.feature_names = data.feature_names
|
||||||
self.feature_types = data.feature_types
|
self.feature_types = data.feature_types
|
||||||
else:
|
if data.feature_names is None and self.feature_names is not None:
|
||||||
# Booster can't accept data with different feature names
|
raise ValueError(
|
||||||
if self.feature_names != data.feature_names:
|
"training data did not have the following fields: " +
|
||||||
dat_missing = set(self.feature_names) - set(data.feature_names)
|
", ".join(self.feature_names)
|
||||||
my_missing = set(data.feature_names) - set(self.feature_names)
|
)
|
||||||
|
# Booster can't accept data with different feature names
|
||||||
|
if self.feature_names != data.feature_names:
|
||||||
|
dat_missing = set(self.feature_names) - set(data.feature_names)
|
||||||
|
my_missing = set(data.feature_names) - set(self.feature_names)
|
||||||
|
|
||||||
msg = 'feature_names mismatch: {0} {1}'
|
msg = 'feature_names mismatch: {0} {1}'
|
||||||
|
|
||||||
if dat_missing:
|
if dat_missing:
|
||||||
msg += ('\nexpected ' + ', '.join(
|
msg += ('\nexpected ' + ', '.join(
|
||||||
str(s) for s in dat_missing) + ' in input data')
|
str(s) for s in dat_missing) + ' in input data')
|
||||||
|
|
||||||
if my_missing:
|
if my_missing:
|
||||||
msg += ('\ntraining data did not have the following fields: ' +
|
msg += ('\ntraining data did not have the following fields: ' +
|
||||||
', '.join(str(s) for s in my_missing))
|
', '.join(str(s) for s in my_missing))
|
||||||
|
|
||||||
raise ValueError(msg.format(self.feature_names,
|
raise ValueError(msg.format(self.feature_names, data.feature_names))
|
||||||
data.feature_names))
|
|
||||||
|
|
||||||
def get_split_value_histogram(self, feature, fmap='', bins=None,
|
def get_split_value_histogram(self, feature, fmap='', bins=None,
|
||||||
as_pandas=True):
|
as_pandas=True):
|
||||||
|
|||||||
@ -958,9 +958,13 @@ class XGBModel(XGBModelBase):
|
|||||||
raise AttributeError(
|
raise AttributeError(
|
||||||
'Feature importance is not defined for Booster type {}'
|
'Feature importance is not defined for Booster type {}'
|
||||||
.format(self.booster))
|
.format(self.booster))
|
||||||
b = self.get_booster()
|
b: Booster = self.get_booster()
|
||||||
score = b.get_score(importance_type=self.importance_type)
|
score = b.get_score(importance_type=self.importance_type)
|
||||||
all_features = [score.get(f, 0.) for f in b.feature_names]
|
if b.feature_names is None:
|
||||||
|
feature_names = ["f{0}".format(i) for i in range(self.n_features_in_)]
|
||||||
|
else:
|
||||||
|
feature_names = b.feature_names
|
||||||
|
all_features = [score.get(f, 0.) for f in feature_names]
|
||||||
all_features = np.array(all_features, dtype=np.float32)
|
all_features = np.array(all_features, dtype=np.float32)
|
||||||
total = all_features.sum()
|
total = all_features.sum()
|
||||||
if total == 0:
|
if total == 0:
|
||||||
|
|||||||
@ -1022,5 +1022,50 @@ XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle,
|
|||||||
API_END();
|
API_END();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
XGB_DLL int XGBoosterSetStrFeatureInfo(BoosterHandle handle, const char *field,
|
||||||
|
const char **features,
|
||||||
|
const xgboost::bst_ulong size) {
|
||||||
|
API_BEGIN();
|
||||||
|
CHECK_HANDLE();
|
||||||
|
auto *learner = static_cast<Learner *>(handle);
|
||||||
|
std::vector<std::string> feature_info;
|
||||||
|
for (size_t i = 0; i < size; ++i) {
|
||||||
|
feature_info.emplace_back(features[i]);
|
||||||
|
}
|
||||||
|
if (!std::strcmp(field, "feature_name")) {
|
||||||
|
learner->SetFeatureNames(feature_info);
|
||||||
|
} else if (!std::strcmp(field, "feature_type")) {
|
||||||
|
learner->SetFeatureTypes(feature_info);
|
||||||
|
} else {
|
||||||
|
LOG(FATAL) << "Unknown field for Booster feature info:" << field;
|
||||||
|
}
|
||||||
|
API_END();
|
||||||
|
}
|
||||||
|
|
||||||
|
XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field,
|
||||||
|
xgboost::bst_ulong *len,
|
||||||
|
const char ***out_features) {
|
||||||
|
API_BEGIN();
|
||||||
|
CHECK_HANDLE();
|
||||||
|
auto const *learner = static_cast<Learner const *>(handle);
|
||||||
|
std::vector<const char *> &charp_vecs =
|
||||||
|
learner->GetThreadLocal().ret_vec_charp;
|
||||||
|
std::vector<std::string> &str_vecs = learner->GetThreadLocal().ret_vec_str;
|
||||||
|
if (!std::strcmp(field, "feature_name")) {
|
||||||
|
learner->GetFeatureNames(&str_vecs);
|
||||||
|
} else if (!std::strcmp(field, "feature_type")) {
|
||||||
|
learner->GetFeatureTypes(&str_vecs);
|
||||||
|
} else {
|
||||||
|
LOG(FATAL) << "Unknown field for Booster feature info:" << field;
|
||||||
|
}
|
||||||
|
charp_vecs.resize(str_vecs.size());
|
||||||
|
for (size_t i = 0; i < str_vecs.size(); ++i) {
|
||||||
|
charp_vecs[i] = str_vecs[i].c_str();
|
||||||
|
}
|
||||||
|
*out_features = dmlc::BeginPtr(charp_vecs);
|
||||||
|
*len = static_cast<xgboost::bst_ulong>(charp_vecs.size());
|
||||||
|
API_END();
|
||||||
|
}
|
||||||
|
|
||||||
// force link rabit
|
// force link rabit
|
||||||
static DMLC_ATTRIBUTE_UNUSED int XGBOOST_LINK_RABIT_C_API_ = RabitLinkTag();
|
static DMLC_ATTRIBUTE_UNUSED int XGBOOST_LINK_RABIT_C_API_ = RabitLinkTag();
|
||||||
|
|||||||
@ -256,6 +256,11 @@ class LearnerConfiguration : public Learner {
|
|||||||
std::map<std::string, std::string> cfg_;
|
std::map<std::string, std::string> cfg_;
|
||||||
// Stores information like best-iteration for early stopping.
|
// Stores information like best-iteration for early stopping.
|
||||||
std::map<std::string, std::string> attributes_;
|
std::map<std::string, std::string> attributes_;
|
||||||
|
// Name of each feature, usually set from DMatrix.
|
||||||
|
std::vector<std::string> feature_names_;
|
||||||
|
// Type of each feature, usually set from DMatrix.
|
||||||
|
std::vector<std::string> feature_types_;
|
||||||
|
|
||||||
common::Monitor monitor_;
|
common::Monitor monitor_;
|
||||||
LearnerModelParamLegacy mparam_;
|
LearnerModelParamLegacy mparam_;
|
||||||
LearnerModelParam learner_model_param_;
|
LearnerModelParam learner_model_param_;
|
||||||
@ -460,6 +465,23 @@ class LearnerConfiguration : public Learner {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void SetFeatureNames(std::vector<std::string> const& fn) override {
|
||||||
|
feature_names_ = fn;
|
||||||
|
}
|
||||||
|
|
||||||
|
void GetFeatureNames(std::vector<std::string>* fn) const override {
|
||||||
|
*fn = feature_names_;
|
||||||
|
}
|
||||||
|
|
||||||
|
void SetFeatureTypes(std::vector<std::string> const& ft) override {
|
||||||
|
this->feature_types_ = ft;
|
||||||
|
}
|
||||||
|
|
||||||
|
void GetFeatureTypes(std::vector<std::string>* p_ft) const override {
|
||||||
|
auto& ft = *p_ft;
|
||||||
|
ft = this->feature_types_;
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<std::string> GetAttrNames() const override {
|
std::vector<std::string> GetAttrNames() const override {
|
||||||
std::vector<std::string> out;
|
std::vector<std::string> out;
|
||||||
for (auto const& kv : attributes_) {
|
for (auto const& kv : attributes_) {
|
||||||
@ -666,6 +688,25 @@ class LearnerIO : public LearnerConfiguration {
|
|||||||
attributes_[kv.first] = get<String const>(kv.second);
|
attributes_[kv.first] = get<String const>(kv.second);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// feature names and types are saved in xgboost 1.4
|
||||||
|
auto it = learner.find("feature_names");
|
||||||
|
if (it != learner.cend()) {
|
||||||
|
auto const &feature_names = get<Array const>(it->second);
|
||||||
|
feature_names_.clear();
|
||||||
|
for (auto const &name : feature_names) {
|
||||||
|
feature_names_.emplace_back(get<String const>(name));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
it = learner.find("feature_types");
|
||||||
|
if (it != learner.cend()) {
|
||||||
|
auto const &feature_types = get<Array const>(it->second);
|
||||||
|
feature_types_.clear();
|
||||||
|
for (auto const &name : feature_types) {
|
||||||
|
auto type = get<String const>(name);
|
||||||
|
feature_types_.emplace_back(type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
this->need_configuration_ = true;
|
this->need_configuration_ = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -691,6 +732,17 @@ class LearnerIO : public LearnerConfiguration {
|
|||||||
for (auto const& kv : attributes_) {
|
for (auto const& kv : attributes_) {
|
||||||
learner["attributes"][kv.first] = String(kv.second);
|
learner["attributes"][kv.first] = String(kv.second);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
learner["feature_names"] = Array();
|
||||||
|
auto& feature_names = get<Array>(learner["feature_names"]);
|
||||||
|
for (auto const& name : feature_names_) {
|
||||||
|
feature_names.emplace_back(name);
|
||||||
|
}
|
||||||
|
learner["feature_types"] = Array();
|
||||||
|
auto& feature_types = get<Array>(learner["feature_types"]);
|
||||||
|
for (auto const& type : feature_types_) {
|
||||||
|
feature_types.emplace_back(type);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// About to be deprecated by JSON format
|
// About to be deprecated by JSON format
|
||||||
void LoadModel(dmlc::Stream* fi) override {
|
void LoadModel(dmlc::Stream* fi) override {
|
||||||
|
|||||||
@ -385,7 +385,7 @@ class JsonGenerator : public TreeGenerator {
|
|||||||
std::string PlainNode(RegTree const& tree, int32_t nid, uint32_t depth) const override {
|
std::string PlainNode(RegTree const& tree, int32_t nid, uint32_t depth) const override {
|
||||||
auto cond = tree[nid].SplitCond();
|
auto cond = tree[nid].SplitCond();
|
||||||
static std::string const kNodeTemplate =
|
static std::string const kNodeTemplate =
|
||||||
R"I( "nodeid": {nid}, "depth": {depth}, "split": {fname}, )I"
|
R"I( "nodeid": {nid}, "depth": {depth}, "split": "{fname}", )I"
|
||||||
R"I("split_condition": {cond}, "yes": {left}, "no": {right}, )I"
|
R"I("split_condition": {cond}, "yes": {left}, "no": {right}, )I"
|
||||||
R"I("missing": {missing})I";
|
R"I("missing": {missing})I";
|
||||||
return SplitNodeImpl(tree, nid, kNodeTemplate, SuperT::ToStr(cond), depth);
|
return SplitNodeImpl(tree, nid, kNodeTemplate, SuperT::ToStr(cond), depth);
|
||||||
|
|||||||
@ -360,4 +360,60 @@ TEST(Learner, ConstantSeed) {
|
|||||||
CHECK_EQ(v_0, v_2);
|
CHECK_EQ(v_0, v_2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(Learner, FeatureInfo) {
|
||||||
|
size_t constexpr kCols = 10;
|
||||||
|
auto m = RandomDataGenerator{10, kCols, 0}.GenerateDMatrix(true);
|
||||||
|
std::vector<std::string> names(kCols);
|
||||||
|
for (size_t i = 0; i < kCols; ++i) {
|
||||||
|
names[i] = ("f" + std::to_string(i));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> types(kCols);
|
||||||
|
for (size_t i = 0; i < kCols; ++i) {
|
||||||
|
types[i] = "q";
|
||||||
|
}
|
||||||
|
types[8] = "f";
|
||||||
|
types[0] = "int";
|
||||||
|
types[3] = "i";
|
||||||
|
types[7] = "i";
|
||||||
|
|
||||||
|
std::vector<char const*> c_names(kCols);
|
||||||
|
for (size_t i = 0; i < names.size(); ++i) {
|
||||||
|
c_names[i] = names[i].c_str();
|
||||||
|
}
|
||||||
|
std::vector<char const*> c_types(kCols);
|
||||||
|
for (size_t i = 0; i < types.size(); ++i) {
|
||||||
|
c_types[i] = names[i].c_str();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> out_names;
|
||||||
|
std::vector<std::string> out_types;
|
||||||
|
|
||||||
|
Json model{Object()};
|
||||||
|
{
|
||||||
|
std::unique_ptr<Learner> learner{Learner::Create({m})};
|
||||||
|
learner->Configure();
|
||||||
|
learner->SetFeatureNames(names);
|
||||||
|
learner->GetFeatureNames(&out_names);
|
||||||
|
|
||||||
|
learner->SetFeatureTypes(types);
|
||||||
|
learner->GetFeatureTypes(&out_types);
|
||||||
|
|
||||||
|
ASSERT_TRUE(std::equal(out_names.begin(), out_names.end(), names.begin()));
|
||||||
|
ASSERT_TRUE(std::equal(out_types.begin(), out_types.end(), types.begin()));
|
||||||
|
|
||||||
|
learner->SaveModel(&model);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
std::unique_ptr<Learner> learner{Learner::Create({m})};
|
||||||
|
learner->LoadModel(model);
|
||||||
|
|
||||||
|
learner->GetFeatureNames(&out_names);
|
||||||
|
learner->GetFeatureTypes(&out_types);
|
||||||
|
ASSERT_TRUE(std::equal(out_names.begin(), out_names.end(), names.begin()));
|
||||||
|
ASSERT_TRUE(std::equal(out_types.begin(), out_types.end(), types.begin()));
|
||||||
|
}
|
||||||
|
}
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|||||||
@ -217,8 +217,8 @@ class TestModels:
|
|||||||
X = np.random.random((10, 3))
|
X = np.random.random((10, 3))
|
||||||
y = np.random.randint(2, size=(10,))
|
y = np.random.randint(2, size=(10,))
|
||||||
|
|
||||||
dm1 = xgb.DMatrix(X, y)
|
dm1 = xgb.DMatrix(X, y, feature_names=("a", "b", "c"))
|
||||||
dm2 = xgb.DMatrix(X, y, feature_names=("a", "b", "c"))
|
dm2 = xgb.DMatrix(X, y)
|
||||||
|
|
||||||
bst = xgb.train([], dm1)
|
bst = xgb.train([], dm1)
|
||||||
bst.predict(dm1) # success
|
bst.predict(dm1) # success
|
||||||
@ -228,9 +228,6 @@ class TestModels:
|
|||||||
|
|
||||||
bst = xgb.train([], dm2)
|
bst = xgb.train([], dm2)
|
||||||
bst.predict(dm2) # success
|
bst.predict(dm2) # success
|
||||||
with pytest.raises(ValueError):
|
|
||||||
bst.predict(dm1)
|
|
||||||
bst.predict(dm2) # success
|
|
||||||
|
|
||||||
def test_model_binary_io(self):
|
def test_model_binary_io(self):
|
||||||
model_path = 'test_model_binary_io.bin'
|
model_path = 'test_model_binary_io.bin'
|
||||||
@ -458,3 +455,31 @@ class TestModels:
|
|||||||
merged = predt_0 + predt_1 - 0.5
|
merged = predt_0 + predt_1 - 0.5
|
||||||
single = booster[1:7].predict(dtrain, output_margin=True)
|
single = booster[1:7].predict(dtrain, output_margin=True)
|
||||||
np.testing.assert_allclose(merged, single, atol=1e-6)
|
np.testing.assert_allclose(merged, single, atol=1e-6)
|
||||||
|
|
||||||
|
@pytest.mark.skipif(**tm.no_pandas())
|
||||||
|
def test_feature_info(self):
|
||||||
|
import pandas as pd
|
||||||
|
rows = 100
|
||||||
|
cols = 10
|
||||||
|
X = rng.randn(rows, cols)
|
||||||
|
y = rng.randn(rows)
|
||||||
|
feature_names = ["test_feature_" + str(i) for i in range(cols)]
|
||||||
|
X_pd = pd.DataFrame(X, columns=feature_names)
|
||||||
|
X_pd.iloc[:, 3] = X_pd.iloc[:, 3].astype(np.int)
|
||||||
|
|
||||||
|
Xy = xgb.DMatrix(X_pd, y)
|
||||||
|
assert Xy.feature_types[3] == "int"
|
||||||
|
booster = xgb.train({}, dtrain=Xy, num_boost_round=1)
|
||||||
|
|
||||||
|
assert booster.feature_names == Xy.feature_names
|
||||||
|
assert booster.feature_names == feature_names
|
||||||
|
assert booster.feature_types == Xy.feature_types
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
path = tmpdir + "model.json"
|
||||||
|
booster.save_model(path)
|
||||||
|
booster = xgb.Booster()
|
||||||
|
booster.load_model(path)
|
||||||
|
|
||||||
|
assert booster.feature_names == Xy.feature_names
|
||||||
|
assert booster.feature_types == Xy.feature_types
|
||||||
|
|||||||
@ -95,6 +95,11 @@ eval[test] = {data_path}
|
|||||||
}
|
}
|
||||||
data = xgboost.DMatrix(data_path)
|
data = xgboost.DMatrix(data_path)
|
||||||
booster = xgboost.train(parameters, data, num_boost_round=10)
|
booster = xgboost.train(parameters, data, num_boost_round=10)
|
||||||
|
|
||||||
|
# CLI model doesn't contain feature info.
|
||||||
|
booster.feature_names = None
|
||||||
|
booster.feature_types = None
|
||||||
|
|
||||||
booster.save_model(model_out_py)
|
booster.save_model(model_out_py)
|
||||||
py_predt = booster.predict(data)
|
py_predt = booster.predict(data)
|
||||||
|
|
||||||
|
|||||||
@ -180,7 +180,7 @@ class TestDMatrix:
|
|||||||
|
|
||||||
# reset
|
# reset
|
||||||
dm.feature_names = None
|
dm.feature_names = None
|
||||||
assert dm.feature_names == ['f0', 'f1', 'f2', 'f3', 'f4']
|
assert dm.feature_names is None
|
||||||
assert dm.feature_types is None
|
assert dm.feature_types is None
|
||||||
|
|
||||||
def test_feature_names(self):
|
def test_feature_names(self):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user