Move feature names and types of DMatrix from Python to C++. (#5858)
* Add thread local return entry for DMatrix.
* Save feature name and feature type in binary file.

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>

parent 4b0852ee41
commit 93c44a9a64
@@ -415,6 +415,74 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle,
                                 const unsigned *array,
                                 bst_ulong len);

/*!
 * \brief Set string encoded information of all features.
 *
 * Accepted fields are:
 * - feature_name
 * - feature_type
 *
 * \param handle   An instance of data matrix
 * \param field    Field name
 * \param features Pointer to array of strings.
 * \param size     Size of `features` pointer (number of strings passed in).
 *
 * \return 0 when success, -1 when failure happens
 *
 * \code
 *
 *   char const* feat_names [] {"feat_0", "feat_1"};
 *   XGDMatrixSetStrFeatureInfo(handle, "feature_name", feat_names, 2);
 *
 *   // i for integer, q for quantitative.  Similarly "int" and "float" are also recognized.
 *   char const* feat_types [] {"i", "q"};
 *   XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, 2);
 *
 * \endcode
 */
XGB_DLL int XGDMatrixSetStrFeatureInfo(DMatrixHandle handle, const char *field,
                                       const char **features,
                                       const bst_ulong size);

/*!
 * \brief Get string encoded information of all features.
 *
 * Accepted fields are:
 * - feature_name
 * - feature_type
 *
 * Caller is responsible for copying out the data, before next call to any API function of
 * XGBoost.
 *
 * \param handle       An instance of data matrix
 * \param field        Field name
 * \param size         Size of output pointer `features` (number of strings returned).
 * \param out_features Address of a pointer to array of strings.  Result is stored in
 *                     thread local memory.
 *
 * \return 0 when success, -1 when failure happens
 *
 * \code
 *
 *  char const **c_out_features = NULL;
 *  bst_ulong out_size = 0;
 *
 *  // Assuming the feature names are already set by `XGDMatrixSetStrFeatureInfo`.
 *  XGDMatrixGetStrFeatureInfo(handle, "feature_name", &out_size,
 *                             &c_out_features);
 *
 *  for (bst_ulong i = 0; i < out_size; ++i) {
 *    // Here we are simply printing the string.  Copy it out if the feature name is
 *    // useful after printing.
 *    printf("feature %lu: %s\n", i, c_out_features[i]);
 *  }
 *
 * \endcode
 */
XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
                                       bst_ulong *size,
                                       const char ***out_features);
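As a supplement to the doc comments above, here is a minimal caller-side sketch (not part of the header) showing the two functions used together from C++. It assumes a valid DMatrixHandle named `handle` with two columns, omits checking of the return codes, and copies the result into std::string because the returned pointers live in thread-local memory owned by the DMatrix and may be overwritten by the next API call.

#include <string>
#include <vector>
#include <xgboost/c_api.h>

std::vector<std::string> SetAndCopyFeatureNames(DMatrixHandle handle) {
  // Set two feature names on a 2-column matrix.
  char const *names[] = {"feat_0", "feat_1"};
  XGDMatrixSetStrFeatureInfo(handle, "feature_name", names, 2);

  // Read them back; `ret` points into thread-local storage.
  bst_ulong n = 0;
  char const **ret = nullptr;
  XGDMatrixGetStrFeatureInfo(handle, "feature_name", &n, &ret);

  // Deep-copy before making any further XGBoost API call.
  return std::vector<std::string>(ret, ret + n);
}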
/*!
 * \brief (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix
 * \param handle an instance of data matrix
@@ -575,8 +643,9 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
 *
 * - Functions with the term "Model" handle saving/loading XGBoost models like trees or
 *   linear weights. Stripping out parameter configuration like training algorithms or
 *   CUDA device ID helps users reuse the trained model for different tasks, examples
 *   are prediction, training continuation or interpretation.
 *   CUDA device ID. These functions are designed to let users reuse the trained model
 *   for different tasks, examples are prediction, training continuation or model
 *   interpretation.
 *
 * - Functions with the term "Config" handle saving/loading configuration. It helps users
 *   study the internals of XGBoost. Also users can use the load method for specifying
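A small illustrative sketch of the "Model" convention described above (not from this commit): the saved file keeps only the trained trees or linear weights, so hyper-parameters have to be set again on the loaded booster. It assumes an already trained BoosterHandle `booster` and a DMatrixHandle `dtrain`; the file name and the "eta" value are placeholders, and error checking of return codes is omitted.

#include <xgboost/c_api.h>

void SaveAndResume(BoosterHandle booster, DMatrixHandle dtrain) {
  // Persist only the model (trees / linear weights); training parameters and
  // device configuration are stripped out.
  XGBoosterSaveModel(booster, "model.json");

  // Reload into a fresh booster, e.g. for prediction or training continuation.
  BoosterHandle resumed;
  XGBoosterCreate(&dtrain, 1, &resumed);
  XGBoosterLoadModel(resumed, "model.json");

  // Because the configuration is not part of the model file, re-specify the
  // parameters before continuing training.
  XGBoosterSetParam(resumed, "eta", "0.1");
}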
@@ -592,7 +661,7 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
/*!
 * \brief Load model from existing file
 * \param handle handle
 * \param fname file name
 * \param fname File URI or file name.
 * \return 0 when success, -1 when failure happens
 */
XGB_DLL int XGBoosterLoadModel(BoosterHandle handle,
@@ -600,7 +669,7 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle,
/*!
 * \brief Save model into existing file
 * \param handle handle
 * \param fname file name
 * \param fname File URI or file name.
 * \return 0 when success, -1 when failure happens
 */
XGB_DLL int XGBoosterSaveModel(BoosterHandle handle,

@@ -31,7 +31,12 @@ enum class DataType : uint8_t {
  kFloat32 = 1,
  kDouble = 2,
  kUInt32 = 3,
  kUInt64 = 4
  kUInt64 = 4,
  kStr = 5
};

enum class FeatureType : uint8_t {
  kNumerical
};

/*!
@@ -40,7 +45,7 @@ enum class DataType : uint8_t {
class MetaInfo {
 public:
  /*! \brief number of data fields in MetaInfo */
  static constexpr uint64_t kNumField = 9;
  static constexpr uint64_t kNumField = 11;

  /*! \brief number of rows in the data */
  uint64_t num_row_{0};  // NOLINT
@@ -72,6 +77,19 @@ class MetaInfo {
   */
  HostDeviceVector<bst_float> labels_upper_bound_;  // NOLINT

  /*!
   * \brief Name of type for each feature provided by users. E.g. "int"/"float"/"i"/"q"
   */
  std::vector<std::string> feature_type_names;
  /*!
   * \brief Name for each feature.
   */
  std::vector<std::string> feature_names;
  /*
   * \brief Type of each feature.  Automatically set when feature_type_names is specified.
   */
  HostDeviceVector<FeatureType> feature_types;

  /*! \brief default constructor */
  MetaInfo() = default;
  MetaInfo(MetaInfo&& that) = default;
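To make the relationship between the three new members concrete, here is a small stand-alone sketch (illustrative only; MakeFeatureTypes is a hypothetical helper, not library code) of how the user-supplied type strings are turned into FeatureType values. At this point every accepted string resolves to kNumerical, mirroring the LoadFeatureType helper added in src/data/data.cc later in this diff.

#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

enum class FeatureType : uint8_t { kNumerical };

std::vector<FeatureType> MakeFeatureTypes(std::vector<std::string> const &type_names) {
  std::vector<FeatureType> types;
  for (auto const &name : type_names) {
    if (name == "int" || name == "float" || name == "i" || name == "q") {
      types.push_back(FeatureType::kNumerical);  // all currently accepted types are numerical
    } else {
      throw std::invalid_argument("All feature_types must be {int, float, i, q}");
    }
  }
  return types;
}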
@@ -158,6 +176,12 @@ class MetaInfo {
   */
  void SetInfo(const char* key, std::string const& interface_str);

  void GetInfo(char const* key, bst_ulong* out_len, DataType dtype,
               const void** out_dptr) const;

  void SetFeatureInfo(const char *key, const char **info, const bst_ulong size);
  void GetFeatureInfo(const char *field, std::vector<std::string>* out_str_vecs) const;

  /*
   * \brief Extend with other MetaInfo.
   *
@@ -432,6 +456,8 @@ class BatchSet {
  BatchIterator<T> begin_iter_;
};

struct XGBAPIThreadLocalEntry;

/*!
 * \brief Internal data structure used by XGBoost during training.
 */
@@ -450,6 +476,10 @@ class DMatrix {
  }
  /*! \brief meta information of the dataset */
  virtual const MetaInfo& Info() const = 0;

  /*! \brief Get thread local memory for returning data from DMatrix. */
  XGBAPIThreadLocalEntry& GetThreadLocal() const;

  /**
   * \brief Gets batches. Use range based for loop over BatchSet to access individual batches.
   */
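GetThreadLocal is what backs the string-returning C API calls: each DMatrix gets per-thread scratch buffers, so pointers handed out through the C API stay valid until the same thread makes another call, and two threads never share a buffer. A minimal self-contained sketch of the idea (illustrative only; the actual implementation in src/data/data.cc later in this diff uses dmlc::ThreadLocalStore keyed by the DMatrix pointer, and the destructor erases the current thread's entry):

#include <map>
#include <string>
#include <vector>

// Stand-in for XGBAPIThreadLocalEntry: scratch space reused across calls made
// by one thread on one object.
struct ScratchEntry {
  std::vector<std::string> ret_vec_str;     // owns the returned strings
  std::vector<const char *> ret_vec_charp;  // pointers handed back to the caller
};

class Holder {
 public:
  ScratchEntry &GetThreadLocal() const {
    // One map per thread, keyed by object address: per-(thread, object) buffers.
    thread_local std::map<const Holder *, ScratchEntry> store;
    return store[this];
  }
};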
@@ -462,7 +492,7 @@ class DMatrix {
  /*! \return Whether the data is stored in a single column block. */
  virtual bool SingleColBlock() const = 0;
  /*! \brief virtual destructor */
  virtual ~DMatrix() = default;
  virtual ~DMatrix();

  /*! \brief Whether the matrix is dense. */
  bool IsDense() const {
@@ -305,12 +305,9 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes

    DMatrix is an internal data structure that is used by XGBoost,
    which is optimized for both memory efficiency and training speed.
    You can construct DMatrix from numpy.arrays
    You can construct DMatrix from multiple different sources of data.
    """

    _feature_names = None  # for previous version's pickle
    _feature_types = None

    def __init__(self, data, label=None, weight=None, base_margin=None,
                 missing=None,
                 silent=False,
@@ -362,11 +359,6 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
        # force into void_p, mac need to pass things in as void_p
        if data is None:
            self.handle = None

            if feature_names is not None:
                self._feature_names = feature_names
            if feature_types is not None:
                self._feature_types = feature_types
            return

        handler = self.get_data_handler(data)
@@ -666,14 +658,16 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
        res : DMatrix
            A new DMatrix containing only selected indices.
        """
        res = DMatrix(None, feature_names=self.feature_names,
                      feature_types=self.feature_types)
        res = DMatrix(None)
        res.handle = ctypes.c_void_p()
        _check_call(_LIB.XGDMatrixSliceDMatrixEx(self.handle,
                                                 c_array(ctypes.c_int, rindex),
                                                 c_bst_ulong(len(rindex)),
                                                 ctypes.byref(res.handle),
                                                 ctypes.c_int(1 if allow_groups else 0)))
        _check_call(_LIB.XGDMatrixSliceDMatrixEx(
            self.handle,
            c_array(ctypes.c_int, rindex),
            c_bst_ulong(len(rindex)),
            ctypes.byref(res.handle),
            ctypes.c_int(1 if allow_groups else 0)))
        res.feature_names = self.feature_names
        res.feature_types = self.feature_types
        return res

    @property
@@ -684,20 +678,17 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
        -------
        feature_names : list or None
        """
        if self._feature_names is None:
            self._feature_names = ['f{0}'.format(i)
                                   for i in range(self.num_col())]
        return self._feature_names

    @property
    def feature_types(self):
        """Get feature types (column types).

        Returns
        -------
        feature_types : list or None
        """
        return self._feature_types
        length = c_bst_ulong()
        sarr = ctypes.POINTER(ctypes.c_char_p)()
        _check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle,
                                                    c_str('feature_name'),
                                                    ctypes.byref(length),
                                                    ctypes.byref(sarr)))
        feature_names = from_cstr_to_pystr(sarr, length)
        if not feature_names:
            feature_names = ['f{0}'.format(i)
                             for i in range(self.num_col())]
        return feature_names

    @feature_names.setter
    def feature_names(self, feature_names):
@@ -728,10 +719,41 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
                       not any(x in f for x in set(('[', ']', '<')))
                       for f in feature_names):
                raise ValueError('feature_names must be string, and may not contain [, ] or <')
            c_feature_names = [bytes(f, encoding='utf-8')
                               for f in feature_names]
            c_feature_names = (ctypes.c_char_p *
                               len(c_feature_names))(*c_feature_names)
            _check_call(_LIB.XGDMatrixSetStrFeatureInfo(
                self.handle, c_str('feature_name'),
                c_feature_names,
                c_bst_ulong(len(feature_names))))
        else:
            # reset feature_types also
            _check_call(_LIB.XGDMatrixSetStrFeatureInfo(
                self.handle,
                c_str('feature_name'),
                None,
                c_bst_ulong(0)))
            self.feature_types = None
        self._feature_names = feature_names

    @property
    def feature_types(self):
        """Get feature types (column types).

        Returns
        -------
        feature_types : list or None
        """
        length = c_bst_ulong()
        sarr = ctypes.POINTER(ctypes.c_char_p)()
        _check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle,
                                                    c_str('feature_type'),
                                                    ctypes.byref(length),
                                                    ctypes.byref(sarr)))
        res = from_cstr_to_pystr(sarr, length)
        if not res:
            return None
        return res

    @feature_types.setter
    def feature_types(self, feature_types):
@@ -746,14 +768,12 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
            Labels for features. None will reset existing feature names
        """
        if feature_types is not None:
            if self._feature_names is None:
                msg = 'Unable to set feature types before setting names'
                raise ValueError(msg)

            if not isinstance(feature_types, (list, str)):
                raise TypeError(
                    'feature_types must be string or list of strings')
            if isinstance(feature_types, STRING_TYPES):
                # single string will be applied to all columns
                feature_types = [feature_types] * self.num_col()

            try:
                if not isinstance(feature_types, str):
                    feature_types = list(feature_types)
@@ -761,16 +781,25 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
                    feature_types = [feature_types]
            except TypeError:
                feature_types = [feature_types]
            c_feature_types = [bytes(f, encoding='utf-8')
                               for f in feature_types]
            c_feature_types = (ctypes.c_char_p *
                               len(c_feature_types))(*c_feature_types)
            _check_call(_LIB.XGDMatrixSetStrFeatureInfo(
                self.handle, c_str('feature_type'),
                c_feature_types,
                c_bst_ulong(len(feature_types))))

            if len(feature_types) != self.num_col():
                msg = 'feature_types must have the same length as data'
                raise ValueError(msg)

            valid = ('int', 'float', 'i', 'q')
            if not all(isinstance(f, STRING_TYPES) and f in valid
                       for f in feature_types):
                raise ValueError('All feature_names must be {int, float, i, q}')
            self._feature_types = feature_types
        else:
            # Reset.
            _check_call(_LIB.XGDMatrixSetStrFeatureInfo(
                self.handle,
                c_str('feature_type'),
                None,
                c_bst_ulong(0)))


class DeviceQuantileDMatrix(DMatrix):

@@ -372,7 +372,7 @@ class DTHandler(DataHandler):
            raise ValueError(
                'DataTable has own feature types, cannot pass them in.')
        feature_types = np.vectorize(self.dt_type_mapper2.get)(
            data_types_names)
            data_types_names).tolist()

        return data, feature_names, feature_types
@@ -283,6 +283,38 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle,
  API_END();
}

XGB_DLL int XGDMatrixSetStrFeatureInfo(DMatrixHandle handle, const char *field,
                                       const char **c_info,
                                       const xgboost::bst_ulong size) {
  API_BEGIN();
  CHECK_HANDLE();
  auto &info = static_cast<std::shared_ptr<DMatrix> *>(handle)->get()->Info();
  info.SetFeatureInfo(field, c_info, size);
  API_END();
}

XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
                                       xgboost::bst_ulong *len,
                                       const char ***out_features) {
  API_BEGIN();
  CHECK_HANDLE();
  auto m = *static_cast<std::shared_ptr<DMatrix>*>(handle);
  auto &info = static_cast<std::shared_ptr<DMatrix> *>(handle)->get()->Info();

  std::vector<const char *> &charp_vecs = m->GetThreadLocal().ret_vec_charp;
  std::vector<std::string> &str_vecs = m->GetThreadLocal().ret_vec_str;

  info.GetFeatureInfo(field, &str_vecs);

  charp_vecs.resize(str_vecs.size());
  for (size_t i = 0; i < str_vecs.size(); ++i) {
    charp_vecs[i] = str_vecs[i].c_str();
  }
  *out_features = dmlc::BeginPtr(charp_vecs);
  *len = static_cast<xgboost::bst_ulong>(charp_vecs.size());
  API_END();
}

XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle,
                              const unsigned* group,
                              xgboost::bst_ulong len) {
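A caller-side caveat that follows from the implementation above (illustrative sketch, not part of the commit): both fields share the same per-thread ret_vec_str / ret_vec_charp buffers, so the pointers returned by the first call are invalidated by the second one and must be copied out in between.

#include <string>
#include <vector>
#include <xgboost/c_api.h>

void ReadNamesAndTypes(DMatrixHandle handle) {
  bst_ulong n_names = 0, n_types = 0;
  char const **names = nullptr, **types = nullptr;

  XGDMatrixGetStrFeatureInfo(handle, "feature_name", &n_names, &names);
  std::vector<std::string> name_copy(names, names + n_names);  // copy first

  // Reuses the same thread-local buffers; `names` must not be used after this.
  XGDMatrixGetStrFeatureInfo(handle, "feature_type", &n_types, &types);
  std::vector<std::string> type_copy(types, types + n_types);
}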
@@ -301,22 +333,7 @@ XGB_DLL int XGDMatrixGetFloatInfo(const DMatrixHandle handle,
  API_BEGIN();
  CHECK_HANDLE();
  const MetaInfo& info = static_cast<std::shared_ptr<DMatrix>*>(handle)->get()->Info();
  const std::vector<bst_float>* vec = nullptr;
  if (!std::strcmp(field, "label")) {
    vec = &info.labels_.HostVector();
  } else if (!std::strcmp(field, "weight")) {
    vec = &info.weights_.HostVector();
  } else if (!std::strcmp(field, "base_margin")) {
    vec = &info.base_margin_.HostVector();
  } else if (!std::strcmp(field, "label_lower_bound")) {
    vec = &info.labels_lower_bound_.HostVector();
  } else if (!std::strcmp(field, "label_upper_bound")) {
    vec = &info.labels_upper_bound_.HostVector();
  } else {
    LOG(FATAL) << "Unknown float field name " << field;
  }
  *out_len = static_cast<xgboost::bst_ulong>(vec->size());  // NOLINT
  *out_dptr = dmlc::BeginPtr(*vec);
  info.GetInfo(field, out_len, DataType::kFloat32, reinterpret_cast<void const**>(out_dptr));
  API_END();
}

@@ -327,14 +344,7 @@ XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
  API_BEGIN();
  CHECK_HANDLE();
  const MetaInfo& info = static_cast<std::shared_ptr<DMatrix>*>(handle)->get()->Info();
  const std::vector<unsigned>* vec = nullptr;
  if (!std::strcmp(field, "group_ptr")) {
    vec = &info.group_ptr_;
  } else {
    LOG(FATAL) << "Unknown uint field name " << field;
  }
  *out_len = static_cast<xgboost::bst_ulong>(vec->size());
  *out_dptr = dmlc::BeginPtr(*vec);
  info.GetInfo(field, out_len, DataType::kUInt32, reinterpret_cast<void const**>(out_dptr));
  API_END();
}
@@ -171,6 +171,8 @@ void HostDeviceVector<T>::SetDevice(int device) const {}
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<int32_t>;   // bst_node_t
template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>;  // bst_row_t
template class HostDeviceVector<uint32_t>;  // bst_feature_t

@@ -398,6 +398,7 @@ template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<int32_t>;   // bst_node_t
template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>;  // bst_row_t
template class HostDeviceVector<uint32_t>;  // bst_feature_t

src/data/data.cc
@@ -11,6 +11,7 @@
#include "xgboost/host_device_vector.h"
#include "xgboost/logging.h"
#include "xgboost/version_config.h"
#include "xgboost/learner.h"
#include "sparse_page_writer.h"
#include "simple_dmatrix.h"

@@ -148,8 +149,10 @@ void MetaInfo::Clear() {
 * | group_ptr          | kUInt32  | False | ${size} | 1 | ${group_ptr_}           |
 * | weights            | kFloat32 | False | ${size} | 1 | ${weights_}             |
 * | base_margin        | kFloat32 | False | ${size} | 1 | ${base_margin_}         |
 * | labels_lower_bound | kFloat32 | False | ${size} | 1 | ${labels_lower_bound__} |
 * | labels_upper_bound | kFloat32 | False | ${size} | 1 | ${labels_upper_bound__} |
 * | labels_lower_bound | kFloat32 | False | ${size} | 1 | ${labels_lower_bound_}  |
 * | labels_upper_bound | kFloat32 | False | ${size} | 1 | ${labels_upper_bound_}  |
 * | feature_names      | kStr     | False | ${size} | 1 | ${feature_names}        |
 * | feature_types      | kStr     | False | ${size} | 1 | ${feature_types}        |
 *
 * Note that the scalar fields (is_scalar=True) will have num_row and num_col missing.
 * Also notice the difference between the saved name and the name used in `SetInfo':
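A short sketch of the round trip that this table documents (illustrative, mirroring the SaveLoadBinary unit test further down; error handling omitted): feature names set through SetFeatureInfo are written as kStr fields by SaveBinary and restored by LoadBinary, which also re-derives feature_types from the loaded type names.

#include <memory>
#include <string>
#include <vector>
#include <dmlc/io.h>
#include <xgboost/data.h>

void RoundTripFeatureInfo(std::string const &path) {
  xgboost::MetaInfo info;
  info.num_col_ = 2;  // SetFeatureInfo checks the size against num_col_
  std::vector<std::string> names{"feat_0", "feat_1"};
  std::vector<char const *> c_names{names[0].c_str(), names[1].c_str()};
  info.SetFeatureInfo("feature_name", c_names.data(), c_names.size());

  {
    std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(path.c_str(), "w"));
    info.SaveBinary(fo.get());
  }

  xgboost::MetaInfo loaded;
  std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(path.c_str(), "r"));
  loaded.LoadBinary(fi.get());
  // loaded.feature_names now holds {"feat_0", "feat_1"}.
}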
@@ -177,9 +180,31 @@ void MetaInfo::SaveBinary(dmlc::Stream *fo) const {
  SaveVectorField(fo, u8"labels_upper_bound", DataType::kFloat32,
                  {labels_upper_bound_.Size(), 1}, labels_upper_bound_); ++field_cnt;

  SaveVectorField(fo, u8"feature_names", DataType::kStr,
                  {feature_names.size(), 1}, feature_names); ++field_cnt;
  SaveVectorField(fo, u8"feature_types", DataType::kStr,
                  {feature_type_names.size(), 1}, feature_type_names); ++field_cnt;

  CHECK_EQ(field_cnt, kNumField) << "Wrong number of fields";
}

void LoadFeatureType(std::vector<std::string> const& type_names, std::vector<FeatureType>* types) {
  types->clear();
  for (auto const &elem : type_names) {
    if (elem == "int") {
      types->emplace_back(FeatureType::kNumerical);
    } else if (elem == "float") {
      types->emplace_back(FeatureType::kNumerical);
    } else if (elem == "i") {
      types->emplace_back(FeatureType::kNumerical);
    } else if (elem == "q") {
      types->emplace_back(FeatureType::kNumerical);
    } else {
      LOG(FATAL) << "All feature_types must be {int, float, i, q}";
    }
  }
}

void MetaInfo::LoadBinary(dmlc::Stream *fi) {
  auto version = Version::Load(fi);
  auto major = std::get<0>(version);
@@ -193,11 +218,20 @@ void MetaInfo::LoadBinary(dmlc::Stream *fi) {
  const uint64_t expected_num_field = kNumField;
  uint64_t num_field { 0 };
  CHECK(fi->Read(&num_field)) << "MetaInfo: invalid format";
  CHECK_GE(num_field, expected_num_field)
      << "MetaInfo: insufficient number of fields (expected at least " << expected_num_field
      << " fields, but the binary file only contains " << num_field << "fields.)";
  size_t expected = 0;
  if (major == 1 && std::get<1>(version) < 2) {
    // feature names and types are added in 1.2
    expected = expected_num_field - 2;
  } else {
    expected = expected_num_field;
  }
  CHECK_GE(num_field, expected)
      << "MetaInfo: insufficient number of fields (expected at least "
      << expected << " fields, but the binary file only contains " << num_field
      << "fields.)";
  if (num_field > expected_num_field) {
    LOG(WARNING) << "MetaInfo: the given binary file contains extra fields which will be ignored.";
    LOG(WARNING) << "MetaInfo: the given binary file contains extra fields "
                    "which will be ignored.";
  }

  LoadScalarField(fi, u8"num_row", DataType::kUInt64, &num_row_);
@@ -209,6 +243,10 @@ void MetaInfo::LoadBinary(dmlc::Stream *fi) {
  LoadVectorField(fi, u8"base_margin", DataType::kFloat32, &base_margin_);
  LoadVectorField(fi, u8"labels_lower_bound", DataType::kFloat32, &labels_lower_bound_);
  LoadVectorField(fi, u8"labels_upper_bound", DataType::kFloat32, &labels_upper_bound_);

  LoadVectorField(fi, u8"feature_names", DataType::kStr, &feature_names);
  LoadVectorField(fi, u8"feature_types", DataType::kStr, &feature_type_names);
  LoadFeatureType(feature_type_names, &feature_types.HostVector());
}

template <typename T>
@@ -344,6 +382,76 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t
  }
}

void MetaInfo::GetInfo(char const *key, bst_ulong *out_len, DataType dtype,
                       const void **out_dptr) const {
  if (dtype == DataType::kFloat32) {
    const std::vector<bst_float>* vec = nullptr;
    if (!std::strcmp(key, "label")) {
      vec = &this->labels_.HostVector();
    } else if (!std::strcmp(key, "weight")) {
      vec = &this->weights_.HostVector();
    } else if (!std::strcmp(key, "base_margin")) {
      vec = &this->base_margin_.HostVector();
    } else if (!std::strcmp(key, "label_lower_bound")) {
      vec = &this->labels_lower_bound_.HostVector();
    } else if (!std::strcmp(key, "label_upper_bound")) {
      vec = &this->labels_upper_bound_.HostVector();
    } else {
      LOG(FATAL) << "Unknown float field name: " << key;
    }
    *out_len = static_cast<xgboost::bst_ulong>(vec->size());  // NOLINT
    *reinterpret_cast<float const**>(out_dptr) = dmlc::BeginPtr(*vec);
  } else if (dtype == DataType::kUInt32) {
    const std::vector<unsigned> *vec = nullptr;
    if (!std::strcmp(key, "group_ptr")) {
      vec = &this->group_ptr_;
    } else {
      LOG(FATAL) << "Unknown uint32 field name: " << key;
    }
    *out_len = static_cast<xgboost::bst_ulong>(vec->size());
    *reinterpret_cast<unsigned const**>(out_dptr) = dmlc::BeginPtr(*vec);
  } else {
    LOG(FATAL) << "Unknown data type for getting meta info.";
  }
}

void MetaInfo::SetFeatureInfo(const char* key, const char **info, const bst_ulong size) {
  if (size != 0) {
    CHECK_EQ(size, this->num_col_)
        << "Length of " << key << " must be equal to number of columns.";
  }
  if (!std::strcmp(key, "feature_type")) {
    feature_type_names.clear();
    auto& h_feature_types = feature_types.HostVector();
    for (size_t i = 0; i < size; ++i) {
      auto elem = info[i];
      feature_type_names.emplace_back(elem);
    }
    LoadFeatureType(feature_type_names, &h_feature_types);
  } else if (!std::strcmp(key, "feature_name")) {
    feature_names.clear();
    for (size_t i = 0; i < size; ++i) {
      feature_names.emplace_back(info[i]);
    }
  } else {
    LOG(FATAL) << "Unknown feature info name: " << key;
  }
}

void MetaInfo::GetFeatureInfo(const char *field,
                              std::vector<std::string> *out_str_vecs) const {
  auto &str_vecs = *out_str_vecs;
  if (!std::strcmp(field, "feature_type")) {
    str_vecs.resize(feature_type_names.size());
    std::copy(feature_type_names.cbegin(), feature_type_names.cend(), str_vecs.begin());
  } else if (!strcmp(field, "feature_name")) {
    str_vecs.resize(feature_names.size());
    std::copy(feature_names.begin(), feature_names.end(), str_vecs.begin());
  } else {
    LOG(FATAL) << "Unknown feature info: " << field;
  }
}

void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows) {
  if (accumulate_rows) {
    this->num_row_ += that.num_row_;
@@ -441,6 +549,20 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
}
#endif  // !defined(XGBOOST_USE_CUDA)

using DMatrixThreadLocal =
    dmlc::ThreadLocalStore<std::map<DMatrix const *, XGBAPIThreadLocalEntry>>;

XGBAPIThreadLocalEntry& DMatrix::GetThreadLocal() const {
  return (*DMatrixThreadLocal::Get())[this];
}

DMatrix::~DMatrix() {
  auto local_map = DMatrixThreadLocal::Get();
  if (local_map->find(this) != local_map->cend()) {
    local_map->erase(this);
  }
}

DMatrix* DMatrix::Load(const std::string& uri,
                       bool silent,
                       bool load_row_split,
@@ -202,7 +202,7 @@ void GenericParameter::ConfigureGpuId(bool require_gpu) {
#endif  // defined(XGBOOST_USE_CUDA)
}

using XGBAPIThreadLocalStore =
using LearnerAPIThreadLocalStore =
    dmlc::ThreadLocalStore<std::map<Learner const *, XGBAPIThreadLocalEntry>>;

class LearnerConfiguration : public Learner {
@@ -895,7 +895,7 @@ class LearnerImpl : public LearnerIO {
  explicit LearnerImpl(std::vector<std::shared_ptr<DMatrix> > cache)
      : LearnerIO{cache} {}
  ~LearnerImpl() override {
    auto local_map = XGBAPIThreadLocalStore::Get();
    auto local_map = LearnerAPIThreadLocalStore::Get();
    if (local_map->find(this) != local_map->cend()) {
      local_map->erase(this);
    }
@@ -1023,7 +1023,7 @@ class LearnerImpl : public LearnerIO {
  }

  XGBAPIThreadLocalEntry& GetThreadLocal() const override {
    return (*XGBAPIThreadLocalStore::Get())[this];
    return (*LearnerAPIThreadLocalStore::Get())[this];
  }

  void InplacePredict(dmlc::any const &x, std::string const &type,
@@ -10,7 +10,6 @@
#include "../helpers.h"
#include "../../../src/common/io.h"


TEST(CAPI, XGDMatrixCreateFromMatDT) {
  std::vector<int> col0 = {0, -1, 3};
  std::vector<float> col1 = {-4.0f, 2.0f, 0.0f};
@@ -148,4 +147,48 @@ TEST(CAPI, CatchDMLCError) {
  EXPECT_THROW({ dmlc::Stream::Create("foo", "r"); }, dmlc::Error);
}

TEST(CAPI, DMatrixSetFeatureName) {
  size_t constexpr kRows = 10;
  bst_feature_t constexpr kCols = 2;

  DMatrixHandle handle;
  std::vector<float> data(kCols * kRows, 1.5);

  XGDMatrixCreateFromMat_omp(data.data(), kRows, kCols,
                             std::numeric_limits<float>::quiet_NaN(), &handle,
                             0);
  std::vector<std::string> feature_names;
  for (bst_feature_t i = 0; i < kCols; ++i) {
    feature_names.emplace_back(std::to_string(i));
  }
  std::vector<char const*> c_feature_names;
  c_feature_names.resize(feature_names.size());
  std::transform(feature_names.cbegin(), feature_names.cend(),
                 c_feature_names.begin(),
                 [](auto const &str) { return str.c_str(); });
  XGDMatrixSetStrFeatureInfo(handle, u8"feature_name", c_feature_names.data(),
                             c_feature_names.size());
  bst_ulong out_len = 0;
  char const **c_out_features;
  XGDMatrixGetStrFeatureInfo(handle, u8"feature_name", &out_len,
                             &c_out_features);

  CHECK_EQ(out_len, kCols);
  std::vector<std::string> out_features;
  for (bst_ulong i = 0; i < out_len; ++i) {
    ASSERT_EQ(std::to_string(i), c_out_features[i]);
  }

  char const* feat_types [] {"i", "q"};
  static_assert(sizeof(feat_types) / sizeof(feat_types[0]) == kCols, "");
  XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, kCols);
  char const **c_out_types;
  XGDMatrixGetStrFeatureInfo(handle, u8"feature_type", &out_len,
                             &c_out_types);
  for (bst_ulong i = 0; i < out_len; ++i) {
    ASSERT_STREQ(feat_types[i], c_out_types[i]);
  }

  XGDMatrixFree(handle);
}
}  // namespace xgboost
@@ -39,6 +39,36 @@ TEST(MetaInfo, GetSet) {
  ASSERT_EQ(info.group_ptr_.size(), 0);
}

TEST(MetaInfo, GetSetFeature) {
  xgboost::MetaInfo info;
  EXPECT_THROW(info.SetFeatureInfo("", nullptr, 0), dmlc::Error);
  EXPECT_THROW(info.SetFeatureInfo("foo", nullptr, 0), dmlc::Error);
  EXPECT_NO_THROW(info.SetFeatureInfo("feature_name", nullptr, 0));
  EXPECT_NO_THROW(info.SetFeatureInfo("feature_type", nullptr, 0));
  ASSERT_EQ(info.feature_type_names.size(), 0);
  ASSERT_EQ(info.feature_types.Size(), 0);
  ASSERT_EQ(info.feature_names.size(), 0);

  size_t constexpr kCols = 19;
  std::vector<std::string> types(kCols, u8"float");
  std::vector<char const*> c_types(kCols);
  std::transform(types.cbegin(), types.cend(), c_types.begin(),
                 [](auto const &str) { return str.c_str(); });
  // Info has 0 column
  EXPECT_THROW(
      info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()),
      dmlc::Error);
  info.num_col_ = kCols;
  EXPECT_NO_THROW(
      info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()));

  // Test clear.
  info.SetFeatureInfo("feature_type", nullptr, 0);
  ASSERT_EQ(info.feature_type_names.size(), 0);
  ASSERT_EQ(info.feature_types.Size(), 0);
  // Other conditions are tested in `SaveLoadBinary`.
}

TEST(MetaInfo, SaveLoadBinary) {
  xgboost::MetaInfo info;
  uint64_t constexpr kRows { 64 }, kCols { 32 };
@@ -51,9 +81,22 @@ TEST(MetaInfo, SaveLoadBinary) {
  info.SetInfo("label", values.data(), xgboost::DataType::kFloat32, kRows);
  info.SetInfo("weight", values.data(), xgboost::DataType::kFloat32, kRows);
  info.SetInfo("base_margin", values.data(), xgboost::DataType::kFloat32, kRows);

  info.num_row_ = kRows;
  info.num_col_ = kCols;

  auto featname = u8"特征名";
  std::vector<std::string> types(kCols, u8"float");
  std::vector<char const*> c_types(kCols);
  std::transform(types.cbegin(), types.cend(), c_types.begin(),
                 [](auto const &str) { return str.c_str(); });
  info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size());
  std::vector<std::string> names(kCols, featname);
  std::vector<char const*> c_names(kCols);
  std::transform(names.cbegin(), names.cend(), c_names.begin(),
                 [](auto const &str) { return str.c_str(); });
  info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size());

  dmlc::TemporaryDirectory tempdir;
  const std::string tmp_file = tempdir.path + "/metainfo.binary";
  {
@@ -80,6 +123,23 @@ TEST(MetaInfo, SaveLoadBinary) {
  EXPECT_EQ(inforead.group_ptr_, info.group_ptr_);
  EXPECT_EQ(inforead.weights_.HostVector(), info.weights_.HostVector());
  EXPECT_EQ(inforead.base_margin_.HostVector(), info.base_margin_.HostVector());

  EXPECT_EQ(inforead.feature_type_names.size(), kCols);
  EXPECT_EQ(inforead.feature_types.Size(), kCols);
  EXPECT_TRUE(std::all_of(inforead.feature_type_names.cbegin(),
                          inforead.feature_type_names.cend(),
                          [](auto const &str) { return str == u8"float"; }));
  auto h_ft = inforead.feature_types.HostSpan();
  EXPECT_TRUE(std::all_of(h_ft.cbegin(), h_ft.cend(), [](auto f) {
    return f == xgboost::FeatureType::kNumerical;
  }));

  EXPECT_EQ(inforead.feature_names.size(), kCols);
  EXPECT_TRUE(std::all_of(inforead.feature_names.cbegin(),
                          inforead.feature_names.cend(),
                          [=](auto const& str) {
                            return str == featname;
                          }));
}
}
@@ -115,6 +115,7 @@ class TestDMatrix(unittest.TestCase):
        dm.feature_names = list('abcde')
        assert dm.feature_names == list('abcde')

        assert dm.slice([0, 1]).num_col() == dm.num_col()
        assert dm.slice([0, 1]).feature_names == dm.feature_names

        dm.feature_types = 'q'