Move feature names and types of DMatrix from Python to C++. (#5858)

* Add thread local return entry for DMatrix.
* Save feature name and feature type in binary file.

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Jiaming Yuan, 2020-07-07 09:40:13 +08:00, committed by GitHub
parent 4b0852ee41, commit 93c44a9a64
12 changed files with 451 additions and 84 deletions
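For orientation, here is a minimal end-to-end sketch of the new interface (not part of the commit; XGDMatrixCreateFromMat and XGDMatrixFree are the pre-existing C-API constructor and destructor). It sets feature names on a DMatrix and reads them back through the new thread-local return entry, copying the strings out before any further API call, as the header documentation below requires:

#include <cstdio>
#include <limits>
#include <string>
#include <vector>
#include <xgboost/c_api.h>

int main() {
  // Build a small 10x2 dense DMatrix.
  std::vector<float> data(10 * 2, 1.0f);
  DMatrixHandle handle;
  XGDMatrixCreateFromMat(data.data(), 10, 2,
                         std::numeric_limits<float>::quiet_NaN(), &handle);

  // Feature names now live in the C++ MetaInfo instead of the Python wrapper.
  char const* names[] {"feat_0", "feat_1"};
  XGDMatrixSetStrFeatureInfo(handle, "feature_name", names, 2);

  bst_ulong n = 0;
  char const** out = nullptr;
  XGDMatrixGetStrFeatureInfo(handle, "feature_name", &n, &out);

  // `out` points into thread-local storage tied to this DMatrix; copy the
  // strings before the next XGBoost API call replaces them.
  std::vector<std::string> copied(out, out + n);
  for (auto const& name : copied) {
    std::printf("%s\n", name.c_str());
  }

  XGDMatrixFree(handle);
  return 0;
}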

View File

@ -415,6 +415,74 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle,
const unsigned *array,
bst_ulong len);
/*!
* \brief Set string encoded information of all features.
*
* Accepted fields are:
* - feature_name
* - feature_type
*
* \param handle An instance of data matrix
* \param field Field name
* \param features Pointer to array of strings.
* \param size Size of `features` pointer (number of strings passed in).
*
* \return 0 when success, -1 when failure happens
*
* \code
*
* char const* feat_names[] {"feat_0", "feat_1"};
* XGDMatrixSetStrFeatureInfo(handle, "feature_name", feat_names, 2);
*
* // i for integer, q for quantitative. Similarly, "int" and "float" are also recognized.
* char const* feat_types[] {"i", "q"};
* XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, 2);
*
* \endcode
*/
XGB_DLL int XGDMatrixSetStrFeatureInfo(DMatrixHandle handle, const char *field,
const char **features,
const bst_ulong size);
/*!
* \brief Get string encoded information of all features.
*
* Accepted fields are:
* - feature_name
* - feature_type
*
* Caller is responsible for copying out the data before the next call to any API
* function of XGBoost.
*
* \param handle An instance of data matrix
* \param field Field name
* \param size Size of output pointer `out_features` (number of strings returned).
* \param out_features Address of a pointer to an array of strings. Result is stored in
* thread-local memory.
*
* \return 0 when success, -1 when failure happens
*
* \code
*
* char const **c_out_features = NULL;
* bst_ulong out_size = 0;
*
* // Assuming the feature names are already set by `XGDMatrixSetStrFeatureInfo`.
* XGDMatrixGetStrFeatureInfo(handle, "feature_name", &out_size,
*                            &c_out_features);
*
* for (bst_ulong i = 0; i < out_size; ++i) {
* // Here we are simply printing the string. Copy it out if the feature name is
* // useful after printing.
* printf("feature %lu: %s\n", i, c_out_features[i]);
* }
*
* \endcode
*/
XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
bst_ulong *size,
const char ***out_features);
/*!
* \brief (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix
* \param handle an instance of data matrix
@ -575,8 +643,9 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
*
* - Functions with the term "Model" handles saving/loading XGBoost model like trees or
* linear weights. Striping out parameters configuration like training algorithms or
* CUDA device ID helps user to reuse the trained model for different tasks, examples
* are prediction, training continuation or interpretation.
* CUDA device ID. These functions are designed to let users reuse the trained model
* for different tasks, examples are prediction, training continuation or model
* interpretation.
*
* - Functions with the term "Config" handles save/loading configuration. It helps user
* to study the internal of XGBoost. Also user can use the load method for specifying
@ -592,7 +661,7 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
/*!
* \brief Load model from existing file
* \param handle handle
* \param fname File URI or file name.
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterLoadModel(BoosterHandle handle,
@ -600,7 +669,7 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle,
/*!
* \brief Save model into existing file
* \param handle handle
* \param fname File URI or file name.
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterSaveModel(BoosterHandle handle,

View File

@ -31,7 +31,12 @@ enum class DataType : uint8_t {
kFloat32 = 1,
kDouble = 2,
kUInt32 = 3,
kUInt64 = 4,
kStr = 5
};
enum class FeatureType : uint8_t {
kNumerical
};
/*!
@ -40,7 +45,7 @@ enum class DataType : uint8_t {
class MetaInfo {
public:
/*! \brief number of data fields in MetaInfo */
static constexpr uint64_t kNumField = 11;
/*! \brief number of rows in the data */
uint64_t num_row_{0}; // NOLINT
@ -72,6 +77,19 @@ class MetaInfo {
*/
HostDeviceVector<bst_float> labels_upper_bound_; // NOLINT
/*!
* \brief Name of type for each feature provided by users. E.g. "int"/"float"/"i"/"q"
*/
std::vector<std::string> feature_type_names;
/*!
* \brief Name for each feature.
*/
std::vector<std::string> feature_names;
/*!
* \brief Type of each feature. Automatically set when feature_type_names is specified.
*/
HostDeviceVector<FeatureType> feature_types;
/*! \brief default constructor */
MetaInfo() = default;
MetaInfo(MetaInfo&& that) = default;
@ -158,6 +176,12 @@ class MetaInfo {
*/
void SetInfo(const char* key, std::string const& interface_str);
void GetInfo(char const* key, bst_ulong* out_len, DataType dtype,
const void** out_dptr) const;
void SetFeatureInfo(const char *key, const char **info, const bst_ulong size);
void GetFeatureInfo(const char *field, std::vector<std::string>* out_str_vecs) const;
/*
* \brief Extend with other MetaInfo.
*
@ -432,6 +456,8 @@ class BatchSet {
BatchIterator<T> begin_iter_;
};
struct XGBAPIThreadLocalEntry;
/*!
* \brief Internal data structure used by XGBoost during training.
*/
@ -450,6 +476,10 @@ class DMatrix {
}
/*! \brief meta information of the dataset */
virtual const MetaInfo& Info() const = 0;
/*! \brief Get thread local memory for returning data from DMatrix. */
XGBAPIThreadLocalEntry& GetThreadLocal() const;
/**
* \brief Gets batches. Use range based for loop over BatchSet to access individual batches.
*/
@ -462,7 +492,7 @@ class DMatrix {
/*! \return Whether the data columns are stored in a single column block. */
virtual bool SingleColBlock() const = 0;
/*! \brief virtual destructor */
virtual ~DMatrix();
/*! \brief Whether the matrix is dense. */
bool IsDense() const {

View File

@ -305,12 +305,9 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
DMatrix is an internal data structure used by XGBoost,
which is optimized for both memory efficiency and training speed.
You can construct a DMatrix from multiple different sources of data.
"""
def __init__(self, data, label=None, weight=None, base_margin=None,
missing=None,
silent=False,
@ -362,11 +359,6 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
# force into void_p, mac need to pass things in as void_p
if data is None:
self.handle = None
return
handler = self.get_data_handler(data)
@ -666,14 +658,16 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
res : DMatrix
A new DMatrix containing only selected indices.
"""
res = DMatrix(None)
res.handle = ctypes.c_void_p()
_check_call(_LIB.XGDMatrixSliceDMatrixEx(
self.handle,
c_array(ctypes.c_int, rindex),
c_bst_ulong(len(rindex)),
ctypes.byref(res.handle),
ctypes.c_int(1 if allow_groups else 0)))
res.feature_names = self.feature_names
res.feature_types = self.feature_types
return res
@property
@ -684,20 +678,17 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
-------
feature_names : list or None
"""
length = c_bst_ulong()
sarr = ctypes.POINTER(ctypes.c_char_p)()
_check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle,
c_str('feature_name'),
ctypes.byref(length),
ctypes.byref(sarr)))
feature_names = from_cstr_to_pystr(sarr, length)
if not feature_names:
feature_names = ['f{0}'.format(i)
for i in range(self.num_col())]
return feature_names
@feature_names.setter
def feature_names(self, feature_names):
@ -728,10 +719,41 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
not any(x in f for x in set(('[', ']', '<')))
for f in feature_names):
raise ValueError('feature_names must be string, and may not contain [, ] or <')
c_feature_names = [bytes(f, encoding='utf-8')
for f in feature_names]
c_feature_names = (ctypes.c_char_p *
len(c_feature_names))(*c_feature_names)
_check_call(_LIB.XGDMatrixSetStrFeatureInfo(
self.handle, c_str('feature_name'),
c_feature_names,
c_bst_ulong(len(feature_names))))
else:
# reset feature_types also
_check_call(_LIB.XGDMatrixSetStrFeatureInfo(
self.handle,
c_str('feature_name'),
None,
c_bst_ulong(0)))
self.feature_types = None
@property
def feature_types(self):
"""Get feature types (column types).
Returns
-------
feature_types : list or None
"""
length = c_bst_ulong()
sarr = ctypes.POINTER(ctypes.c_char_p)()
_check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle,
c_str('feature_type'),
ctypes.byref(length),
ctypes.byref(sarr)))
res = from_cstr_to_pystr(sarr, length)
if not res:
return None
return res
@feature_types.setter
def feature_types(self, feature_types):
@ -746,14 +768,12 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
Labels for features. None will reset existing feature names
"""
if feature_types is not None:
if not isinstance(feature_types, (list, str)):
raise TypeError(
'feature_types must be string or list of strings')
if isinstance(feature_types, STRING_TYPES):
# single string will be applied to all columns
feature_types = [feature_types] * self.num_col()
try:
if not isinstance(feature_types, str):
feature_types = list(feature_types)
@ -761,16 +781,25 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
feature_types = [feature_types]
except TypeError:
feature_types = [feature_types]
c_feature_types = [bytes(f, encoding='utf-8')
for f in feature_types]
c_feature_types = (ctypes.c_char_p *
len(c_feature_types))(*c_feature_types)
_check_call(_LIB.XGDMatrixSetStrFeatureInfo(
self.handle, c_str('feature_type'),
c_feature_types,
c_bst_ulong(len(feature_types))))
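# NOTE: length checks and validation of the type strings now happen on
# the C++ side (MetaInfo::SetFeatureInfo / LoadFeatureType).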
else:
# Reset.
_check_call(_LIB.XGDMatrixSetStrFeatureInfo(
self.handle,
c_str('feature_type'),
None,
c_bst_ulong(0)))
class DeviceQuantileDMatrix(DMatrix):

View File

@ -372,7 +372,7 @@ class DTHandler(DataHandler):
raise ValueError(
'DataTable has own feature types, cannot pass them in.')
feature_types = np.vectorize(self.dt_type_mapper2.get)(
data_types_names).tolist()
return data, feature_names, feature_types

View File

@ -283,6 +283,38 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle,
API_END();
}
XGB_DLL int XGDMatrixSetStrFeatureInfo(DMatrixHandle handle, const char *field,
const char **c_info,
const xgboost::bst_ulong size) {
API_BEGIN();
CHECK_HANDLE();
auto &info = static_cast<std::shared_ptr<DMatrix> *>(handle)->get()->Info();
info.SetFeatureInfo(field, c_info, size);
API_END();
}
XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
xgboost::bst_ulong *len,
const char ***out_features) {
API_BEGIN();
CHECK_HANDLE();
auto m = *static_cast<std::shared_ptr<DMatrix>*>(handle);
auto &info = m->Info();
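// Stash the strings in this DMatrix's thread-local entry so the returned
// pointers stay valid until the caller makes another XGBoost API call.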
std::vector<const char *> &charp_vecs = m->GetThreadLocal().ret_vec_charp;
std::vector<std::string> &str_vecs = m->GetThreadLocal().ret_vec_str;
info.GetFeatureInfo(field, &str_vecs);
charp_vecs.resize(str_vecs.size());
for (size_t i = 0; i < str_vecs.size(); ++i) {
charp_vecs[i] = str_vecs[i].c_str();
}
*out_features = dmlc::BeginPtr(charp_vecs);
*len = static_cast<xgboost::bst_ulong>(charp_vecs.size());
API_END();
}
XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle,
const unsigned* group,
xgboost::bst_ulong len) {
@ -301,22 +333,7 @@ XGB_DLL int XGDMatrixGetFloatInfo(const DMatrixHandle handle,
API_BEGIN();
CHECK_HANDLE();
const MetaInfo& info = static_cast<std::shared_ptr<DMatrix>*>(handle)->get()->Info();
info.GetInfo(field, out_len, DataType::kFloat32, reinterpret_cast<void const**>(out_dptr));
API_END();
}
@ -327,14 +344,7 @@ XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
API_BEGIN();
CHECK_HANDLE();
const MetaInfo& info = static_cast<std::shared_ptr<DMatrix>*>(handle)->get()->Info();
info.GetInfo(field, out_len, DataType::kUInt32, reinterpret_cast<void const**>(out_dptr));
API_END();
}

View File

@ -171,6 +171,8 @@ void HostDeviceVector<T>::SetDevice(int device) const {}
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<int32_t>; // bst_node_t
template class HostDeviceVector<uint8_t>;
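// Explicit instantiation below is required by MetaInfo::feature_types.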
template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>; // bst_row_t
template class HostDeviceVector<uint32_t>; // bst_feature_t

View File

@ -398,6 +398,7 @@ template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<int32_t>; // bst_node_t
template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>; // bst_row_t
template class HostDeviceVector<uint32_t>; // bst_feature_t

View File

@ -11,6 +11,7 @@
#include "xgboost/host_device_vector.h"
#include "xgboost/logging.h"
#include "xgboost/version_config.h"
#include "xgboost/learner.h"
#include "sparse_page_writer.h"
#include "simple_dmatrix.h"
@ -148,8 +149,10 @@ void MetaInfo::Clear() {
* | group_ptr | kUInt32 | False | ${size} | 1 | ${group_ptr_} |
* | weights | kFloat32 | False | ${size} | 1 | ${weights_} |
* | base_margin | kFloat32 | False | ${size} | 1 | ${base_margin_} |
* | labels_lower_bound | kFloat32 | False | ${size} | 1 | ${labels_lower_bound_} |
* | labels_upper_bound | kFloat32 | False | ${size} | 1 | ${labels_upper_bound_} |
* | feature_names | kStr | False | ${size} | 1 | ${feature_names} |
* | feature_types | kStr | False | ${size} | 1 | ${feature_types} |
*
* Note that the scalar fields (is_scalar=True) will have num_row and num_col missing.
* Also notice the difference between the saved name and the name used in `SetInfo':
@ -177,9 +180,31 @@ void MetaInfo::SaveBinary(dmlc::Stream *fo) const {
SaveVectorField(fo, u8"labels_upper_bound", DataType::kFloat32,
{labels_upper_bound_.Size(), 1}, labels_upper_bound_); ++field_cnt;
SaveVectorField(fo, u8"feature_names", DataType::kStr,
{feature_names.size(), 1}, feature_names); ++field_cnt;
SaveVectorField(fo, u8"feature_types", DataType::kStr,
{feature_type_names.size(), 1}, feature_type_names); ++field_cnt;
CHECK_EQ(field_cnt, kNumField) << "Wrong number of fields";
}
void LoadFeatureType(std::vector<std::string> const& type_names, std::vector<FeatureType>* types) {
types->clear();
for (auto const &elem : type_names) {
if (elem == "int") {
types->emplace_back(FeatureType::kNumerical);
} else if (elem == "float") {
types->emplace_back(FeatureType::kNumerical);
} else if (elem == "i") {
types->emplace_back(FeatureType::kNumerical);
} else if (elem == "q") {
types->emplace_back(FeatureType::kNumerical);
} else {
LOG(FATAL) << "All feature_types must be {int, float, i, q}";
}
}
}
void MetaInfo::LoadBinary(dmlc::Stream *fi) {
auto version = Version::Load(fi);
auto major = std::get<0>(version);
@ -193,11 +218,20 @@ void MetaInfo::LoadBinary(dmlc::Stream *fi) {
const uint64_t expected_num_field = kNumField;
uint64_t num_field { 0 };
CHECK(fi->Read(&num_field)) << "MetaInfo: invalid format";
size_t expected = 0;
if (major == 1 && std::get<1>(version) < 2) {
// feature names and types are added in 1.2
expected = expected_num_field - 2;
} else {
expected = expected_num_field;
}
CHECK_GE(num_field, expected)
<< "MetaInfo: insufficient number of fields (expected at least "
<< expected << " fields, but the binary file only contains " << num_field
<< "fields.)";
if (num_field > expected_num_field) {
LOG(WARNING) << "MetaInfo: the given binary file contains extra fields which will be ignored.";
LOG(WARNING) << "MetaInfo: the given binary file contains extra fields "
"which will be ignored.";
}
LoadScalarField(fi, u8"num_row", DataType::kUInt64, &num_row_);
@ -209,6 +243,10 @@ void MetaInfo::LoadBinary(dmlc::Stream *fi) {
LoadVectorField(fi, u8"base_margin", DataType::kFloat32, &base_margin_);
LoadVectorField(fi, u8"labels_lower_bound", DataType::kFloat32, &labels_lower_bound_);
LoadVectorField(fi, u8"labels_upper_bound", DataType::kFloat32, &labels_upper_bound_);
LoadVectorField(fi, u8"feature_names", DataType::kStr, &feature_names);
LoadVectorField(fi, u8"feature_types", DataType::kStr, &feature_type_names);
LoadFeatureType(feature_type_names, &feature_types.HostVector());
}
template <typename T>
@ -344,6 +382,76 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t
}
}
void MetaInfo::GetInfo(char const *key, bst_ulong *out_len, DataType dtype,
const void **out_dptr) const {
if (dtype == DataType::kFloat32) {
const std::vector<bst_float>* vec = nullptr;
if (!std::strcmp(key, "label")) {
vec = &this->labels_.HostVector();
} else if (!std::strcmp(key, "weight")) {
vec = &this->weights_.HostVector();
} else if (!std::strcmp(key, "base_margin")) {
vec = &this->base_margin_.HostVector();
} else if (!std::strcmp(key, "label_lower_bound")) {
vec = &this->labels_lower_bound_.HostVector();
} else if (!std::strcmp(key, "label_upper_bound")) {
vec = &this->labels_upper_bound_.HostVector();
} else {
LOG(FATAL) << "Unknown float field name: " << key;
}
*out_len = static_cast<xgboost::bst_ulong>(vec->size()); // NOLINT
*reinterpret_cast<float const**>(out_dptr) = dmlc::BeginPtr(*vec);
} else if (dtype == DataType::kUInt32) {
const std::vector<unsigned> *vec = nullptr;
if (!std::strcmp(key, "group_ptr")) {
vec = &this->group_ptr_;
} else {
LOG(FATAL) << "Unknown uint32 field name: " << key;
}
*out_len = static_cast<xgboost::bst_ulong>(vec->size());
*reinterpret_cast<unsigned const**>(out_dptr) = dmlc::BeginPtr(*vec);
} else {
LOG(FATAL) << "Unknown data type for getting meta info.";
}
}
void MetaInfo::SetFeatureInfo(const char* key, const char **info, const bst_ulong size) {
if (size != 0) {
CHECK_EQ(size, this->num_col_)
<< "Length of " << key << " must be equal to number of columns.";
}
if (!std::strcmp(key, "feature_type")) {
feature_type_names.clear();
auto& h_feature_types = feature_types.HostVector();
for (size_t i = 0; i < size; ++i) {
auto elem = info[i];
feature_type_names.emplace_back(elem);
}
LoadFeatureType(feature_type_names, &h_feature_types);
} else if (!std::strcmp(key, "feature_name")) {
feature_names.clear();
for (size_t i = 0; i < size; ++i) {
feature_names.emplace_back(info[i]);
}
} else {
LOG(FATAL) << "Unknown feature info name: " << key;
}
}
void MetaInfo::GetFeatureInfo(const char *field,
std::vector<std::string> *out_str_vecs) const {
auto &str_vecs = *out_str_vecs;
if (!std::strcmp(field, "feature_type")) {
str_vecs.resize(feature_type_names.size());
std::copy(feature_type_names.cbegin(), feature_type_names.cend(), str_vecs.begin());
} else if (!strcmp(field, "feature_name")) {
str_vecs.resize(feature_names.size());
std::copy(feature_names.begin(), feature_names.end(), str_vecs.begin());
} else {
LOG(FATAL) << "Unknown feature info: " << field;
}
}
void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows) {
if (accumulate_rows) {
this->num_row_ += that.num_row_;
@ -441,6 +549,20 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
}
#endif // !defined(XGBOOST_USE_CUDA)
using DMatrixThreadLocal =
dmlc::ThreadLocalStore<std::map<DMatrix const *, XGBAPIThreadLocalEntry>>;
XGBAPIThreadLocalEntry& DMatrix::GetThreadLocal() const {
return (*DMatrixThreadLocal::Get())[this];
}
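// Drop this matrix's entry on destruction so the thread-local map does not
// accumulate stale entries or serve them to a new DMatrix that happens to
// reuse the same address.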
DMatrix::~DMatrix() {
auto local_map = DMatrixThreadLocal::Get();
if (local_map->find(this) != local_map->cend()) {
local_map->erase(this);
}
}
DMatrix* DMatrix::Load(const std::string& uri,
bool silent,
bool load_row_split,

View File

@ -202,7 +202,7 @@ void GenericParameter::ConfigureGpuId(bool require_gpu) {
#endif // defined(XGBOOST_USE_CUDA)
}
using LearnerAPIThreadLocalStore =
dmlc::ThreadLocalStore<std::map<Learner const *, XGBAPIThreadLocalEntry>>;
class LearnerConfiguration : public Learner {
@ -895,7 +895,7 @@ class LearnerImpl : public LearnerIO {
explicit LearnerImpl(std::vector<std::shared_ptr<DMatrix> > cache)
: LearnerIO{cache} {}
~LearnerImpl() override {
auto local_map = LearnerAPIThreadLocalStore::Get();
if (local_map->find(this) != local_map->cend()) {
local_map->erase(this);
}
@ -1023,7 +1023,7 @@ class LearnerImpl : public LearnerIO {
}
XGBAPIThreadLocalEntry& GetThreadLocal() const override {
return (*LearnerAPIThreadLocalStore::Get())[this];
}
void InplacePredict(dmlc::any const &x, std::string const &type,

View File

@ -10,7 +10,6 @@
#include "../helpers.h"
#include "../../../src/common/io.h"
TEST(CAPI, XGDMatrixCreateFromMatDT) {
std::vector<int> col0 = {0, -1, 3};
std::vector<float> col1 = {-4.0f, 2.0f, 0.0f};
@ -148,4 +147,48 @@ TEST(CAPI, CatchDMLCError) {
EXPECT_THROW({ dmlc::Stream::Create("foo", "r"); }, dmlc::Error);
}
TEST(CAPI, DMatrixSetFeatureName) {
size_t constexpr kRows = 10;
bst_feature_t constexpr kCols = 2;
DMatrixHandle handle;
std::vector<float> data(kCols * kRows, 1.5);
XGDMatrixCreateFromMat_omp(data.data(), kRows, kCols,
std::numeric_limits<float>::quiet_NaN(), &handle,
0);
std::vector<std::string> feature_names;
for (bst_feature_t i = 0; i < kCols; ++i) {
feature_names.emplace_back(std::to_string(i));
}
std::vector<char const*> c_feature_names;
c_feature_names.resize(feature_names.size());
std::transform(feature_names.cbegin(), feature_names.cend(),
c_feature_names.begin(),
[](auto const &str) { return str.c_str(); });
XGDMatrixSetStrFeatureInfo(handle, u8"feature_name", c_feature_names.data(),
c_feature_names.size());
bst_ulong out_len = 0;
char const **c_out_features;
XGDMatrixGetStrFeatureInfo(handle, u8"feature_name", &out_len,
&c_out_features);
CHECK_EQ(out_len, kCols);
std::vector<std::string> out_features;
for (bst_ulong i = 0; i < out_len; ++i) {
ASSERT_EQ(std::to_string(i), c_out_features[i]);
}
char const* feat_types [] {"i", "q"};
static_assert(sizeof(feat_types) / sizeof(feat_types[0]) == kCols, "");
XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, kCols);
char const **c_out_types;
XGDMatrixGetStrFeatureInfo(handle, u8"feature_type", &out_len,
&c_out_types);
for (bst_ulong i = 0; i < out_len; ++i) {
ASSERT_STREQ(feat_types[i], c_out_types[i]);
}
XGDMatrixFree(handle);
}
} // namespace xgboost

View File

@ -39,6 +39,36 @@ TEST(MetaInfo, GetSet) {
ASSERT_EQ(info.group_ptr_.size(), 0);
}
TEST(MetaInfo, GetSetFeature) {
xgboost::MetaInfo info;
EXPECT_THROW(info.SetFeatureInfo("", nullptr, 0), dmlc::Error);
EXPECT_THROW(info.SetFeatureInfo("foo", nullptr, 0), dmlc::Error);
EXPECT_NO_THROW(info.SetFeatureInfo("feature_name", nullptr, 0));
EXPECT_NO_THROW(info.SetFeatureInfo("feature_type", nullptr, 0));
ASSERT_EQ(info.feature_type_names.size(), 0);
ASSERT_EQ(info.feature_types.Size(), 0);
ASSERT_EQ(info.feature_names.size(), 0);
size_t constexpr kCols = 19;
std::vector<std::string> types(kCols, u8"float");
std::vector<char const*> c_types(kCols);
std::transform(types.cbegin(), types.cend(), c_types.begin(),
[](auto const &str) { return str.c_str(); });
// Info still has 0 columns, so setting feature info must fail.
EXPECT_THROW(
info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()),
dmlc::Error);
info.num_col_ = kCols;
EXPECT_NO_THROW(
info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()));
// Test clear.
info.SetFeatureInfo("feature_type", nullptr, 0);
ASSERT_EQ(info.feature_type_names.size(), 0);
ASSERT_EQ(info.feature_types.Size(), 0);
// Other conditions are tested in `SaveLoadBinary`.
}
TEST(MetaInfo, SaveLoadBinary) {
xgboost::MetaInfo info;
uint64_t constexpr kRows { 64 }, kCols { 32 };
@ -51,9 +81,22 @@ TEST(MetaInfo, SaveLoadBinary) {
info.SetInfo("label", values.data(), xgboost::DataType::kFloat32, kRows);
info.SetInfo("weight", values.data(), xgboost::DataType::kFloat32, kRows);
info.SetInfo("base_margin", values.data(), xgboost::DataType::kFloat32, kRows);
info.num_row_ = kRows;
info.num_col_ = kCols;
auto featname = u8"特征名";
std::vector<std::string> types(kCols, u8"float");
std::vector<char const*> c_types(kCols);
std::transform(types.cbegin(), types.cend(), c_types.begin(),
[](auto const &str) { return str.c_str(); });
info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size());
std::vector<std::string> names(kCols, featname);
std::vector<char const*> c_names(kCols);
std::transform(names.cbegin(), names.cend(), c_names.begin(),
[](auto const &str) { return str.c_str(); });
info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size());;
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/metainfo.binary";
{
@ -80,6 +123,23 @@ TEST(MetaInfo, SaveLoadBinary) {
EXPECT_EQ(inforead.group_ptr_, info.group_ptr_);
EXPECT_EQ(inforead.weights_.HostVector(), info.weights_.HostVector());
EXPECT_EQ(inforead.base_margin_.HostVector(), info.base_margin_.HostVector());
EXPECT_EQ(inforead.feature_type_names.size(), kCols);
EXPECT_EQ(inforead.feature_types.Size(), kCols);
EXPECT_TRUE(std::all_of(inforead.feature_type_names.cbegin(),
inforead.feature_type_names.cend(),
[](auto const &str) { return str == u8"float"; }));
auto h_ft = inforead.feature_types.HostSpan();
EXPECT_TRUE(std::all_of(h_ft.cbegin(), h_ft.cend(), [](auto f) {
return f == xgboost::FeatureType::kNumerical;
}));
EXPECT_EQ(inforead.feature_names.size(), kCols);
EXPECT_TRUE(std::all_of(inforead.feature_names.cbegin(),
inforead.feature_names.cend(),
[=](auto const& str) {
return str == featname;
}));
}
}

View File

@ -115,6 +115,7 @@ class TestDMatrix(unittest.TestCase):
dm.feature_names = list('abcde')
assert dm.feature_names == list('abcde')
assert dm.slice([0, 1]).num_col() == dm.num_col()
assert dm.slice([0, 1]).feature_names == dm.feature_names
dm.feature_types = 'q'