Move feature names and types of DMatrix from Python to C++. (#5858)
* Add thread local return entry for DMatrix. * Save feature name and feature type in binary file. Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
@@ -283,6 +283,38 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle,
|
||||
API_END();
|
||||
}
|
||||
|
||||
// C API entry point: stores string-valued per-feature meta info on a DMatrix.
//
// handle  Opaque handle; it is cast to std::shared_ptr<DMatrix>*.
// field   Name of the feature info, forwarded unchanged to
//         MetaInfo::SetFeatureInfo.
// c_info  Array of `size` C strings, forwarded unchanged.
// size    Number of entries in `c_info`.
//
// The return value is produced by the API_BEGIN/API_END macros, which are
// defined outside this view.
XGB_DLL int XGDMatrixSetStrFeatureInfo(DMatrixHandle handle, const char *field,
                                       const char **c_info,
                                       const xgboost::bst_ulong size) {
  API_BEGIN();
  CHECK_HANDLE();
  auto *p_fmat = static_cast<std::shared_ptr<DMatrix> *>(handle);
  (*p_fmat)->Info().SetFeatureInfo(field, c_info, size);
  API_END();
}
|
||||
|
||||
// C API entry point: fetches string-valued per-feature meta info from a
// DMatrix as an array of C strings.
//
// handle        Opaque handle; it is cast to std::shared_ptr<DMatrix>*.
// field         Name of the feature info, forwarded to
//               MetaInfo::GetFeatureInfo.
// len           Receives the number of returned strings.
// out_features  Receives a pointer to the array of C strings.
//
// Both the strings and the pointer array live in the entry returned by
// DMatrix::GetThreadLocal(), so the returned pointers refer to that storage
// and are overwritten by the next call made through the same entry.
XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
                                       xgboost::bst_ulong *len,
                                       const char ***out_features) {
  API_BEGIN();
  CHECK_HANDLE();
  // Hold our own reference to the matrix while the buffers are filled.
  std::shared_ptr<DMatrix> p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);

  auto &local_entry = p_fmat->GetThreadLocal();
  std::vector<std::string> &names = local_entry.ret_vec_str;
  std::vector<const char *> &name_ptrs = local_entry.ret_vec_charp;

  p_fmat->Info().GetFeatureInfo(field, &names);

  // Rebuild the pointer array so that entry i points at names[i].
  name_ptrs.clear();
  name_ptrs.reserve(names.size());
  for (std::string const &name : names) {
    name_ptrs.push_back(name.c_str());
  }

  *out_features = dmlc::BeginPtr(name_ptrs);
  *len = static_cast<xgboost::bst_ulong>(name_ptrs.size());
  API_END();
}
|
||||
|
||||
XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle,
|
||||
const unsigned* group,
|
||||
xgboost::bst_ulong len) {
|
||||
@@ -301,22 +333,7 @@ XGB_DLL int XGDMatrixGetFloatInfo(const DMatrixHandle handle,
|
||||
API_BEGIN();
|
||||
CHECK_HANDLE();
|
||||
const MetaInfo& info = static_cast<std::shared_ptr<DMatrix>*>(handle)->get()->Info();
|
||||
const std::vector<bst_float>* vec = nullptr;
|
||||
if (!std::strcmp(field, "label")) {
|
||||
vec = &info.labels_.HostVector();
|
||||
} else if (!std::strcmp(field, "weight")) {
|
||||
vec = &info.weights_.HostVector();
|
||||
} else if (!std::strcmp(field, "base_margin")) {
|
||||
vec = &info.base_margin_.HostVector();
|
||||
} else if (!std::strcmp(field, "label_lower_bound")) {
|
||||
vec = &info.labels_lower_bound_.HostVector();
|
||||
} else if (!std::strcmp(field, "label_upper_bound")) {
|
||||
vec = &info.labels_upper_bound_.HostVector();
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown float field name " << field;
|
||||
}
|
||||
*out_len = static_cast<xgboost::bst_ulong>(vec->size()); // NOLINT
|
||||
*out_dptr = dmlc::BeginPtr(*vec);
|
||||
info.GetInfo(field, out_len, DataType::kFloat32, reinterpret_cast<void const**>(out_dptr));
|
||||
API_END();
|
||||
}
|
||||
|
||||
@@ -327,14 +344,7 @@ XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
|
||||
API_BEGIN();
|
||||
CHECK_HANDLE();
|
||||
const MetaInfo& info = static_cast<std::shared_ptr<DMatrix>*>(handle)->get()->Info();
|
||||
const std::vector<unsigned>* vec = nullptr;
|
||||
if (!std::strcmp(field, "group_ptr")) {
|
||||
vec = &info.group_ptr_;
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown uint field name " << field;
|
||||
}
|
||||
*out_len = static_cast<xgboost::bst_ulong>(vec->size());
|
||||
*out_dptr = dmlc::BeginPtr(*vec);
|
||||
info.GetInfo(field, out_len, DataType::kUInt32, reinterpret_cast<void const**>(out_dptr));
|
||||
API_END();
|
||||
}
|
||||
|
||||
|
||||
@@ -171,6 +171,8 @@ void HostDeviceVector<T>::SetDevice(int device) const {}
|
||||
template class HostDeviceVector<bst_float>;
|
||||
template class HostDeviceVector<GradientPair>;
|
||||
template class HostDeviceVector<int32_t>; // bst_node_t
|
||||
template class HostDeviceVector<uint8_t>;
|
||||
template class HostDeviceVector<FeatureType>;
|
||||
template class HostDeviceVector<Entry>;
|
||||
template class HostDeviceVector<uint64_t>; // bst_row_t
|
||||
template class HostDeviceVector<uint32_t>; // bst_feature_t
|
||||
|
||||
@@ -398,6 +398,7 @@ template class HostDeviceVector<bst_float>;
|
||||
template class HostDeviceVector<GradientPair>;
|
||||
template class HostDeviceVector<int32_t>; // bst_node_t
|
||||
template class HostDeviceVector<uint8_t>;
|
||||
template class HostDeviceVector<FeatureType>;
|
||||
template class HostDeviceVector<Entry>;
|
||||
template class HostDeviceVector<uint64_t>; // bst_row_t
|
||||
template class HostDeviceVector<uint32_t>; // bst_feature_t
|
||||
|
||||
134
src/data/data.cc
134
src/data/data.cc
@@ -11,6 +11,7 @@
|
||||
#include "xgboost/host_device_vector.h"
|
||||
#include "xgboost/logging.h"
|
||||
#include "xgboost/version_config.h"
|
||||
#include "xgboost/learner.h"
|
||||
#include "sparse_page_writer.h"
|
||||
#include "simple_dmatrix.h"
|
||||
|
||||
@@ -148,8 +149,10 @@ void MetaInfo::Clear() {
|
||||
* | group_ptr | kUInt32 | False | ${size} | 1 | ${group_ptr_} |
|
||||
* | weights | kFloat32 | False | ${size} | 1 | ${weights_} |
|
||||
* | base_margin | kFloat32 | False | ${size} | 1 | ${base_margin_} |
|
||||
* | labels_lower_bound | kFloat32 | False | ${size} | 1 | ${labels_lower_bound__} |
|
||||
* | labels_upper_bound | kFloat32 | False | ${size} | 1 | ${labels_upper_bound__} |
|
||||
* | labels_lower_bound | kFloat32 | False | ${size} | 1 | ${labels_lower_bound_} |
|
||||
* | labels_upper_bound | kFloat32 | False | ${size} | 1 | ${labels_upper_bound_} |
|
||||
* | feature_names | kStr | False | ${size} | 1 | ${feature_names} |
|
||||
* | feature_types | kStr | False | ${size} | 1 | ${feature_types} |
|
||||
*
|
||||
* Note that the scalar fields (is_scalar=True) will have num_row and num_col missing.
|
||||
* Also notice the difference between the saved name and the name used in `SetInfo':
|
||||
@@ -177,9 +180,31 @@ void MetaInfo::SaveBinary(dmlc::Stream *fo) const {
|
||||
SaveVectorField(fo, u8"labels_upper_bound", DataType::kFloat32,
|
||||
{labels_upper_bound_.Size(), 1}, labels_upper_bound_); ++field_cnt;
|
||||
|
||||
SaveVectorField(fo, u8"feature_names", DataType::kStr,
|
||||
{feature_names.size(), 1}, feature_names); ++field_cnt;
|
||||
SaveVectorField(fo, u8"feature_types", DataType::kStr,
|
||||
{feature_type_names.size(), 1}, feature_type_names); ++field_cnt;
|
||||
|
||||
CHECK_EQ(field_cnt, kNumField) << "Wrong number of fields";
|
||||
}
|
||||
|
||||
void LoadFeatureType(std::vector<std::string>const& type_names, std::vector<FeatureType>* types) {
|
||||
types->clear();
|
||||
for (auto const &elem : type_names) {
|
||||
if (elem == "int") {
|
||||
types->emplace_back(FeatureType::kNumerical);
|
||||
} else if (elem == "float") {
|
||||
types->emplace_back(FeatureType::kNumerical);
|
||||
} else if (elem == "i") {
|
||||
types->emplace_back(FeatureType::kNumerical);
|
||||
} else if (elem == "q") {
|
||||
types->emplace_back(FeatureType::kNumerical);
|
||||
} else {
|
||||
LOG(FATAL) << "All feature_types must be {int, float, i, q}";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MetaInfo::LoadBinary(dmlc::Stream *fi) {
|
||||
auto version = Version::Load(fi);
|
||||
auto major = std::get<0>(version);
|
||||
@@ -193,11 +218,20 @@ void MetaInfo::LoadBinary(dmlc::Stream *fi) {
|
||||
const uint64_t expected_num_field = kNumField;
|
||||
uint64_t num_field { 0 };
|
||||
CHECK(fi->Read(&num_field)) << "MetaInfo: invalid format";
|
||||
CHECK_GE(num_field, expected_num_field)
|
||||
<< "MetaInfo: insufficient number of fields (expected at least " << expected_num_field
|
||||
<< " fields, but the binary file only contains " << num_field << "fields.)";
|
||||
size_t expected = 0;
|
||||
if (major == 1 && std::get<1>(version) < 2) {
|
||||
// feature names and types are added in 1.2
|
||||
expected = expected_num_field - 2;
|
||||
} else {
|
||||
expected = expected_num_field;
|
||||
}
|
||||
CHECK_GE(num_field, expected)
|
||||
<< "MetaInfo: insufficient number of fields (expected at least "
|
||||
<< expected << " fields, but the binary file only contains " << num_field
|
||||
<< "fields.)";
|
||||
if (num_field > expected_num_field) {
|
||||
LOG(WARNING) << "MetaInfo: the given binary file contains extra fields which will be ignored.";
|
||||
LOG(WARNING) << "MetaInfo: the given binary file contains extra fields "
|
||||
"which will be ignored.";
|
||||
}
|
||||
|
||||
LoadScalarField(fi, u8"num_row", DataType::kUInt64, &num_row_);
|
||||
@@ -209,6 +243,10 @@ void MetaInfo::LoadBinary(dmlc::Stream *fi) {
|
||||
LoadVectorField(fi, u8"base_margin", DataType::kFloat32, &base_margin_);
|
||||
LoadVectorField(fi, u8"labels_lower_bound", DataType::kFloat32, &labels_lower_bound_);
|
||||
LoadVectorField(fi, u8"labels_upper_bound", DataType::kFloat32, &labels_upper_bound_);
|
||||
|
||||
LoadVectorField(fi, u8"feature_names", DataType::kStr, &feature_names);
|
||||
LoadVectorField(fi, u8"feature_types", DataType::kStr, &feature_type_names);
|
||||
LoadFeatureType(feature_type_names, &feature_types.HostVector());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -344,6 +382,76 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t
|
||||
}
|
||||
}
|
||||
|
||||
void MetaInfo::GetInfo(char const *key, bst_ulong *out_len, DataType dtype,
|
||||
const void **out_dptr) const {
|
||||
if (dtype == DataType::kFloat32) {
|
||||
const std::vector<bst_float>* vec = nullptr;
|
||||
if (!std::strcmp(key, "label")) {
|
||||
vec = &this->labels_.HostVector();
|
||||
} else if (!std::strcmp(key, "weight")) {
|
||||
vec = &this->weights_.HostVector();
|
||||
} else if (!std::strcmp(key, "base_margin")) {
|
||||
vec = &this->base_margin_.HostVector();
|
||||
} else if (!std::strcmp(key, "label_lower_bound")) {
|
||||
vec = &this->labels_lower_bound_.HostVector();
|
||||
} else if (!std::strcmp(key, "label_upper_bound")) {
|
||||
vec = &this->labels_upper_bound_.HostVector();
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown float field name: " << key;
|
||||
}
|
||||
*out_len = static_cast<xgboost::bst_ulong>(vec->size()); // NOLINT
|
||||
*reinterpret_cast<float const**>(out_dptr) = dmlc::BeginPtr(*vec);
|
||||
} else if (dtype == DataType::kUInt32) {
|
||||
const std::vector<unsigned> *vec = nullptr;
|
||||
if (!std::strcmp(key, "group_ptr")) {
|
||||
vec = &this->group_ptr_;
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown uint32 field name: " << key;
|
||||
}
|
||||
*out_len = static_cast<xgboost::bst_ulong>(vec->size());
|
||||
*reinterpret_cast<unsigned const**>(out_dptr) = dmlc::BeginPtr(*vec);
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown data type for getting meta info.";
|
||||
}
|
||||
}
|
||||
|
||||
void MetaInfo::SetFeatureInfo(const char* key, const char **info, const bst_ulong size) {
|
||||
if (size != 0) {
|
||||
CHECK_EQ(size, this->num_col_)
|
||||
<< "Length of " << key << " must be equal to number of columns.";
|
||||
}
|
||||
if (!std::strcmp(key, "feature_type")) {
|
||||
feature_type_names.clear();
|
||||
auto& h_feature_types = feature_types.HostVector();
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
auto elem = info[i];
|
||||
feature_type_names.emplace_back(elem);
|
||||
}
|
||||
LoadFeatureType(feature_type_names, &h_feature_types);
|
||||
} else if (!std::strcmp(key, "feature_name")) {
|
||||
feature_names.clear();
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
feature_names.emplace_back(info[i]);
|
||||
}
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown feature info name: " << key;
|
||||
}
|
||||
}
|
||||
|
||||
void MetaInfo::GetFeatureInfo(const char *field,
|
||||
std::vector<std::string> *out_str_vecs) const {
|
||||
auto &str_vecs = *out_str_vecs;
|
||||
if (!std::strcmp(field, "feature_type")) {
|
||||
str_vecs.resize(feature_type_names.size());
|
||||
std::copy(feature_type_names.cbegin(), feature_type_names.cend(), str_vecs.begin());
|
||||
} else if (!strcmp(field, "feature_name")) {
|
||||
str_vecs.resize(feature_names.size());
|
||||
std::copy(feature_names.begin(), feature_names.end(), str_vecs.begin());
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown feature info: " << field;
|
||||
}
|
||||
}
|
||||
|
||||
void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows) {
|
||||
if (accumulate_rows) {
|
||||
this->num_row_ += that.num_row_;
|
||||
@@ -441,6 +549,20 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
|
||||
}
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
|
||||
using DMatrixThreadLocal =
|
||||
dmlc::ThreadLocalStore<std::map<DMatrix const *, XGBAPIThreadLocalEntry>>;
|
||||
|
||||
XGBAPIThreadLocalEntry& DMatrix::GetThreadLocal() const {
|
||||
return (*DMatrixThreadLocal::Get())[this];
|
||||
}
|
||||
|
||||
// Removes this matrix's entry, if any, from the map obtained from
// DMatrixThreadLocal::Get(), so the map does not keep an entry keyed by a
// destroyed matrix.
DMatrix::~DMatrix() {
  // Erasing by key does nothing when no entry exists, so no lookup is needed
  // beforehand.
  DMatrixThreadLocal::Get()->erase(this);
}
|
||||
|
||||
DMatrix* DMatrix::Load(const std::string& uri,
|
||||
bool silent,
|
||||
bool load_row_split,
|
||||
|
||||
@@ -202,7 +202,7 @@ void GenericParameter::ConfigureGpuId(bool require_gpu) {
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
}
|
||||
|
||||
using XGBAPIThreadLocalStore =
|
||||
using LearnerAPIThreadLocalStore =
|
||||
dmlc::ThreadLocalStore<std::map<Learner const *, XGBAPIThreadLocalEntry>>;
|
||||
|
||||
class LearnerConfiguration : public Learner {
|
||||
@@ -895,7 +895,7 @@ class LearnerImpl : public LearnerIO {
|
||||
explicit LearnerImpl(std::vector<std::shared_ptr<DMatrix> > cache)
|
||||
: LearnerIO{cache} {}
|
||||
~LearnerImpl() override {
|
||||
auto local_map = XGBAPIThreadLocalStore::Get();
|
||||
auto local_map = LearnerAPIThreadLocalStore::Get();
|
||||
if (local_map->find(this) != local_map->cend()) {
|
||||
local_map->erase(this);
|
||||
}
|
||||
@@ -1023,7 +1023,7 @@ class LearnerImpl : public LearnerIO {
|
||||
}
|
||||
|
||||
XGBAPIThreadLocalEntry& GetThreadLocal() const override {
|
||||
return (*XGBAPIThreadLocalStore::Get())[this];
|
||||
return (*LearnerAPIThreadLocalStore::Get())[this];
|
||||
}
|
||||
|
||||
void InplacePredict(dmlc::any const &x, std::string const &type,
|
||||
|
||||
Reference in New Issue
Block a user