Move feature names and types of DMatrix from Python to C++. (#5858)

* Add thread local return entry for DMatrix.
* Save feature name and feature type in binary file.

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Jiaming Yuan
2020-07-07 09:40:13 +08:00
committed by GitHub
parent 4b0852ee41
commit 93c44a9a64
12 changed files with 451 additions and 84 deletions

View File

@@ -415,6 +415,74 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle,
const unsigned *array,
bst_ulong len);
/*!
* \brief Set string encoded information of all features.
*
* Accepted fields are:
* - feature_name
* - feature_type
*
* \param handle An instance of data matrix
* \param field Field name, one of the accepted fields listed above.
* \param features Pointer to array of strings.
* \param size Size of `features` pointer (number of strings passed in).
*
* \return 0 when success, -1 when failure happens
*
* \code
*
* char const* feat_names [] {"feat_0", "feat_1"};
* XGDMatrixSetStrFeatureInfo(handle, "feature_name", feat_names, 2);
*
* // i for integer, q for quantitative. Similarly "int" and "float" are also recognized.
* char const* feat_types [] {"i", "q"};
* XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, 2);
*
* \endcode
*/
XGB_DLL int XGDMatrixSetStrFeatureInfo(DMatrixHandle handle, const char *field,
const char **features,
const bst_ulong size);
/*!
* \brief Get string encoded information of all features.
*
* Accepted fields are:
* - feature_name
* - feature_type
*
* Caller is responsible for copying out the data before the next call to any API
* function of XGBoost.
*
* \param handle An instance of data matrix
* \param field Field name, one of the accepted fields listed above.
* \param size Size of output pointer `features` (number of strings returned).
* \param out_features Address of a pointer to array of strings. Result is stored in
* thread local memory.
*
* \return 0 when success, -1 when failure happens
*
* \code
*
* char const **c_out_features = NULL;
* bst_ulong out_size = 0;
*
* // Assuming the feature names are already set by `XGDMatrixSetStrFeatureInfo`.
* XGDMatrixGetStrFeatureInfo(handle, "feature_name", &out_size,
* &c_out_features);
*
* for (bst_ulong i = 0; i < out_size; ++i) {
* // Here we are simply printing the string. Copy it out if the feature name is
* // useful after printing.
* printf("feature %lu: %s\n", i, c_out_features[i]);
* }
*
* \endcode
*/
XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
bst_ulong *size,
const char ***out_features);
/*!
* \brief (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix
* \param handle an instance of data matrix
@@ -575,8 +643,9 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
*
* - Functions with the term "Model" handles saving/loading XGBoost model like trees or
* linear weights. Stripping out parameters configuration like training algorithms or
* CUDA device ID helps user to reuse the trained model for different tasks, examples
* are prediction, training continuation or interpretation.
* CUDA device ID. These functions are designed to let users reuse the trained model
* for different tasks, examples are prediction, training continuation or model
* interpretation.
*
* - Functions with the term "Config" handles save/loading configuration. It helps user
* to study the internal of XGBoost. Also user can use the load method for specifying
@@ -592,7 +661,7 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
/*!
* \brief Load model from existing file
* \param handle handle
* \param fname file name
* \param fname File URI or file name.
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterLoadModel(BoosterHandle handle,
@@ -600,7 +669,7 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle,
/*!
* \brief Save model into existing file
* \param handle handle
* \param fname file name
* \param fname File URI or file name.
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterSaveModel(BoosterHandle handle,

View File

@@ -31,7 +31,12 @@ enum class DataType : uint8_t {
kFloat32 = 1,
kDouble = 2,
kUInt32 = 3,
kUInt64 = 4
kUInt64 = 4,
kStr = 5
};
/*! \brief Type tag of a feature; numerical is the only enumerator declared here. */
enum class FeatureType : uint8_t { kNumerical = 0 };
/*!
@@ -40,7 +45,7 @@ enum class DataType : uint8_t {
class MetaInfo {
public:
/*! \brief number of data fields in MetaInfo */
static constexpr uint64_t kNumField = 9;
static constexpr uint64_t kNumField = 11;
/*! \brief number of rows in the data */
uint64_t num_row_{0}; // NOLINT
@@ -72,6 +77,19 @@ class MetaInfo {
*/
HostDeviceVector<bst_float> labels_upper_bound_; // NOLINT
/*!
* \brief Name of type for each feature provided by users. Eg. "int"/"float"/"i"/"q"
*/
std::vector<std::string> feature_type_names;
/*!
* \brief Name for each feature.
*/
std::vector<std::string> feature_names;
/*
* \brief Type of each feature. Automatically set when feature_type_names is specified.
*/
HostDeviceVector<FeatureType> feature_types;
/*! \brief default constructor */
MetaInfo() = default;
MetaInfo(MetaInfo&& that) = default;
@@ -158,6 +176,12 @@ class MetaInfo {
*/
void SetInfo(const char* key, std::string const& interface_str);
void GetInfo(char const* key, bst_ulong* out_len, DataType dtype,
const void** out_dptr) const;
void SetFeatureInfo(const char *key, const char **info, const bst_ulong size);
void GetFeatureInfo(const char *field, std::vector<std::string>* out_str_vecs) const;
/*
* \brief Extend with other MetaInfo.
*
@@ -432,6 +456,8 @@ class BatchSet {
BatchIterator<T> begin_iter_;
};
struct XGBAPIThreadLocalEntry;
/*!
* \brief Internal data structure used by XGBoost during training.
*/
@@ -450,6 +476,10 @@ class DMatrix {
}
/*! \brief meta information of the dataset */
virtual const MetaInfo& Info() const = 0;
/*! \brief Get thread local memory for returning data from DMatrix. */
XGBAPIThreadLocalEntry& GetThreadLocal() const;
/**
* \brief Gets batches. Use range based for loop over BatchSet to access individual batches.
*/
@@ -462,7 +492,7 @@ class DMatrix {
/*! \return Whether the data columns single column block. */
virtual bool SingleColBlock() const = 0;
/*! \brief virtual destructor */
virtual ~DMatrix() = default;
virtual ~DMatrix();
/*! \brief Whether the matrix is dense. */
bool IsDense() const {