Move feature names and types of DMatrix from Python to C++. (#5858)
* Add thread local return entry for DMatrix. * Save feature name and feature type in binary file. Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
@@ -415,6 +415,74 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle,
|
||||
const unsigned *array,
|
||||
bst_ulong len);
|
||||
|
||||
/*!
|
||||
* \brief Set string encoded information of all features.
|
||||
*
|
||||
* Accepted fields are:
|
||||
* - feature_name
|
||||
* - feature_type
|
||||
*
|
||||
* \param handle An instance of data matrix
|
||||
* \param field Field name
|
||||
* \param features Pointer to array of strings.
|
||||
* \param size Size of `features` pointer (number of strings passed in).
|
||||
*
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*
|
||||
* \code
|
||||
*
|
||||
* char const* feat_names [] {"feat_0", "feat_1"};
|
||||
* XGDMatrixSetStrFeatureInfo(handle, "feature_name", feat_names, 2);
|
||||
*
|
||||
* // i for integer, q for quantitative. Similarly "int" and "float" are also recognized.
|
||||
* char const* feat_types [] {"i", "q"};
|
||||
* XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, 2);
|
||||
*
|
||||
* \endcode
|
||||
*/
|
||||
XGB_DLL int XGDMatrixSetStrFeatureInfo(DMatrixHandle handle, const char *field,
|
||||
const char **features,
|
||||
const bst_ulong size);
|
||||
|
||||
/*!
|
||||
* \brief Get string encoded information of all features.
|
||||
*
|
||||
* Accepted fields are:
|
||||
* - feature_name
|
||||
* - feature_type
|
||||
*
|
||||
* Caller is responsible for copying out the data, before next call to any API function of
|
||||
* XGBoost.
|
||||
*
|
||||
* \param handle An instance of data matrix
|
||||
* \param field Field name
|
||||
* \param size Size of output pointer `features` (number of strings returned).
|
||||
* \param out_features Address of a pointer to array of strings. Result is stored in
|
||||
* thread local memory.
|
||||
*
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*
|
||||
* \code
|
||||
*
|
||||
* char const **c_out_features = NULL;
|
||||
* bst_ulong out_size = 0;
|
||||
*
|
||||
* // Assuming the feature names are already set by `XGDMatrixSetStrFeatureInfo`.
|
||||
* XGDMatrixGetStrFeatureInfo(handle, "feature_name", &out_size,
|
||||
* &c_out_features);
|
||||
*
|
||||
* for (bst_ulong i = 0; i < out_size; ++i) {
|
||||
* // Here we are simply printing the string. Copy it out if the feature name is
|
||||
* // useful after printing.
|
||||
* printf("feature %lu: %s\n", i, c_out_features[i]);
|
||||
* }
|
||||
*
|
||||
* \endcode
|
||||
*/
|
||||
XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
|
||||
bst_ulong *size,
|
||||
const char ***out_features);
|
||||
|
||||
/*!
|
||||
* \brief (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix
|
||||
* \param handle an instance of data matrix
|
||||
@@ -575,8 +643,9 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
|
||||
*
|
||||
* - Functions with the term "Model" handle saving/loading XGBoost model like trees or
|
||||
* linear weights. Stripping out parameters configuration like training algorithms or
|
||||
* CUDA device ID helps user to reuse the trained model for different tasks, examples
|
||||
* are prediction, training continuation or interpretation.
|
||||
* CUDA device ID. These functions are designed to let users reuse the trained model
|
||||
* for different tasks, examples are prediction, training continuation or model
|
||||
* interpretation.
|
||||
*
|
||||
* - Functions with the term "Config" handle saving/loading configuration. It helps user
|
||||
* to study the internal of XGBoost. Also user can use the load method for specifying
|
||||
@@ -592,7 +661,7 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
|
||||
/*!
|
||||
* \brief Load model from existing file
|
||||
* \param handle handle
|
||||
* \param fname file name
|
||||
* \param fname File URI or file name.
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
XGB_DLL int XGBoosterLoadModel(BoosterHandle handle,
|
||||
@@ -600,7 +669,7 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle,
|
||||
/*!
|
||||
* \brief Save model into existing file
|
||||
* \param handle handle
|
||||
* \param fname file name
|
||||
* \param fname File URI or file name.
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
XGB_DLL int XGBoosterSaveModel(BoosterHandle handle,
|
||||
|
||||
@@ -31,7 +31,12 @@ enum class DataType : uint8_t {
|
||||
kFloat32 = 1,
|
||||
kDouble = 2,
|
||||
kUInt32 = 3,
|
||||
kUInt64 = 4
|
||||
kUInt64 = 4,
|
||||
kStr = 5
|
||||
};
|
||||
|
||||
enum class FeatureType : uint8_t {
|
||||
kNumerical
|
||||
};
|
||||
|
||||
/*!
|
||||
@@ -40,7 +45,7 @@ enum class DataType : uint8_t {
|
||||
class MetaInfo {
|
||||
public:
|
||||
/*! \brief number of data fields in MetaInfo */
|
||||
static constexpr uint64_t kNumField = 9;
|
||||
static constexpr uint64_t kNumField = 11;
|
||||
|
||||
/*! \brief number of rows in the data */
|
||||
uint64_t num_row_{0}; // NOLINT
|
||||
@@ -72,6 +77,19 @@ class MetaInfo {
|
||||
*/
|
||||
HostDeviceVector<bst_float> labels_upper_bound_; // NOLINT
|
||||
|
||||
/*!
|
||||
* \brief Name of type for each feature provided by users. Eg. "int"/"float"/"i"/"q"
|
||||
*/
|
||||
std::vector<std::string> feature_type_names;
|
||||
/*!
|
||||
* \brief Name for each feature.
|
||||
*/
|
||||
std::vector<std::string> feature_names;
|
||||
/*!
|
||||
* \brief Type of each feature. Automatically set when feature_type_names is specified.
|
||||
*/
|
||||
HostDeviceVector<FeatureType> feature_types;
|
||||
|
||||
/*! \brief default constructor */
|
||||
MetaInfo() = default;
|
||||
MetaInfo(MetaInfo&& that) = default;
|
||||
@@ -158,6 +176,12 @@ class MetaInfo {
|
||||
*/
|
||||
void SetInfo(const char* key, std::string const& interface_str);
|
||||
|
||||
void GetInfo(char const* key, bst_ulong* out_len, DataType dtype,
|
||||
const void** out_dptr) const;
|
||||
|
||||
void SetFeatureInfo(const char *key, const char **info, const bst_ulong size);
|
||||
void GetFeatureInfo(const char *field, std::vector<std::string>* out_str_vecs) const;
|
||||
|
||||
/*
|
||||
* \brief Extend with other MetaInfo.
|
||||
*
|
||||
@@ -432,6 +456,8 @@ class BatchSet {
|
||||
BatchIterator<T> begin_iter_;
|
||||
};
|
||||
|
||||
struct XGBAPIThreadLocalEntry;
|
||||
|
||||
/*!
|
||||
* \brief Internal data structure used by XGBoost during training.
|
||||
*/
|
||||
@@ -450,6 +476,10 @@ class DMatrix {
|
||||
}
|
||||
/*! \brief meta information of the dataset */
|
||||
virtual const MetaInfo& Info() const = 0;
|
||||
|
||||
/*! \brief Get thread local memory for returning data from DMatrix. */
|
||||
XGBAPIThreadLocalEntry& GetThreadLocal() const;
|
||||
|
||||
/**
|
||||
* \brief Gets batches. Use range based for loop over BatchSet to access individual batches.
|
||||
*/
|
||||
@@ -462,7 +492,7 @@ class DMatrix {
|
||||
/*! \return Whether the data columns form a single column block. */
|
||||
virtual bool SingleColBlock() const = 0;
|
||||
/*! \brief virtual destructor */
|
||||
virtual ~DMatrix() = default;
|
||||
virtual ~DMatrix();
|
||||
|
||||
/*! \brief Whether the matrix is dense. */
|
||||
bool IsDense() const {
|
||||
|
||||
Reference in New Issue
Block a user