[Breaking] Accept multi-dim meta info. (#7405)

This PR changes base_margin into a 3-dim array, with one of the dimensions reserved for multi-target classification. Also, a breaking change is made to binary serialization due to the extra dimension, along with a fix for saving the feature weights. Lastly, it unifies the prediction initialization between CPU and GPU. After this PR, the meta info setter in Python will be based on the array interface.
This commit is contained in:
Jiaming Yuan
2021-11-18 23:02:54 +08:00
committed by GitHub
parent 9fb4338964
commit d33854af1b
25 changed files with 545 additions and 256 deletions

View File

@@ -249,7 +249,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
char const* json_config,
DMatrixHandle *out);
/*
/**
* ========================== Begin data callback APIs =========================
*
* Short notes for data callback
@@ -258,9 +258,9 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
* used by JVM packages. It uses `XGBoostBatchCSR` to accept batches for CSR formatted
* input, and concatenate them into 1 final big CSR. The related functions are:
*
* - XGBCallbackSetData
* - XGBCallbackDataIterNext
* - XGDMatrixCreateFromDataIter
* - \ref XGBCallbackSetData
* - \ref XGBCallbackDataIterNext
* - \ref XGDMatrixCreateFromDataIter
*
* Another set is used by external data iterator. It accepts foreign data iterators as
* callbacks. There are 2 different scenarios where users might want to pass in callbacks
@@ -276,17 +276,17 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
* Related functions are:
*
* # Factory functions
* - `XGDMatrixCreateFromCallback` for external memory
* - `XGDeviceQuantileDMatrixCreateFromCallback` for quantile DMatrix
* - \ref XGDMatrixCreateFromCallback for external memory
* - \ref XGDeviceQuantileDMatrixCreateFromCallback for quantile DMatrix
*
* # Proxy that callers can use to pass data to XGBoost
* - XGProxyDMatrixCreate
* - XGDMatrixCallbackNext
* - DataIterResetCallback
* - XGProxyDMatrixSetDataCudaArrayInterface
* - XGProxyDMatrixSetDataCudaColumnar
* - XGProxyDMatrixSetDataDense
* - XGProxyDMatrixSetDataCSR
* - \ref XGProxyDMatrixCreate
* - \ref XGDMatrixCallbackNext
* - \ref DataIterResetCallback
* - \ref XGProxyDMatrixSetDataCudaArrayInterface
* - \ref XGProxyDMatrixSetDataCudaColumnar
* - \ref XGProxyDMatrixSetDataDense
* - \ref XGProxyDMatrixSetDataCSR
* - ... (data setters)
*/
@@ -411,7 +411,7 @@ XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLIN
* - cache_prefix: The path of cache file, caller must initialize all the directories in this path.
* - nthread (optional): Number of threads used for initializing DMatrix.
*
* \param out The created external memory DMatrix
* \param[out] out The created external memory DMatrix
*
* \return 0 when success, -1 when failure happens
*/
@@ -605,7 +605,8 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle,
* char const* feat_names [] {"feat_0", "feat_1"};
* XGDMatrixSetStrFeatureInfo(handle, "feature_name", feat_names, 2);
*
* // i for integer, q for quantitative. Similarly "int" and "float" are also recognized.
* // i for integer, q for quantitative, c for categorical. Similarly "int" and "float"
* // are also recognized.
* char const* feat_types [] {"i", "q"};
* XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, 2);
*

View File

@@ -47,7 +47,7 @@ enum class FeatureType : uint8_t {
class MetaInfo {
public:
/*! \brief number of data fields in MetaInfo */
static constexpr uint64_t kNumField = 11;
static constexpr uint64_t kNumField = 12;
/*! \brief number of rows in the data */
uint64_t num_row_{0}; // NOLINT
@@ -69,7 +69,7 @@ class MetaInfo {
* if specified, xgboost will start from this init margin
* can be used to specify initial prediction to boost from.
*/
HostDeviceVector<bst_float> base_margin_; // NOLINT
linalg::Tensor<float, 3> base_margin_; // NOLINT
/*!
* \brief lower bound of the label, to be used for survival analysis (censored regression)
*/
@@ -154,12 +154,8 @@ class MetaInfo {
* \brief Set information in the meta info with array interface.
* \param key The key of the information.
* \param interface_str String representation of json format array interface.
*
* [ column_0, column_1, ... column_n ]
*
* Right now only 1 column is permitted.
*/
void SetInfo(StringView key, std::string const& interface_str);
void SetInfo(StringView key, StringView interface_str);
void GetInfo(char const* key, bst_ulong* out_len, DataType dtype,
const void** out_dptr) const;
@@ -181,6 +177,9 @@ class MetaInfo {
void Extend(MetaInfo const& that, bool accumulate_rows, bool check_column);
private:
void SetInfoFromHost(StringView key, Json arr);
void SetInfoFromCUDA(StringView key, Json arr);
/*! \brief argsort of labels */
mutable std::vector<size_t> label_order_cache_;
};
@@ -479,7 +478,7 @@ class DMatrix {
this->Info().SetInfo(key, dptr, dtype, num);
}
virtual void SetInfo(const char* key, std::string const& interface_str) {
this->Info().SetInfo(key, interface_str);
this->Info().SetInfo(key, StringView{interface_str});
}
/*! \brief meta information of the dataset */
virtual const MetaInfo& Info() const = 0;

View File

@@ -19,7 +19,7 @@ namespace xgboost {
*/
class IntrusivePtrCell {
private:
std::atomic<int32_t> count_;
std::atomic<int32_t> count_ {0};
template <typename T> friend class IntrusivePtr;
std::int32_t IncRef() noexcept {
@@ -31,7 +31,7 @@ class IntrusivePtrCell {
bool IsZero() const { return Count() == 0; }
public:
IntrusivePtrCell() noexcept : count_{0} {}
IntrusivePtrCell() noexcept = default;
int32_t Count() const { return count_.load(std::memory_order_relaxed); }
};

View File

@@ -126,9 +126,8 @@ class Predictor {
* \param out_predt Prediction vector to be initialized.
* \param model Tree model used for prediction.
*/
virtual void InitOutPredictions(const MetaInfo &info,
HostDeviceVector<bst_float> *out_predt,
const gbm::GBTreeModel &model) const = 0;
void InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_float>* out_predt,
const gbm::GBTreeModel& model) const;
/**
* \brief Generate batch predictions for a given feature matrix. May use

View File

@@ -33,7 +33,7 @@ struct ObjInfo {
bool const_hess{false};
explicit ObjInfo(Task t) : task{t} {}
ObjInfo(Task t, bool khess) : const_hess{khess} {}
ObjInfo(Task t, bool khess) : task{t}, const_hess{khess} {}
};
} // namespace xgboost
#endif // XGBOOST_TASK_H_