[breaking] Change internal model serialization to UBJSON. (#7556)

* Use typed array for models.
* Change the memory snapshot format.
* Add new C API for saving to raw format.
This commit is contained in:
Jiaming Yuan
2022-01-16 02:11:53 +08:00
committed by GitHub
parent 13b0fa4b97
commit a1bcd33a3b
24 changed files with 566 additions and 255 deletions

View File

@@ -1081,14 +1081,32 @@ XGB_DLL int XGBoosterSaveModel(BoosterHandle handle,
XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
const void *buf,
bst_ulong len);
/*!
* \brief save model into binary raw bytes, return header of the array
* user must copy the result out, before next xgboost call
* \brief Save model into raw bytes and return a pointer to the head of the array. The
* user must copy the result out before the next XGBoost call.
*
* \param handle handle
* \param out_len the argument to hold the output length
* \param out_dptr the argument to hold the output data pointer
* \param json_config JSON encoded string storing parameters for the function. Following
* keys are expected in the JSON document:
*
* "format": str
* - json: Output booster will be encoded as JSON.
* - ubj: Output booster will be encoded as Universal Binary JSON.
* - deprecated: Output booster will be encoded as the old custom binary format. Do not
* use this format except for compatibility reasons.
*
* \param out_len The argument to hold the output length
* \param out_dptr The argument to hold the output data pointer
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *json_config,
bst_ulong *out_len, char const **out_dptr);
/*!
* \brief Deprecated, use `XGBoosterSaveModelToBuffer` instead.
*/
XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, bst_ulong *out_len,
const char **out_dptr);

View File

@@ -1,5 +1,5 @@
/*!
* Copyright (c) 2015-2021 by Contributors
* Copyright (c) 2015-2022 by Contributors
* \file data.h
* \brief The input data structure of xgboost.
* \author Tianqi Chen
@@ -36,10 +36,7 @@ enum class DataType : uint8_t {
kStr = 5
};
enum class FeatureType : uint8_t {
kNumerical,
kCategorical
};
enum class FeatureType : uint8_t { kNumerical = 0, kCategorical = 1 };
/*!
* \brief Meta information about dataset, always sits in memory.

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2021 by XGBoost Contributors
* Copyright 2021-2022 by XGBoost Contributors
* \file linalg.h
* \brief Linear algebra related utilities.
*/
@@ -567,7 +567,7 @@ template <typename T, int32_t D>
Json ArrayInterface(TensorView<T const, D> const &t) {
Json array_interface{Object{}};
array_interface["data"] = std::vector<Json>(2);
array_interface["data"][0] = Integer(reinterpret_cast<int64_t>(t.Values().data()));
array_interface["data"][0] = Integer{reinterpret_cast<int64_t>(t.Values().data())};
array_interface["data"][1] = Boolean{true};
if (t.DeviceIdx() >= 0) {
// Change this once we have different CUDA stream.

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2019 by Contributors
* Copyright 2014-2022 by Contributors
* \file tree_model.h
* \brief model structure for tree
* \author Tianqi Chen
@@ -42,7 +42,7 @@ struct TreeParam : public dmlc::Parameter<TreeParam> {
/*! \brief maximum depth, this is a statistics of the tree */
int deprecated_max_depth;
/*! \brief number of features used for tree construction */
int num_feature;
bst_feature_t num_feature;
/*!
* \brief leaf vector size, used for vector tree
* used to store more than one dimensional information in tree
@@ -629,6 +629,7 @@ class RegTree : public Model {
}
private:
template <bool typed>
void LoadCategoricalSplit(Json const& in);
void SaveCategoricalSplit(Json* p_out) const;
// vector of nodes