[breaking] Change internal model serialization to UBJSON. (#7556)

* Use typed array for models.
* Change the memory snapshot format.
* Add new C API for saving to raw format.
This commit is contained in:
Jiaming Yuan
2022-01-16 02:11:53 +08:00
committed by GitHub
parent 13b0fa4b97
commit a1bcd33a3b
24 changed files with 566 additions and 255 deletions

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2019-2020 XGBoost contributors
* Copyright 2019-2022 XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/version_config.h>
@@ -150,6 +150,33 @@ TEST(CAPI, JsonModelIO) {
ASSERT_EQ(model_str_0.front(), '{');
ASSERT_EQ(model_str_0, model_str_1);
/**
* In memory
*/
bst_ulong len{0};
char const *data;
XGBoosterSaveModelToBuffer(handle, R"({"format": "ubj"})", &len, &data);
ASSERT_GT(len, 3);
XGBoosterLoadModelFromBuffer(handle, data, len);
char const *saved;
bst_ulong saved_len{0};
XGBoosterSaveModelToBuffer(handle, R"({"format": "ubj"})", &saved_len, &saved);
ASSERT_EQ(len, saved_len);
auto l = StringView{data, len};
auto r = StringView{saved, saved_len};
ASSERT_EQ(l.size(), r.size());
ASSERT_EQ(l, r);
std::string buffer;
Json::Dump(Json::Load(l, std::ios::binary), &buffer);
ASSERT_EQ(model_str_0.size() - 1, buffer.size());
ASSERT_EQ(model_str_0.back(), '\0');
ASSERT_TRUE(std::equal(model_str_0.begin(), model_str_0.end() - 1, buffer.begin()));
ASSERT_EQ(XGBoosterSaveModelToBuffer(handle, R"({})", &len, &data), -1);
ASSERT_EQ(XGBoosterSaveModelToBuffer(handle, R"({"format": "foo"})", &len, &data), -1);
}
TEST(CAPI, CatchDMLCError) {

View File

@@ -178,8 +178,8 @@ void TestLearnerSerialization(Args args, FeatureMap const& fmap, std::shared_ptr
learner->Save(&fo);
}
Json m_0 = Json::Load(StringView{continued_model.c_str(), continued_model.size()});
Json m_1 = Json::Load(StringView{model_at_2kiter.c_str(), model_at_2kiter.size()});
Json m_0 = Json::Load(StringView{continued_model}, std::ios::binary);
Json m_1 = Json::Load(StringView{model_at_2kiter}, std::ios::binary);
CompareJSON(m_0, m_1);
}
@@ -214,8 +214,8 @@ void TestLearnerSerialization(Args args, FeatureMap const& fmap, std::shared_ptr
common::MemoryBufferStream fo(&serialised_model_tmp);
learner->Save(&fo);
Json m_0 = Json::Load(StringView{model_at_2kiter.c_str(), model_at_2kiter.size()});
Json m_1 = Json::Load(StringView{serialised_model_tmp.c_str(), serialised_model_tmp.size()});
Json m_0 = Json::Load(StringView{model_at_2kiter}, std::ios::binary);
Json m_1 = Json::Load(StringView{serialised_model_tmp}, std::ios::binary);
// GPU ID is changed as data is coming from device.
ASSERT_EQ(get<Object>(m_0["Config"]["learner"]["generic_param"]).erase("gpu_id"),
get<Object>(m_1["Config"]["learner"]["generic_param"]).erase("gpu_id"));

View File

@@ -198,8 +198,7 @@ void CheckReload(RegTree const &tree) {
Json saved{Object()};
loaded_tree.SaveModel(&saved);
auto same = out == saved;
ASSERT_TRUE(same);
ASSERT_EQ(out, saved);
}
TEST(Tree, CategoricalIO) {
@@ -433,12 +432,12 @@ TEST(Tree, JsonIO) {
ASSERT_EQ(get<String>(tparam["num_nodes"]), "3");
ASSERT_EQ(get<String>(tparam["size_leaf_vector"]), "0");
ASSERT_EQ(get<Array const>(j_tree["left_children"]).size(), 3ul);
ASSERT_EQ(get<Array const>(j_tree["right_children"]).size(), 3ul);
ASSERT_EQ(get<Array const>(j_tree["parents"]).size(), 3ul);
ASSERT_EQ(get<Array const>(j_tree["split_indices"]).size(), 3ul);
ASSERT_EQ(get<Array const>(j_tree["split_conditions"]).size(), 3ul);
ASSERT_EQ(get<Array const>(j_tree["default_left"]).size(), 3ul);
ASSERT_EQ(get<I32Array const>(j_tree["left_children"]).size(), 3ul);
ASSERT_EQ(get<I32Array const>(j_tree["right_children"]).size(), 3ul);
ASSERT_EQ(get<I32Array const>(j_tree["parents"]).size(), 3ul);
ASSERT_EQ(get<I32Array const>(j_tree["split_indices"]).size(), 3ul);
ASSERT_EQ(get<F32Array const>(j_tree["split_conditions"]).size(), 3ul);
ASSERT_EQ(get<U8Array const>(j_tree["default_left"]).size(), 3ul);
RegTree loaded_tree;
loaded_tree.LoadModel(j_tree);