[breaking] Change internal model serialization to UBJSON. (#7556)
* Use typed array for models. * Change the memory snapshot format. * Add new C API for saving to raw format.
This commit is contained in:
@@ -32,6 +32,7 @@ dependencies:
|
||||
- awscli
|
||||
- numba
|
||||
- llvmlite
|
||||
- py-ubjson
|
||||
- pip:
|
||||
- shap
|
||||
- ipython # required by shap at import time.
|
||||
|
||||
@@ -31,6 +31,7 @@ dependencies:
|
||||
- jsonschema
|
||||
- boto3
|
||||
- awscli
|
||||
- py-ubjson
|
||||
- pip:
|
||||
- sphinx_rtd_theme
|
||||
- datatable
|
||||
|
||||
@@ -18,3 +18,4 @@ dependencies:
|
||||
- jsonschema
|
||||
- python-graphviz
|
||||
- pip
|
||||
- py-ubjson
|
||||
|
||||
@@ -16,3 +16,4 @@ dependencies:
|
||||
- python-graphviz
|
||||
- modin-ray
|
||||
- pip
|
||||
- py-ubjson
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2019-2020 XGBoost contributors
|
||||
* Copyright 2019-2022 XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/version_config.h>
|
||||
@@ -150,6 +150,33 @@ TEST(CAPI, JsonModelIO) {
|
||||
|
||||
ASSERT_EQ(model_str_0.front(), '{');
|
||||
ASSERT_EQ(model_str_0, model_str_1);
|
||||
|
||||
/**
|
||||
* In memory
|
||||
*/
|
||||
bst_ulong len{0};
|
||||
char const *data;
|
||||
XGBoosterSaveModelToBuffer(handle, R"({"format": "ubj"})", &len, &data);
|
||||
ASSERT_GT(len, 3);
|
||||
|
||||
XGBoosterLoadModelFromBuffer(handle, data, len);
|
||||
char const *saved;
|
||||
bst_ulong saved_len{0};
|
||||
XGBoosterSaveModelToBuffer(handle, R"({"format": "ubj"})", &saved_len, &saved);
|
||||
ASSERT_EQ(len, saved_len);
|
||||
auto l = StringView{data, len};
|
||||
auto r = StringView{saved, saved_len};
|
||||
ASSERT_EQ(l.size(), r.size());
|
||||
ASSERT_EQ(l, r);
|
||||
|
||||
std::string buffer;
|
||||
Json::Dump(Json::Load(l, std::ios::binary), &buffer);
|
||||
ASSERT_EQ(model_str_0.size() - 1, buffer.size());
|
||||
ASSERT_EQ(model_str_0.back(), '\0');
|
||||
ASSERT_TRUE(std::equal(model_str_0.begin(), model_str_0.end() - 1, buffer.begin()));
|
||||
|
||||
ASSERT_EQ(XGBoosterSaveModelToBuffer(handle, R"({})", &len, &data), -1);
|
||||
ASSERT_EQ(XGBoosterSaveModelToBuffer(handle, R"({"format": "foo"})", &len, &data), -1);
|
||||
}
|
||||
|
||||
TEST(CAPI, CatchDMLCError) {
|
||||
|
||||
@@ -178,8 +178,8 @@ void TestLearnerSerialization(Args args, FeatureMap const& fmap, std::shared_ptr
|
||||
learner->Save(&fo);
|
||||
}
|
||||
|
||||
Json m_0 = Json::Load(StringView{continued_model.c_str(), continued_model.size()});
|
||||
Json m_1 = Json::Load(StringView{model_at_2kiter.c_str(), model_at_2kiter.size()});
|
||||
Json m_0 = Json::Load(StringView{continued_model}, std::ios::binary);
|
||||
Json m_1 = Json::Load(StringView{model_at_2kiter}, std::ios::binary);
|
||||
|
||||
CompareJSON(m_0, m_1);
|
||||
}
|
||||
@@ -214,8 +214,8 @@ void TestLearnerSerialization(Args args, FeatureMap const& fmap, std::shared_ptr
|
||||
common::MemoryBufferStream fo(&serialised_model_tmp);
|
||||
learner->Save(&fo);
|
||||
|
||||
Json m_0 = Json::Load(StringView{model_at_2kiter.c_str(), model_at_2kiter.size()});
|
||||
Json m_1 = Json::Load(StringView{serialised_model_tmp.c_str(), serialised_model_tmp.size()});
|
||||
Json m_0 = Json::Load(StringView{model_at_2kiter}, std::ios::binary);
|
||||
Json m_1 = Json::Load(StringView{serialised_model_tmp}, std::ios::binary);
|
||||
// GPU ID is changed as data is coming from device.
|
||||
ASSERT_EQ(get<Object>(m_0["Config"]["learner"]["generic_param"]).erase("gpu_id"),
|
||||
get<Object>(m_1["Config"]["learner"]["generic_param"]).erase("gpu_id"));
|
||||
|
||||
@@ -198,8 +198,7 @@ void CheckReload(RegTree const &tree) {
|
||||
Json saved{Object()};
|
||||
loaded_tree.SaveModel(&saved);
|
||||
|
||||
auto same = out == saved;
|
||||
ASSERT_TRUE(same);
|
||||
ASSERT_EQ(out, saved);
|
||||
}
|
||||
|
||||
TEST(Tree, CategoricalIO) {
|
||||
@@ -433,12 +432,12 @@ TEST(Tree, JsonIO) {
|
||||
ASSERT_EQ(get<String>(tparam["num_nodes"]), "3");
|
||||
ASSERT_EQ(get<String>(tparam["size_leaf_vector"]), "0");
|
||||
|
||||
ASSERT_EQ(get<Array const>(j_tree["left_children"]).size(), 3ul);
|
||||
ASSERT_EQ(get<Array const>(j_tree["right_children"]).size(), 3ul);
|
||||
ASSERT_EQ(get<Array const>(j_tree["parents"]).size(), 3ul);
|
||||
ASSERT_EQ(get<Array const>(j_tree["split_indices"]).size(), 3ul);
|
||||
ASSERT_EQ(get<Array const>(j_tree["split_conditions"]).size(), 3ul);
|
||||
ASSERT_EQ(get<Array const>(j_tree["default_left"]).size(), 3ul);
|
||||
ASSERT_EQ(get<I32Array const>(j_tree["left_children"]).size(), 3ul);
|
||||
ASSERT_EQ(get<I32Array const>(j_tree["right_children"]).size(), 3ul);
|
||||
ASSERT_EQ(get<I32Array const>(j_tree["parents"]).size(), 3ul);
|
||||
ASSERT_EQ(get<I32Array const>(j_tree["split_indices"]).size(), 3ul);
|
||||
ASSERT_EQ(get<F32Array const>(j_tree["split_conditions"]).size(), 3ul);
|
||||
ASSERT_EQ(get<U8Array const>(j_tree["default_left"]).size(), 3ul);
|
||||
|
||||
RegTree loaded_tree;
|
||||
loaded_tree.LoadModel(j_tree);
|
||||
|
||||
@@ -14,7 +14,7 @@ dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
|
||||
rng = np.random.RandomState(1994)
|
||||
|
||||
|
||||
def json_model(model_path, parameters):
|
||||
def json_model(model_path: str, parameters: dict) -> dict:
|
||||
X = np.random.random((10, 3))
|
||||
y = np.random.randint(2, size=(10,))
|
||||
|
||||
@@ -22,9 +22,14 @@ def json_model(model_path, parameters):
|
||||
|
||||
bst = xgb.train(parameters, dm1)
|
||||
bst.save_model(model_path)
|
||||
if model_path.endswith("ubj"):
|
||||
import ubjson
|
||||
with open(model_path, "rb") as ubjfd:
|
||||
model = ubjson.load(ubjfd)
|
||||
else:
|
||||
with open(model_path, 'r') as fd:
|
||||
model = json.load(fd)
|
||||
|
||||
with open(model_path, 'r') as fd:
|
||||
model = json.load(fd)
|
||||
return model
|
||||
|
||||
|
||||
@@ -259,23 +264,40 @@ class TestModels:
|
||||
buf_from_raw = from_raw.save_raw()
|
||||
assert buf == buf_from_raw
|
||||
|
||||
def test_model_json_io(self):
|
||||
def run_model_json_io(self, parameters: dict, ext: str) -> None:
|
||||
if ext == "ubj" and tm.no_ubjson()["condition"]:
|
||||
pytest.skip(tm.no_ubjson()["reason"])
|
||||
|
||||
loc = locale.getpreferredencoding(False)
|
||||
model_path = 'test_model_json_io.json'
|
||||
parameters = {'tree_method': 'hist', 'booster': 'gbtree'}
|
||||
model_path = 'test_model_json_io.' + ext
|
||||
j_model = json_model(model_path, parameters)
|
||||
assert isinstance(j_model['learner'], dict)
|
||||
|
||||
bst = xgb.Booster(model_file=model_path)
|
||||
|
||||
bst.save_model(fname=model_path)
|
||||
with open(model_path, 'r') as fd:
|
||||
j_model = json.load(fd)
|
||||
if ext == "ubj":
|
||||
import ubjson
|
||||
with open(model_path, "rb") as ubjfd:
|
||||
j_model = ubjson.load(ubjfd)
|
||||
else:
|
||||
with open(model_path, 'r') as fd:
|
||||
j_model = json.load(fd)
|
||||
|
||||
assert isinstance(j_model['learner'], dict)
|
||||
|
||||
os.remove(model_path)
|
||||
assert locale.getpreferredencoding(False) == loc
|
||||
|
||||
@pytest.mark.parametrize("ext", ["json", "ubj"])
|
||||
def test_model_json_io(self, ext: str) -> None:
|
||||
parameters = {"booster": "gbtree", "tree_method": "hist"}
|
||||
self.run_model_json_io(parameters, ext)
|
||||
parameters = {"booster": "gblinear"}
|
||||
self.run_model_json_io(parameters, ext)
|
||||
parameters = {"booster": "dart", "tree_method": "hist"}
|
||||
self.run_model_json_io(parameters, ext)
|
||||
|
||||
@pytest.mark.skipif(**tm.no_json_schema())
|
||||
def test_json_io_schema(self):
|
||||
import jsonschema
|
||||
|
||||
@@ -2,6 +2,7 @@ import pickle
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import os
|
||||
import json
|
||||
|
||||
|
||||
kRows = 100
|
||||
@@ -15,13 +16,14 @@ def generate_data():
|
||||
|
||||
|
||||
class TestPickling:
|
||||
def run_model_pickling(self, xgb_params):
|
||||
def run_model_pickling(self, xgb_params) -> str:
|
||||
X, y = generate_data()
|
||||
dtrain = xgb.DMatrix(X, y)
|
||||
bst = xgb.train(xgb_params, dtrain)
|
||||
|
||||
dump_0 = bst.get_dump(dump_format='json')
|
||||
assert dump_0
|
||||
config_0 = bst.save_config()
|
||||
|
||||
filename = 'model.pkl'
|
||||
|
||||
@@ -42,9 +44,22 @@ class TestPickling:
|
||||
if os.path.exists(filename):
|
||||
os.remove(filename)
|
||||
|
||||
config_1 = bst.save_config()
|
||||
assert config_0 == config_1
|
||||
return json.loads(config_0)
|
||||
|
||||
def test_model_pickling_json(self):
|
||||
params = {
|
||||
'nthread': 1,
|
||||
'tree_method': 'hist',
|
||||
}
|
||||
self.run_model_pickling(params)
|
||||
def check(config):
|
||||
updater = config["learner"]["gradient_booster"]["updater"]
|
||||
if params["tree_method"] == "exact":
|
||||
subsample = updater["grow_colmaker"]["train_param"]["subsample"]
|
||||
else:
|
||||
subsample = updater["grow_quantile_histmaker"]["train_param"]["subsample"]
|
||||
assert float(subsample) == 0.5
|
||||
|
||||
params = {"nthread": 8, "tree_method": "hist", "subsample": 0.5}
|
||||
config = self.run_model_pickling(params)
|
||||
check(config)
|
||||
params = {"nthread": 8, "tree_method": "exact", "subsample": 0.5}
|
||||
config = self.run_model_pickling(params)
|
||||
check(config)
|
||||
|
||||
@@ -29,6 +29,15 @@ except ImportError:
|
||||
memory = Memory('./cachedir', verbose=0)
|
||||
|
||||
|
||||
def no_ubjson():
    """Return a pytest ``skipif``-style condition dict for the optional
    ``ubjson`` package.

    Returns a mapping with:
      - ``condition``: True when ``ubjson`` is NOT importable (i.e. the
        dependent test should be skipped), False when it is available.
      - ``reason``: human-readable skip reason shown by pytest.
    """
    # Fix: reason string previously read "intsalled" (typo) — this text is
    # user-visible as the pytest skip reason.
    reason = "ubjson is not installed."
    try:
        import ubjson  # noqa
        return {"condition": False, "reason": reason}
    except ImportError:
        return {"condition": True, "reason": reason}
|
||||
|
||||
|
||||
def no_sklearn():
|
||||
return {'condition': not SKLEARN_INSTALLED,
|
||||
'reason': 'Scikit-Learn is not installed'}
|
||||
|
||||
Reference in New Issue
Block a user