Model IO in JSON. (#5110)

This commit is contained in:
Jiaming Yuan
2019-12-11 11:20:40 +08:00
committed by GitHub
parent c7cc657a4d
commit 208ab3b1ff
25 changed files with 667 additions and 165 deletions

View File

@@ -5,23 +5,25 @@
#include <cstdio>
#include <cstring>
#include <fstream>
#include <algorithm>
#include <vector>
#include <string>
#include <memory>
#include "xgboost/data.h"
#include "xgboost/learner.h"
#include "xgboost/c_api.h"
#include "xgboost/logging.h"
#include "xgboost/version_config.h"
#include "xgboost/json.h"
#include "c_api_error.h"
#include "../data/simple_csr_source.h"
#include "../common/io.h"
#include "../data/adapter.h"
namespace xgboost {
// declare the data callback.
XGB_EXTERN_C int XGBoostNativeDataIterSetData(
@@ -569,23 +571,43 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char* fname) {
API_BEGIN();
CHECK_HANDLE();
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname, "r"));
static_cast<Learner*>(handle)->Load(fi.get());
if (common::FileExtension(fname) == "json") {
auto str = common::LoadSequentialFile(fname);
CHECK_GT(str.size(), 2);
CHECK_EQ(str[0], '{');
Json in { Json::Load({str.c_str(), str.size()}) };
static_cast<Learner*>(handle)->LoadModel(in);
} else {
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname, "r"));
static_cast<Learner*>(handle)->Load(fi.get());
}
API_END();
}
XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, const char* fname) {
XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, const char* c_fname) {
API_BEGIN();
CHECK_HANDLE();
std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname, "w"));
auto *bst = static_cast<Learner*>(handle);
bst->Save(fo.get());
std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(c_fname, "w"));
auto *learner = static_cast<Learner *>(handle);
learner->Configure();
if (common::FileExtension(c_fname) == "json") {
Json out { Object() };
learner->SaveModel(&out);
std::string str;
Json::Dump(out, &str);
fo->Write(str.c_str(), str.size());
} else {
auto *bst = static_cast<Learner*>(handle);
bst->Save(fo.get());
}
API_END();
}
// The following two functions are `Load` and `Save` for memory based serialization
// methods. E.g. Python pickle.
XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
const void* buf,
xgboost::bst_ulong len) {
const void* buf,
xgboost::bst_ulong len) {
API_BEGIN();
CHECK_HANDLE();
common::MemoryFixSizeBuffer fs((void*)buf, len); // NOLINT(*)
@@ -594,16 +616,17 @@ XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
}
XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle,
xgboost::bst_ulong* out_len,
const char** out_dptr) {
xgboost::bst_ulong* out_len,
const char** out_dptr) {
std::string& raw_str = XGBAPIThreadLocalStore::Get()->ret_str;
raw_str.resize(0);
API_BEGIN();
CHECK_HANDLE();
common::MemoryBufferStream fo(&raw_str);
auto *bst = static_cast<Learner*>(handle);
bst->Save(&fo);
auto *learner = static_cast<Learner*>(handle);
learner->Configure();
learner->Save(&fo);
*out_dptr = dmlc::BeginPtr(raw_str);
*out_len = static_cast<xgboost::bst_ulong>(raw_str.length());
API_END();
@@ -619,6 +642,7 @@ inline void XGBoostDumpModelImpl(
std::vector<std::string>& str_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_str;
std::vector<const char*>& charp_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_charp;
auto *bst = static_cast<Learner*>(handle);
bst->Configure();
str_vecs = bst->DumpModel(fmap, with_stats != 0, format);
charp_vecs.resize(str_vecs.size());
for (size_t i = 0; i < str_vecs.size(); ++i) {

View File

@@ -115,7 +115,7 @@ std::string LoadSequentialFile(std::string fname) {
}
size_t f_size_bytes = fs.st_size;
buffer.resize(f_size_bytes+1);
buffer.resize(f_size_bytes + 1);
int32_t fd = open(fname.c_str(), O_RDONLY);
posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
ssize_t bytes_read = read(fd, &buffer[0], f_size_bytes);

View File

@@ -85,6 +85,20 @@ class GBLinear : public GradientBooster {
model_.Save(fo);
}
void SaveModel(Json* p_out) const override {
auto& out = *p_out;
out["name"] = String{"gblinear"};
out["model"] = Object();
auto& model = out["model"];
model_.SaveModel(&model);
}
void LoadModel(Json const& in) override {
CHECK_EQ(get<String>(in["name"]), "gblinear");
auto const& model = in["model"];
model_.LoadModel(model);
}
void DoBoost(DMatrix *p_fmat,
HostDeviceVector<GradientPair> *in_gpair,
ObjFunction* obj) override {

38
src/gbm/gblinear_model.cc Normal file
View File

@@ -0,0 +1,38 @@
/*!
* Copyright 2019 by Contributors
*/
#include <utility>
#include <limits>
#include "xgboost/json.h"
#include "gblinear_model.h"
namespace xgboost {
namespace gbm {
void GBLinearModel::SaveModel(Json* p_out) const {
  // Serialize the dense weight vector into the "weights" field of the JSON
  // output.  Weights are written as JSON numbers, so the in-memory weight
  // type must match the JSON float type exactly or precision would change
  // on a save/load round trip.
  using WeightType = std::remove_reference<decltype(std::declval<decltype(weight)>().back())>::type;
  using JsonFloat = Number::Float;
  static_assert(std::is_same<WeightType, JsonFloat>::value,
                "Weight type should be of the same type with JSON float");

  auto& out = *p_out;
  std::vector<Json> j_weights(weight.size());
  size_t idx = 0;
  for (auto w : weight) {
    j_weights[idx++] = w;
  }
  out["weights"] = std::move(j_weights);
}
void GBLinearModel::LoadModel(Json const& in) {
  // Rebuild the dense weight vector from the "weights" array written by
  // SaveModel above.  The array length defines the new weight count.
  auto const& j_weights = get<Array const>(in["weights"]);
  weight.resize(j_weights.size());
  for (size_t i = 0; i < weight.size(); ++i) {
    weight[i] = get<Number const>(j_weights[i]);
  }
}
DMLC_REGISTER_PARAMETER(DeprecatedGBLinearModelParam);
} // namespace gbm
} // namespace xgboost

View File

@@ -62,27 +62,21 @@ class GBLinearModel : public Model {
learner_model_param_->num_output_group);
std::fill(weight.begin(), weight.end(), 0.0f);
}
void SaveModel(Json *p_out) const override;
void LoadModel(Json const &in) override;
// save the model to file
inline void Save(dmlc::Stream* fo) const {
void Save(dmlc::Stream *fo) const {
fo->Write(&param, sizeof(param));
fo->Write(weight);
}
// load model from file
inline void Load(dmlc::Stream* fi) {
void Load(dmlc::Stream *fi) {
CHECK_EQ(fi->Read(&param, sizeof(param)), sizeof(param));
fi->Read(&weight);
}
void LoadModel(dmlc::Stream* fi) override {
// They are the same right now until we can split up the saved parameter from model.
this->Load(fi);
}
void SaveModel(dmlc::Stream* fo) const override {
// They are the same right now until we can split up the saved parameter from model.
this->Save(fo);
}
// model bias
inline bst_float *bias() {
return &weight[learner_model_param_->num_feature *

View File

@@ -289,8 +289,19 @@ void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& ne
monitor_.Stop("CommitModel");
}
void GBTree::LoadModel(Json const& in) {
  // The "name" field tags the serialized booster type; refuse anything that
  // was not written by a gbtree booster before touching the model payload.
  auto const& booster_name = get<String>(in["name"]);
  CHECK_EQ(booster_name, "gbtree");
  model_.LoadModel(in["model"]);
}
void GBTree::SaveModel(Json* p_out) const {
  // Tag the output with the booster name, then delegate serialization of the
  // tree ensemble itself to GBTreeModel under the "model" key.
  auto& out = *p_out;
  out["name"] = String("gbtree");
  out["model"] = Object();
  auto& j_model = out["model"];
  model_.SaveModel(&j_model);
}
// dart
class Dart : public GBTree {
public:
explicit Dart(LearnerModelParam const* booster_config) :
@@ -303,6 +314,30 @@ class Dart : public GBTree {
}
}
void SaveModel(Json *p_out) const override {
  // Dart wraps a plain gbtree model (stored under "gbtree") and additionally
  // persists the per-tree drop weights used at prediction time.
  auto &out = *p_out;
  out["name"] = String("dart");
  out["gbtree"] = Object();
  auto& j_gbtree = out["gbtree"];
  GBTree::SaveModel(&j_gbtree);

  std::vector<Json> j_weight_drop;
  j_weight_drop.reserve(weight_drop_.size());
  for (auto w : weight_drop_) {
    j_weight_drop.emplace_back(Number(w));
  }
  out["weight_drop"] = Array(j_weight_drop);
}
void LoadModel(Json const& in) override {
  // Reverse of SaveModel: restore the shared gbtree part first, then the
  // per-tree drop weights.
  CHECK_EQ(get<String>(in["name"]), "dart");
  GBTree::LoadModel(in["gbtree"]);

  auto const& j_weight_drop = get<Array>(in["weight_drop"]);
  weight_drop_.resize(j_weight_drop.size());
  for (size_t i = 0; i < weight_drop_.size(); ++i) {
    weight_drop_[i] = get<Number const>(j_weight_drop[i]);
  }
}
void Load(dmlc::Stream* fi) override {
GBTree::Load(fi);
weight_drop_.resize(model_.param.num_trees);
@@ -387,7 +422,7 @@ class Dart : public GBTree {
if (init_out_preds) {
size_t n = num_group * p_fmat->Info().num_row_;
const auto& base_margin =
p_fmat->Info().base_margin_.ConstHostVector();
p_fmat->Info().base_margin_.ConstHostVector();
out_preds->resize(n);
if (base_margin.size() != 0) {
CHECK_EQ(out_preds->size(), n);

View File

@@ -192,6 +192,9 @@ class GBTree : public GradientBooster {
model_.Save(fo);
}
void SaveModel(Json* p_out) const override;
void LoadModel(Json const& in) override;
bool AllowLazyCheckPoint() const override {
return model_.learner_model_param_->num_output_group == 1 ||
tparam_.updater_seq.find("distcol") != std::string::npos;

85
src/gbm/gbtree_model.cc Normal file
View File

@@ -0,0 +1,85 @@
/*!
* Copyright 2019 by Contributors
*/
#include "xgboost/json.h"
#include "xgboost/logging.h"
#include "gbtree_model.h"
namespace xgboost {
namespace gbm {
void GBTreeModel::Save(dmlc::Stream* fo) const {
  // Binary serialization of the tree ensemble.  On-disk layout is:
  //   param | tree_0 ... tree_{n-1} | tree_info[0..n-1]
  // Load() below must read the fields back in exactly this order.
  CHECK_EQ(param.num_trees, static_cast<int32_t>(trees.size()));
  fo->Write(&param, sizeof(param));
  for (const auto & tree : trees) {
    tree->Save(fo);
  }
  // tree_info is written as a raw int32 array and only when non-empty.
  if (tree_info.size() != 0) {
    fo->Write(dmlc::BeginPtr(tree_info), sizeof(int32_t) * tree_info.size());
  }
}
void GBTreeModel::Load(dmlc::Stream* fi) {
  // Binary deserialization; mirrors the field order written by Save() above.
  CHECK_EQ(fi->Read(&param, sizeof(param)), sizeof(param))
      << "GBTree: invalid model file";
  // Drop any existing ensemble before reading the stored trees.
  trees.clear();
  trees_to_update.clear();
  for (int32_t i = 0; i < param.num_trees; ++i) {
    std::unique_ptr<RegTree> ptr(new RegTree());
    ptr->Load(fi);
    trees.push_back(std::move(ptr));
  }
  // tree_info is stored as a raw int32 array of length num_trees; the CHECK
  // verifies the stream actually contained that many bytes.
  tree_info.resize(param.num_trees);
  if (param.num_trees != 0) {
    CHECK_EQ(
        fi->Read(dmlc::BeginPtr(tree_info), sizeof(int32_t) * param.num_trees),
        sizeof(int32_t) * param.num_trees);
  }
}
void GBTreeModel::SaveModel(Json* p_out) const {
  // JSON serialization of the ensemble: the model parameter object plus one
  // JSON object per tree ("trees") and the per-tree group index ("tree_info").
  auto& out = *p_out;
  CHECK_EQ(param.num_trees, static_cast<int>(trees.size()));
  out["model_param"] = toJson(param);

  std::vector<Json> trees_json;
  trees_json.reserve(trees.size());
  for (size_t t = 0; t < trees.size(); ++t) {
    Json tree_json{Object()};
    trees[t]->SaveModel(&tree_json);
    // Each tree carries its ordinal position so readers can identify it.
    tree_json["id"] = std::to_string(t);
    trees_json.emplace_back(std::move(tree_json));
  }

  std::vector<Json> tree_info_json(tree_info.size());
  for (size_t i = 0; i < tree_info.size(); ++i) {
    tree_info_json[i] = Integer(tree_info[i]);
  }

  out["trees"] = Array(std::move(trees_json));
  out["tree_info"] = Array(std::move(tree_info_json));
}
void GBTreeModel::LoadModel(Json const& in) {
  // Restore the ensemble from its JSON representation.  `model_param` carries
  // num_trees, which must agree with the lengths of both the "trees" and
  // "tree_info" arrays; without these checks a malformed or corrupted model
  // file would index past the end of `tree_info_json` below (out-of-range
  // access) or silently load an inconsistent ensemble.
  fromJson(in["model_param"], &param);
  trees.clear();
  trees_to_update.clear();

  auto const& trees_json = get<Array const>(in["trees"]);
  CHECK_EQ(static_cast<int32_t>(trees_json.size()), param.num_trees)
      << "`trees` array length does not match `num_trees`.";
  trees.resize(trees_json.size());
  for (size_t t = 0; t < trees.size(); ++t) {
    trees[t].reset(new RegTree());
    trees[t]->LoadModel(trees_json[t]);
  }

  auto const& tree_info_json = get<Array const>(in["tree_info"]);
  CHECK_EQ(static_cast<int32_t>(tree_info_json.size()), param.num_trees)
      << "`tree_info` array length does not match `num_trees`.";
  tree_info.resize(param.num_trees);
  for (int32_t i = 0; i < param.num_trees; ++i) {
    tree_info[i] = get<Integer const>(tree_info_json[i]);
  }
}
} // namespace gbm
} // namespace xgboost

View File

@@ -84,43 +84,11 @@ struct GBTreeModel : public Model {
}
}
void LoadModel(dmlc::Stream* fi) override {
// They are the same right now until we can split up the saved parameter from model.
this->Load(fi);
}
void SaveModel(dmlc::Stream* fo) const override {
// They are the same right now until we can split up the saved parameter from model.
this->Save(fo);
}
void Load(dmlc::Stream* fi);
void Save(dmlc::Stream* fo) const;
void Load(dmlc::Stream* fi) {
CHECK_EQ(fi->Read(&param, sizeof(param)), sizeof(param))
<< "GBTree: invalid model file";
trees.clear();
trees_to_update.clear();
for (int i = 0; i < param.num_trees; ++i) {
std::unique_ptr<RegTree> ptr(new RegTree());
ptr->LoadModel(fi);
trees.push_back(std::move(ptr));
}
tree_info.resize(param.num_trees);
if (param.num_trees != 0) {
CHECK_EQ(
fi->Read(dmlc::BeginPtr(tree_info), sizeof(int) * param.num_trees),
sizeof(int) * param.num_trees);
}
}
void Save(dmlc::Stream* fo) const {
CHECK_EQ(param.num_trees, static_cast<int>(trees.size()));
fo->Write(&param, sizeof(param));
for (const auto & tree : trees) {
tree->SaveModel(fo);
}
if (tree_info.size() != 0) {
fo->Write(dmlc::BeginPtr(tree_info), sizeof(int) * tree_info.size());
}
}
void SaveModel(Json* p_out) const override;
void LoadModel(Json const& p_out) override;
std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
std::string format) const {

View File

@@ -266,14 +266,61 @@ class LearnerImpl : public Learner {
}
}
void LoadModel(dmlc::Stream* fi) override {
// They are the same right now until we can split up the saved parameter from model.
this->Load(fi);
void LoadModel(Json const& in) override {
CHECK(IsA<Object>(in));
Version::Load(in, false);
auto const& learner = get<Object>(in["Learner"]);
mparam_.FromJson(learner.at("learner_model_param"));
auto const& objective_fn = learner.at("objective");
std::string name = get<String>(objective_fn["name"]);
tparam_.UpdateAllowUnknown(Args{{"objective", name}});
obj_.reset(ObjFunction::Create(name, &generic_parameters_));
obj_->LoadConfig(objective_fn);
auto const& gradient_booster = learner.at("gradient_booster");
name = get<String>(gradient_booster["name"]);
tparam_.UpdateAllowUnknown(Args{{"booster", name}});
gbm_.reset(GradientBooster::Create(tparam_.booster,
&generic_parameters_, &learner_model_param_,
cache_));
gbm_->LoadModel(gradient_booster);
learner_model_param_ = LearnerModelParam(mparam_,
obj_->ProbToMargin(mparam_.base_score));
auto const& j_attributes = get<Object const>(learner.at("attributes"));
attributes_.clear();
for (auto const& kv : j_attributes) {
attributes_[kv.first] = get<String const>(kv.second);
}
this->need_configuration_ = true;
}
void SaveModel(dmlc::Stream* fo) const override {
// They are the same right now until we can split up the saved parameter from model.
this->Save(fo);
void SaveModel(Json* p_out) const override {
CHECK(!this->need_configuration_) << "Call Configure before saving model.";
Version::Save(p_out);
Json& out { *p_out };
out["Learner"] = Object();
auto& learner = out["Learner"];
learner["learner_model_param"] = mparam_.ToJson();
learner["gradient_booster"] = Object();
auto& gradient_booster = learner["gradient_booster"];
gbm_->SaveModel(&gradient_booster);
learner["objective"] = Object();
auto& objective_fn = learner["objective"];
obj_->SaveConfig(&objective_fn);
learner["attributes"] = Object();
for (auto const& kv : attributes_) {
learner["attributes"][kv.first] = String(kv.second);
}
}
void Load(dmlc::Stream* fi) override {
@@ -747,7 +794,6 @@ class LearnerImpl : public Learner {
LearnerTrainParam tparam_;
// configurations
std::map<std::string, std::string> cfg_;
// FIXME(trivialfis): Legacy field used to store extra attributes into binary model.
std::map<std::string, std::string> attributes_;
std::vector<std::string> metric_names_;
static std::string const kEvalMetric; // NOLINT

View File

@@ -8,12 +8,15 @@
#include <xgboost/tree_model.h>
#include <xgboost/logging.h>
#include <xgboost/json.h>
#include <sstream>
#include <limits>
#include <cmath>
#include <iomanip>
#include "param.h"
#include "../common/common.h"
namespace xgboost {
// register tree parameter
@@ -615,7 +618,7 @@ std::string RegTree::DumpModel(const FeatureMap& fmap,
return result;
}
void RegTree::LoadModel(dmlc::Stream* fi) {
void RegTree::Load(dmlc::Stream* fi) {
CHECK_EQ(fi->Read(&param, sizeof(TreeParam)), sizeof(TreeParam));
nodes_.resize(param.num_nodes);
stats_.resize(param.num_nodes);
@@ -633,11 +636,7 @@ void RegTree::LoadModel(dmlc::Stream* fi) {
}
CHECK_EQ(static_cast<int>(deleted_nodes_.size()), param.num_deleted);
}
/*!
* \brief save model to stream
* \param fo output stream
*/
void RegTree::SaveModel(dmlc::Stream* fo) const {
void RegTree::Save(dmlc::Stream* fo) const {
CHECK_EQ(param.num_nodes, static_cast<int>(nodes_.size()));
CHECK_EQ(param.num_nodes, static_cast<int>(stats_.size()));
fo->Write(&param, sizeof(TreeParam));
@@ -646,6 +645,114 @@ void RegTree::SaveModel(dmlc::Stream* fo) const {
fo->Write(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * nodes_.size());
}
void RegTree::LoadModel(Json const& in) {
  // Deserialize a tree from the column-oriented JSON layout written by
  // SaveModel(): one array per node attribute, each of length num_nodes.
  fromJson(in["tree_param"], &param);
  auto n_nodes = param.num_nodes;
  CHECK_NE(n_nodes, 0);
  // stats -- per-node training statistics.
  auto const& loss_changes = get<Array const>(in["loss_changes"]);
  CHECK_EQ(loss_changes.size(), n_nodes);
  auto const& sum_hessian = get<Array const>(in["sum_hessian"]);
  CHECK_EQ(sum_hessian.size(), n_nodes);
  auto const& base_weights = get<Array const>(in["base_weights"]);
  CHECK_EQ(base_weights.size(), n_nodes);
  auto const& leaf_child_counts = get<Array const>(in["leaf_child_counts"]);
  CHECK_EQ(leaf_child_counts.size(), n_nodes);
  // nodes -- the tree structure itself.
  auto const& lefts = get<Array const>(in["left_children"]);
  CHECK_EQ(lefts.size(), n_nodes);
  auto const& rights = get<Array const>(in["right_children"]);
  CHECK_EQ(rights.size(), n_nodes);
  auto const& parents = get<Array const>(in["parents"]);
  CHECK_EQ(parents.size(), n_nodes);
  auto const& indices = get<Array const>(in["split_indices"]);
  CHECK_EQ(indices.size(), n_nodes);
  auto const& conds = get<Array const>(in["split_conditions"]);
  CHECK_EQ(conds.size(), n_nodes);
  auto const& default_left = get<Array const>(in["default_left"]);
  CHECK_EQ(default_left.size(), n_nodes);
  stats_.resize(n_nodes);
  nodes_.resize(n_nodes);
  // Rebuild stats_[i] and nodes_[i] in lockstep from the parallel arrays.
  for (int32_t i = 0; i < n_nodes; ++i) {
    auto& s = stats_[i];
    s.loss_chg = get<Number const>(loss_changes[i]);
    s.sum_hess = get<Number const>(sum_hessian[i]);
    s.base_weight = get<Number const>(base_weights[i]);
    s.leaf_child_cnt = get<Integer const>(leaf_child_counts[i]);

    auto& n = nodes_[i];
    auto left = get<Integer const>(lefts[i]);
    auto right = get<Integer const>(rights[i]);
    auto parent = get<Integer const>(parents[i]);
    auto ind = get<Integer const>(indices[i]);
    auto cond = get<Number const>(conds[i]);
    auto dft_left = get<Boolean const>(default_left[i]);
    // Node(left, right, parent, split_index, split_cond, default_left)
    n = Node(left, right, parent, ind, cond, dft_left);
  }
  // Reconstruct the free list of deleted node slots from node flags; root
  // (node 0) is never deleted, so the scan starts at 1.  The final CHECK
  // ties the rebuilt list back to the stored parameter.
  deleted_nodes_.resize(0);
  for (bst_node_t i = 1; i < param.num_nodes; ++i) {
    if (nodes_[i].IsDeleted()) {
      deleted_nodes_.push_back(i);
    }
  }
  CHECK_EQ(static_cast<bst_node_t>(deleted_nodes_.size()), param.num_deleted);
}
void RegTree::SaveModel(Json* p_out) const {
  // Serialize the tree in a column-oriented JSON layout: one array per node
  // attribute, all of length num_nodes, plus the tree parameter object.
  // LoadModel() above reads these fields back by the same key names.
  auto& out = *p_out;
  CHECK_EQ(param.num_nodes, static_cast<int>(nodes_.size()));
  CHECK_EQ(param.num_nodes, static_cast<int>(stats_.size()));
  out["tree_param"] = toJson(param);
  // toJson stores parameter fields as strings; sanity-check the round trip.
  CHECK_EQ(get<String>(out["tree_param"]["num_nodes"]), std::to_string(param.num_nodes));
  using I = Integer::Int;
  auto n_nodes = param.num_nodes;
  // stats -- per-node training statistics.
  std::vector<Json> loss_changes(n_nodes);
  std::vector<Json> sum_hessian(n_nodes);
  std::vector<Json> base_weights(n_nodes);
  std::vector<Json> leaf_child_counts(n_nodes);
  // nodes -- the tree structure itself.
  std::vector<Json> lefts(n_nodes);
  std::vector<Json> rights(n_nodes);
  std::vector<Json> parents(n_nodes);
  std::vector<Json> indices(n_nodes);
  std::vector<Json> conds(n_nodes);
  std::vector<Json> default_left(n_nodes);
  for (int32_t i = 0; i < n_nodes; ++i) {
    auto const& s = stats_[i];
    loss_changes[i] = s.loss_chg;
    sum_hessian[i] = s.sum_hess;
    base_weights[i] = s.base_weight;
    leaf_child_counts[i] = static_cast<I>(s.leaf_child_cnt);

    auto const& n = nodes_[i];
    lefts[i] = static_cast<I>(n.LeftChild());
    rights[i] = static_cast<I>(n.RightChild());
    parents[i] = static_cast<I>(n.Parent());
    indices[i] = static_cast<I>(n.SplitIndex());
    conds[i] = n.SplitCond();
    default_left[i] = n.DefaultLeft();
  }
  out["loss_changes"] = std::move(loss_changes);
  out["sum_hessian"] = std::move(sum_hessian);
  out["base_weights"] = std::move(base_weights);
  out["leaf_child_counts"] = std::move(leaf_child_counts);

  out["left_children"] = std::move(lefts);
  out["right_children"] = std::move(rights);
  out["parents"] = std::move(parents);
  out["split_indices"] = std::move(indices);
  out["split_conditions"] = std::move(conds);
  out["default_left"] = std::move(default_left);
}
void RegTree::FillNodeMeanValues() {
size_t num_nodes = this->param.num_nodes;
if (this->node_mean_values_.size() == num_nodes) {

View File

@@ -1110,12 +1110,12 @@ class GPUHistMakerSpecialised {
common::MemoryBufferStream fs(&s_model);
int rank = rabit::GetRank();
if (rank == 0) {
local_tree->SaveModel(&fs);
local_tree->Save(&fs);
}
fs.Seek(0);
rabit::Broadcast(&s_model, 0);
RegTree reference_tree {}; // rank 0 tree
reference_tree.LoadModel(&fs);
reference_tree.Load(&fs);
CHECK(*local_tree == reference_tree);
}

View File

@@ -40,13 +40,13 @@ class TreeSyncher: public TreeUpdater {
int rank = rabit::GetRank();
if (rank == 0) {
for (auto tree : trees) {
tree->SaveModel(&fs);
tree->Save(&fs);
}
}
fs.Seek(0);
rabit::Broadcast(&s_model, 0);
for (auto tree : trees) {
tree->LoadModel(&fs);
tree->Load(&fs);
}
}
};