Swap byte-order in binary serializer to support big-endian arch (#5813)
* fixed some endian issues
* Use dmlc::ByteSwap() to simplify code
* Fix lint check
* [CI] Add test for s390x
* Download latest CMake on s390x
* Fix a bug in my code
* Save magic number in dmatrix with byteswap on big-endian machine
* Save version in binary with byteswap on big-endian machine
* Load scalar with byteswap in MetaInfo
* Add a debugging message
* Handle arrays correctly when byteswapping
* EOF can also be 255
* Handle magic number in MetaInfo carefully
* Skip Tree.Load test for big-endian, since the test manually builds little-endian binary model
* Handle missing packages in Python tests
* Don't use boto3 in model compatibility tests
* Add s390 Docker file for local testing
* Add model compatibility tests
* Add R compatibility test
* Revert "Add R compatibility test"

  This reverts commit c2d2bdcb7dbae133cbb927fcd20f7e83ee2b18a8.

Co-authored-by: Qi Zhang <q.zhang@ibm.com>
Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
@@ -49,9 +49,9 @@ Version::TripletT Version::Load(dmlc::Stream* fi) {
|
||||
LOG(FATAL) << msg;
|
||||
}
|
||||
|
||||
CHECK_EQ(fi->Read(&major, sizeof(major)), sizeof(major)) << msg;
|
||||
CHECK_EQ(fi->Read(&minor, sizeof(major)), sizeof(minor)) << msg;
|
||||
CHECK_EQ(fi->Read(&patch, sizeof(major)), sizeof(patch)) << msg;
|
||||
CHECK(fi->Read(&major)) << msg;
|
||||
CHECK(fi->Read(&minor)) << msg;
|
||||
CHECK(fi->Read(&patch)) << msg;
|
||||
|
||||
return std::make_tuple(major, minor, patch);
|
||||
}
|
||||
@@ -69,9 +69,9 @@ void Version::Save(dmlc::Stream* fo) {
|
||||
std::tie(major, minor, patch) = Self();
|
||||
std::string verstr { u8"version:" };
|
||||
fo->Write(&verstr[0], verstr.size());
|
||||
fo->Write(&major, sizeof(major));
|
||||
fo->Write(&minor, sizeof(minor));
|
||||
fo->Write(&patch, sizeof(patch));
|
||||
fo->Write(major);
|
||||
fo->Write(minor);
|
||||
fo->Write(patch);
|
||||
}
|
||||
|
||||
std::string Version::String(TripletT const& version) {
|
||||
|
||||
@@ -83,7 +83,7 @@ void LoadScalarField(dmlc::Stream* strm, const std::string& expected_name,
|
||||
CHECK(strm->Read(&is_scalar)) << invalid;
|
||||
CHECK(is_scalar)
|
||||
<< invalid << "Expected field " << expected_name << " to be a scalar; got a vector";
|
||||
CHECK(strm->Read(field, sizeof(T))) << invalid;
|
||||
CHECK(strm->Read(field)) << invalid;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -653,14 +653,18 @@ DMatrix* DMatrix::Load(const std::string& uri,
|
||||
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r", true));
|
||||
if (fi != nullptr) {
|
||||
common::PeekableInStream is(fi.get());
|
||||
if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic) &&
|
||||
magic == data::SimpleDMatrix::kMagic) {
|
||||
DMatrix* dmat = new data::SimpleDMatrix(&is);
|
||||
if (!silent) {
|
||||
LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with "
|
||||
<< dmat->Info().num_nonzero_ << " entries loaded from " << uri;
|
||||
if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic)) {
|
||||
if (!DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
dmlc::ByteSwap(&magic, sizeof(magic), 1);
|
||||
}
|
||||
if (magic == data::SimpleDMatrix::kMagic) {
|
||||
DMatrix* dmat = new data::SimpleDMatrix(&is);
|
||||
if (!silent) {
|
||||
LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with "
|
||||
<< dmat->Info().num_nonzero_ << " entries loaded from " << uri;
|
||||
}
|
||||
return dmat;
|
||||
}
|
||||
return dmat;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -192,8 +192,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
|
||||
|
||||
SimpleDMatrix::SimpleDMatrix(dmlc::Stream* in_stream) {
|
||||
int tmagic;
|
||||
CHECK(in_stream->Read(&tmagic, sizeof(tmagic)) == sizeof(tmagic))
|
||||
<< "invalid input file format";
|
||||
CHECK(in_stream->Read(&tmagic)) << "invalid input file format";
|
||||
CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch";
|
||||
info_.LoadBinary(in_stream);
|
||||
in_stream->Read(&sparse_page_.offset.HostVector());
|
||||
@@ -203,7 +202,7 @@ SimpleDMatrix::SimpleDMatrix(dmlc::Stream* in_stream) {
|
||||
void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
|
||||
std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname.c_str(), "w"));
|
||||
int tmagic = kMagic;
|
||||
fo->Write(&tmagic, sizeof(tmagic));
|
||||
fo->Write(tmagic);
|
||||
info_.SaveBinary(fo.get());
|
||||
fo->Write(sparse_page_.offset.HostVector());
|
||||
fo->Write(sparse_page_.data.HostVector());
|
||||
|
||||
@@ -144,7 +144,7 @@ class ExternalMemoryPrefetcher : dmlc::DataIter<PageT> {
|
||||
std::unique_ptr<dmlc::Stream> finfo(
|
||||
dmlc::Stream::Create(info.name_info.c_str(), "r"));
|
||||
int tmagic;
|
||||
CHECK_EQ(finfo->Read(&tmagic, sizeof(tmagic)), sizeof(tmagic));
|
||||
CHECK(finfo->Read(&tmagic));
|
||||
CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch";
|
||||
}
|
||||
files_.resize(info.name_shards.size());
|
||||
@@ -359,7 +359,7 @@ class SparsePageSource {
|
||||
std::unique_ptr<dmlc::Stream> fo(
|
||||
dmlc::Stream::Create(cache_info_.name_info.c_str(), "w"));
|
||||
int tmagic = kMagic;
|
||||
fo->Write(&tmagic, sizeof(tmagic));
|
||||
fo->Write(tmagic);
|
||||
// Either every row has query ID or none at all
|
||||
CHECK(qids.empty() || qids.size() == info.num_row_);
|
||||
info.SaveBinary(fo.get());
|
||||
|
||||
@@ -12,18 +12,35 @@ namespace gbm {
|
||||
|
||||
void GBTreeModel::Save(dmlc::Stream* fo) const {
|
||||
CHECK_EQ(param.num_trees, static_cast<int32_t>(trees.size()));
|
||||
fo->Write(&param, sizeof(param));
|
||||
|
||||
if (DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
fo->Write(&param, sizeof(param));
|
||||
} else {
|
||||
auto x = param.ByteSwap();
|
||||
fo->Write(&x, sizeof(x));
|
||||
}
|
||||
for (const auto & tree : trees) {
|
||||
tree->Save(fo);
|
||||
}
|
||||
if (tree_info.size() != 0) {
|
||||
fo->Write(dmlc::BeginPtr(tree_info), sizeof(int32_t) * tree_info.size());
|
||||
if (DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
fo->Write(dmlc::BeginPtr(tree_info), sizeof(int32_t) * tree_info.size());
|
||||
} else {
|
||||
for (const auto& e : tree_info) {
|
||||
auto x = e;
|
||||
dmlc::ByteSwap(&x, sizeof(x), 1);
|
||||
fo->Write(&x, sizeof(x));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GBTreeModel::Load(dmlc::Stream* fi) {
|
||||
CHECK_EQ(fi->Read(&param, sizeof(param)), sizeof(param))
|
||||
<< "GBTree: invalid model file";
|
||||
if (!DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
param = param.ByteSwap();
|
||||
}
|
||||
trees.clear();
|
||||
trees_to_update.clear();
|
||||
for (int32_t i = 0; i < param.num_trees; ++i) {
|
||||
@@ -33,9 +50,16 @@ void GBTreeModel::Load(dmlc::Stream* fi) {
|
||||
}
|
||||
tree_info.resize(param.num_trees);
|
||||
if (param.num_trees != 0) {
|
||||
CHECK_EQ(
|
||||
fi->Read(dmlc::BeginPtr(tree_info), sizeof(int32_t) * param.num_trees),
|
||||
sizeof(int32_t) * param.num_trees);
|
||||
if (DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
CHECK_EQ(
|
||||
fi->Read(dmlc::BeginPtr(tree_info), sizeof(int32_t) * param.num_trees),
|
||||
sizeof(int32_t) * param.num_trees);
|
||||
} else {
|
||||
for (auto& info : tree_info) {
|
||||
CHECK_EQ(fi->Read(&info, sizeof(int32_t)), sizeof(int32_t));
|
||||
dmlc::ByteSwap(&info, sizeof(info), 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -61,6 +61,21 @@ struct GBTreeModelParam : public dmlc::Parameter<GBTreeModelParam> {
|
||||
.set_default(0)
|
||||
.describe("Reserved option for vector tree.");
|
||||
}
|
||||
|
||||
// Swap byte order for all fields. Useful for transporting models between machines with different
|
||||
// endianness (big endian vs little endian)
|
||||
// Returns a byte-swapped copy; the object this is called on is left unmodified (const method).
// Each dmlc::ByteSwap call below is passed (pointer, size of one element, number of elements).
inline GBTreeModelParam ByteSwap() const {
|
||||
// Work on a copy so the in-memory parameters keep their native byte order.
GBTreeModelParam x = *this;
|
||||
// Scalar fields: each is swapped as a single element of its own size.
dmlc::ByteSwap(&x.num_trees, sizeof(x.num_trees), 1);
|
||||
dmlc::ByteSwap(&x.deprecated_num_roots, sizeof(x.deprecated_num_roots), 1);
|
||||
dmlc::ByteSwap(&x.deprecated_num_feature, sizeof(x.deprecated_num_feature), 1);
|
||||
dmlc::ByteSwap(&x.pad_32bit, sizeof(x.pad_32bit), 1);
|
||||
dmlc::ByteSwap(&x.deprecated_num_pbuffer, sizeof(x.deprecated_num_pbuffer), 1);
|
||||
dmlc::ByteSwap(&x.deprecated_num_output_group, sizeof(x.deprecated_num_output_group), 1);
|
||||
dmlc::ByteSwap(&x.size_leaf_vector, sizeof(x.size_leaf_vector), 1);
|
||||
// reserved is an array: pass the element size and the element count so every entry is swapped
// individually instead of reversing the whole buffer.
dmlc::ByteSwap(x.reserved, sizeof(x.reserved[0]), sizeof(x.reserved) / sizeof(x.reserved[0]));
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
struct GBTreeModel : public Model {
|
||||
|
||||
@@ -128,6 +128,19 @@ struct LearnerModelParamLegacy : public dmlc::Parameter<LearnerModelParamLegacy>
|
||||
std::string str = get<String const>(j_param.at("base_score"));
|
||||
from_chars(str.c_str(), str.c_str() + str.size(), base_score);
|
||||
}
|
||||
// Returns a copy of this parameter struct with the byte order of every listed field reversed.
// The object this is called on is left unmodified (const method operating on a copy).
// Each dmlc::ByteSwap call below is passed (pointer, size of one element, number of elements).
inline LearnerModelParamLegacy ByteSwap() const {
|
||||
// Work on a copy so the in-memory parameters keep their native byte order.
LearnerModelParamLegacy x = *this;
|
||||
// Scalar fields: each is swapped as a single element of its own size.
dmlc::ByteSwap(&x.base_score, sizeof(x.base_score), 1);
|
||||
dmlc::ByteSwap(&x.num_feature, sizeof(x.num_feature), 1);
|
||||
dmlc::ByteSwap(&x.num_class, sizeof(x.num_class), 1);
|
||||
dmlc::ByteSwap(&x.contain_extra_attrs, sizeof(x.contain_extra_attrs), 1);
|
||||
dmlc::ByteSwap(&x.contain_eval_metrics, sizeof(x.contain_eval_metrics), 1);
|
||||
dmlc::ByteSwap(&x.major_version, sizeof(x.major_version), 1);
|
||||
dmlc::ByteSwap(&x.minor_version, sizeof(x.minor_version), 1);
|
||||
// reserved is an array: pass the element size and the element count so every entry is swapped
// individually instead of reversing the whole buffer.
dmlc::ByteSwap(x.reserved, sizeof(x.reserved[0]), sizeof(x.reserved) / sizeof(x.reserved[0]));
|
||||
return x;
|
||||
}
|
||||
|
||||
// declare parameters
|
||||
DMLC_DECLARE_PARAMETER(LearnerModelParamLegacy) {
|
||||
DMLC_DECLARE_FIELD(base_score)
|
||||
@@ -694,7 +707,9 @@ class LearnerIO : public LearnerConfiguration {
|
||||
// read parameter
|
||||
CHECK_EQ(fi->Read(&mparam_, sizeof(mparam_)), sizeof(mparam_))
|
||||
<< "BoostLearner: wrong model format";
|
||||
|
||||
if (!DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
mparam_ = mparam_.ByteSwap();
|
||||
}
|
||||
CHECK(fi->Read(&tparam_.objective)) << "BoostLearner: wrong model format";
|
||||
CHECK(fi->Read(&tparam_.booster)) << "BoostLearner: wrong model format";
|
||||
|
||||
@@ -828,7 +843,12 @@ class LearnerIO : public LearnerConfiguration {
|
||||
}
|
||||
std::string header {"binf"};
|
||||
fo->Write(header.data(), 4);
|
||||
fo->Write(&mparam, sizeof(LearnerModelParamLegacy));
|
||||
if (DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
fo->Write(&mparam, sizeof(LearnerModelParamLegacy));
|
||||
} else {
|
||||
LearnerModelParamLegacy x = mparam.ByteSwap();
|
||||
fo->Write(&x, sizeof(LearnerModelParamLegacy));
|
||||
}
|
||||
fo->Write(tparam_.objective);
|
||||
fo->Write(tparam_.booster);
|
||||
gbm_->Save(fo);
|
||||
@@ -867,7 +887,13 @@ class LearnerIO : public LearnerConfiguration {
|
||||
// concatenate the model and config at final output, it's a temporary solution for
|
||||
// continuing support for binary model format
|
||||
fo->Write(&serialisation_header_[0], serialisation_header_.size());
|
||||
fo->Write(&json_offset, sizeof(json_offset));
|
||||
if (DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
fo->Write(&json_offset, sizeof(json_offset));
|
||||
} else {
|
||||
auto x = json_offset;
|
||||
dmlc::ByteSwap(&x, sizeof(x), 1);
|
||||
fo->Write(&x, sizeof(json_offset));
|
||||
}
|
||||
fo->Write(&binary_buf[0], binary_buf.size());
|
||||
fo->Write(&config_str[0], config_str.size());
|
||||
}
|
||||
@@ -904,6 +930,9 @@ class LearnerIO : public LearnerConfiguration {
|
||||
)doc";
|
||||
int64_t sz {-1};
|
||||
CHECK_EQ(fp.Read(&sz, sizeof(sz)), sizeof(sz));
|
||||
if (!DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
dmlc::ByteSwap(&sz, sizeof(sz), 1);
|
||||
}
|
||||
CHECK_GT(sz, 0);
|
||||
size_t json_offset = static_cast<size_t>(sz);
|
||||
std::string buffer;
|
||||
|
||||
@@ -664,13 +664,26 @@ bst_node_t RegTree::GetNumSplitNodes() const {
|
||||
|
||||
void RegTree::Load(dmlc::Stream* fi) {
|
||||
CHECK_EQ(fi->Read(&param, sizeof(TreeParam)), sizeof(TreeParam));
|
||||
if (!DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
param = param.ByteSwap();
|
||||
}
|
||||
nodes_.resize(param.num_nodes);
|
||||
stats_.resize(param.num_nodes);
|
||||
CHECK_NE(param.num_nodes, 0);
|
||||
CHECK_EQ(fi->Read(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size()),
|
||||
sizeof(Node) * nodes_.size());
|
||||
if (!DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
for (Node& node : nodes_) {
|
||||
node = node.ByteSwap();
|
||||
}
|
||||
}
|
||||
CHECK_EQ(fi->Read(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * stats_.size()),
|
||||
sizeof(RTreeNodeStat) * stats_.size());
|
||||
if (!DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
for (RTreeNodeStat& stat : stats_) {
|
||||
stat = stat.ByteSwap();
|
||||
}
|
||||
}
|
||||
// chg deleted nodes
|
||||
deleted_nodes_.resize(0);
|
||||
for (int i = 1; i < param.num_nodes; ++i) {
|
||||
@@ -683,11 +696,32 @@ void RegTree::Load(dmlc::Stream* fi) {
|
||||
void RegTree::Save(dmlc::Stream* fo) const {
|
||||
CHECK_EQ(param.num_nodes, static_cast<int>(nodes_.size()));
|
||||
CHECK_EQ(param.num_nodes, static_cast<int>(stats_.size()));
|
||||
fo->Write(&param, sizeof(TreeParam));
|
||||
CHECK_EQ(param.deprecated_num_roots, 1);
|
||||
CHECK_NE(param.num_nodes, 0);
|
||||
fo->Write(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size());
|
||||
fo->Write(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * nodes_.size());
|
||||
|
||||
if (DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
fo->Write(&param, sizeof(TreeParam));
|
||||
} else {
|
||||
TreeParam x = param.ByteSwap();
|
||||
fo->Write(&x, sizeof(x));
|
||||
}
|
||||
|
||||
if (DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
fo->Write(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size());
|
||||
} else {
|
||||
for (const Node& node : nodes_) {
|
||||
Node x = node.ByteSwap();
|
||||
fo->Write(&x, sizeof(x));
|
||||
}
|
||||
}
|
||||
if (DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
fo->Write(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * nodes_.size());
|
||||
} else {
|
||||
for (const RTreeNodeStat& stat : stats_) {
|
||||
RTreeNodeStat x = stat.ByteSwap();
|
||||
fo->Write(&x, sizeof(x));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void RegTree::LoadModel(Json const& in) {
|
||||
|
||||
Reference in New Issue
Block a user