From dac9eb13bd288f09a0bb451211cc206802adab48 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 19 Jan 2022 02:27:51 +0800 Subject: [PATCH] Implement new `save_raw` in Python. (#7572) * Expose the new C API function to Python. * Remove old document and helper script. * Small optimization to the `save_raw` and Json ctors. --- doc/python/convert_090to100.py | 79 ------------------------------- doc/tutorials/saving_model.rst | 41 +++++++--------- include/xgboost/json.h | 32 ++++++------- include/xgboost/learner.h | 2 + python-package/xgboost/core.py | 42 ++++++++++++---- src/c_api/c_api.cc | 26 ++++++---- src/common/json.cc | 19 ++++---- tests/python/test_basic_models.py | 13 +++++ 8 files changed, 104 insertions(+), 150 deletions(-) delete mode 100644 doc/python/convert_090to100.py diff --git a/doc/python/convert_090to100.py b/doc/python/convert_090to100.py deleted file mode 100644 index 135489b09..000000000 --- a/doc/python/convert_090to100.py +++ /dev/null @@ -1,79 +0,0 @@ -'''This is a simple script that converts a pickled XGBoost -Scikit-Learn interface object from 0.90 to a native model. Pickle -format is not stable as it's a direct serialization of Python object. -We advice not to use it when stability is needed. - -''' -import pickle -import json -import os -import argparse -import numpy as np -import xgboost -import warnings - - -def save_label_encoder(le): - '''Save the label encoder in XGBClassifier''' - meta = dict() - for k, v in le.__dict__.items(): - if isinstance(v, np.ndarray): - meta[k] = v.tolist() - else: - meta[k] = v - return meta - - -def xgboost_skl_90to100(skl_model): - '''Extract the model and related metadata in SKL model.''' - model = {} - with open(skl_model, 'rb') as fd: - old = pickle.load(fd) - if not isinstance(old, xgboost.XGBModel): - raise TypeError( - 'The script only handes Scikit-Learn interface object') - - # Save Scikit-Learn specific Python attributes into a JSON document. - for k, v in old.__dict__.items(): - if k == '_le': - model[k] = save_label_encoder(v) - elif k == 'classes_': - model[k] = v.tolist() - elif k == '_Booster': - continue - else: - try: - json.dumps({k: v}) - model[k] = v - except TypeError: - warnings.warn(str(k) + ' is not saved in Scikit-Learn meta.') - booster = old.get_booster() - # Store the JSON serialization as an attribute - booster.set_attr(scikit_learn=json.dumps(model)) - - # Save it into a native model. - i = 0 - while True: - path = 'xgboost_native_model_from_' + skl_model + '-' + str(i) + '.bin' - if os.path.exists(path): - i += 1 - continue - booster.save_model(path) - break - - -if __name__ == '__main__': - assert xgboost.__version__ != '1.0.0', ('Please use the XGBoost version' - ' that generates this pickle.') - parser = argparse.ArgumentParser( - description=('A simple script to convert pickle generated by' - ' XGBoost 0.90 to XGBoost 1.0.0 model (not pickle).')) - parser.add_argument( - '--old-pickle', - type=str, - help='Path to old pickle file of Scikit-Learn interface object. 
'
-        'Will output a native model converted from this pickle file',
-        required=True)
-    args = parser.parse_args()
-
-    xgboost_skl_90to100(args.old_pickle)
diff --git a/doc/tutorials/saving_model.rst b/doc/tutorials/saving_model.rst
index 909155e4d..ab60cfc1a 100644
--- a/doc/tutorials/saving_model.rst
+++ b/doc/tutorials/saving_model.rst
@@ -2,16 +2,18 @@
 Introduction to Model IO
 ########################
 
-In XGBoost 1.0.0, we introduced experimental support of using `JSON
+In XGBoost 1.0.0, we introduced support for using `JSON
 `_ for saving/loading XGBoost models and related
 hyper-parameters for training, aiming to replace the old binary internal format with an
-open format that can be easily reused. The support for binary format will be continued in
-the future until JSON format is no-longer experimental and has satisfying performance.
-This tutorial aims to share some basic insights into the JSON serialisation method used in
-XGBoost. Without explicitly mentioned, the following sections assume you are using the
-JSON format, which can be enabled by providing the file name with ``.json`` as file
-extension when saving/loading model: ``booster.save_model('model.json')``. More details
-below.
+open format that can be easily reused. Later, in XGBoost 1.6.0, additional support for
+`Universal Binary JSON `__ was added as an optimization for more
+efficient model IO. The two formats share the same document structure with different
+representations, and we refer to them collectively as the JSON format. This tutorial aims
+to share some basic insights into the JSON serialisation method used in XGBoost. Unless
+explicitly mentioned, the following sections assume you are using one of the two output
+formats, which can be enabled by providing the file name with ``.json`` (or ``.ubj`` for
+binary JSON) as the file extension when saving/loading a model:
+``booster.save_model('model.json')``. More details below.
 
 Before we get started, XGBoost is a gradient boosting library with focus on tree model,
 which means inside XGBoost, there are 2 distinct parts:
@@ -53,7 +55,8 @@ Other language bindings are still working in progress.
    based serialisation methods.
 
 To enable JSON format support for model IO (saving only the trees and objective), provide
-a filename with ``.json`` as file extension:
+a filename with ``.json`` or ``.ubj`` as the file extension; the latter is the extension
+for `Universal Binary JSON `__.
 
 .. code-block:: python
   :caption: Python
@@ -65,7 +68,7 @@ a filename with ``.json`` as file extension:
   xgb.save(bst, 'model_file_name.json')
 
-While for memory snapshot, JSON is the default starting with xgboost 1.3.
+For memory snapshots, UBJSON is the default starting with xgboost 1.6.
 
 ***************************************************************
 A note on backward compatibility of models and memory snapshots
@@ -105,15 +108,10 @@ Loading pickled file from different version of XGBoost
 
 As noted, pickled model is neither portable nor stable, but in some cases the pickled
 models are valuable. One way to restore it in the future is to load it back with that
-specific version of Python and XGBoost, export the model by calling `save_model`. To help
-easing the mitigation, we created a simple script for converting pickled XGBoost 0.90
-Scikit-Learn interface object to XGBoost 1.0.0 native model. Please note that the script
-suits simple use cases, and it's advised not to use pickle when stability is needed. It's
-located in ``xgboost/doc/python`` with the name ``convert_090to100.py``. See comments in
-the script for more details.
+specific version of Python and XGBoost, then export the model by calling `save_model`.
 
-A similar procedure may be used to recover the model persisted in an old RDS file. In R, you are
-able to install an older version of XGBoost using the ``remotes`` package:
+A similar procedure may be used to recover the model persisted in an old RDS file. In R,
+you are able to install an older version of XGBoost using the ``remotes`` package:
 
 .. code-block:: r
 
@@ -244,10 +242,3 @@ leaf directly, instead it saves the weights as a separated array.
 
 .. include:: ../model.schema
    :code: json
-
-************
-Future Plans
-************
-
-Right now using the JSON format incurs longer serialisation time, we have been working on
-optimizing the JSON implementation to close the gap between binary format and JSON format.
diff --git a/include/xgboost/json.h b/include/xgboost/json.h
index 885a0d1cd..473b0f1d5 100644
--- a/include/xgboost/json.h
+++ b/include/xgboost/json.h
@@ -89,9 +89,10 @@ class JsonString : public Value {
   JsonString(std::string const& str) :  // NOLINT
       Value(ValueKind::kString), str_{str} {}
   JsonString(std::string&& str) noexcept :  // NOLINT
-      Value(ValueKind::kString), str_{std::move(str)} {}
-  JsonString(JsonString&& str) noexcept :  // NOLINT
-      Value(ValueKind::kString), str_{std::move(str.str_)} {}
+      Value(ValueKind::kString), str_{std::forward<std::string>(str)} {}
+  JsonString(JsonString&& str) noexcept : Value(ValueKind::kString) {  // NOLINT
+    std::swap(str.str_, this->str_);
+  }
 
   void Save(JsonWriter* writer) const override;
 
@@ -111,8 +112,8 @@ class JsonArray : public Value {
 
  public:
   JsonArray() : Value(ValueKind::kArray) {}
-  JsonArray(std::vector<Json>&& arr) noexcept :  // NOLINT
-      Value(ValueKind::kArray), vec_{std::move(arr)} {}
+  JsonArray(std::vector<Json>&& arr) noexcept  // NOLINT
+      : Value(ValueKind::kArray), vec_{std::forward<std::vector<Json>>(arr)} {}
   JsonArray(std::vector<Json> const& arr) :  // NOLINT
       Value(ValueKind::kArray), vec_{arr} {}
   JsonArray(JsonArray const& that) = delete;
@@ -381,10 +382,9 @@ class Json {
     return *this;
   }
   // array
-  explicit Json(JsonArray list) :
-      ptr_ {new JsonArray(std::move(list))} {}
-  Json& operator=(JsonArray array) {
-    ptr_.reset(new JsonArray(std::move(array)));
+  explicit Json(JsonArray&& list) : ptr_{new JsonArray(std::forward<JsonArray>(list))} {}
+  Json& operator=(JsonArray&& array) {
+    ptr_.reset(new JsonArray(std::forward<JsonArray>(array)));
     return *this;
   }
   // typed array
@@ -397,17 +397,15 @@ class Json {
     return *this;
  }
   // object
-  explicit Json(JsonObject object) :
-      ptr_{new JsonObject(std::move(object))} {}
-  Json& operator=(JsonObject object) {
-    ptr_.reset(new JsonObject(std::move(object)));
+  explicit Json(JsonObject&& object) : ptr_{new JsonObject(std::forward<JsonObject>(object))} {}
+  Json& operator=(JsonObject&& object) {
+    ptr_.reset(new JsonObject(std::forward<JsonObject>(object)));
     return *this;
   }
   // string
-  explicit Json(JsonString str) :
-      ptr_{new JsonString(std::move(str))} {}
-  Json& operator=(JsonString str) {
-    ptr_.reset(new JsonString(std::move(str)));
+  explicit Json(JsonString&& str) : ptr_{new JsonString(std::forward<JsonString>(str))} {}
+  Json& operator=(JsonString&& str) {
+    ptr_.reset(new JsonString(std::forward<JsonString>(str)));
     return *this;
   }
   // bool
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
index 4d0275c0c..6f8d55eec 100644
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@@ -45,6 +45,8 @@ enum class PredictionType : std::uint8_t {  // NOLINT
 struct XGBAPIThreadLocalEntry {
   /*! \brief result holder for returning string */
   std::string ret_str;
+  /*! \brief result holder for returning raw buffer */
+  std::vector<char> ret_char_vec;
   /*! \brief result holder for returning strings */
   std::vector<std::string> ret_vec_str;
   /*! \brief result holder for returning string pointers */
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 0d14d29dd..915e0a8a6 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -2135,9 +2135,15 @@ class Booster:
 
         The model is saved in an XGBoost internal format which is universal among the
         various XGBoost interfaces. Auxiliary attributes of the Python Booster object
-        (such as feature_names) will not be saved when using binary format. To save those
-        attributes, use JSON instead. See :doc:`Model IO </tutorials/saving_model>` for
-        more info.
+        (such as feature_names) will not be saved when using binary format. To save
+        those attributes, use JSON/UBJ instead. See :doc:`Model IO
+        </tutorials/saving_model>` for more info.
+
+        .. code-block:: python
+
+            model.save_model("model.json")
+            # or
+            model.save_model("model.ubj")
 
         Parameters
         ----------
@@ -2152,18 +2158,28 @@ class Booster:
         else:
             raise TypeError("fname must be a string or os PathLike")
 
-    def save_raw(self) -> bytearray:
+    def save_raw(self, raw_format: str = "deprecated") -> bytearray:
         """Save the model to a in memory buffer representation instead of file.
 
+        Parameters
+        ----------
+        raw_format :
+            Format of output buffer. Can be `json`, `ubj` or `deprecated`. Right now
+            the default is `deprecated` but it will be changed to `ubj` (universal
+            binary JSON) in the future.
+
        Returns
        -------
-        a in memory buffer representation of the model
+        An in-memory buffer representation of the model
        """
        length = c_bst_ulong()
        cptr = ctypes.POINTER(ctypes.c_char)()
-        _check_call(_LIB.XGBoosterGetModelRaw(self.handle,
-                                              ctypes.byref(length),
-                                              ctypes.byref(cptr)))
+        config = from_pystr_to_cstr(json.dumps({"format": raw_format}))
+        _check_call(
+            _LIB.XGBoosterSaveModelToBuffer(
+                self.handle, config, ctypes.byref(length), ctypes.byref(cptr)
+            )
+        )
        return ctypes2buffer(cptr, length.value)
 
    def load_model(self, fname: Union[str, bytearray, os.PathLike]) -> None:
@@ -2173,8 +2189,14 @@ class Booster:
 
         The model is loaded from XGBoost format which is universal among the various
         XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
         feature_names) will not be loaded when using binary format. To save those
-        attributes, use JSON instead. See :doc:`Model IO </tutorials/saving_model>` for
-        more info.
+        attributes, use JSON/UBJ instead. See :doc:`Model IO </tutorials/saving_model>`
+        for more info.
+
+        .. code-block:: python
+
+            model.load_model("model.json")
+            # or
+            model.load_model("model.ubj")
 
         Parameters
         ----------
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 32a986591..25f055f87 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -971,28 +971,34 @@ XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *json_co
   auto format = RequiredArg<String>(config, "format", __func__);
   auto *learner = static_cast<Learner *>(handle);
 
-  std::string &raw_str = learner->GetThreadLocal().ret_str;
-  raw_str.clear();
-
   learner->Configure();
+
+  auto save_json = [&](std::ios::openmode mode) {
+    std::vector<char> &raw_char_vec = learner->GetThreadLocal().ret_char_vec;
+    Json out{Object{}};
+    learner->SaveModel(&out);
+    Json::Dump(out, &raw_char_vec, mode);
+    *out_dptr = dmlc::BeginPtr(raw_char_vec);
+    *out_len = static_cast<xgboost::bst_ulong>(raw_char_vec.size());
+  };
+
   Json out{Object{}};
   if (format == "json") {
-    learner->SaveModel(&out);
-    Json::Dump(out, &raw_str);
+    save_json(std::ios::out);
   } else if (format == "ubj") {
-    learner->SaveModel(&out);
-    Json::Dump(out, &raw_str, std::ios::binary);
+    save_json(std::ios::binary);
   } else if (format == "deprecated") {
     WarnOldModel();
+    auto &raw_str = learner->GetThreadLocal().ret_str;
+    raw_str.clear();
     common::MemoryBufferStream fo(&raw_str);
     learner->SaveModel(&fo);
+    *out_dptr = dmlc::BeginPtr(raw_str);
+    *out_len = static_cast<xgboost::bst_ulong>(raw_str.size());
   } else {
     LOG(FATAL) << "Unknown format: `" << format << "`";
   }
 
-  *out_dptr = dmlc::BeginPtr(raw_str);
-  *out_len = static_cast<xgboost::bst_ulong>(raw_str.length());
-
   API_END();
 }
diff --git a/src/common/json.cc b/src/common/json.cc
index 83ef27182..d3160f048 100644
--- a/src/common/json.cc
+++ b/src/common/json.cc
@@ -195,11 +195,12 @@ Json& Value::operator[](int) {
 }
 
 // Json Object
-JsonObject::JsonObject(JsonObject && that) noexcept :
-    Value(ValueKind::kObject), object_{std::move(that.object_)} {}
+JsonObject::JsonObject(JsonObject&& that) noexcept : Value(ValueKind::kObject) {
+  std::swap(that.object_, this->object_);
+}
 
-JsonObject::JsonObject(std::map<std::string, Json> &&object) noexcept
-    : Value(ValueKind::kObject), object_{std::move(object)} {}
+JsonObject::JsonObject(std::map<std::string, Json>&& object) noexcept
+    : Value(ValueKind::kObject), object_{std::forward<std::map<std::string, Json>>(object)} {}
 
 bool JsonObject::operator==(Value const& rhs) const {
   if (!IsA<JsonObject>(&rhs)) {
@@ -220,8 +221,9 @@ bool JsonString::operator==(Value const& rhs) const {
 void JsonString::Save(JsonWriter* writer) const { writer->Visit(this); }
 
 // Json Array
-JsonArray::JsonArray(JsonArray && that) noexcept :
-    Value(ValueKind::kArray), vec_{std::move(that.vec_)} {}
+JsonArray::JsonArray(JsonArray&& that) noexcept : Value(ValueKind::kArray) {
+  std::swap(that.vec_, this->vec_);
+}
 
 bool JsonArray::operator==(Value const& rhs) const {
   if (!IsA<JsonArray>(&rhs)) {
@@ -696,6 +698,7 @@ void Json::Dump(Json json, std::string* str, std::ios::openmode mode) {
 }
 
 void Json::Dump(Json json, std::vector<char>* str, std::ios::openmode mode) {
+  str->clear();
   if (mode & std::ios::binary) {
     UBJWriter writer{str};
     writer.Save(json);
@@ -768,9 +771,7 @@ std::string UBJReader::DecodeStr() {
   str.resize(bsize);
   auto ptr = raw_str_.c_str() + cursor_.Pos();
   std::memcpy(&str[0], ptr, bsize);
-  for (int64_t i = 0; i < bsize; ++i) {
-    this->cursor_.Forward();
-  }
+  this->cursor_.Forward(bsize);
 
   return str;
 }
diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py
index d80d92722..cbb7b1fd9 100644
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -289,6 +289,19 @@ class TestModels:
os.remove(model_path) assert locale.getpreferredencoding(False) == loc + json_raw = bst.save_raw(raw_format="json") + from_jraw = xgb.Booster() + from_jraw.load_model(json_raw) + + ubj_raw = bst.save_raw(raw_format="ubj") + from_ubjraw = xgb.Booster() + from_ubjraw.load_model(ubj_raw) + + old_from_json = from_jraw.save_raw(raw_format="deprecated") + old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated") + + assert old_from_json == old_from_ubj + @pytest.mark.parametrize("ext", ["json", "ubj"]) def test_model_json_io(self, ext: str) -> None: parameters = {"booster": "gbtree", "tree_method": "hist"}
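For reference, a minimal sketch (not part of the patch) of how a language binding might drive the new ``XGBoosterSaveModelToBuffer`` entry point directly from C. It assumes only the public declarations from ``xgboost/c_api.h``; the model path ``model.json`` is a placeholder for a previously trained and saved model, and ``SAFE_XGB`` is a local helper introduced here for error handling. The ``"json"`` and ``"deprecated"`` format strings are requested the same way as the ``"ubj"`` one shown.

.. code-block:: c

  #include <stdio.h>
  #include <stdlib.h>
  #include <xgboost/c_api.h>

  /* Abort with the library's last error message if an XGBoost call fails. */
  #define SAFE_XGB(call)                                            \
    do {                                                            \
      if ((call) != 0) {                                            \
        fprintf(stderr, "XGBoost error: %s\n", XGBGetLastError());  \
        exit(EXIT_FAILURE);                                         \
      }                                                             \
    } while (0)

  int main(void) {
    BoosterHandle booster;
    SAFE_XGB(XGBoosterCreate(NULL, 0, &booster));
    /* "model.json" is a placeholder for a model trained and saved elsewhere. */
    SAFE_XGB(XGBoosterLoadModel(booster, "model.json"));

    /* Request the UBJSON representation of the model as an in-memory buffer. */
    char const *config = "{\"format\": \"ubj\"}";
    bst_ulong len = 0;
    char const *buf = NULL;
    SAFE_XGB(XGBoosterSaveModelToBuffer(booster, config, &len, &buf));
    printf("serialized model: %llu bytes\n", (unsigned long long)len);

    /* The buffer points into XGBoost's thread-local return slot, so copy it
       out before the next API call on this thread that returns a buffer. */
    SAFE_XGB(XGBoosterFree(booster));
    return 0;
  }

This mirrors what the updated Python ``save_raw`` wrapper in this patch does through ctypes: a JSON config selects the format, and the raw bytes come back through the thread-local result holder.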