Drop support for loading remote files. (#9504)
This commit is contained in:
parent
d779a11af9
commit
044fea1281
@ -72,10 +72,6 @@ option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
|
|||||||
option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)
|
option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)
|
||||||
set(GPU_COMPUTE_VER "" CACHE STRING
|
set(GPU_COMPUTE_VER "" CACHE STRING
|
||||||
"Semicolon separated list of compute versions to be built against, e.g. '35;61'")
|
"Semicolon separated list of compute versions to be built against, e.g. '35;61'")
|
||||||
## Copied From dmlc
|
|
||||||
option(USE_HDFS "Build with HDFS support" OFF)
|
|
||||||
option(USE_AZURE "Build with AZURE support" OFF)
|
|
||||||
option(USE_S3 "Build with S3 support" OFF)
|
|
||||||
## Sanitizers
|
## Sanitizers
|
||||||
option(USE_SANITIZER "Use santizer flags" OFF)
|
option(USE_SANITIZER "Use santizer flags" OFF)
|
||||||
option(SANITIZER_PATH "Path to sanitizes.")
|
option(SANITIZER_PATH "Path to sanitizes.")
|
||||||
|
|||||||
@ -390,39 +390,6 @@ Then we can load this model with single node Python XGBoost:
|
|||||||
bst = xgb.Booster({'nthread': 4})
|
bst = xgb.Booster({'nthread': 4})
|
||||||
bst.load_model(nativeModelPath)
|
bst.load_model(nativeModelPath)
|
||||||
|
|
||||||
.. note:: Using HDFS and S3 for exporting the models with nativeBooster.saveModel()
|
|
||||||
|
|
||||||
When interacting with other language bindings, XGBoost also supports saving-models-to and loading-models-from file systems other than the local one. You can use HDFS and S3 by prefixing the path with ``hdfs://`` and ``s3://`` respectively. However, for this capability, you must do **one** of the following:
|
|
||||||
|
|
||||||
1. Build XGBoost4J-Spark with the steps described in :ref:`here <install_jvm_packages>`, but turning `USE_HDFS <https://github.com/dmlc/xgboost/blob/e939192978a0c152ad7b49b744630e99d54cffa8/jvm-packages/create_jni.py#L18>`_ (or USE_S3, etc. in the same place) switch on. With this approach, you can reuse the above code example by replacing "nativeModelPath" with a HDFS path.
|
|
||||||
|
|
||||||
- However, if you build with USE_HDFS, etc. you have to ensure that the involved shared object file, e.g. libhdfs.so, is put in the LIBRARY_PATH of your cluster. To avoid the complicated cluster environment configuration, choose the other option.
|
|
||||||
|
|
||||||
2. Use bindings of HDFS, S3, etc. to pass model files around. Here are the steps (taking HDFS as an example):
|
|
||||||
|
|
||||||
- Create a new file with
|
|
||||||
|
|
||||||
.. code-block:: scala
|
|
||||||
|
|
||||||
val outputStream = fs.create("hdfs_path")
|
|
||||||
|
|
||||||
where "fs" is an instance of `org.apache.hadoop.fs.FileSystem <https://hadoop.apache.org/docs/stable/api/org/apache/hadoop/fs/FileSystem.html>`_ class in Hadoop.
|
|
||||||
|
|
||||||
- Pass the returned OutputStream in the first step to nativeBooster.saveModel():
|
|
||||||
|
|
||||||
.. code-block:: scala
|
|
||||||
|
|
||||||
xgbClassificationModel.nativeBooster.saveModel(outputStream)
|
|
||||||
|
|
||||||
- Download file in other languages from HDFS and load with the pre-built (without the requirement of libhdfs.so) version of XGBoost. (The function "download_from_hdfs" is a helper function to be implemented by the user)
|
|
||||||
|
|
||||||
.. code-block:: python
|
|
||||||
|
|
||||||
import xgboost as xgb
|
|
||||||
bst = xgb.Booster({'nthread': 4})
|
|
||||||
local_path = download_from_hdfs("hdfs_path")
|
|
||||||
bst.load_model(local_path)
|
|
||||||
|
|
||||||
.. note:: Consistency issue between XGBoost4J-Spark and other bindings
|
.. note:: Consistency issue between XGBoost4J-Spark and other bindings
|
||||||
|
|
||||||
There is a consistency issue between XGBoost4J-Spark and other language bindings of XGBoost.
|
There is a consistency issue between XGBoost4J-Spark and other language bindings of XGBoost.
|
||||||
|
|||||||
@ -505,8 +505,7 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
|
|||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
cache_prefix :
|
cache_prefix :
|
||||||
Prefix to the cache files, only used in external memory. It can be either an
|
Prefix to the cache files, only used in external memory.
|
||||||
URI or a file path.
|
|
||||||
release_data :
|
release_data :
|
||||||
Whether the iterator should release the data during reset. Set it to True if the
|
Whether the iterator should release the data during reset. Set it to True if the
|
||||||
data transformation (converting data to np.float32 type) is expensive.
|
data transformation (converting data to np.float32 type) is expensive.
|
||||||
@ -2558,8 +2557,7 @@ class Booster:
|
|||||||
return ctypes2buffer(cptr, length.value)
|
return ctypes2buffer(cptr, length.value)
|
||||||
|
|
||||||
def load_model(self, fname: ModelIn) -> None:
|
def load_model(self, fname: ModelIn) -> None:
|
||||||
"""Load the model from a file or bytearray. Path to file can be local
|
"""Load the model from a file or a bytearray.
|
||||||
or as an URI.
|
|
||||||
|
|
||||||
The model is loaded from XGBoost format which is universal among the various
|
The model is loaded from XGBoost format which is universal among the various
|
||||||
XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
|
XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
|
||||||
|
|||||||
@ -1220,12 +1220,12 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char* fname) {
|
|||||||
return str;
|
return str;
|
||||||
};
|
};
|
||||||
if (common::FileExtension(fname) == "json") {
|
if (common::FileExtension(fname) == "json") {
|
||||||
auto str = read_file();
|
auto buffer = read_file();
|
||||||
Json in{Json::Load(StringView{str})};
|
Json in{Json::Load(StringView{buffer.data(), buffer.size()})};
|
||||||
static_cast<Learner*>(handle)->LoadModel(in);
|
static_cast<Learner*>(handle)->LoadModel(in);
|
||||||
} else if (common::FileExtension(fname) == "ubj") {
|
} else if (common::FileExtension(fname) == "ubj") {
|
||||||
auto str = read_file();
|
auto buffer = read_file();
|
||||||
Json in = Json::Load(StringView{str}, std::ios::binary);
|
Json in = Json::Load(StringView{buffer.data(), buffer.size()}, std::ios::binary);
|
||||||
static_cast<Learner *>(handle)->LoadModel(in);
|
static_cast<Learner *>(handle)->LoadModel(in);
|
||||||
} else {
|
} else {
|
||||||
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname, "r"));
|
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname, "r"));
|
||||||
|
|||||||
@ -345,10 +345,10 @@ class CLI {
|
|||||||
|
|
||||||
void LoadModel(std::string const& path, Learner* learner) const {
|
void LoadModel(std::string const& path, Learner* learner) const {
|
||||||
if (common::FileExtension(path) == "json") {
|
if (common::FileExtension(path) == "json") {
|
||||||
auto str = common::LoadSequentialFile(path);
|
auto buffer = common::LoadSequentialFile(path);
|
||||||
CHECK_GT(str.size(), 2);
|
CHECK_GT(buffer.size(), 2);
|
||||||
CHECK_EQ(str[0], '{');
|
CHECK_EQ(buffer[0], '{');
|
||||||
Json in{Json::Load({str.c_str(), str.size()})};
|
Json in{Json::Load({buffer.data(), buffer.size()})};
|
||||||
learner->LoadModel(in);
|
learner->LoadModel(in);
|
||||||
} else {
|
} else {
|
||||||
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(path.c_str(), "r"));
|
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(path.c_str(), "r"));
|
||||||
|
|||||||
@ -139,7 +139,7 @@ auto SystemErrorMsg() {
|
|||||||
}
|
}
|
||||||
} // anonymous namespace
|
} // anonymous namespace
|
||||||
|
|
||||||
std::string LoadSequentialFile(std::string uri, bool stream) {
|
std::vector<char> LoadSequentialFile(std::string uri) {
|
||||||
auto OpenErr = [&uri]() {
|
auto OpenErr = [&uri]() {
|
||||||
std::string msg;
|
std::string msg;
|
||||||
msg = "Opening " + uri + " failed: ";
|
msg = "Opening " + uri + " failed: ";
|
||||||
@ -148,44 +148,20 @@ std::string LoadSequentialFile(std::string uri, bool stream) {
|
|||||||
};
|
};
|
||||||
|
|
||||||
auto parsed = dmlc::io::URI(uri.c_str());
|
auto parsed = dmlc::io::URI(uri.c_str());
|
||||||
|
CHECK((parsed.protocol == "file://" || parsed.protocol.length() == 0))
|
||||||
|
<< "Only local file is supported.";
|
||||||
// Read from file.
|
// Read from file.
|
||||||
if ((parsed.protocol == "file://" || parsed.protocol.length() == 0) && !stream) {
|
auto path = std::filesystem::weakly_canonical(std::filesystem::u8path(uri));
|
||||||
std::string buffer;
|
std::ifstream ifs(path, std::ios_base::binary | std::ios_base::in);
|
||||||
// Open in binary mode so that correct file size can be computed with
|
if (!ifs) {
|
||||||
// seekg(). This accommodates Windows platform:
|
// https://stackoverflow.com/a/17338934
|
||||||
// https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg
|
OpenErr();
|
||||||
auto path = std::filesystem::weakly_canonical(std::filesystem::u8path(uri));
|
|
||||||
std::ifstream ifs(path, std::ios_base::binary | std::ios_base::in);
|
|
||||||
if (!ifs) {
|
|
||||||
// https://stackoverflow.com/a/17338934
|
|
||||||
OpenErr();
|
|
||||||
}
|
|
||||||
|
|
||||||
ifs.seekg(0, std::ios_base::end);
|
|
||||||
const size_t file_size = static_cast<size_t>(ifs.tellg());
|
|
||||||
ifs.seekg(0, std::ios_base::beg);
|
|
||||||
buffer.resize(file_size + 1);
|
|
||||||
ifs.read(&buffer[0], file_size);
|
|
||||||
buffer.back() = '\0';
|
|
||||||
|
|
||||||
return buffer;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read from remote.
|
auto file_size = std::filesystem::file_size(path);
|
||||||
std::unique_ptr<dmlc::Stream> fs{dmlc::Stream::Create(uri.c_str(), "r")};
|
std::vector<char> buffer(file_size);
|
||||||
std::string buffer;
|
ifs.read(&buffer[0], file_size);
|
||||||
size_t constexpr kInitialSize = 4096;
|
|
||||||
size_t size {kInitialSize}, total {0};
|
|
||||||
while (true) {
|
|
||||||
buffer.resize(total + size);
|
|
||||||
size_t read = fs->Read(&buffer[total], size);
|
|
||||||
total += read;
|
|
||||||
if (read < size) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
size *= 2;
|
|
||||||
}
|
|
||||||
buffer.resize(total);
|
|
||||||
return buffer;
|
return buffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -84,16 +84,14 @@ class FixedSizeStream : public PeekableInStream {
|
|||||||
std::string buffer_;
|
std::string buffer_;
|
||||||
};
|
};
|
||||||
|
|
||||||
/*!
|
/**
|
||||||
* \brief Helper function for loading consecutive file to avoid dmlc Stream when possible.
|
* @brief Helper function for loading consecutive file.
|
||||||
*
|
*
|
||||||
* \param uri URI or file name to file.
|
* @param uri URI or file name to file.
|
||||||
* \param stream Use dmlc Stream unconditionally if set to true. Used for running test
|
|
||||||
* without remote filesystem.
|
|
||||||
*
|
*
|
||||||
* \return File content.
|
* @return File content.
|
||||||
*/
|
*/
|
||||||
std::string LoadSequentialFile(std::string uri, bool stream = false);
|
std::vector<char> LoadSequentialFile(std::string uri);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \brief Get file extension from file name.
|
* \brief Get file extension from file name.
|
||||||
|
|||||||
@ -216,8 +216,8 @@ TEST(CAPI, JsonModelIO) {
|
|||||||
|
|
||||||
std::string buffer;
|
std::string buffer;
|
||||||
Json::Dump(Json::Load(l, std::ios::binary), &buffer);
|
Json::Dump(Json::Load(l, std::ios::binary), &buffer);
|
||||||
ASSERT_EQ(model_str_0.size() - 1, buffer.size());
|
ASSERT_EQ(model_str_0.size(), buffer.size());
|
||||||
ASSERT_EQ(model_str_0.back(), '\0');
|
ASSERT_EQ(model_str_0.back(), '}');
|
||||||
ASSERT_TRUE(std::equal(model_str_0.begin(), model_str_0.end() - 1, buffer.begin()));
|
ASSERT_TRUE(std::equal(model_str_0.begin(), model_str_0.end() - 1, buffer.begin()));
|
||||||
|
|
||||||
ASSERT_EQ(XGBoosterSaveModelToBuffer(handle, R"({})", &len, &data), -1);
|
ASSERT_EQ(XGBoosterSaveModelToBuffer(handle, R"({})", &len, &data), -1);
|
||||||
|
|||||||
@ -63,31 +63,27 @@ TEST(IO, LoadSequentialFile) {
|
|||||||
|
|
||||||
// Generate a JSON file.
|
// Generate a JSON file.
|
||||||
size_t constexpr kRows = 1000, kCols = 100;
|
size_t constexpr kRows = 1000, kCols = 100;
|
||||||
std::shared_ptr<DMatrix> p_dmat{
|
std::shared_ptr<DMatrix> p_dmat{RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)};
|
||||||
RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)};
|
std::unique_ptr<Learner> learner{Learner::Create({p_dmat})};
|
||||||
std::unique_ptr<Learner> learner { Learner::Create({p_dmat}) };
|
|
||||||
learner->SetParam("tree_method", "hist");
|
learner->SetParam("tree_method", "hist");
|
||||||
learner->Configure();
|
learner->Configure();
|
||||||
|
|
||||||
for (int32_t iter = 0; iter < 10; ++iter) {
|
for (int32_t iter = 0; iter < 10; ++iter) {
|
||||||
learner->UpdateOneIter(iter, p_dmat);
|
learner->UpdateOneIter(iter, p_dmat);
|
||||||
}
|
}
|
||||||
Json out { Object() };
|
Json out{Object()};
|
||||||
learner->SaveModel(&out);
|
learner->SaveModel(&out);
|
||||||
std::string str;
|
std::vector<char> str;
|
||||||
Json::Dump(out, &str);
|
Json::Dump(out, &str);
|
||||||
|
|
||||||
std::string tmpfile = tempdir.path + "/model.json";
|
std::string tmpfile = tempdir.path + "/model.json";
|
||||||
{
|
{
|
||||||
std::unique_ptr<dmlc::Stream> fo(
|
std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(tmpfile.c_str(), "w"));
|
||||||
dmlc::Stream::Create(tmpfile.c_str(), "w"));
|
fo->Write(str.data(), str.size());
|
||||||
fo->Write(str.c_str(), str.size());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
auto loaded = LoadSequentialFile(tmpfile, true);
|
auto loaded = LoadSequentialFile(tmpfile);
|
||||||
ASSERT_EQ(loaded, str);
|
ASSERT_EQ(loaded, str);
|
||||||
|
|
||||||
ASSERT_THROW(LoadSequentialFile("non-exist", true), dmlc::Error);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(IO, Resource) {
|
TEST(IO, Resource) {
|
||||||
|
|||||||
@ -418,7 +418,7 @@ TEST(Json, AssigningString) {
|
|||||||
|
|
||||||
TEST(Json, LoadDump) {
|
TEST(Json, LoadDump) {
|
||||||
std::string ori_buffer = GetModelStr();
|
std::string ori_buffer = GetModelStr();
|
||||||
Json origin {Json::Load(StringView{ori_buffer.c_str(), ori_buffer.size()})};
|
Json origin{Json::Load(StringView{ori_buffer.c_str(), ori_buffer.size()})};
|
||||||
|
|
||||||
dmlc::TemporaryDirectory tempdir;
|
dmlc::TemporaryDirectory tempdir;
|
||||||
auto const& path = tempdir.path + "test_model_dump";
|
auto const& path = tempdir.path + "test_model_dump";
|
||||||
@ -430,9 +430,9 @@ TEST(Json, LoadDump) {
|
|||||||
ASSERT_TRUE(fout);
|
ASSERT_TRUE(fout);
|
||||||
fout << out << std::flush;
|
fout << out << std::flush;
|
||||||
|
|
||||||
std::string new_buffer = common::LoadSequentialFile(path);
|
std::vector<char> new_buffer = common::LoadSequentialFile(path);
|
||||||
|
|
||||||
Json load_back {Json::Load(StringView(new_buffer.c_str(), new_buffer.size()))};
|
Json load_back{Json::Load(StringView(new_buffer.data(), new_buffer.size()))};
|
||||||
ASSERT_EQ(load_back, origin);
|
ASSERT_EQ(load_back, origin);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -651,7 +651,7 @@ TEST(UBJson, Basic) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto data = common::LoadSequentialFile("test.ubj");
|
auto data = common::LoadSequentialFile("test.ubj");
|
||||||
UBJReader reader{StringView{data}};
|
UBJReader reader{StringView{data.data(), data.size()}};
|
||||||
json = reader.Load();
|
json = reader.Load();
|
||||||
return json;
|
return json;
|
||||||
};
|
};
|
||||||
|
|||||||
@ -250,7 +250,7 @@ auto TestSparsePageDMatrixDeterminism(int32_t threads) {
|
|||||||
|
|
||||||
auto cache_name =
|
auto cache_name =
|
||||||
data::MakeId(filename, dynamic_cast<data::SparsePageDMatrix *>(sparse.get())) + ".row.page";
|
data::MakeId(filename, dynamic_cast<data::SparsePageDMatrix *>(sparse.get())) + ".row.page";
|
||||||
std::string cache = common::LoadSequentialFile(cache_name);
|
auto cache = common::LoadSequentialFile(cache_name);
|
||||||
return cache;
|
return cache;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -258,7 +258,7 @@ TEST(SparsePageDMatrix, Determinism) {
|
|||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
return;
|
return;
|
||||||
#endif // defined(_MSC_VER)
|
#endif // defined(_MSC_VER)
|
||||||
std::vector<std::string> caches;
|
std::vector<std::vector<char>> caches;
|
||||||
for (size_t i = 1; i < 18; i += 2) {
|
for (size_t i = 1; i < 18; i += 2) {
|
||||||
caches.emplace_back(TestSparsePageDMatrixDeterminism(i));
|
caches.emplace_back(TestSparsePageDMatrixDeterminism(i));
|
||||||
}
|
}
|
||||||
|
|||||||
@ -184,7 +184,7 @@ TEST(Learner, JsonModelIO) {
|
|||||||
fout.close();
|
fout.close();
|
||||||
|
|
||||||
auto loaded_str = common::LoadSequentialFile(tmpdir.path + "/model.json");
|
auto loaded_str = common::LoadSequentialFile(tmpdir.path + "/model.json");
|
||||||
Json loaded = Json::Load(StringView{loaded_str.c_str(), loaded_str.size()});
|
Json loaded = Json::Load(StringView{loaded_str.data(), loaded_str.size()});
|
||||||
|
|
||||||
learner->LoadModel(loaded);
|
learner->LoadModel(loaded);
|
||||||
learner->Configure();
|
learner->Configure();
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user