Drop support for loading remote files. (#9504)

This commit is contained in:
Jiaming Yuan 2023-08-21 23:34:05 +08:00 committed by GitHub
parent d779a11af9
commit 044fea1281
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 43 additions and 112 deletions

View File

@@ -72,10 +72,6 @@ option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
 option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)
 set(GPU_COMPUTE_VER "" CACHE STRING
   "Semicolon separated list of compute versions to be built against, e.g. '35;61'")
-## Copied From dmlc
-option(USE_HDFS "Build with HDFS support" OFF)
-option(USE_AZURE "Build with AZURE support" OFF)
-option(USE_S3 "Build with S3 support" OFF)
 ## Sanitizers
 option(USE_SANITIZER "Use santizer flags" OFF)
 option(SANITIZER_PATH "Path to sanitizes.")

View File

@@ -390,39 +390,6 @@ Then we can load this model with single node Python XGBoost:
   bst = xgb.Booster({'nthread': 4})
   bst.load_model(nativeModelPath)
-.. note:: Using HDFS and S3 for exporting the models with nativeBooster.saveModel()
-  When interacting with other language bindings, XGBoost also supports saving-models-to and loading-models-from file systems other than the local one. You can use HDFS and S3 by prefixing the path with ``hdfs://`` and ``s3://`` respectively. However, for this capability, you must do **one** of the following:
-  1. Build XGBoost4J-Spark with the steps described in :ref:`here <install_jvm_packages>`, but turning `USE_HDFS <https://github.com/dmlc/xgboost/blob/e939192978a0c152ad7b49b744630e99d54cffa8/jvm-packages/create_jni.py#L18>`_ (or USE_S3, etc. in the same place) switch on. With this approach, you can reuse the above code example by replacing "nativeModelPath" with a HDFS path.
-  - However, if you build with USE_HDFS, etc. you have to ensure that the involved shared object file, e.g. libhdfs.so, is put in the LIBRARY_PATH of your cluster. To avoid the complicated cluster environment configuration, choose the other option.
-  2. Use bindings of HDFS, S3, etc. to pass model files around. Here are the steps (taking HDFS as an example):
-  - Create a new file with
-  .. code-block:: scala
-    val outputStream = fs.create("hdfs_path")
-  where "fs" is an instance of `org.apache.hadoop.fs.FileSystem <https://hadoop.apache.org/docs/stable/api/org/apache/hadoop/fs/FileSystem.html>`_ class in Hadoop.
-  - Pass the returned OutputStream in the first step to nativeBooster.saveModel():
-  .. code-block:: scala
-    xgbClassificationModel.nativeBooster.saveModel(outputStream)
-  - Download file in other languages from HDFS and load with the pre-built (without the requirement of libhdfs.so) version of XGBoost. (The function "download_from_hdfs" is a helper function to be implemented by the user)
-  .. code-block:: python
-    import xgboost as xgb
-    bst = xgb.Booster({'nthread': 4})
-    local_path = download_from_hdfs("hdfs_path")
-    bst.load_model(local_path)
 .. note:: Consistency issue between XGBoost4J-Spark and other bindings
 There is a consistency issue between XGBoost4J-Spark and other language bindings of XGBoost.

View File

@@ -505,8 +505,7 @@ class DataIter(ABC):  # pylint: disable=too-many-instance-attributes
     Parameters
     ----------
     cache_prefix :
-        Prefix to the cache files, only used in external memory. It can be either an
-        URI or a file path.
+        Prefix to the cache files, only used in external memory.
     release_data :
         Whether the iterator should release the data during reset. Set it to True if the
         data transformation (converting data to np.float32 type) is expensive.
@@ -2558,8 +2557,7 @@ class Booster:
         return ctypes2buffer(cptr, length.value)

     def load_model(self, fname: ModelIn) -> None:
-        """Load the model from a file or bytearray. Path to file can be local
-        or as an URI.
+        """Load the model from a file or a bytearray.

         The model is loaded from XGBoost format which is universal among the various
         XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as

View File

@@ -1220,12 +1220,12 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char* fname) {
     return str;
   };
   if (common::FileExtension(fname) == "json") {
-    auto str = read_file();
-    Json in{Json::Load(StringView{str})};
+    auto buffer = read_file();
+    Json in{Json::Load(StringView{buffer.data(), buffer.size()})};
     static_cast<Learner*>(handle)->LoadModel(in);
   } else if (common::FileExtension(fname) == "ubj") {
-    auto str = read_file();
-    Json in = Json::Load(StringView{str}, std::ios::binary);
+    auto buffer = read_file();
+    Json in = Json::Load(StringView{buffer.data(), buffer.size()}, std::ios::binary);
     static_cast<Learner *>(handle)->LoadModel(in);
   } else {
     std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname, "r"));

View File

@@ -345,10 +345,10 @@ class CLI {
   void LoadModel(std::string const& path, Learner* learner) const {
     if (common::FileExtension(path) == "json") {
-      auto str = common::LoadSequentialFile(path);
-      CHECK_GT(str.size(), 2);
-      CHECK_EQ(str[0], '{');
-      Json in{Json::Load({str.c_str(), str.size()})};
+      auto buffer = common::LoadSequentialFile(path);
+      CHECK_GT(buffer.size(), 2);
+      CHECK_EQ(buffer[0], '{');
+      Json in{Json::Load({buffer.data(), buffer.size()})};
       learner->LoadModel(in);
     } else {
       std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(path.c_str(), "r"));

View File

@@ -139,7 +139,7 @@ auto SystemErrorMsg() {
   }
 }  // anonymous namespace

-std::string LoadSequentialFile(std::string uri, bool stream) {
+std::vector<char> LoadSequentialFile(std::string uri) {
   auto OpenErr = [&uri]() {
     std::string msg;
     msg = "Opening " + uri + " failed: ";
@@ -148,44 +148,20 @@ std::string LoadSequentialFile(std::string uri, bool stream) {
   };

   auto parsed = dmlc::io::URI(uri.c_str());
+  CHECK((parsed.protocol == "file://" || parsed.protocol.length() == 0))
+      << "Only local file is supported.";
   // Read from file.
-  if ((parsed.protocol == "file://" || parsed.protocol.length() == 0) && !stream) {
-    std::string buffer;
-    // Open in binary mode so that correct file size can be computed with
-    // seekg(). This accommodates Windows platform:
-    // https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg
-    auto path = std::filesystem::weakly_canonical(std::filesystem::u8path(uri));
-    std::ifstream ifs(path, std::ios_base::binary | std::ios_base::in);
-    if (!ifs) {
-      // https://stackoverflow.com/a/17338934
-      OpenErr();
-    }
-    ifs.seekg(0, std::ios_base::end);
-    const size_t file_size = static_cast<size_t>(ifs.tellg());
-    ifs.seekg(0, std::ios_base::beg);
-    buffer.resize(file_size + 1);
-    ifs.read(&buffer[0], file_size);
-    buffer.back() = '\0';
-    return buffer;
-  }
-
-  // Read from remote.
-  std::unique_ptr<dmlc::Stream> fs{dmlc::Stream::Create(uri.c_str(), "r")};
-  std::string buffer;
-  size_t constexpr kInitialSize = 4096;
-  size_t size {kInitialSize}, total {0};
-  while (true) {
-    buffer.resize(total + size);
-    size_t read = fs->Read(&buffer[total], size);
-    total += read;
-    if (read < size) {
-      break;
-    }
-    size *= 2;
-  }
-  buffer.resize(total);
+  auto path = std::filesystem::weakly_canonical(std::filesystem::u8path(uri));
+  std::ifstream ifs(path, std::ios_base::binary | std::ios_base::in);
+  if (!ifs) {
+    // https://stackoverflow.com/a/17338934
+    OpenErr();
+  }
+  auto file_size = std::filesystem::file_size(path);
+  std::vector<char> buffer(file_size);
+  ifs.read(&buffer[0], file_size);
   return buffer;
 }

View File

@@ -84,16 +84,14 @@ class FixedSizeStream : public PeekableInStream {
   std::string buffer_;
 };

-/*!
- * \brief Helper function for loading consecutive file to avoid dmlc Stream when possible.
+/**
+ * @brief Helper function for loading consecutive file.
  *
- * \param uri    URI or file name to file.
- * \param stream Use dmlc Stream unconditionally if set to true.  Used for running test
- *               without remote filesystem.
+ * @param uri URI or file name to file.
  *
- * \return File content.
+ * @return File content.
  */
-std::string LoadSequentialFile(std::string uri, bool stream = false);
+std::vector<char> LoadSequentialFile(std::string uri);

 /**
  * \brief Get file extension from file name.

View File

@@ -216,8 +216,8 @@ TEST(CAPI, JsonModelIO) {
   std::string buffer;
   Json::Dump(Json::Load(l, std::ios::binary), &buffer);

-  ASSERT_EQ(model_str_0.size() - 1, buffer.size());
-  ASSERT_EQ(model_str_0.back(), '\0');
+  ASSERT_EQ(model_str_0.size(), buffer.size());
+  ASSERT_EQ(model_str_0.back(), '}');
   ASSERT_TRUE(std::equal(model_str_0.begin(), model_str_0.end() - 1, buffer.begin()));

   ASSERT_EQ(XGBoosterSaveModelToBuffer(handle, R"({})", &len, &data), -1);

View File

@@ -63,31 +63,27 @@ TEST(IO, LoadSequentialFile) {
   // Generate a JSON file.
   size_t constexpr kRows = 1000, kCols = 100;
-  std::shared_ptr<DMatrix> p_dmat{
-      RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)};
-  std::unique_ptr<Learner> learner { Learner::Create({p_dmat}) };
+  std::shared_ptr<DMatrix> p_dmat{RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)};
+  std::unique_ptr<Learner> learner{Learner::Create({p_dmat})};
   learner->SetParam("tree_method", "hist");
   learner->Configure();

   for (int32_t iter = 0; iter < 10; ++iter) {
     learner->UpdateOneIter(iter, p_dmat);
   }

-  Json out { Object() };
+  Json out{Object()};
   learner->SaveModel(&out);

-  std::string str;
+  std::vector<char> str;
   Json::Dump(out, &str);

   std::string tmpfile = tempdir.path + "/model.json";
   {
-    std::unique_ptr<dmlc::Stream> fo(
-        dmlc::Stream::Create(tmpfile.c_str(), "w"));
-    fo->Write(str.c_str(), str.size());
+    std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(tmpfile.c_str(), "w"));
+    fo->Write(str.data(), str.size());
   }
-  auto loaded = LoadSequentialFile(tmpfile, true);
+  auto loaded = LoadSequentialFile(tmpfile);
   ASSERT_EQ(loaded, str);
-
-  ASSERT_THROW(LoadSequentialFile("non-exist", true), dmlc::Error);
 }

 TEST(IO, Resource) {
TEST(IO, Resource) { TEST(IO, Resource) {

View File

@@ -418,7 +418,7 @@ TEST(Json, AssigningString) {
 TEST(Json, LoadDump) {
   std::string ori_buffer = GetModelStr();
-  Json origin {Json::Load(StringView{ori_buffer.c_str(), ori_buffer.size()})};
+  Json origin{Json::Load(StringView{ori_buffer.c_str(), ori_buffer.size()})};

   dmlc::TemporaryDirectory tempdir;
   auto const& path = tempdir.path + "test_model_dump";
@@ -430,9 +430,9 @@ TEST(Json, LoadDump) {
   ASSERT_TRUE(fout);
   fout << out << std::flush;

-  std::string new_buffer = common::LoadSequentialFile(path);
+  std::vector<char> new_buffer = common::LoadSequentialFile(path);

-  Json load_back {Json::Load(StringView(new_buffer.c_str(), new_buffer.size()))};
+  Json load_back{Json::Load(StringView(new_buffer.data(), new_buffer.size()))};
   ASSERT_EQ(load_back, origin);
 }

@@ -651,7 +651,7 @@ TEST(UBJson, Basic) {
   }
   auto data = common::LoadSequentialFile("test.ubj");
-  UBJReader reader{StringView{data}};
+  UBJReader reader{StringView{data.data(), data.size()}};
   json = reader.Load();
   return json;
 };

View File

@@ -250,7 +250,7 @@ auto TestSparsePageDMatrixDeterminism(int32_t threads) {
   auto cache_name =
       data::MakeId(filename, dynamic_cast<data::SparsePageDMatrix *>(sparse.get())) + ".row.page";
-  std::string cache = common::LoadSequentialFile(cache_name);
+  auto cache = common::LoadSequentialFile(cache_name);
   return cache;
 }

@@ -258,7 +258,7 @@ TEST(SparsePageDMatrix, Determinism) {
 #if defined(_MSC_VER)
   return;
 #endif  // defined(_MSC_VER)
-  std::vector<std::string> caches;
+  std::vector<std::vector<char>> caches;
   for (size_t i = 1; i < 18; i += 2) {
     caches.emplace_back(TestSparsePageDMatrixDeterminism(i));
   }

View File

@@ -184,7 +184,7 @@ TEST(Learner, JsonModelIO) {
     fout.close();

     auto loaded_str = common::LoadSequentialFile(tmpdir.path + "/model.json");
-    Json loaded = Json::Load(StringView{loaded_str.c_str(), loaded_str.size()});
+    Json loaded = Json::Load(StringView{loaded_str.data(), loaded_str.size()});
     learner->LoadModel(loaded);
     learner->Configure();