Use dmlc stream when URI protocol is not local file. (#5857)
This commit is contained in:
@@ -9,6 +9,7 @@
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <cstdio>
|
||||
|
||||
@@ -93,35 +94,53 @@ void FixedSizeStream::Take(std::string* out) {
|
||||
*out = std::move(buffer_);
|
||||
}
|
||||
|
||||
std::string LoadSequentialFile(std::string fname) {
|
||||
auto OpenErr = [&fname]() {
|
||||
std::string msg;
|
||||
msg = "Opening " + fname + " failed: ";
|
||||
msg += strerror(errno);
|
||||
LOG(FATAL) << msg;
|
||||
};
|
||||
auto ReadErr = [&fname]() {
|
||||
std::string msg {"Error in reading file: "};
|
||||
msg += fname;
|
||||
msg += ": ";
|
||||
msg += strerror(errno);
|
||||
LOG(FATAL) << msg;
|
||||
};
|
||||
std::string LoadSequentialFile(std::string uri, bool stream) {
|
||||
auto OpenErr = [&uri]() {
|
||||
std::string msg;
|
||||
msg = "Opening " + uri + " failed: ";
|
||||
msg += strerror(errno);
|
||||
LOG(FATAL) << msg;
|
||||
};
|
||||
|
||||
auto parsed = dmlc::io::URI(uri.c_str());
|
||||
// Read from file.
|
||||
if ((parsed.protocol == "file://" || parsed.protocol.length() == 0) && !stream) {
|
||||
std::string buffer;
|
||||
// Open in binary mode so that correct file size can be computed with
|
||||
// seekg(). This accommodates Windows platform:
|
||||
// https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg
|
||||
std::ifstream ifs(uri, std::ios_base::binary | std::ios_base::in);
|
||||
if (!ifs) {
|
||||
// https://stackoverflow.com/a/17338934
|
||||
OpenErr();
|
||||
}
|
||||
|
||||
ifs.seekg(0, std::ios_base::end);
|
||||
const size_t file_size = static_cast<size_t>(ifs.tellg());
|
||||
ifs.seekg(0, std::ios_base::beg);
|
||||
buffer.resize(file_size + 1);
|
||||
ifs.read(&buffer[0], file_size);
|
||||
buffer.back() = '\0';
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
// Read from remote.
|
||||
std::unique_ptr<dmlc::Stream> fs{dmlc::Stream::Create(uri.c_str(), "r")};
|
||||
std::string buffer;
|
||||
// Open in binary mode so that correct file size can be computed with seekg().
|
||||
// This accommodates Windows platform:
|
||||
// https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg
|
||||
std::ifstream ifs(fname, std::ios_base::binary | std::ios_base::in);
|
||||
ifs.seekg(0, std::ios_base::end);
|
||||
const size_t file_size = static_cast<size_t>(ifs.tellg());
|
||||
ifs.seekg(0, std::ios_base::beg);
|
||||
buffer.resize(file_size + 1);
|
||||
ifs.read(&buffer[0], file_size);
|
||||
buffer.back() = '\0';
|
||||
|
||||
size_t constexpr kInitialSize = 4096;
|
||||
size_t size {kInitialSize}, total {0};
|
||||
while (true) {
|
||||
buffer.resize(total + size);
|
||||
size_t read = fs->Read(&buffer[total], size);
|
||||
total += read;
|
||||
if (read < size) {
|
||||
break;
|
||||
}
|
||||
size *= 2;
|
||||
}
|
||||
buffer.resize(total);
|
||||
return buffer;
|
||||
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -75,7 +75,16 @@ class FixedSizeStream : public PeekableInStream {
|
||||
std::string buffer_;
|
||||
};
|
||||
|
||||
/*!
 * \brief Helper function for loading consecutive file to avoid dmlc Stream when possible.
 *
 * \param uri    URI or file name to file.
 * \param stream Use dmlc Stream unconditionally if set to true.  Used for running test
 *               without remote filesystem.
 *
 * \return File content.  When read as a local file the content is followed by one
 *         trailing '\0' byte; when read through dmlc Stream it is returned as is.
 */
std::string LoadSequentialFile(std::string uri, bool stream = false);
|
||||
|
||||
inline std::string FileExtension(std::string const& fname) {
|
||||
auto splited = Split(fname, '.');
|
||||
|
||||
Reference in New Issue
Block a user