Ensure that LoadSequentialFile() actually reads the whole file (#5831)

This commit is contained in:
Philip Hyunsu Cho 2020-07-04 01:17:11 -07:00 committed by GitHub
parent 1a0801238e
commit efe3e48ae2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 17 additions and 35 deletions

View File

@ -9,12 +9,12 @@
#include <vector> #include <vector>
#include <memory> #include <memory>
#include <string> #include <string>
#include <cinttypes>
#include <utility> #include <utility>
#include <map> #include <map>
#include <limits> #include <limits>
#include <sstream> #include <sstream>
#include <locale> #include <locale>
#include <cinttypes>
namespace xgboost { namespace xgboost {
/* /*
@ -86,6 +86,8 @@ class JsonReader {
msg += "\", got: \""; msg += "\", got: \"";
if (got == -1) { if (got == -1) {
msg += "EOF\""; msg += "EOF\"";
} else if (got == 0) {
msg += "\\0\"";
} else { } else {
msg += std::to_string(got) + " \""; msg += std::to_string(got) + " \"";
} }

View File

@ -7,9 +7,10 @@
#include <unistd.h> #include <unistd.h>
#endif // defined(__unix__) #endif // defined(__unix__)
#include <algorithm> #include <algorithm>
#include <cstdio> #include <fstream>
#include <string> #include <string>
#include <utility> #include <utility>
#include <cstdio>
#include "xgboost/logging.h" #include "xgboost/logging.h"
#include "io.h" #include "io.h"
@ -108,39 +109,17 @@ std::string LoadSequentialFile(std::string fname) {
}; };
std::string buffer; std::string buffer;
#if defined(__unix__) // Open in binary mode so that correct file size can be computed with seekg().
struct stat fs; // This accommodates Windows platform:
if (stat(fname.c_str(), &fs) != 0) { // https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg
OpenErr(); std::ifstream ifs(fname, std::ios_base::binary | std::ios_base::in);
} ifs.seekg(0, std::ios_base::end);
const size_t file_size = static_cast<size_t>(ifs.tellg());
size_t f_size_bytes = fs.st_size; ifs.seekg(0, std::ios_base::beg);
buffer.resize(f_size_bytes + 1); buffer.resize(file_size + 1);
int32_t fd = open(fname.c_str(), O_RDONLY); ifs.read(&buffer[0], file_size);
#if defined(__linux__)
posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
#endif // defined(__linux__)
ssize_t bytes_read = read(fd, &buffer[0], f_size_bytes);
if (bytes_read < 0) {
close(fd);
ReadErr();
}
close(fd);
#else // defined(__unix__)
FILE *f = fopen(fname.c_str(), "r");
if (f == NULL) {
std::string msg;
OpenErr();
}
fseek(f, 0, SEEK_END);
auto fsize = ftell(f);
fseek(f, 0, SEEK_SET);
buffer.resize(fsize + 1);
fread(&buffer[0], 1, fsize, f);
fclose(f);
#endif // defined(__unix__)
buffer.back() = '\0'; buffer.back() = '\0';
return buffer; return buffer;
} }

View File

@ -75,7 +75,6 @@ class FixedSizeStream : public PeekableInStream {
std::string buffer_; std::string buffer_;
}; };
// Optimized for consecutive file loading in Unix-like systems.
std::string LoadSequentialFile(std::string fname); std::string LoadSequentialFile(std::string fname);
inline std::string FileExtension(std::string const& fname) { inline std::string FileExtension(std::string const& fname) {

View File

@ -427,6 +427,8 @@ void JsonReader::Error(std::string msg) const {
for (auto c : raw_portion) { for (auto c : raw_portion) {
if (c == '\n') { if (c == '\n') {
portion += "\\n"; portion += "\\n";
} else if (c == '\0') {
portion += "\\0";
} else { } else {
portion += c; portion += c;
} }