Use mmap for external memory. (#9282)
- Have basic infrastructure for mmap. - Release file write handle.
This commit is contained in:
176
src/common/io.cc
176
src/common/io.cc
@@ -1,24 +1,47 @@
|
||||
/*!
|
||||
* Copyright (c) by XGBoost Contributors 2019-2022
|
||||
/**
|
||||
* Copyright 2019-2023, by XGBoost Contributors
|
||||
*/
|
||||
#if defined(__unix__)
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#if !defined(NOMINMAX) && defined(_WIN32)
|
||||
#define NOMINMAX
|
||||
#endif // !defined(NOMINMAX)
|
||||
|
||||
#if !defined(xgboost_IS_WIN)
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#define xgboost_IS_WIN 1
|
||||
#endif // defined(_MSC_VER) || defined(__MINGW32__)
|
||||
|
||||
#endif // !defined(xgboost_IS_WIN)
|
||||
|
||||
#if defined(__unix__) || defined(__APPLE__)
|
||||
#include <fcntl.h> // for open, O_RDONLY
|
||||
#include <sys/mman.h> // for mmap, mmap64, munmap
|
||||
#include <unistd.h> // for close, getpagesize
|
||||
#elif defined(xgboost_IS_WIN)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#endif // defined(__unix__)
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <cstdio>
|
||||
|
||||
#include "xgboost/logging.h"
|
||||
#include <algorithm> // for copy, transform
|
||||
#include <cctype> // for tolower
|
||||
#include <cerrno> // for errno
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for int32_t, uint32_t
|
||||
#include <cstring> // for memcpy
|
||||
#include <fstream> // for ifstream
|
||||
#include <iterator> // for distance
|
||||
#include <limits> // for numeric_limits
|
||||
#include <memory> // for unique_ptr
|
||||
#include <string> // for string
|
||||
#include <system_error> // for error_code, system_category
|
||||
#include <utility> // for move
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "io.h"
|
||||
#include "xgboost/collective/socket.h" // for LastError
|
||||
#include "xgboost/logging.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
namespace xgboost::common {
|
||||
size_t PeekableInStream::Read(void* dptr, size_t size) {
|
||||
size_t nbuffer = buffer_.length() - buffer_ptr_;
|
||||
if (nbuffer == 0) return strm_->Read(dptr, size);
|
||||
@@ -94,11 +117,32 @@ void FixedSizeStream::Take(std::string* out) {
|
||||
*out = std::move(buffer_);
|
||||
}
|
||||
|
||||
namespace {
|
||||
// Return the granularity to which a file offset must be aligned before it is handed to
// the platform's memory-mapping call.
std::size_t GetMmapAlignment() {
#if defined(xgboost_IS_WIN)
  SYSTEM_INFO info;
  GetSystemInfo(&info);
  // The allocation granularity is used instead of the page size: during testing
  // `dwPageSize` was 4096 while `dwAllocationGranularity` was 65536.
  auto const granularity = info.dwAllocationGranularity;
  return granularity;
#else
  auto const page_size = getpagesize();
  return page_size;
#endif
}
|
||||
|
||||
auto SystemErrorMsg() {
|
||||
std::int32_t errsv = system::LastError();
|
||||
auto err = std::error_code{errsv, std::system_category()};
|
||||
return err.message();
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
std::string LoadSequentialFile(std::string uri, bool stream) {
|
||||
auto OpenErr = [&uri]() {
|
||||
std::string msg;
|
||||
msg = "Opening " + uri + " failed: ";
|
||||
msg += strerror(errno);
|
||||
msg += SystemErrorMsg();
|
||||
LOG(FATAL) << msg;
|
||||
};
|
||||
|
||||
@@ -155,5 +199,99 @@ std::string FileExtension(std::string fname, bool lower) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
// Bookkeeping for one mapped file. The member order matters: instances are created with
// positional aggregate initialization.
struct PrivateMmapConstStream::MMAPFile {
#if defined(xgboost_IS_WIN)
  // Handle of the opened file.
  HANDLE fd = INVALID_HANDLE_VALUE;
  // Handle of the file mapping object created on top of `fd`.
  HANDLE file_map = INVALID_HANDLE_VALUE;
#else
  // File descriptor of the opened file; 0 is treated as "not opened" by the destructor.
  std::int32_t fd = 0;
#endif
  // Start address of the mapped view (aligned, so it may precede the requested offset).
  char* base_ptr = nullptr;
  // Size in bytes of the mapped view starting at `base_ptr`.
  std::size_t base_size = 0;
  // Path of the mapped file, kept for error messages.
  std::string path;
};
|
||||
|
||||
char* PrivateMmapConstStream::Open(std::string path, std::size_t offset, std::size_t length) {
|
||||
if (length == 0) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#if defined(xgboost_IS_WIN)
|
||||
HANDLE fd = CreateFile(path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING,
|
||||
FILE_ATTRIBUTE_NORMAL | FILE_FLAG_OVERLAPPED, nullptr);
|
||||
CHECK_NE(fd, INVALID_HANDLE_VALUE) << "Failed to open:" << path << ". " << SystemErrorMsg();
|
||||
#else
|
||||
auto fd = open(path.c_str(), O_RDONLY);
|
||||
CHECK_GE(fd, 0) << "Failed to open:" << path << ". " << SystemErrorMsg();
|
||||
#endif
|
||||
|
||||
char* ptr{nullptr};
|
||||
// Round down for alignment.
|
||||
auto view_start = offset / GetMmapAlignment() * GetMmapAlignment();
|
||||
auto view_size = length + (offset - view_start);
|
||||
|
||||
#if defined(__linux__) || defined(__GLIBC__)
|
||||
int prot{PROT_READ};
|
||||
ptr = reinterpret_cast<char*>(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
|
||||
CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
|
||||
handle_.reset(new MMAPFile{fd, ptr, view_size, std::move(path)});
|
||||
#elif defined(xgboost_IS_WIN)
|
||||
auto file_size = GetFileSize(fd, nullptr);
|
||||
DWORD access = PAGE_READONLY;
|
||||
auto map_file = CreateFileMapping(fd, nullptr, access, 0, file_size, nullptr);
|
||||
access = FILE_MAP_READ;
|
||||
std::uint32_t loff = static_cast<std::uint32_t>(view_start);
|
||||
std::uint32_t hoff = view_start >> 32;
|
||||
CHECK(map_file) << "Failed to map: " << path << ". " << SystemErrorMsg();
|
||||
ptr = reinterpret_cast<char*>(MapViewOfFile(map_file, access, hoff, loff, view_size));
|
||||
CHECK_NE(ptr, nullptr) << "Failed to map: " << path << ". " << SystemErrorMsg();
|
||||
handle_.reset(new MMAPFile{fd, map_file, ptr, view_size, std::move(path)});
|
||||
#else
|
||||
CHECK_LE(offset, std::numeric_limits<off_t>::max())
|
||||
<< "File size has exceeded the limit on the current system.";
|
||||
int prot{PROT_READ};
|
||||
ptr = reinterpret_cast<char*>(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
|
||||
CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
|
||||
handle_.reset(new MMAPFile{fd, ptr, view_size, std::move(path)});
|
||||
#endif // defined(__linux__)
|
||||
|
||||
ptr += (offset - view_start);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Map `length` bytes of `path` starting at `offset` and expose them through the
// fixed-size buffer interface of the base class.
PrivateMmapConstStream::PrivateMmapConstStream(std::string path, std::size_t offset,
                                               std::size_t length)
    : MemoryFixSizeBuffer{}, handle_{nullptr} {
  char* const mapped = this->Open(std::move(path), offset, length);
  this->p_buffer_ = mapped;
  this->buffer_size_ = length;
}
|
||||
|
||||
/**
 * @brief Unmap the view and close the underlying file (and mapping object on Windows).
 *
 * A stream constructed with a zero length never opens a file, so `handle_` is empty and
 * there is nothing to release.
 */
PrivateMmapConstStream::~PrivateMmapConstStream() {
  if (!handle_) {
    return;
  }
#if defined(xgboost_IS_WIN)
  if (p_buffer_) {
    CHECK(UnmapViewOfFile(handle_->base_ptr)) << "Failed to call munmap: " << SystemErrorMsg();
  }
  if (handle_->fd != INVALID_HANDLE_VALUE) {
    CHECK(CloseHandle(handle_->fd)) << "Failed to close handle: " << SystemErrorMsg();
  }
  if (handle_->file_map != INVALID_HANDLE_VALUE) {
    CHECK(CloseHandle(handle_->file_map)) << "Failed to close mapping object: " << SystemErrorMsg();
  }
#else
  if (handle_->base_ptr) {
    // Unmap the whole aligned view, not just the part exposed through the buffer.
    CHECK_NE(munmap(handle_->base_ptr, handle_->base_size), -1)
        << "Failed to call munmap: " << handle_->path << ". " << SystemErrorMsg();
  }
  if (handle_->fd != 0) {
    CHECK_NE(close(handle_->fd), -1)
        << "Failed to close: " << handle_->path << ". " << SystemErrorMsg();
  }
#endif
}
|
||||
} // namespace xgboost::common
|
||||
|
||||
#if defined(xgboost_IS_WIN)
|
||||
#undef xgboost_IS_WIN
|
||||
#endif // defined(xgboost_IS_WIN)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright by XGBoost Contributors 2014-2022
|
||||
/**
|
||||
* Copyright 2014-2023, XGBoost Contributors
|
||||
* \file io.h
|
||||
* \brief general stream interface for serialization, I/O
|
||||
* \author Tianqi Chen
|
||||
@@ -10,9 +10,11 @@
|
||||
|
||||
#include <dmlc/io.h>
|
||||
#include <rabit/rabit.h>
|
||||
#include <string>
|
||||
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <memory> // for unique_ptr
|
||||
#include <string> // for string
|
||||
|
||||
#include "common.h"
|
||||
|
||||
@@ -127,6 +129,31 @@ inline std::string ReadAll(std::string const &path) {
|
||||
return content;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Private mmap file as a read-only stream.
|
||||
*
|
||||
* It can calculate alignment automatically based on system page size (or allocation
|
||||
* granularity on Windows).
|
||||
*/
|
||||
class PrivateMmapConstStream : public MemoryFixSizeBuffer {
|
||||
struct MMAPFile;
|
||||
std::unique_ptr<MMAPFile> handle_;
|
||||
|
||||
char* Open(std::string path, std::size_t offset, std::size_t length);
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a private mmap stream.
|
||||
*
|
||||
* @param path File path.
|
||||
* @param offset See the `offset` parameter of `mmap` for details.
|
||||
* @param length See the `length` parameter of `mmap` for details.
|
||||
*/
|
||||
explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length);
|
||||
void Write(void const*, std::size_t) override { LOG(FATAL) << "Read-only stream."; }
|
||||
|
||||
~PrivateMmapConstStream() override;
|
||||
};
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_COMMON_IO_H_
|
||||
|
||||
Reference in New Issue
Block a user