Use mmap for external memory. (#9282)

- Have basic infrastructure for mmap.
- Release file write handle.
This commit is contained in:
Jiaming Yuan
2023-06-19 18:52:55 +08:00
committed by GitHub
parent d8beb517ed
commit ee6809e642
16 changed files with 599 additions and 275 deletions

View File

@@ -1,24 +1,47 @@
/*!
* Copyright (c) by XGBoost Contributors 2019-2022
/**
* Copyright 2019-2023, by XGBoost Contributors
*/
#if defined(__unix__)
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#if !defined(NOMINMAX) && defined(_WIN32)
#define NOMINMAX
#endif // !defined(NOMINMAX)
#if !defined(xgboost_IS_WIN)
#if defined(_MSC_VER) || defined(__MINGW32__)
#define xgboost_IS_WIN 1
#endif // defined(_MSC_VER) || defined(__MINGW32__)
#endif // !defined(xgboost_IS_WIN)
#if defined(__unix__) || defined(__APPLE__)
#include <fcntl.h> // for open, O_RDONLY
#include <sys/mman.h> // for mmap, mmap64, munmap
#include <unistd.h> // for close, getpagesize
#elif defined(xgboost_IS_WIN)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif // defined(__unix__)
#include <algorithm>
#include <fstream>
#include <string>
#include <memory>
#include <utility>
#include <cstdio>
#include "xgboost/logging.h"
#include <algorithm> // for copy, transform
#include <cctype> // for tolower
#include <cerrno> // for errno
#include <cstddef> // for size_t
#include <cstdint> // for int32_t, uint32_t
#include <cstring> // for memcpy
#include <fstream> // for ifstream
#include <iterator> // for distance
#include <limits> // for numeric_limits
#include <memory> // for unique_ptr
#include <string> // for string
#include <system_error> // for error_code, system_category
#include <utility> // for move
#include <vector> // for vector
#include "io.h"
#include "xgboost/collective/socket.h" // for LastError
#include "xgboost/logging.h"
namespace xgboost {
namespace common {
namespace xgboost::common {
size_t PeekableInStream::Read(void* dptr, size_t size) {
size_t nbuffer = buffer_.length() - buffer_ptr_;
if (nbuffer == 0) return strm_->Read(dptr, size);
@@ -94,11 +117,32 @@ void FixedSizeStream::Take(std::string* out) {
*out = std::move(buffer_);
}
namespace {
/**
 * @brief Granularity that the file offset of a memory mapping must be a multiple of.
 *
 * On Windows this is the allocation granularity reported by `GetSystemInfo`; elsewhere
 * it is the value returned by `getpagesize`.
 */
std::size_t GetMmapAlignment() {
#if !defined(xgboost_IS_WIN)
  auto const page_size = getpagesize();
  return static_cast<std::size_t>(page_size);
#else
  SYSTEM_INFO info;
  GetSystemInfo(&info);
  // `dwPageSize` was observed to be 4096 during testing whereas `dwAllocationGranularity`
  // was 65536; the allocation granularity is the one used here.
  auto const granularity = info.dwAllocationGranularity;
  return granularity;
#endif
}
/**
 * @brief Human readable message for the error code reported by `system::LastError`.
 *
 * The code is interpreted in the `std::system_category` error category.
 */
auto SystemErrorMsg() {
  std::int32_t const code = system::LastError();
  return std::error_code{code, std::system_category()}.message();
}
} // anonymous namespace
std::string LoadSequentialFile(std::string uri, bool stream) {
auto OpenErr = [&uri]() {
std::string msg;
msg = "Opening " + uri + " failed: ";
msg += strerror(errno);
msg += SystemErrorMsg();
LOG(FATAL) << msg;
};
@@ -155,5 +199,99 @@ std::string FileExtension(std::string fname, bool lower) {
return "";
}
}
} // namespace common
} // namespace xgboost
// Bookkeeping for one mapped file view, filled in by `PrivateMmapConstStream::Open` and
// released by the destructor. `Open` initializes it positionally (aggregate
// initialization), so the order of the members below must not change.
struct PrivateMmapConstStream::MMAPFile {
#if defined(xgboost_IS_WIN)
// File handle returned by `CreateFile`.
HANDLE fd{INVALID_HANDLE_VALUE};
// File mapping object returned by `CreateFileMapping`.
HANDLE file_map{INVALID_HANDLE_VALUE};
#else
// File descriptor returned by `open`.
// NOTE(review): the default of 0 is itself a valid descriptor value; the destructor
// treats 0 as "nothing to close".
std::int32_t fd{0};
#endif
// Start of the mapped view exactly as returned by the mapping call, i.e. before the
// pointer is advanced to the caller's requested offset. This is what gets unmapped.
char* base_ptr{nullptr};
// Size in bytes of the mapped view: the requested length plus the bytes added by
// rounding the offset down for alignment.
std::size_t base_size{0};
// Path of the mapped file; used when reporting errors.
std::string path;
};
char* PrivateMmapConstStream::Open(std::string path, std::size_t offset, std::size_t length) {
if (length == 0) {
return nullptr;
}
#if defined(xgboost_IS_WIN)
HANDLE fd = CreateFile(path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL | FILE_FLAG_OVERLAPPED, nullptr);
CHECK_NE(fd, INVALID_HANDLE_VALUE) << "Failed to open:" << path << ". " << SystemErrorMsg();
#else
auto fd = open(path.c_str(), O_RDONLY);
CHECK_GE(fd, 0) << "Failed to open:" << path << ". " << SystemErrorMsg();
#endif
char* ptr{nullptr};
// Round down for alignment.
auto view_start = offset / GetMmapAlignment() * GetMmapAlignment();
auto view_size = length + (offset - view_start);
#if defined(__linux__) || defined(__GLIBC__)
int prot{PROT_READ};
ptr = reinterpret_cast<char*>(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
handle_.reset(new MMAPFile{fd, ptr, view_size, std::move(path)});
#elif defined(xgboost_IS_WIN)
auto file_size = GetFileSize(fd, nullptr);
DWORD access = PAGE_READONLY;
auto map_file = CreateFileMapping(fd, nullptr, access, 0, file_size, nullptr);
access = FILE_MAP_READ;
std::uint32_t loff = static_cast<std::uint32_t>(view_start);
std::uint32_t hoff = view_start >> 32;
CHECK(map_file) << "Failed to map: " << path << ". " << SystemErrorMsg();
ptr = reinterpret_cast<char*>(MapViewOfFile(map_file, access, hoff, loff, view_size));
CHECK_NE(ptr, nullptr) << "Failed to map: " << path << ". " << SystemErrorMsg();
handle_.reset(new MMAPFile{fd, map_file, ptr, view_size, std::move(path)});
#else
CHECK_LE(offset, std::numeric_limits<off_t>::max())
<< "File size has exceeded the limit on the current system.";
int prot{PROT_READ};
ptr = reinterpret_cast<char*>(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
handle_.reset(new MMAPFile{fd, ptr, view_size, std::move(path)});
#endif // defined(__linux__)
ptr += (offset - view_start);
return ptr;
}
/**
 * @brief Map `length` bytes of `path` starting at `offset` and expose them as the
 *        stream's buffer.
 *
 * The buffer pointer is whatever `Open` returns and the buffer size is set to `length`.
 */
PrivateMmapConstStream::PrivateMmapConstStream(std::string path, std::size_t offset,
                                               std::size_t length)
    : MemoryFixSizeBuffer{}, handle_{nullptr} {
  char* const mapped = this->Open(std::move(path), offset, length);
  p_buffer_ = mapped;
  buffer_size_ = length;
}
/**
 * @brief Unmap the view and release the file handle(s) acquired by `Open`.
 *
 * Nothing is released when no mapping was ever created. That is the case for a
 * zero-length stream, because `Open` returns early without allocating a handle.
 *
 * NOTE(review): a failed CHECK raises from inside a destructor. Unless this destructor
 * is declared `noexcept(false)`, that presumably ends in `std::terminate` -- confirm
 * this is the intended behaviour.
 */
PrivateMmapConstStream::~PrivateMmapConstStream() {
  if (!handle_) {
    return;
  }
#if defined(xgboost_IS_WIN)
  if (handle_->base_ptr) {
    CHECK(UnmapViewOfFile(handle_->base_ptr))
        << "Failed to call UnmapViewOfFile: " << handle_->path << ". " << SystemErrorMsg();
  }
  if (handle_->fd != INVALID_HANDLE_VALUE) {
    CHECK(CloseHandle(handle_->fd)) << "Failed to close handle: " << SystemErrorMsg();
  }
  // `CreateFileMapping` reports failure with NULL rather than INVALID_HANDLE_VALUE, so
  // guard against both before closing.
  if (handle_->file_map != INVALID_HANDLE_VALUE && handle_->file_map != nullptr) {
    CHECK(CloseHandle(handle_->file_map)) << "Failed to close mapping object: " << SystemErrorMsg();
  }
#else
  if (handle_->base_ptr) {
    CHECK_NE(munmap(handle_->base_ptr, handle_->base_size), -1)
        << "Failed to call munmap: " << handle_->path << ". " << SystemErrorMsg();
  }
  // 0 is a valid descriptor, only negative values mean "not open".
  if (handle_->fd >= 0) {
    CHECK_NE(close(handle_->fd), -1)
        << "Failed to close: " << handle_->path << ". " << SystemErrorMsg();
  }
#endif
}
} // namespace xgboost::common
#if defined(xgboost_IS_WIN)
#undef xgboost_IS_WIN
#endif // defined(xgboost_IS_WIN)

View File

@@ -1,5 +1,5 @@
/*!
* Copyright by XGBoost Contributors 2014-2022
/**
* Copyright 2014-2023, XGBoost Contributors
* \file io.h
* \brief general stream interface for serialization, I/O
* \author Tianqi Chen
@@ -10,9 +10,11 @@
#include <dmlc/io.h>
#include <rabit/rabit.h>
#include <string>
#include <cstring>
#include <fstream>
#include <memory> // for unique_ptr
#include <string> // for string
#include "common.h"
@@ -127,6 +129,31 @@ inline std::string ReadAll(std::string const &path) {
return content;
}
/**
 * @brief Read-only stream backed by a private memory mapping of a file.
 *
 * The start of the mapping is aligned automatically, using the system page size or, on
 * Windows, the allocation granularity.
 */
class PrivateMmapConstStream : public MemoryFixSizeBuffer {
 public:
  /**
   * @brief Construct a private mmap stream.
   *
   * @param path   File path.
   * @param offset See the `offset` parameter of `mmap` for details.
   * @param length See the `length` parameter of `mmap` for details.
   */
  explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length);
  ~PrivateMmapConstStream() override;

  /** @brief Writing is not supported; any call raises a fatal error. */
  void Write(void const*, std::size_t) override { LOG(FATAL) << "Read-only stream."; }

 private:
  // Platform specific handle of the mapping, defined in the implementation file.
  struct MMAPFile;

  // Map the requested region and return the pointer to the first requested byte.
  char* Open(std::string path, std::size_t offset, std::size_t length);

  std::unique_ptr<MMAPFile> handle_;
};
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_IO_H_