Use ptr from mmap for GHistIndexMatrix and ColumnMatrix. (#9315)

* Use ptr from mmap for `GHistIndexMatrix` and `ColumnMatrix`.

- Define a resource for holding various types of memory pointers.
- Define ref vector for holding resources.
- Swap the underlying resources for GHist and ColumnM.
- Add documentation for current status.
- s390x support is removed. It should still work if you can compile XGBoost; all the old workaround code did was get GCC to compile.
This commit is contained in:
Jiaming Yuan
2023-06-27 19:05:46 +08:00
committed by GitHub
parent 96c3071a8a
commit bc267dd729
29 changed files with 1448 additions and 509 deletions

View File

@@ -3,11 +3,12 @@
*/
#include <gtest/gtest.h>
#include <fstream>
#include <cstddef> // for size_t
#include <fstream> // for ofstream
#include "../../../src/common/io.h"
#include "../helpers.h"
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h"
namespace xgboost::common {
TEST(MemoryFixSizeBuffer, Seek) {
@@ -89,6 +90,57 @@ TEST(IO, LoadSequentialFile) {
ASSERT_THROW(LoadSequentialFile("non-exist", true), dmlc::Error);
}
// Exercises the resource handlers: a malloc-backed resource (construction and
// resizing) and an mmap-backed resource that maps the head of a file.
TEST(IO, Resource) {
  {
    // A freshly created malloc resource reports the requested size and its type.
    std::size_t const n_bytes = 128;
    std::shared_ptr<ResourceHandler> handle = std::make_shared<MallocResource>(n_bytes);
    ASSERT_EQ(handle->Size(), n_bytes);
    ASSERT_EQ(handle->Type(), ResourceHandler::kMalloc);
  }

  // Doubling the size must keep the existing bytes and leave the newly added
  // tail zeroed, for both values of the `Resize` template flag.
  auto check_resize = [](bool force_malloc) {
    std::size_t const n_bytes = 64;
    std::shared_ptr<ResourceHandler> handle = std::make_shared<MallocResource>(n_bytes);

    // Fill the buffer with 0, 1, ..., n_bytes - 1 through the type-erased pointer.
    auto *first = reinterpret_cast<std::uint8_t *>(handle->Data());
    std::iota(first, first + n_bytes, 0);

    auto concrete = std::dynamic_pointer_cast<MallocResource>(handle);
    ASSERT_TRUE(concrete);

    if (!force_malloc) {
      concrete->Resize<false>(n_bytes * 2);
    } else {
      concrete->Resize<true>(n_bytes * 2);
    }

    // Old content is preserved.
    std::size_t idx = 0;
    for (; idx < n_bytes; ++idx) {
      ASSERT_EQ(concrete->DataAs<std::uint8_t>()[idx], idx) << force_malloc;
    }
    // The extension is zero-filled.
    for (; idx < 2 * n_bytes; ++idx) {
      ASSERT_EQ(concrete->DataAs<std::uint8_t>()[idx], 0);
    }
  };
  check_resize(true);
  check_resize(false);

  {
    // mmap: write a binary double followed by some text, then map only the
    // first sizeof(double) bytes of the file and read the value back.
    dmlc::TemporaryDirectory tmpdir;
    auto path = tmpdir.path + "/testfile";

    double val{1.0};
    {
      std::ofstream fout(path, std::ios::binary);
      fout.write(reinterpret_cast<char const *>(&val), sizeof(val));
      fout << 1.0 << std::endl;
      fout.close();
    }

    auto mapped = std::make_shared<MmapResource>(path, 0, sizeof(double));
    ASSERT_EQ(mapped->Size(), sizeof(double));
    ASSERT_EQ(mapped->Type(), ResourceHandler::kMmap);
    ASSERT_EQ(mapped->DataAs<double>()[0], val);
  }
}
TEST(IO, PrivateMmapStream) {
dmlc::TemporaryDirectory tempdir;
auto path = tempdir.path + "/testfile";
@@ -124,17 +176,35 @@ TEST(IO, PrivateMmapStream) {
// Turn size info offset
std::partial_sum(offset.begin(), offset.end(), offset.begin());
// Test read
for (std::size_t i = 0; i < n_batches; ++i) {
std::size_t off = offset[i];
std::size_t n = offset.at(i + 1) - offset[i];
std::unique_ptr<dmlc::Stream> fi{std::make_unique<PrivateMmapConstStream>(path, off, n)};
auto fi{std::make_unique<PrivateMmapConstStream>(path, off, n)};
std::vector<T> data;
std::uint64_t size{0};
fi->Read(&size);
ASSERT_TRUE(fi->Read(&size));
ASSERT_EQ(fi->Tell(), sizeof(size));
data.resize(size);
fi->Read(data.data(), size * sizeof(T));
ASSERT_EQ(fi->Read(data.data(), size * sizeof(T)), size * sizeof(T));
ASSERT_EQ(data, batches[i]);
}
// Test consume
for (std::size_t i = 0; i < n_batches; ++i) {
std::size_t off = offset[i];
std::size_t n = offset.at(i + 1) - offset[i];
std::unique_ptr<AlignedResourceReadStream> fi{std::make_unique<PrivateMmapConstStream>(path, off, n)};
std::vector<T> data;
std::uint64_t size{0};
ASSERT_TRUE(fi->Consume(&size));
ASSERT_EQ(fi->Tell(), sizeof(size));
data.resize(size);
ASSERT_EQ(fi->Read(data.data(), size * sizeof(T)), sizeof(T) * size);
ASSERT_EQ(data, batches[i]);
}
}

View File

@@ -0,0 +1,108 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <cstddef> // for size_t
#include <memory> // for make_shared, make_unique
#include <numeric> // for iota
#include <vector> // for vector
#include "../../../src/common/ref_resource_view.h"
#include "dmlc/filesystem.h" // for TemporaryDirectory
namespace xgboost::common {
// Checks the basic contract of RefResourceView: each view holds a reference to
// the underlying resource, reports the element count it was constructed with,
// and supports fill-on-construction, iteration, and front()/back() access.
TEST(RefResourceView, Basic) {
  std::size_t n_bytes = 1024;
  auto mem = std::make_shared<MallocResource>(n_bytes);
  {
    RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem};

    RefResourceView kview{reinterpret_cast<float const*>(mem->Data()), mem->Size() / sizeof(float),
                          mem};
    // `mem` itself plus the two views.
    ASSERT_EQ(mem.use_count(), 3);
    // The views are over float elements; the expected count must be derived
    // from sizeof(float) rather than from the size of an unrelated int literal.
    ASSERT_EQ(view.size(), n_bytes / sizeof(float));
    ASSERT_EQ(kview.size(), n_bytes / sizeof(float));
  }
  {
    // The trailing argument is the initial value written to every element.
    RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem,
                         1.5f};
    for (auto v : view) {
      ASSERT_EQ(v, 1.5f);
    }
    std::iota(view.begin(), view.end(), 0.0f);
    ASSERT_EQ(view.front(), 0.0f);
    ASSERT_EQ(view.back(), static_cast<float>(view.size() - 1));

    // front()/back() return writable references.
    view.front() = 1.0f;
    view.back() = 2.0f;
    ASSERT_EQ(view.front(), 1.0f);
    ASSERT_EQ(view.back(), 2.0f);
  }
  // All views are destroyed; only `mem` still owns the resource.
  ASSERT_EQ(mem.use_count(), 1);
}
// Round-trips a view of std::size_t through an aligned file stream: raw write,
// WriteVec (size prefix + payload), then ReadVec from a private mmap stream.
TEST(RefResourceView, IO) {
  dmlc::TemporaryDirectory tmpdir;
  auto path = tmpdir.path + "/testfile";
  auto data = MakeFixedVecWithMalloc(123, std::size_t{1});
  // Bytes taken by a serialized vector: the size prefix followed by the payload.
  auto const n_serialized = data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type);

  {
    // A plain write reports exactly the payload size.
    auto writer = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
    ASSERT_EQ(writer->Write(data.data(), data.size_bytes()), data.size_bytes());
  }

  {
    // Reopening with "wb" and writing through WriteVec adds the size prefix.
    auto writer = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
    ASSERT_EQ(WriteVec(writer.get(), data), n_serialized);
  }

  {
    auto reader = std::make_unique<PrivateMmapConstStream>(path, 0, n_serialized);
    auto loaded = MakeFixedVecWithMalloc(123, std::size_t{1});
    ASSERT_TRUE(ReadVec(reader.get(), &loaded));
    for (auto elem : loaded) {
      ASSERT_EQ(elem, 1ul);
    }
  }
}
// Serialization of float vectors, where the written byte count includes
// alignment padding: checks the reported size, rejection of a mismatched
// element type on read, a successful round trip, and the std::vector overload.
TEST(RefResourceView, IOAligned) {
  dmlc::TemporaryDirectory tmpdir;
  auto path = tmpdir.path + "/testfile";
  auto data = MakeFixedVecWithMalloc(123, 1.0f);

  {
    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
    // + sizeof(float) for alignment
    ASSERT_EQ(WriteVec(fo.get(), data),
              data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type) + sizeof(float));
  }
  {
    auto fi = std::make_unique<PrivateMmapConstStream>(
        path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
    // wrong type, float vs. double
    auto read = MakeFixedVecWithMalloc(123, 2.0);
    ASSERT_FALSE(ReadVec(fi.get(), &read));
  }
  {
    auto fi = std::make_unique<PrivateMmapConstStream>(
        path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
    // Start from 2.0f so a successful read is observable as a change to 1.0f.
    auto read = MakeFixedVecWithMalloc(123, 2.0f);
    ASSERT_TRUE(ReadVec(fi.get(), &read));
    for (auto v : read) {
      // Elements are float; compare against a float literal, not an unsigned integer.
      ASSERT_EQ(v, 1.0f);
    }
  }
  {
    // Test std::vector
    std::vector<float> vec(123);
    std::iota(vec.begin(), vec.end(), 0.0f);
    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
    // + sizeof(float) for alignment
    ASSERT_EQ(WriteVec(fo.get(), vec), vec.size() * sizeof(float) +
                                           sizeof(RefResourceView<std::size_t>::size_type) +
                                           sizeof(float));
  }
}
} // namespace xgboost::common

View File

@@ -4,14 +4,14 @@
#include <gtest/gtest.h>
#include <xgboost/data.h>
#include "../../../src/common/io.h" // for PrivateMmapConstStream, AlignedResourceReadStream...
#include "../../../src/data/ellpack_page.cuh"
#include "../../../src/data/sparse_page_source.h"
#include "../../../src/tree/param.h" // TrainParam
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
TEST(EllpackPageRawFormat, IO) {
Context ctx{MakeCUDACtx(0)};
auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
@@ -22,15 +22,17 @@ TEST(EllpackPageRawFormat, IO) {
dmlc::TemporaryDirectory tmpdir;
std::string path = tmpdir.path + "/ellpack.page";
std::size_t n_bytes{0};
{
std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
auto fo = std::make_unique<common::AlignedFileWriteStream>(StringView{path}, "wb");
for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
format->Write(ellpack, fo.get());
n_bytes += format->Write(ellpack, fo.get());
}
}
EllpackPage page;
std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
std::unique_ptr<common::AlignedResourceReadStream> fi{
std::make_unique<common::PrivateMmapConstStream>(path.c_str(), 0, n_bytes)};
format->Read(&page, fi.get());
for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
@@ -44,5 +46,4 @@ TEST(EllpackPageRawFormat, IO) {
ASSERT_EQ(loaded->gidx_buffer.HostVector(), orig->gidx_buffer.HostVector());
}
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -26,8 +26,7 @@
#include "xgboost/context.h" // for Context
#include "xgboost/host_device_vector.h" // for HostDeviceVector
namespace xgboost {
namespace data {
namespace xgboost::data {
TEST(GradientIndex, ExternalMemory) {
Context ctx;
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(10000);
@@ -171,7 +170,7 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo
gpu_ctx.gpu_id = 0;
for (auto const &page : Xy->GetBatches<EllpackPage>(
&gpu_ctx, BatchParam{kBins, tree::TrainParam::DftSparseThreshold()})) {
from_ellpack.reset(new GHistIndexMatrix{&ctx, Xy->Info(), page, p});
from_ellpack = std::make_unique<GHistIndexMatrix>(&ctx, Xy->Info(), page, p);
}
for (auto const &from_sparse_page : Xy->GetBatches<GHistIndexMatrix>(&ctx, p)) {
@@ -199,13 +198,15 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo
std::string from_sparse_buf;
{
common::MemoryBufferStream fo{&from_sparse_buf};
columns_from_sparse.Write(&fo);
common::AlignedMemWriteStream fo{&from_sparse_buf};
auto n_bytes = columns_from_sparse.Write(&fo);
ASSERT_EQ(fo.Tell(), n_bytes);
}
std::string from_ellpack_buf;
{
common::MemoryBufferStream fo{&from_ellpack_buf};
columns_from_sparse.Write(&fo);
common::AlignedMemWriteStream fo{&from_ellpack_buf};
auto n_bytes = columns_from_sparse.Write(&fo);
ASSERT_EQ(fo.Tell(), n_bytes);
}
ASSERT_EQ(from_sparse_buf, from_ellpack_buf);
}
@@ -229,5 +230,4 @@ INSTANTIATE_TEST_SUITE_P(GHistIndexMatrix, GHistIndexMatrixTest,
std::make_tuple(.6f, .4))); // dense columns
#endif // defined(XGBOOST_USE_CUDA)
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -2,14 +2,18 @@
* Copyright 2021-2023, XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/context.h> // for Context
#include <cstddef> // for size_t
#include <memory> // for unique_ptr
#include "../../../src/common/column_matrix.h"
#include "../../../src/data/gradient_index.h"
#include "../../../src/common/io.h" // for MmapResource, AlignedResourceReadStream...
#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix
#include "../../../src/data/sparse_page_source.h"
#include "../helpers.h"
#include "../helpers.h" // for RandomDataGenerator
namespace xgboost {
namespace data {
namespace xgboost::data {
TEST(GHistIndexPageRawFormat, IO) {
Context ctx;
@@ -20,15 +24,18 @@ TEST(GHistIndexPageRawFormat, IO) {
std::string path = tmpdir.path + "/ghistindex.page";
auto batch = BatchParam{256, 0.5};
std::size_t bytes{0};
{
std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
auto fo = std::make_unique<common::AlignedFileWriteStream>(StringView{path}, "wb");
for (auto const &index : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
format->Write(index, fo.get());
bytes += format->Write(index, fo.get());
}
}
GHistIndexMatrix page;
std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
std::unique_ptr<common::AlignedResourceReadStream> fi{
std::make_unique<common::PrivateMmapConstStream>(path, 0, bytes)};
format->Read(&page, fi.get());
for (auto const &gidx : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
@@ -37,6 +44,8 @@ TEST(GHistIndexPageRawFormat, IO) {
ASSERT_EQ(loaded.cut.MinValues(), page.cut.MinValues());
ASSERT_EQ(loaded.cut.Values(), page.cut.Values());
ASSERT_EQ(loaded.base_rowid, page.base_rowid);
ASSERT_EQ(loaded.row_ptr.size(), page.row_ptr.size());
ASSERT_TRUE(std::equal(loaded.row_ptr.cbegin(), loaded.row_ptr.cend(), page.row_ptr.cbegin()));
ASSERT_EQ(loaded.IsDense(), page.IsDense());
ASSERT_TRUE(std::equal(loaded.index.begin(), loaded.index.end(), page.index.begin()));
ASSERT_TRUE(std::equal(loaded.index.Offset(), loaded.index.Offset() + loaded.index.OffsetSize(),
@@ -45,5 +54,4 @@ TEST(GHistIndexPageRawFormat, IO) {
ASSERT_EQ(loaded.Transpose().GetTypeSize(), loaded.Transpose().GetTypeSize());
}
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -2,20 +2,20 @@
* Copyright 2021-2023, XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/data.h> // for CSCPage, SortedCSCPage, SparsePage
#include <xgboost/data.h> // for CSCPage, SortedCSCPage, SparsePage
#include <memory> // for allocator, unique_ptr, __shared_ptr_ac...
#include <string> // for char_traits, operator+, basic_string
#include <memory> // for allocator, unique_ptr, __shared_ptr_ac...
#include <string> // for char_traits, operator+, basic_string
#include "../../../src/common/io.h" // for PrivateMmapConstStream, AlignedResourceReadStream...
#include "../../../src/data/sparse_page_writer.h" // for CreatePageFormat
#include "../helpers.h" // for RandomDataGenerator
#include "dmlc/filesystem.h" // for TemporaryDirectory
#include "dmlc/io.h" // for SeekStream, Stream
#include "dmlc/io.h" // for Stream
#include "gtest/gtest_pred_impl.h" // for Test, AssertionResult, ASSERT_EQ, TEST
#include "xgboost/context.h" // for Context
namespace xgboost {
namespace data {
namespace xgboost::data {
template <typename S> void TestSparsePageRawFormat() {
std::unique_ptr<SparsePageFormat<S>> format{CreatePageFormat<S>("raw")};
Context ctx;
@@ -25,17 +25,19 @@ template <typename S> void TestSparsePageRawFormat() {
dmlc::TemporaryDirectory tmpdir;
std::string path = tmpdir.path + "/sparse.page";
S orig;
std::size_t n_bytes{0};
{
// block code to flush the stream
std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
auto fo = std::make_unique<common::AlignedFileWriteStream>(StringView{path}, "wb");
for (auto const &page : m->GetBatches<S>(&ctx)) {
orig.Push(page);
format->Write(page, fo.get());
n_bytes = format->Write(page, fo.get());
}
}
S page;
std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
std::unique_ptr<common::AlignedResourceReadStream> fi{
std::make_unique<common::PrivateMmapConstStream>(path.c_str(), 0, n_bytes)};
format->Read(&page, fi.get());
for (size_t i = 0; i < orig.data.Size(); ++i) {
ASSERT_EQ(page.data.HostVector()[i].fvalue,
@@ -59,5 +61,4 @@ TEST(SparsePageRawFormat, CSCPage) {
TEST(SparsePageRawFormat, SortedCSCPage) {
TestSparsePageRawFormat<SortedCSCPage>();
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data