Use ptr from mmap for GHistIndexMatrix and ColumnMatrix. (#9315)

* Use ptr from mmap for `GHistIndexMatrix` and `ColumnMatrix`.

- Define a resource for holding various types of memory pointers.
- Define ref vector for holding resources.
- Swap the underlying resources for GHist and ColumnM.
- Add documentation for current status.
- s390x support is removed. It should work if you can compile XGBoost, all the old workaround code does is to get GCC to compile.
This commit is contained in:
Jiaming Yuan
2023-06-27 19:05:46 +08:00
committed by GitHub
parent 96c3071a8a
commit bc267dd729
29 changed files with 1448 additions and 509 deletions

View File

@@ -4,14 +4,14 @@
#include <gtest/gtest.h>
#include <xgboost/data.h>
#include "../../../src/common/io.h" // for PrivateMmapConstStream, AlignedResourceReadStream...
#include "../../../src/data/ellpack_page.cuh"
#include "../../../src/data/sparse_page_source.h"
#include "../../../src/tree/param.h" // TrainParam
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
TEST(EllpackPageRawFormat, IO) {
Context ctx{MakeCUDACtx(0)};
auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
@@ -22,15 +22,17 @@ TEST(EllpackPageRawFormat, IO) {
dmlc::TemporaryDirectory tmpdir;
std::string path = tmpdir.path + "/ellpack.page";
std::size_t n_bytes{0};
{
std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
auto fo = std::make_unique<common::AlignedFileWriteStream>(StringView{path}, "wb");
for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
format->Write(ellpack, fo.get());
n_bytes += format->Write(ellpack, fo.get());
}
}
EllpackPage page;
std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
std::unique_ptr<common::AlignedResourceReadStream> fi{
std::make_unique<common::PrivateMmapConstStream>(path.c_str(), 0, n_bytes)};
format->Read(&page, fi.get());
for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
@@ -44,5 +46,4 @@ TEST(EllpackPageRawFormat, IO) {
ASSERT_EQ(loaded->gidx_buffer.HostVector(), orig->gidx_buffer.HostVector());
}
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -26,8 +26,7 @@
#include "xgboost/context.h" // for Context
#include "xgboost/host_device_vector.h" // for HostDeviceVector
namespace xgboost {
namespace data {
namespace xgboost::data {
TEST(GradientIndex, ExternalMemory) {
Context ctx;
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(10000);
@@ -171,7 +170,7 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo
gpu_ctx.gpu_id = 0;
for (auto const &page : Xy->GetBatches<EllpackPage>(
&gpu_ctx, BatchParam{kBins, tree::TrainParam::DftSparseThreshold()})) {
from_ellpack.reset(new GHistIndexMatrix{&ctx, Xy->Info(), page, p});
from_ellpack = std::make_unique<GHistIndexMatrix>(&ctx, Xy->Info(), page, p);
}
for (auto const &from_sparse_page : Xy->GetBatches<GHistIndexMatrix>(&ctx, p)) {
@@ -199,13 +198,15 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo
std::string from_sparse_buf;
{
common::MemoryBufferStream fo{&from_sparse_buf};
columns_from_sparse.Write(&fo);
common::AlignedMemWriteStream fo{&from_sparse_buf};
auto n_bytes = columns_from_sparse.Write(&fo);
ASSERT_EQ(fo.Tell(), n_bytes);
}
std::string from_ellpack_buf;
{
common::MemoryBufferStream fo{&from_ellpack_buf};
columns_from_sparse.Write(&fo);
common::AlignedMemWriteStream fo{&from_ellpack_buf};
auto n_bytes = columns_from_sparse.Write(&fo);
ASSERT_EQ(fo.Tell(), n_bytes);
}
ASSERT_EQ(from_sparse_buf, from_ellpack_buf);
}
@@ -229,5 +230,4 @@ INSTANTIATE_TEST_SUITE_P(GHistIndexMatrix, GHistIndexMatrixTest,
std::make_tuple(.6f, .4))); // dense columns
#endif // defined(XGBOOST_USE_CUDA)
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -2,14 +2,18 @@
* Copyright 2021-2023, XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/context.h> // for Context
#include <cstddef> // for size_t
#include <memory> // for unique_ptr
#include "../../../src/common/column_matrix.h"
#include "../../../src/data/gradient_index.h"
#include "../../../src/common/io.h" // for MmapResource, AlignedResourceReadStream...
#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix
#include "../../../src/data/sparse_page_source.h"
#include "../helpers.h"
#include "../helpers.h" // for RandomDataGenerator
namespace xgboost {
namespace data {
namespace xgboost::data {
TEST(GHistIndexPageRawFormat, IO) {
Context ctx;
@@ -20,15 +24,18 @@ TEST(GHistIndexPageRawFormat, IO) {
std::string path = tmpdir.path + "/ghistindex.page";
auto batch = BatchParam{256, 0.5};
std::size_t bytes{0};
{
std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
auto fo = std::make_unique<common::AlignedFileWriteStream>(StringView{path}, "wb");
for (auto const &index : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
format->Write(index, fo.get());
bytes += format->Write(index, fo.get());
}
}
GHistIndexMatrix page;
std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
std::unique_ptr<common::AlignedResourceReadStream> fi{
std::make_unique<common::PrivateMmapConstStream>(path, 0, bytes)};
format->Read(&page, fi.get());
for (auto const &gidx : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
@@ -37,6 +44,8 @@ TEST(GHistIndexPageRawFormat, IO) {
ASSERT_EQ(loaded.cut.MinValues(), page.cut.MinValues());
ASSERT_EQ(loaded.cut.Values(), page.cut.Values());
ASSERT_EQ(loaded.base_rowid, page.base_rowid);
ASSERT_EQ(loaded.row_ptr.size(), page.row_ptr.size());
ASSERT_TRUE(std::equal(loaded.row_ptr.cbegin(), loaded.row_ptr.cend(), page.row_ptr.cbegin()));
ASSERT_EQ(loaded.IsDense(), page.IsDense());
ASSERT_TRUE(std::equal(loaded.index.begin(), loaded.index.end(), page.index.begin()));
ASSERT_TRUE(std::equal(loaded.index.Offset(), loaded.index.Offset() + loaded.index.OffsetSize(),
@@ -45,5 +54,4 @@ TEST(GHistIndexPageRawFormat, IO) {
ASSERT_EQ(loaded.Transpose().GetTypeSize(), loaded.Transpose().GetTypeSize());
}
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -2,20 +2,20 @@
* Copyright 2021-2023, XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/data.h> // for CSCPage, SortedCSCPage, SparsePage
#include <xgboost/data.h> // for CSCPage, SortedCSCPage, SparsePage
#include <memory> // for allocator, unique_ptr, __shared_ptr_ac...
#include <string> // for char_traits, operator+, basic_string
#include <memory> // for allocator, unique_ptr, __shared_ptr_ac...
#include <string> // for char_traits, operator+, basic_string
#include "../../../src/common/io.h" // for PrivateMmapConstStream, AlignedResourceReadStream...
#include "../../../src/data/sparse_page_writer.h" // for CreatePageFormat
#include "../helpers.h" // for RandomDataGenerator
#include "dmlc/filesystem.h" // for TemporaryDirectory
#include "dmlc/io.h" // for SeekStream, Stream
#include "dmlc/io.h" // for Stream
#include "gtest/gtest_pred_impl.h" // for Test, AssertionResult, ASSERT_EQ, TEST
#include "xgboost/context.h" // for Context
namespace xgboost {
namespace data {
namespace xgboost::data {
template <typename S> void TestSparsePageRawFormat() {
std::unique_ptr<SparsePageFormat<S>> format{CreatePageFormat<S>("raw")};
Context ctx;
@@ -25,17 +25,19 @@ template <typename S> void TestSparsePageRawFormat() {
dmlc::TemporaryDirectory tmpdir;
std::string path = tmpdir.path + "/sparse.page";
S orig;
std::size_t n_bytes{0};
{
// block code to flush the stream
std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
auto fo = std::make_unique<common::AlignedFileWriteStream>(StringView{path}, "wb");
for (auto const &page : m->GetBatches<S>(&ctx)) {
orig.Push(page);
format->Write(page, fo.get());
n_bytes = format->Write(page, fo.get());
}
}
S page;
std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
std::unique_ptr<common::AlignedResourceReadStream> fi{
std::make_unique<common::PrivateMmapConstStream>(path.c_str(), 0, n_bytes)};
format->Read(&page, fi.get());
for (size_t i = 0; i < orig.data.Size(); ++i) {
ASSERT_EQ(page.data.HostVector()[i].fvalue,
@@ -59,5 +61,4 @@ TEST(SparsePageRawFormat, CSCPage) {
TEST(SparsePageRawFormat, SortedCSCPage) {
TestSparsePageRawFormat<SortedCSCPage>();
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data