Refactor external memory formats. (#7089)

* Save base_rowid.
* Return write size.
* Remove unused function.
This commit is contained in:
Jiaming Yuan
2021-07-08 04:04:51 +08:00
committed by GitHub
parent 689eb8f620
commit c766f143ab
5 changed files with 134 additions and 79 deletions

View File

@@ -0,0 +1,45 @@
/*!
* Copyright 2021 XGBoost contributors
*/
#include <gtest/gtest.h>
#include <dmlc/filesystem.h>
#include <xgboost/data.h>
#include "../../../src/data/sparse_page_source.h"
#include "../../../src/data/ellpack_page.cuh"
#include "../helpers.h"
namespace xgboost {
namespace data {
// Round-trip check for the "raw" EllpackPage format: each batch of a randomly
// generated DMatrix is written to a temporary file, one page is read back, and
// its fields are compared with the batches the DMatrix produces.
TEST(EllpackPageRawFormat, IO) {
  std::unique_ptr<SparsePageFormat<EllpackPage>> fmt{CreatePageFormat<EllpackPage>("raw")};
  auto dmat = RandomDataGenerator{100, 14, 0.5}.GenerateDMatrix();

  dmlc::TemporaryDirectory scratch;
  std::string page_path = scratch.path + "/ellpack.page";

  {
    // The output stream is scoped so that it is destroyed before the file is
    // reopened for reading below.
    std::unique_ptr<dmlc::Stream> writer{dmlc::Stream::Create(page_path.c_str(), "w")};
    for (auto const &batch : dmat->GetBatches<EllpackPage>({0, 256})) {
      fmt->Write(batch, writer.get());
    }
  }

  EllpackPage restored;
  std::unique_ptr<dmlc::SeekStream> reader{dmlc::SeekStream::CreateForRead(page_path.c_str())};
  fmt->Read(&restored, reader.get());

  // Only a single page was read; it is compared against every batch yielded
  // by the source matrix.
  for (auto const &batch : dmat->GetBatches<EllpackPage>({0, 256})) {
    auto actual = restored.Impl();
    auto expected = batch.Impl();

    // Histogram cuts.
    ASSERT_EQ(actual->Cuts().Ptrs(), expected->Cuts().Ptrs());
    ASSERT_EQ(actual->Cuts().MinValues(), expected->Cuts().MinValues());
    ASSERT_EQ(actual->Cuts().Values(), expected->Cuts().Values());

    // Page metadata and the bin index buffer.
    ASSERT_EQ(actual->base_rowid, expected->base_rowid);
    ASSERT_EQ(actual->row_stride, expected->row_stride);
    ASSERT_EQ(actual->gidx_buffer.HostVector(), expected->gidx_buffer.HostVector());
  }
}
} // namespace data
} // namespace xgboost

View File

@@ -0,0 +1,56 @@
/*!
* Copyright 2021 XGBoost contributors
*/
#include <gtest/gtest.h>
#include <dmlc/filesystem.h>
#include <xgboost/data.h>
#include "../../../src/data/sparse_page_source.h"
#include "../helpers.h"
namespace xgboost {
namespace data {
/*!
 * \brief Round-trip test of the "raw" page format for page type S.
 *
 * Every batch of a randomly generated DMatrix is written to a temporary file
 * and simultaneously accumulated into an in-memory page. One page is then read
 * back from the file and compared with the accumulated page: entry values,
 * entry indices, offsets and base_rowid.
 *
 * \tparam S Page type used with SparsePageFormat<S> and DMatrix::GetBatches<S>.
 */
template <typename S> void TestSparsePageRawFormat() {
  std::unique_ptr<SparsePageFormat<S>> format{CreatePageFormat<S>("raw")};

  auto m = RandomDataGenerator{100, 14, 0.5}.GenerateDMatrix();
  // Only one page is read back below, so the matrix must hold a single block.
  ASSERT_TRUE(m->SingleColBlock());
  dmlc::TemporaryDirectory tmpdir;
  std::string path = tmpdir.path + "/sparse.page";
  S orig;
  {
    // Scope the output stream so it is destroyed (and the file flushed)
    // before the file is reopened for reading.
    std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
    for (auto const &page : m->GetBatches<S>()) {
      orig.Push(page);
      format->Write(page, fo.get());
    }
  }

  S page;
  std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
  format->Read(&page, fi.get());

  // Sizes must match before indexing: otherwise a short read would be an
  // out-of-bounds access and trailing extra data would go unnoticed.
  ASSERT_EQ(page.data.Size(), orig.data.Size());
  auto const &loaded_data = page.data.HostVector();
  auto const &expected_data = orig.data.HostVector();
  for (size_t i = 0; i < expected_data.size(); ++i) {
    ASSERT_EQ(loaded_data[i].fvalue, expected_data[i].fvalue);
    ASSERT_EQ(loaded_data[i].index, expected_data[i].index);
  }

  ASSERT_EQ(page.offset.Size(), orig.offset.Size());
  auto const &loaded_offset = page.offset.HostVector();
  auto const &expected_offset = orig.offset.HostVector();
  for (size_t i = 0; i < expected_offset.size(); ++i) {
    ASSERT_EQ(loaded_offset[i], expected_offset[i]);
  }

  ASSERT_EQ(page.base_rowid, orig.base_rowid);
}
// Raw-format round trip for SparsePage.
TEST(SparsePageRawFormat, SparsePage) { TestSparsePageRawFormat<SparsePage>(); }
// Raw-format round trip for CSCPage.
TEST(SparsePageRawFormat, CSCPage) { TestSparsePageRawFormat<CSCPage>(); }
// Raw-format round trip for SortedCSCPage.
TEST(SparsePageRawFormat, SortedCSCPage) { TestSparsePageRawFormat<SortedCSCPage>(); }
} // namespace data
} // namespace xgboost