Rewrite sparse dmatrix using callbacks. (#7092)

- Reduce dependency on dmlc parsers and provide an interface for users to load data by themselves.
- Remove use of threaded iterator and IO queue.
- Remove `page_size`.
- Make sure the number of pages in memory is bounded.
- Make sure the cache cannot be violated.
- Provide an interface for internal algorithms to process data asynchronously.
This commit is contained in:
Jiaming Yuan
2021-07-16 12:33:31 +08:00
committed by GitHub
parent 2f524e9f41
commit bd1f3a38f0
51 changed files with 1445 additions and 1391 deletions

View File

@@ -59,12 +59,9 @@ TEST(SparsePage, PushCSC) {
}
TEST(SparsePage, PushCSCAfterTranspose) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
std::unique_ptr<DMatrix> dmat =
CreateSparsePageDMatrix(kEntries, 64UL, filename);
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);
const int ncols = dmat->Info().num_col_;
SparsePage page; // Consolidated sparse page
for (const auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
@@ -76,12 +73,12 @@ TEST(SparsePage, PushCSCAfterTranspose) {
// Make sure that the final sparse page has the right number of entries
ASSERT_EQ(kEntries, page.data.Size());
// The feature value for a feature in each row should be identical, as that is
// how the dmatrix has been created
for (size_t i = 0; i < page.Size(); ++i) {
auto inst = page.GetView()[i];
for (size_t j = 1; j < inst.size(); ++j) {
ASSERT_EQ(inst[0].fvalue, inst[j].fvalue);
page.SortRows();
auto v = page.GetView();
for (size_t i = 0; i < v.Size(); ++i) {
auto column = v[i];
for (size_t j = 1; j < column.size(); ++j) {
ASSERT_GE(column[j].fvalue, column[j-1].fvalue);
}
}
}

View File

@@ -142,7 +142,7 @@ TEST(EllpackPage, Copy) {
dmlc::TemporaryDirectory tmpdir;
std::unique_ptr<DMatrix>
dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
BatchParam param{0, 256, kPageSize};
BatchParam param{0, 256};
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
// Create an empty result page.
@@ -188,7 +188,7 @@ TEST(EllpackPage, Compact) {
dmlc::TemporaryDirectory tmpdir;
std::unique_ptr<DMatrix>
dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
BatchParam param{0, 256, kPageSize};
BatchParam param{0, 256};
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
// Create an empty result page.
@@ -212,7 +212,7 @@ TEST(EllpackPage, Compact) {
std::vector<bst_float> row_result(kCols);
for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
auto impl = page.Impl();
EXPECT_EQ(impl->base_rowid, current_row);
ASSERT_EQ(impl->base_rowid, current_row);
for (size_t i = 0; i < impl->Size(); i++) {
size_t compacted_row = row_indexes_h[current_row];

View File

@@ -0,0 +1,46 @@
/*!
* Copyright 2021 XGBoost contributors
*/
#include <gtest/gtest.h>
#include <dmlc/filesystem.h>
#include <memory>
#include "../../../src/data/file_iterator.h"
#include "../../../src/data/proxy_dmatrix.h"
#include "../../../src/data/adapter.h"
#include "../helpers.h"
namespace xgboost {
namespace data {
// Writes two libsvm files (one named "0-based", one named "1-based"), reads each
// back through a FileIterator using the matching `indexing_mode` URI argument, and
// expects the widest batch produced by the iterator to have 5 columns in both cases.
TEST(FileIterator, Basic) {
  // Drains the iterator from the start and asserts on the largest column count
  // reported by the CSR adapter held in the proxy.
  auto expect_five_columns = [](FileIterator *it) {
    size_t widest = 0;
    it->Reset();
    while (it->Next()) {
      auto proxy = MakeProxy(it->Proxy());
      auto adapter = dmlc::get<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter());
      widest = std::max(widest, adapter->NumColumns());
    }
    ASSERT_EQ(widest, 5);
  };

  dmlc::TemporaryDirectory tmpdir;

  {
    // The file is created from the plain path; the URI argument is appended only
    // for the iterator.
    auto const data_path = tmpdir.path + "/0-based.svm";
    CreateBigTestData(data_path, 3 * 64, true);
    auto uri = data_path + "?indexing_mode=0";
    FileIterator iter{uri, 0, 1, "libsvm"};
    expect_five_columns(&iter);
  }

  {
    auto const data_path = tmpdir.path + "/1-based.svm";
    CreateBigTestData(data_path, 3 * 64, false);
    auto uri = data_path + "?indexing_mode=1";
    FileIterator iter{uri, 0, 1, "libsvm"};
    expect_five_columns(&iter);
  }
}
} // namespace data
} // namespace xgboost

View File

@@ -142,7 +142,7 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) {
IterativeDeviceDMatrix m(
&iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
0, 256);
auto &ellpack = *m.GetBatches<EllpackPage>({0, 256, 0}).begin();
auto &ellpack = *m.GetBatches<EllpackPage>({0, 256}).begin();
auto impl = ellpack.Impl();
common::CompressedIterator<uint32_t> iterator(
impl->gidx_buffer.HostVector().data(), impl->NumSymbols());

View File

@@ -260,7 +260,7 @@ TEST(MetaInfo, HostExtend) {
lhs.SetInfo("group", groups.data(), xgboost::DataType::kUInt32, groups.size());
rhs.SetInfo("group", groups.data(), xgboost::DataType::kUInt32, groups.size());
lhs.Extend(rhs, true);
lhs.Extend(rhs, true, true);
ASSERT_EQ(lhs.num_row_, kRows * 2);
ASSERT_TRUE(lhs.labels_.HostCanRead());
ASSERT_TRUE(rhs.labels_.HostCanRead());

View File

@@ -141,7 +141,7 @@ TEST(MetaInfo, DeviceExtend) {
lhs.num_row_ = kRows;
rhs.num_row_ = kRows;
lhs.Extend(rhs, true);
lhs.Extend(rhs, true, true);
ASSERT_EQ(lhs.num_row_, kRows * 2);
ASSERT_FALSE(lhs.labels_.HostCanRead());

View File

@@ -6,11 +6,100 @@
#include <future>
#include "../../../src/common/io.h"
#include "../../../src/data/adapter.h"
#include "../../../src/data/simple_dmatrix.h"
#include "../../../src/data/sparse_page_dmatrix.h"
#include "../../../src/data/file_iterator.h"
#include "../helpers.h"
using namespace xgboost; // NOLINT
template <typename Page>
void TestSparseDMatrixLoadFile() {
dmlc::TemporaryDirectory tmpdir;
auto opath = tmpdir.path + "/1-based.svm";
CreateBigTestData(opath, 3 * 64, false);
opath += "?indexing_mode=1";
data::FileIterator iter{opath, 0, 1, "libsvm"};
data::SparsePageDMatrix m{&iter,
iter.Proxy(),
data::fileiter::Reset,
data::fileiter::Next,
std::numeric_limits<float>::quiet_NaN(),
1,
"cache"};
ASSERT_EQ(m.Info().num_col_, 5);
ASSERT_EQ(m.Info().num_row_, 64);
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
dmlc::Parser<uint32_t>::Create(opath.c_str(), 0, 1, "auto"));
auto adapter = data::FileAdapter{parser.get()};
data::SimpleDMatrix simple{&adapter, std::numeric_limits<float>::quiet_NaN(),
1};
Page out;
for (auto const& page : m.GetBatches<Page>()) {
if (std::is_same<Page, SparsePage>::value) {
out.Push(page);
} else {
out.PushCSC(page);
}
}
ASSERT_EQ(m.Info().num_col_, simple.Info().num_col_);
ASSERT_EQ(m.Info().num_row_, simple.Info().num_row_);
for (auto const& page : simple.GetBatches<Page>()) {
ASSERT_EQ(page.offset.HostVector(), out.offset.HostVector());
for (size_t i = 0; i < page.data.Size(); ++i) {
ASSERT_EQ(page.data.HostVector()[i].fvalue, out.data.HostVector()[i].fvalue);
}
}
}
// Runs the file-loading comparison once for each page type: row-major SparsePage,
// CSCPage and SortedCSCPage.
TEST(SparsePageDMatrix, LoadFile) {
TestSparseDMatrixLoadFile<SparsePage>();
TestSparseDMatrixLoadFile<CSCPage>();
TestSparseDMatrixLoadFile<SortedCSCPage>();
}
// Callers are allowed to retain pages so that several pages can be processed at
// the same time. While iterating, this test keeps the shared pointer handed out
// by the iterator for every page together with an independent copy of the page,
// and after the iteration checks that each retained page still equals its copy.
template <typename Page>
void TestRetainPage() {
  auto dmat = CreateSparsePageDMatrix(10000);
  auto batch_set = dmat->GetBatches<Page>();
  auto first = batch_set.begin();
  auto last = batch_set.end();

  std::vector<Page> copies;
  std::vector<std::shared_ptr<Page const>> retained;

  // Row pages are copied with Push; every other page type goes through PushCSC.
  bool const is_row_page = std::is_same<Page, SparsePage>::value;
  for (auto it = first; it != last; ++it) {
    retained.push_back(it.Page());
    copies.emplace_back(Page{});
    if (is_row_page) {
      copies.back().Push(*it);
    } else {
      copies.back().PushCSC(*it);
    }
    ASSERT_EQ(copies.back().Size(), (*it).Size());
  }
  // The check is only meaningful when more than one page was produced.
  ASSERT_GE(retained.size(), 2);

  for (size_t i = 0; i < retained.size(); ++i) {
    ASSERT_EQ(retained[i]->Size(), copies.at(i).Size());
    ASSERT_EQ(retained[i]->data.HostVector(), copies.at(i).data.HostVector());
  }

  // make sure it's const and the caller can not modify the content of page.
  for (auto &batch : dmat->GetBatches<Page>()) {
    static_assert(std::is_const<std::remove_reference_t<decltype(batch)>>::value, "");
  }
}
// Runs the page-retention check once for each host page type: SparsePage, CSCPage
// and SortedCSCPage.
TEST(SparsePageDMatrix, RetainSparsePage) {
TestRetainPage<SparsePage>();
TestRetainPage<CSCPage>();
TestRetainPage<SortedCSCPage>();
}
TEST(SparsePageDMatrix, MetaInfo) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
@@ -19,8 +108,6 @@ TEST(SparsePageDMatrix, MetaInfo) {
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(
tmp_file + "#" + tmp_file + ".cache", false, false);
std::cout << tmp_file << std::endl;
EXPECT_TRUE(FileExists(tmp_file + ".cache"));
// Test the metadata that was parsed
EXPECT_EQ(dmat->Info().num_row_, 8ul);
@@ -32,10 +119,7 @@ TEST(SparsePageDMatrix, MetaInfo) {
}
TEST(SparsePageDMatrix, RowAccess) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
std::unique_ptr<xgboost::DMatrix> dmat =
xgboost::CreateSparsePageDMatrix(24, 4, filename);
std::unique_ptr<xgboost::DMatrix> dmat = xgboost::CreateSparsePageDMatrix(24);
// Test the data read into the first row
auto &batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
@@ -43,7 +127,7 @@ TEST(SparsePageDMatrix, RowAccess) {
auto first_row = page[0];
ASSERT_EQ(first_row.size(), 3ul);
EXPECT_EQ(first_row[2].index, 2u);
EXPECT_EQ(first_row[2].fvalue, 20);
EXPECT_NEAR(first_row[2].fvalue, 0.986566, 1e-4);
}
TEST(SparsePageDMatrix, ColAccess) {
@@ -54,55 +138,46 @@ TEST(SparsePageDMatrix, ColAccess) {
xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", true, false);
// Loop over the batches and assert the data is as expected
size_t iter = 0;
for (auto const &col_batch : dmat->GetBatches<xgboost::SortedCSCPage>()) {
auto col_page = col_batch.GetView();
EXPECT_EQ(col_page.Size(), dmat->Info().num_col_);
EXPECT_EQ(col_page[1][0].fvalue, 10.0f);
EXPECT_EQ(col_page[1].size(), 1);
ASSERT_EQ(col_page.Size(), dmat->Info().num_col_);
if (iter == 1) {
ASSERT_EQ(col_page[0][0].fvalue, 0.f);
ASSERT_EQ(col_page[3][0].fvalue, 30.f);
ASSERT_EQ(col_page[3][0].index, 1);
ASSERT_EQ(col_page[3].size(), 1);
} else {
ASSERT_EQ(col_page[1][0].fvalue, 10.0f);
ASSERT_EQ(col_page[1].size(), 1);
}
CHECK_LE(col_batch.base_rowid, dmat->Info().num_row_);
++iter;
}
// Loop over the batches and assert the data is as expected
iter = 0;
for (auto const &col_batch : dmat->GetBatches<xgboost::CSCPage>()) {
auto col_page = col_batch.GetView();
EXPECT_EQ(col_page.Size(), dmat->Info().num_col_);
EXPECT_EQ(col_page[1][0].fvalue, 10.0f);
EXPECT_EQ(col_page[1].size(), 1);
if (iter == 0) {
EXPECT_EQ(col_page[1][0].fvalue, 10.0f);
EXPECT_EQ(col_page[1].size(), 1);
} else {
EXPECT_EQ(col_page[3][0].fvalue, 30.f);
EXPECT_EQ(col_page[3].size(), 1);
}
iter++;
}
EXPECT_TRUE(FileExists(tmp_file + ".cache"));
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
EXPECT_TRUE(FileExists(tmp_file + ".cache.col.page"));
EXPECT_TRUE(FileExists(tmp_file + ".cache.sorted.col.page"));
delete dmat;
EXPECT_FALSE(FileExists(tmp_file + ".cache"));
EXPECT_FALSE(FileExists(tmp_file + ".cache.row.page"));
EXPECT_FALSE(FileExists(tmp_file + ".cache.col.page"));
EXPECT_FALSE(FileExists(tmp_file + ".cache.sorted.col.page"));
}
// Expects that creating a second sparse-page DMatrix with the same entry count,
// page size and file name throws while the first DMatrix is still in scope.
TEST(SparsePageDMatrix, ExistingCacheFile) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
// First DMatrix built for `filename`; it stays alive for the rest of the test.
std::unique_ptr<xgboost::DMatrix> dmat =
xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename);
// Second creation with identical arguments must throw.
EXPECT_ANY_THROW({
std::unique_ptr<xgboost::DMatrix> dmat2 =
xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename);
});
}
TEST(SparsePageDMatrix, ThreadSafetyException) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/test";
size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
size_t constexpr kEntriesPerCol = 3;
size_t constexpr kEntries = 64 * kEntriesPerCol * 2;
std::unique_ptr<xgboost::DMatrix> dmat =
xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename);
xgboost::CreateSparsePageDMatrix(kEntries);
int threads = 1000;
@@ -134,13 +209,10 @@ TEST(SparsePageDMatrix, ThreadSafetyException) {
// Multi-batches access
TEST(SparsePageDMatrix, ColAccessBatches) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
// Create multiple sparse pages
std::unique_ptr<xgboost::DMatrix> dmat{
xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename)};
std::unique_ptr<xgboost::DMatrix> dmat{xgboost::CreateSparsePageDMatrix(kEntries)};
auto n_threads = omp_get_max_threads();
omp_set_num_threads(16);
for (auto const &page : dmat->GetBatches<xgboost::CSCPage>()) {
@@ -149,234 +221,37 @@ TEST(SparsePageDMatrix, ColAccessBatches) {
omp_set_num_threads(n_threads);
}
// Builds a SparsePageDMatrix from an empty CSR adapter, an empty dense adapter and
// an empty CSC adapter in turn. In every case the meta info must report zero
// non-zeros, rows and columns, and every SparsePage batch must have size 0.
TEST(SparsePageDMatrix, Empty) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
std::vector<float> data{};
std::vector<unsigned> feature_idx = {};
std::vector<size_t> row_ptr = {};
// Case 1: CSR input backed by empty vectors.
{
data::CSRAdapter csr_adapter(row_ptr.data(), feature_idx.data(),
data.data(), 0, 0, 0);
data::SparsePageDMatrix dmat(
&csr_adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
EXPECT_EQ(dmat.Info().num_nonzero_, 0);
EXPECT_EQ(dmat.Info().num_row_, 0);
EXPECT_EQ(dmat.Info().num_col_, 0);
for (auto &batch : dmat.GetBatches<SparsePage>()) {
EXPECT_EQ(batch.Size(), 0);
}
}
// Case 2: dense input with a null data pointer and 0 x 0 shape.
{
data::DenseAdapter dense_adapter(nullptr, 0, 0);
data::SparsePageDMatrix dmat2(
&dense_adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
EXPECT_EQ(dmat2.Info().num_nonzero_, 0);
EXPECT_EQ(dmat2.Info().num_row_, 0);
EXPECT_EQ(dmat2.Info().num_col_, 0);
for (auto &batch : dmat2.GetBatches<SparsePage>()) {
EXPECT_EQ(batch.Size(), 0);
}
}
// Case 3: CSC input with null pointers and zero sizes.
{
data::CSCAdapter csc_adapter(nullptr, nullptr, nullptr, 0, 0);
data::SparsePageDMatrix dmat3(
&csc_adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
EXPECT_EQ(dmat3.Info().num_nonzero_, 0);
EXPECT_EQ(dmat3.Info().num_row_, 0);
EXPECT_EQ(dmat3.Info().num_col_, 0);
for (auto &batch : dmat3.GetBatches<SparsePage>()) {
EXPECT_EQ(batch.Size(), 0);
}
}
}
// Builds two matrices from the same three-entry CSR input (values 0.0, NaN, 1.0)
// with different `missing` markers and checks the reported non-zero count:
// 2 when NaN is the missing value, 1 when 1.0 is the missing value.
TEST(SparsePageDMatrix, MissingData) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
std::vector<float> data{0.0, std::nanf(""), 1.0};
std::vector<unsigned> feature_idx = {0, 1, 0};
std::vector<size_t> row_ptr = {0, 2, 3};
data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2,
3, 2);
data::SparsePageDMatrix dmat(
&adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
EXPECT_EQ(dmat.Info().num_nonzero_, 2);
// The second matrix gets its own file name so it does not reuse tmp_file.
const std::string tmp_file2 = tempdir.path + "/simple2.libsvm";
data::SparsePageDMatrix dmat2(&adapter, 1.0, 1, tmp_file2);
EXPECT_EQ(dmat2.Info().num_nonzero_, 1);
}
// CSR input whose second row has no entries (row_ptr = {0, 2, 2}). The matrix is
// still expected to report 2 rows, 2 columns and 2 non-zero entries.
TEST(SparsePageDMatrix, EmptyRow) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
std::vector<float> data{0.0, 1.0};
std::vector<unsigned> feature_idx = {0, 1};
std::vector<size_t> row_ptr = {0, 2, 2};
data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2,
2, 2);
data::SparsePageDMatrix dmat(
&adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
EXPECT_EQ(dmat.Info().num_nonzero_, 2);
EXPECT_EQ(dmat.Info().num_row_, 2);
EXPECT_EQ(dmat.Info().num_col_, 2);
}
// Builds a SparsePageDMatrix from a 3 x 2 dense buffer and checks the reported
// shape and non-zero count, then checks every entry of every SparsePage batch
// against the source buffer.
TEST(SparsePageDMatrix, FromDense) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
int m = 3;
int n = 2;
std::vector<float> data = {1, 2, 3, 4, 5, 6};
data::DenseAdapter adapter(data.data(), m, n);
data::SparsePageDMatrix dmat(
&adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
EXPECT_EQ(dmat.Info().num_col_, 2);
EXPECT_EQ(dmat.Info().num_row_, 3);
EXPECT_EQ(dmat.Info().num_nonzero_, 6);
for (auto &batch : dmat.GetBatches<SparsePage>()) {
auto page = batch.GetView();
for (auto i = 0ull; i < batch.Size(); i++) {
auto inst = page[i];
for (auto j = 0ull; j < inst.size(); j++) {
// Entry j of row i is expected to equal data[i * n + j] with feature index j.
EXPECT_EQ(inst[j].fvalue, data[i * n + j]);
EXPECT_EQ(inst[j].index, j);
}
}
}
}
// Builds a SparsePageDMatrix from a two-column CSC input holding five values and
// checks the reported shape, the non-zero count and the row-wise content of the
// first SparsePage batch.
TEST(SparsePageDMatrix, FromCSC) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
std::vector<float> data = {1, 3, 2, 4, 5};
std::vector<unsigned> row_idx = {0, 1, 0, 1, 2};
std::vector<size_t> col_ptr = {0, 2, 5};
data::CSCAdapter adapter(col_ptr.data(), row_idx.data(), data.data(), 2, 3);
data::SparsePageDMatrix dmat(
&adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file);
EXPECT_EQ(dmat.Info().num_col_, 2);
EXPECT_EQ(dmat.Info().num_row_, 3);
EXPECT_EQ(dmat.Info().num_nonzero_, 5);
auto &batch = *dmat.GetBatches<SparsePage>().begin();
auto page = batch.GetView();
// Row 0 is expected to hold values 1 and 2 at feature indices 0 and 1.
auto inst = page[0];
EXPECT_EQ(inst[0].fvalue, 1);
EXPECT_EQ(inst[0].index, 0);
EXPECT_EQ(inst[1].fvalue, 2);
EXPECT_EQ(inst[1].index, 1);
// Row 1 is expected to hold values 3 and 4 at feature indices 0 and 1.
inst = page[1];
EXPECT_EQ(inst[0].fvalue, 3);
EXPECT_EQ(inst[0].index, 0);
EXPECT_EQ(inst[1].fvalue, 4);
EXPECT_EQ(inst[1].index, 1);
// Row 2 is expected to start with value 5 at feature index 1.
inst = page[2];
EXPECT_EQ(inst[0].fvalue, 5);
EXPECT_EQ(inst[0].index, 1);
}
// Generates "test.libsvm" (a relative path, not inside the temporary directory),
// loads it through a dmlc parser and FileAdapter, and checks the column count,
// the row offsets of every batch and the feature indices of each batch's first row.
TEST(SparsePageDMatrix, FromFile) {
std::string filename = "test.libsvm";
CreateBigTestData(filename, 20);
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
data::FileAdapter adapter(parser.get());
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
data::SparsePageDMatrix dmat(
&adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 1);
ASSERT_EQ(dmat.Info().num_col_, 5);
for (auto &batch : dmat.GetBatches<SparsePage>()) {
std::vector<bst_row_t> expected_offset(batch.Size() + 1);
auto page = batch.GetView();
// Starting at -3 makes the generator yield 0, 3, 6, ...: three entries per row.
int n = -3;
std::generate(expected_offset.begin(), expected_offset.end(),
[&n] { return n += 3; });
EXPECT_EQ(batch.offset.HostVector(), expected_offset);
// The expected feature indices of the first row depend on the parity of the
// batch's base row id.
if (batch.base_rowid % 2 == 0) {
EXPECT_EQ(page[0][0].index, 0);
EXPECT_EQ(page[0][1].index, 1);
EXPECT_EQ(page[0][2].index, 2);
} else {
EXPECT_EQ(page[0][0].index, 0);
EXPECT_EQ(page[0][1].index, 3);
EXPECT_EQ(page[0][2].index, 4);
}
}
}
// Generates a data file with `1 << 16` passed as the size argument, loads it once
// as a SparsePageDMatrix and once through DMatrix::Load, converts both to CSR
// arrays and requires the values, row pointers and column ids to be identical.
TEST(SparsePageDMatrix, Large) {
std::string filename = "test.libsvm";
CreateBigTestData(filename, 1 << 16);
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
data::FileAdapter adapter(parser.get());
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
std::unique_ptr<DMatrix> sparse{new data::SparsePageDMatrix(
&adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 16)};
std::unique_ptr<DMatrix> simple{DMatrix::Load(filename, true, true)};
// CSR arrays extracted from the sparse-page matrix.
std::vector<float> sparse_data;
std::vector<size_t> sparse_rptr;
std::vector<bst_feature_t> sparse_cids;
DMatrixToCSR(sparse.get(), &sparse_data, &sparse_rptr, &sparse_cids);
// CSR arrays extracted from the matrix returned by DMatrix::Load.
std::vector<float> simple_data;
std::vector<size_t> simple_rptr;
std::vector<bst_feature_t> simple_cids;
DMatrixToCSR(simple.get(), &simple_data, &simple_rptr, &simple_cids);
// Sizes are compared before contents so that a size mismatch is reported first.
ASSERT_EQ(sparse_rptr.size(), sparse->Info().num_row_ + 1);
ASSERT_EQ(sparse_rptr.size(), simple->Info().num_row_ + 1);
ASSERT_EQ(sparse_data.size(), simple_data.size());
ASSERT_EQ(sparse_data, simple_data);
ASSERT_EQ(sparse_rptr.size(), simple_rptr.size());
ASSERT_EQ(sparse_rptr, simple_rptr);
ASSERT_EQ(sparse_cids, simple_cids);
}
auto TestSparsePageDMatrixDeterminism(int32_t threads, std::string const& filename) {
auto TestSparsePageDMatrixDeterminism(int32_t threads) {
omp_set_num_threads(threads);
std::vector<float> sparse_data;
std::vector<size_t> sparse_rptr;
std::vector<bst_feature_t> sparse_cids;
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
data::FileAdapter adapter(parser.get());
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
std::unique_ptr<DMatrix> sparse{new data::SparsePageDMatrix(
&adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 1 << 8)};
std::string filename = tempdir.path + "/simple.libsvm";
CreateBigTestData(filename, 1 << 16);
data::FileIterator iter(filename, 0, 1, "auto");
std::unique_ptr<DMatrix> sparse{new data::SparsePageDMatrix{
&iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next,
std::numeric_limits<float>::quiet_NaN(), 1, filename}};
DMatrixToCSR(sparse.get(), &sparse_data, &sparse_rptr, &sparse_cids);
std::string cache_name = tmp_file + ".row.page";
auto cache_name =
data::MakeId(filename,
dynamic_cast<data::SparsePageDMatrix *>(sparse.get())) +
".row.page";
std::string cache = common::LoadSequentialFile(cache_name);
return cache;
}
TEST(SparsePageDMatrix, Determinism) {
std::string filename = "test.libsvm";
CreateBigTestData(filename, 1 << 16);
#if defined(_MSC_VER)
return;
#endif // defined(_MSC_VER)
std::vector<std::string> caches;
for (size_t i = 1; i < 18; i += 2) {
caches.emplace_back(TestSparsePageDMatrixDeterminism(i, filename));
caches.emplace_back(TestSparsePageDMatrixDeterminism(i));
}
for (size_t i = 1; i < caches.size(); ++i) {

View File

@@ -4,6 +4,7 @@
#include "../helpers.h"
#include "../../../src/common/compressed_iterator.h"
#include "../../../src/data/ellpack_page.cuh"
#include "../../../src/data/sparse_page_dmatrix.h"
namespace xgboost {
@@ -14,13 +15,22 @@ TEST(SparsePageDMatrix, EllpackPage) {
DMatrix* dmat = DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", true, false);
// Loop over the batches and assert the data is as expected
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256, 64})) {
EXPECT_EQ(batch.Size(), dmat->Info().num_row_);
size_t n = 0;
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256})) {
n += batch.Size();
}
EXPECT_EQ(n, dmat->Info().num_row_);
EXPECT_TRUE(FileExists(tmp_file + ".cache"));
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
EXPECT_TRUE(FileExists(tmp_file + ".cache.ellpack.page"));
auto path =
data::MakeId(tmp_file + ".cache",
dynamic_cast<data::SparsePageDMatrix *>(dmat)) +
".row.page";
EXPECT_TRUE(FileExists(path));
path =
data::MakeId(tmp_file + ".cache",
dynamic_cast<data::SparsePageDMatrix *>(dmat)) +
".ellpack.page";
EXPECT_TRUE(FileExists(path));
delete dmat;
}
@@ -30,12 +40,12 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) {
std::string filename = tmpdir.path + "/big.libsvm";
size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries, kPageSize, filename);
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries, filename);
// Loop over the batches and count the records
int64_t batch_count = 0;
int64_t row_count = 0;
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256, 7UL})) {
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256})) {
EXPECT_LT(batch.Size(), dmat->Info().num_row_);
batch_count++;
row_count += batch.Size();
@@ -43,7 +53,36 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) {
EXPECT_GE(batch_count, 2);
EXPECT_EQ(row_count, dmat->Info().num_row_);
EXPECT_TRUE(FileExists(filename + ".cache.ellpack.page"));
auto path =
data::MakeId(filename,
dynamic_cast<data::SparsePageDMatrix *>(dmat.get())) +
".ellpack.page";
}
// Ellpack counterpart of the page-retention check: while iterating, keep the
// shared pointer handed out for every EllpackPage together with a copy of its
// gidx buffer, then verify after the iteration that each retained page still
// holds the same buffer content as its copy.
TEST(SparsePageDMatrix, RetainEllpackPage) {
  auto dmat = CreateSparsePageDMatrix(10000);
  auto batch_set = dmat->GetBatches<EllpackPage>({0, 32});
  auto first = batch_set.begin();
  auto last = batch_set.end();

  std::vector<HostDeviceVector<common::CompressedByteT>> snapshots;
  std::vector<std::shared_ptr<EllpackPage const>> retained;

  for (auto it = first; it != last; ++it) {
    retained.push_back(it.Page());
    // Size the destination to match the source before copying into it.
    snapshots.emplace_back(HostDeviceVector<common::CompressedByteT>{});
    auto &snapshot = snapshots.back();
    snapshot.Resize((*it).Impl()->gidx_buffer.Size());
    snapshot.Copy((*it).Impl()->gidx_buffer);
  }
  // The check is only meaningful when more than one page was produced.
  ASSERT_GE(retained.size(), 2);

  for (size_t i = 0; i < retained.size(); ++i) {
    ASSERT_EQ(retained[i]->Impl()->gidx_buffer.HostVector(), snapshots.at(i).HostVector());
  }

  // make sure it's const and the caller can not modify the content of page.
  for (auto &batch : dmat->GetBatches<EllpackPage>({0, 32})) {
    static_assert(std::is_const<std::remove_reference_t<decltype(batch)>>::value, "");
  }
}
TEST(SparsePageDMatrix, EllpackPageContent) {
@@ -59,7 +98,7 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
std::unique_ptr<DMatrix>
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
BatchParam param{0, 2, 0};
BatchParam param{0, 2};
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
EXPECT_EQ(impl->base_rowid, 0);
EXPECT_EQ(impl->n_rows, kRows);
@@ -67,7 +106,17 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
EXPECT_EQ(impl->row_stride, 2);
EXPECT_EQ(impl->Cuts().TotalBins(), 4);
auto impl_ext = (*dmat_ext->GetBatches<EllpackPage>(param).begin()).Impl();
std::unique_ptr<EllpackPageImpl> impl_ext;
size_t offset = 0;
for (auto& batch : dmat_ext->GetBatches<EllpackPage>(param)) {
if (!impl_ext) {
impl_ext.reset(new EllpackPageImpl(
batch.Impl()->gidx_buffer.DeviceIdx(), batch.Impl()->Cuts(),
batch.Impl()->is_dense, batch.Impl()->row_stride, kRows));
}
auto n_elems = impl_ext->Copy(0, batch.Impl(), offset);
offset += n_elems;
}
EXPECT_EQ(impl_ext->base_rowid, 0);
EXPECT_EQ(impl_ext->n_rows, kRows);
EXPECT_FALSE(impl_ext->is_dense);
@@ -109,7 +158,7 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
std::unique_ptr<DMatrix>
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
BatchParam param{0, kMaxBins, kPageSize};
BatchParam param{0, kMaxBins};
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
EXPECT_EQ(impl->base_rowid, 0);
EXPECT_EQ(impl->n_rows, kRows);
@@ -150,7 +199,7 @@ TEST(SparsePageDMatrix, EllpackPageMultipleLoops) {
std::unique_ptr<DMatrix>
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
BatchParam param{0, kMaxBins, kPageSize};
BatchParam param{0, kMaxBins};
size_t current_row = 0;
for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {