DMatrix refactor stage 2 (#3395)

* DMatrix refactor 2

* Remove buffered rowset usage where possible

* Transition to C++11-style iterators for row access

* Transition column iterators to C++11
This commit is contained in:
Rory Mitchell
2018-10-01 01:29:03 +13:00
committed by GitHub
parent b50bc2c1d4
commit 70d208d68c
36 changed files with 459 additions and 846 deletions

View File

@@ -8,7 +8,6 @@ TEST(SparsePageDMatrix, MetaInfo) {
std::string tmp_file = CreateSimpleTestData();
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(
tmp_file + "#" + tmp_file + ".cache", false, false);
std::remove(tmp_file.c_str());
std::cout << tmp_file << std::endl;
EXPECT_TRUE(FileExists(tmp_file + ".cache"));
@@ -19,6 +18,7 @@ TEST(SparsePageDMatrix, MetaInfo) {
EXPECT_EQ(dmat->Info().labels_.Size(), dmat->Info().num_row_);
// Clean up of external memory files
std::remove(tmp_file.c_str());
std::remove((tmp_file + ".cache").c_str());
std::remove((tmp_file + ".cache.row.page").c_str());
@@ -26,26 +26,26 @@ TEST(SparsePageDMatrix, MetaInfo) {
}
TEST(SparsePageDMatrix, RowAccess) {
std::string tmp_file = CreateSimpleTestData();
// Create sufficiently large data to make two row pages
std::string tmp_file = CreateBigTestData(5000000);
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(
tmp_file + "#" + tmp_file + ".cache", true, false);
std::remove(tmp_file.c_str());
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
auto row_iter = dmat->RowIterator();
// Loop over the batches and count the records
long row_count = 0;
row_iter->BeforeFirst();
while (row_iter->Next()) row_count += row_iter->Value().Size();
for (auto &batch : dmat->GetRowBatches()) {
row_count += batch.Size();
}
EXPECT_EQ(row_count, dmat->Info().num_row_);
// Test the data read into the first row
row_iter->BeforeFirst();
row_iter->Next();
auto first_row = row_iter->Value()[0];
auto &batch = *dmat->GetRowBatches().begin();
auto first_row = batch[0];
ASSERT_EQ(first_row.size(), 3);
EXPECT_EQ(first_row[2].index, 2);
EXPECT_EQ(first_row[2].fvalue, 20);
row_iter = nullptr;
// Clean up of external memory files
std::remove((tmp_file + ".cache").c_str());
@@ -59,35 +59,33 @@ TEST(SparsePageDMatrix, ColAccess) {
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(
tmp_file + "#" + tmp_file + ".cache", true, false);
std::remove(tmp_file.c_str());
EXPECT_FALSE(FileExists(tmp_file + ".cache.col.page"));
EXPECT_EQ(dmat->HaveColAccess(true), false);
const std::vector<bool> enable(dmat->Info().num_col_, true);
dmat->InitColAccess(1, true); // Max 1 row per patch
ASSERT_EQ(dmat->HaveColAccess(true), true);
EXPECT_TRUE(FileExists(tmp_file + ".cache.col.page"));
EXPECT_EQ(dmat->GetColSize(0), 2);
EXPECT_EQ(dmat->GetColSize(1), 1);
EXPECT_EQ(dmat->GetColDensity(0), 1);
EXPECT_EQ(dmat->GetColDensity(1), 0.5);
auto col_iter = dmat->ColIterator();
// Loop over the batches and assert the data is as expected
long num_col_batch = 0;
col_iter->BeforeFirst();
while (col_iter->Next()) {
num_col_batch += 1;
EXPECT_EQ(col_iter->Value().Size(), dmat->Info().num_col_)
<< "Expected batch size to be same as num_cols as max_row_perbatch is 1.";
for (auto col_batch : dmat->GetSortedColumnBatches()) {
EXPECT_EQ(col_batch.Size(), dmat->Info().num_col_);
EXPECT_EQ(col_batch[1][0].fvalue, 10.0f);
EXPECT_EQ(col_batch[1].size(), 1);
}
EXPECT_EQ(num_col_batch, dmat->Info().num_row_)
<< "Expected num batches to be same as num_rows as max_row_perbatch is 1";
col_iter = nullptr;
// Loop over the batches and assert the data is as expected
for (auto col_batch : dmat->GetColumnBatches()) {
EXPECT_EQ(col_batch.Size(), dmat->Info().num_col_);
EXPECT_EQ(col_batch[1][0].fvalue, 10.0f);
EXPECT_EQ(col_batch[1].size(), 1);
}
EXPECT_TRUE(FileExists(tmp_file + ".cache"));
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
EXPECT_TRUE(FileExists(tmp_file + ".cache.col.page"));
EXPECT_TRUE(FileExists(tmp_file + ".cache.sorted.col.page"));
std::remove((tmp_file + ".cache").c_str());
std::remove((tmp_file + ".cache.col.page").c_str());
std::remove((tmp_file + ".cache.row.page").c_str());
std::remove((tmp_file + ".cache.col.page").c_str());
std::remove((tmp_file + ".cache.sorted.col.page").c_str());
delete dmat;
}