DMatrix refactor stage 1 (#3301)

* Use sparse page as singular CSR matrix representation

* Simplify DMatrix methods

* Reduce statefulness of batch iterators

* BREAKING CHANGE: Remove prob_buffer_row parameter. Users are instead recommended to sample their dataset as a preprocessing step before using XGBoost.
This commit is contained in:
Rory Mitchell
2018-06-07 10:25:58 +12:00
committed by GitHub
parent 286dccb8e8
commit a96039141a
47 changed files with 650 additions and 1036 deletions

View File

@@ -18,13 +18,13 @@ TEST(SimpleCSRSource, SaveLoadBinary) {
EXPECT_EQ(dmat->Info().num_row_, dmat_read->Info().num_row_);
EXPECT_EQ(dmat->Info().num_row_, dmat_read->Info().num_row_);
dmlc::DataIter<xgboost::RowBatch> * row_iter = dmat->RowIterator();
dmlc::DataIter<xgboost::RowBatch> * row_iter_read = dmat_read->RowIterator();
auto row_iter = dmat->RowIterator();
auto row_iter_read = dmat_read->RowIterator();
// Test the data read into the first row
row_iter->BeforeFirst(); row_iter->Next();
row_iter_read->BeforeFirst(); row_iter_read->Next();
xgboost::SparseBatch::Inst first_row = row_iter->Value()[0];
xgboost::SparseBatch::Inst first_row_read = row_iter_read->Value()[0];
auto first_row = row_iter->Value()[0];
auto first_row_read = row_iter_read->Value()[0];
EXPECT_EQ(first_row.length, first_row_read.length);
EXPECT_EQ(first_row[2].index, first_row_read[2].index);
EXPECT_EQ(first_row[2].fvalue, first_row_read[2].fvalue);

View File

@@ -18,19 +18,19 @@ TEST(SimpleDMatrix, MetaInfo) {
TEST(SimpleDMatrix, RowAccess) {
std::string tmp_file = CreateSimpleTestData();
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file, true, false);
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file, false, false);
std::remove(tmp_file.c_str());
dmlc::DataIter<xgboost::RowBatch> * row_iter = dmat->RowIterator();
auto row_iter = dmat->RowIterator();
// Loop over the batches and count the records
long row_count = 0;
row_iter->BeforeFirst();
while (row_iter->Next()) row_count += row_iter->Value().size;
while (row_iter->Next()) row_count += row_iter->Value().Size();
EXPECT_EQ(row_count, dmat->Info().num_row_);
// Test the data read into the first row
row_iter->BeforeFirst();
row_iter->Next();
xgboost::SparseBatch::Inst first_row = row_iter->Value()[0];
auto first_row = row_iter->Value()[0];
ASSERT_EQ(first_row.length, 3);
EXPECT_EQ(first_row[2].index, 2);
EXPECT_EQ(first_row[2].fvalue, 20);
@@ -45,14 +45,14 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
// Unsorted column access
const std::vector<bool> enable(dmat->Info().num_col_, true);
EXPECT_EQ(dmat->HaveColAccess(false), false);
dmat->InitColAccess(enable, 1, dmat->Info().num_row_, false);
dmat->InitColAccess(enable, 0, 0, false); // Calling it again should not change it
dmat->InitColAccess(dmat->Info().num_row_, false);
dmat->InitColAccess(0, false); // Calling it again should not change it
ASSERT_EQ(dmat->HaveColAccess(false), true);
// Sorted column access
EXPECT_EQ(dmat->HaveColAccess(true), false);
dmat->InitColAccess(enable, 1, dmat->Info().num_row_, true);
dmat->InitColAccess(enable, 0, 0, true); // Calling it again should not change it
dmat->InitColAccess(dmat->Info().num_row_, true);
dmat->InitColAccess(0, true); // Calling it again should not change it
ASSERT_EQ(dmat->HaveColAccess(true), true);
EXPECT_EQ(dmat->GetColSize(0), 2);
@@ -61,84 +61,19 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
EXPECT_EQ(dmat->GetColDensity(1), 0.5);
ASSERT_TRUE(dmat->SingleColBlock());
dmlc::DataIter<xgboost::ColBatch> * col_iter = dmat->ColIterator();
auto* col_iter = dmat->ColIterator();
// Loop over the batches and assert the data is as expected
long num_col_batch = 0;
col_iter->BeforeFirst();
while (col_iter->Next()) {
num_col_batch += 1;
EXPECT_EQ(col_iter->Value().size, dmat->Info().num_col_)
EXPECT_EQ(col_iter->Value().Size(), dmat->Info().num_col_)
<< "Expected batch size = number of cells as #batches is 1.";
for (int i = 0; i < static_cast<int>(col_iter->Value().size); ++i) {
for (int i = 0; i < static_cast<int>(col_iter->Value().Size()); ++i) {
EXPECT_EQ(col_iter->Value()[i].length, dmat->GetColSize(i))
<< "Expected length of each colbatch = colsize as #batches is 1.";
}
}
EXPECT_EQ(num_col_batch, 1) << "Expected number of batches to be 1";
col_iter = nullptr;
std::vector<xgboost::bst_uint> sub_feats = {4, 3};
dmlc::DataIter<xgboost::ColBatch> * sub_col_iter = dmat->ColIterator(sub_feats);
// Loop over the batches and assert the data is as expected
sub_col_iter->BeforeFirst();
while (sub_col_iter->Next()) {
EXPECT_EQ(sub_col_iter->Value().size, sub_feats.size())
<< "Expected size of a batch = number of cells in subset as #batches is 1.";
}
sub_col_iter = nullptr;
}
TEST(SimpleDMatrix, ColAccessWithBatches) {
std::string tmp_file = CreateSimpleTestData();
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file, true, false);
std::remove(tmp_file.c_str());
// Unsorted column access
const std::vector<bool> enable(dmat->Info().num_col_, true);
EXPECT_EQ(dmat->HaveColAccess(false), false);
dmat->InitColAccess(enable, 1, 1, false);
dmat->InitColAccess(enable, 0, 0, false); // Calling it again should not change it
ASSERT_EQ(dmat->HaveColAccess(false), true);
// Sorted column access
EXPECT_EQ(dmat->HaveColAccess(true), false);
dmat->InitColAccess(enable, 1, 1, true); // Max 1 row per patch
dmat->InitColAccess(enable, 0, 0, true); // Calling it again should not change it
ASSERT_EQ(dmat->HaveColAccess(true), true);
EXPECT_EQ(dmat->GetColSize(0), 2);
EXPECT_EQ(dmat->GetColSize(1), 1);
EXPECT_EQ(dmat->GetColDensity(0), 1);
EXPECT_EQ(dmat->GetColDensity(1), 0.5);
ASSERT_FALSE(dmat->SingleColBlock());
dmlc::DataIter<xgboost::ColBatch> * col_iter = dmat->ColIterator();
// Loop over the batches and assert the data is as expected
long num_col_batch = 0;
col_iter->BeforeFirst();
while (col_iter->Next()) {
num_col_batch += 1;
EXPECT_EQ(col_iter->Value().size, dmat->Info().num_col_)
<< "Expected batch size = num_cols as max_row_perbatch is 1.";
for (int i = 0; i < static_cast<int>(col_iter->Value().size); ++i) {
EXPECT_LE(col_iter->Value()[i].length, 1)
<< "Expected length of each colbatch <=1 as max_row_perbatch is 1.";
}
}
EXPECT_EQ(num_col_batch, dmat->Info().num_row_)
<< "Expected num batches = num_rows as max_row_perbatch is 1";
col_iter = nullptr;
// The iterator feats should ignore any numbers larger than the num_col
std::vector<xgboost::bst_uint> sub_feats = {
4, 3, static_cast<unsigned int>(dmat->Info().num_col_ + 1)};
dmlc::DataIter<xgboost::ColBatch> * sub_col_iter = dmat->ColIterator(sub_feats);
// Loop over the batches and assert the data is as expected
sub_col_iter->BeforeFirst();
while (sub_col_iter->Next()) {
EXPECT_EQ(sub_col_iter->Value().size, sub_feats.size() - 1)
<< "Expected size of a batch = number of columns in subset "
<< "as max_row_perbatch is 1.";
}
sub_col_iter = nullptr;
}

View File

@@ -7,8 +7,9 @@
TEST(SparsePageDMatrix, MetaInfo) {
std::string tmp_file = CreateSimpleTestData();
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(
tmp_file + "#" + tmp_file + ".cache", true, false);
tmp_file + "#" + tmp_file + ".cache", false, false);
std::remove(tmp_file.c_str());
std::cout << tmp_file << std::endl;
EXPECT_TRUE(FileExists(tmp_file + ".cache"));
// Test the metadata that was parsed
@@ -29,16 +30,16 @@ TEST(SparsePageDMatrix, RowAccess) {
std::remove(tmp_file.c_str());
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
dmlc::DataIter<xgboost::RowBatch> * row_iter = dmat->RowIterator();
auto row_iter = dmat->RowIterator();
// Loop over the batches and count the records
long row_count = 0;
row_iter->BeforeFirst();
while (row_iter->Next()) row_count += row_iter->Value().size;
while (row_iter->Next()) row_count += row_iter->Value().Size();
EXPECT_EQ(row_count, dmat->Info().num_row_);
// Test the data read into the first row
row_iter->BeforeFirst();
row_iter->Next();
xgboost::SparseBatch::Inst first_row = row_iter->Value()[0];
auto first_row = row_iter->Value()[0];
ASSERT_EQ(first_row.length, 3);
EXPECT_EQ(first_row[2].index, 2);
EXPECT_EQ(first_row[2].fvalue, 20);
@@ -58,7 +59,7 @@ TEST(SparsePageDMatrix, ColAcess) {
EXPECT_EQ(dmat->HaveColAccess(true), false);
const std::vector<bool> enable(dmat->Info().num_col_, true);
dmat->InitColAccess(enable, 1, 1, true); // Max 1 row per patch
dmat->InitColAccess(1, true); // Max 1 row per patch
ASSERT_EQ(dmat->HaveColAccess(true), true);
EXPECT_TRUE(FileExists(tmp_file + ".cache.col.page"));
@@ -67,31 +68,19 @@ TEST(SparsePageDMatrix, ColAcess) {
EXPECT_EQ(dmat->GetColDensity(0), 1);
EXPECT_EQ(dmat->GetColDensity(1), 0.5);
dmlc::DataIter<xgboost::ColBatch> * col_iter = dmat->ColIterator();
auto col_iter = dmat->ColIterator();
// Loop over the batches and assert the data is as expected
long num_col_batch = 0;
col_iter->BeforeFirst();
while (col_iter->Next()) {
num_col_batch += 1;
EXPECT_EQ(col_iter->Value().size, dmat->Info().num_col_)
EXPECT_EQ(col_iter->Value().Size(), dmat->Info().num_col_)
<< "Expected batch size to be same as num_cols as max_row_perbatch is 1.";
}
EXPECT_EQ(num_col_batch, dmat->Info().num_row_)
<< "Expected num batches to be same as num_rows as max_row_perbatch is 1";
col_iter = nullptr;
std::vector<xgboost::bst_uint> sub_feats = {4, 3};
dmlc::DataIter<xgboost::ColBatch> * sub_col_iter = dmat->ColIterator(sub_feats);
// Loop over the batches and assert the data is as expected
sub_col_iter->BeforeFirst();
while (sub_col_iter->Next()) {
EXPECT_EQ(sub_col_iter->Value().size, sub_feats.size())
<< "Expected size of a batch to be same as number of columns "
<< "as max_row_perbatch was set to 1.";
}
sub_col_iter = nullptr;
// Clean up of external memory files
std::remove((tmp_file + ".cache").c_str());
std::remove((tmp_file + ".cache.col.page").c_str());
std::remove((tmp_file + ".cache.row.page").c_str());