DMatrix refactor stage 2 (#3395)
* DMatrix refactor 2
* Remove buffered rowset usage where possible
* Transition to C++11 style iterators for row access
* Transition column iterators to C++11
This commit is contained in:
@@ -20,10 +20,7 @@ TEST(c_api, XGDMatrixCreateFromMatDT) {
|
||||
ASSERT_EQ(info.num_row_, 3);
|
||||
ASSERT_EQ(info.num_nonzero_, 6);
|
||||
|
||||
auto iter = (*dmat)->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
auto batch = iter->Value();
|
||||
for (const auto &batch : (*dmat)->GetRowBatches()) {
|
||||
ASSERT_EQ(batch[0][0].fvalue, 0.0f);
|
||||
ASSERT_EQ(batch[0][1].fvalue, -4.0f);
|
||||
ASSERT_EQ(batch[2][0].fvalue, 3.0f);
|
||||
@@ -55,10 +52,7 @@ TEST(c_api, XGDMatrixCreateFromMat_omp) {
|
||||
ASSERT_EQ(info.num_row_, row);
|
||||
ASSERT_EQ(info.num_nonzero_, num_cols * row - num_missing);
|
||||
|
||||
auto iter = (*dmat)->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
auto batch = iter->Value();
|
||||
for (const auto &batch : (*dmat)->GetRowBatches()) {
|
||||
for (int i = 0; i < batch.Size(); i++) {
|
||||
auto inst = batch[i];
|
||||
for (int j = 0; i < inst.size(); i++) {
|
||||
|
||||
@@ -37,13 +37,9 @@ TEST(gpu_hist_util, TestDeviceSketch) {
|
||||
hmat_cpu.Init((*dmat).get(), p.max_bin);
|
||||
|
||||
// find the cuts on the GPU
|
||||
dmlc::DataIter<SparsePage>* iter = (*dmat)->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
CHECK(iter->Next());
|
||||
const SparsePage& batch = iter->Value();
|
||||
const SparsePage& batch = *(*dmat)->GetRowBatches().begin();
|
||||
HistCutMatrix hmat_gpu;
|
||||
DeviceSketch(batch, (*dmat)->Info(), p, &hmat_gpu);
|
||||
CHECK(!iter->Next());
|
||||
|
||||
// compare the cuts
|
||||
double eps = 1e-2;
|
||||
|
||||
@@ -123,12 +123,9 @@ TEST(MetaInfo, LoadQid) {
|
||||
xgboost::Entry(2, 0), xgboost::Entry(3, 0), xgboost::Entry(4, 0.4),
|
||||
xgboost::Entry(5, 1), xgboost::Entry(1, 0), xgboost::Entry(2, 1),
|
||||
xgboost::Entry(3, 1), xgboost::Entry(4, 0.5), {5, 0}};
|
||||
dmlc::DataIter<xgboost::SparsePage>* iter = dmat->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
CHECK(iter->Next());
|
||||
const xgboost::SparsePage& batch = iter->Value();
|
||||
CHECK_EQ(batch.base_rowid, 0);
|
||||
CHECK(batch.offset.HostVector() == expected_offset);
|
||||
CHECK(batch.data.HostVector() == expected_data);
|
||||
CHECK(!iter->Next());
|
||||
for (const auto &batch : dmat->GetRowBatches()) {
|
||||
CHECK_EQ(batch.base_rowid, 0);
|
||||
CHECK(batch.offset.HostVector() == expected_offset);
|
||||
CHECK(batch.data.HostVector() == expected_data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,18 +18,17 @@ TEST(SimpleCSRSource, SaveLoadBinary) {
|
||||
EXPECT_EQ(dmat->Info().num_row_, dmat_read->Info().num_row_);
|
||||
EXPECT_EQ(dmat->Info().num_row_, dmat_read->Info().num_row_);
|
||||
|
||||
auto row_iter = dmat->RowIterator();
|
||||
auto row_iter_read = dmat_read->RowIterator();
|
||||
// Test we have non-empty batch
|
||||
EXPECT_EQ(dmat->GetRowBatches().begin().AtEnd(), false);
|
||||
|
||||
auto row_iter = dmat->GetRowBatches().begin();
|
||||
auto row_iter_read = dmat_read->GetRowBatches().begin();
|
||||
// Test the data read into the first row
|
||||
row_iter->BeforeFirst(); row_iter->Next();
|
||||
row_iter_read->BeforeFirst(); row_iter_read->Next();
|
||||
auto first_row = row_iter->Value()[0];
|
||||
auto first_row_read = row_iter_read->Value()[0];
|
||||
auto first_row = (*row_iter)[0];
|
||||
auto first_row_read = (*row_iter_read)[0];
|
||||
EXPECT_EQ(first_row.size(), first_row_read.size());
|
||||
EXPECT_EQ(first_row[2].index, first_row_read[2].index);
|
||||
EXPECT_EQ(first_row[2].fvalue, first_row_read[2].fvalue);
|
||||
row_iter = nullptr; row_iter_read = nullptr;
|
||||
|
||||
delete dmat;
|
||||
delete dmat_read;
|
||||
}
|
||||
|
||||
@@ -23,20 +23,18 @@ TEST(SimpleDMatrix, RowAccess) {
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file, false, false);
|
||||
std::remove(tmp_file.c_str());
|
||||
|
||||
auto row_iter = dmat->RowIterator();
|
||||
// Loop over the batches and count the records
|
||||
long row_count = 0;
|
||||
row_iter->BeforeFirst();
|
||||
while (row_iter->Next()) row_count += row_iter->Value().Size();
|
||||
for (auto &batch : dmat->GetRowBatches()) {
|
||||
row_count += batch.Size();
|
||||
}
|
||||
EXPECT_EQ(row_count, dmat->Info().num_row_);
|
||||
// Test the data read into the first row
|
||||
row_iter->BeforeFirst();
|
||||
row_iter->Next();
|
||||
auto first_row = row_iter->Value()[0];
|
||||
auto &batch = *dmat->GetRowBatches().begin();
|
||||
auto first_row = batch[0];
|
||||
ASSERT_EQ(first_row.size(), 3);
|
||||
EXPECT_EQ(first_row[2].index, 2);
|
||||
EXPECT_EQ(first_row[2].fvalue, 20);
|
||||
row_iter = nullptr;
|
||||
|
||||
delete dmat;
|
||||
}
|
||||
@@ -46,40 +44,18 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file, true, false);
|
||||
std::remove(tmp_file.c_str());
|
||||
|
||||
// Unsorted column access
|
||||
const std::vector<bool> enable(dmat->Info().num_col_, true);
|
||||
EXPECT_EQ(dmat->HaveColAccess(false), false);
|
||||
dmat->InitColAccess(dmat->Info().num_row_, false);
|
||||
dmat->InitColAccess(0, false); // Calling it again should not change it
|
||||
ASSERT_EQ(dmat->HaveColAccess(false), true);
|
||||
|
||||
// Sorted column access
|
||||
EXPECT_EQ(dmat->HaveColAccess(true), false);
|
||||
dmat->InitColAccess(dmat->Info().num_row_, true);
|
||||
dmat->InitColAccess(0, true); // Calling it again should not change it
|
||||
ASSERT_EQ(dmat->HaveColAccess(true), true);
|
||||
|
||||
EXPECT_EQ(dmat->GetColSize(0), 2);
|
||||
EXPECT_EQ(dmat->GetColSize(1), 1);
|
||||
EXPECT_EQ(dmat->GetColDensity(0), 1);
|
||||
EXPECT_EQ(dmat->GetColDensity(1), 0.5);
|
||||
ASSERT_TRUE(dmat->SingleColBlock());
|
||||
|
||||
auto* col_iter = dmat->ColIterator();
|
||||
// Loop over the batches and assert the data is as expected
|
||||
long num_col_batch = 0;
|
||||
col_iter->BeforeFirst();
|
||||
while (col_iter->Next()) {
|
||||
for (const auto &batch : dmat->GetSortedColumnBatches()) {
|
||||
num_col_batch += 1;
|
||||
EXPECT_EQ(col_iter->Value().Size(), dmat->Info().num_col_)
|
||||
<< "Expected batch size = number of cells as #batches is 1.";
|
||||
for (int i = 0; i < static_cast<int>(col_iter->Value().Size()); ++i) {
|
||||
EXPECT_EQ(col_iter->Value()[i].size(), dmat->GetColSize(i))
|
||||
<< "Expected length of each colbatch = colsize as #batches is 1.";
|
||||
}
|
||||
EXPECT_EQ(batch.Size(), dmat->Info().num_col_)
|
||||
<< "Expected batch size = number of cells as #batches is 1.";
|
||||
}
|
||||
EXPECT_EQ(num_col_batch, 1) << "Expected number of batches to be 1";
|
||||
col_iter = nullptr;
|
||||
|
||||
delete dmat;
|
||||
}
|
||||
|
||||
@@ -8,7 +8,6 @@ TEST(SparsePageDMatrix, MetaInfo) {
|
||||
std::string tmp_file = CreateSimpleTestData();
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", false, false);
|
||||
std::remove(tmp_file.c_str());
|
||||
std::cout << tmp_file << std::endl;
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache"));
|
||||
|
||||
@@ -19,6 +18,7 @@ TEST(SparsePageDMatrix, MetaInfo) {
|
||||
EXPECT_EQ(dmat->Info().labels_.Size(), dmat->Info().num_row_);
|
||||
|
||||
// Clean up of external memory files
|
||||
std::remove(tmp_file.c_str());
|
||||
std::remove((tmp_file + ".cache").c_str());
|
||||
std::remove((tmp_file + ".cache.row.page").c_str());
|
||||
|
||||
@@ -26,26 +26,26 @@ TEST(SparsePageDMatrix, MetaInfo) {
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, RowAccess) {
|
||||
std::string tmp_file = CreateSimpleTestData();
|
||||
// Create sufficiently large data to make two row pages
|
||||
std::string tmp_file = CreateBigTestData(5000000);
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", true, false);
|
||||
std::remove(tmp_file.c_str());
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
|
||||
|
||||
auto row_iter = dmat->RowIterator();
|
||||
// Loop over the batches and count the records
|
||||
long row_count = 0;
|
||||
row_iter->BeforeFirst();
|
||||
while (row_iter->Next()) row_count += row_iter->Value().Size();
|
||||
for (auto &batch : dmat->GetRowBatches()) {
|
||||
row_count += batch.Size();
|
||||
}
|
||||
EXPECT_EQ(row_count, dmat->Info().num_row_);
|
||||
|
||||
// Test the data read into the first row
|
||||
row_iter->BeforeFirst();
|
||||
row_iter->Next();
|
||||
auto first_row = row_iter->Value()[0];
|
||||
auto &batch = *dmat->GetRowBatches().begin();
|
||||
auto first_row = batch[0];
|
||||
ASSERT_EQ(first_row.size(), 3);
|
||||
EXPECT_EQ(first_row[2].index, 2);
|
||||
EXPECT_EQ(first_row[2].fvalue, 20);
|
||||
row_iter = nullptr;
|
||||
|
||||
// Clean up of external memory files
|
||||
std::remove((tmp_file + ".cache").c_str());
|
||||
@@ -59,35 +59,33 @@ TEST(SparsePageDMatrix, ColAccess) {
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", true, false);
|
||||
std::remove(tmp_file.c_str());
|
||||
EXPECT_FALSE(FileExists(tmp_file + ".cache.col.page"));
|
||||
|
||||
EXPECT_EQ(dmat->HaveColAccess(true), false);
|
||||
const std::vector<bool> enable(dmat->Info().num_col_, true);
|
||||
dmat->InitColAccess(1, true); // Max 1 row per patch
|
||||
ASSERT_EQ(dmat->HaveColAccess(true), true);
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.col.page"));
|
||||
|
||||
EXPECT_EQ(dmat->GetColSize(0), 2);
|
||||
EXPECT_EQ(dmat->GetColSize(1), 1);
|
||||
EXPECT_EQ(dmat->GetColDensity(0), 1);
|
||||
EXPECT_EQ(dmat->GetColDensity(1), 0.5);
|
||||
|
||||
auto col_iter = dmat->ColIterator();
|
||||
// Loop over the batches and assert the data is as expected
|
||||
long num_col_batch = 0;
|
||||
col_iter->BeforeFirst();
|
||||
while (col_iter->Next()) {
|
||||
num_col_batch += 1;
|
||||
EXPECT_EQ(col_iter->Value().Size(), dmat->Info().num_col_)
|
||||
<< "Expected batch size to be same as num_cols as max_row_perbatch is 1.";
|
||||
for (auto col_batch : dmat->GetSortedColumnBatches()) {
|
||||
EXPECT_EQ(col_batch.Size(), dmat->Info().num_col_);
|
||||
EXPECT_EQ(col_batch[1][0].fvalue, 10.0f);
|
||||
EXPECT_EQ(col_batch[1].size(), 1);
|
||||
}
|
||||
EXPECT_EQ(num_col_batch, dmat->Info().num_row_)
|
||||
<< "Expected num batches to be same as num_rows as max_row_perbatch is 1";
|
||||
col_iter = nullptr;
|
||||
|
||||
// Loop over the batches and assert the data is as expected
|
||||
for (auto col_batch : dmat->GetColumnBatches()) {
|
||||
EXPECT_EQ(col_batch.Size(), dmat->Info().num_col_);
|
||||
EXPECT_EQ(col_batch[1][0].fvalue, 10.0f);
|
||||
EXPECT_EQ(col_batch[1].size(), 1);
|
||||
}
|
||||
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache"));
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.col.page"));
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.sorted.col.page"));
|
||||
|
||||
std::remove((tmp_file + ".cache").c_str());
|
||||
std::remove((tmp_file + ".cache.col.page").c_str());
|
||||
std::remove((tmp_file + ".cache.row.page").c_str());
|
||||
std::remove((tmp_file + ".cache.col.page").c_str());
|
||||
std::remove((tmp_file + ".cache.sorted.col.page").c_str());
|
||||
|
||||
delete dmat;
|
||||
}
|
||||
|
||||
@@ -8,8 +8,6 @@ typedef std::pair<std::string, std::string> arg;
|
||||
TEST(Linear, shotgun) {
|
||||
typedef std::pair<std::string, std::string> arg;
|
||||
auto mat = CreateDMatrix(10, 10, 0);
|
||||
std::vector<bool> enabled((*mat)->Info().num_col_, true);
|
||||
(*mat)->InitColAccess(1 << 16, false);
|
||||
auto updater = std::unique_ptr<xgboost::LinearUpdater>(
|
||||
xgboost::LinearUpdater::Create("shotgun"));
|
||||
updater->Init({{"eta", "1."}});
|
||||
@@ -29,8 +27,6 @@ TEST(Linear, shotgun) {
|
||||
TEST(Linear, coordinate) {
|
||||
typedef std::pair<std::string, std::string> arg;
|
||||
auto mat = CreateDMatrix(10, 10, 0);
|
||||
std::vector<bool> enabled((*mat)->Info().num_col_, true);
|
||||
(*mat)->InitColAccess(1 << 16, false);
|
||||
auto updater = std::unique_ptr<xgboost::LinearUpdater>(
|
||||
xgboost::LinearUpdater::Create("coord_descent"));
|
||||
updater->Init({{"eta", "1."}});
|
||||
|
||||
@@ -32,7 +32,7 @@ TEST(cpu_predictor, Test) {
|
||||
}
|
||||
|
||||
// Test predict instance
|
||||
auto batch = (*dmat)->RowIterator()->Value();
|
||||
auto &batch = *(*dmat)->GetRowBatches().begin();
|
||||
for (int i = 0; i < batch.Size(); i++) {
|
||||
std::vector<float> instance_out_predictions;
|
||||
cpu_predictor->PredictInstance(batch[i], &instance_out_predictions, model);
|
||||
|
||||
@@ -45,7 +45,7 @@ TEST(gpu_predictor, Test) {
|
||||
abs_tolerance);
|
||||
}
|
||||
// Test predict instance
|
||||
auto batch = (*dmat)->RowIterator()->Value();
|
||||
const auto &batch = *(*dmat)->GetRowBatches().begin();
|
||||
for (int i = 0; i < batch.Size(); i++) {
|
||||
std::vector<float> gpu_instance_out_predictions;
|
||||
std::vector<float> cpu_instance_out_predictions;
|
||||
|
||||
@@ -24,14 +24,10 @@ TEST(gpu_hist_experimental, TestSparseShard) {
|
||||
TrainParam p;
|
||||
p.max_depth = 6;
|
||||
|
||||
dmlc::DataIter<SparsePage>* iter = (*dmat)->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
CHECK(iter->Next());
|
||||
const SparsePage& batch = iter->Value();
|
||||
const SparsePage& batch = *(*dmat)->GetRowBatches().begin();
|
||||
DeviceShard shard(0, 0, 0, rows, p);
|
||||
shard.InitRowPtrs(batch);
|
||||
shard.InitCompressedData(gmat.cut, batch);
|
||||
CHECK(!iter->Next());
|
||||
|
||||
ASSERT_LT(shard.row_stride, columns);
|
||||
|
||||
@@ -65,15 +61,10 @@ TEST(gpu_hist_experimental, TestDenseShard) {
|
||||
TrainParam p;
|
||||
p.max_depth = 6;
|
||||
|
||||
dmlc::DataIter<SparsePage>* iter = (*dmat)->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
CHECK(iter->Next());
|
||||
const SparsePage& batch = iter->Value();
|
||||
|
||||
const SparsePage& batch = *(*dmat)->GetRowBatches().begin();
|
||||
DeviceShard shard(0, 0, 0, rows, p);
|
||||
shard.InitRowPtrs(batch);
|
||||
shard.InitCompressedData(gmat.cut, batch);
|
||||
CHECK(!iter->Next());
|
||||
|
||||
ASSERT_EQ(shard.row_stride, columns);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user