Refactor DMatrix to return batches of different page types (#4686)

* Use explicit template parameter for specifying page type.
This commit is contained in:
Rong Ou
2019-08-03 12:10:34 -07:00
committed by Jiaming Yuan
parent e930a8e54f
commit 6edddd7966
41 changed files with 477 additions and 470 deletions

View File

@@ -0,0 +1,19 @@
#!/bin/bash
# Failure hook for the R package tests: print whichever R CMD check logs
# are present so they appear in the build console, then report failure.
set -e

rcheck_dir="xgboost/xgboost.Rcheck"

# Print a banner followed by the contents of a log file, if the file exists.
#   $1 - text emitted immediately before the banner (backslash escapes are
#        expanded), used to put blank lines between consecutive logs
#   $2 - path of the log file to print
dump_log() {
  local lead_in="$1"
  local log_path="$2"
  if [ -f "${log_path}" ]; then
    printf '%b===== %s ====\n' "${lead_in}" "${log_path}"
    cat "${log_path}"
  fi
}

# The install log comes first and needs no separator; the check log is
# set apart from whatever precedes it by two blank lines.
dump_log ''     "${rcheck_dir}/00install.out"
dump_log '\n\n' "${rcheck_dir}/00check.log"

# Produce error code to interrupt Jenkins pipeline
exit 1

View File

@@ -20,7 +20,7 @@ TEST(c_api, XGDMatrixCreateFromMatDT) {
ASSERT_EQ(info.num_row_, 3);
ASSERT_EQ(info.num_nonzero_, 6);
for (const auto &batch : (*dmat)->GetRowBatches()) {
for (const auto &batch : (*dmat)->GetBatches<xgboost::SparsePage>()) {
ASSERT_EQ(batch[0][0].fvalue, 0.0f);
ASSERT_EQ(batch[0][1].fvalue, -4.0f);
ASSERT_EQ(batch[2][0].fvalue, 3.0f);
@@ -52,7 +52,7 @@ TEST(c_api, XGDMatrixCreateFromMat_omp) {
ASSERT_EQ(info.num_row_, row);
ASSERT_EQ(info.num_nonzero_, num_cols * row - num_missing);
for (const auto &batch : (*dmat)->GetRowBatches()) {
for (const auto &batch : (*dmat)->GetBatches<xgboost::SparsePage>()) {
for (size_t i = 0; i < batch.Size(); i++) {
auto inst = batch[i];
for (auto e : inst) {

View File

@@ -58,7 +58,7 @@ void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) {
// compare the row stride with the one obtained from the dmatrix
size_t expected_row_stride = 0;
for (const auto &batch : dmat->get()->GetRowBatches()) {
for (const auto &batch : dmat->get()->GetBatches<xgboost::SparsePage>()) {
const auto &offset_vec = batch.offset.ConstHostVector();
for (int i = 1; i <= offset_vec.size() -1; ++i) {
expected_row_stride = std::max(expected_row_stride, offset_vec[i] - offset_vec[i-1]);

View File

@@ -61,7 +61,7 @@ TEST(SparseCuts, SingleThreadedBuild) {
HistogramCuts cuts;
SparseCuts indices(&cuts);
auto const& page = *(p_fmat->GetColumnBatches().begin());
auto const& page = *(p_fmat->GetBatches<xgboost::CSCPage>().begin());
indices.SingleThreadBuild(page, p_fmat->Info(), kBins, false, 0, page.Size(), 0);
ASSERT_EQ(hmat.cut.Ptrs().size(), cuts.Ptrs().size());
@@ -92,7 +92,7 @@ TEST(SparseCuts, MultiThreadedBuild) {
HistogramCuts container;
SparseCuts indices(&container);
auto const& page = *(p_fmat->GetColumnBatches().begin());
auto const& page = *(p_fmat->GetBatches<xgboost::CSCPage>().begin());
indices.SingleThreadBuild(page, p_fmat->Info(), kBins, false, 0, page.Size(), 0);
ASSERT_EQ(container.Ptrs().size(), threaded_container.Ptrs().size());

View File

@@ -63,7 +63,7 @@ TEST(SparsePage, PushCSCAfterTranspose) {
CreateSparsePageDMatrix(n_entries, 64UL, filename);
const int ncols = dmat->Info().num_col_;
SparsePage page; // Consolidated sparse page
for (const auto &batch : dmat->GetRowBatches()) {
for (const auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
// Transpose each batch and push
SparsePage tmp = batch.GetTranspose(ncols);
page.PushCSC(tmp);

View File

@@ -122,7 +122,7 @@ TEST(MetaInfo, LoadQid) {
xgboost::Entry(2, 0), xgboost::Entry(3, 0), xgboost::Entry(4, 0.4),
xgboost::Entry(5, 1), xgboost::Entry(1, 0), xgboost::Entry(2, 1),
xgboost::Entry(3, 1), xgboost::Entry(4, 0.5), {5, 0}};
for (const auto &batch : dmat->GetRowBatches()) {
for (const auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
CHECK_EQ(batch.base_rowid, 0);
CHECK(batch.offset.HostVector() == expected_offset);
CHECK(batch.data.HostVector() == expected_data);

View File

@@ -20,10 +20,10 @@ TEST(SimpleCSRSource, SaveLoadBinary) {
EXPECT_EQ(dmat->Info().num_row_, dmat_read->Info().num_row_);
// Test we have non-empty batch
EXPECT_EQ(dmat->GetRowBatches().begin().AtEnd(), false);
EXPECT_EQ(dmat->GetBatches<xgboost::SparsePage>().begin().AtEnd(), false);
auto row_iter = dmat->GetRowBatches().begin();
auto row_iter_read = dmat_read->GetRowBatches().begin();
auto row_iter = dmat->GetBatches<xgboost::SparsePage>().begin();
auto row_iter_read = dmat_read->GetBatches<xgboost::SparsePage>().begin();
// Test the data read into the first row
auto first_row = (*row_iter)[0];
auto first_row_read = (*row_iter_read)[0];

View File

@@ -28,12 +28,12 @@ TEST(SimpleDMatrix, RowAccess) {
// Loop over the batches and count the records
int64_t row_count = 0;
for (auto &batch : dmat->GetRowBatches()) {
for (auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
row_count += batch.Size();
}
EXPECT_EQ(row_count, dmat->Info().num_row_);
// Test the data read into the first row
auto &batch = *dmat->GetRowBatches().begin();
auto &batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
auto first_row = batch[0];
ASSERT_EQ(first_row.size(), 3);
EXPECT_EQ(first_row[2].index, 2);
@@ -55,7 +55,7 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
// Loop over the batches and assert the data is as expected
int64_t num_col_batch = 0;
for (const auto &batch : dmat->GetSortedColumnBatches()) {
for (const auto &batch : dmat->GetBatches<xgboost::SortedCSCPage>()) {
num_col_batch += 1;
EXPECT_EQ(batch.Size(), dmat->Info().num_col_)
<< "Expected batch size = number of cells as #batches is 1.";

View File

@@ -33,7 +33,7 @@ TEST(SparsePageDMatrix, RowAccess) {
xgboost::CreateSparsePageDMatrix(12, 64, filename);
// Test the data read into the first row
auto &batch = *dmat->GetRowBatches().begin();
auto &batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
auto first_row = batch[0];
ASSERT_EQ(first_row.size(), 3);
EXPECT_EQ(first_row[2].index, 2);
@@ -51,14 +51,14 @@ TEST(SparsePageDMatrix, ColAccess) {
EXPECT_EQ(dmat->GetColDensity(1), 0.5);
// Loop over the batches and assert the data is as expected
for (auto col_batch : dmat->GetSortedColumnBatches()) {
for (auto col_batch : dmat->GetBatches<xgboost::SortedCSCPage>()) {
EXPECT_EQ(col_batch.Size(), dmat->Info().num_col_);
EXPECT_EQ(col_batch[1][0].fvalue, 10.0f);
EXPECT_EQ(col_batch[1].size(), 1);
}
// Loop over the batches and assert the data is as expected
for (auto col_batch : dmat->GetColumnBatches()) {
for (auto col_batch : dmat->GetBatches<xgboost::CSCPage>()) {
EXPECT_EQ(col_batch.Size(), dmat->Info().num_col_);
EXPECT_EQ(col_batch[1][0].fvalue, 10.0f);
EXPECT_EQ(col_batch[1].size(), 1);
@@ -82,7 +82,7 @@ TEST(SparsePageDMatrix, ColAccessBatches) {
};
auto n_threads = omp_get_max_threads();
omp_set_num_threads(16);
for (auto const& page : dmat->GetColumnBatches()) {
for (auto const& page : dmat->GetBatches<xgboost::CSCPage>()) {
ASSERT_EQ(dmat->Info().num_col_, page.Size());
}
omp_set_num_threads(n_threads);

View File

@@ -157,7 +157,7 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
// Loop over the batches and count the records
int64_t batch_count = 0;
int64_t row_count = 0;
for (const auto &batch : dmat->GetRowBatches()) {
for (const auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
batch_count++;
row_count += batch.Size();
}

View File

@@ -26,7 +26,7 @@ TEST(cpu_predictor, Test) {
}
// Test predict instance
auto &batch = *(*dmat)->GetRowBatches().begin();
auto &batch = *(*dmat)->GetBatches<xgboost::SparsePage>().begin();
for (size_t i = 0; i < batch.Size(); i++) {
std::vector<float> instance_out_predictions;
cpu_predictor->PredictInstance(batch[i], &instance_out_predictions, model);

View File

@@ -76,7 +76,7 @@ template <typename GradientSumT>
void BuildGidx(DeviceShard<GradientSumT>* shard, int n_rows, int n_cols,
bst_float sparsity=0) {
auto dmat = CreateDMatrix(n_rows, n_cols, sparsity, 3);
const SparsePage& batch = *(*dmat)->GetRowBatches().begin();
const SparsePage& batch = *(*dmat)->GetBatches<xgboost::SparsePage>().begin();
HistogramCutsWrapper cmat;
cmat.SetPtrs({0, 3, 6, 9, 12, 15, 18, 21, 24});

View File

@@ -65,7 +65,7 @@ class QuantileHistMock : public QuantileHistMaker {
ASSERT_EQ(gmat.row_ptr.size(), num_row + 1);
ASSERT_LT(*std::max_element(gmat.index.begin(), gmat.index.end()),
gmat.cut.Ptrs().back());
for (const auto& batch : p_fmat->GetRowBatches()) {
for (const auto& batch : p_fmat->GetBatches<xgboost::SparsePage>()) {
for (size_t i = 0; i < batch.Size(); ++i) {
const size_t rid = batch.base_rowid + i;
ASSERT_LT(rid, num_row);