diff --git a/src/data/data.cc b/src/data/data.cc
index a3b0c250e..d52ff4110 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -376,7 +376,7 @@ void SparsePage::PushCSC(const SparsePage &batch) {
   std::vector<size_t> offset(other_offset.size());
   offset[0] = 0;
 
-  std::vector<Entry> data(self_data.size() + batch.data.Size());
+  std::vector<Entry> data(self_data.size() + other_data.size());
 
   // n_cols in original csr data matrix, here in csc is n_rows
   size_t const n_features = other_offset.size() - 1;
@@ -385,7 +385,11 @@
   for (size_t i = 0; i < n_features; ++i) {
     size_t const self_beg = self_offset.at(i);
     size_t const self_length = self_offset.at(i+1) - self_beg;
-    CHECK_LT(beg, data.size());
+    // It is possible that the current feature and further features aren't referenced
+    // in any rows accumulated thus far. It is also possible for this to happen
+    // in the current sparse page row batch as well.
+    // Hence, the incremental number of rows may stay constant thus equaling the data size
+    CHECK_LE(beg, data.size());
     std::memcpy(dmlc::BeginPtr(data)+beg,
                 dmlc::BeginPtr(self_data) + self_beg,
                 sizeof(Entry) * self_length);
@@ -393,7 +397,7 @@ void SparsePage::PushCSC(const SparsePage &batch) {
 
     size_t const other_beg = other_offset.at(i);
     size_t const other_length = other_offset.at(i+1) - other_beg;
-    CHECK_LT(beg, data.size());
+    CHECK_LE(beg, data.size());
     std::memcpy(dmlc::BeginPtr(data)+beg,
                 dmlc::BeginPtr(other_data) + other_beg,
                 sizeof(Entry) * other_length);
diff --git a/tests/cpp/data/test_data.cc b/tests/cpp/data/test_data.cc
index e2cdebc0c..0858ef7a6 100644
--- a/tests/cpp/data/test_data.cc
+++ b/tests/cpp/data/test_data.cc
@@ -2,6 +2,7 @@
 #include <gtest/gtest.h>
 
 #include "xgboost/data.h"
+#include "../helpers.h"
 
 namespace xgboost {
 TEST(SparsePage, PushCSC) {
@@ -52,4 +53,28 @@
     ASSERT_EQ(inst[i].index, indices_sol[i % 3]);
   }
 }
+
+TEST(SparsePage, PushCSCAfterTranspose) {
+  const int n_entries = 9;
+  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(n_entries, 64UL);
+  const int ncols = dmat->Info().num_col_;
+  SparsePage page; // Consolidated sparse page
+  for (const auto &batch : dmat->GetRowBatches()) {
+    // Transpose each batch and push
+    SparsePage tmp = batch.GetTranspose(ncols);
+    page.PushCSC(tmp);
+  }
+
+  // Make sure that the final sparse page has the right number of entries
+  ASSERT_EQ(n_entries, page.data.Size());
+
+  // The feature value for a feature in each row should be identical, as that is
+  // how the dmatrix has been created
+  for (int i = 0; i < page.Size(); ++i) {
+    auto inst = page[i];
+    for (int j = 1; j < inst.size(); ++j) {
+      ASSERT_EQ(inst[0].fvalue, inst[j].fvalue);
+    }
+  }
+}
 }  // namespace xgboost