[BLOCKING] Handle empty rows in data iterators correctly (#5929)

* [jvm-packages] Handle empty rows in data iterators correctly

* Fix clang-tidy error

* last empty row

* Add comments [skip ci]

Co-authored-by: Nan Zhu <nanzhu@uber.com>
This commit is contained in:
Philip Hyunsu Cho
2020-07-25 13:46:19 -07:00
committed by GitHub
parent a4de2f68e4
commit 487ab0ce73
5 changed files with 79 additions and 19 deletions

View File

@@ -26,12 +26,13 @@ TEST(Adapter, CSRAdapter) {
EXPECT_EQ(line0.GetElement(1).value, 2);
auto line1 = batch.GetLine(1);
EXPECT_EQ(line1 .GetElement(0).value, 3);
EXPECT_EQ(line1 .GetElement(1).value, 4);
EXPECT_EQ(line1.GetElement(0).value, 3);
EXPECT_EQ(line1.GetElement(1).value, 4);
auto line2 = batch.GetLine(2);
EXPECT_EQ(line2 .GetElement(0).value, 5);
EXPECT_EQ(line2 .GetElement(0).row_idx, 2);
EXPECT_EQ(line2 .GetElement(0).column_idx, 1);
EXPECT_EQ(line2.GetElement(0).value, 5);
EXPECT_EQ(line2.GetElement(0).row_idx, 2);
EXPECT_EQ(line2.GetElement(0).column_idx, 1);
}
TEST(Adapter, CSCAdapterColsMoreThanRows) {
@@ -73,10 +74,11 @@ class CSRIterForTest {
std::vector<std::remove_pointer<decltype(std::declval<XGBoostBatchCSR>().index)>::type>
feature_idx_ {0, 1, 0, 1, 1};
std::vector<std::remove_pointer<decltype(std::declval<XGBoostBatchCSR>().offset)>::type>
row_ptr_ {0, 2, 4, 5};
row_ptr_ {0, 2, 4, 5, 5};
size_t iter_ {0};
public:
size_t static constexpr kRows { 4 }; // Test for the last row being empty
size_t static constexpr kCols { 13 }; // Test for having some missing columns
XGBoostBatchCSR Next() {
@@ -88,7 +90,7 @@ class CSRIterForTest {
batch.offset = dmlc::BeginPtr(row_ptr_);
batch.index = dmlc::BeginPtr(feature_idx_);
batch.value = dmlc::BeginPtr(data_);
batch.size = 3;
batch.size = kRows;
batch.label = nullptr;
batch.weight = nullptr;
@@ -117,16 +119,23 @@ int CSRSetDataNextForTest(DataIterHandle data_handle,
}
}
TEST(Adapter, IteratorAdaper) {
TEST(Adapter, IteratorAdapter) {
CSRIterForTest iter;
data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext,
XGBoostBatchCSR> adapter{&iter, CSRSetDataNextForTest};
constexpr size_t kRows { 6 };
constexpr size_t kRows { 8 };
std::unique_ptr<DMatrix> data {
DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)
};
ASSERT_EQ(data->Info().num_col_, CSRIterForTest::kCols);
ASSERT_EQ(data->Info().num_row_, kRows);
int num_batch = 0;
for (auto const& batch : data->GetBatches<SparsePage>()) {
ASSERT_EQ(batch.offset.HostVector(), std::vector<bst_row_t>({0, 2, 4, 5, 5, 7, 9, 10, 10}));
++num_batch;
}
ASSERT_EQ(num_batch, 1);
}
} // namespace xgboost

View File

@@ -185,16 +185,22 @@ TEST(SimpleDMatrix, FromCSC) {
TEST(SimpleDMatrix, FromFile) {
std::string filename = "test.libsvm";
CreateBigTestData(filename, 3 * 5);
// Add an empty row at the end of the matrix
{
std::ofstream fo(filename, std::ios::app | std::ios::out);
fo << "0\n";
}
constexpr size_t kExpectedNumRow = 6;
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
auto verify_batch = [](SparsePage const &batch) {
EXPECT_EQ(batch.Size(), 5);
auto verify_batch = [kExpectedNumRow](SparsePage const &batch) {
EXPECT_EQ(batch.Size(), kExpectedNumRow);
EXPECT_EQ(batch.offset.HostVector(),
std::vector<bst_row_t>({0, 3, 6, 9, 12, 15}));
std::vector<bst_row_t>({0, 3, 6, 9, 12, 15, 15}));
EXPECT_EQ(batch.base_rowid, 0);
for (auto i = 0ull; i < batch.Size(); i++) {
for (auto i = 0ull; i < batch.Size() - 1; i++) {
if (i % 2 == 0) {
EXPECT_EQ(batch[i][0].index, 0);
EXPECT_EQ(batch[i][1].index, 1);