diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala
index bb23ba50a..f5466df10 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala
@@ -171,4 +171,31 @@ class MissingValueHandlingSuite extends FunSuite with PerTest {
     val model = new XGBoostClassifier(paramMap).fit(inputDF)
     model.transform(inputDF).collect()
   }
+
+  // https://github.com/dmlc/xgboost/pull/5929
+  test("handle the empty last row correctly with a missing value as 0") {
+    val spark = ss
+    import spark.implicits._
+    // Spark uses 1.5 * (nnz + 1.0) < size as the condition to decide whether to build
+    // a sparse or a dense vector, so the all-zero last row becomes an empty sparse vector.
+    val testDF = Seq(
+      (7.0f, 0.0f, -1.0f, 1.0f, 1.0),
+      (1.0f, 0.0f, 1.0f, 1.0f, 1.0),
+      (0.0f, 1.0f, 0.0f, 1.0f, 0.0),
+      (1.0f, 0.0f, 1.0f, 1.0f, 1.0),
+      (1.0f, -1.0f, 0.0f, 1.0f, 0.0),
+      (0.0f, 0.0f, 0.0f, 1.0f, 1.0),
+      (0.0f, 0.0f, 0.0f, 0.0f, 0.0)
+    ).toDF("col1", "col2", "col3", "col4", "label")
+    val vectorAssembler = new VectorAssembler()
+      .setInputCols(Array("col1", "col2", "col3", "col4"))
+      .setOutputCol("features")
+    val inputDF = vectorAssembler.transform(testDF).select("features", "label")
+    inputDF.show()
+    val paramMap = List("eta" -> "1", "max_depth" -> "2",
+      "objective" -> "binary:logistic", "missing" -> 0.0f,
+      "num_workers" -> 1, "allow_non_zero_for_missing" -> "true").toMap
+    val model = new XGBoostClassifier(paramMap).fit(inputDF)
+    model.transform(inputDF).collect()
+  }
 }
diff --git a/src/data/data.cc b/src/data/data.cc
index a835e9e17..401a35081 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -833,9 +833,9 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
   uint64_t max_columns = 0;
 
   // First-pass over the batch counting valid elements
-  size_t num_lines = batch.Size();
+  size_t batch_size = batch.Size();
 #pragma omp parallel for schedule(static)
-  for (omp_ulong i = 0; i < static_cast<omp_ulong>(num_lines);
+  for (omp_ulong i = 0; i < static_cast<omp_ulong>(batch_size);
        ++i) {  // NOLINT(*)
     int tid = omp_get_thread_num();
     auto line = batch.GetLine(i);
@@ -847,7 +847,7 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
       size_t key = element.row_idx - base_rowid;
       // Adapter row index is absolute, here we want it relative to
       // current page
-      CHECK_GE(key, builder_base_row_offset); 
+      CHECK_GE(key, builder_base_row_offset);
       builder.AddBudget(key, tid);
     }
   }
@@ -856,7 +856,7 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
 
   // Second pass over batch, placing elements in correct position
 #pragma omp parallel for schedule(static)
-  for (omp_ulong i = 0; i < static_cast<omp_ulong>(num_lines);
+  for (omp_ulong i = 0; i < static_cast<omp_ulong>(batch_size);
        ++i) {  // NOLINT(*)
     int tid = omp_get_thread_num();
     auto line = batch.GetLine(i);
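With `missing` set to 0, the all-zero last row in the Scala test above contributes no elements at all when the DMatrix is assembled, and that is the case the C++ changes below have to survive. The following sketch is a standalone illustration (the `Entry` struct and `BuildOffsets` function are hypothetical names, not code from the patch) of why CSR offsets that are grown only for rows with surviving elements silently lose trailing empty rows.

#include <cassert>
#include <cstddef>
#include <vector>

// Stand-in for one COO element produced by an adapter batch.
struct Entry {
  std::size_t row;
  float value;
};

// Grow CSR row offsets only for rows that keep at least one element after the
// missing-value filter; rows after the last non-empty row never show up.
std::vector<std::size_t> BuildOffsets(const std::vector<Entry>& entries,
                                      float missing) {
  std::vector<std::size_t> offsets{0};
  for (const auto& e : entries) {
    if (e.value == missing) {  // the real code also handles NaN as missing
      continue;
    }
    while (offsets.size() < e.row + 2) {  // extend only up to this row
      offsets.push_back(offsets.back());
    }
    ++offsets.back();
  }
  return offsets;
}

int main() {
  // Three rows; row 2 holds only zeros, so with missing == 0 it is skipped entirely.
  std::vector<Entry> entries{{0, 7.0f}, {0, -1.0f}, {1, 1.0f}, {2, 0.0f}};
  auto offsets = BuildOffsets(entries, 0.0f);
  // offsets == {0, 2, 3}: only two rows are described, the empty third is lost.
  assert(offsets.size() - 1 == 2);
  return 0;
}

In the real change the lost rows are recovered from the adapter's accumulated batch size instead, which is what the simple_dmatrix.cc hunks below do.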
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index c002076a6..f054ff64a 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -6,6 +6,7 @@
  */
 #include <algorithm>
 #include <limits>
+#include <type_traits>
 #include <vector>
 #include "xgboost/data.h"
@@ -103,6 +104,8 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
   auto& offset_vec = sparse_page_.offset.HostVector();
   auto& data_vec = sparse_page_.data.HostVector();
   uint64_t inferred_num_columns = 0;
+  uint64_t total_batch_size = 0;
+  // batch.Size() is either the number of rows or the number of columns, depending on data layout
   adapter->BeforeFirst();
   // Iterate over batches of input data
@@ -110,6 +113,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
     auto& batch = adapter->Value();
     auto batch_max_columns = sparse_page_.Push(batch, missing, nthread);
     inferred_num_columns = std::max(batch_max_columns, inferred_num_columns);
+    total_batch_size += batch.Size();
     // Append meta information if available
     if (batch.Labels() != nullptr) {
       auto& labels = info_.labels_.HostVector();
@@ -153,16 +157,30 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
     info_.num_col_ = adapter->NumColumns();
   }
 
+  // Synchronise worker columns
   rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
   if (adapter->NumRows() == kAdapterUnknownSize) {
-    info_.num_row_ = offset_vec.size() - 1;
+    using IteratorAdapterT
+      = IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>;
+    // If AdapterT is either IteratorAdapter or FileAdapter type, use the total batch size to
+    // determine the correct number of rows, as offset_vec may be too short
+    if (std::is_same<AdapterT, IteratorAdapterT>::value
+        || std::is_same<AdapterT, FileAdapter>::value) {
+      info_.num_row_ = total_batch_size;
+      // Ensure offset_vec.size() - 1 == [number of rows]
+      while (offset_vec.size() - 1 < total_batch_size) {
+        offset_vec.emplace_back(offset_vec.back());
+      }
+    } else {
+      CHECK((std::is_same<AdapterT, CSCAdapter>::value)) << "Expecting CSCAdapter";
+      info_.num_row_ = offset_vec.size() - 1;
+    }
   } else {
     if (offset_vec.empty()) {
       offset_vec.emplace_back(0);
     }
-
     while (offset_vec.size() - 1 < adapter->NumRows()) {
       offset_vec.emplace_back(offset_vec.back());
     }
diff --git a/tests/cpp/data/test_adapter.cc b/tests/cpp/data/test_adapter.cc
index de8353584..bb8d8b627 100644
--- a/tests/cpp/data/test_adapter.cc
+++ b/tests/cpp/data/test_adapter.cc
@@ -26,12 +26,13 @@ TEST(Adapter, CSRAdapter) {
   EXPECT_EQ(line0.GetElement(1).value, 2);
 
   auto line1 = batch.GetLine(1);
-  EXPECT_EQ(line1 .GetElement(0).value, 3);
-  EXPECT_EQ(line1 .GetElement(1).value, 4);
+  EXPECT_EQ(line1.GetElement(0).value, 3);
+  EXPECT_EQ(line1.GetElement(1).value, 4);
+
   auto line2 = batch.GetLine(2);
-  EXPECT_EQ(line2 .GetElement(0).value, 5);
-  EXPECT_EQ(line2 .GetElement(0).row_idx, 2);
-  EXPECT_EQ(line2 .GetElement(0).column_idx, 1);
+  EXPECT_EQ(line2.GetElement(0).value, 5);
+  EXPECT_EQ(line2.GetElement(0).row_idx, 2);
+  EXPECT_EQ(line2.GetElement(0).column_idx, 1);
 }
 
 TEST(Adapter, CSCAdapterColsMoreThanRows) {
@@ -73,10 +74,11 @@ class CSRIterForTest {
   std::vector<std::remove_pointer<decltype(std::declval<XGBoostBatchCSR>().index)>::type>
       feature_idx_ {0, 1, 0, 1, 1};
   std::vector<std::remove_pointer<decltype(std::declval<XGBoostBatchCSR>().offset)>::type>
-      row_ptr_ {0, 2, 4, 5};
+      row_ptr_ {0, 2, 4, 5, 5};
   size_t iter_ {0};
 
  public:
+  size_t static constexpr kRows { 4 };  // Test for the last row being empty
   size_t static constexpr kCols { 13 };  // Test for having some missing columns
   XGBoostBatchCSR Next() {
@@ -88,7 +90,7 @@ class CSRIterForTest {
     batch.offset = dmlc::BeginPtr(row_ptr_);
     batch.index = dmlc::BeginPtr(feature_idx_);
     batch.value = dmlc::BeginPtr(data_);
-    batch.size = 3;
+    batch.size = kRows;
 
     batch.label = nullptr;
     batch.weight = nullptr;
@@ -117,16 +119,23 @@ int CSRSetDataNextForTest(DataIterHandle data_handle,
   }
 }
 
-TEST(Adapter, IteratorAdaper) {
+TEST(Adapter, IteratorAdapter) {
   CSRIterForTest iter;
   data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext,
                         XGBoostBatchCSR> adapter{&iter, CSRSetDataNextForTest};
-  constexpr size_t kRows { 6 };
+  constexpr size_t kRows { 8 };
   std::unique_ptr<DMatrix> data {
       DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1) };
   ASSERT_EQ(data->Info().num_col_, CSRIterForTest::kCols);
   ASSERT_EQ(data->Info().num_row_, kRows);
+  int num_batch = 0;
+  for (auto const& batch : data->GetBatches<SparsePage>()) {
+    ASSERT_EQ(batch.offset.HostVector(), std::vector<bst_row_t>({0, 2, 4, 5, 5, 7, 9, 10, 10}));
+    ++num_batch;
+  }
+  ASSERT_EQ(num_batch, 1);
 }
+
 }  // namespace xgboost
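The offsets asserted in the IteratorAdapter test above come from two batches of four rows each (row_ptr_ {0, 2, 4, 5, 5}, so the last row of every batch is empty). Below is a minimal standalone sketch of the padding introduced in simple_dmatrix.cc; the starting offset vector is an assumed pre-fix state, not something quoted from the library.

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  // Assumed state after pushing two 4-row batches whose last rows are empty:
  // the empty row at the end of batch one becomes visible once batch two adds
  // elements, but the empty row at the very end is still missing, so only
  // seven of the eight rows are described.
  std::vector<std::size_t> offset_vec{0, 2, 4, 5, 5, 7, 9, 10};
  std::size_t total_batch_size = 4 + 4;  // batch.Size() summed over batches

  // The same padding loop as in the patch: repeat the final offset until the
  // vector describes exactly total_batch_size rows.
  while (offset_vec.size() - 1 < total_batch_size) {
    offset_vec.emplace_back(offset_vec.back());
  }

  assert(offset_vec.size() - 1 == total_batch_size);
  assert((offset_vec ==
          std::vector<std::size_t>{0, 2, 4, 5, 5, 7, 9, 10, 10}));
  return 0;
}

Deriving the row count from the accumulated batch sizes keeps the fix limited to the file and iterator adapters; for CSCAdapter, where a batch is a column rather than a row, the else branch keeps computing the row count from offset_vec.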
diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc
index 691dc8545..8fdd0d09f 100644
--- a/tests/cpp/data/test_simple_dmatrix.cc
+++ b/tests/cpp/data/test_simple_dmatrix.cc
@@ -185,16 +185,22 @@ TEST(SimpleDMatrix, FromCSC) {
 TEST(SimpleDMatrix, FromFile) {
   std::string filename = "test.libsvm";
   CreateBigTestData(filename, 3 * 5);
+  // Add an empty row at the end of the matrix
+  {
+    std::ofstream fo(filename, std::ios::app | std::ios::out);
+    fo << "0\n";
+  }
+  constexpr size_t kExpectedNumRow = 6;
   std::unique_ptr<dmlc::Parser<uint32_t>> parser(
       dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
 
-  auto verify_batch = [](SparsePage const &batch) {
-    EXPECT_EQ(batch.Size(), 5);
+  auto verify_batch = [kExpectedNumRow](SparsePage const &batch) {
+    EXPECT_EQ(batch.Size(), kExpectedNumRow);
     EXPECT_EQ(batch.offset.HostVector(),
-              std::vector<bst_row_t>({0, 3, 6, 9, 12, 15}));
+              std::vector<bst_row_t>({0, 3, 6, 9, 12, 15, 15}));
     EXPECT_EQ(batch.base_rowid, 0);
 
-    for (auto i = 0ull; i < batch.Size(); i++) {
+    for (auto i = 0ull; i < batch.Size() - 1; i++) {
       if (i % 2 == 0) {
         EXPECT_EQ(batch[i][0].index, 0);
         EXPECT_EQ(batch[i][1].index, 1);