/** * Copyright 2018-2023 by XGBoost Contributors */ #include #include // for bst_bin_t #include // for Context #include // for BatchIterator, BatchSet, DMatrix, Met... #include // for size_t #include // for int32_t, uint16_t, uint8_t #include // for numeric_limits #include // for shared_ptr, __shared_ptr_access, allo... #include // for remove_reference_t #include "../../../src/common/column_matrix.h" // for ColumnMatrix, Column, DenseColumnIter #include "../../../src/common/hist_util.h" // for DispatchBinType, BinTypeSize, Index #include "../../../src/common/ref_resource_view.h" // for RefResourceView #include "../../../src/data/gradient_index.h" // for GHistIndexMatrix #include "../../../src/data/iterative_dmatrix.h" // for IterativeDMatrix #include "../../../src/tree/param.h" // for TrainParam #include "../helpers.h" // for RandomDataGenerator, NumpyArrayIterFo... namespace xgboost::common { TEST(ColumnMatrix, Basic) { int32_t max_num_bins[] = {static_cast(std::numeric_limits::max()) + 1, static_cast(std::numeric_limits::max()) + 1, static_cast(std::numeric_limits::max()) + 2}; Context ctx; BinTypeSize last{kUint8BinsTypeSize}; for (int32_t max_num_bin : max_num_bins) { auto dmat = RandomDataGenerator(100, 10, 0.0).GenerateDMatrix(); auto sparse_thresh = 0.2; GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, sparse_thresh, false}; ColumnMatrix column_matrix; for (auto const& page : dmat->GetBatches()) { column_matrix.InitFromSparse(page, gmat, sparse_thresh, ctx.Threads()); } ASSERT_GE(column_matrix.GetTypeSize(), last); ASSERT_LE(column_matrix.GetTypeSize(), kUint32BinsTypeSize); last = column_matrix.GetTypeSize(); ASSERT_FALSE(column_matrix.AnyMissing()); for (auto i = 0ull; i < dmat->Info().num_row_; i++) { for (auto j = 0ull; j < dmat->Info().num_col_; j++) { DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) { using T = decltype(dtype); auto col = column_matrix.DenseColumn(j); ASSERT_EQ(gmat.index[i * dmat->Info().num_col_ + j], col.GetGlobalBinIdx(i)); }); } } } } template void CheckSparseColumn(SparseColumnIter* p_col, const GHistIndexMatrix& gmat) { auto& col = *p_col; size_t n_samples = gmat.row_ptr.size() - 1; ASSERT_EQ(col.Size(), gmat.index.Size()); for (auto i = 0ull; i < col.Size(); i++) { ASSERT_EQ(gmat.index[gmat.row_ptr[col.GetRowIdx(i)]], col.GetGlobalBinIdx(i)); } for (auto i = 0ull; i < n_samples; i++) { if (col[i] == Column::kMissingId) { auto beg = gmat.row_ptr[i]; auto end = gmat.row_ptr[i + 1]; ASSERT_EQ(end - beg, 0); } } } TEST(ColumnMatrix, SparseColumn) { int32_t max_num_bins[] = {static_cast(std::numeric_limits::max()) + 1, static_cast(std::numeric_limits::max()) + 1, static_cast(std::numeric_limits::max()) + 2}; Context ctx; for (int32_t max_num_bin : max_num_bins) { auto dmat = RandomDataGenerator(100, 1, 0.85).GenerateDMatrix(); GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, 0.5f, false}; ColumnMatrix column_matrix; for (auto const& page : dmat->GetBatches()) { column_matrix.InitFromSparse(page, gmat, 1.0, ctx.Threads()); } common::DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) { using T = decltype(dtype); auto col = column_matrix.SparseColumn(0, 0); CheckSparseColumn(&col, gmat); }); } } template void CheckColumWithMissingValue(const DenseColumnIter& col, const GHistIndexMatrix& gmat) { for (auto i = 0ull; i < col.Size(); i++) { if (col.IsMissing(i)) { continue; } EXPECT_EQ(gmat.index[gmat.row_ptr[i]], col.GetGlobalBinIdx(i)); } } TEST(ColumnMatrix, DenseColumnWithMissing) { int32_t max_num_bins[] = {static_cast(std::numeric_limits::max()) + 1, static_cast(std::numeric_limits::max()) + 1, static_cast(std::numeric_limits::max()) + 2}; Context ctx; for (int32_t max_num_bin : max_num_bins) { auto dmat = RandomDataGenerator(100, 1, 0.5).GenerateDMatrix(); GHistIndexMatrix gmat(&ctx, dmat.get(), max_num_bin, 0.2, false); ColumnMatrix column_matrix; for (auto const& page : dmat->GetBatches()) { column_matrix.InitFromSparse(page, gmat, 0.2, ctx.Threads()); } ASSERT_TRUE(column_matrix.AnyMissing()); DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) { using T = decltype(dtype); auto col = column_matrix.DenseColumn(0); CheckColumWithMissingValue(col, gmat); }); } } TEST(ColumnMatrix, GrowMissing) { float sparsity = 0.5; NumpyArrayIterForTest iter(sparsity); auto n_threads = 0; bst_bin_t n_bins = 16; BatchParam batch{n_bins, tree::TrainParam::DftSparseThreshold()}; Context ctx; auto m = std::make_shared(&iter, iter.Proxy(), nullptr, Reset, Next, std::numeric_limits::quiet_NaN(), n_threads, n_bins); for (auto const& page : m->GetBatches(&ctx, batch)) { auto const& column_matrix = page.Transpose(); auto const& missing = column_matrix.Missing(); auto n = NumpyArrayIterForTest::Rows() * NumpyArrayIterForTest::Cols(); auto expected = std::remove_reference_t::BitFieldT::ComputeStorageSize(n); auto got = missing.storage.size(); ASSERT_EQ(expected, got); DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) { using T = decltype(dtype); auto col = column_matrix.DenseColumn(0); CheckColumWithMissingValue(col, page); }); } } } // namespace xgboost::common