/*! * Copyright 2021-2022 XGBoost contributors */ #include #include #include "../../../src/common/column_matrix.h" #include "../../../src/data/gradient_index.h" #include "../helpers.h" namespace xgboost { namespace data { TEST(GradientIndex, ExternalMemory) { std::unique_ptr dmat = CreateSparsePageDMatrix(10000); std::vector base_rowids; std::vector hessian(dmat->Info().num_row_, 1); for (auto const &page : dmat->GetBatches({64, hessian, true})) { base_rowids.push_back(page.base_rowid); } size_t i = 0; for (auto const &page : dmat->GetBatches()) { ASSERT_EQ(base_rowids[i], page.base_rowid); ++i; } base_rowids.clear(); for (auto const &page : dmat->GetBatches({64, hessian, false})) { base_rowids.push_back(page.base_rowid); } i = 0; for (auto const &page : dmat->GetBatches()) { ASSERT_EQ(base_rowids[i], page.base_rowid); ++i; } } TEST(GradientIndex, FromCategoricalBasic) { size_t constexpr kRows = 1000, kCats = 13, kCols = 1; size_t max_bins = 8; auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats); auto m = GetDMatrixFromData(x, kRows, 1); auto &h_ft = m->Info().feature_types.HostVector(); h_ft.resize(kCols, FeatureType::kCategorical); BatchParam p(max_bins, 0.8); GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, common::OmpGetNumThreads(0), {}); auto x_copy = x; std::sort(x_copy.begin(), x_copy.end()); auto n_uniques = std::unique(x_copy.begin(), x_copy.end()) - x_copy.begin(); ASSERT_EQ(n_uniques, kCats); auto const &h_cut_ptr = gidx.cut.Ptrs(); auto const &h_cut_values = gidx.cut.Values(); ASSERT_EQ(h_cut_ptr.size(), 2); ASSERT_EQ(h_cut_values.size(), kCats); auto const &index = gidx.index; for (size_t i = 0; i < x.size(); ++i) { auto bin = index[i]; auto bin_value = h_cut_values.at(bin); ASSERT_EQ(common::AsCat(x[i]), common::AsCat(bin_value)); } } TEST(GradientIndex, PushBatch) { size_t constexpr kRows = 64, kCols = 4; bst_bin_t max_bins = 64; float st = 0.5; auto test = [&](float sparisty) { auto m = RandomDataGenerator{kRows, kCols, sparisty}.GenerateDMatrix(true); auto cuts = common::SketchOnDMatrix(m.get(), max_bins, common::OmpGetNumThreads(0), false, {}); common::HistogramCuts copy_cuts = cuts; ASSERT_EQ(m->Info().num_row_, kRows); ASSERT_EQ(m->Info().num_col_, kCols); GHistIndexMatrix gmat{m->Info(), std::move(copy_cuts), max_bins}; for (auto const &page : m->GetBatches()) { SparsePageAdapterBatch batch{page.GetView()}; gmat.PushAdapterBatch(m->Ctx(), 0, 0, batch, std::numeric_limits::quiet_NaN(), {}, st, m->Info().num_row_); gmat.PushAdapterBatchColumns(m->Ctx(), batch, std::numeric_limits::quiet_NaN(), 0); } for (auto const &page : m->GetBatches(BatchParam{max_bins, st})) { for (size_t i = 0; i < kRows; ++i) { for (size_t j = 0; j < kCols; ++j) { auto v0 = gmat.GetFvalue(i, j, false); auto v1 = page.GetFvalue(i, j, false); if (sparisty == 0.0) { ASSERT_FALSE(std::isnan(v0)); } if (!std::isnan(v0)) { ASSERT_EQ(v0, v1); } } } } }; test(0.0f); test(0.5f); test(0.9f); } } // namespace data } // namespace xgboost