/** * Copyright 2018-2024, XGBoost Contributors */ #include #include #include #include // for size_t #include #include #include "../../../src/tree/common_row_partitioner.h" #include "../../../src/tree/hist/expand_entry.h" // for MultiExpandEntry, CPUExpandEntry #include "../collective/test_worker.h" // for TestDistributedGlobal #include "../helpers.h" #include "test_column_split.h" // for TestColumnSplit #include "test_partitioner.h" #include "xgboost/data.h" namespace xgboost::tree { namespace { template void TestPartitioner(bst_target_t n_targets) { std::size_t n_samples = 1024, base_rowid = 0; bst_feature_t n_features = 1; Context ctx; ctx.InitAllowUnknown(Args{}); CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false}; ASSERT_EQ(partitioner.base_rowid, base_rowid); ASSERT_EQ(partitioner.Size(), 1); ASSERT_EQ(partitioner.Partitions()[0].Size(), n_samples); auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true); std::vector candidates{{0, 0}}; candidates.front().split.loss_chg = 0.4; auto cuts = common::SketchOnDMatrix(&ctx, Xy.get(), 64); for (auto const& page : Xy->GetBatches()) { GHistIndexMatrix gmat(page, {}, cuts, 64, true, 0.5, ctx.Threads()); bst_feature_t const split_ind = 0; common::ColumnMatrix column_indices; column_indices.InitFromSparse(page, gmat, 0.5, ctx.Threads()); { auto min_value = gmat.cut.MinValues()[split_ind]; RegTree tree{n_targets, n_features}; CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false}; if constexpr (std::is_same_v) { GetSplit(&tree, min_value, &candidates); } else { GetMultiSplitForTest(&tree, min_value, &candidates); } partitioner.UpdatePosition(&ctx, gmat, column_indices, candidates, &tree); ASSERT_EQ(partitioner.Size(), 3); ASSERT_EQ(partitioner[1].Size(), 0); ASSERT_EQ(partitioner[2].Size(), n_samples); } { CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false}; auto ptr = gmat.cut.Ptrs()[split_ind + 1]; float split_value = gmat.cut.Values().at(ptr / 2); RegTree tree{n_targets, n_features}; if constexpr (std::is_same::value) { GetSplit(&tree, split_value, &candidates); } else { GetMultiSplitForTest(&tree, split_value, &candidates); } partitioner.UpdatePosition(&ctx, gmat, column_indices, candidates, &tree); { auto left_nidx = tree.LeftChild(RegTree::kRoot); auto const& elem = partitioner[left_nidx]; ASSERT_LT(elem.Size(), n_samples); ASSERT_GT(elem.Size(), 1); for (auto& it : elem) { auto value = gmat.cut.Values().at(gmat.index[it]); ASSERT_LE(value, split_value); } } { auto right_nidx = tree.RightChild(RegTree::kRoot); auto const& elem = partitioner[right_nidx]; for (auto& it : elem) { auto value = gmat.cut.Values().at(gmat.index[it]); ASSERT_GT(value, split_value); } } } } } } // anonymous namespace TEST(QuantileHist, Partitioner) { TestPartitioner(1); } TEST(QuantileHist, MultiPartitioner) { TestPartitioner(3); } namespace { template void VerifyColumnSplitPartitioner(bst_target_t n_targets, size_t n_samples, bst_feature_t n_features, size_t base_rowid, std::shared_ptr Xy, float min_value, float mid_value, CommonRowPartitioner const& expected_mid_partitioner) { auto dmat = std::unique_ptr{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())}; Context ctx; ctx.InitAllowUnknown(Args{}); std::vector candidates{{0, 0}}; candidates.front().split.loss_chg = 0.4; auto cuts = common::SketchOnDMatrix(&ctx, dmat.get(), 64); for (auto const& page : Xy->GetBatches()) { GHistIndexMatrix gmat(page, {}, cuts, 64, true, 0.5, ctx.Threads()); common::ColumnMatrix column_indices; column_indices.InitFromSparse(page, gmat, 0.5, ctx.Threads()); { RegTree tree{n_targets, n_features}; CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true}; if constexpr (std::is_same::value) { GetSplit(&tree, min_value, &candidates); } else { GetMultiSplitForTest(&tree, min_value, &candidates); } partitioner.UpdatePosition(&ctx, gmat, column_indices, candidates, &tree); ASSERT_EQ(partitioner.Size(), 3); ASSERT_EQ(partitioner[1].Size(), 0); ASSERT_EQ(partitioner[2].Size(), n_samples); } { RegTree tree{n_targets, n_features}; CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true}; if constexpr (std::is_same::value) { GetSplit(&tree, mid_value, &candidates); } else { GetMultiSplitForTest(&tree, mid_value, &candidates); } auto left_nidx = tree.LeftChild(RegTree::kRoot); partitioner.UpdatePosition(&ctx, gmat, column_indices, candidates, &tree); { auto const& elem = partitioner[left_nidx]; ASSERT_LT(elem.Size(), n_samples); ASSERT_GT(elem.Size(), 1); auto const& expected_elem = expected_mid_partitioner[left_nidx]; ASSERT_EQ(elem.Size(), expected_elem.Size()); for (auto it = elem.begin(), eit = expected_elem.begin(); it != elem.end(); ++it, ++eit) { ASSERT_EQ(*it, *eit); } } { auto right_nidx = tree.RightChild(RegTree::kRoot); auto const& elem = partitioner[right_nidx]; auto const& expected_elem = expected_mid_partitioner[right_nidx]; ASSERT_EQ(elem.Size(), expected_elem.Size()); for (auto it = elem.begin(), eit = expected_elem.begin(); it != elem.end(); ++it, ++eit) { ASSERT_EQ(*it, *eit); } } } } } template void TestColumnSplitPartitioner(bst_target_t n_targets) { std::size_t n_samples = 1024, base_rowid = 0; bst_feature_t n_features = 16; auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true); std::vector candidates{{0, 0}}; candidates.front().split.loss_chg = 0.4; Context ctx; ctx.InitAllowUnknown(Args{}); auto cuts = common::SketchOnDMatrix(&ctx, Xy.get(), 64); float min_value, mid_value; CommonRowPartitioner mid_partitioner{&ctx, n_samples, base_rowid, false}; for (auto const& page : Xy->GetBatches()) { GHistIndexMatrix gmat(page, {}, cuts, 64, true, 0.5, ctx.Threads()); bst_feature_t const split_ind = 0; common::ColumnMatrix column_indices; column_indices.InitFromSparse(page, gmat, 0.5, ctx.Threads()); min_value = gmat.cut.MinValues()[split_ind]; auto ptr = gmat.cut.Ptrs()[split_ind + 1]; mid_value = gmat.cut.Values().at(ptr / 2); RegTree tree{n_targets, n_features}; if constexpr (std::is_same::value) { GetSplit(&tree, mid_value, &candidates); } else { GetMultiSplitForTest(&tree, mid_value, &candidates); } mid_partitioner.UpdatePosition(&ctx, gmat, column_indices, candidates, &tree); } auto constexpr kWorkers = 4; collective::TestDistributedGlobal(kWorkers, [&] { VerifyColumnSplitPartitioner(n_targets, n_samples, n_features, base_rowid, Xy, min_value, mid_value, mid_partitioner); }); } } // anonymous namespace TEST(QuantileHist, PartitionerColSplit) { TestColumnSplitPartitioner(1); } TEST(QuantileHist, MultiPartitionerColSplit) { TestColumnSplitPartitioner(3); } namespace { class TestHistColSplit : public ::testing::TestWithParam> { public: void Run() { auto [n_targets, categorical, sparsity] = GetParam(); TestColumnSplit(n_targets, categorical, "grow_quantile_histmaker", sparsity); } }; } // anonymous namespace TEST_P(TestHistColSplit, Basic) { this->Run(); } INSTANTIATE_TEST_SUITE_P(ColumnSplit, TestHistColSplit, ::testing::ValuesIn([]() { std::vector> params; for (auto categorical : {true, false}) { for (auto sparsity : {0.0f, 0.6f}) { for (bst_target_t n_targets : {1u, 3u}) { params.emplace_back(n_targets, categorical, sparsity); } } } return params; }())); } // namespace xgboost::tree