/** * Copyright 2021-2024, XGBoost contributors. */ #include #include // for TreeUpdater #include // for transform #include // for unique_ptr #include // for vector #include "../../../src/tree/common_row_partitioner.h" #include "../../../src/tree/param.h" // for TrainParam #include "../collective/test_worker.h" // for TestDistributedGlobal #include "../helpers.h" #include "test_column_split.h" // for TestColumnSplit #include "test_partitioner.h" #include "xgboost/tree_model.h" // for RegTree namespace xgboost::tree { namespace { std::vector GenerateHess(size_t n_samples) { auto grad = GenerateRandomGradients(n_samples); std::vector hess(grad.Size()); std::transform(grad.HostVector().cbegin(), grad.HostVector().cend(), hess.begin(), [](auto gpair) { return gpair.GetHess(); }); return hess; } } // anonymous namespace TEST(Approx, Partitioner) { size_t n_samples = 1024, n_features = 1, base_rowid = 0; Context ctx; ctx.InitAllowUnknown(Args{}); CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false}; ASSERT_EQ(partitioner.base_rowid, base_rowid); ASSERT_EQ(partitioner.Size(), 1); ASSERT_EQ(partitioner.Partitions()[0].Size(), n_samples); auto const Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true); auto hess = GenerateHess(n_samples); std::vector candidates{{0, 0}}; candidates.front().split.loss_chg = 0.4; for (auto const& page : Xy->GetBatches(&ctx, {64, hess, true})) { bst_feature_t const split_ind = 0; { auto min_value = page.cut.MinValues()[split_ind]; RegTree tree; CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false}; GetSplit(&tree, min_value, &candidates); partitioner.UpdatePosition(&ctx, page, candidates, &tree); ASSERT_EQ(partitioner.Size(), 3); ASSERT_EQ(partitioner[1].Size(), 0); ASSERT_EQ(partitioner[2].Size(), n_samples); } { CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false}; auto ptr = page.cut.Ptrs()[split_ind + 1]; float split_value = page.cut.Values().at(ptr / 2); RegTree tree; GetSplit(&tree, split_value, &candidates); partitioner.UpdatePosition(&ctx, page, candidates, &tree); { auto left_nidx = tree[RegTree::kRoot].LeftChild(); auto const& elem = partitioner[left_nidx]; ASSERT_LT(elem.Size(), n_samples); ASSERT_GT(elem.Size(), 1); for (auto& it : elem) { auto value = page.cut.Values().at(page.index[it]); ASSERT_LE(value, split_value); } } { auto right_nidx = tree[RegTree::kRoot].RightChild(); auto const& elem = partitioner[right_nidx]; for (auto& it : elem) { auto value = page.cut.Values().at(page.index[it]); ASSERT_GT(value, split_value) << it; } } } } } TEST(Approx, InteractionConstraint) { auto constexpr kRows = 32; auto constexpr kCols = 16; auto p_dmat = GenerateCatDMatrix(kRows, kCols, 0.6f, false); Context ctx; linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); ObjInfo task{ObjInfo::kRegression}; { // With constraints RegTree tree{1, kCols}; std::unique_ptr updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)}; TrainParam param; param.UpdateAllowUnknown( Args{{"interaction_constraints", "[[0, 1]]"}, {"num_feature", std::to_string(kCols)}}); std::vector> position(1); updater->Configure(Args{}); updater->Update(¶m, &gpair, p_dmat.get(), position, {&tree}); ASSERT_EQ(tree.NumExtraNodes(), 4); ASSERT_EQ(tree[0].SplitIndex(), 1); ASSERT_EQ(tree[tree[0].LeftChild()].SplitIndex(), 0); ASSERT_EQ(tree[tree[0].RightChild()].SplitIndex(), 0); } { // Without constraints RegTree tree{1u, kCols}; std::unique_ptr updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)}; std::vector> position(1); TrainParam param; param.Init(Args{}); updater->Configure(Args{}); updater->Update(¶m, &gpair, p_dmat.get(), position, {&tree}); ASSERT_EQ(tree.NumExtraNodes(), 10); ASSERT_EQ(tree[0].SplitIndex(), 1); ASSERT_NE(tree[tree[0].LeftChild()].SplitIndex(), 0); ASSERT_NE(tree[tree[0].RightChild()].SplitIndex(), 0); } } namespace { void TestColumnSplitPartitioner(size_t n_samples, size_t base_rowid, std::shared_ptr Xy, std::vector* hess, float min_value, float mid_value, CommonRowPartitioner const& expected_mid_partitioner) { auto dmat = std::unique_ptr{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())}; std::vector candidates{{0, 0}}; candidates.front().split.loss_chg = 0.4; Context ctx; ctx.InitAllowUnknown(Args{}); for (auto const& page : dmat->GetBatches(&ctx, {64, *hess, true})) { { RegTree tree; CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true}; GetSplit(&tree, min_value, &candidates); partitioner.UpdatePosition(&ctx, page, candidates, &tree); ASSERT_EQ(partitioner.Size(), 3); ASSERT_EQ(partitioner[1].Size(), 0); ASSERT_EQ(partitioner[2].Size(), n_samples); } { CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true}; RegTree tree; GetSplit(&tree, mid_value, &candidates); partitioner.UpdatePosition(&ctx, page, candidates, &tree); { auto left_nidx = tree[RegTree::kRoot].LeftChild(); auto const& elem = partitioner[left_nidx]; ASSERT_LT(elem.Size(), n_samples); ASSERT_GT(elem.Size(), 1); auto const& expected_elem = expected_mid_partitioner[left_nidx]; ASSERT_EQ(elem.Size(), expected_elem.Size()); for (auto it = elem.begin(), eit = expected_elem.begin(); it != elem.end(); ++it, ++eit) { ASSERT_EQ(*it, *eit); } } { auto right_nidx = tree[RegTree::kRoot].RightChild(); auto const& elem = partitioner[right_nidx]; auto const& expected_elem = expected_mid_partitioner[right_nidx]; ASSERT_EQ(elem.Size(), expected_elem.Size()); for (auto it = elem.begin(), eit = expected_elem.begin(); it != elem.end(); ++it, ++eit) { ASSERT_EQ(*it, *eit); } } } } } } // anonymous namespace TEST(Approx, PartitionerColumnSplit) { size_t n_samples = 1024, n_features = 16, base_rowid = 0; auto const Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true); auto hess = GenerateHess(n_samples); std::vector candidates{{0, 0}}; candidates.front().split.loss_chg = 0.4; float min_value, mid_value; Context ctx; ctx.InitAllowUnknown(Args{}); CommonRowPartitioner mid_partitioner{&ctx, n_samples, base_rowid, false}; for (auto const& page : Xy->GetBatches(&ctx, {64, hess, true})) { bst_feature_t const split_ind = 0; min_value = page.cut.MinValues()[split_ind]; auto ptr = page.cut.Ptrs()[split_ind + 1]; mid_value = page.cut.Values().at(ptr / 2); RegTree tree; GetSplit(&tree, mid_value, &candidates); mid_partitioner.UpdatePosition(&ctx, page, candidates, &tree); } auto constexpr kWorkers = 4; collective::TestDistributedGlobal(kWorkers, [&] { TestColumnSplitPartitioner(n_samples, base_rowid, Xy, &hess, min_value, mid_value, mid_partitioner); }); } namespace { class TestApproxColumnSplit : public ::testing::TestWithParam> { public: void Run() { auto [categorical, sparsity] = GetParam(); TestColumnSplit(1u, categorical, "grow_histmaker", sparsity); } }; } // namespace TEST_P(TestApproxColumnSplit, Basic) { this->Run(); } INSTANTIATE_TEST_SUITE_P(ColumnSplit, TestApproxColumnSplit, ::testing::ValuesIn([]() { std::vector> params; for (auto categorical : {true, false}) { for (auto sparsity : {0.0f, 0.6f}) { params.emplace_back(categorical, sparsity); } } return params; }())); } // namespace xgboost::tree