/**
 * Copyright 2021-2023 by XGBoost Contributors
 */
#include "../test_evaluate_splits.h"

#include <gtest/gtest.h>
#include <xgboost/base.h>        // for GradientPairPrecise, Args, Gradie...
#include <xgboost/context.h>     // for Context
#include <xgboost/data.h>        // for FeatureType, DMatrix, MetaInfo
#include <xgboost/linalg.h>      // for Vector
#include <xgboost/logging.h>     // for CHECK_EQ
#include <xgboost/tree_model.h>  // for RegTree, RTreeNodeStat

#include <algorithm>  // for copy, transform
#include <memory>     // for make_shared, shared_ptr, addressof
#include <numeric>    // for iota
#include <vector>     // for vector

#include "../../../../src/common/hist_util.h"           // for HistCollection, HistogramCuts
#include "../../../../src/common/random.h"              // for ColumnSampler
#include "../../../../src/common/row_set.h"             // for RowSetCollection
#include "../../../../src/data/gradient_index.h"        // for GHistIndexMatrix
#include "../../../../src/tree/hist/evaluate_splits.h"  // for HistEvaluator
#include "../../../../src/tree/hist/expand_entry.h"     // for CPUExpandEntry
#include "../../../../src/tree/hist/hist_cache.h"       // for BoundedHistCollection
#include "../../../../src/tree/hist/param.h"            // for HistMakerTrainParam
#include "../../../../src/tree/param.h"                 // for GradStats, TrainParam
#include "../../helpers.h"                              // for RandomDataGenerator, AllThreadsFo...

namespace xgboost::tree {
void TestEvaluateSplits(bool force_read_by_column) {
  Context ctx;
  ctx.nthread = 4;
  int static constexpr kRows = 8, kCols = 16;
  auto sampler = std::make_shared<common::ColumnSampler>(1u);
  TrainParam param;
  param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}});
  auto dmat = RandomDataGenerator(kRows, kCols, 0).Seed(3).GenerateDMatrix();

  auto evaluator = HistEvaluator{&ctx, &param, dmat->Info(), sampler};
  BoundedHistCollection hist;

  std::vector<GradientPair> row_gpairs = {
      {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f},  {2.27f, 0.28f},
      {0.27f, 0.29f}, {0.37f, 0.39f}, {-0.47f, 0.49f}, {0.57f, 0.59f}};

  size_t constexpr kMaxBins = 4;
  // dense, no missing values
  GHistIndexMatrix gmat(&ctx, dmat.get(), kMaxBins, 0.5, false);
  common::RowSetCollection row_set_collection;
  std::vector<size_t> &row_indices = *row_set_collection.Data();
  row_indices.resize(kRows);
  std::iota(row_indices.begin(), row_indices.end(), 0);
  row_set_collection.Init();

  HistMakerTrainParam hist_param;
  hist.Reset(gmat.cut.Ptrs().back(), hist_param.max_cached_hist_node);
  hist.AllocateHistograms({0});
  // The matrix is dense, so take the no-missing-value path.
  common::BuildHist<false>(row_gpairs, row_set_collection[0], gmat, hist[0],
                           force_read_by_column);

  // Compute total gradient for all data points
  GradientPairPrecise total_gpair;
  for (const auto &e : row_gpairs) {
    total_gpair += GradientPairPrecise(e);
  }

  RegTree tree;
  std::vector<CPUExpandEntry> entries(1);
  entries.front().nid = 0;
  entries.front().depth = 0;

  evaluator.InitRoot(GradStats{total_gpair});
  evaluator.EvaluateSplits(hist, gmat.cut, {}, tree, &entries);

  auto best_loss_chg =
      evaluator.Evaluator().CalcSplitGain(param, 0, entries.front().split.SplitIndex(),
                                          entries.front().split.left_sum,
                                          entries.front().split.right_sum) -
      evaluator.Stats().front().root_gain;
  ASSERT_EQ(entries.front().split.loss_chg, best_loss_chg);
  ASSERT_GT(entries.front().split.loss_chg, 16.2f);

  // Assert that's the best split
  for (size_t i = 1; i < gmat.cut.Ptrs().size(); ++i) {
    GradStats left, right;
    for (size_t j = gmat.cut.Ptrs()[i - 1]; j < gmat.cut.Ptrs()[i]; ++j) {
      auto loss_chg = evaluator.Evaluator().CalcSplitGain(param, 0, i - 1, left, right) -
                      evaluator.Stats().front().root_gain;
      ASSERT_GE(best_loss_chg, loss_chg);
      left.Add(hist[0][j].GetGrad(), hist[0][j].GetHess());
      right.SetSubstract(GradStats{total_gpair}, left);
    }
  }
}

TEST(HistEvaluator, Evaluate) {
  TestEvaluateSplits(false);
  TestEvaluateSplits(true);
}
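
/**
 * Multi-target case: both targets share an identical hand-built histogram, so per
 * target the root sum is {1.5, 1.0} and, with reg_lambda=0, the expected root weight
 * is -G/H = -1.5. Splitting feature 0 at its first cut gives a gain of
 * 0.25/0.5 + 4.0/0.5 - 2.25 = 6.25 per target, i.e. 12.5 summed over the two targets,
 * matching the assertions below.
 */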
"0"}, {"reg_lambda", "0"}}); auto sampler = std::make_shared(1u); std::size_t n_samples = 3; bst_feature_t n_features = 2; bst_target_t n_targets = 2; bst_bin_t n_bins = 2; auto p_fmat = RandomDataGenerator{n_samples, n_features, 0.5}.Targets(n_targets).GenerateDMatrix(true); HistMultiEvaluator evaluator{&ctx, p_fmat->Info(), ¶m, sampler}; HistMakerTrainParam hist_param; std::vector histogram(n_targets); linalg::Vector root_sum({2}, DeviceOrd::CPU()); for (bst_target_t t{0}; t < n_targets; ++t) { auto &hist = histogram[t]; hist.Reset(n_bins * n_features, hist_param.max_cached_hist_node); hist.AllocateHistograms({0}); auto node_hist = hist[0]; node_hist[0] = {-0.5, 0.5}; node_hist[1] = {2.0, 0.5}; node_hist[2] = {0.5, 0.5}; node_hist[3] = {1.0, 0.5}; root_sum(t) += node_hist[0]; root_sum(t) += node_hist[1]; } RegTree tree{n_targets, n_features}; auto weight = evaluator.InitRoot(root_sum.HostView()); tree.SetLeaf(RegTree::kRoot, weight.HostView()); auto w = weight.HostView(); ASSERT_EQ(w.Size(), n_targets); ASSERT_EQ(w(0), -1.5); ASSERT_EQ(w(1), -1.5); common::HistogramCuts cuts; cuts.cut_ptrs_ = {0, 2, 4}; cuts.cut_values_ = {0.5, 1.0, 2.0, 3.0}; cuts.min_vals_ = {-0.2, 1.8}; std::vector entries(1, {/*nidx=*/0, /*depth=*/0}); std::vector ptrs; std::transform(histogram.cbegin(), histogram.cend(), std::back_inserter(ptrs), [](auto const &h) { return std::addressof(h); }); evaluator.EvaluateSplits(tree, ptrs, cuts, &entries); ASSERT_EQ(entries.front().split.loss_chg, 12.5); ASSERT_EQ(entries.front().split.split_value, 0.5); ASSERT_EQ(entries.front().split.SplitIndex(), 0); ASSERT_EQ(sampler->GetFeatureSet(0)->Size(), n_features); } TEST(HistEvaluator, Apply) { Context ctx; ctx.nthread = 4; RegTree tree; int static constexpr kNRows = 8, kNCols = 16; TrainParam param; param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0.0"}}); auto dmat = RandomDataGenerator(kNRows, kNCols, 0).Seed(3).GenerateDMatrix(); auto sampler = std::make_shared(1u); auto evaluator_ = HistEvaluator{&ctx, ¶m, dmat->Info(), sampler}; CPUExpandEntry entry{0, 0}; entry.split.loss_chg = 10.0f; entry.split.left_sum = GradStats{0.4, 0.6f}; entry.split.right_sum = GradStats{0.5, 0.5f}; evaluator_.ApplyTreeSplit(entry, &tree); ASSERT_EQ(tree.NumExtraNodes(), 2); ASSERT_EQ(tree.Stat(tree[0].LeftChild()).sum_hess, 0.6f); ASSERT_EQ(tree.Stat(tree[0].RightChild()).sum_hess, 0.5f); { RegTree tree; entry.split.is_cat = true; entry.split.split_value = 1.0; evaluator_.ApplyTreeSplit(entry, &tree); auto l = entry.split.left_sum; ASSERT_NEAR(tree[1].LeafValue(), -l.sum_grad / l.sum_hess * param.learning_rate, kRtEps); ASSERT_NEAR(tree[2].LeafValue(), -param.learning_rate, kRtEps); } } TEST_F(TestPartitionBasedSplit, CPUHist) { Context ctx; // check the evaluator is returning the optimal split std::vector ft{FeatureType::kCategorical}; auto sampler = std::make_shared(1u); HistEvaluator evaluator{&ctx, ¶m_, info_, sampler}; evaluator.InitRoot(GradStats{total_gpair_}); RegTree tree; std::vector entries(1); evaluator.EvaluateSplits(hist_, cuts_, {ft}, tree, &entries); ASSERT_NEAR(entries[0].split.loss_chg, best_score_, 1e-16); } namespace { auto CompareOneHotAndPartition(bool onehot) { Context ctx; int static constexpr kRows = 128, kCols = 1; std::vector ft(kCols, FeatureType::kCategorical); TrainParam param; if (onehot) { // force use one-hot param.UpdateAllowUnknown( Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}, {"max_cat_to_onehot", "100"}}); } else { param.UpdateAllowUnknown( Args{{"min_child_weight", "0"}, 
{"reg_lambda", "0"}, {"max_cat_to_onehot", "1"}}); } size_t n_cats{2}; auto dmat = RandomDataGenerator(kRows, kCols, 0).Seed(3).Type(ft).MaxCategory(n_cats).GenerateDMatrix(); auto sampler = std::make_shared(1u); auto evaluator = HistEvaluator{&ctx, ¶m, dmat->Info(), sampler}; std::vector entries(1); HistMakerTrainParam hist_param; for (auto const &gmat : dmat->GetBatches(&ctx, {32, param.sparse_threshold})) { BoundedHistCollection hist; entries.front().nid = 0; entries.front().depth = 0; hist.Reset(gmat.cut.TotalBins(), hist_param.max_cached_hist_node); hist.AllocateHistograms({0}); auto node_hist = hist[0]; CHECK_EQ(node_hist.size(), n_cats); CHECK_EQ(node_hist.size(), gmat.cut.Ptrs().back()); GradientPairPrecise total_gpair; for (size_t i = 0; i < node_hist.size(); ++i) { node_hist[i] = {static_cast(node_hist.size() - i), 1.0}; total_gpair += node_hist[i]; } RegTree tree; evaluator.InitRoot(GradStats{total_gpair}); evaluator.EvaluateSplits(hist, gmat.cut, ft, tree, &entries); } return entries.front(); } } // anonymous namespace TEST(HistEvaluator, Categorical) { auto with_onehot = CompareOneHotAndPartition(true); auto with_part = CompareOneHotAndPartition(false); ASSERT_EQ(with_onehot.split.loss_chg, with_part.split.loss_chg); } TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) { BoundedHistCollection hist; HistMakerTrainParam hist_param; hist.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node); hist.AllocateHistograms({0}); auto node_hist = hist[0]; ASSERT_EQ(node_hist.size(), feature_histogram_.size()); std::copy(feature_histogram_.cbegin(), feature_histogram_.cend(), node_hist.begin()); auto sampler = std::make_shared(1u); MetaInfo info; info.num_col_ = 1; info.feature_types = {FeatureType::kCategorical}; Context ctx; auto evaluator = HistEvaluator{&ctx, ¶m_, info, sampler}; evaluator.InitRoot(GradStats{parent_sum_}); std::vector entries(1); RegTree tree; evaluator.EvaluateSplits(hist, cuts_, info.feature_types.ConstHostSpan(), tree, &entries); auto const &split = entries.front().split; this->CheckResult(split.loss_chg, split.SplitIndex(), split.split_value, split.is_cat, split.DefaultLeft(), GradientPairPrecise{split.left_sum.GetGrad(), split.left_sum.GetHess()}, GradientPairPrecise{split.right_sum.GetGrad(), split.right_sum.GetHess()}); } } // namespace xgboost::tree