diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index 43dc4f46a..dceae5d2b 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -1,5 +1,5 @@ /** - * Copyright 2021-2023 by XGBoost Contributors + * Copyright 2021-2024, XGBoost Contributors */ #include "../test_evaluate_splits.h" @@ -10,13 +10,15 @@ #include // for CHECK_EQ #include // for RegTree, RTreeNodeStat -#include // for make_shared, shared_ptr, addressof +#include // for make_shared, shared_ptr, addressof +#include // for iota +#include // for make_tuple #include "../../../../src/common/hist_util.h" // for HistCollection, HistogramCuts #include "../../../../src/common/random.h" // for ColumnSampler #include "../../../../src/common/row_set.h" // for RowSetCollection #include "../../../../src/data/gradient_index.h" // for GHistIndexMatrix -#include "../../../../src/tree/hist/evaluate_splits.h" // for HistEvaluator +#include "../../../../src/tree/hist/evaluate_splits.h" // for HistEvaluator, TreeEvaluator #include "../../../../src/tree/hist/expand_entry.h" // for CPUExpandEntry #include "../../../../src/tree/hist/hist_cache.h" // for BoundedHistCollection #include "../../../../src/tree/hist/param.h" // for HistMakerTrainParam @@ -24,6 +26,74 @@ #include "../../helpers.h" // for RandomDataGenerator, AllThreadsFo... namespace xgboost::tree { +void TestPartitionBasedSplit::SetUp() { + param_.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}}); + sorted_idx_.resize(n_bins_); + std::iota(sorted_idx_.begin(), sorted_idx_.end(), 0); + + info_.num_col_ = 1; + + cuts_.cut_ptrs_.Resize(2); + cuts_.SetCategorical(true, n_bins_); + auto &h_cuts = cuts_.cut_ptrs_.HostVector(); + h_cuts[0] = 0; + h_cuts[1] = n_bins_; + auto &h_vals = cuts_.cut_values_.HostVector(); + h_vals.resize(n_bins_); + std::iota(h_vals.begin(), h_vals.end(), 0.0); + + cuts_.min_vals_.Resize(1); + + Context ctx; + HistMakerTrainParam hist_param; + hist_.Reset(cuts_.TotalBins(), hist_param.MaxCachedHistNodes(ctx.Device())); + hist_.AllocateHistograms({0}); + auto node_hist = hist_[0]; + + SimpleLCG lcg; + SimpleRealUniformDistribution grad_dist{-4.0, 4.0}; + SimpleRealUniformDistribution hess_dist{0.0, 4.0}; + + for (auto &e : node_hist) { + e = GradientPairPrecise{grad_dist(&lcg), hess_dist(&lcg)}; + total_gpair_ += e; + } + + auto enumerate = [this, n_feat = info_.num_col_](common::GHistRow hist, + GradientPairPrecise parent_sum) { + int32_t best_thresh = -1; + float best_score{-std::numeric_limits::infinity()}; + TreeEvaluator evaluator{param_, static_cast(n_feat), DeviceOrd::CPU()}; + auto tree_evaluator = evaluator.GetEvaluator(); + GradientPairPrecise left_sum; + auto parent_gain = tree_evaluator.CalcGain(0, param_, GradStats{total_gpair_}); + for (size_t i = 0; i < hist.size() - 1; ++i) { + left_sum += hist[i]; + auto right_sum = parent_sum - left_sum; + auto gain = + tree_evaluator.CalcSplitGain(param_, 0, 0, GradStats{left_sum}, GradStats{right_sum}) - + parent_gain; + if (gain > best_score) { + best_score = gain; + best_thresh = i; + } + } + return std::make_tuple(best_thresh, best_score); + }; + + // enumerate all possible partitions to find the optimal split + do { + std::vector sorted_hist(node_hist.size()); + for (size_t i = 0; i < sorted_hist.size(); ++i) { + sorted_hist[i] = node_hist[sorted_idx_[i]]; + } + auto [thresh, score] = enumerate({sorted_hist}, total_gpair_); + if (score > best_score_) { + best_score_ = score; + } + } while (std::next_permutation(sorted_idx_.begin(), sorted_idx_.end())); +} + void TestEvaluateSplits(bool force_read_by_column) { Context ctx; ctx.nthread = 4; diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h index c7c6854f5..bc4b70946 100644 --- a/tests/cpp/tree/test_evaluate_splits.h +++ b/tests/cpp/tree/test_evaluate_splits.h @@ -12,20 +12,15 @@ #include // for size_t #include // for int32_t, uint64_t, uint32_t #include // for numeric_limits -#include // for iota -#include // for make_tuple, tie, tuple #include // for vector #include "../../../src/common/hist_util.h" // for HistogramCuts, HistCollection, GHistRow #include "../../../src/tree/hist/hist_cache.h" // for HistogramCollection -#include "../../../src/tree/hist/param.h" // for HistMakerTrainParam #include "../../../src/tree/param.h" // for TrainParam, GradStats -#include "../../../src/tree/split_evaluator.h" // for TreeEvaluator -#include "../helpers.h" // for SimpleLCG, SimpleRealUniformDistribution namespace xgboost::tree { /** - * \brief Enumerate all possible partitions for categorical split. + * @brief Enumerate all possible partitions for categorical split. */ class TestPartitionBasedSplit : public ::testing::Test { protected: @@ -38,73 +33,7 @@ class TestPartitionBasedSplit : public ::testing::Test { BoundedHistCollection hist_; GradientPairPrecise total_gpair_; - void SetUp() override { - param_.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}}); - sorted_idx_.resize(n_bins_); - std::iota(sorted_idx_.begin(), sorted_idx_.end(), 0); - - info_.num_col_ = 1; - - cuts_.cut_ptrs_.Resize(2); - cuts_.SetCategorical(true, n_bins_); - auto &h_cuts = cuts_.cut_ptrs_.HostVector(); - h_cuts[0] = 0; - h_cuts[1] = n_bins_; - auto &h_vals = cuts_.cut_values_.HostVector(); - h_vals.resize(n_bins_); - std::iota(h_vals.begin(), h_vals.end(), 0.0); - - cuts_.min_vals_.Resize(1); - - Context ctx; - HistMakerTrainParam hist_param; - hist_.Reset(cuts_.TotalBins(), hist_param.MaxCachedHistNodes(ctx.Device())); - hist_.AllocateHistograms({0}); - auto node_hist = hist_[0]; - - SimpleLCG lcg; - SimpleRealUniformDistribution grad_dist{-4.0, 4.0}; - SimpleRealUniformDistribution hess_dist{0.0, 4.0}; - - for (auto &e : node_hist) { - e = GradientPairPrecise{grad_dist(&lcg), hess_dist(&lcg)}; - total_gpair_ += e; - } - - auto enumerate = [this, n_feat = info_.num_col_](common::GHistRow hist, - GradientPairPrecise parent_sum) { - int32_t best_thresh = -1; - float best_score{-std::numeric_limits::infinity()}; - TreeEvaluator evaluator{param_, static_cast(n_feat), DeviceOrd::CPU()}; - auto tree_evaluator = evaluator.GetEvaluator(); - GradientPairPrecise left_sum; - auto parent_gain = tree_evaluator.CalcGain(0, param_, GradStats{total_gpair_}); - for (size_t i = 0; i < hist.size() - 1; ++i) { - left_sum += hist[i]; - auto right_sum = parent_sum - left_sum; - auto gain = - tree_evaluator.CalcSplitGain(param_, 0, 0, GradStats{left_sum}, GradStats{right_sum}) - - parent_gain; - if (gain > best_score) { - best_score = gain; - best_thresh = i; - } - } - return std::make_tuple(best_thresh, best_score); - }; - - // enumerate all possible partitions to find the optimal split - do { - std::vector sorted_hist(node_hist.size()); - for (size_t i = 0; i < sorted_hist.size(); ++i) { - sorted_hist[i] = node_hist[sorted_idx_[i]]; - } - auto [thresh, score] = enumerate({sorted_hist}, total_gpair_); - if (score > best_score_) { - best_score_ = score; - } - } while (std::next_permutation(sorted_idx_.begin(), sorted_idx_.end())); - } + void SetUp() override; }; inline auto MakeCutsForTest(std::vector values, std::vector ptrs,