Support optimal partitioning for GPU hist. (#7652)
* Implement `MaxCategory` in quantile. * Implement partition-based split for GPU evaluation. Currently, it's based on the existing evaluation function. * Extract an evaluator from GPU Hist to store the needed states. * Add some CUDA stream/event utilities. * Update the documentation with references. * Fix a bug in the approx evaluator that occurs when the number of data points is less than the number of categories.
This commit is contained in:
@@ -1,7 +1,11 @@
|
||||
/*!
|
||||
* Copyright 2020-2022 by XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include "../../../../src/tree/gpu_hist/evaluate_splits.cuh"
|
||||
#include "../../helpers.h"
|
||||
#include "../../histogram_helpers.h"
|
||||
#include "../test_evaluate_splits.h" // TestPartitionBasedSplit
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
@@ -16,7 +20,6 @@ auto ZeroParam() {
|
||||
} // anonymous namespace
|
||||
|
||||
void TestEvaluateSingleSplit(bool is_categorical) {
|
||||
thrust::device_vector<DeviceSplitCandidate> out_splits(1);
|
||||
GradientPairPrecise parent_sum(0.0, 1.0);
|
||||
TrainParam tparam = ZeroParam();
|
||||
GPUTrainingParam param{tparam};
|
||||
@@ -50,11 +53,13 @@ void TestEvaluateSingleSplit(bool is_categorical) {
|
||||
dh::ToSpan(feature_values),
|
||||
dh::ToSpan(feature_min_values),
|
||||
dh::ToSpan(feature_histogram)};
|
||||
TreeEvaluator tree_evaluator(tparam, feature_min_values.size(), 0);
|
||||
auto evaluator = tree_evaluator.GetEvaluator<GPUTrainingParam>();
|
||||
EvaluateSingleSplit(dh::ToSpan(out_splits), evaluator, input);
|
||||
|
||||
DeviceSplitCandidate result = out_splits[0];
|
||||
GPUHistEvaluator<GradientPair> evaluator{
|
||||
tparam, static_cast<bst_feature_t>(feature_min_values.size()), 0};
|
||||
dh::device_vector<common::CatBitField::value_type> out_cats;
|
||||
DeviceSplitCandidate result =
|
||||
evaluator.EvaluateSingleSplit(input, 0, ObjInfo{ObjInfo::kRegression}).split;
|
||||
|
||||
EXPECT_EQ(result.findex, 1);
|
||||
EXPECT_EQ(result.fvalue, 11.0);
|
||||
EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(),
|
||||
@@ -72,7 +77,6 @@ TEST(GpuHist, EvaluateCategoricalSplit) {
|
||||
}
|
||||
|
||||
TEST(GpuHist, EvaluateSingleSplitMissing) {
|
||||
thrust::device_vector<DeviceSplitCandidate> out_splits(1);
|
||||
GradientPairPrecise parent_sum(1.0, 1.5);
|
||||
TrainParam tparam = ZeroParam();
|
||||
GPUTrainingParam param{tparam};
|
||||
@@ -96,11 +100,10 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
|
||||
dh::ToSpan(feature_min_values),
|
||||
dh::ToSpan(feature_histogram)};
|
||||
|
||||
TreeEvaluator tree_evaluator(tparam, feature_set.size(), 0);
|
||||
auto evaluator = tree_evaluator.GetEvaluator<GPUTrainingParam>();
|
||||
EvaluateSingleSplit(dh::ToSpan(out_splits), evaluator, input);
|
||||
GPUHistEvaluator<GradientPair> evaluator(tparam, feature_set.size(), 0);
|
||||
DeviceSplitCandidate result =
|
||||
evaluator.EvaluateSingleSplit(input, 0, ObjInfo{ObjInfo::kRegression}).split;
|
||||
|
||||
DeviceSplitCandidate result = out_splits[0];
|
||||
EXPECT_EQ(result.findex, 0);
|
||||
EXPECT_EQ(result.fvalue, 1.0);
|
||||
EXPECT_EQ(result.dir, kRightDir);
|
||||
@@ -109,27 +112,18 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
|
||||
}
|
||||
|
||||
TEST(GpuHist, EvaluateSingleSplitEmpty) {
|
||||
DeviceSplitCandidate nonzeroed;
|
||||
nonzeroed.findex = 1;
|
||||
nonzeroed.loss_chg = 1.0;
|
||||
|
||||
thrust::device_vector<DeviceSplitCandidate> out_split(1);
|
||||
out_split[0] = nonzeroed;
|
||||
|
||||
TrainParam tparam = ZeroParam();
|
||||
TreeEvaluator tree_evaluator(tparam, 1, 0);
|
||||
auto evaluator = tree_evaluator.GetEvaluator<GPUTrainingParam>();
|
||||
EvaluateSingleSplit(dh::ToSpan(out_split), evaluator,
|
||||
EvaluateSplitInputs<GradientPair>{});
|
||||
|
||||
DeviceSplitCandidate result = out_split[0];
|
||||
GPUHistEvaluator<GradientPair> evaluator(tparam, 1, 0);
|
||||
DeviceSplitCandidate result = evaluator
|
||||
.EvaluateSingleSplit(EvaluateSplitInputs<GradientPair>{}, 0,
|
||||
ObjInfo{ObjInfo::kRegression})
|
||||
.split;
|
||||
EXPECT_EQ(result.findex, -1);
|
||||
EXPECT_LT(result.loss_chg, 0.0f);
|
||||
}
|
||||
|
||||
// Feature 0 has a better split, but the algorithm must select feature 1
|
||||
TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
|
||||
thrust::device_vector<DeviceSplitCandidate> out_splits(1);
|
||||
GradientPairPrecise parent_sum(0.0, 1.0);
|
||||
TrainParam tparam = ZeroParam();
|
||||
tparam.UpdateAllowUnknown(Args{});
|
||||
@@ -157,11 +151,10 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
|
||||
dh::ToSpan(feature_min_values),
|
||||
dh::ToSpan(feature_histogram)};
|
||||
|
||||
TreeEvaluator tree_evaluator(tparam, feature_min_values.size(), 0);
|
||||
auto evaluator = tree_evaluator.GetEvaluator<GPUTrainingParam>();
|
||||
EvaluateSingleSplit(dh::ToSpan(out_splits), evaluator, input);
|
||||
GPUHistEvaluator<GradientPair> evaluator(tparam, feature_min_values.size(), 0);
|
||||
DeviceSplitCandidate result =
|
||||
evaluator.EvaluateSingleSplit(input, 0, ObjInfo{ObjInfo::kRegression}).split;
|
||||
|
||||
DeviceSplitCandidate result = out_splits[0];
|
||||
EXPECT_EQ(result.findex, 1);
|
||||
EXPECT_EQ(result.fvalue, 11.0);
|
||||
EXPECT_EQ(result.left_sum, GradientPairPrecise(-0.5, 0.5));
|
||||
@@ -170,7 +163,6 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
|
||||
|
||||
// Features 0 and 1 have identical gain, the algorithm must select 0
|
||||
TEST(GpuHist, EvaluateSingleSplitBreakTies) {
|
||||
thrust::device_vector<DeviceSplitCandidate> out_splits(1);
|
||||
GradientPairPrecise parent_sum(0.0, 1.0);
|
||||
TrainParam tparam = ZeroParam();
|
||||
tparam.UpdateAllowUnknown(Args{});
|
||||
@@ -198,11 +190,10 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) {
|
||||
dh::ToSpan(feature_min_values),
|
||||
dh::ToSpan(feature_histogram)};
|
||||
|
||||
TreeEvaluator tree_evaluator(tparam, feature_min_values.size(), 0);
|
||||
auto evaluator = tree_evaluator.GetEvaluator<GPUTrainingParam>();
|
||||
EvaluateSingleSplit(dh::ToSpan(out_splits), evaluator, input);
|
||||
GPUHistEvaluator<GradientPair> evaluator(tparam, feature_min_values.size(), 0);
|
||||
DeviceSplitCandidate result =
|
||||
evaluator.EvaluateSingleSplit(input, 0, ObjInfo{ObjInfo::kRegression}).split;
|
||||
|
||||
DeviceSplitCandidate result = out_splits[0];
|
||||
EXPECT_EQ(result.findex, 0);
|
||||
EXPECT_EQ(result.fvalue, 1.0);
|
||||
}
|
||||
@@ -250,9 +241,10 @@ TEST(GpuHist, EvaluateSplits) {
|
||||
dh::ToSpan(feature_min_values),
|
||||
dh::ToSpan(feature_histogram_right)};
|
||||
|
||||
TreeEvaluator tree_evaluator(tparam, feature_min_values.size(), 0);
|
||||
auto evaluator = tree_evaluator.GetEvaluator<GPUTrainingParam>();
|
||||
EvaluateSplits(dh::ToSpan(out_splits), evaluator, input_left, input_right);
|
||||
GPUHistEvaluator<GradientPair> evaluator{
|
||||
tparam, static_cast<bst_feature_t>(feature_min_values.size()), 0};
|
||||
evaluator.EvaluateSplits(input_left, input_right, ObjInfo{ObjInfo::kRegression},
|
||||
evaluator.GetEvaluator(), dh::ToSpan(out_splits));
|
||||
|
||||
DeviceSplitCandidate result_left = out_splits[0];
|
||||
EXPECT_EQ(result_left.findex, 1);
|
||||
@@ -262,5 +254,36 @@ TEST(GpuHist, EvaluateSplits) {
|
||||
EXPECT_EQ(result_right.findex, 0);
|
||||
EXPECT_EQ(result_right.fvalue, 1.0);
|
||||
}
|
||||
|
||||
// Runs the GPU hist evaluator on the fixture's single categorical feature and
// checks that the gain it reports matches best_score_, the value the fixture
// obtained by enumerating every partition.
TEST_F(TestPartitionBasedSplit, GpuHist) {
  dh::device_vector<FeatureType> feature_types{
      std::vector<FeatureType>{FeatureType::kCategorical}};
  GPUHistEvaluator<GradientPairPrecise> evaluator{
      param_, static_cast<bst_feature_t>(info_.num_col_), 0};

  // The cut structures are read through device spans below, so place them on
  // device 0 first.
  cuts_.cut_ptrs_.SetDevice(0);
  cuts_.cut_values_.SetDevice(0);
  cuts_.min_vals_.SetDevice(0);

  ObjInfo task{ObjInfo::kRegression};
  evaluator.Reset(cuts_, dh::ToSpan(feature_types), task, info_.num_col_, param_, 0);

  // Copy the root node histogram prepared by the fixture to the device.
  auto host_hist = hist_[0];
  dh::device_vector<GradientPairPrecise> device_hist(host_hist.size());
  dh::safe_cuda(cudaMemcpy(device_hist.data().get(), host_hist.data(),
                           host_hist.size_bytes(), cudaMemcpyHostToDevice));

  // Only feature 0 exists in the fixture.
  dh::device_vector<bst_feature_t> sampled_features{std::vector<bst_feature_t>{0}};

  EvaluateSplitInputs<GradientPairPrecise> input{0,
                                                 total_gpair_,
                                                 GPUTrainingParam{param_},
                                                 dh::ToSpan(sampled_features),
                                                 dh::ToSpan(feature_types),
                                                 cuts_.cut_ptrs_.ConstDeviceSpan(),
                                                 cuts_.cut_values_.ConstDeviceSpan(),
                                                 cuts_.min_vals_.ConstDeviceSpan(),
                                                 dh::ToSpan(device_hist)};

  auto best_split = evaluator.EvaluateSingleSplit(input, 0, task).split;
  ASSERT_NEAR(best_split.loss_chg, best_score_, 1e-16);
}
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -3,9 +3,11 @@
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/base.h>
|
||||
|
||||
#include "../../../../src/common/hist_util.h"
|
||||
#include "../../../../src/tree/hist/evaluate_splits.h"
|
||||
#include "../../../../src/tree/updater_quantile_hist.h"
|
||||
#include "../../../../src/common/hist_util.h"
|
||||
#include "../test_evaluate_splits.h"
|
||||
#include "../../helpers.h"
|
||||
|
||||
namespace xgboost {
|
||||
@@ -108,80 +110,17 @@ TEST(HistEvaluator, Apply) {
|
||||
ASSERT_EQ(tree.Stat(tree[0].RightChild()).sum_hess, 0.7f);
|
||||
}
|
||||
|
||||
TEST(HistEvaluator, CategoricalPartition) {
|
||||
int static constexpr kRows = 128, kCols = 1;
|
||||
using GradientSumT = double;
|
||||
std::vector<FeatureType> ft(kCols, FeatureType::kCategorical);
|
||||
|
||||
TrainParam param;
|
||||
param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}});
|
||||
|
||||
size_t n_cats{8};
|
||||
|
||||
auto dmat =
|
||||
RandomDataGenerator(kRows, kCols, 0).Seed(3).Type(ft).MaxCategory(n_cats).GenerateDMatrix();
|
||||
|
||||
int32_t n_threads = 16;
|
||||
TEST_F(TestPartitionBasedSplit, CPUHist) {
|
||||
// check the evaluator is returning the optimal split
|
||||
std::vector<FeatureType> ft{FeatureType::kCategorical};
|
||||
auto sampler = std::make_shared<common::ColumnSampler>();
|
||||
auto evaluator = HistEvaluator<GradientSumT, CPUExpandEntry>{
|
||||
param, dmat->Info(), n_threads, sampler, ObjInfo{ObjInfo::kRegression}};
|
||||
|
||||
for (auto const &gmat : dmat->GetBatches<GHistIndexMatrix>({32, param.sparse_threshold})) {
|
||||
common::HistCollection<GradientSumT> hist;
|
||||
|
||||
std::vector<CPUExpandEntry> entries(1);
|
||||
entries.front().nid = 0;
|
||||
entries.front().depth = 0;
|
||||
|
||||
hist.Init(gmat.cut.TotalBins());
|
||||
hist.AddHistRow(0);
|
||||
hist.AllocateAllData();
|
||||
auto node_hist = hist[0];
|
||||
ASSERT_EQ(node_hist.size(), n_cats);
|
||||
ASSERT_EQ(node_hist.size(), gmat.cut.Ptrs().back());
|
||||
|
||||
GradientPairPrecise total_gpair;
|
||||
for (size_t i = 0; i < node_hist.size(); ++i) {
|
||||
node_hist[i] = {static_cast<double>(node_hist.size() - i), 1.0};
|
||||
total_gpair += node_hist[i];
|
||||
}
|
||||
SimpleLCG lcg;
|
||||
std::shuffle(node_hist.begin(), node_hist.end(), lcg);
|
||||
|
||||
RegTree tree;
|
||||
evaluator.InitRoot(GradStats{total_gpair});
|
||||
evaluator.EvaluateSplits(hist, gmat.cut, ft, tree, &entries);
|
||||
ASSERT_TRUE(entries.front().split.is_cat);
|
||||
|
||||
auto run_eval = [&](auto fn) {
|
||||
for (size_t i = 1; i < gmat.cut.Ptrs().size(); ++i) {
|
||||
GradStats left, right;
|
||||
for (size_t j = gmat.cut.Ptrs()[i - 1]; j < gmat.cut.Ptrs()[i]; ++j) {
|
||||
auto loss_chg = evaluator.Evaluator().CalcSplitGain(param, 0, i - 1, left, right) -
|
||||
evaluator.Stats().front().root_gain;
|
||||
fn(loss_chg);
|
||||
left.Add(node_hist[j].GetGrad(), node_hist[j].GetHess());
|
||||
right.SetSubstract(GradStats{total_gpair}, left);
|
||||
}
|
||||
}
|
||||
};
|
||||
// Assert that's the best split
|
||||
auto best_loss_chg = entries.front().split.loss_chg;
|
||||
run_eval([&](auto loss_chg) {
|
||||
// Approximated test that gain returned by optimal partition is greater than
|
||||
// numerical split.
|
||||
ASSERT_GT(best_loss_chg, loss_chg);
|
||||
});
|
||||
// node_hist is captured in lambda.
|
||||
std::sort(node_hist.begin(), node_hist.end(), [&](auto l, auto r) {
|
||||
return evaluator.Evaluator().CalcWeightCat(param, l) <
|
||||
evaluator.Evaluator().CalcWeightCat(param, r);
|
||||
});
|
||||
|
||||
double reimpl = 0;
|
||||
run_eval([&](auto loss_chg) { reimpl = std::max(loss_chg, reimpl); });
|
||||
CHECK_EQ(reimpl, best_loss_chg);
|
||||
}
|
||||
HistEvaluator<double, CPUExpandEntry> evaluator{param_, info_, common::OmpGetNumThreads(0),
|
||||
sampler, ObjInfo{ObjInfo::kRegression}};
|
||||
evaluator.InitRoot(GradStats{total_gpair_});
|
||||
RegTree tree;
|
||||
std::vector<CPUExpandEntry> entries(1);
|
||||
evaluator.EvaluateSplits(hist_, cuts_, {ft}, tree, &entries);
|
||||
ASSERT_NEAR(entries[0].split.loss_chg, best_score_, 1e-16);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
96
tests/cpp/tree/test_evaluate_splits.h
Normal file
96
tests/cpp/tree/test_evaluate_splits.h
Normal file
@@ -0,0 +1,96 @@
|
||||
/*!
|
||||
* Copyright 2022 by XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <algorithm> // next_permutation
|
||||
#include <numeric> // iota
|
||||
|
||||
#include "../../../src/tree/hist/evaluate_splits.h"
|
||||
#include "../helpers.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
/**
|
||||
* \brief Enumerate all possible partitions for categorical split.
|
||||
*/
|
||||
class TestPartitionBasedSplit : public ::testing::Test {
|
||||
protected:
|
||||
size_t n_bins_ = 6;
|
||||
std::vector<size_t> sorted_idx_;
|
||||
TrainParam param_;
|
||||
MetaInfo info_;
|
||||
float best_score_{-std::numeric_limits<float>::infinity()};
|
||||
common::HistogramCuts cuts_;
|
||||
common::HistCollection<double> hist_;
|
||||
GradientPairPrecise total_gpair_;
|
||||
|
||||
void SetUp() override {
|
||||
param_.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}});
|
||||
sorted_idx_.resize(n_bins_);
|
||||
std::iota(sorted_idx_.begin(), sorted_idx_.end(), 0);
|
||||
|
||||
info_.num_col_ = 1;
|
||||
|
||||
cuts_.cut_ptrs_.Resize(2);
|
||||
cuts_.SetCategorical(true, n_bins_);
|
||||
auto &h_cuts = cuts_.cut_ptrs_.HostVector();
|
||||
h_cuts[0] = 0;
|
||||
h_cuts[1] = n_bins_;
|
||||
auto &h_vals = cuts_.cut_values_.HostVector();
|
||||
h_vals.resize(n_bins_);
|
||||
std::iota(h_vals.begin(), h_vals.end(), 0.0);
|
||||
|
||||
hist_.Init(cuts_.TotalBins());
|
||||
hist_.AddHistRow(0);
|
||||
hist_.AllocateAllData();
|
||||
auto node_hist = hist_[0];
|
||||
|
||||
SimpleLCG lcg;
|
||||
SimpleRealUniformDistribution<double> grad_dist{-4.0, 4.0};
|
||||
SimpleRealUniformDistribution<double> hess_dist{0.0, 4.0};
|
||||
|
||||
for (auto &e : node_hist) {
|
||||
e = GradientPairPrecise{grad_dist(&lcg), hess_dist(&lcg)};
|
||||
total_gpair_ += e;
|
||||
}
|
||||
|
||||
auto enumerate = [this, n_feat = info_.num_col_](common::GHistRow<double> hist,
|
||||
GradientPairPrecise parent_sum) {
|
||||
int32_t best_thresh = -1;
|
||||
float best_score{-std::numeric_limits<float>::infinity()};
|
||||
TreeEvaluator evaluator{param_, static_cast<bst_feature_t>(n_feat), -1};
|
||||
auto tree_evaluator = evaluator.GetEvaluator<TrainParam>();
|
||||
GradientPairPrecise left_sum;
|
||||
auto parent_gain = tree_evaluator.CalcGain(0, param_, GradStats{total_gpair_});
|
||||
for (size_t i = 0; i < hist.size() - 1; ++i) {
|
||||
left_sum += hist[i];
|
||||
auto right_sum = parent_sum - left_sum;
|
||||
auto gain =
|
||||
tree_evaluator.CalcSplitGain(param_, 0, 0, GradStats{left_sum}, GradStats{right_sum}) -
|
||||
parent_gain;
|
||||
if (gain > best_score) {
|
||||
best_score = gain;
|
||||
best_thresh = i;
|
||||
}
|
||||
}
|
||||
return std::make_tuple(best_thresh, best_score);
|
||||
};
|
||||
|
||||
// enumerate all possible partitions to find the optimal split
|
||||
do {
|
||||
int32_t thresh;
|
||||
float score;
|
||||
std::vector<GradientPairPrecise> sorted_hist(node_hist.size());
|
||||
for (size_t i = 0; i < sorted_hist.size(); ++i) {
|
||||
sorted_hist[i] = node_hist[sorted_idx_[i]];
|
||||
}
|
||||
std::tie(thresh, score) = enumerate({sorted_hist}, total_gpair_);
|
||||
if (score > best_score_) {
|
||||
best_score_ = score;
|
||||
}
|
||||
} while (std::next_permutation(sorted_idx_.begin(), sorted_idx_.end()));
|
||||
}
|
||||
};
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
@@ -262,7 +262,8 @@ TEST(GpuHist, EvaluateRootSplit) {
|
||||
info.num_row_ = kNRows;
|
||||
info.num_col_ = kNCols;
|
||||
|
||||
DeviceSplitCandidate res = maker.EvaluateRootSplit({6.4f, 12.8f});
|
||||
DeviceSplitCandidate res =
|
||||
maker.EvaluateRootSplit({6.4f, 12.8f}, 0, ObjInfo{ObjInfo::kRegression}).split;
|
||||
|
||||
ASSERT_EQ(res.findex, 7);
|
||||
ASSERT_NEAR(res.fvalue, 0.26, xgboost::kRtEps);
|
||||
@@ -300,11 +301,11 @@ void TestHistogramIndexImpl() {
|
||||
const auto &maker = hist_maker.maker;
|
||||
auto grad = GenerateRandomGradients(kNRows);
|
||||
grad.SetDevice(0);
|
||||
maker->Reset(&grad, hist_maker_dmat.get(), kNCols);
|
||||
maker->Reset(&grad, hist_maker_dmat.get(), kNCols, ObjInfo{ObjInfo::kRegression});
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.HostVector());
|
||||
|
||||
const auto &maker_ext = hist_maker_ext.maker;
|
||||
maker_ext->Reset(&grad, hist_maker_ext_dmat.get(), kNCols);
|
||||
maker_ext->Reset(&grad, hist_maker_ext_dmat.get(), kNCols, ObjInfo{ObjInfo::kRegression});
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.HostVector());
|
||||
|
||||
ASSERT_EQ(maker->page->Cuts().TotalBins(), maker_ext->page->Cuts().TotalBins());
|
||||
|
||||
Reference in New Issue
Block a user