[EM] Refactor GPU histogram builder. (#10764)

- Expose the maximum number of cached nodes to be consistent with the CPU implementation. This also makes testing easier.
- Extract the subtraction trick for easier testing.
- Split up the `GradientQuantiser` to avoid circular dependency.
This commit is contained in:
Jiaming Yuan
2024-08-30 02:39:14 +08:00
committed by GitHub
parent 34937fea41
commit 61dd854a52
17 changed files with 394 additions and 187 deletions

View File

@@ -9,6 +9,7 @@
#include "../../../../src/tree/gpu_hist/histogram.cuh"
#include "../../../../src/tree/gpu_hist/row_partitioner.cuh" // for RowPartitioner
#include "../../../../src/tree/hist/param.h" // for HistMakerTrainParam
#include "../../../../src/tree/param.h" // for TrainParam
#include "../../categorical_helpers.h" // for OneHotEncodeFeature
#include "../../helpers.h"
@@ -21,13 +22,13 @@ TEST(Histogram, DeviceHistogramStorage) {
constexpr size_t kNBins = 128;
constexpr int kNNodes = 4;
constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
DeviceHistogramStorage<kStopGrowing> histogram;
histogram.Init(FstCU(), kNBins);
DeviceHistogramStorage histogram{};
histogram.Reset(&ctx, kNBins, kNNodes);
for (int i = 0; i < kNNodes; ++i) {
histogram.AllocateHistograms(&ctx, {i});
}
histogram.Reset(&ctx);
ASSERT_EQ(histogram.Data().size(), kStopGrowing);
histogram.Reset(&ctx, kNBins, kNNodes);
// Use allocated memory but do not erase nidx_map.
for (int i = 0; i < kNNodes; ++i) {
@@ -55,6 +56,35 @@ TEST(Histogram, DeviceHistogramStorage) {
EXPECT_ANY_THROW(histogram.AllocateHistograms(&ctx, {kNNodes + 1}););
}
// Exercises DeviceHistogramBuilder::SubtractHist on a builder whose histogram
// cache is capped at two nodes.
TEST(Histogram, SubtractionTrack) {
  auto ctx = MakeCUDACtx(0);

  // Derive the feature groups and the total bin count from a small Ellpack page.
  auto ellpack = BuildEllpackPage(&ctx, 64, 4);
  auto p_cuts = ellpack->CutsShared();
  FeatureGroups groups{*p_cuts, true, std::numeric_limits<std::size_t>::max(),
                       sizeof(GradientPairPrecise)};
  auto groups_acc = groups.DeviceAccessor(ctx.Device());
  auto total_bins = p_cuts->TotalBins();

  // The cache limit is 2 nodes, yet histograms for nodes 0, 1 and 2 are requested.
  auto cache_limit = 2ull;
  DeviceHistogramBuilder builder;
  builder.Reset(&ctx, cache_limit, groups_acc, total_bins, false);
  builder.AllocateHistograms(&ctx, {0, 1, 2});

  // First round: a single entry for the root node (nid 0). Its result is
  // overwritten below without being inspected.
  GPUExpandEntry root_entry;
  root_entry.nid = 0;
  auto need_build = builder.SubtractHist({root_entry}, {0}, {1});

  // Second round: two entries, for nodes 1 and 2.
  std::vector<GPUExpandEntry> parents(2);
  parents.front().nid = 1;
  parents.back().nid = 2;
  need_build = builder.SubtractHist(parents, {3, 5}, {4, 6});

  // Both node ids from the last argument must be handed back.
  // NOTE(review): presumably these are the nodes whose histograms have to be
  // built directly instead of being derived by subtraction -- confirm against
  // the SubtractHist declaration.
  ASSERT_EQ(need_build.size(), 2);
  ASSERT_EQ(need_build[0], 4);
  ASSERT_EQ(need_build[1], 6);
}
std::vector<GradientPairPrecise> GetHostHistGpair() {
// 24 bins, 3 bins for each feature (column).
std::vector<GradientPairPrecise> hist_gpair = {
@@ -101,17 +131,16 @@ void TestBuildHist(bool use_shared_memory_histograms) {
auto shm_size = use_shared_memory_histograms ? dh::MaxSharedMemoryOptin(ctx.Ordinal()) : 0;
FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64));
DeviceHistogramStorage hist;
hist.Init(ctx.Device(), page->Cuts().TotalBins());
hist.AllocateHistograms(&ctx, {0});
DeviceHistogramBuilder builder;
builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), !use_shared_memory_histograms);
builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(),
feature_groups.DeviceAccessor(ctx.Device()), page->Cuts().TotalBins(),
!use_shared_memory_histograms);
builder.AllocateHistograms(&ctx, {0});
builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(),
row_partitioner->GetRows(0), hist.GetNodeHistogram(0), *quantiser);
row_partitioner->GetRows(0), builder.GetNodeHistogram(0), *quantiser);
auto node_histogram = hist.GetNodeHistogram(0);
auto node_histogram = builder.GetNodeHistogram(0);
std::vector<GradientPairInt64> h_result(node_histogram.size());
dh::CopyDeviceSpanToVector(&h_result, node_histogram);
@@ -158,7 +187,8 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global)
auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo());
DeviceHistogramBuilder builder;
builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), force_global);
builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(),
feature_groups.DeviceAccessor(ctx.Device()), num_bins, force_global);
builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx,
d_histogram, quantiser);
@@ -173,7 +203,8 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global)
auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo());
DeviceHistogramBuilder builder;
builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), force_global);
builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(),
feature_groups.DeviceAccessor(ctx.Device()), num_bins, force_global);
builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx,
d_new_histogram, quantiser);
@@ -197,7 +228,8 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global)
dh::device_vector<GradientPairInt64> baseline(num_bins);
DeviceHistogramBuilder builder;
builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), force_global);
builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(),
single_group.DeviceAccessor(ctx.Device()), num_bins, force_global);
builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx,
dh::ToSpan(baseline), quantiser);
@@ -264,7 +296,8 @@ void TestGPUHistogramCategorical(size_t num_categories) {
auto* page = batch.Impl();
FeatureGroups single_group(page->Cuts());
DeviceHistogramBuilder builder;
builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), false);
builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(),
single_group.DeviceAccessor(ctx.Device()), num_categories, false);
builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx,
dh::ToSpan(cat_hist), quantiser);
@@ -280,7 +313,8 @@ void TestGPUHistogramCategorical(size_t num_categories) {
auto* page = batch.Impl();
FeatureGroups single_group(page->Cuts());
DeviceHistogramBuilder builder;
builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), false);
builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(),
single_group.DeviceAccessor(ctx.Device()), encode_hist.size(), false);
builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx,
dh::ToSpan(encode_hist), quantiser);
@@ -429,7 +463,8 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParam<std::tuple<f
auto ridx = partitioners.at(k)->GetRows(0);
auto d_histogram = dh::ToSpan(multi_hist);
DeviceHistogramBuilder builder;
builder.Reset(&ctx, fg->DeviceAccessor(ctx.Device()), force_global);
builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(),
fg->DeviceAccessor(ctx.Device()), d_histogram.size(), force_global);
builder.BuildHistogram(ctx.CUDACtx(), impl->GetDeviceAccessor(ctx.Device()),
fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx,
d_histogram, quantiser);
@@ -454,7 +489,8 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParam<std::tuple<f
auto ridx = partitioner.GetRows(0);
auto d_histogram = dh::ToSpan(single_hist);
DeviceHistogramBuilder builder;
builder.Reset(&ctx, fg->DeviceAccessor(ctx.Device()), force_global);
builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), fg->DeviceAccessor(ctx.Device()),
d_histogram.size(), force_global);
builder.BuildHistogram(ctx.CUDACtx(), page.GetDeviceAccessor(ctx.Device()),
fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx,
d_histogram, quantiser);

View File

@@ -51,7 +51,7 @@ void TestEvaluateSplits(bool force_read_by_column) {
row_set_collection.Init();
HistMakerTrainParam hist_param;
hist.Reset(gmat.cut.Ptrs().back(), hist_param.max_cached_hist_node);
hist.Reset(gmat.cut.Ptrs().back(), hist_param.MaxCachedHistNodes(ctx.Device()));
hist.AllocateHistograms({0});
auto const &elem = row_set_collection[0];
common::BuildHist<false>(row_gpairs, common::Span{elem.begin(), elem.end()}, gmat, hist[0],
@@ -120,7 +120,7 @@ TEST(HistMultiEvaluator, Evaluate) {
linalg::Vector<GradientPairPrecise> root_sum({2}, DeviceOrd::CPU());
for (bst_target_t t{0}; t < n_targets; ++t) {
auto &hist = histogram[t];
hist.Reset(n_bins * n_features, hist_param.max_cached_hist_node);
hist.Reset(n_bins * n_features, hist_param.MaxCachedHistNodes(ctx.Device()));
hist.AllocateHistograms({0});
auto node_hist = hist[0];
node_hist[0] = {-0.5, 0.5};
@@ -237,7 +237,7 @@ auto CompareOneHotAndPartition(bool onehot) {
entries.front().nid = 0;
entries.front().depth = 0;
hist.Reset(gmat.cut.TotalBins(), hist_param.max_cached_hist_node);
hist.Reset(gmat.cut.TotalBins(), hist_param.MaxCachedHistNodes(ctx.Device()));
hist.AllocateHistograms({0});
auto node_hist = hist[0];
@@ -265,9 +265,10 @@ TEST(HistEvaluator, Categorical) {
}
TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
Context ctx;
BoundedHistCollection hist;
HistMakerTrainParam hist_param;
hist.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node);
hist.Reset(cuts_.TotalBins(), hist_param.MaxCachedHistNodes(ctx.Device()));
hist.AllocateHistograms({0});
auto node_hist = hist[0];
ASSERT_EQ(node_hist.size(), feature_histogram_.size());
@@ -277,10 +278,9 @@ TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
MetaInfo info;
info.num_col_ = 1;
info.feature_types = {FeatureType::kCategorical};
Context ctx;
auto evaluator = HistEvaluator{&ctx, &param_, info, sampler};
evaluator.InitRoot(GradStats{parent_sum_});
std::vector<CPUExpandEntry> entries(1);
RegTree tree;
evaluator.EvaluateSplits(hist, cuts_, info.feature_types.ConstHostSpan(), tree, &entries);

View File

@@ -56,8 +56,9 @@ class TestPartitionBasedSplit : public ::testing::Test {
cuts_.min_vals_.Resize(1);
Context ctx;
HistMakerTrainParam hist_param;
hist_.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node);
hist_.Reset(cuts_.TotalBins(), hist_param.MaxCachedHistNodes(ctx.Device()));
hist_.AllocateHistograms({0});
auto node_hist = hist_[0];

View File

@@ -216,7 +216,7 @@ TEST(GpuHist, ConfigIO) {
}
TEST(GpuHist, MaxDepth) {
Context ctx(MakeCUDACtx(0));
auto ctx = MakeCUDACtx(0);
size_t constexpr kRows = 16;
size_t constexpr kCols = 4;
auto p_mat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();