Deterministic GPU histogram. (#5361)

* Use pre-rounding based method to obtain reproducible floating point
  summation.
* GPU Hist for regression and classification are bit-by-bit reproducible.
* Add doc.
* Switch to thrust reduce for `node_sum_gradient`.
This commit is contained in:
Jiaming Yuan
2020-03-04 15:13:28 +08:00
committed by GitHub
parent 9775da02d9
commit 8d06878bf9
18 changed files with 410 additions and 97 deletions

View File

@@ -76,6 +76,20 @@ void TestDeviceSketch(bool use_external_memory) {
ASSERT_LT(fabs(hmat_cpu.Values()[i] - hmat_gpu.Values()[i]), eps * nrows);
}
// Deterministic
size_t constexpr kRounds { 100 };
for (size_t r = 0; r < kRounds; ++r) {
HistogramCuts new_sketch;
DeviceSketch(device, max_bin, gpu_batch_nrows, dmat->get(), &new_sketch);
ASSERT_EQ(hmat_gpu.Values().size(), new_sketch.Values().size());
for (size_t i = 0; i < hmat_gpu.Values().size(); ++i) {
ASSERT_EQ(hmat_gpu.Values()[i], new_sketch.Values()[i]);
}
for (size_t i = 0; i < hmat_gpu.MinValues().size(); ++i) {
ASSERT_EQ(hmat_gpu.MinValues()[i], new_sketch.MinValues()[i]);
}
}
delete dmat;
}

View File

@@ -224,9 +224,10 @@ inline GenericParameter CreateEmptyGenericParam(int gpu_id) {
return tparam;
}
inline HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_rows) {
inline HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_rows,
float lower= 0.0f, float upper = 1.0f) {
xgboost::SimpleLCG gen;
xgboost::SimpleRealUniformDistribution<bst_float> dist(0.0f, 1.0f);
xgboost::SimpleRealUniformDistribution<bst_float> dist(lower, upper);
std::vector<GradientPair> h_gpair(n_rows);
for (auto &gpair : h_gpair) {
bst_float grad = dist(&gen);
@@ -288,6 +289,5 @@ inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(
return page;
}
#endif
} // namespace xgboost
#endif

View File

@@ -605,6 +605,10 @@ TEST_F(MultiClassesSerializationTest, GPU_Hist) {
{"seed", "0"},
{"nthread", "1"},
{"max_depth", std::to_string(kClasses)},
// Somehow rebuilding the cache can generate slightly
// different result (1e-7) with CPU predictor for some
// entries.
{"predictor", "gpu_predictor"},
{"enable_experimental_json_serialization", "1"},
{"tree_method", "gpu_hist"}},
fmap_, *pp_dmat_);

View File

@@ -0,0 +1,69 @@
#include <gtest/gtest.h>

#include <cmath>

#include "../../helpers.h"
#include "../../../../src/tree/gpu_hist/row_partitioner.cuh"
#include "../../../../src/tree/gpu_hist/histogram.cuh"
namespace xgboost {
namespace tree {
template <typename Gradient>
void TestDeterminsticHistogram() {
  // Verifies that the pre-rounding based GPU histogram is reproducible:
  // rebuilding the histogram from the same gradients must be bit-by-bit
  // identical across kRounds repetitions.
  size_t constexpr kBins = 24, kCols = 8, kRows = 32768, kRounds = 16;
  float constexpr kLower = -1e-2, kUpper = 1e2;

  auto pp_m = CreateDMatrix(kRows, kCols, 0.5);
  auto& matrix = **pp_m;
  BatchParam batch_param{0, static_cast<int32_t>(kBins), 0, 0};

  for (auto const& batch : matrix.GetBatches<EllpackPage>(batch_param)) {
    auto* page = batch.Impl();

    tree::RowPartitioner row_partitioner(0, kRows);
    auto ridx = row_partitioner.GetRows(0);

    // Reference histogram, built once from a fixed random gradient set.
    dh::device_vector<Gradient> histogram(kBins * kCols);
    auto d_histogram = dh::ToSpan(histogram);
    auto gpair = GenerateRandomGradients(kRows, kLower, kUpper);
    gpair.SetDevice(0);

    auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
    BuildGradientHistogram(page->matrix, gpair.DeviceSpan(), ridx,
                           d_histogram, rounding, true);

    for (size_t i = 0; i < kRounds; ++i) {
      dh::device_vector<Gradient> new_histogram(kBins * kCols);
      // Shadows the reference span/rounding on purpose: each round recomputes
      // everything from scratch and must reproduce the reference exactly.
      auto d_histogram = dh::ToSpan(new_histogram);

      auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
      BuildGradientHistogram(page->matrix, gpair.DeviceSpan(), ridx,
                             d_histogram, rounding, true);

      for (size_t j = 0; j < new_histogram.size(); ++j) {
        // device_vector element access copies each value back to the host.
        ASSERT_EQ(((Gradient)new_histogram[j]).GetGrad(),
                  ((Gradient)histogram[j]).GetGrad());
        ASSERT_EQ(((Gradient)new_histogram[j]).GetHess(),
                  ((Gradient)histogram[j]).GetHess());
      }
    }

    {
      // Gradients regenerated from a freshly seeded RNG (same deterministic
      // sequence — see GenerateRandomGradients); built with the previously
      // computed rounding factor the result should be numerically close.
      auto gpair = GenerateRandomGradients(kRows, kLower, kUpper);
      gpair.SetDevice(0);
      dh::device_vector<Gradient> baseline(kBins * kCols);
      BuildGradientHistogram(page->matrix, gpair.DeviceSpan(), ridx,
                             dh::ToSpan(baseline), rounding, true);
      for (size_t i = 0; i < baseline.size(); ++i) {
        // FIX: EXPECT_NEAR requires a non-negative absolute error; gradients
        // may be negative (kLower < 0), so take std::abs of the scaled bound.
        EXPECT_NEAR(((Gradient)baseline[i]).GetGrad(),
                    ((Gradient)histogram[i]).GetGrad(),
                    std::abs(((Gradient)baseline[i]).GetGrad()) * 1e-3f);
      }
    }
  }

  delete pp_m;
}
// Instantiate the determinism check for both single-precision (GradientPair)
// and double-precision (GradientPairPrecise) accumulation types.
// NOTE(review): "Determinstic" is a typo, but renaming would change the
// registered test name and the helper's interface — left as-is.
TEST(Histogram, GPUDeterminstic) {
  TestDeterminsticHistogram<GradientPair>();
  TestDeterminsticHistogram<GradientPairPrecise>();
}
} // namespace tree
} // namespace xgboost

View File

@@ -83,7 +83,8 @@ void TestBuildHist(bool use_shared_memory_histograms) {
param.Init(args);
auto page = BuildEllpackPage(kNRows, kNCols);
BatchParam batch_param{};
GPUHistMakerDevice<GradientSumT> maker(0, page.get(), kNRows, param, kNCols, kNCols, batch_param);
GPUHistMakerDevice<GradientSumT> maker(0, page.get(), kNRows, param, kNCols, kNCols,
true, batch_param);
maker.InitHistogram();
xgboost::SimpleLCG gen;
@@ -187,7 +188,7 @@ TEST(GpuHist, EvaluateSplits) {
auto page = BuildEllpackPage(kNRows, kNCols);
BatchParam batch_param{};
GPUHistMakerDevice<GradientPairPrecise>
maker(0, page.get(), kNRows, param, kNCols, kNCols, batch_param);
maker(0, page.get(), kNRows, param, kNCols, kNCols, true, batch_param);
// Initialize GPUHistMakerDevice::node_sum_gradients
maker.node_sum_gradients = {{6.4f, 12.8f}};