[breaking] Use integer atomic for GPU histogram. (#7180)
On GPU we use rouding factor to truncate the gradient for deterministic results. This PR changes the gradient representation to fixed point number with exponent aligned with rounding factor.
[breaking] Drop non-deterministic histogram.
Use fixed point for shared memory.
This PR is to improve the performance of GPU Hist.
Co-authored-by: Andy Adinets <aadinets@nvidia.com>
This commit is contained in:
@@ -1,8 +1,9 @@
|
||||
/*!
|
||||
* Copyright 2017-2020 XGBoost contributors
|
||||
* Copyright 2017-2021 XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/host_vector.h>
|
||||
#include <dmlc/filesystem.h>
|
||||
#include <xgboost/base.h>
|
||||
#include <random>
|
||||
@@ -80,8 +81,8 @@ void TestBuildHist(bool use_shared_memory_histograms) {
|
||||
param.Init(args);
|
||||
auto page = BuildEllpackPage(kNRows, kNCols);
|
||||
BatchParam batch_param{};
|
||||
GPUHistMakerDevice<GradientSumT> maker(0, page.get(), {}, kNRows, param, kNCols, kNCols,
|
||||
true, batch_param);
|
||||
GPUHistMakerDevice<GradientSumT> maker(0, page.get(), {}, kNRows, param,
|
||||
kNCols, kNCols, batch_param);
|
||||
xgboost::SimpleLCG gen;
|
||||
xgboost::SimpleRealUniformDistribution<bst_float> dist(0.0f, 1.0f);
|
||||
HostDeviceVector<GradientPair> gpair(kNRows);
|
||||
@@ -93,14 +94,18 @@ void TestBuildHist(bool use_shared_memory_histograms) {
|
||||
gpair.SetDevice(0);
|
||||
|
||||
thrust::host_vector<common::CompressedByteT> h_gidx_buffer (page->gidx_buffer.HostVector());
|
||||
|
||||
|
||||
maker.row_partitioner.reset(new RowPartitioner(0, kNRows));
|
||||
maker.hist.AllocateHistogram(0);
|
||||
maker.gpair = gpair.DeviceSpan();
|
||||
maker.histogram_rounding = CreateRoundingFactor<GradientSumT>(maker.gpair);;
|
||||
|
||||
maker.BuildHist(0);
|
||||
DeviceHistogram<GradientSumT> d_hist = maker.hist;
|
||||
BuildGradientHistogram(
|
||||
page->GetDeviceAccessor(0), maker.feature_groups->DeviceAccessor(0),
|
||||
gpair.DeviceSpan(), maker.row_partitioner->GetRows(0),
|
||||
maker.hist.GetNodeHistogram(0), maker.histogram_rounding,
|
||||
!use_shared_memory_histograms);
|
||||
|
||||
DeviceHistogram<GradientSumT>& d_hist = maker.hist;
|
||||
|
||||
auto node_histogram = d_hist.GetNodeHistogram(0);
|
||||
// d_hist.data stored in float, not gradient pair
|
||||
@@ -115,6 +120,7 @@ void TestBuildHist(bool use_shared_memory_histograms) {
|
||||
std::vector<GradientPairPrecise> solution = GetHostHistGpair();
|
||||
std::cout << std::fixed;
|
||||
for (size_t i = 0; i < h_result.size(); ++i) {
|
||||
ASSERT_FALSE(std::isnan(h_result[i].GetGrad()));
|
||||
EXPECT_NEAR(h_result[i].GetGrad(), solution[i].GetGrad(), 0.01f);
|
||||
EXPECT_NEAR(h_result[i].GetHess(), solution[i].GetHess(), 0.01f);
|
||||
}
|
||||
@@ -158,7 +164,8 @@ TEST(GpuHist, ApplySplit) {
|
||||
HostDeviceVector<FeatureType> feature_types(10, FeatureType::kCategorical);
|
||||
feature_types.SetDevice(bparam.gpu_id);
|
||||
tree::GPUHistMakerDevice<GradientPairPrecise> updater(
|
||||
0, impl, feature_types.ConstDeviceSpan(), n_rows, tparam, 0, n_cols, true, bparam);
|
||||
0, impl, feature_types.ConstDeviceSpan(), n_rows, tparam, 0, n_cols,
|
||||
bparam);
|
||||
updater.ApplySplit(candidate, &tree);
|
||||
|
||||
ASSERT_EQ(tree.GetSplitTypes().size(), 3);
|
||||
@@ -217,8 +224,8 @@ TEST(GpuHist, EvaluateRootSplit) {
|
||||
// Initialize GPUHistMakerDevice
|
||||
auto page = BuildEllpackPage(kNRows, kNCols);
|
||||
BatchParam batch_param{};
|
||||
GPUHistMakerDevice<GradientPairPrecise>
|
||||
maker(0, page.get(), {}, kNRows, param, kNCols, kNCols, true, batch_param);
|
||||
GPUHistMakerDevice<GradientPairPrecise> maker(
|
||||
0, page.get(), {}, kNRows, param, kNCols, kNCols, batch_param);
|
||||
// Initialize GPUHistMakerDevice::node_sum_gradients
|
||||
maker.node_sum_gradients = {};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user