[breaking] Use integer atomic for GPU histogram. (#7180)

On GPU we use rouding factor to truncate the gradient for deterministic results. This PR changes the gradient representation to fixed point number with exponent aligned with rounding factor.

    [breaking] Drop non-deterministic histogram.
    Use fixed point for shared memory.

This PR is to improve the performance of GPU Hist. 

Co-authored-by: Andy Adinets <aadinets@nvidia.com>
This commit is contained in:
Jiaming Yuan
2021-08-28 05:17:05 +08:00
committed by GitHub
parent e7d7ab6bc3
commit 7a1d67f9cb
11 changed files with 295 additions and 142 deletions

View File

@@ -46,14 +46,11 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
struct GPUHistMakerTrainParam
: public XGBoostParameter<GPUHistMakerTrainParam> {
bool single_precision_histogram;
bool deterministic_histogram;
bool debug_synchronize;
// declare parameters
DMLC_DECLARE_PARAMETER(GPUHistMakerTrainParam) {
DMLC_DECLARE_FIELD(single_precision_histogram).set_default(false).describe(
"Use single precision to build histograms.");
DMLC_DECLARE_FIELD(deterministic_histogram).set_default(true).describe(
"Pre-round the gradient for obtaining deterministic gradient histogram.");
DMLC_DECLARE_FIELD(debug_synchronize).set_default(false).describe(
"Check if all distributed tree are identical after tree construction.");
}
@@ -153,7 +150,7 @@ class DeviceHistogram {
*/
common::Span<GradientSumT> GetNodeHistogram(int nidx) {
CHECK(this->HistogramExists(nidx));
auto ptr = data_.data().get() + nidx_map_[nidx];
auto ptr = data_.data().get() + nidx_map_.at(nidx);
return common::Span<GradientSumT>(
reinterpret_cast<GradientSumT*>(ptr), n_bins_);
}
@@ -179,9 +176,8 @@ struct GPUHistMakerDevice {
std::vector<GradientPair> node_sum_gradients;
TrainParam param;
bool deterministic_histogram;
GradientSumT histogram_rounding;
HistRounding<GradientSumT> histogram_rounding;
dh::PinnedMemory pinned;
@@ -205,7 +201,6 @@ struct GPUHistMakerDevice {
TrainParam _param,
uint32_t column_sampler_seed,
uint32_t n_features,
bool deterministic_histogram,
BatchParam _batch_param)
: device_id(_device_id),
page(_page),
@@ -214,7 +209,6 @@ struct GPUHistMakerDevice {
tree_evaluator(param, n_features, _device_id),
column_sampler(column_sampler_seed),
interaction_constraints(param, n_features),
deterministic_histogram{deterministic_histogram},
batch_param(_batch_param) {
sampler.reset(new GradientBasedSampler(
page, _n_rows, batch_param, param.subsample, param.sampling_method));
@@ -227,9 +221,9 @@ struct GPUHistMakerDevice {
// Init histogram
hist.Init(device_id, page->Cuts().TotalBins());
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(device_id));
feature_groups.reset(new FeatureGroups(
page->Cuts(), page->is_dense, dh::MaxSharedMemoryOptin(device_id),
sizeof(GradientSumT)));
feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
dh::MaxSharedMemoryOptin(device_id),
sizeof(GradientSumT)));
}
~GPUHistMakerDevice() { // NOLINT
@@ -263,11 +257,7 @@ struct GPUHistMakerDevice {
page = sample.page;
gpair = sample.gpair;
if (deterministic_histogram) {
histogram_rounding = CreateRoundingFactor<GradientSumT>(this->gpair);
} else {
histogram_rounding = GradientSumT{0.0, 0.0};
}
histogram_rounding = CreateRoundingFactor<GradientSumT>(this->gpair);
row_partitioner.reset(); // Release the device memory first before reallocating
row_partitioner.reset(new RowPartitioner(device_id, sample.sample_rows));
@@ -805,7 +795,6 @@ class GPUHistMakerSpecialised {
param_,
column_sampling_seed,
info_->num_col_,
hist_maker_param_.deterministic_histogram,
batch_param));
p_last_fmat_ = dmat;