From 7f399eac8baa0145fb59975832ea2a30251a2b24 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Wed, 22 Dec 2021 08:41:35 +0800
Subject: [PATCH] Use double for GPU Hist node sum. (#7507)
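
Single-precision accumulation of per-node gradient sums loses accuracy
as the number of rows grows, which costs accuracy in split evaluation.
Switch the node sums in GPU Hist to double precision
(GradientPairPrecise):

- EvaluateSplitInputs::parent_sum, the per-feature missing term, and
  DeviceSplitCandidate::{left_sum, right_sum} now use GradientPairPrecise.
- Histogram bins keep their GradientSumT type; values are widened where
  they enter a node-level sum.
- The root sum is reduced through a transform iterator that widens each
  GradientPair before the reduction, and the allreduce runs on doubles
  instead of floats.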
---
 src/tree/gpu_hist/evaluate_splits.cu          | 67 +++++++-------
 src/tree/gpu_hist/evaluate_splits.cuh         |  2 +-
 src/tree/updater_gpu_common.cuh               |  8 +-
 src/tree/updater_gpu_hist.cu                  | 88 +++++++++----------
 .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 18 ++--
 5 files changed, 85 insertions(+), 98 deletions(-)

diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu
index 308d7fde6..90ea5a66d 100644
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2020 by XGBoost Contributors
+ * Copyright 2020-2021 by XGBoost Contributors
  */
 #include <limits>
 #include "evaluate_splits.cuh"
@@ -9,15 +9,13 @@ namespace xgboost {
 namespace tree {

 // With constraints
-template <typename GradientPairT>
-XGBOOST_DEVICE float
-LossChangeMissing(const GradientPairT &scan, const GradientPairT &missing,
-                  const GradientPairT &parent_sum,
-                  const GPUTrainingParam &param,
-                  bst_node_t nidx,
-                  bst_feature_t fidx,
-                  TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
-                  bool &missing_left_out) {  // NOLINT
+XGBOOST_DEVICE float LossChangeMissing(const GradientPairPrecise &scan,
+                                       const GradientPairPrecise &missing,
+                                       const GradientPairPrecise &parent_sum,
+                                       const GPUTrainingParam &param, bst_node_t nidx,
+                                       bst_feature_t fidx,
+                                       TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
+                                       bool &missing_left_out) {  // NOLINT
   float parent_gain = CalcGain(param, parent_sum);
   float missing_left_gain =
       evaluator.CalcSplitGain(param, nidx, fidx, GradStats(scan + missing),
@@ -72,32 +70,32 @@ ReduceFeature(common::Span<const GradientSumT> feature_histogram,
 }

 template <typename GradientSumT, typename TempStorageT>
 struct OneHotBin {
-  GradientSumT __device__ operator()(
-      bool thread_active, uint32_t scan_begin,
-      SumCallbackOp<GradientSumT>*,
-      GradientSumT const &missing,
-      EvaluateSplitInputs<GradientSumT> const &inputs, TempStorageT *) {
+  GradientSumT __device__ operator()(bool thread_active, uint32_t scan_begin,
+                                     SumCallbackOp<GradientSumT> *,
+                                     GradientPairPrecise const &missing,
+                                     EvaluateSplitInputs<GradientSumT> const &inputs,
+                                     TempStorageT *) {
     GradientSumT bin = thread_active
                            ? inputs.gradient_histogram[scan_begin + threadIdx.x]
                            : GradientSumT();
-    auto rest = inputs.parent_sum - bin - missing;
-    return rest;
+    auto rest = inputs.parent_sum - GradientPairPrecise(bin) - missing;
+    return GradientSumT{rest};
   }
 };

 template <typename GradientSumT, typename TempStorageT>
 struct UpdateOneHot {
   void __device__ operator()(bool missing_left, uint32_t scan_begin, float gain,
-                             bst_feature_t fidx, GradientSumT const &missing,
+                             bst_feature_t fidx, GradientPairPrecise const &missing,
                              GradientSumT const &bin,
                              EvaluateSplitInputs<GradientSumT> const &inputs,
                              DeviceSplitCandidate *best_split) {
     int split_gidx = (scan_begin + threadIdx.x);
     float fvalue = inputs.feature_values[split_gidx];
-    GradientSumT left = missing_left ? bin + missing : bin;
-    GradientSumT right = inputs.parent_sum - left;
-    best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx,
-                       GradientPair(left), GradientPair(right), true,
+    GradientPairPrecise left =
+        missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin};
+    GradientPairPrecise right = inputs.parent_sum - left;
+    best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right, true,
                        inputs.param);
   }
 };
@@ -105,8 +103,8 @@ struct UpdateOneHot {
 template <typename GradientSumT, typename TempStorageT>
 struct NumericBin {
   GradientSumT __device__ operator()(bool thread_active, uint32_t scan_begin,
-                                     SumCallbackOp<GradientSumT>* prefix_callback,
-                                     GradientSumT const &missing,
+                                     SumCallbackOp<GradientSumT> *prefix_callback,
+                                     GradientPairPrecise const &missing,
                                      EvaluateSplitInputs<GradientSumT> inputs,
                                      TempStorageT *temp_storage) {
     GradientSumT bin = thread_active
@@ -120,7 +118,7 @@ struct NumericBin {
 template <typename GradientSumT, typename TempStorageT>
 struct UpdateNumeric {
   void __device__ operator()(bool missing_left, uint32_t scan_begin, float gain,
-                             bst_feature_t fidx, GradientSumT const &missing,
+                             bst_feature_t fidx, GradientPairPrecise const &missing,
                              GradientSumT const &bin,
                              EvaluateSplitInputs<GradientSumT> const &inputs,
                              DeviceSplitCandidate *best_split) {
@@ -133,11 +131,11 @@ struct UpdateNumeric {
     } else {
       fvalue = inputs.feature_values[split_gidx];
     }
-    GradientSumT left = missing_left ? bin + missing : bin;
-    GradientSumT right = inputs.parent_sum - left;
-    best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue,
-                       fidx, GradientPair(left), GradientPair(right),
-                       false, inputs.param);
+    GradientPairPrecise left =
+        missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin};
+    GradientPairPrecise right = inputs.parent_sum - left;
+    best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right, false,
+                       inputs.param);
   }
 };

@@ -164,7 +162,7 @@ __device__ void EvaluateFeature(
       ReduceFeature<BLOCK_THREADS, ReduceT, TempStorageT, GradientSumT>(
           feature_hist, temp_storage);

-  GradientSumT const missing = inputs.parent_sum - feature_sum;
+  GradientPairPrecise const missing = inputs.parent_sum - GradientPairPrecise{feature_sum};

   float const null_gain = -std::numeric_limits<bst_float>::infinity();
   SumCallbackOp<GradientSumT> prefix_op = SumCallbackOp<GradientSumT>();
@@ -177,11 +175,8 @@ __device__ void EvaluateFeature(
     bool missing_left = true;
     float gain = null_gain;
     if (thread_active) {
-      gain = LossChangeMissing(bin, missing, inputs.parent_sum, inputs.param,
-                               inputs.nidx,
-                               fidx,
-                               evaluator,
-                               missing_left);
+      gain = LossChangeMissing(GradientPairPrecise{bin}, missing, inputs.parent_sum, inputs.param,
+                               inputs.nidx, fidx, evaluator, missing_left);
     }
     __syncthreads();
diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh
index e30901134..fd4abe786 100644
--- a/src/tree/gpu_hist/evaluate_splits.cuh
+++ b/src/tree/gpu_hist/evaluate_splits.cuh
@@ -15,7 +15,7 @@ namespace tree {
 template <typename GradientSumT>
 struct EvaluateSplitInputs {
   int nidx;
-  GradientSumT parent_sum;
+  GradientPairPrecise parent_sum;
   GPUTrainingParam param;
   common::Span<const bst_feature_t> feature_set;
   common::Span<FeatureType const> feature_types;
diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh
index 4219a3399..fc4fe6f89 100644
--- a/src/tree/updater_gpu_common.cuh
+++ b/src/tree/updater_gpu_common.cuh
@@ -61,8 +61,8 @@ struct DeviceSplitCandidate {
   float fvalue {0};
   bool is_cat { false };

-  GradientPair left_sum;
-  GradientPair right_sum;
+  GradientPairPrecise left_sum;
+  GradientPairPrecise right_sum;

   XGBOOST_DEVICE DeviceSplitCandidate() {}  // NOLINT
@@ -78,8 +78,8 @@ struct DeviceSplitCandidate {
   XGBOOST_DEVICE void Update(float loss_chg_in, DefaultDirection dir_in,
                              float fvalue_in, int findex_in,
-                             GradientPair left_sum_in,
-                             GradientPair right_sum_in,
+                             GradientPairPrecise left_sum_in,
+                             GradientPairPrecise right_sum_in,
                              bool cat, const GPUTrainingParam& param) {
     if (loss_chg_in > loss_chg &&
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 9b673171f..48d58074e 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -173,7 +173,7 @@ struct GPUHistMakerDevice {
   dh::caching_device_vector<int> monotone_constraints;

   /*! \brief Sum gradient for each node. */
-  std::vector<GradientPair> node_sum_gradients;
+  std::vector<GradientPairPrecise> node_sum_gradients;

   TrainParam param;
@@ -239,8 +239,7 @@ struct GPUHistMakerDevice {
     dh::safe_cuda(cudaSetDevice(device_id));
     tree_evaluator = TreeEvaluator(param, dmat->Info().num_col_, device_id);
     this->interaction_constraints.Reset();
-    std::fill(node_sum_gradients.begin(), node_sum_gradients.end(),
-              GradientPair());
+    std::fill(node_sum_gradients.begin(), node_sum_gradients.end(), GradientPairPrecise{});

     if (d_gpair.size() != dh_gpair->Size()) {
       d_gpair.resize(dh_gpair->Size());
@@ -260,7 +259,7 @@ struct GPUHistMakerDevice {
   }

-  DeviceSplitCandidate EvaluateRootSplit(GradientPair root_sum) {
+  DeviceSplitCandidate EvaluateRootSplit(GradientPairPrecise root_sum) {
     int nidx = RegTree::kRoot;
     dh::TemporaryArray<DeviceSplitCandidate> splits_out(1);
     GPUTrainingParam gpu_param(param);
@@ -269,16 +268,15 @@ struct GPUHistMakerDevice {
     common::Span<bst_feature_t> feature_set =
         interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
     auto matrix = page->GetDeviceAccessor(device_id);
-    EvaluateSplitInputs<GradientSumT> inputs{
-        nidx,
-        {root_sum.GetGrad(), root_sum.GetHess()},
-        gpu_param,
-        feature_set,
-        feature_types,
-        matrix.feature_segments,
-        matrix.gidx_fvalue_map,
-        matrix.min_fvalue,
-        hist.GetNodeHistogram(nidx)};
+    EvaluateSplitInputs<GradientSumT> inputs{nidx,
+                                             root_sum,
+                                             gpu_param,
+                                             feature_set,
+                                             feature_types,
+                                             matrix.feature_segments,
+                                             matrix.gidx_fvalue_map,
+                                             matrix.min_fvalue,
+                                             hist.GetNodeHistogram(nidx)};
     auto gain_calc = tree_evaluator.GetEvaluator<GPUTrainingParam>();
     EvaluateSingleSplit(dh::ToSpan(splits_out), gain_calc, inputs);
     std::vector<DeviceSplitCandidate> result(1);
@@ -307,28 +305,24 @@ struct GPUHistMakerDevice {
                                       left_nidx);
     auto matrix = page->GetDeviceAccessor(device_id);

-    EvaluateSplitInputs<GradientSumT> left{
-        left_nidx,
-        {candidate.split.left_sum.GetGrad(),
-         candidate.split.left_sum.GetHess()},
-        gpu_param,
-        left_feature_set,
-        feature_types,
-        matrix.feature_segments,
-        matrix.gidx_fvalue_map,
-        matrix.min_fvalue,
-        hist.GetNodeHistogram(left_nidx)};
-    EvaluateSplitInputs<GradientSumT> right{
-        right_nidx,
-        {candidate.split.right_sum.GetGrad(),
-         candidate.split.right_sum.GetHess()},
-        gpu_param,
-        right_feature_set,
-        feature_types,
-        matrix.feature_segments,
-        matrix.gidx_fvalue_map,
-        matrix.min_fvalue,
-        hist.GetNodeHistogram(right_nidx)};
+    EvaluateSplitInputs<GradientSumT> left{left_nidx,
+                                           candidate.split.left_sum,
+                                           gpu_param,
+                                           left_feature_set,
+                                           feature_types,
+                                           matrix.feature_segments,
+                                           matrix.gidx_fvalue_map,
+                                           matrix.min_fvalue,
+                                           hist.GetNodeHistogram(left_nidx)};
+    EvaluateSplitInputs<GradientSumT> right{right_nidx,
+                                            candidate.split.right_sum,
+                                            gpu_param,
+                                            right_feature_set,
+                                            feature_types,
+                                            matrix.feature_segments,
+                                            matrix.gidx_fvalue_map,
+                                            matrix.min_fvalue,
+                                            hist.GetNodeHistogram(right_nidx)};
     auto d_splits_out = dh::ToSpan(splits_out);
     EvaluateSplits(d_splits_out, tree_evaluator.GetEvaluator<GPUTrainingParam>(), left, right);
     dh::TemporaryArray<GPUExpandEntry> entries(2);
@@ -502,12 +496,11 @@ struct GPUHistMakerDevice {
     auto d_ridx = row_partitioner->GetRows();

     GPUTrainingParam param_d(param);
-    dh::TemporaryArray<GradientPair> device_node_sum_gradients(node_sum_gradients.size());
+    dh::TemporaryArray<GradientPairPrecise> device_node_sum_gradients(node_sum_gradients.size());

-    dh::safe_cuda(
-        cudaMemcpyAsync(device_node_sum_gradients.data().get(), node_sum_gradients.data(),
-                        sizeof(GradientPair) * node_sum_gradients.size(),
-                        cudaMemcpyHostToDevice));
+    dh::safe_cuda(cudaMemcpyAsync(device_node_sum_gradients.data().get(), node_sum_gradients.data(),
+                                  sizeof(GradientPairPrecise) * node_sum_gradients.size(),
+                                  cudaMemcpyHostToDevice));
     auto d_position = row_partitioner->GetPosition();
     auto d_node_sum_gradients = device_node_sum_gradients.data().get();
     auto evaluator = tree_evaluator.GetEvaluator();
@@ -623,13 +616,12 @@ struct GPUHistMakerDevice {
   GPUExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
     constexpr bst_node_t kRootNIdx = 0;
     dh::XGBCachingDeviceAllocator<char> alloc;
-    GradientPair root_sum = dh::Reduce(
-        thrust::cuda::par(alloc),
-        thrust::device_ptr<GradientPair const>(gpair.data()),
-        thrust::device_ptr<GradientPair const>(gpair.data() + gpair.size()),
-        GradientPair{}, thrust::plus<GradientPair>{});
-    rabit::Allreduce<rabit::op::Sum, float>(reinterpret_cast<float*>(&root_sum),
-                                            2);
+    auto gpair_it = dh::MakeTransformIterator<GradientPairPrecise>(
+        dh::tbegin(gpair), [] __device__(auto const& gpair) { return GradientPairPrecise{gpair}; });
+    GradientPairPrecise root_sum =
+        dh::Reduce(thrust::cuda::par(alloc), gpair_it, gpair_it + gpair.size(),
+                   GradientPairPrecise{}, thrust::plus<GradientPairPrecise>{});
+    rabit::Allreduce<rabit::op::Sum, double>(reinterpret_cast<double*>(&root_sum), 2);
     this->BuildHist(kRootNIdx);
     this->AllReduceHist(kRootNIdx, reducer);
diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
index d90d47b15..0916d1181 100644
--- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
+++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -17,7 +17,7 @@ auto ZeroParam() {

 void TestEvaluateSingleSplit(bool is_categorical) {
   thrust::device_vector<DeviceSplitCandidate> out_splits(1);
-  GradientPair parent_sum(0.0, 1.0);
+  GradientPairPrecise parent_sum(0.0, 1.0);
   TrainParam tparam = ZeroParam();
   GPUTrainingParam param{tparam};
@@ -73,7 +73,7 @@ TEST(GpuHist, EvaluateCategoricalSplit) {

 TEST(GpuHist, EvaluateSingleSplitMissing) {
   thrust::device_vector<DeviceSplitCandidate> out_splits(1);
-  GradientPair parent_sum(1.0, 1.5);
+  GradientPairPrecise parent_sum(1.0, 1.5);
   TrainParam tparam = ZeroParam();
   GPUTrainingParam param{tparam};
@@ -104,8 +104,8 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
   EXPECT_EQ(result.findex, 0);
   EXPECT_EQ(result.fvalue, 1.0);
   EXPECT_EQ(result.dir, kRightDir);
-  EXPECT_EQ(result.left_sum, GradientPair(-0.5, 0.5));
-  EXPECT_EQ(result.right_sum, GradientPair(1.5, 1.0));
+  EXPECT_EQ(result.left_sum, GradientPairPrecise(-0.5, 0.5));
+  EXPECT_EQ(result.right_sum, GradientPairPrecise(1.5, 1.0));
 }

 TEST(GpuHist, EvaluateSingleSplitEmpty) {
@@ -130,7 +130,7 @@ TEST(GpuHist, EvaluateSingleSplitEmpty) {
 // Feature 0 has a better split, but the algorithm must select feature 1
 TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
   thrust::device_vector<DeviceSplitCandidate> out_splits(1);
-  GradientPair parent_sum(0.0, 1.0);
+  GradientPairPrecise parent_sum(0.0, 1.0);
   TrainParam tparam = ZeroParam();
   tparam.UpdateAllowUnknown(Args{});
   GPUTrainingParam param{tparam};
@@ -164,14 +164,14 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
   DeviceSplitCandidate result = out_splits[0];
   EXPECT_EQ(result.findex, 1);
   EXPECT_EQ(result.fvalue, 11.0);
-  EXPECT_EQ(result.left_sum, GradientPair(-0.5, 0.5));
-  EXPECT_EQ(result.right_sum, GradientPair(0.5, 0.5));
+  EXPECT_EQ(result.left_sum, GradientPairPrecise(-0.5, 0.5));
+  EXPECT_EQ(result.right_sum, GradientPairPrecise(0.5, 0.5));
 }

 // Features 0 and 1 have identical gain, the algorithm must select 0
 TEST(GpuHist, EvaluateSingleSplitBreakTies) {
   thrust::device_vector<DeviceSplitCandidate> out_splits(1);
-  GradientPair parent_sum(0.0, 1.0);
+  GradientPairPrecise parent_sum(0.0, 1.0);
   TrainParam tparam = ZeroParam();
   tparam.UpdateAllowUnknown(Args{});
   GPUTrainingParam param{tparam};
@@ -209,7 +209,7 @@ TEST(GpuHist, EvaluateSplits) {
   thrust::device_vector<DeviceSplitCandidate> out_splits(2);
-  GradientPair parent_sum(0.0, 1.0);
+  GradientPairPrecise parent_sum(0.0, 1.0);
   TrainParam tparam = ZeroParam();
   tparam.UpdateAllowUnknown(Args{});
   GPUTrainingParam param{tparam};
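
---

Two illustrative sketches (not part of the commit; row counts and helper
names below are invented for the demonstration):

1. Why the node sum needs double. With reg:squarederror every row has a
   hessian of exactly 1.0, and a single-precision sum saturates at
   2^24 = 16777216: adding 1.0f to 16777216.0f rounds back to 16777216.0f,
   so every row past that point silently vanishes from the Hessian total.
   A minimal host-side sketch using only the C++ standard library:

     #include <cstdio>

     int main() {
       const long kRows = 1L << 25;  // 33,554,432 rows
       float hess_sum_f = 0.0f;      // single-precision node sum (before)
       double hess_sum_d = 0.0;      // double-precision node sum (after)
       for (long i = 0; i < kRows; ++i) {
         hess_sum_f += 1.0f;         // stalls once the sum reaches 2^24
         hess_sum_d += 1.0;          // exact at these magnitudes
       }
       std::printf("float : %.1f\n", hess_sum_f);  // 16777216.0
       std::printf("double: %.1f\n", hess_sum_d);  // 33554432.0
     }

2. The widening reduction in InitRoot. The patch converts each
   GradientPair to GradientPairPrecise through a transform iterator as the
   reduction consumes it, so no temporary double buffer is materialized.
   The same pattern in plain Thrust, without xgboost's dh:: wrappers:

     #include <cstdio>
     #include <thrust/device_vector.h>
     #include <thrust/functional.h>
     #include <thrust/iterator/transform_iterator.h>
     #include <thrust/reduce.h>

     struct ToDouble {
       __host__ __device__ double operator()(float g) const {
         return static_cast<double>(g);
       }
     };

     int main() {
       thrust::device_vector<float> grad(1 << 25, 1.0f);
       // Widen each element on the fly; the accumulator type is double.
       auto it = thrust::make_transform_iterator(grad.begin(), ToDouble{});
       double sum = thrust::reduce(it, it + grad.size(), 0.0,
                                   thrust::plus<double>{});
       std::printf("%.1f\n", sum);  // 33554432.0 (a float sum gives 16777216.0)
     }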