diff --git a/src/common/deterministic.cuh b/src/common/deterministic.cuh
new file mode 100644
index 000000000..e7bc2fb28
--- /dev/null
+++ b/src/common/deterministic.cuh
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
+ */
+#ifndef XGBOOST_COMMON_DETERMINISTIC_CUH_
+#define XGBOOST_COMMON_DETERMINISTIC_CUH_
+
+#include <cmath>
+#include <limits>  // std::numeric_limits
+
+#include "xgboost/base.h"  // XGBOOST_DEVICE
+
+namespace xgboost {
+namespace common {
+// Following 2 functions are slightly modified version of fbcuda.
+
+/**
+ * \brief Constructs a rounding factor used to truncate elements in a sum such that the
+ *        sum of the truncated elements is the same no matter what the order of the sum
+ *        is.
+ *
+ * Algorithm 5: Reproducible Sequential Sum in 'Fast Reproducible Floating-Point
+ * Summation' by Demmel and Nguyen.
+ */
+template <typename T>
+XGBOOST_DEVICE T CreateRoundingFactor(T max_abs, int n) {
+  T delta = max_abs / (static_cast<T>(1.0) -
+                       static_cast<T>(2.0) * static_cast<T>(n) * std::numeric_limits<T>::epsilon());
+
+  // Calculate ceil(log_2(delta)).
+  // frexpf() calculates exp and returns `x` such that
+  // delta = x * 2^exp, where `x` in (-1.0, -0.5] U [0.5, 1).
+  // Because |x| < 1, exp is exactly ceil(log_2(delta)).
+  int exp;
+  std::frexp(delta, &exp);
+
+  // return M = 2 ^ ceil(log_2(delta))
+  return std::ldexp(static_cast<T>(1.0), exp);
+}
+
+template <typename T>
+XGBOOST_DEVICE T TruncateWithRounding(T const rounding_factor, T const x) {
+  return (rounding_factor + x) - rounding_factor;
+}
+}  // namespace common
+}  // namespace xgboost
+#endif  // XGBOOST_COMMON_DETERMINISTIC_CUH_
diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu
index f02fb909e..1344ecf4f 100644
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -9,6 +9,7 @@
 #include <limits>
 
 #include "../../common/device_helpers.cuh"
+#include "../../common/deterministic.cuh"
 #include "../../data/ellpack_page.cuh"
 #include "histogram.cuh"
 #include "row_partitioner.cuh"
@@ -16,37 +17,6 @@
 
 namespace xgboost {
 namespace tree {
-// Following 2 functions are slightly modified version of fbcuda.
-
-/* \brief Constructs a rounding factor used to truncate elements in a sum such that the
-   sum of the truncated elements is the same no matter what the order of the sum is.
-
- * Algorithm 5: Reproducible Sequential Sum in 'Fast Reproducible Floating-Point
- * Summation' by Demmel and Nguyen
-
- * In algorithm 5 the bound is calculated as $max(|v_i|) * n$. Here we use the bound
- *
- * \begin{equation}
- *   max( fl(\sum^{V}_{v_i>0}{v_i}), fl(\sum^{V}_{v_i<0}|v_i|) )
- * \end{equation}
- *
- * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
- */
-template <typename T>
-T CreateRoundingFactor(T max_abs, int n) {
-  T delta = max_abs / (static_cast<T>(1.0) - 2 * n * std::numeric_limits<T>::epsilon());
-
-  // Calculate ceil(log_2(delta)).
-  // frexpf() calculates exp and returns `x` such that
-  // delta = x * 2^exp, where `x` in (-1.0, -0.5] U [0.5, 1).
-  // Because |x| < 1, exp is exactly ceil(log_2(delta)).
-  int exp;
-  std::frexp(delta, &exp);
-
-  // return M = 2 ^ ceil(log_2(delta))
-  return std::ldexp(static_cast<T>(1.0), exp);
-}
-
 namespace {
 struct Pair {
   GradientPair first;
@@ -72,6 +42,16 @@ struct Clip : public thrust::unary_function<GradientPair, Pair> {
   }
 };
 
+/**
+ * In algorithm 5 (see common::CreateRoundingFactor) the bound is calculated as
+ * $max(|v_i|) * n$. Here we use the bound:
+ *
+ * \begin{equation}
+ *   max( fl(\sum^{V}_{v_i>0}{v_i}), fl(\sum^{V}_{v_i<0}|v_i|) )
+ * \end{equation}
+ *
+ * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
+ */
 GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
   using GradientSumT = GradientPairPrecise;
   using T = typename GradientSumT::ValueT;
@@ -90,10 +70,11 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
   std::size_t total_rows = gpair.size();
   collective::Allreduce<collective::Operation::kSum>(&total_rows, 1);
 
-  auto histogram_rounding = GradientSumT{
-      CreateRoundingFactor<T>(std::max(positive_sum.GetGrad(), negative_sum.GetGrad()), total_rows),
-      CreateRoundingFactor<T>(std::max(positive_sum.GetHess(), negative_sum.GetHess()),
-                              total_rows)};
+  auto histogram_rounding =
+      GradientSumT{common::CreateRoundingFactor<T>(
+                       std::max(positive_sum.GetGrad(), negative_sum.GetGrad()), total_rows),
+                   common::CreateRoundingFactor<T>(
+                       std::max(positive_sum.GetHess(), negative_sum.GetHess()), total_rows)};
 
   using IntT = typename GradientPairInt64::ValueT;
 
diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh
index 5c3c955d1..eb9008d48 100644
--- a/src/tree/gpu_hist/histogram.cuh
+++ b/src/tree/gpu_hist/histogram.cuh
@@ -37,7 +37,8 @@ private:
   GradientPairPrecise to_fixed_point_;
   /* Convert fixed point representation back to floating point. */
   GradientPairPrecise to_floating_point_;
-public:
+
+ public:
   explicit GradientQuantiser(common::Span<GradientPair const> gpair);
   XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
     auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),