Extract floating point rounding routines. (#8771)

2023-02-12 04:26:41 +08:00 · 2023-02-12 04:26:41 +08:00 · 70c9b885ef
commit 70c9b885ef
parent e9c178f402
3 changed files with 64 additions and 36 deletions
--- a/src/common/deterministic.cuh
+++ b/src/common/deterministic.cuh
@ -0,0 +1,46 @@
 /**
 * Copyright 2020-2023 by XGBoost Contributors
 */
 #ifndef XGBOOST_COMMON_DETERMINISTIC_CUH_
 #define XGBOOST_COMMON_DETERMINISTIC_CUH_
 #include <cmath>
 #include <limits>          // std::numeric_limits
 #include "xgboost/base.h"  // XGBOOST_DEVICE
 namespace xgboost {
 namespace common {
 // The following 2 functions are slightly modified versions of the ones found in fbcuda.
 /**
 * \brief Constructs a rounding factor used to truncate elements in a sum such that the
 *        sum of the truncated elements is the same no matter what the order of the sum
 *        is.
 *
 * Algorithm 5: Reproducible Sequential Sum in 'Fast Reproducible Floating-Point
 * Summation' by Demmel and Nguyen.
 */
 template <typename T>
 XGBOOST_DEVICE T CreateRoundingFactor(T max_abs, int n) {
  // Spell the constants out in T so that no part of the bound is computed in `int`.
  T const one = static_cast<T>(1.0);
  T const two = static_cast<T>(2.0);
  T const eps = std::numeric_limits<T>::epsilon();
  // Inflated bound: max_abs / (1 - 2 * n * eps).
  T const denominator = one - two * static_cast<T>(n) * eps;
  T const bound = max_abs / denominator;
  // std::frexp() splits `bound` into x * 2^exponent with |x| in [0.5, 1) (or x == 0 and
  // exponent == 0 for a zero input).  Only the exponent is needed: 2^exponent is the
  // smallest power of two strictly greater than |bound|, i.e. exponent equals
  // floor(log_2(|bound|)) + 1 for a non-zero bound.
  int exponent = 0;
  std::frexp(bound, &exponent);
  // M = 2^exponent
  return std::ldexp(one, exponent);
 }
 /**
 * \brief Adds `rounding_factor` to `x` and subtracts it again, returning the result.
 *
 * The addition is rounded to T before the subtraction, so the bits of `x` that do not
 * fit next to `rounding_factor` are dropped from the returned value.
 */
 template <typename T>
 XGBOOST_DEVICE T TruncateWithRounding(T const rounding_factor, T const x) {
  T const shifted = rounding_factor + x;
  return shifted - rounding_factor;
 }
 }  // namespace common
 }  // namespace xgboost
 #endif  // XGBOOST_COMMON_DETERMINISTIC_CUH_
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@ -9,6 +9,7 @@
 #include <limits>
 #include "../../common/device_helpers.cuh"
 #include "../../common/deterministic.cuh"
 #include "../../data/ellpack_page.cuh"
 #include "histogram.cuh"
 #include "row_partitioner.cuh"
@ -16,37 +17,6 @@
 namespace xgboost {
 namespace tree {
 // Following 2 functions are slightly modified version of fbcuda.
 /* \brief Constructs a rounding factor used to truncate elements in a sum such that the
   sum of the truncated elements is the same no matter what the order of the sum is.
 * Algorithm 5: Reproducible Sequential Sum in 'Fast Reproducible Floating-Point
 * Summation' by Demmel and Nguyen
 * In algorithm 5 the bound is calculated as $max(|v_i|) * n$.  Here we use the bound
 *
 * \begin{equation}
 *   max( fl(\sum^{V}_{v_i>0}{v_i}), fl(\sum^{V}_{v_i<0}|v_i|) )
 * \end{equation}
 *
 * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
 */
 template <typename T>
 T CreateRoundingFactor(T max_abs, int n) {
  // Inflate the bound to max_abs / (1 - 2 * n * eps).  Every operand is converted to T
  // before multiplying: evaluating `2 * n` in `int` overflows once n > INT_MAX / 2, which
  // flips the sign of the correction term and yields a rounding factor that is too small.
  T delta = max_abs / (static_cast<T>(1.0) -
                       static_cast<T>(2.0) * static_cast<T>(n) * std::numeric_limits<T>::epsilon());
  // std::frexp() calculates exp and returns `x` such that
  // delta = x * 2^exp, where `x` in (-1.0, -0.5] U [0.5, 1) (x == 0, exp == 0 for zero).
  // Because |x| < 1, 2^exp is a power of two strictly greater than |delta|; only exp is
  // used, the returned mantissa is discarded.
  int exp = 0;
  std::frexp(delta, &exp);
  // return M = 2^exp
  return std::ldexp(static_cast<T>(1.0), exp);
 }
 namespace {
 struct Pair {
  GradientPair first;
@ -72,6 +42,16 @@ struct Clip : public thrust::unary_function<GradientPair, Pair> {
  }
 };
 /**
 * In algorithm 5 (see common::CreateRoundingFactor) the bound is calculated as
 * $max(|v_i|) * n$.  Here we use the bound:
 *
 * \begin{equation}
 *   max( fl(\sum^{V}_{v_i>0}{v_i}), fl(\sum^{V}_{v_i<0}|v_i|) )
 * \end{equation}
 *
 * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
 */
 GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
  using GradientSumT = GradientPairPrecise;
  using T = typename GradientSumT::ValueT;
@ -90,10 +70,11 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
  std::size_t total_rows = gpair.size();
  collective::Allreduce<collective::Operation::kSum>(&total_rows, 1);
-  auto histogram_rounding = GradientSumT{
+  auto histogram_rounding =
-      CreateRoundingFactor<T>(std::max(positive_sum.GetGrad(), negative_sum.GetGrad()), total_rows),
+      GradientSumT{common::CreateRoundingFactor<T>(
-      CreateRoundingFactor<T>(std::max(positive_sum.GetHess(), negative_sum.GetHess()),
+                       std::max(positive_sum.GetGrad(), negative_sum.GetGrad()), total_rows),
-                              total_rows)};
+                   common::CreateRoundingFactor<T>(
                       std::max(positive_sum.GetHess(), negative_sum.GetHess()), total_rows)};
  using IntT = typename GradientPairInt64::ValueT;
--- a/src/tree/gpu_hist/histogram.cuh
+++ b/src/tree/gpu_hist/histogram.cuh
@ -37,7 +37,8 @@ private:
  GradientPairPrecise to_fixed_point_;
  /* Convert fixed point representation back to floating point. */
  GradientPairPrecise to_floating_point_;
-public:
+
 public:
  explicit GradientQuantiser(common::Span<GradientPair const> gpair);
  XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
    auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),