Extract floating point rounding routines. (#8771)
This commit is contained in:
parent
e9c178f402
commit
70c9b885ef
46
src/common/deterministic.cuh
Normal file
46
src/common/deterministic.cuh
Normal file
@ -0,0 +1,46 @@
|
||||
/**
|
||||
* Copyright 2020-2023 by XGBoost Contributors
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_DETERMINISTIC_CUH_
|
||||
#define XGBOOST_COMMON_DETERMINISTIC_CUH_
|
||||
|
||||
#include <cmath>
|
||||
#include <limits> // std::numeric_limits
|
||||
|
||||
#include "xgboost/base.h" // XGBOOST_DEVICE
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
// The following two functions are slightly modified versions of the ones in fbcuda.
|
||||
|
||||
/**
 * \brief Constructs a rounding factor used to truncate elements in a sum such that the
 *        sum of the truncated elements is the same no matter what the order of the sum
 *        is.
 *
 * Algorithm 5: Reproducible Sequential Sum in 'Fast Reproducible Floating-Point
 * Summation' by Demmel and Nguyen.
 *
 * \param max_abs Bound on the magnitude of the values being summed.
 * \param n       Number of elements in the sum.
 *
 * \return A power of two, 2^e, where e is the binary exponent reported by std::frexp
 *         for the inflated bound max_abs / (1 - 2 * n * epsilon).
 */
template <typename T>
XGBOOST_DEVICE T CreateRoundingFactor(T max_abs, int n) {
  // Relative slack 2 * n * eps, evaluated entirely in T.
  T const slack =
      static_cast<T>(2.0) * static_cast<T>(n) * std::numeric_limits<T>::epsilon();
  T const bound = max_abs / (static_cast<T>(1.0) - slack);

  // std::frexp() decomposes bound = x * 2^exponent with |x| in [0.5, 1) for non-zero
  // finite input (and sets the exponent to 0 when the input is 0). Only the exponent
  // is needed here; the mantissa is discarded. The exponent equals
  // floor(log_2(|bound|)) + 1, i.e. ceil(log_2(bound)) unless bound is an exact power
  // of two, in which case it is one larger.
  int exponent;
  std::frexp(bound, &exponent);

  // M = 2 ^ exponent
  T const one = static_cast<T>(1.0);
  return std::ldexp(one, exponent);
}
|
||||
|
||||
/**
 * \brief Truncates `x` by adding it to `rounding_factor` and subtracting the factor
 *        again.
 *
 * The intermediate sum is rounded to the precision available at the magnitude of
 * `rounding_factor`, so the low-order part of `x` that does not fit is dropped.
 *
 * \param rounding_factor Factor to truncate against (see CreateRoundingFactor).
 * \param x               Value to truncate.
 *
 * \return (rounding_factor + x) - rounding_factor, evaluated in T.
 */
template <typename T>
XGBOOST_DEVICE T TruncateWithRounding(T const rounding_factor, T const x) {
  T const shifted = rounding_factor + x;
  return shifted - rounding_factor;
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_COMMON_DETERMINISTIC_CUH_
|
||||
@ -9,6 +9,7 @@
|
||||
#include <limits>
|
||||
|
||||
#include "../../common/device_helpers.cuh"
|
||||
#include "../../common/deterministic.cuh"
|
||||
#include "../../data/ellpack_page.cuh"
|
||||
#include "histogram.cuh"
|
||||
#include "row_partitioner.cuh"
|
||||
@ -16,37 +17,6 @@
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
// Following 2 functions are slightly modified version of fbcuda.
|
||||
|
||||
/**
 * \brief Constructs a rounding factor used to truncate elements in a sum such that the
 *        sum of the truncated elements is the same no matter what the order of the sum
 *        is.
 *
 * Algorithm 5: Reproducible Sequential Sum in 'Fast Reproducible Floating-Point
 * Summation' by Demmel and Nguyen.
 *
 * In algorithm 5 the bound is calculated as $max(|v_i|) * n$. Here we use the bound
 *
 * \begin{equation}
 *   max( fl(\sum^{V}_{v_i>0}{v_i}), fl(\sum^{V}_{v_i<0}|v_i|) )
 * \end{equation}
 *
 * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
 *
 * \param max_abs Bound on the magnitude of the values being summed.
 * \param n       Number of elements in the sum.
 *
 * \return A power of two, 2^e, where e is the binary exponent reported by std::frexp
 *         for the inflated bound max_abs / (1 - 2 * n * epsilon).
 */
template <typename T>
T CreateRoundingFactor(T max_abs, int n) {
  // Convert to T before multiplying: `2 * n` evaluated in int overflows (undefined
  // behaviour) once n reaches 2^30.
  T delta = max_abs / (static_cast<T>(1.0) -
                       static_cast<T>(2.0) * static_cast<T>(n) *
                           std::numeric_limits<T>::epsilon());

  // std::frexp() calculates exp and returns `x` such that
  // delta = x * 2^exp, where `x` in (-1.0, -0.5] U [0.5, 1) for non-zero finite delta
  // (exp is set to 0 when delta is 0). Hence exp = floor(log_2(|delta|)) + 1, which is
  // ceil(log_2(delta)) unless delta is an exact power of two, where it is one larger.
  int exp;
  std::frexp(delta, &exp);

  // return M = 2 ^ exp
  return std::ldexp(static_cast<T>(1.0), exp);
}
|
||||
|
||||
namespace {
|
||||
struct Pair {
|
||||
GradientPair first;
|
||||
@ -72,6 +42,16 @@ struct Clip : public thrust::unary_function<GradientPair, Pair> {
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* In algorithm 5 (see common::CreateRoundingFactor) the bound is calculated as
|
||||
* $max(|v_i|) * n$. Here we use the bound:
|
||||
*
|
||||
* \begin{equation}
|
||||
* max( fl(\sum^{V}_{v_i>0}{v_i}), fl(\sum^{V}_{v_i<0}|v_i|) )
|
||||
* \end{equation}
|
||||
*
|
||||
* to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
|
||||
*/
|
||||
GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
|
||||
using GradientSumT = GradientPairPrecise;
|
||||
using T = typename GradientSumT::ValueT;
|
||||
@ -90,10 +70,11 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
|
||||
std::size_t total_rows = gpair.size();
|
||||
collective::Allreduce<collective::Operation::kSum>(&total_rows, 1);
|
||||
|
||||
auto histogram_rounding = GradientSumT{
|
||||
CreateRoundingFactor<T>(std::max(positive_sum.GetGrad(), negative_sum.GetGrad()), total_rows),
|
||||
CreateRoundingFactor<T>(std::max(positive_sum.GetHess(), negative_sum.GetHess()),
|
||||
total_rows)};
|
||||
auto histogram_rounding =
|
||||
GradientSumT{common::CreateRoundingFactor<T>(
|
||||
std::max(positive_sum.GetGrad(), negative_sum.GetGrad()), total_rows),
|
||||
common::CreateRoundingFactor<T>(
|
||||
std::max(positive_sum.GetHess(), negative_sum.GetHess()), total_rows)};
|
||||
|
||||
using IntT = typename GradientPairInt64::ValueT;
|
||||
|
||||
|
||||
@ -37,7 +37,8 @@ private:
|
||||
GradientPairPrecise to_fixed_point_;
|
||||
/* Convert fixed point representation back to floating point. */
|
||||
GradientPairPrecise to_floating_point_;
|
||||
public:
|
||||
|
||||
public:
|
||||
explicit GradientQuantiser(common::Span<GradientPair const> gpair);
|
||||
XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
|
||||
auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user