Bound the size of the histogram cache. (#9440)

- Add a new histogram collection with a limit on its size.
- Unify histogram building logic between hist, multi-hist, and approx.
2023-08-08 03:21:26 +08:00
parent 5bd163aa25
commit 54029a59af
27 changed files with 994 additions and 565 deletions
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -67,17 +67,6 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins
  return out;
 }

-/*!
- * \brief fill a histogram by zeros in range [begin, end)
- */
-void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end) {
-#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
-  std::fill(hist.begin() + begin, hist.begin() + end, xgboost::GradientPairPrecise());
-#else  // defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
-  memset(hist.data() + begin, '\0', (end - begin) * sizeof(xgboost::GradientPairPrecise));
-#endif  // defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
-}
-
 /*!
 * \brief Increment hist as dst += add in range [begin, end)
 */
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -364,11 +364,6 @@ bst_bin_t XGBOOST_HOST_DEV_INLINE BinarySearchBin(std::size_t begin, std::size_t
 using GHistRow = Span<xgboost::GradientPairPrecise>;
 using ConstGHistRow = Span<xgboost::GradientPairPrecise const>;

-/*!
- * \brief fill a histogram by zeros
- */
-void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end);
-
 /*!
 * \brief Increment hist as dst += add in range [begin, end)
 */
@@ -395,12 +390,7 @@ class HistCollection {
    constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
    const size_t id = row_ptr_.at(nid);
    CHECK_NE(id, kMax);
-    GradientPairPrecise* ptr = nullptr;
-    if (contiguous_allocation_) {
-      ptr = const_cast<GradientPairPrecise*>(data_[0].data() + nbins_*id);
-    } else {
-      ptr = const_cast<GradientPairPrecise*>(data_[id].data());
-    }
+    GradientPairPrecise* ptr = const_cast<GradientPairPrecise*>(data_[id].data());
    return {ptr, nbins_};
  }

@@ -445,24 +435,12 @@ class HistCollection {
      data_[row_ptr_[nid]].resize(nbins_, {0, 0});
    }
  }
-  // allocate common buffer contiguously for all nodes, need for single Allreduce call
-  void AllocateAllData() {
-    const size_t new_size = nbins_*data_.size();
-    contiguous_allocation_ = true;
-    if (data_[0].size() != new_size) {
-      data_[0].resize(new_size);
-    }
-  }
-  [[nodiscard]] bool IsContiguous() const { return contiguous_allocation_; }

 private:
  /*! \brief number of all bins over all features */
  uint32_t nbins_ = 0;
  /*! \brief amount of active nodes in hist collection */
  uint32_t n_nodes_added_ = 0;
-  /*! \brief flag to identify contiguous memory allocation */
-  bool contiguous_allocation_ = false;
-
  std::vector<std::vector<GradientPairPrecise>> data_;

  /*! \brief row_ptr_[nid] locates bin for histogram of node nid */
@@ -518,7 +496,7 @@ class ParallelGHistBuilder {
    GHistRow hist = idx == -1 ? targeted_hists_[nid] : hist_buffer_[idx];

    if (!hist_was_used_[tid * nodes_ + nid]) {
-      InitilizeHistByZeroes(hist, 0, hist.size());
+      std::fill_n(hist.data(), hist.size(), GradientPairPrecise{});
      hist_was_used_[tid * nodes_ + nid] = static_cast<int>(true);
    }

@@ -548,7 +526,7 @@ class ParallelGHistBuilder {
    if (!is_updated) {
      // In distributed mode - some tree nodes can be empty on local machines,
      // So we need just set local hist by zeros in this case
-      InitilizeHistByZeroes(dst, begin, end);
+      std::fill(dst.data() + begin, dst.data() + end, GradientPairPrecise{});
    }
  }

--- a/src/common/threading_utils.h
+++ b/src/common/threading_utils.h
@@ -7,13 +7,14 @@
 #include <dmlc/common.h>
 #include <dmlc/omp.h>

-#include <algorithm>
-#include <cstdint>  // for int32_t
-#include <cstdlib>  // for malloc, free
-#include <limits>
+#include <algorithm>    // for min
+#include <cstddef>      // for size_t
+#include <cstdint>      // for int32_t
+#include <cstdlib>      // for malloc, free
+#include <functional>   // for function
 #include <new>          // for bad_alloc
-#include <type_traits>  // for is_signed
-#include <vector>
+#include <type_traits>  // for is_signed, conditional_t
+#include <vector>       // for vector

 #include "xgboost/logging.h"

@@ -25,6 +26,8 @@ inline int32_t omp_get_thread_limit() __GOMP_NOTHROW { return 1; }  // NOLINT

 // MSVC doesn't implement the thread limit.
 #if defined(_OPENMP) && defined(_MSC_VER)
+#include <limits>
+
 extern "C" {
 inline int32_t omp_get_thread_limit() { return std::numeric_limits<int32_t>::max(); }  // NOLINT
 }
@@ -84,8 +87,8 @@ class BlockedSpace2d {
  // dim1 - size of the first dimension in the space
  // getter_size_dim2 - functor to get the second dimensions for each 'row' by row-index
  // grain_size - max size of produced blocks
-  template <typename Func>
-  BlockedSpace2d(std::size_t dim1, Func getter_size_dim2, std::size_t grain_size) {
+  BlockedSpace2d(std::size_t dim1, std::function<std::size_t(std::size_t)> getter_size_dim2,
+                 std::size_t grain_size) {
    for (std::size_t i = 0; i < dim1; ++i) {
      std::size_t size = getter_size_dim2(i);
      // Each row (second dim) is divided into n_blocks
@@ -104,13 +107,13 @@ class BlockedSpace2d {
  }

  // get index of the first dimension of i-th block(task)
-  [[nodiscard]] std::size_t GetFirstDimension(size_t i) const {
+  [[nodiscard]] std::size_t GetFirstDimension(std::size_t i) const {
    CHECK_LT(i, first_dimension_.size());
    return first_dimension_[i];
  }

  // get a range of indexes for the second dimension of i-th block(task)
-  [[nodiscard]] Range1d GetRange(size_t i) const {
+  [[nodiscard]] Range1d GetRange(std::size_t i) const {
    CHECK_LT(i, ranges_.size());
    return ranges_[i];
  }
@@ -129,22 +132,22 @@ class BlockedSpace2d {
  }

  std::vector<Range1d> ranges_;
-  std::vector<size_t> first_dimension_;
+  std::vector<std::size_t> first_dimension_;
 };


 // Wrapper to implement nested parallelism with simple omp parallel for
-template <typename Func>
-void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) {
+inline void ParallelFor2d(BlockedSpace2d const& space, std::int32_t n_threads,
+                          std::function<void(std::size_t, Range1d)> func) {
  std::size_t n_blocks_in_space = space.Size();
-  CHECK_GE(nthreads, 1);
+  CHECK_GE(n_threads, 1);

  dmlc::OMPException exc;
-#pragma omp parallel num_threads(nthreads)
+#pragma omp parallel num_threads(n_threads)
  {
    exc.Run([&]() {
-      size_t tid = omp_get_thread_num();
-      size_t chunck_size = n_blocks_in_space / nthreads + !!(n_blocks_in_space % nthreads);
+      std::size_t tid = omp_get_thread_num();
+      std::size_t chunck_size = n_blocks_in_space / n_threads + !!(n_blocks_in_space % n_threads);

      std::size_t begin = chunck_size * tid;
      std::size_t end = std::min(begin + chunck_size, n_blocks_in_space);