Bound the size of the histogram cache. (#9440)

- A new histogram collection with a limit in size.
- Unify histogram building logic between hist, multi-hist, and approx.
This commit is contained in:
Jiaming Yuan
2023-08-08 03:21:26 +08:00
committed by GitHub
parent 5bd163aa25
commit 54029a59af
27 changed files with 994 additions and 565 deletions

View File

@@ -67,17 +67,6 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins
return out;
}
/*!
 * \brief Reset the entries of a histogram in the half-open range [begin, end) to zero.
 *
 * \param hist  Histogram whose entries are overwritten.
 * \param begin Index of the first entry to reset.
 * \param end   Index one past the last entry to reset.
 *
 * No bounds checking is done here; presumably callers guarantee
 * begin <= end <= hist.size().
 */
void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end) {
#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
  // Strict R mode: assign a default-constructed gradient pair to every entry.
  auto first = hist.begin() + begin;
  auto last = hist.begin() + end;
  std::fill(first, last, xgboost::GradientPairPrecise());
#else  // defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
  // Otherwise zero the underlying bytes of the selected entries directly.
  auto* dst = hist.data() + begin;
  auto const n_bytes = (end - begin) * sizeof(xgboost::GradientPairPrecise);
  memset(dst, '\0', n_bytes);
#endif  // defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
}
/*!
* \brief Increment hist as dst += add in range [begin, end)
*/

View File

@@ -364,11 +364,6 @@ bst_bin_t XGBOOST_HOST_DEV_INLINE BinarySearchBin(std::size_t begin, std::size_t
using GHistRow = Span<xgboost::GradientPairPrecise>;
using ConstGHistRow = Span<xgboost::GradientPairPrecise const>;
/*!
 * \brief Fill a histogram with zeros in the range [begin, end).
 *
 * \param hist  Histogram to modify.
 * \param begin Index of the first entry to zero.
 * \param end   Index one past the last entry to zero.
 */
void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end);
/*!
* \brief Increment hist as dst += add in range [begin, end)
*/
@@ -395,12 +390,7 @@ class HistCollection {
constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
const size_t id = row_ptr_.at(nid);
CHECK_NE(id, kMax);
GradientPairPrecise* ptr = nullptr;
if (contiguous_allocation_) {
ptr = const_cast<GradientPairPrecise*>(data_[0].data() + nbins_*id);
} else {
ptr = const_cast<GradientPairPrecise*>(data_[id].data());
}
GradientPairPrecise* ptr = const_cast<GradientPairPrecise*>(data_[id].data());
return {ptr, nbins_};
}
@@ -445,24 +435,12 @@ class HistCollection {
data_[row_ptr_[nid]].resize(nbins_, {0, 0});
}
}
// Resize the first buffer so that it can hold the bins of every node back to back, and
// mark the collection as contiguously allocated. One common buffer is needed so that a
// single Allreduce call can cover all nodes.
// NOTE(review): indexes data_[0] unconditionally -- assumes data_ is non-empty; confirm
// against callers.
void AllocateAllData() {
  contiguous_allocation_ = true;
  auto const n_total_bins = nbins_ * data_.size();
  auto& buffer = data_[0];
  if (buffer.size() == n_total_bins) {
    return;  // already has the required size
  }
  buffer.resize(n_total_bins);
}
// Report the contiguous-allocation flag, which AllocateAllData() sets to true.
[[nodiscard]] bool IsContiguous() const {
  return contiguous_allocation_;
}
private:
/*! \brief number of all bins over all features */
uint32_t nbins_ = 0;
/*! \brief amount of active nodes in hist collection */
uint32_t n_nodes_added_ = 0;
/*! \brief flag to identify contiguous memory allocation */
bool contiguous_allocation_ = false;
std::vector<std::vector<GradientPairPrecise>> data_;
/*! \brief row_ptr_[nid] locates bin for histogram of node nid */
@@ -518,7 +496,7 @@ class ParallelGHistBuilder {
GHistRow hist = idx == -1 ? targeted_hists_[nid] : hist_buffer_[idx];
if (!hist_was_used_[tid * nodes_ + nid]) {
InitilizeHistByZeroes(hist, 0, hist.size());
std::fill_n(hist.data(), hist.size(), GradientPairPrecise{});
hist_was_used_[tid * nodes_ + nid] = static_cast<int>(true);
}
@@ -548,7 +526,7 @@ class ParallelGHistBuilder {
if (!is_updated) {
// In distributed mode, some tree nodes can be empty on the local machine,
// so we just need to set the local hist to zeros in this case.
InitilizeHistByZeroes(dst, begin, end);
std::fill(dst.data() + begin, dst.data() + end, GradientPairPrecise{});
}
}

View File

@@ -7,13 +7,14 @@
#include <dmlc/common.h>
#include <dmlc/omp.h>
#include <algorithm>
#include <cstdint> // for int32_t
#include <cstdlib> // for malloc, free
#include <limits>
#include <algorithm> // for min
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <cstdlib> // for malloc, free
#include <functional> // for function
#include <new> // for bad_alloc
#include <type_traits> // for is_signed
#include <vector>
#include <type_traits> // for is_signed, conditional_t
#include <vector> // for vector
#include "xgboost/logging.h"
@@ -25,6 +26,8 @@ inline int32_t omp_get_thread_limit() __GOMP_NOTHROW { return 1; } // NOLINT
// MSVC doesn't implement the thread limit.
#if defined(_OPENMP) && defined(_MSC_VER)
#include <limits>
extern "C" {
// Replacement for the OpenMP routine that MSVC does not provide: report the largest
// representable int32_t value as the thread limit.
inline int32_t omp_get_thread_limit() {  // NOLINT
  return std::numeric_limits<int32_t>::max();
}
}
@@ -84,8 +87,8 @@ class BlockedSpace2d {
// dim1 - size of the first dimension in the space
// getter_size_dim2 - functor to get the second dimensions for each 'row' by row-index
// grain_size - max size of produced blocks
template <typename Func>
BlockedSpace2d(std::size_t dim1, Func getter_size_dim2, std::size_t grain_size) {
BlockedSpace2d(std::size_t dim1, std::function<std::size_t(std::size_t)> getter_size_dim2,
std::size_t grain_size) {
for (std::size_t i = 0; i < dim1; ++i) {
std::size_t size = getter_size_dim2(i);
// Each row (second dim) is divided into n_blocks
@@ -104,13 +107,13 @@ class BlockedSpace2d {
}
// get index of the first dimension of i-th block(task)
[[nodiscard]] std::size_t GetFirstDimension(size_t i) const {
[[nodiscard]] std::size_t GetFirstDimension(std::size_t i) const {
CHECK_LT(i, first_dimension_.size());
return first_dimension_[i];
}
// get a range of indexes for the second dimension of i-th block(task)
[[nodiscard]] Range1d GetRange(size_t i) const {
[[nodiscard]] Range1d GetRange(std::size_t i) const {
CHECK_LT(i, ranges_.size());
return ranges_[i];
}
@@ -129,22 +132,22 @@ class BlockedSpace2d {
}
std::vector<Range1d> ranges_;
std::vector<size_t> first_dimension_;
std::vector<std::size_t> first_dimension_;
};
// Wrapper to implement nested parallelism with simple omp parallel for
template <typename Func>
void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) {
inline void ParallelFor2d(BlockedSpace2d const& space, std::int32_t n_threads,
std::function<void(std::size_t, Range1d)> func) {
std::size_t n_blocks_in_space = space.Size();
CHECK_GE(nthreads, 1);
CHECK_GE(n_threads, 1);
dmlc::OMPException exc;
#pragma omp parallel num_threads(nthreads)
#pragma omp parallel num_threads(n_threads)
{
exc.Run([&]() {
size_t tid = omp_get_thread_num();
size_t chunck_size = n_blocks_in_space / nthreads + !!(n_blocks_in_space % nthreads);
std::size_t tid = omp_get_thread_num();
std::size_t chunck_size = n_blocks_in_space / n_threads + !!(n_blocks_in_space % n_threads);
std::size_t begin = chunck_size * tid;
std::size_t end = std::min(begin + chunck_size, n_blocks_in_space);