Bound the size of the histogram cache. (#9440)
- Add a new histogram collection with a bounded size.
- Unify the histogram building logic between hist, multi-hist, and approx.
This commit is contained in:
@@ -67,17 +67,6 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins
|
||||
return out;
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Fill a histogram with zeros in the range [begin, end)
|
||||
*/
|
||||
void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end) {
|
||||
#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
|
||||
std::fill(hist.begin() + begin, hist.begin() + end, xgboost::GradientPairPrecise());
|
||||
#else // defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
|
||||
memset(hist.data() + begin, '\0', (end - begin) * sizeof(xgboost::GradientPairPrecise));
|
||||
#endif // defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Increment hist as dst += add in range [begin, end)
|
||||
*/
|
||||
|
||||
@@ -364,11 +364,6 @@ bst_bin_t XGBOOST_HOST_DEV_INLINE BinarySearchBin(std::size_t begin, std::size_t
|
||||
using GHistRow = Span<xgboost::GradientPairPrecise>;
|
||||
using ConstGHistRow = Span<xgboost::GradientPairPrecise const>;
|
||||
|
||||
/*!
|
||||
* \brief Fill a histogram with zeros in the range [begin, end)
|
||||
*/
|
||||
void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end);
|
||||
|
||||
/*!
|
||||
* \brief Increment hist as dst += add in range [begin, end)
|
||||
*/
|
||||
@@ -395,12 +390,7 @@ class HistCollection {
|
||||
constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
|
||||
const size_t id = row_ptr_.at(nid);
|
||||
CHECK_NE(id, kMax);
|
||||
GradientPairPrecise* ptr = nullptr;
|
||||
if (contiguous_allocation_) {
|
||||
ptr = const_cast<GradientPairPrecise*>(data_[0].data() + nbins_*id);
|
||||
} else {
|
||||
ptr = const_cast<GradientPairPrecise*>(data_[id].data());
|
||||
}
|
||||
GradientPairPrecise* ptr = const_cast<GradientPairPrecise*>(data_[id].data());
|
||||
return {ptr, nbins_};
|
||||
}
|
||||
|
||||
@@ -445,24 +435,12 @@ class HistCollection {
|
||||
data_[row_ptr_[nid]].resize(nbins_, {0, 0});
|
||||
}
|
||||
}
|
||||
// Allocate a common contiguous buffer for all nodes; needed for a single Allreduce call.
|
||||
void AllocateAllData() {
|
||||
const size_t new_size = nbins_*data_.size();
|
||||
contiguous_allocation_ = true;
|
||||
if (data_[0].size() != new_size) {
|
||||
data_[0].resize(new_size);
|
||||
}
|
||||
}
|
||||
[[nodiscard]] bool IsContiguous() const { return contiguous_allocation_; }
|
||||
|
||||
private:
|
||||
/*! \brief number of all bins over all features */
|
||||
uint32_t nbins_ = 0;
|
||||
/*! \brief amount of active nodes in hist collection */
|
||||
uint32_t n_nodes_added_ = 0;
|
||||
/*! \brief flag to identify contiguous memory allocation */
|
||||
bool contiguous_allocation_ = false;
|
||||
|
||||
std::vector<std::vector<GradientPairPrecise>> data_;
|
||||
|
||||
/*! \brief row_ptr_[nid] locates bin for histogram of node nid */
|
||||
@@ -518,7 +496,7 @@ class ParallelGHistBuilder {
|
||||
GHistRow hist = idx == -1 ? targeted_hists_[nid] : hist_buffer_[idx];
|
||||
|
||||
if (!hist_was_used_[tid * nodes_ + nid]) {
|
||||
InitilizeHistByZeroes(hist, 0, hist.size());
|
||||
std::fill_n(hist.data(), hist.size(), GradientPairPrecise{});
|
||||
hist_was_used_[tid * nodes_ + nid] = static_cast<int>(true);
|
||||
}
|
||||
|
||||
@@ -548,7 +526,7 @@ class ParallelGHistBuilder {
|
||||
if (!is_updated) {
|
||||
// In distributed mode, some tree nodes can be empty on the local machine,
|
||||
// so we just need to set the local hist to zeros in this case.
|
||||
InitilizeHistByZeroes(dst, begin, end);
|
||||
std::fill(dst.data() + begin, dst.data() + end, GradientPairPrecise{});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,13 +7,14 @@
|
||||
#include <dmlc/common.h>
|
||||
#include <dmlc/omp.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint> // for int32_t
|
||||
#include <cstdlib> // for malloc, free
|
||||
#include <limits>
|
||||
#include <algorithm> // for min
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for int32_t
|
||||
#include <cstdlib> // for malloc, free
|
||||
#include <functional> // for function
|
||||
#include <new> // for bad_alloc
|
||||
#include <type_traits> // for is_signed
|
||||
#include <vector>
|
||||
#include <type_traits> // for is_signed, conditional_t
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "xgboost/logging.h"
|
||||
|
||||
@@ -25,6 +26,8 @@ inline int32_t omp_get_thread_limit() __GOMP_NOTHROW { return 1; } // NOLINT
|
||||
|
||||
// MSVC doesn't implement the thread limit.
|
||||
#if defined(_OPENMP) && defined(_MSC_VER)
|
||||
#include <limits>
|
||||
|
||||
extern "C" {
|
||||
inline int32_t omp_get_thread_limit() { return std::numeric_limits<int32_t>::max(); } // NOLINT
|
||||
}
|
||||
@@ -84,8 +87,8 @@ class BlockedSpace2d {
|
||||
// dim1 - size of the first dimension in the space
|
||||
// getter_size_dim2 - functor to get the second dimensions for each 'row' by row-index
|
||||
// grain_size - max size of produced blocks
|
||||
template <typename Func>
|
||||
BlockedSpace2d(std::size_t dim1, Func getter_size_dim2, std::size_t grain_size) {
|
||||
BlockedSpace2d(std::size_t dim1, std::function<std::size_t(std::size_t)> getter_size_dim2,
|
||||
std::size_t grain_size) {
|
||||
for (std::size_t i = 0; i < dim1; ++i) {
|
||||
std::size_t size = getter_size_dim2(i);
|
||||
// Each row (second dim) is divided into n_blocks
|
||||
@@ -104,13 +107,13 @@ class BlockedSpace2d {
|
||||
}
|
||||
|
||||
// get index of the first dimension of i-th block(task)
|
||||
[[nodiscard]] std::size_t GetFirstDimension(size_t i) const {
|
||||
[[nodiscard]] std::size_t GetFirstDimension(std::size_t i) const {
|
||||
CHECK_LT(i, first_dimension_.size());
|
||||
return first_dimension_[i];
|
||||
}
|
||||
|
||||
// get a range of indexes for the second dimension of i-th block(task)
|
||||
[[nodiscard]] Range1d GetRange(size_t i) const {
|
||||
[[nodiscard]] Range1d GetRange(std::size_t i) const {
|
||||
CHECK_LT(i, ranges_.size());
|
||||
return ranges_[i];
|
||||
}
|
||||
@@ -129,22 +132,22 @@ class BlockedSpace2d {
|
||||
}
|
||||
|
||||
std::vector<Range1d> ranges_;
|
||||
std::vector<size_t> first_dimension_;
|
||||
std::vector<std::size_t> first_dimension_;
|
||||
};
|
||||
|
||||
|
||||
// Wrapper to implement nested parallelism with simple omp parallel for
|
||||
template <typename Func>
|
||||
void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) {
|
||||
inline void ParallelFor2d(BlockedSpace2d const& space, std::int32_t n_threads,
|
||||
std::function<void(std::size_t, Range1d)> func) {
|
||||
std::size_t n_blocks_in_space = space.Size();
|
||||
CHECK_GE(nthreads, 1);
|
||||
CHECK_GE(n_threads, 1);
|
||||
|
||||
dmlc::OMPException exc;
|
||||
#pragma omp parallel num_threads(nthreads)
|
||||
#pragma omp parallel num_threads(n_threads)
|
||||
{
|
||||
exc.Run([&]() {
|
||||
size_t tid = omp_get_thread_num();
|
||||
size_t chunck_size = n_blocks_in_space / nthreads + !!(n_blocks_in_space % nthreads);
|
||||
std::size_t tid = omp_get_thread_num();
|
||||
std::size_t chunck_size = n_blocks_in_space / n_threads + !!(n_blocks_in_space % n_threads);
|
||||
|
||||
std::size_t begin = chunck_size * tid;
|
||||
std::size_t end = std::min(begin + chunck_size, n_blocks_in_space);
|
||||
|
||||
Reference in New Issue
Block a user