Bound the size of the histogram cache. (#9440)

- A new histogram collection with a limit in size.
- Unify histogram building logic between hist, multi-hist, and approx.
This commit is contained in:
Jiaming Yuan
2023-08-08 03:21:26 +08:00
committed by GitHub
parent 5bd163aa25
commit 54029a59af
27 changed files with 994 additions and 565 deletions

View File

@@ -67,17 +67,6 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins
return out;
}
/*!
 * \brief Zero out the histogram entries in the half-open range [begin, end).
 *
 * \param hist  Histogram whose entries are reset.
 * \param begin Index of the first entry to reset.
 * \param end   One past the index of the last entry to reset.
 */
void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end) {
#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
  // Strict R mode: assign a default-constructed gradient pair to every entry.
  auto first = hist.begin() + begin;
  auto last = hist.begin() + end;
  std::fill(first, last, xgboost::GradientPairPrecise());
#else  // defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
  // Otherwise clear the raw bytes of the range in one call.
  auto const n_bytes = (end - begin) * sizeof(xgboost::GradientPairPrecise);
  memset(hist.data() + begin, '\0', n_bytes);
#endif  // defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
}
/*!
* \brief Increment hist as dst += add in range [begin, end)
*/

View File

@@ -364,11 +364,6 @@ bst_bin_t XGBOOST_HOST_DEV_INLINE BinarySearchBin(std::size_t begin, std::size_t
using GHistRow = Span<xgboost::GradientPairPrecise>;
using ConstGHistRow = Span<xgboost::GradientPairPrecise const>;
/*!
* \brief fill a histogram by zeros
*/
void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end);
/*!
* \brief Increment hist as dst += add in range [begin, end)
*/
@@ -395,12 +390,7 @@ class HistCollection {
constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
const size_t id = row_ptr_.at(nid);
CHECK_NE(id, kMax);
GradientPairPrecise* ptr = nullptr;
if (contiguous_allocation_) {
ptr = const_cast<GradientPairPrecise*>(data_[0].data() + nbins_*id);
} else {
ptr = const_cast<GradientPairPrecise*>(data_[id].data());
}
GradientPairPrecise* ptr = const_cast<GradientPairPrecise*>(data_[id].data());
return {ptr, nbins_};
}
@@ -445,24 +435,12 @@ class HistCollection {
data_[row_ptr_[nid]].resize(nbins_, {0, 0});
}
}
// allocate common buffer contiguously for all nodes, need for single Allreduce call
void AllocateAllData() {
const size_t new_size = nbins_*data_.size();
contiguous_allocation_ = true;
if (data_[0].size() != new_size) {
data_[0].resize(new_size);
}
}
[[nodiscard]] bool IsContiguous() const { return contiguous_allocation_; }
private:
/*! \brief number of all bins over all features */
uint32_t nbins_ = 0;
/*! \brief amount of active nodes in hist collection */
uint32_t n_nodes_added_ = 0;
/*! \brief flag to identify contiguous memory allocation */
bool contiguous_allocation_ = false;
std::vector<std::vector<GradientPairPrecise>> data_;
/*! \brief row_ptr_[nid] locates bin for histogram of node nid */
@@ -518,7 +496,7 @@ class ParallelGHistBuilder {
GHistRow hist = idx == -1 ? targeted_hists_[nid] : hist_buffer_[idx];
if (!hist_was_used_[tid * nodes_ + nid]) {
InitilizeHistByZeroes(hist, 0, hist.size());
std::fill_n(hist.data(), hist.size(), GradientPairPrecise{});
hist_was_used_[tid * nodes_ + nid] = static_cast<int>(true);
}
@@ -548,7 +526,7 @@ class ParallelGHistBuilder {
if (!is_updated) {
// In distributed mode - some tree nodes can be empty on local machines,
// So we need just set local hist by zeros in this case
InitilizeHistByZeroes(dst, begin, end);
std::fill(dst.data() + begin, dst.data() + end, GradientPairPrecise{});
}
}

View File

@@ -7,13 +7,14 @@
#include <dmlc/common.h>
#include <dmlc/omp.h>
#include <algorithm>
#include <cstdint> // for int32_t
#include <cstdlib> // for malloc, free
#include <limits>
#include <algorithm> // for min
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <cstdlib> // for malloc, free
#include <functional> // for function
#include <new> // for bad_alloc
#include <type_traits> // for is_signed
#include <vector>
#include <type_traits> // for is_signed, conditional_t
#include <vector> // for vector
#include "xgboost/logging.h"
@@ -25,6 +26,8 @@ inline int32_t omp_get_thread_limit() __GOMP_NOTHROW { return 1; } // NOLINT
// MSVC doesn't implement the thread limit.
#if defined(_OPENMP) && defined(_MSC_VER)
#include <limits>
extern "C" {
inline int32_t omp_get_thread_limit() { return std::numeric_limits<int32_t>::max(); } // NOLINT
}
@@ -84,8 +87,8 @@ class BlockedSpace2d {
// dim1 - size of the first dimension in the space
// getter_size_dim2 - functor to get the second dimensions for each 'row' by row-index
// grain_size - max size of produced blocks
template <typename Func>
BlockedSpace2d(std::size_t dim1, Func getter_size_dim2, std::size_t grain_size) {
BlockedSpace2d(std::size_t dim1, std::function<std::size_t(std::size_t)> getter_size_dim2,
std::size_t grain_size) {
for (std::size_t i = 0; i < dim1; ++i) {
std::size_t size = getter_size_dim2(i);
// Each row (second dim) is divided into n_blocks
@@ -104,13 +107,13 @@ class BlockedSpace2d {
}
// get index of the first dimension of i-th block(task)
[[nodiscard]] std::size_t GetFirstDimension(size_t i) const {
[[nodiscard]] std::size_t GetFirstDimension(std::size_t i) const {
CHECK_LT(i, first_dimension_.size());
return first_dimension_[i];
}
// get a range of indexes for the second dimension of i-th block(task)
[[nodiscard]] Range1d GetRange(size_t i) const {
[[nodiscard]] Range1d GetRange(std::size_t i) const {
CHECK_LT(i, ranges_.size());
return ranges_[i];
}
@@ -129,22 +132,22 @@ class BlockedSpace2d {
}
std::vector<Range1d> ranges_;
std::vector<size_t> first_dimension_;
std::vector<std::size_t> first_dimension_;
};
// Wrapper to implement nested parallelism with simple omp parallel for
template <typename Func>
void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) {
inline void ParallelFor2d(BlockedSpace2d const& space, std::int32_t n_threads,
std::function<void(std::size_t, Range1d)> func) {
std::size_t n_blocks_in_space = space.Size();
CHECK_GE(nthreads, 1);
CHECK_GE(n_threads, 1);
dmlc::OMPException exc;
#pragma omp parallel num_threads(nthreads)
#pragma omp parallel num_threads(n_threads)
{
exc.Run([&]() {
size_t tid = omp_get_thread_num();
size_t chunck_size = n_blocks_in_space / nthreads + !!(n_blocks_in_space % nthreads);
std::size_t tid = omp_get_thread_num();
std::size_t chunck_size = n_blocks_in_space / n_threads + !!(n_blocks_in_space % n_threads);
std::size_t begin = chunck_size * tid;
std::size_t end = std::min(begin + chunck_size, n_blocks_in_space);

View File

@@ -477,7 +477,6 @@ class CSCArrayAdapterBatch : public detail::NoMetaInfo {
ArrayInterface<1> indptr_;
ArrayInterface<1> indices_;
ArrayInterface<1> values_;
bst_row_t n_rows_;
class Line {
std::size_t column_idx_;
@@ -503,11 +502,8 @@ class CSCArrayAdapterBatch : public detail::NoMetaInfo {
static constexpr bool kIsRowMajor = false;
CSCArrayAdapterBatch(ArrayInterface<1> indptr, ArrayInterface<1> indices,
ArrayInterface<1> values, bst_row_t n_rows)
: indptr_{std::move(indptr)},
indices_{std::move(indices)},
values_{std::move(values)},
n_rows_{n_rows} {}
ArrayInterface<1> values)
: indptr_{std::move(indptr)}, indices_{std::move(indices)}, values_{std::move(values)} {}
std::size_t Size() const { return indptr_.n - 1; }
Line GetLine(std::size_t idx) const {
@@ -542,8 +538,7 @@ class CSCArrayAdapter : public detail::SingleBatchDataIter<CSCArrayAdapterBatch>
indices_{indices},
values_{values},
num_rows_{num_rows},
batch_{
CSCArrayAdapterBatch{indptr_, indices_, values_, static_cast<bst_row_t>(num_rows_)}} {}
batch_{CSCArrayAdapterBatch{indptr_, indices_, values_}} {}
// JVM package sends 0 as unknown
size_t NumRows() const { return num_rows_ == 0 ? kAdapterUnknownSize : num_rows_; }

View File

@@ -4,13 +4,13 @@
#ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
#define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
#include <algorithm> // for copy
#include <cstddef> // for size_t
#include <limits> // for numeric_limits
#include <memory> // for shared_ptr
#include <numeric> // for accumulate
#include <utility> // for move
#include <vector> // for vector
#include <algorithm> // for copy
#include <cstddef> // for size_t
#include <limits> // for numeric_limits
#include <memory> // for shared_ptr
#include <numeric> // for accumulate
#include <utility> // for move
#include <vector> // for vector
#include "../../common/categorical.h" // for CatBitField
#include "../../common/hist_util.h" // for GHistRow, HistogramCuts
@@ -20,6 +20,7 @@
#include "../param.h" // for TrainParam
#include "../split_evaluator.h" // for TreeEvaluator
#include "expand_entry.h" // for MultiExpandEntry
#include "hist_cache.h" // for BoundedHistCollection
#include "xgboost/base.h" // for bst_node_t, bst_target_t, bst_feature_t
#include "xgboost/context.h" // for COntext
#include "xgboost/linalg.h" // for Constants, Vector
@@ -317,7 +318,7 @@ class HistEvaluator {
}
public:
void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut,
void EvaluateSplits(const BoundedHistCollection &hist, common::HistogramCuts const &cut,
common::Span<FeatureType const> feature_types, const RegTree &tree,
std::vector<CPUExpandEntry> *p_entries) {
auto n_threads = ctx_->Threads();
@@ -623,7 +624,7 @@ class HistMultiEvaluator {
}
public:
void EvaluateSplits(RegTree const &tree, common::Span<const common::HistCollection *> hist,
void EvaluateSplits(RegTree const &tree, common::Span<const BoundedHistCollection *> hist,
common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
auto &entries = *p_entries;
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(entries.size());

View File

@@ -18,8 +18,8 @@ namespace xgboost::tree {
*/
template <typename Impl>
struct ExpandEntryImpl {
bst_node_t nid;
bst_node_t depth;
bst_node_t nid{0};
bst_node_t depth{0};
[[nodiscard]] float GetLossChange() const {
return static_cast<Impl const*>(this)->split.loss_chg;

109
src/tree/hist/hist_cache.h Normal file
View File

@@ -0,0 +1,109 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
#ifndef XGBOOST_TREE_HIST_HIST_CACHE_H_
#define XGBOOST_TREE_HIST_HIST_CACHE_H_
#include <cstddef> // for size_t
#include <map> // for map
#include <vector> // for vector
#include "../../common/hist_util.h" // for GHistRow, ConstGHistRow
#include "xgboost/base.h" // for bst_node_t, bst_bin_t
#include "xgboost/logging.h" // for CHECK_GT
#include "xgboost/span.h" // for Span
namespace xgboost::tree {
/**
* @brief A persistent cache for CPU histogram.
*
* The size of the cache is first bounded by the `Driver` class then by this cache
* implementaiton. The former limits the number of nodes that can be built for each node
* batch, while this cache limits the number of all nodes up to the size of
* max(|node_batch|, n_cached_node).
*
* The caller is responsible for clearing up the cache as it needs to rearrange the
* nodes before making overflowed allocations. The strcut only reports whether the size
* limit has benn reached.
*/
class BoundedHistCollection {
// maps node index to offset in `data_`.
std::map<bst_node_t, std::size_t> node_map_;
// currently allocated bins, used for tracking consistentcy.
std::size_t current_size_{0};
// stores the histograms in a contiguous buffer
std::vector<GradientPairPrecise> data_;
// number of histogram bins across all features
bst_bin_t n_total_bins_{0};
// limits the number of nodes that can be in the cache for each tree
std::size_t n_cached_nodes_{0};
// whether the tree has grown beyond the cache limit
bool has_exceeded_{false};
public:
common::GHistRow operator[](std::size_t idx) {
auto offset = node_map_.at(idx);
return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_);
}
common::ConstGHistRow operator[](std::size_t idx) const {
auto offset = node_map_.at(idx);
return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_);
}
void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) {
n_total_bins_ = n_total_bins;
n_cached_nodes_ = n_cached_nodes;
this->Clear(false);
}
/**
* @brief Clear the cache, mark whether the cache is exceeded the limit.
*/
void Clear(bool exceeded) {
node_map_.clear();
current_size_ = 0;
has_exceeded_ = exceeded;
}
[[nodiscard]] bool CanHost(common::Span<bst_node_t const> nodes_to_build,
common::Span<bst_node_t const> nodes_to_sub) const {
auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
return n_new_nodes + node_map_.size() <= n_cached_nodes_;
}
/**
* @brief Allocate histogram buffers for all nodes.
*
* The resulting histogram buffer is contiguous for all nodes in the order of
* allocation.
*/
void AllocateHistograms(common::Span<bst_node_t const> nodes_to_build,
common::Span<bst_node_t const> nodes_to_sub) {
auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
auto alloc_size = n_new_nodes * n_total_bins_;
auto new_size = alloc_size + current_size_;
if (new_size > data_.size()) {
data_.resize(new_size);
}
for (auto nidx : nodes_to_build) {
node_map_[nidx] = current_size_;
current_size_ += n_total_bins_;
}
for (auto nidx : nodes_to_sub) {
node_map_[nidx] = current_size_;
current_size_ += n_total_bins_;
}
CHECK_EQ(current_size_, new_size);
}
void AllocateHistograms(std::vector<bst_node_t> const& nodes) {
this->AllocateHistograms(common::Span<bst_node_t const>{nodes},
common::Span<bst_node_t const>{});
}
[[nodiscard]] bool HasExceeded() const { return has_exceeded_; }
[[nodiscard]] bool HistogramExists(bst_node_t nidx) const {
return node_map_.find(nidx) != node_map_.cend();
}
[[nodiscard]] std::size_t Size() const { return current_size_; }
};
} // namespace xgboost::tree
#endif // XGBOOST_TREE_HIST_HIST_CACHE_H_

View File

@@ -0,0 +1,63 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
#include "histogram.h"
#include <cstddef> // for size_t
#include <numeric> // for accumulate
#include <utility> // for swap
#include <vector> // for vector
#include "../../common/transform_iterator.h" // for MakeIndexTransformIter
#include "expand_entry.h" // for MultiExpandEntry, CPUExpandEntry
#include "xgboost/logging.h" // for CHECK_NE
#include "xgboost/span.h" // for Span
#include "xgboost/tree_model.h" // for RegTree
namespace xgboost::tree {
/**
 * Split the children of every candidate into a node to build and a node to subtract
 * (multi-target version).
 *
 * For each candidate the hessians of all targets are summed per child. The right child
 * becomes the build node only when its sum is strictly smaller than the left one;
 * otherwise the left child is built. Results are written at the candidate's position
 * in `nodes_to_build` and `nodes_to_sub`.
 */
void AssignNodes(RegTree const *p_tree, std::vector<MultiExpandEntry> const &valid_candidates,
                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub) {
  CHECK_EQ(nodes_to_build.size(), valid_candidates.size());

  // Sum of the hessians over all targets, accumulated in double precision.
  auto total_hess = [](auto const &grad_sums) {
    auto it = common::MakeIndexTransformIter([&](auto i) { return grad_sums[i].GetHess(); });
    return std::accumulate(it, it + grad_sums.size(), .0);
  };

  for (std::size_t k = 0; k < valid_candidates.size(); ++k) {
    auto const &candidate = valid_candidates[k];
    auto left_nidx = p_tree->LeftChild(candidate.nid);
    auto right_nidx = p_tree->RightChild(candidate.nid);

    auto left_sum = total_hess(candidate.split.left_sum);
    auto right_sum = total_hess(candidate.split.right_sum);
    bool const build_right = right_sum < left_sum;

    nodes_to_build[k] = build_right ? right_nidx : left_nidx;
    nodes_to_sub[k] = build_right ? left_nidx : right_nidx;
  }
}
/**
 * Split the children of every candidate into a node to build and a node to subtract
 * (single-target version).
 *
 * The right child becomes the build node only when its hessian sum is strictly smaller
 * than the left one; otherwise the left child is built. Results are written at the
 * candidate's position in `nodes_to_build` and `nodes_to_sub`.
 */
void AssignNodes(RegTree const *p_tree, std::vector<CPUExpandEntry> const &candidates,
                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub) {
  for (std::size_t k = 0; k < candidates.size(); ++k) {
    auto const &candidate = candidates[k];
    auto left_nidx = (*p_tree)[candidate.nid].LeftChild();
    auto right_nidx = (*p_tree)[candidate.nid].RightChild();

    bool const build_right =
        candidate.split.right_sum.GetHess() < candidate.split.left_sum.GetHess();

    nodes_to_build[k] = build_right ? right_nidx : left_nidx;
    nodes_to_sub[k] = build_right ? left_nidx : right_nidx;
  }
}
} // namespace xgboost::tree

View File

@@ -4,80 +4,85 @@
#ifndef XGBOOST_TREE_HIST_HISTOGRAM_H_
#define XGBOOST_TREE_HIST_HISTOGRAM_H_
#include <algorithm>
#include <limits>
#include <vector>
#include <algorithm> // for max
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <functional> // for function
#include <utility> // for move
#include <vector> // for vector
#include "../../collective/communicator-inl.h"
#include "../../common/hist_util.h"
#include "../../data/gradient_index.h"
#include "expand_entry.h"
#include "xgboost/tree_model.h" // for RegTree
#include "../../collective/communicator-inl.h" // for Allreduce
#include "../../collective/communicator.h" // for Operation
#include "../../common/hist_util.h" // for GHistRow, ParallelGHi...
#include "../../common/row_set.h" // for RowSetCollection
#include "../../common/threading_utils.h" // for ParallelFor2d, Range1d, BlockedSpace2d
#include "../../data/gradient_index.h" // for GHistIndexMatrix
#include "expand_entry.h" // for MultiExpandEntry, CPUExpandEntry
#include "hist_cache.h" // for BoundedHistCollection
#include "param.h" // for HistMakerTrainParam
#include "xgboost/base.h" // for bst_node_t, bst_target_t, bst_bin_t
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for BatchIterator, BatchSet
#include "xgboost/linalg.h" // for MatrixView, All, Vect...
#include "xgboost/logging.h" // for CHECK_GE
#include "xgboost/span.h" // for Span
#include "xgboost/tree_model.h" // for RegTree
namespace xgboost::tree {
template <typename ExpandEntry>
/**
* @brief Decide which child node is the build node for multi-target trees.
*/
void AssignNodes(RegTree const *p_tree, std::vector<MultiExpandEntry> const &valid_candidates,
common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub);
/**
* @brief Decide which child node is the build node.
*/
void AssignNodes(RegTree const *p_tree, std::vector<CPUExpandEntry> const &candidates,
common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub);
class HistogramBuilder {
/*! \brief cumulative histogram of gradients. */
common::HistCollection hist_;
BoundedHistCollection hist_;
common::ParallelGHistBuilder buffer_;
BatchParam param_;
int32_t n_threads_{-1};
size_t n_batches_{0};
// Whether XGBoost is running in distributed environment.
bool is_distributed_{false};
bool is_col_split_{false};
public:
/**
* \param total_bins Total number of bins across all features
* \param max_bin_per_feat Maximum number of bins per feature, same as the `max_bin`
* training parameter.
* \param n_threads Number of threads.
* \param is_distributed Mostly used for testing to allow injecting parameters instead
* @brief Reset the builder, should be called before growing a new tree.
*
* @param total_bins Total number of bins across all features
* @param is_distributed Mostly used for testing to allow injecting parameters instead
* of using global rabit variable.
*/
void Reset(uint32_t total_bins, BatchParam p, int32_t n_threads, size_t n_batches,
bool is_distributed, bool is_col_split) {
CHECK_GE(n_threads, 1);
n_threads_ = n_threads;
n_batches_ = n_batches;
void Reset(Context const *ctx, bst_bin_t total_bins, BatchParam const &p, bool is_distributed,
bool is_col_split, HistMakerTrainParam const *param) {
n_threads_ = ctx->Threads();
param_ = p;
hist_.Init(total_bins);
hist_.Reset(total_bins, param->internal_max_cached_hist_node);
buffer_.Init(total_bins);
is_distributed_ = is_distributed;
is_col_split_ = is_col_split;
}
template <bool any_missing>
void BuildLocalHistograms(size_t page_idx, common::BlockedSpace2d space,
GHistIndexMatrix const &gidx,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
void BuildLocalHistograms(common::BlockedSpace2d const &space, GHistIndexMatrix const &gidx,
std::vector<bst_node_t> const &nodes_to_build,
common::RowSetCollection const &row_set_collection,
common::Span<GradientPair const> gpair_h, bool force_read_by_column) {
const size_t n_nodes = nodes_for_explicit_hist_build.size();
CHECK_GT(n_nodes, 0);
std::vector<common::GHistRow> target_hists(n_nodes);
for (size_t i = 0; i < n_nodes; ++i) {
auto const nidx = nodes_for_explicit_hist_build[i].nid;
target_hists[i] = hist_[nidx];
}
if (page_idx == 0) {
// FIXME(jiamingy): Handle different size of space. Right now we use the maximum
// partition size for the buffer, which might not be efficient if partition sizes
// has significant variance.
buffer_.Reset(this->n_threads_, n_nodes, space, target_hists);
}
// Parallel processing by nodes and data in each node
common::ParallelFor2d(space, this->n_threads_, [&](size_t nid_in_set, common::Range1d r) {
const auto tid = static_cast<unsigned>(omp_get_thread_num());
const int32_t nid = nodes_for_explicit_hist_build[nid_in_set].nid;
auto elem = row_set_collection[nid];
bst_node_t const nidx = nodes_to_build[nid_in_set];
auto elem = row_set_collection[nidx];
auto start_of_row_set = std::min(r.begin(), elem.Size());
auto end_of_row_set = std::min(r.end(), elem.Size());
auto rid_set = common::RowSetCollection::Elem(elem.begin + start_of_row_set,
elem.begin + end_of_row_set, nid);
elem.begin + end_of_row_set, nidx);
auto hist = buffer_.GetInitializedHist(tid, nid_in_set);
if (rid_set.Size() != 0) {
common::BuildHist<any_missing>(gpair_h, rid_set, gidx, hist, force_read_by_column);
@@ -85,117 +90,143 @@ class HistogramBuilder {
});
}
void AddHistRows(int *starting_index,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick) {
for (auto const &entry : nodes_for_explicit_hist_build) {
int nid = entry.nid;
this->hist_.AddHistRow(nid);
(*starting_index) = std::min(nid, (*starting_index));
/**
* @brief Allocate histogram, rearrange the nodes if `rearrange` is true and the tree
* has reached the cache size limit.
*/
void AddHistRows(RegTree const *p_tree, std::vector<bst_node_t> *p_nodes_to_build,
std::vector<bst_node_t> *p_nodes_to_sub, bool rearrange) {
CHECK(p_nodes_to_build);
auto &nodes_to_build = *p_nodes_to_build;
CHECK(p_nodes_to_sub);
auto &nodes_to_sub = *p_nodes_to_sub;
// We first check whether the cache size is already exceeded or about to be exceeded.
// If not, then we can allocate histograms without clearing the cache and without
// worrying about missing parent histogram.
//
// Otherwise, we need to rearrange the nodes before the allocation to make sure the
// resulting buffer is contiguous. This is to facilitate efficient allreduce.
bool can_host = this->hist_.CanHost(nodes_to_build, nodes_to_sub);
// True if the tree is still within the size of cache limit. Allocate histogram as
// usual.
auto cache_is_valid = can_host && !this->hist_.HasExceeded();
if (!can_host) {
this->hist_.Clear(true);
}
for (auto const &node : nodes_for_subtraction_trick) {
this->hist_.AddHistRow(node.nid);
}
this->hist_.AllocateAllData();
}
/** Main entry point of this class, build histogram for tree nodes. */
void BuildHist(size_t page_id, common::BlockedSpace2d space, GHistIndexMatrix const &gidx,
RegTree const *p_tree, common::RowSetCollection const &row_set_collection,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
common::Span<GradientPair const> gpair, bool force_read_by_column = false) {
int starting_index = std::numeric_limits<int>::max();
if (page_id == 0) {
this->AddHistRows(&starting_index, nodes_for_explicit_hist_build,
nodes_for_subtraction_trick);
}
if (gidx.IsDense()) {
this->BuildLocalHistograms<false>(page_id, space, gidx, nodes_for_explicit_hist_build,
row_set_collection, gpair, force_read_by_column);
} else {
this->BuildLocalHistograms<true>(page_id, space, gidx, nodes_for_explicit_hist_build,
row_set_collection, gpair, force_read_by_column);
}
CHECK_GE(n_batches_, 1);
if (page_id != n_batches_ - 1) {
if (!rearrange || cache_is_valid) {
// If not rearrange, we allocate the histogram as usual, assuming the nodes have
// been properly arranged by other builders.
this->hist_.AllocateHistograms(nodes_to_build, nodes_to_sub);
if (rearrange) {
CHECK(!this->hist_.HasExceeded());
}
return;
}
this->SyncHistogram(p_tree, nodes_for_explicit_hist_build,
nodes_for_subtraction_trick, starting_index);
}
/** same as the other build hist but handles only single batch data (in-core) */
void BuildHist(size_t page_id, GHistIndexMatrix const &gidx, RegTree *p_tree,
common::RowSetCollection const &row_set_collection,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
common::Span<GradientPair const> gpair, bool force_read_by_column = false) {
const size_t n_nodes = nodes_for_explicit_hist_build.size();
// create space of size (# rows in each node)
common::BlockedSpace2d space(
n_nodes,
[&](size_t nidx_in_set) {
const int32_t nidx = nodes_for_explicit_hist_build[nidx_in_set].nid;
return row_set_collection[nidx].Size();
},
256);
this->BuildHist(page_id, space, gidx, p_tree, row_set_collection, nodes_for_explicit_hist_build,
nodes_for_subtraction_trick, gpair, force_read_by_column);
}
void SyncHistogram(RegTree const *p_tree,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
int starting_index) {
auto n_bins = buffer_.TotalBins();
common::BlockedSpace2d space(
nodes_for_explicit_hist_build.size(), [&](size_t) { return n_bins; }, 1024);
CHECK(hist_.IsContiguous());
common::ParallelFor2d(space, this->n_threads_, [&](size_t node, common::Range1d r) {
const auto &entry = nodes_for_explicit_hist_build[node];
auto this_hist = this->hist_[entry.nid];
// Merging histograms from each thread into once
this->buffer_.ReduceHist(node, r.begin(), r.end());
});
if (is_distributed_ && !is_col_split_) {
collective::Allreduce<collective::Operation::kSum>(
reinterpret_cast<double *>(this->hist_[starting_index].data()),
n_bins * nodes_for_explicit_hist_build.size() * 2);
// The cache is full; the parent histogram might have been removed in previous
// iterations to save memory.
std::vector<bst_node_t> can_subtract;
for (auto const &v : nodes_to_sub) {
if (this->hist_.HistogramExists(p_tree->Parent(v))) {
// We can still use the subtraction trick for this node
can_subtract.push_back(v);
} else {
// This node requires a full build
nodes_to_build.push_back(v);
}
}
common::ParallelFor2d(space, this->n_threads_, [&](std::size_t nidx_in_set, common::Range1d r) {
const auto &entry = nodes_for_explicit_hist_build[nidx_in_set];
auto this_hist = this->hist_[entry.nid];
if (!p_tree->IsRoot(entry.nid)) {
auto const parent_id = p_tree->Parent(entry.nid);
auto const subtraction_node_id = nodes_for_subtraction_trick[nidx_in_set].nid;
auto parent_hist = this->hist_[parent_id];
auto sibling_hist = this->hist_[subtraction_node_id];
common::SubtractionHist(sibling_hist, parent_hist, this_hist, r.begin(), r.end());
nodes_to_sub = std::move(can_subtract);
this->hist_.AllocateHistograms(nodes_to_build, nodes_to_sub);
}
/** Main entry point of this class, build histogram for tree nodes. */
void BuildHist(std::size_t page_idx, common::BlockedSpace2d const &space,
GHistIndexMatrix const &gidx, common::RowSetCollection const &row_set_collection,
std::vector<bst_node_t> const &nodes_to_build,
linalg::VectorView<GradientPair const> gpair, bool force_read_by_column = false) {
CHECK(gpair.Contiguous());
if (page_idx == 0) {
// Add the local histogram cache to the parallel buffer before processing the first page.
auto n_nodes = nodes_to_build.size();
std::vector<common::GHistRow> target_hists(n_nodes);
for (size_t i = 0; i < n_nodes; ++i) {
auto const nidx = nodes_to_build[i];
target_hists[i] = hist_[nidx];
}
buffer_.Reset(this->n_threads_, n_nodes, space, target_hists);
}
if (gidx.IsDense()) {
this->BuildLocalHistograms<false>(space, gidx, nodes_to_build, row_set_collection,
gpair.Values(), force_read_by_column);
} else {
this->BuildLocalHistograms<true>(space, gidx, nodes_to_build, row_set_collection,
gpair.Values(), force_read_by_column);
}
}
/**
 * Finalize the histograms after all pages have been processed.
 *
 * Three steps, in order:
 *   1. merge the per-thread partial histograms of every node in `nodes_to_build`;
 *   2. when running distributed without column split, sum the built histograms
 *      across workers with a single allreduce;
 *   3. fill in the histogram of every node in `nodes_to_trick` from its parent and
 *      sibling histograms.
 *
 * @param p_tree          Tree used to look up parent and sibling nodes.
 * @param nodes_to_build  Nodes whose histograms were accumulated from data.
 * @param nodes_to_trick  Nodes whose histograms are obtained via SubtractionHist.
 */
void SyncHistogram(RegTree const *p_tree, std::vector<bst_node_t> const &nodes_to_build,
std::vector<bst_node_t> const &nodes_to_trick) {
auto n_total_bins = buffer_.TotalBins();
// One row per built node, each row covering all bins, split into blocks of at most 1024.
common::BlockedSpace2d space(
nodes_to_build.size(), [&](std::size_t) { return n_total_bins; }, 1024);
common::ParallelFor2d(space, this->n_threads_, [&](size_t node, common::Range1d r) {
// Merging histograms from each thread.
this->buffer_.ReduceHist(node, r.begin(), r.end());
});
if (is_distributed_ && !is_col_split_) {
// The cache is contiguous, we can perform allreduce for all nodes in one go.
CHECK(!nodes_to_build.empty());
// This relies on the built nodes being stored back to back starting at the first one.
auto first_nidx = nodes_to_build.front();
// NOTE(review): the factor 2 presumably accounts for the two doubles (gradient and
// hessian) stored per bin - confirm against GradientPairPrecise.
std::size_t n = n_total_bins * nodes_to_build.size() * 2;
collective::Allreduce<collective::Operation::kSum>(
reinterpret_cast<double *>(this->hist_[first_nidx].data()), n);
}
// The subtraction pass iterates over `nodes_to_trick`; the layout of `space` is reused
// when both lists have the same length, otherwise a space of matching size is created.
common::BlockedSpace2d const &subspace =
nodes_to_trick.size() == nodes_to_build.size()
? space
: common::BlockedSpace2d{nodes_to_trick.size(),
[&](std::size_t) { return n_total_bins; }, 1024};
common::ParallelFor2d(
subspace, this->n_threads_, [&](std::size_t nidx_in_set, common::Range1d r) {
auto subtraction_nidx = nodes_to_trick[nidx_in_set];
auto parent_id = p_tree->Parent(subtraction_nidx);
// The sibling is whichever child of the parent is not the subtraction node.
auto sibling_nidx = p_tree->IsLeftChild(subtraction_nidx) ? p_tree->RightChild(parent_id)
: p_tree->LeftChild(parent_id);
auto sibling_hist = this->hist_[sibling_nidx];
auto parent_hist = this->hist_[parent_id];
auto subtract_hist = this->hist_[subtraction_nidx];
// NOTE(review): presumably writes parent - sibling into `subtract_hist` for the bin
// range [r.begin(), r.end()) - confirm against SubtractionHist.
common::SubtractionHist(subtract_hist, parent_hist, sibling_hist, r.begin(), r.end());
});
}
public:
/* Getters for tests. */
common::HistCollection const &Histogram() { return hist_; }
[[nodiscard]] BoundedHistCollection const &Histogram() const { return hist_; }
[[nodiscard]] BoundedHistCollection &Histogram() { return hist_; }
auto &Buffer() { return buffer_; }
};
// Construct a work space for building histogram. Eventually we should move this
// function into histogram builder once hist tree method supports external memory.
template <typename Partitioner, typename ExpandEntry = CPUExpandEntry>
template <typename Partitioner>
common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
std::vector<ExpandEntry> const &nodes_to_build) {
std::vector<size_t> partition_size(nodes_to_build.size(), 0);
std::vector<bst_node_t> const &nodes_to_build) {
// FIXME(jiamingy): Handle different size of space. Right now we use the maximum
// partition size for the buffer, which might not be efficient if partition sizes
// has significant variance.
std::vector<std::size_t> partition_size(nodes_to_build.size(), 0);
for (auto const &partition : partitioners) {
size_t k = 0;
for (auto node : nodes_to_build) {
auto n_rows_in_node = partition.Partitions()[node.nid].Size();
for (auto nidx : nodes_to_build) {
auto n_rows_in_node = partition.Partitions()[nidx].Size();
partition_size[k] = std::max(partition_size[k], n_rows_in_node);
k++;
}
@@ -204,5 +235,107 @@ common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
nodes_to_build.size(), [&](size_t nidx_in_set) { return partition_size[nidx_in_set]; }, 256};
return space;
}
/**
* @brief Histogram builder that can handle multiple targets.
*/
class MultiHistogramBuilder {
std::vector<HistogramBuilder> target_builders_;
Context const *ctx_;
public:
/**
 * @brief Build the histogram for root node.
 *
 * One histogram is built per target: storage is allocated in every target builder,
 * each data page is accumulated with the matching column of `gpair`, and finally every
 * builder synchronizes its histogram. No node is derived by subtraction here.
 *
 * @param p_fmat               Training data.
 * @param p_tree               Tree being grown; provides the number of targets.
 * @param partitioners         Row partitioners, indexed by page.
 * @param gpair                Gradient matrix with one column per target.
 * @param best                 Expand entry whose `nid` is the node to build.
 * @param param                Batch parameter used to fetch the gradient index pages.
 * @param force_read_by_column Forwarded to the per-target builders.
 */
template <typename Partitioner, typename ExpandEntry>
void BuildRootHist(DMatrix *p_fmat, RegTree const *p_tree,
                   std::vector<Partitioner> const &partitioners,
                   linalg::MatrixView<GradientPair const> gpair, ExpandEntry const &best,
                   BatchParam const &param, bool force_read_by_column = false) {
  auto n_targets = p_tree->NumTargets();
  CHECK_EQ(gpair.Shape(1), n_targets);
  CHECK_EQ(p_fmat->Info().num_row_, gpair.Shape(0));
  CHECK_EQ(target_builders_.size(), n_targets);

  std::vector<bst_node_t> root_nodes{best.nid};
  std::vector<bst_node_t> dummy_sub;
  auto hist_space = ConstructHistSpace(partitioners, root_nodes);

  // There is exactly one builder per target (checked above), so walking the builders
  // visits every target once.
  for (auto &builder : this->target_builders_) {
    builder.AddHistRows(p_tree, &root_nodes, &dummy_sub, false);
  }
  CHECK(dummy_sub.empty());

  std::size_t page_idx{0};
  for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, param)) {
    for (bst_target_t t{0}; t < n_targets; ++t) {
      // Column `t` of the gradient matrix belongs to target `t`.
      auto target_gpair = gpair.Slice(linalg::All(), t);
      this->target_builders_[t].BuildHist(page_idx, hist_space, gidx,
                                          partitioners[page_idx].Partitions(), root_nodes,
                                          target_gpair, force_read_by_column);
    }
    ++page_idx;
  }

  for (auto &builder : this->target_builders_) {
    builder.SyncHistogram(p_tree, root_nodes, dummy_sub);
  }
}
/**
* @brief Build histogram for left and right child of valid candidates
*/
template <typename Partitioner, typename ExpandEntry>
void BuildHistLeftRight(DMatrix *p_fmat, RegTree const *p_tree,
std::vector<Partitioner> const &partitioners,
std::vector<ExpandEntry> const &valid_candidates,
linalg::MatrixView<GradientPair const> gpair, BatchParam const &param,
bool force_read_by_column = false) {
std::vector<bst_node_t> nodes_to_build(valid_candidates.size());
std::vector<bst_node_t> nodes_to_sub(valid_candidates.size());
AssignNodes(p_tree, valid_candidates, nodes_to_build, nodes_to_sub);
// use the first builder for getting number of valid nodes.
target_builders_.front().AddHistRows(p_tree, &nodes_to_build, &nodes_to_sub, true);
CHECK_GE(nodes_to_build.size(), nodes_to_sub.size());
CHECK_EQ(nodes_to_sub.size() + nodes_to_build.size(), valid_candidates.size() * 2);
// allocate storage for the rest of the builders
for (bst_target_t t = 1; t < target_builders_.size(); ++t) {
target_builders_[t].AddHistRows(p_tree, &nodes_to_build, &nodes_to_sub, false);
}
auto space = ConstructHistSpace(partitioners, nodes_to_build);
std::size_t page_idx{0};
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, param)) {
CHECK_EQ(gpair.Shape(1), p_tree->NumTargets());
for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) {
auto t_gpair = gpair.Slice(linalg::All(), t);
CHECK_EQ(t_gpair.Shape(0), p_fmat->Info().num_row_);
this->target_builders_[t].BuildHist(page_idx, space, page,
partitioners[page_idx].Partitions(), nodes_to_build,
t_gpair, force_read_by_column);
}
page_idx++;
}
for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) {
this->target_builders_[t].SyncHistogram(p_tree, nodes_to_build, nodes_to_sub);
}
}
[[nodiscard]] auto const &Histogram(bst_target_t t) const {
return target_builders_[t].Histogram();
}
[[nodiscard]] auto &Histogram(bst_target_t t) { return target_builders_[t].Histogram(); }
void Reset(Context const *ctx, bst_bin_t total_bins, bst_target_t n_targets, BatchParam const &p,
bool is_distributed, bool is_col_split, HistMakerTrainParam const *param) {
ctx_ = ctx;
target_builders_.resize(n_targets);
CHECK_GE(n_targets, 1);
for (auto &v : target_builders_) {
v.Reset(ctx, total_bins, p, is_distributed, is_col_split, param);
}
}
};
} // namespace xgboost::tree
#endif // XGBOOST_TREE_HIST_HISTOGRAM_H_

View File

@@ -2,12 +2,19 @@
* Copyright 2021-2023, XGBoost Contributors
*/
#pragma once
#include "xgboost/parameter.h"
#include <cstddef> // for size_t
#include "xgboost/parameter.h" // for XGBoostParameter
#include "xgboost/tree_model.h" // for RegTree
namespace xgboost::tree {
struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
bool debug_synchronize;
constexpr static std::size_t DefaultNodes() { return static_cast<std::size_t>(1) << 16; }
bool debug_synchronize{false};
std::size_t internal_max_cached_hist_node{DefaultNodes()};
void CheckTreesSynchronized(RegTree const* local_tree) const;
// declare parameters
@@ -15,6 +22,10 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
DMLC_DECLARE_FIELD(debug_synchronize)
.set_default(false)
.describe("Check if all distributed tree are identical after tree construction.");
DMLC_DECLARE_FIELD(internal_max_cached_hist_node)
.set_default(DefaultNodes())
.set_lower_bound(1)
.describe("Maximum number of nodes in CPU histogram cache. Only for internal usage.");
}
};
} // namespace xgboost::tree

View File

@@ -526,7 +526,7 @@ struct SplitEntryContainer {
* \return whether the proposed split is better and can replace current split
*/
template <typename GradientSumT>
bool Update(bst_float new_loss_chg, unsigned split_index, bst_float new_split_value,
bool Update(bst_float new_loss_chg, bst_feature_t split_index, float new_split_value,
bool default_left, bool is_cat, GradientSumT const &left_sum,
GradientSumT const &right_sum) {
if (this->NeedReplace(new_loss_chg, split_index)) {

View File

@@ -3,27 +3,39 @@
*
* \brief Implementation for the approx tree method.
*/
#include <algorithm>
#include <memory>
#include <vector>
#include <algorithm> // for max, transform, fill_n
#include <cstddef> // for size_t
#include <map> // for map
#include <memory> // for allocator, unique_ptr, make_shared, make_unique
#include <utility> // for move
#include <vector> // for vector
#include "../collective/aggregator.h"
#include "../common/random.h"
#include "../data/gradient_index.h"
#include "common_row_partitioner.h"
#include "driver.h"
#include "hist/evaluate_splits.h"
#include "hist/histogram.h"
#include "hist/param.h"
#include "hist/sampler.h" // for SampleGradient
#include "param.h" // for HistMakerTrainParam
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/json.h"
#include "xgboost/linalg.h"
#include "xgboost/task.h" // for ObjInfo
#include "xgboost/tree_model.h"
#include "xgboost/tree_updater.h" // for TreeUpdater
#include "../collective/aggregator.h" // for GlobalSum
#include "../collective/communicator-inl.h" // for IsDistributed
#include "../common/hist_util.h" // for HistogramCuts
#include "../common/random.h" // for ColumnSampler
#include "../common/timer.h" // for Monitor
#include "../data/gradient_index.h" // for GHistIndexMatrix
#include "common_row_partitioner.h" // for CommonRowPartitioner
#include "dmlc/registry.h" // for DMLC_REGISTRY_FILE_TAG
#include "driver.h" // for Driver
#include "hist/evaluate_splits.h" // for HistEvaluator, UpdatePredictionCacheImpl
#include "hist/expand_entry.h" // for CPUExpandEntry
#include "hist/histogram.h" // for MultiHistogramBuilder
#include "hist/param.h" // for HistMakerTrainParam
#include "hist/sampler.h" // for SampleGradient
#include "param.h" // for GradStats, TrainParam
#include "xgboost/base.h" // for Args, GradientPair, bst_node_t, bst_bin_t
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for DMatrix, BatchSet, BatchIterator, MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/json.h" // for Object, Json, FromJson, ToJson, get
#include "xgboost/linalg.h" // for Matrix, MakeTensorView, Empty, MatrixView
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK
#include "xgboost/span.h" // for Span
#include "xgboost/task.h" // for ObjInfo
#include "xgboost/tree_model.h" // for RegTree, RTreeNodeStat
#include "xgboost/tree_updater.h" // for TreeUpdater, TreeUpdaterReg, XGBOOST_REGISTE...
namespace xgboost::tree {
@@ -46,7 +58,7 @@ class GloablApproxBuilder {
HistMakerTrainParam const *hist_param_{nullptr};
std::shared_ptr<common::ColumnSampler> col_sampler_;
HistEvaluator evaluator_;
HistogramBuilder<CPUExpandEntry> histogram_builder_;
MultiHistogramBuilder histogram_builder_;
Context const *ctx_;
ObjInfo const *const task_;
@@ -59,7 +71,7 @@ class GloablApproxBuilder {
common::HistogramCuts feature_values_;
public:
void InitData(DMatrix *p_fmat, common::Span<float> hess) {
void InitData(DMatrix *p_fmat, RegTree const *p_tree, common::Span<float> hess) {
monitor_->Start(__func__);
n_batches_ = 0;
@@ -79,8 +91,9 @@ class GloablApproxBuilder {
n_batches_++;
}
histogram_builder_.Reset(n_total_bins, BatchSpec(*param_, hess), ctx_->Threads(), n_batches_,
collective::IsDistributed(), p_fmat->Info().IsColumnSplit());
histogram_builder_.Reset(ctx_, n_total_bins, p_tree->NumTargets(), BatchSpec(*param_, hess),
collective::IsDistributed(), p_fmat->Info().IsColumnSplit(),
hist_param_);
monitor_->Stop(__func__);
}
@@ -96,20 +109,16 @@ class GloablApproxBuilder {
}
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&root_sum), 2);
std::vector<CPUExpandEntry> nodes{best};
size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess))) {
histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(), nodes,
{}, gpair);
i++;
}
this->histogram_builder_.BuildRootHist(p_fmat, p_tree, partitioner_,
linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1),
best, BatchSpec(*param_, hess));
auto weight = evaluator_.InitRoot(root_sum);
p_tree->Stat(RegTree::kRoot).sum_hess = root_sum.GetHess();
p_tree->Stat(RegTree::kRoot).base_weight = weight;
(*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight);
auto const &histograms = histogram_builder_.Histogram();
auto const &histograms = histogram_builder_.Histogram(0);
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
evaluator_.EvaluateSplits(histograms, feature_values_, ft, *p_tree, &nodes);
monitor_->Stop(__func__);
@@ -130,30 +139,9 @@ class GloablApproxBuilder {
std::vector<CPUExpandEntry> const &valid_candidates,
std::vector<GradientPair> const &gpair, common::Span<float> hess) {
monitor_->Start(__func__);
std::vector<CPUExpandEntry> nodes_to_build;
std::vector<CPUExpandEntry> nodes_to_sub;
for (auto const &c : valid_candidates) {
auto left_nidx = (*p_tree)[c.nid].LeftChild();
auto right_nidx = (*p_tree)[c.nid].RightChild();
auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();
auto build_nidx = left_nidx;
auto subtract_nidx = right_nidx;
if (fewer_right) {
std::swap(build_nidx, subtract_nidx);
}
nodes_to_build.push_back(CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}});
nodes_to_sub.push_back(CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}});
}
size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess))) {
histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
nodes_to_build, nodes_to_sub, gpair);
i++;
}
this->histogram_builder_.BuildHistLeftRight(
p_fmat, p_tree, partitioner_, valid_candidates,
linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1), BatchSpec(*param_, hess));
monitor_->Stop(__func__);
}
@@ -185,7 +173,7 @@ class GloablApproxBuilder {
void UpdateTree(DMatrix *p_fmat, std::vector<GradientPair> const &gpair, common::Span<float> hess,
RegTree *p_tree, HostDeviceVector<bst_node_t> *p_out_position) {
p_last_tree_ = p_tree;
this->InitData(p_fmat, hess);
this->InitData(p_fmat, p_tree, hess);
Driver<CPUExpandEntry> driver(*param_);
auto &tree = *p_tree;
@@ -235,7 +223,7 @@ class GloablApproxBuilder {
best_splits.push_back(l_best);
best_splits.push_back(r_best);
}
auto const &histograms = histogram_builder_.Histogram();
auto const &histograms = histogram_builder_.Histogram(0);
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
monitor_->Start("EvaluateSplits");
evaluator_.EvaluateSplits(histograms, feature_values_, ft, *p_tree, &best_splits);

View File

@@ -7,35 +7,37 @@
#include <algorithm> // for max, copy, transform
#include <cstddef> // for size_t
#include <cstdint> // for uint32_t, int32_t
#include <memory> // for unique_ptr, allocator, make_unique, shared_ptr
#include <numeric> // for accumulate
#include <ostream> // for basic_ostream, char_traits, operator<<
#include <utility> // for move, swap
#include <exception> // for exception
#include <memory> // for allocator, unique_ptr, make_unique, shared_ptr
#include <ostream> // for operator<<, basic_ostream, char_traits
#include <utility> // for move
#include <vector> // for vector
#include "../collective/aggregator.h" // for GlobalSum
#include "../collective/communicator-inl.h" // for Allreduce, IsDistributed
#include "../common/hist_util.h" // for HistogramCuts, HistCollection
#include "../collective/communicator-inl.h" // for IsDistributed
#include "../common/hist_util.h" // for HistogramCuts, GHistRow
#include "../common/linalg_op.h" // for begin, cbegin, cend
#include "../common/random.h" // for ColumnSampler
#include "../common/threading_utils.h" // for ParallelFor
#include "../common/timer.h" // for Monitor
#include "../common/transform_iterator.h" // for IndexTransformIter, MakeIndexTransformIter
#include "../common/transform_iterator.h" // for IndexTransformIter
#include "../data/gradient_index.h" // for GHistIndexMatrix
#include "common_row_partitioner.h" // for CommonRowPartitioner
#include "dmlc/registry.h" // for DMLC_REGISTRY_FILE_TAG
#include "driver.h" // for Driver
#include "hist/evaluate_splits.h" // for HistEvaluator, HistMultiEvaluator, UpdatePre...
#include "hist/expand_entry.h" // for MultiExpandEntry, CPUExpandEntry
#include "hist/histogram.h" // for HistogramBuilder, ConstructHistSpace
#include "hist/hist_cache.h" // for BoundedHistCollection
#include "hist/histogram.h" // for MultiHistogramBuilder
#include "hist/param.h" // for HistMakerTrainParam
#include "hist/sampler.h" // for SampleGradient
#include "param.h" // for TrainParam, SplitEntryContainer, GradStats
#include "xgboost/base.h" // for GradientPairInternal, GradientPair, bst_targ...
#include "param.h" // for TrainParam, GradStats
#include "xgboost/base.h" // for Args, GradientPairPrecise, GradientPair, Gra...
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for BatchIterator, BatchSet, DMatrix, MetaInfo
#include "xgboost/data.h" // for BatchSet, DMatrix, BatchIterator, MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/linalg.h" // for All, MatrixView, TensorView, Matrix, Empty
#include "xgboost/json.h" // for Object, Json, FromJson, ToJson, get
#include "xgboost/linalg.h" // for MatrixView, TensorView, All, Matrix, Empty
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_GE
#include "xgboost/span.h" // for Span, operator!=, SpanIterator
#include "xgboost/string_view.h" // for operator<<
@@ -120,7 +122,7 @@ class MultiTargetHistBuilder {
std::shared_ptr<common::ColumnSampler> col_sampler_;
std::unique_ptr<HistMultiEvaluator> evaluator_;
// Histogram builder for each target.
std::vector<HistogramBuilder<MultiExpandEntry>> histogram_builder_;
std::unique_ptr<MultiHistogramBuilder> histogram_builder_;
Context const *ctx_{nullptr};
// Partitioner for each data batch.
std::vector<CommonRowPartitioner> partitioner_;
@@ -150,7 +152,6 @@ class MultiTargetHistBuilder {
monitor_->Start(__func__);
p_last_fmat_ = p_fmat;
std::size_t page_id = 0;
bst_bin_t n_total_bins = 0;
partitioner_.clear();
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
@@ -160,16 +161,13 @@ class MultiTargetHistBuilder {
CHECK_EQ(n_total_bins, page.cut.TotalBins());
}
partitioner_.emplace_back(ctx_, page.Size(), page.base_rowid, p_fmat->Info().IsColumnSplit());
page_id++;
}
bst_target_t n_targets = p_tree->NumTargets();
histogram_builder_.clear();
for (std::size_t i = 0; i < n_targets; ++i) {
histogram_builder_.emplace_back();
histogram_builder_.back().Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
collective::IsDistributed(), p_fmat->Info().IsColumnSplit());
}
histogram_builder_ = std::make_unique<MultiHistogramBuilder>();
histogram_builder_->Reset(ctx_, n_total_bins, n_targets, HistBatch(param_),
collective::IsDistributed(), p_fmat->Info().IsColumnSplit(),
hist_param_);
evaluator_ = std::make_unique<HistMultiEvaluator>(ctx_, p_fmat->Info(), param_, col_sampler_);
p_last_tree_ = p_tree;
@@ -204,17 +202,7 @@ class MultiTargetHistBuilder {
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(root_sum.Values().data()),
root_sum.Size() * 2);
std::vector<MultiExpandEntry> nodes{best};
std::size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
for (bst_target_t t{0}; t < n_targets; ++t) {
auto t_gpair = gpair.Slice(linalg::All(), t);
histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
nodes, {}, t_gpair.Values());
}
i++;
}
histogram_builder_->BuildRootHist(p_fmat, p_tree, partitioner_, gpair, best, HistBatch(param_));
auto weight = evaluator_->InitRoot(root_sum);
auto weight_t = weight.HostView();
@@ -222,9 +210,10 @@ class MultiTargetHistBuilder {
[&](float w) { return w * param_->learning_rate; });
p_tree->SetLeaf(RegTree::kRoot, weight_t);
std::vector<common::HistCollection const *> hists;
std::vector<BoundedHistCollection const *> hists;
std::vector<MultiExpandEntry> nodes{{RegTree::kRoot, 0}};
for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
hists.push_back(&histogram_builder_[t].Histogram());
hists.push_back(&(*histogram_builder_).Histogram(t));
}
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, &nodes);
@@ -239,50 +228,17 @@ class MultiTargetHistBuilder {
std::vector<MultiExpandEntry> const &valid_candidates,
linalg::MatrixView<GradientPair const> gpair) {
monitor_->Start(__func__);
std::vector<MultiExpandEntry> nodes_to_build;
std::vector<MultiExpandEntry> nodes_to_sub;
for (auto const &c : valid_candidates) {
auto left_nidx = p_tree->LeftChild(c.nid);
auto right_nidx = p_tree->RightChild(c.nid);
auto build_nidx = left_nidx;
auto subtract_nidx = right_nidx;
auto lit =
common::MakeIndexTransformIter([&](auto i) { return c.split.left_sum[i].GetHess(); });
auto left_sum = std::accumulate(lit, lit + c.split.left_sum.size(), .0);
auto rit =
common::MakeIndexTransformIter([&](auto i) { return c.split.right_sum[i].GetHess(); });
auto right_sum = std::accumulate(rit, rit + c.split.right_sum.size(), .0);
auto fewer_right = right_sum < left_sum;
if (fewer_right) {
std::swap(build_nidx, subtract_nidx);
}
nodes_to_build.emplace_back(build_nidx, p_tree->GetDepth(build_nidx));
nodes_to_sub.emplace_back(subtract_nidx, p_tree->GetDepth(subtract_nidx));
}
std::size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
for (std::size_t t = 0; t < p_tree->NumTargets(); ++t) {
auto t_gpair = gpair.Slice(linalg::All(), t);
// Make sure the gradient matrix is f-order.
CHECK(t_gpair.Contiguous());
histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
nodes_to_build, nodes_to_sub, t_gpair.Values());
}
i++;
}
histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates, gpair,
HistBatch(param_));
monitor_->Stop(__func__);
}
void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
std::vector<MultiExpandEntry> *best_splits) {
monitor_->Start(__func__);
std::vector<common::HistCollection const *> hists;
std::vector<BoundedHistCollection const *> hists;
for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
hists.push_back(&histogram_builder_[t].Histogram());
hists.push_back(&(*histogram_builder_).Histogram(t));
}
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, best_splits);
@@ -349,7 +305,7 @@ class HistUpdater {
const RegTree *p_last_tree_{nullptr};
DMatrix const *const p_last_fmat_{nullptr};
std::unique_ptr<HistogramBuilder<CPUExpandEntry>> histogram_builder_;
std::unique_ptr<MultiHistogramBuilder> histogram_builder_;
ObjInfo const *task_{nullptr};
// Context for number of threads
Context const *ctx_{nullptr};
@@ -364,7 +320,7 @@ class HistUpdater {
col_sampler_{std::move(column_sampler)},
evaluator_{std::make_unique<HistEvaluator>(ctx, param, fmat->Info(), col_sampler_)},
p_last_fmat_(fmat),
histogram_builder_{new HistogramBuilder<CPUExpandEntry>},
histogram_builder_{new MultiHistogramBuilder},
task_{task},
ctx_{ctx} {
monitor_->Init(__func__);
@@ -387,7 +343,6 @@ class HistUpdater {
// initialize temp data structure
void InitData(DMatrix *fmat, RegTree const *p_tree) {
monitor_->Start(__func__);
std::size_t page_id{0};
bst_bin_t n_total_bins{0};
partitioner_.clear();
for (auto const &page : fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
@@ -398,10 +353,9 @@ class HistUpdater {
}
partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid,
fmat->Info().IsColumnSplit());
++page_id;
}
histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
collective::IsDistributed(), fmat->Info().IsColumnSplit());
histogram_builder_->Reset(ctx_, n_total_bins, 1, HistBatch(param_), collective::IsDistributed(),
fmat->Info().IsColumnSplit(), hist_param_);
evaluator_ = std::make_unique<HistEvaluator>(ctx_, this->param_, fmat->Info(), col_sampler_);
p_last_tree_ = p_tree;
monitor_->Stop(__func__);
@@ -410,7 +364,7 @@ class HistUpdater {
void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
std::vector<CPUExpandEntry> *best_splits) {
monitor_->Start(__func__);
auto const &histograms = histogram_builder_->Histogram();
auto const &histograms = histogram_builder_->Histogram(0);
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
evaluator_->EvaluateSplits(histograms, gmat.cut, ft, *p_tree, best_splits);
@@ -428,16 +382,8 @@ class HistUpdater {
monitor_->Start(__func__);
CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0));
std::size_t page_id = 0;
auto space = ConstructHistSpace(partitioner_, {node});
for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
std::vector<CPUExpandEntry> nodes_to_build{node};
std::vector<CPUExpandEntry> nodes_to_sub;
this->histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
partitioner_.at(page_id).Partitions(), nodes_to_build,
nodes_to_sub, gpair.Slice(linalg::All(), 0).Values());
++page_id;
}
this->histogram_builder_->BuildRootHist(p_fmat, p_tree, partitioner_, gpair, node,
HistBatch(param_));
{
GradientPairPrecise grad_stat;
@@ -451,7 +397,7 @@ class HistUpdater {
CHECK_GE(row_ptr.size(), 2);
std::uint32_t const ibegin = row_ptr[0];
std::uint32_t const iend = row_ptr[1];
auto hist = this->histogram_builder_->Histogram()[RegTree::kRoot];
auto hist = this->histogram_builder_->Histogram(0)[RegTree::kRoot];
auto begin = hist.data();
for (std::uint32_t i = ibegin; i < iend; ++i) {
GradientPairPrecise const &et = begin[i];
@@ -474,7 +420,7 @@ class HistUpdater {
monitor_->Start("EvaluateSplits");
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree,
evaluator_->EvaluateSplits(histogram_builder_->Histogram(0), gmat.cut, ft, *p_tree,
&entries);
break;
}
@@ -490,33 +436,8 @@ class HistUpdater {
std::vector<CPUExpandEntry> const &valid_candidates,
linalg::MatrixView<GradientPair const> gpair) {
monitor_->Start(__func__);
std::vector<CPUExpandEntry> nodes_to_build(valid_candidates.size());
std::vector<CPUExpandEntry> nodes_to_sub(valid_candidates.size());
std::size_t n_idx = 0;
for (auto const &c : valid_candidates) {
auto left_nidx = (*p_tree)[c.nid].LeftChild();
auto right_nidx = (*p_tree)[c.nid].RightChild();
auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();
auto build_nidx = left_nidx;
auto subtract_nidx = right_nidx;
if (fewer_right) {
std::swap(build_nidx, subtract_nidx);
}
nodes_to_build[n_idx] = CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}};
nodes_to_sub[n_idx] = CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}};
n_idx++;
}
std::size_t page_id{0};
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
partitioner_.at(page_id).Partitions(), nodes_to_build,
nodes_to_sub, gpair.Values());
++page_id;
}
this->histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates,
gpair, HistBatch(param_));
monitor_->Stop(__func__);
}