Use realloc for histogram cache and expose the cache limit. (#9455)

This commit is contained in:
Jiaming Yuan
2023-08-10 14:05:27 +08:00
committed by GitHub
parent a57371ef7c
commit 1caa93221a
10 changed files with 71 additions and 20 deletions

View File

@@ -5,12 +5,14 @@
#define XGBOOST_TREE_HIST_HIST_CACHE_H_
#include <cstddef> // for size_t
#include <map> // for map
#include <memory> // for unique_ptr
#include <vector> // for vector
#include "../../common/hist_util.h" // for GHistRow, ConstGHistRow
#include "xgboost/base.h" // for bst_node_t, bst_bin_t
#include "xgboost/logging.h" // for CHECK_GT
#include "xgboost/span.h" // for Span
#include "../../common/hist_util.h" // for GHistRow, ConstGHistRow
#include "../../common/ref_resource_view.h" // for ReallocVector
#include "xgboost/base.h" // for bst_node_t, bst_bin_t
#include "xgboost/logging.h" // for CHECK_GT
#include "xgboost/span.h" // for Span
namespace xgboost::tree {
/**
@@ -32,7 +34,8 @@ class BoundedHistCollection {
std::size_t current_size_{0};
// stores the histograms in a contiguous buffer
std::vector<GradientPairPrecise> data_;
using Vec = common::ReallocVector<GradientPairPrecise>;
std::unique_ptr<Vec> data_{new Vec{}}; // nvcc 12.1 trips over std::make_unique
// number of histogram bins across all features
bst_bin_t n_total_bins_{0};
@@ -42,13 +45,14 @@ class BoundedHistCollection {
bool has_exceeded_{false};
public:
BoundedHistCollection() = default;
// Mutable access to the histogram of node `idx`: reads the node's offset from
// `node_map_` and returns the window of `n_total_bins_` elements of the
// histogram buffer that starts at that offset.
// NOTE(review): `node_map_.at` presumably throws for a node that has no entry
// (std::map semantics) -- confirm against the member's declaration.
// NOTE(review): the two return statements look like the before/after lines of
// a diff hunk; with `data_` held through a std::unique_ptr only the `data_->`
// form applies.
common::GHistRow operator[](std::size_t idx) {
auto offset = node_map_.at(idx);
return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_);
return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
}
// Read-only access to the histogram of node `idx`: reads the node's offset
// from `node_map_` and returns a const view over the `n_total_bins_` elements
// of the histogram buffer that start at that offset.
// NOTE(review): `node_map_.at` presumably throws for a node that has no entry
// (std::map semantics) -- confirm against the member's declaration.
// NOTE(review): the two return statements look like the before/after lines of
// a diff hunk; with `data_` held through a std::unique_ptr only the `data_->`
// form applies.
common::ConstGHistRow operator[](std::size_t idx) const {
auto offset = node_map_.at(idx);
return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_);
return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
}
void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) {
n_total_bins_ = n_total_bins;
@@ -81,8 +85,8 @@ class BoundedHistCollection {
auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
auto alloc_size = n_new_nodes * n_total_bins_;
auto new_size = alloc_size + current_size_;
if (new_size > data_.size()) {
data_.resize(new_size);
if (new_size > data_->size()) {
data_->Resize(new_size);
}
for (auto nidx : nodes_to_build) {
node_map_[nidx] = current_size_;

View File

@@ -63,7 +63,7 @@ class HistogramBuilder {
bool is_col_split, HistMakerTrainParam const *param) {
n_threads_ = ctx->Threads();
param_ = p;
hist_.Reset(total_bins, param->internal_max_cached_hist_node);
hist_.Reset(total_bins, param->max_cached_hist_node);
buffer_.Init(total_bins);
is_distributed_ = is_distributed;
is_col_split_ = is_col_split;

View File

@@ -13,7 +13,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
constexpr static std::size_t DefaultNodes() { return static_cast<std::size_t>(1) << 16; }
bool debug_synchronize{false};
std::size_t internal_max_cached_hist_node{DefaultNodes()};
std::size_t max_cached_hist_node{DefaultNodes()};
void CheckTreesSynchronized(RegTree const* local_tree) const;
@@ -22,7 +22,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
DMLC_DECLARE_FIELD(debug_synchronize)
.set_default(false)
.describe("Check if all distributed tree are identical after tree construction.");
DMLC_DECLARE_FIELD(internal_max_cached_hist_node)
DMLC_DECLARE_FIELD(max_cached_hist_node)
.set_default(DefaultNodes())
.set_lower_bound(1)
.describe("Maximum number of nodes in CPU histogram cache. Only for internal usage.");

View File

@@ -866,6 +866,9 @@ class GPUGlobalApproxMaker : public TreeUpdater {
// Used in test to count how many configurations are performed
LOG(DEBUG) << "[GPU Approx]: Configure";
hist_maker_param_.UpdateAllowUnknown(args);
if (hist_maker_param_.max_cached_hist_node != HistMakerTrainParam::DefaultNodes()) {
LOG(WARNING) << "The `max_cached_hist_node` is ignored in GPU.";
}
dh::CheckComputeCapability();
initialised_ = false;