Use realloc for histogram cache and expose the cache limit. (#9455)

This commit is contained in:
Jiaming Yuan 2023-08-10 14:05:27 +08:00 committed by GitHub
parent a57371ef7c
commit 1caa93221a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 71 additions and 20 deletions

View File

@ -226,6 +226,15 @@ Parameters for Tree Booster
- ``one_output_per_tree``: One model for each target. - ``one_output_per_tree``: One model for each target.
- ``multi_output_tree``: Use multi-target trees. - ``multi_output_tree``: Use multi-target trees.
* ``max_cached_hist_node``, [default = 65536]
Maximum number of tree nodes whose histograms are kept in the CPU histogram cache.
.. versionadded:: 2.0.0
- In most cases this parameter should not be set; the exception is growing deep trees
on CPU.
.. _cat-param: .. _cat-param:
Parameters for Categorical Feature Parameters for Categorical Feature

View File

@ -42,7 +42,7 @@ hist_parameter_strategy = strategies.fixed_dictionaries(
) )
hist_cache_strategy = strategies.fixed_dictionaries( hist_cache_strategy = strategies.fixed_dictionaries(
{"internal_max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])} {"max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])}
) )
hist_multi_parameter_strategy = strategies.fixed_dictionaries( hist_multi_parameter_strategy = strategies.fixed_dictionaries(

View File

@ -35,6 +35,13 @@ class RefResourceView {
size_type size_{0}; size_type size_{0};
std::shared_ptr<common::ResourceHandler> mem_{nullptr}; std::shared_ptr<common::ResourceHandler> mem_{nullptr};
protected:
void Init(value_type* ptr, size_type size, std::shared_ptr<common::ResourceHandler> mem) {
ptr_ = ptr;
size_ = size;
mem_ = std::move(mem);
}
public: public:
RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem) RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem)
: ptr_{ptr}, size_{n}, mem_{std::move(mem)} { : ptr_{ptr}, size_{n}, mem_{std::move(mem)} {
@ -60,11 +67,11 @@ class RefResourceView {
RefResourceView() = default; RefResourceView() = default;
RefResourceView(RefResourceView const& that) = delete; RefResourceView(RefResourceView const& that) = delete;
RefResourceView(RefResourceView&& that) = delete;
RefResourceView& operator=(RefResourceView const& that) = delete; RefResourceView& operator=(RefResourceView const& that) = delete;
/** /**
* @brief We allow move assignment for lazy initialization. * @brief We allow move assignment for lazy initialization.
*/ */
RefResourceView(RefResourceView&& that) = default;
RefResourceView& operator=(RefResourceView&& that) = default; RefResourceView& operator=(RefResourceView&& that) = default;
[[nodiscard]] size_type size() const { return size_; } // NOLINT [[nodiscard]] size_type size() const { return size_; } // NOLINT
@ -154,5 +161,33 @@ template <typename T>
auto resource = std::make_shared<common::MallocResource>(n_elements * sizeof(T)); auto resource = std::make_shared<common::MallocResource>(n_elements * sizeof(T));
return RefResourceView{resource->DataAs<T>(), n_elements, resource, init}; return RefResourceView{resource->DataAs<T>(), n_elements, resource, init};
} }
template <typename T>
class ReallocVector : public RefResourceView<T> {
static_assert(!std::is_reference_v<T>);
static_assert(!std::is_const_v<T>);
static_assert(std::is_trivially_copyable_v<T>);
using Upper = RefResourceView<T>;
using size_type = typename Upper::size_type; // NOLINT
using value_type = typename Upper::value_type; // NOLINT
public:
ReallocVector() : RefResourceView<T>{MakeFixedVecWithMalloc(0, T{})} {}
ReallocVector(size_type n, value_type const& init)
: RefResourceView<T>{MakeFixedVecWithMalloc(n, init)} {}
ReallocVector(ReallocVector const& that) = delete;
ReallocVector(ReallocVector&& that) = delete;
ReallocVector& operator=(ReallocVector const& that) = delete;
ReallocVector& operator=(ReallocVector&& that) = delete;
void Resize(typename Upper::size_type new_size) {
auto resource = std::dynamic_pointer_cast<common::MallocResource>(this->Resource());
CHECK(resource);
resource->Resize(new_size * sizeof(T));
this->Init(resource->template DataAs<T>(), new_size, resource);
}
};
} // namespace xgboost::common } // namespace xgboost::common
#endif // XGBOOST_COMMON_REF_RESOURCE_VIEW_H_ #endif // XGBOOST_COMMON_REF_RESOURCE_VIEW_H_

View File

@ -5,12 +5,14 @@
#define XGBOOST_TREE_HIST_HIST_CACHE_H_ #define XGBOOST_TREE_HIST_HIST_CACHE_H_
#include <cstddef> // for size_t #include <cstddef> // for size_t
#include <map> // for map #include <map> // for map
#include <memory> // for unique_ptr
#include <vector> // for vector #include <vector> // for vector
#include "../../common/hist_util.h" // for GHistRow, ConstGHistRow #include "../../common/hist_util.h" // for GHistRow, ConstGHistRow
#include "xgboost/base.h" // for bst_node_t, bst_bin_t #include "../../common/ref_resource_view.h" // for ReallocVector
#include "xgboost/logging.h" // for CHECK_GT #include "xgboost/base.h" // for bst_node_t, bst_bin_t
#include "xgboost/span.h" // for Span #include "xgboost/logging.h" // for CHECK_GT
#include "xgboost/span.h" // for Span
namespace xgboost::tree { namespace xgboost::tree {
/** /**
@ -32,7 +34,8 @@ class BoundedHistCollection {
std::size_t current_size_{0}; std::size_t current_size_{0};
// stores the histograms in a contiguous buffer // stores the histograms in a contiguous buffer
std::vector<GradientPairPrecise> data_; using Vec = common::ReallocVector<GradientPairPrecise>;
std::unique_ptr<Vec> data_{new Vec{}}; // nvcc 12.1 trips over std::make_unique
// number of histogram bins across all features // number of histogram bins across all features
bst_bin_t n_total_bins_{0}; bst_bin_t n_total_bins_{0};
@ -42,13 +45,14 @@ class BoundedHistCollection {
bool has_exceeded_{false}; bool has_exceeded_{false};
public: public:
BoundedHistCollection() = default;
common::GHistRow operator[](std::size_t idx) { common::GHistRow operator[](std::size_t idx) {
auto offset = node_map_.at(idx); auto offset = node_map_.at(idx);
return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_); return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
} }
common::ConstGHistRow operator[](std::size_t idx) const { common::ConstGHistRow operator[](std::size_t idx) const {
auto offset = node_map_.at(idx); auto offset = node_map_.at(idx);
return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_); return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
} }
void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) { void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) {
n_total_bins_ = n_total_bins; n_total_bins_ = n_total_bins;
@ -81,8 +85,8 @@ class BoundedHistCollection {
auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size(); auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
auto alloc_size = n_new_nodes * n_total_bins_; auto alloc_size = n_new_nodes * n_total_bins_;
auto new_size = alloc_size + current_size_; auto new_size = alloc_size + current_size_;
if (new_size > data_.size()) { if (new_size > data_->size()) {
data_.resize(new_size); data_->Resize(new_size);
} }
for (auto nidx : nodes_to_build) { for (auto nidx : nodes_to_build) {
node_map_[nidx] = current_size_; node_map_[nidx] = current_size_;

View File

@ -63,7 +63,7 @@ class HistogramBuilder {
bool is_col_split, HistMakerTrainParam const *param) { bool is_col_split, HistMakerTrainParam const *param) {
n_threads_ = ctx->Threads(); n_threads_ = ctx->Threads();
param_ = p; param_ = p;
hist_.Reset(total_bins, param->internal_max_cached_hist_node); hist_.Reset(total_bins, param->max_cached_hist_node);
buffer_.Init(total_bins); buffer_.Init(total_bins);
is_distributed_ = is_distributed; is_distributed_ = is_distributed;
is_col_split_ = is_col_split; is_col_split_ = is_col_split;

View File

@ -13,7 +13,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
constexpr static std::size_t DefaultNodes() { return static_cast<std::size_t>(1) << 16; } constexpr static std::size_t DefaultNodes() { return static_cast<std::size_t>(1) << 16; }
bool debug_synchronize{false}; bool debug_synchronize{false};
std::size_t internal_max_cached_hist_node{DefaultNodes()}; std::size_t max_cached_hist_node{DefaultNodes()};
void CheckTreesSynchronized(RegTree const* local_tree) const; void CheckTreesSynchronized(RegTree const* local_tree) const;
@ -22,7 +22,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
DMLC_DECLARE_FIELD(debug_synchronize) DMLC_DECLARE_FIELD(debug_synchronize)
.set_default(false) .set_default(false)
.describe("Check if all distributed tree are identical after tree construction."); .describe("Check if all distributed tree are identical after tree construction.");
DMLC_DECLARE_FIELD(internal_max_cached_hist_node) DMLC_DECLARE_FIELD(max_cached_hist_node)
.set_default(DefaultNodes()) .set_default(DefaultNodes())
.set_lower_bound(1) .set_lower_bound(1)
.describe("Maximum number of nodes in CPU histogram cache. Only for internal usage."); .describe("Maximum number of nodes in CPU histogram cache. Only for internal usage.");

View File

@ -866,6 +866,9 @@ class GPUGlobalApproxMaker : public TreeUpdater {
// Used in test to count how many configurations are performed // Used in test to count how many configurations are performed
LOG(DEBUG) << "[GPU Approx]: Configure"; LOG(DEBUG) << "[GPU Approx]: Configure";
hist_maker_param_.UpdateAllowUnknown(args); hist_maker_param_.UpdateAllowUnknown(args);
if (hist_maker_param_.max_cached_hist_node != HistMakerTrainParam::DefaultNodes()) {
LOG(WARNING) << "The `max_cached_hist_node` is ignored in GPU.";
}
dh::CheckComputeCapability(); dh::CheckComputeCapability();
initialised_ = false; initialised_ = false;

View File

@ -51,7 +51,7 @@ void TestEvaluateSplits(bool force_read_by_column) {
row_set_collection.Init(); row_set_collection.Init();
HistMakerTrainParam hist_param; HistMakerTrainParam hist_param;
hist.Reset(gmat.cut.Ptrs().back(), hist_param.internal_max_cached_hist_node); hist.Reset(gmat.cut.Ptrs().back(), hist_param.max_cached_hist_node);
hist.AllocateHistograms({0}); hist.AllocateHistograms({0});
common::BuildHist<false>(row_gpairs, row_set_collection[0], gmat, hist[0], force_read_by_column); common::BuildHist<false>(row_gpairs, row_set_collection[0], gmat, hist[0], force_read_by_column);
@ -118,7 +118,7 @@ TEST(HistMultiEvaluator, Evaluate) {
linalg::Vector<GradientPairPrecise> root_sum({2}, Context::kCpuId); linalg::Vector<GradientPairPrecise> root_sum({2}, Context::kCpuId);
for (bst_target_t t{0}; t < n_targets; ++t) { for (bst_target_t t{0}; t < n_targets; ++t) {
auto &hist = histogram[t]; auto &hist = histogram[t];
hist.Reset(n_bins * n_features, hist_param.internal_max_cached_hist_node); hist.Reset(n_bins * n_features, hist_param.max_cached_hist_node);
hist.AllocateHistograms({0}); hist.AllocateHistograms({0});
auto node_hist = hist[0]; auto node_hist = hist[0];
node_hist[0] = {-0.5, 0.5}; node_hist[0] = {-0.5, 0.5};
@ -235,7 +235,7 @@ auto CompareOneHotAndPartition(bool onehot) {
entries.front().nid = 0; entries.front().nid = 0;
entries.front().depth = 0; entries.front().depth = 0;
hist.Reset(gmat.cut.TotalBins(), hist_param.internal_max_cached_hist_node); hist.Reset(gmat.cut.TotalBins(), hist_param.max_cached_hist_node);
hist.AllocateHistograms({0}); hist.AllocateHistograms({0});
auto node_hist = hist[0]; auto node_hist = hist[0];
@ -265,7 +265,7 @@ TEST(HistEvaluator, Categorical) {
TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) { TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
BoundedHistCollection hist; BoundedHistCollection hist;
HistMakerTrainParam hist_param; HistMakerTrainParam hist_param;
hist.Reset(cuts_.TotalBins(), hist_param.internal_max_cached_hist_node); hist.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node);
hist.AllocateHistograms({0}); hist.AllocateHistograms({0});
auto node_hist = hist[0]; auto node_hist = hist[0];
ASSERT_EQ(node_hist.size(), feature_histogram_.size()); ASSERT_EQ(node_hist.size(), feature_histogram_.size());

View File

@ -516,7 +516,7 @@ class OverflowTest : public ::testing::TestWithParam<std::tuple<bool, bool>> {
Context ctx; Context ctx;
HistMakerTrainParam hist_param; HistMakerTrainParam hist_param;
if (limit) { if (limit) {
hist_param.Init(Args{{"internal_max_cached_hist_node", "1"}}); hist_param.Init(Args{{"max_cached_hist_node", "1"}});
} }
std::shared_ptr<DMatrix> Xy = std::shared_ptr<DMatrix> Xy =

View File

@ -59,7 +59,7 @@ class TestPartitionBasedSplit : public ::testing::Test {
cuts_.min_vals_.Resize(1); cuts_.min_vals_.Resize(1);
HistMakerTrainParam hist_param; HistMakerTrainParam hist_param;
hist_.Reset(cuts_.TotalBins(), hist_param.internal_max_cached_hist_node); hist_.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node);
hist_.AllocateHistograms({0}); hist_.AllocateHistograms({0});
auto node_hist = hist_[0]; auto node_hist = hist_[0];