Extract histogram builder from CPU Hist. (#7152)

* Extract the CPU histogram builder.
* Fix tests.
* Reduce number of histograms being built.
This commit is contained in:
Jiaming Yuan
2021-08-09 21:15:21 +08:00
committed by GitHub
parent 336af4f974
commit 149f209af6
6 changed files with 647 additions and 676 deletions

302
src/tree/hist/histogram.h Normal file
View File

@@ -0,0 +1,302 @@
/*!
* Copyright 2021 by XGBoost Contributors
*/
#ifndef XGBOOST_TREE_HIST_HISTOGRAM_H_
#define XGBOOST_TREE_HIST_HISTOGRAM_H_
#include <algorithm>
#include <limits>
#include <vector>
#include "rabit/rabit.h"
#include "xgboost/tree_model.h"
#include "../../common/hist_util.h"
namespace xgboost {
namespace tree {
/*!
 * \brief Builds per-node gradient histograms for the CPU hist tree method.
 *
 * The class owns the histogram collections, the per-thread build buffer and the
 * reducer used for cross-worker synchronisation.  `BuildHist` is the main entry
 * point: it allocates histogram rows, accumulates local histograms for the
 * explicitly built nodes and derives the remaining ones via the subtraction
 * trick.
 *
 * \tparam GradientSumT Floating point type used for gradient sums.
 * \tparam ExpandEntry  Node descriptor type; only its `nid` member is read here.
 */
template <typename GradientSumT, typename ExpandEntry> class HistogramBuilder {
  using GradientPairT = xgboost::detail::GradientPairInternal<GradientSumT>;
  using GHistRowT = common::GHistRow<GradientSumT>;
  /*! \brief cumulative histogram of gradients. */
  common::HistCollection<GradientSumT> hist_;
  /*! \brief cumulative local parent histogram of gradients. */
  common::HistCollection<GradientSumT> hist_local_worker_;
  common::GHistBuilder<GradientSumT> builder_;
  common::ParallelGHistBuilder<GradientSumT> buffer_;
  rabit::Reducer<GradientPairT, GradientPairT::Reduce> reducer_;
  int32_t max_bin_ {-1};
  int32_t n_threads_ {-1};
  // Whether XGBoost is running in distributed environment.
  bool is_distributed_ {false};

 public:
  /**
   * \param total_bins       Total number of bins across all features
   * \param max_bin_per_feat Maximum number of bins per feature, same as the `max_bin`
   *                         training parameter.
   * \param n_threads        Number of threads.
   * \param is_distributed   Mostly used for testing to allow injecting parameters instead
   *                         of using global rabit variable.
   */
  void Reset(uint32_t total_bins, int32_t max_bin_per_feat, int32_t n_threads,
             bool is_distributed = rabit::IsDistributed()) {
    CHECK_GE(n_threads, 1);
    n_threads_ = n_threads;
    CHECK_GE(max_bin_per_feat, 2);
    max_bin_ = max_bin_per_feat;
    hist_.Init(total_bins);
    hist_local_worker_.Init(total_bins);
    buffer_.Init(total_bins);
    builder_ = common::GHistBuilder<GradientSumT>(n_threads, total_bins);
    is_distributed_ = is_distributed;
  }

  /**
   * \brief Accumulate the per-thread histograms of the explicitly built nodes.
   *
   * Rows for every node in `nodes_for_explicit_hist_build` must already exist in
   * `hist_` (see `AddHistRows`), because they are looked up as the reduction
   * targets of the per-thread buffer.
   *
   * \tparam any_missing Forwarded to `GHistBuilder::BuildHist`.
   * \param p_fmat                        Training data; iterated as `GHistIndexMatrix` batches.
   * \param nodes_for_explicit_hist_build Nodes whose histograms are built from data.
   * \param row_set_collection            Row indices belonging to each node.
   * \param gpair_h                       Gradient pairs on host.
   */
  template <bool any_missing>
  void BuildLocalHistograms(
      DMatrix *p_fmat,
      // Taken by const reference: the list is only read, so copying it on every
      // call is wasted work.
      std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
      common::RowSetCollection const &row_set_collection,
      const std::vector<GradientPair> &gpair_h) {
    const size_t n_nodes = nodes_for_explicit_hist_build.size();

    // create space of size (# rows in each node)
    common::BlockedSpace2d space(
        n_nodes,
        [&](size_t node) {
          const int32_t nid = nodes_for_explicit_hist_build[node].nid;
          return row_set_collection[nid].Size();
        },
        256);

    std::vector<GHistRowT> target_hists(n_nodes);
    for (size_t i = 0; i < n_nodes; ++i) {
      const int32_t nid = nodes_for_explicit_hist_build[i].nid;
      target_hists[i] = hist_[nid];
    }
    buffer_.Reset(this->n_threads_, n_nodes, space, target_hists);

    // Parallel processing by nodes and data in each node
    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(
             BatchParam{GenericParameter::kCpuId, max_bin_})) {
      common::ParallelFor2d(
          space, this->n_threads_, [&](size_t nid_in_set, common::Range1d r) {
            const auto tid = static_cast<unsigned>(omp_get_thread_num());
            const int32_t nid = nodes_for_explicit_hist_build[nid_in_set].nid;
            auto start_of_row_set = row_set_collection[nid].begin;
            auto rid_set = common::RowSetCollection::Elem(
                start_of_row_set + r.begin(), start_of_row_set + r.end(), nid);
            builder_.template BuildHist<any_missing>(
                gpair_h, rid_set, gmat,
                buffer_.GetInitializedHist(tid, nid_in_set));
          });
    }
  }

  /**
   * \brief Allocate histogram rows for the given nodes.
   *
   * Dispatches to the distributed or the local variant depending on
   * `is_distributed_`.
   *
   * \param starting_index [in,out] Lowered to the smallest node id selected for
   *                       synchronisation; callers initialise it to `INT_MAX`.
   * \param sync_count     [out] Number of histogram rows to synchronise.
   * \param p_tree         Only consulted in the distributed variant.
   */
  void AddHistRows(int *starting_index, int *sync_count,
                   std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
                   std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
                   RegTree *p_tree) {
    if (is_distributed_) {
      this->AddHistRowsDistributed(starting_index, sync_count,
                                   nodes_for_explicit_hist_build,
                                   nodes_for_subtraction_trick, p_tree);
    } else {
      this->AddHistRowsLocal(starting_index, sync_count,
                             nodes_for_explicit_hist_build,
                             nodes_for_subtraction_trick);
    }
  }

  /* Main entry point of this class, build histogram for tree nodes. */
  void BuildHist(DMatrix *p_fmat, RegTree *p_tree,
                 common::RowSetCollection const &row_set_collection,
                 std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
                 std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
                 std::vector<GradientPair> const &gpair) {
    int starting_index = std::numeric_limits<int>::max();
    int sync_count = 0;
    this->AddHistRows(&starting_index, &sync_count,
                      nodes_for_explicit_hist_build,
                      nodes_for_subtraction_trick, p_tree);

    // Dense data is built with any_missing = false.
    if (p_fmat->IsDense()) {
      BuildLocalHistograms<false>(p_fmat, nodes_for_explicit_hist_build,
                                  row_set_collection, gpair);
    } else {
      BuildLocalHistograms<true>(p_fmat, nodes_for_explicit_hist_build,
                                 row_set_collection, gpair);
    }

    if (is_distributed_) {
      this->SyncHistogramDistributed(p_tree, nodes_for_explicit_hist_build,
                                     nodes_for_subtraction_trick,
                                     starting_index, sync_count);
    } else {
      this->SyncHistogramLocal(p_tree, nodes_for_explicit_hist_build,
                               nodes_for_subtraction_trick, starting_index,
                               sync_count);
    }
  }

  /**
   * \brief Merge per-thread histograms, all-reduce them across workers and fill
   *        in the remaining nodes by subtraction.
   *
   * \param starting_index Node id of the first histogram row passed to Allreduce.
   * \param sync_count     Number of consecutive histogram rows passed to Allreduce.
   */
  void SyncHistogramDistributed(
      RegTree *p_tree,
      std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
      std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
      int starting_index, int sync_count) {
    const size_t nbins = builder_.GetNumBins();
    common::BlockedSpace2d space(
        nodes_for_explicit_hist_build.size(), [&](size_t) { return nbins; },
        1024);
    common::ParallelFor2d(
        space, n_threads_, [&](size_t node, common::Range1d r) {
          const auto &entry = nodes_for_explicit_hist_build[node];
          auto this_hist = this->hist_[entry.nid];
          // Merging histograms from each thread into once
          buffer_.ReduceHist(node, r.begin(), r.end());
          // Store possible parent node
          auto this_local = hist_local_worker_[entry.nid];
          common::CopyHist(this_local, this_hist, r.begin(), r.end());

          if (!(*p_tree)[entry.nid].IsRoot()) {
            const size_t parent_id = (*p_tree)[entry.nid].Parent();
            const int subtraction_node_id =
                nodes_for_subtraction_trick[node].nid;
            auto parent_hist = this->hist_local_worker_[parent_id];
            auto sibling_hist = this->hist_[subtraction_node_id];
            // NOTE(review): presumably sibling = parent - this; confirm against
            // common::SubtractionHist.
            common::SubtractionHist(sibling_hist, parent_hist, this_hist,
                                    r.begin(), r.end());
            // Store possible parent node
            auto sibling_local = hist_local_worker_[subtraction_node_id];
            common::CopyHist(sibling_local, sibling_hist, r.begin(), r.end());
          }
        });

    // The element count is computed from the size_t `nbins` so the product is
    // not evaluated in a narrower integer type.
    // NOTE(review): this assumes the `sync_count` rows starting at
    // `starting_index` are contiguous in memory after AllocateAllData(); verify
    // against HistCollection.
    reducer_.Allreduce(this->hist_[starting_index].data(),
                       nbins * static_cast<size_t>(sync_count));

    ParallelSubtractionHist(space, nodes_for_explicit_hist_build,
                            nodes_for_subtraction_trick, p_tree);

    common::BlockedSpace2d space2(
        nodes_for_subtraction_trick.size(), [&](size_t) { return nbins; },
        1024);
    ParallelSubtractionHist(space2, nodes_for_subtraction_trick,
                            nodes_for_explicit_hist_build, p_tree);
  }

  /**
   * \brief Merge per-thread histograms and compute sibling histograms by
   *        subtraction, without any cross-worker communication.
   *
   * The last two parameters are unused; they are kept so that the signature
   * mirrors `SyncHistogramDistributed`.
   */
  void SyncHistogramLocal(
      RegTree *p_tree,
      std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
      std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
      int /*starting_index*/, int /*sync_count*/) {
    const size_t nbins = this->builder_.GetNumBins();
    common::BlockedSpace2d space(
        nodes_for_explicit_hist_build.size(), [&](size_t) { return nbins; },
        1024);

    common::ParallelFor2d(
        space, this->n_threads_, [&](size_t node, common::Range1d r) {
          const auto &entry = nodes_for_explicit_hist_build[node];
          auto this_hist = this->hist_[entry.nid];
          // Merging histograms from each thread into once
          this->buffer_.ReduceHist(node, r.begin(), r.end());

          if (!(*p_tree)[entry.nid].IsRoot()) {
            const size_t parent_id = (*p_tree)[entry.nid].Parent();
            const int subtraction_node_id =
                nodes_for_subtraction_trick[node].nid;
            auto parent_hist = this->hist_[parent_id];
            auto sibling_hist = this->hist_[subtraction_node_id];
            common::SubtractionHist(sibling_hist, parent_hist, this_hist,
                                    r.begin(), r.end());
          }
        });
  }

  /* Getters for tests. */
  common::HistCollection<GradientSumT> const& Histogram() const {
    return hist_;
  }
  auto& Buffer() { return buffer_; }

 private:
  /**
   * \brief For every node in `nodes` that is neither a left child nor the root,
   *        call `SubtractionHist(this, parent, sibling)` over the blocked range.
   *
   * \param space             Iteration space: one row per entry of `nodes`.
   * \param nodes             Nodes whose histograms are (re)computed.
   * \param subtraction_nodes Siblings, index-aligned with `nodes`.
   */
  void ParallelSubtractionHist(const common::BlockedSpace2d &space,
                               const std::vector<ExpandEntry> &nodes,
                               const std::vector<ExpandEntry> &subtraction_nodes,
                               const RegTree *p_tree) {
    common::ParallelFor2d(
        space, this->n_threads_, [&](size_t node, common::Range1d r) {
          const auto &entry = nodes[node];
          if (!((*p_tree)[entry.nid].IsLeftChild())) {
            auto this_hist = this->hist_[entry.nid];

            if (!(*p_tree)[entry.nid].IsRoot()) {
              const int subtraction_node_id = subtraction_nodes[node].nid;
              auto parent_hist = hist_[(*p_tree)[entry.nid].Parent()];
              auto sibling_hist = hist_[subtraction_node_id];
              common::SubtractionHist(this_hist, parent_hist, sibling_hist,
                                      r.begin(), r.end());
            }
          }
        });
  }

  // Add a tree node to histogram buffer in local training environment.
  void AddHistRowsLocal(
      int *starting_index, int *sync_count,
      std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
      std::vector<ExpandEntry> const &nodes_for_subtraction_trick) {
    for (auto const &entry : nodes_for_explicit_hist_build) {
      int nid = entry.nid;
      this->hist_.AddHistRow(nid);
      (*starting_index) = std::min(nid, (*starting_index));
    }
    // Explicit cast: the vector size is size_t while the out-parameter is int.
    (*sync_count) = static_cast<int>(nodes_for_explicit_hist_build.size());

    for (auto const &node : nodes_for_subtraction_trick) {
      this->hist_.AddHistRow(node.nid);
    }
    this->hist_.AllocateAllData();
  }

  // Add tree nodes to both histogram buffers in distributed training
  // environment.  Rows of left children are added first, in ascending node id
  // order, followed by the rows of all remaining nodes.
  void AddHistRowsDistributed(
      int *starting_index, int *sync_count,
      std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
      std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
      RegTree *p_tree) {
    const size_t explicit_size = nodes_for_explicit_hist_build.size();
    const size_t subtraction_size = nodes_for_subtraction_trick.size();
    std::vector<int> merged_node_ids(explicit_size + subtraction_size);
    for (size_t i = 0; i < explicit_size; ++i) {
      merged_node_ids[i] = nodes_for_explicit_hist_build[i].nid;
    }
    for (size_t i = 0; i < subtraction_size; ++i) {
      merged_node_ids[explicit_size + i] = nodes_for_subtraction_trick[i].nid;
    }
    std::sort(merged_node_ids.begin(), merged_node_ids.end());

    int n_left = 0;
    for (auto const &nid : merged_node_ids) {
      if ((*p_tree)[nid].IsLeftChild()) {
        this->hist_.AddHistRow(nid);
        (*starting_index) = std::min(nid, (*starting_index));
        n_left++;
        this->hist_local_worker_.AddHistRow(nid);
      }
    }
    for (auto const &nid : merged_node_ids) {
      if (!((*p_tree)[nid].IsLeftChild())) {
        this->hist_.AddHistRow(nid);
        this->hist_local_worker_.AddHistRow(nid);
      }
    }
    this->hist_.AllocateAllData();
    this->hist_local_worker_.AllocateAllData();
    // At least one row is always synchronised.
    (*sync_count) = std::max(1, n_left);
  }
};
} // namespace tree
} // namespace xgboost
#endif // XGBOOST_TREE_HIST_HISTOGRAM_H_

View File

@@ -53,13 +53,6 @@ void QuantileHistMaker::SetBuilder(const size_t n_trees,
DMatrix *dmat) {
builder->reset(
new Builder<GradientSumT>(n_trees, param_, std::move(pruner_), dmat));
if (rabit::IsDistributed()) {
(*builder)->SetHistSynchronizer(new DistributedHistSynchronizer<GradientSumT>());
(*builder)->SetHistRowsAdder(new DistributedHistRowsAdder<GradientSumT>());
} else {
(*builder)->SetHistSynchronizer(new BatchHistSynchronizer<GradientSumT>());
(*builder)->SetHistRowsAdder(new BatchHistRowsAdder<GradientSumT>());
}
}
template<typename GradientSumT>
@@ -96,7 +89,7 @@ void QuantileHistMaker::Update(HostDeviceVector<GradientPair> *gpair,
const size_t n_trees = trees.size();
if (hist_maker_param_.single_precision_histogram) {
if (!float_builder_) {
SetBuilder(n_trees, &float_builder_, dmat);
this->SetBuilder(n_trees, &float_builder_, dmat);
}
CallBuilderUpdate(float_builder_, gpair, dmat, gmat, trees);
} else {
@@ -123,199 +116,34 @@ bool QuantileHistMaker::UpdatePredictionCache(
}
template <typename GradientSumT>
void BatchHistSynchronizer<GradientSumT>::SyncHistograms(BuilderT *builder,
int,
int,
RegTree *p_tree) {
builder->builder_monitor_.Start("SyncHistograms");
const size_t nbins = builder->hist_builder_.GetNumBins();
common::BlockedSpace2d space(builder->nodes_for_explicit_hist_build_.size(), [&](size_t) {
return nbins;
}, 1024);
QuantileHistMaker::Builder<GradientSumT>::~Builder() = default;
common::ParallelFor2d(space, builder->nthread_, [&](size_t node, common::Range1d r) {
const auto& entry = builder->nodes_for_explicit_hist_build_[node];
auto this_hist = builder->hist_[entry.nid];
// Merging histograms from each thread into once
builder->hist_buffer_.ReduceHist(node, r.begin(), r.end());
if (!(*p_tree)[entry.nid].IsRoot()) {
const size_t parent_id = (*p_tree)[entry.nid].Parent();
const int subtraction_node_id = builder->nodes_for_subtraction_trick_[node].nid;
auto parent_hist = builder->hist_[parent_id];
auto sibling_hist = builder->hist_[subtraction_node_id];
SubtractionHist(sibling_hist, parent_hist, this_hist, r.begin(), r.end());
}
});
builder->builder_monitor_.Stop("SyncHistograms");
}
template <typename GradientSumT>
void DistributedHistSynchronizer<GradientSumT>::SyncHistograms(BuilderT* builder,
int starting_index,
int sync_count,
RegTree *p_tree) {
builder->builder_monitor_.Start("SyncHistograms");
const size_t nbins = builder->hist_builder_.GetNumBins();
common::BlockedSpace2d space(builder->nodes_for_explicit_hist_build_.size(), [&](size_t) {
return nbins;
}, 1024);
common::ParallelFor2d(space, builder->nthread_, [&](size_t node, common::Range1d r) {
const auto& entry = builder->nodes_for_explicit_hist_build_[node];
auto this_hist = builder->hist_[entry.nid];
// Merging histograms from each thread into once
builder->hist_buffer_.ReduceHist(node, r.begin(), r.end());
// Store posible parent node
auto this_local = builder->hist_local_worker_[entry.nid];
CopyHist(this_local, this_hist, r.begin(), r.end());
if (!(*p_tree)[entry.nid].IsRoot()) {
const size_t parent_id = (*p_tree)[entry.nid].Parent();
const int subtraction_node_id = builder->nodes_for_subtraction_trick_[node].nid;
auto parent_hist = builder->hist_local_worker_[parent_id];
auto sibling_hist = builder->hist_[subtraction_node_id];
SubtractionHist(sibling_hist, parent_hist, this_hist, r.begin(), r.end());
// Store posible parent node
auto sibling_local = builder->hist_local_worker_[subtraction_node_id];
CopyHist(sibling_local, sibling_hist, r.begin(), r.end());
}
});
builder->builder_monitor_.Start("SyncHistogramsAllreduce");
builder->histred_.Allreduce(builder->hist_[starting_index].data(),
builder->hist_builder_.GetNumBins() * sync_count);
builder->builder_monitor_.Stop("SyncHistogramsAllreduce");
ParallelSubtractionHist(builder, space, builder->nodes_for_explicit_hist_build_,
builder->nodes_for_subtraction_trick_, p_tree);
common::BlockedSpace2d space2(builder->nodes_for_subtraction_trick_.size(), [&](size_t) {
return nbins;
}, 1024);
ParallelSubtractionHist(builder, space2, builder->nodes_for_subtraction_trick_,
builder->nodes_for_explicit_hist_build_, p_tree);
builder->builder_monitor_.Stop("SyncHistograms");
}
template <typename GradientSumT>
void DistributedHistSynchronizer<GradientSumT>::ParallelSubtractionHist(
BuilderT* builder,
const common::BlockedSpace2d& space,
const std::vector<CPUExpandEntry>& nodes,
const std::vector<CPUExpandEntry>& subtraction_nodes,
const RegTree * p_tree) {
common::ParallelFor2d(space, builder->nthread_, [&](size_t node, common::Range1d r) {
const auto& entry = nodes[node];
if (!((*p_tree)[entry.nid].IsLeftChild())) {
auto this_hist = builder->hist_[entry.nid];
if (!(*p_tree)[entry.nid].IsRoot()) {
const int subtraction_node_id = subtraction_nodes[node].nid;
auto parent_hist = builder->hist_[(*p_tree)[entry.nid].Parent()];
auto sibling_hist = builder->hist_[subtraction_node_id];
SubtractionHist(this_hist, parent_hist, sibling_hist, r.begin(), r.end());
}
}
});
}
template <typename GradientSumT>
void BatchHistRowsAdder<GradientSumT>::AddHistRows(BuilderT *builder,
int *starting_index,
int *sync_count,
RegTree *) {
builder->builder_monitor_.Start("AddHistRows");
for (auto const& entry : builder->nodes_for_explicit_hist_build_) {
int nid = entry.nid;
builder->hist_.AddHistRow(nid);
(*starting_index) = std::min(nid, (*starting_index));
}
(*sync_count) = builder->nodes_for_explicit_hist_build_.size();
for (auto const& node : builder->nodes_for_subtraction_trick_) {
builder->hist_.AddHistRow(node.nid);
}
builder->hist_.AllocateAllData();
builder->builder_monitor_.Stop("AddHistRows");
}
template <typename GradientSumT>
void DistributedHistRowsAdder<GradientSumT>::AddHistRows(BuilderT *builder,
int *starting_index,
int *sync_count,
RegTree *p_tree) {
builder->builder_monitor_.Start("AddHistRows");
const size_t explicit_size = builder->nodes_for_explicit_hist_build_.size();
const size_t subtaction_size = builder->nodes_for_subtraction_trick_.size();
std::vector<int> merged_node_ids(explicit_size + subtaction_size);
for (size_t i = 0; i < explicit_size; ++i) {
merged_node_ids[i] = builder->nodes_for_explicit_hist_build_[i].nid;
}
for (size_t i = 0; i < subtaction_size; ++i) {
merged_node_ids[explicit_size + i] =
builder->nodes_for_subtraction_trick_[i].nid;
}
std::sort(merged_node_ids.begin(), merged_node_ids.end());
int n_left = 0;
for (auto const& nid : merged_node_ids) {
if ((*p_tree)[nid].IsLeftChild()) {
builder->hist_.AddHistRow(nid);
(*starting_index) = std::min(nid, (*starting_index));
n_left++;
builder->hist_local_worker_.AddHistRow(nid);
}
}
for (auto const& nid : merged_node_ids) {
if (!((*p_tree)[nid].IsLeftChild())) {
builder->hist_.AddHistRow(nid);
builder->hist_local_worker_.AddHistRow(nid);
}
}
builder->hist_.AllocateAllData();
builder->hist_local_worker_.AllocateAllData();
(*sync_count) = std::max(1, n_left);
builder->builder_monitor_.Stop("AddHistRows");
}
template <typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::SetHistSynchronizer(
HistSynchronizer<GradientSumT> *sync) {
hist_synchronizer_.reset(sync);
}
template <typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::SetHistRowsAdder(
HistRowsAdder<GradientSumT> *adder) {
hist_rows_adder_.reset(adder);
}
template <typename GradientSumT>
template <bool any_missing>
void QuantileHistMaker::Builder<GradientSumT>::InitRoot(
const GHistIndexMatrix &gmat, const DMatrix &fmat, RegTree *p_tree,
const std::vector<GradientPair> &gpair_h, int *num_leaves,
std::vector<CPUExpandEntry> *expand) {
DMatrix *p_fmat, RegTree *p_tree, const std::vector<GradientPair> &gpair_h,
int *num_leaves, std::vector<CPUExpandEntry> *expand) {
CPUExpandEntry node(CPUExpandEntry::kRootNid, p_tree->GetDepth(0), 0.0f);
nodes_for_explicit_hist_build_.clear();
nodes_for_subtraction_trick_.clear();
nodes_for_explicit_hist_build_.push_back(node);
int starting_index = std::numeric_limits<int>::max();
int sync_count = 0;
hist_rows_adder_->AddHistRows(this, &starting_index, &sync_count, p_tree);
BuildLocalHistograms<any_missing>(gmat, p_tree, gpair_h);
hist_synchronizer_->SyncHistograms(this, starting_index, sync_count, p_tree);
this->histogram_builder_->BuildHist(p_fmat, p_tree, row_set_collection_,
nodes_for_explicit_hist_build_,
nodes_for_subtraction_trick_, gpair_h);
{
auto nid = CPUExpandEntry::kRootNid;
GHistRowT hist = hist_[nid];
GHistRowT hist = this->histogram_builder_->Histogram()[nid];
GradientPairT grad_stat;
if (data_layout_ == DataLayout::kDenseDataZeroBased ||
data_layout_ == DataLayout::kDenseDataOneBased) {
auto const &gmat = *(p_fmat
->GetBatches<GHistIndexMatrix>(BatchParam{
GenericParameter::kCpuId, param_.max_bin})
.begin());
const std::vector<uint32_t> &row_ptr = gmat.cut.Ptrs();
const uint32_t ibegin = row_ptr[fid_least_bins_];
const uint32_t iend = row_ptr[fid_least_bins_ + 1];
@@ -329,7 +157,8 @@ void QuantileHistMaker::Builder<GradientSumT>::InitRoot(
for (const size_t *it = e.begin; it < e.end; ++it) {
grad_stat.Add(gpair_h[*it].GetGrad(), gpair_h[*it].GetHess());
}
histred_.Allreduce(&grad_stat, 1);
rabit::Allreduce<rabit::op::Sum, GradientSumT>(
reinterpret_cast<GradientSumT *>(&grad_stat), 2);
}
auto weight = evaluator_->InitRoot(GradStats{grad_stat});
@@ -339,7 +168,10 @@ void QuantileHistMaker::Builder<GradientSumT>::InitRoot(
std::vector<CPUExpandEntry> entries{node};
builder_monitor_.Start("EvaluateSplits");
evaluator_->EvaluateSplits(hist_, gmat, *p_tree, &entries);
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(
BatchParam{GenericParameter::kCpuId, param_.max_bin})) {
evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat, *p_tree, &entries);
}
builder_monitor_.Stop("EvaluateSplits");
node = entries.front();
}
@@ -348,46 +180,6 @@ void QuantileHistMaker::Builder<GradientSumT>::InitRoot(
++(*num_leaves);
}
template<typename GradientSumT>
template <bool any_missing>
void QuantileHistMaker::Builder<GradientSumT>::BuildLocalHistograms(
const GHistIndexMatrix &gmat,
RegTree *p_tree,
const std::vector<GradientPair> &gpair_h) {
builder_monitor_.Start("BuildLocalHistograms");
const size_t n_nodes = nodes_for_explicit_hist_build_.size();
// create space of size (# rows in each node)
common::BlockedSpace2d space(n_nodes, [&](size_t node) {
const int32_t nid = nodes_for_explicit_hist_build_[node].nid;
return row_set_collection_[nid].Size();
}, 256);
std::vector<GHistRowT> target_hists(n_nodes);
for (size_t i = 0; i < n_nodes; ++i) {
const int32_t nid = nodes_for_explicit_hist_build_[i].nid;
target_hists[i] = hist_[nid];
}
hist_buffer_.Reset(this->nthread_, n_nodes, space, target_hists);
// Parallel processing by nodes and data in each node
common::ParallelFor2d(space, this->nthread_, [&](size_t nid_in_set, common::Range1d r) {
const auto tid = static_cast<unsigned>(omp_get_thread_num());
const int32_t nid = nodes_for_explicit_hist_build_[nid_in_set].nid;
auto start_of_row_set = row_set_collection_[nid].begin;
auto rid_set = RowSetCollection::Elem(start_of_row_set + r.begin(),
start_of_row_set + r.end(),
nid);
hist_builder_.template BuildHist<any_missing>(gpair_h, rid_set, gmat,
hist_buffer_.GetInitializedHist(tid, nid_in_set));
});
builder_monitor_.Stop("BuildLocalHistograms");
}
template<typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::AddSplitsToTree(
const std::vector<CPUExpandEntry>& expand,
@@ -448,10 +240,10 @@ void QuantileHistMaker::Builder<GradientSumT>::ExpandTree(
Driver<CPUExpandEntry> driver(static_cast<TrainParam::TreeGrowPolicy>(param_.grow_policy));
std::vector<CPUExpandEntry> expand;
InitRoot<any_missing>(gmat, *p_fmat, p_tree, gpair_h, &num_leaves, &expand);
InitRoot<any_missing>(p_fmat, p_tree, gpair_h, &num_leaves, &expand);
driver.Push(expand[0]);
int depth = 0;
int32_t depth = 0;
while (!driver.IsEmpty()) {
expand = driver.Pop();
depth = expand[0].depth + 1;
@@ -463,19 +255,24 @@ void QuantileHistMaker::Builder<GradientSumT>::ExpandTree(
AddSplitsToTree(expand, p_tree, &num_leaves, &nodes_for_apply_split);
if (nodes_for_apply_split.size() != 0) {
ApplySplit<any_missing>(nodes_for_apply_split, gmat, column_matrix, hist_, p_tree);
ApplySplit<any_missing>(nodes_for_apply_split, gmat, column_matrix, p_tree);
SplitSiblings(nodes_for_apply_split, &nodes_to_evaluate, p_tree);
int starting_index = std::numeric_limits<int>::max();
int sync_count = 0;
hist_rows_adder_->AddHistRows(this, &starting_index, &sync_count, p_tree);
if (depth < param_.max_depth) {
BuildLocalHistograms<any_missing>(gmat, p_tree, gpair_h);
hist_synchronizer_->SyncHistograms(this, starting_index, sync_count, p_tree);
this->histogram_builder_->BuildHist(
p_fmat, p_tree, row_set_collection_, nodes_for_explicit_hist_build_,
nodes_for_subtraction_trick_, gpair_h);
} else {
int starting_index = std::numeric_limits<int>::max();
int sync_count = 0;
this->histogram_builder_->AddHistRows(
&starting_index, &sync_count, nodes_for_explicit_hist_build_,
nodes_for_subtraction_trick_, p_tree);
}
builder_monitor_.Start("EvaluateSplits");
evaluator_->EvaluateSplits(hist_, gmat, *p_tree, &nodes_to_evaluate);
evaluator_->EvaluateSplits(this->histogram_builder_->Histogram(), gmat,
*p_tree, &nodes_to_evaluate);
builder_monitor_.Stop("EvaluateSplits");
for (size_t i = 0; i < nodes_for_apply_split.size(); ++i) {
@@ -606,11 +403,10 @@ size_t QuantileHistMaker::Builder<GradientSumT>::GetNumberOfTrees() {
return n_trees_;
}
template<typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix& gmat,
const DMatrix& fmat,
const RegTree& tree,
std::vector<GradientPair>* gpair) {
template <typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::InitData(
const GHistIndexMatrix &gmat, const DMatrix &fmat, const RegTree &tree,
std::vector<GradientPair> *gpair) {
CHECK((param_.max_depth > 0 || param_.max_leaves > 0))
<< "max_depth or max_leaves cannot be both 0 (unlimited); "
<< "at least one should be a positive quantity.";
@@ -626,10 +422,6 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix&
row_set_collection_.Clear();
// initialize histogram collection
uint32_t nbins = gmat.cut.Ptrs().back();
hist_.Init(nbins);
hist_local_worker_.Init(nbins);
hist_buffer_.Init(nbins);
// initialize histogram builder
dmlc::OMPException exc;
#pragma omp parallel
@@ -639,7 +431,7 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix&
});
}
exc.Rethrow();
hist_builder_ = GHistBuilder<GradientSumT>(this->nthread_, nbins);
this->histogram_builder_->Reset(nbins, param_.max_bin, this->nthread_);
std::vector<size_t>& row_indices = *row_set_collection_.Data();
row_indices.resize(info.num_row_);
@@ -815,7 +607,6 @@ template <bool any_missing>
void QuantileHistMaker::Builder<GradientSumT>::ApplySplit(const std::vector<CPUExpandEntry> nodes,
const GHistIndexMatrix& gmat,
const ColumnMatrix& column_matrix,
const HistCollection<GradientSumT>& hist,
RegTree* p_tree) {
builder_monitor_.Start("ApplySplit");
// 1. Find split condition for each split

View File

@@ -22,6 +22,7 @@
#include "xgboost/json.h"
#include "hist/evaluate_splits.h"
#include "hist/histogram.h"
#include "constraints.h"
#include "./param.h"
#include "./driver.h"
@@ -88,24 +89,6 @@ using xgboost::common::GHistBuilder;
using xgboost::common::ColumnMatrix;
using xgboost::common::Column;
template <typename GradientSumT>
class HistSynchronizer;
template <typename GradientSumT>
class BatchHistSynchronizer;
template <typename GradientSumT>
class DistributedHistSynchronizer;
template <typename GradientSumT>
class HistRowsAdder;
template <typename GradientSumT>
class BatchHistRowsAdder;
template <typename GradientSumT>
class DistributedHistRowsAdder;
// training parameters specific to this algorithm
struct CPUHistMakerTrainParam
: public XGBoostParameter<CPUHistMakerTrainParam> {
@@ -198,20 +181,6 @@ class QuantileHistMaker: public TreeUpdater {
}
protected:
template <typename GradientSumT>
friend class HistSynchronizer;
template <typename GradientSumT>
friend class BatchHistSynchronizer;
template <typename GradientSumT>
friend class DistributedHistSynchronizer;
template <typename GradientSumT>
friend class HistRowsAdder;
template <typename GradientSumT>
friend class BatchHistRowsAdder;
template <typename GradientSumT>
friend class DistributedHistRowsAdder;
CPUHistMakerTrainParam hist_maker_param_;
// training parameter
TrainParam param_;
@@ -230,9 +199,12 @@ class QuantileHistMaker: public TreeUpdater {
explicit Builder(const size_t n_trees, const TrainParam &param,
std::unique_ptr<TreeUpdater> pruner, DMatrix const *fmat)
: n_trees_(n_trees), param_(param), pruner_(std::move(pruner)),
p_last_tree_(nullptr), p_last_fmat_(fmat) {
p_last_tree_(nullptr), p_last_fmat_(fmat),
histogram_builder_{
new HistogramBuilder<GradientSumT, CPUExpandEntry>} {
builder_monitor_.Init("Quantile::Builder");
}
~Builder();
// update one tree, growing
virtual void Update(const GHistIndexMatrix& gmat,
const ColumnMatrix& column_matrix,
@@ -240,28 +212,10 @@ class QuantileHistMaker: public TreeUpdater {
DMatrix* p_fmat,
RegTree* p_tree);
inline void SubtractionTrick(GHistRowT self,
GHistRowT sibling,
GHistRowT parent) {
builder_monitor_.Start("SubtractionTrick");
hist_builder_.SubtractionTrick(self, sibling, parent);
builder_monitor_.Stop("SubtractionTrick");
}
bool UpdatePredictionCache(const DMatrix* data,
VectorView<float> out_preds);
void SetHistSynchronizer(HistSynchronizer<GradientSumT>* sync);
void SetHistRowsAdder(HistRowsAdder<GradientSumT>* adder);
protected:
friend class HistSynchronizer<GradientSumT>;
friend class BatchHistSynchronizer<GradientSumT>;
friend class DistributedHistSynchronizer<GradientSumT>;
friend class HistRowsAdder<GradientSumT>;
friend class BatchHistRowsAdder<GradientSumT>;
friend class DistributedHistRowsAdder<GradientSumT>;
// initialize temp data structure
void InitData(const GHistIndexMatrix& gmat,
const DMatrix& fmat,
@@ -278,7 +232,6 @@ class QuantileHistMaker: public TreeUpdater {
void ApplySplit(std::vector<CPUExpandEntry> nodes,
const GHistIndexMatrix& gmat,
const ColumnMatrix& column_matrix,
const HistCollection<GradientSumT>& hist,
RegTree* p_tree);
void AddSplitsToRowSet(const std::vector<CPUExpandEntry>& nodes, RegTree* p_tree);
@@ -287,14 +240,8 @@ class QuantileHistMaker: public TreeUpdater {
void FindSplitConditions(const std::vector<CPUExpandEntry>& nodes, const RegTree& tree,
const GHistIndexMatrix& gmat, std::vector<int32_t>* split_conditions);
template <bool any_missing>
void BuildLocalHistograms(const GHistIndexMatrix &gmat,
RegTree *p_tree,
const std::vector<GradientPair> &gpair_h);
template <bool any_missing>
void InitRoot(const GHistIndexMatrix &gmat,
const DMatrix& fmat,
void InitRoot(DMatrix* p_fmat,
RegTree *p_tree,
const std::vector<GradientPair> &gpair_h,
int *num_leaves, std::vector<CPUExpandEntry> *expand);
@@ -330,15 +277,11 @@ class QuantileHistMaker: public TreeUpdater {
// the internal row sets
RowSetCollection row_set_collection_;
std::vector<GradientPair> gpair_local_;
/*! \brief culmulative histogram of gradients. */
HistCollection<GradientSumT> hist_;
/*! \brief culmulative local parent histogram of gradients. */
HistCollection<GradientSumT> hist_local_worker_;
/*! \brief feature with least # of bins. to be used for dense specialization
of InitNewNode() */
uint32_t fid_least_bins_;
GHistBuilder<GradientSumT> hist_builder_;
std::unique_ptr<TreeUpdater> pruner_;
std::unique_ptr<HistEvaluator<GradientSumT, CPUExpandEntry>> evaluator_;
@@ -358,12 +301,10 @@ class QuantileHistMaker: public TreeUpdater {
enum class DataLayout { kDenseDataZeroBased, kDenseDataOneBased, kSparseData };
DataLayout data_layout_;
std::unique_ptr<HistogramBuilder<GradientSumT, CPUExpandEntry>>
histogram_builder_;
common::Monitor builder_monitor_;
common::ParallelGHistBuilder<GradientSumT> hist_buffer_;
rabit::Reducer<GradientPairT, GradientPairT::Reduce> histred_;
std::unique_ptr<HistSynchronizer<GradientSumT>> hist_synchronizer_;
std::unique_ptr<HistRowsAdder<GradientSumT>> hist_rows_adder_;
};
common::Monitor updater_monitor_;
@@ -383,71 +324,6 @@ class QuantileHistMaker: public TreeUpdater {
std::unique_ptr<TreeUpdater> pruner_;
};
template <typename GradientSumT>
class HistSynchronizer {
public:
using BuilderT = QuantileHistMaker::Builder<GradientSumT>;
virtual void SyncHistograms(BuilderT* builder,
int starting_index,
int sync_count,
RegTree *p_tree) = 0;
virtual ~HistSynchronizer() = default;
};
template <typename GradientSumT>
class BatchHistSynchronizer: public HistSynchronizer<GradientSumT> {
public:
using BuilderT = QuantileHistMaker::Builder<GradientSumT>;
void SyncHistograms(BuilderT* builder,
int starting_index,
int sync_count,
RegTree *p_tree) override;
};
template <typename GradientSumT>
class DistributedHistSynchronizer: public HistSynchronizer<GradientSumT> {
public:
using BuilderT = QuantileHistMaker::Builder<GradientSumT>;
void SyncHistograms(BuilderT* builder, int starting_index,
int sync_count, RegTree *p_tree) override;
void ParallelSubtractionHist(BuilderT* builder,
const common::BlockedSpace2d& space,
const std::vector<CPUExpandEntry>& nodes,
const std::vector<CPUExpandEntry>& subtraction_nodes,
const RegTree * p_tree);
};
template <typename GradientSumT>
class HistRowsAdder {
public:
using BuilderT = QuantileHistMaker::Builder<GradientSumT>;
virtual void AddHistRows(BuilderT* builder, int *starting_index,
int *sync_count, RegTree *p_tree) = 0;
virtual ~HistRowsAdder() = default;
};
template <typename GradientSumT>
class BatchHistRowsAdder: public HistRowsAdder<GradientSumT> {
public:
using BuilderT = QuantileHistMaker::Builder<GradientSumT>;
void AddHistRows(BuilderT*, int *starting_index,
int *sync_count, RegTree *p_tree) override;
};
template <typename GradientSumT>
class DistributedHistRowsAdder: public HistRowsAdder<GradientSumT> {
public:
using BuilderT = QuantileHistMaker::Builder<GradientSumT>;
void AddHistRows(BuilderT*, int *starting_index,
int *sync_count, RegTree *p_tree) override;
};
} // namespace tree
} // namespace xgboost