Simplify sparse and dense CPU hist kernels (#7029)

* Simplify sparse and dense kernels
* Extract row partitioner.

Co-authored-by: Kirill Shvets <kirill.shvets@intel.com>
This commit is contained in:
ShvetsKS
2021-06-11 13:26:30 +03:00
committed by GitHub
parent 1faad825f4
commit 2567404ab6
10 changed files with 369 additions and 434 deletions

View File

@@ -290,6 +290,7 @@ void QuantileHistMaker::Builder<GradientSumT>::SetHistRowsAdder(
}
template <typename GradientSumT>
template <bool any_missing>
void QuantileHistMaker::Builder<GradientSumT>::InitRoot(
const GHistIndexMatrix &gmat,
const DMatrix& fmat,
@@ -307,7 +308,7 @@ void QuantileHistMaker::Builder<GradientSumT>::InitRoot(
int sync_count = 0;
hist_rows_adder_->AddHistRows(this, &starting_index, &sync_count, p_tree);
BuildLocalHistograms(gmat, p_tree, gpair_h);
BuildLocalHistograms<any_missing>(gmat, p_tree, gpair_h);
hist_synchronizer_->SyncHistograms(this, starting_index, sync_count, p_tree);
this->InitNewNode(CPUExpandEntry::kRootNid, gmat, gpair_h, fmat, *p_tree);
@@ -319,6 +320,7 @@ void QuantileHistMaker::Builder<GradientSumT>::InitRoot(
}
template<typename GradientSumT>
template <bool any_missing>
void QuantileHistMaker::Builder<GradientSumT>::BuildLocalHistograms(
const GHistIndexMatrix &gmat,
RegTree *p_tree,
@@ -350,7 +352,8 @@ void QuantileHistMaker::Builder<GradientSumT>::BuildLocalHistograms(
auto rid_set = RowSetCollection::Elem(start_of_row_set + r.begin(),
start_of_row_set + r.end(),
nid);
BuildHist(gpair_h, rid_set, gmat, hist_buffer_.GetInitializedHist(tid, nid_in_set));
hist_builder_.template BuildHist<any_missing>(gpair_h, rid_set, gmat,
hist_buffer_.GetInitializedHist(tid, nid_in_set));
});
builder_monitor_.Stop("BuildLocalHistograms");
@@ -439,6 +442,7 @@ void QuantileHistMaker::Builder<GradientSumT>::BuildNodeStats(
}
template<typename GradientSumT>
template <bool any_missing>
void QuantileHistMaker::Builder<GradientSumT>::ExpandTree(
const GHistIndexMatrix& gmat,
const ColumnMatrix& column_matrix,
@@ -450,7 +454,7 @@ void QuantileHistMaker::Builder<GradientSumT>::ExpandTree(
Driver<CPUExpandEntry> driver(static_cast<TrainParam::TreeGrowPolicy>(param_.grow_policy));
std::vector<CPUExpandEntry> expand;
InitRoot(gmat, *p_fmat, p_tree, gpair_h, &num_leaves, &expand);
InitRoot<any_missing>(gmat, *p_fmat, p_tree, gpair_h, &num_leaves, &expand);
driver.Push(expand[0]);
int depth = 0;
@@ -465,14 +469,14 @@ void QuantileHistMaker::Builder<GradientSumT>::ExpandTree(
AddSplitsToTree(expand, p_tree, &num_leaves, &nodes_for_apply_split);
if (nodes_for_apply_split.size() != 0) {
ApplySplit(nodes_for_apply_split, gmat, column_matrix, hist_, p_tree);
ApplySplit<any_missing>(nodes_for_apply_split, gmat, column_matrix, hist_, p_tree);
SplitSiblings(nodes_for_apply_split, &nodes_to_evaluate, p_tree);
int starting_index = std::numeric_limits<int>::max();
int sync_count = 0;
hist_rows_adder_->AddHistRows(this, &starting_index, &sync_count, p_tree);
if (depth < param_.max_depth) {
BuildLocalHistograms(gmat, p_tree, gpair_h);
BuildLocalHistograms<any_missing>(gmat, p_tree, gpair_h);
hist_synchronizer_->SyncHistograms(this, starting_index, sync_count, p_tree);
}
@@ -520,8 +524,11 @@ void QuantileHistMaker::Builder<GradientSumT>::Update(
this->InitData(gmat, *p_fmat, *p_tree, gpair_ptr);
ExpandTree(gmat, column_matrix, p_fmat, p_tree, *gpair_ptr);
if (column_matrix.AnyMissing()) {
ExpandTree<true>(gmat, column_matrix, p_fmat, p_tree, *gpair_ptr);
} else {
ExpandTree<false>(gmat, column_matrix, p_fmat, p_tree, *gpair_ptr);
}
for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg;
p_tree->Stat(nid).base_weight = snode_[nid].weight;
@@ -867,165 +874,6 @@ void QuantileHistMaker::Builder<GradientSumT>::EvaluateSplits(
builder_monitor_.Stop("EvaluateSplits");
}
// split row indexes (rid_span) to 2 parts (left_part, right_part) depending
// on comparison of indexes values (idx_span) and split point (split_cond)
// Handle dense columns
// Analog of std::stable_partition, but in no-inplace manner
template <bool default_left, bool any_missing, typename BinIdxType>
inline std::pair<size_t, size_t> PartitionDenseKernel(const common::DenseColumn<BinIdxType>& column,
common::Span<const size_t> rid_span, const int32_t split_cond,
common::Span<size_t> left_part, common::Span<size_t> right_part) {
const int32_t offset = column.GetBaseIdx();
const BinIdxType* idx = column.GetFeatureBinIdxPtr().data();
size_t* p_left_part = left_part.data();
size_t* p_right_part = right_part.data();
size_t nleft_elems = 0;
size_t nright_elems = 0;
if (any_missing) {
for (auto rid : rid_span) {
if (column.IsMissing(rid)) {
if (default_left) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
} else {
if ((static_cast<int32_t>(idx[rid]) + offset) <= split_cond) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
}
}
} else {
for (auto rid : rid_span) {
if ((static_cast<int32_t>(idx[rid]) + offset) <= split_cond) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
}
}
return {nleft_elems, nright_elems};
}
// Split row indexes (rid_span) to 2 parts (left_part, right_part) depending
// on comparison of indexes values (idx_span) and split point (split_cond).
// Handle sparse columns
template<bool default_left, typename BinIdxType>
inline std::pair<size_t, size_t> PartitionSparseKernel(
const common::SparseColumn<BinIdxType>& column,
common::Span<const size_t> rid_span, const int32_t split_cond,
common::Span<size_t> left_part, common::Span<size_t> right_part) {
size_t* p_left_part = left_part.data();
size_t* p_right_part = right_part.data();
size_t nleft_elems = 0;
size_t nright_elems = 0;
const size_t* row_data = column.GetRowData();
const size_t column_size = column.Size();
if (rid_span.size()) { // ensure that rid_span is nonempty range
// search first nonzero row with index >= rid_span.front()
const size_t* p = std::lower_bound(row_data, row_data + column_size,
rid_span.front());
if (p != row_data + column_size && *p <= rid_span.back()) {
size_t cursor = p - row_data;
for (auto rid : rid_span) {
while (cursor < column_size
&& column.GetRowIdx(cursor) < rid
&& column.GetRowIdx(cursor) <= rid_span.back()) {
++cursor;
}
if (cursor < column_size && column.GetRowIdx(cursor) == rid) {
if (static_cast<int32_t>(column.GetGlobalBinIdx(cursor)) <= split_cond) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
++cursor;
} else {
// missing value
if (default_left) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
}
}
} else { // all rows in rid_span have missing values
if (default_left) {
std::copy(rid_span.begin(), rid_span.end(), p_left_part);
nleft_elems = rid_span.size();
} else {
std::copy(rid_span.begin(), rid_span.end(), p_right_part);
nright_elems = rid_span.size();
}
}
}
return {nleft_elems, nright_elems};
}
template <typename GradientSumT>
template <typename BinIdxType>
void QuantileHistMaker::Builder<GradientSumT>::PartitionKernel(
const size_t node_in_set, const size_t nid, const common::Range1d range,
const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree) {
const size_t* rid = row_set_collection_[nid].begin;
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
common::Span<size_t> left = partition_builder_.GetLeftBuffer(node_in_set,
range.begin(), range.end());
common::Span<size_t> right = partition_builder_.GetRightBuffer(node_in_set,
range.begin(), range.end());
const bst_uint fid = tree[nid].SplitIndex();
const bool default_left = tree[nid].DefaultLeft();
const auto column_ptr = column_matrix.GetColumn<BinIdxType>(fid);
std::pair<size_t, size_t> child_nodes_sizes;
if (column_ptr->GetType() == xgboost::common::kDenseColumn) {
const common::DenseColumn<BinIdxType>& column =
static_cast<const common::DenseColumn<BinIdxType>& >(*(column_ptr.get()));
if (default_left) {
if (column_matrix.AnyMissing()) {
child_nodes_sizes = PartitionDenseKernel<true, true>(column, rid_span,
split_cond, left, right);
} else {
child_nodes_sizes = PartitionDenseKernel<true, false>(column, rid_span,
split_cond, left, right);
}
} else {
if (column_matrix.AnyMissing()) {
child_nodes_sizes = PartitionDenseKernel<false, true>(column, rid_span,
split_cond, left, right);
} else {
child_nodes_sizes = PartitionDenseKernel<false, false>(column, rid_span,
split_cond, left, right);
}
}
} else {
const common::SparseColumn<BinIdxType>& column
= static_cast<const common::SparseColumn<BinIdxType>& >(*(column_ptr.get()));
if (default_left) {
child_nodes_sizes = PartitionSparseKernel<true>(column, rid_span,
split_cond, left, right);
} else {
child_nodes_sizes = PartitionSparseKernel<false>(column, rid_span,
split_cond, left, right);
}
}
const size_t n_left = child_nodes_sizes.first;
const size_t n_right = child_nodes_sizes.second;
partition_builder_.SetNLeftElems(node_in_set, range.begin(), range.end(), n_left);
partition_builder_.SetNRightElems(node_in_set, range.begin(), range.end(), n_right);
}
template <typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::FindSplitConditions(
const std::vector<CPUExpandEntry>& nodes,
@@ -1070,6 +918,7 @@ void QuantileHistMaker::Builder<GradientSumT>::AddSplitsToRowSet(
}
template <typename GradientSumT>
template <bool any_missing>
void QuantileHistMaker::Builder<GradientSumT>::ApplySplit(const std::vector<CPUExpandEntry> nodes,
const GHistIndexMatrix& gmat,
const ColumnMatrix& column_matrix,
@@ -1102,16 +951,19 @@ void QuantileHistMaker::Builder<GradientSumT>::ApplySplit(const std::vector<CPUE
partition_builder_.AllocateForTask(task_id);
switch (column_matrix.GetTypeSize()) {
case common::kUint8BinsTypeSize:
PartitionKernel<uint8_t>(node_in_set, nid, r,
split_conditions[node_in_set], column_matrix, *p_tree);
partition_builder_.template Partition<uint8_t, any_missing>(node_in_set, nid, r,
split_conditions[node_in_set], column_matrix,
*p_tree, row_set_collection_[nid].begin);
break;
case common::kUint16BinsTypeSize:
PartitionKernel<uint16_t>(node_in_set, nid, r,
split_conditions[node_in_set], column_matrix, *p_tree);
partition_builder_.template Partition<uint16_t, any_missing>(node_in_set, nid, r,
split_conditions[node_in_set], column_matrix,
*p_tree, row_set_collection_[nid].begin);
break;
case common::kUint32BinsTypeSize:
PartitionKernel<uint32_t>(node_in_set, nid, r,
split_conditions[node_in_set], column_matrix, *p_tree);
partition_builder_.template Partition<uint32_t, any_missing>(node_in_set, nid, r,
split_conditions[node_in_set], column_matrix,
*p_tree, row_set_collection_[nid].begin);
break;
default:
CHECK(false); // no default behavior
@@ -1268,24 +1120,6 @@ GradStats QuantileHistMaker::Builder<GradientSumT>::EnumerateSplit(
template struct QuantileHistMaker::Builder<float>;
template struct QuantileHistMaker::Builder<double>;
template void QuantileHistMaker::Builder<float>::PartitionKernel<uint8_t>(
const size_t node_in_set, const size_t nid, common::Range1d range,
const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree);
template void QuantileHistMaker::Builder<float>::PartitionKernel<uint16_t>(
const size_t node_in_set, const size_t nid, common::Range1d range,
const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree);
template void QuantileHistMaker::Builder<float>::PartitionKernel<uint32_t>(
const size_t node_in_set, const size_t nid, common::Range1d range,
const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree);
template void QuantileHistMaker::Builder<double>::PartitionKernel<uint8_t>(
const size_t node_in_set, const size_t nid, common::Range1d range,
const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree);
template void QuantileHistMaker::Builder<double>::PartitionKernel<uint16_t>(
const size_t node_in_set, const size_t nid, common::Range1d range,
const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree);
template void QuantileHistMaker::Builder<double>::PartitionKernel<uint32_t>(
const size_t node_in_set, const size_t nid, common::Range1d range,
const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree);
XGBOOST_REGISTER_TREE_UPDATER(FastHistMaker, "grow_fast_histmaker")
.describe("(Deprecated, use grow_quantile_histmaker instead.)"

View File

@@ -28,6 +28,7 @@
#include "../common/timer.h"
#include "../common/hist_util.h"
#include "../common/row_set.h"
#include "../common/partition_builder.h"
#include "../common/column_matrix.h"
namespace xgboost {
@@ -291,14 +292,6 @@ class QuantileHistMaker: public TreeUpdater {
DMatrix* p_fmat,
RegTree* p_tree);
inline void BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRowT hist) {
hist_builder_.BuildHist(gpair, row_indices, gmat, hist,
data_layout_ != DataLayout::kSparseData);
}
inline void SubtractionTrick(GHistRowT self,
GHistRowT sibling,
GHistRowT parent) {
@@ -338,17 +331,13 @@ class QuantileHistMaker: public TreeUpdater {
const HistCollection<GradientSumT>& hist,
const RegTree& tree);
template <bool any_missing>
void ApplySplit(std::vector<CPUExpandEntry> nodes,
const GHistIndexMatrix& gmat,
const ColumnMatrix& column_matrix,
const HistCollection<GradientSumT>& hist,
RegTree* p_tree);
template <typename BinIdxType>
void PartitionKernel(const size_t node_in_set, const size_t nid, const common::Range1d range,
const int32_t split_cond,
const ColumnMatrix& column_matrix, const RegTree& tree);
void AddSplitsToRowSet(const std::vector<CPUExpandEntry>& nodes, RegTree* p_tree);
@@ -376,10 +365,11 @@ class QuantileHistMaker: public TreeUpdater {
// else - there are missing values
bool SplitContainsMissingValues(const GradStats e, const NodeEntry& snode);
template <bool any_missing>
void BuildLocalHistograms(const GHistIndexMatrix &gmat,
RegTree *p_tree,
const std::vector<GradientPair> &gpair_h);
template <bool any_missing>
void InitRoot(const GHistIndexMatrix &gmat,
const DMatrix& fmat,
RegTree *p_tree,
@@ -402,7 +392,7 @@ class QuantileHistMaker: public TreeUpdater {
const DMatrix& fmat,
const std::vector<GradientPair> &gpair_h,
const std::vector<CPUExpandEntry>& nodes_for_apply_split, RegTree *p_tree);
template <bool any_missing>
void ExpandTree(const GHistIndexMatrix& gmat,
const ColumnMatrix& column_matrix,
DMatrix* p_fmat,