skip missing lookup if nothing is missing in CPU hist partition kernel. (#5644)

* [xgboost] skip missing lookup if nothing is missing
This commit is contained in:
Oleksandr Kuvshynov 2020-05-11 19:50:08 -07:00 committed by GitHub
parent 9ad40901a8
commit 4e64e2ef8e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 123 additions and 9 deletions

View File

@ -154,6 +154,7 @@ class ColumnMatrix {
index_base_ = const_cast<uint32_t*>(gmat.cut.Ptrs().data());
const bool noMissingValues = NoMissingValues(gmat.row_ptr[nrow], nrow, nfeature);
any_missing_ = !noMissingValues;
if (noMissingValues) {
missing_flags_.resize(feature_offsets_[nfeature], false);
@ -311,11 +312,18 @@ class ColumnMatrix {
// Returns a copy of the stored bins_type_size_ member; the matrix itself is not modified.
const BinTypeSize GetTypeSize() const {
return bins_type_size_;
}
// Utility: tells whether a matrix with n_row rows and n_features columns is fully
// populated, i.e. whether the number of stored entries equals n_features * n_row.
// It reads no member state, hence it is static; the return type is a plain bool
// because a top-level const on a by-value scalar return is ignored by the language.
//   n_elements  number of entries actually stored
//   n_row       number of rows
//   n_features  number of columns
// Returns true when no entry is missing.
static bool NoMissingValues(const size_t n_elements,
                            const size_t n_row, const size_t n_features) {
  return n_elements == n_features * n_row;
}
// And this returns part of state
const bool AnyMissing() const {
return any_missing_;
}
private:
std::vector<uint8_t> index_;
@ -329,6 +337,7 @@ class ColumnMatrix {
uint32_t* index_base_;
std::vector<bool> missing_flags_;
BinTypeSize bins_type_size_;
bool any_missing_;
};
} // namespace common

View File

@ -826,7 +826,7 @@ void QuantileHistMaker::Builder::EvaluateSplits(const std::vector<ExpandEntry>&
// on comparison of indexes values (idx_span) and split point (split_cond)
// Handle dense columns
// Analog of std::stable_partition, but in no-inplace manner
template <bool default_left, typename BinIdxType>
template <bool default_left, bool any_missing, typename BinIdxType>
inline std::pair<size_t, size_t> PartitionDenseKernel(const common::DenseColumn<BinIdxType>& column,
common::Span<const size_t> rid_span, const int32_t split_cond,
common::Span<size_t> left_part, common::Span<size_t> right_part) {
@ -837,14 +837,24 @@ inline std::pair<size_t, size_t> PartitionDenseKernel(const common::DenseColumn<
size_t nleft_elems = 0;
size_t nright_elems = 0;
for (auto rid : rid_span) {
if (column.IsMissing(rid)) {
if (default_left) {
p_left_part[nleft_elems++] = rid;
if (any_missing) {
for (auto rid : rid_span) {
if (column.IsMissing(rid)) {
if (default_left) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
} else {
p_right_part[nright_elems++] = rid;
if ((static_cast<int32_t>(idx[rid]) + offset) <= split_cond) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
}
} else {
}
} else {
for (auto rid : rid_span) {
if ((static_cast<int32_t>(idx[rid]) + offset) <= split_cond) {
p_left_part[nleft_elems++] = rid;
} else {
@ -919,6 +929,7 @@ void QuantileHistMaker::Builder::PartitionKernel(
const size_t node_in_set, const size_t nid, common::Range1d range,
const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree) {
const size_t* rid = row_set_collection_[nid].begin;
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
common::Span<size_t> left = partition_builder_.GetLeftBuffer(node_in_set,
range.begin(), range.end());
@ -934,9 +945,21 @@ void QuantileHistMaker::Builder::PartitionKernel(
const common::DenseColumn<BinIdxType>& column =
static_cast<const common::DenseColumn<BinIdxType>& >(*(column_ptr.get()));
if (default_left) {
child_nodes_sizes = PartitionDenseKernel<true>(column, rid_span, split_cond, left, right);
if (column_matrix.AnyMissing()) {
child_nodes_sizes = PartitionDenseKernel<true, true>(column, rid_span, split_cond,
left, right);
} else {
child_nodes_sizes = PartitionDenseKernel<true, false>(column, rid_span, split_cond,
left, right);
}
} else {
child_nodes_sizes = PartitionDenseKernel<false>(column, rid_span, split_cond, left, right);
if (column_matrix.AnyMissing()) {
child_nodes_sizes = PartitionDenseKernel<false, true>(column, rid_span, split_cond,
left, right);
} else {
child_nodes_sizes = PartitionDenseKernel<false, false>(column, rid_span, split_cond,
left, right);
}
}
} else {
const common::SparseColumn<BinIdxType>& column

View File

@ -250,6 +250,71 @@ class QuantileHistMock : public QuantileHistMaker {
omp_set_num_threads(1);
}
// Verifies the row counts produced by PartitionKernel for the root node.
// For each generated sparsity level a DMatrix is quantised, the builder is
// initialised, and for every candidate split on feature 0 the number of rows going
// left and right is recomputed by hand and compared with what the partition
// builder reports.
//   quantile_index_block  not used by this check
//   tree                  tree whose node 0 supplies the default direction
void TestApplySplit(const GHistIndexBlockMatrix& quantile_index_block,
                    const RegTree& tree) {
  std::vector<GradientPair> row_gpairs = {
      {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f},
      {0.27f, 0.29f}, {0.37f, 0.39f}, {-0.47f, 0.49f}, {0.57f, 0.59f} };
  size_t constexpr kMaxBins = 4;

  // Different sparsity levels yield a different number of missing values.
  for (double sparsity_level : {0.0, 0.1, 0.2}) {
    // kNRows samples, each with kNCols features.
    auto dmat = RandomDataGenerator(kNRows, kNCols, sparsity_level).Seed(3).GenerateDMatrix();

    common::GHistIndexMatrix gmat;
    gmat.Init(dmat.get(), kMaxBins);

    // A threshold of 0.0 makes every column be treated as dense, which is the
    // code path under test.
    ColumnMatrix cm;
    cm.Init(gmat, 0.0);

    RealImpl::InitData(gmat, row_gpairs, *dmat, tree);
    hist_.AddHistRow(0);
    RealImpl::InitNewNode(0, gmat, row_gpairs, *dmat, tree);

    const size_t num_row = dmat->Info().num_row_;

    // Bin id range [feature_bin_begin, feature_bin_end) belonging to feature 0.
    const size_t feature_bin_begin = gmat.cut.Ptrs()[0];
    const size_t feature_bin_end = gmat.cut.Ptrs()[1];

    // Try a split at each of several bins.
    for (size_t split = 0; split < 4; split++) {
      size_t n_left = 0;
      size_t n_right = 0;

      // Reference count: walk every stored entry and classify those of feature 0.
      for (size_t row = 0; row < num_row; ++row) {
        const size_t row_begin = gmat.row_ptr[row];
        const size_t row_end = gmat.row_ptr[row + 1];
        for (size_t pos = row_begin; pos < row_end; ++pos) {
          const size_t bin = gmat.index[pos];
          if (bin < feature_bin_begin || bin >= feature_bin_end) {
            continue;  // entry belongs to another feature
          }
          if (bin <= split) {
            ++n_left;
          } else {
            ++n_right;
          }
        }
      }

      // Rows not counted above are missing for feature 0; they follow the
      // default direction of the root node.
      const size_t n_missing = kNRows - n_left - n_right;
      (tree[0].DefaultLeft() ? n_left : n_right) += n_missing;

      // A single node holding kNRows (=8 at the moment) rows, handled as one task.
      RealImpl::partition_builder_.Init(1, 1, [&](size_t node_in_set) {
        return 1;
      });
      RealImpl::PartitionKernel<uint8_t>(0, 0, common::Range1d(0, kNRows), split, cm, tree);
      RealImpl::partition_builder_.CalculateRowOffsets();

      ASSERT_EQ(RealImpl::partition_builder_.GetNLeftElems(0), n_left);
      ASSERT_EQ(RealImpl::partition_builder_.GetNRightElems(0), n_right);
    }
  }
}
};
int static constexpr kNRows = 8, kNCols = 16;
@ -322,6 +387,13 @@ class QuantileHistMock : public QuantileHistMaker {
builder_->TestEvaluateSplit(gmatb_, tree);
}
void TestApplySplit() {
RegTree tree = RegTree();
tree.param.UpdateAllowUnknown(cfg_);
builder_->TestApplySplit(gmatb_, tree);
}
};
TEST(QuantileHist, InitData) {
@ -359,5 +431,15 @@ TEST(QuantileHist, EvalSplits) {
maker.TestEvaluateSplit();
}
// Runs the split-application check on a QuantileHistMock configured with the
// elastic_net split evaluator and with reg_lambda, reg_alpha, max_delta_step and
// min_child_weight all set to "0".
TEST(QuantileHist, ApplySplit) {
  using Args = std::vector<std::pair<std::string, std::string>>;

  Args cfg;
  cfg.emplace_back("num_feature", std::to_string(QuantileHistMock::GetNumColumns()));
  cfg.emplace_back("split_evaluator", "elastic_net");
  cfg.emplace_back("reg_lambda", "0");
  cfg.emplace_back("reg_alpha", "0");
  cfg.emplace_back("max_delta_step", "0");
  cfg.emplace_back("min_child_weight", "0");

  QuantileHistMock maker(cfg);
  maker.TestApplySplit();
}
} // namespace tree
} // namespace xgboost