Support hist in the partition builder under column split (#9120)
This commit is contained in:
parent
52311dcec9
commit
603f8ce2fa
@ -183,14 +183,28 @@ class PartitionBuilder {
|
|||||||
SetNRightElems(node_in_set, range.begin(), n_right);
|
SetNRightElems(node_in_set, range.begin(), n_right);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <bool any_missing, typename ColumnType, typename Predicate>
|
||||||
|
void MaskKernel(ColumnType* p_column, common::Span<const size_t> row_indices, size_t base_rowid,
|
||||||
|
BitVector* decision_bits, BitVector* missing_bits, Predicate&& pred) {
|
||||||
|
auto& column = *p_column;
|
||||||
|
for (auto const row_id : row_indices) {
|
||||||
|
auto const bin_id = column[row_id - base_rowid];
|
||||||
|
if (any_missing && bin_id == ColumnType::kMissingId) {
|
||||||
|
missing_bits->Set(row_id - base_rowid);
|
||||||
|
} else if (pred(row_id, bin_id)) {
|
||||||
|
decision_bits->Set(row_id - base_rowid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief When data is split by column, we don't have all the features locally on the current
|
* @brief When data is split by column, we don't have all the features locally on the current
|
||||||
* worker, so we go through all the rows and mark the bit vectors on whether the decision is made
|
* worker, so we go through all the rows and mark the bit vectors on whether the decision is made
|
||||||
* to go left, or if the feature value used for the split is missing.
|
* to go left, or if the feature value used for the split is missing.
|
||||||
*/
|
*/
|
||||||
template <typename ExpandEntry>
|
template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
|
||||||
void MaskRows(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
|
void MaskRows(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
|
||||||
const common::Range1d range, GHistIndexMatrix const& gmat,
|
const common::Range1d range, bst_bin_t split_cond, GHistIndexMatrix const& gmat,
|
||||||
const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid,
|
const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid,
|
||||||
BitVector* decision_bits, BitVector* missing_bits) {
|
BitVector* decision_bits, BitVector* missing_bits) {
|
||||||
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
|
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
|
||||||
@ -204,7 +218,7 @@ class PartitionBuilder {
|
|||||||
for (auto row_id : rid_span) {
|
for (auto row_id : rid_span) {
|
||||||
auto gidx = gmat.GetGindex(row_id, fid);
|
auto gidx = gmat.GetGindex(row_id, fid);
|
||||||
if (gidx > -1) {
|
if (gidx > -1) {
|
||||||
bool go_left = false;
|
bool go_left;
|
||||||
if (is_cat) {
|
if (is_cat) {
|
||||||
go_left = Decision(node_cats, cut_values[gidx]);
|
go_left = Decision(node_cats, cut_values[gidx]);
|
||||||
} else {
|
} else {
|
||||||
@ -218,7 +232,27 @@ class PartitionBuilder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
|
auto pred_hist = [&](auto ridx, auto bin_id) {
|
||||||
|
if (any_cat && is_cat) {
|
||||||
|
auto gidx = gmat.GetGindex(ridx, fid);
|
||||||
|
CHECK_GT(gidx, -1);
|
||||||
|
return Decision(node_cats, cut_values[gidx]);
|
||||||
|
} else {
|
||||||
|
return bin_id <= split_cond;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) {
|
||||||
|
auto column = column_matrix.DenseColumn<BinIdxType, any_missing>(fid);
|
||||||
|
MaskKernel<any_missing>(&column, rid_span, gmat.base_rowid, decision_bits, missing_bits,
|
||||||
|
pred_hist);
|
||||||
|
} else {
|
||||||
|
CHECK_EQ(any_missing, true);
|
||||||
|
auto column =
|
||||||
|
column_matrix.SparseColumn<BinIdxType>(fid, rid_span.front() - gmat.base_rowid);
|
||||||
|
MaskKernel<any_missing>(&column, rid_span, gmat.base_rowid, decision_bits, missing_bits,
|
||||||
|
pred_hist);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -238,7 +272,7 @@ class PartitionBuilder {
|
|||||||
std::size_t nid = nodes[node_in_set].nid;
|
std::size_t nid = nodes[node_in_set].nid;
|
||||||
bool default_left = tree[nid].DefaultLeft();
|
bool default_left = tree[nid].DefaultLeft();
|
||||||
|
|
||||||
auto pred_approx = [&](auto ridx) {
|
auto pred = [&](auto ridx) {
|
||||||
bool go_left = default_left;
|
bool go_left = default_left;
|
||||||
bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
|
bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
|
||||||
if (!is_missing) {
|
if (!is_missing) {
|
||||||
@ -248,11 +282,7 @@ class PartitionBuilder {
|
|||||||
};
|
};
|
||||||
|
|
||||||
std::pair<size_t, size_t> child_nodes_sizes;
|
std::pair<size_t, size_t> child_nodes_sizes;
|
||||||
if (!column_matrix.IsInitialized()) {
|
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred);
|
||||||
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
|
|
||||||
} else {
|
|
||||||
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
|
|
||||||
}
|
|
||||||
|
|
||||||
const size_t n_left = child_nodes_sizes.first;
|
const size_t n_left = child_nodes_sizes.first;
|
||||||
const size_t n_right = child_nodes_sizes.second;
|
const size_t n_right = child_nodes_sizes.second;
|
||||||
|
|||||||
@ -38,19 +38,21 @@ class ColumnSplitHelper {
|
|||||||
missing_bits_ = BitVector(common::Span<BitVector::value_type>(missing_storage_));
|
missing_bits_ = BitVector(common::Span<BitVector::value_type>(missing_storage_));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename ExpandEntry>
|
template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
|
||||||
void Partition(common::BlockedSpace2d const& space, std::int32_t n_threads,
|
void Partition(common::BlockedSpace2d const& space, std::int32_t n_threads,
|
||||||
GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix,
|
GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix,
|
||||||
std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
|
std::vector<ExpandEntry> const& nodes,
|
||||||
|
std::vector<int32_t> const& split_conditions, RegTree const* p_tree) {
|
||||||
// When data is split by column, we don't have all the feature values in the local worker, so
|
// When data is split by column, we don't have all the feature values in the local worker, so
|
||||||
// we first collect all the decisions and whether the feature is missing into bit vectors.
|
// we first collect all the decisions and whether the feature is missing into bit vectors.
|
||||||
std::fill(decision_storage_.begin(), decision_storage_.end(), 0);
|
std::fill(decision_storage_.begin(), decision_storage_.end(), 0);
|
||||||
std::fill(missing_storage_.begin(), missing_storage_.end(), 0);
|
std::fill(missing_storage_.begin(), missing_storage_.end(), 0);
|
||||||
common::ParallelFor2d(space, n_threads, [&](size_t node_in_set, common::Range1d r) {
|
common::ParallelFor2d(space, n_threads, [&](size_t node_in_set, common::Range1d r) {
|
||||||
const int32_t nid = nodes[node_in_set].nid;
|
const int32_t nid = nodes[node_in_set].nid;
|
||||||
partition_builder_->MaskRows(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
|
bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0;
|
||||||
(*row_set_collection_)[nid].begin, &decision_bits_,
|
partition_builder_->MaskRows<BinIdxType, any_missing, any_cat>(
|
||||||
&missing_bits_);
|
node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree,
|
||||||
|
(*row_set_collection_)[nid].begin, &decision_bits_, &missing_bits_);
|
||||||
});
|
});
|
||||||
|
|
||||||
// Then aggregate the bit vectors across all the workers.
|
// Then aggregate the bit vectors across all the workers.
|
||||||
@ -217,7 +219,8 @@ class CommonRowPartitioner {
|
|||||||
// 2.3 Split elements of row_set_collection_ to left and right child-nodes for each node
|
// 2.3 Split elements of row_set_collection_ to left and right child-nodes for each node
|
||||||
// Store results in intermediate buffers from partition_builder_
|
// Store results in intermediate buffers from partition_builder_
|
||||||
if (is_col_split_) {
|
if (is_col_split_) {
|
||||||
column_split_helper_.Partition(space, ctx->Threads(), gmat, column_matrix, nodes, p_tree);
|
column_split_helper_.Partition<BinIdxType, any_missing, any_cat>(
|
||||||
|
space, ctx->Threads(), gmat, column_matrix, nodes, split_conditions, p_tree);
|
||||||
} else {
|
} else {
|
||||||
common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) {
|
common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) {
|
||||||
size_t begin = r.begin();
|
size_t begin = r.begin();
|
||||||
|
|||||||
@ -19,6 +19,8 @@
|
|||||||
#include "xgboost/data.h"
|
#include "xgboost/data.h"
|
||||||
|
|
||||||
namespace xgboost::tree {
|
namespace xgboost::tree {
|
||||||
|
|
||||||
|
namespace {
|
||||||
template <typename ExpandEntry>
|
template <typename ExpandEntry>
|
||||||
void TestPartitioner(bst_target_t n_targets) {
|
void TestPartitioner(bst_target_t n_targets) {
|
||||||
std::size_t n_samples = 1024, base_rowid = 0;
|
std::size_t n_samples = 1024, base_rowid = 0;
|
||||||
@ -86,8 +88,117 @@ void TestPartitioner(bst_target_t n_targets) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} // anonymous namespace
|
||||||
|
|
||||||
TEST(QuantileHist, Partitioner) { TestPartitioner<CPUExpandEntry>(1); }
|
TEST(QuantileHist, Partitioner) { TestPartitioner<CPUExpandEntry>(1); }
|
||||||
|
|
||||||
TEST(QuantileHist, MultiPartitioner) { TestPartitioner<MultiExpandEntry>(3); }
|
TEST(QuantileHist, MultiPartitioner) { TestPartitioner<MultiExpandEntry>(3); }
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
template <typename ExpandEntry>
|
||||||
|
void VerifyColumnSplitPartitioner(bst_target_t n_targets, size_t n_samples,
|
||||||
|
bst_feature_t n_features, size_t base_rowid,
|
||||||
|
std::shared_ptr<DMatrix> Xy, float min_value, float mid_value,
|
||||||
|
CommonRowPartitioner const& expected_mid_partitioner) {
|
||||||
|
auto dmat =
|
||||||
|
std::unique_ptr<DMatrix>{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
|
||||||
|
|
||||||
|
Context ctx;
|
||||||
|
ctx.InitAllowUnknown(Args{});
|
||||||
|
|
||||||
|
std::vector<ExpandEntry> candidates{{0, 0}};
|
||||||
|
candidates.front().split.loss_chg = 0.4;
|
||||||
|
auto cuts = common::SketchOnDMatrix(&ctx, dmat.get(), 64);
|
||||||
|
|
||||||
|
for (auto const& page : Xy->GetBatches<SparsePage>()) {
|
||||||
|
GHistIndexMatrix gmat(page, {}, cuts, 64, true, 0.5, ctx.Threads());
|
||||||
|
bst_feature_t const split_ind = 0;
|
||||||
|
common::ColumnMatrix column_indices;
|
||||||
|
column_indices.InitFromSparse(page, gmat, 0.5, ctx.Threads());
|
||||||
|
{
|
||||||
|
RegTree tree{n_targets, n_features};
|
||||||
|
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true};
|
||||||
|
if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
|
||||||
|
GetSplit(&tree, min_value, &candidates);
|
||||||
|
} else {
|
||||||
|
GetMultiSplitForTest(&tree, min_value, &candidates);
|
||||||
|
}
|
||||||
|
partitioner.UpdatePosition<false, true>(&ctx, gmat, column_indices, candidates, &tree);
|
||||||
|
ASSERT_EQ(partitioner.Size(), 3);
|
||||||
|
ASSERT_EQ(partitioner[1].Size(), 0);
|
||||||
|
ASSERT_EQ(partitioner[2].Size(), n_samples);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
RegTree tree{n_targets, n_features};
|
||||||
|
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true};
|
||||||
|
if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
|
||||||
|
GetSplit(&tree, mid_value, &candidates);
|
||||||
|
} else {
|
||||||
|
GetMultiSplitForTest(&tree, mid_value, &candidates);
|
||||||
|
}
|
||||||
|
auto left_nidx = tree.LeftChild(RegTree::kRoot);
|
||||||
|
partitioner.UpdatePosition<false, true>(&ctx, gmat, column_indices, candidates, &tree);
|
||||||
|
|
||||||
|
auto elem = partitioner[left_nidx];
|
||||||
|
ASSERT_LT(elem.Size(), n_samples);
|
||||||
|
ASSERT_GT(elem.Size(), 1);
|
||||||
|
auto expected_elem = expected_mid_partitioner[left_nidx];
|
||||||
|
ASSERT_EQ(elem.Size(), expected_elem.Size());
|
||||||
|
for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) {
|
||||||
|
ASSERT_EQ(*it, *eit);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto right_nidx = tree.RightChild(RegTree::kRoot);
|
||||||
|
elem = partitioner[right_nidx];
|
||||||
|
expected_elem = expected_mid_partitioner[right_nidx];
|
||||||
|
ASSERT_EQ(elem.Size(), expected_elem.Size());
|
||||||
|
for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) {
|
||||||
|
ASSERT_EQ(*it, *eit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename ExpandEntry>
|
||||||
|
void TestColumnSplitPartitioner(bst_target_t n_targets) {
|
||||||
|
std::size_t n_samples = 1024, base_rowid = 0;
|
||||||
|
bst_feature_t n_features = 16;
|
||||||
|
auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
|
||||||
|
std::vector<ExpandEntry> candidates{{0, 0}};
|
||||||
|
candidates.front().split.loss_chg = 0.4;
|
||||||
|
|
||||||
|
Context ctx;
|
||||||
|
ctx.InitAllowUnknown(Args{});
|
||||||
|
auto cuts = common::SketchOnDMatrix(&ctx, Xy.get(), 64);
|
||||||
|
|
||||||
|
float min_value, mid_value;
|
||||||
|
CommonRowPartitioner mid_partitioner{&ctx, n_samples, base_rowid, false};
|
||||||
|
for (auto const& page : Xy->GetBatches<SparsePage>()) {
|
||||||
|
GHistIndexMatrix gmat(page, {}, cuts, 64, true, 0.5, ctx.Threads());
|
||||||
|
bst_feature_t const split_ind = 0;
|
||||||
|
common::ColumnMatrix column_indices;
|
||||||
|
column_indices.InitFromSparse(page, gmat, 0.5, ctx.Threads());
|
||||||
|
min_value = gmat.cut.MinValues()[split_ind];
|
||||||
|
|
||||||
|
auto ptr = gmat.cut.Ptrs()[split_ind + 1];
|
||||||
|
mid_value = gmat.cut.Values().at(ptr / 2);
|
||||||
|
RegTree tree{n_targets, n_features};
|
||||||
|
if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
|
||||||
|
GetSplit(&tree, mid_value, &candidates);
|
||||||
|
} else {
|
||||||
|
GetMultiSplitForTest(&tree, mid_value, &candidates);
|
||||||
|
}
|
||||||
|
mid_partitioner.UpdatePosition<false, true>(&ctx, gmat, column_indices, candidates, &tree);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto constexpr kWorkers = 4;
|
||||||
|
RunWithInMemoryCommunicator(kWorkers, VerifyColumnSplitPartitioner<ExpandEntry>, n_targets,
|
||||||
|
n_samples, n_features, base_rowid, Xy, min_value, mid_value, mid_partitioner);
|
||||||
|
}
|
||||||
|
} // anonymous namespace
|
||||||
|
|
||||||
|
TEST(QuantileHist, PartitionerColSplit) { TestColumnSplitPartitioner<CPUExpandEntry>(1); }
|
||||||
|
|
||||||
|
TEST(QuantileHist, MultiPartitionerColSplit) { TestColumnSplitPartitioner<MultiExpandEntry>(3); }
|
||||||
} // namespace xgboost::tree
|
} // namespace xgboost::tree
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user