sync Jun 1

This commit is contained in:
Your Name
2023-06-01 15:55:06 -07:00
76 changed files with 1424 additions and 595 deletions

View File

@@ -67,7 +67,7 @@ class ColumnSplitHelper {
const int32_t nid = nodes[node_in_set].nid;
const size_t task_id = partition_builder_->GetTaskIdx(node_in_set, begin);
partition_builder_->AllocateForTask(task_id);
partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, *p_tree,
(*row_set_collection_)[nid].begin, decision_bits_,
missing_bits_);
});

View File

@@ -25,7 +25,6 @@
#include "xgboost/linalg.h" // for Constants, Vector
namespace xgboost::tree {
template <typename ExpandEntry>
class HistEvaluator {
private:
struct NodeEntry {
@@ -285,10 +284,42 @@ class HistEvaluator {
return left_sum;
}
/**
* @brief Gather the expand entries from all the workers.
* @param entries Local expand entries on this worker.
* @return Global expand entries gathered from all workers.
*/
std::vector<CPUExpandEntry> Allgather(std::vector<CPUExpandEntry> const &entries) {
auto const world = collective::GetWorldSize();
auto const rank = collective::GetRank();
auto const num_entries = entries.size();
// First, gather all the primitive fields.
std::vector<CPUExpandEntry> all_entries(num_entries * world);
std::vector<uint32_t> cat_bits;
std::vector<std::size_t> cat_bits_sizes;
for (std::size_t i = 0; i < num_entries; i++) {
all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes);
}
collective::Allgather(all_entries.data(), all_entries.size() * sizeof(CPUExpandEntry));
// Gather all the cat_bits.
auto gathered = collective::AllgatherV(cat_bits, cat_bits_sizes);
common::ParallelFor(num_entries * world, ctx_->Threads(), [&] (auto i) {
// Copy the cat_bits back into all expand entries.
all_entries[i].split.cat_bits.resize(gathered.sizes[i]);
std::copy_n(gathered.result.cbegin() + gathered.offsets[i], gathered.sizes[i],
all_entries[i].split.cat_bits.begin());
});
return all_entries;
}
public:
void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut,
common::Span<FeatureType const> feature_types, const RegTree &tree,
std::vector<ExpandEntry> *p_entries) {
std::vector<CPUExpandEntry> *p_entries) {
auto n_threads = ctx_->Threads();
auto& entries = *p_entries;
// All nodes are on the same level, so we can store the shared ptr.
@@ -306,7 +337,7 @@ class HistEvaluator {
return features[nidx_in_set]->Size();
}, grain_size);
std::vector<ExpandEntry> tloc_candidates(n_threads * entries.size());
std::vector<CPUExpandEntry> tloc_candidates(n_threads * entries.size());
for (size_t i = 0; i < entries.size(); ++i) {
for (decltype(n_threads) j = 0; j < n_threads; ++j) {
tloc_candidates[i * n_threads + j] = entries[i];
@@ -365,22 +396,18 @@ class HistEvaluator {
if (is_col_split_) {
// With column-wise data split, we gather the best splits from all the workers and update the
// expand entries accordingly.
auto const world = collective::GetWorldSize();
auto const rank = collective::GetRank();
auto const num_entries = entries.size();
std::vector<ExpandEntry> buffer{num_entries * world};
std::copy_n(entries.cbegin(), num_entries, buffer.begin() + num_entries * rank);
collective::Allgather(buffer.data(), buffer.size() * sizeof(ExpandEntry));
for (auto worker = 0; worker < world; ++worker) {
auto all_entries = Allgather(entries);
for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
entries[nidx_in_set].split.Update(buffer[worker * num_entries + nidx_in_set].split);
entries[nidx_in_set].split.Update(
all_entries[worker * entries.size() + nidx_in_set].split);
}
}
}
}
// Add splits to tree, handles all statistic
void ApplyTreeSplit(ExpandEntry const& candidate, RegTree *p_tree) {
void ApplyTreeSplit(CPUExpandEntry const& candidate, RegTree *p_tree) {
auto evaluator = tree_evaluator_.GetEvaluator();
RegTree &tree = *p_tree;
@@ -465,6 +492,7 @@ class HistMultiEvaluator {
FeatureInteractionConstraintHost interaction_constraints_;
std::shared_ptr<common::ColumnSampler> column_sampler_;
Context const *ctx_;
bool is_col_split_{false};
private:
static double MultiCalcSplitGain(TrainParam const &param,
@@ -543,6 +571,57 @@ class HistMultiEvaluator {
return false;
}
/**
 * @brief Gather the expand entries from all the workers.
 *
 * The primitive fields are exchanged through a raw Allgather of the entry buffer. The
 * categorical bits are exchanged with AllgatherV, and the left/right gradient sums are
 * flattened into one buffer, exchanged with Allgather, and copied back into each entry.
 *
 * The gradient buffer is indexed with a uniform stride (`gradients.size() / entries.size()`),
 * so this relies on every entry contributing the same number of gradients, split evenly
 * between left_sum and right_sum.
 * NOTE(review): the buffer layout also assumes every worker passes the same number of
 * entries - confirm against the callers.
 *
 * @param entries Local expand entries on this worker.
 * @return Global expand entries gathered from all workers. Empty when `entries` is empty.
 */
std::vector<MultiExpandEntry> Allgather(std::vector<MultiExpandEntry> const &entries) {
  auto const world = collective::GetWorldSize();
  auto const rank = collective::GetRank();
  auto const num_entries = entries.size();

  // First, gather all the primitive fields. The local entries are staged in this worker's
  // slice of the global buffer; cat_bits and gradients are collected into side buffers.
  std::vector<MultiExpandEntry> all_entries(num_entries * world);
  std::vector<uint32_t> cat_bits;
  std::vector<std::size_t> cat_bits_sizes;
  std::vector<GradientPairPrecise> gradients;
  for (std::size_t i = 0; i < num_entries; i++) {
    all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes,
                                                       &gradients);
  }
  collective::Allgather(all_entries.data(), all_entries.size() * sizeof(MultiExpandEntry));

  // Gather all the cat_bits.
  auto gathered_cat_bits = collective::AllgatherV(cat_bits, cat_bits_sizes);

  // Gather all the gradients.
  auto const num_gradients = gradients.size();
  std::vector<GradientPairPrecise> all_gradients(num_gradients * world);
  std::copy_n(gradients.cbegin(), num_gradients, all_gradients.begin() + num_gradients * rank);
  collective::Allgather(all_gradients.data(), all_gradients.size() * sizeof(GradientPairPrecise));

  auto const total_entries = num_entries * world;
  // Guard the division: with no local entries there is nothing to distribute, and dividing
  // by zero would be undefined behaviour. The collective calls above are still issued so the
  // call sequence stays the same on every worker.
  auto const gradients_per_entry =
      num_entries == 0 ? std::size_t{0} : num_gradients / num_entries;
  auto const gradients_per_side = gradients_per_entry / 2;
  common::ParallelFor(total_entries, ctx_->Threads(), [&] (auto i) {
    // Copy the cat_bits back into all expand entries.
    all_entries[i].split.cat_bits.resize(gathered_cat_bits.sizes[i]);
    std::copy_n(gathered_cat_bits.result.cbegin() + gathered_cat_bits.offsets[i],
                gathered_cat_bits.sizes[i], all_entries[i].split.cat_bits.begin());

    // Copy the gradients back into all expand entries: the first half of each entry's
    // slice is the left sum, the second half is the right sum.
    all_entries[i].split.left_sum.resize(gradients_per_side);
    std::copy_n(all_gradients.cbegin() + i * gradients_per_entry, gradients_per_side,
                all_entries[i].split.left_sum.begin());
    all_entries[i].split.right_sum.resize(gradients_per_side);
    std::copy_n(all_gradients.cbegin() + i * gradients_per_entry + gradients_per_side,
                gradients_per_side, all_entries[i].split.right_sum.begin());
  });

  return all_entries;
}
public:
void EvaluateSplits(RegTree const &tree, common::Span<const common::HistCollection *> hist,
common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
@@ -597,6 +676,18 @@ class HistMultiEvaluator {
entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split);
}
}
if (is_col_split_) {
// With column-wise data split, we gather the best splits from all the workers and update the
// expand entries accordingly.
auto all_entries = Allgather(entries);
for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
entries[nidx_in_set].split.Update(
all_entries[worker * entries.size() + nidx_in_set].split);
}
}
}
}
linalg::Vector<float> InitRoot(linalg::VectorView<GradientPairPrecise const> root_sum) {
@@ -660,7 +751,10 @@ class HistMultiEvaluator {
explicit HistMultiEvaluator(Context const *ctx, MetaInfo const &info, TrainParam const *param,
std::shared_ptr<common::ColumnSampler> sampler)
: param_{param}, column_sampler_{std::move(sampler)}, ctx_{ctx} {
: param_{param},
column_sampler_{std::move(sampler)},
ctx_{ctx},
is_col_split_{info.IsColumnSplit()} {
interaction_constraints_.Configure(*param, info.num_col_);
column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
param_->colsample_bynode, param_->colsample_bylevel,

View File

@@ -70,6 +70,22 @@ struct CPUExpandEntry : public ExpandEntryImpl<CPUExpandEntry> {
os << "split:\n" << e.split << std::endl;
return os;
}
/**
 * @brief Take over the plain fields of another entry and append its cat_bits to a buffer.
 *
 * Helper for allgather: the node id and depth are copied directly, the split is handled
 * by the split's own CopyAndCollect.
 *
 * @param that               Entry to copy from.
 * @param collected_cat_bits Output buffer that receives the cat_bits.
 * @param cat_bits_sizes     Output buffer that receives the size of the collected cat_bits.
 */
void CopyAndCollect(CPUExpandEntry const& that, std::vector<uint32_t>* collected_cat_bits,
                    std::vector<std::size_t>* cat_bits_sizes) {
  this->nid = that.nid;
  this->depth = that.depth;
  this->split.CopyAndCollect(that.split, collected_cat_bits, cat_bits_sizes);
}
};
struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
@@ -119,6 +135,24 @@ struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
os << "]\n";
return os;
}
/**
 * @brief Take over the plain fields of another entry and append its cat_bits and gradients
 *        to the given buffers.
 *
 * Helper for allgather: the node id and depth are copied directly, the split is handled
 * by the split's own CopyAndCollect.
 *
 * @param that                Entry to copy from.
 * @param collected_cat_bits  Output buffer that receives the cat_bits.
 * @param cat_bits_sizes      Output buffer that receives the size of the collected cat_bits.
 * @param collected_gradients Output buffer that receives the gradients.
 */
void CopyAndCollect(MultiExpandEntry const& that, std::vector<uint32_t>* collected_cat_bits,
                    std::vector<std::size_t>* cat_bits_sizes,
                    std::vector<GradientPairPrecise>* collected_gradients) {
  this->nid = that.nid;
  this->depth = that.depth;
  this->split.CopyAndCollect(that.split, collected_cat_bits, cat_bits_sizes,
                             collected_gradients);
}
};
} // namespace xgboost::tree
#endif // XGBOOST_TREE_HIST_EXPAND_ENTRY_H_

View File

@@ -419,6 +419,60 @@ struct SplitEntryContainer {
<< "right_sum: " << s.right_sum << std::endl;
return os;
}
/**
 * @brief Copy the fixed-size fields of another split entry and append its cat_bits to a
 *        shared buffer.
 *
 * Used for allgather. Only valid when GradientT is trivially copyable, so the gradient
 * sums are copied directly. The cat_bits of this entry are not modified.
 *
 * @param that               Split entry to copy from.
 * @param collected_cat_bits Output buffer; the cat_bits of `that` are appended to it.
 * @param cat_bits_sizes     Output buffer; the number of appended cat_bits is appended to it.
 */
void CopyAndCollect(SplitEntryContainer<GradientT> const &that,
                    std::vector<uint32_t> *collected_cat_bits,
                    std::vector<std::size_t> *cat_bits_sizes) {
  static_assert(std::is_trivially_copyable_v<GradientT>);

  // Fixed-size fields, including the gradient sums.
  this->loss_chg = that.loss_chg;
  this->sindex = that.sindex;
  this->split_value = that.split_value;
  this->is_cat = that.is_cat;
  this->left_sum = that.left_sum;
  this->right_sum = that.right_sum;

  // Variable-length categorical bits go to the side buffer together with their length.
  auto const &src_bits = that.cat_bits;
  collected_cat_bits->insert(collected_cat_bits->end(), src_bits.cbegin(), src_bits.cend());
  cat_bits_sizes->emplace_back(src_bits.size());
}
/**
 * @brief Copy the fixed-size fields of another split entry and append its cat_bits and
 *        gradient sums to shared buffers.
 *
 * Used for allgather. Only valid when GradientT is not trivially copyable; the gradient
 * sums are then appended element by element, left_sum first and right_sum second. The
 * cat_bits, left_sum and right_sum of this entry are not modified.
 *
 * @param that                Split entry to copy from.
 * @param collected_cat_bits  Output buffer; the cat_bits of `that` are appended to it.
 * @param cat_bits_sizes      Output buffer; the number of appended cat_bits is appended to it.
 * @param collected_gradients Output buffer; left_sum then right_sum of `that` are appended.
 */
template <typename G>
void CopyAndCollect(SplitEntryContainer<GradientT> const &that,
                    std::vector<uint32_t> *collected_cat_bits,
                    std::vector<std::size_t> *cat_bits_sizes,
                    std::vector<G> *collected_gradients) {
  static_assert(!std::is_trivially_copyable_v<GradientT>);

  // Fixed-size fields.
  this->loss_chg = that.loss_chg;
  this->sindex = that.sindex;
  this->split_value = that.split_value;
  this->is_cat = that.is_cat;

  // Variable-length categorical bits go to the side buffer together with their length.
  auto const &src_bits = that.cat_bits;
  collected_cat_bits->insert(collected_cat_bits->end(), src_bits.cbegin(), src_bits.cend());
  cat_bits_sizes->emplace_back(src_bits.size());

  // Flatten the gradient sums: left side first, then right side.
  auto const &lhs = that.left_sum;
  auto const &rhs = that.right_sum;
  collected_gradients->insert(collected_gradients->end(), lhs.cbegin(), lhs.cend());
  collected_gradients->insert(collected_gradients->end(), rhs.cbegin(), rhs.cend());
}
/*!\return feature index to split on */
[[nodiscard]] bst_feature_t SplitIndex() const { return sindex & ((1U << 31) - 1U); }
/*!\return whether missing value goes to left branch */

View File

@@ -44,7 +44,7 @@ class GloablApproxBuilder {
protected:
TrainParam const *param_;
std::shared_ptr<common::ColumnSampler> col_sampler_;
HistEvaluator<CPUExpandEntry> evaluator_;
HistEvaluator evaluator_;
HistogramBuilder<CPUExpandEntry> histogram_builder_;
Context const *ctx_;
ObjInfo const *const task_;

View File

@@ -13,6 +13,7 @@
#include <utility> // for move, swap
#include <vector> // for vector
#include "../collective/aggregator.h" // for GlobalSum
#include "../collective/communicator-inl.h" // for Allreduce, IsDistributed
#include "../collective/communicator.h" // for Operation
#include "../common/hist_util.h" // for HistogramCuts, HistCollection
@@ -200,8 +201,8 @@ class MultiTargetHistBuilder {
}
}
CHECK(root_sum.CContiguous());
collective::Allreduce<collective::Operation::kSum>(
reinterpret_cast<double *>(root_sum.Values().data()), root_sum.Size() * 2);
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(root_sum.Values().data()),
root_sum.Size() * 2);
std::vector<MultiExpandEntry> nodes{best};
std::size_t i = 0;
@@ -335,7 +336,7 @@ class HistBuilder {
common::Monitor *monitor_;
TrainParam const *param_;
std::shared_ptr<common::ColumnSampler> col_sampler_;
std::unique_ptr<HistEvaluator<CPUExpandEntry>> evaluator_;
std::unique_ptr<HistEvaluator> evaluator_;
std::vector<CommonRowPartitioner> partitioner_;
// back pointers to tree and data matrix
@@ -354,7 +355,7 @@ class HistBuilder {
: monitor_{monitor},
param_{param},
col_sampler_{std::move(column_sampler)},
evaluator_{std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx, param, fmat->Info(),
evaluator_{std::make_unique<HistEvaluator>(ctx, param, fmat->Info(),
col_sampler_)},
p_last_fmat_(fmat),
histogram_builder_{new HistogramBuilder<CPUExpandEntry>},
@@ -395,8 +396,7 @@ class HistBuilder {
}
histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
collective::IsDistributed(), fmat->Info().IsColumnSplit());
evaluator_ = std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx_, this->param_, fmat->Info(),
col_sampler_);
evaluator_ = std::make_unique<HistEvaluator>(ctx_, this->param_, fmat->Info(), col_sampler_);
p_last_tree_ = p_tree;
monitor_->Stop(__func__);
}
@@ -455,8 +455,7 @@ class HistBuilder {
for (auto const &grad : gpair_h) {
grad_stat.Add(grad.GetGrad(), grad.GetHess());
}
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&grad_stat),
2);
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&grad_stat), 2);
}
auto weight = evaluator_->InitRoot(GradStats{grad_stat});

View File

@@ -20,7 +20,7 @@ namespace xgboost::tree {
DMLC_REGISTRY_FILE_TAG(updater_refresh);
/*! \brief pruner that prunes a tree after growing finishs */
/*! \brief pruner that prunes a tree after growing finishes */
class TreeRefresher : public TreeUpdater {
public:
explicit TreeRefresher(Context const *ctx) : TreeUpdater(ctx) {}