sync Jun 1
This commit is contained in:
@@ -67,7 +67,7 @@ class ColumnSplitHelper {
|
||||
const int32_t nid = nodes[node_in_set].nid;
|
||||
const size_t task_id = partition_builder_->GetTaskIdx(node_in_set, begin);
|
||||
partition_builder_->AllocateForTask(task_id);
|
||||
partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
|
||||
partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, *p_tree,
|
||||
(*row_set_collection_)[nid].begin, decision_bits_,
|
||||
missing_bits_);
|
||||
});
|
||||
|
||||
@@ -25,7 +25,6 @@
|
||||
#include "xgboost/linalg.h" // for Constants, Vector
|
||||
|
||||
namespace xgboost::tree {
|
||||
template <typename ExpandEntry>
|
||||
class HistEvaluator {
|
||||
private:
|
||||
struct NodeEntry {
|
||||
@@ -285,10 +284,42 @@ class HistEvaluator {
|
||||
return left_sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Gather the expand entries from all the workers.
|
||||
* @param entries Local expand entries on this worker.
|
||||
* @return Global expand entries gathered from all workers.
|
||||
*/
|
||||
std::vector<CPUExpandEntry> Allgather(std::vector<CPUExpandEntry> const &entries) {
|
||||
auto const world = collective::GetWorldSize();
|
||||
auto const rank = collective::GetRank();
|
||||
auto const num_entries = entries.size();
|
||||
|
||||
// First, gather all the primitive fields.
|
||||
std::vector<CPUExpandEntry> all_entries(num_entries * world);
|
||||
std::vector<uint32_t> cat_bits;
|
||||
std::vector<std::size_t> cat_bits_sizes;
|
||||
for (std::size_t i = 0; i < num_entries; i++) {
|
||||
all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes);
|
||||
}
|
||||
collective::Allgather(all_entries.data(), all_entries.size() * sizeof(CPUExpandEntry));
|
||||
|
||||
// Gather all the cat_bits.
|
||||
auto gathered = collective::AllgatherV(cat_bits, cat_bits_sizes);
|
||||
|
||||
common::ParallelFor(num_entries * world, ctx_->Threads(), [&] (auto i) {
|
||||
// Copy the cat_bits back into all expand entries.
|
||||
all_entries[i].split.cat_bits.resize(gathered.sizes[i]);
|
||||
std::copy_n(gathered.result.cbegin() + gathered.offsets[i], gathered.sizes[i],
|
||||
all_entries[i].split.cat_bits.begin());
|
||||
});
|
||||
|
||||
return all_entries;
|
||||
}
|
||||
|
||||
public:
|
||||
void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut,
|
||||
common::Span<FeatureType const> feature_types, const RegTree &tree,
|
||||
std::vector<ExpandEntry> *p_entries) {
|
||||
std::vector<CPUExpandEntry> *p_entries) {
|
||||
auto n_threads = ctx_->Threads();
|
||||
auto& entries = *p_entries;
|
||||
// All nodes are on the same level, so we can store the shared ptr.
|
||||
@@ -306,7 +337,7 @@ class HistEvaluator {
|
||||
return features[nidx_in_set]->Size();
|
||||
}, grain_size);
|
||||
|
||||
std::vector<ExpandEntry> tloc_candidates(n_threads * entries.size());
|
||||
std::vector<CPUExpandEntry> tloc_candidates(n_threads * entries.size());
|
||||
for (size_t i = 0; i < entries.size(); ++i) {
|
||||
for (decltype(n_threads) j = 0; j < n_threads; ++j) {
|
||||
tloc_candidates[i * n_threads + j] = entries[i];
|
||||
@@ -365,22 +396,18 @@ class HistEvaluator {
|
||||
if (is_col_split_) {
|
||||
// With column-wise data split, we gather the best splits from all the workers and update the
|
||||
// expand entries accordingly.
|
||||
auto const world = collective::GetWorldSize();
|
||||
auto const rank = collective::GetRank();
|
||||
auto const num_entries = entries.size();
|
||||
std::vector<ExpandEntry> buffer{num_entries * world};
|
||||
std::copy_n(entries.cbegin(), num_entries, buffer.begin() + num_entries * rank);
|
||||
collective::Allgather(buffer.data(), buffer.size() * sizeof(ExpandEntry));
|
||||
for (auto worker = 0; worker < world; ++worker) {
|
||||
auto all_entries = Allgather(entries);
|
||||
for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
|
||||
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
|
||||
entries[nidx_in_set].split.Update(buffer[worker * num_entries + nidx_in_set].split);
|
||||
entries[nidx_in_set].split.Update(
|
||||
all_entries[worker * entries.size() + nidx_in_set].split);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add splits to tree, handles all statistic
|
||||
void ApplyTreeSplit(ExpandEntry const& candidate, RegTree *p_tree) {
|
||||
void ApplyTreeSplit(CPUExpandEntry const& candidate, RegTree *p_tree) {
|
||||
auto evaluator = tree_evaluator_.GetEvaluator();
|
||||
RegTree &tree = *p_tree;
|
||||
|
||||
@@ -465,6 +492,7 @@ class HistMultiEvaluator {
|
||||
FeatureInteractionConstraintHost interaction_constraints_;
|
||||
std::shared_ptr<common::ColumnSampler> column_sampler_;
|
||||
Context const *ctx_;
|
||||
bool is_col_split_{false};
|
||||
|
||||
private:
|
||||
static double MultiCalcSplitGain(TrainParam const ¶m,
|
||||
@@ -543,6 +571,57 @@ class HistMultiEvaluator {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Gather the expand entries from all the workers.
|
||||
* @param entries Local expand entries on this worker.
|
||||
* @return Global expand entries gathered from all workers.
|
||||
*/
|
||||
std::vector<MultiExpandEntry> Allgather(std::vector<MultiExpandEntry> const &entries) {
|
||||
auto const world = collective::GetWorldSize();
|
||||
auto const rank = collective::GetRank();
|
||||
auto const num_entries = entries.size();
|
||||
|
||||
// First, gather all the primitive fields.
|
||||
std::vector<MultiExpandEntry> all_entries(num_entries * world);
|
||||
std::vector<uint32_t> cat_bits;
|
||||
std::vector<std::size_t> cat_bits_sizes;
|
||||
std::vector<GradientPairPrecise> gradients;
|
||||
for (std::size_t i = 0; i < num_entries; i++) {
|
||||
all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes,
|
||||
&gradients);
|
||||
}
|
||||
collective::Allgather(all_entries.data(), all_entries.size() * sizeof(MultiExpandEntry));
|
||||
|
||||
// Gather all the cat_bits.
|
||||
auto gathered_cat_bits = collective::AllgatherV(cat_bits, cat_bits_sizes);
|
||||
|
||||
// Gather all the gradients.
|
||||
auto const num_gradients = gradients.size();
|
||||
std::vector<GradientPairPrecise> all_gradients(num_gradients * world);
|
||||
std::copy_n(gradients.cbegin(), num_gradients, all_gradients.begin() + num_gradients * rank);
|
||||
collective::Allgather(all_gradients.data(), all_gradients.size() * sizeof(GradientPairPrecise));
|
||||
|
||||
auto const total_entries = num_entries * world;
|
||||
auto const gradients_per_entry = num_gradients / num_entries;
|
||||
auto const gradients_per_side = gradients_per_entry / 2;
|
||||
common::ParallelFor(total_entries, ctx_->Threads(), [&] (auto i) {
|
||||
// Copy the cat_bits back into all expand entries.
|
||||
all_entries[i].split.cat_bits.resize(gathered_cat_bits.sizes[i]);
|
||||
std::copy_n(gathered_cat_bits.result.cbegin() + gathered_cat_bits.offsets[i],
|
||||
gathered_cat_bits.sizes[i], all_entries[i].split.cat_bits.begin());
|
||||
|
||||
// Copy the gradients back into all expand entries.
|
||||
all_entries[i].split.left_sum.resize(gradients_per_side);
|
||||
std::copy_n(all_gradients.cbegin() + i * gradients_per_entry, gradients_per_side,
|
||||
all_entries[i].split.left_sum.begin());
|
||||
all_entries[i].split.right_sum.resize(gradients_per_side);
|
||||
std::copy_n(all_gradients.cbegin() + i * gradients_per_entry + gradients_per_side,
|
||||
gradients_per_side, all_entries[i].split.right_sum.begin());
|
||||
});
|
||||
|
||||
return all_entries;
|
||||
}
|
||||
|
||||
public:
|
||||
void EvaluateSplits(RegTree const &tree, common::Span<const common::HistCollection *> hist,
|
||||
common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
|
||||
@@ -597,6 +676,18 @@ class HistMultiEvaluator {
|
||||
entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split);
|
||||
}
|
||||
}
|
||||
|
||||
if (is_col_split_) {
|
||||
// With column-wise data split, we gather the best splits from all the workers and update the
|
||||
// expand entries accordingly.
|
||||
auto all_entries = Allgather(entries);
|
||||
for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
|
||||
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
|
||||
entries[nidx_in_set].split.Update(
|
||||
all_entries[worker * entries.size() + nidx_in_set].split);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
linalg::Vector<float> InitRoot(linalg::VectorView<GradientPairPrecise const> root_sum) {
|
||||
@@ -660,7 +751,10 @@ class HistMultiEvaluator {
|
||||
|
||||
explicit HistMultiEvaluator(Context const *ctx, MetaInfo const &info, TrainParam const *param,
|
||||
std::shared_ptr<common::ColumnSampler> sampler)
|
||||
: param_{param}, column_sampler_{std::move(sampler)}, ctx_{ctx} {
|
||||
: param_{param},
|
||||
column_sampler_{std::move(sampler)},
|
||||
ctx_{ctx},
|
||||
is_col_split_{info.IsColumnSplit()} {
|
||||
interaction_constraints_.Configure(*param, info.num_col_);
|
||||
column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
|
||||
param_->colsample_bynode, param_->colsample_bylevel,
|
||||
|
||||
@@ -70,6 +70,22 @@ struct CPUExpandEntry : public ExpandEntryImpl<CPUExpandEntry> {
|
||||
os << "split:\n" << e.split << std::endl;
|
||||
return os;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Copy primitive fields into this, and collect cat_bits into a vector.
|
||||
*
|
||||
* This is used for allgather.
|
||||
*
|
||||
* @param that The other entry to copy from
|
||||
* @param collected_cat_bits The vector to collect cat_bits
|
||||
* @param cat_bits_sizes The sizes of the collected cat_bits
|
||||
*/
|
||||
void CopyAndCollect(CPUExpandEntry const& that, std::vector<uint32_t>* collected_cat_bits,
|
||||
std::vector<std::size_t>* cat_bits_sizes) {
|
||||
nid = that.nid;
|
||||
depth = that.depth;
|
||||
split.CopyAndCollect(that.split, collected_cat_bits, cat_bits_sizes);
|
||||
}
|
||||
};
|
||||
|
||||
struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
|
||||
@@ -119,6 +135,24 @@ struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
|
||||
os << "]\n";
|
||||
return os;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Copy primitive fields into this, and collect cat_bits and gradients into vectors.
|
||||
*
|
||||
* This is used for allgather.
|
||||
*
|
||||
* @param that The other entry to copy from
|
||||
* @param collected_cat_bits The vector to collect cat_bits
|
||||
* @param cat_bits_sizes The sizes of the collected cat_bits
|
||||
* @param collected_gradients The vector to collect gradients
|
||||
*/
|
||||
void CopyAndCollect(MultiExpandEntry const& that, std::vector<uint32_t>* collected_cat_bits,
|
||||
std::vector<std::size_t>* cat_bits_sizes,
|
||||
std::vector<GradientPairPrecise>* collected_gradients) {
|
||||
nid = that.nid;
|
||||
depth = that.depth;
|
||||
split.CopyAndCollect(that.split, collected_cat_bits, cat_bits_sizes, collected_gradients);
|
||||
}
|
||||
};
|
||||
} // namespace xgboost::tree
|
||||
#endif // XGBOOST_TREE_HIST_EXPAND_ENTRY_H_
|
||||
|
||||
@@ -419,6 +419,60 @@ struct SplitEntryContainer {
|
||||
<< "right_sum: " << s.right_sum << std::endl;
|
||||
return os;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Copy primitive fields into this, and collect cat_bits into a vector.
|
||||
*
|
||||
* This is used for allgather.
|
||||
*
|
||||
* @param that The other entry to copy from
|
||||
* @param collected_cat_bits The vector to collect cat_bits
|
||||
* @param cat_bits_sizes The sizes of the collected cat_bits
|
||||
*/
|
||||
void CopyAndCollect(SplitEntryContainer<GradientT> const &that,
|
||||
std::vector<uint32_t> *collected_cat_bits,
|
||||
std::vector<std::size_t> *cat_bits_sizes) {
|
||||
loss_chg = that.loss_chg;
|
||||
sindex = that.sindex;
|
||||
split_value = that.split_value;
|
||||
is_cat = that.is_cat;
|
||||
static_assert(std::is_trivially_copyable_v<GradientT>);
|
||||
left_sum = that.left_sum;
|
||||
right_sum = that.right_sum;
|
||||
collected_cat_bits->insert(collected_cat_bits->end(), that.cat_bits.cbegin(),
|
||||
that.cat_bits.cend());
|
||||
cat_bits_sizes->emplace_back(that.cat_bits.size());
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Copy primitive fields into this, and collect cat_bits and gradient sums into vectors.
|
||||
*
|
||||
* This is used for allgather.
|
||||
*
|
||||
* @param that The other entry to copy from
|
||||
* @param collected_cat_bits The vector to collect cat_bits
|
||||
* @param cat_bits_sizes The sizes of the collected cat_bits
|
||||
* @param collected_gradients The vector to collect gradients
|
||||
*/
|
||||
template <typename G>
|
||||
void CopyAndCollect(SplitEntryContainer<GradientT> const &that,
|
||||
std::vector<uint32_t> *collected_cat_bits,
|
||||
std::vector<std::size_t> *cat_bits_sizes,
|
||||
std::vector<G> *collected_gradients) {
|
||||
loss_chg = that.loss_chg;
|
||||
sindex = that.sindex;
|
||||
split_value = that.split_value;
|
||||
is_cat = that.is_cat;
|
||||
collected_cat_bits->insert(collected_cat_bits->end(), that.cat_bits.cbegin(),
|
||||
that.cat_bits.cend());
|
||||
cat_bits_sizes->emplace_back(that.cat_bits.size());
|
||||
static_assert(!std::is_trivially_copyable_v<GradientT>);
|
||||
collected_gradients->insert(collected_gradients->end(), that.left_sum.cbegin(),
|
||||
that.left_sum.cend());
|
||||
collected_gradients->insert(collected_gradients->end(), that.right_sum.cbegin(),
|
||||
that.right_sum.cend());
|
||||
}
|
||||
|
||||
/*!\return feature index to split on */
|
||||
[[nodiscard]] bst_feature_t SplitIndex() const { return sindex & ((1U << 31) - 1U); }
|
||||
/*!\return whether missing value goes to left branch */
|
||||
|
||||
@@ -44,7 +44,7 @@ class GloablApproxBuilder {
|
||||
protected:
|
||||
TrainParam const *param_;
|
||||
std::shared_ptr<common::ColumnSampler> col_sampler_;
|
||||
HistEvaluator<CPUExpandEntry> evaluator_;
|
||||
HistEvaluator evaluator_;
|
||||
HistogramBuilder<CPUExpandEntry> histogram_builder_;
|
||||
Context const *ctx_;
|
||||
ObjInfo const *const task_;
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include <utility> // for move, swap
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../collective/aggregator.h" // for GlobalSum
|
||||
#include "../collective/communicator-inl.h" // for Allreduce, IsDistributed
|
||||
#include "../collective/communicator.h" // for Operation
|
||||
#include "../common/hist_util.h" // for HistogramCuts, HistCollection
|
||||
@@ -200,8 +201,8 @@ class MultiTargetHistBuilder {
|
||||
}
|
||||
}
|
||||
CHECK(root_sum.CContiguous());
|
||||
collective::Allreduce<collective::Operation::kSum>(
|
||||
reinterpret_cast<double *>(root_sum.Values().data()), root_sum.Size() * 2);
|
||||
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(root_sum.Values().data()),
|
||||
root_sum.Size() * 2);
|
||||
|
||||
std::vector<MultiExpandEntry> nodes{best};
|
||||
std::size_t i = 0;
|
||||
@@ -335,7 +336,7 @@ class HistBuilder {
|
||||
common::Monitor *monitor_;
|
||||
TrainParam const *param_;
|
||||
std::shared_ptr<common::ColumnSampler> col_sampler_;
|
||||
std::unique_ptr<HistEvaluator<CPUExpandEntry>> evaluator_;
|
||||
std::unique_ptr<HistEvaluator> evaluator_;
|
||||
std::vector<CommonRowPartitioner> partitioner_;
|
||||
|
||||
// back pointers to tree and data matrix
|
||||
@@ -354,7 +355,7 @@ class HistBuilder {
|
||||
: monitor_{monitor},
|
||||
param_{param},
|
||||
col_sampler_{std::move(column_sampler)},
|
||||
evaluator_{std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx, param, fmat->Info(),
|
||||
evaluator_{std::make_unique<HistEvaluator>(ctx, param, fmat->Info(),
|
||||
col_sampler_)},
|
||||
p_last_fmat_(fmat),
|
||||
histogram_builder_{new HistogramBuilder<CPUExpandEntry>},
|
||||
@@ -395,8 +396,7 @@ class HistBuilder {
|
||||
}
|
||||
histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
|
||||
collective::IsDistributed(), fmat->Info().IsColumnSplit());
|
||||
evaluator_ = std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx_, this->param_, fmat->Info(),
|
||||
col_sampler_);
|
||||
evaluator_ = std::make_unique<HistEvaluator>(ctx_, this->param_, fmat->Info(), col_sampler_);
|
||||
p_last_tree_ = p_tree;
|
||||
monitor_->Stop(__func__);
|
||||
}
|
||||
@@ -455,8 +455,7 @@ class HistBuilder {
|
||||
for (auto const &grad : gpair_h) {
|
||||
grad_stat.Add(grad.GetGrad(), grad.GetHess());
|
||||
}
|
||||
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&grad_stat),
|
||||
2);
|
||||
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&grad_stat), 2);
|
||||
}
|
||||
|
||||
auto weight = evaluator_->InitRoot(GradStats{grad_stat});
|
||||
|
||||
@@ -20,7 +20,7 @@ namespace xgboost::tree {
|
||||
|
||||
DMLC_REGISTRY_FILE_TAG(updater_refresh);
|
||||
|
||||
/*! \brief pruner that prunes a tree after growing finishs */
|
||||
/*! \brief pruner that prunes a tree after growing finishes */
|
||||
class TreeRefresher : public TreeUpdater {
|
||||
public:
|
||||
explicit TreeRefresher(Context const *ctx) : TreeUpdater(ctx) {}
|
||||
|
||||
Reference in New Issue
Block a user