Merge latest changes

Hui Liu
2023-12-13 21:06:28 -08:00
194 changed files with 4859 additions and 2838 deletions

View File

@@ -429,11 +429,11 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
}
}
-void GPUHistEvaluator::EvaluateSplits(
-const std::vector<bst_node_t> &nidx, bst_feature_t max_active_features,
-common::Span<const EvaluateSplitInputs> d_inputs,
-EvaluateSplitSharedInputs shared_inputs,
-common::Span<GPUExpandEntry> out_entries) {
+void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector<bst_node_t> &nidx,
+bst_feature_t max_active_features,
+common::Span<const EvaluateSplitInputs> d_inputs,
+EvaluateSplitSharedInputs shared_inputs,
+common::Span<GPUExpandEntry> out_entries) {
auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();
dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
@@ -451,19 +451,20 @@ void GPUHistEvaluator::EvaluateSplits(
out_splits.size() * sizeof(DeviceSplitCandidate));
// Reduce to get the best candidate from all workers.
-dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) {
-out_splits[i] = all_candidates[i];
-for (auto rank = 1; rank < world_size; rank++) {
-out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
-}
-});
+dh::LaunchN(out_splits.size(), ctx->CUDACtx()->Stream(),
+[world_size, all_candidates, out_splits] __device__(size_t i) {
+out_splits[i] = all_candidates[i];
+for (auto rank = 1; rank < world_size; rank++) {
+out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
+}
+});
}
auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
auto d_entries = out_entries;
auto device_cats_accessor = this->DeviceCatStorage(nidx);
// turn candidate into entry, along with handling sort based split.
-dh::LaunchN(d_inputs.size(), [=] __device__(size_t i) mutable {
+dh::LaunchN(d_inputs.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t i) mutable {
auto const input = d_inputs[i];
auto &split = out_splits[i];
// Subtract parent gain here
@@ -498,12 +499,12 @@ void GPUHistEvaluator::EvaluateSplits(
this->CopyToHost(nidx);
}
-GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
-EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs) {
+GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
+EvaluateSplitSharedInputs shared_inputs) {
dh::device_vector<EvaluateSplitInputs> inputs = std::vector<EvaluateSplitInputs>{input};
dh::TemporaryArray<GPUExpandEntry> out_entries(1);
-this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
-dh::ToSpan(out_entries));
+this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs),
+shared_inputs, dh::ToSpan(out_entries));
GPUExpandEntry root_entry;
dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
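Every change in this file follows one pattern: EvaluateSplits and EvaluateSingleSplit now take a Context, and each dh::LaunchN call submits work to ctx->CUDACtx()->Stream() instead of the legacy default stream. A minimal, self-contained CUDA sketch of the same idea, independent of XGBoost's dh:: helpers (the Scale kernel and the sizes are illustrative only, not part of the library):

#include <cuda_runtime.h>

#include <cstddef>
#include <cstdio>

__global__ void Scale(float *data, std::size_t n, float factor) {
  std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    data[i] *= factor;
  }
}

int main() {
  constexpr std::size_t kN = 1 << 20;
  float *data = nullptr;
  cudaMalloc(&data, kN * sizeof(float));

  // Create a dedicated stream, analogous to the stream owned by the CUDA context.
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Launch on that stream; the work is not ordered against the default stream.
  Scale<<<(kN + 255) / 256, 256, 0, stream>>>(data, kN, 2.0f);

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  cudaFree(data);
  std::printf("done\n");
  return 0;
}

Work queued on an explicit stream is only ordered against other work on the same stream, which is why the stream has to be threaded through every launch site rather than picked up implicitly.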

View File

@@ -193,7 +193,7 @@ class GPUHistEvaluator {
/**
* \brief Evaluate splits for left and right nodes.
*/
-void EvaluateSplits(const std::vector<bst_node_t> &nidx,
+void EvaluateSplits(Context const* ctx, const std::vector<bst_node_t> &nidx,
bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
@@ -201,7 +201,7 @@ class GPUHistEvaluator {
/**
* \brief Evaluate splits for root node.
*/
-GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs input,
+GPUExpandEntry EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
EvaluateSplitSharedInputs shared_inputs);
};
} // namespace tree

View File

@@ -74,7 +74,7 @@ common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator) {
dh::XGBCachingDeviceAllocator<char> alloc;
auto sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
-dh::Iota(sorted_idx);
+dh::Iota(sorted_idx, dh::DefaultStream());
auto data = this->SortInput(d_inputs.size(), shared_inputs.feature_values.size());
auto it = thrust::make_counting_iterator(0u);
auto d_feature_idx = dh::ToSpan(feature_idx_);
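dh::Iota gains an explicit stream argument; passing dh::DefaultStream() keeps the current behaviour while making the stream a parameter. In plain Thrust the analogous move is binding the algorithm to a stream through its execution policy; a small sketch, assuming only standard Thrust and not any XGBoost helper:

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sequence.h>
#include <thrust/system/cuda/execution_policy.h>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  thrust::device_vector<unsigned> idx(16);
  // Fill idx with 0, 1, 2, ... on the chosen stream rather than the default one.
  thrust::sequence(thrust::cuda::par.on(stream), idx.begin(), idx.end());

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  return 0;
}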

View File

@@ -16,8 +16,7 @@
#include "row_partitioner.cuh"
#include "xgboost/base.h"
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
namespace {
struct Pair {
GradientPair first;
@@ -53,7 +52,8 @@ struct Clip : public thrust::unary_function<GradientPair, Pair> {
*
* to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
*/
-GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info) {
+GradientQuantiser::GradientQuantiser(Context const*, common::Span<GradientPair const> gpair,
+MetaInfo const& info) {
using GradientSumT = GradientPairPrecise;
using T = typename GradientSumT::ValueT;
dh::XGBCachingDeviceAllocator<char> alloc;
@@ -100,7 +100,6 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair, Met
static_cast<T>(1) / to_floating_point_.GetHess());
}
XGBOOST_DEV_INLINE void
AtomicAddGpairShared(xgboost::GradientPairInt64 *dest,
xgboost::GradientPairInt64 const &gpair) {
@@ -333,6 +332,4 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
dh::safe_cuda(cudaGetLastError());
}
-} // namespace tree
-} // namespace xgboost
+} // namespace xgboost::tree
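Apart from the (currently unused) Context parameter added to the GradientQuantiser constructor, this file switches to a C++17 nested namespace definition. A trivial stand-alone illustration of the equivalence (Foo and Bar are throwaway names):

// Pre-C++17 style: two nested blocks and two closing braces.
namespace xgboost {
namespace tree {
struct Foo {};
}  // namespace tree
}  // namespace xgboost

// C++17 style: one block, one closing brace, same enclosing namespace.
namespace xgboost::tree {
struct Bar {};
}  // namespace xgboost::tree

int main() {
  xgboost::tree::Foo foo;
  xgboost::tree::Bar bar;
  (void)foo;
  (void)bar;
  return 0;
}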

View File

@@ -39,18 +39,20 @@ private:
GradientPairPrecise to_floating_point_;
public:
-GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info);
-XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
+GradientQuantiser(Context const* ctx, common::Span<GradientPair const> gpair, MetaInfo const& info);
+[[nodiscard]] XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
-gpair.GetHess() * to_fixed_point_.GetHess());
+gpair.GetHess() * to_fixed_point_.GetHess());
return adjusted;
}
-XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPairPrecise const& gpair) const {
+[[nodiscard]] XGBOOST_DEVICE GradientPairInt64
+ToFixedPoint(GradientPairPrecise const& gpair) const {
auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
-gpair.GetHess() * to_fixed_point_.GetHess());
+gpair.GetHess() * to_fixed_point_.GetHess());
return adjusted;
}
-XGBOOST_DEVICE GradientPairPrecise ToFloatingPoint(const GradientPairInt64&gpair) const {
+[[nodiscard]] XGBOOST_DEVICE GradientPairPrecise
+ToFloatingPoint(const GradientPairInt64& gpair) const {
auto g = gpair.GetQuantisedGrad() * to_floating_point_.GetGrad();
auto h = gpair.GetQuantisedHess() * to_floating_point_.GetHess();
return {g,h};
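The quantiser keeps two reciprocal scale pairs: ToFixedPoint multiplies gradients into 64-bit integers so that the GPU reduction is reproducible, and ToFloatingPoint divides the summed integers back. A stripped-down sketch of that round trip, with a hypothetical scale factor standing in for the data-dependent one computed in the constructor:

#include <cstdint>
#include <cstdio>

int main() {
  double const grad = 0.123456789;
  // Hypothetical scale; the real quantiser derives it from the gradient statistics.
  double const to_fixed_point = static_cast<double>(1ull << 40);
  double const to_floating_point = 1.0 / to_fixed_point;

  // ToFixedPoint: scale and truncate to an integer that can be summed exactly.
  auto const quantised = static_cast<std::int64_t>(grad * to_fixed_point);
  // ToFloatingPoint: undo the scaling after the reduction.
  double const recovered = static_cast<double>(quantised) * to_floating_point;

  std::printf("original %.12f, recovered %.12f\n", grad, recovered);
  return 0;
}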

View File

@@ -171,7 +171,8 @@ class HistogramBuilder {
}
}
-void SyncHistogram(RegTree const *p_tree, std::vector<bst_node_t> const &nodes_to_build,
+void SyncHistogram(Context const *, RegTree const *p_tree,
+std::vector<bst_node_t> const &nodes_to_build,
std::vector<bst_node_t> const &nodes_to_trick) {
auto n_total_bins = buffer_.TotalBins();
common::BlockedSpace2d space(
@@ -277,14 +278,14 @@ class MultiHistogramBuilder {
}
for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) {
-this->target_builders_[t].SyncHistogram(p_tree, nodes, dummy_sub);
+this->target_builders_[t].SyncHistogram(ctx_, p_tree, nodes, dummy_sub);
}
}
/**
* @brief Build histogram for left and right child of valid candidates
*/
template <typename Partitioner, typename ExpandEntry>
-void BuildHistLeftRight(DMatrix *p_fmat, RegTree const *p_tree,
+void BuildHistLeftRight(Context const *ctx, DMatrix *p_fmat, RegTree const *p_tree,
std::vector<Partitioner> const &partitioners,
std::vector<ExpandEntry> const &valid_candidates,
linalg::MatrixView<GradientPair const> gpair, BatchParam const &param,
@@ -318,7 +319,7 @@ class MultiHistogramBuilder {
}
for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) {
-this->target_builders_[t].SyncHistogram(p_tree, nodes_to_build, nodes_to_sub);
+this->target_builders_[t].SyncHistogram(ctx, p_tree, nodes_to_build, nodes_to_sub);
}
}

View File

@@ -12,7 +12,7 @@
namespace xgboost::tree {
DMLC_REGISTER_PARAMETER(HistMakerTrainParam);
-void HistMakerTrainParam::CheckTreesSynchronized(RegTree const* local_tree) const {
+void HistMakerTrainParam::CheckTreesSynchronized(Context const*, RegTree const* local_tree) const {
if (!this->debug_synchronize) {
return;
}
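Several of the new Context parameters, like the one above and the one in SyncHistogram, are accepted but not used yet, so they stay unnamed in the definition. A small stand-alone illustration of that convention (Context and Check here are stand-ins, not XGBoost types):

#include <iostream>

struct Context {};  // stand-in for xgboost::Context

// Leaving the pointer unnamed keeps every call site passing the context already,
// without triggering unused-parameter warnings until the body actually needs it.
void Check(Context const* /*ctx*/, int local_value) {
  std::cout << "checked " << local_value << '\n';
}

int main() {
  Context ctx;
  Check(&ctx, 42);
  return 0;
}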

View File

@@ -15,7 +15,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
bool debug_synchronize{false};
std::size_t max_cached_hist_node{DefaultNodes()};
-void CheckTreesSynchronized(RegTree const* local_tree) const;
+void CheckTreesSynchronized(Context const* ctx, RegTree const* local_tree) const;
// declare parameters
DMLC_DECLARE_PARAMETER(HistMakerTrainParam) {

View File

@@ -140,7 +140,7 @@ class GloablApproxBuilder {
std::vector<GradientPair> const &gpair, common::Span<float> hess) {
monitor_->Start(__func__);
this->histogram_builder_.BuildHistLeftRight(
-p_fmat, p_tree, partitioner_, valid_candidates,
+ctx_, p_fmat, p_tree, partitioner_, valid_candidates,
linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1), BatchSpec(*param_, hess));
monitor_->Stop(__func__);
}
@@ -248,8 +248,7 @@ class GlobalApproxUpdater : public TreeUpdater {
std::unique_ptr<GloablApproxBuilder> pimpl_;
// pointer to the last DMatrix, used for update prediction cache.
DMatrix *cached_{nullptr};
-std::shared_ptr<common::ColumnSampler> column_sampler_ =
-std::make_shared<common::ColumnSampler>();
+std::shared_ptr<common::ColumnSampler> column_sampler_;
ObjInfo const *task_;
HistMakerTrainParam hist_param_;
@@ -284,6 +283,9 @@ class GlobalApproxUpdater : public TreeUpdater {
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree *> &trees) override {
CHECK(hist_param_.GetInitialised());
+if (!column_sampler_) {
+column_sampler_ = common::MakeColumnSampler(ctx_);
+}
pimpl_ = std::make_unique<GloablApproxBuilder>(param, &hist_param_, m->Info(), ctx_,
column_sampler_, task_, &monitor_);
@@ -300,7 +302,7 @@ class GlobalApproxUpdater : public TreeUpdater {
std::size_t t_idx = 0;
for (auto p_tree : trees) {
this->pimpl_->UpdateTree(m, s_gpair, hess, p_tree, &out_position[t_idx]);
-hist_param_.CheckTreesSynchronized(p_tree);
+hist_param_.CheckTreesSynchronized(ctx_, p_tree);
++t_idx;
}
}
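The column sampler also changes from being default-constructed in the member initialiser to being created lazily inside Update, where common::MakeColumnSampler(ctx_) can be given the runtime Context. A generic sketch of that lazy-initialisation pattern, with stand-in types for the XGBoost ones:

#include <memory>

struct Context {};

struct ColumnSampler {
  explicit ColumnSampler(Context const* /*ctx*/) {}
};

// Stand-in for common::MakeColumnSampler: a factory that needs the context.
std::shared_ptr<ColumnSampler> MakeColumnSampler(Context const* ctx) {
  return std::make_shared<ColumnSampler>(ctx);
}

class Updater {
  Context const* ctx_{nullptr};
  // No in-class "= std::make_shared<...>()" any more: the context is only
  // known once the updater is configured, so the sampler is created lazily.
  std::shared_ptr<ColumnSampler> column_sampler_;

 public:
  explicit Updater(Context const* ctx) : ctx_{ctx} {}

  void Update() {
    if (!column_sampler_) {
      column_sampler_ = MakeColumnSampler(ctx_);
    }
    // ... use *column_sampler_ ...
  }
};

int main() {
  Context ctx;
  Updater updater{&ctx};
  updater.Update();  // sampler constructed on first use
  updater.Update();  // reused afterwards
  return 0;
}

The same pattern shows up again in ColMaker and QuantileHistMaker below.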

View File

@@ -225,9 +225,12 @@ class ColMaker: public TreeUpdater {
}
}
{
-column_sampler_.Init(ctx_, fmat.Info().num_col_,
-fmat.Info().feature_weights.ConstHostVector(), param_.colsample_bynode,
-param_.colsample_bylevel, param_.colsample_bytree);
+if (!column_sampler_) {
+column_sampler_ = common::MakeColumnSampler(ctx_);
+}
+column_sampler_->Init(
+ctx_, fmat.Info().num_col_, fmat.Info().feature_weights.ConstHostVector(),
+param_.colsample_bynode, param_.colsample_bylevel, param_.colsample_bytree);
}
{
// setup temp space for each thread
@@ -467,7 +470,7 @@ class ColMaker: public TreeUpdater {
RegTree *p_tree) {
auto evaluator = tree_evaluator_.GetEvaluator();
-auto feat_set = column_sampler_.GetFeatureSet(depth);
+auto feat_set = column_sampler_->GetFeatureSet(depth);
for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>(ctx_)) {
this->UpdateSolution(batch, feat_set->HostVector(), gpair, p_fmat);
}
@@ -586,7 +589,7 @@ class ColMaker: public TreeUpdater {
const ColMakerTrainParam& colmaker_train_param_;
// number of omp thread used during training
Context const* ctx_;
-common::ColumnSampler column_sampler_;
+std::shared_ptr<common::ColumnSampler> column_sampler_;
// Instance Data: current node position in the tree of each instance
std::vector<int> position_;
// PerThread x PerTreeNode: statistics for per thread construction

View File

@@ -247,7 +247,7 @@ struct GPUHistMakerDevice {
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
dmat->Info().IsColumnSplit(), ctx_->Device());
-quantiser = std::make_unique<GradientQuantiser>(this->gpair, dmat->Info());
+quantiser = std::make_unique<GradientQuantiser>(ctx_, this->gpair, dmat->Info());
row_partitioner.reset(); // Release the device memory first before reallocating
row_partitioner = std::make_unique<RowPartitioner>(ctx_->Device(), sample.sample_rows);
@@ -277,7 +277,7 @@ struct GPUHistMakerDevice {
matrix.min_fvalue,
matrix.is_dense && !collective::IsDistributed()
};
-auto split = this->evaluator_.EvaluateSingleSplit(inputs, shared_inputs);
+auto split = this->evaluator_.EvaluateSingleSplit(ctx_, inputs, shared_inputs);
return split;
}
@@ -330,7 +330,7 @@ struct GPUHistMakerDevice {
d_node_inputs.data().get(), h_node_inputs.data(),
h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
-this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
+this->evaluator_.EvaluateSplits(ctx_, nidx, max_active_features, dh::ToSpan(d_node_inputs),
shared_inputs, dh::ToSpan(entries));
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
@@ -848,7 +848,7 @@ class GPUHistMaker : public TreeUpdater {
std::size_t t_idx{0};
for (xgboost::RegTree* tree : trees) {
this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]);
-this->hist_maker_param_.CheckTreesSynchronized(tree);
+this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree);
++t_idx;
}
@@ -992,7 +992,7 @@ class GPUGlobalApproxMaker : public TreeUpdater {
std::size_t t_idx{0};
for (xgboost::RegTree* tree : trees) {
this->UpdateTree(gpair->Data(), p_fmat, tree, &out_position[t_idx]);
-this->hist_maker_param_.CheckTreesSynchronized(tree);
+this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree);
++t_idx;
}

View File

@@ -1,5 +1,5 @@
/**
-* Copyright 2017-2023 by XGBoost Contributors
+* Copyright 2017-2023, XGBoost Contributors
* \file updater_quantile_hist.cc
* \brief use quantized feature values to construct a tree
* \author Philip Cho, Tianqi Checn, Egor Smirnov
@@ -228,8 +228,8 @@ class MultiTargetHistBuilder {
std::vector<MultiExpandEntry> const &valid_candidates,
linalg::MatrixView<GradientPair const> gpair) {
monitor_->Start(__func__);
-histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates, gpair,
-HistBatch(param_));
+histogram_builder_->BuildHistLeftRight(ctx_, p_fmat, p_tree, partitioner_, valid_candidates,
+gpair, HistBatch(param_));
monitor_->Stop(__func__);
}
@@ -436,8 +436,8 @@ class HistUpdater {
std::vector<CPUExpandEntry> const &valid_candidates,
linalg::MatrixView<GradientPair const> gpair) {
monitor_->Start(__func__);
-this->histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates,
-gpair, HistBatch(param_));
+this->histogram_builder_->BuildHistLeftRight(ctx_, p_fmat, p_tree, partitioner_,
+valid_candidates, gpair, HistBatch(param_));
monitor_->Stop(__func__);
}
@@ -470,8 +470,7 @@ class HistUpdater {
class QuantileHistMaker : public TreeUpdater {
std::unique_ptr<HistUpdater> p_impl_{nullptr};
std::unique_ptr<MultiTargetHistBuilder> p_mtimpl_{nullptr};
-std::shared_ptr<common::ColumnSampler> column_sampler_ =
-std::make_shared<common::ColumnSampler>();
+std::shared_ptr<common::ColumnSampler> column_sampler_;
common::Monitor monitor_;
ObjInfo const *task_{nullptr};
HistMakerTrainParam hist_param_;
@@ -495,6 +494,10 @@ class QuantileHistMaker : public TreeUpdater {
void Update(TrainParam const *param, linalg::Matrix<GradientPair> *gpair, DMatrix *p_fmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree *> &trees) override {
+if (!column_sampler_) {
+column_sampler_ = common::MakeColumnSampler(ctx_);
+}
if (trees.front()->IsMultiTarget()) {
CHECK(hist_param_.GetInitialised());
CHECK(param->monotone_constraints.empty()) << "monotone constraint" << MTNotImplemented();
@@ -537,7 +540,7 @@ class QuantileHistMaker : public TreeUpdater {
h_out_position, *tree_it);
}
-hist_param_.CheckTreesSynchronized(*tree_it);
+hist_param_.CheckTreesSynchronized(ctx_, *tree_it);
}
}