[coll] Pass context to various functions. (#9772)

* [coll] Pass context to various functions.

In the future, the `Context` object will be required for collective operations. This PR
passes the context object to some of the required functions to prepare for swapping out
the implementation.
This commit is contained in:
Jiaming Yuan
2023-11-08 09:54:05 +08:00
committed by GitHub
parent 6c0a190f6d
commit 06bdc15e9b
45 changed files with 275 additions and 255 deletions

View File

@@ -395,11 +395,11 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
}
}
void GPUHistEvaluator::EvaluateSplits(
const std::vector<bst_node_t> &nidx, bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
common::Span<GPUExpandEntry> out_entries) {
void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector<bst_node_t> &nidx,
bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
common::Span<GPUExpandEntry> out_entries) {
auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();
dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
@@ -417,19 +417,20 @@ void GPUHistEvaluator::EvaluateSplits(
out_splits.size() * sizeof(DeviceSplitCandidate));
// Reduce to get the best candidate from all workers.
dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) {
out_splits[i] = all_candidates[i];
for (auto rank = 1; rank < world_size; rank++) {
out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
}
});
dh::LaunchN(out_splits.size(), ctx->CUDACtx()->Stream(),
[world_size, all_candidates, out_splits] __device__(size_t i) {
out_splits[i] = all_candidates[i];
for (auto rank = 1; rank < world_size; rank++) {
out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
}
});
}
auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
auto d_entries = out_entries;
auto device_cats_accessor = this->DeviceCatStorage(nidx);
// turn candidate into entry, along with handling sort based split.
dh::LaunchN(d_inputs.size(), [=] __device__(size_t i) mutable {
dh::LaunchN(d_inputs.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t i) mutable {
auto const input = d_inputs[i];
auto &split = out_splits[i];
// Subtract parent gain here
@@ -464,12 +465,12 @@ void GPUHistEvaluator::EvaluateSplits(
this->CopyToHost(nidx);
}
GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs) {
GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
EvaluateSplitSharedInputs shared_inputs) {
dh::device_vector<EvaluateSplitInputs> inputs = std::vector<EvaluateSplitInputs>{input};
dh::TemporaryArray<GPUExpandEntry> out_entries(1);
this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
dh::ToSpan(out_entries));
this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs),
shared_inputs, dh::ToSpan(out_entries));
GPUExpandEntry root_entry;
dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
cudaMemcpyDeviceToHost));

View File

@@ -193,7 +193,7 @@ class GPUHistEvaluator {
/**
* \brief Evaluate splits for left and right nodes.
*/
void EvaluateSplits(const std::vector<bst_node_t> &nidx,
void EvaluateSplits(Context const* ctx, const std::vector<bst_node_t> &nidx,
bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
@@ -201,7 +201,7 @@ class GPUHistEvaluator {
/**
* \brief Evaluate splits for root node.
*/
GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs input,
GPUExpandEntry EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
EvaluateSplitSharedInputs shared_inputs);
};
} // namespace tree

View File

@@ -16,8 +16,7 @@
#include "row_partitioner.cuh"
#include "xgboost/base.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
namespace {
struct Pair {
GradientPair first;
@@ -53,7 +52,8 @@ struct Clip : public thrust::unary_function<GradientPair, Pair> {
*
* to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
*/
GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info) {
GradientQuantiser::GradientQuantiser(Context const*, common::Span<GradientPair const> gpair,
MetaInfo const& info) {
using GradientSumT = GradientPairPrecise;
using T = typename GradientSumT::ValueT;
dh::XGBCachingDeviceAllocator<char> alloc;
@@ -99,7 +99,6 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair, Met
static_cast<T>(1) / to_floating_point_.GetHess());
}
XGBOOST_DEV_INLINE void
AtomicAddGpairShared(xgboost::GradientPairInt64 *dest,
xgboost::GradientPairInt64 const &gpair) {
@@ -314,6 +313,4 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
dh::safe_cuda(cudaGetLastError());
}
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -39,18 +39,20 @@ private:
GradientPairPrecise to_floating_point_;
public:
GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info);
XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
GradientQuantiser(Context const* ctx, common::Span<GradientPair const> gpair, MetaInfo const& info);
[[nodiscard]] XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
gpair.GetHess() * to_fixed_point_.GetHess());
gpair.GetHess() * to_fixed_point_.GetHess());
return adjusted;
}
XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPairPrecise const& gpair) const {
[[nodiscard]] XGBOOST_DEVICE GradientPairInt64
ToFixedPoint(GradientPairPrecise const& gpair) const {
auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
gpair.GetHess() * to_fixed_point_.GetHess());
gpair.GetHess() * to_fixed_point_.GetHess());
return adjusted;
}
XGBOOST_DEVICE GradientPairPrecise ToFloatingPoint(const GradientPairInt64&gpair) const {
[[nodiscard]] XGBOOST_DEVICE GradientPairPrecise
ToFloatingPoint(const GradientPairInt64& gpair) const {
auto g = gpair.GetQuantisedGrad() * to_floating_point_.GetGrad();
auto h = gpair.GetQuantisedHess() * to_floating_point_.GetHess();
return {g,h};