[coll] Pass context to various functions. (#9772)
* [coll] Pass context to various functions. In the future, the `Context` object will be required for collective operations. This PR passes the context object to some of the functions that require it, in preparation for swapping out the implementation.
This commit is contained in:
@@ -395,11 +395,11 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
|
||||
}
|
||||
}
|
||||
|
||||
void GPUHistEvaluator::EvaluateSplits(
|
||||
const std::vector<bst_node_t> &nidx, bst_feature_t max_active_features,
|
||||
common::Span<const EvaluateSplitInputs> d_inputs,
|
||||
EvaluateSplitSharedInputs shared_inputs,
|
||||
common::Span<GPUExpandEntry> out_entries) {
|
||||
void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector<bst_node_t> &nidx,
|
||||
bst_feature_t max_active_features,
|
||||
common::Span<const EvaluateSplitInputs> d_inputs,
|
||||
EvaluateSplitSharedInputs shared_inputs,
|
||||
common::Span<GPUExpandEntry> out_entries) {
|
||||
auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();
|
||||
|
||||
dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
|
||||
@@ -417,19 +417,20 @@ void GPUHistEvaluator::EvaluateSplits(
|
||||
out_splits.size() * sizeof(DeviceSplitCandidate));
|
||||
|
||||
// Reduce to get the best candidate from all workers.
|
||||
dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) {
|
||||
out_splits[i] = all_candidates[i];
|
||||
for (auto rank = 1; rank < world_size; rank++) {
|
||||
out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
|
||||
}
|
||||
});
|
||||
dh::LaunchN(out_splits.size(), ctx->CUDACtx()->Stream(),
|
||||
[world_size, all_candidates, out_splits] __device__(size_t i) {
|
||||
out_splits[i] = all_candidates[i];
|
||||
for (auto rank = 1; rank < world_size; rank++) {
|
||||
out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
|
||||
auto d_entries = out_entries;
|
||||
auto device_cats_accessor = this->DeviceCatStorage(nidx);
|
||||
// turn candidate into entry, along with handling sort based split.
|
||||
dh::LaunchN(d_inputs.size(), [=] __device__(size_t i) mutable {
|
||||
dh::LaunchN(d_inputs.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t i) mutable {
|
||||
auto const input = d_inputs[i];
|
||||
auto &split = out_splits[i];
|
||||
// Subtract parent gain here
|
||||
@@ -464,12 +465,12 @@ void GPUHistEvaluator::EvaluateSplits(
|
||||
this->CopyToHost(nidx);
|
||||
}
|
||||
|
||||
GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
|
||||
EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs) {
|
||||
GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
|
||||
EvaluateSplitSharedInputs shared_inputs) {
|
||||
dh::device_vector<EvaluateSplitInputs> inputs = std::vector<EvaluateSplitInputs>{input};
|
||||
dh::TemporaryArray<GPUExpandEntry> out_entries(1);
|
||||
this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
|
||||
dh::ToSpan(out_entries));
|
||||
this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs),
|
||||
shared_inputs, dh::ToSpan(out_entries));
|
||||
GPUExpandEntry root_entry;
|
||||
dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
|
||||
cudaMemcpyDeviceToHost));
|
||||
|
||||
@@ -193,7 +193,7 @@ class GPUHistEvaluator {
|
||||
/**
|
||||
* \brief Evaluate splits for left and right nodes.
|
||||
*/
|
||||
void EvaluateSplits(const std::vector<bst_node_t> &nidx,
|
||||
void EvaluateSplits(Context const* ctx, const std::vector<bst_node_t> &nidx,
|
||||
bst_feature_t max_active_features,
|
||||
common::Span<const EvaluateSplitInputs> d_inputs,
|
||||
EvaluateSplitSharedInputs shared_inputs,
|
||||
@@ -201,7 +201,7 @@ class GPUHistEvaluator {
|
||||
/**
|
||||
* \brief Evaluate splits for root node.
|
||||
*/
|
||||
GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs input,
|
||||
GPUExpandEntry EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
|
||||
EvaluateSplitSharedInputs shared_inputs);
|
||||
};
|
||||
} // namespace tree
|
||||
|
||||
@@ -16,8 +16,7 @@
|
||||
#include "row_partitioner.cuh"
|
||||
#include "xgboost/base.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace xgboost::tree {
|
||||
namespace {
|
||||
struct Pair {
|
||||
GradientPair first;
|
||||
@@ -53,7 +52,8 @@ struct Clip : public thrust::unary_function<GradientPair, Pair> {
|
||||
*
|
||||
* to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
|
||||
*/
|
||||
GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info) {
|
||||
GradientQuantiser::GradientQuantiser(Context const*, common::Span<GradientPair const> gpair,
|
||||
MetaInfo const& info) {
|
||||
using GradientSumT = GradientPairPrecise;
|
||||
using T = typename GradientSumT::ValueT;
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
@@ -99,7 +99,6 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair, Met
|
||||
static_cast<T>(1) / to_floating_point_.GetHess());
|
||||
}
|
||||
|
||||
|
||||
XGBOOST_DEV_INLINE void
|
||||
AtomicAddGpairShared(xgboost::GradientPairInt64 *dest,
|
||||
xgboost::GradientPairInt64 const &gpair) {
|
||||
@@ -314,6 +313,4 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
|
||||
|
||||
dh::safe_cuda(cudaGetLastError());
|
||||
}
|
||||
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::tree
|
||||
|
||||
@@ -39,18 +39,20 @@ private:
|
||||
GradientPairPrecise to_floating_point_;
|
||||
|
||||
public:
|
||||
GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info);
|
||||
XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
|
||||
GradientQuantiser(Context const* ctx, common::Span<GradientPair const> gpair, MetaInfo const& info);
|
||||
[[nodiscard]] XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
|
||||
auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
|
||||
gpair.GetHess() * to_fixed_point_.GetHess());
|
||||
gpair.GetHess() * to_fixed_point_.GetHess());
|
||||
return adjusted;
|
||||
}
|
||||
XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPairPrecise const& gpair) const {
|
||||
[[nodiscard]] XGBOOST_DEVICE GradientPairInt64
|
||||
ToFixedPoint(GradientPairPrecise const& gpair) const {
|
||||
auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
|
||||
gpair.GetHess() * to_fixed_point_.GetHess());
|
||||
gpair.GetHess() * to_fixed_point_.GetHess());
|
||||
return adjusted;
|
||||
}
|
||||
XGBOOST_DEVICE GradientPairPrecise ToFloatingPoint(const GradientPairInt64&gpair) const {
|
||||
[[nodiscard]] XGBOOST_DEVICE GradientPairPrecise
|
||||
ToFloatingPoint(const GradientPairInt64& gpair) const {
|
||||
auto g = gpair.GetQuantisedGrad() * to_floating_point_.GetGrad();
|
||||
auto h = gpair.GetQuantisedHess() * to_floating_point_.GetHess();
|
||||
return {g,h};
|
||||
|
||||
Reference in New Issue
Block a user