Define CUDA Context. (#8604)

We will transition to non-default and non-blocking CUDA stream.
This commit is contained in:
Jiaming Yuan
2022-12-20 15:15:07 +08:00
committed by GitHub
parent e01639548a
commit c6a8754c62
11 changed files with 120 additions and 62 deletions

View File

@@ -267,12 +267,12 @@ __global__ void __launch_bounds__(kBlockThreads)
}
}
void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix,
FeatureGroupsAccessor const& feature_groups,
common::Span<GradientPair const> gpair,
common::Span<const uint32_t> d_ridx,
common::Span<GradientPairInt64> histogram,
GradientQuantiser rounding, bool force_global_memory) {
common::Span<GradientPairInt64> histogram, GradientQuantiser rounding,
bool force_global_memory) {
// decide whether to use shared memory
int device = 0;
dh::safe_cuda(cudaGetDevice(&device));
@@ -318,9 +318,9 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
min(grid_size,
unsigned(common::DivRoundUp(items_per_group, kMinItemsPerBlock)));
dh::LaunchKernel {dim3(grid_size, num_groups),
static_cast<uint32_t>(kBlockThreads), smem_size}(
kernel, matrix, feature_groups, d_ridx, histogram.data(), gpair.data(), rounding);
dh::LaunchKernel{dim3(grid_size, num_groups), static_cast<uint32_t>(kBlockThreads), smem_size,
ctx->Stream()} (kernel, matrix, feature_groups, d_ridx, histogram.data(),
gpair.data(), rounding);
};
if (shared) {

View File

@@ -5,9 +5,9 @@
#define HISTOGRAM_CUH_
#include <thrust/transform.h>
#include "feature_groups.cuh"
#include "../../common/cuda_context.cuh"
#include "../../data/ellpack_page.cuh"
#include "feature_groups.cuh"
namespace xgboost {
namespace tree {
@@ -56,12 +56,11 @@ public:
}
};
void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix,
FeatureGroupsAccessor const& feature_groups,
common::Span<GradientPair const> gpair,
common::Span<const uint32_t> ridx,
common::Span<GradientPairInt64> histogram,
GradientQuantiser rounding,
common::Span<GradientPairInt64> histogram, GradientQuantiser rounding,
bool force_global_memory = false);
} // namespace tree
} // namespace xgboost

View File

@@ -20,6 +20,7 @@
#include "../common/io.h"
#include "../common/timer.h"
#include "../data/ellpack_page.cuh"
#include "../common/cuda_context.cuh" // CUDAContext
#include "constraints.cuh"
#include "driver.h"
#include "gpu_hist/evaluate_splits.cuh"
@@ -344,9 +345,9 @@ struct GPUHistMakerDevice {
void BuildHist(int nidx) {
auto d_node_hist = hist.GetNodeHistogram(nidx);
auto d_ridx = row_partitioner->GetRows(nidx);
BuildGradientHistogram(page->GetDeviceAccessor(ctx_->gpu_id),
feature_groups->DeviceAccessor(ctx_->gpu_id), gpair,
d_ridx, d_node_hist, *quantiser);
BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->gpu_id),
feature_groups->DeviceAccessor(ctx_->gpu_id), gpair, d_ridx, d_node_hist,
*quantiser);
}
// Attempt to do subtraction trick
@@ -646,7 +647,7 @@ struct GPUHistMakerDevice {
return quantiser.ToFixedPoint(gpair);
});
GradientPairInt64 root_sum_quantised =
dh::Reduce(thrust::cuda::par(alloc), gpair_it, gpair_it + gpair.size(),
dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(),
GradientPairInt64{}, thrust::plus<GradientPairInt64>{});
using ReduceT = typename decltype(root_sum_quantised)::ValueT;
collective::Allreduce<collective::Operation::kSum>(