Define CUDA Context. (#8604)

We will transition to a non-default, non-blocking CUDA stream.
This commit is contained in:
Jiaming Yuan
2022-12-20 15:15:07 +08:00
committed by GitHub
parent e01639548a
commit c6a8754c62
11 changed files with 120 additions and 62 deletions

View File

@@ -0,0 +1,28 @@
/**
* Copyright 2022 by XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_CUDA_CONTEXT_CUH_
#define XGBOOST_COMMON_CUDA_CONTEXT_CUH_
#include <thrust/execution_policy.h>
#include "device_helpers.cuh"
namespace xgboost {
struct CUDAContext {
  /**
   * \brief CUDA stream associated with this context; currently the value returned by
   *        dh::DefaultStream().
   */
  auto Stream() const { return dh::DefaultStream(); }
  /**
   * \brief Thrust execution policy that carries the caching device allocator and is
   *        bound to dh::DefaultStream().
   */
  auto CTP() const { return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream()); }
  /**
   * \brief Thrust execution policy that carries the plain (non-caching) device allocator
   *        and is bound to dh::DefaultStream().
   */
  auto TP() const { return thrust::cuda::par(alloc_).on(dh::DefaultStream()); }

 private:
  // Allocator handed to Thrust by CTP().
  dh::XGBCachingDeviceAllocator<char> caching_alloc_;
  // Allocator handed to Thrust by TP().
  dh::XGBDeviceAllocator<char> alloc_;
};
} // namespace xgboost
#endif // XGBOOST_COMMON_CUDA_CONTEXT_CUH_

View File

@@ -5,7 +5,7 @@
*/
#include <xgboost/context.h>
#include "common/common.h"
#include "common/common.h" // AssertGPUSupport
#include "common/threading_utils.h"
namespace xgboost {
@@ -59,4 +59,11 @@ std::int32_t Context::Threads() const {
}
return n_threads;
}
#if !defined(XGBOOST_USE_CUDA)
// Definition of Context::CUDACtx() used when XGBOOST_USE_CUDA is not defined.
// It calls common::AssertGPUSupport() and then returns nullptr, so no CUDAContext
// is ever handed out from this translation unit.
// NOTE(review): presumably AssertGPUSupport() reports the missing GPU support in
// such builds -- confirm against common/common.h.
CUDAContext const* Context::CUDACtx() const {
common::AssertGPUSupport();
return nullptr;
}
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace xgboost

14
src/context.cu Normal file
View File

@@ -0,0 +1,14 @@
/**
* Copyright 2022 by XGBoost Contributors
*/
#include "common/cuda_context.cuh" // CUDAContext
#include "xgboost/context.h"
namespace xgboost {
/**
 * \brief Return the CUDA context owned by this Context, constructing it on first use.
 *
 * The returned pointer is whatever cuctx_ holds; a new CUDAContext is created only when
 * cuctx_ is still empty.
 */
CUDAContext const* Context::CUDACtx() const {
  // Fast path: the context has already been created by an earlier call.
  if (cuctx_) {
    return cuctx_.get();
  }
  // First call: create the context, then hand out the stored pointer.
  cuctx_.reset(new CUDAContext{});
  return cuctx_.get();
}
} // namespace xgboost

View File

@@ -1,18 +1,19 @@
/*!
* Copyright 2019-2021 by XGBoost Contributors
/**
* Copyright 2019-2022 by XGBoost Contributors
*
* \file data.cu
* \brief Handles setting metainfo from array interface.
*/
#include "xgboost/data.h"
#include "xgboost/logging.h"
#include "xgboost/json.h"
#include "array_interface.h"
#include "../common/cuda_context.cuh"
#include "../common/device_helpers.cuh"
#include "../common/linalg_op.cuh"
#include "array_interface.h"
#include "device_adapter.cuh"
#include "simple_dmatrix.h"
#include "validation.h"
#include "xgboost/data.h"
#include "xgboost/json.h"
#include "xgboost/logging.h"
namespace xgboost {
namespace {
@@ -25,7 +26,7 @@ auto SetDeviceToPtr(void const* ptr) {
}
template <typename T, int32_t D>
void CopyTensorInfoImpl(Json arr_interface, linalg::Tensor<T, D>* p_out) {
void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tensor<T, D>* p_out) {
ArrayInterface<D> array(arr_interface);
if (array.n == 0) {
p_out->SetDevice(0);
@@ -43,15 +44,19 @@ void CopyTensorInfoImpl(Json arr_interface, linalg::Tensor<T, D>* p_out) {
// set data
data->Resize(array.n);
dh::safe_cuda(cudaMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
cudaMemcpyDefault));
cudaMemcpyDefault, ctx->Stream()));
});
return;
}
p_out->Reshape(array.shape);
auto t = p_out->View(ptr_device);
linalg::ElementWiseTransformDevice(t, [=] __device__(size_t i, T) {
return linalg::detail::Apply(TypedIndex<T, D>{array}, linalg::UnravelIndex<D>(i, array.shape));
});
linalg::ElementWiseTransformDevice(
t,
[=] __device__(size_t i, T) {
return linalg::detail::Apply(TypedIndex<T, D>{array},
linalg::UnravelIndex<D>(i, array.shape));
},
ctx->Stream());
}
void CopyGroupInfoImpl(ArrayInterface<1> column, std::vector<bst_group_t>* out) {
@@ -115,14 +120,13 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
}
} // namespace
// Context is not used until we have CUDA stream.
void MetaInfo::SetInfoFromCUDA(Context const&, StringView key, Json array) {
void MetaInfo::SetInfoFromCUDA(Context const& ctx, StringView key, Json array) {
// multi-dim float info
if (key == "base_margin") {
CopyTensorInfoImpl(array, &base_margin_);
CopyTensorInfoImpl(ctx.CUDACtx(), array, &base_margin_);
return;
} else if (key == "label") {
CopyTensorInfoImpl(array, &labels);
CopyTensorInfoImpl(ctx.CUDACtx(), array, &labels);
auto ptr = labels.Data()->ConstDevicePointer();
auto valid = thrust::none_of(thrust::device, ptr, ptr + labels.Size(), data::LabelsCheck{});
CHECK(valid) << "Label contains NaN, infinity or a value too large.";
@@ -142,7 +146,7 @@ void MetaInfo::SetInfoFromCUDA(Context const&, StringView key, Json array) {
}
// float info
linalg::Tensor<float, 1> t;
CopyTensorInfoImpl(array, &t);
CopyTensorInfoImpl(ctx.CUDACtx(), array, &t);
if (key == "weight") {
this->weights_ = std::move(*t.Data());
auto ptr = weights_.ConstDevicePointer();
@@ -156,7 +160,7 @@ void MetaInfo::SetInfoFromCUDA(Context const&, StringView key, Json array) {
this->feature_weights = std::move(*t.Data());
auto d_feature_weights = feature_weights.ConstDeviceSpan();
auto valid =
thrust::none_of(thrust::device, d_feature_weights.data(),
thrust::none_of(ctx.CUDACtx()->CTP(), d_feature_weights.data(),
d_feature_weights.data() + d_feature_weights.size(), data::WeightsCheck{});
CHECK(valid) << "Feature weight must be greater than 0.";
} else {

View File

@@ -35,7 +35,7 @@
#include "common/version.h"
#include "xgboost/base.h"
#include "xgboost/c_api.h"
#include "xgboost/context.h"
#include "xgboost/context.h" // Context
#include "xgboost/data.h"
#include "xgboost/feature_map.h"
#include "xgboost/gbm.h"

View File

@@ -267,12 +267,12 @@ __global__ void __launch_bounds__(kBlockThreads)
}
}
void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix,
FeatureGroupsAccessor const& feature_groups,
common::Span<GradientPair const> gpair,
common::Span<const uint32_t> d_ridx,
common::Span<GradientPairInt64> histogram,
GradientQuantiser rounding, bool force_global_memory) {
common::Span<GradientPairInt64> histogram, GradientQuantiser rounding,
bool force_global_memory) {
// decide whether to use shared memory
int device = 0;
dh::safe_cuda(cudaGetDevice(&device));
@@ -318,9 +318,9 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
min(grid_size,
unsigned(common::DivRoundUp(items_per_group, kMinItemsPerBlock)));
dh::LaunchKernel {dim3(grid_size, num_groups),
static_cast<uint32_t>(kBlockThreads), smem_size}(
kernel, matrix, feature_groups, d_ridx, histogram.data(), gpair.data(), rounding);
dh::LaunchKernel{dim3(grid_size, num_groups), static_cast<uint32_t>(kBlockThreads), smem_size,
ctx->Stream()} (kernel, matrix, feature_groups, d_ridx, histogram.data(),
gpair.data(), rounding);
};
if (shared) {

View File

@@ -5,9 +5,9 @@
#define HISTOGRAM_CUH_
#include <thrust/transform.h>
#include "feature_groups.cuh"
#include "../../common/cuda_context.cuh"
#include "../../data/ellpack_page.cuh"
#include "feature_groups.cuh"
namespace xgboost {
namespace tree {
@@ -56,12 +56,11 @@ public:
}
};
void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix,
FeatureGroupsAccessor const& feature_groups,
common::Span<GradientPair const> gpair,
common::Span<const uint32_t> ridx,
common::Span<GradientPairInt64> histogram,
GradientQuantiser rounding,
common::Span<GradientPairInt64> histogram, GradientQuantiser rounding,
bool force_global_memory = false);
} // namespace tree
} // namespace xgboost

View File

@@ -20,6 +20,7 @@
#include "../common/io.h"
#include "../common/timer.h"
#include "../data/ellpack_page.cuh"
#include "../common/cuda_context.cuh" // CUDAContext
#include "constraints.cuh"
#include "driver.h"
#include "gpu_hist/evaluate_splits.cuh"
@@ -344,9 +345,9 @@ struct GPUHistMakerDevice {
void BuildHist(int nidx) {
auto d_node_hist = hist.GetNodeHistogram(nidx);
auto d_ridx = row_partitioner->GetRows(nidx);
BuildGradientHistogram(page->GetDeviceAccessor(ctx_->gpu_id),
feature_groups->DeviceAccessor(ctx_->gpu_id), gpair,
d_ridx, d_node_hist, *quantiser);
BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->gpu_id),
feature_groups->DeviceAccessor(ctx_->gpu_id), gpair, d_ridx, d_node_hist,
*quantiser);
}
// Attempt to do subtraction trick
@@ -646,7 +647,7 @@ struct GPUHistMakerDevice {
return quantiser.ToFixedPoint(gpair);
});
GradientPairInt64 root_sum_quantised =
dh::Reduce(thrust::cuda::par(alloc), gpair_it, gpair_it + gpair.size(),
dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(),
GradientPairInt64{}, thrust::plus<GradientPairInt64>{});
using ReduceT = typename decltype(root_sum_quantised)::ValueT;
collective::Allreduce<collective::Operation::kSum>(