Check cub errors. (#10721)

- Make sure the CUDA error returned by the cub scan is caught (see the guard sketch below the file summary).
- Avoid a temporary buffer allocation when initializing a thrust device vector.
Jiaming Yuan 2024-08-21 02:50:26 +08:00 committed by GitHub
parent b949a4bf7b
commit 508ac13243
5 changed files with 27 additions and 21 deletions
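
Both the CUDA runtime and cub report failures through a `cudaError_t` return value, which is silently dropped unless the call site inspects it — the bug class this commit closes. A minimal stand-alone sketch of the guard pattern; `SAFE_CUDA` is an illustrative name, XGBoost's own wrapper is `dh::safe_cuda`:

// Minimal sketch of a safe_cuda-style guard, assuming only the CUDA runtime.
#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>

#define SAFE_CUDA(call)                                          \
  do {                                                           \
    cudaError_t e = (call);                                      \
    if (e != cudaSuccess) {                                      \
      std::fprintf(stderr, "%s:%d: CUDA error: %s\n", __FILE__,  \
                   __LINE__, cudaGetErrorString(e));             \
      std::abort();                                              \
    }                                                            \
  } while (0)

int main() {
  void* p = nullptr;
  SAFE_CUDA(cudaMalloc(&p, 256));  // any call returning cudaError_t works,
  SAFE_CUDA(cudaFree(p));          // including cub's dispatch entry points
  return 0;
}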


@@ -309,9 +309,9 @@ ELLPACK_BATCH_SPECIALIZE(data::CudfAdapterBatch)
 ELLPACK_BATCH_SPECIALIZE(data::CupyAdapterBatch)
 namespace {
-void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const> d_row_ptr,
-                        size_t row_stride, common::CompressedByteT* d_compressed_buffer,
-                        size_t null) {
+void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page,
+                        common::Span<size_t const> d_row_ptr, size_t row_stride,
+                        common::CompressedByteT* d_compressed_buffer, size_t null) {
   dh::device_vector<uint8_t> data(page.index.begin(), page.index.end());
   auto d_data = dh::ToSpan(data);
@@ -323,7 +323,8 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
   common::CompressedBufferWriter writer{page.cut.TotalBins() +
                                         static_cast<std::size_t>(1)};  // +1 for null value
-  dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable {
+  auto cuctx = ctx->CUDACtx();
+  dh::LaunchN(row_stride * page.Size(), cuctx->Stream(), [=] __device__(bst_idx_t idx) mutable {
     auto ridx = idx / row_stride;
     auto ifeature = idx % row_stride;
@@ -336,7 +337,7 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
       return;
     }
-    size_t offset = 0;
+    bst_idx_t offset = 0;
     if (!d_csc_indptr.empty()) {
       // is dense, ifeature is the actual feature index.
       offset = d_csc_indptr[ifeature];
@@ -362,7 +363,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
   row_stride = *std::max_element(it, it + page.Size());
   CHECK(ctx->IsCUDA());
-  InitCompressedData(ctx);
+  this->InitCompressedData(ctx);
   // copy gidx
   common::CompressedByteT* d_compressed_buffer = gidx_buffer.data();
@@ -373,7 +374,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
   auto accessor = this->GetDeviceAccessor(ctx->Device(), ft);
   auto null = accessor.NullValue();
-  CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null);
+  this->monitor_.Start("CopyGHistToEllpack");
+  CopyGHistToEllpack(ctx, page, d_row_ptr, row_stride, d_compressed_buffer, null);
+  this->monitor_.Stop("CopyGHistToEllpack");
 }
 // A functor that copies the data from one EllpackPage to another.
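
For context on the `dh::LaunchN` change above: passing `cuctx->Stream()` keeps the copy kernel ordered on the context's stream rather than the default stream. A LaunchN-style helper is essentially a grid-stride loop; the sketch below is illustrative of that shape, not XGBoost's actual implementation:

#include <cuda_runtime.h>

#include <cstddef>
#include <cstdint>

// Grid-stride kernel: each thread handles indices i, i + stride, i + 2*stride, ...
template <typename Fn>
__global__ void LaunchNKernel(std::size_t n, Fn fn) {
  for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += static_cast<std::size_t>(gridDim.x) * blockDim.x) {
    fn(i);
  }
}

// Launch fn(i) for i in [0, n) on an explicit stream.
template <typename Fn>
void LaunchN(std::size_t n, cudaStream_t stream, Fn fn) {
  if (n == 0) {
    return;
  }
  std::uint32_t constexpr kBlockThreads = 256;
  auto n_blocks = static_cast<std::uint32_t>((n + kBlockThreads - 1) / kBlockThreads);
  LaunchNKernel<<<n_blocks, kBlockThreads, 0, stream>>>(n, fn);
}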


@@ -472,7 +472,9 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector<bst_
 GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
                                                      EvaluateSplitSharedInputs shared_inputs) {
-  dh::device_vector<EvaluateSplitInputs> inputs = std::vector<EvaluateSplitInputs>{input};
+  dh::device_vector<EvaluateSplitInputs> inputs(1);
+  dh::safe_cuda(cudaMemcpyAsync(inputs.data().get(), &input, sizeof(input), cudaMemcpyDefault));
   dh::TemporaryArray<GPUExpandEntry> out_entries(1);
   this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs),
                        shared_inputs, dh::ToSpan(out_entries));
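
This hunk is the "avoid a temporary buffer" half of the commit: initializing a `dh::device_vector` from `std::vector<...>{input}` first materializes a one-element host vector and then copies it over, whereas sizing the device vector up front and issuing a single `cudaMemcpyAsync` skips the host-side temporary. A stand-alone sketch of the same pattern; `Payload` and `Upload` are illustrative names:

#include <cuda_runtime.h>
#include <thrust/device_vector.h>

#include <stdexcept>

struct Payload {
  int nidx;
  float gain;
};

void Upload(Payload const& in, thrust::device_vector<Payload>* out) {
  // Before: a temporary host std::vector, then an element-wise copy:
  //   *out = std::vector<Payload>{in};
  // After: one device allocation and one sizeof(in)-byte copy. With pageable
  // host memory the driver stages the "async" copy, so &in stays valid.
  out->resize(1);
  cudaError_t e = cudaMemcpyAsync(thrust::raw_pointer_cast(out->data()), &in,
                                  sizeof(in), cudaMemcpyDefault);
  if (e != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(e));
  }
}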


@@ -325,7 +325,7 @@ class DeviceHistogramBuilderImpl {
   void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix,
                       FeatureGroupsAccessor const& feature_groups,
                       common::Span<GradientPair const> gpair,
-                      common::Span<const std::uint32_t> d_ridx,
+                      common::Span<const cuda_impl::RowIndexT> d_ridx,
                       common::Span<GradientPairInt64> histogram, GradientQuantiser rounding) {
     CHECK(kernel_);
     // Otherwise launch blocks such that each block has a minimum amount of work to do
@@ -369,7 +369,7 @@ void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx,
                                             EllpackDeviceAccessor const& matrix,
                                             FeatureGroupsAccessor const& feature_groups,
                                             common::Span<GradientPair const> gpair,
-                                            common::Span<const std::uint32_t> ridx,
+                                            common::Span<const cuda_impl::RowIndexT> ridx,
                                             common::Span<GradientPairInt64> histogram,
                                             GradientQuantiser rounding) {
   this->p_impl_->BuildHistogram(ctx, matrix, feature_groups, gpair, ridx, histogram, rounding);


@@ -132,7 +132,7 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
                        common::Span<cuda_impl::RowIndexT> ridx,
                        common::Span<cuda_impl::RowIndexT> ridx_tmp,
                        common::Span<cuda_impl::RowIndexT> d_counts, bst_idx_t total_rows, OpT op,
-                       dh::device_vector<int8_t>* tmp) {
+                       dh::DeviceUVector<int8_t>* tmp) {
   dh::LDGIterator<PerNodeData<OpDataT>> batch_info_itr(d_batch_info.data());
   WriteResultsFunctor<OpDataT> write_results{batch_info_itr, ridx.data(), ridx_tmp.data(),
                                              d_counts.data()};
@@ -150,14 +150,16 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
                           go_left};
   });
   std::size_t temp_bytes = 0;
+  // Restriction imposed by cub.
+  CHECK_LE(total_rows, static_cast<bst_idx_t>(std::numeric_limits<std::int32_t>::max()));
   if (tmp->empty()) {
-    cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
-                                   IndexFlagOp{}, total_rows);
+    dh::safe_cuda(cub::DeviceScan::InclusiveScan(
+        nullptr, temp_bytes, input_iterator, discard_write_iterator, IndexFlagOp{}, total_rows));
     tmp->resize(temp_bytes);
   }
   temp_bytes = tmp->size();
-  cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator,
-                                 discard_write_iterator, IndexFlagOp{}, total_rows);
+  dh::safe_cuda(cub::DeviceScan::InclusiveScan(tmp->data(), temp_bytes, input_iterator,
+                                               discard_write_iterator, IndexFlagOp{}, total_rows));
   constexpr int kBlockSize = 256;
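
The scan above follows cub's two-phase convention: the first call, made with a null temp-storage pointer, only writes the required byte count; the second performs the scan. Both calls return a `cudaError_t` that is easy to drop, which is exactly what `dh::safe_cuda` now catches, and the new `CHECK_LE` reflects the 32-bit `num_items` limit in the cub version XGBoost builds against. A self-contained sketch of the pattern; `CheckCuda` is an illustrative stand-in for `dh::safe_cuda`:

#include <cub/cub.cuh>

#include <cstddef>
#include <cstdio>
#include <cstdlib>

inline void CheckCuda(cudaError_t e) {
  if (e != cudaSuccess) {
    std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e));
    std::abort();
  }
}

int main() {
  int const n = 4;
  float h_in[] = {1.0f, 2.0f, 3.0f, 4.0f};
  float *d_in, *d_out;
  CheckCuda(cudaMalloc(&d_in, sizeof(h_in)));
  CheckCuda(cudaMalloc(&d_out, sizeof(h_in)));
  CheckCuda(cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice));

  void* d_temp = nullptr;
  std::size_t temp_bytes = 0;
  // Phase 1: with a null temp pointer, cub only reports the bytes it needs.
  CheckCuda(cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_in, d_out, n));
  CheckCuda(cudaMalloc(&d_temp, temp_bytes));
  // Phase 2: the actual scan; the return value is checked again here.
  CheckCuda(cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_in, d_out, n));

  CheckCuda(cudaFree(d_temp));
  CheckCuda(cudaFree(d_in));
  CheckCuda(cudaFree(d_out));
  return 0;
}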
@@ -236,7 +238,7 @@ class RowPartitioner {
   dh::DeviceUVector<RowIndexT> ridx_;
   // Staging area for sorting ridx
   dh::DeviceUVector<RowIndexT> ridx_tmp_;
-  dh::device_vector<int8_t> tmp_;
+  dh::DeviceUVector<int8_t> tmp_;
   dh::PinnedMemory pinned_;
   dh::PinnedMemory pinned2_;
   bst_node_t n_nodes_{0};  // Counter for internal checks.
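
Switching `tmp_` from `dh::device_vector` to `dh::DeviceUVector` follows the same theme: a `thrust::device_vector` value-initializes its elements, so resizing the cub scratch buffer launches an extra fill kernel, while an uninitialized vector just allocates. A rough sketch of the idea; `ScratchBuffer` is illustrative only, and `dh::DeviceUVector` is XGBoost's actual (presumably stream-ordered) type:

#include <cuda_runtime.h>

#include <cstddef>
#include <new>

// Grow-only, uninitialized device scratch space: no fill kernel on resize,
// which is all cub's temporary storage needs.
class ScratchBuffer {
 public:
  ~ScratchBuffer() { cudaFree(ptr_); }
  void Resize(std::size_t n_bytes) {
    if (n_bytes <= size_) {
      return;
    }
    cudaFree(ptr_);  // contents are scratch, no need to preserve them
    if (cudaMalloc(&ptr_, n_bytes) != cudaSuccess) {
      ptr_ = nullptr;
      throw std::bad_alloc{};
    }
    size_ = n_bytes;
  }
  void* Data() { return ptr_; }
  std::size_t Size() const { return size_; }
  bool Empty() const { return size_ == 0; }

 private:
  void* ptr_{nullptr};
  std::size_t size_{0};
};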


@@ -49,9 +49,9 @@ void TestUpdatePositionBatch() {
 TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); }
 void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Segment>& segments) {
-  thrust::device_vector<uint32_t> ridx = ridx_in;
-  thrust::device_vector<uint32_t> ridx_tmp(ridx_in.size());
-  thrust::device_vector<bst_uint> counts(segments.size());
+  thrust::device_vector<cuda_impl::RowIndexT> ridx = ridx_in;
+  thrust::device_vector<cuda_impl::RowIndexT> ridx_tmp(ridx_in.size());
+  thrust::device_vector<cuda_impl::RowIndexT> counts(segments.size());
   auto op = [=] __device__(auto ridx, int split_index, int data) { return ridx % 2 == 0; };
   std::vector<int> op_data(segments.size());
@@ -66,7 +66,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
   dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
                                 h_batch_info.size() * sizeof(PerNodeData<int>), cudaMemcpyDefault,
                                 nullptr));
-  dh::device_vector<int8_t> tmp;
+  dh::DeviceUVector<int8_t> tmp;
   SortPositionBatch<decltype(op), int>(dh::ToSpan(d_batch_info), dh::ToSpan(ridx),
                                        dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, op,
                                        &tmp);
@@ -91,5 +91,4 @@ TEST(GpuHist, SortPositionBatch) {
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 6}});
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}});
 }
-
 }  // namespace xgboost::tree