From 508ac13243b95f7fa8006d244a6c1a93cd099e11 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 21 Aug 2024 02:50:26 +0800
Subject: [PATCH] Check cub errors. (#10721)

- Make sure cuda error returned by cub scan is caught.
- Avoid temporary buffer allocation in thrust device vector.
---
 src/data/ellpack_page.cu                        | 17 ++++++++++-------
 src/tree/gpu_hist/evaluate_splits.cu            |  4 +++-
 src/tree/gpu_hist/histogram.cu                  |  4 ++--
 src/tree/gpu_hist/row_partitioner.cuh           | 14 ++++++++------
 tests/cpp/tree/gpu_hist/test_row_partitioner.cu |  9 ++++-----
 5 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index b7ec72ad3..bb279b3d8 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -309,9 +309,9 @@ ELLPACK_BATCH_SPECIALIZE(data::CudfAdapterBatch)
 ELLPACK_BATCH_SPECIALIZE(data::CupyAdapterBatch)
 
 namespace {
-void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const> d_row_ptr,
-                        size_t row_stride, common::CompressedByteT* d_compressed_buffer,
-                        size_t null) {
+void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page,
+                        common::Span<size_t const> d_row_ptr, size_t row_stride,
+                        common::CompressedByteT* d_compressed_buffer, size_t null) {
   dh::device_vector<std::uint8_t> data(page.index.begin(), page.index.end());
   auto d_data = dh::ToSpan(data);
 
@@ -323,7 +323,8 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span
   common::CompressedBufferWriter writer{page.cut.TotalBins() +
                                         static_cast<std::size_t>(1)};  // +1 for null value
 
-  dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable {
+  auto cuctx = ctx->CUDACtx();
+  dh::LaunchN(row_stride * page.Size(), cuctx->Stream(), [=] __device__(bst_idx_t idx) mutable {
     auto ridx = idx / row_stride;
     auto ifeature = idx % row_stride;
 
@@ -336,7 +337,7 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span
       return;
     }
 
-    size_t offset = 0;
+    bst_idx_t offset = 0;
     if (!d_csc_indptr.empty()) {
       // is dense, ifeature is the actual feature index.
       offset = d_csc_indptr[ifeature];
@@ -362,7 +363,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
   row_stride = *std::max_element(it, it + page.Size());
 
   CHECK(ctx->IsCUDA());
-  InitCompressedData(ctx);
+  this->InitCompressedData(ctx);
 
   // copy gidx
   common::CompressedByteT* d_compressed_buffer = gidx_buffer.data();
@@ -373,7 +374,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
 
   auto accessor = this->GetDeviceAccessor(ctx->Device(), ft);
   auto null = accessor.NullValue();
-  CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null);
+  this->monitor_.Start("CopyGHistToEllpack");
+  CopyGHistToEllpack(ctx, page, d_row_ptr, row_stride, d_compressed_buffer, null);
+  this->monitor_.Stop("CopyGHistToEllpack");
 }
 
 // A functor that copies the data from one EllpackPage to another.
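The ellpack hunks above thread the CUDA stream from the `Context` through to `dh::LaunchN` instead of relying on the default stream, and every runtime call stays wrapped in an error check. Below is a minimal standalone sketch of the same two habits, an error-checking macro in the spirit of `dh::safe_cuda` plus a grid-stride kernel launched on an explicit stream. `SAFE_CUDA` and `FillKernel` are illustrative names, not xgboost's:

```cuda
#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>

// Minimal analogue of dh::safe_cuda: fail loudly on any CUDA error code.
#define SAFE_CUDA(call)                                           \
  do {                                                            \
    cudaError_t err = (call);                                     \
    if (err != cudaSuccess) {                                     \
      std::fprintf(stderr, "CUDA error: %s at %s:%d\n",           \
                   cudaGetErrorString(err), __FILE__, __LINE__);  \
      std::abort();                                               \
    }                                                             \
  } while (0)

// Grid-stride loop: thread i handles i, i + stride, i + 2 * stride, ...
__global__ void FillKernel(float* out, size_t n, float value) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += static_cast<size_t>(gridDim.x) * blockDim.x) {
    out[i] = value;
  }
}

int main() {
  size_t n = 1 << 20;
  float* d_out = nullptr;
  cudaStream_t stream;
  SAFE_CUDA(cudaStreamCreate(&stream));
  SAFE_CUDA(cudaMalloc(&d_out, n * sizeof(float)));

  // Launch on the explicit stream rather than the default stream,
  // mirroring dh::LaunchN(n, cuctx->Stream(), ...) in the hunk above.
  FillKernel<<<256, 256, 0, stream>>>(d_out, n, 1.0f);
  SAFE_CUDA(cudaGetLastError());  // kernel launches report errors out of band
  SAFE_CUDA(cudaStreamSynchronize(stream));

  SAFE_CUDA(cudaFree(d_out));
  SAFE_CUDA(cudaStreamDestroy(stream));
  return 0;
}
```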
diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu
index 631f2bd8f..0131f166f 100644
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -472,7 +472,9 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector
 
 GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
                                                      EvaluateSplitSharedInputs shared_inputs) {
-  dh::device_vector<EvaluateSplitInputs> inputs = std::vector<EvaluateSplitInputs>{input};
+  dh::device_vector<EvaluateSplitInputs> inputs(1);
+  dh::safe_cuda(cudaMemcpyAsync(inputs.data().get(), &input, sizeof(input), cudaMemcpyDefault));
+
   dh::TemporaryArray<GPUExpandEntry> out_entries(1);
   this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs),
                        shared_inputs, dh::ToSpan(out_entries));
diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu
index e90b6831f..731e71367 100644
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -325,7 +325,7 @@ class DeviceHistogramBuilderImpl {
   void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix,
                       FeatureGroupsAccessor const& feature_groups,
                       common::Span<GradientPair const> gpair,
-                      common::Span<const std::uint32_t> d_ridx,
+                      common::Span<const cuda_impl::RowIndexT> d_ridx,
                       common::Span<GradientPairInt64> histogram, GradientQuantiser rounding) {
     CHECK(kernel_);
     // Otherwise launch blocks such that each block has a minimum amount of work to do
@@ -369,7 +369,7 @@ void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx,
                                             EllpackDeviceAccessor const& matrix,
                                             FeatureGroupsAccessor const& feature_groups,
                                             common::Span<GradientPair const> gpair,
-                                            common::Span<const std::uint32_t> ridx,
+                                            common::Span<const cuda_impl::RowIndexT> ridx,
                                             common::Span<GradientPairInt64> histogram,
                                             GradientQuantiser rounding) {
   this->p_impl_->BuildHistogram(ctx, matrix, feature_groups, gpair, ridx, histogram, rounding);
diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh
index 5f8f0a30b..c754f84c0 100644
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -132,7 +132,7 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
                        common::Span<cuda_impl::RowIndexT> ridx,
                        common::Span<cuda_impl::RowIndexT> ridx_tmp,
                        common::Span<cuda_impl::RowIndexT> d_counts, bst_idx_t total_rows, OpT op,
-                       dh::device_vector<int8_t>* tmp) {
+                       dh::DeviceUVector<int8_t>* tmp) {
   dh::LDGIterator<const PerNodeData<OpDataT>> batch_info_itr(d_batch_info.data());
   WriteResultsFunctor<OpDataT> write_results{batch_info_itr, ridx.data(), ridx_tmp.data(),
                                              d_counts.data()};
@@ -150,14 +150,16 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
                               go_left};
       });
   std::size_t temp_bytes = 0;
+  // Restriction imposed by cub.
+  CHECK_LE(total_rows, static_cast<bst_idx_t>(std::numeric_limits<std::int32_t>::max()));
   if (tmp->empty()) {
-    cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
-                                   IndexFlagOp{}, total_rows);
+    dh::safe_cuda(cub::DeviceScan::InclusiveScan(
+        nullptr, temp_bytes, input_iterator, discard_write_iterator, IndexFlagOp{}, total_rows));
     tmp->resize(temp_bytes);
   }
   temp_bytes = tmp->size();
-  cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator,
-                                 discard_write_iterator, IndexFlagOp{}, total_rows);
+  dh::safe_cuda(cub::DeviceScan::InclusiveScan(tmp->data(), temp_bytes, input_iterator,
+                                               discard_write_iterator, IndexFlagOp{}, total_rows));
 
   constexpr int kBlockSize = 256;
@@ -236,7 +238,7 @@ class RowPartitioner {
   dh::DeviceUVector<RowIndexT> ridx_;
   // Staging area for sorting ridx
   dh::DeviceUVector<RowIndexT> ridx_tmp_;
-  dh::device_vector<int8_t> tmp_;
+  dh::DeviceUVector<int8_t> tmp_;
   dh::PinnedMemory pinned_;
   dh::PinnedMemory pinned2_;
   bst_node_t n_nodes_{0};  // Counter for internal checks.
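The row_partitioner hunk is the heart of the patch: `cub::DeviceScan::InclusiveScan` returns a `cudaError_t`, and the old code silently discarded it both for the size-query call and for the real scan. The new `CHECK_LE` guards cub's 32-bit `num_items` parameter. A self-contained sketch of the same two-phase protocol with the error code actually inspected (using the standard `InclusiveSum` entry point for brevity; the patch uses `InclusiveScan` with its custom `IndexFlagOp`):

```cuda
#include <cub/cub.cuh>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#include <cstddef>
#include <cstdio>

int main() {
  thrust::device_vector<int> in(4);
  thrust::device_vector<int> out(in.size());
  thrust::sequence(in.begin(), in.end(), 1);  // in = {1, 2, 3, 4}
  int n = static_cast<int>(in.size());        // cub takes a 32-bit count here

  // Phase 1: with a null buffer, cub only computes the required scratch size.
  std::size_t temp_bytes = 0;
  cudaError_t err = cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, in.data().get(),
                                                  out.data().get(), n);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "%s\n", cudaGetErrorString(err));
    return 1;
  }

  // Phase 2: run the scan for real; the scratch buffer can be reused across calls,
  // which is exactly why SortPositionBatch keeps tmp alive in the partitioner.
  thrust::device_vector<char> tmp(temp_bytes);
  err = cub::DeviceScan::InclusiveSum(tmp.data().get(), temp_bytes, in.data().get(),
                                      out.data().get(), n);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "%s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;  // out now holds {1, 3, 6, 10}
}
```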
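The evaluate_splits.cu hunk implements the second bullet of the commit message: constructing a `device_vector` from a `std::vector` temporary allocates host memory and performs a blocking pageable copy just to move a single element. A sketch of the before/after pattern; `SplitInput` is a hypothetical stand-in for `EvaluateSplitInputs`:

```cuda
#include <cuda_runtime.h>
#include <thrust/device_vector.h>

// Hypothetical single-element payload standing in for EvaluateSplitInputs.
struct SplitInput {
  int nidx;
  float gain;
};

int main() {
  SplitInput input{0, 0.5f};

  // Before: the std::vector temporary allocates host memory, and the
  // device_vector constructor performs a blocking copy from it.
  //   thrust::device_vector<SplitInput> inputs = std::vector<SplitInput>{input};

  // After: size the device buffer once, then copy the one element directly,
  // matching the cudaMemcpyAsync in the evaluate_splits.cu hunk above.
  thrust::device_vector<SplitInput> inputs(1);
  cudaError_t err =
      cudaMemcpyAsync(inputs.data().get(), &input, sizeof(input), cudaMemcpyDefault);
  if (err != cudaSuccess) {
    return 1;
  }
  // &input is a stack variable; make sure the copy finished before it dies.
  return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}
```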
diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
index 86080a797..ec8372815 100644
--- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
+++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
@@ -49,9 +49,9 @@ void TestUpdatePositionBatch() {
 TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); }
 
 void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Segment>& segments) {
-  thrust::device_vector<std::uint32_t> ridx = ridx_in;
-  thrust::device_vector<std::uint32_t> ridx_tmp(ridx_in.size());
-  thrust::device_vector<std::uint32_t> counts(segments.size());
+  thrust::device_vector<cuda_impl::RowIndexT> ridx = ridx_in;
+  thrust::device_vector<cuda_impl::RowIndexT> ridx_tmp(ridx_in.size());
+  thrust::device_vector<cuda_impl::RowIndexT> counts(segments.size());
   auto op = [=] __device__(auto ridx, int split_index, int data) { return ridx % 2 == 0; };
   std::vector<int> op_data(segments.size());
 
@@ -66,7 +66,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vec
   dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
                                 h_batch_info.size() * sizeof(PerNodeData<int>), cudaMemcpyDefault,
                                 nullptr));
-  dh::device_vector<int8_t> tmp;
+  dh::DeviceUVector<int8_t> tmp;
   SortPositionBatch(dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(ridx_tmp),
                     dh::ToSpan(counts), total_rows, op, &tmp);
 
@@ -91,5 +91,4 @@ TEST(GpuHist, SortPositionBatch) {
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 6}});
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}});
 }
-
 }  // namespace xgboost::tree
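Finally, the scratch buffer for the cub scan moves from `dh::device_vector<int8_t>` to `dh::DeviceUVector<int8_t>`. `thrust::device_vector` value-initializes its elements, so growing scratch space pays an extra fill kernel, while xgboost's `DeviceUVector` hands back uninitialized storage. The same effect can be sketched in plain Thrust with a construct-skipping allocator, following the pattern of Thrust's own `uninitialized_vector` example; `UninitializedAllocator` and `UninitializedVector` are our names, not xgboost's:

```cuda
#include <thrust/device_malloc_allocator.h>
#include <thrust/device_vector.h>

// Allocator that skips element construction, so resize() is a pure
// allocation with no fill kernel; this is the property dh::DeviceUVector
// provides for cub scratch space.
template <typename T>
struct UninitializedAllocator : thrust::device_malloc_allocator<T> {
  __host__ __device__ void construct(T*) {
    // no-op: leave the memory uninitialized
  }
};

template <typename T>
using UninitializedVector = thrust::device_vector<T, UninitializedAllocator<T>>;

int main() {
  UninitializedVector<char> tmp;
  tmp.resize(1 << 20);  // allocates 1 MiB without launching a fill kernel
  return 0;
}
```

Since cub only ever writes into the scratch buffer before reading it, leaving the bytes uninitialized is safe here, and reusing the buffer across `SortPositionBatch` calls avoids repeated allocations on the hot path.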