From 508ac13243b95f7fa8006d244a6c1a93cd099e11 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 21 Aug 2024 02:50:26 +0800
Subject: [PATCH] Check cub errors. (#10721)

- Make sure cuda error returned by cub scan is caught.
- Avoid temporary buffer allocation in thrust device vector.
---
 src/data/ellpack_page.cu                        | 17 ++++++++++-------
 src/tree/gpu_hist/evaluate_splits.cu            |  4 +++-
 src/tree/gpu_hist/histogram.cu                  |  4 ++--
 src/tree/gpu_hist/row_partitioner.cuh           | 14 ++++++++------
 tests/cpp/tree/gpu_hist/test_row_partitioner.cu |  9 ++++-----
 5 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index b7ec72ad3..bb279b3d8 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -309,9 +309,9 @@ ELLPACK_BATCH_SPECIALIZE(data::CudfAdapterBatch)
 ELLPACK_BATCH_SPECIALIZE(data::CupyAdapterBatch)
 
 namespace {
-void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const> d_row_ptr,
-                        size_t row_stride, common::CompressedByteT* d_compressed_buffer,
-                        size_t null) {
+void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page,
+                        common::Span<size_t const> d_row_ptr, size_t row_stride,
+                        common::CompressedByteT* d_compressed_buffer, size_t null) {
   dh::device_vector<std::uint8_t> data(page.index.begin(), page.index.end());
   auto d_data = dh::ToSpan(data);
 
@@ -323,7 +323,8 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span
   common::CompressedBufferWriter writer{page.cut.TotalBins() +
                                         static_cast<std::size_t>(1)};  // +1 for null value
 
-  dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable {
+  auto cuctx = ctx->CUDACtx();
+  dh::LaunchN(row_stride * page.Size(), cuctx->Stream(), [=] __device__(bst_idx_t idx) mutable {
     auto ridx = idx / row_stride;
     auto ifeature = idx % row_stride;
 
@@ -336,7 +337,7 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span
       return;
     }
 
-    size_t offset = 0;
+    bst_idx_t offset = 0;
     if (!d_csc_indptr.empty()) {
       // is dense, ifeature is the actual feature index.
       offset = d_csc_indptr[ifeature];
@@ -362,7 +363,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
   row_stride = *std::max_element(it, it + page.Size());
 
   CHECK(ctx->IsCUDA());
-  InitCompressedData(ctx);
+  this->InitCompressedData(ctx);
 
   // copy gidx
   common::CompressedByteT* d_compressed_buffer = gidx_buffer.data();
@@ -373,7 +374,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
 
   auto accessor = this->GetDeviceAccessor(ctx->Device(), ft);
   auto null = accessor.NullValue();
-  CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null);
+  this->monitor_.Start("CopyGHistToEllpack");
+  CopyGHistToEllpack(ctx, page, d_row_ptr, row_stride, d_compressed_buffer, null);
+  this->monitor_.Stop("CopyGHistToEllpack");
 }
 
 // A functor that copies the data from one EllpackPage to another.
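The ellpack hunks above thread the CUDA stream from the `Context` through to `dh::LaunchN` instead of relying on the default stream, and every runtime call stays wrapped in an error check. Below is a minimal standalone sketch of the same two habits, an error-checking macro in the spirit of `dh::safe_cuda` plus a grid-stride kernel launched on an explicit stream. `SAFE_CUDA` and `FillKernel` are illustrative names, not xgboost's:

```cuda
#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>

// Minimal analogue of dh::safe_cuda: fail loudly on any CUDA error code.
#define SAFE_CUDA(call)                                           \
  do {                                                            \
    cudaError_t err = (call);                                     \
    if (err != cudaSuccess) {                                     \
      std::fprintf(stderr, "CUDA error: %s at %s:%d\n",           \
                   cudaGetErrorString(err), __FILE__, __LINE__);  \
      std::abort();                                               \
    }                                                             \
  } while (0)

// Grid-stride loop: thread i handles i, i + stride, i + 2 * stride, ...
__global__ void FillKernel(float* out, size_t n, float value) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += static_cast<size_t>(gridDim.x) * blockDim.x) {
    out[i] = value;
  }
}

int main() {
  size_t n = 1 << 20;
  float* d_out = nullptr;
  cudaStream_t stream;
  SAFE_CUDA(cudaStreamCreate(&stream));
  SAFE_CUDA(cudaMalloc(&d_out, n * sizeof(float)));

  // Launch on the explicit stream rather than the default stream,
  // mirroring dh::LaunchN(n, cuctx->Stream(), ...) in the hunk above.
  FillKernel<<<256, 256, 0, stream>>>(d_out, n, 1.0f);
  SAFE_CUDA(cudaGetLastError());  // kernel launches report errors out of band
  SAFE_CUDA(cudaStreamSynchronize(stream));

  SAFE_CUDA(cudaFree(d_out));
  SAFE_CUDA(cudaStreamDestroy(stream));
  return 0;
}
```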
diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu
index 631f2bd8f..0131f166f 100644
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -472,7 +472,9 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector
 
 GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
                                                      EvaluateSplitSharedInputs shared_inputs) {
-  dh::device_vector<EvaluateSplitInputs> inputs = std::vector<EvaluateSplitInputs>{input};
+  dh::device_vector<EvaluateSplitInputs> inputs(1);
+  dh::safe_cuda(cudaMemcpyAsync(inputs.data().get(), &input, sizeof(input), cudaMemcpyDefault));
+
   dh::TemporaryArray<GPUExpandEntry> out_entries(1);
   this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs),
                        shared_inputs, dh::ToSpan(out_entries));
diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu
index e90b6831f..731e71367 100644
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -325,7 +325,7 @@ class DeviceHistogramBuilderImpl {
   void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix,
                       FeatureGroupsAccessor const& feature_groups,
                       common::Span<GradientPair const> gpair,
-                      common::Span<const std::uint32_t> d_ridx,
+                      common::Span<const cuda_impl::RowIndexT> d_ridx,
                       common::Span<GradientPairInt64> histogram, GradientQuantiser rounding) {
     CHECK(kernel_);
     // Otherwise launch blocks such that each block has a minimum amount of work to do
@@ -369,7 +369,7 @@ void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx,
                                             EllpackDeviceAccessor const& matrix,
                                             FeatureGroupsAccessor const& feature_groups,
                                             common::Span<GradientPair const> gpair,
-                                            common::Span<const std::uint32_t> ridx,
+                                            common::Span<const cuda_impl::RowIndexT> ridx,
                                             common::Span<GradientPairInt64> histogram,
                                             GradientQuantiser rounding) {
   this->p_impl_->BuildHistogram(ctx, matrix, feature_groups, gpair, ridx, histogram, rounding);
diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh
index 5f8f0a30b..c754f84c0 100644
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -132,7 +132,7 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
                        common::Span<cuda_impl::RowIndexT> ridx,
                        common::Span<cuda_impl::RowIndexT> ridx_tmp,
                        common::Span<cuda_impl::RowIndexT> d_counts, bst_idx_t total_rows, OpT op,
-                       dh::device_vector<int8_t>* tmp) {
+                       dh::DeviceUVector<int8_t>* tmp) {
   dh::LDGIterator<const PerNodeData<OpDataT>> batch_info_itr(d_batch_info.data());
   WriteResultsFunctor<OpDataT> write_results{batch_info_itr, ridx.data(), ridx_tmp.data(),
                                              d_counts.data()};
@@ -150,14 +150,16 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
                               go_left};
       });
   std::size_t temp_bytes = 0;
+  // Restriction imposed by cub.
+  CHECK_LE(total_rows, static_cast<bst_idx_t>(std::numeric_limits<std::int32_t>::max()));
   if (tmp->empty()) {
-    cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
-                                   IndexFlagOp{}, total_rows);
+    dh::safe_cuda(cub::DeviceScan::InclusiveScan(
+        nullptr, temp_bytes, input_iterator, discard_write_iterator, IndexFlagOp{}, total_rows));
     tmp->resize(temp_bytes);
   }
   temp_bytes = tmp->size();
-  cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator,
-                                 discard_write_iterator, IndexFlagOp{}, total_rows);
+  dh::safe_cuda(cub::DeviceScan::InclusiveScan(tmp->data(), temp_bytes, input_iterator,
+                                               discard_write_iterator, IndexFlagOp{}, total_rows));
 
   constexpr int kBlockSize = 256;
@@ -236,7 +238,7 @@ class RowPartitioner {
   dh::DeviceUVector<RowIndexT> ridx_;
   // Staging area for sorting ridx
   dh::DeviceUVector<RowIndexT> ridx_tmp_;
-  dh::device_vector<int8_t> tmp_;
+  dh::DeviceUVector<int8_t> tmp_;
   dh::PinnedMemory pinned_;
   dh::PinnedMemory pinned2_;
   bst_node_t n_nodes_{0};  // Counter for internal checks.
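The row_partitioner hunk is the heart of the patch: `cub::DeviceScan::InclusiveScan` returns a `cudaError_t`, and the old code silently discarded it both for the size-query call and for the real scan. The new `CHECK_LE` guards cub's 32-bit `num_items` parameter. A self-contained sketch of the same two-phase protocol with the error code actually inspected (using the standard `InclusiveSum` entry point for brevity; the patch uses `InclusiveScan` with its custom `IndexFlagOp`):

```cuda
#include <cub/cub.cuh>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#include <cstddef>
#include <cstdio>

int main() {
  thrust::device_vector<int> in(4);
  thrust::device_vector<int> out(in.size());
  thrust::sequence(in.begin(), in.end(), 1);  // in = {1, 2, 3, 4}
  int n = static_cast<int>(in.size());        // cub takes a 32-bit count here

  // Phase 1: with a null buffer, cub only computes the required scratch size.
  std::size_t temp_bytes = 0;
  cudaError_t err = cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, in.data().get(),
                                                  out.data().get(), n);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "%s\n", cudaGetErrorString(err));
    return 1;
  }

  // Phase 2: run the scan for real; the scratch buffer can be reused across calls,
  // which is exactly why SortPositionBatch keeps tmp alive in the partitioner.
  thrust::device_vector<char> tmp(temp_bytes);
  err = cub::DeviceScan::InclusiveSum(tmp.data().get(), temp_bytes, in.data().get(),
                                      out.data().get(), n);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "%s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;  // out now holds {1, 3, 6, 10}
}
```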
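The evaluate_splits.cu hunk implements the second bullet of the commit message: constructing a `device_vector` from a `std::vector` temporary allocates host memory and performs a blocking pageable copy just to move a single element. A sketch of the before/after pattern; `SplitInput` is a hypothetical stand-in for `EvaluateSplitInputs`:

```cuda
#include <cuda_runtime.h>
#include <thrust/device_vector.h>

// Hypothetical single-element payload standing in for EvaluateSplitInputs.
struct SplitInput {
  int nidx;
  float gain;
};

int main() {
  SplitInput input{0, 0.5f};

  // Before: the std::vector temporary allocates host memory, and the
  // device_vector constructor performs a blocking copy from it.
  //   thrust::device_vector<SplitInput> inputs = std::vector<SplitInput>{input};

  // After: size the device buffer once, then copy the one element directly,
  // matching the cudaMemcpyAsync in the evaluate_splits.cu hunk above.
  thrust::device_vector<SplitInput> inputs(1);
  cudaError_t err =
      cudaMemcpyAsync(inputs.data().get(), &input, sizeof(input), cudaMemcpyDefault);
  if (err != cudaSuccess) {
    return 1;
  }
  // &input is a stack variable; make sure the copy finished before it dies.
  return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}
```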
diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
index 86080a797..ec8372815 100644
--- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
+++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
@@ -49,9 +49,9 @@ void TestUpdatePositionBatch() {
 TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); }
 
 void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Segment>& segments) {
-  thrust::device_vector<std::uint32_t> ridx = ridx_in;
-  thrust::device_vector<std::uint32_t> ridx_tmp(ridx_in.size());
-  thrust::device_vector<std::uint32_t> counts(segments.size());
+  thrust::device_vector<cuda_impl::RowIndexT> ridx = ridx_in;
+  thrust::device_vector<cuda_impl::RowIndexT> ridx_tmp(ridx_in.size());
+  thrust::device_vector<cuda_impl::RowIndexT> counts(segments.size());
   auto op = [=] __device__(auto ridx, int split_index, int data) { return ridx % 2 == 0; };
   std::vector<int> op_data(segments.size());
 
@@ -66,7 +66,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vec
   dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
                                 h_batch_info.size() * sizeof(PerNodeData<int>), cudaMemcpyDefault,
                                 nullptr));
-  dh::device_vector<int8_t> tmp;
+  dh::DeviceUVector<int8_t> tmp;
   SortPositionBatch(dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(ridx_tmp),
                     dh::ToSpan(counts), total_rows, op, &tmp);
 
@@ -91,5 +91,4 @@ TEST(GpuHist, SortPositionBatch) {
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 6}});
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}});
 }
-
 }  // namespace xgboost::tree
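Finally, the scratch buffer for the cub scan moves from `dh::device_vector<int8_t>` to `dh::DeviceUVector<int8_t>`. `thrust::device_vector` value-initializes its elements, so growing scratch space pays an extra fill kernel, while xgboost's `DeviceUVector` hands back uninitialized storage. The same effect can be sketched in plain Thrust with a construct-skipping allocator, following the pattern of Thrust's own `uninitialized_vector` example; `UninitializedAllocator` and `UninitializedVector` are our names, not xgboost's:

```cuda
#include <thrust/device_malloc_allocator.h>
#include <thrust/device_vector.h>

// Allocator that skips element construction, so resize() is a pure
// allocation with no fill kernel; this is the property dh::DeviceUVector
// provides for cub scratch space.
template <typename T>
struct UninitializedAllocator : thrust::device_malloc_allocator<T> {
  __host__ __device__ void construct(T*) {
    // no-op: leave the memory uninitialized
  }
};

template <typename T>
using UninitializedVector = thrust::device_vector<T, UninitializedAllocator<T>>;

int main() {
  UninitializedVector<char> tmp;
  tmp.resize(1 << 20);  // allocates 1 MiB without launching a fill kernel
  return 0;
}
```

Since cub only ever writes into the scratch buffer before reading it, leaving the bytes uninitialized is safe here, and reusing the buffer across `SortPositionBatch` calls avoids repeated allocations on the hot path.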