From c311a8c1d845f27b9c80d82da2c5810dc1f6b83c Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Fri, 17 Sep 2021 14:28:18 +0800
Subject: [PATCH] Enable compiling with system cub. (#7232)

- Tested with all CUDA 11.x releases.
- Work around cub's scan requirements by using a typed discard iterator in AUC.
- Limit the size of ArgSort when compiled with CUDA's cub.
---
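Notes (editor's commentary, not part of the commit):

The BUILD_WITH_CUDA_CUB switch reuses THRUST_IGNORE_CUB_VERSION_CHECK as a
marker: the macro is only defined when building against the vendored cub
submodule, so dh::BuildWithCUDACub() can detect at compile time which cub is
in use.

The discard-iterator change exists because the scan dispatch in the cub
versions this workaround targets deduces its accumulator type from the output
iterator's value_type, which thrust::discard_iterator does not provide; the
dh::TypedDiscard wrapper simply overrides value_type, and the
CUDAVersion()-gated alias in the diff falls back to the stock discard
iterator from CTK 11.4 onwards, where this is no longer needed. The sketch
below shows the shape of the trick outside XGBoost: scan over (index, value)
items and store the real result as a side effect of a
transform_output_iterator while the scan output itself is discarded. It is a
minimal illustration, not code from this patch: it calls
cub::DeviceScan::InclusiveScan rather than the raw DispatchScan behind
dh::InclusiveScan, and Item, MakeItem, WriteItem, CombineItem, and main() are
made-up names.

    // typed_discard_demo.cu -- compile with: nvcc typed_discard_demo.cu
    #include <cub/cub.cuh>
    #include <thrust/device_vector.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/discard_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/transform_output_iterator.h>
    #include <thrust/pair.h>

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Same idea as dh::detail::TypedDiscard: expose a concrete value_type
    // so the scan implementation can deduce its accumulator type.
    template <typename T>
    class TypedDiscard : public thrust::discard_iterator<T> {
     public:
      using value_type = T;  // NOLINT
    };

    using Item = thrust::pair<std::size_t, int>;  // (original index, value)

    struct MakeItem {  // attach the element index to each input value
      int const *d_in;
      __host__ __device__ Item operator()(std::size_t i) const {
        return Item(i, d_in[i]);
      }
    };

    struct WriteItem {  // the side effect that carries the real result
      int *d_out;
      __host__ __device__ Item operator()(Item const &item) const {
        d_out[item.first] = item.second;
        return item;  // returned value is assigned to the discard iterator
      }
    };

    struct CombineItem {  // sum the values, keep the right-hand index
      __host__ __device__ Item operator()(Item const &l, Item const &r) const {
        return Item(r.first, l.second + r.second);
      }
    };

    int main() {
      std::vector<int> h_values{1, 2, 3, 4};
      thrust::device_vector<int> values(h_values.begin(), h_values.end());
      thrust::device_vector<int> prefix(values.size());

      auto in = thrust::make_transform_iterator(
          thrust::make_counting_iterator<std::size_t>(0),
          MakeItem{values.data().get()});
      auto out = thrust::make_transform_output_iterator(
          TypedDiscard<Item>{}, WriteItem{prefix.data().get()});

      // The usual two-phase cub call: query the scratch size, then scan.
      std::size_t bytes = 0;
      cub::DeviceScan::InclusiveScan(nullptr, bytes, in, out, CombineItem{},
                                     static_cast<int>(values.size()));
      thrust::device_vector<char> temp(bytes);
      cub::DeviceScan::InclusiveScan(temp.data().get(), bytes, in, out,
                                     CombineItem{},
                                     static_cast<int>(values.size()));

      for (std::size_t i = 0; i < prefix.size(); ++i) {
        std::printf("%d ", static_cast<int>(prefix[i]));  // prints: 1 3 6 10
      }
      std::printf("\n");
      return 0;
    }

The ArgSort limit exists because the radix-sort dispatch in CUDA's cub only
accepts a 32-bit num_items (see the tracking link in the diff), hence the
CHECK_LE guard; the vendored submodule keeps a wider offset type. To build
against the system cub, configure with cmake -DUSE_CUDA=ON
-DBUILD_WITH_CUDA_CUB=ON.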
 CMakeLists.txt                       |  1 +
 cmake/Utils.cmake                    |  9 ++++-
 src/common/device_helpers.cuh        | 55 ++++++++++++++++++++++++----
 src/data/ellpack_page.cu             |  6 +--
 src/metric/auc.cu                    | 17 +++++----
 src/tree/gpu_hist/row_partitioner.cu |  5 +--
 6 files changed, 67 insertions(+), 26 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b6a042668..bd21e9b67 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,6 +49,7 @@ option(HIDE_CXX_SYMBOLS "Build shared library and hide all C++ symbols" OFF)
 option(USE_CUDA "Build with GPU acceleration" OFF)
 option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
 option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)
+option(BUILD_WITH_CUDA_CUB "Build with cub in the CUDA installation" OFF)
 set(GPU_COMPUTE_VER "" CACHE STRING
   "Semicolon separated list of compute versions to be built against, e.g. '35;61'")
 ## Copied From dmlc
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index 3684c250c..645f31fef 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -154,8 +154,13 @@ function(xgboost_set_cuda_flags target)
     enable_nvtx(${target})
   endif (USE_NVTX)
 
-  target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1 -DTHRUST_IGNORE_CUB_VERSION_CHECK=1)
-  target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/cub/ ${xgboost_SOURCE_DIR}/gputreeshap)
+  if (NOT BUILD_WITH_CUDA_CUB)
+    target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1 -DTHRUST_IGNORE_CUB_VERSION_CHECK=1)
+    target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/cub/ ${xgboost_SOURCE_DIR}/gputreeshap)
+  else ()
+    target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1)
+    target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/gputreeshap)
+  endif (NOT BUILD_WITH_CUDA_CUB)
 
   if (MSVC)
     target_compile_options(${target} PRIVATE
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 699a3f627..c2c7ff07b 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -75,6 +75,16 @@ __device__ __forceinline__ double atomicAdd(double* address, double val) {  // NOLINT
 #endif
 
 namespace dh {
+
+// FIXME(jiamingy): Remove this once we get rid of the cub submodule.
+constexpr bool BuildWithCUDACub() {
+#if defined(THRUST_IGNORE_CUB_VERSION_CHECK) && THRUST_IGNORE_CUB_VERSION_CHECK == 1
+  return false;
+#else
+  return true;
+#endif  // defined(THRUST_IGNORE_CUB_VERSION_CHECK) && THRUST_IGNORE_CUB_VERSION_CHECK == 1
+}
+
 namespace detail {
 template <size_t size>
 struct AtomicDispatcher;
@@ -689,6 +699,33 @@ typename std::iterator_traits<T>::value_type SumReduction(T in, int nVals) {
   return sum;
 }
 
+constexpr std::pair<int, int> CUDAVersion() {
+#if defined(__CUDACC_VER_MAJOR__)
+  return std::make_pair(__CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__);
+#else
+  // clang/clang-tidy
+  return std::make_pair((CUDA_VERSION) / 1000, (CUDA_VERSION) % 100 / 10);
+#endif  // defined(__CUDACC_VER_MAJOR__)
+}
+
+namespace detail {
+template <typename T>
+using TypedDiscardCTK114 = thrust::discard_iterator<T>;
+
+template <typename T>
+class TypedDiscard : public thrust::discard_iterator<T> {
+ public:
+  using value_type = T;  // NOLINT
+};
+}  // namespace detail
+
+template <typename T>
+using TypedDiscard =
+    std::conditional_t<((CUDAVersion().first == 11 &&
+                         CUDAVersion().second >= 4) ||
+                        CUDAVersion().first > 11),
+                       detail::TypedDiscardCTK114<T>, detail::TypedDiscard<T>>;
+
 /**
  * \class AllReducer
  *
@@ -1326,7 +1363,7 @@ void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op,
                  OffsetT>::Dispatch(nullptr, bytes, d_in, d_out, scan_op,
                                     cub::NullType(), num_items, nullptr,
                                     false)));
-  dh::TemporaryArray<char> storage(bytes);
+  TemporaryArray<char> storage(bytes);
   safe_cuda((
       cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType,
                         OffsetT>::Dispatch(storage.data().get(), bytes, d_in,
@@ -1369,24 +1406,27 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_idx) {
   cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(sorted_idx.data()),
                                      sorted_idx_out.data().get());
 
+  // track https://github.com/NVIDIA/cub/pull/340 for 64-bit length support
+  using OffsetT = std::conditional_t<!BuildWithCUDACub(), std::ptrdiff_t, int32_t>;
+  CHECK_LE(sorted_idx.size(), std::numeric_limits<OffsetT>::max());
   if (accending) {
     void *d_temp_storage = nullptr;
-    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, size_t>::Dispatch(
+    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
         d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
         sizeof(KeyT) * 8, false, nullptr, false)));
     TemporaryArray<char> storage(bytes);
     d_temp_storage = storage.data().get();
-    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, size_t>::Dispatch(
+    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
         d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
         sizeof(KeyT) * 8, false, nullptr, false)));
   } else {
     void *d_temp_storage = nullptr;
-    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, size_t>::Dispatch(
+    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
         d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
         sizeof(KeyT) * 8, false, nullptr, false)));
     TemporaryArray<char> storage(bytes);
     d_temp_storage = storage.data().get();
-    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, size_t>::Dispatch(
+    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
         d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
         sizeof(KeyT) * 8, false, nullptr, false)));
   }
@@ -1396,7 +1436,7 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_idx) {
 }
 
 namespace detail {
-// Wrapper around cub sort for easier `descending` sort and `size_t num_items`.
+// Wrapper around cub sort for easier `descending` sort.
 template <bool descending, typename KeyT, typename ValueT,
           typename OffsetIteratorT>
 void DeviceSegmentedRadixSortPair(
@@ -1408,7 +1448,8 @@ void DeviceSegmentedRadixSortPair(
   cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
   cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in),
                                      d_values_out);
-  using OffsetT = size_t;
+  using OffsetT = int32_t;  // num_items in the dispatch is also int32_t; no way to change it.
+  CHECK_LE(num_items, std::numeric_limits<OffsetT>::max());
   safe_cuda((cub::DispatchSegmentedRadixSort<
                 descending, KeyT, ValueT, OffsetIteratorT,
                 OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index 97eccf682..82d90eb13 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -183,10 +183,6 @@ struct TupleScanOp {
   }
 };
 
-// Change the value type of thrust discard iterator so we can use it with cub
-template <typename T>
-using TypedDiscard = thrust::discard_iterator<T>;
-
 // Here the data is already correctly ordered and simply needs to be compacted
 // to remove missing data
 template <typename AdapterBatchT>
@@ -229,7 +225,7 @@ void CopyDataToEllpack(const AdapterBatchT &batch,
   WriteCompressedEllpackFunctor<AdapterBatchT> functor(
       d_compressed_buffer, writer, batch, device_accessor, feature_types,
       is_valid);
-  TypedDiscard<Tuple> discard;
+  dh::TypedDiscard<Tuple> discard;
   thrust::transform_output_iterator<
       WriteCompressedEllpackFunctor<AdapterBatchT>, decltype(discard)>
       out(discard, functor);
diff --git a/src/metric/auc.cu b/src/metric/auc.cu
index 2e33a2a5f..63c928c53 100644
--- a/src/metric/auc.cu
+++ b/src/metric/auc.cu
@@ -331,24 +331,25 @@ float GPUMultiClassAUCOVR(common::Span<float const> predts, MetaInfo const &info,
   // expand to tuple to include class id
   auto fptp_it_in = dh::MakeTransformIterator<Triple>(
       thrust::make_counting_iterator(0), [=] __device__(size_t i) {
-        uint32_t class_id = i / n_samples;
-        return thrust::make_tuple(class_id, d_fptp[i].first, d_fptp[i].second);
+        return thrust::make_tuple(i, d_fptp[i].first, d_fptp[i].second);
       });
   // shrink down to pair
   auto fptp_it_out = thrust::make_transform_output_iterator(
-      dh::tbegin(d_fptp), [=] __device__(Triple const &t) {
-        return thrust::make_pair(thrust::get<1>(t), thrust::get<2>(t));
+      dh::TypedDiscard<Triple>{}, [d_fptp] __device__(Triple const &t) {
+        d_fptp[thrust::get<0>(t)] =
+            thrust::make_pair(thrust::get<1>(t), thrust::get<2>(t));
+        return t;
       });
   dh::InclusiveScan(
       fptp_it_in, fptp_it_out,
       [=] __device__(Triple const &l, Triple const &r) {
-        uint32_t l_cid = thrust::get<0>(l);
-        uint32_t r_cid = thrust::get<0>(r);
+        uint32_t l_cid = thrust::get<0>(l) / n_samples;
+        uint32_t r_cid = thrust::get<0>(r) / n_samples;
         if (l_cid != r_cid) {
           return r;
         }
-        return Triple(r_cid,  // class_id
+        return Triple(thrust::get<0>(r),
                       thrust::get<1>(l) + thrust::get<1>(r),  // fp
                       thrust::get<2>(l) + thrust::get<2>(r));  // tp
       },
       d_fptp.size());
@@ -521,7 +522,7 @@ GPURankingAUC(common::Span<float const> predts, MetaInfo const &info,
   dh::TemporaryArray<float> d_auc(group_ptr.size() - 1);
   auto s_d_auc = dh::ToSpan(d_auc);
   auto out = thrust::make_transform_output_iterator(
-      Discard(), [=] __device__(RankScanItem const &item) -> RankScanItem {
+      dh::TypedDiscard<RankScanItem>{}, [=] __device__(RankScanItem const &item) -> RankScanItem {
         auto group_id = item.group_id;
         assert(group_id < d_group_ptr.size());
         auto data_group_begin = d_group_ptr[group_id];
diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu
index 36baa9557..9e002f77b 100644
--- a/src/tree/gpu_hist/row_partitioner.cu
+++ b/src/tree/gpu_hist/row_partitioner.cu
@@ -50,9 +50,6 @@ struct WriteResultsFunctor {
   }
 };
 
-// Change the value type of thrust discard iterator so we can use it with cub
-using DiscardOverload = thrust::discard_iterator<IndexFlagTuple>;
-
 // Implement partitioning via single scan operation using transform output to
 // write the result
 void RowPartitioner::SortPosition(common::Span<bst_node_t> position,
@@ -64,7 +61,7 @@ void RowPartitioner::SortPosition(common::Span<bst_node_t> position,
   WriteResultsFunctor write_results{left_nidx,
                                     position, position_out, ridx, ridx_out, d_left_count};
   auto discard_write_iterator =
-      thrust::make_transform_output_iterator(DiscardOverload(), write_results);
+      thrust::make_transform_output_iterator(dh::TypedDiscard<IndexFlagTuple>(), write_results);
   auto counting = thrust::make_counting_iterator(0llu);
   auto input_iterator = dh::MakeTransformIterator<IndexFlagTuple>(
       counting, [=] __device__(size_t idx) {