From c311a8c1d845f27b9c80d82da2c5810dc1f6b83c Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Fri, 17 Sep 2021 14:28:18 +0800
Subject: [PATCH] Enable compiling with system cub. (#7232)

- Tested with all CUDA 11.x releases.
- Work around cub's scan requirements by using a typed discard iterator in AUC.
- Limit the size of ArgSort when compiled with CUDA's cub.
---
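Notes (editor's commentary, not part of the commit):

The BUILD_WITH_CUDA_CUB switch reuses THRUST_IGNORE_CUB_VERSION_CHECK as a
marker: the macro is only defined when building against the vendored cub
submodule, so dh::BuildWithCUDACub() can detect at compile time which cub is
in use.

The discard-iterator change exists because the scan dispatch in the cub
versions this workaround targets deduces its accumulator type from the output
iterator's value_type, which thrust::discard_iterator does not provide; the
dh::TypedDiscard wrapper simply overrides value_type, and the
CUDAVersion()-gated alias in the diff falls back to the stock discard
iterator from CTK 11.4 onwards, where this is no longer needed. The sketch
below shows the shape of the trick outside XGBoost: scan over (index, value)
items and store the real result as a side effect of a
transform_output_iterator while the scan output itself is discarded. It is a
minimal illustration, not code from this patch: it calls
cub::DeviceScan::InclusiveScan rather than the raw DispatchScan behind
dh::InclusiveScan, and Item, MakeItem, WriteItem, CombineItem, and main() are
made-up names.

    // typed_discard_demo.cu -- compile with: nvcc typed_discard_demo.cu
    #include <cub/cub.cuh>
    #include <thrust/device_vector.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/discard_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/transform_output_iterator.h>
    #include <thrust/pair.h>

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Same idea as dh::detail::TypedDiscard: expose a concrete value_type
    // so the scan implementation can deduce its accumulator type.
    template <typename T>
    class TypedDiscard : public thrust::discard_iterator<T> {
     public:
      using value_type = T;  // NOLINT
    };

    using Item = thrust::pair<std::size_t, int>;  // (original index, value)

    struct MakeItem {  // attach the element index to each input value
      int const *d_in;
      __host__ __device__ Item operator()(std::size_t i) const {
        return Item(i, d_in[i]);
      }
    };

    struct WriteItem {  // the side effect that carries the real result
      int *d_out;
      __host__ __device__ Item operator()(Item const &item) const {
        d_out[item.first] = item.second;
        return item;  // returned value is assigned to the discard iterator
      }
    };

    struct CombineItem {  // sum the values, keep the right-hand index
      __host__ __device__ Item operator()(Item const &l, Item const &r) const {
        return Item(r.first, l.second + r.second);
      }
    };

    int main() {
      std::vector<int> h_values{1, 2, 3, 4};
      thrust::device_vector<int> values(h_values.begin(), h_values.end());
      thrust::device_vector<int> prefix(values.size());

      auto in = thrust::make_transform_iterator(
          thrust::make_counting_iterator<std::size_t>(0),
          MakeItem{values.data().get()});
      auto out = thrust::make_transform_output_iterator(
          TypedDiscard<Item>{}, WriteItem{prefix.data().get()});

      // The usual two-phase cub call: query the scratch size, then scan.
      std::size_t bytes = 0;
      cub::DeviceScan::InclusiveScan(nullptr, bytes, in, out, CombineItem{},
                                     static_cast<int>(values.size()));
      thrust::device_vector<char> temp(bytes);
      cub::DeviceScan::InclusiveScan(temp.data().get(), bytes, in, out,
                                     CombineItem{},
                                     static_cast<int>(values.size()));

      for (std::size_t i = 0; i < prefix.size(); ++i) {
        std::printf("%d ", static_cast<int>(prefix[i]));  // prints: 1 3 6 10
      }
      std::printf("\n");
      return 0;
    }

The ArgSort limit exists because the radix-sort dispatch in CUDA's cub only
accepts a 32-bit num_items (see the tracking link in the diff), hence the
CHECK_LE guard; the vendored submodule keeps a wider offset type. To build
against the system cub, configure with cmake -DUSE_CUDA=ON
-DBUILD_WITH_CUDA_CUB=ON.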
 CMakeLists.txt                       |  1 +
 cmake/Utils.cmake                    |  9 ++++-
 src/common/device_helpers.cuh        | 55 ++++++++++++++++++++++++----
 src/data/ellpack_page.cu             |  6 +--
 src/metric/auc.cu                    | 17 +++++----
 src/tree/gpu_hist/row_partitioner.cu |  5 +--
 6 files changed, 67 insertions(+), 26 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b6a042668..bd21e9b67 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,6 +49,7 @@ option(HIDE_CXX_SYMBOLS "Build shared library and hide all C++ symbols" OFF)
 option(USE_CUDA "Build with GPU acceleration" OFF)
 option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
 option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)
+option(BUILD_WITH_CUDA_CUB "Build with cub in the CUDA installation" OFF)
 set(GPU_COMPUTE_VER "" CACHE STRING
   "Semicolon separated list of compute versions to be built against, e.g. '35;61'")
 ## Copied From dmlc
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index 3684c250c..645f31fef 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -154,8 +154,13 @@ function(xgboost_set_cuda_flags target)
     enable_nvtx(${target})
   endif (USE_NVTX)
 
-  target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1 -DTHRUST_IGNORE_CUB_VERSION_CHECK=1)
-  target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/cub/ ${xgboost_SOURCE_DIR}/gputreeshap)
+  if (NOT BUILD_WITH_CUDA_CUB)
+    target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1 -DTHRUST_IGNORE_CUB_VERSION_CHECK=1)
+    target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/cub/ ${xgboost_SOURCE_DIR}/gputreeshap)
+  else ()
+    target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1)
+    target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/gputreeshap)
+  endif (NOT BUILD_WITH_CUDA_CUB)
 
   if (MSVC)
     target_compile_options(${target} PRIVATE
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 699a3f627..c2c7ff07b 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -75,6 +75,16 @@ __device__ __forceinline__ double atomicAdd(double* address, double val) {  // NOLINT
 #endif
 
 namespace dh {
+
+// FIXME(jiamingy): Remove this once we get rid of the cub submodule.
+constexpr bool BuildWithCUDACub() {
+#if defined(THRUST_IGNORE_CUB_VERSION_CHECK) && THRUST_IGNORE_CUB_VERSION_CHECK == 1
+  return false;
+#else
+  return true;
+#endif  // defined(THRUST_IGNORE_CUB_VERSION_CHECK) && THRUST_IGNORE_CUB_VERSION_CHECK == 1
+}
+
 namespace detail {
 template <size_t size>
 struct AtomicDispatcher;
@@ -689,6 +699,33 @@ typename std::iterator_traits<T>::value_type SumReduction(T in, int nVals) {
   return sum;
 }
 
+constexpr std::pair<int, int> CUDAVersion() {
+#if defined(__CUDACC_VER_MAJOR__)
+  return std::make_pair(__CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__);
+#else
+  // clang/clang-tidy
+  return std::make_pair((CUDA_VERSION) / 1000, (CUDA_VERSION) % 100 / 10);
+#endif  // defined(__CUDACC_VER_MAJOR__)
+}
+
+namespace detail {
+template <typename T>
+using TypedDiscardCTK114 = thrust::discard_iterator<T>;
+
+template <typename T>
+class TypedDiscard : public thrust::discard_iterator<T> {
+ public:
+  using value_type = T;  // NOLINT
+};
+}  // namespace detail
+
+template <typename T>
+using TypedDiscard =
+    std::conditional_t<((CUDAVersion().first == 11 &&
+                         CUDAVersion().second >= 4) ||
+                        CUDAVersion().first > 11),
+                       detail::TypedDiscardCTK114<T>, detail::TypedDiscard<T>>;
+
 /**
  * \class AllReducer
  *
@@ -1326,7 +1363,7 @@ void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op,
                  OffsetT>::Dispatch(nullptr, bytes, d_in, d_out, scan_op,
                                     cub::NullType(), num_items, nullptr,
                                     false)));
-  dh::TemporaryArray<char> storage(bytes);
+  TemporaryArray<char> storage(bytes);
   safe_cuda((
       cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType,
                         OffsetT>::Dispatch(storage.data().get(), bytes, d_in,
@@ -1369,24 +1406,27 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_idx) {
   cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(sorted_idx.data()),
                                      sorted_idx_out.data().get());
 
+  // track https://github.com/NVIDIA/cub/pull/340 for 64-bit length support
+  using OffsetT = std::conditional_t<!BuildWithCUDACub(), std::ptrdiff_t, int32_t>;
+  CHECK_LE(sorted_idx.size(), std::numeric_limits<OffsetT>::max());
   if (accending) {
     void *d_temp_storage = nullptr;
-    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, size_t>::Dispatch(
+    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
         d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
         sizeof(KeyT) * 8, false, nullptr, false)));
     TemporaryArray<char> storage(bytes);
     d_temp_storage = storage.data().get();
-    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, size_t>::Dispatch(
+    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
         d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
         sizeof(KeyT) * 8, false, nullptr, false)));
   } else {
     void *d_temp_storage = nullptr;
-    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, size_t>::Dispatch(
+    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
         d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
         sizeof(KeyT) * 8, false, nullptr, false)));
     TemporaryArray<char> storage(bytes);
     d_temp_storage = storage.data().get();
-    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, size_t>::Dispatch(
+    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
         d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
         sizeof(KeyT) * 8, false, nullptr, false)));
   }
@@ -1396,7 +1436,7 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_idx) {
 }
 
 namespace detail {
-// Wrapper around cub sort for easier `descending` sort and `size_t num_items`.
+// Wrapper around cub sort for easier `descending` sort.
 template <bool descending, typename KeyT, typename ValueT,
           typename OffsetIteratorT>
 void DeviceSegmentedRadixSortPair(
@@ -1408,7 +1448,8 @@ void DeviceSegmentedRadixSortPair(
   cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
   cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in),
                                      d_values_out);
-  using OffsetT = size_t;
+  using OffsetT = int32_t;  // num_items in the dispatch is also int32_t; no way to change it.
+  CHECK_LE(num_items, std::numeric_limits<OffsetT>::max());
   safe_cuda((cub::DispatchSegmentedRadixSort<
                 descending, KeyT, ValueT, OffsetIteratorT,
                 OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index 97eccf682..82d90eb13 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -183,10 +183,6 @@ struct TupleScanOp {
   }
 };
 
-// Change the value type of thrust discard iterator so we can use it with cub
-template <typename T>
-using TypedDiscard = thrust::discard_iterator<T>;
-
 // Here the data is already correctly ordered and simply needs to be compacted
 // to remove missing data
 template <typename AdapterBatchT>
@@ -229,7 +225,7 @@ void CopyDataToEllpack(const AdapterBatchT &batch,
   WriteCompressedEllpackFunctor<AdapterBatchT> functor(
       d_compressed_buffer, writer, batch, device_accessor, feature_types,
       is_valid);
-  TypedDiscard<Tuple> discard;
+  dh::TypedDiscard<Tuple> discard;
   thrust::transform_output_iterator<
       WriteCompressedEllpackFunctor<AdapterBatchT>, decltype(discard)>
       out(discard, functor);
diff --git a/src/metric/auc.cu b/src/metric/auc.cu
index 2e33a2a5f..63c928c53 100644
--- a/src/metric/auc.cu
+++ b/src/metric/auc.cu
@@ -331,24 +331,25 @@ float GPUMultiClassAUCOVR(common::Span<float const> predts, MetaInfo const &info,
   // expand to tuple to include class id
   auto fptp_it_in = dh::MakeTransformIterator<Triple>(
       thrust::make_counting_iterator(0), [=] __device__(size_t i) {
-        uint32_t class_id = i / n_samples;
-        return thrust::make_tuple(class_id, d_fptp[i].first, d_fptp[i].second);
+        return thrust::make_tuple(i, d_fptp[i].first, d_fptp[i].second);
       });
   // shrink down to pair
   auto fptp_it_out = thrust::make_transform_output_iterator(
-      dh::tbegin(d_fptp), [=] __device__(Triple const &t) {
-        return thrust::make_pair(thrust::get<1>(t), thrust::get<2>(t));
+      dh::TypedDiscard<Triple>{}, [d_fptp] __device__(Triple const &t) {
+        d_fptp[thrust::get<0>(t)] =
+            thrust::make_pair(thrust::get<1>(t), thrust::get<2>(t));
+        return t;
       });
   dh::InclusiveScan(
       fptp_it_in, fptp_it_out,
       [=] __device__(Triple const &l, Triple const &r) {
-        uint32_t l_cid = thrust::get<0>(l);
-        uint32_t r_cid = thrust::get<0>(r);
+        uint32_t l_cid = thrust::get<0>(l) / n_samples;
+        uint32_t r_cid = thrust::get<0>(r) / n_samples;
         if (l_cid != r_cid) {
           return r;
         }
-        return Triple(r_cid,  // class_id
+        return Triple(thrust::get<0>(r),
                       thrust::get<1>(l) + thrust::get<1>(r),  // fp
                       thrust::get<2>(l) + thrust::get<2>(r));  // tp
       },
       d_fptp.size());
@@ -521,7 +522,7 @@ GPURankingAUC(common::Span<float const> predts, MetaInfo const &info,
   dh::TemporaryArray<float> d_auc(group_ptr.size() - 1);
   auto s_d_auc = dh::ToSpan(d_auc);
   auto out = thrust::make_transform_output_iterator(
-      Discard(), [=] __device__(RankScanItem const &item) -> RankScanItem {
+      dh::TypedDiscard<RankScanItem>{}, [=] __device__(RankScanItem const &item) -> RankScanItem {
         auto group_id = item.group_id;
         assert(group_id < d_group_ptr.size());
         auto data_group_begin = d_group_ptr[group_id];
diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu
index 36baa9557..9e002f77b 100644
--- a/src/tree/gpu_hist/row_partitioner.cu
+++ b/src/tree/gpu_hist/row_partitioner.cu
@@ -50,9 +50,6 @@ struct WriteResultsFunctor {
   }
 };
 
-// Change the value type of thrust discard iterator so we can use it with cub
-using DiscardOverload = thrust::discard_iterator<IndexFlagTuple>;
-
 // Implement partitioning via single scan operation using transform output to
 // write the result
 void RowPartitioner::SortPosition(common::Span<bst_node_t> position,
@@ -64,7 +61,7 @@ void RowPartitioner::SortPosition(common::Span<bst_node_t> position,
   WriteResultsFunctor write_results{left_nidx,
                                     position, position_out, ridx, ridx_out, d_left_count};
   auto discard_write_iterator =
-      thrust::make_transform_output_iterator(DiscardOverload(), write_results);
+      thrust::make_transform_output_iterator(dh::TypedDiscard<IndexFlagTuple>(), write_results);
   auto counting = thrust::make_counting_iterator(0llu);
   auto input_iterator = dh::MakeTransformIterator<IndexFlagTuple>(
       counting, [=] __device__(size_t idx) {