[backport] Support building with CTK11.5. (#7379) (#7411)

* Support building with CTK11.5.

* Require system cub installation for CTK11.4+.
* Check thrust version for segmented sort.
This commit is contained in:
Jiaming Yuan 2021-11-10 19:23:29 +08:00 committed by GitHub
parent e7ac2486eb
commit 11f8b5cfcd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 35 additions and 11 deletions

View File

@ -135,6 +135,10 @@ if (USE_CUDA)
set(GEN_CODE "") set(GEN_CODE "")
format_gencode_flags("${GPU_COMPUTE_VER}" GEN_CODE) format_gencode_flags("${GPU_COMPUTE_VER}" GEN_CODE)
add_subdirectory(${PROJECT_SOURCE_DIR}/gputreeshap) add_subdirectory(${PROJECT_SOURCE_DIR}/gputreeshap)
if ((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 11.4) AND (NOT BUILD_WITH_CUDA_CUB))
message(SEND_ERROR "`BUILD_WITH_CUDA_CUB` should be set to `ON` for CUDA >= 11.4")
endif ()
endif (USE_CUDA) endif (USE_CUDA)
if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND

View File

@ -148,7 +148,8 @@ From the command line on Linux starting from the XGBoost directory:
mkdir build mkdir build
cd build cd build
cmake .. -DUSE_CUDA=ON # For CUDA toolkit >= 11.4, `BUILD_WITH_CUDA_CUB` is required.
cmake .. -DUSE_CUDA=ON -DBUILD_WITH_CUDA_CUB=ON
make -j4 make -j4
.. note:: Specifying compute capability .. note:: Specifying compute capability

View File

@ -711,6 +711,12 @@ constexpr std::pair<int, int> CUDAVersion() {
constexpr std::pair<int32_t, int32_t> ThrustVersion() { constexpr std::pair<int32_t, int32_t> ThrustVersion() {
return std::make_pair(THRUST_MAJOR_VERSION, THRUST_MINOR_VERSION); return std::make_pair(THRUST_MAJOR_VERSION, THRUST_MINOR_VERSION);
} }
// Whether do we have thrust 1.x with x >= minor
template <int32_t minor>
constexpr bool HasThrustMinorVer() {
return (ThrustVersion().first == 1 && ThrustVersion().second >= minor) ||
ThrustVersion().first > 1;
}
namespace detail { namespace detail {
template <typename T> template <typename T>
@ -725,10 +731,8 @@ class TypedDiscard : public thrust::discard_iterator<T> {
template <typename T> template <typename T>
using TypedDiscard = using TypedDiscard =
std::conditional_t<((ThrustVersion().first == 1 && std::conditional_t<HasThrustMinorVer<12>(), detail::TypedDiscardCTK114<T>,
ThrustVersion().second >= 12) || detail::TypedDiscard<T>>;
ThrustVersion().first > 1),
detail::TypedDiscardCTK114<T>, detail::TypedDiscard<T>>;
/** /**
* \class AllReducer * \class AllReducer
@ -1442,24 +1446,39 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_i
namespace detail { namespace detail {
// Wrapper around cub sort for easier `descending` sort. // Wrapper around cub sort for easier `descending` sort.
template <bool descending, typename KeyT, typename ValueT, template <bool descending, typename KeyT, typename ValueT,
typename OffsetIteratorT> typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
void DeviceSegmentedRadixSortPair( void DeviceSegmentedRadixSortPair(
void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, // NOLINT void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, // NOLINT
KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out,
size_t num_items, size_t num_segments, OffsetIteratorT d_begin_offsets, size_t num_items, size_t num_segments, BeginOffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets, int begin_bit = 0, EndOffsetIteratorT d_end_offsets, int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8) { int end_bit = sizeof(KeyT) * 8) {
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out); cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in), cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in),
d_values_out); d_values_out);
using OffsetT = int32_t; // num items in dispatch is also int32_t, no way to change. // In old version of cub, num_items in dispatch is also int32_t, no way to change.
CHECK_LE(num_items, std::numeric_limits<int32_t>::max()); using OffsetT =
std::conditional_t<BuildWithCUDACub() && HasThrustMinorVer<13>(), size_t,
int32_t>;
CHECK_LE(num_items, std::numeric_limits<OffsetT>::max());
// For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation
#if (THRUST_MAJOR_VERSION == 1 && THRUST_MINOR_VERSION >= 13) || THRUST_MAJOR_VERSION > 1
safe_cuda((cub::DispatchSegmentedRadixSort< safe_cuda((cub::DispatchSegmentedRadixSort<
descending, KeyT, ValueT, OffsetIteratorT, descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT,
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
d_values, num_items, num_segments, d_values, num_items, num_segments,
d_begin_offsets, d_end_offsets, begin_bit, d_begin_offsets, d_end_offsets, begin_bit,
end_bit, false, nullptr, false))); end_bit, false, nullptr, false)));
#else
safe_cuda((cub::DispatchSegmentedRadixSort<
descending, KeyT, ValueT, BeginOffsetIteratorT,
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
d_values, num_items, num_segments,
d_begin_offsets, d_end_offsets, begin_bit,
end_bit, false, nullptr, false)));
#endif
} }
} // namespace detail } // namespace detail