Enable compiling with system cub. (#7232)
- Tested with all CUDA 11.x.
- Work around the cub scan by using a discard iterator in AUC.
- Limit the size of ArgSort inputs when compiled with CUDA's cub.

parent b18f5f61b0
commit c311a8c1d8

@@ -49,6 +49,7 @@ option(HIDE_CXX_SYMBOLS "Build shared library and hide all C++ symbols" OFF)
 option(USE_CUDA "Build with GPU acceleration" OFF)
 option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
 option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)
+option(BUILD_WITH_CUDA_CUB "Build with cub in CUDA installation" OFF)
 set(GPU_COMPUTE_VER "" CACHE STRING
   "Semicolon separated list of compute versions to be built against, e.g. '35;61'")
 ## Copied From dmlc

@@ -154,8 +154,13 @@ function(xgboost_set_cuda_flags target)
     enable_nvtx(${target})
   endif (USE_NVTX)

-  target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1 -DTHRUST_IGNORE_CUB_VERSION_CHECK=1)
-  target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/cub/ ${xgboost_SOURCE_DIR}/gputreeshap)
+  if (NOT BUILD_WITH_CUDA_CUB)
+    target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1 -DTHRUST_IGNORE_CUB_VERSION_CHECK=1)
+    target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/cub/ ${xgboost_SOURCE_DIR}/gputreeshap)
+  else ()
+    target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1)
+    target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/gputreeshap)
+  endif (NOT BUILD_WITH_CUDA_CUB)

   if (MSVC)
     target_compile_options(${target} PRIVATE

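With the option left OFF (the default), the build keeps vendoring the cub submodule and defines THRUST_IGNORE_CUB_VERSION_CHECK=1, which both silences thrust's complaint about the pinned submodule version and, as the next hunk shows, serves as the compile-time signal for which cub is in use. With the option ON, the submodule include path is dropped and cub comes from the CUDA installation itself; a typical configure line would look something like `cmake .. -DUSE_CUDA=ON -DBUILD_WITH_CUDA_CUB=ON`.
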
@@ -75,6 +75,16 @@ __device__ __forceinline__ double atomicAdd(double* address, double val) { // N
 #endif

 namespace dh {
+
+// FIXME(jiamingy): Remove this once we get rid of cub submodule.
+constexpr bool BuildWithCUDACub() {
+#if defined(THRUST_IGNORE_CUB_VERSION_CHECK) && THRUST_IGNORE_CUB_VERSION_CHECK == 1
+  return false;
+#else
+  return true;
+#endif  // defined(THRUST_IGNORE_CUB_VERSION_CHECK) && THRUST_IGNORE_CUB_VERSION_CHECK == 1
+}
+
 namespace detail {
 template <size_t size>
 struct AtomicDispatcher;

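Because THRUST_IGNORE_CUB_VERSION_CHECK is only defined for submodule builds (see the CMake hunk above), BuildWithCUDACub() doubles as a constexpr probe for which cub the translation unit was compiled against. A minimal sketch of how the later hunks key off it; the OffsetT alias mirrors the ArgSort change below, and the macro is assumed to come from the build system:

    #include <cstddef>
    #include <cstdint>
    #include <type_traits>

    constexpr bool BuildWithCUDACub() {
    #if defined(THRUST_IGNORE_CUB_VERSION_CHECK) && THRUST_IGNORE_CUB_VERSION_CHECK == 1
      return false;  // building against the vendored submodule
    #else
      return true;   // building against cub from the CUDA Toolkit
    #endif
    }

    // The patched submodule accepts 64-bit lengths; stock cub is still
    // 32-bit here, so the offset type narrows accordingly.
    using OffsetT =
        std::conditional_t<!BuildWithCUDACub(), std::ptrdiff_t, std::int32_t>;
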
@@ -689,6 +699,33 @@ typename std::iterator_traits<T>::value_type SumReduction(T in, int nVals) {
   return sum;
 }

+constexpr std::pair<int, int> CUDAVersion() {
+#if defined(__CUDACC_VER_MAJOR__)
+  return std::make_pair(__CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__);
+#else
+  // clang/clang-tidy
+  return std::make_pair((CUDA_VERSION) / 1000, (CUDA_VERSION) % 100 / 10);
+#endif  // defined(__CUDACC_VER_MAJOR__)
+}
+
+namespace detail {
+template <typename T>
+using TypedDiscardCTK114 = thrust::discard_iterator<T>;
+
+template <typename T>
+class TypedDiscard : public thrust::discard_iterator<T> {
+ public:
+  using value_type = T;  // NOLINT
+};
+}  // namespace detail
+
+template <typename T>
+using TypedDiscard =
+    std::conditional_t<((CUDAVersion().first == 11 &&
+                         CUDAVersion().second >= 4) ||
+                        CUDAVersion().first > 11),
+                       detail::TypedDiscardCTK114<T>, detail::TypedDiscard<T>>;
+
 /**
  * \class AllReducer
  *

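The wrapper exists because cub's dispatch machinery deduces its output/accumulator type from std::iterator_traits of the output iterator, and thrust's discard iterator historically reported an internal placeholder type there; subclassing and re-declaring value_type fixes the deduction. From CTK 11.4 onward the stock iterator cooperates with cub directly, hence the version switch. A self-contained sketch of the pre-11.4 shim, with the static_assert stating the property the patch relies on:

    #include <thrust/iterator/discard_iterator.h>

    #include <iterator>
    #include <type_traits>

    // Advertise a real value_type while keeping the discarding assignment
    // behaviour of the base iterator.
    template <typename T>
    class TypedDiscard : public thrust::discard_iterator<T> {
     public:
      using value_type = T;  // NOLINT
    };

    static_assert(std::is_same<
                      std::iterator_traits<TypedDiscard<float>>::value_type,
                      float>::value,
                  "consumers now deduce float, not thrust's placeholder");
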
@@ -1326,7 +1363,7 @@ void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op,
                  OffsetT>::Dispatch(nullptr, bytes, d_in, d_out, scan_op,
                                     cub::NullType(), num_items, nullptr,
                                     false)));
-  dh::TemporaryArray<char> storage(bytes);
+  TemporaryArray<char> storage(bytes);
   safe_cuda((
       cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType,
                         OffsetT>::Dispatch(storage.data().get(), bytes, d_in,

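For readers new to this file: cub entry points follow a two-call protocol, and the hunk above shows both halves, with the first Dispatch sizing the scratch buffer (null temporary storage) and the second doing the work. The same shape with cub's public API, as a minimal sketch with error handling elided and the helper name being mine:

    #include <cub/cub.cuh>

    #include <cstddef>

    void InclusiveSumSketch(int const *d_in, int *d_out, int num_items) {
      void *d_temp = nullptr;
      std::size_t temp_bytes = 0;
      // First call: d_temp == nullptr, so only temp_bytes is computed.
      cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_in, d_out, num_items);
      cudaMalloc(&d_temp, temp_bytes);
      // Second call: same arguments, now with real scratch space.
      cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_in, d_out, num_items);
      cudaFree(d_temp);
    }
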
@@ -1369,24 +1406,27 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_i
   cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(sorted_idx.data()),
                                      sorted_idx_out.data().get());

+  // track https://github.com/NVIDIA/cub/pull/340 for 64bit length support
+  using OffsetT = std::conditional_t<!BuildWithCUDACub(), std::ptrdiff_t, int32_t>;
+  CHECK_LE(sorted_idx.size(), std::numeric_limits<OffsetT>::max());
   if (accending) {
     void *d_temp_storage = nullptr;
-    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, std::ptrdiff_t>::Dispatch(
+    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
         d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
         sizeof(KeyT) * 8, false, nullptr, false)));
     TemporaryArray<char> storage(bytes);
     d_temp_storage = storage.data().get();
-    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, std::ptrdiff_t>::Dispatch(
+    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
         d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
         sizeof(KeyT) * 8, false, nullptr, false)));
   } else {
     void *d_temp_storage = nullptr;
-    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, std::ptrdiff_t>::Dispatch(
+    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
         d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
         sizeof(KeyT) * 8, false, nullptr, false)));
     TemporaryArray<char> storage(bytes);
     d_temp_storage = storage.data().get();
-    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, std::ptrdiff_t>::Dispatch(
+    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
         d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
         sizeof(KeyT) * 8, false, nullptr, false)));
   }

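This is the "limit the size of ArgSort" item from the commit message: the toolkit's cub still counts radix-sort items in 32 bits (the linked NVIDIA/cub#340 adds 64-bit lengths), so OffsetT narrows to int32_t for such builds and the CHECK_LE turns a silent truncation into a hard failure. A standalone restatement of the guard, with an exception standing in for XGBoost's CHECK_LE macro:

    #include <cstddef>
    #include <cstdint>
    #include <limits>
    #include <stdexcept>

    // Reject lengths the active cub build cannot represent; instantiate with
    // int32_t for toolkit cub and ptrdiff_t for the patched submodule.
    template <typename OffsetT>
    void CheckSortLength(std::size_t n) {
      if (n > static_cast<std::size_t>(std::numeric_limits<OffsetT>::max())) {
        throw std::length_error("ArgSort: too many elements for this cub build");
      }
    }
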
@@ -1396,7 +1436,7 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_i
 }

 namespace detail {
-// Wrapper around cub sort for easier `descending` sort and `size_t num_items`.
+// Wrapper around cub sort for easier `descending` sort.
 template <bool descending, typename KeyT, typename ValueT,
           typename OffsetIteratorT>
 void DeviceSegmentedRadixSortPair(

@@ -1408,7 +1448,8 @@ void DeviceSegmentedRadixSortPair(
   cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
   cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in),
                                      d_values_out);
-  using OffsetT = size_t;
+  using OffsetT = int32_t;  // num items in dispatch is also int32_t, no way to change.
+  CHECK_LE(num_items, std::numeric_limits<int32_t>::max());
   safe_cuda((cub::DispatchSegmentedRadixSort<
                  descending, KeyT, ValueT, OffsetIteratorT,
                  OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,

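Note the asymmetry with ArgSort above: here OffsetT is pinned to int32_t in both build modes because, as the new comment says, the dispatcher's num_items parameter is itself 32-bit, so a wider OffsetT would not buy anything. In isolation the added CHECK_LE amounts to the following assertion (a sketch, not XGBoost code):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <limits>

    // num_items must fit in int32_t before it reaches the segmented sort.
    inline void AssertSegmentedSortFits(std::size_t num_items) {
      assert(num_items <=
             static_cast<std::size_t>(std::numeric_limits<std::int32_t>::max()));
    }
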
@@ -183,10 +183,6 @@ struct TupleScanOp {
   }
 };

-// Change the value type of thrust discard iterator so we can use it with cub
-template <typename T>
-using TypedDiscard = thrust::discard_iterator<T>;
-
 // Here the data is already correctly ordered and simply needs to be compacted
 // to remove missing data
 template <typename AdapterBatchT>

@@ -229,7 +225,7 @@ void CopyDataToEllpack(const AdapterBatchT &batch,
   WriteCompressedEllpackFunctor<AdapterBatchT> functor(
       d_compressed_buffer, writer, batch, device_accessor, feature_types,
       is_valid);
-  TypedDiscard<Tuple> discard;
+  dh::TypedDiscard<Tuple> discard;
   thrust::transform_output_iterator<
       WriteCompressedEllpackFunctor<AdapterBatchT>, decltype(discard)>
       out(discard, functor);

@@ -331,24 +331,25 @@ float GPUMultiClassAUCOVR(common::Span<float const> predts, MetaInfo const &info
   // expand to tuple to include class id
   auto fptp_it_in = dh::MakeTransformIterator<Triple>(
       thrust::make_counting_iterator(0), [=] __device__(size_t i) {
-        uint32_t class_id = i / n_samples;
-        return thrust::make_tuple(class_id, d_fptp[i].first, d_fptp[i].second);
+        return thrust::make_tuple(i, d_fptp[i].first, d_fptp[i].second);
       });
   // shrink down to pair
   auto fptp_it_out = thrust::make_transform_output_iterator(
-      dh::tbegin(d_fptp), [=] __device__(Triple const &t) {
-        return thrust::make_pair(thrust::get<1>(t), thrust::get<2>(t));
+      dh::TypedDiscard<Triple>{}, [d_fptp] __device__(Triple const &t) {
+        d_fptp[thrust::get<0>(t)] =
+            thrust::make_pair(thrust::get<1>(t), thrust::get<2>(t));
+        return t;
       });
   dh::InclusiveScan(
       fptp_it_in, fptp_it_out,
       [=] __device__(Triple const &l, Triple const &r) {
-        uint32_t l_cid = thrust::get<0>(l);
-        uint32_t r_cid = thrust::get<0>(r);
+        uint32_t l_cid = thrust::get<0>(l) / n_samples;
+        uint32_t r_cid = thrust::get<0>(r) / n_samples;
         if (l_cid != r_cid) {
           return r;
         }

-        return Triple(r_cid,  // class_id
+        return Triple(thrust::get<0>(r),
                       thrust::get<1>(l) + thrust::get<1>(r),   // fp
                       thrust::get<2>(l) + thrust::get<2>(r));  // tp
       },

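This is the "workaround cub scan by using discard iterator in AUC" item: previously the scan wrote back through dh::tbegin(d_fptp), i.e. into its own input, which newer cub does not tolerate; now the output iterator discards everything and the results are scattered into d_fptp as a side effect of the output functor. To know where to scatter, the tuple carries the flat index i instead of a precomputed class id (recovered as i / n_samples inside the scan op). A self-contained model of the trick, compiled as a .cu file; all names here are mine, not XGBoost code:

    #include <thrust/device_vector.h>
    #include <thrust/execution_policy.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/discard_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/transform_output_iterator.h>
    #include <thrust/scan.h>

    // Same shim as in the device_helpers hunk above.
    template <typename T>
    class TypedDiscard : public thrust::discard_iterator<T> {
     public:
      using value_type = T;  // NOLINT
    };

    struct IdxVal { int idx; int val; };

    struct MakeItem {  // each input element carries its own flat index
      __device__ IdxVal operator()(int i) const { return {i, 1}; }
    };

    struct SumKeepIdx {  // scan op: sum payloads, keep the right-hand index
      __device__ IdxVal operator()(IdxVal l, IdxVal r) const {
        return {r.idx, l.val + r.val};
      }
    };

    struct Scatter {  // output side: store each prefix sum by its index ...
      int *out;
      __device__ IdxVal operator()(IdxVal v) const {
        out[v.idx] = v.val;
        return v;  // ... then let the discard iterator swallow the value
      }
    };

    int main() {
      int const n = 8;
      thrust::device_vector<int> result(n);
      auto in = thrust::make_transform_iterator(
          thrust::make_counting_iterator(0), MakeItem{});
      auto out = thrust::make_transform_output_iterator(
          TypedDiscard<IdxVal>{},
          Scatter{thrust::raw_pointer_cast(result.data())});
      thrust::inclusive_scan(thrust::device, in, in + n, out, SumKeepIdx{});
      // result now holds {1, 2, 3, 4, 5, 6, 7, 8}.
      return 0;
    }
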
@@ -521,7 +522,7 @@ GPURankingAUC(common::Span<float const> predts, MetaInfo const &info,
   dh::TemporaryArray<float> d_auc(group_ptr.size() - 1);
   auto s_d_auc = dh::ToSpan(d_auc);
   auto out = thrust::make_transform_output_iterator(
-      Discard<RankScanItem>(), [=] __device__(RankScanItem const &item) -> RankScanItem {
+      dh::TypedDiscard<RankScanItem>{}, [=] __device__(RankScanItem const &item) -> RankScanItem {
        auto group_id = item.group_id;
        assert(group_id < d_group_ptr.size());
        auto data_group_begin = d_group_ptr[group_id];

@@ -50,9 +50,6 @@ struct WriteResultsFunctor {
   }
 };

-// Change the value type of thrust discard iterator so we can use it with cub
-using DiscardOverload = thrust::discard_iterator<IndexFlagTuple>;
-
 // Implement partitioning via single scan operation using transform output to
 // write the result
 void RowPartitioner::SortPosition(common::Span<bst_node_t> position,

@@ -64,7 +61,7 @@ void RowPartitioner::SortPosition(common::Span<bst_node_t> position,
   WriteResultsFunctor write_results{left_nidx, position, position_out,
                                     ridx, ridx_out, d_left_count};
   auto discard_write_iterator =
-      thrust::make_transform_output_iterator(DiscardOverload(), write_results);
+      thrust::make_transform_output_iterator(dh::TypedDiscard<IndexFlagTuple>(), write_results);
   auto counting = thrust::make_counting_iterator(0llu);
   auto input_iterator = dh::MakeTransformIterator<IndexFlagTuple>(
       counting, [=] __device__(size_t idx) {