fix auc.cu
This commit is contained in:
parent
6eba0a56ec
commit
a56055225a
@ -4,7 +4,11 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
#include "../common/device_helpers.hip.h"
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
#include "../common/device_helpers.cuh"
|
#include "../common/device_helpers.cuh"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace collective {
|
namespace collective {
|
||||||
|
|||||||
@ -10,14 +10,26 @@
|
|||||||
|
|
||||||
#include <cstddef> // size_t
|
#include <cstddef> // size_t
|
||||||
#include <cstdint> // int32_t
|
#include <cstdint> // int32_t
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
#include <hipcub/hipcub.hpp>
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
#include <cub/cub.cuh> // DispatchSegmentedRadixSort,NullType,DoubleBuffer
|
#include <cub/cub.cuh> // DispatchSegmentedRadixSort,NullType,DoubleBuffer
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <iterator> // distance
|
#include <iterator> // distance
|
||||||
#include <limits> // numeric_limits
|
#include <limits> // numeric_limits
|
||||||
#include <type_traits> // conditional_t,remove_const_t
|
#include <type_traits> // conditional_t,remove_const_t
|
||||||
|
|
||||||
#include "common.h" // safe_cuda
|
#include "common.h" // safe_cuda
|
||||||
#include "cuda_context.cuh" // CUDAContext
|
#include "cuda_context.cuh" // CUDAContext
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
#include "device_helpers.hip.h"
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
#include "device_helpers.cuh" // TemporaryArray,SegmentId,LaunchN,Iota,device_vector
|
#include "device_helpers.cuh" // TemporaryArray,SegmentId,LaunchN,Iota,device_vector
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "xgboost/base.h" // XGBOOST_DEVICE
|
#include "xgboost/base.h" // XGBOOST_DEVICE
|
||||||
#include "xgboost/context.h" // Context
|
#include "xgboost/context.h" // Context
|
||||||
#include "xgboost/logging.h" // CHECK
|
#include "xgboost/logging.h" // CHECK
|
||||||
@ -39,6 +51,7 @@ static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_st
|
|||||||
using OffsetT = int;
|
using OffsetT = int;
|
||||||
|
|
||||||
// Null value type
|
// Null value type
|
||||||
|
#if defined(XGBOOST_USE_CUDA)
|
||||||
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
|
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
|
||||||
cub::DoubleBuffer<cub::NullType> d_values;
|
cub::DoubleBuffer<cub::NullType> d_values;
|
||||||
|
|
||||||
@ -47,6 +60,20 @@ static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_st
|
|||||||
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items,
|
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items,
|
||||||
num_segments, d_begin_offsets, d_end_offsets, begin_bit,
|
num_segments, d_begin_offsets, d_end_offsets, begin_bit,
|
||||||
end_bit, false, ctx->Stream(), debug_synchronous)));
|
end_bit, false, ctx->Stream(), debug_synchronous)));
|
||||||
|
#elif defined(XGBOOST_USE_HIP)
|
||||||
|
if (IS_DESCENDING) {
|
||||||
|
rocprim::segmented_radix_sort_pairs_desc<KeyT, hipcub::NullType, BeginOffsetIteratorT>(d_temp_storage,
|
||||||
|
temp_storage_bytes, d_keys_in, d_keys_out, nullptr, nullptr, num_items,
|
||||||
|
num_segments, d_begin_offsets, d_end_offsets,
|
||||||
|
begin_bit, end_bit, ctx->Stream(), debug_synchronous);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
rocprim::segmented_radix_sort_pairs<KeyT, hipcub::NullType, BeginOffsetIteratorT>(d_temp_storage,
|
||||||
|
temp_storage_bytes, d_keys_in, d_keys_out, nullptr, nullptr, num_items,
|
||||||
|
num_segments, d_begin_offsets, d_end_offsets,
|
||||||
|
begin_bit, end_bit, ctx->Stream(), debug_synchronous);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wrapper around cub sort for easier `descending` sort.
|
// Wrapper around cub sort for easier `descending` sort.
|
||||||
@ -60,14 +87,18 @@ void DeviceSegmentedRadixSortPair(void *d_temp_storage,
|
|||||||
BeginOffsetIteratorT d_begin_offsets,
|
BeginOffsetIteratorT d_begin_offsets,
|
||||||
EndOffsetIteratorT d_end_offsets, dh::CUDAStreamView stream,
|
EndOffsetIteratorT d_end_offsets, dh::CUDAStreamView stream,
|
||||||
int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) {
|
int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) {
|
||||||
|
#if defined(XGBOOST_USE_CUDA)
|
||||||
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
|
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
|
||||||
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in), d_values_out);
|
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in), d_values_out);
|
||||||
|
#endif
|
||||||
|
|
||||||
// In old version of cub, num_items in dispatch is also int32_t, no way to change.
|
// In old version of cub, num_items in dispatch is also int32_t, no way to change.
|
||||||
using OffsetT = std::conditional_t<dh::BuildWithCUDACub() && dh::HasThrustMinorVer<13>(),
|
using OffsetT = std::conditional_t<dh::BuildWithCUDACub() && dh::HasThrustMinorVer<13>(),
|
||||||
std::size_t, std::int32_t>;
|
std::size_t, std::int32_t>;
|
||||||
CHECK_LE(num_items, std::numeric_limits<OffsetT>::max());
|
CHECK_LE(num_items, std::numeric_limits<OffsetT>::max());
|
||||||
// For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation
|
// For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_CUDA)
|
||||||
#if THRUST_MAJOR_VERSION >= 2
|
#if THRUST_MAJOR_VERSION >= 2
|
||||||
dh::safe_cuda((cub::DispatchSegmentedRadixSort<
|
dh::safe_cuda((cub::DispatchSegmentedRadixSort<
|
||||||
descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT,
|
descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT,
|
||||||
@ -88,6 +119,18 @@ void DeviceSegmentedRadixSortPair(void *d_temp_storage,
|
|||||||
d_begin_offsets, d_end_offsets, begin_bit,
|
d_begin_offsets, d_end_offsets, begin_bit,
|
||||||
end_bit, false, stream, false)));
|
end_bit, false, stream, false)));
|
||||||
#endif
|
#endif
|
||||||
|
#elif defined(XGBOOST_USE_HIP)
|
||||||
|
if (descending) {
|
||||||
|
rocprim::segmented_radix_sort_pairs_desc(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
||||||
|
d_values_in, d_values_out, num_items, num_segments,
|
||||||
|
d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream, false);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
rocprim::segmented_radix_sort_pairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
||||||
|
d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets,
|
||||||
|
begin_bit, end_bit, stream, false);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
} // namespace detail
|
} // namespace detail
|
||||||
|
|
||||||
|
|||||||
@ -1208,8 +1208,7 @@ void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op,
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
safe_cuda((rocprim::inclusive_scan<InputIteratorT, OutputIteratorT, ScanOpT>(nullptr,
|
safe_cuda((rocprim::inclusive_scan(nullptr, bytes, d_in, d_out, (size_t) num_items, scan_op)));
|
||||||
bytes, d_in, d_out, num_items, scan_op)));
|
|
||||||
|
|
||||||
TemporaryArray<char> storage(bytes);
|
TemporaryArray<char> storage(bytes);
|
||||||
|
|
||||||
@ -1229,8 +1228,7 @@ void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op,
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
safe_cuda((rocprim::inclusive_scan<InputIteratorT, OutputIteratorT, ScanOpT>(
|
safe_cuda((rocprim::inclusive_scan(storage.data().get(), bytes, d_in, d_out, (size_t) num_items, scan_op)));
|
||||||
storage.data().get(), bytes, d_in, d_out, num_items, scan_op)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename InIt, typename OutIt, typename Predicate>
|
template <typename InIt, typename OutIt, typename Predicate>
|
||||||
@ -1262,11 +1260,7 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_i
|
|||||||
using ValueT = std::remove_const_t<IdxT>;
|
using ValueT = std::remove_const_t<IdxT>;
|
||||||
|
|
||||||
TemporaryArray<KeyT> out(keys.size());
|
TemporaryArray<KeyT> out(keys.size());
|
||||||
hipcub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(keys.data()),
|
|
||||||
out.data().get());
|
|
||||||
TemporaryArray<IdxT> sorted_idx_out(sorted_idx.size());
|
TemporaryArray<IdxT> sorted_idx_out(sorted_idx.size());
|
||||||
hipcub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(sorted_idx.data()),
|
|
||||||
sorted_idx_out.data().get());
|
|
||||||
|
|
||||||
// track https://github.com/NVIDIA/cub/pull/340 for 64bit length support
|
// track https://github.com/NVIDIA/cub/pull/340 for 64bit length support
|
||||||
using OffsetT = std::conditional_t<!BuildWithCUDACub(), std::ptrdiff_t, int32_t>;
|
using OffsetT = std::conditional_t<!BuildWithCUDACub(), std::ptrdiff_t, int32_t>;
|
||||||
@ -1286,8 +1280,8 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_i
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
safe_cuda((rocprim::radix_sort_pairs<KeyT, ValueT, OffsetT>(d_temp_storage,
|
safe_cuda((rocprim::radix_sort_pairs(d_temp_storage,
|
||||||
bytes, d_keys, d_values, sorted_idx.size(), 0,
|
bytes, keys.data(), out.data().get(), sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(), 0,
|
||||||
sizeof(KeyT) * 8)));
|
sizeof(KeyT) * 8)));
|
||||||
|
|
||||||
TemporaryArray<char> storage(bytes);
|
TemporaryArray<char> storage(bytes);
|
||||||
@ -1305,8 +1299,8 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_i
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
safe_cuda((rocprim::radix_sort_pairs<KeyT, ValueT, OffsetT>(d_temp_storage,
|
safe_cuda((rocprim::radix_sort_pairs(d_temp_storage,
|
||||||
bytes, d_keys, d_values, sorted_idx.size(), 0,
|
bytes, keys.data(), out.data().get(), sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(), 0,
|
||||||
sizeof(KeyT) * 8)));
|
sizeof(KeyT) * 8)));
|
||||||
} else {
|
} else {
|
||||||
void *d_temp_storage = nullptr;
|
void *d_temp_storage = nullptr;
|
||||||
@ -1323,8 +1317,8 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_i
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
safe_cuda((rocprim::radix_sort_pairs<KeyT, ValueT, OffsetT>(d_temp_storage,
|
safe_cuda((rocprim::radix_sort_pairs(d_temp_storage,
|
||||||
bytes, d_keys, d_values, sorted_idx.size(), 0,
|
bytes, keys.data(), out.data().get(), sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(), 0,
|
||||||
sizeof(KeyT) * 8)));
|
sizeof(KeyT) * 8)));
|
||||||
|
|
||||||
TemporaryArray<char> storage(bytes);
|
TemporaryArray<char> storage(bytes);
|
||||||
@ -1341,8 +1335,8 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_i
|
|||||||
sizeof(KeyT) * 8, false, nullptr, false)));
|
sizeof(KeyT) * 8, false, nullptr, false)));
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
safe_cuda((rocprim::radix_sort_pairs<KeyT, ValueT, OffsetT>(d_temp_storage,
|
safe_cuda((rocprim::radix_sort_pairs(d_temp_storage,
|
||||||
bytes, d_keys, d_values, sorted_idx.size(), 0,
|
bytes, keys.data(), out.data().get(), sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(), 0,
|
||||||
sizeof(KeyT) * 8)));
|
sizeof(KeyT) * 8)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -9,7 +9,13 @@
|
|||||||
|
|
||||||
#include "./math.h" // Sqr
|
#include "./math.h" // Sqr
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
#include "device_helpers.hip.h"
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
#include "device_helpers.cuh" // LaunchN
|
#include "device_helpers.cuh" // LaunchN
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "xgboost/base.h" // XGBOOST_DEVICE
|
#include "xgboost/base.h" // XGBOOST_DEVICE
|
||||||
#include "xgboost/span.h" // Span
|
#include "xgboost/span.h" // Span
|
||||||
|
|
||||||
@ -67,7 +73,7 @@ SegmentedTrapezoidThreads(xgboost::common::Span<U> group_ptr,
|
|||||||
dh::safe_cuda(hipMemcpy(
|
dh::safe_cuda(hipMemcpy(
|
||||||
&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
|
&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
|
||||||
sizeof(total), hipMemcpyDeviceToHost));
|
sizeof(total), hipMemcpyDeviceToHost));
|
||||||
#else
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
dh::safe_cuda(cudaMemcpy(
|
dh::safe_cuda(cudaMemcpy(
|
||||||
&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
|
&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
|
||||||
sizeof(total), cudaMemcpyDeviceToHost));
|
sizeof(total), cudaMemcpyDeviceToHost));
|
||||||
|
|||||||
@ -393,7 +393,7 @@ XGBOOST_REGISTER_METRIC(EvalAUC, "auc")
|
|||||||
.describe("Receiver Operating Characteristic Area Under the Curve.")
|
.describe("Receiver Operating Characteristic Area Under the Curve.")
|
||||||
.set_body([](const char*) { return new EvalROCAUC(); });
|
.set_body([](const char*) { return new EvalROCAUC(); });
|
||||||
|
|
||||||
#if !defined(XGBOOST_USE_CUDA)
|
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||||
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const>, MetaInfo const &,
|
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const>, MetaInfo const &,
|
||||||
std::int32_t,
|
std::int32_t,
|
||||||
std::shared_ptr<DeviceAUCCache> *) {
|
std::shared_ptr<DeviceAUCCache> *) {
|
||||||
@ -414,7 +414,7 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *, common::Span<flo
|
|||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
struct DeviceAUCCache {};
|
struct DeviceAUCCache {};
|
||||||
#endif // !defined(XGBOOST_USE_CUDA)
|
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||||
|
|
||||||
class EvalPRAUC : public EvalAUC<EvalPRAUC> {
|
class EvalPRAUC : public EvalAUC<EvalPRAUC> {
|
||||||
std::shared_ptr<DeviceAUCCache> d_cache_;
|
std::shared_ptr<DeviceAUCCache> d_cache_;
|
||||||
@ -471,7 +471,7 @@ XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")
|
|||||||
.describe("Area under PR curve for both classification and rank.")
|
.describe("Area under PR curve for both classification and rank.")
|
||||||
.set_body([](char const *) { return new EvalPRAUC{}; });
|
.set_body([](char const *) { return new EvalPRAUC{}; });
|
||||||
|
|
||||||
#if !defined(XGBOOST_USE_CUDA)
|
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||||
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const>, MetaInfo const &,
|
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const>, MetaInfo const &,
|
||||||
std::int32_t, std::shared_ptr<DeviceAUCCache> *) {
|
std::int32_t, std::shared_ptr<DeviceAUCCache> *) {
|
||||||
common::AssertGPUSupport();
|
common::AssertGPUSupport();
|
||||||
|
|||||||
@ -5,7 +5,13 @@
|
|||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
#include <hipcub/hipcub.hpp> // NOLINT
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
#include <cub/cub.cuh> // NOLINT
|
#include <cub/cub.cuh> // NOLINT
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <tuple>
|
#include <tuple>
|
||||||
@ -89,7 +95,12 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
|
|||||||
Fn area_fn, std::shared_ptr<DeviceAUCCache> cache) {
|
Fn area_fn, std::shared_ptr<DeviceAUCCache> cache) {
|
||||||
auto labels = info.labels.View(device);
|
auto labels = info.labels.View(device);
|
||||||
auto weights = info.weights_.ConstDeviceSpan();
|
auto weights = info.weights_.ConstDeviceSpan();
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
dh::safe_cuda(hipSetDevice(device));
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
dh::safe_cuda(cudaSetDevice(device));
|
dh::safe_cuda(cudaSetDevice(device));
|
||||||
|
#endif
|
||||||
|
|
||||||
CHECK_NE(labels.Size(), 0);
|
CHECK_NE(labels.Size(), 0);
|
||||||
CHECK_EQ(labels.Size(), predts.size());
|
CHECK_EQ(labels.Size(), predts.size());
|
||||||
@ -120,10 +131,19 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
|
|||||||
auto uni_key = dh::MakeTransformIterator<float>(
|
auto uni_key = dh::MakeTransformIterator<float>(
|
||||||
thrust::make_counting_iterator(0),
|
thrust::make_counting_iterator(0),
|
||||||
[=] XGBOOST_DEVICE(size_t i) { return predts[d_sorted_idx[i]]; });
|
[=] XGBOOST_DEVICE(size_t i) { return predts[d_sorted_idx[i]]; });
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
auto end_unique = thrust::unique_by_key_copy(
|
||||||
|
thrust::hip::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
|
||||||
|
dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
|
||||||
|
dh::tbegin(d_unique_idx));
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
auto end_unique = thrust::unique_by_key_copy(
|
auto end_unique = thrust::unique_by_key_copy(
|
||||||
thrust::cuda::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
|
thrust::cuda::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
|
||||||
dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
|
dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
|
||||||
dh::tbegin(d_unique_idx));
|
dh::tbegin(d_unique_idx));
|
||||||
|
#endif
|
||||||
|
|
||||||
d_unique_idx = d_unique_idx.subspan(0, end_unique.second - dh::tbegin(d_unique_idx));
|
d_unique_idx = d_unique_idx.subspan(0, end_unique.second - dh::tbegin(d_unique_idx));
|
||||||
|
|
||||||
dh::InclusiveScan(dh::tbegin(d_fptp), dh::tbegin(d_fptp),
|
dh::InclusiveScan(dh::tbegin(d_fptp), dh::tbegin(d_fptp),
|
||||||
@ -163,7 +183,13 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
|
|||||||
});
|
});
|
||||||
|
|
||||||
Pair last = cache->fptp.back();
|
Pair last = cache->fptp.back();
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
double auc = thrust::reduce(thrust::hip::par(alloc), in, in + d_unique_idx.size());
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
double auc = thrust::reduce(thrust::cuda::par(alloc), in, in + d_unique_idx.size());
|
double auc = thrust::reduce(thrust::cuda::par(alloc), in, in + d_unique_idx.size());
|
||||||
|
#endif
|
||||||
|
|
||||||
return std::make_tuple(last.first, last.second, auc);
|
return std::make_tuple(last.first, last.second, auc);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -218,9 +244,17 @@ double ScaleClasses(common::Span<double> results, common::Span<double> local_are
|
|||||||
|
|
||||||
double tp_sum;
|
double tp_sum;
|
||||||
double auc_sum;
|
double auc_sum;
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
thrust::tie(auc_sum, tp_sum) =
|
||||||
|
thrust::reduce(thrust::hip::par(alloc), reduce_in, reduce_in + n_classes,
|
||||||
|
Pair{0.0, 0.0}, PairPlus<double, double>{});
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
thrust::tie(auc_sum, tp_sum) =
|
thrust::tie(auc_sum, tp_sum) =
|
||||||
thrust::reduce(thrust::cuda::par(alloc), reduce_in, reduce_in + n_classes,
|
thrust::reduce(thrust::cuda::par(alloc), reduce_in, reduce_in + n_classes,
|
||||||
Pair{0.0, 0.0}, PairPlus<double, double>{});
|
Pair{0.0, 0.0}, PairPlus<double, double>{});
|
||||||
|
#endif
|
||||||
|
|
||||||
if (tp_sum != 0 && !std::isnan(auc_sum)) {
|
if (tp_sum != 0 && !std::isnan(auc_sum)) {
|
||||||
auc_sum /= tp_sum;
|
auc_sum /= tp_sum;
|
||||||
} else {
|
} else {
|
||||||
@ -300,9 +334,16 @@ void SegmentedReduceAUC(common::Span<size_t const> d_unique_idx,
|
|||||||
double auc = area_fn(fp_prev, fp, tp_prev, tp, class_id);
|
double auc = area_fn(fp_prev, fp, tp_prev, tp, class_id);
|
||||||
return auc;
|
return auc;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
thrust::reduce_by_key(thrust::hip::par(alloc), key_in,
|
||||||
|
key_in + d_unique_idx.size(), val_in,
|
||||||
|
thrust::make_discard_iterator(), dh::tbegin(d_auc));
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
thrust::reduce_by_key(thrust::cuda::par(alloc), key_in,
|
thrust::reduce_by_key(thrust::cuda::par(alloc), key_in,
|
||||||
key_in + d_unique_idx.size(), val_in,
|
key_in + d_unique_idx.size(), val_in,
|
||||||
thrust::make_discard_iterator(), dh::tbegin(d_auc));
|
thrust::make_discard_iterator(), dh::tbegin(d_auc));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -312,7 +353,12 @@ void SegmentedReduceAUC(common::Span<size_t const> d_unique_idx,
|
|||||||
template <bool scale, typename Fn>
|
template <bool scale, typename Fn>
|
||||||
double GPUMultiClassAUCOVR(MetaInfo const &info, int32_t device, common::Span<uint32_t> d_class_ptr,
|
double GPUMultiClassAUCOVR(MetaInfo const &info, int32_t device, common::Span<uint32_t> d_class_ptr,
|
||||||
size_t n_classes, std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
|
size_t n_classes, std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
dh::safe_cuda(hipSetDevice(device));
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
dh::safe_cuda(cudaSetDevice(device));
|
dh::safe_cuda(cudaSetDevice(device));
|
||||||
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sorted idx
|
* Sorted idx
|
||||||
*/
|
*/
|
||||||
@ -373,6 +419,19 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, int32_t device, common::Span<ui
|
|||||||
// unique values are sparse, so we need a CSR style indptr
|
// unique values are sparse, so we need a CSR style indptr
|
||||||
dh::TemporaryArray<uint32_t> unique_class_ptr(d_class_ptr.size());
|
dh::TemporaryArray<uint32_t> unique_class_ptr(d_class_ptr.size());
|
||||||
auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
|
auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
auto n_uniques = dh::SegmentedUniqueByKey(
|
||||||
|
thrust::hip::par(alloc),
|
||||||
|
dh::tbegin(d_class_ptr),
|
||||||
|
dh::tend(d_class_ptr),
|
||||||
|
uni_key,
|
||||||
|
uni_key + d_sorted_idx.size(),
|
||||||
|
dh::tbegin(d_unique_idx),
|
||||||
|
d_unique_class_ptr.data(),
|
||||||
|
dh::tbegin(d_unique_idx),
|
||||||
|
thrust::equal_to<thrust::pair<uint32_t, float>>{});
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
auto n_uniques = dh::SegmentedUniqueByKey(
|
auto n_uniques = dh::SegmentedUniqueByKey(
|
||||||
thrust::cuda::par(alloc),
|
thrust::cuda::par(alloc),
|
||||||
dh::tbegin(d_class_ptr),
|
dh::tbegin(d_class_ptr),
|
||||||
@ -383,6 +442,8 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, int32_t device, common::Span<ui
|
|||||||
d_unique_class_ptr.data(),
|
d_unique_class_ptr.data(),
|
||||||
dh::tbegin(d_unique_idx),
|
dh::tbegin(d_unique_idx),
|
||||||
thrust::equal_to<thrust::pair<uint32_t, float>>{});
|
thrust::equal_to<thrust::pair<uint32_t, float>>{});
|
||||||
|
#endif
|
||||||
|
|
||||||
d_unique_idx = d_unique_idx.subspan(0, n_uniques);
|
d_unique_idx = d_unique_idx.subspan(0, n_uniques);
|
||||||
|
|
||||||
auto get_class_id = [=] XGBOOST_DEVICE(size_t idx) { return idx / n_samples; };
|
auto get_class_id = [=] XGBOOST_DEVICE(size_t idx) { return idx / n_samples; };
|
||||||
@ -500,9 +561,17 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
|
|||||||
auto check_it = dh::MakeTransformIterator<size_t>(
|
auto check_it = dh::MakeTransformIterator<size_t>(
|
||||||
thrust::make_counting_iterator(0),
|
thrust::make_counting_iterator(0),
|
||||||
[=] XGBOOST_DEVICE(size_t i) { return d_group_ptr[i + 1] - d_group_ptr[i]; });
|
[=] XGBOOST_DEVICE(size_t i) { return d_group_ptr[i + 1] - d_group_ptr[i]; });
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
size_t n_valid = thrust::count_if(
|
||||||
|
thrust::hip::par(alloc), check_it, check_it + group_ptr.size() - 1,
|
||||||
|
[=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
size_t n_valid = thrust::count_if(
|
size_t n_valid = thrust::count_if(
|
||||||
thrust::cuda::par(alloc), check_it, check_it + group_ptr.size() - 1,
|
thrust::cuda::par(alloc), check_it, check_it + group_ptr.size() - 1,
|
||||||
[=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
|
[=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
|
||||||
|
#endif
|
||||||
|
|
||||||
if (n_valid < info.group_ptr_.size() - 1) {
|
if (n_valid < info.group_ptr_.size() - 1) {
|
||||||
InvalidGroupAUC();
|
InvalidGroupAUC();
|
||||||
}
|
}
|
||||||
@ -599,8 +668,14 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
|
|||||||
/**
|
/**
|
||||||
* Scale the AUC with number of items in each group.
|
* Scale the AUC with number of items in each group.
|
||||||
*/
|
*/
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
double auc = thrust::reduce(thrust::hip::par(alloc), dh::tbegin(s_d_auc),
|
||||||
|
dh::tend(s_d_auc), 0.0);
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
double auc = thrust::reduce(thrust::cuda::par(alloc), dh::tbegin(s_d_auc),
|
double auc = thrust::reduce(thrust::cuda::par(alloc), dh::tbegin(s_d_auc),
|
||||||
dh::tend(s_d_auc), 0.0);
|
dh::tend(s_d_auc), 0.0);
|
||||||
|
#endif
|
||||||
|
|
||||||
return std::make_pair(auc, n_valid);
|
return std::make_pair(auc, n_valid);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -627,9 +702,16 @@ std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> pred
|
|||||||
});
|
});
|
||||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||||
double total_pos, total_neg;
|
double total_pos, total_neg;
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
thrust::tie(total_pos, total_neg) =
|
||||||
|
thrust::reduce(thrust::hip::par(alloc), it, it + labels.Size(),
|
||||||
|
Pair{0.0, 0.0}, PairPlus<double, double>{});
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
thrust::tie(total_pos, total_neg) =
|
thrust::tie(total_pos, total_neg) =
|
||||||
thrust::reduce(thrust::cuda::par(alloc), it, it + labels.Size(),
|
thrust::reduce(thrust::cuda::par(alloc), it, it + labels.Size(),
|
||||||
Pair{0.0, 0.0}, PairPlus<double, double>{});
|
Pair{0.0, 0.0}, PairPlus<double, double>{});
|
||||||
|
#endif
|
||||||
|
|
||||||
if (total_pos <= 0.0 || total_neg <= 0.0) {
|
if (total_pos <= 0.0 || total_neg <= 0.0) {
|
||||||
return {0.0f, 0.0f, 0.0f};
|
return {0.0f, 0.0f, 0.0f};
|
||||||
@ -681,10 +763,18 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
|
|||||||
return thrust::make_pair(y * w, (1.0f - y) * w);
|
return thrust::make_pair(y * w, (1.0f - y) * w);
|
||||||
});
|
});
|
||||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
|
||||||
|
key_it + predts.size(), val_it,
|
||||||
|
thrust::make_discard_iterator(), totals.begin(),
|
||||||
|
thrust::equal_to<size_t>{}, PairPlus<double, double>{});
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
|
thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
|
||||||
key_it + predts.size(), val_it,
|
key_it + predts.size(), val_it,
|
||||||
thrust::make_discard_iterator(), totals.begin(),
|
thrust::make_discard_iterator(), totals.begin(),
|
||||||
thrust::equal_to<size_t>{}, PairPlus<double, double>{});
|
thrust::equal_to<size_t>{}, PairPlus<double, double>{});
|
||||||
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calculate AUC
|
* Calculate AUC
|
||||||
@ -752,6 +842,19 @@ GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
|
|||||||
// unique values are sparse, so we need a CSR style indptr
|
// unique values are sparse, so we need a CSR style indptr
|
||||||
dh::TemporaryArray<uint32_t> unique_class_ptr(d_group_ptr.size());
|
dh::TemporaryArray<uint32_t> unique_class_ptr(d_group_ptr.size());
|
||||||
auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
|
auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
auto n_uniques = dh::SegmentedUniqueByKey(
|
||||||
|
thrust::hip::par(alloc),
|
||||||
|
dh::tbegin(d_group_ptr),
|
||||||
|
dh::tend(d_group_ptr),
|
||||||
|
uni_key,
|
||||||
|
uni_key + d_sorted_idx.size(),
|
||||||
|
dh::tbegin(d_unique_idx),
|
||||||
|
d_unique_class_ptr.data(),
|
||||||
|
dh::tbegin(d_unique_idx),
|
||||||
|
thrust::equal_to<thrust::pair<uint32_t, float>>{});
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
auto n_uniques = dh::SegmentedUniqueByKey(
|
auto n_uniques = dh::SegmentedUniqueByKey(
|
||||||
thrust::cuda::par(alloc),
|
thrust::cuda::par(alloc),
|
||||||
dh::tbegin(d_group_ptr),
|
dh::tbegin(d_group_ptr),
|
||||||
@ -762,6 +865,8 @@ GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
|
|||||||
d_unique_class_ptr.data(),
|
d_unique_class_ptr.data(),
|
||||||
dh::tbegin(d_unique_idx),
|
dh::tbegin(d_unique_idx),
|
||||||
thrust::equal_to<thrust::pair<uint32_t, float>>{});
|
thrust::equal_to<thrust::pair<uint32_t, float>>{});
|
||||||
|
#endif
|
||||||
|
|
||||||
d_unique_idx = d_unique_idx.subspan(0, n_uniques);
|
d_unique_idx = d_unique_idx.subspan(0, n_uniques);
|
||||||
|
|
||||||
auto get_group_id = [=] XGBOOST_DEVICE(size_t idx) {
|
auto get_group_id = [=] XGBOOST_DEVICE(size_t idx) {
|
||||||
@ -812,9 +917,16 @@ GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
|
|||||||
}
|
}
|
||||||
return thrust::make_pair(0.0, static_cast<uint32_t>(1));
|
return thrust::make_pair(0.0, static_cast<uint32_t>(1));
|
||||||
});
|
});
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
thrust::tie(auc, invalid_groups) = thrust::reduce(
|
||||||
|
thrust::hip::par(alloc), it, it + n_groups,
|
||||||
|
thrust::pair<double, uint32_t>(0.0, 0), PairPlus<double, uint32_t>{});
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
thrust::tie(auc, invalid_groups) = thrust::reduce(
|
thrust::tie(auc, invalid_groups) = thrust::reduce(
|
||||||
thrust::cuda::par(alloc), it, it + n_groups,
|
thrust::cuda::par(alloc), it, it + n_groups,
|
||||||
thrust::pair<double, uint32_t>(0.0, 0), PairPlus<double, uint32_t>{});
|
thrust::pair<double, uint32_t>(0.0, 0), PairPlus<double, uint32_t>{});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
return std::make_pair(auc, n_groups - invalid_groups);
|
return std::make_pair(auc, n_groups - invalid_groups);
|
||||||
}
|
}
|
||||||
@ -823,7 +935,12 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
|
|||||||
common::Span<float const> predts,
|
common::Span<float const> predts,
|
||||||
MetaInfo const &info,
|
MetaInfo const &info,
|
||||||
std::shared_ptr<DeviceAUCCache> *p_cache) {
|
std::shared_ptr<DeviceAUCCache> *p_cache) {
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
dh::safe_cuda(hipSetDevice(ctx->gpu_id));
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
|
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
|
||||||
|
#endif
|
||||||
|
|
||||||
if (predts.empty()) {
|
if (predts.empty()) {
|
||||||
return std::make_pair(0.0, static_cast<uint32_t>(0));
|
return std::make_pair(0.0, static_cast<uint32_t>(0));
|
||||||
}
|
}
|
||||||
@ -845,10 +962,19 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
|
|||||||
|
|
||||||
dh::XGBDeviceAllocator<char> alloc;
|
dh::XGBDeviceAllocator<char> alloc;
|
||||||
auto labels = info.labels.View(ctx->gpu_id);
|
auto labels = info.labels.View(ctx->gpu_id);
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
if (thrust::any_of(thrust::hip::par(alloc), dh::tbegin(labels.Values()),
|
||||||
|
dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
|
||||||
|
InvalidLabels();
|
||||||
|
}
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
if (thrust::any_of(thrust::cuda::par(alloc), dh::tbegin(labels.Values()),
|
if (thrust::any_of(thrust::cuda::par(alloc), dh::tbegin(labels.Values()),
|
||||||
dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
|
dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
|
||||||
InvalidLabels();
|
InvalidLabels();
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get total positive/negative for each group.
|
* Get total positive/negative for each group.
|
||||||
*/
|
*/
|
||||||
@ -868,10 +994,18 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
|
|||||||
auto y = labels(i);
|
auto y = labels(i);
|
||||||
return thrust::make_pair(y * w, (1.0 - y) * w);
|
return thrust::make_pair(y * w, (1.0 - y) * w);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
|
||||||
|
key_it + predts.size(), val_it,
|
||||||
|
thrust::make_discard_iterator(), totals.begin(),
|
||||||
|
thrust::equal_to<size_t>{}, PairPlus<double, double>{});
|
||||||
|
#elif defined(XGBOOST_USE_CUDA)
|
||||||
thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
|
thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
|
||||||
key_it + predts.size(), val_it,
|
key_it + predts.size(), val_it,
|
||||||
thrust::make_discard_iterator(), totals.begin(),
|
thrust::make_discard_iterator(), totals.begin(),
|
||||||
thrust::equal_to<size_t>{}, PairPlus<double, double>{});
|
thrust::equal_to<size_t>{}, PairPlus<double, double>{});
|
||||||
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calculate AUC
|
* Calculate AUC
|
||||||
|
|||||||
@ -0,0 +1,4 @@
|
|||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
#include "auc.cu"
|
||||||
|
#endif
|
||||||
Loading…
x
Reference in New Issue
Block a user