Re-implement ROC-AUC. (#6747)
* Re-implement ROC-AUC. * Binary * MultiClass * LTR * Add documents. This PR resolves a few issues: - Define a value for when the dataset is invalid, which can happen if the dataset is empty or contains only positive or only negative labels. - Define ROC-AUC for multi-class classification. - Define a weighted average value for the distributed setting. - Provide a correct implementation for the learning-to-rank task. The previous implementation was just binary classification with averaging across groups, which does not measure ordered learning to rank.
This commit is contained in:
@@ -8,6 +8,7 @@
|
||||
|
||||
#include <xgboost/base.h>
|
||||
#include <xgboost/logging.h>
|
||||
#include <xgboost/span.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <exception>
|
||||
@@ -163,13 +164,14 @@ inline void AssertOneAPISupport() {
|
||||
#endif // XGBOOST_USE_ONEAPI
|
||||
}
|
||||
|
||||
template <typename Idx, typename V, typename Comp = std::less<V>>
|
||||
std::vector<Idx> ArgSort(std::vector<V> const &array, Comp comp = std::less<V>{}) {
|
||||
template <typename Idx, typename Container,
|
||||
typename V = typename Container::value_type,
|
||||
typename Comp = std::less<V>>
|
||||
std::vector<Idx> ArgSort(Container const &array, Comp comp = std::less<V>{}) {
|
||||
std::vector<Idx> result(array.size());
|
||||
std::iota(result.begin(), result.end(), 0);
|
||||
std::stable_sort(
|
||||
result.begin(), result.end(),
|
||||
[&array, comp](Idx const &l, Idx const &r) { return comp(array[l], array[r]); });
|
||||
auto op = [&array, comp](Idx const &l, Idx const &r) { return comp(array[l], array[r]); };
|
||||
XGBOOST_PARALLEL_STABLE_SORT(result.begin(), result.end(), op);
|
||||
return result;
|
||||
}
|
||||
} // namespace common
|
||||
|
||||
@@ -1198,6 +1198,62 @@ size_t SegmentedUnique(Inputs &&...inputs) {
|
||||
return SegmentedUnique(thrust::cuda::par(alloc), std::forward<Inputs&&>(inputs)...);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Unique by key for many groups of data. Has same constraint as `SegmentedUnique`.
|
||||
*
|
||||
* \tparam exec thrust execution policy
|
||||
* \tparam key_segments_first start iter to segment pointer
|
||||
* \tparam key_segments_last end iter to segment pointer
|
||||
* \tparam key_first start iter to key for comparison
|
||||
* \tparam key_last end iter to key for comparison
|
||||
* \tparam val_first start iter to values
|
||||
* \tparam key_segments_out output iterator for new segment pointer
|
||||
* \tparam val_out output iterator for values
|
||||
* \tparam comp binary comparison operator
|
||||
*/
|
||||
template <typename DerivedPolicy, typename SegInIt, typename SegOutIt,
|
||||
typename KeyInIt, typename ValInIt, typename ValOutIt, typename Comp>
|
||||
size_t SegmentedUniqueByKey(
|
||||
const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
|
||||
SegInIt key_segments_first, SegInIt key_segments_last, KeyInIt key_first,
|
||||
KeyInIt key_last, ValInIt val_first, SegOutIt key_segments_out,
|
||||
ValOutIt val_out, Comp comp) {
|
||||
using Key =
|
||||
thrust::pair<size_t,
|
||||
typename thrust::iterator_traits<KeyInIt>::value_type>;
|
||||
|
||||
auto unique_key_it = dh::MakeTransformIterator<Key>(
|
||||
thrust::make_counting_iterator(static_cast<size_t>(0)),
|
||||
[=] __device__(size_t i) {
|
||||
size_t seg = dh::SegmentId(key_segments_first, key_segments_last, i);
|
||||
return thrust::make_pair(seg, *(key_first + i));
|
||||
});
|
||||
size_t segments_len = key_segments_last - key_segments_first;
|
||||
thrust::fill(thrust::device, key_segments_out,
|
||||
key_segments_out + segments_len, 0);
|
||||
size_t n_inputs = std::distance(key_first, key_last);
|
||||
// Reduce the number of uniques elements per segment, avoid creating an
|
||||
// intermediate array for `reduce_by_key`. It's limited by the types that
|
||||
// atomicAdd supports. For example, size_t is not supported as of CUDA 10.2.
|
||||
auto reduce_it = thrust::make_transform_output_iterator(
|
||||
thrust::make_discard_iterator(),
|
||||
detail::SegmentedUniqueReduceOp<Key, SegOutIt>{key_segments_out});
|
||||
auto uniques_ret = thrust::unique_by_key_copy(
|
||||
exec, unique_key_it, unique_key_it + n_inputs, val_first, reduce_it,
|
||||
val_out, [=] __device__(Key const &l, Key const &r) {
|
||||
if (l.first == r.first) {
|
||||
// In the same segment.
|
||||
return comp(thrust::get<1>(l), thrust::get<1>(r));
|
||||
}
|
||||
return false;
|
||||
});
|
||||
auto n_uniques = uniques_ret.second - val_out;
|
||||
CHECK_LE(n_uniques, n_inputs);
|
||||
thrust::exclusive_scan(exec, key_segments_out,
|
||||
key_segments_out + segments_len, key_segments_out, 0);
|
||||
return n_uniques;
|
||||
}
|
||||
|
||||
template <typename Policy, typename InputIt, typename Init, typename Func>
|
||||
auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce_op) {
|
||||
size_t constexpr kLimit = std::numeric_limits<int32_t>::max() / 2;
|
||||
@@ -1215,36 +1271,73 @@ auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce
|
||||
return aggregate;
|
||||
}
|
||||
|
||||
// wrapper to avoid integer `num_items`.
|
||||
template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT,
|
||||
typename OffsetT>
|
||||
void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op,
|
||||
OffsetT num_items) {
|
||||
size_t bytes = 0;
|
||||
safe_cuda((
|
||||
cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType,
|
||||
OffsetT>::Dispatch(nullptr, bytes, d_in, d_out, scan_op,
|
||||
cub::NullType(), num_items, nullptr,
|
||||
false)));
|
||||
dh::TemporaryArray<char> storage(bytes);
|
||||
safe_cuda((
|
||||
cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType,
|
||||
OffsetT>::Dispatch(storage.data().get(), bytes, d_in,
|
||||
d_out, scan_op, cub::NullType(),
|
||||
num_items, nullptr, false)));
|
||||
}
|
||||
|
||||
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
|
||||
void InclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items) {
|
||||
InclusiveScan(d_in, d_out, cub::Sum(), num_items);
|
||||
}
|
||||
|
||||
template <bool accending, typename IdxT, typename U>
|
||||
void ArgSort(xgboost::common::Span<U> values, xgboost::common::Span<IdxT> sorted_idx) {
|
||||
void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_idx) {
|
||||
size_t bytes = 0;
|
||||
Iota(sorted_idx);
|
||||
CHECK_LT(sorted_idx.size(), 1 << 31);
|
||||
TemporaryArray<U> out(values.size());
|
||||
|
||||
using KeyT = typename decltype(keys)::value_type;
|
||||
using ValueT = std::remove_const_t<IdxT>;
|
||||
|
||||
TemporaryArray<KeyT> out(keys.size());
|
||||
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(keys.data()),
|
||||
out.data().get());
|
||||
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(sorted_idx.data()),
|
||||
sorted_idx.data());
|
||||
|
||||
if (accending) {
|
||||
cub::DeviceRadixSort::SortPairs(nullptr, bytes, values.data(),
|
||||
out.data().get(), sorted_idx.data(),
|
||||
sorted_idx.data(), sorted_idx.size());
|
||||
void *d_temp_storage = nullptr;
|
||||
cub::DispatchRadixSort<false, KeyT, ValueT, size_t>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
|
||||
sizeof(KeyT) * 8, false, nullptr, false);
|
||||
dh::TemporaryArray<char> storage(bytes);
|
||||
cub::DeviceRadixSort::SortPairs(storage.data().get(), bytes, values.data(),
|
||||
out.data().get(), sorted_idx.data(),
|
||||
sorted_idx.data(), sorted_idx.size());
|
||||
d_temp_storage = storage.data().get();
|
||||
cub::DispatchRadixSort<false, KeyT, ValueT, size_t>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
|
||||
sizeof(KeyT) * 8, false, nullptr, false);
|
||||
} else {
|
||||
cub::DeviceRadixSort::SortPairsDescending(
|
||||
nullptr, bytes, values.data(), out.data().get(), sorted_idx.data(),
|
||||
sorted_idx.data(), sorted_idx.size());
|
||||
void *d_temp_storage = nullptr;
|
||||
safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, size_t>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
|
||||
sizeof(KeyT) * 8, false, nullptr, false)));
|
||||
dh::TemporaryArray<char> storage(bytes);
|
||||
cub::DeviceRadixSort::SortPairsDescending(
|
||||
storage.data().get(), bytes, values.data(), out.data().get(),
|
||||
sorted_idx.data(), sorted_idx.data(), sorted_idx.size());
|
||||
d_temp_storage = storage.data().get();
|
||||
safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, size_t>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
|
||||
sizeof(KeyT) * 8, false, nullptr, false)));
|
||||
}
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
// Wrapper around cub sort for easier `descending` sort
|
||||
template <bool descending, typename KeyT, typename ValueT, typename OffsetIteratorT>
|
||||
// Wrapper around cub sort for easier `descending` sort and `size_t num_items`.
|
||||
template <bool descending, typename KeyT, typename ValueT,
|
||||
typename OffsetIteratorT>
|
||||
void DeviceSegmentedRadixSortPair(
|
||||
void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, // NOLINT
|
||||
void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, // NOLINT
|
||||
KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out,
|
||||
size_t num_items, size_t num_segments, OffsetIteratorT d_begin_offsets,
|
||||
OffsetIteratorT d_end_offsets, int begin_bit = 0,
|
||||
@@ -1253,12 +1346,12 @@ void DeviceSegmentedRadixSortPair(
|
||||
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in),
|
||||
d_values_out);
|
||||
using OffsetT = size_t;
|
||||
dh::safe_cuda((cub::DispatchSegmentedRadixSort<
|
||||
descending, KeyT, ValueT, OffsetIteratorT,
|
||||
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
|
||||
d_values, num_items, num_segments,
|
||||
d_begin_offsets, d_end_offsets, begin_bit,
|
||||
end_bit, false, nullptr, false)));
|
||||
safe_cuda((cub::DispatchSegmentedRadixSort<
|
||||
descending, KeyT, ValueT, OffsetIteratorT,
|
||||
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
|
||||
d_values, num_items, num_segments,
|
||||
d_begin_offsets, d_end_offsets, begin_bit,
|
||||
end_bit, false, nullptr, false)));
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
@@ -1270,12 +1363,11 @@ void SegmentedArgSort(xgboost::common::Span<U> values,
|
||||
size_t n_groups = group_ptr.size() - 1;
|
||||
size_t bytes = 0;
|
||||
Iota(sorted_idx);
|
||||
CHECK_LT(sorted_idx.size(), 1 << 31);
|
||||
TemporaryArray<U> values_out(values.size());
|
||||
TemporaryArray<std::remove_const_t<U>> values_out(values.size());
|
||||
detail::DeviceSegmentedRadixSortPair<!accending>(
|
||||
nullptr, bytes, values.data(), values_out.data().get(),
|
||||
sorted_idx.data(), sorted_idx.data(), sorted_idx.size(), n_groups,
|
||||
group_ptr.data(), group_ptr.data() + 1);
|
||||
nullptr, bytes, values.data(), values_out.data().get(), sorted_idx.data(),
|
||||
sorted_idx.data(), sorted_idx.size(), n_groups, group_ptr.data(),
|
||||
group_ptr.data() + 1);
|
||||
dh::TemporaryArray<xgboost::common::byte> temp_storage(bytes);
|
||||
detail::DeviceSegmentedRadixSortPair<!accending>(
|
||||
temp_storage.data().get(), bytes, values.data(), values_out.data().get(),
|
||||
|
||||
@@ -26,6 +26,9 @@ XGBOOST_DEVICE inline float Sigmoid(float x) {
|
||||
return 1.0f / (1.0f + expf(-x));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
XGBOOST_DEVICE inline static T Sqr(T a) { return a * a; }
|
||||
|
||||
/*!
|
||||
* \brief Equality test for both integer and floating point.
|
||||
*/
|
||||
|
||||
@@ -99,7 +99,7 @@ std::vector<T> WeightedSamplingWithoutReplacement(
|
||||
auto k = std::log(u) / w;
|
||||
keys[i] = k;
|
||||
}
|
||||
auto ind = ArgSort<size_t>(keys, std::greater<>{});
|
||||
auto ind = ArgSort<size_t>(Span<float>{keys}, std::greater<>{});
|
||||
ind.resize(n);
|
||||
|
||||
std::vector<T> results(ind.size());
|
||||
|
||||
84
src/common/ranking_utils.cuh
Normal file
84
src/common/ranking_utils.cuh
Normal file
@@ -0,0 +1,84 @@
|
||||
/*!
|
||||
* Copyright 2021 by XGBoost Contributors
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_RANKING_UTILS_H_
|
||||
#define XGBOOST_COMMON_RANKING_UTILS_H_
|
||||
|
||||
#include <cub/cub.cuh>
|
||||
#include "xgboost/base.h"
|
||||
#include "device_helpers.cuh"
|
||||
#include "./math.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
/**
 * \brief Number of (i, j) pairs visited by
 *
 * \code
 *   for (size_t i = 0; i < h; ++i)
 *     for (size_t j = i + 1; j < n; ++j)
 * \endcode
 *
 * \param n Number of items (length of the base)
 * \param h Height, clamped to `n - 1`.
 *
 * \return The pair count; 0 when there are fewer than two items or `h` is 0.
 */
XGBOOST_DEVICE inline size_t DiscreteTrapezoidArea(size_t n, size_t h) {
  if (n == 0) {
    // Empty group: `n - 1` would wrap around and produce a huge bogus area.
    return 0;
  }
  n -= 1;              // without diagonal entries
  h = std::min(n, h);  // Specific for ranking.
  if (h == 0) {
    // Nothing to enumerate; also avoids relying on the wraparound of `h - 1`.
    return 0;
  }
  size_t total = ((n - (h - 1)) + n) * h / 2;
  return total;
}
|
||||
|
||||
/**
|
||||
* Used for mapping many groups of trapezoid shaped computation onto CUDA blocks. The
|
||||
* trapezoid must be on upper right corner.
|
||||
*
|
||||
* Equivalent to loops like:
|
||||
*
|
||||
* \code
|
||||
* for (size i = 0; i < h; ++i) {
|
||||
* for (size_t j = i + 1; j < n; ++j) {
|
||||
* do_something();
|
||||
* }
|
||||
* }
|
||||
* \endcode
|
||||
*/
|
||||
template <typename U>
|
||||
inline size_t
|
||||
SegmentedTrapezoidThreads(xgboost::common::Span<U> group_ptr,
|
||||
xgboost::common::Span<size_t> out_group_threads_ptr,
|
||||
size_t h) {
|
||||
CHECK_GE(group_ptr.size(), 1);
|
||||
CHECK_EQ(group_ptr.size(), out_group_threads_ptr.size());
|
||||
dh::LaunchN(
|
||||
dh::CurrentDevice(), group_ptr.size(), [=] XGBOOST_DEVICE(size_t idx) {
|
||||
if (idx == 0) {
|
||||
out_group_threads_ptr[0] = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
size_t cnt = static_cast<size_t>(group_ptr[idx] - group_ptr[idx - 1]);
|
||||
out_group_threads_ptr[idx] = DiscreteTrapezoidArea(cnt, h);
|
||||
});
|
||||
dh::InclusiveSum(out_group_threads_ptr.data(), out_group_threads_ptr.data(),
|
||||
out_group_threads_ptr.size());
|
||||
size_t total = 0;
|
||||
dh::safe_cuda(cudaMemcpy(
|
||||
&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
|
||||
sizeof(total), cudaMemcpyDeviceToHost));
|
||||
return total;
|
||||
}
|
||||
|
||||
/**
|
||||
* Called inside kernel to obtain coordinate from trapezoid grid.
|
||||
*/
|
||||
XGBOOST_DEVICE inline void UnravelTrapeziodIdx(size_t i_idx, size_t n,
|
||||
size_t *out_i, size_t *out_j) {
|
||||
auto &i = *out_i;
|
||||
auto &j = *out_j;
|
||||
double idx = static_cast<double>(i_idx);
|
||||
double N = static_cast<double>(n);
|
||||
|
||||
i = std::ceil(-(0.5 - N + std::sqrt(common::Sqr(N - 0.5) + 2.0 * (-idx - 1.0)))) - 1.0;
|
||||
|
||||
auto I = static_cast<double>(i);
|
||||
size_t n_elems = -0.5 * common::Sqr(I) + (N - 0.5) * I;
|
||||
|
||||
j = idx - n_elems + i + 1;
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_COMMON_RANKING_UTILS_H_
|
||||
Reference in New Issue
Block a user