Re-implement ROC-AUC. (#6747)

* Re-implement ROC-AUC.

* Binary
* MultiClass
* LTR
* Add documents.

This PR resolves a few issues:
  - Define a value when the dataset is invalid, which can happen if the
  dataset is empty, or if it contains only positive or only negative samples.
  - Define ROC-AUC for multi-class classification.
  - Define weighted average value for distributed setting.
  - A correct implementation for the learning-to-rank task.  The previous
  implementation was just binary classification with averaging across groups,
  which doesn't measure the quality of the ranking order within each group.
This commit is contained in:
Jiaming Yuan
2021-03-20 16:52:40 +08:00
committed by GitHub
parent 4ee8340e79
commit bcc0277338
27 changed files with 1622 additions and 461 deletions

View File

@@ -1198,6 +1198,62 @@ size_t SegmentedUnique(Inputs &&...inputs) {
return SegmentedUnique(thrust::cuda::par(alloc), std::forward<Inputs&&>(inputs)...);
}
/**
 * \brief Unique by key for many groups of data.  Has the same constraint as
 *        `SegmentedUnique`: keys inside each segment must already be sorted
 *        with respect to `comp`, since uniqueness is decided between adjacent
 *        elements only.
 *
 * \param exec               thrust execution policy
 * \param key_segments_first start iter to segment pointer
 * \param key_segments_last  end iter to segment pointer
 * \param key_first          start iter to key for comparison
 * \param key_last           end iter to key for comparison
 * \param val_first          start iter to values
 * \param key_segments_out   output iterator for new segment pointer
 * \param val_out            output iterator for values
 * \param comp               binary comparison operator returning true when two
 *                           keys are considered equal
 *
 * \return Total number of unique values written to `val_out`.
 */
template <typename DerivedPolicy, typename SegInIt, typename SegOutIt,
          typename KeyInIt, typename ValInIt, typename ValOutIt, typename Comp>
size_t SegmentedUniqueByKey(
    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
    SegInIt key_segments_first, SegInIt key_segments_last, KeyInIt key_first,
    KeyInIt key_last, ValInIt val_first, SegOutIt key_segments_out,
    ValOutIt val_out, Comp comp) {
  // Augment each key with its segment index so that equal keys belonging to
  // different segments never compare equal to each other.
  using Key =
      thrust::pair<size_t,
                   typename thrust::iterator_traits<KeyInIt>::value_type>;
  auto unique_key_it = dh::MakeTransformIterator<Key>(
      thrust::make_counting_iterator(static_cast<size_t>(0)),
      [=] __device__(size_t i) {
        size_t seg = dh::SegmentId(key_segments_first, key_segments_last, i);
        return thrust::make_pair(seg, *(key_first + i));
      });
  size_t segments_len = key_segments_last - key_segments_first;
  // Output segment pointer starts as all zeros; per-segment unique counts are
  // accumulated into it below, then scanned into offsets.
  thrust::fill(thrust::device, key_segments_out,
               key_segments_out + segments_len, 0);
  size_t n_inputs = std::distance(key_first, key_last);
  // Reduce the number of uniques elements per segment, avoid creating an
  // intermediate array for `reduce_by_key`. It's limited by the types that
  // atomicAdd supports. For example, size_t is not supported as of CUDA 10.2.
  auto reduce_it = thrust::make_transform_output_iterator(
      thrust::make_discard_iterator(),
      detail::SegmentedUniqueReduceOp<Key, SegOutIt>{key_segments_out});
  auto uniques_ret = thrust::unique_by_key_copy(
      exec, unique_key_it, unique_key_it + n_inputs, val_first, reduce_it,
      val_out, [=] __device__(Key const &l, Key const &r) {
        if (l.first == r.first) {
          // In the same segment.
          return comp(thrust::get<1>(l), thrust::get<1>(r));
        }
        // Different segments are never duplicates of each other.
        return false;
      });
  auto n_uniques = uniques_ret.second - val_out;
  CHECK_LE(n_uniques, n_inputs);
  // Turn the per-segment unique counts into a CSR-style segment pointer.
  thrust::exclusive_scan(exec, key_segments_out,
                         key_segments_out + segments_len, key_segments_out, 0);
  return n_uniques;
}
template <typename Policy, typename InputIt, typename Init, typename Func>
auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce_op) {
size_t constexpr kLimit = std::numeric_limits<int32_t>::max() / 2;
@@ -1215,36 +1271,73 @@ auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce
return aggregate;
}
// wrapper to avoid integer `num_items`.
template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT,
typename OffsetT>
void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op,
OffsetT num_items) {
size_t bytes = 0;
safe_cuda((
cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType,
OffsetT>::Dispatch(nullptr, bytes, d_in, d_out, scan_op,
cub::NullType(), num_items, nullptr,
false)));
dh::TemporaryArray<char> storage(bytes);
safe_cuda((
cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType,
OffsetT>::Dispatch(storage.data().get(), bytes, d_in,
d_out, scan_op, cub::NullType(),
num_items, nullptr, false)));
}
// Inclusive prefix sum: convenience overload of `InclusiveScan` with
// `cub::Sum()`, keeping the 64-bit-safe `num_items` of the wrapper.
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
void InclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items) {
  InclusiveScan(d_in, d_out, cub::Sum(), num_items);
}
template <bool accending, typename IdxT, typename U>
void ArgSort(xgboost::common::Span<U> values, xgboost::common::Span<IdxT> sorted_idx) {
void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_idx) {
size_t bytes = 0;
Iota(sorted_idx);
CHECK_LT(sorted_idx.size(), 1 << 31);
TemporaryArray<U> out(values.size());
using KeyT = typename decltype(keys)::value_type;
using ValueT = std::remove_const_t<IdxT>;
TemporaryArray<KeyT> out(keys.size());
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(keys.data()),
out.data().get());
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(sorted_idx.data()),
sorted_idx.data());
if (accending) {
cub::DeviceRadixSort::SortPairs(nullptr, bytes, values.data(),
out.data().get(), sorted_idx.data(),
sorted_idx.data(), sorted_idx.size());
void *d_temp_storage = nullptr;
cub::DispatchRadixSort<false, KeyT, ValueT, size_t>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
sizeof(KeyT) * 8, false, nullptr, false);
dh::TemporaryArray<char> storage(bytes);
cub::DeviceRadixSort::SortPairs(storage.data().get(), bytes, values.data(),
out.data().get(), sorted_idx.data(),
sorted_idx.data(), sorted_idx.size());
d_temp_storage = storage.data().get();
cub::DispatchRadixSort<false, KeyT, ValueT, size_t>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
sizeof(KeyT) * 8, false, nullptr, false);
} else {
cub::DeviceRadixSort::SortPairsDescending(
nullptr, bytes, values.data(), out.data().get(), sorted_idx.data(),
sorted_idx.data(), sorted_idx.size());
void *d_temp_storage = nullptr;
safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, size_t>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
sizeof(KeyT) * 8, false, nullptr, false)));
dh::TemporaryArray<char> storage(bytes);
cub::DeviceRadixSort::SortPairsDescending(
storage.data().get(), bytes, values.data(), out.data().get(),
sorted_idx.data(), sorted_idx.data(), sorted_idx.size());
d_temp_storage = storage.data().get();
safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, size_t>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
sizeof(KeyT) * 8, false, nullptr, false)));
}
}
namespace detail {
// Wrapper around cub sort for easier `descending` sort
template <bool descending, typename KeyT, typename ValueT, typename OffsetIteratorT>
// Wrapper around cub sort for easier `descending` sort and `size_t num_items`.
template <bool descending, typename KeyT, typename ValueT,
typename OffsetIteratorT>
void DeviceSegmentedRadixSortPair(
void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, // NOLINT
void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, // NOLINT
KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out,
size_t num_items, size_t num_segments, OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets, int begin_bit = 0,
@@ -1253,12 +1346,12 @@ void DeviceSegmentedRadixSortPair(
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in),
d_values_out);
using OffsetT = size_t;
dh::safe_cuda((cub::DispatchSegmentedRadixSort<
descending, KeyT, ValueT, OffsetIteratorT,
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
d_values, num_items, num_segments,
d_begin_offsets, d_end_offsets, begin_bit,
end_bit, false, nullptr, false)));
safe_cuda((cub::DispatchSegmentedRadixSort<
descending, KeyT, ValueT, OffsetIteratorT,
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
d_values, num_items, num_segments,
d_begin_offsets, d_end_offsets, begin_bit,
end_bit, false, nullptr, false)));
}
} // namespace detail
@@ -1270,12 +1363,11 @@ void SegmentedArgSort(xgboost::common::Span<U> values,
size_t n_groups = group_ptr.size() - 1;
size_t bytes = 0;
Iota(sorted_idx);
CHECK_LT(sorted_idx.size(), 1 << 31);
TemporaryArray<U> values_out(values.size());
TemporaryArray<std::remove_const_t<U>> values_out(values.size());
detail::DeviceSegmentedRadixSortPair<!accending>(
nullptr, bytes, values.data(), values_out.data().get(),
sorted_idx.data(), sorted_idx.data(), sorted_idx.size(), n_groups,
group_ptr.data(), group_ptr.data() + 1);
nullptr, bytes, values.data(), values_out.data().get(), sorted_idx.data(),
sorted_idx.data(), sorted_idx.size(), n_groups, group_ptr.data(),
group_ptr.data() + 1);
dh::TemporaryArray<xgboost::common::byte> temp_storage(bytes);
detail::DeviceSegmentedRadixSortPair<!accending>(
temp_storage.data().get(), bytes, values.data(), values_out.data().get(),