Re-implement ROC-AUC. (#6747)
* Re-implement ROC-AUC. * Binary * MultiClass * LTR * Add documents. This PR resolves a few issues: - Define a value when the dataset is invalid, which can happen if there's an empty dataset, or when the dataset contains only positive or negative values. - Define ROC-AUC for multi-class classification. - Define weighted average value for distributed setting. - A correct implementation for learning to rank task. Previous implementation is just binary classification with averaging across groups, which doesn't measure ordered learning to rank.
This commit is contained in:
@@ -8,6 +8,7 @@
|
||||
|
||||
#include <xgboost/base.h>
|
||||
#include <xgboost/logging.h>
|
||||
#include <xgboost/span.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <exception>
|
||||
@@ -163,13 +164,14 @@ inline void AssertOneAPISupport() {
|
||||
#endif // XGBOOST_USE_ONEAPI
|
||||
}
|
||||
|
||||
template <typename Idx, typename V, typename Comp = std::less<V>>
|
||||
std::vector<Idx> ArgSort(std::vector<V> const &array, Comp comp = std::less<V>{}) {
|
||||
template <typename Idx, typename Container,
|
||||
typename V = typename Container::value_type,
|
||||
typename Comp = std::less<V>>
|
||||
std::vector<Idx> ArgSort(Container const &array, Comp comp = std::less<V>{}) {
|
||||
std::vector<Idx> result(array.size());
|
||||
std::iota(result.begin(), result.end(), 0);
|
||||
std::stable_sort(
|
||||
result.begin(), result.end(),
|
||||
[&array, comp](Idx const &l, Idx const &r) { return comp(array[l], array[r]); });
|
||||
auto op = [&array, comp](Idx const &l, Idx const &r) { return comp(array[l], array[r]); };
|
||||
XGBOOST_PARALLEL_STABLE_SORT(result.begin(), result.end(), op);
|
||||
return result;
|
||||
}
|
||||
} // namespace common
|
||||
|
||||
@@ -1198,6 +1198,62 @@ size_t SegmentedUnique(Inputs &&...inputs) {
|
||||
return SegmentedUnique(thrust::cuda::par(alloc), std::forward<Inputs&&>(inputs)...);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Unique by key for many groups of data. Has same constraint as `SegmentedUnique`.
|
||||
*
|
||||
* \tparam exec thrust execution policy
|
||||
* \tparam key_segments_first start iter to segment pointer
|
||||
* \tparam key_segments_last end iter to segment pointer
|
||||
* \tparam key_first start iter to key for comparison
|
||||
* \tparam key_last end iter to key for comparison
|
||||
* \tparam val_first start iter to values
|
||||
* \tparam key_segments_out output iterator for new segment pointer
|
||||
* \tparam val_out output iterator for values
|
||||
* \tparam comp binary comparison operator
|
||||
*/
|
||||
template <typename DerivedPolicy, typename SegInIt, typename SegOutIt,
|
||||
typename KeyInIt, typename ValInIt, typename ValOutIt, typename Comp>
|
||||
size_t SegmentedUniqueByKey(
|
||||
const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
|
||||
SegInIt key_segments_first, SegInIt key_segments_last, KeyInIt key_first,
|
||||
KeyInIt key_last, ValInIt val_first, SegOutIt key_segments_out,
|
||||
ValOutIt val_out, Comp comp) {
|
||||
using Key =
|
||||
thrust::pair<size_t,
|
||||
typename thrust::iterator_traits<KeyInIt>::value_type>;
|
||||
|
||||
auto unique_key_it = dh::MakeTransformIterator<Key>(
|
||||
thrust::make_counting_iterator(static_cast<size_t>(0)),
|
||||
[=] __device__(size_t i) {
|
||||
size_t seg = dh::SegmentId(key_segments_first, key_segments_last, i);
|
||||
return thrust::make_pair(seg, *(key_first + i));
|
||||
});
|
||||
size_t segments_len = key_segments_last - key_segments_first;
|
||||
thrust::fill(thrust::device, key_segments_out,
|
||||
key_segments_out + segments_len, 0);
|
||||
size_t n_inputs = std::distance(key_first, key_last);
|
||||
// Reduce the number of uniques elements per segment, avoid creating an
|
||||
// intermediate array for `reduce_by_key`. It's limited by the types that
|
||||
// atomicAdd supports. For example, size_t is not supported as of CUDA 10.2.
|
||||
auto reduce_it = thrust::make_transform_output_iterator(
|
||||
thrust::make_discard_iterator(),
|
||||
detail::SegmentedUniqueReduceOp<Key, SegOutIt>{key_segments_out});
|
||||
auto uniques_ret = thrust::unique_by_key_copy(
|
||||
exec, unique_key_it, unique_key_it + n_inputs, val_first, reduce_it,
|
||||
val_out, [=] __device__(Key const &l, Key const &r) {
|
||||
if (l.first == r.first) {
|
||||
// In the same segment.
|
||||
return comp(thrust::get<1>(l), thrust::get<1>(r));
|
||||
}
|
||||
return false;
|
||||
});
|
||||
auto n_uniques = uniques_ret.second - val_out;
|
||||
CHECK_LE(n_uniques, n_inputs);
|
||||
thrust::exclusive_scan(exec, key_segments_out,
|
||||
key_segments_out + segments_len, key_segments_out, 0);
|
||||
return n_uniques;
|
||||
}
|
||||
|
||||
template <typename Policy, typename InputIt, typename Init, typename Func>
|
||||
auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce_op) {
|
||||
size_t constexpr kLimit = std::numeric_limits<int32_t>::max() / 2;
|
||||
@@ -1215,36 +1271,73 @@ auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce
|
||||
return aggregate;
|
||||
}
|
||||
|
||||
// wrapper to avoid integer `num_items`.
|
||||
template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT,
|
||||
typename OffsetT>
|
||||
void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op,
|
||||
OffsetT num_items) {
|
||||
size_t bytes = 0;
|
||||
safe_cuda((
|
||||
cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType,
|
||||
OffsetT>::Dispatch(nullptr, bytes, d_in, d_out, scan_op,
|
||||
cub::NullType(), num_items, nullptr,
|
||||
false)));
|
||||
dh::TemporaryArray<char> storage(bytes);
|
||||
safe_cuda((
|
||||
cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType,
|
||||
OffsetT>::Dispatch(storage.data().get(), bytes, d_in,
|
||||
d_out, scan_op, cub::NullType(),
|
||||
num_items, nullptr, false)));
|
||||
}
|
||||
|
||||
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
|
||||
void InclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items) {
|
||||
InclusiveScan(d_in, d_out, cub::Sum(), num_items);
|
||||
}
|
||||
|
||||
template <bool accending, typename IdxT, typename U>
|
||||
void ArgSort(xgboost::common::Span<U> values, xgboost::common::Span<IdxT> sorted_idx) {
|
||||
void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_idx) {
|
||||
size_t bytes = 0;
|
||||
Iota(sorted_idx);
|
||||
CHECK_LT(sorted_idx.size(), 1 << 31);
|
||||
TemporaryArray<U> out(values.size());
|
||||
|
||||
using KeyT = typename decltype(keys)::value_type;
|
||||
using ValueT = std::remove_const_t<IdxT>;
|
||||
|
||||
TemporaryArray<KeyT> out(keys.size());
|
||||
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(keys.data()),
|
||||
out.data().get());
|
||||
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(sorted_idx.data()),
|
||||
sorted_idx.data());
|
||||
|
||||
if (accending) {
|
||||
cub::DeviceRadixSort::SortPairs(nullptr, bytes, values.data(),
|
||||
out.data().get(), sorted_idx.data(),
|
||||
sorted_idx.data(), sorted_idx.size());
|
||||
void *d_temp_storage = nullptr;
|
||||
cub::DispatchRadixSort<false, KeyT, ValueT, size_t>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
|
||||
sizeof(KeyT) * 8, false, nullptr, false);
|
||||
dh::TemporaryArray<char> storage(bytes);
|
||||
cub::DeviceRadixSort::SortPairs(storage.data().get(), bytes, values.data(),
|
||||
out.data().get(), sorted_idx.data(),
|
||||
sorted_idx.data(), sorted_idx.size());
|
||||
d_temp_storage = storage.data().get();
|
||||
cub::DispatchRadixSort<false, KeyT, ValueT, size_t>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
|
||||
sizeof(KeyT) * 8, false, nullptr, false);
|
||||
} else {
|
||||
cub::DeviceRadixSort::SortPairsDescending(
|
||||
nullptr, bytes, values.data(), out.data().get(), sorted_idx.data(),
|
||||
sorted_idx.data(), sorted_idx.size());
|
||||
void *d_temp_storage = nullptr;
|
||||
safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, size_t>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
|
||||
sizeof(KeyT) * 8, false, nullptr, false)));
|
||||
dh::TemporaryArray<char> storage(bytes);
|
||||
cub::DeviceRadixSort::SortPairsDescending(
|
||||
storage.data().get(), bytes, values.data(), out.data().get(),
|
||||
sorted_idx.data(), sorted_idx.data(), sorted_idx.size());
|
||||
d_temp_storage = storage.data().get();
|
||||
safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, size_t>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
|
||||
sizeof(KeyT) * 8, false, nullptr, false)));
|
||||
}
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
// Wrapper around cub sort for easier `descending` sort
|
||||
template <bool descending, typename KeyT, typename ValueT, typename OffsetIteratorT>
|
||||
// Wrapper around cub sort for easier `descending` sort and `size_t num_items`.
|
||||
template <bool descending, typename KeyT, typename ValueT,
|
||||
typename OffsetIteratorT>
|
||||
void DeviceSegmentedRadixSortPair(
|
||||
void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, // NOLINT
|
||||
void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, // NOLINT
|
||||
KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out,
|
||||
size_t num_items, size_t num_segments, OffsetIteratorT d_begin_offsets,
|
||||
OffsetIteratorT d_end_offsets, int begin_bit = 0,
|
||||
@@ -1253,12 +1346,12 @@ void DeviceSegmentedRadixSortPair(
|
||||
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in),
|
||||
d_values_out);
|
||||
using OffsetT = size_t;
|
||||
dh::safe_cuda((cub::DispatchSegmentedRadixSort<
|
||||
descending, KeyT, ValueT, OffsetIteratorT,
|
||||
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
|
||||
d_values, num_items, num_segments,
|
||||
d_begin_offsets, d_end_offsets, begin_bit,
|
||||
end_bit, false, nullptr, false)));
|
||||
safe_cuda((cub::DispatchSegmentedRadixSort<
|
||||
descending, KeyT, ValueT, OffsetIteratorT,
|
||||
OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
|
||||
d_values, num_items, num_segments,
|
||||
d_begin_offsets, d_end_offsets, begin_bit,
|
||||
end_bit, false, nullptr, false)));
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
@@ -1270,12 +1363,11 @@ void SegmentedArgSort(xgboost::common::Span<U> values,
|
||||
size_t n_groups = group_ptr.size() - 1;
|
||||
size_t bytes = 0;
|
||||
Iota(sorted_idx);
|
||||
CHECK_LT(sorted_idx.size(), 1 << 31);
|
||||
TemporaryArray<U> values_out(values.size());
|
||||
TemporaryArray<std::remove_const_t<U>> values_out(values.size());
|
||||
detail::DeviceSegmentedRadixSortPair<!accending>(
|
||||
nullptr, bytes, values.data(), values_out.data().get(),
|
||||
sorted_idx.data(), sorted_idx.data(), sorted_idx.size(), n_groups,
|
||||
group_ptr.data(), group_ptr.data() + 1);
|
||||
nullptr, bytes, values.data(), values_out.data().get(), sorted_idx.data(),
|
||||
sorted_idx.data(), sorted_idx.size(), n_groups, group_ptr.data(),
|
||||
group_ptr.data() + 1);
|
||||
dh::TemporaryArray<xgboost::common::byte> temp_storage(bytes);
|
||||
detail::DeviceSegmentedRadixSortPair<!accending>(
|
||||
temp_storage.data().get(), bytes, values.data(), values_out.data().get(),
|
||||
|
||||
@@ -26,6 +26,9 @@ XGBOOST_DEVICE inline float Sigmoid(float x) {
|
||||
return 1.0f / (1.0f + expf(-x));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
XGBOOST_DEVICE inline static T Sqr(T a) { return a * a; }
|
||||
|
||||
/*!
|
||||
* \brief Equality test for both integer and floating point.
|
||||
*/
|
||||
|
||||
@@ -99,7 +99,7 @@ std::vector<T> WeightedSamplingWithoutReplacement(
|
||||
auto k = std::log(u) / w;
|
||||
keys[i] = k;
|
||||
}
|
||||
auto ind = ArgSort<size_t>(keys, std::greater<>{});
|
||||
auto ind = ArgSort<size_t>(Span<float>{keys}, std::greater<>{});
|
||||
ind.resize(n);
|
||||
|
||||
std::vector<T> results(ind.size());
|
||||
|
||||
84
src/common/ranking_utils.cuh
Normal file
84
src/common/ranking_utils.cuh
Normal file
@@ -0,0 +1,84 @@
|
||||
/*!
|
||||
* Copyright 2021 by XGBoost Contributors
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_RANKING_UTILS_H_
|
||||
#define XGBOOST_COMMON_RANKING_UTILS_H_
|
||||
|
||||
#include <cub/cub.cuh>
|
||||
#include "xgboost/base.h"
|
||||
#include "device_helpers.cuh"
|
||||
#include "./math.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
/**
|
||||
* \param n Number of items (length of the base)
|
||||
* \param h hight
|
||||
*/
|
||||
XGBOOST_DEVICE inline size_t DiscreteTrapezoidArea(size_t n, size_t h) {
|
||||
n -= 1; // without diagonal entries
|
||||
h = std::min(n, h); // Specific for ranking.
|
||||
size_t total = ((n - (h - 1)) + n) * h / 2;
|
||||
return total;
|
||||
}
|
||||
|
||||
/**
|
||||
* Used for mapping many groups of trapezoid shaped computation onto CUDA blocks. The
|
||||
* trapezoid must be on upper right corner.
|
||||
*
|
||||
* Equivalent to loops like:
|
||||
*
|
||||
* \code
|
||||
* for (size i = 0; i < h; ++i) {
|
||||
* for (size_t j = i + 1; j < n; ++j) {
|
||||
* do_something();
|
||||
* }
|
||||
* }
|
||||
* \endcode
|
||||
*/
|
||||
template <typename U>
|
||||
inline size_t
|
||||
SegmentedTrapezoidThreads(xgboost::common::Span<U> group_ptr,
|
||||
xgboost::common::Span<size_t> out_group_threads_ptr,
|
||||
size_t h) {
|
||||
CHECK_GE(group_ptr.size(), 1);
|
||||
CHECK_EQ(group_ptr.size(), out_group_threads_ptr.size());
|
||||
dh::LaunchN(
|
||||
dh::CurrentDevice(), group_ptr.size(), [=] XGBOOST_DEVICE(size_t idx) {
|
||||
if (idx == 0) {
|
||||
out_group_threads_ptr[0] = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
size_t cnt = static_cast<size_t>(group_ptr[idx] - group_ptr[idx - 1]);
|
||||
out_group_threads_ptr[idx] = DiscreteTrapezoidArea(cnt, h);
|
||||
});
|
||||
dh::InclusiveSum(out_group_threads_ptr.data(), out_group_threads_ptr.data(),
|
||||
out_group_threads_ptr.size());
|
||||
size_t total = 0;
|
||||
dh::safe_cuda(cudaMemcpy(
|
||||
&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
|
||||
sizeof(total), cudaMemcpyDeviceToHost));
|
||||
return total;
|
||||
}
|
||||
|
||||
/**
|
||||
* Called inside kernel to obtain coordinate from trapezoid grid.
|
||||
*/
|
||||
XGBOOST_DEVICE inline void UnravelTrapeziodIdx(size_t i_idx, size_t n,
|
||||
size_t *out_i, size_t *out_j) {
|
||||
auto &i = *out_i;
|
||||
auto &j = *out_j;
|
||||
double idx = static_cast<double>(i_idx);
|
||||
double N = static_cast<double>(n);
|
||||
|
||||
i = std::ceil(-(0.5 - N + std::sqrt(common::Sqr(N - 0.5) + 2.0 * (-idx - 1.0)))) - 1.0;
|
||||
|
||||
auto I = static_cast<double>(i);
|
||||
size_t n_elems = -0.5 * common::Sqr(I) + (N - 0.5) * I;
|
||||
|
||||
j = idx - n_elems + i + 1;
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_COMMON_RANKING_UTILS_H_
|
||||
@@ -400,7 +400,9 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t
|
||||
group_ptr_.push_back(i);
|
||||
}
|
||||
}
|
||||
group_ptr_.push_back(query_ids.size());
|
||||
if (group_ptr_.back() != query_ids.size()) {
|
||||
group_ptr_.push_back(query_ids.size());
|
||||
}
|
||||
} else if (!std::strcmp(key, "label_lower_bound")) {
|
||||
auto& labels = labels_lower_bound_.HostVector();
|
||||
labels.resize(num);
|
||||
|
||||
340
src/metric/auc.cc
Normal file
340
src/metric/auc.cc
Normal file
@@ -0,0 +1,340 @@
|
||||
/*!
|
||||
* Copyright 2021 by XGBoost Contributors
|
||||
*/
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
#include "rabit/rabit.h"
|
||||
#include "xgboost/host_device_vector.h"
|
||||
#include "xgboost/metric.h"
|
||||
#include "auc.h"
|
||||
#include "../common/common.h"
|
||||
#include "../common/math.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace metric {
|
||||
|
||||
namespace detail {
|
||||
template <class T, std::size_t N, std::size_t... Idx>
|
||||
constexpr auto UnpackArr(std::array<T, N> &&arr, std::index_sequence<Idx...>) {
|
||||
return std::make_tuple(std::forward<std::array<T, N>>(arr)[Idx]...);
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
template <class T, std::size_t N>
|
||||
constexpr auto UnpackArr(std::array<T, N> &&arr) {
|
||||
return detail::UnpackArr(std::forward<std::array<T, N>>(arr),
|
||||
std::make_index_sequence<N>{});
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate AUC for binary classification problem. This function does not normalize the
|
||||
* AUC by 1 / (num_positive * num_negative), instead it returns a tuple for caller to
|
||||
* handle the normalization.
|
||||
*/
|
||||
std::tuple<float, float, float> BinaryAUC(std::vector<float> const &predts,
|
||||
std::vector<float> const &labels,
|
||||
std::vector<float> const &weights) {
|
||||
CHECK(!labels.empty());
|
||||
CHECK_EQ(labels.size(), predts.size());
|
||||
|
||||
float auc {0};
|
||||
auto const sorted_idx = common::ArgSort<size_t>(
|
||||
common::Span<float const>(predts), std::greater<>{});
|
||||
|
||||
auto get_weight = [&](size_t i) {
|
||||
return weights.empty() ? 1.0f : weights[sorted_idx[i]];
|
||||
};
|
||||
float label = labels[sorted_idx.front()];
|
||||
float w = get_weight(0);
|
||||
float fp = (1.0 - label) * w, tp = label * w;
|
||||
float tp_prev = 0, fp_prev = 0;
|
||||
// TODO(jiaming): We can parallize this if we have a parallel scan for CPU.
|
||||
for (size_t i = 1; i < sorted_idx.size(); ++i) {
|
||||
if (predts[sorted_idx[i]] != predts[sorted_idx[i-1]]) {
|
||||
auc += TrapesoidArea(fp_prev, fp, tp_prev, tp);
|
||||
tp_prev = tp;
|
||||
fp_prev = fp;
|
||||
}
|
||||
label = labels[sorted_idx[i]];
|
||||
float w = get_weight(i);
|
||||
fp += (1.0f - label) * w;
|
||||
tp += label * w;
|
||||
}
|
||||
|
||||
auc += TrapesoidArea(fp_prev, fp, tp_prev, tp);
|
||||
if (fp <= 0.0f || tp <= 0.0f) {
|
||||
auc = 0;
|
||||
fp = 0;
|
||||
tp = 0;
|
||||
}
|
||||
|
||||
return std::make_tuple(fp, tp, auc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate AUC for multi-class classification problem using 1-vs-rest approach.
|
||||
*
|
||||
* TODO(jiaming): Use better algorithms like:
|
||||
*
|
||||
* - Kleiman, Ross and Page, David. $AUC_{\mu}$: A Performance Metric for Multi-Class
|
||||
* Machine Learning Models
|
||||
*/
|
||||
float MultiClassOVR(std::vector<float> const& predts, MetaInfo const& info) {
|
||||
auto n_classes = predts.size() / info.labels_.Size();
|
||||
CHECK_NE(n_classes, 0);
|
||||
auto const& labels = info.labels_.ConstHostVector();
|
||||
|
||||
std::vector<float> results(n_classes * 3, 0);
|
||||
auto s_results = common::Span<float>(results);
|
||||
auto local_area = s_results.subspan(0, n_classes);
|
||||
auto tp = s_results.subspan(n_classes, n_classes);
|
||||
auto auc = s_results.subspan(2 * n_classes, n_classes);
|
||||
|
||||
if (!info.labels_.Empty()) {
|
||||
dmlc::OMPException omp_handler;
|
||||
#pragma omp parallel for
|
||||
for (omp_ulong c = 0; c < n_classes; ++c) {
|
||||
omp_handler.Run([&]() {
|
||||
std::vector<float> proba(info.labels_.Size());
|
||||
std::vector<float> response(info.labels_.Size());
|
||||
for (size_t i = 0; i < proba.size(); ++i) {
|
||||
proba[i] = predts[i * n_classes + c];
|
||||
response[i] = labels[i] == c ? 1.0f : 0.0;
|
||||
}
|
||||
float fp;
|
||||
std::tie(fp, tp[c], auc[c]) =
|
||||
BinaryAUC(proba, response, info.weights_.ConstHostVector());
|
||||
local_area[c] = fp * tp[c];
|
||||
});
|
||||
}
|
||||
omp_handler.Rethrow();
|
||||
}
|
||||
|
||||
// we have 2 averages going in here, first is among workers, second is among classes.
|
||||
// allreduce sums up fp/tp auc for each class.
|
||||
rabit::Allreduce<rabit::op::Sum>(results.data(), results.size());
|
||||
float auc_sum{0};
|
||||
float tp_sum{0};
|
||||
for (size_t c = 0; c < n_classes; ++c) {
|
||||
if (local_area[c] != 0) {
|
||||
// normalize and weight it by prevalence. After allreduce, `local_area` means the
|
||||
// total covered area (not area under curve, rather it's the accessible are for each
|
||||
// worker) for each class.
|
||||
auc_sum += auc[c] / local_area[c] * tp[c];
|
||||
tp_sum += tp[c];
|
||||
} else {
|
||||
auc_sum = std::numeric_limits<float>::quiet_NaN();
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (tp_sum == 0 || std::isnan(auc_sum)) {
|
||||
auc_sum = std::numeric_limits<float>::quiet_NaN();
|
||||
} else {
|
||||
auc_sum /= tp_sum;
|
||||
}
|
||||
return auc_sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate AUC for 1 ranking group;
|
||||
*/
|
||||
float GroupRankingAUC(common::Span<float const> predts,
|
||||
common::Span<float const> labels, float w) {
|
||||
// on ranking, we just count all pairs.
|
||||
float auc{0};
|
||||
auto const sorted_idx = common::ArgSort<size_t>(labels, std::greater<>{});
|
||||
w = common::Sqr(w);
|
||||
|
||||
float sum_w = 0.0f;
|
||||
for (size_t i = 0; i < labels.size(); ++i) {
|
||||
for (size_t j = i + 1; j < labels.size(); ++j) {
|
||||
auto predt = predts[sorted_idx[i]] - predts[sorted_idx[j]];
|
||||
if (predt > 0) {
|
||||
predt = 1.0;
|
||||
} else if (predt == 0) {
|
||||
predt = 0.5;
|
||||
} else {
|
||||
predt = 0;
|
||||
}
|
||||
auc += predt * w;
|
||||
sum_w += w;
|
||||
}
|
||||
}
|
||||
if (sum_w != 0) {
|
||||
auc /= sum_w;
|
||||
}
|
||||
CHECK_LE(auc, 1.0f);
|
||||
return auc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cast LTR problem to binary classification problem by comparing pairs.
|
||||
*/
|
||||
std::pair<float, uint32_t> RankingAUC(std::vector<float> const &predts,
|
||||
MetaInfo const &info) {
|
||||
CHECK_GE(info.group_ptr_.size(), 2);
|
||||
uint32_t n_groups = info.group_ptr_.size() - 1;
|
||||
float sum_auc = 0;
|
||||
auto s_predts = common::Span<float const>{predts};
|
||||
auto s_labels = info.labels_.ConstHostSpan();
|
||||
auto s_weights = info.weights_.ConstHostSpan();
|
||||
|
||||
std::atomic<uint32_t> invalid_groups{0};
|
||||
dmlc::OMPException omp_handler;
|
||||
|
||||
#pragma omp parallel for reduction(+:sum_auc)
|
||||
for (omp_ulong g = 1; g < info.group_ptr_.size(); ++g) {
|
||||
omp_handler.Run([&]() {
|
||||
size_t cnt = info.group_ptr_[g] - info.group_ptr_[g - 1];
|
||||
float w = s_weights.empty() ? 1.0f : s_weights[g - 1];
|
||||
auto g_predts = s_predts.subspan(info.group_ptr_[g - 1], cnt);
|
||||
auto g_labels = s_labels.subspan(info.group_ptr_[g - 1], cnt);
|
||||
float auc;
|
||||
if (g_labels.size() < 3) {
|
||||
// With 2 documents, there's only 1 comparison can be made. So either
|
||||
// TP or FP will be zero.
|
||||
invalid_groups++;
|
||||
auc = 0;
|
||||
} else {
|
||||
auc = GroupRankingAUC(g_predts, g_labels, w);
|
||||
}
|
||||
sum_auc += auc;
|
||||
});
|
||||
}
|
||||
omp_handler.Rethrow();
|
||||
|
||||
if (invalid_groups != 0) {
|
||||
InvalidGroupAUC();
|
||||
}
|
||||
|
||||
return std::make_pair(sum_auc, n_groups - invalid_groups);
|
||||
}
|
||||
|
||||
class EvalAUC : public Metric {
|
||||
std::shared_ptr<DeviceAUCCache> d_cache_;
|
||||
|
||||
public:
|
||||
float Eval(const HostDeviceVector<bst_float> &preds, const MetaInfo &info,
|
||||
bool distributed) override {
|
||||
float auc {0};
|
||||
if (tparam_->gpu_id != GenericParameter::kCpuId) {
|
||||
preds.SetDevice(tparam_->gpu_id);
|
||||
info.labels_.SetDevice(tparam_->gpu_id);
|
||||
info.weights_.SetDevice(tparam_->gpu_id);
|
||||
}
|
||||
if (!info.group_ptr_.empty()) {
|
||||
/**
|
||||
* learning to rank
|
||||
*/
|
||||
if (!info.weights_.Empty()) {
|
||||
CHECK_EQ(info.weights_.Size(), info.group_ptr_.size() - 1);
|
||||
}
|
||||
uint32_t valid_groups = 0;
|
||||
if (!info.labels_.Empty()) {
|
||||
CHECK_EQ(info.group_ptr_.back(), info.labels_.Size());
|
||||
if (tparam_->gpu_id == GenericParameter::kCpuId) {
|
||||
std::tie(auc, valid_groups) =
|
||||
RankingAUC(preds.ConstHostVector(), info);
|
||||
} else {
|
||||
std::tie(auc, valid_groups) = GPURankingAUC(
|
||||
preds.ConstDeviceSpan(), info, tparam_->gpu_id, &this->d_cache_);
|
||||
}
|
||||
}
|
||||
|
||||
std::array<float, 2> results{auc, static_cast<float>(valid_groups)};
|
||||
rabit::Allreduce<rabit::op::Sum>(results.data(), results.size());
|
||||
auc = results[0];
|
||||
valid_groups = static_cast<uint32_t>(results[1]);
|
||||
|
||||
if (valid_groups <= 0) {
|
||||
auc = std::numeric_limits<float>::quiet_NaN();
|
||||
} else {
|
||||
auc /= valid_groups;
|
||||
CHECK_LE(auc, 1) << "Total AUC across groups: " << auc * valid_groups
|
||||
<< ", valid groups: " << valid_groups;
|
||||
}
|
||||
} else if (info.labels_.Size() != preds.Size() &&
|
||||
preds.Size() % info.labels_.Size() == 0) {
|
||||
/**
|
||||
* multi class
|
||||
*/
|
||||
if (tparam_->gpu_id == GenericParameter::kCpuId) {
|
||||
auc = MultiClassOVR(preds.ConstHostVector(), info);
|
||||
} else {
|
||||
auc = GPUMultiClassAUCOVR(preds.ConstDeviceSpan(), info, tparam_->gpu_id,
|
||||
&this->d_cache_);
|
||||
}
|
||||
} else {
|
||||
/**
|
||||
* binary classification
|
||||
*/
|
||||
float fp{0}, tp{0};
|
||||
if (!(preds.Empty() || info.labels_.Empty())) {
|
||||
if (tparam_->gpu_id == GenericParameter::kCpuId) {
|
||||
std::tie(fp, tp, auc) =
|
||||
BinaryAUC(preds.ConstHostVector(), info.labels_.ConstHostVector(),
|
||||
info.weights_.ConstHostVector());
|
||||
} else {
|
||||
std::tie(fp, tp, auc) = GPUBinaryAUC(
|
||||
preds.ConstDeviceSpan(), info, tparam_->gpu_id, &this->d_cache_);
|
||||
}
|
||||
}
|
||||
float local_area = fp * tp;
|
||||
std::array<float, 2> result{auc, local_area};
|
||||
rabit::Allreduce<rabit::op::Sum>(result.data(), result.size());
|
||||
std::tie(auc, local_area) = UnpackArr(std::move(result));
|
||||
if (local_area <= 0) {
|
||||
// the dataset across all workers have only positive or negative sample
|
||||
auc = std::numeric_limits<float>::quiet_NaN();
|
||||
} else {
|
||||
// normalization
|
||||
auc = auc / local_area;
|
||||
}
|
||||
}
|
||||
if (std::isnan(auc)) {
|
||||
LOG(WARNING) << "Dataset contains only positive or negative samples.";
|
||||
}
|
||||
return auc;
|
||||
}
|
||||
|
||||
char const* Name() const override {
|
||||
return "auc";
|
||||
}
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_METRIC(EvalBinaryAUC, "auc")
|
||||
.describe("Receiver Operating Characteristic Area Under the Curve.")
|
||||
.set_body([](const char*) { return new EvalAUC(); });
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
std::tuple<float, float, float>
|
||||
GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
|
||||
int32_t device, std::shared_ptr<DeviceAUCCache> *p_cache) {
|
||||
common::AssertGPUSupport();
|
||||
return std::make_tuple(0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
float GPUMultiClassAUCOVR(common::Span<float const> predts, MetaInfo const &info,
|
||||
int32_t device, std::shared_ptr<DeviceAUCCache>* cache) {
|
||||
common::AssertGPUSupport();
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::pair<float, uint32_t>
|
||||
GPURankingAUC(common::Span<float const> predts, MetaInfo const &info,
|
||||
int32_t device, std::shared_ptr<DeviceAUCCache> *p_cache) {
|
||||
common::AssertGPUSupport();
|
||||
return std::make_pair(0.0f, 0u);
|
||||
}
|
||||
struct DeviceAUCCache {};
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
} // namespace metric
|
||||
} // namespace xgboost
|
||||
540
src/metric/auc.cu
Normal file
540
src/metric/auc.cu
Normal file
@@ -0,0 +1,540 @@
|
||||
/*!
|
||||
* Copyright 2021 by XGBoost Contributors
|
||||
*/
|
||||
#include <thrust/scan.h>
|
||||
#include <cub/cub.cuh>
|
||||
#include <cassert>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <tuple>
|
||||
|
||||
#include "rabit/rabit.h"
|
||||
#include "xgboost/span.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "auc.h"
|
||||
#include "../common/device_helpers.cuh"
|
||||
#include "../common/ranking_utils.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace metric {
|
||||
namespace {
|
||||
template <typename T>
|
||||
class Discard : public thrust::discard_iterator<T> {
|
||||
public:
|
||||
using value_type = T; // NOLINT
|
||||
};
|
||||
|
||||
struct GetWeightOp {
|
||||
common::Span<float const> weights;
|
||||
common::Span<size_t const> sorted_idx;
|
||||
|
||||
__device__ float operator()(size_t i) const {
|
||||
return weights.empty() ? 1.0f : weights[sorted_idx[i]];
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
/**
|
||||
* A cache to GPU data to avoid reallocating memory.
|
||||
*/
|
||||
struct DeviceAUCCache {
|
||||
// Pair of FP/TP
|
||||
using Pair = thrust::pair<float, float>;
|
||||
// index sorted by prediction value
|
||||
dh::device_vector<size_t> sorted_idx;
|
||||
// track FP/TP for computation on trapesoid area
|
||||
dh::device_vector<Pair> fptp;
|
||||
// track FP_PREV/TP_PREV for computation on trapesoid area
|
||||
dh::device_vector<Pair> neg_pos;
|
||||
// index of unique prediction values.
|
||||
dh::device_vector<size_t> unique_idx;
|
||||
// p^T: transposed prediction matrix, used by MultiClassAUC
|
||||
dh::device_vector<float> predts_t;
|
||||
std::unique_ptr<dh::AllReducer> reducer;
|
||||
|
||||
void Init(common::Span<float const> predts, bool is_multi, int32_t device) {
|
||||
if (sorted_idx.size() != predts.size()) {
|
||||
sorted_idx.resize(predts.size());
|
||||
fptp.resize(sorted_idx.size());
|
||||
unique_idx.resize(sorted_idx.size());
|
||||
neg_pos.resize(sorted_idx.size());
|
||||
if (is_multi) {
|
||||
predts_t.resize(sorted_idx.size());
|
||||
reducer.reset(new dh::AllReducer);
|
||||
reducer->Init(rabit::GetRank());
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* The GPU implementation uses same calculation as CPU with a few more steps to distribute
|
||||
* work across threads:
|
||||
*
|
||||
* - Run scan to obtain TP/FP values, which are right coordinates of trapesoid.
|
||||
* - Find distinct prediction values and get the corresponding FP_PREV/TP_PREV value,
|
||||
* which are left coordinates of trapesoid.
|
||||
* - Reduce the scan array into 1 AUC value.
|
||||
*/
|
||||
std::tuple<float, float, float>
|
||||
GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
|
||||
int32_t device, std::shared_ptr<DeviceAUCCache> *p_cache) {
|
||||
auto& cache = *p_cache;
|
||||
if (!cache) {
|
||||
cache.reset(new DeviceAUCCache);
|
||||
}
|
||||
cache->Init(predts, false, device);
|
||||
|
||||
auto labels = info.labels_.ConstDeviceSpan();
|
||||
auto weights = info.weights_.ConstDeviceSpan();
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
|
||||
CHECK(!labels.empty());
|
||||
CHECK_EQ(labels.size(), predts.size());
|
||||
|
||||
/**
|
||||
* Create sorted index for each class
|
||||
*/
|
||||
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
|
||||
dh::ArgSort<false>(predts, d_sorted_idx);
|
||||
|
||||
/**
|
||||
* Linear scan
|
||||
*/
|
||||
auto get_weight = GetWeightOp{weights, d_sorted_idx};
|
||||
using Pair = thrust::pair<float, float>;
|
||||
auto get_fp_tp = [=]__device__(size_t i) {
|
||||
size_t idx = d_sorted_idx[i];
|
||||
|
||||
float label = labels[idx];
|
||||
float w = get_weight(i);
|
||||
|
||||
float fp = (1.0 - label) * w;
|
||||
float tp = label * w;
|
||||
|
||||
return thrust::make_pair(fp, tp);
|
||||
}; // NOLINT
|
||||
auto d_fptp = dh::ToSpan(cache->fptp);
|
||||
dh::LaunchN(device, d_sorted_idx.size(),
|
||||
[=] __device__(size_t i) { d_fptp[i] = get_fp_tp(i); });
|
||||
|
||||
dh::XGBDeviceAllocator<char> alloc;
|
||||
auto d_unique_idx = dh::ToSpan(cache->unique_idx);
|
||||
dh::Iota(d_unique_idx, device);
|
||||
|
||||
auto uni_key = dh::MakeTransformIterator<float>(
|
||||
thrust::make_counting_iterator(0),
|
||||
[=] __device__(size_t i) { return predts[d_sorted_idx[i]]; });
|
||||
auto end_unique = thrust::unique_by_key_copy(
|
||||
thrust::cuda::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
|
||||
dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
|
||||
dh::tbegin(d_unique_idx));
|
||||
d_unique_idx = d_unique_idx.subspan(0, end_unique.second - dh::tbegin(d_unique_idx));
|
||||
|
||||
dh::InclusiveScan(
|
||||
dh::tbegin(d_fptp), dh::tbegin(d_fptp),
|
||||
[=] __device__(Pair const &l, Pair const &r) {
|
||||
return thrust::make_pair(l.first + r.first, l.second + r.second);
|
||||
},
|
||||
d_fptp.size());
|
||||
|
||||
auto d_neg_pos = dh::ToSpan(cache->neg_pos);
|
||||
// scatter unique negaive/positive values
|
||||
// shift to right by 1 with initial value being 0
|
||||
dh::LaunchN(device, d_unique_idx.size(), [=] __device__(size_t i) {
|
||||
if (d_unique_idx[i] == 0) { // first unique index is 0
|
||||
assert(i == 0);
|
||||
d_neg_pos[0] = {0, 0};
|
||||
return;
|
||||
}
|
||||
d_neg_pos[d_unique_idx[i]] = d_fptp[d_unique_idx[i] - 1];
|
||||
if (i == d_unique_idx.size() - 1) {
|
||||
// last one needs to be included, may override above assignment if the last
|
||||
// prediction value is district from previous one.
|
||||
d_neg_pos.back() = d_fptp[d_unique_idx[i] - 1];
|
||||
return;
|
||||
}
|
||||
});
|
||||
|
||||
auto in = dh::MakeTransformIterator<float>(
|
||||
thrust::make_counting_iterator(0), [=] __device__(size_t i) {
|
||||
float fp, tp;
|
||||
float fp_prev, tp_prev;
|
||||
if (i == 0) {
|
||||
// handle the last element
|
||||
thrust::tie(fp, tp) = d_fptp.back();
|
||||
thrust::tie(fp_prev, tp_prev) = d_neg_pos[d_unique_idx.back()];
|
||||
} else {
|
||||
thrust::tie(fp, tp) = d_fptp[d_unique_idx[i] - 1];
|
||||
thrust::tie(fp_prev, tp_prev) = d_neg_pos[d_unique_idx[i - 1]];
|
||||
}
|
||||
return TrapesoidArea(fp_prev, fp, tp_prev, tp);
|
||||
});
|
||||
|
||||
Pair last = cache->fptp.back();
|
||||
float auc = thrust::reduce(thrust::cuda::par(alloc), in, in + d_unique_idx.size());
|
||||
return std::make_tuple(last.first, last.second, auc);
|
||||
}
|
||||
|
||||
void Transpose(common::Span<float const> in, common::Span<float> out, size_t m,
|
||||
size_t n, int32_t device) {
|
||||
CHECK_EQ(in.size(), out.size());
|
||||
CHECK_EQ(in.size(), m * n);
|
||||
dh::LaunchN(device, in.size(), [=] __device__(size_t i) {
|
||||
size_t col = i / m;
|
||||
size_t row = i % m;
|
||||
size_t idx = row * n + col;
|
||||
out[i] = in[idx];
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Last index of a group in a CSR style of index pointer.
|
||||
*/
|
||||
template <typename Idx>
|
||||
XGBOOST_DEVICE size_t LastOf(size_t group, common::Span<Idx> indptr) {
|
||||
return indptr[group + 1] - 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* MultiClass implementation is similar to binary classification, except we need to split
|
||||
* up each class in all kernels.
|
||||
*/
|
||||
float GPUMultiClassAUCOVR(common::Span<float const> predts, MetaInfo const &info,
|
||||
int32_t device, std::shared_ptr<DeviceAUCCache>* p_cache) {
|
||||
auto& cache = *p_cache;
|
||||
if (!cache) {
|
||||
cache.reset(new DeviceAUCCache);
|
||||
}
|
||||
cache->Init(predts, true, device);
|
||||
|
||||
auto labels = info.labels_.ConstDeviceSpan();
|
||||
auto weights = info.weights_.ConstDeviceSpan();
|
||||
|
||||
size_t n_samples = labels.size();
|
||||
size_t n_classes = predts.size() / labels.size();
|
||||
CHECK_NE(n_classes, 0);
|
||||
|
||||
/**
|
||||
* Create sorted index for each class
|
||||
*/
|
||||
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
|
||||
dh::Iota(d_sorted_idx, device);
|
||||
auto d_predts_t = dh::ToSpan(cache->predts_t);
|
||||
Transpose(predts, d_predts_t, n_samples, n_classes, device);
|
||||
|
||||
dh::TemporaryArray<uint32_t> class_ptr(n_classes + 1, 0);
|
||||
auto d_class_ptr = dh::ToSpan(class_ptr);
|
||||
dh::LaunchN(device, n_classes + 1, [=]__device__(size_t i) {
|
||||
d_class_ptr[i] = i * n_samples;
|
||||
});
|
||||
// no out-of-place sort for thrust, cub sort doesn't accept general iterator. So can't
|
||||
// use transform iterator in sorting.
|
||||
dh::SegmentedArgSort<false>(d_predts_t, d_class_ptr, d_sorted_idx);
|
||||
|
||||
/**
|
||||
* Linear scan
|
||||
*/
|
||||
dh::caching_device_vector<float> d_auc(n_classes, 0);
|
||||
auto s_d_auc = dh::ToSpan(d_auc);
|
||||
auto get_weight = GetWeightOp{weights, d_sorted_idx};
|
||||
using Pair = thrust::pair<float, float>;
|
||||
auto d_fptp = dh::ToSpan(cache->fptp);
|
||||
auto get_fp_tp = [=]__device__(size_t i) {
|
||||
size_t idx = d_sorted_idx[i];
|
||||
|
||||
size_t class_id = i / n_samples;
|
||||
// labels is a vector of size n_samples.
|
||||
float label = labels[idx % n_samples] == class_id;
|
||||
|
||||
float w = get_weight(i % n_samples);
|
||||
float fp = (1.0 - label) * w;
|
||||
float tp = label * w;
|
||||
return thrust::make_pair(fp, tp);
|
||||
}; // NOLINT
|
||||
dh::LaunchN(device, d_sorted_idx.size(),
|
||||
[=] __device__(size_t i) { d_fptp[i] = get_fp_tp(i); });
|
||||
|
||||
/**
|
||||
* Handle duplicated predictions
|
||||
*/
|
||||
dh::XGBDeviceAllocator<char> alloc;
|
||||
auto d_unique_idx = dh::ToSpan(cache->unique_idx);
|
||||
dh::Iota(d_unique_idx, device);
|
||||
auto uni_key = dh::MakeTransformIterator<thrust::pair<uint32_t, float>>(
|
||||
thrust::make_counting_iterator(0), [=] __device__(size_t i) {
|
||||
uint32_t class_id = i / n_samples;
|
||||
float predt = d_predts_t[d_sorted_idx[i]];
|
||||
return thrust::make_pair(class_id, predt);
|
||||
});
|
||||
|
||||
// unique values are sparse, so we need a CSR style indptr
|
||||
dh::TemporaryArray<uint32_t> unique_class_ptr(class_ptr.size() + 1);
|
||||
auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
|
||||
auto n_uniques = dh::SegmentedUniqueByKey(
|
||||
thrust::cuda::par(alloc),
|
||||
dh::tbegin(d_class_ptr),
|
||||
dh::tend(d_class_ptr),
|
||||
uni_key,
|
||||
uni_key + d_sorted_idx.size(),
|
||||
dh::tbegin(d_unique_idx),
|
||||
d_unique_class_ptr.data(),
|
||||
dh::tbegin(d_unique_idx),
|
||||
thrust::equal_to<thrust::pair<uint32_t, float>>{});
|
||||
d_unique_idx = d_unique_idx.subspan(0, n_uniques);
|
||||
|
||||
using Triple = thrust::tuple<uint32_t, float, float>;
|
||||
// expand to tuple to include class id
|
||||
auto fptp_it_in = dh::MakeTransformIterator<Triple>(
|
||||
thrust::make_counting_iterator(0), [=] __device__(size_t i) {
|
||||
uint32_t class_id = i / n_samples;
|
||||
return thrust::make_tuple(class_id, d_fptp[i].first, d_fptp[i].second);
|
||||
});
|
||||
// shrink down to pair
|
||||
auto fptp_it_out = thrust::make_transform_output_iterator(
|
||||
dh::tbegin(d_fptp), [=] __device__(Triple const &t) {
|
||||
return thrust::make_pair(thrust::get<1>(t), thrust::get<2>(t));
|
||||
});
|
||||
dh::InclusiveScan(
|
||||
fptp_it_in, fptp_it_out,
|
||||
[=] __device__(Triple const &l, Triple const &r) {
|
||||
uint32_t l_cid = thrust::get<0>(l);
|
||||
uint32_t r_cid = thrust::get<0>(r);
|
||||
if (l_cid != r_cid) {
|
||||
return r;
|
||||
}
|
||||
|
||||
return Triple(r_cid, // class_id
|
||||
thrust::get<1>(l) + thrust::get<1>(r), // fp
|
||||
thrust::get<2>(l) + thrust::get<2>(r)); // tp
|
||||
},
|
||||
d_fptp.size());
|
||||
|
||||
// scatter unique FP_PREV/TP_PREV values
|
||||
auto d_neg_pos = dh::ToSpan(cache->neg_pos);
|
||||
// When dataset is not empty, each class must have at least 1 (unique) sample
|
||||
// prediction, so no need to handle special case.
|
||||
dh::LaunchN(device, d_unique_idx.size(), [=]__device__(size_t i) {
|
||||
if (d_unique_idx[i] % n_samples == 0) { // first unique index is 0
|
||||
assert(d_unique_idx[i] % n_samples == 0);
|
||||
d_neg_pos[d_unique_idx[i]] = {0, 0}; // class_id * n_samples = i
|
||||
return;
|
||||
}
|
||||
uint32_t class_id = d_unique_idx[i] / n_samples;
|
||||
d_neg_pos[d_unique_idx[i]] = d_fptp[d_unique_idx[i] - 1];
|
||||
if (i == LastOf(class_id, d_unique_class_ptr)) {
|
||||
// last one needs to be included.
|
||||
size_t last = d_unique_idx[LastOf(class_id, d_unique_class_ptr)];
|
||||
d_neg_pos[LastOf(class_id, d_class_ptr)] = d_fptp[last - 1];
|
||||
return;
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* Reduce the result for each class
|
||||
*/
|
||||
auto key_in = dh::MakeTransformIterator<uint32_t>(
|
||||
thrust::make_counting_iterator(0), [=] __device__(size_t i) {
|
||||
size_t class_id = d_unique_idx[i] / n_samples;
|
||||
return class_id;
|
||||
});
|
||||
auto val_in = dh::MakeTransformIterator<float>(
|
||||
thrust::make_counting_iterator(0), [=] __device__(size_t i) {
|
||||
size_t class_id = d_unique_idx[i] / n_samples;
|
||||
float fp, tp;
|
||||
float fp_prev, tp_prev;
|
||||
if (i == d_unique_class_ptr[class_id]) {
|
||||
// first item is ignored, we use this thread to calculate the last item
|
||||
thrust::tie(fp, tp) = d_fptp[class_id * n_samples + (n_samples - 1)];
|
||||
thrust::tie(fp_prev, tp_prev) =
|
||||
d_neg_pos[d_unique_idx[LastOf(class_id, d_unique_class_ptr)]];
|
||||
} else {
|
||||
thrust::tie(fp, tp) = d_fptp[d_unique_idx[i] - 1];
|
||||
thrust::tie(fp_prev, tp_prev) = d_neg_pos[d_unique_idx[i - 1]];
|
||||
}
|
||||
float auc = TrapesoidArea(fp_prev, fp, tp_prev, tp);
|
||||
return auc;
|
||||
});
|
||||
|
||||
thrust::reduce_by_key(thrust::cuda::par(alloc), key_in,
|
||||
key_in + d_unique_idx.size(), val_in,
|
||||
thrust::make_discard_iterator(), d_auc.begin());
|
||||
|
||||
/**
|
||||
* Scale the classes with number of samples for each class.
|
||||
*/
|
||||
dh::TemporaryArray<float> resutls(n_classes * 4);
|
||||
auto d_results = dh::ToSpan(resutls);
|
||||
auto local_area = d_results.subspan(0, n_classes);
|
||||
auto fp = d_results.subspan(n_classes, n_classes);
|
||||
auto tp = d_results.subspan(2 * n_classes, n_classes);
|
||||
auto auc = d_results.subspan(3 * n_classes, n_classes);
|
||||
|
||||
dh::LaunchN(device, n_classes, [=] __device__(size_t c) {
|
||||
auc[c] = s_d_auc[c];
|
||||
auto last = d_fptp[n_samples * c + (n_samples - 1)];
|
||||
fp[c] = last.first;
|
||||
tp[c] = last.second;
|
||||
local_area[c] = last.first * last.second;
|
||||
});
|
||||
if (rabit::IsDistributed()) {
|
||||
cache->reducer->AllReduceSum(resutls.data().get(), resutls.data().get(),
|
||||
resutls.size());
|
||||
}
|
||||
auto reduce_in = dh::MakeTransformIterator<thrust::pair<float, float>>(
|
||||
thrust::make_counting_iterator(0), [=] __device__(size_t i) {
|
||||
if (local_area[i] > 0) {
|
||||
return thrust::make_pair(auc[i] / local_area[i] * tp[i], tp[i]);
|
||||
}
|
||||
return thrust::make_pair(std::numeric_limits<float>::quiet_NaN(), 0.0f);
|
||||
});
|
||||
|
||||
float tp_sum;
|
||||
float auc_sum;
|
||||
thrust::tie(auc_sum, tp_sum) = thrust::reduce(
|
||||
thrust::cuda::par(alloc), reduce_in, reduce_in + n_classes,
|
||||
thrust::make_pair(0.0f, 0.0f),
|
||||
[=] __device__(auto const &l, auto const &r) {
|
||||
return thrust::make_pair(l.first + r.first, l.second + r.second);
|
||||
});
|
||||
if (tp_sum != 0 && !std::isnan(auc_sum)) {
|
||||
auc_sum /= tp_sum;
|
||||
} else {
|
||||
return std::numeric_limits<float>::quiet_NaN();
|
||||
}
|
||||
return auc_sum;
|
||||
}
|
||||
|
||||
namespace {
|
||||
struct RankScanItem {
|
||||
size_t idx;
|
||||
float predt;
|
||||
float w;
|
||||
bst_group_t group_id;
|
||||
};
|
||||
} // anonymous namespace
|
||||
|
||||
std::pair<float, uint32_t>
|
||||
GPURankingAUC(common::Span<float const> predts, MetaInfo const &info,
|
||||
int32_t device, std::shared_ptr<DeviceAUCCache> *p_cache) {
|
||||
auto& cache = *p_cache;
|
||||
if (!cache) {
|
||||
cache.reset(new DeviceAUCCache);
|
||||
}
|
||||
cache->Init(predts, false, device);
|
||||
|
||||
dh::caching_device_vector<bst_group_t> group_ptr(info.group_ptr_);
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
|
||||
auto d_group_ptr = dh::ToSpan(group_ptr);
|
||||
/**
|
||||
* Validate the dataset
|
||||
*/
|
||||
auto check_it = dh::MakeTransformIterator<size_t>(
|
||||
thrust::make_counting_iterator(0),
|
||||
[=] __device__(size_t i) { return d_group_ptr[i + 1] - d_group_ptr[i]; });
|
||||
size_t n_valid = thrust::count_if(
|
||||
thrust::cuda::par(alloc), check_it, check_it + group_ptr.size() - 1,
|
||||
[=] __device__(size_t len) { return len >= 3; });
|
||||
if (n_valid < info.group_ptr_.size() - 1) {
|
||||
InvalidGroupAUC();
|
||||
}
|
||||
if (n_valid == 0) {
|
||||
return std::make_pair(0.0f, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sort the labels
|
||||
*/
|
||||
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
|
||||
auto d_labels = info.labels_.ConstDeviceSpan();
|
||||
|
||||
dh::Iota(d_sorted_idx, device);
|
||||
dh::SegmentedArgSort<false>(d_labels, d_group_ptr, d_sorted_idx);
|
||||
|
||||
auto d_weights = info.weights_.ConstDeviceSpan();
|
||||
|
||||
dh::caching_device_vector<size_t> threads_group_ptr(group_ptr.size(), 0);
|
||||
auto d_threads_group_ptr = dh::ToSpan(threads_group_ptr);
|
||||
// Use max to represent triangle
|
||||
auto n_threads = common::SegmentedTrapezoidThreads(
|
||||
d_group_ptr, d_threads_group_ptr, std::numeric_limits<size_t>::max());
|
||||
// get the coordinate in nested summation
|
||||
auto get_i_j = [=]__device__(size_t idx, size_t query_group_idx) {
|
||||
auto data_group_begin = d_group_ptr[query_group_idx];
|
||||
size_t n_samples = d_group_ptr[query_group_idx + 1] - data_group_begin;
|
||||
auto thread_group_begin = d_threads_group_ptr[query_group_idx];
|
||||
auto idx_in_thread_group = idx - thread_group_begin;
|
||||
|
||||
size_t i, j;
|
||||
common::UnravelTrapeziodIdx(idx_in_thread_group, n_samples, &i, &j);
|
||||
// we use global index among all groups for sorted idx, so i, j should also be global
|
||||
// index.
|
||||
i += data_group_begin;
|
||||
j += data_group_begin;
|
||||
return thrust::make_pair(i, j);
|
||||
}; // NOLINT
|
||||
auto in = dh::MakeTransformIterator<RankScanItem>(
|
||||
thrust::make_counting_iterator(0), [=] __device__(size_t idx) {
|
||||
bst_group_t query_group_idx = dh::SegmentId(d_threads_group_ptr, idx);
|
||||
auto data_group_begin = d_group_ptr[query_group_idx];
|
||||
size_t n_samples = d_group_ptr[query_group_idx + 1] - data_group_begin;
|
||||
if (n_samples < 3) {
|
||||
// at least 3 documents are required.
|
||||
return RankScanItem{idx, 0, 0, query_group_idx};
|
||||
}
|
||||
|
||||
size_t i, j;
|
||||
thrust::tie(i, j) = get_i_j(idx, query_group_idx);
|
||||
|
||||
float predt = predts[d_sorted_idx[i]] - predts[d_sorted_idx[j]];
|
||||
float w = common::Sqr(d_weights.empty() ? 1.0f : d_weights[query_group_idx]);
|
||||
if (predt > 0) {
|
||||
predt = 1.0;
|
||||
} else if (predt == 0) {
|
||||
predt = 0.5;
|
||||
} else {
|
||||
predt = 0;
|
||||
}
|
||||
predt *= w;
|
||||
return RankScanItem{idx, predt, w, query_group_idx};
|
||||
});
|
||||
|
||||
dh::TemporaryArray<float> d_auc(group_ptr.size() - 1);
|
||||
auto s_d_auc = dh::ToSpan(d_auc);
|
||||
auto out = thrust::make_transform_output_iterator(
|
||||
Discard<RankScanItem>(), [=] __device__(RankScanItem const &item) -> RankScanItem {
|
||||
auto group_id = item.group_id;
|
||||
assert(group_id < d_group_ptr.size());
|
||||
auto data_group_begin = d_group_ptr[group_id];
|
||||
size_t n_samples = d_group_ptr[group_id + 1] - data_group_begin;
|
||||
// last item of current group
|
||||
if (item.idx == LastOf(group_id, d_threads_group_ptr)) {
|
||||
if (item.w > 0) {
|
||||
s_d_auc[group_id] = item.predt / item.w;
|
||||
} else {
|
||||
s_d_auc[group_id] = 0;
|
||||
}
|
||||
}
|
||||
return {}; // discard
|
||||
});
|
||||
dh::InclusiveScan(
|
||||
in, out,
|
||||
[] __device__(RankScanItem const &l, RankScanItem const &r) {
|
||||
if (l.group_id != r.group_id) {
|
||||
return r;
|
||||
}
|
||||
return RankScanItem{r.idx, l.predt + r.predt, l.w + r.w, l.group_id};
|
||||
},
|
||||
n_threads);
|
||||
|
||||
/**
|
||||
* Scale the AUC with number of items in each group.
|
||||
*/
|
||||
float auc = thrust::reduce(thrust::cuda::par(alloc), dh::tbegin(s_d_auc),
|
||||
dh::tend(s_d_auc), 0.0f);
|
||||
return std::make_pair(auc, n_valid);
|
||||
}
|
||||
} // namespace metric
|
||||
} // namespace xgboost
|
||||
42
src/metric/auc.h
Normal file
42
src/metric/auc.h
Normal file
@@ -0,0 +1,42 @@
|
||||
/*!
|
||||
* Copyright 2021 by XGBoost Contributors
|
||||
*/
|
||||
#ifndef XGBOOST_METRIC_AUC_H_
|
||||
#define XGBOOST_METRIC_AUC_H_
|
||||
#include <cmath>
|
||||
#include <memory>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
|
||||
#include "rabit/rabit.h"
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/span.h"
|
||||
#include "xgboost/data.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace metric {
|
||||
XGBOOST_DEVICE inline float TrapesoidArea(float x0, float x1, float y0, float y1) {
|
||||
return std::abs(x0 - x1) * (y0 + y1) * 0.5f;
|
||||
}
|
||||
|
||||
struct DeviceAUCCache;
|
||||
|
||||
std::tuple<float, float, float>
|
||||
GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
|
||||
int32_t device, std::shared_ptr<DeviceAUCCache> *p_cache);
|
||||
|
||||
float GPUMultiClassAUCOVR(common::Span<float const> predts, MetaInfo const &info,
|
||||
int32_t device, std::shared_ptr<DeviceAUCCache>* cache);
|
||||
|
||||
std::pair<float, uint32_t>
|
||||
GPURankingAUC(common::Span<float const> predts, MetaInfo const &info,
|
||||
int32_t device, std::shared_ptr<DeviceAUCCache> *cache);
|
||||
|
||||
inline void InvalidGroupAUC() {
|
||||
LOG(INFO) << "Invalid group with less than 3 samples is found on worker "
|
||||
<< rabit::GetRank() << ". Calculating AUC value requires at "
|
||||
<< "least 2 pairs of samples.";
|
||||
}
|
||||
} // namespace metric
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_METRIC_AUC_H_
|
||||
@@ -156,134 +156,6 @@ struct EvalAMS : public Metric {
|
||||
float ratio_;
|
||||
};
|
||||
|
||||
/*! \brief Area Under Curve, for both classification and rank computed on CPU */
|
||||
struct EvalAuc : public Metric {
|
||||
private:
|
||||
// This is used to compute the AUC metrics on the GPU - for ranking tasks and
|
||||
// for training jobs that run on the GPU.
|
||||
std::unique_ptr<xgboost::Metric> auc_gpu_;
|
||||
|
||||
template <typename WeightPolicy>
|
||||
bst_float Eval(const HostDeviceVector<bst_float> &preds,
|
||||
const MetaInfo &info,
|
||||
bool distributed,
|
||||
const std::vector<unsigned> &gptr) {
|
||||
const auto ngroups = static_cast<bst_omp_uint>(gptr.size() - 1);
|
||||
// sum of all AUC's across all query groups
|
||||
double sum_auc = 0.0;
|
||||
int auc_error = 0;
|
||||
const auto& labels = info.labels_.ConstHostVector();
|
||||
const auto &h_preds = preds.ConstHostVector();
|
||||
|
||||
dmlc::OMPException exc;
|
||||
#pragma omp parallel reduction(+:sum_auc, auc_error) if (ngroups > 1)
|
||||
{
|
||||
exc.Run([&]() {
|
||||
// Each thread works on a distinct group and sorts the predictions in that group
|
||||
PredIndPairContainer rec;
|
||||
#pragma omp for schedule(static)
|
||||
for (bst_omp_uint group_id = 0; group_id < ngroups; ++group_id) {
|
||||
exc.Run([&]() {
|
||||
// Same thread can work on multiple groups one after another; hence, resize
|
||||
// the predictions array based on the current group
|
||||
rec.resize(gptr[group_id + 1] - gptr[group_id]);
|
||||
#pragma omp parallel for schedule(static) if (!omp_in_parallel())
|
||||
for (bst_omp_uint j = gptr[group_id]; j < gptr[group_id + 1]; ++j) {
|
||||
exc.Run([&]() {
|
||||
rec[j - gptr[group_id]] = {h_preds[j], j};
|
||||
});
|
||||
}
|
||||
|
||||
XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
|
||||
// calculate AUC
|
||||
double sum_pospair = 0.0;
|
||||
double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
|
||||
for (size_t j = 0; j < rec.size(); ++j) {
|
||||
const bst_float wt = WeightPolicy::GetWeightOfSortedRecord(info, rec, j, group_id);
|
||||
const bst_float ctr = labels[rec[j].second];
|
||||
// keep bucketing predictions in same bucket
|
||||
if (j != 0 && rec[j].first != rec[j - 1].first) {
|
||||
sum_pospair += buf_neg * (sum_npos + buf_pos * 0.5);
|
||||
sum_npos += buf_pos;
|
||||
sum_nneg += buf_neg;
|
||||
buf_neg = buf_pos = 0.0f;
|
||||
}
|
||||
buf_pos += ctr * wt;
|
||||
buf_neg += (1.0f - ctr) * wt;
|
||||
}
|
||||
sum_pospair += buf_neg * (sum_npos + buf_pos * 0.5);
|
||||
sum_npos += buf_pos;
|
||||
sum_nneg += buf_neg;
|
||||
// check weird conditions
|
||||
if (sum_npos <= 0.0 || sum_nneg <= 0.0) {
|
||||
auc_error += 1;
|
||||
} else {
|
||||
// this is the AUC
|
||||
sum_auc += sum_pospair / (sum_npos * sum_nneg);
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
exc.Rethrow();
|
||||
|
||||
// Report average AUC across all groups
|
||||
// In distributed mode, workers which only contains pos or neg samples
|
||||
// will be ignored when aggregate AUC.
|
||||
bst_float dat[2] = {0.0f, 0.0f};
|
||||
if (auc_error < static_cast<int>(ngroups)) {
|
||||
dat[0] = static_cast<bst_float>(sum_auc);
|
||||
dat[1] = static_cast<bst_float>(static_cast<int>(ngroups) - auc_error);
|
||||
}
|
||||
if (distributed) {
|
||||
rabit::Allreduce<rabit::op::Sum>(dat, 2);
|
||||
}
|
||||
CHECK_GT(dat[1], 0.0f)
|
||||
<< "AUC: the dataset only contains pos or neg samples";
|
||||
return dat[0] / dat[1];
|
||||
}
|
||||
|
||||
public:
|
||||
bst_float Eval(const HostDeviceVector<bst_float> &preds,
|
||||
const MetaInfo &info,
|
||||
bool distributed) override {
|
||||
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
|
||||
CHECK_EQ(preds.Size(), info.labels_.Size())
|
||||
<< "label size predict size not match";
|
||||
std::vector<unsigned> tgptr(2, 0);
|
||||
tgptr[1] = static_cast<unsigned>(info.labels_.Size());
|
||||
|
||||
const auto &gptr = info.group_ptr_.empty() ? tgptr : info.group_ptr_;
|
||||
CHECK_EQ(gptr.back(), info.labels_.Size())
|
||||
<< "EvalAuc: group structure must match number of prediction";
|
||||
|
||||
// For ranking task, weights are per-group
|
||||
// For binary classification task, weights are per-instance
|
||||
const bool is_ranking_task =
|
||||
!info.group_ptr_.empty() && info.weights_.Size() != info.num_row_;
|
||||
|
||||
// Check if we have a GPU assignment; else, revert back to CPU
|
||||
if (tparam_->gpu_id >= 0) {
|
||||
if (!auc_gpu_) {
|
||||
// Check and see if we have the GPU metric registered in the internal registry
|
||||
auc_gpu_.reset(GPUMetric::CreateGPUMetric(this->Name(), tparam_));
|
||||
}
|
||||
|
||||
if (auc_gpu_) {
|
||||
return auc_gpu_->Eval(preds, info, distributed);
|
||||
}
|
||||
}
|
||||
|
||||
if (is_ranking_task) {
|
||||
return Eval<PerGroupWeightPolicy>(preds, info, distributed, gptr);
|
||||
} else {
|
||||
return Eval<PerInstanceWeightPolicy>(preds, info, distributed, gptr);
|
||||
}
|
||||
}
|
||||
|
||||
const char *Name() const override { return "auc"; }
|
||||
};
|
||||
|
||||
/*! \brief Evaluate rank list */
|
||||
struct EvalRank : public Metric, public EvalRankConfig {
|
||||
private:
|
||||
@@ -672,10 +544,6 @@ XGBOOST_REGISTER_METRIC(AMS, "ams")
|
||||
.describe("AMS metric for higgs.")
|
||||
.set_body([](const char* param) { return new EvalAMS(param); });
|
||||
|
||||
XGBOOST_REGISTER_METRIC(Auc, "auc")
|
||||
.describe("Area under curve for both classification and rank.")
|
||||
.set_body([](const char*) { return new EvalAuc(); });
|
||||
|
||||
XGBOOST_REGISTER_METRIC(AucPR, "aucpr")
|
||||
.describe("Area under PR curve for both classification and rank.")
|
||||
.set_body([](const char*) { return new EvalAucPR(); });
|
||||
|
||||
@@ -274,237 +274,6 @@ struct EvalMAPGpu {
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief Area Under Curve metric computation for ranking datasets */
|
||||
struct EvalAucGpu : public Metric {
|
||||
public:
|
||||
// This function object computes the positive precision pair for each prediction group
|
||||
class ComputePosPair : public thrust::unary_function<uint32_t, double> {
|
||||
public:
|
||||
XGBOOST_DEVICE ComputePosPair(const double *pred_group_pos_precision,
|
||||
const double *pred_group_neg_precision,
|
||||
const double *pred_group_incr_precision)
|
||||
: pred_group_pos_precision_(pred_group_pos_precision),
|
||||
pred_group_neg_precision_(pred_group_neg_precision),
|
||||
pred_group_incr_precision_(pred_group_incr_precision) {}
|
||||
|
||||
// Compute positive precision pair for the prediction group at 'idx'
|
||||
__device__ __forceinline__ double operator()(uint32_t idx) const {
|
||||
return pred_group_neg_precision_[idx] *
|
||||
(pred_group_incr_precision_[idx] + pred_group_pos_precision_[idx] * 0.5);
|
||||
}
|
||||
|
||||
private:
|
||||
// Accumulated positive precision for the prediction group
|
||||
const double *pred_group_pos_precision_{nullptr};
|
||||
// Accumulated negative precision for the prediction group
|
||||
const double *pred_group_neg_precision_{nullptr};
|
||||
// Incremental positive precision for the prediction group
|
||||
const double *pred_group_incr_precision_{nullptr};
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void ReleaseMemory(dh::caching_device_vector<T> &vec) { // NOLINT
|
||||
dh::caching_device_vector<T>().swap(vec);
|
||||
}
|
||||
|
||||
bst_float Eval(const HostDeviceVector<bst_float> &preds,
|
||||
const MetaInfo &info,
|
||||
bool distributed) override {
|
||||
// Sanity check is done by the caller
|
||||
std::vector<unsigned> tgptr(2, 0);
|
||||
tgptr[1] = static_cast<unsigned>(info.labels_.Size());
|
||||
const std::vector<unsigned> &gptr = info.group_ptr_.empty() ? tgptr : info.group_ptr_;
|
||||
|
||||
auto device = tparam_->gpu_id;
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
|
||||
info.labels_.SetDevice(device);
|
||||
preds.SetDevice(device);
|
||||
info.weights_.SetDevice(device);
|
||||
|
||||
auto dpreds = preds.ConstDevicePointer();
|
||||
auto dlabels = info.labels_.ConstDevicePointer();
|
||||
auto dweights = info.weights_.ConstDevicePointer();
|
||||
|
||||
// Sort all the predictions (from one or more groups)
|
||||
dh::SegmentSorter<float> segment_pred_sorter;
|
||||
segment_pred_sorter.SortItems(dpreds, preds.Size(), gptr);
|
||||
|
||||
const auto &dsorted_preds = segment_pred_sorter.GetItemsSpan();
|
||||
const auto &dpreds_orig_pos = segment_pred_sorter.GetOriginalPositionsSpan();
|
||||
|
||||
// Group info on device
|
||||
const auto &dgroups = segment_pred_sorter.GetGroupsSpan();
|
||||
uint32_t ngroups = segment_pred_sorter.GetNumGroups();
|
||||
|
||||
// Final values
|
||||
double hsum_auc = 0.0;
|
||||
unsigned hauc_error = 0;
|
||||
|
||||
int device_id = -1;
|
||||
dh::safe_cuda(cudaGetDevice(&device_id));
|
||||
|
||||
// Allocator to be used for managing space overhead while performing reductions
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
|
||||
if (ngroups == 1) {
|
||||
const auto nitems = segment_pred_sorter.GetNumItems();
|
||||
|
||||
// First, segment all the predictions in the group. This is required so that we can
|
||||
// aggregate the positive and negative precisions within that prediction group
|
||||
dh::caching_device_vector<unsigned> dpred_segs(nitems, 0);
|
||||
auto *pred_seg_arr = dpred_segs.data().get();
|
||||
// This is for getting the next segment number
|
||||
dh::caching_device_vector<unsigned> seg_idx(1, 0);
|
||||
auto *seg_idx_ptr = seg_idx.data().get();
|
||||
|
||||
dh::caching_device_vector<double> dbuf_pos(nitems, 0);
|
||||
dh::caching_device_vector<double> dbuf_neg(nitems, 0);
|
||||
auto *buf_pos_arr = dbuf_pos.data().get();
|
||||
auto *buf_neg_arr = dbuf_neg.data().get();
|
||||
|
||||
dh::LaunchN(device_id, nitems, nullptr, [=] __device__(int idx) {
|
||||
auto ctr = dlabels[dpreds_orig_pos[idx]];
|
||||
// For ranking task, weights are per-group
|
||||
// For binary classification task, weights are per-instance
|
||||
const auto wt = dweights == nullptr ? 1.0f : dweights[dpreds_orig_pos[idx]];
|
||||
buf_pos_arr[idx] = ctr * wt;
|
||||
buf_neg_arr[idx] = (1.0f - ctr) * wt;
|
||||
if (idx == nitems - 1 || dsorted_preds[idx] != dsorted_preds[idx + 1]) {
|
||||
auto new_seg_idx = atomicAdd(seg_idx_ptr, 1);
|
||||
auto pred_val = dsorted_preds[idx];
|
||||
do {
|
||||
pred_seg_arr[idx] = new_seg_idx;
|
||||
idx--;
|
||||
} while (idx >= 0 && dsorted_preds[idx] == pred_val);
|
||||
}
|
||||
});
|
||||
|
||||
std::array<uint32_t, 1> h_nunique_preds;
|
||||
dh::safe_cuda(cudaMemcpyAsync(h_nunique_preds.data(),
|
||||
seg_idx.data().get() + seg_idx.size() - 1,
|
||||
sizeof(uint32_t), cudaMemcpyDeviceToHost));
|
||||
auto nunique_preds = h_nunique_preds.back();
|
||||
ReleaseMemory(seg_idx);
|
||||
|
||||
// Next, accumulate the positive and negative precisions for every prediction group
|
||||
dh::caching_device_vector<double> sum_dbuf_pos(nunique_preds, 0);
|
||||
auto itr = thrust::reduce_by_key(thrust::cuda::par(alloc),
|
||||
dpred_segs.begin(), dpred_segs.end(), // Segmented by this
|
||||
dbuf_pos.begin(), // Individual precisions
|
||||
thrust::make_discard_iterator(), // Ignore unique segments
|
||||
sum_dbuf_pos.begin()); // Write accumulated results here
|
||||
ReleaseMemory(dbuf_pos);
|
||||
CHECK(itr.second - sum_dbuf_pos.begin() == nunique_preds);
|
||||
|
||||
dh::caching_device_vector<double> sum_dbuf_neg(nunique_preds, 0);
|
||||
itr = thrust::reduce_by_key(thrust::cuda::par(alloc),
|
||||
dpred_segs.begin(), dpred_segs.end(),
|
||||
dbuf_neg.begin(),
|
||||
thrust::make_discard_iterator(),
|
||||
sum_dbuf_neg.begin());
|
||||
ReleaseMemory(dbuf_neg);
|
||||
ReleaseMemory(dpred_segs);
|
||||
CHECK(itr.second - sum_dbuf_neg.begin() == nunique_preds);
|
||||
|
||||
dh::caching_device_vector<double> sum_nneg(nunique_preds, 0);
|
||||
thrust::inclusive_scan(thrust::cuda::par(alloc),
|
||||
sum_dbuf_neg.begin(), sum_dbuf_neg.end(),
|
||||
sum_nneg.begin());
|
||||
double sum_neg_prec_val = sum_nneg.back();
|
||||
ReleaseMemory(sum_nneg);
|
||||
|
||||
// Find incremental sum for the positive precisions that is then used to
|
||||
// compute incremental positive precision pair
|
||||
dh::caching_device_vector<double> sum_npos(nunique_preds + 1, 0);
|
||||
thrust::inclusive_scan(thrust::cuda::par(alloc),
|
||||
sum_dbuf_pos.begin(), sum_dbuf_pos.end(),
|
||||
sum_npos.begin() + 1);
|
||||
double sum_pos_prec_val = sum_npos.back();
|
||||
|
||||
if (sum_pos_prec_val <= 0.0 || sum_neg_prec_val <= 0.0) {
|
||||
hauc_error = 1;
|
||||
} else {
|
||||
dh::caching_device_vector<double> sum_pospair(nunique_preds, 0);
|
||||
// Finally, compute the positive precision pair
|
||||
thrust::transform(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
|
||||
thrust::make_counting_iterator(static_cast<uint32_t>(nunique_preds)),
|
||||
sum_pospair.begin(),
|
||||
ComputePosPair(sum_dbuf_pos.data().get(),
|
||||
sum_dbuf_neg.data().get(),
|
||||
sum_npos.data().get()));
|
||||
ReleaseMemory(sum_dbuf_pos);
|
||||
ReleaseMemory(sum_dbuf_neg);
|
||||
ReleaseMemory(sum_npos);
|
||||
hsum_auc = thrust::reduce(thrust::cuda::par(alloc),
|
||||
sum_pospair.begin(), sum_pospair.end())
|
||||
/ (sum_pos_prec_val * sum_neg_prec_val);
|
||||
}
|
||||
} else {
|
||||
// AUC sum for each group
|
||||
dh::caching_device_vector<double> sum_auc(ngroups, 0);
|
||||
// AUC error across all groups
|
||||
dh::caching_device_vector<int> auc_error(1, 0);
|
||||
auto *dsum_auc = sum_auc.data().get();
|
||||
auto *dauc_error = auc_error.data().get();
|
||||
|
||||
// For each group item compute the aggregated precision
|
||||
dh::LaunchN<1, 32>(device_id, ngroups, nullptr, [=] __device__(uint32_t gidx) {
|
||||
double sum_pospair = 0.0, sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
|
||||
|
||||
for (auto i = dgroups[gidx]; i < dgroups[gidx + 1]; ++i) {
|
||||
const auto ctr = dlabels[dpreds_orig_pos[i]];
|
||||
// Keep bucketing predictions in same bucket
|
||||
if (i != dgroups[gidx] && dsorted_preds[i] != dsorted_preds[i - 1]) {
|
||||
sum_pospair += buf_neg * (sum_npos + buf_pos * 0.5);
|
||||
sum_npos += buf_pos;
|
||||
sum_nneg += buf_neg;
|
||||
buf_neg = buf_pos = 0.0f;
|
||||
}
|
||||
// For ranking task, weights are per-group
|
||||
// For binary classification task, weights are per-instance
|
||||
const auto wt = dweights == nullptr ? 1.0f : dweights[gidx];
|
||||
buf_pos += ctr * wt;
|
||||
buf_neg += (1.0f - ctr) * wt;
|
||||
}
|
||||
sum_pospair += buf_neg * (sum_npos + buf_pos * 0.5);
|
||||
sum_npos += buf_pos;
|
||||
sum_nneg += buf_neg;
|
||||
|
||||
// Check weird conditions
|
||||
if (sum_npos <= 0.0 || sum_nneg <= 0.0) {
|
||||
atomicAdd(dauc_error, 1);
|
||||
} else {
|
||||
// This is the AUC
|
||||
dsum_auc[gidx] = sum_pospair / (sum_npos * sum_nneg);
|
||||
}
|
||||
});
|
||||
|
||||
hsum_auc = thrust::reduce(thrust::cuda::par(alloc), sum_auc.begin(), sum_auc.end());
|
||||
hauc_error = auc_error.back(); // Copy it back to host
|
||||
}
|
||||
|
||||
// Report average AUC across all groups
|
||||
// In distributed mode, workers which only contains pos or neg samples
|
||||
// will be ignored when aggregate AUC.
|
||||
bst_float dat[2] = {0.0f, 0.0f};
|
||||
if (hauc_error < ngroups) {
|
||||
dat[0] = static_cast<bst_float>(hsum_auc);
|
||||
dat[1] = static_cast<bst_float>(ngroups - hauc_error);
|
||||
}
|
||||
if (distributed) {
|
||||
rabit::Allreduce<rabit::op::Sum>(dat, 2);
|
||||
}
|
||||
CHECK_GT(dat[1], 0.0f)
|
||||
<< "AUC: the dataset only contains pos or neg samples";
|
||||
return dat[0] / dat[1];
|
||||
}
|
||||
|
||||
const char* Name() const override {
|
||||
return "auc";
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief Area Under PR Curve metric computation for ranking datasets */
|
||||
struct EvalAucPRGpu : public Metric {
|
||||
public:
|
||||
@@ -691,10 +460,6 @@ struct EvalAucPRGpu : public Metric {
|
||||
}
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_GPU_METRIC(AucGpu, "auc")
|
||||
.describe("Area under curve for rank computed on GPU.")
|
||||
.set_body([](const char* param) { return new EvalAucGpu(); });
|
||||
|
||||
XGBOOST_REGISTER_GPU_METRIC(AucPRGpu, "aucpr")
|
||||
.describe("Area under PR curve for rank computed on GPU.")
|
||||
.set_body([](const char* param) { return new EvalAucPRGpu(); });
|
||||
|
||||
@@ -293,7 +293,7 @@ class NDCGLambdaWeightComputer
|
||||
group_segments)),
|
||||
thrust::make_discard_iterator(), // We don't care for the group indices
|
||||
dgroup_dcg_.begin()); // Sum of the item's DCG values in the group
|
||||
CHECK(static_cast<unsigned>(end_range.second - dgroup_dcg_.begin()) == dgroup_dcg_.size());
|
||||
CHECK_EQ(static_cast<unsigned>(end_range.second - dgroup_dcg_.begin()), dgroup_dcg_.size());
|
||||
}
|
||||
|
||||
inline const common::Span<const float> GetGroupDcgsSpan() const {
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
|
||||
#include "xgboost/parameter.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "../common/math.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
@@ -264,14 +265,11 @@ XGBOOST_DEVICE inline static T1 ThresholdL1(T1 w, T2 alpha) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
XGBOOST_DEVICE inline static T Sqr(T a) { return a * a; }
|
||||
|
||||
// calculate the cost of loss function
|
||||
template <typename TrainingParams, typename T>
|
||||
XGBOOST_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p,
|
||||
T sum_grad, T sum_hess, T w) {
|
||||
return -(T(2.0) * sum_grad * w + (sum_hess + p.reg_lambda) * Sqr(w));
|
||||
return -(T(2.0) * sum_grad * w + (sum_hess + p.reg_lambda) * common::Sqr(w));
|
||||
}
|
||||
|
||||
// calculate weight given the statistics
|
||||
@@ -296,9 +294,9 @@ XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess
|
||||
}
|
||||
if (p.max_delta_step == 0.0f) {
|
||||
if (p.reg_alpha == 0.0f) {
|
||||
return Sqr(sum_grad) / (sum_hess + p.reg_lambda);
|
||||
return common::Sqr(sum_grad) / (sum_hess + p.reg_lambda);
|
||||
} else {
|
||||
return Sqr(ThresholdL1(sum_grad, p.reg_alpha)) /
|
||||
return common::Sqr(ThresholdL1(sum_grad, p.reg_alpha)) /
|
||||
(sum_hess + p.reg_lambda);
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -114,7 +114,7 @@ class TreeEvaluator {
|
||||
}
|
||||
// Avoiding tree::CalcGainGivenWeight can significantly reduce avg floating point error.
|
||||
if (p.max_delta_step == 0.0f && has_constraint == false) {
|
||||
return Sqr(ThresholdL1(stats.sum_grad, p.reg_alpha)) /
|
||||
return common::Sqr(ThresholdL1(stats.sum_grad, p.reg_alpha)) /
|
||||
(stats.sum_hess + p.reg_lambda);
|
||||
}
|
||||
return tree::CalcGainGivenWeight<ParamT, float>(p, stats.sum_grad,
|
||||
|
||||
Reference in New Issue
Block a user