Implement column sampler in CUDA. (#9785)
- CUDA implementation.
- Extract the broadcasting logic; we will need the context parameter after revamping the collective implementation.
- Some changes to the event loop to fix a deadlock in CI.
- Move argsort into algorithms.cuh and add support for CUDA streams.
This commit is contained in:
@@ -360,7 +360,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
|
||||
common::OptionalWeights{info.weights_.ConstHostSpan()});
|
||||
} else {
|
||||
std::tie(fp, tp, auc) =
|
||||
GPUBinaryROCAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_);
|
||||
GPUBinaryROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_);
|
||||
}
|
||||
return std::make_tuple(fp, tp, auc);
|
||||
}
|
||||
@@ -376,8 +376,9 @@ XGBOOST_REGISTER_METRIC(EvalAUC, "auc")
|
||||
.set_body([](const char*) { return new EvalROCAUC(); });
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const>, MetaInfo const &,
|
||||
DeviceOrd, std::shared_ptr<DeviceAUCCache> *) {
|
||||
std::tuple<double, double, double> GPUBinaryROCAUC(Context const *, common::Span<float const>,
|
||||
MetaInfo const &,
|
||||
std::shared_ptr<DeviceAUCCache> *) {
|
||||
common::AssertGPUSupport();
|
||||
return {};
|
||||
}
|
||||
@@ -409,8 +410,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
|
||||
BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
|
||||
common::OptionalWeights{info.weights_.ConstHostSpan()});
|
||||
} else {
|
||||
std::tie(pr, re, auc) =
|
||||
GPUBinaryPRAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_);
|
||||
std::tie(pr, re, auc) = GPUBinaryPRAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_);
|
||||
}
|
||||
return std::make_tuple(pr, re, auc);
|
||||
}
|
||||
@@ -453,8 +453,9 @@ XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")
|
||||
.set_body([](char const *) { return new EvalPRAUC{}; });
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const>, MetaInfo const &,
|
||||
DeviceOrd, std::shared_ptr<DeviceAUCCache> *) {
|
||||
std::tuple<double, double, double> GPUBinaryPRAUC(Context const *, common::Span<float const>,
|
||||
MetaInfo const &,
|
||||
std::shared_ptr<DeviceAUCCache> *) {
|
||||
common::AssertGPUSupport();
|
||||
return {};
|
||||
}
|
||||
|
||||
@@ -83,13 +83,14 @@ void InitCacheOnce(common::Span<float const> predts, std::shared_ptr<DeviceAUCCa
|
||||
* - Reduce the scan array into 1 AUC value.
|
||||
*/
|
||||
template <typename Fn>
|
||||
std::tuple<double, double, double>
|
||||
GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
|
||||
DeviceOrd device, common::Span<size_t const> d_sorted_idx,
|
||||
Fn area_fn, std::shared_ptr<DeviceAUCCache> cache) {
|
||||
auto labels = info.labels.View(device);
|
||||
std::tuple<double, double, double> GPUBinaryAUC(Context const *ctx,
|
||||
common::Span<float const> predts,
|
||||
MetaInfo const &info,
|
||||
common::Span<size_t const> d_sorted_idx, Fn area_fn,
|
||||
std::shared_ptr<DeviceAUCCache> cache) {
|
||||
auto labels = info.labels.View(ctx->Device());
|
||||
auto weights = info.weights_.ConstDeviceSpan();
|
||||
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
|
||||
|
||||
CHECK_NE(labels.Size(), 0);
|
||||
CHECK_EQ(labels.Size(), predts.size());
|
||||
@@ -115,7 +116,7 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
|
||||
|
||||
dh::XGBDeviceAllocator<char> alloc;
|
||||
auto d_unique_idx = dh::ToSpan(cache->unique_idx);
|
||||
dh::Iota(d_unique_idx);
|
||||
dh::Iota(d_unique_idx, ctx->CUDACtx()->Stream());
|
||||
|
||||
auto uni_key = dh::MakeTransformIterator<float>(
|
||||
thrust::make_counting_iterator(0),
|
||||
@@ -167,8 +168,9 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
|
||||
return std::make_tuple(last.first, last.second, auc);
|
||||
}
|
||||
|
||||
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const> predts,
|
||||
MetaInfo const &info, DeviceOrd device,
|
||||
std::tuple<double, double, double> GPUBinaryROCAUC(Context const *ctx,
|
||||
common::Span<float const> predts,
|
||||
MetaInfo const &info,
|
||||
std::shared_ptr<DeviceAUCCache> *p_cache) {
|
||||
auto &cache = *p_cache;
|
||||
InitCacheOnce<false>(predts, p_cache);
|
||||
@@ -177,10 +179,10 @@ std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const> pre
|
||||
* Create sorted index for each class
|
||||
*/
|
||||
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
|
||||
dh::ArgSort<false>(predts, d_sorted_idx);
|
||||
common::ArgSort<false>(ctx, predts, d_sorted_idx);
|
||||
// Create lambda to avoid pass function pointer.
|
||||
return GPUBinaryAUC(
|
||||
predts, info, device, d_sorted_idx,
|
||||
ctx, predts, info, d_sorted_idx,
|
||||
[] XGBOOST_DEVICE(double x0, double x1, double y0, double y1) -> double {
|
||||
return TrapezoidArea(x0, x1, y0, y1);
|
||||
},
|
||||
@@ -361,7 +363,7 @@ double GPUMultiClassAUCOVR(Context const *ctx, MetaInfo const &info,
|
||||
*/
|
||||
dh::XGBDeviceAllocator<char> alloc;
|
||||
auto d_unique_idx = dh::ToSpan(cache->unique_idx);
|
||||
dh::Iota(d_unique_idx);
|
||||
dh::Iota(d_unique_idx, ctx->CUDACtx()->Stream());
|
||||
auto uni_key = dh::MakeTransformIterator<thrust::pair<uint32_t, float>>(
|
||||
thrust::make_counting_iterator(0), [=] XGBOOST_DEVICE(size_t i) {
|
||||
uint32_t class_id = i / n_samples;
|
||||
@@ -603,8 +605,9 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
|
||||
return std::make_pair(auc, n_valid);
|
||||
}
|
||||
|
||||
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> predts,
|
||||
MetaInfo const &info, DeviceOrd device,
|
||||
std::tuple<double, double, double> GPUBinaryPRAUC(Context const *ctx,
|
||||
common::Span<float const> predts,
|
||||
MetaInfo const &info,
|
||||
std::shared_ptr<DeviceAUCCache> *p_cache) {
|
||||
auto& cache = *p_cache;
|
||||
InitCacheOnce<false>(predts, p_cache);
|
||||
@@ -613,9 +616,9 @@ std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> pred
|
||||
* Create sorted index for each class
|
||||
*/
|
||||
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
|
||||
dh::ArgSort<false>(predts, d_sorted_idx);
|
||||
common::ArgSort<false>(ctx, predts, d_sorted_idx);
|
||||
|
||||
auto labels = info.labels.View(device);
|
||||
auto labels = info.labels.View(ctx->Device());
|
||||
auto d_weights = info.weights_.ConstDeviceSpan();
|
||||
auto get_weight = common::OptionalWeights{d_weights};
|
||||
auto it = dh::MakeTransformIterator<Pair>(
|
||||
@@ -639,7 +642,7 @@ std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> pred
|
||||
return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp, total_pos);
|
||||
};
|
||||
double fp, tp, auc;
|
||||
std::tie(fp, tp, auc) = GPUBinaryAUC(predts, info, device, d_sorted_idx, fn, cache);
|
||||
std::tie(fp, tp, auc) = GPUBinaryAUC(ctx, predts, info, d_sorted_idx, fn, cache);
|
||||
return std::make_tuple(1.0, 1.0, auc);
|
||||
}
|
||||
|
||||
@@ -699,16 +702,17 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
|
||||
}
|
||||
|
||||
template <typename Fn>
|
||||
std::pair<double, uint32_t>
|
||||
GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
|
||||
common::Span<uint32_t> d_group_ptr, DeviceOrd device,
|
||||
std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
|
||||
std::pair<double, uint32_t> GPURankingPRAUCImpl(Context const *ctx,
|
||||
common::Span<float const> predts,
|
||||
MetaInfo const &info,
|
||||
common::Span<uint32_t> d_group_ptr,
|
||||
std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
|
||||
/**
|
||||
* Sorted idx
|
||||
*/
|
||||
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
|
||||
|
||||
auto labels = info.labels.View(device);
|
||||
auto labels = info.labels.View(ctx->Device());
|
||||
auto weights = info.weights_.ConstDeviceSpan();
|
||||
|
||||
uint32_t n_groups = static_cast<uint32_t>(info.group_ptr_.size() - 1);
|
||||
@@ -739,7 +743,7 @@ GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
|
||||
*/
|
||||
dh::XGBDeviceAllocator<char> alloc;
|
||||
auto d_unique_idx = dh::ToSpan(cache->unique_idx);
|
||||
dh::Iota(d_unique_idx);
|
||||
dh::Iota(d_unique_idx, ctx->CUDACtx()->Stream());
|
||||
auto uni_key = dh::MakeTransformIterator<thrust::pair<uint32_t, float>>(
|
||||
thrust::make_counting_iterator(0), [=] XGBOOST_DEVICE(size_t i) {
|
||||
auto idx = d_sorted_idx[i];
|
||||
@@ -882,7 +886,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
|
||||
return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp,
|
||||
d_totals[group_id].first);
|
||||
};
|
||||
return GPURankingPRAUCImpl(predts, info, d_group_ptr, ctx->Device(), cache, fn);
|
||||
return GPURankingPRAUCImpl(ctx, predts, info, d_group_ptr, cache, fn);
|
||||
}
|
||||
} // namespace metric
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2021 by XGBoost Contributors
|
||||
/**
|
||||
* Copyright 2021-2023, XGBoost Contributors
|
||||
*/
|
||||
#ifndef XGBOOST_METRIC_AUC_H_
|
||||
#define XGBOOST_METRIC_AUC_H_
|
||||
@@ -18,8 +18,7 @@
|
||||
#include "xgboost/metric.h"
|
||||
#include "xgboost/span.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace metric {
|
||||
namespace xgboost::metric {
|
||||
/***********
|
||||
* ROC AUC *
|
||||
***********/
|
||||
@@ -29,8 +28,9 @@ XGBOOST_DEVICE inline double TrapezoidArea(double x0, double x1, double y0, doub
|
||||
|
||||
struct DeviceAUCCache;
|
||||
|
||||
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const> predts,
|
||||
MetaInfo const &info, DeviceOrd,
|
||||
std::tuple<double, double, double> GPUBinaryROCAUC(Context const *ctx,
|
||||
common::Span<float const> predts,
|
||||
MetaInfo const &info,
|
||||
std::shared_ptr<DeviceAUCCache> *p_cache);
|
||||
|
||||
double GPUMultiClassROCAUC(Context const *ctx, common::Span<float const> predts,
|
||||
@@ -44,8 +44,9 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
|
||||
/**********
|
||||
* PR AUC *
|
||||
**********/
|
||||
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> predts,
|
||||
MetaInfo const &info, DeviceOrd,
|
||||
std::tuple<double, double, double> GPUBinaryPRAUC(Context const *ctx,
|
||||
common::Span<float const> predts,
|
||||
MetaInfo const &info,
|
||||
std::shared_ptr<DeviceAUCCache> *p_cache);
|
||||
|
||||
double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
|
||||
@@ -111,6 +112,5 @@ struct PRAUCLabelInvalid {
|
||||
inline void InvalidLabels() {
|
||||
LOG(FATAL) << "PR-AUC supports only binary relevance for learning to rank.";
|
||||
}
|
||||
} // namespace metric
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::metric
|
||||
#endif // XGBOOST_METRIC_AUC_H_
|
||||
|
||||
Reference in New Issue
Block a user