Use the new DeviceOrd in the linalg module. (#9527)

This commit is contained in:
Jiaming Yuan
2023-08-29 13:37:29 +08:00
committed by GitHub
parent 942b957eef
commit ddf2e68821
43 changed files with 252 additions and 273 deletions

View File

@@ -82,22 +82,19 @@ template <typename BinaryAUC>
double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaInfo const &info,
size_t n_classes, int32_t n_threads, BinaryAUC &&binary_auc) {
CHECK_NE(n_classes, 0);
auto const labels = info.labels.View(Context::kCpuId);
auto const labels = info.labels.HostView();
if (labels.Shape(0) != 0) {
CHECK_EQ(labels.Shape(1), 1) << "AUC doesn't support multi-target model.";
}
std::vector<double> results_storage(n_classes * 3, 0);
linalg::TensorView<double, 2> results(results_storage, {n_classes, static_cast<size_t>(3)},
Context::kCpuId);
auto results = linalg::MakeTensorView(ctx, results_storage, n_classes, 3);
auto local_area = results.Slice(linalg::All(), 0);
auto tp = results.Slice(linalg::All(), 1);
auto auc = results.Slice(linalg::All(), 2);
auto weights = common::OptionalWeights{info.weights_.ConstHostSpan()};
auto predts_t = linalg::TensorView<float const, 2>(
predts, {static_cast<size_t>(info.num_row_), n_classes},
Context::kCpuId);
auto predts_t = linalg::MakeTensorView(ctx, predts, info.num_row_, n_classes);
if (info.labels.Size() != 0) {
common::ParallelFor(n_classes, n_threads, [&](auto c) {
@@ -108,8 +105,8 @@ double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaI
response[i] = labels(i) == c ? 1.0f : 0.0;
}
double fp;
std::tie(fp, tp(c), auc(c)) =
binary_auc(ctx, proba, linalg::MakeVec(response.data(), response.size(), -1), weights);
std::tie(fp, tp(c), auc(c)) = binary_auc(
ctx, proba, linalg::MakeVec(response.data(), response.size(), ctx->Device()), weights);
local_area(c) = fp * tp(c);
});
}
@@ -220,7 +217,7 @@ std::pair<double, uint32_t> RankingAUC(Context const *ctx, std::vector<float> co
CHECK_GE(info.group_ptr_.size(), 2);
uint32_t n_groups = info.group_ptr_.size() - 1;
auto s_predts = common::Span<float const>{predts};
auto labels = info.labels.View(Context::kCpuId);
auto labels = info.labels.View(ctx->Device());
auto s_weights = info.weights_.ConstHostSpan();
std::atomic<uint32_t> invalid_groups{0};
@@ -363,8 +360,8 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
} else {
std::tie(fp, tp, auc) = GPUBinaryROCAUC(predts.ConstDeviceSpan(), info,
ctx_->gpu_id, &this->d_cache_);
std::tie(fp, tp, auc) =
GPUBinaryROCAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_);
}
return std::make_tuple(fp, tp, auc);
}
@@ -381,8 +378,7 @@ XGBOOST_REGISTER_METRIC(EvalAUC, "auc")
#if !defined(XGBOOST_USE_CUDA)
// Fallback definition of GPUBinaryROCAUC, compiled only when XGBOOST_USE_CUDA is not
// defined. All parameters are unnamed and unused; the body calls
// common::AssertGPUSupport() and then returns a value-initialized tuple.
// NOTE(review): this is a diff view, so the parameter list appears twice below — the
// `std::int32_t` device argument is the pre-change form and `DeviceOrd` is the
// post-change form. Only one of the two exists in either revision of the file.
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const>, MetaInfo const &,
std::int32_t,
std::shared_ptr<DeviceAUCCache> *) {
DeviceOrd, std::shared_ptr<DeviceAUCCache> *) {
// Presumably reports that the library was built without GPU support — confirm in common.h.
common::AssertGPUSupport();
// Value-initialized result; only reached if the call above returns.
return {};
}
@@ -414,8 +410,8 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
} else {
std::tie(pr, re, auc) = GPUBinaryPRAUC(predts.ConstDeviceSpan(), info,
ctx_->gpu_id, &this->d_cache_);
std::tie(pr, re, auc) =
GPUBinaryPRAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_);
}
return std::make_tuple(pr, re, auc);
}
@@ -459,7 +455,7 @@ XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")
#if !defined(XGBOOST_USE_CUDA)
// Fallback definition of GPUBinaryPRAUC, compiled only when XGBOOST_USE_CUDA is not
// defined. All parameters are unnamed and unused; the body calls
// common::AssertGPUSupport() and then returns a value-initialized tuple.
// NOTE(review): this is a diff view, so the trailing parameters appear twice below — the
// `std::int32_t` line is the pre-change form and the `DeviceOrd` line is the
// post-change form. Only one of the two exists in either revision of the file.
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const>, MetaInfo const &,
std::int32_t, std::shared_ptr<DeviceAUCCache> *) {
DeviceOrd, std::shared_ptr<DeviceAUCCache> *) {
// Presumably reports that the library was built without GPU support — confirm in common.h.
common::AssertGPUSupport();
// Value-initialized result; only reached if the call above returns.
return {};
}

View File

@@ -85,11 +85,11 @@ void InitCacheOnce(common::Span<float const> predts, std::shared_ptr<DeviceAUCCa
template <typename Fn>
std::tuple<double, double, double>
GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
int32_t device, common::Span<size_t const> d_sorted_idx,
DeviceOrd device, common::Span<size_t const> d_sorted_idx,
Fn area_fn, std::shared_ptr<DeviceAUCCache> cache) {
auto labels = info.labels.View(device);
auto weights = info.weights_.ConstDeviceSpan();
dh::safe_cuda(cudaSetDevice(device));
dh::safe_cuda(cudaSetDevice(device.ordinal));
CHECK_NE(labels.Size(), 0);
CHECK_EQ(labels.Size(), predts.size());
@@ -168,7 +168,7 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
}
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const> predts,
MetaInfo const &info, std::int32_t device,
MetaInfo const &info, DeviceOrd device,
std::shared_ptr<DeviceAUCCache> *p_cache) {
auto &cache = *p_cache;
InitCacheOnce<false>(predts, p_cache);
@@ -309,9 +309,10 @@ void SegmentedReduceAUC(common::Span<size_t const> d_unique_idx,
* up each class in all kernels.
*/
template <bool scale, typename Fn>
double GPUMultiClassAUCOVR(MetaInfo const &info, int32_t device, common::Span<uint32_t> d_class_ptr,
size_t n_classes, std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
dh::safe_cuda(cudaSetDevice(device));
double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
common::Span<uint32_t> d_class_ptr, size_t n_classes,
std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
/**
* Sorted idx
*/
@@ -467,11 +468,12 @@ double GPUMultiClassROCAUC(Context const *ctx, common::Span<float const> predts,
dh::TemporaryArray<uint32_t> class_ptr(n_classes + 1, 0);
MultiClassSortedIdx(ctx, predts, dh::ToSpan(class_ptr), cache);
auto fn = [] XGBOOST_DEVICE(double fp_prev, double fp, double tp_prev,
double tp, size_t /*class_id*/) {
auto fn = [] XGBOOST_DEVICE(double fp_prev, double fp, double tp_prev, double tp,
size_t /*class_id*/) {
return TrapezoidArea(fp_prev, fp, tp_prev, tp);
};
return GPUMultiClassAUCOVR<true>(info, ctx->gpu_id, dh::ToSpan(class_ptr), n_classes, cache, fn);
return GPUMultiClassAUCOVR<true>(info, ctx->Device(), dh::ToSpan(class_ptr), n_classes, cache,
fn);
}
namespace {
@@ -512,7 +514,7 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
/**
* Sort the labels
*/
auto d_labels = info.labels.View(ctx->gpu_id);
auto d_labels = info.labels.View(ctx->Device());
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
common::SegmentedArgSort<false, false>(ctx, d_labels.Values(), d_group_ptr, d_sorted_idx);
@@ -604,7 +606,7 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
}
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> predts,
MetaInfo const &info, std::int32_t device,
MetaInfo const &info, DeviceOrd device,
std::shared_ptr<DeviceAUCCache> *p_cache) {
auto& cache = *p_cache;
InitCacheOnce<false>(predts, p_cache);
@@ -662,7 +664,7 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
/**
* Get total positive/negative
*/
auto labels = info.labels.View(ctx->gpu_id);
auto labels = info.labels.View(ctx->Device());
auto n_samples = info.num_row_;
dh::caching_device_vector<Pair> totals(n_classes);
auto key_it =
@@ -695,13 +697,13 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp,
d_totals[class_id].first);
};
return GPUMultiClassAUCOVR<false>(info, ctx->gpu_id, d_class_ptr, n_classes, cache, fn);
return GPUMultiClassAUCOVR<false>(info, ctx->Device(), d_class_ptr, n_classes, cache, fn);
}
template <typename Fn>
std::pair<double, uint32_t>
GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
common::Span<uint32_t> d_group_ptr, int32_t device,
common::Span<uint32_t> d_group_ptr, DeviceOrd device,
std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
/**
* Sorted idx
@@ -843,7 +845,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
common::SegmentedArgSort<false, false>(ctx, predts, d_group_ptr, d_sorted_idx);
dh::XGBDeviceAllocator<char> alloc;
auto labels = info.labels.View(ctx->gpu_id);
auto labels = info.labels.View(ctx->Device());
if (thrust::any_of(thrust::cuda::par(alloc), dh::tbegin(labels.Values()),
dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
InvalidLabels();
@@ -882,7 +884,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp,
d_totals[group_id].first);
};
return GPURankingPRAUCImpl(predts, info, d_group_ptr, ctx->gpu_id, cache, fn);
return GPURankingPRAUCImpl(predts, info, d_group_ptr, ctx->Device(), cache, fn);
}
} // namespace metric
} // namespace xgboost

View File

@@ -30,7 +30,7 @@ XGBOOST_DEVICE inline double TrapezoidArea(double x0, double x1, double y0, doub
struct DeviceAUCCache;
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const> predts,
MetaInfo const &info, std::int32_t device,
MetaInfo const &info, DeviceOrd,
std::shared_ptr<DeviceAUCCache> *p_cache);
double GPUMultiClassROCAUC(Context const *ctx, common::Span<float const> predts,
@@ -45,7 +45,7 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
* PR AUC *
**********/
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> predts,
MetaInfo const &info, std::int32_t device,
MetaInfo const &info, DeviceOrd,
std::shared_ptr<DeviceAUCCache> *p_cache);
double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,

View File

@@ -45,7 +45,7 @@ namespace {
template <typename Fn>
PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
PackedReduceResult result;
auto labels = info.labels.View(ctx->gpu_id);
auto labels = info.labels.View(ctx->Device());
if (ctx->IsCPU()) {
auto n_threads = ctx->Threads();
std::vector<double> score_tloc(n_threads, 0.0);
@@ -183,10 +183,10 @@ class PseudoErrorLoss : public MetricNoCache {
double Eval(const HostDeviceVector<bst_float>& preds, const MetaInfo& info) override {
CHECK_EQ(info.labels.Shape(0), info.num_row_);
auto labels = info.labels.View(ctx_->gpu_id);
preds.SetDevice(ctx_->gpu_id);
auto labels = info.labels.View(ctx_->Device());
preds.SetDevice(ctx_->Device());
auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
info.weights_.SetDevice(ctx_->gpu_id);
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan());
float slope = this->param_.huber_slope;
@@ -349,11 +349,11 @@ struct EvalEWiseBase : public MetricNoCache {
if (info.labels.Size() != 0) {
CHECK_NE(info.labels.Shape(1), 0);
}
auto labels = info.labels.View(ctx_->gpu_id);
info.weights_.SetDevice(ctx_->gpu_id);
auto labels = info.labels.View(ctx_->Device());
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan());
preds.SetDevice(ctx_->gpu_id);
preds.SetDevice(ctx_->Device());
auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
auto d_policy = policy_;
@@ -444,16 +444,16 @@ class QuantileError : public MetricNoCache {
}
auto const* ctx = ctx_;
auto y_true = info.labels.View(ctx->gpu_id);
preds.SetDevice(ctx->gpu_id);
alpha_.SetDevice(ctx->gpu_id);
auto y_true = info.labels.View(ctx->Device());
preds.SetDevice(ctx->Device());
alpha_.SetDevice(ctx->Device());
auto alpha = ctx->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
std::size_t n_targets = preds.Size() / info.num_row_ / alpha_.Size();
CHECK_NE(n_targets, 0);
auto y_predt = linalg::MakeTensorView(ctx, &preds, static_cast<std::size_t>(info.num_row_),
alpha_.Size(), n_targets);
info.weights_.SetDevice(ctx->gpu_id);
info.weights_.SetDevice(ctx->Device());
common::OptionalWeights weight{ctx->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()};

View File

@@ -75,7 +75,7 @@ struct EvalAMS : public MetricNoCache {
const double br = 10.0;
unsigned thresindex = 0;
double s_tp = 0.0, b_fp = 0.0, tams = 0.0;
const auto& labels = info.labels.View(Context::kCpuId);
const auto& labels = info.labels.View(DeviceOrd::CPU());
for (unsigned i = 0; i < static_cast<unsigned>(ndata-1) && i < ntop; ++i) {
const unsigned ridx = rec[i].second;
const bst_float wt = info.GetWeight(ridx);
@@ -134,7 +134,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig {
std::vector<double> sum_tloc(ctx_->Threads(), 0.0);
{
const auto& labels = info.labels.View(Context::kCpuId);
const auto& labels = info.labels.HostView();
const auto &h_preds = preds.ConstHostVector();
dmlc::OMPException exc;

View File

@@ -33,7 +33,7 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt,
std::shared_ptr<ltr::PreCache> p_cache) {
auto d_gptr = p_cache->DataGroupPtr(ctx);
auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
@@ -89,7 +89,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
if (!d_weight.Empty()) {
CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
}
auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());
@@ -119,9 +119,9 @@ PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt, bool minus,
std::shared_ptr<ltr::MAPCache> p_cache) {
auto d_group_ptr = p_cache->DataGroupPtr(ctx);
auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
predt.SetDevice(ctx->Device());
auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
auto key_it = dh::MakeTransformIterator<std::size_t>(
thrust::make_counting_iterator(0ul),