diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h
index d56170aec..901c9ae91 100644
--- a/include/xgboost/linalg.h
+++ b/include/xgboost/linalg.h
@@ -603,13 +603,13 @@ auto MakeTensorView(Context const *ctx, Order order, common::Span<T> data, S &&.
 
 template <typename T, typename... S>
 auto MakeTensorView(Context const *ctx, HostDeviceVector<T> *data, S &&...shape) {
-  auto span = ctx->IsCPU() ? data->HostSpan() : data->DeviceSpan();
+  auto span = ctx->IsCUDA() ? data->DeviceSpan() : data->HostSpan();
   return MakeTensorView(ctx->Device(), span, std::forward<S>(shape)...);
 }
 
 template <typename T, typename... S>
 auto MakeTensorView(Context const *ctx, HostDeviceVector<T> const *data, S &&...shape) {
-  auto span = ctx->IsCPU() ? data->ConstHostSpan() : data->ConstDeviceSpan();
+  auto span = ctx->IsCUDA() ? data->ConstDeviceSpan() : data->ConstHostSpan();
   return MakeTensorView(ctx->Device(), span, std::forward<S>(shape)...);
 }
 
diff --git a/src/common/linalg_op.cuh b/src/common/linalg_op.cuh
index 5d52e4100..361f926e9 100644
--- a/src/common/linalg_op.cuh
+++ b/src/common/linalg_op.cuh
@@ -42,7 +42,7 @@ void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_
 
 template <typename T, std::int32_t D, typename Fn>
 void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn) {
-  ctx->IsCPU() ? ElementWiseKernelHost(t, ctx->Threads(), fn) : ElementWiseKernelDevice(t, fn);
+  ctx->IsCUDA() ? ElementWiseKernelDevice(t, fn) : ElementWiseKernelHost(t, ctx->Threads(), fn);
 }
 }  // namespace linalg
 }  // namespace xgboost
diff --git a/src/common/linalg_op.h b/src/common/linalg_op.h
index f55927402..d89e5a736 100644
--- a/src/common/linalg_op.h
+++ b/src/common/linalg_op.h
@@ -55,7 +55,7 @@ void ElementWiseTransformDevice(linalg::TensorView<T, D>, Fn&&, void* = nullptr)
 
 template <typename T, std::int32_t D, typename Fn>
 void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn) {
-  if (!ctx->IsCPU()) {
+  if (ctx->IsCUDA()) {
     common::AssertGPUSupport();
   }
   ElementWiseKernelHost(t, ctx->Threads(), fn);
diff --git a/src/common/numeric.cc b/src/common/numeric.cc
index 240e0234a..f19932311 100644
--- a/src/common/numeric.cc
+++ b/src/common/numeric.cc
@@ -11,13 +11,14 @@
 namespace xgboost {
 namespace common {
 double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
-  if (ctx->IsCPU()) {
+  if (ctx->IsCUDA()) {
+    return cuda_impl::Reduce(ctx, values);
+  } else {
     auto const& h_values = values.ConstHostVector();
     auto result = cpu_impl::Reduce(ctx, h_values.cbegin(), h_values.cend(), 0.0);
     static_assert(std::is_same<decltype(result), double>::value);
     return result;
   }
-  return cuda_impl::Reduce(ctx, values);
 }
 }  // namespace common
 }  // namespace xgboost
diff --git a/src/common/optional_weight.h b/src/common/optional_weight.h
index 997f3fad1..bbfd365c8 100644
--- a/src/common/optional_weight.h
+++ b/src/common/optional_weight.h
@@ -26,7 +26,7 @@ inline OptionalWeights MakeOptionalWeights(Context const* ctx,
                                            HostDeviceVector<float> const& weights) {
   if (ctx->IsCUDA()) {
     weights.SetDevice(ctx->Device());
   }
-  return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()};
+  return OptionalWeights{ctx->IsCUDA() ? weights.ConstDeviceSpan() : weights.ConstHostSpan()};
 }
 }  // namespace xgboost::common
 #endif  // XGBOOST_COMMON_OPTIONAL_WEIGHT_H_
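Note on the recurring pattern: every hunk in this patch flips a `ctx->IsCPU()` test into a `ctx->IsCUDA()` test and swaps the branch bodies, so CUDA becomes the explicitly requested special case and everything else, including any future non-CUDA device type, falls through to the host path. A minimal sketch of the idiom, using the XGBoost names above (the free function `SelectSpan` itself is hypothetical):

```cpp
// Hypothetical helper distilling the dispatch idiom used throughout this
// patch: take the device span only when the context is CUDA; any other
// device type defaults to the host path.
template <typename T>
auto SelectSpan(Context const* ctx, HostDeviceVector<T>* data) {
  return ctx->IsCUDA() ? data->DeviceSpan() : data->HostSpan();
}
```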
diff --git a/src/common/ranking_utils.h b/src/common/ranking_utils.h
index 31531a597..e6b87ed4b 100644
--- a/src/common/ranking_utils.h
+++ b/src/common/ranking_utils.h
@@ -197,10 +197,10 @@ class RankingCache {
       CHECK_EQ(info.group_ptr_.back(), info.labels.Size())
           << error::GroupSize() << "the size of label.";
     }
-    if (ctx->IsCPU()) {
-      this->InitOnCPU(ctx, info);
-    } else {
+    if (ctx->IsCUDA()) {
       this->InitOnCUDA(ctx, info);
+    } else {
+      this->InitOnCPU(ctx, info);
     }
     if (!info.weights_.Empty()) {
       CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight();
@@ -218,7 +218,7 @@ class RankingCache {
   // Constructed as [1, n_samples] if group ptr is not supplied by the user
   common::Span<bst_group_t const> DataGroupPtr(Context const* ctx) const {
     group_ptr_.SetDevice(ctx->Device());
-    return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan();
+    return ctx->IsCUDA() ? group_ptr_.ConstDeviceSpan() : group_ptr_.ConstHostSpan();
   }
 
   [[nodiscard]] auto const& Param() const { return param_; }
@@ -231,10 +231,10 @@ class RankingCache {
       sorted_idx_cache_.SetDevice(ctx->Device());
       sorted_idx_cache_.Resize(predt.size());
     }
-    if (ctx->IsCPU()) {
-      return this->MakeRankOnCPU(ctx, predt);
-    } else {
+    if (ctx->IsCUDA()) {
       return this->MakeRankOnCUDA(ctx, predt);
+    } else {
+      return this->MakeRankOnCPU(ctx, predt);
     }
   }
   // The function simply returns a uninitialized buffer as this is only used by the
@@ -307,10 +307,10 @@ class NDCGCache : public RankingCache {
  public:
   NDCGCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
       : RankingCache{ctx, info, p} {
-    if (ctx->IsCPU()) {
-      this->InitOnCPU(ctx, info);
-    } else {
+    if (ctx->IsCUDA()) {
       this->InitOnCUDA(ctx, info);
+    } else {
+      this->InitOnCPU(ctx, info);
     }
   }
 
@@ -318,7 +318,7 @@ class NDCGCache : public RankingCache {
     return inv_idcg_.View(ctx->Device());
   }
   common::Span<double const> Discount(Context const* ctx) const {
-    return ctx->IsCPU() ? discounts_.ConstHostSpan() : discounts_.ConstDeviceSpan();
+    return ctx->IsCUDA() ? discounts_.ConstDeviceSpan() : discounts_.ConstHostSpan();
   }
   linalg::VectorView<double> Dcg(Context const* ctx) {
     if (dcg_.Size() == 0) {
@@ -387,10 +387,10 @@ class PreCache : public RankingCache {
  public:
   PreCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
       : RankingCache{ctx, info, p} {
-    if (ctx->IsCPU()) {
-      this->InitOnCPU(ctx, info);
-    } else {
+    if (ctx->IsCUDA()) {
       this->InitOnCUDA(ctx, info);
+    } else {
+      this->InitOnCPU(ctx, info);
     }
   }
 
@@ -399,7 +399,7 @@ class PreCache : public RankingCache {
       pre_.SetDevice(ctx->Device());
       pre_.Resize(this->Groups());
     }
-    return ctx->IsCPU() ? pre_.HostSpan() : pre_.DeviceSpan();
+    return ctx->IsCUDA() ? pre_.DeviceSpan() : pre_.HostSpan();
   }
 };
 
@@ -418,10 +418,10 @@ class MAPCache : public RankingCache {
  public:
   MAPCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
       : RankingCache{ctx, info, p}, n_samples_{static_cast<std::size_t>(info.num_row_)} {
-    if (ctx->IsCPU()) {
-      this->InitOnCPU(ctx, info);
-    } else {
+    if (ctx->IsCUDA()) {
       this->InitOnCUDA(ctx, info);
+    } else {
+      this->InitOnCPU(ctx, info);
     }
   }
 
@@ -430,21 +430,21 @@ class MAPCache : public RankingCache {
       n_rel_.SetDevice(ctx->Device());
       n_rel_.Resize(n_samples_);
     }
-    return ctx->IsCPU() ? n_rel_.HostSpan() : n_rel_.DeviceSpan();
+    return ctx->IsCUDA() ? n_rel_.DeviceSpan() : n_rel_.HostSpan();
   }
   common::Span<double> Acc(Context const* ctx) {
     if (acc_.Empty()) {
       acc_.SetDevice(ctx->Device());
       acc_.Resize(n_samples_);
     }
-    return ctx->IsCPU() ? acc_.HostSpan() : acc_.DeviceSpan();
+    return ctx->IsCUDA() ? acc_.DeviceSpan() : acc_.HostSpan();
   }
   common::Span<double> Map(Context const* ctx) {
     if (map_.Empty()) {
       map_.SetDevice(ctx->Device());
       map_.Resize(this->Groups());
     }
-    return ctx->IsCPU() ? map_.HostSpan() : map_.DeviceSpan();
+    return ctx->IsCUDA() ? map_.DeviceSpan() : map_.HostSpan();
   }
 };
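The accessors above (`NumRelevant`, `Acc`, `Map`, and `Pre`) all follow the same lazily allocated buffer pattern: bind the buffer to the context's device on first use, then hand out a host or device span depending on the context. A condensed sketch under those assumptions (the free function `LazyBuffer` is hypothetical; the types are XGBoost's):

```cpp
// Sketch of the lazily allocated cache-buffer pattern from RankingCache and
// its subclasses: allocate on first request, then dispatch on IsCUDA().
common::Span<double> LazyBuffer(Context const* ctx, HostDeviceVector<double>* buf,
                                std::size_t n) {
  if (buf->Empty()) {
    buf->SetDevice(ctx->Device());  // bind the buffer to the active device
    buf->Resize(n);                 // allocation happens on that device
  }
  return ctx->IsCUDA() ? buf->DeviceSpan() : buf->HostSpan();
}
```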
diff --git a/src/common/stats.cc b/src/common/stats.cc
index aa73a07c3..bbf969fcc 100644
--- a/src/common/stats.cc
+++ b/src/common/stats.cc
@@ -49,7 +49,9 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
   out->SetDevice(ctx->Device());
   out->Reshape(1);
 
-  if (ctx->IsCPU()) {
+  if (ctx->IsCUDA()) {
+    cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
+  } else {
     auto h_v = v.HostView();
     float n = v.Size();
     MemStackAllocator<float, DefaultMaxThreads()> tloc(ctx->Threads(), 0.0f);
@@ -57,8 +59,6 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
     ParallelFor(v.Size(), ctx->Threads(),
                 [&](auto i) { tloc[omp_get_thread_num()] += h_v(i) / n; });
     auto ret = std::accumulate(tloc.cbegin(), tloc.cend(), 0.0f);
     out->HostView()(0) = ret;
-  } else {
-    cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
   }
 }
 }  // namespace xgboost::common
diff --git a/src/learner.cc b/src/learner.cc
index d2b297dca..08c59ba60 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -278,7 +278,7 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
   std::swap(base_score_, base_margin);
   // Make sure read access everywhere for thread-safe prediction.
   std::as_const(base_score_).HostView();
-  if (!ctx->IsCPU()) {
+  if (ctx->IsCUDA()) {
     std::as_const(base_score_).View(ctx->Device());
   }
   CHECK(std::as_const(base_score_).Data()->HostCanRead());
@@ -287,7 +287,7 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
 linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(DeviceOrd device) const {
   // multi-class is not yet supported.
   CHECK_EQ(base_score_.Size(), 1) << ModelNotFitted();
-  if (device.IsCPU()) {
+  if (!device.IsCUDA()) {
     // Make sure that we won't run into race condition.
     CHECK(base_score_.Data()->HostCanRead());
     return base_score_.HostView();
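For reference, the CPU branch of `Mean` kept by the stats.cc hunk reduces per-thread partial sums of `v(i) / n`. A standalone sketch of the same computation, with plain `std::thread` standing in for XGBoost's `ParallelFor` and `MemStackAllocator` (all names here are illustrative, not XGBoost API):

```cpp
#include <cstddef>
#include <numeric>
#include <thread>
#include <vector>

// Each worker accumulates v[i] / n into its own slot to avoid contention;
// the slots are then combined sequentially, mirroring the tloc reduction.
double MeanSketch(std::vector<float> const& v, unsigned n_threads) {
  std::vector<double> tloc(n_threads, 0.0);
  std::vector<std::thread> workers;
  double n = static_cast<double>(v.size());
  for (unsigned t = 0; t < n_threads; ++t) {
    workers.emplace_back([&, t] {
      for (std::size_t i = t; i < v.size(); i += n_threads) {
        tloc[t] += v[i] / n;
      }
    });
  }
  for (auto& w : workers) w.join();
  return std::accumulate(tloc.cbegin(), tloc.cend(), 0.0);
}
```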
diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu
index e16f9f8cc..feabedfab 100644
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -46,7 +46,26 @@ template <typename Fn>
 PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
   PackedReduceResult result;
   auto labels = info.labels.View(ctx->Device());
-  if (ctx->IsCPU()) {
+  if (ctx->IsCUDA()) {
+#if defined(XGBOOST_USE_CUDA)
+    dh::XGBCachingDeviceAllocator<char> alloc;
+    thrust::counting_iterator<size_t> begin(0);
+    thrust::counting_iterator<size_t> end = begin + labels.Size();
+    result = thrust::transform_reduce(
+        thrust::cuda::par(alloc), begin, end,
+        [=] XGBOOST_DEVICE(size_t i) {
+          auto idx = linalg::UnravelIndex(i, labels.Shape());
+          auto sample_id = std::get<0>(idx);
+          auto target_id = std::get<1>(idx);
+          auto res = loss(i, sample_id, target_id);
+          float v{std::get<0>(res)}, wt{std::get<1>(res)};
+          return PackedReduceResult{v, wt};
+        },
+        PackedReduceResult{}, thrust::plus<PackedReduceResult>());
+#else
+    common::AssertGPUSupport();
+#endif  // defined(XGBOOST_USE_CUDA)
+  } else {
     auto n_threads = ctx->Threads();
     std::vector<double> score_tloc(n_threads, 0.0);
     std::vector<double> weight_tloc(n_threads, 0.0);
@@ -69,25 +88,6 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
     double residue_sum = std::accumulate(score_tloc.cbegin(), score_tloc.cend(), 0.0);
     double weights_sum = std::accumulate(weight_tloc.cbegin(), weight_tloc.cend(), 0.0);
     result = PackedReduceResult{residue_sum, weights_sum};
-  } else {
-#if defined(XGBOOST_USE_CUDA)
-    dh::XGBCachingDeviceAllocator<char> alloc;
-    thrust::counting_iterator<size_t> begin(0);
-    thrust::counting_iterator<size_t> end = begin + labels.Size();
-    result = thrust::transform_reduce(
-        thrust::cuda::par(alloc), begin, end,
-        [=] XGBOOST_DEVICE(size_t i) {
-          auto idx = linalg::UnravelIndex(i, labels.Shape());
-          auto sample_id = std::get<0>(idx);
-          auto target_id = std::get<1>(idx);
-          auto res = loss(i, sample_id, target_id);
-          float v{std::get<0>(res)}, wt{std::get<1>(res)};
-          return PackedReduceResult{v, wt};
-        },
-        PackedReduceResult{}, thrust::plus<PackedReduceResult>());
-#else
-    common::AssertGPUSupport();
-#endif  // defined(XGBOOST_USE_CUDA)
   }
   return result;
 }
@@ -185,10 +185,10 @@ class PseudoErrorLoss : public MetricNoCache {
     CHECK_EQ(info.labels.Shape(0), info.num_row_);
     auto labels = info.labels.View(ctx_->Device());
     preds.SetDevice(ctx_->Device());
-    auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
+    auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan();
     info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
-                                                  : info.weights_.ConstDeviceSpan());
+    common::OptionalWeights weights(ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
+                                                   : info.weights_.ConstHostSpan());
     float slope = this->param_.huber_slope;
     CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0.";
     PackedReduceResult result =
@@ -351,10 +351,10 @@ struct EvalEWiseBase : public MetricNoCache {
     }
     auto labels = info.labels.View(ctx_->Device());
     info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
-                                                  : info.weights_.ConstDeviceSpan());
+    common::OptionalWeights weights(ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
+                                                   : info.weights_.ConstHostSpan());
     preds.SetDevice(ctx_->Device());
-    auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
+    auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan();
 
     auto d_policy = policy_;
     auto result =
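The relocated CUDA branch folds per-element `(loss, weight)` pairs with `thrust::transform_reduce`; the CPU branch performs the same reduction with thread-local partials. The shape of that reduction in portable C++17, with `std::pair` standing in for `PackedReduceResult` and a caller-supplied per-element `loss` (an illustrative analogue, not code from the patch):

```cpp
#include <cstddef>
#include <numeric>
#include <utility>
#include <vector>

using Packed = std::pair<double, double>;  // (residue_sum, weights_sum)

// Transform each index into a (loss, weight) pair, then reduce by pairwise
// addition -- the same structure as the thrust::transform_reduce call above.
template <typename Loss>
Packed ReduceSketch(std::size_t n, Loss&& loss) {
  std::vector<std::size_t> idx(n);
  std::iota(idx.begin(), idx.end(), std::size_t{0});
  return std::transform_reduce(
      idx.cbegin(), idx.cend(), Packed{0.0, 0.0},
      [](Packed a, Packed b) { return Packed{a.first + b.first, a.second + b.second}; },
      [&](std::size_t i) { return loss(i); });  // loss(i) -> (value, weight)
}
```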
diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h
index c49c5b6fb..a64f37f63 100644
--- a/src/objective/adaptive.h
+++ b/src/objective/adaptive.h
@@ -96,13 +96,13 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
 inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> const& position,
                            std::int32_t group_idx, MetaInfo const& info, float learning_rate,
                            HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
-  if (ctx->IsCPU()) {
-    detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
-                               predt, alpha, p_tree);
-  } else {
+  if (ctx->IsCUDA()) {
     position.SetDevice(ctx->Device());
     detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate,
                                  predt, alpha, p_tree);
+  } else {
+    detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
+                               predt, alpha, p_tree);
   }
 }
 }  // namespace obj
diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc
index bd41aebb6..0c9d1262a 100644
--- a/src/objective/lambdarank_obj.cc
+++ b/src/objective/lambdarank_obj.cc
@@ -108,14 +108,14 @@ class LambdaRankObj : public FitIntercept {
     li_.SetDevice(ctx_->Device());
     lj_.SetDevice(ctx_->Device());
 
-    if (ctx_->IsCPU()) {
-      cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
-                                             lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
-                                             &li_, &lj_, p_cache_);
-    } else {
+    if (ctx_->IsCUDA()) {
       cuda_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
                                               lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
                                               &li_, &lj_, p_cache_);
+    } else {
+      cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
+                                             lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
+                                             &li_, &lj_, p_cache_);
     }
 
     li_full_.Data()->Fill(0.0);
diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu
index d8571ed17..57f432c7f 100644
--- a/src/objective/quantile_obj.cu
+++ b/src/objective/quantile_obj.cu
@@ -71,15 +71,15 @@ class QuantileRegression : public ObjFunction {
     auto gpair = out_gpair->View(ctx_->Device());
 
     info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
-                                                 : info.weights_.ConstDeviceSpan()};
+    common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
+                                                  : info.weights_.ConstHostSpan()};
 
     preds.SetDevice(ctx_->Device());
     auto predt = linalg::MakeVec(&preds);
     auto n_samples = info.num_row_;
 
     alpha_.SetDevice(ctx_->Device());
-    auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
+    auto alpha = ctx_->IsCUDA() ? alpha_.ConstDeviceSpan() : alpha_.ConstHostSpan();
 
     linalg::ElementWiseKernel(
         ctx_, gpair, [=] XGBOOST_DEVICE(std::size_t i, GradientPair const&) mutable {
@@ -107,27 +107,7 @@ class QuantileRegression : public ObjFunction {
     base_score->Reshape(n_targets);
 
     double sw{0};
-    if (ctx_->IsCPU()) {
-      auto quantiles = base_score->HostView();
-      auto h_weights = info.weights_.ConstHostVector();
-      if (info.weights_.Empty()) {
-        sw = info.num_row_;
-      } else {
-        sw = std::accumulate(std::cbegin(h_weights), std::cend(h_weights), 0.0);
-      }
-      for (bst_target_t t{0}; t < n_targets; ++t) {
-        auto alpha = param_.quantile_alpha[t];
-        auto h_labels = info.labels.HostView();
-        if (h_weights.empty()) {
-          quantiles(t) =
-              common::Quantile(ctx_, alpha, linalg::cbegin(h_labels), linalg::cend(h_labels));
-        } else {
-          CHECK_EQ(h_weights.size(), h_labels.Size());
-          quantiles(t) = common::WeightedQuantile(ctx_, alpha, linalg::cbegin(h_labels),
-                                                  linalg::cend(h_labels), std::cbegin(h_weights));
-        }
-      }
-    } else {
+    if (ctx_->IsCUDA()) {
 #if defined(XGBOOST_USE_CUDA)
       alpha_.SetDevice(ctx_->Device());
       auto d_alpha = alpha_.ConstDeviceSpan();
@@ -164,6 +144,26 @@ class QuantileRegression : public ObjFunction {
 #else
       common::AssertGPUSupport();
 #endif  // defined(XGBOOST_USE_CUDA)
+    } else {
+      auto quantiles = base_score->HostView();
+      auto h_weights = info.weights_.ConstHostVector();
+      if (info.weights_.Empty()) {
+        sw = info.num_row_;
+      } else {
+        sw = std::accumulate(std::cbegin(h_weights), std::cend(h_weights), 0.0);
+      }
+      for (bst_target_t t{0}; t < n_targets; ++t) {
+        auto alpha = param_.quantile_alpha[t];
+        auto h_labels = info.labels.HostView();
+        if (h_weights.empty()) {
+          quantiles(t) =
+              common::Quantile(ctx_, alpha, linalg::cbegin(h_labels), linalg::cend(h_labels));
+        } else {
+          CHECK_EQ(h_weights.size(), h_labels.Size());
+          quantiles(t) = common::WeightedQuantile(ctx_, alpha, linalg::cbegin(h_labels),
+                                                  linalg::cend(h_labels), std::cbegin(h_weights));
+        }
+      }
     }
 
     // For multiple quantiles, we should extend the base score to a vector instead of
diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu
index cf774db93..7f498c5f1 100644
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -254,8 +254,8 @@ class PseudoHuberRegression : public FitIntercept {
     auto predt = linalg::MakeVec(&preds);
 
     info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
-                                                 : info.weights_.ConstDeviceSpan()};
+    common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
+                                                  : info.weights_.ConstHostSpan()};
 
     linalg::ElementWiseKernel(ctx_, labels, [=] XGBOOST_DEVICE(size_t i, float const y) mutable {
       auto sample_id = std::get<0>(linalg::UnravelIndex(i, labels.Shape()));
@@ -714,8 +714,8 @@ class MeanAbsoluteError : public ObjFunction {
     preds.SetDevice(ctx_->Device());
     auto predt = linalg::MakeVec(&preds);
     info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
-                                                 : info.weights_.ConstDeviceSpan()};
+    common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
+                                                  : info.weights_.ConstHostSpan()};
 
     linalg::ElementWiseKernel(ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, float y) mutable {
       auto sign = [](auto x) {
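The block moved into the `else` branch of the quantile objective computes the base score as a (weighted) quantile of the labels on the host. The core of a weighted quantile is short enough to sketch: sort by label value and walk the cumulative weight until it reaches `alpha` times the total. XGBoost's `common::WeightedQuantile` additionally interpolates between neighbours; this simplified version does not:

```cpp
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Simplified weighted quantile: the smallest label whose cumulative weight
// reaches alpha * total_weight. Assumes non-empty, equally sized inputs.
double WeightedQuantileSketch(double alpha, std::vector<double> const& labels,
                              std::vector<double> const& weights) {
  std::vector<std::size_t> order(labels.size());
  std::iota(order.begin(), order.end(), std::size_t{0});
  std::sort(order.begin(), order.end(),
            [&](std::size_t l, std::size_t r) { return labels[l] < labels[r]; });
  double total = std::accumulate(weights.cbegin(), weights.cend(), 0.0);
  double cum = 0.0;
  for (std::size_t i : order) {
    cum += weights[i];
    if (cum >= alpha * total) {
      return labels[i];
    }
  }
  return labels[order.back()];
}
```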
diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc
index b80293fae..21a050536 100644
--- a/src/tree/fit_stump.cc
+++ b/src/tree/fit_stump.cc
@@ -72,7 +72,7 @@ void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientP
   out->SetDevice(ctx->Device());
 
   auto gpair_t = gpair.View(ctx->Device());
-  ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
-               : cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()));
+  ctx->IsCUDA() ? cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()))
+                : cpu_impl::FitStump(ctx, info, gpair_t, out->HostView());
 }
 }  // namespace xgboost::tree
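Taken together, the patch converges on one convention: test `IsCUDA()` to select the device path, keep the host path as the default branch, and guard CUDA-only code so that CPU-only builds fail at runtime with a clear message rather than silently taking the wrong branch. A hypothetical summary (`DispatchExample` is not in the patch; the guard and `common::AssertGPUSupport` are the ones used above):

```cpp
// Hypothetical summary of the dispatch convention adopted by this patch.
void DispatchExample(Context const* ctx) {
  if (ctx->IsCUDA()) {
#if defined(XGBOOST_USE_CUDA)
    // ... CUDA implementation ...
#else
    common::AssertGPUSupport();  // CPU-only build asked for a CUDA context
#endif  // defined(XGBOOST_USE_CUDA)
  } else {
    // ... host implementation: the default for any non-CUDA device ...
  }
}
```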