diff --git a/include/xgboost/base.h b/include/xgboost/base.h
index ba2ea7886..d12e71a3a 100644
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@@ -48,21 +48,6 @@
 #define XGBOOST_ALIGNAS(X)
 #endif  // defined(__GNUC__) && ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ > 4)
 
-#if defined(__GNUC__) && ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ > 4) && \
-    !defined(__CUDACC__) && !defined(__sun) && !defined(sun)
-#include <parallel/algorithm>
-#define XGBOOST_PARALLEL_SORT(X, Y, Z) __gnu_parallel::sort((X), (Y), (Z))
-#define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) \
-  __gnu_parallel::stable_sort((X), (Y), (Z))
-#elif defined(_MSC_VER) && (!__INTEL_COMPILER)
-#include <ppl.h>
-#define XGBOOST_PARALLEL_SORT(X, Y, Z) concurrency::parallel_sort((X), (Y), (Z))
-#define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) std::stable_sort((X), (Y), (Z))
-#else
-#define XGBOOST_PARALLEL_SORT(X, Y, Z) std::sort((X), (Y), (Z))
-#define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) std::stable_sort((X), (Y), (Z))
-#endif  // GLIBC VERSION
-
 #if defined(__GNUC__)
 #define XGBOOST_EXPECT(cond, ret) __builtin_expect((cond), (ret))
 #else
diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index 9411fcfab..bcace4656 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -124,18 +124,7 @@ class MetaInfo {
     return weights_.Size() != 0 ? weights_.HostVector()[i] : 1.0f;
   }
   /*! \brief get sorted indexes (argsort) of labels by absolute value (used by cox loss) */
-  inline const std::vector<size_t>& LabelAbsSort() const {
-    if (label_order_cache_.size() == labels.Size()) {
-      return label_order_cache_;
-    }
-    label_order_cache_.resize(labels.Size());
-    std::iota(label_order_cache_.begin(), label_order_cache_.end(), 0);
-    const auto& l = labels.Data()->HostVector();
-    XGBOOST_PARALLEL_STABLE_SORT(label_order_cache_.begin(), label_order_cache_.end(),
-                                 [&l](size_t i1, size_t i2) {return std::abs(l[i1]) < std::abs(l[i2]);});
-
-    return label_order_cache_;
-  }
+  const std::vector<size_t>& LabelAbsSort(Context const* ctx) const;
   /*! \brief clear all the information */
   void Clear();
   /*!
diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h
index 2f84bb1cb..489e5e565 100644
--- a/include/xgboost/linalg.h
+++ b/include/xgboost/linalg.h
@@ -23,6 +23,10 @@
 #include <utility>
 #include <vector>
 
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif  // defined(_MSC_VER)
+
 // decouple it from xgboost.
 #ifndef LINALG_HD
 #if defined(__CUDA__) || defined(__NVCC__)
diff --git a/src/common/algorithm.h b/src/common/algorithm.h
index a5d2d1974..937b6b638 100644
--- a/src/common/algorithm.h
+++ b/src/common/algorithm.h
@@ -1,10 +1,31 @@
-/*!
- * Copyright 2022 by XGBoost Contributors
+/**
+ * Copyright 2022-2023 by XGBoost Contributors
  */
 #ifndef XGBOOST_COMMON_ALGORITHM_H_
 #define XGBOOST_COMMON_ALGORITHM_H_
-#include <algorithm>  // std::upper_bound
-#include <cstddef>    // std::size_t
+#include <algorithm>   // upper_bound, stable_sort, sort, max
+#include <cstddef>     // size_t
+#include <functional>  // less
+#include <iterator>    // iterator_traits, distance
+#include <vector>      // vector
+
+#include "numeric.h"          // Iota
+#include "xgboost/context.h"  // Context
+
+// clang with libstdc++ works as well
+#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__sun) && !defined(sun) && !defined(__APPLE__)
+#define GCC_HAS_PARALLEL 1
+#endif  // GLIC_VERSION
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#define MSVC_HAS_PARALLEL 1
+#endif  // MSC
+
+#if defined(GCC_HAS_PARALLEL)
+#include <parallel/algorithm>
+#elif defined(MSVC_HAS_PARALLEL)
+#include <ppl.h>
+#endif  // GLIBC VERSION
 
 namespace xgboost {
 namespace common {
@@ -13,6 +34,63 @@
 auto SegmentId(It first, It last, Idx idx) {
   std::size_t segment_id = std::upper_bound(first, last, idx) - 1 - first;
   return segment_id;
 }
+
+template <typename Iter, typename Comp>
+void StableSort(Context const *ctx, Iter begin, Iter end, Comp &&comp) {
+  if (ctx->Threads() > 1) {
+#if defined(GCC_HAS_PARALLEL)
+    __gnu_parallel::stable_sort(begin, end, comp,
+                                __gnu_parallel::default_parallel_tag(ctx->Threads()));
+#else
+    // the only stable sort is radix sort for msvc ppl.
+    std::stable_sort(begin, end, comp);
+#endif  // GLIBC VERSION
+  } else {
+    std::stable_sort(begin, end, comp);
+  }
+}
+
+template <typename Iter, typename Comp>
+void Sort(Context const *ctx, Iter begin, Iter end, Comp comp) {
+  if (ctx->Threads() > 1) {
+#if defined(GCC_HAS_PARALLEL)
+    __gnu_parallel::sort(begin, end, comp, __gnu_parallel::default_parallel_tag(ctx->Threads()));
+#elif defined(MSVC_HAS_PARALLEL)
+    auto n = std::distance(begin, end);
+    // use chunk size as hint to number of threads. No local policy/scheduler input with the
+    // concurrency module.
+    std::size_t chunk_size = n / ctx->Threads();
+    // 2048 is the default of msvc ppl as of v2022.
+    chunk_size = std::max(chunk_size, static_cast<std::size_t>(2048));
+    concurrency::parallel_sort(begin, end, comp, chunk_size);
+#else
+    std::sort(begin, end, comp);
+#endif  // GLIBC VERSION
+  } else {
+    std::sort(begin, end, comp);
+  }
+}
+
+template <typename Idx, typename Iter, typename V = typename std::iterator_traits<Iter>::value_type,
+          typename Comp = std::less<V>>
+std::vector<Idx> ArgSort(Context const *ctx, Iter begin, Iter end, Comp comp = std::less<V>{}) {
+  CHECK(ctx->IsCPU());
+  auto n = std::distance(begin, end);
+  std::vector<Idx> result(n);
+  Iota(ctx, result.begin(), result.end(), 0);
+  auto op = [&](Idx const &l, Idx const &r) { return comp(begin[l], begin[r]); };
+  StableSort(ctx, result.begin(), result.end(), op);
+  return result;
+}
 }  // namespace common
 }  // namespace xgboost
+
+#if defined(GCC_HAS_PARALLEL)
+#undef GCC_HAS_PARALLEL
+#endif  // defined(GCC_HAS_PARALLEL)
+
+#if defined(MSVC_HAS_PARALLEL)
+#undef MSVC_HAS_PARALLEL
+#endif  // defined(MSVC_HAS_PARALLEL)
+
 #endif  // XGBOOST_COMMON_ALGORITHM_H_
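Not part of the patch: a minimal usage sketch for the new context-aware helpers in src/common/algorithm.h, assuming the signatures as reconstructed above. The include paths and the function name Example are illustrative only.

    #include <cstddef>     // std::size_t
    #include <functional>  // std::less
    #include <vector>

    #include "src/common/algorithm.h"  // xgboost::common::Sort, StableSort, ArgSort
    #include "xgboost/base.h"          // xgboost::Args
    #include "xgboost/context.h"       // xgboost::Context

    void Example() {
      xgboost::Context ctx;
      ctx.Init(xgboost::Args{{"nthread", "4"}});  // Threads() decides whether the parallel path is used.
      std::vector<float> values{3.0f, 1.0f, 2.0f};
      // In-place sort; dispatches to __gnu_parallel::sort or concurrency::parallel_sort when available.
      xgboost::common::Sort(&ctx, values.begin(), values.end(), std::less<>{});
      // Stable argsort: indices that order `values` in ascending order.
      auto idx = xgboost::common::ArgSort<std::size_t>(&ctx, values.cbegin(), values.cend());
    }

The explicit std::size_t index type mirrors the updated call sites further down in the patch.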
diff --git a/src/common/common.h b/src/common/common.h
index 5ac764817..6bc96dfc9 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -188,17 +188,6 @@ inline void SetDevice(std::int32_t device) {
 }
 #endif
 
-template <typename Idx, typename Container,
-          typename V = typename Container::value_type,
-          typename Comp = std::less<V>>
-std::vector<Idx> ArgSort(Container const &array, Comp comp = std::less<V>{}) {
-  std::vector<Idx> result(array.size());
-  std::iota(result.begin(), result.end(), 0);
-  auto op = [&array, comp](Idx const &l, Idx const &r) { return comp(array[l], array[r]); };
-  XGBOOST_PARALLEL_STABLE_SORT(result.begin(), result.end(), op);
-  return result;
-}
-
 /**
  * Last index of a group in a CSR style of index pointer.
  */
diff --git a/src/common/random.cc b/src/common/random.cc
index f66b084cc..d0e75729d 100644
--- a/src/common/random.cc
+++ b/src/common/random.cc
@@ -24,8 +24,9 @@ std::shared_ptr<HostDeviceVector<bst_feature_t>> ColumnSampler::ColSample(
     for (size_t i = 0; i < h_features.size(); ++i) {
       weights[i] = feature_weights_[h_features[i]];
     }
+    CHECK(ctx_);
     new_features.HostVector() =
-        WeightedSamplingWithoutReplacement(p_features->HostVector(), weights, n);
+        WeightedSamplingWithoutReplacement(ctx_, p_features->HostVector(), weights, n);
   } else {
     new_features.Resize(features.size());
     std::copy(features.begin(), features.end(), new_features.HostVector().begin());
diff --git a/src/common/random.h b/src/common/random.h
index 2d29bede3..5efdb486d 100644
--- a/src/common/random.h
+++ b/src/common/random.h
@@ -20,7 +20,9 @@
 #include <vector>
 
 #include "../collective/communicator-inl.h"
+#include "algorithm.h"  // ArgSort
 #include "common.h"
+#include "xgboost/context.h"  // Context
 #include "xgboost/host_device_vector.h"
 
 namespace xgboost {
@@ -87,8 +89,8 @@ GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
  * https://timvieira.github.io/blog/post/2019/09/16/algorithms-for-sampling-without-replacement/
  */
 template <typename T>
-std::vector<T> WeightedSamplingWithoutReplacement(
-    std::vector<T> const &array, std::vector<float> const &weights, size_t n) {
+std::vector<T> WeightedSamplingWithoutReplacement(Context const* ctx, std::vector<T> const& array,
+                                                  std::vector<float> const& weights, size_t n) {
   // ES sampling.
   CHECK_EQ(array.size(), weights.size());
   std::vector<float> keys(weights.size());
@@ -100,7 +102,7 @@ std::vector<T> WeightedSamplingWithoutReplacement(
     auto k = std::log(u) / w;
     keys[i] = k;
   }
-  auto ind = ArgSort<size_t>(Span{keys}, std::greater<>{});
+  auto ind = ArgSort<size_t>(ctx, keys.data(), keys.data() + keys.size(), std::greater<>{});
   ind.resize(n);
 
   std::vector<T> results(ind.size());
@@ -126,6 +128,7 @@ class ColumnSampler {
   float colsample_bytree_{1.0f};
   float colsample_bynode_{1.0f};
   GlobalRandomEngine rng_;
+  Context const* ctx_;
 
  public:
   std::shared_ptr<HostDeviceVector<bst_feature_t>> ColSample(
@@ -157,12 +160,13 @@ class ColumnSampler {
    * \param colsample_bytree
    * \param skip_index_0 (Optional) True to skip index 0.
    */
-  void Init(int64_t num_col, std::vector<float> feature_weights, float colsample_bynode,
-            float colsample_bylevel, float colsample_bytree) {
+  void Init(Context const* ctx, int64_t num_col, std::vector<float> feature_weights,
+            float colsample_bynode, float colsample_bylevel, float colsample_bytree) {
     feature_weights_ = std::move(feature_weights);
     colsample_bylevel_ = colsample_bylevel;
     colsample_bytree_ = colsample_bytree;
     colsample_bynode_ = colsample_bynode;
+    ctx_ = ctx;
 
     if (feature_set_tree_ == nullptr) {
       feature_set_tree_ = std::make_shared<HostDeviceVector<bst_feature_t>>();
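For background (also not part of the patch): WeightedSamplingWithoutReplacement keeps the exponential-keys scheme from the Efraimidis-Spirakis family of algorithms linked in the comment above; only the argsort it relies on now goes through the context-aware helper. A self-contained sketch of the same idea, assuming strictly positive weights; the names below are illustrative, not xgboost APIs.

    #include <algorithm>  // std::min, std::stable_sort
    #include <cmath>      // std::log
    #include <cstddef>    // std::size_t
    #include <cstdint>    // std::uint64_t
    #include <numeric>    // std::iota
    #include <random>
    #include <vector>

    // Draw u ~ U(0, 1) per item and rank items by log(u) / w, which orders them the same
    // way as u^(1/w).  Keeping the n largest keys yields a weighted sample without
    // replacement: heavier weights tend to produce larger keys.
    std::vector<std::size_t> SampleWithoutReplacement(std::vector<float> const& weights,
                                                      std::size_t n, std::uint64_t seed) {
      std::mt19937_64 rng{seed};
      std::uniform_real_distribution<double> unif{0.0, 1.0};
      std::vector<double> keys(weights.size());
      for (std::size_t i = 0; i < weights.size(); ++i) {
        keys[i] = std::log(unif(rng)) / weights[i];
      }
      std::vector<std::size_t> idx(weights.size());
      std::iota(idx.begin(), idx.end(), 0);
      // Descending keys, mirroring ArgSort(..., std::greater<>{}) in random.h.
      std::stable_sort(idx.begin(), idx.end(),
                       [&](std::size_t l, std::size_t r) { return keys[l] > keys[r]; });
      idx.resize(std::min(n, idx.size()));
      return idx;
    }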
diff --git a/src/data/data.cc b/src/data/data.cc
index a935220e5..585b2f252 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -10,12 +10,13 @@
 #include
 
 #include "../collective/communicator-inl.h"
+#include "../common/algorithm.h"  // StableSort
 #include "../common/api_entry.h"  // XGBAPIThreadLocalEntry
 #include "../common/group_data.h"
 #include "../common/io.h"
 #include "../common/linalg_op.h"
 #include "../common/math.h"
-#include "../common/numeric.h"
+#include "../common/numeric.h"  // Iota
 #include "../common/threading_utils.h"
 #include "../common/version.h"
 #include "../data/adapter.h"
@@ -258,6 +259,19 @@ void LoadFeatureType(std::vector<std::string> const& type_names, std::vector<FeatureType>* types) {
 
+const std::vector<size_t>& MetaInfo::LabelAbsSort(Context const* ctx) const {
+  if (label_order_cache_.size() == labels.Size()) {
+    return label_order_cache_;
+  }
+  label_order_cache_.resize(labels.Size());
+  common::Iota(ctx, label_order_cache_.begin(), label_order_cache_.end(), 0);
+  const auto& l = labels.Data()->HostVector();
+  common::StableSort(ctx, label_order_cache_.begin(), label_order_cache_.end(),
+                     [&l](size_t i1, size_t i2) { return std::abs(l[i1]) < std::abs(l[i2]); });
+
+  return label_order_cache_;
+}
+
 void MetaInfo::LoadBinary(dmlc::Stream *fi) {
   auto version = Version::Load(fi);
   auto major = std::get<0>(version);
diff --git a/src/metric/auc.cc b/src/metric/auc.cc
index 9bedd95ee..a926c2c5b 100644
--- a/src/metric/auc.cc
+++ b/src/metric/auc.cc
@@ -14,9 +14,11 @@
 #include
 #include
 
+#include "../common/algorithm.h"  // ArgSort
 #include "../common/math.h"
 #include "../common/optional_weight.h"  // OptionalWeights
 #include "metric_common.h"  // MetricNoCache
+#include "xgboost/context.h"
 #include "xgboost/host_device_vector.h"
 #include "xgboost/linalg.h"
 #include "xgboost/metric.h"
@@ -77,9 +79,8 @@ BinaryAUC(common::Span<float const> predts, linalg::VectorView<float const> labe
  * Machine Learning Models
  */
 template <typename BinaryAUC>
-double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
-                     size_t n_classes, int32_t n_threads,
-                     BinaryAUC &&binary_auc) {
+double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaInfo const &info,
+                     size_t n_classes, int32_t n_threads, BinaryAUC &&binary_auc) {
   CHECK_NE(n_classes, 0);
   auto const labels = info.labels.View(Context::kCpuId);
   if (labels.Shape(0) != 0) {
@@ -108,7 +109,7 @@ double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
       }
       double fp;
       std::tie(fp, tp(c), auc(c)) =
-          binary_auc(proba, linalg::MakeVec(response.data(), response.size(), -1), weights);
+          binary_auc(ctx, proba, linalg::MakeVec(response.data(), response.size(), -1), weights);
       local_area(c) = fp * tp(c);
     });
   }
@@ -139,23 +140,26 @@ double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
   return auc_sum;
 }
 
-std::tuple<double, double, double> BinaryROCAUC(common::Span<float const> predts,
+std::tuple<double, double, double> BinaryROCAUC(Context const *ctx,
+                                                common::Span<float const> predts,
                                                 linalg::VectorView<float const> labels,
                                                 common::OptionalWeights weights) {
-  auto const sorted_idx = common::ArgSort<size_t>(predts, std::greater<>{});
+  auto const sorted_idx =
+      common::ArgSort<size_t>(ctx, predts.data(), predts.data() + predts.size(), std::greater<>{});
   return BinaryAUC(predts, labels, weights, sorted_idx, TrapezoidArea);
 }
 
 /**
  * Calculate AUC for 1 ranking group;
 */
-double GroupRankingROC(common::Span<float const> predts,
+double GroupRankingROC(Context const* ctx, common::Span<float const> predts,
                        linalg::VectorView<float const> labels, float w) {
   // on ranking, we just count all pairs.
   double auc{0};
   // argsort doesn't support tensor input yet.
   auto raw_labels = labels.Values().subspan(0, labels.Size());
-  auto const sorted_idx = common::ArgSort<size_t>(raw_labels, std::greater<>{});
+  auto const sorted_idx = common::ArgSort<size_t>(
+      ctx, raw_labels.data(), raw_labels.data() + raw_labels.size(), std::greater<>{});
 
   w = common::Sqr(w);
   double sum_w = 0.0f;
@@ -185,10 +189,11 @@ double GroupRankingROC(common::Span<float const> predts,
  *
  * https://doi.org/10.1371/journal.pone.0092209
 */
-std::tuple<double, double, double> BinaryPRAUC(common::Span<float const> predts,
+std::tuple<double, double, double> BinaryPRAUC(Context const *ctx, common::Span<float const> predts,
                                                linalg::VectorView<float const> labels,
                                                common::OptionalWeights weights) {
-  auto const sorted_idx = common::ArgSort<size_t>(predts, std::greater<>{});
+  auto const sorted_idx =
+      common::ArgSort<size_t>(ctx, predts.data(), predts.data() + predts.size(), std::greater<>{});
   double total_pos{0}, total_neg{0};
   for (size_t i = 0; i < labels.Size(); ++i) {
     auto w = weights[i];
@@ -211,9 +216,8 @@ std::tuple<double, double, double> BinaryPRAUC(common::Span<float const> predts,
  * Cast LTR problem to binary classification problem by comparing pairs.
 */
 template <bool is_roc>
-std::pair<double, uint32_t> RankingAUC(std::vector<float> const &predts,
-                                       MetaInfo const &info,
-                                       int32_t n_threads) {
+std::pair<double, uint32_t> RankingAUC(Context const *ctx, std::vector<float> const &predts,
+                                       MetaInfo const &info, int32_t n_threads) {
   CHECK_GE(info.group_ptr_.size(), 2);
   uint32_t n_groups = info.group_ptr_.size() - 1;
   auto s_predts = common::Span{predts};
@@ -237,9 +241,9 @@ std::pair<double, uint32_t> RankingAUC(std::vector<float> const &predts,
       auc = 0;
     } else {
       if (is_roc) {
-        auc = GroupRankingROC(g_predts, g_labels, w);
+        auc = GroupRankingROC(ctx, g_predts, g_labels, w);
       } else {
-        auc = std::get<2>(BinaryPRAUC(g_predts, g_labels, common::OptionalWeights{w}));
+        auc = std::get<2>(BinaryPRAUC(ctx, g_predts, g_labels, common::OptionalWeights{w}));
       }
       if (std::isnan(auc)) {
         invalid_groups++;
@@ -344,7 +348,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
     auto n_threads = ctx_->Threads();
     if (ctx_->gpu_id == Context::kCpuId) {
       std::tie(auc, valid_groups) =
-          RankingAUC<true>(predts.ConstHostVector(), info, n_threads);
+          RankingAUC<true>(ctx_, predts.ConstHostVector(), info, n_threads);
     } else {
       std::tie(auc, valid_groups) =
           GPURankingAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_);
@@ -358,8 +362,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
     auto n_threads = ctx_->Threads();
     CHECK_NE(n_classes, 0);
     if (ctx_->gpu_id == Context::kCpuId) {
-      auc = MultiClassOVR(predts.ConstHostVector(), info, n_classes, n_threads,
-                          BinaryROCAUC);
+      auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC);
     } else {
       auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes);
     }
@@ -370,9 +373,9 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
   EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
     double fp, tp, auc;
     if (ctx_->gpu_id == Context::kCpuId) {
-      std::tie(fp, tp, auc) =
-          BinaryROCAUC(predts.ConstHostVector(), info.labels.HostView().Slice(linalg::All(), 0),
-                       common::OptionalWeights{info.weights_.ConstHostSpan()});
+      std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(),
+                                           info.labels.HostView().Slice(linalg::All(), 0),
+                                           common::OptionalWeights{info.weights_.ConstHostSpan()});
     } else {
       std::tie(fp, tp, auc) =
           GPUBinaryROCAUC(predts.ConstDeviceSpan(), info, ctx_->gpu_id, &this->d_cache_);
@@ -422,7 +425,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
     double pr, re, auc;
     if (ctx_->gpu_id == Context::kCpuId) {
       std::tie(pr, re, auc) =
-          BinaryPRAUC(predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
+          BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
                       common::OptionalWeights{info.weights_.ConstHostSpan()});
     } else {
       std::tie(pr, re, auc) = GPUBinaryPRAUC(predts.ConstDeviceSpan(), info,
@@ -435,8 +438,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
                    size_t n_classes) {
     if (ctx_->gpu_id == Context::kCpuId) {
       auto n_threads = this->ctx_->Threads();
-      return MultiClassOVR(predts.ConstHostSpan(), info, n_classes, n_threads,
-                           BinaryPRAUC);
+      return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC);
     } else {
       return GPUMultiClassPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_, n_classes);
     }
@@ -453,7 +455,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
         InvalidLabels();
       }
       std::tie(auc, valid_groups) =
-          RankingAUC<false>(predts.ConstHostVector(), info, n_threads);
+          RankingAUC<false>(ctx_, predts.ConstHostVector(), info, n_threads);
     } else {
       std::tie(auc, valid_groups) =
           GPURankingPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_);
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index 7ca0243f2..d39c7302a 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -27,6 +27,7 @@
 #include <vector>
 
 #include "../collective/communicator-inl.h"
+#include "../common/algorithm.h"  // Sort
 #include "../common/math.h"
 #include "../common/ranking_utils.h"  // MakeMetricName
 #include "../common/threading_utils.h"
@@ -113,7 +114,7 @@ struct EvalAMS : public MetricNoCache {
     const auto &h_preds = preds.ConstHostVector();
     common::ParallelFor(ndata, ctx_->Threads(),
                         [&](bst_omp_uint i) { rec[i] = std::make_pair(h_preds[i], i); });
-    XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
+    common::Sort(ctx_, rec.begin(), rec.end(), common::CmpFirst);
     auto ntop = static_cast<bst_omp_uint>(ratio_ * ndata);
     if (ntop == 0) ntop = ndata;
     const double br = 10.0;
@@ -330,7 +331,7 @@ struct EvalCox : public MetricNoCache {
     using namespace std;  // NOLINT(*)
 
     const auto ndata = static_cast<bst_omp_uint>(info.labels.Size());
-    const auto &label_order = info.LabelAbsSort();
+    const auto &label_order = info.LabelAbsSort(ctx_);
 
     // pre-compute a sum for the denominator
     double exp_p_sum = 0;  // we use double because we might need the precision with large datasets
diff --git a/src/objective/adaptive.cc b/src/objective/adaptive.cc
index 173decb96..9b341f4a7 100644
--- a/src/objective/adaptive.cc
+++ b/src/objective/adaptive.cc
@@ -6,24 +6,25 @@
 #include
 #include
 
-#include "../common/common.h"
-#include "../common/numeric.h"
-#include "../common/stats.h"
-#include "../common/threading_utils.h"
+#include "../common/algorithm.h"        // ArgSort
+#include "../common/numeric.h"          // RunLengthEncode
+#include "../common/stats.h"            // Quantile,WeightedQuantile
+#include "../common/threading_utils.h"  // ParallelFor
 #include "../common/transform_iterator.h"  // MakeIndexTransformIter
+#include "xgboost/context.h"  // Context
 #include "xgboost/linalg.h"
 #include "xgboost/tree_model.h"
 
 namespace xgboost {
 namespace obj {
 namespace detail {
-void EncodeTreeLeafHost(RegTree const& tree, std::vector<bst_node_t> const& position,
-                        std::vector<size_t>* p_nptr, std::vector<bst_node_t>* p_nidx,
-                        std::vector<size_t>* p_ridx) {
+void EncodeTreeLeafHost(Context const* ctx, RegTree const& tree,
+                        std::vector<bst_node_t> const& position, std::vector<size_t>* p_nptr,
+                        std::vector<bst_node_t>* p_nidx, std::vector<size_t>* p_ridx) {
   auto& nptr = *p_nptr;
   auto& nidx = *p_nidx;
   auto& ridx = *p_ridx;
-  ridx = common::ArgSort<size_t>(position);
+  ridx = common::ArgSort<size_t>(ctx, position.cbegin(), position.cend());
   std::vector<bst_node_t> sorted_pos(position);
   // permutation
   for (size_t i = 0; i < position.size(); ++i) {
@@ -74,7 +75,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
   std::vector<bst_node_t> nidx;
   std::vector<size_t> nptr;
   std::vector<size_t> ridx;
-  EncodeTreeLeafHost(*p_tree, position, &nptr, &nidx, &ridx);
+  EncodeTreeLeafHost(ctx, *p_tree, position, &nptr, &nidx, &ridx);
   size_t n_leaf = nidx.size();
   if (nptr.empty()) {
     std::vector<float> quantiles;
diff --git a/src/objective/init_estimation.cc b/src/objective/init_estimation.cc
index 3a2ff0dec..96fd5d653 100644
--- a/src/objective/init_estimation.cc
+++ b/src/objective/init_estimation.cc
@@ -1,5 +1,10 @@
+/**
+ * Copyright 2022-2023 by XGBoost contributors
+ */
 #include "init_estimation.h"
 
+#include <memory>  // unique_ptr
+
 #include "../common/stats.h"    // Mean
 #include "../tree/fit_stump.h"  // FitStump
 #include "xgboost/base.h"       // GradientPair
diff --git a/src/objective/init_estimation.h b/src/objective/init_estimation.h
index 1a243523c..b0a91d8c3 100644
--- a/src/objective/init_estimation.h
+++ b/src/objective/init_estimation.h
@@ -1,3 +1,8 @@
+/**
+ * Copyright 2022-2023 by XGBoost contributors
+ */
+#ifndef XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_
+#define XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_
 #include "xgboost/data.h"       // MetaInfo
 #include "xgboost/linalg.h"     // Tensor
 #include "xgboost/objective.h"  // ObjFunction
@@ -17,3 +22,4 @@ inline void CheckInitInputs(MetaInfo const& info) {
   }
 }  // namespace obj
 }  // namespace xgboost
+#endif  // XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_
diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu
index 29e2255e4..332646095 100644
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -393,7 +393,7 @@ class CoxRegression : public FitIntercept {
     const auto& preds_h = preds.HostVector();
     out_gpair->Resize(preds_h.size());
     auto& gpair = out_gpair->HostVector();
-    const std::vector<size_t> &label_order = info.LabelAbsSort();
+    const std::vector<size_t> &label_order = info.LabelAbsSort(ctx_);
 
     const omp_ulong ndata = static_cast<omp_ulong>(preds_h.size());  // NOLINT(*)
     const bool is_null_weight = info.weights_.Size() == 0;
diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h
index f76565e9a..5d7f3b193 100644
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -34,10 +34,10 @@ class HistEvaluator {
   };
 
  private:
+  Context const* ctx_;
   TrainParam param_;
   std::shared_ptr<common::ColumnSampler> column_sampler_;
   TreeEvaluator tree_evaluator_;
-  int32_t n_threads_ {0};
   FeatureInteractionConstraintHost interaction_constraints_;
   std::vector<NodeEntry> snode_;
@@ -283,6 +283,7 @@ class HistEvaluator {
   void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut,
                       common::Span<FeatureType const> feature_types, const RegTree &tree,
                       std::vector<CPUExpandEntry> *p_entries) {
+    auto n_threads = ctx_->Threads();
     auto& entries = *p_entries;
     // All nodes are on the same level, so we can store the shared ptr.
     std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(
         entries.size());
@@ -294,23 +295,23 @@ class HistEvaluator {
     }
     CHECK(!features.empty());
     const size_t grain_size =
-        std::max<size_t>(1, features.front()->Size() / n_threads_);
+        std::max<size_t>(1, features.front()->Size() / n_threads);
     common::BlockedSpace2d space(entries.size(), [&](size_t nidx_in_set) {
       return features[nidx_in_set]->Size();
     }, grain_size);
 
-    std::vector<CPUExpandEntry> tloc_candidates(n_threads_ * entries.size());
+    std::vector<CPUExpandEntry> tloc_candidates(n_threads * entries.size());
     for (size_t i = 0; i < entries.size(); ++i) {
-      for (decltype(n_threads_) j = 0; j < n_threads_; ++j) {
-        tloc_candidates[i * n_threads_ + j] = entries[i];
+      for (decltype(n_threads) j = 0; j < n_threads; ++j) {
+        tloc_candidates[i * n_threads + j] = entries[i];
       }
     }
     auto evaluator = tree_evaluator_.GetEvaluator();
     auto const& cut_ptrs = cut.Ptrs();
 
-    common::ParallelFor2d(space, n_threads_, [&](size_t nidx_in_set, common::Range1d r) {
+    common::ParallelFor2d(space, n_threads, [&](size_t nidx_in_set, common::Range1d r) {
       auto tidx = omp_get_thread_num();
-      auto entry = &tloc_candidates[n_threads_ * nidx_in_set + tidx];
+      auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx];
       auto best = &entry->split;
       auto nidx = entry->nid;
       auto histogram = hist[nidx];
@@ -349,9 +350,9 @@ class HistEvaluator {
 
     for (unsigned nidx_in_set = 0; nidx_in_set < entries.size();
          ++nidx_in_set) {
-      for (auto tidx = 0; tidx < n_threads_; ++tidx) {
+      for (auto tidx = 0; tidx < n_threads; ++tidx) {
         entries[nidx_in_set].split.Update(
-            tloc_candidates[n_threads_ * nidx_in_set + tidx].split);
+            tloc_candidates[n_threads * nidx_in_set + tidx].split);
       }
     }
   }
@@ -424,15 +425,15 @@ class HistEvaluator {
  public:
   // The column sampler must be constructed by caller since we need to preserve the rng
   // for the entire training session.
-  explicit HistEvaluator(TrainParam const &param, MetaInfo const &info, int32_t n_threads,
+  explicit HistEvaluator(Context const* ctx, TrainParam const &param, MetaInfo const &info,
                          std::shared_ptr<common::ColumnSampler> sampler)
-      : param_{param},
+      : ctx_{ctx}, param_{param},
         column_sampler_{std::move(sampler)},
-        tree_evaluator_{param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId},
-        n_threads_{n_threads} {
+        tree_evaluator_{param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId} {
     interaction_constraints_.Configure(param, info.num_col_);
-    column_sampler_->Init(info.num_col_, info.feature_weights.HostVector(), param_.colsample_bynode,
-                          param_.colsample_bylevel, param_.colsample_bytree);
+    column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
+                          param_.colsample_bynode, param_.colsample_bylevel,
+                          param_.colsample_bytree);
   }
 };
diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc
index 0e3675888..4852e325f 100644
--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -171,7 +171,7 @@ class GloablApproxBuilder {
                       common::Monitor *monitor)
       : param_{std::move(param)},
         col_sampler_{std::move(column_sampler)},
-        evaluator_{param_, info, ctx->Threads(), col_sampler_},
+        evaluator_{ctx, param_, info, col_sampler_},
         ctx_{ctx},
         task_{task},
         monitor_{monitor} {}
diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc
index 07483038c..197c8fe5c 100644
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -234,9 +234,9 @@ class ColMaker: public TreeUpdater {
       }
     }
     {
-      column_sampler_.Init(fmat.Info().num_col_, fmat.Info().feature_weights.ConstHostVector(),
-                           param_.colsample_bynode, param_.colsample_bylevel,
-                           param_.colsample_bytree);
+      column_sampler_.Init(ctx_, fmat.Info().num_col_,
+                           fmat.Info().feature_weights.ConstHostVector(), param_.colsample_bynode,
+                           param_.colsample_bylevel, param_.colsample_bytree);
     }
     {
       // setup temp space for each thread
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index a02ee5cdd..ccd1c24dd 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -243,7 +243,7 @@ struct GPUHistMakerDevice {
   // thread safe
   void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
     auto const& info = dmat->Info();
-    this->column_sampler.Init(num_columns, info.feature_weights.HostVector(),
+    this->column_sampler.Init(ctx_, num_columns, info.feature_weights.HostVector(),
                               param.colsample_bynode, param.colsample_bylevel,
                               param.colsample_bytree);
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index f7cf73f1d..ad2e57aa9 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -290,8 +290,7 @@ void QuantileHistMaker::Builder::InitData(DMatrix *fmat, const RegTree &tree,
 
   // store a pointer to the tree
   p_last_tree_ = &tree;
-  evaluator_.reset(
-      new HistEvaluator{param_, info, this->ctx_->Threads(), column_sampler_});
+  evaluator_.reset(new HistEvaluator{ctx_, param_, info, column_sampler_});
 
   monitor_->Stop(__func__);
 }
diff --git a/tests/cpp/common/test_algorithm.cc b/tests/cpp/common/test_algorithm.cc
new file mode 100644
index 000000000..630460714
--- /dev/null
+++ b/tests/cpp/common/test_algorithm.cc
@@ -0,0 +1,35 @@
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/context.h>  // Context
+#include
+
+#include <algorithm>  // is_sorted
+
+#include "../../../src/common/algorithm.h"
+
+namespace xgboost {
+namespace common {
+TEST(Algorithm, ArgSort) {
+  Context ctx;
+  std::vector<float> inputs{3.0, 2.0, 1.0};
+  auto ret = ArgSort<std::size_t>(&ctx, inputs.cbegin(), inputs.cend());
+  std::vector<std::size_t> sol{2, 1, 0};
+  ASSERT_EQ(ret, sol);
+}
+
+TEST(Algorithm, Sort) {
+  Context ctx;
+  ctx.Init(Args{{"nthread", "8"}});
+  std::vector<float> inputs{3.0, 1.0, 2.0};
+
+  Sort(&ctx, inputs.begin(), inputs.end(), std::less<>{});
+  ASSERT_TRUE(std::is_sorted(inputs.cbegin(), inputs.cend()));
+
+  inputs = {3.0, 1.0, 2.0};
+  StableSort(&ctx, inputs.begin(), inputs.end(), std::less<>{});
+  ASSERT_TRUE(std::is_sorted(inputs.cbegin(), inputs.cend()));
+}
+}  // namespace common
+}  // namespace xgboost
diff --git a/tests/cpp/common/test_algorithm.cu b/tests/cpp/common/test_algorithm.cu
index c2e159dc4..982f0c9ca 100644
--- a/tests/cpp/common/test_algorithm.cu
+++ b/tests/cpp/common/test_algorithm.cu
@@ -52,9 +52,9 @@ void TestSegmentedArgSort() {
   }
 }
 
-TEST(Algorithms, SegmentedArgSort) { TestSegmentedArgSort(); }
+TEST(Algorithm, SegmentedArgSort) { TestSegmentedArgSort(); }
 
-TEST(Algorithms, ArgSort) {
+TEST(Algorithm, GpuArgSort) {
   Context ctx;
   ctx.gpu_id = 0;
 
@@ -80,7 +80,7 @@ TEST(Algorithms, ArgSort) {
   thrust::is_sorted(sorted_idx.begin() + 10, sorted_idx.end(), thrust::greater<size_t>{}));
 }
 
-TEST(Algorithms, SegmentedSequence) {
+TEST(Algorithm, SegmentedSequence) {
   dh::device_vector<size_t> idx(16);
   dh::device_vector<size_t> ptr(3);
   Context ctx = CreateEmptyGenericParam(0);
diff --git a/tests/cpp/common/test_common.cc b/tests/cpp/common/test_common.cc
deleted file mode 100644
index adaf21fea..000000000
--- a/tests/cpp/common/test_common.cc
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <gtest/gtest.h>
-#include <xgboost/span.h>
-#include "../../../src/common/common.h"
-
-namespace xgboost {
-namespace common {
-TEST(ArgSort, Basic) {
-  std::vector<float> inputs {3.0, 2.0, 1.0};
-  auto ret = ArgSort<size_t>(Span{inputs});
-  std::vector<size_t> sol{2, 1, 0};
-  ASSERT_EQ(ret, sol);
-}
-}  // namespace common
-}  // namespace xgboost
diff --git a/tests/cpp/common/test_random.cc b/tests/cpp/common/test_random.cc
index 201f7b407..e2ecd0990 100644
--- a/tests/cpp/common/test_random.cc
+++ b/tests/cpp/common/test_random.cc
@@ -2,16 +2,18 @@
 #include "../../../src/common/random.h"
 #include "../helpers.h"
 #include "gtest/gtest.h"
+#include "xgboost/context.h"  // Context
 
 namespace xgboost {
 namespace common {
 TEST(ColumnSampler, Test) {
+  Context ctx;
   int n = 128;
   ColumnSampler cs;
   std::vector<float> feature_weights;
 
   // No node sampling
-  cs.Init(n, feature_weights, 1.0f, 0.5f, 0.5f);
+  cs.Init(&ctx, n, feature_weights, 1.0f, 0.5f, 0.5f);
   auto set0 = cs.GetFeatureSet(0);
   ASSERT_EQ(set0->Size(), 32);
 
@@ -24,7 +26,7 @@ TEST(ColumnSampler, Test) {
   ASSERT_EQ(set2->Size(), 32);
 
   // Node sampling
-  cs.Init(n, feature_weights, 0.5f, 1.0f, 0.5f);
+  cs.Init(&ctx, n, feature_weights, 0.5f, 1.0f, 0.5f);
   auto set3 = cs.GetFeatureSet(0);
   ASSERT_EQ(set3->Size(), 32);
 
@@ -34,24 +36,25 @@ TEST(ColumnSampler, Test) {
   ASSERT_EQ(set4->Size(), 32);
 
   // No level or node sampling, should be the same at different depth
-  cs.Init(n, feature_weights, 1.0f, 1.0f, 0.5f);
+  cs.Init(&ctx, n, feature_weights, 1.0f, 1.0f, 0.5f);
   ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(),
             cs.GetFeatureSet(1)->HostVector());
 
-  cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f);
+  cs.Init(&ctx, n, feature_weights, 1.0f, 1.0f, 1.0f);
   auto set5 = cs.GetFeatureSet(0);
   ASSERT_EQ(set5->Size(), n);
-  cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f);
+  cs.Init(&ctx, n, feature_weights, 1.0f, 1.0f, 1.0f);
   auto set6 = cs.GetFeatureSet(0);
   ASSERT_EQ(set5->HostVector(), set6->HostVector());
 
   // Should always be a minimum of one feature
-  cs.Init(n, feature_weights, 1e-16f, 1e-16f, 1e-16f);
+  cs.Init(&ctx, n, feature_weights, 1e-16f, 1e-16f, 1e-16f);
   ASSERT_EQ(cs.GetFeatureSet(0)->Size(), 1);
 }
 
 // Test if different threads using the same seed produce the same result
 TEST(ColumnSampler, ThreadSynchronisation) {
+  Context ctx;
   const int64_t num_threads = 100;
   int n = 128;
   size_t iterations = 10;
@@ -63,7 +66,7 @@ TEST(ColumnSampler, ThreadSynchronisation) {
   {
     for (auto j = 0ull; j < iterations; j++) {
       ColumnSampler cs(j);
-      cs.Init(n, feature_weights, 0.5f, 0.5f, 0.5f);
+      cs.Init(&ctx, n, feature_weights, 0.5f, 0.5f, 0.5f);
       for (auto level = 0ull; level < levels; level++) {
         auto result = cs.GetFeatureSet(level)->ConstHostVector();
 #pragma omp single
@@ -80,11 +83,12 @@ TEST(ColumnSampler, ThreadSynchronisation) {
 
 TEST(ColumnSampler, WeightedSampling) {
   auto test_basic = [](int first) {
+    Context ctx;
     std::vector<float> feature_weights(2);
     feature_weights[0] = std::abs(first - 1.0f);
     feature_weights[1] = first - 0.0f;
     ColumnSampler cs{0};
-    cs.Init(2, feature_weights, 1.0, 1.0, 0.5);
+    cs.Init(&ctx, 2, feature_weights, 1.0, 1.0, 0.5);
     auto feature_sets = cs.GetFeatureSet(0);
     auto const &h_feat_set = feature_sets->HostVector();
     ASSERT_EQ(h_feat_set.size(), 1);
@@ -100,7 +104,8 @@ TEST(ColumnSampler, WeightedSampling) {
   SimpleRealUniformDistribution<float> dist(.0f, 12.0f);
   std::generate(feature_weights.begin(), feature_weights.end(), [&]() { return dist(&rng); });
   ColumnSampler cs{0};
-  cs.Init(kCols, feature_weights, 0.5f, 1.0f, 1.0f);
+  Context ctx;
+  cs.Init(&ctx, kCols, feature_weights, 0.5f, 1.0f, 1.0f);
   std::vector<bst_feature_t> features(kCols);
   std::iota(features.begin(), features.end(), 0);
   std::vector<float> freq(kCols, 0);
@@ -135,7 +140,8 @@ TEST(ColumnSampler, WeightedMultiSampling) {
   }
   ColumnSampler cs{0};
   float bytree{0.5}, bylevel{0.5}, bynode{0.5};
-  cs.Init(feature_weights.size(), feature_weights, bytree, bylevel, bynode);
+  Context ctx;
+  cs.Init(&ctx, feature_weights.size(), feature_weights, bytree, bylevel, bynode);
   auto feature_set = cs.GetFeatureSet(0);
   size_t n_sampled = kCols * bytree * bylevel * bynode;
   ASSERT_EQ(feature_set->Size(), n_sampled);
diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc
index c45ed5385..984f46881 100644
--- a/tests/cpp/tree/hist/test_evaluate_splits.cc
+++ b/tests/cpp/tree/hist/test_evaluate_splits.cc
@@ -9,12 +9,14 @@
 #include "../../../../src/tree/hist/evaluate_splits.h"
 #include "../test_evaluate_splits.h"
 #include "../../helpers.h"
+#include "xgboost/context.h"  // Context
 
 namespace xgboost {
 namespace tree {
 void TestEvaluateSplits(bool force_read_by_column) {
+  Context ctx;
+  ctx.nthread = 4;
   int static constexpr kRows = 8, kCols = 16;
-  int32_t n_threads = std::min(omp_get_max_threads(), 4);
   auto sampler = std::make_shared<common::ColumnSampler>();
   TrainParam param;
@@ -22,7 +24,7 @@ void TestEvaluateSplits(bool force_read_by_column) {
   auto dmat = RandomDataGenerator(kRows, kCols, 0).Seed(3).GenerateDMatrix();
 
-  auto evaluator = HistEvaluator{param, dmat->Info(), n_threads, sampler};
+  auto evaluator = HistEvaluator{&ctx, param, dmat->Info(), sampler};
   common::HistCollection hist;
   std::vector<GradientPair> row_gpairs = {
       {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f},
@@ -86,13 +88,15 @@ TEST(HistEvaluator, Evaluate) {
 }
 
 TEST(HistEvaluator, Apply) {
+  Context ctx;
+  ctx.nthread = 4;
   RegTree tree;
   int static constexpr kNRows = 8, kNCols = 16;
   TrainParam param;
   param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0.0"}});
   auto dmat = RandomDataGenerator(kNRows, kNCols, 0).Seed(3).GenerateDMatrix();
   auto sampler = std::make_shared<common::ColumnSampler>();
-  auto evaluator_ = HistEvaluator{param, dmat->Info(), 4, sampler};
+  auto evaluator_ = HistEvaluator{&ctx, param, dmat->Info(), sampler};
 
   CPUExpandEntry entry{0, 0, 10.0f};
   entry.split.left_sum = GradStats{0.4, 0.6f};
@@ -115,10 +119,11 @@ TEST(HistEvaluator, Apply) {
 }
 
 TEST_F(TestPartitionBasedSplit, CPUHist) {
+  Context ctx;
   // check the evaluator is returning the optimal split
   std::vector<FeatureType> ft{FeatureType::kCategorical};
   auto sampler = std::make_shared<common::ColumnSampler>();
-  HistEvaluator evaluator{param_, info_, AllThreadsForTest(), sampler};
+  HistEvaluator evaluator{&ctx, param_, info_, sampler};
   evaluator.InitRoot(GradStats{total_gpair_});
   RegTree tree;
   std::vector<CPUExpandEntry> entries(1);
@@ -128,6 +133,7 @@ TEST_F(TestPartitionBasedSplit, CPUHist) {
 
 namespace {
 auto CompareOneHotAndPartition(bool onehot) {
+  Context ctx;
   int static constexpr kRows = 128, kCols = 1;
   std::vector<FeatureType> ft(kCols, FeatureType::kCategorical);
 
@@ -147,8 +153,7 @@ auto CompareOneHotAndPartition(bool onehot) {
       RandomDataGenerator(kRows, kCols, 0).Seed(3).Type(ft).MaxCategory(n_cats).GenerateDMatrix();
   auto sampler = std::make_shared<common::ColumnSampler>();
 
-  auto evaluator =
-      HistEvaluator{param, dmat->Info(), AllThreadsForTest(), sampler};
+  auto evaluator = HistEvaluator{&ctx, param, dmat->Info(), sampler};
   std::vector<CPUExpandEntry> entries(1);
 
   for (auto const &gmat : dmat->GetBatches<GHistIndexMatrix>({32, param.sparse_threshold})) {
@@ -198,8 +203,8 @@ TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
   MetaInfo info;
   info.num_col_ = 1;
   info.feature_types = {FeatureType::kCategorical};
-  auto evaluator =
-      HistEvaluator{param_, info, AllThreadsForTest(), sampler};
+  Context ctx;
+  auto evaluator = HistEvaluator{&ctx, param_, info, sampler};
   evaluator.InitRoot(GradStats{parent_sum_});
 
   std::vector<CPUExpandEntry> entries(1);