/** * Copyright 2022-2023 by XGBoost contributors. */ #ifndef XGBOOST_COMMON_NUMERIC_H_ #define XGBOOST_COMMON_NUMERIC_H_ #include // OMPException #include // for std::max #include // for size_t #include // for int32_t #include // for iterator_traits #include // for accumulate #include #include "common.h" // AssertGPUSupport #include "threading_utils.h" // MemStackAllocator, DefaultMaxThreads #include "xgboost/context.h" // Context #include "xgboost/host_device_vector.h" // HostDeviceVector namespace xgboost::common { /** * \brief Run length encode on CPU, input must be sorted. */ template void RunLengthEncode(Iter begin, Iter end, std::vector* p_out) { auto& out = *p_out; out = std::vector{0}; size_t n = std::distance(begin, end); for (size_t i = 1; i < n; ++i) { if (begin[i] != begin[i - 1]) { out.push_back(i); } } if (out.back() != n) { out.push_back(n); } } /** * \brief Varient of std::partial_sum, out_it should point to a container that has n + 1 * elements. Useful for constructing a CSR indptr. */ template void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) { static_assert(std::is_same::value_type>::value); static_assert(std::is_same::value_type>::value); // The number of threads is pegged to the batch size. If the OMP block is parallelized // on anything other than the batch/block size, it should be reassigned auto n = static_cast(std::distance(begin, end)); const size_t batch_threads = std::max(static_cast(1), std::min(n, static_cast(n_threads))); MemStackAllocator partial_sums(batch_threads); size_t block_size = n / batch_threads; dmlc::OMPException exc; #pragma omp parallel num_threads(batch_threads) { #pragma omp for for (omp_ulong tid = 0; tid < batch_threads; ++tid) { exc.Run([&]() { size_t ibegin = block_size * tid; size_t iend = (tid == (batch_threads - 1) ? n : (block_size * (tid + 1))); T running_sum = 0; for (size_t ridx = ibegin; ridx < iend; ++ridx) { running_sum += *(begin + ridx); *(out_it + 1 + ridx) = running_sum; } }); } #pragma omp single { exc.Run([&]() { partial_sums[0] = init; for (size_t i = 1; i < batch_threads; ++i) { partial_sums[i] = partial_sums[i - 1] + *(out_it + i * block_size); } }); } #pragma omp for for (omp_ulong tid = 0; tid < batch_threads; ++tid) { exc.Run([&]() { size_t ibegin = block_size * tid; size_t iend = (tid == (batch_threads - 1) ? n : (block_size * (tid + 1))); for (size_t i = ibegin; i < iend; ++i) { *(out_it + 1 + i) += partial_sums[tid]; } }); } } exc.Rethrow(); } namespace cuda_impl { double Reduce(Context const* ctx, HostDeviceVector const& values); #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP) inline double Reduce(Context const*, HostDeviceVector const&) { AssertGPUSupport(); return 0; } #endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP) } // namespace cuda_impl /** * \brief Reduction with iterator. init must be additive identity. (0 for primitive types) */ namespace cpu_impl { template V Reduce(Context const* ctx, It first, It second, V const& init) { std::size_t n = std::distance(first, second); auto n_threads = static_cast(std::min(n, static_cast(ctx->Threads()))); common::MemStackAllocator result_tloc(n_threads, init); common::ParallelFor(n, n_threads, [&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; }); auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + n_threads, init); return result; } } // namespace cpu_impl /** * \brief Reduction on host device vector. */ double Reduce(Context const* ctx, HostDeviceVector const& values); template void Iota(Context const* ctx, It first, It last, typename std::iterator_traits::value_type const& value) { auto n = std::distance(first, last); std::int32_t n_threads = ctx->Threads(); const size_t block_size = n / n_threads + !!(n % n_threads); dmlc::OMPException exc; #pragma omp parallel num_threads(n_threads) { exc.Run([&]() { const size_t tid = omp_get_thread_num(); const size_t ibegin = tid * block_size; const size_t iend = std::min(ibegin + block_size, static_cast(n)); for (size_t i = ibegin; i < iend; ++i) { first[i] = i + value; } }); } } } // namespace xgboost::common #endif // XGBOOST_COMMON_NUMERIC_H_