Extract device algorithms. (#8789)

2023-02-13 20:53:53 +08:00
parent 457f704e3d
commit 31d3ec07af
13 changed files with 361 additions and 218 deletions
--- a/src/common/algorithm.cuh
+++ b/src/common/algorithm.cuh
@@ -1,27 +1,190 @@
-/*!
- * Copyright 2022 by XGBoost Contributors
+/**
+ * Copyright 2022-2023 by XGBoost Contributors
 */
-#pragma once
+#ifndef XGBOOST_COMMON_ALGORITHM_CUH_
+#define XGBOOST_COMMON_ALGORITHM_CUH_

-#include <thrust/binary_search.h>     // thrust::upper_bound
-#include <thrust/execution_policy.h>  // thrust::seq
+#include <thrust/copy.h>       // copy
+#include <thrust/sort.h>       // stable_sort_by_key
+#include <thrust/tuple.h>      // tuple,get

-#include "xgboost/base.h"
-#include "xgboost/span.h"
+#include <cstddef>             // size_t
+#include <cstdint>             // int32_t
+#include <cub/cub.cuh>         // DispatchSegmentedRadixSort,NullType,DoubleBuffer
+#include <iterator>            // distance
+#include <limits>              // numeric_limits
+#include <type_traits>         // conditional_t,remove_const_t
+
+#include "common.h"            // safe_cuda
+#include "cuda_context.cuh"    // CUDAContext
+#include "device_helpers.cuh"  // TemporaryArray,SegmentId,LaunchN,Iota,device_vector
+#include "xgboost/base.h"      // XGBOOST_DEVICE
+#include "xgboost/context.h"   // Context
+#include "xgboost/logging.h"   // CHECK
+#include "xgboost/span.h"      // Span,byte

 namespace xgboost {
 namespace common {
-namespace cuda {
-template <typename It>
-size_t XGBOOST_DEVICE SegmentId(It first, It last, size_t idx) {
-  size_t segment_id = thrust::upper_bound(thrust::seq, first, last, idx) - 1 - first;
-  return segment_id;
+namespace detail {
+// Wrapper around cub sort to define is_decending
+template <bool IS_DESCENDING, typename KeyT, typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT>
+static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_storage,
+                                         std::size_t &temp_storage_bytes,  // NOLINT
+                                         const KeyT *d_keys_in, KeyT *d_keys_out, int num_items,
+                                         int num_segments, BeginOffsetIteratorT d_begin_offsets,
+                                         EndOffsetIteratorT d_end_offsets, int begin_bit = 0,
+                                         int end_bit = sizeof(KeyT) * 8,
+                                         bool debug_synchronous = false) {
+  using OffsetT = int;
+
+  // Null value type
+  cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
+  cub::DoubleBuffer<cub::NullType> d_values;
+
+  dh::safe_cuda((cub::DispatchSegmentedRadixSort<
+                 IS_DESCENDING, KeyT, cub::NullType, BeginOffsetIteratorT, EndOffsetIteratorT,
+                 OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items,
+                                    num_segments, d_begin_offsets, d_end_offsets, begin_bit,
+                                    end_bit, false, ctx->Stream(), debug_synchronous)));
 }

-template <typename T>
-size_t XGBOOST_DEVICE SegmentId(Span<T> segments_ptr, size_t idx) {
-  return SegmentId(segments_ptr.cbegin(), segments_ptr.cend(), idx);
+// Wrapper around cub sort for easier `descending` sort.
+template <bool descending, typename KeyT, typename ValueT, typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT>
+void DeviceSegmentedRadixSortPair(void *d_temp_storage,
+                                  std::size_t &temp_storage_bytes,  // NOLINT
+                                  const KeyT *d_keys_in, KeyT *d_keys_out,
+                                  const ValueT *d_values_in, ValueT *d_values_out,
+                                  std::size_t num_items, std::size_t num_segments,
+                                  BeginOffsetIteratorT d_begin_offsets,
+                                  EndOffsetIteratorT d_end_offsets, dh::CUDAStreamView stream,
+                                  int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) {
+  cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
+  cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in), d_values_out);
+  // In old version of cub, num_items in dispatch is also int32_t, no way to change.
+  using OffsetT = std::conditional_t<dh::BuildWithCUDACub() && dh::HasThrustMinorVer<13>(),
+                                     std::size_t, std::int32_t>;
+  CHECK_LE(num_items, std::numeric_limits<OffsetT>::max());
+  // For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation
+
+#if THRUST_MAJOR_VERSION >= 2
+  dh::safe_cuda((cub::DispatchSegmentedRadixSort<
+                 descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT,
+                 OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items,
+                                    num_segments, d_begin_offsets, d_end_offsets, begin_bit,
+                                    end_bit, false, stream)));
+#elif (THRUST_MAJOR_VERSION == 1 && THRUST_MINOR_VERSION >= 13)
+  dh::safe_cuda((cub::DispatchSegmentedRadixSort<
+                 descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT,
+                 OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items,
+                                    num_segments, d_begin_offsets, d_end_offsets, begin_bit,
+                                    end_bit, false, stream, false)));
+#else
+  dh::safe_cuda(
+      (cub::DispatchSegmentedRadixSort<descending, KeyT, ValueT, BeginOffsetIteratorT,
+                                       OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes,
+                                                          d_keys, d_values, num_items, num_segments,
+                                                          d_begin_offsets, d_end_offsets, begin_bit,
+                                                          end_bit, false, stream, false)));
+#endif
+}
+}  // namespace detail
+
+template <typename U, typename V>
+void SegmentedSequence(Context const *ctx, Span<U> d_offset_ptr, Span<V> out_sequence) {
+  dh::LaunchN(out_sequence.size(), ctx->CUDACtx()->Stream(),
+              [out_sequence, d_offset_ptr] __device__(std::size_t idx) {
+                auto group = dh::SegmentId(d_offset_ptr, idx);
+                out_sequence[idx] = idx - d_offset_ptr[group];
+              });
+}
+
+template <bool descending, typename U, typename V>
+inline void SegmentedSortKeys(Context const *ctx, Span<V const> group_ptr,
+                              Span<U> out_sorted_values) {
+  CHECK_GE(group_ptr.size(), 1ul);
+  std::size_t n_groups = group_ptr.size() - 1;
+  std::size_t bytes = 0;
+  auto const *cuctx = ctx->CUDACtx();
+  CHECK(cuctx);
+  detail::DeviceSegmentedRadixSortKeys<descending>(
+      cuctx, nullptr, bytes, out_sorted_values.data(), out_sorted_values.data(),
+      out_sorted_values.size(), n_groups, group_ptr.data(), group_ptr.data() + 1);
+  dh::TemporaryArray<byte> temp_storage(bytes);
+  detail::DeviceSegmentedRadixSortKeys<descending>(
+      cuctx, temp_storage.data().get(), bytes, out_sorted_values.data(), out_sorted_values.data(),
+      out_sorted_values.size(), n_groups, group_ptr.data(), group_ptr.data() + 1);
+}
+
+/**
+ * \brief Create sorted index for data with multiple segments.
+ *
+ * \tparam accending sorted in non-decreasing order.
+ * \tparam per_seg_index Index starts from 0 for each segment if true, otherwise the
+ *                       the index span the whole data.
+ */
+template <bool accending, bool per_seg_index, typename U, typename V, typename IdxT>
+void SegmentedArgSort(Context const *ctx, Span<U> values, Span<V> group_ptr,
+                      Span<IdxT> sorted_idx) {
+  CHECK_GE(group_ptr.size(), 1ul);
+  std::size_t n_groups = group_ptr.size() - 1;
+  std::size_t bytes = 0;
+  if (per_seg_index) {
+    SegmentedSequence(ctx, group_ptr, sorted_idx);
+  } else {
+    dh::Iota(sorted_idx);
+  }
+  dh::TemporaryArray<std::remove_const_t<U>> values_out(values.size());
+  dh::TemporaryArray<std::remove_const_t<IdxT>> sorted_idx_out(sorted_idx.size());
+
+  detail::DeviceSegmentedRadixSortPair<!accending>(
+      nullptr, bytes, values.data(), values_out.data().get(), sorted_idx.data(),
+      sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(),
+      group_ptr.data() + 1, ctx->CUDACtx()->Stream());
+  dh::TemporaryArray<byte> temp_storage(bytes);
+  detail::DeviceSegmentedRadixSortPair<!accending>(
+      temp_storage.data().get(), bytes, values.data(), values_out.data().get(), sorted_idx.data(),
+      sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(),
+      group_ptr.data() + 1, ctx->CUDACtx()->Stream());
+
+  dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
+                                sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
+}
+
+/**
+ * \brief Different from the radix-sort-based argsort, this one can handle cases where
+ *        segment doesn't start from 0, but as a result it uses comparison sort.
+ */
+template <typename SegIt, typename ValIt>
+void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, ValIt val_begin,
+                           ValIt val_end, dh::device_vector<std::size_t> *p_sorted_idx) {
+  using Tup = thrust::tuple<std::int32_t, float>;
+  auto &sorted_idx = *p_sorted_idx;
+  std::size_t n = std::distance(val_begin, val_end);
+  sorted_idx.resize(n);
+  dh::Iota(dh::ToSpan(sorted_idx));
+  dh::device_vector<Tup> keys(sorted_idx.size());
+  auto key_it = dh::MakeTransformIterator<Tup>(thrust::make_counting_iterator(0ul),
+                                               [=] XGBOOST_DEVICE(std::size_t i) -> Tup {
+                                                 std::int32_t seg_idx;
+                                                 if (i < *seg_begin) {
+                                                   seg_idx = -1;
+                                                 } else {
+                                                   seg_idx = dh::SegmentId(seg_begin, seg_end, i);
+                                                 }
+                                                 auto residue = val_begin[i];
+                                                 return thrust::make_tuple(seg_idx, residue);
+                                               });
+  thrust::copy(ctx->CUDACtx()->CTP(), key_it, key_it + keys.size(), keys.begin());
+  thrust::stable_sort_by_key(ctx->CUDACtx()->TP(), keys.begin(), keys.end(), sorted_idx.begin(),
+                             [=] XGBOOST_DEVICE(Tup const &l, Tup const &r) {
+                               if (thrust::get<0>(l) != thrust::get<0>(r)) {
+                                 return thrust::get<0>(l) < thrust::get<0>(r);  // segment index
+                               }
+                               return thrust::get<1>(l) < thrust::get<1>(r);    // residue
+                             });
 }
-}  // namespace cuda
 }  // namespace common
 }  // namespace xgboost
+#endif  // XGBOOST_COMMON_ALGORITHM_CUH_
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -1,43 +1,39 @@
-/*!
- * Copyright 2017-2022 XGBoost contributors
+/**
+ * Copyright 2017-2023 XGBoost contributors
 */
 #pragma once
+#include <thrust/binary_search.h>  // thrust::upper_bound
+#include <thrust/device_malloc_allocator.h>
 #include <thrust/device_ptr.h>
 #include <thrust/device_vector.h>
-#include <thrust/device_malloc_allocator.h>
+#include <thrust/execution_policy.h>                    // thrust::seq
+#include <thrust/gather.h>                              // gather
 #include <thrust/iterator/discard_iterator.h>
-#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>  // make_transform_output_iterator
+#include <thrust/logical.h>
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
-#include <thrust/execution_policy.h>
-
 #include <thrust/transform_scan.h>
-#include <thrust/logical.h>
-#include <thrust/gather.h>
 #include <thrust/unique.h>
-#include <thrust/binary_search.h>
-
-#include <cub/cub.cuh>
-#include <cub/util_allocator.cuh>

 #include <algorithm>
 #include <chrono>
+#include <cub/cub.cuh>
+#include <cub/util_allocator.cuh>
 #include <numeric>
 #include <sstream>
 #include <string>
-#include <vector>
 #include <tuple>
-
-#include "xgboost/logging.h"
-#include "xgboost/host_device_vector.h"
-#include "xgboost/span.h"
-#include "xgboost/global_config.h"
+#include <vector>

 #include "../collective/communicator-inl.h"
 #include "common.h"
-#include "algorithm.cuh"
+#include "xgboost/global_config.h"
+#include "xgboost/host_device_vector.h"
+#include "xgboost/logging.h"
+#include "xgboost/span.h"

 #ifdef XGBOOST_USE_NCCL
 #include "nccl.h"
@@ -1015,7 +1011,16 @@ XGBOOST_DEVICE thrust::transform_iterator<FuncT, IterT, ReturnT> MakeTransformIt
  return thrust::transform_iterator<FuncT, IterT, ReturnT>(iter, func);
 }

-using xgboost::common::cuda::SegmentId;  // import it for compatibility
+template <typename It>
+size_t XGBOOST_DEVICE SegmentId(It first, It last, size_t idx) {
+  size_t segment_id = thrust::upper_bound(thrust::seq, first, last, idx) - 1 - first;
+  return segment_id;
+}
+
+template <typename T>
+size_t XGBOOST_DEVICE SegmentId(xgboost::common::Span<T> segments_ptr, size_t idx) {
+  return SegmentId(segments_ptr.cbegin(), segments_ptr.cend(), idx);
+}

 namespace detail {
 template <typename Key, typename KeyOutIt>
@@ -1288,114 +1293,6 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_i
                            sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
 }

-namespace detail {
-// Wrapper around cub sort for easier `descending` sort.
-template <bool descending, typename KeyT, typename ValueT,
-          typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
-void DeviceSegmentedRadixSortPair(
-    void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, // NOLINT
-    KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out,
-    size_t num_items, size_t num_segments, BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets, int begin_bit = 0,
-    int end_bit = sizeof(KeyT) * 8) {
-  cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
-  cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in),
-                                     d_values_out);
-  // In old version of cub, num_items in dispatch is also int32_t, no way to change.
-  using OffsetT =
-      std::conditional_t<BuildWithCUDACub() && HasThrustMinorVer<13>(), size_t,
-                         int32_t>;
-  CHECK_LE(num_items, std::numeric_limits<OffsetT>::max());
-  // For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation
-
-#if THRUST_MAJOR_VERSION >= 2
-  safe_cuda((cub::DispatchSegmentedRadixSort<
-             descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT,
-             OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
-                                d_values, num_items, num_segments,
-                                d_begin_offsets, d_end_offsets, begin_bit,
-                                end_bit, false, nullptr)));
-#elif (THRUST_MAJOR_VERSION == 1 && THRUST_MINOR_VERSION >= 13)
-  safe_cuda((cub::DispatchSegmentedRadixSort<
-             descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT,
-             OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
-                                d_values, num_items, num_segments,
-                                d_begin_offsets, d_end_offsets, begin_bit,
-                                end_bit, false, nullptr, false)));
-#else
-  safe_cuda((cub::DispatchSegmentedRadixSort<
-             descending, KeyT, ValueT, BeginOffsetIteratorT,
-             OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
-                                d_values, num_items, num_segments,
-                                d_begin_offsets, d_end_offsets, begin_bit,
-                                end_bit, false, nullptr, false)));
-#endif
-
-}
-}  // namespace detail
-
-template <bool accending, typename U, typename V, typename IdxT>
-void SegmentedArgSort(xgboost::common::Span<U> values,
-                      xgboost::common::Span<V> group_ptr,
-                      xgboost::common::Span<IdxT> sorted_idx) {
-  CHECK_GE(group_ptr.size(), 1ul);
-  size_t n_groups = group_ptr.size() - 1;
-  size_t bytes = 0;
-  Iota(sorted_idx);
-  TemporaryArray<std::remove_const_t<U>> values_out(values.size());
-  TemporaryArray<std::remove_const_t<IdxT>> sorted_idx_out(sorted_idx.size());
-
-  detail::DeviceSegmentedRadixSortPair<!accending>(
-      nullptr, bytes, values.data(), values_out.data().get(), sorted_idx.data(),
-      sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(),
-      group_ptr.data() + 1);
-  TemporaryArray<xgboost::common::byte> temp_storage(bytes);
-  detail::DeviceSegmentedRadixSortPair<!accending>(
-      temp_storage.data().get(), bytes, values.data(), values_out.data().get(),
-      sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(),
-      n_groups, group_ptr.data(), group_ptr.data() + 1);
-
-  safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
-                            sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
-}
-
-/**
- * \brief Different from the above one, this one can handle cases where segment doesn't
- *        start from 0, but as a result it uses comparison sort.
- */
-template <typename SegIt, typename ValIt>
-void SegmentedArgSort(SegIt seg_begin, SegIt seg_end, ValIt val_begin, ValIt val_end,
-                      dh::device_vector<size_t> *p_sorted_idx) {
-  using Tup = thrust::tuple<int32_t, float>;
-  auto &sorted_idx = *p_sorted_idx;
-  size_t n = std::distance(val_begin, val_end);
-  sorted_idx.resize(n);
-  dh::Iota(dh::ToSpan(sorted_idx));
-  dh::device_vector<Tup> keys(sorted_idx.size());
-  auto key_it = dh::MakeTransformIterator<Tup>(thrust::make_counting_iterator(0ul),
-                                               [=] XGBOOST_DEVICE(size_t i) -> Tup {
-                                                 int32_t leaf_idx;
-                                                 if (i < *seg_begin) {
-                                                   leaf_idx = -1;
-                                                 } else {
-                                                   leaf_idx = dh::SegmentId(seg_begin, seg_end, i);
-                                                 }
-                                                 auto residue = val_begin[i];
-                                                 return thrust::make_tuple(leaf_idx, residue);
-                                               });
-  dh::XGBCachingDeviceAllocator<char> caching;
-  thrust::copy(thrust::cuda::par(caching), key_it, key_it + keys.size(), keys.begin());
-
-  dh::XGBDeviceAllocator<char> alloc;
-  thrust::stable_sort_by_key(thrust::cuda::par(alloc), keys.begin(), keys.end(), sorted_idx.begin(),
-                             [=] XGBOOST_DEVICE(Tup const &l, Tup const &r) {
-                               if (thrust::get<0>(l) != thrust::get<0>(r)) {
-                                 return thrust::get<0>(l) < thrust::get<0>(r);  // segment index
-                               }
-                               return thrust::get<1>(l) < thrust::get<1>(r);  // residue
-                             });
-}
-
 class CUDAStreamView;

 class CUDAEvent {
@@ -1412,7 +1309,7 @@ class CUDAEvent {
  CUDAEvent(CUDAEvent const &that) = delete;
  CUDAEvent &operator=(CUDAEvent const &that) = delete;

-  inline void Record(CUDAStreamView stream);  // NOLINT
+  inline void Record(CUDAStreamView stream);       // NOLINT

  operator cudaEvent_t() const { return event_; }  // NOLINT
 };
--- a/src/common/stats.cuh
+++ b/src/common/stats.cuh
@@ -17,6 +17,7 @@
 #include <limits>                                  // std::numeric_limits
 #include <type_traits>                             // std::is_floating_point,std::iterator_traits

+#include "algorithm.cuh"                           // SegmentedArgMergeSort
 #include "cuda_context.cuh"                        // CUDAContext
 #include "device_helpers.cuh"
 #include "xgboost/context.h"                       // Context
@@ -150,7 +151,7 @@ void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, Se
                       ValIt val_begin, ValIt val_end, HostDeviceVector<float>* quantiles) {
  dh::device_vector<std::size_t> sorted_idx;
  using Tup = thrust::tuple<std::size_t, float>;
-  dh::SegmentedArgSort(seg_begin, seg_end, val_begin, val_end, &sorted_idx);
+  common::SegmentedArgMergeSort(ctx, seg_begin, seg_end, val_begin, val_end, &sorted_idx);
  auto n_segments = std::distance(seg_begin, seg_end) - 1;
  if (n_segments <= 0) {
    return;
@@ -203,7 +204,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
                               HostDeviceVector<float>* quantiles) {
  auto cuctx = ctx->CUDACtx();
  dh::device_vector<std::size_t> sorted_idx;
-  dh::SegmentedArgSort(seg_beg, seg_end, val_begin, val_end, &sorted_idx);
+  common::SegmentedArgMergeSort(ctx, seg_beg, seg_end, val_begin, val_end, &sorted_idx);
  auto d_sorted_idx = dh::ToSpan(sorted_idx);
  std::size_t n_weights = std::distance(w_begin, w_end);
  dh::device_vector<float> weights_cdf(n_weights);