Loop over thrust::reduce. (#6229)

* Check input chunk size of dqdm. * Add doc for current limitation.
2020-10-14 05:40:56 +08:00
parent 734a911a26
commit bed7ae4083
10 changed files with 46 additions and 8 deletions
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -53,7 +53,7 @@ struct Pair {
  GradientPair first;
  GradientPair second;
 };
-XGBOOST_DEV_INLINE Pair operator+(Pair const& lhs, Pair const& rhs) {
+__host__ XGBOOST_DEV_INLINE Pair operator+(Pair const& lhs, Pair const& rhs) {
  return {lhs.first + rhs.first, lhs.second + rhs.second};
 }
 }  // anonymous namespace
@@ -86,7 +86,7 @@ GradientSumT CreateRoundingFactor(common::Span<GradientPair const> gpair) {
  thrust::device_ptr<GradientPair const> gpair_end {gpair.data() + gpair.size()};
  auto beg = thrust::make_transform_iterator(gpair_beg, Clip());
  auto end = thrust::make_transform_iterator(gpair_end, Clip());
-  Pair p = thrust::reduce(thrust::cuda::par(alloc), beg, end, Pair{});
+  Pair p = dh::Reduce(thrust::cuda::par(alloc), beg, end, Pair{}, thrust::plus<Pair>{});
  GradientPair positive_sum {p.first}, negative_sum {p.second};

  auto histogram_rounding = GradientSumT {
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -642,10 +642,11 @@ struct GPUHistMakerDevice {
  ExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
    constexpr bst_node_t kRootNIdx = 0;
    dh::XGBCachingDeviceAllocator<char> alloc;
-    GradientPair root_sum = thrust::reduce(
+    GradientPair root_sum = dh::Reduce(
        thrust::cuda::par(alloc),
        thrust::device_ptr<GradientPair const>(gpair.data()),
-        thrust::device_ptr<GradientPair const>(gpair.data() + gpair.size()));
+        thrust::device_ptr<GradientPair const>(gpair.data() + gpair.size()),
+        GradientPair{}, thrust::plus<GradientPair>{});
    rabit::Allreduce<rabit::op::Sum, float>(reinterpret_cast<float*>(&root_sum),
                                            2);