Single precision histograms on GPU (#3965)

* Allow single precision histogram summation in gpu_hist
* Add python test, reduce run-time of gpu_hist tests
* Update documentation
2018-12-10 10:55:30 +13:00
parent 9af6b689d6
commit 93f9ce9ef9
10 changed files with 351 additions and 212 deletions
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -944,6 +944,32 @@ class AllReducer {
 #endif
  }

+  /**
+   * \brief Sum-allreduce of floats via ncclAllReduce, using the comm and stream
+   * stored for the given group. Body is compiled out without XGBOOST_USE_NCCL.
+   *
+   * \param communication_group_idx Zero-based index of the communication group.
+   * \param sendbuff                Source buffer handed to ncclAllReduce.
+   * \param recvbuff                Destination buffer handed to ncclAllReduce.
+   * \param count                   Number of float elements to reduce.
+   */
+
+  void AllReduceSum(int communication_group_idx, const float *sendbuff,
+                    float *recvbuff, int count) {
+#ifdef XGBOOST_USE_NCCL
+    CHECK(initialised_);
+    dh::safe_cuda(cudaSetDevice(device_ordinals.at(communication_group_idx)));
+    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum,
+                                comms.at(communication_group_idx),
+                                streams.at(communication_group_idx)));
+    if(communication_group_idx == 0)  // statistics are recorded for group 0 only
+    {
+      allreduce_bytes_ += count * sizeof(float);  // payload size in bytes
+      allreduce_calls_ += 1;
+    }
+#endif
+  }
+
  /**
   * \brief Allreduce. Use in exactly the same way as NCCL but without needing streams or comms.
   *