Single precision histograms on GPU (#3965)
* Allow single precision histogram summation in gpu_hist * Add python test, reduce run-time of gpu_hist tests * Update documentation
This commit is contained in:
@@ -944,6 +944,32 @@ class AllReducer {
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
 * \brief Allreduce sum over single-precision buffers. Use in exactly the same
 *        way as NCCL but without needing streams or comms. Compiles to a no-op
 *        when XGBOOST_USE_NCCL is not defined.
 *
 * \param communication_group_idx Zero-based index of the communication group.
 * \param sendbuff               The send buffer (count floats).
 * \param recvbuff               The receive buffer (count floats).
 * \param count                  Number of elements.
 */
void AllReduceSum(int communication_group_idx, const float *sendbuff,
                  float *recvbuff, int count) {
#ifdef XGBOOST_USE_NCCL
  CHECK(initialised_);
  dh::safe_cuda(cudaSetDevice(device_ordinals.at(communication_group_idx)));
  dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum,
                              comms.at(communication_group_idx),
                              streams.at(communication_group_idx)));
  // Record communication statistics only for group 0 so multi-group calls are
  // not double counted.
  if (communication_group_idx == 0) {
    // Widen before multiplying: `count * sizeof(float)` would be evaluated as
    // a size_t product anyway, but be explicit that the int does not overflow.
    allreduce_bytes_ += static_cast<size_t>(count) * sizeof(float);
    allreduce_calls_ += 1;
  }
#endif
}
|
||||
|
||||
/**
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing streams or comms.
|
||||
*
|
||||
|
||||
@@ -116,19 +116,19 @@ struct GPUSketcher {
|
||||
n_rows_(row_end - row_begin), param_(std::move(param)) {
|
||||
}
|
||||
|
||||
void Init(const SparsePage& row_batch, const MetaInfo& info) {
|
||||
void Init(const SparsePage& row_batch, const MetaInfo& info, int gpu_batch_nrows) {
|
||||
num_cols_ = info.num_col_;
|
||||
has_weights_ = info.weights_.Size() > 0;
|
||||
|
||||
// find the batch size
|
||||
if (param_.gpu_batch_nrows == 0) {
|
||||
if (gpu_batch_nrows == 0) {
|
||||
// By default, use no more than 1/16th of GPU memory
|
||||
gpu_batch_nrows_ = dh::TotalMemory(device_) /
|
||||
(16 * num_cols_ * sizeof(Entry));
|
||||
} else if (param_.gpu_batch_nrows == -1) {
|
||||
} else if (gpu_batch_nrows == -1) {
|
||||
gpu_batch_nrows_ = n_rows_;
|
||||
} else {
|
||||
gpu_batch_nrows_ = param_.gpu_batch_nrows;
|
||||
gpu_batch_nrows_ = gpu_batch_nrows;
|
||||
}
|
||||
if (gpu_batch_nrows_ > n_rows_) {
|
||||
gpu_batch_nrows_ = n_rows_;
|
||||
@@ -346,7 +346,8 @@ struct GPUSketcher {
|
||||
}
|
||||
};
|
||||
|
||||
void Sketch(const SparsePage& batch, const MetaInfo& info, HistCutMatrix* hmat) {
|
||||
void Sketch(const SparsePage& batch, const MetaInfo& info,
|
||||
HistCutMatrix* hmat, int gpu_batch_nrows) {
|
||||
// create device shards
|
||||
shards_.resize(dist_.Devices().Size());
|
||||
dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
|
||||
@@ -358,10 +359,11 @@ struct GPUSketcher {
|
||||
});
|
||||
|
||||
// compute sketches for each shard
|
||||
dh::ExecuteIndexShards(&shards_, [&](int idx, std::unique_ptr<DeviceShard>& shard) {
|
||||
shard->Init(batch, info);
|
||||
shard->Sketch(batch, info);
|
||||
});
|
||||
dh::ExecuteIndexShards(&shards_,
|
||||
[&](int idx, std::unique_ptr<DeviceShard>& shard) {
|
||||
shard->Init(batch, info, gpu_batch_nrows);
|
||||
shard->Sketch(batch, info);
|
||||
});
|
||||
|
||||
// merge the sketches from all shards
|
||||
// TODO(canonizer): do it in a tree-like reduction
|
||||
@@ -390,9 +392,9 @@ struct GPUSketcher {
|
||||
|
||||
void DeviceSketch
|
||||
(const SparsePage& batch, const MetaInfo& info,
|
||||
const tree::TrainParam& param, HistCutMatrix* hmat) {
|
||||
const tree::TrainParam& param, HistCutMatrix* hmat, int gpu_batch_nrows) {
|
||||
GPUSketcher sketcher(param, info.num_row_);
|
||||
sketcher.Sketch(batch, info, hmat);
|
||||
sketcher.Sketch(batch, info, hmat, gpu_batch_nrows);
|
||||
}
|
||||
|
||||
} // namespace common
|
||||
|
||||
@@ -72,7 +72,7 @@ struct HistCutMatrix {
|
||||
/*! \brief Builds the cut matrix on the GPU */
|
||||
void DeviceSketch
|
||||
(const SparsePage& batch, const MetaInfo& info,
|
||||
const tree::TrainParam& param, HistCutMatrix* hmat);
|
||||
const tree::TrainParam& param, HistCutMatrix* hmat, int gpu_batch_nrows);
|
||||
|
||||
/*!
|
||||
* \brief A single row in global histogram index.
|
||||
|
||||
Reference in New Issue
Block a user