Reduce device synchronisation (#5631)

* Reduce device synchronisation

* Initialise pinned memory
Author: Rory Mitchell
Date: 2020-05-07 21:19:46 +12:00
Committed by: GitHub
Parent: 9910265064
Commit: fcf57823b6
7 changed files with 260 additions and 118 deletions


@@ -0,0 +1,120 @@
/*!
 * Copyright 2020 by XGBoost Contributors
 */
#ifndef DRIVER_CUH_
#define DRIVER_CUH_
#include <xgboost/span.h>

#include <functional>
#include <ostream>
#include <queue>
#include <vector>

#include "../param.h"
#include "evaluate_splits.cuh"

namespace xgboost {
namespace tree {

struct ExpandEntry {
  int nid;
  int depth;
  DeviceSplitCandidate split;
  ExpandEntry() = default;
  XGBOOST_DEVICE ExpandEntry(int nid, int depth, DeviceSplitCandidate split)
      : nid(nid), depth(depth), split(std::move(split)) {}

  bool IsValid(const TrainParam& param, int num_leaves) const {
    if (split.loss_chg <= kRtEps) return false;
    if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {
      return false;
    }
    if (split.loss_chg < param.min_split_loss) {
      return false;
    }
    if (param.max_depth > 0 && depth == param.max_depth) {
      return false;
    }
    if (param.max_leaves > 0 && num_leaves == param.max_leaves) {
      return false;
    }
    return true;
  }

  static bool ChildIsValid(const TrainParam& param, int depth, int num_leaves) {
    if (param.max_depth > 0 && depth >= param.max_depth) return false;
    if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false;
    return true;
  }

  friend std::ostream& operator<<(std::ostream& os, const ExpandEntry& e) {
    os << "ExpandEntry: \n";
    os << "nidx: " << e.nid << "\n";
    os << "depth: " << e.depth << "\n";
    os << "loss: " << e.split.loss_chg << "\n";
    os << "left_sum: " << e.split.left_sum << "\n";
    os << "right_sum: " << e.split.right_sum << "\n";
    return os;
  }
};

inline bool DepthWise(const ExpandEntry& lhs, const ExpandEntry& rhs) {
  return lhs.depth > rhs.depth;  // favor small depth
}
inline bool LossGuide(const ExpandEntry& lhs, const ExpandEntry& rhs) {
  if (lhs.split.loss_chg == rhs.split.loss_chg) {
    return lhs.nid > rhs.nid;  // favor small timestamp (nid)
  } else {
    return lhs.split.loss_chg < rhs.split.loss_chg;  // favor large loss_chg
  }
}
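
// Note: std::priority_queue keeps the element that compares greatest under
// its comparator on top. With DepthWise, an entry of larger depth compares
// as "smaller", so the shallowest node is popped first; with LossGuide, the
// node with the largest loss_chg is popped first, ties broken towards the
// smaller (older) nid.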
// Drives execution of tree building on device
class Driver {
  using ExpandQueue =
      std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
                          std::function<bool(ExpandEntry, ExpandEntry)>>;

 public:
  explicit Driver(TrainParam::TreeGrowPolicy policy)
      : policy_(policy),
        queue_(policy == TrainParam::kDepthWise ? DepthWise : LossGuide) {}

  template <typename EntryIterT>
  void Push(EntryIterT begin, EntryIterT end) {
    for (auto it = begin; it != end; ++it) {
      const ExpandEntry& e = *it;
      if (e.split.loss_chg > kRtEps) {
        queue_.push(e);
      }
    }
  }
  void Push(const std::vector<ExpandEntry>& entries) {
    this->Push(entries.begin(), entries.end());
  }

  // Return the set of nodes to be expanded.
  // This set has no dependencies between entries, so they may be expanded in
  // parallel or asynchronously.
  std::vector<ExpandEntry> Pop() {
    if (queue_.empty()) return {};
    // Return a single entry for loss-guided mode
    if (policy_ == TrainParam::kLossGuide) {
      ExpandEntry e = queue_.top();
      queue_.pop();
      return {e};
    }
    // Return all nodes on the same level for depth-wise mode
    std::vector<ExpandEntry> result;
    ExpandEntry e = queue_.top();
    int level = e.depth;
    while (e.depth == level && !queue_.empty()) {
      queue_.pop();
      result.emplace_back(e);
      if (!queue_.empty()) {
        e = queue_.top();
      }
    }
    return result;
  }

 private:
  TrainParam::TreeGrowPolicy policy_;
  ExpandQueue queue_;
};

}  // namespace tree
}  // namespace xgboost

#endif  // DRIVER_CUH_
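
For orientation, a sketch of how a caller might drive this queue; ApplySplit
and EvaluateChildSplits below are hypothetical placeholders standing in for the
updater's real split-apply and split-evaluation steps, which are not part of
this diff. Because the entries returned by Pop() carry no dependencies on one
another, the per-node work can be queued back to back without synchronising
the device between nodes:

Driver driver(param.grow_policy);
driver.Push({root_entry});  // seed with an already-evaluated root split
int num_leaves = 1;
for (auto expand_set = driver.Pop(); !expand_set.empty();
     expand_set = driver.Pop()) {
  std::vector<ExpandEntry> children;
  for (const auto& e : expand_set) {
    if (!e.IsValid(param, num_leaves)) continue;
    ApplySplit(e);  // hypothetical: materialise the split in the tree
    ++num_leaves;   // splitting one leaf yields a net gain of one leaf
    if (ExpandEntry::ChildIsValid(param, e.depth + 1, num_leaves)) {
      auto pair = EvaluateChildSplits(e);  // hypothetical: scores both children
      children.push_back(pair.first);
      children.push_back(pair.second);
    }
  }
  driver.Push(children);  // entries with loss_chg <= kRtEps are filtered out
}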


@@ -61,6 +61,7 @@ class RowPartitioner {
   dh::caching_device_vector<int64_t>
       left_counts_;  // Useful to keep a bunch of zeroed memory for sort position
   std::vector<cudaStream_t> streams_;
+  dh::PinnedMemory pinned_;
  public:
   RowPartitioner(int device_idx, size_t num_rows);
@@ -129,12 +130,12 @@ class RowPartitioner {
       d_position[idx] = new_position;
     });
     // Overlap device-to-host memory copy (left_count) with the sort
-    int64_t left_count;
+    int64_t& left_count = pinned_.GetSpan<int64_t>(1)[0];
     dh::safe_cuda(cudaMemcpyAsync(&left_count, d_left_count, sizeof(int64_t),
                                   cudaMemcpyDeviceToHost, streams_[0]));
     SortPositionAndCopy(segment, left_nidx, right_nidx, d_left_count,
                         streams_[1]);
     dh::safe_cuda(cudaStreamSynchronize(streams_[0]));
     CHECK_LE(left_count, segment.Size());
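
The pinned_ member is what makes the left_count change above pay off:
cudaMemcpyAsync into ordinary pageable host memory degrades to an effectively
synchronous staged copy, while a page-locked destination lets the
device-to-host copy genuinely overlap the SortPositionAndCopy work queued on
streams_[1], and the final cudaStreamSynchronize then waits on a single stream
rather than the whole device. A minimal standalone sketch of that pattern in
plain CUDA (not XGBoost code; XGBoost's dh::PinnedMemory helper wraps the same
idea):

#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

int main() {
  int64_t* d_count = nullptr;
  cudaMalloc(&d_count, sizeof(int64_t));
  cudaMemset(d_count, 0, sizeof(int64_t));

  int64_t* h_count = nullptr;
  cudaMallocHost(&h_count, sizeof(int64_t));  // pinned (page-locked) host memory

  cudaStream_t copy_stream, work_stream;
  cudaStreamCreate(&copy_stream);
  cudaStreamCreate(&work_stream);

  // The copy runs asynchronously on copy_stream because h_count is pinned,
  // so independent kernels launched on work_stream can overlap it.
  cudaMemcpyAsync(h_count, d_count, sizeof(int64_t),
                  cudaMemcpyDeviceToHost, copy_stream);
  // ... launch independent work on work_stream here ...

  cudaStreamSynchronize(copy_stream);  // wait for this copy only, not the device
  printf("count = %lld\n", static_cast<long long>(*h_count));

  cudaFreeHost(h_count);
  cudaFree(d_count);
  cudaStreamDestroy(copy_stream);
  cudaStreamDestroy(work_stream);
  return 0;
}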