Use heuristic to select histogram node, avoid rabit call (#4951)

2019-10-18 11:33:54 +13:00 · 2019-10-18 11:33:54 +13:00 · 60748b2071
commit 60748b2071
parent 185e3f1916
2 changed files with 8 additions and 33 deletions
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@ -776,21 +776,15 @@ struct GPUHistMakerDevice {
  /**
   * \brief Build GPU local histograms for the left and right child of some parent node
   */
-  void BuildHistLeftRight(int nidx_parent, int nidx_left, int nidx_right, dh::AllReducer* reducer) {
+  void BuildHistLeftRight(const ExpandEntry &candidate, int nidx_left,
        int nidx_right, dh::AllReducer* reducer) {
    auto build_hist_nidx = nidx_left;
    auto subtraction_trick_nidx = nidx_right;
-    auto left_node_rows = row_partitioner->GetRows(nidx_left).size();
+
    auto right_node_rows = row_partitioner->GetRows(nidx_right).size();
    // Decide whether to build the left histogram or right histogram
-    // Find the largest number of training instances on any given device
+    // Use sum of Hessian as a heuristic to select node with fewest training instances
-    // Assume this will be the bottleneck and avoid building this node if
+    bool fewer_right = candidate.split.right_sum.GetHess() < candidate.split.left_sum.GetHess();
    // possible
    std::vector<size_t> max_reduce;
    max_reduce.push_back(left_node_rows);
    max_reduce.push_back(right_node_rows);
    reducer->HostMaxAllReduce(&max_reduce);
    bool fewer_right = max_reduce[1] < max_reduce[0];
    if (fewer_right) {
      std::swap(build_hist_nidx, subtraction_trick_nidx);
    }
@ -800,11 +794,11 @@ struct GPUHistMakerDevice {
    // Check whether we can use the subtraction trick to calculate the other
    bool do_subtraction_trick = this->CanDoSubtractionTrick(
-        nidx_parent, build_hist_nidx, subtraction_trick_nidx);
+         candidate.nid, build_hist_nidx, subtraction_trick_nidx);
    if (do_subtraction_trick) {
      // Calculate other histogram using subtraction trick
-      this->SubtractionTrick(nidx_parent, build_hist_nidx,
+      this->SubtractionTrick(candidate.nid, build_hist_nidx,
                             subtraction_trick_nidx);
    } else {
      // Calculate other histogram manually
@ -917,7 +911,7 @@ struct GPUHistMakerDevice {
        monitor.StopCuda("UpdatePosition");
        monitor.StartCuda("BuildHist");
-        this->BuildHistLeftRight(candidate.nid, left_child_nidx, right_child_nidx, reducer);
+        this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer);
        monitor.StopCuda("BuildHist");
        monitor.StartCuda("EvaluateSplits");
--- a/tests/cpp/common/test_device_helpers.cu
+++ b/tests/cpp/common/test_device_helpers.cu
@ -84,22 +84,3 @@ void TestAllocator() {
 TEST(bulkAllocator, Test) {
  TestAllocator();
 }
 // Test thread safe max reduction
 #if defined(XGBOOST_USE_NCCL)
 TEST(AllReducer, MGPU_HostMaxAllReduce) {
  dh::AllReducer reducer;
  size_t num_threads = 50;
  std::vector<std::vector<size_t>> thread_data(num_threads);
 #pragma omp parallel num_threads(num_threads)
  {
    int tid = omp_get_thread_num();
    thread_data[tid] = {size_t(tid)};
    reducer.HostMaxAllReduce(&thread_data[tid]);
  }
  for (auto data : thread_data) {
    ASSERT_EQ(data.front(), num_threads - 1);
  }
 }
 #endif