diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu old mode 100644 new mode 100755 index 7b22c4420..cbc3b12bb --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -776,21 +776,15 @@ struct GPUHistMakerDevice { /** * \brief Build GPU local histograms for the left and right child of some parent node */ - void BuildHistLeftRight(int nidx_parent, int nidx_left, int nidx_right, dh::AllReducer* reducer) { + void BuildHistLeftRight(const ExpandEntry &candidate, int nidx_left, + int nidx_right, dh::AllReducer* reducer) { auto build_hist_nidx = nidx_left; auto subtraction_trick_nidx = nidx_right; - auto left_node_rows = row_partitioner->GetRows(nidx_left).size(); - auto right_node_rows = row_partitioner->GetRows(nidx_right).size(); + // Decide whether to build the left histogram or right histogram - // Find the largest number of training instances on any given device - // Assume this will be the bottleneck and avoid building this node if - // possible - std::vector max_reduce; - max_reduce.push_back(left_node_rows); - max_reduce.push_back(right_node_rows); - reducer->HostMaxAllReduce(&max_reduce); - bool fewer_right = max_reduce[1] < max_reduce[0]; + // Use sum of Hessian as a heuristic to select node with fewest training instances + bool fewer_right = candidate.split.right_sum.GetHess() < candidate.split.left_sum.GetHess(); if (fewer_right) { std::swap(build_hist_nidx, subtraction_trick_nidx); } @@ -800,11 +794,11 @@ struct GPUHistMakerDevice { // Check whether we can use the subtraction trick to calculate the other bool do_subtraction_trick = this->CanDoSubtractionTrick( - nidx_parent, build_hist_nidx, subtraction_trick_nidx); + candidate.nid, build_hist_nidx, subtraction_trick_nidx); if (do_subtraction_trick) { // Calculate other histogram using subtraction trick - this->SubtractionTrick(nidx_parent, build_hist_nidx, + this->SubtractionTrick(candidate.nid, build_hist_nidx, subtraction_trick_nidx); } else { // Calculate other histogram manually @@ -917,7 +911,7 @@ struct GPUHistMakerDevice { monitor.StopCuda("UpdatePosition"); monitor.StartCuda("BuildHist"); - this->BuildHistLeftRight(candidate.nid, left_child_nidx, right_child_nidx, reducer); + this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer); monitor.StopCuda("BuildHist"); monitor.StartCuda("EvaluateSplits"); diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu old mode 100644 new mode 100755 index 271ac8412..795f9420d --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -84,22 +84,3 @@ void TestAllocator() { TEST(bulkAllocator, Test) { TestAllocator(); } - - // Test thread safe max reduction -#if defined(XGBOOST_USE_NCCL) -TEST(AllReducer, MGPU_HostMaxAllReduce) { - dh::AllReducer reducer; - size_t num_threads = 50; - std::vector> thread_data(num_threads); -#pragma omp parallel num_threads(num_threads) - { - int tid = omp_get_thread_num(); - thread_data[tid] = {size_t(tid)}; - reducer.HostMaxAllReduce(&thread_data[tid]); - } - - for (auto data : thread_data) { - ASSERT_EQ(data.front(), num_threads - 1); - } -} -#endif