Use heuristic to select histogram node, avoid rabit call (#4951)
This commit is contained in:
parent
185e3f1916
commit
60748b2071
22
src/tree/updater_gpu_hist.cu
Normal file → Executable file
22
src/tree/updater_gpu_hist.cu
Normal file → Executable file
@ -776,21 +776,15 @@ struct GPUHistMakerDevice {
|
|||||||
/**
|
/**
|
||||||
* \brief Build GPU local histograms for the left and right child of some parent node
|
* \brief Build GPU local histograms for the left and right child of some parent node
|
||||||
*/
|
*/
|
||||||
void BuildHistLeftRight(int nidx_parent, int nidx_left, int nidx_right, dh::AllReducer* reducer) {
|
void BuildHistLeftRight(const ExpandEntry &candidate, int nidx_left,
|
||||||
|
int nidx_right, dh::AllReducer* reducer) {
|
||||||
auto build_hist_nidx = nidx_left;
|
auto build_hist_nidx = nidx_left;
|
||||||
auto subtraction_trick_nidx = nidx_right;
|
auto subtraction_trick_nidx = nidx_right;
|
||||||
|
|
||||||
auto left_node_rows = row_partitioner->GetRows(nidx_left).size();
|
|
||||||
auto right_node_rows = row_partitioner->GetRows(nidx_right).size();
|
|
||||||
// Decide whether to build the left histogram or right histogram
|
// Decide whether to build the left histogram or right histogram
|
||||||
// Find the largest number of training instances on any given device
|
// Use sum of Hessian as a heuristic to select node with fewest training instances
|
||||||
// Assume this will be the bottleneck and avoid building this node if
|
bool fewer_right = candidate.split.right_sum.GetHess() < candidate.split.left_sum.GetHess();
|
||||||
// possible
|
|
||||||
std::vector<size_t> max_reduce;
|
|
||||||
max_reduce.push_back(left_node_rows);
|
|
||||||
max_reduce.push_back(right_node_rows);
|
|
||||||
reducer->HostMaxAllReduce(&max_reduce);
|
|
||||||
bool fewer_right = max_reduce[1] < max_reduce[0];
|
|
||||||
if (fewer_right) {
|
if (fewer_right) {
|
||||||
std::swap(build_hist_nidx, subtraction_trick_nidx);
|
std::swap(build_hist_nidx, subtraction_trick_nidx);
|
||||||
}
|
}
|
||||||
@ -800,11 +794,11 @@ struct GPUHistMakerDevice {
|
|||||||
|
|
||||||
// Check whether we can use the subtraction trick to calculate the other
|
// Check whether we can use the subtraction trick to calculate the other
|
||||||
bool do_subtraction_trick = this->CanDoSubtractionTrick(
|
bool do_subtraction_trick = this->CanDoSubtractionTrick(
|
||||||
nidx_parent, build_hist_nidx, subtraction_trick_nidx);
|
candidate.nid, build_hist_nidx, subtraction_trick_nidx);
|
||||||
|
|
||||||
if (do_subtraction_trick) {
|
if (do_subtraction_trick) {
|
||||||
// Calculate other histogram using subtraction trick
|
// Calculate other histogram using subtraction trick
|
||||||
this->SubtractionTrick(nidx_parent, build_hist_nidx,
|
this->SubtractionTrick(candidate.nid, build_hist_nidx,
|
||||||
subtraction_trick_nidx);
|
subtraction_trick_nidx);
|
||||||
} else {
|
} else {
|
||||||
// Calculate other histogram manually
|
// Calculate other histogram manually
|
||||||
@ -917,7 +911,7 @@ struct GPUHistMakerDevice {
|
|||||||
monitor.StopCuda("UpdatePosition");
|
monitor.StopCuda("UpdatePosition");
|
||||||
|
|
||||||
monitor.StartCuda("BuildHist");
|
monitor.StartCuda("BuildHist");
|
||||||
this->BuildHistLeftRight(candidate.nid, left_child_nidx, right_child_nidx, reducer);
|
this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer);
|
||||||
monitor.StopCuda("BuildHist");
|
monitor.StopCuda("BuildHist");
|
||||||
|
|
||||||
monitor.StartCuda("EvaluateSplits");
|
monitor.StartCuda("EvaluateSplits");
|
||||||
|
|||||||
19
tests/cpp/common/test_device_helpers.cu
Normal file → Executable file
19
tests/cpp/common/test_device_helpers.cu
Normal file → Executable file
@ -84,22 +84,3 @@ void TestAllocator() {
|
|||||||
TEST(bulkAllocator, Test) {
|
TEST(bulkAllocator, Test) {
|
||||||
TestAllocator();
|
TestAllocator();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test thread safe max reduction
|
|
||||||
#if defined(XGBOOST_USE_NCCL)
|
|
||||||
TEST(AllReducer, MGPU_HostMaxAllReduce) {
|
|
||||||
dh::AllReducer reducer;
|
|
||||||
size_t num_threads = 50;
|
|
||||||
std::vector<std::vector<size_t>> thread_data(num_threads);
|
|
||||||
#pragma omp parallel num_threads(num_threads)
|
|
||||||
{
|
|
||||||
int tid = omp_get_thread_num();
|
|
||||||
thread_data[tid] = {size_t(tid)};
|
|
||||||
reducer.HostMaxAllReduce(&thread_data[tid]);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto data : thread_data) {
|
|
||||||
ASSERT_EQ(data.front(), num_threads - 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user