Smarter choice of histogram construction for distributed gpu_hist (#4519)
* Smarter choice of histogram construction for distributed gpu_hist * Limit omp team size in ExecuteShards
@@ -95,3 +95,20 @@ void TestAllocator() {
// Smoke test: exercising the bulk allocator via the shared helper must
// complete without device errors or leaks.
TEST(bulkAllocator, Test) { TestAllocator(); }
// Test thread safe max reduction
|
||||
TEST(AllReducer, HostMaxAllReduce) {
|
||||
dh::AllReducer reducer;
|
||||
size_t num_threads = 50;
|
||||
std::vector<std::vector<size_t>> thread_data(num_threads);
|
||||
#pragma omp parallel num_threads(num_threads)
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
thread_data[tid] = {size_t(tid)};
|
||||
reducer.HostMaxAllReduce(&thread_data[tid]);
|
||||
}
|
||||
|
||||
for (auto data : thread_data) {
|
||||
ASSERT_EQ(data.front(), num_threads - 1);
|
||||
}
|
||||
}