Combine thread launches into single launch per tree for gpu_hist (#4343)

* Combine thread launches into single launch per tree for gpu_hist algorithm. * Address deprecation warning * Add manual column sampler constructor * Turn off omp dynamic to get a guaranteed number of threads * Enable openmp in cuda code
2019-04-29 09:58:34 +12:00
parent 146e83f3b3
commit 5e582b0fa7
10 changed files with 402 additions and 325 deletions
--- a/tests/cpp/common/test_random.cc
+++ b/tests/cpp/common/test_random.cc
@@ -1,3 +1,4 @@
+#include <valarray>
 #include "../../../src/common/random.h"
 #include "../helpers.h"
 #include "gtest/gtest.h"
@@ -33,7 +34,8 @@ TEST(ColumnSampler, Test) {

  // No level or node sampling, should be the same at different depth
  cs.Init(n, 1.0f, 1.0f, 0.5f);
-  ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(), cs.GetFeatureSet(1)->HostVector());
+  ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(),
+            cs.GetFeatureSet(1)->HostVector());

  cs.Init(n, 1.0f, 1.0f, 1.0f);
  auto set5 = *cs.GetFeatureSet(0);
@@ -45,7 +47,34 @@ TEST(ColumnSampler, Test) {
  // Should always be a minimum of one feature
  cs.Init(n, 1e-16f, 1e-16f, 1e-16f);
  ASSERT_EQ(cs.GetFeatureSet(0)->Size(), 1);
+}

+// Test if different threads using the same seed produce the same result
+TEST(ColumnSampler, ThreadSynchronisation) {
+  const int64_t num_threads = 100;
+  int n = 128;
+  int iterations = 10;
+  int levels = 5;
+  std::vector<int> reference_result;
+  bool success =
+      true;  // Cannot use google test asserts in multithreaded region
+#pragma omp parallel num_threads(num_threads)
+  {
+    for (auto j = 0ull; j < iterations; j++) {
+      ColumnSampler cs(j);
+      cs.Init(n, 0.5f, 0.5f, 0.5f);
+      for (auto level = 0ull; level < levels; level++) {
+        auto result = cs.GetFeatureSet(level)->ConstHostVector();
+#pragma omp single
+        { reference_result = result; }
+        if (result != reference_result) {
+          success = false;
+        }
+#pragma omp barrier
+      }
+    }
+  }
+  ASSERT_TRUE(success);
 }
 }  // namespace common
 }  // namespace xgboost
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -89,7 +89,7 @@ TEST(GpuHist, BuildGidxDense) {
  param.n_gpus = 1;
  param.max_leaves = 0;

-  DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param);
+  DeviceShard<GradientPairPrecise> shard(0, 0, 0, kNRows, param, kNCols);
  BuildGidx(&shard, kNRows, kNCols);

  std::vector<common::CompressedByteT> h_gidx_buffer(shard.gidx_buffer.size());
@@ -128,7 +128,7 @@ TEST(GpuHist, BuildGidxSparse) {
  param.n_gpus = 1;
  param.max_leaves = 0;

-  DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param);
+  DeviceShard<GradientPairPrecise> shard(0, 0, 0, kNRows, param, kNCols);
  BuildGidx(&shard, kNRows, kNCols, 0.9f);

  std::vector<common::CompressedByteT> h_gidx_buffer(shard.gidx_buffer.size());
@@ -172,7 +172,7 @@ void TestBuildHist(GPUHistBuilderBase<GradientSumT>& builder) {
  param.n_gpus = 1;
  param.max_leaves = 0;

-  DeviceShard<GradientSumT> shard(0, 0, kNRows, param);
+  DeviceShard<GradientSumT> shard(0, 0, 0, kNRows, param, kNCols);

  BuildGidx(&shard, kNRows, kNCols);

@@ -282,8 +282,8 @@ TEST(GpuHist, EvaluateSplits) {
  int max_bins = 4;

  // Initialize DeviceShard
-  std::unique_ptr<DeviceShard<GradientPairPrecise>> shard {
-    new DeviceShard<GradientPairPrecise>(0, 0, kNRows, param)};
+  std::unique_ptr<DeviceShard<GradientPairPrecise>> shard{
+      new DeviceShard<GradientPairPrecise>(0, 0, 0, kNRows, param, kNCols)};
  // Initialize DeviceShard::node_sum_gradients
  shard->node_sum_gradients = {{6.4f, 12.8f}};

@@ -321,12 +321,7 @@ TEST(GpuHist, EvaluateSplits) {
  thrust::copy(hist.begin(), hist.end(),
               shard->hist.Data().begin());

-  // Initialize GPUHistMaker
-  GPUHistMakerSpecialised<GradientPairPrecise> hist_maker =
-      GPUHistMakerSpecialised<GradientPairPrecise>();
-  hist_maker.param_ = param;
-  hist_maker.shards_.push_back(std::move(shard));
-  hist_maker.column_sampler_.Init(kNCols,
+  shard->column_sampler.Init(kNCols,
                                  param.colsample_bynode,
                                  param.colsample_bylevel,
                                  param.colsample_bytree,
@@ -337,13 +332,12 @@ TEST(GpuHist, EvaluateSplits) {
  info.num_row_ = kNRows;
  info.num_col_ = kNCols;

-  hist_maker.info_ = &info;
-  hist_maker.node_value_constraints_.resize(1);
-  hist_maker.node_value_constraints_[0].lower_bound = -1.0;
-  hist_maker.node_value_constraints_[0].upper_bound = 1.0;
+  shard->node_value_constraints.resize(1);
+  shard->node_value_constraints[0].lower_bound = -1.0;
+  shard->node_value_constraints[0].upper_bound = 1.0;

  std::vector<DeviceSplitCandidate> res =
-    hist_maker.EvaluateSplits({ 0,0 }, &tree);
+    shard->EvaluateSplits({ 0,0 }, tree, kNCols);

  ASSERT_EQ(res[0].findex, 7);
  ASSERT_EQ(res[1].findex, 7);
@@ -368,7 +362,8 @@ TEST(GpuHist, ApplySplit) {
  }

  hist_maker.shards_.resize(1);
-  hist_maker.shards_[0].reset(new DeviceShard<GradientPairPrecise>(0, 0, kNRows, param));
+  hist_maker.shards_[0].reset(
+      new DeviceShard<GradientPairPrecise>(0, 0, 0, kNRows, param, kNCols));

  auto& shard = hist_maker.shards_.at(0);
  shard->ridx_segments.resize(3);  // 3 nodes.
@@ -435,8 +430,8 @@ TEST(GpuHist, ApplySplit) {
      shard->gidx_buffer.data(), num_symbols);

  hist_maker.info_ = &info;
-  hist_maker.ApplySplit(candidate_entry, &tree);
-  hist_maker.UpdatePosition(candidate_entry, &tree);
+  shard->ApplySplit(candidate_entry, &tree);
+  shard->UpdatePosition(candidate_entry.nid, tree[candidate_entry.nid]);

  ASSERT_FALSE(tree[kNId].IsLeaf());