Combine thread launches into single launch per tree for gpu_hist (#4343)

* Combine thread launches into single launch per tree for gpu_hist algorithm.
* Address deprecation warning
* Add manual column sampler constructor
* Turn off omp dynamic to get a guaranteed number of threads
* Enable openmp in cuda code
2019-04-29 09:58:34 +12:00
parent 146e83f3b3
commit 5e582b0fa7
10 changed files with 402 additions and 325 deletions
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -89,7 +89,7 @@ TEST(GpuHist, BuildGidxDense) {
  param.n_gpus = 1;
  param.max_leaves = 0;

-  DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param);
+  DeviceShard<GradientPairPrecise> shard(0, 0, 0, kNRows, param, kNCols);
  BuildGidx(&shard, kNRows, kNCols);

  std::vector<common::CompressedByteT> h_gidx_buffer(shard.gidx_buffer.size());
@@ -128,7 +128,7 @@ TEST(GpuHist, BuildGidxSparse) {
  param.n_gpus = 1;
  param.max_leaves = 0;

-  DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param);
+  DeviceShard<GradientPairPrecise> shard(0, 0, 0, kNRows, param, kNCols);
  BuildGidx(&shard, kNRows, kNCols, 0.9f);

  std::vector<common::CompressedByteT> h_gidx_buffer(shard.gidx_buffer.size());
@@ -172,7 +172,7 @@ void TestBuildHist(GPUHistBuilderBase<GradientSumT>& builder) {
  param.n_gpus = 1;
  param.max_leaves = 0;

-  DeviceShard<GradientSumT> shard(0, 0, kNRows, param);
+  DeviceShard<GradientSumT> shard(0, 0, 0, kNRows, param, kNCols);

  BuildGidx(&shard, kNRows, kNCols);

@@ -282,8 +282,8 @@ TEST(GpuHist, EvaluateSplits) {
  int max_bins = 4;

  // Initialize DeviceShard
-  std::unique_ptr<DeviceShard<GradientPairPrecise>> shard {
-    new DeviceShard<GradientPairPrecise>(0, 0, kNRows, param)};
+  std::unique_ptr<DeviceShard<GradientPairPrecise>> shard{
+      new DeviceShard<GradientPairPrecise>(0, 0, 0, kNRows, param, kNCols)};
  // Initialize DeviceShard::node_sum_gradients
  shard->node_sum_gradients = {{6.4f, 12.8f}};

@@ -321,12 +321,7 @@ TEST(GpuHist, EvaluateSplits) {
  thrust::copy(hist.begin(), hist.end(),
               shard->hist.Data().begin());

-  // Initialize GPUHistMaker
-  GPUHistMakerSpecialised<GradientPairPrecise> hist_maker =
-      GPUHistMakerSpecialised<GradientPairPrecise>();
-  hist_maker.param_ = param;
-  hist_maker.shards_.push_back(std::move(shard));
-  hist_maker.column_sampler_.Init(kNCols,
+  shard->column_sampler.Init(kNCols,
                                  param.colsample_bynode,
                                  param.colsample_bylevel,
                                  param.colsample_bytree,
@@ -337,13 +332,12 @@ TEST(GpuHist, EvaluateSplits) {
  info.num_row_ = kNRows;
  info.num_col_ = kNCols;

-  hist_maker.info_ = &info;
-  hist_maker.node_value_constraints_.resize(1);
-  hist_maker.node_value_constraints_[0].lower_bound = -1.0;
-  hist_maker.node_value_constraints_[0].upper_bound = 1.0;
+  shard->node_value_constraints.resize(1);
+  shard->node_value_constraints[0].lower_bound = -1.0;
+  shard->node_value_constraints[0].upper_bound = 1.0;

  std::vector<DeviceSplitCandidate> res =
-    hist_maker.EvaluateSplits({ 0,0 }, &tree);
+    shard->EvaluateSplits({ 0,0 }, tree, kNCols);

  ASSERT_EQ(res[0].findex, 7);
  ASSERT_EQ(res[1].findex, 7);
@@ -368,7 +362,8 @@ TEST(GpuHist, ApplySplit) {
  }

  hist_maker.shards_.resize(1);
-  hist_maker.shards_[0].reset(new DeviceShard<GradientPairPrecise>(0, 0, kNRows, param));
+  hist_maker.shards_[0].reset(
+      new DeviceShard<GradientPairPrecise>(0, 0, 0, kNRows, param, kNCols));

  auto& shard = hist_maker.shards_.at(0);
  shard->ridx_segments.resize(3);  // 3 nodes.
@@ -435,8 +430,8 @@ TEST(GpuHist, ApplySplit) {
      shard->gidx_buffer.data(), num_symbols);

  hist_maker.info_ = &info;
-  hist_maker.ApplySplit(candidate_entry, &tree);
-  hist_maker.UpdatePosition(candidate_entry, &tree);
+  shard->ApplySplit(candidate_entry, &tree);
+  shard->UpdatePosition(candidate_entry.nid, tree[candidate_entry.nid]);

  ASSERT_FALSE(tree[kNId].IsLeaf());