Combine thread launches into single launch per tree for gpu_hist (#4343)

* Combine thread launches into a single launch per tree for the gpu_hist algorithm.

* Address deprecation warning

* Add manual column sampler constructor

* Turn off OpenMP dynamic thread adjustment (omp dynamic) to get a guaranteed number of threads

* Enable OpenMP in CUDA code
This commit is contained in:
Rory Mitchell
2019-04-29 09:58:34 +12:00
committed by GitHub
parent 146e83f3b3
commit 5e582b0fa7
10 changed files with 402 additions and 325 deletions

View File

@@ -89,7 +89,7 @@ TEST(GpuHist, BuildGidxDense) {
param.n_gpus = 1;
param.max_leaves = 0;
DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param);
DeviceShard<GradientPairPrecise> shard(0, 0, 0, kNRows, param, kNCols);
BuildGidx(&shard, kNRows, kNCols);
std::vector<common::CompressedByteT> h_gidx_buffer(shard.gidx_buffer.size());
@@ -128,7 +128,7 @@ TEST(GpuHist, BuildGidxSparse) {
param.n_gpus = 1;
param.max_leaves = 0;
DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param);
DeviceShard<GradientPairPrecise> shard(0, 0, 0, kNRows, param, kNCols);
BuildGidx(&shard, kNRows, kNCols, 0.9f);
std::vector<common::CompressedByteT> h_gidx_buffer(shard.gidx_buffer.size());
@@ -172,7 +172,7 @@ void TestBuildHist(GPUHistBuilderBase<GradientSumT>& builder) {
param.n_gpus = 1;
param.max_leaves = 0;
DeviceShard<GradientSumT> shard(0, 0, kNRows, param);
DeviceShard<GradientSumT> shard(0, 0, 0, kNRows, param, kNCols);
BuildGidx(&shard, kNRows, kNCols);
@@ -282,8 +282,8 @@ TEST(GpuHist, EvaluateSplits) {
int max_bins = 4;
// Initialize DeviceShard
std::unique_ptr<DeviceShard<GradientPairPrecise>> shard {
new DeviceShard<GradientPairPrecise>(0, 0, kNRows, param)};
std::unique_ptr<DeviceShard<GradientPairPrecise>> shard{
new DeviceShard<GradientPairPrecise>(0, 0, 0, kNRows, param, kNCols)};
// Initialize DeviceShard::node_sum_gradients
shard->node_sum_gradients = {{6.4f, 12.8f}};
@@ -321,12 +321,7 @@ TEST(GpuHist, EvaluateSplits) {
thrust::copy(hist.begin(), hist.end(),
shard->hist.Data().begin());
// Initialize GPUHistMaker
GPUHistMakerSpecialised<GradientPairPrecise> hist_maker =
GPUHistMakerSpecialised<GradientPairPrecise>();
hist_maker.param_ = param;
hist_maker.shards_.push_back(std::move(shard));
hist_maker.column_sampler_.Init(kNCols,
shard->column_sampler.Init(kNCols,
param.colsample_bynode,
param.colsample_bylevel,
param.colsample_bytree,
@@ -337,13 +332,12 @@ TEST(GpuHist, EvaluateSplits) {
info.num_row_ = kNRows;
info.num_col_ = kNCols;
hist_maker.info_ = &info;
hist_maker.node_value_constraints_.resize(1);
hist_maker.node_value_constraints_[0].lower_bound = -1.0;
hist_maker.node_value_constraints_[0].upper_bound = 1.0;
shard->node_value_constraints.resize(1);
shard->node_value_constraints[0].lower_bound = -1.0;
shard->node_value_constraints[0].upper_bound = 1.0;
std::vector<DeviceSplitCandidate> res =
hist_maker.EvaluateSplits({ 0,0 }, &tree);
shard->EvaluateSplits({ 0,0 }, tree, kNCols);
ASSERT_EQ(res[0].findex, 7);
ASSERT_EQ(res[1].findex, 7);
@@ -368,7 +362,8 @@ TEST(GpuHist, ApplySplit) {
}
hist_maker.shards_.resize(1);
hist_maker.shards_[0].reset(new DeviceShard<GradientPairPrecise>(0, 0, kNRows, param));
hist_maker.shards_[0].reset(
new DeviceShard<GradientPairPrecise>(0, 0, 0, kNRows, param, kNCols));
auto& shard = hist_maker.shards_.at(0);
shard->ridx_segments.resize(3); // 3 nodes.
@@ -435,8 +430,8 @@ TEST(GpuHist, ApplySplit) {
shard->gidx_buffer.data(), num_symbols);
hist_maker.info_ = &info;
hist_maker.ApplySplit(candidate_entry, &tree);
hist_maker.UpdatePosition(candidate_entry, &tree);
shard->ApplySplit(candidate_entry, &tree);
shard->UpdatePosition(candidate_entry.nid, tree[candidate_entry.nid]);
ASSERT_FALSE(tree[kNId].IsLeaf());