Combine thread launches into single launch per tree for gpu_hist (#4343)

* Combine thread launches into single launch per tree for gpu_hist
algorithm.

* Address deprecation warning

* Add manual column sampler constructor

* Turn off omp dynamic to get a guaranteed number of threads

* Enable openmp in cuda code
This commit is contained in:
Rory Mitchell
2019-04-29 09:58:34 +12:00
committed by GitHub
parent 146e83f3b3
commit 5e582b0fa7
10 changed files with 402 additions and 325 deletions

View File

@@ -1,3 +1,4 @@
#include <valarray>
#include "../../../src/common/random.h"
#include "../helpers.h"
#include "gtest/gtest.h"
@@ -33,7 +34,8 @@ TEST(ColumnSampler, Test) {
// No level or node sampling, should be the same at different depth
cs.Init(n, 1.0f, 1.0f, 0.5f);
ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(), cs.GetFeatureSet(1)->HostVector());
ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(),
cs.GetFeatureSet(1)->HostVector());
cs.Init(n, 1.0f, 1.0f, 1.0f);
auto set5 = *cs.GetFeatureSet(0);
@@ -45,7 +47,34 @@ TEST(ColumnSampler, Test) {
// Should always be a minimum of one feature
cs.Init(n, 1e-16f, 1e-16f, 1e-16f);
ASSERT_EQ(cs.GetFeatureSet(0)->Size(), 1);
}
// Test if different threads using the same seed produce the same result
TEST(ColumnSampler, ThreadSynchronisation) {
const int64_t num_threads = 100;
int n = 128;
int iterations = 10;
int levels = 5;
std::vector<int> reference_result;
bool success =
true; // Cannot use google test asserts in multithreaded region
#pragma omp parallel num_threads(num_threads)
{
for (auto j = 0ull; j < iterations; j++) {
ColumnSampler cs(j);
cs.Init(n, 0.5f, 0.5f, 0.5f);
for (auto level = 0ull; level < levels; level++) {
auto result = cs.GetFeatureSet(level)->ConstHostVector();
#pragma omp single
{ reference_result = result; }
if (result != reference_result) {
success = false;
}
#pragma omp barrier
}
}
}
ASSERT_TRUE(success);
}
} // namespace common
} // namespace xgboost

View File

@@ -89,7 +89,7 @@ TEST(GpuHist, BuildGidxDense) {
param.n_gpus = 1;
param.max_leaves = 0;
DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param);
DeviceShard<GradientPairPrecise> shard(0, 0, 0, kNRows, param, kNCols);
BuildGidx(&shard, kNRows, kNCols);
std::vector<common::CompressedByteT> h_gidx_buffer(shard.gidx_buffer.size());
@@ -128,7 +128,7 @@ TEST(GpuHist, BuildGidxSparse) {
param.n_gpus = 1;
param.max_leaves = 0;
DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param);
DeviceShard<GradientPairPrecise> shard(0, 0, 0, kNRows, param, kNCols);
BuildGidx(&shard, kNRows, kNCols, 0.9f);
std::vector<common::CompressedByteT> h_gidx_buffer(shard.gidx_buffer.size());
@@ -172,7 +172,7 @@ void TestBuildHist(GPUHistBuilderBase<GradientSumT>& builder) {
param.n_gpus = 1;
param.max_leaves = 0;
DeviceShard<GradientSumT> shard(0, 0, kNRows, param);
DeviceShard<GradientSumT> shard(0, 0, 0, kNRows, param, kNCols);
BuildGidx(&shard, kNRows, kNCols);
@@ -282,8 +282,8 @@ TEST(GpuHist, EvaluateSplits) {
int max_bins = 4;
// Initialize DeviceShard
std::unique_ptr<DeviceShard<GradientPairPrecise>> shard {
new DeviceShard<GradientPairPrecise>(0, 0, kNRows, param)};
std::unique_ptr<DeviceShard<GradientPairPrecise>> shard{
new DeviceShard<GradientPairPrecise>(0, 0, 0, kNRows, param, kNCols)};
// Initialize DeviceShard::node_sum_gradients
shard->node_sum_gradients = {{6.4f, 12.8f}};
@@ -321,12 +321,7 @@ TEST(GpuHist, EvaluateSplits) {
thrust::copy(hist.begin(), hist.end(),
shard->hist.Data().begin());
// Initialize GPUHistMaker
GPUHistMakerSpecialised<GradientPairPrecise> hist_maker =
GPUHistMakerSpecialised<GradientPairPrecise>();
hist_maker.param_ = param;
hist_maker.shards_.push_back(std::move(shard));
hist_maker.column_sampler_.Init(kNCols,
shard->column_sampler.Init(kNCols,
param.colsample_bynode,
param.colsample_bylevel,
param.colsample_bytree,
@@ -337,13 +332,12 @@ TEST(GpuHist, EvaluateSplits) {
info.num_row_ = kNRows;
info.num_col_ = kNCols;
hist_maker.info_ = &info;
hist_maker.node_value_constraints_.resize(1);
hist_maker.node_value_constraints_[0].lower_bound = -1.0;
hist_maker.node_value_constraints_[0].upper_bound = 1.0;
shard->node_value_constraints.resize(1);
shard->node_value_constraints[0].lower_bound = -1.0;
shard->node_value_constraints[0].upper_bound = 1.0;
std::vector<DeviceSplitCandidate> res =
hist_maker.EvaluateSplits({ 0,0 }, &tree);
shard->EvaluateSplits({ 0,0 }, tree, kNCols);
ASSERT_EQ(res[0].findex, 7);
ASSERT_EQ(res[1].findex, 7);
@@ -368,7 +362,8 @@ TEST(GpuHist, ApplySplit) {
}
hist_maker.shards_.resize(1);
hist_maker.shards_[0].reset(new DeviceShard<GradientPairPrecise>(0, 0, kNRows, param));
hist_maker.shards_[0].reset(
new DeviceShard<GradientPairPrecise>(0, 0, 0, kNRows, param, kNCols));
auto& shard = hist_maker.shards_.at(0);
shard->ridx_segments.resize(3); // 3 nodes.
@@ -435,8 +430,8 @@ TEST(GpuHist, ApplySplit) {
shard->gidx_buffer.data(), num_symbols);
hist_maker.info_ = &info;
hist_maker.ApplySplit(candidate_entry, &tree);
hist_maker.UpdatePosition(candidate_entry, &tree);
shard->ApplySplit(candidate_entry, &tree);
shard->UpdatePosition(candidate_entry.nid, tree[candidate_entry.nid]);
ASSERT_FALSE(tree[kNId].IsLeaf());