Avoid repeated cuda API call in GPU predictor and only synchronize used GPUs (#2936)

This commit is contained in:
PSEUDOTENSOR / Jonathan McKinney 2017-12-08 22:00:42 -05:00 committed by Rory Mitchell
parent e8a6597957
commit 4d36036fe6
3 changed files with 36 additions and 23 deletions

View File

@ -6,6 +6,7 @@
#include <iostream> #include <iostream>
#include <map> #include <map>
#include <string> #include <string>
#include <vector>
namespace xgboost { namespace xgboost {
namespace common { namespace common {
@ -63,11 +64,21 @@ struct Monitor {
this->label = label; this->label = label;
} }
void Start(const std::string &name) { timer_map[name].Start(); } void Start(const std::string &name) { timer_map[name].Start(); }
void Stop(const std::string &name) { void Start(const std::string &name, std::vector<int> dList) {
if (debug_verbose) { if (debug_verbose) {
#ifdef __CUDACC__ #ifdef __CUDACC__
#include "device_helpers.cuh" #include "device_helpers.cuh"
dh::synchronize_all(); dh::synchronize_n_devices(dList.size(), dList);
#endif
}
timer_map[name].Start();
}
void Stop(const std::string &name) { timer_map[name].Stop(); }
void Stop(const std::string &name, std::vector<int> dList) {
if (debug_verbose) {
#ifdef __CUDACC__
#include "device_helpers.cuh"
dh::synchronize_n_devices(dList.size(), dList);
#endif #endif
} }
timer_map[name].Stop(); timer_map[name].Stop();

View File

@ -303,7 +303,7 @@ class GPUPredictor : public xgboost::Predictor {
int shared_memory_bytes = static_cast<int>( int shared_memory_bytes = static_cast<int>(
sizeof(float) * device_matrix->p_mat->info().num_col * BLOCK_THREADS); sizeof(float) * device_matrix->p_mat->info().num_col * BLOCK_THREADS);
bool use_shared = true; bool use_shared = true;
if (shared_memory_bytes > dh::max_shared_memory(param.gpu_id)) { if (shared_memory_bytes > max_shared_memory_bytes) {
shared_memory_bytes = 0; shared_memory_bytes = 0;
use_shared = false; use_shared = false;
} }
@ -391,6 +391,7 @@ class GPUPredictor : public xgboost::Predictor {
Predictor::Init(cfg, cache); Predictor::Init(cfg, cache);
cpu_predictor->Init(cfg, cache); cpu_predictor->Init(cfg, cache);
param.InitAllowUnknown(cfg); param.InitAllowUnknown(cfg);
max_shared_memory_bytes = dh::max_shared_memory(param.gpu_id);
} }
private: private:
@ -401,6 +402,7 @@ class GPUPredictor : public xgboost::Predictor {
thrust::device_vector<DevicePredictionNode> nodes; thrust::device_vector<DevicePredictionNode> nodes;
thrust::device_vector<size_t> tree_segments; thrust::device_vector<size_t> tree_segments;
thrust::device_vector<int> tree_group; thrust::device_vector<int> tree_group;
size_t max_shared_memory_bytes;
}; };
XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor") XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor")
.describe("Make predictions using GPU.") .describe("Make predictions using GPU.")

View File

@ -506,7 +506,7 @@ class GPUHistMaker : public TreeUpdater {
} }
void Update(const std::vector<bst_gpair>& gpair, DMatrix* dmat, void Update(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
const std::vector<RegTree*>& trees) override { const std::vector<RegTree*>& trees) override {
monitor.Start("Update"); monitor.Start("Update", dList);
GradStats::CheckInfo(dmat->info()); GradStats::CheckInfo(dmat->info());
// rescale learning rate according to size of trees // rescale learning rate according to size of trees
float lr = param.learning_rate; float lr = param.learning_rate;
@ -521,16 +521,16 @@ class GPUHistMaker : public TreeUpdater {
LOG(FATAL) << "GPU plugin exception: " << e.what() << std::endl; LOG(FATAL) << "GPU plugin exception: " << e.what() << std::endl;
} }
param.learning_rate = lr; param.learning_rate = lr;
monitor.Stop("Update"); monitor.Stop("Update", dList);
} }
void InitDataOnce(DMatrix* dmat) { void InitDataOnce(DMatrix* dmat) {
info = &dmat->info(); info = &dmat->info();
monitor.Start("Quantiles"); monitor.Start("Quantiles", dList);
hmat_.Init(dmat, param.max_bin); hmat_.Init(dmat, param.max_bin);
gmat_.cut = &hmat_; gmat_.cut = &hmat_;
gmat_.Init(dmat); gmat_.Init(dmat);
monitor.Stop("Quantiles"); monitor.Stop("Quantiles", dList);
n_bins = hmat_.row_ptr.back(); n_bins = hmat_.row_ptr.back();
int n_devices = dh::n_devices(param.n_gpus, info->num_row); int n_devices = dh::n_devices(param.n_gpus, info->num_row);
@ -539,7 +539,7 @@ class GPUHistMaker : public TreeUpdater {
bst_uint shard_size = bst_uint shard_size =
std::ceil(static_cast<double>(info->num_row) / n_devices); std::ceil(static_cast<double>(info->num_row) / n_devices);
std::vector<int> dList(n_devices); dList.resize(n_devices);
for (int d_idx = 0; d_idx < n_devices; ++d_idx) { for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
int device_idx = (param.gpu_id + d_idx) % dh::n_visible_devices(); int device_idx = (param.gpu_id + d_idx) % dh::n_visible_devices();
dList[d_idx] = device_idx; dList[d_idx] = device_idx;
@ -574,23 +574,23 @@ class GPUHistMaker : public TreeUpdater {
void InitData(const std::vector<bst_gpair>& gpair, DMatrix* dmat, void InitData(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
const RegTree& tree) { const RegTree& tree) {
monitor.Start("InitDataOnce"); monitor.Start("InitDataOnce", dList);
if (!initialised) { if (!initialised) {
this->InitDataOnce(dmat); this->InitDataOnce(dmat);
} }
monitor.Stop("InitDataOnce"); monitor.Stop("InitDataOnce", dList);
column_sampler.Init(info->num_col, param); column_sampler.Init(info->num_col, param);
// Copy gpair & reset memory // Copy gpair & reset memory
monitor.Start("InitDataReset"); monitor.Start("InitDataReset", dList);
omp_set_num_threads(shards.size()); omp_set_num_threads(shards.size());
#pragma omp parallel #pragma omp parallel
{ {
auto cpu_thread_id = omp_get_thread_num(); auto cpu_thread_id = omp_get_thread_num();
shards[cpu_thread_id]->Reset(gpair); shards[cpu_thread_id]->Reset(gpair);
} }
monitor.Stop("InitDataReset"); monitor.Stop("InitDataReset", dList);
} }
void AllReduceHist(int nidx) { void AllReduceHist(int nidx) {
@ -807,12 +807,12 @@ class GPUHistMaker : public TreeUpdater {
auto& tree = *p_tree; auto& tree = *p_tree;
monitor.Start("InitData"); monitor.Start("InitData", dList);
this->InitData(gpair, p_fmat, *p_tree); this->InitData(gpair, p_fmat, *p_tree);
monitor.Stop("InitData"); monitor.Stop("InitData", dList);
monitor.Start("InitRoot"); monitor.Start("InitRoot", dList);
this->InitRoot(gpair, p_tree); this->InitRoot(gpair, p_tree);
monitor.Stop("InitRoot"); monitor.Stop("InitRoot", dList);
auto timestamp = qexpand_->size(); auto timestamp = qexpand_->size();
auto num_leaves = 1; auto num_leaves = 1;
@ -822,9 +822,9 @@ class GPUHistMaker : public TreeUpdater {
qexpand_->pop(); qexpand_->pop();
if (!candidate.IsValid(param, num_leaves)) continue; if (!candidate.IsValid(param, num_leaves)) continue;
// std::cout << candidate; // std::cout << candidate;
monitor.Start("ApplySplit"); monitor.Start("ApplySplit", dList);
this->ApplySplit(candidate, p_tree); this->ApplySplit(candidate, p_tree);
monitor.Stop("ApplySplit"); monitor.Stop("ApplySplit", dList);
num_leaves++; num_leaves++;
auto left_child_nidx = tree[candidate.nid].cleft(); auto left_child_nidx = tree[candidate.nid].cleft();
@ -833,12 +833,12 @@ class GPUHistMaker : public TreeUpdater {
// Only create child entries if needed // Only create child entries if needed
if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx), if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
num_leaves)) { num_leaves)) {
monitor.Start("BuildHist"); monitor.Start("BuildHist", dList);
this->BuildHistLeftRight(candidate.nid, left_child_nidx, this->BuildHistLeftRight(candidate.nid, left_child_nidx,
right_child_nidx); right_child_nidx);
monitor.Stop("BuildHist"); monitor.Stop("BuildHist", dList);
monitor.Start("EvaluateSplits"); monitor.Start("EvaluateSplits", dList);
auto splits = auto splits =
this->EvaluateSplits({left_child_nidx, right_child_nidx}, p_tree); this->EvaluateSplits({left_child_nidx, right_child_nidx}, p_tree);
qexpand_->push(ExpandEntry(left_child_nidx, qexpand_->push(ExpandEntry(left_child_nidx,
@ -847,10 +847,9 @@ class GPUHistMaker : public TreeUpdater {
qexpand_->push(ExpandEntry(right_child_nidx, qexpand_->push(ExpandEntry(right_child_nidx,
tree.GetDepth(right_child_nidx), splits[1], tree.GetDepth(right_child_nidx), splits[1],
timestamp++)); timestamp++));
monitor.Stop("EvaluateSplits"); monitor.Stop("EvaluateSplits", dList);
} }
} }
// Reset omp num threads // Reset omp num threads
omp_set_num_threads(nthread); omp_set_num_threads(nthread);
} }
@ -919,6 +918,7 @@ class GPUHistMaker : public TreeUpdater {
common::Monitor monitor; common::Monitor monitor;
dh::AllReducer reducer; dh::AllReducer reducer;
std::vector<ValueConstraint> node_value_constraints_; std::vector<ValueConstraint> node_value_constraints_;
std::vector<int> dList;
}; };
XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist") XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")