From 4d36036fe6fdc30ba4d72c84f3957a39a29ab23f Mon Sep 17 00:00:00 2001 From: PSEUDOTENSOR / Jonathan McKinney Date: Fri, 8 Dec 2017 22:00:42 -0500 Subject: [PATCH] Avoid repeated cuda API call in GPU predictor and only synchronize used GPUs (#2936) --- src/common/timer.h | 15 +++++++++++-- src/predictor/gpu_predictor.cu | 4 +++- src/tree/updater_gpu_hist.cu | 40 +++++++++++++++++----------------- 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/src/common/timer.h b/src/common/timer.h index 81eec7ab0..cbff04cc1 100644 --- a/src/common/timer.h +++ b/src/common/timer.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace xgboost { namespace common { @@ -63,11 +64,21 @@ struct Monitor { this->label = label; } void Start(const std::string &name) { timer_map[name].Start(); } - void Stop(const std::string &name) { + void Start(const std::string &name, std::vector dList) { if (debug_verbose) { #ifdef __CUDACC__ #include "device_helpers.cuh" - dh::synchronize_all(); + dh::synchronize_n_devices(dList.size(), dList); +#endif + } + timer_map[name].Start(); + } + void Stop(const std::string &name) { timer_map[name].Stop(); } + void Stop(const std::string &name, std::vector dList) { + if (debug_verbose) { +#ifdef __CUDACC__ +#include "device_helpers.cuh" + dh::synchronize_n_devices(dList.size(), dList); #endif } timer_map[name].Stop(); diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index f4f9f8cc6..d3d7cf421 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -303,7 +303,7 @@ class GPUPredictor : public xgboost::Predictor { int shared_memory_bytes = static_cast( sizeof(float) * device_matrix->p_mat->info().num_col * BLOCK_THREADS); bool use_shared = true; - if (shared_memory_bytes > dh::max_shared_memory(param.gpu_id)) { + if (shared_memory_bytes > max_shared_memory_bytes) { shared_memory_bytes = 0; use_shared = false; } @@ -391,6 +391,7 @@ class GPUPredictor : public xgboost::Predictor { Predictor::Init(cfg, cache); cpu_predictor->Init(cfg, cache); param.InitAllowUnknown(cfg); + max_shared_memory_bytes = dh::max_shared_memory(param.gpu_id); } private: @@ -401,6 +402,7 @@ class GPUPredictor : public xgboost::Predictor { thrust::device_vector nodes; thrust::device_vector tree_segments; thrust::device_vector tree_group; + size_t max_shared_memory_bytes; }; XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor") .describe("Make predictions using GPU.") diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 3e3700a10..6930ec1e2 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -506,7 +506,7 @@ class GPUHistMaker : public TreeUpdater { } void Update(const std::vector& gpair, DMatrix* dmat, const std::vector& trees) override { - monitor.Start("Update"); + monitor.Start("Update", dList); GradStats::CheckInfo(dmat->info()); // rescale learning rate according to size of trees float lr = param.learning_rate; @@ -521,16 +521,16 @@ class GPUHistMaker : public TreeUpdater { LOG(FATAL) << "GPU plugin exception: " << e.what() << std::endl; } param.learning_rate = lr; - monitor.Stop("Update"); + monitor.Stop("Update", dList); } void InitDataOnce(DMatrix* dmat) { info = &dmat->info(); - monitor.Start("Quantiles"); + monitor.Start("Quantiles", dList); hmat_.Init(dmat, param.max_bin); gmat_.cut = &hmat_; gmat_.Init(dmat); - monitor.Stop("Quantiles"); + monitor.Stop("Quantiles", dList); n_bins = hmat_.row_ptr.back(); int n_devices = dh::n_devices(param.n_gpus, info->num_row); @@ -539,7 +539,7 @@ class GPUHistMaker : public TreeUpdater { bst_uint shard_size = std::ceil(static_cast(info->num_row) / n_devices); - std::vector dList(n_devices); + dList.resize(n_devices); for (int d_idx = 0; d_idx < n_devices; ++d_idx) { int device_idx = (param.gpu_id + d_idx) % dh::n_visible_devices(); dList[d_idx] = device_idx; @@ -574,23 +574,23 @@ class GPUHistMaker : public TreeUpdater { void InitData(const std::vector& gpair, DMatrix* dmat, const RegTree& tree) { - monitor.Start("InitDataOnce"); + monitor.Start("InitDataOnce", dList); if (!initialised) { this->InitDataOnce(dmat); } - monitor.Stop("InitDataOnce"); + monitor.Stop("InitDataOnce", dList); column_sampler.Init(info->num_col, param); // Copy gpair & reset memory - monitor.Start("InitDataReset"); + monitor.Start("InitDataReset", dList); omp_set_num_threads(shards.size()); #pragma omp parallel { auto cpu_thread_id = omp_get_thread_num(); shards[cpu_thread_id]->Reset(gpair); } - monitor.Stop("InitDataReset"); + monitor.Stop("InitDataReset", dList); } void AllReduceHist(int nidx) { @@ -807,12 +807,12 @@ class GPUHistMaker : public TreeUpdater { auto& tree = *p_tree; - monitor.Start("InitData"); + monitor.Start("InitData", dList); this->InitData(gpair, p_fmat, *p_tree); - monitor.Stop("InitData"); - monitor.Start("InitRoot"); + monitor.Stop("InitData", dList); + monitor.Start("InitRoot", dList); this->InitRoot(gpair, p_tree); - monitor.Stop("InitRoot"); + monitor.Stop("InitRoot", dList); auto timestamp = qexpand_->size(); auto num_leaves = 1; @@ -822,9 +822,9 @@ class GPUHistMaker : public TreeUpdater { qexpand_->pop(); if (!candidate.IsValid(param, num_leaves)) continue; // std::cout << candidate; - monitor.Start("ApplySplit"); + monitor.Start("ApplySplit", dList); this->ApplySplit(candidate, p_tree); - monitor.Stop("ApplySplit"); + monitor.Stop("ApplySplit", dList); num_leaves++; auto left_child_nidx = tree[candidate.nid].cleft(); @@ -833,12 +833,12 @@ class GPUHistMaker : public TreeUpdater { // Only create child entries if needed if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx), num_leaves)) { - monitor.Start("BuildHist"); + monitor.Start("BuildHist", dList); this->BuildHistLeftRight(candidate.nid, left_child_nidx, right_child_nidx); - monitor.Stop("BuildHist"); + monitor.Stop("BuildHist", dList); - monitor.Start("EvaluateSplits"); + monitor.Start("EvaluateSplits", dList); auto splits = this->EvaluateSplits({left_child_nidx, right_child_nidx}, p_tree); qexpand_->push(ExpandEntry(left_child_nidx, @@ -847,10 +847,9 @@ class GPUHistMaker : public TreeUpdater { qexpand_->push(ExpandEntry(right_child_nidx, tree.GetDepth(right_child_nidx), splits[1], timestamp++)); - monitor.Stop("EvaluateSplits"); + monitor.Stop("EvaluateSplits", dList); } } - // Reset omp num threads omp_set_num_threads(nthread); } @@ -919,6 +918,7 @@ class GPUHistMaker : public TreeUpdater { common::Monitor monitor; dh::AllReducer reducer; std::vector node_value_constraints_; + std::vector dList; }; XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")