Avoid repeated cuda API call in GPU predictor and only synchronize used GPUs (#2936)
This commit is contained in:
parent
e8a6597957
commit
4d36036fe6
@ -6,6 +6,7 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace common {
|
namespace common {
|
||||||
@ -63,11 +64,21 @@ struct Monitor {
|
|||||||
this->label = label;
|
this->label = label;
|
||||||
}
|
}
|
||||||
void Start(const std::string &name) { timer_map[name].Start(); }
|
void Start(const std::string &name) { timer_map[name].Start(); }
|
||||||
void Stop(const std::string &name) {
|
void Start(const std::string &name, std::vector<int> dList) {
|
||||||
if (debug_verbose) {
|
if (debug_verbose) {
|
||||||
#ifdef __CUDACC__
|
#ifdef __CUDACC__
|
||||||
#include "device_helpers.cuh"
|
#include "device_helpers.cuh"
|
||||||
dh::synchronize_all();
|
dh::synchronize_n_devices(dList.size(), dList);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
timer_map[name].Start();
|
||||||
|
}
|
||||||
|
void Stop(const std::string &name) { timer_map[name].Stop(); }
|
||||||
|
void Stop(const std::string &name, std::vector<int> dList) {
|
||||||
|
if (debug_verbose) {
|
||||||
|
#ifdef __CUDACC__
|
||||||
|
#include "device_helpers.cuh"
|
||||||
|
dh::synchronize_n_devices(dList.size(), dList);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
timer_map[name].Stop();
|
timer_map[name].Stop();
|
||||||
|
|||||||
@ -303,7 +303,7 @@ class GPUPredictor : public xgboost::Predictor {
|
|||||||
int shared_memory_bytes = static_cast<int>(
|
int shared_memory_bytes = static_cast<int>(
|
||||||
sizeof(float) * device_matrix->p_mat->info().num_col * BLOCK_THREADS);
|
sizeof(float) * device_matrix->p_mat->info().num_col * BLOCK_THREADS);
|
||||||
bool use_shared = true;
|
bool use_shared = true;
|
||||||
if (shared_memory_bytes > dh::max_shared_memory(param.gpu_id)) {
|
if (shared_memory_bytes > max_shared_memory_bytes) {
|
||||||
shared_memory_bytes = 0;
|
shared_memory_bytes = 0;
|
||||||
use_shared = false;
|
use_shared = false;
|
||||||
}
|
}
|
||||||
@ -391,6 +391,7 @@ class GPUPredictor : public xgboost::Predictor {
|
|||||||
Predictor::Init(cfg, cache);
|
Predictor::Init(cfg, cache);
|
||||||
cpu_predictor->Init(cfg, cache);
|
cpu_predictor->Init(cfg, cache);
|
||||||
param.InitAllowUnknown(cfg);
|
param.InitAllowUnknown(cfg);
|
||||||
|
max_shared_memory_bytes = dh::max_shared_memory(param.gpu_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -401,6 +402,7 @@ class GPUPredictor : public xgboost::Predictor {
|
|||||||
thrust::device_vector<DevicePredictionNode> nodes;
|
thrust::device_vector<DevicePredictionNode> nodes;
|
||||||
thrust::device_vector<size_t> tree_segments;
|
thrust::device_vector<size_t> tree_segments;
|
||||||
thrust::device_vector<int> tree_group;
|
thrust::device_vector<int> tree_group;
|
||||||
|
size_t max_shared_memory_bytes;
|
||||||
};
|
};
|
||||||
XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor")
|
XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor")
|
||||||
.describe("Make predictions using GPU.")
|
.describe("Make predictions using GPU.")
|
||||||
|
|||||||
@ -506,7 +506,7 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
}
|
}
|
||||||
void Update(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
|
void Update(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
|
||||||
const std::vector<RegTree*>& trees) override {
|
const std::vector<RegTree*>& trees) override {
|
||||||
monitor.Start("Update");
|
monitor.Start("Update", dList);
|
||||||
GradStats::CheckInfo(dmat->info());
|
GradStats::CheckInfo(dmat->info());
|
||||||
// rescale learning rate according to size of trees
|
// rescale learning rate according to size of trees
|
||||||
float lr = param.learning_rate;
|
float lr = param.learning_rate;
|
||||||
@ -521,16 +521,16 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
LOG(FATAL) << "GPU plugin exception: " << e.what() << std::endl;
|
LOG(FATAL) << "GPU plugin exception: " << e.what() << std::endl;
|
||||||
}
|
}
|
||||||
param.learning_rate = lr;
|
param.learning_rate = lr;
|
||||||
monitor.Stop("Update");
|
monitor.Stop("Update", dList);
|
||||||
}
|
}
|
||||||
|
|
||||||
void InitDataOnce(DMatrix* dmat) {
|
void InitDataOnce(DMatrix* dmat) {
|
||||||
info = &dmat->info();
|
info = &dmat->info();
|
||||||
monitor.Start("Quantiles");
|
monitor.Start("Quantiles", dList);
|
||||||
hmat_.Init(dmat, param.max_bin);
|
hmat_.Init(dmat, param.max_bin);
|
||||||
gmat_.cut = &hmat_;
|
gmat_.cut = &hmat_;
|
||||||
gmat_.Init(dmat);
|
gmat_.Init(dmat);
|
||||||
monitor.Stop("Quantiles");
|
monitor.Stop("Quantiles", dList);
|
||||||
n_bins = hmat_.row_ptr.back();
|
n_bins = hmat_.row_ptr.back();
|
||||||
|
|
||||||
int n_devices = dh::n_devices(param.n_gpus, info->num_row);
|
int n_devices = dh::n_devices(param.n_gpus, info->num_row);
|
||||||
@ -539,7 +539,7 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
bst_uint shard_size =
|
bst_uint shard_size =
|
||||||
std::ceil(static_cast<double>(info->num_row) / n_devices);
|
std::ceil(static_cast<double>(info->num_row) / n_devices);
|
||||||
|
|
||||||
std::vector<int> dList(n_devices);
|
dList.resize(n_devices);
|
||||||
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
|
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
|
||||||
int device_idx = (param.gpu_id + d_idx) % dh::n_visible_devices();
|
int device_idx = (param.gpu_id + d_idx) % dh::n_visible_devices();
|
||||||
dList[d_idx] = device_idx;
|
dList[d_idx] = device_idx;
|
||||||
@ -574,23 +574,23 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
|
|
||||||
void InitData(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
|
void InitData(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
|
||||||
const RegTree& tree) {
|
const RegTree& tree) {
|
||||||
monitor.Start("InitDataOnce");
|
monitor.Start("InitDataOnce", dList);
|
||||||
if (!initialised) {
|
if (!initialised) {
|
||||||
this->InitDataOnce(dmat);
|
this->InitDataOnce(dmat);
|
||||||
}
|
}
|
||||||
monitor.Stop("InitDataOnce");
|
monitor.Stop("InitDataOnce", dList);
|
||||||
|
|
||||||
column_sampler.Init(info->num_col, param);
|
column_sampler.Init(info->num_col, param);
|
||||||
|
|
||||||
// Copy gpair & reset memory
|
// Copy gpair & reset memory
|
||||||
monitor.Start("InitDataReset");
|
monitor.Start("InitDataReset", dList);
|
||||||
omp_set_num_threads(shards.size());
|
omp_set_num_threads(shards.size());
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
auto cpu_thread_id = omp_get_thread_num();
|
auto cpu_thread_id = omp_get_thread_num();
|
||||||
shards[cpu_thread_id]->Reset(gpair);
|
shards[cpu_thread_id]->Reset(gpair);
|
||||||
}
|
}
|
||||||
monitor.Stop("InitDataReset");
|
monitor.Stop("InitDataReset", dList);
|
||||||
}
|
}
|
||||||
|
|
||||||
void AllReduceHist(int nidx) {
|
void AllReduceHist(int nidx) {
|
||||||
@ -807,12 +807,12 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
|
|
||||||
auto& tree = *p_tree;
|
auto& tree = *p_tree;
|
||||||
|
|
||||||
monitor.Start("InitData");
|
monitor.Start("InitData", dList);
|
||||||
this->InitData(gpair, p_fmat, *p_tree);
|
this->InitData(gpair, p_fmat, *p_tree);
|
||||||
monitor.Stop("InitData");
|
monitor.Stop("InitData", dList);
|
||||||
monitor.Start("InitRoot");
|
monitor.Start("InitRoot", dList);
|
||||||
this->InitRoot(gpair, p_tree);
|
this->InitRoot(gpair, p_tree);
|
||||||
monitor.Stop("InitRoot");
|
monitor.Stop("InitRoot", dList);
|
||||||
|
|
||||||
auto timestamp = qexpand_->size();
|
auto timestamp = qexpand_->size();
|
||||||
auto num_leaves = 1;
|
auto num_leaves = 1;
|
||||||
@ -822,9 +822,9 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
qexpand_->pop();
|
qexpand_->pop();
|
||||||
if (!candidate.IsValid(param, num_leaves)) continue;
|
if (!candidate.IsValid(param, num_leaves)) continue;
|
||||||
// std::cout << candidate;
|
// std::cout << candidate;
|
||||||
monitor.Start("ApplySplit");
|
monitor.Start("ApplySplit", dList);
|
||||||
this->ApplySplit(candidate, p_tree);
|
this->ApplySplit(candidate, p_tree);
|
||||||
monitor.Stop("ApplySplit");
|
monitor.Stop("ApplySplit", dList);
|
||||||
num_leaves++;
|
num_leaves++;
|
||||||
|
|
||||||
auto left_child_nidx = tree[candidate.nid].cleft();
|
auto left_child_nidx = tree[candidate.nid].cleft();
|
||||||
@ -833,12 +833,12 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
// Only create child entries if needed
|
// Only create child entries if needed
|
||||||
if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
|
if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
|
||||||
num_leaves)) {
|
num_leaves)) {
|
||||||
monitor.Start("BuildHist");
|
monitor.Start("BuildHist", dList);
|
||||||
this->BuildHistLeftRight(candidate.nid, left_child_nidx,
|
this->BuildHistLeftRight(candidate.nid, left_child_nidx,
|
||||||
right_child_nidx);
|
right_child_nidx);
|
||||||
monitor.Stop("BuildHist");
|
monitor.Stop("BuildHist", dList);
|
||||||
|
|
||||||
monitor.Start("EvaluateSplits");
|
monitor.Start("EvaluateSplits", dList);
|
||||||
auto splits =
|
auto splits =
|
||||||
this->EvaluateSplits({left_child_nidx, right_child_nidx}, p_tree);
|
this->EvaluateSplits({left_child_nidx, right_child_nidx}, p_tree);
|
||||||
qexpand_->push(ExpandEntry(left_child_nidx,
|
qexpand_->push(ExpandEntry(left_child_nidx,
|
||||||
@ -847,10 +847,9 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
qexpand_->push(ExpandEntry(right_child_nidx,
|
qexpand_->push(ExpandEntry(right_child_nidx,
|
||||||
tree.GetDepth(right_child_nidx), splits[1],
|
tree.GetDepth(right_child_nidx), splits[1],
|
||||||
timestamp++));
|
timestamp++));
|
||||||
monitor.Stop("EvaluateSplits");
|
monitor.Stop("EvaluateSplits", dList);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reset omp num threads
|
// Reset omp num threads
|
||||||
omp_set_num_threads(nthread);
|
omp_set_num_threads(nthread);
|
||||||
}
|
}
|
||||||
@ -919,6 +918,7 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
common::Monitor monitor;
|
common::Monitor monitor;
|
||||||
dh::AllReducer reducer;
|
dh::AllReducer reducer;
|
||||||
std::vector<ValueConstraint> node_value_constraints_;
|
std::vector<ValueConstraint> node_value_constraints_;
|
||||||
|
std::vector<int> dList;
|
||||||
};
|
};
|
||||||
|
|
||||||
XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")
|
XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user