Fixed the performance regression within EvaluateSplits(). (#3680)

- It turns out that creating a std::vector on every call is faster
  than calling cudaMallocHost()/cudaFreeHost() on every call.
This commit is contained in:
Andy Adinets 2018-09-08 04:48:45 +02:00 committed by Rory Mitchell
parent beab6e08dd
commit f606cb8ef4

View File

@ -747,6 +747,7 @@ class GPUHistMaker : public TreeUpdater {
struct ExpandEntry; struct ExpandEntry;
GPUHistMaker() : initialised_(false), p_last_fmat_(nullptr) {} GPUHistMaker() : initialised_(false), p_last_fmat_(nullptr) {}
void Init( void Init(
const std::vector<std::pair<std::string, std::string>>& args) override { const std::vector<std::pair<std::string, std::string>>& args) override {
param_.InitAllowUnknown(args); param_.InitAllowUnknown(args);
@ -919,9 +920,7 @@ class GPUHistMaker : public TreeUpdater {
const std::vector<int>& nidx_set, RegTree* p_tree) { const std::vector<int>& nidx_set, RegTree* p_tree) {
auto columns = info_->num_col_; auto columns = info_->num_col_;
std::vector<DeviceSplitCandidate> best_splits(nidx_set.size()); std::vector<DeviceSplitCandidate> best_splits(nidx_set.size());
DeviceSplitCandidate* candidate_splits; std::vector<DeviceSplitCandidate> candidate_splits(nidx_set.size() * columns);
dh::safe_cuda(cudaMallocHost(&candidate_splits, nidx_set.size() *
columns * sizeof(DeviceSplitCandidate)));
// Use first device // Use first device
auto& shard = shards_.front(); auto& shard = shards_.front();
dh::safe_cuda(cudaSetDevice(shard->device_idx)); dh::safe_cuda(cudaSetDevice(shard->device_idx));
@ -952,8 +951,8 @@ class GPUHistMaker : public TreeUpdater {
} }
dh::safe_cuda(cudaDeviceSynchronize()); dh::safe_cuda(cudaDeviceSynchronize());
dh::safe_cuda( dh::safe_cuda
cudaMemcpy(candidate_splits, shard->temp_memory.d_temp_storage, (cudaMemcpy(candidate_splits.data(), shard->temp_memory.d_temp_storage,
sizeof(DeviceSplitCandidate) * columns * nidx_set.size(), sizeof(DeviceSplitCandidate) * columns * nidx_set.size(),
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
for (auto i = 0; i < nidx_set.size(); i++) { for (auto i = 0; i < nidx_set.size(); i++) {
@ -965,7 +964,6 @@ class GPUHistMaker : public TreeUpdater {
} }
best_splits[i] = nidx_best; best_splits[i] = nidx_best;
} }
dh::safe_cuda(cudaFreeHost(candidate_splits));
return std::move(best_splits); return std::move(best_splits);
} }