gpu_hist performance fixes (#5558)

* Remove unnecessary CUDA API calls

* Fix histogram memory growth
Author:     Rory Mitchell
Date:       2020-04-19 12:21:13 +12:00
Committer:  GitHub
Parent:     e1f22baf8c
Commit:     d6d1035950
7 changed files with 52 additions and 109 deletions

src/tree/gpu_hist/row_partitioner.cuh

@@ -108,7 +108,6 @@ class RowPartitioner {
   template <typename UpdatePositionOpT>
   void UpdatePosition(bst_node_t nidx, bst_node_t left_nidx,
                       bst_node_t right_nidx, UpdatePositionOpT op) {
-    dh::safe_cuda(cudaSetDevice(device_idx_));
     Segment segment = ridx_segments_.at(nidx);  // rows belongs to node nidx
     auto d_ridx = ridx_.CurrentSpan();
     auto d_position = position_.CurrentSpan();
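The deleted cudaSetDevice call was redundant rather than wrong: the CUDA runtime binds the active device per host thread, and the partitioner's device is already selected by the caller, so each extra call only added driver overhead on a hot path. A minimal standalone sketch (not XGBoost code; names invented here) of the property the change relies on:

#include <cuda_runtime.h>

// Trivial kernel so there is something to launch repeatedly.
__global__ void Touch(float* data, size_t n) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] += 1.0f;
}

int main() {
  const size_t n = 1024;
  float* d_data;
  cudaSetDevice(0);  // bind the device once for this host thread
  cudaMalloc(&d_data, n * sizeof(float));
  cudaMemset(d_data, 0, n * sizeof(float));
  for (int iter = 0; iter < 100; ++iter) {
    // No per-launch cudaSetDevice: the binding above is still in effect.
    Touch<<<4, 256>>>(d_data, n);
  }
  cudaDeviceSynchronize();
  cudaFree(d_data);
  return 0;
}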

src/tree/updater_gpu_hist.cu

@@ -2,9 +2,6 @@
  * Copyright 2017-2020 XGBoost contributors
  */
 #include <thrust/copy.h>
-#include <thrust/functional.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <thrust/reduce.h>
 #include <xgboost/tree_updater.h>
 #include <algorithm>
@@ -20,8 +17,6 @@
#include "xgboost/span.h"
#include "xgboost/json.h"
#include "../common/common.h"
#include "../common/compressed_iterator.h"
#include "../common/device_helpers.cuh"
#include "../common/hist_util.h"
#include "../common/timer.h"
@@ -324,9 +319,9 @@ class DeviceHistogram {
   }
   void Reset() {
-    dh::safe_cuda(cudaMemsetAsync(
-        data_.data().get(), 0,
-        data_.size() * sizeof(typename decltype(data_)::value_type)));
+    auto d_data = data_.data().get();
+    dh::LaunchN(device_id_, data_.size(),
+                [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
     nidx_map_.clear();
   }
   bool HistogramExists(int nidx) const {
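The new Reset() zeroes the buffer with a device-side fill instead of a cudaMemsetAsync API call. A rough sketch of what a LaunchN-style helper does, assuming a grid-stride kernel (the helper names are illustrative, not the actual dh:: implementation; device lambdas like this need nvcc's --extended-lambda):

#include <cuda_runtime.h>
#include <cstddef>

template <typename L>
__global__ void LaunchNKernel(size_t n, L op) {
  // Grid-stride loop: covers all n indices regardless of grid size.
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += static_cast<size_t>(gridDim.x) * blockDim.x) {
    op(i);
  }
}

template <typename L>
void LaunchN(int device_id, size_t n, L op) {
  if (n == 0) return;
  cudaSetDevice(device_id);
  constexpr unsigned kBlock = 256;
  unsigned grid = static_cast<unsigned>((n + kBlock - 1) / kBlock);
  LaunchNKernel<<<grid, kBlock>>>(n, op);
}

// Usage mirroring the Reset() above:
//   auto d_data = data_.data().get();
//   LaunchN(device_id_, data_.size(),
//           [=] __device__(size_t idx) { d_data[idx] = 0.0f; });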
@@ -348,30 +343,33 @@ class DeviceHistogram {
     // Number of items currently used in data
     const size_t used_size = nidx_map_.size() * HistogramSize();
     const size_t new_used_size = used_size + HistogramSize();
-    dh::safe_cuda(cudaSetDevice(device_id_));
     if (data_.size() >= kStopGrowingSize) {
       // Recycle histogram memory
       if (new_used_size <= data_.size()) {
         // no need to remove old node, just insert the new one.
         nidx_map_[nidx] = used_size;
-        // memset histogram size in bytes
-        dh::safe_cuda(cudaMemsetAsync(data_.data().get() + used_size, 0,
-                                      n_bins_ * sizeof(GradientSumT)));
       } else {
         std::pair<int, size_t> old_entry = *nidx_map_.begin();
         nidx_map_.erase(old_entry.first);
-        dh::safe_cuda(cudaMemsetAsync(data_.data().get() + old_entry.second, 0,
-                                      n_bins_ * sizeof(GradientSumT)));
         nidx_map_[nidx] = old_entry.second;
       }
+      // Zero recycled memory
+      auto d_data = data_.data().get() + nidx_map_[nidx];
+      dh::LaunchN(device_id_, n_bins_ * 2,
+                  [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
     } else {
       // Append new node histogram
       nidx_map_[nidx] = used_size;
-      size_t new_required_memory = std::max(data_.size() * 2, HistogramSize());
-      if (data_.size() < new_required_memory) {
+      // Check there is enough memory for another histogram node
+      if (data_.size() < new_used_size + HistogramSize()) {
+        size_t new_required_memory =
+            std::max(data_.size() * 2, HistogramSize());
         data_.resize(new_required_memory);
       }
     }
     CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
   }
   /**
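This hunk carries the "fix histogram memory growth" part of the commit. The old branch computed new_required_memory = std::max(data_.size() * 2, HistogramSize()) unconditionally, so once data_ was non-empty the test data_.size() < new_required_memory was always true and the buffer doubled on every appended node. The new branch doubles only when another histogram would no longer fit. A small host-side simulation (plain C++ with hypothetical sizes, not XGBoost code) contrasting the two policies:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t hist_size = 256;  // elements per node histogram (assumed)
  std::size_t old_cap = 0, new_cap = 0;
  for (int node = 0; node < 8; ++node) {
    std::size_t used = static_cast<std::size_t>(node) * hist_size;
    std::size_t new_used = used + hist_size;
    // Old policy: the doubled size always exceeds the current size,
    // so capacity grows on every single allocation.
    std::size_t required = std::max(old_cap * 2, hist_size);
    if (old_cap < required) old_cap = required;
    // New policy: grow only when there is no room for another histogram.
    if (new_cap < new_used + hist_size) {
      new_cap = std::max(new_cap * 2, hist_size);
    }
    std::printf("node %d: old capacity %zu, new capacity %zu\n", node,
                old_cap, new_cap);
  }
  return 0;
}

After eight nodes the old policy has reserved 32768 elements against 2048 actually in use, while the new policy sits at 4096.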
@@ -428,7 +426,6 @@ struct GPUHistMakerDevice {
   GradientSumT histogram_rounding;
-  dh::CubMemory temp_memory;
   dh::PinnedMemory pinned_memory;
   std::vector<cudaStream_t> streams{};
@@ -531,15 +528,14 @@ struct GPUHistMakerDevice {
   std::vector<DeviceSplitCandidate> EvaluateSplits(
       std::vector<int> nidxs, const RegTree& tree,
       size_t num_columns) {
-    dh::safe_cuda(cudaSetDevice(device_id));
     auto result_all = pinned_memory.GetSpan<DeviceSplitCandidate>(nidxs.size());
     // Work out cub temporary memory requirement
     GPUTrainingParam gpu_param(param);
     DeviceSplitCandidateReduceOp op(gpu_param);
-    dh::caching_device_vector<DeviceSplitCandidate> d_result_all(nidxs.size());
-    dh::caching_device_vector<DeviceSplitCandidate> split_candidates_all(nidxs.size()*num_columns);
+    dh::TemporaryArray<DeviceSplitCandidate> d_result_all(nidxs.size());
+    dh::TemporaryArray<DeviceSplitCandidate> split_candidates_all(nidxs.size()*num_columns);
     auto& streams = this->GetStreams(nidxs.size());
     for (auto i = 0ull; i < nidxs.size(); i++) {
@@ -582,7 +578,7 @@ struct GPUHistMakerDevice {
                                 cub_bytes, d_split_candidates.data(),
                                 d_result.data(), d_split_candidates.size(), op,
                                 DeviceSplitCandidate(), streams[i]);
-      dh::caching_device_vector<char> cub_temp(cub_bytes);
+      dh::TemporaryArray<char> cub_temp(cub_bytes);
       cub::DeviceReduce::Reduce(reinterpret_cast<void*>(cub_temp.data().get()),
                                 cub_bytes, d_split_candidates.data(),
                                 d_result.data(), d_split_candidates.size(), op,
                                 DeviceSplitCandidate(), streams[i]);
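The surrounding code follows CUB's standard two-phase protocol: a first DeviceReduce::Reduce call with a null temporary-storage pointer only writes the required byte count into cub_bytes, and the second call performs the reduction using scratch memory of that size. A self-contained example of the same pattern (the int data and cub::Sum operator are illustrative, not the DeviceSplitCandidate reduction):

#include <cub/cub.cuh>
#include <thrust/device_vector.h>
#include <cstdio>

int main() {
  thrust::device_vector<int> in(1000, 1);
  thrust::device_vector<int> out(1);
  void* d_temp = nullptr;
  size_t cub_bytes = 0;
  // Phase 1: d_temp == nullptr, so this only computes cub_bytes.
  cub::DeviceReduce::Reduce(d_temp, cub_bytes, in.data().get(),
                            out.data().get(), static_cast<int>(in.size()),
                            cub::Sum(), 0);
  thrust::device_vector<char> temp(cub_bytes);
  // Phase 2: run the reduction with the allocated scratch space.
  cub::DeviceReduce::Reduce(reinterpret_cast<void*>(temp.data().get()),
                            cub_bytes, in.data().get(), out.data().get(),
                            static_cast<int>(in.size()), cub::Sum(), 0);
  std::printf("sum = %d\n", static_cast<int>(out[0]));
  return 0;
}

Swapping caching_device_vector for TemporaryArray changes only where the scratch bytes come from; the reduction calls themselves are untouched.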
@@ -651,9 +647,8 @@ struct GPUHistMakerDevice {
   // instances to their final leaf. This information is used later to update the
   // prediction cache
   void FinalisePosition(RegTree const* p_tree, DMatrix* p_fmat) {
-    const auto d_nodes =
-        temp_memory.GetSpan<RegTree::Node>(p_tree->GetNodes().size());
-    dh::safe_cuda(cudaMemcpy(d_nodes.data(), p_tree->GetNodes().data(),
+    dh::TemporaryArray<RegTree::Node> d_nodes(p_tree->GetNodes().size());
+    dh::safe_cuda(cudaMemcpy(d_nodes.data().get(), p_tree->GetNodes().data(),
                              d_nodes.size() * sizeof(RegTree::Node),
                              cudaMemcpyHostToDevice));
@@ -662,10 +657,10 @@ struct GPUHistMakerDevice {
       row_partitioner.reset(new RowPartitioner(device_id, p_fmat->Info().num_row_));
     }
     if (page->n_rows == p_fmat->Info().num_row_) {
-      FinalisePositionInPage(page, d_nodes);
+      FinalisePositionInPage(page, dh::ToSpan(d_nodes));
     } else {
       for (auto& batch : p_fmat->GetBatches<EllpackPage>(batch_param)) {
-        FinalisePositionInPage(batch.Impl(), d_nodes);
+        FinalisePositionInPage(batch.Impl(), dh::ToSpan(d_nodes));
       }
     }
   }
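FinalisePositionInPage still receives a non-owning view, so the owning TemporaryArray is wrapped with dh::ToSpan at each call site. A rough sketch of what such an adapter amounts to, using a stand-in Span struct in place of xgboost's common::Span:

#include <thrust/device_vector.h>
#include <cstddef>

// Minimal non-owning view, standing in for xgboost::common::Span.
template <typename T>
struct Span {
  T* data;
  std::size_t size;
};

// The adapter exposes the vector's raw device pointer and length;
// the vector keeps ownership, so the span must not outlive it.
template <typename T, typename Alloc>
Span<T> ToSpan(thrust::device_vector<T, Alloc>& vec) {
  return Span<T>{vec.data().get(), vec.size()};
}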