Remove various synchronisations from cuda API calls, instrument monitor (#4205)

* Remove various synchronisations from cuda API calls, instrument monitor
with nvtx profiler ranges.
This commit is contained in:
Rory Mitchell
2019-03-10 15:01:23 +13:00
committed by GitHub
parent f83e62dca5
commit 4eeeded7d1
9 changed files with 116 additions and 104 deletions

View File

@@ -290,14 +290,14 @@ struct GPUSketcher {
offset_vec[row_begin_ + batch_row_begin];
// copy the batch to the GPU
dh::safe_cuda
(cudaMemcpy(entries_.data().get(),
(cudaMemcpyAsync(entries_.data().get(),
data_vec.data() + offset_vec[row_begin_ + batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault));
// copy the weights if necessary
if (has_weights_) {
const auto& weights_vec = info.weights_.HostVector();
dh::safe_cuda
(cudaMemcpy(weights_.data().get(),
(cudaMemcpyAsync(weights_.data().get(),
weights_vec.data() + row_begin_ + batch_row_begin,
batch_nrows * sizeof(bst_float), cudaMemcpyDefault));
}
@@ -315,15 +315,11 @@ struct GPUSketcher {
has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
gpu_batch_nrows_, num_cols_,
offset_vec[row_begin_ + batch_row_begin], batch_nrows);
dh::safe_cuda(cudaGetLastError()); // NOLINT
dh::safe_cuda(cudaDeviceSynchronize()); // NOLINT
for (int icol = 0; icol < num_cols_; ++icol) {
FindColumnCuts(batch_nrows, icol);
}
dh::safe_cuda(cudaDeviceSynchronize()); // NOLINT
// add cuts into sketches
thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin());
for (int icol = 0; icol < num_cols_; ++icol) {