Remove various synchronisations from CUDA API calls, instrument monitor (#4205)

* Remove various synchronisations from CUDA API calls, instrument monitor
with NVTX profiler ranges.
This commit is contained in:
Rory Mitchell
2019-03-10 15:01:23 +13:00
committed by GitHub
parent f83e62dca5
commit 4eeeded7d1
9 changed files with 116 additions and 104 deletions

View File

@@ -308,7 +308,7 @@ class DVec {
}
safe_cuda(cudaSetDevice(this->DeviceIdx()));
if (other.DeviceIdx() == this->DeviceIdx()) {
dh::safe_cuda(cudaMemcpy(this->Data(), other.Data(),
dh::safe_cuda(cudaMemcpyAsync(this->Data(), other.Data(),
other.Size() * sizeof(T),
cudaMemcpyDeviceToDevice));
} else {
@@ -338,7 +338,7 @@ class DVec {
throw std::runtime_error(
"Cannot copy assign vector to dvec, sizes are different");
}
safe_cuda(cudaMemcpy(this->Data(), begin.get(), Size() * sizeof(T),
safe_cuda(cudaMemcpyAsync(this->Data(), begin.get(), Size() * sizeof(T),
cudaMemcpyDefault));
}
};

View File

@@ -290,14 +290,14 @@ struct GPUSketcher {
offset_vec[row_begin_ + batch_row_begin];
// copy the batch to the GPU
dh::safe_cuda
(cudaMemcpy(entries_.data().get(),
(cudaMemcpyAsync(entries_.data().get(),
data_vec.data() + offset_vec[row_begin_ + batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault));
// copy the weights if necessary
if (has_weights_) {
const auto& weights_vec = info.weights_.HostVector();
dh::safe_cuda
(cudaMemcpy(weights_.data().get(),
(cudaMemcpyAsync(weights_.data().get(),
weights_vec.data() + row_begin_ + batch_row_begin,
batch_nrows * sizeof(bst_float), cudaMemcpyDefault));
}
@@ -315,15 +315,11 @@ struct GPUSketcher {
has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
gpu_batch_nrows_, num_cols_,
offset_vec[row_begin_ + batch_row_begin], batch_nrows);
dh::safe_cuda(cudaGetLastError()); // NOLINT
dh::safe_cuda(cudaDeviceSynchronize()); // NOLINT
for (int icol = 0; icol < num_cols_; ++icol) {
FindColumnCuts(batch_nrows, icol);
}
dh::safe_cuda(cudaDeviceSynchronize()); // NOLINT
// add cuts into sketches
thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin());
for (int icol = 0; icol < num_cols_; ++icol) {

View File

@@ -74,14 +74,14 @@ struct HostDeviceVectorImpl {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
dh::safe_cuda(cudaMemcpy(data_.data().get(), begin + start_,
dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), begin + start_,
data_.size() * sizeof(T), cudaMemcpyDefault));
}
void GatherTo(thrust::device_ptr<T> begin) {
LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpy(begin.get() + start_, data_.data().get(),
dh::safe_cuda(cudaMemcpyAsync(begin.get() + start_, data_.data().get(),
proper_size_ * sizeof(T), cudaMemcpyDefault));
}
@@ -97,7 +97,7 @@ struct HostDeviceVectorImpl {
LazySyncDevice(GPUAccess::kWrite);
other->LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpy(data_.data().get(), other->data_.data().get(),
dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), other->data_.data().get(),
data_.size() * sizeof(T), cudaMemcpyDefault));
}

View File

@@ -8,7 +8,9 @@
#include <map>
#include <string>
#include "common.h"
#if defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
#include <nvToolsExt.h>
#endif
namespace xgboost {
namespace common {
@@ -45,9 +47,11 @@ struct Timer {
*/
struct Monitor {
private:
struct Statistics {
Timer timer;
size_t count{0};
uint64_t nvtx_id;
};
std::string label = "";
std::map<std::string, Statistics> statistics_map;
@@ -75,35 +79,37 @@ struct Monitor {
}
self_timer.Stop();
}
void Init(std::string label) {
this->label = label;
}
void Start(const std::string &name) { statistics_map[name].timer.Start(); }
void Start(const std::string &name, GPUSet devices) {
void Init(std::string label) { this->label = label; }
void Start(const std::string &name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
#ifdef __CUDACC__
for (auto device : devices) {
cudaSetDevice(device);
cudaDeviceSynchronize();
}
#endif // __CUDACC__
statistics_map[name].timer.Start();
}
statistics_map[name].timer.Start();
}
void Stop(const std::string &name) {
statistics_map[name].timer.Stop();
statistics_map[name].count++;
}
void Stop(const std::string &name, GPUSet devices) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
#ifdef __CUDACC__
for (auto device : devices) {
cudaSetDevice(device);
cudaDeviceSynchronize();
}
#endif // __CUDACC__
auto &stats = statistics_map[name];
stats.timer.Stop();
stats.count++;
}
}
void StartCuda(const std::string &name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
auto &stats = statistics_map[name];
stats.timer.Start();
#if defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
stats.nvtx_id = nvtxRangeStartA(name.c_str());
#endif
}
}
void StopCuda(const std::string &name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
auto &stats = statistics_map[name];
stats.timer.Stop();
stats.count++;
#if defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
nvtxRangeEnd(stats.nvtx_id);
#endif
}
this->Stop(name);
}
};
} // namespace common

View File

@@ -145,8 +145,6 @@ class Transform {
static_cast<int>(dh::DivRoundUp(*(range_.end()), kBlockThreads));
detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
_func, shard_range, UnpackHDV(_vectors, device)...);
dh::safe_cuda(cudaGetLastError());
dh::safe_cuda(cudaDeviceSynchronize());
}
}
#else