Remove various synchronisations from CUDA API calls, instrument monitor (#4205)
* Remove various synchronisations from CUDA API calls, instrument monitor with NVTX profiler ranges.
This commit is contained in:
@@ -308,7 +308,7 @@ class DVec {
|
||||
}
|
||||
safe_cuda(cudaSetDevice(this->DeviceIdx()));
|
||||
if (other.DeviceIdx() == this->DeviceIdx()) {
|
||||
dh::safe_cuda(cudaMemcpy(this->Data(), other.Data(),
|
||||
dh::safe_cuda(cudaMemcpyAsync(this->Data(), other.Data(),
|
||||
other.Size() * sizeof(T),
|
||||
cudaMemcpyDeviceToDevice));
|
||||
} else {
|
||||
@@ -338,7 +338,7 @@ class DVec {
|
||||
throw std::runtime_error(
|
||||
"Cannot copy assign vector to dvec, sizes are different");
|
||||
}
|
||||
safe_cuda(cudaMemcpy(this->Data(), begin.get(), Size() * sizeof(T),
|
||||
safe_cuda(cudaMemcpyAsync(this->Data(), begin.get(), Size() * sizeof(T),
|
||||
cudaMemcpyDefault));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -290,14 +290,14 @@ struct GPUSketcher {
|
||||
offset_vec[row_begin_ + batch_row_begin];
|
||||
// copy the batch to the GPU
|
||||
dh::safe_cuda
|
||||
(cudaMemcpy(entries_.data().get(),
|
||||
(cudaMemcpyAsync(entries_.data().get(),
|
||||
data_vec.data() + offset_vec[row_begin_ + batch_row_begin],
|
||||
n_entries * sizeof(Entry), cudaMemcpyDefault));
|
||||
// copy the weights if necessary
|
||||
if (has_weights_) {
|
||||
const auto& weights_vec = info.weights_.HostVector();
|
||||
dh::safe_cuda
|
||||
(cudaMemcpy(weights_.data().get(),
|
||||
(cudaMemcpyAsync(weights_.data().get(),
|
||||
weights_vec.data() + row_begin_ + batch_row_begin,
|
||||
batch_nrows * sizeof(bst_float), cudaMemcpyDefault));
|
||||
}
|
||||
@@ -315,15 +315,11 @@ struct GPUSketcher {
|
||||
has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
|
||||
gpu_batch_nrows_, num_cols_,
|
||||
offset_vec[row_begin_ + batch_row_begin], batch_nrows);
|
||||
dh::safe_cuda(cudaGetLastError()); // NOLINT
|
||||
dh::safe_cuda(cudaDeviceSynchronize()); // NOLINT
|
||||
|
||||
for (int icol = 0; icol < num_cols_; ++icol) {
|
||||
FindColumnCuts(batch_nrows, icol);
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaDeviceSynchronize()); // NOLINT
|
||||
|
||||
// add cuts into sketches
|
||||
thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin());
|
||||
for (int icol = 0; icol < num_cols_; ++icol) {
|
||||
|
||||
@@ -74,14 +74,14 @@ struct HostDeviceVectorImpl {
|
||||
// TODO(canonizer): avoid full copy of host data
|
||||
LazySyncDevice(GPUAccess::kWrite);
|
||||
SetDevice();
|
||||
dh::safe_cuda(cudaMemcpy(data_.data().get(), begin + start_,
|
||||
dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), begin + start_,
|
||||
data_.size() * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
void GatherTo(thrust::device_ptr<T> begin) {
|
||||
LazySyncDevice(GPUAccess::kRead);
|
||||
SetDevice();
|
||||
dh::safe_cuda(cudaMemcpy(begin.get() + start_, data_.data().get(),
|
||||
dh::safe_cuda(cudaMemcpyAsync(begin.get() + start_, data_.data().get(),
|
||||
proper_size_ * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
@@ -97,7 +97,7 @@ struct HostDeviceVectorImpl {
|
||||
LazySyncDevice(GPUAccess::kWrite);
|
||||
other->LazySyncDevice(GPUAccess::kRead);
|
||||
SetDevice();
|
||||
dh::safe_cuda(cudaMemcpy(data_.data().get(), other->data_.data().get(),
|
||||
dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), other->data_.data().get(),
|
||||
data_.size() * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,9 @@
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
#include "common.h"
|
||||
#if defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
|
||||
#include <nvToolsExt.h>
|
||||
#endif
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
@@ -45,9 +47,11 @@ struct Timer {
|
||||
*/
|
||||
|
||||
struct Monitor {
|
||||
private:
|
||||
struct Statistics {
|
||||
Timer timer;
|
||||
size_t count{0};
|
||||
uint64_t nvtx_id;
|
||||
};
|
||||
std::string label = "";
|
||||
std::map<std::string, Statistics> statistics_map;
|
||||
@@ -75,35 +79,37 @@ struct Monitor {
|
||||
}
|
||||
self_timer.Stop();
|
||||
}
|
||||
void Init(std::string label) {
|
||||
this->label = label;
|
||||
}
|
||||
void Start(const std::string &name) { statistics_map[name].timer.Start(); }
|
||||
void Start(const std::string &name, GPUSet devices) {
|
||||
void Init(std::string label) { this->label = label; }
|
||||
void Start(const std::string &name) {
|
||||
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
|
||||
#ifdef __CUDACC__
|
||||
for (auto device : devices) {
|
||||
cudaSetDevice(device);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
#endif // __CUDACC__
|
||||
statistics_map[name].timer.Start();
|
||||
}
|
||||
statistics_map[name].timer.Start();
|
||||
}
|
||||
void Stop(const std::string &name) {
|
||||
statistics_map[name].timer.Stop();
|
||||
statistics_map[name].count++;
|
||||
}
|
||||
void Stop(const std::string &name, GPUSet devices) {
|
||||
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
|
||||
#ifdef __CUDACC__
|
||||
for (auto device : devices) {
|
||||
cudaSetDevice(device);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
#endif // __CUDACC__
|
||||
auto &stats = statistics_map[name];
|
||||
stats.timer.Stop();
|
||||
stats.count++;
|
||||
}
|
||||
}
|
||||
void StartCuda(const std::string &name) {
|
||||
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
|
||||
auto &stats = statistics_map[name];
|
||||
stats.timer.Start();
|
||||
#if defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
|
||||
stats.nvtx_id = nvtxRangeStartA(name.c_str());
|
||||
#endif
|
||||
}
|
||||
}
|
||||
void StopCuda(const std::string &name) {
|
||||
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
|
||||
auto &stats = statistics_map[name];
|
||||
stats.timer.Stop();
|
||||
stats.count++;
|
||||
#if defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
|
||||
nvtxRangeEnd(stats.nvtx_id);
|
||||
#endif
|
||||
}
|
||||
this->Stop(name);
|
||||
}
|
||||
};
|
||||
} // namespace common
|
||||
|
||||
@@ -145,8 +145,6 @@ class Transform {
|
||||
static_cast<int>(dh::DivRoundUp(*(range_.end()), kBlockThreads));
|
||||
detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
|
||||
_func, shard_range, UnpackHDV(_vectors, device)...);
|
||||
dh::safe_cuda(cudaGetLastError());
|
||||
dh::safe_cuda(cudaDeviceSynchronize());
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
||||
Reference in New Issue
Block a user