Set the appropriate device before freeing device memory... (#4566)

* - set the appropriate device before freeing device memory...
   - PR #4532 added a global memory tracker/logger to keep track of the number of (de)allocations
     and peak memory usage on a per-device basis.
   - This PR adds the appropriate check to make sure that the (de)allocation counts and memory usages
     make sense for the device, since verbosity is typically increased on debug/non-retail builds.
* - pre-create cub allocators and reuse them
   - Create them once and do not resize them dynamically. We need to ensure that these allocators
     are created and destroyed exactly once so that the appropriate device IDs are set
This commit is contained in:
sriramch 2019-06-17 19:58:05 -07:00 committed by Rory Mitchell
parent a22368d210
commit 90f683b25b
8 changed files with 72 additions and 18 deletions

View File

@ -237,11 +237,19 @@ class MemoryLogger {
peak_allocated_bytes =
std::max(peak_allocated_bytes, currently_allocated_bytes);
num_allocations++;
CHECK_GT(num_allocations, num_deallocations);
}
void RegisterDeallocation(void *ptr, size_t n, int current_device) {
auto itr = device_allocations.find(ptr);
if (itr == device_allocations.end()) {
LOG(FATAL) << "Attempting to deallocate " << n << " bytes on device "
<< current_device << " that was never allocated ";
}
void RegisterDeallocation(void *ptr) {
num_deallocations++;
currently_allocated_bytes -= device_allocations[ptr];
device_allocations.erase(ptr);
CHECK_LE(num_deallocations, num_allocations);
CHECK_EQ(itr->second, n);
currently_allocated_bytes -= itr->second;
device_allocations.erase(itr);
}
};
std::map<int, DeviceStats>
@ -256,14 +264,15 @@ public:
int current_device;
safe_cuda(cudaGetDevice(&current_device));
stats_[current_device].RegisterAllocation(ptr, n);
CHECK_LE(stats_[current_device].peak_allocated_bytes, dh::TotalMemory(current_device));
}
void RegisterDeallocation(void *ptr) {
void RegisterDeallocation(void *ptr, size_t n) {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
return;
std::lock_guard<std::mutex> guard(mutex_);
int current_device;
safe_cuda(cudaGetDevice(&current_device));
stats_[current_device].RegisterDeallocation(ptr);
stats_[current_device].RegisterDeallocation(ptr, n, current_device);
}
void Log() {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
@ -299,7 +308,7 @@ struct XGBDefaultDeviceAllocator : thrust::device_malloc_allocator<T> {
return ptr;
}
void deallocate(pointer ptr, size_t n) {
GlobalMemoryLogger().RegisterDeallocation(ptr.get());
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n);
return super_t::deallocate(ptr, n);
}
};
@ -543,6 +552,8 @@ struct CubMemory {
XGBDeviceAllocator<uint8_t> allocator;
allocator.deallocate(thrust::device_ptr<uint8_t>(static_cast<uint8_t *>(d_temp_storage)),
temp_storage_bytes);
d_temp_storage = nullptr;
temp_storage_bytes = 0;
}
}

View File

@ -150,6 +150,10 @@ struct GPUSketcher {
n_rows_(row_end - row_begin), param_(std::move(param)), sketch_container_(sketch_container) {
}
~DeviceShard() {
dh::safe_cuda(cudaSetDevice(device_));
}
inline size_t GetRowStride() const {
return row_stride_;
}

View File

@ -49,6 +49,10 @@ struct HostDeviceVectorImpl {
: proper_size_{0}, device_{-1}, start_{0}, perm_d_{false},
cached_size_{static_cast<size_t>(~0)}, vec_{nullptr} {}
~DeviceShard() {
SetDevice();
}
void Init(HostDeviceVectorImpl<T>* vec, int device) {
if (vec_ == nullptr) { vec_ = vec; }
CHECK_EQ(vec, vec_);

View File

@ -81,6 +81,10 @@ class DeviceShard {
RescaleIndices(device_id_, ridx_begin_, data_);
}
~DeviceShard() {
dh::safe_cuda(cudaSetDevice(device_id_));
}
bool IsEmpty() {
return (ridx_end_ - ridx_begin_) == 0;
}

View File

@ -58,6 +58,14 @@ class ElementWiseMetricsReduction {
#if defined(XGBOOST_USE_CUDA)
~ElementWiseMetricsReduction() {
for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) {
dh::safe_cuda(cudaSetDevice(id));
size_t index = devices_.Index(id);
allocators_.at(index).Free();
}
}
PackedReduceResult DeviceReduceMetrics(
GPUSet::GpuIdType device_id,
size_t device_index,
@ -96,6 +104,7 @@ class ElementWiseMetricsReduction {
#endif // XGBOOST_USE_CUDA
PackedReduceResult Reduce(
const LearnerTrainParam &tparam,
GPUSet devices,
const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels,
@ -107,9 +116,9 @@ class ElementWiseMetricsReduction {
}
#if defined(XGBOOST_USE_CUDA)
else { // NOLINT
if (allocators_.size() != devices.Size()) {
allocators_.clear();
allocators_.resize(devices.Size());
if (allocators_.empty()) {
devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus);
allocators_.resize(devices_.Size());
}
preds.Shard(devices);
labels.Shard(devices);
@ -135,6 +144,7 @@ class ElementWiseMetricsReduction {
private:
EvalRow policy_;
#if defined(XGBOOST_USE_CUDA)
GPUSet devices_;
std::vector<dh::CubMemory> allocators_;
#endif // defined(XGBOOST_USE_CUDA)
};
@ -339,7 +349,7 @@ struct EvalEWiseBase : public Metric {
GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata);
auto result =
reducer_.Reduce(devices, info.weights_, info.labels_, preds);
reducer_.Reduce(*tparam_, devices, info.weights_, info.labels_, preds);
double dat[2] { result.Residue(), result.Weights() };
if (distributed) {

View File

@ -73,6 +73,14 @@ class MultiClassMetricsReduction {
#if defined(XGBOOST_USE_CUDA)
~MultiClassMetricsReduction() {
for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) {
dh::safe_cuda(cudaSetDevice(id));
size_t index = devices_.Index(id);
allocators_.at(index).Free();
}
}
PackedReduceResult DeviceReduceMetrics(
GPUSet::GpuIdType device_id,
size_t device_index,
@ -118,6 +126,7 @@ class MultiClassMetricsReduction {
#endif // XGBOOST_USE_CUDA
PackedReduceResult Reduce(
const LearnerTrainParam &tparam,
GPUSet devices,
size_t n_class,
const HostDeviceVector<bst_float>& weights,
@ -130,9 +139,9 @@ class MultiClassMetricsReduction {
}
#if defined(XGBOOST_USE_CUDA)
else { // NOLINT
if (allocators_.size() != devices.Size()) {
allocators_.clear();
allocators_.resize(devices.Size());
if (allocators_.empty()) {
devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus);
allocators_.resize(devices_.Size());
}
preds.Shard(GPUDistribution::Granular(devices, n_class));
labels.Shard(devices);
@ -158,6 +167,7 @@ class MultiClassMetricsReduction {
private:
#if defined(XGBOOST_USE_CUDA)
dh::PinnedMemory label_error_;
GPUSet devices_;
std::vector<dh::CubMemory> allocators_;
#endif // defined(XGBOOST_USE_CUDA)
};
@ -181,7 +191,7 @@ struct EvalMClassBase : public Metric {
const auto ndata = static_cast<bst_omp_uint>(info.labels_.Size());
GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata);
auto result = reducer_.Reduce(devices, nclass, info.weights_, info.labels_, preds);
auto result = reducer_.Reduce(*tparam_, devices, nclass, info.weights_, info.labels_, preds);
double dat[2] { result.Residue(), result.Weights() };
if (distributed) {

View File

@ -19,6 +19,17 @@ void SetDevice(int device) {
dh::safe_cuda(cudaSetDevice(device));
}
struct HostDeviceVectorSetDeviceHandler {
template <typename Functor>
explicit HostDeviceVectorSetDeviceHandler(Functor f) {
SetCudaSetDeviceHandler(f);
}
~HostDeviceVectorSetDeviceHandler() {
SetCudaSetDeviceHandler(nullptr);
}
};
void InitHostDeviceVector(size_t n, const GPUDistribution& distribution,
HostDeviceVector<int> *v) {
// create the vector
@ -107,7 +118,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
void TestHostDeviceVector
(size_t n, const GPUDistribution& distribution,
const std::vector<size_t>& starts, const std::vector<size_t>& sizes) {
SetCudaSetDeviceHandler(SetDevice);
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVector<int> v;
InitHostDeviceVector(n, distribution, &v);
CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead);
@ -115,7 +126,6 @@ void TestHostDeviceVector
CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite);
CheckHost(&v, GPUAccess::kRead);
CheckHost(&v, GPUAccess::kWrite);
SetCudaSetDeviceHandler(nullptr);
}
TEST(HostDeviceVector, TestBlock) {
@ -161,7 +171,7 @@ TEST(HostDeviceVector, TestCopy) {
auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices));
std::vector<size_t> starts{0, 501};
std::vector<size_t> sizes{501, 500};
SetCudaSetDeviceHandler(SetDevice);
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVector<int> v;
{
@ -175,7 +185,6 @@ TEST(HostDeviceVector, TestCopy) {
CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite);
CheckHost(&v, GPUAccess::kRead);
CheckHost(&v, GPUAccess::kWrite);
SetCudaSetDeviceHandler(nullptr);
}
TEST(HostDeviceVector, Shard) {

View File

@ -10,6 +10,7 @@
#include "gtest/gtest.h"
#include "../helpers.h"
#if defined(XGBOOST_USE_NCCL)
namespace {
inline void CheckCAPICall(int ret) {
@ -17,6 +18,7 @@ inline void CheckCAPICall(int ret) {
}
} // namespace anonymous
#endif
extern const std::map<std::string, std::string>&
QueryBoosterConfigurationArguments(BoosterHandle handle);