Set the appropriate device before freeing device memory... (#4566)

* Set the appropriate device before freeing device memory.

  PR #4532 added a global memory tracker/logger to keep track of the number of (de)allocations and the peak memory usage on a per-device basis. This PR adds the appropriate checks to make sure that the (de)allocation counts and memory usage make sense for the device on which they are recorded; this logging verbosity is typically only enabled on debug/non-retail builds.

* Pre-create CUB allocators and reuse them.

  Create them once rather than resizing them dynamically. We need to ensure that these allocators are created and destroyed exactly once each, so that the appropriate device IDs are set when their memory is freed.
parent a22368d210
commit 90f683b25b
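The unifying idea of the patch: CUDA deallocations should run with the owning device current, both so the free is issued in the right device context and so the per-device memory tracker attributes it correctly. Shard-like objects therefore gain destructors that set their device before memory is released. A minimal sketch of the idiom (hypothetical `Shard`, not code from this commit):

```cpp
#include <cuda_runtime.h>

// Hypothetical shard owning memory on a single GPU.
struct Shard {
  int device_id{0};
  void* buffer{nullptr};

  ~Shard() {
    // Make the owning device current before freeing, so the
    // deallocation is issued (and accounted) on the right device.
    cudaSetDevice(device_id);
    cudaFree(buffer);
  }
};
```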
@@ -237,11 +237,19 @@ class MemoryLogger {
       peak_allocated_bytes =
           std::max(peak_allocated_bytes, currently_allocated_bytes);
       num_allocations++;
       CHECK_GT(num_allocations, num_deallocations);
     }
-    void RegisterDeallocation(void *ptr) {
+    void RegisterDeallocation(void *ptr, size_t n, int current_device) {
+      auto itr = device_allocations.find(ptr);
+      if (itr == device_allocations.end()) {
+        LOG(FATAL) << "Attempting to deallocate " << n << " bytes on device "
+                   << current_device << " that was never allocated ";
+      }
       num_deallocations++;
-      currently_allocated_bytes -= device_allocations[ptr];
-      device_allocations.erase(ptr);
       CHECK_LE(num_deallocations, num_allocations);
+      CHECK_EQ(itr->second, n);
+      currently_allocated_bytes -= itr->second;
+      device_allocations.erase(itr);
     }
   };
   std::map<int, DeviceStats>
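The reworked `DeviceStats::RegisterDeallocation` enforces three invariants: the pointer must have been recorded at allocation time, the freed size must match the recorded size, and the deallocation count can never exceed the allocation count. A self-contained, host-only sketch of the same bookkeeping (simplified; `assert` stands in for the `CHECK_*`/`LOG(FATAL)` macros):

```cpp
#include <cassert>
#include <cstddef>
#include <map>

struct TrackerSketch {
  size_t num_allocations = 0, num_deallocations = 0;
  size_t currently_allocated_bytes = 0;
  std::map<void*, size_t> live;  // pointer -> size of each live allocation

  void RegisterAllocation(void* ptr, size_t n) {
    live[ptr] = n;
    currently_allocated_bytes += n;
    ++num_allocations;
  }

  void RegisterDeallocation(void* ptr, size_t n) {
    auto itr = live.find(ptr);
    assert(itr != live.end());  // freeing a pointer that was never allocated
    assert(itr->second == n);   // freed size must match the allocated size
    ++num_deallocations;
    assert(num_deallocations <= num_allocations);
    currently_allocated_bytes -= itr->second;
    live.erase(itr);
  }
};
```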
@@ -256,14 +264,15 @@ public:
     int current_device;
     safe_cuda(cudaGetDevice(&current_device));
     stats_[current_device].RegisterAllocation(ptr, n);
+    CHECK_LE(stats_[current_device].peak_allocated_bytes, dh::TotalMemory(current_device));
   }
-  void RegisterDeallocation(void *ptr) {
+  void RegisterDeallocation(void *ptr, size_t n) {
     if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
       return;
     std::lock_guard<std::mutex> guard(mutex_);
     int current_device;
     safe_cuda(cudaGetDevice(&current_device));
-    stats_[current_device].RegisterDeallocation(ptr);
+    stats_[current_device].RegisterDeallocation(ptr, n, current_device);
   }
   void Log() {
     if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
@@ -299,7 +308,7 @@ struct XGBDefaultDeviceAllocator : thrust::device_malloc_allocator<T> {
     return ptr;
   }
   void deallocate(pointer ptr, size_t n) {
-    GlobalMemoryLogger().RegisterDeallocation(ptr.get());
+    GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n);
    return super_t::deallocate(ptr, n);
   }
 };
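Passing `n` through to the logger works because the Thrust allocator interface hands `deallocate` the same element count that was given to `allocate`, so the logger can cross-check it against its records. A sketch of a minimal logging allocator in the same style (illustrative only, not the repo's `XGBDefaultDeviceAllocator`):

```cpp
#include <thrust/device_malloc_allocator.h>
#include <cstdio>

// Sketch: a Thrust allocator that reports sizes on (de)allocation.
template <typename T>
struct LoggingAllocator : thrust::device_malloc_allocator<T> {
  using super_t = thrust::device_malloc_allocator<T>;
  using pointer = typename super_t::pointer;

  pointer allocate(size_t n) {
    std::printf("alloc %zu bytes\n", n * sizeof(T));
    return super_t::allocate(n);
  }
  void deallocate(pointer ptr, size_t n) {
    // n matches the count originally passed to allocate() for this pointer.
    std::printf("free  %zu bytes\n", n * sizeof(T));
    super_t::deallocate(ptr, n);
  }
};
```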
@@ -543,6 +552,8 @@ struct CubMemory {
+      XGBDeviceAllocator<uint8_t> allocator;
+      allocator.deallocate(thrust::device_ptr<uint8_t>(static_cast<uint8_t *>(d_temp_storage)),
+                           temp_storage_bytes);
       d_temp_storage = nullptr;
       temp_storage_bytes = 0;
     }
   }
@@ -150,6 +150,10 @@ struct GPUSketcher {
           n_rows_(row_end - row_begin), param_(std::move(param)), sketch_container_(sketch_container) {
     }

+    ~DeviceShard() {
+      dh::safe_cuda(cudaSetDevice(device_));
+    }
+
     inline size_t GetRowStride() const {
       return row_stride_;
     }
@@ -49,6 +49,10 @@ struct HostDeviceVectorImpl {
       : proper_size_{0}, device_{-1}, start_{0}, perm_d_{false},
         cached_size_{static_cast<size_t>(~0)}, vec_{nullptr} {}

+    ~DeviceShard() {
+      SetDevice();
+    }
+
     void Init(HostDeviceVectorImpl<T>* vec, int device) {
       if (vec_ == nullptr) { vec_ = vec; }
       CHECK_EQ(vec, vec_);
@@ -81,6 +81,10 @@ class DeviceShard {
     RescaleIndices(device_id_, ridx_begin_, data_);
   }

+  ~DeviceShard() {
+    dh::safe_cuda(cudaSetDevice(device_id_));
+  }
+
   bool IsEmpty() {
     return (ridx_end_ - ridx_begin_) == 0;
   }
@@ -58,6 +58,14 @@ class ElementWiseMetricsReduction {

 #if defined(XGBOOST_USE_CUDA)

+  ~ElementWiseMetricsReduction() {
+    for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) {
+      dh::safe_cuda(cudaSetDevice(id));
+      size_t index = devices_.Index(id);
+      allocators_.at(index).Free();
+    }
+  }
+
   PackedReduceResult DeviceReduceMetrics(
       GPUSet::GpuIdType device_id,
       size_t device_index,
@@ -96,6 +104,7 @@ class ElementWiseMetricsReduction {
 #endif  // XGBOOST_USE_CUDA

   PackedReduceResult Reduce(
+      const LearnerTrainParam &tparam,
       GPUSet devices,
       const HostDeviceVector<bst_float>& weights,
       const HostDeviceVector<bst_float>& labels,
@@ -107,9 +116,9 @@ class ElementWiseMetricsReduction {
     }
 #if defined(XGBOOST_USE_CUDA)
     else {  // NOLINT
-      if (allocators_.size() != devices.Size()) {
-        allocators_.clear();
-        allocators_.resize(devices.Size());
+      if (allocators_.empty()) {
+        devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus);
+        allocators_.resize(devices_.Size());
       }
       preds.Shard(devices);
       labels.Shard(devices);
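This is the pre-create-and-reuse change from the commit message: previously the allocator vector was cleared and resized whenever the device count changed, destroying and recreating `CubMemory` objects mid-run; now it is sized exactly once, on first use, for the full device set, and the destructor above knows which devices to visit via the cached `devices_`. A sketch of why resizing is the hazard (hypothetical `GpuBuffer`, not the repo's API):

```cpp
#include <vector>

// Resizing or clearing a vector of RAII GPU buffers runs destructors,
// and each destructor frees device memory. If that happens on every
// Reduce() call, buffers are destroyed and recreated repeatedly, and
// the frees may run while a different device is current.
struct GpuBuffer { ~GpuBuffer() { /* cudaFree on the owning device */ } };

void LazyInitOnce(std::vector<GpuBuffer>* allocators, size_t n_devices) {
  if (allocators->empty()) {
    allocators->resize(n_devices);  // sized exactly once, reused thereafter
  }
}
```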
@@ -135,6 +144,7 @@ class ElementWiseMetricsReduction {
  private:
   EvalRow policy_;
 #if defined(XGBOOST_USE_CUDA)
+  GPUSet devices_;
   std::vector<dh::CubMemory> allocators_;
 #endif  // defined(XGBOOST_USE_CUDA)
 };
@@ -339,7 +349,7 @@ struct EvalEWiseBase : public Metric {
     GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata);

     auto result =
-        reducer_.Reduce(devices, info.weights_, info.labels_, preds);
+        reducer_.Reduce(*tparam_, devices, info.weights_, info.labels_, preds);

     double dat[2] { result.Residue(), result.Weights() };
     if (distributed) {
@@ -73,6 +73,14 @@ class MultiClassMetricsReduction {

 #if defined(XGBOOST_USE_CUDA)

+  ~MultiClassMetricsReduction() {
+    for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) {
+      dh::safe_cuda(cudaSetDevice(id));
+      size_t index = devices_.Index(id);
+      allocators_.at(index).Free();
+    }
+  }
+
   PackedReduceResult DeviceReduceMetrics(
       GPUSet::GpuIdType device_id,
       size_t device_index,
@@ -118,6 +126,7 @@ class MultiClassMetricsReduction {
 #endif  // XGBOOST_USE_CUDA

   PackedReduceResult Reduce(
+      const LearnerTrainParam &tparam,
       GPUSet devices,
       size_t n_class,
       const HostDeviceVector<bst_float>& weights,
@@ -130,9 +139,9 @@ class MultiClassMetricsReduction {
     }
 #if defined(XGBOOST_USE_CUDA)
     else {  // NOLINT
-      if (allocators_.size() != devices.Size()) {
-        allocators_.clear();
-        allocators_.resize(devices.Size());
+      if (allocators_.empty()) {
+        devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus);
+        allocators_.resize(devices_.Size());
       }
       preds.Shard(GPUDistribution::Granular(devices, n_class));
       labels.Shard(devices);
@@ -158,6 +167,7 @@ class MultiClassMetricsReduction {
  private:
 #if defined(XGBOOST_USE_CUDA)
   dh::PinnedMemory label_error_;
+  GPUSet devices_;
   std::vector<dh::CubMemory> allocators_;
 #endif  // defined(XGBOOST_USE_CUDA)
 };
@@ -181,7 +191,7 @@ struct EvalMClassBase : public Metric {
     const auto ndata = static_cast<bst_omp_uint>(info.labels_.Size());

     GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata);
-    auto result = reducer_.Reduce(devices, nclass, info.weights_, info.labels_, preds);
+    auto result = reducer_.Reduce(*tparam_, devices, nclass, info.weights_, info.labels_, preds);
     double dat[2] { result.Residue(), result.Weights() };

     if (distributed) {
@@ -19,6 +19,17 @@ void SetDevice(int device) {
   dh::safe_cuda(cudaSetDevice(device));
 }

+struct HostDeviceVectorSetDeviceHandler {
+  template <typename Functor>
+  explicit HostDeviceVectorSetDeviceHandler(Functor f) {
+    SetCudaSetDeviceHandler(f);
+  }
+
+  ~HostDeviceVectorSetDeviceHandler() {
+    SetCudaSetDeviceHandler(nullptr);
+  }
+};
+
 void InitHostDeviceVector(size_t n, const GPUDistribution& distribution,
                           HostDeviceVector<int> *v) {
   // create the vector
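The new `HostDeviceVectorSetDeviceHandler` turns the install/uninstall pair into a scope guard: the handler is registered in the constructor and cleared in the destructor, so it is reset even when a gtest assertion aborts the test body early (a path on which the manual `SetCudaSetDeviceHandler(nullptr)` calls removed below would be skipped). A usage sketch, assuming the names from this test file:

```cpp
void SomeTest() {
  // Installs the handler now; the destructor resets it to nullptr
  // no matter how this scope exits.
  HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);

  HostDeviceVector<int> v;
  // ... exercise v ...
}  // handler uninstalled here
```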
@@ -107,7 +118,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
 void TestHostDeviceVector
 (size_t n, const GPUDistribution& distribution,
  const std::vector<size_t>& starts, const std::vector<size_t>& sizes) {
-  SetCudaSetDeviceHandler(SetDevice);
+  HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
   HostDeviceVector<int> v;
   InitHostDeviceVector(n, distribution, &v);
   CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead);
@@ -115,7 +126,6 @@ void TestHostDeviceVector
   CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite);
   CheckHost(&v, GPUAccess::kRead);
   CheckHost(&v, GPUAccess::kWrite);
-  SetCudaSetDeviceHandler(nullptr);
 }

 TEST(HostDeviceVector, TestBlock) {
@@ -161,7 +171,7 @@ TEST(HostDeviceVector, TestCopy) {
   auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices));
   std::vector<size_t> starts{0, 501};
   std::vector<size_t> sizes{501, 500};
-  SetCudaSetDeviceHandler(SetDevice);
+  HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);

   HostDeviceVector<int> v;
   {
@@ -175,7 +185,6 @@ TEST(HostDeviceVector, TestCopy) {
   CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite);
   CheckHost(&v, GPUAccess::kRead);
   CheckHost(&v, GPUAccess::kWrite);
-  SetCudaSetDeviceHandler(nullptr);
 }

 TEST(HostDeviceVector, Shard) {
@@ -10,6 +10,7 @@
 #include "gtest/gtest.h"
 #include "../helpers.h"

+#if defined(XGBOOST_USE_NCCL)
 namespace {

 inline void CheckCAPICall(int ret) {
@@ -17,6 +18,7 @@ inline void CheckCAPICall(int ret) {
 }

 }  // namespace anonymous
+#endif

 extern const std::map<std::string, std::string>&
   QueryBoosterConfigurationArguments(BoosterHandle handle);