xgboost/src/common/host_device_vector.cu
sriramch 90f683b25b Set the appropriate device before freeing device memory... (#4566)
* - set the appropriate device before freeing device memory...
   - pr #4532 added a global memory tracker/logger to keep track of the number of (de)allocations
     and peak memory usage on a per-device basis.
   - this pr adds the appropriate check to make sure that the (de)allocation counts and memory usage
     make sense for the device, since verbosity is typically increased on debug/non-retail builds.
* - pre-create cub allocators and reuse them
   - create them once and do not resize them dynamically; we need to ensure that these allocators
     are created and destroyed exactly once so that the appropriate device ids are set
2019-06-18 14:58:05 +12:00
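
The first fix is a small RAII discipline: remember which device owns an allocation and make that device current again before freeing it, so the per-device (de)allocation counts and peak-memory stats from #4532 stay consistent. A minimal sketch of the pattern (illustrative only; DeviceBuffer is a hypothetical stand-in, not a type from this file, and error checking is omitted):

  // Hypothetical illustration of "set the device before freeing":
  struct DeviceBuffer {
    int device_{-1};
    void* ptr_{nullptr};
    DeviceBuffer(int device, size_t bytes) : device_{device} {
      cudaSetDevice(device_);  // allocate on the owning device
      cudaMalloc(&ptr_, bytes);
    }
    ~DeviceBuffer() {
      cudaSetDevice(device_);  // restore the owning device before freeing,
      cudaFree(ptr_);          // so the free is attributed to the right GPU
    }
  };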

/*!
 * Copyright 2017 XGBoost contributors
 */

#include "./host_device_vector.h"

#include <thrust/fill.h>
#include <xgboost/data.h>

#include <algorithm>
#include <cstdint>
#include <mutex>

#include "./device_helpers.cuh"

namespace xgboost {

// the handler to call instead of cudaSetDevice; only used for testing
static void (*cudaSetDeviceHandler)(int) = nullptr;  // NOLINT

void SetCudaSetDeviceHandler(void (*handler)(int)) {
  cudaSetDeviceHandler = handler;
}
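
// A test might, for instance, install a recording stub so device selection can
// be asserted without a physical GPU (hypothetical usage; recorded_devices is
// a made-up global, not part of this file):
//   SetCudaSetDeviceHandler([](int device) { recorded_devices.push_back(device); });
// Passing nullptr restores the default cudaSetDevice behavior.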
// wrapper over access with useful methods
class Permissions {
  GPUAccess access_;
  explicit Permissions(GPUAccess access) : access_{access} {}

 public:
  Permissions() : access_{GPUAccess::kNone} {}
  explicit Permissions(bool perm)
    : access_(perm ? GPUAccess::kWrite : GPUAccess::kNone) {}

  bool CanRead() const { return access_ >= kRead; }
  bool CanWrite() const { return access_ == kWrite; }
  bool CanAccess(GPUAccess access) const { return access_ >= access; }
  void Grant(GPUAccess access) { access_ = std::max(access_, access); }
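  // GPUAccess levels are ordered (kNone < kRead < kWrite), and the arithmetic
  // below relies on that ordering: `kWrite - access` is the complementary
  // level. For example, when the device side is granted kRead, denying the
  // host its complement clamps the host to kWrite - kRead == kRead, so both
  // sides may read but neither may write.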
  void DenyComplementary(GPUAccess compl_access) {
    access_ = std::min(access_, GPUAccess::kWrite - compl_access);
  }
  Permissions Complementary() const {
    return Permissions(GPUAccess::kWrite - access_);
  }
};
template <typename T>
struct HostDeviceVectorImpl {
  struct DeviceShard {
    DeviceShard()
      : proper_size_{0}, device_{-1}, start_{0}, perm_d_{false},
        cached_size_{static_cast<size_t>(~0)}, vec_{nullptr} {}
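
    // Make this shard's device current before the dh::device_vector member is
    // destroyed, so its memory is freed (and accounted by the per-device
    // memory logger) on the device that owns it.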
    ~DeviceShard() {
      SetDevice();
    }

    void Init(HostDeviceVectorImpl<T>* vec, int device) {
      if (vec_ == nullptr) { vec_ = vec; }
      CHECK_EQ(vec, vec_);
      device_ = device;
      LazyResize(vec_->Size());
      perm_d_ = vec_->perm_h_.Complementary();
    }

    void Init(HostDeviceVectorImpl<T>* vec, const DeviceShard& other) {
      if (vec_ == nullptr) { vec_ = vec; }
      CHECK_EQ(vec, vec_);
      device_ = other.device_;
      cached_size_ = other.cached_size_;
      start_ = other.start_;
      proper_size_ = other.proper_size_;
      SetDevice();
      data_.resize(other.data_.size());
      perm_d_ = other.perm_d_;
    }

    void ScatterFrom(const T* begin) {
      // TODO(canonizer): avoid full copy of host data
      LazySyncDevice(GPUAccess::kWrite);
      SetDevice();
      dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), begin + start_,
                                    data_.size() * sizeof(T), cudaMemcpyDefault));
    }

    void GatherTo(thrust::device_ptr<T> begin) {
      LazySyncDevice(GPUAccess::kRead);
      SetDevice();
      dh::safe_cuda(cudaMemcpyAsync(begin.get() + start_, data_.data().get(),
                                    proper_size_ * sizeof(T), cudaMemcpyDefault));
    }

    void Fill(T v) {
      // TODO(canonizer): avoid full copy of host data
      LazySyncDevice(GPUAccess::kWrite);
      SetDevice();
      thrust::fill(data_.begin(), data_.end(), v);
    }

    void Copy(DeviceShard* other) {
      // TODO(canonizer): avoid full copy of host data for this (but not for other)
      LazySyncDevice(GPUAccess::kWrite);
      other->LazySyncDevice(GPUAccess::kRead);
      SetDevice();
      dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), other->data_.data().get(),
                                    data_.size() * sizeof(T), cudaMemcpyDefault));
    }
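
    // Copy this shard's proper segment back into the host vector and downgrade
    // the device permission to the complement of what the host was granted.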
    void LazySyncHost(GPUAccess access) {
      SetDevice();
      dh::safe_cuda(cudaMemcpy(vec_->data_h_.data() + start_,
                               data_.data().get(), proper_size_ * sizeof(T),
                               cudaMemcpyDeviceToHost));
      perm_d_.DenyComplementary(access);
    }
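
    // Recompute this shard's segment (start, proper size, allocated size) for
    // a new vector size; a no-op when the size matches the cached one.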
    void LazyResize(size_t new_size) {
      if (new_size == cached_size_) { return; }
      // resize is required
      int ndevices = vec_->distribution_.devices_.Size();
      int device_index = vec_->distribution_.devices_.Index(device_);
      start_ = vec_->distribution_.ShardStart(new_size, device_index);
      proper_size_ = vec_->distribution_.ShardProperSize(new_size, device_index);
      // The size on this device.
      size_t size_d = vec_->distribution_.ShardSize(new_size, device_index);
      SetDevice();
      data_.resize(size_d);
      cached_size_ = new_size;
    }
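
    // Make the device copy valid for `access`: if the shard already holds
    // readable data, only the permissions change; otherwise the data is
    // copied over from the host.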
    void LazySyncDevice(GPUAccess access) {
      if (perm_d_.CanAccess(access)) { return; }
      if (perm_d_.CanRead()) {
        // deny read to the host
        perm_d_.Grant(access);
        std::lock_guard<std::mutex> lock(vec_->mutex_);
        vec_->perm_h_.DenyComplementary(access);
        return;
      }
      // data is on the host
      size_t size_h = vec_->data_h_.size();
      LazyResize(size_h);
      SetDevice();
      dh::safe_cuda(
        cudaMemcpy(data_.data().get(), vec_->data_h_.data() + start_,
                   data_.size() * sizeof(T), cudaMemcpyHostToDevice));
      perm_d_.Grant(access);
      std::lock_guard<std::mutex> lock(vec_->mutex_);
      vec_->perm_h_.DenyComplementary(access);
      vec_->size_d_ = size_h;
    }

    void SetDevice() {
      if (cudaSetDeviceHandler == nullptr) {
        dh::safe_cuda(cudaSetDevice(device_));
      } else {
        (*cudaSetDeviceHandler)(device_);
      }
    }

    T* Raw() { return data_.data().get(); }
    size_t Start() const { return start_; }
    size_t DataSize() const { return data_.size(); }
    Permissions& Perm() { return perm_d_; }
    Permissions const& Perm() const { return perm_d_; }

   private:
    int device_;
    dh::device_vector<T> data_;
    // cached vector size
    size_t cached_size_;
    size_t start_;
    // size of the portion to copy back to the host
    size_t proper_size_;
    Permissions perm_d_;
    HostDeviceVectorImpl<T>* vec_;
  };
  HostDeviceVectorImpl(size_t size, T v, const GPUDistribution& distribution)
    : distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
    if (!distribution_.IsEmpty()) {
      size_d_ = size;
      InitShards();
      Fill(v);
    } else {
      data_h_.resize(size, v);
    }
  }

  // required, as a new std::mutex has to be created
  HostDeviceVectorImpl(const HostDeviceVectorImpl<T>& other)
    : data_h_(other.data_h_), perm_h_(other.perm_h_), size_d_(other.size_d_),
      distribution_(other.distribution_), mutex_() {
    shards_.resize(other.shards_.size());
    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
      shard.Init(this, other.shards_.at(i));
    });
  }

  // Initializer can be std::vector<T> or std::initializer_list<T>
  template <class Initializer>
  HostDeviceVectorImpl(const Initializer& init, const GPUDistribution& distribution)
    : distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
    if (!distribution_.IsEmpty()) {
      size_d_ = init.size();
      InitShards();
      Copy(init);
    } else {
      data_h_ = init;
    }
  }

  void InitShards() {
    int ndevices = distribution_.devices_.Size();
    shards_.resize(ndevices);
    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
      shard.Init(this, distribution_.devices_.DeviceId(i));
    });
  }
  size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : size_d_; }

  GPUSet Devices() const { return distribution_.devices_; }

  const GPUDistribution& Distribution() const { return distribution_; }

  T* DevicePointer(int device) {
    CHECK(distribution_.devices_.Contains(device));
    LazySyncDevice(device, GPUAccess::kWrite);
    return shards_.at(distribution_.devices_.Index(device)).Raw();
  }

  const T* ConstDevicePointer(int device) {
    CHECK(distribution_.devices_.Contains(device));
    LazySyncDevice(device, GPUAccess::kRead);
    return shards_.at(distribution_.devices_.Index(device)).Raw();
  }

  common::Span<T> DeviceSpan(int device) {
    GPUSet devices = distribution_.devices_;
    CHECK(devices.Contains(device));
    LazySyncDevice(device, GPUAccess::kWrite);
    return {shards_.at(devices.Index(device)).Raw(),
            static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
  }

  common::Span<const T> ConstDeviceSpan(int device) {
    GPUSet devices = distribution_.devices_;
    CHECK(devices.Contains(device));
    LazySyncDevice(device, GPUAccess::kRead);
    using SpanInd = typename common::Span<const T>::index_type;
    return {shards_.at(devices.Index(device)).Raw(),
            static_cast<SpanInd>(DeviceSize(device))};
  }

  size_t DeviceSize(int device) {
    CHECK(distribution_.devices_.Contains(device));
    LazySyncDevice(device, GPUAccess::kRead);
    return shards_.at(distribution_.devices_.Index(device)).DataSize();
  }

  size_t DeviceStart(int device) {
    CHECK(distribution_.devices_.Contains(device));
    LazySyncDevice(device, GPUAccess::kRead);
    return shards_.at(distribution_.devices_.Index(device)).Start();
  }

  thrust::device_ptr<T> tbegin(int device) {  // NOLINT
    return thrust::device_ptr<T>(DevicePointer(device));
  }

  thrust::device_ptr<const T> tcbegin(int device) {  // NOLINT
    return thrust::device_ptr<const T>(ConstDevicePointer(device));
  }

  thrust::device_ptr<T> tend(int device) {  // NOLINT
    return tbegin(device) + DeviceSize(device);
  }

  thrust::device_ptr<const T> tcend(int device) {  // NOLINT
    return tcbegin(device) + DeviceSize(device);
  }
  void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
    CHECK_EQ(end - begin, Size());
    if (perm_h_.CanWrite()) {
      dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
                               (end - begin) * sizeof(T),
                               cudaMemcpyDeviceToHost));
    } else {
      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
        shard.ScatterFrom(begin.get());
      });
    }
  }

  void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
    CHECK_EQ(end - begin, Size());
    if (perm_h_.CanWrite()) {
      dh::safe_cuda(cudaMemcpy(begin.get(), data_h_.data(),
                               data_h_.size() * sizeof(T),
                               cudaMemcpyHostToDevice));
    } else {
      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.GatherTo(begin); });
    }
  }

  void Fill(T v) {  // NOLINT
    if (perm_h_.CanWrite()) {
      std::fill(data_h_.begin(), data_h_.end(), v);
    } else {
      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.Fill(v); });
    }
  }

  void Copy(HostDeviceVectorImpl<T>* other) {
    CHECK_EQ(Size(), other->Size());
    // Data is on host.
    if (perm_h_.CanWrite() && other->perm_h_.CanWrite()) {
      std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
      return;
    }
    // Data is on device.
    if (distribution_ != other->distribution_) {
      distribution_ = GPUDistribution();
      Shard(other->Distribution());
      size_d_ = other->size_d_;
    }
    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
      shard.Copy(&other->shards_.at(i));
    });
  }
  void Copy(const std::vector<T>& other) {
    CHECK_EQ(Size(), other.size());
    if (perm_h_.CanWrite()) {
      std::copy(other.begin(), other.end(), data_h_.begin());
    } else {
      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
        shard.ScatterFrom(other.data());
      });
    }
  }

  void Copy(std::initializer_list<T> other) {
    CHECK_EQ(Size(), other.size());
    if (perm_h_.CanWrite()) {
      std::copy(other.begin(), other.end(), data_h_.begin());
    } else {
      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
        shard.ScatterFrom(other.begin());
      });
    }
  }

  std::vector<T>& HostVector() {
    LazySyncHost(GPUAccess::kWrite);
    return data_h_;
  }

  const std::vector<T>& ConstHostVector() {
    LazySyncHost(GPUAccess::kRead);
    return data_h_;
  }
  void Shard(const GPUDistribution& distribution) {
    if (distribution_ == distribution) { return; }
    CHECK(distribution_.IsEmpty())
      << "This: " << distribution_.Devices().Size() << ", "
      << "Others: " << distribution.Devices().Size();
    distribution_ = distribution;
    InitShards();
  }

  void Shard(GPUSet new_devices) {
    if (distribution_.Devices() == new_devices) { return; }
    Shard(GPUDistribution::Block(new_devices));
  }
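
  // Unlike Shard(), which requires an empty distribution, Reshard() may change
  // an existing one: the data is first synced back to the host, then the
  // shards are rebuilt for the new distribution.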
  void Reshard(const GPUDistribution& distribution) {
    if (distribution_ == distribution) { return; }
    LazySyncHost(GPUAccess::kWrite);
    distribution_ = distribution;
    shards_.clear();
    InitShards();
  }

  void Resize(size_t new_size, T v) {
    if (new_size == Size()) { return; }
    if (distribution_.IsFixedSize()) {
      CHECK_EQ(new_size, distribution_.offsets_.back());
    }
    if (Size() == 0 && !distribution_.IsEmpty()) {
      // fast on-device resize
      perm_h_ = Permissions(false);
      size_d_ = new_size;
      InitShards();
      Fill(v);
    } else {
      // resize on host
      LazySyncHost(GPUAccess::kWrite);
      data_h_.resize(new_size, v);
    }
  }
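
  // Make the host copy valid for `access`: if the host already holds readable
  // data, only the device permissions are adjusted; otherwise every shard
  // copies its segment back to the host.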
  void LazySyncHost(GPUAccess access) {
    if (perm_h_.CanAccess(access)) { return; }
    if (perm_h_.CanRead()) {
      // data is present, just need to deny access to the device
      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
        shard.Perm().DenyComplementary(access);
      });
      perm_h_.Grant(access);
      return;
    }
    if (data_h_.size() != size_d_) { data_h_.resize(size_d_); }
    dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
      shard.LazySyncHost(access);
    });
    perm_h_.Grant(access);
  }

  void LazySyncDevice(int device, GPUAccess access) {
    GPUSet devices = distribution_.Devices();
    CHECK(devices.Contains(device));
    shards_.at(devices.Index(device)).LazySyncDevice(access);
  }

  bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); }

  bool DeviceCanAccess(int device, GPUAccess access) {
    GPUSet devices = distribution_.Devices();
    if (!devices.Contains(device)) { return false; }
    return shards_.at(devices.Index(device)).Perm().CanAccess(access);
  }

 private:
  std::vector<T> data_h_;
  Permissions perm_h_;
  // the total size of the data stored on the devices
  size_t size_d_;
  GPUDistribution distribution_;
  // protects size_d_ and perm_h_ when updated from multiple threads
  std::mutex mutex_;
  std::vector<DeviceShard> shards_;
};
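
// A sketch of the lazy-sync protocol through the public API (illustrative
// only; it assumes GPUSet::Range and GPUDistribution::Block from the header
// with their usual meanings):
//
//   HostDeviceVector<float> v(16, 0.0f,
//                             GPUDistribution::Block(GPUSet::Range(0, 1)));
//   v.Fill(1.0f);                         // non-empty distribution: runs on
//                                         // the device shard
//   const auto& h = v.ConstHostVector();  // LazySyncHost(kRead): device->host
//                                         // copy; both sides may now read
//   float* d = v.DevicePointer(0);        // LazySyncDevice(kWrite): device
//                                         // gains write, host loses access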
template <typename T>
HostDeviceVector<T>::HostDeviceVector
(size_t size, T v, const GPUDistribution& distribution) : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(size, v, distribution);
}

template <typename T>
HostDeviceVector<T>::HostDeviceVector
(std::initializer_list<T> init, const GPUDistribution& distribution) : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}

template <typename T>
HostDeviceVector<T>::HostDeviceVector
(const std::vector<T>& init, const GPUDistribution& distribution) : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}

template <typename T>
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
  : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
}

template <typename T>
HostDeviceVector<T>& HostDeviceVector<T>::operator=
(const HostDeviceVector<T>& other) {
  if (this == &other) { return *this; }
  std::unique_ptr<HostDeviceVectorImpl<T>> newImpl(new HostDeviceVectorImpl<T>(*other.impl_));
  delete impl_;
  impl_ = newImpl.release();
  return *this;
}

template <typename T>
HostDeviceVector<T>::~HostDeviceVector() {
  delete impl_;
  impl_ = nullptr;
}
template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }

template <typename T>
GPUSet HostDeviceVector<T>::Devices() const { return impl_->Devices(); }

template <typename T>
const GPUDistribution& HostDeviceVector<T>::Distribution() const {
  return impl_->Distribution();
}

template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) {
  return impl_->DevicePointer(device);
}

template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
  return impl_->ConstDevicePointer(device);
}

template <typename T>
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
  return impl_->DeviceSpan(device);
}

template <typename T>
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
  return impl_->ConstDeviceSpan(device);
}

template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) const {
  return impl_->DeviceStart(device);
}

template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) const {
  return impl_->DeviceSize(device);
}

template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) {  // NOLINT
  return impl_->tbegin(device);
}

template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin(int device) const {  // NOLINT
  return impl_->tcbegin(device);
}

template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) {  // NOLINT
  return impl_->tend(device);
}

template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcend(int device) const {  // NOLINT
  return impl_->tcend(device);
}
template <typename T>
void HostDeviceVector<T>::ScatterFrom
(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
  impl_->ScatterFrom(begin, end);
}

template <typename T>
void HostDeviceVector<T>::GatherTo
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const {
  impl_->GatherTo(begin, end);
}

template <typename T>
void HostDeviceVector<T>::Fill(T v) {
  impl_->Fill(v);
}

template <typename T>
void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other) {
  impl_->Copy(other.impl_);
}

template <typename T>
void HostDeviceVector<T>::Copy(const std::vector<T>& other) {
  impl_->Copy(other);
}

template <typename T>
void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
  impl_->Copy(other);
}

template <typename T>
std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->HostVector(); }

template <typename T>
const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
  return impl_->ConstHostVector();
}

template <typename T>
bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
  return impl_->HostCanAccess(access);
}

template <typename T>
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
  return impl_->DeviceCanAccess(device, access);
}

template <typename T>
void HostDeviceVector<T>::Shard(GPUSet new_devices) const {
  impl_->Shard(new_devices);
}

template <typename T>
void HostDeviceVector<T>::Shard(const GPUDistribution& distribution) const {
  impl_->Shard(distribution);
}

template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution& distribution) {
  impl_->Reshard(distribution);
}

template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
  impl_->Resize(new_size, v);
}

// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<int>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<size_t>;

}  // namespace xgboost