* Set the appropriate device before freeing device memory.
  - PR #4532 added a global memory tracker/logger that keeps track of the number of (de)allocations and the peak memory usage on a per-device basis.
  - This PR adds the appropriate checks to make sure that the (de)allocation counts and memory usage make sense for the device, since verbosity is typically increased on debug/non-retail builds. (A sketch of this kind of per-device accounting follows this list.)
* Pre-create cub allocators and reuse them.
  - Create them once rather than resizing them dynamically; these allocators must be created and destroyed exactly once so that the appropriate device IDs are set.
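
A minimal sketch of per-device accounting around a free, roughly in the spirit of the tracker described above; GlobalMemoryLogger and FreeOnDevice are illustrative names, not the actual implementation from PR #4532:

#include <cuda_runtime.h>

#include <algorithm>
#include <cstddef>
#include <mutex>
#include <unordered_map>

class GlobalMemoryLogger {  // hypothetical; for illustration only
  std::mutex mutex_;
  std::unordered_map<int, std::size_t> num_allocations_, num_deallocations_;
  std::unordered_map<int, std::size_t> current_bytes_, peak_bytes_;

 public:
  void RegisterAllocation(int device, std::size_t bytes) {
    std::lock_guard<std::mutex> lock(mutex_);
    ++num_allocations_[device];
    current_bytes_[device] += bytes;
    peak_bytes_[device] = std::max(peak_bytes_[device], current_bytes_[device]);
  }
  void RegisterDeallocation(int device, std::size_t bytes) {
    std::lock_guard<std::mutex> lock(mutex_);
    ++num_deallocations_[device];
    current_bytes_[device] -= bytes;
  }
};

// Freeing with the owning device made current keeps the per-device counts consistent.
inline void FreeOnDevice(int device, void* ptr, std::size_t bytes,
                         GlobalMemoryLogger* logger) {
  cudaSetDevice(device);  // set the appropriate device before freeing
  cudaFree(ptr);
  logger->RegisterDeallocation(device, bytes);
}
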
/*!
 * Copyright 2017 XGBoost contributors
 */

#include "./host_device_vector.h"
#include <thrust/fill.h>
#include <xgboost/data.h>
#include <algorithm>
#include <cstdint>
#include <memory>
#include <mutex>
#include "./device_helpers.cuh"

namespace xgboost {

// the handler to call instead of cudaSetDevice; only used for testing
static void (*cudaSetDeviceHandler)(int) = nullptr;  // NOLINT

void SetCudaSetDeviceHandler(void (*handler)(int)) {
  cudaSetDeviceHandler = handler;
}
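
// Example (illustrative only): a test can install a handler that records device
// switches instead of actually calling cudaSetDevice, e.g.
//
//   static std::vector<int> devices_seen;             // hypothetical test state
//   void RecordDevice(int device) { devices_seen.push_back(device); }
//   ...
//   xgboost::SetCudaSetDeviceHandler(&RecordDevice);
//   // exercise HostDeviceVector, then inspect devices_seen
//   xgboost::SetCudaSetDeviceHandler(nullptr);        // restore the real cudaSetDevice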

// wrapper over access with useful methods
class Permissions {
  GPUAccess access_;
  explicit Permissions(GPUAccess access) : access_{access} {}

 public:
  Permissions() : access_{GPUAccess::kNone} {}
  explicit Permissions(bool perm)
    : access_(perm ? GPUAccess::kWrite : GPUAccess::kNone) {}

  bool CanRead() const { return access_ >= kRead; }
  bool CanWrite() const { return access_ == kWrite; }
  bool CanAccess(GPUAccess access) const { return access_ >= access; }
  void Grant(GPUAccess access) { access_ = std::max(access_, access); }
  void DenyComplementary(GPUAccess compl_access) {
    access_ = std::min(access_, GPUAccess::kWrite - compl_access);
  }
  Permissions Complementary() const {
    return Permissions(GPUAccess::kWrite - access_);
  }
};
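
// For instance, assuming the ordering kNone < kRead < kWrite used by GPUAccess:
// if one side is granted kRead, calling DenyComplementary(kRead) on the other
// side caps it at kRead, so both sides may read; granting kWrite on one side
// caps the other side at kNone.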

template <typename T>
struct HostDeviceVectorImpl {
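  // A DeviceShard owns the slice [start_, start_ + data_.size()) of the vector
  // that is assigned to a single device; proper_size_ is the portion copied
  // back to the host, and perm_d_ tracks read/write access to the device copy.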
  struct DeviceShard {
    DeviceShard()
      : proper_size_{0}, device_{-1}, start_{0}, perm_d_{false},
        cached_size_{static_cast<size_t>(~0)}, vec_{nullptr} {}

    ~DeviceShard() {
      SetDevice();
    }

    void Init(HostDeviceVectorImpl<T>* vec, int device) {
      if (vec_ == nullptr) { vec_ = vec; }
      CHECK_EQ(vec, vec_);
      device_ = device;
      LazyResize(vec_->Size());
      perm_d_ = vec_->perm_h_.Complementary();
    }

    void Init(HostDeviceVectorImpl<T>* vec, const DeviceShard& other) {
      if (vec_ == nullptr) { vec_ = vec; }
      CHECK_EQ(vec, vec_);
      device_ = other.device_;
      cached_size_ = other.cached_size_;
      start_ = other.start_;
      proper_size_ = other.proper_size_;
      SetDevice();
      data_.resize(other.data_.size());
      perm_d_ = other.perm_d_;
    }

    void ScatterFrom(const T* begin) {
      // TODO(canonizer): avoid full copy of host data
      LazySyncDevice(GPUAccess::kWrite);
      SetDevice();
      dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), begin + start_,
                                    data_.size() * sizeof(T), cudaMemcpyDefault));
    }

    void GatherTo(thrust::device_ptr<T> begin) {
      LazySyncDevice(GPUAccess::kRead);
      SetDevice();
      dh::safe_cuda(cudaMemcpyAsync(begin.get() + start_, data_.data().get(),
                                    proper_size_ * sizeof(T), cudaMemcpyDefault));
    }

    void Fill(T v) {
      // TODO(canonizer): avoid full copy of host data
      LazySyncDevice(GPUAccess::kWrite);
      SetDevice();
      thrust::fill(data_.begin(), data_.end(), v);
    }

    void Copy(DeviceShard* other) {
      // TODO(canonizer): avoid full copy of host data for this (but not for other)
      LazySyncDevice(GPUAccess::kWrite);
      other->LazySyncDevice(GPUAccess::kRead);
      SetDevice();
      dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), other->data_.data().get(),
                                    data_.size() * sizeof(T), cudaMemcpyDefault));
    }

    void LazySyncHost(GPUAccess access) {
      SetDevice();
      dh::safe_cuda(cudaMemcpy(vec_->data_h_.data() + start_,
                               data_.data().get(), proper_size_ * sizeof(T),
                               cudaMemcpyDeviceToHost));
      perm_d_.DenyComplementary(access);
    }

    void LazyResize(size_t new_size) {
      if (new_size == cached_size_) { return; }
      // resize is required
      int ndevices = vec_->distribution_.devices_.Size();
      int device_index = vec_->distribution_.devices_.Index(device_);
      start_ = vec_->distribution_.ShardStart(new_size, device_index);
      proper_size_ = vec_->distribution_.ShardProperSize(new_size, device_index);
      // The size on this device.
      size_t size_d = vec_->distribution_.ShardSize(new_size, device_index);
      SetDevice();
      data_.resize(size_d);
      cached_size_ = new_size;
    }

    void LazySyncDevice(GPUAccess access) {
      if (perm_d_.CanAccess(access)) { return; }
      if (perm_d_.CanRead()) {
        // deny read to the host
        perm_d_.Grant(access);
        std::lock_guard<std::mutex> lock(vec_->mutex_);
        vec_->perm_h_.DenyComplementary(access);
        return;
      }
      // data is on the host
      size_t size_h = vec_->data_h_.size();
      LazyResize(size_h);
      SetDevice();
      dh::safe_cuda(
          cudaMemcpy(data_.data().get(), vec_->data_h_.data() + start_,
                     data_.size() * sizeof(T), cudaMemcpyHostToDevice));
      perm_d_.Grant(access);

      std::lock_guard<std::mutex> lock(vec_->mutex_);
      vec_->perm_h_.DenyComplementary(access);
      vec_->size_d_ = size_h;
    }

    void SetDevice() {
      if (cudaSetDeviceHandler == nullptr) {
        dh::safe_cuda(cudaSetDevice(device_));
      } else {
        (*cudaSetDeviceHandler)(device_);
      }
    }

    T* Raw() { return data_.data().get(); }
    size_t Start() const { return start_; }
    size_t DataSize() const { return data_.size(); }
    Permissions& Perm() { return perm_d_; }
    Permissions const& Perm() const { return perm_d_; }

   private:
    int device_;
    dh::device_vector<T> data_;
    // cached vector size
    size_t cached_size_;
    size_t start_;
    // size of the portion to copy back to the host
    size_t proper_size_;
    Permissions perm_d_;
    HostDeviceVectorImpl<T>* vec_;
  };

  HostDeviceVectorImpl(size_t size, T v, const GPUDistribution &distribution)
    : distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
    if (!distribution_.IsEmpty()) {
      size_d_ = size;
      InitShards();
      Fill(v);
    } else {
      data_h_.resize(size, v);
    }
  }

  // required, as a new std::mutex has to be created
  HostDeviceVectorImpl(const HostDeviceVectorImpl<T>& other)
    : data_h_(other.data_h_), perm_h_(other.perm_h_), size_d_(other.size_d_),
      distribution_(other.distribution_), mutex_() {
    shards_.resize(other.shards_.size());
    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
      shard.Init(this, other.shards_.at(i));
    });
  }

  // Initializer can be std::vector<T> or std::initializer_list<T>
  template <class Initializer>
  HostDeviceVectorImpl(const Initializer& init, const GPUDistribution &distribution)
    : distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
    if (!distribution_.IsEmpty()) {
      size_d_ = init.size();
      InitShards();
      Copy(init);
    } else {
      data_h_ = init;
    }
  }

  void InitShards() {
    int ndevices = distribution_.devices_.Size();
    shards_.resize(ndevices);
    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
      shard.Init(this, distribution_.devices_.DeviceId(i));
    });
  }

  size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : size_d_; }

  GPUSet Devices() const { return distribution_.devices_; }

  const GPUDistribution& Distribution() const { return distribution_; }

  T* DevicePointer(int device) {
    CHECK(distribution_.devices_.Contains(device));
    LazySyncDevice(device, GPUAccess::kWrite);
    return shards_.at(distribution_.devices_.Index(device)).Raw();
  }

  const T* ConstDevicePointer(int device) {
    CHECK(distribution_.devices_.Contains(device));
    LazySyncDevice(device, GPUAccess::kRead);
    return shards_.at(distribution_.devices_.Index(device)).Raw();
  }

  common::Span<T> DeviceSpan(int device) {
    GPUSet devices = distribution_.devices_;
    CHECK(devices.Contains(device));
    LazySyncDevice(device, GPUAccess::kWrite);
    return {shards_.at(devices.Index(device)).Raw(),
            static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
  }

  common::Span<const T> ConstDeviceSpan(int device) {
    GPUSet devices = distribution_.devices_;
    CHECK(devices.Contains(device));
    LazySyncDevice(device, GPUAccess::kRead);
    using SpanInd = typename common::Span<const T>::index_type;
    return {shards_.at(devices.Index(device)).Raw(),
            static_cast<SpanInd>(DeviceSize(device))};
  }

  size_t DeviceSize(int device) {
    CHECK(distribution_.devices_.Contains(device));
    LazySyncDevice(device, GPUAccess::kRead);
    return shards_.at(distribution_.devices_.Index(device)).DataSize();
  }

  size_t DeviceStart(int device) {
    CHECK(distribution_.devices_.Contains(device));
    LazySyncDevice(device, GPUAccess::kRead);
    return shards_.at(distribution_.devices_.Index(device)).Start();
  }

  thrust::device_ptr<T> tbegin(int device) {  // NOLINT
    return thrust::device_ptr<T>(DevicePointer(device));
  }

  thrust::device_ptr<const T> tcbegin(int device) {  // NOLINT
    return thrust::device_ptr<const T>(ConstDevicePointer(device));
  }

  thrust::device_ptr<T> tend(int device) {  // NOLINT
    return tbegin(device) + DeviceSize(device);
  }

  thrust::device_ptr<const T> tcend(int device) {  // NOLINT
    return tcbegin(device) + DeviceSize(device);
  }

  void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
    CHECK_EQ(end - begin, Size());
    if (perm_h_.CanWrite()) {
      dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
                               (end - begin) * sizeof(T),
                               cudaMemcpyDeviceToHost));
    } else {
      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
        shard.ScatterFrom(begin.get());
      });
    }
  }

  void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
    CHECK_EQ(end - begin, Size());
    if (perm_h_.CanWrite()) {
      dh::safe_cuda(cudaMemcpy(begin.get(), data_h_.data(),
                               data_h_.size() * sizeof(T),
                               cudaMemcpyHostToDevice));
    } else {
      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.GatherTo(begin); });
    }
  }

  void Fill(T v) {  // NOLINT
    if (perm_h_.CanWrite()) {
      std::fill(data_h_.begin(), data_h_.end(), v);
    } else {
      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.Fill(v); });
    }
  }

  void Copy(HostDeviceVectorImpl<T>* other) {
    CHECK_EQ(Size(), other->Size());
    // Data is on host.
    if (perm_h_.CanWrite() && other->perm_h_.CanWrite()) {
      std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
      return;
    }
    // Data is on device;
    if (distribution_ != other->distribution_) {
      distribution_ = GPUDistribution();
      Shard(other->Distribution());
      size_d_ = other->size_d_;
    }
    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
      shard.Copy(&other->shards_.at(i));
    });
  }

  void Copy(const std::vector<T>& other) {
    CHECK_EQ(Size(), other.size());
    if (perm_h_.CanWrite()) {
      std::copy(other.begin(), other.end(), data_h_.begin());
    } else {
      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
        shard.ScatterFrom(other.data());
      });
    }
  }

  void Copy(std::initializer_list<T> other) {
    CHECK_EQ(Size(), other.size());
    if (perm_h_.CanWrite()) {
      std::copy(other.begin(), other.end(), data_h_.begin());
    } else {
      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
        shard.ScatterFrom(other.begin());
      });
    }
  }

  std::vector<T>& HostVector() {
    LazySyncHost(GPUAccess::kWrite);
    return data_h_;
  }

  const std::vector<T>& ConstHostVector() {
    LazySyncHost(GPUAccess::kRead);
    return data_h_;
  }

  void Shard(const GPUDistribution& distribution) {
    if (distribution_ == distribution) { return; }
    CHECK(distribution_.IsEmpty())
      << "This: " << distribution_.Devices().Size() << ", "
      << "Others: " << distribution.Devices().Size();
    distribution_ = distribution;
    InitShards();
  }

  void Shard(GPUSet new_devices) {
    if (distribution_.Devices() == new_devices) { return; }
    Shard(GPUDistribution::Block(new_devices));
  }

  void Reshard(const GPUDistribution &distribution) {
    if (distribution_ == distribution) { return; }
    LazySyncHost(GPUAccess::kWrite);
    distribution_ = distribution;
    shards_.clear();
    InitShards();
  }

  void Resize(size_t new_size, T v) {
    if (new_size == Size()) { return; }
    if (distribution_.IsFixedSize()) {
      CHECK_EQ(new_size, distribution_.offsets_.back());
    }
    if (Size() == 0 && !distribution_.IsEmpty()) {
      // fast on-device resize
      perm_h_ = Permissions(false);
      size_d_ = new_size;
      InitShards();
      Fill(v);
    } else {
      // resize on host
      LazySyncHost(GPUAccess::kWrite);
      data_h_.resize(new_size, v);
    }
  }

  void LazySyncHost(GPUAccess access) {
    if (perm_h_.CanAccess(access)) { return; }
    if (perm_h_.CanRead()) {
      // data is present, just need to deny access to the device
      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
        shard.Perm().DenyComplementary(access);
      });
      perm_h_.Grant(access);
      return;
    }
    if (data_h_.size() != size_d_) { data_h_.resize(size_d_); }
    dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
      shard.LazySyncHost(access);
    });
    perm_h_.Grant(access);
  }

  void LazySyncDevice(int device, GPUAccess access) {
    GPUSet devices = distribution_.Devices();
    CHECK(devices.Contains(device));
    shards_.at(devices.Index(device)).LazySyncDevice(access);
  }

  bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); }

  bool DeviceCanAccess(int device, GPUAccess access) {
    GPUSet devices = distribution_.Devices();
    if (!devices.Contains(device)) { return false; }
    return shards_.at(devices.Index(device)).Perm().CanAccess(access);
  }

 private:
  std::vector<T> data_h_;
  Permissions perm_h_;
  // the total size of the data stored on the devices
  size_t size_d_;
  GPUDistribution distribution_;
  // protects size_d_ and perm_h_ when updated from multiple threads
  std::mutex mutex_;
  std::vector<DeviceShard> shards_;
};

template <typename T>
HostDeviceVector<T>::HostDeviceVector
(size_t size, T v, const GPUDistribution &distribution) : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(size, v, distribution);
}

template <typename T>
HostDeviceVector<T>::HostDeviceVector
(std::initializer_list<T> init, const GPUDistribution &distribution) : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}

template <typename T>
HostDeviceVector<T>::HostDeviceVector
(const std::vector<T>& init, const GPUDistribution &distribution) : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}

template <typename T>
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
  : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
}

template <typename T>
HostDeviceVector<T>& HostDeviceVector<T>::operator=
(const HostDeviceVector<T>& other) {
  if (this == &other) { return *this; }

  std::unique_ptr<HostDeviceVectorImpl<T>> newImpl(new HostDeviceVectorImpl<T>(*other.impl_));
  delete impl_;
  impl_ = newImpl.release();
  return *this;
}

template <typename T>
HostDeviceVector<T>::~HostDeviceVector() {
  delete impl_;
  impl_ = nullptr;
}

template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }

template <typename T>
GPUSet HostDeviceVector<T>::Devices() const { return impl_->Devices(); }

template <typename T>
const GPUDistribution& HostDeviceVector<T>::Distribution() const {
  return impl_->Distribution();
}

template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) {
  return impl_->DevicePointer(device);
}

template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
  return impl_->ConstDevicePointer(device);
}

template <typename T>
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
  return impl_->DeviceSpan(device);
}

template <typename T>
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
  return impl_->ConstDeviceSpan(device);
}

template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) const {
  return impl_->DeviceStart(device);
}

template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) const {
  return impl_->DeviceSize(device);
}

template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) {  // NOLINT
  return impl_->tbegin(device);
}

template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin(int device) const {  // NOLINT
  return impl_->tcbegin(device);
}

template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) {  // NOLINT
  return impl_->tend(device);
}

template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcend(int device) const {  // NOLINT
  return impl_->tcend(device);
}

template <typename T>
void HostDeviceVector<T>::ScatterFrom
(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
  impl_->ScatterFrom(begin, end);
}

template <typename T>
void HostDeviceVector<T>::GatherTo
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const {
  impl_->GatherTo(begin, end);
}

template <typename T>
void HostDeviceVector<T>::Fill(T v) {
  impl_->Fill(v);
}

template <typename T>
void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other) {
  impl_->Copy(other.impl_);
}

template <typename T>
void HostDeviceVector<T>::Copy(const std::vector<T>& other) {
  impl_->Copy(other);
}

template <typename T>
void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
  impl_->Copy(other);
}

template <typename T>
std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->HostVector(); }

template <typename T>
const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
  return impl_->ConstHostVector();
}

template <typename T>
bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
  return impl_->HostCanAccess(access);
}

template <typename T>
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
  return impl_->DeviceCanAccess(device, access);
}

template <typename T>
void HostDeviceVector<T>::Shard(GPUSet new_devices) const {
  impl_->Shard(new_devices);
}

template <typename T>
void HostDeviceVector<T>::Shard(const GPUDistribution &distribution) const {
  impl_->Shard(distribution);
}

template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution &distribution) {
  impl_->Reshard(distribution);
}

template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
  impl_->Resize(new_size, v);
}

// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<int>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<size_t>;

}  // namespace xgboost
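
// Usage sketch (illustrative only; `devices` is assumed to be a valid GPUSet):
//
//   HostDeviceVector<bst_float> vec(100, 0.0f, GPUDistribution::Block(devices));
//   vec.Fill(1.0f);                                    // runs on the device shards
//   auto span = vec.DeviceSpan(devices.DeviceId(0));   // device write access;
//                                                      // host access is revoked
//   const auto& h = vec.ConstHostVector();             // lazily copies back to the host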