Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage. (#3446)
* Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage. - added distributions to HostDeviceVector - using HostDeviceVector for labels, weights and base margins in MetaInfo - using HostDeviceVector for offset and data in SparsePage - other necessary refactoring * Added const version of HostDeviceVector API calls. - const versions added to calls that can trigger data transfers, e.g. DevicePointer() - updated the code that uses HostDeviceVector - objective functions now accept const HostDeviceVector<bst_float>& for predictions * Updated src/linear/updater_gpu_coordinate.cu. * Added read-only state for HostDeviceVector sync. - this means no copies are performed if both host and devices access the HostDeviceVector read-only * Fixed linter and test errors. - updated the lz4 plugin - added ConstDeviceSpan to HostDeviceVector - using device % dh::NVisibleDevices() for the physical device number, e.g. in calls to cudaSetDevice() * Fixed explicit template instantiation errors for HostDeviceVector. - replaced HostDeviceVector<unsigned int> with HostDeviceVector<int> * Fixed HostDeviceVector tests that require multiple GPUs. - added a mock set device handler; when set, it is called instead of cudaSetDevice()
This commit is contained in:
committed by
Rory Mitchell
parent
58d783df16
commit
72cd1517d6
@@ -35,6 +35,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
|
||||
|
||||
auto iter = p_fmat->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
const auto& weights = info.weights_.HostVector();
|
||||
while (iter->Next()) {
|
||||
auto &batch = iter->Value();
|
||||
#pragma omp parallel num_threads(nthread)
|
||||
@@ -50,7 +51,8 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
|
||||
SparsePage::Inst inst = batch[i];
|
||||
for (auto& ins : inst) {
|
||||
if (ins.index >= begin && ins.index < end) {
|
||||
sketchs[ins.index].Push(ins.fvalue, info.GetWeight(ridx));
|
||||
sketchs[ins.index].Push(ins.fvalue,
|
||||
weights.size() > 0 ? weights[ridx] : 1.0f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -118,7 +118,7 @@ struct GPUSketcher {
|
||||
|
||||
void Init(const SparsePage& row_batch, const MetaInfo& info) {
|
||||
num_cols_ = info.num_col_;
|
||||
has_weights_ = info.weights_.size() > 0;
|
||||
has_weights_ = info.weights_.Size() > 0;
|
||||
|
||||
// find the batch size
|
||||
if (param_.gpu_batch_nrows == 0) {
|
||||
@@ -282,19 +282,23 @@ struct GPUSketcher {
|
||||
size_t batch_row_end = std::min((gpu_batch + 1) * gpu_batch_nrows_,
|
||||
static_cast<size_t>(n_rows_));
|
||||
size_t batch_nrows = batch_row_end - batch_row_begin;
|
||||
size_t n_entries =
|
||||
row_batch.offset[row_begin_ + batch_row_end] -
|
||||
row_batch.offset[row_begin_ + batch_row_begin];
|
||||
|
||||
const auto& offset_vec = row_batch.offset.HostVector();
|
||||
const auto& data_vec = row_batch.data.HostVector();
|
||||
|
||||
size_t n_entries = offset_vec[row_begin_ + batch_row_end] -
|
||||
offset_vec[row_begin_ + batch_row_begin];
|
||||
// copy the batch to the GPU
|
||||
dh::safe_cuda
|
||||
(cudaMemcpy(entries_.data().get(),
|
||||
&row_batch.data[row_batch.offset[row_begin_ + batch_row_begin]],
|
||||
data_vec.data() + offset_vec[row_begin_ + batch_row_begin],
|
||||
n_entries * sizeof(Entry), cudaMemcpyDefault));
|
||||
// copy the weights if necessary
|
||||
if (has_weights_) {
|
||||
const auto& weights_vec = info.weights_.HostVector();
|
||||
dh::safe_cuda
|
||||
(cudaMemcpy(weights_.data().get(),
|
||||
info.weights_.data() + row_begin_ + batch_row_begin,
|
||||
weights_vec.data() + row_begin_ + batch_row_begin,
|
||||
batch_nrows * sizeof(bst_float), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
@@ -310,7 +314,7 @@ struct GPUSketcher {
|
||||
row_ptrs_.data().get() + batch_row_begin,
|
||||
has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
|
||||
gpu_batch_nrows_, num_cols_,
|
||||
row_batch.offset[row_begin_ + batch_row_begin], batch_nrows);
|
||||
offset_vec[row_begin_ + batch_row_begin], batch_nrows);
|
||||
dh::safe_cuda(cudaGetLastError()); // NOLINT
|
||||
dh::safe_cuda(cudaDeviceSynchronize()); // NOLINT
|
||||
|
||||
@@ -331,13 +335,11 @@ struct GPUSketcher {
|
||||
void Sketch(const SparsePage& row_batch, const MetaInfo& info) {
|
||||
// copy rows to the device
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
const auto& offset_vec = row_batch.offset.HostVector();
|
||||
row_ptrs_.resize(n_rows_ + 1);
|
||||
thrust::copy(row_batch.offset.data() + row_begin_,
|
||||
row_batch.offset.data() + row_end_ + 1,
|
||||
row_ptrs_.begin());
|
||||
|
||||
thrust::copy(offset_vec.data() + row_begin_,
|
||||
offset_vec.data() + row_end_ + 1, row_ptrs_.begin());
|
||||
size_t gpu_nbatches = dh::DivRoundUp(n_rows_, gpu_batch_nrows_);
|
||||
|
||||
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
|
||||
SketchBatch(row_batch, info, gpu_batch);
|
||||
}
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
// dummy implementation of HostDeviceVector in case CUDA is not used
|
||||
|
||||
#include <xgboost/base.h>
|
||||
|
||||
#include <xgboost/data.h>
|
||||
#include <cstdint>
|
||||
#include <utility>
|
||||
#include "./host_device_vector.h"
|
||||
|
||||
@@ -14,25 +15,27 @@ namespace xgboost {
|
||||
|
||||
template <typename T>
|
||||
struct HostDeviceVectorImpl {
|
||||
explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v) {}
|
||||
HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init) {}
|
||||
explicit HostDeviceVectorImpl(std::vector<T> init) : data_h_(std::move(init)) {}
|
||||
explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v), distribution_() {}
|
||||
HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init), distribution_() {}
|
||||
explicit HostDeviceVectorImpl(std::vector<T> init) : data_h_(std::move(init)), distribution_() {}
|
||||
std::vector<T> data_h_;
|
||||
GPUDistribution distribution_;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUSet devices) : impl_(nullptr) {
|
||||
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUDistribution distribution)
|
||||
: impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(size, v);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUSet devices)
|
||||
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUDistribution distribution)
|
||||
: impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(init);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUSet devices)
|
||||
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUDistribution distribution)
|
||||
: impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(init);
|
||||
}
|
||||
@@ -44,33 +47,69 @@ HostDeviceVector<T>::~HostDeviceVector() {
|
||||
delete tmp;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
|
||||
: impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
|
||||
if (this == &other) {
|
||||
return *this;
|
||||
}
|
||||
delete impl_;
|
||||
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t HostDeviceVector<T>::Size() const { return impl_->data_h_.size(); }
|
||||
|
||||
template <typename T>
|
||||
GPUSet HostDeviceVector<T>::Devices() const { return GPUSet::Empty(); }
|
||||
|
||||
template <typename T>
|
||||
const GPUDistribution& HostDeviceVector<T>::Distribution() const {
|
||||
return impl_->distribution_;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; }
|
||||
|
||||
template <typename T>
|
||||
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
|
||||
return common::Span<T>();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
|
||||
return common::Span<const T>();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->data_h_; }
|
||||
|
||||
template <typename T>
|
||||
const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
|
||||
return impl_->data_h_;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
|
||||
impl_->data_h_.resize(new_size, v);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t HostDeviceVector<T>::DeviceStart(int device) { return 0; }
|
||||
size_t HostDeviceVector<T>::DeviceStart(int device) const { return 0; }
|
||||
|
||||
template <typename T>
|
||||
size_t HostDeviceVector<T>::DeviceSize(int device) { return 0; }
|
||||
size_t HostDeviceVector<T>::DeviceSize(int device) const { return 0; }
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::Fill(T v) {
|
||||
@@ -78,9 +117,9 @@ void HostDeviceVector<T>::Fill(T v) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::Copy(HostDeviceVector<T>* other) {
|
||||
CHECK_EQ(Size(), other->Size());
|
||||
std::copy(other->HostVector().begin(), other->HostVector().end(), HostVector().begin());
|
||||
void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other) {
|
||||
CHECK_EQ(Size(), other.Size());
|
||||
std::copy(other.HostVector().begin(), other.HostVector().end(), HostVector().begin());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -96,13 +135,27 @@ void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::Reshard(GPUSet devices) { }
|
||||
bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::Reshard(const GPUDistribution& distribution) const { }
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::Reshard(GPUSet devices) const { }
|
||||
|
||||
// explicit instantiations are required, as HostDeviceVector isn't header-only
|
||||
template class HostDeviceVector<bst_float>;
|
||||
template class HostDeviceVector<GradientPair>;
|
||||
template class HostDeviceVector<unsigned int>;
|
||||
template class HostDeviceVector<int>;
|
||||
template class HostDeviceVector<Entry>;
|
||||
template class HostDeviceVector<size_t>;
|
||||
|
||||
} // namespace xgboost
|
||||
|
||||
|
||||
@@ -2,119 +2,159 @@
|
||||
* Copyright 2017 XGBoost contributors
|
||||
*/
|
||||
|
||||
|
||||
#include <thrust/fill.h>
|
||||
#include "./host_device_vector.h"
|
||||
#include <thrust/fill.h>
|
||||
#include <xgboost/data.h>
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <mutex>
|
||||
#include "./device_helpers.cuh"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
// Signature of a replacement for cudaSetDevice: receives the device number.
using CudaSetDeviceHandlerFn = void (*)(int);

// When non-null, this callback is invoked in place of cudaSetDevice;
// it exists only so that tests can intercept device switches.
static CudaSetDeviceHandlerFn cudaSetDeviceHandler = nullptr;  // NOLINT

// Installs `handler` as the cudaSetDevice replacement; passing nullptr
// removes any previously installed handler.
void SetCudaSetDeviceHandler(void (*handler)(int)) {
  cudaSetDeviceHandler = handler;
}
|
||||
|
||||
// wrapper over access with useful methods
|
||||
class Permissions {
|
||||
GPUAccess access_;
|
||||
explicit Permissions(GPUAccess access) : access_(access) {}
|
||||
|
||||
public:
|
||||
Permissions() : access_(GPUAccess::kNone) {}
|
||||
explicit Permissions(bool perm)
|
||||
: access_(perm ? GPUAccess::kWrite : GPUAccess::kNone) {}
|
||||
|
||||
bool CanRead() const { return access_ >= kRead; }
|
||||
bool CanWrite() const { return access_ == kWrite; }
|
||||
bool CanAccess(GPUAccess access) const { return access_ >= access; }
|
||||
void Grant(GPUAccess access) { access_ = std::max(access_, access); }
|
||||
void DenyComplementary(GPUAccess compl_access) {
|
||||
access_ = std::min(access_, GPUAccess::kWrite - compl_access);
|
||||
}
|
||||
Permissions Complementary() const {
|
||||
return Permissions(GPUAccess::kWrite - access_);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct HostDeviceVectorImpl {
|
||||
struct DeviceShard {
|
||||
DeviceShard() : index_(-1), device_(-1), start_(0), on_d_(false), vec_(nullptr) {}
|
||||
|
||||
static size_t ShardStart(size_t size, int ndevices, int index) {
|
||||
size_t portion = dh::DivRoundUp(size, ndevices);
|
||||
size_t begin = index * portion;
|
||||
begin = begin > size ? size : begin;
|
||||
return begin;
|
||||
}
|
||||
|
||||
static size_t ShardSize(size_t size, int ndevices, int index) {
|
||||
size_t portion = dh::DivRoundUp(size, ndevices);
|
||||
size_t begin = index * portion, end = (index + 1) * portion;
|
||||
begin = begin > size ? size : begin;
|
||||
end = end > size ? size : end;
|
||||
return end - begin;
|
||||
}
|
||||
DeviceShard()
|
||||
: index_(-1), proper_size_(0), device_(-1), start_(0), perm_d_(false),
|
||||
cached_size_(~0), vec_(nullptr) {}
|
||||
|
||||
void Init(HostDeviceVectorImpl<T>* vec, int device) {
|
||||
if (vec_ == nullptr) { vec_ = vec; }
|
||||
CHECK_EQ(vec, vec_);
|
||||
device_ = device;
|
||||
index_ = vec_->devices_.Index(device);
|
||||
size_t size_h = vec_->Size();
|
||||
int ndevices = vec_->devices_.Size();
|
||||
start_ = ShardStart(size_h, ndevices, index_);
|
||||
size_t size_d = ShardSize(size_h, ndevices, index_);
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
data_.resize(size_d);
|
||||
on_d_ = !vec_->on_h_;
|
||||
index_ = vec_->distribution_.devices_.Index(device);
|
||||
LazyResize(vec_->Size());
|
||||
perm_d_ = vec_->perm_h_.Complementary();
|
||||
}
|
||||
|
||||
void ScatterFrom(const T* begin) {
|
||||
// TODO(canonizer): avoid full copy of host data
|
||||
LazySyncDevice();
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
LazySyncDevice(GPUAccess::kWrite);
|
||||
SetDevice();
|
||||
dh::safe_cuda(cudaMemcpy(data_.data().get(), begin + start_,
|
||||
data_.size() * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
void GatherTo(thrust::device_ptr<T> begin) {
|
||||
LazySyncDevice();
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
LazySyncDevice(GPUAccess::kRead);
|
||||
SetDevice();
|
||||
dh::safe_cuda(cudaMemcpy(begin.get() + start_, data_.data().get(),
|
||||
data_.size() * sizeof(T), cudaMemcpyDefault));
|
||||
proper_size_ * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
void Fill(T v) {
|
||||
// TODO(canonizer): avoid full copy of host data
|
||||
LazySyncDevice();
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
LazySyncDevice(GPUAccess::kWrite);
|
||||
SetDevice();
|
||||
thrust::fill(data_.begin(), data_.end(), v);
|
||||
}
|
||||
|
||||
void Copy(DeviceShard* other) {
|
||||
// TODO(canonizer): avoid full copy of host data for this (but not for other)
|
||||
LazySyncDevice();
|
||||
other->LazySyncDevice();
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
LazySyncDevice(GPUAccess::kWrite);
|
||||
other->LazySyncDevice(GPUAccess::kRead);
|
||||
SetDevice();
|
||||
dh::safe_cuda(cudaMemcpy(data_.data().get(), other->data_.data().get(),
|
||||
data_.size() * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
void LazySyncHost() {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
void LazySyncHost(GPUAccess access) {
|
||||
SetDevice();
|
||||
dh::safe_cuda(cudaMemcpy(vec_->data_h_.data() + start_,
|
||||
data_.data().get(), data_.size() * sizeof(T),
|
||||
data_.data().get(), proper_size_ * sizeof(T),
|
||||
cudaMemcpyDeviceToHost));
|
||||
on_d_ = false;
|
||||
perm_d_.DenyComplementary(access);
|
||||
}
|
||||
|
||||
void LazySyncDevice() {
|
||||
if (on_d_) { return; }
|
||||
// Re-derives this shard's layout for a vector of `new_size` elements and
// resizes its device buffer accordingly. Does nothing when `new_size` equals
// the size the shard was last laid out for (cached_size_).
void LazyResize(size_t new_size) {
  if (new_size == cached_size_) { return; }
  // resize is required: start offset, size to copy back to the host and
  // buffer size all come from the owning vector's distribution
  start_ = vec_->distribution_.ShardStart(new_size, index_);
  proper_size_ = vec_->distribution_.ShardProperSize(new_size, index_);
  const size_t size_d = vec_->distribution_.ShardSize(new_size, index_);
  // select this shard's device before touching its buffer
  SetDevice();
  data_.resize(size_d);
  cached_size_ = new_size;
}
|
||||
|
||||
void LazySyncDevice(GPUAccess access) {
|
||||
if (perm_d_.CanAccess(access)) { return; }
|
||||
if (perm_d_.CanRead()) {
|
||||
// deny read to the host
|
||||
perm_d_.Grant(access);
|
||||
std::lock_guard<std::mutex> lock(vec_->mutex_);
|
||||
vec_->perm_h_.DenyComplementary(access);
|
||||
return;
|
||||
}
|
||||
// data is on the host
|
||||
size_t size_h = vec_->data_h_.size();
|
||||
int ndevices = vec_->devices_.Size();
|
||||
start_ = ShardStart(size_h, ndevices, index_);
|
||||
size_t size_d = ShardSize(size_h, ndevices, index_);
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
data_.resize(size_d);
|
||||
dh::safe_cuda(cudaMemcpy(data_.data().get(),
|
||||
vec_->data_h_.data() + start_,
|
||||
size_d * sizeof(T), cudaMemcpyHostToDevice));
|
||||
on_d_ = true;
|
||||
// this may cause a race condition if LazySyncDevice() is called
|
||||
// from multiple threads in parallel;
|
||||
// however, the race condition is benign, and will not cause problems
|
||||
vec_->on_h_ = false;
|
||||
vec_->size_d_ = vec_->data_h_.size();
|
||||
LazyResize(size_h);
|
||||
SetDevice();
|
||||
dh::safe_cuda(
|
||||
cudaMemcpy(data_.data().get(), vec_->data_h_.data() + start_,
|
||||
data_.size() * sizeof(T), cudaMemcpyHostToDevice));
|
||||
perm_d_.Grant(access);
|
||||
|
||||
std::lock_guard<std::mutex> lock(vec_->mutex_);
|
||||
vec_->perm_h_.DenyComplementary(access);
|
||||
vec_->size_d_ = size_h;
|
||||
}
|
||||
|
||||
void SetDevice() {
|
||||
if (cudaSetDeviceHandler == nullptr) {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
} else {
|
||||
(*cudaSetDeviceHandler)(device_);
|
||||
}
|
||||
}
|
||||
|
||||
int index_;
|
||||
int device_;
|
||||
thrust::device_vector<T> data_;
|
||||
// cached vector size
|
||||
size_t cached_size_;
|
||||
size_t start_;
|
||||
// true if there is an up-to-date copy of data on device, false otherwise
|
||||
bool on_d_;
|
||||
// size of the portion to copy back to the host
|
||||
size_t proper_size_;
|
||||
Permissions perm_d_;
|
||||
HostDeviceVectorImpl<T>* vec_;
|
||||
};
|
||||
|
||||
HostDeviceVectorImpl(size_t size, T v, GPUSet devices)
|
||||
: devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) {
|
||||
if (!devices.IsEmpty()) {
|
||||
HostDeviceVectorImpl(size_t size, T v, GPUDistribution distribution)
|
||||
: distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
|
||||
if (!distribution_.IsEmpty()) {
|
||||
size_d_ = size;
|
||||
InitShards();
|
||||
Fill(v);
|
||||
@@ -123,11 +163,16 @@ struct HostDeviceVectorImpl {
|
||||
}
|
||||
}
|
||||
|
||||
// required, as a new std::mutex has to be created
|
||||
HostDeviceVectorImpl(const HostDeviceVectorImpl<T>& other)
|
||||
: data_h_(other.data_h_), perm_h_(other.perm_h_), size_d_(other.size_d_),
|
||||
distribution_(other.distribution_), mutex_(), shards_(other.shards_) {}
|
||||
|
||||
// Init can be std::vector<T> or std::initializer_list<T>
|
||||
template <class Init>
|
||||
HostDeviceVectorImpl(const Init& init, GPUSet devices)
|
||||
: devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) {
|
||||
if (!devices.IsEmpty()) {
|
||||
HostDeviceVectorImpl(const Init& init, GPUDistribution distribution)
|
||||
: distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
|
||||
if (!distribution_.IsEmpty()) {
|
||||
size_d_ = init.size();
|
||||
InitShards();
|
||||
Copy(init);
|
||||
@@ -137,58 +182,78 @@ struct HostDeviceVectorImpl {
|
||||
}
|
||||
|
||||
void InitShards() {
|
||||
int ndevices = devices_.Size();
|
||||
int ndevices = distribution_.devices_.Size();
|
||||
shards_.resize(ndevices);
|
||||
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
|
||||
shard.Init(this, devices_[i]);
|
||||
shard.Init(this, distribution_.devices_[i]);
|
||||
});
|
||||
}
|
||||
|
||||
HostDeviceVectorImpl(const HostDeviceVectorImpl<T>&) = delete;
|
||||
HostDeviceVectorImpl(HostDeviceVectorImpl<T>&&) = delete;
|
||||
void operator=(const HostDeviceVectorImpl<T>&) = delete;
|
||||
void operator=(HostDeviceVectorImpl<T>&&) = delete;
|
||||
size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : size_d_; }
|
||||
|
||||
size_t Size() const { return on_h_ ? data_h_.size() : size_d_; }
|
||||
GPUSet Devices() const { return distribution_.devices_; }
|
||||
|
||||
GPUSet Devices() const { return devices_; }
|
||||
const GPUDistribution& Distribution() const { return distribution_; }
|
||||
|
||||
T* DevicePointer(int device) {
|
||||
CHECK(devices_.Contains(device));
|
||||
LazySyncDevice(device);
|
||||
return shards_[devices_.Index(device)].data_.data().get();
|
||||
CHECK(distribution_.devices_.Contains(device));
|
||||
LazySyncDevice(device, GPUAccess::kWrite);
|
||||
return shards_[distribution_.devices_.Index(device)].data_.data().get();
|
||||
}
|
||||
|
||||
// Returns a read-only raw pointer to the start of the buffer held by the
// shard on `device`, after syncing that shard for read access.
// `device` must belong to this vector's distribution (CHECK-ed).
const T* ConstDevicePointer(int device) {
  GPUSet devices = distribution_.devices_;
  CHECK(devices.Contains(device));
  LazySyncDevice(device, GPUAccess::kRead);
  DeviceShard& shard = shards_[devices.Index(device)];
  return shard.data_.data().get();
}
|
||||
|
||||
// Returns a writable span over the buffer held by the shard on `device`,
// after syncing that shard for write access.
// `device` must belong to this vector's distribution (CHECK-ed).
common::Span<T> DeviceSpan(int device) {
  GPUSet devices = distribution_.devices_;
  CHECK(devices.Contains(device));
  LazySyncDevice(device, GPUAccess::kWrite);
  DeviceShard& shard = shards_[devices.Index(device)];
  // The span covers this shard's own buffer; using the size of the whole
  // vector would run past the end of the buffer whenever the data is split
  // across more than one device.
  return {shard.data_.data().get(),
          static_cast<typename common::Span<T>::index_type>(shard.data_.size())};
}
|
||||
|
||||
// Returns a read-only span over the buffer held by the shard on `device`,
// after syncing that shard for read access.
// `device` must belong to this vector's distribution (CHECK-ed).
common::Span<const T> ConstDeviceSpan(int device) {
  GPUSet devices = distribution_.devices_;
  CHECK(devices.Contains(device));
  LazySyncDevice(device, GPUAccess::kRead);
  DeviceShard& shard = shards_[devices.Index(device)];
  // The span covers this shard's own buffer; using the size of the whole
  // vector would run past the end of the buffer whenever the data is split
  // across more than one device.
  return {shard.data_.data().get(),
          static_cast<typename common::Span<const T>::index_type>(shard.data_.size())};
}
|
||||
|
||||
size_t DeviceSize(int device) {
|
||||
CHECK(devices_.Contains(device));
|
||||
LazySyncDevice(device);
|
||||
return shards_[devices_.Index(device)].data_.size();
|
||||
CHECK(distribution_.devices_.Contains(device));
|
||||
LazySyncDevice(device, GPUAccess::kRead);
|
||||
return shards_[distribution_.devices_.Index(device)].data_.size();
|
||||
}
|
||||
|
||||
size_t DeviceStart(int device) {
|
||||
CHECK(devices_.Contains(device));
|
||||
LazySyncDevice(device);
|
||||
return shards_[devices_.Index(device)].start_;
|
||||
CHECK(distribution_.devices_.Contains(device));
|
||||
LazySyncDevice(device, GPUAccess::kRead);
|
||||
return shards_[distribution_.devices_.Index(device)].start_;
|
||||
}
|
||||
|
||||
thrust::device_ptr<T> tbegin(int device) { // NOLINT
|
||||
return thrust::device_ptr<T>(DevicePointer(device));
|
||||
}
|
||||
|
||||
thrust::device_ptr<const T> tcbegin(int device) { // NOLINT
|
||||
return thrust::device_ptr<const T>(ConstDevicePointer(device));
|
||||
}
|
||||
|
||||
thrust::device_ptr<T> tend(int device) { // NOLINT
|
||||
return tbegin(device) + DeviceSize(device);
|
||||
}
|
||||
|
||||
void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
|
||||
thrust::device_ptr<const T> tcend(int device) { // NOLINT
|
||||
return tcbegin(device) + DeviceSize(device);
|
||||
}
|
||||
|
||||
void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
|
||||
CHECK_EQ(end - begin, Size());
|
||||
if (on_h_) {
|
||||
if (perm_h_.CanWrite()) {
|
||||
dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
|
||||
(end - begin) * sizeof(T),
|
||||
cudaMemcpyDeviceToHost));
|
||||
@@ -201,7 +266,7 @@ struct HostDeviceVectorImpl {
|
||||
|
||||
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
|
||||
CHECK_EQ(end - begin, Size());
|
||||
if (on_h_) {
|
||||
if (perm_h_.CanWrite()) {
|
||||
dh::safe_cuda(cudaMemcpy(begin.get(), data_h_.data(),
|
||||
data_h_.size() * sizeof(T),
|
||||
cudaMemcpyHostToDevice));
|
||||
@@ -211,7 +276,7 @@ struct HostDeviceVectorImpl {
|
||||
}
|
||||
|
||||
void Fill(T v) {
|
||||
if (on_h_) {
|
||||
if (perm_h_.CanWrite()) {
|
||||
std::fill(data_h_.begin(), data_h_.end(), v);
|
||||
} else {
|
||||
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.Fill(v); });
|
||||
@@ -220,10 +285,10 @@ struct HostDeviceVectorImpl {
|
||||
|
||||
void Copy(HostDeviceVectorImpl<T>* other) {
|
||||
CHECK_EQ(Size(), other->Size());
|
||||
if (on_h_ && other->on_h_) {
|
||||
if (perm_h_.CanWrite() && other->perm_h_.CanWrite()) {
|
||||
std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
|
||||
} else {
|
||||
CHECK(devices_ == other->devices_);
|
||||
CHECK(distribution_ == other->distribution_);
|
||||
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
|
||||
shard.Copy(&other->shards_[i]);
|
||||
});
|
||||
@@ -232,7 +297,7 @@ struct HostDeviceVectorImpl {
|
||||
|
||||
void Copy(const std::vector<T>& other) {
|
||||
CHECK_EQ(Size(), other.size());
|
||||
if (on_h_) {
|
||||
if (perm_h_.CanWrite()) {
|
||||
std::copy(other.begin(), other.end(), data_h_.begin());
|
||||
} else {
|
||||
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
|
||||
@@ -243,7 +308,7 @@ struct HostDeviceVectorImpl {
|
||||
|
||||
void Copy(std::initializer_list<T> other) {
|
||||
CHECK_EQ(Size(), other.size());
|
||||
if (on_h_) {
|
||||
if (perm_h_.CanWrite()) {
|
||||
std::copy(other.begin(), other.end(), data_h_.begin());
|
||||
} else {
|
||||
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
|
||||
@@ -253,72 +318,117 @@ struct HostDeviceVectorImpl {
|
||||
}
|
||||
|
||||
std::vector<T>& HostVector() {
|
||||
LazySyncHost();
|
||||
LazySyncHost(GPUAccess::kWrite);
|
||||
return data_h_;
|
||||
}
|
||||
|
||||
void Reshard(GPUSet new_devices) {
|
||||
if (devices_ == new_devices)
|
||||
return;
|
||||
CHECK(devices_.IsEmpty());
|
||||
devices_ = new_devices;
|
||||
const std::vector<T>& ConstHostVector() {
|
||||
LazySyncHost(GPUAccess::kRead);
|
||||
return data_h_;
|
||||
}
|
||||
|
||||
void Reshard(const GPUDistribution& distribution) {
|
||||
if (distribution_ == distribution) { return; }
|
||||
CHECK(distribution_.IsEmpty());
|
||||
distribution_ = distribution;
|
||||
InitShards();
|
||||
}
|
||||
|
||||
// Convenience overload: reshards onto `new_devices` using a block
// distribution. Nothing happens if the vector already uses that device set.
void Reshard(GPUSet new_devices) {
  const bool same_devices = (distribution_.Devices() == new_devices);
  if (!same_devices) {
    Reshard(GPUDistribution::Block(new_devices));
  }
}
|
||||
|
||||
void Resize(size_t new_size, T v) {
|
||||
if (new_size == Size())
|
||||
return;
|
||||
if (Size() == 0 && !devices_.IsEmpty()) {
|
||||
if (new_size == Size()) { return; }
|
||||
if (distribution_.IsFixedSize()) {
|
||||
CHECK_EQ(new_size, distribution_.offsets_.back());
|
||||
}
|
||||
if (Size() == 0 && !distribution_.IsEmpty()) {
|
||||
// fast on-device resize
|
||||
on_h_ = false;
|
||||
perm_h_ = Permissions(false);
|
||||
size_d_ = new_size;
|
||||
InitShards();
|
||||
Fill(v);
|
||||
} else {
|
||||
// resize on host
|
||||
LazySyncHost();
|
||||
LazySyncHost(GPUAccess::kWrite);
|
||||
data_h_.resize(new_size, v);
|
||||
}
|
||||
}
|
||||
|
||||
void LazySyncHost() {
|
||||
if (on_h_)
|
||||
void LazySyncHost(GPUAccess access) {
|
||||
if (perm_h_.CanAccess(access)) { return; }
|
||||
if (perm_h_.CanRead()) {
|
||||
// data is present, just need to deny access to the device
|
||||
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
|
||||
shard.perm_d_.DenyComplementary(access);
|
||||
});
|
||||
perm_h_.Grant(access);
|
||||
return;
|
||||
if (data_h_.size() != size_d_)
|
||||
data_h_.resize(size_d_);
|
||||
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.LazySyncHost(); });
|
||||
on_h_ = true;
|
||||
}
|
||||
if (data_h_.size() != size_d_) { data_h_.resize(size_d_); }
|
||||
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
|
||||
shard.LazySyncHost(access);
|
||||
});
|
||||
perm_h_.Grant(access);
|
||||
}
|
||||
|
||||
void LazySyncDevice(int device) {
|
||||
CHECK(devices_.Contains(device));
|
||||
shards_[devices_.Index(device)].LazySyncDevice();
|
||||
void LazySyncDevice(int device, GPUAccess access) {
|
||||
GPUSet devices = distribution_.Devices();
|
||||
CHECK(devices.Contains(device));
|
||||
shards_[devices.Index(device)].LazySyncDevice(access);
|
||||
}
|
||||
|
||||
bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); }
|
||||
|
||||
bool DeviceCanAccess(int device, GPUAccess access) {
|
||||
GPUSet devices = distribution_.Devices();
|
||||
if (!devices.Contains(device)) { return false; }
|
||||
return shards_[devices.Index(device)].perm_d_.CanAccess(access);
|
||||
}
|
||||
|
||||
std::vector<T> data_h_;
|
||||
bool on_h_;
|
||||
Permissions perm_h_;
|
||||
// the total size of the data stored on the devices
|
||||
size_t size_d_;
|
||||
GPUSet devices_;
|
||||
GPUDistribution distribution_;
|
||||
// protects size_d_ and perm_h_ when updated from multiple threads
|
||||
std::mutex mutex_;
|
||||
std::vector<DeviceShard> shards_;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUSet devices)
|
||||
: impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(size, v, devices);
|
||||
HostDeviceVector<T>::HostDeviceVector
|
||||
(size_t size, T v, GPUDistribution distribution) : impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(size, v, distribution);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUSet devices)
|
||||
: impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(init, devices);
|
||||
HostDeviceVector<T>::HostDeviceVector
|
||||
(std::initializer_list<T> init, GPUDistribution distribution) : impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(init, distribution);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUSet devices)
|
||||
HostDeviceVector<T>::HostDeviceVector
|
||||
(const std::vector<T>& init, GPUDistribution distribution) : impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(init, distribution);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
|
||||
: impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(init, devices);
|
||||
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>& HostDeviceVector<T>::operator=
|
||||
(const HostDeviceVector<T>& other) {
|
||||
if (this == &other) { return *this; }
|
||||
delete impl_;
|
||||
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -335,7 +445,19 @@ template <typename T>
|
||||
GPUSet HostDeviceVector<T>::Devices() const { return impl_->Devices(); }
|
||||
|
||||
template <typename T>
|
||||
T* HostDeviceVector<T>::DevicePointer(int device) { return impl_->DevicePointer(device); }
|
||||
const GPUDistribution& HostDeviceVector<T>::Distribution() const {
|
||||
return impl_->Distribution();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T* HostDeviceVector<T>::DevicePointer(int device) {
|
||||
return impl_->DevicePointer(device);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
|
||||
return impl_->ConstDevicePointer(device);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
|
||||
@@ -343,30 +465,49 @@ common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t HostDeviceVector<T>::DeviceStart(int device) { return impl_->DeviceStart(device); }
|
||||
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
|
||||
return impl_->ConstDeviceSpan(device);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t HostDeviceVector<T>::DeviceSize(int device) { return impl_->DeviceSize(device); }
|
||||
size_t HostDeviceVector<T>::DeviceStart(int device) const {
|
||||
return impl_->DeviceStart(device);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t HostDeviceVector<T>::DeviceSize(int device) const {
|
||||
return impl_->DeviceSize(device);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) { // NOLINT
|
||||
return impl_->tbegin(device);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin(int device) const { // NOLINT
|
||||
return impl_->tcbegin(device);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) { // NOLINT
|
||||
return impl_->tend(device);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<const T> HostDeviceVector<T>::tcend(int device) const { // NOLINT
|
||||
return impl_->tcend(device);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::ScatterFrom
|
||||
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
|
||||
(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
|
||||
impl_->ScatterFrom(begin, end);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::GatherTo
|
||||
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
|
||||
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const {
|
||||
impl_->GatherTo(begin, end);
|
||||
}
|
||||
|
||||
@@ -376,8 +517,8 @@ void HostDeviceVector<T>::Fill(T v) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::Copy(HostDeviceVector<T>* other) {
|
||||
impl_->Copy(other->impl_);
|
||||
void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other) {
|
||||
impl_->Copy(other.impl_);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -394,10 +535,30 @@ template <typename T>
|
||||
std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->HostVector(); }
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::Reshard(GPUSet new_devices) {
|
||||
const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
|
||||
return impl_->ConstHostVector();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
|
||||
return impl_->HostCanAccess(access);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
|
||||
return impl_->DeviceCanAccess(device, access);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::Reshard(GPUSet new_devices) const {
|
||||
impl_->Reshard(new_devices);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::Reshard(const GPUDistribution& distribution) const {
|
||||
impl_->Reshard(distribution);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
|
||||
impl_->Resize(new_size, v);
|
||||
@@ -406,7 +567,8 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
|
||||
// explicit instantiations are required, as HostDeviceVector isn't header-only
|
||||
template class HostDeviceVector<bst_float>;
|
||||
template class HostDeviceVector<GradientPair>;
|
||||
template class HostDeviceVector<unsigned int>;
|
||||
template class HostDeviceVector<int>;
|
||||
template class HostDeviceVector<Entry>;
|
||||
template class HostDeviceVector<size_t>;
|
||||
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,28 +1,6 @@
|
||||
/*!
|
||||
* Copyright 2017 XGBoost contributors
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
|
||||
#define XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
|
||||
|
||||
#include <dmlc/logging.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
|
||||
#include "gpu_set.h"
|
||||
#include "span.h"
|
||||
|
||||
// only include thrust-related files if host_device_vector.h
|
||||
// is included from a .cu file
|
||||
#ifdef __CUDACC__
|
||||
#include <thrust/device_ptr.h>
|
||||
#endif
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
template <typename T> struct HostDeviceVectorImpl;
|
||||
|
||||
/**
|
||||
* @file host_device_vector.h
|
||||
@@ -70,44 +48,203 @@ template <typename T> struct HostDeviceVectorImpl;
|
||||
* if different threads call these methods with different values of the device argument.
|
||||
* All other methods are not thread safe.
|
||||
*/
|
||||
|
||||
#ifndef XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
|
||||
#define XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
|
||||
|
||||
#include <dmlc/logging.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
|
||||
#include "gpu_set.h"
|
||||
#include "span.h"
|
||||
|
||||
// only include thrust-related files if host_device_vector.h
|
||||
// is included from a .cu file
|
||||
#ifdef __CUDACC__
|
||||
#include <thrust/device_ptr.h>
|
||||
#endif
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
#ifdef __CUDACC__
|
||||
// Sets a function to call instead of cudaSetDevice();
|
||||
// only added for testing
|
||||
void SetCudaSetDeviceHandler(void (*handler)(int));
|
||||
#endif
|
||||
|
||||
template <typename T> struct HostDeviceVectorImpl;
|
||||
|
||||
// Distribution for the HostDeviceVector; it specifies such aspects as the devices it is
|
||||
// distributed on, whether there are copies of elements from other GPUs as well as the granularity
|
||||
// of splitting. It may also specify explicit boundaries for devices, in which case the size of the
|
||||
// array cannot be changed.
|
||||
class GPUDistribution {
|
||||
template<typename T> friend struct HostDeviceVectorImpl;
|
||||
|
||||
public:
|
||||
explicit GPUDistribution(GPUSet devices = GPUSet::Empty())
|
||||
: devices_(devices), granularity_(1), overlap_(0) {}
|
||||
|
||||
private:
|
||||
GPUDistribution(GPUSet devices, int granularity, int overlap,
|
||||
std::vector<size_t> offsets)
|
||||
: devices_(devices), granularity_(granularity), overlap_(overlap),
|
||||
offsets_(std::move(offsets)) {}
|
||||
|
||||
public:
|
||||
static GPUDistribution Block(GPUSet devices) { return GPUDistribution(devices); }
|
||||
|
||||
static GPUDistribution Overlap(GPUSet devices, int overlap) {
|
||||
return GPUDistribution(devices, 1, overlap, std::vector<size_t>());
|
||||
}
|
||||
|
||||
static GPUDistribution Granular(GPUSet devices, int granularity) {
|
||||
return GPUDistribution(devices, granularity, 0, std::vector<size_t>());
|
||||
}
|
||||
|
||||
static GPUDistribution Explicit(GPUSet devices, std::vector<size_t> offsets) {
|
||||
return GPUDistribution(devices, 1, 0, offsets);
|
||||
}
|
||||
|
||||
friend bool operator==(const GPUDistribution& a, const GPUDistribution& b) {
|
||||
return a.devices_ == b.devices_ && a.granularity_ == b.granularity_ &&
|
||||
a.overlap_ == b.overlap_ && a.offsets_ == b.offsets_;
|
||||
}
|
||||
|
||||
friend bool operator!=(const GPUDistribution& a, const GPUDistribution& b) {
|
||||
return !(a == b);
|
||||
}
|
||||
|
||||
GPUSet Devices() const { return devices_; }
|
||||
|
||||
bool IsEmpty() const { return devices_.IsEmpty(); }
|
||||
|
||||
size_t ShardStart(size_t size, int index) const {
|
||||
if (size == 0) { return 0; }
|
||||
if (offsets_.size() > 0) {
|
||||
// explicit offsets are provided
|
||||
CHECK_EQ(offsets_.back(), size);
|
||||
return offsets_.at(index);
|
||||
}
|
||||
// no explicit offsets
|
||||
size_t begin = std::min(index * Portion(size), size);
|
||||
begin = begin > size ? size : begin;
|
||||
return begin;
|
||||
}
|
||||
|
||||
size_t ShardSize(size_t size, int index) const {
|
||||
if (size == 0) { return 0; }
|
||||
if (offsets_.size() > 0) {
|
||||
// explicit offsets are provided
|
||||
CHECK_EQ(offsets_.back(), size);
|
||||
return offsets_.at(index + 1) - offsets_.at(index) +
|
||||
(index == devices_.Size() - 1 ? overlap_ : 0);
|
||||
}
|
||||
size_t portion = Portion(size);
|
||||
size_t begin = std::min(index * portion, size);
|
||||
size_t end = std::min((index + 1) * portion + overlap_ * granularity_, size);
|
||||
return end - begin;
|
||||
}
|
||||
|
||||
size_t ShardProperSize(size_t size, int index) const {
|
||||
if (size == 0) { return 0; }
|
||||
return ShardSize(size, index) - (devices_.Size() - 1 > index ? overlap_ : 0);
|
||||
}
|
||||
|
||||
bool IsFixedSize() const { return !offsets_.empty(); }
|
||||
|
||||
private:
|
||||
static size_t DivRoundUp(size_t a, size_t b) { return (a + b - 1) / b; }
|
||||
static size_t RoundUp(size_t a, size_t b) { return DivRoundUp(a, b) * b; }
|
||||
|
||||
size_t Portion(size_t size) const {
|
||||
return RoundUp
|
||||
(DivRoundUp
|
||||
(std::max(static_cast<int64_t>(size - overlap_ * granularity_),
|
||||
static_cast<int64_t>(1)),
|
||||
devices_.Size()), granularity_);
|
||||
}
|
||||
|
||||
GPUSet devices_;
|
||||
int granularity_;
|
||||
int overlap_;
|
||||
// explicit offsets for the GPU parts, if any
|
||||
std::vector<size_t> offsets_;
|
||||
};
|
||||
|
||||
// Access levels, numbered in increasing order from kNone to kWrite.
enum GPUAccess {
  kNone = 0,
  kRead = 1,
  // write implies read
  kWrite = 2
};
|
||||
|
||||
// Difference of two access levels: subtracts the underlying integer values
// and converts the result back to GPUAccess.
inline GPUAccess operator-(GPUAccess a, GPUAccess b) {
  const int lhs = static_cast<int>(a);
  const int rhs = static_cast<int>(b);
  return static_cast<GPUAccess>(lhs - rhs);
}
|
||||
|
||||
template <typename T>
|
||||
class HostDeviceVector {
|
||||
public:
|
||||
explicit HostDeviceVector(size_t size = 0, T v = T(),
|
||||
GPUSet devices = GPUSet::Empty());
|
||||
HostDeviceVector(std::initializer_list<T> init, GPUSet devices = GPUSet::Empty());
|
||||
GPUDistribution distribution = GPUDistribution());
|
||||
HostDeviceVector(std::initializer_list<T> init,
|
||||
GPUDistribution distribution = GPUDistribution());
|
||||
explicit HostDeviceVector(const std::vector<T>& init,
|
||||
GPUSet devices = GPUSet::Empty());
|
||||
GPUDistribution distribution = GPUDistribution());
|
||||
~HostDeviceVector();
|
||||
HostDeviceVector(const HostDeviceVector<T>&) = delete;
|
||||
HostDeviceVector(HostDeviceVector<T>&&) = delete;
|
||||
void operator=(const HostDeviceVector<T>&) = delete;
|
||||
void operator=(HostDeviceVector<T>&&) = delete;
|
||||
HostDeviceVector(const HostDeviceVector<T>&);
|
||||
HostDeviceVector<T>& operator=(const HostDeviceVector<T>&);
|
||||
size_t Size() const;
|
||||
GPUSet Devices() const;
|
||||
T* DevicePointer(int device);
|
||||
const GPUDistribution& Distribution() const;
|
||||
common::Span<T> DeviceSpan(int device);
|
||||
common::Span<const T> ConstDeviceSpan(int device) const;
|
||||
common::Span<const T> DeviceSpan(int device) const { return ConstDeviceSpan(device); }
|
||||
T* DevicePointer(int device);
|
||||
const T* ConstDevicePointer(int device) const;
|
||||
const T* DevicePointer(int device) const { return ConstDevicePointer(device); }
|
||||
|
||||
T* HostPointer() { return HostVector().data(); }
|
||||
size_t DeviceStart(int device);
|
||||
size_t DeviceSize(int device);
|
||||
const T* ConstHostPointer() const { return ConstHostVector().data(); }
|
||||
const T* HostPointer() const { return ConstHostPointer(); }
|
||||
|
||||
size_t DeviceStart(int device) const;
|
||||
size_t DeviceSize(int device) const;
|
||||
|
||||
// only define functions returning device_ptr
|
||||
// if HostDeviceVector.h is included from a .cu file
|
||||
#ifdef __CUDACC__
|
||||
thrust::device_ptr<T> tbegin(int device); // NOLINT
|
||||
thrust::device_ptr<T> tend(int device); // NOLINT
|
||||
void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end);
|
||||
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end);
|
||||
thrust::device_ptr<const T> tcbegin(int device) const; // NOLINT
|
||||
thrust::device_ptr<const T> tcend(int device) const; // NOLINT
|
||||
thrust::device_ptr<const T> tbegin(int device) const { // NOLINT
|
||||
return tcbegin(device);
|
||||
}
|
||||
thrust::device_ptr<const T> tend(int device) const { return tcend(device); } // NOLINT
|
||||
|
||||
void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end);
|
||||
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const;
|
||||
#endif
|
||||
|
||||
void Fill(T v);
|
||||
void Copy(HostDeviceVector<T>* other);
|
||||
void Copy(const HostDeviceVector<T>& other);
|
||||
void Copy(const std::vector<T>& other);
|
||||
void Copy(std::initializer_list<T> other);
|
||||
|
||||
std::vector<T>& HostVector();
|
||||
void Reshard(GPUSet devices);
|
||||
const std::vector<T>& ConstHostVector() const;
|
||||
const std::vector<T>& HostVector() const {return ConstHostVector(); }
|
||||
|
||||
bool HostCanAccess(GPUAccess access) const;
|
||||
bool DeviceCanAccess(int device, GPUAccess access) const;
|
||||
|
||||
void Reshard(const GPUDistribution& distribution) const;
|
||||
void Reshard(GPUSet devices) const;
|
||||
void Resize(size_t new_size, T v = T());
|
||||
|
||||
private:
|
||||
|
||||
Reference in New Issue
Block a user