Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage. (#3446)

* Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage.

- added distributions to HostDeviceVector
- using HostDeviceVector for labels, weights and base margings in MetaInfo
- using HostDeviceVector for offset and data in SparsePage
- other necessary refactoring

* Added const version of HostDeviceVector API calls.

- const versions added to calls that can trigger data transfers, e.g. DevicePointer()
- updated the code that uses HostDeviceVector
- objective functions now accept const HostDeviceVector<bst_float>& for predictions

* Updated src/linear/updater_gpu_coordinate.cu.

* Added read-only state for HostDeviceVector sync.

- this means no copies are performed if both host and devices access
  the HostDeviceVector read-only

* Fixed linter and test errors.

- updated the lz4 plugin
- added ConstDeviceSpan to HostDeviceVector
- using device % dh::NVisibleDevices() for the physical device number,
  e.g. in calls to cudaSetDevice()

* Fixed explicit template instantiation errors for HostDeviceVector.

- replaced HostDeviceVector<unsigned int> with HostDeviceVector<int>

* Fixed HostDeviceVector tests that require multiple GPUs.

- added a mock set device handler; when set, it is called instead of cudaSetDevice()
This commit is contained in:
Andy Adinets
2018-08-30 04:28:47 +02:00
committed by Rory Mitchell
parent 58d783df16
commit 72cd1517d6
45 changed files with 1141 additions and 560 deletions

View File

@@ -35,6 +35,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
auto iter = p_fmat->RowIterator();
iter->BeforeFirst();
const auto& weights = info.weights_.HostVector();
while (iter->Next()) {
auto &batch = iter->Value();
#pragma omp parallel num_threads(nthread)
@@ -50,7 +51,8 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
SparsePage::Inst inst = batch[i];
for (auto& ins : inst) {
if (ins.index >= begin && ins.index < end) {
sketchs[ins.index].Push(ins.fvalue, info.GetWeight(ridx));
sketchs[ins.index].Push(ins.fvalue,
weights.size() > 0 ? weights[ridx] : 1.0f);
}
}
}

View File

@@ -118,7 +118,7 @@ struct GPUSketcher {
void Init(const SparsePage& row_batch, const MetaInfo& info) {
num_cols_ = info.num_col_;
has_weights_ = info.weights_.size() > 0;
has_weights_ = info.weights_.Size() > 0;
// find the batch size
if (param_.gpu_batch_nrows == 0) {
@@ -282,19 +282,23 @@ struct GPUSketcher {
size_t batch_row_end = std::min((gpu_batch + 1) * gpu_batch_nrows_,
static_cast<size_t>(n_rows_));
size_t batch_nrows = batch_row_end - batch_row_begin;
size_t n_entries =
row_batch.offset[row_begin_ + batch_row_end] -
row_batch.offset[row_begin_ + batch_row_begin];
const auto& offset_vec = row_batch.offset.HostVector();
const auto& data_vec = row_batch.data.HostVector();
size_t n_entries = offset_vec[row_begin_ + batch_row_end] -
offset_vec[row_begin_ + batch_row_begin];
// copy the batch to the GPU
dh::safe_cuda
(cudaMemcpy(entries_.data().get(),
&row_batch.data[row_batch.offset[row_begin_ + batch_row_begin]],
data_vec.data() + offset_vec[row_begin_ + batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault));
// copy the weights if necessary
if (has_weights_) {
const auto& weights_vec = info.weights_.HostVector();
dh::safe_cuda
(cudaMemcpy(weights_.data().get(),
info.weights_.data() + row_begin_ + batch_row_begin,
weights_vec.data() + row_begin_ + batch_row_begin,
batch_nrows * sizeof(bst_float), cudaMemcpyDefault));
}
@@ -310,7 +314,7 @@ struct GPUSketcher {
row_ptrs_.data().get() + batch_row_begin,
has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
gpu_batch_nrows_, num_cols_,
row_batch.offset[row_begin_ + batch_row_begin], batch_nrows);
offset_vec[row_begin_ + batch_row_begin], batch_nrows);
dh::safe_cuda(cudaGetLastError()); // NOLINT
dh::safe_cuda(cudaDeviceSynchronize()); // NOLINT
@@ -331,13 +335,11 @@ struct GPUSketcher {
void Sketch(const SparsePage& row_batch, const MetaInfo& info) {
// copy rows to the device
dh::safe_cuda(cudaSetDevice(device_));
const auto& offset_vec = row_batch.offset.HostVector();
row_ptrs_.resize(n_rows_ + 1);
thrust::copy(row_batch.offset.data() + row_begin_,
row_batch.offset.data() + row_end_ + 1,
row_ptrs_.begin());
thrust::copy(offset_vec.data() + row_begin_,
offset_vec.data() + row_end_ + 1, row_ptrs_.begin());
size_t gpu_nbatches = dh::DivRoundUp(n_rows_, gpu_batch_nrows_);
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
SketchBatch(row_batch, info, gpu_batch);
}

View File

@@ -6,7 +6,8 @@
// dummy implementation of HostDeviceVector in case CUDA is not used
#include <xgboost/base.h>
#include <xgboost/data.h>
#include <cstdint>
#include <utility>
#include "./host_device_vector.h"
@@ -14,25 +15,27 @@ namespace xgboost {
template <typename T>
struct HostDeviceVectorImpl {
explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v) {}
HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init) {}
explicit HostDeviceVectorImpl(std::vector<T> init) : data_h_(std::move(init)) {}
explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v), distribution_() {}
HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init), distribution_() {}
explicit HostDeviceVectorImpl(std::vector<T> init) : data_h_(std::move(init)), distribution_() {}
std::vector<T> data_h_;
GPUDistribution distribution_;
};
template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUSet devices) : impl_(nullptr) {
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUDistribution distribution)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUSet devices)
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUDistribution distribution)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUSet devices)
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUDistribution distribution)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
@@ -44,33 +47,69 @@ HostDeviceVector<T>::~HostDeviceVector() {
delete tmp;
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
}
template <typename T>
HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
if (this == &other) {
return *this;
}
delete impl_;
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
return *this;
}
template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->data_h_.size(); }
template <typename T>
GPUSet HostDeviceVector<T>::Devices() const { return GPUSet::Empty(); }
template <typename T>
const GPUDistribution& HostDeviceVector<T>::Distribution() const {
return impl_->distribution_;
}
template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; }
template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
return nullptr;
}
template <typename T>
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
return common::Span<T>();
}
template <typename T>
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
return common::Span<const T>();
}
template <typename T>
std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->data_h_; }
template <typename T>
const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
return impl_->data_h_;
}
template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
impl_->data_h_.resize(new_size, v);
}
template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) { return 0; }
size_t HostDeviceVector<T>::DeviceStart(int device) const { return 0; }
template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) { return 0; }
size_t HostDeviceVector<T>::DeviceSize(int device) const { return 0; }
template <typename T>
void HostDeviceVector<T>::Fill(T v) {
@@ -78,9 +117,9 @@ void HostDeviceVector<T>::Fill(T v) {
}
template <typename T>
void HostDeviceVector<T>::Copy(HostDeviceVector<T>* other) {
CHECK_EQ(Size(), other->Size());
std::copy(other->HostVector().begin(), other->HostVector().end(), HostVector().begin());
void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other) {
CHECK_EQ(Size(), other.Size());
std::copy(other.HostVector().begin(), other.HostVector().end(), HostVector().begin());
}
template <typename T>
@@ -96,13 +135,27 @@ void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
}
template <typename T>
void HostDeviceVector<T>::Reshard(GPUSet devices) { }
bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
return true;
}
template <typename T>
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
return false;
}
template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution& distribution) const { }
template <typename T>
void HostDeviceVector<T>::Reshard(GPUSet devices) const { }
// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<unsigned int>;
template class HostDeviceVector<int>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<size_t>;
} // namespace xgboost

View File

@@ -2,119 +2,159 @@
* Copyright 2017 XGBoost contributors
*/
#include <thrust/fill.h>
#include "./host_device_vector.h"
#include <thrust/fill.h>
#include <xgboost/data.h>
#include <algorithm>
#include <cstdint>
#include <mutex>
#include "./device_helpers.cuh"
namespace xgboost {
// the handler to call instead of cudaSetDevice; only used for testing
static void (*cudaSetDeviceHandler)(int) = nullptr; // NOLINT
void SetCudaSetDeviceHandler(void (*handler)(int)) {
cudaSetDeviceHandler = handler;
}
// wrapper over access with useful methods
class Permissions {
GPUAccess access_;
explicit Permissions(GPUAccess access) : access_(access) {}
public:
Permissions() : access_(GPUAccess::kNone) {}
explicit Permissions(bool perm)
: access_(perm ? GPUAccess::kWrite : GPUAccess::kNone) {}
bool CanRead() const { return access_ >= kRead; }
bool CanWrite() const { return access_ == kWrite; }
bool CanAccess(GPUAccess access) const { return access_ >= access; }
void Grant(GPUAccess access) { access_ = std::max(access_, access); }
void DenyComplementary(GPUAccess compl_access) {
access_ = std::min(access_, GPUAccess::kWrite - compl_access);
}
Permissions Complementary() const {
return Permissions(GPUAccess::kWrite - access_);
}
};
template <typename T>
struct HostDeviceVectorImpl {
struct DeviceShard {
DeviceShard() : index_(-1), device_(-1), start_(0), on_d_(false), vec_(nullptr) {}
static size_t ShardStart(size_t size, int ndevices, int index) {
size_t portion = dh::DivRoundUp(size, ndevices);
size_t begin = index * portion;
begin = begin > size ? size : begin;
return begin;
}
static size_t ShardSize(size_t size, int ndevices, int index) {
size_t portion = dh::DivRoundUp(size, ndevices);
size_t begin = index * portion, end = (index + 1) * portion;
begin = begin > size ? size : begin;
end = end > size ? size : end;
return end - begin;
}
DeviceShard()
: index_(-1), proper_size_(0), device_(-1), start_(0), perm_d_(false),
cached_size_(~0), vec_(nullptr) {}
void Init(HostDeviceVectorImpl<T>* vec, int device) {
if (vec_ == nullptr) { vec_ = vec; }
CHECK_EQ(vec, vec_);
device_ = device;
index_ = vec_->devices_.Index(device);
size_t size_h = vec_->Size();
int ndevices = vec_->devices_.Size();
start_ = ShardStart(size_h, ndevices, index_);
size_t size_d = ShardSize(size_h, ndevices, index_);
dh::safe_cuda(cudaSetDevice(device_));
data_.resize(size_d);
on_d_ = !vec_->on_h_;
index_ = vec_->distribution_.devices_.Index(device);
LazyResize(vec_->Size());
perm_d_ = vec_->perm_h_.Complementary();
}
void ScatterFrom(const T* begin) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice();
dh::safe_cuda(cudaSetDevice(device_));
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
dh::safe_cuda(cudaMemcpy(data_.data().get(), begin + start_,
data_.size() * sizeof(T), cudaMemcpyDefault));
}
void GatherTo(thrust::device_ptr<T> begin) {
LazySyncDevice();
dh::safe_cuda(cudaSetDevice(device_));
LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpy(begin.get() + start_, data_.data().get(),
data_.size() * sizeof(T), cudaMemcpyDefault));
proper_size_ * sizeof(T), cudaMemcpyDefault));
}
void Fill(T v) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice();
dh::safe_cuda(cudaSetDevice(device_));
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
thrust::fill(data_.begin(), data_.end(), v);
}
void Copy(DeviceShard* other) {
// TODO(canonizer): avoid full copy of host data for this (but not for other)
LazySyncDevice();
other->LazySyncDevice();
dh::safe_cuda(cudaSetDevice(device_));
LazySyncDevice(GPUAccess::kWrite);
other->LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpy(data_.data().get(), other->data_.data().get(),
data_.size() * sizeof(T), cudaMemcpyDefault));
}
void LazySyncHost() {
dh::safe_cuda(cudaSetDevice(device_));
void LazySyncHost(GPUAccess access) {
SetDevice();
dh::safe_cuda(cudaMemcpy(vec_->data_h_.data() + start_,
data_.data().get(), data_.size() * sizeof(T),
data_.data().get(), proper_size_ * sizeof(T),
cudaMemcpyDeviceToHost));
on_d_ = false;
perm_d_.DenyComplementary(access);
}
void LazySyncDevice() {
if (on_d_) { return; }
void LazyResize(size_t new_size) {
if (new_size == cached_size_) { return; }
// resize is required
int ndevices = vec_->distribution_.devices_.Size();
start_ = vec_->distribution_.ShardStart(new_size, index_);
proper_size_ = vec_->distribution_.ShardProperSize(new_size, index_);
size_t size_d = vec_->distribution_.ShardSize(new_size, index_);
SetDevice();
data_.resize(size_d);
cached_size_ = new_size;
}
void LazySyncDevice(GPUAccess access) {
if (perm_d_.CanAccess(access)) { return; }
if (perm_d_.CanRead()) {
// deny read to the host
perm_d_.Grant(access);
std::lock_guard<std::mutex> lock(vec_->mutex_);
vec_->perm_h_.DenyComplementary(access);
return;
}
// data is on the host
size_t size_h = vec_->data_h_.size();
int ndevices = vec_->devices_.Size();
start_ = ShardStart(size_h, ndevices, index_);
size_t size_d = ShardSize(size_h, ndevices, index_);
dh::safe_cuda(cudaSetDevice(device_));
data_.resize(size_d);
dh::safe_cuda(cudaMemcpy(data_.data().get(),
vec_->data_h_.data() + start_,
size_d * sizeof(T), cudaMemcpyHostToDevice));
on_d_ = true;
// this may cause a race condition if LazySyncDevice() is called
// from multiple threads in parallel;
// however, the race condition is benign, and will not cause problems
vec_->on_h_ = false;
vec_->size_d_ = vec_->data_h_.size();
LazyResize(size_h);
SetDevice();
dh::safe_cuda(
cudaMemcpy(data_.data().get(), vec_->data_h_.data() + start_,
data_.size() * sizeof(T), cudaMemcpyHostToDevice));
perm_d_.Grant(access);
std::lock_guard<std::mutex> lock(vec_->mutex_);
vec_->perm_h_.DenyComplementary(access);
vec_->size_d_ = size_h;
}
void SetDevice() {
if (cudaSetDeviceHandler == nullptr) {
dh::safe_cuda(cudaSetDevice(device_));
} else {
(*cudaSetDeviceHandler)(device_);
}
}
int index_;
int device_;
thrust::device_vector<T> data_;
// cached vector size
size_t cached_size_;
size_t start_;
// true if there is an up-to-date copy of data on device, false otherwise
bool on_d_;
// size of the portion to copy back to the host
size_t proper_size_;
Permissions perm_d_;
HostDeviceVectorImpl<T>* vec_;
};
HostDeviceVectorImpl(size_t size, T v, GPUSet devices)
: devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) {
if (!devices.IsEmpty()) {
HostDeviceVectorImpl(size_t size, T v, GPUDistribution distribution)
: distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
if (!distribution_.IsEmpty()) {
size_d_ = size;
InitShards();
Fill(v);
@@ -123,11 +163,16 @@ struct HostDeviceVectorImpl {
}
}
// required, as a new std::mutex has to be created
HostDeviceVectorImpl(const HostDeviceVectorImpl<T>& other)
: data_h_(other.data_h_), perm_h_(other.perm_h_), size_d_(other.size_d_),
distribution_(other.distribution_), mutex_(), shards_(other.shards_) {}
// Init can be std::vector<T> or std::initializer_list<T>
template <class Init>
HostDeviceVectorImpl(const Init& init, GPUSet devices)
: devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) {
if (!devices.IsEmpty()) {
HostDeviceVectorImpl(const Init& init, GPUDistribution distribution)
: distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
if (!distribution_.IsEmpty()) {
size_d_ = init.size();
InitShards();
Copy(init);
@@ -137,58 +182,78 @@ struct HostDeviceVectorImpl {
}
void InitShards() {
int ndevices = devices_.Size();
int ndevices = distribution_.devices_.Size();
shards_.resize(ndevices);
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
shard.Init(this, devices_[i]);
shard.Init(this, distribution_.devices_[i]);
});
}
HostDeviceVectorImpl(const HostDeviceVectorImpl<T>&) = delete;
HostDeviceVectorImpl(HostDeviceVectorImpl<T>&&) = delete;
void operator=(const HostDeviceVectorImpl<T>&) = delete;
void operator=(HostDeviceVectorImpl<T>&&) = delete;
size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : size_d_; }
size_t Size() const { return on_h_ ? data_h_.size() : size_d_; }
GPUSet Devices() const { return distribution_.devices_; }
GPUSet Devices() const { return devices_; }
const GPUDistribution& Distribution() const { return distribution_; }
T* DevicePointer(int device) {
CHECK(devices_.Contains(device));
LazySyncDevice(device);
return shards_[devices_.Index(device)].data_.data().get();
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kWrite);
return shards_[distribution_.devices_.Index(device)].data_.data().get();
}
const T* ConstDevicePointer(int device) {
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return shards_[distribution_.devices_.Index(device)].data_.data().get();
}
common::Span<T> DeviceSpan(int device) {
CHECK(devices_.Contains(device));
LazySyncDevice(device);
return { shards_[devices_.Index(device)].data_.data().get(),
static_cast<typename common::Span<T>::index_type>(Size()) };
GPUSet devices = distribution_.devices_;
CHECK(devices.Contains(device));
LazySyncDevice(device, GPUAccess::kWrite);
return {shards_[devices.Index(device)].data_.data().get(),
static_cast<typename common::Span<T>::index_type>(Size())};
}
common::Span<const T> ConstDeviceSpan(int device) {
GPUSet devices = distribution_.devices_;
CHECK(devices.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return {shards_[devices.Index(device)].data_.data().get(),
static_cast<typename common::Span<const T>::index_type>(Size())};
}
size_t DeviceSize(int device) {
CHECK(devices_.Contains(device));
LazySyncDevice(device);
return shards_[devices_.Index(device)].data_.size();
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return shards_[distribution_.devices_.Index(device)].data_.size();
}
size_t DeviceStart(int device) {
CHECK(devices_.Contains(device));
LazySyncDevice(device);
return shards_[devices_.Index(device)].start_;
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return shards_[distribution_.devices_.Index(device)].start_;
}
thrust::device_ptr<T> tbegin(int device) { // NOLINT
return thrust::device_ptr<T>(DevicePointer(device));
}
thrust::device_ptr<const T> tcbegin(int device) { // NOLINT
return thrust::device_ptr<const T>(ConstDevicePointer(device));
}
thrust::device_ptr<T> tend(int device) { // NOLINT
return tbegin(device) + DeviceSize(device);
}
void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
thrust::device_ptr<const T> tcend(int device) { // NOLINT
return tcbegin(device) + DeviceSize(device);
}
void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
CHECK_EQ(end - begin, Size());
if (on_h_) {
if (perm_h_.CanWrite()) {
dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
(end - begin) * sizeof(T),
cudaMemcpyDeviceToHost));
@@ -201,7 +266,7 @@ struct HostDeviceVectorImpl {
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
CHECK_EQ(end - begin, Size());
if (on_h_) {
if (perm_h_.CanWrite()) {
dh::safe_cuda(cudaMemcpy(begin.get(), data_h_.data(),
data_h_.size() * sizeof(T),
cudaMemcpyHostToDevice));
@@ -211,7 +276,7 @@ struct HostDeviceVectorImpl {
}
void Fill(T v) {
if (on_h_) {
if (perm_h_.CanWrite()) {
std::fill(data_h_.begin(), data_h_.end(), v);
} else {
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.Fill(v); });
@@ -220,10 +285,10 @@ struct HostDeviceVectorImpl {
void Copy(HostDeviceVectorImpl<T>* other) {
CHECK_EQ(Size(), other->Size());
if (on_h_ && other->on_h_) {
if (perm_h_.CanWrite() && other->perm_h_.CanWrite()) {
std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
} else {
CHECK(devices_ == other->devices_);
CHECK(distribution_ == other->distribution_);
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
shard.Copy(&other->shards_[i]);
});
@@ -232,7 +297,7 @@ struct HostDeviceVectorImpl {
void Copy(const std::vector<T>& other) {
CHECK_EQ(Size(), other.size());
if (on_h_) {
if (perm_h_.CanWrite()) {
std::copy(other.begin(), other.end(), data_h_.begin());
} else {
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
@@ -243,7 +308,7 @@ struct HostDeviceVectorImpl {
void Copy(std::initializer_list<T> other) {
CHECK_EQ(Size(), other.size());
if (on_h_) {
if (perm_h_.CanWrite()) {
std::copy(other.begin(), other.end(), data_h_.begin());
} else {
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
@@ -253,72 +318,117 @@ struct HostDeviceVectorImpl {
}
std::vector<T>& HostVector() {
LazySyncHost();
LazySyncHost(GPUAccess::kWrite);
return data_h_;
}
void Reshard(GPUSet new_devices) {
if (devices_ == new_devices)
return;
CHECK(devices_.IsEmpty());
devices_ = new_devices;
const std::vector<T>& ConstHostVector() {
LazySyncHost(GPUAccess::kRead);
return data_h_;
}
void Reshard(const GPUDistribution& distribution) {
if (distribution_ == distribution) { return; }
CHECK(distribution_.IsEmpty());
distribution_ = distribution;
InitShards();
}
void Reshard(GPUSet new_devices) {
if (distribution_.Devices() == new_devices) { return; }
Reshard(GPUDistribution::Block(new_devices));
}
void Resize(size_t new_size, T v) {
if (new_size == Size())
return;
if (Size() == 0 && !devices_.IsEmpty()) {
if (new_size == Size()) { return; }
if (distribution_.IsFixedSize()) {
CHECK_EQ(new_size, distribution_.offsets_.back());
}
if (Size() == 0 && !distribution_.IsEmpty()) {
// fast on-device resize
on_h_ = false;
perm_h_ = Permissions(false);
size_d_ = new_size;
InitShards();
Fill(v);
} else {
// resize on host
LazySyncHost();
LazySyncHost(GPUAccess::kWrite);
data_h_.resize(new_size, v);
}
}
void LazySyncHost() {
if (on_h_)
void LazySyncHost(GPUAccess access) {
if (perm_h_.CanAccess(access)) { return; }
if (perm_h_.CanRead()) {
// data is present, just need to deny access to the device
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
shard.perm_d_.DenyComplementary(access);
});
perm_h_.Grant(access);
return;
if (data_h_.size() != size_d_)
data_h_.resize(size_d_);
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.LazySyncHost(); });
on_h_ = true;
}
if (data_h_.size() != size_d_) { data_h_.resize(size_d_); }
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
shard.LazySyncHost(access);
});
perm_h_.Grant(access);
}
void LazySyncDevice(int device) {
CHECK(devices_.Contains(device));
shards_[devices_.Index(device)].LazySyncDevice();
void LazySyncDevice(int device, GPUAccess access) {
GPUSet devices = distribution_.Devices();
CHECK(devices.Contains(device));
shards_[devices.Index(device)].LazySyncDevice(access);
}
bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); }
bool DeviceCanAccess(int device, GPUAccess access) {
GPUSet devices = distribution_.Devices();
if (!devices.Contains(device)) { return false; }
return shards_[devices.Index(device)].perm_d_.CanAccess(access);
}
std::vector<T> data_h_;
bool on_h_;
Permissions perm_h_;
// the total size of the data stored on the devices
size_t size_d_;
GPUSet devices_;
GPUDistribution distribution_;
// protects size_d_ and perm_h_ when updated from multiple threads
std::mutex mutex_;
std::vector<DeviceShard> shards_;
};
template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUSet devices)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v, devices);
HostDeviceVector<T>::HostDeviceVector
(size_t size, T v, GPUDistribution distribution) : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v, distribution);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUSet devices)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init, devices);
HostDeviceVector<T>::HostDeviceVector
(std::initializer_list<T> init, GPUDistribution distribution) : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUSet devices)
HostDeviceVector<T>::HostDeviceVector
(const std::vector<T>& init, GPUDistribution distribution) : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init, devices);
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
}
template <typename T>
HostDeviceVector<T>& HostDeviceVector<T>::operator=
(const HostDeviceVector<T>& other) {
if (this == &other) { return *this; }
delete impl_;
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
return *this;
}
template <typename T>
@@ -335,7 +445,19 @@ template <typename T>
GPUSet HostDeviceVector<T>::Devices() const { return impl_->Devices(); }
template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) { return impl_->DevicePointer(device); }
const GPUDistribution& HostDeviceVector<T>::Distribution() const {
return impl_->Distribution();
}
template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) {
return impl_->DevicePointer(device);
}
template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
return impl_->ConstDevicePointer(device);
}
template <typename T>
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
@@ -343,30 +465,49 @@ common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
}
template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) { return impl_->DeviceStart(device); }
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
return impl_->ConstDeviceSpan(device);
}
template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) { return impl_->DeviceSize(device); }
size_t HostDeviceVector<T>::DeviceStart(int device) const {
return impl_->DeviceStart(device);
}
template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) const {
return impl_->DeviceSize(device);
}
template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) { // NOLINT
return impl_->tbegin(device);
}
template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin(int device) const { // NOLINT
return impl_->tcbegin(device);
}
template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) { // NOLINT
return impl_->tend(device);
}
template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcend(int device) const { // NOLINT
return impl_->tcend(device);
}
template <typename T>
void HostDeviceVector<T>::ScatterFrom
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
impl_->ScatterFrom(begin, end);
}
template <typename T>
void HostDeviceVector<T>::GatherTo
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const {
impl_->GatherTo(begin, end);
}
@@ -376,8 +517,8 @@ void HostDeviceVector<T>::Fill(T v) {
}
template <typename T>
void HostDeviceVector<T>::Copy(HostDeviceVector<T>* other) {
impl_->Copy(other->impl_);
void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other) {
impl_->Copy(other.impl_);
}
template <typename T>
@@ -394,10 +535,30 @@ template <typename T>
std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->HostVector(); }
template <typename T>
void HostDeviceVector<T>::Reshard(GPUSet new_devices) {
const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
return impl_->ConstHostVector();
}
template <typename T>
bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
return impl_->HostCanAccess(access);
}
template <typename T>
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
return impl_->DeviceCanAccess(device, access);
}
template <typename T>
void HostDeviceVector<T>::Reshard(GPUSet new_devices) const {
impl_->Reshard(new_devices);
}
template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution& distribution) const {
impl_->Reshard(distribution);
}
template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
impl_->Resize(new_size, v);
@@ -406,7 +567,8 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<unsigned int>;
template class HostDeviceVector<int>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<size_t>;
} // namespace xgboost

View File

@@ -1,28 +1,6 @@
/*!
* Copyright 2017 XGBoost contributors
*/
#ifndef XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
#define XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
#include <dmlc/logging.h>
#include <algorithm>
#include <cstdlib>
#include <initializer_list>
#include <vector>
#include "gpu_set.h"
#include "span.h"
// only include thrust-related files if host_device_vector.h
// is included from a .cu file
#ifdef __CUDACC__
#include <thrust/device_ptr.h>
#endif
namespace xgboost {
template <typename T> struct HostDeviceVectorImpl;
/**
* @file host_device_vector.h
@@ -70,44 +48,203 @@ template <typename T> struct HostDeviceVectorImpl;
* if different threads call these methods with different values of the device argument.
* All other methods are not thread safe.
*/
#ifndef XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
#define XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
#include <dmlc/logging.h>
#include <algorithm>
#include <cstdlib>
#include <initializer_list>
#include <vector>
#include "gpu_set.h"
#include "span.h"
// only include thrust-related files if host_device_vector.h
// is included from a .cu file
#ifdef __CUDACC__
#include <thrust/device_ptr.h>
#endif
namespace xgboost {
#ifdef __CUDACC__
// Sets a function to call instead of cudaSetDevice();
// only added for testing
void SetCudaSetDeviceHandler(void (*handler)(int));
#endif
template <typename T> struct HostDeviceVectorImpl;
// Distribution for the HostDeviceVector; it specifies such aspects as the devices it is
// distributed on, whether there are copies of elements from other GPUs as well as the granularity
// of splitting. It may also specify explicit boundaries for devices, in which case the size of the
// array cannot be changed.
class GPUDistribution {
template<typename T> friend struct HostDeviceVectorImpl;
public:
explicit GPUDistribution(GPUSet devices = GPUSet::Empty())
: devices_(devices), granularity_(1), overlap_(0) {}
private:
GPUDistribution(GPUSet devices, int granularity, int overlap,
std::vector<size_t> offsets)
: devices_(devices), granularity_(granularity), overlap_(overlap),
offsets_(std::move(offsets)) {}
public:
static GPUDistribution Block(GPUSet devices) { return GPUDistribution(devices); }
static GPUDistribution Overlap(GPUSet devices, int overlap) {
return GPUDistribution(devices, 1, overlap, std::vector<size_t>());
}
static GPUDistribution Granular(GPUSet devices, int granularity) {
return GPUDistribution(devices, granularity, 0, std::vector<size_t>());
}
static GPUDistribution Explicit(GPUSet devices, std::vector<size_t> offsets) {
return GPUDistribution(devices, 1, 0, offsets);
}
friend bool operator==(const GPUDistribution& a, const GPUDistribution& b) {
return a.devices_ == b.devices_ && a.granularity_ == b.granularity_ &&
a.overlap_ == b.overlap_ && a.offsets_ == b.offsets_;
}
friend bool operator!=(const GPUDistribution& a, const GPUDistribution& b) {
return !(a == b);
}
GPUSet Devices() const { return devices_; }
bool IsEmpty() const { return devices_.IsEmpty(); }
size_t ShardStart(size_t size, int index) const {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
CHECK_EQ(offsets_.back(), size);
return offsets_.at(index);
}
// no explicit offsets
size_t begin = std::min(index * Portion(size), size);
begin = begin > size ? size : begin;
return begin;
}
size_t ShardSize(size_t size, int index) const {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
CHECK_EQ(offsets_.back(), size);
return offsets_.at(index + 1) - offsets_.at(index) +
(index == devices_.Size() - 1 ? overlap_ : 0);
}
size_t portion = Portion(size);
size_t begin = std::min(index * portion, size);
size_t end = std::min((index + 1) * portion + overlap_ * granularity_, size);
return end - begin;
}
size_t ShardProperSize(size_t size, int index) const {
if (size == 0) { return 0; }
return ShardSize(size, index) - (devices_.Size() - 1 > index ? overlap_ : 0);
}
bool IsFixedSize() const { return !offsets_.empty(); }
private:
static size_t DivRoundUp(size_t a, size_t b) { return (a + b - 1) / b; }
static size_t RoundUp(size_t a, size_t b) { return DivRoundUp(a, b) * b; }
size_t Portion(size_t size) const {
return RoundUp
(DivRoundUp
(std::max(static_cast<int64_t>(size - overlap_ * granularity_),
static_cast<int64_t>(1)),
devices_.Size()), granularity_);
}
GPUSet devices_;
int granularity_;
int overlap_;
// explicit offsets for the GPU parts, if any
std::vector<size_t> offsets_;
};
enum GPUAccess {
kNone, kRead,
// write implies read
kWrite
};
inline GPUAccess operator-(GPUAccess a, GPUAccess b) {
return static_cast<GPUAccess>(static_cast<int>(a) - static_cast<int>(b));
}
template <typename T>
class HostDeviceVector {
public:
explicit HostDeviceVector(size_t size = 0, T v = T(),
GPUSet devices = GPUSet::Empty());
HostDeviceVector(std::initializer_list<T> init, GPUSet devices = GPUSet::Empty());
GPUDistribution distribution = GPUDistribution());
HostDeviceVector(std::initializer_list<T> init,
GPUDistribution distribution = GPUDistribution());
explicit HostDeviceVector(const std::vector<T>& init,
GPUSet devices = GPUSet::Empty());
GPUDistribution distribution = GPUDistribution());
~HostDeviceVector();
HostDeviceVector(const HostDeviceVector<T>&) = delete;
HostDeviceVector(HostDeviceVector<T>&&) = delete;
void operator=(const HostDeviceVector<T>&) = delete;
void operator=(HostDeviceVector<T>&&) = delete;
HostDeviceVector(const HostDeviceVector<T>&);
HostDeviceVector<T>& operator=(const HostDeviceVector<T>&);
size_t Size() const;
GPUSet Devices() const;
T* DevicePointer(int device);
const GPUDistribution& Distribution() const;
common::Span<T> DeviceSpan(int device);
common::Span<const T> ConstDeviceSpan(int device) const;
common::Span<const T> DeviceSpan(int device) const { return ConstDeviceSpan(device); }
T* DevicePointer(int device);
const T* ConstDevicePointer(int device) const;
const T* DevicePointer(int device) const { return ConstDevicePointer(device); }
T* HostPointer() { return HostVector().data(); }
size_t DeviceStart(int device);
size_t DeviceSize(int device);
const T* ConstHostPointer() const { return ConstHostVector().data(); }
const T* HostPointer() const { return ConstHostPointer(); }
size_t DeviceStart(int device) const;
size_t DeviceSize(int device) const;
// only define functions returning device_ptr
// if HostDeviceVector.h is included from a .cu file
#ifdef __CUDACC__
thrust::device_ptr<T> tbegin(int device); // NOLINT
thrust::device_ptr<T> tend(int device); // NOLINT
void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end);
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end);
thrust::device_ptr<const T> tcbegin(int device) const; // NOLINT
thrust::device_ptr<const T> tcend(int device) const; // NOLINT
thrust::device_ptr<const T> tbegin(int device) const { // NOLINT
return tcbegin(device);
}
thrust::device_ptr<const T> tend(int device) const { return tcend(device); } // NOLINT
void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end);
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const;
#endif
void Fill(T v);
void Copy(HostDeviceVector<T>* other);
void Copy(const HostDeviceVector<T>& other);
void Copy(const std::vector<T>& other);
void Copy(std::initializer_list<T> other);
std::vector<T>& HostVector();
void Reshard(GPUSet devices);
const std::vector<T>& ConstHostVector() const;
const std::vector<T>& HostVector() const {return ConstHostVector(); }
bool HostCanAccess(GPUAccess access) const;
bool DeviceCanAccess(int device, GPUAccess access) const;
void Reshard(const GPUDistribution& distribution) const;
void Reshard(GPUSet devices) const;
void Resize(size_t new_size, T v = T());
private: