Make HostDeviceVector single gpu only (#4773)

* Make HostDeviceVector single gpu only
This commit is contained in:
Rong Ou
2019-08-25 14:51:13 -07:00
committed by Rory Mitchell
parent 41227d1933
commit 38ab79f889
54 changed files with 641 additions and 1621 deletions

View File

@@ -22,48 +22,12 @@ using RandomThreadLocalStore = dmlc::ThreadLocalStore<RandomThreadLocalEntry>;
// Accessor for the per-thread global random engine. Each thread gets its own
// engine instance via RandomThreadLocalStore (a dmlc::ThreadLocalStore of
// RandomThreadLocalEntry), so callers never share engine state across threads.
GlobalRandomEngine& GlobalRandom() {
return RandomThreadLocalStore::Get()->engine;
}
} // namespace common
#if !defined(XGBOOST_USE_CUDA)
int AllVisibleImpl::AllVisible() {
int AllVisibleGPUs() {
return 0;
}
#endif // !defined(XGBOOST_USE_CUDA)
constexpr GPUSet::GpuIdType GPUSet::kAll;
// Resolve the set of devices to run on, starting at `gpu_id'.
// Both `n_gpus' and `n_rows' act as upper bounds on the number of devices
// used; n_gpus == kAll (-1) means "all visible devices from gpu_id onward".
// Returns Empty() (i.e. CPU execution) when no GPU is visible, n_gpus is 0,
// or there are no rows to process.
GPUSet GPUSet::All(GpuIdType gpu_id, GpuIdType n_gpus, int32_t n_rows) {
CHECK_GE(gpu_id, 0) << "gpu_id must be >= 0.";
CHECK_GE(n_gpus, -1) << "n_gpus must be >= -1.";
GpuIdType const n_devices_visible = AllVisible().Size();
CHECK_LE(n_gpus, n_devices_visible);
// Nothing to distribute: fall back to CPU.
if (n_devices_visible == 0 || n_gpus == 0 || n_rows == 0) {
LOG(DEBUG) << "Runing on CPU.";
return Empty();
}
// Devices remaining once we skip the first `gpu_id' of them.
GpuIdType const n_available_devices = n_devices_visible - gpu_id;
if (n_gpus == kAll) { // Use all devices starting from `gpu_id'.
CHECK(gpu_id < n_devices_visible)
<< "\ngpu_id should be less than number of visible devices.\ngpu_id: "
<< gpu_id
<< ", number of visible devices: "
<< n_devices_visible;
// Never use more devices than there are rows of data.
GpuIdType n_devices =
n_available_devices < n_rows ? n_available_devices : n_rows;
LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
return Range(gpu_id, n_devices);
} else { // Use devices in ( gpu_id, gpu_id + n_gpus ).
CHECK_LE(n_gpus, n_available_devices)
<< "Starting from gpu id: " << gpu_id << ", there are only "
<< n_available_devices << " available devices, while n_gpus is set to: "
<< n_gpus;
// Cap the explicitly requested device count by the row count.
GpuIdType n_devices = n_gpus < n_rows ? n_gpus : n_rows;
LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
return Range(gpu_id, n_devices);
}
}
} // namespace common
} // namespace xgboost

View File

@@ -4,8 +4,9 @@
#include "common.h"
namespace xgboost {
namespace common {
int AllVisibleImpl::AllVisible() {
int AllVisibleGPUs() {
int n_visgpus = 0;
try {
// When compiled with CUDA but running on CPU only device,
@@ -17,4 +18,5 @@ int AllVisibleImpl::AllVisible() {
return n_visgpus;
}
} // namespace common
} // namespace xgboost

View File

@@ -140,88 +140,8 @@ class Range {
Iterator begin_;
Iterator end_;
};
int AllVisibleGPUs();
} // namespace common
struct AllVisibleImpl {
static int AllVisible();
};
/* \brief set of devices across which HostDeviceVector can be distributed.
*
* Currently implemented as a range, but can be changed later to something else,
* e.g. a bitset
*/
class GPUSet {
public:
using GpuIdType = int;
// Sentinel value meaning "use all visible devices".
static constexpr GpuIdType kAll = -1;
// Represents the half-open device-id range [start, start + ndevices).
explicit GPUSet(int start = 0, int ndevices = 0)
: devices_(start, start + ndevices) {}
static GPUSet Empty() { return GPUSet(); }
// A non-positive `n_gpus' yields the empty set.
static GPUSet Range(GpuIdType start, GpuIdType n_gpus) {
return n_gpus <= 0 ? Empty() : GPUSet{start, n_gpus};
}
/*! \brief n_gpus and num_rows both are upper bounds. */
static GPUSet All(GpuIdType gpu_id, GpuIdType n_gpus,
GpuIdType num_rows = std::numeric_limits<GpuIdType>::max());
// Set of all devices currently visible to the process.
static GPUSet AllVisible() {
GpuIdType n = AllVisibleImpl::AllVisible();
return Range(0, n);
}
// Number of devices in the set; an inverted range counts as empty.
size_t Size() const {
GpuIdType size = *devices_.end() - *devices_.begin();
GpuIdType res = size < 0 ? 0 : size;
return static_cast<size_t>(res);
}
/*
* By default, we have two configurations of identifying device, one
* is the device id obtained from `cudaGetDevice'. But we sometimes
* store objects that allocated one for each device in a list, which
* requires a zero-based index.
*
* Hence, `DeviceId' converts a zero-based index to actual device id,
* `Index' converts a device id to a zero-based index.
*/
GpuIdType DeviceId(size_t index) const {
GpuIdType result = *devices_.begin() + static_cast<GpuIdType>(index);
CHECK(Contains(result)) << "\nDevice " << result << " is not in GPUSet."
<< "\nIndex: " << index
<< "\nGPUSet: (" << *begin() << ", " << *end() << ")"
<< std::endl;
return result;
}
GpuIdType Index(GpuIdType device) const below aborts (via CHECK) when the
size_t Index(GpuIdType device) const {
CHECK(Contains(device)) << "\nDevice " << device << " is not in GPUSet."
<< "\nGPUSet: (" << *begin() << ", " << *end() << ")"
<< std::endl;
size_t result = static_cast<size_t>(device - *devices_.begin());
return result;
}
bool IsEmpty() const { return Size() == 0; }
// True iff `device' lies inside the half-open range [begin, end).
bool Contains(GpuIdType device) const {
return *devices_.begin() <= device && device < *devices_.end();
}
common::Range::Iterator begin() const { return devices_.begin(); } // NOLINT
common::Range::Iterator end() const { return devices_.end(); } // NOLINT
friend bool operator==(const GPUSet& lhs, const GPUSet& rhs) {
return lhs.devices_ == rhs.devices_;
}
friend bool operator!=(const GPUSet& lhs, const GPUSet& rhs) {
return !(lhs == rhs);
}
private:
// Underlying half-open range of device ids.
common::Range devices_;
};
} // namespace xgboost
#endif // XGBOOST_COMMON_COMMON_H_

View File

@@ -72,22 +72,6 @@ const T *Raw(const thrust::device_vector<T> &v) { // NOLINT
return raw_pointer_cast(v.data());
}
// if n_devices=-1, then use all visible devices
inline void SynchronizeNDevices(xgboost::GPUSet devices) {
devices = devices.IsEmpty() ? xgboost::GPUSet::AllVisible() : devices;
for (auto const d : devices) {
safe_cuda(cudaSetDevice(d));
safe_cuda(cudaDeviceSynchronize());
}
}
inline void SynchronizeAll() {
for (int device_idx : xgboost::GPUSet::AllVisible()) {
safe_cuda(cudaSetDevice(device_idx));
safe_cuda(cudaDeviceSynchronize());
}
}
inline size_t AvailableMemory(int device_idx) {
size_t device_free = 0;
size_t device_total = 0;
@@ -119,7 +103,7 @@ inline size_t MaxSharedMemory(int device_idx) {
}
inline void CheckComputeCapability() {
for (int d_idx : xgboost::GPUSet::AllVisible()) {
for (int d_idx = 0; d_idx < xgboost::common::AllVisibleGPUs(); ++d_idx) {
cudaDeviceProp prop;
safe_cuda(cudaGetDeviceProperties(&prop, d_idx));
std::ostringstream oss;

View File

@@ -35,7 +35,6 @@ __global__ void FindCutsK
if (icut >= ncuts) {
return;
}
WXQSketch::Entry v;
int isample = 0;
if (icut == 0) {
isample = 0;
@@ -59,11 +58,14 @@ struct IsNotNaN {
__device__ bool operator()(float a) const { return !isnan(a); }
};
__global__ void UnpackFeaturesK
(float* __restrict__ fvalues, float* __restrict__ feature_weights,
const size_t* __restrict__ row_ptrs, const float* __restrict__ weights,
Entry* entries, size_t nrows_array, int ncols, size_t row_begin_ptr,
size_t nrows) {
__global__ void UnpackFeaturesK(float* __restrict__ fvalues,
float* __restrict__ feature_weights,
const size_t* __restrict__ row_ptrs,
const float* __restrict__ weights,
Entry* entries,
size_t nrows_array,
size_t row_begin_ptr,
size_t nrows) {
size_t irow = threadIdx.x + size_t(blockIdx.x) * blockDim.x;
if (irow >= nrows) {
return;
@@ -102,8 +104,9 @@ struct SketchContainer {
const MetaInfo &info = dmat->Info();
// Initialize Sketches for this dmatrix
sketches_.resize(info.num_col_);
#pragma omp parallel for schedule(static) if (info.num_col_ > kOmpNumColsParallelizeLimit)
for (int icol = 0; icol < info.num_col_; ++icol) {
#pragma omp parallel for default(none) shared(info, param) schedule(static) \
if (info.num_col_ > kOmpNumColsParallelizeLimit) // NOLINT
for (int icol = 0; icol < info.num_col_; ++icol) { // NOLINT
sketches_[icol].Init(info.num_row_, 1.0 / (8 * param.max_bin));
}
}
@@ -120,8 +123,6 @@ struct GPUSketcher {
// manage memory for a single GPU
class DeviceShard {
int device_;
bst_uint row_begin_; // The row offset for this shard
bst_uint row_end_;
bst_uint n_rows_;
int num_cols_{0};
size_t n_cuts_{0};
@@ -131,27 +132,31 @@ struct GPUSketcher {
tree::TrainParam param_;
SketchContainer *sketch_container_;
dh::device_vector<size_t> row_ptrs_;
dh::device_vector<Entry> entries_;
dh::device_vector<bst_float> fvalues_;
dh::device_vector<bst_float> feature_weights_;
dh::device_vector<bst_float> fvalues_cur_;
dh::device_vector<WXQSketch::Entry> cuts_d_;
thrust::host_vector<WXQSketch::Entry> cuts_h_;
dh::device_vector<bst_float> weights_;
dh::device_vector<bst_float> weights2_;
std::vector<size_t> n_cuts_cur_;
dh::device_vector<size_t> num_elements_;
dh::device_vector<char> tmp_storage_;
dh::device_vector<size_t> row_ptrs_{};
dh::device_vector<Entry> entries_{};
dh::device_vector<bst_float> fvalues_{};
dh::device_vector<bst_float> feature_weights_{};
dh::device_vector<bst_float> fvalues_cur_{};
dh::device_vector<WXQSketch::Entry> cuts_d_{};
thrust::host_vector<WXQSketch::Entry> cuts_h_{};
dh::device_vector<bst_float> weights_{};
dh::device_vector<bst_float> weights2_{};
std::vector<size_t> n_cuts_cur_{};
dh::device_vector<size_t> num_elements_{};
dh::device_vector<char> tmp_storage_{};
public:
DeviceShard(int device, bst_uint row_begin, bst_uint row_end,
tree::TrainParam param, SketchContainer *sketch_container) :
device_(device), row_begin_(row_begin), row_end_(row_end),
n_rows_(row_end - row_begin), param_(std::move(param)), sketch_container_(sketch_container) {
DeviceShard(int device,
bst_uint n_rows,
tree::TrainParam param,
SketchContainer* sketch_container) :
device_(device),
n_rows_(n_rows),
param_(std::move(param)),
sketch_container_(sketch_container) {
}
~DeviceShard() {
~DeviceShard() { // NOLINT
dh::safe_cuda(cudaSetDevice(device_));
}
@@ -319,19 +324,18 @@ struct GPUSketcher {
const auto& offset_vec = row_batch.offset.HostVector();
const auto& data_vec = row_batch.data.HostVector();
size_t n_entries = offset_vec[row_begin_ + batch_row_end] -
offset_vec[row_begin_ + batch_row_begin];
size_t n_entries = offset_vec[batch_row_end] - offset_vec[batch_row_begin];
// copy the batch to the GPU
dh::safe_cuda
(cudaMemcpyAsync(entries_.data().get(),
data_vec.data() + offset_vec[row_begin_ + batch_row_begin],
data_vec.data() + offset_vec[batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault));
// copy the weights if necessary
if (has_weights_) {
const auto& weights_vec = info.weights_.HostVector();
dh::safe_cuda
(cudaMemcpyAsync(weights_.data().get(),
weights_vec.data() + row_begin_ + batch_row_begin,
weights_vec.data() + batch_row_begin,
batch_nrows * sizeof(bst_float), cudaMemcpyDefault));
}
@@ -349,8 +353,7 @@ struct GPUSketcher {
(fvalues_.data().get(), has_weights_ ? feature_weights_.data().get() : nullptr,
row_ptrs_.data().get() + batch_row_begin,
has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
gpu_batch_nrows_, num_cols_,
offset_vec[row_begin_ + batch_row_begin], batch_nrows);
gpu_batch_nrows_, offset_vec[batch_row_begin], batch_nrows);
for (int icol = 0; icol < num_cols_; ++icol) {
FindColumnCuts(batch_nrows, icol);
@@ -358,7 +361,7 @@ struct GPUSketcher {
// add cuts into sketches
thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin());
#pragma omp parallel for schedule(static) \
#pragma omp parallel for default(none) schedule(static) \
if (num_cols_ > SketchContainer::kOmpNumColsParallelizeLimit) // NOLINT
for (int icol = 0; icol < num_cols_; ++icol) {
WXQSketch::SummaryContainer summary;
@@ -391,8 +394,7 @@ struct GPUSketcher {
dh::safe_cuda(cudaSetDevice(device_));
const auto& offset_vec = row_batch.offset.HostVector();
row_ptrs_.resize(n_rows_ + 1);
thrust::copy(offset_vec.data() + row_begin_,
offset_vec.data() + row_end_ + 1, row_ptrs_.begin());
thrust::copy(offset_vec.data(), offset_vec.data() + n_rows_ + 1, row_ptrs_.begin());
size_t gpu_nbatches = common::DivRoundUp(n_rows_, gpu_batch_nrows_);
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
SketchBatch(row_batch, info, gpu_batch);
@@ -401,32 +403,18 @@ struct GPUSketcher {
};
void SketchBatch(const SparsePage &batch, const MetaInfo &info) {
GPUDistribution dist =
GPUDistribution::Block(GPUSet::All(generic_param_.gpu_id, generic_param_.n_gpus,
batch.Size()));
auto device = generic_param_.gpu_id;
// create device shards
shards_.resize(dist.Devices().Size());
dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
size_t start = dist.ShardStart(batch.Size(), i);
size_t size = dist.ShardSize(batch.Size(), i);
shard = std::unique_ptr<DeviceShard>(
new DeviceShard(dist.Devices().DeviceId(i), start,
start + size, param_, sketch_container_.get()));
});
// create device shard
shard_.reset(new DeviceShard(device, batch.Size(), param_, sketch_container_.get()));
// compute sketches for each shard
dh::ExecuteIndexShards(&shards_,
[&](int idx, std::unique_ptr<DeviceShard>& shard) {
shard->Init(batch, info, gpu_batch_nrows_);
shard->Sketch(batch, info);
shard->ComputeRowStride();
});
// compute sketches for the shard
shard_->Init(batch, info, gpu_batch_nrows_);
shard_->Sketch(batch, info);
shard_->ComputeRowStride();
// compute row stride across all shards
for (const auto &shard : shards_) {
row_stride_ = std::max(row_stride_, shard->GetRowStride());
}
// compute row stride
row_stride_ = shard_->GetRowStride();
}
GPUSketcher(const tree::TrainParam &param, const GenericParameter &generic_param, int gpu_nrows)
@@ -444,13 +432,13 @@ struct GPUSketcher {
this->SketchBatch(batch, info);
}
hmat->Init(&sketch_container_.get()->sketches_, param_.max_bin);
hmat->Init(&sketch_container_->sketches_, param_.max_bin);
return row_stride_;
}
private:
std::vector<std::unique_ptr<DeviceShard>> shards_;
std::unique_ptr<DeviceShard> shard_;
const tree::TrainParam &param_;
const GenericParameter &generic_param_;
int gpu_batch_nrows_;

View File

@@ -30,19 +30,19 @@ struct HostDeviceVectorImpl {
};
template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, const GPUDistribution &)
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, const GPUDistribution &)
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, const GPUDistribution &)
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
@@ -75,29 +75,23 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
template <typename T>
GPUSet HostDeviceVector<T>::Devices() const { return GPUSet::Empty(); }
int HostDeviceVector<T>::DeviceIdx() const { return -1; }
template <typename T>
const GPUDistribution& HostDeviceVector<T>::Distribution() const {
static GPUDistribution dummyInstance;
return dummyInstance;
}
T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; }
template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
const T* HostDeviceVector<T>::ConstDevicePointer() const {
return nullptr;
}
template <typename T>
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
common::Span<T> HostDeviceVector<T>::DeviceSpan() {
return common::Span<T>();
}
template <typename T>
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
return common::Span<const T>();
}
@@ -115,10 +109,7 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
}
template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) const { return 0; }
template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) const { return 0; }
size_t HostDeviceVector<T>::DeviceSize() const { return 0; }
template <typename T>
void HostDeviceVector<T>::Fill(T v) {
@@ -149,18 +140,12 @@ bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
}
template <typename T>
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
return false;
}
template <typename T>
void HostDeviceVector<T>::Shard(const GPUDistribution& distribution) const { }
template <typename T>
void HostDeviceVector<T>::Shard(GPUSet devices) const { }
template <typename T>
void Reshard(const GPUDistribution &distribution) { }
void HostDeviceVector<T>::SetDevice(int device) const {}
// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;

View File

@@ -10,7 +10,6 @@
#include <mutex>
#include "./device_helpers.cuh"
namespace xgboost {
// the handler to call instead of cudaSetDevice; only used for testing
@@ -43,144 +42,12 @@ class Permissions {
};
template <typename T>
struct HostDeviceVectorImpl {
struct DeviceShard {
DeviceShard()
: proper_size_{0}, device_{-1}, start_{0}, perm_d_{false},
cached_size_{static_cast<size_t>(~0)}, vec_{nullptr} {}
~DeviceShard() {
class HostDeviceVectorImpl {
public:
HostDeviceVectorImpl(size_t size, T v, int device) : device_(device), perm_h_(device < 0) {
if (device >= 0) {
SetDevice();
}
void Init(HostDeviceVectorImpl<T>* vec, int device) {
if (vec_ == nullptr) { vec_ = vec; }
CHECK_EQ(vec, vec_);
device_ = device;
LazyResize(vec_->Size());
perm_d_ = vec_->perm_h_.Complementary();
}
void Init(HostDeviceVectorImpl<T>* vec, const DeviceShard& other) {
if (vec_ == nullptr) { vec_ = vec; }
CHECK_EQ(vec, vec_);
device_ = other.device_;
cached_size_ = other.cached_size_;
start_ = other.start_;
proper_size_ = other.proper_size_;
SetDevice();
data_.resize(other.data_.size());
perm_d_ = other.perm_d_;
}
void ScatterFrom(const T* begin) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), begin + start_,
data_.size() * sizeof(T), cudaMemcpyDefault));
}
void GatherTo(thrust::device_ptr<T> begin) {
LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(begin.get() + start_, data_.data().get(),
proper_size_ * sizeof(T), cudaMemcpyDefault));
}
void Fill(T v) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
thrust::fill(data_.begin(), data_.end(), v);
}
void Copy(DeviceShard* other) {
// TODO(canonizer): avoid full copy of host data for this (but not for other)
LazySyncDevice(GPUAccess::kWrite);
other->LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), other->data_.data().get(),
data_.size() * sizeof(T), cudaMemcpyDefault));
}
void LazySyncHost(GPUAccess access) {
SetDevice();
dh::safe_cuda(cudaMemcpy(vec_->data_h_.data() + start_,
data_.data().get(), proper_size_ * sizeof(T),
cudaMemcpyDeviceToHost));
perm_d_.DenyComplementary(access);
}
void LazyResize(size_t new_size) {
if (new_size == cached_size_) { return; }
// resize is required
int ndevices = vec_->distribution_.devices_.Size();
int device_index = vec_->distribution_.devices_.Index(device_);
start_ = vec_->distribution_.ShardStart(new_size, device_index);
proper_size_ = vec_->distribution_.ShardProperSize(new_size, device_index);
// The size on this device.
size_t size_d = vec_->distribution_.ShardSize(new_size, device_index);
SetDevice();
data_.resize(size_d);
cached_size_ = new_size;
}
void LazySyncDevice(GPUAccess access) {
if (perm_d_.CanAccess(access)) { return; }
if (perm_d_.CanRead()) {
// deny read to the host
perm_d_.Grant(access);
std::lock_guard<std::mutex> lock(vec_->mutex_);
vec_->perm_h_.DenyComplementary(access);
return;
}
// data is on the host
size_t size_h = vec_->data_h_.size();
LazyResize(size_h);
SetDevice();
dh::safe_cuda(
cudaMemcpy(data_.data().get(), vec_->data_h_.data() + start_,
data_.size() * sizeof(T), cudaMemcpyHostToDevice));
perm_d_.Grant(access);
std::lock_guard<std::mutex> lock(vec_->mutex_);
vec_->perm_h_.DenyComplementary(access);
vec_->size_d_ = size_h;
}
void SetDevice() {
if (cudaSetDeviceHandler == nullptr) {
dh::safe_cuda(cudaSetDevice(device_));
} else {
(*cudaSetDeviceHandler)(device_);
}
}
T* Raw() { return data_.data().get(); }
size_t Start() const { return start_; }
size_t DataSize() const { return data_.size(); }
Permissions& Perm() { return perm_d_; }
Permissions const& Perm() const { return perm_d_; }
private:
int device_;
dh::device_vector<T> data_;
// cached vector size
size_t cached_size_;
size_t start_;
// size of the portion to copy back to the host
size_t proper_size_;
Permissions perm_d_;
HostDeviceVectorImpl<T>* vec_;
};
HostDeviceVectorImpl(size_t size, T v, const GPUDistribution &distribution)
: distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
if (!distribution_.IsEmpty()) {
size_d_ = size;
InitShards();
Fill(v);
data_d_.resize(size, v);
} else {
data_h_.resize(size, v);
}
@@ -188,127 +55,81 @@ struct HostDeviceVectorImpl {
// required, as a new std::mutex has to be created
HostDeviceVectorImpl(const HostDeviceVectorImpl<T>& other)
: data_h_(other.data_h_), perm_h_(other.perm_h_), size_d_(other.size_d_),
distribution_(other.distribution_), mutex_() {
shards_.resize(other.shards_.size());
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
shard.Init(this, other.shards_.at(i));
});
: device_(other.device_), data_h_(other.data_h_), perm_h_(other.perm_h_), mutex_() {
if (device_ >= 0) {
SetDevice();
data_d_ = other.data_d_;
}
}
// Initializer can be std::vector<T> or std::initializer_list<T>
template <class Initializer>
HostDeviceVectorImpl(const Initializer& init, const GPUDistribution &distribution)
: distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
if (!distribution_.IsEmpty()) {
size_d_ = init.size();
InitShards();
HostDeviceVectorImpl(const Initializer& init, int device) : device_(device), perm_h_(device < 0) {
if (device >= 0) {
LazyResizeDevice(init.size());
Copy(init);
} else {
data_h_ = init;
}
}
void InitShards() {
int ndevices = distribution_.devices_.Size();
shards_.resize(ndevices);
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
shard.Init(this, distribution_.devices_.DeviceId(i));
});
~HostDeviceVectorImpl() {
if (device_ >= 0) {
SetDevice();
}
}
size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : size_d_; }
size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : data_d_.size(); }
GPUSet Devices() const { return distribution_.devices_; }
int DeviceIdx() const { return device_; }
const GPUDistribution& Distribution() const { return distribution_; }
T* DevicePointer(int device) {
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kWrite);
return shards_.at(distribution_.devices_.Index(device)).Raw();
T* DevicePointer() {
LazySyncDevice(GPUAccess::kWrite);
return data_d_.data().get();
}
const T* ConstDevicePointer(int device) {
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return shards_.at(distribution_.devices_.Index(device)).Raw();
const T* ConstDevicePointer() {
LazySyncDevice(GPUAccess::kRead);
return data_d_.data().get();
}
common::Span<T> DeviceSpan(int device) {
GPUSet devices = distribution_.devices_;
CHECK(devices.Contains(device));
LazySyncDevice(device, GPUAccess::kWrite);
return {shards_.at(devices.Index(device)).Raw(),
static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
common::Span<T> DeviceSpan() {
LazySyncDevice(GPUAccess::kWrite);
return {data_d_.data().get(), static_cast<typename common::Span<T>::index_type>(DeviceSize())};
}
common::Span<const T> ConstDeviceSpan(int device) {
GPUSet devices = distribution_.devices_;
CHECK(devices.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
common::Span<const T> ConstDeviceSpan() {
LazySyncDevice(GPUAccess::kRead);
using SpanInd = typename common::Span<const T>::index_type;
return {shards_.at(devices.Index(device)).Raw(),
static_cast<SpanInd>(DeviceSize(device))};
return {data_d_.data().get(), static_cast<SpanInd>(DeviceSize())};
}
size_t DeviceSize(int device) {
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return shards_.at(distribution_.devices_.Index(device)).DataSize();
size_t DeviceSize() {
LazySyncDevice(GPUAccess::kRead);
return data_d_.size();
}
size_t DeviceStart(int device) {
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return shards_.at(distribution_.devices_.Index(device)).Start();
thrust::device_ptr<T> tbegin() { // NOLINT
return thrust::device_ptr<T>(DevicePointer());
}
thrust::device_ptr<T> tbegin(int device) { // NOLINT
return thrust::device_ptr<T>(DevicePointer(device));
thrust::device_ptr<const T> tcbegin() { // NOLINT
return thrust::device_ptr<const T>(ConstDevicePointer());
}
thrust::device_ptr<const T> tcbegin(int device) { // NOLINT
return thrust::device_ptr<const T>(ConstDevicePointer(device));
thrust::device_ptr<T> tend() { // NOLINT
return tbegin() + DeviceSize();
}
thrust::device_ptr<T> tend(int device) { // NOLINT
return tbegin(device) + DeviceSize(device);
}
thrust::device_ptr<const T> tcend(int device) { // NOLINT
return tcbegin(device) + DeviceSize(device);
}
void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
CHECK_EQ(end - begin, Size());
if (perm_h_.CanWrite()) {
dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
(end - begin) * sizeof(T),
cudaMemcpyDeviceToHost));
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.ScatterFrom(begin.get());
});
}
}
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
CHECK_EQ(end - begin, Size());
if (perm_h_.CanWrite()) {
dh::safe_cuda(cudaMemcpy(begin.get(), data_h_.data(),
data_h_.size() * sizeof(T),
cudaMemcpyHostToDevice));
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.GatherTo(begin); });
}
thrust::device_ptr<const T> tcend() { // NOLINT
return tcbegin() + DeviceSize();
}
void Fill(T v) { // NOLINT
if (perm_h_.CanWrite()) {
std::fill(data_h_.begin(), data_h_.end(), v);
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.Fill(v); });
DeviceFill(v);
}
}
@@ -320,14 +141,10 @@ struct HostDeviceVectorImpl {
return;
}
// Data is on device;
if (distribution_ != other->distribution_) {
distribution_ = GPUDistribution();
Shard(other->Distribution());
size_d_ = other->size_d_;
if (device_ != other->device_) {
SetDevice(other->device_);
}
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
shard.Copy(&other->shards_.at(i));
});
DeviceCopy(other);
}
void Copy(const std::vector<T>& other) {
@@ -335,9 +152,7 @@ struct HostDeviceVectorImpl {
if (perm_h_.CanWrite()) {
std::copy(other.begin(), other.end(), data_h_.begin());
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.ScatterFrom(other.data());
});
DeviceCopy(other.data());
}
}
@@ -346,9 +161,7 @@ struct HostDeviceVectorImpl {
if (perm_h_.CanWrite()) {
std::copy(other.begin(), other.end(), data_h_.begin());
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.ScatterFrom(other.begin());
});
DeviceCopy(other.begin());
}
}
@@ -362,40 +175,23 @@ struct HostDeviceVectorImpl {
return data_h_;
}
void Shard(const GPUDistribution& distribution) {
if (distribution_ == distribution) { return; }
CHECK(distribution_.IsEmpty())
<< "Data resides on different GPUs: " << "ID: "
<< *(distribution_.Devices().begin()) << " and ID: "
<< *(distribution.Devices().begin());
distribution_ = distribution;
InitShards();
}
void Shard(GPUSet new_devices) {
if (distribution_.Devices() == new_devices) { return; }
Shard(GPUDistribution::Block(new_devices));
}
void Reshard(const GPUDistribution &distribution) {
if (distribution_ == distribution) { return; }
LazySyncHost(GPUAccess::kWrite);
distribution_ = distribution;
shards_.clear();
InitShards();
void SetDevice(int device) {
if (device_ == device) { return; }
if (device_ >= 0) {
LazySyncHost(GPUAccess::kWrite);
}
device_ = device;
if (device_ >= 0) {
LazyResizeDevice(data_h_.size());
}
}
void Resize(size_t new_size, T v) {
if (new_size == Size()) { return; }
if (distribution_.IsFixedSize()) {
CHECK_EQ(new_size, distribution_.offsets_.back());
}
if (Size() == 0 && !distribution_.IsEmpty()) {
if (Size() == 0 && device_ >= 0) {
// fast on-device resize
perm_h_ = Permissions(false);
size_d_ = new_size;
InitShards();
Fill(v);
data_d_.resize(new_size, v);
} else {
// resize on host
LazySyncHost(GPUAccess::kWrite);
@@ -407,72 +203,110 @@ struct HostDeviceVectorImpl {
if (perm_h_.CanAccess(access)) { return; }
if (perm_h_.CanRead()) {
// data is present, just need to deny access to the device
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.Perm().DenyComplementary(access);
});
perm_h_.Grant(access);
return;
}
std::lock_guard<std::mutex> lock(mutex_);
if (data_h_.size() != size_d_) { data_h_.resize(size_d_); }
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.LazySyncHost(access);
});
if (data_h_.size() != data_d_.size()) { data_h_.resize(data_d_.size()); }
SetDevice();
dh::safe_cuda(cudaMemcpy(data_h_.data(),
data_d_.data().get(),
data_d_.size() * sizeof(T),
cudaMemcpyDeviceToHost));
perm_h_.Grant(access);
}
void LazySyncDevice(int device, GPUAccess access) {
GPUSet devices = distribution_.Devices();
CHECK(devices.Contains(device));
shards_.at(devices.Index(device)).LazySyncDevice(access);
void LazySyncDevice(GPUAccess access) {
if (DevicePerm().CanAccess(access)) { return; }
if (DevicePerm().CanRead()) {
// deny read to the host
std::lock_guard<std::mutex> lock(mutex_);
perm_h_.DenyComplementary(access);
return;
}
// data is on the host
LazyResizeDevice(data_h_.size());
SetDevice();
dh::safe_cuda(cudaMemcpy(data_d_.data().get(),
data_h_.data(),
data_d_.size() * sizeof(T),
cudaMemcpyHostToDevice));
std::lock_guard<std::mutex> lock(mutex_);
perm_h_.DenyComplementary(access);
}
bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); }
bool DeviceCanAccess(int device, GPUAccess access) {
GPUSet devices = distribution_.Devices();
if (!devices.Contains(device)) { return false; }
return shards_.at(devices.Index(device)).Perm().CanAccess(access);
}
bool DeviceCanAccess(GPUAccess access) { return DevicePerm().CanAccess(access); }
private:
std::vector<T> data_h_;
Permissions perm_h_;
// the total size of the data stored on the devices
size_t size_d_;
GPUDistribution distribution_;
int device_{-1};
std::vector<T> data_h_{};
dh::device_vector<T> data_d_{};
Permissions perm_h_{false};
// protects size_d_ and perm_h_ when updated from multiple threads
std::mutex mutex_;
std::vector<DeviceShard> shards_;
std::mutex mutex_{};
void DeviceFill(T v) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
thrust::fill(data_d_.begin(), data_d_.end(), v);
}
void DeviceCopy(HostDeviceVectorImpl* other) {
// TODO(canonizer): avoid full copy of host data for this (but not for other)
LazySyncDevice(GPUAccess::kWrite);
other->LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), other->data_d_.data().get(),
data_d_.size() * sizeof(T), cudaMemcpyDefault));
}
void DeviceCopy(const T* begin) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), begin,
data_d_.size() * sizeof(T), cudaMemcpyDefault));
}
void LazyResizeDevice(size_t new_size) {
if (new_size == data_d_.size()) { return; }
SetDevice();
data_d_.resize(new_size);
}
void SetDevice() {
CHECK_GE(device_, 0);
if (cudaSetDeviceHandler == nullptr) {
dh::safe_cuda(cudaSetDevice(device_));
} else {
(*cudaSetDeviceHandler)(device_);
}
}
Permissions DevicePerm() const { return perm_h_.Complementary(); }
};
template <typename T>
HostDeviceVector<T>::HostDeviceVector
(size_t size, T v, const GPUDistribution &distribution) : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v, distribution);
}
template<typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
: impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector
(std::initializer_list<T> init, const GPUDistribution &distribution) : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector
(const std::vector<T>& init, const GPUDistribution &distribution) : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
}
: impl_(new HostDeviceVectorImpl<T>(*other.impl_)) {}
template <typename T>
HostDeviceVector<T>& HostDeviceVector<T>::operator=
(const HostDeviceVector<T>& other) {
HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
if (this == &other) { return *this; }
std::unique_ptr<HostDeviceVectorImpl<T>> newImpl(new HostDeviceVectorImpl<T>(*other.impl_));
@@ -491,73 +325,51 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
template <typename T>
GPUSet HostDeviceVector<T>::Devices() const { return impl_->Devices(); }
int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
template <typename T>
const GPUDistribution& HostDeviceVector<T>::Distribution() const {
return impl_->Distribution();
T* HostDeviceVector<T>::DevicePointer() {
return impl_->DevicePointer();
}
template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) {
return impl_->DevicePointer(device);
const T* HostDeviceVector<T>::ConstDevicePointer() const {
return impl_->ConstDevicePointer();
}
template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
return impl_->ConstDevicePointer(device);
common::Span<T> HostDeviceVector<T>::DeviceSpan() {
return impl_->DeviceSpan();
}
template <typename T>
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
return impl_->DeviceSpan(device);
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
return impl_->ConstDeviceSpan();
}
template <typename T>
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
return impl_->ConstDeviceSpan(device);
size_t HostDeviceVector<T>::DeviceSize() const {
return impl_->DeviceSize();
}
template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) const {
return impl_->DeviceStart(device);
thrust::device_ptr<T> HostDeviceVector<T>::tbegin() { // NOLINT
return impl_->tbegin();
}
template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) const {
return impl_->DeviceSize(device);
thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin() const { // NOLINT
return impl_->tcbegin();
}
template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) { // NOLINT
return impl_->tbegin(device);
thrust::device_ptr<T> HostDeviceVector<T>::tend() { // NOLINT
return impl_->tend();
}
template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin(int device) const { // NOLINT
return impl_->tcbegin(device);
}
template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) { // NOLINT
return impl_->tend(device);
}
template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcend(int device) const { // NOLINT
return impl_->tcend(device);
}
template <typename T>
void HostDeviceVector<T>::ScatterFrom
(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
impl_->ScatterFrom(begin, end);
}
template <typename T>
void HostDeviceVector<T>::GatherTo
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const {
impl_->GatherTo(begin, end);
thrust::device_ptr<const T> HostDeviceVector<T>::tcend() const { // NOLINT
return impl_->tcend();
}
template <typename T>
@@ -594,23 +406,13 @@ bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
}
template <typename T>
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
return impl_->DeviceCanAccess(device, access);
bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
return impl_->DeviceCanAccess(access);
}
template <typename T>
void HostDeviceVector<T>::Shard(GPUSet new_devices) const {
impl_->Shard(new_devices);
}
template <typename T>
void HostDeviceVector<T>::Shard(const GPUDistribution &distribution) const {
impl_->Shard(distribution);
}
template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution &distribution) {
impl_->Reshard(distribution);
void HostDeviceVector<T>::SetDevice(int device) const {
impl_->SetDevice(device);
}
template <typename T>

View File

@@ -79,113 +79,6 @@ void SetCudaSetDeviceHandler(void (*handler)(int));
template <typename T> struct HostDeviceVectorImpl;
// Distribution for the HostDeviceVector; it specifies such aspects as the
// devices it is distributed on, whether there are copies of elements from
// other GPUs as well as the granularity of splitting. It may also specify
// explicit boundaries for devices, in which case the size of the array cannot
// be changed.
class GPUDistribution {
template<typename T> friend struct HostDeviceVectorImpl;
public:
explicit GPUDistribution(GPUSet devices = GPUSet::Empty())
: devices_(devices), granularity_(1), overlap_(0) {}
private:
GPUDistribution(GPUSet devices, int granularity, int overlap,
std::vector<size_t> &&offsets)
: devices_(devices), granularity_(granularity), overlap_(overlap),
offsets_(std::move(offsets)) {}
public:
static GPUDistribution Empty() { return GPUDistribution(); }
static GPUDistribution Block(GPUSet devices) { return GPUDistribution(devices); }
static GPUDistribution Overlap(GPUSet devices, int overlap) {
return GPUDistribution(devices, 1, overlap, std::vector<size_t>());
}
static GPUDistribution Granular(GPUSet devices, int granularity) {
return GPUDistribution(devices, granularity, 0, std::vector<size_t>());
}
// NOTE(rongou): Explicit offsets don't necessarily cover the whole vector. Sections before the
// first shard or after the last shard may be on host only. This windowing is done in the GPU
// predictor for external memory support.
static GPUDistribution Explicit(GPUSet devices, std::vector<size_t> offsets) {
return GPUDistribution(devices, 1, 0, std::move(offsets));
}
friend bool operator==(const GPUDistribution& a, const GPUDistribution& b) {
bool const res = a.devices_ == b.devices_ &&
a.granularity_ == b.granularity_ &&
a.overlap_ == b.overlap_ &&
a.offsets_ == b.offsets_;
return res;
}
friend bool operator!=(const GPUDistribution& a, const GPUDistribution& b) {
return !(a == b);
}
GPUSet Devices() const { return devices_; }
bool IsEmpty() const { return devices_.IsEmpty(); }
size_t ShardStart(size_t size, int index) const {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
CHECK_EQ(offsets_.back(), size);
return offsets_.at(index);
}
// no explicit offsets
size_t begin = std::min(index * Portion(size), size);
begin = begin > size ? size : begin;
return begin;
}
size_t ShardSize(size_t size, size_t index) const {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
CHECK_EQ(offsets_.back(), size);
return offsets_.at(index + 1) - offsets_.at(index) +
(index == devices_.Size() - 1 ? overlap_ : 0);
}
size_t portion = Portion(size);
size_t begin = std::min(index * portion, size);
size_t end = std::min((index + 1) * portion + overlap_ * granularity_, size);
return end - begin;
}
size_t ShardProperSize(size_t size, size_t index) const {
if (size == 0) { return 0; }
return ShardSize(size, index) - (devices_.Size() - 1 > index ? overlap_ : 0);
}
bool IsFixedSize() const { return !offsets_.empty(); }
private:
static size_t DivRoundUp(size_t a, size_t b) { return (a + b - 1) / b; }
static size_t RoundUp(size_t a, size_t b) { return DivRoundUp(a, b) * b; }
size_t Portion(size_t size) const {
return RoundUp
(DivRoundUp
(std::max(static_cast<int64_t>(size - overlap_ * granularity_),
static_cast<int64_t>(1)),
devices_.Size()), granularity_);
}
GPUSet devices_;
int granularity_;
int overlap_;
// explicit offsets for the GPU parts, if any
std::vector<size_t> offsets_;
};
enum GPUAccess {
kNone, kRead,
// write implies read
@@ -199,46 +92,38 @@ inline GPUAccess operator-(GPUAccess a, GPUAccess b) {
template <typename T>
class HostDeviceVector {
public:
explicit HostDeviceVector(size_t size = 0, T v = T(),
const GPUDistribution &distribution = GPUDistribution());
HostDeviceVector(std::initializer_list<T> init,
const GPUDistribution &distribution = GPUDistribution());
explicit HostDeviceVector(const std::vector<T>& init,
const GPUDistribution &distribution = GPUDistribution());
explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1);
HostDeviceVector(std::initializer_list<T> init, int device = -1);
explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
~HostDeviceVector();
HostDeviceVector(const HostDeviceVector<T>&);
HostDeviceVector<T>& operator=(const HostDeviceVector<T>&);
size_t Size() const;
GPUSet Devices() const;
const GPUDistribution& Distribution() const;
common::Span<T> DeviceSpan(int device);
common::Span<const T> ConstDeviceSpan(int device) const;
common::Span<const T> DeviceSpan(int device) const { return ConstDeviceSpan(device); }
T* DevicePointer(int device);
const T* ConstDevicePointer(int device) const;
const T* DevicePointer(int device) const { return ConstDevicePointer(device); }
int DeviceIdx() const;
common::Span<T> DeviceSpan();
common::Span<const T> ConstDeviceSpan() const;
common::Span<const T> DeviceSpan() const { return ConstDeviceSpan(); }
T* DevicePointer();
const T* ConstDevicePointer() const;
const T* DevicePointer() const { return ConstDevicePointer(); }
T* HostPointer() { return HostVector().data(); }
const T* ConstHostPointer() const { return ConstHostVector().data(); }
const T* HostPointer() const { return ConstHostPointer(); }
size_t DeviceStart(int device) const;
size_t DeviceSize(int device) const;
size_t DeviceSize() const;
// only define functions returning device_ptr
// if HostDeviceVector.h is included from a .cu file
#ifdef __CUDACC__
thrust::device_ptr<T> tbegin(int device); // NOLINT
thrust::device_ptr<T> tend(int device); // NOLINT
thrust::device_ptr<const T> tcbegin(int device) const; // NOLINT
thrust::device_ptr<const T> tcend(int device) const; // NOLINT
thrust::device_ptr<const T> tbegin(int device) const { // NOLINT
return tcbegin(device);
thrust::device_ptr<T> tbegin(); // NOLINT
thrust::device_ptr<T> tend(); // NOLINT
thrust::device_ptr<const T> tcbegin() const; // NOLINT
thrust::device_ptr<const T> tcend() const; // NOLINT
thrust::device_ptr<const T> tbegin() const { // NOLINT
return tcbegin();
}
thrust::device_ptr<const T> tend(int device) const { return tcend(device); } // NOLINT
void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end);
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const;
thrust::device_ptr<const T> tend() const { return tcend(); } // NOLINT
#endif // __CUDACC__
void Fill(T v);
@@ -251,18 +136,9 @@ class HostDeviceVector {
const std::vector<T>& HostVector() const {return ConstHostVector(); }
bool HostCanAccess(GPUAccess access) const;
bool DeviceCanAccess(int device, GPUAccess access) const;
bool DeviceCanAccess(GPUAccess access) const;
/*!
* \brief Specify memory distribution.
*/
void Shard(const GPUDistribution &distribution) const;
void Shard(GPUSet devices) const;
/*!
* \brief Change memory distribution.
*/
void Reshard(const GPUDistribution &distribution);
void SetDevice(int device) const;
void Resize(size_t new_size, T v = T());

View File

@@ -57,14 +57,10 @@ class Transform {
template <typename Functor>
struct Evaluator {
public:
Evaluator(Functor func, Range range, GPUSet devices, bool shard) :
Evaluator(Functor func, Range range, int device, bool shard) :
func_(func), range_{std::move(range)},
shard_{shard},
distribution_{GPUDistribution::Block(devices)} {}
Evaluator(Functor func, Range range, GPUDistribution dist,
bool shard) :
func_(func), range_{std::move(range)}, shard_{shard},
distribution_{std::move(dist)} {}
device_{device} {}
/*!
* \brief Evaluate the functor with input pointers to HostDeviceVector.
@@ -74,7 +70,7 @@ class Transform {
*/
template <typename... HDV>
void Eval(HDV... vectors) const {
bool on_device = !distribution_.IsEmpty();
bool on_device = device_ >= 0;
if (on_device) {
LaunchCUDA(func_, vectors...);
@@ -86,13 +82,13 @@ class Transform {
private:
// CUDA UnpackHDV
template <typename T>
Span<T> UnpackHDV(HostDeviceVector<T>* _vec, int _device) const {
auto span = _vec->DeviceSpan(_device);
Span<T> UnpackHDVOnDevice(HostDeviceVector<T>* _vec) const {
auto span = _vec->DeviceSpan();
return span;
}
template <typename T>
Span<T const> UnpackHDV(const HostDeviceVector<T>* _vec, int _device) const {
auto span = _vec->ConstDeviceSpan(_device);
Span<T const> UnpackHDVOnDevice(const HostDeviceVector<T>* _vec) const {
auto span = _vec->ConstDeviceSpan();
return span;
}
// CPU UnpackHDV
@@ -108,15 +104,15 @@ class Transform {
}
// Recursive unpack for Shard.
template <typename T>
void UnpackShard(GPUDistribution dist, const HostDeviceVector<T> *vector) const {
vector->Shard(dist);
void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
vector->SetDevice(device);
}
template <typename Head, typename... Rest>
void UnpackShard(GPUDistribution dist,
void UnpackShard(int device,
const HostDeviceVector<Head> *_vector,
const HostDeviceVector<Rest> *... _vectors) const {
_vector->Shard(dist);
UnpackShard(dist, _vectors...);
_vector->SetDevice(device);
UnpackShard(device, _vectors...);
}
#if defined(__CUDACC__)
@@ -124,28 +120,20 @@ class Transform {
typename... HDV>
void LaunchCUDA(Functor _func, HDV*... _vectors) const {
if (shard_)
UnpackShard(distribution_, _vectors...);
UnpackShard(device_, _vectors...);
GPUSet devices = distribution_.Devices();
size_t range_size = *range_.end() - *range_.begin();
// Extract index to deal with possible old OpenMP.
size_t device_beg = *(devices.begin());
size_t device_end = *(devices.end());
#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
for (omp_ulong device = device_beg; device < device_end; ++device) { // NOLINT
// Ignore other attributes of GPUDistribution for spliting index.
// This deals with situation like multi-class setting where
// granularity is used in data vector.
size_t shard_size = GPUDistribution::Block(devices).ShardSize(
range_size, devices.Index(device));
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
dh::safe_cuda(cudaSetDevice(device));
const int GRID_SIZE =
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
_func, shard_range, UnpackHDV(_vectors, device)...);
}
// This deals with situation like multi-class setting where
// granularity is used in data vector.
size_t shard_size = range_size;
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
dh::safe_cuda(cudaSetDevice(device_));
const int GRID_SIZE =
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
_func, shard_range, UnpackHDVOnDevice(_vectors)...);
}
#else
/*! \brief Dummy funtion defined when compiling for CPU. */
@@ -172,7 +160,7 @@ class Transform {
Range range_;
/*! \brief Whether sharding for vectors is required. */
bool shard_;
GPUDistribution distribution_;
int device_;
};
public:
@@ -191,15 +179,9 @@ class Transform {
*/
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUSet const devices,
int device,
bool const shard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(devices), shard};
}
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUDistribution const dist,
bool const shard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(dist), shard};
return Evaluator<Functor> {func, std::move(range), device, shard};
}
};