Multi-GPU HostDeviceVector. (#3287)
* Multi-GPU HostDeviceVector.

- HostDeviceVector instances can now span multiple devices, defined by the GPUSet struct
- the interface of HostDeviceVector has been modified accordingly
- GPU objective functions are now multi-GPU
- GPU prediction from cache is now multi-GPU
- omp_set_num_threads() calls are avoided
- other minor changes
committed by Rory Mitchell
parent 90a5c4db9d
commit b8a0d66fe6
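The headline change is that a HostDeviceVector can now span a set of GPUs described by a GPUSet. A minimal usage sketch against the new interface (the two-GPU range and the sizes are illustrative, not taken from this commit):

// Sketch only: assumes a CUDA build with at least two visible devices.
#include <cstddef>
#include <vector>
#include "./host_device_vector.h"

void MultiGpuSketch() {
  using xgboost::GPUSet;
  using xgboost::HostDeviceVector;
  // Shard 1000 floats across GPUs 0 and 1.
  HostDeviceVector<float> vec(1000, 0.0f, GPUSet::Range(0, 2));
  vec.Fill(1.0f);                            // runs on each shard in parallel
  float* d0 = vec.DevicePointer(0);          // raw pointer to GPU 0's shard
  size_t n0 = vec.DeviceSize(0);             // shard length on GPU 0 (here 500)
  std::vector<float>& h = vec.HostVector();  // lazily syncs all shards to host
  (void)d0; (void)n0; (void)h;
}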
src/common/device_helpers.cuh
@@ -1004,14 +1004,29 @@ class AllReducer {
 template <typename T, typename FunctionT>
 void ExecuteShards(std::vector<T> *shards, FunctionT f) {
-  auto previous_num_threads = omp_get_max_threads();
-  omp_set_num_threads(shards->size());
-#pragma omp parallel
-  {
-    auto cpu_thread_id = omp_get_thread_num();
-    f(shards->at(cpu_thread_id));
+#pragma omp parallel for schedule(static, 1)
+  for (int shard = 0; shard < shards->size(); ++shard) {
+    f(shards->at(shard));
   }
-  omp_set_num_threads(previous_num_threads);
 }
+
+/**
+ * \brief Executes some operation on each element of the input vector, using a
+ *  single controlling thread for each element. In addition, passes the shard index
+ *  into the function.
+ *
+ * \tparam T          Generic type parameter.
+ * \tparam FunctionT  Type of the function f.
+ * \param shards      The shards.
+ * \param f           The function to process each shard with.
+ */
+template <typename T, typename FunctionT>
+void ExecuteIndexShards(std::vector<T> *shards, FunctionT f) {
+#pragma omp parallel for schedule(static, 1)
+  for (int shard = 0; shard < shards->size(); ++shard) {
+    f(shard, shards->at(shard));
+  }
+}
 
 /**
@@ -1029,15 +1044,11 @@ void ExecuteShards(std::vector<T> *shards, FunctionT f) {
 template <typename ReduceT, typename T, typename FunctionT>
 ReduceT ReduceShards(std::vector<T> *shards, FunctionT f) {
-  auto previous_num_threads = omp_get_max_threads();
-  omp_set_num_threads(shards->size());
   std::vector<ReduceT> sums(shards->size());
-#pragma omp parallel
-  {
-    auto cpu_thread_id = omp_get_thread_num();
-    sums[cpu_thread_id] = f(shards->at(cpu_thread_id));
+#pragma omp parallel for schedule(static, 1)
+  for (int shard = 0; shard < shards->size(); ++shard) {
+    sums[shard] = f(shards->at(shard));
   }
-  omp_set_num_threads(previous_num_threads);
   return std::accumulate(sums.begin(), sums.end(), ReduceT());
 }
 }  // namespace dh
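These helpers replace the omp_set_num_threads() juggling with plain parallel-for loops, one controlling CPU thread per shard. A hypothetical caller might look like the following sketch (the Shard type here is made up for illustration; only the dh:: helper signatures come from the diff above):

#include <cstddef>
#include <vector>
#include "./device_helpers.cuh"

struct Shard { int device_id = -1; };  // hypothetical per-GPU state

void DriveShards(std::vector<Shard>* shards) {
  // ExecuteIndexShards passes the shard index alongside the shard itself.
  dh::ExecuteIndexShards(shards, [](int i, Shard& s) {
    s.device_id = i;  // e.g. bind shard i to device i
  });
  // ReduceShards collects one value per shard and accumulates them.
  size_t total = dh::ReduceShards<size_t>(shards, [](Shard& s) {
    return static_cast<size_t>(s.device_id >= 0);  // placeholder contribution
  });
  (void)total;
}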
src/common/host_device_vector.cc
@@ -21,18 +21,18 @@ struct HostDeviceVectorImpl {
 };
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device) : impl_(nullptr) {
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUSet devices) : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(size, v);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUSet devices)
   : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUSet devices)
   : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }
@@ -48,7 +48,7 @@ template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->data_h_.size(); }
 
 template <typename T>
-int HostDeviceVector<T>::DeviceIdx() const { return -1; }
+GPUSet HostDeviceVector<T>::Devices() const { return GPUSet::Empty(); }
 
 template <typename T>
 T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; }
@@ -57,13 +57,46 @@ template <typename T>
 std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->data_h_; }
 
 template <typename T>
-void HostDeviceVector<T>::Resize(size_t new_size, T v, int new_device) {
+void HostDeviceVector<T>::Resize(size_t new_size, T v) {
   impl_->data_h_.resize(new_size, v);
 }
 
+template <typename T>
+size_t HostDeviceVector<T>::DeviceStart(int device) { return 0; }
+
+template <typename T>
+size_t HostDeviceVector<T>::DeviceSize(int device) { return 0; }
+
+template <typename T>
+void HostDeviceVector<T>::Fill(T v) {
+  std::fill(HostVector().begin(), HostVector().end(), v);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Copy(HostDeviceVector<T>* other) {
+  CHECK_EQ(Size(), other->Size());
+  std::copy(other->HostVector().begin(), other->HostVector().end(), HostVector().begin());
+}
+
+template <typename T>
+void HostDeviceVector<T>::Copy(const std::vector<T>& other) {
+  CHECK_EQ(Size(), other.size());
+  std::copy(other.begin(), other.end(), HostVector().begin());
+}
+
+template <typename T>
+void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
+  CHECK_EQ(Size(), other.size());
+  std::copy(other.begin(), other.end(), HostVector().begin());
+}
+
+template <typename T>
+void HostDeviceVector<T>::Reshard(GPUSet devices) { }
+
 // explicit instantiations are required, as HostDeviceVector isn't header-only
 template class HostDeviceVector<bst_float>;
 template class HostDeviceVector<GradientPair>;
 template class HostDeviceVector<unsigned int>;
 
 }  // namespace xgboost
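Worth noting: in this CUDA-free fallback, every method keeps the data in data_h_, and the device-oriented calls degrade to no-ops, zeros, or nullptr, so calling code can be written once for both builds. A sketch of the contract this fallback guarantees (the assertions are illustrative, not tests from the commit):

#include <cassert>
#include "./host_device_vector.h"

void FallbackContract() {
  using xgboost::GPUSet;
  // The devices argument is accepted but ignored without CUDA.
  xgboost::HostDeviceVector<float> vec(10, 1.0f, GPUSet::Range(0, 2));
  assert(vec.Devices() == GPUSet::Empty());
  assert(vec.DevicePointer(0) == nullptr);
  assert(vec.DeviceStart(0) == 0 && vec.DeviceSize(0) == 0);
  vec.Reshard(GPUSet::Range(0, 4));  // no-op in this build
}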
src/common/host_device_vector.cu
@@ -2,122 +2,309 @@
  * Copyright 2017 XGBoost contributors
  */
 
+#include <thrust/fill.h>
 #include "./host_device_vector.h"
+#include "./device_helpers.cuh"
 
 namespace xgboost {
 
 template <typename T>
 struct HostDeviceVectorImpl {
-  HostDeviceVectorImpl(size_t size, T v, int device)
-    : device_(device), on_d_(device >= 0) {
-    if (on_d_) {
+  struct DeviceShard {
+    DeviceShard() : index_(-1), device_(-1), start_(0), on_d_(false), vec_(nullptr) {}
+
+    static size_t ShardStart(size_t size, int ndevices, int index) {
+      size_t portion = dh::DivRoundUp(size, ndevices);
+      size_t begin = index * portion;
+      begin = begin > size ? size : begin;
+      return begin;
+    }
+
+    static size_t ShardSize(size_t size, int ndevices, int index) {
+      size_t portion = dh::DivRoundUp(size, ndevices);
+      size_t begin = index * portion, end = (index + 1) * portion;
+      begin = begin > size ? size : begin;
+      end = end > size ? size : end;
+      return end - begin;
+    }
+    void Init(HostDeviceVectorImpl<T>* vec, int device) {
+      if (vec_ == nullptr) { vec_ = vec; }
+      CHECK_EQ(vec, vec_);
+      device_ = device;
+      index_ = vec_->devices_.Index(device);
+      size_t size_h = vec_->Size();
+      int ndevices = vec_->devices_.Size();
+      start_ = ShardStart(size_h, ndevices, index_);
+      size_t size_d = ShardSize(size_h, ndevices, index_);
       dh::safe_cuda(cudaSetDevice(device_));
-      data_d_.resize(size, v);
+      data_.resize(size_d);
+      on_d_ = !vec_->on_h_;
+    }
+
+    void ScatterFrom(const T* begin) {
+      // TODO(canonizer): avoid full copy of host data
+      LazySyncDevice();
+      dh::safe_cuda(cudaSetDevice(device_));
+      dh::safe_cuda(cudaMemcpy(data_.data().get(), begin + start_,
+                               data_.size() * sizeof(T), cudaMemcpyDefault));
+    }
+
+    void GatherTo(thrust::device_ptr<T> begin) {
+      LazySyncDevice();
+      dh::safe_cuda(cudaSetDevice(device_));
+      dh::safe_cuda(cudaMemcpy(begin.get() + start_, data_.data().get(),
+                               data_.size() * sizeof(T), cudaMemcpyDefault));
+    }
+
+    void Fill(T v) {
+      // TODO(canonizer): avoid full copy of host data
+      LazySyncDevice();
+      dh::safe_cuda(cudaSetDevice(device_));
+      thrust::fill(data_.begin(), data_.end(), v);
+    }
+
+    void Copy(DeviceShard* other) {
+      // TODO(canonizer): avoid full copy of host data for this (but not for other)
+      LazySyncDevice();
+      other->LazySyncDevice();
+      dh::safe_cuda(cudaSetDevice(device_));
+      dh::safe_cuda(cudaMemcpy(data_.data().get(), other->data_.data().get(),
+                               data_.size() * sizeof(T), cudaMemcpyDefault));
+    }
+
+    void LazySyncHost() {
+      dh::safe_cuda(cudaSetDevice(device_));
+      thrust::copy(data_.begin(), data_.end(), vec_->data_h_.begin() + start_);
+      on_d_ = false;
+    }
+
+    void LazySyncDevice() {
+      if (on_d_) { return; }
+      // data is on the host
+      size_t size_h = vec_->data_h_.size();
+      int ndevices = vec_->devices_.Size();
+      start_ = ShardStart(size_h, ndevices, index_);
+      size_t size_d = ShardSize(size_h, ndevices, index_);
+      dh::safe_cuda(cudaSetDevice(device_));
+      data_.resize(size_d);
+      thrust::copy(vec_->data_h_.begin() + start_,
+                   vec_->data_h_.begin() + start_ + size_d, data_.begin());
+      on_d_ = true;
+      // this may cause a race condition if LazySyncDevice() is called
+      // from multiple threads in parallel;
+      // however, the race condition is benign, and will not cause problems
+      vec_->on_h_ = false;
+      vec_->size_d_ = vec_->data_h_.size();
+    }
+
+    int index_;
+    int device_;
+    thrust::device_vector<T> data_;
+    size_t start_;
+    // true if there is an up-to-date copy of data on device, false otherwise
+    bool on_d_;
+    HostDeviceVectorImpl<T>* vec_;
+  };
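ShardStart/ShardSize above split the vector into near-equal contiguous slices, with trailing shards absorbing the remainder (possibly shrinking to zero elements). Worked through for size = 10 across 4 devices: portion = DivRoundUp(10, 4) = 3, so the shards cover [0, 3), [3, 6), [6, 9) and [9, 10). A standalone sketch mirroring that arithmetic:

#include <cstddef>
#include <cstdio>

// Mirrors DeviceShard::ShardStart/ShardSize; DivRoundUp(a, b) == (a + b - 1) / b.
int main() {
  std::size_t size = 10;
  int ndevices = 4;
  std::size_t portion = (size + ndevices - 1) / ndevices;  // == 3
  for (int i = 0; i < ndevices; ++i) {
    std::size_t begin = i * portion, end = (i + 1) * portion;
    if (begin > size) begin = size;
    if (end > size) end = size;
    std::printf("shard %d: start=%zu size=%zu\n", i, begin, end - begin);
  }
  return 0;  // prints starts 0, 3, 6, 9 with sizes 3, 3, 3, 1
}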
+  HostDeviceVectorImpl(size_t size, T v, GPUSet devices)
+    : devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) {
+    if (!devices.IsEmpty()) {
+      size_d_ = size;
+      InitShards();
+      Fill(v);
     } else {
       data_h_.resize(size, v);
     }
   }
 
   // Init can be std::vector<T> or std::initializer_list<T>
   template <class Init>
-  HostDeviceVectorImpl(const Init& init, int device)
-    : device_(device), on_d_(device >= 0) {
-    if (on_d_) {
-      dh::safe_cuda(cudaSetDevice(device_));
-      data_d_.resize(init.size());
-      thrust::copy(init.begin(), init.end(), data_d_.begin());
+  HostDeviceVectorImpl(const Init& init, GPUSet devices)
+    : devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) {
+    if (!devices.IsEmpty()) {
+      size_d_ = init.size();
+      InitShards();
+      Copy(init);
     } else {
       data_h_ = init;
     }
   }
 
+  void InitShards() {
+    int ndevices = devices_.Size();
+    shards_.resize(ndevices);
+    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
+      shard.Init(this, devices_[i]);
+    });
+  }
+
   HostDeviceVectorImpl(const HostDeviceVectorImpl<T>&) = delete;
   HostDeviceVectorImpl(HostDeviceVectorImpl<T>&&) = delete;
   void operator=(const HostDeviceVectorImpl<T>&) = delete;
   void operator=(HostDeviceVectorImpl<T>&&) = delete;
-  size_t Size() const { return on_d_ ? data_d_.size() : data_h_.size(); }
+  size_t Size() const { return on_h_ ? data_h_.size() : size_d_; }
 
-  int DeviceIdx() const { return device_; }
+  GPUSet Devices() const { return devices_; }
 
   T* DevicePointer(int device) {
+    CHECK(devices_.Contains(device));
     LazySyncDevice(device);
-    return data_d_.data().get();
+    return shards_[devices_.Index(device)].data_.data().get();
   }
 
+  size_t DeviceSize(int device) {
+    CHECK(devices_.Contains(device));
+    LazySyncDevice(device);
+    return shards_[devices_.Index(device)].data_.size();
+  }
+
+  size_t DeviceStart(int device) {
+    CHECK(devices_.Contains(device));
+    LazySyncDevice(device);
+    return shards_[devices_.Index(device)].start_;
+  }
+
   thrust::device_ptr<T> tbegin(int device) {  // NOLINT
     return thrust::device_ptr<T>(DevicePointer(device));
   }
 
   thrust::device_ptr<T> tend(int device) {  // NOLINT
-    auto begin = tbegin(device);
-    return begin + Size();
+    return tbegin(device) + DeviceSize(device);
   }
 
+  void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
+    CHECK_EQ(end - begin, Size());
+    if (on_h_) {
+      thrust::copy(begin, end, data_h_.begin());
+    } else {
+      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
+        shard.ScatterFrom(begin.get());
+      });
+    }
+  }
+
+  void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
+    CHECK_EQ(end - begin, Size());
+    if (on_h_) {
+      thrust::copy(data_h_.begin(), data_h_.end(), begin);
+    } else {
+      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.GatherTo(begin); });
+    }
+  }
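GatherTo and ScatterFrom move data between a caller-owned device buffer covering the full logical vector and the per-device shards, each shard touching only its own [start_, start_ + size) slice. A hypothetical round trip (this driver is illustrative; it relies on cudaMemcpyDefault/UVA for cross-device copies, just as the shard code does):

#include <thrust/device_vector.h>
#include "./host_device_vector.h"

void RoundTrip(xgboost::HostDeviceVector<float>* vec) {
  // Caller-owned device buffer of the full logical size.
  thrust::device_vector<float> buf(vec->Size());
  vec->GatherTo(buf.data(), buf.data() + buf.size());     // shards -> buf
  vec->ScatterFrom(buf.data(), buf.data() + buf.size());  // buf -> shards
}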
+  void Fill(T v) {
+    if (on_h_) {
+      std::fill(data_h_.begin(), data_h_.end(), v);
+    } else {
+      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.Fill(v); });
+    }
+  }
+
+  void Copy(HostDeviceVectorImpl<T>* other) {
+    CHECK_EQ(Size(), other->Size());
+    if (on_h_ && other->on_h_) {
+      std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
+    } else {
+      CHECK(devices_ == other->devices_);
+      dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
+        shard.Copy(&other->shards_[i]);
+      });
+    }
+  }
+
+  void Copy(const std::vector<T>& other) {
+    CHECK_EQ(Size(), other.size());
+    if (on_h_) {
+      std::copy(other.begin(), other.end(), data_h_.begin());
+    } else {
+      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
+        shard.ScatterFrom(other.data());
+      });
+    }
+  }
+
+  void Copy(std::initializer_list<T> other) {
+    CHECK_EQ(Size(), other.size());
+    if (on_h_) {
+      std::copy(other.begin(), other.end(), data_h_.begin());
+    } else {
+      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
+        shard.ScatterFrom(other.begin());
+      });
+    }
+  }
+
   std::vector<T>& HostVector() {
     LazySyncHost();
     return data_h_;
   }
-  void Resize(size_t new_size, T v, int new_device) {
-    if (new_size == this->Size() && new_device == device_)
+  void Reshard(GPUSet new_devices) {
+    if (devices_ == new_devices)
       return;
-    if (new_device != -1)
-      device_ = new_device;
-    // if !on_d_, but the data size is 0 and the device is set,
-    // resize the data on device instead
-    if (!on_d_ && (data_h_.size() > 0 || device_ == -1)) {
-      data_h_.resize(new_size, v);
+    CHECK(devices_.IsEmpty());
+    devices_ = new_devices;
+    InitShards();
+  }
+
+  void Resize(size_t new_size, T v) {
+    if (new_size == Size())
+      return;
+    if (Size() == 0 && !devices_.IsEmpty()) {
+      // fast on-device resize
+      on_h_ = false;
+      size_d_ = new_size;
+      InitShards();
+      Fill(v);
     } else {
-      dh::safe_cuda(cudaSetDevice(device_));
-      data_d_.resize(new_size, v);
-      on_d_ = true;
+      // resize on host
+      LazySyncHost();
+      data_h_.resize(new_size, v);
     }
   }
 
   void LazySyncHost() {
-    if (!on_d_)
+    if (on_h_)
       return;
-    if (data_h_.size() != this->Size())
-      data_h_.resize(this->Size());
-    dh::safe_cuda(cudaSetDevice(device_));
-    thrust::copy(data_d_.begin(), data_d_.end(), data_h_.begin());
-    on_d_ = false;
+    if (data_h_.size() != size_d_)
+      data_h_.resize(size_d_);
+    dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.LazySyncHost(); });
+    on_h_ = true;
   }
 
   void LazySyncDevice(int device) {
-    if (on_d_)
-      return;
-    if (device != device_) {
-      CHECK_EQ(device_, -1);
-      device_ = device;
-    }
-    if (data_d_.size() != this->Size()) {
-      dh::safe_cuda(cudaSetDevice(device_));
-      data_d_.resize(this->Size());
-    }
-    dh::safe_cuda(cudaSetDevice(device_));
-    thrust::copy(data_h_.begin(), data_h_.end(), data_d_.begin());
-    on_d_ = true;
+    CHECK(devices_.Contains(device));
+    shards_[devices_.Index(device)].LazySyncDevice();
   }
 
   std::vector<T> data_h_;
-  thrust::device_vector<T> data_d_;
-  // true if there is an up-to-date copy of data on device, false otherwise
-  bool on_d_;
-  int device_;
+  bool on_h_;
+  // the total size of the data stored on the devices
+  size_t size_d_;
+  GPUSet devices_;
+  std::vector<DeviceShard> shards_;
 };
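The implementation above is lazy in both directions: on_h_ and the per-shard on_d_ flags record which copy is fresh, and nothing moves until HostVector() or a device accessor forces it. Note also the Resize() fast path, which allocates straight on the devices when no host data exists yet, and that Reshard() only goes from an empty device set to a non-empty one. A sketch of the intended call order (sizes illustrative):

void ResizeReshardSketch() {
  xgboost::HostDeviceVector<float> vec;        // empty, host-only
  vec.Reshard(xgboost::GPUSet::Range(0, 2));   // legal only while devices are empty
  vec.Resize(1 << 20, 0.0f);                   // fast path: Size() == 0, fills on GPUs
  vec.Resize(1 << 21, 0.0f);                   // slow path: LazySyncHost, host resize
  vec.DevicePointer(1);                        // lazily re-uploads shard 1 only
}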
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device) : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(size, v, device);
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUSet devices)
+  : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(size, v, devices);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUSet devices)
   : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(init, device);
+  impl_ = new HostDeviceVectorImpl<T>(init, devices);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUSet devices)
   : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(init, device);
+  impl_ = new HostDeviceVectorImpl<T>(init, devices);
 }
 
 template <typename T>
@@ -131,11 +318,17 @@ template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
 
 template <typename T>
-int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
+GPUSet HostDeviceVector<T>::Devices() const { return impl_->Devices(); }
 
 template <typename T>
 T* HostDeviceVector<T>::DevicePointer(int device) { return impl_->DevicePointer(device); }
 
+template <typename T>
+size_t HostDeviceVector<T>::DeviceStart(int device) { return impl_->DeviceStart(device); }
+
+template <typename T>
+size_t HostDeviceVector<T>::DeviceSize(int device) { return impl_->DeviceSize(device); }
+
 template <typename T>
 thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) {  // NOLINT
   return impl_->tbegin(device);
@@ -146,16 +339,54 @@ thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) {  // NOLINT
   return impl_->tend(device);
 }
 
+template <typename T>
+void HostDeviceVector<T>::ScatterFrom
+(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
+  impl_->ScatterFrom(begin, end);
+}
+
+template <typename T>
+void HostDeviceVector<T>::GatherTo
+(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
+  impl_->GatherTo(begin, end);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Fill(T v) {
+  impl_->Fill(v);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Copy(HostDeviceVector<T>* other) {
+  impl_->Copy(other->impl_);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Copy(const std::vector<T>& other) {
+  impl_->Copy(other);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
+  impl_->Copy(other);
+}
+
 template <typename T>
 std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->HostVector(); }
 
 template <typename T>
-void HostDeviceVector<T>::Resize(size_t new_size, T v, int new_device) {
-  impl_->Resize(new_size, v, new_device);
+void HostDeviceVector<T>::Reshard(GPUSet new_devices) {
+  impl_->Reshard(new_devices);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Resize(size_t new_size, T v) {
+  impl_->Resize(new_size, v);
 }
 
 // explicit instantiations are required, as HostDeviceVector isn't header-only
 template class HostDeviceVector<bst_float>;
 template class HostDeviceVector<GradientPair>;
 template class HostDeviceVector<unsigned int>;
 
 }  // namespace xgboost
src/common/host_device_vector.h
@@ -4,6 +4,9 @@
 #ifndef XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
 #define XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
 
+#include <dmlc/logging.h>
+
+#include <algorithm>
 #include <cstdlib>
 #include <initializer_list>
 #include <vector>
@@ -18,6 +21,40 @@ namespace xgboost {
 
 template <typename T> struct HostDeviceVectorImpl;
 
+// set of devices across which HostDeviceVector can be distributed;
+// currently implemented as a range, but can be changed later to something else,
+// e.g. a bitset
+class GPUSet {
+ public:
+  explicit GPUSet(int start = 0, int ndevices = 0)
+    : start_(start), ndevices_(ndevices) {}
+  static GPUSet Empty() { return GPUSet(); }
+  static GPUSet Range(int start, int ndevices) { return GPUSet(start, ndevices); }
+  int Size() const { return ndevices_; }
+  int operator[](int index) const {
+    CHECK(index >= 0 && index < ndevices_);
+    return start_ + index;
+  }
+  bool IsEmpty() const { return ndevices_ <= 0; }
+  int Index(int device) const {
+    CHECK(device >= start_ && device < start_ + ndevices_);
+    return device - start_;
+  }
+  bool Contains(int device) const {
+    return start_ <= device && device < start_ + ndevices_;
+  }
+  friend bool operator==(GPUSet a, GPUSet b) {
+    return a.start_ == b.start_ && a.ndevices_ == b.ndevices_;
+  }
+  friend bool operator!=(GPUSet a, GPUSet b) {
+    return a.start_ != b.start_ || a.ndevices_ != b.ndevices_;
+  }
+
+ private:
+  int start_, ndevices_;
+};
+
 /**
  * @file host_device_vector.h
  * @brief A device-and-host vector abstraction layer.
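Since GPUSet is a contiguous range [start_, start_ + ndevices_), membership and index arithmetic are O(1). A quick sketch of its invariants (assertions illustrative):

#include <cassert>

void GPUSetSketch() {
  using xgboost::GPUSet;
  GPUSet gpus = GPUSet::Range(2, 3);  // devices 2, 3, 4
  assert(gpus.Size() == 3);
  assert(gpus.Contains(4) && !gpus.Contains(5));
  assert(gpus.Index(3) == 1);   // device 3 is the second device in the set
  assert(gpus[1] == 3);         // and operator[] maps the index back
  assert(GPUSet::Empty().IsEmpty());
}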
@@ -29,24 +66,26 @@ template <typename T> struct HostDeviceVectorImpl;
  *
  * Initialization/Allocation:<br/>
  * One can choose to initialize the vector on CPU or GPU during constructor.
- * (use the 'device' argument) Or, can choose to use the 'resize' method to
- * allocate/resize memory explicitly.
+ * (use the 'devices' argument) Or, can choose to use the 'Resize' method to
+ * allocate/resize memory explicitly, and use the 'Reshard' method
+ * to specify the devices.
  *
- * Accessing underling data:<br/>
- * Use 'data_h' method to explicitly query for the underlying std::vector.
- * If you need the raw device pointer, use the 'ptr_d' method. For perf
+ * Accessing underlying data:<br/>
+ * Use the 'HostVector' method to explicitly query for the underlying std::vector.
+ * If you need the raw device pointer, use the 'DevicePointer' method. For perf
  * implications of these calls, see below.
  *
  * Accessing underling data and their perf implications:<br/>
  * There are 4 scenarios to be considered here:
- * data_h and data on CPU --> no problems, std::vector returned immediately
- * data_h but data on GPU --> this causes a cudaMemcpy to be issued internally.
- *                        subsequent calls to data_h, will NOT incur this penalty.
- *                        (assuming 'ptr_d' is not called in between)
- * ptr_d but data on CPU --> this causes a cudaMemcpy to be issued internally.
- *                        subsequent calls to ptr_d, will NOT incur this penalty.
- *                        (assuming 'data_h' is not called in between)
- * ptr_d and data on GPU --> no problems, the device ptr will be returned immediately
+ * HostVector and data on CPU --> no problems, std::vector returned immediately
+ * HostVector but data on GPU --> this causes a cudaMemcpy to be issued internally.
+ *                        subsequent calls to HostVector will NOT incur this penalty.
+ *                        (assuming 'DevicePointer' is not called in between)
+ * DevicePointer but data on CPU --> this causes a cudaMemcpy to be issued internally.
+ *                        subsequent calls to DevicePointer will NOT incur this penalty.
+ *                        (assuming 'HostVector' is not called in between)
+ * DevicePointer and data on GPU --> no problems, the device ptr
+ *                        will be returned immediately.
  *
  * What if xgboost is compiled without CUDA?<br/>
  * In that case, there's a special implementation which always falls-back to
@@ -57,35 +96,49 @@ template <typename T> struct HostDeviceVectorImpl;
  * compiling with and without CUDA toolkit. It was easier to have
  * 'HostDeviceVector' with a special-case implementation in host_device_vector.cc
  *
+ * @note: Size and Devices methods are thread-safe.
+ * DevicePointer, DeviceStart, DeviceSize, tbegin and tend methods are thread-safe
+ * if different threads call these methods with different values of the device argument.
+ * All other methods are not thread safe.
 */
 template <typename T>
 class HostDeviceVector {
  public:
-  explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1);
-  HostDeviceVector(std::initializer_list<T> init, int device = -1);
-  explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
+  explicit HostDeviceVector(size_t size = 0, T v = T(),
+                            GPUSet devices = GPUSet::Empty());
+  HostDeviceVector(std::initializer_list<T> init, GPUSet devices = GPUSet::Empty());
+  explicit HostDeviceVector(const std::vector<T>& init,
+                            GPUSet devices = GPUSet::Empty());
   ~HostDeviceVector();
   HostDeviceVector(const HostDeviceVector<T>&) = delete;
   HostDeviceVector(HostDeviceVector<T>&&) = delete;
   void operator=(const HostDeviceVector<T>&) = delete;
   void operator=(HostDeviceVector<T>&&) = delete;
   size_t Size() const;
-  int DeviceIdx() const;
+  GPUSet Devices() const;
   T* DevicePointer(int device);
 
   T* HostPointer() { return HostVector().data(); }
+  size_t DeviceStart(int device);
+  size_t DeviceSize(int device);
 
   // only define functions returning device_ptr
   // if HostDeviceVector.h is included from a .cu file
 #ifdef __CUDACC__
-  thrust::device_ptr<T> tbegin(int device);
-  thrust::device_ptr<T> tend(int device);
+  thrust::device_ptr<T> tbegin(int device);  // NOLINT
+  thrust::device_ptr<T> tend(int device);  // NOLINT
+  void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end);
+  void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end);
 #endif
 
-  std::vector<T>& HostVector();
   void Fill(T v);
+  void Copy(HostDeviceVector<T>* other);
+  void Copy(const std::vector<T>& other);
+  void Copy(std::initializer_list<T> other);
 
-  // passing in new_device == -1 keeps the device as is
-  void Resize(size_t new_size, T v = T(), int new_device = -1);
+  std::vector<T>& HostVector();
+  void Reshard(GPUSet devices);
+  void Resize(size_t new_size, T v = T());
 
  private:
   HostDeviceVectorImpl<T>* impl_;
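One practical consequence of the sync rules documented above: batch host-side accesses together and device-side accesses together, because every switch between HostVector() and DevicePointer() costs a full copy. A sketch of the pattern (the function and values are illustrative):

void AccessPatternSketch(xgboost::HostDeviceVector<float>* vec) {
  // Cheap: consecutive host accesses reuse the already-synced copy.
  vec->HostVector()[0] = 1.0f;
  vec->HostVector()[1] = 2.0f;         // no extra cudaMemcpy
  float* d = vec->DevicePointer(0);    // one host->device copy for shard 0
  // Costly: alternating HostVector()/DevicePointer() calls ping-pong
  // a full copy across PCIe on every switch.
  (void)d;
}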