Multi-GPU HostDeviceVector. (#3287)
* Multi-GPU HostDeviceVector.
  - HostDeviceVector instances can now span multiple devices, defined by the GPUSet struct
  - the interface of HostDeviceVector has been modified accordingly
  - GPU objective functions are now multi-GPU
  - GPU prediction from cache is now multi-GPU
  - avoids omp_set_num_threads() calls
  - other minor changes
Parent commit: 90a5c4db9d, this commit: b8a0d66fe6
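At a glance, the reworked vector is constructed (or resharded) over a GPUSet and then accessed per device. The sketch below is illustrative only: the include path, the four-GPU range and the vector size are assumptions, not part of the commit.

#include <vector>
#include "common/host_device_vector.h"  // include path assumed for the sketch

void MultiGpuVectorSketch() {
  using xgboost::GPUSet;
  using xgboost::HostDeviceVector;

  GPUSet devices = GPUSet::Range(0, 4);               // GPUs 0..3 (example)
  HostDeviceVector<float> vec(1000, 0.0f, devices);   // sharded across 4 devices

  vec.Fill(1.0f);                                     // applied to every shard
  for (int i = 0; i < devices.Size(); ++i) {
    int d = devices[i];
    float* shard = vec.DevicePointer(d);              // this device's portion
    size_t n     = vec.DeviceSize(d);                 // roughly 1000 / 4 elements
    (void)shard; (void)n;                             // hand these to a kernel in real code
  }
  std::vector<float>& host = vec.HostVector();        // lazily syncs shards back to host
  (void)host;
}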
@@ -1004,14 +1004,29 @@ class AllReducer {

 template <typename T, typename FunctionT>
 void ExecuteShards(std::vector<T> *shards, FunctionT f) {
-  auto previous_num_threads = omp_get_max_threads();
-  omp_set_num_threads(shards->size());
-#pragma omp parallel
-  {
-    auto cpu_thread_id = omp_get_thread_num();
-    f(shards->at(cpu_thread_id));
+#pragma omp parallel for schedule(static, 1)
+  for (int shard = 0; shard < shards->size(); ++shard) {
+    f(shards->at(shard));
+  }
+}
+
+/**
+ * \brief Executes some operation on each element of the input vector, using a
+ * single controlling thread for each element. In addition, passes the shard index
+ * into the function.
+ *
+ * \tparam T Generic type parameter.
+ * \tparam FunctionT Type of the function t.
+ * \param shards The shards.
+ * \param f The func_t to process.
+ */
+template <typename T, typename FunctionT>
+void ExecuteIndexShards(std::vector<T> *shards, FunctionT f) {
+#pragma omp parallel for schedule(static, 1)
+  for (int shard = 0; shard < shards->size(); ++shard) {
+    f(shard, shards->at(shard));
   }
-  omp_set_num_threads(previous_num_threads);
 }

@@ -1029,15 +1044,11 @@ void ExecuteShards(std::vector<T> *shards, FunctionT f) {

 template <typename ReduceT,typename T, typename FunctionT>
 ReduceT ReduceShards(std::vector<T> *shards, FunctionT f) {
-  auto previous_num_threads = omp_get_max_threads();
-  omp_set_num_threads(shards->size());
   std::vector<ReduceT> sums(shards->size());
-#pragma omp parallel
-  {
-    auto cpu_thread_id = omp_get_thread_num();
-    sums[cpu_thread_id] = f(shards->at(cpu_thread_id));
+#pragma omp parallel for schedule(static, 1)
+  for (int shard = 0; shard < shards->size(); ++shard) {
+    sums[shard] = f(shards->at(shard));
   }
-  omp_set_num_threads(previous_num_threads);
   return std::accumulate(sums.begin(), sums.end(), ReduceT());
 }
 }  // namespace dh
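A small illustration of the pattern these helpers now use: instead of omp_set_num_threads() plus a bare parallel region, one OpenMP thread is statically mapped to each shard. ExecuteShards is reproduced from the diff; DummyShard and main() are the example's own.

#include <omp.h>
#include <cstdio>
#include <vector>

struct DummyShard { int device_id; };  // stand-in for a per-GPU state object

template <typename T, typename FunctionT>
void ExecuteShards(std::vector<T> *shards, FunctionT f) {
  // schedule(static, 1): thread i handles shard i, without touching the global thread count
#pragma omp parallel for schedule(static, 1)
  for (int shard = 0; shard < shards->size(); ++shard) {
    f(shards->at(shard));
  }
}

int main() {
  std::vector<DummyShard> shards{{0}, {1}, {2}};
  ExecuteShards(&shards, [](DummyShard& s) {
    std::printf("shard for device %d handled by thread %d\n",
                s.device_id, omp_get_thread_num());
  });
  return 0;
}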
@@ -21,18 +21,18 @@ struct HostDeviceVectorImpl {
 };

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device) : impl_(nullptr) {
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUSet devices) : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(size, v);
 }

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUSet devices)
   : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUSet devices)
   : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }

@@ -48,7 +48,7 @@ template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->data_h_.size(); }

 template <typename T>
-int HostDeviceVector<T>::DeviceIdx() const { return -1; }
+GPUSet HostDeviceVector<T>::Devices() const { return GPUSet::Empty(); }

 template <typename T>
 T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; }

@@ -57,13 +57,46 @@ template <typename T>
 std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->data_h_; }

 template <typename T>
-void HostDeviceVector<T>::Resize(size_t new_size, T v, int new_device) {
+void HostDeviceVector<T>::Resize(size_t new_size, T v) {
   impl_->data_h_.resize(new_size, v);
 }

+template <typename T>
+size_t HostDeviceVector<T>::DeviceStart(int device) { return 0; }
+
+template <typename T>
+size_t HostDeviceVector<T>::DeviceSize(int device) { return 0; }
+
+template <typename T>
+void HostDeviceVector<T>::Fill(T v) {
+  std::fill(HostVector().begin(), HostVector().end(), v);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Copy(HostDeviceVector<T>* other) {
+  CHECK_EQ(Size(), other->Size());
+  std::copy(other->HostVector().begin(), other->HostVector().end(), HostVector().begin());
+}
+
+template <typename T>
+void HostDeviceVector<T>::Copy(const std::vector<T>& other) {
+  CHECK_EQ(Size(), other.size());
+  std::copy(other.begin(), other.end(), HostVector().begin());
+}
+
+template <typename T>
+void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
+  CHECK_EQ(Size(), other.size());
+  std::copy(other.begin(), other.end(), HostVector().begin());
+}
+
+template <typename T>
+void HostDeviceVector<T>::Reshard(GPUSet devices) { }
+
 // explicit instantiations are required, as HostDeviceVector isn't header-only
 template class HostDeviceVector<bst_float>;
 template class HostDeviceVector<GradientPair>;
+template class HostDeviceVector<unsigned int>;

 }  // namespace xgboost
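In the CUDA-free build shown above, every operation degenerates to a std::vector operation, Devices() is always empty and Reshard() is a no-op. A minimal illustration of those fallback semantics (the wrapper function is the example's own):

#include <cassert>
#include <vector>
#include "common/host_device_vector.h"  // include path assumed

void CpuFallbackExample() {
  xgboost::HostDeviceVector<float> v(4, 0.0f);    // devices default to GPUSet::Empty()
  assert(v.Devices().IsEmpty());
  v.Fill(2.5f);                                   // std::fill on the host buffer
  std::vector<float> src{1.f, 2.f, 3.f, 4.f};
  v.Copy(src);                                    // sizes must match (CHECK_EQ)
  assert(v.HostVector()[2] == 3.f);
}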
@@ -2,122 +2,309 @@
  * Copyright 2017 XGBoost contributors
  */

+#include <thrust/fill.h>
 #include "./host_device_vector.h"
 #include "./device_helpers.cuh"

 namespace xgboost {

 template <typename T>
 struct HostDeviceVectorImpl {
-  HostDeviceVectorImpl(size_t size, T v, int device)
-    : device_(device), on_d_(device >= 0) {
-    if (on_d_) {
+  struct DeviceShard {
+    DeviceShard() : index_(-1), device_(-1), start_(0), on_d_(false), vec_(nullptr) {}
+
+    static size_t ShardStart(size_t size, int ndevices, int index) {
+      size_t portion = dh::DivRoundUp(size, ndevices);
+      size_t begin = index * portion;
+      begin = begin > size ? size : begin;
+      return begin;
+    }
+
+    static size_t ShardSize(size_t size, int ndevices, int index) {
+      size_t portion = dh::DivRoundUp(size, ndevices);
+      size_t begin = index * portion, end = (index + 1) * portion;
+      begin = begin > size ? size : begin;
+      end = end > size ? size : end;
+      return end - begin;
+    }
+
+    void Init(HostDeviceVectorImpl<T>* vec, int device) {
+      if (vec_ == nullptr) { vec_ = vec; }
+      CHECK_EQ(vec, vec_);
+      device_ = device;
+      index_ = vec_->devices_.Index(device);
+      size_t size_h = vec_->Size();
+      int ndevices = vec_->devices_.Size();
+      start_ = ShardStart(size_h, ndevices, index_);
+      size_t size_d = ShardSize(size_h, ndevices, index_);
       dh::safe_cuda(cudaSetDevice(device_));
-      data_d_.resize(size, v);
+      data_.resize(size_d);
+      on_d_ = !vec_->on_h_;
+    }
+
+    void ScatterFrom(const T* begin) {
+      // TODO(canonizer): avoid full copy of host data
+      LazySyncDevice();
+      dh::safe_cuda(cudaSetDevice(device_));
+      dh::safe_cuda(cudaMemcpy(data_.data().get(), begin + start_,
+                               data_.size() * sizeof(T), cudaMemcpyDefault));
+    }
+
+    void GatherTo(thrust::device_ptr<T> begin) {
+      LazySyncDevice();
+      dh::safe_cuda(cudaSetDevice(device_));
+      dh::safe_cuda(cudaMemcpy(begin.get() + start_, data_.data().get(),
+                               data_.size() * sizeof(T), cudaMemcpyDefault));
+    }
+
+    void Fill(T v) {
+      // TODO(canonizer): avoid full copy of host data
+      LazySyncDevice();
+      dh::safe_cuda(cudaSetDevice(device_));
+      thrust::fill(data_.begin(), data_.end(), v);
+    }
+
+    void Copy(DeviceShard* other) {
+      // TODO(canonizer): avoid full copy of host data for this (but not for other)
+      LazySyncDevice();
+      other->LazySyncDevice();
+      dh::safe_cuda(cudaSetDevice(device_));
+      dh::safe_cuda(cudaMemcpy(data_.data().get(), other->data_.data().get(),
+                               data_.size() * sizeof(T), cudaMemcpyDefault));
+    }
+
+    void LazySyncHost() {
+      dh::safe_cuda(cudaSetDevice(device_));
+      thrust::copy(data_.begin(), data_.end(), vec_->data_h_.begin() + start_);
+      on_d_ = false;
+    }
+
+    void LazySyncDevice() {
+      if (on_d_) { return; }
+      // data is on the host
+      size_t size_h = vec_->data_h_.size();
+      int ndevices = vec_->devices_.Size();
+      start_ = ShardStart(size_h, ndevices, index_);
+      size_t size_d = ShardSize(size_h, ndevices, index_);
+      dh::safe_cuda(cudaSetDevice(device_));
+      data_.resize(size_d);
+      thrust::copy(vec_->data_h_.begin() + start_,
+                   vec_->data_h_.begin() + start_ + size_d, data_.begin());
+      on_d_ = true;
+      // this may cause a race condition if LazySyncDevice() is called
+      // from multiple threads in parallel;
+      // however, the race condition is benign, and will not cause problems
+      vec_->on_h_ = false;
+      vec_->size_d_ = vec_->data_h_.size();
+    }
+
+    int index_;
+    int device_;
+    thrust::device_vector<T> data_;
+    size_t start_;
+    // true if there is an up-to-date copy of data on device, false otherwise
+    bool on_d_;
+    HostDeviceVectorImpl<T>* vec_;
+  };
+
+  HostDeviceVectorImpl(size_t size, T v, GPUSet devices)
+    : devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) {
+    if (!devices.IsEmpty()) {
+      size_d_ = size;
+      InitShards();
+      Fill(v);
     } else {
       data_h_.resize(size, v);
     }
   }

   // Init can be std::vector<T> or std::initializer_list<T>
   template <class Init>
-  HostDeviceVectorImpl(const Init& init, int device)
-    : device_(device), on_d_(device >= 0) {
-    if (on_d_) {
-      dh::safe_cuda(cudaSetDevice(device_));
-      data_d_.resize(init.size());
-      thrust::copy(init.begin(), init.end(), data_d_.begin());
+  HostDeviceVectorImpl(const Init& init, GPUSet devices)
+    : devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) {
+    if (!devices.IsEmpty()) {
+      size_d_ = init.size();
+      InitShards();
+      Copy(init);
     } else {
       data_h_ = init;
     }
   }

+  void InitShards() {
+    int ndevices = devices_.Size();
+    shards_.resize(ndevices);
+    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
+      shard.Init(this, devices_[i]);
+    });
+  }
+
   HostDeviceVectorImpl(const HostDeviceVectorImpl<T>&) = delete;
   HostDeviceVectorImpl(HostDeviceVectorImpl<T>&&) = delete;
   void operator=(const HostDeviceVectorImpl<T>&) = delete;
   void operator=(HostDeviceVectorImpl<T>&&) = delete;

-  size_t Size() const { return on_d_ ? data_d_.size() : data_h_.size(); }
+  size_t Size() const { return on_h_ ? data_h_.size() : size_d_; }

-  int DeviceIdx() const { return device_; }
+  GPUSet Devices() const { return devices_; }

   T* DevicePointer(int device) {
+    CHECK(devices_.Contains(device));
     LazySyncDevice(device);
-    return data_d_.data().get();
+    return shards_[devices_.Index(device)].data_.data().get();
   }

+  size_t DeviceSize(int device) {
+    CHECK(devices_.Contains(device));
+    LazySyncDevice(device);
+    return shards_[devices_.Index(device)].data_.size();
+  }
+
+  size_t DeviceStart(int device) {
+    CHECK(devices_.Contains(device));
+    LazySyncDevice(device);
+    return shards_[devices_.Index(device)].start_;
+  }
+
   thrust::device_ptr<T> tbegin(int device) {  // NOLINT
     return thrust::device_ptr<T>(DevicePointer(device));
   }

   thrust::device_ptr<T> tend(int device) {  // NOLINT
-    auto begin = tbegin(device);
-    return begin + Size();
+    return tbegin(device) + DeviceSize(device);
   }

+  void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
+    CHECK_EQ(end - begin, Size());
+    if (on_h_) {
+      thrust::copy(begin, end, data_h_.begin());
+    } else {
+      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
+        shard.ScatterFrom(begin.get());
+      });
+    }
+  }
+
+  void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
+    CHECK_EQ(end - begin, Size());
+    if (on_h_) {
+      thrust::copy(data_h_.begin(), data_h_.end(), begin);
+    } else {
+      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.GatherTo(begin); });
+    }
+  }
+
+  void Fill(T v) {
+    if (on_h_) {
+      std::fill(data_h_.begin(), data_h_.end(), v);
+    } else {
+      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.Fill(v); });
+    }
+  }
+
+  void Copy(HostDeviceVectorImpl<T>* other) {
+    CHECK_EQ(Size(), other->Size());
+    if (on_h_ && other->on_h_) {
+      std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
+    } else {
+      CHECK(devices_ == other->devices_);
+      dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
+        shard.Copy(&other->shards_[i]);
+      });
+    }
+  }
+
+  void Copy(const std::vector<T>& other) {
+    CHECK_EQ(Size(), other.size());
+    if (on_h_) {
+      std::copy(other.begin(), other.end(), data_h_.begin());
+    } else {
+      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
+        shard.ScatterFrom(other.data());
+      });
+    }
+  }
+
+  void Copy(std::initializer_list<T> other) {
+    CHECK_EQ(Size(), other.size());
+    if (on_h_) {
+      std::copy(other.begin(), other.end(), data_h_.begin());
+    } else {
+      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
+        shard.ScatterFrom(other.begin());
+      });
+    }
+  }
+
   std::vector<T>& HostVector() {
     LazySyncHost();
     return data_h_;
   }

-  void Resize(size_t new_size, T v, int new_device) {
-    if (new_size == this->Size() && new_device == device_)
+  void Reshard(GPUSet new_devices) {
+    if (devices_ == new_devices)
       return;
-    if (new_device != -1)
-      device_ = new_device;
-    // if !on_d_, but the data size is 0 and the device is set,
-    // resize the data on device instead
-    if (!on_d_ && (data_h_.size() > 0 || device_ == -1)) {
-      data_h_.resize(new_size, v);
+    CHECK(devices_.IsEmpty());
+    devices_ = new_devices;
+    InitShards();
+  }
+
+  void Resize(size_t new_size, T v) {
+    if (new_size == Size())
+      return;
+    if (Size() == 0 && !devices_.IsEmpty()) {
+      // fast on-device resize
+      on_h_ = false;
+      size_d_ = new_size;
+      InitShards();
+      Fill(v);
     } else {
-      dh::safe_cuda(cudaSetDevice(device_));
-      data_d_.resize(new_size, v);
-      on_d_ = true;
+      // resize on host
+      LazySyncHost();
+      data_h_.resize(new_size, v);
     }
   }

   void LazySyncHost() {
-    if (!on_d_)
+    if (on_h_)
       return;
-    if (data_h_.size() != this->Size())
-      data_h_.resize(this->Size());
-    dh::safe_cuda(cudaSetDevice(device_));
-    thrust::copy(data_d_.begin(), data_d_.end(), data_h_.begin());
-    on_d_ = false;
+    if (data_h_.size() != size_d_)
+      data_h_.resize(size_d_);
+    dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.LazySyncHost(); });
+    on_h_ = true;
   }

   void LazySyncDevice(int device) {
-    if (on_d_)
-      return;
-    if (device != device_) {
-      CHECK_EQ(device_, -1);
-      device_ = device;
-    }
-    if (data_d_.size() != this->Size()) {
-      dh::safe_cuda(cudaSetDevice(device_));
-      data_d_.resize(this->Size());
-    }
-    dh::safe_cuda(cudaSetDevice(device_));
-    thrust::copy(data_h_.begin(), data_h_.end(), data_d_.begin());
-    on_d_ = true;
+    CHECK(devices_.Contains(device));
+    shards_[devices_.Index(device)].LazySyncDevice();
   }

   std::vector<T> data_h_;
-  thrust::device_vector<T> data_d_;
-  // true if there is an up-to-date copy of data on device, false otherwise
-  bool on_d_;
-  int device_;
+  bool on_h_;
+  // the total size of the data stored on the devices
+  size_t size_d_;
+  GPUSet devices_;
+  std::vector<DeviceShard> shards_;
 };

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device) : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(size, v, device);
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUSet devices)
+  : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(size, v, devices);
 }

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUSet devices)
   : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(init, device);
+  impl_ = new HostDeviceVectorImpl<T>(init, devices);
 }

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUSet devices)
   : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(init, device);
+  impl_ = new HostDeviceVectorImpl<T>(init, devices);
 }

@@ -131,11 +318,17 @@ template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }

 template <typename T>
-int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
+GPUSet HostDeviceVector<T>::Devices() const { return impl_->Devices(); }

 template <typename T>
 T* HostDeviceVector<T>::DevicePointer(int device) { return impl_->DevicePointer(device); }

+template <typename T>
+size_t HostDeviceVector<T>::DeviceStart(int device) { return impl_->DeviceStart(device); }
+
+template <typename T>
+size_t HostDeviceVector<T>::DeviceSize(int device) { return impl_->DeviceSize(device); }
+
 template <typename T>
 thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) {  // NOLINT
   return impl_->tbegin(device);

@@ -146,16 +339,54 @@ thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) {  // NOLINT
   return impl_->tend(device);
 }

+template <typename T>
+void HostDeviceVector<T>::ScatterFrom
+(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
+  impl_->ScatterFrom(begin, end);
+}
+
+template <typename T>
+void HostDeviceVector<T>::GatherTo
+(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
+  impl_->GatherTo(begin, end);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Fill(T v) {
+  impl_->Fill(v);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Copy(HostDeviceVector<T>* other) {
+  impl_->Copy(other->impl_);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Copy(const std::vector<T>& other) {
+  impl_->Copy(other);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
+  impl_->Copy(other);
+}
+
 template <typename T>
 std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->HostVector(); }

 template <typename T>
-void HostDeviceVector<T>::Resize(size_t new_size, T v, int new_device) {
-  impl_->Resize(new_size, v, new_device);
+void HostDeviceVector<T>::Reshard(GPUSet new_devices) {
+  impl_->Reshard(new_devices);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Resize(size_t new_size, T v) {
+  impl_->Resize(new_size, v);
 }

 // explicit instantiations are required, as HostDeviceVector isn't header-only
 template class HostDeviceVector<bst_float>;
 template class HostDeviceVector<GradientPair>;
+template class HostDeviceVector<unsigned int>;

 }  // namespace xgboost
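The ShardStart/ShardSize arithmetic above rounds the per-device portion up, so the last device gets whatever remains and may be short (or empty). A standalone sketch of that partitioning; DivRoundUp is re-implemented locally and the 10-element / 3-device split is just an example:

#include <cstddef>
#include <cstdio>

static size_t DivRoundUp(size_t a, size_t b) { return (a + b - 1) / b; }

static size_t ShardStart(size_t size, int ndevices, int index) {
  size_t portion = DivRoundUp(size, ndevices);
  size_t begin = index * portion;
  return begin > size ? size : begin;
}

static size_t ShardSize(size_t size, int ndevices, int index) {
  size_t portion = DivRoundUp(size, ndevices);
  size_t begin = index * portion, end = (index + 1) * portion;
  begin = begin > size ? size : begin;
  end = end > size ? size : end;
  return end - begin;
}

int main() {
  // 10 elements on 3 devices -> shards of 4, 4 and 2 elements
  for (int i = 0; i < 3; ++i) {
    std::printf("device %d: start=%zu size=%zu\n",
                i, ShardStart(10, 3, i), ShardSize(10, 3, i));
  }
  return 0;
}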
@@ -4,6 +4,9 @@
 #ifndef XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
 #define XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_

+#include <dmlc/logging.h>
+
+#include <algorithm>
 #include <cstdlib>
 #include <initializer_list>
 #include <vector>

@@ -18,6 +21,40 @@ namespace xgboost {

 template <typename T> struct HostDeviceVectorImpl;

+// set of devices across which HostDeviceVector can be distributed;
+// currently implemented as a range, but can be changed later to something else,
+// e.g. a bitset
+class GPUSet {
+ public:
+  explicit GPUSet(int start = 0, int ndevices = 0)
+    : start_(start), ndevices_(ndevices) {}
+  static GPUSet Empty() { return GPUSet(); }
+  static GPUSet Range(int start, int ndevices) { return GPUSet(start, ndevices); }
+  int Size() const { return ndevices_; }
+  int operator[](int index) const {
+    CHECK(index >= 0 && index < ndevices_);
+    return start_ + index;
+  }
+  bool IsEmpty() const { return ndevices_ <= 0; }
+  int Index(int device) const {
+    CHECK(device >= start_ && device < start_ + ndevices_);
+    return device - start_;
+  }
+  bool Contains(int device) const {
+    return start_ <= device && device < start_ + ndevices_;
+  }
+  friend bool operator==(GPUSet a, GPUSet b) {
+    return a.start_ == b.start_ && a.ndevices_ == b.ndevices_;
+  }
+  friend bool operator!=(GPUSet a, GPUSet b) {
+    return a.start_ != b.start_ || a.ndevices_ != b.ndevices_;
+  }
+
+ private:
+  int start_, ndevices_;
+};
+
 /**
  * @file host_device_vector.h
  * @brief A device-and-host vector abstraction layer.

@@ -29,24 +66,26 @@ template <typename T> struct HostDeviceVectorImpl;
  *
  * Initialization/Allocation:<br/>
  * One can choose to initialize the vector on CPU or GPU during constructor.
- * (use the 'device' argument) Or, can choose to use the 'resize' method to
- * allocate/resize memory explicitly.
+ * (use the 'devices' argument) Or, can choose to use the 'Resize' method to
+ * allocate/resize memory explicitly, and use the 'Reshard' method
+ * to specify the devices.
  *
- * Accessing underling data:<br/>
- * Use 'data_h' method to explicitly query for the underlying std::vector.
- * If you need the raw device pointer, use the 'ptr_d' method. For perf
+ * Accessing underlying data:<br/>
+ * Use 'HostVector' method to explicitly query for the underlying std::vector.
+ * If you need the raw device pointer, use the 'DevicePointer' method. For perf
  * implications of these calls, see below.
  *
  * Accessing underling data and their perf implications:<br/>
  * There are 4 scenarios to be considered here:
- * data_h and data on CPU --> no problems, std::vector returned immediately
- * data_h but data on GPU --> this causes a cudaMemcpy to be issued internally.
- * subsequent calls to data_h, will NOT incur this penalty.
- * (assuming 'ptr_d' is not called in between)
- * ptr_d but data on CPU --> this causes a cudaMemcpy to be issued internally.
- * subsequent calls to ptr_d, will NOT incur this penalty.
- * (assuming 'data_h' is not called in between)
- * ptr_d and data on GPU --> no problems, the device ptr will be returned immediately
+ * HostVector and data on CPU --> no problems, std::vector returned immediately
+ * HostVector but data on GPU --> this causes a cudaMemcpy to be issued internally.
+ * subsequent calls to HostVector, will NOT incur this penalty.
+ * (assuming 'DevicePointer' is not called in between)
+ * DevicePointer but data on CPU --> this causes a cudaMemcpy to be issued internally.
+ * subsequent calls to DevicePointer, will NOT incur this penalty.
+ * (assuming 'HostVector' is not called in between)
+ * DevicePointer and data on GPU --> no problems, the device ptr
+ * will be returned immediately.
  *
  * What if xgboost is compiled without CUDA?<br/>
  * In that case, there's a special implementation which always falls-back to

@@ -57,35 +96,49 @@ template <typename T> struct HostDeviceVectorImpl;
  * compiling with and without CUDA toolkit. It was easier to have
  * 'HostDeviceVector' with a special-case implementation in host_device_vector.cc
  *
- * @note: This is not thread-safe!
+ * @note: Size and Devices methods are thread-safe.
+ * DevicePointer, DeviceStart, DeviceSize, tbegin and tend methods are thread-safe
+ * if different threads call these methods with different values of the device argument.
+ * All other methods are not thread safe.
  */
 template <typename T>
 class HostDeviceVector {
  public:
-  explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1);
-  HostDeviceVector(std::initializer_list<T> init, int device = -1);
-  explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
+  explicit HostDeviceVector(size_t size = 0, T v = T(),
+                            GPUSet devices = GPUSet::Empty());
+  HostDeviceVector(std::initializer_list<T> init, GPUSet devices = GPUSet::Empty());
+  explicit HostDeviceVector(const std::vector<T>& init,
+                            GPUSet devices = GPUSet::Empty());
   ~HostDeviceVector();
   HostDeviceVector(const HostDeviceVector<T>&) = delete;
   HostDeviceVector(HostDeviceVector<T>&&) = delete;
   void operator=(const HostDeviceVector<T>&) = delete;
   void operator=(HostDeviceVector<T>&&) = delete;
   size_t Size() const;
-  int DeviceIdx() const;
+  GPUSet Devices() const;
   T* DevicePointer(int device);
+
   T* HostPointer() { return HostVector().data(); }
+  size_t DeviceStart(int device);
+  size_t DeviceSize(int device);

   // only define functions returning device_ptr
   // if HostDeviceVector.h is included from a .cu file
 #ifdef __CUDACC__
-  thrust::device_ptr<T> tbegin(int device);
-  thrust::device_ptr<T> tend(int device);
+  thrust::device_ptr<T> tbegin(int device);  // NOLINT
+  thrust::device_ptr<T> tend(int device);  // NOLINT
+  void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end);
+  void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end);
 #endif

-  std::vector<T>& HostVector();
+  void Fill(T v);
+  void Copy(HostDeviceVector<T>* other);
+  void Copy(const std::vector<T>& other);
+  void Copy(std::initializer_list<T> other);

-  // passing in new_device == -1 keeps the device as is
-  void Resize(size_t new_size, T v = T(), int new_device = -1);
+  std::vector<T>& HostVector();
+  void Reshard(GPUSet devices);
+  void Resize(size_t new_size, T v = T());

  private:
   HostDeviceVectorImpl<T>* impl_;
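A quick illustration of how a GPUSet range maps between device ordinals and shard indices; the concrete range starting at device 2 is an arbitrary example:

#include <cassert>
#include "common/host_device_vector.h"  // include path assumed; GPUSet is declared here

void GpuSetExample() {
  xgboost::GPUSet devices = xgboost::GPUSet::Range(2, 3);  // devices 2, 3, 4
  assert(devices.Size() == 3);
  assert(devices[0] == 2);        // shard index -> device ordinal
  assert(devices.Contains(4));
  assert(devices.Index(4) == 2);  // device ordinal -> shard index
  assert(!devices.IsEmpty());
}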
@@ -195,7 +195,7 @@ class GBTree : public GradientBooster {
         << "must have exactly ngroup*nrow gpairs";
     // TODO(canonizer): perform this on GPU if HostDeviceVector has device set.
     HostDeviceVector<GradientPair> tmp(in_gpair->Size() / ngroup,
-                                       GradientPair(), in_gpair->DeviceIdx());
+                                       GradientPair(), in_gpair->Devices());
     std::vector<GradientPair>& gpair_h = in_gpair->HostVector();
     auto nsize = static_cast<bst_omp_uint>(tmp.Size());
     for (int gid = 0; gid < ngroup; ++gid) {
@@ -74,46 +74,35 @@ __global__ void pred_transform_k(float * __restrict__ preds, int n) {
 template<typename Loss>
 class GPURegLossObj : public ObjFunction {
  protected:
-  // manages device data
-  struct DeviceData {
-    DVec<float> labels, weights;
-    DVec<unsigned int> label_correct;
-
-    // allocate everything on device
-    DeviceData(dh::BulkAllocator<dh::MemoryType::kDevice>* ba, int device_idx, size_t n) {
-      ba->Allocate(device_idx, false,
-                   &labels, n,
-                   &weights, n,
-                   &label_correct, 1);
-    }
-    size_t Size() const { return labels.Size(); }
-  };
-
   bool copied_;
-  std::unique_ptr<dh::BulkAllocator<dh::MemoryType::kDevice>> ba_;
-  std::unique_ptr<DeviceData> data_;
-  HostDeviceVector<bst_float> preds_d_;
-  HostDeviceVector<GradientPair> out_gpair_d_;
+  HostDeviceVector<bst_float> labels_, weights_;
+  HostDeviceVector<unsigned int> label_correct_;

-  // allocate device data for n elements, do nothing if enough memory is allocated already
-  void LazyResize(int n) {
-    if (data_.get() != nullptr && data_->Size() >= n)
+  // allocate device data for n elements, do nothing if memory is allocated already
+  void LazyResize(size_t n, size_t n_weights) {
+    if (labels_.Size() == n && weights_.Size() == n_weights)
       return;
     copied_ = false;
-    // free the old data and allocate the new data
-    ba_.reset(new dh::BulkAllocator<dh::MemoryType::kDevice>());
-    data_.reset(new DeviceData(ba_.get(), 0, n));
-    preds_d_.Resize(n, 0.0f, param_.gpu_id);
-    out_gpair_d_.Resize(n, GradientPair(), param_.gpu_id);
+    labels_.Reshard(devices_);
+    weights_.Reshard(devices_);
+    label_correct_.Reshard(devices_);
+
+    if (labels_.Size() != n) {
+      labels_.Resize(n);
+      label_correct_.Resize(devices_.Size());
+    }
+    if (weights_.Size() != n_weights)
+      weights_.Resize(n_weights);
   }

  public:
-  GPURegLossObj() : copied_(false), preds_d_(0, -1), out_gpair_d_({}, -1) {}
+  GPURegLossObj() : copied_(false) {}

   void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
     param_.InitAllowUnknown(args);
     CHECK(param_.n_gpus != 0) << "Must have at least one device";
+    devices_ = GPUSet::Range(param_.gpu_id, dh::NDevicesAll(param_.n_gpus));
   }

   void GetGradient(HostDeviceVector<float>* preds,

@@ -125,45 +114,49 @@ class GPURegLossObj : public ObjFunction {
         << "labels are not correctly provided"
         << "preds.size=" << preds->Size() << ", label.size=" << info.labels_.size();
     size_t ndata = preds->Size();
-    out_gpair->Resize(ndata, GradientPair(), param_.gpu_id);
-    LazyResize(ndata);
-    GetGradientDevice(preds->DevicePointer(param_.gpu_id), info, iter,
-                      out_gpair->DevicePointer(param_.gpu_id), ndata);
+    preds->Reshard(devices_);
+    out_gpair->Reshard(devices_);
+    out_gpair->Resize(ndata);
+    LazyResize(ndata, info.weights_.size());
+    GetGradientDevice(preds, info, iter, out_gpair);
   }

  private:
-  void GetGradientDevice(float* preds,
+  void GetGradientDevice(HostDeviceVector<float>* preds,
                          const MetaInfo &info,
                          int iter,
-                         GradientPair* out_gpair, size_t n) {
-    dh::safe_cuda(cudaSetDevice(param_.gpu_id));
-    DeviceData& d = *data_;
-    d.label_correct.Fill(1);
+                         HostDeviceVector<GradientPair>* out_gpair) {
+    label_correct_.Fill(1);

     // only copy the labels and weights once, similar to how the data is copied
     if (!copied_) {
-      thrust::copy(info.labels_.begin(), info.labels_.begin() + n,
-                   d.labels.tbegin());
-      if (info.weights_.size() > 0) {
-        thrust::copy(info.weights_.begin(), info.weights_.begin() + n,
-                     d.weights.tbegin());
-      }
+      labels_.Copy(info.labels_);
+      if (info.weights_.size() > 0)
+        weights_.Copy(info.weights_);
       copied_ = true;
     }

     // run the kernel
+#pragma omp parallel for schedule(static, 1)
+    for (int i = 0; i < devices_.Size(); ++i) {
+      int d = devices_[i];
+      dh::safe_cuda(cudaSetDevice(d));
       const int block = 256;
+      size_t n = preds->DeviceSize(d);
+      if (n > 0) {
         get_gradient_k<Loss><<<dh::DivRoundUp(n, block), block>>>
-          (out_gpair, d.label_correct.Data(), preds,
-           d.labels.Data(), info.weights_.size() > 0 ? d.weights.Data() : nullptr,
+          (out_gpair->DevicePointer(d), label_correct_.DevicePointer(d),
+           preds->DevicePointer(d), labels_.DevicePointer(d),
+           info.weights_.size() > 0 ? weights_.DevicePointer(d) : nullptr,
            n, param_.scale_pos_weight);
         dh::safe_cuda(cudaGetLastError());
+      }
+      dh::safe_cuda(cudaDeviceSynchronize());
+    }

-    // copy output data from the GPU
-    unsigned int label_correct_h;
-    thrust::copy_n(d.label_correct.tbegin(), 1, &label_correct_h);
-
-    bool label_correct = label_correct_h != 0;
-    if (!label_correct) {
+    // copy "label correct" flags back to host
+    std::vector<unsigned int>& label_correct_h = label_correct_.HostVector();
+    for (int i = 0; i < devices_.Size(); ++i) {
+      if (label_correct_h[i] == 0)
         LOG(FATAL) << Loss::LabelErrorMsg();
     }
   }

@@ -174,17 +167,25 @@ class GPURegLossObj : public ObjFunction {
   }

   void PredTransform(HostDeviceVector<float> *io_preds) override {
-    PredTransformDevice(io_preds->DevicePointer(param_.gpu_id), io_preds->Size());
+    io_preds->Reshard(devices_);
+    size_t ndata = io_preds->Size();
+    PredTransformDevice(io_preds);
   }

-  void PredTransformDevice(float* preds, size_t n) {
-    dh::safe_cuda(cudaSetDevice(param_.gpu_id));
+  void PredTransformDevice(HostDeviceVector<float>* preds) {
+#pragma omp parallel for schedule(static, 1)
+    for (int i = 0; i < devices_.Size(); ++i) {
+      int d = devices_[i];
+      dh::safe_cuda(cudaSetDevice(d));
       const int block = 256;
-    pred_transform_k<Loss><<<dh::DivRoundUp(n, block), block>>>(preds, n);
+      size_t n = preds->DeviceSize(d);
+      if (n > 0) {
+        pred_transform_k<Loss><<<dh::DivRoundUp(n, block), block>>>(preds->DevicePointer(d), n);
         dh::safe_cuda(cudaGetLastError());
+      }
       dh::safe_cuda(cudaDeviceSynchronize());
     }
+  }

   float ProbToMargin(float base_score) const override {
     return Loss::ProbToMargin(base_score);

@@ -192,6 +193,7 @@ class GPURegLossObj : public ObjFunction {

  protected:
   GPURegLossParam param_;
+  GPUSet devices_;
 };

 // register the objective functions
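The per-device launch loop used by GetGradientDevice and PredTransformDevice boils down to the pattern below: one OpenMP thread per device, each driving a kernel over its own shard. The kernel and the argument plumbing here are placeholders for the example, not the objective's real kernel.

#include <omp.h>
#include <cuda_runtime.h>

// placeholder kernel: scales this device's shard in place
__global__ void scale_k(float* data, size_t n, float factor) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

// shard_ptr/shard_size stand in for HostDeviceVector::DevicePointer/DeviceSize per device
void ScaleOnAllDevices(float** shard_ptr, const size_t* shard_size, int ndevices) {
#pragma omp parallel for schedule(static, 1)
  for (int i = 0; i < ndevices; ++i) {
    cudaSetDevice(i);                     // one host thread drives one GPU
    size_t n = shard_size[i];
    if (n > 0) {
      const int block = 256;
      scale_k<<<(n + block - 1) / block, block>>>(shard_ptr[i], n, 0.5f);
    }
    cudaDeviceSynchronize();
  }
}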
@@ -310,8 +310,11 @@ class GPUPredictor : public xgboost::Predictor {
                  tree_group.begin());

     device_matrix->predictions.resize(out_preds->Size());
-    thrust::copy(out_preds->tbegin(param.gpu_id), out_preds->tend(param.gpu_id),
-                 device_matrix->predictions.begin());
+    auto& predictions = device_matrix->predictions;
+    out_preds->GatherTo(predictions.data(),
+                        predictions.data() + predictions.size());
+
+    dh::safe_cuda(cudaSetDevice(param.gpu_id));

     const int BLOCK_THREADS = 128;
     const int GRID_SIZE = static_cast<int>(

@@ -335,9 +338,8 @@ class GPUPredictor : public xgboost::Predictor {
         model.param.num_output_group);

     dh::safe_cuda(cudaDeviceSynchronize());
-    thrust::copy(device_matrix->predictions.begin(),
-                 device_matrix->predictions.end(),
-                 out_preds->tbegin(param.gpu_id));
+    out_preds->ScatterFrom(predictions.data(),
+                           predictions.data() + predictions.size());
   }

  public:

@@ -366,14 +368,13 @@ class GPUPredictor : public xgboost::Predictor {
                           const gbm::GBTreeModel& model) const {
     size_t n = model.param.num_output_group * info.num_row_;
     const std::vector<bst_float>& base_margin = info.base_margin_;
-    out_preds->Resize(n, 0.0f, param.gpu_id);
+    out_preds->Reshard(devices);
+    out_preds->Resize(n);
     if (base_margin.size() != 0) {
       CHECK_EQ(out_preds->Size(), n);
-      thrust::copy(base_margin.begin(), base_margin.end(),
-                   out_preds->tbegin(param.gpu_id));
+      out_preds->Copy(base_margin);
     } else {
-      thrust::fill(out_preds->tbegin(param.gpu_id),
-                   out_preds->tend(param.gpu_id), model.base_margin);
+      out_preds->Fill(model.base_margin);
     }
   }

@@ -385,11 +386,9 @@ class GPUPredictor : public xgboost::Predictor {
     if (it != cache_.end()) {
       HostDeviceVector<bst_float>& y = it->second.predictions;
       if (y.Size() != 0) {
-        dh::safe_cuda(cudaSetDevice(param.gpu_id));
-        out_preds->Resize(y.Size(), 0.0f, param.gpu_id);
-        dh::safe_cuda(cudaMemcpy(
-            out_preds->DevicePointer(param.gpu_id), y.DevicePointer(param.gpu_id),
-            out_preds->Size() * sizeof(bst_float), cudaMemcpyDefault));
+        out_preds->Reshard(devices);
+        out_preds->Resize(y.Size());
+        out_preds->Copy(&y);
         return true;
       }
     }

@@ -410,18 +409,15 @@ class GPUPredictor : public xgboost::Predictor {
     HostDeviceVector<bst_float>& predictions = e.predictions;

     if (predictions.Size() == 0) {
-      // ensure that the device in predictions is correct
-      predictions.Resize(0, 0.0f, param.gpu_id);
-      cpu_predictor->PredictBatch(dmat, &predictions, model, 0,
-                                  static_cast<bst_uint>(model.trees.size()));
-    } else if (model.param.num_output_group == 1 && updaters->size() > 0 &&
-               num_new_trees == 1 &&
-               updaters->back()->UpdatePredictionCache(e.data.get(),
-                                                       &predictions)) {
+      this->InitOutPredictions(dmat->Info(), &predictions, model);
+    }
+
+    if (model.param.num_output_group == 1 && updaters->size() > 0 &&
+        num_new_trees == 1 &&
+        updaters->back()->UpdatePredictionCache(e.data.get(), &predictions)) {
       // do nothing
     } else {
-      DevicePredictInternal(dmat, &predictions, model, old_ntree,
-                            model.trees.size());
+      DevicePredictInternal(dmat, &predictions, model, old_ntree, model.trees.size());
     }
   }

@@ -462,6 +458,7 @@ class GPUPredictor : public xgboost::Predictor {
     Predictor::Init(cfg, cache);
     cpu_predictor->Init(cfg, cache);
     param.InitAllowUnknown(cfg);
+    devices = GPUSet::Range(param.gpu_id, dh::NDevicesAll(param.n_gpus));
     max_shared_memory_bytes = dh::MaxSharedMemory(param.gpu_id);
   }

@@ -473,6 +470,8 @@ class GPUPredictor : public xgboost::Predictor {
   thrust::device_vector<DevicePredictionNode> nodes;
   thrust::device_vector<size_t> tree_segments;
   thrust::device_vector<int> tree_group;
+  thrust::device_vector<bst_float> preds;
+  GPUSet devices;
   size_t max_shared_memory_bytes;
 };
 XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor")
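GatherTo/ScatterFrom, used by the predictor above, move a sharded vector into and out of one contiguous device buffer. A hedged sketch of that round trip (must be compiled as a .cu file; the buffer and the function are illustrative):

#include <thrust/device_vector.h>
#include "common/host_device_vector.h"  // include path assumed

void RoundTrip(xgboost::HostDeviceVector<float>* preds) {
  // collect all shards into one contiguous buffer on the current device
  thrust::device_vector<float> buffer(preds->Size());
  preds->GatherTo(buffer.data(), buffer.data() + buffer.size());
  // ... launch kernels that read or update `buffer` here ...
  // scatter the results back onto the per-device shards
  preds->ScatterFrom(buffer.data(), buffer.data() + buffer.size());
}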
@@ -495,6 +495,11 @@ class GPUMaker : public TreeUpdater {
   int nCols;
   int maxNodes;
   int maxLeaves;
+
+  // devices are only used for resharding the HostDeviceVector passed as a parameter;
+  // the algorithm works with a single GPU only
+  GPUSet devices;
+
   dh::CubMemory tmp_mem;
   dh::DVec<GradientPair> tmpScanGradBuff;
   dh::DVec<int> tmpScanKeyBuff;

@@ -510,6 +515,8 @@ class GPUMaker : public TreeUpdater {
     param.InitAllowUnknown(args);
     maxNodes = (1 << (param.max_depth + 1)) - 1;
     maxLeaves = 1 << param.max_depth;
+
+    devices = GPUSet::Range(param.gpu_id, dh::NDevicesAll(param.n_gpus));
   }

   void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,

@@ -519,6 +526,8 @@ class GPUMaker : public TreeUpdater {
     float lr = param.learning_rate;
     param.learning_rate = lr / trees.size();

+    gpair->Reshard(devices);
+
     try {
       // build tree
       for (size_t i = 0; i < trees.size(); ++i) {

@@ -688,10 +697,7 @@ class GPUMaker : public TreeUpdater {
   }

   void transferGrads(HostDeviceVector<GradientPair>* gpair) {
-    // HACK
-    dh::safe_cuda(cudaMemcpy(gradsInst.Data(), gpair->DevicePointer(param.gpu_id),
-                             sizeof(GradientPair) * nRows,
-                             cudaMemcpyDefault));
+    gpair->GatherTo(gradsInst.tbegin(), gradsInst.tend());
     // evaluate the full-grad reduction for the root node
     dh::SumReduction<GradientPair>(tmp_mem, gradsInst, gradSums, nRows);
   }
@@ -369,8 +369,7 @@ struct DeviceShard {
   }

   // Reset values for each update iteration
-  void Reset(HostDeviceVector<GradientPair>* dh_gpair, int device) {
-    auto begin = dh_gpair->tbegin(device);
+  void Reset(HostDeviceVector<GradientPair>* dh_gpair) {
     dh::safe_cuda(cudaSetDevice(device_idx));
     position.CurrentDVec().Fill(0);
     std::fill(node_sum_gradients.begin(), node_sum_gradients.end(),

@@ -380,7 +379,7 @@ struct DeviceShard {

     std::fill(ridx_segments.begin(), ridx_segments.end(), Segment(0, 0));
     ridx_segments.front() = Segment(0, ridx.Size());
-    this->gpair.copy(begin + row_begin_idx, begin + row_end_idx);
+    this->gpair.copy(dh_gpair->tbegin(device_idx), dh_gpair->tend(device_idx));
     SubsampleGradientPair(&gpair, param.subsample, row_begin_idx);
     hist.Reset();
   }

@@ -505,7 +504,7 @@ struct DeviceShard {
     dh::safe_cuda(cudaSetDevice(device_idx));
     if (!prediction_cache_initialised) {
       dh::safe_cuda(cudaMemcpy(
-          prediction_cache.Data(), &out_preds_d[row_begin_idx],
+          prediction_cache.Data(), out_preds_d,
           prediction_cache.Size() * sizeof(bst_float), cudaMemcpyDefault));
     }
     prediction_cache_initialised = true;

@@ -528,7 +527,7 @@ struct DeviceShard {
     });

     dh::safe_cuda(cudaMemcpy(
-        &out_preds_d[row_begin_idx], prediction_cache.Data(),
+        out_preds_d, prediction_cache.Data(),
         prediction_cache.Size() * sizeof(bst_float), cudaMemcpyDefault));
   }
 };

@@ -543,6 +542,7 @@ class GPUHistMaker : public TreeUpdater {
     param_.InitAllowUnknown(args);
     CHECK(param_.n_gpus != 0) << "Must have at least one device";
     n_devices_ = param_.n_gpus;
+    devices_ = GPUSet::Range(param_.gpu_id, dh::NDevicesAll(param_.n_gpus));

     dh::CheckComputeCapability();

@@ -610,15 +610,11 @@ class GPUHistMaker : public TreeUpdater {
     }

     // Create device shards
-    omp_set_num_threads(shards_.size());
-#pragma omp parallel
-    {
-      auto cpu_thread_id = omp_get_thread_num();
-      shards_[cpu_thread_id] = std::unique_ptr<DeviceShard>(
-          new DeviceShard(device_list_[cpu_thread_id], cpu_thread_id, gmat_,
-                          row_segments[cpu_thread_id],
-                          row_segments[cpu_thread_id + 1], n_bins_, param_));
-    }
+    dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
+      shard = std::unique_ptr<DeviceShard>(
+          new DeviceShard(device_list_[i], i, gmat_,
+                          row_segments[i], row_segments[i + 1], n_bins_, param_));
+    });

     p_last_fmat_ = dmat;
     initialised_ = true;

@@ -636,12 +632,9 @@ class GPUHistMaker : public TreeUpdater {

     // Copy gpair & reset memory
     monitor_.Start("InitDataReset", device_list_);
-    omp_set_num_threads(shards_.size());
-
-    // TODO(canonizer): make it parallel again once HostDeviceVector is
-    // thread-safe
-    for (int shard = 0; shard < shards_.size(); ++shard)
-      shards_[shard]->Reset(gpair, param_.gpu_id);
+    gpair->Reshard(devices_);
+    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {shard->Reset(gpair); });
     monitor_.Stop("InitDataReset", device_list_);
   }

@@ -676,16 +669,16 @@ class GPUHistMaker : public TreeUpdater {
       subtraction_trick_nidx = nidx_left;
     }

-    for (auto& shard : shards_) {
+    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
       shard->BuildHist(build_hist_nidx);
-    }
+    });

     this->AllReduceHist(build_hist_nidx);

-    for (auto& shard : shards_) {
+    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
       shard->SubtractionTrick(nidx_parent, build_hist_nidx,
                               subtraction_trick_nidx);
-    }
+    });
   }

   // Returns best loss

@@ -743,22 +736,20 @@ class GPUHistMaker : public TreeUpdater {
     auto root_nidx = 0;
     // Sum gradients
     std::vector<GradientPair> tmp_sums(shards_.size());
-    omp_set_num_threads(shards_.size());
-#pragma omp parallel
-    {
-      auto cpu_thread_id = omp_get_thread_num();
-      auto& shard = shards_[cpu_thread_id];
+    dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
       dh::safe_cuda(cudaSetDevice(shard->device_idx));
-      tmp_sums[cpu_thread_id] = dh::SumReduction(
-          shard->temp_memory, shard->gpair.Data(), shard->gpair.Size());
-    }
+      tmp_sums[i] =
+          dh::SumReduction(shard->temp_memory, shard->gpair.Data(),
+                           shard->gpair.Size());
+    });
     auto sum_gradient =
         std::accumulate(tmp_sums.begin(), tmp_sums.end(), GradientPair());

     // Generate root histogram
-    for (auto& shard : shards_) {
+    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
       shard->BuildHist(root_nidx);
-    }
+    });

     this->AllReduceHist(root_nidx);

@@ -802,14 +793,11 @@ class GPUHistMaker : public TreeUpdater {

     auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;

-    omp_set_num_threads(shards_.size());
-#pragma omp parallel
-    {
-      auto cpu_thread_id = omp_get_thread_num();
-      shards_[cpu_thread_id]->UpdatePosition(nidx, left_nidx, right_nidx, fidx,
-                                             split_gidx, default_dir_left,
-                                             is_dense, fidx_begin, fidx_end);
-    }
+    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
+      shard->UpdatePosition(nidx, left_nidx, right_nidx, fidx,
+                            split_gidx, default_dir_left,
+                            is_dense, fidx_begin, fidx_end);
+    });
   }

   void ApplySplit(const ExpandEntry& candidate, RegTree* p_tree) {

@@ -903,8 +891,6 @@ class GPUHistMaker : public TreeUpdater {
         monitor_.Stop("EvaluateSplits", device_list_);
       }
     }
-    // Reset omp num threads
-    omp_set_num_threads(nthread);
   }

   bool UpdatePredictionCache(

@@ -912,13 +898,10 @@ class GPUHistMaker : public TreeUpdater {
     monitor_.Start("UpdatePredictionCache", device_list_);
     if (shards_.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data)
       return false;
-    bst_float* out_preds_d = p_out_preds->DevicePointer(param_.gpu_id);
-
-#pragma omp parallel for schedule(static, 1)
-    for (int shard = 0; shard < shards_.size(); ++shard) {
-      shards_[shard]->UpdatePredictionCache(out_preds_d);
-    }
+    p_out_preds->Reshard(devices_);
+    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
+      shard->UpdatePredictionCache(p_out_preds->DevicePointer(shard->device_idx));
+    });
     monitor_.Stop("UpdatePredictionCache", device_list_);
     return true;
   }

@@ -992,6 +975,7 @@ class GPUHistMaker : public TreeUpdater {
   std::vector<int> device_list_;

   DMatrix* p_last_fmat_;
+  GPUSet devices_;
 };

 XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")
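The root-gradient summation above is an instance of the ReduceShards pattern from the device helpers: each shard produces a partial reduction, and the partials are accumulated on the host. A CPU-only sketch with a toy shard type (ToyShard and SumAllGradients are the example's own, not part of the commit):

#include <numeric>
#include <vector>

struct ToyShard { std::vector<double> gpair; };  // stand-in for a per-GPU DeviceShard

template <typename ReduceT, typename T, typename FunctionT>
ReduceT ReduceShards(std::vector<T>* shards, FunctionT f) {
  std::vector<ReduceT> sums(shards->size());
#pragma omp parallel for schedule(static, 1)
  for (int shard = 0; shard < static_cast<int>(shards->size()); ++shard) {
    sums[shard] = f(shards->at(shard));  // per-shard partial reduction
  }
  return std::accumulate(sums.begin(), sums.end(), ReduceT());
}

double SumAllGradients(std::vector<ToyShard>* shards) {
  return ReduceShards<double>(shards, [](ToyShard& s) {
    return std::accumulate(s.gpair.begin(), s.gpair.end(), 0.0);
  });
}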