diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index ce2119ed7..7271dcb5e 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -1004,14 +1004,29 @@ class AllReducer {
 template <typename T, typename FunctionT>
 void ExecuteShards(std::vector<T> *shards, FunctionT f) {
-  auto previous_num_threads = omp_get_max_threads();
-  omp_set_num_threads(shards->size());
-#pragma omp parallel
-  {
-    auto cpu_thread_id = omp_get_thread_num();
-    f(shards->at(cpu_thread_id));
+#pragma omp parallel for schedule(static, 1)
+  for (int shard = 0; shard < shards->size(); ++shard) {
+    f(shards->at(shard));
+  }
+}
+
+/**
+ * \brief Executes some operation on each element of the input vector, using a
+ * single controlling thread for each element. In addition, passes the shard index
+ * into the function.
+ *
+ * \tparam T         Shard type.
+ * \tparam FunctionT Type of the function to execute.
+ * \param shards The shards.
+ * \param f      The function to execute; called as f(shard_index, shard).
+ */
+
+template <typename T, typename FunctionT>
+void ExecuteIndexShards(std::vector<T> *shards, FunctionT f) {
+#pragma omp parallel for schedule(static, 1)
+  for (int shard = 0; shard < shards->size(); ++shard) {
+    f(shard, shards->at(shard));
   }
-  omp_set_num_threads(previous_num_threads);
 }
 
 /**
@@ -1029,15 +1044,11 @@ void ExecuteShards(std::vector<T> *shards, FunctionT f) {
 template <typename ReduceT, typename T, typename FunctionT>
 ReduceT ReduceShards(std::vector<T> *shards, FunctionT f) {
-  auto previous_num_threads = omp_get_max_threads();
-  omp_set_num_threads(shards->size());
   std::vector<ReduceT> sums(shards->size());
-#pragma omp parallel
-  {
-    auto cpu_thread_id = omp_get_thread_num();
-    sums[cpu_thread_id] = f(shards->at(cpu_thread_id));
+#pragma omp parallel for schedule(static, 1)
+  for (int shard = 0; shard < shards->size(); ++shard) {
+    sums[shard] = f(shards->at(shard));
   }
-  omp_set_num_threads(previous_num_threads);
   return std::accumulate(sums.begin(), sums.end(), ReduceT());
 }
 }  // namespace dh
diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc
index 300694d76..e0263de04 100644
--- a/src/common/host_device_vector.cc
+++ b/src/common/host_device_vector.cc
@@ -21,18 +21,18 @@ struct HostDeviceVectorImpl {
 };
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device) : impl_(nullptr) {
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUSet devices) : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(size, v);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUSet devices)
   : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUSet devices)
   : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }
@@ -48,7 +48,7 @@ template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->data_h_.size(); }
 
 template <typename T>
-int HostDeviceVector<T>::DeviceIdx() const { return -1; }
+GPUSet HostDeviceVector<T>::Devices() const { return GPUSet::Empty(); }
 
 template <typename T>
 T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; }
@@ -57,13 +57,46 @@ template <typename T>
 std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->data_h_; }
 
 template <typename T>
-void HostDeviceVector<T>::Resize(size_t new_size, T v, int new_device) {
+void HostDeviceVector<T>::Resize(size_t new_size, T v) {
   impl_->data_h_.resize(new_size, v);
 }
 
+template <typename T>
+size_t HostDeviceVector<T>::DeviceStart(int device) { return 0; }
+
+template <typename T>
+size_t HostDeviceVector<T>::DeviceSize(int device) { return 0; }
+
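For reviewers, a minimal usage sketch of the reworked shard helpers above. The MyShard type, the Example function, and the include path are made up for illustration, and the file has to be compiled with nvcc because device_helpers.cuh is a CUDA header.

// Illustration only: exercises dh::ExecuteShards / dh::ExecuteIndexShards /
// dh::ReduceShards as changed above. MyShard is a hypothetical shard type.
#include <cstddef>
#include <vector>
#include "src/common/device_helpers.cuh"  // assumed include path, requires nvcc

struct MyShard {
  int device_idx;
  size_t n_rows;
};

void Example(std::vector<MyShard>* shards) {
  // One OpenMP thread per shard; the lambda sees only the shard itself.
  dh::ExecuteShards(shards, [](MyShard& s) {
    // per-device work, e.g. cudaSetDevice(s.device_idx) followed by kernel launches
  });
  // Same scheduling, but the lambda also receives the shard index.
  dh::ExecuteIndexShards(shards, [](int i, MyShard& s) {
    // i is the position of s in *shards
  });
  // Per-shard partial results are collected into a host-side vector and summed.
  size_t total_rows = dh::ReduceShards<size_t>(shards, [](MyShard& s) {
    return s.n_rows;
  });
  (void)total_rows;
}

The schedule(static, 1) clause assigns exactly one loop iteration, and therefore one shard, to each OpenMP thread, which is what lets each controlling thread own a distinct GPU without touching the global thread count.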
+template +void HostDeviceVector::Fill(T v) { + std::fill(HostVector().begin(), HostVector().end(), v); +} + +template +void HostDeviceVector::Copy(HostDeviceVector* other) { + CHECK_EQ(Size(), other->Size()); + std::copy(other->HostVector().begin(), other->HostVector().end(), HostVector().begin()); +} + +template +void HostDeviceVector::Copy(const std::vector& other) { + CHECK_EQ(Size(), other.size()); + std::copy(other.begin(), other.end(), HostVector().begin()); +} + +template +void HostDeviceVector::Copy(std::initializer_list other) { + CHECK_EQ(Size(), other.size()); + std::copy(other.begin(), other.end(), HostVector().begin()); +} + +template +void HostDeviceVector::Reshard(GPUSet devices) { } + // explicit instantiations are required, as HostDeviceVector isn't header-only template class HostDeviceVector; template class HostDeviceVector; +template class HostDeviceVector; } // namespace xgboost diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index c1529f100..a474be7a0 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -2,122 +2,309 @@ * Copyright 2017 XGBoost contributors */ + +#include #include "./host_device_vector.h" #include "./device_helpers.cuh" namespace xgboost { + template struct HostDeviceVectorImpl { - HostDeviceVectorImpl(size_t size, T v, int device) - : device_(device), on_d_(device >= 0) { - if (on_d_) { + struct DeviceShard { + DeviceShard() : index_(-1), device_(-1), start_(0), on_d_(false), vec_(nullptr) {} + + static size_t ShardStart(size_t size, int ndevices, int index) { + size_t portion = dh::DivRoundUp(size, ndevices); + size_t begin = index * portion; + begin = begin > size ? size : begin; + return begin; + } + + static size_t ShardSize(size_t size, int ndevices, int index) { + size_t portion = dh::DivRoundUp(size, ndevices); + size_t begin = index * portion, end = (index + 1) * portion; + begin = begin > size ? size : begin; + end = end > size ? 
size : end; + return end - begin; + } + + void Init(HostDeviceVectorImpl* vec, int device) { + if (vec_ == nullptr) { vec_ = vec; } + CHECK_EQ(vec, vec_); + device_ = device; + index_ = vec_->devices_.Index(device); + size_t size_h = vec_->Size(); + int ndevices = vec_->devices_.Size(); + start_ = ShardStart(size_h, ndevices, index_); + size_t size_d = ShardSize(size_h, ndevices, index_); dh::safe_cuda(cudaSetDevice(device_)); - data_d_.resize(size, v); + data_.resize(size_d); + on_d_ = !vec_->on_h_; + } + + void ScatterFrom(const T* begin) { + // TODO(canonizer): avoid full copy of host data + LazySyncDevice(); + dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaMemcpy(data_.data().get(), begin + start_, + data_.size() * sizeof(T), cudaMemcpyDefault)); + } + + void GatherTo(thrust::device_ptr begin) { + LazySyncDevice(); + dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaMemcpy(begin.get() + start_, data_.data().get(), + data_.size() * sizeof(T), cudaMemcpyDefault)); + } + + void Fill(T v) { + // TODO(canonizer): avoid full copy of host data + LazySyncDevice(); + dh::safe_cuda(cudaSetDevice(device_)); + thrust::fill(data_.begin(), data_.end(), v); + } + + void Copy(DeviceShard* other) { + // TODO(canonizer): avoid full copy of host data for this (but not for other) + LazySyncDevice(); + other->LazySyncDevice(); + dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaMemcpy(data_.data().get(), other->data_.data().get(), + data_.size() * sizeof(T), cudaMemcpyDefault)); + } + + void LazySyncHost() { + dh::safe_cuda(cudaSetDevice(device_)); + thrust::copy(data_.begin(), data_.end(), vec_->data_h_.begin() + start_); + on_d_ = false; + } + + void LazySyncDevice() { + if (on_d_) { return; } + // data is on the host + size_t size_h = vec_->data_h_.size(); + int ndevices = vec_->devices_.Size(); + start_ = ShardStart(size_h, ndevices, index_); + size_t size_d = ShardSize(size_h, ndevices, index_); + dh::safe_cuda(cudaSetDevice(device_)); + data_.resize(size_d); + thrust::copy(vec_->data_h_.begin() + start_, + vec_->data_h_.begin() + start_ + size_d, data_.begin()); + on_d_ = true; + // this may cause a race condition if LazySyncDevice() is called + // from multiple threads in parallel; + // however, the race condition is benign, and will not cause problems + vec_->on_h_ = false; + vec_->size_d_ = vec_->data_h_.size(); + } + + int index_; + int device_; + thrust::device_vector data_; + size_t start_; + // true if there is an up-to-date copy of data on device, false otherwise + bool on_d_; + HostDeviceVectorImpl* vec_; + }; + + HostDeviceVectorImpl(size_t size, T v, GPUSet devices) + : devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) { + if (!devices.IsEmpty()) { + size_d_ = size; + InitShards(); + Fill(v); } else { data_h_.resize(size, v); } } + // Init can be std::vector or std::initializer_list template - HostDeviceVectorImpl(const Init& init, int device) - : device_(device), on_d_(device >= 0) { - if (on_d_) { - dh::safe_cuda(cudaSetDevice(device_)); - data_d_.resize(init.size()); - thrust::copy(init.begin(), init.end(), data_d_.begin()); + HostDeviceVectorImpl(const Init& init, GPUSet devices) + : devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) { + if (!devices.IsEmpty()) { + size_d_ = init.size(); + InitShards(); + Copy(init); } else { data_h_ = init; } } + + void InitShards() { + int ndevices = devices_.Size(); + shards_.resize(ndevices); + dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) { + shard.Init(this, devices_[i]); + }); + } + 
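To make the block partitioning in ShardStart/ShardSize above concrete, here is a self-contained restatement of the same arithmetic for a hypothetical vector of 10 elements split across 3 devices (the numbers are chosen only for illustration):

// Standalone sketch of the shard partitioning math used by DeviceShard above.
#include <algorithm>
#include <cstddef>
#include <cstdio>

static size_t DivRoundUp(size_t a, size_t b) { return (a + b - 1) / b; }

int main() {
  const size_t size = 10;   // total number of elements (hypothetical)
  const int ndevices = 3;   // number of GPUs in the GPUSet (hypothetical)
  const size_t portion = DivRoundUp(size, ndevices);  // 4 elements per shard
  for (int index = 0; index < ndevices; ++index) {
    const size_t begin = std::min(index * portion, size);
    const size_t end = std::min((index + 1) * portion, size);
    // Prints: shard 0: [0, 4)  shard 1: [4, 8)  shard 2: [8, 10)
    std::printf("shard %d: [%zu, %zu)\n", index, begin, end);
  }
  return 0;
}

The last shard simply absorbs the remainder, so its DeviceSize() can be smaller than the nominal portion.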
HostDeviceVectorImpl(const HostDeviceVectorImpl&) = delete; HostDeviceVectorImpl(HostDeviceVectorImpl&&) = delete; void operator=(const HostDeviceVectorImpl&) = delete; void operator=(HostDeviceVectorImpl&&) = delete; - size_t Size() const { return on_d_ ? data_d_.size() : data_h_.size(); } + size_t Size() const { return on_h_ ? data_h_.size() : size_d_; } - int DeviceIdx() const { return device_; } + GPUSet Devices() const { return devices_; } T* DevicePointer(int device) { + CHECK(devices_.Contains(device)); LazySyncDevice(device); - return data_d_.data().get(); + return shards_[devices_.Index(device)].data_.data().get(); } + + size_t DeviceSize(int device) { + CHECK(devices_.Contains(device)); + LazySyncDevice(device); + return shards_[devices_.Index(device)].data_.size(); + } + + size_t DeviceStart(int device) { + CHECK(devices_.Contains(device)); + LazySyncDevice(device); + return shards_[devices_.Index(device)].start_; + } + thrust::device_ptr tbegin(int device) { // NOLINT return thrust::device_ptr(DevicePointer(device)); } + thrust::device_ptr tend(int device) { // NOLINT - auto begin = tbegin(device); - return begin + Size(); + return tbegin(device) + DeviceSize(device); } + + void ScatterFrom(thrust::device_ptr begin, thrust::device_ptr end) { + CHECK_EQ(end - begin, Size()); + if (on_h_) { + thrust::copy(begin, end, data_h_.begin()); + } else { + dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { + shard.ScatterFrom(begin.get()); + }); + } + } + + void GatherTo(thrust::device_ptr begin, thrust::device_ptr end) { + CHECK_EQ(end - begin, Size()); + if (on_h_) { + thrust::copy(data_h_.begin(), data_h_.end(), begin); + } else { + dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.GatherTo(begin); }); + } + } + + void Fill(T v) { + if (on_h_) { + std::fill(data_h_.begin(), data_h_.end(), v); + } else { + dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.Fill(v); }); + } + } + + void Copy(HostDeviceVectorImpl* other) { + CHECK_EQ(Size(), other->Size()); + if (on_h_ && other->on_h_) { + std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin()); + } else { + CHECK(devices_ == other->devices_); + dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) { + shard.Copy(&other->shards_[i]); + }); + } + } + + void Copy(const std::vector& other) { + CHECK_EQ(Size(), other.size()); + if (on_h_) { + std::copy(other.begin(), other.end(), data_h_.begin()); + } else { + dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { + shard.ScatterFrom(other.data()); + }); + } + } + + void Copy(std::initializer_list other) { + CHECK_EQ(Size(), other.size()); + if (on_h_) { + std::copy(other.begin(), other.end(), data_h_.begin()); + } else { + dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { + shard.ScatterFrom(other.begin()); + }); + } + } + std::vector& HostVector() { LazySyncHost(); return data_h_; } - void Resize(size_t new_size, T v, int new_device) { - if (new_size == this->Size() && new_device == device_) + + void Reshard(GPUSet new_devices) { + if (devices_ == new_devices) return; - if (new_device != -1) - device_ = new_device; - // if !on_d_, but the data size is 0 and the device is set, - // resize the data on device instead - if (!on_d_ && (data_h_.size() > 0 || device_ == -1)) { - data_h_.resize(new_size, v); + CHECK(devices_.IsEmpty()); + devices_ = new_devices; + InitShards(); + } + + void Resize(size_t new_size, T v) { + if (new_size == Size()) + return; + if (Size() == 0 && !devices_.IsEmpty()) { + // fast on-device resize + on_h_ = false; + 
size_d_ = new_size; + InitShards(); + Fill(v); } else { - dh::safe_cuda(cudaSetDevice(device_)); - data_d_.resize(new_size, v); - on_d_ = true; + // resize on host + LazySyncHost(); + data_h_.resize(new_size, v); } } void LazySyncHost() { - if (!on_d_) + if (on_h_) return; - if (data_h_.size() != this->Size()) - data_h_.resize(this->Size()); - dh::safe_cuda(cudaSetDevice(device_)); - thrust::copy(data_d_.begin(), data_d_.end(), data_h_.begin()); - on_d_ = false; + if (data_h_.size() != size_d_) + data_h_.resize(size_d_); + dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.LazySyncHost(); }); + on_h_ = true; } void LazySyncDevice(int device) { - if (on_d_) - return; - if (device != device_) { - CHECK_EQ(device_, -1); - device_ = device; - } - if (data_d_.size() != this->Size()) { - dh::safe_cuda(cudaSetDevice(device_)); - data_d_.resize(this->Size()); - } - dh::safe_cuda(cudaSetDevice(device_)); - thrust::copy(data_h_.begin(), data_h_.end(), data_d_.begin()); - on_d_ = true; + CHECK(devices_.Contains(device)); + shards_[devices_.Index(device)].LazySyncDevice(); } std::vector data_h_; - thrust::device_vector data_d_; - // true if there is an up-to-date copy of data on device, false otherwise - bool on_d_; - int device_; + bool on_h_; + // the total size of the data stored on the devices + size_t size_d_; + GPUSet devices_; + std::vector shards_; }; template -HostDeviceVector::HostDeviceVector(size_t size, T v, int device) : impl_(nullptr) { - impl_ = new HostDeviceVectorImpl(size, v, device); +HostDeviceVector::HostDeviceVector(size_t size, T v, GPUSet devices) + : impl_(nullptr) { + impl_ = new HostDeviceVectorImpl(size, v, devices); } template -HostDeviceVector::HostDeviceVector(std::initializer_list init, int device) +HostDeviceVector::HostDeviceVector(std::initializer_list init, GPUSet devices) : impl_(nullptr) { - impl_ = new HostDeviceVectorImpl(init, device); + impl_ = new HostDeviceVectorImpl(init, devices); } template -HostDeviceVector::HostDeviceVector(const std::vector& init, int device) +HostDeviceVector::HostDeviceVector(const std::vector& init, GPUSet devices) : impl_(nullptr) { - impl_ = new HostDeviceVectorImpl(init, device); + impl_ = new HostDeviceVectorImpl(init, devices); } template @@ -131,11 +318,17 @@ template size_t HostDeviceVector::Size() const { return impl_->Size(); } template -int HostDeviceVector::DeviceIdx() const { return impl_->DeviceIdx(); } +GPUSet HostDeviceVector::Devices() const { return impl_->Devices(); } template T* HostDeviceVector::DevicePointer(int device) { return impl_->DevicePointer(device); } +template +size_t HostDeviceVector::DeviceStart(int device) { return impl_->DeviceStart(device); } + +template +size_t HostDeviceVector::DeviceSize(int device) { return impl_->DeviceSize(device); } + template thrust::device_ptr HostDeviceVector::tbegin(int device) { // NOLINT return impl_->tbegin(device); @@ -146,16 +339,54 @@ thrust::device_ptr HostDeviceVector::tend(int device) { // NOLINT return impl_->tend(device); } +template +void HostDeviceVector::ScatterFrom +(thrust::device_ptr begin, thrust::device_ptr end) { + impl_->ScatterFrom(begin, end); +} + +template +void HostDeviceVector::GatherTo +(thrust::device_ptr begin, thrust::device_ptr end) { + impl_->GatherTo(begin, end); +} + +template +void HostDeviceVector::Fill(T v) { + impl_->Fill(v); +} + +template +void HostDeviceVector::Copy(HostDeviceVector* other) { + impl_->Copy(other->impl_); +} + +template +void HostDeviceVector::Copy(const std::vector& other) { + impl_->Copy(other); +} + 
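As context for the lazy host/device synchronization implemented above, a sketch of the intended calling pattern; the two-GPU set, the element count, and the include path are assumptions made for the example:

// Illustration only: shows when the new HostDeviceVector copies data.
#include <vector>
#include "src/common/host_device_vector.h"  // assumed include path

void LazySyncSketch() {
  // Starts host-only (GPUSet::Empty() is the default device set).
  xgboost::HostDeviceVector<float> v(100, 0.0f);
  // Declare that the vector is sharded across devices 0 and 1.
  v.Reshard(xgboost::GPUSet::Range(0, 2));
  v.Fill(1.0f);                     // data still on the host: plain std::fill
  float* d0 = v.DevicePointer(0);   // copies the [0, 50) shard to device 0
  float* d1 = v.DevicePointer(1);   // copies the [50, 100) shard to device 1
  // ... launch kernels reading/writing d0 and d1 ...
  std::vector<float>& h = v.HostVector();  // gathers both shards back to the host
  (void)d0; (void)d1; (void)h;
}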
+template +void HostDeviceVector::Copy(std::initializer_list other) { + impl_->Copy(other); +} + template std::vector& HostDeviceVector::HostVector() { return impl_->HostVector(); } template -void HostDeviceVector::Resize(size_t new_size, T v, int new_device) { - impl_->Resize(new_size, v, new_device); +void HostDeviceVector::Reshard(GPUSet new_devices) { + impl_->Reshard(new_devices); +} + +template +void HostDeviceVector::Resize(size_t new_size, T v) { + impl_->Resize(new_size, v); } // explicit instantiations are required, as HostDeviceVector isn't header-only template class HostDeviceVector; template class HostDeviceVector; +template class HostDeviceVector; } // namespace xgboost diff --git a/src/common/host_device_vector.h b/src/common/host_device_vector.h index 9428c311f..5d9762511 100644 --- a/src/common/host_device_vector.h +++ b/src/common/host_device_vector.h @@ -4,6 +4,9 @@ #ifndef XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_ #define XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_ +#include + +#include #include #include #include @@ -18,6 +21,40 @@ namespace xgboost { template struct HostDeviceVectorImpl; +// set of devices across which HostDeviceVector can be distributed; +// currently implemented as a range, but can be changed later to something else, +// e.g. a bitset +class GPUSet { + public: + explicit GPUSet(int start = 0, int ndevices = 0) + : start_(start), ndevices_(ndevices) {} + static GPUSet Empty() { return GPUSet(); } + static GPUSet Range(int start, int ndevices) { return GPUSet(start, ndevices); } + int Size() const { return ndevices_; } + int operator[](int index) const { + CHECK(index >= 0 && index < ndevices_); + return start_ + index; + } + bool IsEmpty() const { return ndevices_ <= 0; } + int Index(int device) const { + CHECK(device >= start_ && device < start_ + ndevices_); + return device - start_; + } + bool Contains(int device) const { + return start_ <= device && device < start_ + ndevices_; + } + friend bool operator==(GPUSet a, GPUSet b) { + return a.start_ == b.start_ && a.ndevices_ == b.ndevices_; + } + friend bool operator!=(GPUSet a, GPUSet b) { + return a.start_ != b.start_ || a.ndevices_ != b.ndevices_; + } + + private: + int start_, ndevices_; +}; + + /** * @file host_device_vector.h * @brief A device-and-host vector abstraction layer. @@ -29,24 +66,26 @@ template struct HostDeviceVectorImpl; * * Initialization/Allocation:
  * One can choose to initialize the vector on CPU or GPU during constructor.
- * (use the 'device' argument) Or, can choose to use the 'resize' method to
- * allocate/resize memory explicitly.
+ * (use the 'devices' argument) Or, can choose to use the 'Resize' method to
+ * allocate/resize memory explicitly, and use the 'Reshard' method
+ * to specify the devices.
  *
- * Accessing underling data:
- * Use 'data_h' method to explicitly query for the underlying std::vector.
- * If you need the raw device pointer, use the 'ptr_d' method. For perf
+ * Accessing underlying data:
+ * Use 'HostVector' method to explicitly query for the underlying std::vector.
+ * If you need the raw device pointer, use the 'DevicePointer' method. For perf
  * implications of these calls, see below.
  *
  * Accessing underling data and their perf implications:
  * There are 4 scenarios to be considered here:
- * data_h and data on CPU --> no problems, std::vector returned immediately
- * data_h but data on GPU --> this causes a cudaMemcpy to be issued internally.
- *                        subsequent calls to data_h, will NOT incur this penalty.
- *                        (assuming 'ptr_d' is not called in between)
- * ptr_d but data on CPU --> this causes a cudaMemcpy to be issued internally.
- *                        subsequent calls to ptr_d, will NOT incur this penalty.
- *                        (assuming 'data_h' is not called in between)
- * ptr_d and data on GPU --> no problems, the device ptr will be returned immediately
+ * HostVector and data on CPU --> no problems, std::vector returned immediately
+ * HostVector but data on GPU --> this causes a cudaMemcpy to be issued internally.
+ *                        subsequent calls to HostVector, will NOT incur this penalty.
+ *                        (assuming 'DevicePointer' is not called in between)
+ * DevicePointer but data on CPU --> this causes a cudaMemcpy to be issued internally.
+ *                        subsequent calls to DevicePointer, will NOT incur this penalty.
+ *                        (assuming 'HostVector' is not called in between)
+ * DevicePointer and data on GPU --> no problems, the device ptr
+ *                        will be returned immediately.
  *
  * What if xgboost is compiled without CUDA?
* In that case, there's a special implementation which always falls-back to @@ -57,35 +96,49 @@ template struct HostDeviceVectorImpl; * compiling with and without CUDA toolkit. It was easier to have * 'HostDeviceVector' with a special-case implementation in host_device_vector.cc * - * @note: This is not thread-safe! + * @note: Size and Devices methods are thread-safe. + * DevicePointer, DeviceStart, DeviceSize, tbegin and tend methods are thread-safe + * if different threads call these methods with different values of the device argument. + * All other methods are not thread safe. */ template class HostDeviceVector { public: - explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1); - HostDeviceVector(std::initializer_list init, int device = -1); - explicit HostDeviceVector(const std::vector& init, int device = -1); + explicit HostDeviceVector(size_t size = 0, T v = T(), + GPUSet devices = GPUSet::Empty()); + HostDeviceVector(std::initializer_list init, GPUSet devices = GPUSet::Empty()); + explicit HostDeviceVector(const std::vector& init, + GPUSet devices = GPUSet::Empty()); ~HostDeviceVector(); HostDeviceVector(const HostDeviceVector&) = delete; HostDeviceVector(HostDeviceVector&&) = delete; void operator=(const HostDeviceVector&) = delete; void operator=(HostDeviceVector&&) = delete; size_t Size() const; - int DeviceIdx() const; + GPUSet Devices() const; T* DevicePointer(int device); + T* HostPointer() { return HostVector().data(); } + size_t DeviceStart(int device); + size_t DeviceSize(int device); // only define functions returning device_ptr // if HostDeviceVector.h is included from a .cu file #ifdef __CUDACC__ - thrust::device_ptr tbegin(int device); - thrust::device_ptr tend(int device); + thrust::device_ptr tbegin(int device); // NOLINT + thrust::device_ptr tend(int device); // NOLINT + void ScatterFrom(thrust::device_ptr begin, thrust::device_ptr end); + void GatherTo(thrust::device_ptr begin, thrust::device_ptr end); #endif - std::vector& HostVector(); + void Fill(T v); + void Copy(HostDeviceVector* other); + void Copy(const std::vector& other); + void Copy(std::initializer_list other); - // passing in new_device == -1 keeps the device as is - void Resize(size_t new_size, T v = T(), int new_device = -1); + std::vector& HostVector(); + void Reshard(GPUSet devices); + void Resize(size_t new_size, T v = T()); private: HostDeviceVectorImpl* impl_; diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 9572358fc..72e8625e5 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -195,7 +195,7 @@ class GBTree : public GradientBooster { << "must have exactly ngroup*nrow gpairs"; // TODO(canonizer): perform this on GPU if HostDeviceVector has device set. 
HostDeviceVector tmp(in_gpair->Size() / ngroup, - GradientPair(), in_gpair->DeviceIdx()); + GradientPair(), in_gpair->Devices()); std::vector& gpair_h = in_gpair->HostVector(); auto nsize = static_cast(tmp.Size()); for (int gid = 0; gid < ngroup; ++gid) { diff --git a/src/objective/regression_obj_gpu.cu b/src/objective/regression_obj_gpu.cu index 7369d4ec2..2dfc691e3 100644 --- a/src/objective/regression_obj_gpu.cu +++ b/src/objective/regression_obj_gpu.cu @@ -74,46 +74,35 @@ __global__ void pred_transform_k(float * __restrict__ preds, int n) { template class GPURegLossObj : public ObjFunction { protected: - // manages device data - struct DeviceData { - DVec labels, weights; - DVec label_correct; - - // allocate everything on device - DeviceData(dh::BulkAllocator* ba, int device_idx, size_t n) { - ba->Allocate(device_idx, false, - &labels, n, - &weights, n, - &label_correct, 1); - } - size_t Size() const { return labels.Size(); } - }; - - bool copied_; - std::unique_ptr> ba_; - std::unique_ptr data_; - HostDeviceVector preds_d_; - HostDeviceVector out_gpair_d_; + HostDeviceVector labels_, weights_; + HostDeviceVector label_correct_; - // allocate device data for n elements, do nothing if enough memory is allocated already - void LazyResize(int n) { - if (data_.get() != nullptr && data_->Size() >= n) + // allocate device data for n elements, do nothing if memory is allocated already + void LazyResize(size_t n, size_t n_weights) { + if (labels_.Size() == n && weights_.Size() == n_weights) return; copied_ = false; - // free the old data and allocate the new data - ba_.reset(new dh::BulkAllocator()); - data_.reset(new DeviceData(ba_.get(), 0, n)); - preds_d_.Resize(n, 0.0f, param_.gpu_id); - out_gpair_d_.Resize(n, GradientPair(), param_.gpu_id); + + labels_.Reshard(devices_); + weights_.Reshard(devices_); + label_correct_.Reshard(devices_); + + if (labels_.Size() != n) { + labels_.Resize(n); + label_correct_.Resize(devices_.Size()); + } + if (weights_.Size() != n_weights) + weights_.Resize(n_weights); } public: - GPURegLossObj() : copied_(false), preds_d_(0, -1), out_gpair_d_({}, -1) {} + GPURegLossObj() : copied_(false) {} void Configure(const std::vector >& args) override { param_.InitAllowUnknown(args); CHECK(param_.n_gpus != 0) << "Must have at least one device"; + devices_ = GPUSet::Range(param_.gpu_id, dh::NDevicesAll(param_.n_gpus)); } void GetGradient(HostDeviceVector* preds, @@ -125,46 +114,50 @@ class GPURegLossObj : public ObjFunction { << "labels are not correctly provided" << "preds.size=" << preds->Size() << ", label.size=" << info.labels_.size(); size_t ndata = preds->Size(); - out_gpair->Resize(ndata, GradientPair(), param_.gpu_id); - LazyResize(ndata); - GetGradientDevice(preds->DevicePointer(param_.gpu_id), info, iter, - out_gpair->DevicePointer(param_.gpu_id), ndata); + preds->Reshard(devices_); + out_gpair->Reshard(devices_); + out_gpair->Resize(ndata); + LazyResize(ndata, info.weights_.size()); + GetGradientDevice(preds, info, iter, out_gpair); } private: - void GetGradientDevice(float* preds, + void GetGradientDevice(HostDeviceVector* preds, const MetaInfo &info, int iter, - GradientPair* out_gpair, size_t n) { - dh::safe_cuda(cudaSetDevice(param_.gpu_id)); - DeviceData& d = *data_; - d.label_correct.Fill(1); + HostDeviceVector* out_gpair) { + label_correct_.Fill(1); // only copy the labels and weights once, similar to how the data is copied if (!copied_) { - thrust::copy(info.labels_.begin(), info.labels_.begin() + n, - d.labels.tbegin()); - if (info.weights_.size() > 
0) { - thrust::copy(info.weights_.begin(), info.weights_.begin() + n, - d.weights.tbegin()); - } + labels_.Copy(info.labels_); + if (info.weights_.size() > 0) + weights_.Copy(info.weights_); copied_ = true; } // run the kernel - const int block = 256; - get_gradient_k<<>> - (out_gpair, d.label_correct.Data(), preds, - d.labels.Data(), info.weights_.size() > 0 ? d.weights.Data() : nullptr, - n, param_.scale_pos_weight); - dh::safe_cuda(cudaGetLastError()); +#pragma omp parallel for schedule(static, 1) + for (int i = 0; i < devices_.Size(); ++i) { + int d = devices_[i]; + dh::safe_cuda(cudaSetDevice(d)); + const int block = 256; + size_t n = preds->DeviceSize(d); + if (n > 0) { + get_gradient_k<<>> + (out_gpair->DevicePointer(d), label_correct_.DevicePointer(d), + preds->DevicePointer(d), labels_.DevicePointer(d), + info.weights_.size() > 0 ? weights_.DevicePointer(d) : nullptr, + n, param_.scale_pos_weight); + dh::safe_cuda(cudaGetLastError()); + } + dh::safe_cuda(cudaDeviceSynchronize()); + } - // copy output data from the GPU - unsigned int label_correct_h; - thrust::copy_n(d.label_correct.tbegin(), 1, &label_correct_h); - - bool label_correct = label_correct_h != 0; - if (!label_correct) { - LOG(FATAL) << Loss::LabelErrorMsg(); + // copy "label correct" flags back to host + std::vector& label_correct_h = label_correct_.HostVector(); + for (int i = 0; i < devices_.Size(); ++i) { + if (label_correct_h[i] == 0) + LOG(FATAL) << Loss::LabelErrorMsg(); } } @@ -174,24 +167,33 @@ class GPURegLossObj : public ObjFunction { } void PredTransform(HostDeviceVector *io_preds) override { - PredTransformDevice(io_preds->DevicePointer(param_.gpu_id), io_preds->Size()); + io_preds->Reshard(devices_); + size_t ndata = io_preds->Size(); + PredTransformDevice(io_preds); } - void PredTransformDevice(float* preds, size_t n) { - dh::safe_cuda(cudaSetDevice(param_.gpu_id)); - const int block = 256; - pred_transform_k<<>>(preds, n); - dh::safe_cuda(cudaGetLastError()); - dh::safe_cuda(cudaDeviceSynchronize()); + void PredTransformDevice(HostDeviceVector* preds) { +#pragma omp parallel for schedule(static, 1) + for (int i = 0; i < devices_.Size(); ++i) { + int d = devices_[i]; + dh::safe_cuda(cudaSetDevice(d)); + const int block = 256; + size_t n = preds->DeviceSize(d); + if (n > 0) { + pred_transform_k<<>>(preds->DevicePointer(d), n); + dh::safe_cuda(cudaGetLastError()); + } + dh::safe_cuda(cudaDeviceSynchronize()); + } } - float ProbToMargin(float base_score) const override { return Loss::ProbToMargin(base_score); } protected: GPURegLossParam param_; + GPUSet devices_; }; // register the objective functions diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index bc928059c..1a2d1b7f5 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -310,8 +310,11 @@ class GPUPredictor : public xgboost::Predictor { tree_group.begin()); device_matrix->predictions.resize(out_preds->Size()); - thrust::copy(out_preds->tbegin(param.gpu_id), out_preds->tend(param.gpu_id), - device_matrix->predictions.begin()); + auto& predictions = device_matrix->predictions; + out_preds->GatherTo(predictions.data(), + predictions.data() + predictions.size()); + + dh::safe_cuda(cudaSetDevice(param.gpu_id)); const int BLOCK_THREADS = 128; const int GRID_SIZE = static_cast( @@ -335,9 +338,8 @@ class GPUPredictor : public xgboost::Predictor { model.param.num_output_group); dh::safe_cuda(cudaDeviceSynchronize()); - thrust::copy(device_matrix->predictions.begin(), - 
device_matrix->predictions.end(), - out_preds->tbegin(param.gpu_id)); + out_preds->ScatterFrom(predictions.data(), + predictions.data() + predictions.size()); } public: @@ -366,14 +368,13 @@ class GPUPredictor : public xgboost::Predictor { const gbm::GBTreeModel& model) const { size_t n = model.param.num_output_group * info.num_row_; const std::vector& base_margin = info.base_margin_; - out_preds->Resize(n, 0.0f, param.gpu_id); + out_preds->Reshard(devices); + out_preds->Resize(n); if (base_margin.size() != 0) { CHECK_EQ(out_preds->Size(), n); - thrust::copy(base_margin.begin(), base_margin.end(), - out_preds->tbegin(param.gpu_id)); + out_preds->Copy(base_margin); } else { - thrust::fill(out_preds->tbegin(param.gpu_id), - out_preds->tend(param.gpu_id), model.base_margin); + out_preds->Fill(model.base_margin); } } @@ -385,11 +386,9 @@ class GPUPredictor : public xgboost::Predictor { if (it != cache_.end()) { HostDeviceVector& y = it->second.predictions; if (y.Size() != 0) { - dh::safe_cuda(cudaSetDevice(param.gpu_id)); - out_preds->Resize(y.Size(), 0.0f, param.gpu_id); - dh::safe_cuda(cudaMemcpy( - out_preds->DevicePointer(param.gpu_id), y.DevicePointer(param.gpu_id), - out_preds->Size() * sizeof(bst_float), cudaMemcpyDefault)); + out_preds->Reshard(devices); + out_preds->Resize(y.Size()); + out_preds->Copy(&y); return true; } } @@ -410,18 +409,15 @@ class GPUPredictor : public xgboost::Predictor { HostDeviceVector& predictions = e.predictions; if (predictions.Size() == 0) { - // ensure that the device in predictions is correct - predictions.Resize(0, 0.0f, param.gpu_id); - cpu_predictor->PredictBatch(dmat, &predictions, model, 0, - static_cast(model.trees.size())); - } else if (model.param.num_output_group == 1 && updaters->size() > 0 && - num_new_trees == 1 && - updaters->back()->UpdatePredictionCache(e.data.get(), - &predictions)) { + this->InitOutPredictions(dmat->Info(), &predictions, model); + } + + if (model.param.num_output_group == 1 && updaters->size() > 0 && + num_new_trees == 1 && + updaters->back()->UpdatePredictionCache(e.data.get(), &predictions)) { // do nothing } else { - DevicePredictInternal(dmat, &predictions, model, old_ntree, - model.trees.size()); + DevicePredictInternal(dmat, &predictions, model, old_ntree, model.trees.size()); } } } @@ -462,6 +458,7 @@ class GPUPredictor : public xgboost::Predictor { Predictor::Init(cfg, cache); cpu_predictor->Init(cfg, cache); param.InitAllowUnknown(cfg); + devices = GPUSet::Range(param.gpu_id, dh::NDevicesAll(param.n_gpus)); max_shared_memory_bytes = dh::MaxSharedMemory(param.gpu_id); } @@ -473,6 +470,8 @@ class GPUPredictor : public xgboost::Predictor { thrust::device_vector nodes; thrust::device_vector tree_segments; thrust::device_vector tree_group; + thrust::device_vector preds; + GPUSet devices; size_t max_shared_memory_bytes; }; XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor") diff --git a/src/tree/updater_gpu.cu b/src/tree/updater_gpu.cu index 8ff0ed8fe..acaa1a9f5 100644 --- a/src/tree/updater_gpu.cu +++ b/src/tree/updater_gpu.cu @@ -495,6 +495,11 @@ class GPUMaker : public TreeUpdater { int nCols; int maxNodes; int maxLeaves; + + // devices are only used for resharding the HostDeviceVector passed as a parameter; + // the algorithm works with a single GPU only + GPUSet devices; + dh::CubMemory tmp_mem; dh::DVec tmpScanGradBuff; dh::DVec tmpScanKeyBuff; @@ -510,6 +515,8 @@ class GPUMaker : public TreeUpdater { param.InitAllowUnknown(args); maxNodes = (1 << (param.max_depth + 1)) - 1; maxLeaves = 1 << 
param.max_depth; + + devices = GPUSet::Range(param.gpu_id, dh::NDevicesAll(param.n_gpus)); } void Update(HostDeviceVector* gpair, DMatrix* dmat, @@ -519,6 +526,8 @@ class GPUMaker : public TreeUpdater { float lr = param.learning_rate; param.learning_rate = lr / trees.size(); + gpair->Reshard(devices); + try { // build tree for (size_t i = 0; i < trees.size(); ++i) { @@ -688,10 +697,7 @@ class GPUMaker : public TreeUpdater { } void transferGrads(HostDeviceVector* gpair) { - // HACK - dh::safe_cuda(cudaMemcpy(gradsInst.Data(), gpair->DevicePointer(param.gpu_id), - sizeof(GradientPair) * nRows, - cudaMemcpyDefault)); + gpair->GatherTo(gradsInst.tbegin(), gradsInst.tend()); // evaluate the full-grad reduction for the root node dh::SumReduction(tmp_mem, gradsInst, gradSums, nRows); } diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 7fc74bcd7..2f1eb5ae4 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -369,8 +369,7 @@ struct DeviceShard { } // Reset values for each update iteration - void Reset(HostDeviceVector* dh_gpair, int device) { - auto begin = dh_gpair->tbegin(device); + void Reset(HostDeviceVector* dh_gpair) { dh::safe_cuda(cudaSetDevice(device_idx)); position.CurrentDVec().Fill(0); std::fill(node_sum_gradients.begin(), node_sum_gradients.end(), @@ -380,7 +379,7 @@ struct DeviceShard { std::fill(ridx_segments.begin(), ridx_segments.end(), Segment(0, 0)); ridx_segments.front() = Segment(0, ridx.Size()); - this->gpair.copy(begin + row_begin_idx, begin + row_end_idx); + this->gpair.copy(dh_gpair->tbegin(device_idx), dh_gpair->tend(device_idx)); SubsampleGradientPair(&gpair, param.subsample, row_begin_idx); hist.Reset(); } @@ -505,7 +504,7 @@ struct DeviceShard { dh::safe_cuda(cudaSetDevice(device_idx)); if (!prediction_cache_initialised) { dh::safe_cuda(cudaMemcpy( - prediction_cache.Data(), &out_preds_d[row_begin_idx], + prediction_cache.Data(), out_preds_d, prediction_cache.Size() * sizeof(bst_float), cudaMemcpyDefault)); } prediction_cache_initialised = true; @@ -528,7 +527,7 @@ struct DeviceShard { }); dh::safe_cuda(cudaMemcpy( - &out_preds_d[row_begin_idx], prediction_cache.Data(), + out_preds_d, prediction_cache.Data(), prediction_cache.Size() * sizeof(bst_float), cudaMemcpyDefault)); } }; @@ -543,6 +542,7 @@ class GPUHistMaker : public TreeUpdater { param_.InitAllowUnknown(args); CHECK(param_.n_gpus != 0) << "Must have at least one device"; n_devices_ = param_.n_gpus; + devices_ = GPUSet::Range(param_.gpu_id, dh::NDevicesAll(param_.n_gpus)); dh::CheckComputeCapability(); @@ -610,15 +610,11 @@ class GPUHistMaker : public TreeUpdater { } // Create device shards - omp_set_num_threads(shards_.size()); -#pragma omp parallel - { - auto cpu_thread_id = omp_get_thread_num(); - shards_[cpu_thread_id] = std::unique_ptr( - new DeviceShard(device_list_[cpu_thread_id], cpu_thread_id, gmat_, - row_segments[cpu_thread_id], - row_segments[cpu_thread_id + 1], n_bins_, param_)); - } + dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr& shard) { + shard = std::unique_ptr( + new DeviceShard(device_list_[i], i, gmat_, + row_segments[i], row_segments[i + 1], n_bins_, param_)); + }); p_last_fmat_ = dmat; initialised_ = true; @@ -636,12 +632,9 @@ class GPUHistMaker : public TreeUpdater { // Copy gpair & reset memory monitor_.Start("InitDataReset", device_list_); - omp_set_num_threads(shards_.size()); - // TODO(canonizer): make it parallel again once HostDeviceVector is - // thread-safe - for (int shard = 0; shard < shards_.size(); 
++shard) - shards_[shard]->Reset(gpair, param_.gpu_id); + gpair->Reshard(devices_); + dh::ExecuteShards(&shards_, [&](std::unique_ptr& shard) {shard->Reset(gpair); }); monitor_.Stop("InitDataReset", device_list_); } @@ -676,16 +669,16 @@ class GPUHistMaker : public TreeUpdater { subtraction_trick_nidx = nidx_left; } - for (auto& shard : shards_) { - shard->BuildHist(build_hist_nidx); - } + dh::ExecuteShards(&shards_, [&](std::unique_ptr& shard) { + shard->BuildHist(build_hist_nidx); + }); this->AllReduceHist(build_hist_nidx); - for (auto& shard : shards_) { - shard->SubtractionTrick(nidx_parent, build_hist_nidx, - subtraction_trick_nidx); - } + dh::ExecuteShards(&shards_, [&](std::unique_ptr& shard) { + shard->SubtractionTrick(nidx_parent, build_hist_nidx, + subtraction_trick_nidx); + }); } // Returns best loss @@ -743,22 +736,20 @@ class GPUHistMaker : public TreeUpdater { auto root_nidx = 0; // Sum gradients std::vector tmp_sums(shards_.size()); - omp_set_num_threads(shards_.size()); -#pragma omp parallel - { - auto cpu_thread_id = omp_get_thread_num(); - auto& shard = shards_[cpu_thread_id]; - dh::safe_cuda(cudaSetDevice(shard->device_idx)); - tmp_sums[cpu_thread_id] = dh::SumReduction( - shard->temp_memory, shard->gpair.Data(), shard->gpair.Size()); - } + + dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr& shard) { + dh::safe_cuda(cudaSetDevice(shard->device_idx)); + tmp_sums[i] = + dh::SumReduction(shard->temp_memory, shard->gpair.Data(), + shard->gpair.Size()); + }); auto sum_gradient = std::accumulate(tmp_sums.begin(), tmp_sums.end(), GradientPair()); // Generate root histogram - for (auto& shard : shards_) { - shard->BuildHist(root_nidx); - } + dh::ExecuteShards(&shards_, [&](std::unique_ptr& shard) { + shard->BuildHist(root_nidx); + }); this->AllReduceHist(root_nidx); @@ -802,14 +793,11 @@ class GPUHistMaker : public TreeUpdater { auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_; - omp_set_num_threads(shards_.size()); -#pragma omp parallel - { - auto cpu_thread_id = omp_get_thread_num(); - shards_[cpu_thread_id]->UpdatePosition(nidx, left_nidx, right_nidx, fidx, - split_gidx, default_dir_left, - is_dense, fidx_begin, fidx_end); - } + dh::ExecuteShards(&shards_, [&](std::unique_ptr& shard) { + shard->UpdatePosition(nidx, left_nidx, right_nidx, fidx, + split_gidx, default_dir_left, + is_dense, fidx_begin, fidx_end); + }); } void ApplySplit(const ExpandEntry& candidate, RegTree* p_tree) { @@ -903,8 +891,6 @@ class GPUHistMaker : public TreeUpdater { monitor_.Stop("EvaluateSplits", device_list_); } } - // Reset omp num threads - omp_set_num_threads(nthread); } bool UpdatePredictionCache( @@ -912,13 +898,10 @@ class GPUHistMaker : public TreeUpdater { monitor_.Start("UpdatePredictionCache", device_list_); if (shards_.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data) return false; - - bst_float* out_preds_d = p_out_preds->DevicePointer(param_.gpu_id); - -#pragma omp parallel for schedule(static, 1) - for (int shard = 0; shard < shards_.size(); ++shard) { - shards_[shard]->UpdatePredictionCache(out_preds_d); - } + p_out_preds->Reshard(devices_); + dh::ExecuteShards(&shards_, [&](std::unique_ptr& shard) { + shard->UpdatePredictionCache(p_out_preds->DevicePointer(shard->device_idx)); + }); monitor_.Stop("UpdatePredictionCache", device_list_); return true; } @@ -992,6 +975,7 @@ class GPUHistMaker : public TreeUpdater { std::vector device_list_; DMatrix* p_last_fmat_; + GPUSet devices_; }; XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, 
"grow_gpu_hist")