/*!
 * Copyright 2017 XGBoost contributors
 */
#include <thrust/fill.h>
#include <thrust/device_ptr.h>

#include <algorithm>
#include <cstdint>
#include <initializer_list>
#include <memory>
#include <utility>
#include <vector>

#include "xgboost/data.h"
#include "xgboost/host_device_vector.h"
#include "device_helpers.cuh"

namespace xgboost {

// the handler to call instead of cudaSetDevice; only used for testing
static void (*cudaSetDeviceHandler)(int) = nullptr;  // NOLINT

void SetCudaSetDeviceHandler(void (*handler)(int)) {
  cudaSetDeviceHandler = handler;
}

// Backing implementation for HostDeviceVector<T>.
//
// Keeps at most two copies of the contents: data_h_ on the host and data_d_
// on a single CUDA device (device_; -1 means host-only).  Synchronisation is
// lazy and is tracked by gpu_access_:
//   kNone  - host owns the data: host may read and write.
//   kRead  - both copies valid: either side may read, neither may write.
//   kWrite - device owns the data: device may read and write.
// Host permissions shrink as gpu_access_ grows (checked with <=), device
// permissions grow with it (checked with >=); see HostCanAccess /
// DeviceCanAccess below.
template <typename T>
class HostDeviceVectorImpl {
 public:
  // Construct with `size` copies of `v`, allocated directly on `device` when
  // device >= 0, otherwise on the host.
  HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) {
    if (device >= 0) {
      gpu_access_ = GPUAccess::kWrite;
      SetDevice();
      data_d_->resize(size, v);
    } else {
      data_h_.resize(size, v);
    }
  }

  // Initializer can be std::vector or std::initializer_list
  template <class Initializer>
  HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) {
    if (device >= 0) {
      gpu_access_ = GPUAccess::kWrite;
      LazyResizeDevice(init.size());
      Copy(init);
    } else {
      data_h_ = init;
    }
  }

  HostDeviceVectorImpl(HostDeviceVectorImpl&& that)
      : device_{that.device_},
        data_h_{std::move(that.data_h_)},
        data_d_{std::move(that.data_d_)},
        gpu_access_{that.gpu_access_} {}

  // Select the owning device before data_d_ is destroyed so the device buffer
  // is released on the device it was allocated on.
  ~HostDeviceVectorImpl() {
    if (device_ >= 0) {
      SetDevice();
    }
  }

  // Size of whichever copy is currently valid; 0 when the device buffer was
  // never allocated and the host may not read.
  size_t Size() const {
    return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->size() : 0;
  }

  int DeviceIdx() const { return device_; }

  T* DevicePointer() {
    LazySyncDevice(GPUAccess::kWrite);
    return data_d_->data().get();
  }

  const T* ConstDevicePointer() {
    LazySyncDevice(GPUAccess::kRead);
    return data_d_->data().get();
  }

  common::Span<T> DeviceSpan() {
    LazySyncDevice(GPUAccess::kWrite);
    return {data_d_->data().get(), Size()};
  }

  common::Span<const T> ConstDeviceSpan() {
    LazySyncDevice(GPUAccess::kRead);
    return {data_d_->data().get(), Size()};
  }

  // Fill every element with v on whichever side currently holds the data; a
  // device-side fill leaves the device as the owner (kWrite).
  void Fill(T v) {  // NOLINT
    if (HostCanWrite()) {
      std::fill(data_h_.begin(), data_h_.end(), v);
    } else {
      gpu_access_ = GPUAccess::kWrite;
      SetDevice();
      thrust::fill(data_d_->begin(), data_d_->end(), v);
    }
  }

  // Copy from another vector of equal size; stays on the host only when both
  // sides are host-writable, otherwise goes through the device path.
  void Copy(HostDeviceVectorImpl<T>* other) {
    CHECK_EQ(Size(), other->Size());
    SetDevice(other->device_);
    // Data is on host.
    if (HostCanWrite() && other->HostCanWrite()) {
      std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
      return;
    }
    SetDevice();
    CopyToDevice(other);
  }

  void Copy(const std::vector<T>& other) {
    CHECK_EQ(Size(), other.size());
    if (HostCanWrite()) {
      std::copy(other.begin(), other.end(), data_h_.begin());
    } else {
      CopyToDevice(other.data());
    }
  }

  void Copy(std::initializer_list<T> other) {
    CHECK_EQ(Size(), other.size());
    if (HostCanWrite()) {
      std::copy(other.begin(), other.end(), data_h_.begin());
    } else {
      CopyToDevice(other.begin());
    }
  }

  // Append the contents of `other`, growing this vector; uses a host-side copy
  // when both sides permit it, otherwise a device-to-device memcpy (both
  // vectors must then live on the same device).
  void Extend(HostDeviceVectorImpl* other) {
    auto ori_size = this->Size();
    this->Resize(ori_size + other->Size(), T());
    if (HostCanWrite() && other->HostCanRead()) {
      auto& h_vec = this->HostVector();
      auto& other_vec = other->HostVector();
      CHECK_EQ(h_vec.size(), ori_size + other->Size());
      std::copy(other_vec.cbegin(), other_vec.cend(), h_vec.begin() + ori_size);
    } else {
      auto ptr = other->ConstDevicePointer();
      SetDevice();
      CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
      dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr,
                                    other->Size() * sizeof(T),
                                    cudaMemcpyDeviceToDevice));
    }
  }

  // Mutable host view: pulls data to the host and revokes all device access.
  std::vector<T>& HostVector() {
    LazySyncHost(GPUAccess::kNone);
    return data_h_;
  }

  // Read-only host view: both copies stay valid (kRead).
  const std::vector<T>& ConstHostVector() {
    LazySyncHost(GPUAccess::kRead);
    return data_h_;
  }

  // Migrate to another device (or to the host with device = -1).  Data is
  // staged through the host when leaving the old device.
  void SetDevice(int device) {
    if (device_ == device) { return; }
    if (device_ >= 0) {
      LazySyncHost(GPUAccess::kNone);
    }
    device_ = device;
    if (device_ >= 0) {
      LazyResizeDevice(data_h_.size());
    }
  }

  void Resize(size_t new_size, T v) {
    if (new_size == Size()) { return; }
    if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) {
      // fast on-device resize
      gpu_access_ = GPUAccess::kWrite;
      SetDevice();
      data_d_->resize(new_size, v);
    } else {
      // resize on host
      LazySyncHost(GPUAccess::kNone);
      data_h_.resize(new_size, v);
    }
  }

  // Ensure the host copy is valid and clamp device access to `access`.
  // Copies device -> host only when the host cannot already read.
  void LazySyncHost(GPUAccess access) {
    if (HostCanAccess(access)) { return; }
    if (HostCanRead()) {
      // data is present, just need to deny access to the device
      gpu_access_ = access;
      return;
    }
    gpu_access_ = access;
    if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
    SetDevice();
    dh::safe_cuda(cudaMemcpy(data_h_.data(), data_d_->data().get(),
                             data_d_->size() * sizeof(T),
                             cudaMemcpyDeviceToHost));
  }

  // Ensure the device copy is valid and raise device access to `access`.
  // Copies host -> device only when the device cannot already read.
  void LazySyncDevice(GPUAccess access) {
    if (DeviceCanAccess(access)) { return; }
    if (DeviceCanRead()) {
      // deny read to the host
      gpu_access_ = access;
      return;
    }
    // data is on the host
    LazyResizeDevice(data_h_.size());
    SetDevice();
    dh::safe_cuda(cudaMemcpy(data_d_->data().get(), data_h_.data(),
                             data_d_->size() * sizeof(T),
                             cudaMemcpyHostToDevice));
    gpu_access_ = access;
  }

  bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
  bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
  bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
  bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
  bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
  bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
  GPUAccess Access() const { return gpu_access_; }

 private:
  int device_{-1};
  std::vector<T> data_h_{};
  std::unique_ptr<dh::device_vector<T>> data_d_{};
  GPUAccess gpu_access_{GPUAccess::kNone};

  void CopyToDevice(HostDeviceVectorImpl* other) {
    if (other->HostCanWrite()) {
      CopyToDevice(other->data_h_.data());
    } else {
      LazyResizeDevice(Size());
      gpu_access_ = GPUAccess::kWrite;
      SetDevice();
      dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
                                    other->data_d_->data().get(),
                                    data_d_->size() * sizeof(T),
                                    cudaMemcpyDefault));
    }
  }

  void CopyToDevice(const T* begin) {
    LazyResizeDevice(Size());
    gpu_access_ = GPUAccess::kWrite;
    SetDevice();
    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
                                  data_d_->size() * sizeof(T),
                                  cudaMemcpyDefault));
  }

  void LazyResizeDevice(size_t new_size) {
    if (data_d_ && new_size == data_d_->size()) { return; }
    SetDevice();
    data_d_->resize(new_size);
  }

  // Bind the calling thread to device_ (or the test handler) and lazily
  // allocate the device buffer the first time it is needed.
  void SetDevice() {
    CHECK_GE(device_, 0);
    if (cudaSetDeviceHandler == nullptr) {
      dh::safe_cuda(cudaSetDevice(device_));
    } else {
      (*cudaSetDeviceHandler)(device_);
    }
    if (!data_d_) {
      data_d_.reset(new dh::device_vector<T>);
    }
  }
};

// HostDeviceVector<T> is a thin pimpl wrapper: every member forwards to
// HostDeviceVectorImpl<T> above.

template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
    : impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}

template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
    : impl_(new HostDeviceVectorImpl<T>(init, device)) {}

template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
    : impl_(new HostDeviceVectorImpl<T>(init, device)) {}

template <typename T>
HostDeviceVector<T>::HostDeviceVector(HostDeviceVector<T>&& other)
    : impl_(new HostDeviceVectorImpl<T>(std::move(*other.impl_))) {}

template <typename T>
HostDeviceVector<T>& HostDeviceVector<T>::operator=(HostDeviceVector<T>&& other) {
  if (this == &other) { return *this; }
  // Build the new impl before deleting the old one for exception safety.
  std::unique_ptr<HostDeviceVectorImpl<T>> new_impl(
      new HostDeviceVectorImpl<T>(std::move(*other.impl_)));
  delete impl_;
  impl_ = new_impl.release();
  return *this;
}

template <typename T>
HostDeviceVector<T>::~HostDeviceVector() {
  delete impl_;
  impl_ = nullptr;
}

template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }

template <typename T>
int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }

template <typename T>
T* HostDeviceVector<T>::DevicePointer() { return impl_->DevicePointer(); }

template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer() const {
  return impl_->ConstDevicePointer();
}

template <typename T>
common::Span<T> HostDeviceVector<T>::DeviceSpan() { return impl_->DeviceSpan(); }

template <typename T>
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
  return impl_->ConstDeviceSpan();
}

template <typename T>
void HostDeviceVector<T>::Fill(T v) { impl_->Fill(v); }

template <typename T>
void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other) {
  impl_->Copy(other.impl_);
}

template <typename T>
void HostDeviceVector<T>::Copy(const std::vector<T>& other) {
  impl_->Copy(other);
}

template <typename T>
void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
  impl_->Copy(other);
}

template <typename T>
void HostDeviceVector<T>::Extend(HostDeviceVector const& other) {
  impl_->Extend(other.impl_);
}

template <typename T>
std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->HostVector(); }

template <typename T>
const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
  return impl_->ConstHostVector();
}

template <typename T>
bool HostDeviceVector<T>::HostCanRead() const { return impl_->HostCanRead(); }

template <typename T>
bool HostDeviceVector<T>::HostCanWrite() const { return impl_->HostCanWrite(); }

template <typename T>
bool HostDeviceVector<T>::DeviceCanRead() const { return impl_->DeviceCanRead(); }

template <typename T>
bool HostDeviceVector<T>::DeviceCanWrite() const { return impl_->DeviceCanWrite(); }

template <typename T>
GPUAccess HostDeviceVector<T>::DeviceAccess() const { return impl_->Access(); }

template <typename T>
void HostDeviceVector<T>::SetDevice(int device) const { impl_->SetDevice(device); }

template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
  impl_->Resize(new_size, v);
}

// explicit instantiations are required, as HostDeviceVector isn't header-only
// NOTE(review): the template arguments below were lost in the corrupted
// source; the surviving comments pin the 3rd, 7th and 8th, the rest follow
// upstream xgboost — confirm against the project's host_device_vector.h.
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<int32_t>;   // bst_node_t
template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>;  // bst_row_t
template class HostDeviceVector<uint32_t>;  // bst_feature_t

#if defined(__APPLE__)
/*
 * On OSX:
 *
 * typedef unsigned int uint32_t;
 * typedef unsigned long long uint64_t;
 * typedef unsigned long __darwin_size_t;
 */
template class HostDeviceVector<std::size_t>;
#endif  // defined(__APPLE__)

}  // namespace xgboost