diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 979167459..1e37050b3 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -238,8 +238,7 @@ class MemoryLogger { device_allocations.erase(itr); } }; - std::map - stats_; // Map device ordinal to memory information + DeviceStats stats_; std::mutex mutex_; public: @@ -249,8 +248,8 @@ public: std::lock_guard guard(mutex_); int current_device; safe_cuda(cudaGetDevice(&current_device)); - stats_[current_device].RegisterAllocation(ptr, n); - CHECK_LE(stats_[current_device].peak_allocated_bytes, dh::TotalMemory(current_device)); + stats_.RegisterAllocation(ptr, n); + CHECK_LE(stats_.peak_allocated_bytes, dh::TotalMemory(current_device)); } void RegisterDeallocation(void *ptr, size_t n) { if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) @@ -258,19 +257,19 @@ public: std::lock_guard guard(mutex_); int current_device; safe_cuda(cudaGetDevice(&current_device)); - stats_[current_device].RegisterDeallocation(ptr, n, current_device); + stats_.RegisterDeallocation(ptr, n, current_device); } void Log() { if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) return; std::lock_guard guard(mutex_); - for (const auto &kv : stats_) { - LOG(CONSOLE) << "======== Device " << kv.first << " Memory Allocations: " - << " ========"; - LOG(CONSOLE) << "Peak memory usage: " - << kv.second.peak_allocated_bytes / 1000000 << "mb"; - LOG(CONSOLE) << "Number of allocations: " << kv.second.num_allocations; - } + int current_device; + safe_cuda(cudaGetDevice(&current_device)); + LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: " + << " ========"; + LOG(CONSOLE) << "Peak memory usage: " + << stats_.peak_allocated_bytes / 1000000 << "mb"; + LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations; } }; }; @@ -940,10 +939,9 @@ class AllReducer { size_t allreduce_calls_; // Keep statistics of the number of reduce calls std::vector host_data; // Used for all reduce on host #ifdef XGBOOST_USE_NCCL - std::vector comms; - std::vector streams; - std::vector device_ordinals; // device id from CUDA - std::vector device_counts; // device count from CUDA + ncclComm_t comm; + cudaStream_t stream; + int device_ordinal; ncclUniqueId id; #endif @@ -952,79 +950,28 @@ class AllReducer { allreduce_calls_(0) {} /** - * \brief If we are using a single GPU only - */ - bool IsSingleGPU() { -#ifdef XGBOOST_USE_NCCL - CHECK(device_counts.size() > 0) << "AllReducer not initialised."; - return device_counts.size() <= 1 && device_counts.at(0) == 1; -#else - return true; -#endif - } - - /** - * \brief Initialise with the desired device ordinals for this communication + * \brief Initialise with the desired device ordinal for this communication * group. * - * \param device_ordinals The device ordinals. + * \param device_ordinal The device ordinal. */ - void Init(const std::vector &device_ordinals) { + void Init(int _device_ordinal) { #ifdef XGBOOST_USE_NCCL /** \brief this >monitor . init. 
*/ - this->device_ordinals = device_ordinals; - this->device_counts.resize(rabit::GetWorldSize()); - this->comms.resize(device_ordinals.size()); - this->streams.resize(device_ordinals.size()); - this->id = GetUniqueId(); - - device_counts.at(rabit::GetRank()) = device_ordinals.size(); - for (size_t i = 0; i < device_counts.size(); i++) { - int dev_count = device_counts.at(i); - rabit::Allreduce(&dev_count, 1); - device_counts.at(i) = dev_count; - } - - int nccl_rank = 0; - int nccl_rank_offset = std::accumulate(device_counts.begin(), - device_counts.begin() + rabit::GetRank(), 0); - int nccl_nranks = std::accumulate(device_counts.begin(), - device_counts.end(), 0); - nccl_rank += nccl_rank_offset; - - GroupStart(); - for (size_t i = 0; i < device_ordinals.size(); i++) { - int dev = device_ordinals.at(i); - dh::safe_cuda(cudaSetDevice(dev)); - dh::safe_nccl(ncclCommInitRank( - &comms.at(i), - nccl_nranks, id, - nccl_rank)); - - nccl_rank++; - } - GroupEnd(); - - for (size_t i = 0; i < device_ordinals.size(); i++) { - safe_cuda(cudaSetDevice(device_ordinals.at(i))); - safe_cuda(cudaStreamCreate(&streams.at(i))); - } + device_ordinal = _device_ordinal; + id = GetUniqueId(); + dh::safe_cuda(cudaSetDevice(device_ordinal)); + dh::safe_nccl(ncclCommInitRank(&comm, rabit::GetWorldSize(), id, rabit::GetRank())); + safe_cuda(cudaStreamCreate(&stream)); initialised_ = true; -#else - CHECK_EQ(device_ordinals.size(), 1) - << "XGBoost must be compiled with NCCL to use more than one GPU."; #endif } ~AllReducer() { #ifdef XGBOOST_USE_NCCL if (initialised_) { - for (auto &stream : streams) { - dh::safe_cuda(cudaStreamDestroy(stream)); - } - for (auto &comm : comms) { - ncclCommDestroy(comm); - } + dh::safe_cuda(cudaStreamDestroy(stream)); + ncclCommDestroy(comm); } if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { LOG(CONSOLE) << "======== NCCL Statistics========"; @@ -1035,20 +982,21 @@ class AllReducer { } /** - * \brief Use in exactly the same way as ncclGroupStart + * \brief Allreduce. Use in exactly the same way as NCCL but without needing + * streams or comms. + * + * \param sendbuff The sendbuff. + * \param recvbuff The recvbuff. + * \param count Number of elements. */ - void GroupStart() { -#ifdef XGBOOST_USE_NCCL - dh::safe_nccl(ncclGroupStart()); -#endif - } - /** - * \brief Use in exactly the same way as ncclGroupEnd - */ - void GroupEnd() { + void AllReduceSum(const double *sendbuff, double *recvbuff, int count) { #ifdef XGBOOST_USE_NCCL - dh::safe_nccl(ncclGroupEnd()); + CHECK(initialised_); + dh::safe_cuda(cudaSetDevice(device_ordinal)); + dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum, comm, stream)); + allreduce_bytes_ += count * sizeof(double); + allreduce_calls_ += 1; #endif } @@ -1056,51 +1004,18 @@ class AllReducer { * \brief Allreduce. Use in exactly the same way as NCCL but without needing * streams or comms. * - * \param communication_group_idx Zero-based index of the communication group. * \param sendbuff The sendbuff. * \param recvbuff The recvbuff. * \param count Number of elements. 
*/ - void AllReduceSum(int communication_group_idx, const double *sendbuff, - double *recvbuff, int count) { + void AllReduceSum(const float *sendbuff, float *recvbuff, int count) { #ifdef XGBOOST_USE_NCCL CHECK(initialised_); - dh::safe_cuda(cudaSetDevice(device_ordinals.at(communication_group_idx))); - dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum, - comms.at(communication_group_idx), - streams.at(communication_group_idx))); - if(communication_group_idx == 0) - { - allreduce_bytes_ += count * sizeof(double); - allreduce_calls_ += 1; - } -#endif - } - - /** - * \brief Allreduce. Use in exactly the same way as NCCL but without needing - * streams or comms. - * - * \param communication_group_idx Zero-based index of the communication group. - * \param sendbuff The sendbuff. - * \param recvbuff The recvbuff. - * \param count Number of elements. - */ - - void AllReduceSum(int communication_group_idx, const float *sendbuff, - float *recvbuff, int count) { -#ifdef XGBOOST_USE_NCCL - CHECK(initialised_); - dh::safe_cuda(cudaSetDevice(device_ordinals.at(communication_group_idx))); - dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, - comms.at(communication_group_idx), - streams.at(communication_group_idx))); - if(communication_group_idx == 0) - { - allreduce_bytes_ += count * sizeof(float); - allreduce_calls_ += 1; - } + dh::safe_cuda(cudaSetDevice(device_ordinal)); + dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm, stream)); + allreduce_bytes_ += count * sizeof(float); + allreduce_calls_ += 1; #endif } @@ -1109,21 +1024,17 @@ class AllReducer { * * \param count Number of. * - * \param communication_group_idx Zero-based index of the communication group. \param sendbuff. * \param sendbuff The sendbuff. * \param recvbuff The recvbuff. * \param count Number of. */ - void AllReduceSum(int communication_group_idx, const int64_t *sendbuff, - int64_t *recvbuff, int count) { + void AllReduceSum(const int64_t *sendbuff, int64_t *recvbuff, int count) { #ifdef XGBOOST_USE_NCCL CHECK(initialised_); - dh::safe_cuda(cudaSetDevice(device_ordinals[communication_group_idx])); - dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclInt64, ncclSum, - comms[communication_group_idx], - streams[communication_group_idx])); + dh::safe_cuda(cudaSetDevice(device_ordinal)); + dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclInt64, ncclSum, comm, stream)); #endif } @@ -1134,26 +1045,8 @@ class AllReducer { */ void Synchronize() { #ifdef XGBOOST_USE_NCCL - for (size_t i = 0; i < device_ordinals.size(); i++) { - dh::safe_cuda(cudaSetDevice(device_ordinals[i])); - dh::safe_cuda(cudaStreamSynchronize(streams[i])); - } -#endif - }; - - /** - * \brief Synchronizes the device - * - * \param device_id Identifier for the device. - */ - void Synchronize(int device_id) { -#ifdef XGBOOST_USE_NCCL - SaveCudaContext([&]() { - dh::safe_cuda(cudaSetDevice(device_id)); - int idx = std::find(device_ordinals.begin(), device_ordinals.end(), device_id) - device_ordinals.begin(); - CHECK(idx < device_ordinals.size()); - dh::safe_cuda(cudaStreamSynchronize(streams[idx])); - }); + dh::safe_cuda(cudaSetDevice(device_ordinal)); + dh::safe_cuda(cudaStreamSynchronize(stream)); #endif }; @@ -1219,58 +1112,6 @@ class AllReducer { } }; -/** - * \brief Executes some operation on each element of the input vector, using a - * single controlling thread for each element. In addition, passes the shard index - * into the function. 
- * - * \tparam T Generic type parameter. - * \tparam FunctionT Type of the function t. - * \param shards The shards. - * \param f The func_t to process. - */ - -template -void ExecuteIndexShards(std::vector *shards, FunctionT f) { - SaveCudaContext{[&]() { - // Temporarily turn off dynamic so we have a guaranteed number of threads - bool dynamic = omp_get_dynamic(); - omp_set_dynamic(false); - const long shards_size = static_cast(shards->size()); -#pragma omp parallel for schedule(static, 1) if (shards_size > 1) num_threads(shards_size) - for (long shard = 0; shard < shards_size; ++shard) { - f(shard, shards->at(shard)); - } - omp_set_dynamic(dynamic); - }}; -} - -/** - * \brief Executes some operation on each element of the input vector, using a single controlling - * thread for each element, returns the sum of the results. - * - * \tparam ReduceT Type of the reduce t. - * \tparam T Generic type parameter. - * \tparam FunctionT Type of the function t. - * \param shards The shards. - * \param f The func_t to process. - * - * \return A reduce_t. - */ - -template -ReduceT ReduceShards(std::vector *shards, FunctionT f) { - std::vector sums(shards->size()); - SaveCudaContext { - [&](){ -#pragma omp parallel for schedule(static, 1) if (shards->size() > 1) - for (int shard = 0; shard < shards->size(); ++shard) { - sums[shard] = f(shards->at(shard)); - }} - }; - return std::accumulate(sums.begin(), sums.end(), ReduceT()); -} - template ::index_type> xgboost::common::Span ToSpan( diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index c28218b4e..8955dac54 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -108,9 +108,6 @@ void HostDeviceVector::Resize(size_t new_size, T v) { impl_->Vec().resize(new_size, v); } -template -size_t HostDeviceVector::DeviceSize() const { return 0; } - template void HostDeviceVector::Fill(T v) { std::fill(HostVector().begin(), HostVector().end(), v); @@ -135,12 +132,22 @@ void HostDeviceVector::Copy(std::initializer_list other) { } template -bool HostDeviceVector::HostCanAccess(GPUAccess access) const { +bool HostDeviceVector::HostCanRead() const { return true; } template -bool HostDeviceVector::DeviceCanAccess(GPUAccess access) const { +bool HostDeviceVector::HostCanWrite() const { + return true; +} + +template +bool HostDeviceVector::DeviceCanRead() const { + return false; +} + +template +bool HostDeviceVector::DeviceCanWrite() const { return false; } diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 3205e8fd1..4cbe05da3 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -19,33 +19,12 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) { cudaSetDeviceHandler = handler; } -// wrapper over access with useful methods -class Permissions { - GPUAccess access_; - explicit Permissions(GPUAccess access) : access_{access} {} - - public: - Permissions() : access_{GPUAccess::kNone} {} - explicit Permissions(bool perm) - : access_(perm ? 
GPUAccess::kWrite : GPUAccess::kNone) {} - - bool CanRead() const { return access_ >= kRead; } - bool CanWrite() const { return access_ == kWrite; } - bool CanAccess(GPUAccess access) const { return access_ >= access; } - void Grant(GPUAccess access) { access_ = std::max(access_, access); } - void DenyComplementary(GPUAccess compl_access) { - access_ = std::min(access_, GPUAccess::kWrite - compl_access); - } - Permissions Complementary() const { - return Permissions(GPUAccess::kWrite - access_); - } -}; - template class HostDeviceVectorImpl { public: - HostDeviceVectorImpl(size_t size, T v, int device) : device_(device), perm_h_(device < 0) { + HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) { if (device >= 0) { + gpu_access_ = GPUAccess::kWrite; SetDevice(); data_d_.resize(size, v); } else { @@ -53,19 +32,11 @@ class HostDeviceVectorImpl { } } - // required, as a new std::mutex has to be created - HostDeviceVectorImpl(const HostDeviceVectorImpl& other) - : device_(other.device_), data_h_(other.data_h_), perm_h_(other.perm_h_), mutex_() { - if (device_ >= 0) { - SetDevice(); - data_d_ = other.data_d_; - } - } - // Initializer can be std::vector or std::initializer_list template - HostDeviceVectorImpl(const Initializer& init, int device) : device_(device), perm_h_(device < 0) { + HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) { if (device >= 0) { + gpu_access_ = GPUAccess::kWrite; LazyResizeDevice(init.size()); Copy(init); } else { @@ -79,7 +50,7 @@ class HostDeviceVectorImpl { } } - size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : data_d_.size(); } + size_t Size() const { return HostCanRead() ? data_h_.size() : data_d_.size(); } int DeviceIdx() const { return device_; } @@ -95,18 +66,13 @@ class HostDeviceVectorImpl { common::Span DeviceSpan() { LazySyncDevice(GPUAccess::kWrite); - return {data_d_.data().get(), static_cast::index_type>(DeviceSize())}; + return {data_d_.data().get(), static_cast::index_type>(Size())}; } common::Span ConstDeviceSpan() { LazySyncDevice(GPUAccess::kRead); using SpanInd = typename common::Span::index_type; - return {data_d_.data().get(), static_cast(DeviceSize())}; - } - - size_t DeviceSize() { - LazySyncDevice(GPUAccess::kRead); - return data_d_.size(); + return {data_d_.data().get(), static_cast(Size())}; } thrust::device_ptr tbegin() { // NOLINT @@ -118,55 +84,53 @@ class HostDeviceVectorImpl { } thrust::device_ptr tend() { // NOLINT - return tbegin() + DeviceSize(); + return tbegin() + Size(); } thrust::device_ptr tcend() { // NOLINT - return tcbegin() + DeviceSize(); + return tcbegin() + Size(); } void Fill(T v) { // NOLINT - if (perm_h_.CanWrite()) { + if (HostCanWrite()) { std::fill(data_h_.begin(), data_h_.end(), v); } else { - DeviceFill(v); + gpu_access_ = GPUAccess::kWrite; + SetDevice(); + thrust::fill(data_d_.begin(), data_d_.end(), v); } } void Copy(HostDeviceVectorImpl* other) { CHECK_EQ(Size(), other->Size()); // Data is on host. 
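[Editor's note] The hunks that follow replace the per-vector Permissions helper (and the mutex guarding it) with a single gpu_access_ value of type GPUAccess. Because the enum is ordered kNone < kRead < kWrite, one comparison per side expresses the whole policy: host access requires gpu_access_ to be low enough, device access requires it to be high enough. A standalone sketch of the three resulting states, using only names from this patch (illustration only, not part of the diff):

    enum GPUAccess { kNone, kRead, kWrite };  // the ordering is what the checks rely on

    // gpu_access_ == kNone  : host may read and write, device has no access
    // gpu_access_ == kRead  : host and device may both read, neither may write
    // gpu_access_ == kWrite : device may read and write, host has no access
    bool HostCanRead(GPUAccess g)    { return g <= GPUAccess::kRead;  }
    bool HostCanWrite(GPUAccess g)   { return g <= GPUAccess::kNone;  }
    bool DeviceCanRead(GPUAccess g)  { return g >= GPUAccess::kRead;  }
    bool DeviceCanWrite(GPUAccess g) { return g >= GPUAccess::kWrite; }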
- if (perm_h_.CanWrite() && other->perm_h_.CanWrite()) { + if (HostCanWrite() && other->HostCanWrite()) { std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin()); return; } - // Data is on device; - if (device_ != other->device_) { - SetDevice(other->device_); - } - DeviceCopy(other); + CopyToDevice(other); } void Copy(const std::vector& other) { CHECK_EQ(Size(), other.size()); - if (perm_h_.CanWrite()) { + if (HostCanWrite()) { std::copy(other.begin(), other.end(), data_h_.begin()); } else { - DeviceCopy(other.data()); + CopyToDevice(other.data()); } } void Copy(std::initializer_list other) { CHECK_EQ(Size(), other.size()); - if (perm_h_.CanWrite()) { + if (HostCanWrite()) { std::copy(other.begin(), other.end(), data_h_.begin()); } else { - DeviceCopy(other.begin()); + CopyToDevice(other.begin()); } } std::vector& HostVector() { - LazySyncHost(GPUAccess::kWrite); + LazySyncHost(GPUAccess::kNone); return data_h_; } @@ -178,7 +142,7 @@ class HostDeviceVectorImpl { void SetDevice(int device) { if (device_ == device) { return; } if (device_ >= 0) { - LazySyncHost(GPUAccess::kWrite); + LazySyncHost(GPUAccess::kNone); } device_ = device; if (device_ >= 0) { @@ -190,38 +154,37 @@ class HostDeviceVectorImpl { if (new_size == Size()) { return; } if (Size() == 0 && device_ >= 0) { // fast on-device resize - perm_h_ = Permissions(false); + gpu_access_ = GPUAccess::kWrite; + SetDevice(); data_d_.resize(new_size, v); } else { // resize on host - LazySyncHost(GPUAccess::kWrite); + LazySyncHost(GPUAccess::kNone); data_h_.resize(new_size, v); } } void LazySyncHost(GPUAccess access) { - if (perm_h_.CanAccess(access)) { return; } - if (perm_h_.CanRead()) { + if (HostCanAccess(access)) { return; } + if (HostCanRead()) { // data is present, just need to deny access to the device - perm_h_.Grant(access); + gpu_access_ = access; return; } - std::lock_guard lock(mutex_); + gpu_access_ = access; if (data_h_.size() != data_d_.size()) { data_h_.resize(data_d_.size()); } SetDevice(); dh::safe_cuda(cudaMemcpy(data_h_.data(), data_d_.data().get(), data_d_.size() * sizeof(T), cudaMemcpyDeviceToHost)); - perm_h_.Grant(access); } void LazySyncDevice(GPUAccess access) { - if (DevicePerm().CanAccess(access)) { return; } - if (DevicePerm().CanRead()) { + if (DeviceCanAccess(access)) { return; } + if (DeviceCanRead()) { // deny read to the host - std::lock_guard lock(mutex_); - perm_h_.DenyComplementary(access); + gpu_access_ = access; return; } // data is on the host @@ -231,41 +194,37 @@ class HostDeviceVectorImpl { data_h_.data(), data_d_.size() * sizeof(T), cudaMemcpyHostToDevice)); - - std::lock_guard lock(mutex_); - perm_h_.DenyComplementary(access); + gpu_access_ = access; } - bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); } - bool DeviceCanAccess(GPUAccess access) { return DevicePerm().CanAccess(access); } + bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; } + bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); } + bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); } + bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; } + bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); } + bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); } private: int device_{-1}; std::vector data_h_{}; dh::device_vector data_d_{}; - Permissions perm_h_{false}; - // protects size_d_ and perm_h_ when updated from multiple threads - std::mutex mutex_{}; + GPUAccess 
gpu_access_{GPUAccess::kNone}; - void DeviceFill(T v) { - // TODO(canonizer): avoid full copy of host data - LazySyncDevice(GPUAccess::kWrite); - SetDevice(); - thrust::fill(data_d_.begin(), data_d_.end(), v); + void CopyToDevice(HostDeviceVectorImpl* other) { + if (other->HostCanWrite()) { + CopyToDevice(other->data_h_.data()); + } else { + LazyResizeDevice(Size()); + gpu_access_ = GPUAccess::kWrite; + SetDevice(); + dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), other->data_d_.data().get(), + data_d_.size() * sizeof(T), cudaMemcpyDefault)); + } } - void DeviceCopy(HostDeviceVectorImpl* other) { - // TODO(canonizer): avoid full copy of host data for this (but not for other) - LazySyncDevice(GPUAccess::kWrite); - other->LazySyncDevice(GPUAccess::kRead); - SetDevice(); - dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), other->data_d_.data().get(), - data_d_.size() * sizeof(T), cudaMemcpyDefault)); - } - - void DeviceCopy(const T* begin) { - // TODO(canonizer): avoid full copy of host data - LazySyncDevice(GPUAccess::kWrite); + void CopyToDevice(const T* begin) { + LazyResizeDevice(Size()); + gpu_access_ = GPUAccess::kWrite; SetDevice(); dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), begin, data_d_.size() * sizeof(T), cudaMemcpyDefault)); @@ -285,8 +244,6 @@ class HostDeviceVectorImpl { (*cudaSetDeviceHandler)(device_); } } - - Permissions DevicePerm() const { return perm_h_.Complementary(); } }; template @@ -347,11 +304,6 @@ common::Span HostDeviceVector::ConstDeviceSpan() const { return impl_->ConstDeviceSpan(); } -template -size_t HostDeviceVector::DeviceSize() const { - return impl_->DeviceSize(); -} - template thrust::device_ptr HostDeviceVector::tbegin() { // NOLINT return impl_->tbegin(); @@ -401,13 +353,23 @@ const std::vector& HostDeviceVector::ConstHostVector() const { } template -bool HostDeviceVector::HostCanAccess(GPUAccess access) const { - return impl_->HostCanAccess(access); +bool HostDeviceVector::HostCanRead() const { + return impl_->HostCanRead(); } template -bool HostDeviceVector::DeviceCanAccess(GPUAccess access) const { - return impl_->DeviceCanAccess(access); +bool HostDeviceVector::HostCanWrite() const { + return impl_->HostCanWrite(); +} + +template +bool HostDeviceVector::DeviceCanRead() const { + return impl_->DeviceCanRead(); +} + +template +bool HostDeviceVector::DeviceCanWrite() const { + return impl_->DeviceCanWrite(); } template diff --git a/src/common/host_device_vector.h b/src/common/host_device_vector.h index abaa5dd59..e54c50a24 100644 --- a/src/common/host_device_vector.h +++ b/src/common/host_device_vector.h @@ -79,16 +79,23 @@ void SetCudaSetDeviceHandler(void (*handler)(int)); template struct HostDeviceVectorImpl; +/*! + * \brief Controls data access from the GPU. + * + * Since a `HostDeviceVector` can have data on both the host and device, access control needs to be + * maintained to keep the data consistent. + * + * There are 3 scenarios supported: + * - Data is being manipulated on device. GPU has write access, host doesn't have access. + * - Data is read-only on both the host and device. + * - Data is being manipulated on the host. Host has write access, device doesn't have access. 
+ */ enum GPUAccess { kNone, kRead, // write implies read kWrite }; -inline GPUAccess operator-(GPUAccess a, GPUAccess b) { - return static_cast(static_cast(a) - static_cast(b)); -} - template class HostDeviceVector { public: @@ -111,8 +118,6 @@ class HostDeviceVector { const T* ConstHostPointer() const { return ConstHostVector().data(); } const T* HostPointer() const { return ConstHostPointer(); } - size_t DeviceSize() const; - // only define functions returning device_ptr // if HostDeviceVector.h is included from a .cu file #ifdef __CUDACC__ @@ -135,8 +140,10 @@ class HostDeviceVector { const std::vector& ConstHostVector() const; const std::vector& HostVector() const {return ConstHostVector(); } - bool HostCanAccess(GPUAccess access) const; - bool DeviceCanAccess(GPUAccess access) const; + bool HostCanRead() const; + bool HostCanWrite() const; + bool DeviceCanRead() const; + bool DeviceCanWrite() const; void SetDevice(int device) const; diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu index 661d02dac..8d4253a4f 100644 --- a/src/metric/elementwise_metric.cu +++ b/src/metric/elementwise_metric.cu @@ -68,7 +68,7 @@ class ElementWiseMetricsReduction { const HostDeviceVector& weights, const HostDeviceVector& labels, const HostDeviceVector& preds) { - size_t n_data = preds.DeviceSize(); + size_t n_data = preds.Size(); thrust::counting_iterator begin(0); thrust::counting_iterator end = begin + n_data; diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu index 5b18b9f5f..33a6ee565 100644 --- a/src/metric/multiclass_metric.cu +++ b/src/metric/multiclass_metric.cu @@ -85,7 +85,7 @@ class MultiClassMetricsReduction { const HostDeviceVector& labels, const HostDeviceVector& preds, const size_t n_class) { - size_t n_data = labels.DeviceSize(); + size_t n_data = labels.Size(); thrust::counting_iterator begin(0); thrust::counting_iterator end = begin + n_data; diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index bd3ac3583..d50cbcd79 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -231,12 +231,13 @@ class GPUPredictor : public xgboost::Predictor { this->num_group_ = model.param.num_output_group; } - void PredictInternal - (const SparsePage& batch, size_t num_features, - HostDeviceVector* predictions) { + void PredictInternal(const SparsePage& batch, + size_t num_features, + HostDeviceVector* predictions, + size_t batch_offset) { dh::safe_cuda(cudaSetDevice(device_)); const int BLOCK_THREADS = 128; - size_t num_rows = batch.offset.DeviceSize() - 1; + size_t num_rows = batch.Size(); const int GRID_SIZE = static_cast(common::DivRoundUp(num_rows, BLOCK_THREADS)); int shared_memory_bytes = static_cast @@ -249,10 +250,10 @@ class GPUPredictor : public xgboost::Predictor { size_t entry_start = 0; PredictKernel<<>> - (dh::ToSpan(nodes_), predictions->DeviceSpan(), dh::ToSpan(tree_segments_), - dh::ToSpan(tree_group_), batch.offset.DeviceSpan(), - batch.data.DeviceSpan(), this->tree_begin_, this->tree_end_, num_features, - num_rows, entry_start, use_shared, this->num_group_); + (dh::ToSpan(nodes_), predictions->DeviceSpan().subspan(batch_offset), + dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_), batch.offset.DeviceSpan(), + batch.data.DeviceSpan(), this->tree_begin_, this->tree_end_, num_features, num_rows, + entry_start, use_shared, this->num_group_); } private: @@ -297,28 +298,10 @@ class GPUPredictor : public xgboost::Predictor { InitModel(model, tree_begin, tree_end); size_t 
batch_offset = 0; - auto* preds = out_preds; - std::unique_ptr> batch_preds{nullptr}; for (auto &batch : dmat->GetBatches()) { - bool is_external_memory = batch.Size() < dmat->Info().num_row_; - if (is_external_memory) { - batch_preds.reset(new HostDeviceVector); - batch_preds->Resize(batch.Size() * model.param.num_output_group); - std::copy(out_preds->ConstHostVector().begin() + batch_offset, - out_preds->ConstHostVector().begin() + batch_offset + batch_preds->Size(), - batch_preds->HostVector().begin()); - preds = batch_preds.get(); - } - batch.offset.SetDevice(device_); batch.data.SetDevice(device_); - preds->SetDevice(device_); - shard_.PredictInternal(batch, model.param.num_feature, preds); - - if (is_external_memory) { - auto h_preds = preds->ConstHostVector(); - std::copy(h_preds.begin(), h_preds.end(), out_preds->HostVector().begin() + batch_offset); - } + shard_.PredictInternal(batch, model.param.num_feature, out_preds, batch_offset); batch_offset += batch.Size() * model.param.num_output_group; } @@ -356,6 +339,7 @@ class GPUPredictor : public xgboost::Predictor { size_t n_classes = model.param.num_output_group; size_t n = n_classes * info.num_row_; const HostDeviceVector& base_margin = info.base_margin_; + out_preds->SetDevice(device_); out_preds->Resize(n); if (base_margin.Size() != 0) { CHECK_EQ(base_margin.Size(), n); @@ -454,7 +438,7 @@ class GPUPredictor : public xgboost::Predictor { } private: - /*! \brief Re configure shards when GPUSet is changed. */ + /*! \brief Reconfigure the shard when GPU is changed. */ void ConfigureShard(int device) { if (device_ == device) return; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index b72acac8e..83525d64b 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -93,14 +93,14 @@ struct ExpandEntry { } }; -inline static bool DepthWise(ExpandEntry lhs, ExpandEntry rhs) { +inline static bool DepthWise(const ExpandEntry& lhs, const ExpandEntry& rhs) { if (lhs.depth == rhs.depth) { return lhs.timestamp > rhs.timestamp; // favor small timestamp } else { return lhs.depth > rhs.depth; // favor small depth } } -inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) { +inline static bool LossGuide(const ExpandEntry& lhs, const ExpandEntry& rhs) { if (lhs.split.loss_chg == rhs.split.loss_chg) { return lhs.timestamp > rhs.timestamp; // favor small timestamp } else { @@ -553,7 +553,7 @@ __global__ void SharedMemHistKernel(ELLPackMatrix matrix, // of rows to process from a batch and the position from which to process on each device. struct RowStateOnDevice { // Number of rows assigned to this device - const size_t total_rows_assigned_to_device; + size_t total_rows_assigned_to_device; // Number of rows processed thus far size_t total_rows_processed; // Number of rows to process from the current sparse page batch @@ -584,14 +584,13 @@ template struct DeviceShard { int n_bins; int device_id; - int shard_idx; // Position in the local array of shards dh::BulkAllocator ba; ELLPackMatrix ellpack_matrix; std::unique_ptr row_partitioner; - DeviceHistogram hist; + DeviceHistogram hist{}; /*! \brief row_ptr form HistogramCuts. */ common::Span feature_segments; @@ -611,9 +610,6 @@ struct DeviceShard { /*! \brief Sum gradient for each node. */ std::vector node_sum_gradients; common::Span node_sum_gradients_d; - /*! The row offset for this shard. 
*/ - bst_uint row_begin_idx; - bst_uint row_end_idx; bst_uint n_rows; TrainParam param; @@ -623,7 +619,7 @@ struct DeviceShard { dh::CubMemory temp_memory; dh::PinnedMemory pinned_memory; - std::vector streams; + std::vector streams{}; common::Monitor monitor; std::vector node_value_constraints; @@ -635,14 +631,10 @@ struct DeviceShard { std::function>; std::unique_ptr qexpand; - DeviceShard(int _device_id, int shard_idx, bst_uint row_begin, - bst_uint row_end, TrainParam _param, uint32_t column_sampler_seed, + DeviceShard(int _device_id, bst_uint _n_rows, TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features) : device_id(_device_id), - shard_idx(shard_idx), - row_begin_idx(row_begin), - row_end_idx(row_end), - n_rows(row_end - row_begin), + n_rows(_n_rows), n_bins(0), param(std::move(_param)), prediction_cache_initialised(false), @@ -658,7 +650,7 @@ struct DeviceShard { const SparsePage &row_batch, const common::HistogramCuts &hmat, const RowStateOnDevice &device_row_state, int rows_per_batch); - ~DeviceShard() { + ~DeviceShard() { // NOLINT dh::safe_cuda(cudaSetDevice(device_id)); for (auto& stream : streams) { dh::safe_cuda(cudaStreamDestroy(stream)); @@ -704,7 +696,7 @@ struct DeviceShard { dh::safe_cuda(cudaMemcpyAsync( gpair.data(), dh_gpair->ConstDevicePointer(), gpair.size() * sizeof(GradientPair), cudaMemcpyHostToHost)); - SubsampleGradientPair(device_id, gpair, param.subsample, row_begin_idx); + SubsampleGradientPair(device_id, gpair, param.subsample); hist.Reset(); } @@ -755,7 +747,7 @@ struct DeviceShard { DeviceNodeStats node(node_sum_gradients[nidx], nidx, param); auto d_result = d_result_all.subspan(i, 1); - if (d_feature_set.size() == 0) { + if (d_feature_set.empty()) { // Acting as a device side constructor for DeviceSplitCandidate. // DeviceSplitCandidate::IsValid is false so that ApplySplit can reject this // candidate. 
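[Editor's note] With the per-process device shards gone, the reducer calls in the hunk below lose their shard index and the per-device comm/stream lookups: each process now drives exactly one GPU and one NCCL communicator. A minimal usage sketch of the reduced dh::AllReducer interface, using only names that appear in this patch (the buffers and sizes are illustrative, and rabit is assumed to be initialised with one rank per GPU):

    #include <thrust/device_vector.h>
    // ...inside code that already includes device_helpers.cuh...
    dh::AllReducer reducer;
    reducer.Init(0);                                  // bind this rank to CUDA ordinal 0
    thrust::device_vector<double> send(16, 1.0), recv(16);
    reducer.AllReduceSum(send.data().get(), recv.data().get(), 16);  // wraps ncclAllReduce
    reducer.Synchronize();                            // waits on the reducer's internal stream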
@@ -927,12 +919,11 @@ struct DeviceShard { monitor.StartCuda("AllReduce"); auto d_node_hist = hist.GetNodeHistogram(nidx).data(); reducer->AllReduceSum( - shard_idx, reinterpret_cast(d_node_hist), reinterpret_cast(d_node_hist), ellpack_matrix.BinCount() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT))); - reducer->Synchronize(device_id); + reducer->Synchronize(); monitor.StopCuda("AllReduce"); } @@ -979,11 +970,11 @@ struct DeviceShard { void ApplySplit(const ExpandEntry& candidate, RegTree* p_tree) { RegTree& tree = *p_tree; - GradStats left_stats; + GradStats left_stats{}; left_stats.Add(candidate.split.left_sum); - GradStats right_stats; + GradStats right_stats{}; right_stats.Add(candidate.split.right_sum); - GradStats parent_sum; + GradStats parent_sum{}; parent_sum.Add(left_stats); parent_sum.Add(right_stats); node_value_constraints.resize(tree.GetNodes().size()); @@ -1021,9 +1012,9 @@ struct DeviceShard { dh::SumReduction(temp_memory, gpair, node_sum_gradients_d, gpair.size()); reducer->AllReduceSum( - shard_idx, reinterpret_cast(node_sum_gradients_d.data()), + reinterpret_cast(node_sum_gradients_d.data()), reinterpret_cast(node_sum_gradients_d.data()), 2); - reducer->Synchronize(device_id); + reducer->Synchronize(); dh::safe_cuda(cudaMemcpy(node_sum_gradients.data(), node_sum_gradients_d.data(), sizeof(GradientPair), cudaMemcpyDeviceToHost)); @@ -1238,52 +1229,44 @@ inline void DeviceShard::CreateHistIndices( class DeviceHistogramBuilderState { public: template - explicit DeviceHistogramBuilderState( - const std::vector>> &shards) { - device_row_states_.reserve(shards.size()); - for (const auto &shard : shards) { - device_row_states_.push_back(RowStateOnDevice(shard->n_rows)); - } - } + explicit DeviceHistogramBuilderState(const std::unique_ptr>& shard) + : device_row_state_(shard->n_rows) {} - const RowStateOnDevice &GetRowStateOnDevice(int idx) const { - return device_row_states_[idx]; + const RowStateOnDevice& GetRowStateOnDevice() const { + return device_row_state_; } // This method is invoked at the beginning of each sparse page batch. This distributes - // the rows in the sparse page to the different devices. + // the rows in the sparse page to the device. // TODO(sriramch): Think of a way to utilize *all* the GPUs to build the compressed bins. void BeginBatch(const SparsePage &batch) { size_t rem_rows = batch.Size(); size_t row_offset_in_current_batch = 0; - for (auto &device_row_state : device_row_states_) { - // Do we have anymore left to process from this batch on this device? - if (device_row_state.total_rows_assigned_to_device > device_row_state.total_rows_processed) { - // There are still some rows that needs to be assigned to this device - device_row_state.rows_to_process_from_batch = - std::min( - device_row_state.total_rows_assigned_to_device - device_row_state.total_rows_processed, - rem_rows); - } else { - // All rows have been assigned to this device - device_row_state.rows_to_process_from_batch = 0; - } - device_row_state.row_offset_in_current_batch = row_offset_in_current_batch; - row_offset_in_current_batch += device_row_state.rows_to_process_from_batch; - rem_rows -= device_row_state.rows_to_process_from_batch; + // Do we have anymore left to process from this batch on this device? 
+ if (device_row_state_.total_rows_assigned_to_device > device_row_state_.total_rows_processed) { + // There are still some rows that needs to be assigned to this device + device_row_state_.rows_to_process_from_batch = + std::min( + device_row_state_.total_rows_assigned_to_device - device_row_state_.total_rows_processed, + rem_rows); + } else { + // All rows have been assigned to this device + device_row_state_.rows_to_process_from_batch = 0; } + + device_row_state_.row_offset_in_current_batch = row_offset_in_current_batch; + row_offset_in_current_batch += device_row_state_.rows_to_process_from_batch; + rem_rows -= device_row_state_.rows_to_process_from_batch; } // This method is invoked after completion of each sparse page batch void EndBatch() { - for (auto &rs : device_row_states_) { - rs.Advance(); - } + device_row_state_.Advance(); } private: - std::vector device_row_states_; + RowStateOnDevice device_row_state_{0}; }; template @@ -1302,7 +1285,9 @@ class GPUHistMakerSpecialised { monitor_.Init("updater_gpu_hist"); } - ~GPUHistMakerSpecialised() { dh::GlobalMemoryLogger().Log(); } + ~GPUHistMakerSpecialised() { // NOLINT + dh::GlobalMemoryLogger().Log(); + } void Update(HostDeviceVector* gpair, DMatrix* dmat, const std::vector& trees) { @@ -1333,20 +1318,13 @@ class GPUHistMakerSpecialised { uint32_t column_sampling_seed = common::GlobalRandom()(); rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0); - // Create device shards - shards_.resize(1); - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(device_)); - size_t start = 0; - size_t size = info_->num_row_; - shard = std::unique_ptr>( - new DeviceShard(device_, idx, - start, start + size, param_, - column_sampling_seed, - info_->num_col_)); - }); + // Create device shard + dh::safe_cuda(cudaSetDevice(device_)); + shard_.reset(new DeviceShard(device_, + info_->num_row_, + param_, + column_sampling_seed, + info_->num_col_)); monitor_.StartCuda("Quantiles"); // Create the quantile sketches for the dmatrix and initialize HistogramCuts @@ -1355,32 +1333,22 @@ class GPUHistMakerSpecialised { dmat, &hmat_); monitor_.StopCuda("Quantiles"); - n_bins_ = hmat_.Ptrs().back(); - auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_; // Init global data for each shard monitor_.StartCuda("InitCompressedData"); - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - shard->InitCompressedData(hmat_, row_stride, is_dense); - }); + dh::safe_cuda(cudaSetDevice(shard_->device_id)); + shard_->InitCompressedData(hmat_, row_stride, is_dense); monitor_.StopCuda("InitCompressedData"); monitor_.StartCuda("BinningCompression"); - DeviceHistogramBuilderState hist_builder_row_state(shards_); + DeviceHistogramBuilderState hist_builder_row_state(shard_); for (const auto &batch : dmat->GetBatches()) { hist_builder_row_state.BeginBatch(batch); - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - shard->CreateHistIndices(batch, hmat_, hist_builder_row_state.GetRowStateOnDevice(idx), - hist_maker_param_.gpu_batch_nrows); - }); + dh::safe_cuda(cudaSetDevice(shard_->device_id)); + shard_->CreateHistIndices(batch, hmat_, hist_builder_row_state.GetRowStateOnDevice(), + hist_maker_param_.gpu_batch_nrows); hist_builder_row_state.EndBatch(); } @@ -1408,7 +1376,7 @@ class GPUHistMakerSpecialised { } fs.Seek(0); 
rabit::Broadcast(&s_model, 0); - RegTree reference_tree; + RegTree reference_tree{}; reference_tree.Load(&fs); for (const auto& tree : local_trees) { CHECK(tree == reference_tree); @@ -1421,66 +1389,39 @@ class GPUHistMakerSpecialised { this->InitData(gpair, p_fmat); monitor_.StopCuda("InitData"); - std::vector trees(shards_.size()); - for (auto& tree : trees) { - tree = *p_tree; - } gpair->SetDevice(device_); - - // Launch one thread for each device "shard" containing a subset of rows. - // Threads will cooperatively build the tree, synchronising over histograms. - // Each thread will redundantly build its own copy of the tree - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - shard->UpdateTree(gpair, p_fmat, &trees.at(idx), &reducer_); - }); - - // All trees are expected to be identical - if (hist_maker_param_.debug_synchronize) { - this->CheckTreesSynchronized(trees); - } - - // Write the output tree - *p_tree = trees.front(); + shard_->UpdateTree(gpair, p_fmat, p_tree, &reducer_); } bool UpdatePredictionCache( const DMatrix* data, HostDeviceVector* p_out_preds) { - if (shards_.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data) { + if (shard_ == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) { return false; } monitor_.StartCuda("UpdatePredictionCache"); p_out_preds->SetDevice(device_); - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - shard->UpdatePredictionCache( - p_out_preds->DevicePointer()); - }); + dh::safe_cuda(cudaSetDevice(shard_->device_id)); + shard_->UpdatePredictionCache(p_out_preds->DevicePointer()); monitor_.StopCuda("UpdatePredictionCache"); return true; } TrainParam param_; // NOLINT common::HistogramCuts hmat_; // NOLINT - MetaInfo* info_; // NOLINT + MetaInfo* info_{}; // NOLINT - std::vector>> shards_; // NOLINT + std::unique_ptr> shard_; // NOLINT private: bool initialised_; - int n_bins_; - GPUHistMakerTrainParam hist_maker_param_; GenericParameter const* generic_param_; dh::AllReducer reducer_; DMatrix* p_last_fmat_; - int device_; + int device_{-1}; common::Monitor monitor_; }; diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index 6f425a3c2..271ac8412 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -10,17 +10,6 @@ using xgboost::common::Span; -struct Shard { int id; }; - -TEST(DeviceHelpers, Basic) { - std::vector shards (4); - for (int i = 0; i < 4; ++i) { - shards[i].id = i; - } - int sum = dh::ReduceShards(&shards, [](Shard& s) { return s.id ; }); - ASSERT_EQ(sum, 6); -} - void CreateTestData(xgboost::bst_uint num_rows, int max_row_size, thrust::host_vector *row_ptr, thrust::host_vector *rows) { diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu index 1ab4348c8..1869576d8 100644 --- a/tests/cpp/common/test_host_device_vector.cu +++ b/tests/cpp/common/test_host_device_vector.cu @@ -38,19 +38,19 @@ void InitHostDeviceVector(size_t n, int device, HostDeviceVector *v) { ASSERT_EQ(v->Size(), n); ASSERT_EQ(v->DeviceIdx(), device); // ensure that the device have read-write access - ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead)); - ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kWrite)); + ASSERT_TRUE(v->DeviceCanRead()); + ASSERT_TRUE(v->DeviceCanWrite()); // ensure that the host has no access - ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite)); - 
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead)); + ASSERT_FALSE(v->HostCanRead()); + ASSERT_FALSE(v->HostCanWrite()); // fill in the data on the host std::vector& data_h = v->HostVector(); // ensure that the host has full access, while the device have none - ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead)); - ASSERT_TRUE(v->HostCanAccess(GPUAccess::kWrite)); - ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kRead)); - ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kWrite)); + ASSERT_TRUE(v->HostCanRead()); + ASSERT_TRUE(v->HostCanWrite()); + ASSERT_FALSE(v->DeviceCanRead()); + ASSERT_FALSE(v->DeviceCanWrite()); ASSERT_EQ(data_h.size(), n); std::copy_n(thrust::make_counting_iterator(0), n, data_h.begin()); } @@ -62,76 +62,62 @@ void PlusOne(HostDeviceVector *v) { [=]__device__(unsigned int a){ return a + 1; }); } -void CheckDevice(HostDeviceVector *v, - const std::vector& starts, - const std::vector& sizes, - unsigned int first, GPUAccess access) { - int n_devices = sizes.size(); - ASSERT_EQ(n_devices, 1); - for (int i = 0; i < n_devices; ++i) { - ASSERT_EQ(v->DeviceSize(), sizes.at(i)); - SetDevice(i); - ASSERT_TRUE(thrust::equal(v->tcbegin(), v->tcend(), - thrust::make_counting_iterator(first + starts[i]))); - ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead)); - // ensure that the device has at most the access specified by access - ASSERT_EQ(v->DeviceCanAccess(GPUAccess::kWrite), access == GPUAccess::kWrite); - } - ASSERT_EQ(v->HostCanAccess(GPUAccess::kRead), access == GPUAccess::kRead); - ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite)); - for (int i = 0; i < n_devices; ++i) { - SetDevice(i); - ASSERT_TRUE(thrust::equal(v->tbegin(), v->tend(), - thrust::make_counting_iterator(first + starts[i]))); - ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead)); - ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kWrite)); - } - ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead)); - ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite)); +void CheckDevice(HostDeviceVector* v, + size_t size, + unsigned int first, + GPUAccess access) { + ASSERT_EQ(v->Size(), size); + SetDevice(v->DeviceIdx()); + + ASSERT_TRUE(thrust::equal(v->tcbegin(), v->tcend(), + thrust::make_counting_iterator(first))); + ASSERT_TRUE(v->DeviceCanRead()); + // ensure that the device has at most the access specified by access + ASSERT_EQ(v->DeviceCanWrite(), access == GPUAccess::kWrite); + ASSERT_EQ(v->HostCanRead(), access == GPUAccess::kRead); + ASSERT_FALSE(v->HostCanWrite()); + + ASSERT_TRUE(thrust::equal(v->tbegin(), v->tend(), + thrust::make_counting_iterator(first))); + ASSERT_TRUE(v->DeviceCanRead()); + ASSERT_TRUE(v->DeviceCanWrite()); + ASSERT_FALSE(v->HostCanRead()); + ASSERT_FALSE(v->HostCanWrite()); } void CheckHost(HostDeviceVector *v, GPUAccess access) { - const std::vector& data_h = access == GPUAccess::kWrite ? + const std::vector& data_h = access == GPUAccess::kNone ? 
v->HostVector() : v->ConstHostVector(); for (size_t i = 0; i < v->Size(); ++i) { ASSERT_EQ(data_h.at(i), i + 1); } - ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead)); - ASSERT_EQ(v->HostCanAccess(GPUAccess::kWrite), access == GPUAccess::kWrite); - size_t n_devices = 1; - for (int i = 0; i < n_devices; ++i) { - ASSERT_EQ(v->DeviceCanAccess(GPUAccess::kRead), access == GPUAccess::kRead); - // the devices should have no write access - ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kWrite)); - } + ASSERT_TRUE(v->HostCanRead()); + ASSERT_EQ(v->HostCanWrite(), access == GPUAccess::kNone); + ASSERT_EQ(v->DeviceCanRead(), access == GPUAccess::kRead); + // the devices should have no write access + ASSERT_FALSE(v->DeviceCanWrite()); } -void TestHostDeviceVector -(size_t n, int device, - const std::vector& starts, const std::vector& sizes) { +void TestHostDeviceVector(size_t n, int device) { HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVector v; InitHostDeviceVector(n, device, &v); - CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead); + CheckDevice(&v, n, 0, GPUAccess::kRead); PlusOne(&v); - CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite); + CheckDevice(&v, n, 1, GPUAccess::kWrite); CheckHost(&v, GPUAccess::kRead); - CheckHost(&v, GPUAccess::kWrite); + CheckHost(&v, GPUAccess::kNone); } -TEST(HostDeviceVector, TestBlock) { +TEST(HostDeviceVector, Basic) { size_t n = 1001; int device = 0; - std::vector starts{0}; - std::vector sizes{1001}; - TestHostDeviceVector(n, device, starts, sizes); + TestHostDeviceVector(n, device); } -TEST(HostDeviceVector, TestCopy) { +TEST(HostDeviceVector, Copy) { size_t n = 1001; int device = 0; - std::vector starts{0}; - std::vector sizes{1001}; HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVector v; @@ -141,14 +127,14 @@ TEST(HostDeviceVector, TestCopy) { InitHostDeviceVector(n, device, &v1); v = v1; } - CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead); + CheckDevice(&v, n, 0, GPUAccess::kRead); PlusOne(&v); - CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite); + CheckDevice(&v, n, 1, GPUAccess::kWrite); CheckHost(&v, GPUAccess::kRead); - CheckHost(&v, GPUAccess::kWrite); + CheckHost(&v, GPUAccess::kNone); } -TEST(HostDeviceVector, Shard) { +TEST(HostDeviceVector, SetDevice) { std::vector h_vec (2345); for (size_t i = 0; i < h_vec.size(); ++i) { h_vec[i] = i; @@ -157,7 +143,6 @@ TEST(HostDeviceVector, Shard) { auto device = 0; vec.SetDevice(device); - ASSERT_EQ(vec.DeviceSize(), h_vec.size()); ASSERT_EQ(vec.Size(), h_vec.size()); auto span = vec.DeviceSpan(); // sync to device @@ -169,39 +154,26 @@ TEST(HostDeviceVector, Shard) { ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin())); } -TEST(HostDeviceVector, Reshard) { - std::vector h_vec (2345); - for (size_t i = 0; i < h_vec.size(); ++i) { - h_vec[i] = i; - } - HostDeviceVector vec (h_vec); - auto device = 0; - - vec.SetDevice(device); - ASSERT_EQ(vec.DeviceSize(), h_vec.size()); - ASSERT_EQ(vec.Size(), h_vec.size()); - PlusOne(&vec); - - vec.SetDevice(-1); - ASSERT_EQ(vec.Size(), h_vec.size()); - ASSERT_EQ(vec.DeviceIdx(), -1); - - auto h_vec_1 = vec.HostVector(); - for (size_t i = 0; i < h_vec_1.size(); ++i) { - ASSERT_EQ(h_vec_1.at(i), i + 1); - } -} - TEST(HostDeviceVector, Span) { HostDeviceVector vec {1.0f, 2.0f, 3.0f, 4.0f}; vec.SetDevice(0); auto span = vec.DeviceSpan(); - ASSERT_EQ(vec.DeviceSize(), span.size()); + ASSERT_EQ(vec.Size(), span.size()); ASSERT_EQ(vec.DevicePointer(), span.data()); auto const_span = 
vec.ConstDeviceSpan(); - ASSERT_EQ(vec.DeviceSize(), span.size()); - ASSERT_EQ(vec.ConstDevicePointer(), span.data()); + ASSERT_EQ(vec.Size(), const_span.size()); + ASSERT_EQ(vec.ConstDevicePointer(), const_span.data()); } +TEST(HostDeviceVector, MGPU_Basic) { + if (AllVisibleGPUs() < 2) { + LOG(WARNING) << "Not testing in multi-gpu environment."; + return; + } + + size_t n = 1001; + int device = 1; + TestHostDeviceVector(n, device); +} } // namespace common } // namespace xgboost diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index 9a50e76f9..15ee3732c 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -83,8 +83,8 @@ TEST(gpu_predictor, ExternalMemoryTest) { std::string file1 = tmpdir.path + "/big_1.libsvm"; std::string file2 = tmpdir.path + "/big_2.libsvm"; dmats.push_back(CreateSparsePageDMatrix(9, 64UL, file0)); -// dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1)); -// dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2)); + dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1)); + dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2)); for (const auto& dmat: dmats) { dmat->Info().base_margin_.Resize(dmat->Info().num_row_ * n_classes, 0.5); diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index df811c413..280809cfa 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -113,7 +113,7 @@ TEST(GpuHist, BuildGidxDense) { {"max_leaves", "0"}, }; param.Init(args); - DeviceShard shard(0, 0, 0, kNRows, param, kNCols, kNCols); + DeviceShard shard(0, kNRows, param, kNCols, kNCols); BuildGidx(&shard, kNRows, kNCols); std::vector h_gidx_buffer(shard.gidx_buffer.size()); @@ -154,8 +154,7 @@ TEST(GpuHist, BuildGidxSparse) { }; param.Init(args); - DeviceShard shard(0, 0, 0, kNRows, param, kNCols, - kNCols); + DeviceShard shard(0, kNRows, param, kNCols, kNCols); BuildGidx(&shard, kNRows, kNCols, 0.9f); std::vector h_gidx_buffer(shard.gidx_buffer.size()); @@ -200,8 +199,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { {"max_leaves", "0"}, }; param.Init(args); - DeviceShard shard(0, 0, 0, kNRows, param, kNCols, - kNCols); + DeviceShard shard(0, kNRows, param, kNCols, kNCols); BuildGidx(&shard, kNRows, kNCols); xgboost::SimpleLCG gen; @@ -303,8 +301,7 @@ TEST(GpuHist, EvaluateSplits) { // Initialize DeviceShard std::unique_ptr> shard{ - new DeviceShard(0, 0, 0, kNRows, param, kNCols, - kNCols)}; + new DeviceShard(0, kNRows, param, kNCols, kNCols)}; // Initialize DeviceShard::node_sum_gradients shard->node_sum_gradients = {{6.4f, 12.8f}}; @@ -391,24 +388,20 @@ void TestHistogramIndexImpl() { hist_maker_ext.Configure(training_params, &generic_param); hist_maker_ext.InitDataOnce(hist_maker_ext_dmat.get()); - ASSERT_EQ(hist_maker.shards_.size(), hist_maker_ext.shards_.size()); - - // Extract the device shards from the histogram makers and from that its compressed + // Extract the device shard from the histogram makers and from that its compressed // histogram index - for (size_t i = 0; i < hist_maker.shards_.size(); ++i) { - const auto &dev_shard = hist_maker.shards_[i]; - std::vector h_gidx_buffer(dev_shard->gidx_buffer.size()); - dh::CopyDeviceSpanToVector(&h_gidx_buffer, dev_shard->gidx_buffer); + const auto &dev_shard = hist_maker.shard_; + std::vector h_gidx_buffer(dev_shard->gidx_buffer.size()); + dh::CopyDeviceSpanToVector(&h_gidx_buffer, dev_shard->gidx_buffer); - const auto &dev_shard_ext = 
hist_maker_ext.shards_[i]; - std::vector h_gidx_buffer_ext(dev_shard_ext->gidx_buffer.size()); - dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, dev_shard_ext->gidx_buffer); + const auto &dev_shard_ext = hist_maker_ext.shard_; + std::vector h_gidx_buffer_ext(dev_shard_ext->gidx_buffer.size()); + dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, dev_shard_ext->gidx_buffer); - ASSERT_EQ(dev_shard->n_bins, dev_shard_ext->n_bins); - ASSERT_EQ(dev_shard->gidx_buffer.size(), dev_shard_ext->gidx_buffer.size()); + ASSERT_EQ(dev_shard->n_bins, dev_shard_ext->n_bins); + ASSERT_EQ(dev_shard->gidx_buffer.size(), dev_shard_ext->gidx_buffer.size()); - ASSERT_EQ(h_gidx_buffer, h_gidx_buffer_ext); - } + ASSERT_EQ(h_gidx_buffer, h_gidx_buffer_ext); } TEST(GpuHist, TestHistogramIndex) {