further cleanup of single process multi-GPU code (#4810)
* use subspan in gpu predictor instead of copying
* Revise `HostDeviceVector`
@@ -238,8 +238,7 @@ class MemoryLogger {
|
||||
device_allocations.erase(itr);
|
||||
}
|
||||
};
|
||||
std::map<int, DeviceStats>
|
||||
stats_; // Map device ordinal to memory information
|
||||
DeviceStats stats_;
|
||||
std::mutex mutex_;
|
||||
|
||||
public:
|
||||
@@ -249,8 +248,8 @@ public:
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
int current_device;
|
||||
safe_cuda(cudaGetDevice(¤t_device));
|
||||
stats_[current_device].RegisterAllocation(ptr, n);
|
||||
CHECK_LE(stats_[current_device].peak_allocated_bytes, dh::TotalMemory(current_device));
|
||||
stats_.RegisterAllocation(ptr, n);
|
||||
CHECK_LE(stats_.peak_allocated_bytes, dh::TotalMemory(current_device));
|
||||
}
|
||||
void RegisterDeallocation(void *ptr, size_t n) {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
|
||||
@@ -258,19 +257,19 @@ public:
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
int current_device;
|
||||
safe_cuda(cudaGetDevice(¤t_device));
|
||||
stats_[current_device].RegisterDeallocation(ptr, n, current_device);
|
||||
stats_.RegisterDeallocation(ptr, n, current_device);
|
||||
}
|
||||
void Log() {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
|
||||
return;
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
for (const auto &kv : stats_) {
|
||||
LOG(CONSOLE) << "======== Device " << kv.first << " Memory Allocations: "
|
||||
<< " ========";
|
||||
LOG(CONSOLE) << "Peak memory usage: "
|
||||
<< kv.second.peak_allocated_bytes / 1000000 << "mb";
|
||||
LOG(CONSOLE) << "Number of allocations: " << kv.second.num_allocations;
|
||||
}
|
||||
int current_device;
|
||||
safe_cuda(cudaGetDevice(¤t_device));
|
||||
LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
|
||||
<< " ========";
|
||||
LOG(CONSOLE) << "Peak memory usage: "
|
||||
<< stats_.peak_allocated_bytes / 1000000 << "mb";
|
||||
LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
|
||||
}
|
||||
};
|
||||
};
|
||||
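Reviewer note: with one CUDA device per process, MemoryLogger drops the per-ordinal map and keeps a single DeviceStats. A minimal sketch of the bookkeeping this implies is below; only peak_allocated_bytes, num_allocations and RegisterAllocation(ptr, n) appear in the diff, the remaining field and the struct name are assumptions:

#include <algorithm>
#include <cstddef>

// Sketch: single-device allocation statistics (not the tree's code).
struct DeviceStatsSketch {
  size_t currently_allocated_bytes{0};  // assumed backing field
  size_t peak_allocated_bytes{0};
  size_t num_allocations{0};

  void RegisterAllocation(void* ptr, size_t n) {
    (void)ptr;  // the real tracker also records ptr so deallocations can be sized
    currently_allocated_bytes += n;
    peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes);
    num_allocations += 1;
  }
};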
@@ -940,10 +939,9 @@ class AllReducer {
   size_t allreduce_calls_;  // Keep statistics of the number of reduce calls
   std::vector<size_t> host_data;  // Used for all reduce on host
 #ifdef XGBOOST_USE_NCCL
-  std::vector<ncclComm_t> comms;
-  std::vector<cudaStream_t> streams;
-  std::vector<int> device_ordinals;  // device id from CUDA
-  std::vector<int> device_counts;  // device count from CUDA
+  ncclComm_t comm;
+  cudaStream_t stream;
+  int device_ordinal;
   ncclUniqueId id;
 #endif

@@ -952,79 +950,28 @@ class AllReducer {
       allreduce_calls_(0) {}

-  /**
-   * \brief If we are using a single GPU only
-   */
-  bool IsSingleGPU() {
-#ifdef XGBOOST_USE_NCCL
-    CHECK(device_counts.size() > 0) << "AllReducer not initialised.";
-    return device_counts.size() <= 1 && device_counts.at(0) == 1;
-#else
-    return true;
-#endif
-  }
-
   /**
-   * \brief Initialise with the desired device ordinals for this communication
+   * \brief Initialise with the desired device ordinal for this communication
    * group.
    *
-   * \param device_ordinals The device ordinals.
+   * \param device_ordinal The device ordinal.
    */
-  void Init(const std::vector<int> &device_ordinals) {
+  void Init(int _device_ordinal) {
 #ifdef XGBOOST_USE_NCCL
-    /** \brief this >monitor . init. */
-    this->device_ordinals = device_ordinals;
-    this->device_counts.resize(rabit::GetWorldSize());
-    this->comms.resize(device_ordinals.size());
-    this->streams.resize(device_ordinals.size());
-    this->id = GetUniqueId();
-
-    device_counts.at(rabit::GetRank()) = device_ordinals.size();
-    for (size_t i = 0; i < device_counts.size(); i++) {
-      int dev_count = device_counts.at(i);
-      rabit::Allreduce<rabit::op::Sum, int>(&dev_count, 1);
-      device_counts.at(i) = dev_count;
-    }
-
-    int nccl_rank = 0;
-    int nccl_rank_offset = std::accumulate(device_counts.begin(),
-                                           device_counts.begin() + rabit::GetRank(), 0);
-    int nccl_nranks = std::accumulate(device_counts.begin(),
-                                      device_counts.end(), 0);
-    nccl_rank += nccl_rank_offset;
-
-    GroupStart();
-    for (size_t i = 0; i < device_ordinals.size(); i++) {
-      int dev = device_ordinals.at(i);
-      dh::safe_cuda(cudaSetDevice(dev));
-      dh::safe_nccl(ncclCommInitRank(
-          &comms.at(i),
-          nccl_nranks, id,
-          nccl_rank));
-
-      nccl_rank++;
-    }
-    GroupEnd();
-
-    for (size_t i = 0; i < device_ordinals.size(); i++) {
-      safe_cuda(cudaSetDevice(device_ordinals.at(i)));
-      safe_cuda(cudaStreamCreate(&streams.at(i)));
-    }
+    device_ordinal = _device_ordinal;
+    id = GetUniqueId();
+    dh::safe_cuda(cudaSetDevice(device_ordinal));
+    dh::safe_nccl(ncclCommInitRank(&comm, rabit::GetWorldSize(), id, rabit::GetRank()));
+    safe_cuda(cudaStreamCreate(&stream));
     initialised_ = true;
-#else
-    CHECK_EQ(device_ordinals.size(), 1)
-        << "XGBoost must be compiled with NCCL to use more than one GPU.";
 #endif
   }
   ~AllReducer() {
 #ifdef XGBOOST_USE_NCCL
     if (initialised_) {
-      for (auto &stream : streams) {
-        dh::safe_cuda(cudaStreamDestroy(stream));
-      }
-      for (auto &comm : comms) {
-        ncclCommDestroy(comm);
-      }
+      dh::safe_cuda(cudaStreamDestroy(stream));
+      ncclCommDestroy(comm);
     }
     if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
       LOG(CONSOLE) << "======== NCCL Statistics========";
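Reviewer note: Init now assumes exactly one GPU per rabit worker, so the NCCL clique is sized by rabit::GetWorldSize() and each worker contributes the single rank rabit::GetRank(). A hedged sketch of the calling pattern; the reducer variable and the device-selection policy are illustrative, not part of this diff:

// Each worker process joins a world-sized NCCL communicator with one device.
dh::AllReducer reducer;
int n_devices = 1;
dh::safe_cuda(cudaGetDeviceCount(&n_devices));
int device = rabit::GetRank() % n_devices;  // assumed placement policy
reducer.Init(device);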
@@ -1035,20 +982,21 @@ class AllReducer {
   }

   /**
-   * \brief Use in exactly the same way as ncclGroupStart
+   * \brief Allreduce. Use in exactly the same way as NCCL but without needing
+   * streams or comms.
+   *
+   * \param sendbuff The sendbuff.
+   * \param recvbuff The recvbuff.
+   * \param count Number of elements.
    */
-  void GroupStart() {
-#ifdef XGBOOST_USE_NCCL
-    dh::safe_nccl(ncclGroupStart());
-#endif
-  }
-
-  /**
-   * \brief Use in exactly the same way as ncclGroupEnd
-   */
-  void GroupEnd() {
+  void AllReduceSum(const double *sendbuff, double *recvbuff, int count) {
 #ifdef XGBOOST_USE_NCCL
-    dh::safe_nccl(ncclGroupEnd());
+    CHECK(initialised_);
+    dh::safe_cuda(cudaSetDevice(device_ordinal));
+    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum, comm, stream));
+    allreduce_bytes_ += count * sizeof(double);
+    allreduce_calls_ += 1;
 #endif
   }

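Reviewer note: a minimal usage sketch of the simplified overload, reusing the reducer from the Init sketch above. Only the AllReduceSum signature comes from this diff; the buffers are device-resident because ncclAllReduce operates on device memory, and the reduction runs asynchronously on the reducer's stream:

// Sum one double per worker across the cluster.
double *d_send = nullptr, *d_recv = nullptr;  // illustrative device buffers
dh::safe_cuda(cudaMalloc(&d_send, sizeof(double)));
dh::safe_cuda(cudaMalloc(&d_recv, sizeof(double)));
reducer.AllReduceSum(d_send, d_recv, 1);  // enqueued on the reducer's stream
reducer.Synchronize();                    // block until the sum lands in d_recv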
@@ -1056,51 +1004,18 @@ class AllReducer {
   * \brief Allreduce. Use in exactly the same way as NCCL but without needing
   * streams or comms.
   *
-   * \param communication_group_idx Zero-based index of the communication group.
   * \param sendbuff The sendbuff.
   * \param recvbuff The recvbuff.
   * \param count Number of elements.
   */

-  void AllReduceSum(int communication_group_idx, const double *sendbuff,
-                    double *recvbuff, int count) {
+  void AllReduceSum(const float *sendbuff, float *recvbuff, int count) {
 #ifdef XGBOOST_USE_NCCL
     CHECK(initialised_);
-    dh::safe_cuda(cudaSetDevice(device_ordinals.at(communication_group_idx)));
-    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum,
-                                comms.at(communication_group_idx),
-                                streams.at(communication_group_idx)));
-    if(communication_group_idx == 0)
-    {
-      allreduce_bytes_ += count * sizeof(double);
-      allreduce_calls_ += 1;
-    }
-#endif
-  }
-
-  /**
-   * \brief Allreduce. Use in exactly the same way as NCCL but without needing
-   * streams or comms.
-   *
-   * \param communication_group_idx Zero-based index of the communication group.
-   * \param sendbuff The sendbuff.
-   * \param recvbuff The recvbuff.
-   * \param count Number of elements.
-   */
-
-  void AllReduceSum(int communication_group_idx, const float *sendbuff,
-                    float *recvbuff, int count) {
-#ifdef XGBOOST_USE_NCCL
-    CHECK(initialised_);
-    dh::safe_cuda(cudaSetDevice(device_ordinals.at(communication_group_idx)));
-    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum,
-                                comms.at(communication_group_idx),
-                                streams.at(communication_group_idx)));
-    if(communication_group_idx == 0)
-    {
-      allreduce_bytes_ += count * sizeof(float);
-      allreduce_calls_ += 1;
-    }
+    dh::safe_cuda(cudaSetDevice(device_ordinal));
+    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm, stream));
+    allreduce_bytes_ += count * sizeof(float);
+    allreduce_calls_ += 1;
 #endif
   }

@@ -1109,21 +1024,17 @@ class AllReducer {
   *
   * \param count Number of.
   *
-   * \param communication_group_idx Zero-based index of the communication group. \param sendbuff.
   * \param sendbuff The sendbuff.
   * \param recvbuff The recvbuff.
   * \param count Number of.
   */

-  void AllReduceSum(int communication_group_idx, const int64_t *sendbuff,
-                    int64_t *recvbuff, int count) {
+  void AllReduceSum(const int64_t *sendbuff, int64_t *recvbuff, int count) {
 #ifdef XGBOOST_USE_NCCL
     CHECK(initialised_);

-    dh::safe_cuda(cudaSetDevice(device_ordinals[communication_group_idx]));
-    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclInt64, ncclSum,
-                                comms[communication_group_idx],
-                                streams[communication_group_idx]));
+    dh::safe_cuda(cudaSetDevice(device_ordinal));
+    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclInt64, ncclSum, comm, stream));
 #endif
   }

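Reviewer note: after this change the three overloads differ only in the ncclDataType_t constant. A type-traits sketch that would collapse them into a single template; this is not part of the commit and the names are illustrative:

// Map element types to NCCL datatype tags.
template <typename T> struct NcclType;
template <> struct NcclType<float>   { static constexpr ncclDataType_t kType = ncclFloat; };
template <> struct NcclType<double>  { static constexpr ncclDataType_t kType = ncclDouble; };
template <> struct NcclType<int64_t> { static constexpr ncclDataType_t kType = ncclInt64; };
// Usage: ncclAllReduce(sendbuff, recvbuff, count, NcclType<T>::kType, ncclSum, comm, stream);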
@@ -1134,26 +1045,8 @@ class AllReducer {
   */
  void Synchronize() {
 #ifdef XGBOOST_USE_NCCL
-    for (size_t i = 0; i < device_ordinals.size(); i++) {
-      dh::safe_cuda(cudaSetDevice(device_ordinals[i]));
-      dh::safe_cuda(cudaStreamSynchronize(streams[i]));
-    }
-#endif
-  };
-
-  /**
-   * \brief Synchronizes the device
-   *
-   * \param device_id Identifier for the device.
-   */
-  void Synchronize(int device_id) {
-#ifdef XGBOOST_USE_NCCL
-    SaveCudaContext([&]() {
-      dh::safe_cuda(cudaSetDevice(device_id));
-      int idx = std::find(device_ordinals.begin(), device_ordinals.end(), device_id) - device_ordinals.begin();
-      CHECK(idx < device_ordinals.size());
-      dh::safe_cuda(cudaStreamSynchronize(streams[idx]));
-    });
+    dh::safe_cuda(cudaSetDevice(device_ordinal));
+    dh::safe_cuda(cudaStreamSynchronize(stream));
 #endif
   };

@@ -1219,58 +1112,6 @@ class AllReducer {
   }
 };

-/**
- * \brief Executes some operation on each element of the input vector, using a
- * single controlling thread for each element. In addition, passes the shard index
- * into the function.
- *
- * \tparam T         Generic type parameter.
- * \tparam FunctionT Type of the function t.
- * \param shards The shards.
- * \param f      The func_t to process.
- */
-
-template <typename T, typename FunctionT>
-void ExecuteIndexShards(std::vector<T> *shards, FunctionT f) {
-  SaveCudaContext{[&]() {
-    // Temporarily turn off dynamic so we have a guaranteed number of threads
-    bool dynamic = omp_get_dynamic();
-    omp_set_dynamic(false);
-    const long shards_size = static_cast<long>(shards->size());
-#pragma omp parallel for schedule(static, 1) if (shards_size > 1) num_threads(shards_size)
-    for (long shard = 0; shard < shards_size; ++shard) {
-      f(shard, shards->at(shard));
-    }
-    omp_set_dynamic(dynamic);
-  }};
-}
-
-/**
- * \brief Executes some operation on each element of the input vector, using a single controlling
- * thread for each element, returns the sum of the results.
- *
- * \tparam ReduceT   Type of the reduce t.
- * \tparam T         Generic type parameter.
- * \tparam FunctionT Type of the function t.
- * \param shards The shards.
- * \param f      The func_t to process.
- *
- * \return A reduce_t.
- */
-
-template <typename ReduceT, typename ShardT, typename FunctionT>
-ReduceT ReduceShards(std::vector<ShardT> *shards, FunctionT f) {
-  std::vector<ReduceT> sums(shards->size());
-  SaveCudaContext {
-    [&](){
-#pragma omp parallel for schedule(static, 1) if (shards->size() > 1)
-      for (int shard = 0; shard < shards->size(); ++shard) {
-        sums[shard] = f(shards->at(shard));
-      }}
-  };
-  return std::accumulate(sums.begin(), sums.end(), ReduceT());
-}
-
 template <typename T,
           typename IndexT = typename xgboost::common::Span<T>::index_type>
 xgboost::common::Span<T> ToSpan(
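Reviewer note: the deleted helpers implemented a one-OpenMP-thread-per-shard idiom that is obsolete with a single shard per process. For reference, a standalone sketch of the pattern, mirroring the removed ReduceShards minus the SaveCudaContext wrapper:

#include <numeric>
#include <vector>
#include <omp.h>

// One controlling CPU thread per shard; per-shard results are summed on return.
template <typename ReduceT, typename ShardT, typename FunctionT>
ReduceT ReduceShardsSketch(std::vector<ShardT>* shards, FunctionT f) {
  std::vector<ReduceT> sums(shards->size());
  const int n = static_cast<int>(shards->size());
#pragma omp parallel for schedule(static, 1) if (n > 1)
  for (int shard = 0; shard < n; ++shard) {
    sums[shard] = f(shards->at(shard));
  }
  return std::accumulate(sums.begin(), sums.end(), ReduceT());
}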
@@ -108,9 +108,6 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
   impl_->Vec().resize(new_size, v);
 }

-template <typename T>
-size_t HostDeviceVector<T>::DeviceSize() const { return 0; }
-
 template <typename T>
 void HostDeviceVector<T>::Fill(T v) {
   std::fill(HostVector().begin(), HostVector().end(), v);
@@ -135,12 +132,22 @@ void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
 }

 template <typename T>
-bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
+bool HostDeviceVector<T>::HostCanRead() const {
   return true;
 }

 template <typename T>
-bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
+bool HostDeviceVector<T>::HostCanWrite() const {
   return true;
 }

+template <typename T>
+bool HostDeviceVector<T>::DeviceCanRead() const {
+  return false;
+}
+
+template <typename T>
+bool HostDeviceVector<T>::DeviceCanWrite() const {
+  return false;
+}

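Reviewer note: on CPU-only builds the new predicates are constants (host side always readable and writable, device side never), so callers can branch uniformly across builds. An illustrative sketch, not code from the tree:

// Prefer the device path only when the vector actually holds device data;
// on CPU-only builds DeviceCanRead() is always false and the host branch runs.
template <typename T>
void ProcessSketch(xgboost::HostDeviceVector<T>* data) {
  if (data->DeviceCanRead()) {
    // operate on data->DeviceSpan() (CUDA builds only)
  } else {
    auto& h = data->HostVector();
    // operate on h
  }
}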
@@ -19,33 +19,12 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) {
   cudaSetDeviceHandler = handler;
 }

-// wrapper over access with useful methods
-class Permissions {
-  GPUAccess access_;
-  explicit Permissions(GPUAccess access) : access_{access} {}
-
- public:
-  Permissions() : access_{GPUAccess::kNone} {}
-  explicit Permissions(bool perm)
-      : access_(perm ? GPUAccess::kWrite : GPUAccess::kNone) {}
-
-  bool CanRead() const { return access_ >= kRead; }
-  bool CanWrite() const { return access_ == kWrite; }
-  bool CanAccess(GPUAccess access) const { return access_ >= access; }
-  void Grant(GPUAccess access) { access_ = std::max(access_, access); }
-  void DenyComplementary(GPUAccess compl_access) {
-    access_ = std::min(access_, GPUAccess::kWrite - compl_access);
-  }
-  Permissions Complementary() const {
-    return Permissions(GPUAccess::kWrite - access_);
-  }
-};
-
 template <typename T>
 class HostDeviceVectorImpl {
  public:
-  HostDeviceVectorImpl(size_t size, T v, int device) : device_(device), perm_h_(device < 0) {
+  HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) {
     if (device >= 0) {
+      gpu_access_ = GPUAccess::kWrite;
       SetDevice();
       data_d_.resize(size, v);
     } else {
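Reviewer note: the deleted Permissions class stored the host's access level and derived the device's as the complement (GPUAccess::kWrite - access_). The replacement stores the device's level directly in gpu_access_ and derives host rights by comparison. A sketch of the correspondence, using the default enumeration kNone=0, kRead=1, kWrite=2:

// Old scheme: host permission stored, device permission derived.
GPUAccess host_access = GPUAccess::kRead;
GPUAccess device_derived = GPUAccess::kWrite - host_access;  // kRead: both sides may read

// New scheme: device permission stored, host rights derived by comparison.
GPUAccess gpu_access = GPUAccess::kRead;
bool host_can_read  = gpu_access <= GPUAccess::kRead;  // false only while the device holds kWrite
bool host_can_write = gpu_access == GPUAccess::kNone;  // host writes only when the device has no access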
@@ -53,19 +32,11 @@ class HostDeviceVectorImpl {
     }
   }

-  // required, as a new std::mutex has to be created
-  HostDeviceVectorImpl(const HostDeviceVectorImpl<T>& other)
-      : device_(other.device_), data_h_(other.data_h_), perm_h_(other.perm_h_), mutex_() {
-    if (device_ >= 0) {
-      SetDevice();
-      data_d_ = other.data_d_;
-    }
-  }
-
   // Initializer can be std::vector<T> or std::initializer_list<T>
   template <class Initializer>
-  HostDeviceVectorImpl(const Initializer& init, int device) : device_(device), perm_h_(device < 0) {
+  HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) {
     if (device >= 0) {
+      gpu_access_ = GPUAccess::kWrite;
       LazyResizeDevice(init.size());
       Copy(init);
     } else {
@@ -79,7 +50,7 @@ class HostDeviceVectorImpl {
     }
   }

-  size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : data_d_.size(); }
+  size_t Size() const { return HostCanRead() ? data_h_.size() : data_d_.size(); }

   int DeviceIdx() const { return device_; }

@@ -95,18 +66,13 @@ class HostDeviceVectorImpl {

   common::Span<T> DeviceSpan() {
     LazySyncDevice(GPUAccess::kWrite);
-    return {data_d_.data().get(), static_cast<typename common::Span<T>::index_type>(DeviceSize())};
+    return {data_d_.data().get(), static_cast<typename common::Span<T>::index_type>(Size())};
   }

   common::Span<const T> ConstDeviceSpan() {
     LazySyncDevice(GPUAccess::kRead);
     using SpanInd = typename common::Span<const T>::index_type;
-    return {data_d_.data().get(), static_cast<SpanInd>(DeviceSize())};
-  }
-
-  size_t DeviceSize() {
-    LazySyncDevice(GPUAccess::kRead);
-    return data_d_.size();
+    return {data_d_.data().get(), static_cast<SpanInd>(Size())};
   }

   thrust::device_ptr<T> tbegin() {  // NOLINT
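Reviewer note: these spans are what lets the GPU predictor take a subspan of the device buffer instead of copying, per the commit title. A hedged sketch; preds, offset and len are illustrative names:

// View one batch's slice of the prediction buffer without a copy.
xgboost::common::Span<float> all = preds->DeviceSpan();
xgboost::common::Span<float> batch = all.subspan(offset, len);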
@@ -118,55 +84,53 @@ class HostDeviceVectorImpl {
   }

   thrust::device_ptr<T> tend() {  // NOLINT
-    return tbegin() + DeviceSize();
+    return tbegin() + Size();
   }

   thrust::device_ptr<const T> tcend() {  // NOLINT
-    return tcbegin() + DeviceSize();
+    return tcbegin() + Size();
   }

   void Fill(T v) {  // NOLINT
-    if (perm_h_.CanWrite()) {
+    if (HostCanWrite()) {
       std::fill(data_h_.begin(), data_h_.end(), v);
     } else {
-      DeviceFill(v);
+      gpu_access_ = GPUAccess::kWrite;
+      SetDevice();
+      thrust::fill(data_d_.begin(), data_d_.end(), v);
     }
   }

   void Copy(HostDeviceVectorImpl<T>* other) {
     CHECK_EQ(Size(), other->Size());
     // Data is on host.
-    if (perm_h_.CanWrite() && other->perm_h_.CanWrite()) {
+    if (HostCanWrite() && other->HostCanWrite()) {
       std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
       return;
     }
     // Data is on device;
     if (device_ != other->device_) {
       SetDevice(other->device_);
     }
-    DeviceCopy(other);
+    CopyToDevice(other);
   }

   void Copy(const std::vector<T>& other) {
     CHECK_EQ(Size(), other.size());
-    if (perm_h_.CanWrite()) {
+    if (HostCanWrite()) {
       std::copy(other.begin(), other.end(), data_h_.begin());
     } else {
-      DeviceCopy(other.data());
+      CopyToDevice(other.data());
     }
   }

   void Copy(std::initializer_list<T> other) {
     CHECK_EQ(Size(), other.size());
-    if (perm_h_.CanWrite()) {
+    if (HostCanWrite()) {
       std::copy(other.begin(), other.end(), data_h_.begin());
     } else {
-      DeviceCopy(other.begin());
+      CopyToDevice(other.begin());
     }
   }

   std::vector<T>& HostVector() {
-    LazySyncHost(GPUAccess::kWrite);
+    LazySyncHost(GPUAccess::kNone);
     return data_h_;
   }

@@ -178,7 +142,7 @@ class HostDeviceVectorImpl {
   void SetDevice(int device) {
     if (device_ == device) { return; }
     if (device_ >= 0) {
-      LazySyncHost(GPUAccess::kWrite);
+      LazySyncHost(GPUAccess::kNone);
     }
     device_ = device;
     if (device_ >= 0) {
@@ -190,38 +154,37 @@ class HostDeviceVectorImpl {
     if (new_size == Size()) { return; }
     if (Size() == 0 && device_ >= 0) {
       // fast on-device resize
-      perm_h_ = Permissions(false);
+      gpu_access_ = GPUAccess::kWrite;
       SetDevice();
       data_d_.resize(new_size, v);
     } else {
       // resize on host
-      LazySyncHost(GPUAccess::kWrite);
+      LazySyncHost(GPUAccess::kNone);
       data_h_.resize(new_size, v);
     }
   }

   void LazySyncHost(GPUAccess access) {
-    if (perm_h_.CanAccess(access)) { return; }
-    if (perm_h_.CanRead()) {
+    if (HostCanAccess(access)) { return; }
+    if (HostCanRead()) {
       // data is present, just need to deny access to the device
-      perm_h_.Grant(access);
+      gpu_access_ = access;
       return;
     }
     std::lock_guard<std::mutex> lock(mutex_);
+    gpu_access_ = access;
     if (data_h_.size() != data_d_.size()) { data_h_.resize(data_d_.size()); }
     SetDevice();
     dh::safe_cuda(cudaMemcpy(data_h_.data(),
                              data_d_.data().get(),
                              data_d_.size() * sizeof(T),
                              cudaMemcpyDeviceToHost));
-    perm_h_.Grant(access);
   }

   void LazySyncDevice(GPUAccess access) {
-    if (DevicePerm().CanAccess(access)) { return; }
-    if (DevicePerm().CanRead()) {
+    if (DeviceCanAccess(access)) { return; }
+    if (DeviceCanRead()) {
       // deny read to the host
       std::lock_guard<std::mutex> lock(mutex_);
-      perm_h_.DenyComplementary(access);
+      gpu_access_ = access;
       return;
     }
     // data is on the host
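Reviewer note: LazySyncHost/LazySyncDevice copy only when the requesting side cannot already read; otherwise they just move gpu_access_. A walk-through sketch (the vector and its construction are illustrative; the transitions follow the code above):

HostDeviceVector<float> v(16, 0.0f, /*device=*/0);  // created on device: gpu_access_ = kWrite
v.ConstHostVector();  // host cannot read yet -> cudaMemcpy device-to-host, gpu_access_ = kRead
v.ConstHostVector();  // host can already read -> no copy, no state change
v.HostVector();       // data already on host -> no copy, device access dropped (kNone)
v.DeviceSpan();       // device cannot read -> copy host-to-device, gpu_access_ = kWrite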
@@ -231,41 +194,37 @@ class HostDeviceVectorImpl {
                              data_h_.data(),
                              data_d_.size() * sizeof(T),
                              cudaMemcpyHostToDevice));

     std::lock_guard<std::mutex> lock(mutex_);
-    perm_h_.DenyComplementary(access);
+    gpu_access_ = access;
   }

-  bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); }
-  bool DeviceCanAccess(GPUAccess access) { return DevicePerm().CanAccess(access); }
+  bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
+  bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
+  bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
+  bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
+  bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
+  bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }

  private:
   int device_{-1};
   std::vector<T> data_h_{};
   dh::device_vector<T> data_d_{};
-  Permissions perm_h_{false};
   // protects size_d_ and perm_h_ when updated from multiple threads
   std::mutex mutex_{};
+  GPUAccess gpu_access_{GPUAccess::kNone};

-  void DeviceFill(T v) {
-    // TODO(canonizer): avoid full copy of host data
-    LazySyncDevice(GPUAccess::kWrite);
-    SetDevice();
-    thrust::fill(data_d_.begin(), data_d_.end(), v);
+  void CopyToDevice(HostDeviceVectorImpl* other) {
+    if (other->HostCanWrite()) {
+      CopyToDevice(other->data_h_.data());
+    } else {
+      LazyResizeDevice(Size());
+      gpu_access_ = GPUAccess::kWrite;
+      SetDevice();
+      dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), other->data_d_.data().get(),
+                                    data_d_.size() * sizeof(T), cudaMemcpyDefault));
+    }
   }

-  void DeviceCopy(HostDeviceVectorImpl* other) {
-    // TODO(canonizer): avoid full copy of host data for this (but not for other)
-    LazySyncDevice(GPUAccess::kWrite);
-    other->LazySyncDevice(GPUAccess::kRead);
-    SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), other->data_d_.data().get(),
-                                  data_d_.size() * sizeof(T), cudaMemcpyDefault));
-  }
-
-  void DeviceCopy(const T* begin) {
-    // TODO(canonizer): avoid full copy of host data
-    LazySyncDevice(GPUAccess::kWrite);
+  void CopyToDevice(const T* begin) {
+    LazyResizeDevice(Size());
+    gpu_access_ = GPUAccess::kWrite;
     SetDevice();
     dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), begin,
                                   data_d_.size() * sizeof(T), cudaMemcpyDefault));
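Reviewer note: all six predicates reduce to comparisons against the single gpu_access_ value. Their interaction over a typical lifecycle, written as assertions; illustrative only, assuming a CUDA build and device >= 0:

#include <cassert>

HostDeviceVector<int> v(4, 0, /*device=*/0);   // freshly written on device: kWrite
assert(v.DeviceCanWrite() && !v.HostCanRead());
v.ConstHostVector();                           // synced for host reading: kRead
assert(v.HostCanRead() && v.DeviceCanRead() && !v.DeviceCanWrite());
v.HostVector();                                // host takes write access: kNone
assert(v.HostCanWrite() && !v.DeviceCanRead());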
@@ -285,8 +244,6 @@ class HostDeviceVectorImpl {
       (*cudaSetDeviceHandler)(device_);
     }
   }
-
-  Permissions DevicePerm() const { return perm_h_.Complementary(); }
 };

 template<typename T>
@@ -347,11 +304,6 @@ common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
   return impl_->ConstDeviceSpan();
 }

-template <typename T>
-size_t HostDeviceVector<T>::DeviceSize() const {
-  return impl_->DeviceSize();
-}
-
 template <typename T>
 thrust::device_ptr<T> HostDeviceVector<T>::tbegin() {  // NOLINT
   return impl_->tbegin();
@@ -401,13 +353,23 @@ const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
 }

 template <typename T>
-bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
-  return impl_->HostCanAccess(access);
+bool HostDeviceVector<T>::HostCanRead() const {
+  return impl_->HostCanRead();
 }

 template <typename T>
-bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
-  return impl_->DeviceCanAccess(access);
+bool HostDeviceVector<T>::HostCanWrite() const {
+  return impl_->HostCanWrite();
 }

+template <typename T>
+bool HostDeviceVector<T>::DeviceCanRead() const {
+  return impl_->DeviceCanRead();
+}
+
+template <typename T>
+bool HostDeviceVector<T>::DeviceCanWrite() const {
+  return impl_->DeviceCanWrite();
+}
+
 template <typename T>

@@ -79,16 +79,23 @@ void SetCudaSetDeviceHandler(void (*handler)(int));

 template <typename T> struct HostDeviceVectorImpl;

+/*!
+ * \brief Controls data access from the GPU.
+ *
+ * Since a `HostDeviceVector` can have data on both the host and device, access control needs to be
+ * maintained to keep the data consistent.
+ *
+ * There are 3 scenarios supported:
+ * - Data is being manipulated on device. GPU has write access, host doesn't have access.
+ * - Data is read-only on both the host and device.
+ * - Data is being manipulated on the host. Host has write access, device doesn't have access.
+ */
 enum GPUAccess {
   kNone, kRead,
   // write implies read
   kWrite
 };

 inline GPUAccess operator-(GPUAccess a, GPUAccess b) {
   return static_cast<GPUAccess>(static_cast<int>(a) - static_cast<int>(b));
 }

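Reviewer note: operator- works on the underlying integer values (kNone=0, kRead=1, kWrite=2 under default enumeration); this is the arithmetic the removed Permissions::Complementary relied on. For example:

assert(GPUAccess::kWrite - GPUAccess::kRead == GPUAccess::kRead);   // the complement of read is read
assert(GPUAccess::kWrite - GPUAccess::kNone == GPUAccess::kWrite);  // the complement of no access is write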
 template <typename T>
 class HostDeviceVector {
  public:
@@ -111,8 +118,6 @@ class HostDeviceVector {
   const T* ConstHostPointer() const { return ConstHostVector().data(); }
   const T* HostPointer() const { return ConstHostPointer(); }

-  size_t DeviceSize() const;
-
   // only define functions returning device_ptr
   // if HostDeviceVector.h is included from a .cu file
 #ifdef __CUDACC__
@@ -135,8 +140,10 @@ class HostDeviceVector {
   const std::vector<T>& ConstHostVector() const;
   const std::vector<T>& HostVector() const { return ConstHostVector(); }

-  bool HostCanAccess(GPUAccess access) const;
-  bool DeviceCanAccess(GPUAccess access) const;
+  bool HostCanRead() const;
+  bool HostCanWrite() const;
+  bool DeviceCanRead() const;
+  bool DeviceCanWrite() const;

   void SetDevice(int device) const;