further cleanup of single process multi-GPU code (#4810)
* use subspan in gpu predictor instead of copying
* Revise `HostDeviceVector`
@@ -238,8 +238,7 @@ class MemoryLogger {
|
||||
device_allocations.erase(itr);
|
||||
}
|
||||
};
|
||||
std::map<int, DeviceStats>
|
||||
stats_; // Map device ordinal to memory information
|
||||
DeviceStats stats_;
|
||||
std::mutex mutex_;
|
||||
|
||||
public:
|
||||
@@ -249,8 +248,8 @@ public:
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
int current_device;
|
||||
safe_cuda(cudaGetDevice(¤t_device));
|
||||
stats_[current_device].RegisterAllocation(ptr, n);
|
||||
CHECK_LE(stats_[current_device].peak_allocated_bytes, dh::TotalMemory(current_device));
|
||||
stats_.RegisterAllocation(ptr, n);
|
||||
CHECK_LE(stats_.peak_allocated_bytes, dh::TotalMemory(current_device));
|
||||
}
|
||||
void RegisterDeallocation(void *ptr, size_t n) {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
|
||||
@@ -258,19 +257,19 @@ public:
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
int current_device;
|
||||
safe_cuda(cudaGetDevice(¤t_device));
|
||||
stats_[current_device].RegisterDeallocation(ptr, n, current_device);
|
||||
stats_.RegisterDeallocation(ptr, n, current_device);
|
||||
}
|
||||
void Log() {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
|
||||
return;
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
for (const auto &kv : stats_) {
|
||||
LOG(CONSOLE) << "======== Device " << kv.first << " Memory Allocations: "
|
||||
<< " ========";
|
||||
LOG(CONSOLE) << "Peak memory usage: "
|
||||
<< kv.second.peak_allocated_bytes / 1000000 << "mb";
|
||||
LOG(CONSOLE) << "Number of allocations: " << kv.second.num_allocations;
|
||||
}
|
||||
int current_device;
|
||||
safe_cuda(cudaGetDevice(¤t_device));
|
||||
LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
|
||||
<< " ========";
|
||||
LOG(CONSOLE) << "Peak memory usage: "
|
||||
<< stats_.peak_allocated_bytes / 1000000 << "mb";
|
||||
LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
|
||||
}
|
||||
};
|
||||
};
|
||||
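Reviewer note: with one CUDA device per process, MemoryLogger drops the per-ordinal map and keeps a single DeviceStats. A minimal sketch of the bookkeeping this implies is below; only peak_allocated_bytes, num_allocations and RegisterAllocation(ptr, n) appear in the diff, the remaining field and the struct name are assumptions:

#include <algorithm>
#include <cstddef>

// Sketch: single-device allocation statistics (not the tree's code).
struct DeviceStatsSketch {
  size_t currently_allocated_bytes{0};  // assumed backing field
  size_t peak_allocated_bytes{0};
  size_t num_allocations{0};

  void RegisterAllocation(void* ptr, size_t n) {
    (void)ptr;  // the real tracker also records ptr so deallocations can be sized
    currently_allocated_bytes += n;
    peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes);
    num_allocations += 1;
  }
};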
@@ -940,10 +939,9 @@ class AllReducer {
   size_t allreduce_calls_;  // Keep statistics of the number of reduce calls
   std::vector<size_t> host_data;  // Used for all reduce on host
 #ifdef XGBOOST_USE_NCCL
-  std::vector<ncclComm_t> comms;
-  std::vector<cudaStream_t> streams;
-  std::vector<int> device_ordinals;  // device id from CUDA
-  std::vector<int> device_counts;  // device count from CUDA
+  ncclComm_t comm;
+  cudaStream_t stream;
+  int device_ordinal;
   ncclUniqueId id;
 #endif

@@ -952,79 +950,28 @@ class AllReducer {
       allreduce_calls_(0) {}

-  /**
-   * \brief If we are using a single GPU only
-   */
-  bool IsSingleGPU() {
-#ifdef XGBOOST_USE_NCCL
-    CHECK(device_counts.size() > 0) << "AllReducer not initialised.";
-    return device_counts.size() <= 1 && device_counts.at(0) == 1;
-#else
-    return true;
-#endif
-  }
-
   /**
-   * \brief Initialise with the desired device ordinals for this communication
+   * \brief Initialise with the desired device ordinal for this communication
    * group.
    *
-   * \param device_ordinals The device ordinals.
+   * \param device_ordinal The device ordinal.
    */
-  void Init(const std::vector<int> &device_ordinals) {
+  void Init(int _device_ordinal) {
 #ifdef XGBOOST_USE_NCCL
-    /** \brief this >monitor . init. */
-    this->device_ordinals = device_ordinals;
-    this->device_counts.resize(rabit::GetWorldSize());
-    this->comms.resize(device_ordinals.size());
-    this->streams.resize(device_ordinals.size());
-    this->id = GetUniqueId();
-
-    device_counts.at(rabit::GetRank()) = device_ordinals.size();
-    for (size_t i = 0; i < device_counts.size(); i++) {
-      int dev_count = device_counts.at(i);
-      rabit::Allreduce<rabit::op::Sum, int>(&dev_count, 1);
-      device_counts.at(i) = dev_count;
-    }
-
-    int nccl_rank = 0;
-    int nccl_rank_offset = std::accumulate(device_counts.begin(),
-                                           device_counts.begin() + rabit::GetRank(), 0);
-    int nccl_nranks = std::accumulate(device_counts.begin(),
-                                      device_counts.end(), 0);
-    nccl_rank += nccl_rank_offset;
-
-    GroupStart();
-    for (size_t i = 0; i < device_ordinals.size(); i++) {
-      int dev = device_ordinals.at(i);
-      dh::safe_cuda(cudaSetDevice(dev));
-      dh::safe_nccl(ncclCommInitRank(
-          &comms.at(i),
-          nccl_nranks, id,
-          nccl_rank));
-
-      nccl_rank++;
-    }
-    GroupEnd();
-
-    for (size_t i = 0; i < device_ordinals.size(); i++) {
-      safe_cuda(cudaSetDevice(device_ordinals.at(i)));
-      safe_cuda(cudaStreamCreate(&streams.at(i)));
-    }
+    device_ordinal = _device_ordinal;
+    id = GetUniqueId();
+    dh::safe_cuda(cudaSetDevice(device_ordinal));
+    dh::safe_nccl(ncclCommInitRank(&comm, rabit::GetWorldSize(), id, rabit::GetRank()));
+    safe_cuda(cudaStreamCreate(&stream));
     initialised_ = true;
-#else
-    CHECK_EQ(device_ordinals.size(), 1)
-        << "XGBoost must be compiled with NCCL to use more than one GPU.";
 #endif
   }
   ~AllReducer() {
 #ifdef XGBOOST_USE_NCCL
     if (initialised_) {
-      for (auto &stream : streams) {
-        dh::safe_cuda(cudaStreamDestroy(stream));
-      }
-      for (auto &comm : comms) {
-        ncclCommDestroy(comm);
-      }
+      dh::safe_cuda(cudaStreamDestroy(stream));
+      ncclCommDestroy(comm);
     }
     if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
       LOG(CONSOLE) << "======== NCCL Statistics========";
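Reviewer note: Init now assumes exactly one GPU per rabit worker, so the NCCL clique is sized by rabit::GetWorldSize() and each worker contributes the single rank rabit::GetRank(). A hedged sketch of the calling pattern; the reducer variable and the device-selection policy are illustrative, not part of this diff:

// Each worker process joins a world-sized NCCL communicator with one device.
dh::AllReducer reducer;
int n_devices = 1;
dh::safe_cuda(cudaGetDeviceCount(&n_devices));
int device = rabit::GetRank() % n_devices;  // assumed placement policy
reducer.Init(device);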
@@ -1035,20 +982,21 @@ class AllReducer {
   }

   /**
-   * \brief Use in exactly the same way as ncclGroupStart
+   * \brief Allreduce. Use in exactly the same way as NCCL but without needing
+   * streams or comms.
+   *
+   * \param sendbuff The sendbuff.
+   * \param recvbuff The recvbuff.
+   * \param count Number of elements.
    */
-  void GroupStart() {
-#ifdef XGBOOST_USE_NCCL
-    dh::safe_nccl(ncclGroupStart());
-#endif
-  }
-
-  /**
-   * \brief Use in exactly the same way as ncclGroupEnd
-   */
-  void GroupEnd() {
+  void AllReduceSum(const double *sendbuff, double *recvbuff, int count) {
 #ifdef XGBOOST_USE_NCCL
-    dh::safe_nccl(ncclGroupEnd());
+    CHECK(initialised_);
+    dh::safe_cuda(cudaSetDevice(device_ordinal));
+    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum, comm, stream));
+    allreduce_bytes_ += count * sizeof(double);
+    allreduce_calls_ += 1;
 #endif
   }

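Reviewer note: a minimal usage sketch of the simplified overload, reusing the reducer from the Init sketch above. Only the AllReduceSum signature comes from this diff; the buffers are device-resident because ncclAllReduce operates on device memory, and the reduction runs asynchronously on the reducer's stream:

// Sum one double per worker across the cluster.
double *d_send = nullptr, *d_recv = nullptr;  // illustrative device buffers
dh::safe_cuda(cudaMalloc(&d_send, sizeof(double)));
dh::safe_cuda(cudaMalloc(&d_recv, sizeof(double)));
reducer.AllReduceSum(d_send, d_recv, 1);  // enqueued on the reducer's stream
reducer.Synchronize();                    // block until the sum lands in d_recv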
@@ -1056,51 +1004,18 @@ class AllReducer {
   * \brief Allreduce. Use in exactly the same way as NCCL but without needing
   * streams or comms.
   *
-   * \param communication_group_idx Zero-based index of the communication group.
   * \param sendbuff The sendbuff.
   * \param recvbuff The recvbuff.
   * \param count Number of elements.
   */

-  void AllReduceSum(int communication_group_idx, const double *sendbuff,
-                    double *recvbuff, int count) {
+  void AllReduceSum(const float *sendbuff, float *recvbuff, int count) {
 #ifdef XGBOOST_USE_NCCL
     CHECK(initialised_);
-    dh::safe_cuda(cudaSetDevice(device_ordinals.at(communication_group_idx)));
-    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum,
-                                comms.at(communication_group_idx),
-                                streams.at(communication_group_idx)));
-    if(communication_group_idx == 0)
-    {
-      allreduce_bytes_ += count * sizeof(double);
-      allreduce_calls_ += 1;
-    }
-#endif
-  }
-
-  /**
-   * \brief Allreduce. Use in exactly the same way as NCCL but without needing
-   * streams or comms.
-   *
-   * \param communication_group_idx Zero-based index of the communication group.
-   * \param sendbuff The sendbuff.
-   * \param recvbuff The recvbuff.
-   * \param count Number of elements.
-   */
-
-  void AllReduceSum(int communication_group_idx, const float *sendbuff,
-                    float *recvbuff, int count) {
-#ifdef XGBOOST_USE_NCCL
-    CHECK(initialised_);
-    dh::safe_cuda(cudaSetDevice(device_ordinals.at(communication_group_idx)));
-    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum,
-                                comms.at(communication_group_idx),
-                                streams.at(communication_group_idx)));
-    if(communication_group_idx == 0)
-    {
-      allreduce_bytes_ += count * sizeof(float);
-      allreduce_calls_ += 1;
-    }
+    dh::safe_cuda(cudaSetDevice(device_ordinal));
+    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm, stream));
+    allreduce_bytes_ += count * sizeof(float);
+    allreduce_calls_ += 1;
 #endif
   }

@@ -1109,21 +1024,17 @@ class AllReducer {
   *
   * \param count Number of.
   *
-   * \param communication_group_idx Zero-based index of the communication group. \param sendbuff.
   * \param sendbuff The sendbuff.
   * \param recvbuff The recvbuff.
   * \param count Number of.
   */

-  void AllReduceSum(int communication_group_idx, const int64_t *sendbuff,
-                    int64_t *recvbuff, int count) {
+  void AllReduceSum(const int64_t *sendbuff, int64_t *recvbuff, int count) {
 #ifdef XGBOOST_USE_NCCL
     CHECK(initialised_);

-    dh::safe_cuda(cudaSetDevice(device_ordinals[communication_group_idx]));
-    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclInt64, ncclSum,
-                                comms[communication_group_idx],
-                                streams[communication_group_idx]));
+    dh::safe_cuda(cudaSetDevice(device_ordinal));
+    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclInt64, ncclSum, comm, stream));
 #endif
   }

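Reviewer note: after this change the three overloads differ only in the ncclDataType_t constant. A type-traits sketch that would collapse them into a single template; this is not part of the commit and the names are illustrative:

// Map element types to NCCL datatype tags.
template <typename T> struct NcclType;
template <> struct NcclType<float>   { static constexpr ncclDataType_t kType = ncclFloat; };
template <> struct NcclType<double>  { static constexpr ncclDataType_t kType = ncclDouble; };
template <> struct NcclType<int64_t> { static constexpr ncclDataType_t kType = ncclInt64; };
// Usage: ncclAllReduce(sendbuff, recvbuff, count, NcclType<T>::kType, ncclSum, comm, stream);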
@@ -1134,26 +1045,8 @@ class AllReducer {
   */
  void Synchronize() {
 #ifdef XGBOOST_USE_NCCL
-    for (size_t i = 0; i < device_ordinals.size(); i++) {
-      dh::safe_cuda(cudaSetDevice(device_ordinals[i]));
-      dh::safe_cuda(cudaStreamSynchronize(streams[i]));
-    }
-#endif
-  };
-
-  /**
-   * \brief Synchronizes the device
-   *
-   * \param device_id Identifier for the device.
-   */
-  void Synchronize(int device_id) {
-#ifdef XGBOOST_USE_NCCL
-    SaveCudaContext([&]() {
-      dh::safe_cuda(cudaSetDevice(device_id));
-      int idx = std::find(device_ordinals.begin(), device_ordinals.end(), device_id) - device_ordinals.begin();
-      CHECK(idx < device_ordinals.size());
-      dh::safe_cuda(cudaStreamSynchronize(streams[idx]));
-    });
+    dh::safe_cuda(cudaSetDevice(device_ordinal));
+    dh::safe_cuda(cudaStreamSynchronize(stream));
 #endif
   };

@@ -1219,58 +1112,6 @@ class AllReducer {
   }
 };

-/**
- * \brief Executes some operation on each element of the input vector, using a
- * single controlling thread for each element. In addition, passes the shard index
- * into the function.
- *
- * \tparam T         Generic type parameter.
- * \tparam FunctionT Type of the function t.
- * \param shards The shards.
- * \param f      The func_t to process.
- */
-
-template <typename T, typename FunctionT>
-void ExecuteIndexShards(std::vector<T> *shards, FunctionT f) {
-  SaveCudaContext{[&]() {
-    // Temporarily turn off dynamic so we have a guaranteed number of threads
-    bool dynamic = omp_get_dynamic();
-    omp_set_dynamic(false);
-    const long shards_size = static_cast<long>(shards->size());
-#pragma omp parallel for schedule(static, 1) if (shards_size > 1) num_threads(shards_size)
-    for (long shard = 0; shard < shards_size; ++shard) {
-      f(shard, shards->at(shard));
-    }
-    omp_set_dynamic(dynamic);
-  }};
-}
-
-/**
- * \brief Executes some operation on each element of the input vector, using a single controlling
- * thread for each element, returns the sum of the results.
- *
- * \tparam ReduceT   Type of the reduce t.
- * \tparam T         Generic type parameter.
- * \tparam FunctionT Type of the function t.
- * \param shards The shards.
- * \param f      The func_t to process.
- *
- * \return A reduce_t.
- */
-
-template <typename ReduceT, typename ShardT, typename FunctionT>
-ReduceT ReduceShards(std::vector<ShardT> *shards, FunctionT f) {
-  std::vector<ReduceT> sums(shards->size());
-  SaveCudaContext {
-    [&](){
-#pragma omp parallel for schedule(static, 1) if (shards->size() > 1)
-      for (int shard = 0; shard < shards->size(); ++shard) {
-        sums[shard] = f(shards->at(shard));
-      }}
-  };
-  return std::accumulate(sums.begin(), sums.end(), ReduceT());
-}
-
 template <typename T,
           typename IndexT = typename xgboost::common::Span<T>::index_type>
 xgboost::common::Span<T> ToSpan(
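Reviewer note: the deleted helpers implemented a one-OpenMP-thread-per-shard idiom that is obsolete with a single shard per process. For reference, a standalone sketch of the pattern, mirroring the removed ReduceShards minus the SaveCudaContext wrapper:

#include <numeric>
#include <vector>
#include <omp.h>

// One controlling CPU thread per shard; per-shard results are summed on return.
template <typename ReduceT, typename ShardT, typename FunctionT>
ReduceT ReduceShardsSketch(std::vector<ShardT>* shards, FunctionT f) {
  std::vector<ReduceT> sums(shards->size());
  const int n = static_cast<int>(shards->size());
#pragma omp parallel for schedule(static, 1) if (n > 1)
  for (int shard = 0; shard < n; ++shard) {
    sums[shard] = f(shards->at(shard));
  }
  return std::accumulate(sums.begin(), sums.end(), ReduceT());
}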
@@ -108,9 +108,6 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
   impl_->Vec().resize(new_size, v);
 }

-template <typename T>
-size_t HostDeviceVector<T>::DeviceSize() const { return 0; }
-
 template <typename T>
 void HostDeviceVector<T>::Fill(T v) {
   std::fill(HostVector().begin(), HostVector().end(), v);
@@ -135,12 +132,22 @@ void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
 }

 template <typename T>
-bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
+bool HostDeviceVector<T>::HostCanRead() const {
   return true;
 }

 template <typename T>
-bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
+bool HostDeviceVector<T>::HostCanWrite() const {
   return true;
 }

+template <typename T>
+bool HostDeviceVector<T>::DeviceCanRead() const {
+  return false;
+}
+
+template <typename T>
+bool HostDeviceVector<T>::DeviceCanWrite() const {
+  return false;
+}

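Reviewer note: on CPU-only builds the new predicates are constants (host side always readable and writable, device side never), so callers can branch uniformly across builds. An illustrative sketch, not code from the tree:

// Prefer the device path only when the vector actually holds device data;
// on CPU-only builds DeviceCanRead() is always false and the host branch runs.
template <typename T>
void ProcessSketch(xgboost::HostDeviceVector<T>* data) {
  if (data->DeviceCanRead()) {
    // operate on data->DeviceSpan() (CUDA builds only)
  } else {
    auto& h = data->HostVector();
    // operate on h
  }
}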
@@ -19,33 +19,12 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) {
   cudaSetDeviceHandler = handler;
 }

-// wrapper over access with useful methods
-class Permissions {
-  GPUAccess access_;
-  explicit Permissions(GPUAccess access) : access_{access} {}
-
- public:
-  Permissions() : access_{GPUAccess::kNone} {}
-  explicit Permissions(bool perm)
-      : access_(perm ? GPUAccess::kWrite : GPUAccess::kNone) {}
-
-  bool CanRead() const { return access_ >= kRead; }
-  bool CanWrite() const { return access_ == kWrite; }
-  bool CanAccess(GPUAccess access) const { return access_ >= access; }
-  void Grant(GPUAccess access) { access_ = std::max(access_, access); }
-  void DenyComplementary(GPUAccess compl_access) {
-    access_ = std::min(access_, GPUAccess::kWrite - compl_access);
-  }
-  Permissions Complementary() const {
-    return Permissions(GPUAccess::kWrite - access_);
-  }
-};
-
 template <typename T>
 class HostDeviceVectorImpl {
  public:
-  HostDeviceVectorImpl(size_t size, T v, int device) : device_(device), perm_h_(device < 0) {
+  HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) {
     if (device >= 0) {
+      gpu_access_ = GPUAccess::kWrite;
       SetDevice();
       data_d_.resize(size, v);
     } else {
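Reviewer note: the deleted Permissions class stored the host's access level and derived the device's as the complement (GPUAccess::kWrite - access_). The replacement stores the device's level directly in gpu_access_ and derives host rights by comparison. A sketch of the correspondence, using the default enumeration kNone=0, kRead=1, kWrite=2:

// Old scheme: host permission stored, device permission derived.
GPUAccess host_access = GPUAccess::kRead;
GPUAccess device_derived = GPUAccess::kWrite - host_access;  // kRead: both sides may read

// New scheme: device permission stored, host rights derived by comparison.
GPUAccess gpu_access = GPUAccess::kRead;
bool host_can_read  = gpu_access <= GPUAccess::kRead;  // false only while the device holds kWrite
bool host_can_write = gpu_access == GPUAccess::kNone;  // host writes only when the device has no access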
@@ -53,19 +32,11 @@ class HostDeviceVectorImpl {
     }
   }

-  // required, as a new std::mutex has to be created
-  HostDeviceVectorImpl(const HostDeviceVectorImpl<T>& other)
-      : device_(other.device_), data_h_(other.data_h_), perm_h_(other.perm_h_), mutex_() {
-    if (device_ >= 0) {
-      SetDevice();
-      data_d_ = other.data_d_;
-    }
-  }
-
   // Initializer can be std::vector<T> or std::initializer_list<T>
   template <class Initializer>
-  HostDeviceVectorImpl(const Initializer& init, int device) : device_(device), perm_h_(device < 0) {
+  HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) {
     if (device >= 0) {
+      gpu_access_ = GPUAccess::kWrite;
       LazyResizeDevice(init.size());
       Copy(init);
     } else {
@@ -79,7 +50,7 @@ class HostDeviceVectorImpl {
     }
   }

-  size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : data_d_.size(); }
+  size_t Size() const { return HostCanRead() ? data_h_.size() : data_d_.size(); }

   int DeviceIdx() const { return device_; }

@@ -95,18 +66,13 @@ class HostDeviceVectorImpl {

   common::Span<T> DeviceSpan() {
     LazySyncDevice(GPUAccess::kWrite);
-    return {data_d_.data().get(), static_cast<typename common::Span<T>::index_type>(DeviceSize())};
+    return {data_d_.data().get(), static_cast<typename common::Span<T>::index_type>(Size())};
   }

   common::Span<const T> ConstDeviceSpan() {
     LazySyncDevice(GPUAccess::kRead);
     using SpanInd = typename common::Span<const T>::index_type;
-    return {data_d_.data().get(), static_cast<SpanInd>(DeviceSize())};
-  }
-
-  size_t DeviceSize() {
-    LazySyncDevice(GPUAccess::kRead);
-    return data_d_.size();
+    return {data_d_.data().get(), static_cast<SpanInd>(Size())};
   }

   thrust::device_ptr<T> tbegin() {  // NOLINT
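Reviewer note: these spans are what lets the GPU predictor take a subspan of the device buffer instead of copying, per the commit title. A hedged sketch; preds, offset and len are illustrative names:

// View one batch's slice of the prediction buffer without a copy.
xgboost::common::Span<float> all = preds->DeviceSpan();
xgboost::common::Span<float> batch = all.subspan(offset, len);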
@@ -118,55 +84,53 @@ class HostDeviceVectorImpl {
   }

   thrust::device_ptr<T> tend() {  // NOLINT
-    return tbegin() + DeviceSize();
+    return tbegin() + Size();
   }

   thrust::device_ptr<const T> tcend() {  // NOLINT
-    return tcbegin() + DeviceSize();
+    return tcbegin() + Size();
   }

   void Fill(T v) {  // NOLINT
-    if (perm_h_.CanWrite()) {
+    if (HostCanWrite()) {
       std::fill(data_h_.begin(), data_h_.end(), v);
     } else {
-      DeviceFill(v);
+      gpu_access_ = GPUAccess::kWrite;
+      SetDevice();
+      thrust::fill(data_d_.begin(), data_d_.end(), v);
     }
   }

   void Copy(HostDeviceVectorImpl<T>* other) {
     CHECK_EQ(Size(), other->Size());
     // Data is on host.
-    if (perm_h_.CanWrite() && other->perm_h_.CanWrite()) {
+    if (HostCanWrite() && other->HostCanWrite()) {
       std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
       return;
     }
     // Data is on device;
     if (device_ != other->device_) {
       SetDevice(other->device_);
     }
-    DeviceCopy(other);
+    CopyToDevice(other);
   }

   void Copy(const std::vector<T>& other) {
     CHECK_EQ(Size(), other.size());
-    if (perm_h_.CanWrite()) {
+    if (HostCanWrite()) {
       std::copy(other.begin(), other.end(), data_h_.begin());
     } else {
-      DeviceCopy(other.data());
+      CopyToDevice(other.data());
     }
   }

   void Copy(std::initializer_list<T> other) {
     CHECK_EQ(Size(), other.size());
-    if (perm_h_.CanWrite()) {
+    if (HostCanWrite()) {
       std::copy(other.begin(), other.end(), data_h_.begin());
     } else {
-      DeviceCopy(other.begin());
+      CopyToDevice(other.begin());
     }
   }

   std::vector<T>& HostVector() {
-    LazySyncHost(GPUAccess::kWrite);
+    LazySyncHost(GPUAccess::kNone);
     return data_h_;
   }

@@ -178,7 +142,7 @@ class HostDeviceVectorImpl {
   void SetDevice(int device) {
     if (device_ == device) { return; }
     if (device_ >= 0) {
-      LazySyncHost(GPUAccess::kWrite);
+      LazySyncHost(GPUAccess::kNone);
     }
     device_ = device;
     if (device_ >= 0) {
@@ -190,38 +154,37 @@ class HostDeviceVectorImpl {
     if (new_size == Size()) { return; }
     if (Size() == 0 && device_ >= 0) {
       // fast on-device resize
-      perm_h_ = Permissions(false);
+      gpu_access_ = GPUAccess::kWrite;
       SetDevice();
       data_d_.resize(new_size, v);
     } else {
       // resize on host
-      LazySyncHost(GPUAccess::kWrite);
+      LazySyncHost(GPUAccess::kNone);
       data_h_.resize(new_size, v);
     }
   }

   void LazySyncHost(GPUAccess access) {
-    if (perm_h_.CanAccess(access)) { return; }
-    if (perm_h_.CanRead()) {
+    if (HostCanAccess(access)) { return; }
+    if (HostCanRead()) {
       // data is present, just need to deny access to the device
-      perm_h_.Grant(access);
+      gpu_access_ = access;
       return;
     }
     std::lock_guard<std::mutex> lock(mutex_);
+    gpu_access_ = access;
     if (data_h_.size() != data_d_.size()) { data_h_.resize(data_d_.size()); }
     SetDevice();
     dh::safe_cuda(cudaMemcpy(data_h_.data(),
                              data_d_.data().get(),
                              data_d_.size() * sizeof(T),
                              cudaMemcpyDeviceToHost));
-    perm_h_.Grant(access);
   }

   void LazySyncDevice(GPUAccess access) {
-    if (DevicePerm().CanAccess(access)) { return; }
-    if (DevicePerm().CanRead()) {
+    if (DeviceCanAccess(access)) { return; }
+    if (DeviceCanRead()) {
       // deny read to the host
       std::lock_guard<std::mutex> lock(mutex_);
-      perm_h_.DenyComplementary(access);
+      gpu_access_ = access;
       return;
     }
     // data is on the host
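Reviewer note: LazySyncHost/LazySyncDevice copy only when the requesting side cannot already read; otherwise they just move gpu_access_. A walk-through sketch (the vector and its construction are illustrative; the transitions follow the code above):

HostDeviceVector<float> v(16, 0.0f, /*device=*/0);  // created on device: gpu_access_ = kWrite
v.ConstHostVector();  // host cannot read yet -> cudaMemcpy device-to-host, gpu_access_ = kRead
v.ConstHostVector();  // host can already read -> no copy, no state change
v.HostVector();       // data already on host -> no copy, device access dropped (kNone)
v.DeviceSpan();       // device cannot read -> copy host-to-device, gpu_access_ = kWrite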
@@ -231,41 +194,37 @@ class HostDeviceVectorImpl {
                              data_h_.data(),
                              data_d_.size() * sizeof(T),
                              cudaMemcpyHostToDevice));

     std::lock_guard<std::mutex> lock(mutex_);
-    perm_h_.DenyComplementary(access);
+    gpu_access_ = access;
   }

-  bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); }
-  bool DeviceCanAccess(GPUAccess access) { return DevicePerm().CanAccess(access); }
+  bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
+  bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
+  bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
+  bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
+  bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
+  bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }

  private:
   int device_{-1};
   std::vector<T> data_h_{};
   dh::device_vector<T> data_d_{};
-  Permissions perm_h_{false};
   // protects size_d_ and perm_h_ when updated from multiple threads
   std::mutex mutex_{};
+  GPUAccess gpu_access_{GPUAccess::kNone};

-  void DeviceFill(T v) {
-    // TODO(canonizer): avoid full copy of host data
-    LazySyncDevice(GPUAccess::kWrite);
-    SetDevice();
-    thrust::fill(data_d_.begin(), data_d_.end(), v);
+  void CopyToDevice(HostDeviceVectorImpl* other) {
+    if (other->HostCanWrite()) {
+      CopyToDevice(other->data_h_.data());
+    } else {
+      LazyResizeDevice(Size());
+      gpu_access_ = GPUAccess::kWrite;
+      SetDevice();
+      dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), other->data_d_.data().get(),
+                                    data_d_.size() * sizeof(T), cudaMemcpyDefault));
+    }
   }

-  void DeviceCopy(HostDeviceVectorImpl* other) {
-    // TODO(canonizer): avoid full copy of host data for this (but not for other)
-    LazySyncDevice(GPUAccess::kWrite);
-    other->LazySyncDevice(GPUAccess::kRead);
-    SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), other->data_d_.data().get(),
-                                  data_d_.size() * sizeof(T), cudaMemcpyDefault));
-  }
-
-  void DeviceCopy(const T* begin) {
-    // TODO(canonizer): avoid full copy of host data
-    LazySyncDevice(GPUAccess::kWrite);
+  void CopyToDevice(const T* begin) {
+    LazyResizeDevice(Size());
+    gpu_access_ = GPUAccess::kWrite;
     SetDevice();
     dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), begin,
                                   data_d_.size() * sizeof(T), cudaMemcpyDefault));
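Reviewer note: all six predicates reduce to comparisons against the single gpu_access_ value. Their interaction over a typical lifecycle, written as assertions; illustrative only, assuming a CUDA build and device >= 0:

#include <cassert>

HostDeviceVector<int> v(4, 0, /*device=*/0);   // freshly written on device: kWrite
assert(v.DeviceCanWrite() && !v.HostCanRead());
v.ConstHostVector();                           // synced for host reading: kRead
assert(v.HostCanRead() && v.DeviceCanRead() && !v.DeviceCanWrite());
v.HostVector();                                // host takes write access: kNone
assert(v.HostCanWrite() && !v.DeviceCanRead());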
@@ -285,8 +244,6 @@ class HostDeviceVectorImpl {
       (*cudaSetDeviceHandler)(device_);
     }
   }
-
-  Permissions DevicePerm() const { return perm_h_.Complementary(); }
 };

 template<typename T>
@@ -347,11 +304,6 @@ common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
   return impl_->ConstDeviceSpan();
 }

-template <typename T>
-size_t HostDeviceVector<T>::DeviceSize() const {
-  return impl_->DeviceSize();
-}
-
 template <typename T>
 thrust::device_ptr<T> HostDeviceVector<T>::tbegin() {  // NOLINT
   return impl_->tbegin();
@@ -401,13 +353,23 @@ const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
 }

 template <typename T>
-bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
-  return impl_->HostCanAccess(access);
+bool HostDeviceVector<T>::HostCanRead() const {
+  return impl_->HostCanRead();
 }

 template <typename T>
-bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
-  return impl_->DeviceCanAccess(access);
+bool HostDeviceVector<T>::HostCanWrite() const {
+  return impl_->HostCanWrite();
 }

+template <typename T>
+bool HostDeviceVector<T>::DeviceCanRead() const {
+  return impl_->DeviceCanRead();
+}
+
+template <typename T>
+bool HostDeviceVector<T>::DeviceCanWrite() const {
+  return impl_->DeviceCanWrite();
+}
+
 template <typename T>

@@ -79,16 +79,23 @@ void SetCudaSetDeviceHandler(void (*handler)(int));

 template <typename T> struct HostDeviceVectorImpl;

+/*!
+ * \brief Controls data access from the GPU.
+ *
+ * Since a `HostDeviceVector` can have data on both the host and device, access control needs to be
+ * maintained to keep the data consistent.
+ *
+ * There are 3 scenarios supported:
+ * - Data is being manipulated on device. GPU has write access, host doesn't have access.
+ * - Data is read-only on both the host and device.
+ * - Data is being manipulated on the host. Host has write access, device doesn't have access.
+ */
 enum GPUAccess {
   kNone, kRead,
   // write implies read
   kWrite
 };

 inline GPUAccess operator-(GPUAccess a, GPUAccess b) {
   return static_cast<GPUAccess>(static_cast<int>(a) - static_cast<int>(b));
 }

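Reviewer note: operator- works on the underlying integer values (kNone=0, kRead=1, kWrite=2 under default enumeration); this is the arithmetic the removed Permissions::Complementary relied on. For example:

assert(GPUAccess::kWrite - GPUAccess::kRead == GPUAccess::kRead);   // the complement of read is read
assert(GPUAccess::kWrite - GPUAccess::kNone == GPUAccess::kWrite);  // the complement of no access is write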
 template <typename T>
 class HostDeviceVector {
  public:
@@ -111,8 +118,6 @@ class HostDeviceVector {
   const T* ConstHostPointer() const { return ConstHostVector().data(); }
   const T* HostPointer() const { return ConstHostPointer(); }

-  size_t DeviceSize() const;
-
   // only define functions returning device_ptr
   // if HostDeviceVector.h is included from a .cu file
 #ifdef __CUDACC__
@@ -135,8 +140,10 @@ class HostDeviceVector {
   const std::vector<T>& ConstHostVector() const;
   const std::vector<T>& HostVector() const { return ConstHostVector(); }

-  bool HostCanAccess(GPUAccess access) const;
-  bool DeviceCanAccess(GPUAccess access) const;
+  bool HostCanRead() const;
+  bool HostCanWrite() const;
+  bool DeviceCanRead() const;
+  bool DeviceCanWrite() const;

   void SetDevice(int device) const;