Lazy initialization of device vector. (#5173)

* Lazy initialization of device vector.

* Fix #5162.

* Disable copy constructor of HostDeviceVector.  Prevents implicit copying.

* Fix CPU build.

* Bring back move assignment operator.
This commit is contained in:
Jiaming Yuan
2020-01-07 11:23:05 +08:00
committed by GitHub
parent 77cfbff5a7
commit ee287808fb
7 changed files with 114 additions and 64 deletions

View File

@@ -8,6 +8,7 @@
#include <xgboost/base.h>
#include <xgboost/data.h>
#include <cstdint>
#include <memory>
#include <utility>
#include "xgboost/host_device_vector.h"
@@ -18,6 +19,7 @@ struct HostDeviceVectorImpl {
explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v) {}
HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init) {}
explicit HostDeviceVectorImpl(std::vector<T> init) : data_h_(std::move(init)) {}
HostDeviceVectorImpl(HostDeviceVectorImpl&& that) : data_h_(std::move(that.data_h_)) {}
void Swap(HostDeviceVectorImpl &other) {
data_h_.swap(other.data_h_);
@@ -47,6 +49,22 @@ HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
impl_ = new HostDeviceVectorImpl<T>(init);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(HostDeviceVector<T>&& that) {
impl_ = new HostDeviceVectorImpl<T>(std::move(*that.impl_));
}
// Move assignment: steal `that`'s contents into a freshly built impl, then
// swap it in for our own. `that` keeps a valid (moved-from) impl.
template <typename T>
HostDeviceVector<T>& HostDeviceVector<T>::operator=(HostDeviceVector<T>&& that) {
  if (&that != this) {
    // Allocate the replacement before deleting the old impl so *this never
    // holds a dangling pointer if the allocation throws.
    std::unique_ptr<HostDeviceVectorImpl<T>> replacement{
        new HostDeviceVectorImpl<T>(std::move(*that.impl_))};
    delete impl_;
    impl_ = replacement.release();
  }
  return *this;
}
template <typename T>
HostDeviceVector<T>::~HostDeviceVector() {
delete impl_;
@@ -54,21 +72,8 @@ HostDeviceVector<T>::~HostDeviceVector() {
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
}
template <typename T>
HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
if (this == &other) {
return *this;
}
HostDeviceVectorImpl<T> newInstance(*other.impl_);
newInstance.Swap(*impl_);
return *this;
// CPU-only build stub: there is never any device-resident copy of the data
// here, so the reported device access level is always kNone.
GPUAccess HostDeviceVector<T>::DeviceAccess() const {
return kNone;
}
template <typename T>

View File

@@ -29,7 +29,7 @@ class HostDeviceVectorImpl {
if (device >= 0) {
gpu_access_ = GPUAccess::kWrite;
SetDevice();
data_d_.resize(size, v);
data_d_->resize(size, v);
} else {
data_h_.resize(size, v);
}
@@ -47,34 +47,40 @@ class HostDeviceVectorImpl {
}
}
HostDeviceVectorImpl(HostDeviceVectorImpl<T>&& that) :
device_{that.device_},
data_h_{std::move(that.data_h_)},
data_d_{std::move(that.data_d_)},
gpu_access_{that.gpu_access_} {}
~HostDeviceVectorImpl() {
if (device_ >= 0) {
SetDevice();
}
}
size_t Size() const { return HostCanRead() ? data_h_.size() : data_d_.size(); }
size_t Size() const { return HostCanRead() ? data_h_.size() : data_d_->size(); }
int DeviceIdx() const { return device_; }
T* DevicePointer() {
LazySyncDevice(GPUAccess::kWrite);
return data_d_.data().get();
return data_d_->data().get();
}
const T* ConstDevicePointer() {
LazySyncDevice(GPUAccess::kRead);
return data_d_.data().get();
return data_d_->data().get();
}
common::Span<T> DeviceSpan() {
LazySyncDevice(GPUAccess::kWrite);
return {data_d_.data().get(), Size()};
return {data_d_->data().get(), Size()};
}
common::Span<const T> ConstDeviceSpan() {
LazySyncDevice(GPUAccess::kRead);
return {data_d_.data().get(), Size()};
return {data_d_->data().get(), Size()};
}
void Fill(T v) { // NOLINT
@@ -83,17 +89,19 @@ class HostDeviceVectorImpl {
} else {
gpu_access_ = GPUAccess::kWrite;
SetDevice();
thrust::fill(data_d_.begin(), data_d_.end(), v);
thrust::fill(data_d_->begin(), data_d_->end(), v);
}
}
void Copy(HostDeviceVectorImpl<T>* other) {
CHECK_EQ(Size(), other->Size());
SetDevice(other->device_);
// Data is on host.
if (HostCanWrite() && other->HostCanWrite()) {
std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
return;
}
SetDevice();
CopyToDevice(other);
}
@@ -138,11 +146,11 @@ class HostDeviceVectorImpl {
void Resize(size_t new_size, T v) {
if (new_size == Size()) { return; }
if (Size() == 0 && device_ >= 0) {
if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) {
// fast on-device resize
gpu_access_ = GPUAccess::kWrite;
SetDevice();
data_d_.resize(new_size, v);
data_d_->resize(new_size, v);
} else {
// resize on host
LazySyncHost(GPUAccess::kNone);
@@ -158,11 +166,11 @@ class HostDeviceVectorImpl {
return;
}
gpu_access_ = access;
if (data_h_.size() != data_d_.size()) { data_h_.resize(data_d_.size()); }
if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
SetDevice();
dh::safe_cuda(cudaMemcpy(data_h_.data(),
data_d_.data().get(),
data_d_.size() * sizeof(T),
data_d_->data().get(),
data_d_->size() * sizeof(T),
cudaMemcpyDeviceToHost));
}
@@ -176,9 +184,9 @@ class HostDeviceVectorImpl {
// data is on the host
LazyResizeDevice(data_h_.size());
SetDevice();
dh::safe_cuda(cudaMemcpy(data_d_.data().get(),
dh::safe_cuda(cudaMemcpy(data_d_->data().get(),
data_h_.data(),
data_d_.size() * sizeof(T),
data_d_->size() * sizeof(T),
cudaMemcpyHostToDevice));
gpu_access_ = access;
}
@@ -189,11 +197,12 @@ class HostDeviceVectorImpl {
bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
GPUAccess Access() const { return gpu_access_; }
private:
int device_{-1};
std::vector<T> data_h_{};
dh::device_vector<T> data_d_{};
std::unique_ptr<dh::device_vector<T>> data_d_{};
GPUAccess gpu_access_{GPUAccess::kNone};
void CopyToDevice(HostDeviceVectorImpl* other) {
@@ -203,8 +212,8 @@ class HostDeviceVectorImpl {
LazyResizeDevice(Size());
gpu_access_ = GPUAccess::kWrite;
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), other->data_d_.data().get(),
data_d_.size() * sizeof(T), cudaMemcpyDefault));
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
data_d_->size() * sizeof(T), cudaMemcpyDefault));
}
}
@@ -212,14 +221,14 @@ class HostDeviceVectorImpl {
LazyResizeDevice(Size());
gpu_access_ = GPUAccess::kWrite;
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), begin,
data_d_.size() * sizeof(T), cudaMemcpyDefault));
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
data_d_->size() * sizeof(T), cudaMemcpyDefault));
}
void LazyResizeDevice(size_t new_size) {
if (new_size == data_d_.size()) { return; }
if (data_d_ && new_size == data_d_->size()) { return; }
SetDevice();
data_d_.resize(new_size);
data_d_->resize(new_size);
}
void SetDevice() {
@@ -229,6 +238,10 @@ class HostDeviceVectorImpl {
} else {
(*cudaSetDeviceHandler)(device_);
}
if (!data_d_) {
data_d_.reset(new dh::device_vector<T>);
}
}
};
@@ -245,16 +258,17 @@ HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
: impl_(new HostDeviceVectorImpl<T>(*other.impl_)) {}
HostDeviceVector<T>::HostDeviceVector(HostDeviceVector<T>&& other)
: impl_(new HostDeviceVectorImpl<T>(std::move(*other.impl_))) {}
template <typename T>
HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
HostDeviceVector<T>& HostDeviceVector<T>::operator=(HostDeviceVector<T>&& other) {
if (this == &other) { return *this; }
std::unique_ptr<HostDeviceVectorImpl<T>> newImpl(new HostDeviceVectorImpl<T>(*other.impl_));
std::unique_ptr<HostDeviceVectorImpl<T>> new_impl(
new HostDeviceVectorImpl<T>(std::move(*other.impl_)));
delete impl_;
impl_ = newImpl.release();
impl_ = new_impl.release();
return *this;
}
@@ -338,6 +352,11 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
return impl_->DeviceCanWrite();
}
// Reports the current GPU access state by forwarding to the impl's
// gpu_access_ flag (kNone / kRead / kWrite).
template <typename T>
GPUAccess HostDeviceVector<T>::DeviceAccess() const {
return impl_->Access();
}
template <typename T>
void HostDeviceVector<T>::SetDevice(int device) const {
impl_->SetDevice(device);

View File

@@ -339,6 +339,7 @@ class GPUPredictor : public xgboost::Predictor {
// the first step only modifies prediction store in learner without following code.
InitOutPredictions(cache_emtry->second.data->Info(),
&(cache_emtry->second.predictions), model);
CHECK_EQ(cache_emtry->second.predictions.Size(), out_preds->Size());
cache_emtry->second.predictions.Copy(*out_preds);
}
}