diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index 31525ee72..a3b870712 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -59,8 +59,25 @@ class MetaInfo {
    * can be used to specify initial prediction to boost from.
    */
   HostDeviceVector<bst_float> base_margin_;
+
   /*! \brief default constructor */
   MetaInfo()  = default;
+  MetaInfo& operator=(MetaInfo const& that) {
+    this->num_row_ = that.num_row_;
+    this->num_col_ = that.num_col_;
+    this->num_nonzero_ = that.num_nonzero_;
+
+    this->labels_.Resize(that.labels_.Size());
+    this->labels_.Copy(that.labels_);
+
+    this->group_ptr_ = that.group_ptr_;
+
+    this->weights_.Resize(that.weights_.Size());
+    this->weights_.Copy(that.weights_);
+    this->base_margin_.Resize(that.base_margin_.Size());
+    this->base_margin_.Copy(that.base_margin_);
+    return *this;
+  }
   /*!
    * \brief Get weight of each instances.
    * \param i Instance index.
@@ -246,10 +263,10 @@ class SparsePage {
   /**
    * \brief Pushes external data batch onto this page
    *
-   * \tparam  AdapterBatchT 
-   * \param batch 
-   * \param missing 
-   * \param nthread 
+   * \tparam  AdapterBatchT
+   * \param batch
+   * \param missing
+   * \param nthread
    *
    * \return  The maximum number of columns encountered in this input batch. Useful when pushing many adapter batches to work out the total number of columns.
    */
diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h
index e533d134b..7a2ea3d6d 100644
--- a/include/xgboost/host_device_vector.h
+++ b/include/xgboost/host_device_vector.h
@@ -88,8 +88,13 @@ class HostDeviceVector {
   HostDeviceVector(std::initializer_list<T> init, int device = -1);
   explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
   ~HostDeviceVector();
-  HostDeviceVector(const HostDeviceVector<T>&);
-  HostDeviceVector<T>& operator=(const HostDeviceVector<T>&);
+
+  HostDeviceVector(const HostDeviceVector<T>&) = delete;
+  HostDeviceVector(HostDeviceVector<T>&&);
+
+  HostDeviceVector<T>& operator=(const HostDeviceVector<T>&) = delete;
+  HostDeviceVector<T>& operator=(HostDeviceVector<T>&&);
+
   size_t Size() const;
   int DeviceIdx() const;
   common::Span<T> DeviceSpan();
@@ -116,6 +121,7 @@ class HostDeviceVector {
   bool HostCanWrite() const;
   bool DeviceCanRead() const;
   bool DeviceCanWrite() const;
+  GPUAccess DeviceAccess() const;
 
   void SetDevice(int device) const;
 
diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc
index db31b2fe9..1dd9997fa 100644
--- a/src/common/host_device_vector.cc
+++ b/src/common/host_device_vector.cc
@@ -8,6 +8,7 @@
 #include <xgboost/base.h>
 #include <xgboost/data.h>
 #include <cstdint>
+#include <memory>
 #include <utility>
 #include "xgboost/host_device_vector.h"
 
@@ -18,6 +19,7 @@ struct HostDeviceVectorImpl {
   explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v) {}
   HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init) {}
   explicit HostDeviceVectorImpl(std::vector<T>  init) : data_h_(std::move(init)) {}
+  HostDeviceVectorImpl(HostDeviceVectorImpl&& that) : data_h_(std::move(that.data_h_)) {}
 
   void Swap(HostDeviceVectorImpl &other) {
      data_h_.swap(other.data_h_);
@@ -47,6 +49,22 @@ HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
   impl_ = new HostDeviceVectorImpl<T>(init);
 }
 
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(HostDeviceVector<T>&& that) {
+  impl_ = new HostDeviceVectorImpl<T>(std::move(*that.impl_));
+}
+
+template <typename T>
+HostDeviceVector<T>& HostDeviceVector<T>::operator=(HostDeviceVector<T>&& that) {
+  if (this == &that) { return *this; }
+
+  std::unique_ptr<HostDeviceVectorImpl<T>> new_impl(
+      new HostDeviceVectorImpl<T>(std::move(*that.impl_)));
+  delete impl_;
+  impl_ = new_impl.release();
+  return *this;
+}
+
 template <typename T>
 HostDeviceVector<T>::~HostDeviceVector() {
   delete impl_;
@@ -54,21 +72,8 @@ HostDeviceVector<T>::~HostDeviceVector() {
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
-  : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
-}
-
-template <typename T>
-HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
-  if (this == &other) {
-    return *this;
-  }
-
-  HostDeviceVectorImpl<T> newInstance(*other.impl_);
-  newInstance.Swap(*impl_);
-
-  return *this;
+GPUAccess HostDeviceVector<T>::DeviceAccess() const {
+  return kNone;
 }
 
 template <typename T>
diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index b27d811f2..6180b6312 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -29,7 +29,7 @@ class HostDeviceVectorImpl {
     if (device >= 0) {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      data_d_.resize(size, v);
+      data_d_->resize(size, v);
     } else {
       data_h_.resize(size, v);
     }
@@ -47,34 +47,40 @@ class HostDeviceVectorImpl {
     }
   }
 
+  HostDeviceVectorImpl(HostDeviceVectorImpl<T>&& that) :
+    device_{that.device_},
+    data_h_{std::move(that.data_h_)},
+    data_d_{std::move(that.data_d_)},
+    gpu_access_{that.gpu_access_} {}
+
   ~HostDeviceVectorImpl() {
     if (device_ >= 0) {
       SetDevice();
     }
   }
 
-  size_t Size() const { return HostCanRead() ? data_h_.size() : data_d_.size(); }
+  size_t Size() const { return HostCanRead() ? data_h_.size() : data_d_->size(); }
 
   int DeviceIdx() const { return device_; }
 
   T* DevicePointer() {
     LazySyncDevice(GPUAccess::kWrite);
-    return data_d_.data().get();
+    return data_d_->data().get();
   }
 
   const T* ConstDevicePointer() {
     LazySyncDevice(GPUAccess::kRead);
-    return data_d_.data().get();
+    return data_d_->data().get();
   }
 
   common::Span<T> DeviceSpan() {
     LazySyncDevice(GPUAccess::kWrite);
-    return {data_d_.data().get(), Size()};
+    return {data_d_->data().get(), Size()};
   }
 
   common::Span<const T> ConstDeviceSpan() {
     LazySyncDevice(GPUAccess::kRead);
-    return {data_d_.data().get(), Size()};
+    return {data_d_->data().get(), Size()};
   }
 
   void Fill(T v) {  // NOLINT
@@ -83,17 +89,19 @@ class HostDeviceVectorImpl {
     } else {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      thrust::fill(data_d_.begin(), data_d_.end(), v);
+      thrust::fill(data_d_->begin(), data_d_->end(), v);
     }
   }
 
   void Copy(HostDeviceVectorImpl<T>* other) {
     CHECK_EQ(Size(), other->Size());
+    SetDevice(other->device_);
     // Data is on host.
     if (HostCanWrite() && other->HostCanWrite()) {
       std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
       return;
     }
+    SetDevice();
     CopyToDevice(other);
   }
 
@@ -138,11 +146,11 @@ class HostDeviceVectorImpl {
 
   void Resize(size_t new_size, T v) {
     if (new_size == Size()) { return; }
-    if (Size() == 0 && device_ >= 0) {
+    if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) {
       // fast on-device resize
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      data_d_.resize(new_size, v);
+      data_d_->resize(new_size, v);
     } else {
       // resize on host
       LazySyncHost(GPUAccess::kNone);
@@ -158,11 +166,11 @@ class HostDeviceVectorImpl {
       return;
     }
     gpu_access_ = access;
-    if (data_h_.size() != data_d_.size()) { data_h_.resize(data_d_.size()); }
+    if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
     SetDevice();
     dh::safe_cuda(cudaMemcpy(data_h_.data(),
-                             data_d_.data().get(),
-                             data_d_.size() * sizeof(T),
+                             data_d_->data().get(),
+                             data_d_->size() * sizeof(T),
                              cudaMemcpyDeviceToHost));
   }
 
@@ -176,9 +184,9 @@ class HostDeviceVectorImpl {
     // data is on the host
     LazyResizeDevice(data_h_.size());
     SetDevice();
-    dh::safe_cuda(cudaMemcpy(data_d_.data().get(),
+    dh::safe_cuda(cudaMemcpy(data_d_->data().get(),
                              data_h_.data(),
-                             data_d_.size() * sizeof(T),
+                             data_d_->size() * sizeof(T),
                              cudaMemcpyHostToDevice));
     gpu_access_ = access;
   }
@@ -189,11 +197,12 @@ class HostDeviceVectorImpl {
   bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
   bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
   bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
+  GPUAccess Access() const { return gpu_access_; }
 
  private:
   int device_{-1};
   std::vector<T> data_h_{};
-  dh::device_vector<T> data_d_{};
+  std::unique_ptr<dh::device_vector<T>> data_d_{};
   GPUAccess gpu_access_{GPUAccess::kNone};
 
   void CopyToDevice(HostDeviceVectorImpl* other) {
@@ -203,8 +212,8 @@ class HostDeviceVectorImpl {
       LazyResizeDevice(Size());
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), other->data_d_.data().get(),
-                                    data_d_.size() * sizeof(T), cudaMemcpyDefault));
+      dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
+                                    data_d_->size() * sizeof(T), cudaMemcpyDefault));
     }
   }
 
@@ -212,14 +221,14 @@ class HostDeviceVectorImpl {
     LazyResizeDevice(Size());
     gpu_access_ = GPUAccess::kWrite;
     SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), begin,
-                                  data_d_.size() * sizeof(T), cudaMemcpyDefault));
+    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
+                                  data_d_->size() * sizeof(T), cudaMemcpyDefault));
   }
 
   void LazyResizeDevice(size_t new_size) {
-    if (new_size == data_d_.size()) { return; }
+    if (data_d_ && new_size == data_d_->size()) { return; }
     SetDevice();
-    data_d_.resize(new_size);
+    data_d_->resize(new_size);
   }
 
   void SetDevice() {
@@ -229,6 +238,10 @@ class HostDeviceVectorImpl {
     } else {
       (*cudaSetDeviceHandler)(device_);
     }
+
+    if (!data_d_) {
+      data_d_.reset(new dh::device_vector<T>);
+    }
   }
 };
 
@@ -245,16 +258,17 @@ HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
     : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
-    : impl_(new HostDeviceVectorImpl<T>(*other.impl_)) {}
+HostDeviceVector<T>::HostDeviceVector(HostDeviceVector<T>&& other)
+    : impl_(new HostDeviceVectorImpl<T>(std::move(*other.impl_))) {}
 
 template <typename T>
-HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
+HostDeviceVector<T>& HostDeviceVector<T>::operator=(HostDeviceVector<T>&& other) {
   if (this == &other) { return *this; }
 
-  std::unique_ptr<HostDeviceVectorImpl<T>> newImpl(new HostDeviceVectorImpl<T>(*other.impl_));
+  std::unique_ptr<HostDeviceVectorImpl<T>> new_impl(
+      new HostDeviceVectorImpl<T>(std::move(*other.impl_)));
   delete impl_;
-  impl_ = newImpl.release();
+  impl_ = new_impl.release();
   return *this;
 }
 
@@ -338,6 +352,11 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
   return impl_->DeviceCanWrite();
 }
 
+template <typename T>
+GPUAccess HostDeviceVector<T>::DeviceAccess() const {
+  return impl_->Access();
+}
+
 template <typename T>
 void HostDeviceVector<T>::SetDevice(int device) const {
   impl_->SetDevice(device);
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index 92baa4ebb..1ff8d796d 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -339,6 +339,7 @@ class GPUPredictor : public xgboost::Predictor {
       // the first step only modifies prediction store in learner without following code.
       InitOutPredictions(cache_emtry->second.data->Info(),
                          &(cache_emtry->second.predictions), model);
+      CHECK_EQ(cache_emtry->second.predictions.Size(), out_preds->Size());
       cache_emtry->second.predictions.Copy(*out_preds);
     }
   }
diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu
index cb328dfdd..777f8f7c2 100644
--- a/tests/cpp/common/test_host_device_vector.cu
+++ b/tests/cpp/common/test_host_device_vector.cu
@@ -60,6 +60,7 @@ void PlusOne(HostDeviceVector<int> *v) {
   SetDevice(device);
   thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v),
                     [=]__device__(unsigned int a){ return a + 1; });
+  ASSERT_TRUE(v->DeviceCanWrite());
 }
 
 void CheckDevice(HostDeviceVector<int>* v,
@@ -125,7 +126,8 @@ TEST(HostDeviceVector, Copy) {
     // a separate scope to ensure that v1 is gone before further checks
     HostDeviceVector<int> v1;
     InitHostDeviceVector(n, device, &v1);
-    v = v1;
+    v.Resize(v1.Size());
+    v.Copy(v1);
   }
   CheckDevice(&v, n, 0, GPUAccess::kRead);
   PlusOne(&v);
diff --git a/tests/cpp/common/test_random.cc b/tests/cpp/common/test_random.cc
index 03f6251d5..dc7b38554 100644
--- a/tests/cpp/common/test_random.cc
+++ b/tests/cpp/common/test_random.cc
@@ -11,26 +11,26 @@ TEST(ColumnSampler, Test) {
 
   // No node sampling
   cs.Init(n, 1.0f, 0.5f, 0.5f);
-  auto set0 = *cs.GetFeatureSet(0);
-  ASSERT_EQ(set0.Size(), 32);
+  auto set0 = cs.GetFeatureSet(0);
+  ASSERT_EQ(set0->Size(), 32);
 
-  auto set1 = *cs.GetFeatureSet(0);
+  auto set1 = cs.GetFeatureSet(0);
 
-  ASSERT_EQ(set0.HostVector(), set1.HostVector());
+  ASSERT_EQ(set0->HostVector(), set1->HostVector());
 
-  auto set2 = *cs.GetFeatureSet(1);
-  ASSERT_NE(set1.HostVector(), set2.HostVector());
-  ASSERT_EQ(set2.Size(), 32);
+  auto set2 = cs.GetFeatureSet(1);
+  ASSERT_NE(set1->HostVector(), set2->HostVector());
+  ASSERT_EQ(set2->Size(), 32);
 
   // Node sampling
   cs.Init(n, 0.5f, 1.0f, 0.5f);
-  auto set3 = *cs.GetFeatureSet(0);
-  ASSERT_EQ(set3.Size(), 32);
+  auto set3 = cs.GetFeatureSet(0);
+  ASSERT_EQ(set3->Size(), 32);
 
-  auto set4 = *cs.GetFeatureSet(0);
+  auto set4 = cs.GetFeatureSet(0);
 
-  ASSERT_NE(set3.HostVector(), set4.HostVector());
-  ASSERT_EQ(set4.Size(), 32);
+  ASSERT_NE(set3->HostVector(), set4->HostVector());
+  ASSERT_EQ(set4->Size(), 32);
 
   // No level or node sampling, should be the same at different depth
   cs.Init(n, 1.0f, 1.0f, 0.5f);
@@ -38,11 +38,11 @@ TEST(ColumnSampler, Test) {
             cs.GetFeatureSet(1)->HostVector());
 
   cs.Init(n, 1.0f, 1.0f, 1.0f);
-  auto set5 = *cs.GetFeatureSet(0);
-  ASSERT_EQ(set5.Size(), n);
+  auto set5 = cs.GetFeatureSet(0);
+  ASSERT_EQ(set5->Size(), n);
   cs.Init(n, 1.0f, 1.0f, 1.0f);
-  auto set6 = *cs.GetFeatureSet(0);
-  ASSERT_EQ(set5.HostVector(), set6.HostVector());
+  auto set6 = cs.GetFeatureSet(0);
+  ASSERT_EQ(set5->HostVector(), set6->HostVector());
 
   // Should always be a minimum of one feature
   cs.Init(n, 1e-16f, 1e-16f, 1e-16f);