diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h
index 674f46109..867fee6a9 100644
--- a/include/xgboost/gbm.h
+++ b/include/xgboost/gbm.h
@@ -68,12 +68,9 @@ class GradientBooster {
    * \param obj The objective function, optional, can be nullptr when use customized version
    * the booster may change content of gpair
    */
-  virtual void DoBoost(DMatrix* p_fmat,
-                       std::vector<bst_gpair>* in_gpair,
-                       ObjFunction* obj = nullptr) = 0;
   virtual void DoBoost(DMatrix* p_fmat,
                        HostDeviceVector<bst_gpair>* in_gpair,
-                       ObjFunction* obj = nullptr);
+                       ObjFunction* obj = nullptr) = 0;
 
   /*!
    * \brief generate predictions for given feature matrix
@@ -82,12 +79,9 @@ class GradientBooster {
    * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
    *    we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
    */
-  virtual void PredictBatch(DMatrix* dmat,
-                            std::vector<bst_float>* out_preds,
-                            unsigned ntree_limit = 0) = 0;
   virtual void PredictBatch(DMatrix* dmat,
                             HostDeviceVector<bst_float>* out_preds,
-                            unsigned ntree_limit = 0);
+                            unsigned ntree_limit = 0) = 0;
   /*!
    * \brief online prediction function, predict score for one instance at a time
    *  NOTE: use the batch prediction interface if possible, batch prediction is usually
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
index 995a590c9..3981940d2 100644
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@@ -84,7 +84,7 @@ class Learner : public rabit::Serializable {
    */
   virtual void BoostOneIter(int iter,
                             DMatrix* train,
-                            std::vector<bst_gpair>* in_gpair) = 0;
+                            HostDeviceVector<bst_gpair>* in_gpair) = 0;
   /*!
    * \brief evaluate the model for specific iteration using the configured metrics.
    * \param iter iteration number
@@ -109,7 +109,7 @@ class Learner : public rabit::Serializable {
    */
   virtual void Predict(DMatrix* data,
                        bool output_margin,
-                       std::vector<bst_float> *out_preds,
+                       HostDeviceVector<bst_float> *out_preds,
                        unsigned ntree_limit = 0,
                        bool pred_leaf = false,
                        bool pred_contribs = false,
@@ -169,7 +169,7 @@ class Learner : public rabit::Serializable {
    */
   inline void Predict(const SparseBatch::Inst &inst,
                       bool output_margin,
-                      std::vector<bst_float> *out_preds,
+                      HostDeviceVector<bst_float> *out_preds,
                       unsigned ntree_limit = 0) const;
   /*!
    * \brief Create a new instance of learner.
   */
@@ -192,9 +192,9 @@ class Learner : public rabit::Serializable {
 // implementation of inline functions.
 inline void Learner::Predict(const SparseBatch::Inst& inst,
                              bool output_margin,
-                             std::vector<bst_float>* out_preds,
+                             HostDeviceVector<bst_float>* out_preds,
                              unsigned ntree_limit) const {
-  gbm_->PredictInstance(inst, out_preds, ntree_limit);
+  gbm_->PredictInstance(inst, &out_preds->data_h(), ntree_limit);
   if (!output_margin) {
     obj_->PredTransform(out_preds);
   }
diff --git a/include/xgboost/objective.h b/include/xgboost/objective.h
index 3f26db891..63e4c4d14 100644
--- a/include/xgboost/objective.h
+++ b/include/xgboost/objective.h
@@ -44,14 +44,10 @@ class ObjFunction {
    * \param iteration current iteration number.
    * \param out_gpair output of get gradient, saves gradient and second order gradient in
    */
-  virtual void GetGradient(const std::vector<bst_float>& preds,
-                           const MetaInfo& info,
-                           int iteration,
-                           std::vector<bst_gpair>* out_gpair) = 0;
   virtual void GetGradient(HostDeviceVector<bst_float>* preds,
                            const MetaInfo& info,
                            int iteration,
-                           HostDeviceVector<bst_gpair>* out_gpair);
+                           HostDeviceVector<bst_gpair>* out_gpair) = 0;
 
   /*! \return the default evaluation metric for the objective */
   virtual const char* DefaultEvalMetric() const = 0;
@@ -60,17 +56,13 @@ class ObjFunction {
   /*!
    * \brief transform prediction values, this is only called when Prediction is called
    * \param io_preds prediction values, saves to this vector as well
    */
-  virtual void PredTransform(std::vector<bst_float> *io_preds) {}
-  virtual void PredTransform(HostDeviceVector<bst_float> *io_preds);
+  virtual void PredTransform(HostDeviceVector<bst_float> *io_preds) {}
   /*!
    * \brief transform prediction values, this is only called when Eval is called,
    *  usually it redirect to PredTransform
    * \param io_preds prediction values, saves to this vector as well
    */
-  virtual void EvalTransform(std::vector<bst_float> *io_preds) {
-    this->PredTransform(io_preds);
-  }
   virtual void EvalTransform(HostDeviceVector<bst_float> *io_preds) {
     this->PredTransform(io_preds);
   }
diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h
index 2c408740e..c8abd4b69 100644
--- a/include/xgboost/predictor.h
+++ b/include/xgboost/predictor.h
@@ -63,22 +63,6 @@ class Predictor {
    *                          limit trees.
    */
 
-  virtual void PredictBatch(DMatrix* dmat, std::vector<bst_float>* out_preds,
-                            const gbm::GBTreeModel& model, int tree_begin,
-                            unsigned ntree_limit = 0) = 0;
-
-  /**
-   * \brief Generate batch predictions for a given feature matrix. May use
-   *        cached predictions if available instead of calculating from scratch.
-   *
-   * \param [in,out]  dmat        Feature matrix.
-   * \param [in,out]  out_preds   The output preds.
-   * \param           model       The model to predict from.
-   * \param           tree_begin  The tree begin index.
-   * \param           ntree_limit (Optional) The ntree limit. 0 means do not
-   *                          limit trees.
-   */
-
   virtual void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
                             const gbm::GBTreeModel& model, int tree_begin,
                             unsigned ntree_limit = 0) = 0;
@@ -186,41 +170,14 @@ class Predictor {
   static Predictor* Create(std::string name);
 
  protected:
-  /**
-   * \fn  bool PredictFromCache(DMatrix* dmat, std::vector<bst_float>*
-   *      out_preds, const gbm::GBTreeModel& model, unsigned ntree_limit = 0)
-   *
-   * \brief Attempt to predict from cache.
-   *
-   * \return  True if it succeeds, false if it fails.
-   */
-  bool PredictFromCache(DMatrix* dmat, std::vector<bst_float>* out_preds,
-                        const gbm::GBTreeModel& model,
-                        unsigned ntree_limit = 0);
-
-  /**
-   * \fn  void Predictor::InitOutPredictions(const MetaInfo& info,
-   *      std::vector<bst_float>* out_preds, const gbm::GBTreeModel& model) const;
-   *
-   * \brief Init out predictions according to base margin.
-   *
-   * \param           info      Dmatrix info possibly containing base margin.
-   * \param [in,out]  out_preds The out preds.
-   * \param           model     The model.
-   */
-  void InitOutPredictions(const MetaInfo& info,
-                          std::vector<bst_float>* out_preds,
-                          const gbm::GBTreeModel& model) const;
-
   /**
    * \struct  PredictionCacheEntry
    *
    * \brief Contains pointer to input matrix and associated cached predictions.
    */
-
   struct PredictionCacheEntry {
     std::shared_ptr<DMatrix> data;
-    std::vector<bst_float> predictions;
+    HostDeviceVector<bst_float> predictions;
   };
 
   /**
diff --git a/include/xgboost/tree_updater.h b/include/xgboost/tree_updater.h
index 8dbfa6cae..07e44a64e 100644
--- a/include/xgboost/tree_updater.h
+++ b/include/xgboost/tree_updater.h
@@ -40,12 +40,9 @@ class TreeUpdater {
    *  but maybe different random seeds, usually one tree is passed in at a time,
    *  there can be multiple trees when we train random forest style model
    */
-  virtual void Update(const std::vector<bst_gpair>& gpair,
-                      DMatrix* data,
-                      const std::vector<RegTree*>& trees) = 0;
   virtual void Update(HostDeviceVector<bst_gpair>* gpair,
                       DMatrix* data,
-                      const std::vector<RegTree*>& trees);
+                      const std::vector<RegTree*>& trees) = 0;
 
   /*!
    * \brief determines whether updater has enough knowledge about a given dataset
@@ -58,11 +55,9 @@ class TreeUpdater {
    *          updated by the time this function returns.
    */
   virtual bool UpdatePredictionCache(const DMatrix* data,
-                                     std::vector<bst_float>* out_preds) {
+                                     HostDeviceVector<bst_float>* out_preds) {
     return false;
   }
-  virtual bool UpdatePredictionCache(const DMatrix* data,
-                                     HostDeviceVector<bst_float>* out_preds);
 
   /*!
    * \brief Create a tree updater given name
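Note (not part of the patch): with the `std::vector` overload removed, every updater must now implement the `HostDeviceVector` signature; a CPU-only updater can keep its existing logic by pulling the host view once via `data_h()`. A minimal sketch under that assumption — `NoopUpdater` is a hypothetical name, not code from this diff:

```cpp
// Hypothetical illustration of the new pure-virtual TreeUpdater::Update
// signature. data_h() lazily syncs device data back to the host if needed.
#include <utility>
#include <vector>
#include <xgboost/tree_updater.h>

namespace xgboost {

class NoopUpdater : public TreeUpdater {
 public:
  void Init(const std::vector<std::pair<std::string, std::string>>& args) override {}

  void Update(HostDeviceVector<bst_gpair>* gpair,
              DMatrix* data,
              const std::vector<RegTree*>& trees) override {
    std::vector<bst_gpair>& gpair_h = gpair->data_h();  // host view
    (void)gpair_h;  // a real updater would grow `trees` using these gradients
  }
};

}  // namespace xgboost
```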
diff --git a/plugin/example/custom_obj.cc b/plugin/example/custom_obj.cc
index 3b7eefa4f..5446ea9b1 100644
--- a/plugin/example/custom_obj.cc
+++ b/plugin/example/custom_obj.cc
@@ -33,30 +33,32 @@ class MyLogistic : public ObjFunction {
   void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
     param_.InitAllowUnknown(args);
   }
-  void GetGradient(const std::vector<bst_float> &preds,
+  void GetGradient(HostDeviceVector<bst_float> *preds,
                    const MetaInfo &info,
                    int iter,
-                   std::vector<bst_gpair> *out_gpair) override {
-    out_gpair->resize(preds.size());
-    for (size_t i = 0; i < preds.size(); ++i) {
+                   HostDeviceVector<bst_gpair> *out_gpair) override {
+    out_gpair->resize(preds->size());
+    std::vector<bst_float>& preds_h = preds->data_h();
+    std::vector<bst_gpair>& out_gpair_h = out_gpair->data_h();
+    for (size_t i = 0; i < preds_h.size(); ++i) {
       bst_float w = info.GetWeight(i);
       // scale the negative examples!
       if (info.labels[i] == 0.0f) w *= param_.scale_neg_weight;
       // logistic transformation
-      bst_float p = 1.0f / (1.0f + std::exp(-preds[i]));
+      bst_float p = 1.0f / (1.0f + std::exp(-preds_h[i]));
       // this is the gradient
       bst_float grad = (p - info.labels[i]) * w;
       // this is the second order gradient
       bst_float hess = p * (1.0f - p) * w;
-      out_gpair->at(i) = bst_gpair(grad, hess);
+      out_gpair_h.at(i) = bst_gpair(grad, hess);
     }
   }
   const char* DefaultEvalMetric() const override {
     return "error";
   }
-  void PredTransform(std::vector<bst_float> *io_preds) override {
+  void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
     // transform margin value to probability.
-    std::vector<bst_float> &preds = *io_preds;
+    std::vector<bst_float> &preds = io_preds->data_h();
     for (size_t i = 0; i < preds.size(); ++i) {
       preds[i] = 1.0f / (1.0f + std::exp(-preds[i]));
     }
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 796b38f64..f9150156d 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -191,9 +191,9 @@ struct XGBAPIThreadLocalEntry {
   /*! \brief result holder for returning string pointers */
   std::vector<const char *> ret_vec_charp;
   /*! \brief returning float vector. */
-  std::vector<bst_float> ret_vec_float;
+  HostDeviceVector<bst_float> ret_vec_float;
   /*! \brief temp variable of gradient pairs. */
-  std::vector<bst_gpair> tmp_gpair;
+  HostDeviceVector<bst_gpair> tmp_gpair;
 };
 // define the threadlocal store.
@@ -705,14 +705,15 @@ XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle,
                                   bst_float *grad,
                                   bst_float *hess,
                                   xgboost::bst_ulong len) {
-  std::vector<bst_gpair>& tmp_gpair = XGBAPIThreadLocalStore::Get()->tmp_gpair;
+  HostDeviceVector<bst_gpair>& tmp_gpair = XGBAPIThreadLocalStore::Get()->tmp_gpair;
   API_BEGIN();
   Booster* bst = static_cast<Booster*>(handle);
   std::shared_ptr<DMatrix>* dtr =
       static_cast<std::shared_ptr<DMatrix>*>(dtrain);
   tmp_gpair.resize(len);
+  std::vector<bst_gpair>& tmp_gpair_h = tmp_gpair.data_h();
   for (xgboost::bst_ulong i = 0; i < len; ++i) {
-    tmp_gpair[i] = bst_gpair(grad[i], hess[i]);
+    tmp_gpair_h[i] = bst_gpair(grad[i], hess[i]);
   }
 
   bst->LazyInit();
@@ -749,7 +750,8 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
                              unsigned ntree_limit,
                              xgboost::bst_ulong *len,
                              const bst_float **out_result) {
-  std::vector<bst_float>& preds = XGBAPIThreadLocalStore::Get()->ret_vec_float;
+  HostDeviceVector<bst_float>& preds =
+      XGBAPIThreadLocalStore::Get()->ret_vec_float;
   API_BEGIN();
   Booster *bst = static_cast<Booster*>(handle);
   bst->LazyInit();
@@ -761,7 +763,7 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
     (option_mask & 4) != 0,
     (option_mask & 8) != 0,
     (option_mask & 16) != 0);
-  *out_result = dmlc::BeginPtr(preds);
+  *out_result = dmlc::BeginPtr(preds.data_h());
   *len = static_cast<xgboost::bst_ulong>(preds.size());
   API_END();
 }
diff --git a/src/cli_main.cc b/src/cli_main.cc
index 5c301b626..59eafc581 100644
--- a/src/cli_main.cc
+++ b/src/cli_main.cc
@@ -324,7 +324,7 @@ void CLIPredict(const CLIParam& param) {
   if (param.silent == 0) {
     LOG(CONSOLE) << "start prediction...";
   }
-  std::vector<bst_float> preds;
+  HostDeviceVector<bst_float> preds;
   learner->Predict(dtest.get(), param.pred_margin, &preds, param.ntree_limit);
   if (param.silent == 0) {
    LOG(CONSOLE) << "writing prediction to " << param.name_pred;
@@ -332,7 +332,7 @@ void CLIPredict(const CLIParam& param) {
   std::unique_ptr<dmlc::Stream> fo(
       dmlc::Stream::Create(param.name_pred.c_str(), "w"));
   dmlc::ostream os(fo.get());
-  for (bst_float p : preds) {
+  for (bst_float p : preds.data_h()) {
     os << p << '\n';
   }
   // force flush before fo destruct.
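Note (not part of the patch): at call sites the change is mechanical — construct a `HostDeviceVector` and go through `data_h()` wherever the host needs to iterate, exactly as the `CLIPredict` hunk above does. A minimal caller sketch, assuming an already-configured `learner` and `dtest`:

```cpp
// Mirrors the CLIPredict change above; setup of learner/dtest elided.
HostDeviceVector<bst_float> preds;  // replaces std::vector<bst_float>
learner->Predict(dtest.get(), /*output_margin=*/false, &preds, /*ntree_limit=*/0);
for (bst_float p : preds.data_h()) {  // host view for iteration
  std::cout << p << '\n';
}
```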
diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc
index 154a80cf3..41312d5c7 100644
--- a/src/common/host_device_vector.cc
+++ b/src/common/host_device_vector.cc
@@ -12,13 +12,27 @@ namespace xgboost {
 
 template <typename T>
 struct HostDeviceVectorImpl {
-  explicit HostDeviceVectorImpl(size_t size) : data_h_(size) {}
+  explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v) {}
+  explicit HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init) {}
+  explicit HostDeviceVectorImpl(const std::vector<T>& init) : data_h_(init) {}
   std::vector<T> data_h_;
 };
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, int device) : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(size);
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device) : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(size, v);
+}
+
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
+  : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(init);
+}
+
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
+  : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(init);
 }
 
 template <typename T>
@@ -41,8 +55,8 @@ template <typename T>
 std::vector<T>& HostDeviceVector<T>::data_h() { return impl_->data_h_; }
 
 template <typename T>
-void HostDeviceVector<T>::resize(size_t new_size, int new_device) {
-  impl_->data_h_.resize(new_size);
+void HostDeviceVector<T>::resize(size_t new_size, T v, int new_device) {
+  impl_->data_h_.resize(new_size, v);
 }
 
 // explicit instantiations are required, as HostDeviceVector isn't header-only
diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index 4370ef21e..9a2a63020 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -1,6 +1,7 @@
 /*!
  * Copyright 2017 XGBoost contributors
  */
+
 #include "./host_device_vector.h"
 #include "./device_helpers.cuh"
 
@@ -8,13 +9,25 @@ namespace xgboost {
 
 template <typename T>
 struct HostDeviceVectorImpl {
-  HostDeviceVectorImpl(size_t size, int device)
+  HostDeviceVectorImpl(size_t size, T v, int device)
     : device_(device), on_d_(device >= 0) {
     if (on_d_) {
       dh::safe_cuda(cudaSetDevice(device_));
-      data_d_.resize(size);
+      data_d_.resize(size, v);
     } else {
-      data_h_.resize(size);
+      data_h_.resize(size, v);
+    }
+  }
+
+  // Init can be std::vector<T> or std::initializer_list<T>
+  template <class Init>
+  HostDeviceVectorImpl(const Init& init, int device)
+    : device_(device), on_d_(device >= 0) {
+    if (on_d_) {
+      dh::safe_cuda(cudaSetDevice(device_));
+      data_d_.resize(init.size());
+      thrust::copy(init.begin(), init.end(), data_d_.begin());
+    } else {
+      data_h_ = init;
     }
   }
   HostDeviceVectorImpl(const HostDeviceVectorImpl<T>&) = delete;
@@ -41,17 +54,18 @@ struct HostDeviceVectorImpl {
   std::vector<T>& data_h() {
     lazy_sync_host();
     return data_h_;
   }
-  void resize(size_t new_size, int new_device) {
+
+  void resize(size_t new_size, T v, int new_device) {
     if (new_size == this->size() && new_device == device_)
       return;
-    device_ = new_device;
+    if (new_device != -1)
+      device_ = new_device;
     // if !on_d_, but the data size is 0 and the device is set,
     // resize the data on device instead
     if (!on_d_ && (data_h_.size() > 0 || device_ == -1)) {
-      data_h_.resize(new_size);
+      data_h_.resize(new_size, v);
     } else {
       dh::safe_cuda(cudaSetDevice(device_));
-      data_d_.resize(new_size);
+      data_d_.resize(new_size, v);
       on_d_ = true;
     }
   }
@@ -90,8 +104,20 @@ struct HostDeviceVectorImpl {
 };
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, int device) : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(size, device);
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device) : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(size, v, device);
+}
+
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
+  : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(init, device);
+}
+
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
+  : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(init, device);
 }
 
 template <typename T>
@@ -124,8 +150,8 @@ template <typename T>
 std::vector<T>& HostDeviceVector<T>::data_h() { return impl_->data_h(); }
 
 template <typename T>
-void HostDeviceVector<T>::resize(size_t new_size, int new_device) {
-  impl_->resize(new_size, new_device);
+void HostDeviceVector<T>::resize(size_t new_size, T v, int new_device) {
+  impl_->resize(new_size, v, new_device);
 }
 
 // explicit instantiations are required, as HostDeviceVector isn't header-only
diff --git a/src/common/host_device_vector.h b/src/common/host_device_vector.h
index fc0ca0660..3f4cb2b94 100644
--- a/src/common/host_device_vector.h
+++ b/src/common/host_device_vector.h
@@ -5,6 +5,7 @@
 #define XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
 
 #include <cstdlib>
+#include <initializer_list>
 #include <vector>
 
 // only include thrust-related files if host_device_vector.h
@@ -61,7 +62,9 @@ template <typename T> struct HostDeviceVectorImpl;
 template <typename T>
 class HostDeviceVector {
  public:
-  explicit HostDeviceVector(size_t size = 0, int device = -1);
+  explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1);
+  HostDeviceVector(std::initializer_list<T> init, int device = -1);
+  explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
   ~HostDeviceVector();
   HostDeviceVector(const HostDeviceVector<T>&) = delete;
   HostDeviceVector(HostDeviceVector<T>&&) = delete;
@@ -70,6 +73,7 @@ class HostDeviceVector {
   size_t size() const;
   int device() const;
   T* ptr_d(int device);
+  T* ptr_h() { return data_h().data(); }
 
   // only define functions returning device_ptr
   // if HostDeviceVector.h is included from a .cu file
@@ -79,17 +83,9 @@ class HostDeviceVector {
 #endif
 
   std::vector<T>& data_h();
-  void resize(size_t new_size, int new_device);
 
-  // helper functions in case a function needs to be templated
-  // to work for both HostDeviceVector<T> and std::vector<T>
-  static std::vector<T>& data_h(HostDeviceVector<T>* v) {
-    return v->data_h();
-  }
-
-  static std::vector<T>& data_h(std::vector<T>* v) {
-    return *v;
-  }
+  // passing in new_device == -1 keeps the device as is
+  void resize(size_t new_size, T v = T(), int new_device = -1);
 
  private:
   HostDeviceVectorImpl<T>* impl_;
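Note (not part of the patch): summarizing the `HostDeviceVector` API after the header change above — the size constructor gains a fill value, construction from `std::initializer_list` and `std::vector` is added, `resize` gains a fill value and treats `new_device == -1` as "keep the current device", and `ptr_h()` exposes a raw host pointer. A usage sketch:

```cpp
// Usage sketch of the extended API from host_device_vector.h above.
// device == -1 means the data stays on the host.
HostDeviceVector<bst_float> a(100, 0.0f);      // 100 zeros, host-resident
HostDeviceVector<bst_float> b = {1.0f, 2.0f};  // from an initializer list
HostDeviceVector<bst_float> c(std::vector<bst_float>(10, 1.0f));

a.resize(200, -1.0f);      // grow, filling new slots with -1; device kept as is
bst_float* p = a.ptr_h();  // raw host pointer, same as a.data_h().data()
```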
diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc
index adccf6239..dde5231c5 100644
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -76,8 +76,10 @@ class GBLinear : public GradientBooster {
   void Save(dmlc::Stream* fo) const override {
     model.Save(fo);
   }
-  void DoBoost(DMatrix *p_fmat, std::vector<bst_gpair> *in_gpair,
-               ObjFunction *obj) override {
+
+  void DoBoost(DMatrix *p_fmat,
+               HostDeviceVector<bst_gpair> *in_gpair,
+               ObjFunction* obj) override {
     monitor.Start("DoBoost");
 
     if (!p_fmat->HaveColAccess(false)) {
@@ -91,14 +93,15 @@ class GBLinear : public GradientBooster {
     this->LazySumWeights(p_fmat);
 
     if (!this->CheckConvergence()) {
-      updater->Update(in_gpair, p_fmat, &model, sum_instance_weight);
+      updater->Update(&in_gpair->data_h(), p_fmat, &model, sum_instance_weight);
     }
     this->UpdatePredictionCache();
 
     monitor.Stop("DoBoost");
   }
 
-  void PredictBatch(DMatrix *p_fmat, std::vector<bst_float> *out_preds,
+  void PredictBatch(DMatrix *p_fmat,
+                    HostDeviceVector<bst_float> *out_preds,
                     unsigned ntree_limit) override {
     monitor.Start("PredictBatch");
     CHECK_EQ(ntree_limit, 0U)
@@ -109,9 +112,9 @@ class GBLinear : public GradientBooster {
     if (it != cache_.end() && it->second.predictions.size() != 0) {
       std::vector<bst_float> &y = it->second.predictions;
       out_preds->resize(y.size());
-      std::copy(y.begin(), y.end(), out_preds->begin());
+      std::copy(y.begin(), y.end(), out_preds->data_h().begin());
     } else {
-      this->PredictBatchInternal(p_fmat, out_preds);
+      this->PredictBatchInternal(p_fmat, &out_preds->data_h());
     }
     monitor.Stop("PredictBatch");
   }
diff --git a/src/gbm/gbm.cc b/src/gbm/gbm.cc
index 4d7ee0975..0d84abfd0 100644
--- a/src/gbm/gbm.cc
+++ b/src/gbm/gbm.cc
@@ -22,18 +22,6 @@ GradientBooster* GradientBooster::Create(
   return (e->body)(cache_mats, base_margin);
 }
 
-void GradientBooster::DoBoost(DMatrix* p_fmat,
-                              HostDeviceVector<bst_gpair>* in_gpair,
-                              ObjFunction* obj) {
-  DoBoost(p_fmat, &in_gpair->data_h(), obj);
-}
-
-void GradientBooster::PredictBatch(DMatrix* dmat,
-                                   HostDeviceVector<bst_float>* out_preds,
-                                   unsigned ntree_limit) {
-  PredictBatch(dmat, &out_preds->data_h(), ntree_limit);
-}
-
 }  // namespace xgboost
 
 namespace xgboost {
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 7bbc57c1b..3ccf5782a 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -180,22 +180,39 @@ class GBTree : public GradientBooster {
       tparam.updater_seq.find("distcol") != std::string::npos;
   }
 
-  void DoBoost(DMatrix* p_fmat,
-               std::vector<bst_gpair>* in_gpair,
-               ObjFunction* obj) override {
-    DoBoostHelper(p_fmat, in_gpair, obj);
-  }
-
   void DoBoost(DMatrix* p_fmat,
                HostDeviceVector<bst_gpair>* in_gpair,
                ObjFunction* obj) override {
-    DoBoostHelper(p_fmat, in_gpair, obj);
-  }
-
-  void PredictBatch(DMatrix* p_fmat,
-                    std::vector<bst_float>* out_preds,
-                    unsigned ntree_limit) override {
-    predictor->PredictBatch(p_fmat, out_preds, model_, 0, ntree_limit);
+    std::vector<std::vector<std::unique_ptr<RegTree> > > new_trees;
+    const int ngroup = model_.param.num_output_group;
+    monitor.Start("BoostNewTrees");
+    if (ngroup == 1) {
+      std::vector<std::unique_ptr<RegTree> > ret;
+      BoostNewTrees(in_gpair, p_fmat, 0, &ret);
+      new_trees.push_back(std::move(ret));
+    } else {
+      CHECK_EQ(in_gpair->size() % ngroup, 0U)
+          << "must have exactly ngroup*nrow gpairs";
+      // TODO(canonizer): perform this on GPU if HostDeviceVector has device set.
+      HostDeviceVector<bst_gpair> tmp(in_gpair->size() / ngroup,
+                                      bst_gpair(), in_gpair->device());
+      std::vector<bst_gpair>& gpair_h = in_gpair->data_h();
+      bst_omp_uint nsize = static_cast<bst_omp_uint>(tmp.size());
+      for (int gid = 0; gid < ngroup; ++gid) {
+        std::vector<bst_gpair>& tmp_h = tmp.data_h();
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          tmp_h[i] = gpair_h[i * ngroup + gid];
+        }
+        std::vector<std::unique_ptr<RegTree> > ret;
+        BoostNewTrees(&tmp, p_fmat, gid, &ret);
+        new_trees.push_back(std::move(ret));
+      }
+    }
+    monitor.Stop("BoostNewTrees");
+    monitor.Start("CommitModel");
+    this->CommitModel(std::move(new_trees));
+    monitor.Stop("CommitModel");
   }
 
   void PredictBatch(DMatrix* p_fmat,
@@ -251,48 +268,11 @@ class GBTree : public GradientBooster {
     }
   }
 
-  // TVec is either std::vector or HostDeviceVector
-  template <typename TVec>
-  void DoBoostHelper(DMatrix* p_fmat,
-                     TVec* in_gpair,
-                     ObjFunction* obj) {
-    std::vector<std::vector<std::unique_ptr<RegTree> > > new_trees;
-    const int ngroup = model_.param.num_output_group;
-    monitor.Start("BoostNewTrees");
-    if (ngroup == 1) {
-      std::vector<std::unique_ptr<RegTree> > ret;
-      BoostNewTrees(in_gpair, p_fmat, 0, &ret);
-      new_trees.push_back(std::move(ret));
-    } else {
-      CHECK_EQ(in_gpair->size() % ngroup, 0U)
-          << "must have exactly ngroup*nrow gpairs";
-      std::vector<bst_gpair> tmp(in_gpair->size() / ngroup);
-      auto& gpair_h = HostDeviceVector<bst_gpair>::data_h(in_gpair);
-      for (int gid = 0; gid < ngroup; ++gid) {
-        bst_omp_uint nsize = static_cast<bst_omp_uint>(tmp.size());
-        #pragma omp parallel for schedule(static)
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          tmp[i] = gpair_h[i * ngroup + gid];
-        }
-        std::vector<std::unique_ptr<RegTree> > ret;
-        BoostNewTrees(&tmp, p_fmat, gid, &ret);
-        new_trees.push_back(std::move(ret));
-      }
-    }
-    monitor.Stop("BoostNewTrees");
-    monitor.Start("CommitModel");
-    this->CommitModel(std::move(new_trees));
-    monitor.Stop("CommitModel");
-  }
-
   // do group specific group
-  // TVec is either const std::vector or HostDeviceVector
-  template <typename TVec>
-  inline void
-  BoostNewTrees(TVec* gpair,
-                DMatrix *p_fmat,
-                int bst_group,
-                std::vector<std::unique_ptr<RegTree> >* ret) {
+  inline void BoostNewTrees(HostDeviceVector<bst_gpair>* gpair,
+                            DMatrix *p_fmat,
+                            int bst_group,
+                            std::vector<std::unique_ptr<RegTree> >* ret) {
     this->InitUpdater();
     std::vector<RegTree*> new_trees;
     ret->clear();
@@ -315,23 +295,8 @@ class GBTree : public GradientBooster {
       }
     }
     // update the trees
-    for (auto& up : updaters) {
-      UpdateHelper(up.get(), gpair, p_fmat, new_trees);
-    }
-  }
-
-  void UpdateHelper(TreeUpdater* updater,
-                    std::vector<bst_gpair>* gpair,
-                    DMatrix *p_fmat,
-                    const std::vector<RegTree*>& new_trees) {
-    updater->Update(*gpair, p_fmat, new_trees);
-  }
-
-  void UpdateHelper(TreeUpdater* updater,
-                    HostDeviceVector<bst_gpair>* gpair,
-                    DMatrix *p_fmat,
-                    const std::vector<RegTree*>& new_trees) {
-    updater->Update(gpair, p_fmat, new_trees);
+    for (auto& up : updaters)
+      up->Update(gpair, p_fmat, new_trees);
   }
 
   // commit new trees all at once
@@ -389,10 +354,10 @@ class Dart : public GBTree {
 
   // predict the leaf scores with dropout if ntree_limit = 0
   void PredictBatch(DMatrix* p_fmat,
-                    std::vector<bst_float>* out_preds,
-                    unsigned ntree_limit) override {
+                    HostDeviceVector<bst_float>* out_preds,
+                    unsigned ntree_limit) override {
     DropTrees(ntree_limit);
-    PredLoopInternal(p_fmat, out_preds, 0, ntree_limit, true);
+    PredLoopInternal(p_fmat, &out_preds->data_h(), 0, ntree_limit, true);
   }
 
   void PredictInstance(const SparseBatch::Inst& inst,
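Note (not part of the patch): for multi-class models the gradients arrive interleaved — the pair for row `i` and output group `gid` sits at index `i * ngroup + gid` — which is why `DoBoost` above de-interleaves one group at a time into `tmp` before growing that group's trees. A stand-alone illustration of the layout, with a hypothetical `pair_t` standing in for `bst_gpair`:

```cpp
// Illustration only: gathering one output group from interleaved gradients.
#include <cstddef>
#include <vector>

struct pair_t { float grad, hess; };

std::vector<pair_t> slice_group(const std::vector<pair_t>& gpair,
                                int ngroup, int gid) {
  std::vector<pair_t> tmp(gpair.size() / ngroup);
  for (std::size_t i = 0; i < tmp.size(); ++i) {
    tmp[i] = gpair[i * ngroup + gid];  // row i, group gid
  }
  return tmp;
}

// slice_group(gpair, 3, 1) gathers indices 1, 4, 7, ... : group 1 of 3.
```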
diff --git a/src/learner.cc b/src/learner.cc
index 3f13ffba8..883c7a8e5 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -362,17 +362,17 @@ class LearnerImpl : public Learner {
     }
     this->LazyInitDMatrix(train);
     monitor.Start("PredictRaw");
-    this->PredictRaw(train, &preds2_);
+    this->PredictRaw(train, &preds_);
     monitor.Stop("PredictRaw");
     monitor.Start("GetGradient");
-    obj_->GetGradient(&preds2_, train->info(), iter, &gpair_);
+    obj_->GetGradient(&preds_, train->info(), iter, &gpair_);
     monitor.Stop("GetGradient");
 
     gbm_->DoBoost(train, &gpair_, obj_.get());
     monitor.Stop("UpdateOneIter");
   }
 
   void BoostOneIter(int iter, DMatrix* train,
-                    std::vector<bst_gpair>* in_gpair) override {
+                    HostDeviceVector<bst_gpair>* in_gpair) override {
     monitor.Start("BoostOneIter");
     if (tparam.seed_per_iteration || rabit::IsDistributed()) {
       common::GlobalRandom().seed(tparam.seed * kRandSeedMagic + iter);
@@ -395,7 +395,7 @@ class LearnerImpl : public Learner {
       obj_->EvalTransform(&preds_);
       for (auto& ev : metrics_) {
         os << '\t' << data_names[i] << '-' << ev->Name() << ':'
-           << ev->Eval(preds_, data_sets[i]->info(), tparam.dsplit == 2);
+           << ev->Eval(preds_.data_h(), data_sets[i]->info(), tparam.dsplit == 2);
       }
     }
@@ -438,19 +438,20 @@ class LearnerImpl : public Learner {
     this->PredictRaw(data, &preds_);
     obj_->EvalTransform(&preds_);
     return std::make_pair(metric,
-                          ev->Eval(preds_, data->info(), tparam.dsplit == 2));
+                          ev->Eval(preds_.data_h(), data->info(), tparam.dsplit == 2));
   }
 
   void Predict(DMatrix* data, bool output_margin,
-               std::vector<bst_float>* out_preds, unsigned ntree_limit,
+               HostDeviceVector<bst_float>* out_preds, unsigned ntree_limit,
                bool pred_leaf, bool pred_contribs, bool approx_contribs,
                bool pred_interactions) const override {
     if (pred_contribs) {
-      gbm_->PredictContribution(data, out_preds, ntree_limit, approx_contribs);
+      gbm_->PredictContribution(data, &out_preds->data_h(), ntree_limit, approx_contribs);
     } else if (pred_interactions) {
-      gbm_->PredictInteractionContributions(data, out_preds, ntree_limit, approx_contribs);
+      gbm_->PredictInteractionContributions(data, &out_preds->data_h(), ntree_limit,
+                                            approx_contribs);
     } else if (pred_leaf) {
-      gbm_->PredictLeaf(data, out_preds, ntree_limit);
+      gbm_->PredictLeaf(data, &out_preds->data_h(), ntree_limit);
     } else {
       this->PredictRaw(data, out_preds, ntree_limit);
       if (!output_margin) {
@@ -546,12 +547,6 @@ class LearnerImpl : public Learner {
    * \param ntree_limit limit number of trees used for boosted tree
    *   predictor, when it equals 0, this means we are using all the trees
    */
-  inline void PredictRaw(DMatrix* data, std::vector<bst_float>* out_preds,
-                         unsigned ntree_limit = 0) const {
-    CHECK(gbm_.get() != nullptr)
-        << "Predict must happen after Load or InitModel";
-    gbm_->PredictBatch(data, out_preds, ntree_limit);
-  }
   inline void PredictRaw(DMatrix* data, HostDeviceVector<bst_float>* out_preds,
                          unsigned ntree_limit = 0) const {
     CHECK(gbm_.get() != nullptr)
@@ -572,8 +567,7 @@ class LearnerImpl : public Learner {
   // name of objective function
   std::string name_obj_;
   // temporal storages for prediction
-  std::vector<bst_float> preds_;
-  HostDeviceVector<bst_float> preds2_;
+  HostDeviceVector<bst_float> preds_;
   // gradient pairs
   HostDeviceVector<bst_gpair> gpair_;
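Note (not part of the patch): with `preds2_` removed, a single `HostDeviceVector<bst_float>` member carries predictions through the whole training round; metrics still consume a host `std::vector`, which is why the `Eval` call sites above go through `preds_.data_h()`. The per-iteration flow, sketched:

```cpp
// Sketch of UpdateOneIter's data flow after this change (member names from
// learner.cc above; monitoring and error handling elided).
this->PredictRaw(train, &preds_);                          // HostDeviceVector<bst_float>
obj_->GetGradient(&preds_, train->info(), iter, &gpair_);  // HostDeviceVector<bst_gpair>
gbm_->DoBoost(train, &gpair_, obj_.get());
// No forced device-to-host copy: a GPU objective can read and write the device
// buffers directly; data_h() syncs lazily only when a host consumer asks.
```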
diff --git a/src/objective/multiclass_obj.cc b/src/objective/multiclass_obj.cc
index dad4a3d60..9dcb85686 100644
--- a/src/objective/multiclass_obj.cc
+++ b/src/objective/multiclass_obj.cc
@@ -35,16 +35,18 @@ class SoftmaxMultiClassObj : public ObjFunction {
   void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
     param_.InitAllowUnknown(args);
   }
-  void GetGradient(const std::vector<bst_float>& preds,
+  void GetGradient(HostDeviceVector<bst_float>* preds,
                    const MetaInfo& info,
                    int iter,
-                   std::vector<bst_gpair>* out_gpair) override {
+                   HostDeviceVector<bst_gpair>* out_gpair) override {
     CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
-    CHECK(preds.size() == (static_cast<size_t>(param_.num_class) * info.labels.size()))
+    CHECK(preds->size() == (static_cast<size_t>(param_.num_class) * info.labels.size()))
         << "SoftmaxMultiClassObj: label size and pred size does not match";
-    out_gpair->resize(preds.size());
+    std::vector<bst_float>& preds_h = preds->data_h();
+    out_gpair->resize(preds_h.size());
+    std::vector<bst_gpair>& gpair = out_gpair->data_h();
     const int nclass = param_.num_class;
-    const omp_ulong ndata = static_cast<omp_ulong>(preds.size() / nclass);
+    const omp_ulong ndata = static_cast<omp_ulong>(preds_h.size() / nclass);
 
     int label_error = 0;
 #pragma omp parallel
@@ -53,7 +55,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
 #pragma omp for schedule(static)
       for (omp_ulong i = 0; i < ndata; ++i) {
         for (int k = 0; k < nclass; ++k) {
-          rec[k] = preds[i * nclass + k];
+          rec[k] = preds_h[i * nclass + k];
         }
         common::Softmax(&rec);
         int label = static_cast<int>(info.labels[i]);
@@ -65,9 +67,9 @@ class SoftmaxMultiClassObj : public ObjFunction {
           bst_float p = rec[k];
           const bst_float h = 2.0f * p * (1.0f - p) * wt;
           if (label == k) {
-            (*out_gpair)[i * nclass + k] = bst_gpair((p - 1.0f) * wt, h);
+            gpair[i * nclass + k] = bst_gpair((p - 1.0f) * wt, h);
           } else {
-            (*out_gpair)[i * nclass + k] = bst_gpair(p* wt, h);
+            gpair[i * nclass + k] = bst_gpair(p* wt, h);
           }
         }
       }
@@ -77,10 +79,10 @@ class SoftmaxMultiClassObj : public ObjFunction {
           << " num_class=" << nclass << " but found " << label_error << " in label.";
   }
-  void PredTransform(std::vector<bst_float>* io_preds) override {
+  void PredTransform(HostDeviceVector<bst_float>* io_preds) override {
     this->Transform(io_preds, output_prob_);
   }
-  void EvalTransform(std::vector<bst_float>* io_preds) override {
+  void EvalTransform(HostDeviceVector<bst_float>* io_preds) override {
     this->Transform(io_preds, true);
   }
   const char* DefaultEvalMetric() const override {
@@ -88,8 +90,8 @@ class SoftmaxMultiClassObj : public ObjFunction {
   }
 
  private:
-  inline void Transform(std::vector<bst_float> *io_preds, bool prob) {
-    std::vector<bst_float> &preds = *io_preds;
+  inline void Transform(HostDeviceVector<bst_float> *io_preds, bool prob) {
+    std::vector<bst_float> &preds = io_preds->data_h();
     std::vector<bst_float> tmp;
     const int nclass = param_.num_class;
     const omp_ulong ndata = static_cast<omp_ulong>(preds.size() / nclass);
diff --git a/src/objective/objective.cc b/src/objective/objective.cc
index 53f52ac9f..bf860a480 100644
--- a/src/objective/objective.cc
+++ b/src/objective/objective.cc
@@ -25,17 +25,6 @@ ObjFunction* ObjFunction::Create(const std::string& name) {
   return (e->body)();
 }
 
-void ObjFunction::GetGradient(HostDeviceVector<bst_float>* preds,
-                              const MetaInfo& info,
-                              int iteration,
-                              HostDeviceVector<bst_gpair>* out_gpair) {
-  GetGradient(preds->data_h(), info, iteration, &out_gpair->data_h());
-}
-
-void ObjFunction::PredTransform(HostDeviceVector<bst_float> *io_preds) {
-  PredTransform(&io_preds->data_h());
-}
-
 }  // namespace xgboost
 
 namespace xgboost {
diff --git a/src/objective/rank_obj.cc b/src/objective/rank_obj.cc
index 76ce3ad72..93559e135 100644
--- a/src/objective/rank_obj.cc
+++ b/src/objective/rank_obj.cc
@@ -37,13 +37,14 @@ class LambdaRankObj : public ObjFunction {
   void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
     param_.InitAllowUnknown(args);
   }
-  void GetGradient(const std::vector<bst_float>& preds,
+  void GetGradient(HostDeviceVector<bst_float>* preds,
                    const MetaInfo& info,
                    int iter,
-                   std::vector<bst_gpair>* out_gpair) override {
-    CHECK_EQ(preds.size(), info.labels.size()) << "label size predict size not match";
-    std::vector<bst_gpair>& gpair = *out_gpair;
-    gpair.resize(preds.size());
+                   HostDeviceVector<bst_gpair>* out_gpair) override {
+    CHECK_EQ(preds->size(), info.labels.size()) << "label size predict size not match";
+    auto& preds_h = preds->data_h();
+    out_gpair->resize(preds_h.size());
+    std::vector<bst_gpair>& gpair = out_gpair->data_h();
     // quick consistency when group is not available
     std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(info.labels.size());
     const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
@@ -63,7 +64,7 @@ class LambdaRankObj : public ObjFunction {
       for (bst_omp_uint k = 0; k < ngroup; ++k) {
         lst.clear(); pairs.clear();
         for (unsigned j = gptr[k]; j < gptr[k+1]; ++j) {
-          lst.push_back(ListEntry(preds[j], info.labels[j], j));
+          lst.push_back(ListEntry(preds_h[j], info.labels[j], j));
           gpair[j] = bst_gpair(0.0f, 0.0f);
         }
         std::sort(lst.begin(), lst.end(), ListEntry::CmpPred);
diff --git a/src/objective/regression_obj.cc b/src/objective/regression_obj.cc
index 9fb0cc981..b1f75c221 100644
--- a/src/objective/regression_obj.cc
+++ b/src/objective/regression_obj.cc
@@ -38,18 +38,20 @@ class RegLossObj : public ObjFunction {
       const std::vector<std::pair<std::string, std::string> > &args) override {
     param_.InitAllowUnknown(args);
   }
-  void GetGradient(const std::vector<bst_float> &preds, const MetaInfo &info,
-                   int iter, std::vector<bst_gpair> *out_gpair) override {
+  void GetGradient(HostDeviceVector<bst_float> *preds, const MetaInfo &info,
+                   int iter, HostDeviceVector<bst_gpair> *out_gpair) override {
     CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
-    CHECK_EQ(preds.size(), info.labels.size())
+    CHECK_EQ(preds->size(), info.labels.size())
         << "labels are not correctly provided"
-        << "preds.size=" << preds.size()
+        << "preds.size=" << preds->size()
         << ", label.size=" << info.labels.size();
+    auto& preds_h = preds->data_h();
     this->LazyCheckLabels(info.labels);
-    out_gpair->resize(preds.size());
-    const omp_ulong n = static_cast<omp_ulong>(preds.size());
-    auto gpair_ptr = out_gpair->data();
+    out_gpair->resize(preds_h.size());
+    auto& gpair = out_gpair->data_h();
+    const omp_ulong n = static_cast<omp_ulong>(preds_h.size());
+    auto gpair_ptr = out_gpair->ptr_h();
     avx::Float8 scale(param_.scale_pos_weight);
     const omp_ulong remainder = n % 8;
@@ -58,7 +60,7 @@ class RegLossObj : public ObjFunction {
 #pragma omp parallel for schedule(static) num_threads(std::min(8, nthread))
     for (omp_ulong i = 0; i < n - remainder; i += 8) {
       avx::Float8 y(&info.labels[i]);
-      avx::Float8 p = Loss::PredTransform(avx::Float8(&preds[i]));
+      avx::Float8 p = Loss::PredTransform(avx::Float8(&preds_h[i]));
       avx::Float8 w = info.weights.empty() ?
          avx::Float8(1.0f) : avx::Float8(&info.weights[i]);
       // Adjust weight
@@ -69,11 +71,11 @@ class RegLossObj : public ObjFunction {
     }
     for (omp_ulong i = n - remainder; i < n; ++i) {
       auto y = info.labels[i];
-      bst_float p = Loss::PredTransform(preds[i]);
+      bst_float p = Loss::PredTransform(preds_h[i]);
       bst_float w = info.GetWeight(i);
       w += y * ((param_.scale_pos_weight * w) - w);
-      (*out_gpair)[i] = bst_gpair(Loss::FirstOrderGradient(p, y) * w,
-                                  Loss::SecondOrderGradient(p, y) * w);
+      gpair[i] = bst_gpair(Loss::FirstOrderGradient(p, y) * w,
+                           Loss::SecondOrderGradient(p, y) * w);
     }
 
     // Reset omp max threads
@@ -82,8 +84,8 @@ class RegLossObj : public ObjFunction {
   const char *DefaultEvalMetric() const override {
     return Loss::DefaultEvalMetric();
   }
-  void PredTransform(std::vector<bst_float> *io_preds) override {
-    std::vector<bst_float> &preds = *io_preds;
+  void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
+    std::vector<bst_float> &preds = io_preds->data_h();
     const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
 #pragma omp parallel for schedule(static)
     for (bst_omp_uint j = 0; j < ndata; ++j) {
@@ -143,40 +145,42 @@ class PoissonRegression : public ObjFunction {
     param_.InitAllowUnknown(args);
   }
 
-  void GetGradient(const std::vector<bst_float> &preds,
+  void GetGradient(HostDeviceVector<bst_float> *preds,
                    const MetaInfo &info,
                    int iter,
-                   std::vector<bst_gpair> *out_gpair) override {
+                   HostDeviceVector<bst_gpair> *out_gpair) override {
     CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
-    CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided";
-    out_gpair->resize(preds.size());
+    CHECK_EQ(preds->size(), info.labels.size()) << "labels are not correctly provided";
+    auto& preds_h = preds->data_h();
+    out_gpair->resize(preds->size());
+    auto& gpair = out_gpair->data_h();
     // check if label in range
     bool label_correct = true;
     // start calculating gradient
-    const omp_ulong ndata = static_cast<omp_ulong>(preds.size()); // NOLINT(*)
+    const omp_ulong ndata = static_cast<omp_ulong>(preds_h.size()); // NOLINT(*)
     #pragma omp parallel for schedule(static)
     for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*)
-      bst_float p = preds[i];
+      bst_float p = preds_h[i];
       bst_float w = info.GetWeight(i);
       bst_float y = info.labels[i];
       if (y >= 0.0f) {
-        (*out_gpair)[i] = bst_gpair((std::exp(p) - y) * w,
-                                    std::exp(p + param_.max_delta_step) * w);
+        gpair[i] = bst_gpair((std::exp(p) - y) * w,
+                             std::exp(p + param_.max_delta_step) * w);
       } else {
        label_correct = false;
       }
     }
     CHECK(label_correct) << "PoissonRegression: label must be nonnegative";
   }
-  void PredTransform(std::vector<bst_float> *io_preds) override {
-    std::vector<bst_float> &preds = *io_preds;
+  void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
+    std::vector<bst_float> &preds = io_preds->data_h();
     const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
     #pragma omp parallel for schedule(static)
     for (long j = 0; j < ndata; ++j) {  // NOLINT(*)
       preds[j] = std::exp(preds[j]);
     }
   }
-  void EvalTransform(std::vector<bst_float> *io_preds) override {
+  void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
     PredTransform(io_preds);
   }
   bst_float ProbToMargin(bst_float base_score) const override {
@@ -202,21 +206,23 @@ class CoxRegression : public ObjFunction {
  public:
   // declare functions
   void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {}
-  void GetGradient(const std::vector<bst_float> &preds,
+  void GetGradient(HostDeviceVector<bst_float> *preds,
                    const MetaInfo &info,
                    int iter,
-                   std::vector<bst_gpair> *out_gpair) override {
+                   HostDeviceVector<bst_gpair> *out_gpair) override {
     CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
-    CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided";
"labels are not correctly provided"; - out_gpair->resize(preds.size()); + CHECK_EQ(preds->size(), info.labels.size()) << "labels are not correctly provided"; + auto& preds_h = preds->data_h(); + out_gpair->resize(preds_h.size()); + auto& gpair = out_gpair->data_h(); const std::vector &label_order = info.LabelAbsSort(); - const omp_ulong ndata = static_cast(preds.size()); // NOLINT(*) + const omp_ulong ndata = static_cast(preds_h.size()); // NOLINT(*) // pre-compute a sum double exp_p_sum = 0; // we use double because we might need the precision with large datasets for (omp_ulong i = 0; i < ndata; ++i) { - exp_p_sum += std::exp(preds[label_order[i]]); + exp_p_sum += std::exp(preds_h[label_order[i]]); } // start calculating grad and hess @@ -227,7 +233,7 @@ class CoxRegression : public ObjFunction { double accumulated_sum = 0; for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*) const size_t ind = label_order[i]; - const double p = preds[ind]; + const double p = preds_h[ind]; const double exp_p = std::exp(p); const double w = info.GetWeight(ind); const double y = info.labels[ind]; @@ -251,21 +257,21 @@ class CoxRegression : public ObjFunction { const double grad = exp_p*r_k - static_cast(y > 0); const double hess = exp_p*r_k - exp_p*exp_p * s_k; - out_gpair->at(ind) = bst_gpair(grad * w, hess * w); + gpair.at(ind) = bst_gpair(grad * w, hess * w); last_abs_y = abs_y; last_exp_p = exp_p; } } - void PredTransform(std::vector *io_preds) override { - std::vector &preds = *io_preds; + void PredTransform(HostDeviceVector *io_preds) override { + std::vector &preds = io_preds->data_h(); const long ndata = static_cast(preds.size()); // NOLINT(*) #pragma omp parallel for schedule(static) for (long j = 0; j < ndata; ++j) { // NOLINT(*) preds[j] = std::exp(preds[j]); } } - void EvalTransform(std::vector *io_preds) override { + void EvalTransform(HostDeviceVector *io_preds) override { PredTransform(io_preds); } bst_float ProbToMargin(bst_float base_score) const override { @@ -288,39 +294,41 @@ class GammaRegression : public ObjFunction { void Configure(const std::vector >& args) override { } - void GetGradient(const std::vector &preds, + void GetGradient(HostDeviceVector *preds, const MetaInfo &info, int iter, - std::vector *out_gpair) override { + HostDeviceVector *out_gpair) override { CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty"; - CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided"; - out_gpair->resize(preds.size()); + CHECK_EQ(preds->size(), info.labels.size()) << "labels are not correctly provided"; + auto& preds_h = preds->data_h(); + out_gpair->resize(preds_h.size()); + auto& gpair = out_gpair->data_h(); // check if label in range bool label_correct = true; // start calculating gradient - const omp_ulong ndata = static_cast(preds.size()); // NOLINT(*) + const omp_ulong ndata = static_cast(preds_h.size()); // NOLINT(*) #pragma omp parallel for schedule(static) for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*) - bst_float p = preds[i]; + bst_float p = preds_h[i]; bst_float w = info.GetWeight(i); bst_float y = info.labels[i]; if (y >= 0.0f) { - (*out_gpair)[i] = bst_gpair((1 - y / std::exp(p)) * w, y / std::exp(p) * w); + gpair[i] = bst_gpair((1 - y / std::exp(p)) * w, y / std::exp(p) * w); } else { label_correct = false; } } CHECK(label_correct) << "GammaRegression: label must be positive"; } - void PredTransform(std::vector *io_preds) override { - std::vector &preds = *io_preds; + void PredTransform(HostDeviceVector *io_preds) override { + 
+    std::vector<bst_float> &preds = io_preds->data_h();
     const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
     #pragma omp parallel for schedule(static)
     for (long j = 0; j < ndata; ++j) {  // NOLINT(*)
       preds[j] = std::exp(preds[j]);
     }
   }
-  void EvalTransform(std::vector<bst_float> *io_preds) override {
+  void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
     PredTransform(io_preds);
   }
   bst_float ProbToMargin(bst_float base_score) const override {
@@ -353,20 +361,22 @@ class TweedieRegression : public ObjFunction {
     param_.InitAllowUnknown(args);
   }
 
-  void GetGradient(const std::vector<bst_float> &preds,
+  void GetGradient(HostDeviceVector<bst_float> *preds,
                    const MetaInfo &info,
                    int iter,
-                   std::vector<bst_gpair> *out_gpair) override {
+                   HostDeviceVector<bst_gpair> *out_gpair) override {
     CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
-    CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided";
-    out_gpair->resize(preds.size());
+    CHECK_EQ(preds->size(), info.labels.size()) << "labels are not correctly provided";
+    auto& preds_h = preds->data_h();
+    out_gpair->resize(preds->size());
+    auto& gpair = out_gpair->data_h();
     // check if label in range
     bool label_correct = true;
     // start calculating gradient
-    const omp_ulong ndata = static_cast<omp_ulong>(preds.size()); // NOLINT(*)
+    const omp_ulong ndata = static_cast<omp_ulong>(preds->size()); // NOLINT(*)
     #pragma omp parallel for schedule(static)
     for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*)
-      bst_float p = preds[i];
+      bst_float p = preds_h[i];
       bst_float w = info.GetWeight(i);
       bst_float y = info.labels[i];
       float rho = param_.tweedie_variance_power;
@@ -374,15 +384,15 @@ class TweedieRegression : public ObjFunction {
         bst_float grad = -y * std::exp((1 - rho) * p) + std::exp((2 - rho) * p);
         bst_float hess = -y * (1 - rho) * \
           std::exp((1 - rho) * p) + (2 - rho) * std::exp((2 - rho) * p);
-        (*out_gpair)[i] = bst_gpair(grad * w, hess * w);
+        gpair[i] = bst_gpair(grad * w, hess * w);
       } else {
         label_correct = false;
       }
     }
     CHECK(label_correct) << "TweedieRegression: label must be nonnegative";
   }
-  void PredTransform(std::vector<bst_float> *io_preds) override {
-    std::vector<bst_float> &preds = *io_preds;
+  void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
+    std::vector<bst_float> &preds = io_preds->data_h();
     const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
     #pragma omp parallel for schedule(static)
     for (long j = 0; j < ndata; ++j) {  // NOLINT(*)
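Note (not part of the patch): every CPU objective above follows the same mechanical recipe — take the host views via `data_h()`, leave the math untouched. As a sanity check of one of those formulas, the Poisson hunk uses grad = (e^p − y)·w and hess = e^(p + max_delta_step)·w; a tiny finite-difference check of the gradient, illustration only:

```cpp
// Quick numeric check of the Poisson gradient used above: the loss is
// l(p) = exp(p) - y*p (negative log-likelihood up to a constant), so
// dl/dp = exp(p) - y, matching bst_gpair((std::exp(p) - y) * w, ...).
#include <cmath>
#include <cstdio>

int main() {
  const double y = 3.0, p = 1.0, eps = 1e-6;
  auto loss = [&](double m) { return std::exp(m) - y * m; };
  double numeric  = (loss(p + eps) - loss(p - eps)) / (2 * eps);
  double analytic = std::exp(p) - y;
  std::printf("numeric=%f analytic=%f\n", numeric, analytic);  // both ~ -0.281718
}
```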
diff --git a/src/objective/regression_obj_gpu.cu b/src/objective/regression_obj_gpu.cu
index 7d70a59b0..45270e316 100644
--- a/src/objective/regression_obj_gpu.cu
+++ b/src/objective/regression_obj_gpu.cu
@@ -103,8 +103,8 @@ class GPURegLossObj : public ObjFunction {
     // free the old data and allocate the new data
     ba_.reset(new bulk_allocator());
     data_.reset(new DeviceData(ba_.get(), 0, n));
-    preds_d_.resize(n, param_.gpu_id);
-    out_gpair_d_.resize(n, param_.gpu_id);
+    preds_d_.resize(n, 0.0f, param_.gpu_id);
+    out_gpair_d_.resize(n, bst_gpair(), param_.gpu_id);
   }
 
  public:
@@ -114,23 +114,6 @@ class GPURegLossObj : public ObjFunction {
     param_.InitAllowUnknown(args);
     CHECK(param_.n_gpus != 0) << "Must have at least one device";
   }
-  void GetGradient(const std::vector<bst_float> &preds,
-                   const MetaInfo &info,
-                   int iter,
-                   std::vector<bst_gpair> *out_gpair) override {
-    CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
-    CHECK_EQ(preds.size(), info.labels.size())
-        << "labels are not correctly provided"
-        << "preds.size=" << preds.size() << ", label.size=" << info.labels.size();
-
-    size_t ndata = preds.size();
-    out_gpair->resize(ndata);
-    LazyResize(ndata);
-    thrust::copy(preds.begin(), preds.end(), preds_d_.tbegin(param_.gpu_id));
-    GetGradientDevice(preds_d_.ptr_d(param_.gpu_id), info, iter,
-                      out_gpair_d_.ptr_d(param_.gpu_id), ndata);
-    thrust::copy_n(out_gpair_d_.tbegin(param_.gpu_id), ndata, out_gpair->begin());
-  }
 
   void GetGradient(HostDeviceVector<bst_float>* preds,
                    const MetaInfo &info,
@@ -141,7 +124,7 @@ class GPURegLossObj : public ObjFunction {
         << "labels are not correctly provided"
         << "preds.size=" << preds->size() << ", label.size=" << info.labels.size();
     size_t ndata = preds->size();
-    out_gpair->resize(ndata, param_.gpu_id);
+    out_gpair->resize(ndata, bst_gpair(), param_.gpu_id);
     LazyResize(ndata);
     GetGradientDevice(preds->ptr_d(param_.gpu_id), info, iter,
                       out_gpair->ptr_d(param_.gpu_id), ndata);
@@ -189,13 +172,6 @@ class GPURegLossObj : public ObjFunction {
     return Loss::DefaultEvalMetric();
   }
 
-  void PredTransform(std::vector<bst_float> *io_preds) override {
-    LazyResize(io_preds->size());
-    thrust::copy(io_preds->begin(), io_preds->end(), preds_d_.tbegin(param_.gpu_id));
-    PredTransformDevice(preds_d_.ptr_d(param_.gpu_id), io_preds->size());
-    thrust::copy_n(preds_d_.tbegin(param_.gpu_id), io_preds->size(), io_preds->begin());
-  }
-
   void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
     PredTransformDevice(io_preds->ptr_d(param_.gpu_id), io_preds->size());
   }
diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc
index 04bfd9f7d..9c956b5d9 100644
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -104,14 +104,43 @@ class CPUPredictor : public Predictor {
                        tree_begin, ntree_limit);
   }
 
- public:
-  void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
-                    const gbm::GBTreeModel& model, int tree_begin,
-                    unsigned ntree_limit = 0) override {
-    PredictBatch(dmat, &out_preds->data_h(), model, tree_begin, ntree_limit);
+  bool PredictFromCache(DMatrix* dmat,
+                        HostDeviceVector<bst_float>* out_preds,
+                        const gbm::GBTreeModel& model,
+                        unsigned ntree_limit) {
+    if (ntree_limit == 0 ||
+        ntree_limit * model.param.num_output_group >= model.trees.size()) {
+      auto it = cache_.find(dmat);
+      if (it != cache_.end()) {
+        HostDeviceVector<bst_float>& y = it->second.predictions;
+        if (y.size() != 0) {
+          out_preds->resize(y.size());
+          std::copy(y.data_h().begin(), y.data_h().end(),
+                    out_preds->data_h().begin());
+          return true;
+        }
+      }
+    }
+    return false;
   }
 
-  void PredictBatch(DMatrix* dmat, std::vector<bst_float>* out_preds,
+  void InitOutPredictions(const MetaInfo& info,
+                          HostDeviceVector<bst_float>* out_preds,
+                          const gbm::GBTreeModel& model) const {
+    size_t n = model.param.num_output_group * info.num_row;
+    const std::vector<bst_float>& base_margin = info.base_margin;
+    out_preds->resize(n);
+    std::vector<bst_float>& out_preds_h = out_preds->data_h();
+    if (base_margin.size() != 0) {
+      CHECK_EQ(out_preds->size(), n);
+      std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin());
+    } else {
+      std::fill(out_preds_h.begin(), out_preds_h.end(), model.base_margin);
+    }
+  }
+
+ public:
+  void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
                     const gbm::GBTreeModel& model, int tree_begin,
                     unsigned ntree_limit = 0) override {
     if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) {
@@ -125,12 +154,14 @@ class CPUPredictor : public Predictor {
       ntree_limit = static_cast<unsigned>(model.trees.size());
     }
 
-    this->PredLoopInternal(dmat, out_preds, model, tree_begin, ntree_limit);
+    this->PredLoopInternal(dmat, &out_preds->data_h(), model,
+                           tree_begin, ntree_limit);
   }
 
-  void UpdatePredictionCache(const gbm::GBTreeModel& model,
-                             std::vector<std::unique_ptr<TreeUpdater>>* updaters,
-                             int num_new_trees) override {
+  void UpdatePredictionCache(
+      const gbm::GBTreeModel& model,
+      std::vector<std::unique_ptr<TreeUpdater>>* updaters,
+      int num_new_trees) override {
     int old_ntree = model.trees.size() - num_new_trees;
     // update cache entry
     for (auto& kv : cache_) {
@@ -138,7 +169,7 @@ class CPUPredictor : public Predictor {
       if (e.predictions.size() == 0) {
         InitOutPredictions(e.data->info(), &(e.predictions), model);
-        PredLoopInternal(e.data.get(), &(e.predictions), model, 0,
+        PredLoopInternal(e.data.get(), &(e.predictions.data_h()), model, 0,
                          model.trees.size());
       } else if (model.param.num_output_group == 1 && updaters->size() > 0 &&
                  num_new_trees == 1 &&
@@ -146,7 +177,7 @@ class CPUPredictor : public Predictor {
                      &(e.predictions))) {
         {}  // do nothing
       } else {
-        PredLoopInternal(e.data.get(), &(e.predictions), model, old_ntree,
+        PredLoopInternal(e.data.get(), &(e.predictions.data_h()), model, old_ntree,
                          model.trees.size());
       }
     }
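Note (not part of the patch): `PredictFromCache` and `InitOutPredictions` above are retyped moves of the base-class helpers removed from `predictor.cc` later in this diff. `InitOutPredictions` seeds the output from the per-row base margin when the `MetaInfo` supplies one, otherwise from the model's scalar `base_margin`. Illustration only, with hypothetical values:

```cpp
// What InitOutPredictions produces for 3 rows, num_output_group == 1,
// no per-row base margin, and model.base_margin == 0.5f (hypothetical).
HostDeviceVector<bst_float> out_preds;
out_preds.resize(3);  // n = num_output_group * num_row
std::fill(out_preds.data_h().begin(), out_preds.data_h().end(), 0.5f);
// => out_preds.data_h() == {0.5f, 0.5f, 0.5f}
```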
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index b0f34529e..ca00e4b14 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -256,8 +256,6 @@ class GPUPredictor : public xgboost::Predictor {
     HostDeviceVector<bst_float> predictions;
   };
 
-  std::unordered_map<DMatrix*, DevicePredictionCacheEntry> device_cache_;
-
  private:
   void DevicePredictInternal(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
                              const gbm::GBTreeModel& model, size_t tree_begin,
@@ -337,25 +335,16 @@ class GPUPredictor : public xgboost::Predictor {
  public:
   GPUPredictor() : cpu_predictor(Predictor::Create("cpu_predictor")) {}
 
-  void PredictBatch(DMatrix* dmat, std::vector<bst_float>* out_preds,
-                    const gbm::GBTreeModel& model, int tree_begin,
-                    unsigned ntree_limit = 0) override {
-    HostDeviceVector<bst_float> out_preds_d;
-    PredictBatch(dmat, &out_preds_d, model, tree_begin, ntree_limit);
-    out_preds->resize(out_preds_d.size());
-    thrust::copy(out_preds_d.tbegin(param.gpu_id),
-                 out_preds_d.tend(param.gpu_id), out_preds->begin());
-  }
-
   void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
                     const gbm::GBTreeModel& model, int tree_begin,
                     unsigned ntree_limit = 0) override {
-    if (this->PredictFromCacheDevice(dmat, out_preds, model, ntree_limit)) {
+    if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) {
       return;
     }
-    this->InitOutPredictionsDevice(dmat->info(), out_preds, model);
+    this->InitOutPredictions(dmat->info(), out_preds, model);
 
     int tree_end = ntree_limit * model.param.num_output_group;
+
     if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
       tree_end = static_cast<unsigned>(model.trees.size());
     }
@@ -363,13 +352,13 @@ class GPUPredictor : public xgboost::Predictor {
     DevicePredictInternal(dmat, out_preds, model, tree_begin, tree_end);
   }
 
-
-  void InitOutPredictionsDevice(const MetaInfo& info,
+ protected:
+  void InitOutPredictions(const MetaInfo& info,
                           HostDeviceVector<bst_float>* out_preds,
                           const gbm::GBTreeModel& model) const {
     size_t n = model.param.num_output_group * info.num_row;
     const std::vector<bst_float>& base_margin = info.base_margin;
-    out_preds->resize(n, param.gpu_id);
+    out_preds->resize(n, 0.0f, param.gpu_id);
     if (base_margin.size() != 0) {
       CHECK_EQ(out_preds->size(), n);
       thrust::copy(base_margin.begin(), base_margin.end(), out_preds->tbegin(param.gpu_id));
@@ -380,29 +369,16 @@ class GPUPredictor : public xgboost::Predictor {
   }
 
   bool PredictFromCache(DMatrix* dmat,
-                        std::vector<bst_float>* out_preds,
+                        HostDeviceVector<bst_float>* out_preds,
                         const gbm::GBTreeModel& model,
                         unsigned ntree_limit) {
-    HostDeviceVector<bst_float> out_preds_d(0, -1);
-    bool result = PredictFromCacheDevice(dmat, &out_preds_d, model, ntree_limit);
-    if (!result) return false;
-    out_preds->resize(out_preds_d.size(), param.gpu_id);
-    thrust::copy(out_preds_d.tbegin(param.gpu_id),
-                 out_preds_d.tend(param.gpu_id), out_preds->begin());
-    return true;
-  }
-
-  bool PredictFromCacheDevice(DMatrix* dmat,
-                              HostDeviceVector<bst_float>* out_preds,
-                              const gbm::GBTreeModel& model,
-                              unsigned ntree_limit) {
     if (ntree_limit == 0 ||
         ntree_limit * model.param.num_output_group >= model.trees.size()) {
-      auto it = device_cache_.find(dmat);
-      if (it != device_cache_.end()) {
+      auto it = cache_.find(dmat);
+      if (it != cache_.end()) {
         HostDeviceVector<bst_float>& y = it->second.predictions;
         if (y.size() != 0) {
-          out_preds->resize(y.size(), param.gpu_id);
+          out_preds->resize(y.size(), 0.0f, param.gpu_id);
           thrust::copy(y.tbegin(param.gpu_id), y.tend(param.gpu_id),
                        out_preds->tbegin(param.gpu_id));
           return true;
@@ -418,15 +394,15 @@ class GPUPredictor : public xgboost::Predictor {
                              int num_new_trees) override {
     auto old_ntree = model.trees.size() - num_new_trees;
     // update cache entry
-    for (auto& kv : device_cache_) {
-      DevicePredictionCacheEntry& e = kv.second;
+    for (auto& kv : cache_) {
+      PredictionCacheEntry& e = kv.second;
       DMatrix* dmat = kv.first;
       HostDeviceVector<bst_float>& predictions = e.predictions;
 
       if (predictions.size() == 0) {
         // ensure that the device in predictions is correct
-        predictions.resize(0, param.gpu_id);
-        cpu_predictor->PredictBatch(dmat, &predictions.data_h(), model, 0,
+        predictions.resize(0, 0.0f, param.gpu_id);
+        cpu_predictor->PredictBatch(dmat, &predictions, model, 0,
                                     static_cast<int>(model.trees.size()));
       } else if (model.param.num_output_group == 1 && updaters->size() > 0 &&
                  num_new_trees == 1 &&
@@ -477,8 +453,6 @@ class GPUPredictor : public xgboost::Predictor {
     Predictor::Init(cfg, cache);
     cpu_predictor->Init(cfg, cache);
     param.InitAllowUnknown(cfg);
-    for (const std::shared_ptr<DMatrix>& d : cache)
-      device_cache_[d.get()].data = d;
     max_shared_memory_bytes = dh::max_shared_memory(param.gpu_id);
   }
diff --git a/src/predictor/predictor.cc b/src/predictor/predictor.cc
index 7e1ee3312..a4ea6e82c 100644
--- a/src/predictor/predictor.cc
+++ b/src/predictor/predictor.cc
@@ -11,43 +11,8 @@ namespace xgboost {
 void Predictor::Init(
     const std::vector<std::pair<std::string, std::string>>& cfg,
     const std::vector<std::shared_ptr<DMatrix>>& cache) {
-  for (const std::shared_ptr<DMatrix>& d : cache) {
-    PredictionCacheEntry e;
-    e.data = d;
-    cache_[d.get()] = std::move(e);
-  }
-}
-bool Predictor::PredictFromCache(DMatrix* dmat,
-                                 std::vector<bst_float>* out_preds,
-                                 const gbm::GBTreeModel& model,
-                                 unsigned ntree_limit) {
-  if (ntree_limit == 0 ||
-      ntree_limit * model.param.num_output_group >= model.trees.size()) {
-    auto it = cache_.find(dmat);
-    if (it != cache_.end()) {
-      std::vector<bst_float>& y = it->second.predictions;
-      if (y.size() != 0) {
-        out_preds->resize(y.size());
-        std::copy(y.begin(), y.end(), out_preds->begin());
-        return true;
-      }
-    }
-  }
-
-  return false;
-}
-void Predictor::InitOutPredictions(const MetaInfo& info,
-                                   std::vector<bst_float>* out_preds,
-                                   const gbm::GBTreeModel& model) const {
-  size_t n = model.param.num_output_group * info.num_row;
-  const std::vector<bst_float>& base_margin = info.base_margin;
-  out_preds->resize(n);
-  if (base_margin.size() != 0) {
-    CHECK_EQ(out_preds->size(), n);
-    std::copy(base_margin.begin(), base_margin.end(), out_preds->begin());
-  } else {
-    std::fill(out_preds->begin(), out_preds->end(), model.base_margin);
-  }
+  for (const std::shared_ptr<DMatrix>& d : cache)
+    cache_[d.get()].data = d;
 }
 Predictor* Predictor::Create(std::string name) {
   auto* e = ::dmlc::Registry<PredictorReg>::Get()->Find(name);
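Note (not part of the patch): because `PredictionCacheEntry::predictions` is now a `HostDeviceVector`, the CPU and GPU predictors share the single base-class `cache_`, and the GPU-only `device_cache_` plus its host/device copy shims disappear; `Predictor::Init` shrinks to registering the cached `DMatrix` pointers. The lookup pattern both backends now use, sketched:

```cpp
// Sketch of the shared cache lookup (cache_ maps DMatrix* to
// PredictionCacheEntry, as declared in predictor.h earlier in this diff).
auto it = cache_.find(dmat);
if (it != cache_.end()) {
  HostDeviceVector<bst_float>& y = it->second.predictions;
  // CPUPredictor copies via y.data_h(); GPUPredictor copies on-device via
  // y.tbegin(device)/y.tend(device). The entry itself is backend-agnostic.
}
```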
diff --git a/src/tree/tree_updater.cc b/src/tree/tree_updater.cc
index 2ca949e21..66227a78a 100644
--- a/src/tree/tree_updater.cc
+++ b/src/tree/tree_updater.cc
@@ -22,17 +22,6 @@ TreeUpdater* TreeUpdater::Create(const std::string& name) {
   return (e->body)();
 }
 
-void TreeUpdater::Update(HostDeviceVector<bst_gpair>* gpair,
-                         DMatrix* data,
-                         const std::vector<RegTree*>& trees) {
-  Update(gpair->data_h(), data, trees);
-}
-
-bool TreeUpdater::UpdatePredictionCache(const DMatrix* data,
-                                        HostDeviceVector<bst_float>* out_preds) {
-  return UpdatePredictionCache(data, &out_preds->data_h());
-}
-
 }  // namespace xgboost
 
 namespace xgboost {
diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc
index 4044c75ab..5d687f2c4 100644
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -26,7 +26,7 @@ class ColMaker: public TreeUpdater {
     param.InitAllowUnknown(args);
   }
 
-  void Update(const std::vector<bst_gpair> &gpair,
+  void Update(HostDeviceVector<bst_gpair> *gpair,
               DMatrix* dmat,
               const std::vector<RegTree*> &trees) override {
     TStats::CheckInfo(dmat->info());
@@ -37,7 +37,7 @@ class ColMaker: public TreeUpdater {
     // build tree
     for (size_t i = 0; i < trees.size(); ++i) {
       Builder builder(param);
-      builder.Update(gpair, dmat, trees[i]);
+      builder.Update(gpair->data_h(), dmat, trees[i]);
     }
     param.learning_rate = lr;
   }
@@ -806,13 +806,13 @@ class DistColMaker : public ColMaker<TStats> {
     param.InitAllowUnknown(args);
     pruner->Init(args);
   }
-  void Update(const std::vector<bst_gpair> &gpair,
+  void Update(HostDeviceVector<bst_gpair> *gpair,
               DMatrix* dmat,
               const std::vector<RegTree*> &trees) override {
     TStats::CheckInfo(dmat->info());
     CHECK_EQ(trees.size(), 1U) << "DistColMaker: only support one tree at a time";
     // build the tree
-    builder.Update(gpair, dmat, trees[0]);
+    builder.Update(gpair->data_h(), dmat, trees[0]);
     //// prune the tree, note that pruner will sync the tree
     pruner->Update(gpair, dmat, trees);
     // update position after the tree is pruned
@@ -967,7 +967,7 @@ class TreeUpdaterSwitch : public TreeUpdater {
     inner_->Init(args);
   }
 
-  void Update(const std::vector<bst_gpair>& gpair,
+  void Update(HostDeviceVector<bst_gpair>* gpair,
               DMatrix* data,
               const std::vector<RegTree*>& trees) override {
     CHECK(inner_ != nullptr);
diff --git a/src/tree/updater_fast_hist.cc b/src/tree/updater_fast_hist.cc
index 70d39b60b..a3cb01a05 100644
--- a/src/tree/updater_fast_hist.cc
+++ b/src/tree/updater_fast_hist.cc
@@ -55,7 +55,7 @@ class FastHistMaker: public TreeUpdater {
     is_gmat_initialized_ = false;
   }
 
-  void Update(const std::vector<bst_gpair>& gpair,
+  void Update(HostDeviceVector<bst_gpair>* gpair,
               DMatrix* dmat,
               const std::vector<RegTree*>& trees) override {
     TStats::CheckInfo(dmat->info());
@@ -82,13 +82,14 @@ class FastHistMaker: public TreeUpdater {
       builder_.reset(new Builder(param, fhparam, std::move(pruner_)));
     }
     for (size_t i = 0; i < trees.size(); ++i) {
-      builder_->Update(gmat_, gmatb_, column_matrix_, gpair, dmat, trees[i]);
+      builder_->Update
+        (gmat_, gmatb_, column_matrix_, gpair, dmat, trees[i]);
     }
     param.learning_rate = lr;
   }
 
   bool UpdatePredictionCache(const DMatrix* data,
-                             std::vector<bst_float>* out_preds) override {
+                             HostDeviceVector<bst_float>* out_preds) override {
     if (!builder_ || param.subsample < 1.0f) {
       return false;
     } else {
@@ -139,7 +140,7 @@ class FastHistMaker: public TreeUpdater {
     virtual void Update(const GHistIndexMatrix& gmat,
                         const GHistIndexBlockMatrix& gmatb,
                         const ColumnMatrix& column_matrix,
-                        const std::vector<bst_gpair>& gpair,
+                        HostDeviceVector<bst_gpair>* gpair,
                         DMatrix* p_fmat,
                         RegTree* p_tree) {
       double gstart = dmlc::GetTime();
@@ -154,8 +155,10 @@ class FastHistMaker: public TreeUpdater {
       double time_evaluate_split = 0;
      double time_apply_split = 0;
 
+      std::vector<bst_gpair>& gpair_h = gpair->data_h();
+
       tstart = dmlc::GetTime();
diff --git a/src/tree/updater_fast_hist.cc b/src/tree/updater_fast_hist.cc
index 70d39b60b..a3cb01a05 100644
--- a/src/tree/updater_fast_hist.cc
+++ b/src/tree/updater_fast_hist.cc
@@ -55,7 +55,7 @@ class FastHistMaker: public TreeUpdater {
     is_gmat_initialized_ = false;
   }

-  void Update(const std::vector<bst_gpair>& gpair,
+  void Update(HostDeviceVector<bst_gpair>* gpair,
               DMatrix* dmat,
               const std::vector<RegTree*>& trees) override {
     TStats::CheckInfo(dmat->info());
@@ -82,13 +82,14 @@ class FastHistMaker: public TreeUpdater {
       builder_.reset(new Builder(param, fhparam, std::move(pruner_)));
     }
     for (size_t i = 0; i < trees.size(); ++i) {
-      builder_->Update(gmat_, gmatb_, column_matrix_, gpair, dmat, trees[i]);
+      builder_->Update
+        (gmat_, gmatb_, column_matrix_, gpair, dmat, trees[i]);
     }
     param.learning_rate = lr;
   }

   bool UpdatePredictionCache(const DMatrix* data,
-                             std::vector<bst_float>* out_preds) override {
+                             HostDeviceVector<bst_float>* out_preds) override {
     if (!builder_ || param.subsample < 1.0f) {
       return false;
     } else {
@@ -139,7 +140,7 @@ class FastHistMaker: public TreeUpdater {
     virtual void Update(const GHistIndexMatrix& gmat,
                         const GHistIndexBlockMatrix& gmatb,
                         const ColumnMatrix& column_matrix,
-                        const std::vector<bst_gpair>& gpair,
+                        HostDeviceVector<bst_gpair>* gpair,
                         DMatrix* p_fmat,
                         RegTree* p_tree) {
       double gstart = dmlc::GetTime();
@@ -154,8 +155,10 @@ class FastHistMaker: public TreeUpdater {
       double time_evaluate_split = 0;
       double time_apply_split = 0;

+      std::vector<bst_gpair>& gpair_h = gpair->data_h();
+
       tstart = dmlc::GetTime();
-      this->InitData(gmat, gpair, *p_fmat, *p_tree);
+      this->InitData(gmat, gpair_h, *p_fmat, *p_tree);
       std::vector<bst_uint> feat_set = feat_index;
       time_init_data = dmlc::GetTime() - tstart;
@@ -165,11 +168,11 @@ class FastHistMaker: public TreeUpdater {
       for (int nid = 0; nid < p_tree->param.num_roots; ++nid) {
         tstart = dmlc::GetTime();
         hist_.AddHistRow(nid);
-        BuildHist(gpair, row_set_collection_[nid], gmat, gmatb, feat_set, hist_[nid]);
+        BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, feat_set, hist_[nid]);
         time_build_hist += dmlc::GetTime() - tstart;

         tstart = dmlc::GetTime();
-        this->InitNewNode(nid, gmat, gpair, *p_fmat, *p_tree);
+        this->InitNewNode(nid, gmat, gpair_h, *p_fmat, *p_tree);
         time_init_new_node += dmlc::GetTime() - tstart;

         tstart = dmlc::GetTime();
@@ -200,17 +203,17 @@ class FastHistMaker: public TreeUpdater {
           hist_.AddHistRow(cleft);
           hist_.AddHistRow(cright);
           if (row_set_collection_[cleft].size() < row_set_collection_[cright].size()) {
-            BuildHist(gpair, row_set_collection_[cleft], gmat, gmatb, feat_set, hist_[cleft]);
+            BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, feat_set, hist_[cleft]);
             SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
           } else {
-            BuildHist(gpair, row_set_collection_[cright], gmat, gmatb, feat_set, hist_[cright]);
+            BuildHist(gpair_h, row_set_collection_[cright], gmat, gmatb, feat_set, hist_[cright]);
             SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
           }
           time_build_hist += dmlc::GetTime() - tstart;

           tstart = dmlc::GetTime();
-          this->InitNewNode(cleft, gmat, gpair, *p_fmat, *p_tree);
-          this->InitNewNode(cright, gmat, gpair, *p_fmat, *p_tree);
+          this->InitNewNode(cleft, gmat, gpair_h, *p_fmat, *p_tree);
+          this->InitNewNode(cright, gmat, gpair_h, *p_fmat, *p_tree);
           time_init_new_node += dmlc::GetTime() - tstart;

           tstart = dmlc::GetTime();
@@ -293,8 +296,8 @@ class FastHistMaker: public TreeUpdater {
     }

     inline bool UpdatePredictionCache(const DMatrix* data,
-                                      std::vector<bst_float>* p_out_preds) {
-      std::vector<bst_float>& out_preds = *p_out_preds;
+                                      HostDeviceVector<bst_float>* p_out_preds) {
+      std::vector<bst_float>& out_preds = p_out_preds->data_h();

       // p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in
       // conjunction with Update().
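FastHistMaker keeps its prediction-cache fast path, but now writes through the host view of the shared `HostDeviceVector<bst_float>` rather than into a caller-owned `std::vector`. A toy version of that write-through; `row_to_leaf` and `leaf_values` are invented inputs standing in for the real tree state:

```cpp
#include <cstddef>
#include <vector>

template <typename T>
class HostDeviceVector {  // host-only stand-in, as in the earlier sketch
 public:
  std::vector<T>& data_h() { return h_; }
 private:
  std::vector<T> h_;
};

// Hypothetical cache refresh: add each row's new leaf value to its cached
// prediction, writing directly into the host copy so no temporary buffer
// or copy-back is required.
bool UpdatePredictionCache(const std::vector<int>& row_to_leaf,
                           const std::vector<float>& leaf_values,
                           HostDeviceVector<float>* p_out_preds) {
  std::vector<float>& out_preds = p_out_preds->data_h();
  if (out_preds.size() != row_to_leaf.size()) return false;  // cache miss
  for (std::size_t row = 0; row < row_to_leaf.size(); ++row) {
    out_preds[row] += leaf_values[row_to_leaf[row]];
  }
  return true;
}
```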
diff --git a/src/tree/updater_gpu.cu b/src/tree/updater_gpu.cu
index cee7c3e88..77eba9d72 100644
--- a/src/tree/updater_gpu.cu
+++ b/src/tree/updater_gpu.cu
@@ -512,7 +512,7 @@ class GPUMaker : public TreeUpdater {
     maxLeaves = 1 << param.max_depth;
   }

-  void Update(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
+  void Update(HostDeviceVector<bst_gpair>* gpair, DMatrix* dmat,
               const std::vector<RegTree*>& trees) override {
     GradStats::CheckInfo(dmat->info());
     // rescale learning rate according to size of trees
@@ -530,7 +530,7 @@ class GPUMaker : public TreeUpdater {
     param.learning_rate = lr;
   }

   /// @note: Update should be only after Init!!
-  void UpdateTree(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
+  void UpdateTree(HostDeviceVector<bst_gpair>* gpair, DMatrix* dmat,
                   RegTree* hTree) {
     if (!allocated) {
       setupOneTimeData(dmat);
@@ -687,11 +687,11 @@ class GPUMaker : public TreeUpdater {
     assignColIds<<<nCols, 512>>>(colIds.data(), colOffsets.data());
   }

-  void transferGrads(const std::vector<bst_gpair>& gpair) {
+  void transferGrads(HostDeviceVector<bst_gpair>* gpair) {
     // HACK
-    dh::safe_cuda(cudaMemcpy(gradsInst.data(), &(gpair[0]),
+    dh::safe_cuda(cudaMemcpy(gradsInst.data(), gpair->ptr_d(param.gpu_id),
                              sizeof(bst_gpair) * nRows,
-                             cudaMemcpyHostToDevice));
+                             cudaMemcpyDefault));
     // evaluate the full-grad reduction for the root node
     dh::sumReduction(tmp_mem, gradsInst, gradSums, nRows);
   }
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 688d28031..48bd45f09 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -506,27 +506,9 @@ class GPUHistMaker : public TreeUpdater {
     monitor.Init("updater_gpu_hist", param.debug_verbose);
   }

-  void Update(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
-              const std::vector<RegTree*>& trees) override {
-    monitor.Start("Update", dList);
-    // TODO(canonizer): move it into the class if this ever becomes a bottleneck
-    HostDeviceVector<bst_gpair> gpair_d(gpair.size(), param.gpu_id);
-    dh::safe_cuda(cudaSetDevice(param.gpu_id));
-    thrust::copy(gpair.begin(), gpair.end(), gpair_d.tbegin(param.gpu_id));
-    Update(&gpair_d, dmat, trees);
-    monitor.Stop("Update", dList);
-  }
-
   void Update(HostDeviceVector<bst_gpair>* gpair, DMatrix* dmat,
               const std::vector<RegTree*>& trees) override {
     monitor.Start("Update", dList);
-    UpdateHelper(gpair, dmat, trees);
-    monitor.Stop("Update", dList);
-  }
-
- private:
-  void UpdateHelper(HostDeviceVector<bst_gpair>* gpair, DMatrix* dmat,
-                    const std::vector<RegTree*>& trees) {
     GradStats::CheckInfo(dmat->info());
     // rescale learning rate according to size of trees
     float lr = param.learning_rate;
@@ -541,9 +523,9 @@ class GPUHistMaker : public TreeUpdater {
       LOG(FATAL) << "GPU plugin exception: " << e.what() << std::endl;
     }
     param.learning_rate = lr;
+    monitor.Stop("Update", dList);
   }

- public:
   void InitDataOnce(DMatrix* dmat) {
     info = &dmat->info();
     monitor.Start("Quantiles", dList);
@@ -876,16 +858,6 @@ class GPUHistMaker : public TreeUpdater {
     omp_set_num_threads(nthread);
   }

-  bool UpdatePredictionCache(const DMatrix* data,
-                             std::vector<bst_float>* p_out_preds) override {
-    return false;
-  }
-
-  bool UpdatePredictionCache(
-      const DMatrix* data, HostDeviceVector<bst_float>* p_out_preds) override {
-    return false;
-  }
-
   struct ExpandEntry {
     int nid;
     int depth;
diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc
index d4f011d06..04012f4b8 100644
--- a/src/tree/updater_histmaker.cc
+++ b/src/tree/updater_histmaker.cc
@@ -21,7 +21,7 @@ DMLC_REGISTRY_FILE_TAG(updater_histmaker);
 template<typename TStats>
 class HistMaker: public BaseMaker {
  public:
-  void Update(const std::vector<bst_gpair> &gpair,
+  void Update(HostDeviceVector<bst_gpair> *gpair,
               DMatrix *p_fmat,
               const std::vector<RegTree*> &trees) override {
     TStats::CheckInfo(p_fmat->info());
@@ -30,7 +30,7 @@ class HistMaker: public BaseMaker {
     param.learning_rate = lr / trees.size();
     // build tree
     for (size_t i = 0; i < trees.size(); ++i) {
-      this->Update(gpair, p_fmat, trees[i]);
+      this->Update(gpair->data_h(), p_fmat, trees[i]);
     }
     param.learning_rate = lr;
   }
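In `transferGrads`, the gradients now arrive as a device pointer (`gpair->ptr_d(...)`) rather than a host pointer, so the copy flag changes from `cudaMemcpyHostToDevice` to `cudaMemcpyDefault`, which lets the CUDA runtime infer the direction from the addresses themselves (this relies on unified virtual addressing, standard on 64-bit platforms). A minimal illustration of why that flag makes a copy helper direction-agnostic:

```cpp
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdio>

// Copy n floats from src to dst, letting the runtime infer the direction.
// With cudaMemcpyDefault the same helper works for host-to-device,
// device-to-device, or device-to-host pointers.
bool CopyAnyDirection(float* dst, const float* src, std::size_t n) {
  cudaError_t err = cudaMemcpy(dst, src, n * sizeof(float), cudaMemcpyDefault);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
    return false;
  }
  return true;
}
```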
diff --git a/src/tree/updater_prune.cc b/src/tree/updater_prune.cc
index af52f73f4..bbdc155d1 100644
--- a/src/tree/updater_prune.cc
+++ b/src/tree/updater_prune.cc
@@ -29,7 +29,7 @@ class TreePruner: public TreeUpdater {
     syncher->Init(args);
   }
   // update the tree, do pruning
-  void Update(const std::vector<bst_gpair> &gpair,
+  void Update(HostDeviceVector<bst_gpair> *gpair,
               DMatrix *p_fmat,
               const std::vector<RegTree*> &trees) override {
     // rescale learning rate according to size of trees
diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc
index fb4e72caf..e94a92147 100644
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -25,10 +25,11 @@ class TreeRefresher: public TreeUpdater {
     param.InitAllowUnknown(args);
   }
   // update the tree, do pruning
-  void Update(const std::vector<bst_gpair> &gpair,
+  void Update(HostDeviceVector<bst_gpair> *gpair,
               DMatrix *p_fmat,
               const std::vector<RegTree*> &trees) override {
     if (trees.size() == 0) return;
+    std::vector<bst_gpair> &gpair_h = gpair->data_h();
     // number of threads
     // thread temporal space
     std::vector<std::vector<TStats> > stemp;
@@ -71,7 +72,7 @@ class TreeRefresher: public TreeUpdater {
         feats.Fill(inst);
         int offset = 0;
         for (size_t j = 0; j < trees.size(); ++j) {
-          AddStats(*trees[j], feats, gpair, info, ridx,
+          AddStats(*trees[j], feats, gpair_h, info, ridx,
                    dmlc::BeginPtr(stemp[tid]) + offset);
           offset += trees[j]->param.num_nodes;
         }
diff --git a/src/tree/updater_skmaker.cc b/src/tree/updater_skmaker.cc
index 1994cb6d3..688e2026c 100644
--- a/src/tree/updater_skmaker.cc
+++ b/src/tree/updater_skmaker.cc
@@ -22,7 +22,7 @@ DMLC_REGISTRY_FILE_TAG(updater_skmaker);

 class SketchMaker: public BaseMaker {
  public:
-  void Update(const std::vector<bst_gpair> &gpair,
+  void Update(HostDeviceVector<bst_gpair> *gpair,
               DMatrix *p_fmat,
               const std::vector<RegTree*> &trees) override {
     // rescale learning rate according to size of trees
@@ -30,7 +30,7 @@ class SketchMaker: public BaseMaker {
     param.learning_rate = lr / trees.size();
     // build tree
     for (size_t i = 0; i < trees.size(); ++i) {
-      this->Update(gpair, p_fmat, trees[i]);
+      this->Update(gpair->data_h(), p_fmat, trees[i]);
     }
     param.learning_rate = lr;
   }
diff --git a/src/tree/updater_sync.cc b/src/tree/updater_sync.cc
index bd17968cd..f2a5da48b 100644
--- a/src/tree/updater_sync.cc
+++ b/src/tree/updater_sync.cc
@@ -23,7 +23,7 @@ class TreeSyncher: public TreeUpdater {
  public:
   void Init(const std::vector<std::pair<std::string, std::string> >& args) override {}

-  void Update(const std::vector<bst_gpair> &gpair,
+  void Update(HostDeviceVector<bst_gpair> *gpair,
               DMatrix* dmat,
               const std::vector<RegTree*> &trees) override {
     if (rabit::GetWorldSize() == 1) return;
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 7f46e43b6..3318be60a 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -38,10 +38,13 @@ void CheckObjFunction(xgboost::ObjFunction * obj,
   info.labels = labels;
   info.weights = weights;

-  std::vector<xgboost::bst_gpair> gpair;
-  obj->GetGradient(preds, info, 1, &gpair);
+  xgboost::HostDeviceVector<xgboost::bst_float> in_preds(preds);

-  ASSERT_EQ(gpair.size(), preds.size());
+  xgboost::HostDeviceVector<xgboost::bst_gpair> out_gpair;
+  obj->GetGradient(&in_preds, info, 1, &out_gpair);
+  std::vector<xgboost::bst_gpair>& gpair = out_gpair.data_h();
+
+  ASSERT_EQ(gpair.size(), in_preds.size());
   for (int i = 0; i < static_cast<int>(gpair.size()); ++i) {
     EXPECT_NEAR(gpair[i].GetGrad(), out_grad[i], 0.01)
       << "Unexpected grad for pred=" << preds[i] << " label=" << labels[i]
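The updated `CheckObjFunction` helper demonstrates the round trip most callers now make: wrap host data in a `HostDeviceVector`, pass pointers through the API, then read the result back with `data_h()`. A framework-free sketch of that round trip; `FakeGetGradient` is an invented objective used only to keep the example self-contained:

```cpp
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

struct GradientPair { float grad, hess; };

template <typename T>
class HostDeviceVector {  // host-only stand-in, as in the earlier sketches
 public:
  HostDeviceVector() = default;
  explicit HostDeviceVector(std::vector<T> init) : h_(std::move(init)) {}
  std::size_t size() const { return h_.size(); }
  std::vector<T>& data_h() { return h_; }
 private:
  std::vector<T> h_;
};

// Invented objective: gradient of squared error against a zero label,
// filling out_gpair the way ObjFunction::GetGradient would.
void FakeGetGradient(HostDeviceVector<float>* preds,
                     HostDeviceVector<GradientPair>* out_gpair) {
  const std::vector<float>& p = preds->data_h();
  std::vector<GradientPair>& g = out_gpair->data_h();
  g.resize(p.size());
  for (std::size_t i = 0; i < p.size(); ++i) g[i] = {p[i], 1.0f};
}

int main() {
  HostDeviceVector<float> in_preds(std::vector<float>{0.5f, -1.0f});
  HostDeviceVector<GradientPair> out_gpair;
  FakeGetGradient(&in_preds, &out_gpair);
  assert(out_gpair.size() == in_preds.size());  // read back via data_h()
  return 0;
}
```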
diff --git a/tests/cpp/objective/test_regression_obj.cc b/tests/cpp/objective/test_regression_obj.cc
index 3c679a6a5..8a1d3f6ec 100644
--- a/tests/cpp/objective/test_regression_obj.cc
+++ b/tests/cpp/objective/test_regression_obj.cc
@@ -46,10 +46,11 @@ TEST(Objective, LogisticRegressionBasic) {
     << "Expected error when base_score not in range [0,1f] for LogisticRegression";

   // test PredTransform
-  std::vector<xgboost::bst_float> preds = {0, 0.1f, 0.5f, 0.9f, 1};
+  xgboost::HostDeviceVector<xgboost::bst_float> io_preds = {0, 0.1f, 0.5f, 0.9f, 1};
   std::vector<xgboost::bst_float> out_preds = {0.5f, 0.524f, 0.622f, 0.710f, 0.731f};
-  obj->PredTransform(&preds);
-  for (int i = 0; i < static_cast<int>(preds.size()); ++i) {
+  obj->PredTransform(&io_preds);
+  auto& preds = io_preds.data_h();
+  for (int i = 0; i < static_cast<int>(io_preds.size()); ++i) {
     EXPECT_NEAR(preds[i], out_preds[i], 0.01f);
   }
 }
@@ -94,10 +95,11 @@ TEST(Objective, PoissonRegressionBasic) {
   EXPECT_NEAR(obj->ProbToMargin(0.9f), -0.10f, 0.01f);

   // test PredTransform
-  std::vector<xgboost::bst_float> preds = {0, 0.1f, 0.5f, 0.9f, 1};
+  xgboost::HostDeviceVector<xgboost::bst_float> io_preds = {0, 0.1f, 0.5f, 0.9f, 1};
   std::vector<xgboost::bst_float> out_preds = {1, 1.10f, 1.64f, 2.45f, 2.71f};
-  obj->PredTransform(&preds);
-  for (int i = 0; i < static_cast<int>(preds.size()); ++i) {
+  obj->PredTransform(&io_preds);
+  auto& preds = io_preds.data_h();
+  for (int i = 0; i < static_cast<int>(io_preds.size()); ++i) {
     EXPECT_NEAR(preds[i], out_preds[i], 0.01f);
   }
 }
@@ -129,10 +131,11 @@ TEST(Objective, GammaRegressionBasic) {
   EXPECT_NEAR(obj->ProbToMargin(0.9f), -0.10f, 0.01f);

   // test PredTransform
-  std::vector<xgboost::bst_float> preds = {0, 0.1f, 0.5f, 0.9f, 1};
+  xgboost::HostDeviceVector<xgboost::bst_float> io_preds = {0, 0.1f, 0.5f, 0.9f, 1};
   std::vector<xgboost::bst_float> out_preds = {1, 1.10f, 1.64f, 2.45f, 2.71f};
-  obj->PredTransform(&preds);
-  for (int i = 0; i < static_cast<int>(preds.size()); ++i) {
+  obj->PredTransform(&io_preds);
+  auto& preds = io_preds.data_h();
+  for (int i = 0; i < static_cast<int>(io_preds.size()); ++i) {
     EXPECT_NEAR(preds[i], out_preds[i], 0.01f);
   }
 }
@@ -165,10 +168,11 @@ TEST(Objective, TweedieRegressionBasic) {
   EXPECT_NEAR(obj->ProbToMargin(0.9f), 0.89f, 0.01f);

   // test PredTransform
-  std::vector<xgboost::bst_float> preds = {0, 0.1f, 0.5f, 0.9f, 1};
+  xgboost::HostDeviceVector<xgboost::bst_float> io_preds = {0, 0.1f, 0.5f, 0.9f, 1};
   std::vector<xgboost::bst_float> out_preds = {1, 1.10f, 1.64f, 2.45f, 2.71f};
-  obj->PredTransform(&preds);
-  for (int i = 0; i < static_cast<int>(preds.size()); ++i) {
+  obj->PredTransform(&io_preds);
+  auto& preds = io_preds.data_h();
+  for (int i = 0; i < static_cast<int>(io_preds.size()); ++i) {
     EXPECT_NEAR(preds[i], out_preds[i], 0.01f);
   }
 }
diff --git a/tests/cpp/objective/test_regression_obj_gpu.cu b/tests/cpp/objective/test_regression_obj_gpu.cu
index 0e507dc07..0ea8a8e1e 100644
--- a/tests/cpp/objective/test_regression_obj_gpu.cu
+++ b/tests/cpp/objective/test_regression_obj_gpu.cu
@@ -48,10 +48,11 @@ TEST(Objective, GPULogisticRegressionBasic) {
     << "Expected error when base_score not in range [0,1f] for LogisticRegression";

   // test PredTransform
-  std::vector<xgboost::bst_float> preds = {0, 0.1f, 0.5f, 0.9f, 1};
+  xgboost::HostDeviceVector<xgboost::bst_float> io_preds = {0, 0.1f, 0.5f, 0.9f, 1};
   std::vector<xgboost::bst_float> out_preds = {0.5f, 0.524f, 0.622f, 0.710f, 0.731f};
-  obj->PredTransform(&preds);
-  for (int i = 0; i < static_cast<int>(preds.size()); ++i) {
+  obj->PredTransform(&io_preds);
+  auto& preds = io_preds.data_h();
+  for (int i = 0; i < static_cast<int>(io_preds.size()); ++i) {
     EXPECT_NEAR(preds[i], out_preds[i], 0.01f);
   }
 }
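Each of the regression-objective tests above exercises the same in-place contract: `PredTransform` mutates the vector it is handed, and the caller reads the transformed values back through `data_h()`. A host-only sketch of a logistic transform written against that contract; this is illustrative, not the library's implementation:

```cpp
#include <cmath>
#include <initializer_list>
#include <vector>

template <typename T>
class HostDeviceVector {  // host-only stand-in, as in the earlier sketches
 public:
  HostDeviceVector(std::initializer_list<T> init) : h_(init) {}
  std::vector<T>& data_h() { return h_; }
 private:
  std::vector<T> h_;
};

// In-place logistic transform over the host view: margin -> probability.
void LogisticPredTransform(HostDeviceVector<float>* io_preds) {
  for (float& p : io_preds->data_h()) {
    p = 1.0f / (1.0f + std::exp(-p));  // e.g. 0 -> 0.5, 1 -> ~0.731
  }
}
```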
diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc
index 025aa11e5..0a9c4c8cf 100644
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -24,10 +24,11 @@ TEST(cpu_predictor, Test) {
   auto dmat = CreateDMatrix(n_row, n_col, 0);

   // Test predict batch
-  std::vector<bst_float> out_predictions;
+  HostDeviceVector<bst_float> out_predictions;
   cpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
+  std::vector<bst_float>& out_predictions_h = out_predictions.data_h();
   for (int i = 0; i < out_predictions.size(); i++) {
-    ASSERT_EQ(out_predictions[i], 1.5);
+    ASSERT_EQ(out_predictions_h[i], 1.5);
   }

   // Test predict instance
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index 1a4a48f83..45fad97d6 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -33,13 +33,15 @@ TEST(gpu_predictor, Test) {
   auto dmat = CreateDMatrix(n_row, n_col, 0);

   // Test predict batch
-  std::vector<bst_float> gpu_out_predictions;
-  std::vector<bst_float> cpu_out_predictions;
+  HostDeviceVector<bst_float> gpu_out_predictions;
+  HostDeviceVector<bst_float> cpu_out_predictions;

   gpu_predictor->PredictBatch(dmat.get(), &gpu_out_predictions, model, 0);
   cpu_predictor->PredictBatch(dmat.get(), &cpu_out_predictions, model, 0);

+  std::vector<bst_float>& gpu_out_predictions_h = gpu_out_predictions.data_h();
+  std::vector<bst_float>& cpu_out_predictions_h = cpu_out_predictions.data_h();
   float abs_tolerance = 0.001;
   for (int i = 0; i < gpu_out_predictions.size(); i++) {
-    ASSERT_LT(std::abs(gpu_out_predictions[i] - cpu_out_predictions[i]),
+    ASSERT_LT(std::abs(gpu_out_predictions_h[i] - cpu_out_predictions_h[i]),
               abs_tolerance);
   }
   // Test predict instance