diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index ca00e4b14..255888324 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -378,9 +378,11 @@ class GPUPredictor : public xgboost::Predictor {
     if (it != cache_.end()) {
       HostDeviceVector<bst_float>& y = it->second.predictions;
       if (y.size() != 0) {
+        dh::safe_cuda(cudaSetDevice(param.gpu_id));
         out_preds->resize(y.size(), 0.0f, param.gpu_id);
-        thrust::copy(y.tbegin(param.gpu_id), y.tend(param.gpu_id),
-                     out_preds->tbegin(param.gpu_id));
+        dh::safe_cuda(cudaMemcpy(
+            out_preds->ptr_d(param.gpu_id), y.ptr_d(param.gpu_id),
+            out_preds->size() * sizeof(bst_float), cudaMemcpyDefault));
         return true;
       }
     }
@@ -406,8 +408,7 @@ class GPUPredictor : public xgboost::Predictor {
                             static_cast<unsigned>(model.trees.size()));
       } else if (model.param.num_output_group == 1 && updaters->size() > 0 &&
                  num_new_trees == 1 &&
-                 updaters->back()->UpdatePredictionCache(e.data.get(),
-                                                         &predictions)) {
+                 updaters->back()->UpdatePredictionCache(e.data.get(), &predictions)) {
         // do nothing
       } else {
         DevicePredictInternal(dmat, &predictions, model, old_ntree,
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 48bd45f09..b951b8a9e 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -210,6 +210,18 @@ struct DeviceHistogram {
   }
 };
 
+struct CalcWeightTrainParam {
+  float min_child_weight;
+  float reg_alpha;
+  float reg_lambda;
+  float max_delta_step;
+  float learning_rate;
+  __host__ __device__ CalcWeightTrainParam(const TrainParam& p)
+      : min_child_weight(p.min_child_weight), reg_alpha(p.reg_alpha),
+        reg_lambda(p.reg_lambda), max_delta_step(p.max_delta_step),
+        learning_rate(p.learning_rate) {}
+};
+
 // Manage memory for a single GPU
 struct DeviceShard {
   struct Segment {
@@ -236,7 +248,9 @@ struct DeviceShard {
   dh::dvec<bst_float> gidx_fvalue_map;
   dh::dvec<bst_float> min_fvalue;
   dh::dvec<int> monotone_constraints;
+  dh::dvec<bst_float> prediction_cache;
   std::vector<GradientPair> node_sum_gradients;
+  dh::dvec<GradientPair> node_sum_gradients_d;
   common::CompressedIterator<uint32_t> gidx;
   int row_stride;
   bst_uint row_begin_idx;  // The row offset for this shard
@@ -246,6 +260,7 @@ struct DeviceShard {
   int null_gidx_value;
   DeviceHistogram hist;
   TrainParam param;
+  bool prediction_cache_initialised;
 
   int64_t* tmp_pinned;  // Small amount of staging memory
 
@@ -263,7 +278,8 @@ struct DeviceShard {
         n_rows(row_end - row_begin),
         n_bins(n_bins),
         null_gidx_value(n_bins),
-        param(param) {
+        param(param),
+        prediction_cache_initialised(false) {
     // Convert to ELLPACK matrix representation
     int max_elements_row = 0;
     for (auto i = row_begin; i < row_end; i++) {
@@ -296,6 +312,7 @@ param.max_leaves > 0 ?
         param.max_leaves * 2 : n_nodes(param.max_depth);
     ba.allocate(device_idx, param.silent, &gidx_buffer, compressed_size_bytes,
                 &gpair, n_rows, &ridx, n_rows, &position, n_rows,
+                &prediction_cache, n_rows, &node_sum_gradients_d, max_nodes,
                 &feature_segments, gmat.cut->row_ptr.size(), &gidx_fvalue_map,
                 gmat.cut->cut.size(), &min_fvalue, gmat.cut->min_val.size(),
                 &monotone_constraints, param.monotone_constraints.size());
@@ -481,13 +498,46 @@ struct DeviceShard {
         ridx.current() + segment.begin, ridx.other() + segment.begin,
         segment.Size() * sizeof(bst_uint), cudaMemcpyDeviceToDevice));
   }
+
+  void UpdatePredictionCache(bst_float* out_preds_d) {
+    dh::safe_cuda(cudaSetDevice(device_idx));
+    if (!prediction_cache_initialised) {
+      dh::safe_cuda(cudaMemcpy(
+          prediction_cache.data(), &out_preds_d[row_begin_idx],
+          prediction_cache.size() * sizeof(bst_float),
+          cudaMemcpyDefault));
+    }
+    prediction_cache_initialised = true;
+
+    CalcWeightTrainParam param_d(param);
+
+    thrust::copy(node_sum_gradients.begin(), node_sum_gradients.end(),
+                 node_sum_gradients_d.tbegin());
+    auto d_position = position.current();
+    auto d_ridx = ridx.current();
+    auto d_node_sum_gradients = node_sum_gradients_d.data();
+    auto d_prediction_cache = prediction_cache.data();
+
+    dh::launch_n(device_idx, prediction_cache.size(),
+                 [=] __device__(int local_idx) {
+      int pos = d_position[local_idx];
+      bst_float weight = CalcWeight(param_d, d_node_sum_gradients[pos]);
+      d_prediction_cache[d_ridx[local_idx]] +=
+          weight * param_d.learning_rate;
+    });
+
+    dh::safe_cuda(cudaMemcpy(
+        &out_preds_d[row_begin_idx], prediction_cache.data(),
+        prediction_cache.size() * sizeof(bst_float),
+        cudaMemcpyDefault));
+  }
 };
 
 class GPUHistMaker : public TreeUpdater {
  public:
   struct ExpandEntry;
-  GPUHistMaker() : initialised(false) {}
+  GPUHistMaker() : initialised(false), p_last_fmat_(nullptr) {}
   ~GPUHistMaker() {}
   void Init(
       const std::vector<std::pair<std::string, std::string>>& args) override {
@@ -571,6 +621,7 @@ class GPUHistMaker : public TreeUpdater {
                                             row_segments[cpu_thread_id + 1],
                                             n_bins, param));
     }
+    p_last_fmat_ = dmat;
     initialised = true;
   }
 
@@ -858,6 +909,22 @@ class GPUHistMaker : public TreeUpdater {
     omp_set_num_threads(nthread);
   }
 
+  bool UpdatePredictionCache(
+      const DMatrix* data, HostDeviceVector<bst_float>* p_out_preds) override {
+    monitor.Start("UpdatePredictionCache", dList);
+    if (shards.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data)
+      return false;
+
+    bst_float* out_preds_d = p_out_preds->ptr_d(param.gpu_id);
+
+#pragma omp parallel for schedule(static, 1)
+    for (int shard = 0; shard < shards.size(); ++shard) {
+      shards[shard]->UpdatePredictionCache(out_preds_d);
+    }
+    monitor.Stop("UpdatePredictionCache", dList);
+    return true;
+  }
+
   struct ExpandEntry {
     int nid;
     int depth;
@@ -925,6 +992,8 @@ class GPUHistMaker : public TreeUpdater {
   dh::AllReducer reducer;
   std::vector<ValueConstraint> node_value_constraints_;
   std::vector<int> dList;
+
+  DMatrix* p_last_fmat_;
 };
 
 XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")
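
Note on the cache-update step: `DeviceShard::UpdatePredictionCache` avoids re-running full prediction after each boosting iteration by reusing state left over from training. Each row's final `position` indexes the per-node gradient sums, the leaf weight is recomputed with the usual XGBoost formula (roughly w = -G / (H + lambda); the real `CalcWeight` also handles `reg_alpha` and `max_delta_step`), and `learning_rate * w` is added to the cached prediction. The standalone CUDA sketch below illustrates just that step under simplified assumptions: `GradPair`, `CalcWeightSimple`, `UpdateCacheKernel`, and the toy data are illustrative stand-ins for the `GradientPair`/`CalcWeight`/`dh::launch_n` machinery the patch actually uses, and L1 regularisation is ignored.

```cuda
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// Stand-in for xgboost's GradientPair: per-node sums of first- and
// second-order gradients.
struct GradPair {
  float grad;
  float hess;
};

// Stand-in for CalcWeight with only L2 regularisation:
// optimal leaf value w = -G / (H + lambda).
__device__ float CalcWeightSimple(float reg_lambda, GradPair sum) {
  return -sum.grad / (sum.hess + reg_lambda);
}

// One thread per row: look up the leaf the row ended in, recompute that
// leaf's weight, and add the learning-rate-scaled weight to the cache.
__global__ void UpdateCacheKernel(int n_rows, const int* position,
                                  const int* ridx, const GradPair* node_sums,
                                  float reg_lambda, float learning_rate,
                                  float* prediction_cache) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n_rows) return;
  float w = CalcWeightSimple(reg_lambda, node_sums[position[i]]);
  prediction_cache[ridx[i]] += learning_rate * w;
}

int main() {
  // Two rows ending in leaves 1 and 2 of a three-node tree; the row
  // index mapping is the identity and the cache starts at 0.5.
  std::vector<int> position = {1, 2};
  std::vector<int> ridx = {0, 1};
  std::vector<GradPair> node_sums = {{0.f, 0.f}, {-1.f, 1.f}, {2.f, 1.f}};
  std::vector<float> cache = {0.5f, 0.5f};

  int *d_pos, *d_ridx;
  GradPair* d_sums;
  float* d_cache;
  cudaMalloc(&d_pos, 2 * sizeof(int));
  cudaMalloc(&d_ridx, 2 * sizeof(int));
  cudaMalloc(&d_sums, 3 * sizeof(GradPair));
  cudaMalloc(&d_cache, 2 * sizeof(float));
  cudaMemcpy(d_pos, position.data(), 2 * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_ridx, ridx.data(), 2 * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_sums, node_sums.data(), 3 * sizeof(GradPair),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_cache, cache.data(), 2 * sizeof(float), cudaMemcpyHostToDevice);

  UpdateCacheKernel<<<1, 256>>>(2, d_pos, d_ridx, d_sums,
                                /*reg_lambda=*/1.0f, /*learning_rate=*/0.3f,
                                d_cache);
  cudaMemcpy(cache.data(), d_cache, 2 * sizeof(float), cudaMemcpyDeviceToHost);

  // Row 0: w = -(-1)/(1+1) = 0.5 -> 0.5 + 0.3 * 0.5 = 0.65
  // Row 1: w = -2/(1+1) = -1.0   -> 0.5 - 0.3       = 0.20
  printf("cache = {%.2f, %.2f}\n", cache[0], cache[1]);

  cudaFree(d_pos);
  cudaFree(d_ridx);
  cudaFree(d_sums);
  cudaFree(d_cache);
  return 0;
}
```

This is why the shard keeps `position` and `node_sum_gradients` around after the tree update: with those two arrays, refreshing the cache is a single O(n_rows) kernel per shard instead of a full traversal of the new tree for every cached row.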