diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index 4324a74d6..b32ba349c 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -36,7 +36,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
   auto iter = p_fmat->RowIterator();
   iter->BeforeFirst();
   while (iter->Next()) {
-     auto batch = iter->Value();
+     auto &batch = iter->Value();
     #pragma omp parallel num_threads(nthread)
     {
       CHECK_EQ(nthread, omp_get_num_threads());
@@ -137,7 +137,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) {
   iter->BeforeFirst();
   row_ptr.push_back(0);
   while (iter->Next()) {
-     auto batch = iter->Value();
+     auto &batch = iter->Value();
     const size_t rbegin = row_ptr.size() - 1;
     for (size_t i = 0; i < batch.Size(); ++i) {
       row_ptr.push_back(batch[i].length + row_ptr.back());
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index 05c026476..59cc32da3 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -67,7 +67,7 @@ void SimpleDMatrix::MakeOneBatch(SparsePage* pcol, bool sorted) {
 
   iter->BeforeFirst();
   while (iter->Next()) {
-     auto batch = iter->Value();
+     auto &batch = iter->Value();
     #pragma omp parallel for schedule(static)
     for (long i = 0; i < static_cast<long>(batch.Size()); ++i) { // NOLINT(*)
       int tid = omp_get_thread_num();
diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc
index ec2bd2b67..55e078d84 100644
--- a/src/data/sparse_page_dmatrix.cc
+++ b/src/data/sparse_page_dmatrix.cc
@@ -185,7 +185,7 @@ void SparsePageDMatrix::InitColAccess(
 
     while (true) {
       if (batch_ptr != batch_top) {
-         auto batch = iter->Value();
+         auto &batch = iter->Value();
         CHECK_EQ(batch_top, batch.Size());
         for (size_t i = batch_ptr; i < batch_top; ++i) {
           auto ridx = static_cast<bst_uint>(batch.base_rowid + i);
diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc
index ed13bb71c..7f6d424ac 100644
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -155,7 +155,7 @@ class GBLinear : public GradientBooster {
      auto iter = p_fmat->RowIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
-       auto batch = iter->Value();
+       auto &batch = iter->Value();
       // parallel over local batch
       const auto nsize = static_cast<bst_omp_uint>(batch.Size());
       #pragma omp parallel for schedule(static)
@@ -207,7 +207,7 @@ class GBLinear : public GradientBooster {
     const int ngroup = model_.param.num_output_group;
     preds.resize(p_fmat->Info().num_row_ * ngroup);
     while (iter->Next()) {
-       auto batch = iter->Value();
+       auto &batch = iter->Value();
       // output convention: nrow * k, where nrow is number of rows
       // k is number of group
       // parallel over local batch
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index a619114d8..739acde3b 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -441,7 +441,7 @@ class Dart : public GBTree {
     auto* self = static_cast<Derived*>(this);
     iter->BeforeFirst();
     while (iter->Next()) {
-      auto batch = iter->Value();
+      auto &batch = iter->Value();
       // parallel over local batch
       constexpr int kUnroll = 8;
       const auto nsize = static_cast<bst_omp_uint>(batch.Size());
diff --git a/src/linear/coordinate_common.h b/src/linear/coordinate_common.h
index 04d078d4d..18d87fbf1 100644
--- a/src/linear/coordinate_common.h
+++ b/src/linear/coordinate_common.h
@@ -67,7 +67,7 @@ inline std::pair<double, double> GetGradient(int group_idx, int num_group, int f
   double sum_grad = 0.0, sum_hess = 0.0;
   auto iter = p_fmat->ColIterator();
   while (iter->Next()) {
-    auto batch = iter->Value();
+    auto &batch = iter->Value();
     auto col = batch[fidx];
     const auto ndata = static_cast<bst_omp_uint>(col.length);
     for (bst_omp_uint j = 0; j < ndata; ++j) {
@@ -98,7 +98,7 @@ inline std::pair<double, double> GetGradientParallel(int group_idx, int num_grou
   double sum_grad = 0.0, sum_hess = 0.0;
   auto iter = p_fmat->ColIterator();
   while (iter->Next()) {
-    auto batch = iter->Value();
+    auto &batch = iter->Value();
     auto col = batch[fidx];
     const auto ndata = static_cast<bst_omp_uint>(col.length);
 #pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
@@ -156,7 +156,7 @@ inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
   if (dw == 0.0f) return;
   auto iter = p_fmat->ColIterator();
   while (iter->Next()) {
-    auto batch = iter->Value();
+    auto &batch = iter->Value();
     auto col = batch[fidx];
     // update grad value
     const auto num_row = static_cast<bst_omp_uint>(col.length);
@@ -327,7 +327,7 @@ class GreedyFeatureSelector : public FeatureSelector {
     std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
     auto iter = p_fmat->ColIterator();
     while (iter->Next()) {
-      auto batch = iter->Value();
+      auto &batch = iter->Value();
       #pragma omp parallel for schedule(static)
       for (bst_omp_uint i = 0; i < nfeat; ++i) {
         const auto col = batch[i];
@@ -394,7 +394,7 @@ class ThriftyFeatureSelector : public FeatureSelector {
     std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
     auto iter = p_fmat->ColIterator();
     while (iter->Next()) {
-      auto batch = iter->Value();
+      auto &batch = iter->Value();
       // column-parallel is usually faster than row-parallel
       #pragma omp parallel for schedule(static)
       for (bst_omp_uint i = 0; i < nfeat; ++i) {
diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu
index 672fbd1ff..cf4e47c61 100644
--- a/src/linear/updater_gpu_coordinate.cu
+++ b/src/linear/updater_gpu_coordinate.cu
@@ -237,7 +237,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
     auto iter = p_fmat->ColIterator();
     CHECK(p_fmat->SingleColBlock());
     iter->Next();
-    auto batch = iter->Value();
+    auto &batch = iter->Value();
 
     shards.resize(n_devices);
     // Create device shards
diff --git a/src/linear/updater_shotgun.cc b/src/linear/updater_shotgun.cc
index f2e21c5f6..11b91cbce 100644
--- a/src/linear/updater_shotgun.cc
+++ b/src/linear/updater_shotgun.cc
@@ -81,7 +81,7 @@ class ShotgunUpdater : public LinearUpdater {
                      param_.reg_alpha_denorm, param_.reg_lambda_denorm, 0);
      auto iter = p_fmat->ColIterator();
     while (iter->Next()) {
-      auto batch = iter->Value();
+      auto &batch = iter->Value();
       const auto nfeat = static_cast<bst_omp_uint>(batch.Size());
 #pragma omp parallel for schedule(static)
       for (bst_omp_uint i = 0; i < nfeat; ++i) {
diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc
index 58a95a9e0..964bbaa0d 100644
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -236,7 +236,7 @@ class CPUPredictor : public Predictor {
     auto iter = p_fmat->RowIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
-      auto batch = iter->Value();
+      auto &batch = iter->Value();
       // parallel over local batch
       const auto nsize = static_cast<bst_omp_uint>(batch.Size());
 #pragma omp parallel for schedule(static)
@@ -285,7 +285,7 @@ class CPUPredictor : public Predictor {
     const std::vector<bst_float>& base_margin = info.base_margin_;
     iter->BeforeFirst();
     while (iter->Next()) {
-      auto batch = iter->Value();
+      auto &batch = iter->Value();
       // parallel over local batch
       const auto nsize = static_cast<bst_omp_uint>(batch.Size());
 #pragma omp parallel for schedule(static)
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index 9e576adb4..21afc37f3 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -64,7 +64,7 @@ struct DeviceMatrix {
     iter->BeforeFirst();
     size_t data_offset = 0;
     while (iter->Next()) {
-      auto batch = iter->Value();
+      auto &batch = iter->Value();
       // Copy row ptr
       dh::safe_cuda(cudaMemcpy(
           row_ptr.Data() + batch.base_rowid, batch.offset.data(),
diff --git a/src/tree/updater_basemaker-inl.h b/src/tree/updater_basemaker-inl.h
index c2b92e31d..06ca474ba 100644
--- a/src/tree/updater_basemaker-inl.h
+++ b/src/tree/updater_basemaker-inl.h
@@ -46,7 +46,7 @@ class BaseMaker: public TreeUpdater {
       auto iter = p_fmat->ColIterator();
       iter->BeforeFirst();
       while (iter->Next()) {
-        auto batch = iter->Value();
+        auto &batch = iter->Value();
         for (bst_uint fid = 0; fid < batch.Size(); ++fid) {
            auto c = batch[fid];
           if (c.length != 0) {
@@ -305,7 +305,7 @@ class BaseMaker: public TreeUpdater {
     this->GetSplitSet(nodes, tree, &fsplits);
     auto iter = p_fmat->ColIterator();
     while (iter->Next()) {
-      auto batch = iter->Value();
+      auto &batch = iter->Value();
       for (auto fid : fsplits) {
         auto col = batch[fid];
         const auto ndata = static_cast<bst_omp_uint>(col.length);
diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc
index df9a9a453..a87d96b65 100644
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -731,7 +731,7 @@ class ColMaker: public TreeUpdater {
       fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
       auto iter = p_fmat->ColIterator();
       while (iter->Next()) {
-        auto batch = iter->Value();
+        auto &batch = iter->Value();
         for (auto fid : fsplits) {
           auto col = batch[fid];
           const auto ndata = static_cast<bst_omp_uint>(col.length);
@@ -862,7 +862,7 @@ class DistColMaker : public ColMaker {
       }
       auto iter = p_fmat->ColIterator();
       while (iter->Next()) {
-        auto batch = iter->Value();
+        auto &batch = iter->Value();
         for (auto fid : fsplits) {
           auto col = batch[fid];
           const auto ndata = static_cast<bst_omp_uint>(col.length);
diff --git a/src/tree/updater_gpu.cu b/src/tree/updater_gpu.cu
index 4d87572eb..616c75179 100644
--- a/src/tree/updater_gpu.cu
+++ b/src/tree/updater_gpu.cu
@@ -666,7 +666,7 @@ class GPUMaker : public TreeUpdater {
     auto iter = dmat->ColIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
-      auto batch = iter->Value();
+      auto &batch = iter->Value();
       for (int i = 0; i < batch.Size(); i++) {
         auto col = batch[i];
         for (const Entry* it = col.data; it != col.data + col.length;
diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc
index 638017355..97d03359a 100644
--- a/src/tree/updater_histmaker.cc
+++ b/src/tree/updater_histmaker.cc
@@ -347,7 +347,7 @@ class CQHistMaker: public HistMaker<TStats> {
       auto iter = p_fmat->ColIterator();
       iter->BeforeFirst();
       while (iter->Next()) {
-        auto batch = iter->Value();
+        auto &batch = iter->Value();
         // start enumeration
         const auto nsize = static_cast<bst_omp_uint>(fset.size());
         #pragma omp parallel for schedule(dynamic, 1)
@@ -429,7 +429,7 @@ class CQHistMaker: public HistMaker<TStats> {
       auto iter = p_fmat->ColIterator();
       iter->BeforeFirst();
       while (iter->Next()) {
-        auto batch = iter->Value();
+        auto &batch = iter->Value();
         // TWOPASS: use the real set + split set in the column iteration.
         this->CorrectNonDefaultPositionByBatch(batch, fsplit_set_, tree);
 
@@ -717,7 +717,7 @@ class GlobalProposalHistMaker: public CQHistMaker<TStats> {
       auto iter = p_fmat->ColIterator();
       iter->BeforeFirst();
       while (iter->Next()) {
-        auto batch = iter->Value();
+        auto &batch = iter->Value();
         // TWOPASS: use the real set + split set in the column iteration.
         this->CorrectNonDefaultPositionByBatch(batch, this->fsplit_set_, tree);
 
@@ -775,7 +775,7 @@ class QuantileHistMaker: public HistMaker<TStats> {
     auto iter = p_fmat->RowIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
-      auto batch = iter->Value();
+      auto &batch = iter->Value();
       // parallel convert to column major format
       common::ParallelGroupBuilder<Entry>
           builder(&col_ptr_, &col_data_, &thread_col_ptr_);
diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc
index c365c6a96..b14fa248d 100644
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -60,7 +60,7 @@ class TreeRefresher: public TreeUpdater {
        auto *iter = p_fmat->RowIterator();
       iter->BeforeFirst();
       while (iter->Next()) {
-         auto batch = iter->Value();
+         auto &batch = iter->Value();
         CHECK_LT(batch.Size(), std::numeric_limits<unsigned>::max());
         const auto nbatch = static_cast<bst_omp_uint>(batch.Size());
         #pragma omp parallel for schedule(static)
diff --git a/src/tree/updater_skmaker.cc b/src/tree/updater_skmaker.cc
index 8040bb2fd..9549ff0c6 100644
--- a/src/tree/updater_skmaker.cc
+++ b/src/tree/updater_skmaker.cc
@@ -147,7 +147,7 @@ class SketchMaker: public BaseMaker {
     auto iter = p_fmat->ColIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
-      auto batch = iter->Value();
+      auto &batch = iter->Value();
       // start enumeration
       const auto nsize = static_cast<bst_omp_uint>(batch.Size());
       #pragma omp parallel for schedule(dynamic, 1)