Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage. (#3446)

* Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage.

- added GPU data distributions (GPUDistribution) to HostDeviceVector
- using HostDeviceVector for labels, weights and base margins in MetaInfo
- using HostDeviceVector for offset and data in SparsePage (see the sketch below)
- other necessary refactoring
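
As a minimal sketch (not part of the commit itself): host-side code now reaches the underlying std::vector through HostVector(). The helper below is hypothetical and assumes the page already holds the leading 0 offset:

    #include <xgboost/data.h>   // after this commit, also pulls in HostDeviceVector

    // Hypothetical helper: push a single one-entry row and its label.
    void PushToyRow(xgboost::SparsePage* page, xgboost::MetaInfo* info) {
      auto& offset = page->offset.HostVector();          // HostDeviceVector<size_t>
      auto& data   = page->data.HostVector();            // HostDeviceVector<Entry>
      data.emplace_back(/*index=*/0, /*fvalue=*/1.0f);   // Entry(index, fvalue)
      offset.push_back(offset.back() + 1);
      info->labels_.HostVector().push_back(1.0f);        // HostDeviceVector<bst_float>
    }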

* Added const versions of HostDeviceVector API calls.

- const versions added to calls that can trigger data transfers, e.g. DevicePointer()
- updated the code that uses HostDeviceVector
- objective functions now accept const HostDeviceVector<bst_float>& for predictions
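
A small sketch of the const side of the API, using the accessors added here (ConstHostVector(), ConstDevicePointer()); the function itself is made up for illustration:

    #include <xgboost/data.h>

    // Hypothetical helper: read-only reduction over predictions.
    xgboost::bst_float SumPredictions(
        const xgboost::HostDeviceVector<xgboost::bst_float>& preds) {
      const std::vector<xgboost::bst_float>& h = preds.ConstHostVector();
      xgboost::bst_float sum = 0.0f;
      for (xgboost::bst_float p : h) { sum += p; }
      // On CUDA builds, preds.ConstDevicePointer(device) gives the same
      // read-only access on the device side.
      return sum;
    }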

* Updated src/linear/updater_gpu_coordinate.cu.

* Added read-only state for HostDeviceVector sync.

- this means no copies are performed if the host and the devices both access
  the HostDeviceVector read-only
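
A sketch of what that buys, assuming a CUDA build and a vector already sharded onto device 0 (e.g. via Reshard()); GPUAccess::kRead is the access level this change introduces:

    void ReadOnBothSides(const xgboost::HostDeviceVector<xgboost::bst_float>& vec) {
      const auto& host_view = vec.ConstHostVector();     // read-only host access
      auto device_view      = vec.ConstDeviceSpan(0);    // read-only device access
      // Both sides requested only GPUAccess::kRead, so neither access revokes the
      // other; after the initial transfer no further host<->device copies happen.
      bool host_still_valid = vec.HostCanAccess(xgboost::GPUAccess::kRead);  // true
      (void)host_view; (void)device_view; (void)host_still_valid;
    }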

* Fixed linter and test errors.

- updated the lz4 plugin
- added ConstDeviceSpan to HostDeviceVector
- using device % dh::NVisibleDevices() for the physical device number,
  e.g. in calls to cudaSetDevice()
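
The mapping mentioned in the last item is essentially the following; dh::NVisibleDevices() and dh::safe_cuda() are xgboost-internal helpers (src/common/device_helpers.cuh), and the wrapper name is hypothetical:

    // Map a logical device id to a physical CUDA device before switching to it.
    inline void SetPhysicalDevice(int device) {
      dh::safe_cuda(cudaSetDevice(device % dh::NVisibleDevices()));
    }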

* Fixed explicit template instantiation errors for HostDeviceVector.

- replaced HostDeviceVector<unsigned int> with HostDeviceVector<int>

* Fixed HostDeviceVector tests that require multiple GPUs.

- added a mock set device handler; when set, it is called instead of cudaSetDevice()
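
A sketch of how a test can use this hook (SetCudaSetDeviceHandler() is declared only for .cu files); the recording logic below is illustrative, not the actual test code:

    static int last_requested_device = -1;
    static void RecordDevice(int device) { last_requested_device = device; }

    void WithMockSetDevice() {
      xgboost::SetCudaSetDeviceHandler(&RecordDevice);  // called instead of cudaSetDevice()
      // ... exercise HostDeviceVector across several (possibly fake) devices,
      //     then inspect last_requested_device ...
      xgboost::SetCudaSetDeviceHandler(nullptr);        // restore the real cudaSetDevice()
    }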

This commit is contained in:
Andy Adinets 2018-08-30 04:28:47 +02:00 committed by Rory Mitchell
parent 58d783df16
commit 72cd1517d6
45 changed files with 1141 additions and 560 deletions

View File

@ -17,6 +17,8 @@
#include "./base.h"
#include "../../src/common/span.h"
#include "../../src/common/host_device_vector.h"
namespace xgboost {
// forward declare learner.
class LearnerImpl;
@ -41,7 +43,7 @@ class MetaInfo {
/*! \brief number of nonzero entries in the data */
uint64_t num_nonzero_{0};
/*! \brief label of each instance */
std::vector<bst_float> labels_;
HostDeviceVector<bst_float> labels_;
/*!
* \brief specified root index of each instance,
* can be used for multi task setting
@ -53,7 +55,7 @@ class MetaInfo {
*/
std::vector<bst_uint> group_ptr_;
/*! \brief weights of each instance, optional */
std::vector<bst_float> weights_;
HostDeviceVector<bst_float> weights_;
/*! \brief session-id of each instance, optional */
std::vector<uint64_t> qids_;
/*!
@ -61,7 +63,7 @@ class MetaInfo {
* if specified, xgboost will start from this init margin
* can be used to specify initial prediction to boost from.
*/
std::vector<bst_float> base_margin_;
HostDeviceVector<bst_float> base_margin_;
/*! \brief version flag, used to check version of this info */
static const int kVersion = 2;
/*! \brief version that introduced qid field */
@ -74,7 +76,7 @@ class MetaInfo {
* \return The weight.
*/
inline bst_float GetWeight(size_t i) const {
return weights_.size() != 0 ? weights_[i] : 1.0f;
return weights_.Size() != 0 ? weights_.HostVector()[i] : 1.0f;
}
/*!
* \brief Get the root index of i-th instance.
@ -86,12 +88,12 @@ class MetaInfo {
}
/*! \brief get sorted indexes (argsort) of labels by absolute value (used by cox loss) */
inline const std::vector<size_t>& LabelAbsSort() const {
if (label_order_cache_.size() == labels_.size()) {
if (label_order_cache_.size() == labels_.Size()) {
return label_order_cache_;
}
label_order_cache_.resize(labels_.size());
label_order_cache_.resize(labels_.Size());
std::iota(label_order_cache_.begin(), label_order_cache_.end(), 0);
const auto l = labels_;
const auto& l = labels_.HostVector();
XGBOOST_PARALLEL_SORT(label_order_cache_.begin(), label_order_cache_.end(),
[&l](size_t i1, size_t i2) {return std::abs(l[i1]) < std::abs(l[i2]);});
@ -151,9 +153,9 @@ struct Entry {
*/
class SparsePage {
public:
std::vector<size_t> offset;
HostDeviceVector<size_t> offset;
/*! \brief the data of the segments */
std::vector<Entry> data;
HostDeviceVector<Entry> data;
size_t base_rowid;
@ -162,8 +164,10 @@ class SparsePage {
/*! \brief get i-th row from the batch */
inline Inst operator[](size_t i) const {
return {data.data() + offset[i],
static_cast<Inst::index_type>(offset[i + 1] - offset[i])};
const auto& data_vec = data.HostVector();
const auto& offset_vec = offset.HostVector();
return {data_vec.data() + offset_vec[i],
static_cast<Inst::index_type>(offset_vec[i + 1] - offset_vec[i])};
}
/*! \brief constructor */
@ -172,18 +176,19 @@ class SparsePage {
}
/*! \return number of instance in the page */
inline size_t Size() const {
return offset.size() - 1;
return offset.Size() - 1;
}
/*! \return estimation of memory cost of this page */
inline size_t MemCostBytes() const {
return offset.size() * sizeof(size_t) + data.size() * sizeof(Entry);
return offset.Size() * sizeof(size_t) + data.Size() * sizeof(Entry);
}
/*! \brief clear the page */
inline void Clear() {
base_rowid = 0;
offset.clear();
offset.push_back(0);
data.clear();
auto& offset_vec = offset.HostVector();
offset_vec.clear();
offset_vec.push_back(0);
data.HostVector().clear();
}
/*!
@ -191,33 +196,39 @@ class SparsePage {
* \param batch the row batch.
*/
inline void Push(const dmlc::RowBlock<uint32_t>& batch) {
data.reserve(data.size() + batch.offset[batch.size] - batch.offset[0]);
offset.reserve(offset.size() + batch.size);
auto& data_vec = data.HostVector();
auto& offset_vec = offset.HostVector();
data_vec.reserve(data.Size() + batch.offset[batch.size] - batch.offset[0]);
offset_vec.reserve(offset.Size() + batch.size);
CHECK(batch.index != nullptr);
for (size_t i = 0; i < batch.size; ++i) {
offset.push_back(offset.back() + batch.offset[i + 1] - batch.offset[i]);
offset_vec.push_back(offset_vec.back() + batch.offset[i + 1] - batch.offset[i]);
}
for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
uint32_t index = batch.index[i];
bst_float fvalue = batch.value == nullptr ? 1.0f : batch.value[i];
data.emplace_back(index, fvalue);
data_vec.emplace_back(index, fvalue);
}
CHECK_EQ(offset.back(), data.size());
CHECK_EQ(offset_vec.back(), data.Size());
}
/*!
* \brief Push a sparse page
* \param batch the row page
*/
inline void Push(const SparsePage &batch) {
size_t top = offset.back();
data.resize(top + batch.data.size());
std::memcpy(dmlc::BeginPtr(data) + top,
dmlc::BeginPtr(batch.data),
sizeof(Entry) * batch.data.size());
size_t begin = offset.size();
offset.resize(begin + batch.Size());
auto& data_vec = data.HostVector();
auto& offset_vec = offset.HostVector();
const auto& batch_offset_vec = batch.offset.HostVector();
const auto& batch_data_vec = batch.data.HostVector();
size_t top = offset_vec.back();
data_vec.resize(top + batch.data.Size());
std::memcpy(dmlc::BeginPtr(data_vec) + top,
dmlc::BeginPtr(batch_data_vec),
sizeof(Entry) * batch.data.Size());
size_t begin = offset.Size();
offset_vec.resize(begin + batch.Size());
for (size_t i = 0; i < batch.Size(); ++i) {
offset[i + begin] = top + batch.offset[i + 1];
offset_vec[i + begin] = top + batch_offset_vec[i + 1];
}
}
/*!
@ -225,20 +236,21 @@ class SparsePage {
* \param inst an instance row
*/
inline void Push(const Inst &inst) {
offset.push_back(offset.back() + inst.size());
size_t begin = data.size();
data.resize(begin + inst.size());
auto& data_vec = data.HostVector();
auto& offset_vec = offset.HostVector();
offset_vec.push_back(offset_vec.back() + inst.size());
size_t begin = data_vec.size();
data_vec.resize(begin + inst.size());
if (inst.size() != 0) {
std::memcpy(dmlc::BeginPtr(data) + begin, inst.data(),
std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.data(),
sizeof(Entry) * inst.size());
}
}
size_t Size() { return offset.size() - 1; }
size_t Size() { return offset.Size() - 1; }
};
/*!
* \brief This is data structure that user can pass to DMatrix::Create
* to create a DMatrix for training, user can create this data structure

View File

@ -44,7 +44,7 @@ class ObjFunction {
* \param iteration current iteration number.
* \param out_gpair output of get gradient, saves gradient and second order gradient in
*/
virtual void GetGradient(HostDeviceVector<bst_float>* preds,
virtual void GetGradient(const HostDeviceVector<bst_float>& preds,
const MetaInfo& info,
int iteration,
HostDeviceVector<GradientPair>* out_gpair) = 0;

View File

@ -33,21 +33,22 @@ class MyLogistic : public ObjFunction {
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
param_.InitAllowUnknown(args);
}
void GetGradient(HostDeviceVector<bst_float> *preds,
void GetGradient(const HostDeviceVector<bst_float> &preds,
const MetaInfo &info,
int iter,
HostDeviceVector<GradientPair> *out_gpair) override {
out_gpair->Resize(preds->Size());
std::vector<bst_float>& preds_h = preds->HostVector();
out_gpair->Resize(preds.Size());
const std::vector<bst_float>& preds_h = preds.HostVector();
std::vector<GradientPair>& out_gpair_h = out_gpair->HostVector();
const std::vector<bst_float>& labels_h = info.labels_.HostVector();
for (size_t i = 0; i < preds_h.size(); ++i) {
bst_float w = info.GetWeight(i);
// scale the negative examples!
if (info.labels_[i] == 0.0f) w *= param_.scale_neg_weight;
if (labels_h[i] == 0.0f) w *= param_.scale_neg_weight;
// logistic transformation
bst_float p = 1.0f / (1.0f + std::exp(-preds_h[i]));
// this is the gradient
bst_float grad = (p - info.labels_[i]) * w;
bst_float grad = (p - labels_h[i]) * w;
// this is the second order gradient
bst_float hess = p * (1.0f - p) * w;
out_gpair_h.at(i) = GradientPair(grad, hess);

View File

@ -177,15 +177,17 @@ class SparsePageLZ4Format : public SparsePageFormat {
}
bool Read(SparsePage* page, dmlc::SeekStream* fi) override {
if (!fi->Read(&(page->offset))) return false;
CHECK_NE(page->offset.size(), 0) << "Invalid SparsePage file";
auto& offset_vec = page->offset.HostVector();
auto& data_vec = page->data.HostVector();
if (!fi->Read(&(offset_vec))) return false;
CHECK_NE(offset_vec.size(), 0) << "Invalid SparsePage file";
this->LoadIndexValue(fi);
page->data.resize(page->offset.back());
data_vec.resize(offset_vec.back());
CHECK_EQ(index_.data.size(), value_.data.size());
CHECK_EQ(index_.data.size(), page->data.size());
for (size_t i = 0; i < page->data.size(); ++i) {
page->data[i] = Entry(index_.data[i] + min_index_, value_.data[i]);
CHECK_EQ(index_.data.size(), data_vec.size());
for (size_t i = 0; i < data_vec.size(); ++i) {
data_vec[i] = Entry(index_.data[i] + min_index_, value_.data[i]);
}
return true;
}
@ -195,24 +197,25 @@ class SparsePageLZ4Format : public SparsePageFormat {
const std::vector<bst_uint>& sorted_index_set) override {
if (!fi->Read(&disk_offset_)) return false;
this->LoadIndexValue(fi);
page->offset.clear();
page->offset.push_back(0);
auto& offset_vec = page->offset.HostVector();
auto& data_vec = page->data.HostVector();
offset_vec.clear();
offset_vec.push_back(0);
for (bst_uint cid : sorted_index_set) {
page->offset.push_back(
page->offset.back() + disk_offset_[cid + 1] - disk_offset_[cid]);
offset_vec.push_back(
offset_vec.back() + disk_offset_[cid + 1] - disk_offset_[cid]);
}
page->data.resize(page->offset.back());
data_vec.resize(offset_vec.back());
CHECK_EQ(index_.data.size(), value_.data.size());
CHECK_EQ(index_.data.size(), disk_offset_.back());
for (size_t i = 0; i < sorted_index_set.size(); ++i) {
bst_uint cid = sorted_index_set[i];
size_t dst_begin = page->offset[i];
size_t dst_begin = offset_vec[i];
size_t src_begin = disk_offset_[cid];
size_t num = disk_offset_[cid + 1] - disk_offset_[cid];
for (size_t j = 0; j < num; ++j) {
page->data[dst_begin + j] = Entry(
data_vec[dst_begin + j] = Entry(
index_.data[src_begin + j] + min_index_, value_.data[src_begin + j]);
}
}
@ -220,22 +223,24 @@ class SparsePageLZ4Format : public SparsePageFormat {
}
void Write(const SparsePage& page, dmlc::Stream* fo) override {
CHECK(page.offset.size() != 0 && page.offset[0] == 0);
CHECK_EQ(page.offset.back(), page.data.size());
fo->Write(page.offset);
const auto& offset_vec = page.offset.HostVector();
const auto& data_vec = page.data.HostVector();
CHECK(offset_vec.size() != 0 && offset_vec[0] == 0);
CHECK_EQ(offset_vec.back(), data_vec.size());
fo->Write(offset_vec);
min_index_ = page.base_rowid;
fo->Write(&min_index_, sizeof(min_index_));
index_.data.resize(page.data.size());
value_.data.resize(page.data.size());
index_.data.resize(data_vec.size());
value_.data.resize(data_vec.size());
for (size_t i = 0; i < page.data.size(); ++i) {
bst_uint idx = page.data[i].index - min_index_;
for (size_t i = 0; i < data_vec.size(); ++i) {
bst_uint idx = data_vec[i].index - min_index_;
CHECK_LE(idx, static_cast<bst_uint>(std::numeric_limits<StorageIndex>::max()))
<< "The storage index is chosen to limited to smaller equal than "
<< std::numeric_limits<StorageIndex>::max()
<< "min_index=" << min_index_;
index_.data[i] = static_cast<StorageIndex>(idx);
value_.data[i] = page.data[i].fvalue;
value_.data[i] = data_vec[i].fvalue;
}
index_.InitCompressChunks(kChunkSize, kMaxChunk);
@ -259,7 +264,7 @@ class SparsePageLZ4Format : public SparsePageFormat {
raw_bytes_value_ += value_.RawBytes();
encoded_bytes_index_ += index_.EncodedBytes();
encoded_bytes_value_ += value_.EncodedBytes();
raw_bytes_ += page.offset.size() * sizeof(size_t);
raw_bytes_ += offset_vec.size() * sizeof(size_t);
}
inline void LoadIndexValue(dmlc::SeekStream* fi) {

View File

@ -250,20 +250,22 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
API_BEGIN();
data::SimpleCSRSource& mat = *source;
mat.page_.offset.reserve(nindptr);
mat.page_.data.reserve(nelem);
mat.page_.offset.resize(1);
mat.page_.offset[0] = 0;
auto& offset_vec = mat.page_.offset.HostVector();
auto& data_vec = mat.page_.data.HostVector();
offset_vec.reserve(nindptr);
data_vec.reserve(nelem);
offset_vec.resize(1);
offset_vec[0] = 0;
size_t num_column = 0;
for (size_t i = 1; i < nindptr; ++i) {
for (size_t j = indptr[i - 1]; j < indptr[i]; ++j) {
if (!common::CheckNAN(data[j])) {
// automatically skip nan.
mat.page_.data.emplace_back(Entry(indices[j], data[j]));
data_vec.emplace_back(Entry(indices[j], data[j]));
num_column = std::max(num_column, static_cast<size_t>(indices[j] + 1));
}
}
mat.page_.offset.push_back(mat.page_.data.size());
offset_vec.push_back(mat.page_.data.Size());
}
mat.info.num_col_ = num_column;
@ -273,7 +275,7 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
mat.info.num_col_ = num_col;
}
mat.info.num_row_ = nindptr - 1;
mat.info.num_nonzero_ = mat.page_.data.size();
mat.info.num_nonzero_ = mat.page_.data.Size();
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
API_END();
}
@ -305,7 +307,9 @@ XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
// FIXME: User should be able to control number of threads
const int nthread = omp_get_max_threads();
data::SimpleCSRSource& mat = *source;
common::ParallelGroupBuilder<Entry> builder(&mat.page_.offset, &mat.page_.data);
auto& offset_vec = mat.page_.offset.HostVector();
auto& data_vec = mat.page_.data.HostVector();
common::ParallelGroupBuilder<Entry> builder(&offset_vec, &data_vec);
builder.InitBudget(0, nthread);
size_t ncol = nindptr - 1; // NOLINT(*)
#pragma omp parallel for schedule(static)
@ -329,15 +333,16 @@ XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
}
}
}
mat.info.num_row_ = mat.page_.offset.size() - 1;
mat.info.num_row_ = mat.page_.offset.Size() - 1;
if (num_row > 0) {
CHECK_LE(mat.info.num_row_, num_row);
// provision for empty rows at the bottom of matrix
auto& offset_vec = mat.page_.offset.HostVector();
for (uint64_t i = mat.info.num_row_; i < static_cast<uint64_t>(num_row); ++i) {
mat.page_.offset.push_back(mat.page_.offset.back());
offset_vec.push_back(offset_vec.back());
}
mat.info.num_row_ = num_row;
CHECK_EQ(mat.info.num_row_, mat.page_.offset.size() - 1); // sanity check
CHECK_EQ(mat.info.num_row_, offset_vec.size() - 1); // sanity check
}
mat.info.num_col_ = ncol;
mat.info.num_nonzero_ = nelem;
@ -368,7 +373,9 @@ XGB_DLL int XGDMatrixCreateFromMat(const bst_float* data,
API_BEGIN();
data::SimpleCSRSource& mat = *source;
mat.page_.offset.resize(1+nrow);
auto& offset_vec = mat.page_.offset.HostVector();
auto& data_vec = mat.page_.data.HostVector();
offset_vec.resize(1+nrow);
bool nan_missing = common::CheckNAN(missing);
mat.info.num_row_ = nrow;
mat.info.num_col_ = ncol;
@ -388,9 +395,9 @@ XGB_DLL int XGDMatrixCreateFromMat(const bst_float* data,
}
}
}
mat.page_.offset[i+1] = mat.page_.offset[i] + nelem;
offset_vec[i+1] = offset_vec[i] + nelem;
}
mat.page_.data.resize(mat.page_.data.size() + mat.page_.offset.back());
data_vec.resize(mat.page_.data.Size() + offset_vec.back());
data = data0;
for (xgboost::bst_ulong i = 0; i < nrow; ++i, data += ncol) {
@ -399,14 +406,14 @@ XGB_DLL int XGDMatrixCreateFromMat(const bst_float* data,
if (common::CheckNAN(data[j])) {
} else {
if (nan_missing || data[j] != missing) {
mat.page_.data[mat.page_.offset[i] + matj] = Entry(j, data[j]);
data_vec[offset_vec[i] + matj] = Entry(j, data[j]);
++matj;
}
}
}
}
mat.info.num_nonzero_ = mat.page_.data.size();
mat.info.num_nonzero_ = mat.page_.data.Size();
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
API_END();
}
@ -461,7 +468,9 @@ XGB_DLL int XGDMatrixCreateFromMat_omp(const bst_float* data, // NOLINT
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
data::SimpleCSRSource& mat = *source;
mat.page_.offset.resize(1+nrow);
auto& offset_vec = mat.page_.offset.HostVector();
auto& data_vec = mat.page_.data.HostVector();
offset_vec.resize(1+nrow);
mat.info.num_row_ = nrow;
mat.info.num_col_ = ncol;
@ -487,7 +496,7 @@ XGB_DLL int XGDMatrixCreateFromMat_omp(const bst_float* data, // NOLINT
++nelem;
}
}
mat.page_.offset[i+1] = nelem;
offset_vec[i+1] = nelem;
}
}
// Inform about any NaNs and resize data matrix
@ -496,8 +505,8 @@ XGB_DLL int XGDMatrixCreateFromMat_omp(const bst_float* data, // NOLINT
}
// do cumulative sum (to avoid otherwise need to copy)
PrefixSum(&mat.page_.offset[0], mat.page_.offset.size());
mat.page_.data.resize(mat.page_.data.size() + mat.page_.offset.back());
PrefixSum(&offset_vec[0], offset_vec.size());
data_vec.resize(mat.page_.data.Size() + offset_vec.back());
// Fill data matrix (now that know size, no need for slow push_back())
#pragma omp parallel num_threads(nthread)
@ -508,7 +517,7 @@ XGB_DLL int XGDMatrixCreateFromMat_omp(const bst_float* data, // NOLINT
for (xgboost::bst_ulong j = 0; j < ncol; ++j) {
if (common::CheckNAN(data[ncol * i + j])) {
} else if (nan_missing || data[ncol * i + j] != missing) {
mat.page_.data[mat.page_.offset[i] + matj] =
data_vec[offset_vec[i] + matj] =
Entry(j, data[ncol * i + j]);
++matj;
}
@ -518,7 +527,7 @@ XGB_DLL int XGDMatrixCreateFromMat_omp(const bst_float* data, // NOLINT
// restore omp state
omp_set_num_threads(nthread_orig);
mat.info.num_nonzero_ = mat.page_.data.size();
mat.info.num_nonzero_ = mat.page_.data.Size();
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
API_END();
}
@ -611,10 +620,11 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data, const char** feature_stypes,
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
data::SimpleCSRSource& mat = *source;
mat.page_.offset.resize(1 + nrow);
mat.page_.offset.Resize(1 + nrow);
mat.info.num_row_ = nrow;
mat.info.num_col_ = ncol;
auto& page_offset = mat.page_.offset.HostVector();
#pragma omp parallel num_threads(nthread)
{
// Count elements per row, column by column
@ -624,15 +634,17 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data, const char** feature_stypes,
for (omp_ulong i = 0; i < nrow; ++i) {
float val = DTGetValue(data[j], dtype, i);
if (!std::isnan(val)) {
mat.page_.offset[i + 1]++;
page_offset[i + 1]++;
}
}
}
}
// do cumulative sum (to avoid otherwise need to copy)
PrefixSum(&mat.page_.offset[0], mat.page_.offset.size());
PrefixSum(&page_offset[0], page_offset.size());
mat.page_.data.resize(mat.page_.data.size() + mat.page_.offset.back());
mat.page_.data.Resize(mat.page_.data.Size() + page_offset.back());
auto& page_data = mat.page_.data.HostVector();
// Fill data matrix (now that know size, no need for slow push_back())
std::vector<size_t> position(nrow);
@ -644,7 +656,7 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data, const char** feature_stypes,
for (omp_ulong i = 0; i < nrow; ++i) {
float val = DTGetValue(data[j], dtype, i);
if (!std::isnan(val)) {
mat.page_.data[mat.page_.offset[i] + position[i]] = Entry(j, val);
page_data[page_offset[i] + position[i]] = Entry(j, val);
position[i]++;
}
}
@ -654,7 +666,7 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data, const char** feature_stypes,
// restore omp state
omp_set_num_threads(nthread_orig);
mat.info.num_nonzero_ = mat.page_.data.size();
mat.info.num_nonzero_ = mat.page_.data.Size();
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
API_END();
}
@ -683,23 +695,32 @@ XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle,
CHECK(iter->Next());
const auto& batch = iter->Value();
const auto& src_labels = src.info.labels_.ConstHostVector();
const auto& src_weights = src.info.weights_.ConstHostVector();
const auto& src_base_margin = src.info.base_margin_.ConstHostVector();
auto& ret_labels = ret.info.labels_.HostVector();
auto& ret_weights = ret.info.weights_.HostVector();
auto& ret_base_margin = ret.info.base_margin_.HostVector();
auto& offset_vec = ret.page_.offset.HostVector();
auto& data_vec = ret.page_.data.HostVector();
for (xgboost::bst_ulong i = 0; i < len; ++i) {
const int ridx = idxset[i];
auto inst = batch[ridx];
CHECK_LT(static_cast<xgboost::bst_ulong>(ridx), batch.Size());
ret.page_.data.insert(ret.page_.data.end(), inst.data(),
data_vec.insert(data_vec.end(), inst.data(),
inst.data() + inst.size());
ret.page_.offset.push_back(ret.page_.offset.back() + inst.size());
offset_vec.push_back(offset_vec.back() + inst.size());
ret.info.num_nonzero_ += inst.size();
if (src.info.labels_.size() != 0) {
ret.info.labels_.push_back(src.info.labels_[ridx]);
if (src_labels.size() != 0) {
ret_labels.push_back(src_labels[ridx]);
}
if (src.info.weights_.size() != 0) {
ret.info.weights_.push_back(src.info.weights_[ridx]);
if (src_weights.size() != 0) {
ret_weights.push_back(src_weights[ridx]);
}
if (src.info.base_margin_.size() != 0) {
ret.info.base_margin_.push_back(src.info.base_margin_[ridx]);
if (src_base_margin.size() != 0) {
ret_base_margin.push_back(src_base_margin[ridx]);
}
if (src.info.root_index_.size() != 0) {
ret.info.root_index_.push_back(src.info.root_index_[ridx]);
@ -771,11 +792,11 @@ XGB_DLL int XGDMatrixGetFloatInfo(const DMatrixHandle handle,
const MetaInfo& info = static_cast<std::shared_ptr<DMatrix>*>(handle)->get()->Info();
const std::vector<bst_float>* vec = nullptr;
if (!std::strcmp(field, "label")) {
vec = &info.labels_;
vec = &info.labels_.HostVector();
} else if (!std::strcmp(field, "weight")) {
vec = &info.weights_;
vec = &info.weights_.HostVector();
} else if (!std::strcmp(field, "base_margin")) {
vec = &info.base_margin_;
vec = &info.base_margin_.HostVector();
} else {
LOG(FATAL) << "Unknown float field name " << field;
}

View File

@ -332,7 +332,7 @@ void CLIPredict(const CLIParam& param) {
std::unique_ptr<dmlc::Stream> fo(
dmlc::Stream::Create(param.name_pred.c_str(), "w"));
dmlc::ostream os(fo.get());
for (bst_float p : preds.HostVector()) {
for (bst_float p : preds.ConstHostVector()) {
os << std::setprecision(std::numeric_limits<bst_float>::max_digits10 + 2)
<< p << '\n';
}

View File

@ -35,6 +35,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
auto iter = p_fmat->RowIterator();
iter->BeforeFirst();
const auto& weights = info.weights_.HostVector();
while (iter->Next()) {
auto &batch = iter->Value();
#pragma omp parallel num_threads(nthread)
@ -50,7 +51,8 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
SparsePage::Inst inst = batch[i];
for (auto& ins : inst) {
if (ins.index >= begin && ins.index < end) {
sketchs[ins.index].Push(ins.fvalue, info.GetWeight(ridx));
sketchs[ins.index].Push(ins.fvalue,
weights.size() > 0 ? weights[ridx] : 1.0f);
}
}
}

View File

@ -118,7 +118,7 @@ struct GPUSketcher {
void Init(const SparsePage& row_batch, const MetaInfo& info) {
num_cols_ = info.num_col_;
has_weights_ = info.weights_.size() > 0;
has_weights_ = info.weights_.Size() > 0;
// find the batch size
if (param_.gpu_batch_nrows == 0) {
@ -282,19 +282,23 @@ struct GPUSketcher {
size_t batch_row_end = std::min((gpu_batch + 1) * gpu_batch_nrows_,
static_cast<size_t>(n_rows_));
size_t batch_nrows = batch_row_end - batch_row_begin;
size_t n_entries =
row_batch.offset[row_begin_ + batch_row_end] -
row_batch.offset[row_begin_ + batch_row_begin];
const auto& offset_vec = row_batch.offset.HostVector();
const auto& data_vec = row_batch.data.HostVector();
size_t n_entries = offset_vec[row_begin_ + batch_row_end] -
offset_vec[row_begin_ + batch_row_begin];
// copy the batch to the GPU
dh::safe_cuda
(cudaMemcpy(entries_.data().get(),
&row_batch.data[row_batch.offset[row_begin_ + batch_row_begin]],
data_vec.data() + offset_vec[row_begin_ + batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault));
// copy the weights if necessary
if (has_weights_) {
const auto& weights_vec = info.weights_.HostVector();
dh::safe_cuda
(cudaMemcpy(weights_.data().get(),
info.weights_.data() + row_begin_ + batch_row_begin,
weights_vec.data() + row_begin_ + batch_row_begin,
batch_nrows * sizeof(bst_float), cudaMemcpyDefault));
}
@ -310,7 +314,7 @@ struct GPUSketcher {
row_ptrs_.data().get() + batch_row_begin,
has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
gpu_batch_nrows_, num_cols_,
row_batch.offset[row_begin_ + batch_row_begin], batch_nrows);
offset_vec[row_begin_ + batch_row_begin], batch_nrows);
dh::safe_cuda(cudaGetLastError()); // NOLINT
dh::safe_cuda(cudaDeviceSynchronize()); // NOLINT
@ -331,13 +335,11 @@ struct GPUSketcher {
void Sketch(const SparsePage& row_batch, const MetaInfo& info) {
// copy rows to the device
dh::safe_cuda(cudaSetDevice(device_));
const auto& offset_vec = row_batch.offset.HostVector();
row_ptrs_.resize(n_rows_ + 1);
thrust::copy(row_batch.offset.data() + row_begin_,
row_batch.offset.data() + row_end_ + 1,
row_ptrs_.begin());
thrust::copy(offset_vec.data() + row_begin_,
offset_vec.data() + row_end_ + 1, row_ptrs_.begin());
size_t gpu_nbatches = dh::DivRoundUp(n_rows_, gpu_batch_nrows_);
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
SketchBatch(row_batch, info, gpu_batch);
}

View File

@ -6,7 +6,8 @@
// dummy implementation of HostDeviceVector in case CUDA is not used
#include <xgboost/base.h>
#include <xgboost/data.h>
#include <cstdint>
#include <utility>
#include "./host_device_vector.h"
@ -14,25 +15,27 @@ namespace xgboost {
template <typename T>
struct HostDeviceVectorImpl {
explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v) {}
HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init) {}
explicit HostDeviceVectorImpl(std::vector<T> init) : data_h_(std::move(init)) {}
explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v), distribution_() {}
HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init), distribution_() {}
explicit HostDeviceVectorImpl(std::vector<T> init) : data_h_(std::move(init)), distribution_() {}
std::vector<T> data_h_;
GPUDistribution distribution_;
};
template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUSet devices) : impl_(nullptr) {
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUDistribution distribution)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUSet devices)
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUDistribution distribution)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUSet devices)
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUDistribution distribution)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
@ -44,33 +47,69 @@ HostDeviceVector<T>::~HostDeviceVector() {
delete tmp;
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
}
template <typename T>
HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
if (this == &other) {
return *this;
}
delete impl_;
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
return *this;
}
template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->data_h_.size(); }
template <typename T>
GPUSet HostDeviceVector<T>::Devices() const { return GPUSet::Empty(); }
template <typename T>
const GPUDistribution& HostDeviceVector<T>::Distribution() const {
return impl_->distribution_;
}
template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; }
template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
return nullptr;
}
template <typename T>
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
return common::Span<T>();
}
template <typename T>
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
return common::Span<const T>();
}
template <typename T>
std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->data_h_; }
template <typename T>
const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
return impl_->data_h_;
}
template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
impl_->data_h_.resize(new_size, v);
}
template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) { return 0; }
size_t HostDeviceVector<T>::DeviceStart(int device) const { return 0; }
template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) { return 0; }
size_t HostDeviceVector<T>::DeviceSize(int device) const { return 0; }
template <typename T>
void HostDeviceVector<T>::Fill(T v) {
@ -78,9 +117,9 @@ void HostDeviceVector<T>::Fill(T v) {
}
template <typename T>
void HostDeviceVector<T>::Copy(HostDeviceVector<T>* other) {
CHECK_EQ(Size(), other->Size());
std::copy(other->HostVector().begin(), other->HostVector().end(), HostVector().begin());
void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other) {
CHECK_EQ(Size(), other.Size());
std::copy(other.HostVector().begin(), other.HostVector().end(), HostVector().begin());
}
template <typename T>
@ -96,13 +135,27 @@ void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
}
template <typename T>
void HostDeviceVector<T>::Reshard(GPUSet devices) { }
bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
return true;
}
template <typename T>
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
return false;
}
template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution& distribution) const { }
template <typename T>
void HostDeviceVector<T>::Reshard(GPUSet devices) const { }
// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<unsigned int>;
template class HostDeviceVector<int>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<size_t>;
} // namespace xgboost

View File

@ -2,119 +2,159 @@
* Copyright 2017 XGBoost contributors
*/
#include <thrust/fill.h>
#include "./host_device_vector.h"
#include <thrust/fill.h>
#include <xgboost/data.h>
#include <algorithm>
#include <cstdint>
#include <mutex>
#include "./device_helpers.cuh"
namespace xgboost {
// the handler to call instead of cudaSetDevice; only used for testing
static void (*cudaSetDeviceHandler)(int) = nullptr; // NOLINT
void SetCudaSetDeviceHandler(void (*handler)(int)) {
cudaSetDeviceHandler = handler;
}
// wrapper over access with useful methods
class Permissions {
GPUAccess access_;
explicit Permissions(GPUAccess access) : access_(access) {}
public:
Permissions() : access_(GPUAccess::kNone) {}
explicit Permissions(bool perm)
: access_(perm ? GPUAccess::kWrite : GPUAccess::kNone) {}
bool CanRead() const { return access_ >= kRead; }
bool CanWrite() const { return access_ == kWrite; }
bool CanAccess(GPUAccess access) const { return access_ >= access; }
void Grant(GPUAccess access) { access_ = std::max(access_, access); }
void DenyComplementary(GPUAccess compl_access) {
access_ = std::min(access_, GPUAccess::kWrite - compl_access);
}
Permissions Complementary() const {
return Permissions(GPUAccess::kWrite - access_);
}
};
template <typename T>
struct HostDeviceVectorImpl {
struct DeviceShard {
DeviceShard() : index_(-1), device_(-1), start_(0), on_d_(false), vec_(nullptr) {}
static size_t ShardStart(size_t size, int ndevices, int index) {
size_t portion = dh::DivRoundUp(size, ndevices);
size_t begin = index * portion;
begin = begin > size ? size : begin;
return begin;
}
static size_t ShardSize(size_t size, int ndevices, int index) {
size_t portion = dh::DivRoundUp(size, ndevices);
size_t begin = index * portion, end = (index + 1) * portion;
begin = begin > size ? size : begin;
end = end > size ? size : end;
return end - begin;
}
DeviceShard()
: index_(-1), proper_size_(0), device_(-1), start_(0), perm_d_(false),
cached_size_(~0), vec_(nullptr) {}
void Init(HostDeviceVectorImpl<T>* vec, int device) {
if (vec_ == nullptr) { vec_ = vec; }
CHECK_EQ(vec, vec_);
device_ = device;
index_ = vec_->devices_.Index(device);
size_t size_h = vec_->Size();
int ndevices = vec_->devices_.Size();
start_ = ShardStart(size_h, ndevices, index_);
size_t size_d = ShardSize(size_h, ndevices, index_);
dh::safe_cuda(cudaSetDevice(device_));
data_.resize(size_d);
on_d_ = !vec_->on_h_;
index_ = vec_->distribution_.devices_.Index(device);
LazyResize(vec_->Size());
perm_d_ = vec_->perm_h_.Complementary();
}
void ScatterFrom(const T* begin) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice();
dh::safe_cuda(cudaSetDevice(device_));
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
dh::safe_cuda(cudaMemcpy(data_.data().get(), begin + start_,
data_.size() * sizeof(T), cudaMemcpyDefault));
}
void GatherTo(thrust::device_ptr<T> begin) {
LazySyncDevice();
dh::safe_cuda(cudaSetDevice(device_));
LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpy(begin.get() + start_, data_.data().get(),
data_.size() * sizeof(T), cudaMemcpyDefault));
proper_size_ * sizeof(T), cudaMemcpyDefault));
}
void Fill(T v) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice();
dh::safe_cuda(cudaSetDevice(device_));
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
thrust::fill(data_.begin(), data_.end(), v);
}
void Copy(DeviceShard* other) {
// TODO(canonizer): avoid full copy of host data for this (but not for other)
LazySyncDevice();
other->LazySyncDevice();
dh::safe_cuda(cudaSetDevice(device_));
LazySyncDevice(GPUAccess::kWrite);
other->LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpy(data_.data().get(), other->data_.data().get(),
data_.size() * sizeof(T), cudaMemcpyDefault));
}
void LazySyncHost() {
dh::safe_cuda(cudaSetDevice(device_));
void LazySyncHost(GPUAccess access) {
SetDevice();
dh::safe_cuda(cudaMemcpy(vec_->data_h_.data() + start_,
data_.data().get(), data_.size() * sizeof(T),
data_.data().get(), proper_size_ * sizeof(T),
cudaMemcpyDeviceToHost));
on_d_ = false;
perm_d_.DenyComplementary(access);
}
void LazySyncDevice() {
if (on_d_) { return; }
void LazyResize(size_t new_size) {
if (new_size == cached_size_) { return; }
// resize is required
int ndevices = vec_->distribution_.devices_.Size();
start_ = vec_->distribution_.ShardStart(new_size, index_);
proper_size_ = vec_->distribution_.ShardProperSize(new_size, index_);
size_t size_d = vec_->distribution_.ShardSize(new_size, index_);
SetDevice();
data_.resize(size_d);
cached_size_ = new_size;
}
void LazySyncDevice(GPUAccess access) {
if (perm_d_.CanAccess(access)) { return; }
if (perm_d_.CanRead()) {
// deny read to the host
perm_d_.Grant(access);
std::lock_guard<std::mutex> lock(vec_->mutex_);
vec_->perm_h_.DenyComplementary(access);
return;
}
// data is on the host
size_t size_h = vec_->data_h_.size();
int ndevices = vec_->devices_.Size();
start_ = ShardStart(size_h, ndevices, index_);
size_t size_d = ShardSize(size_h, ndevices, index_);
LazyResize(size_h);
SetDevice();
dh::safe_cuda(
cudaMemcpy(data_.data().get(), vec_->data_h_.data() + start_,
data_.size() * sizeof(T), cudaMemcpyHostToDevice));
perm_d_.Grant(access);
std::lock_guard<std::mutex> lock(vec_->mutex_);
vec_->perm_h_.DenyComplementary(access);
vec_->size_d_ = size_h;
}
void SetDevice() {
if (cudaSetDeviceHandler == nullptr) {
dh::safe_cuda(cudaSetDevice(device_));
data_.resize(size_d);
dh::safe_cuda(cudaMemcpy(data_.data().get(),
vec_->data_h_.data() + start_,
size_d * sizeof(T), cudaMemcpyHostToDevice));
on_d_ = true;
// this may cause a race condition if LazySyncDevice() is called
// from multiple threads in parallel;
// however, the race condition is benign, and will not cause problems
vec_->on_h_ = false;
vec_->size_d_ = vec_->data_h_.size();
} else {
(*cudaSetDeviceHandler)(device_);
}
}
int index_;
int device_;
thrust::device_vector<T> data_;
// cached vector size
size_t cached_size_;
size_t start_;
// true if there is an up-to-date copy of data on device, false otherwise
bool on_d_;
// size of the portion to copy back to the host
size_t proper_size_;
Permissions perm_d_;
HostDeviceVectorImpl<T>* vec_;
};
HostDeviceVectorImpl(size_t size, T v, GPUSet devices)
: devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) {
if (!devices.IsEmpty()) {
HostDeviceVectorImpl(size_t size, T v, GPUDistribution distribution)
: distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
if (!distribution_.IsEmpty()) {
size_d_ = size;
InitShards();
Fill(v);
@ -123,11 +163,16 @@ struct HostDeviceVectorImpl {
}
}
// required, as a new std::mutex has to be created
HostDeviceVectorImpl(const HostDeviceVectorImpl<T>& other)
: data_h_(other.data_h_), perm_h_(other.perm_h_), size_d_(other.size_d_),
distribution_(other.distribution_), mutex_(), shards_(other.shards_) {}
// Init can be std::vector<T> or std::initializer_list<T>
template <class Init>
HostDeviceVectorImpl(const Init& init, GPUSet devices)
: devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) {
if (!devices.IsEmpty()) {
HostDeviceVectorImpl(const Init& init, GPUDistribution distribution)
: distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
if (!distribution_.IsEmpty()) {
size_d_ = init.size();
InitShards();
Copy(init);
@ -137,58 +182,78 @@ struct HostDeviceVectorImpl {
}
void InitShards() {
int ndevices = devices_.Size();
int ndevices = distribution_.devices_.Size();
shards_.resize(ndevices);
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
shard.Init(this, devices_[i]);
shard.Init(this, distribution_.devices_[i]);
});
}
HostDeviceVectorImpl(const HostDeviceVectorImpl<T>&) = delete;
HostDeviceVectorImpl(HostDeviceVectorImpl<T>&&) = delete;
void operator=(const HostDeviceVectorImpl<T>&) = delete;
void operator=(HostDeviceVectorImpl<T>&&) = delete;
size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : size_d_; }
size_t Size() const { return on_h_ ? data_h_.size() : size_d_; }
GPUSet Devices() const { return distribution_.devices_; }
GPUSet Devices() const { return devices_; }
const GPUDistribution& Distribution() const { return distribution_; }
T* DevicePointer(int device) {
CHECK(devices_.Contains(device));
LazySyncDevice(device);
return shards_[devices_.Index(device)].data_.data().get();
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kWrite);
return shards_[distribution_.devices_.Index(device)].data_.data().get();
}
const T* ConstDevicePointer(int device) {
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return shards_[distribution_.devices_.Index(device)].data_.data().get();
}
common::Span<T> DeviceSpan(int device) {
CHECK(devices_.Contains(device));
LazySyncDevice(device);
return { shards_[devices_.Index(device)].data_.data().get(),
static_cast<typename common::Span<T>::index_type>(Size()) };
GPUSet devices = distribution_.devices_;
CHECK(devices.Contains(device));
LazySyncDevice(device, GPUAccess::kWrite);
return {shards_[devices.Index(device)].data_.data().get(),
static_cast<typename common::Span<T>::index_type>(Size())};
}
common::Span<const T> ConstDeviceSpan(int device) {
GPUSet devices = distribution_.devices_;
CHECK(devices.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return {shards_[devices.Index(device)].data_.data().get(),
static_cast<typename common::Span<const T>::index_type>(Size())};
}
size_t DeviceSize(int device) {
CHECK(devices_.Contains(device));
LazySyncDevice(device);
return shards_[devices_.Index(device)].data_.size();
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return shards_[distribution_.devices_.Index(device)].data_.size();
}
size_t DeviceStart(int device) {
CHECK(devices_.Contains(device));
LazySyncDevice(device);
return shards_[devices_.Index(device)].start_;
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return shards_[distribution_.devices_.Index(device)].start_;
}
thrust::device_ptr<T> tbegin(int device) { // NOLINT
return thrust::device_ptr<T>(DevicePointer(device));
}
thrust::device_ptr<const T> tcbegin(int device) { // NOLINT
return thrust::device_ptr<const T>(ConstDevicePointer(device));
}
thrust::device_ptr<T> tend(int device) { // NOLINT
return tbegin(device) + DeviceSize(device);
}
void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
thrust::device_ptr<const T> tcend(int device) { // NOLINT
return tcbegin(device) + DeviceSize(device);
}
void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
CHECK_EQ(end - begin, Size());
if (on_h_) {
if (perm_h_.CanWrite()) {
dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
(end - begin) * sizeof(T),
cudaMemcpyDeviceToHost));
@ -201,7 +266,7 @@ struct HostDeviceVectorImpl {
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
CHECK_EQ(end - begin, Size());
if (on_h_) {
if (perm_h_.CanWrite()) {
dh::safe_cuda(cudaMemcpy(begin.get(), data_h_.data(),
data_h_.size() * sizeof(T),
cudaMemcpyHostToDevice));
@ -211,7 +276,7 @@ struct HostDeviceVectorImpl {
}
void Fill(T v) {
if (on_h_) {
if (perm_h_.CanWrite()) {
std::fill(data_h_.begin(), data_h_.end(), v);
} else {
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.Fill(v); });
@ -220,10 +285,10 @@ struct HostDeviceVectorImpl {
void Copy(HostDeviceVectorImpl<T>* other) {
CHECK_EQ(Size(), other->Size());
if (on_h_ && other->on_h_) {
if (perm_h_.CanWrite() && other->perm_h_.CanWrite()) {
std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
} else {
CHECK(devices_ == other->devices_);
CHECK(distribution_ == other->distribution_);
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
shard.Copy(&other->shards_[i]);
});
@ -232,7 +297,7 @@ struct HostDeviceVectorImpl {
void Copy(const std::vector<T>& other) {
CHECK_EQ(Size(), other.size());
if (on_h_) {
if (perm_h_.CanWrite()) {
std::copy(other.begin(), other.end(), data_h_.begin());
} else {
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
@ -243,7 +308,7 @@ struct HostDeviceVectorImpl {
void Copy(std::initializer_list<T> other) {
CHECK_EQ(Size(), other.size());
if (on_h_) {
if (perm_h_.CanWrite()) {
std::copy(other.begin(), other.end(), data_h_.begin());
} else {
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
@ -253,72 +318,117 @@ struct HostDeviceVectorImpl {
}
std::vector<T>& HostVector() {
LazySyncHost();
LazySyncHost(GPUAccess::kWrite);
return data_h_;
}
void Reshard(GPUSet new_devices) {
if (devices_ == new_devices)
return;
CHECK(devices_.IsEmpty());
devices_ = new_devices;
const std::vector<T>& ConstHostVector() {
LazySyncHost(GPUAccess::kRead);
return data_h_;
}
void Reshard(const GPUDistribution& distribution) {
if (distribution_ == distribution) { return; }
CHECK(distribution_.IsEmpty());
distribution_ = distribution;
InitShards();
}
void Reshard(GPUSet new_devices) {
if (distribution_.Devices() == new_devices) { return; }
Reshard(GPUDistribution::Block(new_devices));
}
void Resize(size_t new_size, T v) {
if (new_size == Size())
return;
if (Size() == 0 && !devices_.IsEmpty()) {
if (new_size == Size()) { return; }
if (distribution_.IsFixedSize()) {
CHECK_EQ(new_size, distribution_.offsets_.back());
}
if (Size() == 0 && !distribution_.IsEmpty()) {
// fast on-device resize
on_h_ = false;
perm_h_ = Permissions(false);
size_d_ = new_size;
InitShards();
Fill(v);
} else {
// resize on host
LazySyncHost();
LazySyncHost(GPUAccess::kWrite);
data_h_.resize(new_size, v);
}
}
void LazySyncHost() {
if (on_h_)
void LazySyncHost(GPUAccess access) {
if (perm_h_.CanAccess(access)) { return; }
if (perm_h_.CanRead()) {
// data is present, just need to deny access to the device
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
shard.perm_d_.DenyComplementary(access);
});
perm_h_.Grant(access);
return;
if (data_h_.size() != size_d_)
data_h_.resize(size_d_);
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.LazySyncHost(); });
on_h_ = true;
}
if (data_h_.size() != size_d_) { data_h_.resize(size_d_); }
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
shard.LazySyncHost(access);
});
perm_h_.Grant(access);
}
void LazySyncDevice(int device) {
CHECK(devices_.Contains(device));
shards_[devices_.Index(device)].LazySyncDevice();
void LazySyncDevice(int device, GPUAccess access) {
GPUSet devices = distribution_.Devices();
CHECK(devices.Contains(device));
shards_[devices.Index(device)].LazySyncDevice(access);
}
bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); }
bool DeviceCanAccess(int device, GPUAccess access) {
GPUSet devices = distribution_.Devices();
if (!devices.Contains(device)) { return false; }
return shards_[devices.Index(device)].perm_d_.CanAccess(access);
}
std::vector<T> data_h_;
bool on_h_;
Permissions perm_h_;
// the total size of the data stored on the devices
size_t size_d_;
GPUSet devices_;
GPUDistribution distribution_;
// protects size_d_ and perm_h_ when updated from multiple threads
std::mutex mutex_;
std::vector<DeviceShard> shards_;
};
template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUSet devices)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v, devices);
HostDeviceVector<T>::HostDeviceVector
(size_t size, T v, GPUDistribution distribution) : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v, distribution);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUSet devices)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init, devices);
HostDeviceVector<T>::HostDeviceVector
(std::initializer_list<T> init, GPUDistribution distribution) : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUSet devices)
HostDeviceVector<T>::HostDeviceVector
(const std::vector<T>& init, GPUDistribution distribution) : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init, devices);
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
}
template <typename T>
HostDeviceVector<T>& HostDeviceVector<T>::operator=
(const HostDeviceVector<T>& other) {
if (this == &other) { return *this; }
delete impl_;
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
return *this;
}
template <typename T>
@ -335,7 +445,19 @@ template <typename T>
GPUSet HostDeviceVector<T>::Devices() const { return impl_->Devices(); }
template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) { return impl_->DevicePointer(device); }
const GPUDistribution& HostDeviceVector<T>::Distribution() const {
return impl_->Distribution();
}
template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) {
return impl_->DevicePointer(device);
}
template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
return impl_->ConstDevicePointer(device);
}
template <typename T>
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
@ -343,30 +465,49 @@ common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
}
template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) { return impl_->DeviceStart(device); }
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
return impl_->ConstDeviceSpan(device);
}
template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) { return impl_->DeviceSize(device); }
size_t HostDeviceVector<T>::DeviceStart(int device) const {
return impl_->DeviceStart(device);
}
template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) const {
return impl_->DeviceSize(device);
}
template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) { // NOLINT
return impl_->tbegin(device);
}
template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin(int device) const { // NOLINT
return impl_->tcbegin(device);
}
template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) { // NOLINT
return impl_->tend(device);
}
template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcend(int device) const { // NOLINT
return impl_->tcend(device);
}
template <typename T>
void HostDeviceVector<T>::ScatterFrom
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
impl_->ScatterFrom(begin, end);
}
template <typename T>
void HostDeviceVector<T>::GatherTo
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const {
impl_->GatherTo(begin, end);
}
@ -376,8 +517,8 @@ void HostDeviceVector<T>::Fill(T v) {
}
template <typename T>
void HostDeviceVector<T>::Copy(HostDeviceVector<T>* other) {
impl_->Copy(other->impl_);
void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other) {
impl_->Copy(other.impl_);
}
template <typename T>
@ -394,10 +535,30 @@ template <typename T>
std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->HostVector(); }
template <typename T>
void HostDeviceVector<T>::Reshard(GPUSet new_devices) {
const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
return impl_->ConstHostVector();
}
template <typename T>
bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
return impl_->HostCanAccess(access);
}
template <typename T>
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
return impl_->DeviceCanAccess(device, access);
}
template <typename T>
void HostDeviceVector<T>::Reshard(GPUSet new_devices) const {
impl_->Reshard(new_devices);
}
template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution& distribution) const {
impl_->Reshard(distribution);
}
template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
impl_->Resize(new_size, v);
@ -406,7 +567,8 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<unsigned int>;
template class HostDeviceVector<int>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<size_t>;
} // namespace xgboost

View File

@ -1,28 +1,6 @@
/*!
* Copyright 2017 XGBoost contributors
*/
#ifndef XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
#define XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
#include <dmlc/logging.h>
#include <algorithm>
#include <cstdlib>
#include <initializer_list>
#include <vector>
#include "gpu_set.h"
#include "span.h"
// only include thrust-related files if host_device_vector.h
// is included from a .cu file
#ifdef __CUDACC__
#include <thrust/device_ptr.h>
#endif
namespace xgboost {
template <typename T> struct HostDeviceVectorImpl;
/**
* @file host_device_vector.h
@ -70,44 +48,203 @@ template <typename T> struct HostDeviceVectorImpl;
* if different threads call these methods with different values of the device argument.
* All other methods are not thread safe.
*/
#ifndef XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
#define XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
#include <dmlc/logging.h>
#include <algorithm>
#include <cstdlib>
#include <initializer_list>
#include <vector>
#include "gpu_set.h"
#include "span.h"
// only include thrust-related files if host_device_vector.h
// is included from a .cu file
#ifdef __CUDACC__
#include <thrust/device_ptr.h>
#endif
namespace xgboost {
#ifdef __CUDACC__
// Sets a function to call instead of cudaSetDevice();
// only added for testing
void SetCudaSetDeviceHandler(void (*handler)(int));
#endif
template <typename T> struct HostDeviceVectorImpl;
// Distribution for the HostDeviceVector; it specifies such aspects as the devices it is
// distributed on, whether there are copies of elements from other GPUs as well as the granularity
// of splitting. It may also specify explicit boundaries for devices, in which case the size of the
// array cannot be changed.
class GPUDistribution {
template<typename T> friend struct HostDeviceVectorImpl;
public:
explicit GPUDistribution(GPUSet devices = GPUSet::Empty())
: devices_(devices), granularity_(1), overlap_(0) {}
private:
GPUDistribution(GPUSet devices, int granularity, int overlap,
std::vector<size_t> offsets)
: devices_(devices), granularity_(granularity), overlap_(overlap),
offsets_(std::move(offsets)) {}
public:
static GPUDistribution Block(GPUSet devices) { return GPUDistribution(devices); }
static GPUDistribution Overlap(GPUSet devices, int overlap) {
return GPUDistribution(devices, 1, overlap, std::vector<size_t>());
}
static GPUDistribution Granular(GPUSet devices, int granularity) {
return GPUDistribution(devices, granularity, 0, std::vector<size_t>());
}
static GPUDistribution Explicit(GPUSet devices, std::vector<size_t> offsets) {
return GPUDistribution(devices, 1, 0, offsets);
}
friend bool operator==(const GPUDistribution& a, const GPUDistribution& b) {
return a.devices_ == b.devices_ && a.granularity_ == b.granularity_ &&
a.overlap_ == b.overlap_ && a.offsets_ == b.offsets_;
}
friend bool operator!=(const GPUDistribution& a, const GPUDistribution& b) {
return !(a == b);
}
GPUSet Devices() const { return devices_; }
bool IsEmpty() const { return devices_.IsEmpty(); }
size_t ShardStart(size_t size, int index) const {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
CHECK_EQ(offsets_.back(), size);
return offsets_.at(index);
}
// no explicit offsets
size_t begin = std::min(index * Portion(size), size);
begin = begin > size ? size : begin;
return begin;
}
size_t ShardSize(size_t size, int index) const {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
CHECK_EQ(offsets_.back(), size);
return offsets_.at(index + 1) - offsets_.at(index) +
(index == devices_.Size() - 1 ? overlap_ : 0);
}
size_t portion = Portion(size);
size_t begin = std::min(index * portion, size);
size_t end = std::min((index + 1) * portion + overlap_ * granularity_, size);
return end - begin;
}
size_t ShardProperSize(size_t size, int index) const {
if (size == 0) { return 0; }
return ShardSize(size, index) - (devices_.Size() - 1 > index ? overlap_ : 0);
}
bool IsFixedSize() const { return !offsets_.empty(); }
private:
static size_t DivRoundUp(size_t a, size_t b) { return (a + b - 1) / b; }
static size_t RoundUp(size_t a, size_t b) { return DivRoundUp(a, b) * b; }
size_t Portion(size_t size) const {
return RoundUp
(DivRoundUp
(std::max(static_cast<int64_t>(size - overlap_ * granularity_),
static_cast<int64_t>(1)),
devices_.Size()), granularity_);
}
GPUSet devices_;
int granularity_;
int overlap_;
// explicit offsets for the GPU parts, if any
std::vector<size_t> offsets_;
};
enum GPUAccess {
kNone, kRead,
// write implies read
kWrite
};
inline GPUAccess operator-(GPUAccess a, GPUAccess b) {
return static_cast<GPUAccess>(static_cast<int>(a) - static_cast<int>(b));
}
template <typename T>
class HostDeviceVector {
public:
explicit HostDeviceVector(size_t size = 0, T v = T(),
GPUSet devices = GPUSet::Empty());
HostDeviceVector(std::initializer_list<T> init, GPUSet devices = GPUSet::Empty());
GPUDistribution distribution = GPUDistribution());
HostDeviceVector(std::initializer_list<T> init,
GPUDistribution distribution = GPUDistribution());
explicit HostDeviceVector(const std::vector<T>& init,
GPUSet devices = GPUSet::Empty());
GPUDistribution distribution = GPUDistribution());
~HostDeviceVector();
HostDeviceVector(const HostDeviceVector<T>&) = delete;
HostDeviceVector(HostDeviceVector<T>&&) = delete;
void operator=(const HostDeviceVector<T>&) = delete;
void operator=(HostDeviceVector<T>&&) = delete;
HostDeviceVector(const HostDeviceVector<T>&);
HostDeviceVector<T>& operator=(const HostDeviceVector<T>&);
size_t Size() const;
GPUSet Devices() const;
T* DevicePointer(int device);
const GPUDistribution& Distribution() const;
common::Span<T> DeviceSpan(int device);
common::Span<const T> ConstDeviceSpan(int device) const;
common::Span<const T> DeviceSpan(int device) const { return ConstDeviceSpan(device); }
T* DevicePointer(int device);
const T* ConstDevicePointer(int device) const;
const T* DevicePointer(int device) const { return ConstDevicePointer(device); }
T* HostPointer() { return HostVector().data(); }
size_t DeviceStart(int device);
size_t DeviceSize(int device);
const T* ConstHostPointer() const { return ConstHostVector().data(); }
const T* HostPointer() const { return ConstHostPointer(); }
size_t DeviceStart(int device) const;
size_t DeviceSize(int device) const;
// only define functions returning device_ptr
// if HostDeviceVector.h is included from a .cu file
#ifdef __CUDACC__
thrust::device_ptr<T> tbegin(int device); // NOLINT
thrust::device_ptr<T> tend(int device); // NOLINT
void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end);
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end);
thrust::device_ptr<const T> tcbegin(int device) const; // NOLINT
thrust::device_ptr<const T> tcend(int device) const; // NOLINT
thrust::device_ptr<const T> tbegin(int device) const { // NOLINT
return tcbegin(device);
}
thrust::device_ptr<const T> tend(int device) const { return tcend(device); } // NOLINT
void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end);
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const;
#endif
void Fill(T v);
void Copy(HostDeviceVector<T>* other);
void Copy(const HostDeviceVector<T>& other);
void Copy(const std::vector<T>& other);
void Copy(std::initializer_list<T> other);
std::vector<T>& HostVector();
void Reshard(GPUSet devices);
const std::vector<T>& ConstHostVector() const;
const std::vector<T>& HostVector() const {return ConstHostVector(); }
bool HostCanAccess(GPUAccess access) const;
bool DeviceCanAccess(int device, GPUAccess access) const;
void Reshard(const GPUDistribution& distribution) const;
void Reshard(GPUSet devices) const;
void Resize(size_t new_size, T v = T());
private:

View File
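// Illustrative sketch (not from the diff) of the access pattern the API above is built
// around, assuming a single device 0 via a hypothetical GPUSet::Range(0, 1): the const
// accessors request read-only access, so host and device copies can stay valid at the
// same time, while the mutable accessors invalidate the other side and force a copy on
// its next access. This is why the call sites below switch to ConstHostVector() and the
// other Const* accessors wherever they only read.
HostDeviceVector<bst_float> vec(100, 0.0f, GPUSet::Range(0, 1));
vec.Fill(1.0f);                                   // write on whichever side currently owns the data
const auto& h = vec.ConstHostVector();            // read-only host view; device copies stay valid
common::Span<const bst_float> d = vec.ConstDeviceSpan(0);  // read-only device view; host stays valid
auto& hw = vec.HostVector();                      // mutable host view; device copies become stale
hw[0] = 42.0f;                                    // the next device access triggers a host->device copy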

@ -25,12 +25,12 @@ namespace xgboost {
// implementation of inline functions
void MetaInfo::Clear() {
num_row_ = num_col_ = num_nonzero_ = 0;
labels_.clear();
labels_.HostVector().clear();
root_index_.clear();
group_ptr_.clear();
qids_.clear();
weights_.clear();
base_margin_.clear();
weights_.HostVector().clear();
base_margin_.HostVector().clear();
}
void MetaInfo::SaveBinary(dmlc::Stream *fo) const {
@ -39,12 +39,12 @@ void MetaInfo::SaveBinary(dmlc::Stream *fo) const {
fo->Write(&num_row_, sizeof(num_row_));
fo->Write(&num_col_, sizeof(num_col_));
fo->Write(&num_nonzero_, sizeof(num_nonzero_));
fo->Write(labels_);
fo->Write(labels_.HostVector());
fo->Write(group_ptr_);
fo->Write(qids_);
fo->Write(weights_);
fo->Write(weights_.HostVector());
fo->Write(root_index_);
fo->Write(base_margin_);
fo->Write(base_margin_.HostVector());
}
void MetaInfo::LoadBinary(dmlc::Stream *fi) {
@ -55,16 +55,16 @@ void MetaInfo::LoadBinary(dmlc::Stream *fi) {
CHECK(fi->Read(&num_col_, sizeof(num_col_)) == sizeof(num_col_)) << "MetaInfo: invalid format";
CHECK(fi->Read(&num_nonzero_, sizeof(num_nonzero_)) == sizeof(num_nonzero_))
<< "MetaInfo: invalid format";
CHECK(fi->Read(&labels_)) << "MetaInfo: invalid format";
CHECK(fi->Read(&labels_.HostVector())) << "MetaInfo: invalid format";
CHECK(fi->Read(&group_ptr_)) << "MetaInfo: invalid format";
if (version >= kVersionQidAdded) {
CHECK(fi->Read(&qids_)) << "MetaInfo: invalid format";
} else { // old format doesn't contain qid field
qids_.clear();
}
CHECK(fi->Read(&weights_)) << "MetaInfo: invalid format";
CHECK(fi->Read(&weights_.HostVector())) << "MetaInfo: invalid format";
CHECK(fi->Read(&root_index_)) << "MetaInfo: invalid format";
CHECK(fi->Read(&base_margin_)) << "MetaInfo: invalid format";
CHECK(fi->Read(&base_margin_.HostVector())) << "MetaInfo: invalid format";
}
// try to load group information from file, if exists
@ -121,17 +121,20 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
std::copy(cast_dptr, cast_dptr + num, root_index_.begin()));
} else if (!std::strcmp(key, "label")) {
labels_.resize(num);
auto& labels = labels_.HostVector();
labels.resize(num);
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
std::copy(cast_dptr, cast_dptr + num, labels_.begin()));
std::copy(cast_dptr, cast_dptr + num, labels.begin()));
} else if (!std::strcmp(key, "weight")) {
weights_.resize(num);
auto& weights = weights_.HostVector();
weights.resize(num);
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
std::copy(cast_dptr, cast_dptr + num, weights_.begin()));
std::copy(cast_dptr, cast_dptr + num, weights.begin()));
} else if (!std::strcmp(key, "base_margin")) {
base_margin_.resize(num);
auto& base_margin = base_margin_.HostVector();
base_margin.resize(num);
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
std::copy(cast_dptr, cast_dptr + num, base_margin_.begin()));
std::copy(cast_dptr, cast_dptr + num, base_margin.begin()));
} else if (!std::strcmp(key, "group")) {
group_ptr_.resize(num + 1);
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
@ -230,12 +233,14 @@ DMatrix* DMatrix::Load(const std::string& uri,
LOG(CONSOLE) << info.group_ptr_.size() - 1
<< " groups are loaded from " << fname << ".group";
}
if (MetaTryLoadFloatInfo(fname + ".base_margin", &info.base_margin_) && !silent) {
LOG(CONSOLE) << info.base_margin_.size()
if (MetaTryLoadFloatInfo
(fname + ".base_margin", &info.base_margin_.HostVector()) && !silent) {
LOG(CONSOLE) << info.base_margin_.Size()
<< " base_margin are loaded from " << fname << ".base_margin";
}
if (MetaTryLoadFloatInfo(fname + ".weight", &info.weights_) && !silent) {
LOG(CONSOLE) << info.weights_.size()
if (MetaTryLoadFloatInfo
(fname + ".weight", &info.weights_.HostVector()) && !silent) {
LOG(CONSOLE) << info.weights_.Size()
<< " weights are loaded from " << fname << ".weight";
}
}

View File

@ -35,10 +35,12 @@ void SimpleCSRSource::CopyFrom(dmlc::Parser<uint32_t>* parser) {
while (parser->Next()) {
const dmlc::RowBlock<uint32_t>& batch = parser->Value();
if (batch.label != nullptr) {
info.labels_.insert(info.labels_.end(), batch.label, batch.label + batch.size);
auto& labels = info.labels_.HostVector();
labels.insert(labels.end(), batch.label, batch.label + batch.size);
}
if (batch.weight != nullptr) {
info.weights_.insert(info.weights_.end(), batch.weight, batch.weight + batch.size);
auto& weights = info.weights_.HostVector();
weights.insert(weights.end(), batch.weight, batch.weight + batch.size);
}
if (batch.qid != nullptr) {
info.qids_.insert(info.qids_.end(), batch.qid, batch.qid + batch.size);
@ -62,16 +64,18 @@ void SimpleCSRSource::CopyFrom(dmlc::Parser<uint32_t>* parser) {
// update information
this->info.num_row_ += batch.size;
// copy the data over
auto& data_vec = page_.data.HostVector();
auto& offset_vec = page_.offset.HostVector();
for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
uint32_t index = batch.index[i];
bst_float fvalue = batch.value == nullptr ? 1.0f : batch.value[i];
page_.data.emplace_back(index, fvalue);
data_vec.emplace_back(index, fvalue);
this->info.num_col_ = std::max(this->info.num_col_,
static_cast<uint64_t>(index + 1));
}
size_t top = page_.offset.size();
size_t top = page_.offset.Size();
for (size_t i = 0; i < batch.size; ++i) {
page_.offset.push_back(page_.offset[top - 1] + batch.offset[i + 1] - batch.offset[0]);
offset_vec.push_back(offset_vec[top - 1] + batch.offset[i + 1] - batch.offset[0]);
}
}
if (last_group_id != default_max) {
@ -79,7 +83,7 @@ void SimpleCSRSource::CopyFrom(dmlc::Parser<uint32_t>* parser) {
info.group_ptr_.push_back(group_size);
}
}
this->info.num_nonzero_ = static_cast<uint64_t>(page_.data.size());
this->info.num_nonzero_ = static_cast<uint64_t>(page_.data.Size());
// Either every row has query ID or none at all
CHECK(info.qids_.empty() || info.qids_.size() == info.num_row_);
}
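// Illustrative sketch (not from the diff): offset and data form a CSR layout, so once the
// host vectors are pulled out read-only, row i spans data[offset[i]] up to (but not
// including) data[offset[i + 1]]. For example, num_col_ could be recomputed from a
// finished page like this:
const auto& offset_vec = page_.offset.ConstHostVector();
const auto& data_vec = page_.data.ConstHostVector();
uint64_t num_col = 0;
for (size_t i = 0; i + 1 < offset_vec.size(); ++i) {
  for (size_t j = offset_vec[i]; j < offset_vec[i + 1]; ++j) {
    num_col = std::max<uint64_t>(num_col, data_vec[j].index + 1);
  }
}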
@ -89,16 +93,16 @@ void SimpleCSRSource::LoadBinary(dmlc::Stream* fi) {
CHECK(fi->Read(&tmagic, sizeof(tmagic)) == sizeof(tmagic)) << "invalid input file format";
CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch";
info.LoadBinary(fi);
fi->Read(&page_.offset);
fi->Read(&page_.data);
fi->Read(&page_.offset.HostVector());
fi->Read(&page_.data.HostVector());
}
void SimpleCSRSource::SaveBinary(dmlc::Stream* fo) const {
int tmagic = kMagic;
fo->Write(&tmagic, sizeof(tmagic));
info.SaveBinary(fo);
fo->Write(page_.offset);
fo->Write(page_.data);
fo->Write(page_.offset.HostVector());
fo->Write(page_.data.HostVector());
}
void SimpleCSRSource::BeforeFirst() {

View File

@ -41,8 +41,10 @@ void SimpleDMatrix::MakeOneBatch(SparsePage* pcol, bool sorted) {
// bit map
const int nthread = omp_get_max_threads();
pcol->Clear();
auto& pcol_offset_vec = pcol->offset.HostVector();
auto& pcol_data_vec = pcol->data.HostVector();
common::ParallelGroupBuilder<Entry>
builder(&pcol->offset, &pcol->data);
builder(&pcol_offset_vec, &pcol_data_vec);
builder.InitBudget(Info().num_col_, nthread);
// start working
auto iter = this->RowIterator();
@ -88,9 +90,9 @@ void SimpleDMatrix::MakeOneBatch(SparsePage* pcol, bool sorted) {
auto ncol = static_cast<bst_omp_uint>(pcol->Size());
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
for (bst_omp_uint i = 0; i < ncol; ++i) {
if (pcol->offset[i] < pcol->offset[i + 1]) {
std::sort(dmlc::BeginPtr(pcol->data) + pcol->offset[i],
dmlc::BeginPtr(pcol->data) + pcol->offset[i + 1],
if (pcol_offset_vec[i] < pcol_offset_vec[i + 1]) {
std::sort(dmlc::BeginPtr(pcol_data_vec) + pcol_offset_vec[i],
dmlc::BeginPtr(pcol_data_vec) + pcol_offset_vec[i + 1],
Entry::CmpValue);
}
}
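// Single-threaded sketch (not from the diff) of the CSR -> CSC conversion that the
// ParallelGroupBuilder performs above: a counting pass (AddBudget), a prefix sum over the
// counts, then a scatter pass (Push). Variable names here are purely illustrative.
std::vector<size_t> col_ptr(num_col + 1, 0);
for (const Entry& e : row_data) ++col_ptr[e.index + 1];             // count entries per column
for (size_t c = 0; c < num_col; ++c) col_ptr[c + 1] += col_ptr[c];  // column offsets via prefix sum
std::vector<Entry> col_data(row_data.size());
std::vector<size_t> fill = col_ptr;
for (size_t i = 0; i + 1 < row_ptr.size(); ++i) {
  for (size_t j = row_ptr[i]; j < row_ptr[i + 1]; ++j) {
    col_data[fill[row_data[j].index]++] = Entry(base_rowid + i, row_data[j].fvalue);
  }
}
// each column slice is then sorted by feature value, exactly as the omp loop above does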

View File

@ -141,15 +141,19 @@ void SparsePageDMatrix::InitColAccess(
pcol->Clear();
pcol->base_rowid = buffered_rowset_[begin];
const int nthread = std::max(omp_get_max_threads(), std::max(omp_get_num_procs() / 2 - 1, 1));
auto& offset_vec = pcol->offset.HostVector();
auto& data_vec = pcol->data.HostVector();
common::ParallelGroupBuilder<Entry>
builder(&pcol->offset, &pcol->data);
builder(&offset_vec, &data_vec);
builder.InitBudget(info.num_col_, nthread);
bst_omp_uint ndata = static_cast<bst_uint>(prow.Size());
const auto& prow_offset_vec = prow.offset.HostVector();
const auto& prow_data_vec = prow.data.HostVector();
#pragma omp parallel for schedule(static) num_threads(nthread)
for (bst_omp_uint i = 0; i < ndata; ++i) {
int tid = omp_get_thread_num();
for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) {
const auto e = prow.data[j];
for (size_t j = prow_offset_vec[i]; j < prow_offset_vec[i+1]; ++j) {
const auto e = prow_data_vec[j];
builder.AddBudget(e.index, tid);
}
}
@ -157,8 +161,8 @@ void SparsePageDMatrix::InitColAccess(
#pragma omp parallel for schedule(static) num_threads(nthread)
for (bst_omp_uint i = 0; i < ndata; ++i) {
int tid = omp_get_thread_num();
for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) {
const Entry &e = prow.data[j];
for (size_t j = prow_offset_vec[i]; j < prow_offset_vec[i+1]; ++j) {
const Entry &e = prow_data_vec[j];
builder.Push(e.index,
Entry(buffered_rowset_[i + begin], e.fvalue),
tid);
@ -170,9 +174,9 @@ void SparsePageDMatrix::InitColAccess(
auto ncol = static_cast<bst_omp_uint>(pcol->Size());
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
for (bst_omp_uint i = 0; i < ncol; ++i) {
if (pcol->offset[i] < pcol->offset[i + 1]) {
std::sort(dmlc::BeginPtr(pcol->data) + pcol->offset[i],
dmlc::BeginPtr(pcol->data) + pcol->offset[i + 1],
if (offset_vec[i] < offset_vec[i + 1]) {
std::sort(dmlc::BeginPtr(data_vec) + offset_vec[i],
dmlc::BeginPtr(data_vec) + offset_vec[i + 1],
Entry::CmpValue);
}
}
@ -233,8 +237,9 @@ void SparsePageDMatrix::InitColAccess(
size_t tick_expected = kStep;
while (make_next_col(page.get())) {
const auto& page_offset_vec = page->offset.ConstHostVector();
for (size_t i = 0; i < page->Size(); ++i) {
col_size_[i] += page->offset[i + 1] - page->offset[i];
col_size_[i] += page_offset_vec[i + 1] - page_offset_vec[i];
}
bytes_write += page->MemCostBytes();

View File

@ -15,13 +15,15 @@ DMLC_REGISTRY_FILE_TAG(sparse_page_raw_format);
class SparsePageRawFormat : public SparsePageFormat {
public:
bool Read(SparsePage* page, dmlc::SeekStream* fi) override {
if (!fi->Read(&(page->offset))) return false;
CHECK_NE(page->offset.size(), 0U) << "Invalid SparsePage file";
page->data.resize(page->offset.back());
if (page->data.size() != 0) {
CHECK_EQ(fi->Read(dmlc::BeginPtr(page->data),
(page->data).size() * sizeof(Entry)),
(page->data).size() * sizeof(Entry))
auto& offset_vec = page->offset.HostVector();
if (!fi->Read(&offset_vec)) return false;
auto& data_vec = page->data.HostVector();
CHECK_NE(page->offset.Size(), 0U) << "Invalid SparsePage file";
data_vec.resize(offset_vec.back());
if (page->data.Size() != 0) {
CHECK_EQ(fi->Read(dmlc::BeginPtr(data_vec),
(page->data).Size() * sizeof(Entry)),
(page->data).Size() * sizeof(Entry))
<< "Invalid SparsePage file";
}
return true;
@ -31,15 +33,17 @@ class SparsePageRawFormat : public SparsePageFormat {
dmlc::SeekStream* fi,
const std::vector<bst_uint>& sorted_index_set) override {
if (!fi->Read(&disk_offset_)) return false;
auto& offset_vec = page->offset.HostVector();
auto& data_vec = page->data.HostVector();
// setup the offset
page->offset.clear();
page->offset.push_back(0);
offset_vec.clear();
offset_vec.push_back(0);
for (unsigned int fid : sorted_index_set) {
CHECK_LT(fid + 1, disk_offset_.size());
size_t size = disk_offset_[fid + 1] - disk_offset_[fid];
page->offset.push_back(page->offset.back() + size);
offset_vec.push_back(offset_vec.back() + size);
}
page->data.resize(page->offset.back());
data_vec.resize(offset_vec.back());
// read in the data
size_t begin = fi->Tell();
size_t curr_offset = 0;
@ -53,14 +57,14 @@ class SparsePageRawFormat : public SparsePageFormat {
size_t j, size_to_read = 0;
for (j = i; j < sorted_index_set.size(); ++j) {
if (disk_offset_[sorted_index_set[j]] == disk_offset_[fid] + size_to_read) {
size_to_read += page->offset[j + 1] - page->offset[j];
size_to_read += offset_vec[j + 1] - offset_vec[j];
} else {
break;
}
}
if (size_to_read != 0) {
CHECK_EQ(fi->Read(dmlc::BeginPtr(page->data) + page->offset[i],
CHECK_EQ(fi->Read(dmlc::BeginPtr(data_vec) + offset_vec[i],
size_to_read * sizeof(Entry)),
size_to_read * sizeof(Entry))
<< "Invalid SparsePage file";
@ -76,11 +80,13 @@ class SparsePageRawFormat : public SparsePageFormat {
}
void Write(const SparsePage& page, dmlc::Stream* fo) override {
CHECK(page.offset.size() != 0 && page.offset[0] == 0);
CHECK_EQ(page.offset.back(), page.data.size());
fo->Write(page.offset);
if (page.data.size() != 0) {
fo->Write(dmlc::BeginPtr(page.data), page.data.size() * sizeof(Entry));
const auto& offset_vec = page.offset.HostVector();
const auto& data_vec = page.data.HostVector();
CHECK(page.offset.Size() != 0 && offset_vec[0] == 0);
CHECK_EQ(offset_vec.back(), page.data.Size());
fo->Write(offset_vec);
if (page.data.Size() != 0) {
fo->Write(dmlc::BeginPtr(data_vec), page.data.Size() * sizeof(Entry));
}
}
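// Sketch of the on-disk record implied by Write()/Read() above: a length-prefixed offset
// vector followed by the raw Entry payload, so a page with R rows and NNZ entries costs
// roughly (R + 1) * sizeof(size_t) + NNZ * sizeof(Entry) bytes, and Read() can size
// data_vec as offset_vec.back() (== NNZ) before pulling in the Entry block. For example,
// on a 64-bit build (sizeof(Entry) == 8: a bst_uint index plus a bst_float value):
size_t approx_bytes = (1000 + 1) * sizeof(size_t) + 20000 * sizeof(Entry);  // ~168 KB total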

View File

@ -129,10 +129,12 @@ void SparsePageSource::Create(dmlc::Parser<uint32_t>* src,
while (src->Next()) {
const dmlc::RowBlock<uint32_t>& batch = src->Value();
if (batch.label != nullptr) {
info.labels_.insert(info.labels_.end(), batch.label, batch.label + batch.size);
auto& labels = info.labels_.HostVector();
labels.insert(labels.end(), batch.label, batch.label + batch.size);
}
if (batch.weight != nullptr) {
info.weights_.insert(info.weights_.end(), batch.weight, batch.weight + batch.size);
auto& weights = info.weights_.HostVector();
weights.insert(weights.end(), batch.weight, batch.weight + batch.size);
}
if (batch.qid != nullptr) {
info.qids_.insert(info.qids_.end(), batch.qid, batch.qid + batch.size);
@ -175,7 +177,7 @@ void SparsePageSource::Create(dmlc::Parser<uint32_t>* src,
}
}
if (page->data.size() != 0) {
if (page->data.Size() != 0) {
writer.PushWrite(std::move(page));
}
@ -224,7 +226,7 @@ void SparsePageSource::Create(DMatrix* src,
<< (bytes_write >> 20UL) << " written";
}
}
if (page->data.size() != 0) {
if (page->data.Size() != 0) {
writer.PushWrite(std::move(page));
}

View File

@ -143,7 +143,7 @@ class GBLinear : public GradientBooster {
model_.LazyInitModel();
CHECK_EQ(ntree_limit, 0U)
<< "GBLinear::PredictContribution: ntrees is only valid for gbtree predictor";
const std::vector<bst_float>& base_margin = p_fmat->Info().base_margin_;
const auto& base_margin = p_fmat->Info().base_margin_.ConstHostVector();
const int ngroup = model_.param.num_output_group;
const size_t ncolumns = model_.param.num_feature + 1;
// allocate space for (#features + bias) times #groups times #rows
@ -201,7 +201,7 @@ class GBLinear : public GradientBooster {
monitor_.Start("PredictBatchInternal");
model_.LazyInitModel();
std::vector<bst_float> &preds = *out_preds;
const std::vector<bst_float>& base_margin = p_fmat->Info().base_margin_;
const auto& base_margin = p_fmat->Info().base_margin_.ConstHostVector();
// start collecting the prediction
auto iter = p_fmat->RowIterator();
const int ngroup = model_.param.num_output_group;

View File

@ -195,8 +195,8 @@ class GBTree : public GradientBooster {
<< "must have exactly ngroup*nrow gpairs";
// TODO(canonizer): perform this on GPU if HostDeviceVector has device set.
HostDeviceVector<GradientPair> tmp(in_gpair->Size() / ngroup, GradientPair(), in_gpair->Devices());
HostDeviceVector<GradientPair> tmp(in_gpair->Size() / ngroup, GradientPair(), in_gpair->Distribution());
std::vector<GradientPair>& gpair_h = in_gpair->HostVector();
const auto& gpair_h = in_gpair->ConstHostVector();
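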
auto nsize = static_cast<bst_omp_uint>(tmp.Size());
for (int gid = 0; gid < ngroup; ++gid) {
std::vector<GradientPair>& tmp_h = tmp.HostVector();
@ -402,7 +402,8 @@ class Dart : public GBTree {
if (init_out_preds) {
size_t n = num_group * p_fmat->Info().num_row_;
const std::vector<bst_float>& base_margin = p_fmat->Info().base_margin_;
const auto& base_margin =
p_fmat->Info().base_margin_.ConstHostVector();
out_preds->resize(n);
if (base_margin.size() != 0) {
CHECK_EQ(out_preds->size(), n);

View File

@ -386,7 +386,7 @@ class LearnerImpl : public Learner {
this->PredictRaw(train, &preds_);
monitor_.Stop("PredictRaw");
monitor_.Start("GetGradient");
obj_->GetGradient(&preds_, train->Info(), iter, &gpair_);
obj_->GetGradient(preds_, train->Info(), iter, &gpair_);
monitor_.Stop("GetGradient");
gbm_->DoBoost(train, &gpair_, obj_.get());
monitor_.Stop("UpdateOneIter");
@ -416,7 +416,8 @@ class LearnerImpl : public Learner {
obj_->EvalTransform(&preds_);
for (auto& ev : metrics_) {
os << '\t' << data_names[i] << '-' << ev->Name() << ':'
<< ev->Eval(preds_.HostVector(), data_sets[i]->Info(), tparam_.dsplit == 2);
<< ev->Eval(preds_.ConstHostVector(), data_sets[i]->Info(),
tparam_.dsplit == 2);
}
}
@ -459,7 +460,8 @@ class LearnerImpl : public Learner {
this->PredictRaw(data, &preds_);
obj_->EvalTransform(&preds_);
return std::make_pair(metric,
ev->Eval(preds_.HostVector(), data->Info(), tparam_.dsplit == 2));
ev->Eval(preds_.ConstHostVector(), data->Info(),
tparam_.dsplit == 2));
}
void Predict(DMatrix* data, bool output_margin,

View File

@ -90,7 +90,8 @@ class CoordinateUpdater : public LinearUpdater {
const int ngroup = model->param.num_output_group;
// update bias
for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
auto grad = GetBiasGradientParallel(group_idx, ngroup, in_gpair->HostVector(), p_fmat);
auto grad = GetBiasGradientParallel(group_idx, ngroup,
in_gpair->ConstHostVector(), p_fmat);
auto dbias = static_cast<float>(param.learning_rate *
CoordinateDeltaBias(grad.first, grad.second));
model->bias()[group_idx] += dbias;
@ -98,12 +99,13 @@ class CoordinateUpdater : public LinearUpdater {
dbias, &in_gpair->HostVector(), p_fmat);
}
// prepare for updating the weights
selector->Setup(*model, in_gpair->HostVector(), p_fmat, param.reg_alpha_denorm,
selector->Setup(*model, in_gpair->ConstHostVector(), p_fmat, param.reg_alpha_denorm,
param.reg_lambda_denorm, param.top_k);
// update weights
for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
for (unsigned i = 0U; i < model->param.num_feature; i++) {
int fidx = selector->NextFeature(i, *model, group_idx, in_gpair->HostVector(), p_fmat,
int fidx = selector->NextFeature
(i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
param.reg_alpha_denorm, param.reg_lambda_denorm);
if (fidx < 0) break;
this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), p_fmat, model);

View File

@ -259,7 +259,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
monitor.Start("UpdateGpair");
// Update gpair
dh::ExecuteShards(&shards, [&](std::unique_ptr<DeviceShard> &shard) {
shard->UpdateGpair(in_gpair->HostVector(), model->param);
shard->UpdateGpair(in_gpair->ConstHostVector(), model->param);
});
monitor.Stop("UpdateGpair");
@ -267,7 +267,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
this->UpdateBias(p_fmat, model);
monitor.Stop("UpdateBias");
// prepare for updating the weights
selector->Setup(*model, in_gpair->HostVector(), p_fmat,
selector->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
param.reg_alpha_denorm, param.reg_lambda_denorm,
param.top_k);
monitor.Start("UpdateFeature");
@ -275,7 +275,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
++group_idx) {
for (auto i = 0U; i < model->param.num_feature; i++) {
auto fidx = selector->NextFeature(
i, *model, group_idx, in_gpair->HostVector(), p_fmat,
i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
param.reg_alpha_denorm, param.reg_lambda_denorm);
if (fidx < 0) break;
this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), model);

View File

@ -63,13 +63,14 @@ class ShotgunUpdater : public LinearUpdater {
}
void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
std::vector<GradientPair> &gpair = in_gpair->HostVector();
auto &gpair = in_gpair->HostVector();
param_.DenormalizePenalties(sum_instance_weight);
const int ngroup = model->param.num_output_group;
// update bias
for (int gid = 0; gid < ngroup; ++gid) {
auto grad = GetBiasGradientParallel(gid, ngroup, in_gpair->HostVector(), p_fmat);
auto grad = GetBiasGradientParallel(gid, ngroup,
in_gpair->ConstHostVector(), p_fmat);
auto dbias = static_cast<bst_float>(param_.learning_rate *
CoordinateDeltaBias(grad.first, grad.second));
model->bias()[gid] += dbias;
@ -77,7 +78,7 @@ class ShotgunUpdater : public LinearUpdater {
}
// lock-free parallel updates of weights
selector_->Setup(*model, in_gpair->HostVector(), p_fmat,
selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
param_.reg_alpha_denorm, param_.reg_lambda_denorm, 0);
auto iter = p_fmat->ColIterator();
while (iter->Next()) {
@ -85,15 +86,16 @@ class ShotgunUpdater : public LinearUpdater {
const auto nfeat = static_cast<bst_omp_uint>(batch.Size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nfeat; ++i) {
int ii = selector_->NextFeature(i, *model, 0, in_gpair->HostVector(), p_fmat,
param_.reg_alpha_denorm, param_.reg_lambda_denorm);
int ii = selector_->NextFeature
(i, *model, 0, in_gpair->ConstHostVector(), p_fmat, param_.reg_alpha_denorm,
param_.reg_lambda_denorm);
if (ii < 0) continue;
const bst_uint fid = ii;
auto col = batch[ii];
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
for (auto& c : col) {
GradientPair &p = gpair[c.index * ngroup + gid];
const GradientPair &p = gpair[c.index * ngroup + gid];
if (p.GetHess() < 0.0f) continue;
const bst_float v = c.fvalue;
sum_grad += p.GetGrad() * v;

View File

@ -24,16 +24,18 @@ struct EvalEWiseBase : public Metric {
bst_float Eval(const std::vector<bst_float>& preds,
const MetaInfo& info,
bool distributed) const override {
CHECK_NE(info.labels_.size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.size(), info.labels_.size())
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.size(), info.labels_.Size())
<< "label and prediction size not match, "
<< "hint: use merror or mlogloss for multi-class classification";
const auto ndata = static_cast<omp_ulong>(info.labels_.size());
const auto ndata = static_cast<omp_ulong>(info.labels_.Size());
double sum = 0.0, wsum = 0.0;
const auto& labels = info.labels_.HostVector();
const auto& weights = info.weights_.HostVector();
#pragma omp parallel for reduction(+: sum, wsum) schedule(static)
for (omp_ulong i = 0; i < ndata; ++i) {
const bst_float wt = info.GetWeight(i);
sum += static_cast<const Derived*>(this)->EvalRow(info.labels_[i], preds[i]) * wt;
const bst_float wt = weights.size() > 0 ? weights[i] : 1.0f;
sum += static_cast<const Derived*>(this)->EvalRow(labels[i], preds[i]) * wt;
wsum += wt;
}
double dat[2]; dat[0] = sum, dat[1] = wsum;
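// Worked sketch (hypothetical numbers): if the derived metric is RMSE, EvalRow returns the
// squared error and the final score reduces sum / wsum. With squared errors {1, 4, 0} and
// weights {1, 1, 2}:
double s  = 1.0 * 1 + 4.0 * 1 + 0.0 * 2;   // sum  == 5
double ws = 1.0 + 1.0 + 2.0;               // wsum == 4
double rmse = std::sqrt(s / ws);           // ~1.118; dat[] carries (sum, wsum) so distributed
                                           // evaluation can combine them before the final reduction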

View File

@ -23,20 +23,24 @@ struct EvalMClassBase : public Metric {
bst_float Eval(const std::vector<bst_float> &preds,
const MetaInfo &info,
bool distributed) const override {
CHECK_NE(info.labels_.size(), 0U) << "label set cannot be empty";
CHECK(preds.size() % info.labels_.size() == 0)
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
CHECK(preds.size() % info.labels_.Size() == 0)
<< "label and prediction size not match";
const size_t nclass = preds.size() / info.labels_.size();
const size_t nclass = preds.size() / info.labels_.Size();
CHECK_GE(nclass, 1U)
<< "mlogloss and merror are only used for multi-class classification,"
<< " use logloss for binary classification";
const auto ndata = static_cast<bst_omp_uint>(info.labels_.size());
const auto ndata = static_cast<bst_omp_uint>(info.labels_.Size());
double sum = 0.0, wsum = 0.0;
int label_error = 0;
const auto& labels = info.labels_.HostVector();
const auto& weights = info.weights_.HostVector();
#pragma omp parallel for reduction(+: sum, wsum) schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
const bst_float wt = info.GetWeight(i);
auto label = static_cast<int>(info.labels_[i]);
const bst_float wt = weights.size() > 0 ? weights[i] : 1.0f;
auto label = static_cast<int>(labels[i]);
if (label >= 0 && label < static_cast<int>(nclass)) {
sum += Derived::EvalRow(label,
preds.data() + i * nclass,

View File

@ -32,7 +32,7 @@ struct EvalAMS : public Metric {
CHECK(!distributed) << "metric AMS do not support distributed evaluation";
using namespace std; // NOLINT(*)
const auto ndata = static_cast<bst_omp_uint>(info.labels_.size());
const auto ndata = static_cast<bst_omp_uint>(info.labels_.Size());
std::vector<std::pair<bst_float, unsigned> > rec(ndata);
#pragma omp parallel for schedule(static)
@ -45,10 +45,11 @@ struct EvalAMS : public Metric {
const double br = 10.0;
unsigned thresindex = 0;
double s_tp = 0.0, b_fp = 0.0, tams = 0.0;
const auto& labels = info.labels_.HostVector();
for (unsigned i = 0; i < static_cast<unsigned>(ndata-1) && i < ntop; ++i) {
const unsigned ridx = rec[i].second;
const bst_float wt = info.GetWeight(ridx);
if (info.labels_[ridx] > 0.5f) {
if (labels[ridx] > 0.5f) {
s_tp += wt;
} else {
b_fp += wt;
@ -84,14 +85,14 @@ struct EvalAuc : public Metric {
bst_float Eval(const std::vector<bst_float> &preds,
const MetaInfo &info,
bool distributed) const override {
CHECK_NE(info.labels_.size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.size(), info.labels_.size())
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.size(), info.labels_.Size())
<< "label size predict size not match";
std::vector<unsigned> tgptr(2, 0);
tgptr[1] = static_cast<unsigned>(info.labels_.size());
tgptr[1] = static_cast<unsigned>(info.labels_.Size());
const std::vector<unsigned> &gptr = info.group_ptr_.size() == 0 ? tgptr : info.group_ptr_;
CHECK_EQ(gptr.back(), info.labels_.size())
CHECK_EQ(gptr.back(), info.labels_.Size())
<< "EvalAuc: group structure must match number of prediction";
const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
// sum statistics
@ -99,6 +100,7 @@ struct EvalAuc : public Metric {
int auc_error = 0;
// each thread takes a local rec
std::vector< std::pair<bst_float, unsigned> > rec;
const auto& labels = info.labels_.HostVector();
for (bst_omp_uint k = 0; k < ngroup; ++k) {
rec.clear();
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
@ -110,7 +112,7 @@ struct EvalAuc : public Metric {
double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
for (size_t j = 0; j < rec.size(); ++j) {
const bst_float wt = info.GetWeight(rec[j].second);
const bst_float ctr = info.labels_[rec[j].second];
const bst_float ctr = labels[rec[j].second];
// keep bucketing predictions in same bucket
if (j != 0 && rec[j].first != rec[j - 1].first) {
sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
@ -156,7 +158,7 @@ struct EvalRankList : public Metric {
bst_float Eval(const std::vector<bst_float> &preds,
const MetaInfo &info,
bool distributed) const override {
CHECK_EQ(preds.size(), info.labels_.size())
CHECK_EQ(preds.size(), info.labels_.Size())
<< "label size predict size not match";
// quick consistency when group is not available
std::vector<unsigned> tgptr(2, 0);
@ -168,6 +170,7 @@ struct EvalRankList : public Metric {
const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
// sum statistics
double sum_metric = 0.0f;
const auto& labels = info.labels_.HostVector();
#pragma omp parallel reduction(+:sum_metric)
{
// each thread takes a local rec
@ -176,7 +179,7 @@ struct EvalRankList : public Metric {
for (bst_omp_uint k = 0; k < ngroup; ++k) {
rec.clear();
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
rec.emplace_back(preds[j], static_cast<int>(info.labels_[j]));
rec.emplace_back(preds[j], static_cast<int>(labels[j]));
}
sum_metric += this->EvalMetric(rec);
}
@ -314,7 +317,7 @@ struct EvalCox : public Metric {
CHECK(!distributed) << "Cox metric does not support distributed evaluation";
using namespace std; // NOLINT(*)
const auto ndata = static_cast<bst_omp_uint>(info.labels_.size());
const auto ndata = static_cast<bst_omp_uint>(info.labels_.Size());
const std::vector<size_t> &label_order = info.LabelAbsSort();
// pre-compute a sum for the denominator
@ -326,9 +329,10 @@ struct EvalCox : public Metric {
double out = 0;
double accumulated_sum = 0;
bst_omp_uint num_events = 0;
const auto& labels = info.labels_.HostVector();
for (bst_omp_uint i = 0; i < ndata; ++i) {
const size_t ind = label_order[i];
const auto label = info.labels_[ind];
const auto label = labels[ind];
if (label > 0) {
out -= log(preds[ind]) - log(exp_p_sum);
++num_events;
@ -336,7 +340,7 @@ struct EvalCox : public Metric {
// only update the denominator after we move forward in time (labels are sorted)
accumulated_sum += preds[ind];
if (i == ndata - 1 || std::abs(label) < std::abs(info.labels_[label_order[i + 1]])) {
if (i == ndata - 1 || std::abs(label) < std::abs(labels[label_order[i + 1]])) {
exp_p_sum -= accumulated_sum;
accumulated_sum = 0;
}
@ -358,14 +362,14 @@ struct EvalAucPR : public Metric {
bst_float Eval(const std::vector<bst_float> &preds, const MetaInfo &info,
bool distributed) const override {
CHECK_NE(info.labels_.size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.size(), info.labels_.size())
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.size(), info.labels_.Size())
<< "label size predict size not match";
std::vector<unsigned> tgptr(2, 0);
tgptr[1] = static_cast<unsigned>(info.labels_.size());
tgptr[1] = static_cast<unsigned>(info.labels_.Size());
const std::vector<unsigned> &gptr =
info.group_ptr_.size() == 0 ? tgptr : info.group_ptr_;
CHECK_EQ(gptr.back(), info.labels_.size())
CHECK_EQ(gptr.back(), info.labels_.Size())
<< "EvalAucPR: group structure must match number of prediction";
const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
// sum statistics
@ -373,13 +377,14 @@ struct EvalAucPR : public Metric {
int auc_error = 0, auc_gt_one = 0;
// each thread takes a local rec
std::vector<std::pair<bst_float, unsigned>> rec;
const auto& labels = info.labels_.HostVector();
for (bst_omp_uint k = 0; k < ngroup; ++k) {
double total_pos = 0.0;
double total_neg = 0.0;
rec.clear();
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
total_pos += info.GetWeight(j) * info.labels_[j];
total_neg += info.GetWeight(j) * (1.0f - info.labels_[j]);
total_pos += info.GetWeight(j) * labels[j];
total_neg += info.GetWeight(j) * (1.0f - labels[j]);
rec.emplace_back(preds[j], j);
}
XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
@ -390,8 +395,8 @@ struct EvalAucPR : public Metric {
// calculate AUC
double tp = 0.0, prevtp = 0.0, fp = 0.0, prevfp = 0.0, h = 0.0, a = 0.0, b = 0.0;
for (size_t j = 0; j < rec.size(); ++j) {
tp += info.GetWeight(rec[j].second) * info.labels_[rec[j].second];
fp += info.GetWeight(rec[j].second) * (1.0f - info.labels_[rec[j].second]);
tp += info.GetWeight(rec[j].second) * labels[rec[j].second];
fp += info.GetWeight(rec[j].second) * (1.0f - labels[rec[j].second]);
if ((j < rec.size() - 1 && rec[j].first != rec[j + 1].first) || j == rec.size() - 1) {
if (tp == prevtp) {
a = 1.0;

View File

@ -21,24 +21,26 @@ class HingeObj : public ObjFunction {
// This objective does not take any parameters
}
void GetGradient(HostDeviceVector<bst_float> *preds,
void GetGradient(const HostDeviceVector<bst_float> &preds,
const MetaInfo &info,
int iter,
HostDeviceVector<GradientPair> *out_gpair) override {
CHECK_NE(info.labels_.size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds->Size(), info.labels_.size())
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels_.Size())
<< "labels are not correctly provided"
<< "preds.size=" << preds->Size()
<< ", label.size=" << info.labels_.size();
auto& preds_h = preds->HostVector();
<< "preds.size=" << preds.Size()
<< ", label.size=" << info.labels_.Size();
const auto& preds_h = preds.HostVector();
const auto& labels_h = info.labels_.HostVector();
const auto& weights_h = info.weights_.HostVector();
out_gpair->Resize(preds_h.size());
auto& gpair = out_gpair->HostVector();
for (size_t i = 0; i < preds_h.size(); ++i) {
auto y = info.labels_[i] * 2.0 - 1.0;
auto y = labels_h[i] * 2.0 - 1.0;
bst_float p = preds_h[i];
bst_float w = info.GetWeight(i);
bst_float w = weights_h.size() > 0 ? weights_h[i] : 1.0f;
bst_float g, h;
if (p * y < 1.0) {
g = -y * w;
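// Worked sketch of the update above (standard hinge subgradient, stated for reference):
// labels in {0, 1} are mapped to y in {-1, +1}; whenever the margin y * p is below 1 the
// gradient is -y * w, otherwise it is zero. For a negative example (label 0, so y == -1)
// with p == 0.3 and w == 1, the margin is -0.3 < 1, giving g == +1, which pushes the
// prediction down on the next boosting round.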

View File

@ -35,19 +35,20 @@ class SoftmaxMultiClassObj : public ObjFunction {
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
param_.InitAllowUnknown(args);
}
void GetGradient(HostDeviceVector<bst_float>* preds,
void GetGradient(const HostDeviceVector<bst_float>& preds,
const MetaInfo& info,
int iter,
HostDeviceVector<GradientPair>* out_gpair) override {
CHECK_NE(info.labels_.size(), 0U) << "label set cannot be empty";
CHECK(preds->Size() == (static_cast<size_t>(param_.num_class) * info.labels_.size()))
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
CHECK(preds.Size() == (static_cast<size_t>(param_.num_class) * info.labels_.Size()))
<< "SoftmaxMultiClassObj: label size and pred size does not match";
std::vector<bst_float>& preds_h = preds->HostVector();
const std::vector<bst_float>& preds_h = preds.HostVector();
out_gpair->Resize(preds_h.size());
std::vector<GradientPair>& gpair = out_gpair->HostVector();
const int nclass = param_.num_class;
const auto ndata = static_cast<omp_ulong>(preds_h.size() / nclass);
const auto& labels = info.labels_.HostVector();
int label_error = 0;
#pragma omp parallel
{
@ -58,7 +59,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
rec[k] = preds_h[i * nclass + k];
}
common::Softmax(&rec);
auto label = static_cast<int>(info.labels_[i]);
auto label = static_cast<int>(labels[i]);
if (label < 0 || label >= nclass) {
label_error = label; label = 0;
}

View File

@ -38,18 +38,18 @@ class LambdaRankObj : public ObjFunction {
param_.InitAllowUnknown(args);
}
void GetGradient(HostDeviceVector<bst_float>* preds,
void GetGradient(const HostDeviceVector<bst_float>& preds,
const MetaInfo& info,
int iter,
HostDeviceVector<GradientPair>* out_gpair) override {
CHECK_EQ(preds->Size(), info.labels_.size()) << "label size predict size not match";
auto& preds_h = preds->HostVector();
CHECK_EQ(preds.Size(), info.labels_.Size()) << "label size predict size not match";
const auto& preds_h = preds.HostVector();
out_gpair->Resize(preds_h.size());
std::vector<GradientPair>& gpair = out_gpair->HostVector();
// quick consistency when group is not available
std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(info.labels_.size());
std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(info.labels_.Size());
const std::vector<unsigned> &gptr = info.group_ptr_.size() == 0 ? tgptr : info.group_ptr_;
CHECK(gptr.size() != 0 && gptr.back() == info.labels_.size())
CHECK(gptr.size() != 0 && gptr.back() == info.labels_.Size())
<< "group structure not consistent with #rows";
const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
@ -67,11 +67,12 @@ class LambdaRankObj : public ObjFunction {
sum_weights += info.GetWeight(k);
}
bst_float weight_normalization_factor = ngroup/sum_weights;
const auto& labels = info.labels_.HostVector();
#pragma omp for schedule(static)
for (bst_omp_uint k = 0; k < ngroup; ++k) {
lst.clear(); pairs.clear();
for (unsigned j = gptr[k]; j < gptr[k+1]; ++j) {
lst.emplace_back(preds_h[j], info.labels_[j], j);
lst.emplace_back(preds_h[j], labels[j], j);
gpair[j] = GradientPair(0.0f, 0.0f);
}
std::sort(lst.begin(), lst.end(), ListEntry::CmpPred);

View File

@ -38,16 +38,18 @@ class RegLossObj : public ObjFunction {
const std::vector<std::pair<std::string, std::string> > &args) override {
param_.InitAllowUnknown(args);
}
void GetGradient(HostDeviceVector<bst_float> *preds, const MetaInfo &info,
void GetGradient(const HostDeviceVector<bst_float> &preds, const MetaInfo &info,
int iter, HostDeviceVector<GradientPair> *out_gpair) override {
CHECK_NE(info.labels_.size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds->Size(), info.labels_.size())
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels_.Size())
<< "labels are not correctly provided"
<< "preds.size=" << preds->Size()
<< ", label.size=" << info.labels_.size();
auto& preds_h = preds->HostVector();
<< "preds.size=" << preds.Size()
<< ", label.size=" << info.labels_.Size();
const auto& preds_h = preds.HostVector();
const auto& labels = info.labels_.HostVector();
const auto& weights = info.weights_.HostVector();
this->LazyCheckLabels(info.labels_);
this->LazyCheckLabels(labels);
out_gpair->Resize(preds_h.size());
auto& gpair = out_gpair->HostVector();
const auto n = static_cast<omp_ulong>(preds_h.size());
@ -57,10 +59,10 @@ class RegLossObj : public ObjFunction {
const omp_ulong remainder = n % 8;
#pragma omp parallel for schedule(static)
for (omp_ulong i = 0; i < n - remainder; i += 8) {
avx::Float8 y(&info.labels_[i]);
avx::Float8 y(&labels[i]);
avx::Float8 p = Loss::PredTransform(avx::Float8(&preds_h[i]));
avx::Float8 w = info.weights_.empty() ? avx::Float8(1.0f)
: avx::Float8(&info.weights_[i]);
avx::Float8 w = weights.empty() ? avx::Float8(1.0f)
: avx::Float8(&weights[i]);
// Adjust weight
w += y * (scale * w - w);
avx::Float8 grad = Loss::FirstOrderGradient(p, y);
@ -68,7 +70,7 @@ class RegLossObj : public ObjFunction {
avx::StoreGpair(gpair_ptr + i, grad * w, hess * w);
}
for (omp_ulong i = n - remainder; i < n; ++i) {
auto y = info.labels_[i];
auto y = labels[i];
bst_float p = Loss::PredTransform(preds_h[i]);
bst_float w = info.GetWeight(i);
w += y * ((param_.scale_pos_weight * w) - w);
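// Sketch of the reweighting above: w + y * (scale * w - w) == w * (1 + y * (scale - 1)),
// so with binary labels only the positives are rescaled. E.g. scale_pos_weight == 4 and
// w == 0.5 gives an effective weight of 2.0 for a positive (y == 1) and 0.5 for a
// negative (y == 0); the AVX path a few lines up applies the same adjustment 8 rows at a time.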
@ -140,15 +142,16 @@ class PoissonRegression : public ObjFunction {
param_.InitAllowUnknown(args);
}
void GetGradient(HostDeviceVector<bst_float> *preds,
void GetGradient(const HostDeviceVector<bst_float> &preds,
const MetaInfo &info,
int iter,
HostDeviceVector<GradientPair> *out_gpair) override {
CHECK_NE(info.labels_.size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds->Size(), info.labels_.size()) << "labels are not correctly provided";
auto& preds_h = preds->HostVector();
out_gpair->Resize(preds->Size());
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided";
const auto& preds_h = preds.HostVector();
out_gpair->Resize(preds.Size());
auto& gpair = out_gpair->HostVector();
const auto& labels = info.labels_.HostVector();
// check if label in range
bool label_correct = true;
// start calculating gradient
@ -157,7 +160,7 @@ class PoissonRegression : public ObjFunction {
for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*)
bst_float p = preds_h[i];
bst_float w = info.GetWeight(i);
bst_float y = info.labels_[i];
bst_float y = labels[i];
if (y >= 0.0f) {
gpair[i] = GradientPair((std::exp(p) - y) * w,
std::exp(p + param_.max_delta_step) * w);
@ -201,13 +204,13 @@ class CoxRegression : public ObjFunction {
public:
// declare functions
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {}
void GetGradient(HostDeviceVector<bst_float> *preds,
void GetGradient(const HostDeviceVector<bst_float> &preds,
const MetaInfo &info,
int iter,
HostDeviceVector<GradientPair> *out_gpair) override {
CHECK_NE(info.labels_.size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds->Size(), info.labels_.size()) << "labels are not correctly provided";
auto& preds_h = preds->HostVector();
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided";
const auto& preds_h = preds.HostVector();
out_gpair->Resize(preds_h.size());
auto& gpair = out_gpair->HostVector();
const std::vector<size_t> &label_order = info.LabelAbsSort();
@ -221,6 +224,7 @@ class CoxRegression : public ObjFunction {
}
// start calculating grad and hess
const auto& labels = info.labels_.HostVector();
double r_k = 0;
double s_k = 0;
double last_exp_p = 0.0;
@ -231,7 +235,7 @@ class CoxRegression : public ObjFunction {
const double p = preds_h[ind];
const double exp_p = std::exp(p);
const double w = info.GetWeight(ind);
const double y = info.labels_[ind];
const double y = labels[ind];
const double abs_y = std::abs(y);
// only update the denominator after we move forward in time (labels are sorted)
@ -289,15 +293,16 @@ class GammaRegression : public ObjFunction {
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
}
void GetGradient(HostDeviceVector<bst_float> *preds,
void GetGradient(const HostDeviceVector<bst_float> &preds,
const MetaInfo &info,
int iter,
HostDeviceVector<GradientPair> *out_gpair) override {
CHECK_NE(info.labels_.size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds->Size(), info.labels_.size()) << "labels are not correctly provided";
auto& preds_h = preds->HostVector();
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided";
const auto& preds_h = preds.HostVector();
out_gpair->Resize(preds_h.size());
auto& gpair = out_gpair->HostVector();
const auto& labels = info.labels_.HostVector();
// check if label in range
bool label_correct = true;
// start calculating gradient
@ -306,7 +311,7 @@ class GammaRegression : public ObjFunction {
for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*)
bst_float p = preds_h[i];
bst_float w = info.GetWeight(i);
bst_float y = info.labels_[i];
bst_float y = labels[i];
if (y >= 0.0f) {
gpair[i] = GradientPair((1 - y / std::exp(p)) * w, y / std::exp(p) * w);
} else {
@ -356,24 +361,25 @@ class TweedieRegression : public ObjFunction {
param_.InitAllowUnknown(args);
}
void GetGradient(HostDeviceVector<bst_float> *preds,
void GetGradient(const HostDeviceVector<bst_float> &preds,
const MetaInfo &info,
int iter,
HostDeviceVector<GradientPair> *out_gpair) override {
CHECK_NE(info.labels_.size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds->Size(), info.labels_.size()) << "labels are not correctly provided";
auto& preds_h = preds->HostVector();
out_gpair->Resize(preds->Size());
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided";
const auto& preds_h = preds.HostVector();
out_gpair->Resize(preds.Size());
auto& gpair = out_gpair->HostVector();
const auto& labels = info.labels_.HostVector();
// check if label in range
bool label_correct = true;
// start calculating gradient
const omp_ulong ndata = static_cast<omp_ulong>(preds->Size()); // NOLINT(*)
const omp_ulong ndata = static_cast<omp_ulong>(preds.Size()); // NOLINT(*)
#pragma omp parallel for schedule(static)
for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*)
bst_float p = preds_h[i];
bst_float w = info.GetWeight(i);
bst_float y = info.labels_[i];
bst_float y = labels[i];
float rho = param_.tweedie_variance_power;
if (y >= 0.0f) {
bst_float grad = -y * std::exp((1 - rho) * p) + std::exp((2 - rho) * p);

View File

@ -45,7 +45,7 @@ struct GPURegLossParam : public dmlc::Parameter<GPURegLossParam> {
// GPU kernel for gradient computation
template<typename Loss>
__global__ void get_gradient_k
(common::Span<GradientPair> out_gpair, common::Span<unsigned int> label_correct,
(common::Span<GradientPair> out_gpair, common::Span<int> label_correct,
common::Span<const float> preds, common::Span<const float> labels,
const float * __restrict__ weights, int n, float scale_pos_weight) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
@ -75,66 +75,46 @@ __global__ void pred_transform_k(common::Span<float> preds, int n) {
template<typename Loss>
class GPURegLossObj : public ObjFunction {
protected:
bool copied_;
HostDeviceVector<bst_float> labels_, weights_;
HostDeviceVector<unsigned int> label_correct_;
HostDeviceVector<int> label_correct_;
// allocate device data for n elements, do nothing if memory is allocated already
void LazyResize(size_t n, size_t n_weights) {
if (labels_.Size() == n && weights_.Size() == n_weights)
return;
copied_ = false;
labels_.Reshard(devices_);
weights_.Reshard(devices_);
label_correct_.Reshard(devices_);
if (labels_.Size() != n) {
labels_.Resize(n);
label_correct_.Resize(devices_.Size());
}
if (weights_.Size() != n_weights)
weights_.Resize(n_weights);
void LazyResize() {
}
public:
GPURegLossObj() : copied_(false) {}
GPURegLossObj() {}
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
param_.InitAllowUnknown(args);
// CHECK(param_.n_gpus != 0) << "Must have at least one device";
CHECK(param_.n_gpus != 0) << "Must have at least one device";
devices_ = GPUSet::All(param_.n_gpus).Normalised(param_.gpu_id);
label_correct_.Reshard(devices_);
label_correct_.Resize(devices_.Size());
}
void GetGradient(HostDeviceVector<float>* preds,
void GetGradient(const HostDeviceVector<float> &preds,
const MetaInfo &info,
int iter,
HostDeviceVector<GradientPair>* out_gpair) override {
CHECK_NE(info.labels_.size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds->Size(), info.labels_.size())
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels_.Size())
<< "labels are not correctly provided"
<< "preds.size=" << preds->Size() << ", label.size=" << info.labels_.size();
size_t ndata = preds->Size();
preds->Reshard(devices_);
<< "preds.size=" << preds.Size() << ", label.size=" << info.labels_.Size();
size_t ndata = preds.Size();
preds.Reshard(devices_);
info.labels_.Reshard(devices_);
info.weights_.Reshard(devices_);
out_gpair->Reshard(devices_);
out_gpair->Resize(ndata);
LazyResize(ndata, info.weights_.size());
GetGradientDevice(preds, info, iter, out_gpair);
}
private:
void GetGradientDevice(HostDeviceVector<float>* preds,
void GetGradientDevice(const HostDeviceVector<float>& preds,
const MetaInfo &info,
int iter,
HostDeviceVector<GradientPair>* out_gpair) {
label_correct_.Fill(1);
// only copy the labels and weights once, similar to how the data is copied
if (!copied_) {
labels_.Copy(info.labels_);
if (info.weights_.size() > 0)
weights_.Copy(info.weights_);
copied_ = true;
}
// run the kernel
#pragma omp parallel for schedule(static, 1) if (devices_.Size() > 1)
@ -142,12 +122,12 @@ class GPURegLossObj : public ObjFunction {
int d = devices_[i];
dh::safe_cuda(cudaSetDevice(d));
const int block = 256;
size_t n = preds->DeviceSize(d);
size_t n = preds.DeviceSize(d);
if (n > 0) {
get_gradient_k<Loss><<<dh::DivRoundUp(n, block), block>>>
(out_gpair->DeviceSpan(d), label_correct_.DeviceSpan(d),
preds->DeviceSpan(d), labels_.DeviceSpan(d),
info.weights_.size() > 0 ? weights_.DevicePointer(d) : nullptr,
preds.DeviceSpan(d), info.labels_.DeviceSpan(d),
info.weights_.Size() > 0 ? info.weights_.DevicePointer(d) : nullptr,
n, param_.scale_pos_weight);
dh::safe_cuda(cudaGetLastError());
}
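// Sketch of the launch arithmetic above: each device only processes its own shard, so the
// grid is sized as DivRoundUp(n, block) == (n + block - 1) / block. For a shard of
// n == 1000 predictions with block == 256 this launches 4 blocks (1024 threads, 24 idle).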
@ -155,7 +135,7 @@ class GPURegLossObj : public ObjFunction {
}
// copy "label correct" flags back to host
std::vector<unsigned int>& label_correct_h = label_correct_.HostVector();
std::vector<int>& label_correct_h = label_correct_.HostVector();
for (int i = 0; i < devices_.Size(); ++i) {
if (label_correct_h[i] == 0)
LOG(FATAL) << Loss::LabelErrorMsg();

View File

@ -112,7 +112,7 @@ class CPUPredictor : public Predictor {
ntree_limit * model.param.num_output_group >= model.trees.size()) {
auto it = cache_.find(dmat);
if (it != cache_.end()) {
HostDeviceVector<bst_float>& y = it->second.predictions;
const HostDeviceVector<bst_float>& y = it->second.predictions;
if (y.Size() != 0) {
out_preds->Resize(y.Size());
std::copy(y.HostVector().begin(), y.HostVector().end(),
@ -128,7 +128,7 @@ class CPUPredictor : public Predictor {
HostDeviceVector<bst_float>* out_preds,
const gbm::GBTreeModel& model) const {
size_t n = model.param.num_output_group * info.num_row_;
const std::vector<bst_float>& base_margin = info.base_margin_;
const auto& base_margin = info.base_margin_.HostVector();
out_preds->Resize(n);
std::vector<bst_float>& out_preds_h = out_preds->HostVector();
if (base_margin.size() == n) {
@ -282,7 +282,7 @@ class CPUPredictor : public Predictor {
}
// start collecting the contributions
auto iter = p_fmat->RowIterator();
const std::vector<bst_float>& base_margin = info.base_margin_;
const auto& base_margin = info.base_margin_.HostVector();
iter->BeforeFirst();
while (iter->Next()) {
auto &batch = iter->Value();

View File

@ -58,28 +58,30 @@ struct DeviceMatrix {
DeviceMatrix(DMatrix* dmat, int device_idx, bool silent) : p_mat(dmat) {
dh::safe_cuda(cudaSetDevice(device_idx));
auto info = dmat->Info();
const auto& info = dmat->Info();
ba.Allocate(device_idx, silent, &row_ptr, info.num_row_ + 1, &data,
info.num_nonzero_);
auto iter = dmat->RowIterator();
iter->BeforeFirst();
size_t data_offset = 0;
while (iter->Next()) {
auto &batch = iter->Value();
const auto& batch = iter->Value();
const auto& offset_vec = batch.offset.HostVector();
const auto& data_vec = batch.data.HostVector();
// Copy row ptr
dh::safe_cuda(cudaMemcpy(
row_ptr.Data() + batch.base_rowid, batch.offset.data(),
sizeof(size_t) * batch.offset.size(), cudaMemcpyHostToDevice));
row_ptr.Data() + batch.base_rowid, offset_vec.data(),
sizeof(size_t) * offset_vec.size(), cudaMemcpyHostToDevice));
if (batch.base_rowid > 0) {
auto begin_itr = row_ptr.tbegin() + batch.base_rowid;
auto end_itr = begin_itr + batch.Size() + 1;
IncrementOffset(begin_itr, end_itr, batch.base_rowid);
}
dh::safe_cuda(cudaMemcpy(data.Data() + data_offset, batch.data.data(),
sizeof(Entry) * batch.data.size(),
dh::safe_cuda(cudaMemcpy(data.Data() + data_offset, data_vec.data(),
sizeof(Entry) * data_vec.size(),
cudaMemcpyHostToDevice));
// Copy data
data_offset += batch.data.size();
data_offset += batch.data.Size();
}
}
};
@ -374,10 +376,10 @@ class GPUPredictor : public xgboost::Predictor {
HostDeviceVector<bst_float>* out_preds,
const gbm::GBTreeModel& model) const {
size_t n = model.param.num_output_group * info.num_row_;
const std::vector<bst_float>& base_margin = info.base_margin_;
const HostDeviceVector<bst_float>& base_margin = info.base_margin_;
out_preds->Reshard(devices);
out_preds->Resize(n);
if (base_margin.size() != 0) {
if (base_margin.Size() != 0) {
CHECK_EQ(out_preds->Size(), n);
out_preds->Copy(base_margin);
} else {
@ -391,11 +393,11 @@ class GPUPredictor : public xgboost::Predictor {
ntree_limit * model.param.num_output_group >= model.trees.size()) {
auto it = cache_.find(dmat);
if (it != cache_.end()) {
HostDeviceVector<bst_float>& y = it->second.predictions;
const HostDeviceVector<bst_float>& y = it->second.predictions;
if (y.Size() != 0) {
out_preds->Reshard(devices);
out_preds->Resize(y.Size());
out_preds->Copy(&y);
out_preds->Copy(y);
return true;
}
}

View File

@ -41,7 +41,7 @@ class ColMaker: public TreeUpdater {
Builder builder(
param_,
std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone()));
builder.Update(gpair->HostVector(), dmat, tree);
builder.Update(gpair->ConstHostVector(), dmat, tree);
}
param_.learning_rate = lr;
}
@ -784,7 +784,7 @@ class DistColMaker : public ColMaker {
param_,
std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone()));
// build the tree
builder.Update(gpair->HostVector(), dmat, trees[0]);
builder.Update(gpair->ConstHostVector(), dmat, trees[0]);
//// prune the tree, note that pruner will sync the tree
pruner_->Update(gpair, dmat, trees);
// update position after the tree is pruned

View File

@ -164,7 +164,7 @@ class FastHistMaker: public TreeUpdater {
double time_evaluate_split = 0;
double time_apply_split = 0;
std::vector<GradientPair>& gpair_h = gpair->HostVector();
const std::vector<GradientPair>& gpair_h = gpair->ConstHostVector();
spliteval_->Reset();

View File

@ -650,7 +650,7 @@ class GPUMaker : public TreeUpdater {
void convertToCsc(DMatrix* dmat, std::vector<float>* fval,
std::vector<int>* fId, std::vector<size_t>* offset) {
MetaInfo info = dmat->Info();
const MetaInfo& info = dmat->Info();
CHECK(info.num_col_ < std::numeric_limits<int>::max());
CHECK(info.num_row_ < std::numeric_limits<int>::max());
nRows = static_cast<int>(info.num_row_);

View File

@ -387,11 +387,13 @@ struct DeviceShard {
void InitRowPtrs(const SparsePage& row_batch) {
dh::safe_cuda(cudaSetDevice(device_idx));
const auto& offset_vec = row_batch.offset.HostVector();
row_ptrs.resize(n_rows + 1);
thrust::copy(row_batch.offset.data() + row_begin_idx,
row_batch.offset.data() + row_end_idx + 1,
thrust::copy(offset_vec.data() + row_begin_idx,
offset_vec.data() + row_end_idx + 1,
row_ptrs.begin());
auto row_iter = row_ptrs.begin();
// find the maximum row size
auto get_size = [=] __device__(size_t row) {
return row_iter[row + 1] - row_iter[row];
}; // NOLINT
@ -432,9 +434,12 @@ struct DeviceShard {
(dh::TotalMemory(device_idx) / (16 * row_stride * sizeof(Entry)),
static_cast<size_t>(n_rows));
thrust::device_vector<Entry> entries_d(gpu_batch_nrows * row_stride);
const auto& offset_vec = row_batch.offset.HostVector();
const auto& data_vec = row_batch.data.HostVector();
thrust::device_vector<Entry> entries_d(gpu_batch_nrows * row_stride);
size_t gpu_nbatches = dh::DivRoundUp(n_rows, gpu_batch_nrows);
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
size_t batch_row_begin = gpu_batch * gpu_batch_nrows;
size_t batch_row_end = (gpu_batch + 1) * gpu_batch_nrows;
@ -443,12 +448,12 @@ struct DeviceShard {
}
size_t batch_nrows = batch_row_end - batch_row_begin;
size_t n_entries =
row_batch.offset[row_begin_idx + batch_row_end] -
row_batch.offset[row_begin_idx + batch_row_begin];
offset_vec[row_begin_idx + batch_row_end] -
offset_vec[row_begin_idx + batch_row_begin];
dh::safe_cuda
(cudaMemcpy
(entries_d.data().get(),
&row_batch.data[row_batch.offset[row_begin_idx + batch_row_begin]],
data_vec.data() + offset_vec[row_begin_idx + batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault));
dim3 block3(32, 8, 1);
dim3 grid3(dh::DivRoundUp(n_rows, block3.x),
@ -458,7 +463,7 @@ struct DeviceShard {
row_ptrs.data().get() + batch_row_begin,
entries_d.data().get(), cuts_d.data().get(), cut_row_ptrs_d.data().get(),
batch_row_begin, batch_nrows,
row_batch.offset[row_begin_idx + batch_row_begin],
offset_vec[row_begin_idx + batch_row_begin],
row_stride, null_gidx_value);
dh::safe_cuda(cudaGetLastError());
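// Sketch of the batching above (hypothetical numbers): entries are staged through a device
// buffer of gpu_batch_nrows * row_stride entries, so the copy loop runs
// DivRoundUp(n_rows, gpu_batch_nrows) times and each batch reads its byte count straight
// from the CSR offsets:
//   n_entries = offset_vec[row_begin_idx + batch_row_end] - offset_vec[row_begin_idx + batch_row_begin]
// e.g. n_rows == 10000 with gpu_batch_nrows == 4096 gives 3 batches of 4096, 4096 and 1808 rows.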
@ -538,7 +543,7 @@ struct DeviceShard {
std::fill(ridx_segments.begin(), ridx_segments.end(), Segment(0, 0));
ridx_segments.front() = Segment(0, ridx.Size());
this->gpair.copy(dh_gpair->tbegin(device_idx), dh_gpair->tend(device_idx));
this->gpair.copy(dh_gpair->tcbegin(device_idx), dh_gpair->tcend(device_idx));
SubsampleGradientPair(&gpair, param.subsample, row_begin_idx);
hist.Reset();
}
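The DeviceShard hunks above apply the same rule on the GPU path: host reads of the page go through const references obtained once from HostVector(), and the per-shard gradient copy uses the const device iterators tcbegin()/tcend(). A rough sketch of that access pattern, reusing the names from this diff and assuming DeviceSize() as exercised in the tests further down:

    // Read-only host views of the page; taken once, outside any per-batch loop.
    const auto& offset_vec = row_batch.offset.HostVector();
    const auto& data_vec   = row_batch.data.HostVector();
    const Entry* first = data_vec.data() + offset_vec[row_begin_idx];  // first entry of this shard

    // Read-only device iterators: the host copy of the gradients stays valid.
    thrust::device_vector<GradientPair> shard_gpair(dh_gpair->DeviceSize(device_idx));
    thrust::copy(dh_gpair->tcbegin(device_idx), dh_gpair->tcend(device_idx),
                 shard_gpair.begin());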


@ -30,7 +30,7 @@ class HistMaker: public BaseMaker {
param_.learning_rate = lr / trees.size();
// build tree
for (auto tree : trees) {
this->Update(gpair->HostVector(), p_fmat, tree);
this->Update(gpair->ConstHostVector(), p_fmat, tree);
}
param_.learning_rate = lr;
}


@ -29,7 +29,7 @@ class TreeRefresher: public TreeUpdater {
DMatrix *p_fmat,
const std::vector<RegTree*> &trees) override {
if (trees.size() == 0) return;
std::vector<GradientPair> &gpair_h = gpair->HostVector();
const std::vector<GradientPair> &gpair_h = gpair->ConstHostVector();
// number of threads
// thread temporal space
std::vector<std::vector<TStats> > stemp;


@ -30,7 +30,7 @@ class SketchMaker: public BaseMaker {
param_.learning_rate = lr / trees.size();
// build tree
for (auto tree : trees) {
this->Update(gpair->HostVector(), p_fmat, tree);
this->Update(gpair->ConstHostVector(), p_fmat, tree);
}
param_.learning_rate = lr;
}


@ -3,20 +3,168 @@
*/
#include <gtest/gtest.h>
#include "../../../src/common/host_device_vector.h"
#include <thrust/equal.h>
#include <thrust/iterator/counting_iterator.h>
#include "../../../src/common/device_helpers.cuh"
#include "../../../src/common/host_device_vector.h"
namespace xgboost {
namespace common {
void SetDevice(int device) {
int n_devices;
dh::safe_cuda(cudaGetDeviceCount(&n_devices));
device %= n_devices;
dh::safe_cuda(cudaSetDevice(device));
}
void InitHostDeviceVector(size_t n, const GPUDistribution& distribution,
HostDeviceVector<int> *v) {
// create the vector
GPUSet devices = distribution.Devices();
v->Reshard(distribution);
v->Resize(n);
ASSERT_EQ(v->Size(), n);
ASSERT_TRUE(v->Distribution() == distribution);
ASSERT_TRUE(v->Devices() == devices);
// ensure that the devices have read-write access
for (int i = 0; i < devices.Size(); ++i) {
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kWrite));
}
// ensure that the host has no access
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite));
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead));
// fill in the data on the host
std::vector<int>& data_h = v->HostVector();
// ensure that the host has full access, while the devices have none
ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead));
ASSERT_TRUE(v->HostCanAccess(GPUAccess::kWrite));
for (int i = 0; i < devices.Size(); ++i) {
ASSERT_FALSE(v->DeviceCanAccess(i, GPUAccess::kRead));
ASSERT_FALSE(v->DeviceCanAccess(i, GPUAccess::kWrite));
}
ASSERT_EQ(data_h.size(), n);
std::copy_n(thrust::make_counting_iterator(0), n, data_h.begin());
}
void PlusOne(HostDeviceVector<int> *v) {
int n_devices = v->Devices().Size();
for (int i = 0; i < n_devices; ++i) {
SetDevice(i);
thrust::transform(v->tbegin(i), v->tend(i), v->tbegin(i),
[=]__device__(unsigned int a){ return a + 1; });
}
}
void CheckDevice(HostDeviceVector<int> *v,
const std::vector<size_t>& starts,
const std::vector<size_t>& sizes,
unsigned int first, GPUAccess access) {
int n_devices = sizes.size();
ASSERT_EQ(v->Devices().Size(), n_devices);
for (int i = 0; i < n_devices; ++i) {
ASSERT_EQ(v->DeviceSize(i), sizes.at(i));
SetDevice(i);
ASSERT_TRUE(thrust::equal(v->tcbegin(i), v->tcend(i),
thrust::make_counting_iterator(first + starts[i])));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead));
// ensure that the device has at most the access specified by access
ASSERT_EQ(v->DeviceCanAccess(i, GPUAccess::kWrite), access == GPUAccess::kWrite);
}
ASSERT_EQ(v->HostCanAccess(GPUAccess::kRead), access == GPUAccess::kRead);
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite));
for (int i = 0; i < n_devices; ++i) {
SetDevice(i);
ASSERT_TRUE(thrust::equal(v->tbegin(i), v->tend(i),
thrust::make_counting_iterator(first + starts[i])));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kWrite));
}
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead));
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite));
}
void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
const std::vector<int>& data_h = access == GPUAccess::kWrite ?
v->HostVector() : v->ConstHostVector();
for (size_t i = 0; i < v->Size(); ++i) {
ASSERT_EQ(data_h.at(i), i + 1);
}
ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead));
ASSERT_EQ(v->HostCanAccess(GPUAccess::kWrite), access == GPUAccess::kWrite);
size_t n_devices = v->Devices().Size();
for (int i = 0; i < n_devices; ++i) {
ASSERT_EQ(v->DeviceCanAccess(i, GPUAccess::kRead), access == GPUAccess::kRead);
// the devices should have no write access
ASSERT_FALSE(v->DeviceCanAccess(i, GPUAccess::kWrite));
}
}
void TestHostDeviceVector
(size_t n, const GPUDistribution& distribution,
const std::vector<size_t>& starts, const std::vector<size_t>& sizes) {
SetCudaSetDeviceHandler(SetDevice);
HostDeviceVector<int> v;
InitHostDeviceVector(n, distribution, &v);
CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead);
PlusOne(&v);
CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite);
CheckHost(&v, GPUAccess::kRead);
CheckHost(&v, GPUAccess::kWrite);
SetCudaSetDeviceHandler(nullptr);
}
TEST(HostDeviceVector, TestBlock) {
size_t n = 1001;
int n_devices = 2;
auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices));
std::vector<size_t> starts{0, 501};
std::vector<size_t> sizes{501, 500};
TestHostDeviceVector(n, distribution, starts, sizes);
}
TEST(HostDeviceVector, TestGranular) {
size_t n = 3003;
int n_devices = 2;
auto distribution = GPUDistribution::Granular(GPUSet::Range(0, n_devices), 3);
std::vector<size_t> starts{0, 1503};
std::vector<size_t> sizes{1503, 1500};
TestHostDeviceVector(n, distribution, starts, sizes);
}
TEST(HostDeviceVector, TestOverlap) {
size_t n = 1001;
int n_devices = 2;
auto distribution = GPUDistribution::Overlap(GPUSet::Range(0, n_devices), 1);
std::vector<size_t> starts{0, 500};
std::vector<size_t> sizes{501, 501};
TestHostDeviceVector(n, distribution, starts, sizes);
}
TEST(HostDeviceVector, TestExplicit) {
size_t n = 1001;
int n_devices = 2;
std::vector<size_t> offsets{0, 550, 1001};
auto distribution = GPUDistribution::Explicit(GPUSet::Range(0, n_devices), offsets);
std::vector<size_t> starts{0, 550};
std::vector<size_t> sizes{550, 451};
TestHostDeviceVector(n, distribution, starts, sizes);
}
TEST(HostDeviceVector, Span) {
HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
vec.Reshard(GPUSet{0, 1});
auto span = vec.DeviceSpan(0);
ASSERT_EQ(vec.Size(), span.size());
ASSERT_EQ(vec.DevicePointer(0), span.data());
auto const_span = vec.ConstDeviceSpan(0);
ASSERT_EQ(vec.Size(), span.size());
ASSERT_EQ(vec.ConstDevicePointer(0), span.data());
}
} // namespace common
} // namespace xgboost
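The tests above exercise resizing, resharding, and the access-state tracking end to end. For reference, a condensed usage sketch of the same API, with sizes matching the block-distribution test (illustrative only, not additional test code):

    HostDeviceVector<int> v;
    v.Reshard(GPUDistribution::Block(GPUSet::Range(0, 2)));  // two shards: 501 and 500 elements
    v.Resize(1001);
    std::vector<int>& h = v.HostVector();           // host write access: device copies go stale
    for (size_t i = 0; i < h.size(); ++i) h[i] = static_cast<int>(i);
    auto d1 = v.ConstDeviceSpan(1);                 // read-only view; re-uploads elements 501..1000
    // Read-only access on both sides can now coexist without further copies:
    // v.HostCanAccess(GPUAccess::kRead) remains true after the device read.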


@ -16,9 +16,9 @@ TEST(MetaInfo, GetSet) {
info.SetInfo("root_index", double2, xgboost::kDouble, 2);
EXPECT_EQ(info.GetRoot(1), 2.0f);
EXPECT_EQ(info.labels_.size(), 0);
EXPECT_EQ(info.labels_.Size(), 0);
info.SetInfo("label", double2, xgboost::kFloat32, 2);
EXPECT_EQ(info.labels_.size(), 2);
EXPECT_EQ(info.labels_.Size(), 2);
float float2[2] = {1.0f, 2.0f};
EXPECT_EQ(info.GetWeight(1), 1.0f)
@ -27,9 +27,9 @@ TEST(MetaInfo, GetSet) {
EXPECT_EQ(info.GetWeight(1), 2.0f);
uint32_t uint32_t2[2] = {1U, 2U};
EXPECT_EQ(info.base_margin_.size(), 0);
EXPECT_EQ(info.base_margin_.Size(), 0);
info.SetInfo("base_margin", uint32_t2, xgboost::kUInt32, 2);
EXPECT_EQ(info.base_margin_.size(), 2);
EXPECT_EQ(info.base_margin_.Size(), 2);
uint64_t uint64_t2[2] = {1U, 2U};
EXPECT_EQ(info.group_ptr_.size(), 0);
@ -59,7 +59,7 @@ TEST(MetaInfo, SaveLoadBinary) {
fs = dmlc::Stream::Create(tmp_file.c_str(), "r");
xgboost::MetaInfo inforead;
inforead.LoadBinary(fs);
EXPECT_EQ(inforead.labels_, info.labels_);
EXPECT_EQ(inforead.labels_.HostVector(), info.labels_.HostVector());
EXPECT_EQ(inforead.num_col_, info.num_col_);
EXPECT_EQ(inforead.num_row_, info.num_row_);
@ -128,7 +128,7 @@ TEST(MetaInfo, LoadQid) {
CHECK(iter->Next());
const xgboost::SparsePage& batch = iter->Value();
CHECK_EQ(batch.base_rowid, 0);
CHECK(batch.offset == expected_offset);
CHECK(batch.data == expected_data);
CHECK(batch.offset.HostVector() == expected_offset);
CHECK(batch.data.HostVector() == expected_data);
CHECK(!iter->Next());
}
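With offset and data now stored in HostDeviceVectors, CPU-side readers take an explicit host view before iterating over a SparsePage. A minimal traversal sketch (Entry's index/fvalue field names are assumed from the rest of the code base):

    const auto& offset = batch.offset.HostVector();
    const auto& data   = batch.data.HostVector();
    for (size_t r = 0; r + 1 < offset.size(); ++r) {
      for (size_t j = offset[r]; j < offset[r + 1]; ++j) {
        // data[j].index is the feature index, data[j].fvalue its value
      }
    }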


@ -13,7 +13,7 @@ TEST(SimpleDMatrix, MetaInfo) {
EXPECT_EQ(dmat->Info().num_row_, 2);
EXPECT_EQ(dmat->Info().num_col_, 5);
EXPECT_EQ(dmat->Info().num_nonzero_, 6);
EXPECT_EQ(dmat->Info().labels_.size(), dmat->Info().num_row_);
EXPECT_EQ(dmat->Info().labels_.Size(), dmat->Info().num_row_);
delete dmat;
}


@ -16,7 +16,7 @@ TEST(SparsePageDMatrix, MetaInfo) {
EXPECT_EQ(dmat->Info().num_row_, 2);
EXPECT_EQ(dmat->Info().num_col_, 5);
EXPECT_EQ(dmat->Info().num_nonzero_, 6);
EXPECT_EQ(dmat->Info().labels_.size(), dmat->Info().num_row_);
EXPECT_EQ(dmat->Info().labels_.Size(), dmat->Info().num_row_);
// Clean up of external memory files
std::remove((tmp_file + ".cache").c_str());
@ -54,7 +54,7 @@ TEST(SparsePageDMatrix, RowAccess) {
delete dmat;
}
TEST(SparsePageDMatrix, ColAcess) {
TEST(SparsePageDMatrix, ColAccess) {
std::string tmp_file = CreateSimpleTestData();
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(
tmp_file + "#" + tmp_file + ".cache", true, false);


@ -49,9 +49,8 @@ void _CheckObjFunction(xgboost::ObjFunction * obj,
std::vector<xgboost::bst_float> out_grad,
std::vector<xgboost::bst_float> out_hess) {
xgboost::HostDeviceVector<xgboost::bst_float> in_preds(preds);
xgboost::HostDeviceVector<xgboost::GradientPair> out_gpair;
obj->GetGradient(&in_preds, info, 1, &out_gpair);
obj->GetGradient(in_preds, info, 1, &out_gpair);
std::vector<xgboost::GradientPair>& gpair = out_gpair.HostVector();
ASSERT_EQ(gpair.size(), in_preds.Size());
@ -73,8 +72,8 @@ void CheckObjFunction(xgboost::ObjFunction * obj,
std::vector<xgboost::bst_float> out_hess) {
xgboost::MetaInfo info;
info.num_row_ = labels.size();
info.labels_ = labels;
info.weights_ = weights;
info.labels_.HostVector() = labels;
info.weights_.HostVector() = weights;
_CheckObjFunction(obj, preds, labels, weights, info, out_grad, out_hess);
}
@ -88,8 +87,8 @@ void CheckRankingObjFunction(xgboost::ObjFunction * obj,
std::vector<xgboost::bst_float> out_hess) {
xgboost::MetaInfo info;
info.num_row_ = labels.size();
info.labels_ = labels;
info.weights_ = weights;
info.labels_.HostVector() = labels;
info.weights_.HostVector() = weights;
info.group_ptr_ = groups;
_CheckObjFunction(obj, preds, labels, weights, info, out_grad, out_hess);
@ -102,8 +101,8 @@ xgboost::bst_float GetMetricEval(xgboost::Metric * metric,
std::vector<xgboost::bst_float> weights) {
xgboost::MetaInfo info;
info.num_row_ = labels.size();
info.labels_ = labels;
info.weights_ = weights;
info.labels_.HostVector() = labels;
info.weights_.HostVector() = weights;
return metric->Eval(preds, info, false);
}
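The helper changes above combine two API updates: MetaInfo fields are now filled through HostVector(), and GetGradient() takes the predictions by const HostDeviceVector reference rather than by mutable pointer. A stripped-down sketch of a call site, assuming an already-constructed objective obj and two rows of data:

    xgboost::MetaInfo info;
    info.num_row_ = 2;
    info.labels_.HostVector()  = {0.0f, 1.0f};
    info.weights_.HostVector() = {1.0f, 1.0f};

    xgboost::HostDeviceVector<xgboost::bst_float> preds{0.25f, 0.75f};
    xgboost::HostDeviceVector<xgboost::GradientPair> out_gpair;
    obj->GetGradient(preds, info, 1, &out_gpair);  // predictions passed as const reference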