Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage. (#3446)

* Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage. - added distributions to HostDeviceVector - using HostDeviceVector for labels, weights and base margins in MetaInfo - using HostDeviceVector for offset and data in SparsePage - other necessary refactoring * Added const version of HostDeviceVector API calls. - const versions added to calls that can trigger data transfers, e.g. DevicePointer() - updated the code that uses HostDeviceVector - objective functions now accept const HostDeviceVector<bst_float>& for predictions * Updated src/linear/updater_gpu_coordinate.cu. * Added read-only state for HostDeviceVector sync. - this means no copies are performed if both host and devices access the HostDeviceVector read-only * Fixed linter and test errors. - updated the lz4 plugin - added ConstDeviceSpan to HostDeviceVector - using device % dh::NVisibleDevices() for the physical device number, e.g. in calls to cudaSetDevice() * Fixed explicit template instantiation errors for HostDeviceVector. - replaced HostDeviceVector<unsigned int> with HostDeviceVector<int> * Fixed HostDeviceVector tests that require multiple GPUs. - added a mock set device handler; when set, it is called instead of cudaSetDevice()
2018-08-30 04:28:47 +02:00
parent 58d783df16
commit 72cd1517d6
45 changed files with 1141 additions and 560 deletions
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -35,6 +35,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {

  auto iter = p_fmat->RowIterator();
  iter->BeforeFirst();
+  const auto& weights = info.weights_.HostVector();
  while (iter->Next()) {
     auto &batch = iter->Value();
    #pragma omp parallel num_threads(nthread)
@@ -50,7 +51,8 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
          SparsePage::Inst inst = batch[i];
          for (auto& ins : inst) {
            if (ins.index >= begin && ins.index < end) {
-              sketchs[ins.index].Push(ins.fvalue, info.GetWeight(ridx));
+              sketchs[ins.index].Push(ins.fvalue,
+                                      weights.size() > 0 ? weights[ridx] : 1.0f);
            }
          }
        }
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -118,7 +118,7 @@ struct GPUSketcher {

    void Init(const SparsePage& row_batch, const MetaInfo& info) {
      num_cols_ = info.num_col_;
-      has_weights_ = info.weights_.size() > 0;
+      has_weights_ = info.weights_.Size() > 0;

      // find the batch size
      if (param_.gpu_batch_nrows == 0) {
@@ -282,19 +282,23 @@ struct GPUSketcher {
      size_t batch_row_end = std::min((gpu_batch + 1) * gpu_batch_nrows_,
                                      static_cast<size_t>(n_rows_));
      size_t batch_nrows = batch_row_end - batch_row_begin;
-      size_t n_entries =
-        row_batch.offset[row_begin_ + batch_row_end] -
-        row_batch.offset[row_begin_ + batch_row_begin];
+
+      const auto& offset_vec = row_batch.offset.HostVector();
+      const auto& data_vec = row_batch.data.HostVector();
+
+      size_t n_entries = offset_vec[row_begin_ + batch_row_end] -
+        offset_vec[row_begin_ + batch_row_begin];
      // copy the batch to the GPU
      dh::safe_cuda
        (cudaMemcpy(entries_.data().get(),
-                    &row_batch.data[row_batch.offset[row_begin_ + batch_row_begin]],
+                    data_vec.data() + offset_vec[row_begin_ + batch_row_begin],
                    n_entries * sizeof(Entry), cudaMemcpyDefault));
      // copy the weights if necessary
      if (has_weights_) {
+        const auto& weights_vec = info.weights_.HostVector();
        dh::safe_cuda
          (cudaMemcpy(weights_.data().get(),
-                      info.weights_.data() + row_begin_ + batch_row_begin,
+                      weights_vec.data() + row_begin_ + batch_row_begin,
                      batch_nrows * sizeof(bst_float), cudaMemcpyDefault));
      }

@@ -310,7 +314,7 @@ struct GPUSketcher {
         row_ptrs_.data().get() + batch_row_begin,
         has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
         gpu_batch_nrows_, num_cols_,
-         row_batch.offset[row_begin_ + batch_row_begin], batch_nrows);
+         offset_vec[row_begin_ + batch_row_begin], batch_nrows);
      dh::safe_cuda(cudaGetLastError());       // NOLINT
      dh::safe_cuda(cudaDeviceSynchronize());  // NOLINT

@@ -331,13 +335,11 @@ struct GPUSketcher {
    void Sketch(const SparsePage& row_batch, const MetaInfo& info) {
      // copy rows to the device
      dh::safe_cuda(cudaSetDevice(device_));
+      const auto& offset_vec = row_batch.offset.HostVector();
      row_ptrs_.resize(n_rows_ + 1);
-      thrust::copy(row_batch.offset.data() + row_begin_,
-                   row_batch.offset.data() + row_end_ + 1,
-                   row_ptrs_.begin());
-
+      thrust::copy(offset_vec.data() + row_begin_,
+                   offset_vec.data() + row_end_ + 1, row_ptrs_.begin());
      size_t gpu_nbatches = dh::DivRoundUp(n_rows_, gpu_batch_nrows_);
-
      for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
        SketchBatch(row_batch, info, gpu_batch);
      }
--- a/src/common/host_device_vector.cc
+++ b/src/common/host_device_vector.cc
@@ -6,7 +6,8 @@
 // dummy implementation of HostDeviceVector in case CUDA is not used

 #include <xgboost/base.h>
-
+#include <xgboost/data.h>
+#include <cstdint>
 #include <utility>
 #include "./host_device_vector.h"

@@ -14,25 +15,27 @@ namespace xgboost {

 template <typename T>
 struct HostDeviceVectorImpl {
-  explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v) {}
-  HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init) {}
-  explicit HostDeviceVectorImpl(std::vector<T>  init) : data_h_(std::move(init)) {}
+  explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v), distribution_() {}
+  HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init), distribution_() {}
+  explicit HostDeviceVectorImpl(std::vector<T>  init) : data_h_(std::move(init)), distribution_() {}
  std::vector<T> data_h_;
+  GPUDistribution distribution_;
 };

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUSet devices) : impl_(nullptr) {
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUDistribution distribution)
+  : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(size, v);
 }

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUSet devices)
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUDistribution distribution)
  : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(init);
 }

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUSet devices)
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUDistribution distribution)
  : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(init);
 }
@@ -44,33 +47,69 @@ HostDeviceVector<T>::~HostDeviceVector() {
  delete tmp;
 }

+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
+  : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
+}
+
+template <typename T>
+HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
+  if (this == &other) {
+    return *this;
+  }
+  delete impl_;
+  impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
+  return *this;
+}
+
 template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->data_h_.size(); }

 template <typename T>
 GPUSet HostDeviceVector<T>::Devices() const { return GPUSet::Empty(); }

+template <typename T>
+const GPUDistribution& HostDeviceVector<T>::Distribution() const {
+  return impl_->distribution_;
+}
+
 template <typename T>
 T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; }

+template <typename T>
+const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
+  return nullptr;
+}
+
 template <typename T>
 common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
  return common::Span<T>();
 }

+template <typename T>
+common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
+  return common::Span<const T>();
+}
+
 template <typename T>
 std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->data_h_; }

+template <typename T>
+const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
+  return impl_->data_h_;
+}
+
 template <typename T>
 void HostDeviceVector<T>::Resize(size_t new_size, T v) {
  impl_->data_h_.resize(new_size, v);
 }

 template <typename T>
-size_t HostDeviceVector<T>::DeviceStart(int device) { return 0; }
+size_t HostDeviceVector<T>::DeviceStart(int device) const { return 0; }

 template <typename T>
-size_t HostDeviceVector<T>::DeviceSize(int device) { return 0; }
+size_t HostDeviceVector<T>::DeviceSize(int device) const { return 0; }

 template <typename T>
 void HostDeviceVector<T>::Fill(T v) {
@@ -78,9 +117,9 @@ void HostDeviceVector<T>::Fill(T v) {
 }

 template <typename T>
-void HostDeviceVector<T>::Copy(HostDeviceVector<T>* other) {
-  CHECK_EQ(Size(), other->Size());
-  std::copy(other->HostVector().begin(), other->HostVector().end(), HostVector().begin());
+void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other) {
+  CHECK_EQ(Size(), other.Size());
+  std::copy(other.HostVector().begin(), other.HostVector().end(), HostVector().begin());
 }

 template <typename T>
@@ -96,13 +135,27 @@ void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
 }

 template <typename T>
-void HostDeviceVector<T>::Reshard(GPUSet devices) { }
+bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
+  return true;
+}
+
+template <typename T>
+bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
+  return false;
+}
+
+template <typename T>
+void HostDeviceVector<T>::Reshard(const GPUDistribution& distribution) const { }
+
+template <typename T>
+void HostDeviceVector<T>::Reshard(GPUSet devices) const { }

 // explicit instantiations are required, as HostDeviceVector isn't header-only
 template class HostDeviceVector<bst_float>;
 template class HostDeviceVector<GradientPair>;
-template class HostDeviceVector<unsigned int>;
 template class HostDeviceVector<int>;
+template class HostDeviceVector<Entry>;
+template class HostDeviceVector<size_t>;

 }  // namespace xgboost

--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -2,119 +2,159 @@
 * Copyright 2017 XGBoost contributors
 */

-
-#include <thrust/fill.h>
 #include "./host_device_vector.h"
+#include <thrust/fill.h>
+#include <xgboost/data.h>
+#include <algorithm>
+#include <cstdint>
+#include <mutex>
 #include "./device_helpers.cuh"

+
 namespace xgboost {

+// the handler to call instead of cudaSetDevice; only used for testing
+static void (*cudaSetDeviceHandler)(int) = nullptr;  // NOLINT
+
+void SetCudaSetDeviceHandler(void (*handler)(int)) {
+  cudaSetDeviceHandler = handler;
+}
+
+// wrapper over access with useful methods
+class Permissions {
+  GPUAccess access_;
+  explicit Permissions(GPUAccess access) : access_(access) {}
+
+ public:
+  Permissions() : access_(GPUAccess::kNone) {}
+  explicit Permissions(bool perm)
+    : access_(perm ? GPUAccess::kWrite : GPUAccess::kNone) {}
+
+  bool CanRead() const { return access_ >= kRead; }
+  bool CanWrite() const { return access_ == kWrite; }
+  bool CanAccess(GPUAccess access) const { return access_ >= access; }
+  void Grant(GPUAccess access) { access_ = std::max(access_, access); }
+  void DenyComplementary(GPUAccess compl_access) {
+    access_ = std::min(access_, GPUAccess::kWrite - compl_access);
+  }
+  Permissions Complementary() const {
+    return Permissions(GPUAccess::kWrite - access_);
+  }
+};

 template <typename T>
 struct HostDeviceVectorImpl {
  struct DeviceShard {
-    DeviceShard() : index_(-1), device_(-1), start_(0), on_d_(false), vec_(nullptr) {}
-
-    static size_t ShardStart(size_t size, int ndevices, int index) {
-      size_t portion = dh::DivRoundUp(size, ndevices);
-      size_t begin = index * portion;
-      begin = begin > size ? size : begin;
-      return begin;
-    }
-
-    static size_t ShardSize(size_t size, int ndevices, int index) {
-      size_t portion = dh::DivRoundUp(size, ndevices);
-      size_t begin = index * portion, end = (index + 1) * portion;
-      begin = begin > size ? size : begin;
-      end = end > size ? size : end;
-      return end - begin;
-    }
+    DeviceShard()
+      : index_(-1), proper_size_(0), device_(-1), start_(0), perm_d_(false),
+        cached_size_(~0), vec_(nullptr) {}

    void Init(HostDeviceVectorImpl<T>* vec, int device) {
      if (vec_ == nullptr) { vec_ = vec; }
      CHECK_EQ(vec, vec_);
      device_ = device;
-      index_ = vec_->devices_.Index(device);
-      size_t size_h = vec_->Size();
-      int ndevices = vec_->devices_.Size();
-      start_ = ShardStart(size_h, ndevices, index_);
-      size_t size_d = ShardSize(size_h, ndevices, index_);
-      dh::safe_cuda(cudaSetDevice(device_));
-      data_.resize(size_d);
-      on_d_ = !vec_->on_h_;
+      index_ = vec_->distribution_.devices_.Index(device);
+      LazyResize(vec_->Size());
+      perm_d_ = vec_->perm_h_.Complementary();
    }

    void ScatterFrom(const T* begin) {
      // TODO(canonizer): avoid full copy of host data
-      LazySyncDevice();
-      dh::safe_cuda(cudaSetDevice(device_));
+      LazySyncDevice(GPUAccess::kWrite);
+      SetDevice();
      dh::safe_cuda(cudaMemcpy(data_.data().get(), begin + start_,
                               data_.size() * sizeof(T), cudaMemcpyDefault));
    }

    void GatherTo(thrust::device_ptr<T> begin) {
-      LazySyncDevice();
-      dh::safe_cuda(cudaSetDevice(device_));
+      LazySyncDevice(GPUAccess::kRead);
+      SetDevice();
      dh::safe_cuda(cudaMemcpy(begin.get() + start_, data_.data().get(),
-                               data_.size() * sizeof(T), cudaMemcpyDefault));
+                               proper_size_ * sizeof(T), cudaMemcpyDefault));
    }

    void Fill(T v) {
      // TODO(canonizer): avoid full copy of host data
-      LazySyncDevice();
-      dh::safe_cuda(cudaSetDevice(device_));
+      LazySyncDevice(GPUAccess::kWrite);
+      SetDevice();
      thrust::fill(data_.begin(), data_.end(), v);
    }

    void Copy(DeviceShard* other) {
      // TODO(canonizer): avoid full copy of host data for this (but not for other)
-      LazySyncDevice();
-      other->LazySyncDevice();
-      dh::safe_cuda(cudaSetDevice(device_));
+      LazySyncDevice(GPUAccess::kWrite);
+      other->LazySyncDevice(GPUAccess::kRead);
+      SetDevice();
      dh::safe_cuda(cudaMemcpy(data_.data().get(), other->data_.data().get(),
                               data_.size() * sizeof(T), cudaMemcpyDefault));
    }

-    void LazySyncHost() {
-      dh::safe_cuda(cudaSetDevice(device_));
+    void LazySyncHost(GPUAccess access) {
+      SetDevice();
      dh::safe_cuda(cudaMemcpy(vec_->data_h_.data() + start_,
-                               data_.data().get(), data_.size() * sizeof(T),
+                               data_.data().get(),  proper_size_ * sizeof(T),
                               cudaMemcpyDeviceToHost));
-      on_d_ = false;
+      perm_d_.DenyComplementary(access);
    }

-    void LazySyncDevice() {
-      if (on_d_) { return; }
+    void LazyResize(size_t new_size) {
+      if (new_size == cached_size_) { return; }
+      // resize is required
+      int ndevices = vec_->distribution_.devices_.Size();
+      start_ = vec_->distribution_.ShardStart(new_size, index_);
+      proper_size_ = vec_->distribution_.ShardProperSize(new_size, index_);
+      size_t size_d = vec_->distribution_.ShardSize(new_size, index_);
+      SetDevice();
+      data_.resize(size_d);
+      cached_size_ = new_size;
+    }
+
+    void LazySyncDevice(GPUAccess access) {
+      if (perm_d_.CanAccess(access)) { return; }
+      if (perm_d_.CanRead()) {
+        // deny read to the host
+        perm_d_.Grant(access);
+        std::lock_guard<std::mutex> lock(vec_->mutex_);
+        vec_->perm_h_.DenyComplementary(access);
+        return;
+      }
      // data is on the host
      size_t size_h = vec_->data_h_.size();
-      int ndevices = vec_->devices_.Size();
-      start_ = ShardStart(size_h, ndevices, index_);
-      size_t size_d = ShardSize(size_h, ndevices, index_);
-      dh::safe_cuda(cudaSetDevice(device_));
-      data_.resize(size_d);
-      dh::safe_cuda(cudaMemcpy(data_.data().get(),
-                               vec_->data_h_.data() + start_,
-                               size_d * sizeof(T), cudaMemcpyHostToDevice));
-      on_d_ = true;
-      // this may cause a race condition if LazySyncDevice() is called
-      // from multiple threads in parallel;
-      // however, the race condition is benign, and will not cause problems
-      vec_->on_h_ = false;
-      vec_->size_d_ = vec_->data_h_.size();
+      LazyResize(size_h);
+      SetDevice();
+      dh::safe_cuda(
+          cudaMemcpy(data_.data().get(), vec_->data_h_.data() + start_,
+                     data_.size() * sizeof(T), cudaMemcpyHostToDevice));
+      perm_d_.Grant(access);
+
+      std::lock_guard<std::mutex> lock(vec_->mutex_);
+      vec_->perm_h_.DenyComplementary(access);
+      vec_->size_d_ = size_h;
+    }
+
+    void SetDevice() {
+      if (cudaSetDeviceHandler == nullptr) {
+        dh::safe_cuda(cudaSetDevice(device_));
+      } else {
+        (*cudaSetDeviceHandler)(device_);
+      }
    }

    int index_;
    int device_;
    thrust::device_vector<T> data_;
+    // cached vector size
+    size_t cached_size_;
    size_t start_;
-    // true if there is an up-to-date copy of data on device, false otherwise
-    bool on_d_;
+    // size of the portion to copy back to the host
+    size_t proper_size_;
+    Permissions perm_d_;
    HostDeviceVectorImpl<T>* vec_;
  };

-  HostDeviceVectorImpl(size_t size, T v, GPUSet devices)
-    : devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) {
-    if (!devices.IsEmpty()) {
+  HostDeviceVectorImpl(size_t size, T v, GPUDistribution distribution)
+    : distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
+    if (!distribution_.IsEmpty()) {
      size_d_ = size;
      InitShards();
      Fill(v);
@@ -123,11 +163,16 @@ struct HostDeviceVectorImpl {
    }
  }

+  // required, as a new std::mutex has to be created
+  HostDeviceVectorImpl(const HostDeviceVectorImpl<T>& other)
+    : data_h_(other.data_h_), perm_h_(other.perm_h_), size_d_(other.size_d_),
+      distribution_(other.distribution_), mutex_(), shards_(other.shards_) {}
+
  // Init can be std::vector<T> or std::initializer_list<T>
  template <class Init>
-  HostDeviceVectorImpl(const Init& init, GPUSet devices)
-    : devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) {
-    if (!devices.IsEmpty()) {
+  HostDeviceVectorImpl(const Init& init, GPUDistribution distribution)
+    : distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
+    if (!distribution_.IsEmpty()) {
      size_d_ = init.size();
      InitShards();
      Copy(init);
@@ -137,58 +182,78 @@ struct HostDeviceVectorImpl {
  }

  void InitShards() {
-    int ndevices = devices_.Size();
+    int ndevices = distribution_.devices_.Size();
    shards_.resize(ndevices);
    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
-        shard.Init(this, devices_[i]);
+        shard.Init(this, distribution_.devices_[i]);
      });
  }

-  HostDeviceVectorImpl(const HostDeviceVectorImpl<T>&) = delete;
-  HostDeviceVectorImpl(HostDeviceVectorImpl<T>&&) = delete;
-  void operator=(const HostDeviceVectorImpl<T>&) = delete;
-  void operator=(HostDeviceVectorImpl<T>&&) = delete;
+  size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : size_d_; }

-  size_t Size() const { return on_h_ ? data_h_.size() : size_d_; }
+  GPUSet Devices() const { return distribution_.devices_; }

-  GPUSet Devices() const { return devices_; }
+  const GPUDistribution& Distribution() const { return distribution_; }

  T* DevicePointer(int device) {
-    CHECK(devices_.Contains(device));
-    LazySyncDevice(device);
-    return shards_[devices_.Index(device)].data_.data().get();
+    CHECK(distribution_.devices_.Contains(device));
+    LazySyncDevice(device, GPUAccess::kWrite);
+    return shards_[distribution_.devices_.Index(device)].data_.data().get();
+  }
+
+  const T* ConstDevicePointer(int device) {
+    CHECK(distribution_.devices_.Contains(device));
+    LazySyncDevice(device, GPUAccess::kRead);
+    return shards_[distribution_.devices_.Index(device)].data_.data().get();
  }

  common::Span<T> DeviceSpan(int device) {
-    CHECK(devices_.Contains(device));
-    LazySyncDevice(device);
-    return { shards_[devices_.Index(device)].data_.data().get(),
-             static_cast<typename common::Span<T>::index_type>(Size()) };
+    GPUSet devices = distribution_.devices_;
+    CHECK(devices.Contains(device));
+    LazySyncDevice(device, GPUAccess::kWrite);
+    return {shards_[devices.Index(device)].data_.data().get(),
+            static_cast<typename common::Span<T>::index_type>(Size())};
+  }
+
+  common::Span<const T> ConstDeviceSpan(int device) {
+    GPUSet devices = distribution_.devices_;
+    CHECK(devices.Contains(device));
+    LazySyncDevice(device, GPUAccess::kRead);
+    return {shards_[devices.Index(device)].data_.data().get(),
+      static_cast<typename common::Span<const T>::index_type>(Size())};
  }

  size_t DeviceSize(int device) {
-    CHECK(devices_.Contains(device));
-    LazySyncDevice(device);
-    return shards_[devices_.Index(device)].data_.size();
+    CHECK(distribution_.devices_.Contains(device));
+    LazySyncDevice(device, GPUAccess::kRead);
+    return shards_[distribution_.devices_.Index(device)].data_.size();
  }

  size_t DeviceStart(int device) {
-    CHECK(devices_.Contains(device));
-    LazySyncDevice(device);
-    return shards_[devices_.Index(device)].start_;
+    CHECK(distribution_.devices_.Contains(device));
+    LazySyncDevice(device, GPUAccess::kRead);
+    return shards_[distribution_.devices_.Index(device)].start_;
  }

  thrust::device_ptr<T> tbegin(int device) {  // NOLINT
    return thrust::device_ptr<T>(DevicePointer(device));
  }

+  thrust::device_ptr<const T> tcbegin(int device) {  // NOLINT
+    return thrust::device_ptr<const T>(ConstDevicePointer(device));
+  }
+
  thrust::device_ptr<T> tend(int device) {  // NOLINT
    return tbegin(device) + DeviceSize(device);
  }

-  void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
+  thrust::device_ptr<const T> tcend(int device) {  // NOLINT
+    return tcbegin(device) + DeviceSize(device);
+  }
+
+  void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
    CHECK_EQ(end - begin, Size());
-    if (on_h_) {
+    if (perm_h_.CanWrite()) {
      dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
                               (end - begin) * sizeof(T),
                               cudaMemcpyDeviceToHost));
@@ -201,7 +266,7 @@ struct HostDeviceVectorImpl {

  void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
    CHECK_EQ(end - begin, Size());
-    if (on_h_) {
+    if (perm_h_.CanWrite()) {
      dh::safe_cuda(cudaMemcpy(begin.get(), data_h_.data(),
                               data_h_.size() * sizeof(T),
                               cudaMemcpyHostToDevice));
@@ -211,7 +276,7 @@ struct HostDeviceVectorImpl {
  }

  void Fill(T v) {
-    if (on_h_) {
+    if (perm_h_.CanWrite()) {
      std::fill(data_h_.begin(), data_h_.end(), v);
    } else {
      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.Fill(v); });
@@ -220,10 +285,10 @@ struct HostDeviceVectorImpl {

  void Copy(HostDeviceVectorImpl<T>* other) {
    CHECK_EQ(Size(), other->Size());
-    if (on_h_ && other->on_h_) {
+    if (perm_h_.CanWrite() && other->perm_h_.CanWrite()) {
      std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
    } else {
-      CHECK(devices_ == other->devices_);
+      CHECK(distribution_ == other->distribution_);
      dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
          shard.Copy(&other->shards_[i]);
        });
@@ -232,7 +297,7 @@ struct HostDeviceVectorImpl {

  void Copy(const std::vector<T>& other) {
    CHECK_EQ(Size(), other.size());
-    if (on_h_) {
+    if (perm_h_.CanWrite()) {
      std::copy(other.begin(), other.end(), data_h_.begin());
    } else {
      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
@@ -243,7 +308,7 @@ struct HostDeviceVectorImpl {

  void Copy(std::initializer_list<T> other) {
    CHECK_EQ(Size(), other.size());
-    if (on_h_) {
+    if (perm_h_.CanWrite()) {
      std::copy(other.begin(), other.end(), data_h_.begin());
    } else {
      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
@@ -253,72 +318,117 @@ struct HostDeviceVectorImpl {
  }

  std::vector<T>& HostVector() {
-    LazySyncHost();
+    LazySyncHost(GPUAccess::kWrite);
    return data_h_;
  }

-  void Reshard(GPUSet new_devices) {
-    if (devices_ == new_devices)
-      return;
-    CHECK(devices_.IsEmpty());
-    devices_ = new_devices;
+  const std::vector<T>& ConstHostVector() {
+    LazySyncHost(GPUAccess::kRead);
+    return data_h_;
+  }
+
+  void Reshard(const GPUDistribution& distribution) {
+    if (distribution_ == distribution) { return; }
+    CHECK(distribution_.IsEmpty());
+    distribution_ = distribution;
    InitShards();
  }

+  void Reshard(GPUSet new_devices) {
+    if (distribution_.Devices() == new_devices) { return; }
+    Reshard(GPUDistribution::Block(new_devices));
+  }
+
  void Resize(size_t new_size, T v) {
-    if (new_size == Size())
-      return;
-    if (Size() == 0 && !devices_.IsEmpty()) {
+    if (new_size == Size()) { return; }
+    if (distribution_.IsFixedSize()) {
+      CHECK_EQ(new_size, distribution_.offsets_.back());
+    }
+    if (Size() == 0 && !distribution_.IsEmpty()) {
      // fast on-device resize
-      on_h_ = false;
+      perm_h_ = Permissions(false);
      size_d_ = new_size;
      InitShards();
      Fill(v);
    } else {
      // resize on host
-      LazySyncHost();
+      LazySyncHost(GPUAccess::kWrite);
      data_h_.resize(new_size, v);
    }
  }

-  void LazySyncHost() {
-    if (on_h_)
+  void LazySyncHost(GPUAccess access) {
+    if (perm_h_.CanAccess(access)) { return; }
+    if (perm_h_.CanRead()) {
+      // data is present, just need to deny access to the device
+      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
+          shard.perm_d_.DenyComplementary(access);
+        });
+      perm_h_.Grant(access);
      return;
-    if (data_h_.size() != size_d_)
-      data_h_.resize(size_d_);
-    dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.LazySyncHost(); });
-    on_h_ = true;
+    }
+    if (data_h_.size() != size_d_) { data_h_.resize(size_d_); }
+    dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
+        shard.LazySyncHost(access);
+      });
+    perm_h_.Grant(access);
  }

-  void LazySyncDevice(int device) {
-    CHECK(devices_.Contains(device));
-    shards_[devices_.Index(device)].LazySyncDevice();
+  void LazySyncDevice(int device, GPUAccess access) {
+    GPUSet devices = distribution_.Devices();
+    CHECK(devices.Contains(device));
+    shards_[devices.Index(device)].LazySyncDevice(access);
+  }
+
+  bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); }
+
+  bool DeviceCanAccess(int device, GPUAccess access) {
+    GPUSet devices = distribution_.Devices();
+    if (!devices.Contains(device)) { return false; }
+    return shards_[devices.Index(device)].perm_d_.CanAccess(access);
  }

  std::vector<T> data_h_;
-  bool on_h_;
+  Permissions perm_h_;
  // the total size of the data stored on the devices
  size_t size_d_;
-  GPUSet devices_;
+  GPUDistribution distribution_;
+  // protects size_d_ and perm_h_ when updated from multiple threads
+  std::mutex mutex_;
  std::vector<DeviceShard> shards_;
 };

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUSet devices)
-  : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(size, v, devices);
+HostDeviceVector<T>::HostDeviceVector
+(size_t size, T v, GPUDistribution distribution) : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(size, v, distribution);
 }

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUSet devices)
-  : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(init, devices);
+HostDeviceVector<T>::HostDeviceVector
+(std::initializer_list<T> init, GPUDistribution distribution) : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(init, distribution);
 }

 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUSet devices)
+HostDeviceVector<T>::HostDeviceVector
+(const std::vector<T>& init, GPUDistribution distribution) : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(init, distribution);
+}
+
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
  : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(init, devices);
+  impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
+}
+
+template <typename T>
+HostDeviceVector<T>& HostDeviceVector<T>::operator=
+(const HostDeviceVector<T>& other) {
+  if (this == &other) { return *this; }
+  delete impl_;
+  impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
+  return *this;
 }

 template <typename T>
@@ -335,7 +445,19 @@ template <typename T>
 GPUSet HostDeviceVector<T>::Devices() const { return impl_->Devices(); }

 template <typename T>
-T* HostDeviceVector<T>::DevicePointer(int device) { return impl_->DevicePointer(device); }
+const GPUDistribution& HostDeviceVector<T>::Distribution() const {
+  return impl_->Distribution();
+}
+
+template <typename T>
+T* HostDeviceVector<T>::DevicePointer(int device) {
+  return impl_->DevicePointer(device);
+}
+
+template <typename T>
+const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
+  return impl_->ConstDevicePointer(device);
+}

 template <typename T>
 common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
@@ -343,30 +465,49 @@ common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
 }

 template <typename T>
-size_t HostDeviceVector<T>::DeviceStart(int device) { return impl_->DeviceStart(device); }
+common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
+  return impl_->ConstDeviceSpan(device);
+}

 template <typename T>
-size_t HostDeviceVector<T>::DeviceSize(int device) { return impl_->DeviceSize(device); }
+size_t HostDeviceVector<T>::DeviceStart(int device) const {
+  return impl_->DeviceStart(device);
+}
+
+template <typename T>
+size_t HostDeviceVector<T>::DeviceSize(int device) const {
+  return impl_->DeviceSize(device);
+}

 template <typename T>
 thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) {  // NOLINT
  return impl_->tbegin(device);
 }

+template <typename T>
+thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin(int device) const {  // NOLINT
+  return impl_->tcbegin(device);
+}
+
 template <typename T>
 thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) {  // NOLINT
  return impl_->tend(device);
 }

+template <typename T>
+thrust::device_ptr<const T> HostDeviceVector<T>::tcend(int device) const {  // NOLINT
+  return impl_->tcend(device);
+}
+
 template <typename T>
 void HostDeviceVector<T>::ScatterFrom
-(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
+(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
  impl_->ScatterFrom(begin, end);
 }

 template <typename T>
 void HostDeviceVector<T>::GatherTo
-(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
+(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const {
  impl_->GatherTo(begin, end);
 }

@@ -376,8 +517,8 @@ void HostDeviceVector<T>::Fill(T v) {
 }

 template <typename T>
-void HostDeviceVector<T>::Copy(HostDeviceVector<T>* other) {
-  impl_->Copy(other->impl_);
+void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other) {
+  impl_->Copy(other.impl_);
 }

 template <typename T>
@@ -394,10 +535,30 @@ template <typename T>
 std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->HostVector(); }

 template <typename T>
-void HostDeviceVector<T>::Reshard(GPUSet new_devices) {
+const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
+  return impl_->ConstHostVector();
+}
+
+template <typename T>
+bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
+  return impl_->HostCanAccess(access);  // the answer is computed by impl_, not here
+}
+
+template <typename T>
+bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
+  return impl_->DeviceCanAccess(device, access);  // per-device query; forwards to impl_
+}
+
+template <typename T>
+void HostDeviceVector<T>::Reshard(GPUSet new_devices) const {
  impl_->Reshard(new_devices);
 }

+template <typename T>
+void HostDeviceVector<T>::Reshard(const GPUDistribution& distribution) const {
+  impl_->Reshard(distribution);  // NOTE(review): const, yet presumably mutates impl_ -- confirm
+}
+
 template <typename T>
 void HostDeviceVector<T>::Resize(size_t new_size, T v) {
  impl_->Resize(new_size, v);
@@ -406,7 +567,8 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
 // explicit instantiations are required, as HostDeviceVector isn't header-only
 template class HostDeviceVector<bst_float>;
 template class HostDeviceVector<GradientPair>;
-template class HostDeviceVector<unsigned int>;
 template class HostDeviceVector<int>;
+template class HostDeviceVector<Entry>;
+template class HostDeviceVector<size_t>;

 }  // namespace xgboost
--- a/src/common/host_device_vector.h
+++ b/src/common/host_device_vector.h
@@ -1,28 +1,6 @@
 /*!
 * Copyright 2017 XGBoost contributors
 */
-#ifndef XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
-#define XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
-
-#include <dmlc/logging.h>
-
-#include <algorithm>
-#include <cstdlib>
-#include <initializer_list>
-#include <vector>
-
-#include "gpu_set.h"
-#include "span.h"
-
-// only include thrust-related files if host_device_vector.h
-// is included from a .cu file
-#ifdef __CUDACC__
-#include <thrust/device_ptr.h>
-#endif
-
-namespace xgboost {
-
-template <typename T> struct HostDeviceVectorImpl;

 /**
 * @file host_device_vector.h
@@ -70,44 +48,203 @@ template <typename T> struct HostDeviceVectorImpl;
 * if different threads call these methods with different values of the device argument.
 * All other methods are not thread safe. 
 */
+
+#ifndef XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
+#define XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
+
+#include <dmlc/logging.h>
+
+#include <algorithm>
+#include <cstdlib>
+#include <initializer_list>
+#include <vector>
+
+#include "gpu_set.h"
+#include "span.h"
+
+// only include thrust-related files if host_device_vector.h
+// is included from a .cu file
+#ifdef __CUDACC__
+#include <thrust/device_ptr.h>
+#endif
+
+namespace xgboost {
+
+#ifdef __CUDACC__
+// Sets a function to call instead of cudaSetDevice();
+// only added for testing
+void SetCudaSetDeviceHandler(void (*handler)(int));
+#endif
+
+template <typename T> struct HostDeviceVectorImpl;
+
+// Distribution for the HostDeviceVector; it specifies such aspects as the devices it is
+// distributed on, whether there are copies of elements from other GPUs as well as the granularity
+// of splitting. It may also specify explicit boundaries for devices, in which case the size of the
+// array cannot be changed.
+class GPUDistribution {
+  template<typename T> friend struct HostDeviceVectorImpl;
+
+ public:
+  explicit GPUDistribution(GPUSet devices = GPUSet::Empty())
+    : devices_(devices), granularity_(1), overlap_(0) {}
+
+ private:
+  GPUDistribution(GPUSet devices, int granularity, int overlap,
+                  std::vector<size_t> offsets)
+    : devices_(devices), granularity_(granularity), overlap_(overlap),
+    offsets_(std::move(offsets)) {}
+
+ public:
+  static GPUDistribution Block(GPUSet devices) { return GPUDistribution(devices); }
+  // granularity 1; each shard also holds up to `overlap` elements past its own portion
+  static GPUDistribution Overlap(GPUSet devices, int overlap) {
+    return GPUDistribution(devices, 1, overlap, std::vector<size_t>());
+  }
+  // each shard's portion is rounded up to a multiple of `granularity`; no overlap
+  static GPUDistribution Granular(GPUSet devices, int granularity) {
+    return GPUDistribution(devices, granularity, 0, std::vector<size_t>());
+  }
+  // shard i covers [offsets[i], offsets[i + 1]); offsets.back() must equal the array size
+  static GPUDistribution Explicit(GPUSet devices, std::vector<size_t> offsets) {
+    return GPUDistribution(devices, 1, 0, std::move(offsets));
+  }
+
+  friend bool operator==(const GPUDistribution& a, const GPUDistribution& b) {
+    return a.devices_ == b.devices_ && a.granularity_ == b.granularity_ &&
+      a.overlap_ == b.overlap_ && a.offsets_ == b.offsets_;
+  }
+
+  friend bool operator!=(const GPUDistribution& a, const GPUDistribution& b) {
+    return !(a == b);
+  }
+
+  GPUSet Devices() const { return devices_; }
+
+  bool IsEmpty() const { return devices_.IsEmpty(); }
+  // Index of the first element of shard `index` in an array of `size` elements.
+  size_t ShardStart(size_t size, int index) const {
+    if (size == 0) { return 0; }
+    if (offsets_.size() > 0) {
+      // explicit offsets are provided
+      CHECK_EQ(offsets_.back(), size);
+      return offsets_.at(index);
+    }
+    // no explicit offsets: shard `index` starts after `index` whole portions;
+    // std::min caps the start at size for shards that fall past the end
+    const size_t begin = std::min(index * Portion(size), size);
+    return begin;
+  }
+  // Number of elements in shard `index`, including the overlap it carries (if any).
+  size_t ShardSize(size_t size, int index) const {
+    if (size == 0) { return 0; }
+    if (offsets_.size() > 0) {
+      // explicit offsets are provided
+      CHECK_EQ(offsets_.back(), size);
+      return offsets_.at(index + 1)  - offsets_.at(index) +
+        (index == devices_.Size() - 1 ? overlap_ : 0);
+    }
+    size_t portion = Portion(size);
+    size_t begin = std::min(index * portion, size);
+    size_t end = std::min((index + 1) * portion + overlap_ * granularity_, size);
+    return end - begin;
+  }
+  // Shard size without the overlap: all shards but the last give up overlap_ elements.
+  size_t ShardProperSize(size_t size, int index) const {
+    if (size == 0) { return 0; }
+    return ShardSize(size, index) - (devices_.Size() - 1 > index ? overlap_ : 0);
+  }
+  // True when explicit offsets were supplied (see Explicit()).
+  bool IsFixedSize() const { return !offsets_.empty(); }
+
+ private:
+  static size_t DivRoundUp(size_t a, size_t b) { return (a + b - 1) / b; }
+  static size_t RoundUp(size_t a, size_t b) { return DivRoundUp(a, b) * b; }
+  // Elements per shard: max(size - overlap_ * granularity_, 1) divided evenly across the
+  // devices (rounding up), then rounded up to a multiple of granularity_.
+  size_t Portion(size_t size) const {
+    // signed arithmetic: size < overlap_ * granularity_ must clamp to 1, not wrap around
+    const int64_t splittable = std::max<int64_t>(
+      static_cast<int64_t>(size) - static_cast<int64_t>(overlap_) * granularity_, 1);
+    return RoundUp(DivRoundUp(static_cast<size_t>(splittable), devices_.Size()), granularity_);
+  }
+
+  GPUSet devices_;
+  int granularity_;
+  int overlap_;
+  // explicit offsets for the GPU parts, if any
+  std::vector<size_t> offsets_;
+};
+
+enum GPUAccess {  // access levels, listed in increasing order of permission
+  kNone, kRead,
+  // write implies read
+  kWrite
+};
+
+inline GPUAccess operator-(GPUAccess a, GPUAccess b) {  // difference of enumerator values
+  return static_cast<GPUAccess>(static_cast<int>(a) - static_cast<int>(b));
+}  // NOTE(review): a < b yields no valid enumerator; presumably callers keep a >= b
+
 template <typename T>
 class HostDeviceVector {
 public:
  explicit HostDeviceVector(size_t size = 0, T v = T(),
-                            GPUSet devices = GPUSet::Empty());
-  HostDeviceVector(std::initializer_list<T> init, GPUSet devices = GPUSet::Empty());
+                            GPUDistribution distribution = GPUDistribution());
+  HostDeviceVector(std::initializer_list<T> init,
+                   GPUDistribution distribution = GPUDistribution());
  explicit HostDeviceVector(const std::vector<T>& init,
-                            GPUSet devices = GPUSet::Empty());
+                            GPUDistribution distribution = GPUDistribution());
  ~HostDeviceVector();
-  HostDeviceVector(const HostDeviceVector<T>&) = delete;
-  HostDeviceVector(HostDeviceVector<T>&&) = delete;
-  void operator=(const HostDeviceVector<T>&) = delete;
-  void operator=(HostDeviceVector<T>&&) = delete;
+  HostDeviceVector(const HostDeviceVector<T>&);
+  HostDeviceVector<T>& operator=(const HostDeviceVector<T>&);
  size_t Size() const;
  GPUSet Devices() const;
-  T* DevicePointer(int device);
+  const GPUDistribution& Distribution() const;
  common::Span<T> DeviceSpan(int device);
+  common::Span<const T> ConstDeviceSpan(int device) const;
+  common::Span<const T> DeviceSpan(int device) const { return ConstDeviceSpan(device); }
+  T* DevicePointer(int device);
+  const T* ConstDevicePointer(int device) const;
+  const T* DevicePointer(int device) const { return ConstDevicePointer(device); }

  T* HostPointer() { return HostVector().data(); }
-  size_t DeviceStart(int device);
-  size_t DeviceSize(int device);
+  const T* ConstHostPointer() const { return ConstHostVector().data(); }
+  const T* HostPointer() const { return ConstHostPointer(); }
+
+  size_t DeviceStart(int device) const;
+  size_t DeviceSize(int device) const;

  // only define functions returning device_ptr
  // if HostDeviceVector.h is included from a .cu file
 #ifdef __CUDACC__
  thrust::device_ptr<T> tbegin(int device);  // NOLINT
  thrust::device_ptr<T> tend(int device);  // NOLINT
-  void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end);
-  void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end);
+  thrust::device_ptr<const T> tcbegin(int device) const;  // NOLINT
+  thrust::device_ptr<const T> tcend(int device) const;  // NOLINT
+  thrust::device_ptr<const T> tbegin(int device) const {  // NOLINT
+    return tcbegin(device);
+  }
+  thrust::device_ptr<const T> tend(int device) const { return tcend(device); }  // NOLINT
+
+  void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end);
+  void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const;
 #endif

  void Fill(T v);
-  void Copy(HostDeviceVector<T>* other);
+  void Copy(const HostDeviceVector<T>& other);
  void Copy(const std::vector<T>& other);
  void Copy(std::initializer_list<T> other);

  std::vector<T>& HostVector();
-  void Reshard(GPUSet devices);
+  const std::vector<T>& ConstHostVector() const;
+  const std::vector<T>& HostVector() const {return ConstHostVector(); }
+
+  bool HostCanAccess(GPUAccess access) const;
+  bool DeviceCanAccess(int device, GPUAccess access) const;
+
+  void Reshard(const GPUDistribution& distribution) const;
+  void Reshard(GPUSet devices) const;
  void Resize(size_t new_size, T v = T());

 private: