Make HostDeviceVector single gpu only (#4773)

* Make HostDeviceVector single gpu only
This commit is contained in:
Rong Ou
2019-08-25 14:51:13 -07:00
committed by Rory Mitchell
parent 41227d1933
commit 38ab79f889
54 changed files with 641 additions and 1621 deletions

View File

@@ -22,48 +22,12 @@ using RandomThreadLocalStore = dmlc::ThreadLocalStore<RandomThreadLocalEntry>;
// Accessor for the per-thread global random engine. Each thread gets its own
// engine instance via RandomThreadLocalStore (a dmlc::ThreadLocalStore of
// RandomThreadLocalEntry), so callers never share engine state across threads.
GlobalRandomEngine& GlobalRandom() {
return RandomThreadLocalStore::Get()->engine;
}
} // namespace common
#if !defined(XGBOOST_USE_CUDA)
int AllVisibleImpl::AllVisible() {
int AllVisibleGPUs() {
return 0;
}
#endif // !defined(XGBOOST_USE_CUDA)
constexpr GPUSet::GpuIdType GPUSet::kAll;
// Resolve the set of devices to run on, starting at `gpu_id'.
// Both `n_gpus' and `n_rows' act as upper bounds on the number of devices
// used; n_gpus == kAll (-1) means "all visible devices from gpu_id onward".
// Returns Empty() (i.e. CPU execution) when no GPU is visible, n_gpus is 0,
// or there are no rows to process.
GPUSet GPUSet::All(GpuIdType gpu_id, GpuIdType n_gpus, int32_t n_rows) {
CHECK_GE(gpu_id, 0) << "gpu_id must be >= 0.";
CHECK_GE(n_gpus, -1) << "n_gpus must be >= -1.";
GpuIdType const n_devices_visible = AllVisible().Size();
CHECK_LE(n_gpus, n_devices_visible);
// Nothing to distribute: fall back to CPU.
if (n_devices_visible == 0 || n_gpus == 0 || n_rows == 0) {
LOG(DEBUG) << "Runing on CPU.";
return Empty();
}
// Devices remaining once we skip the first `gpu_id' of them.
GpuIdType const n_available_devices = n_devices_visible - gpu_id;
if (n_gpus == kAll) { // Use all devices starting from `gpu_id'.
CHECK(gpu_id < n_devices_visible)
<< "\ngpu_id should be less than number of visible devices.\ngpu_id: "
<< gpu_id
<< ", number of visible devices: "
<< n_devices_visible;
// Never use more devices than there are rows of data.
GpuIdType n_devices =
n_available_devices < n_rows ? n_available_devices : n_rows;
LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
return Range(gpu_id, n_devices);
} else { // Use devices in ( gpu_id, gpu_id + n_gpus ).
CHECK_LE(n_gpus, n_available_devices)
<< "Starting from gpu id: " << gpu_id << ", there are only "
<< n_available_devices << " available devices, while n_gpus is set to: "
<< n_gpus;
// Cap the explicitly requested device count by the row count.
GpuIdType n_devices = n_gpus < n_rows ? n_gpus : n_rows;
LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
return Range(gpu_id, n_devices);
}
}
} // namespace common
} // namespace xgboost

View File

@@ -4,8 +4,9 @@
#include "common.h"
namespace xgboost {
namespace common {
int AllVisibleImpl::AllVisible() {
int AllVisibleGPUs() {
int n_visgpus = 0;
try {
// When compiled with CUDA but running on CPU only device,
@@ -17,4 +18,5 @@ int AllVisibleImpl::AllVisible() {
return n_visgpus;
}
} // namespace common
} // namespace xgboost

View File

@@ -140,88 +140,8 @@ class Range {
Iterator begin_;
Iterator end_;
};
int AllVisibleGPUs();
} // namespace common
struct AllVisibleImpl {
static int AllVisible();
};
/* \brief set of devices across which HostDeviceVector can be distributed.
*
* Currently implemented as a range, but can be changed later to something else,
* e.g. a bitset
*/
class GPUSet {
public:
using GpuIdType = int;
// Sentinel value meaning "use all visible devices".
static constexpr GpuIdType kAll = -1;
// Represents the half-open device-id range [start, start + ndevices).
explicit GPUSet(int start = 0, int ndevices = 0)
: devices_(start, start + ndevices) {}
static GPUSet Empty() { return GPUSet(); }
// A non-positive `n_gpus' yields the empty set.
static GPUSet Range(GpuIdType start, GpuIdType n_gpus) {
return n_gpus <= 0 ? Empty() : GPUSet{start, n_gpus};
}
/*! \brief n_gpus and num_rows both are upper bounds. */
static GPUSet All(GpuIdType gpu_id, GpuIdType n_gpus,
GpuIdType num_rows = std::numeric_limits<GpuIdType>::max());
// Set of all devices currently visible to the process.
static GPUSet AllVisible() {
GpuIdType n = AllVisibleImpl::AllVisible();
return Range(0, n);
}
// Number of devices in the set; an inverted range counts as empty.
size_t Size() const {
GpuIdType size = *devices_.end() - *devices_.begin();
GpuIdType res = size < 0 ? 0 : size;
return static_cast<size_t>(res);
}
/*
* By default, we have two configurations of identifying device, one
* is the device id obtained from `cudaGetDevice'. But we sometimes
* store objects that allocated one for each device in a list, which
* requires a zero-based index.
*
* Hence, `DeviceId' converts a zero-based index to actual device id,
* `Index' converts a device id to a zero-based index.
*/
GpuIdType DeviceId(size_t index) const {
GpuIdType result = *devices_.begin() + static_cast<GpuIdType>(index);
CHECK(Contains(result)) << "\nDevice " << result << " is not in GPUSet."
<< "\nIndex: " << index
<< "\nGPUSet: (" << *begin() << ", " << *end() << ")"
<< std::endl;
return result;
}
GpuIdType Index(GpuIdType device) const below aborts (via CHECK) when the
size_t Index(GpuIdType device) const {
CHECK(Contains(device)) << "\nDevice " << device << " is not in GPUSet."
<< "\nGPUSet: (" << *begin() << ", " << *end() << ")"
<< std::endl;
size_t result = static_cast<size_t>(device - *devices_.begin());
return result;
}
bool IsEmpty() const { return Size() == 0; }
// True iff `device' lies inside the half-open range [begin, end).
bool Contains(GpuIdType device) const {
return *devices_.begin() <= device && device < *devices_.end();
}
common::Range::Iterator begin() const { return devices_.begin(); } // NOLINT
common::Range::Iterator end() const { return devices_.end(); } // NOLINT
friend bool operator==(const GPUSet& lhs, const GPUSet& rhs) {
return lhs.devices_ == rhs.devices_;
}
friend bool operator!=(const GPUSet& lhs, const GPUSet& rhs) {
return !(lhs == rhs);
}
private:
// Underlying half-open range of device ids.
common::Range devices_;
};
} // namespace xgboost
#endif // XGBOOST_COMMON_COMMON_H_

View File

@@ -72,22 +72,6 @@ const T *Raw(const thrust::device_vector<T> &v) { // NOLINT
return raw_pointer_cast(v.data());
}
// if n_devices=-1, then use all visible devices
inline void SynchronizeNDevices(xgboost::GPUSet devices) {
devices = devices.IsEmpty() ? xgboost::GPUSet::AllVisible() : devices;
for (auto const d : devices) {
safe_cuda(cudaSetDevice(d));
safe_cuda(cudaDeviceSynchronize());
}
}
inline void SynchronizeAll() {
for (int device_idx : xgboost::GPUSet::AllVisible()) {
safe_cuda(cudaSetDevice(device_idx));
safe_cuda(cudaDeviceSynchronize());
}
}
inline size_t AvailableMemory(int device_idx) {
size_t device_free = 0;
size_t device_total = 0;
@@ -119,7 +103,7 @@ inline size_t MaxSharedMemory(int device_idx) {
}
inline void CheckComputeCapability() {
for (int d_idx : xgboost::GPUSet::AllVisible()) {
for (int d_idx = 0; d_idx < xgboost::common::AllVisibleGPUs(); ++d_idx) {
cudaDeviceProp prop;
safe_cuda(cudaGetDeviceProperties(&prop, d_idx));
std::ostringstream oss;

View File

@@ -35,7 +35,6 @@ __global__ void FindCutsK
if (icut >= ncuts) {
return;
}
WXQSketch::Entry v;
int isample = 0;
if (icut == 0) {
isample = 0;
@@ -59,11 +58,14 @@ struct IsNotNaN {
__device__ bool operator()(float a) const { return !isnan(a); }
};
__global__ void UnpackFeaturesK
(float* __restrict__ fvalues, float* __restrict__ feature_weights,
const size_t* __restrict__ row_ptrs, const float* __restrict__ weights,
Entry* entries, size_t nrows_array, int ncols, size_t row_begin_ptr,
size_t nrows) {
__global__ void UnpackFeaturesK(float* __restrict__ fvalues,
float* __restrict__ feature_weights,
const size_t* __restrict__ row_ptrs,
const float* __restrict__ weights,
Entry* entries,
size_t nrows_array,
size_t row_begin_ptr,
size_t nrows) {
size_t irow = threadIdx.x + size_t(blockIdx.x) * blockDim.x;
if (irow >= nrows) {
return;
@@ -102,8 +104,9 @@ struct SketchContainer {
const MetaInfo &info = dmat->Info();
// Initialize Sketches for this dmatrix
sketches_.resize(info.num_col_);
#pragma omp parallel for schedule(static) if (info.num_col_ > kOmpNumColsParallelizeLimit)
for (int icol = 0; icol < info.num_col_; ++icol) {
#pragma omp parallel for default(none) shared(info, param) schedule(static) \
if (info.num_col_ > kOmpNumColsParallelizeLimit) // NOLINT
for (int icol = 0; icol < info.num_col_; ++icol) { // NOLINT
sketches_[icol].Init(info.num_row_, 1.0 / (8 * param.max_bin));
}
}
@@ -120,8 +123,6 @@ struct GPUSketcher {
// manage memory for a single GPU
class DeviceShard {
int device_;
bst_uint row_begin_; // The row offset for this shard
bst_uint row_end_;
bst_uint n_rows_;
int num_cols_{0};
size_t n_cuts_{0};
@@ -131,27 +132,31 @@ struct GPUSketcher {
tree::TrainParam param_;
SketchContainer *sketch_container_;
dh::device_vector<size_t> row_ptrs_;
dh::device_vector<Entry> entries_;
dh::device_vector<bst_float> fvalues_;
dh::device_vector<bst_float> feature_weights_;
dh::device_vector<bst_float> fvalues_cur_;
dh::device_vector<WXQSketch::Entry> cuts_d_;
thrust::host_vector<WXQSketch::Entry> cuts_h_;
dh::device_vector<bst_float> weights_;
dh::device_vector<bst_float> weights2_;
std::vector<size_t> n_cuts_cur_;
dh::device_vector<size_t> num_elements_;
dh::device_vector<char> tmp_storage_;
dh::device_vector<size_t> row_ptrs_{};
dh::device_vector<Entry> entries_{};
dh::device_vector<bst_float> fvalues_{};
dh::device_vector<bst_float> feature_weights_{};
dh::device_vector<bst_float> fvalues_cur_{};
dh::device_vector<WXQSketch::Entry> cuts_d_{};
thrust::host_vector<WXQSketch::Entry> cuts_h_{};
dh::device_vector<bst_float> weights_{};
dh::device_vector<bst_float> weights2_{};
std::vector<size_t> n_cuts_cur_{};
dh::device_vector<size_t> num_elements_{};
dh::device_vector<char> tmp_storage_{};
public:
DeviceShard(int device, bst_uint row_begin, bst_uint row_end,
tree::TrainParam param, SketchContainer *sketch_container) :
device_(device), row_begin_(row_begin), row_end_(row_end),
n_rows_(row_end - row_begin), param_(std::move(param)), sketch_container_(sketch_container) {
DeviceShard(int device,
bst_uint n_rows,
tree::TrainParam param,
SketchContainer* sketch_container) :
device_(device),
n_rows_(n_rows),
param_(std::move(param)),
sketch_container_(sketch_container) {
}
~DeviceShard() {
~DeviceShard() { // NOLINT
dh::safe_cuda(cudaSetDevice(device_));
}
@@ -319,19 +324,18 @@ struct GPUSketcher {
const auto& offset_vec = row_batch.offset.HostVector();
const auto& data_vec = row_batch.data.HostVector();
size_t n_entries = offset_vec[row_begin_ + batch_row_end] -
offset_vec[row_begin_ + batch_row_begin];
size_t n_entries = offset_vec[batch_row_end] - offset_vec[batch_row_begin];
// copy the batch to the GPU
dh::safe_cuda
(cudaMemcpyAsync(entries_.data().get(),
data_vec.data() + offset_vec[row_begin_ + batch_row_begin],
data_vec.data() + offset_vec[batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault));
// copy the weights if necessary
if (has_weights_) {
const auto& weights_vec = info.weights_.HostVector();
dh::safe_cuda
(cudaMemcpyAsync(weights_.data().get(),
weights_vec.data() + row_begin_ + batch_row_begin,
weights_vec.data() + batch_row_begin,
batch_nrows * sizeof(bst_float), cudaMemcpyDefault));
}
@@ -349,8 +353,7 @@ struct GPUSketcher {
(fvalues_.data().get(), has_weights_ ? feature_weights_.data().get() : nullptr,
row_ptrs_.data().get() + batch_row_begin,
has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
gpu_batch_nrows_, num_cols_,
offset_vec[row_begin_ + batch_row_begin], batch_nrows);
gpu_batch_nrows_, offset_vec[batch_row_begin], batch_nrows);
for (int icol = 0; icol < num_cols_; ++icol) {
FindColumnCuts(batch_nrows, icol);
@@ -358,7 +361,7 @@ struct GPUSketcher {
// add cuts into sketches
thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin());
#pragma omp parallel for schedule(static) \
#pragma omp parallel for default(none) schedule(static) \
if (num_cols_ > SketchContainer::kOmpNumColsParallelizeLimit) // NOLINT
for (int icol = 0; icol < num_cols_; ++icol) {
WXQSketch::SummaryContainer summary;
@@ -391,8 +394,7 @@ struct GPUSketcher {
dh::safe_cuda(cudaSetDevice(device_));
const auto& offset_vec = row_batch.offset.HostVector();
row_ptrs_.resize(n_rows_ + 1);
thrust::copy(offset_vec.data() + row_begin_,
offset_vec.data() + row_end_ + 1, row_ptrs_.begin());
thrust::copy(offset_vec.data(), offset_vec.data() + n_rows_ + 1, row_ptrs_.begin());
size_t gpu_nbatches = common::DivRoundUp(n_rows_, gpu_batch_nrows_);
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
SketchBatch(row_batch, info, gpu_batch);
@@ -401,32 +403,18 @@ struct GPUSketcher {
};
void SketchBatch(const SparsePage &batch, const MetaInfo &info) {
GPUDistribution dist =
GPUDistribution::Block(GPUSet::All(generic_param_.gpu_id, generic_param_.n_gpus,
batch.Size()));
auto device = generic_param_.gpu_id;
// create device shards
shards_.resize(dist.Devices().Size());
dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
size_t start = dist.ShardStart(batch.Size(), i);
size_t size = dist.ShardSize(batch.Size(), i);
shard = std::unique_ptr<DeviceShard>(
new DeviceShard(dist.Devices().DeviceId(i), start,
start + size, param_, sketch_container_.get()));
});
// create device shard
shard_.reset(new DeviceShard(device, batch.Size(), param_, sketch_container_.get()));
// compute sketches for each shard
dh::ExecuteIndexShards(&shards_,
[&](int idx, std::unique_ptr<DeviceShard>& shard) {
shard->Init(batch, info, gpu_batch_nrows_);
shard->Sketch(batch, info);
shard->ComputeRowStride();
});
// compute sketches for the shard
shard_->Init(batch, info, gpu_batch_nrows_);
shard_->Sketch(batch, info);
shard_->ComputeRowStride();
// compute row stride across all shards
for (const auto &shard : shards_) {
row_stride_ = std::max(row_stride_, shard->GetRowStride());
}
// compute row stride
row_stride_ = shard_->GetRowStride();
}
GPUSketcher(const tree::TrainParam &param, const GenericParameter &generic_param, int gpu_nrows)
@@ -444,13 +432,13 @@ struct GPUSketcher {
this->SketchBatch(batch, info);
}
hmat->Init(&sketch_container_.get()->sketches_, param_.max_bin);
hmat->Init(&sketch_container_->sketches_, param_.max_bin);
return row_stride_;
}
private:
std::vector<std::unique_ptr<DeviceShard>> shards_;
std::unique_ptr<DeviceShard> shard_;
const tree::TrainParam &param_;
const GenericParameter &generic_param_;
int gpu_batch_nrows_;

View File

@@ -30,19 +30,19 @@ struct HostDeviceVectorImpl {
};
template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, const GPUDistribution &)
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, const GPUDistribution &)
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, const GPUDistribution &)
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
@@ -75,29 +75,23 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
template <typename T>
GPUSet HostDeviceVector<T>::Devices() const { return GPUSet::Empty(); }
int HostDeviceVector<T>::DeviceIdx() const { return -1; }
template <typename T>
const GPUDistribution& HostDeviceVector<T>::Distribution() const {
static GPUDistribution dummyInstance;
return dummyInstance;
}
T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; }
template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
const T* HostDeviceVector<T>::ConstDevicePointer() const {
return nullptr;
}
template <typename T>
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
common::Span<T> HostDeviceVector<T>::DeviceSpan() {
return common::Span<T>();
}
template <typename T>
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
return common::Span<const T>();
}
@@ -115,10 +109,7 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
}
template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) const { return 0; }
template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) const { return 0; }
size_t HostDeviceVector<T>::DeviceSize() const { return 0; }
template <typename T>
void HostDeviceVector<T>::Fill(T v) {
@@ -149,18 +140,12 @@ bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
}
template <typename T>
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
return false;
}
template <typename T>
void HostDeviceVector<T>::Shard(const GPUDistribution& distribution) const { }
template <typename T>
void HostDeviceVector<T>::Shard(GPUSet devices) const { }
template <typename T>
void Reshard(const GPUDistribution &distribution) { }
void HostDeviceVector<T>::SetDevice(int device) const {}
// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;

View File

@@ -10,7 +10,6 @@
#include <mutex>
#include "./device_helpers.cuh"
namespace xgboost {
// the handler to call instead of cudaSetDevice; only used for testing
@@ -43,144 +42,12 @@ class Permissions {
};
template <typename T>
struct HostDeviceVectorImpl {
struct DeviceShard {
DeviceShard()
: proper_size_{0}, device_{-1}, start_{0}, perm_d_{false},
cached_size_{static_cast<size_t>(~0)}, vec_{nullptr} {}
~DeviceShard() {
class HostDeviceVectorImpl {
public:
HostDeviceVectorImpl(size_t size, T v, int device) : device_(device), perm_h_(device < 0) {
if (device >= 0) {
SetDevice();
}
void Init(HostDeviceVectorImpl<T>* vec, int device) {
if (vec_ == nullptr) { vec_ = vec; }
CHECK_EQ(vec, vec_);
device_ = device;
LazyResize(vec_->Size());
perm_d_ = vec_->perm_h_.Complementary();
}
void Init(HostDeviceVectorImpl<T>* vec, const DeviceShard& other) {
if (vec_ == nullptr) { vec_ = vec; }
CHECK_EQ(vec, vec_);
device_ = other.device_;
cached_size_ = other.cached_size_;
start_ = other.start_;
proper_size_ = other.proper_size_;
SetDevice();
data_.resize(other.data_.size());
perm_d_ = other.perm_d_;
}
void ScatterFrom(const T* begin) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), begin + start_,
data_.size() * sizeof(T), cudaMemcpyDefault));
}
void GatherTo(thrust::device_ptr<T> begin) {
LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(begin.get() + start_, data_.data().get(),
proper_size_ * sizeof(T), cudaMemcpyDefault));
}
void Fill(T v) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
thrust::fill(data_.begin(), data_.end(), v);
}
void Copy(DeviceShard* other) {
// TODO(canonizer): avoid full copy of host data for this (but not for other)
LazySyncDevice(GPUAccess::kWrite);
other->LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), other->data_.data().get(),
data_.size() * sizeof(T), cudaMemcpyDefault));
}
void LazySyncHost(GPUAccess access) {
SetDevice();
dh::safe_cuda(cudaMemcpy(vec_->data_h_.data() + start_,
data_.data().get(), proper_size_ * sizeof(T),
cudaMemcpyDeviceToHost));
perm_d_.DenyComplementary(access);
}
void LazyResize(size_t new_size) {
if (new_size == cached_size_) { return; }
// resize is required
int ndevices = vec_->distribution_.devices_.Size();
int device_index = vec_->distribution_.devices_.Index(device_);
start_ = vec_->distribution_.ShardStart(new_size, device_index);
proper_size_ = vec_->distribution_.ShardProperSize(new_size, device_index);
// The size on this device.
size_t size_d = vec_->distribution_.ShardSize(new_size, device_index);
SetDevice();
data_.resize(size_d);
cached_size_ = new_size;
}
void LazySyncDevice(GPUAccess access) {
if (perm_d_.CanAccess(access)) { return; }
if (perm_d_.CanRead()) {
// deny read to the host
perm_d_.Grant(access);
std::lock_guard<std::mutex> lock(vec_->mutex_);
vec_->perm_h_.DenyComplementary(access);
return;
}
// data is on the host
size_t size_h = vec_->data_h_.size();
LazyResize(size_h);
SetDevice();
dh::safe_cuda(
cudaMemcpy(data_.data().get(), vec_->data_h_.data() + start_,
data_.size() * sizeof(T), cudaMemcpyHostToDevice));
perm_d_.Grant(access);
std::lock_guard<std::mutex> lock(vec_->mutex_);
vec_->perm_h_.DenyComplementary(access);
vec_->size_d_ = size_h;
}
void SetDevice() {
if (cudaSetDeviceHandler == nullptr) {
dh::safe_cuda(cudaSetDevice(device_));
} else {
(*cudaSetDeviceHandler)(device_);
}
}
T* Raw() { return data_.data().get(); }
size_t Start() const { return start_; }
size_t DataSize() const { return data_.size(); }
Permissions& Perm() { return perm_d_; }
Permissions const& Perm() const { return perm_d_; }
private:
int device_;
dh::device_vector<T> data_;
// cached vector size
size_t cached_size_;
size_t start_;
// size of the portion to copy back to the host
size_t proper_size_;
Permissions perm_d_;
HostDeviceVectorImpl<T>* vec_;
};
HostDeviceVectorImpl(size_t size, T v, const GPUDistribution &distribution)
: distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
if (!distribution_.IsEmpty()) {
size_d_ = size;
InitShards();
Fill(v);
data_d_.resize(size, v);
} else {
data_h_.resize(size, v);
}
@@ -188,127 +55,81 @@ struct HostDeviceVectorImpl {
// required, as a new std::mutex has to be created
HostDeviceVectorImpl(const HostDeviceVectorImpl<T>& other)
: data_h_(other.data_h_), perm_h_(other.perm_h_), size_d_(other.size_d_),
distribution_(other.distribution_), mutex_() {
shards_.resize(other.shards_.size());
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
shard.Init(this, other.shards_.at(i));
});
: device_(other.device_), data_h_(other.data_h_), perm_h_(other.perm_h_), mutex_() {
if (device_ >= 0) {
SetDevice();
data_d_ = other.data_d_;
}
}
// Initializer can be std::vector<T> or std::initializer_list<T>
template <class Initializer>
HostDeviceVectorImpl(const Initializer& init, const GPUDistribution &distribution)
: distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
if (!distribution_.IsEmpty()) {
size_d_ = init.size();
InitShards();
HostDeviceVectorImpl(const Initializer& init, int device) : device_(device), perm_h_(device < 0) {
if (device >= 0) {
LazyResizeDevice(init.size());
Copy(init);
} else {
data_h_ = init;
}
}
void InitShards() {
int ndevices = distribution_.devices_.Size();
shards_.resize(ndevices);
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
shard.Init(this, distribution_.devices_.DeviceId(i));
});
~HostDeviceVectorImpl() {
if (device_ >= 0) {
SetDevice();
}
}
size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : size_d_; }
size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : data_d_.size(); }
GPUSet Devices() const { return distribution_.devices_; }
int DeviceIdx() const { return device_; }
const GPUDistribution& Distribution() const { return distribution_; }
T* DevicePointer(int device) {
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kWrite);
return shards_.at(distribution_.devices_.Index(device)).Raw();
T* DevicePointer() {
LazySyncDevice(GPUAccess::kWrite);
return data_d_.data().get();
}
const T* ConstDevicePointer(int device) {
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return shards_.at(distribution_.devices_.Index(device)).Raw();
const T* ConstDevicePointer() {
LazySyncDevice(GPUAccess::kRead);
return data_d_.data().get();
}
common::Span<T> DeviceSpan(int device) {
GPUSet devices = distribution_.devices_;
CHECK(devices.Contains(device));
LazySyncDevice(device, GPUAccess::kWrite);
return {shards_.at(devices.Index(device)).Raw(),
static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
common::Span<T> DeviceSpan() {
LazySyncDevice(GPUAccess::kWrite);
return {data_d_.data().get(), static_cast<typename common::Span<T>::index_type>(DeviceSize())};
}
common::Span<const T> ConstDeviceSpan(int device) {
GPUSet devices = distribution_.devices_;
CHECK(devices.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
common::Span<const T> ConstDeviceSpan() {
LazySyncDevice(GPUAccess::kRead);
using SpanInd = typename common::Span<const T>::index_type;
return {shards_.at(devices.Index(device)).Raw(),
static_cast<SpanInd>(DeviceSize(device))};
return {data_d_.data().get(), static_cast<SpanInd>(DeviceSize())};
}
size_t DeviceSize(int device) {
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return shards_.at(distribution_.devices_.Index(device)).DataSize();
size_t DeviceSize() {
LazySyncDevice(GPUAccess::kRead);
return data_d_.size();
}
size_t DeviceStart(int device) {
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return shards_.at(distribution_.devices_.Index(device)).Start();
thrust::device_ptr<T> tbegin() { // NOLINT
return thrust::device_ptr<T>(DevicePointer());
}
thrust::device_ptr<T> tbegin(int device) { // NOLINT
return thrust::device_ptr<T>(DevicePointer(device));
thrust::device_ptr<const T> tcbegin() { // NOLINT
return thrust::device_ptr<const T>(ConstDevicePointer());
}
thrust::device_ptr<const T> tcbegin(int device) { // NOLINT
return thrust::device_ptr<const T>(ConstDevicePointer(device));
thrust::device_ptr<T> tend() { // NOLINT
return tbegin() + DeviceSize();
}
thrust::device_ptr<T> tend(int device) { // NOLINT
return tbegin(device) + DeviceSize(device);
}
thrust::device_ptr<const T> tcend(int device) { // NOLINT
return tcbegin(device) + DeviceSize(device);
}
void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
CHECK_EQ(end - begin, Size());
if (perm_h_.CanWrite()) {
dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
(end - begin) * sizeof(T),
cudaMemcpyDeviceToHost));
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.ScatterFrom(begin.get());
});
}
}
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
CHECK_EQ(end - begin, Size());
if (perm_h_.CanWrite()) {
dh::safe_cuda(cudaMemcpy(begin.get(), data_h_.data(),
data_h_.size() * sizeof(T),
cudaMemcpyHostToDevice));
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.GatherTo(begin); });
}
thrust::device_ptr<const T> tcend() { // NOLINT
return tcbegin() + DeviceSize();
}
void Fill(T v) { // NOLINT
if (perm_h_.CanWrite()) {
std::fill(data_h_.begin(), data_h_.end(), v);
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.Fill(v); });
DeviceFill(v);
}
}
@@ -320,14 +141,10 @@ struct HostDeviceVectorImpl {
return;
}
// Data is on device;
if (distribution_ != other->distribution_) {
distribution_ = GPUDistribution();
Shard(other->Distribution());
size_d_ = other->size_d_;
if (device_ != other->device_) {
SetDevice(other->device_);
}
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
shard.Copy(&other->shards_.at(i));
});
DeviceCopy(other);
}
void Copy(const std::vector<T>& other) {
@@ -335,9 +152,7 @@ struct HostDeviceVectorImpl {
if (perm_h_.CanWrite()) {
std::copy(other.begin(), other.end(), data_h_.begin());
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.ScatterFrom(other.data());
});
DeviceCopy(other.data());
}
}
@@ -346,9 +161,7 @@ struct HostDeviceVectorImpl {
if (perm_h_.CanWrite()) {
std::copy(other.begin(), other.end(), data_h_.begin());
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.ScatterFrom(other.begin());
});
DeviceCopy(other.begin());
}
}
@@ -362,40 +175,23 @@ struct HostDeviceVectorImpl {
return data_h_;
}
void Shard(const GPUDistribution& distribution) {
if (distribution_ == distribution) { return; }
CHECK(distribution_.IsEmpty())
<< "Data resides on different GPUs: " << "ID: "
<< *(distribution_.Devices().begin()) << " and ID: "
<< *(distribution.Devices().begin());
distribution_ = distribution;
InitShards();
}
void Shard(GPUSet new_devices) {
if (distribution_.Devices() == new_devices) { return; }
Shard(GPUDistribution::Block(new_devices));
}
void Reshard(const GPUDistribution &distribution) {
if (distribution_ == distribution) { return; }
LazySyncHost(GPUAccess::kWrite);
distribution_ = distribution;
shards_.clear();
InitShards();
void SetDevice(int device) {
if (device_ == device) { return; }
if (device_ >= 0) {
LazySyncHost(GPUAccess::kWrite);
}
device_ = device;
if (device_ >= 0) {
LazyResizeDevice(data_h_.size());
}
}
void Resize(size_t new_size, T v) {
if (new_size == Size()) { return; }
if (distribution_.IsFixedSize()) {
CHECK_EQ(new_size, distribution_.offsets_.back());
}
if (Size() == 0 && !distribution_.IsEmpty()) {
if (Size() == 0 && device_ >= 0) {
// fast on-device resize
perm_h_ = Permissions(false);
size_d_ = new_size;
InitShards();
Fill(v);
data_d_.resize(new_size, v);
} else {
// resize on host
LazySyncHost(GPUAccess::kWrite);
@@ -407,72 +203,110 @@ struct HostDeviceVectorImpl {
if (perm_h_.CanAccess(access)) { return; }
if (perm_h_.CanRead()) {
// data is present, just need to deny access to the device
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.Perm().DenyComplementary(access);
});
perm_h_.Grant(access);
return;
}
std::lock_guard<std::mutex> lock(mutex_);
if (data_h_.size() != size_d_) { data_h_.resize(size_d_); }
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.LazySyncHost(access);
});
if (data_h_.size() != data_d_.size()) { data_h_.resize(data_d_.size()); }
SetDevice();
dh::safe_cuda(cudaMemcpy(data_h_.data(),
data_d_.data().get(),
data_d_.size() * sizeof(T),
cudaMemcpyDeviceToHost));
perm_h_.Grant(access);
}
void LazySyncDevice(int device, GPUAccess access) {
GPUSet devices = distribution_.Devices();
CHECK(devices.Contains(device));
shards_.at(devices.Index(device)).LazySyncDevice(access);
void LazySyncDevice(GPUAccess access) {
if (DevicePerm().CanAccess(access)) { return; }
if (DevicePerm().CanRead()) {
// deny read to the host
std::lock_guard<std::mutex> lock(mutex_);
perm_h_.DenyComplementary(access);
return;
}
// data is on the host
LazyResizeDevice(data_h_.size());
SetDevice();
dh::safe_cuda(cudaMemcpy(data_d_.data().get(),
data_h_.data(),
data_d_.size() * sizeof(T),
cudaMemcpyHostToDevice));
std::lock_guard<std::mutex> lock(mutex_);
perm_h_.DenyComplementary(access);
}
bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); }
bool DeviceCanAccess(int device, GPUAccess access) {
GPUSet devices = distribution_.Devices();
if (!devices.Contains(device)) { return false; }
return shards_.at(devices.Index(device)).Perm().CanAccess(access);
}
bool DeviceCanAccess(GPUAccess access) { return DevicePerm().CanAccess(access); }
private:
std::vector<T> data_h_;
Permissions perm_h_;
// the total size of the data stored on the devices
size_t size_d_;
GPUDistribution distribution_;
int device_{-1};
std::vector<T> data_h_{};
dh::device_vector<T> data_d_{};
Permissions perm_h_{false};
// protects size_d_ and perm_h_ when updated from multiple threads
std::mutex mutex_;
std::vector<DeviceShard> shards_;
std::mutex mutex_{};
void DeviceFill(T v) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
thrust::fill(data_d_.begin(), data_d_.end(), v);
}
void DeviceCopy(HostDeviceVectorImpl* other) {
// TODO(canonizer): avoid full copy of host data for this (but not for other)
LazySyncDevice(GPUAccess::kWrite);
other->LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), other->data_d_.data().get(),
data_d_.size() * sizeof(T), cudaMemcpyDefault));
}
void DeviceCopy(const T* begin) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), begin,
data_d_.size() * sizeof(T), cudaMemcpyDefault));
}
void LazyResizeDevice(size_t new_size) {
if (new_size == data_d_.size()) { return; }
SetDevice();
data_d_.resize(new_size);
}
void SetDevice() {
CHECK_GE(device_, 0);
if (cudaSetDeviceHandler == nullptr) {
dh::safe_cuda(cudaSetDevice(device_));
} else {
(*cudaSetDeviceHandler)(device_);
}
}
Permissions DevicePerm() const { return perm_h_.Complementary(); }
};
template <typename T>
HostDeviceVector<T>::HostDeviceVector
(size_t size, T v, const GPUDistribution &distribution) : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v, distribution);
}
template<typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
: impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector
(std::initializer_list<T> init, const GPUDistribution &distribution) : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector
(const std::vector<T>& init, const GPUDistribution &distribution) : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
}
: impl_(new HostDeviceVectorImpl<T>(*other.impl_)) {}
template <typename T>
HostDeviceVector<T>& HostDeviceVector<T>::operator=
(const HostDeviceVector<T>& other) {
HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
if (this == &other) { return *this; }
std::unique_ptr<HostDeviceVectorImpl<T>> newImpl(new HostDeviceVectorImpl<T>(*other.impl_));
@@ -491,73 +325,51 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
template <typename T>
GPUSet HostDeviceVector<T>::Devices() const { return impl_->Devices(); }
int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
template <typename T>
const GPUDistribution& HostDeviceVector<T>::Distribution() const {
return impl_->Distribution();
T* HostDeviceVector<T>::DevicePointer() {
return impl_->DevicePointer();
}
template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) {
return impl_->DevicePointer(device);
const T* HostDeviceVector<T>::ConstDevicePointer() const {
return impl_->ConstDevicePointer();
}
template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
return impl_->ConstDevicePointer(device);
common::Span<T> HostDeviceVector<T>::DeviceSpan() {
return impl_->DeviceSpan();
}
template <typename T>
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
return impl_->DeviceSpan(device);
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
return impl_->ConstDeviceSpan();
}
template <typename T>
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
return impl_->ConstDeviceSpan(device);
size_t HostDeviceVector<T>::DeviceSize() const {
return impl_->DeviceSize();
}
template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) const {
return impl_->DeviceStart(device);
thrust::device_ptr<T> HostDeviceVector<T>::tbegin() { // NOLINT
return impl_->tbegin();
}
template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) const {
return impl_->DeviceSize(device);
thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin() const { // NOLINT
return impl_->tcbegin();
}
template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) { // NOLINT
return impl_->tbegin(device);
thrust::device_ptr<T> HostDeviceVector<T>::tend() { // NOLINT
return impl_->tend();
}
template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin(int device) const { // NOLINT
return impl_->tcbegin(device);
}
template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) { // NOLINT
return impl_->tend(device);
}
template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcend(int device) const { // NOLINT
return impl_->tcend(device);
}
template <typename T>
void HostDeviceVector<T>::ScatterFrom
(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
impl_->ScatterFrom(begin, end);
}
template <typename T>
void HostDeviceVector<T>::GatherTo
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const {
impl_->GatherTo(begin, end);
thrust::device_ptr<const T> HostDeviceVector<T>::tcend() const { // NOLINT
return impl_->tcend();
}
template <typename T>
@@ -594,23 +406,13 @@ bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
}
template <typename T>
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
return impl_->DeviceCanAccess(device, access);
bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
return impl_->DeviceCanAccess(access);
}
template <typename T>
void HostDeviceVector<T>::Shard(GPUSet new_devices) const {
impl_->Shard(new_devices);
}
template <typename T>
void HostDeviceVector<T>::Shard(const GPUDistribution &distribution) const {
impl_->Shard(distribution);
}
template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution &distribution) {
impl_->Reshard(distribution);
void HostDeviceVector<T>::SetDevice(int device) const {
impl_->SetDevice(device);
}
template <typename T>

View File

@@ -79,113 +79,6 @@ void SetCudaSetDeviceHandler(void (*handler)(int));
template <typename T> struct HostDeviceVectorImpl;
// Distribution for the HostDeviceVector; it specifies such aspects as the
// devices it is distributed on, whether there are copies of elements from
// other GPUs as well as the granularity of splitting. It may also specify
// explicit boundaries for devices, in which case the size of the array cannot
// be changed.
class GPUDistribution {
template<typename T> friend struct HostDeviceVectorImpl;
public:
explicit GPUDistribution(GPUSet devices = GPUSet::Empty())
: devices_(devices), granularity_(1), overlap_(0) {}
private:
GPUDistribution(GPUSet devices, int granularity, int overlap,
std::vector<size_t> &&offsets)
: devices_(devices), granularity_(granularity), overlap_(overlap),
offsets_(std::move(offsets)) {}
public:
static GPUDistribution Empty() { return GPUDistribution(); }
static GPUDistribution Block(GPUSet devices) { return GPUDistribution(devices); }
static GPUDistribution Overlap(GPUSet devices, int overlap) {
return GPUDistribution(devices, 1, overlap, std::vector<size_t>());
}
static GPUDistribution Granular(GPUSet devices, int granularity) {
return GPUDistribution(devices, granularity, 0, std::vector<size_t>());
}
// NOTE(rongou): Explicit offsets don't necessarily cover the whole vector. Sections before the
// first shard or after the last shard may be on host only. This windowing is done in the GPU
// predictor for external memory support.
static GPUDistribution Explicit(GPUSet devices, std::vector<size_t> offsets) {
return GPUDistribution(devices, 1, 0, std::move(offsets));
}
friend bool operator==(const GPUDistribution& a, const GPUDistribution& b) {
bool const res = a.devices_ == b.devices_ &&
a.granularity_ == b.granularity_ &&
a.overlap_ == b.overlap_ &&
a.offsets_ == b.offsets_;
return res;
}
friend bool operator!=(const GPUDistribution& a, const GPUDistribution& b) {
return !(a == b);
}
GPUSet Devices() const { return devices_; }
bool IsEmpty() const { return devices_.IsEmpty(); }
size_t ShardStart(size_t size, int index) const {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
CHECK_EQ(offsets_.back(), size);
return offsets_.at(index);
}
// no explicit offsets
size_t begin = std::min(index * Portion(size), size);
begin = begin > size ? size : begin;
return begin;
}
size_t ShardSize(size_t size, size_t index) const {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
CHECK_EQ(offsets_.back(), size);
return offsets_.at(index + 1) - offsets_.at(index) +
(index == devices_.Size() - 1 ? overlap_ : 0);
}
size_t portion = Portion(size);
size_t begin = std::min(index * portion, size);
size_t end = std::min((index + 1) * portion + overlap_ * granularity_, size);
return end - begin;
}
size_t ShardProperSize(size_t size, size_t index) const {
if (size == 0) { return 0; }
return ShardSize(size, index) - (devices_.Size() - 1 > index ? overlap_ : 0);
}
bool IsFixedSize() const { return !offsets_.empty(); }
private:
static size_t DivRoundUp(size_t a, size_t b) { return (a + b - 1) / b; }
static size_t RoundUp(size_t a, size_t b) { return DivRoundUp(a, b) * b; }
size_t Portion(size_t size) const {
return RoundUp
(DivRoundUp
(std::max(static_cast<int64_t>(size - overlap_ * granularity_),
static_cast<int64_t>(1)),
devices_.Size()), granularity_);
}
GPUSet devices_;
int granularity_;
int overlap_;
// explicit offsets for the GPU parts, if any
std::vector<size_t> offsets_;
};
enum GPUAccess {
kNone, kRead,
// write implies read
@@ -199,46 +92,38 @@ inline GPUAccess operator-(GPUAccess a, GPUAccess b) {
template <typename T>
class HostDeviceVector {
public:
explicit HostDeviceVector(size_t size = 0, T v = T(),
const GPUDistribution &distribution = GPUDistribution());
HostDeviceVector(std::initializer_list<T> init,
const GPUDistribution &distribution = GPUDistribution());
explicit HostDeviceVector(const std::vector<T>& init,
const GPUDistribution &distribution = GPUDistribution());
explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1);
HostDeviceVector(std::initializer_list<T> init, int device = -1);
explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
~HostDeviceVector();
HostDeviceVector(const HostDeviceVector<T>&);
HostDeviceVector<T>& operator=(const HostDeviceVector<T>&);
size_t Size() const;
GPUSet Devices() const;
const GPUDistribution& Distribution() const;
common::Span<T> DeviceSpan(int device);
common::Span<const T> ConstDeviceSpan(int device) const;
common::Span<const T> DeviceSpan(int device) const { return ConstDeviceSpan(device); }
T* DevicePointer(int device);
const T* ConstDevicePointer(int device) const;
const T* DevicePointer(int device) const { return ConstDevicePointer(device); }
int DeviceIdx() const;
common::Span<T> DeviceSpan();
common::Span<const T> ConstDeviceSpan() const;
common::Span<const T> DeviceSpan() const { return ConstDeviceSpan(); }
T* DevicePointer();
const T* ConstDevicePointer() const;
const T* DevicePointer() const { return ConstDevicePointer(); }
T* HostPointer() { return HostVector().data(); }
const T* ConstHostPointer() const { return ConstHostVector().data(); }
const T* HostPointer() const { return ConstHostPointer(); }
size_t DeviceStart(int device) const;
size_t DeviceSize(int device) const;
size_t DeviceSize() const;
// only define functions returning device_ptr
// if HostDeviceVector.h is included from a .cu file
#ifdef __CUDACC__
thrust::device_ptr<T> tbegin(int device); // NOLINT
thrust::device_ptr<T> tend(int device); // NOLINT
thrust::device_ptr<const T> tcbegin(int device) const; // NOLINT
thrust::device_ptr<const T> tcend(int device) const; // NOLINT
thrust::device_ptr<const T> tbegin(int device) const { // NOLINT
return tcbegin(device);
thrust::device_ptr<T> tbegin(); // NOLINT
thrust::device_ptr<T> tend(); // NOLINT
thrust::device_ptr<const T> tcbegin() const; // NOLINT
thrust::device_ptr<const T> tcend() const; // NOLINT
thrust::device_ptr<const T> tbegin() const { // NOLINT
return tcbegin();
}
thrust::device_ptr<const T> tend(int device) const { return tcend(device); } // NOLINT
void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end);
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const;
thrust::device_ptr<const T> tend() const { return tcend(); } // NOLINT
#endif // __CUDACC__
void Fill(T v);
@@ -251,18 +136,9 @@ class HostDeviceVector {
const std::vector<T>& HostVector() const {return ConstHostVector(); }
bool HostCanAccess(GPUAccess access) const;
bool DeviceCanAccess(int device, GPUAccess access) const;
bool DeviceCanAccess(GPUAccess access) const;
/*!
* \brief Specify memory distribution.
*/
void Shard(const GPUDistribution &distribution) const;
void Shard(GPUSet devices) const;
/*!
* \brief Change memory distribution.
*/
void Reshard(const GPUDistribution &distribution);
void SetDevice(int device) const;
void Resize(size_t new_size, T v = T());

View File

@@ -57,14 +57,10 @@ class Transform {
template <typename Functor>
struct Evaluator {
public:
Evaluator(Functor func, Range range, GPUSet devices, bool shard) :
Evaluator(Functor func, Range range, int device, bool shard) :
func_(func), range_{std::move(range)},
shard_{shard},
distribution_{GPUDistribution::Block(devices)} {}
Evaluator(Functor func, Range range, GPUDistribution dist,
bool shard) :
func_(func), range_{std::move(range)}, shard_{shard},
distribution_{std::move(dist)} {}
device_{device} {}
/*!
* \brief Evaluate the functor with input pointers to HostDeviceVector.
@@ -74,7 +70,7 @@ class Transform {
*/
template <typename... HDV>
void Eval(HDV... vectors) const {
bool on_device = !distribution_.IsEmpty();
bool on_device = device_ >= 0;
if (on_device) {
LaunchCUDA(func_, vectors...);
@@ -86,13 +82,13 @@ class Transform {
private:
// CUDA UnpackHDV
template <typename T>
Span<T> UnpackHDV(HostDeviceVector<T>* _vec, int _device) const {
auto span = _vec->DeviceSpan(_device);
Span<T> UnpackHDVOnDevice(HostDeviceVector<T>* _vec) const {
auto span = _vec->DeviceSpan();
return span;
}
template <typename T>
Span<T const> UnpackHDV(const HostDeviceVector<T>* _vec, int _device) const {
auto span = _vec->ConstDeviceSpan(_device);
Span<T const> UnpackHDVOnDevice(const HostDeviceVector<T>* _vec) const {
auto span = _vec->ConstDeviceSpan();
return span;
}
// CPU UnpackHDV
@@ -108,15 +104,15 @@ class Transform {
}
// Recursive unpack for Shard.
template <typename T>
void UnpackShard(GPUDistribution dist, const HostDeviceVector<T> *vector) const {
vector->Shard(dist);
void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
vector->SetDevice(device);
}
template <typename Head, typename... Rest>
void UnpackShard(GPUDistribution dist,
void UnpackShard(int device,
const HostDeviceVector<Head> *_vector,
const HostDeviceVector<Rest> *... _vectors) const {
_vector->Shard(dist);
UnpackShard(dist, _vectors...);
_vector->SetDevice(device);
UnpackShard(device, _vectors...);
}
#if defined(__CUDACC__)
@@ -124,28 +120,20 @@ class Transform {
typename... HDV>
void LaunchCUDA(Functor _func, HDV*... _vectors) const {
if (shard_)
UnpackShard(distribution_, _vectors...);
UnpackShard(device_, _vectors...);
GPUSet devices = distribution_.Devices();
size_t range_size = *range_.end() - *range_.begin();
// Extract index to deal with possible old OpenMP.
size_t device_beg = *(devices.begin());
size_t device_end = *(devices.end());
#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
for (omp_ulong device = device_beg; device < device_end; ++device) { // NOLINT
// Ignore other attributes of GPUDistribution for spliting index.
// This deals with situation like multi-class setting where
// granularity is used in data vector.
size_t shard_size = GPUDistribution::Block(devices).ShardSize(
range_size, devices.Index(device));
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
dh::safe_cuda(cudaSetDevice(device));
const int GRID_SIZE =
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
_func, shard_range, UnpackHDV(_vectors, device)...);
}
// This deals with situation like multi-class setting where
// granularity is used in data vector.
size_t shard_size = range_size;
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
dh::safe_cuda(cudaSetDevice(device_));
const int GRID_SIZE =
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
_func, shard_range, UnpackHDVOnDevice(_vectors)...);
}
#else
/*! \brief Dummy funtion defined when compiling for CPU. */
@@ -172,7 +160,7 @@ class Transform {
Range range_;
/*! \brief Whether sharding for vectors is required. */
bool shard_;
GPUDistribution distribution_;
int device_;
};
public:
@@ -191,15 +179,9 @@ class Transform {
*/
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUSet const devices,
int device,
bool const shard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(devices), shard};
}
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUDistribution const dist,
bool const shard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(dist), shard};
return Evaluator<Functor> {func, std::move(range), device, shard};
}
};