Multi-GPU HostDeviceVector. (#3287)
* Multi-GPU HostDeviceVector.
  - HostDeviceVector instances can now span multiple devices, defined by the GPUSet struct
  - the interface of HostDeviceVector has been modified accordingly
  - GPU objective functions are now multi-GPU
  - GPU prediction from cache is now multi-GPU
  - avoids omp_set_num_threads() calls
  - other minor changes
This commit is contained in:
parent 90a5c4db9d
commit b8a0d66fe6
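As context for the diff below, here is a hedged usage sketch of the reworked interface (it assumes the sources from this commit; the include path is an assumption, and HostDeviceVector<float> relies on the explicit bst_float instantiation):

#include "../src/common/host_device_vector.h"  // assumed location inside the xgboost tree

void Sketch() {
  using xgboost::GPUSet;
  using xgboost::HostDeviceVector;

  GPUSet devices = GPUSet::Range(0, 2);             // GPUs 0 and 1
  HostDeviceVector<float> vec(100, 0.0f, devices);  // spans both devices

  vec.Fill(1.0f);                                   // runs on every shard
  for (int i = 0; i < devices.Size(); ++i) {
    int d = devices[i];
    // each device owns rows [DeviceStart(d), DeviceStart(d) + DeviceSize(d))
    float* shard = vec.DevicePointer(d);
    (void)shard;
  }

  std::vector<float>& host = vec.HostVector();      // lazily copies all shards back
  (void)host;

  HostDeviceVector<float> other(100);               // host-only until resharded
  other.Reshard(devices);                           // distribute across the same GPUSet
}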
@@ -1004,14 +1004,29 @@ class AllReducer {

template <typename T, typename FunctionT>
void ExecuteShards(std::vector<T> *shards, FunctionT f) {
  auto previous_num_threads = omp_get_max_threads();
  omp_set_num_threads(shards->size());
#pragma omp parallel
  {
    auto cpu_thread_id = omp_get_thread_num();
    f(shards->at(cpu_thread_id));
#pragma omp parallel for schedule(static, 1)
  for (int shard = 0; shard < shards->size(); ++shard) {
    f(shards->at(shard));
  }
}

/**
 * \brief Executes some operation on each element of the input vector, using a
 *  single controlling thread for each element. In addition, passes the shard index
 *  into the function.
 *
 * \tparam T Generic type parameter.
 * \tparam FunctionT Type of the function t.
 * \param shards The shards.
 * \param f The func_t to process.
 */

template <typename T, typename FunctionT>
void ExecuteIndexShards(std::vector<T> *shards, FunctionT f) {
#pragma omp parallel for schedule(static, 1)
  for (int shard = 0; shard < shards->size(); ++shard) {
    f(shard, shards->at(shard));
  }
  omp_set_num_threads(previous_num_threads);
}

/**
@@ -1029,15 +1044,11 @@ void ExecuteShards(std::vector<T> *shards, FunctionT f) {

template <typename ReduceT,typename T, typename FunctionT>
ReduceT ReduceShards(std::vector<T> *shards, FunctionT f) {
  auto previous_num_threads = omp_get_max_threads();
  omp_set_num_threads(shards->size());
  std::vector<ReduceT> sums(shards->size());
#pragma omp parallel
  {
    auto cpu_thread_id = omp_get_thread_num();
    sums[cpu_thread_id] = f(shards->at(cpu_thread_id));
#pragma omp parallel for schedule(static, 1)
  for (int shard = 0; shard < shards->size(); ++shard) {
    sums[shard] = f(shards->at(shard));
  }
  omp_set_num_threads(previous_num_threads);
  return std::accumulate(sums.begin(), sums.end(), ReduceT());
}
} // namespace dh
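The helpers above boil down to one statically scheduled OpenMP iteration (one controlling CPU thread) per shard running a caller-supplied functor, instead of resizing the global thread pool with omp_set_num_threads(). A minimal standalone sketch of the same pattern (an assumed simplification, not the dh:: code itself; plain ints stand in for device shards):

#include <cstdio>
#include <numeric>
#include <vector>

template <typename T, typename FunctionT>
void ExecuteShards(std::vector<T>* shards, FunctionT f) {
  // one loop iteration, and hence one controlling thread, per shard
#pragma omp parallel for schedule(static, 1)
  for (int shard = 0; shard < static_cast<int>(shards->size()); ++shard) {
    f(shards->at(shard));
  }
}

template <typename ReduceT, typename T, typename FunctionT>
ReduceT ReduceShards(std::vector<T>* shards, FunctionT f) {
  std::vector<ReduceT> sums(shards->size());
#pragma omp parallel for schedule(static, 1)
  for (int shard = 0; shard < static_cast<int>(shards->size()); ++shard) {
    sums[shard] = f(shards->at(shard));   // per-shard partial result
  }
  return std::accumulate(sums.begin(), sums.end(), ReduceT());
}

int main() {
  std::vector<int> shards{1, 2, 3, 4};
  ExecuteShards(&shards, [](int& s) { s *= 10; });                  // shards become 10 20 30 40
  int total = ReduceShards<int>(&shards, [](int& s) { return s; }); // 100
  std::printf("total = %d\n", total);
  return 0;
}

Compile with e.g. g++ -fopenmp; without OpenMP the pragmas are ignored and the loops simply run serially.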
@@ -21,18 +21,18 @@ struct HostDeviceVectorImpl {
};

template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device) : impl_(nullptr) {
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUSet devices) : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(size, v);
}

template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUSet devices)
  : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(init);
}

template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUSet devices)
  : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(init);
}
@@ -48,7 +48,7 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->data_h_.size(); }

template <typename T>
int HostDeviceVector<T>::DeviceIdx() const { return -1; }
GPUSet HostDeviceVector<T>::Devices() const { return GPUSet::Empty(); }

template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; }
@@ -57,13 +57,46 @@ template <typename T>
std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->data_h_; }

template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size, T v, int new_device) {
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
  impl_->data_h_.resize(new_size, v);
}

template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) { return 0; }

template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) { return 0; }

template <typename T>
void HostDeviceVector<T>::Fill(T v) {
  std::fill(HostVector().begin(), HostVector().end(), v);
}

template <typename T>
void HostDeviceVector<T>::Copy(HostDeviceVector<T>* other) {
  CHECK_EQ(Size(), other->Size());
  std::copy(other->HostVector().begin(), other->HostVector().end(), HostVector().begin());
}

template <typename T>
void HostDeviceVector<T>::Copy(const std::vector<T>& other) {
  CHECK_EQ(Size(), other.size());
  std::copy(other.begin(), other.end(), HostVector().begin());
}

template <typename T>
void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
  CHECK_EQ(Size(), other.size());
  std::copy(other.begin(), other.end(), HostVector().begin());
}

template <typename T>
void HostDeviceVector<T>::Reshard(GPUSet devices) { }

// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<unsigned int>;

} // namespace xgboost
@@ -2,122 +2,309 @@
 * Copyright 2017 XGBoost contributors
 */

#include <thrust/fill.h>
#include "./host_device_vector.h"
#include "./device_helpers.cuh"

namespace xgboost {

template <typename T>
struct HostDeviceVectorImpl {
  HostDeviceVectorImpl(size_t size, T v, int device)
    : device_(device), on_d_(device >= 0) {
    if (on_d_) {
  struct DeviceShard {
    DeviceShard() : index_(-1), device_(-1), start_(0), on_d_(false), vec_(nullptr) {}

    static size_t ShardStart(size_t size, int ndevices, int index) {
      size_t portion = dh::DivRoundUp(size, ndevices);
      size_t begin = index * portion;
      begin = begin > size ? size : begin;
      return begin;
    }

    static size_t ShardSize(size_t size, int ndevices, int index) {
      size_t portion = dh::DivRoundUp(size, ndevices);
      size_t begin = index * portion, end = (index + 1) * portion;
      begin = begin > size ? size : begin;
      end = end > size ? size : end;
      return end - begin;
    }
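ShardStart/ShardSize split the host vector into near-equal contiguous blocks, one per device, with the last device taking whatever is left over. A standalone sanity check of that arithmetic (a sketch; DivRoundUp is reimplemented here as an assumption rather than taken from dh::):

#include <cstdio>

static size_t DivRoundUp(size_t a, size_t b) { return (a + b - 1) / b; }

static size_t ShardStart(size_t size, int ndevices, int index) {
  size_t portion = DivRoundUp(size, ndevices);
  size_t begin = index * portion;
  return begin > size ? size : begin;
}

static size_t ShardSize(size_t size, int ndevices, int index) {
  size_t portion = DivRoundUp(size, ndevices);
  size_t begin = index * portion, end = (index + 1) * portion;
  begin = begin > size ? size : begin;
  end = end > size ? size : end;
  return end - begin;
}

int main() {
  // 10 elements over 3 devices: starts 0, 4, 8 and sizes 4, 4, 2
  for (int i = 0; i < 3; ++i) {
    std::printf("shard %d: start=%zu size=%zu\n",
                i, ShardStart(10, 3, i), ShardSize(10, 3, i));
  }
  return 0;
}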
    void Init(HostDeviceVectorImpl<T>* vec, int device) {
      if (vec_ == nullptr) { vec_ = vec; }
      CHECK_EQ(vec, vec_);
      device_ = device;
      index_ = vec_->devices_.Index(device);
      size_t size_h = vec_->Size();
      int ndevices = vec_->devices_.Size();
      start_ = ShardStart(size_h, ndevices, index_);
      size_t size_d = ShardSize(size_h, ndevices, index_);
      dh::safe_cuda(cudaSetDevice(device_));
      data_d_.resize(size, v);
      data_.resize(size_d);
      on_d_ = !vec_->on_h_;
    }

    void ScatterFrom(const T* begin) {
      // TODO(canonizer): avoid full copy of host data
      LazySyncDevice();
      dh::safe_cuda(cudaSetDevice(device_));
      dh::safe_cuda(cudaMemcpy(data_.data().get(), begin + start_,
                               data_.size() * sizeof(T), cudaMemcpyDefault));
    }

    void GatherTo(thrust::device_ptr<T> begin) {
      LazySyncDevice();
      dh::safe_cuda(cudaSetDevice(device_));
      dh::safe_cuda(cudaMemcpy(begin.get() + start_, data_.data().get(),
                               data_.size() * sizeof(T), cudaMemcpyDefault));
    }

    void Fill(T v) {
      // TODO(canonizer): avoid full copy of host data
      LazySyncDevice();
      dh::safe_cuda(cudaSetDevice(device_));
      thrust::fill(data_.begin(), data_.end(), v);
    }

    void Copy(DeviceShard* other) {
      // TODO(canonizer): avoid full copy of host data for this (but not for other)
      LazySyncDevice();
      other->LazySyncDevice();
      dh::safe_cuda(cudaSetDevice(device_));
      dh::safe_cuda(cudaMemcpy(data_.data().get(), other->data_.data().get(),
                               data_.size() * sizeof(T), cudaMemcpyDefault));
    }

    void LazySyncHost() {
      dh::safe_cuda(cudaSetDevice(device_));
      thrust::copy(data_.begin(), data_.end(), vec_->data_h_.begin() + start_);
      on_d_ = false;
    }

    void LazySyncDevice() {
      if (on_d_) { return; }
      // data is on the host
      size_t size_h = vec_->data_h_.size();
      int ndevices = vec_->devices_.Size();
      start_ = ShardStart(size_h, ndevices, index_);
      size_t size_d = ShardSize(size_h, ndevices, index_);
      dh::safe_cuda(cudaSetDevice(device_));
      data_.resize(size_d);
      thrust::copy(vec_->data_h_.begin() + start_,
                   vec_->data_h_.begin() + start_ + size_d, data_.begin());
      on_d_ = true;
      // this may cause a race condition if LazySyncDevice() is called
      // from multiple threads in parallel;
      // however, the race condition is benign, and will not cause problems
      vec_->on_h_ = false;
      vec_->size_d_ = vec_->data_h_.size();
    }

    int index_;
    int device_;
    thrust::device_vector<T> data_;
    size_t start_;
    // true if there is an up-to-date copy of data on device, false otherwise
    bool on_d_;
    HostDeviceVectorImpl<T>* vec_;
  };
  HostDeviceVectorImpl(size_t size, T v, GPUSet devices)
    : devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) {
    if (!devices.IsEmpty()) {
      size_d_ = size;
      InitShards();
      Fill(v);
    } else {
      data_h_.resize(size, v);
    }
  }

  // Init can be std::vector<T> or std::initializer_list<T>
  template <class Init>
  HostDeviceVectorImpl(const Init& init, int device)
    : device_(device), on_d_(device >= 0) {
    if (on_d_) {
      dh::safe_cuda(cudaSetDevice(device_));
      data_d_.resize(init.size());
      thrust::copy(init.begin(), init.end(), data_d_.begin());
  HostDeviceVectorImpl(const Init& init, GPUSet devices)
    : devices_(devices), on_h_(devices.IsEmpty()), size_d_(0) {
    if (!devices.IsEmpty()) {
      size_d_ = init.size();
      InitShards();
      Copy(init);
    } else {
      data_h_ = init;
    }
  }

  void InitShards() {
    int ndevices = devices_.Size();
    shards_.resize(ndevices);
    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
      shard.Init(this, devices_[i]);
    });
  }

  HostDeviceVectorImpl(const HostDeviceVectorImpl<T>&) = delete;
  HostDeviceVectorImpl(HostDeviceVectorImpl<T>&&) = delete;
  void operator=(const HostDeviceVectorImpl<T>&) = delete;
  void operator=(HostDeviceVectorImpl<T>&&) = delete;

  size_t Size() const { return on_d_ ? data_d_.size() : data_h_.size(); }
  size_t Size() const { return on_h_ ? data_h_.size() : size_d_; }

  int DeviceIdx() const { return device_; }
  GPUSet Devices() const { return devices_; }

  T* DevicePointer(int device) {
    CHECK(devices_.Contains(device));
    LazySyncDevice(device);
    return data_d_.data().get();
    return shards_[devices_.Index(device)].data_.data().get();
  }

  size_t DeviceSize(int device) {
    CHECK(devices_.Contains(device));
    LazySyncDevice(device);
    return shards_[devices_.Index(device)].data_.size();
  }

  size_t DeviceStart(int device) {
    CHECK(devices_.Contains(device));
    LazySyncDevice(device);
    return shards_[devices_.Index(device)].start_;
  }

  thrust::device_ptr<T> tbegin(int device) {  // NOLINT
    return thrust::device_ptr<T>(DevicePointer(device));
  }

  thrust::device_ptr<T> tend(int device) {  // NOLINT
    auto begin = tbegin(device);
    return begin + Size();
    return tbegin(device) + DeviceSize(device);
  }

  void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
    CHECK_EQ(end - begin, Size());
    if (on_h_) {
      thrust::copy(begin, end, data_h_.begin());
    } else {
      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
        shard.ScatterFrom(begin.get());
      });
    }
  }

  void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
    CHECK_EQ(end - begin, Size());
    if (on_h_) {
      thrust::copy(data_h_.begin(), data_h_.end(), begin);
    } else {
      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.GatherTo(begin); });
    }
  }

  void Fill(T v) {
    if (on_h_) {
      std::fill(data_h_.begin(), data_h_.end(), v);
    } else {
      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.Fill(v); });
    }
  }

  void Copy(HostDeviceVectorImpl<T>* other) {
    CHECK_EQ(Size(), other->Size());
    if (on_h_ && other->on_h_) {
      std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
    } else {
      CHECK(devices_ == other->devices_);
      dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
        shard.Copy(&other->shards_[i]);
      });
    }
  }

  void Copy(const std::vector<T>& other) {
    CHECK_EQ(Size(), other.size());
    if (on_h_) {
      std::copy(other.begin(), other.end(), data_h_.begin());
    } else {
      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
        shard.ScatterFrom(other.data());
      });
    }
  }

  void Copy(std::initializer_list<T> other) {
    CHECK_EQ(Size(), other.size());
    if (on_h_) {
      std::copy(other.begin(), other.end(), data_h_.begin());
    } else {
      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
        shard.ScatterFrom(other.begin());
      });
    }
  }

  std::vector<T>& HostVector() {
    LazySyncHost();
    return data_h_;
  }
  void Resize(size_t new_size, T v, int new_device) {
    if (new_size == this->Size() && new_device == device_)

  void Reshard(GPUSet new_devices) {
    if (devices_ == new_devices)
      return;
    if (new_device != -1)
      device_ = new_device;
    // if !on_d_, but the data size is 0 and the device is set,
    // resize the data on device instead
    if (!on_d_ && (data_h_.size() > 0 || device_ == -1)) {
      data_h_.resize(new_size, v);
    CHECK(devices_.IsEmpty());
    devices_ = new_devices;
    InitShards();
  }

  void Resize(size_t new_size, T v) {
    if (new_size == Size())
      return;
    if (Size() == 0 && !devices_.IsEmpty()) {
      // fast on-device resize
      on_h_ = false;
      size_d_ = new_size;
      InitShards();
      Fill(v);
    } else {
      dh::safe_cuda(cudaSetDevice(device_));
      data_d_.resize(new_size, v);
      on_d_ = true;
      // resize on host
      LazySyncHost();
      data_h_.resize(new_size, v);
    }
  }

  void LazySyncHost() {
    if (!on_d_)
    if (on_h_)
      return;
    if (data_h_.size() != this->Size())
      data_h_.resize(this->Size());
    dh::safe_cuda(cudaSetDevice(device_));
    thrust::copy(data_d_.begin(), data_d_.end(), data_h_.begin());
    on_d_ = false;
    if (data_h_.size() != size_d_)
      data_h_.resize(size_d_);
    dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.LazySyncHost(); });
    on_h_ = true;
  }

  void LazySyncDevice(int device) {
    if (on_d_)
      return;
    if (device != device_) {
      CHECK_EQ(device_, -1);
      device_ = device;
    }
    if (data_d_.size() != this->Size()) {
      dh::safe_cuda(cudaSetDevice(device_));
      data_d_.resize(this->Size());
    }
    dh::safe_cuda(cudaSetDevice(device_));
    thrust::copy(data_h_.begin(), data_h_.end(), data_d_.begin());
    on_d_ = true;
    CHECK(devices_.Contains(device));
    shards_[devices_.Index(device)].LazySyncDevice();
  }

  std::vector<T> data_h_;
  thrust::device_vector<T> data_d_;
  // true if there is an up-to-date copy of data on device, false otherwise
  bool on_d_;
  int device_;
  bool on_h_;
  // the total size of the data stored on the devices
  size_t size_d_;
  GPUSet devices_;
  std::vector<DeviceShard> shards_;
};
template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device) : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(size, v, device);
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, GPUSet devices)
  : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(size, v, devices);
}

template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, GPUSet devices)
  : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(init, device);
  impl_ = new HostDeviceVectorImpl<T>(init, devices);
}

template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, GPUSet devices)
  : impl_(nullptr) {
  impl_ = new HostDeviceVectorImpl<T>(init, device);
  impl_ = new HostDeviceVectorImpl<T>(init, devices);
}

template <typename T>
@@ -131,11 +318,17 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }

template <typename T>
int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
GPUSet HostDeviceVector<T>::Devices() const { return impl_->Devices(); }

template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) { return impl_->DevicePointer(device); }

template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) { return impl_->DeviceStart(device); }

template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) { return impl_->DeviceSize(device); }

template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) {  // NOLINT
  return impl_->tbegin(device);
@@ -146,16 +339,54 @@ thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) {  // NOLINT
  return impl_->tend(device);
}

template <typename T>
void HostDeviceVector<T>::ScatterFrom
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
  impl_->ScatterFrom(begin, end);
}

template <typename T>
void HostDeviceVector<T>::GatherTo
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
  impl_->GatherTo(begin, end);
}

template <typename T>
void HostDeviceVector<T>::Fill(T v) {
  impl_->Fill(v);
}

template <typename T>
void HostDeviceVector<T>::Copy(HostDeviceVector<T>* other) {
  impl_->Copy(other->impl_);
}

template <typename T>
void HostDeviceVector<T>::Copy(const std::vector<T>& other) {
  impl_->Copy(other);
}

template <typename T>
void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
  impl_->Copy(other);
}

template <typename T>
std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->HostVector(); }

template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size, T v, int new_device) {
  impl_->Resize(new_size, v, new_device);
void HostDeviceVector<T>::Reshard(GPUSet new_devices) {
  impl_->Reshard(new_devices);
}

template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
  impl_->Resize(new_size, v);
}

// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<unsigned int>;

} // namespace xgboost
@@ -4,6 +4,9 @@
#ifndef XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
#define XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_

#include <dmlc/logging.h>

#include <algorithm>
#include <cstdlib>
#include <initializer_list>
#include <vector>
@@ -18,6 +21,40 @@ namespace xgboost {

template <typename T> struct HostDeviceVectorImpl;

// set of devices across which HostDeviceVector can be distributed;
// currently implemented as a range, but can be changed later to something else,
// e.g. a bitset
class GPUSet {
 public:
  explicit GPUSet(int start = 0, int ndevices = 0)
    : start_(start), ndevices_(ndevices) {}
  static GPUSet Empty() { return GPUSet(); }
  static GPUSet Range(int start, int ndevices) { return GPUSet(start, ndevices); }
  int Size() const { return ndevices_; }
  int operator[](int index) const {
    CHECK(index >= 0 && index < ndevices_);
    return start_ + index;
  }
  bool IsEmpty() const { return ndevices_ <= 0; }
  int Index(int device) const {
    CHECK(device >= start_ && device < start_ + ndevices_);
    return device - start_;
  }
  bool Contains(int device) const {
    return start_ <= device && device < start_ + ndevices_;
  }
  friend bool operator==(GPUSet a, GPUSet b) {
    return a.start_ == b.start_ && a.ndevices_ == b.ndevices_;
  }
  friend bool operator!=(GPUSet a, GPUSet b) {
    return a.start_ != b.start_ || a.ndevices_ != b.ndevices_;
  }

 private:
  int start_, ndevices_;
};
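For illustration, a minimal standalone rendition of the GPUSet range semantics (an assumption that mirrors the class above, with the dmlc CHECKs replaced by asserts so it compiles on its own):

#include <cassert>
#include <cstdio>

class GPUSet {
 public:
  explicit GPUSet(int start = 0, int ndevices = 0) : start_(start), ndevices_(ndevices) {}
  static GPUSet Empty() { return GPUSet(); }
  static GPUSet Range(int start, int ndevices) { return GPUSet(start, ndevices); }
  int Size() const { return ndevices_; }
  int operator[](int index) const { assert(index >= 0 && index < ndevices_); return start_ + index; }
  bool IsEmpty() const { return ndevices_ <= 0; }
  bool Contains(int device) const { return start_ <= device && device < start_ + ndevices_; }
  int Index(int device) const { assert(Contains(device)); return device - start_; }
 private:
  int start_, ndevices_;
};

int main() {
  GPUSet devices = GPUSet::Range(2, 3);                   // devices 2, 3, 4
  std::printf("size=%d\n", devices.Size());               // 3
  std::printf("devices[1]=%d\n", devices[1]);             // 3
  std::printf("Index(4)=%d\n", devices.Index(4));         // 2
  std::printf("Contains(5)=%d\n", devices.Contains(5));   // 0
  std::printf("empty=%d\n", GPUSet::Empty().IsEmpty());   // 1
  return 0;
}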
/**
 * @file host_device_vector.h
 * @brief A device-and-host vector abstraction layer.
@@ -29,24 +66,26 @@ template <typename T> struct HostDeviceVectorImpl;
 *
 * Initialization/Allocation:<br/>
 * One can choose to initialize the vector on CPU or GPU during constructor.
 * (use the 'device' argument) Or, can choose to use the 'resize' method to
 * allocate/resize memory explicitly.
 * (use the 'devices' argument) Or, can choose to use the 'Resize' method to
 * allocate/resize memory explicitly, and use the 'Reshard' method
 * to specify the devices.
 *
 * Accessing underling data:<br/>
 * Use 'data_h' method to explicitly query for the underlying std::vector.
 * If you need the raw device pointer, use the 'ptr_d' method. For perf
 * Accessing underlying data:<br/>
 * Use 'HostVector' method to explicitly query for the underlying std::vector.
 * If you need the raw device pointer, use the 'DevicePointer' method. For perf
 * implications of these calls, see below.
 *
 * Accessing underling data and their perf implications:<br/>
 * There are 4 scenarios to be considered here:
 * data_h and data on CPU --> no problems, std::vector returned immediately
 * data_h but data on GPU --> this causes a cudaMemcpy to be issued internally.
 *                        subsequent calls to data_h, will NOT incur this penalty.
 *                        (assuming 'ptr_d' is not called in between)
 * ptr_d but data on CPU --> this causes a cudaMemcpy to be issued internally.
 *                        subsequent calls to ptr_d, will NOT incur this penalty.
 *                        (assuming 'data_h' is not called in between)
 * ptr_d and data on GPU --> no problems, the device ptr will be returned immediately
 * HostVector and data on CPU --> no problems, std::vector returned immediately
 * HostVector but data on GPU --> this causes a cudaMemcpy to be issued internally.
 *                        subsequent calls to HostVector, will NOT incur this penalty.
 *                        (assuming 'DevicePointer' is not called in between)
 * DevicePointer but data on CPU --> this causes a cudaMemcpy to be issued internally.
 *                        subsequent calls to DevicePointer, will NOT incur this penalty.
 *                        (assuming 'HostVector' is not called in between)
 * DevicePointer and data on GPU --> no problems, the device ptr
 *                        will be returned immediately.
 *
 * What if xgboost is compiled without CUDA?<br/>
 * In that case, there's a special implementation which always falls-back to
@@ -57,35 +96,49 @@ template <typename T> struct HostDeviceVectorImpl;
 * compiling with and without CUDA toolkit. It was easier to have
 * 'HostDeviceVector' with a special-case implementation in host_device_vector.cc
 *
 * @note: This is not thread-safe!
 * @note: Size and Devices methods are thread-safe.
 * DevicePointer, DeviceStart, DeviceSize, tbegin and tend methods are thread-safe
 * if different threads call these methods with different values of the device argument.
 * All other methods are not thread safe.
 */
template <typename T>
class HostDeviceVector {
 public:
  explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1);
  HostDeviceVector(std::initializer_list<T> init, int device = -1);
  explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
  explicit HostDeviceVector(size_t size = 0, T v = T(),
                            GPUSet devices = GPUSet::Empty());
  HostDeviceVector(std::initializer_list<T> init, GPUSet devices = GPUSet::Empty());
  explicit HostDeviceVector(const std::vector<T>& init,
                            GPUSet devices = GPUSet::Empty());
  ~HostDeviceVector();
  HostDeviceVector(const HostDeviceVector<T>&) = delete;
  HostDeviceVector(HostDeviceVector<T>&&) = delete;
  void operator=(const HostDeviceVector<T>&) = delete;
  void operator=(HostDeviceVector<T>&&) = delete;
  size_t Size() const;
  int DeviceIdx() const;
  GPUSet Devices() const;
  T* DevicePointer(int device);

  T* HostPointer() { return HostVector().data(); }
  size_t DeviceStart(int device);
  size_t DeviceSize(int device);

  // only define functions returning device_ptr
  // if HostDeviceVector.h is included from a .cu file
#ifdef __CUDACC__
  thrust::device_ptr<T> tbegin(int device);
  thrust::device_ptr<T> tend(int device);
  thrust::device_ptr<T> tbegin(int device);  // NOLINT
  thrust::device_ptr<T> tend(int device);  // NOLINT
  void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end);
  void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end);
#endif

  std::vector<T>& HostVector();
  void Fill(T v);
  void Copy(HostDeviceVector<T>* other);
  void Copy(const std::vector<T>& other);
  void Copy(std::initializer_list<T> other);

  // passing in new_device == -1 keeps the device as is
  void Resize(size_t new_size, T v = T(), int new_device = -1);
  std::vector<T>& HostVector();
  void Reshard(GPUSet devices);
  void Resize(size_t new_size, T v = T());

 private:
  HostDeviceVectorImpl<T>* impl_;
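The lazy-sync rules described in the header comment above can be summarized in a short hedged sketch (assuming a .cu translation unit inside the xgboost tree with this header on the include path; the include path itself is an assumption):

#include <vector>
#include "host_device_vector.h"  // assumed include path

void AccessPattern(xgboost::HostDeviceVector<float>* vec, int device) {
  std::vector<float>& h = vec->HostVector();  // data on host: returned immediately
  h[0] = 1.0f;

  float* d0 = vec->DevicePointer(device);  // host -> device copy happens here, once
  float* d1 = vec->DevicePointer(device);  // no copy: the device shard is already current
  (void)d0; (void)d1;

  vec->HostVector();  // device -> host copy happens here, since DevicePointer was used
}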
@@ -195,7 +195,7 @@ class GBTree : public GradientBooster {
        << "must have exactly ngroup*nrow gpairs";
    // TODO(canonizer): perform this on GPU if HostDeviceVector has device set.
    HostDeviceVector<GradientPair> tmp(in_gpair->Size() / ngroup,
                                       GradientPair(), in_gpair->DeviceIdx());
                                       GradientPair(), in_gpair->Devices());
    std::vector<GradientPair>& gpair_h = in_gpair->HostVector();
    auto nsize = static_cast<bst_omp_uint>(tmp.Size());
    for (int gid = 0; gid < ngroup; ++gid) {
@@ -74,46 +74,35 @@ __global__ void pred_transform_k(float * __restrict__ preds, int n) {
template<typename Loss>
class GPURegLossObj : public ObjFunction {
 protected:
  // manages device data
  struct DeviceData {
    DVec<float> labels, weights;
    DVec<unsigned int> label_correct;

    // allocate everything on device
    DeviceData(dh::BulkAllocator<dh::MemoryType::kDevice>* ba, int device_idx, size_t n) {
      ba->Allocate(device_idx, false,
                   &labels, n,
                   &weights, n,
                   &label_correct, 1);
    }
    size_t Size() const { return labels.Size(); }
  };

  bool copied_;
  std::unique_ptr<dh::BulkAllocator<dh::MemoryType::kDevice>> ba_;
  std::unique_ptr<DeviceData> data_;
  HostDeviceVector<bst_float> preds_d_;
  HostDeviceVector<GradientPair> out_gpair_d_;
  HostDeviceVector<bst_float> labels_, weights_;
  HostDeviceVector<unsigned int> label_correct_;

  // allocate device data for n elements, do nothing if enough memory is allocated already
  void LazyResize(int n) {
    if (data_.get() != nullptr && data_->Size() >= n)
  // allocate device data for n elements, do nothing if memory is allocated already
  void LazyResize(size_t n, size_t n_weights) {
    if (labels_.Size() == n && weights_.Size() == n_weights)
      return;
    copied_ = false;
    // free the old data and allocate the new data
    ba_.reset(new dh::BulkAllocator<dh::MemoryType::kDevice>());
    data_.reset(new DeviceData(ba_.get(), 0, n));
    preds_d_.Resize(n, 0.0f, param_.gpu_id);
    out_gpair_d_.Resize(n, GradientPair(), param_.gpu_id);

    labels_.Reshard(devices_);
    weights_.Reshard(devices_);
    label_correct_.Reshard(devices_);

    if (labels_.Size() != n) {
      labels_.Resize(n);
      label_correct_.Resize(devices_.Size());
    }
    if (weights_.Size() != n_weights)
      weights_.Resize(n_weights);
  }

 public:
  GPURegLossObj() : copied_(false), preds_d_(0, -1), out_gpair_d_({}, -1) {}
  GPURegLossObj() : copied_(false) {}

  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
    param_.InitAllowUnknown(args);
    CHECK(param_.n_gpus != 0) << "Must have at least one device";
    devices_ = GPUSet::Range(param_.gpu_id, dh::NDevicesAll(param_.n_gpus));
  }

  void GetGradient(HostDeviceVector<float>* preds,
@@ -125,45 +114,49 @@ class GPURegLossObj : public ObjFunction {
        << "labels are not correctly provided"
        << "preds.size=" << preds->Size() << ", label.size=" << info.labels_.size();
    size_t ndata = preds->Size();
    out_gpair->Resize(ndata, GradientPair(), param_.gpu_id);
    LazyResize(ndata);
    GetGradientDevice(preds->DevicePointer(param_.gpu_id), info, iter,
                      out_gpair->DevicePointer(param_.gpu_id), ndata);
    preds->Reshard(devices_);
    out_gpair->Reshard(devices_);
    out_gpair->Resize(ndata);
    LazyResize(ndata, info.weights_.size());
    GetGradientDevice(preds, info, iter, out_gpair);
  }

 private:
  void GetGradientDevice(float* preds,
  void GetGradientDevice(HostDeviceVector<float>* preds,
                         const MetaInfo &info,
                         int iter,
                         GradientPair* out_gpair, size_t n) {
    dh::safe_cuda(cudaSetDevice(param_.gpu_id));
    DeviceData& d = *data_;
    d.label_correct.Fill(1);
                         HostDeviceVector<GradientPair>* out_gpair) {
    label_correct_.Fill(1);
    // only copy the labels and weights once, similar to how the data is copied
    if (!copied_) {
      thrust::copy(info.labels_.begin(), info.labels_.begin() + n,
                   d.labels.tbegin());
      if (info.weights_.size() > 0) {
        thrust::copy(info.weights_.begin(), info.weights_.begin() + n,
                     d.weights.tbegin());
      }
      labels_.Copy(info.labels_);
      if (info.weights_.size() > 0)
        weights_.Copy(info.weights_);
      copied_ = true;
    }

    // run the kernel
#pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < devices_.Size(); ++i) {
      int d = devices_[i];
      dh::safe_cuda(cudaSetDevice(d));
      const int block = 256;
      size_t n = preds->DeviceSize(d);
      if (n > 0) {
        get_gradient_k<Loss><<<dh::DivRoundUp(n, block), block>>>
          (out_gpair, d.label_correct.Data(), preds,
           d.labels.Data(), info.weights_.size() > 0 ? d.weights.Data() : nullptr,
          (out_gpair->DevicePointer(d), label_correct_.DevicePointer(d),
           preds->DevicePointer(d), labels_.DevicePointer(d),
           info.weights_.size() > 0 ? weights_.DevicePointer(d) : nullptr,
           n, param_.scale_pos_weight);
        dh::safe_cuda(cudaGetLastError());
      }
      dh::safe_cuda(cudaDeviceSynchronize());
    }

    // copy output data from the GPU
    unsigned int label_correct_h;
    thrust::copy_n(d.label_correct.tbegin(), 1, &label_correct_h);

    bool label_correct = label_correct_h != 0;
    if (!label_correct) {
    // copy "label correct" flags back to host
    std::vector<unsigned int>& label_correct_h = label_correct_.HostVector();
    for (int i = 0; i < devices_.Size(); ++i) {
      if (label_correct_h[i] == 0)
        LOG(FATAL) << Loss::LabelErrorMsg();
    }
  }
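The multi-GPU launch loop above follows a simple pattern: select each device in the GPUSet, launch the kernel over that device's shard only, then synchronize per device. A standalone CUDA sketch of the same pattern, with raw allocations standing in for HostDeviceVector shards (device count, sizes and kernel are assumptions for illustration):

#include <cuda_runtime.h>
#include <algorithm>
#include <cstdio>
#include <vector>

__global__ void scale_k(float* data, size_t n, float s) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= s;
}

int main() {
  int ndevices = 0;
  cudaGetDeviceCount(&ndevices);
  if (ndevices == 0) { std::printf("no CUDA devices\n"); return 0; }

  size_t total = 1 << 20;
  size_t shard = (total + ndevices - 1) / ndevices;  // DivRoundUp-style split
  std::vector<float*> bufs(ndevices, nullptr);

  for (int d = 0; d < ndevices; ++d) {
    size_t begin = d * shard;
    size_t n = begin < total ? std::min(shard, total - begin) : 0;
    if (n == 0) continue;
    cudaSetDevice(d);                                // select the shard's device
    cudaMalloc(&bufs[d], n * sizeof(float));
    const int block = 256;
    unsigned int grid = static_cast<unsigned int>((n + block - 1) / block);
    scale_k<<<grid, block>>>(bufs[d], n, 0.5f);      // kernel over this shard only
    cudaGetLastError();
  }
  for (int d = 0; d < ndevices; ++d) {
    if (!bufs[d]) continue;
    cudaSetDevice(d);
    cudaDeviceSynchronize();                         // wait for that device's kernel
    cudaFree(bufs[d]);
  }
  std::printf("launched on %d device(s)\n", ndevices);
  return 0;
}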
@@ -174,17 +167,25 @@ class GPURegLossObj : public ObjFunction {
  }

  void PredTransform(HostDeviceVector<float> *io_preds) override {
    PredTransformDevice(io_preds->DevicePointer(param_.gpu_id), io_preds->Size());
    io_preds->Reshard(devices_);
    size_t ndata = io_preds->Size();
    PredTransformDevice(io_preds);
  }

  void PredTransformDevice(float* preds, size_t n) {
    dh::safe_cuda(cudaSetDevice(param_.gpu_id));
  void PredTransformDevice(HostDeviceVector<float>* preds) {
#pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < devices_.Size(); ++i) {
      int d = devices_[i];
      dh::safe_cuda(cudaSetDevice(d));
      const int block = 256;
      pred_transform_k<Loss><<<dh::DivRoundUp(n, block), block>>>(preds, n);
      size_t n = preds->DeviceSize(d);
      if (n > 0) {
        pred_transform_k<Loss><<<dh::DivRoundUp(n, block), block>>>(preds->DevicePointer(d), n);
        dh::safe_cuda(cudaGetLastError());
      }
      dh::safe_cuda(cudaDeviceSynchronize());
    }
  }

  float ProbToMargin(float base_score) const override {
    return Loss::ProbToMargin(base_score);
@@ -192,6 +193,7 @@ class GPURegLossObj : public ObjFunction {

 protected:
  GPURegLossParam param_;
  GPUSet devices_;
};

// register the objective functions
@@ -310,8 +310,11 @@ class GPUPredictor : public xgboost::Predictor {
                 tree_group.begin());

    device_matrix->predictions.resize(out_preds->Size());
    thrust::copy(out_preds->tbegin(param.gpu_id), out_preds->tend(param.gpu_id),
                 device_matrix->predictions.begin());
    auto& predictions = device_matrix->predictions;
    out_preds->GatherTo(predictions.data(),
                        predictions.data() + predictions.size());

    dh::safe_cuda(cudaSetDevice(param.gpu_id));

    const int BLOCK_THREADS = 128;
    const int GRID_SIZE = static_cast<int>(
@@ -335,9 +338,8 @@ class GPUPredictor : public xgboost::Predictor {
        model.param.num_output_group);

    dh::safe_cuda(cudaDeviceSynchronize());
    thrust::copy(device_matrix->predictions.begin(),
                 device_matrix->predictions.end(),
                 out_preds->tbegin(param.gpu_id));
    out_preds->ScatterFrom(predictions.data(),
                           predictions.data() + predictions.size());
  }

 public:
@@ -366,14 +368,13 @@ class GPUPredictor : public xgboost::Predictor {
                          const gbm::GBTreeModel& model) const {
    size_t n = model.param.num_output_group * info.num_row_;
    const std::vector<bst_float>& base_margin = info.base_margin_;
    out_preds->Resize(n, 0.0f, param.gpu_id);
    out_preds->Reshard(devices);
    out_preds->Resize(n);
    if (base_margin.size() != 0) {
      CHECK_EQ(out_preds->Size(), n);
      thrust::copy(base_margin.begin(), base_margin.end(),
                   out_preds->tbegin(param.gpu_id));
      out_preds->Copy(base_margin);
    } else {
      thrust::fill(out_preds->tbegin(param.gpu_id),
                   out_preds->tend(param.gpu_id), model.base_margin);
      out_preds->Fill(model.base_margin);
    }
  }

@@ -385,11 +386,9 @@ class GPUPredictor : public xgboost::Predictor {
    if (it != cache_.end()) {
      HostDeviceVector<bst_float>& y = it->second.predictions;
      if (y.Size() != 0) {
        dh::safe_cuda(cudaSetDevice(param.gpu_id));
        out_preds->Resize(y.Size(), 0.0f, param.gpu_id);
        dh::safe_cuda(cudaMemcpy(
            out_preds->DevicePointer(param.gpu_id), y.DevicePointer(param.gpu_id),
            out_preds->Size() * sizeof(bst_float), cudaMemcpyDefault));
        out_preds->Reshard(devices);
        out_preds->Resize(y.Size());
        out_preds->Copy(&y);
        return true;
      }
    }
@@ -410,18 +409,15 @@ class GPUPredictor : public xgboost::Predictor {
    HostDeviceVector<bst_float>& predictions = e.predictions;

    if (predictions.Size() == 0) {
      // ensure that the device in predictions is correct
      predictions.Resize(0, 0.0f, param.gpu_id);
      cpu_predictor->PredictBatch(dmat, &predictions, model, 0,
                                  static_cast<bst_uint>(model.trees.size()));
    } else if (model.param.num_output_group == 1 && updaters->size() > 0 &&
      this->InitOutPredictions(dmat->Info(), &predictions, model);
    }

    if (model.param.num_output_group == 1 && updaters->size() > 0 &&
        num_new_trees == 1 &&
        updaters->back()->UpdatePredictionCache(e.data.get(),
                                                &predictions)) {
        updaters->back()->UpdatePredictionCache(e.data.get(), &predictions)) {
      // do nothing
    } else {
      DevicePredictInternal(dmat, &predictions, model, old_ntree,
                            model.trees.size());
      DevicePredictInternal(dmat, &predictions, model, old_ntree, model.trees.size());
    }
  }
}
@@ -462,6 +458,7 @@ class GPUPredictor : public xgboost::Predictor {
    Predictor::Init(cfg, cache);
    cpu_predictor->Init(cfg, cache);
    param.InitAllowUnknown(cfg);
    devices = GPUSet::Range(param.gpu_id, dh::NDevicesAll(param.n_gpus));
    max_shared_memory_bytes = dh::MaxSharedMemory(param.gpu_id);
  }

@@ -473,6 +470,8 @@ class GPUPredictor : public xgboost::Predictor {
  thrust::device_vector<DevicePredictionNode> nodes;
  thrust::device_vector<size_t> tree_segments;
  thrust::device_vector<int> tree_group;
  thrust::device_vector<bst_float> preds;
  GPUSet devices;
  size_t max_shared_memory_bytes;
};
XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor")
@@ -495,6 +495,11 @@ class GPUMaker : public TreeUpdater {
  int nCols;
  int maxNodes;
  int maxLeaves;

  // devices are only used for resharding the HostDeviceVector passed as a parameter;
  // the algorithm works with a single GPU only
  GPUSet devices;

  dh::CubMemory tmp_mem;
  dh::DVec<GradientPair> tmpScanGradBuff;
  dh::DVec<int> tmpScanKeyBuff;
@@ -510,6 +515,8 @@ class GPUMaker : public TreeUpdater {
    param.InitAllowUnknown(args);
    maxNodes = (1 << (param.max_depth + 1)) - 1;
    maxLeaves = 1 << param.max_depth;

    devices = GPUSet::Range(param.gpu_id, dh::NDevicesAll(param.n_gpus));
  }

  void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
@@ -519,6 +526,8 @@ class GPUMaker : public TreeUpdater {
    float lr = param.learning_rate;
    param.learning_rate = lr / trees.size();

    gpair->Reshard(devices);

    try {
      // build tree
      for (size_t i = 0; i < trees.size(); ++i) {
@@ -688,10 +697,7 @@ class GPUMaker : public TreeUpdater {
  }

  void transferGrads(HostDeviceVector<GradientPair>* gpair) {
    // HACK
    dh::safe_cuda(cudaMemcpy(gradsInst.Data(), gpair->DevicePointer(param.gpu_id),
                             sizeof(GradientPair) * nRows,
                             cudaMemcpyDefault));
    gpair->GatherTo(gradsInst.tbegin(), gradsInst.tend());
    // evaluate the full-grad reduction for the root node
    dh::SumReduction<GradientPair>(tmp_mem, gradsInst, gradSums, nRows);
  }
@@ -369,8 +369,7 @@ struct DeviceShard {
  }

  // Reset values for each update iteration
  void Reset(HostDeviceVector<GradientPair>* dh_gpair, int device) {
    auto begin = dh_gpair->tbegin(device);
  void Reset(HostDeviceVector<GradientPair>* dh_gpair) {
    dh::safe_cuda(cudaSetDevice(device_idx));
    position.CurrentDVec().Fill(0);
    std::fill(node_sum_gradients.begin(), node_sum_gradients.end(),
@@ -380,7 +379,7 @@ struct DeviceShard {

    std::fill(ridx_segments.begin(), ridx_segments.end(), Segment(0, 0));
    ridx_segments.front() = Segment(0, ridx.Size());
    this->gpair.copy(begin + row_begin_idx, begin + row_end_idx);
    this->gpair.copy(dh_gpair->tbegin(device_idx), dh_gpair->tend(device_idx));
    SubsampleGradientPair(&gpair, param.subsample, row_begin_idx);
    hist.Reset();
  }
@@ -505,7 +504,7 @@ struct DeviceShard {
    dh::safe_cuda(cudaSetDevice(device_idx));
    if (!prediction_cache_initialised) {
      dh::safe_cuda(cudaMemcpy(
          prediction_cache.Data(), &out_preds_d[row_begin_idx],
          prediction_cache.Data(), out_preds_d,
          prediction_cache.Size() * sizeof(bst_float), cudaMemcpyDefault));
    }
    prediction_cache_initialised = true;
@@ -528,7 +527,7 @@ struct DeviceShard {
    });

    dh::safe_cuda(cudaMemcpy(
        &out_preds_d[row_begin_idx], prediction_cache.Data(),
        out_preds_d, prediction_cache.Data(),
        prediction_cache.Size() * sizeof(bst_float), cudaMemcpyDefault));
  }
};
@@ -543,6 +542,7 @@ class GPUHistMaker : public TreeUpdater {
    param_.InitAllowUnknown(args);
    CHECK(param_.n_gpus != 0) << "Must have at least one device";
    n_devices_ = param_.n_gpus;
    devices_ = GPUSet::Range(param_.gpu_id, dh::NDevicesAll(param_.n_gpus));

    dh::CheckComputeCapability();

@@ -610,15 +610,11 @@ class GPUHistMaker : public TreeUpdater {
    }

    // Create device shards
    omp_set_num_threads(shards_.size());
#pragma omp parallel
    {
      auto cpu_thread_id = omp_get_thread_num();
      shards_[cpu_thread_id] = std::unique_ptr<DeviceShard>(
          new DeviceShard(device_list_[cpu_thread_id], cpu_thread_id, gmat_,
                          row_segments[cpu_thread_id],
                          row_segments[cpu_thread_id + 1], n_bins_, param_));
    }
    dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
      shard = std::unique_ptr<DeviceShard>(
          new DeviceShard(device_list_[i], i, gmat_,
                          row_segments[i], row_segments[i + 1], n_bins_, param_));
    });

    p_last_fmat_ = dmat;
    initialised_ = true;
@@ -636,12 +632,9 @@ class GPUHistMaker : public TreeUpdater {

    // Copy gpair & reset memory
    monitor_.Start("InitDataReset", device_list_);
    omp_set_num_threads(shards_.size());

    // TODO(canonizer): make it parallel again once HostDeviceVector is
    // thread-safe
    for (int shard = 0; shard < shards_.size(); ++shard)
      shards_[shard]->Reset(gpair, param_.gpu_id);
    gpair->Reshard(devices_);
    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {shard->Reset(gpair); });
    monitor_.Stop("InitDataReset", device_list_);
  }

@@ -676,16 +669,16 @@ class GPUHistMaker : public TreeUpdater {
      subtraction_trick_nidx = nidx_left;
    }

    for (auto& shard : shards_) {
    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
      shard->BuildHist(build_hist_nidx);
    }
    });

    this->AllReduceHist(build_hist_nidx);

    for (auto& shard : shards_) {
    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
      shard->SubtractionTrick(nidx_parent, build_hist_nidx,
                              subtraction_trick_nidx);
    }
    });
  }

  // Returns best loss
@@ -743,22 +736,20 @@ class GPUHistMaker : public TreeUpdater {
    auto root_nidx = 0;
    // Sum gradients
    std::vector<GradientPair> tmp_sums(shards_.size());
    omp_set_num_threads(shards_.size());
#pragma omp parallel
    {
      auto cpu_thread_id = omp_get_thread_num();
      auto& shard = shards_[cpu_thread_id];

    dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
      dh::safe_cuda(cudaSetDevice(shard->device_idx));
      tmp_sums[cpu_thread_id] = dh::SumReduction(
          shard->temp_memory, shard->gpair.Data(), shard->gpair.Size());
    }
      tmp_sums[i] =
          dh::SumReduction(shard->temp_memory, shard->gpair.Data(),
                           shard->gpair.Size());
    });
    auto sum_gradient =
        std::accumulate(tmp_sums.begin(), tmp_sums.end(), GradientPair());

    // Generate root histogram
    for (auto& shard : shards_) {
    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
      shard->BuildHist(root_nidx);
    }
    });

    this->AllReduceHist(root_nidx);

@@ -802,14 +793,11 @@ class GPUHistMaker : public TreeUpdater {

    auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;

    omp_set_num_threads(shards_.size());
#pragma omp parallel
    {
      auto cpu_thread_id = omp_get_thread_num();
      shards_[cpu_thread_id]->UpdatePosition(nidx, left_nidx, right_nidx, fidx,
    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
      shard->UpdatePosition(nidx, left_nidx, right_nidx, fidx,
                            split_gidx, default_dir_left,
                            is_dense, fidx_begin, fidx_end);
    }
    });
  }

  void ApplySplit(const ExpandEntry& candidate, RegTree* p_tree) {
@@ -903,8 +891,6 @@ class GPUHistMaker : public TreeUpdater {
        monitor_.Stop("EvaluateSplits", device_list_);
      }
    }
    // Reset omp num threads
    omp_set_num_threads(nthread);
  }

  bool UpdatePredictionCache(
@@ -912,13 +898,10 @@ class GPUHistMaker : public TreeUpdater {
    monitor_.Start("UpdatePredictionCache", device_list_);
    if (shards_.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data)
      return false;

    bst_float* out_preds_d = p_out_preds->DevicePointer(param_.gpu_id);

#pragma omp parallel for schedule(static, 1)
    for (int shard = 0; shard < shards_.size(); ++shard) {
      shards_[shard]->UpdatePredictionCache(out_preds_d);
    }
    p_out_preds->Reshard(devices_);
    dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
      shard->UpdatePredictionCache(p_out_preds->DevicePointer(shard->device_idx));
    });
    monitor_.Stop("UpdatePredictionCache", device_list_);
    return true;
  }
@@ -992,6 +975,7 @@ class GPUHistMaker : public TreeUpdater {
  std::vector<int> device_list_;

  DMatrix* p_last_fmat_;
  GPUSet devices_;
};

XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")