Make HostDeviceVector single gpu only (#4773)

* Make HostDeviceVector single gpu only
Authored by Rong Ou on 2019-08-25 14:51:13 -07:00; committed by Rory Mitchell
parent 41227d1933
commit 38ab79f889
54 changed files with 641 additions and 1621 deletions

View File

@ -36,13 +36,12 @@ int main(int argc, char** argv) {
// https://xgboost.readthedocs.io/en/latest/parameter.html // https://xgboost.readthedocs.io/en/latest/parameter.html
safe_xgboost(XGBoosterSetParam(booster, "tree_method", use_gpu ? "gpu_hist" : "hist")); safe_xgboost(XGBoosterSetParam(booster, "tree_method", use_gpu ? "gpu_hist" : "hist"));
if (use_gpu) { if (use_gpu) {
// set the number of GPUs and the first GPU to use; // set the GPU to use;
// this is not necessary, but provided here as an illustration // this is not necessary, but provided here as an illustration
safe_xgboost(XGBoosterSetParam(booster, "n_gpus", "1"));
safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "0")); safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "0"));
} else { } else {
// avoid evaluating objective and metric on a GPU // avoid evaluating objective and metric on a GPU
safe_xgboost(XGBoosterSetParam(booster, "n_gpus", "0")); safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "-1"));
} }
safe_xgboost(XGBoosterSetParam(booster, "objective", "binary:logistic")); safe_xgboost(XGBoosterSetParam(booster, "objective", "binary:logistic"));

View File

@ -19,10 +19,8 @@ struct GenericParameter : public dmlc::Parameter<GenericParameter> {
// number of threads to use if OpenMP is enabled // number of threads to use if OpenMP is enabled
// if equals 0, use system default // if equals 0, use system default
int nthread; int nthread;
// primary device. // primary device, -1 means no gpu.
int gpu_id; int gpu_id;
// number of devices to use, -1 implies using all available devices.
int n_gpus;
// declare parameters // declare parameters
DMLC_DECLARE_PARAMETER(GenericParameter) { DMLC_DECLARE_PARAMETER(GenericParameter) {
DMLC_DECLARE_FIELD(seed).set_default(0).describe( DMLC_DECLARE_FIELD(seed).set_default(0).describe(
@ -36,15 +34,20 @@ struct GenericParameter : public dmlc::Parameter<GenericParameter> {
DMLC_DECLARE_FIELD(nthread).set_default(0).describe( DMLC_DECLARE_FIELD(nthread).set_default(0).describe(
"Number of threads to use."); "Number of threads to use.");
DMLC_DECLARE_FIELD(gpu_id) DMLC_DECLARE_FIELD(gpu_id)
.set_default(0) .set_default(-1)
.set_lower_bound(-1)
.describe("The primary GPU device ordinal."); .describe("The primary GPU device ordinal.");
DMLC_DECLARE_FIELD(n_gpus) DMLC_DECLARE_FIELD(n_gpus)
.set_default(0) .set_default(0)
.set_range(0, 1) .set_range(0, 0)
.describe("Deprecated. Single process multi-GPU training is no longer supported. " .describe("Deprecated. Single process multi-GPU training is no longer supported. "
"Please switch to distributed training with one process per GPU. " "Please switch to distributed training with one process per GPU. "
"This can be done using Dask or Spark."); "This can be done using Dask or Spark.");
} }
private:
// number of devices to use (deprecated).
int n_gpus;
}; };
} // namespace xgboost } // namespace xgboost
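
With -1 now the documented "no GPU" value, device selection reduces to a single integer check. A minimal sketch of the consuming side; the UseGpu helper is an assumption, as is deriving gpu_id from a local rank in the one-process-per-GPU setup the deprecation message points to.

    #include <xgboost/logging.h>  // CHECK_* macros via dmlc-core

    // Hypothetical helper: decide whether work runs on a GPU under the new rule.
    inline bool UseGpu(int gpu_id, int n_visible_gpus) {
      if (gpu_id < 0) {
        return false;  // -1: everything stays on the CPU
      }
      CHECK_LT(gpu_id, n_visible_gpus) << "gpu_id must be a visible CUDA ordinal.";
      return true;
    }

    // In a one-process-per-GPU job each worker would typically pass its own
    // ordinal, e.g. gpu_id = local_rank (an assumption, not shown in this diff).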

View File

@ -60,8 +60,8 @@ class MyLogistic : public ObjFunction {
void PredTransform(HostDeviceVector<bst_float> *io_preds) override { void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
// transform margin value to probability. // transform margin value to probability.
std::vector<bst_float> &preds = io_preds->HostVector(); std::vector<bst_float> &preds = io_preds->HostVector();
for (size_t i = 0; i < preds.size(); ++i) { for (auto& pred : preds) {
preds[i] = 1.0f / (1.0f + std::exp(-preds[i])); pred = 1.0f / (1.0f + std::exp(-pred));
} }
} }
bst_float ProbToMargin(bst_float base_score) const override { bst_float ProbToMargin(bst_float base_score) const override {

View File

@ -22,48 +22,12 @@ using RandomThreadLocalStore = dmlc::ThreadLocalStore<RandomThreadLocalEntry>;
GlobalRandomEngine& GlobalRandom() { GlobalRandomEngine& GlobalRandom() {
return RandomThreadLocalStore::Get()->engine; return RandomThreadLocalStore::Get()->engine;
} }
} // namespace common
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA)
int AllVisibleImpl::AllVisible() { int AllVisibleGPUs() {
return 0; return 0;
} }
#endif // !defined(XGBOOST_USE_CUDA) #endif // !defined(XGBOOST_USE_CUDA)
constexpr GPUSet::GpuIdType GPUSet::kAll; } // namespace common
GPUSet GPUSet::All(GpuIdType gpu_id, GpuIdType n_gpus, int32_t n_rows) {
CHECK_GE(gpu_id, 0) << "gpu_id must be >= 0.";
CHECK_GE(n_gpus, -1) << "n_gpus must be >= -1.";
GpuIdType const n_devices_visible = AllVisible().Size();
CHECK_LE(n_gpus, n_devices_visible);
if (n_devices_visible == 0 || n_gpus == 0 || n_rows == 0) {
LOG(DEBUG) << "Runing on CPU.";
return Empty();
}
GpuIdType const n_available_devices = n_devices_visible - gpu_id;
if (n_gpus == kAll) { // Use all devices starting from `gpu_id'.
CHECK(gpu_id < n_devices_visible)
<< "\ngpu_id should be less than number of visible devices.\ngpu_id: "
<< gpu_id
<< ", number of visible devices: "
<< n_devices_visible;
GpuIdType n_devices =
n_available_devices < n_rows ? n_available_devices : n_rows;
LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
return Range(gpu_id, n_devices);
} else { // Use devices in ( gpu_id, gpu_id + n_gpus ).
CHECK_LE(n_gpus, n_available_devices)
<< "Starting from gpu id: " << gpu_id << ", there are only "
<< n_available_devices << " available devices, while n_gpus is set to: "
<< n_gpus;
GpuIdType n_devices = n_gpus < n_rows ? n_gpus : n_rows;
LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
return Range(gpu_id, n_devices);
}
}
} // namespace xgboost } // namespace xgboost

View File

@ -4,8 +4,9 @@
#include "common.h" #include "common.h"
namespace xgboost { namespace xgboost {
namespace common {
int AllVisibleImpl::AllVisible() { int AllVisibleGPUs() {
int n_visgpus = 0; int n_visgpus = 0;
try { try {
// When compiled with CUDA but running on CPU only device, // When compiled with CUDA but running on CPU only device,
@ -17,4 +18,5 @@ int AllVisibleImpl::AllVisible() {
return n_visgpus; return n_visgpus;
} }
} // namespace common
} // namespace xgboost } // namespace xgboost

View File

@ -140,88 +140,8 @@ class Range {
Iterator begin_; Iterator begin_;
Iterator end_; Iterator end_;
}; };
int AllVisibleGPUs();
} // namespace common } // namespace common
struct AllVisibleImpl {
static int AllVisible();
};
/* \brief set of devices across which HostDeviceVector can be distributed.
*
* Currently implemented as a range, but can be changed later to something else,
* e.g. a bitset
*/
class GPUSet {
public:
using GpuIdType = int;
static constexpr GpuIdType kAll = -1;
explicit GPUSet(int start = 0, int ndevices = 0)
: devices_(start, start + ndevices) {}
static GPUSet Empty() { return GPUSet(); }
static GPUSet Range(GpuIdType start, GpuIdType n_gpus) {
return n_gpus <= 0 ? Empty() : GPUSet{start, n_gpus};
}
/*! \brief n_gpus and num_rows both are upper bounds. */
static GPUSet All(GpuIdType gpu_id, GpuIdType n_gpus,
GpuIdType num_rows = std::numeric_limits<GpuIdType>::max());
static GPUSet AllVisible() {
GpuIdType n = AllVisibleImpl::AllVisible();
return Range(0, n);
}
size_t Size() const {
GpuIdType size = *devices_.end() - *devices_.begin();
GpuIdType res = size < 0 ? 0 : size;
return static_cast<size_t>(res);
}
/*
* By default, we have two configurations of identifying device, one
* is the device id obtained from `cudaGetDevice'. But we sometimes
* store objects that allocated one for each device in a list, which
* requires a zero-based index.
*
* Hence, `DeviceId' converts a zero-based index to actual device id,
* `Index' converts a device id to a zero-based index.
*/
GpuIdType DeviceId(size_t index) const {
GpuIdType result = *devices_.begin() + static_cast<GpuIdType>(index);
CHECK(Contains(result)) << "\nDevice " << result << " is not in GPUSet."
<< "\nIndex: " << index
<< "\nGPUSet: (" << *begin() << ", " << *end() << ")"
<< std::endl;
return result;
}
size_t Index(GpuIdType device) const {
CHECK(Contains(device)) << "\nDevice " << device << " is not in GPUSet."
<< "\nGPUSet: (" << *begin() << ", " << *end() << ")"
<< std::endl;
size_t result = static_cast<size_t>(device - *devices_.begin());
return result;
}
bool IsEmpty() const { return Size() == 0; }
bool Contains(GpuIdType device) const {
return *devices_.begin() <= device && device < *devices_.end();
}
common::Range::Iterator begin() const { return devices_.begin(); } // NOLINT
common::Range::Iterator end() const { return devices_.end(); } // NOLINT
friend bool operator==(const GPUSet& lhs, const GPUSet& rhs) {
return lhs.devices_ == rhs.devices_;
}
friend bool operator!=(const GPUSet& lhs, const GPUSet& rhs) {
return !(lhs == rhs);
}
private:
common::Range devices_;
};
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_COMMON_COMMON_H_ #endif // XGBOOST_COMMON_COMMON_H_
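
The device-set abstraction is gone; what remains is a plain count. A sketch of the replacement idiom for code that used to iterate a GPUSet, assuming common.h is included; the VisibleOrdinals helper is made up.

    #include <vector>

    // Enumerate every visible ordinal with the new free function instead of
    // building a GPUSet range; an empty result means CPU-only execution.
    inline std::vector<int> VisibleOrdinals() {
      std::vector<int> out;
      for (int d = 0; d < xgboost::common::AllVisibleGPUs(); ++d) {
        out.push_back(d);
      }
      return out;
    }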

View File

@ -72,22 +72,6 @@ const T *Raw(const thrust::device_vector<T> &v) { // NOLINT
return raw_pointer_cast(v.data()); return raw_pointer_cast(v.data());
} }
// if n_devices=-1, then use all visible devices
inline void SynchronizeNDevices(xgboost::GPUSet devices) {
devices = devices.IsEmpty() ? xgboost::GPUSet::AllVisible() : devices;
for (auto const d : devices) {
safe_cuda(cudaSetDevice(d));
safe_cuda(cudaDeviceSynchronize());
}
}
inline void SynchronizeAll() {
for (int device_idx : xgboost::GPUSet::AllVisible()) {
safe_cuda(cudaSetDevice(device_idx));
safe_cuda(cudaDeviceSynchronize());
}
}
inline size_t AvailableMemory(int device_idx) { inline size_t AvailableMemory(int device_idx) {
size_t device_free = 0; size_t device_free = 0;
size_t device_total = 0; size_t device_total = 0;
@ -119,7 +103,7 @@ inline size_t MaxSharedMemory(int device_idx) {
} }
inline void CheckComputeCapability() { inline void CheckComputeCapability() {
for (int d_idx : xgboost::GPUSet::AllVisible()) { for (int d_idx = 0; d_idx < xgboost::common::AllVisibleGPUs(); ++d_idx) {
cudaDeviceProp prop; cudaDeviceProp prop;
safe_cuda(cudaGetDeviceProperties(&prop, d_idx)); safe_cuda(cudaGetDeviceProperties(&prop, d_idx));
std::ostringstream oss; std::ostringstream oss;
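
With SynchronizeNDevices and SynchronizeAll removed, synchronizing the single device a process owns comes down to two runtime calls. A sketch of an equivalent helper using the existing dh::safe_cuda wrapper; the function itself is hypothetical.

    // Synchronize the one device this process is bound to.
    inline void SynchronizeDevice(int device) {
      dh::safe_cuda(cudaSetDevice(device));
      dh::safe_cuda(cudaDeviceSynchronize());
    }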

View File

@ -35,7 +35,6 @@ __global__ void FindCutsK
if (icut >= ncuts) { if (icut >= ncuts) {
return; return;
} }
WXQSketch::Entry v;
int isample = 0; int isample = 0;
if (icut == 0) { if (icut == 0) {
isample = 0; isample = 0;
@ -59,11 +58,14 @@ struct IsNotNaN {
__device__ bool operator()(float a) const { return !isnan(a); } __device__ bool operator()(float a) const { return !isnan(a); }
}; };
__global__ void UnpackFeaturesK __global__ void UnpackFeaturesK(float* __restrict__ fvalues,
(float* __restrict__ fvalues, float* __restrict__ feature_weights, float* __restrict__ feature_weights,
const size_t* __restrict__ row_ptrs, const float* __restrict__ weights, const size_t* __restrict__ row_ptrs,
Entry* entries, size_t nrows_array, int ncols, size_t row_begin_ptr, const float* __restrict__ weights,
size_t nrows) { Entry* entries,
size_t nrows_array,
size_t row_begin_ptr,
size_t nrows) {
size_t irow = threadIdx.x + size_t(blockIdx.x) * blockDim.x; size_t irow = threadIdx.x + size_t(blockIdx.x) * blockDim.x;
if (irow >= nrows) { if (irow >= nrows) {
return; return;
@ -102,8 +104,9 @@ struct SketchContainer {
const MetaInfo &info = dmat->Info(); const MetaInfo &info = dmat->Info();
// Initialize Sketches for this dmatrix // Initialize Sketches for this dmatrix
sketches_.resize(info.num_col_); sketches_.resize(info.num_col_);
#pragma omp parallel for schedule(static) if (info.num_col_ > kOmpNumColsParallelizeLimit) #pragma omp parallel for default(none) shared(info, param) schedule(static) \
for (int icol = 0; icol < info.num_col_; ++icol) { if (info.num_col_ > kOmpNumColsParallelizeLimit) // NOLINT
for (int icol = 0; icol < info.num_col_; ++icol) { // NOLINT
sketches_[icol].Init(info.num_row_, 1.0 / (8 * param.max_bin)); sketches_[icol].Init(info.num_row_, 1.0 / (8 * param.max_bin));
} }
} }
@ -120,8 +123,6 @@ struct GPUSketcher {
// manage memory for a single GPU // manage memory for a single GPU
class DeviceShard { class DeviceShard {
int device_; int device_;
bst_uint row_begin_; // The row offset for this shard
bst_uint row_end_;
bst_uint n_rows_; bst_uint n_rows_;
int num_cols_{0}; int num_cols_{0};
size_t n_cuts_{0}; size_t n_cuts_{0};
@ -131,27 +132,31 @@ struct GPUSketcher {
tree::TrainParam param_; tree::TrainParam param_;
SketchContainer *sketch_container_; SketchContainer *sketch_container_;
dh::device_vector<size_t> row_ptrs_; dh::device_vector<size_t> row_ptrs_{};
dh::device_vector<Entry> entries_; dh::device_vector<Entry> entries_{};
dh::device_vector<bst_float> fvalues_; dh::device_vector<bst_float> fvalues_{};
dh::device_vector<bst_float> feature_weights_; dh::device_vector<bst_float> feature_weights_{};
dh::device_vector<bst_float> fvalues_cur_; dh::device_vector<bst_float> fvalues_cur_{};
dh::device_vector<WXQSketch::Entry> cuts_d_; dh::device_vector<WXQSketch::Entry> cuts_d_{};
thrust::host_vector<WXQSketch::Entry> cuts_h_; thrust::host_vector<WXQSketch::Entry> cuts_h_{};
dh::device_vector<bst_float> weights_; dh::device_vector<bst_float> weights_{};
dh::device_vector<bst_float> weights2_; dh::device_vector<bst_float> weights2_{};
std::vector<size_t> n_cuts_cur_; std::vector<size_t> n_cuts_cur_{};
dh::device_vector<size_t> num_elements_; dh::device_vector<size_t> num_elements_{};
dh::device_vector<char> tmp_storage_; dh::device_vector<char> tmp_storage_{};
public: public:
DeviceShard(int device, bst_uint row_begin, bst_uint row_end, DeviceShard(int device,
tree::TrainParam param, SketchContainer *sketch_container) : bst_uint n_rows,
device_(device), row_begin_(row_begin), row_end_(row_end), tree::TrainParam param,
n_rows_(row_end - row_begin), param_(std::move(param)), sketch_container_(sketch_container) { SketchContainer* sketch_container) :
device_(device),
n_rows_(n_rows),
param_(std::move(param)),
sketch_container_(sketch_container) {
} }
~DeviceShard() { ~DeviceShard() { // NOLINT
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_));
} }
@ -319,19 +324,18 @@ struct GPUSketcher {
const auto& offset_vec = row_batch.offset.HostVector(); const auto& offset_vec = row_batch.offset.HostVector();
const auto& data_vec = row_batch.data.HostVector(); const auto& data_vec = row_batch.data.HostVector();
size_t n_entries = offset_vec[row_begin_ + batch_row_end] - size_t n_entries = offset_vec[batch_row_end] - offset_vec[batch_row_begin];
offset_vec[row_begin_ + batch_row_begin];
// copy the batch to the GPU // copy the batch to the GPU
dh::safe_cuda dh::safe_cuda
(cudaMemcpyAsync(entries_.data().get(), (cudaMemcpyAsync(entries_.data().get(),
data_vec.data() + offset_vec[row_begin_ + batch_row_begin], data_vec.data() + offset_vec[batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault)); n_entries * sizeof(Entry), cudaMemcpyDefault));
// copy the weights if necessary // copy the weights if necessary
if (has_weights_) { if (has_weights_) {
const auto& weights_vec = info.weights_.HostVector(); const auto& weights_vec = info.weights_.HostVector();
dh::safe_cuda dh::safe_cuda
(cudaMemcpyAsync(weights_.data().get(), (cudaMemcpyAsync(weights_.data().get(),
weights_vec.data() + row_begin_ + batch_row_begin, weights_vec.data() + batch_row_begin,
batch_nrows * sizeof(bst_float), cudaMemcpyDefault)); batch_nrows * sizeof(bst_float), cudaMemcpyDefault));
} }
@ -349,8 +353,7 @@ struct GPUSketcher {
(fvalues_.data().get(), has_weights_ ? feature_weights_.data().get() : nullptr, (fvalues_.data().get(), has_weights_ ? feature_weights_.data().get() : nullptr,
row_ptrs_.data().get() + batch_row_begin, row_ptrs_.data().get() + batch_row_begin,
has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(), has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
gpu_batch_nrows_, num_cols_, gpu_batch_nrows_, offset_vec[batch_row_begin], batch_nrows);
offset_vec[row_begin_ + batch_row_begin], batch_nrows);
for (int icol = 0; icol < num_cols_; ++icol) { for (int icol = 0; icol < num_cols_; ++icol) {
FindColumnCuts(batch_nrows, icol); FindColumnCuts(batch_nrows, icol);
@ -358,7 +361,7 @@ struct GPUSketcher {
// add cuts into sketches // add cuts into sketches
thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin()); thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin());
#pragma omp parallel for schedule(static) \ #pragma omp parallel for default(none) schedule(static) \
if (num_cols_ > SketchContainer::kOmpNumColsParallelizeLimit) // NOLINT if (num_cols_ > SketchContainer::kOmpNumColsParallelizeLimit) // NOLINT
for (int icol = 0; icol < num_cols_; ++icol) { for (int icol = 0; icol < num_cols_; ++icol) {
WXQSketch::SummaryContainer summary; WXQSketch::SummaryContainer summary;
@ -391,8 +394,7 @@ struct GPUSketcher {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_));
const auto& offset_vec = row_batch.offset.HostVector(); const auto& offset_vec = row_batch.offset.HostVector();
row_ptrs_.resize(n_rows_ + 1); row_ptrs_.resize(n_rows_ + 1);
thrust::copy(offset_vec.data() + row_begin_, thrust::copy(offset_vec.data(), offset_vec.data() + n_rows_ + 1, row_ptrs_.begin());
offset_vec.data() + row_end_ + 1, row_ptrs_.begin());
size_t gpu_nbatches = common::DivRoundUp(n_rows_, gpu_batch_nrows_); size_t gpu_nbatches = common::DivRoundUp(n_rows_, gpu_batch_nrows_);
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) { for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
SketchBatch(row_batch, info, gpu_batch); SketchBatch(row_batch, info, gpu_batch);
@ -401,32 +403,18 @@ struct GPUSketcher {
}; };
void SketchBatch(const SparsePage &batch, const MetaInfo &info) { void SketchBatch(const SparsePage &batch, const MetaInfo &info) {
GPUDistribution dist = auto device = generic_param_.gpu_id;
GPUDistribution::Block(GPUSet::All(generic_param_.gpu_id, generic_param_.n_gpus,
batch.Size()));
// create device shards // create device shard
shards_.resize(dist.Devices().Size()); shard_.reset(new DeviceShard(device, batch.Size(), param_, sketch_container_.get()));
dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
size_t start = dist.ShardStart(batch.Size(), i);
size_t size = dist.ShardSize(batch.Size(), i);
shard = std::unique_ptr<DeviceShard>(
new DeviceShard(dist.Devices().DeviceId(i), start,
start + size, param_, sketch_container_.get()));
});
// compute sketches for each shard // compute sketches for the shard
dh::ExecuteIndexShards(&shards_, shard_->Init(batch, info, gpu_batch_nrows_);
[&](int idx, std::unique_ptr<DeviceShard>& shard) { shard_->Sketch(batch, info);
shard->Init(batch, info, gpu_batch_nrows_); shard_->ComputeRowStride();
shard->Sketch(batch, info);
shard->ComputeRowStride();
});
// compute row stride across all shards // compute row stride
for (const auto &shard : shards_) { row_stride_ = shard_->GetRowStride();
row_stride_ = std::max(row_stride_, shard->GetRowStride());
}
} }
GPUSketcher(const tree::TrainParam &param, const GenericParameter &generic_param, int gpu_nrows) GPUSketcher(const tree::TrainParam &param, const GenericParameter &generic_param, int gpu_nrows)
@ -444,13 +432,13 @@ struct GPUSketcher {
this->SketchBatch(batch, info); this->SketchBatch(batch, info);
} }
hmat->Init(&sketch_container_.get()->sketches_, param_.max_bin); hmat->Init(&sketch_container_->sketches_, param_.max_bin);
return row_stride_; return row_stride_;
} }
private: private:
std::vector<std::unique_ptr<DeviceShard>> shards_; std::unique_ptr<DeviceShard> shard_;
const tree::TrainParam &param_; const tree::TrainParam &param_;
const GenericParameter &generic_param_; const GenericParameter &generic_param_;
int gpu_batch_nrows_; int gpu_batch_nrows_;

View File

@ -30,19 +30,19 @@ struct HostDeviceVectorImpl {
}; };
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, const GPUDistribution &) HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
: impl_(nullptr) { : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v); impl_ = new HostDeviceVectorImpl<T>(size, v);
} }
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, const GPUDistribution &) HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
: impl_(nullptr) { : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init); impl_ = new HostDeviceVectorImpl<T>(init);
} }
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, const GPUDistribution &) HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
: impl_(nullptr) { : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init); impl_ = new HostDeviceVectorImpl<T>(init);
} }
@ -75,29 +75,23 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); } size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
template <typename T> template <typename T>
GPUSet HostDeviceVector<T>::Devices() const { return GPUSet::Empty(); } int HostDeviceVector<T>::DeviceIdx() const { return -1; }
template <typename T> template <typename T>
const GPUDistribution& HostDeviceVector<T>::Distribution() const { T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
static GPUDistribution dummyInstance;
return dummyInstance;
}
template <typename T> template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; } const T* HostDeviceVector<T>::ConstDevicePointer() const {
template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
return nullptr; return nullptr;
} }
template <typename T> template <typename T>
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) { common::Span<T> HostDeviceVector<T>::DeviceSpan() {
return common::Span<T>(); return common::Span<T>();
} }
template <typename T> template <typename T>
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const { common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
return common::Span<const T>(); return common::Span<const T>();
} }
@ -115,10 +109,7 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
} }
template <typename T> template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) const { return 0; } size_t HostDeviceVector<T>::DeviceSize() const { return 0; }
template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) const { return 0; }
template <typename T> template <typename T>
void HostDeviceVector<T>::Fill(T v) { void HostDeviceVector<T>::Fill(T v) {
@ -149,18 +140,12 @@ bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
} }
template <typename T> template <typename T>
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const { bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
return false; return false;
} }
template <typename T> template <typename T>
void HostDeviceVector<T>::Shard(const GPUDistribution& distribution) const { } void HostDeviceVector<T>::SetDevice(int device) const {}
template <typename T>
void HostDeviceVector<T>::Shard(GPUSet devices) const { }
template <typename T>
void Reshard(const GPUDistribution &distribution) { }
// explicit instantiations are required, as HostDeviceVector isn't header-only // explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>; template class HostDeviceVector<bst_float>;
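
These CPU-only stubs let callers probe for a device without #ifdefs: in a non-CUDA build DeviceIdx() always returns -1 and DeviceSize() returns 0. A sketch of the resulting pattern in caller code, which is assumed rather than taken from this diff.

    // Works in both CUDA and CPU-only builds.
    void Process(xgboost::HostDeviceVector<float>* vec) {
      if (vec->DeviceIdx() >= 0) {
        auto d_span = vec->ConstDeviceSpan();        // device path (CUDA build)
        (void)d_span;
      } else {
        const auto& h_vec = vec->ConstHostVector();  // host path
        (void)h_vec;
      }
    }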

View File

@ -10,7 +10,6 @@
#include <mutex> #include <mutex>
#include "./device_helpers.cuh" #include "./device_helpers.cuh"
namespace xgboost { namespace xgboost {
// the handler to call instead of cudaSetDevice; only used for testing // the handler to call instead of cudaSetDevice; only used for testing
@ -43,144 +42,12 @@ class Permissions {
}; };
template <typename T> template <typename T>
struct HostDeviceVectorImpl { class HostDeviceVectorImpl {
struct DeviceShard { public:
DeviceShard() HostDeviceVectorImpl(size_t size, T v, int device) : device_(device), perm_h_(device < 0) {
: proper_size_{0}, device_{-1}, start_{0}, perm_d_{false}, if (device >= 0) {
cached_size_{static_cast<size_t>(~0)}, vec_{nullptr} {}
~DeviceShard() {
SetDevice(); SetDevice();
} data_d_.resize(size, v);
void Init(HostDeviceVectorImpl<T>* vec, int device) {
if (vec_ == nullptr) { vec_ = vec; }
CHECK_EQ(vec, vec_);
device_ = device;
LazyResize(vec_->Size());
perm_d_ = vec_->perm_h_.Complementary();
}
void Init(HostDeviceVectorImpl<T>* vec, const DeviceShard& other) {
if (vec_ == nullptr) { vec_ = vec; }
CHECK_EQ(vec, vec_);
device_ = other.device_;
cached_size_ = other.cached_size_;
start_ = other.start_;
proper_size_ = other.proper_size_;
SetDevice();
data_.resize(other.data_.size());
perm_d_ = other.perm_d_;
}
void ScatterFrom(const T* begin) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), begin + start_,
data_.size() * sizeof(T), cudaMemcpyDefault));
}
void GatherTo(thrust::device_ptr<T> begin) {
LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(begin.get() + start_, data_.data().get(),
proper_size_ * sizeof(T), cudaMemcpyDefault));
}
void Fill(T v) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
thrust::fill(data_.begin(), data_.end(), v);
}
void Copy(DeviceShard* other) {
// TODO(canonizer): avoid full copy of host data for this (but not for other)
LazySyncDevice(GPUAccess::kWrite);
other->LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), other->data_.data().get(),
data_.size() * sizeof(T), cudaMemcpyDefault));
}
void LazySyncHost(GPUAccess access) {
SetDevice();
dh::safe_cuda(cudaMemcpy(vec_->data_h_.data() + start_,
data_.data().get(), proper_size_ * sizeof(T),
cudaMemcpyDeviceToHost));
perm_d_.DenyComplementary(access);
}
void LazyResize(size_t new_size) {
if (new_size == cached_size_) { return; }
// resize is required
int ndevices = vec_->distribution_.devices_.Size();
int device_index = vec_->distribution_.devices_.Index(device_);
start_ = vec_->distribution_.ShardStart(new_size, device_index);
proper_size_ = vec_->distribution_.ShardProperSize(new_size, device_index);
// The size on this device.
size_t size_d = vec_->distribution_.ShardSize(new_size, device_index);
SetDevice();
data_.resize(size_d);
cached_size_ = new_size;
}
void LazySyncDevice(GPUAccess access) {
if (perm_d_.CanAccess(access)) { return; }
if (perm_d_.CanRead()) {
// deny read to the host
perm_d_.Grant(access);
std::lock_guard<std::mutex> lock(vec_->mutex_);
vec_->perm_h_.DenyComplementary(access);
return;
}
// data is on the host
size_t size_h = vec_->data_h_.size();
LazyResize(size_h);
SetDevice();
dh::safe_cuda(
cudaMemcpy(data_.data().get(), vec_->data_h_.data() + start_,
data_.size() * sizeof(T), cudaMemcpyHostToDevice));
perm_d_.Grant(access);
std::lock_guard<std::mutex> lock(vec_->mutex_);
vec_->perm_h_.DenyComplementary(access);
vec_->size_d_ = size_h;
}
void SetDevice() {
if (cudaSetDeviceHandler == nullptr) {
dh::safe_cuda(cudaSetDevice(device_));
} else {
(*cudaSetDeviceHandler)(device_);
}
}
T* Raw() { return data_.data().get(); }
size_t Start() const { return start_; }
size_t DataSize() const { return data_.size(); }
Permissions& Perm() { return perm_d_; }
Permissions const& Perm() const { return perm_d_; }
private:
int device_;
dh::device_vector<T> data_;
// cached vector size
size_t cached_size_;
size_t start_;
// size of the portion to copy back to the host
size_t proper_size_;
Permissions perm_d_;
HostDeviceVectorImpl<T>* vec_;
};
HostDeviceVectorImpl(size_t size, T v, const GPUDistribution &distribution)
: distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
if (!distribution_.IsEmpty()) {
size_d_ = size;
InitShards();
Fill(v);
} else { } else {
data_h_.resize(size, v); data_h_.resize(size, v);
} }
@ -188,127 +55,81 @@ struct HostDeviceVectorImpl {
// required, as a new std::mutex has to be created // required, as a new std::mutex has to be created
HostDeviceVectorImpl(const HostDeviceVectorImpl<T>& other) HostDeviceVectorImpl(const HostDeviceVectorImpl<T>& other)
: data_h_(other.data_h_), perm_h_(other.perm_h_), size_d_(other.size_d_), : device_(other.device_), data_h_(other.data_h_), perm_h_(other.perm_h_), mutex_() {
distribution_(other.distribution_), mutex_() { if (device_ >= 0) {
shards_.resize(other.shards_.size()); SetDevice();
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) { data_d_ = other.data_d_;
shard.Init(this, other.shards_.at(i)); }
});
} }
// Initializer can be std::vector<T> or std::initializer_list<T> // Initializer can be std::vector<T> or std::initializer_list<T>
template <class Initializer> template <class Initializer>
HostDeviceVectorImpl(const Initializer& init, const GPUDistribution &distribution) HostDeviceVectorImpl(const Initializer& init, int device) : device_(device), perm_h_(device < 0) {
: distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) { if (device >= 0) {
if (!distribution_.IsEmpty()) { LazyResizeDevice(init.size());
size_d_ = init.size();
InitShards();
Copy(init); Copy(init);
} else { } else {
data_h_ = init; data_h_ = init;
} }
} }
void InitShards() { ~HostDeviceVectorImpl() {
int ndevices = distribution_.devices_.Size(); if (device_ >= 0) {
shards_.resize(ndevices); SetDevice();
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) { }
shard.Init(this, distribution_.devices_.DeviceId(i));
});
} }
size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : size_d_; } size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : data_d_.size(); }
GPUSet Devices() const { return distribution_.devices_; } int DeviceIdx() const { return device_; }
const GPUDistribution& Distribution() const { return distribution_; } T* DevicePointer() {
LazySyncDevice(GPUAccess::kWrite);
T* DevicePointer(int device) { return data_d_.data().get();
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kWrite);
return shards_.at(distribution_.devices_.Index(device)).Raw();
} }
const T* ConstDevicePointer(int device) { const T* ConstDevicePointer() {
CHECK(distribution_.devices_.Contains(device)); LazySyncDevice(GPUAccess::kRead);
LazySyncDevice(device, GPUAccess::kRead); return data_d_.data().get();
return shards_.at(distribution_.devices_.Index(device)).Raw();
} }
common::Span<T> DeviceSpan(int device) { common::Span<T> DeviceSpan() {
GPUSet devices = distribution_.devices_; LazySyncDevice(GPUAccess::kWrite);
CHECK(devices.Contains(device)); return {data_d_.data().get(), static_cast<typename common::Span<T>::index_type>(DeviceSize())};
LazySyncDevice(device, GPUAccess::kWrite);
return {shards_.at(devices.Index(device)).Raw(),
static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
} }
common::Span<const T> ConstDeviceSpan(int device) { common::Span<const T> ConstDeviceSpan() {
GPUSet devices = distribution_.devices_; LazySyncDevice(GPUAccess::kRead);
CHECK(devices.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
using SpanInd = typename common::Span<const T>::index_type; using SpanInd = typename common::Span<const T>::index_type;
return {shards_.at(devices.Index(device)).Raw(), return {data_d_.data().get(), static_cast<SpanInd>(DeviceSize())};
static_cast<SpanInd>(DeviceSize(device))};
} }
size_t DeviceSize(int device) { size_t DeviceSize() {
CHECK(distribution_.devices_.Contains(device)); LazySyncDevice(GPUAccess::kRead);
LazySyncDevice(device, GPUAccess::kRead); return data_d_.size();
return shards_.at(distribution_.devices_.Index(device)).DataSize();
} }
size_t DeviceStart(int device) { thrust::device_ptr<T> tbegin() { // NOLINT
CHECK(distribution_.devices_.Contains(device)); return thrust::device_ptr<T>(DevicePointer());
LazySyncDevice(device, GPUAccess::kRead);
return shards_.at(distribution_.devices_.Index(device)).Start();
} }
thrust::device_ptr<T> tbegin(int device) { // NOLINT thrust::device_ptr<const T> tcbegin() { // NOLINT
return thrust::device_ptr<T>(DevicePointer(device)); return thrust::device_ptr<const T>(ConstDevicePointer());
} }
thrust::device_ptr<const T> tcbegin(int device) { // NOLINT thrust::device_ptr<T> tend() { // NOLINT
return thrust::device_ptr<const T>(ConstDevicePointer(device)); return tbegin() + DeviceSize();
} }
thrust::device_ptr<T> tend(int device) { // NOLINT thrust::device_ptr<const T> tcend() { // NOLINT
return tbegin(device) + DeviceSize(device); return tcbegin() + DeviceSize();
}
thrust::device_ptr<const T> tcend(int device) { // NOLINT
return tcbegin(device) + DeviceSize(device);
}
void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
CHECK_EQ(end - begin, Size());
if (perm_h_.CanWrite()) {
dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
(end - begin) * sizeof(T),
cudaMemcpyDeviceToHost));
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.ScatterFrom(begin.get());
});
}
}
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
CHECK_EQ(end - begin, Size());
if (perm_h_.CanWrite()) {
dh::safe_cuda(cudaMemcpy(begin.get(), data_h_.data(),
data_h_.size() * sizeof(T),
cudaMemcpyHostToDevice));
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.GatherTo(begin); });
}
} }
void Fill(T v) { // NOLINT void Fill(T v) { // NOLINT
if (perm_h_.CanWrite()) { if (perm_h_.CanWrite()) {
std::fill(data_h_.begin(), data_h_.end(), v); std::fill(data_h_.begin(), data_h_.end(), v);
} else { } else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.Fill(v); }); DeviceFill(v);
} }
} }
@ -320,14 +141,10 @@ struct HostDeviceVectorImpl {
return; return;
} }
// Data is on device; // Data is on device;
if (distribution_ != other->distribution_) { if (device_ != other->device_) {
distribution_ = GPUDistribution(); SetDevice(other->device_);
Shard(other->Distribution());
size_d_ = other->size_d_;
} }
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) { DeviceCopy(other);
shard.Copy(&other->shards_.at(i));
});
} }
void Copy(const std::vector<T>& other) { void Copy(const std::vector<T>& other) {
@ -335,9 +152,7 @@ struct HostDeviceVectorImpl {
if (perm_h_.CanWrite()) { if (perm_h_.CanWrite()) {
std::copy(other.begin(), other.end(), data_h_.begin()); std::copy(other.begin(), other.end(), data_h_.begin());
} else { } else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { DeviceCopy(other.data());
shard.ScatterFrom(other.data());
});
} }
} }
@ -346,9 +161,7 @@ struct HostDeviceVectorImpl {
if (perm_h_.CanWrite()) { if (perm_h_.CanWrite()) {
std::copy(other.begin(), other.end(), data_h_.begin()); std::copy(other.begin(), other.end(), data_h_.begin());
} else { } else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { DeviceCopy(other.begin());
shard.ScatterFrom(other.begin());
});
} }
} }
@ -362,40 +175,23 @@ struct HostDeviceVectorImpl {
return data_h_; return data_h_;
} }
void Shard(const GPUDistribution& distribution) { void SetDevice(int device) {
if (distribution_ == distribution) { return; } if (device_ == device) { return; }
CHECK(distribution_.IsEmpty()) if (device_ >= 0) {
<< "Data resides on different GPUs: " << "ID: " LazySyncHost(GPUAccess::kWrite);
<< *(distribution_.Devices().begin()) << " and ID: " }
<< *(distribution.Devices().begin()); device_ = device;
distribution_ = distribution; if (device_ >= 0) {
InitShards(); LazyResizeDevice(data_h_.size());
} }
void Shard(GPUSet new_devices) {
if (distribution_.Devices() == new_devices) { return; }
Shard(GPUDistribution::Block(new_devices));
}
void Reshard(const GPUDistribution &distribution) {
if (distribution_ == distribution) { return; }
LazySyncHost(GPUAccess::kWrite);
distribution_ = distribution;
shards_.clear();
InitShards();
} }
void Resize(size_t new_size, T v) { void Resize(size_t new_size, T v) {
if (new_size == Size()) { return; } if (new_size == Size()) { return; }
if (distribution_.IsFixedSize()) { if (Size() == 0 && device_ >= 0) {
CHECK_EQ(new_size, distribution_.offsets_.back());
}
if (Size() == 0 && !distribution_.IsEmpty()) {
// fast on-device resize // fast on-device resize
perm_h_ = Permissions(false); perm_h_ = Permissions(false);
size_d_ = new_size; data_d_.resize(new_size, v);
InitShards();
Fill(v);
} else { } else {
// resize on host // resize on host
LazySyncHost(GPUAccess::kWrite); LazySyncHost(GPUAccess::kWrite);
@ -407,72 +203,110 @@ struct HostDeviceVectorImpl {
if (perm_h_.CanAccess(access)) { return; } if (perm_h_.CanAccess(access)) { return; }
if (perm_h_.CanRead()) { if (perm_h_.CanRead()) {
// data is present, just need to deny access to the device // data is present, just need to deny access to the device
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.Perm().DenyComplementary(access);
});
perm_h_.Grant(access); perm_h_.Grant(access);
return; return;
} }
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
if (data_h_.size() != size_d_) { data_h_.resize(size_d_); } if (data_h_.size() != data_d_.size()) { data_h_.resize(data_d_.size()); }
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { SetDevice();
shard.LazySyncHost(access); dh::safe_cuda(cudaMemcpy(data_h_.data(),
}); data_d_.data().get(),
data_d_.size() * sizeof(T),
cudaMemcpyDeviceToHost));
perm_h_.Grant(access); perm_h_.Grant(access);
} }
void LazySyncDevice(int device, GPUAccess access) { void LazySyncDevice(GPUAccess access) {
GPUSet devices = distribution_.Devices(); if (DevicePerm().CanAccess(access)) { return; }
CHECK(devices.Contains(device)); if (DevicePerm().CanRead()) {
shards_.at(devices.Index(device)).LazySyncDevice(access); // deny read to the host
std::lock_guard<std::mutex> lock(mutex_);
perm_h_.DenyComplementary(access);
return;
}
// data is on the host
LazyResizeDevice(data_h_.size());
SetDevice();
dh::safe_cuda(cudaMemcpy(data_d_.data().get(),
data_h_.data(),
data_d_.size() * sizeof(T),
cudaMemcpyHostToDevice));
std::lock_guard<std::mutex> lock(mutex_);
perm_h_.DenyComplementary(access);
} }
bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); } bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); }
bool DeviceCanAccess(GPUAccess access) { return DevicePerm().CanAccess(access); }
bool DeviceCanAccess(int device, GPUAccess access) {
GPUSet devices = distribution_.Devices();
if (!devices.Contains(device)) { return false; }
return shards_.at(devices.Index(device)).Perm().CanAccess(access);
}
private: private:
std::vector<T> data_h_; int device_{-1};
Permissions perm_h_; std::vector<T> data_h_{};
// the total size of the data stored on the devices dh::device_vector<T> data_d_{};
size_t size_d_; Permissions perm_h_{false};
GPUDistribution distribution_;
// protects size_d_ and perm_h_ when updated from multiple threads // protects size_d_ and perm_h_ when updated from multiple threads
std::mutex mutex_; std::mutex mutex_{};
std::vector<DeviceShard> shards_;
void DeviceFill(T v) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
thrust::fill(data_d_.begin(), data_d_.end(), v);
}
void DeviceCopy(HostDeviceVectorImpl* other) {
// TODO(canonizer): avoid full copy of host data for this (but not for other)
LazySyncDevice(GPUAccess::kWrite);
other->LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), other->data_d_.data().get(),
data_d_.size() * sizeof(T), cudaMemcpyDefault));
}
void DeviceCopy(const T* begin) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), begin,
data_d_.size() * sizeof(T), cudaMemcpyDefault));
}
void LazyResizeDevice(size_t new_size) {
if (new_size == data_d_.size()) { return; }
SetDevice();
data_d_.resize(new_size);
}
void SetDevice() {
CHECK_GE(device_, 0);
if (cudaSetDeviceHandler == nullptr) {
dh::safe_cuda(cudaSetDevice(device_));
} else {
(*cudaSetDeviceHandler)(device_);
}
}
Permissions DevicePerm() const { return perm_h_.Complementary(); }
}; };
template <typename T> template<typename T>
HostDeviceVector<T>::HostDeviceVector HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
(size_t size, T v, const GPUDistribution &distribution) : impl_(nullptr) { : impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
impl_ = new HostDeviceVectorImpl<T>(size, v, distribution);
}
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
(std::initializer_list<T> init, const GPUDistribution &distribution) : impl_(nullptr) { : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
(const std::vector<T>& init, const GPUDistribution &distribution) : impl_(nullptr) { : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other) HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
: impl_(nullptr) { : impl_(new HostDeviceVectorImpl<T>(*other.impl_)) {}
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
}
template <typename T> template <typename T>
HostDeviceVector<T>& HostDeviceVector<T>::operator= HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
(const HostDeviceVector<T>& other) {
if (this == &other) { return *this; } if (this == &other) { return *this; }
std::unique_ptr<HostDeviceVectorImpl<T>> newImpl(new HostDeviceVectorImpl<T>(*other.impl_)); std::unique_ptr<HostDeviceVectorImpl<T>> newImpl(new HostDeviceVectorImpl<T>(*other.impl_));
@ -491,73 +325,51 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); } size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
template <typename T> template <typename T>
GPUSet HostDeviceVector<T>::Devices() const { return impl_->Devices(); } int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
template <typename T> template <typename T>
const GPUDistribution& HostDeviceVector<T>::Distribution() const { T* HostDeviceVector<T>::DevicePointer() {
return impl_->Distribution(); return impl_->DevicePointer();
} }
template <typename T> template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) { const T* HostDeviceVector<T>::ConstDevicePointer() const {
return impl_->DevicePointer(device); return impl_->ConstDevicePointer();
} }
template <typename T> template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const { common::Span<T> HostDeviceVector<T>::DeviceSpan() {
return impl_->ConstDevicePointer(device); return impl_->DeviceSpan();
} }
template <typename T> template <typename T>
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) { common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
return impl_->DeviceSpan(device); return impl_->ConstDeviceSpan();
} }
template <typename T> template <typename T>
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const { size_t HostDeviceVector<T>::DeviceSize() const {
return impl_->ConstDeviceSpan(device); return impl_->DeviceSize();
} }
template <typename T> template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) const { thrust::device_ptr<T> HostDeviceVector<T>::tbegin() { // NOLINT
return impl_->DeviceStart(device); return impl_->tbegin();
} }
template <typename T> template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) const { thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin() const { // NOLINT
return impl_->DeviceSize(device); return impl_->tcbegin();
} }
template <typename T> template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) { // NOLINT thrust::device_ptr<T> HostDeviceVector<T>::tend() { // NOLINT
return impl_->tbegin(device); return impl_->tend();
} }
template <typename T> template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin(int device) const { // NOLINT thrust::device_ptr<const T> HostDeviceVector<T>::tcend() const { // NOLINT
return impl_->tcbegin(device); return impl_->tcend();
}
template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) { // NOLINT
return impl_->tend(device);
}
template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcend(int device) const { // NOLINT
return impl_->tcend(device);
}
template <typename T>
void HostDeviceVector<T>::ScatterFrom
(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
impl_->ScatterFrom(begin, end);
}
template <typename T>
void HostDeviceVector<T>::GatherTo
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const {
impl_->GatherTo(begin, end);
} }
template <typename T> template <typename T>
@ -594,23 +406,13 @@ bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
} }
template <typename T> template <typename T>
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const { bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
return impl_->DeviceCanAccess(device, access); return impl_->DeviceCanAccess(access);
} }
template <typename T> template <typename T>
void HostDeviceVector<T>::Shard(GPUSet new_devices) const { void HostDeviceVector<T>::SetDevice(int device) const {
impl_->Shard(new_devices); impl_->SetDevice(device);
}
template <typename T>
void HostDeviceVector<T>::Shard(const GPUDistribution &distribution) const {
impl_->Shard(distribution);
}
template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution &distribution) {
impl_->Reshard(distribution);
} }
template <typename T> template <typename T>

View File

@ -79,113 +79,6 @@ void SetCudaSetDeviceHandler(void (*handler)(int));
template <typename T> struct HostDeviceVectorImpl; template <typename T> struct HostDeviceVectorImpl;
// Distribution for the HostDeviceVector; it specifies such aspects as the
// devices it is distributed on, whether there are copies of elements from
// other GPUs as well as the granularity of splitting. It may also specify
// explicit boundaries for devices, in which case the size of the array cannot
// be changed.
class GPUDistribution {
template<typename T> friend struct HostDeviceVectorImpl;
public:
explicit GPUDistribution(GPUSet devices = GPUSet::Empty())
: devices_(devices), granularity_(1), overlap_(0) {}
private:
GPUDistribution(GPUSet devices, int granularity, int overlap,
std::vector<size_t> &&offsets)
: devices_(devices), granularity_(granularity), overlap_(overlap),
offsets_(std::move(offsets)) {}
public:
static GPUDistribution Empty() { return GPUDistribution(); }
static GPUDistribution Block(GPUSet devices) { return GPUDistribution(devices); }
static GPUDistribution Overlap(GPUSet devices, int overlap) {
return GPUDistribution(devices, 1, overlap, std::vector<size_t>());
}
static GPUDistribution Granular(GPUSet devices, int granularity) {
return GPUDistribution(devices, granularity, 0, std::vector<size_t>());
}
// NOTE(rongou): Explicit offsets don't necessarily cover the whole vector. Sections before the
// first shard or after the last shard may be on host only. This windowing is done in the GPU
// predictor for external memory support.
static GPUDistribution Explicit(GPUSet devices, std::vector<size_t> offsets) {
return GPUDistribution(devices, 1, 0, std::move(offsets));
}
friend bool operator==(const GPUDistribution& a, const GPUDistribution& b) {
bool const res = a.devices_ == b.devices_ &&
a.granularity_ == b.granularity_ &&
a.overlap_ == b.overlap_ &&
a.offsets_ == b.offsets_;
return res;
}
friend bool operator!=(const GPUDistribution& a, const GPUDistribution& b) {
return !(a == b);
}
GPUSet Devices() const { return devices_; }
bool IsEmpty() const { return devices_.IsEmpty(); }
size_t ShardStart(size_t size, int index) const {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
CHECK_EQ(offsets_.back(), size);
return offsets_.at(index);
}
// no explicit offsets
size_t begin = std::min(index * Portion(size), size);
begin = begin > size ? size : begin;
return begin;
}
size_t ShardSize(size_t size, size_t index) const {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
CHECK_EQ(offsets_.back(), size);
return offsets_.at(index + 1) - offsets_.at(index) +
(index == devices_.Size() - 1 ? overlap_ : 0);
}
size_t portion = Portion(size);
size_t begin = std::min(index * portion, size);
size_t end = std::min((index + 1) * portion + overlap_ * granularity_, size);
return end - begin;
}
size_t ShardProperSize(size_t size, size_t index) const {
if (size == 0) { return 0; }
return ShardSize(size, index) - (devices_.Size() - 1 > index ? overlap_ : 0);
}
bool IsFixedSize() const { return !offsets_.empty(); }
private:
static size_t DivRoundUp(size_t a, size_t b) { return (a + b - 1) / b; }
static size_t RoundUp(size_t a, size_t b) { return DivRoundUp(a, b) * b; }
size_t Portion(size_t size) const {
return RoundUp
(DivRoundUp
(std::max(static_cast<int64_t>(size - overlap_ * granularity_),
static_cast<int64_t>(1)),
devices_.Size()), granularity_);
}
GPUSet devices_;
int granularity_;
int overlap_;
// explicit offsets for the GPU parts, if any
std::vector<size_t> offsets_;
};
enum GPUAccess { enum GPUAccess {
kNone, kRead, kNone, kRead,
// write implies read // write implies read
@ -199,46 +92,38 @@ inline GPUAccess operator-(GPUAccess a, GPUAccess b) {
template <typename T> template <typename T>
class HostDeviceVector { class HostDeviceVector {
public: public:
explicit HostDeviceVector(size_t size = 0, T v = T(), explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1);
const GPUDistribution &distribution = GPUDistribution()); HostDeviceVector(std::initializer_list<T> init, int device = -1);
HostDeviceVector(std::initializer_list<T> init, explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
const GPUDistribution &distribution = GPUDistribution());
explicit HostDeviceVector(const std::vector<T>& init,
const GPUDistribution &distribution = GPUDistribution());
~HostDeviceVector(); ~HostDeviceVector();
HostDeviceVector(const HostDeviceVector<T>&); HostDeviceVector(const HostDeviceVector<T>&);
HostDeviceVector<T>& operator=(const HostDeviceVector<T>&); HostDeviceVector<T>& operator=(const HostDeviceVector<T>&);
size_t Size() const; size_t Size() const;
GPUSet Devices() const; int DeviceIdx() const;
const GPUDistribution& Distribution() const; common::Span<T> DeviceSpan();
common::Span<T> DeviceSpan(int device); common::Span<const T> ConstDeviceSpan() const;
common::Span<const T> ConstDeviceSpan(int device) const; common::Span<const T> DeviceSpan() const { return ConstDeviceSpan(); }
common::Span<const T> DeviceSpan(int device) const { return ConstDeviceSpan(device); } T* DevicePointer();
T* DevicePointer(int device); const T* ConstDevicePointer() const;
const T* ConstDevicePointer(int device) const; const T* DevicePointer() const { return ConstDevicePointer(); }
const T* DevicePointer(int device) const { return ConstDevicePointer(device); }
T* HostPointer() { return HostVector().data(); } T* HostPointer() { return HostVector().data(); }
const T* ConstHostPointer() const { return ConstHostVector().data(); } const T* ConstHostPointer() const { return ConstHostVector().data(); }
const T* HostPointer() const { return ConstHostPointer(); } const T* HostPointer() const { return ConstHostPointer(); }
size_t DeviceStart(int device) const; size_t DeviceSize() const;
size_t DeviceSize(int device) const;
// only define functions returning device_ptr // only define functions returning device_ptr
// if HostDeviceVector.h is included from a .cu file // if HostDeviceVector.h is included from a .cu file
#ifdef __CUDACC__ #ifdef __CUDACC__
thrust::device_ptr<T> tbegin(int device); // NOLINT thrust::device_ptr<T> tbegin(); // NOLINT
thrust::device_ptr<T> tend(int device); // NOLINT thrust::device_ptr<T> tend(); // NOLINT
thrust::device_ptr<const T> tcbegin(int device) const; // NOLINT thrust::device_ptr<const T> tcbegin() const; // NOLINT
thrust::device_ptr<const T> tcend(int device) const; // NOLINT thrust::device_ptr<const T> tcend() const; // NOLINT
thrust::device_ptr<const T> tbegin(int device) const { // NOLINT thrust::device_ptr<const T> tbegin() const { // NOLINT
return tcbegin(device); return tcbegin();
} }
thrust::device_ptr<const T> tend(int device) const { return tcend(device); } // NOLINT thrust::device_ptr<const T> tend() const { return tcend(); } // NOLINT
void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end);
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const;
#endif // __CUDACC__ #endif // __CUDACC__
void Fill(T v); void Fill(T v);
@ -251,18 +136,9 @@ class HostDeviceVector {
const std::vector<T>& HostVector() const {return ConstHostVector(); } const std::vector<T>& HostVector() const {return ConstHostVector(); }
bool HostCanAccess(GPUAccess access) const; bool HostCanAccess(GPUAccess access) const;
bool DeviceCanAccess(int device, GPUAccess access) const; bool DeviceCanAccess(GPUAccess access) const;
/*! void SetDevice(int device) const;
* \brief Specify memory distribution.
*/
void Shard(const GPUDistribution &distribution) const;
void Shard(GPUSet devices) const;
/*!
* \brief Change memory distribution.
*/
void Reshard(const GPUDistribution &distribution);
void Resize(size_t new_size, T v = T()); void Resize(size_t new_size, T v = T());
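
Putting the reduced interface together, a usage sketch assembled only from the declarations above; the ordinal and size are arbitrary.

    #include <vector>
    #include "host_device_vector.h"  // resides in src/common/ at this point in the tree

    void Example() {
      xgboost::HostDeviceVector<float> vec(std::vector<float>(4, 1.0f));  // host-resident
      vec.SetDevice(0);                           // bind to a single CUDA ordinal
      auto d_span = vec.DeviceSpan();             // lazy host-to-device copy, write access
      // ... launch kernels over d_span (CUDA build) ...
      const auto& h_vec = vec.ConstHostVector();  // lazy device-to-host copy for reading
      (void)d_span; (void)h_vec;
    }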

View File

@ -57,14 +57,10 @@ class Transform {
template <typename Functor> template <typename Functor>
struct Evaluator { struct Evaluator {
public: public:
Evaluator(Functor func, Range range, GPUSet devices, bool shard) : Evaluator(Functor func, Range range, int device, bool shard) :
func_(func), range_{std::move(range)}, func_(func), range_{std::move(range)},
shard_{shard}, shard_{shard},
distribution_{GPUDistribution::Block(devices)} {} device_{device} {}
Evaluator(Functor func, Range range, GPUDistribution dist,
bool shard) :
func_(func), range_{std::move(range)}, shard_{shard},
distribution_{std::move(dist)} {}
/*! /*!
* \brief Evaluate the functor with input pointers to HostDeviceVector. * \brief Evaluate the functor with input pointers to HostDeviceVector.
@ -74,7 +70,7 @@ class Transform {
*/ */
template <typename... HDV> template <typename... HDV>
void Eval(HDV... vectors) const { void Eval(HDV... vectors) const {
bool on_device = !distribution_.IsEmpty(); bool on_device = device_ >= 0;
if (on_device) { if (on_device) {
LaunchCUDA(func_, vectors...); LaunchCUDA(func_, vectors...);
@ -86,13 +82,13 @@ class Transform {
private: private:
// CUDA UnpackHDV // CUDA UnpackHDV
template <typename T> template <typename T>
Span<T> UnpackHDV(HostDeviceVector<T>* _vec, int _device) const { Span<T> UnpackHDVOnDevice(HostDeviceVector<T>* _vec) const {
auto span = _vec->DeviceSpan(_device); auto span = _vec->DeviceSpan();
return span; return span;
} }
template <typename T> template <typename T>
Span<T const> UnpackHDV(const HostDeviceVector<T>* _vec, int _device) const { Span<T const> UnpackHDVOnDevice(const HostDeviceVector<T>* _vec) const {
auto span = _vec->ConstDeviceSpan(_device); auto span = _vec->ConstDeviceSpan();
return span; return span;
} }
// CPU UnpackHDV // CPU UnpackHDV
@ -108,15 +104,15 @@ class Transform {
} }
// Recursive unpack for Shard. // Recursive unpack for Shard.
template <typename T> template <typename T>
void UnpackShard(GPUDistribution dist, const HostDeviceVector<T> *vector) const { void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
vector->Shard(dist); vector->SetDevice(device);
} }
template <typename Head, typename... Rest> template <typename Head, typename... Rest>
void UnpackShard(GPUDistribution dist, void UnpackShard(int device,
const HostDeviceVector<Head> *_vector, const HostDeviceVector<Head> *_vector,
const HostDeviceVector<Rest> *... _vectors) const { const HostDeviceVector<Rest> *... _vectors) const {
_vector->Shard(dist); _vector->SetDevice(device);
UnpackShard(dist, _vectors...); UnpackShard(device, _vectors...);
} }
#if defined(__CUDACC__) #if defined(__CUDACC__)
@ -124,28 +120,20 @@ class Transform {
typename... HDV> typename... HDV>
void LaunchCUDA(Functor _func, HDV*... _vectors) const { void LaunchCUDA(Functor _func, HDV*... _vectors) const {
if (shard_) if (shard_)
UnpackShard(distribution_, _vectors...); UnpackShard(device_, _vectors...);
GPUSet devices = distribution_.Devices();
size_t range_size = *range_.end() - *range_.begin(); size_t range_size = *range_.end() - *range_.begin();
// Extract index to deal with possible old OpenMP. // Extract index to deal with possible old OpenMP.
size_t device_beg = *(devices.begin()); // This deals with situation like multi-class setting where
size_t device_end = *(devices.end()); // granularity is used in data vector.
#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1) size_t shard_size = range_size;
for (omp_ulong device = device_beg; device < device_end; ++device) { // NOLINT Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
// Ignore other attributes of GPUDistribution for splitting index. dh::safe_cuda(cudaSetDevice(device_));
// This deals with situation like multi-class setting where const int GRID_SIZE =
// granularity is used in data vector. static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
size_t shard_size = GPUDistribution::Block(devices).ShardSize( detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
range_size, devices.Index(device)); _func, shard_range, UnpackHDVOnDevice(_vectors)...);
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
dh::safe_cuda(cudaSetDevice(device));
const int GRID_SIZE =
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
_func, shard_range, UnpackHDV(_vectors, device)...);
}
} }
#else #else
/*! \brief Dummy function defined when compiling for CPU. */ /*! \brief Dummy function defined when compiling for CPU. */
@ -172,7 +160,7 @@ class Transform {
Range range_; Range range_;
/*! \brief Whether sharding for vectors is required. */ /*! \brief Whether sharding for vectors is required. */
bool shard_; bool shard_;
GPUDistribution distribution_; int device_;
}; };
public: public:
@ -191,15 +179,9 @@ class Transform {
*/ */
template <typename Functor> template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range, static Evaluator<Functor> Init(Functor func, Range const range,
GPUSet const devices, int device,
bool const shard = true) { bool const shard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(devices), shard}; return Evaluator<Functor> {func, std::move(range), device, shard};
}
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUDistribution const dist,
bool const shard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(dist), shard};
} }
}; };
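Since Transform's Evaluator now carries a single device ordinal, launches follow the same gpu_id convention as the rest of this commit: -1 runs on the CPU, anything else names the one GPU. A sketch of a call site, with illustrative include paths for the internal headers and a functor shaped like the ones in the objectives below:

#include <cstddef>
#include <cstdint>
#include <xgboost/base.h>
#include <xgboost/host_device_vector.h>
#include "../common/transform.h"   // internal header; path shown for illustration only

void HalveInPlace(xgboost::HostDeviceVector<float>* io, int device) {
  xgboost::common::Transform<>::Init(
      [] XGBOOST_DEVICE(std::size_t _idx, xgboost::common::Span<float> _v) {
        _v[_idx] *= 0.5f;                    // runs on the CPU or on the selected GPU
      },
      xgboost::common::Range{0, static_cast<int64_t>(io->Size())},
      device)                                // was a GPUSet / GPUDistribution before
      .Eval(io);
}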

View File

@ -78,9 +78,9 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
} else { } else {
LOG(FATAL) << "Unknown metainfo: " << key; LOG(FATAL) << "Unknown metainfo: " << key;
} }
dst->Reshard(GPUDistribution(GPUSet::Range(ptr_device, 1))); dst->SetDevice(ptr_device);
dst->Resize(length); dst->Resize(length);
auto p_dst = thrust::device_pointer_cast(dst->DevicePointer(0)); auto p_dst = thrust::device_pointer_cast(dst->DevicePointer());
thrust::copy(p_src, p_src + length, p_dst); thrust::copy(p_src, p_src + length, p_dst);
} }
} // namespace xgboost } // namespace xgboost
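Written out as a standalone helper, the copy path above becomes: bind the destination vector to the device that owns the source pointer, size it, then copy with thrust. This is a sketch of the flow in the hunk, not library code; src, length, and ptr_device are assumed inputs, and the caller is assumed to have already selected the matching CUDA device:

#include <cstddef>
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <xgboost/host_device_vector.h>

void CopyDeviceColumn(int ptr_device, const float* src, std::size_t length,
                      xgboost::HostDeviceVector<float>* dst) {
  dst->SetDevice(ptr_device);          // one device per vector; no GPUDistribution
  dst->Resize(length);
  auto p_src = thrust::device_pointer_cast(src);
  auto p_dst = thrust::device_pointer_cast(dst->DevicePointer());
  thrust::copy(p_src, p_src + length, p_dst);
}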

View File

@ -77,16 +77,14 @@ void SimpleCSRSource::FromDeviceColumnar(std::vector<Columnar> cols) {
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device));
GPUSet devices = GPUSet::Range(device, 1); page_.offset.SetDevice(device);
page_.offset.Reshard(GPUDistribution(devices));
page_.offset.Resize(info.num_row_ + 1); page_.offset.Resize(info.num_row_ + 1);
page_.data.Reshard(GPUDistribution(devices)); page_.data.SetDevice(device);
page_.data.Resize(info.num_nonzero_); page_.data.Resize(info.num_nonzero_);
auto s_data = page_.data.DeviceSpan(device); auto s_data = page_.data.DeviceSpan();
auto s_offsets = page_.offset.DeviceSpan(device); auto s_offsets = page_.offset.DeviceSpan();
CHECK_EQ(s_offsets.size(), n_rows + 1); CHECK_EQ(s_offsets.size(), n_rows + 1);
int32_t constexpr kThreads = 256; int32_t constexpr kThreads = 256;

View File

@ -182,9 +182,9 @@ void GBTree::DoBoost(DMatrix* p_fmat,
CHECK_EQ(in_gpair->Size() % ngroup, 0U) CHECK_EQ(in_gpair->Size() % ngroup, 0U)
<< "must have exactly ngroup*nrow gpairs"; << "must have exactly ngroup*nrow gpairs";
// TODO(canonizer): perform this on GPU if HostDeviceVector has device set. // TODO(canonizer): perform this on GPU if HostDeviceVector has device set.
HostDeviceVector<GradientPair> tmp HostDeviceVector<GradientPair> tmp(in_gpair->Size() / ngroup,
(in_gpair->Size() / ngroup, GradientPair(), GradientPair(),
GPUDistribution::Block(in_gpair->Distribution().Devices())); in_gpair->DeviceIdx());
const auto& gpair_h = in_gpair->ConstHostVector(); const auto& gpair_h = in_gpair->ConstHostVector();
auto nsize = static_cast<bst_omp_uint>(tmp.Size()); auto nsize = static_cast<bst_omp_uint>(tmp.Size());
for (int gid = 0; gid < ngroup; ++gid) { for (int gid = 0; gid < ngroup; ++gid) {
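The temporary above uses the (size, initial value, device) constructor together with DeviceIdx(), which returns -1 when the source vector lives only on the host. Sketch:

#include <cstddef>
#include <xgboost/base.h>
#include <xgboost/host_device_vector.h>

void MakeScratchLike(const xgboost::HostDeviceVector<xgboost::GradientPair>& in_gpair,
                     std::size_t n_per_group) {
  // Scratch buffer placed on the same device as the incoming gradients.
  xgboost::HostDeviceVector<xgboost::GradientPair> tmp(
      n_per_group, xgboost::GradientPair(), in_gpair.DeviceIdx());
  (void)tmp;
}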

View File

@ -237,14 +237,13 @@ class LearnerImpl : public Learner {
std::vector<std::pair<std::string, std::string> > attr; std::vector<std::pair<std::string, std::string> > attr;
fi->Read(&attr); fi->Read(&attr);
for (auto& kv : attr) { for (auto& kv : attr) {
// Load `predictor`, `n_gpus`, `gpu_id` parameters from extra attributes // Load `predictor`, `gpu_id` parameters from extra attributes
const std::string prefix = "SAVED_PARAM_"; const std::string prefix = "SAVED_PARAM_";
if (kv.first.find(prefix) == 0) { if (kv.first.find(prefix) == 0) {
const std::string saved_param = kv.first.substr(prefix.length()); const std::string saved_param = kv.first.substr(prefix.length());
bool is_gpu_predictor = saved_param == "predictor" && kv.second == "gpu_predictor"; bool is_gpu_predictor = saved_param == "predictor" && kv.second == "gpu_predictor";
#ifdef XGBOOST_USE_CUDA #ifdef XGBOOST_USE_CUDA
if (saved_param == "predictor" || saved_param == "n_gpus" if (saved_param == "predictor" || saved_param == "gpu_id") {
|| saved_param == "gpu_id") {
cfg_[saved_param] = kv.second; cfg_[saved_param] = kv.second;
LOG(INFO) LOG(INFO)
<< "Parameter '" << saved_param << "' has been recovered from " << "Parameter '" << saved_param << "' has been recovered from "
@ -266,7 +265,7 @@ class LearnerImpl : public Learner {
} }
#endif // XGBOOST_USE_CUDA #endif // XGBOOST_USE_CUDA
// NO visible GPU in current environment // NO visible GPU in current environment
if (is_gpu_predictor && GPUSet::AllVisible().Size() == 0) { if (is_gpu_predictor && common::AllVisibleGPUs() == 0) {
cfg_["predictor"] = "cpu_predictor"; cfg_["predictor"] = "cpu_predictor";
kv.second = "cpu_predictor"; kv.second = "cpu_predictor";
LOG(INFO) << "Switch gpu_predictor to cpu_predictor."; LOG(INFO) << "Switch gpu_predictor to cpu_predictor.";
@ -294,7 +293,9 @@ class LearnerImpl : public Learner {
auto n = tparam_.__DICT__(); auto n = tparam_.__DICT__();
cfg_.insert(n.cbegin(), n.cend()); cfg_.insert(n.cbegin(), n.cend());
gbm_->Configure({cfg_.cbegin(), cfg_.cend()}); Args args = {cfg_.cbegin(), cfg_.cend()};
generic_param_.InitAllowUnknown(args);
gbm_->Configure(args);
obj_->Configure({cfg_.begin(), cfg_.end()}); obj_->Configure({cfg_.begin(), cfg_.end()});
for (auto& p_metric : metrics_) { for (auto& p_metric : metrics_) {
@ -331,9 +332,8 @@ class LearnerImpl : public Learner {
} }
} }
{ {
// Write `predictor`, `n_gpus`, `gpu_id` parameters as extra attributes // Write `predictor`, `gpu_id` parameters as extra attributes
for (const auto& key : std::vector<std::string>{ for (const auto& key : std::vector<std::string>{"predictor", "gpu_id"}) {
"predictor", "n_gpus", "gpu_id"}) {
auto it = cfg_.find(key); auto it = cfg_.find(key);
if (it != cfg_.end()) { if (it != cfg_.end()) {
mparam.contain_extra_attrs = 1; mparam.contain_extra_attrs = 1;
@ -581,13 +581,8 @@ class LearnerImpl : public Learner {
gbm_->Configure(args); gbm_->Configure(args);
if (this->gbm_->UseGPU()) { if (this->gbm_->UseGPU()) {
if (cfg_.find("n_gpus") == cfg_.cend()) { if (cfg_.find("gpu_id") == cfg_.cend()) {
generic_param_.n_gpus = 1; generic_param_.gpu_id = 0;
}
if (generic_param_.n_gpus != 1) {
LOG(FATAL) << "Single process multi-GPU training is no longer supported. "
"Please switch to distributed GPU training with one process per GPU. "
"This can be done using Dask or Spark.";
} }
} }
} }

View File

@ -19,12 +19,6 @@ namespace linear {
DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate); DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate);
void RescaleIndices(int device_idx, size_t ridx_begin,
common::Span<xgboost::Entry> data) {
dh::LaunchN(device_idx, data.size(),
[=] __device__(size_t idx) { data[idx].index -= ridx_begin; });
}
class DeviceShard { class DeviceShard {
int device_id_; int device_id_;
dh::BulkAllocator ba_; dh::BulkAllocator ba_;
@ -32,18 +26,16 @@ class DeviceShard {
common::Span<xgboost::Entry> data_; common::Span<xgboost::Entry> data_;
common::Span<GradientPair> gpair_; common::Span<GradientPair> gpair_;
dh::CubMemory temp_; dh::CubMemory temp_;
size_t ridx_begin_; size_t shard_size_;
size_t ridx_end_;
public: public:
DeviceShard(int device_id, DeviceShard(int device_id,
const SparsePage &batch, // column batch const SparsePage &batch, // column batch
bst_uint row_begin, bst_uint row_end, bst_uint shard_size,
const LinearTrainParam &param, const LinearTrainParam &param,
const gbm::GBLinearModelParam &model_param) const gbm::GBLinearModelParam &model_param)
: device_id_(device_id), : device_id_(device_id),
ridx_begin_(row_begin), shard_size_(shard_size) {
ridx_end_(row_end) {
if ( IsEmpty() ) { return; } if ( IsEmpty() ) { return; }
dh::safe_cuda(cudaSetDevice(device_id_)); dh::safe_cuda(cudaSetDevice(device_id_));
// The begin and end indices for the section of each column associated with // The begin and end indices for the section of each column associated with
@ -51,25 +43,25 @@ class DeviceShard {
std::vector<std::pair<bst_uint, bst_uint>> column_segments; std::vector<std::pair<bst_uint, bst_uint>> column_segments;
row_ptr_ = {0}; row_ptr_ = {0};
// iterate through columns // iterate through columns
for (auto fidx = 0; fidx < batch.Size(); fidx++) { for (size_t fidx = 0; fidx < batch.Size(); fidx++) {
common::Span<Entry const> col = batch[fidx]; common::Span<Entry const> col = batch[fidx];
auto cmp = [](Entry e1, Entry e2) { auto cmp = [](Entry e1, Entry e2) {
return e1.index < e2.index; return e1.index < e2.index;
}; };
auto column_begin = auto column_begin =
std::lower_bound(col.cbegin(), col.cend(), std::lower_bound(col.cbegin(), col.cend(),
xgboost::Entry(row_begin, 0.0f), cmp); xgboost::Entry(0, 0.0f), cmp);
auto column_end = auto column_end =
std::lower_bound(col.cbegin(), col.cend(), std::lower_bound(col.cbegin(), col.cend(),
xgboost::Entry(row_end, 0.0f), cmp); xgboost::Entry(shard_size_, 0.0f), cmp);
column_segments.emplace_back( column_segments.emplace_back(
std::make_pair(column_begin - col.cbegin(), column_end - col.cbegin())); std::make_pair(column_begin - col.cbegin(), column_end - col.cbegin()));
row_ptr_.push_back(row_ptr_.back() + (column_end - column_begin)); row_ptr_.push_back(row_ptr_.back() + (column_end - column_begin));
} }
ba_.Allocate(device_id_, &data_, row_ptr_.back(), &gpair_, ba_.Allocate(device_id_, &data_, row_ptr_.back(), &gpair_,
(row_end - row_begin) * model_param.num_output_group); shard_size_ * model_param.num_output_group);
for (int fidx = 0; fidx < batch.Size(); fidx++) { for (size_t fidx = 0; fidx < batch.Size(); fidx++) {
auto col = batch[fidx]; auto col = batch[fidx];
auto seg = column_segments[fidx]; auto seg = column_segments[fidx];
dh::safe_cuda(cudaMemcpy( dh::safe_cuda(cudaMemcpy(
@ -77,23 +69,21 @@ class DeviceShard {
col.data() + seg.first, col.data() + seg.first,
sizeof(Entry) * (seg.second - seg.first), cudaMemcpyHostToDevice)); sizeof(Entry) * (seg.second - seg.first), cudaMemcpyHostToDevice));
} }
// Rescale indices with respect to current shard
RescaleIndices(device_id_, ridx_begin_, data_);
} }
~DeviceShard() { ~DeviceShard() { // NOLINT
dh::safe_cuda(cudaSetDevice(device_id_)); dh::safe_cuda(cudaSetDevice(device_id_));
} }
bool IsEmpty() { bool IsEmpty() {
return (ridx_end_ - ridx_begin_) == 0; return shard_size_ == 0;
} }
void UpdateGpair(const std::vector<GradientPair> &host_gpair, void UpdateGpair(const std::vector<GradientPair> &host_gpair,
const gbm::GBLinearModelParam &model_param) { const gbm::GBLinearModelParam &model_param) {
dh::safe_cuda(cudaMemcpyAsync( dh::safe_cuda(cudaMemcpyAsync(
gpair_.data(), gpair_.data(),
host_gpair.data() + ridx_begin_ * model_param.num_output_group, host_gpair.data(),
gpair_.size() * sizeof(GradientPair), cudaMemcpyHostToDevice)); gpair_.size() * sizeof(GradientPair), cudaMemcpyHostToDevice));
} }
@ -107,13 +97,13 @@ class DeviceShard {
counting, f); counting, f);
auto perm = thrust::make_permutation_iterator(gpair_.data(), skip); auto perm = thrust::make_permutation_iterator(gpair_.data(), skip);
return dh::SumReduction(temp_, perm, ridx_end_ - ridx_begin_); return dh::SumReduction(temp_, perm, shard_size_);
} }
void UpdateBiasResidual(float dbias, int group_idx, int num_groups) { void UpdateBiasResidual(float dbias, int group_idx, int num_groups) {
if (dbias == 0.0f) return; if (dbias == 0.0f) return;
auto d_gpair = gpair_; auto d_gpair = gpair_;
dh::LaunchN(device_id_, ridx_end_ - ridx_begin_, [=] __device__(size_t idx) { dh::LaunchN(device_id_, shard_size_, [=] __device__(size_t idx) {
auto &g = d_gpair[idx * num_groups + group_idx]; auto &g = d_gpair[idx * num_groups + group_idx];
g += GradientPair(g.GetHess() * dbias, 0); g += GradientPair(g.GetHess() * dbias, 0);
}); });
@ -154,7 +144,7 @@ class DeviceShard {
* \brief Coordinate descent algorithm that updates one feature per iteration * \brief Coordinate descent algorithm that updates one feature per iteration
*/ */
class GPUCoordinateUpdater : public LinearUpdater { class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
public: public:
// set training parameter // set training parameter
void Configure(Args const& args) override { void Configure(Args const& args) override {
@ -165,37 +155,23 @@ class GPUCoordinateUpdater : public LinearUpdater {
void LazyInitShards(DMatrix *p_fmat, void LazyInitShards(DMatrix *p_fmat,
const gbm::GBLinearModelParam &model_param) { const gbm::GBLinearModelParam &model_param) {
if (!shards_.empty()) return; if (shard_) return;
dist_ = GPUDistribution::Block(GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus, device_ = learner_param_->gpu_id;
p_fmat->Info().num_row_));
auto devices = dist_.Devices();
size_t n_devices = static_cast<size_t>(devices.Size()); auto num_row = static_cast<size_t>(p_fmat->Info().num_row_);
size_t row_begin = 0;
size_t num_row = static_cast<size_t>(p_fmat->Info().num_row_);
// Partition input matrix into row segments // Partition input matrix into row segments
std::vector<size_t> row_segments; std::vector<size_t> row_segments;
row_segments.push_back(0); row_segments.push_back(0);
for (int d_idx = 0; d_idx < n_devices; ++d_idx) { size_t shard_size = num_row;
size_t shard_size = dist_.ShardSize(num_row, d_idx); row_segments.push_back(shard_size);
size_t row_end = row_begin + shard_size;
row_segments.push_back(row_end);
row_begin = row_end;
}
CHECK(p_fmat->SingleColBlock()); CHECK(p_fmat->SingleColBlock());
SparsePage const& batch = *(p_fmat->GetBatches<CSCPage>().begin()); SparsePage const& batch = *(p_fmat->GetBatches<CSCPage>().begin());
shards_.resize(n_devices); // Create device shard
// Create device shards shard_.reset(new DeviceShard(device_, batch, shard_size, tparam_, model_param));
dh::ExecuteIndexShards(&shards_,
[&](int i, std::unique_ptr<DeviceShard>& shard) {
shard = std::unique_ptr<DeviceShard>(
new DeviceShard(devices.DeviceId(i), batch, row_segments[i],
row_segments[i + 1], tparam_, model_param));
});
} }
void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat, void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
@ -208,11 +184,9 @@ class GPUCoordinateUpdater : public LinearUpdater {
monitor_.Start("UpdateGpair"); monitor_.Start("UpdateGpair");
auto &in_gpair_host = in_gpair->ConstHostVector(); auto &in_gpair_host = in_gpair->ConstHostVector();
// Update gpair // Update gpair
dh::ExecuteIndexShards(&shards_, [&](int idx, std::unique_ptr<DeviceShard>& shard) { if (shard_) {
if (!shard->IsEmpty()) { shard_->UpdateGpair(in_gpair_host, model->param);
shard->UpdateGpair(in_gpair_host, model->param); }
}
});
monitor_.Stop("UpdateGpair"); monitor_.Stop("UpdateGpair");
monitor_.Start("UpdateBias"); monitor_.Start("UpdateBias");
@ -237,32 +211,21 @@ class GPUCoordinateUpdater : public LinearUpdater {
} }
void UpdateBias(DMatrix *p_fmat, gbm::GBLinearModel *model) { void UpdateBias(DMatrix *p_fmat, gbm::GBLinearModel *model) {
for (int group_idx = 0; group_idx < model->param.num_output_group; for (int group_idx = 0; group_idx < model->param.num_output_group; ++group_idx) {
++group_idx) {
// Get gradient // Get gradient
auto grad = dh::ReduceShards<GradientPair>( auto grad = GradientPair(0, 0);
&shards_, [&](std::unique_ptr<DeviceShard> &shard) { if (shard_) {
if (!shard->IsEmpty()) { grad = shard_->GetBiasGradient(group_idx, model->param.num_output_group);
GradientPair result = }
shard->GetBiasGradient(group_idx,
model->param.num_output_group);
return result;
}
return GradientPair(0, 0);
});
auto dbias = static_cast<float>( auto dbias = static_cast<float>(
tparam_.learning_rate * tparam_.learning_rate *
CoordinateDeltaBias(grad.GetGrad(), grad.GetHess())); CoordinateDeltaBias(grad.GetGrad(), grad.GetHess()));
model->bias()[group_idx] += dbias; model->bias()[group_idx] += dbias;
// Update residual // Update residual
dh::ExecuteIndexShards(&shards_, [&](int idx, std::unique_ptr<DeviceShard>& shard) { if (shard_) {
if (!shard->IsEmpty()) { shard_->UpdateBiasResidual(dbias, group_idx, model->param.num_output_group);
shard->UpdateBiasResidual(dbias, group_idx, }
model->param.num_output_group);
}
});
} }
} }
@ -271,38 +234,30 @@ class GPUCoordinateUpdater : public LinearUpdater {
gbm::GBLinearModel *model) { gbm::GBLinearModel *model) {
bst_float &w = (*model)[fidx][group_idx]; bst_float &w = (*model)[fidx][group_idx];
// Get gradient // Get gradient
auto grad = dh::ReduceShards<GradientPair>( auto grad = GradientPair(0, 0);
&shards_, [&](std::unique_ptr<DeviceShard> &shard) { if (shard_) {
if (!shard->IsEmpty()) { grad = shard_->GetGradient(group_idx, model->param.num_output_group, fidx);
return shard->GetGradient(group_idx, model->param.num_output_group, }
fidx);
}
return GradientPair(0, 0);
});
auto dw = static_cast<float>(tparam_.learning_rate * auto dw = static_cast<float>(tparam_.learning_rate *
CoordinateDelta(grad.GetGrad(), grad.GetHess(), CoordinateDelta(grad.GetGrad(), grad.GetHess(),
w, tparam_.reg_alpha_denorm, w, tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm)); tparam_.reg_lambda_denorm));
w += dw; w += dw;
dh::ExecuteIndexShards(&shards_, [&](int idx, if (shard_) {
std::unique_ptr<DeviceShard> &shard) { shard_->UpdateResidual(dw, group_idx, model->param.num_output_group, fidx);
if (!shard->IsEmpty()) { }
shard->UpdateResidual(dw, group_idx, model->param.num_output_group, fidx);
}
});
} }
private: private:
// training parameter // training parameter
LinearTrainParam tparam_; LinearTrainParam tparam_;
CoordinateParam coord_param_; CoordinateParam coord_param_;
GPUDistribution dist_; int device_{};
std::unique_ptr<FeatureSelector> selector_; std::unique_ptr<FeatureSelector> selector_;
common::Monitor monitor_; common::Monitor monitor_;
std::vector<std::unique_ptr<DeviceShard>> shards_; std::unique_ptr<DeviceShard> shard_{nullptr};
}; };
XGBOOST_REGISTER_LINEAR_UPDATER(GPUCoordinateUpdater, "gpu_coord_descent") XGBOOST_REGISTER_LINEAR_UPDATER(GPUCoordinateUpdater, "gpu_coord_descent")

View File

@ -30,8 +30,7 @@ DMLC_REGISTRY_FILE_TAG(elementwise_metric);
template <typename EvalRow> template <typename EvalRow>
class ElementWiseMetricsReduction { class ElementWiseMetricsReduction {
public: public:
explicit ElementWiseMetricsReduction(EvalRow policy) : explicit ElementWiseMetricsReduction(EvalRow policy) : policy_(std::move(policy)) {}
policy_(std::move(policy)) {}
PackedReduceResult CpuReduceMetrics( PackedReduceResult CpuReduceMetrics(
const HostDeviceVector<bst_float>& weights, const HostDeviceVector<bst_float>& weights,
@ -59,34 +58,31 @@ class ElementWiseMetricsReduction {
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
~ElementWiseMetricsReduction() { ~ElementWiseMetricsReduction() {
for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) { if (device_ >= 0) {
dh::safe_cuda(cudaSetDevice(id)); dh::safe_cuda(cudaSetDevice(device_));
size_t index = devices_.Index(id); allocator_.Free();
allocators_.at(index).Free();
} }
} }
PackedReduceResult DeviceReduceMetrics( PackedReduceResult DeviceReduceMetrics(
GPUSet::GpuIdType device_id,
size_t device_index,
const HostDeviceVector<bst_float>& weights, const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels, const HostDeviceVector<bst_float>& labels,
const HostDeviceVector<bst_float>& preds) { const HostDeviceVector<bst_float>& preds) {
size_t n_data = preds.DeviceSize(device_id); size_t n_data = preds.DeviceSize();
thrust::counting_iterator<size_t> begin(0); thrust::counting_iterator<size_t> begin(0);
thrust::counting_iterator<size_t> end = begin + n_data; thrust::counting_iterator<size_t> end = begin + n_data;
auto s_label = labels.DeviceSpan(device_id); auto s_label = labels.DeviceSpan();
auto s_preds = preds.DeviceSpan(device_id); auto s_preds = preds.DeviceSpan();
auto s_weights = weights.DeviceSpan(device_id); auto s_weights = weights.DeviceSpan();
bool const is_null_weight = weights.Size() == 0; bool const is_null_weight = weights.Size() == 0;
auto d_policy = policy_; auto d_policy = policy_;
PackedReduceResult result = thrust::transform_reduce( PackedReduceResult result = thrust::transform_reduce(
thrust::cuda::par(allocators_.at(device_index)), thrust::cuda::par(allocator_),
begin, end, begin, end,
[=] XGBOOST_DEVICE(size_t idx) { [=] XGBOOST_DEVICE(size_t idx) {
bst_float weight = is_null_weight ? 1.0f : s_weights[idx]; bst_float weight = is_null_weight ? 1.0f : s_weights[idx];
@ -105,37 +101,24 @@ class ElementWiseMetricsReduction {
PackedReduceResult Reduce( PackedReduceResult Reduce(
const GenericParameter &tparam, const GenericParameter &tparam,
GPUSet devices, int device,
const HostDeviceVector<bst_float>& weights, const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels, const HostDeviceVector<bst_float>& labels,
const HostDeviceVector<bst_float>& preds) { const HostDeviceVector<bst_float>& preds) {
PackedReduceResult result; PackedReduceResult result;
if (devices.IsEmpty()) { if (device < 0) {
result = CpuReduceMetrics(weights, labels, preds); result = CpuReduceMetrics(weights, labels, preds);
} }
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
else { // NOLINT else { // NOLINT
if (allocators_.empty()) { device_ = device;
devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus); preds.SetDevice(device_);
allocators_.resize(devices_.Size()); labels.SetDevice(device_);
} weights.SetDevice(device_);
preds.Shard(devices);
labels.Shard(devices);
weights.Shard(devices);
std::vector<PackedReduceResult> res_per_device(devices.Size());
#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1) dh::safe_cuda(cudaSetDevice(device_));
for (GPUSet::GpuIdType id = *devices.begin(); id < *devices.end(); ++id) { result = DeviceReduceMetrics(weights, labels, preds);
dh::safe_cuda(cudaSetDevice(id));
size_t index = devices.Index(id);
res_per_device.at(index) =
DeviceReduceMetrics(id, index, weights, labels, preds);
}
for (auto const& res : res_per_device) {
result += res;
}
} }
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA)
return result; return result;
@ -144,8 +127,8 @@ class ElementWiseMetricsReduction {
private: private:
EvalRow policy_; EvalRow policy_;
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
GPUSet devices_; int device_{-1};
std::vector<dh::CubMemory> allocators_; dh::CubMemory allocator_;
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA)
}; };
@ -345,11 +328,10 @@ struct EvalEWiseBase : public Metric {
<< "label and prediction size not match, " << "label and prediction size not match, "
<< "hint: use merror or mlogloss for multi-class classification"; << "hint: use merror or mlogloss for multi-class classification";
const auto ndata = static_cast<omp_ulong>(info.labels_.Size()); const auto ndata = static_cast<omp_ulong>(info.labels_.Size());
// Dealing with ndata < n_gpus. int device = tparam_->gpu_id;
GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata);
auto result = auto result =
reducer_.Reduce(*tparam_, devices, info.weights_, info.labels_, preds); reducer_.Reduce(*tparam_, device, info.weights_, info.labels_, preds);
double dat[2] { result.Residue(), result.Weights() }; double dat[2] { result.Residue(), result.Weights() };
if (distributed) { if (distributed) {
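The shape above recurs throughout this commit: gpu_id < 0 selects the CPU path, otherwise the single configured device is used directly, with one allocator instead of a vector of them. A reduced sketch in which the reduction body is only a stand-in for the real metric kernels:

#include <cuda_runtime_api.h>
#include <numeric>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>
#include <xgboost/host_device_vector.h>

double SumPreds(int device, const xgboost::HostDeviceVector<float>& preds) {
  if (device < 0) {
    // CPU path, as in CpuReduceMetrics().
    const auto& h = preds.ConstHostVector();
    return std::accumulate(h.cbegin(), h.cend(), 0.0);
  }
  // Single-GPU path, as in DeviceReduceMetrics(): no per-device loop.
  preds.SetDevice(device);
  cudaSetDevice(device);               // error checking omitted in this sketch
  auto d = preds.ConstDeviceSpan();
  return thrust::reduce(thrust::device, d.data(), d.data() + d.size(), 0.0);
}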

View File

@ -74,35 +74,32 @@ class MultiClassMetricsReduction {
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
~MultiClassMetricsReduction() { ~MultiClassMetricsReduction() {
for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) { if (device_ >= 0) {
dh::safe_cuda(cudaSetDevice(id)); dh::safe_cuda(cudaSetDevice(device_));
size_t index = devices_.Index(id); allocator_.Free();
allocators_.at(index).Free();
} }
} }
PackedReduceResult DeviceReduceMetrics( PackedReduceResult DeviceReduceMetrics(
GPUSet::GpuIdType device_id,
size_t device_index,
const HostDeviceVector<bst_float>& weights, const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels, const HostDeviceVector<bst_float>& labels,
const HostDeviceVector<bst_float>& preds, const HostDeviceVector<bst_float>& preds,
const size_t n_class) { const size_t n_class) {
size_t n_data = labels.DeviceSize(device_id); size_t n_data = labels.DeviceSize();
thrust::counting_iterator<size_t> begin(0); thrust::counting_iterator<size_t> begin(0);
thrust::counting_iterator<size_t> end = begin + n_data; thrust::counting_iterator<size_t> end = begin + n_data;
auto s_labels = labels.DeviceSpan(device_id); auto s_labels = labels.DeviceSpan();
auto s_preds = preds.DeviceSpan(device_id); auto s_preds = preds.DeviceSpan();
auto s_weights = weights.DeviceSpan(device_id); auto s_weights = weights.DeviceSpan();
bool const is_null_weight = weights.Size() == 0; bool const is_null_weight = weights.Size() == 0;
auto s_label_error = label_error_.GetSpan<int32_t>(1); auto s_label_error = label_error_.GetSpan<int32_t>(1);
s_label_error[0] = 0; s_label_error[0] = 0;
PackedReduceResult result = thrust::transform_reduce( PackedReduceResult result = thrust::transform_reduce(
thrust::cuda::par(allocators_.at(device_index)), thrust::cuda::par(allocator_),
begin, end, begin, end,
[=] XGBOOST_DEVICE(size_t idx) { [=] XGBOOST_DEVICE(size_t idx) {
bst_float weight = is_null_weight ? 1.0f : s_weights[idx]; bst_float weight = is_null_weight ? 1.0f : s_weights[idx];
@ -127,38 +124,25 @@ class MultiClassMetricsReduction {
PackedReduceResult Reduce( PackedReduceResult Reduce(
const GenericParameter &tparam, const GenericParameter &tparam,
GPUSet devices, int device,
size_t n_class, size_t n_class,
const HostDeviceVector<bst_float>& weights, const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels, const HostDeviceVector<bst_float>& labels,
const HostDeviceVector<bst_float>& preds) { const HostDeviceVector<bst_float>& preds) {
PackedReduceResult result; PackedReduceResult result;
if (devices.IsEmpty()) { if (device < 0) {
result = CpuReduceMetrics(weights, labels, preds, n_class); result = CpuReduceMetrics(weights, labels, preds, n_class);
} }
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
else { // NOLINT else { // NOLINT
if (allocators_.empty()) { device_ = tparam.gpu_id;
devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus); preds.SetDevice(device_);
allocators_.resize(devices_.Size()); labels.SetDevice(device_);
} weights.SetDevice(device_);
preds.Shard(GPUDistribution::Granular(devices, n_class));
labels.Shard(devices);
weights.Shard(devices);
std::vector<PackedReduceResult> res_per_device(devices.Size());
#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1) dh::safe_cuda(cudaSetDevice(device_));
for (GPUSet::GpuIdType id = *devices.begin(); id < *devices.end(); ++id) { result = DeviceReduceMetrics(weights, labels, preds, n_class);
dh::safe_cuda(cudaSetDevice(id));
size_t index = devices.Index(id);
res_per_device.at(index) =
DeviceReduceMetrics(id, index, weights, labels, preds, n_class);
}
for (auto const& res : res_per_device) {
result += res;
}
} }
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA)
return result; return result;
@ -167,8 +151,8 @@ class MultiClassMetricsReduction {
private: private:
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
dh::PinnedMemory label_error_; dh::PinnedMemory label_error_;
GPUSet devices_; int device_{-1};
std::vector<dh::CubMemory> allocators_; dh::CubMemory allocator_;
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA)
}; };
@ -190,8 +174,8 @@ struct EvalMClassBase : public Metric {
<< " use logloss for binary classification"; << " use logloss for binary classification";
const auto ndata = static_cast<bst_omp_uint>(info.labels_.Size()); const auto ndata = static_cast<bst_omp_uint>(info.labels_.Size());
GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata); int device = tparam_->gpu_id;
auto result = reducer_.Reduce(*tparam_, devices, nclass, info.weights_, info.labels_, preds); auto result = reducer_.Reduce(*tparam_, device, nclass, info.weights_, info.labels_, preds);
double dat[2] { result.Residue(), result.Weights() }; double dat[2] { result.Residue(), result.Weights() };
if (distributed) { if (distributed) {

View File

@ -58,7 +58,7 @@ class HingeObj : public ObjFunction {
_out_gpair[_idx] = GradientPair(g, h); _out_gpair[_idx] = GradientPair(g, h);
}, },
common::Range{0, static_cast<int64_t>(ndata)}, common::Range{0, static_cast<int64_t>(ndata)},
GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata)).Eval( tparam_->gpu_id).Eval(
out_gpair, &preds, &info.labels_, &info.weights_); out_gpair, &preds, &info.labels_, &info.weights_);
} }
@ -68,7 +68,7 @@ class HingeObj : public ObjFunction {
_preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0; _preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0;
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size()), 1}, common::Range{0, static_cast<int64_t>(io_preds->Size()), 1},
GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size())) tparam_->gpu_id)
.Eval(io_preds); .Eval(io_preds);
} }

View File

@ -59,14 +59,14 @@ class SoftmaxMultiClassObj : public ObjFunction {
const int nclass = param_.num_class; const int nclass = param_.num_class;
const auto ndata = static_cast<int64_t>(preds.Size() / nclass); const auto ndata = static_cast<int64_t>(preds.Size() / nclass);
auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size()); auto device = tparam_->gpu_id;
out_gpair->Shard(GPUDistribution::Granular(devices, nclass)); out_gpair->SetDevice(device);
info.labels_.Shard(GPUDistribution::Block(devices)); info.labels_.SetDevice(device);
info.weights_.Shard(GPUDistribution::Block(devices)); info.weights_.SetDevice(device);
preds.Shard(GPUDistribution::Granular(devices, nclass)); preds.SetDevice(device);
label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size()); label_correct_.Resize(1);
label_correct_.Shard(GPUDistribution::Block(devices)); label_correct_.SetDevice(device);
out_gpair->Resize(preds.Size()); out_gpair->Resize(preds.Size());
label_correct_.Fill(1); label_correct_.Fill(1);
@ -100,7 +100,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
p = label == k ? p - 1.0f : p; p = label == k ? p - 1.0f : p;
gpair[idx * nclass + k] = GradientPair(p * wt, h); gpair[idx * nclass + k] = GradientPair(p * wt, h);
} }
}, common::Range{0, ndata}, devices, false) }, common::Range{0, ndata}, device, false)
.Eval(out_gpair, &info.labels_, &preds, &info.weights_, &label_correct_); .Eval(out_gpair, &info.labels_, &preds, &info.weights_, &label_correct_);
std::vector<int>& label_correct_h = label_correct_.HostVector(); std::vector<int>& label_correct_h = label_correct_.HostVector();
@ -125,7 +125,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
const auto ndata = static_cast<int64_t>(io_preds->Size() / nclass); const auto ndata = static_cast<int64_t>(io_preds->Size() / nclass);
max_preds_.Resize(ndata); max_preds_.Resize(ndata);
auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size()); auto device = tparam_->gpu_id;
if (prob) { if (prob) {
common::Transform<>::Init( common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) { [=] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {
@ -133,11 +133,11 @@ class SoftmaxMultiClassObj : public ObjFunction {
_preds.subspan(_idx * nclass, nclass); _preds.subspan(_idx * nclass, nclass);
common::Softmax(point.begin(), point.end()); common::Softmax(point.begin(), point.end());
}, },
common::Range{0, ndata}, GPUDistribution::Granular(devices, nclass)) common::Range{0, ndata}, device)
.Eval(io_preds); .Eval(io_preds);
} else { } else {
io_preds->Shard(GPUDistribution::Granular(devices, nclass)); io_preds->SetDevice(device);
max_preds_.Shard(GPUDistribution::Block(devices)); max_preds_.SetDevice(device);
common::Transform<>::Init( common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx, [=] XGBOOST_DEVICE(size_t _idx,
common::Span<const bst_float> _preds, common::Span<const bst_float> _preds,
@ -148,7 +148,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
common::FindMaxIndex(point.cbegin(), common::FindMaxIndex(point.cbegin(),
point.cend()) - point.cbegin(); point.cend()) - point.cbegin();
}, },
common::Range{0, ndata}, devices, false) common::Range{0, ndata}, device, false)
.Eval(io_preds, &max_preds_); .Eval(io_preds, &max_preds_);
} }
if (!prob) { if (!prob) {
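One detail from the multi-class call sites above: the caller pins every vector to the device itself and then passes shard = false to Transform<>::Init, so the evaluator skips its own SetDevice pass (per-class grouping is handled by index arithmetic inside the functor). Sketched with names following the diff rather than the real member layout:

#include <xgboost/base.h>
#include <xgboost/host_device_vector.h>

void PlaceMultiClassBuffers(int device,
                            xgboost::HostDeviceVector<xgboost::GradientPair>* out_gpair,
                            xgboost::HostDeviceVector<float>* preds,
                            xgboost::HostDeviceVector<int>* label_correct) {
  out_gpair->SetDevice(device);
  preds->SetDevice(device);
  label_correct->SetDevice(device);
  label_correct->Resize(1);            // a single flag suffices with one device
  out_gpair->Resize(preds->Size());
  label_correct->Fill(1);
  // ...then: Transform<>::Init(functor, Range{0, ndata}, device, /*shard=*/false)
  //              .Eval(out_gpair, preds, label_correct, ...);
}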

View File

@ -57,8 +57,8 @@ class RegLossObj : public ObjFunction {
<< "preds.size=" << preds.Size() << ", label.size=" << info.labels_.Size(); << "preds.size=" << preds.Size() << ", label.size=" << info.labels_.Size();
size_t ndata = preds.Size(); size_t ndata = preds.Size();
out_gpair->Resize(ndata); out_gpair->Resize(ndata);
auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size()); auto device = tparam_->gpu_id;
label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size()); label_correct_.Resize(1);
label_correct_.Fill(1); label_correct_.Fill(1);
bool is_null_weight = info.weights_.Size() == 0; bool is_null_weight = info.weights_.Size() == 0;
@ -83,7 +83,7 @@ class RegLossObj : public ObjFunction {
_out_gpair[_idx] = GradientPair(Loss::FirstOrderGradient(p, label) * w, _out_gpair[_idx] = GradientPair(Loss::FirstOrderGradient(p, label) * w,
Loss::SecondOrderGradient(p, label) * w); Loss::SecondOrderGradient(p, label) * w);
}, },
common::Range{0, static_cast<int64_t>(ndata)}, devices).Eval( common::Range{0, static_cast<int64_t>(ndata)}, device).Eval(
&label_correct_, out_gpair, &preds, &info.labels_, &info.weights_); &label_correct_, out_gpair, &preds, &info.labels_, &info.weights_);
// copy "label correct" flags back to host // copy "label correct" flags back to host
@ -105,7 +105,7 @@ class RegLossObj : public ObjFunction {
[] XGBOOST_DEVICE(size_t _idx, common::Span<float> _preds) { [] XGBOOST_DEVICE(size_t _idx, common::Span<float> _preds) {
_preds[_idx] = Loss::PredTransform(_preds[_idx]); _preds[_idx] = Loss::PredTransform(_preds[_idx]);
}, common::Range{0, static_cast<int64_t>(io_preds->Size())}, }, common::Range{0, static_cast<int64_t>(io_preds->Size())},
GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size())) tparam_->gpu_id)
.Eval(io_preds); .Eval(io_preds);
} }
@ -175,8 +175,8 @@ class PoissonRegression : public ObjFunction {
CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided"; CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided";
size_t ndata = preds.Size(); size_t ndata = preds.Size();
out_gpair->Resize(ndata); out_gpair->Resize(ndata);
auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size()); auto device = tparam_->gpu_id;
label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size()); label_correct_.Resize(1);
label_correct_.Fill(1); label_correct_.Fill(1);
bool is_null_weight = info.weights_.Size() == 0; bool is_null_weight = info.weights_.Size() == 0;
@ -197,7 +197,7 @@ class PoissonRegression : public ObjFunction {
_out_gpair[_idx] = GradientPair{(expf(p) - y) * w, _out_gpair[_idx] = GradientPair{(expf(p) - y) * w,
expf(p + max_delta_step) * w}; expf(p + max_delta_step) * w};
}, },
common::Range{0, static_cast<int64_t>(ndata)}, devices).Eval( common::Range{0, static_cast<int64_t>(ndata)}, device).Eval(
&label_correct_, out_gpair, &preds, &info.labels_, &info.weights_); &label_correct_, out_gpair, &preds, &info.labels_, &info.weights_);
// copy "label correct" flags back to host // copy "label correct" flags back to host
std::vector<int>& label_correct_h = label_correct_.HostVector(); std::vector<int>& label_correct_h = label_correct_.HostVector();
@ -213,7 +213,7 @@ class PoissonRegression : public ObjFunction {
_preds[_idx] = expf(_preds[_idx]); _preds[_idx] = expf(_preds[_idx]);
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size())}, common::Range{0, static_cast<int64_t>(io_preds->Size())},
GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size())) tparam_->gpu_id)
.Eval(io_preds); .Eval(io_preds);
} }
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override { void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@ -340,9 +340,9 @@ class GammaRegression : public ObjFunction {
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty"; CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided"; CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided";
const size_t ndata = preds.Size(); const size_t ndata = preds.Size();
auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata); auto device = tparam_->gpu_id;
out_gpair->Resize(ndata); out_gpair->Resize(ndata);
label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size()); label_correct_.Resize(1);
label_correct_.Fill(1); label_correct_.Fill(1);
const bool is_null_weight = info.weights_.Size() == 0; const bool is_null_weight = info.weights_.Size() == 0;
@ -361,7 +361,7 @@ class GammaRegression : public ObjFunction {
} }
_out_gpair[_idx] = GradientPair((1 - y / expf(p)) * w, y / expf(p) * w); _out_gpair[_idx] = GradientPair((1 - y / expf(p)) * w, y / expf(p) * w);
}, },
common::Range{0, static_cast<int64_t>(ndata)}, devices).Eval( common::Range{0, static_cast<int64_t>(ndata)}, device).Eval(
&label_correct_, out_gpair, &preds, &info.labels_, &info.weights_); &label_correct_, out_gpair, &preds, &info.labels_, &info.weights_);
// copy "label correct" flags back to host // copy "label correct" flags back to host
@ -378,7 +378,7 @@ class GammaRegression : public ObjFunction {
_preds[_idx] = expf(_preds[_idx]); _preds[_idx] = expf(_preds[_idx]);
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size())}, common::Range{0, static_cast<int64_t>(io_preds->Size())},
GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size())) tparam_->gpu_id)
.Eval(io_preds); .Eval(io_preds);
} }
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override { void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@ -430,8 +430,8 @@ class TweedieRegression : public ObjFunction {
const size_t ndata = preds.Size(); const size_t ndata = preds.Size();
out_gpair->Resize(ndata); out_gpair->Resize(ndata);
auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size()); auto device = tparam_->gpu_id;
label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size()); label_correct_.Resize(1);
label_correct_.Fill(1); label_correct_.Fill(1);
const bool is_null_weight = info.weights_.Size() == 0; const bool is_null_weight = info.weights_.Size() == 0;
@ -455,7 +455,7 @@ class TweedieRegression : public ObjFunction {
std::exp((1 - rho) * p) + (2 - rho) * expf((2 - rho) * p); std::exp((1 - rho) * p) + (2 - rho) * expf((2 - rho) * p);
_out_gpair[_idx] = GradientPair(grad * w, hess * w); _out_gpair[_idx] = GradientPair(grad * w, hess * w);
}, },
common::Range{0, static_cast<int64_t>(ndata), 1}, devices) common::Range{0, static_cast<int64_t>(ndata), 1}, device)
.Eval(&label_correct_, out_gpair, &preds, &info.labels_, &info.weights_); .Eval(&label_correct_, out_gpair, &preds, &info.labels_, &info.weights_);
// copy "label correct" flags back to host // copy "label correct" flags back to host
@ -472,7 +472,7 @@ class TweedieRegression : public ObjFunction {
_preds[_idx] = expf(_preds[_idx]); _preds[_idx] = expf(_preds[_idx]);
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size())}, common::Range{0, static_cast<int64_t>(io_preds->Size())},
GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size())) tparam_->gpu_id)
.Eval(io_preds); .Eval(io_preds);
} }

View File

@ -20,12 +20,6 @@ namespace predictor {
DMLC_REGISTRY_FILE_TAG(gpu_predictor); DMLC_REGISTRY_FILE_TAG(gpu_predictor);
template <typename IterT>
void IncrementOffset(IterT begin_itr, IterT end_itr, size_t amount) {
thrust::transform(begin_itr, end_itr, begin_itr,
[=] __device__(size_t elem) { return elem + amount; });
}
/** /**
* \struct DevicePredictionNode * \struct DevicePredictionNode
* *
@ -44,7 +38,7 @@ struct DevicePredictionNode {
int fidx; int fidx;
int left_child_idx; int left_child_idx;
int right_child_idx; int right_child_idx;
NodeValue val; NodeValue val{};
DevicePredictionNode(const RegTree::Node& n) { // NOLINT DevicePredictionNode(const RegTree::Node& n) { // NOLINT
static_assert(sizeof(DevicePredictionNode) == 16, "Size is not 16 bytes"); static_assert(sizeof(DevicePredictionNode) == 16, "Size is not 16 bytes");
@ -200,58 +194,14 @@ __global__ void PredictKernel(common::Span<const DevicePredictionNode> d_nodes,
} }
class GPUPredictor : public xgboost::Predictor { class GPUPredictor : public xgboost::Predictor {
protected:
struct DevicePredictionCacheEntry {
std::shared_ptr<DMatrix> data;
HostDeviceVector<bst_float> predictions;
};
private: private:
void DeviceOffsets(const HostDeviceVector<size_t>& data,
size_t total_size,
std::vector<size_t>* out_offsets) {
auto& offsets = *out_offsets;
offsets.resize(devices_.Size() + 1);
offsets[0] = 0;
#pragma omp parallel for schedule(static, 1) if (devices_.Size() > 1)
for (int shard = 0; shard < devices_.Size(); ++shard) {
int device = devices_.DeviceId(shard);
auto data_span = data.DeviceSpan(device);
dh::safe_cuda(cudaSetDevice(device));
if (data_span.size() == 0) {
offsets[shard + 1] = total_size;
} else {
// copy the last element from every shard
dh::safe_cuda(cudaMemcpy(&offsets.at(shard + 1),
&data_span[data_span.size()-1],
sizeof(size_t), cudaMemcpyDeviceToHost));
}
}
}
// This function populates the explicit offsets that can be used to create a window into the
// underlying host vector. The window starts from the `batch_offset` and has a size of
// `batch_size`, and is sharded across all the devices. Each shard is granular depending on
// the number of output classes `n_classes`.
void PredictionDeviceOffsets(size_t total_size, size_t batch_offset, size_t batch_size,
int n_classes, std::vector<size_t>* out_offsets) {
auto& offsets = *out_offsets;
size_t n_shards = devices_.Size();
offsets.resize(n_shards + 2);
size_t rows_per_shard = common::DivRoundUp(batch_size, n_shards);
for (size_t shard = 0; shard < devices_.Size(); ++shard) {
size_t n_rows = std::min(batch_size, shard * rows_per_shard);
offsets[shard] = batch_offset + n_rows * n_classes;
}
offsets[n_shards] = batch_offset + batch_size * n_classes;
offsets[n_shards + 1] = total_size;
}
struct DeviceShard { struct DeviceShard {
DeviceShard() : device_{-1} {} DeviceShard() : device_{-1} {}
~DeviceShard() { ~DeviceShard() {
dh::safe_cuda(cudaSetDevice(device_)); if (device_ >= 0) {
dh::safe_cuda(cudaSetDevice(device_));
}
} }
void Init(int device) { void Init(int device) {
@ -284,10 +234,9 @@ class GPUPredictor : public xgboost::Predictor {
void PredictInternal void PredictInternal
(const SparsePage& batch, size_t num_features, (const SparsePage& batch, size_t num_features,
HostDeviceVector<bst_float>* predictions) { HostDeviceVector<bst_float>* predictions) {
if (predictions->DeviceSize(device_) == 0) { return; }
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_));
const int BLOCK_THREADS = 128; const int BLOCK_THREADS = 128;
size_t num_rows = batch.offset.DeviceSize(device_) - 1; size_t num_rows = batch.offset.DeviceSize() - 1;
const int GRID_SIZE = static_cast<int>(common::DivRoundUp(num_rows, BLOCK_THREADS)); const int GRID_SIZE = static_cast<int>(common::DivRoundUp(num_rows, BLOCK_THREADS));
int shared_memory_bytes = static_cast<int> int shared_memory_bytes = static_cast<int>
@ -297,14 +246,12 @@ class GPUPredictor : public xgboost::Predictor {
shared_memory_bytes = 0; shared_memory_bytes = 0;
use_shared = false; use_shared = false;
} }
const auto& data_distr = batch.data.Distribution(); size_t entry_start = 0;
size_t entry_start = data_distr.ShardStart(batch.data.Size(),
data_distr.Devices().Index(device_));
PredictKernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS, shared_memory_bytes>>> PredictKernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS, shared_memory_bytes>>>
(dh::ToSpan(nodes_), predictions->DeviceSpan(device_), dh::ToSpan(tree_segments_), (dh::ToSpan(nodes_), predictions->DeviceSpan(), dh::ToSpan(tree_segments_),
dh::ToSpan(tree_group_), batch.offset.DeviceSpan(device_), dh::ToSpan(tree_group_), batch.offset.DeviceSpan(),
batch.data.DeviceSpan(device_), this->tree_begin_, this->tree_end_, num_features, batch.data.DeviceSpan(), this->tree_begin_, this->tree_end_, num_features,
num_rows, entry_start, use_shared, this->num_group_); num_rows, entry_start, use_shared, this->num_group_);
} }
@ -322,7 +269,7 @@ class GPUPredictor : public xgboost::Predictor {
void InitModel(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end) { void InitModel(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end) {
CHECK_EQ(model.param.size_leaf_vector, 0); CHECK_EQ(model.param.size_leaf_vector, 0);
// Copy decision trees to device // Copy decision trees to device
thrust::host_vector<size_t> h_tree_segments; thrust::host_vector<size_t> h_tree_segments{};
h_tree_segments.reserve((tree_end - tree_begin) + 1); h_tree_segments.reserve((tree_end - tree_begin) + 1);
size_t sum = 0; size_t sum = 0;
h_tree_segments.push_back(sum); h_tree_segments.push_back(sum);
@ -337,9 +284,7 @@ class GPUPredictor : public xgboost::Predictor {
std::copy(src_nodes.begin(), src_nodes.end(), std::copy(src_nodes.begin(), src_nodes.end(),
h_nodes.begin() + h_tree_segments[tree_idx - tree_begin]); h_nodes.begin() + h_tree_segments[tree_idx - tree_begin]);
} }
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard &shard) { shard_.InitModel(model, h_tree_segments, h_nodes, tree_begin, tree_end);
shard.InitModel(model, h_tree_segments, h_nodes, tree_begin, tree_end);
});
} }
void DevicePredictInternal(DMatrix* dmat, void DevicePredictInternal(DMatrix* dmat,
@ -352,40 +297,43 @@ class GPUPredictor : public xgboost::Predictor {
InitModel(model, tree_begin, tree_end); InitModel(model, tree_begin, tree_end);
size_t batch_offset = 0; size_t batch_offset = 0;
auto* preds = out_preds;
std::unique_ptr<HostDeviceVector<bst_float>> batch_preds{nullptr};
for (auto &batch : dmat->GetBatches<SparsePage>()) { for (auto &batch : dmat->GetBatches<SparsePage>()) {
bool is_external_memory = batch.Size() < dmat->Info().num_row_; bool is_external_memory = batch.Size() < dmat->Info().num_row_;
if (is_external_memory) { if (is_external_memory) {
std::vector<size_t> out_preds_offsets; batch_preds.reset(new HostDeviceVector<bst_float>);
PredictionDeviceOffsets(out_preds->Size(), batch_offset, batch.Size(), batch_preds->Resize(batch.Size() * model.param.num_output_group);
model.param.num_output_group, &out_preds_offsets); std::copy(out_preds->ConstHostVector().begin() + batch_offset,
out_preds->Reshard(GPUDistribution::Explicit(devices_, out_preds_offsets)); out_preds->ConstHostVector().begin() + batch_offset + batch_preds->Size(),
batch_preds->HostVector().begin());
preds = batch_preds.get();
} }
batch.offset.Shard(GPUDistribution::Overlap(devices_, 1)); batch.offset.SetDevice(device_);
std::vector<size_t> device_offsets; batch.data.SetDevice(device_);
DeviceOffsets(batch.offset, batch.data.Size(), &device_offsets); preds->SetDevice(device_);
batch.data.Reshard(GPUDistribution::Explicit(devices_, device_offsets)); shard_.PredictInternal(batch, model.param.num_feature, preds);
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { if (is_external_memory) {
shard.PredictInternal(batch, model.param.num_feature, out_preds); auto h_preds = preds->ConstHostVector();
}); std::copy(h_preds.begin(), h_preds.end(), out_preds->HostVector().begin() + batch_offset);
}
batch_offset += batch.Size() * model.param.num_output_group; batch_offset += batch.Size() * model.param.num_output_group;
} }
out_preds->Reshard(GPUDistribution::Granular(devices_, model.param.num_output_group));
monitor_.StopCuda("DevicePredictInternal"); monitor_.StopCuda("DevicePredictInternal");
} }
public: public:
GPUPredictor() = default; GPUPredictor() : device_{-1} {};
void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds, void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
const gbm::GBTreeModel& model, int tree_begin, const gbm::GBTreeModel& model, int tree_begin,
unsigned ntree_limit = 0) override { unsigned ntree_limit = 0) override {
GPUSet devices = GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus, int device = learner_param_->gpu_id;
dmat->Info().num_row_); CHECK_GE(device, 0);
CHECK_NE(devices.Size(), 0); ConfigureShard(device);
ConfigureShards(devices);
if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) { if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) {
return; return;
@ -408,10 +356,9 @@ class GPUPredictor : public xgboost::Predictor {
size_t n_classes = model.param.num_output_group; size_t n_classes = model.param.num_output_group;
size_t n = n_classes * info.num_row_; size_t n = n_classes * info.num_row_;
const HostDeviceVector<bst_float>& base_margin = info.base_margin_; const HostDeviceVector<bst_float>& base_margin = info.base_margin_;
out_preds->Shard(GPUDistribution::Granular(devices_, n_classes));
out_preds->Resize(n); out_preds->Resize(n);
if (base_margin.Size() != 0) { if (base_margin.Size() != 0) {
CHECK_EQ(out_preds->Size(), n); CHECK_EQ(base_margin.Size(), n);
out_preds->Copy(base_margin); out_preds->Copy(base_margin);
} else { } else {
out_preds->Fill(model.base_margin); out_preds->Fill(model.base_margin);
@ -427,7 +374,7 @@ class GPUPredictor : public xgboost::Predictor {
const HostDeviceVector<bst_float>& y = it->second.predictions; const HostDeviceVector<bst_float>& y = it->second.predictions;
if (y.Size() != 0) { if (y.Size() != 0) {
monitor_.StartCuda("PredictFromCache"); monitor_.StartCuda("PredictFromCache");
out_preds->Shard(y.Distribution()); out_preds->SetDevice(y.DeviceIdx());
out_preds->Resize(y.Size()); out_preds->Resize(y.Size());
out_preds->Copy(y); out_preds->Copy(y);
monitor_.StopCuda("PredictFromCache"); monitor_.StopCuda("PredictFromCache");
@ -500,25 +447,23 @@ class GPUPredictor : public xgboost::Predictor {
const std::vector<std::shared_ptr<DMatrix>>& cache) override { const std::vector<std::shared_ptr<DMatrix>>& cache) override {
Predictor::Configure(cfg, cache); Predictor::Configure(cfg, cache);
GPUSet devices = GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus); int device = learner_param_->gpu_id;
ConfigureShards(devices); if (device >= 0) {
ConfigureShard(device);
}
} }
private: private:
/*! \brief Re configure shards when GPUSet is changed. */ /*! \brief Reconfigure the shard when the device is changed. */
void ConfigureShards(GPUSet devices) { void ConfigureShard(int device) {
if (devices_ == devices) return; if (device_ == device) return;
devices_ = devices; device_ = device;
shards_.clear(); shard_.Init(device_);
shards_.resize(devices_.Size());
dh::ExecuteIndexShards(&shards_, [=](size_t i, DeviceShard& shard){
shard.Init(devices_.DeviceId(i));
});
} }
std::vector<DeviceShard> shards_; DeviceShard shard_;
GPUSet devices_; int device_;
common::Monitor monitor_; common::Monitor monitor_;
}; };
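For external-memory DMatrix batches the predictor now stages each batch's output in a temporary vector on the single device and moves it through the host, instead of resharding out_preds with explicit per-device offsets. The same flow as a free-standing helper; PredictOnDevice is a placeholder for DeviceShard::PredictInternal, not a real symbol:

#include <algorithm>
#include <cstddef>
#include <xgboost/host_device_vector.h>

void StageExternalMemoryBatch(int device, std::size_t batch_offset,
                              std::size_t batch_rows, std::size_t n_groups,
                              xgboost::HostDeviceVector<float>* out_preds) {
  xgboost::HostDeviceVector<float> batch_preds;
  batch_preds.Resize(batch_rows * n_groups);
  const auto& h_out = out_preds->ConstHostVector();
  std::copy(h_out.begin() + batch_offset,
            h_out.begin() + batch_offset + batch_preds.Size(),
            batch_preds.HostVector().begin());
  batch_preds.SetDevice(device);
  // PredictOnDevice(&batch_preds);    // placeholder for DeviceShard::PredictInternal
  const auto& h_batch = batch_preds.ConstHostVector();
  std::copy(h_batch.begin(), h_batch.end(),
            out_preds->HostVector().begin() + batch_offset);
}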

View File

@ -702,7 +702,7 @@ struct DeviceShard {
row_partitioner.reset(new RowPartitioner(device_id, n_rows)); row_partitioner.reset(new RowPartitioner(device_id, n_rows));
dh::safe_cuda(cudaMemcpyAsync( dh::safe_cuda(cudaMemcpyAsync(
gpair.data(), dh_gpair->ConstDevicePointer(device_id), gpair.data(), dh_gpair->ConstDevicePointer(),
gpair.size() * sizeof(GradientPair), cudaMemcpyHostToHost)); gpair.size() * sizeof(GradientPair), cudaMemcpyHostToHost));
SubsampleGradientPair(device_id, gpair, param.subsample, row_begin_idx); SubsampleGradientPair(device_id, gpair, param.subsample, row_begin_idx);
hist.Reset(); hist.Reset();
@ -745,8 +745,8 @@ struct DeviceShard {
for (auto i = 0ull; i < nidxs.size(); i++) { for (auto i = 0ull; i < nidxs.size(); i++) {
auto nidx = nidxs[i]; auto nidx = nidxs[i];
auto p_feature_set = column_sampler.GetFeatureSet(tree.GetDepth(nidx)); auto p_feature_set = column_sampler.GetFeatureSet(tree.GetDepth(nidx));
p_feature_set->Shard(GPUSet(device_id, 1)); p_feature_set->SetDevice(device_id);
auto d_sampled_features = p_feature_set->DeviceSpan(device_id); auto d_sampled_features = p_feature_set->DeviceSpan();
common::Span<int32_t> d_feature_set = common::Span<int32_t> d_feature_set =
interaction_constraints.Query(d_sampled_features, nidx); interaction_constraints.Query(d_sampled_features, nidx);
auto d_split_candidates = auto d_split_candidates =
@ -1016,7 +1016,7 @@ struct DeviceShard {
dh::AllReducer* reducer, int64_t num_columns) { dh::AllReducer* reducer, int64_t num_columns) {
constexpr int kRootNIdx = 0; constexpr int kRootNIdx = 0;
const auto &gpair = gpair_all->DeviceSpan(device_id); const auto &gpair = gpair_all->DeviceSpan();
dh::SumReduction(temp_memory, gpair, node_sum_gradients_d, dh::SumReduction(temp_memory, gpair, node_sum_gradients_d,
gpair.size()); gpair.size());
@ -1294,11 +1294,8 @@ class GPUHistMakerSpecialised {
param_.InitAllowUnknown(args); param_.InitAllowUnknown(args);
generic_param_ = generic_param; generic_param_ = generic_param;
hist_maker_param_.InitAllowUnknown(args); hist_maker_param_.InitAllowUnknown(args);
auto devices = GPUSet::All(generic_param_->gpu_id, device_ = generic_param_->gpu_id;
generic_param_->n_gpus); CHECK_GE(device_, 0) << "Must have at least one device";
n_devices_ = devices.Size();
CHECK(n_devices_ != 0) << "Must have at least one device";
dist_ = GPUDistribution::Block(devices);
dh::CheckComputeCapability(); dh::CheckComputeCapability();
@ -1330,30 +1327,22 @@ class GPUHistMakerSpecialised {
void InitDataOnce(DMatrix* dmat) { void InitDataOnce(DMatrix* dmat) {
info_ = &dmat->Info(); info_ = &dmat->Info();
int n_devices = dist_.Devices().Size(); reducer_.Init({device_});
device_list_.resize(n_devices);
for (int index = 0; index < n_devices; ++index) {
int device_id = dist_.Devices().DeviceId(index);
device_list_[index] = device_id;
}
reducer_.Init(device_list_);
// Synchronise the column sampling seed // Synchronise the column sampling seed
uint32_t column_sampling_seed = common::GlobalRandom()(); uint32_t column_sampling_seed = common::GlobalRandom()();
rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0); rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
// Create device shards // Create device shards
shards_.resize(n_devices); shards_.resize(1);
dh::ExecuteIndexShards( dh::ExecuteIndexShards(
&shards_, &shards_,
[&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) { [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
dh::safe_cuda(cudaSetDevice(dist_.Devices().DeviceId(idx))); dh::safe_cuda(cudaSetDevice(device_));
size_t start = dist_.ShardStart(info_->num_row_, idx); size_t start = 0;
size_t size = dist_.ShardSize(info_->num_row_, idx); size_t size = info_->num_row_;
shard = std::unique_ptr<DeviceShard<GradientSumT>>( shard = std::unique_ptr<DeviceShard<GradientSumT>>(
new DeviceShard<GradientSumT>(dist_.Devices().DeviceId(idx), idx, new DeviceShard<GradientSumT>(device_, idx,
start, start + size, param_, start, start + size, param_,
column_sampling_seed, column_sampling_seed,
info_->num_col_)); info_->num_col_));
@ -1436,7 +1425,7 @@ class GPUHistMakerSpecialised {
for (auto& tree : trees) { for (auto& tree : trees) {
tree = *p_tree; tree = *p_tree;
} }
gpair->Reshard(dist_); gpair->SetDevice(device_);
// Launch one thread for each device "shard" containing a subset of rows. // Launch one thread for each device "shard" containing a subset of rows.
// Threads will cooperatively build the tree, synchronising over histograms. // Threads will cooperatively build the tree, synchronising over histograms.
@ -1462,13 +1451,13 @@ class GPUHistMakerSpecialised {
return false; return false;
} }
monitor_.StartCuda("UpdatePredictionCache"); monitor_.StartCuda("UpdatePredictionCache");
p_out_preds->Shard(dist_.Devices()); p_out_preds->SetDevice(device_);
dh::ExecuteIndexShards( dh::ExecuteIndexShards(
&shards_, &shards_,
[&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) { [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
dh::safe_cuda(cudaSetDevice(shard->device_id)); dh::safe_cuda(cudaSetDevice(shard->device_id));
shard->UpdatePredictionCache( shard->UpdatePredictionCache(
p_out_preds->DevicePointer(shard->device_id)); p_out_preds->DevicePointer());
}); });
monitor_.StopCuda("UpdatePredictionCache"); monitor_.StopCuda("UpdatePredictionCache");
return true; return true;
@ -1483,7 +1472,6 @@ class GPUHistMakerSpecialised {
private: private:
bool initialised_; bool initialised_;
int n_devices_;
int n_bins_; int n_bins_;
GPUHistMakerTrainParam hist_maker_param_; GPUHistMakerTrainParam hist_maker_param_;
@ -1492,11 +1480,9 @@ class GPUHistMakerSpecialised {
dh::AllReducer reducer_; dh::AllReducer reducer_;
DMatrix* p_last_fmat_; DMatrix* p_last_fmat_;
GPUDistribution dist_; int device_;
common::Monitor monitor_; common::Monitor monitor_;
/*! List storing device id. */
std::vector<int> device_list_;
}; };
class GPUHistMaker : public TreeUpdater { class GPUHistMaker : public TreeUpdater {
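
For orientation, a minimal sketch (an illustration, not part of the change) of the single-device hand-off the updater above now performs, using only HostDeviceVector calls that appear in this diff; the function name and parameters are made up for the example:

    // Sketch only: assumes the HostDeviceVector API shown in this change.
    #include <xgboost/base.h>                // GradientPair, bst_float
    #include <xgboost/host_device_vector.h>  // HostDeviceVector

    void HandOffToSingleDevice(xgboost::HostDeviceVector<xgboost::GradientPair>* gpair,
                               xgboost::HostDeviceVector<xgboost::bst_float>* out_preds,
                               int device) {  // the one ordinal taken from gpu_id
      gpair->SetDevice(device);                 // one device instead of a GPUSet/distribution
      auto d_gpair = gpair->ConstDeviceSpan();  // spans the whole vector, no shard index
      out_preds->SetDevice(device);
      auto* d_preds = out_preds->DevicePointer();
      (void)d_gpair; (void)d_preds;             // tree construction and cache update elided
    }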


@ -1,37 +0,0 @@
#include "../../../src/common/common.h"
#include <gtest/gtest.h>
namespace xgboost {
TEST(GPUSet, Basic) {
GPUSet devices = GPUSet::Empty();
ASSERT_TRUE(devices.IsEmpty());
devices = GPUSet{0, 1};
ASSERT_TRUE(devices != GPUSet::Empty());
EXPECT_EQ(devices.Size(), 1);
devices = GPUSet::Range(1, 0);
EXPECT_EQ(devices.Size(), 0);
EXPECT_TRUE(devices.IsEmpty());
EXPECT_FALSE(devices.Contains(1));
devices = GPUSet::Range(2, -1);
EXPECT_EQ(devices, GPUSet::Empty());
EXPECT_EQ(devices.Size(), 0);
EXPECT_TRUE(devices.IsEmpty());
devices = GPUSet::Range(2, 8); // 2 ~ 10
EXPECT_EQ(devices.Size(), 8);
EXPECT_ANY_THROW(devices.DeviceId(8));
auto device_id = devices.DeviceId(0);
EXPECT_EQ(device_id, 2);
auto device_index = devices.Index(2);
EXPECT_EQ(device_index, 0);
#ifndef XGBOOST_USE_CUDA
EXPECT_EQ(GPUSet::AllVisible(), GPUSet::Empty());
#endif
}
} // namespace xgboost


@ -1,83 +0,0 @@
#include <gtest/gtest.h>
#include <xgboost/logging.h>
#include "../../../src/common/common.h"
#include "../helpers.h"
#include <string>
namespace xgboost {
TEST(GPUSet, GPUBasic) {
GPUSet devices = GPUSet::Empty();
ASSERT_TRUE(devices.IsEmpty());
devices = GPUSet{1, 1};
ASSERT_TRUE(devices != GPUSet::Empty());
EXPECT_EQ(devices.Size(), 1);
EXPECT_EQ(*(devices.begin()), 1);
devices = GPUSet::Range(1, 0);
EXPECT_EQ(devices, GPUSet::Empty());
EXPECT_EQ(devices.Size(), 0);
EXPECT_TRUE(devices.IsEmpty());
EXPECT_FALSE(devices.Contains(1));
devices = GPUSet::Range(2, -1);
EXPECT_EQ(devices, GPUSet::Empty());
devices = GPUSet::Range(2, 8);
EXPECT_EQ(devices.Size(), 8);
EXPECT_EQ(*devices.begin(), 2);
EXPECT_EQ(*devices.end(), 2 + devices.Size());
EXPECT_EQ(8, devices.Size());
ASSERT_NO_THROW(GPUSet::AllVisible());
devices = GPUSet::AllVisible();
if (devices.IsEmpty()) {
LOG(WARNING) << "Empty devices.";
}
}
TEST(GPUSet, Verbose) {
{
std::map<std::string, std::string> args {};
args["verbosity"] = "3"; // LOG INFO
testing::internal::CaptureStderr();
ConsoleLogger::Configure({args.cbegin(), args.cend()});
GPUSet::All(0, 1);
std::string output = testing::internal::GetCapturedStderr();
ASSERT_NE(output.find("GPU ID: 0"), std::string::npos);
ASSERT_NE(output.find("GPUs: 1"), std::string::npos);
args["verbosity"] = "1"; // restore
ConsoleLogger::Configure({args.cbegin(), args.cend()});
}
}
#if defined(XGBOOST_USE_NCCL)
TEST(GPUSet, MGPU_GPUBasic) {
{
GPUSet devices = GPUSet::All(1, 1);
ASSERT_EQ(*(devices.begin()), 1);
ASSERT_EQ(*(devices.end()), 2);
ASSERT_EQ(devices.Size(), 1);
ASSERT_TRUE(devices.Contains(1));
}
{
GPUSet devices = GPUSet::All(0, -1);
ASSERT_GE(devices.Size(), 2);
}
// Specify number of rows.
{
GPUSet devices = GPUSet::All(0, -1, 1);
ASSERT_EQ(devices.Size(), 1);
}
}
#endif
} // namespace xgboost
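
Both GPUSet test files above are removed outright; what replaces the set abstraction throughout this change is a bare device ordinal, with -1 meaning CPU and any non-negative value naming a single CUDA device. A rough sketch of that convention, with a hypothetical helper used purely for illustration:

    // Hypothetical helper, for illustration only; not part of the codebase.
    constexpr int kCpuOrdinal = -1;
    inline bool UseGpu(int gpu_id) { return gpu_id != kCpuOrdinal; }

    // A caller branches once on the ordinal:
    //   if (UseGpu(param.gpu_id)) { /* launch kernels on device param.gpu_id */ }
    //   else                      { /* take the CPU code path */ }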


@ -87,8 +87,8 @@ TEST(ConfigParser, ParseKeyValuePair) {
ASSERT_TRUE(parser.ParseKeyValuePair("booster = gbtree", &key, &value)); ASSERT_TRUE(parser.ParseKeyValuePair("booster = gbtree", &key, &value));
ASSERT_EQ(key, "booster"); ASSERT_EQ(key, "booster");
ASSERT_EQ(value, "gbtree"); ASSERT_EQ(value, "gbtree");
ASSERT_TRUE(parser.ParseKeyValuePair("n_gpus = 2", &key, &value)); ASSERT_TRUE(parser.ParseKeyValuePair("gpu_id = 2", &key, &value));
ASSERT_EQ(key, "n_gpus"); ASSERT_EQ(key, "gpu_id");
ASSERT_EQ(value, "2"); ASSERT_EQ(value, "2");
ASSERT_TRUE(parser.ParseKeyValuePair("monotone_constraints = (1,0,-1)", ASSERT_TRUE(parser.ParseKeyValuePair("monotone_constraints = (1,0,-1)",
&key, &value)); &key, &value));


@ -18,7 +18,7 @@
namespace xgboost { namespace xgboost {
namespace common { namespace common {
void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) { void TestDeviceSketch(bool use_external_memory) {
// create the data // create the data
int nrows = 10001; int nrows = 10001;
std::shared_ptr<xgboost::DMatrix> *dmat = nullptr; std::shared_ptr<xgboost::DMatrix> *dmat = nullptr;
@ -53,7 +53,7 @@ void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) {
// find the cuts on the GPU // find the cuts on the GPU
HistogramCuts hmat_gpu; HistogramCuts hmat_gpu;
size_t row_stride = DeviceSketch(p, CreateEmptyGenericParam(0, devices.Size()), gpu_batch_nrows, size_t row_stride = DeviceSketch(p, CreateEmptyGenericParam(0), gpu_batch_nrows,
dmat->get(), &hmat_gpu); dmat->get(), &hmat_gpu);
// compare the row stride with the one obtained from the dmatrix // compare the row stride with the one obtained from the dmatrix
@ -81,11 +81,11 @@ void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) {
} }
TEST(gpu_hist_util, DeviceSketch) { TEST(gpu_hist_util, DeviceSketch) {
TestDeviceSketch(GPUSet::Range(0, 1), false); TestDeviceSketch(false);
} }
TEST(gpu_hist_util, DeviceSketch_ExternalMemory) { TEST(gpu_hist_util, DeviceSketch_ExternalMemory) {
TestDeviceSketch(GPUSet::Range(0, 1), true); TestDeviceSketch(true);
} }
} // namespace common } // namespace common


@ -30,45 +30,36 @@ struct HostDeviceVectorSetDeviceHandler {
} }
}; };
void InitHostDeviceVector(size_t n, const GPUDistribution& distribution, void InitHostDeviceVector(size_t n, int device, HostDeviceVector<int> *v) {
HostDeviceVector<int> *v) {
// create the vector // create the vector
GPUSet devices = distribution.Devices(); v->SetDevice(device);
v->Shard(distribution);
v->Resize(n); v->Resize(n);
ASSERT_EQ(v->Size(), n); ASSERT_EQ(v->Size(), n);
ASSERT_TRUE(v->Distribution() == distribution); ASSERT_EQ(v->DeviceIdx(), device);
ASSERT_TRUE(v->Devices() == devices); // ensure that the device has read-write access
// ensure that the devices have read-write access ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead));
for (int i = 0; i < devices.Size(); ++i) { ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kWrite));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kWrite));
}
// ensure that the host has no access // ensure that the host has no access
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite)); ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite));
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead)); ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead));
// fill in the data on the host // fill in the data on the host
std::vector<int>& data_h = v->HostVector(); std::vector<int>& data_h = v->HostVector();
// ensure that the host has full access, while the devices have none // ensure that the host has full access, while the device has none
ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead)); ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead));
ASSERT_TRUE(v->HostCanAccess(GPUAccess::kWrite)); ASSERT_TRUE(v->HostCanAccess(GPUAccess::kWrite));
for (int i = 0; i < devices.Size(); ++i) { ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kRead));
ASSERT_FALSE(v->DeviceCanAccess(i, GPUAccess::kRead)); ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kWrite));
ASSERT_FALSE(v->DeviceCanAccess(i, GPUAccess::kWrite));
}
ASSERT_EQ(data_h.size(), n); ASSERT_EQ(data_h.size(), n);
std::copy_n(thrust::make_counting_iterator(0), n, data_h.begin()); std::copy_n(thrust::make_counting_iterator(0), n, data_h.begin());
} }
void PlusOne(HostDeviceVector<int> *v) { void PlusOne(HostDeviceVector<int> *v) {
int n_devices = v->Devices().Size(); int device = v->DeviceIdx();
for (int i = 0; i < n_devices; ++i) { SetDevice(device);
SetDevice(i); thrust::transform(v->tbegin(), v->tend(), v->tbegin(),
thrust::transform(v->tbegin(i), v->tend(i), v->tbegin(i), [=]__device__(unsigned int a){ return a + 1; });
[=]__device__(unsigned int a){ return a + 1; });
}
} }
void CheckDevice(HostDeviceVector<int> *v, void CheckDevice(HostDeviceVector<int> *v,
@ -76,24 +67,24 @@ void CheckDevice(HostDeviceVector<int> *v,
const std::vector<size_t>& sizes, const std::vector<size_t>& sizes,
unsigned int first, GPUAccess access) { unsigned int first, GPUAccess access) {
int n_devices = sizes.size(); int n_devices = sizes.size();
ASSERT_EQ(v->Devices().Size(), n_devices); ASSERT_EQ(n_devices, 1);
for (int i = 0; i < n_devices; ++i) { for (int i = 0; i < n_devices; ++i) {
ASSERT_EQ(v->DeviceSize(i), sizes.at(i)); ASSERT_EQ(v->DeviceSize(), sizes.at(i));
SetDevice(i); SetDevice(i);
ASSERT_TRUE(thrust::equal(v->tcbegin(i), v->tcend(i), ASSERT_TRUE(thrust::equal(v->tcbegin(), v->tcend(),
thrust::make_counting_iterator(first + starts[i]))); thrust::make_counting_iterator(first + starts[i])));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead)); ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead));
// ensure that the device has at most the access specified by access // ensure that the device has at most the access specified by access
ASSERT_EQ(v->DeviceCanAccess(i, GPUAccess::kWrite), access == GPUAccess::kWrite); ASSERT_EQ(v->DeviceCanAccess(GPUAccess::kWrite), access == GPUAccess::kWrite);
} }
ASSERT_EQ(v->HostCanAccess(GPUAccess::kRead), access == GPUAccess::kRead); ASSERT_EQ(v->HostCanAccess(GPUAccess::kRead), access == GPUAccess::kRead);
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite)); ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite));
for (int i = 0; i < n_devices; ++i) { for (int i = 0; i < n_devices; ++i) {
SetDevice(i); SetDevice(i);
ASSERT_TRUE(thrust::equal(v->tbegin(i), v->tend(i), ASSERT_TRUE(thrust::equal(v->tbegin(), v->tend(),
thrust::make_counting_iterator(first + starts[i]))); thrust::make_counting_iterator(first + starts[i])));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead)); ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kWrite)); ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kWrite));
} }
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead)); ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead));
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite)); ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite));
@ -107,20 +98,20 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
} }
ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead)); ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead));
ASSERT_EQ(v->HostCanAccess(GPUAccess::kWrite), access == GPUAccess::kWrite); ASSERT_EQ(v->HostCanAccess(GPUAccess::kWrite), access == GPUAccess::kWrite);
size_t n_devices = v->Devices().Size(); size_t n_devices = 1;
for (int i = 0; i < n_devices; ++i) { for (int i = 0; i < n_devices; ++i) {
ASSERT_EQ(v->DeviceCanAccess(i, GPUAccess::kRead), access == GPUAccess::kRead); ASSERT_EQ(v->DeviceCanAccess(GPUAccess::kRead), access == GPUAccess::kRead);
// the devices should have no write access // the devices should have no write access
ASSERT_FALSE(v->DeviceCanAccess(i, GPUAccess::kWrite)); ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kWrite));
} }
} }
void TestHostDeviceVector void TestHostDeviceVector
(size_t n, const GPUDistribution& distribution, (size_t n, int device,
const std::vector<size_t>& starts, const std::vector<size_t>& sizes) { const std::vector<size_t>& starts, const std::vector<size_t>& sizes) {
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVector<int> v; HostDeviceVector<int> v;
InitHostDeviceVector(n, distribution, &v); InitHostDeviceVector(n, device, &v);
CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead); CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead);
PlusOne(&v); PlusOne(&v);
CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite); CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite);
@ -130,54 +121,24 @@ void TestHostDeviceVector
TEST(HostDeviceVector, TestBlock) { TEST(HostDeviceVector, TestBlock) {
size_t n = 1001; size_t n = 1001;
int n_devices = 2; int device = 0;
auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices)); std::vector<size_t> starts{0};
std::vector<size_t> starts{0, 501}; std::vector<size_t> sizes{1001};
std::vector<size_t> sizes{501, 500}; TestHostDeviceVector(n, device, starts, sizes);
TestHostDeviceVector(n, distribution, starts, sizes);
}
TEST(HostDeviceVector, TestGranular) {
size_t n = 3003;
int n_devices = 2;
auto distribution = GPUDistribution::Granular(GPUSet::Range(0, n_devices), 3);
std::vector<size_t> starts{0, 1503};
std::vector<size_t> sizes{1503, 1500};
TestHostDeviceVector(n, distribution, starts, sizes);
}
TEST(HostDeviceVector, TestOverlap) {
size_t n = 1001;
int n_devices = 2;
auto distribution = GPUDistribution::Overlap(GPUSet::Range(0, n_devices), 1);
std::vector<size_t> starts{0, 500};
std::vector<size_t> sizes{501, 501};
TestHostDeviceVector(n, distribution, starts, sizes);
}
TEST(HostDeviceVector, TestExplicit) {
size_t n = 1001;
int n_devices = 2;
std::vector<size_t> offsets{0, 550, 1001};
auto distribution = GPUDistribution::Explicit(GPUSet::Range(0, n_devices), offsets);
std::vector<size_t> starts{0, 550};
std::vector<size_t> sizes{550, 451};
TestHostDeviceVector(n, distribution, starts, sizes);
} }
TEST(HostDeviceVector, TestCopy) { TEST(HostDeviceVector, TestCopy) {
size_t n = 1001; size_t n = 1001;
int n_devices = 2; int device = 0;
auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices)); std::vector<size_t> starts{0};
std::vector<size_t> starts{0, 501}; std::vector<size_t> sizes{1001};
std::vector<size_t> sizes{501, 500};
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVector<int> v; HostDeviceVector<int> v;
{ {
// a separate scope to ensure that v1 is gone before further checks // a separate scope to ensure that v1 is gone before further checks
HostDeviceVector<int> v1; HostDeviceVector<int> v1;
InitHostDeviceVector(n, distribution, &v1); InitHostDeviceVector(n, device, &v1);
v = v1; v = v1;
} }
CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead); CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead);
@ -193,16 +154,16 @@ TEST(HostDeviceVector, Shard) {
h_vec[i] = i; h_vec[i] = i;
} }
HostDeviceVector<int> vec (h_vec); HostDeviceVector<int> vec (h_vec);
auto devices = GPUSet::Range(0, 1); auto device = 0;
vec.Shard(devices); vec.SetDevice(device);
ASSERT_EQ(vec.DeviceSize(0), h_vec.size()); ASSERT_EQ(vec.DeviceSize(), h_vec.size());
ASSERT_EQ(vec.Size(), h_vec.size()); ASSERT_EQ(vec.Size(), h_vec.size());
auto span = vec.DeviceSpan(0); // sync to device auto span = vec.DeviceSpan(); // sync to device
vec.Reshard(GPUDistribution::Empty()); // pull back to cpu, empty devices. vec.SetDevice(-1); // pull back to cpu.
ASSERT_EQ(vec.Size(), h_vec.size()); ASSERT_EQ(vec.Size(), h_vec.size());
ASSERT_TRUE(vec.Devices().IsEmpty()); ASSERT_EQ(vec.DeviceIdx(), -1);
auto h_vec_1 = vec.HostVector(); auto h_vec_1 = vec.HostVector();
ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin())); ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
@ -214,16 +175,16 @@ TEST(HostDeviceVector, Reshard) {
h_vec[i] = i; h_vec[i] = i;
} }
HostDeviceVector<int> vec (h_vec); HostDeviceVector<int> vec (h_vec);
auto devices = GPUSet::Range(0, 1); auto device = 0;
vec.Shard(devices); vec.SetDevice(device);
ASSERT_EQ(vec.DeviceSize(0), h_vec.size()); ASSERT_EQ(vec.DeviceSize(), h_vec.size());
ASSERT_EQ(vec.Size(), h_vec.size()); ASSERT_EQ(vec.Size(), h_vec.size());
PlusOne(&vec); PlusOne(&vec);
vec.Reshard(GPUDistribution::Empty()); vec.SetDevice(-1);
ASSERT_EQ(vec.Size(), h_vec.size()); ASSERT_EQ(vec.Size(), h_vec.size());
ASSERT_TRUE(vec.Devices().IsEmpty()); ASSERT_EQ(vec.DeviceIdx(), -1);
auto h_vec_1 = vec.HostVector(); auto h_vec_1 = vec.HostVector();
for (size_t i = 0; i < h_vec_1.size(); ++i) { for (size_t i = 0; i < h_vec_1.size(); ++i) {
@ -233,97 +194,14 @@ TEST(HostDeviceVector, Reshard) {
TEST(HostDeviceVector, Span) { TEST(HostDeviceVector, Span) {
HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f}; HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
vec.Shard(GPUSet{0, 1}); vec.SetDevice(0);
auto span = vec.DeviceSpan(0); auto span = vec.DeviceSpan();
ASSERT_EQ(vec.DeviceSize(0), span.size()); ASSERT_EQ(vec.DeviceSize(), span.size());
ASSERT_EQ(vec.DevicePointer(0), span.data()); ASSERT_EQ(vec.DevicePointer(), span.data());
auto const_span = vec.ConstDeviceSpan(0); auto const_span = vec.ConstDeviceSpan();
ASSERT_EQ(vec.DeviceSize(0), span.size()); ASSERT_EQ(vec.DeviceSize(), span.size());
ASSERT_EQ(vec.ConstDevicePointer(0), span.data()); ASSERT_EQ(vec.ConstDevicePointer(), span.data());
} }
// Multi-GPUs' test
#if defined(XGBOOST_USE_NCCL)
TEST(HostDeviceVector, MGPU_Shard) {
auto devices = GPUSet::AllVisible();
if (devices.Size() < 2) {
LOG(WARNING) << "Not testing in multi-gpu environment.";
return;
}
std::vector<int> h_vec (2345);
for (size_t i = 0; i < h_vec.size(); ++i) {
h_vec[i] = i;
}
HostDeviceVector<int> vec (h_vec);
// Data size for each device.
std::vector<size_t> devices_size (devices.Size());
// From CPU to GPUs.
vec.Shard(devices);
size_t total_size = 0;
for (size_t i = 0; i < devices.Size(); ++i) {
total_size += vec.DeviceSize(i);
devices_size[i] = vec.DeviceSize(i);
}
ASSERT_EQ(total_size, h_vec.size());
ASSERT_EQ(total_size, vec.Size());
// Shard from devices to devices with different distribution.
EXPECT_ANY_THROW(
vec.Shard(GPUDistribution::Granular(devices, 12)));
// All data is drawn back to CPU
vec.Reshard(GPUDistribution::Empty());
ASSERT_TRUE(vec.Devices().IsEmpty());
ASSERT_EQ(vec.Size(), h_vec.size());
vec.Shard(GPUDistribution::Granular(devices, 12));
total_size = 0;
for (size_t i = 0; i < devices.Size(); ++i) {
total_size += vec.DeviceSize(i);
devices_size[i] = vec.DeviceSize(i);
}
ASSERT_EQ(total_size, h_vec.size());
ASSERT_EQ(total_size, vec.Size());
}
TEST(HostDeviceVector, MGPU_Reshard) {
auto devices = GPUSet::AllVisible();
if (devices.Size() < 2) {
LOG(WARNING) << "Not testing in multi-gpu environment.";
return;
}
size_t n = 1001;
int n_devices = 2;
auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices));
std::vector<size_t> starts{0, 501};
std::vector<size_t> sizes{501, 500};
HostDeviceVector<int> v;
InitHostDeviceVector(n, distribution, &v);
CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead);
PlusOne(&v);
CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite);
CheckHost(&v, GPUAccess::kRead);
CheckHost(&v, GPUAccess::kWrite);
auto distribution1 = GPUDistribution::Overlap(GPUSet::Range(0, n_devices), 1);
v.Reshard(distribution1);
for (size_t i = 0; i < n_devices; ++i) {
auto span = v.DeviceSpan(i); // sync to device
}
std::vector<size_t> starts1{0, 500};
std::vector<size_t> sizes1{501, 501};
CheckDevice(&v, starts1, sizes1, 1, GPUAccess::kWrite);
CheckHost(&v, GPUAccess::kRead);
CheckHost(&v, GPUAccess::kWrite);
}
#endif
} // namespace common } // namespace common
} // namespace xgboost } // namespace xgboost
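
As a usage note, a minimal sketch of the single-GPU round trip the rewritten tests above exercise, relying only on methods that appear in them (the wrapper function is illustrative):

    #include <xgboost/host_device_vector.h>
    #include <vector>

    void RoundTrip() {
      std::vector<int> h_vec(1001, 7);
      xgboost::HostDeviceVector<int> vec(h_vec);

      vec.SetDevice(0);                // attach the vector to device 0
      auto d_span = vec.DeviceSpan();  // first device access copies host -> device
      (void)d_span;                    // device-side work would go here

      vec.SetDevice(-1);               // pull the data back to the CPU
      const std::vector<int>& h_out = vec.HostVector();
      (void)h_out;                     // contents still match h_vec
    }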


@ -33,7 +33,7 @@ std::string GetModelStr() {
}, },
"configuration": { "configuration": {
"booster": "gbtree", "booster": "gbtree",
"n_gpus": "1", "gpu_id": "0",
"num_class": "0", "num_class": "0",
"num_feature": "10", "num_feature": "10",
"objective": "reg:linear", "objective": "reg:linear",


@ -9,13 +9,11 @@
#if defined(__CUDACC__) #if defined(__CUDACC__)
#define TRANSFORM_GPU_RANGE GPUSet::Range(0, 1) #define TRANSFORM_GPU 0
#define TRANSFORM_GPU_DIST GPUDistribution::Block(GPUSet::Range(0, 1))
#else #else
#define TRANSFORM_GPU_RANGE GPUSet::Empty() #define TRANSFORM_GPU -1
#define TRANSFORM_GPU_DIST GPUDistribution::Block(GPUSet::Empty())
#endif #endif
@ -46,13 +44,13 @@ TEST(Transform, DeclareUnifiedTest(Basic)) {
std::vector<bst_float> h_sol(size); std::vector<bst_float> h_sol(size);
InitializeRange(h_sol.begin(), h_sol.end()); InitializeRange(h_sol.begin(), h_sol.end());
const HostDeviceVector<bst_float> in_vec{h_in, TRANSFORM_GPU_DIST}; const HostDeviceVector<bst_float> in_vec{h_in, TRANSFORM_GPU};
HostDeviceVector<bst_float> out_vec{h_out, TRANSFORM_GPU_DIST}; HostDeviceVector<bst_float> out_vec{h_out, TRANSFORM_GPU};
out_vec.Fill(0); out_vec.Fill(0);
Transform<>::Init(TestTransformRange<bst_float>{}, Transform<>::Init(TestTransformRange<bst_float>{},
Range{0, static_cast<Range::DifferenceType>(size)}, Range{0, static_cast<Range::DifferenceType>(size)},
TRANSFORM_GPU_RANGE) TRANSFORM_GPU)
.Eval(&out_vec, &in_vec); .Eval(&out_vec, &in_vec);
std::vector<bst_float> res = out_vec.HostVector(); std::vector<bst_float> res = out_vec.HostVector();


@ -5,87 +5,13 @@
namespace xgboost { namespace xgboost {
namespace common { namespace common {
// Test here is multi gpu specific
TEST(Transform, MGPU_Basic) {
auto devices = GPUSet::AllVisible();
CHECK_GT(devices.Size(), 1);
const size_t size {256};
std::vector<bst_float> h_in(size);
std::vector<bst_float> h_out(size);
InitializeRange(h_in.begin(), h_in.end());
std::vector<bst_float> h_sol(size);
InitializeRange(h_sol.begin(), h_sol.end());
const HostDeviceVector<bst_float> in_vec {h_in,
GPUDistribution::Block(GPUSet::Empty())};
HostDeviceVector<bst_float> out_vec {h_out,
GPUDistribution::Block(GPUSet::Empty())};
out_vec.Fill(0);
in_vec.Shard(GPUDistribution::Granular(devices, 8));
out_vec.Shard(GPUDistribution::Block(devices));
// Granularity is different, sharding will throw.
EXPECT_ANY_THROW(
Transform<>::Init(TestTransformRange<bst_float>{}, Range{0, size}, devices)
.Eval(&out_vec, &in_vec));
Transform<>::Init(TestTransformRange<bst_float>{}, Range{0, size},
devices, false).Eval(&out_vec, &in_vec);
std::vector<bst_float> res = out_vec.HostVector();
ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin()));
}
// Test for multi-classes setting.
template <typename T>
struct TestTransformRangeGranular {
const size_t granularity = 8;
explicit TestTransformRangeGranular(const size_t granular) : granularity{granular} {}
void XGBOOST_DEVICE operator()(size_t _idx,
Span<bst_float> _out, Span<const bst_float> _in) {
auto in_sub = _in.subspan(_idx * granularity, granularity);
auto out_sub = _out.subspan(_idx * granularity, granularity);
for (size_t i = 0; i < granularity; ++i) {
out_sub[i] = in_sub[i];
}
}
};
TEST(Transform, MGPU_Granularity) {
GPUSet devices = GPUSet::All(0, -1);
const size_t size {8990};
const size_t granularity = 10;
GPUDistribution distribution =
GPUDistribution::Granular(devices, granularity);
std::vector<bst_float> h_in(size);
std::vector<bst_float> h_out(size);
InitializeRange(h_in.begin(), h_in.end());
std::vector<bst_float> h_sol(size);
InitializeRange(h_sol.begin(), h_sol.end());
const HostDeviceVector<bst_float> in_vec {h_in, distribution};
HostDeviceVector<bst_float> out_vec {h_out, distribution};
ASSERT_NO_THROW(
Transform<>::Init(
TestTransformRangeGranular<bst_float>{granularity},
Range{0, size / granularity},
distribution)
.Eval(&out_vec, &in_vec));
std::vector<bst_float> res = out_vec.HostVector();
ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin()));
}
TEST(Transform, MGPU_SpecifiedGpuId) { TEST(Transform, MGPU_SpecifiedGpuId) {
if (AllVisibleGPUs() < 2) {
LOG(WARNING) << "Not testing in multi-gpu environment.";
return;
}
// Use one GPU, selecting device ordinal 1 rather than the default 0 // Use one GPU, selecting device ordinal 1 rather than the default 0
auto devices = GPUSet::All(1, 1); auto device = 1;
const size_t size {256}; const size_t size {256};
std::vector<bst_float> h_in(size); std::vector<bst_float> h_in(size);
std::vector<bst_float> h_out(size); std::vector<bst_float> h_out(size);
@ -93,13 +19,11 @@ TEST(Transform, MGPU_SpecifiedGpuId) {
std::vector<bst_float> h_sol(size); std::vector<bst_float> h_sol(size);
InitializeRange(h_sol.begin(), h_sol.end()); InitializeRange(h_sol.begin(), h_sol.end());
const HostDeviceVector<bst_float> in_vec {h_in, const HostDeviceVector<bst_float> in_vec {h_in, device};
GPUDistribution::Block(devices)}; HostDeviceVector<bst_float> out_vec {h_out, device};
HostDeviceVector<bst_float> out_vec {h_out,
GPUDistribution::Block(devices)};
ASSERT_NO_THROW( ASSERT_NO_THROW(
Transform<>::Init(TestTransformRange<bst_float>{}, Range{0, size}, devices) Transform<>::Init(TestTransformRange<bst_float>{}, Range{0, size}, device)
.Eval(&out_vec, &in_vec)); .Eval(&out_vec, &in_vec));
std::vector<bst_float> res = out_vec.HostVector(); std::vector<bst_float> res = out_vec.HostVector();
ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin())); ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin()));
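
Pulling the rewritten Transform tests above into one readable piece: a sketch, written as if inside the same test file (InitializeRange and TestTransformRange as defined there), showing that Transform::Init now takes a single device ordinal where it used to take a GPUSet or GPUDistribution:

    // Sketch: run the identity transform on one chosen device, or on the CPU with -1.
    void RunBasicTransform(int device) {
      const size_t size = 256;
      std::vector<bst_float> h_in(size), h_out(size);
      InitializeRange(h_in.begin(), h_in.end());

      const HostDeviceVector<bst_float> in_vec{h_in, device};  // constructors take the ordinal
      HostDeviceVector<bst_float> out_vec{h_out, device};
      out_vec.Fill(0);

      Transform<>::Init(TestTransformRange<bst_float>{},
                        Range{0, static_cast<Range::DifferenceType>(size)},
                        device)
          .Eval(&out_vec, &in_vec);
      // out_vec.HostVector() now mirrors h_in element by element.
    }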


@ -12,7 +12,7 @@ TEST(GBTree, SelectTreeMethod) {
auto p_dmat {(*p_shared_ptr_dmat).get()}; auto p_dmat {(*p_shared_ptr_dmat).get()};
GenericParameter generic_param; GenericParameter generic_param;
generic_param.InitAllowUnknown(std::vector<Arg>{Arg("n_gpus", "0")}); generic_param.InitAllowUnknown(std::vector<Arg>{});
std::unique_ptr<GradientBooster> p_gbm{ std::unique_ptr<GradientBooster> p_gbm{
GradientBooster::Create("gbtree", &generic_param, {}, 0)}; GradientBooster::Create("gbtree", &generic_param, {}, 0)};
auto& gbtree = dynamic_cast<gbm::GBTree&> (*p_gbm); auto& gbtree = dynamic_cast<gbm::GBTree&> (*p_gbm);
@ -35,7 +35,7 @@ TEST(GBTree, SelectTreeMethod) {
Arg{"num_feature", n_feat}}, p_dmat); Arg{"num_feature", n_feat}}, p_dmat);
ASSERT_EQ(tparam.updater_seq, "grow_quantile_histmaker"); ASSERT_EQ(tparam.updater_seq, "grow_quantile_histmaker");
#ifdef XGBOOST_USE_CUDA #ifdef XGBOOST_USE_CUDA
generic_param.InitAllowUnknown(std::vector<Arg>{Arg{"n_gpus", "1"}}); generic_param.InitAllowUnknown(std::vector<Arg>{Arg{"gpu_id", "0"}});
gbtree.ConfigureWithKnownData({Arg("tree_method", "gpu_hist"), Arg("num_feature", n_feat)}, gbtree.ConfigureWithKnownData({Arg("tree_method", "gpu_hist"), Arg("num_feature", n_feat)},
p_dmat); p_dmat);
ASSERT_EQ(tparam.updater_seq, "grow_gpu_hist"); ASSERT_EQ(tparam.updater_seq, "grow_gpu_hist");
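
A compact restatement of the configuration path this test now checks, written as if inside the test body above (Arg, n_feat, p_dmat and tparam as defined there): with n_gpus gone, selecting the GPU updater comes down to tree_method plus gpu_id:

    // Sketch: gpu_id on GenericParameter plus tree_method picks grow_gpu_hist.
    GenericParameter generic_param;
    generic_param.InitAllowUnknown(std::vector<Arg>{Arg{"gpu_id", "0"}});
    std::unique_ptr<GradientBooster> p_gbm{
        GradientBooster::Create("gbtree", &generic_param, {}, 0)};
    auto& gbtree = dynamic_cast<gbm::GBTree&>(*p_gbm);
    gbtree.ConfigureWithKnownData({Arg{"tree_method", "gpu_hist"},
                                   Arg{"num_feature", n_feat}}, p_dmat);
    // Expected outcome, as asserted above: tparam.updater_seq == "grow_gpu_hist".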


@ -29,9 +29,9 @@
#endif #endif
#if defined(__CUDACC__) #if defined(__CUDACC__)
#define NGPUS 1 #define GPUIDX 0
#else #else
#define NGPUS 0 #define GPUIDX -1
#endif #endif
bool FileExists(const std::string& filename); bool FileExists(const std::string& filename);
@ -189,11 +189,10 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_c
gbm::GBTreeModel CreateTestModel(); gbm::GBTreeModel CreateTestModel();
inline GenericParameter CreateEmptyGenericParam(int gpu_id, int n_gpus) { inline GenericParameter CreateEmptyGenericParam(int gpu_id) {
xgboost::GenericParameter tparam; xgboost::GenericParameter tparam;
std::vector<std::pair<std::string, std::string>> args { std::vector<std::pair<std::string, std::string>> args {
{"gpu_id", std::to_string(gpu_id)}, {"gpu_id", std::to_string(gpu_id)}};
{"n_gpus", std::to_string(n_gpus)}};
tparam.Init(args); tparam.Init(args);
return tparam; return tparam;
} }
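
For the many test changes that follow, a short sketch of how this helper is typically used, written as if inside one of those test files; GPUIDX is the macro defined above (0 when compiled as CUDA, -1 otherwise):

    // Sketch: one helper call decides CPU vs. GPU for the whole test.
    auto lparam = CreateEmptyGenericParam(GPUIDX);   // gpu_id = 0 under __CUDACC__, else -1
    xgboost::Metric* metric = xgboost::Metric::Create("rmse", &lparam);
    metric->Configure({});
    // lparam.gpu_id alone now decides whether the metric runs its CUDA or CPU code.
    delete metric;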


@ -7,7 +7,7 @@
TEST(Linear, shotgun) { TEST(Linear, shotgun) {
auto mat = xgboost::CreateDMatrix(10, 10, 0); auto mat = xgboost::CreateDMatrix(10, 10, 0);
auto lparam = xgboost::CreateEmptyGenericParam(0, 0); auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
{ {
auto updater = std::unique_ptr<xgboost::LinearUpdater>( auto updater = std::unique_ptr<xgboost::LinearUpdater>(
xgboost::LinearUpdater::Create("shotgun", &lparam)); xgboost::LinearUpdater::Create("shotgun", &lparam));
@ -33,7 +33,7 @@ TEST(Linear, shotgun) {
TEST(Linear, coordinate) { TEST(Linear, coordinate) {
auto mat = xgboost::CreateDMatrix(10, 10, 0); auto mat = xgboost::CreateDMatrix(10, 10, 0);
auto lparam = xgboost::CreateEmptyGenericParam(0, 0); auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
auto updater = std::unique_ptr<xgboost::LinearUpdater>( auto updater = std::unique_ptr<xgboost::LinearUpdater>(
xgboost::LinearUpdater::Create("coord_descent", &lparam)); xgboost::LinearUpdater::Create("coord_descent", &lparam));
updater->Configure({{"eta", "1."}}); updater->Configure({{"eta", "1."}});


@ -7,8 +7,7 @@ namespace xgboost {
TEST(Linear, GPUCoordinate) { TEST(Linear, GPUCoordinate) {
auto mat = xgboost::CreateDMatrix(10, 10, 0); auto mat = xgboost::CreateDMatrix(10, 10, 0);
auto lparam = CreateEmptyGenericParam(0, 1); auto lparam = CreateEmptyGenericParam(GPUIDX);
lparam.n_gpus = 1;
auto updater = std::unique_ptr<xgboost::LinearUpdater>( auto updater = std::unique_ptr<xgboost::LinearUpdater>(
xgboost::LinearUpdater::Create("gpu_coord_descent", &lparam)); xgboost::LinearUpdater::Create("gpu_coord_descent", &lparam));
updater->Configure({{"eta", "1."}}); updater->Configure({{"eta", "1."}});


@ -6,7 +6,7 @@
#include "../helpers.h" #include "../helpers.h"
TEST(Metric, DeclareUnifiedTest(RMSE)) { TEST(Metric, DeclareUnifiedTest(RMSE)) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("rmse", &lparam); xgboost::Metric * metric = xgboost::Metric::Create("rmse", &lparam);
metric->Configure({}); metric->Configure({});
ASSERT_STREQ(metric->Name(), "rmse"); ASSERT_STREQ(metric->Name(), "rmse");
@ -20,7 +20,7 @@ TEST(Metric, DeclareUnifiedTest(RMSE)) {
} }
TEST(Metric, DeclareUnifiedTest(RMSLE)) { TEST(Metric, DeclareUnifiedTest(RMSLE)) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("rmsle", &lparam); xgboost::Metric * metric = xgboost::Metric::Create("rmsle", &lparam);
metric->Configure({}); metric->Configure({});
ASSERT_STREQ(metric->Name(), "rmsle"); ASSERT_STREQ(metric->Name(), "rmsle");
@ -32,7 +32,7 @@ TEST(Metric, DeclareUnifiedTest(RMSLE)) {
} }
TEST(Metric, DeclareUnifiedTest(MAE)) { TEST(Metric, DeclareUnifiedTest(MAE)) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("mae", &lparam); xgboost::Metric * metric = xgboost::Metric::Create("mae", &lparam);
metric->Configure({}); metric->Configure({});
ASSERT_STREQ(metric->Name(), "mae"); ASSERT_STREQ(metric->Name(), "mae");
@ -45,7 +45,7 @@ TEST(Metric, DeclareUnifiedTest(MAE)) {
} }
TEST(Metric, DeclareUnifiedTest(LogLoss)) { TEST(Metric, DeclareUnifiedTest(LogLoss)) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("logloss", &lparam); xgboost::Metric * metric = xgboost::Metric::Create("logloss", &lparam);
metric->Configure({}); metric->Configure({});
ASSERT_STREQ(metric->Name(), "logloss"); ASSERT_STREQ(metric->Name(), "logloss");
@ -58,7 +58,7 @@ TEST(Metric, DeclareUnifiedTest(LogLoss)) {
} }
TEST(Metric, DeclareUnifiedTest(Error)) { TEST(Metric, DeclareUnifiedTest(Error)) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("error", &lparam); xgboost::Metric * metric = xgboost::Metric::Create("error", &lparam);
metric->Configure({}); metric->Configure({});
ASSERT_STREQ(metric->Name(), "error"); ASSERT_STREQ(metric->Name(), "error");
@ -90,7 +90,7 @@ TEST(Metric, DeclareUnifiedTest(Error)) {
} }
TEST(Metric, DeclareUnifiedTest(PoissionNegLogLik)) { TEST(Metric, DeclareUnifiedTest(PoissionNegLogLik)) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("poisson-nloglik", &lparam); xgboost::Metric * metric = xgboost::Metric::Create("poisson-nloglik", &lparam);
metric->Configure({}); metric->Configure({});
ASSERT_STREQ(metric->Name(), "poisson-nloglik"); ASSERT_STREQ(metric->Name(), "poisson-nloglik");


@ -4,7 +4,7 @@
#include "../helpers.h" #include "../helpers.h"
TEST(Metric, UnknownMetric) { TEST(Metric, UnknownMetric) {
auto tparam = xgboost::CreateEmptyGenericParam(0, 0); auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = nullptr; xgboost::Metric * metric = nullptr;
EXPECT_ANY_THROW(metric = xgboost::Metric::Create("unknown_name", &tparam)); EXPECT_ANY_THROW(metric = xgboost::Metric::Create("unknown_name", &tparam));
EXPECT_NO_THROW(metric = xgboost::Metric::Create("rmse", &tparam)); EXPECT_NO_THROW(metric = xgboost::Metric::Create("rmse", &tparam));


@ -4,10 +4,9 @@
#include "../helpers.h" #include "../helpers.h"
inline void TestMultiClassError(xgboost::GPUSet const& devices) { inline void TestMultiClassError(int device) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); auto lparam = xgboost::CreateEmptyGenericParam(device);
lparam.gpu_id = *devices.begin(); lparam.gpu_id = device;
lparam.n_gpus = devices.Size();
xgboost::Metric * metric = xgboost::Metric::Create("merror", &lparam); xgboost::Metric * metric = xgboost::Metric::Create("merror", &lparam);
metric->Configure({}); metric->Configure({});
ASSERT_STREQ(metric->Name(), "merror"); ASSERT_STREQ(metric->Name(), "merror");
@ -23,14 +22,12 @@ inline void TestMultiClassError(xgboost::GPUSet const& devices) {
} }
TEST(Metric, DeclareUnifiedTest(MultiClassError)) { TEST(Metric, DeclareUnifiedTest(MultiClassError)) {
auto devices = xgboost::GPUSet::Range(0, NGPUS); TestMultiClassError(GPUIDX);
TestMultiClassError(devices);
} }
inline void TestMultiClassLogLoss(xgboost::GPUSet const& devices) { inline void TestMultiClassLogLoss(int device) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); auto lparam = xgboost::CreateEmptyGenericParam(device);
lparam.gpu_id = *devices.begin(); lparam.gpu_id = device;
lparam.n_gpus = devices.Size();
xgboost::Metric * metric = xgboost::Metric::Create("mlogloss", &lparam); xgboost::Metric * metric = xgboost::Metric::Create("mlogloss", &lparam);
metric->Configure({}); metric->Configure({});
ASSERT_STREQ(metric->Name(), "mlogloss"); ASSERT_STREQ(metric->Name(), "mlogloss");
@ -46,27 +43,31 @@ inline void TestMultiClassLogLoss(xgboost::GPUSet const& devices) {
} }
TEST(Metric, DeclareUnifiedTest(MultiClassLogLoss)) { TEST(Metric, DeclareUnifiedTest(MultiClassLogLoss)) {
auto devices = xgboost::GPUSet::Range(0, NGPUS); TestMultiClassLogLoss(GPUIDX);
TestMultiClassLogLoss(devices);
} }
#if defined(XGBOOST_USE_NCCL) && defined(__CUDACC__) #if defined(XGBOOST_USE_NCCL) && defined(__CUDACC__)
namespace xgboost {
namespace common {
TEST(Metric, MGPU_MultiClassError) { TEST(Metric, MGPU_MultiClassError) {
if (AllVisibleGPUs() < 2) {
LOG(WARNING) << "Not testing in multi-gpu environment.";
return;
}
{ {
auto devices = xgboost::GPUSet::All(0, -1); TestMultiClassError(0);
TestMultiClassError(devices);
} }
{ {
auto devices = xgboost::GPUSet::All(1, -1); TestMultiClassError(1);
TestMultiClassError(devices);
} }
{ {
auto devices = xgboost::GPUSet::All(0, -1); TestMultiClassLogLoss(0);
TestMultiClassLogLoss(devices);
} }
{ {
auto devices = xgboost::GPUSet::All(1, -1); TestMultiClassLogLoss(1);
TestMultiClassLogLoss(devices);
} }
} }
} // namespace common
} // namespace xgboost
#endif // defined(XGBOOST_USE_NCCL) #endif // defined(XGBOOST_USE_NCCL)


@ -4,7 +4,7 @@
#include "../helpers.h" #include "../helpers.h"
TEST(Metric, AMS) { TEST(Metric, AMS) {
auto tparam = xgboost::CreateEmptyGenericParam(0, 0); auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
EXPECT_ANY_THROW(xgboost::Metric::Create("ams", &tparam)); EXPECT_ANY_THROW(xgboost::Metric::Create("ams", &tparam));
xgboost::Metric * metric = xgboost::Metric::Create("ams@0.5f", &tparam); xgboost::Metric * metric = xgboost::Metric::Create("ams@0.5f", &tparam);
ASSERT_STREQ(metric->Name(), "ams@0.5"); ASSERT_STREQ(metric->Name(), "ams@0.5");
@ -23,7 +23,7 @@ TEST(Metric, AMS) {
} }
TEST(Metric, AUC) { TEST(Metric, AUC) {
auto tparam = xgboost::CreateEmptyGenericParam(0, 0); auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("auc", &tparam); xgboost::Metric * metric = xgboost::Metric::Create("auc", &tparam);
ASSERT_STREQ(metric->Name(), "auc"); ASSERT_STREQ(metric->Name(), "auc");
EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10); EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
@ -38,7 +38,7 @@ TEST(Metric, AUC) {
} }
TEST(Metric, AUCPR) { TEST(Metric, AUCPR) {
auto tparam = xgboost::CreateEmptyGenericParam(0, 0); auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric *metric = xgboost::Metric::Create("aucpr", &tparam); xgboost::Metric *metric = xgboost::Metric::Create("aucpr", &tparam);
ASSERT_STREQ(metric->Name(), "aucpr"); ASSERT_STREQ(metric->Name(), "aucpr");
EXPECT_NEAR(GetMetricEval(metric, {0, 0, 1, 1}, {0, 0, 1, 1}), 1, 1e-10); EXPECT_NEAR(GetMetricEval(metric, {0, 0, 1, 1}, {0, 0, 1, 1}), 1, 1e-10);
@ -65,7 +65,7 @@ TEST(Metric, Precision) {
// When the limit for precision is not given, it takes the limit at // When the limit for precision is not given, it takes the limit at
// std::numeric_limits<unsigned>::max(); hence all values are very small // std::numeric_limits<unsigned>::max(); hence all values are very small
// NOTE(AbdealiJK): Maybe this should be fixed to be num_row by default. // NOTE(AbdealiJK): Maybe this should be fixed to be num_row by default.
auto tparam = xgboost::CreateEmptyGenericParam(0, 0); auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("pre", &tparam); xgboost::Metric * metric = xgboost::Metric::Create("pre", &tparam);
ASSERT_STREQ(metric->Name(), "pre"); ASSERT_STREQ(metric->Name(), "pre");
EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-7); EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-7);
@ -89,7 +89,7 @@ TEST(Metric, Precision) {
} }
TEST(Metric, NDCG) { TEST(Metric, NDCG) {
auto tparam = xgboost::CreateEmptyGenericParam(0, 0); auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("ndcg", &tparam); xgboost::Metric * metric = xgboost::Metric::Create("ndcg", &tparam);
ASSERT_STREQ(metric->Name(), "ndcg"); ASSERT_STREQ(metric->Name(), "ndcg");
EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {})); EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {}));
@ -147,7 +147,7 @@ TEST(Metric, NDCG) {
} }
TEST(Metric, MAP) { TEST(Metric, MAP) {
auto tparam = xgboost::CreateEmptyGenericParam(0, 0); auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("map", &tparam); xgboost::Metric * metric = xgboost::Metric::Create("map", &tparam);
ASSERT_STREQ(metric->Name(), "map"); ASSERT_STREQ(metric->Name(), "map");
EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10); EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);


@ -6,7 +6,7 @@
#include "../helpers.h" #include "../helpers.h"
TEST(Objective, DeclareUnifiedTest(HingeObj)) { TEST(Objective, DeclareUnifiedTest(HingeObj)) {
xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS); xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("binary:hinge", &tparam); xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("binary:hinge", &tparam);
xgboost::bst_float eps = std::numeric_limits<xgboost::bst_float>::min(); xgboost::bst_float eps = std::numeric_limits<xgboost::bst_float>::min();


@ -7,7 +7,7 @@
#include "../helpers.h" #include "../helpers.h"
TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassObjGPair)) { TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassObjGPair)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args {{"num_class", "3"}}; std::vector<std::pair<std::string, std::string>> args {{"num_class", "3"}};
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("multi:softmax", &lparam); xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("multi:softmax", &lparam);
@ -25,7 +25,7 @@ TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassObjGPair)) {
} }
TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassBasic)) { TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassBasic)) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args{ std::vector<std::pair<std::string, std::string>> args{
std::pair<std::string, std::string>("num_class", "3")}; std::pair<std::string, std::string>("num_class", "3")};
@ -47,7 +47,7 @@ TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassBasic)) {
} }
TEST(Objective, DeclareUnifiedTest(SoftprobMultiClassBasic)) { TEST(Objective, DeclareUnifiedTest(SoftprobMultiClassBasic)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args { std::vector<std::pair<std::string, std::string>> args {
std::pair<std::string, std::string>("num_class", "3")}; std::pair<std::string, std::string>("num_class", "3")};


@ -7,7 +7,7 @@
#include "../helpers.h" #include "../helpers.h"
TEST(Objective, DeclareUnifiedTest(LinearRegressionGPair)) { TEST(Objective, DeclareUnifiedTest(LinearRegressionGPair)) {
xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS); xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args; std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction * obj =
@ -32,7 +32,7 @@ TEST(Objective, DeclareUnifiedTest(LinearRegressionGPair)) {
} }
TEST(Objective, DeclareUnifiedTest(SquaredLog)) { TEST(Objective, DeclareUnifiedTest(SquaredLog)) {
xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS); xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args; std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction * obj =
@ -56,7 +56,7 @@ TEST(Objective, DeclareUnifiedTest(SquaredLog)) {
} }
TEST(Objective, DeclareUnifiedTest(LogisticRegressionGPair)) { TEST(Objective, DeclareUnifiedTest(LogisticRegressionGPair)) {
xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS); xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args; std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:logistic", &tparam); xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:logistic", &tparam);
@ -72,7 +72,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRegressionGPair)) {
} }
TEST(Objective, DeclareUnifiedTest(LogisticRegressionBasic)) { TEST(Objective, DeclareUnifiedTest(LogisticRegressionBasic)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args; std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:logistic", &lparam); xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:logistic", &lparam);
@ -102,7 +102,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRegressionBasic)) {
} }
TEST(Objective, DeclareUnifiedTest(LogisticRawGPair)) { TEST(Objective, DeclareUnifiedTest(LogisticRawGPair)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args; std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("binary:logitraw", &lparam); xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("binary:logitraw", &lparam);
@ -118,7 +118,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRawGPair)) {
} }
TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) { TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args; std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("count:poisson", &lparam); xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("count:poisson", &lparam);
@ -140,7 +140,7 @@ TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) {
} }
TEST(Objective, DeclareUnifiedTest(PoissonRegressionBasic)) { TEST(Objective, DeclareUnifiedTest(PoissonRegressionBasic)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args; std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("count:poisson", &lparam); xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("count:poisson", &lparam);
@ -168,7 +168,7 @@ TEST(Objective, DeclareUnifiedTest(PoissonRegressionBasic)) {
} }
TEST(Objective, DeclareUnifiedTest(GammaRegressionGPair)) { TEST(Objective, DeclareUnifiedTest(GammaRegressionGPair)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args; std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:gamma", &lparam); xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:gamma", &lparam);
@ -189,7 +189,7 @@ TEST(Objective, DeclareUnifiedTest(GammaRegressionGPair)) {
} }
TEST(Objective, DeclareUnifiedTest(GammaRegressionBasic)) { TEST(Objective, DeclareUnifiedTest(GammaRegressionBasic)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args; std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:gamma", &lparam); xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:gamma", &lparam);
@ -217,7 +217,7 @@ TEST(Objective, DeclareUnifiedTest(GammaRegressionBasic)) {
} }
TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) { TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args; std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:tweedie", &lparam); xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:tweedie", &lparam);
@ -241,7 +241,7 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) {
#if defined(__CUDACC__) #if defined(__CUDACC__)
TEST(Objective, CPU_vs_CUDA) { TEST(Objective, CPU_vs_CUDA) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, 1); xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::ObjFunction * obj = xgboost::ObjFunction * obj =
xgboost::ObjFunction::Create("reg:squarederror", &lparam); xgboost::ObjFunction::Create("reg:squarederror", &lparam);
@ -267,12 +267,12 @@ TEST(Objective, CPU_vs_CUDA) {
{ {
// CPU // CPU
lparam.n_gpus = 0; lparam.gpu_id = -1;
obj->GetGradient(preds, info, 0, &cpu_out_preds); obj->GetGradient(preds, info, 0, &cpu_out_preds);
} }
{ {
// CUDA // CUDA
lparam.n_gpus = 1; lparam.gpu_id = 0;
obj->GetGradient(preds, info, 0, &cuda_out_preds); obj->GetGradient(preds, info, 0, &cuda_out_preds);
} }
@ -294,7 +294,7 @@ TEST(Objective, CPU_vs_CUDA) {
#endif #endif
TEST(Objective, DeclareUnifiedTest(TweedieRegressionBasic)) { TEST(Objective, DeclareUnifiedTest(TweedieRegressionBasic)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args; std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:tweedie", &lparam); xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:tweedie", &lparam);
@ -325,7 +325,7 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionBasic)) {
// CoxRegression not implemented in GPU code, no need for testing. // CoxRegression not implemented in GPU code, no need for testing.
#if !defined(__CUDACC__) #if !defined(__CUDACC__)
TEST(Objective, CoxRegressionGPair) { TEST(Objective, CoxRegressionGPair) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, 0); xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args; std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction * obj =
xgboost::ObjFunction::Create("survival:cox", &lparam); xgboost::ObjFunction::Create("survival:cox", &lparam);


@ -6,7 +6,7 @@
namespace xgboost { namespace xgboost {
TEST(Plugin, ExampleObjective) { TEST(Plugin, ExampleObjective) {
xgboost::GenericParameter tparam = CreateEmptyGenericParam(0, 0); xgboost::GenericParameter tparam = CreateEmptyGenericParam(GPUIDX);
auto * obj = xgboost::ObjFunction::Create("mylogistic", &tparam); auto * obj = xgboost::ObjFunction::Create("mylogistic", &tparam);
ASSERT_EQ(obj->DefaultEvalMetric(), std::string{"error"}); ASSERT_EQ(obj->DefaultEvalMetric(), std::string{"error"});
delete obj; delete obj;


@ -6,7 +6,7 @@
namespace xgboost { namespace xgboost {
TEST(cpu_predictor, Test) { TEST(cpu_predictor, Test) {
auto lparam = CreateEmptyGenericParam(0, 0); auto lparam = CreateEmptyGenericParam(GPUIDX);
std::unique_ptr<Predictor> cpu_predictor = std::unique_ptr<Predictor> cpu_predictor =
std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam)); std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
@ -59,7 +59,7 @@ TEST(cpu_predictor, ExternalMemoryTest) {
dmlc::TemporaryDirectory tmpdir; dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm"; std::string filename = tmpdir.path + "/big.libsvm";
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(12, 64, filename); std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(12, 64, filename);
auto lparam = CreateEmptyGenericParam(0, 0); auto lparam = CreateEmptyGenericParam(GPUIDX);
std::unique_ptr<Predictor> cpu_predictor = std::unique_ptr<Predictor> cpu_predictor =
std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam)); std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));


@ -33,8 +33,8 @@ namespace xgboost {
namespace predictor { namespace predictor {
TEST(gpu_predictor, Test) { TEST(gpu_predictor, Test) {
auto cpu_lparam = CreateEmptyGenericParam(0, 0); auto cpu_lparam = CreateEmptyGenericParam(-1);
auto gpu_lparam = CreateEmptyGenericParam(0, 1); auto gpu_lparam = CreateEmptyGenericParam(0);
std::unique_ptr<Predictor> gpu_predictor = std::unique_ptr<Predictor> gpu_predictor =
std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &gpu_lparam)); std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &gpu_lparam));
@ -69,7 +69,7 @@ TEST(gpu_predictor, Test) {
} }
TEST(gpu_predictor, ExternalMemoryTest) { TEST(gpu_predictor, ExternalMemoryTest) {
auto lparam = CreateEmptyGenericParam(0, 1); auto lparam = CreateEmptyGenericParam(0);
std::unique_ptr<Predictor> gpu_predictor = std::unique_ptr<Predictor> gpu_predictor =
std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &lparam)); std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &lparam));
gpu_predictor->Configure({}, {}); gpu_predictor->Configure({}, {});
@ -83,26 +83,26 @@ TEST(gpu_predictor, ExternalMemoryTest) {
std::string file1 = tmpdir.path + "/big_1.libsvm"; std::string file1 = tmpdir.path + "/big_1.libsvm";
std::string file2 = tmpdir.path + "/big_2.libsvm"; std::string file2 = tmpdir.path + "/big_2.libsvm";
dmats.push_back(CreateSparsePageDMatrix(9, 64UL, file0)); dmats.push_back(CreateSparsePageDMatrix(9, 64UL, file0));
dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1)); // dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1));
dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2)); // dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2));
for (const auto& dmat: dmats) { for (const auto& dmat: dmats) {
// Test predict batch dmat->Info().base_margin_.Resize(dmat->Info().num_row_ * n_classes, 0.5);
HostDeviceVector<float> out_predictions; HostDeviceVector<float> out_predictions;
gpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0); gpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
EXPECT_EQ(out_predictions.Size(), dmat->Info().num_row_ * n_classes); EXPECT_EQ(out_predictions.Size(), dmat->Info().num_row_ * n_classes);
const std::vector<float> &host_vector = out_predictions.ConstHostVector(); const std::vector<float> &host_vector = out_predictions.ConstHostVector();
for (int i = 0; i < host_vector.size() / n_classes; i++) { for (int i = 0; i < host_vector.size() / n_classes; i++) {
ASSERT_EQ(host_vector[i * n_classes], 1.5); ASSERT_EQ(host_vector[i * n_classes], 2.0);
ASSERT_EQ(host_vector[i * n_classes + 1], 0.); ASSERT_EQ(host_vector[i * n_classes + 1], 0.5);
ASSERT_EQ(host_vector[i * n_classes + 2], 0.); ASSERT_EQ(host_vector[i * n_classes + 2], 0.5);
} }
} }
} }
// Test whether pickling preserves predictor parameters // Test whether pickling preserves predictor parameters
TEST(gpu_predictor, PicklingTest) { TEST(gpu_predictor, PicklingTest) {
int const ngpu = 1; int const gpuid = 0;
dmlc::TemporaryDirectory tempdir; dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm"; const std::string tmp_file = tempdir.path + "/simple.libsvm";
@ -134,7 +134,7 @@ TEST(gpu_predictor, PicklingTest) {
ASSERT_EQ(XGBoosterSetParam( ASSERT_EQ(XGBoosterSetParam(
bst, "tree_method", "gpu_hist"), 0) << XGBGetLastError(); bst, "tree_method", "gpu_hist"), 0) << XGBGetLastError();
ASSERT_EQ(XGBoosterSetParam( ASSERT_EQ(XGBoosterSetParam(
bst, "n_gpus", std::to_string(ngpu).c_str()), 0) << XGBGetLastError(); bst, "gpu_id", std::to_string(gpuid).c_str()), 0) << XGBGetLastError();
ASSERT_EQ(XGBoosterSetParam(bst, "predictor", "gpu_predictor"), 0) << XGBGetLastError(); ASSERT_EQ(XGBoosterSetParam(bst, "predictor", "gpu_predictor"), 0) << XGBGetLastError();
// Run boosting iterations // Run boosting iterations
@ -160,7 +160,7 @@ TEST(gpu_predictor, PicklingTest) {
{ // Query predictor { // Query predictor
const auto& kwargs = QueryBoosterConfigurationArguments(bst2); const auto& kwargs = QueryBoosterConfigurationArguments(bst2);
ASSERT_EQ(kwargs.at("predictor"), "gpu_predictor"); ASSERT_EQ(kwargs.at("predictor"), "gpu_predictor");
ASSERT_EQ(kwargs.at("n_gpus"), std::to_string(ngpu).c_str()); ASSERT_EQ(kwargs.at("gpu_id"), std::to_string(gpuid).c_str());
} }
{ // Change predictor and query again { // Change predictor and query again
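PicklingTest above now drives device selection through gpu_id alone. As a rough illustration of the same C-API calls outside the test harness, here is a hedged sketch; the helper name is made up, and it assumes a valid BoosterHandle created elsewhere:

#include <string>
#include <xgboost/c_api.h>

// Illustrative only: bind a booster to a single device ordinal.  A
// non-negative gpu_id selects that GPU (and the GPU predictor, mirroring the
// test); -1 keeps the booster on the CPU.  The old "n_gpus" knob is not set.
int ConfigureDevice(BoosterHandle bst, int gpu_id) {
  int ret = XGBoosterSetParam(bst, "gpu_id", std::to_string(gpu_id).c_str());
  if (ret == 0 && gpu_id >= 0) {
    ret = XGBoosterSetParam(bst, "predictor", "gpu_predictor");
  }
  return ret;
}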


@ -168,10 +168,9 @@ TEST(Learner, IO) {
std::unique_ptr<Learner> learner {Learner::Create(mat)}; std::unique_ptr<Learner> learner {Learner::Create(mat)};
learner->SetParams({Arg{"tree_method", "auto"}, learner->SetParams({Arg{"tree_method", "auto"},
Arg{"predictor", "gpu_predictor"}, Arg{"predictor", "gpu_predictor"},
Arg{"n_gpus", "1"}}); Arg{"gpu_id", "0"}});
learner->UpdateOneIter(0, p_dmat.get()); learner->UpdateOneIter(0, p_dmat.get());
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0); ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
dmlc::TemporaryDirectory tempdir; dmlc::TemporaryDirectory tempdir;
const std::string fname = tempdir.path + "/model.bst"; const std::string fname = tempdir.path + "/model.bst";
@ -185,7 +184,6 @@ TEST(Learner, IO) {
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r")); std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r"));
learner->Load(fi.get()); learner->Load(fi.get());
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0); ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 0);
delete pp_dmat; delete pp_dmat;
} }
@ -208,31 +206,27 @@ TEST(Learner, GPUConfiguration) {
Arg{"updater", "gpu_coord_descent"}}); Arg{"updater", "gpu_coord_descent"}});
learner->UpdateOneIter(0, p_dmat.get()); learner->UpdateOneIter(0, p_dmat.get());
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0); ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
} }
{ {
std::unique_ptr<Learner> learner {Learner::Create(mat)}; std::unique_ptr<Learner> learner {Learner::Create(mat)};
learner->SetParams({Arg{"tree_method", "gpu_hist"}}); learner->SetParams({Arg{"tree_method", "gpu_hist"}});
learner->UpdateOneIter(0, p_dmat.get()); learner->UpdateOneIter(0, p_dmat.get());
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0); ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
} }
{ {
// with CPU algorithm // with CPU algorithm
std::unique_ptr<Learner> learner {Learner::Create(mat)}; std::unique_ptr<Learner> learner {Learner::Create(mat)};
learner->SetParams({Arg{"tree_method", "hist"}}); learner->SetParams({Arg{"tree_method", "hist"}});
learner->UpdateOneIter(0, p_dmat.get()); learner->UpdateOneIter(0, p_dmat.get());
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0); ASSERT_EQ(learner->GetGenericParameter().gpu_id, -1);
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 0);
} }
{ {
// with CPU algorithm, but `n_gpus` takes priority // with CPU algorithm, but `gpu_id` takes priority
std::unique_ptr<Learner> learner {Learner::Create(mat)}; std::unique_ptr<Learner> learner {Learner::Create(mat)};
learner->SetParams({Arg{"tree_method", "hist"}, learner->SetParams({Arg{"tree_method", "hist"},
Arg{"n_gpus", "1"}}); Arg{"gpu_id", "0"}});
learner->UpdateOneIter(0, p_dmat.get()); learner->UpdateOneIter(0, p_dmat.get());
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0); ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
} }
{ {
// With CPU algorithm but GPU Predictor, this is to simulate when // With CPU algorithm but GPU Predictor, this is to simulate when
@ -243,7 +237,6 @@ TEST(Learner, GPUConfiguration) {
Arg{"predictor", "gpu_predictor"}}); Arg{"predictor", "gpu_predictor"}});
learner->UpdateOneIter(0, p_dmat.get()); learner->UpdateOneIter(0, p_dmat.get());
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0); ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
} }
delete pp_dmat; delete pp_dmat;
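Taken together, the Learner tests above assert that configuration collapses to one ordinal: -1 when only CPU algorithms are requested, 0 (or the user's explicit choice) whenever a GPU tree method, updater, or predictor is involved. A hypothetical consumer-side sketch of what that buys downstream code; nothing here is quoted from the library itself:

#include <xgboost/learner.h>

// Hypothetical sketch: with the per-booster device list gone, a component only
// has to inspect a single ordinal to choose between CPU and GPU code paths.
bool RunsOnGpu(xgboost::Learner* learner) {
  // -1 means CPU only; any non-negative value is the CUDA device ordinal.
  return learner->GetGenericParameter().gpu_id >= 0;
}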


@ -366,7 +366,7 @@ TEST(GpuHist, EvaluateSplits) {
ASSERT_NEAR(res[1].fvalue, 0.26, xgboost::kRtEps); ASSERT_NEAR(res[1].fvalue, 0.26, xgboost::kRtEps);
} }
void TestHistogramIndexImpl(int n_gpus) { void TestHistogramIndexImpl() {
// Test if the compressed histogram index matches when using a sparse // Test if the compressed histogram index matches when using a sparse
// dmatrix with and without using external memory // dmatrix with and without using external memory
@ -384,7 +384,7 @@ void TestHistogramIndexImpl(int n_gpus) {
{"max_leaves", "0"} {"max_leaves", "0"}
}; };
GenericParameter generic_param(CreateEmptyGenericParam(0, n_gpus)); GenericParameter generic_param(CreateEmptyGenericParam(0));
hist_maker.Configure(training_params, &generic_param); hist_maker.Configure(training_params, &generic_param);
hist_maker.InitDataOnce(hist_maker_dmat.get()); hist_maker.InitDataOnce(hist_maker_dmat.get());
@ -412,7 +412,7 @@ void TestHistogramIndexImpl(int n_gpus) {
} }
TEST(GpuHist, TestHistogramIndex) { TEST(GpuHist, TestHistogramIndex) {
TestHistogramIndexImpl(1); TestHistogramIndexImpl();
} }
} // namespace tree } // namespace tree
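TestHistogramIndexImpl no longer takes a GPU count, and the shared helper CreateEmptyGenericParam is now called with a single ordinal. Its definition is not part of this diff, so the following is only a guess at the simplified shape, including the header path; the real helper in the test sources may differ:

#include <string>
#include <utility>
#include <vector>

#include <xgboost/generic_parameters.h>

// Guessed single-argument form of the test helper; gpu_id = -1 asks for a
// CPU-only configuration, a non-negative value pins the tests to that device.
inline xgboost::GenericParameter CreateEmptyGenericParam(int gpu_id) {
  xgboost::GenericParameter param;
  param.InitAllowUnknown(std::vector<std::pair<std::string, std::string>>{});
  param.gpu_id = gpu_id;
  return param;
}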


@ -29,7 +29,7 @@ TEST(Updater, Prune) {
{0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f} }; {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f} };
auto dmat = CreateDMatrix(32, 16, 0.4, 3); auto dmat = CreateDMatrix(32, 16, 0.4, 3);
auto lparam = CreateEmptyGenericParam(0, 0); auto lparam = CreateEmptyGenericParam(GPUIDX);
// prepare tree // prepare tree
RegTree tree = RegTree(); RegTree tree = RegTree();


@ -25,7 +25,7 @@ TEST(Updater, Refresh) {
{"reg_lambda", "1"}}; {"reg_lambda", "1"}};
RegTree tree = RegTree(); RegTree tree = RegTree();
auto lparam = CreateEmptyGenericParam(0, 0); auto lparam = CreateEmptyGenericParam(GPUIDX);
tree.param.InitAllowUnknown(cfg); tree.param.InitAllowUnknown(cfg);
std::vector<RegTree*> trees {&tree}; std::vector<RegTree*> trees {&tree};
std::unique_ptr<TreeUpdater> refresher(TreeUpdater::Create("refresh", &lparam)); std::unique_ptr<TreeUpdater> refresher(TreeUpdater::Create("refresh", &lparam));
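The Prune and Refresh updater tests above pass a GPUIDX constant whose definition is not shown in this diff; presumably it resolves to a real device only when the test binary is built with CUDA. A speculative sketch of such a macro, offered purely as a reading aid:

// Speculative: the real definition lives in the shared test helpers.
#if defined(XGBOOST_USE_CUDA)
#define GPUIDX 0   // exercise the updaters on the first GPU in CUDA builds
#else
#define GPUIDX -1  // -1 selects the CPU-only configuration
#endif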


@ -61,7 +61,6 @@ base_params = {
def params_basic_1x4(rank): def params_basic_1x4(rank):
return dict(base_params, **{ return dict(base_params, **{
'n_gpus': 1,
'gpu_id': rank, 'gpu_id': rank,
}), 20 }), 20


@ -23,7 +23,7 @@ class TestGPULinear(unittest.TestCase):
@pytest.mark.skipif(**tm.no_sklearn()) @pytest.mark.skipif(**tm.no_sklearn())
def test_gpu_coordinate(self): def test_gpu_coordinate(self):
parameters = self.common_param.copy() parameters = self.common_param.copy()
parameters['n_gpus'] = [1] parameters['gpu_id'] = [0]
for param in test_linear.parameter_combinations(parameters): for param in test_linear.parameter_combinations(parameters):
results = test_linear.run_suite( results = test_linear.run_suite(
param, 150, self.datasets, scale_features=True) param, 150, self.datasets, scale_features=True)


@ -21,7 +21,7 @@ datasets = ["Boston", "Cancer", "Digits", "Sparse regression",
class TestGPU(unittest.TestCase): class TestGPU(unittest.TestCase):
def test_gpu_hist(self): def test_gpu_hist(self):
test_param = parameter_combinations({'n_gpus': [1], 'max_depth': [2, 8], test_param = parameter_combinations({'gpu_id': [0], 'max_depth': [2, 8],
'max_leaves': [255, 4], 'max_leaves': [255, 4],
'max_bin': [2, 256], 'max_bin': [2, 256],
'grow_policy': ['lossguide']}) 'grow_policy': ['lossguide']})
@ -38,8 +38,7 @@ class TestGPU(unittest.TestCase):
@pytest.mark.mgpu @pytest.mark.mgpu
def test_specified_gpu_id_gpu_update(self): def test_specified_gpu_id_gpu_update(self):
variable_param = {'n_gpus': [1], variable_param = {'gpu_id': [1],
'gpu_id': [1],
'max_depth': [8], 'max_depth': [8],
'max_leaves': [255, 4], 'max_leaves': [255, 4],
'max_bin': [2, 64], 'max_bin': [2, 64],


@ -63,7 +63,7 @@ class TestGPU(unittest.TestCase):
'nthread': 0, 'nthread': 0,
'eta': 1, 'eta': 1,
'verbosity': 3, 'verbosity': 3,
'n_gpus': 1, 'gpu_id': 0,
'objective': 'binary:logistic', 'objective': 'binary:logistic',
'max_bin': max_bin, 'max_bin': max_bin,
'eval_metric': 'auc'} 'eval_metric': 'auc'}