diff --git a/demo/c-api/c-api-demo.c b/demo/c-api/c-api-demo.c
index e107876cb..9024badcd 100644
--- a/demo/c-api/c-api-demo.c
+++ b/demo/c-api/c-api-demo.c
@@ -36,13 +36,12 @@ int main(int argc, char** argv) {
   // https://xgboost.readthedocs.io/en/latest/parameter.html
   safe_xgboost(XGBoosterSetParam(booster, "tree_method", use_gpu ? "gpu_hist" : "hist"));
   if (use_gpu) {
-    // set the number of GPUs and the first GPU to use;
+    // set the GPU to use;
     // this is not necessary, but provided here as an illustration
-    safe_xgboost(XGBoosterSetParam(booster, "n_gpus", "1"));
     safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "0"));
   } else {
     // avoid evaluating objective and metric on a GPU
-    safe_xgboost(XGBoosterSetParam(booster, "n_gpus", "0"));
+    safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "-1"));
   }
 
   safe_xgboost(XGBoosterSetParam(booster, "objective", "binary:logistic"));
diff --git a/include/xgboost/generic_parameters.h b/include/xgboost/generic_parameters.h
index e4cef9cf0..47084ec4b 100644
--- a/include/xgboost/generic_parameters.h
+++ b/include/xgboost/generic_parameters.h
@@ -19,10 +19,8 @@ struct GenericParameter : public dmlc::Parameter<GenericParameter> {
   // number of threads to use if OpenMP is enabled
   // if equals 0, use system default
   int nthread;
-  // primary device.
+  // primary device, -1 means no gpu.
   int gpu_id;
-  // number of devices to use, -1 implies using all available devices.
-  int n_gpus;
   // declare parameters
   DMLC_DECLARE_PARAMETER(GenericParameter) {
     DMLC_DECLARE_FIELD(seed).set_default(0).describe(
@@ -36,15 +34,20 @@ struct GenericParameter : public dmlc::Parameter<GenericParameter> {
     DMLC_DECLARE_FIELD(nthread).set_default(0).describe(
         "Number of threads to use.");
     DMLC_DECLARE_FIELD(gpu_id)
-        .set_default(0)
+        .set_default(-1)
+        .set_lower_bound(-1)
        .describe("The primary GPU device ordinal.");
     DMLC_DECLARE_FIELD(n_gpus)
         .set_default(0)
-        .set_range(0, 1)
+        .set_range(0, 0)
         .describe("Deprecated. Single process multi-GPU training is no longer supported. "
                   "Please switch to distributed training with one process per GPU. "
                   "This can be done using Dask or Spark.");
   }
+
+ private:
+  // number of devices to use (deprecated).
+  int n_gpus;
 };
 }  // namespace xgboost
diff --git a/plugin/example/custom_obj.cc b/plugin/example/custom_obj.cc
index 6ce653875..b6cb174de 100644
--- a/plugin/example/custom_obj.cc
+++ b/plugin/example/custom_obj.cc
@@ -60,8 +60,8 @@ class MyLogistic : public ObjFunction {
   void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
     // transform margin value to probability.
     std::vector<bst_float> &preds = io_preds->HostVector();
-    for (size_t i = 0; i < preds.size(); ++i) {
-      preds[i] = 1.0f / (1.0f + std::exp(-preds[i]));
+    for (auto& pred : preds) {
+      pred = 1.0f / (1.0f + std::exp(-pred));
     }
   }
   bst_float ProbToMargin(bst_float base_score) const override {
diff --git a/src/common/common.cc b/src/common/common.cc
index 60ba3e16e..8f4f4b5c8 100644
--- a/src/common/common.cc
+++ b/src/common/common.cc
@@ -22,48 +22,12 @@ using RandomThreadLocalStore = dmlc::ThreadLocalStore<RandomThreadLocalEntry>;
 GlobalRandomEngine& GlobalRandom() {
   return RandomThreadLocalStore::Get()->engine;
 }
-}  // namespace common
 
 #if !defined(XGBOOST_USE_CUDA)
-int AllVisibleImpl::AllVisible() {
+int AllVisibleGPUs() {
   return 0;
 }
 #endif  // !defined(XGBOOST_USE_CUDA)
 
-constexpr GPUSet::GpuIdType GPUSet::kAll;
-
-GPUSet GPUSet::All(GpuIdType gpu_id, GpuIdType n_gpus, int32_t n_rows) {
-  CHECK_GE(gpu_id, 0) << "gpu_id must be >= 0.";
-  CHECK_GE(n_gpus, -1) << "n_gpus must be >= -1.";
-
-  GpuIdType const n_devices_visible = AllVisible().Size();
-  CHECK_LE(n_gpus, n_devices_visible);
-  if (n_devices_visible == 0 || n_gpus == 0 || n_rows == 0) {
-    LOG(DEBUG) << "Runing on CPU.";
-    return Empty();
-  }
-
-  GpuIdType const n_available_devices = n_devices_visible - gpu_id;
-
-  if (n_gpus == kAll) {  // Use all devices starting from `gpu_id'.
-    CHECK(gpu_id < n_devices_visible)
-        << "\ngpu_id should be less than number of visible devices.\ngpu_id: "
-        << gpu_id
-        << ", number of visible devices: "
-        << n_devices_visible;
-    GpuIdType n_devices =
-        n_available_devices < n_rows ? n_available_devices : n_rows;
-    LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
-    return Range(gpu_id, n_devices);
-  } else {  // Use devices in ( gpu_id, gpu_id + n_gpus ).
-    CHECK_LE(n_gpus, n_available_devices)
-        << "Starting from gpu id: " << gpu_id << ", there are only "
-        << n_available_devices << " available devices, while n_gpus is set to: "
-        << n_gpus;
-    GpuIdType n_devices = n_gpus < n_rows ? n_gpus : n_rows;
-    LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
-    return Range(gpu_id, n_devices);
-  }
-}
-
+}  // namespace common
 }  // namespace xgboost
diff --git a/src/common/common.cu b/src/common/common.cu
index 43fa4d465..d30fbc0ae 100644
--- a/src/common/common.cu
+++ b/src/common/common.cu
@@ -4,8 +4,9 @@
 #include "common.h"
 
 namespace xgboost {
+namespace common {
 
-int AllVisibleImpl::AllVisible() {
+int AllVisibleGPUs() {
   int n_visgpus = 0;
   try {
     // When compiled with CUDA but running on CPU only device,
@@ -17,4 +18,5 @@
   return n_visgpus;
 }
 
+}  // namespace common
 }  // namespace xgboost
diff --git a/src/common/common.h b/src/common/common.h
index 9eab46add..ccff63eb7 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -140,88 +140,8 @@ class Range {
   Iterator begin_;
   Iterator end_;
 };
+
+int AllVisibleGPUs();
 }  // namespace common
-
-struct AllVisibleImpl {
-  static int AllVisible();
-};
-/* \brief set of devices across which HostDeviceVector can be distributed.
- *
- * Currently implemented as a range, but can be changed later to something else,
- * e.g. a bitset
- */
-class GPUSet {
- public:
-  using GpuIdType = int;
-  static constexpr GpuIdType kAll = -1;
-
-  explicit GPUSet(int start = 0, int ndevices = 0)
-      : devices_(start, start + ndevices) {}
-
-  static GPUSet Empty() { return GPUSet(); }
-
-  static GPUSet Range(GpuIdType start, GpuIdType n_gpus) {
-    return n_gpus <= 0 ? Empty() : GPUSet{start, n_gpus};
-  }
-  /*! \brief n_gpus and num_rows both are upper bounds. */
-  static GPUSet All(GpuIdType gpu_id, GpuIdType n_gpus,
-                    GpuIdType num_rows = std::numeric_limits<GpuIdType>::max());
-
-  static GPUSet AllVisible() {
-    GpuIdType n = AllVisibleImpl::AllVisible();
-    return Range(0, n);
-  }
-
-  size_t Size() const {
-    GpuIdType size = *devices_.end() - *devices_.begin();
-    GpuIdType res = size < 0 ? 0 : size;
-    return static_cast<size_t>(res);
-  }
-
-  /*
-   * By default, we have two configurations of identifying device, one
-   * is the device id obtained from `cudaGetDevice'.  But we sometimes
-   * store objects that allocated one for each device in a list, which
-   * requires a zero-based index.
-   *
-   * Hence, `DeviceId' converts a zero-based index to actual device id,
-   * `Index' converts a device id to a zero-based index.
-   */
-  GpuIdType DeviceId(size_t index) const {
-    GpuIdType result = *devices_.begin() + static_cast<GpuIdType>(index);
-    CHECK(Contains(result)) << "\nDevice " << result << " is not in GPUSet."
-                            << "\nIndex: " << index
-                            << "\nGPUSet: (" << *begin() << ", " << *end() << ")"
-                            << std::endl;
-    return result;
-  }
-  size_t Index(GpuIdType device) const {
-    CHECK(Contains(device)) << "\nDevice " << device << " is not in GPUSet."
-                            << "\nGPUSet: (" << *begin() << ", " << *end() << ")"
-                            << std::endl;
-    size_t result = static_cast<size_t>(device - *devices_.begin());
-    return result;
-  }
-
-  bool IsEmpty() const { return Size() == 0; }
-
-  bool Contains(GpuIdType device) const {
-    return *devices_.begin() <= device && device < *devices_.end();
-  }
-
-  common::Range::Iterator begin() const { return devices_.begin(); }  // NOLINT
-  common::Range::Iterator end() const { return devices_.end(); }  // NOLINT
-
-  friend bool operator==(const GPUSet& lhs, const GPUSet& rhs) {
-    return lhs.devices_ == rhs.devices_;
-  }
-  friend bool operator!=(const GPUSet& lhs, const GPUSet& rhs) {
-    return !(lhs == rhs);
-  }
-
- private:
-  common::Range devices_;
-};
-
 }  // namespace xgboost
 #endif  // XGBOOST_COMMON_COMMON_H_
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 0eee94009..979167459 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -72,22 +72,6 @@ const T *Raw(const thrust::device_vector<T> &v) {  // NOLINT
   return raw_pointer_cast(v.data());
 }
 
-// if n_devices=-1, then use all visible devices
-inline void SynchronizeNDevices(xgboost::GPUSet devices) {
-  devices = devices.IsEmpty() ? xgboost::GPUSet::AllVisible() : devices;
-  for (auto const d : devices) {
-    safe_cuda(cudaSetDevice(d));
-    safe_cuda(cudaDeviceSynchronize());
-  }
-}
-
-inline void SynchronizeAll() {
-  for (int device_idx : xgboost::GPUSet::AllVisible()) {
-    safe_cuda(cudaSetDevice(device_idx));
-    safe_cuda(cudaDeviceSynchronize());
-  }
-}
-
 inline size_t AvailableMemory(int device_idx) {
   size_t device_free = 0;
   size_t device_total = 0;
@@ -119,7 +103,7 @@ inline size_t MaxSharedMemory(int device_idx) {
 }
 
 inline void CheckComputeCapability() {
-  for (int d_idx : xgboost::GPUSet::AllVisible()) {
+  for (int d_idx = 0; d_idx < xgboost::common::AllVisibleGPUs(); ++d_idx) {
     cudaDeviceProp prop;
     safe_cuda(cudaGetDeviceProperties(&prop, d_idx));
     std::ostringstream oss;
diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu
index 4131c59ed..d1ef37df1 100644
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -35,7 +35,6 @@ __global__ void FindCutsK
   if (icut >= ncuts) {
     return;
   }
-  WXQSketch::Entry v;
   int isample = 0;
   if (icut == 0) {
     isample = 0;
@@ -59,11 +58,14 @@ struct IsNotNaN {
   __device__ bool operator()(float a) const { return !isnan(a); }
 };
 
-__global__ void UnpackFeaturesK
-(float* __restrict__ fvalues, float* __restrict__ feature_weights,
- const size_t* __restrict__ row_ptrs, const float* __restrict__ weights,
- Entry* entries, size_t nrows_array, int ncols, size_t row_begin_ptr,
- size_t nrows) {
+__global__ void UnpackFeaturesK(float* __restrict__ fvalues,
+                                float* __restrict__ feature_weights,
+                                const size_t* __restrict__ row_ptrs,
+                                const float* __restrict__ weights,
+                                Entry* entries,
+                                size_t nrows_array,
+                                size_t row_begin_ptr,
+                                size_t nrows) {
   size_t irow = threadIdx.x + size_t(blockIdx.x) * blockDim.x;
   if (irow >= nrows) {
     return;
@@ -102,8 +104,9 @@ struct SketchContainer {
     const MetaInfo &info = dmat->Info();
     // Initialize Sketches for this dmatrix
     sketches_.resize(info.num_col_);
-#pragma omp parallel for schedule(static) if (info.num_col_ > kOmpNumColsParallelizeLimit)
-    for (int icol = 0; icol < info.num_col_; ++icol) {
+#pragma omp parallel for default(none) shared(info, param) schedule(static) \
+if (info.num_col_ > kOmpNumColsParallelizeLimit)  // NOLINT
+    for (int icol = 0; icol < info.num_col_; ++icol) {  // NOLINT
       sketches_[icol].Init(info.num_row_, 1.0 / (8 * param.max_bin));
     }
   }
@@ -120,8 +123,6 @@ struct GPUSketcher {
   // manage memory for a single GPU
   class DeviceShard {
     int device_;
-    bst_uint row_begin_;  // The row offset for this shard
-    bst_uint row_end_;
     bst_uint n_rows_;
     int num_cols_{0};
     size_t n_cuts_{0};
@@ -131,27 +132,31 @@ struct GPUSketcher {
     tree::TrainParam param_;
     SketchContainer *sketch_container_;
 
-    dh::device_vector<size_t> row_ptrs_;
-    dh::device_vector<Entry> entries_;
-    dh::device_vector<bst_float> fvalues_;
-    dh::device_vector<bst_float> feature_weights_;
-    dh::device_vector<bst_float> fvalues_cur_;
-    dh::device_vector<WXQSketch::Entry> cuts_d_;
-    thrust::host_vector<WXQSketch::Entry> cuts_h_;
-    dh::device_vector<bst_float> weights_;
-    dh::device_vector<bst_float> weights2_;
-    std::vector<size_t> n_cuts_cur_;
-    dh::device_vector<size_t> num_elements_;
-    dh::device_vector<char> tmp_storage_;
+    dh::device_vector<size_t> row_ptrs_{};
+    dh::device_vector<Entry> entries_{};
+    dh::device_vector<bst_float> fvalues_{};
+    dh::device_vector<bst_float> feature_weights_{};
+    dh::device_vector<bst_float> fvalues_cur_{};
+    dh::device_vector<WXQSketch::Entry> cuts_d_{};
+    thrust::host_vector<WXQSketch::Entry> cuts_h_{};
+    dh::device_vector<bst_float> weights_{};
+    dh::device_vector<bst_float> weights2_{};
+    std::vector<size_t> n_cuts_cur_{};
+    dh::device_vector<size_t> num_elements_{};
+    dh::device_vector<char> tmp_storage_{};
 
    public:
-    DeviceShard(int device, bst_uint row_begin, bst_uint row_end,
-                tree::TrainParam param, SketchContainer *sketch_container) :
-        device_(device), row_begin_(row_begin), row_end_(row_end),
-        n_rows_(row_end - row_begin), param_(std::move(param)), sketch_container_(sketch_container) {
+    DeviceShard(int device,
+                bst_uint n_rows,
+                tree::TrainParam param,
+                SketchContainer* sketch_container) :
+        device_(device),
+        n_rows_(n_rows),
+        param_(std::move(param)),
+        sketch_container_(sketch_container) {
     }
 
-    ~DeviceShard() {
+    ~DeviceShard() {  // NOLINT
       dh::safe_cuda(cudaSetDevice(device_));
     }
 
@@ -319,19 +324,18 @@
       const auto& offset_vec = row_batch.offset.HostVector();
       const auto& data_vec = row_batch.data.HostVector();
 
-      size_t n_entries = offset_vec[row_begin_ + batch_row_end] -
-        offset_vec[row_begin_ + batch_row_begin];
+      size_t n_entries = offset_vec[batch_row_end] - offset_vec[batch_row_begin];
       // copy the batch to the GPU
       dh::safe_cuda
         (cudaMemcpyAsync(entries_.data().get(),
-                         data_vec.data() + offset_vec[row_begin_ + batch_row_begin],
+                         data_vec.data() + offset_vec[batch_row_begin],
                          n_entries * sizeof(Entry), cudaMemcpyDefault));
       // copy the weights if necessary
       if (has_weights_) {
         const auto& weights_vec = info.weights_.HostVector();
         dh::safe_cuda
           (cudaMemcpyAsync(weights_.data().get(),
-                           weights_vec.data() + row_begin_ + batch_row_begin,
+                           weights_vec.data() + batch_row_begin,
                            batch_nrows * sizeof(bst_float), cudaMemcpyDefault));
       }
 
@@ -349,8 +353,7 @@
        (fvalues_.data().get(),
         has_weights_ ? feature_weights_.data().get() : nullptr,
         row_ptrs_.data().get() + batch_row_begin,
         has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
-         gpu_batch_nrows_, num_cols_,
-         offset_vec[row_begin_ + batch_row_begin], batch_nrows);
+         gpu_batch_nrows_, offset_vec[batch_row_begin], batch_nrows);
 
       for (int icol = 0; icol < num_cols_; ++icol) {
         FindColumnCuts(batch_nrows, icol);
@@ -358,7 +361,7 @@
 
       // add cuts into sketches
       thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin());
-#pragma omp parallel for schedule(static) \
+#pragma omp parallel for default(none) schedule(static) \
 if (num_cols_ > SketchContainer::kOmpNumColsParallelizeLimit)  // NOLINT
       for (int icol = 0; icol < num_cols_; ++icol) {
         WXQSketch::SummaryContainer summary;
@@ -391,8 +394,7 @@
       dh::safe_cuda(cudaSetDevice(device_));
       const auto& offset_vec = row_batch.offset.HostVector();
       row_ptrs_.resize(n_rows_ + 1);
-      thrust::copy(offset_vec.data() + row_begin_,
-                   offset_vec.data() + row_end_ + 1, row_ptrs_.begin());
+      thrust::copy(offset_vec.data(), offset_vec.data() + n_rows_ + 1, row_ptrs_.begin());
       size_t gpu_nbatches = common::DivRoundUp(n_rows_, gpu_batch_nrows_);
       for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
         SketchBatch(row_batch, info, gpu_batch);
@@ -401,32 +403,18 @@
   };
 
   void SketchBatch(const SparsePage &batch, const MetaInfo &info) {
-    GPUDistribution dist =
-        GPUDistribution::Block(GPUSet::All(generic_param_.gpu_id, generic_param_.n_gpus,
-                                           batch.Size()));
+    auto device = generic_param_.gpu_id;
 
-    // create device shards
-    shards_.resize(dist.Devices().Size());
-    dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
-        size_t start = dist.ShardStart(batch.Size(), i);
-        size_t size = dist.ShardSize(batch.Size(), i);
-        shard = std::unique_ptr<DeviceShard>(
-            new DeviceShard(dist.Devices().DeviceId(i), start,
-                            start + size, param_, sketch_container_.get()));
-      });
+    // create device shard
+    shard_.reset(new DeviceShard(device, batch.Size(), param_, sketch_container_.get()));
 
-    // compute sketches for each shard
-    dh::ExecuteIndexShards(&shards_,
-                           [&](int idx, std::unique_ptr<DeviceShard>& shard) {
-                             shard->Init(batch, info, gpu_batch_nrows_);
-                             shard->Sketch(batch, info);
-                             shard->ComputeRowStride();
-                           });
+    // compute sketches for the shard
+    shard_->Init(batch, info, gpu_batch_nrows_);
+    shard_->Sketch(batch, info);
+    shard_->ComputeRowStride();
 
-    // compute row stride across all shards
-    for (const auto &shard : shards_) {
-      row_stride_ = std::max(row_stride_, shard->GetRowStride());
-    }
+    // compute row stride
+    row_stride_ = shard_->GetRowStride();
   }
 
   GPUSketcher(const tree::TrainParam &param, const GenericParameter &generic_param, int gpu_nrows)
@@ -444,13 +432,13 @@
     this->SketchBatch(batch, info);
   }
 
-  hmat->Init(&sketch_container_.get()->sketches_, param_.max_bin);
+  hmat->Init(&sketch_container_->sketches_, param_.max_bin);
 
   return row_stride_;
 }
 
  private:
-  std::vector<std::unique_ptr<DeviceShard>> shards_;
+  std::unique_ptr<DeviceShard> shard_;
   const tree::TrainParam &param_;
   const GenericParameter &generic_param_;
   int gpu_batch_nrows_;
diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc
index 6ccb1a661..c28218b4e 100644
--- a/src/common/host_device_vector.cc
+++ b/src/common/host_device_vector.cc
@@ -30,19 +30,19 @@ struct HostDeviceVectorImpl {
 };
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, const GPUDistribution &)
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
   : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(size, v);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, const GPUDistribution &)
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
   : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, const GPUDistribution &)
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
   : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }
@@ -75,29 +75,23 @@
 template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
 
 template <typename T>
-GPUSet HostDeviceVector<T>::Devices() const { return GPUSet::Empty(); }
+int HostDeviceVector<T>::DeviceIdx() const { return -1; }
 
 template <typename T>
-const GPUDistribution& HostDeviceVector<T>::Distribution() const {
-  static GPUDistribution dummyInstance;
-  return dummyInstance;
-}
+T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
 
 template <typename T>
-T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; }
-
-template <typename T>
-const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
+const T* HostDeviceVector<T>::ConstDevicePointer() const {
   return nullptr;
 }
 
 template <typename T>
-common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
+common::Span<T> HostDeviceVector<T>::DeviceSpan() {
   return common::Span<T>();
 }
 
 template <typename T>
-common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
+common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
   return common::Span<const T>();
 }
@@ -115,10 +109,7 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
 }
 
 template <typename T>
-size_t HostDeviceVector<T>::DeviceStart(int device) const { return 0; }
-
-template <typename T>
-size_t HostDeviceVector<T>::DeviceSize(int device) const { return 0; }
+size_t HostDeviceVector<T>::DeviceSize() const { return 0; }
 
 template <typename T>
 void HostDeviceVector<T>::Fill(T v) {
@@ -149,18 +140,12 @@ bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
 }
 
 template <typename T>
-bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
+bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
   return false;
 }
 
 template <typename T>
-void HostDeviceVector<T>::Shard(const GPUDistribution& distribution) const { }
-
-template <typename T>
-void HostDeviceVector<T>::Shard(GPUSet devices) const { }
-
-template <typename T>
-void Reshard(const GPUDistribution &distribution) { }
+void HostDeviceVector<T>::SetDevice(int device) const {}
 
 // explicit instantiations are required, as HostDeviceVector<T> isn't header-only
 template class HostDeviceVector<bst_float>;
diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index bbf8af690..3205e8fd1 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -10,7 +10,6 @@
 #include <mutex>
 #include "./device_helpers.cuh"
-
 namespace xgboost {
 
 // the handler to call instead of cudaSetDevice; only used for testing
@@ -43,144 +42,12 @@ class Permissions {
 };
 
 template <typename T>
-struct HostDeviceVectorImpl {
-  struct DeviceShard {
-    DeviceShard()
-      : proper_size_{0}, device_{-1}, start_{0}, perm_d_{false},
-        cached_size_{static_cast<size_t>(~0)}, vec_{nullptr} {}
-
-    ~DeviceShard() {
+class HostDeviceVectorImpl {
+ public:
+  HostDeviceVectorImpl(size_t size, T v, int device) : device_(device), perm_h_(device < 0) {
+    if (device >= 0) {
       SetDevice();
-    }
-
-    void Init(HostDeviceVectorImpl* vec, int device) {
-      if (vec_ == nullptr) { vec_ = vec; }
-      CHECK_EQ(vec, vec_);
-      device_ = device;
-      LazyResize(vec_->Size());
-      perm_d_ = vec_->perm_h_.Complementary();
-    }
-
-    void Init(HostDeviceVectorImpl* vec, const DeviceShard& other) {
-      if (vec_ == nullptr) { vec_ = vec; }
-      CHECK_EQ(vec, vec_);
-      device_ = other.device_;
-      cached_size_ = other.cached_size_;
-      start_ = other.start_;
-      proper_size_ = other.proper_size_;
-      SetDevice();
-      data_.resize(other.data_.size());
-      perm_d_ = other.perm_d_;
-    }
-
-    void ScatterFrom(const T* begin) {
-      // TODO(canonizer): avoid full copy of host data
-      LazySyncDevice(GPUAccess::kWrite);
-      SetDevice();
-      dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), begin + start_,
-                                    data_.size() * sizeof(T), cudaMemcpyDefault));
-    }
-
-    void GatherTo(thrust::device_ptr<T> begin) {
-      LazySyncDevice(GPUAccess::kRead);
-      SetDevice();
-      dh::safe_cuda(cudaMemcpyAsync(begin.get() + start_, data_.data().get(),
-                                    proper_size_ * sizeof(T), cudaMemcpyDefault));
-    }
-
-    void Fill(T v) {
-      // TODO(canonizer): avoid full copy of host data
-      LazySyncDevice(GPUAccess::kWrite);
-      SetDevice();
-      thrust::fill(data_.begin(), data_.end(), v);
-    }
-
-    void Copy(DeviceShard* other) {
-      // TODO(canonizer): avoid full copy of host data for this (but not for other)
-      LazySyncDevice(GPUAccess::kWrite);
-      other->LazySyncDevice(GPUAccess::kRead);
-      SetDevice();
-      dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), other->data_.data().get(),
-                                    data_.size() * sizeof(T), cudaMemcpyDefault));
-    }
-
-    void LazySyncHost(GPUAccess access) {
-      SetDevice();
-      dh::safe_cuda(cudaMemcpy(vec_->data_h_.data() + start_,
-                               data_.data().get(), proper_size_ * sizeof(T),
-                               cudaMemcpyDeviceToHost));
-      perm_d_.DenyComplementary(access);
-    }
-
-    void LazyResize(size_t new_size) {
-      if (new_size == cached_size_) { return; }
-      // resize is required
-      int ndevices = vec_->distribution_.devices_.Size();
-      int device_index = vec_->distribution_.devices_.Index(device_);
-      start_ = vec_->distribution_.ShardStart(new_size, device_index);
-      proper_size_ = vec_->distribution_.ShardProperSize(new_size, device_index);
-      // The size on this device.
-      size_t size_d = vec_->distribution_.ShardSize(new_size, device_index);
-      SetDevice();
-      data_.resize(size_d);
-      cached_size_ = new_size;
-    }
-
-    void LazySyncDevice(GPUAccess access) {
-      if (perm_d_.CanAccess(access)) { return; }
-      if (perm_d_.CanRead()) {
-        // deny read to the host
-        perm_d_.Grant(access);
-        std::lock_guard<std::mutex> lock(vec_->mutex_);
-        vec_->perm_h_.DenyComplementary(access);
-        return;
-      }
-      // data is on the host
-      size_t size_h = vec_->data_h_.size();
-      LazyResize(size_h);
-      SetDevice();
-      dh::safe_cuda(
-          cudaMemcpy(data_.data().get(), vec_->data_h_.data() + start_,
-                     data_.size() * sizeof(T), cudaMemcpyHostToDevice));
-      perm_d_.Grant(access);
-
-      std::lock_guard<std::mutex> lock(vec_->mutex_);
-      vec_->perm_h_.DenyComplementary(access);
-      vec_->size_d_ = size_h;
-    }
-
-    void SetDevice() {
-      if (cudaSetDeviceHandler == nullptr) {
-        dh::safe_cuda(cudaSetDevice(device_));
-      } else {
-        (*cudaSetDeviceHandler)(device_);
-      }
-    }
-
-    T* Raw() { return data_.data().get(); }
-    size_t Start() const { return start_; }
-    size_t DataSize() const { return data_.size(); }
-    Permissions& Perm() { return perm_d_; }
-    Permissions const& Perm() const { return perm_d_; }
-
-   private:
-    int device_;
-    dh::device_vector<T> data_;
-    // cached vector size
-    size_t cached_size_;
-    size_t start_;
-    // size of the portion to copy back to the host
-    size_t proper_size_;
-    Permissions perm_d_;
-    HostDeviceVectorImpl<T>* vec_;
-  };
-
-  HostDeviceVectorImpl(size_t size, T v, const GPUDistribution &distribution)
-    : distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
-    if (!distribution_.IsEmpty()) {
-      size_d_ = size;
-      InitShards();
-      Fill(v);
+      data_d_.resize(size, v);
     } else {
       data_h_.resize(size, v);
     }
@@ -188,127 +55,81 @@
 
   // required, as a new std::mutex has to be created
   HostDeviceVectorImpl(const HostDeviceVectorImpl<T>& other)
-    : data_h_(other.data_h_), perm_h_(other.perm_h_), size_d_(other.size_d_),
-      distribution_(other.distribution_), mutex_() {
-    shards_.resize(other.shards_.size());
-    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
-        shard.Init(this, other.shards_.at(i));
-      });
+    : device_(other.device_), data_h_(other.data_h_), perm_h_(other.perm_h_), mutex_() {
+    if (device_ >= 0) {
+      SetDevice();
+      data_d_ = other.data_d_;
+    }
   }
 
   // Initializer can be std::vector<T> or std::initializer_list<T>
   template <class Initializer>
-  HostDeviceVectorImpl(const Initializer& init, const GPUDistribution &distribution)
-    : distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
-    if (!distribution_.IsEmpty()) {
-      size_d_ = init.size();
-      InitShards();
+  HostDeviceVectorImpl(const Initializer& init, int device) : device_(device), perm_h_(device < 0) {
+    if (device >= 0) {
+      LazyResizeDevice(init.size());
       Copy(init);
     } else {
       data_h_ = init;
     }
   }
 
-  void InitShards() {
-    int ndevices = distribution_.devices_.Size();
-    shards_.resize(ndevices);
-    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
-        shard.Init(this, distribution_.devices_.DeviceId(i));
-      });
+  ~HostDeviceVectorImpl() {
+    if (device_ >= 0) {
+      SetDevice();
+    }
   }
 
-  size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : size_d_; }
+  size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : data_d_.size(); }
 
-  GPUSet Devices() const { return distribution_.devices_; }
+  int DeviceIdx() const { return device_; }
 
-  const GPUDistribution& Distribution() const { return distribution_; }
-
-  T* DevicePointer(int device) {
-    CHECK(distribution_.devices_.Contains(device));
-    LazySyncDevice(device, GPUAccess::kWrite);
-    return shards_.at(distribution_.devices_.Index(device)).Raw();
+  T* DevicePointer() {
+    LazySyncDevice(GPUAccess::kWrite);
+    return data_d_.data().get();
   }
 
-  const T* ConstDevicePointer(int device) {
-    CHECK(distribution_.devices_.Contains(device));
-    LazySyncDevice(device, GPUAccess::kRead);
-    return shards_.at(distribution_.devices_.Index(device)).Raw();
+  const T* ConstDevicePointer() {
+    LazySyncDevice(GPUAccess::kRead);
+    return data_d_.data().get();
   }
 
-  common::Span<T> DeviceSpan(int device) {
-    GPUSet devices = distribution_.devices_;
-    CHECK(devices.Contains(device));
-    LazySyncDevice(device, GPUAccess::kWrite);
-    return {shards_.at(devices.Index(device)).Raw(),
-            static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
+  common::Span<T> DeviceSpan() {
+    LazySyncDevice(GPUAccess::kWrite);
+    return {data_d_.data().get(), static_cast<typename common::Span<T>::index_type>(DeviceSize())};
   }
 
-  common::Span<const T> ConstDeviceSpan(int device) {
-    GPUSet devices = distribution_.devices_;
-    CHECK(devices.Contains(device));
-    LazySyncDevice(device, GPUAccess::kRead);
+  common::Span<const T> ConstDeviceSpan() {
+    LazySyncDevice(GPUAccess::kRead);
     using SpanInd = typename common::Span<const T>::index_type;
-    return {shards_.at(devices.Index(device)).Raw(),
-            static_cast<SpanInd>(DeviceSize(device))};
+    return {data_d_.data().get(), static_cast<SpanInd>(DeviceSize())};
   }
 
-  size_t DeviceSize(int device) {
-    CHECK(distribution_.devices_.Contains(device));
-    LazySyncDevice(device, GPUAccess::kRead);
-    return shards_.at(distribution_.devices_.Index(device)).DataSize();
+  size_t DeviceSize() {
+    LazySyncDevice(GPUAccess::kRead);
+    return data_d_.size();
  }
 
-  size_t DeviceStart(int device) {
-    CHECK(distribution_.devices_.Contains(device));
-    LazySyncDevice(device, GPUAccess::kRead);
-    return shards_.at(distribution_.devices_.Index(device)).Start();
+  thrust::device_ptr<T> tbegin() {  // NOLINT
+    return thrust::device_ptr<T>(DevicePointer());
   }
 
-  thrust::device_ptr<T> tbegin(int device) {  // NOLINT
-    return thrust::device_ptr<T>(DevicePointer(device));
+  thrust::device_ptr<const T> tcbegin() {  // NOLINT
+    return thrust::device_ptr<const T>(ConstDevicePointer());
   }
 
-  thrust::device_ptr<const T> tcbegin(int device) {  // NOLINT
-    return thrust::device_ptr<const T>(ConstDevicePointer(device));
+  thrust::device_ptr<T> tend() {  // NOLINT
+    return tbegin() + DeviceSize();
   }
 
-  thrust::device_ptr<T> tend(int device) {  // NOLINT
-    return tbegin(device) + DeviceSize(device);
-  }
-
-  thrust::device_ptr<const T> tcend(int device) {  // NOLINT
-    return tcbegin(device) + DeviceSize(device);
-  }
-
-  void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
-    CHECK_EQ(end - begin, Size());
-    if (perm_h_.CanWrite()) {
-      dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
-                               (end - begin) * sizeof(T),
-                               cudaMemcpyDeviceToHost));
-    } else {
-      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
-          shard.ScatterFrom(begin.get());
-        });
-    }
-  }
-
-  void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
-    CHECK_EQ(end - begin, Size());
-    if (perm_h_.CanWrite()) {
-      dh::safe_cuda(cudaMemcpy(begin.get(), data_h_.data(),
-                               data_h_.size() * sizeof(T),
-                               cudaMemcpyHostToDevice));
-    } else {
-      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.GatherTo(begin); });
-    }
+  thrust::device_ptr<const T> tcend() {  // NOLINT
+    return tcbegin() + DeviceSize();
   }
 
   void Fill(T v) {  // NOLINT
     if (perm_h_.CanWrite()) {
       std::fill(data_h_.begin(), data_h_.end(), v);
     } else {
-      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.Fill(v); });
+      DeviceFill(v);
     }
   }
 
@@ -320,14 +141,10 @@
       return;
     }
     // Data is on device;
-    if (distribution_ != other->distribution_) {
-      distribution_ = GPUDistribution();
-      Shard(other->Distribution());
-      size_d_ = other->size_d_;
+    if (device_ != other->device_) {
+      SetDevice(other->device_);
     }
-    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
-        shard.Copy(&other->shards_.at(i));
-      });
+    DeviceCopy(other);
   }
 
   void Copy(const std::vector<T>& other) {
@@ -335,9 +152,7 @@
     if (perm_h_.CanWrite()) {
       std::copy(other.begin(), other.end(), data_h_.begin());
     } else {
-      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
-          shard.ScatterFrom(other.data());
-        });
+      DeviceCopy(other.data());
     }
   }
 
@@ -346,9 +161,7 @@
     if (perm_h_.CanWrite()) {
       std::copy(other.begin(), other.end(), data_h_.begin());
     } else {
-      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
-          shard.ScatterFrom(other.begin());
-        });
+      DeviceCopy(other.begin());
     }
   }
 
@@ -362,40 +175,23 @@
     return data_h_;
   }
 
-  void Shard(const GPUDistribution& distribution) {
-    if (distribution_ == distribution) { return; }
-    CHECK(distribution_.IsEmpty())
-        << "Data resides on different GPUs: " << "ID: "
-        << *(distribution_.Devices().begin()) << " and ID: "
-        << *(distribution.Devices().begin());
-    distribution_ = distribution;
-    InitShards();
-  }
-
-  void Shard(GPUSet new_devices) {
-    if (distribution_.Devices() == new_devices) { return; }
-    Shard(GPUDistribution::Block(new_devices));
-  }
-
-  void Reshard(const GPUDistribution &distribution) {
-    if (distribution_ == distribution) { return; }
-    LazySyncHost(GPUAccess::kWrite);
-    distribution_ = distribution;
-    shards_.clear();
-    InitShards();
+  void SetDevice(int device) {
+    if (device_ == device) { return; }
+    if (device_ >= 0) {
+      LazySyncHost(GPUAccess::kWrite);
+    }
+    device_ = device;
+    if (device_ >= 0) {
+      LazyResizeDevice(data_h_.size());
+    }
   }
 
   void Resize(size_t new_size, T v) {
     if (new_size == Size()) { return; }
-    if (distribution_.IsFixedSize()) {
-      CHECK_EQ(new_size, distribution_.offsets_.back());
-    }
-    if (Size() == 0 && !distribution_.IsEmpty()) {
+    if (Size() == 0 && device_ >= 0) {
       // fast on-device resize
       perm_h_ = Permissions(false);
-      size_d_ = new_size;
-      InitShards();
-      Fill(v);
+      data_d_.resize(new_size, v);
     } else {
       // resize on host
       LazySyncHost(GPUAccess::kWrite);
@@ -407,72 +203,110 @@
     if (perm_h_.CanAccess(access)) { return; }
     if (perm_h_.CanRead()) {
       // data is present, just need to deny access to the device
-      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
-          shard.Perm().DenyComplementary(access);
-        });
       perm_h_.Grant(access);
       return;
     }
     std::lock_guard<std::mutex> lock(mutex_);
-    if (data_h_.size() != size_d_) { data_h_.resize(size_d_); }
-    dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
-        shard.LazySyncHost(access);
-      });
+    if (data_h_.size() != data_d_.size()) { data_h_.resize(data_d_.size()); }
+    SetDevice();
+    dh::safe_cuda(cudaMemcpy(data_h_.data(),
+                             data_d_.data().get(),
+                             data_d_.size() * sizeof(T),
+                             cudaMemcpyDeviceToHost));
     perm_h_.Grant(access);
   }
 
-  void LazySyncDevice(int device, GPUAccess access) {
-    GPUSet devices = distribution_.Devices();
-    CHECK(devices.Contains(device));
-    shards_.at(devices.Index(device)).LazySyncDevice(access);
+  void LazySyncDevice(GPUAccess access) {
+    if (DevicePerm().CanAccess(access)) { return; }
+    if (DevicePerm().CanRead()) {
+      // deny read to the host
+      std::lock_guard<std::mutex> lock(mutex_);
+      perm_h_.DenyComplementary(access);
+      return;
+    }
+    // data is on the host
+    LazyResizeDevice(data_h_.size());
+    SetDevice();
+    dh::safe_cuda(cudaMemcpy(data_d_.data().get(),
+                             data_h_.data(),
+                             data_d_.size() * sizeof(T),
+                             cudaMemcpyHostToDevice));
+
+    std::lock_guard<std::mutex> lock(mutex_);
+    perm_h_.DenyComplementary(access);
   }
 
   bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); }
-
-  bool DeviceCanAccess(int device, GPUAccess access) {
-    GPUSet devices = distribution_.Devices();
-    if (!devices.Contains(device)) { return false; }
-    return shards_.at(devices.Index(device)).Perm().CanAccess(access);
-  }
+  bool DeviceCanAccess(GPUAccess access) { return DevicePerm().CanAccess(access); }
 
  private:
-  std::vector<T> data_h_;
-  Permissions perm_h_;
-  // the total size of the data stored on the devices
-  size_t size_d_;
-  GPUDistribution distribution_;
+  int device_{-1};
+  std::vector<T> data_h_{};
+  dh::device_vector<T> data_d_{};
+  Permissions perm_h_{false};
   // protects size_d_ and perm_h_ when updated from multiple threads
-  std::mutex mutex_;
-  std::vector<DeviceShard> shards_;
+  std::mutex mutex_{};
+
+  void DeviceFill(T v) {
+    // TODO(canonizer): avoid full copy of host data
+    LazySyncDevice(GPUAccess::kWrite);
+    SetDevice();
+    thrust::fill(data_d_.begin(), data_d_.end(), v);
+  }
+
+  void DeviceCopy(HostDeviceVectorImpl* other) {
+    // TODO(canonizer): avoid full copy of host data for this (but not for other)
+    LazySyncDevice(GPUAccess::kWrite);
+    other->LazySyncDevice(GPUAccess::kRead);
+    SetDevice();
+    dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), other->data_d_.data().get(),
+                                  data_d_.size() * sizeof(T), cudaMemcpyDefault));
+  }
+
+  void DeviceCopy(const T* begin) {
+    // TODO(canonizer): avoid full copy of host data
+    LazySyncDevice(GPUAccess::kWrite);
+    SetDevice();
+    dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), begin,
+                                  data_d_.size() * sizeof(T), cudaMemcpyDefault));
+  }
+
+  void LazyResizeDevice(size_t new_size) {
+    if (new_size == data_d_.size()) { return; }
+    SetDevice();
+    data_d_.resize(new_size);
+  }
+
+  void SetDevice() {
+    CHECK_GE(device_, 0);
+    if (cudaSetDeviceHandler == nullptr) {
+      dh::safe_cuda(cudaSetDevice(device_));
+    } else {
+      (*cudaSetDeviceHandler)(device_);
+    }
+  }
+
+  Permissions DevicePerm() const { return perm_h_.Complementary(); }
 };
 
-template <typename T>
-HostDeviceVector<T>::HostDeviceVector
-(size_t size, T v, const GPUDistribution &distribution) : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(size, v, distribution);
-}
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
+  : impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector
-(std::initializer_list<T> init, const GPUDistribution &distribution) : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(init, distribution);
-}
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
+  : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector
-(const std::vector<T>& init, const GPUDistribution &distribution) : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(init, distribution);
-}
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
+  : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
 
 template <typename T>
 HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
-  : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
-}
+  : impl_(new HostDeviceVectorImpl<T>(*other.impl_)) {}
 
 template <typename T>
-HostDeviceVector<T>& HostDeviceVector<T>::operator=
-(const HostDeviceVector<T>& other) {
+HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
   if (this == &other) { return *this; }
 
   std::unique_ptr<HostDeviceVectorImpl<T>> newImpl(new HostDeviceVectorImpl<T>(*other.impl_));
@@ -491,73 +325,51 @@
 template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
 
 template <typename T>
-GPUSet HostDeviceVector<T>::Devices() const { return impl_->Devices(); }
+int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
 
 template <typename T>
-const GPUDistribution& HostDeviceVector<T>::Distribution() const {
-  return impl_->Distribution();
+T* HostDeviceVector<T>::DevicePointer() {
+  return impl_->DevicePointer();
 }
 
 template <typename T>
-T* HostDeviceVector<T>::DevicePointer(int device) {
-  return impl_->DevicePointer(device);
+const T* HostDeviceVector<T>::ConstDevicePointer() const {
+  return impl_->ConstDevicePointer();
 }
 
 template <typename T>
-const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
-  return impl_->ConstDevicePointer(device);
+common::Span<T> HostDeviceVector<T>::DeviceSpan() {
+  return impl_->DeviceSpan();
 }
 
 template <typename T>
-common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
-  return impl_->DeviceSpan(device);
+common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
+  return impl_->ConstDeviceSpan();
 }
 
 template <typename T>
-common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
-  return impl_->ConstDeviceSpan(device);
+size_t HostDeviceVector<T>::DeviceSize() const {
+  return impl_->DeviceSize();
 }
 
 template <typename T>
-size_t HostDeviceVector<T>::DeviceStart(int device) const {
-  return impl_->DeviceStart(device);
+thrust::device_ptr<T> HostDeviceVector<T>::tbegin() {  // NOLINT
+  return impl_->tbegin();
 }
 
 template <typename T>
-size_t HostDeviceVector<T>::DeviceSize(int device) const {
-  return impl_->DeviceSize(device);
+thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin() const {  // NOLINT
+  return impl_->tcbegin();
 }
 
 template <typename T>
-thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) {  // NOLINT
-  return impl_->tbegin(device);
+thrust::device_ptr<T> HostDeviceVector<T>::tend() {  // NOLINT
+  return impl_->tend();
 }
 
 template <typename T>
-thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin(int device) const {  // NOLINT
-  return impl_->tcbegin(device);
-}
-
-template <typename T>
-thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) {  // NOLINT
-  return impl_->tend(device);
-}
-
-template <typename T>
-thrust::device_ptr<const T> HostDeviceVector<T>::tcend(int device) const {  // NOLINT
-  return impl_->tcend(device);
-}
-
-template <typename T>
-void HostDeviceVector<T>::ScatterFrom
-(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
-  impl_->ScatterFrom(begin, end);
-}
-
-template <typename T>
-void HostDeviceVector<T>::GatherTo
-(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const {
-  impl_->GatherTo(begin, end);
+thrust::device_ptr<const T> HostDeviceVector<T>::tcend() const {  // NOLINT
+  return impl_->tcend();
 }
 
 template <typename T>
@@ -594,23 +406,13 @@ bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
 }
 
 template <typename T>
-bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
-  return impl_->DeviceCanAccess(device, access);
+bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
+  return impl_->DeviceCanAccess(access);
 }
 
 template <typename T>
-void HostDeviceVector<T>::Shard(GPUSet new_devices) const {
-  impl_->Shard(new_devices);
-}
-
-template <typename T>
-void HostDeviceVector<T>::Shard(const GPUDistribution &distribution) const {
-  impl_->Shard(distribution);
-}
-
-template <typename T>
-void HostDeviceVector<T>::Reshard(const GPUDistribution &distribution) {
-  impl_->Reshard(distribution);
+void HostDeviceVector<T>::SetDevice(int device) const {
+  impl_->SetDevice(device);
 }
 
 template <typename T>
diff --git a/src/common/host_device_vector.h b/src/common/host_device_vector.h
index 0ffe75f5e..abaa5dd59 100644
--- a/src/common/host_device_vector.h
+++ b/src/common/host_device_vector.h
@@ -79,113 +79,6 @@ void SetCudaSetDeviceHandler(void (*handler)(int));
 
 template <typename T> struct HostDeviceVectorImpl;
 
-// Distribution for the HostDeviceVector; it specifies such aspects as the
-// devices it is distributed on, whether there are copies of elements from
-// other GPUs as well as the granularity of splitting. It may also specify
-// explicit boundaries for devices, in which case the size of the array cannot
-// be changed.
-class GPUDistribution {
-  template <typename T> friend struct HostDeviceVectorImpl;
-
- public:
-  explicit GPUDistribution(GPUSet devices = GPUSet::Empty())
-    : devices_(devices), granularity_(1), overlap_(0) {}
-
- private:
-  GPUDistribution(GPUSet devices, int granularity, int overlap,
-                  std::vector<size_t> &&offsets)
-    : devices_(devices), granularity_(granularity), overlap_(overlap),
-      offsets_(std::move(offsets)) {}
-
- public:
-  static GPUDistribution Empty() { return GPUDistribution(); }
-
-  static GPUDistribution Block(GPUSet devices) { return GPUDistribution(devices); }
-
-  static GPUDistribution Overlap(GPUSet devices, int overlap) {
-    return GPUDistribution(devices, 1, overlap, std::vector<size_t>());
-  }
-
-  static GPUDistribution Granular(GPUSet devices, int granularity) {
-    return GPUDistribution(devices, granularity, 0, std::vector<size_t>());
-  }
-
-  // NOTE(rongou): Explicit offsets don't necessarily cover the whole vector. Sections before the
-  // first shard or after the last shard may be on host only. This windowing is done in the GPU
-  // predictor for external memory support.
-  static GPUDistribution Explicit(GPUSet devices, std::vector<size_t> offsets) {
-    return GPUDistribution(devices, 1, 0, std::move(offsets));
-  }
-
-  friend bool operator==(const GPUDistribution& a, const GPUDistribution& b) {
-    bool const res = a.devices_ == b.devices_ &&
-                     a.granularity_ == b.granularity_ &&
-                     a.overlap_ == b.overlap_ &&
-                     a.offsets_ == b.offsets_;
-    return res;
-  }
-
-  friend bool operator!=(const GPUDistribution& a, const GPUDistribution& b) {
-    return !(a == b);
-  }
-
-  GPUSet Devices() const { return devices_; }
-
-  bool IsEmpty() const { return devices_.IsEmpty(); }
-
-  size_t ShardStart(size_t size, int index) const {
-    if (size == 0) { return 0; }
-    if (offsets_.size() > 0) {
-      // explicit offsets are provided
-      CHECK_EQ(offsets_.back(), size);
-      return offsets_.at(index);
-    }
-    // no explicit offsets
-    size_t begin = std::min(index * Portion(size), size);
-    begin = begin > size ? size : begin;
-    return begin;
-  }
-
-  size_t ShardSize(size_t size, size_t index) const {
-    if (size == 0) { return 0; }
-    if (offsets_.size() > 0) {
-      // explicit offsets are provided
-      CHECK_EQ(offsets_.back(), size);
-      return offsets_.at(index + 1) - offsets_.at(index) +
-        (index == devices_.Size() - 1 ? overlap_ : 0);
-    }
-    size_t portion = Portion(size);
-    size_t begin = std::min(index * portion, size);
-    size_t end = std::min((index + 1) * portion + overlap_ * granularity_, size);
-    return end - begin;
-  }
-
-  size_t ShardProperSize(size_t size, size_t index) const {
-    if (size == 0) { return 0; }
-    return ShardSize(size, index) - (devices_.Size() - 1 > index ? overlap_ : 0);
-  }
-
-  bool IsFixedSize() const { return !offsets_.empty(); }
-
- private:
-  static size_t DivRoundUp(size_t a, size_t b) { return (a + b - 1) / b; }
-  static size_t RoundUp(size_t a, size_t b) { return DivRoundUp(a, b) * b; }
-
-  size_t Portion(size_t size) const {
-    return RoundUp
-      (DivRoundUp
-       (std::max(static_cast<int64_t>(size - overlap_ * granularity_),
-                 static_cast<int64_t>(1)),
-        devices_.Size()), granularity_);
-  }
-
-  GPUSet devices_;
-  int granularity_;
-  int overlap_;
-  // explicit offsets for the GPU parts, if any
-  std::vector<size_t> offsets_;
-};
-
 enum GPUAccess {
   kNone, kRead,
   // write implies read
   kWrite
 };
 
 inline GPUAccess operator-(GPUAccess a, GPUAccess b) {
@@ -199,46 +92,38 @@
 template <typename T>
 class HostDeviceVector {
  public:
-  explicit HostDeviceVector(size_t size = 0, T v = T(),
-                            const GPUDistribution &distribution = GPUDistribution());
-  HostDeviceVector(std::initializer_list<T> init,
-                   const GPUDistribution &distribution = GPUDistribution());
-  explicit HostDeviceVector(const std::vector<T>& init,
-                            const GPUDistribution &distribution = GPUDistribution());
+  explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1);
+  HostDeviceVector(std::initializer_list<T> init, int device = -1);
+  explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
   ~HostDeviceVector();
   HostDeviceVector(const HostDeviceVector<T>&);
   HostDeviceVector<T>& operator=(const HostDeviceVector<T>&);
   size_t Size() const;
-  GPUSet Devices() const;
-  const GPUDistribution& Distribution() const;
-  common::Span<T> DeviceSpan(int device);
-  common::Span<const T> ConstDeviceSpan(int device) const;
-  common::Span<const T> DeviceSpan(int device) const { return ConstDeviceSpan(device); }
-  T* DevicePointer(int device);
-  const T* ConstDevicePointer(int device) const;
-  const T* DevicePointer(int device) const { return ConstDevicePointer(device); }
+  int DeviceIdx() const;
+  common::Span<T> DeviceSpan();
+  common::Span<const T> ConstDeviceSpan() const;
+  common::Span<const T> DeviceSpan() const { return ConstDeviceSpan(); }
+  T* DevicePointer();
+  const T* ConstDevicePointer() const;
+  const T* DevicePointer() const { return ConstDevicePointer(); }
   T* HostPointer() { return HostVector().data(); }
   const T* ConstHostPointer() const { return ConstHostVector().data(); }
   const T* HostPointer() const { return ConstHostPointer(); }
 
-  size_t DeviceStart(int device) const;
-  size_t DeviceSize(int device) const;
+  size_t DeviceSize() const;
 
   // only define functions returning device_ptr
   // if HostDeviceVector.h is included from a .cu file
 #ifdef __CUDACC__
-  thrust::device_ptr<T> tbegin(int device);  // NOLINT
-  thrust::device_ptr<T> tend(int device);  // NOLINT
-  thrust::device_ptr<const T> tcbegin(int device) const;  // NOLINT
-  thrust::device_ptr<const T> tcend(int device) const;  // NOLINT
-  thrust::device_ptr<const T> tbegin(int device) const {  // NOLINT
-    return tcbegin(device);
+  thrust::device_ptr<T> tbegin();  // NOLINT
+  thrust::device_ptr<T> tend();  // NOLINT
+  thrust::device_ptr<const T> tcbegin() const;  // NOLINT
+  thrust::device_ptr<const T> tcend() const;  // NOLINT
+  thrust::device_ptr<const T> tbegin() const {  // NOLINT
+    return tcbegin();
   }
-  thrust::device_ptr<const T> tend(int device) const { return tcend(device); }  // NOLINT
-
-  void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end);
-  void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const;
+  thrust::device_ptr<const T> tend() const { return tcend(); }  // NOLINT
 #endif  // __CUDACC__
 
   void Fill(T v);
@@ -251,18 +136,9 @@
   const std::vector<T>& HostVector() const { return ConstHostVector(); }
 
   bool HostCanAccess(GPUAccess access) const;
-  bool DeviceCanAccess(int device, GPUAccess access) const;
+  bool DeviceCanAccess(GPUAccess access) const;
 
-  /*!
-   * \brief Specify memory distribution.
-   */
-  void Shard(const GPUDistribution &distribution) const;
-  void Shard(GPUSet devices) const;
-
-  /*!
-   * \brief Change memory distribution.
-   */
-  void Reshard(const GPUDistribution &distribution);
+  void SetDevice(int device) const;
 
   void Resize(size_t new_size, T v = T());
diff --git a/src/common/transform.h b/src/common/transform.h
index b1bc55322..205dbf575 100644
--- a/src/common/transform.h
+++ b/src/common/transform.h
@@ -57,14 +57,10 @@ class Transform {
   template <typename Functor>
   struct Evaluator {
    public:
-    Evaluator(Functor func, Range range, GPUSet devices, bool shard) :
+    Evaluator(Functor func, Range range, int device, bool shard) :
         func_(func), range_{std::move(range)}, shard_{shard},
-        distribution_{GPUDistribution::Block(devices)} {}
-    Evaluator(Functor func, Range range, GPUDistribution dist,
-              bool shard) :
-        func_(func), range_{std::move(range)}, shard_{shard},
-        distribution_{std::move(dist)} {}
+        device_{device} {}
 
     /*!
      * \brief Evaluate the functor with input pointers to HostDeviceVector.
      *
@@ -74,7 +70,7 @@
      */
     template <typename... HDV>
     void Eval(HDV... vectors) const {
-      bool on_device = !distribution_.IsEmpty();
+      bool on_device = device_ >= 0;
 
       if (on_device) {
         LaunchCUDA(func_, vectors...);
@@ -86,13 +82,13 @@
    private:
     // CUDA UnpackHDV
     template <typename T>
-    Span<T> UnpackHDV(HostDeviceVector<T>* _vec, int _device) const {
-      auto span = _vec->DeviceSpan(_device);
+    Span<T> UnpackHDVOnDevice(HostDeviceVector<T>* _vec) const {
+      auto span = _vec->DeviceSpan();
       return span;
     }
     template <typename T>
-    Span<T const> UnpackHDV(const HostDeviceVector<T>* _vec, int _device) const {
-      auto span = _vec->ConstDeviceSpan(_device);
+    Span<T const> UnpackHDVOnDevice(const HostDeviceVector<T>* _vec) const {
+      auto span = _vec->ConstDeviceSpan();
       return span;
     }
     // CPU UnpackHDV
@@ -108,15 +104,15 @@
     }
     // Recursive unpack for Shard.
     template <typename T>
-    void UnpackShard(GPUDistribution dist, const HostDeviceVector<T> *vector) const {
-      vector->Shard(dist);
+    void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
+      vector->SetDevice(device);
     }
     template <typename Head, typename... Rest>
-    void UnpackShard(GPUDistribution dist,
+    void UnpackShard(int device,
                      const HostDeviceVector<Head> *_vector,
                      const HostDeviceVector<Rest> *... _vectors) const {
-      _vector->Shard(dist);
-      UnpackShard(dist, _vectors...);
+      _vector->SetDevice(device);
+      UnpackShard(device, _vectors...);
     }
 
 #if defined(__CUDACC__)
@@ -124,28 +120,20 @@
              typename... HDV>
     void LaunchCUDA(Functor _func, HDV*... _vectors) const {
       if (shard_)
-        UnpackShard(distribution_, _vectors...);
+        UnpackShard(device_, _vectors...);
 
-      GPUSet devices = distribution_.Devices();
       size_t range_size = *range_.end() - *range_.begin();
 
       // Extract index to deal with possible old OpenMP.
-      size_t device_beg = *(devices.begin());
-      size_t device_end = *(devices.end());
-#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
-      for (omp_ulong device = device_beg; device < device_end; ++device) {  // NOLINT
-        // Ignore other attributes of GPUDistribution for spliting index.
-        // This deals with situation like multi-class setting where
-        // granularity is used in data vector.
-        size_t shard_size = GPUDistribution::Block(devices).ShardSize(
-            range_size, devices.Index(device));
-        Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
-        dh::safe_cuda(cudaSetDevice(device));
-        const int GRID_SIZE =
-            static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
-        detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
-            _func, shard_range, UnpackHDV(_vectors, device)...);
-      }
+      // This deals with situation like multi-class setting where
+      // granularity is used in data vector.
+      size_t shard_size = range_size;
+      Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
+      dh::safe_cuda(cudaSetDevice(device_));
+      const int GRID_SIZE =
+          static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
+      detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
+          _func, shard_range, UnpackHDVOnDevice(_vectors)...);
     }
 #else
     /*! \brief Dummy funtion defined when compiling for CPU.  */
@@ -172,7 +160,7 @@
     Range range_;
     /*! \brief Whether sharding for vectors is required. */
     bool shard_;
-    GPUDistribution distribution_;
+    int device_;
   };
 
  public:
@@ -191,15 +179,9 @@
    */
   template <typename Functor>
   static Evaluator<Functor> Init(Functor func, Range const range,
-                                 GPUSet const devices,
+                                 int device,
                                  bool const shard = true) {
-    return Evaluator<Functor> {func, std::move(range), std::move(devices), shard};
-  }
-  template <typename Functor>
-  static Evaluator<Functor> Init(Functor func, Range const range,
-                                 GPUDistribution const dist,
-                                 bool const shard = true) {
-    return Evaluator<Functor> {func, std::move(range), std::move(dist), shard};
+    return Evaluator<Functor> {func, std::move(range), device, shard};
   }
 };
diff --git a/src/data/data.cu b/src/data/data.cu
index d991739f6..27c7d7166 100644
--- a/src/data/data.cu
+++ b/src/data/data.cu
@@ -78,9 +78,9 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
   } else {
     LOG(FATAL) << "Unknown metainfo: " << key;
   }
-  dst->Reshard(GPUDistribution(GPUSet::Range(ptr_device, 1)));
+  dst->SetDevice(ptr_device);
   dst->Resize(length);
-  auto p_dst = thrust::device_pointer_cast(dst->DevicePointer(0));
+  auto p_dst = thrust::device_pointer_cast(dst->DevicePointer());
   thrust::copy(p_src, p_src + length, p_dst);
 }
 }  // namespace xgboost
diff --git a/src/data/simple_csr_source.cu b/src/data/simple_csr_source.cu
index af0cf505c..cd8f30309 100644
--- a/src/data/simple_csr_source.cu
+++ b/src/data/simple_csr_source.cu
@@ -77,16 +77,14 @@ void SimpleCSRSource::FromDeviceColumnar(std::vector<Columnar> cols) {
 
   dh::safe_cuda(cudaSetDevice(device));
 
-  GPUSet devices = GPUSet::Range(device, 1);
-
-  page_.offset.Reshard(GPUDistribution(devices));
+  page_.offset.SetDevice(device);
   page_.offset.Resize(info.num_row_ + 1);
 
-  page_.data.Reshard(GPUDistribution(devices));
+  page_.data.SetDevice(device);
   page_.data.Resize(info.num_nonzero_);
 
-  auto s_data = page_.data.DeviceSpan(device);
-  auto s_offsets = page_.offset.DeviceSpan(device);
+  auto s_data = page_.data.DeviceSpan();
+  auto s_offsets = page_.offset.DeviceSpan();
   CHECK_EQ(s_offsets.size(), n_rows + 1);
 
   int32_t constexpr kThreads = 256;
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 43048061e..8d790e9e4 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -182,9 +182,9 @@ void GBTree::DoBoost(DMatrix* p_fmat,
     CHECK_EQ(in_gpair->Size() % ngroup, 0U)
         << "must have exactly ngroup*nrow gpairs";
     // TODO(canonizer): perform this on GPU if HostDeviceVector has device set.
-    HostDeviceVector<GradientPair> tmp
-        (in_gpair->Size() / ngroup, GradientPair(),
-         GPUDistribution::Block(in_gpair->Distribution().Devices()));
+    HostDeviceVector<GradientPair> tmp(in_gpair->Size() / ngroup,
+                                       GradientPair(),
+                                       in_gpair->DeviceIdx());
     const auto& gpair_h = in_gpair->ConstHostVector();
     auto nsize = static_cast<bst_omp_uint>(tmp.Size());
     for (int gid = 0; gid < ngroup; ++gid) {
diff --git a/src/learner.cc b/src/learner.cc
index 0eb175f3d..b6cf889c7 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -237,14 +237,13 @@ class LearnerImpl : public Learner {
     std::vector<std::pair<std::string, std::string> > attr;
     fi->Read(&attr);
     for (auto& kv : attr) {
-      // Load `predictor`, `n_gpus`, `gpu_id` parameters from extra attributes
+      // Load `predictor`, `gpu_id` parameters from extra attributes
       const std::string prefix = "SAVED_PARAM_";
       if (kv.first.find(prefix) == 0) {
         const std::string saved_param = kv.first.substr(prefix.length());
         bool is_gpu_predictor = saved_param == "predictor" && kv.second == "gpu_predictor";
 #ifdef XGBOOST_USE_CUDA
-        if (saved_param == "predictor" || saved_param == "n_gpus"
-            || saved_param == "gpu_id") {
+        if (saved_param == "predictor" || saved_param == "gpu_id") {
           cfg_[saved_param] = kv.second;
           LOG(INFO)
               << "Parameter '" << saved_param << "' has been recovered from "
@@ -266,7 +265,7 @@ class LearnerImpl : public Learner {
       }
 #endif  // XGBOOST_USE_CUDA
       // NO visible GPU in current environment
-      if (is_gpu_predictor && GPUSet::AllVisible().Size() == 0) {
+      if (is_gpu_predictor && common::AllVisibleGPUs() == 0) {
         cfg_["predictor"] = "cpu_predictor";
         kv.second = "cpu_predictor";
         LOG(INFO) << "Switch gpu_predictor to cpu_predictor.";
@@ -294,7 +293,9 @@ class LearnerImpl : public Learner {
     auto n = tparam_.__DICT__();
     cfg_.insert(n.cbegin(), n.cend());
 
-    gbm_->Configure({cfg_.cbegin(), cfg_.cend()});
+    Args args = {cfg_.cbegin(), cfg_.cend()};
+    generic_param_.InitAllowUnknown(args);
+    gbm_->Configure(args);
 
     obj_->Configure({cfg_.begin(), cfg_.end()});
     for (auto& p_metric : metrics_) {
@@ -331,9 +332,8 @@ class LearnerImpl : public Learner {
       }
     }
     {
-      // Write `predictor`, `n_gpus`, `gpu_id` parameters as extra attributes
-      for (const auto& key : std::vector<std::string>{
-          "predictor", "n_gpus", "gpu_id"}) {
+      // Write `predictor`, `gpu_id` parameters as extra attributes
+      for (const auto& key : std::vector<std::string>{"predictor", "gpu_id"}) {
         auto it = cfg_.find(key);
         if (it != cfg_.end()) {
           mparam.contain_extra_attrs = 1;
@@ -581,13 +581,8 @@ class LearnerImpl : public Learner {
     gbm_->Configure(args);
 
     if (this->gbm_->UseGPU()) {
-      if (cfg_.find("n_gpus") == cfg_.cend()) {
-        generic_param_.n_gpus = 1;
-      }
-      if (generic_param_.n_gpus != 1) {
-        LOG(FATAL) << "Single process multi-GPU training is no longer supported. "
-                      "Please switch to distributed GPU training with one process per GPU. "
" - "This can be done using Dask or Spark."; + if (cfg_.find("gpu_id") == cfg_.cend()) { + generic_param_.gpu_id = 0; } } } diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu index fad98f31f..74bbd8e25 100644 --- a/src/linear/updater_gpu_coordinate.cu +++ b/src/linear/updater_gpu_coordinate.cu @@ -19,12 +19,6 @@ namespace linear { DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate); -void RescaleIndices(int device_idx, size_t ridx_begin, - common::Span data) { - dh::LaunchN(device_idx, data.size(), - [=] __device__(size_t idx) { data[idx].index -= ridx_begin; }); -} - class DeviceShard { int device_id_; dh::BulkAllocator ba_; @@ -32,18 +26,16 @@ class DeviceShard { common::Span data_; common::Span gpair_; dh::CubMemory temp_; - size_t ridx_begin_; - size_t ridx_end_; + size_t shard_size_; public: DeviceShard(int device_id, const SparsePage &batch, // column batch - bst_uint row_begin, bst_uint row_end, + bst_uint shard_size, const LinearTrainParam ¶m, const gbm::GBLinearModelParam &model_param) : device_id_(device_id), - ridx_begin_(row_begin), - ridx_end_(row_end) { + shard_size_(shard_size) { if ( IsEmpty() ) { return; } dh::safe_cuda(cudaSetDevice(device_id_)); // The begin and end indices for the section of each column associated with @@ -51,25 +43,25 @@ class DeviceShard { std::vector> column_segments; row_ptr_ = {0}; // iterate through columns - for (auto fidx = 0; fidx < batch.Size(); fidx++) { + for (size_t fidx = 0; fidx < batch.Size(); fidx++) { common::Span col = batch[fidx]; auto cmp = [](Entry e1, Entry e2) { return e1.index < e2.index; }; auto column_begin = std::lower_bound(col.cbegin(), col.cend(), - xgboost::Entry(row_begin, 0.0f), cmp); + xgboost::Entry(0, 0.0f), cmp); auto column_end = std::lower_bound(col.cbegin(), col.cend(), - xgboost::Entry(row_end, 0.0f), cmp); + xgboost::Entry(shard_size_, 0.0f), cmp); column_segments.emplace_back( std::make_pair(column_begin - col.cbegin(), column_end - col.cbegin())); row_ptr_.push_back(row_ptr_.back() + (column_end - column_begin)); } ba_.Allocate(device_id_, &data_, row_ptr_.back(), &gpair_, - (row_end - row_begin) * model_param.num_output_group); + shard_size_ * model_param.num_output_group); - for (int fidx = 0; fidx < batch.Size(); fidx++) { + for (size_t fidx = 0; fidx < batch.Size(); fidx++) { auto col = batch[fidx]; auto seg = column_segments[fidx]; dh::safe_cuda(cudaMemcpy( @@ -77,23 +69,21 @@ class DeviceShard { col.data() + seg.first, sizeof(Entry) * (seg.second - seg.first), cudaMemcpyHostToDevice)); } - // Rescale indices with respect to current shard - RescaleIndices(device_id_, ridx_begin_, data_); } - ~DeviceShard() { + ~DeviceShard() { // NOLINT dh::safe_cuda(cudaSetDevice(device_id_)); } bool IsEmpty() { - return (ridx_end_ - ridx_begin_) == 0; + return shard_size_ == 0; } void UpdateGpair(const std::vector &host_gpair, const gbm::GBLinearModelParam &model_param) { dh::safe_cuda(cudaMemcpyAsync( gpair_.data(), - host_gpair.data() + ridx_begin_ * model_param.num_output_group, + host_gpair.data(), gpair_.size() * sizeof(GradientPair), cudaMemcpyHostToDevice)); } @@ -107,13 +97,13 @@ class DeviceShard { counting, f); auto perm = thrust::make_permutation_iterator(gpair_.data(), skip); - return dh::SumReduction(temp_, perm, ridx_end_ - ridx_begin_); + return dh::SumReduction(temp_, perm, shard_size_); } void UpdateBiasResidual(float dbias, int group_idx, int num_groups) { if (dbias == 0.0f) return; auto d_gpair = gpair_; - dh::LaunchN(device_id_, ridx_end_ - ridx_begin_, [=] 
__device__(size_t idx) { + dh::LaunchN(device_id_, shard_size_, [=] __device__(size_t idx) { auto &g = d_gpair[idx * num_groups + group_idx]; g += GradientPair(g.GetHess() * dbias, 0); }); @@ -154,7 +144,7 @@ class DeviceShard { * \brief Coordinate descent algorithm that updates one feature per iteration */ -class GPUCoordinateUpdater : public LinearUpdater { +class GPUCoordinateUpdater : public LinearUpdater { // NOLINT public: // set training parameter void Configure(Args const& args) override { @@ -165,37 +155,23 @@ class GPUCoordinateUpdater : public LinearUpdater { void LazyInitShards(DMatrix *p_fmat, const gbm::GBLinearModelParam &model_param) { - if (!shards_.empty()) return; + if (shard_) return; - dist_ = GPUDistribution::Block(GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus, - p_fmat->Info().num_row_)); - auto devices = dist_.Devices(); + device_ = learner_param_->gpu_id; - size_t n_devices = static_cast(devices.Size()); - size_t row_begin = 0; - size_t num_row = static_cast(p_fmat->Info().num_row_); + auto num_row = static_cast(p_fmat->Info().num_row_); // Partition input matrix into row segments std::vector row_segments; row_segments.push_back(0); - for (int d_idx = 0; d_idx < n_devices; ++d_idx) { - size_t shard_size = dist_.ShardSize(num_row, d_idx); - size_t row_end = row_begin + shard_size; - row_segments.push_back(row_end); - row_begin = row_end; - } + size_t shard_size = num_row; + row_segments.push_back(shard_size); CHECK(p_fmat->SingleColBlock()); SparsePage const& batch = *(p_fmat->GetBatches().begin()); - shards_.resize(n_devices); - // Create device shards - dh::ExecuteIndexShards(&shards_, - [&](int i, std::unique_ptr& shard) { - shard = std::unique_ptr( - new DeviceShard(devices.DeviceId(i), batch, row_segments[i], - row_segments[i + 1], tparam_, model_param)); - }); + // Create device shard + shard_.reset(new DeviceShard(device_, batch, shard_size, tparam_, model_param)); } void Update(HostDeviceVector *in_gpair, DMatrix *p_fmat, @@ -208,11 +184,9 @@ class GPUCoordinateUpdater : public LinearUpdater { monitor_.Start("UpdateGpair"); auto &in_gpair_host = in_gpair->ConstHostVector(); // Update gpair - dh::ExecuteIndexShards(&shards_, [&](int idx, std::unique_ptr& shard) { - if (!shard->IsEmpty()) { - shard->UpdateGpair(in_gpair_host, model->param); - } - }); + if (shard_) { + shard_->UpdateGpair(in_gpair_host, model->param); + } monitor_.Stop("UpdateGpair"); monitor_.Start("UpdateBias"); @@ -237,32 +211,21 @@ class GPUCoordinateUpdater : public LinearUpdater { } void UpdateBias(DMatrix *p_fmat, gbm::GBLinearModel *model) { - for (int group_idx = 0; group_idx < model->param.num_output_group; - ++group_idx) { + for (int group_idx = 0; group_idx < model->param.num_output_group; ++group_idx) { // Get gradient - auto grad = dh::ReduceShards( - &shards_, [&](std::unique_ptr &shard) { - if (!shard->IsEmpty()) { - GradientPair result = - shard->GetBiasGradient(group_idx, - model->param.num_output_group); - return result; - } - return GradientPair(0, 0); - }); - + auto grad = GradientPair(0, 0); + if (shard_) { + grad = shard_->GetBiasGradient(group_idx, model->param.num_output_group); + } auto dbias = static_cast( tparam_.learning_rate * - CoordinateDeltaBias(grad.GetGrad(), grad.GetHess())); + CoordinateDeltaBias(grad.GetGrad(), grad.GetHess())); model->bias()[group_idx] += dbias; // Update residual - dh::ExecuteIndexShards(&shards_, [&](int idx, std::unique_ptr& shard) { - if (!shard->IsEmpty()) { - shard->UpdateBiasResidual(dbias, group_idx, - 
model->param.num_output_group); - } - }); + if (shard_) { + shard_->UpdateBiasResidual(dbias, group_idx, model->param.num_output_group); + } } } @@ -271,38 +234,30 @@ class GPUCoordinateUpdater : public LinearUpdater { gbm::GBLinearModel *model) { bst_float &w = (*model)[fidx][group_idx]; // Get gradient - auto grad = dh::ReduceShards( - &shards_, [&](std::unique_ptr &shard) { - if (!shard->IsEmpty()) { - return shard->GetGradient(group_idx, model->param.num_output_group, - fidx); - } - return GradientPair(0, 0); - }); - + auto grad = GradientPair(0, 0); + if (shard_) { + grad = shard_->GetGradient(group_idx, model->param.num_output_group, fidx); + } auto dw = static_cast(tparam_.learning_rate * CoordinateDelta(grad.GetGrad(), grad.GetHess(), w, tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm)); w += dw; - dh::ExecuteIndexShards(&shards_, [&](int idx, - std::unique_ptr &shard) { - if (!shard->IsEmpty()) { - shard->UpdateResidual(dw, group_idx, model->param.num_output_group, fidx); - } - }); + if (shard_) { + shard_->UpdateResidual(dw, group_idx, model->param.num_output_group, fidx); + } } private: // training parameter LinearTrainParam tparam_; CoordinateParam coord_param_; - GPUDistribution dist_; + int device_{}; std::unique_ptr selector_; common::Monitor monitor_; - std::vector> shards_; + std::unique_ptr shard_{nullptr}; }; XGBOOST_REGISTER_LINEAR_UPDATER(GPUCoordinateUpdater, "gpu_coord_descent") diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu index 0415f98e1..661d02dac 100644 --- a/src/metric/elementwise_metric.cu +++ b/src/metric/elementwise_metric.cu @@ -30,8 +30,7 @@ DMLC_REGISTRY_FILE_TAG(elementwise_metric); template class ElementWiseMetricsReduction { public: - explicit ElementWiseMetricsReduction(EvalRow policy) : - policy_(std::move(policy)) {} + explicit ElementWiseMetricsReduction(EvalRow policy) : policy_(std::move(policy)) {} PackedReduceResult CpuReduceMetrics( const HostDeviceVector& weights, @@ -59,34 +58,31 @@ class ElementWiseMetricsReduction { #if defined(XGBOOST_USE_CUDA) ~ElementWiseMetricsReduction() { - for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) { - dh::safe_cuda(cudaSetDevice(id)); - size_t index = devices_.Index(id); - allocators_.at(index).Free(); + if (device_ >= 0) { + dh::safe_cuda(cudaSetDevice(device_)); + allocator_.Free(); } } PackedReduceResult DeviceReduceMetrics( - GPUSet::GpuIdType device_id, - size_t device_index, const HostDeviceVector& weights, const HostDeviceVector& labels, const HostDeviceVector& preds) { - size_t n_data = preds.DeviceSize(device_id); + size_t n_data = preds.DeviceSize(); thrust::counting_iterator begin(0); thrust::counting_iterator end = begin + n_data; - auto s_label = labels.DeviceSpan(device_id); - auto s_preds = preds.DeviceSpan(device_id); - auto s_weights = weights.DeviceSpan(device_id); + auto s_label = labels.DeviceSpan(); + auto s_preds = preds.DeviceSpan(); + auto s_weights = weights.DeviceSpan(); bool const is_null_weight = weights.Size() == 0; auto d_policy = policy_; PackedReduceResult result = thrust::transform_reduce( - thrust::cuda::par(allocators_.at(device_index)), + thrust::cuda::par(allocator_), begin, end, [=] XGBOOST_DEVICE(size_t idx) { bst_float weight = is_null_weight ? 
1.0f : s_weights[idx]; @@ -105,37 +101,24 @@ class ElementWiseMetricsReduction { PackedReduceResult Reduce( const GenericParameter &tparam, - GPUSet devices, + int device, const HostDeviceVector& weights, const HostDeviceVector& labels, const HostDeviceVector& preds) { PackedReduceResult result; - if (devices.IsEmpty()) { + if (device < 0) { result = CpuReduceMetrics(weights, labels, preds); } #if defined(XGBOOST_USE_CUDA) else { // NOLINT - if (allocators_.empty()) { - devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus); - allocators_.resize(devices_.Size()); - } - preds.Shard(devices); - labels.Shard(devices); - weights.Shard(devices); - std::vector res_per_device(devices.Size()); + device_ = device; + preds.SetDevice(device_); + labels.SetDevice(device_); + weights.SetDevice(device_); -#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1) - for (GPUSet::GpuIdType id = *devices.begin(); id < *devices.end(); ++id) { - dh::safe_cuda(cudaSetDevice(id)); - size_t index = devices.Index(id); - res_per_device.at(index) = - DeviceReduceMetrics(id, index, weights, labels, preds); - } - - for (auto const& res : res_per_device) { - result += res; - } + dh::safe_cuda(cudaSetDevice(device_)); + result = DeviceReduceMetrics(weights, labels, preds); } #endif // defined(XGBOOST_USE_CUDA) return result; @@ -144,8 +127,8 @@ class ElementWiseMetricsReduction { private: EvalRow policy_; #if defined(XGBOOST_USE_CUDA) - GPUSet devices_; - std::vector allocators_; + int device_{-1}; + dh::CubMemory allocator_; #endif // defined(XGBOOST_USE_CUDA) }; @@ -345,11 +328,10 @@ struct EvalEWiseBase : public Metric { << "label and prediction size not match, " << "hint: use merror or mlogloss for multi-class classification"; const auto ndata = static_cast(info.labels_.Size()); - // Dealing with ndata < n_gpus. 
- GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata); + int device = tparam_->gpu_id; auto result = - reducer_.Reduce(*tparam_, devices, info.weights_, info.labels_, preds); + reducer_.Reduce(*tparam_, device, info.weights_, info.labels_, preds); double dat[2] { result.Residue(), result.Weights() }; if (distributed) { diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu index 4dde1ed4d..5b18b9f5f 100644 --- a/src/metric/multiclass_metric.cu +++ b/src/metric/multiclass_metric.cu @@ -74,35 +74,32 @@ class MultiClassMetricsReduction { #if defined(XGBOOST_USE_CUDA) ~MultiClassMetricsReduction() { - for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) { - dh::safe_cuda(cudaSetDevice(id)); - size_t index = devices_.Index(id); - allocators_.at(index).Free(); + if (device_ >= 0) { + dh::safe_cuda(cudaSetDevice(device_)); + allocator_.Free(); } } PackedReduceResult DeviceReduceMetrics( - GPUSet::GpuIdType device_id, - size_t device_index, const HostDeviceVector& weights, const HostDeviceVector& labels, const HostDeviceVector& preds, const size_t n_class) { - size_t n_data = labels.DeviceSize(device_id); + size_t n_data = labels.DeviceSize(); thrust::counting_iterator begin(0); thrust::counting_iterator end = begin + n_data; - auto s_labels = labels.DeviceSpan(device_id); - auto s_preds = preds.DeviceSpan(device_id); - auto s_weights = weights.DeviceSpan(device_id); + auto s_labels = labels.DeviceSpan(); + auto s_preds = preds.DeviceSpan(); + auto s_weights = weights.DeviceSpan(); bool const is_null_weight = weights.Size() == 0; auto s_label_error = label_error_.GetSpan(1); s_label_error[0] = 0; PackedReduceResult result = thrust::transform_reduce( - thrust::cuda::par(allocators_.at(device_index)), + thrust::cuda::par(allocator_), begin, end, [=] XGBOOST_DEVICE(size_t idx) { bst_float weight = is_null_weight ? 
1.0f : s_weights[idx]; @@ -127,38 +124,25 @@ class MultiClassMetricsReduction { PackedReduceResult Reduce( const GenericParameter &tparam, - GPUSet devices, + int device, size_t n_class, const HostDeviceVector& weights, const HostDeviceVector& labels, const HostDeviceVector& preds) { PackedReduceResult result; - if (devices.IsEmpty()) { + if (device < 0) { result = CpuReduceMetrics(weights, labels, preds, n_class); } #if defined(XGBOOST_USE_CUDA) else { // NOLINT - if (allocators_.empty()) { - devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus); - allocators_.resize(devices_.Size()); - } - preds.Shard(GPUDistribution::Granular(devices, n_class)); - labels.Shard(devices); - weights.Shard(devices); - std::vector res_per_device(devices.Size()); + device_ = tparam.gpu_id; + preds.SetDevice(device_); + labels.SetDevice(device_); + weights.SetDevice(device_); -#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1) - for (GPUSet::GpuIdType id = *devices.begin(); id < *devices.end(); ++id) { - dh::safe_cuda(cudaSetDevice(id)); - size_t index = devices.Index(id); - res_per_device.at(index) = - DeviceReduceMetrics(id, index, weights, labels, preds, n_class); - } - - for (auto const& res : res_per_device) { - result += res; - } + dh::safe_cuda(cudaSetDevice(device_)); + result = DeviceReduceMetrics(weights, labels, preds, n_class); } #endif // defined(XGBOOST_USE_CUDA) return result; @@ -167,8 +151,8 @@ class MultiClassMetricsReduction { private: #if defined(XGBOOST_USE_CUDA) dh::PinnedMemory label_error_; - GPUSet devices_; - std::vector allocators_; + int device_{-1}; + dh::CubMemory allocator_; #endif // defined(XGBOOST_USE_CUDA) }; @@ -190,8 +174,8 @@ struct EvalMClassBase : public Metric { << " use logloss for binary classification"; const auto ndata = static_cast(info.labels_.Size()); - GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata); - auto result = reducer_.Reduce(*tparam_, devices, nclass, info.weights_, info.labels_, preds); + int device = tparam_->gpu_id; + auto result = reducer_.Reduce(*tparam_, device, nclass, info.weights_, info.labels_, preds); double dat[2] { result.Residue(), result.Weights() }; if (distributed) { diff --git a/src/objective/hinge.cu b/src/objective/hinge.cu index 0dd3010bc..88a9b06f3 100644 --- a/src/objective/hinge.cu +++ b/src/objective/hinge.cu @@ -58,7 +58,7 @@ class HingeObj : public ObjFunction { _out_gpair[_idx] = GradientPair(g, h); }, common::Range{0, static_cast(ndata)}, - GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata)).Eval( + tparam_->gpu_id).Eval( out_gpair, &preds, &info.labels_, &info.weights_); } @@ -68,7 +68,7 @@ class HingeObj : public ObjFunction { _preds[_idx] = _preds[_idx] > 0.0 ? 
1.0 : 0.0; }, common::Range{0, static_cast(io_preds->Size()), 1}, - GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size())) + tparam_->gpu_id) .Eval(io_preds); } diff --git a/src/objective/multiclass_obj.cu b/src/objective/multiclass_obj.cu index ca803f522..80cf69410 100644 --- a/src/objective/multiclass_obj.cu +++ b/src/objective/multiclass_obj.cu @@ -59,14 +59,14 @@ class SoftmaxMultiClassObj : public ObjFunction { const int nclass = param_.num_class; const auto ndata = static_cast(preds.Size() / nclass); - auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size()); - out_gpair->Shard(GPUDistribution::Granular(devices, nclass)); - info.labels_.Shard(GPUDistribution::Block(devices)); - info.weights_.Shard(GPUDistribution::Block(devices)); - preds.Shard(GPUDistribution::Granular(devices, nclass)); + auto device = tparam_->gpu_id; + out_gpair->SetDevice(device); + info.labels_.SetDevice(device); + info.weights_.SetDevice(device); + preds.SetDevice(device); - label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size()); - label_correct_.Shard(GPUDistribution::Block(devices)); + label_correct_.Resize(1); + label_correct_.SetDevice(device); out_gpair->Resize(preds.Size()); label_correct_.Fill(1); @@ -100,7 +100,7 @@ class SoftmaxMultiClassObj : public ObjFunction { p = label == k ? p - 1.0f : p; gpair[idx * nclass + k] = GradientPair(p * wt, h); } - }, common::Range{0, ndata}, devices, false) + }, common::Range{0, ndata}, device, false) .Eval(out_gpair, &info.labels_, &preds, &info.weights_, &label_correct_); std::vector& label_correct_h = label_correct_.HostVector(); @@ -125,7 +125,7 @@ class SoftmaxMultiClassObj : public ObjFunction { const auto ndata = static_cast(io_preds->Size() / nclass); max_preds_.Resize(ndata); - auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size()); + auto device = tparam_->gpu_id; if (prob) { common::Transform<>::Init( [=] XGBOOST_DEVICE(size_t _idx, common::Span _preds) { @@ -133,11 +133,11 @@ class SoftmaxMultiClassObj : public ObjFunction { _preds.subspan(_idx * nclass, nclass); common::Softmax(point.begin(), point.end()); }, - common::Range{0, ndata}, GPUDistribution::Granular(devices, nclass)) + common::Range{0, ndata}, device) .Eval(io_preds); } else { - io_preds->Shard(GPUDistribution::Granular(devices, nclass)); - max_preds_.Shard(GPUDistribution::Block(devices)); + io_preds->SetDevice(device); + max_preds_.SetDevice(device); common::Transform<>::Init( [=] XGBOOST_DEVICE(size_t _idx, common::Span _preds, @@ -148,7 +148,7 @@ class SoftmaxMultiClassObj : public ObjFunction { common::FindMaxIndex(point.cbegin(), point.cend()) - point.cbegin(); }, - common::Range{0, ndata}, devices, false) + common::Range{0, ndata}, device, false) .Eval(io_preds, &max_preds_); } if (!prob) { diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu index 0354d5777..ea2c92566 100644 --- a/src/objective/regression_obj.cu +++ b/src/objective/regression_obj.cu @@ -57,8 +57,8 @@ class RegLossObj : public ObjFunction { << "preds.size=" << preds.Size() << ", label.size=" << info.labels_.Size(); size_t ndata = preds.Size(); out_gpair->Resize(ndata); - auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size()); - label_correct_.Resize(devices.IsEmpty() ? 
1 : devices.Size()); + auto device = tparam_->gpu_id; + label_correct_.Resize(1); label_correct_.Fill(1); bool is_null_weight = info.weights_.Size() == 0; @@ -83,7 +83,7 @@ class RegLossObj : public ObjFunction { _out_gpair[_idx] = GradientPair(Loss::FirstOrderGradient(p, label) * w, Loss::SecondOrderGradient(p, label) * w); }, - common::Range{0, static_cast(ndata)}, devices).Eval( + common::Range{0, static_cast(ndata)}, device).Eval( &label_correct_, out_gpair, &preds, &info.labels_, &info.weights_); // copy "label correct" flags back to host @@ -105,7 +105,7 @@ class RegLossObj : public ObjFunction { [] XGBOOST_DEVICE(size_t _idx, common::Span _preds) { _preds[_idx] = Loss::PredTransform(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, - GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size())) + tparam_->gpu_id) .Eval(io_preds); } @@ -175,8 +175,8 @@ class PoissonRegression : public ObjFunction { CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided"; size_t ndata = preds.Size(); out_gpair->Resize(ndata); - auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size()); - label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size()); + auto device = tparam_->gpu_id; + label_correct_.Resize(1); label_correct_.Fill(1); bool is_null_weight = info.weights_.Size() == 0; @@ -197,7 +197,7 @@ class PoissonRegression : public ObjFunction { _out_gpair[_idx] = GradientPair{(expf(p) - y) * w, expf(p + max_delta_step) * w}; }, - common::Range{0, static_cast(ndata)}, devices).Eval( + common::Range{0, static_cast(ndata)}, device).Eval( &label_correct_, out_gpair, &preds, &info.labels_, &info.weights_); // copy "label correct" flags back to host std::vector& label_correct_h = label_correct_.HostVector(); @@ -213,7 +213,7 @@ class PoissonRegression : public ObjFunction { _preds[_idx] = expf(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, - GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size())) + tparam_->gpu_id) .Eval(io_preds); } void EvalTransform(HostDeviceVector *io_preds) override { @@ -340,9 +340,9 @@ class GammaRegression : public ObjFunction { CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty"; CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided"; const size_t ndata = preds.Size(); - auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata); + auto device = tparam_->gpu_id; out_gpair->Resize(ndata); - label_correct_.Resize(devices.IsEmpty() ? 
1 : devices.Size()); + label_correct_.Resize(1); label_correct_.Fill(1); const bool is_null_weight = info.weights_.Size() == 0; @@ -361,7 +361,7 @@ class GammaRegression : public ObjFunction { } _out_gpair[_idx] = GradientPair((1 - y / expf(p)) * w, y / expf(p) * w); }, - common::Range{0, static_cast(ndata)}, devices).Eval( + common::Range{0, static_cast(ndata)}, device).Eval( &label_correct_, out_gpair, &preds, &info.labels_, &info.weights_); // copy "label correct" flags back to host @@ -378,7 +378,7 @@ class GammaRegression : public ObjFunction { _preds[_idx] = expf(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, - GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size())) + tparam_->gpu_id) .Eval(io_preds); } void EvalTransform(HostDeviceVector *io_preds) override { @@ -430,8 +430,8 @@ class TweedieRegression : public ObjFunction { const size_t ndata = preds.Size(); out_gpair->Resize(ndata); - auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size()); - label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size()); + auto device = tparam_->gpu_id; + label_correct_.Resize(1); label_correct_.Fill(1); const bool is_null_weight = info.weights_.Size() == 0; @@ -455,7 +455,7 @@ class TweedieRegression : public ObjFunction { std::exp((1 - rho) * p) + (2 - rho) * expf((2 - rho) * p); _out_gpair[_idx] = GradientPair(grad * w, hess * w); }, - common::Range{0, static_cast(ndata), 1}, devices) + common::Range{0, static_cast(ndata), 1}, device) .Eval(&label_correct_, out_gpair, &preds, &info.labels_, &info.weights_); // copy "label correct" flags back to host @@ -472,7 +472,7 @@ class TweedieRegression : public ObjFunction { _preds[_idx] = expf(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, - GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size())) + tparam_->gpu_id) .Eval(io_preds); } diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 5fe4d426d..bd3ac3583 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -20,12 +20,6 @@ namespace predictor { DMLC_REGISTRY_FILE_TAG(gpu_predictor); -template -void IncrementOffset(IterT begin_itr, IterT end_itr, size_t amount) { - thrust::transform(begin_itr, end_itr, begin_itr, - [=] __device__(size_t elem) { return elem + amount; }); -} - /** * \struct DevicePredictionNode * @@ -44,7 +38,7 @@ struct DevicePredictionNode { int fidx; int left_child_idx; int right_child_idx; - NodeValue val; + NodeValue val{}; DevicePredictionNode(const RegTree::Node& n) { // NOLINT static_assert(sizeof(DevicePredictionNode) == 16, "Size is not 16 bytes"); @@ -200,58 +194,14 @@ __global__ void PredictKernel(common::Span d_nodes, } class GPUPredictor : public xgboost::Predictor { - protected: - struct DevicePredictionCacheEntry { - std::shared_ptr data; - HostDeviceVector predictions; - }; - private: - void DeviceOffsets(const HostDeviceVector& data, - size_t total_size, - std::vector* out_offsets) { - auto& offsets = *out_offsets; - offsets.resize(devices_.Size() + 1); - offsets[0] = 0; -#pragma omp parallel for schedule(static, 1) if (devices_.Size() > 1) - for (int shard = 0; shard < devices_.Size(); ++shard) { - int device = devices_.DeviceId(shard); - auto data_span = data.DeviceSpan(device); - dh::safe_cuda(cudaSetDevice(device)); - if (data_span.size() == 0) { - offsets[shard + 1] = total_size; - } else { - // copy the last element from every shard - dh::safe_cuda(cudaMemcpy(&offsets.at(shard + 1), - &data_span[data_span.size()-1], - 
sizeof(size_t), cudaMemcpyDeviceToHost)); - } - } - } - - // This function populates the explicit offsets that can be used to create a window into the - // underlying host vector. The window starts from the `batch_offset` and has a size of - // `batch_size`, and is sharded across all the devices. Each shard is granular depending on - // the number of output classes `n_classes`. - void PredictionDeviceOffsets(size_t total_size, size_t batch_offset, size_t batch_size, - int n_classes, std::vector* out_offsets) { - auto& offsets = *out_offsets; - size_t n_shards = devices_.Size(); - offsets.resize(n_shards + 2); - size_t rows_per_shard = common::DivRoundUp(batch_size, n_shards); - for (size_t shard = 0; shard < devices_.Size(); ++shard) { - size_t n_rows = std::min(batch_size, shard * rows_per_shard); - offsets[shard] = batch_offset + n_rows * n_classes; - } - offsets[n_shards] = batch_offset + batch_size * n_classes; - offsets[n_shards + 1] = total_size; - } - struct DeviceShard { DeviceShard() : device_{-1} {} ~DeviceShard() { - dh::safe_cuda(cudaSetDevice(device_)); + if (device_ >= 0) { + dh::safe_cuda(cudaSetDevice(device_)); + } } void Init(int device) { @@ -284,10 +234,9 @@ class GPUPredictor : public xgboost::Predictor { void PredictInternal (const SparsePage& batch, size_t num_features, HostDeviceVector* predictions) { - if (predictions->DeviceSize(device_) == 0) { return; } dh::safe_cuda(cudaSetDevice(device_)); const int BLOCK_THREADS = 128; - size_t num_rows = batch.offset.DeviceSize(device_) - 1; + size_t num_rows = batch.offset.DeviceSize() - 1; const int GRID_SIZE = static_cast(common::DivRoundUp(num_rows, BLOCK_THREADS)); int shared_memory_bytes = static_cast @@ -297,14 +246,12 @@ class GPUPredictor : public xgboost::Predictor { shared_memory_bytes = 0; use_shared = false; } - const auto& data_distr = batch.data.Distribution(); - size_t entry_start = data_distr.ShardStart(batch.data.Size(), - data_distr.Devices().Index(device_)); + size_t entry_start = 0; PredictKernel<<>> - (dh::ToSpan(nodes_), predictions->DeviceSpan(device_), dh::ToSpan(tree_segments_), - dh::ToSpan(tree_group_), batch.offset.DeviceSpan(device_), - batch.data.DeviceSpan(device_), this->tree_begin_, this->tree_end_, num_features, + (dh::ToSpan(nodes_), predictions->DeviceSpan(), dh::ToSpan(tree_segments_), + dh::ToSpan(tree_group_), batch.offset.DeviceSpan(), + batch.data.DeviceSpan(), this->tree_begin_, this->tree_end_, num_features, num_rows, entry_start, use_shared, this->num_group_); } @@ -322,7 +269,7 @@ class GPUPredictor : public xgboost::Predictor { void InitModel(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end) { CHECK_EQ(model.param.size_leaf_vector, 0); // Copy decision trees to device - thrust::host_vector h_tree_segments; + thrust::host_vector h_tree_segments{}; h_tree_segments.reserve((tree_end - tree_begin) + 1); size_t sum = 0; h_tree_segments.push_back(sum); @@ -337,9 +284,7 @@ class GPUPredictor : public xgboost::Predictor { std::copy(src_nodes.begin(), src_nodes.end(), h_nodes.begin() + h_tree_segments[tree_idx - tree_begin]); } - dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard &shard) { - shard.InitModel(model, h_tree_segments, h_nodes, tree_begin, tree_end); - }); + shard_.InitModel(model, h_tree_segments, h_nodes, tree_begin, tree_end); } void DevicePredictInternal(DMatrix* dmat, @@ -352,40 +297,43 @@ class GPUPredictor : public xgboost::Predictor { InitModel(model, tree_begin, tree_end); size_t batch_offset = 0; + auto* preds = out_preds; + std::unique_ptr> 
batch_preds{nullptr}; for (auto &batch : dmat->GetBatches()) { bool is_external_memory = batch.Size() < dmat->Info().num_row_; if (is_external_memory) { - std::vector out_preds_offsets; - PredictionDeviceOffsets(out_preds->Size(), batch_offset, batch.Size(), - model.param.num_output_group, &out_preds_offsets); - out_preds->Reshard(GPUDistribution::Explicit(devices_, out_preds_offsets)); + batch_preds.reset(new HostDeviceVector); + batch_preds->Resize(batch.Size() * model.param.num_output_group); + std::copy(out_preds->ConstHostVector().begin() + batch_offset, + out_preds->ConstHostVector().begin() + batch_offset + batch_preds->Size(), + batch_preds->HostVector().begin()); + preds = batch_preds.get(); } - batch.offset.Shard(GPUDistribution::Overlap(devices_, 1)); - std::vector device_offsets; - DeviceOffsets(batch.offset, batch.data.Size(), &device_offsets); - batch.data.Reshard(GPUDistribution::Explicit(devices_, device_offsets)); + batch.offset.SetDevice(device_); + batch.data.SetDevice(device_); + preds->SetDevice(device_); + shard_.PredictInternal(batch, model.param.num_feature, preds); - dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { - shard.PredictInternal(batch, model.param.num_feature, out_preds); - }); + if (is_external_memory) { + auto h_preds = preds->ConstHostVector(); + std::copy(h_preds.begin(), h_preds.end(), out_preds->HostVector().begin() + batch_offset); + } batch_offset += batch.Size() * model.param.num_output_group; } - out_preds->Reshard(GPUDistribution::Granular(devices_, model.param.num_output_group)); monitor_.StopCuda("DevicePredictInternal"); } public: - GPUPredictor() = default; + GPUPredictor() : device_{-1} {}; void PredictBatch(DMatrix* dmat, HostDeviceVector* out_preds, const gbm::GBTreeModel& model, int tree_begin, unsigned ntree_limit = 0) override { - GPUSet devices = GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus, - dmat->Info().num_row_); - CHECK_NE(devices.Size(), 0); - ConfigureShards(devices); + int device = learner_param_->gpu_id; + CHECK_GE(device, 0); + ConfigureShard(device); if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) { return; @@ -408,10 +356,9 @@ class GPUPredictor : public xgboost::Predictor { size_t n_classes = model.param.num_output_group; size_t n = n_classes * info.num_row_; const HostDeviceVector& base_margin = info.base_margin_; - out_preds->Shard(GPUDistribution::Granular(devices_, n_classes)); out_preds->Resize(n); if (base_margin.Size() != 0) { - CHECK_EQ(out_preds->Size(), n); + CHECK_EQ(base_margin.Size(), n); out_preds->Copy(base_margin); } else { out_preds->Fill(model.base_margin); @@ -427,7 +374,7 @@ class GPUPredictor : public xgboost::Predictor { const HostDeviceVector& y = it->second.predictions; if (y.Size() != 0) { monitor_.StartCuda("PredictFromCache"); - out_preds->Shard(y.Distribution()); + out_preds->SetDevice(y.DeviceIdx()); out_preds->Resize(y.Size()); out_preds->Copy(y); monitor_.StopCuda("PredictFromCache"); @@ -500,25 +447,23 @@ class GPUPredictor : public xgboost::Predictor { const std::vector>& cache) override { Predictor::Configure(cfg, cache); - GPUSet devices = GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus); - ConfigureShards(devices); + int device = learner_param_->gpu_id; + if (device >= 0) { + ConfigureShard(device); + } } private: /*! \brief Re configure shards when GPUSet is changed. 
*/ - void ConfigureShards(GPUSet devices) { - if (devices_ == devices) return; + void ConfigureShard(int device) { + if (device_ == device) return; - devices_ = devices; - shards_.clear(); - shards_.resize(devices_.Size()); - dh::ExecuteIndexShards(&shards_, [=](size_t i, DeviceShard& shard){ - shard.Init(devices_.DeviceId(i)); - }); + device_ = device; + shard_.Init(device_); } - std::vector shards_; - GPUSet devices_; + DeviceShard shard_; + int device_; common::Monitor monitor_; }; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index f78825ab6..b72acac8e 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -702,7 +702,7 @@ struct DeviceShard { row_partitioner.reset(new RowPartitioner(device_id, n_rows)); dh::safe_cuda(cudaMemcpyAsync( - gpair.data(), dh_gpair->ConstDevicePointer(device_id), + gpair.data(), dh_gpair->ConstDevicePointer(), gpair.size() * sizeof(GradientPair), cudaMemcpyHostToHost)); SubsampleGradientPair(device_id, gpair, param.subsample, row_begin_idx); hist.Reset(); @@ -745,8 +745,8 @@ struct DeviceShard { for (auto i = 0ull; i < nidxs.size(); i++) { auto nidx = nidxs[i]; auto p_feature_set = column_sampler.GetFeatureSet(tree.GetDepth(nidx)); - p_feature_set->Shard(GPUSet(device_id, 1)); - auto d_sampled_features = p_feature_set->DeviceSpan(device_id); + p_feature_set->SetDevice(device_id); + auto d_sampled_features = p_feature_set->DeviceSpan(); common::Span d_feature_set = interaction_constraints.Query(d_sampled_features, nidx); auto d_split_candidates = @@ -1016,7 +1016,7 @@ struct DeviceShard { dh::AllReducer* reducer, int64_t num_columns) { constexpr int kRootNIdx = 0; - const auto &gpair = gpair_all->DeviceSpan(device_id); + const auto &gpair = gpair_all->DeviceSpan(); dh::SumReduction(temp_memory, gpair, node_sum_gradients_d, gpair.size()); @@ -1294,11 +1294,8 @@ class GPUHistMakerSpecialised { param_.InitAllowUnknown(args); generic_param_ = generic_param; hist_maker_param_.InitAllowUnknown(args); - auto devices = GPUSet::All(generic_param_->gpu_id, - generic_param_->n_gpus); - n_devices_ = devices.Size(); - CHECK(n_devices_ != 0) << "Must have at least one device"; - dist_ = GPUDistribution::Block(devices); + device_ = generic_param_->gpu_id; + CHECK_GE(device_, 0) << "Must have at least one device"; dh::CheckComputeCapability(); @@ -1330,30 +1327,22 @@ class GPUHistMakerSpecialised { void InitDataOnce(DMatrix* dmat) { info_ = &dmat->Info(); - int n_devices = dist_.Devices().Size(); - - device_list_.resize(n_devices); - for (int index = 0; index < n_devices; ++index) { - int device_id = dist_.Devices().DeviceId(index); - device_list_[index] = device_id; - } - - reducer_.Init(device_list_); + reducer_.Init({device_}); // Synchronise the column sampling seed uint32_t column_sampling_seed = common::GlobalRandom()(); rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0); // Create device shards - shards_.resize(n_devices); + shards_.resize(1); dh::ExecuteIndexShards( &shards_, [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(dist_.Devices().DeviceId(idx))); - size_t start = dist_.ShardStart(info_->num_row_, idx); - size_t size = dist_.ShardSize(info_->num_row_, idx); + dh::safe_cuda(cudaSetDevice(device_)); + size_t start = 0; + size_t size = info_->num_row_; shard = std::unique_ptr>( - new DeviceShard(dist_.Devices().DeviceId(idx), idx, + new DeviceShard(device_, idx, start, start + size, param_, column_sampling_seed, info_->num_col_)); @@ -1436,7 +1425,7 @@ class 
GPUHistMakerSpecialised { for (auto& tree : trees) { tree = *p_tree; } - gpair->Reshard(dist_); + gpair->SetDevice(device_); // Launch one thread for each device "shard" containing a subset of rows. // Threads will cooperatively build the tree, synchronising over histograms. @@ -1462,13 +1451,13 @@ class GPUHistMakerSpecialised { return false; } monitor_.StartCuda("UpdatePredictionCache"); - p_out_preds->Shard(dist_.Devices()); + p_out_preds->SetDevice(device_); dh::ExecuteIndexShards( &shards_, [&](int idx, std::unique_ptr>& shard) { dh::safe_cuda(cudaSetDevice(shard->device_id)); shard->UpdatePredictionCache( - p_out_preds->DevicePointer(shard->device_id)); + p_out_preds->DevicePointer()); }); monitor_.StopCuda("UpdatePredictionCache"); return true; @@ -1483,7 +1472,6 @@ class GPUHistMakerSpecialised { private: bool initialised_; - int n_devices_; int n_bins_; GPUHistMakerTrainParam hist_maker_param_; @@ -1492,11 +1480,9 @@ class GPUHistMakerSpecialised { dh::AllReducer reducer_; DMatrix* p_last_fmat_; - GPUDistribution dist_; + int device_; common::Monitor monitor_; - /*! List storing device id. */ - std::vector device_list_; }; class GPUHistMaker : public TreeUpdater { diff --git a/tests/cpp/common/test_common.cc b/tests/cpp/common/test_common.cc deleted file mode 100644 index ba6946e80..000000000 --- a/tests/cpp/common/test_common.cc +++ /dev/null @@ -1,37 +0,0 @@ -#include "../../../src/common/common.h" -#include - -namespace xgboost { -TEST(GPUSet, Basic) { - GPUSet devices = GPUSet::Empty(); - ASSERT_TRUE(devices.IsEmpty()); - - devices = GPUSet{0, 1}; - ASSERT_TRUE(devices != GPUSet::Empty()); - EXPECT_EQ(devices.Size(), 1); - - devices = GPUSet::Range(1, 0); - EXPECT_EQ(devices.Size(), 0); - EXPECT_TRUE(devices.IsEmpty()); - - EXPECT_FALSE(devices.Contains(1)); - - devices = GPUSet::Range(2, -1); - EXPECT_EQ(devices, GPUSet::Empty()); - EXPECT_EQ(devices.Size(), 0); - EXPECT_TRUE(devices.IsEmpty()); - - devices = GPUSet::Range(2, 8); // 2 ~ 10 - EXPECT_EQ(devices.Size(), 8); - EXPECT_ANY_THROW(devices.DeviceId(8)); - - auto device_id = devices.DeviceId(0); - EXPECT_EQ(device_id, 2); - auto device_index = devices.Index(2); - EXPECT_EQ(device_index, 0); - -#ifndef XGBOOST_USE_CUDA - EXPECT_EQ(GPUSet::AllVisible(), GPUSet::Empty()); -#endif -} -} // namespace xgboost diff --git a/tests/cpp/common/test_common.cu b/tests/cpp/common/test_common.cu deleted file mode 100644 index f2d62cebc..000000000 --- a/tests/cpp/common/test_common.cu +++ /dev/null @@ -1,83 +0,0 @@ -#include -#include -#include "../../../src/common/common.h" -#include "../helpers.h" - -#include - -namespace xgboost { - -TEST(GPUSet, GPUBasic) { - GPUSet devices = GPUSet::Empty(); - ASSERT_TRUE(devices.IsEmpty()); - - devices = GPUSet{1, 1}; - ASSERT_TRUE(devices != GPUSet::Empty()); - EXPECT_EQ(devices.Size(), 1); - EXPECT_EQ(*(devices.begin()), 1); - - devices = GPUSet::Range(1, 0); - EXPECT_EQ(devices, GPUSet::Empty()); - EXPECT_EQ(devices.Size(), 0); - EXPECT_TRUE(devices.IsEmpty()); - - EXPECT_FALSE(devices.Contains(1)); - - devices = GPUSet::Range(2, -1); - EXPECT_EQ(devices, GPUSet::Empty()); - - devices = GPUSet::Range(2, 8); - EXPECT_EQ(devices.Size(), 8); - - EXPECT_EQ(*devices.begin(), 2); - EXPECT_EQ(*devices.end(), 2 + devices.Size()); - EXPECT_EQ(8, devices.Size()); - - ASSERT_NO_THROW(GPUSet::AllVisible()); - devices = GPUSet::AllVisible(); - if (devices.IsEmpty()) { - LOG(WARNING) << "Empty devices."; - } -} - -TEST(GPUSet, Verbose) { - { - std::map args {}; - args["verbosity"] = "3"; // LOG INFO - - 
testing::internal::CaptureStderr(); - ConsoleLogger::Configure({args.cbegin(), args.cend()}); - GPUSet::All(0, 1); - std::string output = testing::internal::GetCapturedStderr(); - ASSERT_NE(output.find("GPU ID: 0"), std::string::npos); - ASSERT_NE(output.find("GPUs: 1"), std::string::npos); - - args["verbosity"] = "1"; // restore - ConsoleLogger::Configure({args.cbegin(), args.cend()}); - } -} - -#if defined(XGBOOST_USE_NCCL) -TEST(GPUSet, MGPU_GPUBasic) { - { - GPUSet devices = GPUSet::All(1, 1); - ASSERT_EQ(*(devices.begin()), 1); - ASSERT_EQ(*(devices.end()), 2); - ASSERT_EQ(devices.Size(), 1); - ASSERT_TRUE(devices.Contains(1)); - } - - { - GPUSet devices = GPUSet::All(0, -1); - ASSERT_GE(devices.Size(), 2); - } - - // Specify number of rows. - { - GPUSet devices = GPUSet::All(0, -1, 1); - ASSERT_EQ(devices.Size(), 1); - } -} -#endif - -} // namespace xgboost \ No newline at end of file diff --git a/tests/cpp/common/test_config.cc b/tests/cpp/common/test_config.cc index 542bf2c30..7bf61dcfd 100644 --- a/tests/cpp/common/test_config.cc +++ b/tests/cpp/common/test_config.cc @@ -87,8 +87,8 @@ TEST(ConfigParser, ParseKeyValuePair) { ASSERT_TRUE(parser.ParseKeyValuePair("booster = gbtree", &key, &value)); ASSERT_EQ(key, "booster"); ASSERT_EQ(value, "gbtree"); - ASSERT_TRUE(parser.ParseKeyValuePair("n_gpus = 2", &key, &value)); - ASSERT_EQ(key, "n_gpus"); + ASSERT_TRUE(parser.ParseKeyValuePair("gpu_id = 2", &key, &value)); + ASSERT_EQ(key, "gpu_id"); ASSERT_EQ(value, "2"); ASSERT_TRUE(parser.ParseKeyValuePair("monotone_constraints = (1,0,-1)", &key, &value)); diff --git a/tests/cpp/common/test_gpu_hist_util.cu b/tests/cpp/common/test_gpu_hist_util.cu index 5be7d8dd2..bbb1cc4bf 100644 --- a/tests/cpp/common/test_gpu_hist_util.cu +++ b/tests/cpp/common/test_gpu_hist_util.cu @@ -18,7 +18,7 @@ namespace xgboost { namespace common { -void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) { +void TestDeviceSketch(bool use_external_memory) { // create the data int nrows = 10001; std::shared_ptr *dmat = nullptr; @@ -53,7 +53,7 @@ void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) { // find the cuts on the GPU HistogramCuts hmat_gpu; - size_t row_stride = DeviceSketch(p, CreateEmptyGenericParam(0, devices.Size()), gpu_batch_nrows, + size_t row_stride = DeviceSketch(p, CreateEmptyGenericParam(0), gpu_batch_nrows, dmat->get(), &hmat_gpu); // compare the row stride with the one obtained from the dmatrix @@ -81,11 +81,11 @@ void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) { } TEST(gpu_hist_util, DeviceSketch) { - TestDeviceSketch(GPUSet::Range(0, 1), false); + TestDeviceSketch(false); } TEST(gpu_hist_util, DeviceSketch_ExternalMemory) { - TestDeviceSketch(GPUSet::Range(0, 1), true); + TestDeviceSketch(true); } } // namespace common diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu index 57c49754e..1ab4348c8 100644 --- a/tests/cpp/common/test_host_device_vector.cu +++ b/tests/cpp/common/test_host_device_vector.cu @@ -30,45 +30,36 @@ struct HostDeviceVectorSetDeviceHandler { } }; -void InitHostDeviceVector(size_t n, const GPUDistribution& distribution, - HostDeviceVector *v) { +void InitHostDeviceVector(size_t n, int device, HostDeviceVector *v) { // create the vector - GPUSet devices = distribution.Devices(); - v->Shard(distribution); + v->SetDevice(device); v->Resize(n); ASSERT_EQ(v->Size(), n); - ASSERT_TRUE(v->Distribution() == distribution); - ASSERT_TRUE(v->Devices() == devices); - // ensure 
that the devices have read-write access - for (int i = 0; i < devices.Size(); ++i) { - ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead)); - ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kWrite)); - } + ASSERT_EQ(v->DeviceIdx(), device); + // ensure that the device has read-write access + ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead)); + ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kWrite)); // ensure that the host has no access ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite)); ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead)); // fill in the data on the host std::vector& data_h = v->HostVector(); - // ensure that the host has full access, while the devices have none + // ensure that the host has full access, while the device has none ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead)); ASSERT_TRUE(v->HostCanAccess(GPUAccess::kWrite)); - for (int i = 0; i < devices.Size(); ++i) { - ASSERT_FALSE(v->DeviceCanAccess(i, GPUAccess::kRead)); - ASSERT_FALSE(v->DeviceCanAccess(i, GPUAccess::kWrite)); - } + ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kRead)); + ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kWrite)); ASSERT_EQ(data_h.size(), n); std::copy_n(thrust::make_counting_iterator(0), n, data_h.begin()); } void PlusOne(HostDeviceVector *v) { - int n_devices = v->Devices().Size(); - for (int i = 0; i < n_devices; ++i) { - SetDevice(i); - thrust::transform(v->tbegin(i), v->tend(i), v->tbegin(i), - [=]__device__(unsigned int a){ return a + 1; }); - } + int device = v->DeviceIdx(); + SetDevice(device); + thrust::transform(v->tbegin(), v->tend(), v->tbegin(), + [=]__device__(unsigned int a){ return a + 1; }); } void CheckDevice(HostDeviceVector *v, @@ -76,24 +67,24 @@ void CheckDevice(HostDeviceVector *v, const std::vector& sizes, unsigned int first, GPUAccess access) { int n_devices = sizes.size(); - ASSERT_EQ(v->Devices().Size(), n_devices); + ASSERT_EQ(n_devices, 1); for (int i = 0; i < n_devices; ++i) { - ASSERT_EQ(v->DeviceSize(i), sizes.at(i)); + ASSERT_EQ(v->DeviceSize(), sizes.at(i)); SetDevice(i); - ASSERT_TRUE(thrust::equal(v->tcbegin(i), v->tcend(i), + ASSERT_TRUE(thrust::equal(v->tcbegin(), v->tcend(), thrust::make_counting_iterator(first + starts[i]))); - ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead)); + ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead)); // ensure that the device has at most the access specified by access - ASSERT_EQ(v->DeviceCanAccess(i, GPUAccess::kWrite), access == GPUAccess::kWrite); + ASSERT_EQ(v->DeviceCanAccess(GPUAccess::kWrite), access == GPUAccess::kWrite); ASSERT_EQ(v->HostCanAccess(GPUAccess::kRead), access == GPUAccess::kRead); ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite)); for (int i = 0; i < n_devices; ++i) { SetDevice(i); - ASSERT_TRUE(thrust::equal(v->tbegin(i), v->tend(i), + ASSERT_TRUE(thrust::equal(v->tbegin(), v->tend(), thrust::make_counting_iterator(first + starts[i]))); - ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead)); - ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kWrite)); + ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead)); + ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kWrite)); } ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead)); ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite)); @@ -107,20 +98,20 @@ void CheckHost(HostDeviceVector *v, GPUAccess access) { } ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead)); ASSERT_EQ(v->HostCanAccess(GPUAccess::kWrite), access == GPUAccess::kWrite); - size_t n_devices = v->Devices().Size(); + size_t n_devices = 1; for (int i = 0; i < n_devices; ++i) { - ASSERT_EQ(v->DeviceCanAccess(i,
GPUAccess::kRead), access == GPUAccess::kRead); + ASSERT_EQ(v->DeviceCanAccess(GPUAccess::kRead), access == GPUAccess::kRead); // the devices should have no write access - ASSERT_FALSE(v->DeviceCanAccess(i, GPUAccess::kWrite)); + ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kWrite)); } } void TestHostDeviceVector -(size_t n, const GPUDistribution& distribution, +(size_t n, int device, const std::vector& starts, const std::vector& sizes) { HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVector v; - InitHostDeviceVector(n, distribution, &v); + InitHostDeviceVector(n, device, &v); CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead); PlusOne(&v); CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite); @@ -130,54 +121,24 @@ void TestHostDeviceVector TEST(HostDeviceVector, TestBlock) { size_t n = 1001; - int n_devices = 2; - auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices)); - std::vector starts{0, 501}; - std::vector sizes{501, 500}; - TestHostDeviceVector(n, distribution, starts, sizes); -} - -TEST(HostDeviceVector, TestGranular) { - size_t n = 3003; - int n_devices = 2; - auto distribution = GPUDistribution::Granular(GPUSet::Range(0, n_devices), 3); - std::vector starts{0, 1503}; - std::vector sizes{1503, 1500}; - TestHostDeviceVector(n, distribution, starts, sizes); -} - -TEST(HostDeviceVector, TestOverlap) { - size_t n = 1001; - int n_devices = 2; - auto distribution = GPUDistribution::Overlap(GPUSet::Range(0, n_devices), 1); - std::vector starts{0, 500}; - std::vector sizes{501, 501}; - TestHostDeviceVector(n, distribution, starts, sizes); -} - -TEST(HostDeviceVector, TestExplicit) { - size_t n = 1001; - int n_devices = 2; - std::vector offsets{0, 550, 1001}; - auto distribution = GPUDistribution::Explicit(GPUSet::Range(0, n_devices), offsets); - std::vector starts{0, 550}; - std::vector sizes{550, 451}; - TestHostDeviceVector(n, distribution, starts, sizes); + int device = 0; + std::vector starts{0}; + std::vector sizes{1001}; + TestHostDeviceVector(n, device, starts, sizes); } TEST(HostDeviceVector, TestCopy) { size_t n = 1001; - int n_devices = 2; - auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices)); - std::vector starts{0, 501}; - std::vector sizes{501, 500}; + int device = 0; + std::vector starts{0}; + std::vector sizes{1001}; HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVector v; { // a separate scope to ensure that v1 is gone before further checks HostDeviceVector v1; - InitHostDeviceVector(n, distribution, &v1); + InitHostDeviceVector(n, device, &v1); v = v1; } CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead); @@ -193,16 +154,16 @@ TEST(HostDeviceVector, Shard) { h_vec[i] = i; } HostDeviceVector vec (h_vec); - auto devices = GPUSet::Range(0, 1); + auto device = 0; - vec.Shard(devices); - ASSERT_EQ(vec.DeviceSize(0), h_vec.size()); + vec.SetDevice(device); + ASSERT_EQ(vec.DeviceSize(), h_vec.size()); ASSERT_EQ(vec.Size(), h_vec.size()); - auto span = vec.DeviceSpan(0); // sync to device + auto span = vec.DeviceSpan(); // sync to device - vec.Reshard(GPUDistribution::Empty()); // pull back to cpu, empty devices. + vec.SetDevice(-1); // pull back to cpu. 
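// The Shard test above is the migration in miniature: a HostDeviceVector now
// lives on at most one device, selected with SetDevice(), and -1 means host
// only. A minimal sketch of the resulting access pattern (the element type and
// values are illustrative; the accessors are the ones used throughout this diff):
//
//   std::vector<float> h_vec(100, 1.0f);
//   xgboost::HostDeviceVector<float> vec(h_vec);  // data starts on the host
//   vec.SetDevice(0);             // bind to device ordinal 0
//   auto span = vec.DeviceSpan(); // first device access syncs host -> device
//   vec.SetDevice(-1);            // detach; -1 selects the CPU
//   auto& h = vec.HostVector();   // next host access pulls the data back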
ASSERT_EQ(vec.Size(), h_vec.size()); - ASSERT_TRUE(vec.Devices().IsEmpty()); + ASSERT_EQ(vec.DeviceIdx(), -1); auto h_vec_1 = vec.HostVector(); ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin())); @@ -214,16 +175,16 @@ TEST(HostDeviceVector, Reshard) { h_vec[i] = i; } HostDeviceVector vec (h_vec); - auto devices = GPUSet::Range(0, 1); + auto device = 0; - vec.Shard(devices); - ASSERT_EQ(vec.DeviceSize(0), h_vec.size()); + vec.SetDevice(device); + ASSERT_EQ(vec.DeviceSize(), h_vec.size()); ASSERT_EQ(vec.Size(), h_vec.size()); PlusOne(&vec); - vec.Reshard(GPUDistribution::Empty()); + vec.SetDevice(-1); ASSERT_EQ(vec.Size(), h_vec.size()); - ASSERT_TRUE(vec.Devices().IsEmpty()); + ASSERT_EQ(vec.DeviceIdx(), -1); auto h_vec_1 = vec.HostVector(); for (size_t i = 0; i < h_vec_1.size(); ++i) { @@ -233,97 +194,14 @@ TEST(HostDeviceVector, Reshard) { TEST(HostDeviceVector, Span) { HostDeviceVector vec {1.0f, 2.0f, 3.0f, 4.0f}; - vec.Shard(GPUSet{0, 1}); - auto span = vec.DeviceSpan(0); - ASSERT_EQ(vec.DeviceSize(0), span.size()); - ASSERT_EQ(vec.DevicePointer(0), span.data()); - auto const_span = vec.ConstDeviceSpan(0); - ASSERT_EQ(vec.DeviceSize(0), span.size()); - ASSERT_EQ(vec.ConstDevicePointer(0), span.data()); + vec.SetDevice(0); + auto span = vec.DeviceSpan(); + ASSERT_EQ(vec.DeviceSize(), span.size()); + ASSERT_EQ(vec.DevicePointer(), span.data()); + auto const_span = vec.ConstDeviceSpan(); + ASSERT_EQ(vec.DeviceSize(), span.size()); + ASSERT_EQ(vec.ConstDevicePointer(), span.data()); } -// Multi-GPUs' test -#if defined(XGBOOST_USE_NCCL) -TEST(HostDeviceVector, MGPU_Shard) { - auto devices = GPUSet::AllVisible(); - if (devices.Size() < 2) { - LOG(WARNING) << "Not testing in multi-gpu environment."; - return; - } - - std::vector h_vec (2345); - for (size_t i = 0; i < h_vec.size(); ++i) { - h_vec[i] = i; - } - HostDeviceVector vec (h_vec); - - // Data size for each device. - std::vector devices_size (devices.Size()); - - // From CPU to GPUs. - vec.Shard(devices); - size_t total_size = 0; - for (size_t i = 0; i < devices.Size(); ++i) { - total_size += vec.DeviceSize(i); - devices_size[i] = vec.DeviceSize(i); - } - ASSERT_EQ(total_size, h_vec.size()); - ASSERT_EQ(total_size, vec.Size()); - - // Shard from devices to devices with different distribution. 
- EXPECT_ANY_THROW( - vec.Shard(GPUDistribution::Granular(devices, 12))); - - // All data is drawn back to CPU - vec.Reshard(GPUDistribution::Empty()); - ASSERT_TRUE(vec.Devices().IsEmpty()); - ASSERT_EQ(vec.Size(), h_vec.size()); - - vec.Shard(GPUDistribution::Granular(devices, 12)); - total_size = 0; - for (size_t i = 0; i < devices.Size(); ++i) { - total_size += vec.DeviceSize(i); - devices_size[i] = vec.DeviceSize(i); - } - ASSERT_EQ(total_size, h_vec.size()); - ASSERT_EQ(total_size, vec.Size()); -} - -TEST(HostDeviceVector, MGPU_Reshard) { - auto devices = GPUSet::AllVisible(); - if (devices.Size() < 2) { - LOG(WARNING) << "Not testing in multi-gpu environment."; - return; - } - - size_t n = 1001; - int n_devices = 2; - auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices)); - std::vector starts{0, 501}; - std::vector sizes{501, 500}; - - HostDeviceVector v; - InitHostDeviceVector(n, distribution, &v); - CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead); - PlusOne(&v); - CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite); - CheckHost(&v, GPUAccess::kRead); - CheckHost(&v, GPUAccess::kWrite); - - auto distribution1 = GPUDistribution::Overlap(GPUSet::Range(0, n_devices), 1); - v.Reshard(distribution1); - - for (size_t i = 0; i < n_devices; ++i) { - auto span = v.DeviceSpan(i); // sync to device - } - - std::vector starts1{0, 500}; - std::vector sizes1{501, 501}; - CheckDevice(&v, starts1, sizes1, 1, GPUAccess::kWrite); - CheckHost(&v, GPUAccess::kRead); - CheckHost(&v, GPUAccess::kWrite); -} -#endif - } // namespace common } // namespace xgboost diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc index 1b37ba885..1ae31236b 100644 --- a/tests/cpp/common/test_json.cc +++ b/tests/cpp/common/test_json.cc @@ -33,7 +33,7 @@ std::string GetModelStr() { }, "configuration": { "booster": "gbtree", - "n_gpus": "1", + "gpu_id": "0", "num_class": "0", "num_feature": "10", "objective": "reg:linear", diff --git a/tests/cpp/common/test_transform_range.cc b/tests/cpp/common/test_transform_range.cc index 902c282bc..4dee523ef 100644 --- a/tests/cpp/common/test_transform_range.cc +++ b/tests/cpp/common/test_transform_range.cc @@ -9,13 +9,11 @@ #if defined(__CUDACC__) -#define TRANSFORM_GPU_RANGE GPUSet::Range(0, 1) -#define TRANSFORM_GPU_DIST GPUDistribution::Block(GPUSet::Range(0, 1)) +#define TRANSFORM_GPU 0 #else -#define TRANSFORM_GPU_RANGE GPUSet::Empty() -#define TRANSFORM_GPU_DIST GPUDistribution::Block(GPUSet::Empty()) +#define TRANSFORM_GPU -1 #endif @@ -46,13 +44,13 @@ TEST(Transform, DeclareUnifiedTest(Basic)) { std::vector h_sol(size); InitializeRange(h_sol.begin(), h_sol.end()); - const HostDeviceVector in_vec{h_in, TRANSFORM_GPU_DIST}; - HostDeviceVector out_vec{h_out, TRANSFORM_GPU_DIST}; + const HostDeviceVector in_vec{h_in, TRANSFORM_GPU}; + HostDeviceVector out_vec{h_out, TRANSFORM_GPU}; out_vec.Fill(0); Transform<>::Init(TestTransformRange{}, Range{0, static_cast(size)}, - TRANSFORM_GPU_RANGE) + TRANSFORM_GPU) .Eval(&out_vec, &in_vec); std::vector res = out_vec.HostVector(); diff --git a/tests/cpp/common/test_transform_range.cu b/tests/cpp/common/test_transform_range.cu index 29172937f..4e4c259e4 100644 --- a/tests/cpp/common/test_transform_range.cu +++ b/tests/cpp/common/test_transform_range.cu @@ -5,87 +5,13 @@ namespace xgboost { namespace common { -// Test here is multi gpu specific -TEST(Transform, MGPU_Basic) { - auto devices = GPUSet::AllVisible(); - CHECK_GT(devices.Size(), 1); - const size_t size {256}; - std::vector h_in(size); - 
std::vector h_out(size); - InitializeRange(h_in.begin(), h_in.end()); - std::vector h_sol(size); - InitializeRange(h_sol.begin(), h_sol.end()); - - const HostDeviceVector in_vec {h_in, - GPUDistribution::Block(GPUSet::Empty())}; - HostDeviceVector out_vec {h_out, - GPUDistribution::Block(GPUSet::Empty())}; - out_vec.Fill(0); - - in_vec.Shard(GPUDistribution::Granular(devices, 8)); - out_vec.Shard(GPUDistribution::Block(devices)); - - // Granularity is different, sharding will throw. - EXPECT_ANY_THROW( - Transform<>::Init(TestTransformRange{}, Range{0, size}, devices) - .Eval(&out_vec, &in_vec)); - - - Transform<>::Init(TestTransformRange{}, Range{0, size}, - devices, false).Eval(&out_vec, &in_vec); - std::vector res = out_vec.HostVector(); - - ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin())); -} - -// Test for multi-classes setting. -template -struct TestTransformRangeGranular { - const size_t granularity = 8; - - explicit TestTransformRangeGranular(const size_t granular) : granularity{granular} {} - void XGBOOST_DEVICE operator()(size_t _idx, - Span _out, Span _in) { - auto in_sub = _in.subspan(_idx * granularity, granularity); - auto out_sub = _out.subspan(_idx * granularity, granularity); - for (size_t i = 0; i < granularity; ++i) { - out_sub[i] = in_sub[i]; - } - } -}; - -TEST(Transform, MGPU_Granularity) { - GPUSet devices = GPUSet::All(0, -1); - - const size_t size {8990}; - const size_t granularity = 10; - - GPUDistribution distribution = - GPUDistribution::Granular(devices, granularity); - - std::vector h_in(size); - std::vector h_out(size); - InitializeRange(h_in.begin(), h_in.end()); - std::vector h_sol(size); - InitializeRange(h_sol.begin(), h_sol.end()); - - const HostDeviceVector in_vec {h_in, distribution}; - HostDeviceVector out_vec {h_out, distribution}; - - ASSERT_NO_THROW( - Transform<>::Init( - TestTransformRangeGranular{granularity}, - Range{0, size / granularity}, - distribution) - .Eval(&out_vec, &in_vec)); - std::vector res = out_vec.HostVector(); - - ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin())); -} - TEST(Transform, MGPU_SpecifiedGpuId) { + if (AllVisibleGPUs() < 2) { + LOG(WARNING) << "Not testing in multi-gpu environment."; + return; + } // Use 1 GPU, Numbering of GPU starts from 1 - auto devices = GPUSet::All(1, 1); + auto device = 1; const size_t size {256}; std::vector h_in(size); std::vector h_out(size); @@ -93,13 +19,11 @@ TEST(Transform, MGPU_SpecifiedGpuId) { std::vector h_sol(size); InitializeRange(h_sol.begin(), h_sol.end()); - const HostDeviceVector in_vec {h_in, - GPUDistribution::Block(devices)}; - HostDeviceVector out_vec {h_out, - GPUDistribution::Block(devices)}; + const HostDeviceVector in_vec {h_in, device}; + HostDeviceVector out_vec {h_out, device}; ASSERT_NO_THROW( - Transform<>::Init(TestTransformRange{}, Range{0, size}, devices) + Transform<>::Init(TestTransformRange{}, Range{0, size}, device) .Eval(&out_vec, &in_vec)); std::vector res = out_vec.HostVector(); ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin())); diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index 10ba161dd..0ae2ba856 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -12,7 +12,7 @@ TEST(GBTree, SelectTreeMethod) { auto p_dmat {(*p_shared_ptr_dmat).get()}; GenericParameter generic_param; - generic_param.InitAllowUnknown(std::vector{Arg("n_gpus", "0")}); + generic_param.InitAllowUnknown(std::vector{}); std::unique_ptr p_gbm{ GradientBooster::Create("gbtree", &generic_param, 
{}, 0)}; auto& gbtree = dynamic_cast (*p_gbm); @@ -35,7 +35,7 @@ TEST(GBTree, SelectTreeMethod) { Arg{"num_feature", n_feat}}, p_dmat); ASSERT_EQ(tparam.updater_seq, "grow_quantile_histmaker"); #ifdef XGBOOST_USE_CUDA - generic_param.InitAllowUnknown(std::vector{Arg{"n_gpus", "1"}}); + generic_param.InitAllowUnknown(std::vector{Arg{"gpu_id", "0"}}); gbtree.ConfigureWithKnownData({Arg("tree_method", "gpu_hist"), Arg("num_feature", n_feat)}, p_dmat); ASSERT_EQ(tparam.updater_seq, "grow_gpu_hist"); diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 761307fd7..1571393fd 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -29,9 +29,9 @@ #endif #if defined(__CUDACC__) -#define NGPUS 1 +#define GPUIDX 0 #else -#define NGPUS 0 +#define GPUIDX -1 #endif bool FileExists(const std::string& filename); @@ -189,11 +189,10 @@ std::unique_ptr CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_c gbm::GBTreeModel CreateTestModel(); -inline GenericParameter CreateEmptyGenericParam(int gpu_id, int n_gpus) { +inline GenericParameter CreateEmptyGenericParam(int gpu_id) { xgboost::GenericParameter tparam; std::vector> args { - {"gpu_id", std::to_string(gpu_id)}, - {"n_gpus", std::to_string(n_gpus)}}; + {"gpu_id", std::to_string(gpu_id)}}; tparam.Init(args); return tparam; } diff --git a/tests/cpp/linear/test_linear.cc b/tests/cpp/linear/test_linear.cc index 504728a09..0a9c0178b 100644 --- a/tests/cpp/linear/test_linear.cc +++ b/tests/cpp/linear/test_linear.cc @@ -7,7 +7,7 @@ TEST(Linear, shotgun) { auto mat = xgboost::CreateDMatrix(10, 10, 0); - auto lparam = xgboost::CreateEmptyGenericParam(0, 0); + auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX); { auto updater = std::unique_ptr( xgboost::LinearUpdater::Create("shotgun", &lparam)); @@ -33,7 +33,7 @@ TEST(Linear, shotgun) { TEST(Linear, coordinate) { auto mat = xgboost::CreateDMatrix(10, 10, 0); - auto lparam = xgboost::CreateEmptyGenericParam(0, 0); + auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX); auto updater = std::unique_ptr( xgboost::LinearUpdater::Create("coord_descent", &lparam)); updater->Configure({{"eta", "1."}}); diff --git a/tests/cpp/linear/test_linear.cu b/tests/cpp/linear/test_linear.cu index 9fba4735e..b8f4c2722 100644 --- a/tests/cpp/linear/test_linear.cu +++ b/tests/cpp/linear/test_linear.cu @@ -7,8 +7,7 @@ namespace xgboost { TEST(Linear, GPUCoordinate) { auto mat = xgboost::CreateDMatrix(10, 10, 0); - auto lparam = CreateEmptyGenericParam(0, 1); - lparam.n_gpus = 1; + auto lparam = CreateEmptyGenericParam(GPUIDX); auto updater = std::unique_ptr( xgboost::LinearUpdater::Create("gpu_coord_descent", &lparam)); updater->Configure({{"eta", "1."}}); diff --git a/tests/cpp/metric/test_elementwise_metric.cc b/tests/cpp/metric/test_elementwise_metric.cc index c38b81a7e..fe411f2c6 100644 --- a/tests/cpp/metric/test_elementwise_metric.cc +++ b/tests/cpp/metric/test_elementwise_metric.cc @@ -6,7 +6,7 @@ #include "../helpers.h" TEST(Metric, DeclareUnifiedTest(RMSE)) { - auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); + auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX); xgboost::Metric * metric = xgboost::Metric::Create("rmse", &lparam); metric->Configure({}); ASSERT_STREQ(metric->Name(), "rmse"); @@ -20,7 +20,7 @@ TEST(Metric, DeclareUnifiedTest(RMSE)) { } TEST(Metric, DeclareUnifiedTest(RMSLE)) { - auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); + auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX); xgboost::Metric * metric = xgboost::Metric::Create("rmsle", &lparam); 
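// Note on the helpers.h change above: the NGPUS device-count macro becomes a
// GPUIDX device ordinal, expanding to 0 when compiled with CUDA and to -1
// otherwise, so the same test body runs on a single GPU or on the CPU. Every
// CreateEmptyGenericParam(GPUIDX) call in the tests below follows that pattern.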
diff --git a/tests/cpp/metric/test_elementwise_metric.cc b/tests/cpp/metric/test_elementwise_metric.cc
index c38b81a7e..fe411f2c6 100644
--- a/tests/cpp/metric/test_elementwise_metric.cc
+++ b/tests/cpp/metric/test_elementwise_metric.cc
@@ -6,7 +6,7 @@
 #include "../helpers.h"
 
 TEST(Metric, DeclareUnifiedTest(RMSE)) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("rmse", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "rmse");
@@ -20,7 +20,7 @@ TEST(Metric, DeclareUnifiedTest(RMSE)) {
 }
 
 TEST(Metric, DeclareUnifiedTest(RMSLE)) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("rmsle", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "rmsle");
@@ -32,7 +32,7 @@ TEST(Metric, DeclareUnifiedTest(RMSLE)) {
 }
 
 TEST(Metric, DeclareUnifiedTest(MAE)) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("mae", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "mae");
@@ -45,7 +45,7 @@ TEST(Metric, DeclareUnifiedTest(MAE)) {
 }
 
 TEST(Metric, DeclareUnifiedTest(LogLoss)) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("logloss", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "logloss");
@@ -58,7 +58,7 @@ TEST(Metric, DeclareUnifiedTest(LogLoss)) {
 }
 
 TEST(Metric, DeclareUnifiedTest(Error)) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("error", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "error");
@@ -90,7 +90,7 @@ TEST(Metric, DeclareUnifiedTest(Error)) {
 }
 
 TEST(Metric, DeclareUnifiedTest(PoissionNegLogLik)) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("poisson-nloglik", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "poisson-nloglik");
diff --git a/tests/cpp/metric/test_metric.cc b/tests/cpp/metric/test_metric.cc
index f4cf682a3..fdb620928 100644
--- a/tests/cpp/metric/test_metric.cc
+++ b/tests/cpp/metric/test_metric.cc
@@ -4,7 +4,7 @@
 #include "../helpers.h"
 
 TEST(Metric, UnknownMetric) {
-  auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
+  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = nullptr;
   EXPECT_ANY_THROW(metric = xgboost::Metric::Create("unknown_name", &tparam));
   EXPECT_NO_THROW(metric = xgboost::Metric::Create("rmse", &tparam));
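Reviewer note: with n_gpus gone, a metric's device placement is carried entirely by GenericParameter::gpu_id. A sketch of the idea, under the assumption (from the multiclass tests below) that gpu_id may also be assigned after construction and that a GPU build with at least one device is present:

    auto lparam = xgboost::CreateEmptyGenericParam(-1);  // start on the CPU
    xgboost::Metric * metric = xgboost::Metric::Create("rmse", &lparam);
    metric->Configure({});
    lparam.gpu_id = 0;  // steer subsequent evaluations to device 0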
diff --git a/tests/cpp/metric/test_multiclass_metric.cc b/tests/cpp/metric/test_multiclass_metric.cc
index f8313082f..95f2c417e 100644
--- a/tests/cpp/metric/test_multiclass_metric.cc
+++ b/tests/cpp/metric/test_multiclass_metric.cc
@@ -4,10 +4,9 @@
 
 #include "../helpers.h"
 
-inline void TestMultiClassError(xgboost::GPUSet const& devices) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
-  lparam.gpu_id = *devices.begin();
-  lparam.n_gpus = devices.Size();
+inline void TestMultiClassError(int device) {
+  auto lparam = xgboost::CreateEmptyGenericParam(device);
+  lparam.gpu_id = device;
   xgboost::Metric * metric = xgboost::Metric::Create("merror", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "merror");
@@ -23,14 +22,12 @@ inline void TestMultiClassError(xgboost::GPUSet const& devices) {
 }
 
 TEST(Metric, DeclareUnifiedTest(MultiClassError)) {
-  auto devices = xgboost::GPUSet::Range(0, NGPUS);
-  TestMultiClassError(devices);
+  TestMultiClassError(GPUIDX);
 }
 
-inline void TestMultiClassLogLoss(xgboost::GPUSet const& devices) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
-  lparam.gpu_id = *devices.begin();
-  lparam.n_gpus = devices.Size();
+inline void TestMultiClassLogLoss(int device) {
+  auto lparam = xgboost::CreateEmptyGenericParam(device);
+  lparam.gpu_id = device;
   xgboost::Metric * metric = xgboost::Metric::Create("mlogloss", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "mlogloss");
@@ -46,27 +43,31 @@ inline void TestMultiClassLogLoss(xgboost::GPUSet const& devices) {
 }
 
 TEST(Metric, DeclareUnifiedTest(MultiClassLogLoss)) {
-  auto devices = xgboost::GPUSet::Range(0, NGPUS);
-  TestMultiClassLogLoss(devices);
+  TestMultiClassLogLoss(GPUIDX);
 }
 
 #if defined(XGBOOST_USE_NCCL) && defined(__CUDACC__)
+namespace xgboost {
+namespace common {
 TEST(Metric, MGPU_MultiClassError) {
+  if (AllVisibleGPUs() < 2) {
+    LOG(WARNING) << "Not testing in multi-gpu environment.";
+    return;
+  }
+
   {
-    auto devices = xgboost::GPUSet::All(0, -1);
-    TestMultiClassError(devices);
+    TestMultiClassError(0);
   }
   {
-    auto devices = xgboost::GPUSet::All(1, -1);
-    TestMultiClassError(devices);
+    TestMultiClassError(1);
   }
   {
-    auto devices = xgboost::GPUSet::All(0, -1);
-    TestMultiClassLogLoss(devices);
+    TestMultiClassLogLoss(0);
  }
   {
-    auto devices = xgboost::GPUSet::All(1, -1);
-    TestMultiClassLogLoss(devices);
+    TestMultiClassLogLoss(1);
   }
 }
+}  // namespace common
+}  // namespace xgboost
 #endif  // defined(XGBOOST_USE_NCCL)
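Reviewer note: multi-GPU tests no longer fail on single-GPU machines; they skip themselves. The guard above is the shared idiom, built on the renamed AllVisibleGPUs() from src/common/common.cu:

    if (xgboost::common::AllVisibleGPUs() < 2) {
      LOG(WARNING) << "Not testing in multi-gpu environment.";
      return;  // skip instead of asserting on missing hardware
    }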
diff --git a/tests/cpp/metric/test_rank_metric.cc b/tests/cpp/metric/test_rank_metric.cc
index ef1d1377f..792deb5a6 100644
--- a/tests/cpp/metric/test_rank_metric.cc
+++ b/tests/cpp/metric/test_rank_metric.cc
@@ -4,7 +4,7 @@
 #include "../helpers.h"
 
 TEST(Metric, AMS) {
-  auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
+  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   EXPECT_ANY_THROW(xgboost::Metric::Create("ams", &tparam));
   xgboost::Metric * metric = xgboost::Metric::Create("ams@0.5f", &tparam);
   ASSERT_STREQ(metric->Name(), "ams@0.5");
@@ -23,7 +23,7 @@ TEST(Metric, AMS) {
 }
 
 TEST(Metric, AUC) {
-  auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
+  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("auc", &tparam);
   ASSERT_STREQ(metric->Name(), "auc");
   EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
@@ -38,7 +38,7 @@ TEST(Metric, AUC) {
 }
 
 TEST(Metric, AUCPR) {
-  auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
+  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric *metric = xgboost::Metric::Create("aucpr", &tparam);
   ASSERT_STREQ(metric->Name(), "aucpr");
   EXPECT_NEAR(GetMetricEval(metric, {0, 0, 1, 1}, {0, 0, 1, 1}), 1, 1e-10);
@@ -65,7 +65,7 @@ TEST(Metric, Precision) {
   // When the limit for precision is not given, it takes the limit at
   // std::numeric_limits<unsigned>::max(); hence all values are very small
   // NOTE(AbdealiJK): Maybe this should be fixed to be num_row by default.
-  auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
+  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("pre", &tparam);
   ASSERT_STREQ(metric->Name(), "pre");
   EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-7);
@@ -89,7 +89,7 @@ TEST(Metric, Precision) {
 }
 
 TEST(Metric, NDCG) {
-  auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
+  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("ndcg", &tparam);
   ASSERT_STREQ(metric->Name(), "ndcg");
   EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {}));
@@ -147,7 +147,7 @@ TEST(Metric, NDCG) {
 }
 
 TEST(Metric, MAP) {
-  auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
+  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("map", &tparam);
   ASSERT_STREQ(metric->Name(), "map");
   EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
diff --git a/tests/cpp/objective/test_hinge.cc b/tests/cpp/objective/test_hinge.cc
index f6485cbe6..fb9b24daa 100644
--- a/tests/cpp/objective/test_hinge.cc
+++ b/tests/cpp/objective/test_hinge.cc
@@ -6,7 +6,7 @@
 #include "../helpers.h"
 
 TEST(Objective, DeclareUnifiedTest(HingeObj)) {
-  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("binary:hinge", &tparam);
 
   xgboost::bst_float eps = std::numeric_limits<xgboost::bst_float>::min();
diff --git a/tests/cpp/objective/test_multiclass_obj.cc b/tests/cpp/objective/test_multiclass_obj.cc
index d6157a672..77d38f09a 100644
--- a/tests/cpp/objective/test_multiclass_obj.cc
+++ b/tests/cpp/objective/test_multiclass_obj.cc
@@ -7,7 +7,7 @@
 #include "../helpers.h"
 
 TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassObjGPair)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args {{"num_class", "3"}};
   xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("multi:softmax", &lparam);
@@ -25,7 +25,7 @@ TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassObjGPair)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassBasic)) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args{
       std::pair<std::string, std::string>("num_class", "3")};
@@ -47,7 +47,7 @@ TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassBasic)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(SoftprobMultiClassBasic)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args {
       std::pair<std::string, std::string>("num_class", "3")};
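Reviewer note: every objective test below reduces to the same two-line setup, with GPUIDX deciding the device at compile time. A sketch, using "reg:squarederror" (which the CPU_vs_CUDA test below also exercises):

    xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
    xgboost::ObjFunction * obj =
        xgboost::ObjFunction::Create("reg:squarederror", &lparam);
    // ... exercise obj ...
    delete obj;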
diff --git a/tests/cpp/objective/test_regression_obj.cc b/tests/cpp/objective/test_regression_obj.cc
index 7a092dff0..bcf2bd497 100644
--- a/tests/cpp/objective/test_regression_obj.cc
+++ b/tests/cpp/objective/test_regression_obj.cc
@@ -7,7 +7,7 @@
 #include "../helpers.h"
 
 TEST(Objective, DeclareUnifiedTest(LinearRegressionGPair)) {
-  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
 
   xgboost::ObjFunction * obj =
@@ -32,7 +32,7 @@ TEST(Objective, DeclareUnifiedTest(LinearRegressionGPair)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(SquaredLog)) {
-  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
 
   xgboost::ObjFunction * obj =
@@ -56,7 +56,7 @@ TEST(Objective, DeclareUnifiedTest(SquaredLog)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(LogisticRegressionGPair)) {
-  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:logistic", &tparam);
@@ -72,7 +72,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRegressionGPair)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(LogisticRegressionBasic)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("reg:logistic", &lparam);
@@ -102,7 +102,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRegressionBasic)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(LogisticRawGPair)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("binary:logitraw", &lparam);
@@ -118,7 +118,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRawGPair)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("count:poisson", &lparam);
@@ -140,7 +140,7 @@ TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(PoissonRegressionBasic)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("count:poisson", &lparam);
@@ -168,7 +168,7 @@ TEST(Objective, DeclareUnifiedTest(PoissonRegressionBasic)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(GammaRegressionGPair)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("reg:gamma", &lparam);
@@ -189,7 +189,7 @@ TEST(Objective, DeclareUnifiedTest(GammaRegressionGPair)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(GammaRegressionBasic)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("reg:gamma", &lparam);
@@ -217,7 +217,7 @@ TEST(Objective, DeclareUnifiedTest(GammaRegressionBasic)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("reg:tweedie", &lparam);
@@ -241,7 +241,7 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) {
 
 #if defined(__CUDACC__)
 TEST(Objective, CPU_vs_CUDA) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, 1);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("reg:squarederror", &lparam);
@@ -267,12 +267,12 @@ TEST(Objective, CPU_vs_CUDA) {
 
   {
     // CPU
-    lparam.n_gpus = 0;
+    lparam.gpu_id = -1;
     obj->GetGradient(preds, info, 0, &cpu_out_preds);
   }
   {
     // CUDA
-    lparam.n_gpus = 1;
+    lparam.gpu_id = 0;
     obj->GetGradient(preds, info, 0, &cuda_out_preds);
   }
@@ -294,7 +294,7 @@
 #endif
 
 TEST(Objective, DeclareUnifiedTest(TweedieRegressionBasic)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("reg:tweedie", &lparam);
@@ -325,7 +325,7 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionBasic)) {
 
 // CoxRegression not implemented in GPU code, no need for testing.
 #if !defined(__CUDACC__)
 TEST(Objective, CoxRegressionGPair) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, 0);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("survival:cox", &lparam);
diff --git a/tests/cpp/plugin/test_example_objective.cc b/tests/cpp/plugin/test_example_objective.cc
index 816f861f4..4bcb38979 100644
--- a/tests/cpp/plugin/test_example_objective.cc
+++ b/tests/cpp/plugin/test_example_objective.cc
@@ -6,7 +6,7 @@ namespace xgboost {
 
 TEST(Plugin, ExampleObjective) {
-  xgboost::GenericParameter tparam = CreateEmptyGenericParam(0, 0);
+  xgboost::GenericParameter tparam = CreateEmptyGenericParam(GPUIDX);
   auto * obj = xgboost::ObjFunction::Create("mylogistic", &tparam);
   ASSERT_EQ(obj->DefaultEvalMetric(), std::string{"error"});
   delete obj;
diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc
index 43bb1181b..32d323af3 100644
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -6,7 +6,7 @@ namespace xgboost {
 
 TEST(cpu_predictor, Test) {
-  auto lparam = CreateEmptyGenericParam(0, 0);
+  auto lparam = CreateEmptyGenericParam(GPUIDX);
   std::unique_ptr<Predictor> cpu_predictor =
       std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
@@ -59,7 +59,7 @@ TEST(cpu_predictor, ExternalMemoryTest) {
   dmlc::TemporaryDirectory tmpdir;
   std::string filename = tmpdir.path + "/big.libsvm";
   std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(12, 64, filename);
-  auto lparam = CreateEmptyGenericParam(0, 0);
+  auto lparam = CreateEmptyGenericParam(GPUIDX);
   std::unique_ptr<Predictor> cpu_predictor =
       std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
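Reviewer note: the CPU_vs_CUDA hunk above shows the whole migration in miniature: where the old code toggled n_gpus between 0 and 1, the new code toggles gpu_id between -1 and 0 on the same objective. Sketch, assuming obj, preds, info and the two output vectors are set up as in that test:

    lparam.gpu_id = -1;  // gradients computed on the host
    obj->GetGradient(preds, info, 0, &cpu_out_preds);

    lparam.gpu_id = 0;   // identical call, now on CUDA device 0
    obj->GetGradient(preds, info, 0, &cuda_out_preds);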
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index 5cf5817de..9a50e76f9 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -33,8 +33,8 @@ namespace xgboost {
 namespace predictor {
 
 TEST(gpu_predictor, Test) {
-  auto cpu_lparam = CreateEmptyGenericParam(0, 0);
-  auto gpu_lparam = CreateEmptyGenericParam(0, 1);
+  auto cpu_lparam = CreateEmptyGenericParam(-1);
+  auto gpu_lparam = CreateEmptyGenericParam(0);
 
   std::unique_ptr<Predictor> gpu_predictor =
       std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &gpu_lparam));
@@ -69,7 +69,7 @@ TEST(gpu_predictor, Test) {
 }
 
 TEST(gpu_predictor, ExternalMemoryTest) {
-  auto lparam = CreateEmptyGenericParam(0, 1);
+  auto lparam = CreateEmptyGenericParam(0);
   std::unique_ptr<Predictor> gpu_predictor =
       std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &lparam));
   gpu_predictor->Configure({}, {});
@@ -83,26 +83,26 @@ TEST(gpu_predictor, ExternalMemoryTest) {
   std::string file1 = tmpdir.path + "/big_1.libsvm";
   std::string file2 = tmpdir.path + "/big_2.libsvm";
   dmats.push_back(CreateSparsePageDMatrix(9, 64UL, file0));
-  dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1));
-  dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2));
+//  dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1));
+//  dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2));
 
   for (const auto& dmat: dmats) {
-    // Test predict batch
+    dmat->Info().base_margin_.Resize(dmat->Info().num_row_ * n_classes, 0.5);
     HostDeviceVector<bst_float> out_predictions;
     gpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
     EXPECT_EQ(out_predictions.Size(), dmat->Info().num_row_ * n_classes);
     const std::vector<bst_float> &host_vector = out_predictions.ConstHostVector();
     for (int i = 0; i < host_vector.size() / n_classes; i++) {
-      ASSERT_EQ(host_vector[i * n_classes], 1.5);
-      ASSERT_EQ(host_vector[i * n_classes + 1], 0.);
-      ASSERT_EQ(host_vector[i * n_classes + 2], 0.);
+      ASSERT_EQ(host_vector[i * n_classes], 2.0);
+      ASSERT_EQ(host_vector[i * n_classes + 1], 0.5);
+      ASSERT_EQ(host_vector[i * n_classes + 2], 0.5);
     }
   }
 }
 
 // Test whether pickling preserves predictor parameters
 TEST(gpu_predictor, PicklingTest) {
-  int const ngpu = 1;
+  int const gpuid = 0;
 
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
@@ -134,7 +134,7 @@ TEST(gpu_predictor, PicklingTest) {
   ASSERT_EQ(XGBoosterSetParam(
       bst, "tree_method", "gpu_hist"), 0) << XGBGetLastError();
   ASSERT_EQ(XGBoosterSetParam(
-      bst, "n_gpus", std::to_string(ngpu).c_str()), 0) << XGBGetLastError();
+      bst, "gpu_id", std::to_string(gpuid).c_str()), 0) << XGBGetLastError();
   ASSERT_EQ(XGBoosterSetParam(bst, "predictor", "gpu_predictor"), 0)
       << XGBGetLastError();
 
   // Run boosting iterations
@@ -160,7 +160,7 @@ TEST(gpu_predictor, PicklingTest) {
   {  // Query predictor
     const auto& kwargs = QueryBoosterConfigurationArguments(bst2);
     ASSERT_EQ(kwargs.at("predictor"), "gpu_predictor");
-    ASSERT_EQ(kwargs.at("n_gpus"), std::to_string(ngpu).c_str());
+    ASSERT_EQ(kwargs.at("gpu_id"), std::to_string(gpuid).c_str());
   }
 
   {  // Change predictor and query again
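Reviewer note: the pickling test above doubles as a check that gpu_id survives a C-API round trip. Reduced to its core (bst, bst2 and QueryBoosterConfigurationArguments come from the test fixture):

    ASSERT_EQ(XGBoosterSetParam(bst, "gpu_id", "0"), 0) << XGBGetLastError();
    // ... train, serialize, and deserialize into bst2 ...
    const auto& kwargs = QueryBoosterConfigurationArguments(bst2);
    ASSERT_EQ(kwargs.at("gpu_id"), "0");  // the ordinal round-trips intact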
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 9261bb96f..318d09628 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -168,10 +168,9 @@ TEST(Learner, IO) {
   std::unique_ptr<Learner> learner {Learner::Create(mat)};
   learner->SetParams({Arg{"tree_method", "auto"},
                       Arg{"predictor", "gpu_predictor"},
-                      Arg{"n_gpus", "1"}});
+                      Arg{"gpu_id", "0"}});
   learner->UpdateOneIter(0, p_dmat.get());
   ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
-  ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
 
   dmlc::TemporaryDirectory tempdir;
   const std::string fname = tempdir.path + "/model.bst";
@@ -185,7 +184,6 @@ TEST(Learner, IO) {
   std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r"));
   learner->Load(fi.get());
   ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
-  ASSERT_EQ(learner->GetGenericParameter().n_gpus, 0);
 
   delete pp_dmat;
 }
@@ -208,31 +206,27 @@ TEST(Learner, GPUConfiguration) {
                         Arg{"updater", "gpu_coord_descent"}});
     learner->UpdateOneIter(0, p_dmat.get());
     ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
-    ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
   }
   {
     std::unique_ptr<Learner> learner {Learner::Create(mat)};
     learner->SetParams({Arg{"tree_method", "gpu_hist"}});
     learner->UpdateOneIter(0, p_dmat.get());
     ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
-    ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
   }
   {
     // with CPU algorithm
     std::unique_ptr<Learner> learner {Learner::Create(mat)};
     learner->SetParams({Arg{"tree_method", "hist"}});
     learner->UpdateOneIter(0, p_dmat.get());
-    ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
-    ASSERT_EQ(learner->GetGenericParameter().n_gpus, 0);
+    ASSERT_EQ(learner->GetGenericParameter().gpu_id, -1);
   }
   {
-    // with CPU algorithm, but `n_gpus` takes priority
+    // with CPU algorithm, but `gpu_id` takes priority
     std::unique_ptr<Learner> learner {Learner::Create(mat)};
     learner->SetParams({Arg{"tree_method", "hist"},
-                        Arg{"n_gpus", "1"}});
+                        Arg{"gpu_id", "0"}});
     learner->UpdateOneIter(0, p_dmat.get());
     ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
-    ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
   }
   {
     // With CPU algorithm but GPU Predictor, this is to simulate when
@@ -243,7 +237,6 @@ TEST(Learner, GPUConfiguration) {
                         Arg{"predictor", "gpu_predictor"}});
     learner->UpdateOneIter(0, p_dmat.get());
     ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
-    ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
   }
 
   delete pp_dmat;
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 869859fee..df811c413 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -366,7 +366,7 @@ TEST(GpuHist, EvaluateSplits) {
   ASSERT_NEAR(res[1].fvalue, 0.26, xgboost::kRtEps);
 }
 
-void TestHistogramIndexImpl(int n_gpus) {
+void TestHistogramIndexImpl() {
   // Test if the compressed histogram index matches when using a sparse
   // dmatrix with and without using external memory
 
@@ -384,7 +384,7 @@ void TestHistogramIndexImpl(int n_gpus) {
     {"max_leaves", "0"}
   };
 
-  GenericParameter generic_param(CreateEmptyGenericParam(0, n_gpus));
+  GenericParameter generic_param(CreateEmptyGenericParam(0));
   hist_maker.Configure(training_params, &generic_param);
 
   hist_maker.InitDataOnce(hist_maker_dmat.get());
@@ -412,7 +412,7 @@
 }
 
 TEST(GpuHist, TestHistogramIndex) {
-  TestHistogramIndexImpl(1);
+  TestHistogramIndexImpl();
 }
diff --git a/tests/cpp/tree/test_prune.cc b/tests/cpp/tree/test_prune.cc
index 8ca8af479..ba1d15d29 100644
--- a/tests/cpp/tree/test_prune.cc
+++ b/tests/cpp/tree/test_prune.cc
@@ -29,7 +29,7 @@ TEST(Updater, Prune) {
     {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f}
   };
   auto dmat = CreateDMatrix(32, 16, 0.4, 3);
-  auto lparam = CreateEmptyGenericParam(0, 0);
+  auto lparam = CreateEmptyGenericParam(GPUIDX);
 
   // prepare tree
   RegTree tree = RegTree();
diff --git a/tests/cpp/tree/test_refresh.cc b/tests/cpp/tree/test_refresh.cc
index 0bc1190bf..7d1f0d9e5 100644
--- a/tests/cpp/tree/test_refresh.cc
+++ b/tests/cpp/tree/test_refresh.cc
@@ -25,7 +25,7 @@ TEST(Updater, Refresh) {
     {"reg_lambda", "1"}};
 
   RegTree tree = RegTree();
-  auto lparam = CreateEmptyGenericParam(0, 0);
+  auto lparam = CreateEmptyGenericParam(GPUIDX);
   tree.param.InitAllowUnknown(cfg);
   std::vector<RegTree*> trees {&tree};
   std::unique_ptr<TreeUpdater> refresher(TreeUpdater::Create("refresh", &lparam));
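Reviewer note: taken together, the Learner assertions above pin down the new configuration contract: GPU algorithms default gpu_id to 0 when it is left unspecified, pure CPU configurations leave it at -1, and an explicit gpu_id overrides the CPU default. In sketch form (Arg, mat and p_dmat as in the test):

    std::unique_ptr<Learner> learner {Learner::Create(mat)};
    learner->SetParams({Arg{"tree_method", "hist"},   // CPU algorithm...
                        Arg{"gpu_id", "0"}});         // ...but device 0 requested
    learner->UpdateOneIter(0, p_dmat.get());
    ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);  // the explicit id wins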
diff --git a/tests/distributed/distributed_gpu.py b/tests/distributed/distributed_gpu.py
index 6665cd142..4c47f54a3 100644
--- a/tests/distributed/distributed_gpu.py
+++ b/tests/distributed/distributed_gpu.py
@@ -61,7 +61,6 @@ base_params = {
 
 def params_basic_1x4(rank):
     return dict(base_params, **{
-        'n_gpus': 1,
         'gpu_id': rank,
     }), 20
 
diff --git a/tests/python-gpu/test_gpu_linear.py b/tests/python-gpu/test_gpu_linear.py
index 79e5919b6..a0726f079 100644
--- a/tests/python-gpu/test_gpu_linear.py
+++ b/tests/python-gpu/test_gpu_linear.py
@@ -23,7 +23,7 @@ class TestGPULinear(unittest.TestCase):
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_gpu_coordinate(self):
         parameters = self.common_param.copy()
-        parameters['n_gpus'] = [1]
+        parameters['gpu_id'] = [0]
         for param in test_linear.parameter_combinations(parameters):
             results = test_linear.run_suite(
                 param, 150, self.datasets, scale_features=True)
diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py
index dbc7b1020..fc21293ea 100644
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -21,7 +21,7 @@ datasets = ["Boston", "Cancer", "Digits", "Sparse regression",
 
 class TestGPU(unittest.TestCase):
     def test_gpu_hist(self):
-        test_param = parameter_combinations({'n_gpus': [1], 'max_depth': [2, 8],
+        test_param = parameter_combinations({'gpu_id': [0], 'max_depth': [2, 8],
                                              'max_leaves': [255, 4],
                                              'max_bin': [2, 256],
                                              'grow_policy': ['lossguide']})
@@ -38,8 +38,7 @@ class TestGPU(unittest.TestCase):
 
     @pytest.mark.mgpu
     def test_specified_gpu_id_gpu_update(self):
-        variable_param = {'n_gpus': [1],
-                          'gpu_id': [1],
+        variable_param = {'gpu_id': [1],
                           'max_depth': [8],
                           'max_leaves': [255, 4],
                           'max_bin': [2, 64],
diff --git a/tests/python-gpu/test_large_sizes.py b/tests/python-gpu/test_large_sizes.py
index 421964285..739b21d6a 100644
--- a/tests/python-gpu/test_large_sizes.py
+++ b/tests/python-gpu/test_large_sizes.py
@@ -63,7 +63,7 @@ class TestGPU(unittest.TestCase):
                  'nthread': 0,
                  'eta': 1,
                  'verbosity': 3,
-                 'n_gpus': 1,
+                 'gpu_id': 0,
                  'objective': 'binary:logistic',
                  'max_bin': max_bin,
                  'eval_metric': 'auc'}
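Reviewer note: for Python callers the migration is mechanical: drop 'n_gpus' from the parameter dict and keep (or add) 'gpu_id'. A before/after sketch; anything that previously spanned several GPUs in one process now runs as one such process per device, e.g. via Dask or Spark:

    # before: {'tree_method': 'gpu_hist', 'n_gpus': 1, 'gpu_id': 0, ...}
    params = {
        'tree_method': 'gpu_hist',
        'gpu_id': 0,  # the only device knob left
        'objective': 'binary:logistic',
    }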