Make HostDeviceVector single gpu only (#4773)

* Make HostDeviceVector single gpu only
Rong Ou authored on 2019-08-25 14:51:13 -07:00, committed by Rory Mitchell
parent 41227d1933
commit 38ab79f889
54 changed files with 641 additions and 1621 deletions

View File

@ -36,13 +36,12 @@ int main(int argc, char** argv) {
// https://xgboost.readthedocs.io/en/latest/parameter.html
safe_xgboost(XGBoosterSetParam(booster, "tree_method", use_gpu ? "gpu_hist" : "hist"));
if (use_gpu) {
// set the number of GPUs and the first GPU to use;
// set the GPU to use;
// this is not necessary, but provided here as an illustration
safe_xgboost(XGBoosterSetParam(booster, "n_gpus", "1"));
safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "0"));
} else {
// avoid evaluating objective and metric on a GPU
safe_xgboost(XGBoosterSetParam(booster, "n_gpus", "0"));
safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "-1"));
}
safe_xgboost(XGBoosterSetParam(booster, "objective", "binary:logistic"));

View File

@ -19,10 +19,8 @@ struct GenericParameter : public dmlc::Parameter<GenericParameter> {
// number of threads to use if OpenMP is enabled
// if equals 0, use system default
int nthread;
// primary device.
// primary device, -1 means no gpu.
int gpu_id;
// number of devices to use, -1 implies using all available devices.
int n_gpus;
// declare parameters
DMLC_DECLARE_PARAMETER(GenericParameter) {
DMLC_DECLARE_FIELD(seed).set_default(0).describe(
@ -36,15 +34,20 @@ struct GenericParameter : public dmlc::Parameter<GenericParameter> {
DMLC_DECLARE_FIELD(nthread).set_default(0).describe(
"Number of threads to use.");
DMLC_DECLARE_FIELD(gpu_id)
.set_default(0)
.set_default(-1)
.set_lower_bound(-1)
.describe("The primary GPU device ordinal.");
DMLC_DECLARE_FIELD(n_gpus)
.set_default(0)
.set_range(0, 1)
.set_range(0, 0)
.describe("Deprecated. Single process multi-GPU training is no longer supported. "
"Please switch to distributed training with one process per GPU. "
"This can be done using Dask or Spark.");
}
private:
// number of devices to use (deprecated).
int n_gpus;
};
} // namespace xgboost
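
With n_gpus deprecated, device selection collapses to the single gpu_id field. A minimal usage sketch, assuming the xgboost::Args alias (a vector of string pairs, as used elsewhere in this commit) and dmlc's InitAllowUnknown entry point; the call site itself is illustrative:

    xgboost::GenericParameter param;
    // gpu_id = 0 selects the first visible GPU; the new default of -1 keeps everything on the CPU.
    param.InitAllowUnknown(xgboost::Args{{"gpu_id", "0"}});
    // Any non-zero n_gpus is now rejected with the deprecation message shown above.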

View File

@ -60,8 +60,8 @@ class MyLogistic : public ObjFunction {
void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
// transform margin value to probability.
std::vector<bst_float> &preds = io_preds->HostVector();
for (size_t i = 0; i < preds.size(); ++i) {
preds[i] = 1.0f / (1.0f + std::exp(-preds[i]));
for (auto& pred : preds) {
pred = 1.0f / (1.0f + std::exp(-pred));
}
}
bst_float ProbToMargin(bst_float base_score) const override {

View File

@ -22,48 +22,12 @@ using RandomThreadLocalStore = dmlc::ThreadLocalStore<RandomThreadLocalEntry>;
GlobalRandomEngine& GlobalRandom() {
return RandomThreadLocalStore::Get()->engine;
}
} // namespace common
#if !defined(XGBOOST_USE_CUDA)
int AllVisibleImpl::AllVisible() {
int AllVisibleGPUs() {
return 0;
}
#endif // !defined(XGBOOST_USE_CUDA)
constexpr GPUSet::GpuIdType GPUSet::kAll;
GPUSet GPUSet::All(GpuIdType gpu_id, GpuIdType n_gpus, int32_t n_rows) {
CHECK_GE(gpu_id, 0) << "gpu_id must be >= 0.";
CHECK_GE(n_gpus, -1) << "n_gpus must be >= -1.";
GpuIdType const n_devices_visible = AllVisible().Size();
CHECK_LE(n_gpus, n_devices_visible);
if (n_devices_visible == 0 || n_gpus == 0 || n_rows == 0) {
LOG(DEBUG) << "Runing on CPU.";
return Empty();
}
GpuIdType const n_available_devices = n_devices_visible - gpu_id;
if (n_gpus == kAll) { // Use all devices starting from `gpu_id'.
CHECK(gpu_id < n_devices_visible)
<< "\ngpu_id should be less than number of visible devices.\ngpu_id: "
<< gpu_id
<< ", number of visible devices: "
<< n_devices_visible;
GpuIdType n_devices =
n_available_devices < n_rows ? n_available_devices : n_rows;
LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
return Range(gpu_id, n_devices);
} else { // Use devices in ( gpu_id, gpu_id + n_gpus ).
CHECK_LE(n_gpus, n_available_devices)
<< "Starting from gpu id: " << gpu_id << ", there are only "
<< n_available_devices << " available devices, while n_gpus is set to: "
<< n_gpus;
GpuIdType n_devices = n_gpus < n_rows ? n_gpus : n_rows;
LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
return Range(gpu_id, n_devices);
}
}
} // namespace common
} // namespace xgboost
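
The GPUSet machinery removed above is replaced by a plain device-count query. A small illustrative helper (CanUseGpu is hypothetical, not part of the library); AllVisibleGPUs() returns 0 in CPU-only builds, as the stub above shows:

    #include "common.h"  // declares xgboost::common::AllVisibleGPUs()

    // Hypothetical helper: decide whether a requested ordinal is actually usable.
    bool CanUseGpu(int gpu_id) {
      return gpu_id >= 0 && gpu_id < xgboost::common::AllVisibleGPUs();
    }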

View File

@ -4,8 +4,9 @@
#include "common.h"
namespace xgboost {
namespace common {
int AllVisibleImpl::AllVisible() {
int AllVisibleGPUs() {
int n_visgpus = 0;
try {
// When compiled with CUDA but running on CPU only device,
@ -17,4 +18,5 @@ int AllVisibleImpl::AllVisible() {
return n_visgpus;
}
} // namespace common
} // namespace xgboost

View File

@ -140,88 +140,8 @@ class Range {
Iterator begin_;
Iterator end_;
};
int AllVisibleGPUs();
} // namespace common
struct AllVisibleImpl {
static int AllVisible();
};
/* \brief set of devices across which HostDeviceVector can be distributed.
*
* Currently implemented as a range, but can be changed later to something else,
* e.g. a bitset
*/
class GPUSet {
public:
using GpuIdType = int;
static constexpr GpuIdType kAll = -1;
explicit GPUSet(int start = 0, int ndevices = 0)
: devices_(start, start + ndevices) {}
static GPUSet Empty() { return GPUSet(); }
static GPUSet Range(GpuIdType start, GpuIdType n_gpus) {
return n_gpus <= 0 ? Empty() : GPUSet{start, n_gpus};
}
/*! \brief n_gpus and num_rows both are upper bounds. */
static GPUSet All(GpuIdType gpu_id, GpuIdType n_gpus,
GpuIdType num_rows = std::numeric_limits<GpuIdType>::max());
static GPUSet AllVisible() {
GpuIdType n = AllVisibleImpl::AllVisible();
return Range(0, n);
}
size_t Size() const {
GpuIdType size = *devices_.end() - *devices_.begin();
GpuIdType res = size < 0 ? 0 : size;
return static_cast<size_t>(res);
}
/*
* By default, we have two configurations of identifying device, one
* is the device id obtained from `cudaGetDevice'. But we sometimes
* store objects that allocated one for each device in a list, which
* requires a zero-based index.
*
* Hence, `DeviceId' converts a zero-based index to actual device id,
* `Index' converts a device id to a zero-based index.
*/
GpuIdType DeviceId(size_t index) const {
GpuIdType result = *devices_.begin() + static_cast<GpuIdType>(index);
CHECK(Contains(result)) << "\nDevice " << result << " is not in GPUSet."
<< "\nIndex: " << index
<< "\nGPUSet: (" << *begin() << ", " << *end() << ")"
<< std::endl;
return result;
}
size_t Index(GpuIdType device) const {
CHECK(Contains(device)) << "\nDevice " << device << " is not in GPUSet."
<< "\nGPUSet: (" << *begin() << ", " << *end() << ")"
<< std::endl;
size_t result = static_cast<size_t>(device - *devices_.begin());
return result;
}
bool IsEmpty() const { return Size() == 0; }
bool Contains(GpuIdType device) const {
return *devices_.begin() <= device && device < *devices_.end();
}
common::Range::Iterator begin() const { return devices_.begin(); } // NOLINT
common::Range::Iterator end() const { return devices_.end(); } // NOLINT
friend bool operator==(const GPUSet& lhs, const GPUSet& rhs) {
return lhs.devices_ == rhs.devices_;
}
friend bool operator!=(const GPUSet& lhs, const GPUSet& rhs) {
return !(lhs == rhs);
}
private:
common::Range devices_;
};
} // namespace xgboost
#endif // XGBOOST_COMMON_COMMON_H_
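
With GPUSet gone there is no Index()/DeviceId() bookkeeping left; callers hold a single ordinal where -1 means CPU. An illustrative sketch of the resulting control flow (RunOnConfiguredDevice and its callback are placeholders, not library code):

    #include <functional>

    // The per-device loop that GPUSet used to drive collapses to one branch on one ordinal.
    void RunOnConfiguredDevice(int gpu_id, const std::function<void(int)>& work) {
      if (gpu_id >= 0) {
        work(gpu_id);  // all per-device state is now sized for this single device
      }
    }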

View File

@ -72,22 +72,6 @@ const T *Raw(const thrust::device_vector<T> &v) { // NOLINT
return raw_pointer_cast(v.data());
}
// if n_devices=-1, then use all visible devices
inline void SynchronizeNDevices(xgboost::GPUSet devices) {
devices = devices.IsEmpty() ? xgboost::GPUSet::AllVisible() : devices;
for (auto const d : devices) {
safe_cuda(cudaSetDevice(d));
safe_cuda(cudaDeviceSynchronize());
}
}
inline void SynchronizeAll() {
for (int device_idx : xgboost::GPUSet::AllVisible()) {
safe_cuda(cudaSetDevice(device_idx));
safe_cuda(cudaDeviceSynchronize());
}
}
inline size_t AvailableMemory(int device_idx) {
size_t device_free = 0;
size_t device_total = 0;
@ -119,7 +103,7 @@ inline size_t MaxSharedMemory(int device_idx) {
}
inline void CheckComputeCapability() {
for (int d_idx : xgboost::GPUSet::AllVisible()) {
for (int d_idx = 0; d_idx < xgboost::common::AllVisibleGPUs(); ++d_idx) {
cudaDeviceProp prop;
safe_cuda(cudaGetDeviceProperties(&prop, d_idx));
std::ostringstream oss;
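
With SynchronizeNDevices/SynchronizeAll removed, a caller that still needs a barrier only has one device to wait on. A minimal sketch, not a helper this header provides:

    inline void SynchronizeDevice(int device) {
      dh::safe_cuda(cudaSetDevice(device));    // bind the calling thread to the device
      dh::safe_cuda(cudaDeviceSynchronize());  // wait for all outstanding work on it
    }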

View File

@ -35,7 +35,6 @@ __global__ void FindCutsK
if (icut >= ncuts) {
return;
}
WXQSketch::Entry v;
int isample = 0;
if (icut == 0) {
isample = 0;
@ -59,11 +58,14 @@ struct IsNotNaN {
__device__ bool operator()(float a) const { return !isnan(a); }
};
__global__ void UnpackFeaturesK
(float* __restrict__ fvalues, float* __restrict__ feature_weights,
const size_t* __restrict__ row_ptrs, const float* __restrict__ weights,
Entry* entries, size_t nrows_array, int ncols, size_t row_begin_ptr,
size_t nrows) {
__global__ void UnpackFeaturesK(float* __restrict__ fvalues,
float* __restrict__ feature_weights,
const size_t* __restrict__ row_ptrs,
const float* __restrict__ weights,
Entry* entries,
size_t nrows_array,
size_t row_begin_ptr,
size_t nrows) {
size_t irow = threadIdx.x + size_t(blockIdx.x) * blockDim.x;
if (irow >= nrows) {
return;
@ -102,8 +104,9 @@ struct SketchContainer {
const MetaInfo &info = dmat->Info();
// Initialize Sketches for this dmatrix
sketches_.resize(info.num_col_);
#pragma omp parallel for schedule(static) if (info.num_col_ > kOmpNumColsParallelizeLimit)
for (int icol = 0; icol < info.num_col_; ++icol) {
#pragma omp parallel for default(none) shared(info, param) schedule(static) \
if (info.num_col_ > kOmpNumColsParallelizeLimit) // NOLINT
for (int icol = 0; icol < info.num_col_; ++icol) { // NOLINT
sketches_[icol].Init(info.num_row_, 1.0 / (8 * param.max_bin));
}
}
@ -120,8 +123,6 @@ struct GPUSketcher {
// manage memory for a single GPU
class DeviceShard {
int device_;
bst_uint row_begin_; // The row offset for this shard
bst_uint row_end_;
bst_uint n_rows_;
int num_cols_{0};
size_t n_cuts_{0};
@ -131,27 +132,31 @@ struct GPUSketcher {
tree::TrainParam param_;
SketchContainer *sketch_container_;
dh::device_vector<size_t> row_ptrs_;
dh::device_vector<Entry> entries_;
dh::device_vector<bst_float> fvalues_;
dh::device_vector<bst_float> feature_weights_;
dh::device_vector<bst_float> fvalues_cur_;
dh::device_vector<WXQSketch::Entry> cuts_d_;
thrust::host_vector<WXQSketch::Entry> cuts_h_;
dh::device_vector<bst_float> weights_;
dh::device_vector<bst_float> weights2_;
std::vector<size_t> n_cuts_cur_;
dh::device_vector<size_t> num_elements_;
dh::device_vector<char> tmp_storage_;
dh::device_vector<size_t> row_ptrs_{};
dh::device_vector<Entry> entries_{};
dh::device_vector<bst_float> fvalues_{};
dh::device_vector<bst_float> feature_weights_{};
dh::device_vector<bst_float> fvalues_cur_{};
dh::device_vector<WXQSketch::Entry> cuts_d_{};
thrust::host_vector<WXQSketch::Entry> cuts_h_{};
dh::device_vector<bst_float> weights_{};
dh::device_vector<bst_float> weights2_{};
std::vector<size_t> n_cuts_cur_{};
dh::device_vector<size_t> num_elements_{};
dh::device_vector<char> tmp_storage_{};
public:
DeviceShard(int device, bst_uint row_begin, bst_uint row_end,
tree::TrainParam param, SketchContainer *sketch_container) :
device_(device), row_begin_(row_begin), row_end_(row_end),
n_rows_(row_end - row_begin), param_(std::move(param)), sketch_container_(sketch_container) {
DeviceShard(int device,
bst_uint n_rows,
tree::TrainParam param,
SketchContainer* sketch_container) :
device_(device),
n_rows_(n_rows),
param_(std::move(param)),
sketch_container_(sketch_container) {
}
~DeviceShard() {
~DeviceShard() { // NOLINT
dh::safe_cuda(cudaSetDevice(device_));
}
@ -319,19 +324,18 @@ struct GPUSketcher {
const auto& offset_vec = row_batch.offset.HostVector();
const auto& data_vec = row_batch.data.HostVector();
size_t n_entries = offset_vec[row_begin_ + batch_row_end] -
offset_vec[row_begin_ + batch_row_begin];
size_t n_entries = offset_vec[batch_row_end] - offset_vec[batch_row_begin];
// copy the batch to the GPU
dh::safe_cuda
(cudaMemcpyAsync(entries_.data().get(),
data_vec.data() + offset_vec[row_begin_ + batch_row_begin],
data_vec.data() + offset_vec[batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault));
// copy the weights if necessary
if (has_weights_) {
const auto& weights_vec = info.weights_.HostVector();
dh::safe_cuda
(cudaMemcpyAsync(weights_.data().get(),
weights_vec.data() + row_begin_ + batch_row_begin,
weights_vec.data() + batch_row_begin,
batch_nrows * sizeof(bst_float), cudaMemcpyDefault));
}
@ -349,8 +353,7 @@ struct GPUSketcher {
(fvalues_.data().get(), has_weights_ ? feature_weights_.data().get() : nullptr,
row_ptrs_.data().get() + batch_row_begin,
has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
gpu_batch_nrows_, num_cols_,
offset_vec[row_begin_ + batch_row_begin], batch_nrows);
gpu_batch_nrows_, offset_vec[batch_row_begin], batch_nrows);
for (int icol = 0; icol < num_cols_; ++icol) {
FindColumnCuts(batch_nrows, icol);
@ -358,7 +361,7 @@ struct GPUSketcher {
// add cuts into sketches
thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin());
#pragma omp parallel for schedule(static) \
#pragma omp parallel for default(none) schedule(static) \
if (num_cols_ > SketchContainer::kOmpNumColsParallelizeLimit) // NOLINT
for (int icol = 0; icol < num_cols_; ++icol) {
WXQSketch::SummaryContainer summary;
@ -391,8 +394,7 @@ struct GPUSketcher {
dh::safe_cuda(cudaSetDevice(device_));
const auto& offset_vec = row_batch.offset.HostVector();
row_ptrs_.resize(n_rows_ + 1);
thrust::copy(offset_vec.data() + row_begin_,
offset_vec.data() + row_end_ + 1, row_ptrs_.begin());
thrust::copy(offset_vec.data(), offset_vec.data() + n_rows_ + 1, row_ptrs_.begin());
size_t gpu_nbatches = common::DivRoundUp(n_rows_, gpu_batch_nrows_);
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
SketchBatch(row_batch, info, gpu_batch);
@ -401,32 +403,18 @@ struct GPUSketcher {
};
void SketchBatch(const SparsePage &batch, const MetaInfo &info) {
GPUDistribution dist =
GPUDistribution::Block(GPUSet::All(generic_param_.gpu_id, generic_param_.n_gpus,
batch.Size()));
auto device = generic_param_.gpu_id;
// create device shards
shards_.resize(dist.Devices().Size());
dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
size_t start = dist.ShardStart(batch.Size(), i);
size_t size = dist.ShardSize(batch.Size(), i);
shard = std::unique_ptr<DeviceShard>(
new DeviceShard(dist.Devices().DeviceId(i), start,
start + size, param_, sketch_container_.get()));
});
// create device shard
shard_.reset(new DeviceShard(device, batch.Size(), param_, sketch_container_.get()));
// compute sketches for each shard
dh::ExecuteIndexShards(&shards_,
[&](int idx, std::unique_ptr<DeviceShard>& shard) {
shard->Init(batch, info, gpu_batch_nrows_);
shard->Sketch(batch, info);
shard->ComputeRowStride();
});
// compute sketches for the shard
shard_->Init(batch, info, gpu_batch_nrows_);
shard_->Sketch(batch, info);
shard_->ComputeRowStride();
// compute row stride across all shards
for (const auto &shard : shards_) {
row_stride_ = std::max(row_stride_, shard->GetRowStride());
}
// compute row stride
row_stride_ = shard_->GetRowStride();
}
GPUSketcher(const tree::TrainParam &param, const GenericParameter &generic_param, int gpu_nrows)
@ -444,13 +432,13 @@ struct GPUSketcher {
this->SketchBatch(batch, info);
}
hmat->Init(&sketch_container_.get()->sketches_, param_.max_bin);
hmat->Init(&sketch_container_->sketches_, param_.max_bin);
return row_stride_;
}
private:
std::vector<std::unique_ptr<DeviceShard>> shards_;
std::unique_ptr<DeviceShard> shard_;
const tree::TrainParam &param_;
const GenericParameter &generic_param_;
int gpu_batch_nrows_;

View File

@ -30,19 +30,19 @@ struct HostDeviceVectorImpl {
};
template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, const GPUDistribution &)
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, const GPUDistribution &)
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, const GPUDistribution &)
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
@ -75,29 +75,23 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
template <typename T>
GPUSet HostDeviceVector<T>::Devices() const { return GPUSet::Empty(); }
int HostDeviceVector<T>::DeviceIdx() const { return -1; }
template <typename T>
const GPUDistribution& HostDeviceVector<T>::Distribution() const {
static GPUDistribution dummyInstance;
return dummyInstance;
}
T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; }
template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
const T* HostDeviceVector<T>::ConstDevicePointer() const {
return nullptr;
}
template <typename T>
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
common::Span<T> HostDeviceVector<T>::DeviceSpan() {
return common::Span<T>();
}
template <typename T>
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
return common::Span<const T>();
}
@ -115,10 +109,7 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
}
template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) const { return 0; }
template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) const { return 0; }
size_t HostDeviceVector<T>::DeviceSize() const { return 0; }
template <typename T>
void HostDeviceVector<T>::Fill(T v) {
@ -149,18 +140,12 @@ bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
}
template <typename T>
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
return false;
}
template <typename T>
void HostDeviceVector<T>::Shard(const GPUDistribution& distribution) const { }
template <typename T>
void HostDeviceVector<T>::Shard(GPUSet devices) const { }
template <typename T>
void Reshard(const GPUDistribution &distribution) { }
void HostDeviceVector<T>::SetDevice(int device) const {}
// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;

View File

@ -10,7 +10,6 @@
#include <mutex>
#include "./device_helpers.cuh"
namespace xgboost {
// the handler to call instead of cudaSetDevice; only used for testing
@ -43,144 +42,12 @@ class Permissions {
};
template <typename T>
struct HostDeviceVectorImpl {
struct DeviceShard {
DeviceShard()
: proper_size_{0}, device_{-1}, start_{0}, perm_d_{false},
cached_size_{static_cast<size_t>(~0)}, vec_{nullptr} {}
~DeviceShard() {
class HostDeviceVectorImpl {
public:
HostDeviceVectorImpl(size_t size, T v, int device) : device_(device), perm_h_(device < 0) {
if (device >= 0) {
SetDevice();
}
void Init(HostDeviceVectorImpl<T>* vec, int device) {
if (vec_ == nullptr) { vec_ = vec; }
CHECK_EQ(vec, vec_);
device_ = device;
LazyResize(vec_->Size());
perm_d_ = vec_->perm_h_.Complementary();
}
void Init(HostDeviceVectorImpl<T>* vec, const DeviceShard& other) {
if (vec_ == nullptr) { vec_ = vec; }
CHECK_EQ(vec, vec_);
device_ = other.device_;
cached_size_ = other.cached_size_;
start_ = other.start_;
proper_size_ = other.proper_size_;
SetDevice();
data_.resize(other.data_.size());
perm_d_ = other.perm_d_;
}
void ScatterFrom(const T* begin) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), begin + start_,
data_.size() * sizeof(T), cudaMemcpyDefault));
}
void GatherTo(thrust::device_ptr<T> begin) {
LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(begin.get() + start_, data_.data().get(),
proper_size_ * sizeof(T), cudaMemcpyDefault));
}
void Fill(T v) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
thrust::fill(data_.begin(), data_.end(), v);
}
void Copy(DeviceShard* other) {
// TODO(canonizer): avoid full copy of host data for this (but not for other)
LazySyncDevice(GPUAccess::kWrite);
other->LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), other->data_.data().get(),
data_.size() * sizeof(T), cudaMemcpyDefault));
}
void LazySyncHost(GPUAccess access) {
SetDevice();
dh::safe_cuda(cudaMemcpy(vec_->data_h_.data() + start_,
data_.data().get(), proper_size_ * sizeof(T),
cudaMemcpyDeviceToHost));
perm_d_.DenyComplementary(access);
}
void LazyResize(size_t new_size) {
if (new_size == cached_size_) { return; }
// resize is required
int ndevices = vec_->distribution_.devices_.Size();
int device_index = vec_->distribution_.devices_.Index(device_);
start_ = vec_->distribution_.ShardStart(new_size, device_index);
proper_size_ = vec_->distribution_.ShardProperSize(new_size, device_index);
// The size on this device.
size_t size_d = vec_->distribution_.ShardSize(new_size, device_index);
SetDevice();
data_.resize(size_d);
cached_size_ = new_size;
}
void LazySyncDevice(GPUAccess access) {
if (perm_d_.CanAccess(access)) { return; }
if (perm_d_.CanRead()) {
// deny read to the host
perm_d_.Grant(access);
std::lock_guard<std::mutex> lock(vec_->mutex_);
vec_->perm_h_.DenyComplementary(access);
return;
}
// data is on the host
size_t size_h = vec_->data_h_.size();
LazyResize(size_h);
SetDevice();
dh::safe_cuda(
cudaMemcpy(data_.data().get(), vec_->data_h_.data() + start_,
data_.size() * sizeof(T), cudaMemcpyHostToDevice));
perm_d_.Grant(access);
std::lock_guard<std::mutex> lock(vec_->mutex_);
vec_->perm_h_.DenyComplementary(access);
vec_->size_d_ = size_h;
}
void SetDevice() {
if (cudaSetDeviceHandler == nullptr) {
dh::safe_cuda(cudaSetDevice(device_));
} else {
(*cudaSetDeviceHandler)(device_);
}
}
T* Raw() { return data_.data().get(); }
size_t Start() const { return start_; }
size_t DataSize() const { return data_.size(); }
Permissions& Perm() { return perm_d_; }
Permissions const& Perm() const { return perm_d_; }
private:
int device_;
dh::device_vector<T> data_;
// cached vector size
size_t cached_size_;
size_t start_;
// size of the portion to copy back to the host
size_t proper_size_;
Permissions perm_d_;
HostDeviceVectorImpl<T>* vec_;
};
HostDeviceVectorImpl(size_t size, T v, const GPUDistribution &distribution)
: distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
if (!distribution_.IsEmpty()) {
size_d_ = size;
InitShards();
Fill(v);
data_d_.resize(size, v);
} else {
data_h_.resize(size, v);
}
@ -188,127 +55,81 @@ struct HostDeviceVectorImpl {
// required, as a new std::mutex has to be created
HostDeviceVectorImpl(const HostDeviceVectorImpl<T>& other)
: data_h_(other.data_h_), perm_h_(other.perm_h_), size_d_(other.size_d_),
distribution_(other.distribution_), mutex_() {
shards_.resize(other.shards_.size());
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
shard.Init(this, other.shards_.at(i));
});
: device_(other.device_), data_h_(other.data_h_), perm_h_(other.perm_h_), mutex_() {
if (device_ >= 0) {
SetDevice();
data_d_ = other.data_d_;
}
}
// Initializer can be std::vector<T> or std::initializer_list<T>
template <class Initializer>
HostDeviceVectorImpl(const Initializer& init, const GPUDistribution &distribution)
: distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
if (!distribution_.IsEmpty()) {
size_d_ = init.size();
InitShards();
HostDeviceVectorImpl(const Initializer& init, int device) : device_(device), perm_h_(device < 0) {
if (device >= 0) {
LazyResizeDevice(init.size());
Copy(init);
} else {
data_h_ = init;
}
}
void InitShards() {
int ndevices = distribution_.devices_.Size();
shards_.resize(ndevices);
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
shard.Init(this, distribution_.devices_.DeviceId(i));
});
~HostDeviceVectorImpl() {
if (device_ >= 0) {
SetDevice();
}
}
size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : size_d_; }
size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : data_d_.size(); }
GPUSet Devices() const { return distribution_.devices_; }
int DeviceIdx() const { return device_; }
const GPUDistribution& Distribution() const { return distribution_; }
T* DevicePointer(int device) {
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kWrite);
return shards_.at(distribution_.devices_.Index(device)).Raw();
T* DevicePointer() {
LazySyncDevice(GPUAccess::kWrite);
return data_d_.data().get();
}
const T* ConstDevicePointer(int device) {
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return shards_.at(distribution_.devices_.Index(device)).Raw();
const T* ConstDevicePointer() {
LazySyncDevice(GPUAccess::kRead);
return data_d_.data().get();
}
common::Span<T> DeviceSpan(int device) {
GPUSet devices = distribution_.devices_;
CHECK(devices.Contains(device));
LazySyncDevice(device, GPUAccess::kWrite);
return {shards_.at(devices.Index(device)).Raw(),
static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
common::Span<T> DeviceSpan() {
LazySyncDevice(GPUAccess::kWrite);
return {data_d_.data().get(), static_cast<typename common::Span<T>::index_type>(DeviceSize())};
}
common::Span<const T> ConstDeviceSpan(int device) {
GPUSet devices = distribution_.devices_;
CHECK(devices.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
common::Span<const T> ConstDeviceSpan() {
LazySyncDevice(GPUAccess::kRead);
using SpanInd = typename common::Span<const T>::index_type;
return {shards_.at(devices.Index(device)).Raw(),
static_cast<SpanInd>(DeviceSize(device))};
return {data_d_.data().get(), static_cast<SpanInd>(DeviceSize())};
}
size_t DeviceSize(int device) {
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return shards_.at(distribution_.devices_.Index(device)).DataSize();
size_t DeviceSize() {
LazySyncDevice(GPUAccess::kRead);
return data_d_.size();
}
size_t DeviceStart(int device) {
CHECK(distribution_.devices_.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return shards_.at(distribution_.devices_.Index(device)).Start();
thrust::device_ptr<T> tbegin() { // NOLINT
return thrust::device_ptr<T>(DevicePointer());
}
thrust::device_ptr<T> tbegin(int device) { // NOLINT
return thrust::device_ptr<T>(DevicePointer(device));
thrust::device_ptr<const T> tcbegin() { // NOLINT
return thrust::device_ptr<const T>(ConstDevicePointer());
}
thrust::device_ptr<const T> tcbegin(int device) { // NOLINT
return thrust::device_ptr<const T>(ConstDevicePointer(device));
thrust::device_ptr<T> tend() { // NOLINT
return tbegin() + DeviceSize();
}
thrust::device_ptr<T> tend(int device) { // NOLINT
return tbegin(device) + DeviceSize(device);
}
thrust::device_ptr<const T> tcend(int device) { // NOLINT
return tcbegin(device) + DeviceSize(device);
}
void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
CHECK_EQ(end - begin, Size());
if (perm_h_.CanWrite()) {
dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
(end - begin) * sizeof(T),
cudaMemcpyDeviceToHost));
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.ScatterFrom(begin.get());
});
}
}
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
CHECK_EQ(end - begin, Size());
if (perm_h_.CanWrite()) {
dh::safe_cuda(cudaMemcpy(begin.get(), data_h_.data(),
data_h_.size() * sizeof(T),
cudaMemcpyHostToDevice));
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.GatherTo(begin); });
}
thrust::device_ptr<const T> tcend() { // NOLINT
return tcbegin() + DeviceSize();
}
void Fill(T v) { // NOLINT
if (perm_h_.CanWrite()) {
std::fill(data_h_.begin(), data_h_.end(), v);
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.Fill(v); });
DeviceFill(v);
}
}
@ -320,14 +141,10 @@ struct HostDeviceVectorImpl {
return;
}
// Data is on device;
if (distribution_ != other->distribution_) {
distribution_ = GPUDistribution();
Shard(other->Distribution());
size_d_ = other->size_d_;
if (device_ != other->device_) {
SetDevice(other->device_);
}
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
shard.Copy(&other->shards_.at(i));
});
DeviceCopy(other);
}
void Copy(const std::vector<T>& other) {
@ -335,9 +152,7 @@ struct HostDeviceVectorImpl {
if (perm_h_.CanWrite()) {
std::copy(other.begin(), other.end(), data_h_.begin());
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.ScatterFrom(other.data());
});
DeviceCopy(other.data());
}
}
@ -346,9 +161,7 @@ struct HostDeviceVectorImpl {
if (perm_h_.CanWrite()) {
std::copy(other.begin(), other.end(), data_h_.begin());
} else {
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.ScatterFrom(other.begin());
});
DeviceCopy(other.begin());
}
}
@ -362,40 +175,23 @@ struct HostDeviceVectorImpl {
return data_h_;
}
void Shard(const GPUDistribution& distribution) {
if (distribution_ == distribution) { return; }
CHECK(distribution_.IsEmpty())
<< "Data resides on different GPUs: " << "ID: "
<< *(distribution_.Devices().begin()) << " and ID: "
<< *(distribution.Devices().begin());
distribution_ = distribution;
InitShards();
}
void Shard(GPUSet new_devices) {
if (distribution_.Devices() == new_devices) { return; }
Shard(GPUDistribution::Block(new_devices));
}
void Reshard(const GPUDistribution &distribution) {
if (distribution_ == distribution) { return; }
LazySyncHost(GPUAccess::kWrite);
distribution_ = distribution;
shards_.clear();
InitShards();
void SetDevice(int device) {
if (device_ == device) { return; }
if (device_ >= 0) {
LazySyncHost(GPUAccess::kWrite);
}
device_ = device;
if (device_ >= 0) {
LazyResizeDevice(data_h_.size());
}
}
void Resize(size_t new_size, T v) {
if (new_size == Size()) { return; }
if (distribution_.IsFixedSize()) {
CHECK_EQ(new_size, distribution_.offsets_.back());
}
if (Size() == 0 && !distribution_.IsEmpty()) {
if (Size() == 0 && device_ >= 0) {
// fast on-device resize
perm_h_ = Permissions(false);
size_d_ = new_size;
InitShards();
Fill(v);
data_d_.resize(new_size, v);
} else {
// resize on host
LazySyncHost(GPUAccess::kWrite);
@ -407,72 +203,110 @@ struct HostDeviceVectorImpl {
if (perm_h_.CanAccess(access)) { return; }
if (perm_h_.CanRead()) {
// data is present, just need to deny access to the device
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.Perm().DenyComplementary(access);
});
perm_h_.Grant(access);
return;
}
std::lock_guard<std::mutex> lock(mutex_);
if (data_h_.size() != size_d_) { data_h_.resize(size_d_); }
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.LazySyncHost(access);
});
if (data_h_.size() != data_d_.size()) { data_h_.resize(data_d_.size()); }
SetDevice();
dh::safe_cuda(cudaMemcpy(data_h_.data(),
data_d_.data().get(),
data_d_.size() * sizeof(T),
cudaMemcpyDeviceToHost));
perm_h_.Grant(access);
}
void LazySyncDevice(int device, GPUAccess access) {
GPUSet devices = distribution_.Devices();
CHECK(devices.Contains(device));
shards_.at(devices.Index(device)).LazySyncDevice(access);
void LazySyncDevice(GPUAccess access) {
if (DevicePerm().CanAccess(access)) { return; }
if (DevicePerm().CanRead()) {
// deny read to the host
std::lock_guard<std::mutex> lock(mutex_);
perm_h_.DenyComplementary(access);
return;
}
// data is on the host
LazyResizeDevice(data_h_.size());
SetDevice();
dh::safe_cuda(cudaMemcpy(data_d_.data().get(),
data_h_.data(),
data_d_.size() * sizeof(T),
cudaMemcpyHostToDevice));
std::lock_guard<std::mutex> lock(mutex_);
perm_h_.DenyComplementary(access);
}
bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); }
bool DeviceCanAccess(int device, GPUAccess access) {
GPUSet devices = distribution_.Devices();
if (!devices.Contains(device)) { return false; }
return shards_.at(devices.Index(device)).Perm().CanAccess(access);
}
bool DeviceCanAccess(GPUAccess access) { return DevicePerm().CanAccess(access); }
private:
std::vector<T> data_h_;
Permissions perm_h_;
// the total size of the data stored on the devices
size_t size_d_;
GPUDistribution distribution_;
int device_{-1};
std::vector<T> data_h_{};
dh::device_vector<T> data_d_{};
Permissions perm_h_{false};
// protects size_d_ and perm_h_ when updated from multiple threads
std::mutex mutex_;
std::vector<DeviceShard> shards_;
std::mutex mutex_{};
void DeviceFill(T v) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
thrust::fill(data_d_.begin(), data_d_.end(), v);
}
void DeviceCopy(HostDeviceVectorImpl* other) {
// TODO(canonizer): avoid full copy of host data for this (but not for other)
LazySyncDevice(GPUAccess::kWrite);
other->LazySyncDevice(GPUAccess::kRead);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), other->data_d_.data().get(),
data_d_.size() * sizeof(T), cudaMemcpyDefault));
}
void DeviceCopy(const T* begin) {
// TODO(canonizer): avoid full copy of host data
LazySyncDevice(GPUAccess::kWrite);
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), begin,
data_d_.size() * sizeof(T), cudaMemcpyDefault));
}
void LazyResizeDevice(size_t new_size) {
if (new_size == data_d_.size()) { return; }
SetDevice();
data_d_.resize(new_size);
}
void SetDevice() {
CHECK_GE(device_, 0);
if (cudaSetDeviceHandler == nullptr) {
dh::safe_cuda(cudaSetDevice(device_));
} else {
(*cudaSetDeviceHandler)(device_);
}
}
Permissions DevicePerm() const { return perm_h_.Complementary(); }
};
template <typename T>
HostDeviceVector<T>::HostDeviceVector
(size_t size, T v, const GPUDistribution &distribution) : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v, distribution);
}
template<typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
: impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector
(std::initializer_list<T> init, const GPUDistribution &distribution) : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector
(const std::vector<T>& init, const GPUDistribution &distribution) : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init, distribution);
}
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
}
: impl_(new HostDeviceVectorImpl<T>(*other.impl_)) {}
template <typename T>
HostDeviceVector<T>& HostDeviceVector<T>::operator=
(const HostDeviceVector<T>& other) {
HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
if (this == &other) { return *this; }
std::unique_ptr<HostDeviceVectorImpl<T>> newImpl(new HostDeviceVectorImpl<T>(*other.impl_));
@ -491,73 +325,51 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
template <typename T>
GPUSet HostDeviceVector<T>::Devices() const { return impl_->Devices(); }
int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
template <typename T>
const GPUDistribution& HostDeviceVector<T>::Distribution() const {
return impl_->Distribution();
T* HostDeviceVector<T>::DevicePointer() {
return impl_->DevicePointer();
}
template <typename T>
T* HostDeviceVector<T>::DevicePointer(int device) {
return impl_->DevicePointer(device);
const T* HostDeviceVector<T>::ConstDevicePointer() const {
return impl_->ConstDevicePointer();
}
template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
return impl_->ConstDevicePointer(device);
common::Span<T> HostDeviceVector<T>::DeviceSpan() {
return impl_->DeviceSpan();
}
template <typename T>
common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
return impl_->DeviceSpan(device);
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
return impl_->ConstDeviceSpan();
}
template <typename T>
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
return impl_->ConstDeviceSpan(device);
size_t HostDeviceVector<T>::DeviceSize() const {
return impl_->DeviceSize();
}
template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) const {
return impl_->DeviceStart(device);
thrust::device_ptr<T> HostDeviceVector<T>::tbegin() { // NOLINT
return impl_->tbegin();
}
template <typename T>
size_t HostDeviceVector<T>::DeviceSize(int device) const {
return impl_->DeviceSize(device);
thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin() const { // NOLINT
return impl_->tcbegin();
}
template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) { // NOLINT
return impl_->tbegin(device);
thrust::device_ptr<T> HostDeviceVector<T>::tend() { // NOLINT
return impl_->tend();
}
template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin(int device) const { // NOLINT
return impl_->tcbegin(device);
}
template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) { // NOLINT
return impl_->tend(device);
}
template <typename T>
thrust::device_ptr<const T> HostDeviceVector<T>::tcend(int device) const { // NOLINT
return impl_->tcend(device);
}
template <typename T>
void HostDeviceVector<T>::ScatterFrom
(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
impl_->ScatterFrom(begin, end);
}
template <typename T>
void HostDeviceVector<T>::GatherTo
(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const {
impl_->GatherTo(begin, end);
thrust::device_ptr<const T> HostDeviceVector<T>::tcend() const { // NOLINT
return impl_->tcend();
}
template <typename T>
@ -594,23 +406,13 @@ bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
}
template <typename T>
bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
return impl_->DeviceCanAccess(device, access);
bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
return impl_->DeviceCanAccess(access);
}
template <typename T>
void HostDeviceVector<T>::Shard(GPUSet new_devices) const {
impl_->Shard(new_devices);
}
template <typename T>
void HostDeviceVector<T>::Shard(const GPUDistribution &distribution) const {
impl_->Shard(distribution);
}
template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution &distribution) {
impl_->Reshard(distribution);
void HostDeviceVector<T>::SetDevice(int device) const {
impl_->SetDevice(device);
}
template <typename T>

View File

@ -79,113 +79,6 @@ void SetCudaSetDeviceHandler(void (*handler)(int));
template <typename T> struct HostDeviceVectorImpl;
// Distribution for the HostDeviceVector; it specifies such aspects as the
// devices it is distributed on, whether there are copies of elements from
// other GPUs as well as the granularity of splitting. It may also specify
// explicit boundaries for devices, in which case the size of the array cannot
// be changed.
class GPUDistribution {
template<typename T> friend struct HostDeviceVectorImpl;
public:
explicit GPUDistribution(GPUSet devices = GPUSet::Empty())
: devices_(devices), granularity_(1), overlap_(0) {}
private:
GPUDistribution(GPUSet devices, int granularity, int overlap,
std::vector<size_t> &&offsets)
: devices_(devices), granularity_(granularity), overlap_(overlap),
offsets_(std::move(offsets)) {}
public:
static GPUDistribution Empty() { return GPUDistribution(); }
static GPUDistribution Block(GPUSet devices) { return GPUDistribution(devices); }
static GPUDistribution Overlap(GPUSet devices, int overlap) {
return GPUDistribution(devices, 1, overlap, std::vector<size_t>());
}
static GPUDistribution Granular(GPUSet devices, int granularity) {
return GPUDistribution(devices, granularity, 0, std::vector<size_t>());
}
// NOTE(rongou): Explicit offsets don't necessarily cover the whole vector. Sections before the
// first shard or after the last shard may be on host only. This windowing is done in the GPU
// predictor for external memory support.
static GPUDistribution Explicit(GPUSet devices, std::vector<size_t> offsets) {
return GPUDistribution(devices, 1, 0, std::move(offsets));
}
friend bool operator==(const GPUDistribution& a, const GPUDistribution& b) {
bool const res = a.devices_ == b.devices_ &&
a.granularity_ == b.granularity_ &&
a.overlap_ == b.overlap_ &&
a.offsets_ == b.offsets_;
return res;
}
friend bool operator!=(const GPUDistribution& a, const GPUDistribution& b) {
return !(a == b);
}
GPUSet Devices() const { return devices_; }
bool IsEmpty() const { return devices_.IsEmpty(); }
size_t ShardStart(size_t size, int index) const {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
CHECK_EQ(offsets_.back(), size);
return offsets_.at(index);
}
// no explicit offsets
size_t begin = std::min(index * Portion(size), size);
begin = begin > size ? size : begin;
return begin;
}
size_t ShardSize(size_t size, size_t index) const {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
CHECK_EQ(offsets_.back(), size);
return offsets_.at(index + 1) - offsets_.at(index) +
(index == devices_.Size() - 1 ? overlap_ : 0);
}
size_t portion = Portion(size);
size_t begin = std::min(index * portion, size);
size_t end = std::min((index + 1) * portion + overlap_ * granularity_, size);
return end - begin;
}
size_t ShardProperSize(size_t size, size_t index) const {
if (size == 0) { return 0; }
return ShardSize(size, index) - (devices_.Size() - 1 > index ? overlap_ : 0);
}
bool IsFixedSize() const { return !offsets_.empty(); }
private:
static size_t DivRoundUp(size_t a, size_t b) { return (a + b - 1) / b; }
static size_t RoundUp(size_t a, size_t b) { return DivRoundUp(a, b) * b; }
size_t Portion(size_t size) const {
return RoundUp
(DivRoundUp
(std::max(static_cast<int64_t>(size - overlap_ * granularity_),
static_cast<int64_t>(1)),
devices_.Size()), granularity_);
}
GPUSet devices_;
int granularity_;
int overlap_;
// explicit offsets for the GPU parts, if any
std::vector<size_t> offsets_;
};
enum GPUAccess {
kNone, kRead,
// write implies read
@ -199,46 +92,38 @@ inline GPUAccess operator-(GPUAccess a, GPUAccess b) {
template <typename T>
class HostDeviceVector {
public:
explicit HostDeviceVector(size_t size = 0, T v = T(),
const GPUDistribution &distribution = GPUDistribution());
HostDeviceVector(std::initializer_list<T> init,
const GPUDistribution &distribution = GPUDistribution());
explicit HostDeviceVector(const std::vector<T>& init,
const GPUDistribution &distribution = GPUDistribution());
explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1);
HostDeviceVector(std::initializer_list<T> init, int device = -1);
explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
~HostDeviceVector();
HostDeviceVector(const HostDeviceVector<T>&);
HostDeviceVector<T>& operator=(const HostDeviceVector<T>&);
size_t Size() const;
GPUSet Devices() const;
const GPUDistribution& Distribution() const;
common::Span<T> DeviceSpan(int device);
common::Span<const T> ConstDeviceSpan(int device) const;
common::Span<const T> DeviceSpan(int device) const { return ConstDeviceSpan(device); }
T* DevicePointer(int device);
const T* ConstDevicePointer(int device) const;
const T* DevicePointer(int device) const { return ConstDevicePointer(device); }
int DeviceIdx() const;
common::Span<T> DeviceSpan();
common::Span<const T> ConstDeviceSpan() const;
common::Span<const T> DeviceSpan() const { return ConstDeviceSpan(); }
T* DevicePointer();
const T* ConstDevicePointer() const;
const T* DevicePointer() const { return ConstDevicePointer(); }
T* HostPointer() { return HostVector().data(); }
const T* ConstHostPointer() const { return ConstHostVector().data(); }
const T* HostPointer() const { return ConstHostPointer(); }
size_t DeviceStart(int device) const;
size_t DeviceSize(int device) const;
size_t DeviceSize() const;
// only define functions returning device_ptr
// if HostDeviceVector.h is included from a .cu file
#ifdef __CUDACC__
thrust::device_ptr<T> tbegin(int device); // NOLINT
thrust::device_ptr<T> tend(int device); // NOLINT
thrust::device_ptr<const T> tcbegin(int device) const; // NOLINT
thrust::device_ptr<const T> tcend(int device) const; // NOLINT
thrust::device_ptr<const T> tbegin(int device) const { // NOLINT
return tcbegin(device);
thrust::device_ptr<T> tbegin(); // NOLINT
thrust::device_ptr<T> tend(); // NOLINT
thrust::device_ptr<const T> tcbegin() const; // NOLINT
thrust::device_ptr<const T> tcend() const; // NOLINT
thrust::device_ptr<const T> tbegin() const { // NOLINT
return tcbegin();
}
thrust::device_ptr<const T> tend(int device) const { return tcend(device); } // NOLINT
void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end);
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const;
thrust::device_ptr<const T> tend() const { return tcend(); } // NOLINT
#endif // __CUDACC__
void Fill(T v);
@ -251,18 +136,9 @@ class HostDeviceVector {
const std::vector<T>& HostVector() const {return ConstHostVector(); }
bool HostCanAccess(GPUAccess access) const;
bool DeviceCanAccess(int device, GPUAccess access) const;
bool DeviceCanAccess(GPUAccess access) const;
/*!
* \brief Specify memory distribution.
*/
void Shard(const GPUDistribution &distribution) const;
void Shard(GPUSet devices) const;
/*!
* \brief Change memory distribution.
*/
void Reshard(const GPUDistribution &distribution);
void SetDevice(int device) const;
void Resize(size_t new_size, T v = T());
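
Taken together, the single-GPU surface reads as follows; a minimal usage sketch assuming a CUDA build and an arbitrary ordinal of 0:

    xgboost::HostDeviceVector<float> vec(16, 0.0f);  // data starts on the host
    vec.SetDevice(0);                                // bind to GPU 0; -1 would keep it host-only
    auto d_span = vec.DeviceSpan();                  // lazily copies the contents to device 0
    vec.Fill(1.0f);                                  // runs on whichever side currently owns the data
    const auto& h_vec = vec.HostVector();            // syncs back to the host on demand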

View File

@ -57,14 +57,10 @@ class Transform {
template <typename Functor>
struct Evaluator {
public:
Evaluator(Functor func, Range range, GPUSet devices, bool shard) :
Evaluator(Functor func, Range range, int device, bool shard) :
func_(func), range_{std::move(range)},
shard_{shard},
distribution_{GPUDistribution::Block(devices)} {}
Evaluator(Functor func, Range range, GPUDistribution dist,
bool shard) :
func_(func), range_{std::move(range)}, shard_{shard},
distribution_{std::move(dist)} {}
device_{device} {}
/*!
* \brief Evaluate the functor with input pointers to HostDeviceVector.
@ -74,7 +70,7 @@ class Transform {
*/
template <typename... HDV>
void Eval(HDV... vectors) const {
bool on_device = !distribution_.IsEmpty();
bool on_device = device_ >= 0;
if (on_device) {
LaunchCUDA(func_, vectors...);
@ -86,13 +82,13 @@ class Transform {
private:
// CUDA UnpackHDV
template <typename T>
Span<T> UnpackHDV(HostDeviceVector<T>* _vec, int _device) const {
auto span = _vec->DeviceSpan(_device);
Span<T> UnpackHDVOnDevice(HostDeviceVector<T>* _vec) const {
auto span = _vec->DeviceSpan();
return span;
}
template <typename T>
Span<T const> UnpackHDV(const HostDeviceVector<T>* _vec, int _device) const {
auto span = _vec->ConstDeviceSpan(_device);
Span<T const> UnpackHDVOnDevice(const HostDeviceVector<T>* _vec) const {
auto span = _vec->ConstDeviceSpan();
return span;
}
// CPU UnpackHDV
@ -108,15 +104,15 @@ class Transform {
}
// Recursive unpack for Shard.
template <typename T>
void UnpackShard(GPUDistribution dist, const HostDeviceVector<T> *vector) const {
vector->Shard(dist);
void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
vector->SetDevice(device);
}
template <typename Head, typename... Rest>
void UnpackShard(GPUDistribution dist,
void UnpackShard(int device,
const HostDeviceVector<Head> *_vector,
const HostDeviceVector<Rest> *... _vectors) const {
_vector->Shard(dist);
UnpackShard(dist, _vectors...);
_vector->SetDevice(device);
UnpackShard(device, _vectors...);
}
#if defined(__CUDACC__)
@ -124,28 +120,20 @@ class Transform {
typename... HDV>
void LaunchCUDA(Functor _func, HDV*... _vectors) const {
if (shard_)
UnpackShard(distribution_, _vectors...);
UnpackShard(device_, _vectors...);
GPUSet devices = distribution_.Devices();
size_t range_size = *range_.end() - *range_.begin();
// Extract index to deal with possible old OpenMP.
size_t device_beg = *(devices.begin());
size_t device_end = *(devices.end());
#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
for (omp_ulong device = device_beg; device < device_end; ++device) { // NOLINT
// Ignore other attributes of GPUDistribution for spliting index.
// This deals with situation like multi-class setting where
// granularity is used in data vector.
size_t shard_size = GPUDistribution::Block(devices).ShardSize(
range_size, devices.Index(device));
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
dh::safe_cuda(cudaSetDevice(device));
const int GRID_SIZE =
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
_func, shard_range, UnpackHDV(_vectors, device)...);
}
// This deals with situation like multi-class setting where
// granularity is used in data vector.
size_t shard_size = range_size;
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
dh::safe_cuda(cudaSetDevice(device_));
const int GRID_SIZE =
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
_func, shard_range, UnpackHDVOnDevice(_vectors)...);
}
#else
/*! \brief Dummy function defined when compiling for CPU. */
@ -172,7 +160,7 @@ class Transform {
Range range_;
/*! \brief Whether sharding for vectors is required. */
bool shard_;
GPUDistribution distribution_;
int device_;
};
public:
@ -191,15 +179,9 @@ class Transform {
*/
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUSet const devices,
int device,
bool const shard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(devices), shard};
}
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUDistribution const dist,
bool const shard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(dist), shard};
return Evaluator<Functor> {func, std::move(range), device, shard};
}
};
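
A usage sketch of the simplified Evaluator: the caller passes one device ordinal instead of a GPUSet/GPUDistribution. The lambda, io_preds, and device below are illustrative; XGBOOST_DEVICE, common::Range, and common::Span are existing xgboost types:

    common::Transform<>::Init(
        [] XGBOOST_DEVICE(size_t i, common::Span<bst_float> preds) {
          preds[i] = 1.0f / (1.0f + expf(-preds[i]));
        },
        common::Range{0, static_cast<int64_t>(io_preds->Size())},
        device)         // -1 takes the CPU path; >= 0 launches the kernel on that single GPU
        .Eval(io_preds);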

View File

@ -78,9 +78,9 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
} else {
LOG(FATAL) << "Unknown metainfo: " << key;
}
dst->Reshard(GPUDistribution(GPUSet::Range(ptr_device, 1)));
dst->SetDevice(ptr_device);
dst->Resize(length);
auto p_dst = thrust::device_pointer_cast(dst->DevicePointer(0));
auto p_dst = thrust::device_pointer_cast(dst->DevicePointer());
thrust::copy(p_src, p_src + length, p_dst);
}
} // namespace xgboost

View File

@ -77,16 +77,14 @@ void SimpleCSRSource::FromDeviceColumnar(std::vector<Columnar> cols) {
dh::safe_cuda(cudaSetDevice(device));
GPUSet devices = GPUSet::Range(device, 1);
page_.offset.Reshard(GPUDistribution(devices));
page_.offset.SetDevice(device);
page_.offset.Resize(info.num_row_ + 1);
page_.data.Reshard(GPUDistribution(devices));
page_.data.SetDevice(device);
page_.data.Resize(info.num_nonzero_);
auto s_data = page_.data.DeviceSpan(device);
auto s_offsets = page_.offset.DeviceSpan(device);
auto s_data = page_.data.DeviceSpan();
auto s_offsets = page_.offset.DeviceSpan();
CHECK_EQ(s_offsets.size(), n_rows + 1);
int32_t constexpr kThreads = 256;

View File

@ -182,9 +182,9 @@ void GBTree::DoBoost(DMatrix* p_fmat,
CHECK_EQ(in_gpair->Size() % ngroup, 0U)
<< "must have exactly ngroup*nrow gpairs";
// TODO(canonizer): perform this on GPU if HostDeviceVector has device set.
HostDeviceVector<GradientPair> tmp
(in_gpair->Size() / ngroup, GradientPair(),
GPUDistribution::Block(in_gpair->Distribution().Devices()));
HostDeviceVector<GradientPair> tmp(in_gpair->Size() / ngroup,
GradientPair(),
in_gpair->DeviceIdx());
const auto& gpair_h = in_gpair->ConstHostVector();
auto nsize = static_cast<bst_omp_uint>(tmp.Size());
for (int gid = 0; gid < ngroup; ++gid) {

View File

@ -237,14 +237,13 @@ class LearnerImpl : public Learner {
std::vector<std::pair<std::string, std::string> > attr;
fi->Read(&attr);
for (auto& kv : attr) {
// Load `predictor`, `n_gpus`, `gpu_id` parameters from extra attributes
// Load `predictor`, `gpu_id` parameters from extra attributes
const std::string prefix = "SAVED_PARAM_";
if (kv.first.find(prefix) == 0) {
const std::string saved_param = kv.first.substr(prefix.length());
bool is_gpu_predictor = saved_param == "predictor" && kv.second == "gpu_predictor";
#ifdef XGBOOST_USE_CUDA
if (saved_param == "predictor" || saved_param == "n_gpus"
|| saved_param == "gpu_id") {
if (saved_param == "predictor" || saved_param == "gpu_id") {
cfg_[saved_param] = kv.second;
LOG(INFO)
<< "Parameter '" << saved_param << "' has been recovered from "
@ -266,7 +265,7 @@ class LearnerImpl : public Learner {
}
#endif // XGBOOST_USE_CUDA
// NO visible GPU in current environment
if (is_gpu_predictor && GPUSet::AllVisible().Size() == 0) {
if (is_gpu_predictor && common::AllVisibleGPUs() == 0) {
cfg_["predictor"] = "cpu_predictor";
kv.second = "cpu_predictor";
LOG(INFO) << "Switch gpu_predictor to cpu_predictor.";
@ -294,7 +293,9 @@ class LearnerImpl : public Learner {
auto n = tparam_.__DICT__();
cfg_.insert(n.cbegin(), n.cend());
gbm_->Configure({cfg_.cbegin(), cfg_.cend()});
Args args = {cfg_.cbegin(), cfg_.cend()};
generic_param_.InitAllowUnknown(args);
gbm_->Configure(args);
obj_->Configure({cfg_.begin(), cfg_.end()});
for (auto& p_metric : metrics_) {
@ -331,9 +332,8 @@ class LearnerImpl : public Learner {
}
}
{
// Write `predictor`, `n_gpus`, `gpu_id` parameters as extra attributes
for (const auto& key : std::vector<std::string>{
"predictor", "n_gpus", "gpu_id"}) {
// Write `predictor`, `gpu_id` parameters as extra attributes
for (const auto& key : std::vector<std::string>{"predictor", "gpu_id"}) {
auto it = cfg_.find(key);
if (it != cfg_.end()) {
mparam.contain_extra_attrs = 1;
@ -581,13 +581,8 @@ class LearnerImpl : public Learner {
gbm_->Configure(args);
if (this->gbm_->UseGPU()) {
if (cfg_.find("n_gpus") == cfg_.cend()) {
generic_param_.n_gpus = 1;
}
if (generic_param_.n_gpus != 1) {
LOG(FATAL) << "Single process multi-GPU training is no longer supported. "
"Please switch to distributed GPU training with one process per GPU. "
"This can be done using Dask or Spark.";
if (cfg_.find("gpu_id") == cfg_.cend()) {
generic_param_.gpu_id = 0;
}
}
}

View File

@ -19,12 +19,6 @@ namespace linear {
DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate);
void RescaleIndices(int device_idx, size_t ridx_begin,
common::Span<xgboost::Entry> data) {
dh::LaunchN(device_idx, data.size(),
[=] __device__(size_t idx) { data[idx].index -= ridx_begin; });
}
class DeviceShard {
int device_id_;
dh::BulkAllocator ba_;
@ -32,18 +26,16 @@ class DeviceShard {
common::Span<xgboost::Entry> data_;
common::Span<GradientPair> gpair_;
dh::CubMemory temp_;
size_t ridx_begin_;
size_t ridx_end_;
size_t shard_size_;
public:
DeviceShard(int device_id,
const SparsePage &batch, // column batch
bst_uint row_begin, bst_uint row_end,
bst_uint shard_size,
const LinearTrainParam &param,
const gbm::GBLinearModelParam &model_param)
: device_id_(device_id),
ridx_begin_(row_begin),
ridx_end_(row_end) {
shard_size_(shard_size) {
if ( IsEmpty() ) { return; }
dh::safe_cuda(cudaSetDevice(device_id_));
// The begin and end indices for the section of each column associated with
@ -51,25 +43,25 @@ class DeviceShard {
std::vector<std::pair<bst_uint, bst_uint>> column_segments;
row_ptr_ = {0};
// iterate through columns
for (auto fidx = 0; fidx < batch.Size(); fidx++) {
for (size_t fidx = 0; fidx < batch.Size(); fidx++) {
common::Span<Entry const> col = batch[fidx];
auto cmp = [](Entry e1, Entry e2) {
return e1.index < e2.index;
};
auto column_begin =
std::lower_bound(col.cbegin(), col.cend(),
xgboost::Entry(row_begin, 0.0f), cmp);
xgboost::Entry(0, 0.0f), cmp);
auto column_end =
std::lower_bound(col.cbegin(), col.cend(),
xgboost::Entry(row_end, 0.0f), cmp);
xgboost::Entry(shard_size_, 0.0f), cmp);
column_segments.emplace_back(
std::make_pair(column_begin - col.cbegin(), column_end - col.cbegin()));
row_ptr_.push_back(row_ptr_.back() + (column_end - column_begin));
}
ba_.Allocate(device_id_, &data_, row_ptr_.back(), &gpair_,
(row_end - row_begin) * model_param.num_output_group);
shard_size_ * model_param.num_output_group);
for (int fidx = 0; fidx < batch.Size(); fidx++) {
for (size_t fidx = 0; fidx < batch.Size(); fidx++) {
auto col = batch[fidx];
auto seg = column_segments[fidx];
dh::safe_cuda(cudaMemcpy(
@ -77,23 +69,21 @@ class DeviceShard {
col.data() + seg.first,
sizeof(Entry) * (seg.second - seg.first), cudaMemcpyHostToDevice));
}
// Rescale indices with respect to current shard
RescaleIndices(device_id_, ridx_begin_, data_);
}
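
The constructor above slices each CSC column to the rows covered by the (now single) shard with two std::lower_bound calls on the row index, searching for Entry(0, 0) and Entry(shard_size, 0). A self-contained sketch of that segment computation, using a simplified Entry and a hypothetical ColumnSegment helper:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Stand-in for xgboost::Entry: a (row index, value) pair sorted by index
// within each CSC column. Simplified for illustration.
struct Entry {
  std::uint32_t index;
  float fvalue;
};

// Returns [begin, end) offsets of the entries of one column that fall inside
// rows [0, shard_size), mirroring the lower_bound calls in the DeviceShard ctor.
std::pair<size_t, size_t> ColumnSegment(const std::vector<Entry>& col,
                                        std::uint32_t shard_size) {
  auto cmp = [](const Entry& a, const Entry& b) { return a.index < b.index; };
  auto begin = std::lower_bound(col.begin(), col.end(), Entry{0, 0.0f}, cmp);
  auto end = std::lower_bound(col.begin(), col.end(), Entry{shard_size, 0.0f}, cmp);
  return {static_cast<size_t>(begin - col.begin()),
          static_cast<size_t>(end - col.begin())};
}

int main() {
  std::vector<Entry> col{{0, 1.f}, {3, 2.f}, {7, 0.5f}, {12, 4.f}};
  auto seg = ColumnSegment(col, 10);  // rows 0..9 cover the first three entries
  std::cout << seg.first << " " << seg.second << "\n";  // prints "0 3"
  return 0;
}
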
~DeviceShard() {
~DeviceShard() { // NOLINT
dh::safe_cuda(cudaSetDevice(device_id_));
}
bool IsEmpty() {
return (ridx_end_ - ridx_begin_) == 0;
return shard_size_ == 0;
}
void UpdateGpair(const std::vector<GradientPair> &host_gpair,
const gbm::GBLinearModelParam &model_param) {
dh::safe_cuda(cudaMemcpyAsync(
gpair_.data(),
host_gpair.data() + ridx_begin_ * model_param.num_output_group,
host_gpair.data(),
gpair_.size() * sizeof(GradientPair), cudaMemcpyHostToDevice));
}
@ -107,13 +97,13 @@ class DeviceShard {
counting, f);
auto perm = thrust::make_permutation_iterator(gpair_.data(), skip);
return dh::SumReduction(temp_, perm, ridx_end_ - ridx_begin_);
return dh::SumReduction(temp_, perm, shard_size_);
}
void UpdateBiasResidual(float dbias, int group_idx, int num_groups) {
if (dbias == 0.0f) return;
auto d_gpair = gpair_;
dh::LaunchN(device_id_, ridx_end_ - ridx_begin_, [=] __device__(size_t idx) {
dh::LaunchN(device_id_, shard_size_, [=] __device__(size_t idx) {
auto &g = d_gpair[idx * num_groups + group_idx];
g += GradientPair(g.GetHess() * dbias, 0);
});
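
GetBiasGradient above reduces the gradient pairs of one output group over all rows by indexing gpair_[row * num_groups + group] through a permutation iterator and dh::SumReduction. A host-side analogue of the same strided indexing, with a simplified GradientPair; the real reduction runs on the GPU, this loop only illustrates the layout:

#include <iostream>
#include <vector>

// Simplified stand-in for xgboost::GradientPair.
struct GradientPair {
  float grad = 0.f, hess = 0.f;
};

// Sum the gradient pair of one output group across all rows, where gpair is
// laid out row-major as [row * num_groups + group].
GradientPair BiasGradient(const std::vector<GradientPair>& gpair,
                          int group_idx, int num_groups) {
  GradientPair sum;
  for (size_t row = 0; row * num_groups + group_idx < gpair.size(); ++row) {
    const auto& g = gpair[row * num_groups + group_idx];
    sum.grad += g.grad;
    sum.hess += g.hess;
  }
  return sum;
}

int main() {
  // Two rows, two output groups.
  std::vector<GradientPair> gpair{{1.f, 2.f}, {3.f, 4.f}, {5.f, 6.f}, {7.f, 8.f}};
  auto g0 = BiasGradient(gpair, /*group_idx=*/0, /*num_groups=*/2);
  std::cout << g0.grad << " " << g0.hess << "\n";  // prints "6 8"
  return 0;
}
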
@ -154,7 +144,7 @@ class DeviceShard {
* \brief Coordinate descent algorithm that updates one feature per iteration
*/
class GPUCoordinateUpdater : public LinearUpdater {
class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
public:
// set training parameter
void Configure(Args const& args) override {
@ -165,37 +155,23 @@ class GPUCoordinateUpdater : public LinearUpdater {
void LazyInitShards(DMatrix *p_fmat,
const gbm::GBLinearModelParam &model_param) {
if (!shards_.empty()) return;
if (shard_) return;
dist_ = GPUDistribution::Block(GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus,
p_fmat->Info().num_row_));
auto devices = dist_.Devices();
device_ = learner_param_->gpu_id;
size_t n_devices = static_cast<size_t>(devices.Size());
size_t row_begin = 0;
size_t num_row = static_cast<size_t>(p_fmat->Info().num_row_);
auto num_row = static_cast<size_t>(p_fmat->Info().num_row_);
// Partition input matrix into row segments
std::vector<size_t> row_segments;
row_segments.push_back(0);
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
size_t shard_size = dist_.ShardSize(num_row, d_idx);
size_t row_end = row_begin + shard_size;
row_segments.push_back(row_end);
row_begin = row_end;
}
size_t shard_size = num_row;
row_segments.push_back(shard_size);
CHECK(p_fmat->SingleColBlock());
SparsePage const& batch = *(p_fmat->GetBatches<CSCPage>().begin());
shards_.resize(n_devices);
// Create device shards
dh::ExecuteIndexShards(&shards_,
[&](int i, std::unique_ptr<DeviceShard>& shard) {
shard = std::unique_ptr<DeviceShard>(
new DeviceShard(devices.DeviceId(i), batch, row_segments[i],
row_segments[i + 1], tparam_, model_param));
});
// Create device shard
shard_.reset(new DeviceShard(device_, batch, shard_size, tparam_, model_param));
}
void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
@ -208,11 +184,9 @@ class GPUCoordinateUpdater : public LinearUpdater {
monitor_.Start("UpdateGpair");
auto &in_gpair_host = in_gpair->ConstHostVector();
// Update gpair
dh::ExecuteIndexShards(&shards_, [&](int idx, std::unique_ptr<DeviceShard>& shard) {
if (!shard->IsEmpty()) {
shard->UpdateGpair(in_gpair_host, model->param);
}
});
if (shard_) {
shard_->UpdateGpair(in_gpair_host, model->param);
}
monitor_.Stop("UpdateGpair");
monitor_.Start("UpdateBias");
@ -237,32 +211,21 @@ class GPUCoordinateUpdater : public LinearUpdater {
}
void UpdateBias(DMatrix *p_fmat, gbm::GBLinearModel *model) {
for (int group_idx = 0; group_idx < model->param.num_output_group;
++group_idx) {
for (int group_idx = 0; group_idx < model->param.num_output_group; ++group_idx) {
// Get gradient
auto grad = dh::ReduceShards<GradientPair>(
&shards_, [&](std::unique_ptr<DeviceShard> &shard) {
if (!shard->IsEmpty()) {
GradientPair result =
shard->GetBiasGradient(group_idx,
model->param.num_output_group);
return result;
}
return GradientPair(0, 0);
});
auto grad = GradientPair(0, 0);
if (shard_) {
grad = shard_->GetBiasGradient(group_idx, model->param.num_output_group);
}
auto dbias = static_cast<float>(
tparam_.learning_rate *
CoordinateDeltaBias(grad.GetGrad(), grad.GetHess()));
CoordinateDeltaBias(grad.GetGrad(), grad.GetHess()));
model->bias()[group_idx] += dbias;
// Update residual
dh::ExecuteIndexShards(&shards_, [&](int idx, std::unique_ptr<DeviceShard>& shard) {
if (!shard->IsEmpty()) {
shard->UpdateBiasResidual(dbias, group_idx,
model->param.num_output_group);
}
});
if (shard_) {
shard_->UpdateBiasResidual(dbias, group_idx, model->param.num_output_group);
}
}
}
@ -271,38 +234,30 @@ class GPUCoordinateUpdater : public LinearUpdater {
gbm::GBLinearModel *model) {
bst_float &w = (*model)[fidx][group_idx];
// Get gradient
auto grad = dh::ReduceShards<GradientPair>(
&shards_, [&](std::unique_ptr<DeviceShard> &shard) {
if (!shard->IsEmpty()) {
return shard->GetGradient(group_idx, model->param.num_output_group,
fidx);
}
return GradientPair(0, 0);
});
auto grad = GradientPair(0, 0);
if (shard_) {
grad = shard_->GetGradient(group_idx, model->param.num_output_group, fidx);
}
auto dw = static_cast<float>(tparam_.learning_rate *
CoordinateDelta(grad.GetGrad(), grad.GetHess(),
w, tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm));
w += dw;
dh::ExecuteIndexShards(&shards_, [&](int idx,
std::unique_ptr<DeviceShard> &shard) {
if (!shard->IsEmpty()) {
shard->UpdateResidual(dw, group_idx, model->param.num_output_group, fidx);
}
});
if (shard_) {
shard_->UpdateResidual(dw, group_idx, model->param.num_output_group, fidx);
}
}
private:
// training parameter
LinearTrainParam tparam_;
CoordinateParam coord_param_;
GPUDistribution dist_;
int device_{};
std::unique_ptr<FeatureSelector> selector_;
common::Monitor monitor_;
std::vector<std::unique_ptr<DeviceShard>> shards_;
std::unique_ptr<DeviceShard> shard_{nullptr};
};
XGBOOST_REGISTER_LINEAR_UPDATER(GPUCoordinateUpdater, "gpu_coord_descent")

View File

@ -30,8 +30,7 @@ DMLC_REGISTRY_FILE_TAG(elementwise_metric);
template <typename EvalRow>
class ElementWiseMetricsReduction {
public:
explicit ElementWiseMetricsReduction(EvalRow policy) :
policy_(std::move(policy)) {}
explicit ElementWiseMetricsReduction(EvalRow policy) : policy_(std::move(policy)) {}
PackedReduceResult CpuReduceMetrics(
const HostDeviceVector<bst_float>& weights,
@ -59,34 +58,31 @@ class ElementWiseMetricsReduction {
#if defined(XGBOOST_USE_CUDA)
~ElementWiseMetricsReduction() {
for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) {
dh::safe_cuda(cudaSetDevice(id));
size_t index = devices_.Index(id);
allocators_.at(index).Free();
if (device_ >= 0) {
dh::safe_cuda(cudaSetDevice(device_));
allocator_.Free();
}
}
PackedReduceResult DeviceReduceMetrics(
GPUSet::GpuIdType device_id,
size_t device_index,
const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels,
const HostDeviceVector<bst_float>& preds) {
size_t n_data = preds.DeviceSize(device_id);
size_t n_data = preds.DeviceSize();
thrust::counting_iterator<size_t> begin(0);
thrust::counting_iterator<size_t> end = begin + n_data;
auto s_label = labels.DeviceSpan(device_id);
auto s_preds = preds.DeviceSpan(device_id);
auto s_weights = weights.DeviceSpan(device_id);
auto s_label = labels.DeviceSpan();
auto s_preds = preds.DeviceSpan();
auto s_weights = weights.DeviceSpan();
bool const is_null_weight = weights.Size() == 0;
auto d_policy = policy_;
PackedReduceResult result = thrust::transform_reduce(
thrust::cuda::par(allocators_.at(device_index)),
thrust::cuda::par(allocator_),
begin, end,
[=] XGBOOST_DEVICE(size_t idx) {
bst_float weight = is_null_weight ? 1.0f : s_weights[idx];
@ -105,37 +101,24 @@ class ElementWiseMetricsReduction {
PackedReduceResult Reduce(
const GenericParameter &tparam,
GPUSet devices,
int device,
const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels,
const HostDeviceVector<bst_float>& preds) {
PackedReduceResult result;
if (devices.IsEmpty()) {
if (device < 0) {
result = CpuReduceMetrics(weights, labels, preds);
}
#if defined(XGBOOST_USE_CUDA)
else { // NOLINT
if (allocators_.empty()) {
devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus);
allocators_.resize(devices_.Size());
}
preds.Shard(devices);
labels.Shard(devices);
weights.Shard(devices);
std::vector<PackedReduceResult> res_per_device(devices.Size());
device_ = device;
preds.SetDevice(device_);
labels.SetDevice(device_);
weights.SetDevice(device_);
#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
for (GPUSet::GpuIdType id = *devices.begin(); id < *devices.end(); ++id) {
dh::safe_cuda(cudaSetDevice(id));
size_t index = devices.Index(id);
res_per_device.at(index) =
DeviceReduceMetrics(id, index, weights, labels, preds);
}
for (auto const& res : res_per_device) {
result += res;
}
dh::safe_cuda(cudaSetDevice(device_));
result = DeviceReduceMetrics(weights, labels, preds);
}
#endif // defined(XGBOOST_USE_CUDA)
return result;
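
Reduce now dispatches on a single device ordinal: a negative device falls back to CpuReduceMetrics, otherwise the vectors are bound to that device and a thrust::transform_reduce accumulates the (residue, weight) pair. The host-side sketch below shows the same accumulation contract for a squared-error policy; PackedResult and ReduceSquaredError are illustrative stand-ins for PackedReduceResult and the EvalRow policy:

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

struct PackedResult {
  double residue = 0.0, weights = 0.0;
};

PackedResult ReduceSquaredError(const std::vector<float>& labels,
                                const std::vector<float>& preds,
                                const std::vector<float>& weights) {
  const bool null_weight = weights.empty();  // empty weights mean weight == 1
  PackedResult res;
  for (size_t i = 0; i < preds.size(); ++i) {
    const float w = null_weight ? 1.0f : weights[i];
    const float diff = preds[i] - labels[i];
    res.residue += static_cast<double>(diff) * diff * w;
    res.weights += w;
  }
  return res;
}

int main() {
  PackedResult r = ReduceSquaredError({0.f, 1.f}, {0.5f, 0.f}, {});
  std::cout << std::sqrt(r.residue / r.weights) << "\n";  // RMSE over two rows
  return 0;
}
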
@ -144,8 +127,8 @@ class ElementWiseMetricsReduction {
private:
EvalRow policy_;
#if defined(XGBOOST_USE_CUDA)
GPUSet devices_;
std::vector<dh::CubMemory> allocators_;
int device_{-1};
dh::CubMemory allocator_;
#endif // defined(XGBOOST_USE_CUDA)
};
@ -345,11 +328,10 @@ struct EvalEWiseBase : public Metric {
<< "label and prediction size not match, "
<< "hint: use merror or mlogloss for multi-class classification";
const auto ndata = static_cast<omp_ulong>(info.labels_.Size());
// Dealing with ndata < n_gpus.
GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata);
int device = tparam_->gpu_id;
auto result =
reducer_.Reduce(*tparam_, devices, info.weights_, info.labels_, preds);
reducer_.Reduce(*tparam_, device, info.weights_, info.labels_, preds);
double dat[2] { result.Residue(), result.Weights() };
if (distributed) {

View File

@ -74,35 +74,32 @@ class MultiClassMetricsReduction {
#if defined(XGBOOST_USE_CUDA)
~MultiClassMetricsReduction() {
for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) {
dh::safe_cuda(cudaSetDevice(id));
size_t index = devices_.Index(id);
allocators_.at(index).Free();
if (device_ >= 0) {
dh::safe_cuda(cudaSetDevice(device_));
allocator_.Free();
}
}
PackedReduceResult DeviceReduceMetrics(
GPUSet::GpuIdType device_id,
size_t device_index,
const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels,
const HostDeviceVector<bst_float>& preds,
const size_t n_class) {
size_t n_data = labels.DeviceSize(device_id);
size_t n_data = labels.DeviceSize();
thrust::counting_iterator<size_t> begin(0);
thrust::counting_iterator<size_t> end = begin + n_data;
auto s_labels = labels.DeviceSpan(device_id);
auto s_preds = preds.DeviceSpan(device_id);
auto s_weights = weights.DeviceSpan(device_id);
auto s_labels = labels.DeviceSpan();
auto s_preds = preds.DeviceSpan();
auto s_weights = weights.DeviceSpan();
bool const is_null_weight = weights.Size() == 0;
auto s_label_error = label_error_.GetSpan<int32_t>(1);
s_label_error[0] = 0;
PackedReduceResult result = thrust::transform_reduce(
thrust::cuda::par(allocators_.at(device_index)),
thrust::cuda::par(allocator_),
begin, end,
[=] XGBOOST_DEVICE(size_t idx) {
bst_float weight = is_null_weight ? 1.0f : s_weights[idx];
@ -127,38 +124,25 @@ class MultiClassMetricsReduction {
PackedReduceResult Reduce(
const GenericParameter &tparam,
GPUSet devices,
int device,
size_t n_class,
const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels,
const HostDeviceVector<bst_float>& preds) {
PackedReduceResult result;
if (devices.IsEmpty()) {
if (device < 0) {
result = CpuReduceMetrics(weights, labels, preds, n_class);
}
#if defined(XGBOOST_USE_CUDA)
else { // NOLINT
if (allocators_.empty()) {
devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus);
allocators_.resize(devices_.Size());
}
preds.Shard(GPUDistribution::Granular(devices, n_class));
labels.Shard(devices);
weights.Shard(devices);
std::vector<PackedReduceResult> res_per_device(devices.Size());
device_ = tparam.gpu_id;
preds.SetDevice(device_);
labels.SetDevice(device_);
weights.SetDevice(device_);
#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
for (GPUSet::GpuIdType id = *devices.begin(); id < *devices.end(); ++id) {
dh::safe_cuda(cudaSetDevice(id));
size_t index = devices.Index(id);
res_per_device.at(index) =
DeviceReduceMetrics(id, index, weights, labels, preds, n_class);
}
for (auto const& res : res_per_device) {
result += res;
}
dh::safe_cuda(cudaSetDevice(device_));
result = DeviceReduceMetrics(weights, labels, preds, n_class);
}
#endif // defined(XGBOOST_USE_CUDA)
return result;
@ -167,8 +151,8 @@ class MultiClassMetricsReduction {
private:
#if defined(XGBOOST_USE_CUDA)
dh::PinnedMemory label_error_;
GPUSet devices_;
std::vector<dh::CubMemory> allocators_;
int device_{-1};
dh::CubMemory allocator_;
#endif // defined(XGBOOST_USE_CUDA)
};
@ -190,8 +174,8 @@ struct EvalMClassBase : public Metric {
<< " use logloss for binary classification";
const auto ndata = static_cast<bst_omp_uint>(info.labels_.Size());
GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata);
auto result = reducer_.Reduce(*tparam_, devices, nclass, info.weights_, info.labels_, preds);
int device = tparam_->gpu_id;
auto result = reducer_.Reduce(*tparam_, device, nclass, info.weights_, info.labels_, preds);
double dat[2] { result.Residue(), result.Weights() };
if (distributed) {

View File

@ -58,7 +58,7 @@ class HingeObj : public ObjFunction {
_out_gpair[_idx] = GradientPair(g, h);
},
common::Range{0, static_cast<int64_t>(ndata)},
GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata)).Eval(
tparam_->gpu_id).Eval(
out_gpair, &preds, &info.labels_, &info.weights_);
}
@ -68,7 +68,7 @@ class HingeObj : public ObjFunction {
_preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0;
},
common::Range{0, static_cast<int64_t>(io_preds->Size()), 1},
GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size()))
tparam_->gpu_id)
.Eval(io_preds);
}
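
For context, the hinge objective produces one (gradient, hessian) pair per row and, as shown in PredTransform above, thresholds the margin at zero. The sketch below uses the conventional hinge subgradient with labels in {0, 1}; since the hunk does not show the gradient formula itself, treat the constants as illustrative rather than as the exact in-tree implementation:

#include <iostream>
#include <limits>
#include <utility>

// Conventional hinge subgradient, labels in {0, 1} mapped to {-1, +1}.
std::pair<float, float> HingeGradient(float pred, float label, float weight) {
  const float y = label * 2.0f - 1.0f;
  if (pred * y < 1.0f) {
    return {-y * weight, weight};  // inside the margin: push the prediction out
  }
  return {0.0f, std::numeric_limits<float>::min()};  // outside: (almost) no update
}

// Matches the PredTransform in the hunk above: threshold the margin at zero.
float HingePredTransform(float pred) { return pred > 0.0f ? 1.0f : 0.0f; }

int main() {
  auto gh = HingeGradient(0.2f, 1.0f, 1.0f);
  std::cout << gh.first << " " << gh.second << "\n";  // prints "-1 1"
  std::cout << HingePredTransform(0.2f) << "\n";      // prints "1"
  return 0;
}
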

View File

@ -59,14 +59,14 @@ class SoftmaxMultiClassObj : public ObjFunction {
const int nclass = param_.num_class;
const auto ndata = static_cast<int64_t>(preds.Size() / nclass);
auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size());
out_gpair->Shard(GPUDistribution::Granular(devices, nclass));
info.labels_.Shard(GPUDistribution::Block(devices));
info.weights_.Shard(GPUDistribution::Block(devices));
preds.Shard(GPUDistribution::Granular(devices, nclass));
auto device = tparam_->gpu_id;
out_gpair->SetDevice(device);
info.labels_.SetDevice(device);
info.weights_.SetDevice(device);
preds.SetDevice(device);
label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size());
label_correct_.Shard(GPUDistribution::Block(devices));
label_correct_.Resize(1);
label_correct_.SetDevice(device);
out_gpair->Resize(preds.Size());
label_correct_.Fill(1);
@ -100,7 +100,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
p = label == k ? p - 1.0f : p;
gpair[idx * nclass + k] = GradientPair(p * wt, h);
}
}, common::Range{0, ndata}, devices, false)
}, common::Range{0, ndata}, device, false)
.Eval(out_gpair, &info.labels_, &preds, &info.weights_, &label_correct_);
std::vector<int>& label_correct_h = label_correct_.HostVector();
@ -125,7 +125,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
const auto ndata = static_cast<int64_t>(io_preds->Size() / nclass);
max_preds_.Resize(ndata);
auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size());
auto device = tparam_->gpu_id;
if (prob) {
common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {
@ -133,11 +133,11 @@ class SoftmaxMultiClassObj : public ObjFunction {
_preds.subspan(_idx * nclass, nclass);
common::Softmax(point.begin(), point.end());
},
common::Range{0, ndata}, GPUDistribution::Granular(devices, nclass))
common::Range{0, ndata}, device)
.Eval(io_preds);
} else {
io_preds->Shard(GPUDistribution::Granular(devices, nclass));
max_preds_.Shard(GPUDistribution::Block(devices));
io_preds->SetDevice(device);
max_preds_.SetDevice(device);
common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx,
common::Span<const bst_float> _preds,
@ -148,7 +148,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
common::FindMaxIndex(point.cbegin(),
point.cend()) - point.cbegin();
},
common::Range{0, ndata}, devices, false)
common::Range{0, ndata}, device, false)
.Eval(io_preds, &max_preds_);
}
if (!prob) {
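
PredTransform above either normalises each row of nclass scores with a softmax (prob == true) or writes the argmax class (prob == false). A host-side sketch of those two transforms; Softmax and FindMaxIndex here are simplified stand-ins for the common:: helpers:

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

// In-place softmax over one row of class scores (max-subtracted for stability).
void Softmax(std::vector<float>* row) {
  float wmax = *std::max_element(row->begin(), row->end());
  double wsum = 0.0;
  for (auto& v : *row) { v = std::exp(v - wmax); wsum += v; }
  for (auto& v : *row) { v = static_cast<float>(v / wsum); }
}

// Index of the highest score, i.e. the predicted class when prob == false.
size_t FindMaxIndex(const std::vector<float>& row) {
  return std::max_element(row.begin(), row.end()) - row.begin();
}

int main() {
  std::vector<float> scores{0.5f, 2.0f, -1.0f};
  std::cout << FindMaxIndex(scores) << "\n";  // prints "1"
  Softmax(&scores);
  for (float p : scores) std::cout << p << " ";  // probabilities summing to 1
  std::cout << "\n";
  return 0;
}
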

View File

@ -57,8 +57,8 @@ class RegLossObj : public ObjFunction {
<< "preds.size=" << preds.Size() << ", label.size=" << info.labels_.Size();
size_t ndata = preds.Size();
out_gpair->Resize(ndata);
auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size());
label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size());
auto device = tparam_->gpu_id;
label_correct_.Resize(1);
label_correct_.Fill(1);
bool is_null_weight = info.weights_.Size() == 0;
@ -83,7 +83,7 @@ class RegLossObj : public ObjFunction {
_out_gpair[_idx] = GradientPair(Loss::FirstOrderGradient(p, label) * w,
Loss::SecondOrderGradient(p, label) * w);
},
common::Range{0, static_cast<int64_t>(ndata)}, devices).Eval(
common::Range{0, static_cast<int64_t>(ndata)}, device).Eval(
&label_correct_, out_gpair, &preds, &info.labels_, &info.weights_);
// copy "label correct" flags back to host
@ -105,7 +105,7 @@ class RegLossObj : public ObjFunction {
[] XGBOOST_DEVICE(size_t _idx, common::Span<float> _preds) {
_preds[_idx] = Loss::PredTransform(_preds[_idx]);
}, common::Range{0, static_cast<int64_t>(io_preds->Size())},
GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size()))
tparam_->gpu_id)
.Eval(io_preds);
}
@ -175,8 +175,8 @@ class PoissonRegression : public ObjFunction {
CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided";
size_t ndata = preds.Size();
out_gpair->Resize(ndata);
auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size());
label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size());
auto device = tparam_->gpu_id;
label_correct_.Resize(1);
label_correct_.Fill(1);
bool is_null_weight = info.weights_.Size() == 0;
@ -197,7 +197,7 @@ class PoissonRegression : public ObjFunction {
_out_gpair[_idx] = GradientPair{(expf(p) - y) * w,
expf(p + max_delta_step) * w};
},
common::Range{0, static_cast<int64_t>(ndata)}, devices).Eval(
common::Range{0, static_cast<int64_t>(ndata)}, device).Eval(
&label_correct_, out_gpair, &preds, &info.labels_, &info.weights_);
// copy "label correct" flags back to host
std::vector<int>& label_correct_h = label_correct_.HostVector();
@ -213,7 +213,7 @@ class PoissonRegression : public ObjFunction {
_preds[_idx] = expf(_preds[_idx]);
},
common::Range{0, static_cast<int64_t>(io_preds->Size())},
GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size()))
tparam_->gpu_id)
.Eval(io_preds);
}
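
The Poisson objective above treats the margin p as a log-rate: the gradient is (exp(p) - y) * w, the hessian is exp(p + max_delta_step) * w, and PredTransform exponentiates predictions back to the rate scale. A small host sketch of exactly those formulas (PoissonGradient is an illustrative wrapper, not an in-tree function):

#include <cmath>
#include <iostream>
#include <utility>

// Gradient and hessian of the Poisson negative log-likelihood w.r.t. the margin p.
std::pair<float, float> PoissonGradient(float p, float y, float w, float max_delta_step) {
  return {(std::exp(p) - y) * w, std::exp(p + max_delta_step) * w};
}

// Matches PredTransform: predictions are exponentiated back to the rate scale.
float PoissonPredTransform(float p) { return std::exp(p); }

int main() {
  auto gh = PoissonGradient(0.0f, 2.0f, 1.0f, 0.7f);
  std::cout << gh.first << " " << gh.second << "\n";  // prints "-1" and exp(0.7)
  std::cout << PoissonPredTransform(1.0f) << "\n";    // prints e
  return 0;
}
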
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@ -340,9 +340,9 @@ class GammaRegression : public ObjFunction {
CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided";
const size_t ndata = preds.Size();
auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata);
auto device = tparam_->gpu_id;
out_gpair->Resize(ndata);
label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size());
label_correct_.Resize(1);
label_correct_.Fill(1);
const bool is_null_weight = info.weights_.Size() == 0;
@ -361,7 +361,7 @@ class GammaRegression : public ObjFunction {
}
_out_gpair[_idx] = GradientPair((1 - y / expf(p)) * w, y / expf(p) * w);
},
common::Range{0, static_cast<int64_t>(ndata)}, devices).Eval(
common::Range{0, static_cast<int64_t>(ndata)}, device).Eval(
&label_correct_, out_gpair, &preds, &info.labels_, &info.weights_);
// copy "label correct" flags back to host
@ -378,7 +378,7 @@ class GammaRegression : public ObjFunction {
_preds[_idx] = expf(_preds[_idx]);
},
common::Range{0, static_cast<int64_t>(io_preds->Size())},
GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size()))
tparam_->gpu_id)
.Eval(io_preds);
}
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@ -430,8 +430,8 @@ class TweedieRegression : public ObjFunction {
const size_t ndata = preds.Size();
out_gpair->Resize(ndata);
auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size());
label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size());
auto device = tparam_->gpu_id;
label_correct_.Resize(1);
label_correct_.Fill(1);
const bool is_null_weight = info.weights_.Size() == 0;
@ -455,7 +455,7 @@ class TweedieRegression : public ObjFunction {
std::exp((1 - rho) * p) + (2 - rho) * expf((2 - rho) * p);
_out_gpair[_idx] = GradientPair(grad * w, hess * w);
},
common::Range{0, static_cast<int64_t>(ndata), 1}, devices)
common::Range{0, static_cast<int64_t>(ndata), 1}, device)
.Eval(&label_correct_, out_gpair, &preds, &info.labels_, &info.weights_);
// copy "label correct" flags back to host
@ -472,7 +472,7 @@ class TweedieRegression : public ObjFunction {
_preds[_idx] = expf(_preds[_idx]);
},
common::Range{0, static_cast<int64_t>(io_preds->Size())},
GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size()))
tparam_->gpu_id)
.Eval(io_preds);
}

View File

@ -20,12 +20,6 @@ namespace predictor {
DMLC_REGISTRY_FILE_TAG(gpu_predictor);
template <typename IterT>
void IncrementOffset(IterT begin_itr, IterT end_itr, size_t amount) {
thrust::transform(begin_itr, end_itr, begin_itr,
[=] __device__(size_t elem) { return elem + amount; });
}
/**
* \struct DevicePredictionNode
*
@ -44,7 +38,7 @@ struct DevicePredictionNode {
int fidx;
int left_child_idx;
int right_child_idx;
NodeValue val;
NodeValue val{};
DevicePredictionNode(const RegTree::Node& n) { // NOLINT
static_assert(sizeof(DevicePredictionNode) == 16, "Size is not 16 bytes");
@ -200,58 +194,14 @@ __global__ void PredictKernel(common::Span<const DevicePredictionNode> d_nodes,
}
class GPUPredictor : public xgboost::Predictor {
protected:
struct DevicePredictionCacheEntry {
std::shared_ptr<DMatrix> data;
HostDeviceVector<bst_float> predictions;
};
private:
void DeviceOffsets(const HostDeviceVector<size_t>& data,
size_t total_size,
std::vector<size_t>* out_offsets) {
auto& offsets = *out_offsets;
offsets.resize(devices_.Size() + 1);
offsets[0] = 0;
#pragma omp parallel for schedule(static, 1) if (devices_.Size() > 1)
for (int shard = 0; shard < devices_.Size(); ++shard) {
int device = devices_.DeviceId(shard);
auto data_span = data.DeviceSpan(device);
dh::safe_cuda(cudaSetDevice(device));
if (data_span.size() == 0) {
offsets[shard + 1] = total_size;
} else {
// copy the last element from every shard
dh::safe_cuda(cudaMemcpy(&offsets.at(shard + 1),
&data_span[data_span.size()-1],
sizeof(size_t), cudaMemcpyDeviceToHost));
}
}
}
// This function populates the explicit offsets that can be used to create a window into the
// underlying host vector. The window starts from the `batch_offset` and has a size of
// `batch_size`, and is sharded across all the devices. Each shard is granular depending on
// the number of output classes `n_classes`.
void PredictionDeviceOffsets(size_t total_size, size_t batch_offset, size_t batch_size,
int n_classes, std::vector<size_t>* out_offsets) {
auto& offsets = *out_offsets;
size_t n_shards = devices_.Size();
offsets.resize(n_shards + 2);
size_t rows_per_shard = common::DivRoundUp(batch_size, n_shards);
for (size_t shard = 0; shard < devices_.Size(); ++shard) {
size_t n_rows = std::min(batch_size, shard * rows_per_shard);
offsets[shard] = batch_offset + n_rows * n_classes;
}
offsets[n_shards] = batch_offset + batch_size * n_classes;
offsets[n_shards + 1] = total_size;
}
struct DeviceShard {
DeviceShard() : device_{-1} {}
~DeviceShard() {
dh::safe_cuda(cudaSetDevice(device_));
if (device_ >= 0) {
dh::safe_cuda(cudaSetDevice(device_));
}
}
void Init(int device) {
@ -284,10 +234,9 @@ class GPUPredictor : public xgboost::Predictor {
void PredictInternal
(const SparsePage& batch, size_t num_features,
HostDeviceVector<bst_float>* predictions) {
if (predictions->DeviceSize(device_) == 0) { return; }
dh::safe_cuda(cudaSetDevice(device_));
const int BLOCK_THREADS = 128;
size_t num_rows = batch.offset.DeviceSize(device_) - 1;
size_t num_rows = batch.offset.DeviceSize() - 1;
const int GRID_SIZE = static_cast<int>(common::DivRoundUp(num_rows, BLOCK_THREADS));
int shared_memory_bytes = static_cast<int>
@ -297,14 +246,12 @@ class GPUPredictor : public xgboost::Predictor {
shared_memory_bytes = 0;
use_shared = false;
}
const auto& data_distr = batch.data.Distribution();
size_t entry_start = data_distr.ShardStart(batch.data.Size(),
data_distr.Devices().Index(device_));
size_t entry_start = 0;
PredictKernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS, shared_memory_bytes>>>
(dh::ToSpan(nodes_), predictions->DeviceSpan(device_), dh::ToSpan(tree_segments_),
dh::ToSpan(tree_group_), batch.offset.DeviceSpan(device_),
batch.data.DeviceSpan(device_), this->tree_begin_, this->tree_end_, num_features,
(dh::ToSpan(nodes_), predictions->DeviceSpan(), dh::ToSpan(tree_segments_),
dh::ToSpan(tree_group_), batch.offset.DeviceSpan(),
batch.data.DeviceSpan(), this->tree_begin_, this->tree_end_, num_features,
num_rows, entry_start, use_shared, this->num_group_);
}
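
PredictInternal launches one thread per row: blocks of 128 threads, a grid of DivRoundUp(num_rows, 128) blocks, and shared memory for the feature values unless that would exceed the device limit, in which case it falls back to global memory with use_shared = false. A host-side sketch of the sizing arithmetic (kBlockThreads and DivRoundUp mirror the constant and helper used above):

#include <cstddef>
#include <iostream>

constexpr int kBlockThreads = 128;

// Ceiling division used to cover all rows with fixed-size blocks.
size_t DivRoundUp(size_t a, size_t b) { return (a + b - 1) / b; }

int main() {
  size_t num_rows = 10001;
  size_t grid_size = DivRoundUp(num_rows, kBlockThreads);
  std::cout << grid_size << "\n";  // 79 blocks of 128 threads cover 10001 rows
  // If the per-block shared-memory requirement exceeded the device limit, the
  // kernel would fall back to global memory, i.e. shared_memory_bytes = 0 and
  // use_shared = false as in the hunk above.
  return 0;
}
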
@ -322,7 +269,7 @@ class GPUPredictor : public xgboost::Predictor {
void InitModel(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end) {
CHECK_EQ(model.param.size_leaf_vector, 0);
// Copy decision trees to device
thrust::host_vector<size_t> h_tree_segments;
thrust::host_vector<size_t> h_tree_segments{};
h_tree_segments.reserve((tree_end - tree_begin) + 1);
size_t sum = 0;
h_tree_segments.push_back(sum);
@ -337,9 +284,7 @@ class GPUPredictor : public xgboost::Predictor {
std::copy(src_nodes.begin(), src_nodes.end(),
h_nodes.begin() + h_tree_segments[tree_idx - tree_begin]);
}
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard &shard) {
shard.InitModel(model, h_tree_segments, h_nodes, tree_begin, tree_end);
});
shard_.InitModel(model, h_tree_segments, h_nodes, tree_begin, tree_end);
}
void DevicePredictInternal(DMatrix* dmat,
@ -352,40 +297,43 @@ class GPUPredictor : public xgboost::Predictor {
InitModel(model, tree_begin, tree_end);
size_t batch_offset = 0;
auto* preds = out_preds;
std::unique_ptr<HostDeviceVector<bst_float>> batch_preds{nullptr};
for (auto &batch : dmat->GetBatches<SparsePage>()) {
bool is_external_memory = batch.Size() < dmat->Info().num_row_;
if (is_external_memory) {
std::vector<size_t> out_preds_offsets;
PredictionDeviceOffsets(out_preds->Size(), batch_offset, batch.Size(),
model.param.num_output_group, &out_preds_offsets);
out_preds->Reshard(GPUDistribution::Explicit(devices_, out_preds_offsets));
batch_preds.reset(new HostDeviceVector<bst_float>);
batch_preds->Resize(batch.Size() * model.param.num_output_group);
std::copy(out_preds->ConstHostVector().begin() + batch_offset,
out_preds->ConstHostVector().begin() + batch_offset + batch_preds->Size(),
batch_preds->HostVector().begin());
preds = batch_preds.get();
}
batch.offset.Shard(GPUDistribution::Overlap(devices_, 1));
std::vector<size_t> device_offsets;
DeviceOffsets(batch.offset, batch.data.Size(), &device_offsets);
batch.data.Reshard(GPUDistribution::Explicit(devices_, device_offsets));
batch.offset.SetDevice(device_);
batch.data.SetDevice(device_);
preds->SetDevice(device_);
shard_.PredictInternal(batch, model.param.num_feature, preds);
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.PredictInternal(batch, model.param.num_feature, out_preds);
});
if (is_external_memory) {
auto h_preds = preds->ConstHostVector();
std::copy(h_preds.begin(), h_preds.end(), out_preds->HostVector().begin() + batch_offset);
}
batch_offset += batch.Size() * model.param.num_output_group;
}
out_preds->Reshard(GPUDistribution::Granular(devices_, model.param.num_output_group));
monitor_.StopCuda("DevicePredictInternal");
}
public:
GPUPredictor() = default;
GPUPredictor() : device_{-1} {};
void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
const gbm::GBTreeModel& model, int tree_begin,
unsigned ntree_limit = 0) override {
GPUSet devices = GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus,
dmat->Info().num_row_);
CHECK_NE(devices.Size(), 0);
ConfigureShards(devices);
int device = learner_param_->gpu_id;
CHECK_GE(device, 0);
ConfigureShard(device);
if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) {
return;
@ -408,10 +356,9 @@ class GPUPredictor : public xgboost::Predictor {
size_t n_classes = model.param.num_output_group;
size_t n = n_classes * info.num_row_;
const HostDeviceVector<bst_float>& base_margin = info.base_margin_;
out_preds->Shard(GPUDistribution::Granular(devices_, n_classes));
out_preds->Resize(n);
if (base_margin.Size() != 0) {
CHECK_EQ(out_preds->Size(), n);
CHECK_EQ(base_margin.Size(), n);
out_preds->Copy(base_margin);
} else {
out_preds->Fill(model.base_margin);
@ -427,7 +374,7 @@ class GPUPredictor : public xgboost::Predictor {
const HostDeviceVector<bst_float>& y = it->second.predictions;
if (y.Size() != 0) {
monitor_.StartCuda("PredictFromCache");
out_preds->Shard(y.Distribution());
out_preds->SetDevice(y.DeviceIdx());
out_preds->Resize(y.Size());
out_preds->Copy(y);
monitor_.StopCuda("PredictFromCache");
@ -500,25 +447,23 @@ class GPUPredictor : public xgboost::Predictor {
const std::vector<std::shared_ptr<DMatrix>>& cache) override {
Predictor::Configure(cfg, cache);
GPUSet devices = GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus);
ConfigureShards(devices);
int device = learner_param_->gpu_id;
if (device >= 0) {
ConfigureShard(device);
}
}
private:
/*! \brief Re configure shards when GPUSet is changed. */
void ConfigureShards(GPUSet devices) {
if (devices_ == devices) return;
void ConfigureShard(int device) {
if (device_ == device) return;
devices_ = devices;
shards_.clear();
shards_.resize(devices_.Size());
dh::ExecuteIndexShards(&shards_, [=](size_t i, DeviceShard& shard){
shard.Init(devices_.DeviceId(i));
});
device_ = device;
shard_.Init(device_);
}
std::vector<DeviceShard> shards_;
GPUSet devices_;
DeviceShard shard_;
int device_;
common::Monitor monitor_;
};

View File

@ -702,7 +702,7 @@ struct DeviceShard {
row_partitioner.reset(new RowPartitioner(device_id, n_rows));
dh::safe_cuda(cudaMemcpyAsync(
gpair.data(), dh_gpair->ConstDevicePointer(device_id),
gpair.data(), dh_gpair->ConstDevicePointer(),
gpair.size() * sizeof(GradientPair), cudaMemcpyHostToHost));
SubsampleGradientPair(device_id, gpair, param.subsample, row_begin_idx);
hist.Reset();
@ -745,8 +745,8 @@ struct DeviceShard {
for (auto i = 0ull; i < nidxs.size(); i++) {
auto nidx = nidxs[i];
auto p_feature_set = column_sampler.GetFeatureSet(tree.GetDepth(nidx));
p_feature_set->Shard(GPUSet(device_id, 1));
auto d_sampled_features = p_feature_set->DeviceSpan(device_id);
p_feature_set->SetDevice(device_id);
auto d_sampled_features = p_feature_set->DeviceSpan();
common::Span<int32_t> d_feature_set =
interaction_constraints.Query(d_sampled_features, nidx);
auto d_split_candidates =
@ -1016,7 +1016,7 @@ struct DeviceShard {
dh::AllReducer* reducer, int64_t num_columns) {
constexpr int kRootNIdx = 0;
const auto &gpair = gpair_all->DeviceSpan(device_id);
const auto &gpair = gpair_all->DeviceSpan();
dh::SumReduction(temp_memory, gpair, node_sum_gradients_d,
gpair.size());
@ -1294,11 +1294,8 @@ class GPUHistMakerSpecialised {
param_.InitAllowUnknown(args);
generic_param_ = generic_param;
hist_maker_param_.InitAllowUnknown(args);
auto devices = GPUSet::All(generic_param_->gpu_id,
generic_param_->n_gpus);
n_devices_ = devices.Size();
CHECK(n_devices_ != 0) << "Must have at least one device";
dist_ = GPUDistribution::Block(devices);
device_ = generic_param_->gpu_id;
CHECK_GE(device_, 0) << "Must have at least one device";
dh::CheckComputeCapability();
@ -1330,30 +1327,22 @@ class GPUHistMakerSpecialised {
void InitDataOnce(DMatrix* dmat) {
info_ = &dmat->Info();
int n_devices = dist_.Devices().Size();
device_list_.resize(n_devices);
for (int index = 0; index < n_devices; ++index) {
int device_id = dist_.Devices().DeviceId(index);
device_list_[index] = device_id;
}
reducer_.Init(device_list_);
reducer_.Init({device_});
// Synchronise the column sampling seed
uint32_t column_sampling_seed = common::GlobalRandom()();
rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
// Create device shards
shards_.resize(n_devices);
shards_.resize(1);
dh::ExecuteIndexShards(
&shards_,
[&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
dh::safe_cuda(cudaSetDevice(dist_.Devices().DeviceId(idx)));
size_t start = dist_.ShardStart(info_->num_row_, idx);
size_t size = dist_.ShardSize(info_->num_row_, idx);
dh::safe_cuda(cudaSetDevice(device_));
size_t start = 0;
size_t size = info_->num_row_;
shard = std::unique_ptr<DeviceShard<GradientSumT>>(
new DeviceShard<GradientSumT>(dist_.Devices().DeviceId(idx), idx,
new DeviceShard<GradientSumT>(device_, idx,
start, start + size, param_,
column_sampling_seed,
info_->num_col_));
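
InitDataOnce keeps the rabit broadcast of the column sampling seed so that, with one process per GPU, every worker draws identical feature subsets. A minimal sketch of that pattern, assuming a rabit tracker is running; rabit::Init, rabit::Broadcast and rabit::Finalize are the real rabit entry points, while the seeding around them is illustrative:

#include <cstdint>
#include <random>
#include <rabit/rabit.h>

int main(int argc, char* argv[]) {
  rabit::Init(argc, argv);
  // Every rank draws a seed, but only rank 0's value survives the broadcast,
  // so all workers end up sampling the same columns.
  std::uint32_t column_sampling_seed = std::random_device{}();
  rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), /*root=*/0);
  std::mt19937 column_sampler_rng(column_sampling_seed);  // identical on every worker
  (void)column_sampler_rng;
  rabit::Finalize();
  return 0;
}
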
@ -1436,7 +1425,7 @@ class GPUHistMakerSpecialised {
for (auto& tree : trees) {
tree = *p_tree;
}
gpair->Reshard(dist_);
gpair->SetDevice(device_);
// Launch one thread for each device "shard" containing a subset of rows.
// Threads will cooperatively build the tree, synchronising over histograms.
@ -1462,13 +1451,13 @@ class GPUHistMakerSpecialised {
return false;
}
monitor_.StartCuda("UpdatePredictionCache");
p_out_preds->Shard(dist_.Devices());
p_out_preds->SetDevice(device_);
dh::ExecuteIndexShards(
&shards_,
[&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
dh::safe_cuda(cudaSetDevice(shard->device_id));
shard->UpdatePredictionCache(
p_out_preds->DevicePointer(shard->device_id));
p_out_preds->DevicePointer());
});
monitor_.StopCuda("UpdatePredictionCache");
return true;
@ -1483,7 +1472,6 @@ class GPUHistMakerSpecialised {
private:
bool initialised_;
int n_devices_;
int n_bins_;
GPUHistMakerTrainParam hist_maker_param_;
@ -1492,11 +1480,9 @@ class GPUHistMakerSpecialised {
dh::AllReducer reducer_;
DMatrix* p_last_fmat_;
GPUDistribution dist_;
int device_;
common::Monitor monitor_;
/*! List storing device id. */
std::vector<int> device_list_;
};
class GPUHistMaker : public TreeUpdater {

View File

@ -1,37 +0,0 @@
#include "../../../src/common/common.h"
#include <gtest/gtest.h>
namespace xgboost {
TEST(GPUSet, Basic) {
GPUSet devices = GPUSet::Empty();
ASSERT_TRUE(devices.IsEmpty());
devices = GPUSet{0, 1};
ASSERT_TRUE(devices != GPUSet::Empty());
EXPECT_EQ(devices.Size(), 1);
devices = GPUSet::Range(1, 0);
EXPECT_EQ(devices.Size(), 0);
EXPECT_TRUE(devices.IsEmpty());
EXPECT_FALSE(devices.Contains(1));
devices = GPUSet::Range(2, -1);
EXPECT_EQ(devices, GPUSet::Empty());
EXPECT_EQ(devices.Size(), 0);
EXPECT_TRUE(devices.IsEmpty());
devices = GPUSet::Range(2, 8); // 2 ~ 10
EXPECT_EQ(devices.Size(), 8);
EXPECT_ANY_THROW(devices.DeviceId(8));
auto device_id = devices.DeviceId(0);
EXPECT_EQ(device_id, 2);
auto device_index = devices.Index(2);
EXPECT_EQ(device_index, 0);
#ifndef XGBOOST_USE_CUDA
EXPECT_EQ(GPUSet::AllVisible(), GPUSet::Empty());
#endif
}
} // namespace xgboost

View File

@ -1,83 +0,0 @@
#include <gtest/gtest.h>
#include <xgboost/logging.h>
#include "../../../src/common/common.h"
#include "../helpers.h"
#include <string>
namespace xgboost {
TEST(GPUSet, GPUBasic) {
GPUSet devices = GPUSet::Empty();
ASSERT_TRUE(devices.IsEmpty());
devices = GPUSet{1, 1};
ASSERT_TRUE(devices != GPUSet::Empty());
EXPECT_EQ(devices.Size(), 1);
EXPECT_EQ(*(devices.begin()), 1);
devices = GPUSet::Range(1, 0);
EXPECT_EQ(devices, GPUSet::Empty());
EXPECT_EQ(devices.Size(), 0);
EXPECT_TRUE(devices.IsEmpty());
EXPECT_FALSE(devices.Contains(1));
devices = GPUSet::Range(2, -1);
EXPECT_EQ(devices, GPUSet::Empty());
devices = GPUSet::Range(2, 8);
EXPECT_EQ(devices.Size(), 8);
EXPECT_EQ(*devices.begin(), 2);
EXPECT_EQ(*devices.end(), 2 + devices.Size());
EXPECT_EQ(8, devices.Size());
ASSERT_NO_THROW(GPUSet::AllVisible());
devices = GPUSet::AllVisible();
if (devices.IsEmpty()) {
LOG(WARNING) << "Empty devices.";
}
}
TEST(GPUSet, Verbose) {
{
std::map<std::string, std::string> args {};
args["verbosity"] = "3"; // LOG INFO
testing::internal::CaptureStderr();
ConsoleLogger::Configure({args.cbegin(), args.cend()});
GPUSet::All(0, 1);
std::string output = testing::internal::GetCapturedStderr();
ASSERT_NE(output.find("GPU ID: 0"), std::string::npos);
ASSERT_NE(output.find("GPUs: 1"), std::string::npos);
args["verbosity"] = "1"; // restore
ConsoleLogger::Configure({args.cbegin(), args.cend()});
}
}
#if defined(XGBOOST_USE_NCCL)
TEST(GPUSet, MGPU_GPUBasic) {
{
GPUSet devices = GPUSet::All(1, 1);
ASSERT_EQ(*(devices.begin()), 1);
ASSERT_EQ(*(devices.end()), 2);
ASSERT_EQ(devices.Size(), 1);
ASSERT_TRUE(devices.Contains(1));
}
{
GPUSet devices = GPUSet::All(0, -1);
ASSERT_GE(devices.Size(), 2);
}
// Specify number of rows.
{
GPUSet devices = GPUSet::All(0, -1, 1);
ASSERT_EQ(devices.Size(), 1);
}
}
#endif
} // namespace xgboost

View File

@ -87,8 +87,8 @@ TEST(ConfigParser, ParseKeyValuePair) {
ASSERT_TRUE(parser.ParseKeyValuePair("booster = gbtree", &key, &value));
ASSERT_EQ(key, "booster");
ASSERT_EQ(value, "gbtree");
ASSERT_TRUE(parser.ParseKeyValuePair("n_gpus = 2", &key, &value));
ASSERT_EQ(key, "n_gpus");
ASSERT_TRUE(parser.ParseKeyValuePair("gpu_id = 2", &key, &value));
ASSERT_EQ(key, "gpu_id");
ASSERT_EQ(value, "2");
ASSERT_TRUE(parser.ParseKeyValuePair("monotone_constraints = (1,0,-1)",
&key, &value));

View File

@ -18,7 +18,7 @@
namespace xgboost {
namespace common {
void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) {
void TestDeviceSketch(bool use_external_memory) {
// create the data
int nrows = 10001;
std::shared_ptr<xgboost::DMatrix> *dmat = nullptr;
@ -53,7 +53,7 @@ void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) {
// find the cuts on the GPU
HistogramCuts hmat_gpu;
size_t row_stride = DeviceSketch(p, CreateEmptyGenericParam(0, devices.Size()), gpu_batch_nrows,
size_t row_stride = DeviceSketch(p, CreateEmptyGenericParam(0), gpu_batch_nrows,
dmat->get(), &hmat_gpu);
// compare the row stride with the one obtained from the dmatrix
@ -81,11 +81,11 @@ void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) {
}
TEST(gpu_hist_util, DeviceSketch) {
TestDeviceSketch(GPUSet::Range(0, 1), false);
TestDeviceSketch(false);
}
TEST(gpu_hist_util, DeviceSketch_ExternalMemory) {
TestDeviceSketch(GPUSet::Range(0, 1), true);
TestDeviceSketch(true);
}
} // namespace common

View File

@ -30,45 +30,36 @@ struct HostDeviceVectorSetDeviceHandler {
}
};
void InitHostDeviceVector(size_t n, const GPUDistribution& distribution,
HostDeviceVector<int> *v) {
void InitHostDeviceVector(size_t n, int device, HostDeviceVector<int> *v) {
// create the vector
GPUSet devices = distribution.Devices();
v->Shard(distribution);
v->SetDevice(device);
v->Resize(n);
ASSERT_EQ(v->Size(), n);
ASSERT_TRUE(v->Distribution() == distribution);
ASSERT_TRUE(v->Devices() == devices);
// ensure that the devices have read-write access
for (int i = 0; i < devices.Size(); ++i) {
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kWrite));
}
ASSERT_EQ(v->DeviceIdx(), device);
// ensure that the device has read-write access
ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead));
ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kWrite));
// ensure that the host has no access
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite));
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead));
// fill in the data on the host
std::vector<int>& data_h = v->HostVector();
// ensure that the host has full access, while the devices have none
// ensure that the host has full access, while the device has none
ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead));
ASSERT_TRUE(v->HostCanAccess(GPUAccess::kWrite));
for (int i = 0; i < devices.Size(); ++i) {
ASSERT_FALSE(v->DeviceCanAccess(i, GPUAccess::kRead));
ASSERT_FALSE(v->DeviceCanAccess(i, GPUAccess::kWrite));
}
ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kRead));
ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kWrite));
ASSERT_EQ(data_h.size(), n);
std::copy_n(thrust::make_counting_iterator(0), n, data_h.begin());
}
void PlusOne(HostDeviceVector<int> *v) {
int n_devices = v->Devices().Size();
for (int i = 0; i < n_devices; ++i) {
SetDevice(i);
thrust::transform(v->tbegin(i), v->tend(i), v->tbegin(i),
[=]__device__(unsigned int a){ return a + 1; });
}
int device = v->DeviceIdx();
SetDevice(device);
thrust::transform(v->tbegin(), v->tend(), v->tbegin(),
[=]__device__(unsigned int a){ return a + 1; });
}
void CheckDevice(HostDeviceVector<int> *v,
@ -76,24 +67,24 @@ void CheckDevice(HostDeviceVector<int> *v,
const std::vector<size_t>& sizes,
unsigned int first, GPUAccess access) {
int n_devices = sizes.size();
ASSERT_EQ(v->Devices().Size(), n_devices);
ASSERT_EQ(n_devices, 1);
for (int i = 0; i < n_devices; ++i) {
ASSERT_EQ(v->DeviceSize(i), sizes.at(i));
ASSERT_EQ(v->DeviceSize(), sizes.at(i));
SetDevice(i);
ASSERT_TRUE(thrust::equal(v->tcbegin(i), v->tcend(i),
ASSERT_TRUE(thrust::equal(v->tcbegin(), v->tcend(),
thrust::make_counting_iterator(first + starts[i])));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead));
ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead));
// ensure that the device has at most the access specified by access
ASSERT_EQ(v->DeviceCanAccess(i, GPUAccess::kWrite), access == GPUAccess::kWrite);
ASSERT_EQ(v->DeviceCanAccess(GPUAccess::kWrite), access == GPUAccess::kWrite);
}
ASSERT_EQ(v->HostCanAccess(GPUAccess::kRead), access == GPUAccess::kRead);
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite));
for (int i = 0; i < n_devices; ++i) {
SetDevice(i);
ASSERT_TRUE(thrust::equal(v->tbegin(i), v->tend(i),
ASSERT_TRUE(thrust::equal(v->tbegin(), v->tend(),
thrust::make_counting_iterator(first + starts[i])));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kWrite));
ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead));
ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kWrite));
}
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead));
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite));
@ -107,20 +98,20 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
}
ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead));
ASSERT_EQ(v->HostCanAccess(GPUAccess::kWrite), access == GPUAccess::kWrite);
size_t n_devices = v->Devices().Size();
size_t n_devices = 1;
for (int i = 0; i < n_devices; ++i) {
ASSERT_EQ(v->DeviceCanAccess(i, GPUAccess::kRead), access == GPUAccess::kRead);
ASSERT_EQ(v->DeviceCanAccess(GPUAccess::kRead), access == GPUAccess::kRead);
// the devices should have no write access
ASSERT_FALSE(v->DeviceCanAccess(i, GPUAccess::kWrite));
ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kWrite));
}
}
void TestHostDeviceVector
(size_t n, const GPUDistribution& distribution,
(size_t n, int device,
const std::vector<size_t>& starts, const std::vector<size_t>& sizes) {
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVector<int> v;
InitHostDeviceVector(n, distribution, &v);
InitHostDeviceVector(n, device, &v);
CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead);
PlusOne(&v);
CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite);
@ -130,54 +121,24 @@ void TestHostDeviceVector
TEST(HostDeviceVector, TestBlock) {
size_t n = 1001;
int n_devices = 2;
auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices));
std::vector<size_t> starts{0, 501};
std::vector<size_t> sizes{501, 500};
TestHostDeviceVector(n, distribution, starts, sizes);
}
TEST(HostDeviceVector, TestGranular) {
size_t n = 3003;
int n_devices = 2;
auto distribution = GPUDistribution::Granular(GPUSet::Range(0, n_devices), 3);
std::vector<size_t> starts{0, 1503};
std::vector<size_t> sizes{1503, 1500};
TestHostDeviceVector(n, distribution, starts, sizes);
}
TEST(HostDeviceVector, TestOverlap) {
size_t n = 1001;
int n_devices = 2;
auto distribution = GPUDistribution::Overlap(GPUSet::Range(0, n_devices), 1);
std::vector<size_t> starts{0, 500};
std::vector<size_t> sizes{501, 501};
TestHostDeviceVector(n, distribution, starts, sizes);
}
TEST(HostDeviceVector, TestExplicit) {
size_t n = 1001;
int n_devices = 2;
std::vector<size_t> offsets{0, 550, 1001};
auto distribution = GPUDistribution::Explicit(GPUSet::Range(0, n_devices), offsets);
std::vector<size_t> starts{0, 550};
std::vector<size_t> sizes{550, 451};
TestHostDeviceVector(n, distribution, starts, sizes);
int device = 0;
std::vector<size_t> starts{0};
std::vector<size_t> sizes{1001};
TestHostDeviceVector(n, device, starts, sizes);
}
TEST(HostDeviceVector, TestCopy) {
size_t n = 1001;
int n_devices = 2;
auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices));
std::vector<size_t> starts{0, 501};
std::vector<size_t> sizes{501, 500};
int device = 0;
std::vector<size_t> starts{0};
std::vector<size_t> sizes{1001};
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVector<int> v;
{
// a separate scope to ensure that v1 is gone before further checks
HostDeviceVector<int> v1;
InitHostDeviceVector(n, distribution, &v1);
InitHostDeviceVector(n, device, &v1);
v = v1;
}
CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead);
@ -193,16 +154,16 @@ TEST(HostDeviceVector, Shard) {
h_vec[i] = i;
}
HostDeviceVector<int> vec (h_vec);
auto devices = GPUSet::Range(0, 1);
auto device = 0;
vec.Shard(devices);
ASSERT_EQ(vec.DeviceSize(0), h_vec.size());
vec.SetDevice(device);
ASSERT_EQ(vec.DeviceSize(), h_vec.size());
ASSERT_EQ(vec.Size(), h_vec.size());
auto span = vec.DeviceSpan(0); // sync to device
auto span = vec.DeviceSpan(); // sync to device
vec.Reshard(GPUDistribution::Empty()); // pull back to cpu, empty devices.
vec.SetDevice(-1); // pull back to cpu.
ASSERT_EQ(vec.Size(), h_vec.size());
ASSERT_TRUE(vec.Devices().IsEmpty());
ASSERT_EQ(vec.DeviceIdx(), -1);
auto h_vec_1 = vec.HostVector();
ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
@ -214,16 +175,16 @@ TEST(HostDeviceVector, Reshard) {
h_vec[i] = i;
}
HostDeviceVector<int> vec (h_vec);
auto devices = GPUSet::Range(0, 1);
auto device = 0;
vec.Shard(devices);
ASSERT_EQ(vec.DeviceSize(0), h_vec.size());
vec.SetDevice(device);
ASSERT_EQ(vec.DeviceSize(), h_vec.size());
ASSERT_EQ(vec.Size(), h_vec.size());
PlusOne(&vec);
vec.Reshard(GPUDistribution::Empty());
vec.SetDevice(-1);
ASSERT_EQ(vec.Size(), h_vec.size());
ASSERT_TRUE(vec.Devices().IsEmpty());
ASSERT_EQ(vec.DeviceIdx(), -1);
auto h_vec_1 = vec.HostVector();
for (size_t i = 0; i < h_vec_1.size(); ++i) {
@ -233,97 +194,14 @@ TEST(HostDeviceVector, Reshard) {
TEST(HostDeviceVector, Span) {
HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
vec.Shard(GPUSet{0, 1});
auto span = vec.DeviceSpan(0);
ASSERT_EQ(vec.DeviceSize(0), span.size());
ASSERT_EQ(vec.DevicePointer(0), span.data());
auto const_span = vec.ConstDeviceSpan(0);
ASSERT_EQ(vec.DeviceSize(0), span.size());
ASSERT_EQ(vec.ConstDevicePointer(0), span.data());
vec.SetDevice(0);
auto span = vec.DeviceSpan();
ASSERT_EQ(vec.DeviceSize(), span.size());
ASSERT_EQ(vec.DevicePointer(), span.data());
auto const_span = vec.ConstDeviceSpan();
ASSERT_EQ(vec.DeviceSize(), span.size());
ASSERT_EQ(vec.ConstDevicePointer(), span.data());
}
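
Taken together, the rewritten tests pin down the single-GPU HostDeviceVector contract: SetDevice binds the vector to one ordinal (or -1 for host-only), DeviceSpan/DeviceSize/DeviceIdx operate on that single device, and HostVector migrates the data back to the host. A short usage sketch of that API under a CUDA build; only the method names come from the tests, the surrounding program is illustrative:

#include <vector>
#include <xgboost/host_device_vector.h>

int main() {
  std::vector<int> h_vec(16, 1);
  xgboost::HostDeviceVector<int> vec(h_vec);

  vec.SetDevice(0);                     // bind the vector to GPU 0
  auto d_span = vec.DeviceSpan();       // syncs to the device on first access
  (void)d_span;                         // d_span.size() == vec.DeviceSize()

  vec.SetDevice(-1);                    // -1 pulls the data back to the host
  const auto& host = vec.HostVector();  // host access is valid again
  return host.size() == vec.Size() ? 0 : 1;
}
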
// Multi-GPUs' test
#if defined(XGBOOST_USE_NCCL)
TEST(HostDeviceVector, MGPU_Shard) {
auto devices = GPUSet::AllVisible();
if (devices.Size() < 2) {
LOG(WARNING) << "Not testing in multi-gpu environment.";
return;
}
std::vector<int> h_vec (2345);
for (size_t i = 0; i < h_vec.size(); ++i) {
h_vec[i] = i;
}
HostDeviceVector<int> vec (h_vec);
// Data size for each device.
std::vector<size_t> devices_size (devices.Size());
// From CPU to GPUs.
vec.Shard(devices);
size_t total_size = 0;
for (size_t i = 0; i < devices.Size(); ++i) {
total_size += vec.DeviceSize(i);
devices_size[i] = vec.DeviceSize(i);
}
ASSERT_EQ(total_size, h_vec.size());
ASSERT_EQ(total_size, vec.Size());
// Shard from devices to devices with different distribution.
EXPECT_ANY_THROW(
vec.Shard(GPUDistribution::Granular(devices, 12)));
// All data is drawn back to CPU
vec.Reshard(GPUDistribution::Empty());
ASSERT_TRUE(vec.Devices().IsEmpty());
ASSERT_EQ(vec.Size(), h_vec.size());
vec.Shard(GPUDistribution::Granular(devices, 12));
total_size = 0;
for (size_t i = 0; i < devices.Size(); ++i) {
total_size += vec.DeviceSize(i);
devices_size[i] = vec.DeviceSize(i);
}
ASSERT_EQ(total_size, h_vec.size());
ASSERT_EQ(total_size, vec.Size());
}
TEST(HostDeviceVector, MGPU_Reshard) {
auto devices = GPUSet::AllVisible();
if (devices.Size() < 2) {
LOG(WARNING) << "Not testing in multi-gpu environment.";
return;
}
size_t n = 1001;
int n_devices = 2;
auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices));
std::vector<size_t> starts{0, 501};
std::vector<size_t> sizes{501, 500};
HostDeviceVector<int> v;
InitHostDeviceVector(n, distribution, &v);
CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead);
PlusOne(&v);
CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite);
CheckHost(&v, GPUAccess::kRead);
CheckHost(&v, GPUAccess::kWrite);
auto distribution1 = GPUDistribution::Overlap(GPUSet::Range(0, n_devices), 1);
v.Reshard(distribution1);
for (size_t i = 0; i < n_devices; ++i) {
auto span = v.DeviceSpan(i); // sync to device
}
std::vector<size_t> starts1{0, 500};
std::vector<size_t> sizes1{501, 501};
CheckDevice(&v, starts1, sizes1, 1, GPUAccess::kWrite);
CheckHost(&v, GPUAccess::kRead);
CheckHost(&v, GPUAccess::kWrite);
}
#endif
} // namespace common
} // namespace xgboost

View File

@ -33,7 +33,7 @@ std::string GetModelStr() {
},
"configuration": {
"booster": "gbtree",
"n_gpus": "1",
"gpu_id": "0",
"num_class": "0",
"num_feature": "10",
"objective": "reg:linear",

View File

@ -9,13 +9,11 @@
#if defined(__CUDACC__)
#define TRANSFORM_GPU_RANGE GPUSet::Range(0, 1)
#define TRANSFORM_GPU_DIST GPUDistribution::Block(GPUSet::Range(0, 1))
#define TRANSFORM_GPU 0
#else
#define TRANSFORM_GPU_RANGE GPUSet::Empty()
#define TRANSFORM_GPU_DIST GPUDistribution::Block(GPUSet::Empty())
#define TRANSFORM_GPU -1
#endif
@ -46,13 +44,13 @@ TEST(Transform, DeclareUnifiedTest(Basic)) {
std::vector<bst_float> h_sol(size);
InitializeRange(h_sol.begin(), h_sol.end());
const HostDeviceVector<bst_float> in_vec{h_in, TRANSFORM_GPU_DIST};
HostDeviceVector<bst_float> out_vec{h_out, TRANSFORM_GPU_DIST};
const HostDeviceVector<bst_float> in_vec{h_in, TRANSFORM_GPU};
HostDeviceVector<bst_float> out_vec{h_out, TRANSFORM_GPU};
out_vec.Fill(0);
Transform<>::Init(TestTransformRange<bst_float>{},
Range{0, static_cast<Range::DifferenceType>(size)},
TRANSFORM_GPU_RANGE)
TRANSFORM_GPU)
.Eval(&out_vec, &in_vec);
std::vector<bst_float> res = out_vec.HostVector();

View File

@ -5,87 +5,13 @@
namespace xgboost {
namespace common {
// Test here is multi gpu specific
TEST(Transform, MGPU_Basic) {
auto devices = GPUSet::AllVisible();
CHECK_GT(devices.Size(), 1);
const size_t size {256};
std::vector<bst_float> h_in(size);
std::vector<bst_float> h_out(size);
InitializeRange(h_in.begin(), h_in.end());
std::vector<bst_float> h_sol(size);
InitializeRange(h_sol.begin(), h_sol.end());
const HostDeviceVector<bst_float> in_vec {h_in,
GPUDistribution::Block(GPUSet::Empty())};
HostDeviceVector<bst_float> out_vec {h_out,
GPUDistribution::Block(GPUSet::Empty())};
out_vec.Fill(0);
in_vec.Shard(GPUDistribution::Granular(devices, 8));
out_vec.Shard(GPUDistribution::Block(devices));
// Granularity is different, sharding will throw.
EXPECT_ANY_THROW(
Transform<>::Init(TestTransformRange<bst_float>{}, Range{0, size}, devices)
.Eval(&out_vec, &in_vec));
Transform<>::Init(TestTransformRange<bst_float>{}, Range{0, size},
devices, false).Eval(&out_vec, &in_vec);
std::vector<bst_float> res = out_vec.HostVector();
ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin()));
}
// Test for multi-classes setting.
template <typename T>
struct TestTransformRangeGranular {
const size_t granularity = 8;
explicit TestTransformRangeGranular(const size_t granular) : granularity{granular} {}
void XGBOOST_DEVICE operator()(size_t _idx,
Span<bst_float> _out, Span<const bst_float> _in) {
auto in_sub = _in.subspan(_idx * granularity, granularity);
auto out_sub = _out.subspan(_idx * granularity, granularity);
for (size_t i = 0; i < granularity; ++i) {
out_sub[i] = in_sub[i];
}
}
};
TEST(Transform, MGPU_Granularity) {
GPUSet devices = GPUSet::All(0, -1);
const size_t size {8990};
const size_t granularity = 10;
GPUDistribution distribution =
GPUDistribution::Granular(devices, granularity);
std::vector<bst_float> h_in(size);
std::vector<bst_float> h_out(size);
InitializeRange(h_in.begin(), h_in.end());
std::vector<bst_float> h_sol(size);
InitializeRange(h_sol.begin(), h_sol.end());
const HostDeviceVector<bst_float> in_vec {h_in, distribution};
HostDeviceVector<bst_float> out_vec {h_out, distribution};
ASSERT_NO_THROW(
Transform<>::Init(
TestTransformRangeGranular<bst_float>{granularity},
Range{0, size / granularity},
distribution)
.Eval(&out_vec, &in_vec));
std::vector<bst_float> res = out_vec.HostVector();
ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin()));
}
TEST(Transform, MGPU_SpecifiedGpuId) {
if (AllVisibleGPUs() < 2) {
LOG(WARNING) << "Not testing in multi-gpu environment.";
return;
}
// Use 1 GPU, Numbering of GPU starts from 1
auto devices = GPUSet::All(1, 1);
auto device = 1;
const size_t size {256};
std::vector<bst_float> h_in(size);
std::vector<bst_float> h_out(size);
@ -93,13 +19,11 @@ TEST(Transform, MGPU_SpecifiedGpuId) {
std::vector<bst_float> h_sol(size);
InitializeRange(h_sol.begin(), h_sol.end());
const HostDeviceVector<bst_float> in_vec {h_in,
GPUDistribution::Block(devices)};
HostDeviceVector<bst_float> out_vec {h_out,
GPUDistribution::Block(devices)};
const HostDeviceVector<bst_float> in_vec {h_in, device};
HostDeviceVector<bst_float> out_vec {h_out, device};
ASSERT_NO_THROW(
Transform<>::Init(TestTransformRange<bst_float>{}, Range{0, size}, devices)
Transform<>::Init(TestTransformRange<bst_float>{}, Range{0, size}, device)
.Eval(&out_vec, &in_vec));
std::vector<bst_float> res = out_vec.HostVector();
ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin()));

View File

@ -12,7 +12,7 @@ TEST(GBTree, SelectTreeMethod) {
auto p_dmat {(*p_shared_ptr_dmat).get()};
GenericParameter generic_param;
generic_param.InitAllowUnknown(std::vector<Arg>{Arg("n_gpus", "0")});
generic_param.InitAllowUnknown(std::vector<Arg>{});
std::unique_ptr<GradientBooster> p_gbm{
GradientBooster::Create("gbtree", &generic_param, {}, 0)};
auto& gbtree = dynamic_cast<gbm::GBTree&> (*p_gbm);
@ -35,7 +35,7 @@ TEST(GBTree, SelectTreeMethod) {
Arg{"num_feature", n_feat}}, p_dmat);
ASSERT_EQ(tparam.updater_seq, "grow_quantile_histmaker");
#ifdef XGBOOST_USE_CUDA
generic_param.InitAllowUnknown(std::vector<Arg>{Arg{"n_gpus", "1"}});
generic_param.InitAllowUnknown(std::vector<Arg>{Arg{"gpu_id", "0"}});
gbtree.ConfigureWithKnownData({Arg("tree_method", "gpu_hist"), Arg("num_feature", n_feat)},
p_dmat);
ASSERT_EQ(tparam.updater_seq, "grow_gpu_hist");

View File

@ -29,9 +29,9 @@
#endif
#if defined(__CUDACC__)
#define NGPUS 1
#define GPUIDX 0
#else
#define NGPUS 0
#define GPUIDX -1
#endif
bool FileExists(const std::string& filename);
@ -189,11 +189,10 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_c
gbm::GBTreeModel CreateTestModel();
inline GenericParameter CreateEmptyGenericParam(int gpu_id, int n_gpus) {
inline GenericParameter CreateEmptyGenericParam(int gpu_id) {
xgboost::GenericParameter tparam;
std::vector<std::pair<std::string, std::string>> args {
{"gpu_id", std::to_string(gpu_id)},
{"n_gpus", std::to_string(n_gpus)}};
{"gpu_id", std::to_string(gpu_id)}};
tparam.Init(args);
return tparam;
}

View File

@ -7,7 +7,7 @@
TEST(Linear, shotgun) {
auto mat = xgboost::CreateDMatrix(10, 10, 0);
auto lparam = xgboost::CreateEmptyGenericParam(0, 0);
auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
{
auto updater = std::unique_ptr<xgboost::LinearUpdater>(
xgboost::LinearUpdater::Create("shotgun", &lparam));
@ -33,7 +33,7 @@ TEST(Linear, shotgun) {
TEST(Linear, coordinate) {
auto mat = xgboost::CreateDMatrix(10, 10, 0);
auto lparam = xgboost::CreateEmptyGenericParam(0, 0);
auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
auto updater = std::unique_ptr<xgboost::LinearUpdater>(
xgboost::LinearUpdater::Create("coord_descent", &lparam));
updater->Configure({{"eta", "1."}});

View File

@ -7,8 +7,7 @@ namespace xgboost {
TEST(Linear, GPUCoordinate) {
auto mat = xgboost::CreateDMatrix(10, 10, 0);
auto lparam = CreateEmptyGenericParam(0, 1);
lparam.n_gpus = 1;
auto lparam = CreateEmptyGenericParam(GPUIDX);
auto updater = std::unique_ptr<xgboost::LinearUpdater>(
xgboost::LinearUpdater::Create("gpu_coord_descent", &lparam));
updater->Configure({{"eta", "1."}});

View File

@ -6,7 +6,7 @@
#include "../helpers.h"
TEST(Metric, DeclareUnifiedTest(RMSE)) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("rmse", &lparam);
metric->Configure({});
ASSERT_STREQ(metric->Name(), "rmse");
@ -20,7 +20,7 @@ TEST(Metric, DeclareUnifiedTest(RMSE)) {
}
TEST(Metric, DeclareUnifiedTest(RMSLE)) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("rmsle", &lparam);
metric->Configure({});
ASSERT_STREQ(metric->Name(), "rmsle");
@ -32,7 +32,7 @@ TEST(Metric, DeclareUnifiedTest(RMSLE)) {
}
TEST(Metric, DeclareUnifiedTest(MAE)) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("mae", &lparam);
metric->Configure({});
ASSERT_STREQ(metric->Name(), "mae");
@ -45,7 +45,7 @@ TEST(Metric, DeclareUnifiedTest(MAE)) {
}
TEST(Metric, DeclareUnifiedTest(LogLoss)) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("logloss", &lparam);
metric->Configure({});
ASSERT_STREQ(metric->Name(), "logloss");
@ -58,7 +58,7 @@ TEST(Metric, DeclareUnifiedTest(LogLoss)) {
}
TEST(Metric, DeclareUnifiedTest(Error)) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("error", &lparam);
metric->Configure({});
ASSERT_STREQ(metric->Name(), "error");
@ -90,7 +90,7 @@ TEST(Metric, DeclareUnifiedTest(Error)) {
}
TEST(Metric, DeclareUnifiedTest(PoissionNegLogLik)) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("poisson-nloglik", &lparam);
metric->Configure({});
ASSERT_STREQ(metric->Name(), "poisson-nloglik");

View File

@ -4,7 +4,7 @@
#include "../helpers.h"
TEST(Metric, UnknownMetric) {
auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = nullptr;
EXPECT_ANY_THROW(metric = xgboost::Metric::Create("unknown_name", &tparam));
EXPECT_NO_THROW(metric = xgboost::Metric::Create("rmse", &tparam));

View File

@ -4,10 +4,9 @@
#include "../helpers.h"
inline void TestMultiClassError(xgboost::GPUSet const& devices) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
lparam.gpu_id = *devices.begin();
lparam.n_gpus = devices.Size();
inline void TestMultiClassError(int device) {
auto lparam = xgboost::CreateEmptyGenericParam(device);
lparam.gpu_id = device;
xgboost::Metric * metric = xgboost::Metric::Create("merror", &lparam);
metric->Configure({});
ASSERT_STREQ(metric->Name(), "merror");
@ -23,14 +22,12 @@ inline void TestMultiClassError(xgboost::GPUSet const& devices) {
}
TEST(Metric, DeclareUnifiedTest(MultiClassError)) {
auto devices = xgboost::GPUSet::Range(0, NGPUS);
TestMultiClassError(devices);
TestMultiClassError(GPUIDX);
}
inline void TestMultiClassLogLoss(xgboost::GPUSet const& devices) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
lparam.gpu_id = *devices.begin();
lparam.n_gpus = devices.Size();
inline void TestMultiClassLogLoss(int device) {
auto lparam = xgboost::CreateEmptyGenericParam(device);
lparam.gpu_id = device;
xgboost::Metric * metric = xgboost::Metric::Create("mlogloss", &lparam);
metric->Configure({});
ASSERT_STREQ(metric->Name(), "mlogloss");
@ -46,27 +43,31 @@ inline void TestMultiClassLogLoss(xgboost::GPUSet const& devices) {
}
TEST(Metric, DeclareUnifiedTest(MultiClassLogLoss)) {
auto devices = xgboost::GPUSet::Range(0, NGPUS);
TestMultiClassLogLoss(devices);
TestMultiClassLogLoss(GPUIDX);
}
#if defined(XGBOOST_USE_NCCL) && defined(__CUDACC__)
namespace xgboost {
namespace common {
TEST(Metric, MGPU_MultiClassError) {
if (AllVisibleGPUs() < 2) {
LOG(WARNING) << "Not testing in multi-gpu environment.";
return;
}
{
auto devices = xgboost::GPUSet::All(0, -1);
TestMultiClassError(devices);
TestMultiClassError(0);
}
{
auto devices = xgboost::GPUSet::All(1, -1);
TestMultiClassError(devices);
TestMultiClassError(1);
}
{
auto devices = xgboost::GPUSet::All(0, -1);
TestMultiClassLogLoss(devices);
TestMultiClassLogLoss(0);
}
{
auto devices = xgboost::GPUSet::All(1, -1);
TestMultiClassLogLoss(devices);
TestMultiClassLogLoss(1);
}
}
} // namespace common
} // namespace xgboost
#endif  // defined(XGBOOST_USE_NCCL) && defined(__CUDACC__)

View File

@ -4,7 +4,7 @@
#include "../helpers.h"
TEST(Metric, AMS) {
auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
EXPECT_ANY_THROW(xgboost::Metric::Create("ams", &tparam));
xgboost::Metric * metric = xgboost::Metric::Create("ams@0.5f", &tparam);
ASSERT_STREQ(metric->Name(), "ams@0.5");
@ -23,7 +23,7 @@ TEST(Metric, AMS) {
}
TEST(Metric, AUC) {
auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("auc", &tparam);
ASSERT_STREQ(metric->Name(), "auc");
EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
@ -38,7 +38,7 @@ TEST(Metric, AUC) {
}
TEST(Metric, AUCPR) {
auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric *metric = xgboost::Metric::Create("aucpr", &tparam);
ASSERT_STREQ(metric->Name(), "aucpr");
EXPECT_NEAR(GetMetricEval(metric, {0, 0, 1, 1}, {0, 0, 1, 1}), 1, 1e-10);
@ -65,7 +65,7 @@ TEST(Metric, Precision) {
// When the limit for precision is not given, it takes the limit at
// std::numeric_limits<unsigned>::max(); hence all values are very small
// NOTE(AbdealiJK): Maybe this should be fixed to be num_row by default.
auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("pre", &tparam);
ASSERT_STREQ(metric->Name(), "pre");
EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-7);
@ -89,7 +89,7 @@ TEST(Metric, Precision) {
}
TEST(Metric, NDCG) {
auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("ndcg", &tparam);
ASSERT_STREQ(metric->Name(), "ndcg");
EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {}));
@ -147,7 +147,7 @@ TEST(Metric, NDCG) {
}
TEST(Metric, MAP) {
auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("map", &tparam);
ASSERT_STREQ(metric->Name(), "map");
EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);

View File

@ -6,7 +6,7 @@
#include "../helpers.h"
TEST(Objective, DeclareUnifiedTest(HingeObj)) {
xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("binary:hinge", &tparam);
xgboost::bst_float eps = std::numeric_limits<xgboost::bst_float>::min();

View File

@ -7,7 +7,7 @@
#include "../helpers.h"
TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassObjGPair)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args {{"num_class", "3"}};
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("multi:softmax", &lparam);
@ -25,7 +25,7 @@ TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassObjGPair)) {
}
TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassBasic)) {
auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args{
std::pair<std::string, std::string>("num_class", "3")};
@ -47,7 +47,7 @@ TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassBasic)) {
}
TEST(Objective, DeclareUnifiedTest(SoftprobMultiClassBasic)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args {
std::pair<std::string, std::string>("num_class", "3")};

View File

@ -7,7 +7,7 @@
#include "../helpers.h"
TEST(Objective, DeclareUnifiedTest(LinearRegressionGPair)) {
xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj =
@ -32,7 +32,7 @@ TEST(Objective, DeclareUnifiedTest(LinearRegressionGPair)) {
}
TEST(Objective, DeclareUnifiedTest(SquaredLog)) {
xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj =
@ -56,7 +56,7 @@ TEST(Objective, DeclareUnifiedTest(SquaredLog)) {
}
TEST(Objective, DeclareUnifiedTest(LogisticRegressionGPair)) {
xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:logistic", &tparam);
@ -72,7 +72,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRegressionGPair)) {
}
TEST(Objective, DeclareUnifiedTest(LogisticRegressionBasic)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:logistic", &lparam);
@ -102,7 +102,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRegressionBasic)) {
}
TEST(Objective, DeclareUnifiedTest(LogisticRawGPair)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("binary:logitraw", &lparam);
@ -118,7 +118,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRawGPair)) {
}
TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("count:poisson", &lparam);
@ -140,7 +140,7 @@ TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) {
}
TEST(Objective, DeclareUnifiedTest(PoissonRegressionBasic)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("count:poisson", &lparam);
@ -168,7 +168,7 @@ TEST(Objective, DeclareUnifiedTest(PoissonRegressionBasic)) {
}
TEST(Objective, DeclareUnifiedTest(GammaRegressionGPair)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:gamma", &lparam);
@ -189,7 +189,7 @@ TEST(Objective, DeclareUnifiedTest(GammaRegressionGPair)) {
}
TEST(Objective, DeclareUnifiedTest(GammaRegressionBasic)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:gamma", &lparam);
@ -217,7 +217,7 @@ TEST(Objective, DeclareUnifiedTest(GammaRegressionBasic)) {
}
TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:tweedie", &lparam);
@ -241,7 +241,7 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) {
#if defined(__CUDACC__)
TEST(Objective, CPU_vs_CUDA) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, 1);
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::ObjFunction * obj =
xgboost::ObjFunction::Create("reg:squarederror", &lparam);
@ -267,12 +267,12 @@ TEST(Objective, CPU_vs_CUDA) {
{
// CPU
lparam.n_gpus = 0;
lparam.gpu_id = -1;
obj->GetGradient(preds, info, 0, &cpu_out_preds);
}
{
// CUDA
lparam.n_gpus = 1;
lparam.gpu_id = 0;
obj->GetGradient(preds, info, 0, &cuda_out_preds);
}
@ -294,7 +294,7 @@ TEST(Objective, CPU_vs_CUDA) {
#endif
TEST(Objective, DeclareUnifiedTest(TweedieRegressionBasic)) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:tweedie", &lparam);
@ -325,7 +325,7 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionBasic)) {
// CoxRegression not implemented in GPU code, no need for testing.
#if !defined(__CUDACC__)
TEST(Objective, CoxRegressionGPair) {
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, 0);
xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
std::vector<std::pair<std::string, std::string>> args;
xgboost::ObjFunction * obj =
xgboost::ObjFunction::Create("survival:cox", &lparam);

View File

@ -6,7 +6,7 @@
namespace xgboost {
TEST(Plugin, ExampleObjective) {
xgboost::GenericParameter tparam = CreateEmptyGenericParam(0, 0);
xgboost::GenericParameter tparam = CreateEmptyGenericParam(GPUIDX);
auto * obj = xgboost::ObjFunction::Create("mylogistic", &tparam);
ASSERT_EQ(obj->DefaultEvalMetric(), std::string{"error"});
delete obj;

View File

@ -6,7 +6,7 @@
namespace xgboost {
TEST(cpu_predictor, Test) {
auto lparam = CreateEmptyGenericParam(0, 0);
auto lparam = CreateEmptyGenericParam(GPUIDX);
std::unique_ptr<Predictor> cpu_predictor =
std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
@ -59,7 +59,7 @@ TEST(cpu_predictor, ExternalMemoryTest) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(12, 64, filename);
auto lparam = CreateEmptyGenericParam(0, 0);
auto lparam = CreateEmptyGenericParam(GPUIDX);
std::unique_ptr<Predictor> cpu_predictor =
std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));

View File

@ -33,8 +33,8 @@ namespace xgboost {
namespace predictor {
TEST(gpu_predictor, Test) {
auto cpu_lparam = CreateEmptyGenericParam(0, 0);
auto gpu_lparam = CreateEmptyGenericParam(0, 1);
auto cpu_lparam = CreateEmptyGenericParam(-1);
auto gpu_lparam = CreateEmptyGenericParam(0);
std::unique_ptr<Predictor> gpu_predictor =
std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &gpu_lparam));
@ -69,7 +69,7 @@ TEST(gpu_predictor, Test) {
}
TEST(gpu_predictor, ExternalMemoryTest) {
auto lparam = CreateEmptyGenericParam(0, 1);
auto lparam = CreateEmptyGenericParam(0);
std::unique_ptr<Predictor> gpu_predictor =
std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &lparam));
gpu_predictor->Configure({}, {});
@ -83,26 +83,26 @@ TEST(gpu_predictor, ExternalMemoryTest) {
std::string file1 = tmpdir.path + "/big_1.libsvm";
std::string file2 = tmpdir.path + "/big_2.libsvm";
dmats.push_back(CreateSparsePageDMatrix(9, 64UL, file0));
dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1));
dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2));
// dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1));
// dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2));
for (const auto& dmat: dmats) {
// Test predict batch
dmat->Info().base_margin_.Resize(dmat->Info().num_row_ * n_classes, 0.5);
HostDeviceVector<float> out_predictions;
gpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
EXPECT_EQ(out_predictions.Size(), dmat->Info().num_row_ * n_classes);
const std::vector<float> &host_vector = out_predictions.ConstHostVector();
for (size_t i = 0; i < host_vector.size() / n_classes; i++) {
ASSERT_EQ(host_vector[i * n_classes], 1.5);
ASSERT_EQ(host_vector[i * n_classes + 1], 0.);
ASSERT_EQ(host_vector[i * n_classes + 2], 0.);
ASSERT_EQ(host_vector[i * n_classes], 2.0);
ASSERT_EQ(host_vector[i * n_classes + 1], 0.5);
ASSERT_EQ(host_vector[i * n_classes + 2], 0.5);
}
}
}
// Test whether pickling preserves predictor parameters
TEST(gpu_predictor, PicklingTest) {
int const ngpu = 1;
int const gpuid = 0;
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
@ -134,7 +134,7 @@ TEST(gpu_predictor, PicklingTest) {
ASSERT_EQ(XGBoosterSetParam(
bst, "tree_method", "gpu_hist"), 0) << XGBGetLastError();
ASSERT_EQ(XGBoosterSetParam(
bst, "n_gpus", std::to_string(ngpu).c_str()), 0) << XGBGetLastError();
bst, "gpu_id", std::to_string(gpuid).c_str()), 0) << XGBGetLastError();
ASSERT_EQ(XGBoosterSetParam(bst, "predictor", "gpu_predictor"), 0) << XGBGetLastError();
// Run boosting iterations
@ -160,7 +160,7 @@ TEST(gpu_predictor, PicklingTest) {
{ // Query predictor
const auto& kwargs = QueryBoosterConfigurationArguments(bst2);
ASSERT_EQ(kwargs.at("predictor"), "gpu_predictor");
ASSERT_EQ(kwargs.at("n_gpus"), std::to_string(ngpu).c_str());
ASSERT_EQ(kwargs.at("gpu_id"), std::to_string(gpuid).c_str());
}
{ // Change predictor and query again

View File

@ -168,10 +168,9 @@ TEST(Learner, IO) {
std::unique_ptr<Learner> learner {Learner::Create(mat)};
learner->SetParams({Arg{"tree_method", "auto"},
Arg{"predictor", "gpu_predictor"},
Arg{"n_gpus", "1"}});
Arg{"gpu_id", "0"}});
learner->UpdateOneIter(0, p_dmat.get());
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
dmlc::TemporaryDirectory tempdir;
const std::string fname = tempdir.path + "/model.bst";
@ -185,7 +184,6 @@ TEST(Learner, IO) {
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r"));
learner->Load(fi.get());
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 0);
delete pp_dmat;
}
@ -208,31 +206,27 @@ TEST(Learner, GPUConfiguration) {
Arg{"updater", "gpu_coord_descent"}});
learner->UpdateOneIter(0, p_dmat.get());
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
}
{
std::unique_ptr<Learner> learner {Learner::Create(mat)};
learner->SetParams({Arg{"tree_method", "gpu_hist"}});
learner->UpdateOneIter(0, p_dmat.get());
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
}
{
// with CPU algorithm
std::unique_ptr<Learner> learner {Learner::Create(mat)};
learner->SetParams({Arg{"tree_method", "hist"}});
learner->UpdateOneIter(0, p_dmat.get());
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 0);
ASSERT_EQ(learner->GetGenericParameter().gpu_id, -1);
}
{
// with CPU algorithm, but `n_gpus` takes priority
// with CPU algorithm, but `gpu_id` takes priority
std::unique_ptr<Learner> learner {Learner::Create(mat)};
learner->SetParams({Arg{"tree_method", "hist"},
Arg{"n_gpus", "1"}});
Arg{"gpu_id", "0"}});
learner->UpdateOneIter(0, p_dmat.get());
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
}
{
// With CPU algorithm but GPU Predictor, this is to simulate when
@ -243,7 +237,6 @@ TEST(Learner, GPUConfiguration) {
Arg{"predictor", "gpu_predictor"}});
learner->UpdateOneIter(0, p_dmat.get());
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
}
delete pp_dmat;

View File

@ -366,7 +366,7 @@ TEST(GpuHist, EvaluateSplits) {
ASSERT_NEAR(res[1].fvalue, 0.26, xgboost::kRtEps);
}
void TestHistogramIndexImpl(int n_gpus) {
void TestHistogramIndexImpl() {
// Test if the compressed histogram index matches when using a sparse
// dmatrix with and without using external memory
@ -384,7 +384,7 @@ void TestHistogramIndexImpl(int n_gpus) {
{"max_leaves", "0"}
};
GenericParameter generic_param(CreateEmptyGenericParam(0, n_gpus));
GenericParameter generic_param(CreateEmptyGenericParam(0));
hist_maker.Configure(training_params, &generic_param);
hist_maker.InitDataOnce(hist_maker_dmat.get());
@ -412,7 +412,7 @@ void TestHistogramIndexImpl(int n_gpus) {
}
TEST(GpuHist, TestHistogramIndex) {
TestHistogramIndexImpl(1);
TestHistogramIndexImpl();
}
} // namespace tree

View File

@ -29,7 +29,7 @@ TEST(Updater, Prune) {
{0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f} };
auto dmat = CreateDMatrix(32, 16, 0.4, 3);
auto lparam = CreateEmptyGenericParam(0, 0);
auto lparam = CreateEmptyGenericParam(GPUIDX);
// prepare tree
RegTree tree = RegTree();

View File

@ -25,7 +25,7 @@ TEST(Updater, Refresh) {
{"reg_lambda", "1"}};
RegTree tree = RegTree();
auto lparam = CreateEmptyGenericParam(0, 0);
auto lparam = CreateEmptyGenericParam(GPUIDX);
tree.param.InitAllowUnknown(cfg);
std::vector<RegTree*> trees {&tree};
std::unique_ptr<TreeUpdater> refresher(TreeUpdater::Create("refresh", &lparam));

View File

@ -61,7 +61,6 @@ base_params = {
def params_basic_1x4(rank):
return dict(base_params, **{
'n_gpus': 1,
'gpu_id': rank,
}), 20
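For reference, a minimal sketch (hypothetical helper, not part of this change) of how a per-worker parameter dict looks once 'n_gpus' is dropped — each training process is assumed to own exactly one GPU and to receive its rank from the launcher:

def make_worker_params(base_params, rank):
    # One process per GPU: the worker pins itself to its own device ordinal.
    # 'n_gpus' is no longer set anywhere.
    return dict(base_params, gpu_id=rank)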

View File

@ -23,7 +23,7 @@ class TestGPULinear(unittest.TestCase):
@pytest.mark.skipif(**tm.no_sklearn())
def test_gpu_coordinate(self):
parameters = self.common_param.copy()
parameters['n_gpus'] = [1]
parameters['gpu_id'] = [0]
for param in test_linear.parameter_combinations(parameters):
results = test_linear.run_suite(
param, 150, self.datasets, scale_features=True)

View File

@ -21,7 +21,7 @@ datasets = ["Boston", "Cancer", "Digits", "Sparse regression",
class TestGPU(unittest.TestCase):
def test_gpu_hist(self):
test_param = parameter_combinations({'n_gpus': [1], 'max_depth': [2, 8],
test_param = parameter_combinations({'gpu_id': [0], 'max_depth': [2, 8],
'max_leaves': [255, 4],
'max_bin': [2, 256],
'grow_policy': ['lossguide']})
@ -38,8 +38,7 @@ class TestGPU(unittest.TestCase):
@pytest.mark.mgpu
def test_specified_gpu_id_gpu_update(self):
variable_param = {'n_gpus': [1],
'gpu_id': [1],
variable_param = {'gpu_id': [1],
'max_depth': [8],
'max_leaves': [255, 4],
'max_bin': [2, 64],

View File

@ -63,7 +63,7 @@ class TestGPU(unittest.TestCase):
'nthread': 0,
'eta': 1,
'verbosity': 3,
'n_gpus': 1,
'gpu_id': 0,
'objective': 'binary:logistic',
'max_bin': max_bin,
'eval_metric': 'auc'}
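Putting the parameter changes together: device selection now goes through 'gpu_id' alone. A minimal end-to-end sketch, assuming XGBoost is built with CUDA; the data and variable names here are illustrative, not taken from the test suite:

import numpy as np
import xgboost as xgb

# Synthetic binary-classification data, just to have something to train on.
X = np.random.rand(1000, 10)
y = (X[:, 0] > 0.5).astype(int)
dtrain = xgb.DMatrix(X, label=y)

params = {
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist',  # GPU training algorithm
    'gpu_id': 0,                # single device ordinal; -1 selects the CPU path, as in the tests above
    'max_bin': 256,
    'eval_metric': 'auc',
}
bst = xgb.train(params, dtrain, num_boost_round=20)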