diff --git a/CITATION b/CITATION
index 189062510..b2acce7c1 100644
--- a/CITATION
+++ b/CITATION
@@ -15,4 +15,3 @@
  address = {New York, NY, USA},
  keywords = {large-scale machine learning},
 }
-
diff --git a/doc/gpu/index.rst b/doc/gpu/index.rst
index 97c9799fd..3cee0cdf5 100644
--- a/doc/gpu/index.rst
+++ b/doc/gpu/index.rst
@@ -22,7 +22,8 @@ Supported parameters
 GPU accelerated prediction is enabled by default for the above mentioned ``tree_method`` parameters but can be switched to CPU prediction by setting ``predictor`` to ``cpu_predictor``. This could be useful if you want to conserve GPU memory. Likewise when using CPU algorithms, GPU accelerated prediction can be enabled by setting ``predictor`` to ``gpu_predictor``.
 
 The device ordinal (which GPU to use if you have many of them) can be selected using the
-``gpu_id`` parameter, which defaults to 0 (the first device reported by CUDA runtime).
+``device`` parameter, which defaults to 0 when "CUDA" is specified(the first device reported by CUDA
+runtime).
 
 
 The GPU algorithms currently work with CLI, Python, R, and JVM packages. See :doc:`/install` for details.
@@ -30,13 +31,13 @@ The GPU algorithms currently work with CLI, Python, R, and JVM packages. See :do
 .. code-block:: python
   :caption: Python example
 
-  param['gpu_id'] = 0
+  param["device"] = "cuda:0"
   param['tree_method'] = 'gpu_hist'
 
 .. code-block:: python
   :caption: With Scikit-Learn interface
 
-  XGBRegressor(tree_method='gpu_hist', gpu_id=0)
+  XGBRegressor(tree_method='gpu_hist', device="cuda")
 
 
 GPU-Accelerated SHAP values
@@ -45,7 +46,7 @@ XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as
 
 .. code-block:: python
 
-  model.set_param({"gpu_id": "0", "tree_method": "gpu_hist"})
+  model.set_param({"device": "cuda:0", "tree_method": "gpu_hist"})
   shap_values = model.predict(dtrain, pred_contribs=True)
   shap_interaction_values = model.predict(dtrain, pred_interactions=True)
 
diff --git a/doc/install.rst b/doc/install.rst
index 0e155f647..51f0d0d60 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -3,10 +3,10 @@ Installation Guide
 ##################
 
 XGBoost provides binary packages for some language bindings.  The binary packages support
-the GPU algorithm (``gpu_hist``) on machines with NVIDIA GPUs. Please note that **training
-with multiple GPUs is only supported for Linux platform**. See :doc:`gpu/index`.  Also we
-have both stable releases and nightly builds, see below for how to install them.  For
-building from source, visit :doc:`this page </build>`.
+the GPU algorithm (``device=cuda:0``) on machines with NVIDIA GPUs. Please note that
+**training with multiple GPUs is only supported for Linux platform**. See
+:doc:`gpu/index`.  Also we have both stable releases and nightly builds, see below for how
+to install them.  For building from source, visit :doc:`this page </build>`.
 
 .. contents:: Contents
 
diff --git a/doc/parameter.rst b/doc/parameter.rst
index 22893e400..d628d161b 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -59,6 +59,18 @@ General Parameters
 
   - Feature dimension used in boosting, set to maximum dimension of the feature
 
+* ``device`` [default= ``cpu``]
+
+  .. versionadded:: 2.0.0
+
+  - Device for XGBoost to run. User can set it to one of the following values:
+
+    + ``cpu``: Use CPU.
+    + ``cuda``: Use a GPU (CUDA device).
+    + ``cuda:<ordinal>``: ``<ordinal>`` is an integer that specifies the ordinal of the GPU (which GPU do you want to use if you have more than one devices).
+    + ``gpu``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently.
+    + ``gpu:<ordinal>``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently.
+
 Parameters for Tree Booster
 ===========================
 * ``eta`` [default=0.3, alias: ``learning_rate``]
@@ -99,7 +111,7 @@ Parameters for Tree Booster
   - ``gradient_based``: the selection probability for each training instance is proportional to the
     *regularized absolute value* of gradients (more specifically, :math:`\sqrt{g^2+\lambda h^2}`).
     ``subsample`` may be set to as low as 0.1 without loss of model accuracy. Note that this
-    sampling method is only supported when ``tree_method`` is set to ``gpu_hist``; other tree
+    sampling method is only supported when ``tree_method`` is set to ``hist`` and the device is ``cuda``; other tree
     methods only support ``uniform`` sampling.
 
 * ``colsample_bytree``, ``colsample_bylevel``, ``colsample_bynode`` [default=1]
@@ -131,26 +143,15 @@ Parameters for Tree Booster
 * ``tree_method`` string [default= ``auto``]
 
   - The tree construction algorithm used in XGBoost. See description in the `reference paper <http://arxiv.org/abs/1603.02754>`_ and :doc:`treemethod`.
-  - XGBoost supports  ``approx``, ``hist`` and ``gpu_hist`` for distributed training.  Experimental support for external memory is available for ``approx`` and ``gpu_hist``.
 
-  - Choices: ``auto``, ``exact``, ``approx``, ``hist``, ``gpu_hist``, this is a
-    combination of commonly used updaters.  For other updaters like ``refresh``, set the
-    parameter ``updater`` directly.
+  - Choices: ``auto``, ``exact``, ``approx``, ``hist``, this is a combination of commonly
+    used updaters.  For other updaters like ``refresh``, set the parameter ``updater``
+    directly.
 
-    - ``auto``: Use heuristic to choose the fastest method.
-
-      - For small dataset, exact greedy (``exact``) will be used.
-      - For larger dataset, approximate algorithm (``approx``) will be chosen.  It's
-        recommended to try ``hist`` and ``gpu_hist`` for higher performance with large
-        dataset.
-        (``gpu_hist``)has support for ``external memory``.
-
-      - Because old behavior is always use exact greedy in single machine, user will get a
-        message when approximate algorithm is chosen to notify this choice.
+    - ``auto``: Same as the ``hist`` tree method.
     - ``exact``: Exact greedy algorithm.  Enumerates all split candidates.
     - ``approx``: Approximate greedy algorithm using quantile sketch and gradient histogram.
     - ``hist``: Faster histogram optimized approximate greedy algorithm.
-    - ``gpu_hist``: GPU implementation of ``hist`` algorithm.
 
 * ``scale_pos_weight`` [default=1]
 
@@ -163,7 +164,7 @@ Parameters for Tree Booster
     - ``grow_colmaker``: non-distributed column-based construction of trees.
     - ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting.
     - ``grow_quantile_histmaker``: Grow tree using quantized histogram.
-    - ``grow_gpu_hist``: Grow tree with GPU.
+    - ``grow_gpu_hist``: Grow tree with GPU. Same as setting tree method to ``hist`` and use ``device=cuda``.
     - ``sync``: synchronizes trees in all distributed nodes.
     - ``refresh``: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed.
     - ``prune``: prunes the splits where loss < min_split_loss (or gamma) and nodes that have depth greater than ``max_depth``.
@@ -183,7 +184,7 @@ Parameters for Tree Booster
 * ``grow_policy`` [default= ``depthwise``]
 
   - Controls a way new nodes are added to the tree.
-  - Currently supported only if ``tree_method`` is set to ``hist``, ``approx`` or ``gpu_hist``.
+  - Currently supported only if ``tree_method`` is set to ``hist`` or ``approx``.
   - Choices: ``depthwise``, ``lossguide``
 
     - ``depthwise``: split at nodes closest to the root.
@@ -195,7 +196,7 @@ Parameters for Tree Booster
 
 * ``max_bin``, [default=256]
 
-  - Only used if ``tree_method`` is set to ``hist``, ``approx`` or ``gpu_hist``.
+  - Only used if ``tree_method`` is set to ``hist`` or ``approx``.
   - Maximum number of discrete bins to bucket continuous features.
   - Increasing this number improves the optimality of splits at the cost of higher computation time.
 
diff --git a/doc/treemethod.rst b/doc/treemethod.rst
index 254eafb28..8ecddc066 100644
--- a/doc/treemethod.rst
+++ b/doc/treemethod.rst
@@ -3,14 +3,14 @@ Tree Methods
 ############
 
 For training boosted tree models, there are 2 parameters used for choosing algorithms,
-namely ``updater`` and ``tree_method``.  XGBoost has 4 builtin tree methods, namely
-``exact``, ``approx``, ``hist`` and ``gpu_hist``.  Along with these tree methods, there
-are also some free standing updaters including ``refresh``,
-``prune`` and ``sync``.  The parameter ``updater`` is more primitive than ``tree_method``
-as the latter is just a pre-configuration of the former.  The difference is mostly due to
-historical reasons that each updater requires some specific configurations and might has
-missing features.  As we are moving forward, the gap between them is becoming more and
-more irrelevant.  We will collectively document them under tree methods.
+namely ``updater`` and ``tree_method``.  XGBoost has 3 builtin tree methods, namely
+``exact``, ``approx`` and ``hist``.  Along with these tree methods, there are also some
+free standing updaters including ``refresh``, ``prune`` and ``sync``.  The parameter
+``updater`` is more primitive than ``tree_method`` as the latter is just a
+pre-configuration of the former.  The difference is mostly due to historical reasons that
+each updater requires some specific configurations and might has missing features.  As we
+are moving forward, the gap between them is becoming more and more irrelevant.  We will
+collectively document them under tree methods.
 
 **************
 Exact Solution
@@ -19,23 +19,23 @@ Exact Solution
 Exact means XGBoost considers all candidates from data for tree splitting, but underlying
 the objective is still interpreted as a Taylor expansion.
 
-1. ``exact``: Vanilla gradient boosting tree algorithm described in `reference paper
-   <http://arxiv.org/abs/1603.02754>`_.  During each split finding procedure, it iterates
-   over all entries of input data.  It's more accurate (among other greedy methods) but
-   slow in computation performance.  Also it doesn't support distributed training as
-   XGBoost employs row spliting data distribution while ``exact`` tree method works on a
-   sorted column format.  This tree method can be used with parameter ``tree_method`` set
-   to ``exact``.
+1. ``exact``: The vanilla gradient boosting tree algorithm described in `reference paper
+   <http://arxiv.org/abs/1603.02754>`_.  During split-finding, it iterates over all
+   entries of input data.  It's more accurate (among other greedy methods) but
+   computationally slower in compared to other tree methods.  Further more, its feature
+   set is limited. Features like distributed training and external memory that require
+   approximated quantiles are not supported. This tree method can be used with the
+   parameter ``tree_method`` set to ``exact``.
 
 
 **********************
 Approximated Solutions
 **********************
 
-As ``exact`` tree method is slow in performance and not scalable, we often employ
-approximated training algorithms.  These algorithms build a gradient histogram for each
-node and iterate through the histogram instead of real dataset.  Here we introduce the
-implementations in XGBoost below.
+As ``exact`` tree method is slow in computation performance and difficult to scale, we
+often employ approximated training algorithms.  These algorithms build a gradient
+histogram for each node and iterate through the histogram instead of real dataset.  Here
+we introduce the implementations in XGBoost.
 
 1. ``approx`` tree method: An approximation tree method described in `reference paper
    <http://arxiv.org/abs/1603.02754>`_.  It runs sketching before building each tree
@@ -48,22 +48,18 @@ implementations in XGBoost below.
    this global sketch.  This is the fastest algorithm as it runs sketching only once.  The
    algorithm can be accessed by setting ``tree_method`` to ``hist``.
 
-3. ``gpu_hist`` tree method: The ``gpu_hist`` tree method is a GPU implementation of
-   ``hist``, with additional support for gradient based sampling.  The algorithm can be
-   accessed by setting ``tree_method`` to ``gpu_hist``.
-
 ************
 Implications
 ************
 
-Some objectives like ``reg:squarederror`` have constant hessian.  In this case, ``hist``
-or ``gpu_hist`` should be preferred as weighted sketching doesn't make sense with constant
+Some objectives like ``reg:squarederror`` have constant hessian.  In this case, the
+``hist`` should be preferred as weighted sketching doesn't make sense with constant
 weights.  When using non-constant hessian objectives, sometimes ``approx`` yields better
-accuracy, but with slower computation performance.  Most of the time using ``(gpu)_hist``
-with higher ``max_bin`` can achieve similar or even superior accuracy while maintaining
-good performance.  However, as xgboost is largely driven by community effort, the actual
-implementations have some differences than pure math description.  Result might have
-slight differences than expectation, which we are currently trying to overcome.
+accuracy, but with slower computation performance.  Most of the time using ``hist`` with
+higher ``max_bin`` can achieve similar or even superior accuracy while maintaining good
+performance.  However, as xgboost is largely driven by community effort, the actual
+implementations have some differences than pure math description.  Result might be
+slightly different than expectation, which we are currently trying to overcome.
 
 **************
 Other Updaters
@@ -106,8 +102,8 @@ solely for the interest of documentation.
    histogram creation step and uses sketching values directly during split evaluation.  It
    was never tested and contained some unknown bugs, we decided to remove it and focus our
    resources on more promising algorithms instead.  For accuracy, most of the time
-   ``approx``, ``hist`` and ``gpu_hist`` are enough with some parameters tuning, so
-   removing them don't have any real practical impact.
+   ``approx`` and ``hist`` are enough with some parameters tuning, so removing them don't
+   have any real practical impact.
 
 3. ``grow_local_histmaker`` updater: An approximation tree method described in `reference
    paper <http://arxiv.org/abs/1603.02754>`_.  This updater was rarely used in practice so
diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst
index 8cb2e6ee2..7fde35b0e 100644
--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -149,7 +149,7 @@ Also for inplace prediction:
 .. code-block:: python
 
   # where X is a dask DataFrame or dask Array backed by cupy or cuDF.
-  booster.set_param({"gpu_id": "0"})
+  booster.set_param({"device": "cuda:0"})
   prediction = xgb.dask.inplace_predict(client, booster, X)
 
 When input is ``da.Array`` object, output is always ``da.Array``.  However, if the input
diff --git a/doc/tutorials/saving_model.rst b/doc/tutorials/saving_model.rst
index e536f3fcc..5d9ba1d55 100644
--- a/doc/tutorials/saving_model.rst
+++ b/doc/tutorials/saving_model.rst
@@ -163,7 +163,7 @@ Will print out something similar to (not actual output as it's too long for demo
     {
       "Learner": {
         "generic_parameter": {
-          "gpu_id": "0",
+          "device": "cuda:0",
           "gpu_page_size": "0",
           "n_jobs": "0",
           "random_state": "0",
diff --git a/include/xgboost/base.h b/include/xgboost/base.h
index 6ccd168f3..9a61151f4 100644
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@@ -119,7 +119,7 @@ using bst_group_t = std::uint32_t;  // NOLINT
  */
 using bst_target_t = std::uint32_t;  // NOLINT
 /**
- * brief Type for indexing boosted layers.
+ * @brief Type for indexing boosted layers.
  */
 using bst_layer_t = std::int32_t;  // NOLINT
 /**
diff --git a/include/xgboost/context.h b/include/xgboost/context.h
index de7648079..262733b22 100644
--- a/include/xgboost/context.h
+++ b/include/xgboost/context.h
@@ -12,12 +12,18 @@
 #include <cstdint>      // for int16_t, int32_t, int64_t
 #include <memory>       // for shared_ptr
 #include <string>       // for string, to_string
-#include <type_traits>  // for invoke_result_t, is_same_v
+#include <type_traits>  // for invoke_result_t, is_same_v, underlying_type_t
 
 namespace xgboost {
 
 struct CUDAContext;
 
+// symbolic names
+struct DeviceSym {
+  static auto constexpr CPU() { return "cpu"; }
+  static auto constexpr CUDA() { return "cuda"; }
+};
+
 /**
  * @brief A type for device ordinal. The type is packed into 32-bit for efficient use in
  *        viewing types like `linalg::TensorView`.
@@ -59,9 +65,9 @@ struct DeviceOrd {
   [[nodiscard]] std::string Name() const {
     switch (device) {
       case DeviceOrd::kCPU:
-        return "CPU";
+        return DeviceSym::CPU();
       case DeviceOrd::kCUDA:
-        return "CUDA:" + std::to_string(ordinal);
+        return DeviceSym::CUDA() + (':' + std::to_string(ordinal));
       default: {
         LOG(FATAL) << "Unknown device.";
         return "";
@@ -76,26 +82,39 @@ static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t));
  * @brief Runtime context for XGBoost. Contains information like threads and device.
  */
 struct Context : public XGBoostParameter<Context> {
+ private:
+  std::string device{DeviceSym::CPU()};  // NOLINT
+  // The device object for the current context. We are in the middle of replacing the
+  // `gpu_id` with this device field.
+  DeviceOrd device_{DeviceOrd::CPU()};
+
  public:
   // Constant representing the device ID of CPU.
-  static std::int32_t constexpr kCpuId = -1;
+  static bst_d_ordinal_t constexpr kCpuId = -1;
+  static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; }
   static std::int64_t constexpr kDefaultSeed = 0;
 
  public:
   Context();
 
+  template <typename Container>
+  Args UpdateAllowUnknown(Container const& kwargs) {
+    auto args = XGBoostParameter<Context>::UpdateAllowUnknown(kwargs);
+    this->SetDeviceOrdinal(kwargs);
+    return args;
+  }
+
+  std::int32_t gpu_id{kCpuId};
+  // The number of threads to use if OpenMP is enabled. If equals 0, use the system default.
+  std::int32_t nthread{0};  // NOLINT
   // stored random seed
   std::int64_t seed{kDefaultSeed};
   // whether seed the PRNG each iteration
   bool seed_per_iteration{false};
-  // number of threads to use if OpenMP is enabled
-  // if equals 0, use system default
-  std::int32_t nthread{0};
-  // primary device, -1 means no gpu.
-  std::int32_t gpu_id{kCpuId};
   // fail when gpu_id is invalid
   bool fail_on_invalid_gpu_id{false};
   bool validate_parameters{false};
+
   /**
    * @brief Configure the parameter `gpu_id'.
    *
@@ -111,21 +130,19 @@ struct Context : public XGBoostParameter<Context> {
   /**
    * @brief Is XGBoost running on CPU?
    */
-  [[nodiscard]] bool IsCPU() const { return gpu_id == kCpuId; }
+  [[nodiscard]] bool IsCPU() const { return Device().IsCPU(); }
   /**
    * @brief Is XGBoost running on a CUDA device?
    */
-  [[nodiscard]] bool IsCUDA() const { return !IsCPU(); }
+  [[nodiscard]] bool IsCUDA() const { return Device().IsCUDA(); }
   /**
    * @brief Get the current device and ordinal.
    */
-  [[nodiscard]] DeviceOrd Device() const {
-    return IsCPU() ? DeviceOrd::CPU() : DeviceOrd::CUDA(static_cast<bst_d_ordinal_t>(gpu_id));
-  }
+  [[nodiscard]] DeviceOrd Device() const { return device_; }
   /**
    * @brief Get the CUDA device ordinal. -1 if XGBoost is running on CPU.
    */
-  [[nodiscard]] bst_d_ordinal_t Ordinal() const { return this->gpu_id; }
+  [[nodiscard]] bst_d_ordinal_t Ordinal() const { return Device().ordinal; }
   /**
    * @brief Name of the current device.
    */
@@ -134,24 +151,22 @@ struct Context : public XGBoostParameter<Context> {
    * @brief Get a CUDA device context for allocator and stream.
    */
   [[nodiscard]] CUDAContext const* CUDACtx() const;
+
   /**
    * @brief Make a CUDA context based on the current context.
    *
    * @param ordinal The CUDA device ordinal.
    */
-  [[nodiscard]] Context MakeCUDA(std::int32_t ordinal = 0) const {
+  [[nodiscard]] Context MakeCUDA(bst_d_ordinal_t ordinal = 0) const {
     Context ctx = *this;
-    CHECK_GE(ordinal, 0);
-    ctx.gpu_id = ordinal;
-    return ctx;
+    return ctx.SetDevice(DeviceOrd::CUDA(ordinal));
   }
   /**
    * @brief Make a CPU context based on the current context.
    */
   [[nodiscard]] Context MakeCPU() const {
     Context ctx = *this;
-    ctx.gpu_id = kCpuId;
-    return ctx;
+    return ctx.SetDevice(DeviceOrd::CPU());
   }
   /**
    * @brief Call function based on the current device.
@@ -167,7 +182,8 @@ struct Context : public XGBoostParameter<Context> {
       default:
         // Do not use the device name as this is likely an internal error, the name
         // wouldn't be valid.
-        LOG(FATAL) << "Unknown device type:" << static_cast<std::int16_t>(this->Device().device);
+        LOG(FATAL) << "Unknown device type:"
+                   << static_cast<std::underlying_type_t<DeviceOrd::Type>>(this->Device().device);
         break;
     }
     return std::invoke_result_t<CPUFn>();
@@ -182,11 +198,9 @@ struct Context : public XGBoostParameter<Context> {
     DMLC_DECLARE_FIELD(seed_per_iteration)
         .set_default(false)
         .describe("Seed PRNG determnisticly via iterator number.");
+    DMLC_DECLARE_FIELD(device).set_default(DeviceSym::CPU()).describe("Device ordinal.");
     DMLC_DECLARE_FIELD(nthread).set_default(0).describe("Number of threads to use.");
     DMLC_DECLARE_ALIAS(nthread, n_jobs);
-
-    DMLC_DECLARE_FIELD(gpu_id).set_default(-1).set_lower_bound(-1).describe(
-        "The primary GPU device ordinal.");
     DMLC_DECLARE_FIELD(fail_on_invalid_gpu_id)
         .set_default(false)
         .describe("Fail with error when gpu_id is invalid.");
@@ -196,6 +210,14 @@ struct Context : public XGBoostParameter<Context> {
   }
 
  private:
+  void SetDeviceOrdinal(Args const& kwargs);
+  Context& SetDevice(DeviceOrd d) {
+    this->device_ = d;
+    this->gpu_id = d.ordinal;  // this can be removed once we move away from `gpu_id`.
+    this->device = d.Name();
+    return *this;
+  }
+
   // mutable for lazy cuda context initialization. This avoids initializing CUDA at load.
   // shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define
   // p_impl while trying to hide CUDA code from the host compiler.
diff --git a/include/xgboost/json.h b/include/xgboost/json.h
index 3b34c2874..cb22e120e 100644
--- a/include/xgboost/json.h
+++ b/include/xgboost/json.h
@@ -664,11 +664,11 @@ Object ToJson(Parameter const& param) {
 template <typename Parameter>
 Args FromJson(Json const& obj, Parameter* param) {
   auto const& j_param = get<Object const>(obj);
-  std::map<std::string, std::string> m;
+  Args args;
   for (auto const& kv : j_param) {
-    m[kv.first] = get<String const>(kv.second);
+    args.emplace_back(kv.first, get<String const>(kv.second));
   }
-  return param->UpdateAllowUnknown(m);
+  return param->UpdateAllowUnknown(args);
 }
 }  // namespace xgboost
 #endif  // XGBOOST_JSON_H_
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
index f2b377ac1..8adb3cb27 100644
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@@ -110,15 +110,10 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
    * \param approx_contribs whether to approximate the feature contributions for speed
    * \param pred_interactions whether to compute the feature pair contributions
    */
-  virtual void Predict(std::shared_ptr<DMatrix> data,
-                       bool output_margin,
-                       HostDeviceVector<bst_float> *out_preds,
-                       unsigned layer_begin,
-                       unsigned layer_end,
-                       bool training = false,
-                       bool pred_leaf = false,
-                       bool pred_contribs = false,
-                       bool approx_contribs = false,
+  virtual void Predict(std::shared_ptr<DMatrix> data, bool output_margin,
+                       HostDeviceVector<bst_float>* out_preds, bst_layer_t layer_begin,
+                       bst_layer_t layer_end, bool training = false, bool pred_leaf = false,
+                       bool pred_contribs = false, bool approx_contribs = false,
                        bool pred_interactions = false) = 0;
 
   /*!
@@ -132,8 +127,8 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
    * \param          layer_end   End of booster layer. 0 means do not limit trees.
    */
   virtual void InplacePredict(std::shared_ptr<DMatrix> p_m, PredictionType type, float missing,
-                              HostDeviceVector<bst_float>** out_preds, uint32_t layer_begin,
-                              uint32_t layer_end) = 0;
+                              HostDeviceVector<float>** out_preds, bst_layer_t layer_begin,
+                              bst_layer_t layer_end) = 0;
 
   /*!
    * \brief Calculate feature score.  See doc in C API for outputs.
diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h
index f0d2e8e37..2c69cf648 100644
--- a/include/xgboost/predictor.h
+++ b/include/xgboost/predictor.h
@@ -39,9 +39,8 @@ struct PredictionCacheEntry {
    *
    * \param v Added versions.
    */
-  void Update(std::uint32_t v) {
-    version += v;
-  }
+  void Update(std::uint32_t v) { version += v; }
+  void Reset() { version = 0; }
 };
 
 /**
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
index eef10a36d..9ff42e370 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
@@ -280,7 +280,7 @@ object GpuPreXGBoost extends PreXGBoostProvider {
             // - gpu id
             // - predictor: Force to gpu predictor since native doesn't save predictor.
             val gpuId = if (!isLocal) XGBoost.getGPUAddrFromResources else 0
-            booster.setParam("gpu_id", gpuId.toString)
+            booster.setParam("device", s"cuda:$gpuId")
             logger.info("GPU transform on device: " + gpuId)
             boosterFlag.isGpuParamsSet = true;
           }
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
index 9208449ca..48b31a99f 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
@@ -326,7 +326,7 @@ object XGBoost extends Serializable {
           getGPUAddrFromResources
         }
         logger.info("Leveraging gpu device " + gpuId + " to train")
-        params = params + ("gpu_id" -> gpuId)
+        params = params + ("device" -> s"cuda:$gpuId")
       }
       val booster = if (makeCheckpoint) {
         SXGBoost.trainAndSaveCheckpoint(
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 31f34256d..d6214c7a6 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -1393,13 +1393,13 @@ class _ProxyDMatrix(DMatrix):
 
 
 class QuantileDMatrix(DMatrix):
-    """A DMatrix variant that generates quantilized data directly from input for
-    ``hist`` and ``gpu_hist`` tree methods. This DMatrix is primarily designed to save
-    memory in training by avoiding intermediate storage. Set ``max_bin`` to control the
-    number of bins during quantisation, which should be consistent with the training
-    parameter ``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset,
-    ``ref`` should be another ``QuantileDMatrix``(or ``DMatrix``, but not recommended as
-    it defeats the purpose of saving memory) constructed from training dataset.  See
+    """A DMatrix variant that generates quantilized data directly from input for the
+    ``hist`` tree method. This DMatrix is primarily designed to save memory in training
+    by avoiding intermediate storage. Set ``max_bin`` to control the number of bins
+    during quantisation, which should be consistent with the training parameter
+    ``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset, ``ref``
+    should be another ``QuantileDMatrix``(or ``DMatrix``, but not recommended as it
+    defeats the purpose of saving memory) constructed from training dataset.  See
     :py:obj:`xgboost.DMatrix` for documents on meta info.
 
     .. note::
@@ -2277,10 +2277,10 @@ class Booster:
 
         .. code-block:: python
 
-            booster.set_param({"gpu_id": "0", "tree_method": "gpu_hist"})
+            booster.set_param({"device": "cuda:0"})
             booster.inplace_predict(cupy_array)
 
-            booster.set_param({"gpu_id": "-1", "tree_method": "hist"})
+            booster.set_param({"device": "cpu"})
             booster.inplace_predict(numpy_array)
 
         .. versionadded:: 1.1.0
@@ -2311,8 +2311,8 @@ class Booster:
         Returns
         -------
         prediction : numpy.ndarray/cupy.ndarray
-            The prediction result.  When input data is on GPU, prediction
-            result is stored in a cupy array.
+            The prediction result.  When input data is on GPU, prediction result is
+            stored in a cupy array.
 
         """
         preds = ctypes.POINTER(ctypes.c_float)()
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index a46ba14d0..e9f9e9f10 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -273,7 +273,7 @@ __model_doc = f"""
         * For linear model, only "weight" is defined and it's the normalized coefficients
           without bias.
 
-    gpu_id : Optional[int]
+    device : Optional[str]
         Device ordinal.
     validate_parameters : Optional[bool]
         Give warnings for unknown parameter.
@@ -647,7 +647,7 @@ class XGBModel(XGBModelBase):
         monotone_constraints: Optional[Union[Dict[str, int], str]] = None,
         interaction_constraints: Optional[Union[str, Sequence[Sequence[str]]]] = None,
         importance_type: Optional[str] = None,
-        gpu_id: Optional[int] = None,
+        device: Optional[str] = None,
         validate_parameters: Optional[bool] = None,
         enable_categorical: bool = False,
         feature_types: Optional[FeatureTypes] = None,
@@ -693,7 +693,7 @@ class XGBModel(XGBModelBase):
         self.monotone_constraints = monotone_constraints
         self.interaction_constraints = interaction_constraints
         self.importance_type = importance_type
-        self.gpu_id = gpu_id
+        self.device = device
         self.validate_parameters = validate_parameters
         self.enable_categorical = enable_categorical
         self.feature_types = feature_types
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index 0181e678d..a170fbf9f 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -1,4 +1,4 @@
-"""Xgboost pyspark integration submodule for core code."""
+"""XGBoost pyspark integration submodule for core code."""
 import base64
 
 # pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
@@ -133,6 +133,7 @@ _inverse_pyspark_param_alias_map = {v: k for k, v in _pyspark_param_alias_map.it
 
 _unsupported_xgb_params = [
     "gpu_id",  # we have "use_gpu" pyspark param instead.
+    "device",  # we have "use_gpu" pyspark param instead.
     "enable_categorical",  # Use feature_types param to specify categorical feature instead
     "use_label_encoder",
     "n_jobs",  # Do not allow user to set it, will use `spark.task.cpus` value instead.
@@ -899,12 +900,14 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
 
             context = BarrierTaskContext.get()
 
-            gpu_id = None
+            dev_ordinal = None
             use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist")
 
             if use_gpu:
-                gpu_id = context.partitionId() if is_local else _get_gpu_id(context)
-                booster_params["gpu_id"] = gpu_id
+                dev_ordinal = (
+                    context.partitionId() if is_local else _get_gpu_id(context)
+                )
+                booster_params["device"] = "cuda:" + str(dev_ordinal)
                 # If cuDF is not installed, then using DMatrix instead of QDM,
                 # because without cuDF, DMatrix performs better than QDM.
                 # Note: Checking `is_cudf_available` in spark worker side because
@@ -945,7 +948,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
                 dtrain, dvalid = create_dmatrix_from_partitions(
                     pandas_df_iter,
                     feature_prop.features_cols_names,
-                    gpu_id,
+                    dev_ordinal,
                     use_qdm,
                     dmatrix_kwargs,
                     enable_sparse_data_optim=feature_prop.enable_sparse_data_optim,
diff --git a/python-package/xgboost/spark/data.py b/python-package/xgboost/spark/data.py
index 8f84459d7..f9c12ba66 100644
--- a/python-package/xgboost/spark/data.py
+++ b/python-package/xgboost/spark/data.py
@@ -157,7 +157,7 @@ def _read_csr_matrix_from_unwrapped_spark_vec(part: pd.DataFrame) -> csr_matrix:
 
 def make_qdm(
     data: Dict[str, List[np.ndarray]],
-    gpu_id: Optional[int],
+    dev_ordinal: Optional[int],
     meta: Dict[str, Any],
     ref: Optional[DMatrix],
     params: Dict[str, Any],
@@ -165,7 +165,7 @@ def make_qdm(
     """Handle empty partition for QuantileDMatrix."""
     if not data:
         return QuantileDMatrix(np.empty((0, 0)), ref=ref)
-    it = PartIter(data, gpu_id, **meta)
+    it = PartIter(data, dev_ordinal, **meta)
     m = QuantileDMatrix(it, **params, ref=ref)
     return m
 
@@ -173,7 +173,7 @@ def make_qdm(
 def create_dmatrix_from_partitions(  # pylint: disable=too-many-arguments
     iterator: Iterator[pd.DataFrame],
     feature_cols: Optional[Sequence[str]],
-    gpu_id: Optional[int],
+    dev_ordinal: Optional[int],
     use_qdm: bool,
     kwargs: Dict[str, Any],  # use dict to make sure this parameter is passed.
     enable_sparse_data_optim: bool,
@@ -187,7 +187,7 @@ def create_dmatrix_from_partitions(  # pylint: disable=too-many-arguments
         Pyspark partition iterator.
     feature_cols:
         A sequence of feature names, used only when rapids plugin is enabled.
-    gpu_id:
+    dev_ordinal:
         Device ordinal, used when GPU is enabled.
     use_qdm :
         Whether QuantileDMatrix should be used instead of DMatrix.
@@ -304,13 +304,13 @@ def create_dmatrix_from_partitions(  # pylint: disable=too-many-arguments
 
     if feature_cols is not None and use_qdm:
         cache_partitions(iterator, append_fn)
-        dtrain: DMatrix = make_qdm(train_data, gpu_id, meta, None, params)
+        dtrain: DMatrix = make_qdm(train_data, dev_ordinal, meta, None, params)
     elif feature_cols is not None and not use_qdm:
         cache_partitions(iterator, append_fn)
         dtrain = make(train_data, kwargs)
     elif feature_cols is None and use_qdm:
         cache_partitions(iterator, append_fn)
-        dtrain = make_qdm(train_data, gpu_id, meta, None, params)
+        dtrain = make_qdm(train_data, dev_ordinal, meta, None, params)
     else:
         cache_partitions(iterator, append_fn)
         dtrain = make(train_data, kwargs)
@@ -324,7 +324,7 @@ def create_dmatrix_from_partitions(  # pylint: disable=too-many-arguments
     if has_validation_col:
         if use_qdm:
             dvalid: Optional[DMatrix] = make_qdm(
-                valid_data, gpu_id, meta, dtrain, params
+                valid_data, dev_ordinal, meta, dtrain, params
             )
         else:
             dvalid = make(valid_data, kwargs) if has_validation_col else None
diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
index 5054ef0dd..ba75aca7f 100644
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -78,8 +78,7 @@ def _set_pyspark_xgb_cls_param_attrs(
 
 
 class SparkXGBRegressor(_SparkXGBEstimator):
-    """
-    SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost regression
+    """SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost regression
     algorithm based on XGBoost python library, and it can be used in PySpark Pipeline
     and PySpark ML meta algorithms like :py:class:`~pyspark.ml.tuning.CrossValidator`/
     :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
@@ -89,8 +88,8 @@ class SparkXGBRegressor(_SparkXGBEstimator):
     :py:class:`xgboost.XGBRegressor` constructor and most of the parameters used in
     :py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict` method.
 
-    SparkXGBRegressor doesn't support setting `gpu_id` but support another param `use_gpu`,
-    see doc below for more details.
+    SparkXGBRegressor doesn't support setting `device` but supports another param
+    `use_gpu`, see doc below for more details.
 
     SparkXGBRegressor doesn't support setting `base_margin` explicitly as well, but support
     another param called `base_margin_col`. see doc below for more details.
@@ -247,8 +246,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
     :py:class:`xgboost.XGBClassifier` constructor and most of the parameters used in
     :py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict` method.
 
-    SparkXGBClassifier doesn't support setting `gpu_id` but support another param `use_gpu`,
-    see doc below for more details.
+    SparkXGBClassifier doesn't support setting `device` but support another param
+    `use_gpu`, see doc below for more details.
 
     SparkXGBClassifier doesn't support setting `base_margin` explicitly as well, but support
     another param called `base_margin_col`. see doc below for more details.
@@ -423,7 +422,7 @@ class SparkXGBRanker(_SparkXGBEstimator):
     :py:class:`xgboost.XGBRanker` constructor and most of the parameters used in
     :py:meth:`xgboost.XGBRanker.fit` and :py:meth:`xgboost.XGBRanker.predict` method.
 
-    SparkXGBRanker doesn't support setting `gpu_id` but support another param `use_gpu`,
+    SparkXGBRanker doesn't support setting `device` but support another param `use_gpu`,
     see doc below for more details.
 
     SparkXGBRanker doesn't support setting `base_margin` explicitly as well, but support
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index 8e2e13f43..6445f1c94 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -723,24 +723,6 @@ def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool:
 M = TypeVar("M", xgb.Booster, xgb.XGBModel)
 
 
-def set_ordinal(ordinal: int, booster: M) -> M:
-    """Temporary solution for setting the device ordinal until we move away from
-    `gpu_id`.
-
-    """
-    if ordinal < 0:
-        params = {"gpu_id": -1, "tree_method": "hist"}
-    else:
-        params = {"gpu_id": ordinal, "tree_method": "gpu_hist"}
-
-    if isinstance(booster, xgb.Booster):
-        booster.set_param(params)
-    elif isinstance(booster, xgb.XGBModel):
-        booster.set_params(**params)
-
-    return booster
-
-
 def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]:
     """Evaluation metric for xgb.train"""
     label = dtrain.get_label()
diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu
index af060f6dc..964ab0c3f 100644
--- a/src/c_api/c_api.cu
+++ b/src/c_api/c_api.cu
@@ -117,10 +117,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
                           RequiredArg<Integer>(config, "iteration_begin", __func__),
                           RequiredArg<Integer>(config, "iteration_end", __func__));
   CHECK(p_predt);
-  if (learner->Ctx()->IsCPU()) {
-    // Prediction using DMatrix as fallback.
-    CHECK(p_predt->HostCanRead() && !p_predt->DeviceCanRead());
-  } else {
+  if (learner->Ctx()->IsCUDA()) {
     CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
   }
   p_predt->SetDevice(proxy->DeviceIdx());
diff --git a/src/common/error_msg.cc b/src/common/error_msg.cc
index 813cbe8b1..bb57014a6 100644
--- a/src/common/error_msg.cc
+++ b/src/common/error_msg.cc
@@ -3,23 +3,18 @@
  */
 #include "error_msg.h"
 
+#include "../collective/communicator-inl.h"  // for GetRank
 #include "xgboost/logging.h"
 
 namespace xgboost::error {
 void WarnDeprecatedGPUHist() {
-  bool static thread_local logged{false};
-  if (logged) {
-    return;
-  }
   auto msg =
       "The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` "
       R"(parameter to CUDA instead.
 
     E.g. tree_method = "hist", device = "CUDA"
-
 )";
   LOG(WARNING) << msg;
-  logged = true;
 }
 
 void WarnManualUpdater() {
@@ -33,4 +28,23 @@ void WarnManualUpdater() {
          "behavior. For common uses, we recommend using `tree_method` parameter instead.";
   logged = true;
 }
+
+void WarnDeprecatedGPUId() {
+  static thread_local bool logged{false};
+  if (logged) {
+    return;
+  }
+  LOG(WARNING) << "`gpu_id` is deprecated in favor of the new `device` parameter: "
+               << "device = cpu/cuda/cuda:0";
+  logged = true;
+}
+
+void WarnEmptyDataset() {
+  static thread_local bool logged{false};
+  if (logged) {
+    return;
+  }
+  LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
+  logged = true;
+}
 }  // namespace xgboost::error
diff --git a/src/common/error_msg.h b/src/common/error_msg.h
index c19197007..07b5c3e53 100644
--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@@ -82,5 +82,9 @@ inline void WarnOldSerialization() {
 void WarnDeprecatedGPUHist();
 
 void WarnManualUpdater();
+
+void WarnDeprecatedGPUId();
+
+void WarnEmptyDataset();
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
diff --git a/src/context.cc b/src/context.cc
index 28fda9c45..1acaa6443 100644
--- a/src/context.cc
+++ b/src/context.cc
@@ -3,53 +3,201 @@
  *
  * \brief Context object used for controlling runtime parameters.
  */
-#include <xgboost/context.h>
+#include "xgboost/context.h"
 
-#include "common/common.h"  // AssertGPUSupport
+#include <algorithm>  // for find_if
+#include <charconv>   // for from_chars
+#include <iterator>   // for distance
+#include <optional>   // for optional
+#include <regex>      // for regex_replace, regex_match
+
+#include "common/common.h"     // AssertGPUSupport
+#include "common/error_msg.h"  // WarnDeprecatedGPUId
 #include "common/threading_utils.h"
+#include "xgboost/string_view.h"
 
 namespace xgboost {
 
 DMLC_REGISTER_PARAMETER(Context);
 
-std::int32_t constexpr Context::kCpuId;
+bst_d_ordinal_t constexpr Context::kCpuId;
 std::int64_t constexpr Context::kDefaultSeed;
 
 Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}
 
-void Context::ConfigureGpuId(bool require_gpu) {
-#if defined(XGBOOST_USE_CUDA)
-  if (gpu_id == kCpuId) {  // 0. User didn't specify the `gpu_id'
-    if (require_gpu) {     // 1. `tree_method' or `predictor' or both are using
-                           // GPU.
-      // 2. Use device 0 as default.
-      this->UpdateAllowUnknown(Args{{"gpu_id", "0"}});
-    }
-  }
+namespace {
+inline constexpr char const* kDevice = "device";
 
-  // 3. When booster is loaded from a memory image (Python pickle or R
-  // raw model), number of available GPUs could be different.  Wrap around it.
-  int32_t n_gpus = common::AllVisibleGPUs();
-  if (n_gpus == 0) {
-    if (gpu_id != kCpuId) {
-      LOG(WARNING) << "No visible GPU is found, setting `gpu_id` to -1";
-    }
-    this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}});
-  } else if (fail_on_invalid_gpu_id) {
-    CHECK(gpu_id == kCpuId || gpu_id < n_gpus)
-        << "Only " << n_gpus << " GPUs are visible, gpu_id " << gpu_id << " is invalid.";
-  } else if (gpu_id != kCpuId && gpu_id >= n_gpus) {
-    LOG(WARNING) << "Only " << n_gpus << " GPUs are visible, setting `gpu_id` to "
-                 << gpu_id % n_gpus;
-    this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(gpu_id % n_gpus)}});
-  }
+#if !defined(XGBOOST_USE_CUDA)
+DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
+  device = DeviceOrd::CPU();
+  return device;
+}
 #else
-  // Just set it to CPU, don't think about it.
-  this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}});
-  (void)(require_gpu);
-#endif  // defined(XGBOOST_USE_CUDA)
+// Check CUDA on the current device, wrap the ordinal if necessary.
+[[nodiscard]] DeviceOrd CUDAOrdinal(DeviceOrd device, bool fail_on_invalid) {
+  // When booster is loaded from a memory image (Python pickle or R raw model), number of
+  // available GPUs could be different.  Wrap around it.
+  std::int32_t n_visible = common::AllVisibleGPUs();
+  if (n_visible == 0) {
+    if (device.IsCUDA()) {
+      LOG(WARNING) << "No visible GPU is found, setting device to CPU.";
+    }
+    device = DeviceOrd::CPU();
+  } else if (fail_on_invalid) {
+    CHECK(device.IsCPU() || device.ordinal < n_visible)
+        << "Only " << n_visible << " GPUs are visible, ordinal " << device.ordinal
+        << " is invalid.";
+  } else if (device.IsCUDA() && device.ordinal >= n_visible) {
+    device.ordinal = device.ordinal % n_visible;
+    LOG(WARNING) << "Only " << n_visible << " GPUs are visible, setting device ordinal to "
+                 << device.ordinal;
+  }
 
-  common::SetDevice(this->gpu_id);
+  if (device.IsCUDA()) {
+    common::SetDevice(device.ordinal);
+  }
+  return device;
+}
+#endif  //  !defined(XGBOOST_USE_CUDA)
+
+[[nodiscard]] std::optional<std::int32_t> ParseInt(StringView ordinal) {
+  // Some basic checks to ensure valid `gpu_id` and device ordinal instead of directly parsing and
+  // letting go of unknown characters.
+  if (ordinal.empty()) {
+    return std::nullopt;
+  }
+
+  std::size_t offset{0};
+  if (ordinal[0] == '-') {
+    offset = 1;
+  }
+  if (ordinal.size() <= offset) {
+    return std::nullopt;
+  }
+
+  bool valid = std::all_of(ordinal.cbegin() + offset, ordinal.cend(),
+                           [](auto c) { return std::isdigit(c); });
+  if (!valid) {
+    return std::nullopt;
+  }
+
+  std::int32_t parsed_id{Context::kCpuId};
+  auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id);
+  if (res.ec != std::errc()) {
+    return std::nullopt;
+  }
+
+  return parsed_id;
+}
+
+[[nodiscard]] DeviceOrd MakeDeviceOrd(std::string const& input, bool fail_on_invalid_gpu_id) {
+  StringView msg{R"(Invalid argument for `device`. Expected to be one of the following:
+- cpu
+- cuda
+- cuda:<device ordinal>  # e.g. cuda:0
+- gpu
+- gpu:<device ordinal>   # e.g. gpu:0
+)"};
+  auto fatal = [&] { LOG(FATAL) << msg << "Got: `" << input << "`."; };
+
+#if defined(__MINGW32__)
+  // mingw hangs on regex using rtools 430. Basic checks only.
+  CHECK_GE(input.size(), 3) << msg;
+  auto substr = input.substr(0, 3);
+  bool valid = substr == "cpu" || substr == "cud" || substr == "gpu";
+  CHECK(valid) << msg;
+#else
+  std::regex pattern{"gpu(:[0-9]+)?|cuda(:[0-9]+)?|cpu"};
+  if (!std::regex_match(input, pattern)) {
+    fatal();
+  }
+#endif  // defined(__MINGW32__)
+
+  // handle alias
+  std::string s_device = std::regex_replace(input, std::regex{"gpu"}, DeviceSym::CUDA());
+
+  auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':');
+  DeviceOrd device;
+  device.ordinal = Context::InvalidOrdinal();  // mark it invalid for check.
+  if (split_it == s_device.cend()) {
+    // no ordinal.
+    if (s_device == DeviceSym::CPU()) {
+      device = DeviceOrd::CPU();
+    } else if (s_device == DeviceSym::CUDA()) {
+      device = DeviceOrd::CUDA(0);  // use 0 as default;
+    } else {
+      fatal();
+    }
+  } else {
+    // must be CUDA when ordinal is specifed.
+    // +1 for colon
+    std::size_t offset = std::distance(s_device.cbegin(), split_it) + 1;
+    // substr
+    StringView s_ordinal = {s_device.data() + offset, s_device.size() - offset};
+    if (s_ordinal.empty()) {
+      fatal();
+    }
+    auto opt_id = ParseInt(s_ordinal);
+    if (!opt_id.has_value()) {
+      fatal();
+    }
+    CHECK_LE(opt_id.value(), std::numeric_limits<bst_d_ordinal_t>::max())
+        << "Ordinal value too large.";
+    device = DeviceOrd::CUDA(opt_id.value());
+  }
+
+  if (device.ordinal < Context::kCpuId) {
+    fatal();
+  }
+  device = CUDAOrdinal(device, fail_on_invalid_gpu_id);
+
+  return device;
+}
+}  // namespace
+
+void Context::ConfigureGpuId(bool require_gpu) {
+  if (this->IsCPU() && require_gpu) {
+    this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}});
+  }
+}
+
+void Context::SetDeviceOrdinal(Args const& kwargs) {
+  auto gpu_id_it = std::find_if(kwargs.cbegin(), kwargs.cend(),
+                                [](auto const& p) { return p.first == "gpu_id"; });
+  auto has_gpu_id = gpu_id_it != kwargs.cend();
+  auto device_it = std::find_if(kwargs.cbegin(), kwargs.cend(),
+                                [](auto const& p) { return p.first == kDevice; });
+  auto has_device = device_it != kwargs.cend();
+  if (has_device && has_gpu_id) {
+    LOG(FATAL) << "Both `device` and `gpu_id` are specified. Use `device` instead.";
+  }
+
+  if (has_gpu_id) {
+    // Compatible with XGBoost < 2.0.0
+    error::WarnDeprecatedGPUId();
+    auto opt_id = ParseInt(StringView{gpu_id_it->second});
+    CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second;
+    if (opt_id.value() > Context::kCpuId) {
+      this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}});
+    } else {
+      this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}});
+    }
+    return;
+  }
+
+  auto new_d = MakeDeviceOrd(this->device, this->fail_on_invalid_gpu_id);
+
+  if (!has_device) {
+    CHECK_EQ(new_d.ordinal, this->device_.ordinal);  // unchanged
+  }
+  this->SetDevice(new_d);
+
+  if (this->IsCPU()) {
+    CHECK_EQ(this->device_.ordinal, kCpuId);
+  } else {
+    CHECK_GT(this->device_.ordinal, kCpuId);
+  }
 }
 
 std::int32_t Context::Threads() const {
diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc
index c2c9a1d70..a53b88c13 100644
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -33,10 +33,11 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
   bool valid = iter.Next();
   CHECK(valid) << "Iterative DMatrix must have at least 1 batch.";
 
-  auto d = MakeProxy(proxy_)->DeviceIdx();
+  auto pctx = MakeProxy(proxy_)->Ctx();
 
   Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}});
+  ctx.UpdateAllowUnknown(
+      Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
   // hardcoded parameter.
   BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};
 
diff --git a/src/data/proxy_dmatrix.cc b/src/data/proxy_dmatrix.cc
index e0a28142d..cb8e290c8 100644
--- a/src/data/proxy_dmatrix.cc
+++ b/src/data/proxy_dmatrix.cc
@@ -54,6 +54,7 @@ std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
     p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing);
   }
 
+  CHECK(p_fmat) << "Failed to fallback.";
   return p_fmat;
 }
 }  // namespace xgboost::data
diff --git a/src/data/proxy_dmatrix.cu b/src/data/proxy_dmatrix.cu
index 65abd1b7d..ded1c3aef 100644
--- a/src/data/proxy_dmatrix.cu
+++ b/src/data/proxy_dmatrix.cu
@@ -7,28 +7,31 @@
 
 namespace xgboost::data {
 void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
-  std::shared_ptr<data::CudfAdapter> adapter{new CudfAdapter{interface_str}};
-  auto const& value = adapter->Value();
+  auto adapter{std::make_shared<CudfAdapter>(interface_str)};
   this->batch_ = adapter;
-  ctx_.gpu_id = adapter->DeviceIdx();
   this->Info().num_col_ = adapter->NumColumns();
   this->Info().num_row_ = adapter->NumRows();
-  if (ctx_.gpu_id < 0) {
+  if (adapter->DeviceIdx() < 0) {
+    // empty data
     CHECK_EQ(this->Info().num_row_, 0);
-    ctx_.gpu_id = dh::CurrentDevice();
+    ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
+    return;
   }
+  ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
 }
 
 void DMatrixProxy::FromCudaArray(StringView interface_str) {
-  std::shared_ptr<CupyAdapter> adapter(new CupyAdapter{StringView{interface_str}});
+  auto adapter(std::make_shared<CupyAdapter>(StringView{interface_str}));
   this->batch_ = adapter;
-  ctx_.gpu_id = adapter->DeviceIdx();
   this->Info().num_col_ = adapter->NumColumns();
   this->Info().num_row_ = adapter->NumRows();
-  if (ctx_.gpu_id < 0) {
+  if (adapter->DeviceIdx() < 0) {
+    // empty data
     CHECK_EQ(this->Info().num_row_, 0);
-    ctx_.gpu_id = dh::CurrentDevice();
+    ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
+    return;
   }
+  ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
 }
 
 namespace cuda_impl {
diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu
index b2be701d5..68cab0d5a 100644
--- a/src/data/simple_dmatrix.cu
+++ b/src/data/simple_dmatrix.cu
@@ -27,7 +27,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
   dh::safe_cuda(cudaSetDevice(device));
 
   Context ctx;
-  ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(device)}});
+  ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});
 
   CHECK(adapter->NumRows() != kAdapterUnknownSize);
   CHECK(adapter->NumColumns() != kAdapterUnknownSize);
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 55b935ea0..e97b27665 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -84,6 +84,25 @@ bool UpdatersMatched(std::vector<std::string> updater_seq,
                       return name == up->Name();
                     });
 }
+
+void MismatchedDevices(Context const* booster, Context const* data) {
+  bool thread_local static logged{false};
+  if (logged) {
+    return;
+  }
+  LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. This might "
+                  "lead to higher memory usage and slower performance. XGBoost is running on: "
+               << booster->DeviceName() << ", while the input data is on: " << data->DeviceName()
+               << ".\n"
+               << R"(Potential solutions:
+- Use a data structure that matches the device ordinal in the booster.
+- Set the device for booster before call to inplace_predict.
+
+This warning will only be shown once, and subsequent warnings made by the current thread will be
+suppressed.
+)";
+  logged = true;
+}
 }  // namespace
 
 void GBTree::Configure(Args const& cfg) {
@@ -208,6 +227,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
   bst_target_t const n_groups = model_.learner_model_param->OutputLength();
   monitor_.Start("BoostNewTrees");
 
+  predt->predictions.SetDevice(ctx_->Ordinal());
   auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_,
                                     model_.learner_model_param->OutputLength());
   CHECK_NE(n_groups, 0);
@@ -521,18 +541,6 @@ void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds,
   }
 }
 
-namespace {
-inline void MismatchedDevices(Context const* booster, Context const* data) {
-  LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. XGBoost "
-               << "is running on: " << booster->DeviceName()
-               << ", while the input data is on: " << data->DeviceName() << ".\n"
-               << R"(Potential solutions:
-- Use a data structure that matches the device ordinal in the booster.
-- Set the device for booster before call to inplace_predict.
-)";
-}
-};  // namespace
-
 void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
                           bst_layer_t layer_begin, bst_layer_t layer_end) {
   // dispatch to const function.
diff --git a/src/learner.cc b/src/learner.cc
index 4fd0a0f09..03714a056 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -40,7 +40,7 @@
 #include "common/api_entry.h"             // for XGBAPIThreadLocalEntry
 #include "common/charconv.h"              // for to_chars, to_chars_result, NumericLimits, from_...
 #include "common/common.h"                // for ToString, Split
-#include "common/error_msg.h"             // for MaxFeatureSize, WarnOldSerialization
+#include "common/error_msg.h"             // for MaxFeatureSize, WarnOldSerialization, ...
 #include "common/io.h"                    // for PeekableInStream, ReadAll, FixedSizeStream, Mem...
 #include "common/observer.h"              // for TrainingObserver
 #include "common/random.h"                // for GlobalRandom
@@ -711,6 +711,7 @@ class LearnerConfiguration : public Learner {
     // FIXME(trivialfis): Make eval_metric a training parameter.
     keys.emplace_back(kEvalMetric);
     keys.emplace_back("num_output_group");
+    keys.emplace_back("gpu_id");  // deprecated param.
 
     std::sort(keys.begin(), keys.end());
 
@@ -1340,10 +1341,9 @@ class LearnerImpl : public LearnerIO {
   }
 
   void Predict(std::shared_ptr<DMatrix> data, bool output_margin,
-               HostDeviceVector<bst_float> *out_preds, unsigned layer_begin,
-               unsigned layer_end, bool training,
-               bool pred_leaf, bool pred_contribs, bool approx_contribs,
-               bool pred_interactions) override {
+               HostDeviceVector<bst_float>* out_preds, bst_layer_t layer_begin,
+               bst_layer_t layer_end, bool training, bool pred_leaf, bool pred_contribs,
+               bool approx_contribs, bool pred_interactions) override {
     int multiple_predictions = static_cast<int>(pred_leaf) +
                                static_cast<int>(pred_interactions) +
                                static_cast<int>(pred_contribs);
@@ -1391,15 +1391,16 @@ class LearnerImpl : public LearnerIO {
   }
 
   void InplacePredict(std::shared_ptr<DMatrix> p_m, PredictionType type, float missing,
-                      HostDeviceVector<bst_float>** out_preds, uint32_t iteration_begin,
-                      uint32_t iteration_end) override {
+                      HostDeviceVector<float>** out_preds, bst_layer_t iteration_begin,
+                      bst_layer_t iteration_end) override {
     this->Configure();
     this->CheckModelInitialized();
 
     auto& out_predictions = this->GetThreadLocal().prediction_entry;
-    out_predictions.version = 0;
+    out_predictions.Reset();
 
     this->gbm_->InplacePredict(p_m, missing, &out_predictions, iteration_begin, iteration_end);
+
     if (type == PredictionType::kValue) {
       obj_->PredTransform(&out_predictions.predictions);
     } else if (type == PredictionType::kMargin) {
@@ -1454,7 +1455,7 @@ class LearnerImpl : public LearnerIO {
     }
 
     if (p_fmat->Info().num_row_ == 0) {
-      LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
+      error::WarnEmptyDataset();
     }
   }
 
diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py
index dda2746bf..08baa844b 100644
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -28,6 +28,7 @@ class LintersPaths:
         "tests/python-gpu/test_gpu_prediction.py",
         "tests/python-gpu/load_pickle.py",
         "tests/python-gpu/test_gpu_pickling.py",
+        "tests/python-gpu/test_gpu_eval_metrics.py",
         "tests/test_distributed/test_with_spark/",
         "tests/test_distributed/test_gpu_with_spark/",
         # demo
diff --git a/tests/cpp/common/test_algorithm.cu b/tests/cpp/common/test_algorithm.cu
index 26c9aea4d..c36073397 100644
--- a/tests/cpp/common/test_algorithm.cu
+++ b/tests/cpp/common/test_algorithm.cu
@@ -16,8 +16,7 @@
 namespace xgboost {
 namespace common {
 void TestSegmentedArgSort() {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
 
   size_t constexpr kElements = 100, kGroups = 3;
   dh::device_vector<size_t> sorted_idx(kElements, 0);
@@ -55,8 +54,7 @@ void TestSegmentedArgSort() {
 TEST(Algorithm, SegmentedArgSort) { TestSegmentedArgSort(); }
 
 TEST(Algorithm, GpuArgSort) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
 
   dh::device_vector<float> values(20);
   dh::Iota(dh::ToSpan(values));                                    // accending
diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu
index 127cd95d4..2d5735925 100644
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -227,7 +227,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
   }
   // check categorical
   beg = n_samples;
-  for (std::size_t i = 0; i < n_categories; ++i) {
+  for (bst_cat_t i = 0; i < n_categories; ++i) {
     // all from the second column
     ASSERT_EQ(static_cast<bst_feature_t>(weight[i + beg]) % n_features, 1);
   }
diff --git a/tests/cpp/common/test_linalg.cu b/tests/cpp/common/test_linalg.cu
index fe38f0f9b..be89d51bc 100644
--- a/tests/cpp/common/test_linalg.cu
+++ b/tests/cpp/common/test_linalg.cu
@@ -4,6 +4,7 @@
 #include <gtest/gtest.h>
 
 #include "../../../src/common/linalg_op.cuh"
+#include "../helpers.h"
 #include "xgboost/context.h"
 #include "xgboost/linalg.h"
 
@@ -54,8 +55,7 @@ void TestElementWiseKernel() {
 }
 
 void TestSlice() {
-  Context ctx;
-  ctx.gpu_id = 1;
+  auto ctx = MakeCUDACtx(1);
   thrust::device_vector<double> data(2 * 3 * 4);
   auto t = MakeTensorView(&ctx, dh::ToSpan(data), 2, 3, 4);
   dh::LaunchN(1, [=] __device__(size_t) {
diff --git a/tests/cpp/common/test_ranking_utils.cu b/tests/cpp/common/test_ranking_utils.cu
index db0ff3b66..d62f5f171 100644
--- a/tests/cpp/common/test_ranking_utils.cu
+++ b/tests/cpp/common/test_ranking_utils.cu
@@ -23,8 +23,7 @@
 
 namespace xgboost::ltr {
 void TestCalcQueriesInvIDCG() {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
   std::size_t n_groups = 5, n_samples_per_group = 32;
 
   dh::device_vector<float> scores(n_samples_per_group * n_groups);
@@ -85,20 +84,17 @@ void TestRankingCache(Context const* ctx) {
 }  // namespace
 
 TEST(RankingCache, InitFromGPU) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
   TestRankingCache(&ctx);
 }
 
 TEST(NDCGCache, InitFromGPU) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
   TestNDCGCache(&ctx);
 }
 
 TEST(MAPCache, InitFromGPU) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
   TestMAPCache(&ctx);
 }
 }  // namespace xgboost::ltr
diff --git a/tests/cpp/common/test_stats.cc b/tests/cpp/common/test_stats.cc
index abdf00425..e74caeb3a 100644
--- a/tests/cpp/common/test_stats.cc
+++ b/tests/cpp/common/test_stats.cc
@@ -7,6 +7,7 @@
 
 #include "../../../src/common/stats.h"
 #include "../../../src/common/transform_iterator.h"  // common::MakeIndexTransformIter
+#include "../helpers.h"
 
 namespace xgboost {
 namespace common {
@@ -71,7 +72,7 @@ TEST(Stats, Median) {
     ASSERT_EQ(m, .5f);
 
 #if defined(XGBOOST_USE_CUDA)
-    ctx.gpu_id = 0;
+    ctx = ctx.MakeCUDA(0);
     ASSERT_FALSE(ctx.IsCPU());
     Median(&ctx, values, weights, &out);
     m = out(0);
@@ -80,7 +81,7 @@ TEST(Stats, Median) {
   }
 
   {
-    ctx.gpu_id = Context::kCpuId;
+    ctx = ctx.MakeCPU();
     // 4x2 matrix
     linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.gpu_id};
     HostDeviceVector<float> weights;
@@ -90,7 +91,7 @@ TEST(Stats, Median) {
     ASSERT_EQ(out(1), .5f);
 
 #if defined(XGBOOST_USE_CUDA)
-    ctx.gpu_id = 0;
+    ctx = ctx.MakeCUDA(0);
     Median(&ctx, values, weights, &out);
     ASSERT_EQ(out(0), .5f);
     ASSERT_EQ(out(1), .5f);
@@ -123,8 +124,7 @@ TEST(Stats, Mean) {
 
 #if defined(XGBOOST_USE_CUDA)
 TEST(Stats, GPUMean) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
   TestMean(&ctx);
 }
 #endif  // defined(XGBOOST_USE_CUDA)
diff --git a/tests/cpp/common/test_stats.cu b/tests/cpp/common/test_stats.cu
index 8643e75a7..08877ac8d 100644
--- a/tests/cpp/common/test_stats.cu
+++ b/tests/cpp/common/test_stats.cu
@@ -3,16 +3,17 @@
  */
 #include <gtest/gtest.h>
 
-#include <cstddef>                            // std::size_t
-#include <utility>                            // std::pair
-#include <vector>                             // std::vector
+#include <cstddef>  // std::size_t
+#include <utility>  // std::pair
+#include <vector>   // std::vector
 
 #include "../../../src/common/linalg_op.cuh"  // ElementWiseTransformDevice
 #include "../../../src/common/stats.cuh"
-#include "xgboost/base.h"                     // XGBOOST_DEVICE
-#include "xgboost/context.h"                  // Context
-#include "xgboost/host_device_vector.h"       // HostDeviceVector
-#include "xgboost/linalg.h"                   // Tensor
+#include "../helpers.h"
+#include "xgboost/base.h"                // XGBOOST_DEVICE
+#include "xgboost/context.h"             // Context
+#include "xgboost/host_device_vector.h"  // HostDeviceVector
+#include "xgboost/linalg.h"              // Tensor
 
 namespace xgboost {
 namespace common {
@@ -33,7 +34,7 @@ class StatsGPU : public ::testing::Test {
   }
 
  public:
-  void SetUp() override { ctx_.gpu_id = 0; }
+  void SetUp() override { ctx_  = MakeCUDACtx(0); }
 
   void WeightedMulti() {
     // data for one segment
diff --git a/tests/cpp/data/test_gradient_index.cc b/tests/cpp/data/test_gradient_index.cc
index 5354c2f1a..f2ade711b 100644
--- a/tests/cpp/data/test_gradient_index.cc
+++ b/tests/cpp/data/test_gradient_index.cc
@@ -171,8 +171,7 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo
     ASSERT_TRUE(Xy->SingleColBlock());
     bst_bin_t constexpr kBins{17};
     auto p = BatchParam{kBins, threshold};
-    Context gpu_ctx;
-    gpu_ctx.gpu_id = 0;
+    auto gpu_ctx = MakeCUDACtx(0);
     for (auto const &page : Xy->GetBatches<EllpackPage>(
              &gpu_ctx, BatchParam{kBins, tree::TrainParam::DftSparseThreshold()})) {
       from_ellpack = std::make_unique<GHistIndexMatrix>(&ctx, Xy->Info(), page, p);
diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc
index 1c0927031..9e6311701 100644
--- a/tests/cpp/gbm/test_gbtree.cc
+++ b/tests/cpp/gbm/test_gbtree.cc
@@ -180,7 +180,12 @@ TEST(GBTree, ChooseTreeMethod) {
       learner->SetParam("tree_method", tree_method.value());
     }
     if (device.has_value()) {
-      learner->SetParam("gpu_id", device.value());
+      auto const& d = device.value();
+      if (std::isdigit(d.front()) || d.front() == '-') {
+        learner->SetParam("gpu_id", d);
+      } else {
+        learner->SetParam("device", d);
+      }
     }
     learner->Configure();
     for (std::int32_t i = 0; i < 3; ++i) {
@@ -199,7 +204,12 @@ TEST(GBTree, ChooseTreeMethod) {
       learner->SetParam("tree_method", tree_method.value());
     }
     if (device.has_value()) {
-      learner->SetParam("gpu_id", device.value());
+      auto const& d = device.value();
+      if (std::isdigit(d.front()) || d.front() == '-') {
+        learner->SetParam("gpu_id", d);
+      } else {
+        learner->SetParam("device", d);
+      }
     }
     learner->Configure();
     for (std::int32_t i = 0; i < 3; ++i) {
@@ -215,11 +225,12 @@ TEST(GBTree, ChooseTreeMethod) {
 
   // |        | hist    | gpu_hist | exact | NA  |
   // |--------+---------+----------+-------+-----|
-  // | CUDA:0 | GPU     | GPU (w)  | Err   | GPU | # not yet tested
-  // | CPU    | CPU     | Err      | CPU   | CPU | # not yet tested
+  // | CUDA:0 | GPU     | GPU (w)  | Err   | GPU |
+  // | CPU    | CPU     | GPU (w)  | CPU   | CPU |
   // |--------+---------+----------+-------+-----|
   // | -1     | CPU     | GPU (w)  | CPU   | CPU |
   // | 0      | GPU     | GPU (w)  | Err   | GPU |
+  // |--------+---------+----------+-------+-----|
   // | NA     | CPU     | GPU (w)  | CPU   | CPU |
   //
   // - (w): warning
@@ -237,18 +248,30 @@ TEST(GBTree, ChooseTreeMethod) {
           // hist
           {{"hist", "-1"}, "grow_quantile_histmaker"},
           {{"hist", "0"}, "grow_gpu_hist"},
+          {{"hist", "cpu"}, "grow_quantile_histmaker"},
+          {{"hist", "cuda"}, "grow_gpu_hist"},
+          {{"hist", "cuda:0"}, "grow_gpu_hist"},
           {{"hist", std::nullopt}, "grow_quantile_histmaker"},
           // gpu_hist
           {{"gpu_hist", "-1"}, "grow_gpu_hist"},
           {{"gpu_hist", "0"}, "grow_gpu_hist"},
+          {{"gpu_hist", "cpu"}, "grow_gpu_hist"},
+          {{"gpu_hist", "cuda"}, "grow_gpu_hist"},
+          {{"gpu_hist", "cuda:0"}, "grow_gpu_hist"},
           {{"gpu_hist", std::nullopt}, "grow_gpu_hist"},
           // exact
           {{"exact", "-1"}, "grow_colmaker,prune"},
           {{"exact", "0"}, "err"},
+          {{"exact", "cpu"}, "grow_colmaker,prune"},
+          {{"exact", "cuda"}, "err"},
+          {{"exact", "cuda:0"}, "err"},
           {{"exact", std::nullopt}, "grow_colmaker,prune"},
           // NA
           {{std::nullopt, "-1"}, "grow_quantile_histmaker"},
           {{std::nullopt, "0"}, "grow_gpu_hist"},  // default to hist
+          {{std::nullopt, "cpu"}, "grow_quantile_histmaker"},
+          {{std::nullopt, "cuda"}, "grow_gpu_hist"},
+          {{std::nullopt, "cuda:0"}, "grow_gpu_hist"},
           {{std::nullopt, std::nullopt}, "grow_quantile_histmaker"},
       };
 
@@ -392,8 +415,7 @@ class Dart : public testing::TestWithParam<char const*> {
     for (size_t i = 0; i < 16; ++i) {
       learner->UpdateOneIter(i, p_mat);
     }
-
-    ConfigLearnerByCtx(&ctx, learner.get());
+    learner->SetParam("device", ctx.DeviceName());
 
     HostDeviceVector<float> predts_training;
     learner->Predict(p_mat, false, &predts_training, 0, 0, true);
@@ -654,8 +676,7 @@ TEST(GBTree, InplacePredictionError) {
         RandomDataGenerator{n_samples, n_features, 0.5f}.Batches(2).GenerateSparsePageDMatrix(
             "cache", true);
     std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
-    learner->SetParam("booster", booster);
-    ConfigLearnerByCtx(ctx, learner.get());
+    learner->SetParams(Args{{"booster", booster}, {"device", ctx->DeviceName()}});
     learner->Configure();
     for (std::int32_t i = 0; i < 3; ++i) {
       learner->UpdateOneIter(i, p_fmat);
@@ -697,9 +718,9 @@ TEST(GBTree, InplacePredictionError) {
 #endif  // defined(XGBOOST_USE_CUDA)
     };
     std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
-    learner->SetParam("booster", booster);
-    learner->SetParam("max_bin", std::to_string(max_bins));
-    ConfigLearnerByCtx(ctx, learner.get());
+    learner->SetParams(Args{{"booster", booster},
+                            {"max_bin", std::to_string(max_bins)},
+                            {"device", ctx->DeviceName()}});
     learner->Configure();
     for (std::int32_t i = 0; i < 3; ++i) {
       learner->UpdateOneIter(i, p_fmat);
diff --git a/tests/cpp/gbm/test_gbtree.cu b/tests/cpp/gbm/test_gbtree.cu
index 7321be75e..03f689822 100644
--- a/tests/cpp/gbm/test_gbtree.cu
+++ b/tests/cpp/gbm/test_gbtree.cu
@@ -8,6 +8,7 @@
 #include <limits>  // for numeric_limits
 #include <memory>  // for shared_ptr
 #include <string>  // for string
+#include <thread>  // for thread
 
 #include "../../../src/data/adapter.h"           // for ArrayAdapter
 #include "../../../src/data/device_adapter.cuh"  // for CupyAdapter
@@ -41,7 +42,7 @@ void TestInplaceFallback(Context const* ctx) {
 
   // learner is configured to the device specified by ctx
   std::unique_ptr<Learner> learner{Learner::Create({Xy})};
-  ConfigLearnerByCtx(ctx, learner.get());
+  learner->SetParam("device", ctx->DeviceName());
   for (std::int32_t i = 0; i < 3; ++i) {
     learner->UpdateOneIter(i, Xy);
   }
@@ -56,18 +57,31 @@ void TestInplaceFallback(Context const* ctx) {
 
   HostDeviceVector<float>* out_predt{nullptr};
   ConsoleLogger::Configure(Args{{"verbosity", "1"}});
+  std::string output;
   // test whether the warning is raised
+#if !defined(_WIN32)
+  // Windows has issue with CUDA and thread local storage. For some reason, on Windows a
+  // cudaInitializationError is raised during destruction of `HostDeviceVector`. This
+  // might be related to https://github.com/dmlc/xgboost/issues/5793
   ::testing::internal::CaptureStderr();
+  std::thread{[&] {
+    // Launch a new thread to ensure a warning is raised as we prevent over-verbose
+    // warning by using thread-local flags.
+    learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
+                            &out_predt, 0, 0);
+  }}.join();
+  output = testing::internal::GetCapturedStderr();
+  ASSERT_NE(output.find("Falling back"), std::string::npos);
+#endif
+
   learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
                           &out_predt, 0, 0);
-  auto output = testing::internal::GetCapturedStderr();
-  ASSERT_NE(output.find("Falling back"), std::string::npos);
 
   // test when the contexts match
   Context new_ctx = *proxy->Ctx();
   ASSERT_NE(new_ctx.gpu_id, ctx->gpu_id);
 
-  ConfigLearnerByCtx(&new_ctx, learner.get());
+  learner->SetParam("device", new_ctx.DeviceName());
   HostDeviceVector<float>* out_predt_1{nullptr};
   // no warning is raised
   ::testing::internal::CaptureStderr();
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index 449d97a40..b166109d9 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -559,16 +559,4 @@ class DeclareUnifiedDistributedTest(MetricTest) : public ::testing::Test {
     }
   }
 };
-
-// A temporary solution before we move away from gpu_id.
-inline void ConfigLearnerByCtx(Context const* ctx, Learner* learner) {
-  if (ctx->IsCPU()) {
-    learner->SetParam("tree_method", "hist");
-  } else {
-    learner->SetParam("tree_method", "gpu_hist");
-  }
-  learner->SetParam("gpu_id", std::to_string(ctx->gpu_id));
-  learner->Configure();
-  ASSERT_EQ(learner->Ctx()->gpu_id, ctx->gpu_id);
-}
 }  // namespace xgboost
diff --git a/tests/cpp/metric/test_multiclass_metric.h b/tests/cpp/metric/test_multiclass_metric.h
index 0f4017041..5fdead596 100644
--- a/tests/cpp/metric/test_multiclass_metric.h
+++ b/tests/cpp/metric/test_multiclass_metric.h
@@ -46,7 +46,6 @@ inline void CheckDeterministicMetricMultiClass(StringView name, int32_t device)
 
 inline void TestMultiClassError(int device, DataSplitMode data_split_mode) {
   auto ctx = MakeCUDACtx(device);
-  ctx.gpu_id = device;
   xgboost::Metric * metric = xgboost::Metric::Create("merror", &ctx);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "merror");
@@ -67,7 +66,6 @@ inline void VerifyMultiClassError(DataSplitMode data_split_mode = DataSplitMode:
 
 inline void TestMultiClassLogLoss(int device, DataSplitMode data_split_mode) {
   auto ctx = MakeCUDACtx(device);
-  ctx.gpu_id = device;
   xgboost::Metric * metric = xgboost::Metric::Create("mlogloss", &ctx);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "mlogloss");
diff --git a/tests/cpp/objective/test_lambdarank_obj.cu b/tests/cpp/objective/test_lambdarank_obj.cu
index d0f448993..16dc45307 100644
--- a/tests/cpp/objective/test_lambdarank_obj.cu
+++ b/tests/cpp/objective/test_lambdarank_obj.cu
@@ -13,26 +13,22 @@
 
 namespace xgboost::obj {
 TEST(LambdaRank, GPUNDCGJsonIO) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
   TestNDCGJsonIO(&ctx);
 }
 
 TEST(LambdaRank, GPUMAPStat) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
   TestMAPStat(&ctx);
 }
 
 TEST(LambdaRank, GPUNDCGGPair) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
   TestNDCGGPair(&ctx);
 }
 
 void TestGPUMakePair() {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
 
   MetaInfo info;
   HostDeviceVector<float> predt;
@@ -126,8 +122,7 @@ void TestGPUMakePair() {
 TEST(LambdaRank, GPUMakePair) { TestGPUMakePair(); }
 
 TEST(LambdaRank, GPUUnbiasedNDCG) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
   TestUnbiasedNDCG(&ctx);
 }
 
@@ -161,8 +156,7 @@ TEST(LambdaRank, RankItemCountOnRight) {
 }
 
 TEST(LambdaRank, GPUMAPGPair) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
   TestMAPGPair(&ctx);
 }
 }  // namespace xgboost::obj
diff --git a/tests/cpp/objective/test_regression_obj.cc b/tests/cpp/objective/test_regression_obj.cc
index 1b5573b0f..b8a40603b 100644
--- a/tests/cpp/objective/test_regression_obj.cc
+++ b/tests/cpp/objective/test_regression_obj.cc
@@ -305,12 +305,12 @@ TEST(Objective, CPU_vs_CUDA) {
 
   {
     // CPU
-    ctx.gpu_id = -1;
+    ctx = ctx.MakeCPU();
     obj->GetGradient(preds, info, 0, &cpu_out_preds);
   }
   {
     // CUDA
-    ctx.gpu_id = 0;
+    ctx = ctx.MakeCUDA(0);
     obj->GetGradient(preds, info, 0, &cuda_out_preds);
   }
 
diff --git a/tests/cpp/plugin/test_regression_obj_oneapi.cc b/tests/cpp/plugin/test_regression_obj_oneapi.cc
index 031a9ec2c..c01d9d951 100755
--- a/tests/cpp/plugin/test_regression_obj_oneapi.cc
+++ b/tests/cpp/plugin/test_regression_obj_oneapi.cc
@@ -148,7 +148,7 @@ TEST(Plugin, CPUvsOneAPI) {
 
   {
     // CPU
-    ctx.gpu_id = -1;
+    ctx = ctx.MakeCPU();
     obj_cpu->GetGradient(preds, info, 0, &cpu_out_preds);
   }
   {
diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc
index 841a576d5..a54c42a98 100644
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -214,15 +214,16 @@ void TestUpdatePredictionCache(bool use_subsampling) {
 }
 }  // namespace
 
-TEST(CPUPredictor, GHistIndex) {
+TEST(CPUPredictor, GHistIndexTraining) {
   size_t constexpr kRows{128}, kCols{16}, kBins{64};
+  Context ctx;
   auto p_hist = RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).GenerateQuantileDMatrix(false);
   HostDeviceVector<float> storage(kRows * kCols);
   auto columnar = RandomDataGenerator{kRows, kCols, 0.0}.GenerateArrayInterface(&storage);
   auto adapter = data::ArrayAdapter(columnar.c_str());
   std::shared_ptr<DMatrix> p_full{
       DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)};
-  TestTrainingPrediction(kRows, kBins, "hist", p_full, p_hist);
+  TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_hist);
 }
 
 TEST(CPUPredictor, CategoricalPrediction) {
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index 15fbd462e..be0cad5ce 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -33,9 +33,8 @@ TEST(GPUPredictor, Basic) {
     int n_row = i, n_col = i;
     auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
 
-    Context ctx;
-    ctx.gpu_id = 0;
-    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)};
+    auto ctx = MakeCUDACtx(0);
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
     gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
 
     // Test predict batch
@@ -71,7 +70,7 @@ void VerifyBasicColumnSplit(std::array<std::vector<float>, 32> const& expected_r
     auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
     std::unique_ptr<DMatrix> sliced{dmat->SliceCol(world_size, rank)};
 
-    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)};
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
     gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
 
     // Test predict batch
@@ -102,7 +101,7 @@ TEST(GPUPredictor, MGPUBasicColumnSplit) {
     size_t n_row = i, n_col = i;
     auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
 
-    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)};
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
     gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
 
     // Test predict batch
@@ -132,18 +131,19 @@ TEST(GPUPredictor, EllpackBasic) {
 }
 
 TEST(GPUPredictor, EllpackTraining) {
-  size_t constexpr kRows { 128 }, kCols { 16 }, kBins { 64 };
-  auto p_ellpack =
-      RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).Device(0).GenerateDeviceDMatrix(false);
+  auto ctx = MakeCUDACtx(0);
+  size_t constexpr kRows{128}, kCols{16}, kBins{64};
+  auto p_ellpack = RandomDataGenerator{kRows, kCols, 0.0}
+                       .Bins(kBins)
+                       .Device(ctx.Ordinal())
+                       .GenerateDeviceDMatrix(false);
   HostDeviceVector<float> storage(kRows * kCols);
-  auto columnar = RandomDataGenerator{kRows, kCols, 0.0}
-       .Device(0)
-       .GenerateArrayInterface(&storage);
+  auto columnar =
+      RandomDataGenerator{kRows, kCols, 0.0}.Device(ctx.Ordinal()).GenerateArrayInterface(&storage);
   auto adapter = data::CupyAdapter(columnar);
-  std::shared_ptr<DMatrix> p_full {
-    DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)
-  };
-  TestTrainingPrediction(kRows, kBins, "gpu_hist", p_full, p_ellpack);
+  std::shared_ptr<DMatrix> p_full{
+      DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)};
+  TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_ellpack);
 }
 
 TEST(GPUPredictor, ExternalMemoryTest) {
@@ -153,9 +153,8 @@ TEST(GPUPredictor, ExternalMemoryTest) {
   gpu_predictor->Configure({});
 
   const int n_classes = 3;
-  Context ctx;
-  ctx.gpu_id = 0;
-  LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.gpu_id)};
+  Context ctx = MakeCUDACtx(0);
+  LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.Ordinal())};
 
   gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx, n_classes);
   std::vector<std::unique_ptr<DMatrix>> dmats;
@@ -185,7 +184,7 @@ TEST(GPUPredictor, InplacePredictCupy) {
   auto ctx = MakeCUDACtx(0);
   size_t constexpr kRows{128}, kCols{64};
   RandomDataGenerator gen(kRows, kCols, 0.5);
-  gen.Device(ctx.gpu_id);
+  gen.Device(ctx.Ordinal());
   HostDeviceVector<float> data;
   std::string interface_str = gen.GenerateArrayInterface(&data);
   std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
@@ -197,7 +196,7 @@ TEST(GPUPredictor, InplacePredictCuDF) {
   auto ctx = MakeCUDACtx(0);
   size_t constexpr kRows{128}, kCols{64};
   RandomDataGenerator gen(kRows, kCols, 0.5);
-  gen.Device(ctx.gpu_id);
+  gen.Device(ctx.Ordinal());
   std::vector<HostDeviceVector<float>> storage(kCols);
   auto interface_str = gen.GenerateColumnarArrayInterface(&storage);
   std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
@@ -214,9 +213,8 @@ TEST(GpuPredictor, LesserFeatures) {
 TEST(GPUPredictor, ShapStump) {
   cudaSetDevice(0);
 
-  Context ctx;
-  ctx.gpu_id = 0;
-  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.gpu_id)};
+  auto ctx = MakeCUDACtx(0);
+  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())};
   gbm::GBTreeModel model(&mparam, &ctx);
 
   std::vector<std::unique_ptr<RegTree>> trees;
@@ -241,9 +239,8 @@ TEST(GPUPredictor, ShapStump) {
 }
 
 TEST(GPUPredictor, Shap) {
-  Context ctx;
-  ctx.gpu_id = 0;
-  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.gpu_id)};
+  auto ctx = MakeCUDACtx(0);
+  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())};
   gbm::GBTreeModel model(&mparam, &ctx);
 
   std::vector<std::unique_ptr<RegTree>> trees;
diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc
index b85abf183..993504c57 100644
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -44,60 +44,49 @@ TEST(Predictor, PredictionCache) {
   EXPECT_ANY_THROW(container.Entry(m));
 }
 
-void TestTrainingPrediction(size_t rows, size_t bins,
-                            std::string tree_method,
-                            std::shared_ptr<DMatrix> p_full,
-                            std::shared_ptr<DMatrix> p_hist) {
+void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins,
+                            std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_hist) {
   size_t constexpr kCols = 16;
   size_t constexpr kClasses = 3;
   size_t constexpr kIters = 3;
 
   std::unique_ptr<Learner> learner;
-  auto train = [&](Context const& ctx) {
-    p_hist->Info().labels.Reshape(rows, 1);
-    auto &h_label = p_hist->Info().labels.Data()->HostVector();
 
-    for (size_t i = 0; i < rows; ++i) {
-      h_label[i] = i % kClasses;
-    }
+  p_hist->Info().labels.Reshape(rows, 1);
+  auto &h_label = p_hist->Info().labels.Data()->HostVector();
 
-    learner.reset(Learner::Create({}));
-    learner->SetParam("tree_method", tree_method);
-    learner->SetParam("objective", "multi:softprob");
-    learner->SetParam("num_feature", std::to_string(kCols));
-    learner->SetParam("num_class", std::to_string(kClasses));
-    learner->SetParam("max_bin", std::to_string(bins));
-    ConfigLearnerByCtx(&ctx, learner.get());
-    learner->Configure();
+  for (size_t i = 0; i < rows; ++i) {
+    h_label[i] = i % kClasses;
+  }
 
-    for (size_t i = 0; i < kIters; ++i) {
-      learner->UpdateOneIter(i, p_hist);
-    }
+  learner.reset(Learner::Create({}));
+  learner->SetParams(Args{{"objective", "multi:softprob"},
+                          {"num_feature", std::to_string(kCols)},
+                          {"num_class", std::to_string(kClasses)},
+                          {"max_bin", std::to_string(bins)},
+                          {"device", ctx->DeviceName()}});
+  learner->Configure();
 
-    Json model{Object{}};
-    learner->SaveModel(&model);
+  for (size_t i = 0; i < kIters; ++i) {
+    learner->UpdateOneIter(i, p_hist);
+  }
 
-    learner.reset(Learner::Create({}));
-    learner->LoadModel(model);
-    ConfigLearnerByCtx(&ctx, learner.get());
-    learner->Configure();
+  Json model{Object{}};
+  learner->SaveModel(&model);
 
-    HostDeviceVector<float> from_full;
-    learner->Predict(p_full, false, &from_full, 0, 0);
+  learner.reset(Learner::Create({}));
+  learner->LoadModel(model);
+  learner->SetParam("device", ctx->DeviceName());
+  learner->Configure();
 
-    HostDeviceVector<float> from_hist;
-    learner->Predict(p_hist, false, &from_hist, 0, 0);
+  HostDeviceVector<float> from_full;
+  learner->Predict(p_full, false, &from_full, 0, 0);
 
-    for (size_t i = 0; i < rows; ++i) {
-      EXPECT_NEAR(from_hist.ConstHostVector()[i],
-                  from_full.ConstHostVector()[i], kRtEps);
-    }
-  };
+  HostDeviceVector<float> from_hist;
+  learner->Predict(p_hist, false, &from_hist, 0, 0);
 
-  if (tree_method == "gpu_hist") {
-    train(MakeCUDACtx(0));
-  } else {
-    train(Context{});
+  for (size_t i = 0; i < rows; ++i) {
+    EXPECT_NEAR(from_hist.ConstHostVector()[i], from_full.ConstHostVector()[i], kRtEps);
   }
 }
 
@@ -120,7 +109,7 @@ void TestInplacePrediction(Context const *ctx, std::shared_ptr<DMatrix> x, bst_r
     learner->UpdateOneIter(it, m);
   }
 
-  learner->SetParam("gpu_id", std::to_string(ctx->gpu_id));
+  learner->SetParam("device", ctx->DeviceName());
   learner->Configure();
 
   HostDeviceVector<float> *p_out_predictions_0{nullptr};
@@ -153,7 +142,7 @@ void TestInplacePrediction(Context const *ctx, std::shared_ptr<DMatrix> x, bst_r
     ASSERT_NEAR(h_pred[i], h_pred_0[i] + h_pred_1[i] - 0.5f, kRtEps);
   }
 
-  learner->SetParam("gpu_id", "-1");
+  learner->SetParam("device", "cpu");
   learner->Configure();
 }
 
@@ -161,12 +150,12 @@ namespace {
 std::unique_ptr<Learner> LearnerForTest(Context const *ctx, std::shared_ptr<DMatrix> dmat,
                                         size_t iters, size_t forest = 1) {
   std::unique_ptr<Learner> learner{Learner::Create({dmat})};
-  learner->SetParams(Args{{"num_parallel_tree", std::to_string(forest)}});
+  learner->SetParams(
+      Args{{"num_parallel_tree", std::to_string(forest)}, {"device", ctx->DeviceName()}});
   for (size_t i = 0; i < iters; ++i) {
     learner->UpdateOneIter(i, dmat);
   }
 
-  ConfigLearnerByCtx(ctx, learner.get());
   return learner;
 }
 
@@ -215,7 +204,7 @@ void TestPredictionDeviceAccess() {
   {
     ASSERT_EQ(from_cpu.DeviceIdx(), Context::kCpuId);
     Context cpu_ctx;
-    ConfigLearnerByCtx(&cpu_ctx, learner.get());
+    learner->SetParam("device", cpu_ctx.DeviceName());
     learner->Predict(m_test, false, &from_cpu, 0, 0);
     ASSERT_TRUE(from_cpu.HostCanWrite());
     ASSERT_FALSE(from_cpu.DeviceCanRead());
@@ -225,7 +214,7 @@ void TestPredictionDeviceAccess() {
   HostDeviceVector<float> from_cuda;
   {
     Context cuda_ctx = MakeCUDACtx(0);
-    ConfigLearnerByCtx(&cuda_ctx, learner.get());
+    learner->SetParam("device", cuda_ctx.DeviceName());
     learner->Predict(m_test, false, &from_cuda, 0, 0);
     ASSERT_EQ(from_cuda.DeviceIdx(), 0);
     ASSERT_TRUE(from_cuda.DeviceCanWrite());
@@ -465,11 +454,7 @@ void TestIterationRangeColumnSplit(Context const* ctx) {
   auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses);
   auto learner = LearnerForTest(ctx, dmat, kIters, kForest);
 
-  if (ctx->IsCPU()) {
-    learner->SetParams(Args{{"gpu_id", std::to_string(-1)}});
-  } else {
-    learner->SetParams(Args{{"gpu_id", std::to_string(0)}});
-  }
+  learner->SetParam("device", ctx->DeviceName());
 
   bool bound = false;
   std::unique_ptr<Learner> sliced{learner->Slice(0, 3, 1, &bound)};
@@ -582,7 +567,7 @@ void TestSparsePredictionColumnSplit(Context const* ctx, float sparsity) {
   learner.reset(Learner::Create({Xy}));
   learner->LoadModel(model);
 
-  ConfigLearnerByCtx(ctx, learner.get());
+  learner->SetParam("device", ctx->DeviceName());
   learner->Predict(Xy, false, &sparse_predt, 0, 0);
 
   auto constexpr kWorldSize = 2;
diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h
index c6f4d1816..81ec3cb5d 100644
--- a/tests/cpp/predictor/test_predictor.h
+++ b/tests/cpp/predictor/test_predictor.h
@@ -84,9 +84,8 @@ void TestPredictionFromGradientIndex(Context const* ctx, size_t rows, size_t col
 }
 
 // p_full and p_hist should come from the same data set.
-void TestTrainingPrediction(size_t rows, size_t bins, std::string tree_method,
-                            std::shared_ptr<DMatrix> p_full,
-                            std::shared_ptr<DMatrix> p_hist);
+void TestTrainingPrediction(Context const* ctx, size_t rows, size_t bins,
+                            std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_hist);
 
 void TestInplacePrediction(Context const* ctx, std::shared_ptr<DMatrix> x, bst_row_t rows,
                            bst_feature_t cols);
diff --git a/tests/cpp/test_context.cc b/tests/cpp/test_context.cc
new file mode 100644
index 000000000..d49f7b4b2
--- /dev/null
+++ b/tests/cpp/test_context.cc
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/base.h>
+#include <xgboost/context.h>
+
+namespace xgboost {
+TEST(Context, CPU) {
+  Context ctx;
+  ASSERT_EQ(ctx.Device(), DeviceOrd::CPU());
+  ASSERT_EQ(ctx.Ordinal(), Context::kCpuId);
+
+  std::int32_t flag{0};
+  ctx.DispatchDevice([&] { flag = -1; }, [&] { flag = 1; });
+  ASSERT_EQ(flag, -1);
+
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "oops"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "-1"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "CPU"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "CUDA"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "CPU:0"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "gpu:+0"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "gpu:0-"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "gpu:"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":gpu"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":0"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ""}}), dmlc::Error);
+}
+}  // namespace xgboost
diff --git a/tests/cpp/test_context.cu b/tests/cpp/test_context.cu
new file mode 100644
index 000000000..035d22125
--- /dev/null
+++ b/tests/cpp/test_context.cu
@@ -0,0 +1,99 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/base.h>  // for Args
+#include <xgboost/context.h>
+#include <xgboost/json.h>  // for FromJson, ToJson
+
+#include <string>  // for string, to_string
+
+#include "../../src/common/common.h"  // for AllVisibleGPUs
+
+namespace xgboost {
+namespace {
+void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) {
+  ASSERT_EQ(ctx.gpu_id, ord);
+  ASSERT_EQ(ctx.Device().ordinal, ord);
+  ASSERT_EQ(ctx.DeviceName(), "cuda:" + std::to_string(ord));
+  ASSERT_EQ(ctx.Ordinal(), ord);
+  ASSERT_TRUE(ctx.IsCUDA());
+  ASSERT_FALSE(ctx.IsCPU());
+  ASSERT_EQ(ctx.Device(), DeviceOrd::CUDA(ord));
+
+  Json jctx{ToJson(ctx)};
+  Context new_ctx;
+  FromJson(jctx, &new_ctx);
+  ASSERT_EQ(new_ctx.Device(), ctx.Device());
+  ASSERT_EQ(new_ctx.gpu_id, ctx.gpu_id);
+}
+}  // namespace
+
+TEST(Context, DeviceOrdinal) {
+  Context ctx;
+  auto n_vis = common::AllVisibleGPUs();
+  auto ord = n_vis - 1;
+
+  std::string device = "cuda:" + std::to_string(ord);
+  ctx.UpdateAllowUnknown(Args{{"device", device}});
+  TestCUDA(ctx, ord);
+
+  device = "cuda:" + std::to_string(1001);
+  ctx.UpdateAllowUnknown(Args{{"device", device}});
+  ord = 1001 % n_vis;
+
+  TestCUDA(ctx, ord);
+
+  std::int32_t flag{0};
+  ctx.DispatchDevice([&] { flag = -1; }, [&] { flag = 1; });
+  ASSERT_EQ(flag, 1);
+
+  Context new_ctx = ctx;
+  TestCUDA(new_ctx, ctx.Ordinal());
+
+  auto cpu_ctx = ctx.MakeCPU();
+  ASSERT_TRUE(cpu_ctx.IsCPU());
+  ASSERT_EQ(cpu_ctx.Ordinal(), Context::kCpuId);
+  ASSERT_EQ(cpu_ctx.Device(), DeviceOrd::CPU());
+
+  auto cuda_ctx = cpu_ctx.MakeCUDA(ctx.Ordinal());
+  TestCUDA(cuda_ctx, ctx.Ordinal());
+
+  cuda_ctx.UpdateAllowUnknown(Args{{"fail_on_invalid_gpu_id", "true"}});
+  ASSERT_THROW({ cuda_ctx.UpdateAllowUnknown(Args{{"device", "cuda:9999"}}); }, dmlc::Error);
+  cuda_ctx.UpdateAllowUnknown(Args{{"device", "cuda:00"}});
+  ASSERT_EQ(cuda_ctx.Ordinal(), 0);
+
+  ctx.UpdateAllowUnknown(Args{{"device", "cpu"}});
+  // Test alias
+  ctx.UpdateAllowUnknown(Args{{"device", "gpu:0"}});
+  TestCUDA(ctx, 0);
+  ctx.UpdateAllowUnknown(Args{{"device", "gpu"}});
+  TestCUDA(ctx, 0);
+
+  // Test the thread local memory in dmlc is not linking different instances together.
+  cpu_ctx.UpdateAllowUnknown(Args{{"device", "cpu"}});
+  TestCUDA(ctx, 0);
+  ctx.UpdateAllowUnknown(Args{});
+  TestCUDA(ctx, 0);
+}
+
+TEST(Context, GPUId) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  TestCUDA(ctx, 0);
+
+  auto n_vis = common::AllVisibleGPUs();
+  auto ord = n_vis - 1;
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", std::to_string(ord)}});
+  TestCUDA(ctx, ord);
+
+  auto device = "cuda:" + std::to_string(1001);
+  ctx.UpdateAllowUnknown(Args{{"device", device}});
+  ord = 1001 % n_vis;
+  TestCUDA(ctx, ord);
+
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", "-1"}});
+  ASSERT_EQ(ctx.Device(), DeviceOrd::CPU());
+}
+}  // namespace xgboost
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 0981fc352..2165c6c8d 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -27,7 +27,6 @@
 #include "../../src/common/io.h"                    // for LoadSequentialFile
 #include "../../src/common/linalg_op.h"             // for ElementWiseTransformHost, begin, end
 #include "../../src/common/random.h"                // for GlobalRandom
-#include "../../src/common/transform_iterator.h"    // for IndexTransformIter
 #include "dmlc/io.h"                                // for Stream
 #include "dmlc/omp.h"                               // for omp_get_max_threads
 #include "dmlc/registry.h"                          // for Registry
@@ -35,14 +34,13 @@
 #include "helpers.h"                                // for GetBaseScore, RandomDataGenerator
 #include "objective_helpers.h"                      // for MakeObjNamesForTest, ObjTestNameGenerator
 #include "xgboost/base.h"                           // for bst_float, Args, bst_feature_t, bst_int
-#include "xgboost/context.h"                        // for Context
+#include "xgboost/context.h"                        // for Context, DeviceOrd
 #include "xgboost/data.h"                           // for DMatrix, MetaInfo, DataType
 #include "xgboost/host_device_vector.h"             // for HostDeviceVector
 #include "xgboost/json.h"                           // for Json, Object, get, String, IsA, opera...
 #include "xgboost/linalg.h"                         // for Tensor, TensorView
 #include "xgboost/logging.h"                        // for ConsoleLogger
 #include "xgboost/predictor.h"                      // for PredictionCacheEntry
-#include "xgboost/span.h"                           // for Span, operator!=, SpanIterator
 #include "xgboost/string_view.h"                    // for StringView
 
 namespace xgboost {
@@ -58,9 +56,9 @@ TEST(Learner, Basic) {
   auto minor = XGBOOST_VER_MINOR;
   auto patch = XGBOOST_VER_PATCH;
 
-  static_assert(std::is_integral<decltype(major)>::value, "Wrong major version type");
-  static_assert(std::is_integral<decltype(minor)>::value, "Wrong minor version type");
-  static_assert(std::is_integral<decltype(patch)>::value, "Wrong patch version type");
+  static_assert(std::is_integral_v<decltype(major)>, "Wrong major version type");
+  static_assert(std::is_integral_v<decltype(minor)>, "Wrong minor version type");
+  static_assert(std::is_integral_v<decltype(patch)>, "Wrong patch version type");
 }
 
 TEST(Learner, ParameterValidation) {
@@ -92,8 +90,7 @@ TEST(Learner, CheckGroup) {
   size_t constexpr kNumRows = 17;
   bst_feature_t constexpr kNumCols = 15;
 
-  std::shared_ptr<DMatrix> p_mat{
-      RandomDataGenerator{kNumRows, kNumCols, 0.0f}.GenerateDMatrix()};
+  std::shared_ptr<DMatrix> p_mat{RandomDataGenerator{kNumRows, kNumCols, 0.0f}.GenerateDMatrix()};
   std::vector<bst_float> weight(kNumGroups, 1);
   std::vector<bst_int> group(kNumGroups);
   group[0] = 2;
@@ -312,35 +309,36 @@ TEST(Learner, GPUConfiguration) {
     learner->SetParams({Arg{"booster", "gblinear"},
                         Arg{"updater", "gpu_coord_descent"}});
     learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, 0);
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
   }
   {
-    std::unique_ptr<Learner> learner {Learner::Create(mat)};
+    std::unique_ptr<Learner> learner{Learner::Create(mat)};
     learner->SetParams({Arg{"tree_method", "gpu_hist"}});
+    learner->Configure();
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
     learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, 0);
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
   }
   {
     std::unique_ptr<Learner> learner {Learner::Create(mat)};
     learner->SetParams({Arg{"tree_method", "gpu_hist"},
                         Arg{"gpu_id", "-1"}});
     learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, 0);
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
   }
   {
     // with CPU algorithm
     std::unique_ptr<Learner> learner {Learner::Create(mat)};
     learner->SetParams({Arg{"tree_method", "hist"}});
     learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, -1);
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CPU());
   }
   {
     // with CPU algorithm, but `gpu_id` takes priority
     std::unique_ptr<Learner> learner {Learner::Create(mat)};
-    learner->SetParams({Arg{"tree_method", "hist"},
-                        Arg{"gpu_id", "0"}});
+    learner->SetParams({Arg{"tree_method", "hist"}, Arg{"gpu_id", "0"}});
     learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, 0);
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
   }
 }
 #endif  // defined(XGBOOST_USE_CUDA)
diff --git a/tests/cpp/tree/test_node_partition.cc b/tests/cpp/tree/test_node_partition.cc
index d7254fa60..abde2da70 100644
--- a/tests/cpp/tree/test_node_partition.cc
+++ b/tests/cpp/tree/test_node_partition.cc
@@ -6,7 +6,9 @@
 #include <xgboost/task.h>          // for ObjInfo
 #include <xgboost/tree_updater.h>  // for TreeUpdater
 
-#include <memory>                  // for unique_ptr
+#include <memory>  // for unique_ptr
+
+#include "../helpers.h"
 
 namespace xgboost {
 TEST(Updater, HasNodePosition) {
@@ -19,7 +21,7 @@ TEST(Updater, HasNodePosition) {
   ASSERT_TRUE(up->HasNodePosition());
 
 #if defined(XGBOOST_USE_CUDA)
-  ctx.gpu_id = 0;
+  ctx = MakeCUDACtx(0);
   up.reset(TreeUpdater::Create("grow_gpu_hist", &ctx, &task));
   ASSERT_TRUE(up->HasNodePosition());
 #endif  // defined(XGBOOST_USE_CUDA)
diff --git a/tests/cpp/tree/test_prediction_cache.cc b/tests/cpp/tree/test_prediction_cache.cc
index 1877b7a35..e60d9cd8a 100644
--- a/tests/cpp/tree/test_prediction_cache.cc
+++ b/tests/cpp/tree/test_prediction_cache.cc
@@ -70,9 +70,9 @@ class TestPredictionCache : public ::testing::Test {
       Context ctx;
       ctx.InitAllowUnknown(Args{{"nthread", "8"}});
       if (updater_name == "grow_gpu_hist") {
-        ctx.gpu_id = 0;
+        ctx = ctx.MakeCUDA(0);
       } else {
-        ctx.gpu_id = Context::kCpuId;
+        ctx = ctx.MakeCPU();
       }
 
       ObjInfo task{ObjInfo::kRegression};
diff --git a/tests/python-gpu/load_pickle.py b/tests/python-gpu/load_pickle.py
index caefa362d..a63dd28aa 100644
--- a/tests/python-gpu/load_pickle.py
+++ b/tests/python-gpu/load_pickle.py
@@ -34,7 +34,7 @@ class TestLoadPickle:
         bst = load_pickle(model_path)
         config = bst.save_config()
         config = json.loads(config)
-        assert config["learner"]["generic_param"]["gpu_id"] == "-1"
+        assert config["learner"]["generic_param"]["device"] == "cpu"
 
     def test_context_is_preserved(self) -> None:
         """Test the device context is preserved after pickling."""
@@ -42,14 +42,14 @@ class TestLoadPickle:
         bst = load_pickle(model_path)
         config = bst.save_config()
         config = json.loads(config)
-        assert config["learner"]["generic_param"]["gpu_id"] == "0"
+        assert config["learner"]["generic_param"]["device"] == "cuda:0"
 
     def test_wrap_gpu_id(self) -> None:
         assert os.environ["CUDA_VISIBLE_DEVICES"] == "0"
         bst = load_pickle(model_path)
         config = bst.save_config()
         config = json.loads(config)
-        assert config["learner"]["generic_param"]["gpu_id"] == "0"
+        assert config["learner"]["generic_param"]["device"] == "cuda:0"
 
         x, y = build_dataset()
         test_x = xgb.DMatrix(x)
diff --git a/tests/python-gpu/test_device_quantile_dmatrix.py b/tests/python-gpu/test_device_quantile_dmatrix.py
index 477e9f2a1..ace17933b 100644
--- a/tests/python-gpu/test_device_quantile_dmatrix.py
+++ b/tests/python-gpu/test_device_quantile_dmatrix.py
@@ -203,7 +203,7 @@ class TestQuantileDMatrix:
         np.testing.assert_equal(h_ret.indices, d_ret.indices)
 
         booster = xgb.train(
-            {"tree_method": "gpu_hist", "gpu_id": "0"}, dtrain=d_m
+            {"tree_method": "hist", "device": "cuda:0"}, dtrain=d_m
         )
 
         np.testing.assert_allclose(
diff --git a/tests/python-gpu/test_gpu_basic_models.py b/tests/python-gpu/test_gpu_basic_models.py
index a6f50c224..e97ca210e 100644
--- a/tests/python-gpu/test_gpu_basic_models.py
+++ b/tests/python-gpu/test_gpu_basic_models.py
@@ -65,16 +65,20 @@ class TestGPUBasicModels:
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_invalid_gpu_id(self):
         from sklearn.datasets import load_digits
+
         X, y = load_digits(return_X_y=True)
         # should pass with invalid gpu id
-        cls1 = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=9999)
+        cls1 = xgb.XGBClassifier(tree_method="gpu_hist", gpu_id=9999)
         cls1.fit(X, y)
         # should throw error with fail_on_invalid_gpu_id enabled
         cls2 = xgb.XGBClassifier(
-            tree_method='gpu_hist', gpu_id=9999, fail_on_invalid_gpu_id=True
+            tree_method="gpu_hist", gpu_id=9999, fail_on_invalid_gpu_id=True
         )
-        try:
+        with pytest.raises(ValueError, match="ordinal 9999 is invalid"):
+            cls2.fit(X, y)
+
+        cls2 = xgb.XGBClassifier(
+            tree_method="hist", device="cuda:9999", fail_on_invalid_gpu_id=True
+        )
+        with pytest.raises(ValueError, match="ordinal 9999 is invalid"):
             cls2.fit(X, y)
-            assert False, "Should have failed with with fail_on_invalid_gpu_id enabled"
-        except xgb.core.XGBoostError as err:
-            assert "gpu_id 9999 is invalid" in str(err)
diff --git a/tests/python-gpu/test_gpu_eval_metrics.py b/tests/python-gpu/test_gpu_eval_metrics.py
index f5f770d2f..f084eaa45 100644
--- a/tests/python-gpu/test_gpu_eval_metrics.py
+++ b/tests/python-gpu/test_gpu_eval_metrics.py
@@ -43,10 +43,16 @@ class TestGPUEvalMetrics:
             num_boost_round=10,
         )
         cpu_auc = float(booster.eval(Xy).split(":")[1])
-        booster.set_param({"gpu_id": "0"})
-        assert json.loads(booster.save_config())["learner"]["generic_param"]["gpu_id"] == "0"
+        booster.set_param({"device": "cuda:0"})
+        assert (
+            json.loads(booster.save_config())["learner"]["generic_param"]["device"]
+            == "cuda:0"
+        )
         gpu_auc = float(booster.eval(Xy).split(":")[1])
-        assert json.loads(booster.save_config())["learner"]["generic_param"]["gpu_id"] == "0"
+        assert (
+            json.loads(booster.save_config())["learner"]["generic_param"]["device"]
+            == "cuda:0"
+        )
 
         np.testing.assert_allclose(cpu_auc, gpu_auc)
 
diff --git a/tests/python-gpu/test_gpu_pickling.py b/tests/python-gpu/test_gpu_pickling.py
index 49ac24740..10c4c7e45 100644
--- a/tests/python-gpu/test_gpu_pickling.py
+++ b/tests/python-gpu/test_gpu_pickling.py
@@ -113,14 +113,6 @@ class TestPickling:
         param = {"tree_method": "gpu_hist", "verbosity": 1}
         bst = xgb.train(param, train_x)
 
-        with tm.captured_output() as (out, err):
-            bst.inplace_predict(x)
-
-        # The warning is redirected to Python callback, so it's printed in stdout
-        # instead of stderr.
-        stdout = out.getvalue()
-        assert stdout.find("mismatched devices") != -1
-
         save_pickle(bst, model_path)
 
         args = self.args_template.copy()
@@ -177,7 +169,7 @@ class TestPickling:
 
         # Switch to CPU predictor
         bst = model.get_booster()
-        tm.set_ordinal(-1, bst)
+        bst.set_param({"device": "cpu"})
         cpu_pred = model.predict(x, output_margin=True)
         np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-5)
 
diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py
index 0d961d0e3..fb5f47c2b 100644
--- a/tests/python-gpu/test_gpu_prediction.py
+++ b/tests/python-gpu/test_gpu_prediction.py
@@ -39,7 +39,8 @@ predict_parameter_strategy = strategies.fixed_dictionaries(
     }
 )
 
-pytestmark = tm.timeout(20)
+# cupy nvrtc compilation can take a long time for the first run
+pytestmark = tm.timeout(30)
 
 
 class TestGPUPredict:
@@ -71,8 +72,8 @@ class TestGPUPredict:
                 param = {
                     "objective": "binary:logistic",
                     "eval_metric": "logloss",
-                    "tree_method": "gpu_hist",
-                    "gpu_id": 0,
+                    "tree_method": "hist",
+                    "device": "gpu:0",
                     "max_depth": 1,
                 }
                 bst = xgb.train(
@@ -84,7 +85,7 @@ class TestGPUPredict:
                 gpu_pred_test = bst.predict(dtest, output_margin=True)
                 gpu_pred_val = bst.predict(dval, output_margin=True)
 
-                bst.set_param({"gpu_id": -1, "tree_method": "hist"})
+                bst.set_param({"device": "cpu", "tree_method": "hist"})
                 bst_cpu = copy(bst)
                 cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                 cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
@@ -107,14 +108,15 @@ class TestGPUPredict:
         dtrain = xgb.DMatrix(X_train, label=y_train)
 
         params = {}
-        params["tree_method"] = "gpu_hist"
+        params["tree_method"] = "hist"
+        params["device"] = "cuda:0"
         bst = xgb.train(params, dtrain)
 
-        tm.set_ordinal(0, bst)
+        bst.set_param({"device": "cuda:0"})
         # Don't reuse the DMatrix for prediction, otherwise the result is cached.
         predict_gpu_0 = bst.predict(xgb.DMatrix(X_test))
         predict_gpu_1 = bst.predict(xgb.DMatrix(X_test))
-        tm.set_ordinal(-1, bst)
+        bst.set_param({"device": "cpu"})
         predict_cpu = bst.predict(xgb.DMatrix(X_test))
 
         assert np.allclose(predict_gpu_0, predict_gpu_1)
@@ -131,8 +133,8 @@ class TestGPUPredict:
         X_test, y_test = X[tr_size:, :], y[tr_size:]
 
         params = {
-            "tree_method": "gpu_hist",
-            "gpu_id": "0",
+            "tree_method": "hist",
+            "device": "cuda:0",
             "n_jobs": -1,
             "seed": 123,
         }
@@ -141,13 +143,54 @@ class TestGPUPredict:
         gpu_test_score = m.score(X_test, y_test)
 
         # Now with cpu
-        m = tm.set_ordinal(-1, m)
+        m.set_params(device="cpu")
         cpu_train_score = m.score(X_train, y_train)
         cpu_test_score = m.score(X_test, y_test)
 
         assert np.allclose(cpu_train_score, gpu_train_score)
         assert np.allclose(cpu_test_score, gpu_test_score)
 
+    @pytest.mark.parametrize("device", ["cpu", "cuda"])
+    @pytest.mark.skipif(**tm.no_cupy())
+    def test_inplace_predict_device_type(self, device: str) -> None:
+        """Test inplace predict with different device and data types.
+
+        The sklearn interface uses inplace predict by default and gbtree fallbacks to
+        DMatrix whenever device doesn't match. This test checks that XGBoost can handle
+        different combinations of device and input data type.
+
+        """
+        import cudf
+        import cupy as cp
+        import pandas as pd
+        from scipy.sparse import csr_matrix
+
+        reg = xgb.XGBRegressor(tree_method="hist", device=device)
+        n_samples = 4096
+        n_features = 13
+        X, y, w = tm.make_regression(n_samples, n_features, use_cupy=True)
+        X[X == 0.0] = 1.0
+
+        reg.fit(X, y, sample_weight=w)
+        predt_0 = reg.predict(X)
+
+        X = cp.asnumpy(X)
+        predt_1 = reg.predict(X)
+
+        df = pd.DataFrame(X)
+        predt_2 = reg.predict(df)
+
+        df = cudf.DataFrame(X)
+        predt_3 = reg.predict(df)
+
+        X_csr = csr_matrix(X)
+        predt_4 = reg.predict(X_csr)
+
+        np.testing.assert_allclose(predt_0, predt_1)
+        np.testing.assert_allclose(predt_0, predt_2)
+        np.testing.assert_allclose(predt_0, predt_3)
+        np.testing.assert_allclose(predt_0, predt_4)
+
     def run_inplace_base_margin(self, booster, dtrain, X, base_margin):
         import cupy as cp
 
@@ -175,7 +218,9 @@ class TestGPUPredict:
         dtrain = xgb.DMatrix(X, y)
 
         booster = xgb.train(
-            {"tree_method": "gpu_hist", "gpu_id": device}, dtrain, num_boost_round=10
+            {"tree_method": "hist", "device": f"cuda:{device}"},
+            dtrain,
+            num_boost_round=10,
         )
 
         test = xgb.DMatrix(X[:10, ...], missing=missing)
@@ -208,13 +253,13 @@ class TestGPUPredict:
         missing_idx = [i for i in range(0, X.shape[1], 16)]
         X[:, missing_idx] = missing
         reg = xgb.XGBRegressor(
-            tree_method="gpu_hist", n_estimators=8, missing=missing, gpu_id=device
+            tree_method="hist", n_estimators=8, missing=missing, device=f"cuda:{device}"
         )
         reg.fit(X, y)
 
-        reg = tm.set_ordinal(device, reg)
+        reg.set_params(device=f"cuda:{device}")
         gpu_predt = reg.predict(X)
-        reg = tm.set_ordinal(-1, reg)
+        reg = reg.set_params(device="cpu")
         cpu_predt = reg.predict(cp.asnumpy(X))
         np.testing.assert_allclose(gpu_predt, cpu_predt, atol=1e-6)
         cp.cuda.runtime.setDevice(0)
@@ -250,7 +295,9 @@ class TestGPUPredict:
 
         dtrain = xgb.DMatrix(X, y)
 
-        booster = xgb.train({"tree_method": "gpu_hist"}, dtrain, num_boost_round=10)
+        booster = xgb.train(
+            {"tree_method": "hist", "device": "cuda:0"}, dtrain, num_boost_round=10
+        )
         test = xgb.DMatrix(X)
         predt_from_array = booster.inplace_predict(X)
         predt_from_dmatrix = booster.predict(test)
@@ -280,12 +327,12 @@ class TestGPUPredict:
     def test_shap(self, num_rounds, dataset, param):
         if dataset.name.endswith("-l1"):  # not supported by the exact tree method
             return
-        param.update({"tree_method": "gpu_hist", "gpu_id": 0})
+        param.update({"tree_method": "hist", "device": "gpu:0"})
         param = dataset.set_params(param)
         dmat = dataset.get_dmat()
         bst = xgb.train(param, dmat, num_rounds)
         test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin)
-        bst = tm.set_ordinal(0, bst)
+        bst.set_param({"device": "gpu:0"})
         shap = bst.predict(test_dmat, pred_contribs=True)
         margin = bst.predict(test_dmat, output_margin=True)
         assume(len(dataset.y) > 0)
@@ -298,12 +345,12 @@ class TestGPUPredict:
     def test_shap_interactions(self, num_rounds, dataset, param):
         if dataset.name.endswith("-l1"):  # not supported by the exact tree method
             return
-        param.update({"tree_method": "hist", "gpu_id": 0})
+        param.update({"tree_method": "hist", "device": "cuda:0"})
         param = dataset.set_params(param)
         dmat = dataset.get_dmat()
         bst = xgb.train(param, dmat, num_rounds)
         test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin)
-        bst = tm.set_ordinal(0, bst)
+        bst.set_param({"device": "cuda:0"})
         shap = bst.predict(test_dmat, pred_interactions=True)
         margin = bst.predict(test_dmat, output_margin=True)
         assume(len(dataset.y) > 0)
@@ -317,16 +364,18 @@ class TestGPUPredict:
     def test_shap_categorical(self):
         X, y = tm.make_categorical(100, 20, 7, False)
         Xy = xgb.DMatrix(X, y, enable_categorical=True)
-        booster = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=10)
+        booster = xgb.train(
+            {"tree_method": "hist", "device": "gpu:0"}, Xy, num_boost_round=10
+        )
 
-        booster = tm.set_ordinal(0, booster)
+        booster.set_param({"device": "cuda:0"})
         shap = booster.predict(Xy, pred_contribs=True)
         margin = booster.predict(Xy, output_margin=True)
         np.testing.assert_allclose(
             np.sum(shap, axis=len(shap.shape) - 1), margin, rtol=1e-3
         )
 
-        booster = tm.set_ordinal(-1, booster)
+        booster.set_param({"device": "cpu"})
         shap = booster.predict(Xy, pred_contribs=True)
         margin = booster.predict(Xy, output_margin=True)
         np.testing.assert_allclose(
@@ -334,8 +383,8 @@ class TestGPUPredict:
         )
 
     def test_predict_leaf_basic(self):
-        gpu_leaf = run_predict_leaf(0)
-        cpu_leaf = run_predict_leaf(-1)
+        gpu_leaf = run_predict_leaf("gpu:0")
+        cpu_leaf = run_predict_leaf("cpu")
         np.testing.assert_equal(gpu_leaf, cpu_leaf)
 
     def run_predict_leaf_booster(self, param, num_rounds, dataset):
@@ -344,23 +393,22 @@ class TestGPUPredict:
         booster = xgb.train(
             param, dtrain=dataset.get_dmat(), num_boost_round=num_rounds
         )
-        booster = tm.set_ordinal(-1, booster)
+        booster.set_param({"device": "cpu"})
         cpu_leaf = booster.predict(m, pred_leaf=True)
 
-        booster = tm.set_ordinal(0, booster)
+        booster.set_param({"device": "cuda:0"})
         gpu_leaf = booster.predict(m, pred_leaf=True)
 
         np.testing.assert_equal(cpu_leaf, gpu_leaf)
 
     @given(predict_parameter_strategy, tm.make_dataset_strategy())
     @settings(deadline=None, max_examples=20, print_blob=True)
-    def test_predict_leaf_gbtree(self, param, dataset):
+    def test_predict_leaf_gbtree(self, param: dict, dataset: tm.TestDataset) -> None:
         # Unsupported for random forest
         if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
             return
 
-        param["booster"] = "gbtree"
-        param["tree_method"] = "gpu_hist"
+        param.update({"booster": "gbtree", "tree_method": "hist", "device": "cuda:0"})
         self.run_predict_leaf_booster(param, 10, dataset)
 
     @given(predict_parameter_strategy, tm.make_dataset_strategy())
@@ -370,8 +418,7 @@ class TestGPUPredict:
         if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
             return
 
-        param["booster"] = "dart"
-        param["tree_method"] = "gpu_hist"
+        param.update({"booster": "dart", "tree_method": "hist", "device": "cuda:0"})
         self.run_predict_leaf_booster(param, 10, dataset)
 
     @pytest.mark.skipif(**tm.no_sklearn())
@@ -395,12 +442,12 @@ class TestGPUPredict:
         dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)
 
         params = {
-            "tree_method": "gpu_hist",
+            "tree_method": "hist",
             "max_depth": 3,
             "learning_rate": 1.0,
             "base_score": 0.0,
             "eval_metric": "rmse",
-            "gpu_id": "0",
+            "device": "cuda:0",
         }
 
         eval_history = {}
@@ -412,7 +459,7 @@ class TestGPUPredict:
             verbose_eval=False,
             evals_result=eval_history,
         )
-        bst = tm.set_ordinal(0, bst)
+        bst.set_param({"device": "cuda:0"})
         pred = bst.predict(dtrain)
         rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
         np.testing.assert_almost_equal(
@@ -434,14 +481,16 @@ class TestGPUPredict:
         Xy = xgb.DMatrix(X, y)
         if n_classes == 2:
             params = {
-                "tree_method": "gpu_hist",
+                "tree_method": "hist",
+                "device": "cuda:0",
                 "booster": "dart",
                 "rate_drop": 0.5,
                 "objective": "binary:logistic",
             }
         else:
             params = {
-                "tree_method": "gpu_hist",
+                "tree_method": "hist",
+                "device": "cuda:0",
                 "booster": "dart",
                 "rate_drop": 0.5,
                 "objective": "multi:softprob",
@@ -455,7 +504,7 @@ class TestGPUPredict:
         copied = booster.predict(Xy)
 
         # CPU
-        booster = tm.set_ordinal(-1, booster)
+        booster.set_param({"device": "cpu"})
         cpu_inplace = booster.inplace_predict(X_)
         cpu_copied = booster.predict(Xy)
 
@@ -465,7 +514,7 @@ class TestGPUPredict:
         cp.testing.assert_allclose(inplace, copied, atol=1e-6)
 
         # GPU
-        booster = tm.set_ordinal(0, booster)
+        booster.set_param({"device": "cuda:0"})
         inplace = booster.inplace_predict(X)
         copied = booster.predict(Xy)
 
@@ -482,7 +531,7 @@ class TestGPUPredict:
         orig = rng.randint(low=0, high=127, size=rows * cols).reshape(rows, cols)
         y = rng.randint(low=0, high=127, size=rows)
         dtrain = xgb.DMatrix(orig, label=y)
-        booster = xgb.train({"tree_method": "gpu_hist"}, dtrain)
+        booster = xgb.train({"tree_method": "hist", "device": "cuda:0"}, dtrain)
 
         predt_orig = booster.inplace_predict(orig)
         # all primitive types in numpy
diff --git a/tests/python/test_predict.py b/tests/python/test_predict.py
index 15288f53e..04a7d70cb 100644
--- a/tests/python/test_predict.py
+++ b/tests/python/test_predict.py
@@ -28,7 +28,7 @@ def run_threaded_predict(X, rows, predict_func):
         assert f.result()
 
 
-def run_predict_leaf(gpu_id: int) -> np.ndarray:
+def run_predict_leaf(device: str) -> np.ndarray:
     rows = 100
     cols = 4
     classes = 5
@@ -48,7 +48,7 @@ def run_predict_leaf(gpu_id: int) -> np.ndarray:
         num_boost_round=num_boost_round,
     )
 
-    booster = tm.set_ordinal(gpu_id, booster)
+    booster.set_param({"device": device})
     empty = xgb.DMatrix(np.ones(shape=(0, cols)))
     empty_leaf = booster.predict(empty, pred_leaf=True)
     assert empty_leaf.shape[0] == 0
@@ -74,14 +74,14 @@ def run_predict_leaf(gpu_id: int) -> np.ndarray:
 
     # When there's only 1 tree, the output is a 1 dim vector
     booster = xgb.train({"tree_method": "hist"}, num_boost_round=1, dtrain=m)
-    booster = tm.set_ordinal(gpu_id, booster)
+    booster.set_param({"device": device})
     assert booster.predict(m, pred_leaf=True).shape == (rows,)
 
     return leaf
 
 
 def test_predict_leaf() -> None:
-    run_predict_leaf(-1)
+    run_predict_leaf("cpu")
 
 
 def test_predict_shape():
diff --git a/tests/test_distributed/test_with_spark/test_data.py b/tests/test_distributed/test_with_spark/test_data.py
index b08fcdf1d..7f8f1a13e 100644
--- a/tests/test_distributed/test_with_spark/test_data.py
+++ b/tests/test_distributed/test_with_spark/test_data.py
@@ -69,7 +69,7 @@ def run_dmatrix_ctor(is_feature_cols: bool, is_qdm: bool, on_gpu: bool) -> None:
     train_Xy, valid_Xy = create_dmatrix_from_partitions(
         iter(dfs),
         feature_cols,
-        gpu_id=device_id,
+        dev_ordinal=device_id,
         use_qdm=is_qdm,
         kwargs=kwargs,
         enable_sparse_data_optim=False,
diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py
index 6d88323ac..dfdadb2ef 100644
--- a/tests/test_distributed/test_with_spark/test_spark_local.py
+++ b/tests/test_distributed/test_with_spark/test_spark_local.py
@@ -1025,6 +1025,7 @@ class XgboostLocalTest(SparkTestCase):
         self.assertTrue(hasattr(py_reg, "n_estimators"))
         self.assertEqual(py_reg.n_estimators.parent, py_reg.uid)
         self.assertFalse(hasattr(py_reg, "gpu_id"))
+        self.assertFalse(hasattr(py_reg, "device"))
         self.assertEqual(py_reg.getOrDefault(py_reg.n_estimators), 100)
         self.assertEqual(py_reg.getOrDefault(py_reg.objective), "reg:squarederror")
         py_reg2 = SparkXGBRegressor(n_estimators=200)
@@ -1038,6 +1039,7 @@ class XgboostLocalTest(SparkTestCase):
         self.assertTrue(hasattr(py_cls, "n_estimators"))
         self.assertEqual(py_cls.n_estimators.parent, py_cls.uid)
         self.assertFalse(hasattr(py_cls, "gpu_id"))
+        self.assertFalse(hasattr(py_cls, "device"))
         self.assertEqual(py_cls.getOrDefault(py_cls.n_estimators), 100)
         self.assertEqual(py_cls.getOrDefault(py_cls.objective), None)
         py_cls2 = SparkXGBClassifier(n_estimators=200)
@@ -1051,6 +1053,7 @@ class XgboostLocalTest(SparkTestCase):
         self.assertTrue(hasattr(py_cls, "n_estimators"))
         self.assertEqual(py_cls.n_estimators.parent, py_cls.uid)
         self.assertFalse(hasattr(py_cls, "gpu_id"))
+        self.assertFalse(hasattr(py_cls, "device"))
         self.assertTrue(hasattr(py_cls, "arbitrary_params_dict"))
         expected_kwargs = {"sketch_eps": 0.03}
         self.assertEqual(