Define the new device parameter. (#9362)

This commit is contained in:
Jiaming Yuan 2023-07-13 19:30:25 +08:00 committed by GitHub
parent 2d0cd2817e
commit 04aff3af8e
63 changed files with 827 additions and 477 deletions

View File

@ -15,4 +15,3 @@
address = {New York, NY, USA},
keywords = {large-scale machine learning},
}

View File

@ -22,7 +22,8 @@ Supported parameters
GPU accelerated prediction is enabled by default for the above mentioned ``tree_method`` parameters but can be switched to CPU prediction by setting ``predictor`` to ``cpu_predictor``. This could be useful if you want to conserve GPU memory. Likewise when using CPU algorithms, GPU accelerated prediction can be enabled by setting ``predictor`` to ``gpu_predictor``.
The device ordinal (which GPU to use if you have many of them) can be selected using the
``gpu_id`` parameter, which defaults to 0 (the first device reported by CUDA runtime).
``device`` parameter, which defaults to 0 when ``cuda`` is specified (the first device
reported by the CUDA runtime).
The GPU algorithms currently work with CLI, Python, R, and JVM packages. See :doc:`/install` for details.
@ -30,13 +31,13 @@ The GPU algorithms currently work with CLI, Python, R, and JVM packages. See :do
.. code-block:: python
:caption: Python example
param['gpu_id'] = 0
param["device"] = "cuda:0"
param["tree_method"] = "gpu_hist"
.. code-block:: python
:caption: With Scikit-Learn interface
XGBRegressor(tree_method='gpu_hist', gpu_id=0)
XGBRegressor(tree_method="gpu_hist", device="cuda")
GPU-Accelerated SHAP values
@ -45,7 +46,7 @@ XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as
.. code-block:: python
model.set_param({"gpu_id": "0", "tree_method": "gpu_hist"})
model.set_param({"device": "cuda:0", "tree_method": "gpu_hist"})
shap_values = model.predict(dtrain, pred_contribs=True)
shap_interaction_values = model.predict(dtrain, pred_interactions=True)

View File

@ -3,10 +3,10 @@ Installation Guide
##################
XGBoost provides binary packages for some language bindings. The binary packages support
the GPU algorithm (``gpu_hist``) on machines with NVIDIA GPUs. Please note that **training
with multiple GPUs is only supported for Linux platform**. See :doc:`gpu/index`. Also we
have both stable releases and nightly builds, see below for how to install them. For
building from source, visit :doc:`this page </build>`.
the GPU algorithm (``device=cuda:0``) on machines with NVIDIA GPUs. Please note that
**training with multiple GPUs is only supported on Linux**. See :doc:`gpu/index`. We
also provide both stable releases and nightly builds; see below for how to install
them. For building from source, visit :doc:`this page </build>`.
.. contents:: Contents

View File

@ -59,6 +59,18 @@ General Parameters
- Feature dimension used in boosting, set to maximum dimension of the feature
* ``device`` [default= ``cpu``]
.. versionadded:: 2.0.0
- The device for XGBoost to run on. Users can set it to one of the following values (a short example follows the list):
+ ``cpu``: Use CPU.
+ ``cuda``: Use a GPU (CUDA device).
+ ``cuda:<ordinal>``: ``<ordinal>`` is an integer that specifies the ordinal of the GPU (which GPU to use if you have more than one device).
+ ``gpu``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently.
+ ``gpu:<ordinal>``: Same as ``cuda:<ordinal>``; the ``gpu`` prefix is resolved against the list of available and supported devices. Only ``cuda`` devices are supported currently.
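For illustration, a minimal sketch of selecting a device through this parameter (``X`` and ``y`` are hypothetical NumPy arrays; the second call assumes a CUDA-enabled build):
.. code-block:: python
import numpy as np
import xgboost as xgb
X, y = np.random.rand(128, 8), np.random.rand(128)
dtrain = xgb.DMatrix(X, label=y)
# Train on the CPU (the default).
xgb.train({"device": "cpu"}, dtrain, num_boost_round=4)
# Train on the first CUDA device; plain "cuda" also defaults to ordinal 0.
xgb.train({"device": "cuda:0"}, dtrain, num_boost_round=4)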
Parameters for Tree Booster
===========================
* ``eta`` [default=0.3, alias: ``learning_rate``]
@ -99,7 +111,7 @@ Parameters for Tree Booster
- ``gradient_based``: the selection probability for each training instance is proportional to the
*regularized absolute value* of gradients (more specifically, :math:`\sqrt{g^2+\lambda h^2}`).
``subsample`` may be set to as low as 0.1 without loss of model accuracy. Note that this
sampling method is only supported when ``tree_method`` is set to ``gpu_hist``; other tree
sampling method is only supported when ``tree_method`` is set to ``hist`` and the device is
``cuda``; other tree methods only support ``uniform`` sampling. See the sketch below.
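A minimal configuration sketch (``sampling_method`` is the enclosing parameter for these choices, assumed from context; a CUDA device is required):
.. code-block:: python
params = {
    "device": "cuda",
    "tree_method": "hist",
    "sampling_method": "gradient_based",
    # With gradient-based sampling, subsample can be much lower than with
    # uniform sampling.
    "subsample": 0.1,
}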
* ``colsample_bytree``, ``colsample_bylevel``, ``colsample_bynode`` [default=1]
@ -131,26 +143,15 @@ Parameters for Tree Booster
* ``tree_method`` string [default= ``auto``]
- The tree construction algorithm used in XGBoost. See description in the `reference paper <http://arxiv.org/abs/1603.02754>`_ and :doc:`treemethod`.
- XGBoost supports ``approx``, ``hist`` and ``gpu_hist`` for distributed training. Experimental support for external memory is available for ``approx`` and ``gpu_hist``.
- Choices: ``auto``, ``exact``, ``approx``, ``hist``, ``gpu_hist``, this is a
combination of commonly used updaters. For other updaters like ``refresh``, set the
parameter ``updater`` directly.
- Choices: ``auto``, ``exact``, ``approx``, ``hist``; this is a combination of commonly
used updaters. For other updaters like ``refresh``, set the parameter ``updater``
directly.
- ``auto``: Use heuristic to choose the fastest method.
- For small dataset, exact greedy (``exact``) will be used.
- For larger dataset, approximate algorithm (``approx``) will be chosen. It's
recommended to try ``hist`` and ``gpu_hist`` for higher performance with large
dataset.
(``gpu_hist``) has support for ``external memory``.
- Because old behavior is always use exact greedy in single machine, user will get a
message when approximate algorithm is chosen to notify this choice.
- ``auto``: Same as the ``hist`` tree method.
- ``exact``: Exact greedy algorithm. Enumerates all split candidates.
- ``approx``: Approximate greedy algorithm using quantile sketch and gradient histogram.
- ``hist``: Faster histogram optimized approximate greedy algorithm.
- ``gpu_hist``: GPU implementation of ``hist`` algorithm.
* ``scale_pos_weight`` [default=1]
@ -163,7 +164,7 @@ Parameters for Tree Booster
- ``grow_colmaker``: non-distributed column-based construction of trees.
- ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting.
- ``grow_quantile_histmaker``: Grow tree using quantized histogram.
- ``grow_gpu_hist``: Grow tree with GPU.
- ``grow_gpu_hist``: Grow tree with GPU. Same as setting ``tree_method`` to ``hist`` and ``device`` to ``cuda``.
- ``sync``: synchronizes trees in all distributed nodes.
- ``refresh``: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed.
- ``prune``: prunes the splits where loss < min_split_loss (or gamma) and nodes that have depth greater than ``max_depth``.
@ -183,7 +184,7 @@ Parameters for Tree Booster
* ``grow_policy`` [default= ``depthwise``]
- Controls a way new nodes are added to the tree.
- Currently supported only if ``tree_method`` is set to ``hist``, ``approx`` or ``gpu_hist``.
- Currently supported only if ``tree_method`` is set to ``hist`` or ``approx``.
- Choices: ``depthwise``, ``lossguide``
- ``depthwise``: split at nodes closest to the root.
@ -195,7 +196,7 @@ Parameters for Tree Booster
* ``max_bin``, [default=256]
- Only used if ``tree_method`` is set to ``hist``, ``approx`` or ``gpu_hist``.
- Only used if ``tree_method`` is set to ``hist`` or ``approx``.
- Maximum number of discrete bins to bucket continuous features.
- Increasing this number improves the optimality of splits at the cost of higher computation time.

View File

@ -3,14 +3,14 @@ Tree Methods
############
For training boosted tree models, there are 2 parameters used for choosing algorithms,
namely ``updater`` and ``tree_method``. XGBoost has 4 builtin tree methods, namely
``exact``, ``approx``, ``hist`` and ``gpu_hist``. Along with these tree methods, there
are also some free standing updaters including ``refresh``,
``prune`` and ``sync``. The parameter ``updater`` is more primitive than ``tree_method``
as the latter is just a pre-configuration of the former. The difference is mostly due to
historical reasons that each updater requires some specific configurations and might has
missing features. As we are moving forward, the gap between them is becoming more and
more irrelevant. We will collectively document them under tree methods.
namely ``updater`` and ``tree_method``. XGBoost has 3 builtin tree methods, namely
``exact``, ``approx`` and ``hist``. Along with these tree methods, there are also some
free standing updaters including ``refresh``, ``prune`` and ``sync``. The parameter
``updater`` is more primitive than ``tree_method``, as the latter is just a
pre-configuration of the former. The difference is mostly due to historical reasons:
each updater requires some specific configurations and might have missing features. As
we move forward, the gap between them becomes less and less relevant. We will
collectively document them under tree methods.
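To make the relation concrete, the sketch below shows two roughly equivalent configurations, using the updater name documented in :doc:`parameter`:
.. code-block:: python
# High-level interface: a pre-configured combination of updaters.
params_high = {"tree_method": "hist"}
# Low-level interface: name the updater directly.
params_low = {"updater": "grow_quantile_histmaker"}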
**************
Exact Solution
@ -19,23 +19,23 @@ Exact Solution
Exact means XGBoost considers all candidates from data for tree splitting, but underlying
the objective is still interpreted as a Taylor expansion.
1. ``exact``: Vanilla gradient boosting tree algorithm described in `reference paper
<http://arxiv.org/abs/1603.02754>`_. During each split finding procedure, it iterates
over all entries of input data. It's more accurate (among other greedy methods) but
slow in computation performance. Also it doesn't support distributed training as
XGBoost employs row splitting data distribution while ``exact`` tree method works on a
sorted column format. This tree method can be used with parameter ``tree_method`` set
to ``exact``.
1. ``exact``: The vanilla gradient boosting tree algorithm described in `reference paper
<http://arxiv.org/abs/1603.02754>`_. During split-finding, it iterates over all
entries of input data. It's more accurate (among other greedy methods) but
computationally slower compared to other tree methods. Furthermore, its feature
set is limited. Features like distributed training and external memory that require
approximated quantiles are not supported. This tree method can be used with the
parameter ``tree_method`` set to ``exact``.
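A minimal sketch with synthetic data:
.. code-block:: python
import numpy as np
import xgboost as xgb
X, y = np.random.rand(256, 8), np.random.rand(256)
booster = xgb.train({"tree_method": "exact"}, xgb.DMatrix(X, label=y), num_boost_round=4)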
**********************
Approximated Solutions
**********************
As ``exact`` tree method is slow in performance and not scalable, we often employ
approximated training algorithms. These algorithms build a gradient histogram for each
node and iterate through the histogram instead of real dataset. Here we introduce the
implementations in XGBoost below.
As the ``exact`` tree method is slow in computation and difficult to scale, we often
employ approximated training algorithms. These algorithms build a gradient histogram
for each node and iterate through the histogram instead of the real dataset. Here we
introduce the implementations in XGBoost.
1. ``approx`` tree method: An approximation tree method described in `reference paper
<http://arxiv.org/abs/1603.02754>`_. It runs sketching before building each tree
@ -48,22 +48,18 @@ implementations in XGBoost below.
this global sketch. This is the fastest algorithm as it runs sketching only once. The
algorithm can be accessed by setting ``tree_method`` to ``hist``.
3. ``gpu_hist`` tree method: The ``gpu_hist`` tree method is a GPU implementation of
``hist``, with additional support for gradient based sampling. The algorithm can be
accessed by setting ``tree_method`` to ``gpu_hist``.
************
Implications
************
Some objectives like ``reg:squarederror`` have constant hessian. In this case, ``hist``
or ``gpu_hist`` should be preferred as weighted sketching doesn't make sense with constant
Some objectives like ``reg:squarederror`` have a constant hessian. In this case,
``hist`` should be preferred, as weighted sketching doesn't make sense with constant
weights. When using non-constant hessian objectives, sometimes ``approx`` yields better
accuracy, but with slower computation performance. Most of the time using ``(gpu)_hist``
with higher ``max_bin`` can achieve similar or even superior accuracy while maintaining
good performance. However, as xgboost is largely driven by community effort, the actual
implementations have some differences than pure math description. Result might have
slight differences than expectation, which we are currently trying to overcome.
accuracy, but with slower computation performance. Most of the time, using ``hist`` with
a higher ``max_bin`` can achieve similar or even superior accuracy while maintaining good
performance. However, as XGBoost is largely driven by community effort, the actual
implementations have some differences from the pure mathematical description. Results
might differ slightly from expectations, which we are currently trying to overcome.
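A sketch of the ``max_bin`` trade-off mentioned above (the value is illustrative only):
.. code-block:: python
# Trade computation time for accuracy by increasing the number of histogram bins.
params = {"tree_method": "hist", "max_bin": 512}  # the default is 256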
**************
Other Updaters
@ -106,8 +102,8 @@ solely for the interest of documentation.
histogram creation step and uses sketching values directly during split evaluation. It
was never tested and contained some unknown bugs, so we decided to remove it and focus our
resources on more promising algorithms instead. For accuracy, most of the time
``approx``, ``hist`` and ``gpu_hist`` are enough with some parameters tuning, so
removing them don't have any real practical impact.
``approx`` and ``hist`` are enough with some parameter tuning, so removing them doesn't
have any real practical impact.
3. ``grow_local_histmaker`` updater: An approximation tree method described in `reference
paper <http://arxiv.org/abs/1603.02754>`_. This updater was rarely used in practice so

View File

@ -149,7 +149,7 @@ Also for inplace prediction:
.. code-block:: python
# where X is a dask DataFrame or dask Array backed by cupy or cuDF.
booster.set_param({"gpu_id": "0"})
booster.set_param({"device": "cuda:0"})
prediction = xgb.dask.inplace_predict(client, booster, X)
When input is ``da.Array`` object, output is always ``da.Array``. However, if the input

View File

@ -163,7 +163,7 @@ Will print out something similar to (not actual output as it's too long for demo
{
"Learner": {
"generic_parameter": {
"gpu_id": "0",
"device": "cuda:0",
"gpu_page_size": "0",
"n_jobs": "0",
"random_state": "0",

View File

@ -119,7 +119,7 @@ using bst_group_t = std::uint32_t; // NOLINT
*/
using bst_target_t = std::uint32_t; // NOLINT
/**
* brief Type for indexing boosted layers.
* @brief Type for indexing boosted layers.
*/
using bst_layer_t = std::int32_t; // NOLINT
/**

View File

@ -12,12 +12,18 @@
#include <cstdint> // for int16_t, int32_t, int64_t
#include <memory> // for shared_ptr
#include <string> // for string, to_string
#include <type_traits> // for invoke_result_t, is_same_v
#include <type_traits> // for invoke_result_t, is_same_v, underlying_type_t
namespace xgboost {
struct CUDAContext;
// symbolic names
struct DeviceSym {
static auto constexpr CPU() { return "cpu"; }
static auto constexpr CUDA() { return "cuda"; }
};
/**
* @brief A type for device ordinal. The type is packed into 32-bit for efficient use in
* viewing types like `linalg::TensorView`.
@ -59,9 +65,9 @@ struct DeviceOrd {
[[nodiscard]] std::string Name() const {
switch (device) {
case DeviceOrd::kCPU:
return "CPU";
return DeviceSym::CPU();
case DeviceOrd::kCUDA:
return "CUDA:" + std::to_string(ordinal);
return DeviceSym::CUDA() + (':' + std::to_string(ordinal));
default: {
LOG(FATAL) << "Unknown device.";
return "";
@ -76,26 +82,39 @@ static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t));
* @brief Runtime context for XGBoost. Contains information like threads and device.
*/
struct Context : public XGBoostParameter<Context> {
private:
std::string device{DeviceSym::CPU()}; // NOLINT
// The device object for the current context. We are in the middle of replacing the
// `gpu_id` with this device field.
DeviceOrd device_{DeviceOrd::CPU()};
public:
// Constant representing the device ID of CPU.
static std::int32_t constexpr kCpuId = -1;
static bst_d_ordinal_t constexpr kCpuId = -1;
static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; }
static std::int64_t constexpr kDefaultSeed = 0;
public:
Context();
template <typename Container>
Args UpdateAllowUnknown(Container const& kwargs) {
auto args = XGBoostParameter<Context>::UpdateAllowUnknown(kwargs);
this->SetDeviceOrdinal(kwargs);
return args;
}
std::int32_t gpu_id{kCpuId};
// The number of threads to use if OpenMP is enabled. If equals 0, use the system default.
std::int32_t nthread{0}; // NOLINT
// stored random seed
std::int64_t seed{kDefaultSeed};
// whether seed the PRNG each iteration
bool seed_per_iteration{false};
// number of threads to use if OpenMP is enabled
// if equals 0, use system default
std::int32_t nthread{0};
// primary device, -1 means no gpu.
std::int32_t gpu_id{kCpuId};
// fail when gpu_id is invalid
bool fail_on_invalid_gpu_id{false};
bool validate_parameters{false};
/**
* @brief Configure the parameter `gpu_id'.
*
@ -111,21 +130,19 @@ struct Context : public XGBoostParameter<Context> {
/**
* @brief Is XGBoost running on CPU?
*/
[[nodiscard]] bool IsCPU() const { return gpu_id == kCpuId; }
[[nodiscard]] bool IsCPU() const { return Device().IsCPU(); }
/**
* @brief Is XGBoost running on a CUDA device?
*/
[[nodiscard]] bool IsCUDA() const { return !IsCPU(); }
[[nodiscard]] bool IsCUDA() const { return Device().IsCUDA(); }
/**
* @brief Get the current device and ordinal.
*/
[[nodiscard]] DeviceOrd Device() const {
return IsCPU() ? DeviceOrd::CPU() : DeviceOrd::CUDA(static_cast<bst_d_ordinal_t>(gpu_id));
}
[[nodiscard]] DeviceOrd Device() const { return device_; }
/**
* @brief Get the CUDA device ordinal. -1 if XGBoost is running on CPU.
*/
[[nodiscard]] bst_d_ordinal_t Ordinal() const { return this->gpu_id; }
[[nodiscard]] bst_d_ordinal_t Ordinal() const { return Device().ordinal; }
/**
* @brief Name of the current device.
*/
@ -134,24 +151,22 @@ struct Context : public XGBoostParameter<Context> {
* @brief Get a CUDA device context for allocator and stream.
*/
[[nodiscard]] CUDAContext const* CUDACtx() const;
/**
* @brief Make a CUDA context based on the current context.
*
* @param ordinal The CUDA device ordinal.
*/
[[nodiscard]] Context MakeCUDA(std::int32_t ordinal = 0) const {
[[nodiscard]] Context MakeCUDA(bst_d_ordinal_t ordinal = 0) const {
Context ctx = *this;
CHECK_GE(ordinal, 0);
ctx.gpu_id = ordinal;
return ctx;
return ctx.SetDevice(DeviceOrd::CUDA(ordinal));
}
/**
* @brief Make a CPU context based on the current context.
*/
[[nodiscard]] Context MakeCPU() const {
Context ctx = *this;
ctx.gpu_id = kCpuId;
return ctx;
return ctx.SetDevice(DeviceOrd::CPU());
}
/**
* @brief Call function based on the current device.
@ -167,7 +182,8 @@ struct Context : public XGBoostParameter<Context> {
default:
// Do not use the device name as this is likely an internal error, the name
// wouldn't be valid.
LOG(FATAL) << "Unknown device type:" << static_cast<std::int16_t>(this->Device().device);
LOG(FATAL) << "Unknown device type:"
<< static_cast<std::underlying_type_t<DeviceOrd::Type>>(this->Device().device);
break;
}
return std::invoke_result_t<CPUFn>();
@ -182,11 +198,9 @@ struct Context : public XGBoostParameter<Context> {
DMLC_DECLARE_FIELD(seed_per_iteration)
.set_default(false)
.describe("Seed PRNG determnisticly via iterator number.");
DMLC_DECLARE_FIELD(device).set_default(DeviceSym::CPU()).describe("Device ordinal.");
DMLC_DECLARE_FIELD(nthread).set_default(0).describe("Number of threads to use.");
DMLC_DECLARE_ALIAS(nthread, n_jobs);
DMLC_DECLARE_FIELD(gpu_id).set_default(-1).set_lower_bound(-1).describe(
"The primary GPU device ordinal.");
DMLC_DECLARE_FIELD(fail_on_invalid_gpu_id)
.set_default(false)
.describe("Fail with error when gpu_id is invalid.");
@ -196,6 +210,14 @@ struct Context : public XGBoostParameter<Context> {
}
private:
void SetDeviceOrdinal(Args const& kwargs);
Context& SetDevice(DeviceOrd d) {
this->device_ = d;
this->gpu_id = d.ordinal; // this can be removed once we move away from `gpu_id`.
this->device = d.Name();
return *this;
}
// mutable for lazy cuda context initialization. This avoids initializing CUDA at load.
// shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define
// p_impl while trying to hide CUDA code from the host compiler.

View File

@ -664,11 +664,11 @@ Object ToJson(Parameter const& param) {
template <typename Parameter>
Args FromJson(Json const& obj, Parameter* param) {
auto const& j_param = get<Object const>(obj);
std::map<std::string, std::string> m;
Args args;
for (auto const& kv : j_param) {
m[kv.first] = get<String const>(kv.second);
args.emplace_back(kv.first, get<String const>(kv.second));
}
return param->UpdateAllowUnknown(m);
return param->UpdateAllowUnknown(args);
}
} // namespace xgboost
#endif // XGBOOST_JSON_H_

View File

@ -110,15 +110,10 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
* \param approx_contribs whether to approximate the feature contributions for speed
* \param pred_interactions whether to compute the feature pair contributions
*/
virtual void Predict(std::shared_ptr<DMatrix> data,
bool output_margin,
HostDeviceVector<bst_float> *out_preds,
unsigned layer_begin,
unsigned layer_end,
bool training = false,
bool pred_leaf = false,
bool pred_contribs = false,
bool approx_contribs = false,
virtual void Predict(std::shared_ptr<DMatrix> data, bool output_margin,
HostDeviceVector<bst_float>* out_preds, bst_layer_t layer_begin,
bst_layer_t layer_end, bool training = false, bool pred_leaf = false,
bool pred_contribs = false, bool approx_contribs = false,
bool pred_interactions = false) = 0;
/*!
@ -132,8 +127,8 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
* \param layer_end End of booster layer. 0 means do not limit trees.
*/
virtual void InplacePredict(std::shared_ptr<DMatrix> p_m, PredictionType type, float missing,
HostDeviceVector<bst_float>** out_preds, uint32_t layer_begin,
uint32_t layer_end) = 0;
HostDeviceVector<float>** out_preds, bst_layer_t layer_begin,
bst_layer_t layer_end) = 0;
/*!
* \brief Calculate feature score. See doc in C API for outputs.

View File

@ -39,9 +39,8 @@ struct PredictionCacheEntry {
*
* \param v Added versions.
*/
void Update(std::uint32_t v) {
version += v;
}
void Update(std::uint32_t v) { version += v; }
void Reset() { version = 0; }
};
/**

View File

@ -280,7 +280,7 @@ object GpuPreXGBoost extends PreXGBoostProvider {
// - gpu id
// - predictor: Force to gpu predictor since native doesn't save predictor.
val gpuId = if (!isLocal) XGBoost.getGPUAddrFromResources else 0
booster.setParam("gpu_id", gpuId.toString)
booster.setParam("device", s"cuda:$gpuId")
logger.info("GPU transform on device: " + gpuId)
boosterFlag.isGpuParamsSet = true;
}

View File

@ -326,7 +326,7 @@ object XGBoost extends Serializable {
getGPUAddrFromResources
}
logger.info("Leveraging gpu device " + gpuId + " to train")
params = params + ("gpu_id" -> gpuId)
params = params + ("device" -> s"cuda:$gpuId")
}
val booster = if (makeCheckpoint) {
SXGBoost.trainAndSaveCheckpoint(

View File

@ -1393,13 +1393,13 @@ class _ProxyDMatrix(DMatrix):
class QuantileDMatrix(DMatrix):
"""A DMatrix variant that generates quantilized data directly from input for
``hist`` and ``gpu_hist`` tree methods. This DMatrix is primarily designed to save
memory in training by avoiding intermediate storage. Set ``max_bin`` to control the
number of bins during quantisation, which should be consistent with the training
parameter ``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset,
``ref`` should be another ``QuantileDMatrix``(or ``DMatrix``, but not recommended as
it defeats the purpose of saving memory) constructed from training dataset. See
"""A DMatrix variant that generates quantilized data directly from input for the
``hist`` tree method. This DMatrix is primarily designed to save memory in training
by avoiding intermediate storage. Set ``max_bin`` to control the number of bins
during quantisation, which should be consistent with the training parameter
``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset, ``ref``
should be another ``QuantileDMatrix``(or ``DMatrix``, but not recommended as it
defeats the purpose of saving memory) constructed from training dataset. See
:py:obj:`xgboost.DMatrix` for documents on meta info.
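A minimal sketch of the ``ref`` usage described above, where ``X_train``/``X_valid`` are hypothetical arrays:
.. code-block:: python
Xy_train = xgboost.QuantileDMatrix(X_train, label=y_train, max_bin=256)
# Reuse the quantile information from the training data.
Xy_valid = xgboost.QuantileDMatrix(X_valid, label=y_valid, ref=Xy_train)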
.. note::
@ -2277,10 +2277,10 @@ class Booster:
.. code-block:: python
booster.set_param({"gpu_id": "0", "tree_method": "gpu_hist"})
booster.set_param({"device": "cuda:0"})
booster.inplace_predict(cupy_array)
booster.set_param({"gpu_id": "-1", "tree_method": "hist"})
booster.set_param({"device": "cpu"})
booster.inplace_predict(numpy_array)
.. versionadded:: 1.1.0
@ -2311,8 +2311,8 @@ class Booster:
Returns
-------
prediction : numpy.ndarray/cupy.ndarray
The prediction result. When input data is on GPU, prediction
result is stored in a cupy array.
The prediction result. When input data is on GPU, prediction result is
stored in a cupy array.
"""
preds = ctypes.POINTER(ctypes.c_float)()

View File

@ -273,7 +273,7 @@ __model_doc = f"""
* For linear model, only "weight" is defined and it's the normalized coefficients
without bias.
gpu_id : Optional[int]
device : Optional[str]
Device ordinal or device type, e.g. ``cpu``, ``cuda``, ``cuda:0``.
validate_parameters : Optional[bool]
Give warnings for unknown parameter.
@ -647,7 +647,7 @@ class XGBModel(XGBModelBase):
monotone_constraints: Optional[Union[Dict[str, int], str]] = None,
interaction_constraints: Optional[Union[str, Sequence[Sequence[str]]]] = None,
importance_type: Optional[str] = None,
gpu_id: Optional[int] = None,
device: Optional[str] = None,
validate_parameters: Optional[bool] = None,
enable_categorical: bool = False,
feature_types: Optional[FeatureTypes] = None,
@ -693,7 +693,7 @@ class XGBModel(XGBModelBase):
self.monotone_constraints = monotone_constraints
self.interaction_constraints = interaction_constraints
self.importance_type = importance_type
self.gpu_id = gpu_id
self.device = device
self.validate_parameters = validate_parameters
self.enable_categorical = enable_categorical
self.feature_types = feature_types

View File

@ -1,4 +1,4 @@
"""Xgboost pyspark integration submodule for core code."""
"""XGBoost pyspark integration submodule for core code."""
import base64
# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
@ -133,6 +133,7 @@ _inverse_pyspark_param_alias_map = {v: k for k, v in _pyspark_param_alias_map.it
_unsupported_xgb_params = [
"gpu_id", # we have "use_gpu" pyspark param instead.
"device", # we have "use_gpu" pyspark param instead.
"enable_categorical", # Use feature_types param to specify categorical feature instead
"use_label_encoder",
"n_jobs", # Do not allow user to set it, will use `spark.task.cpus` value instead.
@ -899,12 +900,14 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
context = BarrierTaskContext.get()
gpu_id = None
dev_ordinal = None
use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist")
if use_gpu:
gpu_id = context.partitionId() if is_local else _get_gpu_id(context)
booster_params["gpu_id"] = gpu_id
dev_ordinal = (
context.partitionId() if is_local else _get_gpu_id(context)
)
booster_params["device"] = "cuda:" + str(dev_ordinal)
# If cuDF is not installed, then using DMatrix instead of QDM,
# because without cuDF, DMatrix performs better than QDM.
# Note: Checking `is_cudf_available` in spark worker side because
@ -945,7 +948,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
dtrain, dvalid = create_dmatrix_from_partitions(
pandas_df_iter,
feature_prop.features_cols_names,
gpu_id,
dev_ordinal,
use_qdm,
dmatrix_kwargs,
enable_sparse_data_optim=feature_prop.enable_sparse_data_optim,

View File

@ -157,7 +157,7 @@ def _read_csr_matrix_from_unwrapped_spark_vec(part: pd.DataFrame) -> csr_matrix:
def make_qdm(
data: Dict[str, List[np.ndarray]],
gpu_id: Optional[int],
dev_ordinal: Optional[int],
meta: Dict[str, Any],
ref: Optional[DMatrix],
params: Dict[str, Any],
@ -165,7 +165,7 @@ def make_qdm(
"""Handle empty partition for QuantileDMatrix."""
if not data:
return QuantileDMatrix(np.empty((0, 0)), ref=ref)
it = PartIter(data, gpu_id, **meta)
it = PartIter(data, dev_ordinal, **meta)
m = QuantileDMatrix(it, **params, ref=ref)
return m
@ -173,7 +173,7 @@ def make_qdm(
def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
iterator: Iterator[pd.DataFrame],
feature_cols: Optional[Sequence[str]],
gpu_id: Optional[int],
dev_ordinal: Optional[int],
use_qdm: bool,
kwargs: Dict[str, Any], # use dict to make sure this parameter is passed.
enable_sparse_data_optim: bool,
@ -187,7 +187,7 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
Pyspark partition iterator.
feature_cols:
A sequence of feature names, used only when rapids plugin is enabled.
gpu_id:
dev_ordinal:
Device ordinal, used when GPU is enabled.
use_qdm :
Whether QuantileDMatrix should be used instead of DMatrix.
@ -304,13 +304,13 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
if feature_cols is not None and use_qdm:
cache_partitions(iterator, append_fn)
dtrain: DMatrix = make_qdm(train_data, gpu_id, meta, None, params)
dtrain: DMatrix = make_qdm(train_data, dev_ordinal, meta, None, params)
elif feature_cols is not None and not use_qdm:
cache_partitions(iterator, append_fn)
dtrain = make(train_data, kwargs)
elif feature_cols is None and use_qdm:
cache_partitions(iterator, append_fn)
dtrain = make_qdm(train_data, gpu_id, meta, None, params)
dtrain = make_qdm(train_data, dev_ordinal, meta, None, params)
else:
cache_partitions(iterator, append_fn)
dtrain = make(train_data, kwargs)
@ -324,7 +324,7 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
if has_validation_col:
if use_qdm:
dvalid: Optional[DMatrix] = make_qdm(
valid_data, gpu_id, meta, dtrain, params
valid_data, dev_ordinal, meta, dtrain, params
)
else:
dvalid = make(valid_data, kwargs) if has_validation_col else None

View File

@ -78,8 +78,7 @@ def _set_pyspark_xgb_cls_param_attrs(
class SparkXGBRegressor(_SparkXGBEstimator):
"""
SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost regression
"""SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost regression
algorithm based on XGBoost python library, and it can be used in PySpark Pipeline
and PySpark ML meta algorithms like :py:class:`~pyspark.ml.tuning.CrossValidator`/
:py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
@ -89,8 +88,8 @@ class SparkXGBRegressor(_SparkXGBEstimator):
:py:class:`xgboost.XGBRegressor` constructor and most of the parameters used in
:py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict` method.
SparkXGBRegressor doesn't support setting `gpu_id` but support another param `use_gpu`,
see doc below for more details.
SparkXGBRegressor doesn't support setting `device` but supports another param
`use_gpu`; see doc below for more details.
SparkXGBRegressor doesn't support setting `base_margin` explicitly as well, but supports
another param called `base_margin_col`; see doc below for more details.
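A minimal sketch of GPU training through `use_gpu` (the column names and ``train_df`` are hypothetical):
.. code-block:: python
from xgboost.spark import SparkXGBRegressor
regressor = SparkXGBRegressor(
    features_col="features",
    label_col="label",
    use_gpu=True,
    num_workers=2,
)
model = regressor.fit(train_df)  # train_df: a Spark DataFrame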
@ -247,8 +246,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
:py:class:`xgboost.XGBClassifier` constructor and most of the parameters used in
:py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict` method.
SparkXGBClassifier doesn't support setting `gpu_id` but support another param `use_gpu`,
see doc below for more details.
SparkXGBClassifier doesn't support setting `device` but supports another param
`use_gpu`; see doc below for more details.
SparkXGBClassifier doesn't support setting `base_margin` explicitly as well, but supports
another param called `base_margin_col`; see doc below for more details.
@ -423,7 +422,7 @@ class SparkXGBRanker(_SparkXGBEstimator):
:py:class:`xgboost.XGBRanker` constructor and most of the parameters used in
:py:meth:`xgboost.XGBRanker.fit` and :py:meth:`xgboost.XGBRanker.predict` method.
SparkXGBRanker doesn't support setting `gpu_id` but support another param `use_gpu`,
SparkXGBRanker doesn't support setting `device` but supports another param `use_gpu`;
see doc below for more details.
SparkXGBRanker doesn't support setting `base_margin` explicitly as well, but supports

View File

@ -723,24 +723,6 @@ def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool:
M = TypeVar("M", xgb.Booster, xgb.XGBModel)
def set_ordinal(ordinal: int, booster: M) -> M:
"""Temporary solution for setting the device ordinal until we move away from
`gpu_id`.
"""
if ordinal < 0:
params = {"gpu_id": -1, "tree_method": "hist"}
else:
params = {"gpu_id": ordinal, "tree_method": "gpu_hist"}
if isinstance(booster, xgb.Booster):
booster.set_param(params)
elif isinstance(booster, xgb.XGBModel):
booster.set_params(**params)
return booster
def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]:
"""Evaluation metric for xgb.train"""
label = dtrain.get_label()

View File

@ -117,10 +117,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
RequiredArg<Integer>(config, "iteration_begin", __func__),
RequiredArg<Integer>(config, "iteration_end", __func__));
CHECK(p_predt);
if (learner->Ctx()->IsCPU()) {
// Prediction using DMatrix as fallback.
CHECK(p_predt->HostCanRead() && !p_predt->DeviceCanRead());
} else {
if (learner->Ctx()->IsCUDA()) {
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
}
p_predt->SetDevice(proxy->DeviceIdx());

View File

@ -3,23 +3,18 @@
*/
#include "error_msg.h"
#include "../collective/communicator-inl.h" // for GetRank
#include "xgboost/logging.h"
namespace xgboost::error {
void WarnDeprecatedGPUHist() {
bool static thread_local logged{false};
if (logged) {
return;
}
auto msg =
"The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` "
R"(parameter to CUDA instead.
E.g. tree_method = "hist", device = "CUDA"
)";
LOG(WARNING) << msg;
logged = true;
}
void WarnManualUpdater() {
@ -33,4 +28,23 @@ void WarnManualUpdater() {
"behavior. For common uses, we recommend using `tree_method` parameter instead.";
logged = true;
}
void WarnDeprecatedGPUId() {
static thread_local bool logged{false};
if (logged) {
return;
}
LOG(WARNING) << "`gpu_id` is deprecated in favor of the new `device` parameter: "
<< "device = cpu/cuda/cuda:0";
logged = true;
}
void WarnEmptyDataset() {
static thread_local bool logged{false};
if (logged) {
return;
}
LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
logged = true;
}
} // namespace xgboost::error
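For downstream users, the migration implied by these warnings is roughly the following Python sketch (not part of this file):
# Deprecated since 2.0.0:
params = {"gpu_id": 0, "tree_method": "gpu_hist"}
# Replacement:
params = {"device": "cuda:0", "tree_method": "hist"}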

View File

@ -82,5 +82,9 @@ inline void WarnOldSerialization() {
void WarnDeprecatedGPUHist();
void WarnManualUpdater();
void WarnDeprecatedGPUId();
void WarnEmptyDataset();
} // namespace xgboost::error
#endif // XGBOOST_COMMON_ERROR_MSG_H_

View File

@ -3,53 +3,201 @@
*
* \brief Context object used for controlling runtime parameters.
*/
#include <xgboost/context.h>
#include "xgboost/context.h"
#include "common/common.h" // AssertGPUSupport
#include <algorithm> // for find_if
#include <charconv> // for from_chars
#include <iterator> // for distance
#include <optional> // for optional
#include <regex> // for regex_replace, regex_match
#include "common/common.h" // AssertGPUSupport
#include "common/error_msg.h" // WarnDeprecatedGPUId
#include "common/threading_utils.h"
#include "xgboost/string_view.h"
namespace xgboost {
DMLC_REGISTER_PARAMETER(Context);
std::int32_t constexpr Context::kCpuId;
bst_d_ordinal_t constexpr Context::kCpuId;
std::int64_t constexpr Context::kDefaultSeed;
Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}
void Context::ConfigureGpuId(bool require_gpu) {
#if defined(XGBOOST_USE_CUDA)
if (gpu_id == kCpuId) { // 0. User didn't specify the `gpu_id'
if (require_gpu) { // 1. `tree_method' or `predictor' or both are using
// GPU.
// 2. Use device 0 as default.
this->UpdateAllowUnknown(Args{{"gpu_id", "0"}});
}
}
namespace {
inline constexpr char const* kDevice = "device";
// 3. When booster is loaded from a memory image (Python pickle or R
// raw model), number of available GPUs could be different. Wrap around it.
int32_t n_gpus = common::AllVisibleGPUs();
if (n_gpus == 0) {
if (gpu_id != kCpuId) {
LOG(WARNING) << "No visible GPU is found, setting `gpu_id` to -1";
}
this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}});
} else if (fail_on_invalid_gpu_id) {
CHECK(gpu_id == kCpuId || gpu_id < n_gpus)
<< "Only " << n_gpus << " GPUs are visible, gpu_id " << gpu_id << " is invalid.";
} else if (gpu_id != kCpuId && gpu_id >= n_gpus) {
LOG(WARNING) << "Only " << n_gpus << " GPUs are visible, setting `gpu_id` to "
<< gpu_id % n_gpus;
this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(gpu_id % n_gpus)}});
}
#if !defined(XGBOOST_USE_CUDA)
DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
device = DeviceOrd::CPU();
return device;
}
#else
// Just set it to CPU, don't think about it.
this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}});
(void)(require_gpu);
#endif // defined(XGBOOST_USE_CUDA)
// Check CUDA on the current device, wrap the ordinal if necessary.
[[nodiscard]] DeviceOrd CUDAOrdinal(DeviceOrd device, bool fail_on_invalid) {
// When booster is loaded from a memory image (Python pickle or R raw model), number of
// available GPUs could be different. Wrap around it.
std::int32_t n_visible = common::AllVisibleGPUs();
if (n_visible == 0) {
if (device.IsCUDA()) {
LOG(WARNING) << "No visible GPU is found, setting device to CPU.";
}
device = DeviceOrd::CPU();
} else if (fail_on_invalid) {
CHECK(device.IsCPU() || device.ordinal < n_visible)
<< "Only " << n_visible << " GPUs are visible, ordinal " << device.ordinal
<< " is invalid.";
} else if (device.IsCUDA() && device.ordinal >= n_visible) {
device.ordinal = device.ordinal % n_visible;
LOG(WARNING) << "Only " << n_visible << " GPUs are visible, setting device ordinal to "
<< device.ordinal;
}
common::SetDevice(this->gpu_id);
if (device.IsCUDA()) {
common::SetDevice(device.ordinal);
}
return device;
}
#endif // !defined(XGBOOST_USE_CUDA)
[[nodiscard]] std::optional<std::int32_t> ParseInt(StringView ordinal) {
// Some basic checks to ensure a valid `gpu_id` and device ordinal, instead of directly
// parsing and silently ignoring unknown characters.
if (ordinal.empty()) {
return std::nullopt;
}
std::size_t offset{0};
if (ordinal[0] == '-') {
offset = 1;
}
if (ordinal.size() <= offset) {
return std::nullopt;
}
bool valid = std::all_of(ordinal.cbegin() + offset, ordinal.cend(),
[](auto c) { return std::isdigit(c); });
if (!valid) {
return std::nullopt;
}
std::int32_t parsed_id{Context::kCpuId};
auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id);
if (res.ec != std::errc()) {
return std::nullopt;
}
return parsed_id;
}
[[nodiscard]] DeviceOrd MakeDeviceOrd(std::string const& input, bool fail_on_invalid_gpu_id) {
StringView msg{R"(Invalid argument for `device`. Expected to be one of the following:
- cpu
- cuda
- cuda:<device ordinal> # e.g. cuda:0
- gpu
- gpu:<device ordinal> # e.g. gpu:0
)"};
auto fatal = [&] { LOG(FATAL) << msg << "Got: `" << input << "`."; };
#if defined(__MINGW32__)
// mingw hangs on regex using rtools 430. Basic checks only.
CHECK_GE(input.size(), 3) << msg;
auto substr = input.substr(0, 3);
bool valid = substr == "cpu" || substr == "cud" || substr == "gpu";
CHECK(valid) << msg;
#else
std::regex pattern{"gpu(:[0-9]+)?|cuda(:[0-9]+)?|cpu"};
if (!std::regex_match(input, pattern)) {
fatal();
}
#endif // defined(__MINGW32__)
// handle alias
std::string s_device = std::regex_replace(input, std::regex{"gpu"}, DeviceSym::CUDA());
auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':');
DeviceOrd device;
device.ordinal = Context::InvalidOrdinal(); // mark it invalid for check.
if (split_it == s_device.cend()) {
// no ordinal.
if (s_device == DeviceSym::CPU()) {
device = DeviceOrd::CPU();
} else if (s_device == DeviceSym::CUDA()) {
device = DeviceOrd::CUDA(0); // use 0 as default;
} else {
fatal();
}
} else {
// must be CUDA when ordinal is specified.
// +1 for colon
std::size_t offset = std::distance(s_device.cbegin(), split_it) + 1;
// substr
StringView s_ordinal = {s_device.data() + offset, s_device.size() - offset};
if (s_ordinal.empty()) {
fatal();
}
auto opt_id = ParseInt(s_ordinal);
if (!opt_id.has_value()) {
fatal();
}
CHECK_LE(opt_id.value(), std::numeric_limits<bst_d_ordinal_t>::max())
<< "Ordinal value too large.";
device = DeviceOrd::CUDA(opt_id.value());
}
if (device.ordinal < Context::kCpuId) {
fatal();
}
device = CUDAOrdinal(device, fail_on_invalid_gpu_id);
return device;
}
} // namespace
void Context::ConfigureGpuId(bool require_gpu) {
if (this->IsCPU() && require_gpu) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}});
}
}
void Context::SetDeviceOrdinal(Args const& kwargs) {
auto gpu_id_it = std::find_if(kwargs.cbegin(), kwargs.cend(),
[](auto const& p) { return p.first == "gpu_id"; });
auto has_gpu_id = gpu_id_it != kwargs.cend();
auto device_it = std::find_if(kwargs.cbegin(), kwargs.cend(),
[](auto const& p) { return p.first == kDevice; });
auto has_device = device_it != kwargs.cend();
if (has_device && has_gpu_id) {
LOG(FATAL) << "Both `device` and `gpu_id` are specified. Use `device` instead.";
}
if (has_gpu_id) {
// Compatible with XGBoost < 2.0.0
error::WarnDeprecatedGPUId();
auto opt_id = ParseInt(StringView{gpu_id_it->second});
CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second;
if (opt_id.value() > Context::kCpuId) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}});
} else {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}});
}
return;
}
auto new_d = MakeDeviceOrd(this->device, this->fail_on_invalid_gpu_id);
if (!has_device) {
CHECK_EQ(new_d.ordinal, this->device_.ordinal); // unchanged
}
this->SetDevice(new_d);
if (this->IsCPU()) {
CHECK_EQ(this->device_.ordinal, kCpuId);
} else {
CHECK_GT(this->device_.ordinal, kCpuId);
}
}
std::int32_t Context::Threads() const {

View File

@ -33,10 +33,11 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
bool valid = iter.Next();
CHECK(valid) << "Iterative DMatrix must have at least 1 batch.";
auto d = MakeProxy(proxy_)->DeviceIdx();
auto pctx = MakeProxy(proxy_)->Ctx();
Context ctx;
ctx.UpdateAllowUnknown(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}});
ctx.UpdateAllowUnknown(
Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
// hardcoded parameter.
BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};

View File

@ -54,6 +54,7 @@ std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing);
}
CHECK(p_fmat) << "Failed to fallback.";
return p_fmat;
}
} // namespace xgboost::data

View File

@ -7,28 +7,31 @@
namespace xgboost::data {
void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
std::shared_ptr<data::CudfAdapter> adapter{new CudfAdapter{interface_str}};
auto const& value = adapter->Value();
auto adapter{std::make_shared<CudfAdapter>(interface_str)};
this->batch_ = adapter;
ctx_.gpu_id = adapter->DeviceIdx();
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
if (ctx_.gpu_id < 0) {
if (adapter->DeviceIdx() < 0) {
// empty data
CHECK_EQ(this->Info().num_row_, 0);
ctx_.gpu_id = dh::CurrentDevice();
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return;
}
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
}
void DMatrixProxy::FromCudaArray(StringView interface_str) {
std::shared_ptr<CupyAdapter> adapter(new CupyAdapter{StringView{interface_str}});
auto adapter(std::make_shared<CupyAdapter>(StringView{interface_str}));
this->batch_ = adapter;
ctx_.gpu_id = adapter->DeviceIdx();
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
if (ctx_.gpu_id < 0) {
if (adapter->DeviceIdx() < 0) {
// empty data
CHECK_EQ(this->Info().num_row_, 0);
ctx_.gpu_id = dh::CurrentDevice();
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return;
}
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
}
namespace cuda_impl {

View File

@ -27,7 +27,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
dh::safe_cuda(cudaSetDevice(device));
Context ctx;
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(device)}});
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});
CHECK(adapter->NumRows() != kAdapterUnknownSize);
CHECK(adapter->NumColumns() != kAdapterUnknownSize);

View File

@ -84,6 +84,25 @@ bool UpdatersMatched(std::vector<std::string> updater_seq,
return name == up->Name();
});
}
void MismatchedDevices(Context const* booster, Context const* data) {
bool thread_local static logged{false};
if (logged) {
return;
}
LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. This might "
"lead to higher memory usage and slower performance. XGBoost is running on: "
<< booster->DeviceName() << ", while the input data is on: " << data->DeviceName()
<< ".\n"
<< R"(Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.
This warning will only be shown once, and subsequent warnings made by the current thread will be
suppressed.
)";
logged = true;
}
} // namespace
void GBTree::Configure(Args const& cfg) {
@ -208,6 +227,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
bst_target_t const n_groups = model_.learner_model_param->OutputLength();
monitor_.Start("BoostNewTrees");
predt->predictions.SetDevice(ctx_->Ordinal());
auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_,
model_.learner_model_param->OutputLength());
CHECK_NE(n_groups, 0);
@ -521,18 +541,6 @@ void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds,
}
}
namespace {
inline void MismatchedDevices(Context const* booster, Context const* data) {
LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. XGBoost "
<< "is running on: " << booster->DeviceName()
<< ", while the input data is on: " << data->DeviceName() << ".\n"
<< R"(Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.
)";
}
}; // namespace
void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
bst_layer_t layer_begin, bst_layer_t layer_end) {
// dispatch to const function.

View File

@ -40,7 +40,7 @@
#include "common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "common/charconv.h" // for to_chars, to_chars_result, NumericLimits, from_...
#include "common/common.h" // for ToString, Split
#include "common/error_msg.h" // for MaxFeatureSize, WarnOldSerialization
#include "common/error_msg.h" // for MaxFeatureSize, WarnOldSerialization, ...
#include "common/io.h" // for PeekableInStream, ReadAll, FixedSizeStream, Mem...
#include "common/observer.h" // for TrainingObserver
#include "common/random.h" // for GlobalRandom
@ -711,6 +711,7 @@ class LearnerConfiguration : public Learner {
// FIXME(trivialfis): Make eval_metric a training parameter.
keys.emplace_back(kEvalMetric);
keys.emplace_back("num_output_group");
keys.emplace_back("gpu_id"); // deprecated param.
std::sort(keys.begin(), keys.end());
@ -1340,10 +1341,9 @@ class LearnerImpl : public LearnerIO {
}
void Predict(std::shared_ptr<DMatrix> data, bool output_margin,
HostDeviceVector<bst_float> *out_preds, unsigned layer_begin,
unsigned layer_end, bool training,
bool pred_leaf, bool pred_contribs, bool approx_contribs,
bool pred_interactions) override {
HostDeviceVector<bst_float>* out_preds, bst_layer_t layer_begin,
bst_layer_t layer_end, bool training, bool pred_leaf, bool pred_contribs,
bool approx_contribs, bool pred_interactions) override {
int multiple_predictions = static_cast<int>(pred_leaf) +
static_cast<int>(pred_interactions) +
static_cast<int>(pred_contribs);
@ -1391,15 +1391,16 @@ class LearnerImpl : public LearnerIO {
}
void InplacePredict(std::shared_ptr<DMatrix> p_m, PredictionType type, float missing,
HostDeviceVector<bst_float>** out_preds, uint32_t iteration_begin,
uint32_t iteration_end) override {
HostDeviceVector<float>** out_preds, bst_layer_t iteration_begin,
bst_layer_t iteration_end) override {
this->Configure();
this->CheckModelInitialized();
auto& out_predictions = this->GetThreadLocal().prediction_entry;
out_predictions.version = 0;
out_predictions.Reset();
this->gbm_->InplacePredict(p_m, missing, &out_predictions, iteration_begin, iteration_end);
if (type == PredictionType::kValue) {
obj_->PredTransform(&out_predictions.predictions);
} else if (type == PredictionType::kMargin) {
@ -1454,7 +1455,7 @@ class LearnerImpl : public LearnerIO {
}
if (p_fmat->Info().num_row_ == 0) {
LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
error::WarnEmptyDataset();
}
}

View File

@ -28,6 +28,7 @@ class LintersPaths:
"tests/python-gpu/test_gpu_prediction.py",
"tests/python-gpu/load_pickle.py",
"tests/python-gpu/test_gpu_pickling.py",
"tests/python-gpu/test_gpu_eval_metrics.py",
"tests/test_distributed/test_with_spark/",
"tests/test_distributed/test_gpu_with_spark/",
# demo

View File

@ -16,8 +16,7 @@
namespace xgboost {
namespace common {
void TestSegmentedArgSort() {
Context ctx;
ctx.gpu_id = 0;
auto ctx = MakeCUDACtx(0);
size_t constexpr kElements = 100, kGroups = 3;
dh::device_vector<size_t> sorted_idx(kElements, 0);
@ -55,8 +54,7 @@ void TestSegmentedArgSort() {
TEST(Algorithm, SegmentedArgSort) { TestSegmentedArgSort(); }
TEST(Algorithm, GpuArgSort) {
Context ctx;
ctx.gpu_id = 0;
auto ctx = MakeCUDACtx(0);
dh::device_vector<float> values(20);
dh::Iota(dh::ToSpan(values)); // accending

View File

@ -227,7 +227,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
}
// check categorical
beg = n_samples;
for (std::size_t i = 0; i < n_categories; ++i) {
for (bst_cat_t i = 0; i < n_categories; ++i) {
// all from the second column
ASSERT_EQ(static_cast<bst_feature_t>(weight[i + beg]) % n_features, 1);
}

View File

@ -4,6 +4,7 @@
#include <gtest/gtest.h>
#include "../../../src/common/linalg_op.cuh"
#include "../helpers.h"
#include "xgboost/context.h"
#include "xgboost/linalg.h"
@ -54,8 +55,7 @@ void TestElementWiseKernel() {
}
void TestSlice() {
Context ctx;
ctx.gpu_id = 1;
auto ctx = MakeCUDACtx(1);
thrust::device_vector<double> data(2 * 3 * 4);
auto t = MakeTensorView(&ctx, dh::ToSpan(data), 2, 3, 4);
dh::LaunchN(1, [=] __device__(size_t) {

View File

@ -23,8 +23,7 @@
namespace xgboost::ltr {
void TestCalcQueriesInvIDCG() {
Context ctx;
ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
auto ctx = MakeCUDACtx(0);
std::size_t n_groups = 5, n_samples_per_group = 32;
dh::device_vector<float> scores(n_samples_per_group * n_groups);
@ -85,20 +84,17 @@ void TestRankingCache(Context const* ctx) {
} // namespace
TEST(RankingCache, InitFromGPU) {
Context ctx;
ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
auto ctx = MakeCUDACtx(0);
TestRankingCache(&ctx);
}
TEST(NDCGCache, InitFromGPU) {
Context ctx;
ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
auto ctx = MakeCUDACtx(0);
TestNDCGCache(&ctx);
}
TEST(MAPCache, InitFromGPU) {
Context ctx;
ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
auto ctx = MakeCUDACtx(0);
TestMAPCache(&ctx);
}
} // namespace xgboost::ltr

View File

@ -7,6 +7,7 @@
#include "../../../src/common/stats.h"
#include "../../../src/common/transform_iterator.h" // common::MakeIndexTransformIter
#include "../helpers.h"
namespace xgboost {
namespace common {
@ -71,7 +72,7 @@ TEST(Stats, Median) {
ASSERT_EQ(m, .5f);
#if defined(XGBOOST_USE_CUDA)
ctx.gpu_id = 0;
ctx = ctx.MakeCUDA(0);
ASSERT_FALSE(ctx.IsCPU());
Median(&ctx, values, weights, &out);
m = out(0);
@ -80,7 +81,7 @@ TEST(Stats, Median) {
}
{
ctx.gpu_id = Context::kCpuId;
ctx = ctx.MakeCPU();
// 4x2 matrix
linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.gpu_id};
HostDeviceVector<float> weights;
@ -90,7 +91,7 @@ TEST(Stats, Median) {
ASSERT_EQ(out(1), .5f);
#if defined(XGBOOST_USE_CUDA)
ctx.gpu_id = 0;
ctx = ctx.MakeCUDA(0);
Median(&ctx, values, weights, &out);
ASSERT_EQ(out(0), .5f);
ASSERT_EQ(out(1), .5f);
@ -123,8 +124,7 @@ TEST(Stats, Mean) {
#if defined(XGBOOST_USE_CUDA)
TEST(Stats, GPUMean) {
Context ctx;
ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
auto ctx = MakeCUDACtx(0);
TestMean(&ctx);
}
#endif // defined(XGBOOST_USE_CUDA)

View File

@ -3,16 +3,17 @@
*/
#include <gtest/gtest.h>
#include <cstddef> // std::size_t
#include <utility> // std::pair
#include <vector> // std::vector
#include <cstddef> // std::size_t
#include <utility> // std::pair
#include <vector> // std::vector
#include "../../../src/common/linalg_op.cuh" // ElementWiseTransformDevice
#include "../../../src/common/stats.cuh"
#include "xgboost/base.h" // XGBOOST_DEVICE
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/linalg.h" // Tensor
#include "../helpers.h"
#include "xgboost/base.h" // XGBOOST_DEVICE
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/linalg.h" // Tensor
namespace xgboost {
namespace common {
@ -33,7 +34,7 @@ class StatsGPU : public ::testing::Test {
}
public:
void SetUp() override { ctx_.gpu_id = 0; }
void SetUp() override { ctx_ = MakeCUDACtx(0); }
void WeightedMulti() {
// data for one segment

View File

@ -171,8 +171,7 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo
ASSERT_TRUE(Xy->SingleColBlock());
bst_bin_t constexpr kBins{17};
auto p = BatchParam{kBins, threshold};
Context gpu_ctx;
gpu_ctx.gpu_id = 0;
auto gpu_ctx = MakeCUDACtx(0);
for (auto const &page : Xy->GetBatches<EllpackPage>(
&gpu_ctx, BatchParam{kBins, tree::TrainParam::DftSparseThreshold()})) {
from_ellpack = std::make_unique<GHistIndexMatrix>(&ctx, Xy->Info(), page, p);

View File

@ -180,7 +180,12 @@ TEST(GBTree, ChooseTreeMethod) {
learner->SetParam("tree_method", tree_method.value());
}
if (device.has_value()) {
learner->SetParam("gpu_id", device.value());
auto const& d = device.value();
if (std::isdigit(d.front()) || d.front() == '-') {
learner->SetParam("gpu_id", d);
} else {
learner->SetParam("device", d);
}
}
learner->Configure();
for (std::int32_t i = 0; i < 3; ++i) {
@ -199,7 +204,12 @@ TEST(GBTree, ChooseTreeMethod) {
learner->SetParam("tree_method", tree_method.value());
}
if (device.has_value()) {
learner->SetParam("gpu_id", device.value());
auto const& d = device.value();
if (std::isdigit(d.front()) || d.front() == '-') {
learner->SetParam("gpu_id", d);
} else {
learner->SetParam("device", d);
}
}
learner->Configure();
for (std::int32_t i = 0; i < 3; ++i) {
@ -215,11 +225,12 @@ TEST(GBTree, ChooseTreeMethod) {
// | | hist | gpu_hist | exact | NA |
// |--------+---------+----------+-------+-----|
// | CUDA:0 | GPU | GPU (w) | Err | GPU | # not yet tested
// | CPU | CPU | Err | CPU | CPU | # not yet tested
// | CUDA:0 | GPU | GPU (w) | Err | GPU |
// | CPU | CPU | GPU (w) | CPU | CPU |
// |--------+---------+----------+-------+-----|
// | -1 | CPU | GPU (w) | CPU | CPU |
// | 0 | GPU | GPU (w) | Err | GPU |
// |--------+---------+----------+-------+-----|
// | NA | CPU | GPU (w) | CPU | CPU |
//
// - (w): warning
@ -237,18 +248,30 @@ TEST(GBTree, ChooseTreeMethod) {
// hist
{{"hist", "-1"}, "grow_quantile_histmaker"},
{{"hist", "0"}, "grow_gpu_hist"},
{{"hist", "cpu"}, "grow_quantile_histmaker"},
{{"hist", "cuda"}, "grow_gpu_hist"},
{{"hist", "cuda:0"}, "grow_gpu_hist"},
{{"hist", std::nullopt}, "grow_quantile_histmaker"},
// gpu_hist
{{"gpu_hist", "-1"}, "grow_gpu_hist"},
{{"gpu_hist", "0"}, "grow_gpu_hist"},
{{"gpu_hist", "cpu"}, "grow_gpu_hist"},
{{"gpu_hist", "cuda"}, "grow_gpu_hist"},
{{"gpu_hist", "cuda:0"}, "grow_gpu_hist"},
{{"gpu_hist", std::nullopt}, "grow_gpu_hist"},
// exact
{{"exact", "-1"}, "grow_colmaker,prune"},
{{"exact", "0"}, "err"},
{{"exact", "cpu"}, "grow_colmaker,prune"},
{{"exact", "cuda"}, "err"},
{{"exact", "cuda:0"}, "err"},
{{"exact", std::nullopt}, "grow_colmaker,prune"},
// NA
{{std::nullopt, "-1"}, "grow_quantile_histmaker"},
{{std::nullopt, "0"}, "grow_gpu_hist"}, // default to hist
{{std::nullopt, "cpu"}, "grow_quantile_histmaker"},
{{std::nullopt, "cuda"}, "grow_gpu_hist"},
{{std::nullopt, "cuda:0"}, "grow_gpu_hist"},
{{std::nullopt, std::nullopt}, "grow_quantile_histmaker"},
};
@ -392,8 +415,7 @@ class Dart : public testing::TestWithParam<char const*> {
for (size_t i = 0; i < 16; ++i) {
learner->UpdateOneIter(i, p_mat);
}
ConfigLearnerByCtx(&ctx, learner.get());
learner->SetParam("device", ctx.DeviceName());
HostDeviceVector<float> predts_training;
learner->Predict(p_mat, false, &predts_training, 0, 0, true);
@ -654,8 +676,7 @@ TEST(GBTree, InplacePredictionError) {
RandomDataGenerator{n_samples, n_features, 0.5f}.Batches(2).GenerateSparsePageDMatrix(
"cache", true);
std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
learner->SetParam("booster", booster);
ConfigLearnerByCtx(ctx, learner.get());
learner->SetParams(Args{{"booster", booster}, {"device", ctx->DeviceName()}});
learner->Configure();
for (std::int32_t i = 0; i < 3; ++i) {
learner->UpdateOneIter(i, p_fmat);
@ -697,9 +718,9 @@ TEST(GBTree, InplacePredictionError) {
#endif // defined(XGBOOST_USE_CUDA)
};
std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
learner->SetParam("booster", booster);
learner->SetParam("max_bin", std::to_string(max_bins));
ConfigLearnerByCtx(ctx, learner.get());
learner->SetParams(Args{{"booster", booster},
{"max_bin", std::to_string(max_bins)},
{"device", ctx->DeviceName()}});
learner->Configure();
for (std::int32_t i = 0; i < 3; ++i) {
learner->UpdateOneIter(i, p_fmat);

View File

@ -8,6 +8,7 @@
#include <limits> // for numeric_limits
#include <memory> // for shared_ptr
#include <string> // for string
#include <thread> // for thread
#include "../../../src/data/adapter.h" // for ArrayAdapter
#include "../../../src/data/device_adapter.cuh" // for CupyAdapter
@ -41,7 +42,7 @@ void TestInplaceFallback(Context const* ctx) {
// learner is configured to the device specified by ctx
std::unique_ptr<Learner> learner{Learner::Create({Xy})};
ConfigLearnerByCtx(ctx, learner.get());
learner->SetParam("device", ctx->DeviceName());
for (std::int32_t i = 0; i < 3; ++i) {
learner->UpdateOneIter(i, Xy);
}
@ -56,18 +57,31 @@ void TestInplaceFallback(Context const* ctx) {
HostDeviceVector<float>* out_predt{nullptr};
ConsoleLogger::Configure(Args{{"verbosity", "1"}});
std::string output;
// test whether the warning is raised
#if !defined(_WIN32)
// Windows has an issue with CUDA and thread-local storage. For some reason, on Windows a
// cudaInitializationError is raised during destruction of `HostDeviceVector`. This
// might be related to https://github.com/dmlc/xgboost/issues/5793
::testing::internal::CaptureStderr();
std::thread{[&] {
// Launch a new thread to ensure the warning is raised, as we prevent
// over-verbose warnings by using thread-local flags.
learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
&out_predt, 0, 0);
}}.join();
output = testing::internal::GetCapturedStderr();
ASSERT_NE(output.find("Falling back"), std::string::npos);
#endif
learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
&out_predt, 0, 0);
auto output = testing::internal::GetCapturedStderr();
ASSERT_NE(output.find("Falling back"), std::string::npos);
// test when the contexts match
Context new_ctx = *proxy->Ctx();
ASSERT_NE(new_ctx.gpu_id, ctx->gpu_id);
ConfigLearnerByCtx(&new_ctx, learner.get());
learner->SetParam("device", new_ctx.DeviceName());
HostDeviceVector<float>* out_predt_1{nullptr};
// no warning is raised
::testing::internal::CaptureStderr();
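The same fallback is observable from Python. A minimal sketch, assuming a CUDA-enabled build; the booster sits on ``cuda:0`` while the NumPy input lives on the CPU, so a one-time mismatched-device warning ("Falling back ...") is emitted and the prediction still succeeds:

.. code-block:: python

   import numpy as np
   import xgboost as xgb

   X, y = np.random.rand(256, 8), np.random.rand(256)
   booster = xgb.train({"tree_method": "hist", "device": "cuda:0"}, xgb.DMatrix(X, y))

   # CPU data on a cuda:0 booster triggers the warning once; XGBoost then
   # falls back to DMatrix-based prediction instead of the inplace path.
   predt = booster.inplace_predict(X)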

View File

@ -559,16 +559,4 @@ class DeclareUnifiedDistributedTest(MetricTest) : public ::testing::Test {
}
}
};
// A temporary solution before we move away from gpu_id.
inline void ConfigLearnerByCtx(Context const* ctx, Learner* learner) {
if (ctx->IsCPU()) {
learner->SetParam("tree_method", "hist");
} else {
learner->SetParam("tree_method", "gpu_hist");
}
learner->SetParam("gpu_id", std::to_string(ctx->gpu_id));
learner->Configure();
ASSERT_EQ(learner->Ctx()->gpu_id, ctx->gpu_id);
}
} // namespace xgboost

View File

@ -46,7 +46,6 @@ inline void CheckDeterministicMetricMultiClass(StringView name, int32_t device)
inline void TestMultiClassError(int device, DataSplitMode data_split_mode) {
auto ctx = MakeCUDACtx(device);
ctx.gpu_id = device;
xgboost::Metric * metric = xgboost::Metric::Create("merror", &ctx);
metric->Configure({});
ASSERT_STREQ(metric->Name(), "merror");
@ -67,7 +66,6 @@ inline void VerifyMultiClassError(DataSplitMode data_split_mode = DataSplitMode:
inline void TestMultiClassLogLoss(int device, DataSplitMode data_split_mode) {
auto ctx = MakeCUDACtx(device);
ctx.gpu_id = device;
xgboost::Metric * metric = xgboost::Metric::Create("mlogloss", &ctx);
metric->Configure({});
ASSERT_STREQ(metric->Name(), "mlogloss");

View File

@ -13,26 +13,22 @@
namespace xgboost::obj {
TEST(LambdaRank, GPUNDCGJsonIO) {
Context ctx;
ctx.gpu_id = 0;
auto ctx = MakeCUDACtx(0);
TestNDCGJsonIO(&ctx);
}
TEST(LambdaRank, GPUMAPStat) {
Context ctx;
ctx.gpu_id = 0;
auto ctx = MakeCUDACtx(0);
TestMAPStat(&ctx);
}
TEST(LambdaRank, GPUNDCGGPair) {
Context ctx;
ctx.gpu_id = 0;
auto ctx = MakeCUDACtx(0);
TestNDCGGPair(&ctx);
}
void TestGPUMakePair() {
Context ctx;
ctx.gpu_id = 0;
auto ctx = MakeCUDACtx(0);
MetaInfo info;
HostDeviceVector<float> predt;
@ -126,8 +122,7 @@ void TestGPUMakePair() {
TEST(LambdaRank, GPUMakePair) { TestGPUMakePair(); }
TEST(LambdaRank, GPUUnbiasedNDCG) {
Context ctx;
ctx.gpu_id = 0;
auto ctx = MakeCUDACtx(0);
TestUnbiasedNDCG(&ctx);
}
@ -161,8 +156,7 @@ TEST(LambdaRank, RankItemCountOnRight) {
}
TEST(LambdaRank, GPUMAPGPair) {
Context ctx;
ctx.gpu_id = 0;
auto ctx = MakeCUDACtx(0);
TestMAPGPair(&ctx);
}
} // namespace xgboost::obj

View File

@ -305,12 +305,12 @@ TEST(Objective, CPU_vs_CUDA) {
{
// CPU
ctx.gpu_id = -1;
ctx = ctx.MakeCPU();
obj->GetGradient(preds, info, 0, &cpu_out_preds);
}
{
// CUDA
ctx.gpu_id = 0;
ctx = ctx.MakeCUDA(0);
obj->GetGradient(preds, info, 0, &cuda_out_preds);
}

View File

@ -148,7 +148,7 @@ TEST(Plugin, CPUvsOneAPI) {
{
// CPU
ctx.gpu_id = -1;
ctx = ctx.MakeCPU();
obj_cpu->GetGradient(preds, info, 0, &cpu_out_preds);
}
{

View File

@ -214,15 +214,16 @@ void TestUpdatePredictionCache(bool use_subsampling) {
}
} // namespace
TEST(CPUPredictor, GHistIndex) {
TEST(CPUPredictor, GHistIndexTraining) {
size_t constexpr kRows{128}, kCols{16}, kBins{64};
Context ctx;
auto p_hist = RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).GenerateQuantileDMatrix(false);
HostDeviceVector<float> storage(kRows * kCols);
auto columnar = RandomDataGenerator{kRows, kCols, 0.0}.GenerateArrayInterface(&storage);
auto adapter = data::ArrayAdapter(columnar.c_str());
std::shared_ptr<DMatrix> p_full{
DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)};
TestTrainingPrediction(kRows, kBins, "hist", p_full, p_hist);
TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_hist);
}
TEST(CPUPredictor, CategoricalPrediction) {

View File

@ -33,9 +33,8 @@ TEST(GPUPredictor, Basic) {
int n_row = i, n_col = i;
auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
Context ctx;
ctx.gpu_id = 0;
LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)};
auto ctx = MakeCUDACtx(0);
LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
// Test predict batch
@ -71,7 +70,7 @@ void VerifyBasicColumnSplit(std::array<std::vector<float>, 32> const& expected_r
auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
std::unique_ptr<DMatrix> sliced{dmat->SliceCol(world_size, rank)};
LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)};
LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
// Test predict batch
@ -102,7 +101,7 @@ TEST(GPUPredictor, MGPUBasicColumnSplit) {
size_t n_row = i, n_col = i;
auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)};
LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
// Test predict batch
@ -132,18 +131,19 @@ TEST(GPUPredictor, EllpackBasic) {
}
TEST(GPUPredictor, EllpackTraining) {
size_t constexpr kRows { 128 }, kCols { 16 }, kBins { 64 };
auto p_ellpack =
RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).Device(0).GenerateDeviceDMatrix(false);
auto ctx = MakeCUDACtx(0);
size_t constexpr kRows{128}, kCols{16}, kBins{64};
auto p_ellpack = RandomDataGenerator{kRows, kCols, 0.0}
.Bins(kBins)
.Device(ctx.Ordinal())
.GenerateDeviceDMatrix(false);
HostDeviceVector<float> storage(kRows * kCols);
auto columnar = RandomDataGenerator{kRows, kCols, 0.0}
.Device(0)
.GenerateArrayInterface(&storage);
auto columnar =
RandomDataGenerator{kRows, kCols, 0.0}.Device(ctx.Ordinal()).GenerateArrayInterface(&storage);
auto adapter = data::CupyAdapter(columnar);
std::shared_ptr<DMatrix> p_full {
DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)
};
TestTrainingPrediction(kRows, kBins, "gpu_hist", p_full, p_ellpack);
std::shared_ptr<DMatrix> p_full{
DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)};
TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_ellpack);
}
TEST(GPUPredictor, ExternalMemoryTest) {
@ -153,9 +153,8 @@ TEST(GPUPredictor, ExternalMemoryTest) {
gpu_predictor->Configure({});
const int n_classes = 3;
Context ctx;
ctx.gpu_id = 0;
LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.gpu_id)};
Context ctx = MakeCUDACtx(0);
LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.Ordinal())};
gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx, n_classes);
std::vector<std::unique_ptr<DMatrix>> dmats;
@ -185,7 +184,7 @@ TEST(GPUPredictor, InplacePredictCupy) {
auto ctx = MakeCUDACtx(0);
size_t constexpr kRows{128}, kCols{64};
RandomDataGenerator gen(kRows, kCols, 0.5);
gen.Device(ctx.gpu_id);
gen.Device(ctx.Ordinal());
HostDeviceVector<float> data;
std::string interface_str = gen.GenerateArrayInterface(&data);
std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
@ -197,7 +196,7 @@ TEST(GPUPredictor, InplacePredictCuDF) {
auto ctx = MakeCUDACtx(0);
size_t constexpr kRows{128}, kCols{64};
RandomDataGenerator gen(kRows, kCols, 0.5);
gen.Device(ctx.gpu_id);
gen.Device(ctx.Ordinal());
std::vector<HostDeviceVector<float>> storage(kCols);
auto interface_str = gen.GenerateColumnarArrayInterface(&storage);
std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
@ -214,9 +213,8 @@ TEST(GpuPredictor, LesserFeatures) {
TEST(GPUPredictor, ShapStump) {
cudaSetDevice(0);
Context ctx;
ctx.gpu_id = 0;
LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.gpu_id)};
auto ctx = MakeCUDACtx(0);
LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())};
gbm::GBTreeModel model(&mparam, &ctx);
std::vector<std::unique_ptr<RegTree>> trees;
@ -241,9 +239,8 @@ TEST(GPUPredictor, ShapStump) {
}
TEST(GPUPredictor, Shap) {
Context ctx;
ctx.gpu_id = 0;
LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.gpu_id)};
auto ctx = MakeCUDACtx(0);
LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())};
gbm::GBTreeModel model(&mparam, &ctx);
std::vector<std::unique_ptr<RegTree>> trees;

View File

@ -44,60 +44,49 @@ TEST(Predictor, PredictionCache) {
EXPECT_ANY_THROW(container.Entry(m));
}
void TestTrainingPrediction(size_t rows, size_t bins,
std::string tree_method,
std::shared_ptr<DMatrix> p_full,
std::shared_ptr<DMatrix> p_hist) {
void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins,
std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_hist) {
size_t constexpr kCols = 16;
size_t constexpr kClasses = 3;
size_t constexpr kIters = 3;
std::unique_ptr<Learner> learner;
auto train = [&](Context const& ctx) {
p_hist->Info().labels.Reshape(rows, 1);
auto &h_label = p_hist->Info().labels.Data()->HostVector();
for (size_t i = 0; i < rows; ++i) {
h_label[i] = i % kClasses;
}
p_hist->Info().labels.Reshape(rows, 1);
auto &h_label = p_hist->Info().labels.Data()->HostVector();
learner.reset(Learner::Create({}));
learner->SetParam("tree_method", tree_method);
learner->SetParam("objective", "multi:softprob");
learner->SetParam("num_feature", std::to_string(kCols));
learner->SetParam("num_class", std::to_string(kClasses));
learner->SetParam("max_bin", std::to_string(bins));
ConfigLearnerByCtx(&ctx, learner.get());
learner->Configure();
for (size_t i = 0; i < rows; ++i) {
h_label[i] = i % kClasses;
}
for (size_t i = 0; i < kIters; ++i) {
learner->UpdateOneIter(i, p_hist);
}
learner.reset(Learner::Create({}));
learner->SetParams(Args{{"objective", "multi:softprob"},
{"num_feature", std::to_string(kCols)},
{"num_class", std::to_string(kClasses)},
{"max_bin", std::to_string(bins)},
{"device", ctx->DeviceName()}});
learner->Configure();
Json model{Object{}};
learner->SaveModel(&model);
for (size_t i = 0; i < kIters; ++i) {
learner->UpdateOneIter(i, p_hist);
}
learner.reset(Learner::Create({}));
learner->LoadModel(model);
ConfigLearnerByCtx(&ctx, learner.get());
learner->Configure();
Json model{Object{}};
learner->SaveModel(&model);
HostDeviceVector<float> from_full;
learner->Predict(p_full, false, &from_full, 0, 0);
learner.reset(Learner::Create({}));
learner->LoadModel(model);
learner->SetParam("device", ctx->DeviceName());
learner->Configure();
HostDeviceVector<float> from_hist;
learner->Predict(p_hist, false, &from_hist, 0, 0);
HostDeviceVector<float> from_full;
learner->Predict(p_full, false, &from_full, 0, 0);
for (size_t i = 0; i < rows; ++i) {
EXPECT_NEAR(from_hist.ConstHostVector()[i],
from_full.ConstHostVector()[i], kRtEps);
}
};
HostDeviceVector<float> from_hist;
learner->Predict(p_hist, false, &from_hist, 0, 0);
if (tree_method == "gpu_hist") {
train(MakeCUDACtx(0));
} else {
train(Context{});
for (size_t i = 0; i < rows; ++i) {
EXPECT_NEAR(from_hist.ConstHostVector()[i], from_full.ConstHostVector()[i], kRtEps);
}
}
@ -120,7 +109,7 @@ void TestInplacePrediction(Context const *ctx, std::shared_ptr<DMatrix> x, bst_r
learner->UpdateOneIter(it, m);
}
learner->SetParam("gpu_id", std::to_string(ctx->gpu_id));
learner->SetParam("device", ctx->DeviceName());
learner->Configure();
HostDeviceVector<float> *p_out_predictions_0{nullptr};
@ -153,7 +142,7 @@ void TestInplacePrediction(Context const *ctx, std::shared_ptr<DMatrix> x, bst_r
ASSERT_NEAR(h_pred[i], h_pred_0[i] + h_pred_1[i] - 0.5f, kRtEps);
}
learner->SetParam("gpu_id", "-1");
learner->SetParam("device", "cpu");
learner->Configure();
}
@ -161,12 +150,12 @@ namespace {
std::unique_ptr<Learner> LearnerForTest(Context const *ctx, std::shared_ptr<DMatrix> dmat,
size_t iters, size_t forest = 1) {
std::unique_ptr<Learner> learner{Learner::Create({dmat})};
learner->SetParams(Args{{"num_parallel_tree", std::to_string(forest)}});
learner->SetParams(
Args{{"num_parallel_tree", std::to_string(forest)}, {"device", ctx->DeviceName()}});
for (size_t i = 0; i < iters; ++i) {
learner->UpdateOneIter(i, dmat);
}
ConfigLearnerByCtx(ctx, learner.get());
return learner;
}
@ -215,7 +204,7 @@ void TestPredictionDeviceAccess() {
{
ASSERT_EQ(from_cpu.DeviceIdx(), Context::kCpuId);
Context cpu_ctx;
ConfigLearnerByCtx(&cpu_ctx, learner.get());
learner->SetParam("device", cpu_ctx.DeviceName());
learner->Predict(m_test, false, &from_cpu, 0, 0);
ASSERT_TRUE(from_cpu.HostCanWrite());
ASSERT_FALSE(from_cpu.DeviceCanRead());
@ -225,7 +214,7 @@ void TestPredictionDeviceAccess() {
HostDeviceVector<float> from_cuda;
{
Context cuda_ctx = MakeCUDACtx(0);
ConfigLearnerByCtx(&cuda_ctx, learner.get());
learner->SetParam("device", cuda_ctx.DeviceName());
learner->Predict(m_test, false, &from_cuda, 0, 0);
ASSERT_EQ(from_cuda.DeviceIdx(), 0);
ASSERT_TRUE(from_cuda.DeviceCanWrite());
@ -465,11 +454,7 @@ void TestIterationRangeColumnSplit(Context const* ctx) {
auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses);
auto learner = LearnerForTest(ctx, dmat, kIters, kForest);
if (ctx->IsCPU()) {
learner->SetParams(Args{{"gpu_id", std::to_string(-1)}});
} else {
learner->SetParams(Args{{"gpu_id", std::to_string(0)}});
}
learner->SetParam("device", ctx->DeviceName());
bool bound = false;
std::unique_ptr<Learner> sliced{learner->Slice(0, 3, 1, &bound)};
@ -582,7 +567,7 @@ void TestSparsePredictionColumnSplit(Context const* ctx, float sparsity) {
learner.reset(Learner::Create({Xy}));
learner->LoadModel(model);
ConfigLearnerByCtx(ctx, learner.get());
learner->SetParam("device", ctx->DeviceName());
learner->Predict(Xy, false, &sparse_predt, 0, 0);
auto constexpr kWorldSize = 2;

View File

@ -84,9 +84,8 @@ void TestPredictionFromGradientIndex(Context const* ctx, size_t rows, size_t col
}
// p_full and p_hist should come from the same data set.
void TestTrainingPrediction(size_t rows, size_t bins, std::string tree_method,
std::shared_ptr<DMatrix> p_full,
std::shared_ptr<DMatrix> p_hist);
void TestTrainingPrediction(Context const* ctx, size_t rows, size_t bins,
std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_hist);
void TestInplacePrediction(Context const* ctx, std::shared_ptr<DMatrix> x, bst_row_t rows,
bst_feature_t cols);

31
tests/cpp/test_context.cc Normal file
View File

@ -0,0 +1,31 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/base.h>
#include <xgboost/context.h>
namespace xgboost {
TEST(Context, CPU) {
Context ctx;
ASSERT_EQ(ctx.Device(), DeviceOrd::CPU());
ASSERT_EQ(ctx.Ordinal(), Context::kCpuId);
std::int32_t flag{0};
ctx.DispatchDevice([&] { flag = -1; }, [&] { flag = 1; });
ASSERT_EQ(flag, -1);
ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "oops"}}), dmlc::Error);
ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "-1"}}), dmlc::Error);
ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "CPU"}}), dmlc::Error);
ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "CUDA"}}), dmlc::Error);
ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "CPU:0"}}), dmlc::Error);
ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "gpu:+0"}}), dmlc::Error);
ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "gpu:0-"}}), dmlc::Error);
ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "gpu:"}}), dmlc::Error);
ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":"}}), dmlc::Error);
ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":gpu"}}), dmlc::Error);
ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":0"}}), dmlc::Error);
ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ""}}), dmlc::Error);
}
} // namespace xgboost

99
tests/cpp/test_context.cu Normal file
View File

@ -0,0 +1,99 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/base.h> // for Args
#include <xgboost/context.h>
#include <xgboost/json.h> // for FromJson, ToJson
#include <string> // for string, to_string
#include "../../src/common/common.h" // for AllVisibleGPUs
namespace xgboost {
namespace {
void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) {
ASSERT_EQ(ctx.gpu_id, ord);
ASSERT_EQ(ctx.Device().ordinal, ord);
ASSERT_EQ(ctx.DeviceName(), "cuda:" + std::to_string(ord));
ASSERT_EQ(ctx.Ordinal(), ord);
ASSERT_TRUE(ctx.IsCUDA());
ASSERT_FALSE(ctx.IsCPU());
ASSERT_EQ(ctx.Device(), DeviceOrd::CUDA(ord));
Json jctx{ToJson(ctx)};
Context new_ctx;
FromJson(jctx, &new_ctx);
ASSERT_EQ(new_ctx.Device(), ctx.Device());
ASSERT_EQ(new_ctx.gpu_id, ctx.gpu_id);
}
} // namespace
TEST(Context, DeviceOrdinal) {
Context ctx;
auto n_vis = common::AllVisibleGPUs();
auto ord = n_vis - 1;
std::string device = "cuda:" + std::to_string(ord);
ctx.UpdateAllowUnknown(Args{{"device", device}});
TestCUDA(ctx, ord);
device = "cuda:" + std::to_string(1001);
ctx.UpdateAllowUnknown(Args{{"device", device}});
ord = 1001 % n_vis;
TestCUDA(ctx, ord);
std::int32_t flag{0};
ctx.DispatchDevice([&] { flag = -1; }, [&] { flag = 1; });
ASSERT_EQ(flag, 1);
Context new_ctx = ctx;
TestCUDA(new_ctx, ctx.Ordinal());
auto cpu_ctx = ctx.MakeCPU();
ASSERT_TRUE(cpu_ctx.IsCPU());
ASSERT_EQ(cpu_ctx.Ordinal(), Context::kCpuId);
ASSERT_EQ(cpu_ctx.Device(), DeviceOrd::CPU());
auto cuda_ctx = cpu_ctx.MakeCUDA(ctx.Ordinal());
TestCUDA(cuda_ctx, ctx.Ordinal());
cuda_ctx.UpdateAllowUnknown(Args{{"fail_on_invalid_gpu_id", "true"}});
ASSERT_THROW({ cuda_ctx.UpdateAllowUnknown(Args{{"device", "cuda:9999"}}); }, dmlc::Error);
cuda_ctx.UpdateAllowUnknown(Args{{"device", "cuda:00"}});
ASSERT_EQ(cuda_ctx.Ordinal(), 0);
ctx.UpdateAllowUnknown(Args{{"device", "cpu"}});
// Test alias
ctx.UpdateAllowUnknown(Args{{"device", "gpu:0"}});
TestCUDA(ctx, 0);
ctx.UpdateAllowUnknown(Args{{"device", "gpu"}});
TestCUDA(ctx, 0);
// Test that the thread-local memory in dmlc does not link different instances together.
cpu_ctx.UpdateAllowUnknown(Args{{"device", "cpu"}});
TestCUDA(ctx, 0);
ctx.UpdateAllowUnknown(Args{});
TestCUDA(ctx, 0);
}
TEST(Context, GPUId) {
Context ctx;
ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
TestCUDA(ctx, 0);
auto n_vis = common::AllVisibleGPUs();
auto ord = n_vis - 1;
ctx.UpdateAllowUnknown(Args{{"gpu_id", std::to_string(ord)}});
TestCUDA(ctx, ord);
auto device = "cuda:" + std::to_string(1001);
ctx.UpdateAllowUnknown(Args{{"device", device}});
ord = 1001 % n_vis;
TestCUDA(ctx, ord);
ctx.UpdateAllowUnknown(Args{{"gpu_id", "-1"}});
ASSERT_EQ(ctx.Device(), DeviceOrd::CPU());
}
} // namespace xgboost
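These tests pin down the accepted grammar for ``device``: ``cpu``, ``cuda``/``gpu`` with an optional ``:<ordinal>``, an out-of-range ordinal wrapped modulo the number of visible GPUs (unless ``fail_on_invalid_gpu_id`` is set), and anything else rejected. A short sketch of the same rules from Python, assuming a CUDA-enabled build:

.. code-block:: python

   import xgboost as xgb

   booster = xgb.Booster()
   booster.set_param({"device": "gpu"})     # alias, selects cuda:0
   booster.set_param({"device": "cuda:0"})  # explicit ordinal
   booster.set_param({"device": "cpu"})

   # malformed values such as "CPU", "gpu:+0", or ":0" raise an XGBoostError
   # booster.set_param({"device": "CPU"})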

View File

@ -27,7 +27,6 @@
#include "../../src/common/io.h" // for LoadSequentialFile
#include "../../src/common/linalg_op.h" // for ElementWiseTransformHost, begin, end
#include "../../src/common/random.h" // for GlobalRandom
#include "../../src/common/transform_iterator.h" // for IndexTransformIter
#include "dmlc/io.h" // for Stream
#include "dmlc/omp.h" // for omp_get_max_threads
#include "dmlc/registry.h" // for Registry
@ -35,14 +34,13 @@
#include "helpers.h" // for GetBaseScore, RandomDataGenerator
#include "objective_helpers.h" // for MakeObjNamesForTest, ObjTestNameGenerator
#include "xgboost/base.h" // for bst_float, Args, bst_feature_t, bst_int
#include "xgboost/context.h" // for Context
#include "xgboost/context.h" // for Context, DeviceOrd
#include "xgboost/data.h" // for DMatrix, MetaInfo, DataType
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/json.h" // for Json, Object, get, String, IsA, opera...
#include "xgboost/linalg.h" // for Tensor, TensorView
#include "xgboost/logging.h" // for ConsoleLogger
#include "xgboost/predictor.h" // for PredictionCacheEntry
#include "xgboost/span.h" // for Span, operator!=, SpanIterator
#include "xgboost/string_view.h" // for StringView
namespace xgboost {
@ -58,9 +56,9 @@ TEST(Learner, Basic) {
auto minor = XGBOOST_VER_MINOR;
auto patch = XGBOOST_VER_PATCH;
static_assert(std::is_integral<decltype(major)>::value, "Wrong major version type");
static_assert(std::is_integral<decltype(minor)>::value, "Wrong minor version type");
static_assert(std::is_integral<decltype(patch)>::value, "Wrong patch version type");
static_assert(std::is_integral_v<decltype(major)>, "Wrong major version type");
static_assert(std::is_integral_v<decltype(minor)>, "Wrong minor version type");
static_assert(std::is_integral_v<decltype(patch)>, "Wrong patch version type");
}
TEST(Learner, ParameterValidation) {
@ -92,8 +90,7 @@ TEST(Learner, CheckGroup) {
size_t constexpr kNumRows = 17;
bst_feature_t constexpr kNumCols = 15;
std::shared_ptr<DMatrix> p_mat{
RandomDataGenerator{kNumRows, kNumCols, 0.0f}.GenerateDMatrix()};
std::shared_ptr<DMatrix> p_mat{RandomDataGenerator{kNumRows, kNumCols, 0.0f}.GenerateDMatrix()};
std::vector<bst_float> weight(kNumGroups, 1);
std::vector<bst_int> group(kNumGroups);
group[0] = 2;
@ -312,35 +309,36 @@ TEST(Learner, GPUConfiguration) {
learner->SetParams({Arg{"booster", "gblinear"},
Arg{"updater", "gpu_coord_descent"}});
learner->UpdateOneIter(0, p_dmat);
ASSERT_EQ(learner->Ctx()->gpu_id, 0);
ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
}
{
std::unique_ptr<Learner> learner {Learner::Create(mat)};
std::unique_ptr<Learner> learner{Learner::Create(mat)};
learner->SetParams({Arg{"tree_method", "gpu_hist"}});
learner->Configure();
ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
learner->UpdateOneIter(0, p_dmat);
ASSERT_EQ(learner->Ctx()->gpu_id, 0);
ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
}
{
std::unique_ptr<Learner> learner {Learner::Create(mat)};
learner->SetParams({Arg{"tree_method", "gpu_hist"},
Arg{"gpu_id", "-1"}});
learner->UpdateOneIter(0, p_dmat);
ASSERT_EQ(learner->Ctx()->gpu_id, 0);
ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
}
{
// with CPU algorithm
std::unique_ptr<Learner> learner {Learner::Create(mat)};
learner->SetParams({Arg{"tree_method", "hist"}});
learner->UpdateOneIter(0, p_dmat);
ASSERT_EQ(learner->Ctx()->gpu_id, -1);
ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CPU());
}
{
// with CPU algorithm, but `gpu_id` takes priority
std::unique_ptr<Learner> learner {Learner::Create(mat)};
learner->SetParams({Arg{"tree_method", "hist"},
Arg{"gpu_id", "0"}});
learner->SetParams({Arg{"tree_method", "hist"}, Arg{"gpu_id", "0"}});
learner->UpdateOneIter(0, p_dmat);
ASSERT_EQ(learner->Ctx()->gpu_id, 0);
ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
}
}
#endif // defined(XGBOOST_USE_CUDA)
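For backward compatibility, these tests keep the old spellings working: ``gpu_hist`` or an explicit ``gpu_id`` still configures the learner onto ``cuda:0``. A sketch of the equivalent check through the public JSON config, assuming a CUDA-enabled build:

.. code-block:: python

   import json

   import numpy as np
   import xgboost as xgb

   dtrain = xgb.DMatrix(np.random.rand(64, 4), np.random.rand(64))

   # legacy spelling: still lands on the CUDA device (with a warning)
   booster = xgb.train({"tree_method": "gpu_hist"}, dtrain, num_boost_round=1)
   config = json.loads(booster.save_config())
   assert config["learner"]["generic_param"]["device"] == "cuda:0"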

View File

@ -6,7 +6,9 @@
#include <xgboost/task.h> // for ObjInfo
#include <xgboost/tree_updater.h> // for TreeUpdater
#include <memory> // for unique_ptr
#include <memory> // for unique_ptr
#include "../helpers.h"
namespace xgboost {
TEST(Updater, HasNodePosition) {
@ -19,7 +21,7 @@ TEST(Updater, HasNodePosition) {
ASSERT_TRUE(up->HasNodePosition());
#if defined(XGBOOST_USE_CUDA)
ctx.gpu_id = 0;
ctx = MakeCUDACtx(0);
up.reset(TreeUpdater::Create("grow_gpu_hist", &ctx, &task));
ASSERT_TRUE(up->HasNodePosition());
#endif // defined(XGBOOST_USE_CUDA)

View File

@ -70,9 +70,9 @@ class TestPredictionCache : public ::testing::Test {
Context ctx;
ctx.InitAllowUnknown(Args{{"nthread", "8"}});
if (updater_name == "grow_gpu_hist") {
ctx.gpu_id = 0;
ctx = ctx.MakeCUDA(0);
} else {
ctx.gpu_id = Context::kCpuId;
ctx = ctx.MakeCPU();
}
ObjInfo task{ObjInfo::kRegression};

View File

@ -34,7 +34,7 @@ class TestLoadPickle:
bst = load_pickle(model_path)
config = bst.save_config()
config = json.loads(config)
assert config["learner"]["generic_param"]["gpu_id"] == "-1"
assert config["learner"]["generic_param"]["device"] == "cpu"
def test_context_is_preserved(self) -> None:
"""Test the device context is preserved after pickling."""
@ -42,14 +42,14 @@ class TestLoadPickle:
bst = load_pickle(model_path)
config = bst.save_config()
config = json.loads(config)
assert config["learner"]["generic_param"]["gpu_id"] == "0"
assert config["learner"]["generic_param"]["device"] == "cuda:0"
def test_wrap_gpu_id(self) -> None:
assert os.environ["CUDA_VISIBLE_DEVICES"] == "0"
bst = load_pickle(model_path)
config = bst.save_config()
config = json.loads(config)
assert config["learner"]["generic_param"]["gpu_id"] == "0"
assert config["learner"]["generic_param"]["device"] == "cuda:0"
x, y = build_dataset()
test_x = xgb.DMatrix(x)
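The invariant under test: whatever ``device`` the booster was configured with is serialized into its JSON config and survives a pickle round trip. A minimal sketch, assuming a CUDA-enabled build:

.. code-block:: python

   import json
   import pickle

   import numpy as np
   import xgboost as xgb

   X, y = np.random.rand(64, 4), np.random.rand(64)
   booster = xgb.train({"device": "cuda:0"}, xgb.DMatrix(X, y), num_boost_round=1)

   restored = pickle.loads(pickle.dumps(booster))
   config = json.loads(restored.save_config())
   assert config["learner"]["generic_param"]["device"] == "cuda:0"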

View File

@ -203,7 +203,7 @@ class TestQuantileDMatrix:
np.testing.assert_equal(h_ret.indices, d_ret.indices)
booster = xgb.train(
{"tree_method": "gpu_hist", "gpu_id": "0"}, dtrain=d_m
{"tree_method": "hist", "device": "cuda:0"}, dtrain=d_m
)
np.testing.assert_allclose(

View File

@ -65,16 +65,20 @@ class TestGPUBasicModels:
@pytest.mark.skipif(**tm.no_sklearn())
def test_invalid_gpu_id(self):
from sklearn.datasets import load_digits
X, y = load_digits(return_X_y=True)
# should pass with invalid gpu id
cls1 = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=9999)
cls1 = xgb.XGBClassifier(tree_method="gpu_hist", gpu_id=9999)
cls1.fit(X, y)
# should throw error with fail_on_invalid_gpu_id enabled
cls2 = xgb.XGBClassifier(
tree_method='gpu_hist', gpu_id=9999, fail_on_invalid_gpu_id=True
tree_method="gpu_hist", gpu_id=9999, fail_on_invalid_gpu_id=True
)
try:
with pytest.raises(ValueError, match="ordinal 9999 is invalid"):
cls2.fit(X, y)
cls2 = xgb.XGBClassifier(
tree_method="hist", device="cuda:9999", fail_on_invalid_gpu_id=True
)
with pytest.raises(ValueError, match="ordinal 9999 is invalid"):
cls2.fit(X, y)
assert False, "Should have failed with fail_on_invalid_gpu_id enabled"
except xgb.core.XGBoostError as err:
assert "gpu_id 9999 is invalid" in str(err)

View File

@ -43,10 +43,16 @@ class TestGPUEvalMetrics:
num_boost_round=10,
)
cpu_auc = float(booster.eval(Xy).split(":")[1])
booster.set_param({"gpu_id": "0"})
assert json.loads(booster.save_config())["learner"]["generic_param"]["gpu_id"] == "0"
booster.set_param({"device": "cuda:0"})
assert (
json.loads(booster.save_config())["learner"]["generic_param"]["device"]
== "cuda:0"
)
gpu_auc = float(booster.eval(Xy).split(":")[1])
assert json.loads(booster.save_config())["learner"]["generic_param"]["gpu_id"] == "0"
assert (
json.loads(booster.save_config())["learner"]["generic_param"]["device"]
== "cuda:0"
)
np.testing.assert_allclose(cpu_auc, gpu_auc)

View File

@ -113,14 +113,6 @@ class TestPickling:
param = {"tree_method": "gpu_hist", "verbosity": 1}
bst = xgb.train(param, train_x)
with tm.captured_output() as (out, err):
bst.inplace_predict(x)
# The warning is redirected to the Python callback, so it's printed in stdout
# instead of stderr.
stdout = out.getvalue()
assert stdout.find("mismatched devices") != -1
save_pickle(bst, model_path)
args = self.args_template.copy()
@ -177,7 +169,7 @@ class TestPickling:
# Switch to CPU predictor
bst = model.get_booster()
tm.set_ordinal(-1, bst)
bst.set_param({"device": "cpu"})
cpu_pred = model.predict(x, output_margin=True)
np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-5)

View File

@ -39,7 +39,8 @@ predict_parameter_strategy = strategies.fixed_dictionaries(
}
)
pytestmark = tm.timeout(20)
# cupy nvrtc compilation can take a long time for the first run
pytestmark = tm.timeout(30)
class TestGPUPredict:
@ -71,8 +72,8 @@ class TestGPUPredict:
param = {
"objective": "binary:logistic",
"eval_metric": "logloss",
"tree_method": "gpu_hist",
"gpu_id": 0,
"tree_method": "hist",
"device": "gpu:0",
"max_depth": 1,
}
bst = xgb.train(
@ -84,7 +85,7 @@ class TestGPUPredict:
gpu_pred_test = bst.predict(dtest, output_margin=True)
gpu_pred_val = bst.predict(dval, output_margin=True)
bst.set_param({"gpu_id": -1, "tree_method": "hist"})
bst.set_param({"device": "cpu", "tree_method": "hist"})
bst_cpu = copy(bst)
cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
@ -107,14 +108,15 @@ class TestGPUPredict:
dtrain = xgb.DMatrix(X_train, label=y_train)
params = {}
params["tree_method"] = "gpu_hist"
params["tree_method"] = "hist"
params["device"] = "cuda:0"
bst = xgb.train(params, dtrain)
tm.set_ordinal(0, bst)
bst.set_param({"device": "cuda:0"})
# Don't reuse the DMatrix for prediction, otherwise the result is cached.
predict_gpu_0 = bst.predict(xgb.DMatrix(X_test))
predict_gpu_1 = bst.predict(xgb.DMatrix(X_test))
tm.set_ordinal(-1, bst)
bst.set_param({"device": "cpu"})
predict_cpu = bst.predict(xgb.DMatrix(X_test))
assert np.allclose(predict_gpu_0, predict_gpu_1)
@ -131,8 +133,8 @@ class TestGPUPredict:
X_test, y_test = X[tr_size:, :], y[tr_size:]
params = {
"tree_method": "gpu_hist",
"gpu_id": "0",
"tree_method": "hist",
"device": "cuda:0",
"n_jobs": -1,
"seed": 123,
}
@ -141,13 +143,54 @@ class TestGPUPredict:
gpu_test_score = m.score(X_test, y_test)
# Now with cpu
m = tm.set_ordinal(-1, m)
m.set_params(device="cpu")
cpu_train_score = m.score(X_train, y_train)
cpu_test_score = m.score(X_test, y_test)
assert np.allclose(cpu_train_score, gpu_train_score)
assert np.allclose(cpu_test_score, gpu_test_score)
@pytest.mark.parametrize("device", ["cpu", "cuda"])
@pytest.mark.skipif(**tm.no_cupy())
def test_inplace_predict_device_type(self, device: str) -> None:
"""Test inplace predict with different device and data types.
The sklearn interface uses inplace predict by default, and gbtree falls back to
DMatrix whenever the device doesn't match. This test checks that XGBoost can
handle different combinations of device and input data type.
"""
import cudf
import cupy as cp
import pandas as pd
from scipy.sparse import csr_matrix
reg = xgb.XGBRegressor(tree_method="hist", device=device)
n_samples = 4096
n_features = 13
X, y, w = tm.make_regression(n_samples, n_features, use_cupy=True)
X[X == 0.0] = 1.0
reg.fit(X, y, sample_weight=w)
predt_0 = reg.predict(X)
X = cp.asnumpy(X)
predt_1 = reg.predict(X)
df = pd.DataFrame(X)
predt_2 = reg.predict(df)
df = cudf.DataFrame(X)
predt_3 = reg.predict(df)
X_csr = csr_matrix(X)
predt_4 = reg.predict(X_csr)
np.testing.assert_allclose(predt_0, predt_1)
np.testing.assert_allclose(predt_0, predt_2)
np.testing.assert_allclose(predt_0, predt_3)
np.testing.assert_allclose(predt_0, predt_4)
def run_inplace_base_margin(self, booster, dtrain, X, base_margin):
import cupy as cp
@ -175,7 +218,9 @@ class TestGPUPredict:
dtrain = xgb.DMatrix(X, y)
booster = xgb.train(
{"tree_method": "gpu_hist", "gpu_id": device}, dtrain, num_boost_round=10
{"tree_method": "hist", "device": f"cuda:{device}"},
dtrain,
num_boost_round=10,
)
test = xgb.DMatrix(X[:10, ...], missing=missing)
@ -208,13 +253,13 @@ class TestGPUPredict:
missing_idx = [i for i in range(0, X.shape[1], 16)]
X[:, missing_idx] = missing
reg = xgb.XGBRegressor(
tree_method="gpu_hist", n_estimators=8, missing=missing, gpu_id=device
tree_method="hist", n_estimators=8, missing=missing, device=f"cuda:{device}"
)
reg.fit(X, y)
reg = tm.set_ordinal(device, reg)
reg.set_params(device=f"cuda:{device}")
gpu_predt = reg.predict(X)
reg = tm.set_ordinal(-1, reg)
reg = reg.set_params(device="cpu")
cpu_predt = reg.predict(cp.asnumpy(X))
np.testing.assert_allclose(gpu_predt, cpu_predt, atol=1e-6)
cp.cuda.runtime.setDevice(0)
@ -250,7 +295,9 @@ class TestGPUPredict:
dtrain = xgb.DMatrix(X, y)
booster = xgb.train({"tree_method": "gpu_hist"}, dtrain, num_boost_round=10)
booster = xgb.train(
{"tree_method": "hist", "device": "cuda:0"}, dtrain, num_boost_round=10
)
test = xgb.DMatrix(X)
predt_from_array = booster.inplace_predict(X)
predt_from_dmatrix = booster.predict(test)
@ -280,12 +327,12 @@ class TestGPUPredict:
def test_shap(self, num_rounds, dataset, param):
if dataset.name.endswith("-l1"): # not supported by the exact tree method
return
param.update({"tree_method": "gpu_hist", "gpu_id": 0})
param.update({"tree_method": "hist", "device": "gpu:0"})
param = dataset.set_params(param)
dmat = dataset.get_dmat()
bst = xgb.train(param, dmat, num_rounds)
test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin)
bst = tm.set_ordinal(0, bst)
bst.set_param({"device": "gpu:0"})
shap = bst.predict(test_dmat, pred_contribs=True)
margin = bst.predict(test_dmat, output_margin=True)
assume(len(dataset.y) > 0)
@ -298,12 +345,12 @@ class TestGPUPredict:
def test_shap_interactions(self, num_rounds, dataset, param):
if dataset.name.endswith("-l1"): # not supported by the exact tree method
return
param.update({"tree_method": "hist", "gpu_id": 0})
param.update({"tree_method": "hist", "device": "cuda:0"})
param = dataset.set_params(param)
dmat = dataset.get_dmat()
bst = xgb.train(param, dmat, num_rounds)
test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin)
bst = tm.set_ordinal(0, bst)
bst.set_param({"device": "cuda:0"})
shap = bst.predict(test_dmat, pred_interactions=True)
margin = bst.predict(test_dmat, output_margin=True)
assume(len(dataset.y) > 0)
@ -317,16 +364,18 @@ class TestGPUPredict:
def test_shap_categorical(self):
X, y = tm.make_categorical(100, 20, 7, False)
Xy = xgb.DMatrix(X, y, enable_categorical=True)
booster = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=10)
booster = xgb.train(
{"tree_method": "hist", "device": "gpu:0"}, Xy, num_boost_round=10
)
booster = tm.set_ordinal(0, booster)
booster.set_param({"device": "cuda:0"})
shap = booster.predict(Xy, pred_contribs=True)
margin = booster.predict(Xy, output_margin=True)
np.testing.assert_allclose(
np.sum(shap, axis=len(shap.shape) - 1), margin, rtol=1e-3
)
booster = tm.set_ordinal(-1, booster)
booster.set_param({"device": "cpu"})
shap = booster.predict(Xy, pred_contribs=True)
margin = booster.predict(Xy, output_margin=True)
np.testing.assert_allclose(
@ -334,8 +383,8 @@ class TestGPUPredict:
)
def test_predict_leaf_basic(self):
gpu_leaf = run_predict_leaf(0)
cpu_leaf = run_predict_leaf(-1)
gpu_leaf = run_predict_leaf("gpu:0")
cpu_leaf = run_predict_leaf("cpu")
np.testing.assert_equal(gpu_leaf, cpu_leaf)
def run_predict_leaf_booster(self, param, num_rounds, dataset):
@ -344,23 +393,22 @@ class TestGPUPredict:
booster = xgb.train(
param, dtrain=dataset.get_dmat(), num_boost_round=num_rounds
)
booster = tm.set_ordinal(-1, booster)
booster.set_param({"device": "cpu"})
cpu_leaf = booster.predict(m, pred_leaf=True)
booster = tm.set_ordinal(0, booster)
booster.set_param({"device": "cuda:0"})
gpu_leaf = booster.predict(m, pred_leaf=True)
np.testing.assert_equal(cpu_leaf, gpu_leaf)
@given(predict_parameter_strategy, tm.make_dataset_strategy())
@settings(deadline=None, max_examples=20, print_blob=True)
def test_predict_leaf_gbtree(self, param, dataset):
def test_predict_leaf_gbtree(self, param: dict, dataset: tm.TestDataset) -> None:
# Unsupported for random forest
if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
return
param["booster"] = "gbtree"
param["tree_method"] = "gpu_hist"
param.update({"booster": "gbtree", "tree_method": "hist", "device": "cuda:0"})
self.run_predict_leaf_booster(param, 10, dataset)
@given(predict_parameter_strategy, tm.make_dataset_strategy())
@ -370,8 +418,7 @@ class TestGPUPredict:
if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
return
param["booster"] = "dart"
param["tree_method"] = "gpu_hist"
param.update({"booster": "dart", "tree_method": "hist", "device": "cuda:0"})
self.run_predict_leaf_booster(param, 10, dataset)
@pytest.mark.skipif(**tm.no_sklearn())
@ -395,12 +442,12 @@ class TestGPUPredict:
dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)
params = {
"tree_method": "gpu_hist",
"tree_method": "hist",
"max_depth": 3,
"learning_rate": 1.0,
"base_score": 0.0,
"eval_metric": "rmse",
"gpu_id": "0",
"device": "cuda:0",
}
eval_history = {}
@ -412,7 +459,7 @@ class TestGPUPredict:
verbose_eval=False,
evals_result=eval_history,
)
bst = tm.set_ordinal(0, bst)
bst.set_param({"device": "cuda:0"})
pred = bst.predict(dtrain)
rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
np.testing.assert_almost_equal(
@ -434,14 +481,16 @@ class TestGPUPredict:
Xy = xgb.DMatrix(X, y)
if n_classes == 2:
params = {
"tree_method": "gpu_hist",
"tree_method": "hist",
"device": "cuda:0",
"booster": "dart",
"rate_drop": 0.5,
"objective": "binary:logistic",
}
else:
params = {
"tree_method": "gpu_hist",
"tree_method": "hist",
"device": "cuda:0",
"booster": "dart",
"rate_drop": 0.5,
"objective": "multi:softprob",
@ -455,7 +504,7 @@ class TestGPUPredict:
copied = booster.predict(Xy)
# CPU
booster = tm.set_ordinal(-1, booster)
booster.set_param({"device": "cpu"})
cpu_inplace = booster.inplace_predict(X_)
cpu_copied = booster.predict(Xy)
@ -465,7 +514,7 @@ class TestGPUPredict:
cp.testing.assert_allclose(inplace, copied, atol=1e-6)
# GPU
booster = tm.set_ordinal(0, booster)
booster.set_param({"device": "cuda:0"})
inplace = booster.inplace_predict(X)
copied = booster.predict(Xy)
@ -482,7 +531,7 @@ class TestGPUPredict:
orig = rng.randint(low=0, high=127, size=rows * cols).reshape(rows, cols)
y = rng.randint(low=0, high=127, size=rows)
dtrain = xgb.DMatrix(orig, label=y)
booster = xgb.train({"tree_method": "gpu_hist"}, dtrain)
booster = xgb.train({"tree_method": "hist", "device": "cuda:0"}, dtrain)
predt_orig = booster.inplace_predict(orig)
# all primitive types in numpy

View File

@ -28,7 +28,7 @@ def run_threaded_predict(X, rows, predict_func):
assert f.result()
def run_predict_leaf(gpu_id: int) -> np.ndarray:
def run_predict_leaf(device: str) -> np.ndarray:
rows = 100
cols = 4
classes = 5
@ -48,7 +48,7 @@ def run_predict_leaf(gpu_id: int) -> np.ndarray:
num_boost_round=num_boost_round,
)
booster = tm.set_ordinal(gpu_id, booster)
booster.set_param({"device": device})
empty = xgb.DMatrix(np.ones(shape=(0, cols)))
empty_leaf = booster.predict(empty, pred_leaf=True)
assert empty_leaf.shape[0] == 0
@ -74,14 +74,14 @@ def run_predict_leaf(gpu_id: int) -> np.ndarray:
# When there's only 1 tree, the output is a 1 dim vector
booster = xgb.train({"tree_method": "hist"}, num_boost_round=1, dtrain=m)
booster = tm.set_ordinal(gpu_id, booster)
booster.set_param({"device": device})
assert booster.predict(m, pred_leaf=True).shape == (rows,)
return leaf
def test_predict_leaf() -> None:
run_predict_leaf(-1)
run_predict_leaf("cpu")
def test_predict_shape():

View File

@ -69,7 +69,7 @@ def run_dmatrix_ctor(is_feature_cols: bool, is_qdm: bool, on_gpu: bool) -> None:
train_Xy, valid_Xy = create_dmatrix_from_partitions(
iter(dfs),
feature_cols,
gpu_id=device_id,
dev_ordinal=device_id,
use_qdm=is_qdm,
kwargs=kwargs,
enable_sparse_data_optim=False,

View File

@ -1025,6 +1025,7 @@ class XgboostLocalTest(SparkTestCase):
self.assertTrue(hasattr(py_reg, "n_estimators"))
self.assertEqual(py_reg.n_estimators.parent, py_reg.uid)
self.assertFalse(hasattr(py_reg, "gpu_id"))
self.assertFalse(hasattr(py_reg, "device"))
self.assertEqual(py_reg.getOrDefault(py_reg.n_estimators), 100)
self.assertEqual(py_reg.getOrDefault(py_reg.objective), "reg:squarederror")
py_reg2 = SparkXGBRegressor(n_estimators=200)
@ -1038,6 +1039,7 @@ class XgboostLocalTest(SparkTestCase):
self.assertTrue(hasattr(py_cls, "n_estimators"))
self.assertEqual(py_cls.n_estimators.parent, py_cls.uid)
self.assertFalse(hasattr(py_cls, "gpu_id"))
self.assertFalse(hasattr(py_cls, "device"))
self.assertEqual(py_cls.getOrDefault(py_cls.n_estimators), 100)
self.assertEqual(py_cls.getOrDefault(py_cls.objective), None)
py_cls2 = SparkXGBClassifier(n_estimators=200)
@ -1051,6 +1053,7 @@ class XgboostLocalTest(SparkTestCase):
self.assertTrue(hasattr(py_cls, "n_estimators"))
self.assertEqual(py_cls.n_estimators.parent, py_cls.uid)
self.assertFalse(hasattr(py_cls, "gpu_id"))
self.assertFalse(hasattr(py_cls, "device"))
self.assertTrue(hasattr(py_cls, "arbitrary_params_dict"))
expected_kwargs = {"sketch_eps": 0.03}
self.assertEqual(