Define the new device parameter. (#9362)

2023-07-13 19:30:25 +08:00
parent 2d0cd2817e
commit 04aff3af8e
63 changed files with 827 additions and 477 deletions
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -28,6 +28,7 @@ class LintersPaths:
        "tests/python-gpu/test_gpu_prediction.py",
        "tests/python-gpu/load_pickle.py",
        "tests/python-gpu/test_gpu_pickling.py",
+        "tests/python-gpu/test_gpu_eval_metrics.py",
        "tests/test_distributed/test_with_spark/",
        "tests/test_distributed/test_gpu_with_spark/",
        # demo
--- a/tests/cpp/common/test_algorithm.cu
+++ b/tests/cpp/common/test_algorithm.cu
@@ -16,8 +16,7 @@
 namespace xgboost {
 namespace common {
 void TestSegmentedArgSort() {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);

  size_t constexpr kElements = 100, kGroups = 3;
  dh::device_vector<size_t> sorted_idx(kElements, 0);
@@ -55,8 +54,7 @@ void TestSegmentedArgSort() {
 TEST(Algorithm, SegmentedArgSort) { TestSegmentedArgSort(); }

 TEST(Algorithm, GpuArgSort) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);

  dh::device_vector<float> values(20);
  dh::Iota(dh::ToSpan(values));                                    // accending
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -227,7 +227,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
  }
  // check categorical
  beg = n_samples;
-  for (std::size_t i = 0; i < n_categories; ++i) {
+  for (bst_cat_t i = 0; i < n_categories; ++i) {
    // all from the second column
    ASSERT_EQ(static_cast<bst_feature_t>(weight[i + beg]) % n_features, 1);
  }
--- a/tests/cpp/common/test_linalg.cu
+++ b/tests/cpp/common/test_linalg.cu
@@ -4,6 +4,7 @@
 #include <gtest/gtest.h>

 #include "../../../src/common/linalg_op.cuh"
+#include "../helpers.h"
 #include "xgboost/context.h"
 #include "xgboost/linalg.h"

@@ -54,8 +55,7 @@ void TestElementWiseKernel() {
 }

 void TestSlice() {
-  Context ctx;
-  ctx.gpu_id = 1;
+  auto ctx = MakeCUDACtx(1);
  thrust::device_vector<double> data(2 * 3 * 4);
  auto t = MakeTensorView(&ctx, dh::ToSpan(data), 2, 3, 4);
  dh::LaunchN(1, [=] __device__(size_t) {
--- a/tests/cpp/common/test_ranking_utils.cu
+++ b/tests/cpp/common/test_ranking_utils.cu
@@ -23,8 +23,7 @@

 namespace xgboost::ltr {
 void TestCalcQueriesInvIDCG() {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
  std::size_t n_groups = 5, n_samples_per_group = 32;

  dh::device_vector<float> scores(n_samples_per_group * n_groups);
@@ -85,20 +84,17 @@ void TestRankingCache(Context const* ctx) {
 }  // namespace

 TEST(RankingCache, InitFromGPU) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
  TestRankingCache(&ctx);
 }

 TEST(NDCGCache, InitFromGPU) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
  TestNDCGCache(&ctx);
 }

 TEST(MAPCache, InitFromGPU) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
  TestMAPCache(&ctx);
 }
 }  // namespace xgboost::ltr
--- a/tests/cpp/common/test_stats.cc
+++ b/tests/cpp/common/test_stats.cc
@@ -7,6 +7,7 @@

 #include "../../../src/common/stats.h"
 #include "../../../src/common/transform_iterator.h"  // common::MakeIndexTransformIter
+#include "../helpers.h"

 namespace xgboost {
 namespace common {
@@ -71,7 +72,7 @@ TEST(Stats, Median) {
    ASSERT_EQ(m, .5f);

 #if defined(XGBOOST_USE_CUDA)
-    ctx.gpu_id = 0;
+    ctx = ctx.MakeCUDA(0);
    ASSERT_FALSE(ctx.IsCPU());
    Median(&ctx, values, weights, &out);
    m = out(0);
@@ -80,7 +81,7 @@ TEST(Stats, Median) {
  }

  {
-    ctx.gpu_id = Context::kCpuId;
+    ctx = ctx.MakeCPU();
    // 4x2 matrix
    linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.gpu_id};
    HostDeviceVector<float> weights;
@@ -90,7 +91,7 @@ TEST(Stats, Median) {
    ASSERT_EQ(out(1), .5f);

 #if defined(XGBOOST_USE_CUDA)
-    ctx.gpu_id = 0;
+    ctx = ctx.MakeCUDA(0);
    Median(&ctx, values, weights, &out);
    ASSERT_EQ(out(0), .5f);
    ASSERT_EQ(out(1), .5f);
@@ -123,8 +124,7 @@ TEST(Stats, Mean) {

 #if defined(XGBOOST_USE_CUDA)
 TEST(Stats, GPUMean) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
  TestMean(&ctx);
 }
 #endif  // defined(XGBOOST_USE_CUDA)
--- a/tests/cpp/common/test_stats.cu
+++ b/tests/cpp/common/test_stats.cu
@@ -3,16 +3,17 @@
 */
 #include <gtest/gtest.h>

-#include <cstddef>                            // std::size_t
-#include <utility>                            // std::pair
-#include <vector>                             // std::vector
+#include <cstddef>  // std::size_t
+#include <utility>  // std::pair
+#include <vector>   // std::vector

 #include "../../../src/common/linalg_op.cuh"  // ElementWiseTransformDevice
 #include "../../../src/common/stats.cuh"
-#include "xgboost/base.h"                     // XGBOOST_DEVICE
-#include "xgboost/context.h"                  // Context
-#include "xgboost/host_device_vector.h"       // HostDeviceVector
-#include "xgboost/linalg.h"                   // Tensor
+#include "../helpers.h"
+#include "xgboost/base.h"                // XGBOOST_DEVICE
+#include "xgboost/context.h"             // Context
+#include "xgboost/host_device_vector.h"  // HostDeviceVector
+#include "xgboost/linalg.h"              // Tensor

 namespace xgboost {
 namespace common {
@@ -33,7 +34,7 @@ class StatsGPU : public ::testing::Test {
  }

 public:
-  void SetUp() override { ctx_.gpu_id = 0; }
+  void SetUp() override { ctx_  = MakeCUDACtx(0); }

  void WeightedMulti() {
    // data for one segment
--- a/tests/cpp/data/test_gradient_index.cc
+++ b/tests/cpp/data/test_gradient_index.cc
@@ -171,8 +171,7 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo
    ASSERT_TRUE(Xy->SingleColBlock());
    bst_bin_t constexpr kBins{17};
    auto p = BatchParam{kBins, threshold};
-    Context gpu_ctx;
-    gpu_ctx.gpu_id = 0;
+    auto gpu_ctx = MakeCUDACtx(0);
    for (auto const &page : Xy->GetBatches<EllpackPage>(
             &gpu_ctx, BatchParam{kBins, tree::TrainParam::DftSparseThreshold()})) {
      from_ellpack = std::make_unique<GHistIndexMatrix>(&ctx, Xy->Info(), page, p);
--- a/tests/cpp/gbm/test_gbtree.cc
+++ b/tests/cpp/gbm/test_gbtree.cc
@@ -180,7 +180,12 @@ TEST(GBTree, ChooseTreeMethod) {
      learner->SetParam("tree_method", tree_method.value());
    }
    if (device.has_value()) {
-      learner->SetParam("gpu_id", device.value());
+      auto const& d = device.value();
+      if (std::isdigit(d.front()) || d.front() == '-') {
+        learner->SetParam("gpu_id", d);
+      } else {
+        learner->SetParam("device", d);
+      }
    }
    learner->Configure();
    for (std::int32_t i = 0; i < 3; ++i) {
@@ -199,7 +204,12 @@ TEST(GBTree, ChooseTreeMethod) {
      learner->SetParam("tree_method", tree_method.value());
    }
    if (device.has_value()) {
-      learner->SetParam("gpu_id", device.value());
+      auto const& d = device.value();
+      if (std::isdigit(d.front()) || d.front() == '-') {
+        learner->SetParam("gpu_id", d);
+      } else {
+        learner->SetParam("device", d);
+      }
    }
    learner->Configure();
    for (std::int32_t i = 0; i < 3; ++i) {
@@ -215,11 +225,12 @@ TEST(GBTree, ChooseTreeMethod) {

  // |        | hist    | gpu_hist | exact | NA  |
  // |--------+---------+----------+-------+-----|
-  // | CUDA:0 | GPU     | GPU (w)  | Err   | GPU | # not yet tested
-  // | CPU    | CPU     | Err      | CPU   | CPU | # not yet tested
+  // | CUDA:0 | GPU     | GPU (w)  | Err   | GPU |
+  // | CPU    | CPU     | GPU (w)  | CPU   | CPU |
  // |--------+---------+----------+-------+-----|
  // | -1     | CPU     | GPU (w)  | CPU   | CPU |
  // | 0      | GPU     | GPU (w)  | Err   | GPU |
+  // |--------+---------+----------+-------+-----|
  // | NA     | CPU     | GPU (w)  | CPU   | CPU |
  //
  // - (w): warning
@@ -237,18 +248,30 @@ TEST(GBTree, ChooseTreeMethod) {
          // hist
          {{"hist", "-1"}, "grow_quantile_histmaker"},
          {{"hist", "0"}, "grow_gpu_hist"},
+          {{"hist", "cpu"}, "grow_quantile_histmaker"},
+          {{"hist", "cuda"}, "grow_gpu_hist"},
+          {{"hist", "cuda:0"}, "grow_gpu_hist"},
          {{"hist", std::nullopt}, "grow_quantile_histmaker"},
          // gpu_hist
          {{"gpu_hist", "-1"}, "grow_gpu_hist"},
          {{"gpu_hist", "0"}, "grow_gpu_hist"},
+          {{"gpu_hist", "cpu"}, "grow_gpu_hist"},
+          {{"gpu_hist", "cuda"}, "grow_gpu_hist"},
+          {{"gpu_hist", "cuda:0"}, "grow_gpu_hist"},
          {{"gpu_hist", std::nullopt}, "grow_gpu_hist"},
          // exact
          {{"exact", "-1"}, "grow_colmaker,prune"},
          {{"exact", "0"}, "err"},
+          {{"exact", "cpu"}, "grow_colmaker,prune"},
+          {{"exact", "cuda"}, "err"},
+          {{"exact", "cuda:0"}, "err"},
          {{"exact", std::nullopt}, "grow_colmaker,prune"},
          // NA
          {{std::nullopt, "-1"}, "grow_quantile_histmaker"},
          {{std::nullopt, "0"}, "grow_gpu_hist"},  // default to hist
+          {{std::nullopt, "cpu"}, "grow_quantile_histmaker"},
+          {{std::nullopt, "cuda"}, "grow_gpu_hist"},
+          {{std::nullopt, "cuda:0"}, "grow_gpu_hist"},
          {{std::nullopt, std::nullopt}, "grow_quantile_histmaker"},
      };

@@ -392,8 +415,7 @@ class Dart : public testing::TestWithParam<char const*> {
    for (size_t i = 0; i < 16; ++i) {
      learner->UpdateOneIter(i, p_mat);
    }
-
-    ConfigLearnerByCtx(&ctx, learner.get());
+    learner->SetParam("device", ctx.DeviceName());

    HostDeviceVector<float> predts_training;
    learner->Predict(p_mat, false, &predts_training, 0, 0, true);
@@ -654,8 +676,7 @@ TEST(GBTree, InplacePredictionError) {
        RandomDataGenerator{n_samples, n_features, 0.5f}.Batches(2).GenerateSparsePageDMatrix(
            "cache", true);
    std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
-    learner->SetParam("booster", booster);
-    ConfigLearnerByCtx(ctx, learner.get());
+    learner->SetParams(Args{{"booster", booster}, {"device", ctx->DeviceName()}});
    learner->Configure();
    for (std::int32_t i = 0; i < 3; ++i) {
      learner->UpdateOneIter(i, p_fmat);
@@ -697,9 +718,9 @@ TEST(GBTree, InplacePredictionError) {
 #endif  // defined(XGBOOST_USE_CUDA)
    };
    std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
-    learner->SetParam("booster", booster);
-    learner->SetParam("max_bin", std::to_string(max_bins));
-    ConfigLearnerByCtx(ctx, learner.get());
+    learner->SetParams(Args{{"booster", booster},
+                            {"max_bin", std::to_string(max_bins)},
+                            {"device", ctx->DeviceName()}});
    learner->Configure();
    for (std::int32_t i = 0; i < 3; ++i) {
      learner->UpdateOneIter(i, p_fmat);
--- a/tests/cpp/gbm/test_gbtree.cu
+++ b/tests/cpp/gbm/test_gbtree.cu
@@ -8,6 +8,7 @@
 #include <limits>  // for numeric_limits
 #include <memory>  // for shared_ptr
 #include <string>  // for string
+#include <thread>  // for thread

 #include "../../../src/data/adapter.h"           // for ArrayAdapter
 #include "../../../src/data/device_adapter.cuh"  // for CupyAdapter
@@ -41,7 +42,7 @@ void TestInplaceFallback(Context const* ctx) {

  // learner is configured to the device specified by ctx
  std::unique_ptr<Learner> learner{Learner::Create({Xy})};
-  ConfigLearnerByCtx(ctx, learner.get());
+  learner->SetParam("device", ctx->DeviceName());
  for (std::int32_t i = 0; i < 3; ++i) {
    learner->UpdateOneIter(i, Xy);
  }
@@ -56,18 +57,31 @@ void TestInplaceFallback(Context const* ctx) {

  HostDeviceVector<float>* out_predt{nullptr};
  ConsoleLogger::Configure(Args{{"verbosity", "1"}});
+  std::string output;
  // test whether the warning is raised
+#if !defined(_WIN32)
+  // Windows has an issue with CUDA and thread-local storage. For some reason, on Windows a
+  // cudaInitializationError is raised during destruction of `HostDeviceVector`. This
+  // might be related to https://github.com/dmlc/xgboost/issues/5793
  ::testing::internal::CaptureStderr();
+  std::thread{[&] {
+    // Launch a new thread to ensure the warning is raised, because we prevent over-verbose
+    // warnings by using thread-local flags.
+    learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
+                            &out_predt, 0, 0);
+  }}.join();
+  output = testing::internal::GetCapturedStderr();
+  ASSERT_NE(output.find("Falling back"), std::string::npos);
+#endif
+
  learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
                          &out_predt, 0, 0);
-  auto output = testing::internal::GetCapturedStderr();
-  ASSERT_NE(output.find("Falling back"), std::string::npos);

  // test when the contexts match
  Context new_ctx = *proxy->Ctx();
  ASSERT_NE(new_ctx.gpu_id, ctx->gpu_id);

-  ConfigLearnerByCtx(&new_ctx, learner.get());
+  learner->SetParam("device", new_ctx.DeviceName());
  HostDeviceVector<float>* out_predt_1{nullptr};
  // no warning is raised
  ::testing::internal::CaptureStderr();
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -559,16 +559,4 @@ class DeclareUnifiedDistributedTest(MetricTest) : public ::testing::Test {
    }
  }
 };
-
-// A temporary solution before we move away from gpu_id.
-inline void ConfigLearnerByCtx(Context const* ctx, Learner* learner) {
-  if (ctx->IsCPU()) {
-    learner->SetParam("tree_method", "hist");
-  } else {
-    learner->SetParam("tree_method", "gpu_hist");
-  }
-  learner->SetParam("gpu_id", std::to_string(ctx->gpu_id));
-  learner->Configure();
-  ASSERT_EQ(learner->Ctx()->gpu_id, ctx->gpu_id);
-}
 }  // namespace xgboost
--- a/tests/cpp/metric/test_multiclass_metric.h
+++ b/tests/cpp/metric/test_multiclass_metric.h
@@ -46,7 +46,6 @@ inline void CheckDeterministicMetricMultiClass(StringView name, int32_t device)

 inline void TestMultiClassError(int device, DataSplitMode data_split_mode) {
  auto ctx = MakeCUDACtx(device);
-  ctx.gpu_id = device;
  xgboost::Metric * metric = xgboost::Metric::Create("merror", &ctx);
  metric->Configure({});
  ASSERT_STREQ(metric->Name(), "merror");
@@ -67,7 +66,6 @@ inline void VerifyMultiClassError(DataSplitMode data_split_mode = DataSplitMode:

 inline void TestMultiClassLogLoss(int device, DataSplitMode data_split_mode) {
  auto ctx = MakeCUDACtx(device);
-  ctx.gpu_id = device;
  xgboost::Metric * metric = xgboost::Metric::Create("mlogloss", &ctx);
  metric->Configure({});
  ASSERT_STREQ(metric->Name(), "mlogloss");
--- a/tests/cpp/objective/test_lambdarank_obj.cu
+++ b/tests/cpp/objective/test_lambdarank_obj.cu
@@ -13,26 +13,22 @@

 namespace xgboost::obj {
 TEST(LambdaRank, GPUNDCGJsonIO) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
  TestNDCGJsonIO(&ctx);
 }

 TEST(LambdaRank, GPUMAPStat) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
  TestMAPStat(&ctx);
 }

 TEST(LambdaRank, GPUNDCGGPair) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
  TestNDCGGPair(&ctx);
 }

 void TestGPUMakePair() {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);

  MetaInfo info;
  HostDeviceVector<float> predt;
@@ -126,8 +122,7 @@ void TestGPUMakePair() {
 TEST(LambdaRank, GPUMakePair) { TestGPUMakePair(); }

 TEST(LambdaRank, GPUUnbiasedNDCG) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
  TestUnbiasedNDCG(&ctx);
 }

@@ -161,8 +156,7 @@ TEST(LambdaRank, RankItemCountOnRight) {
 }

 TEST(LambdaRank, GPUMAPGPair) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
  TestMAPGPair(&ctx);
 }
 }  // namespace xgboost::obj
--- a/tests/cpp/objective/test_regression_obj.cc
+++ b/tests/cpp/objective/test_regression_obj.cc
@@ -305,12 +305,12 @@ TEST(Objective, CPU_vs_CUDA) {

  {
    // CPU
-    ctx.gpu_id = -1;
+    ctx = ctx.MakeCPU();
    obj->GetGradient(preds, info, 0, &cpu_out_preds);
  }
  {
    // CUDA
-    ctx.gpu_id = 0;
+    ctx = ctx.MakeCUDA(0);
    obj->GetGradient(preds, info, 0, &cuda_out_preds);
  }

--- a/tests/cpp/plugin/test_regression_obj_oneapi.cc
+++ b/tests/cpp/plugin/test_regression_obj_oneapi.cc
@@ -148,7 +148,7 @@ TEST(Plugin, CPUvsOneAPI) {

  {
    // CPU
-    ctx.gpu_id = -1;
+    ctx = ctx.MakeCPU();
    obj_cpu->GetGradient(preds, info, 0, &cpu_out_preds);
  }
  {
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -214,15 +214,16 @@ void TestUpdatePredictionCache(bool use_subsampling) {
 }
 }  // namespace

-TEST(CPUPredictor, GHistIndex) {
+TEST(CPUPredictor, GHistIndexTraining) {
  size_t constexpr kRows{128}, kCols{16}, kBins{64};
+  Context ctx;
  auto p_hist = RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).GenerateQuantileDMatrix(false);
  HostDeviceVector<float> storage(kRows * kCols);
  auto columnar = RandomDataGenerator{kRows, kCols, 0.0}.GenerateArrayInterface(&storage);
  auto adapter = data::ArrayAdapter(columnar.c_str());
  std::shared_ptr<DMatrix> p_full{
      DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)};
-  TestTrainingPrediction(kRows, kBins, "hist", p_full, p_hist);
+  TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_hist);
 }

 TEST(CPUPredictor, CategoricalPrediction) {
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -33,9 +33,8 @@ TEST(GPUPredictor, Basic) {
    int n_row = i, n_col = i;
    auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();

-    Context ctx;
-    ctx.gpu_id = 0;
-    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)};
+    auto ctx = MakeCUDACtx(0);
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
    gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);

    // Test predict batch
@@ -71,7 +70,7 @@ void VerifyBasicColumnSplit(std::array<std::vector<float>, 32> const& expected_r
    auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
    std::unique_ptr<DMatrix> sliced{dmat->SliceCol(world_size, rank)};

-    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)};
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
    gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);

    // Test predict batch
@@ -102,7 +101,7 @@ TEST(GPUPredictor, MGPUBasicColumnSplit) {
    size_t n_row = i, n_col = i;
    auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();

-    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)};
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
    gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);

    // Test predict batch
@@ -132,18 +131,19 @@ TEST(GPUPredictor, EllpackBasic) {
 }

 TEST(GPUPredictor, EllpackTraining) {
-  size_t constexpr kRows { 128 }, kCols { 16 }, kBins { 64 };
-  auto p_ellpack =
-      RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).Device(0).GenerateDeviceDMatrix(false);
+  auto ctx = MakeCUDACtx(0);
+  size_t constexpr kRows{128}, kCols{16}, kBins{64};
+  auto p_ellpack = RandomDataGenerator{kRows, kCols, 0.0}
+                       .Bins(kBins)
+                       .Device(ctx.Ordinal())
+                       .GenerateDeviceDMatrix(false);
  HostDeviceVector<float> storage(kRows * kCols);
-  auto columnar = RandomDataGenerator{kRows, kCols, 0.0}
-       .Device(0)
-       .GenerateArrayInterface(&storage);
+  auto columnar =
+      RandomDataGenerator{kRows, kCols, 0.0}.Device(ctx.Ordinal()).GenerateArrayInterface(&storage);
  auto adapter = data::CupyAdapter(columnar);
-  std::shared_ptr<DMatrix> p_full {
-    DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)
-  };
-  TestTrainingPrediction(kRows, kBins, "gpu_hist", p_full, p_ellpack);
+  std::shared_ptr<DMatrix> p_full{
+      DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)};
+  TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_ellpack);
 }

 TEST(GPUPredictor, ExternalMemoryTest) {
@@ -153,9 +153,8 @@ TEST(GPUPredictor, ExternalMemoryTest) {
  gpu_predictor->Configure({});

  const int n_classes = 3;
-  Context ctx;
-  ctx.gpu_id = 0;
-  LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.gpu_id)};
+  Context ctx = MakeCUDACtx(0);
+  LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.Ordinal())};

  gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx, n_classes);
  std::vector<std::unique_ptr<DMatrix>> dmats;
@@ -185,7 +184,7 @@ TEST(GPUPredictor, InplacePredictCupy) {
  auto ctx = MakeCUDACtx(0);
  size_t constexpr kRows{128}, kCols{64};
  RandomDataGenerator gen(kRows, kCols, 0.5);
-  gen.Device(ctx.gpu_id);
+  gen.Device(ctx.Ordinal());
  HostDeviceVector<float> data;
  std::string interface_str = gen.GenerateArrayInterface(&data);
  std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
@@ -197,7 +196,7 @@ TEST(GPUPredictor, InplacePredictCuDF) {
  auto ctx = MakeCUDACtx(0);
  size_t constexpr kRows{128}, kCols{64};
  RandomDataGenerator gen(kRows, kCols, 0.5);
-  gen.Device(ctx.gpu_id);
+  gen.Device(ctx.Ordinal());
  std::vector<HostDeviceVector<float>> storage(kCols);
  auto interface_str = gen.GenerateColumnarArrayInterface(&storage);
  std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
@@ -214,9 +213,8 @@ TEST(GpuPredictor, LesserFeatures) {
 TEST(GPUPredictor, ShapStump) {
  cudaSetDevice(0);

-  Context ctx;
-  ctx.gpu_id = 0;
-  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.gpu_id)};
+  auto ctx = MakeCUDACtx(0);
+  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())};
  gbm::GBTreeModel model(&mparam, &ctx);

  std::vector<std::unique_ptr<RegTree>> trees;
@@ -241,9 +239,8 @@ TEST(GPUPredictor, ShapStump) {
 }

 TEST(GPUPredictor, Shap) {
-  Context ctx;
-  ctx.gpu_id = 0;
-  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.gpu_id)};
+  auto ctx = MakeCUDACtx(0);
+  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())};
  gbm::GBTreeModel model(&mparam, &ctx);

  std::vector<std::unique_ptr<RegTree>> trees;
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -44,60 +44,49 @@ TEST(Predictor, PredictionCache) {
  EXPECT_ANY_THROW(container.Entry(m));
 }

-void TestTrainingPrediction(size_t rows, size_t bins,
-                            std::string tree_method,
-                            std::shared_ptr<DMatrix> p_full,
-                            std::shared_ptr<DMatrix> p_hist) {
+void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins,
+                            std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_hist) {
  size_t constexpr kCols = 16;
  size_t constexpr kClasses = 3;
  size_t constexpr kIters = 3;

  std::unique_ptr<Learner> learner;
-  auto train = [&](Context const& ctx) {
-    p_hist->Info().labels.Reshape(rows, 1);
-    auto &h_label = p_hist->Info().labels.Data()->HostVector();

-    for (size_t i = 0; i < rows; ++i) {
-      h_label[i] = i % kClasses;
-    }
+  p_hist->Info().labels.Reshape(rows, 1);
+  auto &h_label = p_hist->Info().labels.Data()->HostVector();

-    learner.reset(Learner::Create({}));
-    learner->SetParam("tree_method", tree_method);
-    learner->SetParam("objective", "multi:softprob");
-    learner->SetParam("num_feature", std::to_string(kCols));
-    learner->SetParam("num_class", std::to_string(kClasses));
-    learner->SetParam("max_bin", std::to_string(bins));
-    ConfigLearnerByCtx(&ctx, learner.get());
-    learner->Configure();
+  for (size_t i = 0; i < rows; ++i) {
+    h_label[i] = i % kClasses;
+  }

-    for (size_t i = 0; i < kIters; ++i) {
-      learner->UpdateOneIter(i, p_hist);
-    }
+  learner.reset(Learner::Create({}));
+  learner->SetParams(Args{{"objective", "multi:softprob"},
+                          {"num_feature", std::to_string(kCols)},
+                          {"num_class", std::to_string(kClasses)},
+                          {"max_bin", std::to_string(bins)},
+                          {"device", ctx->DeviceName()}});
+  learner->Configure();

-    Json model{Object{}};
-    learner->SaveModel(&model);
+  for (size_t i = 0; i < kIters; ++i) {
+    learner->UpdateOneIter(i, p_hist);
+  }

-    learner.reset(Learner::Create({}));
-    learner->LoadModel(model);
-    ConfigLearnerByCtx(&ctx, learner.get());
-    learner->Configure();
+  Json model{Object{}};
+  learner->SaveModel(&model);

-    HostDeviceVector<float> from_full;
-    learner->Predict(p_full, false, &from_full, 0, 0);
+  learner.reset(Learner::Create({}));
+  learner->LoadModel(model);
+  learner->SetParam("device", ctx->DeviceName());
+  learner->Configure();

-    HostDeviceVector<float> from_hist;
-    learner->Predict(p_hist, false, &from_hist, 0, 0);
+  HostDeviceVector<float> from_full;
+  learner->Predict(p_full, false, &from_full, 0, 0);

-    for (size_t i = 0; i < rows; ++i) {
-      EXPECT_NEAR(from_hist.ConstHostVector()[i],
-                  from_full.ConstHostVector()[i], kRtEps);
-    }
-  };
+  HostDeviceVector<float> from_hist;
+  learner->Predict(p_hist, false, &from_hist, 0, 0);

-  if (tree_method == "gpu_hist") {
-    train(MakeCUDACtx(0));
-  } else {
-    train(Context{});
+  for (size_t i = 0; i < rows; ++i) {
+    EXPECT_NEAR(from_hist.ConstHostVector()[i], from_full.ConstHostVector()[i], kRtEps);
  }
 }

@@ -120,7 +109,7 @@ void TestInplacePrediction(Context const *ctx, std::shared_ptr<DMatrix> x, bst_r
    learner->UpdateOneIter(it, m);
  }

-  learner->SetParam("gpu_id", std::to_string(ctx->gpu_id));
+  learner->SetParam("device", ctx->DeviceName());
  learner->Configure();

  HostDeviceVector<float> *p_out_predictions_0{nullptr};
@@ -153,7 +142,7 @@ void TestInplacePrediction(Context const *ctx, std::shared_ptr<DMatrix> x, bst_r
    ASSERT_NEAR(h_pred[i], h_pred_0[i] + h_pred_1[i] - 0.5f, kRtEps);
  }

-  learner->SetParam("gpu_id", "-1");
+  learner->SetParam("device", "cpu");
  learner->Configure();
 }

@@ -161,12 +150,12 @@ namespace {
 std::unique_ptr<Learner> LearnerForTest(Context const *ctx, std::shared_ptr<DMatrix> dmat,
                                        size_t iters, size_t forest = 1) {
  std::unique_ptr<Learner> learner{Learner::Create({dmat})};
-  learner->SetParams(Args{{"num_parallel_tree", std::to_string(forest)}});
+  learner->SetParams(
+      Args{{"num_parallel_tree", std::to_string(forest)}, {"device", ctx->DeviceName()}});
  for (size_t i = 0; i < iters; ++i) {
    learner->UpdateOneIter(i, dmat);
  }

-  ConfigLearnerByCtx(ctx, learner.get());
  return learner;
 }

@@ -215,7 +204,7 @@ void TestPredictionDeviceAccess() {
  {
    ASSERT_EQ(from_cpu.DeviceIdx(), Context::kCpuId);
    Context cpu_ctx;
-    ConfigLearnerByCtx(&cpu_ctx, learner.get());
+    learner->SetParam("device", cpu_ctx.DeviceName());
    learner->Predict(m_test, false, &from_cpu, 0, 0);
    ASSERT_TRUE(from_cpu.HostCanWrite());
    ASSERT_FALSE(from_cpu.DeviceCanRead());
@@ -225,7 +214,7 @@ void TestPredictionDeviceAccess() {
  HostDeviceVector<float> from_cuda;
  {
    Context cuda_ctx = MakeCUDACtx(0);
-    ConfigLearnerByCtx(&cuda_ctx, learner.get());
+    learner->SetParam("device", cuda_ctx.DeviceName());
    learner->Predict(m_test, false, &from_cuda, 0, 0);
    ASSERT_EQ(from_cuda.DeviceIdx(), 0);
    ASSERT_TRUE(from_cuda.DeviceCanWrite());
@@ -465,11 +454,7 @@ void TestIterationRangeColumnSplit(Context const* ctx) {
  auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses);
  auto learner = LearnerForTest(ctx, dmat, kIters, kForest);

-  if (ctx->IsCPU()) {
-    learner->SetParams(Args{{"gpu_id", std::to_string(-1)}});
-  } else {
-    learner->SetParams(Args{{"gpu_id", std::to_string(0)}});
-  }
+  learner->SetParam("device", ctx->DeviceName());

  bool bound = false;
  std::unique_ptr<Learner> sliced{learner->Slice(0, 3, 1, &bound)};
@@ -582,7 +567,7 @@ void TestSparsePredictionColumnSplit(Context const* ctx, float sparsity) {
  learner.reset(Learner::Create({Xy}));
  learner->LoadModel(model);

-  ConfigLearnerByCtx(ctx, learner.get());
+  learner->SetParam("device", ctx->DeviceName());
  learner->Predict(Xy, false, &sparse_predt, 0, 0);

  auto constexpr kWorldSize = 2;
--- a/tests/cpp/predictor/test_predictor.h
+++ b/tests/cpp/predictor/test_predictor.h
@@ -84,9 +84,8 @@ void TestPredictionFromGradientIndex(Context const* ctx, size_t rows, size_t col
 }

 // p_full and p_hist should come from the same data set.
-void TestTrainingPrediction(size_t rows, size_t bins, std::string tree_method,
-                            std::shared_ptr<DMatrix> p_full,
-                            std::shared_ptr<DMatrix> p_hist);
+void TestTrainingPrediction(Context const* ctx, size_t rows, size_t bins,
+                            std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_hist);

 void TestInplacePrediction(Context const* ctx, std::shared_ptr<DMatrix> x, bst_row_t rows,
                           bst_feature_t cols);
--- a/tests/cpp/test_context.cc
+++ b/tests/cpp/test_context.cc
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/base.h>
+#include <xgboost/context.h>
+
+namespace xgboost {
+TEST(Context, CPU) {  // A default Context is CPU; malformed "device" values must throw.
+  Context ctx;
+  ASSERT_EQ(ctx.Device(), DeviceOrd::CPU());
+  ASSERT_EQ(ctx.Ordinal(), Context::kCpuId);
+
+  std::int32_t flag{0};
+  ctx.DispatchDevice([&] { flag = -1; }, [&] { flag = 1; });  // expect the first callback
+  ASSERT_EQ(flag, -1);
+
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "oops"}}), dmlc::Error);  // unknown name
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "-1"}}), dmlc::Error);  // bare ordinal
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "CPU"}}), dmlc::Error);  // upper case
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "CUDA"}}), dmlc::Error);  // upper case
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "CPU:0"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "gpu:+0"}}), dmlc::Error);  // signed ordinal
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "gpu:0-"}}), dmlc::Error);  // trailing char
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "gpu:"}}), dmlc::Error);  // missing ordinal
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":gpu"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":0"}}), dmlc::Error);  // missing name
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ""}}), dmlc::Error);  // empty string
+}
+}  // namespace xgboost
--- a/tests/cpp/test_context.cu
+++ b/tests/cpp/test_context.cu
@@ -0,0 +1,99 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/base.h>  // for Args
+#include <xgboost/context.h>
+#include <xgboost/json.h>  // for FromJson, ToJson
+
+#include <string>  // for string, to_string
+
+#include "../../src/common/common.h"  // for AllVisibleGPUs
+
+namespace xgboost {
+namespace {
+void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) {  // asserts ctx is on CUDA device `ord`
+  ASSERT_EQ(ctx.gpu_id, ord);  // `gpu_id` must agree with the device accessors below
+  ASSERT_EQ(ctx.Device().ordinal, ord);
+  ASSERT_EQ(ctx.DeviceName(), "cuda:" + std::to_string(ord));
+  ASSERT_EQ(ctx.Ordinal(), ord);
+  ASSERT_TRUE(ctx.IsCUDA());
+  ASSERT_FALSE(ctx.IsCPU());
+  ASSERT_EQ(ctx.Device(), DeviceOrd::CUDA(ord));
+
+  Json jctx{ToJson(ctx)};  // round-trip through JSON; the device must survive it
+  Context new_ctx;
+  FromJson(jctx, &new_ctx);
+  ASSERT_EQ(new_ctx.Device(), ctx.Device());
+  ASSERT_EQ(new_ctx.gpu_id, ctx.gpu_id);
+}
+}  // namespace
+
+TEST(Context, DeviceOrdinal) {
+  Context ctx;
+  auto n_vis = common::AllVisibleGPUs();
+  auto ord = n_vis - 1;  // highest ordinal; presumably n_vis is the visible GPU count
+
+  std::string device = "cuda:" + std::to_string(ord);
+  ctx.UpdateAllowUnknown(Args{{"device", device}});
+  TestCUDA(ctx, ord);
+
+  device = "cuda:" + std::to_string(1001);
+  ctx.UpdateAllowUnknown(Args{{"device", device}});
+  ord = 1001 % n_vis;  // ordinal 1001 is expected to wrap around modulo n_vis
+
+  TestCUDA(ctx, ord);
+
+  std::int32_t flag{0};
+  ctx.DispatchDevice([&] { flag = -1; }, [&] { flag = 1; });  // 2nd callable is the CUDA branch
+  ASSERT_EQ(flag, 1);
+
+  Context new_ctx = ctx;  // a copy must keep the device
+  TestCUDA(new_ctx, ctx.Ordinal());
+
+  auto cpu_ctx = ctx.MakeCPU();
+  ASSERT_TRUE(cpu_ctx.IsCPU());
+  ASSERT_EQ(cpu_ctx.Ordinal(), Context::kCpuId);
+  ASSERT_EQ(cpu_ctx.Device(), DeviceOrd::CPU());
+
+  auto cuda_ctx = cpu_ctx.MakeCUDA(ctx.Ordinal());
+  TestCUDA(cuda_ctx, ctx.Ordinal());
+
+  cuda_ctx.UpdateAllowUnknown(Args{{"fail_on_invalid_gpu_id", "true"}});  // reject bad ordinals
+  ASSERT_THROW({ cuda_ctx.UpdateAllowUnknown(Args{{"device", "cuda:9999"}}); }, dmlc::Error);
+  cuda_ctx.UpdateAllowUnknown(Args{{"device", "cuda:00"}});  // leading zero is accepted
+  ASSERT_EQ(cuda_ctx.Ordinal(), 0);
+
+  ctx.UpdateAllowUnknown(Args{{"device", "cpu"}});
+  // Test alias: `gpu` is accepted in place of `cuda`.
+  ctx.UpdateAllowUnknown(Args{{"device", "gpu:0"}});
+  TestCUDA(ctx, 0);
+  ctx.UpdateAllowUnknown(Args{{"device", "gpu"}});
+  TestCUDA(ctx, 0);
+
+  // Test that the thread-local memory in dmlc does not link different instances together.
+  cpu_ctx.UpdateAllowUnknown(Args{{"device", "cpu"}});
+  TestCUDA(ctx, 0);
+  ctx.UpdateAllowUnknown(Args{});
+  TestCUDA(ctx, 0);
+}
+
+TEST(Context, GPUId) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});  // `gpu_id` alone must select a CUDA device
+  TestCUDA(ctx, 0);
+
+  auto n_vis = common::AllVisibleGPUs();
+  auto ord = n_vis - 1;  // highest ordinal; presumably n_vis is the visible GPU count
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", std::to_string(ord)}});
+  TestCUDA(ctx, ord);
+
+  auto device = "cuda:" + std::to_string(1001);
+  ctx.UpdateAllowUnknown(Args{{"device", device}});  // switch via `device` after `gpu_id`
+  ord = 1001 % n_vis;  // ordinal 1001 is expected to wrap around modulo n_vis
+  TestCUDA(ctx, ord);
+
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", "-1"}});  // -1 is expected to select the CPU
+  ASSERT_EQ(ctx.Device(), DeviceOrd::CPU());
+}
+}  // namespace xgboost
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -27,7 +27,6 @@
 #include "../../src/common/io.h"                    // for LoadSequentialFile
 #include "../../src/common/linalg_op.h"             // for ElementWiseTransformHost, begin, end
 #include "../../src/common/random.h"                // for GlobalRandom
-#include "../../src/common/transform_iterator.h"    // for IndexTransformIter
 #include "dmlc/io.h"                                // for Stream
 #include "dmlc/omp.h"                               // for omp_get_max_threads
 #include "dmlc/registry.h"                          // for Registry
@@ -35,14 +34,13 @@
 #include "helpers.h"                                // for GetBaseScore, RandomDataGenerator
 #include "objective_helpers.h"                      // for MakeObjNamesForTest, ObjTestNameGenerator
 #include "xgboost/base.h"                           // for bst_float, Args, bst_feature_t, bst_int
-#include "xgboost/context.h"                        // for Context
+#include "xgboost/context.h"                        // for Context, DeviceOrd
 #include "xgboost/data.h"                           // for DMatrix, MetaInfo, DataType
 #include "xgboost/host_device_vector.h"             // for HostDeviceVector
 #include "xgboost/json.h"                           // for Json, Object, get, String, IsA, opera...
 #include "xgboost/linalg.h"                         // for Tensor, TensorView
 #include "xgboost/logging.h"                        // for ConsoleLogger
 #include "xgboost/predictor.h"                      // for PredictionCacheEntry
-#include "xgboost/span.h"                           // for Span, operator!=, SpanIterator
 #include "xgboost/string_view.h"                    // for StringView

 namespace xgboost {
@@ -58,9 +56,9 @@ TEST(Learner, Basic) {
  auto minor = XGBOOST_VER_MINOR;
  auto patch = XGBOOST_VER_PATCH;

-  static_assert(std::is_integral<decltype(major)>::value, "Wrong major version type");
-  static_assert(std::is_integral<decltype(minor)>::value, "Wrong minor version type");
-  static_assert(std::is_integral<decltype(patch)>::value, "Wrong patch version type");
+  static_assert(std::is_integral_v<decltype(major)>, "Wrong major version type");
+  static_assert(std::is_integral_v<decltype(minor)>, "Wrong minor version type");
+  static_assert(std::is_integral_v<decltype(patch)>, "Wrong patch version type");
 }

 TEST(Learner, ParameterValidation) {
@@ -92,8 +90,7 @@ TEST(Learner, CheckGroup) {
  size_t constexpr kNumRows = 17;
  bst_feature_t constexpr kNumCols = 15;

-  std::shared_ptr<DMatrix> p_mat{
-      RandomDataGenerator{kNumRows, kNumCols, 0.0f}.GenerateDMatrix()};
+  std::shared_ptr<DMatrix> p_mat{RandomDataGenerator{kNumRows, kNumCols, 0.0f}.GenerateDMatrix()};
  std::vector<bst_float> weight(kNumGroups, 1);
  std::vector<bst_int> group(kNumGroups);
  group[0] = 2;
@@ -312,35 +309,36 @@ TEST(Learner, GPUConfiguration) {
    learner->SetParams({Arg{"booster", "gblinear"},
                        Arg{"updater", "gpu_coord_descent"}});
    learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, 0);
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
  }
  {
-    std::unique_ptr<Learner> learner {Learner::Create(mat)};
+    std::unique_ptr<Learner> learner{Learner::Create(mat)};
    learner->SetParams({Arg{"tree_method", "gpu_hist"}});
+    learner->Configure();
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
    learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, 0);
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
  }
  {
    std::unique_ptr<Learner> learner {Learner::Create(mat)};
    learner->SetParams({Arg{"tree_method", "gpu_hist"},
                        Arg{"gpu_id", "-1"}});
    learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, 0);
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
  }
  {
    // with CPU algorithm
    std::unique_ptr<Learner> learner {Learner::Create(mat)};
    learner->SetParams({Arg{"tree_method", "hist"}});
    learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, -1);
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CPU());
  }
  {
    // with CPU algorithm, but `gpu_id` takes priority
    std::unique_ptr<Learner> learner {Learner::Create(mat)};
-    learner->SetParams({Arg{"tree_method", "hist"},
-                        Arg{"gpu_id", "0"}});
+    learner->SetParams({Arg{"tree_method", "hist"}, Arg{"gpu_id", "0"}});
    learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, 0);
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
  }
 }
 #endif  // defined(XGBOOST_USE_CUDA)
--- a/tests/cpp/tree/test_node_partition.cc
+++ b/tests/cpp/tree/test_node_partition.cc
@@ -6,7 +6,9 @@
 #include <xgboost/task.h>          // for ObjInfo
 #include <xgboost/tree_updater.h>  // for TreeUpdater

-#include <memory>                  // for unique_ptr
+#include <memory>  // for unique_ptr
+
+#include "../helpers.h"

 namespace xgboost {
 TEST(Updater, HasNodePosition) {
@@ -19,7 +21,7 @@ TEST(Updater, HasNodePosition) {
  ASSERT_TRUE(up->HasNodePosition());

 #if defined(XGBOOST_USE_CUDA)
-  ctx.gpu_id = 0;
+  ctx = MakeCUDACtx(0);
  up.reset(TreeUpdater::Create("grow_gpu_hist", &ctx, &task));
  ASSERT_TRUE(up->HasNodePosition());
 #endif  // defined(XGBOOST_USE_CUDA)
--- a/tests/cpp/tree/test_prediction_cache.cc
+++ b/tests/cpp/tree/test_prediction_cache.cc
@@ -70,9 +70,9 @@ class TestPredictionCache : public ::testing::Test {
      Context ctx;
      ctx.InitAllowUnknown(Args{{"nthread", "8"}});
      if (updater_name == "grow_gpu_hist") {
-        ctx.gpu_id = 0;
+        ctx = ctx.MakeCUDA(0);
      } else {
-        ctx.gpu_id = Context::kCpuId;
+        ctx = ctx.MakeCPU();
      }

      ObjInfo task{ObjInfo::kRegression};
--- a/tests/python-gpu/load_pickle.py
+++ b/tests/python-gpu/load_pickle.py
@@ -34,7 +34,7 @@ class TestLoadPickle:
        bst = load_pickle(model_path)
        config = bst.save_config()
        config = json.loads(config)
-        assert config["learner"]["generic_param"]["gpu_id"] == "-1"
+        assert config["learner"]["generic_param"]["device"] == "cpu"

    def test_context_is_preserved(self) -> None:
        """Test the device context is preserved after pickling."""
@@ -42,14 +42,14 @@ class TestLoadPickle:
        bst = load_pickle(model_path)
        config = bst.save_config()
        config = json.loads(config)
-        assert config["learner"]["generic_param"]["gpu_id"] == "0"
+        assert config["learner"]["generic_param"]["device"] == "cuda:0"

    def test_wrap_gpu_id(self) -> None:
        assert os.environ["CUDA_VISIBLE_DEVICES"] == "0"
        bst = load_pickle(model_path)
        config = bst.save_config()
        config = json.loads(config)
-        assert config["learner"]["generic_param"]["gpu_id"] == "0"
+        assert config["learner"]["generic_param"]["device"] == "cuda:0"

        x, y = build_dataset()
        test_x = xgb.DMatrix(x)
--- a/tests/python-gpu/test_device_quantile_dmatrix.py
+++ b/tests/python-gpu/test_device_quantile_dmatrix.py
@@ -203,7 +203,7 @@ class TestQuantileDMatrix:
        np.testing.assert_equal(h_ret.indices, d_ret.indices)

        booster = xgb.train(
-            {"tree_method": "gpu_hist", "gpu_id": "0"}, dtrain=d_m
+            {"tree_method": "hist", "device": "cuda:0"}, dtrain=d_m
        )

        np.testing.assert_allclose(
--- a/tests/python-gpu/test_gpu_basic_models.py
+++ b/tests/python-gpu/test_gpu_basic_models.py
@@ -65,16 +65,20 @@ class TestGPUBasicModels:
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_invalid_gpu_id(self):
        from sklearn.datasets import load_digits
+
        X, y = load_digits(return_X_y=True)
        # should pass with invalid gpu id
-        cls1 = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=9999)
+        cls1 = xgb.XGBClassifier(tree_method="gpu_hist", gpu_id=9999)
        cls1.fit(X, y)
        # should throw error with fail_on_invalid_gpu_id enabled
        cls2 = xgb.XGBClassifier(
-            tree_method='gpu_hist', gpu_id=9999, fail_on_invalid_gpu_id=True
+            tree_method="gpu_hist", gpu_id=9999, fail_on_invalid_gpu_id=True
        )
-        try:
+        with pytest.raises(ValueError, match="ordinal 9999 is invalid"):
+            cls2.fit(X, y)
+
+        cls2 = xgb.XGBClassifier(
+            tree_method="hist", device="cuda:9999", fail_on_invalid_gpu_id=True
+        )
+        with pytest.raises(ValueError, match="ordinal 9999 is invalid"):
            cls2.fit(X, y)
-            assert False, "Should have failed with with fail_on_invalid_gpu_id enabled"
-        except xgb.core.XGBoostError as err:
-            assert "gpu_id 9999 is invalid" in str(err)
--- a/tests/python-gpu/test_gpu_eval_metrics.py
+++ b/tests/python-gpu/test_gpu_eval_metrics.py
@@ -43,10 +43,16 @@ class TestGPUEvalMetrics:
            num_boost_round=10,
        )
        cpu_auc = float(booster.eval(Xy).split(":")[1])
-        booster.set_param({"gpu_id": "0"})
-        assert json.loads(booster.save_config())["learner"]["generic_param"]["gpu_id"] == "0"
+        booster.set_param({"device": "cuda:0"})
+        assert (
+            json.loads(booster.save_config())["learner"]["generic_param"]["device"]
+            == "cuda:0"
+        )
        gpu_auc = float(booster.eval(Xy).split(":")[1])
-        assert json.loads(booster.save_config())["learner"]["generic_param"]["gpu_id"] == "0"
+        assert (
+            json.loads(booster.save_config())["learner"]["generic_param"]["device"]
+            == "cuda:0"
+        )

        np.testing.assert_allclose(cpu_auc, gpu_auc)

--- a/tests/python-gpu/test_gpu_pickling.py
+++ b/tests/python-gpu/test_gpu_pickling.py
@@ -113,14 +113,6 @@ class TestPickling:
        param = {"tree_method": "gpu_hist", "verbosity": 1}
        bst = xgb.train(param, train_x)

-        with tm.captured_output() as (out, err):
-            bst.inplace_predict(x)
-
-        # The warning is redirected to Python callback, so it's printed in stdout
-        # instead of stderr.
-        stdout = out.getvalue()
-        assert stdout.find("mismatched devices") != -1
-
        save_pickle(bst, model_path)

        args = self.args_template.copy()
@@ -177,7 +169,7 @@ class TestPickling:

        # Switch to CPU predictor
        bst = model.get_booster()
-        tm.set_ordinal(-1, bst)
+        bst.set_param({"device": "cpu"})
        cpu_pred = model.predict(x, output_margin=True)
        np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-5)

--- a/tests/python-gpu/test_gpu_prediction.py
+++ b/tests/python-gpu/test_gpu_prediction.py
@@ -39,7 +39,8 @@ predict_parameter_strategy = strategies.fixed_dictionaries(
    }
 )

-pytestmark = tm.timeout(20)
+# cupy nvrtc compilation can take a long time for the first run
+pytestmark = tm.timeout(30)


 class TestGPUPredict:
@@ -71,8 +72,8 @@ class TestGPUPredict:
                param = {
                    "objective": "binary:logistic",
                    "eval_metric": "logloss",
-                    "tree_method": "gpu_hist",
-                    "gpu_id": 0,
+                    "tree_method": "hist",
+                    "device": "gpu:0",
                    "max_depth": 1,
                }
                bst = xgb.train(
@@ -84,7 +85,7 @@ class TestGPUPredict:
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

-                bst.set_param({"gpu_id": -1, "tree_method": "hist"})
+                bst.set_param({"device": "cpu", "tree_method": "hist"})
                bst_cpu = copy(bst)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
@@ -107,14 +108,15 @@ class TestGPUPredict:
        dtrain = xgb.DMatrix(X_train, label=y_train)

        params = {}
-        params["tree_method"] = "gpu_hist"
+        params["tree_method"] = "hist"
+        params["device"] = "cuda:0"
        bst = xgb.train(params, dtrain)

-        tm.set_ordinal(0, bst)
+        bst.set_param({"device": "cuda:0"})
        # Don't reuse the DMatrix for prediction, otherwise the result is cached.
        predict_gpu_0 = bst.predict(xgb.DMatrix(X_test))
        predict_gpu_1 = bst.predict(xgb.DMatrix(X_test))
-        tm.set_ordinal(-1, bst)
+        bst.set_param({"device": "cpu"})
        predict_cpu = bst.predict(xgb.DMatrix(X_test))

        assert np.allclose(predict_gpu_0, predict_gpu_1)
@@ -131,8 +133,8 @@ class TestGPUPredict:
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        params = {
-            "tree_method": "gpu_hist",
-            "gpu_id": "0",
+            "tree_method": "hist",
+            "device": "cuda:0",
            "n_jobs": -1,
            "seed": 123,
        }
@@ -141,13 +143,54 @@ class TestGPUPredict:
        gpu_test_score = m.score(X_test, y_test)

        # Now with cpu
-        m = tm.set_ordinal(-1, m)
+        m.set_params(device="cpu")
        cpu_train_score = m.score(X_train, y_train)
        cpu_test_score = m.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

+    @pytest.mark.parametrize("device", ["cpu", "cuda"])
+    @pytest.mark.skipif(**tm.no_cupy())
+    def test_inplace_predict_device_type(self, device: str) -> None:
+        """Test inplace predict with different device and data types.
+
+        The sklearn interface uses inplace predict by default and gbtree falls back to
+        DMatrix whenever device doesn't match. This test checks that XGBoost can handle
+        different combinations of device and input data type.
+
+        """
+        import cudf
+        import cupy as cp
+        import pandas as pd
+        from scipy.sparse import csr_matrix
+
+        reg = xgb.XGBRegressor(tree_method="hist", device=device)
+        n_samples = 4096
+        n_features = 13
+        X, y, w = tm.make_regression(n_samples, n_features, use_cupy=True)
+        X[X == 0.0] = 1.0  # presumably so csr_matrix(X) below drops no entries -- confirm
+
+        reg.fit(X, y, sample_weight=w)
+        predt_0 = reg.predict(X)  # reference: X as produced with use_cupy=True
+
+        X = cp.asnumpy(X)  # same data as a NumPy array
+        predt_1 = reg.predict(X)
+
+        df = pd.DataFrame(X)  # pandas DataFrame input
+        predt_2 = reg.predict(df)
+
+        df = cudf.DataFrame(X)  # cuDF DataFrame input
+        predt_3 = reg.predict(df)
+
+        X_csr = csr_matrix(X)  # SciPy CSR input
+        predt_4 = reg.predict(X_csr)
+
+        np.testing.assert_allclose(predt_0, predt_1)  # every input type must match predt_0
+        np.testing.assert_allclose(predt_0, predt_2)
+        np.testing.assert_allclose(predt_0, predt_3)
+        np.testing.assert_allclose(predt_0, predt_4)
+
    def run_inplace_base_margin(self, booster, dtrain, X, base_margin):
        import cupy as cp

@@ -175,7 +218,9 @@ class TestGPUPredict:
        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train(
-            {"tree_method": "gpu_hist", "gpu_id": device}, dtrain, num_boost_round=10
+            {"tree_method": "hist", "device": f"cuda:{device}"},
+            dtrain,
+            num_boost_round=10,
        )

        test = xgb.DMatrix(X[:10, ...], missing=missing)
@@ -208,13 +253,13 @@ class TestGPUPredict:
        missing_idx = [i for i in range(0, X.shape[1], 16)]
        X[:, missing_idx] = missing
        reg = xgb.XGBRegressor(
-            tree_method="gpu_hist", n_estimators=8, missing=missing, gpu_id=device
+            tree_method="hist", n_estimators=8, missing=missing, device=f"cuda:{device}"
        )
        reg.fit(X, y)

-        reg = tm.set_ordinal(device, reg)
+        reg.set_params(device=f"cuda:{device}")
        gpu_predt = reg.predict(X)
-        reg = tm.set_ordinal(-1, reg)
+        reg = reg.set_params(device="cpu")
        cpu_predt = reg.predict(cp.asnumpy(X))
        np.testing.assert_allclose(gpu_predt, cpu_predt, atol=1e-6)
        cp.cuda.runtime.setDevice(0)
@@ -250,7 +295,9 @@ class TestGPUPredict:

        dtrain = xgb.DMatrix(X, y)

-        booster = xgb.train({"tree_method": "gpu_hist"}, dtrain, num_boost_round=10)
+        booster = xgb.train(
+            {"tree_method": "hist", "device": "cuda:0"}, dtrain, num_boost_round=10
+        )
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)
@@ -280,12 +327,12 @@ class TestGPUPredict:
    def test_shap(self, num_rounds, dataset, param):
        if dataset.name.endswith("-l1"):  # not supported by the exact tree method
            return
-        param.update({"tree_method": "gpu_hist", "gpu_id": 0})
+        param.update({"tree_method": "hist", "device": "gpu:0"})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin)
-        bst = tm.set_ordinal(0, bst)
+        bst.set_param({"device": "gpu:0"})
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
@@ -298,12 +345,12 @@ class TestGPUPredict:
    def test_shap_interactions(self, num_rounds, dataset, param):
        if dataset.name.endswith("-l1"):  # not supported by the exact tree method
            return
-        param.update({"tree_method": "hist", "gpu_id": 0})
+        param.update({"tree_method": "hist", "device": "cuda:0"})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin)
-        bst = tm.set_ordinal(0, bst)
+        bst.set_param({"device": "cuda:0"})
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
@@ -317,16 +364,18 @@ class TestGPUPredict:
    def test_shap_categorical(self):
        X, y = tm.make_categorical(100, 20, 7, False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
-        booster = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=10)
+        booster = xgb.train(
+            {"tree_method": "hist", "device": "gpu:0"}, Xy, num_boost_round=10
+        )

-        booster = tm.set_ordinal(0, booster)
+        booster.set_param({"device": "cuda:0"})
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(
            np.sum(shap, axis=len(shap.shape) - 1), margin, rtol=1e-3
        )

-        booster = tm.set_ordinal(-1, booster)
+        booster.set_param({"device": "cpu"})
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(
@@ -334,8 +383,8 @@ class TestGPUPredict:
        )

    def test_predict_leaf_basic(self):
-        gpu_leaf = run_predict_leaf(0)
-        cpu_leaf = run_predict_leaf(-1)
+        gpu_leaf = run_predict_leaf("gpu:0")
+        cpu_leaf = run_predict_leaf("cpu")
        np.testing.assert_equal(gpu_leaf, cpu_leaf)

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
@@ -344,23 +393,22 @@ class TestGPUPredict:
        booster = xgb.train(
            param, dtrain=dataset.get_dmat(), num_boost_round=num_rounds
        )
-        booster = tm.set_ordinal(-1, booster)
+        booster.set_param({"device": "cpu"})
        cpu_leaf = booster.predict(m, pred_leaf=True)

-        booster = tm.set_ordinal(0, booster)
+        booster.set_param({"device": "cuda:0"})
        gpu_leaf = booster.predict(m, pred_leaf=True)

        np.testing.assert_equal(cpu_leaf, gpu_leaf)

    @given(predict_parameter_strategy, tm.make_dataset_strategy())
    @settings(deadline=None, max_examples=20, print_blob=True)
-    def test_predict_leaf_gbtree(self, param, dataset):
+    def test_predict_leaf_gbtree(self, param: dict, dataset: tm.TestDataset) -> None:
        # Unsupported for random forest
        if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
            return

-        param["booster"] = "gbtree"
-        param["tree_method"] = "gpu_hist"
+        param.update({"booster": "gbtree", "tree_method": "hist", "device": "cuda:0"})
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.make_dataset_strategy())
@@ -370,8 +418,7 @@ class TestGPUPredict:
        if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
            return

-        param["booster"] = "dart"
-        param["tree_method"] = "gpu_hist"
+        param.update({"booster": "dart", "tree_method": "hist", "device": "cuda:0"})
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
@@ -395,12 +442,12 @@ class TestGPUPredict:
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)

        params = {
-            "tree_method": "gpu_hist",
+            "tree_method": "hist",
            "max_depth": 3,
            "learning_rate": 1.0,
            "base_score": 0.0,
            "eval_metric": "rmse",
-            "gpu_id": "0",
+            "device": "cuda:0",
        }

        eval_history = {}
@@ -412,7 +459,7 @@ class TestGPUPredict:
            verbose_eval=False,
            evals_result=eval_history,
        )
-        bst = tm.set_ordinal(0, bst)
+        bst.set_param({"device": "cuda:0"})
        pred = bst.predict(dtrain)
        rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
        np.testing.assert_almost_equal(
@@ -434,14 +481,16 @@ class TestGPUPredict:
        Xy = xgb.DMatrix(X, y)
        if n_classes == 2:
            params = {
-                "tree_method": "gpu_hist",
+                "tree_method": "hist",
+                "device": "cuda:0",
                "booster": "dart",
                "rate_drop": 0.5,
                "objective": "binary:logistic",
            }
        else:
            params = {
-                "tree_method": "gpu_hist",
+                "tree_method": "hist",
+                "device": "cuda:0",
                "booster": "dart",
                "rate_drop": 0.5,
                "objective": "multi:softprob",
@@ -455,7 +504,7 @@ class TestGPUPredict:
        copied = booster.predict(Xy)

        # CPU
-        booster = tm.set_ordinal(-1, booster)
+        booster.set_param({"device": "cpu"})
        cpu_inplace = booster.inplace_predict(X_)
        cpu_copied = booster.predict(Xy)

@@ -465,7 +514,7 @@ class TestGPUPredict:
        cp.testing.assert_allclose(inplace, copied, atol=1e-6)

        # GPU
-        booster = tm.set_ordinal(0, booster)
+        booster.set_param({"device": "cuda:0"})
        inplace = booster.inplace_predict(X)
        copied = booster.predict(Xy)

@@ -482,7 +531,7 @@ class TestGPUPredict:
        orig = rng.randint(low=0, high=127, size=rows * cols).reshape(rows, cols)
        y = rng.randint(low=0, high=127, size=rows)
        dtrain = xgb.DMatrix(orig, label=y)
-        booster = xgb.train({"tree_method": "gpu_hist"}, dtrain)
+        booster = xgb.train({"tree_method": "hist", "device": "cuda:0"}, dtrain)

        predt_orig = booster.inplace_predict(orig)
        # all primitive types in numpy
--- a/tests/python/test_predict.py
+++ b/tests/python/test_predict.py
@@ -28,7 +28,7 @@ def run_threaded_predict(X, rows, predict_func):
        assert f.result()


-def run_predict_leaf(gpu_id: int) -> np.ndarray:
+def run_predict_leaf(device: str) -> np.ndarray:
    rows = 100
    cols = 4
    classes = 5
@@ -48,7 +48,7 @@ def run_predict_leaf(gpu_id: int) -> np.ndarray:
        num_boost_round=num_boost_round,
    )

-    booster = tm.set_ordinal(gpu_id, booster)
+    booster.set_param({"device": device})
    empty = xgb.DMatrix(np.ones(shape=(0, cols)))
    empty_leaf = booster.predict(empty, pred_leaf=True)
    assert empty_leaf.shape[0] == 0
@@ -74,14 +74,14 @@ def run_predict_leaf(gpu_id: int) -> np.ndarray:

    # When there's only 1 tree, the output is a 1 dim vector
    booster = xgb.train({"tree_method": "hist"}, num_boost_round=1, dtrain=m)
-    booster = tm.set_ordinal(gpu_id, booster)
+    booster.set_param({"device": device})
    assert booster.predict(m, pred_leaf=True).shape == (rows,)

    return leaf


 def test_predict_leaf() -> None:
-    run_predict_leaf(-1)
+    run_predict_leaf("cpu")


 def test_predict_shape():
--- a/tests/test_distributed/test_with_spark/test_data.py
+++ b/tests/test_distributed/test_with_spark/test_data.py
@@ -69,7 +69,7 @@ def run_dmatrix_ctor(is_feature_cols: bool, is_qdm: bool, on_gpu: bool) -> None:
    train_Xy, valid_Xy = create_dmatrix_from_partitions(
        iter(dfs),
        feature_cols,
-        gpu_id=device_id,
+        dev_ordinal=device_id,
        use_qdm=is_qdm,
        kwargs=kwargs,
        enable_sparse_data_optim=False,
--- a/tests/test_distributed/test_with_spark/test_spark_local.py
+++ b/tests/test_distributed/test_with_spark/test_spark_local.py
@@ -1025,6 +1025,7 @@ class XgboostLocalTest(SparkTestCase):
        self.assertTrue(hasattr(py_reg, "n_estimators"))
        self.assertEqual(py_reg.n_estimators.parent, py_reg.uid)
        self.assertFalse(hasattr(py_reg, "gpu_id"))
+        self.assertFalse(hasattr(py_reg, "device"))
        self.assertEqual(py_reg.getOrDefault(py_reg.n_estimators), 100)
        self.assertEqual(py_reg.getOrDefault(py_reg.objective), "reg:squarederror")
        py_reg2 = SparkXGBRegressor(n_estimators=200)
@@ -1038,6 +1039,7 @@ class XgboostLocalTest(SparkTestCase):
        self.assertTrue(hasattr(py_cls, "n_estimators"))
        self.assertEqual(py_cls.n_estimators.parent, py_cls.uid)
        self.assertFalse(hasattr(py_cls, "gpu_id"))
+        self.assertFalse(hasattr(py_cls, "device"))
        self.assertEqual(py_cls.getOrDefault(py_cls.n_estimators), 100)
        self.assertEqual(py_cls.getOrDefault(py_cls.objective), None)
        py_cls2 = SparkXGBClassifier(n_estimators=200)
@@ -1051,6 +1053,7 @@ class XgboostLocalTest(SparkTestCase):
        self.assertTrue(hasattr(py_cls, "n_estimators"))
        self.assertEqual(py_cls.n_estimators.parent, py_cls.uid)
        self.assertFalse(hasattr(py_cls, "gpu_id"))
+        self.assertFalse(hasattr(py_cls, "device"))
        self.assertTrue(hasattr(py_cls, "arbitrary_params_dict"))
        expected_kwargs = {"sketch_eps": 0.03}
        self.assertEqual(