enable ROCm on latest XGBoost

2023-10-23 11:07:08 -07:00
parent fb19e15ce3 3b86260b50
commit 15421e40d9
328 changed files with 8028 additions and 3642 deletions
--- a/tests/cpp/common/test_hist_util.cc
+++ b/tests/cpp/common/test_hist_util.cc
@@ -147,7 +147,7 @@ TEST(CutsBuilder, SearchGroupInd) {

  EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17));

-  p_mat->Info().Validate(-1);
+  p_mat->Info().Validate(DeviceOrd::CPU());
  EXPECT_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17),
               dmlc::Error);

@@ -330,7 +330,7 @@ TEST(HistUtil, IndexBinData) {
 void TestSketchFromWeights(bool with_group) {
  size_t constexpr kRows = 300, kCols = 20, kBins = 256;
  size_t constexpr kGroups = 10;
-  auto m = RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateDMatrix();
+  auto m = RandomDataGenerator{kRows, kCols, 0}.Device(DeviceOrd::CUDA(0)).GenerateDMatrix();
  Context ctx;
  common::HistogramCuts cuts = SketchOnDMatrix(&ctx, m.get(), kBins);

--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -222,7 +222,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
  ASSERT_EQ(info.feature_types.Size(), n_features);

  HostDeviceVector<bst_row_t> cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3};
-  cuts_ptr.SetDevice(0);
+  cuts_ptr.SetDevice(DeviceOrd::CUDA(0));

  dh::device_vector<float> weight(n_samples * n_features, 0);
  dh::Iota(dh::ToSpan(weight));
@@ -235,7 +235,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
  thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(),
                      detail::EntryCompareOp());

-  detail::RemoveDuplicatedCategories(ctx.gpu_id, info, cuts_ptr.DeviceSpan(), &sorted_entries,
+  detail::RemoveDuplicatedCategories(ctx.Device(), info, cuts_ptr.DeviceSpan(), &sorted_entries,
                                     &weight, &columns_ptr);

  auto const& h_cptr = cuts_ptr.ConstHostVector();
@@ -377,7 +377,8 @@ template <typename Adapter>
 auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) {
  common::HistogramCuts batched_cuts;
  HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), 0);
+  SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(),
+                                   DeviceOrd::CUDA(0));
  MetaInfo info;
  AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size);
  sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit());
@@ -444,7 +445,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) {
  ConsoleLogger::Configure({{"verbosity", "3"}});
  common::HistogramCuts batched_cuts;
  HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0);
+  SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0));
  AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits<float>::quiet_NaN(),
                      &sketch_container);
  HistogramCuts cuts;
@@ -472,7 +473,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
  ConsoleLogger::Configure({{"verbosity", "3"}});
  common::HistogramCuts batched_cuts;
  HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0);
+  SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0));
  AdapterDeviceSketch(adapter.Value(), num_bins, info,
                      std::numeric_limits<float>::quiet_NaN(),
                      &sketch_container);
@@ -507,7 +508,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
  }

  ASSERT_EQ(info.feature_types.Size(), 1);
-  SketchContainer container(info.feature_types, num_bins, 1, n, 0);
+  SketchContainer container(info.feature_types, num_bins, 1, n, DeviceOrd::CUDA(0));
  AdapterDeviceSketch(adapter.Value(), num_bins, info,
                      std::numeric_limits<float>::quiet_NaN(), &container);
  HistogramCuts cuts;
@@ -580,11 +581,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {

 namespace {
 auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
-#if defined(XGBOOST_USE_CUDA)
-  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->gpu_id));
-#endif
+  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
  auto n = n_samples * n_features;
  std::vector<float> x;
  x.resize(n);
@@ -624,21 +621,21 @@ void TestGetColumnSize(std::size_t n_samples) {
  std::vector<std::size_t> h_column_size_1(column_sizes_scan.size());

  detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, true>(
-      ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
+      ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
  thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size.begin());

  detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, false>(
-      ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
+      ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
  thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
  ASSERT_EQ(h_column_size, h_column_size_1);

  detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, true>(
-      ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
+      ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
  thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
  ASSERT_EQ(h_column_size, h_column_size_1);

  detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, false>(
-      ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
+      ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
  thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
  ASSERT_EQ(h_column_size, h_column_size_1);
 }
@@ -715,9 +712,9 @@ void TestAdapterSketchFromWeights(bool with_group) {
  size_t constexpr kRows = 300, kCols = 20, kBins = 256;
  size_t constexpr kGroups = 10;
  HostDeviceVector<float> storage;
-  std::string m =
-      RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface(
-          &storage);
+  std::string m = RandomDataGenerator{kRows, kCols, 0}
+                      .Device(DeviceOrd::CUDA(0))
+                      .GenerateArrayInterface(&storage);
  MetaInfo info;
  Context ctx;
  auto& h_weights = info.weights_.HostVector();
@@ -736,14 +733,14 @@ void TestAdapterSketchFromWeights(bool with_group) {
    info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups);
  }

-  info.weights_.SetDevice(0);
+  info.weights_.SetDevice(DeviceOrd::CUDA(0));
  info.num_row_ = kRows;
  info.num_col_ = kCols;

  data::CupyAdapter adapter(m);
  auto const& batch = adapter.Value();
  HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_container(ft, kBins, kCols, kRows, 0);
+  SketchContainer sketch_container(ft, kBins, kCols, kRows, DeviceOrd::CUDA(0));
  AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
                      &sketch_container);

@@ -787,7 +784,7 @@ void TestAdapterSketchFromWeights(bool with_group) {
      // https://github.com/dmlc/xgboost/issues/7946
      h_weights[i] = (i % 2 == 0 ? 1 : 2) / static_cast<float>(kGroups);
    }
-    SketchContainer sketch_container(ft, kBins, kCols, kRows, 0);
+    SketchContainer sketch_container{ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)};
    AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
                        &sketch_container);
    sketch_container.MakeCuts(&weighted, info.IsColumnSplit());
--- a/tests/cpp/common/test_host_device_vector.cu
+++ b/tests/cpp/common/test_host_device_vector.cu
@@ -1,7 +1,6 @@
-/*!
- * Copyright 2018 XGBoost contributors
+/**
+ * Copyright 2018-2023 XGBoost contributors
 */
-
 #include <gtest/gtest.h>
 #include <thrust/equal.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -13,21 +12,14 @@
 #endif
 #include <xgboost/host_device_vector.h>

-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 namespace {
-void SetDeviceForTest(int device) {
+void SetDeviceForTest(DeviceOrd device) {
  int n_devices;

-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaGetDeviceCount(&n_devices));
-  device %= n_devices;
-  dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipGetDeviceCount(&n_devices));
-  device %= n_devices;
-  dh::safe_cuda(hipSetDevice(device));
-#endif
+  device.ordinal %= n_devices;
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
 }
 }  // namespace

@@ -42,13 +34,13 @@ struct HostDeviceVectorSetDeviceHandler {
  }
 };

-void InitHostDeviceVector(size_t n, int device, HostDeviceVector<int> *v) {
+void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector<int> *v) {
  // create the vector
  v->SetDevice(device);
  v->Resize(n);

  ASSERT_EQ(v->Size(), n);
-  ASSERT_EQ(v->DeviceIdx(), device);
+  ASSERT_EQ(v->Device(), device);
  // ensure that the device have read-write access
  ASSERT_TRUE(v->DeviceCanRead());
  ASSERT_TRUE(v->DeviceCanWrite());
@@ -68,7 +60,7 @@ void InitHostDeviceVector(size_t n, int device, HostDeviceVector<int> *v) {
 }

 void PlusOne(HostDeviceVector<int> *v) {
-  int device = v->DeviceIdx();
+  auto device = v->Device();
  SetDeviceForTest(device);
  thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v),
                    [=]__device__(unsigned int a){ return a + 1; });
@@ -80,7 +72,7 @@ void CheckDevice(HostDeviceVector<int>* v,
                 unsigned int first,
                 GPUAccess access) {
  ASSERT_EQ(v->Size(), size);
-  SetDeviceForTest(v->DeviceIdx());
+  SetDeviceForTest(v->Device());

  ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v),
                            thrust::make_counting_iterator(first)));
@@ -111,7 +103,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
  ASSERT_FALSE(v->DeviceCanWrite());
 }

-void TestHostDeviceVector(size_t n, int device) {
+void TestHostDeviceVector(size_t n, DeviceOrd device) {
  HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
  HostDeviceVector<int> v;
  InitHostDeviceVector(n, device, &v);
@@ -124,13 +116,13 @@ void TestHostDeviceVector(size_t n, int device) {

 TEST(HostDeviceVector, Basic) {
  size_t n = 1001;
-  int device = 0;
+  DeviceOrd device = DeviceOrd::CUDA(0);
  TestHostDeviceVector(n, device);
 }

 TEST(HostDeviceVector, Copy) {
  size_t n = 1001;
-  int device = 0;
+  auto device = DeviceOrd::CUDA(0);
  HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);

  HostDeviceVector<int> v;
@@ -154,15 +146,15 @@ TEST(HostDeviceVector, SetDevice) {
    h_vec[i] = i;
  }
  HostDeviceVector<int> vec (h_vec);
-  auto device = 0;
+  auto device = DeviceOrd::CUDA(0);

  vec.SetDevice(device);
  ASSERT_EQ(vec.Size(), h_vec.size());
  auto span = vec.DeviceSpan();  // sync to device

-  vec.SetDevice(-1);  // pull back to cpu.
+  vec.SetDevice(DeviceOrd::CPU());  // pull back to cpu.
  ASSERT_EQ(vec.Size(), h_vec.size());
-  ASSERT_EQ(vec.DeviceIdx(), -1);
+  ASSERT_EQ(vec.Device(), DeviceOrd::CPU());

  auto h_vec_1 = vec.HostVector();
  ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
@@ -170,7 +162,7 @@ TEST(HostDeviceVector, SetDevice) {

 TEST(HostDeviceVector, Span) {
  HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
-  vec.SetDevice(0);
+  vec.SetDevice(DeviceOrd::CUDA(0));
  auto span = vec.DeviceSpan();
  ASSERT_EQ(vec.Size(), span.size());
  ASSERT_EQ(vec.DevicePointer(), span.data());
@@ -194,5 +186,4 @@ TEST(HostDeviceVector, Empty) {
  ASSERT_FALSE(another.Empty());
  ASSERT_TRUE(vec.Empty());
 }
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
--- a/tests/cpp/common/test_io.cc
+++ b/tests/cpp/common/test_io.cc
@@ -144,7 +144,8 @@ TEST(IO, Resource) {
    fout << 1.0 << std::endl;
    fout.close();

-    auto resource = std::make_shared<MmapResource>(path, 0, sizeof(double));
+    auto resource = std::shared_ptr<MmapResource>{
+      new MmapResource{path, 0, sizeof(double)}};
    ASSERT_EQ(resource->Size(), sizeof(double));
    ASSERT_EQ(resource->Type(), ResourceHandler::kMmap);
    ASSERT_EQ(resource->DataAs<double>()[0], val);
--- a/tests/cpp/common/test_json.cc
+++ b/tests/cpp/common/test_json.cc
@@ -1,13 +1,15 @@
 /**
- * Copyright (c) 2019-2023, XGBoost Contributors
+ * Copyright 2019-2023, XGBoost Contributors
 */
 #include <gtest/gtest.h>

 #include <fstream>
+#include <iterator>  // for back_inserter
 #include <map>

 #include "../../../src/common/charconv.h"
 #include "../../../src/common/io.h"
+#include "../../../src/common/json_utils.h"
 #include "../../../src/common/threading_utils.h"  // for ParallelFor
 #include "../filesystem.h"                        // dmlc::TemporaryDirectory
 #include "../helpers.h"
@@ -691,4 +693,16 @@ TEST(Json, TypeCheck) {
    ASSERT_NE(err.find("foo"), std::string::npos);
  }
 }
+
+TEST(Json, Dump) {
+  auto str = GetModelStr();
+  auto jobj = Json::Load(str);
+  std::string result_s = Json::Dump(jobj);
+
+  std::vector<char> result_v = Json::Dump<std::vector<char>>(jobj);
+  ASSERT_EQ(result_s.size(), result_v.size());
+  for (std::size_t i = 0; i < result_s.size(); ++i) {
+    ASSERT_EQ(result_s[i], result_v[i]);
+  }
+}
 }  // namespace xgboost
--- a/tests/cpp/common/test_linalg.cu
+++ b/tests/cpp/common/test_linalg.cu
@@ -16,7 +16,7 @@ namespace xgboost::linalg {
 namespace {
 void TestElementWiseKernel() {
  auto device = DeviceOrd::CUDA(0);
-  Tensor<float, 3> l{{2, 3, 4}, 0};
+  Tensor<float, 3> l{{2, 3, 4}, device};
  {
    /**
     * Non-contiguous
--- a/tests/cpp/common/test_quantile.cc
+++ b/tests/cpp/common/test_quantile.cc
@@ -9,9 +9,7 @@
 #include "../../../src/data/adapter.h"
 #include "xgboost/context.h"

-namespace xgboost {
-namespace common {
-
+namespace xgboost::common {
 TEST(Quantile, LoadBalance) {
  size_t constexpr kRows = 1000, kCols = 100;
  auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
@@ -314,7 +312,7 @@ void TestSameOnAllWorkers() {
        }

        auto m = RandomDataGenerator{kRows, kCols, 0}
-                     .Device(Context::kCpuId)
+                     .Device(DeviceOrd::CPU())
                     .Type(ft)
                     .MaxCategory(17)
                     .Seed(rank + seed)
@@ -373,6 +371,4 @@ TEST(Quantile, SameOnAllWorkers) {
  auto constexpr kWorkers = 4;
  RunWithInMemoryCommunicator(kWorkers, TestSameOnAllWorkers);
 }
-
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
--- a/tests/cpp/common/test_quantile.cu
+++ b/tests/cpp/common/test_quantile.cu
@@ -32,7 +32,7 @@ class MGPUQuantileTest : public BaseMGPUTest {};
 TEST(GPUQuantile, Basic) {
  constexpr size_t kRows = 1000, kCols = 100, kBins = 256;
  HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch(ft, kBins, kCols, kRows, 0);
+  SketchContainer sketch(ft, kBins, kCols, kRows, FstCU());
  dh::caching_device_vector<Entry> entries;
  dh::device_vector<bst_row_t> cuts_ptr(kCols+1);
  thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0);
@@ -45,12 +45,12 @@ void TestSketchUnique(float sparsity) {
  constexpr size_t kRows = 1000, kCols = 100;
  RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](int32_t seed, size_t n_bins, MetaInfo const& info) {
    HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
+    SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());

    HostDeviceVector<float> storage;
    std::string interface_str = RandomDataGenerator{kRows, kCols, sparsity}
                                    .Seed(seed)
-                                    .Device(0)
+                                    .Device(FstCU())
                                    .GenerateArrayInterface(&storage);
    data::CupyAdapter adapter(interface_str);
    AdapterDeviceSketch(adapter.Value(), n_bins, info,
@@ -65,7 +65,7 @@ void TestSketchUnique(float sparsity) {
        thrust::make_counting_iterator(0llu),
        [=] __device__(size_t idx) { return batch.GetElement(idx); });
    auto end = kCols * kRows;
-    detail::GetColumnSizesScan(0, kCols, n_cuts, IterSpan{batch_iter, end}, is_valid,
+    detail::GetColumnSizesScan(FstCU(), kCols, n_cuts, IterSpan{batch_iter, end}, is_valid,
                               &cut_sizes_scan, &column_sizes_scan);
    auto const& cut_sizes = cut_sizes_scan.HostVector();
    ASSERT_LE(sketch.Data().size(), cut_sizes.back());
@@ -93,13 +93,9 @@ TEST(GPUQuantile, Unique) {
 }

 // if with_error is true, the test tolerates floating point error
-void TestQuantileElemRank(int32_t device, Span<SketchEntry const> in,
+void TestQuantileElemRank(DeviceOrd device, Span<SketchEntry const> in,
                          Span<bst_row_t const> d_columns_ptr, bool with_error = false) {
-#if defined(XGBOOST_USE_CUDA)
-  dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
  std::vector<SketchEntry> h_in(in.size());
  dh::CopyDeviceSpanToVector(&h_in, in);
  std::vector<bst_row_t> h_columns_ptr(d_columns_ptr.size());
@@ -134,13 +130,12 @@ TEST(GPUQuantile, Prune) {
  constexpr size_t kRows = 1000, kCols = 100;
  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
    HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
+    SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());

    HostDeviceVector<float> storage;
-    std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                    .Device(0)
-                                    .Seed(seed)
-                                    .GenerateArrayInterface(&storage);
+    std::string interface_str =
+        RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
+            &storage);
    data::CupyAdapter adapter(interface_str);
    AdapterDeviceSketch(adapter.Value(), n_bins, info,
                        std::numeric_limits<float>::quiet_NaN(), &sketch);
@@ -156,7 +151,7 @@ TEST(GPUQuantile, Prune) {
    ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(),
                                  sketch.Data().data() + sketch.Data().size(),
                                  detail::SketchUnique{}));
-    TestQuantileElemRank(0, sketch.Data(), sketch.ColumnsPtr());
+    TestQuantileElemRank(FstCU(), sketch.Data(), sketch.ColumnsPtr());
  });
 }

@@ -164,10 +159,10 @@ TEST(GPUQuantile, MergeEmpty) {
  constexpr size_t kRows = 1000, kCols = 100;
  size_t n_bins = 10;
  HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0);
+  SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU());
  HostDeviceVector<float> storage_0;
  std::string interface_str_0 =
-      RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface(
+      RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).GenerateArrayInterface(
          &storage_0);
  data::CupyAdapter adapter_0(interface_str_0);
  MetaInfo info;
@@ -204,34 +199,33 @@ TEST(GPUQuantile, MergeBasic) {
  constexpr size_t kRows = 1000, kCols = 100;
  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) {
    HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0);
+    SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU());
    HostDeviceVector<float> storage_0;
    std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0}
-                                      .Device(0)
+                                  .Device(FstCU())
                                      .Seed(seed)
                                      .GenerateArrayInterface(&storage_0);
    data::CupyAdapter adapter_0(interface_str_0);
    AdapterDeviceSketch(adapter_0.Value(), n_bins, info,
                        std::numeric_limits<float>::quiet_NaN(), &sketch_0);

-    SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, 0);
+    SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, FstCU());
    HostDeviceVector<float> storage_1;
-    std::string interface_str_1 = RandomDataGenerator{kRows, kCols, 0}
-                                      .Device(0)
-                                      .Seed(seed)
-                                      .GenerateArrayInterface(&storage_1);
+    std::string interface_str_1 =
+        RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
+            &storage_1);
    data::CupyAdapter adapter_1(interface_str_1);
-    AdapterDeviceSketch(adapter_1.Value(), n_bins, info,
-                        std::numeric_limits<float>::quiet_NaN(), &sketch_1);
+    AdapterDeviceSketch(adapter_1.Value(), n_bins, info, std::numeric_limits<float>::quiet_NaN(),
+                        &sketch_1);

    size_t size_before_merge = sketch_0.Data().size();
    sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
    if (info.weights_.Size() != 0) {
-      TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), true);
+      TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), true);
      sketch_0.FixError();
-      TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), false);
+      TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), false);
    } else {
-      TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr());
+      TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr());
    }

    auto columns_ptr = sketch_0.ColumnsPtr();
@@ -251,24 +245,22 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
  MetaInfo info;
  int32_t seed = 0;
  HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_0(ft, n_bins, cols, rows, 0);
+  SketchContainer sketch_0(ft, n_bins, cols, rows, FstCU());
  HostDeviceVector<float> storage_0;
-  std::string interface_str_0 = RandomDataGenerator{rows, cols, 0}
-                                    .Device(0)
-                                    .Seed(seed)
-                                    .GenerateArrayInterface(&storage_0);
+  std::string interface_str_0 =
+      RandomDataGenerator{rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
+          &storage_0);
  data::CupyAdapter adapter_0(interface_str_0);
  AdapterDeviceSketch(adapter_0.Value(), n_bins, info,
                      std::numeric_limits<float>::quiet_NaN(),
                      &sketch_0);

  size_t f_rows = rows * frac;
-  SketchContainer sketch_1(ft, n_bins, cols, f_rows, 0);
+  SketchContainer sketch_1(ft, n_bins, cols, f_rows, FstCU());
  HostDeviceVector<float> storage_1;
-  std::string interface_str_1 = RandomDataGenerator{f_rows, cols, 0}
-                                    .Device(0)
-                                    .Seed(seed)
-                                    .GenerateArrayInterface(&storage_1);
+  std::string interface_str_1 =
+      RandomDataGenerator{f_rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
+          &storage_1);
  auto data_1 = storage_1.DeviceSpan();
  auto tuple_it = thrust::make_tuple(
      thrust::make_counting_iterator<size_t>(0ul), data_1.data());
@@ -290,7 +282,7 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {

  size_t size_before_merge = sketch_0.Data().size();
  sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
-  TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr());
+  TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr());

  auto columns_ptr = sketch_0.ColumnsPtr();
  std::vector<bst_row_t> h_columns_ptr(columns_ptr.size());
@@ -321,11 +313,10 @@ TEST(GPUQuantile, MergeDuplicated) {
 TEST(GPUQuantile, MultiMerge) {
  constexpr size_t kRows = 20, kCols = 1;
  int32_t world = 2;
-  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
-                                 MetaInfo const &info) {
+  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
    // Set up single node version
    HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0);
+    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, FstCU());

    size_t intermediate_num_cuts = std::min(
        kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
@@ -333,12 +324,12 @@ TEST(GPUQuantile, MultiMerge) {
    for (auto rank = 0; rank < world; ++rank) {
      HostDeviceVector<float> storage;
      std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                      .Device(0)
+                                      .Device(FstCU())
                                      .Seed(rank + seed)
                                      .GenerateArrayInterface(&storage);
      data::CupyAdapter adapter(interface_str);
      HostDeviceVector<FeatureType> ft;
-      containers.emplace_back(ft, n_bins, kCols, kRows, 0);
+      containers.emplace_back(ft, n_bins, kCols, kRows, FstCU());
      AdapterDeviceSketch(adapter.Value(), n_bins, info,
                          std::numeric_limits<float>::quiet_NaN(),
                          &containers.back());
@@ -348,21 +339,44 @@ TEST(GPUQuantile, MultiMerge) {
      sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data());
      sketch_on_single_node.FixError();
    }
-    TestQuantileElemRank(0, sketch_on_single_node.Data(),
-                         sketch_on_single_node.ColumnsPtr());
+    TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr());

    sketch_on_single_node.Unique();
-    TestQuantileElemRank(0, sketch_on_single_node.Data(),
-                         sketch_on_single_node.ColumnsPtr());
+    TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr());
  });
 }

+TEST(GPUQuantile, MissingColumns) {
+  auto dmat = std::unique_ptr<DMatrix>{[=]() {
+    std::size_t constexpr kRows = 1000, kCols = 100;
+    auto sparsity = 0.5f;
+    std::vector<FeatureType> ft(kCols);
+    for (size_t i = 0; i < ft.size(); ++i) {
+      ft[i] = (i % 2 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical;
+    }
+    auto dmat = RandomDataGenerator{kRows, kCols, sparsity}
+                    .Seed(0)
+                    .Lower(.0f)
+                    .Upper(1.0f)
+                    .Type(ft)
+                    .MaxCategory(13)
+                    .GenerateDMatrix();
+    return dmat->SliceCol(2, 1);
+  }()};
+  dmat->Info().data_split_mode = DataSplitMode::kRow;
+
+  auto ctx = MakeCUDACtx(0);
+  std::size_t constexpr kBins = 64;
+  HistogramCuts cuts = common::DeviceSketch(&ctx, dmat.get(), kBins);
+  ASSERT_TRUE(cuts.HasCategorical());
+}
+
 namespace {
 void TestAllReduceBasic() {
  auto const world = collective::GetWorldSize();
  constexpr size_t kRows = 1000, kCols = 100;
  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
-    auto const device = GPUIDX;
+    auto const device = DeviceOrd::CUDA(GPUIDX);

    // Set up single node version;
    HostDeviceVector<FeatureType> ft({}, device);
@@ -440,18 +454,14 @@ TEST_F(MGPUQuantileTest, AllReduceBasic) {
 }

 namespace {
-void TestColumnSplitBasic() {
+void TestColumnSplit(DMatrix* dmat) {
  auto const world = collective::GetWorldSize();
  auto const rank = collective::GetRank();
-  std::size_t constexpr kRows = 1000, kCols = 100, kBins = 64;
-
-  auto m = std::unique_ptr<DMatrix>{[=]() {
-    auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
-    return dmat->SliceCol(world, rank);
-  }()};
+  auto m = std::unique_ptr<DMatrix>{dmat->SliceCol(world, rank)};

  // Generate cuts for distributed environment.
  auto ctx = MakeCUDACtx(GPUIDX);
+  std::size_t constexpr kBins = 64;
  HistogramCuts distributed_cuts = common::DeviceSketch(&ctx, m.get(), kBins);

  // Generate cuts for single node environment
@@ -484,7 +494,26 @@ void TestColumnSplitBasic() {
 }  // anonymous namespace

 TEST_F(MGPUQuantileTest, ColumnSplitBasic) {
-  DoTest(TestColumnSplitBasic);
+  std::size_t constexpr kRows = 1000, kCols = 100;
+  auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
+  DoTest(TestColumnSplit, dmat.get());
+}
+
+TEST_F(MGPUQuantileTest, ColumnSplitCategorical) {
+  std::size_t constexpr kRows = 1000, kCols = 100;
+  auto sparsity = 0.5f;
+  std::vector<FeatureType> ft(kCols);
+  for (size_t i = 0; i < ft.size(); ++i) {
+    ft[i] = (i % 2 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical;
+  }
+  auto dmat = RandomDataGenerator{kRows, kCols, sparsity}
+                  .Seed(0)
+                  .Lower(.0f)
+                  .Upper(1.0f)
+                  .Type(ft)
+                  .MaxCategory(13)
+                  .GenerateDMatrix();
+  DoTest(TestColumnSplit, dmat.get());
 }

 namespace {
@@ -494,7 +523,7 @@ void TestSameOnAllWorkers() {
  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
                                 MetaInfo const &info) {
    auto const rank = collective::GetRank();
-    auto const device = GPUIDX;
+    auto const device = DeviceOrd::CUDA(GPUIDX);
    HostDeviceVector<FeatureType> ft({}, device);
    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
    HostDeviceVector<float> storage({}, device);
@@ -525,9 +554,9 @@ void TestSameOnAllWorkers() {
    thrust::copy(thrust::device, local_data.data(),
                 local_data.data() + local_data.size(),
                 all_workers.begin() + local_data.size() * rank);
-    collective::AllReduce<collective::Operation::kSum>(device, all_workers.data().get(),
+    collective::AllReduce<collective::Operation::kSum>(device.ordinal, all_workers.data().get(),
                                                       all_workers.size());
-    collective::Synchronize(device);
+    collective::Synchronize(device.ordinal);

    auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float);
    std::vector<float> h_base_line(base_line.size());
@@ -573,7 +602,7 @@ TEST(GPUQuantile, Push) {
  columns_ptr[1] = kRows;

  HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
+  SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
  sketch.Push(dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows, {});

  auto sketch_data = sketch.Data();
@@ -613,7 +642,7 @@ TEST(GPUQuantile, MultiColPush) {

  int32_t n_bins = 16;
  HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
+  SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
  dh::device_vector<Entry> d_entries {entries};

  dh::device_vector<size_t> columns_ptr(kCols + 1, 0);
--- a/tests/cpp/common/test_ranking_utils.cc
+++ b/tests/cpp/common/test_ranking_utils.cc
@@ -95,7 +95,7 @@ void TestRankingCache(Context const* ctx) {
  HostDeviceVector<float> predt(info.num_row_, 0);
  auto& h_predt = predt.HostVector();
  std::iota(h_predt.begin(), h_predt.end(), 0.0f);
-  predt.SetDevice(ctx->gpu_id);
+  predt.SetDevice(ctx->Device());

  auto rank_idx =
      cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
@@ -129,7 +129,7 @@ void TestNDCGCache(Context const* ctx) {
    auto fail = [&]() { NDCGCache cache{ctx, info, param}; };
    // empty label
    ASSERT_THROW(fail(), dmlc::Error);
-    info.labels = linalg::Matrix<float>{{0.0f, 0.1f, 0.2f}, {3}, Context::kCpuId};
+    info.labels = linalg::Matrix<float>{{0.0f, 0.1f, 0.2f}, {3}, DeviceOrd::CPU()};
    // invalid label
    ASSERT_THROW(fail(), dmlc::Error);
    auto h_labels = info.labels.HostView();
--- a/tests/cpp/common/test_ranking_utils.cu
+++ b/tests/cpp/common/test_ranking_utils.cu
@@ -42,7 +42,7 @@ void TestCalcQueriesInvIDCG() {
  auto d_scores = dh::ToSpan(scores);
  common::SegmentedSequence(&ctx, d_group_ptr, d_scores);

-  linalg::Vector<double> inv_IDCG({n_groups}, ctx.gpu_id);
+  linalg::Vector<double> inv_IDCG({n_groups}, ctx.Device());

  ltr::LambdaRankParam p;
  p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}});
@@ -77,7 +77,7 @@ void TestRankingCache(Context const* ctx) {
  HostDeviceVector<float> predt(info.num_row_, 0);
  auto& h_predt = predt.HostVector();
  std::iota(h_predt.begin(), h_predt.end(), 0.0f);
-  predt.SetDevice(ctx->gpu_id);
+  predt.SetDevice(ctx->Device());

  auto rank_idx =
      cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
--- a/tests/cpp/common/test_stats.cc
+++ b/tests/cpp/common/test_stats.cc
@@ -9,12 +9,11 @@
 #include "../../../src/common/transform_iterator.h"  // common::MakeIndexTransformIter
 #include "../helpers.h"

-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 TEST(Stats, Quantile) {
  Context ctx;
  {
-    linalg::Tensor<float, 1> arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, Context::kCpuId);
+    linalg::Tensor<float, 1> arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, DeviceOrd::CPU());
    std::vector<size_t> index{0, 2, 3, 4, 6};
    auto h_arr = arr.HostView();
    auto beg = MakeIndexTransformIter([&](size_t i) { return h_arr(index[i]); });
@@ -40,8 +39,8 @@ TEST(Stats, Quantile) {

 TEST(Stats, WeightedQuantile) {
  Context ctx;
-  linalg::Tensor<float, 1> arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, Context::kCpuId);
-  linalg::Tensor<float, 1> weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, Context::kCpuId);
+  linalg::Tensor<float, 1> arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, DeviceOrd::CPU());
+  linalg::Tensor<float, 1> weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, DeviceOrd::CPU());

  auto h_arr = arr.HostView();
  auto h_weight = weight.HostView();
@@ -64,7 +63,7 @@ TEST(Stats, Median) {
  Context ctx;

  {
-    linalg::Tensor<float, 2> values{{.0f, .0f, 1.f, 2.f}, {4}, Context::kCpuId};
+    linalg::Tensor<float, 2> values{{.0f, .0f, 1.f, 2.f}, {4}, DeviceOrd::CPU()};
    HostDeviceVector<float> weights;
    linalg::Tensor<float, 1> out;
    Median(&ctx, values, weights, &out);
@@ -83,7 +82,7 @@ TEST(Stats, Median) {
  {
    ctx = ctx.MakeCPU();
    // 4x2 matrix
-    linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.gpu_id};
+    linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.Device()};
    HostDeviceVector<float> weights;
    linalg::Tensor<float, 1> out;
    Median(&ctx, values, weights, &out);
@@ -102,14 +101,14 @@ TEST(Stats, Median) {
 namespace {
 void TestMean(Context const* ctx) {
  std::size_t n{128};
-  linalg::Vector<float> data({n}, ctx->gpu_id);
+  linalg::Vector<float> data({n}, ctx->Device());
  auto h_v = data.HostView().Values();
  std::iota(h_v.begin(), h_v.end(), .0f);

  auto nf = static_cast<float>(n);
  float mean = nf * (nf - 1) / 2 / n;

-  linalg::Vector<float> res{{1}, ctx->gpu_id};
+  linalg::Vector<float> res{{1}, ctx->Device()};
  Mean(ctx, data, &res);
  auto h_res = res.HostView();
  ASSERT_EQ(h_res.Size(), 1);
@@ -127,6 +126,5 @@ TEST(Stats, GPUMean) {
  auto ctx = MakeCUDACtx(0);
  TestMean(&ctx);
 }
-#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-}  // namespace common
-}  // namespace xgboost
+#endif  // defined(XGBOOST_USE_CUDA)
+}  // namespace xgboost::common
--- a/tests/cpp/common/test_stats.cu
+++ b/tests/cpp/common/test_stats.cu
@@ -25,8 +25,8 @@ namespace common {
 namespace {
 class StatsGPU : public ::testing::Test {
 private:
-  linalg::Tensor<float, 1> arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, 0};
-  linalg::Tensor<std::size_t, 1> indptr_{{0, 5, 10}, {3}, 0};
+  linalg::Tensor<float, 1> arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, FstCU()};
+  linalg::Tensor<std::size_t, 1> indptr_{{0, 5, 10}, {3}, FstCU()};
  HostDeviceVector<float> results_;
  using TestSet = std::vector<std::pair<float, float>>;
  Context ctx_;
@@ -51,7 +51,7 @@ class StatsGPU : public ::testing::Test {
    data.insert(data.cend(), seg.begin(), seg.end());
    data.insert(data.cend(), seg.begin(), seg.end());
    data.insert(data.cend(), seg.begin(), seg.end());
-    linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0};
+    linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, FstCU()};
    auto d_arr = arr.View(DeviceOrd::CUDA(0));

    auto key_it = dh::MakeTransformIterator<std::size_t>(
@@ -63,7 +63,7 @@ class StatsGPU : public ::testing::Test {

    // one alpha for each segment
    HostDeviceVector<float> alphas{0.0f, 0.5f, 1.0f};
-    alphas.SetDevice(0);
+    alphas.SetDevice(FstCU());
    auto d_alphas = alphas.ConstDeviceSpan();
    auto w_it = thrust::make_constant_iterator(0.1f);
    SegmentedWeightedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it,
@@ -85,7 +85,7 @@ class StatsGPU : public ::testing::Test {
    auto val_it =
        dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                         [=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); });
-    linalg::Tensor<float, 1> weights{{10}, 0};
+    linalg::Tensor<float, 1> weights{{10}, FstCU()};
    linalg::ElementWiseTransformDevice(weights.View(DeviceOrd::CUDA(0)),
                                       [=] XGBOOST_DEVICE(std::size_t, float) { return 1.0; });
    auto w_it = weights.Data()->ConstDevicePointer();
@@ -106,7 +106,7 @@ class StatsGPU : public ::testing::Test {
    data.insert(data.cend(), seg.begin(), seg.end());
    data.insert(data.cend(), seg.begin(), seg.end());
    data.insert(data.cend(), seg.begin(), seg.end());
-    linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0};
+    linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, FstCU()};
    auto d_arr = arr.View(DeviceOrd::CUDA(0));

    auto key_it = dh::MakeTransformIterator<std::size_t>(
@@ -118,7 +118,7 @@ class StatsGPU : public ::testing::Test {

    // one alpha for each segment
    HostDeviceVector<float> alphas{0.1f, 0.2f, 0.4f};
-    alphas.SetDevice(0);
+    alphas.SetDevice(FstCU());
    auto d_alphas = alphas.ConstDeviceSpan();
    SegmentedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it,
                      val_it + d_arr.Size(), &results_);
--- a/tests/cpp/common/test_transform_range.cc
+++ b/tests/cpp/common/test_transform_range.cc
@@ -11,63 +11,59 @@
 #include "../../../src/common/transform.h"
 #include "../helpers.h"

+namespace xgboost::common {
+namespace {
+constexpr DeviceOrd TransformDevice() {
 #if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
-
-#define TRANSFORM_GPU 0
-
+  return DeviceOrd::CUDA(0);
 #else
-
-#define TRANSFORM_GPU -1
-
+  return DeviceOrd::CPU();
 #endif
-
-namespace xgboost {
-namespace common {
+}
+}  // namespace

 template <typename T>
 struct TestTransformRange {
-  void XGBOOST_DEVICE operator()(size_t _idx,
-                                 Span<bst_float> _out, Span<const bst_float> _in) {
+  void XGBOOST_DEVICE operator()(std::size_t _idx, Span<float> _out, Span<const float> _in) {
    _out[_idx] = _in[_idx];
  }
 };

 TEST(Transform, DeclareUnifiedTest(Basic)) {
-  const size_t size {256};
-  std::vector<bst_float> h_in(size);
-  std::vector<bst_float> h_out(size);
+  const size_t size{256};
+  std::vector<float> h_in(size);
+  std::vector<float> h_out(size);
  std::iota(h_in.begin(), h_in.end(), 0);
-  std::vector<bst_float> h_sol(size);
+  std::vector<float> h_sol(size);
  std::iota(h_sol.begin(), h_sol.end(), 0);

-  const HostDeviceVector<bst_float> in_vec{h_in, TRANSFORM_GPU};
-  HostDeviceVector<bst_float> out_vec{h_out, TRANSFORM_GPU};
+  auto device = TransformDevice();
+  HostDeviceVector<float> const in_vec{h_in, device};
+  HostDeviceVector<float> out_vec{h_out, device};
  out_vec.Fill(0);

-  Transform<>::Init(TestTransformRange<bst_float>{},
+  Transform<>::Init(TestTransformRange<float>{},
                    Range{0, static_cast<Range::DifferenceType>(size)}, AllThreadsForTest(),
-                    TRANSFORM_GPU)
+                    TransformDevice())
      .Eval(&out_vec, &in_vec);
-  std::vector<bst_float> res = out_vec.HostVector();
+  std::vector<float> res = out_vec.HostVector();

  ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin()));
 }

 #if !defined(__CUDACC__) && !defined(__HIP_PLATFORM_AMD__)
 TEST(TransformDeathTest, Exception) {
-  size_t const kSize {16};
-  std::vector<bst_float> h_in(kSize);
-  const HostDeviceVector<bst_float> in_vec{h_in, -1};
+  size_t const kSize{16};
+  std::vector<float> h_in(kSize);
+  const HostDeviceVector<float> in_vec{h_in, DeviceOrd::CPU()};
  EXPECT_DEATH(
      {
        Transform<>::Init([](size_t idx, common::Span<float const> _in) { _in[idx + 1]; },
                          Range(0, static_cast<Range::DifferenceType>(kSize)), AllThreadsForTest(),
-                          -1)
+                          DeviceOrd::CPU())
            .Eval(&in_vec);
      },
      "");
 }
 #endif
-
-} // namespace common
-} // namespace xgboost
+}  // namespace xgboost::common
--- a/tests/cpp/common/test_transform_range.cu
+++ b/tests/cpp/common/test_transform_range.cu
@@ -0,0 +1,5 @@
+/**
+ * Copyright 2023 XGBoost contributors
+ */
+// Dummy file to keep the CUDA tests.
+#include "test_transform_range.cc"