From 0f35493b650070dbce7117261da60ff2f317df59 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Wed, 6 Sep 2023 02:03:59 -0700
Subject: [PATCH 01/59] Add GPU support to NVFlare demo (#9552)

---
 demo/nvflare/horizontal/README.md         | 6 +++---
 demo/nvflare/horizontal/custom/trainer.py | 2 +-
 demo/nvflare/vertical/README.md           | 7 ++++++-
 demo/nvflare/vertical/custom/trainer.py   | 6 ++++--
 4 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/demo/nvflare/horizontal/README.md b/demo/nvflare/horizontal/README.md
index 19ac4cf4e..7337f1720 100644
--- a/demo/nvflare/horizontal/README.md
+++ b/demo/nvflare/horizontal/README.md
@@ -85,8 +85,8 @@ shutdown server
 ## Training with GPUs
 
 To demo with Federated Learning using GPUs, make sure your machine has at least 2 GPUs.
-Build XGBoost with the federated learning plugin enabled along with CUDA, but with NCCL
-turned off (see the [README](../../plugin/federated/README.md)).
+Build XGBoost with the federated learning plugin enabled along with CUDA
+(see the [README](../../plugin/federated/README.md)).
 
-Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
+Modify `../config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
 above.
diff --git a/demo/nvflare/horizontal/custom/trainer.py b/demo/nvflare/horizontal/custom/trainer.py
index b1ec94211..4f20b2f39 100644
--- a/demo/nvflare/horizontal/custom/trainer.py
+++ b/demo/nvflare/horizontal/custom/trainer.py
@@ -67,7 +67,7 @@ class XGBoostTrainer(Executor):
             dtest = xgb.DMatrix('agaricus.txt.test?format=libsvm')
 
             # Specify parameters via map, definition are same as c++ version
-            param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
+            param = {'tree_method': 'hist', 'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
             if self._use_gpus:
                 self.log_info(fl_ctx, f'Training with GPU {rank}')
                 param['device'] = f"cuda:{rank}"
diff --git a/demo/nvflare/vertical/README.md b/demo/nvflare/vertical/README.md
index f9cca57d9..d63b2bca4 100644
--- a/demo/nvflare/vertical/README.md
+++ b/demo/nvflare/vertical/README.md
@@ -56,4 +56,9 @@ shutdown server
 
 ## Training with GPUs
 
-Currently GPUs are not yet supported by vertical federated XGBoost.
+To demo with Vertical Federated Learning using GPUs, make sure your machine has at least 2 GPUs.
+Build XGBoost with the federated learning plugin enabled along with CUDA
+(see the [README](../../plugin/federated/README.md)).
+
+Modify `../config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
+above.
diff --git a/demo/nvflare/vertical/custom/trainer.py b/demo/nvflare/vertical/custom/trainer.py
index 1c235a439..efe320734 100644
--- a/demo/nvflare/vertical/custom/trainer.py
+++ b/demo/nvflare/vertical/custom/trainer.py
@@ -77,13 +77,15 @@ class XGBoostTrainer(Executor):
                 'gamma': 1.0,
                 'max_depth': 8,
                 'min_child_weight': 100,
-                'tree_method': 'approx',
+                'tree_method': 'hist',
                 'grow_policy': 'depthwise',
                 'objective': 'binary:logistic',
                 'eval_metric': 'auc',
             }
             if self._use_gpus:
-                self.log_info(fl_ctx, 'GPUs are not currently supported by vertical federated XGBoost')
+                # Train on the CUDA device whose ordinal equals this worker's rank.
+                self.log_info(fl_ctx, f'Training with GPU {rank}')
+                param['device'] = f"cuda:{rank}"
 
             # specify validations set to watch performance
             watchlist = [(dtest, "eval"), (dtrain, "train")]

From 6c791b5b478e1b005aa9d954d20239f312b7f068 Mon Sep 17 00:00:00 2001
From: Bobby Wang <wbo4958@gmail.com>
Date: Thu, 7 Sep 2023 12:15:50 +0800
Subject: [PATCH 02/59] [pyspark] support gpu transform (#9542)

---------

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
---
 python-package/xgboost/compat.py              | 12 +++
 python-package/xgboost/spark/core.py          | 96 ++++++++++++++++++-
 python-package/xgboost/spark/utils.py         |  4 +-
 .../test_gpu_with_spark/test_gpu_spark.py     | 33 ++++++-
 .../test_with_spark/test_spark_local.py       | 28 ++++++
 5 files changed, 166 insertions(+), 7 deletions(-)

diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py
index a01eeef09..c40dea5fd 100644
--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@@ -88,6 +88,18 @@ def is_cudf_available() -> bool:
         return False
 
 
+def is_cupy_available() -> bool:
+    """Check cupy package available or not"""
+    if importlib.util.find_spec("cupy") is None:
+        return False
+    try:
+        import cupy
+
+        return True
+    except ImportError:
+        return False
+
+
 try:
     import scipy.sparse as scipy_sparse
     from scipy.sparse import csr_matrix as scipy_csr
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index d6667ad89..6b1d2faaa 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -59,7 +59,7 @@ from scipy.special import expit, softmax  # pylint: disable=no-name-in-module
 
 import xgboost
 from xgboost import XGBClassifier
-from xgboost.compat import is_cudf_available
+from xgboost.compat import is_cudf_available, is_cupy_available
 from xgboost.core import Booster, _check_distributed_params
 from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel, _can_use_qdm
 from xgboost.training import train as worker_train
@@ -242,6 +242,13 @@ class _SparkXGBParams(
         TypeConverters.toList,
     )
 
+    def set_device(self, value: str) -> "_SparkXGBParams":
+        """Set device, optional value: cpu, cuda, gpu"""
+        _check_distributed_params({"device": value})
+        assert value in ("cpu", "cuda", "gpu")
+        self.set(self.device, value)
+        return self
+
     @classmethod
     def _xgb_cls(cls) -> Type[XGBModel]:
         """
@@ -1193,6 +1200,31 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
             dataset = dataset.drop(pred_struct_col)
         return dataset
 
+    def _gpu_transform(self) -> bool:
+        """Return True if the prediction (transform) should run on the GPU."""
+
+        if _is_local(_get_spark_session().sparkContext):
+            # In local mode, just follow the model's "device" parameter.
+            return use_cuda(self.getOrDefault(self.device))
+
+        gpu_per_task = (
+            _get_spark_session()
+            .sparkContext.getConf()
+            .get("spark.task.resource.gpu.amount")
+        )
+
+        # No per-task GPU amount is configured: warn if needed and use the CPU.
+        if gpu_per_task is None:
+            if use_cuda(self.getOrDefault(self.device)):
+                get_logger("XGBoost-PySpark").warning(
+                    "Do the prediction on the CPUs since "
+                    "no gpu configurations are set"
+                )
+            return False
+
+        # The user configured GPUs for tasks, so follow the "device" parameter.
+        return use_cuda(self.getOrDefault(self.device))
+
     def _transform(self, dataset: DataFrame) -> DataFrame:
         # pylint: disable=too-many-statements, too-many-locals
         # Save xgb_sklearn_model and predict_params to be local variable
@@ -1216,21 +1248,77 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
 
         _, schema = self._out_schema()
 
+        is_local = _is_local(_get_spark_session().sparkContext)
+        run_on_gpu = self._gpu_transform()
+
         @pandas_udf(schema)  # type: ignore
         def predict_udf(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.Series]:
             assert xgb_sklearn_model is not None
             model = xgb_sklearn_model
+
+            from pyspark import TaskContext
+
+            context = TaskContext.get()
+            assert context is not None
+
+            dev_ordinal = -1
+
+            if is_cudf_available():
+                if is_local:
+                    if run_on_gpu and is_cupy_available():
+                        import cupy as cp  # pylint: disable=import-error
+
+                        total_gpus = cp.cuda.runtime.getDeviceCount()
+                        if total_gpus > 0:
+                            partition_id = context.partitionId()
+                            # For transform local mode, default the dev_ordinal to
+                            # (partition id) % gpus.
+                            dev_ordinal = partition_id % total_gpus
+                elif run_on_gpu:
+                    dev_ordinal = _get_gpu_id(context)
+
+                if dev_ordinal >= 0:
+                    device = "cuda:" + str(dev_ordinal)
+                    get_logger("XGBoost-PySpark").info(
+                        "Do the inference with device: %s", device
+                    )
+                    model.set_params(device=device)
+                else:
+                    get_logger("XGBoost-PySpark").info("Do the inference on the CPUs")
+            else:
+                msg = (
+                    "CUDF is unavailable, fallback the inference on the CPUs"
+                    if run_on_gpu
+                    else "Do the inference on the CPUs"
+                )
+                get_logger("XGBoost-PySpark").info(msg)
+
+            def to_gpu_if_possible(data: ArrayLike) -> ArrayLike:
+                """Move the data to gpu if possible"""
+                if dev_ordinal >= 0:
+                    import cudf  # pylint: disable=import-error
+                    import cupy as cp  # pylint: disable=import-error
+
+                    # We must set the device after import cudf, which will change the device id to 0
+                    # See https://github.com/rapidsai/cudf/issues/11386
+                    cp.cuda.runtime.setDevice(dev_ordinal)  # pylint: disable=I1101
+                    df = cudf.DataFrame(data)
+                    del data
+                    return df
+                return data
+
             for data in iterator:
                 if enable_sparse_data_optim:
                     X = _read_csr_matrix_from_unwrapped_spark_vec(data)
                 else:
                     if feature_col_names is not None:
-                        X = data[feature_col_names]
+                        tmp = data[feature_col_names]
                     else:
-                        X = stack_series(data[alias.data])
+                        tmp = stack_series(data[alias.data])
+                    X = to_gpu_if_possible(tmp)
 
                 if has_base_margin:
-                    base_margin = data[alias.margin].to_numpy()
+                    base_margin = to_gpu_if_possible(data[alias.margin])
                 else:
                     base_margin = None
 
diff --git a/python-package/xgboost/spark/utils.py b/python-package/xgboost/spark/utils.py
index 33a45a90e..66d7ca454 100644
--- a/python-package/xgboost/spark/utils.py
+++ b/python-package/xgboost/spark/utils.py
@@ -10,7 +10,7 @@ from threading import Thread
 from typing import Any, Callable, Dict, Optional, Set, Type
 
 import pyspark
-from pyspark import BarrierTaskContext, SparkContext, SparkFiles
+from pyspark import BarrierTaskContext, SparkContext, SparkFiles, TaskContext
 from pyspark.sql.session import SparkSession
 
 from xgboost import Booster, XGBModel, collective
@@ -129,7 +129,7 @@ def _is_local(spark_context: SparkContext) -> bool:
     return spark_context._jsc.sc().isLocal()
 
 
-def _get_gpu_id(task_context: BarrierTaskContext) -> int:
+def _get_gpu_id(task_context: TaskContext) -> int:
     """Get the gpu id from the task resources"""
     if task_context is None:
         # This is a safety check.
diff --git a/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py b/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py
index a954d9d6c..513554e43 100644
--- a/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py
+++ b/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py
@@ -2,6 +2,7 @@ import json
 import logging
 import subprocess
 
+import numpy as np
 import pytest
 import sklearn
 
@@ -13,7 +14,7 @@ from pyspark.ml.linalg import Vectors
 from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
 from pyspark.sql import SparkSession
 
-from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
+from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor, SparkXGBRegressorModel
 
 gpu_discovery_script_path = "tests/test_distributed/test_gpu_with_spark/discover_gpu.sh"
 
@@ -242,3 +243,33 @@ def test_sparkxgb_regressor_feature_cols_with_gpu(spark_diabetes_dataset_feature
     evaluator = RegressionEvaluator(metricName="rmse")
     rmse = evaluator.evaluate(pred_result_df)
     assert rmse <= 65.0
+
+
+def test_gpu_transform(spark_diabetes_dataset) -> None:
+    regressor = SparkXGBRegressor(device="cuda", num_workers=num_workers)
+    train_df, test_df = spark_diabetes_dataset
+    model: SparkXGBRegressorModel = regressor.fit(train_df)
+
+    # The model trained with GPUs, and transform with GPU configurations.
+    assert model._gpu_transform()
+
+    model.set_device("cpu")
+    assert not model._gpu_transform()
+    # without error
+    cpu_rows = model.transform(test_df).select("prediction").collect()
+
+    regressor = SparkXGBRegressor(device="cpu", num_workers=num_workers)
+    model = regressor.fit(train_df)
+
+    # The model trained with CPUs. Even with GPU configurations,
+    # still prefer transforming with CPUs
+    assert not model._gpu_transform()
+
+    # Set gpu transform explicitly.
+    model.set_device("cuda")
+    assert model._gpu_transform()
+    # without error
+    gpu_rows = model.transform(test_df).select("prediction").collect()
+
+    for cpu, gpu in zip(cpu_rows, gpu_rows):
+        np.testing.assert_allclose(cpu.prediction, gpu.prediction, atol=1e-3)
diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py
index e323a3606..861e67a75 100644
--- a/tests/test_distributed/test_with_spark/test_spark_local.py
+++ b/tests/test_distributed/test_with_spark/test_spark_local.py
@@ -888,6 +888,34 @@ class TestPySparkLocal:
         clf = SparkXGBClassifier(device="cuda")
         clf._validate_params()
 
+    def test_gpu_transform(self, clf_data: ClfData) -> None:
+        """local mode"""
+        classifier = SparkXGBClassifier(device="cpu")
+        model: SparkXGBClassifierModel = classifier.fit(clf_data.cls_df_train)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path = "file:" + tmpdir
+            model.write().overwrite().save(path)
+
+            # The model trained with CPU, transform defaults to cpu
+            assert not model._gpu_transform()
+
+            # without error
+            model.transform(clf_data.cls_df_test).collect()
+
+            model.set_device("cuda")
+            assert model._gpu_transform()
+
+            model_loaded = SparkXGBClassifierModel.load(path)
+
+            # The model trained with CPU, transform defaults to cpu
+            assert not model_loaded._gpu_transform()
+            # without error
+            model_loaded.transform(clf_data.cls_df_test).collect()
+
+            model_loaded.set_device("cuda")
+            assert model_loaded._gpu_transform()
+
 
 class XgboostLocalTest(SparkTestCase):
     def setUp(self):

From 66a08327782e141003e2babd40487080dc3b66c4 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Thu, 7 Sep 2023 02:21:58 -0700
Subject: [PATCH 03/59] Add tests for gpu_approx (#9553)

---
 tests/cpp/plugin/test_federated_learner.cc |  5 ++
 tests/cpp/tree/test_gpu_hist.cu            | 60 ++++++++++++++++++++--
 2 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/tests/cpp/plugin/test_federated_learner.cc b/tests/cpp/plugin/test_federated_learner.cc
index 427bd790c..a9adedc63 100644
--- a/tests/cpp/plugin/test_federated_learner.cc
+++ b/tests/cpp/plugin/test_federated_learner.cc
@@ -120,6 +120,11 @@ TEST_P(VerticalFederatedLearnerTest, Hist) {
 }
 
 #if defined(XGBOOST_USE_CUDA)
+TEST_P(VerticalFederatedLearnerTest, GPUApprox) {
+  std::string objective = GetParam();
+  this->Run("approx", "cuda:0", objective);
+}
+
 TEST_P(VerticalFederatedLearnerTest, GPUHist) {
   std::string objective = GetParam();
   this->Run("hist", "cuda:0", objective);
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 76734e526..f21ed1f06 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -428,7 +428,7 @@ TEST(GpuHist, MaxDepth) {
 }
 
 namespace {
-RegTree GetUpdatedTree(Context const* ctx, DMatrix* dmat) {
+RegTree GetHistTree(Context const* ctx, DMatrix* dmat) {
   ObjInfo task{ObjInfo::kRegression};
   GPUHistMaker hist_maker{ctx, &task};
   hist_maker.Configure(Args{});
@@ -446,7 +446,7 @@ RegTree GetUpdatedTree(Context const* ctx, DMatrix* dmat) {
   return tree;
 }
 
-void VerifyColumnSplit(bst_row_t rows, bst_feature_t cols, RegTree const& expected_tree) {
+void VerifyHistColumnSplit(bst_row_t rows, bst_feature_t cols, RegTree const& expected_tree) {
   Context ctx(MakeCUDACtx(GPUIDX));
 
   auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
@@ -454,7 +454,7 @@ void VerifyColumnSplit(bst_row_t rows, bst_feature_t cols, RegTree const& expect
   auto const rank = collective::GetRank();
   std::unique_ptr<DMatrix> sliced{Xy->SliceCol(world_size, rank)};
 
-  RegTree tree = GetUpdatedTree(&ctx, sliced.get());
+  RegTree tree = GetHistTree(&ctx, sliced.get());
 
   Json json{Object{}};
   tree.SaveModel(&json);
@@ -472,8 +472,58 @@ TEST_F(MGPUHistTest, GPUHistColumnSplit) {
 
   Context ctx(MakeCUDACtx(0));
   auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
-  RegTree expected_tree = GetUpdatedTree(&ctx, dmat.get());
+  RegTree expected_tree = GetHistTree(&ctx, dmat.get());
 
-  DoTest(VerifyColumnSplit, kRows, kCols, expected_tree);
+  DoTest(VerifyHistColumnSplit, kRows, kCols, expected_tree);
+}
+
+namespace {
+RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) {
+  ObjInfo task{ObjInfo::kRegression};
+  GPUGlobalApproxMaker approx_maker{ctx, &task};
+  approx_maker.Configure(Args{});
+
+  TrainParam param;
+  param.UpdateAllowUnknown(Args{});
+
+  linalg::Matrix<GradientPair> gpair({dmat->Info().num_row_}, ctx->Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_));
+
+  std::vector<HostDeviceVector<bst_node_t>> position(1);
+  RegTree tree;
+  approx_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
+                      {&tree});
+  return tree;
+}
+
+void VerifyApproxColumnSplit(bst_row_t rows, bst_feature_t cols, RegTree const& expected_tree) {
+  Context ctx(MakeCUDACtx(GPUIDX));
+
+  auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  std::unique_ptr<DMatrix> sliced{Xy->SliceCol(world_size, rank)};
+
+  RegTree tree = GetApproxTree(&ctx, sliced.get());
+
+  Json json{Object{}};
+  tree.SaveModel(&json);
+  Json expected_json{Object{}};
+  expected_tree.SaveModel(&expected_json);
+  ASSERT_EQ(json, expected_json);
+}
+}  // anonymous namespace
+
+class MGPUApproxTest : public BaseMGPUTest {};
+
+TEST_F(MGPUApproxTest, GPUApproxColumnSplit) {
+  auto constexpr kRows = 32;
+  auto constexpr kCols = 16;
+
+  Context ctx(MakeCUDACtx(0));
+  auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
+  RegTree expected_tree = GetApproxTree(&ctx, dmat.get());
+
+  DoTest(VerifyApproxColumnSplit, kRows, kCols, expected_tree);
 }
 }  // namespace xgboost::tree

From 9027686cac6b91f90e76c70b5aa6511f96c6e2c6 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Mon, 11 Sep 2023 17:44:51 +0800
Subject: [PATCH 04/59] Support pandas 2.1.0. (#9557)

---
 python-package/xgboost/data.py   | 66 +++++++++++++++++++++-----------
 tests/python/test_with_pandas.py |  8 ++--
 2 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index 04bdc5739..428e48d10 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -317,7 +317,6 @@ def pandas_feature_info(
 ) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]:
     """Handle feature info for pandas dataframe."""
     import pandas as pd
-    from pandas.api.types import is_categorical_dtype, is_sparse
 
     # handle feature names
     if feature_names is None and meta is None:
@@ -332,10 +331,10 @@ def pandas_feature_info(
     if feature_types is None and meta is None:
         feature_types = []
         for dtype in data.dtypes:
-            if is_sparse(dtype):
+            if is_pd_sparse_dtype(dtype):
                 feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
             elif (
-                is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
+                is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
             ) and enable_categorical:
                 feature_types.append(CAT_T)
             else:
@@ -345,18 +344,13 @@ def pandas_feature_info(
 
 def is_nullable_dtype(dtype: PandasDType) -> bool:
     """Whether dtype is a pandas nullable type."""
-    from pandas.api.types import (
-        is_bool_dtype,
-        is_categorical_dtype,
-        is_float_dtype,
-        is_integer_dtype,
-    )
+    from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype
 
     is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper
     # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
     is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
     is_float = is_float_dtype(dtype) and dtype.name in pandas_nullable_mapper
-    return is_int or is_bool or is_float or is_categorical_dtype(dtype)
+    return is_int or is_bool or is_float or is_pd_cat_dtype(dtype)
 
 
 def is_pa_ext_dtype(dtype: Any) -> bool:
@@ -371,17 +365,48 @@ def is_pa_ext_categorical_dtype(dtype: Any) -> bool:
     )
 
 
+def is_pd_cat_dtype(dtype: PandasDType) -> bool:
+    """Wrapper for testing pandas category type."""
+    import pandas as pd
+
+    if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
+        Version = pd.util.version.Version
+        if Version(pd.__version__) >= Version("2.1.0"):
+            from pandas import CategoricalDtype
+
+            return isinstance(dtype, CategoricalDtype)
+
+    from pandas.api.types import is_categorical_dtype
+
+    return is_categorical_dtype(dtype)
+
+
+def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
+    """Wrapper for testing pandas sparse type."""
+    import pandas as pd
+
+    if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
+        Version = pd.util.version.Version
+        if Version(pd.__version__) >= Version("2.1.0"):
+            from pandas import SparseDtype
+
+            return isinstance(dtype, SparseDtype)
+
+    from pandas.api.types import is_sparse
+
+    return is_sparse(dtype)
+
+
 def pandas_cat_null(data: DataFrame) -> DataFrame:
     """Handle categorical dtype and nullable extension types from pandas."""
     import pandas as pd
-    from pandas.api.types import is_categorical_dtype
 
     # handle category codes and nullable.
     cat_columns = []
     nul_columns = []
     # avoid an unnecessary conversion if possible
     for col, dtype in zip(data.columns, data.dtypes):
-        if is_categorical_dtype(dtype):
+        if is_pd_cat_dtype(dtype):
             cat_columns.append(col)
         elif is_pa_ext_categorical_dtype(dtype):
             raise ValueError(
@@ -398,7 +423,7 @@ def pandas_cat_null(data: DataFrame) -> DataFrame:
         transformed = data
 
     def cat_codes(ser: pd.Series) -> pd.Series:
-        if is_categorical_dtype(ser.dtype):
+        if is_pd_cat_dtype(ser.dtype):
             return ser.cat.codes
         assert is_pa_ext_categorical_dtype(ser.dtype)
         # Not yet supported, the index is not ordered for some reason. Alternately:
@@ -454,14 +479,12 @@ def _transform_pandas_df(
     meta: Optional[str] = None,
     meta_type: Optional[NumpyDType] = None,
 ) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
-    from pandas.api.types import is_categorical_dtype, is_sparse
-
     pyarrow_extension = False
     for dtype in data.dtypes:
         if not (
             (dtype.name in _pandas_dtype_mapper)
-            or is_sparse(dtype)
-            or (is_categorical_dtype(dtype) and enable_categorical)
+            or is_pd_sparse_dtype(dtype)
+            or (is_pd_cat_dtype(dtype) and enable_categorical)
             or is_pa_ext_dtype(dtype)
         ):
             _invalid_dataframe_dtype(data)
@@ -515,9 +538,8 @@ def _meta_from_pandas_series(
 ) -> None:
     """Help transform pandas series for meta data like labels"""
     data = data.values.astype("float")
-    from pandas.api.types import is_sparse
 
-    if is_sparse(data):
+    if is_pd_sparse_dtype(getattr(data, "dtype", data)):
         data = data.to_dense()  # type: ignore
     assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
     _meta_from_numpy(data, name, dtype, handle)
@@ -539,13 +561,11 @@ def _from_pandas_series(
     feature_names: Optional[FeatureNames],
     feature_types: Optional[FeatureTypes],
 ) -> DispatchedDataBackendReturnType:
-    from pandas.api.types import is_categorical_dtype
-
     if (data.dtype.name not in _pandas_dtype_mapper) and not (
-        is_categorical_dtype(data.dtype) and enable_categorical
+        is_pd_cat_dtype(data.dtype) and enable_categorical
     ):
         _invalid_dataframe_dtype(data)
-    if enable_categorical and is_categorical_dtype(data.dtype):
+    if enable_categorical and is_pd_cat_dtype(data.dtype):
         data = data.cat.codes
     return _from_numpy_array(
         data.values.reshape(data.shape[0], 1).astype("float"),
diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py
index f8a21b6ab..a23a66b63 100644
--- a/tests/python/test_with_pandas.py
+++ b/tests/python/test_with_pandas.py
@@ -211,7 +211,7 @@ class TestPandas:
         y = np.random.randn(kRows)
         w = np.random.uniform(size=kRows).astype(np.float32)
         w_pd = pd.DataFrame(w)
-        data = xgb.DMatrix(X, y, w_pd)
+        data = xgb.DMatrix(X, y, weight=w_pd)
 
         assert data.num_row() == kRows
         assert data.num_col() == kCols
@@ -301,14 +301,14 @@ class TestPandas:
 
     @pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix])
     def test_nullable_type(self, DMatrixT) -> None:
-        from pandas.api.types import is_categorical_dtype
+        from xgboost.data import is_pd_cat_dtype
 
         for orig, df in pd_dtypes():
             if hasattr(df.dtypes, "__iter__"):
-                enable_categorical = any(is_categorical_dtype for dtype in df.dtypes)
+                enable_categorical = any(is_pd_cat_dtype(dtype) for dtype in df.dtypes)
             else:
                 # series
-                enable_categorical = is_categorical_dtype(df.dtype)
+                enable_categorical = is_pd_cat_dtype(df.dtype)
 
             f0_orig = orig[orig.columns[0]] if isinstance(orig, pd.DataFrame) else orig
             f0 = df[df.columns[0]] if isinstance(df, pd.DataFrame) else df

From d05ea589fb5dca2aa1ace6c9fbf7b5fbce896ed3 Mon Sep 17 00:00:00 2001
From: Jon Yoquinto <60784177+yoquinjo@users.noreply.github.com>
Date: Mon, 11 Sep 2023 17:29:51 -0600
Subject: [PATCH 05/59] Allow JVM-Package to access inplace predict method
 (#9167)

---------

Co-authored-by: Stephan T. Lavavej <stl@nuwen.net>
Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
Co-authored-by: Joe <25804777+ByteSizedJoe@users.noreply.github.com>
---
 .../java/ml/dmlc/xgboost4j/java/Booster.java  | 106 ++++++++++
 .../ml/dmlc/xgboost4j/java/XGBoostJNI.java    |   8 +-
 .../xgboost4j/src/native/xgboost4j.cpp        |  79 +++++++
 jvm-packages/xgboost4j/src/native/xgboost4j.h |  16 +-
 .../dmlc/xgboost4j/java/BoosterImplTest.java  | 193 +++++++++++++++++-
 5 files changed, 384 insertions(+), 18 deletions(-)

diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java
index 11f5299c0..7ed12c704 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java
@@ -39,6 +39,21 @@ public class Booster implements Serializable, KryoSerializable {
   // handle to the booster.
   private long handle = 0;
   private int version = 0;
+  /**
+   * Type of prediction, used for inplace_predict.
+   */
+  public enum PredictionType {
+    kValue(0),
+    kMargin(1);
+
+    private Integer ptype;
+    private PredictionType(final Integer ptype) {
+      this.ptype = ptype;
+    }
+    public Integer getPType() {
+      return ptype;
+    }
+  }
 
   /**
    * Create a new Booster with empty stage.
@@ -375,6 +390,97 @@ public class Booster implements Serializable, KryoSerializable {
     return predicts;
   }
 
+  /**
+   * Perform thread-safe prediction.
+   *
+   * @param data      Flattened input matrix of features for prediction
+   * @param nrow      The number of predictions to make (count of input matrix rows)
+   * @param ncol      The number of features in the model (count of input matrix columns)
+   * @param missing   Value indicating missing element in the <code>data</code> input matrix
+   *
+   * @return predict  Result matrix
+   */
+  public float[][] inplace_predict(float[] data,
+                                   int nrow,
+                                   int ncol,
+                                   float missing) throws XGBoostError {
+    int[] iteration_range = new int[2];
+    iteration_range[0] = 0;
+    iteration_range[1] = 0;
+    return this.inplace_predict(data, nrow, ncol,
+        missing, iteration_range, PredictionType.kValue, null);
+  }
+
+  /**
+   * Perform thread-safe prediction.
+   *
+   * @param data      Flattened input matrix of features for prediction
+   * @param nrow      The number of predictions to make (count of input matrix rows)
+   * @param ncol      The number of features in the model (count of input matrix columns)
+   * @param missing   Value indicating missing element in the <code>data</code> input matrix
+   * @param iteration_range Specifies which layer of trees are used in prediction.  For
+   *                        example, if a random forest is trained with 100 rounds.
+   *                        Specifying `iteration_range=[10, 20)`, then only the forests
+   *                        built during [10, 20) (half open set) rounds are used in this
+   *                        prediction.
+   *
+   * @return predict  Result matrix
+   */
+  public float[][] inplace_predict(float[] data,
+                                   int nrow,
+                                   int ncol,
+                                   float missing, int[] iteration_range) throws XGBoostError {
+    return this.inplace_predict(data, nrow, ncol,
+        missing, iteration_range, PredictionType.kValue, null);
+  }
+
+
+  /**
+   * Perform thread-safe prediction.
+   *
+   * @param data            Flattened input matrix of features for prediction
+   * @param nrow            The number of predictions to make (count of input matrix rows)
+   * @param ncol            The number of features in the model (count of input matrix columns)
+   * @param missing         Value indicating missing element in the <code>data</code> input matrix
+   * @param iteration_range Specifies which layer of trees are used in prediction.  For
+   *                        example, if a random forest is trained with 100 rounds.
+   *                        Specifying `iteration_range=[10, 20)`, then only the forests
+   *                        built during [10, 20) (half open set) rounds are used in this
+   *                        prediction.
+   * @param predict_type    What kind of prediction to run.
+   * @param base_margin     Base margin handed to the native predictor; may be null.
+   * @return predict        Result matrix
+   * @throws XGBoostError   If <code>iteration_range</code> is null or not of length two.
+   */
+  public float[][] inplace_predict(float[] data,
+                                   int nrow,
+                                   int ncol,
+                                   float missing,
+                                   int[] iteration_range,
+                                   PredictionType predict_type,
+                                   float[] base_margin) throws XGBoostError {
+    if (iteration_range == null || iteration_range.length != 2) {
+      throw new XGBoostError("Iteration range is expected to be [begin, end).");
+    }
+    int ptype = predict_type.getPType();
+
+    int begin = iteration_range[0];
+    int end = iteration_range[1];
+
+    float[][] rawPredicts = new float[1][];
+    XGBoostJNI.checkCall(XGBoostJNI.XGBoosterPredictFromDense(handle, data, nrow, ncol,
+        missing,
+        begin, end, ptype, base_margin, rawPredicts));
+
+    // The native call fills one flat array; copy it row-major into nrow rows.
+    int col = rawPredicts[0].length / nrow;
+    float[][] predicts = new float[nrow][col];
+    for (int i = 0; i < rawPredicts[0].length; i++) {
+      predicts[i / col][i % col] = rawPredicts[0][i];
+    }
+    return predicts;
+  }
+
   /**
    * Predict leaf indices given the data
    *
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java
index d71d0a4f5..eabbf29ba 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java
@@ -119,6 +119,10 @@ class XGBoostJNI {
   public final static native int XGBoosterPredict(long handle, long dmat, int option_mask,
                                                   int ntree_limit, float[][] predicts);
 
+  public final static native int XGBoosterPredictFromDense(long handle, float[] data,
+      long nrow, long ncol, float missing, int iteration_begin, int iteration_end, int predict_type, float[] margin,
+      float[][] predicts);
+
   public final static native int XGBoosterLoadModel(long handle, String fname);
 
   public final static native int XGBoosterSaveModel(long handle, String fname);
@@ -154,10 +158,6 @@ class XGBoostJNI {
   public final static native int XGDMatrixSetInfoFromInterface(
     long handle, String field, String json);
 
-  @Deprecated
-  public final static native int XGDeviceQuantileDMatrixCreateFromCallback(
-    java.util.Iterator<ColumnBatch> iter, float missing, int nthread, int maxBin, long[] out);
-
   public final static native int XGQuantileDMatrixCreateFromCallback(
     java.util.Iterator<ColumnBatch> iter, java.util.Iterator<ColumnBatch> ref, String config, long[] out);
 
diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp
index c0c077430..821b1ebff 100644
--- a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp
+++ b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp
@@ -684,6 +684,85 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterPredict
   return ret;
 }
 
+/*
+ * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
+ * Method:    XGBoosterPredictFromDense
+ * Signature: (J[FJJFIII[F[[F)I
+ */
+JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterPredictFromDense(
+    JNIEnv *jenv, jclass jcls, jlong jhandle, jfloatArray jdata, jlong num_rows, jlong num_features,
+    jfloat missing, jint iteration_begin, jint iteration_end, jint predict_type,
+    jfloatArray jmargin, jobjectArray jout) {
+  API_BEGIN();
+  BoosterHandle handle = reinterpret_cast<BoosterHandle>(jhandle);
+
+  /**
+   * Create array interface.
+   */
+  namespace linalg = xgboost::linalg;
+  jfloat *data = jenv->GetFloatArrayElements(jdata, nullptr);
+  xgboost::Context ctx;
+  auto t_data = linalg::MakeTensorView(
+      ctx.Device(),
+      xgboost::common::Span{data, static_cast<std::size_t>(num_rows * num_features)}, num_rows,
+      num_features);
+  auto s_array = linalg::ArrayInterfaceStr(t_data);
+
+  /**
+   * Create configuration object.
+   */
+  xgboost::Json config{xgboost::Object{}};
+  config["cache_id"] = xgboost::Integer{};
+  config["type"] = xgboost::Integer{static_cast<std::int32_t>(predict_type)};
+  config["iteration_begin"] = xgboost::Integer{static_cast<xgboost::bst_layer_t>(iteration_begin)};
+  config["iteration_end"] = xgboost::Integer{static_cast<xgboost::bst_layer_t>(iteration_end)};
+  config["missing"] = xgboost::Number{static_cast<float>(missing)};
+  config["strict_shape"] = xgboost::Boolean{true};
+  std::string s_config;
+  xgboost::Json::Dump(config, &s_config);
+
+  /**
+   * Handle base margin. Failures are recorded in `ret` instead of returning right away, so
+   * that the Java arrays and the proxy DMatrix acquired here are always released below.
+   */
+  BoosterHandle proxy{nullptr};
+  float *margin{nullptr};
+  int ret{0};
+  if (jmargin) {
+    margin = jenv->GetFloatArrayElements(jmargin, nullptr);
+    ret = XGProxyDMatrixCreate(&proxy);
+    if (ret == 0) {
+      ret = XGDMatrixSetFloatInfo(proxy, "base_margin", margin, jenv->GetArrayLength(jmargin));
+    }
+  }
+
+  bst_ulong const *out_shape{nullptr};
+  bst_ulong out_dim{0};
+  float const *result{nullptr};
+  if (ret == 0) {
+    ret = XGBoosterPredictFromDense(handle, s_array.c_str(), s_config.c_str(), proxy, &out_shape,
+                                    &out_dim, &result);
+  }
+
+  // Clean up on every path before reporting an error.
+  jenv->ReleaseFloatArrayElements(jdata, data, 0);
+  if (proxy) XGDMatrixFree(proxy);
+  if (margin) jenv->ReleaseFloatArrayElements(jmargin, margin, 0);
+  if (ret != 0) return ret;
+
+  std::size_t n{1};
+  for (std::size_t i = 0; i < out_dim; ++i) {
+    n *= out_shape[i];
+  }
+
+  jfloatArray jarray = jenv->NewFloatArray(n);
+
+  jenv->SetFloatArrayRegion(jarray, 0, n, result);
+  jenv->SetObjectArrayElement(jout, 0, jarray);
+
+  API_END();
+}
+
 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
  * Method:    XGBoosterLoadModel
diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.h b/jvm-packages/xgboost4j/src/native/xgboost4j.h
index b221c6a57..87ff6d30d 100644
--- a/jvm-packages/xgboost4j/src/native/xgboost4j.h
+++ b/jvm-packages/xgboost4j/src/native/xgboost4j.h
@@ -207,6 +207,14 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterEvalOneIt
 JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterPredict
   (JNIEnv *, jclass, jlong, jlong, jint, jint, jobjectArray);
 
+/*
+ * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
+ * Method:    XGBoosterPredictFromDense
+ * Signature: (J[FJJFIII[F[[F)I
+ */
+JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterPredictFromDense
+  (JNIEnv *, jclass, jlong, jfloatArray, jlong, jlong, jfloat, jint, jint, jint, jfloatArray, jobjectArray);
+
 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
  * Method:    XGBoosterLoadModel
@@ -359,14 +367,6 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_CommunicatorAllred
 JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixSetInfoFromInterface
   (JNIEnv *, jclass, jlong, jstring, jstring);
 
-/*
- * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
- * Method:    XGDeviceQuantileDMatrixCreateFromCallback
- * Signature: (Ljava/util/Iterator;FII[J)I
- */
-JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDeviceQuantileDMatrixCreateFromCallback
-  (JNIEnv *, jclass, jobject, jfloat, jint, jint, jlongArray);
-
 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
  * Method:    XGQuantileDMatrixCreateFromCallback
diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java
index 70966a38f..c7508b20d 100644
--- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java
+++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java
@@ -15,16 +15,24 @@
  */
 package ml.dmlc.xgboost4j.java;
 
-import java.io.*;
-import java.util.*;
-
 import junit.framework.TestCase;
+import org.junit.Assert;
 import org.junit.Test;
 
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.util.*;
+import java.util.concurrent.*;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.fail;
+
 /**
- * test cases for Booster
- *
- * @author hzx
+ * test cases for Booster, including in-place prediction
+ * 
+ * @author hzx and Sovrn
  */
 public class BoosterImplTest {
   private String train_uri = "../../demo/data/agaricus.txt.train?indexing_mode=1&format=libsvm";
@@ -99,6 +107,179 @@ public class BoosterImplTest {
     TestCase.assertTrue(eval.eval(predicts, testMat) < 0.1f);
   }
 
+  @Test
+  public void inplacePredictTest() throws XGBoostError {
+    /* Data Generation */
+    // Generate a training set.
+    int trainRows = 1000;
+    int features = 10;
+    int trainSize = trainRows * features;
+    float[] trainX = generateRandomDataSet(trainSize);
+    float[] trainY = generateRandomDataSet(trainRows);
+
+    DMatrix trainingMatrix = new DMatrix(trainX, trainRows, features, Float.NaN);
+    trainingMatrix.setLabel(trainY);
+
+    // Generate a testing set
+    int testRows = 10;
+    int testSize = testRows * features;
+    float[] testX = generateRandomDataSet(testSize);
+    float[] testY = generateRandomDataSet(testRows);
+
+    DMatrix testingMatrix = new DMatrix(testX, testRows, features, Float.NaN);
+    testingMatrix.setLabel(testY);
+
+    /* Training */
+
+    // Set parameters
+    Map<String, Object> params = new HashMap<>();
+    params.put("eta", 1.0);
+    params.put("max_depth",2);
+    params.put("silent", 1);
+    params.put("tree_method", "hist");
+
+    Map<String, DMatrix> watches = new HashMap<>();
+    watches.put("train", trainingMatrix);
+    watches.put("test", testingMatrix);
+
+    Booster booster = XGBoost.train(trainingMatrix, params, 10, watches, null, null);
+
+    /* Prediction */
+
+    // Standard prediction
+    float[][] predictions = booster.predict(testingMatrix);
+
+    // Inplace-prediction
+    float[][] inplacePredictions = booster.inplace_predict(testX, testRows, features, Float.NaN);
+
+    // Confirm that the two prediction results are identical
+    assertArrayEquals(predictions, inplacePredictions);
+  }
+
+  @Test
+  public void inplacePredictMultiPredictTest() throws InterruptedException, ExecutionException {
+    // Multithreaded, multiple prediction
+    int trainRows = 1000;
+    int features = 10;
+    int trainSize = trainRows * features;
+
+    int testRows = 10;
+    int testSize = testRows * features;
+
+    //Simulate multiple predictions on multiple random data sets simultaneously.
+    ExecutorService executorService = Executors.newFixedThreadPool(5);
+    List<Future<?>> pending = new ArrayList<>();
+    int predictsToPerform = 100;
+    for(int i = 0; i < predictsToPerform; i++) {
+      pending.add(executorService.submit(() -> {
+        try {
+          float[] trainX = generateRandomDataSet(trainSize);
+          float[] trainY = generateRandomDataSet(trainRows);
+          DMatrix trainingMatrix = new DMatrix(trainX, trainRows, features, Float.NaN);
+          trainingMatrix.setLabel(trainY);
+          float[] testX = generateRandomDataSet(testSize);
+          float[] testY = generateRandomDataSet(testRows);
+          DMatrix testingMatrix = new DMatrix(testX, testRows, features, Float.NaN);
+          testingMatrix.setLabel(testY);
+          Map<String, Object> params = new HashMap<>();
+          params.put("eta", 1.0);
+          params.put("max_depth", 2);
+          params.put("silent", 1);
+          params.put("tree_method", "hist");
+          Map<String, DMatrix> watches = new HashMap<>();
+          watches.put("train", trainingMatrix);
+          watches.put("test", testingMatrix);
+
+          Booster booster = XGBoost.train(trainingMatrix, params, 10, watches, null, null);
+          float[][] predictions = booster.predict(testingMatrix);
+          float[][] inplacePredictions = booster.inplace_predict(testX, testRows, features, Float.NaN);
+          assertArrayEquals(predictions, inplacePredictions);
+        } catch (XGBoostError xgBoostError) {
+          fail(xgBoostError.getMessage());
+        }
+      }));
+    }
+    executorService.shutdown();
+    if(!executorService.awaitTermination(1, TimeUnit.MINUTES))
+      executorService.shutdownNow();
+    // Rethrow failures raised on worker threads; the executor would otherwise swallow them.
+    for (Future<?> task : pending) {
+      if (task.isDone()) task.get();
+    }
+  }
+
+  @Test
+  public void inplacePredictWithMarginTest() throws XGBoostError {
+    //Generate a training set
+    int trainRows = 1000;
+    int features = 10;
+    int trainSize = trainRows * features;
+    float[] trainX = generateRandomDataSet(trainSize);
+    float[] trainY = generateRandomDataSet(trainRows);
+
+    DMatrix trainingMatrix = new DMatrix(trainX, trainRows, features, Float.NaN);
+    trainingMatrix.setLabel(trainY);
+
+    // Generate a testing set
+    int testRows = 10;
+    int testSize = testRows * features;
+    float[] testX = generateRandomDataSet(testSize);
+    float[] testY = generateRandomDataSet(testRows);
+
+    DMatrix testingMatrix = new DMatrix(testX, testRows, features, Float.NaN);
+    testingMatrix.setLabel(testY);
+
+    // Set booster parameters
+    Map<String, Object> params = new HashMap<>();
+    params.put("eta", 1.0);
+    params.put("max_depth",2);
+    params.put("tree_method", "hist");
+    params.put("base_score", 0.0);
+
+    Map<String, DMatrix> watches = new HashMap<>();
+    watches.put("train", trainingMatrix);
+    watches.put("test", testingMatrix);
+
+    // Train booster on training matrix.
+    Booster booster = XGBoost.train(trainingMatrix, params, 10, watches, null, null);
+
+    // Create a margin
+    float[] margin = new float[testRows];
+    Arrays.fill(margin, 0.5f);
+
+    // Use the iteration range [0, 0), i.e. all training iterations, so that this call
+    // matches the call without a margin, which also defines an iteration range of [0, 0);
+    // the two results should then differ only by the margin.
+    int[] iterationRange = new int[] { 0, 0 };
+
+    float[][] inplacePredictionsWithMargin = booster.inplace_predict(testX,
+        testRows,
+        features,
+        Float.NaN,
+        iterationRange,
+        Booster.PredictionType.kValue,
+        margin);
+    float[][] inplacePredictionsWithoutMargin = booster.inplace_predict(testX, testRows, features, Float.NaN);
+
+    // The margin holds one value per row (length testRows), so it must be indexed by the
+    // row i rather than by the output column j.
+    for (int i = 0; i < inplacePredictionsWithoutMargin.length; i++) {
+      for (int j = 0; j < inplacePredictionsWithoutMargin[i].length; j++) {
+        inplacePredictionsWithoutMargin[i][j] += margin[i];
+      }
+      assertArrayEquals(inplacePredictionsWithMargin[i], inplacePredictionsWithoutMargin[i], 1e-6f);
+    }
+  }
+
+  private float[] generateRandomDataSet(int size) {
+    float[] newSet = new float[size];
+    Random random = new Random();
+    for(int i = 0; i < size; i++) {
+      newSet[i] = random.nextFloat();
+    }
+    return newSet;
+  }
+
   @Test
   public void saveLoadModelWithPath() throws XGBoostError, IOException {
     DMatrix trainMat = new DMatrix(this.train_uri);

From f90d034a86784f4f07417d1d28a77b4a189acc89 Mon Sep 17 00:00:00 2001
From: Ikko Eltociear Ashimine <eltociear@gmail.com>
Date: Tue, 12 Sep 2023 21:53:07 +0900
Subject: [PATCH 06/59] [doc] Fix typo in python_packaging.rst (#9573)

---
 doc/contrib/python_packaging.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/contrib/python_packaging.rst b/doc/contrib/python_packaging.rst
index 5cf085685..ebd9d36ec 100644
--- a/doc/contrib/python_packaging.rst
+++ b/doc/contrib/python_packaging.rst
@@ -35,7 +35,7 @@ Building sdists
 
 In the case of XGBoost, an sdist contains both the Python code as well as
 the C++ code, so that the core part of XGBoost can be compiled into the
-shared libary ``libxgboost.so`` [#shared_lib_name]_.
+shared library ``libxgboost.so`` [#shared_lib_name]_.
 
 You can obtain an sdist as follows:
 

From 5abe50ff8ca3acccd3b11487d82283f550e78e1e Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Wed, 13 Sep 2023 23:19:29 +0800
Subject: [PATCH 07/59] [R] Fix method name. (#9577)

---
 R-package/R/callbacks.R | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R
index 7265967b2..96bcaef32 100644
--- a/R-package/R/callbacks.R
+++ b/R-package/R/callbacks.R
@@ -70,7 +70,7 @@ cb.print.evaluation <- function(period = 1, showsd = TRUE) {
         i == env$begin_iteration ||
         i == env$end_iteration) {
       stdev <- if (showsd) env$bst_evaluation_err else NULL
-      msg <- format.eval.string(i, env$bst_evaluation, stdev)
+      msg <- .format_eval_string(i, env$bst_evaluation, stdev)
       cat(msg, '\n')
     }
   }
@@ -380,7 +380,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
     if ((maximize && score > best_score) ||
         (!maximize && score < best_score)) {
 
-      best_msg <<- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err)
+      best_msg <<- .format_eval_string(
+        i, env$bst_evaluation, env$bst_evaluation_err
+      )
       best_score <<- score
       best_iteration <<- i
       best_ntreelimit <<- best_iteration * env$num_parallel_tree
@@ -754,7 +756,7 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
 #
 
 # Format the evaluation metric string
-format.eval.string <- function(iter, eval_res, eval_err = NULL) {
+.format_eval_string <- function(iter, eval_res, eval_err = NULL) {
   if (length(eval_res) == 0)
     stop('no evaluation results')
   enames <- names(eval_res)

From b438d684d27286f09ec2864a2ce7da28f32acc0f Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 14 Sep 2023 01:41:42 +0800
Subject: [PATCH 08/59] Utilities and cleanups for socket. (#9576)

- Use c++-17 nodiscard and nested ns.
- Add bind method to socket.
- Remove rabit parameters.
---
 include/xgboost/c_api.h                   |  10 --
 include/xgboost/collective/socket.h       | 140 +++++++++++++++++-----
 include/xgboost/json.h                    |  13 ++
 include/xgboost/string_view.h             |   2 +-
 plugin/federated/federated_client.h       |   8 +-
 plugin/federated/federated_communicator.h |  11 +-
 plugin/federated/federated_server.cc      |   8 +-
 rabit/src/allreduce_base.cc               |  10 +-
 src/collective/communicator.cc            |   8 +-
 src/collective/socket.cc                  |  28 +++--
 tests/cpp/collective/test_socket.cc       |  11 ++
 tests/cpp/common/test_json.cc             |  13 ++
 12 files changed, 187 insertions(+), 75 deletions(-)

diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index afc1f47fd..9bce616ef 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -1554,29 +1554,19 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *config,
  * \param config JSON encoded configuration. Accepted JSON keys are:
  *   - xgboost_communicator: The type of the communicator. Can be set as an environment variable.
  *     * rabit: Use Rabit. This is the default if the type is unspecified.
- *     * mpi: Use MPI.
  *     * federated: Use the gRPC interface for Federated Learning.
  * Only applicable to the Rabit communicator (these are case-sensitive):
  *   - rabit_tracker_uri: Hostname of the tracker.
  *   - rabit_tracker_port: Port number of the tracker.
  *   - rabit_task_id: ID of the current task, can be used to obtain deterministic rank assignment.
  *   - rabit_world_size: Total number of workers.
- *   - rabit_hadoop_mode: Enable Hadoop support.
- *   - rabit_tree_reduce_minsize: Minimal size for tree reduce.
- *   - rabit_reduce_ring_mincount: Minimal count to perform ring reduce.
- *   - rabit_reduce_buffer: Size of the reduce buffer.
- *   - rabit_bootstrap_cache: Size of the bootstrap cache.
- *   - rabit_debug: Enable debugging.
  *   - rabit_timeout: Enable timeout.
  *   - rabit_timeout_sec: Timeout in seconds.
- *   - rabit_enable_tcp_no_delay: Enable TCP no delay on Unix platforms.
  * Only applicable to the Rabit communicator (these are case-sensitive, and can be set as
  * environment variables):
  *   - DMLC_TRACKER_URI: Hostname of the tracker.
  *   - DMLC_TRACKER_PORT: Port number of the tracker.
  *   - DMLC_TASK_ID: ID of the current task, can be used to obtain deterministic rank assignment.
- *   - DMLC_ROLE: Role of the current task, "worker" or "server".
- *   - DMLC_NUM_ATTEMPT: Number of attempts after task failure.
  *   - DMLC_WORKER_CONNECT_RETRY: Number of retries to connect to the tracker.
  * Only applicable to the Federated communicator (use upper case for environment variables, use
  * lower case for runtime configuration):
diff --git a/include/xgboost/collective/socket.h b/include/xgboost/collective/socket.h
index 5bff2204e..f36cdccb2 100644
--- a/include/xgboost/collective/socket.h
+++ b/include/xgboost/collective/socket.h
@@ -215,9 +215,9 @@ class SockAddrV4 {
   static SockAddrV4 Loopback();
   static SockAddrV4 InaddrAny();
 
-  in_port_t Port() const { return ntohs(addr_.sin_port); }
+  [[nodiscard]] in_port_t Port() const { return ntohs(addr_.sin_port); }
 
-  std::string Addr() const {
+  [[nodiscard]] std::string Addr() const {
     char buf[INET_ADDRSTRLEN];
     auto const *s = system::inet_ntop(static_cast<std::int32_t>(SockDomain::kV4), &addr_.sin_addr,
                                       buf, INET_ADDRSTRLEN);
@@ -226,7 +226,7 @@ class SockAddrV4 {
     }
     return {buf};
   }
-  sockaddr_in const &Handle() const { return addr_; }
+  [[nodiscard]] sockaddr_in const &Handle() const { return addr_; }
 };
 
 /**
@@ -243,13 +243,13 @@ class SockAddress {
   explicit SockAddress(SockAddrV6 const &addr) : v6_{addr}, domain_{SockDomain::kV6} {}
   explicit SockAddress(SockAddrV4 const &addr) : v4_{addr} {}
 
-  auto Domain() const { return domain_; }
+  [[nodiscard]] auto Domain() const { return domain_; }
 
-  bool IsV4() const { return Domain() == SockDomain::kV4; }
-  bool IsV6() const { return !IsV4(); }
+  [[nodiscard]] bool IsV4() const { return Domain() == SockDomain::kV4; }
+  [[nodiscard]] bool IsV6() const { return !IsV4(); }
 
-  auto const &V4() const { return v4_; }
-  auto const &V6() const { return v6_; }
+  [[nodiscard]] auto const &V4() const { return v4_; }
+  [[nodiscard]] auto const &V6() const { return v6_; }
 };
 
 /**
@@ -261,6 +261,7 @@ class TCPSocket {
 
  private:
   HandleT handle_{InvalidSocket()};
+  bool non_blocking_{false};
   // There's reliable no way to extract domain from a socket without first binding that
   // socket on macos.
 #if defined(__APPLE__)
@@ -276,7 +277,7 @@ class TCPSocket {
   /**
    * \brief Return the socket domain.
    */
-  auto Domain() const -> SockDomain {
+  [[nodiscard]] auto Domain() const -> SockDomain {
     auto ret_iafamily = [](std::int32_t domain) {
       switch (domain) {
         case AF_INET:
@@ -321,10 +322,10 @@ class TCPSocket {
 #endif  // platforms
   }
 
-  bool IsClosed() const { return handle_ == InvalidSocket(); }
+  [[nodiscard]] bool IsClosed() const { return handle_ == InvalidSocket(); }
 
-  /** \brief get last error code if any */
-  Result GetSockError() const {
+  /** @brief get last error code if any */
+  [[nodiscard]] Result GetSockError() const {
     std::int32_t optval = 0;
     socklen_t len = sizeof(optval);
     auto ret = getsockopt(handle_, SOL_SOCKET, SO_ERROR, reinterpret_cast<char *>(&optval), &len);
@@ -340,7 +341,7 @@ class TCPSocket {
   }
 
   /** \brief check if anything bad happens */
-  bool BadSocket() const {
+  [[nodiscard]] bool BadSocket() const {
     if (IsClosed()) {
       return true;
     }
@@ -352,24 +353,56 @@ class TCPSocket {
     return false;
   }
 
-  void SetNonBlock(bool non_block) {
+  [[nodiscard]] Result NonBlocking(bool non_block) {
 #if defined(_WIN32)
     u_long mode = non_block ? 1 : 0;
-    xgboost_CHECK_SYS_CALL(ioctlsocket(handle_, FIONBIO, &mode), NO_ERROR);
+    if (ioctlsocket(handle_, FIONBIO, &mode) != NO_ERROR) {
+      return system::FailWithCode("Failed to set socket to non-blocking.");
+    }
 #else
     std::int32_t flag = fcntl(handle_, F_GETFL, 0);
-    if (flag == -1) {
-      system::ThrowAtError("fcntl");
+    auto rc = flag;
+    if (rc == -1) {
+      return system::FailWithCode("Failed to get socket flag.");
     }
     if (non_block) {
       flag |= O_NONBLOCK;
     } else {
       flag &= ~O_NONBLOCK;
     }
-    if (fcntl(handle_, F_SETFL, flag) == -1) {
-      system::ThrowAtError("fcntl");
+    rc = fcntl(handle_, F_SETFL, flag);
+    if (rc == -1) {
+      return system::FailWithCode("Failed to set socket to non-blocking.");
     }
 #endif  // _WIN32
+    non_blocking_ = non_block;
+    return Success();
+  }
+  [[nodiscard]] bool NonBlocking() const { return non_blocking_; }
+  [[nodiscard]] Result RecvTimeout(std::chrono::seconds timeout) {
+    timeval tv;
+    tv.tv_sec = timeout.count();
+    tv.tv_usec = 0;
+    auto rc = setsockopt(Handle(), SOL_SOCKET, SO_RCVTIMEO, reinterpret_cast<char const *>(&tv),
+                         sizeof(tv));
+    if (rc != 0) {
+      return system::FailWithCode("Failed to set timeout on recv.");
+    }
+    return Success();
+  }
+
+  [[nodiscard]] Result SetBufSize(std::int32_t n_bytes) {
+    auto rc = setsockopt(this->Handle(), SOL_SOCKET, SO_SNDBUF, reinterpret_cast<char *>(&n_bytes),
+                         sizeof(n_bytes));
+    if (rc != 0) {
+      return system::FailWithCode("Failed to set send buffer size.");
+    }
+    rc = setsockopt(this->Handle(), SOL_SOCKET, SO_RCVBUF, reinterpret_cast<char *>(&n_bytes),
+                    sizeof(n_bytes));
+    if (rc != 0) {
+      return system::FailWithCode("Failed to set recv buffer size.");
+    }
+    return Success();
   }
 
   void SetKeepAlive() {
@@ -391,7 +424,7 @@ class TCPSocket {
    * \brief Accept new connection, returns a new TCP socket for the new connection.
    */
   TCPSocket Accept() {
-    HandleT newfd = accept(handle_, nullptr, nullptr);
+    HandleT newfd = accept(Handle(), nullptr, nullptr);
     if (newfd == InvalidSocket()) {
       system::ThrowAtError("accept");
     }
@@ -399,6 +432,18 @@ class TCPSocket {
     return newsock;
   }
 
+  [[nodiscard]] Result Accept(TCPSocket *out, SockAddrV4 *addr) {
+    struct sockaddr_in caddr;
+    socklen_t caddr_len = sizeof(caddr);
+    HandleT newfd = accept(Handle(), reinterpret_cast<sockaddr *>(&caddr), &caddr_len);
+    if (newfd == InvalidSocket()) {
+      return system::FailWithCode("Failed to accept.");
+    }
+    *addr = SockAddrV4{caddr};
+    *out = TCPSocket{newfd};
+    return Success();
+  }
+
   ~TCPSocket() {
     if (!IsClosed()) {
       Close();
@@ -413,9 +458,9 @@ class TCPSocket {
     return *this;
   }
   /**
-   * \brief Return the native socket file descriptor.
+   * @brief Return the native socket file descriptor.
    */
-  HandleT const &Handle() const { return handle_; }
+  [[nodiscard]] HandleT const &Handle() const { return handle_; }
   /**
    * \brief Listen to incoming requests. Should be called after bind.
    */
@@ -448,6 +493,49 @@ class TCPSocket {
       return ntohs(res_addr.sin_port);
     }
   }
+
+  [[nodiscard]] auto Port() const {
+    if (this->Domain() == SockDomain::kV4) {
+      sockaddr_in res_addr;
+      socklen_t addrlen = sizeof(res_addr);
+      auto code = getsockname(handle_, reinterpret_cast<sockaddr *>(&res_addr), &addrlen);
+      if (code != 0) {
+        return std::make_pair(system::FailWithCode("getsockname"), std::int32_t{0});
+      }
+      return std::make_pair(Success(), std::int32_t{ntohs(res_addr.sin_port)});
+    } else {
+      sockaddr_in6 res_addr;
+      socklen_t addrlen = sizeof(res_addr);
+      auto code = getsockname(handle_, reinterpret_cast<sockaddr *>(&res_addr), &addrlen);
+      if (code != 0) {
+        return std::make_pair(system::FailWithCode("getsockname"), std::int32_t{0});
+      }
+      return std::make_pair(Success(), std::int32_t{ntohs(res_addr.sin6_port)});
+    }
+  }
+
+  [[nodiscard]] Result Bind(StringView ip, std::int32_t *port) {
+    // bind socket handle_ to ip
+    auto addr = MakeSockAddress(ip, 0);
+    std::int32_t errc{0};
+    if (addr.IsV4()) {
+      auto handle = reinterpret_cast<sockaddr const *>(&addr.V4().Handle());
+      errc = bind(handle_, handle, sizeof(std::remove_reference_t<decltype(addr.V4().Handle())>));
+    } else {
+      auto handle = reinterpret_cast<sockaddr const *>(&addr.V6().Handle());
+      errc = bind(handle_, handle, sizeof(std::remove_reference_t<decltype(addr.V6().Handle())>));
+    }
+    if (errc != 0) {
+      return system::FailWithCode("Failed to bind socket.");
+    }
+    auto [rc, new_port] = this->Port();
+    if (!rc.OK()) {
+      return std::move(rc);
+    }
+    *port = new_port;
+    return Success();
+  }
+
   /**
    * \brief Send data, without error then all data should be sent.
    */
@@ -567,13 +655,9 @@ class TCPSocket {
                              xgboost::collective::TCPSocket *out_conn);
 
 /**
- * \brief Get the local host name.
+ * @brief Get the local host name.
  */
-inline std::string GetHostName() {
-  char buf[HOST_NAME_MAX];
-  xgboost_CHECK_SYS_CALL(gethostname(&buf[0], HOST_NAME_MAX), 0);
-  return buf;
-}
+[[nodiscard]] Result GetHostName(std::string *p_out);
 }  // namespace collective
 }  // namespace xgboost
 
diff --git a/include/xgboost/json.h b/include/xgboost/json.h
index cb22e120e..b099d1c47 100644
--- a/include/xgboost/json.h
+++ b/include/xgboost/json.h
@@ -372,6 +372,19 @@ class Json {
   /*! \brief Use your own JsonWriter. */
   static void Dump(Json json, JsonWriter* writer);
 
+  template <typename Container = std::string>
+  static Container Dump(Json json) {
+    if constexpr (std::is_same_v<Container, std::string>) {
+      std::string str;
+      Dump(json, &str);
+      return str;
+    } else {
+      std::vector<char> str;
+      Dump(json, &str);
+      return str;
+    }
+  }
+
   Json() = default;
 
   // number
diff --git a/include/xgboost/string_view.h b/include/xgboost/string_view.h
index 8b5bff7f6..ba0d9f368 100644
--- a/include/xgboost/string_view.h
+++ b/include/xgboost/string_view.h
@@ -29,7 +29,7 @@ struct StringView {
  public:
   constexpr StringView() = default;
   constexpr StringView(CharT const* str, std::size_t size) : str_{str}, size_{size} {}
-  explicit StringView(std::string const& str) : str_{str.c_str()}, size_{str.size()} {}
+  StringView(std::string const& str) : str_{str.c_str()}, size_{str.size()} {}  // NOLINT
   constexpr StringView(CharT const* str)  // NOLINT
       : str_{str}, size_{str == nullptr ? 0ul : Traits::length(str)} {}
 
diff --git a/plugin/federated/federated_client.h b/plugin/federated/federated_client.h
index 2b4637339..d104cb231 100644
--- a/plugin/federated/federated_client.h
+++ b/plugin/federated/federated_client.h
@@ -11,9 +11,7 @@
 #include <limits>
 #include <string>
 
-namespace xgboost {
-namespace federated {
-
+namespace xgboost::federated {
 /**
  * @brief A wrapper around the gRPC client.
  */
@@ -112,6 +110,4 @@ class FederatedClient {
   int const rank_;
   uint64_t sequence_number_{};
 };
-
-}  // namespace federated
-}  // namespace xgboost
+}  // namespace xgboost::federated
diff --git a/plugin/federated/federated_communicator.h b/plugin/federated/federated_communicator.h
index 7acd8a829..996b433cb 100644
--- a/plugin/federated/federated_communicator.h
+++ b/plugin/federated/federated_communicator.h
@@ -9,9 +9,7 @@
 #include "../../src/common/io.h"
 #include "federated_client.h"
 
-namespace xgboost {
-namespace collective {
-
+namespace xgboost::collective {
 /**
  * @brief A Federated Learning communicator class that handles collective communication.
  */
@@ -118,13 +116,13 @@ class FederatedCommunicator : public Communicator {
    * \brief Get if the communicator is distributed.
    * \return True.
    */
-  bool IsDistributed() const override { return true; }
+  [[nodiscard]] bool IsDistributed() const override { return true; }
 
   /**
    * \brief Get if the communicator is federated.
    * \return True.
    */
-  bool IsFederated() const override { return true; }
+  [[nodiscard]] bool IsFederated() const override { return true; }
 
   /**
    * \brief Perform in-place allgather.
@@ -189,5 +187,4 @@ class FederatedCommunicator : public Communicator {
  private:
   std::unique_ptr<xgboost::federated::FederatedClient> client_{};
 };
-}  // namespace collective
-}  // namespace xgboost
+}  // namespace xgboost::collective
diff --git a/plugin/federated/federated_server.cc b/plugin/federated/federated_server.cc
index c50bf1f35..ae42f6d28 100644
--- a/plugin/federated/federated_server.cc
+++ b/plugin/federated/federated_server.cc
@@ -11,9 +11,7 @@
 
 #include "../../src/common/io.h"
 
-namespace xgboost {
-namespace federated {
-
+namespace xgboost::federated {
 grpc::Status FederatedService::Allgather(grpc::ServerContext*, AllgatherRequest const* request,
                                          AllgatherReply* reply) {
   handler_.Allgather(request->send_buffer().data(), request->send_buffer().size(),
@@ -75,6 +73,4 @@ void RunInsecureServer(int port, int world_size) {
 
   server->Wait();
 }
-
-}  // namespace federated
-}  // namespace xgboost
+}  // namespace xgboost::federated
diff --git a/rabit/src/allreduce_base.cc b/rabit/src/allreduce_base.cc
index bd48d3599..6480adf03 100644
--- a/rabit/src/allreduce_base.cc
+++ b/rabit/src/allreduce_base.cc
@@ -115,9 +115,12 @@ bool AllreduceBase::Init(int argc, char* argv[]) {
   // start socket
   xgboost::system::SocketStartup();
   utils::Assert(all_links.size() == 0, "can only call Init once");
-  this->host_uri = xgboost::collective::GetHostName();
+  auto rc = xgboost::collective::GetHostName(&this->host_uri);
+  if (!rc.OK()) {
+    LOG(FATAL) << rc.Report();
+  }
   // get information from tracker
-  auto rc = this->ReConnectLinks();
+  rc = this->ReConnectLinks();
   if (rc.OK()) {
     return true;
   }
@@ -406,13 +409,14 @@ void AllreduceBase::SetParam(const char *name, const char *val) {
       if (!match) all_links.emplace_back(std::move(r));
     }
     sock_listen.Close();
+
     this->parent_index = -1;
     // setup tree links and ring structure
     tree_links.plinks.clear();
     for (auto &all_link : all_links) {
       utils::Assert(!all_link.sock.BadSocket(), "ReConnectLink: bad socket");
       // set the socket to non-blocking mode, enable TCP keepalive
-      all_link.sock.SetNonBlock(true);
+      CHECK(all_link.sock.NonBlocking(true).OK());
       all_link.sock.SetKeepAlive();
       if (rabit_enable_tcp_no_delay) {
         all_link.sock.SetNoDelay();
diff --git a/src/collective/communicator.cc b/src/collective/communicator.cc
index e4c491c2b..6ac9ff58e 100644
--- a/src/collective/communicator.cc
+++ b/src/collective/communicator.cc
@@ -11,9 +11,7 @@
 #include "../../plugin/federated/federated_communicator.h"
 #endif
 
-namespace xgboost {
-namespace collective {
-
+namespace xgboost::collective {
 thread_local std::unique_ptr<Communicator> Communicator::communicator_{new NoOpCommunicator()};
 thread_local CommunicatorType Communicator::type_{};
 
@@ -57,6 +55,4 @@ void Communicator::Finalize() {
   communicator_.reset(new NoOpCommunicator());
 }
 #endif
-
-}  // namespace collective
-}  // namespace xgboost
+}  // namespace xgboost::collective
diff --git a/src/collective/socket.cc b/src/collective/socket.cc
index 78dc3d79b..8ca936ff3 100644
--- a/src/collective/socket.cc
+++ b/src/collective/socket.cc
@@ -3,6 +3,7 @@
  */
 #include "xgboost/collective/socket.h"
 
+#include <array>         // for array
 #include <cstddef>       // std::size_t
 #include <cstdint>       // std::int32_t
 #include <cstring>       // std::memcpy, std::memset
@@ -92,13 +93,18 @@ std::size_t TCPSocket::Recv(std::string *p_str) {
 
   conn = TCPSocket::Create(addr.Domain());
   CHECK_EQ(static_cast<std::int32_t>(conn.Domain()), static_cast<std::int32_t>(addr.Domain()));
-  conn.SetNonBlock(true);
+  auto non_blocking = conn.NonBlocking();
+  auto rc = conn.NonBlocking(true);
+  if (!rc.OK()) {
+    return Fail("Failed to set socket option.", std::move(rc));
+  }
 
   Result last_error;
-  auto log_failure = [&host, &last_error](Result err, char const *file, std::int32_t line) {
+  auto log_failure = [&host, &last_error, port](Result err, char const *file, std::int32_t line) {
     last_error = std::move(err);
     LOG(WARNING) << std::filesystem::path{file}.filename().string() << "(" << line
-                 << "): Failed to connect to:" << host << " Error:" << last_error.Report();
+                 << "): Failed to connect to:" << host << ":" << port
+                 << " Error:" << last_error.Report();
   };
 
   for (std::int32_t attempt = 0; attempt < std::max(retry, 1); ++attempt) {
@@ -138,12 +144,9 @@ std::size_t TCPSocket::Recv(std::string *p_str) {
         continue;
       }
 
-      conn.SetNonBlock(false);
-      return Success();
-
+      return conn.NonBlocking(non_blocking);
     } else {
-      conn.SetNonBlock(false);
-      return Success();
+      return conn.NonBlocking(non_blocking);
     }
   }
 
@@ -152,4 +155,13 @@ std::size_t TCPSocket::Recv(std::string *p_str) {
   conn.Close();
   return Fail(ss.str(), std::move(last_error));
 }
+
+[[nodiscard]] Result GetHostName(std::string *p_out) {  // stores the local host name in *p_out
+  std::array<char, HOST_NAME_MAX + 1> buf{};  // zeroed; extra byte keeps the name terminated
+  if (gethostname(buf.data(), HOST_NAME_MAX) != 0) {
+    return system::FailWithCode("Failed to get host name.");
+  }
+  *p_out = buf.data();  // buf[HOST_NAME_MAX] is never written, so the copy stops at a NUL
+  return Success();
+}
 }  // namespace xgboost::collective
diff --git a/tests/cpp/collective/test_socket.cc b/tests/cpp/collective/test_socket.cc
index ddc73d1f2..07a7f52d0 100644
--- a/tests/cpp/collective/test_socket.cc
+++ b/tests/cpp/collective/test_socket.cc
@@ -73,4 +73,15 @@ TEST(Socket, Basic) {
 
   system::SocketFinalize();
 }
+
+TEST(Socket, Bind) {
+  system::SocketStartup();
+  auto any = SockAddrV4::InaddrAny().Addr();
+  auto sock = TCPSocket::Create(SockDomain::kV4);
+  std::int32_t port{0};
+  auto rc = sock.Bind(any, &port);
+  ASSERT_TRUE(rc.OK());
+  ASSERT_NE(port, 0);
+  system::SocketFinalize();
+}
 }  // namespace xgboost::collective
diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc
index 4d498ffd5..1d1319274 100644
--- a/tests/cpp/common/test_json.cc
+++ b/tests/cpp/common/test_json.cc
@@ -4,6 +4,7 @@
 #include <gtest/gtest.h>
 
 #include <fstream>
+#include <iterator>  // for back_inserter
 #include <map>
 
 #include "../../../src/common/charconv.h"
@@ -691,4 +692,16 @@ TEST(Json, TypeCheck) {
     ASSERT_NE(err.find("foo"), std::string::npos);
   }
 }
+
+TEST(Json, Dump) {
+  auto str = GetModelStr();
+  auto jobj = Json::Load(str);
+  std::string result_s = Json::Dump(jobj);
+
+  std::vector<char> result_v = Json::Dump<std::vector<char>>(jobj);
+  ASSERT_EQ(result_s.size(), result_v.size());
+  for (std::size_t i = 0; i < result_s.size(); ++i) {
+    ASSERT_EQ(result_s[i], result_v[i]);
+  }
+}
 }  // namespace xgboost

From 300f9ace06b9d3c66c3376a5188894f82d127e46 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 14 Sep 2023 04:05:47 +0800
Subject: [PATCH 09/59] Fix default metric configuration. (#9575)

---
 src/learner.cc                        |  4 +-
 src/objective/regression_obj.cu       |  7 ++++
 tests/cpp/objective/test_objective.cc | 58 +++++++++++++++++++++++++++
 tests/cpp/objective_helpers.cc        | 31 ++++++++++++++
 tests/cpp/objective_helpers.h         |  4 ++
 tests/cpp/test_learner.cc             | 26 +-----------
 6 files changed, 105 insertions(+), 25 deletions(-)
 create mode 100644 tests/cpp/objective_helpers.cc

diff --git a/src/learner.cc b/src/learner.cc
index 79dca44bd..5697ee627 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -1319,7 +1319,9 @@ class LearnerImpl : public LearnerIO {
     if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) {
       metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_));
       auto config = obj_->DefaultMetricConfig();
-      metrics_.back()->LoadConfig(config);
+      if (!IsA<Null>(config)) {
+        metrics_.back()->LoadConfig(config);
+      }
       metrics_.back()->Configure({cfg_.begin(), cfg_.end()});
     }
 
diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu
index 4f099a537..35610beb7 100644
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -287,6 +287,13 @@ class PseudoHuberRegression : public FitIntercept {
     }
     FromJson(in["pseudo_huber_param"], &param_);
   }
+  [[nodiscard]] Json DefaultMetricConfig() const override {
+    CHECK(param_.GetInitialised());
+    Json config{Object{}};
+    config["name"] = String{this->DefaultEvalMetric()};
+    config["pseudo_huber_param"] = ToJson(param_);
+    return config;
+  }
 };
 
 XGBOOST_REGISTER_OBJECTIVE(PseudoHuberRegression, "reg:pseudohubererror")
diff --git a/tests/cpp/objective/test_objective.cc b/tests/cpp/objective/test_objective.cc
index 718f8f659..21ffc7caf 100644
--- a/tests/cpp/objective/test_objective.cc
+++ b/tests/cpp/objective/test_objective.cc
@@ -6,6 +6,7 @@
 #include <xgboost/objective.h>
 
 #include "../helpers.h"
+#include "../objective_helpers.h"
 
 TEST(Objective, UnknownFunction) {
   xgboost::ObjFunction* obj = nullptr;
@@ -43,4 +44,61 @@ TEST(Objective, PredTransform) {
     ASSERT_TRUE(predts.HostCanWrite());
   }
 }
+
+class TestDefaultObjConfig : public ::testing::TestWithParam<std::string> {
+  Context ctx_;
+
+ public:
+  void Run(std::string objective) {
+    auto Xy = MakeFmatForObjTest(objective);
+    std::unique_ptr<Learner> learner{Learner::Create({Xy})};
+    std::unique_ptr<ObjFunction> objfn{ObjFunction::Create(objective, &ctx_)};
+
+    learner->SetParam("objective", objective);
+    if (objective.find("multi") != std::string::npos) {
+      learner->SetParam("num_class", "3");
+      objfn->Configure(Args{{"num_class", "3"}});
+    } else if (objective.find("quantile") != std::string::npos) {
+      learner->SetParam("quantile_alpha", "0.5");
+      objfn->Configure(Args{{"quantile_alpha", "0.5"}});
+    } else {
+      objfn->Configure(Args{});
+    }
+    learner->Configure();
+    learner->UpdateOneIter(0, Xy);
+    learner->EvalOneIter(0, {Xy}, {"train"});
+    Json config{Object{}};
+    learner->SaveConfig(&config);
+    auto jobj = get<Object const>(config["learner"]["objective"]);
+
+    ASSERT_TRUE(jobj.find("name") != jobj.cend());
+    // FIXME(jiamingy): We should have the following check, but some legacy parameter like
+    // "pos_weight", "delta_step" in objectives are not in metrics.
+
+    // if (jobj.size() > 1) {
+    //   ASSERT_FALSE(IsA<Null>(objfn->DefaultMetricConfig()));
+    // }
+    auto mconfig = objfn->DefaultMetricConfig();
+    if (!IsA<Null>(mconfig)) {
+      // make sure metric can handle it
+      std::unique_ptr<Metric> metricfn{Metric::Create(get<String const>(mconfig["name"]), &ctx_)};
+      metricfn->LoadConfig(mconfig);
+      Json loaded(Object{});
+      metricfn->SaveConfig(&loaded);
+      metricfn->Configure(Args{});
+      ASSERT_EQ(mconfig, loaded);
+    }
+  }
+};
+
+TEST_P(TestDefaultObjConfig, Objective) {
+  std::string objective = GetParam();
+  this->Run(objective);
+}
+
+INSTANTIATE_TEST_SUITE_P(Objective, TestDefaultObjConfig,
+                         ::testing::ValuesIn(MakeObjNamesForTest()),
+                         [](const ::testing::TestParamInfo<TestDefaultObjConfig::ParamType>& info) {
+                           return ObjTestNameGenerator(info);
+                         });
 } // namespace xgboost
diff --git a/tests/cpp/objective_helpers.cc b/tests/cpp/objective_helpers.cc
new file mode 100644
index 000000000..ed80f71d5
--- /dev/null
+++ b/tests/cpp/objective_helpers.cc
@@ -0,0 +1,31 @@
+/**
+ * Copyright (c) 2023, XGBoost contributors
+ */
+#include "objective_helpers.h"
+
+#include "../../src/common/linalg_op.h"  // for begin, end
+#include "helpers.h"                     // for RandomDataGenerator
+
+namespace xgboost {
+/**
+ * @brief Create the small random DMatrix (kRows = kCols = 10) used by objective-related tests.
+ */
+std::shared_ptr<DMatrix> MakeFmatForObjTest(std::string const& obj) {
+  auto constexpr kRows = 10, kCols = 10;
+  auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
+  auto& info = p_fmat->Info();
+  // Give every row the same label bounds: lower = 1, upper = 10.
+  info.labels_lower_bound_.HostVector().assign(kRows, 1.0f);
+  info.labels_upper_bound_.HostVector().assign(kRows, 10.0f);
+  // Objectives whose name contains "rank:" get alternating 1, 0, 1, ... labels.
+  if (obj.find("rank:") != std::string::npos) {
+    auto h_label = info.labels.HostView();
+    bool is_even = true;
+    for (auto& v : h_label) {
+      v = is_even;
+      is_even = !is_even;
+    }
+  }
+  return p_fmat;
+}
+}  // namespace xgboost
diff --git a/tests/cpp/objective_helpers.h b/tests/cpp/objective_helpers.h
index b26470746..7f394ef8d 100644
--- a/tests/cpp/objective_helpers.h
+++ b/tests/cpp/objective_helpers.h
@@ -1,6 +1,8 @@
 /**
  * Copyright (c) 2023, XGBoost contributors
  */
+#pragma once
+
 #include <dmlc/registry.h>  // for Registry
 #include <gtest/gtest.h>
 #include <xgboost/objective.h>  // for ObjFunctionReg
@@ -29,4 +31,6 @@ inline std::string ObjTestNameGenerator(const ::testing::TestParamInfo<ParamType
   }
   return name;
 };
+
+std::shared_ptr<DMatrix> MakeFmatForObjTest(std::string const& obj);
 }  // namespace xgboost
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 5a31ce1bd..0ca7ee0eb 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -655,33 +655,11 @@ TEST_F(InitBaseScore, InitWithPredict) { this->TestInitWithPredt(); }
 TEST_F(InitBaseScore, UpdateProcess) { this->TestUpdateProcess(); }
 
 class TestColumnSplit : public ::testing::TestWithParam<std::string> {
-  static auto MakeFmat(std::string const& obj) {
-    auto constexpr kRows = 10, kCols = 10;
-    auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
-    auto& h_upper = p_fmat->Info().labels_upper_bound_.HostVector();
-    auto& h_lower = p_fmat->Info().labels_lower_bound_.HostVector();
-    h_lower.resize(kRows);
-    h_upper.resize(kRows);
-    for (size_t i = 0; i < kRows; ++i) {
-      h_lower[i] = 1;
-      h_upper[i] = 10;
-    }
-    if (obj.find("rank:") != std::string::npos) {
-      auto h_label = p_fmat->Info().labels.HostView();
-      std::size_t k = 0;
-      for (auto& v : h_label) {
-        v = k % 2 == 0;
-        ++k;
-      }
-    }
-    return p_fmat;
-  };
-
   void TestBaseScore(std::string objective, float expected_base_score, Json expected_model) {
     auto const world_size = collective::GetWorldSize();
     auto const rank = collective::GetRank();
 
-    auto p_fmat = MakeFmat(objective);
+    auto p_fmat = MakeFmatForObjTest(objective);
     std::shared_ptr<DMatrix> sliced{p_fmat->SliceCol(world_size, rank)};
     std::unique_ptr<Learner> learner{Learner::Create({sliced})};
     learner->SetParam("tree_method", "approx");
@@ -705,7 +683,7 @@ class TestColumnSplit : public ::testing::TestWithParam<std::string> {
 
  public:
   void Run(std::string objective) {
-    auto p_fmat = MakeFmat(objective);
+    auto p_fmat = MakeFmatForObjTest(objective);
     std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
     learner->SetParam("tree_method", "approx");
     learner->SetParam("objective", objective);

From a343ae3b3452e42617a60fedd448d9a969148560 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Wed, 13 Sep 2023 14:53:46 -0700
Subject: [PATCH 10/59] fix duplicate gpu check (#9578)

---
 demo/nvflare/vertical/custom/trainer.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/demo/nvflare/vertical/custom/trainer.py b/demo/nvflare/vertical/custom/trainer.py
index efe320734..b6c3855ef 100644
--- a/demo/nvflare/vertical/custom/trainer.py
+++ b/demo/nvflare/vertical/custom/trainer.py
@@ -83,9 +83,8 @@ class XGBoostTrainer(Executor):
                 'eval_metric': 'auc',
             }
             if self._use_gpus:
-                if self._use_gpus:
-                    self.log_info(fl_ctx, f'Training with GPU {rank}')
-                    param['device'] = f"cuda:{rank}"
+                self.log_info(fl_ctx, f'Training with GPU {rank}')
+                param['device'] = f"cuda:{rank}"
 
             # specify validations set to watch performance
             watchlist = [(dtest, "eval"), (dtrain, "train")]

From d8c3cc92ae677c320dc15c8090bf3b41e7725039 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Wed, 13 Sep 2023 17:13:13 -0700
Subject: [PATCH 11/59] More support for column split in gpu predictor (#9562)

---
 src/predictor/gpu_predictor.cu            | 108 ++++++++++-----
 tests/cpp/predictor/test_cpu_predictor.cc |  28 ++--
 tests/cpp/predictor/test_gpu_predictor.cu |  27 +++-
 tests/cpp/predictor/test_predictor.cc     | 159 +++++++++++++---------
 tests/cpp/predictor/test_predictor.h      |  14 +-
 5 files changed, 212 insertions(+), 124 deletions(-)

diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index 70a5c02d5..f217ad827 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -633,11 +633,12 @@ __global__ void MaskBitVectorKernel(
     common::Span<std::uint32_t const> d_categories, BitVector decision_bits, BitVector missing_bits,
     std::size_t tree_begin, std::size_t tree_end, std::size_t num_features, std::size_t num_rows,
     std::size_t entry_start, std::size_t num_nodes, bool use_shared, float missing) {
+  // This needs to be always instantiated since the data is loaded cooperatively by all threads.
+  SparsePageLoader loader(data, use_shared, num_features, num_rows, entry_start, missing);
   auto const row_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (row_idx >= num_rows) {
     return;
   }
-  SparsePageLoader loader(data, use_shared, num_features, num_rows, entry_start, missing);
 
   std::size_t tree_offset = 0;
   for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
@@ -668,10 +669,10 @@ __global__ void MaskBitVectorKernel(
   }
 }
 
-__device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree,
-                                          BitVector const& decision_bits,
-                                          BitVector const& missing_bits, std::size_t num_nodes,
-                                          std::size_t tree_offset) {
+__device__ bst_node_t GetLeafIndexByBitVector(bst_row_t ridx, TreeView const& tree,
+                                              BitVector const& decision_bits,
+                                              BitVector const& missing_bits, std::size_t num_nodes,
+                                              std::size_t tree_offset) {
   bst_node_t nidx = 0;
   RegTree::Node n = tree.d_tree[nidx];
   while (!n.IsLeaf()) {
@@ -683,9 +684,19 @@ __device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree,
     }
     n = tree.d_tree[nidx];
   }
+  return nidx;
+}
+
+__device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree,
+                                          BitVector const& decision_bits,
+                                          BitVector const& missing_bits, std::size_t num_nodes,
+                                          std::size_t tree_offset) {
+  auto const nidx =
+      GetLeafIndexByBitVector(ridx, tree, decision_bits, missing_bits, num_nodes, tree_offset);
   return tree.d_tree[nidx].LeafValue();
 }
 
+template <bool predict_leaf>
 __global__ void PredictByBitVectorKernel(
     common::Span<RegTree::Node const> d_nodes, common::Span<float> d_out_predictions,
     common::Span<std::size_t const> d_tree_segments, common::Span<int const> d_tree_group,
@@ -701,27 +712,39 @@ __global__ void PredictByBitVectorKernel(
   }
 
   std::size_t tree_offset = 0;
-  if (num_group == 1) {
-    float sum = 0;
-    for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
+  if constexpr (predict_leaf) {
+    for (size_t tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
       TreeView d_tree{tree_begin,          tree_idx,           d_nodes,
                       d_tree_segments,     d_tree_split_types, d_cat_tree_segments,
                       d_cat_node_segments, d_categories};
-      sum += GetLeafWeightByBitVector(row_idx, d_tree, decision_bits, missing_bits, num_nodes,
-                                      tree_offset);
+      auto const leaf = GetLeafIndexByBitVector(row_idx, d_tree, decision_bits, missing_bits,
+                                                num_nodes, tree_offset);
+      d_out_predictions[row_idx * (tree_end - tree_begin) + tree_idx] = static_cast<float>(leaf);
       tree_offset += d_tree.d_tree.size();
     }
-    d_out_predictions[row_idx] += sum;
   } else {
-    for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
-      auto const tree_group = d_tree_group[tree_idx];
-      TreeView d_tree{tree_begin,          tree_idx,           d_nodes,
-                      d_tree_segments,     d_tree_split_types, d_cat_tree_segments,
-                      d_cat_node_segments, d_categories};
-      bst_uint out_prediction_idx = row_idx * num_group + tree_group;
-      d_out_predictions[out_prediction_idx] += GetLeafWeightByBitVector(
-          row_idx, d_tree, decision_bits, missing_bits, num_nodes, tree_offset);
-      tree_offset += d_tree.d_tree.size();
+    if (num_group == 1) {
+      float sum = 0;
+      for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
+          TreeView d_tree{tree_begin,          tree_idx,           d_nodes,
+                          d_tree_segments,     d_tree_split_types, d_cat_tree_segments,
+                          d_cat_node_segments, d_categories};
+          sum += GetLeafWeightByBitVector(row_idx, d_tree, decision_bits, missing_bits, num_nodes,
+                                          tree_offset);
+          tree_offset += d_tree.d_tree.size();
+      }
+      d_out_predictions[row_idx] += sum;
+    } else {
+      for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
+          auto const tree_group = d_tree_group[tree_idx];
+          TreeView d_tree{tree_begin,          tree_idx,           d_nodes,
+                          d_tree_segments,     d_tree_split_types, d_cat_tree_segments,
+                          d_cat_node_segments, d_categories};
+          bst_uint out_prediction_idx = row_idx * num_group + tree_group;
+          d_out_predictions[out_prediction_idx] += GetLeafWeightByBitVector(
+              row_idx, d_tree, decision_bits, missing_bits, num_nodes, tree_offset);
+          tree_offset += d_tree.d_tree.size();
+      }
     }
   }
 }
@@ -733,13 +756,21 @@ class ColumnSplitHelper {
   void PredictBatch(DMatrix* dmat, HostDeviceVector<float>* out_preds,
                     gbm::GBTreeModel const& model, DeviceModel const& d_model) const {
     CHECK(dmat->PageExists<SparsePage>()) << "Column split for external memory is not support.";
-    PredictDMatrix(dmat, out_preds, d_model, model.learner_model_param->num_feature,
-                   model.learner_model_param->num_output_group);
+    PredictDMatrix<false>(dmat, out_preds, d_model, model.learner_model_param->num_feature,
+                          model.learner_model_param->num_output_group);
+  }
+
+  void PredictLeaf(DMatrix* dmat, HostDeviceVector<float>* out_preds, gbm::GBTreeModel const& model,
+                   DeviceModel const& d_model) const {
+    CHECK(dmat->PageExists<SparsePage>()) << "Column split for external memory is not support.";
+    PredictDMatrix<true>(dmat, out_preds, d_model, model.learner_model_param->num_feature,
+                         model.learner_model_param->num_output_group);
   }
 
  private:
   using BitType = BitVector::value_type;
 
+  template <bool predict_leaf>
   void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
                       bst_feature_t num_features, std::uint32_t num_group) const {
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
@@ -777,7 +808,7 @@ class ColumnSplitHelper {
       AllReduceBitVectors(&decision_storage, &missing_storage);
 
       dh::LaunchKernel {grid, kBlockThreads, 0, ctx_->CUDACtx()->Stream()} (
-          PredictByBitVectorKernel, model.nodes.ConstDeviceSpan(),
+          PredictByBitVectorKernel<predict_leaf>, model.nodes.ConstDeviceSpan(),
           out_preds->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(),
           model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(),
           model.categories_tree_segments.ConstDeviceSpan(),
@@ -795,12 +826,11 @@ class ColumnSplitHelper {
         ctx_->gpu_id, decision_storage->data().get(), decision_storage->size());
     collective::AllReduce<collective::Operation::kBitwiseAND>(
         ctx_->gpu_id, missing_storage->data().get(), missing_storage->size());
-    collective::Synchronize(ctx_->gpu_id);
   }
 
   void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage,
-                               dh::caching_device_vector<BitType>* missing_storage,
-                               std::size_t total_bits) const {
+                        dh::caching_device_vector<BitType>* missing_storage,
+                        std::size_t total_bits) const {
     auto const size = BitVector::ComputeStorageSize(total_bits);
     if (decision_storage->size() < size) {
       decision_storage->resize(size);
@@ -889,7 +919,7 @@ class GPUPredictor : public xgboost::Predictor {
     DeviceModel d_model;
     d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id);
 
-    if (dmat->Info().IsColumnSplit()) {
+    if (info.IsColumnSplit()) {
       column_split_helper_.PredictBatch(dmat, out_preds, model, d_model);
       return;
     }
@@ -1018,6 +1048,9 @@ class GPUPredictor : public xgboost::Predictor {
     if (tree_weights != nullptr) {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
+    CHECK(!p_fmat->Info().IsColumnSplit())
+        << "Predict contribution support for column-wise data split is not yet implemented.";
+
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
     out_contribs->SetDevice(ctx_->gpu_id);
     if (tree_end == 0 || tree_end > model.trees.size()) {
@@ -1136,17 +1169,9 @@ class GPUPredictor : public xgboost::Predictor {
                    const gbm::GBTreeModel &model,
                    unsigned tree_end) const override {
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-    auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
 
     const MetaInfo& info = p_fmat->Info();
-    constexpr uint32_t kBlockThreads = 128;
-    size_t shared_memory_bytes = SharedMemoryBytes<kBlockThreads>(
-        info.num_col_, max_shared_memory_bytes);
-    bool use_shared = shared_memory_bytes != 0;
-    bst_feature_t num_features = info.num_col_;
     bst_row_t num_rows = info.num_row_;
-    size_t entry_start = 0;
-
     if (tree_end == 0 || tree_end > model.trees.size()) {
       tree_end = static_cast<uint32_t>(model.trees.size());
     }
@@ -1155,6 +1180,19 @@ class GPUPredictor : public xgboost::Predictor {
     DeviceModel d_model;
     d_model.Init(model, 0, tree_end, this->ctx_->gpu_id);
 
+    if (info.IsColumnSplit()) {
+      column_split_helper_.PredictLeaf(p_fmat, predictions, model, d_model);
+      return;
+    }
+
+    auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
+    constexpr uint32_t kBlockThreads = 128;
+    size_t shared_memory_bytes = SharedMemoryBytes<kBlockThreads>(
+        info.num_col_, max_shared_memory_bytes);
+    bool use_shared = shared_memory_bytes != 0;
+    bst_feature_t num_features = info.num_col_;
+    size_t entry_start = 0;
+
     if (p_fmat->PageExists<SparsePage>()) {
       for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
         batch.data.SetDevice(ctx_->gpu_id);
diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc
index 5ff0fdeec..c99360d8a 100644
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -127,8 +127,8 @@ TEST(CpuPredictor, IterationRange) {
 }
 
 TEST(CpuPredictor, IterationRangeColmnSplit) {
-  Context ctx;
-  TestIterationRangeColumnSplit(&ctx);
+  auto constexpr kWorldSize = 2;
+  TestIterationRangeColumnSplit(kWorldSize, false);
 }
 
 TEST(CpuPredictor, ExternalMemory) {
@@ -226,23 +226,21 @@ TEST(CPUPredictor, GHistIndexTraining) {
 }
 
 TEST(CPUPredictor, CategoricalPrediction) {
-  Context ctx;
-  TestCategoricalPrediction(&ctx, false);
+  TestCategoricalPrediction(false, false);
 }
 
 TEST(CPUPredictor, CategoricalPredictionColumnSplit) {
-  Context ctx;
-  TestCategoricalPredictionColumnSplit(&ctx);
+  auto constexpr kWorldSize = 2;
+  RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPrediction, false, true);
 }
 
 TEST(CPUPredictor, CategoricalPredictLeaf) {
-  Context ctx;
-  TestCategoricalPredictLeaf(&ctx, false);
+  TestCategoricalPredictLeaf(false, false);
 }
 
 TEST(CPUPredictor, CategoricalPredictLeafColumnSplit) {
-  Context ctx;
-  TestCategoricalPredictLeafColumnSplit(&ctx);
+  auto constexpr kWorldSize = 2;
+  RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPredictLeaf, false, true);
 }
 
 TEST(CpuPredictor, UpdatePredictionCache) {
@@ -256,8 +254,8 @@ TEST(CpuPredictor, LesserFeatures) {
 }
 
 TEST(CpuPredictor, LesserFeaturesColumnSplit) {
-  Context ctx;
-  TestPredictionWithLesserFeaturesColumnSplit(&ctx);
+  auto constexpr kWorldSize = 2;
+  RunWithInMemoryCommunicator(kWorldSize, TestPredictionWithLesserFeaturesColumnSplit, false);
 }
 
 TEST(CpuPredictor, Sparse) {
@@ -267,9 +265,9 @@ TEST(CpuPredictor, Sparse) {
 }
 
 TEST(CpuPredictor, SparseColumnSplit) {
-  Context ctx;
-  TestSparsePredictionColumnSplit(&ctx, 0.2);
-  TestSparsePredictionColumnSplit(&ctx, 0.8);
+  auto constexpr kWorldSize = 2;
+  TestSparsePredictionColumnSplit(kWorldSize, false, 0.2);
+  TestSparsePredictionColumnSplit(kWorldSize, false, 0.8);
 }
 
 TEST(CpuPredictor, Multi) {
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index f31158482..beeab70fa 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -206,6 +206,10 @@ TEST(GpuPredictor, LesserFeatures) {
   TestPredictionWithLesserFeatures(&ctx);
 }
 
+TEST_F(MGPUPredictorTest, LesserFeaturesColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, TestPredictionWithLesserFeaturesColumnSplit, true);
+}
+
 // Very basic test of empty model
 TEST(GPUPredictor, ShapStump) {
   cudaSetDevice(0);
@@ -270,14 +274,24 @@ TEST(GPUPredictor, IterationRange) {
   TestIterationRange(&ctx);
 }
 
+TEST_F(MGPUPredictorTest, IterationRangeColumnSplit) {
+  TestIterationRangeColumnSplit(world_size_, true);
+}
+
 TEST(GPUPredictor, CategoricalPrediction) {
-  auto ctx = MakeCUDACtx(0);
-  TestCategoricalPrediction(&ctx, false);
+  TestCategoricalPrediction(true, false);
+}
+
+TEST_F(MGPUPredictorTest, CategoricalPredictionColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, TestCategoricalPrediction, true, true);
 }
 
 TEST(GPUPredictor, CategoricalPredictLeaf) {
-  auto ctx = MakeCUDACtx(0);
-  TestCategoricalPredictLeaf(&ctx, false);
+  TestCategoricalPredictLeaf(true, false);
+}
+
+TEST_F(MGPUPredictorTest, CategoricalPredictionLeafColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, TestCategoricalPredictLeaf, true, true);
 }
 
 TEST(GPUPredictor, PredictLeafBasic) {
@@ -305,4 +319,9 @@ TEST(GPUPredictor, Sparse) {
   TestSparsePrediction(&ctx, 0.2);
   TestSparsePrediction(&ctx, 0.8);
 }
+
+TEST_F(MGPUPredictorTest, SparseColumnSplit) {
+  TestSparsePredictionColumnSplit(world_size_, true, 0.2);
+  TestSparsePredictionColumnSplit(world_size_, true, 0.8);
+}
 }  // namespace xgboost::predictor
diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc
index a9f218c0c..873c1672e 100644
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -172,16 +172,6 @@ void VerifyPredictionWithLesserFeatures(Learner *learner, bst_row_t kRows,
   ASSERT_THROW({ learner->Predict(m_invalid, false, &prediction, 0, 0); }, dmlc::Error);
 }
 
-void VerifyPredictionWithLesserFeaturesColumnSplit(Learner *learner, size_t rows,
-                                                   std::shared_ptr<DMatrix> m_test,
-                                                   std::shared_ptr<DMatrix> m_invalid) {
-  auto const world_size = collective::GetWorldSize();
-  auto const rank = collective::GetRank();
-  std::shared_ptr<DMatrix> sliced_test{m_test->SliceCol(world_size, rank)};
-  std::shared_ptr<DMatrix> sliced_invalid{m_invalid->SliceCol(world_size, rank)};
-
-  VerifyPredictionWithLesserFeatures(learner, rows, sliced_test, sliced_invalid);
-}
 }  // anonymous namespace
 
 void TestPredictionWithLesserFeatures(Context const *ctx) {
@@ -229,16 +219,24 @@ void TestPredictionDeviceAccess() {
 #endif  // defined(XGBOOST_USE_CUDA)
 }
 
-void TestPredictionWithLesserFeaturesColumnSplit(Context const *ctx) {
-  size_t constexpr kRows = 256, kTrainCols = 256, kTestCols = 4, kIters = 4;
-  auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).GenerateDMatrix(true);
-  auto learner = LearnerForTest(ctx, m_train, kIters);
+void TestPredictionWithLesserFeaturesColumnSplit(bool use_gpu) {
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+
+  std::size_t constexpr kRows = 256, kTrainCols = 256, kTestCols = 4, kIters = 4;
+  auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).Seed(rank).GenerateDMatrix(true);
+  Context ctx;
+  if (use_gpu) {
+    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
+  }
+  auto learner = LearnerForTest(&ctx, m_train, kIters);
   auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false);
   auto m_invalid = RandomDataGenerator(kRows, kTrainCols + 1, 0.5).GenerateDMatrix(false);
 
-  auto constexpr kWorldSize = 2;
-  RunWithInMemoryCommunicator(kWorldSize, VerifyPredictionWithLesserFeaturesColumnSplit,
-                              learner.get(), kRows, m_test, m_invalid);
+  std::shared_ptr<DMatrix> sliced_test{m_test->SliceCol(world_size, rank)};
+  std::shared_ptr<DMatrix> sliced_invalid{m_invalid->SliceCol(world_size, rank)};
+
+  VerifyPredictionWithLesserFeatures(learner.get(), kRows, sliced_test, sliced_invalid);
 }
 
 void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind,
@@ -260,7 +258,11 @@ void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind,
   model->CommitModelGroup(std::move(trees), 0);
 }
 
-void TestCategoricalPrediction(Context const* ctx, bool is_column_split) {
+void TestCategoricalPrediction(bool use_gpu, bool is_column_split) {
+  Context ctx;
+  if (use_gpu) {
+    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+  }
   size_t constexpr kCols = 10;
   PredictionCacheEntry out_predictions;
 
@@ -270,10 +272,10 @@ void TestCategoricalPrediction(Context const* ctx, bool is_column_split) {
   float left_weight = 1.3f;
   float right_weight = 1.7f;
 
-  gbm::GBTreeModel model(&mparam, ctx);
+  gbm::GBTreeModel model(&mparam, &ctx);
   GBTreeModelForTest(&model, split_ind, split_cat, left_weight, right_weight);
 
-  std::unique_ptr<Predictor> predictor{CreatePredictorForTest(ctx)};
+  std::unique_ptr<Predictor> predictor{CreatePredictorForTest(&ctx)};
 
   std::vector<float> row(kCols);
   row[split_ind] = split_cat;
@@ -303,12 +305,11 @@ void TestCategoricalPrediction(Context const* ctx, bool is_column_split) {
   ASSERT_EQ(out_predictions.predictions.HostVector()[0], left_weight + score);
 }
 
-void TestCategoricalPredictionColumnSplit(Context const *ctx) {
-  auto constexpr kWorldSize = 2;
-  RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPrediction, ctx, true);
-}
-
-void TestCategoricalPredictLeaf(Context const *ctx, bool is_column_split) {
+void TestCategoricalPredictLeaf(bool use_gpu, bool is_column_split) {
+  Context ctx;
+  if (use_gpu) {
+    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+  }
   size_t constexpr kCols = 10;
   PredictionCacheEntry out_predictions;
 
@@ -319,10 +320,10 @@ void TestCategoricalPredictLeaf(Context const *ctx, bool is_column_split) {
   float left_weight = 1.3f;
   float right_weight = 1.7f;
 
-  gbm::GBTreeModel model(&mparam, ctx);
+  gbm::GBTreeModel model(&mparam, &ctx);
   GBTreeModelForTest(&model, split_ind, split_cat, left_weight, right_weight);
 
-  std::unique_ptr<Predictor> predictor{CreatePredictorForTest(ctx)};
+  std::unique_ptr<Predictor> predictor{CreatePredictorForTest(&ctx)};
 
   std::vector<float> row(kCols);
   row[split_ind] = split_cat;
@@ -347,11 +348,6 @@ void TestCategoricalPredictLeaf(Context const *ctx, bool is_column_split) {
   ASSERT_EQ(out_predictions.predictions.HostVector()[0], 1);
 }
 
-void TestCategoricalPredictLeafColumnSplit(Context const *ctx) {
-  auto constexpr kWorldSize = 2;
-  RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPredictLeaf, ctx, true);
-}
-
 void TestIterationRange(Context const* ctx) {
   size_t constexpr kRows = 1000, kCols = 20, kClasses = 4, kForest = 3, kIters = 10;
   auto dmat = RandomDataGenerator(kRows, kCols, 0)
@@ -411,15 +407,30 @@ void TestIterationRange(Context const* ctx) {
 }
 
 namespace {
-void VerifyIterationRangeColumnSplit(DMatrix *dmat, Learner *learner, Learner *sliced,
+void VerifyIterationRangeColumnSplit(bool use_gpu, Json const &ranged_model,
+                                     Json const &sliced_model, std::size_t rows, std::size_t cols,
+                                     std::size_t classes,
                                      std::vector<float> const &expected_margin_ranged,
                                      std::vector<float> const &expected_margin_sliced,
                                      std::vector<float> const &expected_leaf_ranged,
                                      std::vector<float> const &expected_leaf_sliced) {
   auto const world_size = collective::GetWorldSize();
   auto const rank = collective::GetRank();
+  Context ctx;
+  if (use_gpu) {
+    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
+  }
+  auto dmat = RandomDataGenerator(rows, cols, 0).GenerateDMatrix(true, true, classes);
   std::shared_ptr<DMatrix> Xy{dmat->SliceCol(world_size, rank)};
 
+  std::unique_ptr<Learner> learner{Learner::Create({Xy})};
+  learner->SetParam("device", ctx.DeviceName());
+  learner->LoadModel(ranged_model);
+
+  std::unique_ptr<Learner> sliced{Learner::Create({Xy})};
+  sliced->SetParam("device", ctx.DeviceName());
+  sliced->LoadModel(sliced_model);
+
   HostDeviceVector<float> out_predt_sliced;
   HostDeviceVector<float> out_predt_ranged;
 
@@ -428,11 +439,15 @@ void VerifyIterationRangeColumnSplit(DMatrix *dmat, Learner *learner, Learner *s
     sliced->Predict(Xy, true, &out_predt_sliced, 0, 0, false, false, false, false, false);
     learner->Predict(Xy, true, &out_predt_ranged, 0, 3, false, false, false, false, false);
     auto const &h_sliced = out_predt_sliced.HostVector();
-    auto const &h_range = out_predt_ranged.HostVector();
-    ASSERT_EQ(h_sliced.size(), expected_margin_sliced.size());
-    ASSERT_EQ(h_sliced, expected_margin_sliced);
-    ASSERT_EQ(h_range.size(), expected_margin_ranged.size());
-    ASSERT_EQ(h_range, expected_margin_ranged);
+    auto const &h_ranged = out_predt_ranged.HostVector();
+    EXPECT_EQ(h_sliced.size(), expected_margin_sliced.size());
+    for (std::size_t i = 0; i < expected_margin_sliced.size(); ++i) {
+      ASSERT_FLOAT_EQ(h_sliced[i], expected_margin_sliced[i]) << "rank " << rank << ", i " << i;
+    }
+    EXPECT_EQ(h_ranged.size(), expected_margin_ranged.size());
+    for (std::size_t i = 0; i < expected_margin_ranged.size(); ++i) {
+      ASSERT_FLOAT_EQ(h_ranged[i], expected_margin_ranged[i]) << "rank " << rank << ", i " << i;
+    }
   }
 
   // Leaf
@@ -440,21 +455,27 @@ void VerifyIterationRangeColumnSplit(DMatrix *dmat, Learner *learner, Learner *s
     sliced->Predict(Xy, false, &out_predt_sliced, 0, 0, false, true, false, false, false);
     learner->Predict(Xy, false, &out_predt_ranged, 0, 3, false, true, false, false, false);
     auto const &h_sliced = out_predt_sliced.HostVector();
-    auto const &h_range = out_predt_ranged.HostVector();
-    ASSERT_EQ(h_sliced.size(), expected_leaf_sliced.size());
-    ASSERT_EQ(h_sliced, expected_leaf_sliced);
-    ASSERT_EQ(h_range.size(), expected_leaf_ranged.size());
-    ASSERT_EQ(h_range, expected_leaf_ranged);
+    auto const &h_ranged = out_predt_ranged.HostVector();
+    EXPECT_EQ(h_sliced.size(), expected_leaf_sliced.size());
+    for (std::size_t i = 0; i < expected_leaf_sliced.size(); ++i) {
+      ASSERT_FLOAT_EQ(h_sliced[i], expected_leaf_sliced[i]) << "rank " << rank << ", i " << i;
+    }
+    EXPECT_EQ(h_ranged.size(), expected_leaf_ranged.size());
+    for (std::size_t i = 0; i < expected_leaf_ranged.size(); ++i) {
+      ASSERT_FLOAT_EQ(h_ranged[i], expected_leaf_ranged[i]) << "rank " << rank << ", i " << i;
+    }
   }
 }
 }  // anonymous namespace
 
-void TestIterationRangeColumnSplit(Context const* ctx) {
-  size_t constexpr kRows = 1000, kCols = 20, kClasses = 4, kForest = 3, kIters = 10;
+void TestIterationRangeColumnSplit(int world_size, bool use_gpu) {
+  std::size_t constexpr kRows = 1000, kCols = 20, kClasses = 4, kForest = 3, kIters = 10;
   auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses);
-  auto learner = LearnerForTest(ctx, dmat, kIters, kForest);
-
-  learner->SetParam("device", ctx->DeviceName());
+  Context ctx;
+  if (use_gpu) {
+    ctx = MakeCUDACtx(0);
+  }
+  auto learner = LearnerForTest(&ctx, dmat, kIters, kForest);
 
   bool bound = false;
   std::unique_ptr<Learner> sliced{learner->Slice(0, 3, 1, &bound)};
@@ -476,9 +497,13 @@ void TestIterationRangeColumnSplit(Context const* ctx) {
   auto const &leaf_sliced = leaf_predt_sliced.HostVector();
   auto const &leaf_ranged = leaf_predt_ranged.HostVector();
 
-  auto constexpr kWorldSize = 2;
-  RunWithInMemoryCommunicator(kWorldSize, VerifyIterationRangeColumnSplit, dmat.get(),
-                              learner.get(), sliced.get(), margin_ranged, margin_sliced,
+  Json ranged_model{Object{}};
+  learner->SaveModel(&ranged_model);
+  Json sliced_model{Object{}};
+  sliced->SaveModel(&sliced_model);
+
+  RunWithInMemoryCommunicator(world_size, VerifyIterationRangeColumnSplit, use_gpu, ranged_model,
+                              sliced_model, kRows, kCols, kClasses, margin_ranged, margin_sliced,
                               leaf_ranged, leaf_sliced);
 }
 
@@ -539,11 +564,20 @@ void TestSparsePrediction(Context const *ctx, float sparsity) {
 }
 
 namespace {
-void VerifySparsePredictionColumnSplit(DMatrix *dmat, Learner *learner,
+void VerifySparsePredictionColumnSplit(bool use_gpu, Json const &model, std::size_t rows,
+                                       std::size_t cols, float sparsity,
                                        std::vector<float> const &expected_predt) {
-  std::shared_ptr<DMatrix> sliced{
-      dmat->SliceCol(collective::GetWorldSize(), collective::GetRank())};
+  Context ctx;
+  if (use_gpu) {
+    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+  }
+  auto Xy = RandomDataGenerator(rows, cols, sparsity).GenerateDMatrix(true);
+  std::shared_ptr<DMatrix> sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
   HostDeviceVector<float> sparse_predt;
+
+  std::unique_ptr<Learner> learner{Learner::Create({sliced})};
+  learner->SetParam("device", ctx.DeviceName());
+  learner->LoadModel(model);
   learner->Predict(sliced, false, &sparse_predt, 0, 0);
 
   auto const &predt = sparse_predt.HostVector();
@@ -554,10 +588,14 @@ void VerifySparsePredictionColumnSplit(DMatrix *dmat, Learner *learner,
 }
 }  // anonymous namespace
 
-void TestSparsePredictionColumnSplit(Context const* ctx, float sparsity) {
+void TestSparsePredictionColumnSplit(int world_size, bool use_gpu, float sparsity) {
+  Context ctx;
+  if (use_gpu) {
+    ctx = MakeCUDACtx(0);
+  }
   size_t constexpr kRows = 512, kCols = 128, kIters = 4;
   auto Xy = RandomDataGenerator(kRows, kCols, sparsity).GenerateDMatrix(true);
-  auto learner = LearnerForTest(ctx, Xy, kIters);
+  auto learner = LearnerForTest(&ctx, Xy, kIters);
 
   HostDeviceVector<float> sparse_predt;
 
@@ -567,12 +605,11 @@ void TestSparsePredictionColumnSplit(Context const* ctx, float sparsity) {
   learner.reset(Learner::Create({Xy}));
   learner->LoadModel(model);
 
-  learner->SetParam("device", ctx->DeviceName());
+  learner->SetParam("device", ctx.DeviceName());
   learner->Predict(Xy, false, &sparse_predt, 0, 0);
 
-  auto constexpr kWorldSize = 2;
-  RunWithInMemoryCommunicator(kWorldSize, VerifySparsePredictionColumnSplit, Xy.get(),
-                              learner.get(), sparse_predt.HostVector());
+  RunWithInMemoryCommunicator(world_size, VerifySparsePredictionColumnSplit, use_gpu, model,
+                              kRows, kCols, sparsity, sparse_predt.HostVector());
 }
 
 void TestVectorLeafPrediction(Context const *ctx) {
diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h
index 81ec3cb5d..9e0891d56 100644
--- a/tests/cpp/predictor/test_predictor.h
+++ b/tests/cpp/predictor/test_predictor.h
@@ -94,23 +94,19 @@ void TestPredictionWithLesserFeatures(Context const* ctx);
 
 void TestPredictionDeviceAccess();
 
-void TestCategoricalPrediction(Context const* ctx, bool is_column_split);
+void TestCategoricalPrediction(bool use_gpu, bool is_column_split);
 
-void TestCategoricalPredictionColumnSplit(Context const* ctx);
+void TestPredictionWithLesserFeaturesColumnSplit(bool use_gpu);
 
-void TestPredictionWithLesserFeaturesColumnSplit(Context const* ctx);
-
-void TestCategoricalPredictLeaf(Context const* ctx, bool is_column_split);
-
-void TestCategoricalPredictLeafColumnSplit(Context const* ctx);
+void TestCategoricalPredictLeaf(bool use_gpu, bool is_column_split);
 
 void TestIterationRange(Context const* ctx);
 
-void TestIterationRangeColumnSplit(Context const* ctx);
+void TestIterationRangeColumnSplit(int world_size, bool use_gpu);
 
 void TestSparsePrediction(Context const* ctx, float sparsity);
 
-void TestSparsePredictionColumnSplit(Context const* ctx, float sparsity);
+void TestSparsePredictionColumnSplit(int world_size, bool use_gpu, float sparsity);
 
 void TestVectorLeafPrediction(Context const* ctx);
 }  // namespace xgboost

From 730bc1f68852927c07b1f2ed8ff00bf3e28e448f Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Thu, 14 Sep 2023 04:11:26 -0500
Subject: [PATCH 12/59] [R] remove unused headers (#9546)

---
 R-package/src/init.c       | 1 -
 R-package/src/xgboost_R.cc | 1 -
 2 files changed, 2 deletions(-)

diff --git a/R-package/src/init.c b/R-package/src/init.c
index 09174222e..5c8e179d6 100644
--- a/R-package/src/init.c
+++ b/R-package/src/init.c
@@ -5,7 +5,6 @@
  * and edited to conform to xgboost C linter requirements. For details, see
  * https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Registering-native-routines
  */
-#include <R.h>
 #include <Rinternals.h>
 #include <stdlib.h>
 #include <R_ext/Rdynload.h>
diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc
index 44082f255..2938d4b6e 100644
--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@@ -20,7 +20,6 @@
 #include "../../src/common/threading_utils.h"
 
 #include "./xgboost_R.h"  // Must follow other includes.
-#include "Rinternals.h"
 
 /*!
  * \brief macro to annotate begin of api

From 0df1da2db4ba30f9abfd18440e068d00f13c5ff7 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Mon, 18 Sep 2023 17:58:48 +0800
Subject: [PATCH 13/59] fix rel script with relative path and end note version.
 [skip ci] (#9572)

---
 dev/release-artifacts.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/dev/release-artifacts.py b/dev/release-artifacts.py
index d9b9d6203..429fac078 100644
--- a/dev/release-artifacts.py
+++ b/dev/release-artifacts.py
@@ -250,8 +250,8 @@ echo "<hash> <artifact>" | shasum -a 256 --check
 ```
 
 **Experimental binary packages for R with CUDA enabled**
-* xgboost_r_gpu_linux_1.7.5.tar.gz: [Download]({r_gpu_linux_url})
-* xgboost_r_gpu_win64_1.7.5.tar.gz: [Download]({r_gpu_win64_url})
+* xgboost_r_gpu_linux_{release}.tar.gz: [Download]({r_gpu_linux_url})
+* xgboost_r_gpu_win64_{release}.tar.gz: [Download]({r_gpu_win64_url})
 
 **Source tarball**
 * xgboost.tar.gz: [Download]({src_tarball})"""
@@ -296,12 +296,13 @@ def main(args: argparse.Namespace) -> None:
     git.submodule("update")
     commit_hash = latest_hash()
 
-    if not os.path.exists(args.outdir):
-        os.mkdir(args.outdir)
+    outdir = os.path.abspath(args.outdir)
+    if not os.path.exists(outdir):
+        os.mkdir(outdir)
 
     # source tarball
     hashes: List[str] = []
-    tarname, h = make_src_package(release, args.outdir)
+    tarname, h = make_src_package(release, outdir)
     hashes.append(h)
 
     # CUDA R packages
@@ -310,18 +311,18 @@ def main(args: argparse.Namespace) -> None:
         branch,
         "" if rc is None else rc + str(rc_ver),
         commit_hash,
-        args.outdir,
+        outdir,
     )
     hashes.extend(hr)
 
     # Python source wheel
-    make_pysrc_wheel(release, rc, rc_ver, args.outdir)
+    make_pysrc_wheel(release, rc, rc_ver, outdir)
 
     # Python binary wheels
-    download_py_packages(branch, major, minor, commit_hash, args.outdir)
+    download_py_packages(branch, major, minor, commit_hash, outdir)
 
     # Write end note
-    release_note(release, hashes, urls, tarname, args.outdir)
+    release_note(release, hashes, urls, tarname, outdir)
 
 
 if __name__ == "__main__":

From 259d80c0cf4b8de4c9d9aed2563c6ed2784fd4ed Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Tue, 19 Sep 2023 05:46:02 +0800
Subject: [PATCH 14/59] News for 2.0. [skip ci] (#9484)

---------

Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
---
 NEWS.md | 201 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 201 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 2a1000e55..43019d877 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -3,6 +3,207 @@ XGBoost Change Log
 
 This file records the changes in xgboost library in reverse chronological order.
 
+## 2.0.0 (2023 Aug 16)
+
+We are excited to announce the release of XGBoost 2.0. This note will begin by covering some overall changes and then highlight specific updates to the package.
+
+### Initial work on multi-target trees with vector-leaf outputs
+We have been working on vector-leaf tree models for multi-target regression, multi-label classification, and multi-class classification in version 2.0. Previously, XGBoost would build a separate model for each target. However, with this new feature that's still being developed, XGBoost can build one tree for all targets. The feature has multiple benefits and trade-offs compared to the existing approach. It can help prevent overfitting, produce smaller models, and build trees that consider the correlation between targets. In addition, users can combine vector leaf and scalar leaf trees during a training session using a callback. Please note that the feature is still a work in progress, and many parts are not yet available. See #9043 for the current status. Related PRs: (#8538, #8697, #8902, #8884, #8895, #8898, #8612, #8652, #8698, #8908, #8928, #8968, #8616, #8922, #8890, #8872, #8889, #9509). Please note that only the `hist` (default) tree method on CPU can be used for building vector leaf trees at the moment.
+
+### New `device` parameter.
+
+A new `device` parameter is set to replace the existing `gpu_id`, `gpu_hist`, `gpu_predictor`, `cpu_predictor`, `gpu_coord_descent`, and the PySpark-specific parameter `use_gpu`. Going forward, users need only the `device` parameter to select which device to run on, along with the ordinal of the device. For more information, please see our documentation page (https://xgboost.readthedocs.io/en/stable/parameter.html#general-parameters). For example, with `device="cuda", tree_method="hist"`, XGBoost will run the `hist` tree method on GPU. (#9363, #8528, #8604, #9354, #9274, #9243, #8896, #9129, #9362, #9402, #9385, #9398, #9390, #9386, #9412, #9507, #9536). The old behavior of ``gpu_hist`` is preserved but deprecated. In addition, the `predictor` parameter is removed.
+
+
+### `hist` is now the default tree method
+Starting from 2.0, the `hist` tree method will be the default. In previous versions, XGBoost chooses `approx` or `exact` depending on the input data and training environment. The new default can help XGBoost train models more efficiently and consistently. (#9320, #9353)
+
+### GPU-based approx tree method
+There's initial support for using the `approx` tree method on GPU. The performance of the `approx` is not yet well optimized but is feature complete except for the JVM packages. It can be accessed through the use of the parameter combination `device="cuda", tree_method="approx"`. (#9414, #9399, #9478). Please note that the Scala-based Spark interface is not yet supported.
+
+### Optimize and bound the size of the histogram on CPU, to control memory footprint
+
+XGBoost has a new parameter `max_cached_hist_node` for users to limit the CPU cache size for histograms. It can help prevent XGBoost from caching histograms too aggressively. Without the cache, performance is likely to decrease. However, the size of the cache grows exponentially with the depth of the tree. The limit can be crucial when growing deep trees. In most cases, users need not configure this parameter as it does not affect the model's accuracy. (#9455, #9441, #9440, #9427, #9400).
+
+Along with the cache limit, XGBoost also reduces the memory usage of the `hist` and `approx` tree method on distributed systems by cutting the size of the cache by half. (#9433)
+
+### Improved external memory support
+There is some exciting development around external memory support in XGBoost. It's still an experimental feature, but the performance has been significantly improved with the default `hist` tree method. We replaced the old file IO logic with memory map. In addition to performance, we have reduced CPU memory usage and added extensive documentation. Beginning from 2.0.0, we encourage users to try it with the `hist` tree method when the memory saving by `QuantileDMatrix` is not sufficient. (#9361, #9317, #9282, #9315, #8457)
+
+### Learning to rank
+We created a brand-new implementation for the learning-to-rank task. With the latest version, XGBoost gained a set of new features for ranking tasks, including:
+
+- A new parameter `lambdarank_pair_method` for choosing the pair construction strategy.
+- A new parameter `lambdarank_num_pair_per_sample` for controlling the number of samples for each group.
+- An experimental implementation of unbiased learning-to-rank, which can be accessed using the `lambdarank_unbiased` parameter.
+- Support for custom gain function with `NDCG` using the `ndcg_exp_gain` parameter.
+- Deterministic GPU computation for all objectives and metrics.
+- `NDCG` is now the default objective function.
+- Improved performance of metrics using caches.
+- Support scikit-learn utilities for `XGBRanker`.
+- Extensive documentation on how learning-to-rank works with XGBoost.
+
+For more information, please see the [tutorial](https://xgboost.readthedocs.io/en/latest/tutorials/learning_to_rank.html). Related PRs: (#8771, #8692, #8783, #8789, #8790, #8859, #8887, #8893, #8906, #8931, #9075, #9015, #9381, #9336, #8822, #9222, #8984, #8785, #8786, #8768)
+
+### Automatically estimated intercept
+
+In the previous version, `base_score` was a constant that could be set as a training parameter. In the new version, XGBoost can automatically estimate this parameter based on input labels for optimal accuracy. (#8539, #8498, #8272, #8793, #8607)
+
+### Quantile regression
+The XGBoost algorithm now supports quantile regression, which involves minimizing the quantile loss (also called "pinball loss"). Furthermore, XGBoost allows for training with multiple target quantiles simultaneously with one tree per quantile. (#8775, #8761, #8760, #8758, #8750)
+
+### L1 and Quantile regression now supports learning rate
+Both objectives use adaptive trees due to the lack of proper Hessian values. In the new version, XGBoost can scale the leaf value with the learning rate accordingly. (#8866)
+
+### Export cut value
+
+Using the Python or the C package, users can export the quantile values (not to be confused with quantile regression) used for the `hist` tree method. (#9356)
+
+### Column-based split and federated learning
+We made progress on column-based split for federated learning. In 2.0, `approx`, `hist`, and `hist` with vector leaf can all work with column-based data split, along with support for vertical federated learning. Work on GPU support is still ongoing; stay tuned. (#8576, #8468, #8442, #8847, #8811, #8985, #8623, #8568, #8828, #8932, #9081, #9102, #9103, #9124, #9120, #9367, #9370, #9343, #9171, #9346, #9270, #9244, #8494, #8434, #8742, #8804, #8710, #8676, #9020, #9002, #9058, #9037, #9018, #9295, #9006, #9300, #8765, #9365, #9060)
+
+### PySpark
+After the initial introduction of the PySpark interface, it has gained some new features and optimizations in 2.0.
+
+- GPU-based prediction. (#9292, #9542)
+- Optimization for data initialization by avoiding the stack operation. (#9088)
+- Support predict feature contribution. (#8633)
+- Python typing support. (#9156, #9172, #9079, #8375)
+- `use_gpu` is deprecated. The `device` parameter is preferred.
+- Update eval_metric validation to support list of strings (#8826)
+- Improved logs for training (#9449)
+- Maintenance, including refactoring and document updates (#8324, #8465, #8605, #9202, #9460, #9302, #8385, #8630, #8525, #8496)
+- Fix for GPU setup. (#9495)
+
+### Other General New Features
+Here's a list of new features that don't have their own section and yet are general to all language bindings.
+
+- Use array interface for CSC matrix. This helps XGBoost to use a consistent number of threads and align the interface of the CSC matrix with other interfaces. In addition, memory usage is likely to decrease with CSC input thanks to on-the-fly type conversion. (#8672)
+- CUDA compute 90 is now part of the default build. (#9397)
+
+### Other General Optimization
+These optimizations are general to all language bindings. For language-specific optimization, please visit the corresponding sections.
+
+- Performance for input with `array_interface` on CPU (like `numpy`) is significantly improved. (#9090)
+- Some optimization with CUDA for data initialization. (#9199, #9209, #9144)
+- Use the latest thrust policy to prevent synchronizing GPU devices. (#9212)
+- XGBoost now uses a per-thread CUDA stream, which prevents synchronization with other streams. (#9416, #9396, #9413)
+
+### Notable breaking change
+
+Other than the aforementioned change with the `device` parameter, here's a list of breaking changes affecting all packages.
+
+- Users must specify the format for text input (#9077). However, we suggest using third-party data structures such as `numpy.ndarray` instead of relying on text inputs. See https://github.com/dmlc/xgboost/issues/9472 for more info.
+
+### Notable bug fixes
+
+Some noteworthy bug fixes that are not related to specific language bindings are listed in this section.
+
+- Some language environments use a different thread to perform garbage collection, which breaks the thread-local cache used in XGBoost. XGBoost 2.0 implements a new thread-safe cache using a lightweight lock to replace the thread-local cache. (#8851)
+- Fix model IO by clearing the prediction cache. (#8904)
+- `inf` is checked during data construction. (#8911)
+- Preserve order of saved updaters configuration. Usually, this is not an issue unless the `updater` parameter is used instead of the `tree_method` parameter (#9355)
+- Fix GPU memory allocation issue with categorical splits. (#9529)
+- Handle escape sequence like `\t\n` in feature names for JSON model dump. (#9474)
+- Normalize file path for model IO and text input. This handles short paths on Windows and paths that contain `~` on Unix (#9463). In addition, all path inputs are required to be encoded in UTF-8 (#9448, #9443)
+- Fix integer overflow on H100. (#9380)
+- Fix weighted sketching on GPU with categorical features. (#9341)
+- Fix metric serialization. The bug might cause some of the metrics to be dropped during evaluation. (#9405)
+- Fixes compilation errors on MSVC x86 targets (#8823)
+- Pick up the dmlc-core fix for the CSV parser. (#8897)
+
+
+### Documentation
+Aside from documents for new features, we have many smaller updates to improve user experience, from troubleshooting guides to typo fixes.
+
+- Explain CPU/GPU interop. (#8450)
+- Guide to troubleshoot NCCL errors. (#8943, #9206)
+- Add a note for rabit port selection. (#8879)
+- How to build the docs using conda (#9276)
+- Explain how to obtain reproducible results on distributed systems. (#8903)
+
+* Fixes and small updates to document and demonstration scripts. (#8626, #8436, #8995, #8907, #8923, #8926, #9358, #9232, #9201, #9469, #9462, #9458, #8543, #8597, #8401, #8784, #9213, #9098, #9008, #9223, #9333, #9434, #9435, #9415, #8773, #8752, #9291, #9549)
+
+### Python package
+* New Features and Improvements
+- Support primitive types of pyarrow-backed pandas dataframe. (#8653)
+- Warning messages emitted by XGBoost are now emitted using Python warnings. (#9387)
+- User can now format the value printed near the bars on the `plot_importance` plot (#8540)
+- XGBoost has improved half-type support (float16) with pandas, cupy, and cuDF. With GPU input, the handling is through CUDA `__half` type, and no data copy is made. (#8487, #9207, #8481)
+- Support `Series` and Python primitive types in `inplace_predict` and `QuantileDMatrix` (#8547, #8542)
+- Support all pandas' nullable integer types. (#8480)
+- Custom metric with the scikit-learn interface now supports `sample_weight`. (#8706)
+- Enable Installation of Python Package with System lib in a Virtual Environment (#9349)
+- Raise if expected workers are not alive in `xgboost.dask.train` (#9421)
+
+* Optimization
+- Cache transformed data in `QuantileDMatrix` for efficiency. (#8666, #9445)
+- Take datatable as row-major input. (#8472)
+- Remove unnecessary conversions between data structures (#8546)
+
+* Adopt modern Python packaging conventions (PEP 517, PEP 518, PEP 621)
+-  XGBoost adopted the modern Python packaging conventions. The old setup script `setup.py` is now replaced with the new configuration file `pyproject.toml`. Along with this, XGBoost now supports Python 3.11. (#9021, #9112, #9114, #9115) Consult the latest documentation for the updated instructions to build and install XGBoost.
+
+* Fixes
+- `DataIter` now accepts only keyword arguments. (#9431)
+- Fix empty DMatrix with categorical features. (#8739)
+- Convert ``DaskXGBClassifier.classes_`` to an array (#8452)
+- Define `best_iteration` only if early stopping is used to be consistent with documented behavior. (#9403)
+- Make feature validation immutable. (#9388)
+
+* Breaking changes
+- As discussed in the new `device` parameter section, the `predictor` parameter is now removed. (#9129)
+- Remove support for single-string feature info. Feature type and names should be a sequence of strings (#9401)
+- Remove parameters in the `save_model` call for the scikit-learn interface. (#8963)
+- Remove the `ntree_limit` in the python package. This has been deprecated in previous versions. (#8345)
+
+* Maintenance including formatting and refactoring along with type hints.
+- More consistent use of `black` and `isort` for code formatting (#8420, #8748, #8867)
+- Improved type support. Most of the type changes happen in the PySpark module; here, we list the remaining changes. (#8444, #8617, #9197, #9005)
+- Set `enable_categorical` to True in predict. (#8592)
+- Some refactoring and updates for tests (#8395, #8372, #8557, #8379, #8702, #9459, #9316, #8446, #8695, #8409, #8993, #9480)
+
+* Documentation
+- Add introduction and notes for the sklearn interface. (#8948)
+- Demo for using dask for hyper-parameter optimization. (#8891)
+- Document all supported Python input types. (#8643)
+- Other documentation updates (#8944, #9304)
+
+### R package
+- Use the new data consumption interface for CSR and CSC. This provides better control for the number of threads and improves performance. (#8455, #8673)
+- Accept multiple evaluation metrics during training. (#8657)
+- Fix integer inputs with `NA`. (#9522)
+- Some refactoring for the R package (#8545, #8430, #8614, #8624, #8613, #9457, #8689, #8563, #9461, #8647, #8564, #8565, #8736, #8610, #8609, #8599, #8704, #9456, #9450, #9476, #9477, #9481). Special thanks to @jameslamb.
+- Document updates (#8886, #9323, #9437, #8998)
+
+### JVM packages
+Following are changes specific to various JVM-based packages.
+
+- Stop using Rabit in prediction (#9054)
+- Set feature_names and feature_types in jvm-packages. This is to prepare support for categorical features (#9364)
+- Scala 2.13 support. (#9099)
+- Change training stage from `ResultStage` to `ShuffleMapStage` (#9423)
+- Automatically set the max/min direction for the best score during early stopping. (#9404)
+- Revised support for `flink` (#9046)
+
+* Breaking changes
+- Scala-based tracker is removed. (#9078, #9045)
+- Change `DeviceQuantileDmatrix` into `QuantileDMatrix` (#8461)
+
+* Maintenance (#9253, #9166, #9395, #9389, #9224, #9233, #9351, #9479)
+
+* CI bot PRs
+We employed GitHub Dependabot to help us keep the dependencies up-to-date for JVM packages. With the help of the bot, we have cleared up all the dependencies that were lagging behind (#8501, #8507).
+
+Here's a list of dependency update PRs including those made by Dependabot (#8456, #8560, #8571, #8561, #8562, #8600, #8594, #8524, #8509, #8548, #8549, #8533, #8521, #8534, #8532, #8516, #8503, #8531, #8530, #8518, #8512, #8515, #8517, #8506, #8504, #8502, #8629, #8815, #8813, #8814, #8877, #8876, #8875, #8874, #8873, #9049, #9070, #9073, #9039, #9083, #8917, #8952, #8980, #8973, #8962, #9252, #9208, #9131, #9136, #9219, #9160, #9158, #9163, #9184, #9192, #9265, #9268, #8882, #8837, #8662, #8661, #8390, #9056, #8508, #8925, #8920, #9149, #9230, #9097, #8648, #9203, #8593).
+
+### Maintenance
+Maintenance work includes refactoring, fixing small issues that don't affect end users. (#9256, #8627, #8756, #8735, #8966, #8864, #8747, #8892, #9057, #8921, #8949, #8941, #8942, #9108, #9125, #9155, #9153, #9176, #9447, #9444, #9436, #9438, #9430, #9200, #9210, #9055, #9014, #9004, #8999, #9154, #9148, #9283, #9246, #8888, #8900, #8871, #8861, #8858, #8791, #8807, #8751, #8703, #8696, #8693, #8677, #8686, #8665, #8660, #8386, #8371, #8410, #8578, #8574, #8483, #8443, #8454, #8733)
+
+### CI
+- Build pip wheel with RMM support (#9383)
+- Other CI updates including updating dependencies and work on the CI infrastructure. (#9464, #9428, #8767, #9394, #9278, #9214, #9234, #9205, #9034, #9104, #8878, #9294, #8625, #8806, #8741, #8707, #8381, #8382, #8388, #8402, #8397, #8445, #8602, #8628, #8583, #8460, #9544)
+
 ## 1.7.6 (2023 Jun 16)
 
 This is a patch release for bug fixes. The CRAN package for the R binding is kept at 1.7.5.

From 38ac52dd87bb47f4f61e0249831f377181eff83a Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Wed, 20 Sep 2023 02:09:07 +0800
Subject: [PATCH 15/59] Build a simple event loop for collective. (#9593)

---
 rabit/include/rabit/internal/io.h     |   7 +-
 rabit/include/rabit/internal/socket.h |  46 +++++--
 rabit/src/allreduce_base.cc           |   3 +-
 src/collective/loop.cc                | 167 ++++++++++++++++++++++++++
 src/collective/loop.h                 |  83 +++++++++++++
 src/collective/socket.cc              |  62 +++++-----
 tests/cpp/collective/test_loop.cc     |  81 +++++++++++++
 7 files changed, 402 insertions(+), 47 deletions(-)
 create mode 100644 src/collective/loop.cc
 create mode 100644 src/collective/loop.h
 create mode 100644 tests/cpp/collective/test_loop.cc

diff --git a/rabit/include/rabit/internal/io.h b/rabit/include/rabit/internal/io.h
index d93f32ff9..d5d0fee4d 100644
--- a/rabit/include/rabit/internal/io.h
+++ b/rabit/include/rabit/internal/io.h
@@ -16,8 +16,8 @@
 #include <string>
 #include <vector>
 
-#include "rabit/internal/utils.h"
-#include "rabit/serializable.h"
+#include "dmlc/io.h"
+#include "xgboost/logging.h"
 
 namespace rabit::utils {
 /*! \brief re-use definition of dmlc::SeekStream */
@@ -84,8 +84,7 @@ struct MemoryBufferStream : public SeekStream {
   }
   ~MemoryBufferStream() override = default;
   size_t Read(void *ptr, size_t size) override {
-    utils::Assert(curr_ptr_ <= p_buffer_->length(),
-                  "read can not have position excceed buffer length");
+    CHECK_LE(curr_ptr_, p_buffer_->length()) << "read can not have position excceed buffer length";
     size_t nread = std::min(p_buffer_->length() - curr_ptr_, size);
     if (nread != 0) std::memcpy(ptr, &(*p_buffer_)[0] + curr_ptr_, nread);
     curr_ptr_ += nread;
diff --git a/rabit/include/rabit/internal/socket.h b/rabit/include/rabit/internal/socket.h
index 6fb7fe725..f1a6699fb 100644
--- a/rabit/include/rabit/internal/socket.h
+++ b/rabit/include/rabit/internal/socket.h
@@ -29,11 +29,10 @@
 #include <chrono>
 #include <cstring>
 #include <string>
+#include <system_error>  // make_error_code, errc
 #include <unordered_map>
 #include <vector>
 
-#include "utils.h"
-
 #if !defined(_WIN32)
 
 #include <sys/poll.h>
@@ -93,6 +92,20 @@ int PollImpl(PollFD* pfd, int nfds, std::chrono::seconds timeout) noexcept(true)
 #endif  // IS_MINGW()
 }
 
+template <typename E>  // E: integral type carrying the `revents` bit mask reported by poll.
+std::enable_if_t<std::is_integral_v<E>, xgboost::collective::Result> PollError(E const& revents) {
+  if ((revents & POLLERR) != 0) {  // tested first, so it is the one reported if several are set
+    return xgboost::system::FailWithCode("Poll error condition.");
+  }
+  if ((revents & POLLNVAL) != 0) {
+    return xgboost::system::FailWithCode("Invalid polling request.");
+  }
+  if ((revents & POLLHUP) != 0) {
+    return xgboost::system::FailWithCode("Poll hung up.");
+  }
+  return xgboost::collective::Success();  // none of the three error flags is set
+}
+
 /*! \brief helper data structure to perform poll */
 struct PollHelper {
  public:
@@ -160,25 +173,32 @@ struct PollHelper {
    *
    * @param timeout specify timeout in seconds. Block if negative.
    */
-  [[nodiscard]] xgboost::collective::Result Poll(std::chrono::seconds timeout) {
+  [[nodiscard]] xgboost::collective::Result Poll(std::chrono::seconds timeout,
+                                                 bool check_error = true) {
     std::vector<pollfd> fdset;
     fdset.reserve(fds.size());
     for (auto kv : fds) {
       fdset.push_back(kv.second);
     }
-    int ret = PollImpl(fdset.data(), fdset.size(), timeout);
+    std::int32_t ret = PollImpl(fdset.data(), fdset.size(), timeout);
     if (ret == 0) {
-      return xgboost::collective::Fail("Poll timeout.");
+      return xgboost::collective::Fail("Poll timeout.", std::make_error_code(std::errc::timed_out));
     } else if (ret < 0) {
       return xgboost::system::FailWithCode("Poll failed.");
-    } else {
-      for (auto& pfd : fdset) {
-        auto revents = pfd.revents & pfd.events;
-        if (!revents) {
-          fds.erase(pfd.fd);
-        } else {
-          fds[pfd.fd].events = revents;
-        }
+    }
+
+    for (auto& pfd : fdset) {
+      auto result = PollError(pfd.revents);
+      if (check_error && !result.OK()) {
+        return result;
+      }
+
+      auto revents = pfd.revents & pfd.events;
+      if (!revents) {
+        // FIXME(jiamingy): remove this once rabit is replaced.
+        fds.erase(pfd.fd);
+      } else {
+        fds[pfd.fd].events = revents;
       }
     }
     return xgboost::collective::Success();
diff --git a/rabit/src/allreduce_base.cc b/rabit/src/allreduce_base.cc
index 6480adf03..04246b5a1 100644
--- a/rabit/src/allreduce_base.cc
+++ b/rabit/src/allreduce_base.cc
@@ -721,12 +721,11 @@ AllreduceBase::TryBroadcast(void *sendrecvbuf_, size_t total_size, int root) {
         }
         finished = false;
       }
-      watcher.WatchException(links[i].sock);
     }
     // finish running
     if (finished) break;
     // select
-    auto poll_res = watcher.Poll(timeout_sec);
+    auto poll_res = watcher.Poll(timeout_sec, false);  // fail on macos
     if (!poll_res.OK()) {
       LOG(FATAL) << poll_res.Report();
     }
diff --git a/src/collective/loop.cc b/src/collective/loop.cc
new file mode 100644
index 000000000..95a1019ac
--- /dev/null
+++ b/src/collective/loop.cc
@@ -0,0 +1,167 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include "loop.h"
+
+#include <queue>  // for queue
+
+#include "rabit/internal/socket.h"      // for PollHelper
+#include "xgboost/collective/socket.h"  // for FailWithCode
+#include "xgboost/logging.h"            // for CHECK
+
+namespace xgboost::collective {
+Result Loop::EmptyQueue() {
+  // Drain queue_: each round polls all pending ops, advances ready ones, requeues the rest.
+  timer_.Start(__func__);
+  // `__func__` inside the lambda would name `operator()`; capture the outer label instead.
+  auto error = [this, name = __func__] {
+    this->stop_ = true;
+    timer_.Stop(name);
+  };
+  while (!queue_.empty() && !stop_) {
+    std::queue<Op> qcopy;
+    rabit::utils::PollHelper poll;
+    // watch all ops
+    while (!queue_.empty()) {
+      auto op = queue_.front();
+      queue_.pop();
+
+      switch (op.code) {
+        case Op::kRead: {
+          poll.WatchRead(*op.sock);
+          break;
+        }
+        case Op::kWrite: {
+          poll.WatchWrite(*op.sock);
+          break;
+        }
+        default: {
+          error();
+          return Fail("Invalid socket operation.");
+        }
+      }
+      qcopy.push(op);
+    }
+
+    // poll, work on fds that are ready.
+    timer_.Start("poll");
+    auto rc = poll.Poll(timeout_);
+    timer_.Stop("poll");
+    if (!rc.OK()) {
+      error();
+      return rc;
+    }
+    // we wouldn't be here if the queue is empty.
+    CHECK(!qcopy.empty());
+
+    while (!qcopy.empty() && !stop_) {
+      auto op = qcopy.front();
+      qcopy.pop();
+
+      std::int32_t n_bytes_done{0};
+      CHECK(op.sock->NonBlocking());
+
+      switch (op.code) {
+        case Op::kRead: {
+          if (poll.CheckRead(*op.sock)) {
+            n_bytes_done = op.sock->Recv(op.ptr + op.off, op.n - op.off);
+          }
+          break;
+        }
+        case Op::kWrite: {
+          if (poll.CheckWrite(*op.sock)) {
+            n_bytes_done = op.sock->Send(op.ptr + op.off, op.n - op.off);
+          }
+          break;
+        }
+        default: {
+          error();
+          return Fail("Invalid socket operation.");
+        }
+      }
+
+      if (n_bytes_done == -1 && !system::LastErrorWouldBlock()) {
+        auto rc = system::FailWithCode("Invalid socket output.");
+        error();
+        return rc;
+      }
+      // -1 at this point means "would block": nothing was transferred, so keep the offset.
+      op.off += (n_bytes_done == -1) ? 0 : n_bytes_done;
+      CHECK_LE(op.off, op.n);
+
+      if (op.off != op.n) {
+        // not yet finished, push back to queue for next round.
+        queue_.push(op);
+      }
+    }
+  }
+  timer_.Stop(__func__);
+  return Success();
+}
+
+void Loop::Process() {
+  // consumer: body of the worker thread, loops until stop_ is set or EmptyQueue() fails.
+  while (true) {
+    std::unique_lock lock{mu_};
+    cv_.wait(lock, [this] { return !this->queue_.empty() || stop_; });
+    if (stop_) {
+      break;
+    }
+    // try_lock() on a std::mutex this thread already owns is undefined; ask the guard instead.
+    CHECK(lock.owns_lock());
+    this->rc_ = this->EmptyQueue();
+    if (!rc_.OK()) {
+      stop_ = true;
+      cv_.notify_one();  // wake a waiter on cv_ so it can observe the failure
+      break;
+    }
+
+    CHECK(queue_.empty());
+    CHECK(lock.owns_lock());
+    cv_.notify_one();  // the batch is done, wake a waiter on cv_
+  }
+
+  if (rc_.OK()) {
+    CHECK(queue_.empty());
+  }
+}
+
+Result Loop::Stop() {
+  std::unique_lock lock{mu_};
+  stop_ = true;
+  lock.unlock();
+  // Block() notifies cv_ (so the worker can see stop_) and returns the stored rc_ by move.
+  CHECK_EQ(this->Block().OK(), this->rc_.OK());
+  // NOTE(review): one operand moves from rc_ while the other reads it; confirm this is intended.
+  if (curr_exce_) {
+    std::rethrow_exception(curr_exce_);
+  }
+  // NOTE(review): a failure recorded in rc_ is not returned here; the result is always Success().
+  return Success();
+}
+
+Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} {
+  timer_.Init(__func__);
+
+  // Run Process() on a dedicated worker thread. An exception escaping the thread function
+  // would call std::terminate, so it is captured here and re-thrown later by Stop().
+  worker_ = std::thread{[this] {
+    try {
+      this->Process();
+    } catch (...) {
+      // catch-all: std::exception and any other type are handled the same way, and
+      // std::current_exception() preserves the dynamic type for the later re-throw.
+      std::lock_guard<std::mutex> guard{mu_};
+      if (!curr_exce_) {
+        // Keep only the first exception and mark the loop result as failed.
+        curr_exce_ = std::current_exception();
+        rc_ = Fail("Exception was thrown");
+      }
+
+      // Stop the loop and wake every thread waiting on cv_ so it can observe the failure.
+      stop_ = true;
+      cv_.notify_all();
+    }
+  }};
+}
+}  // namespace xgboost::collective
diff --git a/src/collective/loop.h b/src/collective/loop.h
new file mode 100644
index 000000000..0bccbc0d0
--- /dev/null
+++ b/src/collective/loop.h
@@ -0,0 +1,83 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#pragma once
+#include <chrono>              // for seconds
+#include <condition_variable>  // for condition_variable
+#include <cstddef>             // for size_t
+#include <cstdint>             // for int8_t, int32_t
+#include <exception>           // for exception_ptr
+#include <mutex>               // for unique_lock, mutex
+#include <queue>               // for queue
+#include <thread>              // for thread
+#include <utility>             // for move
+
+#include "../common/timer.h"            // for Monitor
+#include "xgboost/collective/result.h"  // for Result
+#include "xgboost/collective/socket.h"  // for TCPSocket
+
+namespace xgboost::collective {
+class Loop {
+ public:
+  struct Op {  // One pending socket transfer: receive into (kRead) or send from (kWrite) `ptr`.
+    enum Code : std::int8_t { kRead = 0, kWrite = 1 } code;
+    std::int32_t rank{-1};  // NOTE(review): not read by Loop itself; presumably the peer rank.
+    std::int8_t* ptr{nullptr};  // Start of the transfer buffer (not freed by Loop).
+    std::size_t n{0};  // Total number of bytes to transfer.
+    TCPSocket* sock{nullptr};  // Non-owning; EmptyQueue() CHECKs `sock->NonBlocking()`.
+    std::size_t off{0};  // Bytes transferred so far; the op is finished when off == n.
+
+    Op(Code c, std::int32_t rank, std::int8_t* ptr, std::size_t n, TCPSocket* sock, std::size_t off)
+        : code{c}, rank{rank}, ptr{ptr}, n{n}, sock{sock}, off{off} {}
+    Op(Op const&) = default;
+    Op& operator=(Op const&) = default;
+    Op(Op&&) = default;
+    Op& operator=(Op&&) = default;
+  };
+
+ private:
+  std::thread worker_;  // Runs Process(); started by the constructor.
+  std::condition_variable cv_;  // Wakes the worker (new work / stop) and waiters in Block().
+  std::mutex mu_;  // Paired with cv_; locked around queue_ and stop_ updates.
+  std::queue<Op> queue_;  // Pending operations, filled by Submit().
+  std::chrono::seconds timeout_;  // Passed to every poll in EmptyQueue().
+  Result rc_;  // Outcome recorded by the worker; moved out by Block().
+  bool stop_{false};  // Set by Stop() or on failure; ends the worker loop.
+  std::exception_ptr curr_exce_{nullptr};  // First exception caught on the worker thread.
+  common::Monitor timer_;
+
+  Result EmptyQueue();  // Poll and advance queued ops until the queue is drained or an op fails.
+  void Process();  // Worker-thread main loop.
+
+ public:
+  Result Stop();  // Request stop, wait via Block(), rethrow a captured worker exception.
+
+  void Submit(Op op) {
+    // producer: enqueue under the lock, then wake the worker.
+    std::unique_lock lock{mu_};
+    queue_.push(op);
+    lock.unlock();
+    cv_.notify_one();
+  }
+
+  [[nodiscard]] Result Block() {  // Wait until the queue is empty or the loop has stopped.
+    {
+      std::unique_lock lock{mu_};
+      cv_.notify_all();
+    }
+    std::unique_lock lock{mu_};
+    cv_.wait(lock, [this] { return this->queue_.empty() || stop_; });
+    return std::move(rc_);  // NOTE(review): leaves rc_ moved-from; confirm it may be read again.
+  }
+
+  explicit Loop(std::chrono::seconds timeout);
+
+  ~Loop() noexcept(false) {
+    this->Stop();  // NOTE(review): a throw here skips the join() below.
+
+    if (worker_.joinable()) {
+      worker_.join();
+    }
+  }
+};
+}  // namespace xgboost::collective
diff --git a/src/collective/socket.cc b/src/collective/socket.cc
index 8ca936ff3..43da366bd 100644
--- a/src/collective/socket.cc
+++ b/src/collective/socket.cc
@@ -118,36 +118,42 @@ std::size_t TCPSocket::Recv(std::string *p_str) {
     }
 
     auto rc = connect(conn.Handle(), addr_handle, addr_len);
-    if (rc != 0) {
-      auto errcode = system::LastError();
-      if (!system::ErrorWouldBlock(errcode)) {
-        log_failure(Fail("connect failed.", std::error_code{errcode, std::system_category()}),
-                    __FILE__, __LINE__);
-        continue;
-      }
-
-      rabit::utils::PollHelper poll;
-      poll.WatchWrite(conn);
-      auto result = poll.Poll(timeout);
-      if (!result.OK()) {
-        log_failure(std::move(result), __FILE__, __LINE__);
-        continue;
-      }
-      if (!poll.CheckWrite(conn)) {
-        log_failure(Fail("poll failed.", std::error_code{errcode, std::system_category()}),
-                    __FILE__, __LINE__);
-        continue;
-      }
-      result = conn.GetSockError();
-      if (!result.OK()) {
-        log_failure(std::move(result), __FILE__, __LINE__);
-        continue;
-      }
-
-      return conn.NonBlocking(non_blocking);
-    } else {
+    if (rc == 0) {
       return conn.NonBlocking(non_blocking);
     }
+
+    auto errcode = system::LastError();
+    if (!system::ErrorWouldBlock(errcode)) {
+      log_failure(Fail("connect failed.", std::error_code{errcode, std::system_category()}),
+                  __FILE__, __LINE__);
+      continue;
+    }
+
+    rabit::utils::PollHelper poll;
+    poll.WatchWrite(conn);
+    auto result = poll.Poll(timeout);
+    if (!result.OK()) {
+      // poll would fail if there's a socket error, we log the root cause instead of the
+      // poll failure.
+      auto sockerr = conn.GetSockError();
+      if (!sockerr.OK()) {
+        result = std::move(sockerr);
+      }
+      log_failure(std::move(result), __FILE__, __LINE__);
+      continue;
+    }
+    if (!poll.CheckWrite(conn)) {
+      log_failure(Fail("poll failed.", std::error_code{errcode, std::system_category()}), __FILE__,
+                  __LINE__);
+      continue;
+    }
+    result = conn.GetSockError();
+    if (!result.OK()) {
+      log_failure(std::move(result), __FILE__, __LINE__);
+      continue;
+    }
+
+    return conn.NonBlocking(non_blocking);
   }
 
   std::stringstream ss;
diff --git a/tests/cpp/collective/test_loop.cc b/tests/cpp/collective/test_loop.cc
new file mode 100644
index 000000000..4686060ce
--- /dev/null
+++ b/tests/cpp/collective/test_loop.cc
@@ -0,0 +1,81 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>                // for ASSERT_TRUE, ASSERT_EQ
+#include <xgboost/collective/socket.h>  // for TCPSocket, Connect, SocketFinalize, SocketStartup
+#include <xgboost/string_view.h>        // for StringView
+
+#include <chrono>        // for seconds
+#include <cstdint>       // for int8_t
+#include <memory>        // for make_shared, shared_ptr
+#include <system_error>  // for make_error_code, errc
+#include <utility>       // for pair
+#include <vector>        // for vector
+
+#include "../../../src/collective/loop.h"  // for Loop
+
+namespace xgboost::collective {
+namespace {
+class LoopTest : public ::testing::Test {  // Fixture: a connected loopback TCP pair plus a Loop.
+ protected:
+  std::pair<TCPSocket, TCPSocket> pair_;  // first: accepted end, second: connecting end.
+  std::shared_ptr<Loop> loop_;
+
+ protected:
+  void SetUp() override {
+    system::SocketStartup();
+    std::chrono::seconds timeout{1};  // Used for both Connect() and the Loop.
+    // Bind and listen on an IPv4 socket; `first` temporarily holds the listening socket.
+    auto domain = SockDomain::kV4;
+    pair_.first = TCPSocket::Create(domain);
+    auto port = pair_.first.BindHost();
+    pair_.first.Listen();
+    // Connect `second` over loopback and switch it to non-blocking mode.
+    auto const& addr = SockAddrV4::Loopback().Addr();
+    auto rc = Connect(StringView{addr}, port, 1, timeout, &pair_.second);
+    ASSERT_TRUE(rc.OK());
+    rc = pair_.second.NonBlocking(true);
+    ASSERT_TRUE(rc.OK());
+    // Replace the listening socket with the accepted connection, also non-blocking.
+    pair_.first = pair_.first.Accept();
+    rc = pair_.first.NonBlocking(true);
+    ASSERT_TRUE(rc.OK());
+
+    loop_ = std::make_shared<Loop>(timeout);
+  }
+
+  void TearDown() override {
+    pair_ = decltype(pair_){};  // Reset both sockets before SocketFinalize().
+    system::SocketFinalize();
+  }
+};
+}  // namespace
+
+TEST_F(LoopTest, Timeout) {  // A read with nothing sent must fail with errc::timed_out.
+  std::vector<std::int8_t> data(1);
+  Loop::Op op{Loop::Op::kRead, 0, data.data(), data.size(), &pair_.second, 0};
+  loop_->Submit(op);
+  auto rc = loop_->Block();  // Returns once the loop has stopped on the poll failure.
+  ASSERT_FALSE(rc.OK());
+  ASSERT_EQ(rc.Code(), std::make_error_code(std::errc::timed_out)) << rc.Report();
+}
+
+TEST_F(LoopTest, Op) {  // One byte written on one end must arrive on the other end.
+  TCPSocket& send = pair_.first;
+  TCPSocket& recv = pair_.second;
+
+  std::vector<std::int8_t> wbuf(1, 1);
+  std::vector<std::int8_t> rbuf(1, 0);
+
+  Loop::Op wop{Loop::Op::kWrite, 0, wbuf.data(), wbuf.size(), &send, 0};
+  Loop::Op rop{Loop::Op::kRead, 0, rbuf.data(), rbuf.size(), &recv, 0};
+  // Queue both transfers, then wait for the loop to finish them.
+  loop_->Submit(wop);
+  loop_->Submit(rop);
+
+  auto rc = loop_->Block();
+  ASSERT_TRUE(rc.OK()) << rc.Report();
+
+  ASSERT_EQ(rbuf[0], wbuf[0]);
+}
+}  // namespace xgboost::collective

From 8c676c889d380a32ed61ee2b69596c3f7ef154b8 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Wed, 20 Sep 2023 23:29:51 +0800
Subject: [PATCH 16/59] Remove internal use of gpu_id. (#9568)

---
 .github/workflows/python_tests.yml            |   2 +-
 include/xgboost/context.h                     |  41 +++--
 include/xgboost/data.h                        |   6 +-
 include/xgboost/host_device_vector.h          |  31 ++--
 include/xgboost/linalg.h                      |  39 ++---
 include/xgboost/predictor.h                   |   4 +-
 src/c_api/c_api.cu                            |   4 +-
 src/collective/aggregator.cuh                 |  10 +-
 src/common/hist_util.cu                       |   8 +-
 src/common/hist_util.cuh                      |  27 +--
 src/common/host_device_vector.cc              |  11 +-
 src/common/host_device_vector.cu              |  68 ++++----
 src/common/numeric.cu                         |  10 +-
 src/common/optional_weight.h                  |   2 +-
 src/common/quantile.cu                        |  24 +--
 src/common/quantile.cuh                       |  51 +++---
 src/common/stats.cc                           |  10 +-
 src/common/stats.cu                           |  12 +-
 src/common/stats.cuh                          |   4 +-
 src/common/transform.h                        |  18 +-
 src/context.cc                                |  35 +++-
 src/data/data.cc                              |  18 +-
 src/data/data.cu                              |   8 +-
 src/data/device_adapter.cuh                   |  54 +++---
 src/data/ellpack_page.cu                      |  61 +++----
 src/data/ellpack_page.cuh                     |  59 ++++---
 src/data/ellpack_page_source.cu               |   2 +-
 src/data/ellpack_page_source.h                |   4 +-
 src/data/iterative_dmatrix.cc                 |   5 +-
 src/data/iterative_dmatrix.cu                 |  21 ++-
 src/data/proxy_dmatrix.cc                     |   8 +-
 src/data/proxy_dmatrix.cu                     |   8 +-
 src/data/proxy_dmatrix.h                      |   2 +-
 src/data/simple_dmatrix.cc                    |   2 +-
 src/data/simple_dmatrix.cu                    |  18 +-
 src/data/simple_dmatrix.cuh                   |  15 +-
 src/data/sparse_page_dmatrix.cu               |   3 +-
 src/data/sparse_page_source.cu                |   8 +-
 src/gbm/gbtree.cc                             |  18 +-
 src/learner.cc                                |  22 +--
 src/linear/updater_gpu_coordinate.cu          |  16 +-
 src/metric/auc.cc                             |  30 ++--
 src/metric/auc.cu                             |   2 +-
 src/metric/multiclass_metric.cu               |  14 +-
 src/metric/rank_metric.cu                     |   4 +-
 src/metric/survival_metric.cu                 |  12 +-
 src/objective/adaptive.h                      |   2 +-
 src/objective/aft_obj.cu                      |   6 +-
 src/objective/hinge.cu                        |  16 +-
 src/objective/init_estimation.cc              |   4 +-
 src/objective/lambdarank_obj.cc               |   8 +-
 src/objective/lambdarank_obj.cu               |   8 +-
 src/objective/multiclass_obj.cu               |   4 +-
 src/objective/quantile_obj.cu                 |  12 +-
 src/objective/regression_obj.cu               |  28 +--
 src/predictor/gpu_predictor.cu                | 159 +++++++++---------
 src/predictor/predictor.cc                    |   4 +-
 src/tree/fit_stump.cc                         |   8 +-
 src/tree/fit_stump.cu                         |  10 +-
 src/tree/gpu_hist/evaluate_splits.cu          |   2 +-
 src/tree/gpu_hist/evaluate_splits.cuh         |   6 +-
 src/tree/gpu_hist/evaluator.cu                |  11 +-
 src/tree/gpu_hist/feature_groups.cuh          |   9 +-
 src/tree/gpu_hist/gradient_based_sampler.cu   |  12 +-
 src/tree/gpu_hist/row_partitioner.cu          |   6 +-
 src/tree/gpu_hist/row_partitioner.cuh         |   4 +-
 src/tree/hist/evaluate_splits.h               |   4 +-
 src/tree/split_evaluator.h                    |  22 ++-
 src/tree/updater_colmaker.cc                  |   2 +-
 src/tree/updater_gpu_hist.cu                  |  52 +++---
 src/tree/updater_quantile_hist.cc             |   2 +-
 .../test_nccl_device_communicator.cu          |   6 +-
 tests/cpp/common/test_hist_util.cc            |   4 +-
 tests/cpp/common/test_hist_util.cu            |  35 ++--
 tests/cpp/common/test_host_device_vector.cu   |  39 ++---
 tests/cpp/common/test_linalg.cu               |   2 +-
 tests/cpp/common/test_quantile.cc             |  10 +-
 tests/cpp/common/test_quantile.cu             |  97 +++++------
 tests/cpp/common/test_ranking_utils.cc        |   4 +-
 tests/cpp/common/test_ranking_utils.cu        |   4 +-
 tests/cpp/common/test_stats.cc                |  20 +--
 tests/cpp/common/test_stats.cu                |  14 +-
 tests/cpp/common/test_transform_range.cc      |  50 +++---
 tests/cpp/common/test_transform_range.cu      |   5 +
 tests/cpp/data/test_device_adapter.cu         |   6 +-
 tests/cpp/data/test_ellpack_page.cu           |  22 +--
 tests/cpp/data/test_gradient_index.cc         |   2 +-
 tests/cpp/data/test_iterative_dmatrix.cu      |  21 +--
 tests/cpp/data/test_metainfo.cc               |  12 +-
 tests/cpp/data/test_proxy_dmatrix.cc          |  28 ++-
 tests/cpp/data/test_proxy_dmatrix.cu          |  10 +-
 tests/cpp/data/test_simple_dmatrix.cc         |   4 +-
 tests/cpp/data/test_sparse_page_dmatrix.cu    |  14 +-
 tests/cpp/gbm/test_gbtree.cc                  |  10 +-
 tests/cpp/gbm/test_gbtree.cu                  |   8 +-
 tests/cpp/helpers.cc                          |  39 +++--
 tests/cpp/helpers.cu                          |   2 +-
 tests/cpp/helpers.h                           |   9 +-
 tests/cpp/histogram_helpers.h                 |  11 +-
 tests/cpp/metric/test_auc.h                   |   2 +-
 tests/cpp/metric/test_elementwise_metric.cc   |   6 +-
 tests/cpp/metric/test_elementwise_metric.h    |  11 +-
 tests/cpp/metric/test_rank_metric.h           |   2 +-
 tests/cpp/objective/test_lambdarank_obj.cc    |   8 +-
 tests/cpp/objective/test_quantile_obj.cc      |   2 +-
 tests/cpp/predictor/test_cpu_predictor.cc     |   2 +-
 tests/cpp/predictor/test_gpu_predictor.cu     |  30 ++--
 tests/cpp/predictor/test_predictor.cc         |  14 +-
 tests/cpp/test_context.cc                     |  19 ++-
 tests/cpp/test_context.cu                     |   5 +-
 tests/cpp/test_serialization.cc               |   4 +-
 .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 101 ++++++-----
 .../gpu_hist/test_gradient_based_sampler.cu   |   8 +-
 tests/cpp/tree/gpu_hist/test_histogram.cu     |  39 ++---
 .../cpp/tree/gpu_hist/test_row_partitioner.cu |  15 +-
 tests/cpp/tree/hist/test_evaluate_splits.cc   |   2 +-
 tests/cpp/tree/test_evaluate_splits.h         |   8 +-
 tests/cpp/tree/test_gpu_hist.cu               |  42 ++---
 tests/cpp/tree/test_histmaker.cc              |   6 +-
 .../cpp/tree/test_multi_target_tree_model.cc  |   6 +-
 tests/cpp/tree/test_tree_stat.cc              |   4 +-
 121 files changed, 1012 insertions(+), 1044 deletions(-)
 create mode 100644 tests/cpp/common/test_transform_range.cu

diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml
index 532c9277a..e9704c75d 100644
--- a/.github/workflows/python_tests.yml
+++ b/.github/workflows/python_tests.yml
@@ -190,7 +190,7 @@ jobs:
       run: |
         mkdir build_msvc
         cd build_msvc
-        cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DGOOGLE_TEST=ON  -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON
+        cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON
         cmake --build . --config Release --parallel $(nproc)
 
     - name: Install Python package
diff --git a/include/xgboost/context.h b/include/xgboost/context.h
index 262733b22..7748db9f9 100644
--- a/include/xgboost/context.h
+++ b/include/xgboost/context.h
@@ -29,31 +29,37 @@ struct DeviceSym {
  *        viewing types like `linalg::TensorView`.
  */
 struct DeviceOrd {
+  // Constant representing the device ID of CPU.
+  static bst_d_ordinal_t constexpr CPUOrdinal() { return -1; }
+  static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; }
+
   enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU};
   // CUDA device ordinal.
-  bst_d_ordinal_t ordinal{-1};
+  bst_d_ordinal_t ordinal{CPUOrdinal()};
 
   [[nodiscard]] bool IsCUDA() const { return device == kCUDA; }
   [[nodiscard]] bool IsCPU() const { return device == kCPU; }
 
-  DeviceOrd() = default;
+  constexpr DeviceOrd() = default;
   constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {}
 
-  DeviceOrd(DeviceOrd const& that) = default;
-  DeviceOrd& operator=(DeviceOrd const& that) = default;
-  DeviceOrd(DeviceOrd&& that) = default;
-  DeviceOrd& operator=(DeviceOrd&& that) = default;
+  constexpr DeviceOrd(DeviceOrd const& that) = default;
+  constexpr DeviceOrd& operator=(DeviceOrd const& that) = default;
+  constexpr DeviceOrd(DeviceOrd&& that) = default;
+  constexpr DeviceOrd& operator=(DeviceOrd&& that) = default;
 
   /**
    * @brief Constructor for CPU.
    */
-  [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, -1}; }
+  [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, CPUOrdinal()}; }
   /**
    * @brief Constructor for CUDA device.
    *
    * @param ordinal CUDA device ordinal.
    */
-  [[nodiscard]] static auto CUDA(bst_d_ordinal_t ordinal) { return DeviceOrd{kCUDA, ordinal}; }
+  [[nodiscard]] static constexpr auto CUDA(bst_d_ordinal_t ordinal) {
+    return DeviceOrd{kCUDA, ordinal};
+  }
 
   [[nodiscard]] bool operator==(DeviceOrd const& that) const {
     return device == that.device && ordinal == that.ordinal;
@@ -78,25 +84,26 @@ struct DeviceOrd {
 
 static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t));
 
+std::ostream& operator<<(std::ostream& os, DeviceOrd ord);
+
 /**
  * @brief Runtime context for XGBoost. Contains information like threads and device.
  */
 struct Context : public XGBoostParameter<Context> {
  private:
+  // User interfacing parameter for device ordinal
   std::string device{DeviceSym::CPU()};  // NOLINT
-  // The device object for the current context. We are in the middle of replacing the
-  // `gpu_id` with this device field.
+  // The device ordinal set by user
   DeviceOrd device_{DeviceOrd::CPU()};
 
  public:
-  // Constant representing the device ID of CPU.
-  static bst_d_ordinal_t constexpr kCpuId = -1;
-  static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; }
   static std::int64_t constexpr kDefaultSeed = 0;
 
  public:
   Context();
 
+  void Init(Args const& kwargs);
+
   template <typename Container>
   Args UpdateAllowUnknown(Container const& kwargs) {
     auto args = XGBoostParameter<Context>::UpdateAllowUnknown(kwargs);
@@ -104,7 +111,6 @@ struct Context : public XGBoostParameter<Context> {
     return args;
   }
 
-  std::int32_t gpu_id{kCpuId};
   // The number of threads to use if OpenMP is enabled. If equals 0, use the system default.
   std::int32_t nthread{0};  // NOLINT
   // stored random seed
@@ -116,7 +122,8 @@ struct Context : public XGBoostParameter<Context> {
   bool validate_parameters{false};
 
   /**
-   * @brief Configure the parameter `gpu_id'.
+   * @brief Configure the parameter `device'. Deprecated, will remove once `gpu_id` is
+   *        removed.
    *
    * @param require_gpu Whether GPU is explicitly required by the user through other
    *                    configurations.
@@ -212,9 +219,7 @@ struct Context : public XGBoostParameter<Context> {
  private:
   void SetDeviceOrdinal(Args const& kwargs);
   Context& SetDevice(DeviceOrd d) {
-    this->device_ = d;
-    this->gpu_id = d.ordinal;  // this can be removed once we move away from `gpu_id`.
-    this->device = d.Name();
+    this->device = (this->device_ = d).Name();
     return *this;
   }
 
diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index eae2f612b..c232819f9 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -106,10 +106,10 @@ class MetaInfo {
   MetaInfo& operator=(MetaInfo&& that) = default;
   MetaInfo& operator=(MetaInfo const& that) = delete;
 
-  /*!
-   * \brief Validate all metainfo.
+  /**
+   * @brief Validate all metainfo.
    */
-  void Validate(int32_t device) const;
+  void Validate(DeviceOrd device) const;
 
   MetaInfo Slice(common::Span<int32_t const> ridxs) const;
 
diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h
index ed7117d65..9a53d3858 100644
--- a/include/xgboost/host_device_vector.h
+++ b/include/xgboost/host_device_vector.h
@@ -88,9 +88,9 @@ class HostDeviceVector {
   static_assert(std::is_standard_layout<T>::value, "HostDeviceVector admits only POD types");
 
  public:
-  explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1);
-  HostDeviceVector(std::initializer_list<T> init, int device = -1);
-  explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
+  explicit HostDeviceVector(size_t size = 0, T v = T(), DeviceOrd device = DeviceOrd::CPU());
+  HostDeviceVector(std::initializer_list<T> init, DeviceOrd device = DeviceOrd::CPU());
+  explicit HostDeviceVector(const std::vector<T>& init, DeviceOrd device = DeviceOrd::CPU());
   ~HostDeviceVector();
 
   HostDeviceVector(const HostDeviceVector<T>&) = delete;
@@ -99,17 +99,9 @@ class HostDeviceVector {
   HostDeviceVector<T>& operator=(const HostDeviceVector<T>&) = delete;
   HostDeviceVector<T>& operator=(HostDeviceVector<T>&&);
 
-  bool Empty() const { return Size() == 0; }
-  size_t Size() const;
-  int DeviceIdx() const;
-  DeviceOrd Device() const {
-    auto idx = this->DeviceIdx();
-    if (idx == DeviceOrd::CPU().ordinal) {
-      return DeviceOrd::CPU();
-    } else {
-      return DeviceOrd::CUDA(idx);
-    }
-  }
+  [[nodiscard]] bool Empty() const { return Size() == 0; }
+  [[nodiscard]] std::size_t Size() const;
+  [[nodiscard]] DeviceOrd Device() const;
   common::Span<T> DeviceSpan();
   common::Span<const T> ConstDeviceSpan() const;
   common::Span<const T> DeviceSpan() const { return ConstDeviceSpan(); }
@@ -135,13 +127,12 @@ class HostDeviceVector {
   const std::vector<T>& ConstHostVector() const;
   const std::vector<T>& HostVector() const {return ConstHostVector(); }
 
-  bool HostCanRead() const;
-  bool HostCanWrite() const;
-  bool DeviceCanRead() const;
-  bool DeviceCanWrite() const;
-  GPUAccess DeviceAccess() const;
+  [[nodiscard]] bool HostCanRead() const;
+  [[nodiscard]] bool HostCanWrite() const;
+  [[nodiscard]] bool DeviceCanRead() const;
+  [[nodiscard]] bool DeviceCanWrite() const;
+  [[nodiscard]] GPUAccess DeviceAccess() const;
 
-  void SetDevice(int device) const;
   void SetDevice(DeviceOrd device) const;
 
   void Resize(size_t new_size, T v = T());
diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h
index b3ae2f169..d56170aec 100644
--- a/include/xgboost/linalg.h
+++ b/include/xgboost/linalg.h
@@ -659,13 +659,13 @@ auto MakeVec(T *ptr, size_t s, DeviceOrd device = DeviceOrd::CPU()) {
 
 template <typename T>
 auto MakeVec(HostDeviceVector<T> *data) {
-  return MakeVec(data->DeviceIdx() == -1 ? data->HostPointer() : data->DevicePointer(),
-                 data->Size(), data->Device());
+  return MakeVec(data->Device().IsCPU() ? data->HostPointer() : data->DevicePointer(), data->Size(),
+                 data->Device());
 }
 
 template <typename T>
 auto MakeVec(HostDeviceVector<T> const *data) {
-  return MakeVec(data->DeviceIdx() == -1 ? data->ConstHostPointer() : data->ConstDevicePointer(),
+  return MakeVec(data->Device().IsCPU() ? data->ConstHostPointer() : data->ConstDevicePointer(),
                  data->Size(), data->Device());
 }
 
@@ -757,13 +757,13 @@ class Tensor {
   Order order_{Order::kC};
 
   template <typename I, std::int32_t D>
-  void Initialize(I const (&shape)[D], std::int32_t device) {
+  void Initialize(I const (&shape)[D], DeviceOrd device) {
     static_assert(D <= kDim, "Invalid shape.");
     std::copy(shape, shape + D, shape_);
     for (auto i = D; i < kDim; ++i) {
       shape_[i] = 1;
     }
-    if (device >= 0) {
+    if (device.IsCUDA()) {
       data_.SetDevice(device);
       data_.ConstDevicePointer();  // Pull to device;
     }
@@ -780,14 +780,11 @@ class Tensor {
    * See \ref TensorView for parameters of this constructor.
    */
   template <typename I, int32_t D>
-  explicit Tensor(I const (&shape)[D], std::int32_t device, Order order = kC)
-      : Tensor{common::Span<I const, D>{shape}, device, order} {}
-  template <typename I, int32_t D>
   explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC)
-      : Tensor{common::Span<I const, D>{shape}, device.ordinal, order} {}
+      : Tensor{common::Span<I const, D>{shape}, device, order} {}
 
   template <typename I, size_t D>
-  explicit Tensor(common::Span<I const, D> shape, std::int32_t device, Order order = kC)
+  explicit Tensor(common::Span<I const, D> shape, DeviceOrd device, Order order = kC)
       : order_{order} {
     // No device unroll as this is a host only function.
     std::copy(shape.data(), shape.data() + D, shape_);
@@ -795,11 +792,11 @@ class Tensor {
       shape_[i] = 1;
     }
     auto size = detail::CalcSize(shape_);
-    if (device >= 0) {
+    if (device.IsCUDA()) {
       data_.SetDevice(device);
     }
     data_.Resize(size);
-    if (device >= 0) {
+    if (device.IsCUDA()) {
       data_.DevicePointer();  // Pull to device
     }
   }
@@ -807,7 +804,7 @@ class Tensor {
    * Initialize from 2 host iterators.
    */
   template <typename It, typename I, int32_t D>
-  explicit Tensor(It begin, It end, I const (&shape)[D], std::int32_t device, Order order = kC)
+  explicit Tensor(It begin, It end, I const (&shape)[D], DeviceOrd device, Order order = kC)
       : order_{order} {
     auto &h_vec = data_.HostVector();
     h_vec.insert(h_vec.begin(), begin, end);
@@ -816,7 +813,7 @@ class Tensor {
   }
 
   template <typename I, int32_t D>
-  explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], std::int32_t device,
+  explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], DeviceOrd device,
                   Order order = kC)
       : order_{order} {
     auto &h_vec = data_.HostVector();
@@ -824,10 +821,6 @@ class Tensor {
     // shape
     this->Initialize(shape, device);
   }
-  template <typename I, int32_t D>
-  explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], DeviceOrd device,
-                  Order order = kC)
-      : Tensor{data, shape, device.ordinal, order} {}
   /**
    * \brief Index operator. Not thread safe, should not be used in performance critical
    *        region. For more efficient indexing, consider getting a view first.
@@ -944,9 +937,7 @@ class Tensor {
   /**
    * \brief Set device ordinal for this tensor.
    */
-  void SetDevice(int32_t device) const { data_.SetDevice(device); }
   void SetDevice(DeviceOrd device) const { data_.SetDevice(device); }
-  [[nodiscard]] int32_t DeviceIdx() const { return data_.DeviceIdx(); }
   [[nodiscard]] DeviceOrd Device() const { return data_.Device(); }
 };
 
@@ -962,7 +953,7 @@ using Vector = Tensor<T, 1>;
 template <typename T, typename... Index>
 auto Empty(Context const *ctx, Index &&...index) {
   Tensor<T, sizeof...(Index)> t;
-  t.SetDevice(ctx->gpu_id);
+  t.SetDevice(ctx->Device());
   t.Reshape(index...);
   return t;
 }
@@ -973,7 +964,7 @@ auto Empty(Context const *ctx, Index &&...index) {
 template <typename T, typename... Index>
 auto Constant(Context const *ctx, T v, Index &&...index) {
   Tensor<T, sizeof...(Index)> t;
-  t.SetDevice(ctx->gpu_id);
+  t.SetDevice(ctx->Device());
   t.Reshape(index...);
   t.Data()->Fill(std::move(v));
   return t;
@@ -990,8 +981,8 @@ auto Zeros(Context const *ctx, Index &&...index) {
 // Only first axis is supported for now.
 template <typename T, int32_t D>
 void Stack(Tensor<T, D> *l, Tensor<T, D> const &r) {
-  if (r.DeviceIdx() >= 0) {
-    l->SetDevice(r.DeviceIdx());
+  if (r.Device().IsCUDA()) {
+    l->SetDevice(r.Device());
   }
   l->ModifyInplace([&](HostDeviceVector<T> *data, common::Span<size_t, D> shape) {
     for (size_t i = 1; i < D; ++i) {
diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h
index 2c69cf648..25571213d 100644
--- a/include/xgboost/predictor.h
+++ b/include/xgboost/predictor.h
@@ -52,9 +52,9 @@ class PredictionContainer : public DMatrixCache<PredictionCacheEntry> {
 
  public:
   PredictionContainer() : DMatrixCache<PredictionCacheEntry>{DefaultSize()} {}
-  PredictionCacheEntry& Cache(std::shared_ptr<DMatrix> m, std::int32_t device) {
+  PredictionCacheEntry& Cache(std::shared_ptr<DMatrix> m, DeviceOrd device) {
     auto p_cache = this->CacheItem(m);
-    if (device != Context::kCpuId) {
+    if (device.IsCUDA()) {
       p_cache->predictions.SetDevice(device);
     }
     return *p_cache;
diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu
index 1dddb1444..84a371558 100644
--- a/src/c_api/c_api.cu
+++ b/src/c_api/c_api.cu
@@ -66,7 +66,7 @@ void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> con
   auto hess_dev = dh::CudaGetPointerDevice(hess.data);
   CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device.";
   auto &gpair = *out_gpair;
-  gpair.SetDevice(grad_dev);
+  gpair.SetDevice(DeviceOrd::CUDA(grad_dev));
   gpair.Reshape(grad.Shape(0), grad.Shape(1));
   auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev));
   auto cuctx = ctx->CUDACtx();
@@ -144,7 +144,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
   if (learner->Ctx()->IsCUDA()) {
     CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
   }
-  p_predt->SetDevice(proxy->DeviceIdx());
+  p_predt->SetDevice(proxy->Device());
 
   auto &shape = learner->GetThreadLocal().prediction_shape;
   size_t n_samples = p_m->Info().num_row_;
diff --git a/src/collective/aggregator.cuh b/src/collective/aggregator.cuh
index a87a968ab..66766470b 100644
--- a/src/collective/aggregator.cuh
+++ b/src/collective/aggregator.cuh
@@ -15,8 +15,7 @@
 
 #include "communicator-inl.cuh"
 
-namespace xgboost {
-namespace collective {
+namespace xgboost::collective {
 
 /**
  * @brief Find the global sum of the given values across all workers.
@@ -31,10 +30,9 @@ namespace collective {
  * @param size Number of values to sum.
  */
 template <typename T>
-void GlobalSum(MetaInfo const& info, int device, T* values, size_t size) {
+void GlobalSum(MetaInfo const& info, DeviceOrd device, T* values, size_t size) {
   if (info.IsRowSplit()) {
-    collective::AllReduce<collective::Operation::kSum>(device, values, size);
+    collective::AllReduce<collective::Operation::kSum>(device.ordinal, values, size);
   }
 }
-}  // namespace collective
-}  // namespace xgboost
+}  // namespace xgboost::collective
diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu
index 2dfba7215..1f06c2a6f 100644
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -123,7 +123,7 @@ void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* s
       [=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
 }
 
-void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
+void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
                                 dh::device_vector<Entry>* p_sorted_entries,
                                 dh::device_vector<float>* p_sorted_weights,
                                 dh::caching_device_vector<size_t>* p_column_sizes_scan) {
@@ -240,13 +240,13 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
       sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple {
         return {0, e.index, e.fvalue};  // row_idx is not needed for scaning column size.
       });
-  detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature,
+  detail::GetColumnSizesScan(ctx->Device(), info.num_col_, num_cuts_per_feature,
                              IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
                              &column_sizes_scan);
   auto d_cuts_ptr = cuts_ptr.DeviceSpan();
   if (sketch_container->HasCategorical()) {
     auto p_weight = entry_weight.empty() ? nullptr : &entry_weight;
-    detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight,
+    detail::RemoveDuplicatedCategories(ctx->Device(), info, d_cuts_ptr, &sorted_entries, p_weight,
                                        &column_sizes_scan);
   }
 
@@ -347,7 +347,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b
 
   HistogramCuts cuts;
   SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_,
-                                   ctx->Ordinal());
+                                   ctx->Device());
   CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty());
   for (const auto& page : p_fmat->GetBatches<SparsePage>()) {
     std::size_t page_nnz = page.data.Size();
diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh
index d7be12749..3cd13030e 100644
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@@ -82,9 +82,9 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
 }
 
 template <std::uint32_t kBlockThreads, typename Kernel>
-std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
+std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shared_mem) {
   int n_mps = 0;
-  dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
+  dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device.ordinal));
   int n_blocks_per_mp = 0;
   dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
                                                               kBlockThreads, shared_mem));
@@ -106,11 +106,11 @@ std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t s
  * \param out_column_size Output buffer for the size of each column.
  */
 template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false>
-void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter,
+void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan<BatchIt> batch_iter,
                                data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) {
   thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0);
 
-  std::size_t max_shared_memory = dh::MaxSharedMemory(device);
+  std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal);
   // Not strictly correct as we should use number of samples to determine the type of
   // counter. However, the sample size is not known due to sliding window on number of
   // elements.
@@ -154,7 +154,7 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter
 }
 
 template <typename BatchIt>
-void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature,
+void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature,
                         IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
                         HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
                         dh::caching_device_vector<size_t>* column_sizes_scan) {
@@ -215,7 +215,8 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
 // Count the valid entries in each column and copy them out.
 template <typename AdapterBatch, typename BatchIter>
 void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range,
-                            float missing, size_t columns, size_t cuts_per_feature, int device,
+                            float missing, size_t columns, size_t cuts_per_feature,
+                            DeviceOrd device,
                             HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
                             dh::caching_device_vector<size_t>* column_sizes_scan,
                             dh::device_vector<Entry>* sorted_entries) {
@@ -239,7 +240,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
 void SortByWeight(dh::device_vector<float>* weights,
                   dh::device_vector<Entry>* sorted_entries);
 
-void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
+void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
                                 dh::device_vector<Entry>* p_sorted_entries,
                                 dh::device_vector<float>* p_sorted_weights,
                                 dh::caching_device_vector<size_t>* p_column_sizes_scan);
@@ -277,7 +278,7 @@ inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t
 
 template <typename AdapterBatch>
 void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
-                          int device, size_t columns, size_t begin, size_t end,
+                          DeviceOrd device, size_t columns, size_t begin, size_t end,
                           float missing, SketchContainer *sketch_container,
                           int num_cuts) {
   // Copy current subset of valid elements into temporary storage and sort
@@ -316,11 +317,11 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
 template <typename Batch>
 void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
                                   int num_cuts_per_feature,
-                                  bool is_ranking, float missing, int device,
+                                  bool is_ranking, float missing, DeviceOrd device,
                                   size_t columns, size_t begin, size_t end,
                                   SketchContainer *sketch_container) {
   dh::XGBCachingDeviceAllocator<char> alloc;
-  dh::safe_cuda(cudaSetDevice(device));
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
   info.weights_.SetDevice(device);
   auto weights = info.weights_.ConstDeviceSpan();
 
@@ -412,14 +413,14 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
   size_t num_rows = batch.NumRows();
   size_t num_cols = batch.NumCols();
   size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows);
-  int32_t device = sketch_container->DeviceIdx();
+  auto device = sketch_container->DeviceIdx();
   bool weighted = !info.weights_.Empty();
 
   if (weighted) {
     sketch_batch_num_elements = detail::SketchBatchNumElements(
         sketch_batch_num_elements,
         num_rows, num_cols, std::numeric_limits<size_t>::max(),
-        device, num_cuts_per_feature, true);
+        device.ordinal, num_cuts_per_feature, true);
     for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
       size_t end =
           std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
@@ -432,7 +433,7 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
     sketch_batch_num_elements = detail::SketchBatchNumElements(
         sketch_batch_num_elements,
         num_rows, num_cols, std::numeric_limits<size_t>::max(),
-        device, num_cuts_per_feature, false);
+        device.ordinal, num_cuts_per_feature, false);
     for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
       size_t end =
           std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc
index 175a5cbf1..66d8024bd 100644
--- a/src/common/host_device_vector.cc
+++ b/src/common/host_device_vector.cc
@@ -33,19 +33,19 @@ struct HostDeviceVectorImpl {
 };
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int)
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd)
   : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(size, v);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int)
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd)
   : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int)
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd)
   : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }
@@ -81,7 +81,7 @@ template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
 
 template <typename T>
-int HostDeviceVector<T>::DeviceIdx() const { return -1; }
+DeviceOrd HostDeviceVector<T>::Device() const { return DeviceOrd::CPU(); }
 
 template <typename T>
 T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
@@ -165,9 +165,6 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
   return false;
 }
 
-template <typename T>
-void HostDeviceVector<T>::SetDevice(int) const {}
-
 template <typename T>
 void HostDeviceVector<T>::SetDevice(DeviceOrd) const {}
 
diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index 7acb6719b..5f7b71043 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -25,8 +25,8 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) {
 template <typename T>
 class HostDeviceVectorImpl {
  public:
-  HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) {
-    if (device >= 0) {
+  HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) {
+    if (device.IsCUDA()) {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
       data_d_->resize(size, v);
@@ -37,8 +37,8 @@ class HostDeviceVectorImpl {
 
   // Initializer can be std::vector<T> or std::initializer_list<T>
   template <class Initializer>
-  HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) {
-    if (device >= 0) {
+  HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) {
+    if (device.IsCUDA()) {
       gpu_access_ = GPUAccess::kWrite;
       LazyResizeDevice(init.size());
       Copy(init);
@@ -54,16 +54,16 @@ class HostDeviceVectorImpl {
     gpu_access_{that.gpu_access_} {}
 
   ~HostDeviceVectorImpl() {
-    if (device_ >= 0) {
+    if (device_.IsCUDA()) {
       SetDevice();
     }
   }
 
-  size_t Size() const {
+  [[nodiscard]] size_t Size() const {
     return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->size() : 0;
   }
 
-  int DeviceIdx() const { return device_; }
+  [[nodiscard]] DeviceOrd Device() const { return device_; }
 
   T* DevicePointer() {
     LazySyncDevice(GPUAccess::kWrite);
@@ -138,7 +138,7 @@ class HostDeviceVectorImpl {
     } else {
       auto ptr = other->ConstDevicePointer();
       SetDevice();
-      CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
+      CHECK_EQ(this->Device(), other->Device());
       dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
                                     ptr,
                                     other->Size() * sizeof(T),
@@ -156,24 +156,25 @@ class HostDeviceVectorImpl {
     return data_h_;
   }
 
-  void SetDevice(int device) {
+  void SetDevice(DeviceOrd device) {
     if (device_ == device) { return; }
-    if (device_ >= 0) {
+    if (device_.IsCUDA()) {
       LazySyncHost(GPUAccess::kNone);
     }
 
-    if (device_ >= 0 && device >= 0) {
-      CHECK_EQ(device_, device) << "New device ordinal is different from previous one.";
+    if (device_.IsCUDA() && device.IsCUDA()) {
+      CHECK_EQ(device_.ordinal, device.ordinal)
+          << "New device ordinal is different from previous one.";
     }
     device_ = device;
-    if (device_ >= 0) {
+    if (device_.IsCUDA()) {
       LazyResizeDevice(data_h_.size());
     }
   }
 
   void Resize(size_t new_size, T v) {
     if (new_size == Size()) { return; }
-    if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) {
+    if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
       // fast on-device resize
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
@@ -218,16 +219,16 @@ class HostDeviceVectorImpl {
     gpu_access_ = access;
   }
 
-  bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
-  bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
-  bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
-  bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
-  bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
-  bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
-  GPUAccess Access() const { return gpu_access_; }
+  [[nodiscard]] bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
+  [[nodiscard]] bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
+  [[nodiscard]] bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
+  [[nodiscard]] bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
+  [[nodiscard]] bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
+  [[nodiscard]] bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
+  [[nodiscard]] GPUAccess Access() const { return gpu_access_; }
 
  private:
-  int device_{-1};
+  DeviceOrd device_{DeviceOrd::CPU()};
   std::vector<T> data_h_{};
   std::unique_ptr<dh::device_vector<T>> data_d_{};
   GPUAccess gpu_access_{GPUAccess::kNone};
@@ -259,11 +260,11 @@ class HostDeviceVectorImpl {
   }
 
   void SetDevice() {
-    CHECK_GE(device_, 0);
+    CHECK_GE(device_.ordinal, 0);
     if (cudaSetDeviceHandler == nullptr) {
-      dh::safe_cuda(cudaSetDevice(device_));
+      dh::safe_cuda(cudaSetDevice(device_.ordinal));
     } else {
-      (*cudaSetDeviceHandler)(device_);
+      (*cudaSetDeviceHandler)(device_.ordinal);
     }
 
     if (!data_d_) {
@@ -273,15 +274,15 @@ class HostDeviceVectorImpl {
 };
 
 template<typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd device)
     : impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd device)
     : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd device)
     : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
 
 template <typename T>
@@ -309,7 +310,9 @@ template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
 
 template <typename T>
-int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
+DeviceOrd HostDeviceVector<T>::Device() const {
+  return impl_->Device();
+}
 
 template <typename T>
 T* HostDeviceVector<T>::DevicePointer() {
@@ -389,14 +392,9 @@ GPUAccess HostDeviceVector<T>::DeviceAccess() const {
   return impl_->Access();
 }
 
-template <typename T>
-void HostDeviceVector<T>::SetDevice(int device) const {
-  impl_->SetDevice(device);
-}
-
 template <typename T>
 void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
-  impl_->SetDevice(device.ordinal);
+  impl_->SetDevice(device);
 }
 
 template <typename T>
diff --git a/src/common/numeric.cu b/src/common/numeric.cu
index b292edf1a..a326b9355 100644
--- a/src/common/numeric.cu
+++ b/src/common/numeric.cu
@@ -8,16 +8,12 @@
 #include "xgboost/context.h"             // Context
 #include "xgboost/host_device_vector.h"  // HostDeviceVector
 
-namespace xgboost {
-namespace common {
-namespace cuda_impl {
+namespace xgboost::common::cuda_impl {
 double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
-  values.SetDevice(ctx->gpu_id);
+  values.SetDevice(ctx->Device());
   auto const d_values = values.ConstDeviceSpan();
   dh::XGBCachingDeviceAllocator<char> alloc;
   return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
                     thrust::plus<float>{});
 }
-}  // namespace cuda_impl
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common::cuda_impl
diff --git a/src/common/optional_weight.h b/src/common/optional_weight.h
index c2844d73f..997f3fad1 100644
--- a/src/common/optional_weight.h
+++ b/src/common/optional_weight.h
@@ -24,7 +24,7 @@ struct OptionalWeights {
 inline OptionalWeights MakeOptionalWeights(Context const* ctx,
                                            HostDeviceVector<float> const& weights) {
   if (ctx->IsCUDA()) {
-    weights.SetDevice(ctx->gpu_id);
+    weights.SetDevice(ctx->Device());
   }
   return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()};
 }
diff --git a/src/common/quantile.cu b/src/common/quantile.cu
index 25c4543c6..776752476 100644
--- a/src/common/quantile.cu
+++ b/src/common/quantile.cu
@@ -207,10 +207,10 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
 // summary does the output element come from) result by definition of merged rank.  So we
 // run it in 2 passes to obtain the merge path and then customize the standard merge
 // algorithm.
-void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
+void MergeImpl(DeviceOrd device, Span<SketchEntry const> const &d_x,
                Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
                Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
-  dh::safe_cuda(cudaSetDevice(device));
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
   CHECK_EQ(d_x.size() + d_y.size(), out.size());
   CHECK_EQ(x_ptr.size(), out_ptr.size());
   CHECK_EQ(y_ptr.size(), out_ptr.size());
@@ -308,7 +308,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
 void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
                            common::Span<OffsetT> cuts_ptr,
                            size_t total_cuts, Span<float> weights) {
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
   Span<SketchEntry> out;
   dh::device_vector<SketchEntry> cuts;
   bool first_window = this->Current().empty();
@@ -367,7 +367,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
    * pruning or merging. We preserve the first type and remove the second type.
    */
   timer_.Start(__func__);
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
   CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
   dh::XGBCachingDeviceAllocator<char> alloc;
 
@@ -407,7 +407,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
 
 void SketchContainer::Prune(size_t to) {
   timer_.Start(__func__);
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
 
   OffsetT to_total = 0;
   auto& h_columns_ptr = columns_ptr_b_.HostVector();
@@ -442,7 +442,7 @@ void SketchContainer::Prune(size_t to) {
 
 void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
                             Span<SketchEntry const> that) {
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
   timer_.Start(__func__);
   if (this->Current().size() == 0) {
     CHECK_EQ(this->columns_ptr_.HostVector().back(), 0);
@@ -477,7 +477,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
 }
 
 void SketchContainer::FixError() {
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
   auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
   auto in = dh::ToSpan(this->Current());
   dh::LaunchN(in.size(), [=] __device__(size_t idx) {
@@ -502,7 +502,7 @@ void SketchContainer::FixError() {
 }
 
 void SketchContainer::AllReduce(bool is_column_split) {
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
   auto world = collective::GetWorldSize();
   if (world == 1 || is_column_split) {
     return;
@@ -529,15 +529,15 @@ void SketchContainer::AllReduce(bool is_column_split) {
   auto offset = rank * d_columns_ptr.size();
   thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(),
                gathered_ptrs.begin() + offset);
-  collective::AllReduce<collective::Operation::kSum>(device_, gathered_ptrs.data().get(),
+  collective::AllReduce<collective::Operation::kSum>(device_.ordinal, gathered_ptrs.data().get(),
                                                      gathered_ptrs.size());
 
   // Get the data from all workers.
   std::vector<size_t> recv_lengths;
   dh::caching_device_vector<char> recvbuf;
-  collective::AllGatherV(device_, this->Current().data().get(),
+  collective::AllGatherV(device_.ordinal, this->Current().data().get(),
                          dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf);
-  collective::Synchronize(device_);
+  collective::Synchronize(device_.ordinal);
 
   // Segment the received data.
   auto s_recvbuf = dh::ToSpan(recvbuf);
@@ -584,7 +584,7 @@ struct InvalidCatOp {
 
 void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
   timer_.Start(__func__);
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
   p_cuts->min_vals_.Resize(num_columns_);
 
   // Sync between workers.
diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh
index fedbdbd82..b47834782 100644
--- a/src/common/quantile.cuh
+++ b/src/common/quantile.cuh
@@ -41,7 +41,7 @@ class SketchContainer {
   bst_row_t num_rows_;
   bst_feature_t num_columns_;
   int32_t num_bins_;
-  int32_t device_;
+  DeviceOrd device_;
 
   // Double buffer as neither prune nor merge can be performed inplace.
   dh::device_vector<SketchEntry> entries_a_;
@@ -93,35 +93,32 @@ class SketchContainer {
    * \param num_rows    Total number of rows in known dataset (typically the rows in current worker).
    * \param device      GPU ID.
    */
-   SketchContainer(HostDeviceVector<FeatureType> const &feature_types,
-                   int32_t max_bin, bst_feature_t num_columns,
-                   bst_row_t num_rows, int32_t device)
-       : num_rows_{num_rows},
-         num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
-     CHECK_GE(device, 0);
-     // Initialize Sketches for this dmatrix
-     this->columns_ptr_.SetDevice(device_);
-     this->columns_ptr_.Resize(num_columns + 1);
-     this->columns_ptr_b_.SetDevice(device_);
-     this->columns_ptr_b_.Resize(num_columns + 1);
+  SketchContainer(HostDeviceVector<FeatureType> const& feature_types, int32_t max_bin,
+                  bst_feature_t num_columns, bst_row_t num_rows, DeviceOrd device)
+      : num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
+    CHECK(device.IsCUDA());
+    // Initialize Sketches for this dmatrix
+    this->columns_ptr_.SetDevice(device_);
+    this->columns_ptr_.Resize(num_columns + 1);
+    this->columns_ptr_b_.SetDevice(device_);
+    this->columns_ptr_b_.Resize(num_columns + 1);
 
-     this->feature_types_.Resize(feature_types.Size());
-     this->feature_types_.Copy(feature_types);
-     // Pull to device.
-     this->feature_types_.SetDevice(device);
-     this->feature_types_.ConstDeviceSpan();
-     this->feature_types_.ConstHostSpan();
+    this->feature_types_.Resize(feature_types.Size());
+    this->feature_types_.Copy(feature_types);
+    // Pull to device.
+    this->feature_types_.SetDevice(device);
+    this->feature_types_.ConstDeviceSpan();
+    this->feature_types_.ConstHostSpan();
 
-     auto d_feature_types = feature_types_.ConstDeviceSpan();
-     has_categorical_ =
-         !d_feature_types.empty() &&
-         thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types),
-                        common::IsCatOp{});
+    auto d_feature_types = feature_types_.ConstDeviceSpan();
+    has_categorical_ =
+        !d_feature_types.empty() &&
+        thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{});
 
-     timer_.Init(__func__);
-   }
+    timer_.Init(__func__);
+  }
   /* \brief Return GPU ID for this container. */
-  int32_t DeviceIdx() const { return device_; }
+  [[nodiscard]] DeviceOrd DeviceIdx() const { return device_; }
   /* \brief Whether the predictor matrix contains categorical features. */
   bool HasCategorical() const { return has_categorical_; }
   /* \brief Accumulate weights of duplicated entries in input. */
@@ -175,7 +172,7 @@ class SketchContainer {
   template <typename KeyComp = thrust::equal_to<size_t>>
   size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
     timer_.Start(__func__);
-    dh::safe_cuda(cudaSetDevice(device_));
+    dh::safe_cuda(cudaSetDevice(device_.ordinal));
     this->columns_ptr_.SetDevice(device_);
     Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
     CHECK_EQ(d_column_scan.size(), num_columns_ + 1);
diff --git a/src/common/stats.cc b/src/common/stats.cc
index 03ee00b87..aa73a07c3 100644
--- a/src/common/stats.cc
+++ b/src/common/stats.cc
@@ -15,8 +15,7 @@
 #include "xgboost/linalg.h"              // Tensor, UnravelIndex, Apply
 #include "xgboost/logging.h"             // CHECK_EQ
 
-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
             HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out) {
   if (!ctx->IsCPU()) {
@@ -46,8 +45,8 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
 }
 
 void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out) {
-  v.SetDevice(ctx->gpu_id);
-  out->SetDevice(ctx->gpu_id);
+  v.SetDevice(ctx->Device());
+  out->SetDevice(ctx->Device());
   out->Reshape(1);
 
   if (ctx->IsCPU()) {
@@ -62,5 +61,4 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
     cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
   }
 }
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
diff --git a/src/common/stats.cu b/src/common/stats.cu
index ab4871776..10c7565bc 100644
--- a/src/common/stats.cu
+++ b/src/common/stats.cu
@@ -15,14 +15,12 @@
 #include "xgboost/host_device_vector.h"  // HostDeviceVector
 #include "xgboost/linalg.h"              // linalg::TensorView, UnravelIndex, Apply
 
-namespace xgboost {
-namespace common {
-namespace cuda_impl {
+namespace xgboost::common::cuda_impl {
 void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
             common::OptionalWeights weights, linalg::Tensor<float, 1>* out) {
   CHECK_GE(t.Shape(1), 1);
   HostDeviceVector<std::size_t> segments(t.Shape(1) + 1, 0);
-  segments.SetDevice(ctx->gpu_id);
+  segments.SetDevice(ctx->Device());
   auto d_segments = segments.DeviceSpan();
   dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(),
               [=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; });
@@ -31,7 +29,7 @@ void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
         return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
       });
 
-  out->SetDevice(ctx->gpu_id);
+  out->SetDevice(ctx->Device());
   out->Reshape(t.Shape(1));
   if (weights.Empty()) {
     common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it,
@@ -60,6 +58,4 @@ void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorV
   dh::TemporaryArray<char> temp{bytes};
   cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s);
 }
-}  // namespace cuda_impl
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common::cuda_impl
diff --git a/src/common/stats.cuh b/src/common/stats.cuh
index f31233461..66ab3953e 100644
--- a/src/common/stats.cuh
+++ b/src/common/stats.cuh
@@ -160,7 +160,7 @@ void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, Se
   auto d_sorted_idx = dh::ToSpan(sorted_idx);
   auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx));
 
-  quantiles->SetDevice(ctx->gpu_id);
+  quantiles->SetDevice(ctx->Device());
   quantiles->Resize(n_segments);
   auto d_results = quantiles->DeviceSpan();
 
@@ -220,7 +220,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
                                 scan_val, weights_cdf.begin());
 
   auto n_segments = std::distance(seg_beg, seg_end) - 1;
-  quantiles->SetDevice(ctx->gpu_id);
+  quantiles->SetDevice(ctx->Device());
   quantiles->Resize(n_segments);
   auto d_results = quantiles->DeviceSpan();
   auto d_weight_cdf = dh::ToSpan(weights_cdf);
diff --git a/src/common/transform.h b/src/common/transform.h
index a7b96766c..3329439a5 100644
--- a/src/common/transform.h
+++ b/src/common/transform.h
@@ -60,8 +60,8 @@ class Transform {
   template <typename Functor>
   struct Evaluator {
    public:
-    Evaluator(Functor func, Range range, int32_t n_threads, int32_t device_idx)
-        : func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device_idx} {}
+    Evaluator(Functor func, Range range, int32_t n_threads, DeviceOrd device)
+        : func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device} {}
 
     /*!
      * \brief Evaluate the functor with input pointers to HostDeviceVector.
@@ -71,7 +71,7 @@ class Transform {
      */
     template <typename... HDV>
     void Eval(HDV... vectors) const {
-      bool on_device = device_ >= 0;
+      bool on_device = device_.IsCUDA();
 
       if (on_device) {
         LaunchCUDA(func_, vectors...);
@@ -116,11 +116,11 @@ class Transform {
     }
     // Recursive unpack for Shard.
     template <typename T>
-    void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
+    void UnpackShard(DeviceOrd device, const HostDeviceVector<T> *vector) const {
       vector->SetDevice(device);
     }
     template <typename Head, typename... Rest>
-    void UnpackShard(int device,
+    void UnpackShard(DeviceOrd device,
                      const HostDeviceVector<Head> *_vector,
                      const HostDeviceVector<Rest> *... _vectors) const {
       _vector->SetDevice(device);
@@ -140,7 +140,7 @@ class Transform {
       // granularity is used in data vector.
       size_t shard_size = range_size;
       Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
-      dh::safe_cuda(cudaSetDevice(device_));
+      dh::safe_cuda(cudaSetDevice(device_.ordinal));
       const int kGrids =
           static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
       if (kGrids == 0) {
@@ -174,7 +174,7 @@ class Transform {
     /*! \brief Range object specifying parallel threads index range. */
     Range range_;
     int32_t n_threads_;
-    int32_t device_;
+    DeviceOrd device_;
   };
 
  public:
@@ -192,8 +192,8 @@ class Transform {
    */
   template <typename Functor>
   static Evaluator<Functor> Init(Functor func, Range const range, int32_t n_threads,
-                                 int32_t device_idx) {
-    return Evaluator<Functor>{func, std::move(range), n_threads, device_idx};
+                                 DeviceOrd device) {
+    return Evaluator<Functor>{func, std::move(range), n_threads, device};
   }
 };
 
diff --git a/src/context.cc b/src/context.cc
index 1acaa6443..850015c26 100644
--- a/src/context.cc
+++ b/src/context.cc
@@ -20,7 +20,6 @@ namespace xgboost {
 
 DMLC_REGISTER_PARAMETER(Context);
 
-bst_d_ordinal_t constexpr Context::kCpuId;
 std::int64_t constexpr Context::kDefaultSeed;
 
 Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}
@@ -82,7 +81,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
     return std::nullopt;
   }
 
-  std::int32_t parsed_id{Context::kCpuId};
+  std::int32_t parsed_id{DeviceOrd::CPUOrdinal()};
   auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id);
   if (res.ec != std::errc()) {
     return std::nullopt;
@@ -119,7 +118,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
 
   auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':');
   DeviceOrd device;
-  device.ordinal = Context::InvalidOrdinal();  // mark it invalid for check.
+  device.ordinal = DeviceOrd::InvalidOrdinal();  // mark it invalid for check.
   if (split_it == s_device.cend()) {
     // no ordinal.
     if (s_device == DeviceSym::CPU()) {
@@ -147,7 +146,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
     device = DeviceOrd::CUDA(opt_id.value());
   }
 
-  if (device.ordinal < Context::kCpuId) {
+  if (device.ordinal < DeviceOrd::CPUOrdinal()) {
     fatal();
   }
   device = CUDAOrdinal(device, fail_on_invalid_gpu_id);
@@ -156,6 +155,28 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
 }
 }  // namespace
 
+std::ostream& operator<<(std::ostream& os, DeviceOrd ord) {
+  os << ord.Name();
+  return os;
+}
+
+void Context::Init(Args const& kwargs) {
+  auto unknown = this->UpdateAllowUnknown(kwargs);
+  if (!unknown.empty()) {
+    std::stringstream ss;
+    std::size_t i = 0;
+    ss << "[Internal Error] Unknown parameters passed to the Context {";
+    for (auto const& [k, _] : unknown) {
+      ss << '"' << k << '"';
+      if (++i != unknown.size()) {
+        ss << ", ";
+      }
+    }
+    ss << "}\n";
+    LOG(FATAL) << ss.str();
+  }
+}
+
 void Context::ConfigureGpuId(bool require_gpu) {
   if (this->IsCPU() && require_gpu) {
     this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}});
@@ -178,7 +199,7 @@ void Context::SetDeviceOrdinal(Args const& kwargs) {
     error::WarnDeprecatedGPUId();
     auto opt_id = ParseInt(StringView{gpu_id_it->second});
     CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second;
-    if (opt_id.value() > Context::kCpuId) {
+    if (opt_id.value() > DeviceOrd::CPUOrdinal()) {
       this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}});
     } else {
       this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}});
@@ -194,9 +215,9 @@ void Context::SetDeviceOrdinal(Args const& kwargs) {
   this->SetDevice(new_d);
 
   if (this->IsCPU()) {
-    CHECK_EQ(this->device_.ordinal, kCpuId);
+    CHECK_EQ(this->device_.ordinal, DeviceOrd::CPUOrdinal());
   } else {
-    CHECK_GT(this->device_.ordinal, kCpuId);
+    CHECK_GT(this->device_.ordinal, DeviceOrd::CPUOrdinal());
   }
 }
 
diff --git a/src/data/data.cc b/src/data/data.cc
index f143faf97..92547dafd 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -687,13 +687,13 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
 
   linalg::Stack(&this->labels, that.labels);
 
-  this->weights_.SetDevice(that.weights_.DeviceIdx());
+  this->weights_.SetDevice(that.weights_.Device());
   this->weights_.Extend(that.weights_);
 
-  this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.DeviceIdx());
+  this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.Device());
   this->labels_lower_bound_.Extend(that.labels_lower_bound_);
 
-  this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.DeviceIdx());
+  this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.Device());
   this->labels_upper_bound_.Extend(that.labels_upper_bound_);
 
   linalg::Stack(&this->base_margin_, that.base_margin_);
@@ -723,7 +723,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
   }
   if (!that.feature_weights.Empty()) {
     this->feature_weights.Resize(that.feature_weights.Size());
-    this->feature_weights.SetDevice(that.feature_weights.DeviceIdx());
+    this->feature_weights.SetDevice(that.feature_weights.Device());
     this->feature_weights.Copy(that.feature_weights);
   }
 }
@@ -738,22 +738,22 @@ void MetaInfo::SynchronizeNumberOfColumns() {
 
 namespace {
 template <typename T>
-void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) {
-  bool valid = v.Device().IsCPU() || device == Context::kCpuId || v.DeviceIdx() == device;
+void CheckDevice(DeviceOrd device, HostDeviceVector<T> const& v) {
+  bool valid = v.Device().IsCPU() || device.IsCPU() || v.Device() == device;
   if (!valid) {
     LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
                   "the booster. The device ordinal of the data is: "
-               << v.DeviceIdx() << "; the device ordinal of the Booster is: " << device;
+               << v.Device() << "; the device ordinal of the Booster is: " << device;
   }
 }
 
 template <typename T, std::int32_t D>
-void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) {
+void CheckDevice(DeviceOrd device, linalg::Tensor<T, D> const& v) {
   CheckDevice(device, *v.Data());
 }
 }  // anonymous namespace
 
-void MetaInfo::Validate(std::int32_t device) const {
+void MetaInfo::Validate(DeviceOrd device) const {
   if (group_ptr_.size() != 0 && weights_.Size() != 0) {
     CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << error::GroupWeight();
     return;
diff --git a/src/data/data.cu b/src/data/data.cu
index 74db2b28c..670af48c7 100644
--- a/src/data/data.cu
+++ b/src/data/data.cu
@@ -29,13 +29,13 @@ template <typename T, int32_t D>
 void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tensor<T, D>* p_out) {
   ArrayInterface<D> array(arr_interface);
   if (array.n == 0) {
-    p_out->SetDevice(0);
+    p_out->SetDevice(DeviceOrd::CUDA(0));
     p_out->Reshape(array.shape);
     return;
   }
   CHECK_EQ(array.valid.Capacity(), 0)
       << "Meta info like label or weight can not have missing value.";
-  auto ptr_device = SetDeviceToPtr(array.data);
+  auto ptr_device = DeviceOrd::CUDA(SetDeviceToPtr(array.data));
   p_out->SetDevice(ptr_device);
 
   if (array.is_contiguous && array.type == ToDType<T>::kType) {
@@ -50,7 +50,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
     return;
   }
   p_out->Reshape(array.shape);
-  auto t = p_out->View(DeviceOrd::CUDA(ptr_device));
+  auto t = p_out->View(ptr_device);
   linalg::ElementWiseTransformDevice(
       t,
       [=] __device__(size_t i, T) {
@@ -86,7 +86,7 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
       });
   dh::caching_device_vector<bool> flag(1);
   auto d_flag = dh::ToSpan(flag);
-  auto d = SetDeviceToPtr(array_interface.data);
+  auto d = DeviceOrd::CUDA(SetDeviceToPtr(array_interface.data));
   dh::LaunchN(1, [=] __device__(size_t) { d_flag[0] = true; });
   dh::LaunchN(array_interface.Shape(0) - 1, [=] __device__(size_t i) {
     auto typed = TypedIndex<uint32_t, 1>{array_interface};
diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh
index 8c11d74c9..67ceb92f2 100644
--- a/src/data/device_adapter.cuh
+++ b/src/data/device_adapter.cuh
@@ -28,8 +28,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
   CudfAdapterBatch(common::Span<ArrayInterface<1>> columns, size_t num_rows)
       : columns_(columns),
         num_rows_(num_rows) {}
-  size_t Size() const { return num_rows_ * columns_.size(); }
-  __device__ __forceinline__ COOTuple GetElement(size_t idx) const {
+  [[nodiscard]] std::size_t Size() const { return num_rows_ * columns_.size(); }
+  [[nodiscard]] __device__ __forceinline__ COOTuple GetElement(size_t idx) const {
     size_t column_idx = idx % columns_.size();
     size_t row_idx = idx / columns_.size();
     auto const& column = columns_[column_idx];
@@ -39,7 +39,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
     return {row_idx, column_idx, value};
   }
 
-  __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
+  [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
     auto const& column = columns_[fidx];
     float value = column.valid.Data() == nullptr || column.valid.Check(ridx)
                       ? column(ridx)
@@ -47,8 +47,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
     return value;
   }
 
-  XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
-  XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }
+  [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
+  [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }
 
  private:
   common::Span<ArrayInterface<1>> columns_;
@@ -120,14 +120,14 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
       return;
     }
 
-    device_idx_ = dh::CudaGetPointerDevice(first_column.data);
-    CHECK_NE(device_idx_, Context::kCpuId);
-    dh::safe_cuda(cudaSetDevice(device_idx_));
+    device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(first_column.data));
+    CHECK(device_.IsCUDA());
+    dh::safe_cuda(cudaSetDevice(device_.ordinal));
     for (auto& json_col : json_columns) {
       auto column = ArrayInterface<1>(get<Object const>(json_col));
       columns.push_back(column);
       num_rows_ = std::max(num_rows_, column.Shape(0));
-      CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data))
+      CHECK_EQ(device_.ordinal, dh::CudaGetPointerDevice(column.data))
           << "All columns should use the same device.";
       CHECK_EQ(num_rows_, column.Shape(0))
           << "All columns should have same number of rows.";
@@ -143,15 +143,15 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
     return batch_;
   }
 
-  size_t NumRows() const { return num_rows_; }
-  size_t NumColumns() const { return columns_.size(); }
-  int32_t DeviceIdx() const { return device_idx_; }
+  [[nodiscard]] std::size_t NumRows() const { return num_rows_; }
+  [[nodiscard]] std::size_t NumColumns() const { return columns_.size(); }
+  [[nodiscard]] DeviceOrd Device() const { return device_; }
 
  private:
   CudfAdapterBatch batch_;
   dh::device_vector<ArrayInterface<1>> columns_;
   size_t num_rows_{0};
-  int32_t device_idx_{Context::kCpuId};
+  DeviceOrd device_{DeviceOrd::CPU()};
 };
 
 class CupyAdapterBatch : public detail::NoMetaInfo {
@@ -159,22 +159,22 @@ class CupyAdapterBatch : public detail::NoMetaInfo {
   CupyAdapterBatch() = default;
   explicit CupyAdapterBatch(ArrayInterface<2> array_interface)
     : array_interface_(std::move(array_interface)) {}
-  size_t Size() const {
+  [[nodiscard]] std::size_t Size() const {
     return array_interface_.Shape(0) * array_interface_.Shape(1);
   }
-  __device__ COOTuple GetElement(size_t idx) const {
+  [[nodiscard]]__device__ COOTuple GetElement(size_t idx) const {
     size_t column_idx = idx % array_interface_.Shape(1);
     size_t row_idx = idx / array_interface_.Shape(1);
     float value = array_interface_(row_idx, column_idx);
     return {row_idx, column_idx, value};
   }
-  __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
+  [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
     float value = array_interface_(ridx, fidx);
     return value;
   }
 
-  XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
-  XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
+  [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
+  [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
 
  private:
   ArrayInterface<2> array_interface_;
@@ -189,28 +189,28 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
     if (array_interface_.Shape(0) == 0) {
       return;
     }
-    device_idx_ = dh::CudaGetPointerDevice(array_interface_.data);
-    CHECK_NE(device_idx_, Context::kCpuId);
+    device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(array_interface_.data));
+    CHECK(device_.IsCUDA());
   }
   explicit CupyAdapter(std::string cuda_interface_str)
       : CupyAdapter{StringView{cuda_interface_str}} {}
-  const CupyAdapterBatch& Value() const override { return batch_; }
+  [[nodiscard]] const CupyAdapterBatch& Value() const override { return batch_; }
 
-  size_t NumRows() const { return array_interface_.Shape(0); }
-  size_t NumColumns() const { return array_interface_.Shape(1); }
-  int32_t DeviceIdx() const { return device_idx_; }
+  [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
+  [[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }
+  [[nodiscard]] DeviceOrd Device() const { return device_; }
 
  private:
   ArrayInterface<2> array_interface_;
   CupyAdapterBatch batch_;
-  int32_t device_idx_ {Context::kCpuId};
+  DeviceOrd device_{DeviceOrd::CPU()};
 };
 
 // Returns maximum row length
 template <typename AdapterBatchT>
-std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
+std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, DeviceOrd device,
                          float missing) {
-  dh::safe_cuda(cudaSetDevice(device_idx));
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
   IsValidFunctor is_valid(missing);
   dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
 
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index 369021376..44b9c8dd6 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -94,22 +94,18 @@ __global__ void CompressBinEllpackKernel(
 }
 
 // Construct an ELLPACK matrix with the given number of empty rows.
-EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
-                                 bool is_dense, size_t row_stride,
-                                 size_t n_rows)
-    : is_dense(is_dense),
-      cuts_(std::move(cuts)),
-      row_stride(row_stride),
-      n_rows(n_rows) {
+EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense,
+                                 size_t row_stride, size_t n_rows)
+    : is_dense(is_dense), cuts_(std::move(cuts)), row_stride(row_stride), n_rows(n_rows) {
   monitor_.Init("ellpack_page");
-  dh::safe_cuda(cudaSetDevice(device));
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
 
   monitor_.Start("InitCompressedData");
   InitCompressedData(device);
   monitor_.Stop("InitCompressedData");
 }
 
-EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
+EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts,
                                  const SparsePage &page, bool is_dense,
                                  size_t row_stride,
                                  common::Span<FeatureType const> feature_types)
@@ -123,7 +119,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
 EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
     : is_dense(dmat->IsDense()) {
   monitor_.Init("ellpack_page");
-  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
+  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
 
   n_rows = dmat->Info().num_row_;
 
@@ -138,15 +134,15 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
   monitor_.Stop("Quantiles");
 
   monitor_.Start("InitCompressedData");
-  this->InitCompressedData(ctx->gpu_id);
+  this->InitCompressedData(ctx->Device());
   monitor_.Stop("InitCompressedData");
 
-  dmat->Info().feature_types.SetDevice(ctx->gpu_id);
+  dmat->Info().feature_types.SetDevice(ctx->Device());
   auto ft = dmat->Info().feature_types.ConstDeviceSpan();
   monitor_.Start("BinningCompression");
   CHECK(dmat->SingleColBlock());
   for (const auto& batch : dmat->GetBatches<SparsePage>()) {
-    CreateHistIndices(ctx->gpu_id, batch, ft);
+    CreateHistIndices(ctx->Device(), batch, ft);
   }
   monitor_.Stop("BinningCompression");
 }
@@ -209,7 +205,7 @@ struct TupleScanOp {
 // to remove missing data
 template <typename AdapterBatchT>
 void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType const> feature_types,
-                       EllpackPageImpl* dst, int device_idx, float missing) {
+                       EllpackPageImpl* dst, DeviceOrd device, float missing) {
   // Some witchcraft happens here
   // The goal is to copy valid elements out of the input to an ELLPACK matrix
   // with a given row stride, using no extra working memory Standard stream
@@ -241,7 +237,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
   // Tuple[2] = The index in the input data
   using Tuple = thrust::tuple<size_t, size_t, size_t>;
 
-  auto device_accessor = dst->GetDeviceAccessor(device_idx);
+  auto device_accessor = dst->GetDeviceAccessor(device);
   common::CompressedBufferWriter writer(device_accessor.NumSymbols());
   auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
 
@@ -280,10 +276,9 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
 #endif
 }
 
-void WriteNullValues(EllpackPageImpl* dst, int device_idx,
-                     common::Span<size_t> row_counts) {
+void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span<size_t> row_counts) {
   // Write the null values
-  auto device_accessor = dst->GetDeviceAccessor(device_idx);
+  auto device_accessor = dst->GetDeviceAccessor(device);
   common::CompressedBufferWriter writer(device_accessor.NumSymbols());
   auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
   auto row_stride = dst->row_stride;
@@ -300,11 +295,11 @@ void WriteNullValues(EllpackPageImpl* dst, int device_idx,
 }
 
 template <typename AdapterBatch>
-EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense,
+EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
                                  common::Span<size_t> row_counts_span,
                                  common::Span<FeatureType const> feature_types, size_t row_stride,
                                  size_t n_rows, common::HistogramCuts const& cuts) {
-  dh::safe_cuda(cudaSetDevice(device));
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
 
   *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
   CopyDataToEllpack(batch, feature_types, this, device, missing);
@@ -313,7 +308,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
 
 #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T)                                                \
   template EllpackPageImpl::EllpackPageImpl(                                               \
-      __BATCH_T batch, float missing, int device, bool is_dense,                           \
+      __BATCH_T batch, float missing, DeviceOrd device, bool is_dense,                     \
       common::Span<size_t> row_counts_span, common::Span<FeatureType const> feature_types, \
       size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts);
 
@@ -370,9 +365,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
       [&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; });
   row_stride = *std::max_element(it, it + page.Size());
 
-  CHECK_GE(ctx->gpu_id, 0);
+  CHECK(ctx->IsCUDA());
   monitor_.Start("InitCompressedData");
-  InitCompressedData(ctx->gpu_id);
+  InitCompressedData(ctx->Device());
   monitor_.Stop("InitCompressedData");
 
   // copy gidx
@@ -382,7 +377,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
   dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
                                 cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
 
-  auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
+  auto accessor = this->GetDeviceAccessor(ctx->Device(), ft);
   auto null = accessor.NullValue();
   CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null);
 }
@@ -407,8 +402,7 @@ struct CopyPage {
 };
 
 // Copy the data from the given EllpackPage to the current page.
-size_t EllpackPageImpl::Copy(int device, EllpackPageImpl const *page,
-                             size_t offset) {
+size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) {
   monitor_.Start("Copy");
   size_t num_elements = page->n_rows * page->row_stride;
   CHECK_EQ(row_stride, page->row_stride);
@@ -468,7 +462,7 @@ struct CompactPage {
 };
 
 // Compacts the data from the given EllpackPage into the current page.
-void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
+void EllpackPageImpl::Compact(DeviceOrd device, EllpackPageImpl const* page,
                               common::Span<size_t> row_indexes) {
   monitor_.Start("Compact");
   CHECK_EQ(row_stride, page->row_stride);
@@ -481,13 +475,12 @@ void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
 }
 
 // Initialize the buffer to stored compressed features.
-void EllpackPageImpl::InitCompressedData(int device) {
+void EllpackPageImpl::InitCompressedData(DeviceOrd device) {
   size_t num_symbols = NumSymbols();
 
   // Required buffer size for storing data matrix in ELLPack format.
   size_t compressed_size_bytes =
-    common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows,
-      num_symbols);
+      common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols);
   gidx_buffer.SetDevice(device);
   // Don't call fill unnecessarily
   if (gidx_buffer.Size() == 0) {
@@ -499,7 +492,7 @@ void EllpackPageImpl::InitCompressedData(int device) {
 }
 
 // Compress a CSR page into ELLPACK.
-void EllpackPageImpl::CreateHistIndices(int device,
+void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
                                         const SparsePage& row_batch,
                                         common::Span<FeatureType const> feature_types) {
   if (row_batch.Size() == 0) return;
@@ -509,7 +502,7 @@ void EllpackPageImpl::CreateHistIndices(int device,
 
   // bin and compress entries in batches of rows
   size_t gpu_batch_nrows =
-      std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
+      std::min(dh::TotalMemory(device.ordinal) / (16 * row_stride * sizeof(Entry)),
                static_cast<size_t>(row_batch.Size()));
 
   size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows);
@@ -572,7 +565,7 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,
 }
 
 EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
-    int device, common::Span<FeatureType const> feature_types) const {
+    DeviceOrd device, common::Span<FeatureType const> feature_types) const {
   gidx_buffer.SetDevice(device);
   return {device,
           cuts_,
@@ -586,7 +579,7 @@ EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
 }
 EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor(
     common::Span<FeatureType const> feature_types) const {
-  return {Context::kCpuId,
+  return {DeviceOrd::CPU(),
           cuts_,
           is_dense,
           row_stride,
diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh
index 96963463b..c64462082 100644
--- a/src/data/ellpack_page.cuh
+++ b/src/data/ellpack_page.cuh
@@ -35,16 +35,17 @@ struct EllpackDeviceAccessor {
 
   common::Span<const FeatureType> feature_types;
 
-  EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts,
-                        bool is_dense, size_t row_stride, size_t base_rowid,
-                        size_t n_rows,common::CompressedIterator<uint32_t> gidx_iter,
+  EllpackDeviceAccessor(DeviceOrd device, const common::HistogramCuts& cuts, bool is_dense,
+                        size_t row_stride, size_t base_rowid, size_t n_rows,
+                        common::CompressedIterator<uint32_t> gidx_iter,
                         common::Span<FeatureType const> feature_types)
       : is_dense(is_dense),
         row_stride(row_stride),
         base_rowid(base_rowid),
-        n_rows(n_rows) ,gidx_iter(gidx_iter),
+        n_rows(n_rows),
+        gidx_iter(gidx_iter),
         feature_types{feature_types} {
-    if (device == Context::kCpuId) {
+    if (device.IsCPU()) {
       gidx_fvalue_map = cuts.cut_values_.ConstHostSpan();
       feature_segments = cuts.cut_ptrs_.ConstHostSpan();
       min_fvalue = cuts.min_vals_.ConstHostSpan();
@@ -59,7 +60,7 @@ struct EllpackDeviceAccessor {
   }
   // Get a matrix element, uses binary search for look up Return NaN if missing
   // Given a row index and a feature index, returns the corresponding cut value
-  __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
+  [[nodiscard]] __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
     ridx -= base_rowid;
     auto row_begin = row_stride * ridx;
     auto row_end = row_begin + row_stride;
@@ -77,7 +78,7 @@ struct EllpackDeviceAccessor {
   }
 
   template <bool is_cat>
-  __device__ uint32_t SearchBin(float value, size_t column_id) const {
+  [[nodiscard]] __device__ uint32_t SearchBin(float value, size_t column_id) const {
     auto beg = feature_segments[column_id];
     auto end = feature_segments[column_id + 1];
     uint32_t idx = 0;
@@ -99,7 +100,7 @@ struct EllpackDeviceAccessor {
     return idx;
   }
 
-  __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const {
+  [[nodiscard]] __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const {
     auto gidx = GetBinIndex(ridx, fidx);
     if (gidx == -1) {
       return nan("");
@@ -108,18 +109,18 @@ struct EllpackDeviceAccessor {
   }
 
   // Check if the row id is withing range of the current batch.
-  __device__ bool IsInRange(size_t row_id) const {
+  [[nodiscard]] __device__ bool IsInRange(size_t row_id) const {
     return row_id >= base_rowid && row_id < base_rowid + n_rows;
   }
   /*! \brief Return the total number of symbols (total number of bins plus 1 for
    * not found). */
-  XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
+  [[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
 
-  XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); }
+  [[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); }
 
-  XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
+  [[nodiscard]] XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
 
-  XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
+  [[nodiscard]] XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
 };
 
 
@@ -141,14 +142,13 @@ class EllpackPageImpl {
    * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
    * and the given number of rows.
    */
-  EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense,
-                  size_t row_stride, size_t n_rows);
+  EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense, size_t row_stride,
+                  size_t n_rows);
   /*!
    * \brief Constructor used for external memory.
    */
-  EllpackPageImpl(int device, common::HistogramCuts cuts,
-                  const SparsePage &page, bool is_dense, size_t row_stride,
-                  common::Span<FeatureType const> feature_types);
+  EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, const SparsePage& page,
+                  bool is_dense, size_t row_stride, common::Span<FeatureType const> feature_types);
 
   /*!
    * \brief Constructor from an existing DMatrix.
@@ -159,7 +159,7 @@ class EllpackPageImpl {
   explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);
 
   template <typename AdapterBatch>
-  explicit EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense,
+  explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
                            common::Span<size_t> row_counts_span,
                            common::Span<FeatureType const> feature_types, size_t row_stride,
                            size_t n_rows, common::HistogramCuts const& cuts);
@@ -176,7 +176,7 @@ class EllpackPageImpl {
    * @param offset The number of elements to skip before copying.
    * @returns The number of elements copied.
    */
-  size_t Copy(int device, EllpackPageImpl const *page, size_t offset);
+  size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset);
 
   /*! \brief Compact the given ELLPACK page into the current page.
    *
@@ -184,11 +184,10 @@ class EllpackPageImpl {
    * @param page The ELLPACK page to compact from.
    * @param row_indexes Row indexes for the compacted page.
    */
-  void Compact(int device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);
-
+  void Compact(DeviceOrd device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);
 
   /*! \return Number of instances in the page. */
-  size_t Size() const;
+  [[nodiscard]] size_t Size() const;
 
   /*! \brief Set the base row id for this page. */
   void SetBaseRowId(std::size_t row_id) {
@@ -204,12 +203,12 @@ class EllpackPageImpl {
 
   /*! \brief Return the total number of symbols (total number of bins plus 1 for
    * not found). */
-  size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
+  [[nodiscard]] std::size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
 
-  EllpackDeviceAccessor
-  GetDeviceAccessor(int device,
-                    common::Span<FeatureType const> feature_types = {}) const;
-  EllpackDeviceAccessor GetHostAccessor(common::Span<FeatureType const> feature_types = {}) const;
+  [[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
+      DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
+  [[nodiscard]] EllpackDeviceAccessor GetHostAccessor(
+      common::Span<FeatureType const> feature_types = {}) const;
 
  private:
   /*!
@@ -218,13 +217,13 @@ class EllpackPageImpl {
    * @param device The GPU device to use.
    * @param row_batch The CSR page.
    */
-  void CreateHistIndices(int device,
+  void CreateHistIndices(DeviceOrd device,
                          const SparsePage& row_batch,
                          common::Span<FeatureType const> feature_types);
   /*!
    * \brief Initialize the buffer to store compressed features.
    */
-  void InitCompressedData(int device);
+  void InitCompressedData(DeviceOrd device);
 
 
 public:
diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu
index abfc400c1..41b0f480b 100644
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -10,7 +10,7 @@
 
 namespace xgboost::data {
 void EllpackPageSource::Fetch() {
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
   if (!this->ReadCache()) {
     if (count_ != 0 && !sync_) {
       // source is initialized to be the 0th page during construction, so when count_ is 0
diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h
index 146db94ed..24182e209 100644
--- a/src/data/ellpack_page_source.h
+++ b/src/data/ellpack_page_source.h
@@ -23,14 +23,14 @@ class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
   BatchParam param_;
   common::Span<FeatureType const> feature_types_;
   std::unique_ptr<common::HistogramCuts> cuts_;
-  std::int32_t device_;
+  DeviceOrd device_;
 
  public:
   EllpackPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches,
                     std::shared_ptr<Cache> cache, BatchParam param,
                     std::unique_ptr<common::HistogramCuts> cuts, bool is_dense, size_t row_stride,
                     common::Span<FeatureType const> feature_types,
-                    std::shared_ptr<SparsePageSource> source, std::int32_t device)
+                    std::shared_ptr<SparsePageSource> source, DeviceOrd device)
       : PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache, false),
         is_dense_{is_dense},
         row_stride_{row_stride},
diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc
index a53b88c13..45f6286fb 100644
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -36,8 +36,7 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
   auto pctx = MakeProxy(proxy_)->Ctx();
 
   Context ctx;
-  ctx.UpdateAllowUnknown(
-      Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
+  ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
   // hardcoded parameter.
   BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};
 
@@ -139,7 +138,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
     return HostAdapterDispatch(proxy, [&](auto const& value) {
       size_t n_threads = ctx->Threads();
       size_t n_features = column_sizes.size();
-      linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, Context::kCpuId);
+      linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, DeviceOrd::CPU());
       column_sizes_tloc.Data()->Fill(0ul);
       auto view = column_sizes_tloc.HostView();
       common::ParallelFor(value.Size(), n_threads, common::Sched::Static(256), [&](auto i) {
diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu
index cf34ca61d..2fffd516b 100644
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -47,9 +47,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
 
   int32_t current_device;
   dh::safe_cuda(cudaGetDevice(&current_device));
-  auto get_device = [&]() -> int32_t {
-    std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
-    CHECK_NE(d, Context::kCpuId);
+  auto get_device = [&]() {
+    auto d = (ctx->IsCPU()) ? DeviceOrd::CUDA(current_device) : ctx->Device();
+    CHECK(!d.IsCPU());
     return d;
   };
 
@@ -59,9 +59,8 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
   common::HistogramCuts cuts;
   do {
     // We use do while here as the first batch is fetched in ctor
-    // ctx_.gpu_id = proxy->DeviceIdx();
-    CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
-    dh::safe_cuda(cudaSetDevice(get_device()));
+    CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
+    dh::safe_cuda(cudaSetDevice(get_device().ordinal));
     if (cols == 0) {
       cols = num_cols();
       collective::Allreduce<collective::Operation::kMax>(&cols, 1);
@@ -93,7 +92,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
   auto n_features = cols;
   CHECK_GE(n_features, 1) << "Data must has at least 1 column.";
 
-  dh::safe_cuda(cudaSetDevice(get_device()));
+  dh::safe_cuda(cudaSetDevice(get_device().ordinal));
   if (!ref) {
     HostDeviceVector<FeatureType> ft;
     common::SketchContainer final_sketch(
@@ -132,7 +131,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
   size_t n_batches_for_verification = 0;
   while (iter.Next()) {
     init_page();
-    dh::safe_cuda(cudaSetDevice(get_device()));
+    dh::safe_cuda(cudaSetDevice(get_device().ordinal));
     auto rows = num_rows();
     dh::device_vector<size_t> row_counts(rows + 1, 0);
     common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
@@ -184,18 +183,18 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,
   if (!ellpack_) {
     ellpack_.reset(new EllpackPage());
     if (ctx->IsCUDA()) {
-      this->Info().feature_types.SetDevice(ctx->gpu_id);
+      this->Info().feature_types.SetDevice(ctx->Device());
       *ellpack_->Impl() =
           EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
     } else if (fmat_ctx_.IsCUDA()) {
-      this->Info().feature_types.SetDevice(fmat_ctx_.gpu_id);
+      this->Info().feature_types.SetDevice(fmat_ctx_.Device());
       *ellpack_->Impl() =
           EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
     } else {
       // Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM
       // for cut reference.
       auto cuda_ctx = ctx->MakeCUDA();
-      this->Info().feature_types.SetDevice(cuda_ctx.gpu_id);
+      this->Info().feature_types.SetDevice(cuda_ctx.Device());
       *ellpack_->Impl() =
           EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
     }
diff --git a/src/data/proxy_dmatrix.cc b/src/data/proxy_dmatrix.cc
index e920ef50e..c6e840539 100644
--- a/src/data/proxy_dmatrix.cc
+++ b/src/data/proxy_dmatrix.cc
@@ -11,18 +11,18 @@ void DMatrixProxy::SetArrayData(StringView interface_str) {
   this->batch_ = adapter;
   this->Info().num_col_ = adapter->NumColumns();
   this->Info().num_row_ = adapter->NumRows();
-  this->ctx_.gpu_id = Context::kCpuId;
+  this->ctx_.Init(Args{{"device", "cpu"}});
 }
 
-void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices,
-                              char const *c_values, bst_feature_t n_features, bool on_host) {
+void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, char const *c_values,
+                              bst_feature_t n_features, bool on_host) {
   CHECK(on_host) << "Not implemented on device.";
   std::shared_ptr<CSRArrayAdapter> adapter{new CSRArrayAdapter(
       StringView{c_indptr}, StringView{c_indices}, StringView{c_values}, n_features)};
   this->batch_ = adapter;
   this->Info().num_col_ = adapter->NumColumns();
   this->Info().num_row_ = adapter->NumRows();
-  this->ctx_.gpu_id = Context::kCpuId;
+  this->ctx_.Init(Args{{"device", "cpu"}});
 }
 
 namespace cuda_impl {
diff --git a/src/data/proxy_dmatrix.cu b/src/data/proxy_dmatrix.cu
index ded1c3aef..cd76e49cf 100644
--- a/src/data/proxy_dmatrix.cu
+++ b/src/data/proxy_dmatrix.cu
@@ -11,13 +11,13 @@ void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
   this->batch_ = adapter;
   this->Info().num_col_ = adapter->NumColumns();
   this->Info().num_row_ = adapter->NumRows();
-  if (adapter->DeviceIdx() < 0) {
+  if (adapter->Device().IsCPU()) {
     // empty data
     CHECK_EQ(this->Info().num_row_, 0);
     ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
     return;
   }
-  ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
+  ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
 }
 
 void DMatrixProxy::FromCudaArray(StringView interface_str) {
@@ -25,13 +25,13 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) {
   this->batch_ = adapter;
   this->Info().num_col_ = adapter->NumColumns();
   this->Info().num_row_ = adapter->NumRows();
-  if (adapter->DeviceIdx() < 0) {
+  if (adapter->Device().IsCPU()) {
     // empty data
     CHECK_EQ(this->Info().num_row_, 0);
     ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
     return;
   }
-  ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
+  ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
 }
 
 namespace cuda_impl {
diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h
index 59f0935be..3bcdfbff3 100644
--- a/src/data/proxy_dmatrix.h
+++ b/src/data/proxy_dmatrix.h
@@ -46,7 +46,7 @@ class DMatrixProxy : public DMatrix {
 #endif  // defined(XGBOOST_USE_CUDA)
 
  public:
-  int DeviceIdx() const { return ctx_.gpu_id; }
+  DeviceOrd Device() const { return ctx_.Device(); }
 
   void SetCUDAArray(char const* c_interface) {
     common::AssertGPUSupport();
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index 85ede3258..bf7b27eb7 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -253,7 +253,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
     }
     if (batch.BaseMargin() != nullptr) {
       info_.base_margin_ = decltype(info_.base_margin_){
-          batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, Context::kCpuId};
+          batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, DeviceOrd::CPU()};
     }
     if (batch.Qid() != nullptr) {
       qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size());
diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu
index 68cab0d5a..e41d59394 100644
--- a/src/data/simple_dmatrix.cu
+++ b/src/data/simple_dmatrix.cu
@@ -10,9 +10,7 @@
 #include "xgboost/context.h"  // for Context
 #include "xgboost/data.h"
 
-namespace xgboost {
-namespace data {
-
+namespace xgboost::data {
 // Does not currently support metainfo as no on-device data source contains this
 // Current implementation assumes a single batch. More batches can
 // be supported in future. Does not currently support inferring row/column size
@@ -21,13 +19,14 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
                              DataSplitMode data_split_mode) {
   CHECK(data_split_mode != DataSplitMode::kCol)
       << "Column-wise data split is currently not supported on the GPU.";
-  auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice()
-                                                                      : adapter->DeviceIdx();
-  CHECK_GE(device, 0);
-  dh::safe_cuda(cudaSetDevice(device));
+  auto device = (adapter->Device().IsCPU() || adapter->NumRows() == 0)
+                    ? DeviceOrd::CUDA(dh::CurrentDevice())
+                    : adapter->Device();
+  CHECK(device.IsCUDA());
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
 
   Context ctx;
-  ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});
+  ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", device.Name()}});
 
   CHECK(adapter->NumRows() != kAdapterUnknownSize);
   CHECK(adapter->NumColumns() != kAdapterUnknownSize);
@@ -52,5 +51,4 @@ template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
                                       int nthread, DataSplitMode data_split_mode);
 template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing,
                                       int nthread, DataSplitMode data_split_mode);
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
diff --git a/src/data/simple_dmatrix.cuh b/src/data/simple_dmatrix.cuh
index e2c0ae347..47d736050 100644
--- a/src/data/simple_dmatrix.cuh
+++ b/src/data/simple_dmatrix.cuh
@@ -40,9 +40,9 @@ void CopyDataToDMatrix(AdapterBatchT batch, common::Span<Entry> data,
 }
 
 template <typename AdapterBatchT>
-void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
-                     int device_idx, float missing) {
-  dh::safe_cuda(cudaSetDevice(device_idx));
+void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset, DeviceOrd device,
+                     float missing) {
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
   IsValidFunctor is_valid(missing);
   // Count elements per row
   dh::LaunchN(batch.Size(), [=] __device__(size_t idx) {
@@ -55,14 +55,13 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
   });
 
   dh::XGBCachingDeviceAllocator<char> alloc;
-  thrust::exclusive_scan(thrust::cuda::par(alloc),
-      thrust::device_pointer_cast(offset.data()),
-      thrust::device_pointer_cast(offset.data() + offset.size()),
-      thrust::device_pointer_cast(offset.data()));
+  thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
+                         thrust::device_pointer_cast(offset.data() + offset.size()),
+                         thrust::device_pointer_cast(offset.data()));
 }
 
 template <typename AdapterBatchT>
-size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing,
+size_t CopyToSparsePage(AdapterBatchT const& batch, DeviceOrd device, float missing,
                         SparsePage* page) {
   bool valid = NoInfInData(batch, IsValidFunctor{missing});
   CHECK(valid) << error::InfInData();
diff --git a/src/data/sparse_page_dmatrix.cu b/src/data/sparse_page_dmatrix.cu
index 9d4c63387..572d6cb08 100644
--- a/src/data/sparse_page_dmatrix.cu
+++ b/src/data/sparse_page_dmatrix.cu
@@ -45,7 +45,8 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
     ellpack_page_source_.reset();  // make sure resource is released before making new ones.
     ellpack_page_source_ = std::make_shared<EllpackPageSource>(
         this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
-        param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id);
+        param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_,
+        ctx->Device());
   } else {
     CHECK(sparse_page_source_);
     ellpack_page_source_->Reset();
diff --git a/src/data/sparse_page_source.cu b/src/data/sparse_page_source.cu
index 8d4adda17..40037eedc 100644
--- a/src/data/sparse_page_source.cu
+++ b/src/data/sparse_page_source.cu
@@ -19,11 +19,11 @@ std::size_t NFeaturesDevice(DMatrixProxy *proxy) {
 }  // namespace detail
 
 void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
-  auto device = proxy->DeviceIdx();
-  if (device < 0) {
-    device = dh::CurrentDevice();
+  auto device = proxy->Device();
+  if (device.IsCPU()) {
+    device = DeviceOrd::CUDA(dh::CurrentDevice());
   }
-  CHECK_GE(device, 0);
+  CHECK(device.IsCUDA());
 
   cuda_impl::Dispatch(proxy,
                       [&](auto const &value) { CopyToSparsePage(value, device, missing, page); });
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 438fd15e6..b0327da15 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -212,7 +212,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
   bst_target_t const n_groups = model_.learner_model_param->OutputLength();
   monitor_.Start("BoostNewTrees");
 
-  predt->predictions.SetDevice(ctx_->Ordinal());
+  predt->predictions.SetDevice(ctx_->Device());
   auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_,
                                     model_.learner_model_param->OutputLength());
   CHECK_NE(n_groups, 0);
@@ -248,7 +248,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
   } else {
     CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs";
     linalg::Matrix<GradientPair> tmp{{in_gpair->Shape(0), static_cast<std::size_t>(1ul)},
-                                     ctx_->Ordinal()};
+                                     ctx_->Device()};
     bool update_predict = true;
     for (bst_target_t gid = 0; gid < n_groups; ++gid) {
       node_position.clear();
@@ -736,7 +736,7 @@ class Dart : public GBTree {
 
     PredictionCacheEntry predts;  // temporary storage for prediction
     if (ctx_->IsCUDA()) {
-      predts.predictions.SetDevice(ctx_->gpu_id);
+      predts.predictions.SetDevice(ctx_->Device());
     }
     predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
     // multi-target is not yet supported.
@@ -761,8 +761,8 @@ class Dart : public GBTree {
       CHECK_EQ(p_out_preds->predictions.Size(), predts.predictions.Size());
 
       size_t n_rows = p_fmat->Info().num_row_;
-      if (predts.predictions.DeviceIdx() != Context::kCpuId) {
-        p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx());
+      if (predts.predictions.Device().IsCUDA()) {
+        p_out_preds->predictions.SetDevice(predts.predictions.Device());
         GPUDartPredictInc(p_out_preds->predictions.DeviceSpan(),
                           predts.predictions.DeviceSpan(), w, n_rows, n_groups,
                           group);
@@ -801,8 +801,8 @@ class Dart : public GBTree {
 
     StringView msg{"Unsupported data type for inplace predict."};
     PredictionCacheEntry predts;
-    if (ctx_->gpu_id != Context::kCpuId) {
-      predts.predictions.SetDevice(ctx_->gpu_id);
+    if (ctx_->IsCUDA()) {
+      predts.predictions.SetDevice(ctx_->Device());
     }
     predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
 
@@ -838,8 +838,8 @@ class Dart : public GBTree {
       CHECK_EQ(predts.predictions.Size(), p_out_preds->predictions.Size());
 
       size_t n_rows = p_fmat->Info().num_row_;
-      if (predts.predictions.DeviceIdx() != Context::kCpuId) {
-        p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx());
+      if (predts.predictions.Device().IsCUDA()) {
+        p_out_preds->predictions.SetDevice(predts.predictions.Device());
         auto base_score = model_.learner_model_param->BaseScore(predts.predictions.Device());
         GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(),
                                  predts.predictions.DeviceSpan(), w, n_rows, base_score, n_groups,
diff --git a/src/learner.cc b/src/learner.cc
index 5697ee627..d2b297dca 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -305,10 +305,10 @@ linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(Context const* c
 
 void LearnerModelParam::Copy(LearnerModelParam const& that) {
   base_score_.Reshape(that.base_score_.Shape());
-  base_score_.Data()->SetDevice(that.base_score_.DeviceIdx());
+  base_score_.Data()->SetDevice(that.base_score_.Device());
   base_score_.Data()->Copy(*that.base_score_.Data());
   std::as_const(base_score_).HostView();
-  if (that.base_score_.DeviceIdx() != Context::kCpuId) {
+  if (!that.base_score_.Device().IsCPU()) {
     std::as_const(base_score_).View(that.base_score_.Device());
   }
   CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead());
@@ -424,7 +424,7 @@ class LearnerConfiguration : public Learner {
     if (mparam_.boost_from_average && !UsePtr(gbm_)->ModelFitted()) {
       if (p_fmat) {
         auto const& info = p_fmat->Info();
-        info.Validate(Ctx()->Ordinal());
+        info.Validate(Ctx()->Device());
         // We estimate it from input data.
         linalg::Tensor<float, 1> base_score;
         InitEstimation(info, &base_score);
@@ -446,7 +446,7 @@ class LearnerConfiguration : public Learner {
     monitor_.Init("Learner");
     for (std::shared_ptr<DMatrix> const& d : cache) {
       if (d) {
-        prediction_container_.Cache(d, Context::kCpuId);
+        prediction_container_.Cache(d, DeviceOrd::CPU());
       }
     }
   }
@@ -1046,7 +1046,7 @@ class LearnerIO : public LearnerConfiguration {
                                                         ? std::numeric_limits<float>::quiet_NaN()
                                                         : obj_->ProbToMargin(mparam_.base_score)},
                                                    {1},
-                                                   Context::kCpuId},
+                                                   DeviceOrd::CPU()},
                           obj_->Task(), tparam_.multi_strategy);
 
     if (attributes_.find("objective") != attributes_.cend()) {
@@ -1271,7 +1271,7 @@ class LearnerImpl : public LearnerIO {
 
     this->ValidateDMatrix(train.get(), true);
 
-    auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
+    auto& predt = prediction_container_.Cache(train, ctx_.Device());
 
     monitor_.Start("PredictRaw");
     this->PredictRaw(train.get(), &predt, true, 0, 0);
@@ -1301,7 +1301,7 @@ class LearnerImpl : public LearnerIO {
     CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1))
         << "The number of columns in gradient should be equal to the number of targets/classes in "
            "the model.";
-    auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
+    auto& predt = prediction_container_.Cache(train, ctx_.Device());
     gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get());
     monitor_.Stop("BoostOneIter");
   }
@@ -1327,11 +1327,11 @@ class LearnerImpl : public LearnerIO {
 
     for (size_t i = 0; i < data_sets.size(); ++i) {
       std::shared_ptr<DMatrix> m = data_sets[i];
-      auto &predt = prediction_container_.Cache(m, ctx_.gpu_id);
+      auto &predt = prediction_container_.Cache(m, ctx_.Device());
       this->ValidateDMatrix(m.get(), false);
       this->PredictRaw(m.get(), &predt, false, 0, 0);
 
-      auto &out = output_predictions_.Cache(m, ctx_.gpu_id).predictions;
+      auto &out = output_predictions_.Cache(m, ctx_.Device()).predictions;
       out.Resize(predt.predictions.Size());
       out.Copy(predt.predictions);
 
@@ -1367,7 +1367,7 @@ class LearnerImpl : public LearnerIO {
     } else if (pred_leaf) {
       gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end);
     } else {
-      auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id);
+      auto& prediction = prediction_container_.Cache(data, ctx_.Device());
       this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end);
       // Copy the prediction cache to output prediction. out_preds comes from C API
       out_preds->SetDevice(ctx_.Device());
@@ -1447,7 +1447,7 @@ class LearnerImpl : public LearnerIO {
 
   void ValidateDMatrix(DMatrix* p_fmat, bool is_training) const {
     MetaInfo const& info = p_fmat->Info();
-    info.Validate(ctx_.gpu_id);
+    info.Validate(ctx_.Device());
 
     if (is_training) {
       CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_)
diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu
index 659b45135..e1bc5b73a 100644
--- a/src/linear/updater_gpu_coordinate.cu
+++ b/src/linear/updater_gpu_coordinate.cu
@@ -48,7 +48,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
   }
 
   void LazyInitDevice(DMatrix *p_fmat, const LearnerModelParam &model_param) {
-    if (ctx_->gpu_id < 0) return;
+    if (ctx_->IsCPU()) return;
 
     num_row_ = static_cast<size_t>(p_fmat->Info().num_row_);
 
@@ -60,7 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
       return;
     }
 
-    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+    dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
     // The begin and end indices for the section of each column associated with
     // this device
     std::vector<std::pair<bst_uint, bst_uint>> column_segments;
@@ -133,7 +133,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
          ++group_idx) {
       // Get gradient
       auto grad = GradientPair(0, 0);
-      if (ctx_->gpu_id >= 0) {
+      if (ctx_->IsCUDA()) {
         grad = GetBiasGradient(group_idx, model->learner_model_param->num_output_group);
       }
       auto dbias = static_cast<float>(
@@ -142,7 +142,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
       model->Bias()[group_idx] += dbias;
 
       // Update residual
-      if (ctx_->gpu_id >= 0) {
+      if (ctx_->IsCUDA()) {
         UpdateBiasResidual(dbias, group_idx, model->learner_model_param->num_output_group);
       }
     }
@@ -153,7 +153,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
     bst_float &w = (*model)[fidx][group_idx];
     // Get gradient
     auto grad = GradientPair(0, 0);
-    if (ctx_->gpu_id >= 0) {
+    if (ctx_->IsCUDA()) {
       grad = GetGradient(group_idx, model->learner_model_param->num_output_group, fidx);
     }
     auto dw = static_cast<float>(tparam_.learning_rate *
@@ -162,14 +162,14 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
                                                  tparam_.reg_lambda_denorm));
     w += dw;
 
-    if (ctx_->gpu_id >= 0) {
+    if (ctx_->IsCUDA()) {
       UpdateResidual(dw, group_idx, model->learner_model_param->num_output_group, fidx);
     }
   }
 
   // This needs to be public because of the __device__ lambda.
   GradientPair GetBiasGradient(int group_idx, int num_group) {
-    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+    dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
     auto counting = thrust::make_counting_iterator(0ull);
     auto f = [=] __device__(size_t idx) {
       return idx * num_group + group_idx;
@@ -193,7 +193,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
 
   // This needs to be public because of the __device__ lambda.
   GradientPair GetGradient(int group_idx, int num_group, int fidx) {
-    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+    dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
     common::Span<xgboost::Entry> d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]);
     size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];
     common::Span<GradientPair> d_gpair = dh::ToSpan(gpair_);
diff --git a/src/metric/auc.cc b/src/metric/auc.cc
index a2e7372fb..2e5c88174 100644
--- a/src/metric/auc.cc
+++ b/src/metric/auc.cc
@@ -23,8 +23,7 @@
 #include "xgboost/linalg.h"
 #include "xgboost/metric.h"
 
-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {
 // tag the this file, used by force static link later.
 DMLC_REGISTRY_FILE_TAG(auc);
 /**
@@ -257,10 +256,10 @@ template <typename Curve>
 class EvalAUC : public MetricNoCache {
   double Eval(const HostDeviceVector<bst_float> &preds, const MetaInfo &info) override {
     double auc {0};
-    if (ctx_->gpu_id != Context::kCpuId) {
-      preds.SetDevice(ctx_->gpu_id);
-      info.labels.SetDevice(ctx_->gpu_id);
-      info.weights_.SetDevice(ctx_->gpu_id);
+    if (ctx_->Device().IsCUDA()) {
+      preds.SetDevice(ctx_->Device());
+      info.labels.SetDevice(ctx_->Device());
+      info.weights_.SetDevice(ctx_->Device());
     }
     //  We use the global size to handle empty dataset.
     std::array<size_t, 2> meta{info.labels.Size(), preds.Size()};
@@ -329,7 +328,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
     double auc{0};
     uint32_t valid_groups = 0;
     auto n_threads = ctx_->Threads();
-    if (ctx_->gpu_id == Context::kCpuId) {
+    if (ctx_->IsCPU()) {
       std::tie(auc, valid_groups) =
           RankingAUC<true>(ctx_, predts.ConstHostVector(), info, n_threads);
     } else {
@@ -344,7 +343,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
     double auc{0};
     auto n_threads = ctx_->Threads();
     CHECK_NE(n_classes, 0);
-    if (ctx_->gpu_id == Context::kCpuId) {
+    if (ctx_->IsCPU()) {
       auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC);
     } else {
       auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes);
@@ -355,7 +354,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
   std::tuple<double, double, double>
   EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
     double fp, tp, auc;
-    if (ctx_->gpu_id == Context::kCpuId) {
+    if (ctx_->IsCPU()) {
       std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(),
                                            info.labels.HostView().Slice(linalg::All(), 0),
                                            common::OptionalWeights{info.weights_.ConstHostSpan()});
@@ -367,7 +366,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
   }
 
  public:
-  char const* Name() const override {
+  [[nodiscard]] char const* Name() const override {
     return "auc";
   }
 };
@@ -405,7 +404,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
   std::tuple<double, double, double>
   EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
     double pr, re, auc;
-    if (ctx_->gpu_id == Context::kCpuId) {
+    if (ctx_->IsCPU()) {
       std::tie(pr, re, auc) =
           BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
                       common::OptionalWeights{info.weights_.ConstHostSpan()});
@@ -418,7 +417,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
 
   double EvalMultiClass(HostDeviceVector<float> const &predts, MetaInfo const &info,
                         size_t n_classes) {
-    if (ctx_->gpu_id == Context::kCpuId) {
+    if (ctx_->IsCPU()) {
       auto n_threads = this->ctx_->Threads();
       return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC);
     } else {
@@ -431,7 +430,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
     double auc{0};
     uint32_t valid_groups = 0;
     auto n_threads = ctx_->Threads();
-    if (ctx_->gpu_id == Context::kCpuId) {
+    if (ctx_->IsCPU()) {
       auto labels = info.labels.Data()->ConstHostSpan();
       if (std::any_of(labels.cbegin(), labels.cend(), PRAUCLabelInvalid{})) {
         InvalidLabels();
@@ -446,7 +445,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
   }
 
  public:
-  const char *Name() const override { return "aucpr"; }
+  [[nodiscard]] const char *Name() const override { return "aucpr"; }
 };
 
 XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")
@@ -473,5 +472,4 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *, common::Span<f
   return {};
 }
 #endif
-}  // namespace metric
-}  // namespace xgboost
+}  // namespace xgboost::metric
diff --git a/src/metric/auc.cu b/src/metric/auc.cu
index dd9e4483f..a4838d783 100644
--- a/src/metric/auc.cu
+++ b/src/metric/auc.cu
@@ -824,7 +824,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
                                                  common::Span<float const> predts,
                                                  MetaInfo const &info,
                                                  std::shared_ptr<DeviceAUCCache> *p_cache) {
-  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
+  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
   if (predts.empty()) {
     return std::make_pair(0.0, static_cast<uint32_t>(0));
   }
diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu
index f6f3f3d04..897c91dab 100644
--- a/src/metric/multiclass_metric.cu
+++ b/src/metric/multiclass_metric.cu
@@ -127,24 +127,24 @@ class MultiClassMetricsReduction {
 
 #endif  // XGBOOST_USE_CUDA
 
-  PackedReduceResult Reduce(const Context& tparam, int device, size_t n_class,
+  PackedReduceResult Reduce(const Context& ctx, DeviceOrd device, size_t n_class,
                             const HostDeviceVector<bst_float>& weights,
                             const HostDeviceVector<bst_float>& labels,
                             const HostDeviceVector<bst_float>& preds) {
     PackedReduceResult result;
 
-    if (device < 0) {
+    if (device.IsCPU()) {
       result =
-          CpuReduceMetrics(weights, labels, preds, n_class, tparam.Threads());
+          CpuReduceMetrics(weights, labels, preds, n_class, ctx.Threads());
     }
 #if defined(XGBOOST_USE_CUDA)
     else {  // NOLINT
-      device_ = tparam.gpu_id;
+      device_ = ctx.Device();
       preds.SetDevice(device_);
       labels.SetDevice(device_);
       weights.SetDevice(device_);
 
-      dh::safe_cuda(cudaSetDevice(device_));
+      dh::safe_cuda(cudaSetDevice(device_.ordinal));
       result = DeviceReduceMetrics(weights, labels, preds, n_class);
     }
 #endif  // defined(XGBOOST_USE_CUDA)
@@ -154,7 +154,7 @@ class MultiClassMetricsReduction {
  private:
 #if defined(XGBOOST_USE_CUDA)
   dh::PinnedMemory label_error_;
-  int device_{-1};
+  DeviceOrd device_{DeviceOrd::CPU()};
 #endif  // defined(XGBOOST_USE_CUDA)
 };
 
@@ -176,7 +176,7 @@ struct EvalMClassBase : public MetricNoCache {
       CHECK_GE(nclass, 1U)
           << "mlogloss and merror are only used for multi-class classification,"
           << " use logloss for binary classification";
-      int device = ctx_->gpu_id;
+      auto device = ctx_->Device();
       auto result =
           reducer_.Reduce(*ctx_, device, nclass, info.weights_, *info.labels.Data(), preds);
       dat[0] = result.Residue();
diff --git a/src/metric/rank_metric.cu b/src/metric/rank_metric.cu
index f79d52742..372eb6805 100644
--- a/src/metric/rank_metric.cu
+++ b/src/metric/rank_metric.cu
@@ -35,7 +35,7 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info,
   auto d_gptr = p_cache->DataGroupPtr(ctx);
   auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
 
-  predt.SetDevice(ctx->gpu_id);
+  predt.SetDevice(ctx->Device());
   auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
   auto topk = p_cache->Param().TopK();
   auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
@@ -90,7 +90,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
     CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
   }
   auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
-  predt.SetDevice(ctx->gpu_id);
+  predt.SetDevice(ctx->Device());
   auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());
 
   auto d_group_ptr = p_cache->DataGroupPtr(ctx);
diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu
index 5f8c8ee6a..0625af25a 100644
--- a/src/metric/survival_metric.cu
+++ b/src/metric/survival_metric.cu
@@ -130,18 +130,18 @@ class ElementWiseSurvivalMetricsReduction {
       const HostDeviceVector<bst_float>& preds) {
     PackedReduceResult result;
 
-    if (ctx.gpu_id < 0) {
+    if (ctx.IsCPU()) {
       result = CpuReduceMetrics(weights, labels_lower_bound, labels_upper_bound,
                                 preds, ctx.Threads());
     }
 #if defined(XGBOOST_USE_CUDA)
     else {  // NOLINT
-      preds.SetDevice(ctx.gpu_id);
-      labels_lower_bound.SetDevice(ctx.gpu_id);
-      labels_upper_bound.SetDevice(ctx.gpu_id);
-      weights.SetDevice(ctx.gpu_id);
+      preds.SetDevice(ctx.Device());
+      labels_lower_bound.SetDevice(ctx.Device());
+      labels_upper_bound.SetDevice(ctx.Device());
+      weights.SetDevice(ctx.Device());
 
-      dh::safe_cuda(cudaSetDevice(ctx.gpu_id));
+      dh::safe_cuda(cudaSetDevice(ctx.Ordinal()));
       result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds);
     }
 #endif  // defined(XGBOOST_USE_CUDA)
diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h
index ffd3ddec7..c49c5b6fb 100644
--- a/src/objective/adaptive.h
+++ b/src/objective/adaptive.h
@@ -100,7 +100,7 @@ inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> cons
     detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
                                predt, alpha, p_tree);
   } else {
-    position.SetDevice(ctx->gpu_id);
+    position.SetDevice(ctx->Device());
     detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate,
                                  predt, alpha, p_tree);
   }
diff --git a/src/objective/aft_obj.cu b/src/objective/aft_obj.cu
index 522866a42..3ad9ca847 100644
--- a/src/objective/aft_obj.cu
+++ b/src/objective/aft_obj.cu
@@ -42,7 +42,7 @@ class AFTObj : public ObjFunction {
 
   template <typename Distribution>
   void GetGradientImpl(const HostDeviceVector<bst_float>& preds, const MetaInfo& info,
-                       linalg::Matrix<GradientPair>* out_gpair, size_t ndata, int device,
+                       linalg::Matrix<GradientPair>* out_gpair, size_t ndata, DeviceOrd device,
                        bool is_null_weight, float aft_loss_distribution_scale) {
     common::Transform<>::Init(
         [=] XGBOOST_DEVICE(size_t _idx,
@@ -75,7 +75,7 @@ class AFTObj : public ObjFunction {
     CHECK_EQ(info.labels_upper_bound_.Size(), ndata);
     out_gpair->SetDevice(ctx_->Device());
     out_gpair->Reshape(ndata, 1);
-    const int device = ctx_->gpu_id;
+    const auto device = ctx_->Device();
     const float aft_loss_distribution_scale = param_.aft_loss_distribution_scale;
     const bool is_null_weight = info.weights_.Size() == 0;
     if (!is_null_weight) {
@@ -108,7 +108,7 @@ class AFTObj : public ObjFunction {
           _preds[_idx] = exp(_preds[_idx]);
         },
         common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
-        io_preds->DeviceIdx())
+        io_preds->Device())
         .Eval(io_preds);
   }
 
diff --git a/src/objective/hinge.cu b/src/objective/hinge.cu
index 0d3ed6ca4..dd9a19b13 100644
--- a/src/objective/hinge.cu
+++ b/src/objective/hinge.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2018-2022 by XGBoost Contributors
+/**
+ * Copyright 2018-2023, XGBoost Contributors
  * \file hinge.cc
  * \brief Provides an implementation of the hinge loss function
  * \author Henry Gouk
@@ -13,8 +13,7 @@
 #include "../common/transform.h"
 #include "../common/common.h"
 
-namespace xgboost {
-namespace obj {
+namespace xgboost::obj {
 
 #if defined(XGBOOST_USE_CUDA)
 DMLC_REGISTRY_FILE_TAG(hinge_obj_gpu);
@@ -63,7 +62,7 @@ class HingeObj : public ObjFunction {
           _out_gpair[_idx] = GradientPair(g, h);
         },
         common::Range{0, static_cast<int64_t>(ndata)}, this->ctx_->Threads(),
-        ctx_->gpu_id).Eval(
+        ctx_->Device()).Eval(
             out_gpair->Data(), &preds, info.labels.Data(), &info.weights_);
   }
 
@@ -73,11 +72,11 @@ class HingeObj : public ObjFunction {
           _preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0;
         },
         common::Range{0, static_cast<int64_t>(io_preds->Size()), 1}, this->ctx_->Threads(),
-        io_preds->DeviceIdx())
+        io_preds->Device())
         .Eval(io_preds);
   }
 
-  const char* DefaultEvalMetric() const override {
+  [[nodiscard]] const char* DefaultEvalMetric() const override {
     return "error";
   }
 
@@ -93,5 +92,4 @@ XGBOOST_REGISTER_OBJECTIVE(HingeObj, "binary:hinge")
 .describe("Hinge loss. Expects labels to be in [0,1f]")
 .set_body([]() { return new HingeObj(); });
 
-}  // namespace obj
-}  // namespace xgboost
+}  // namespace xgboost::obj
diff --git a/src/objective/init_estimation.cc b/src/objective/init_estimation.cc
index 47e0364fe..df06882bb 100644
--- a/src/objective/init_estimation.cc
+++ b/src/objective/init_estimation.cc
@@ -20,8 +20,8 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* b
     CheckInitInputs(info);
   }
   // Avoid altering any state in child objective.
-  HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
-  linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->gpu_id);
+  HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->Device());
+  linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->Device());
 
   Json config{Object{}};
   this->SaveConfig(&config);
diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc
index 5a3a38fdf..bd41aebb6 100644
--- a/src/objective/lambdarank_obj.cc
+++ b/src/objective/lambdarank_obj.cc
@@ -103,10 +103,10 @@ class LambdaRankObj : public FitIntercept {
 
   // Update position biased for unbiased click data
   void UpdatePositionBias() {
-    li_full_.SetDevice(ctx_->gpu_id);
-    lj_full_.SetDevice(ctx_->gpu_id);
-    li_.SetDevice(ctx_->gpu_id);
-    lj_.SetDevice(ctx_->gpu_id);
+    li_full_.SetDevice(ctx_->Device());
+    lj_full_.SetDevice(ctx_->Device());
+    li_.SetDevice(ctx_->Device());
+    lj_.SetDevice(ctx_->Device());
 
     if (ctx_->IsCPU()) {
       cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu
index ac31a2c79..30eba2fdc 100644
--- a/src/objective/lambdarank_obj.cu
+++ b/src/objective/lambdarank_obj.cu
@@ -290,12 +290,12 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const
             linalg::VectorView<double> li, linalg::VectorView<double> lj,
             linalg::Matrix<GradientPair>* out_gpair) {
   // boilerplate
-  std::int32_t device_id = ctx->gpu_id;
-  dh::safe_cuda(cudaSetDevice(device_id));
+  auto device = ctx->Device();
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
   auto n_groups = p_cache->Groups();
 
-  info.labels.SetDevice(device_id);
-  preds.SetDevice(device_id);
+  info.labels.SetDevice(device);
+  preds.SetDevice(device);
   out_gpair->SetDevice(ctx->Device());
   out_gpair->Reshape(preds.Size(), 1);
 
diff --git a/src/objective/multiclass_obj.cu b/src/objective/multiclass_obj.cu
index 7c762ed48..38880f911 100644
--- a/src/objective/multiclass_obj.cu
+++ b/src/objective/multiclass_obj.cu
@@ -63,7 +63,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
     const int nclass = param_.num_class;
     const auto ndata = static_cast<int64_t>(preds.Size() / nclass);
 
-    auto device = ctx_->gpu_id;
+    auto device = ctx_->Device();
     out_gpair->SetDevice(device);
     info.labels.SetDevice(device);
     info.weights_.SetDevice(device);
@@ -133,7 +133,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
     const int nclass = param_.num_class;
     const auto ndata = static_cast<int64_t>(io_preds->Size() / nclass);
 
-    auto device = io_preds->DeviceIdx();
+    auto device = io_preds->Device();
     if (prob) {
       common::Transform<>::Init(
           [=] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {
diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu
index 8d83b829b..d8571ed17 100644
--- a/src/objective/quantile_obj.cu
+++ b/src/objective/quantile_obj.cu
@@ -70,15 +70,15 @@ class QuantileRegression : public ObjFunction {
     out_gpair->Reshape(info.num_row_, n_targets);
     auto gpair = out_gpair->View(ctx_->Device());
 
-    info.weights_.SetDevice(ctx_->gpu_id);
+    info.weights_.SetDevice(ctx_->Device());
     common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
                                                  : info.weights_.ConstDeviceSpan()};
 
-    preds.SetDevice(ctx_->gpu_id);
+    preds.SetDevice(ctx_->Device());
     auto predt = linalg::MakeVec(&preds);
     auto n_samples = info.num_row_;
 
-    alpha_.SetDevice(ctx_->gpu_id);
+    alpha_.SetDevice(ctx_->Device());
     auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
 
     linalg::ElementWiseKernel(
@@ -103,7 +103,7 @@ class QuantileRegression : public ObjFunction {
     CHECK(!alpha_.Empty());
 
     auto n_targets = this->Targets(info);
-    base_score->SetDevice(ctx_->gpu_id);
+    base_score->SetDevice(ctx_->Device());
     base_score->Reshape(n_targets);
 
     double sw{0};
@@ -129,7 +129,7 @@ class QuantileRegression : public ObjFunction {
       }
     } else {
 #if defined(XGBOOST_USE_CUDA)
-      alpha_.SetDevice(ctx_->gpu_id);
+      alpha_.SetDevice(ctx_->Device());
       auto d_alpha = alpha_.ConstDeviceSpan();
       auto d_labels = info.labels.View(ctx_->Device());
       auto seg_it = dh::MakeTransformIterator<std::size_t>(
@@ -148,7 +148,7 @@ class QuantileRegression : public ObjFunction {
                                   val_it + n, base_score->Data());
         sw = info.num_row_;
       } else {
-        info.weights_.SetDevice(ctx_->gpu_id);
+        info.weights_.SetDevice(ctx_->Device());
         auto d_weights = info.weights_.ConstDeviceSpan();
         auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                                           [=] XGBOOST_DEVICE(std::size_t i) {
diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu
index 35610beb7..cf774db93 100644
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -116,7 +116,7 @@ class RegLossObj : public FitIntercept {
 
     size_t const ndata = preds.Size();
     out_gpair->SetDevice(ctx_->Device());
-    auto device = ctx_->gpu_id;
+    auto device = ctx_->Device();
 
     bool is_null_weight = info.weights_.Size() == 0;
     auto scale_pos_weight = param_.scale_pos_weight;
@@ -124,7 +124,7 @@ class RegLossObj : public FitIntercept {
     additional_input_.HostVector().begin()[1] = is_null_weight;
 
     const size_t nthreads = ctx_->Threads();
-    bool on_device = device >= 0;
+    bool on_device = device.IsCUDA();
     // On CPU we run the transformation each thread processing a contigious block of data
     // for better performance.
     const size_t n_data_blocks = std::max(static_cast<size_t>(1), (on_device ? ndata : nthreads));
@@ -175,7 +175,7 @@ class RegLossObj : public FitIntercept {
           _preds[_idx] = Loss::PredTransform(_preds[_idx]);
         },
         common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
-        io_preds->DeviceIdx())
+        io_preds->Device())
         .Eval(io_preds);
   }
 
@@ -246,14 +246,14 @@ class PseudoHuberRegression : public FitIntercept {
     CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0.";
     auto labels = info.labels.View(ctx_->Device());
 
-    out_gpair->SetDevice(ctx_->gpu_id);
+    out_gpair->SetDevice(ctx_->Device());
     out_gpair->Reshape(info.num_row_, this->Targets(info));
     auto gpair = out_gpair->View(ctx_->Device());
 
-    preds.SetDevice(ctx_->gpu_id);
+    preds.SetDevice(ctx_->Device());
     auto predt = linalg::MakeVec(&preds);
 
-    info.weights_.SetDevice(ctx_->gpu_id);
+    info.weights_.SetDevice(ctx_->Device());
     common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
                                                  : info.weights_.ConstDeviceSpan()};
 
@@ -327,7 +327,7 @@ class PoissonRegression : public FitIntercept {
     size_t const ndata = preds.Size();
     out_gpair->SetDevice(ctx_->Device());
     out_gpair->Reshape(info.num_row_, this->Targets(info));
-    auto device = ctx_->gpu_id;
+    auto device = ctx_->Device();
     label_correct_.Resize(1);
     label_correct_.Fill(1);
 
@@ -369,7 +369,7 @@ class PoissonRegression : public FitIntercept {
           _preds[_idx] = expf(_preds[_idx]);
         },
         common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
-        io_preds->DeviceIdx())
+        io_preds->Device())
         .Eval(io_preds);
   }
   void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@@ -512,7 +512,7 @@ class GammaRegression : public FitIntercept {
     CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty";
     CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided";
     const size_t ndata = preds.Size();
-    auto device = ctx_->gpu_id;
+    auto device = ctx_->Device();
     out_gpair->SetDevice(ctx_->Device());
     out_gpair->Reshape(info.num_row_, this->Targets(info));
     label_correct_.Resize(1);
@@ -555,7 +555,7 @@ class GammaRegression : public FitIntercept {
           _preds[_idx] = expf(_preds[_idx]);
         },
         common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
-        io_preds->DeviceIdx())
+        io_preds->Device())
         .Eval(io_preds);
   }
   void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@@ -613,7 +613,7 @@ class TweedieRegression : public FitIntercept {
     out_gpair->SetDevice(ctx_->Device());
     out_gpair->Reshape(info.num_row_, this->Targets(info));
 
-    auto device = ctx_->gpu_id;
+    auto device = ctx_->Device();
     label_correct_.Resize(1);
     label_correct_.Fill(1);
 
@@ -660,7 +660,7 @@ class TweedieRegression : public FitIntercept {
           _preds[_idx] = expf(_preds[_idx]);
         },
         common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
-        io_preds->DeviceIdx())
+        io_preds->Device())
         .Eval(io_preds);
   }
 
@@ -711,9 +711,9 @@ class MeanAbsoluteError : public ObjFunction {
     out_gpair->Reshape(info.num_row_, this->Targets(info));
     auto gpair = out_gpair->View(ctx_->Device());
 
-    preds.SetDevice(ctx_->gpu_id);
+    preds.SetDevice(ctx_->Device());
     auto predt = linalg::MakeVec(&preds);
-    info.weights_.SetDevice(ctx_->gpu_id);
+    info.weights_.SetDevice(ctx_->Device());
     common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
                                                  : info.weights_.ConstDeviceSpan()};
 
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index f217ad827..e41248e29 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -180,33 +180,30 @@ struct DeviceAdapterLoader {
 
   XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared,
                                          bst_feature_t num_features, bst_row_t num_rows,
-                                         size_t entry_start, float missing) :
-    batch{batch},
-    columns{num_features},
-    use_shared{use_shared},
-    is_valid{missing} {
-      extern __shared__ float _smem[];
-      smem = _smem;
-      if (use_shared) {
-        uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
-        size_t shared_elements = blockDim.x * num_features;
-        dh::BlockFill(smem, shared_elements, nanf(""));
-        __syncthreads();
-        if (global_idx < num_rows) {
-          auto beg = global_idx * columns;
-          auto end = (global_idx + 1) * columns;
-          for (size_t i = beg; i < end; ++i) {
-            auto value = batch.GetElement(i).value;
-            if (is_valid(value)) {
-              smem[threadIdx.x * num_features + (i - beg)] = value;
-            }
+                                         size_t entry_start, float missing)
+      : batch{batch}, columns{num_features}, use_shared{use_shared}, is_valid{missing} {
+    extern __shared__ float _smem[];
+    smem = _smem;
+    if (use_shared) {
+      uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
+      size_t shared_elements = blockDim.x * num_features;
+      dh::BlockFill(smem, shared_elements, nanf(""));
+      __syncthreads();
+      if (global_idx < num_rows) {
+        auto beg = global_idx * columns;
+        auto end = (global_idx + 1) * columns;
+        for (size_t i = beg; i < end; ++i) {
+          auto value = batch.GetElement(i).value;
+          if (is_valid(value)) {
+            smem[threadIdx.x * num_features + (i - beg)] = value;
           }
         }
       }
-      __syncthreads();
     }
+    __syncthreads();
+  }
 
-  XGBOOST_DEV_INLINE  float GetElement(size_t  ridx, size_t  fidx) const {
+  [[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
     if (use_shared) {
       return smem[threadIdx.x * columns + fidx];
     }
@@ -340,11 +337,11 @@ class DeviceModel {
   size_t tree_end_;  // NOLINT
   int num_group;
 
-  void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
-    dh::safe_cuda(cudaSetDevice(gpu_id));
+  void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, DeviceOrd device) {
+    dh::safe_cuda(cudaSetDevice(device.ordinal));
 
     // Copy decision trees to device
-    tree_segments = HostDeviceVector<size_t>({}, gpu_id);
+    tree_segments = HostDeviceVector<size_t>({}, device);
     auto& h_tree_segments = tree_segments.HostVector();
     h_tree_segments.reserve((tree_end - tree_begin) + 1);
     size_t sum = 0;
@@ -354,8 +351,8 @@ class DeviceModel {
       h_tree_segments.push_back(sum);
     }
 
-    nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), gpu_id);
-    stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), gpu_id);
+    nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), device);
+    stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), device);
     auto d_nodes = nodes.DevicePointer();
     auto d_stats = stats.DevicePointer();
     for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
@@ -369,12 +366,12 @@ class DeviceModel {
           sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault));
     }
 
-    tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id);
+    tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, device);
     auto& h_tree_group = tree_group.HostVector();
     std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size());
 
     // Initialize categorical splits.
-    split_types.SetDevice(gpu_id);
+    split_types.SetDevice(device);
     std::vector<FeatureType>& h_split_types = split_types.HostVector();
     h_split_types.resize(h_tree_segments.back());
     for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@@ -383,8 +380,8 @@ class DeviceModel {
                 h_split_types.begin() + h_tree_segments[tree_idx - tree_begin]);
     }
 
-    categories = HostDeviceVector<uint32_t>({}, gpu_id);
-    categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, gpu_id);
+    categories = HostDeviceVector<uint32_t>({}, device);
+    categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, device);
     std::vector<uint32_t> &h_categories = categories.HostVector();
     std::vector<uint32_t> &h_split_cat_segments = categories_tree_segments.HostVector();
     for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@@ -397,7 +394,7 @@ class DeviceModel {
     }
 
     categories_node_segments = HostDeviceVector<RegTree::CategoricalSplitMatrix::Segment>(
-        h_tree_segments.back(), {}, gpu_id);
+        h_tree_segments.back(), {}, device);
     std::vector<RegTree::CategoricalSplitMatrix::Segment>& h_categories_node_segments =
         categories_node_segments.HostVector();
     for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@@ -485,8 +482,8 @@ struct PathInfo {
 void ExtractPaths(
     dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
     DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
-    int gpu_id) {
-  dh::safe_cuda(cudaSetDevice(gpu_id));
+    DeviceOrd device) {
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
   auto& device_model = *model;
 
   dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
@@ -773,12 +770,12 @@ class ColumnSplitHelper {
   template <bool predict_leaf>
   void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
                       bst_feature_t num_features, std::uint32_t num_group) const {
-    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+    dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
     dh::caching_device_vector<BitType> decision_storage{};
     dh::caching_device_vector<BitType> missing_storage{};
 
     auto constexpr kBlockThreads = 128;
-    auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->gpu_id);
+    auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->Ordinal());
     auto const shared_memory_bytes =
         SharedMemoryBytes<kBlockThreads>(num_features, max_shared_memory_bytes);
     auto const use_shared = shared_memory_bytes != 0;
@@ -791,8 +788,8 @@ class ColumnSplitHelper {
       BitVector decision_bits{dh::ToSpan(decision_storage)};
       BitVector missing_bits{dh::ToSpan(missing_storage)};
 
-      batch.offset.SetDevice(ctx_->gpu_id);
-      batch.data.SetDevice(ctx_->gpu_id);
+      batch.offset.SetDevice(ctx_->Device());
+      batch.data.SetDevice(ctx_->Device());
       std::size_t entry_start = 0;
       SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features);
 
@@ -823,9 +820,9 @@ class ColumnSplitHelper {
   void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage,
                            dh::caching_device_vector<BitType>* missing_storage) const {
     collective::AllReduce<collective::Operation::kBitwiseOR>(
-        ctx_->gpu_id, decision_storage->data().get(), decision_storage->size());
+        ctx_->Ordinal(), decision_storage->data().get(), decision_storage->size());
     collective::AllReduce<collective::Operation::kBitwiseAND>(
-        ctx_->gpu_id, missing_storage->data().get(), missing_storage->size());
+        ctx_->Ordinal(), missing_storage->data().get(), missing_storage->size());
   }
 
   void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage,
@@ -853,12 +850,12 @@ class GPUPredictor : public xgboost::Predictor {
                        size_t num_features,
                        HostDeviceVector<bst_float>* predictions,
                        size_t batch_offset, bool is_dense) const {
-    batch.offset.SetDevice(ctx_->gpu_id);
-    batch.data.SetDevice(ctx_->gpu_id);
+    batch.offset.SetDevice(ctx_->Device());
+    batch.data.SetDevice(ctx_->Device());
     const uint32_t BLOCK_THREADS = 128;
     size_t num_rows = batch.Size();
     auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS));
-    auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
+    auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());
     size_t shared_memory_bytes =
         SharedMemoryBytes<BLOCK_THREADS>(num_features, max_shared_memory_bytes);
     bool use_shared = shared_memory_bytes != 0;
@@ -914,10 +911,10 @@ class GPUPredictor : public xgboost::Predictor {
     if (tree_end - tree_begin == 0) {
       return;
     }
-    out_preds->SetDevice(ctx_->gpu_id);
+    out_preds->SetDevice(ctx_->Device());
     auto const& info = dmat->Info();
     DeviceModel d_model;
-    d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id);
+    d_model.Init(model, tree_begin, tree_end, ctx_->Device());
 
     if (info.IsColumnSplit()) {
       column_split_helper_.PredictBatch(dmat, out_preds, model, d_model);
@@ -934,10 +931,10 @@ class GPUPredictor : public xgboost::Predictor {
     } else {
       size_t batch_offset = 0;
       for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
-        dmat->Info().feature_types.SetDevice(ctx_->gpu_id);
+        dmat->Info().feature_types.SetDevice(ctx_->Device());
         auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
         this->PredictInternal(
-            page.Impl()->GetDeviceAccessor(ctx_->gpu_id, feature_types),
+            page.Impl()->GetDeviceAccessor(ctx_->Device(), feature_types),
             d_model,
             out_preds,
             batch_offset);
@@ -951,16 +948,15 @@ class GPUPredictor : public xgboost::Predictor {
       : Predictor::Predictor{ctx}, column_split_helper_{ctx} {}
 
   ~GPUPredictor() override {
-    if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
-      dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+    if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) {
+      dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
     }
   }
 
   void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts,
                     const gbm::GBTreeModel& model, uint32_t tree_begin,
                     uint32_t tree_end = 0) const override {
-    int device = ctx_->gpu_id;
-    CHECK_GE(device, 0) << "Set `gpu_id' to positive value for processing GPU data.";
+    CHECK(ctx_->Device().IsCUDA()) << "Set `device' to `cuda` for processing GPU data.";
     auto* out_preds = &predts->predictions;
     if (tree_end == 0) {
       tree_end = model.trees.size();
@@ -978,9 +974,9 @@ class GPUPredictor : public xgboost::Predictor {
     auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
     CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
         << "Number of columns in data must equal to trained model.";
-    CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx())
-        << "XGBoost is running on device: " << this->ctx_->gpu_id << ", "
-        << "but data is on: " << m->DeviceIdx();
+    CHECK_EQ(dh::CurrentDevice(), m->Device().ordinal)
+        << "XGBoost is running on device: " << this->ctx_->Device().Name() << ", "
+        << "but data is on: " << m->Device().Name();
     if (p_m) {
       p_m->Info().num_row_ = m->NumRows();
       this->InitOutPredictions(p_m->Info(), &(out_preds->predictions), model);
@@ -989,16 +985,16 @@ class GPUPredictor : public xgboost::Predictor {
       info.num_row_ = m->NumRows();
       this->InitOutPredictions(info, &(out_preds->predictions), model);
     }
-    out_preds->predictions.SetDevice(m->DeviceIdx());
+    out_preds->predictions.SetDevice(m->Device());
 
     const uint32_t BLOCK_THREADS = 128;
     auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(m->NumRows(), BLOCK_THREADS));
 
-    auto max_shared_memory_bytes = dh::MaxSharedMemory(m->DeviceIdx());
+    auto max_shared_memory_bytes = dh::MaxSharedMemory(m->Device().ordinal);
     size_t shared_memory_bytes =
         SharedMemoryBytes<BLOCK_THREADS>(m->NumColumns(), max_shared_memory_bytes);
     DeviceModel d_model;
-    d_model.Init(model, tree_begin, tree_end, m->DeviceIdx());
+    d_model.Init(model, tree_begin, tree_end, m->Device());
 
     bool use_shared = shared_memory_bytes != 0;
     size_t entry_start = 0;
@@ -1050,9 +1046,8 @@ class GPUPredictor : public xgboost::Predictor {
     }
     CHECK(!p_fmat->Info().IsColumnSplit())
         << "Predict contribution support for column-wise data split is not yet implemented.";
-
-    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-    out_contribs->SetDevice(ctx_->gpu_id);
+    dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
+    out_contribs->SetDevice(ctx_->Device());
     if (tree_end == 0 || tree_end > model.trees.size()) {
       tree_end = static_cast<uint32_t>(model.trees.size());
     }
@@ -1070,12 +1065,12 @@ class GPUPredictor : public xgboost::Predictor {
     dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
         device_paths;
     DeviceModel d_model;
-    d_model.Init(model, 0, tree_end, ctx_->gpu_id);
+    d_model.Init(model, 0, tree_end, ctx_->Device());
     dh::device_vector<uint32_t> categories;
-    ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id);
+    ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
     for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
-      batch.data.SetDevice(ctx_->gpu_id);
-      batch.offset.SetDevice(ctx_->gpu_id);
+      batch.data.SetDevice(ctx_->Device());
+      batch.offset.SetDevice(ctx_->Device());
       SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
                        model.learner_model_param->num_feature);
       auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
@@ -1084,7 +1079,7 @@ class GPUPredictor : public xgboost::Predictor {
           dh::tend(phis));
     }
     // Add the base margin term to last column
-    p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id);
+    p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
     const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
 
     auto base_score = model.learner_model_param->BaseScore(ctx_);
@@ -1109,8 +1104,8 @@ class GPUPredictor : public xgboost::Predictor {
     if (tree_weights != nullptr) {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
-    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-    out_contribs->SetDevice(ctx_->gpu_id);
+    dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
+    out_contribs->SetDevice(ctx_->Device());
     if (tree_end == 0 || tree_end > model.trees.size()) {
       tree_end = static_cast<uint32_t>(model.trees.size());
     }
@@ -1129,12 +1124,12 @@ class GPUPredictor : public xgboost::Predictor {
     dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
         device_paths;
     DeviceModel d_model;
-    d_model.Init(model, 0, tree_end, ctx_->gpu_id);
+    d_model.Init(model, 0, tree_end, ctx_->Device());
     dh::device_vector<uint32_t> categories;
-    ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id);
+    ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
     for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
-      batch.data.SetDevice(ctx_->gpu_id);
-      batch.offset.SetDevice(ctx_->gpu_id);
+      batch.data.SetDevice(ctx_->Device());
+      batch.offset.SetDevice(ctx_->Device());
       SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
                        model.learner_model_param->num_feature);
       auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
@@ -1143,7 +1138,7 @@ class GPUPredictor : public xgboost::Predictor {
           dh::tend(phis));
     }
     // Add the base margin term to last column
-    p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id);
+    p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
     const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
 
     auto base_score = model.learner_model_param->BaseScore(ctx_);
@@ -1168,24 +1163,24 @@ class GPUPredictor : public xgboost::Predictor {
   void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions,
                    const gbm::GBTreeModel &model,
                    unsigned tree_end) const override {
-    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+    dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
+    auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());
 
     const MetaInfo& info = p_fmat->Info();
     bst_row_t num_rows = info.num_row_;
     if (tree_end == 0 || tree_end > model.trees.size()) {
       tree_end = static_cast<uint32_t>(model.trees.size());
     }
-    predictions->SetDevice(ctx_->gpu_id);
+    predictions->SetDevice(ctx_->Device());
     predictions->Resize(num_rows * tree_end);
     DeviceModel d_model;
-    d_model.Init(model, 0, tree_end, this->ctx_->gpu_id);
+    d_model.Init(model, 0, tree_end, this->ctx_->Device());
 
     if (info.IsColumnSplit()) {
       column_split_helper_.PredictLeaf(p_fmat, predictions, model, d_model);
       return;
     }
 
-    auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
     constexpr uint32_t kBlockThreads = 128;
     size_t shared_memory_bytes = SharedMemoryBytes<kBlockThreads>(
         info.num_col_, max_shared_memory_bytes);
@@ -1195,8 +1190,8 @@ class GPUPredictor : public xgboost::Predictor {
 
     if (p_fmat->PageExists<SparsePage>()) {
       for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
-        batch.data.SetDevice(ctx_->gpu_id);
-        batch.offset.SetDevice(ctx_->gpu_id);
+        batch.data.SetDevice(ctx_->Device());
+        batch.offset.SetDevice(ctx_->Device());
         bst_row_t batch_offset = 0;
         SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
                             model.learner_model_param->num_feature};
@@ -1221,7 +1216,7 @@ class GPUPredictor : public xgboost::Predictor {
     } else {
       for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
         bst_row_t batch_offset = 0;
-        EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->gpu_id)};
+        EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
         size_t num_rows = batch.Size();
         auto grid =
             static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
@@ -1249,9 +1244,9 @@ class GPUPredictor : public xgboost::Predictor {
 
  private:
   /*! \brief Reconfigure the device when GPU is changed. */
-  static size_t ConfigureDevice(int device) {
-    if (device >= 0) {
-      return dh::MaxSharedMemory(device);
+  static size_t ConfigureDevice(DeviceOrd device) {
+    if (device.IsCUDA()) {
+      return dh::MaxSharedMemory(device.ordinal);
     }
     return 0;
   }
diff --git a/src/predictor/predictor.cc b/src/predictor/predictor.cc
index 4d7fc598f..aad33c272 100644
--- a/src/predictor/predictor.cc
+++ b/src/predictor/predictor.cc
@@ -49,8 +49,8 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_fl
   std::size_t n{model.learner_model_param->OutputLength() * info.num_row_};
 
   const HostDeviceVector<bst_float>* base_margin = info.base_margin_.Data();
-  if (ctx_->gpu_id >= 0) {
-    out_preds->SetDevice(ctx_->gpu_id);
+  if (ctx_->Device().IsCUDA()) {
+    out_preds->SetDevice(ctx_->Device());
   }
   if (!base_margin->Empty()) {
     out_preds->Resize(n);
diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc
index a8f5e1d8e..b80293fae 100644
--- a/src/tree/fit_stump.cc
+++ b/src/tree/fit_stump.cc
@@ -19,8 +19,7 @@
 #include "xgboost/linalg.h"                // TensorView, Tensor, Constant
 #include "xgboost/logging.h"               // CHECK_EQ
 
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 namespace cpu_impl {
 void FitStump(Context const* ctx, MetaInfo const& info,
               linalg::TensorView<GradientPair const, 2> gpair,
@@ -68,7 +67,7 @@ inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView<Gradien
 
 void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair,
               bst_target_t n_targets, linalg::Vector<float>* out) {
-  out->SetDevice(ctx->gpu_id);
+  out->SetDevice(ctx->Device());
   out->Reshape(n_targets);
 
   gpair.SetDevice(ctx->Device());
@@ -76,5 +75,4 @@ void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientP
   ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
       : cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()));
 }
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu
index f0d53bff1..9fcacd081 100644
--- a/src/tree/fit_stump.cu
+++ b/src/tree/fit_stump.cu
@@ -21,9 +21,7 @@
 #include "xgboost/logging.h"  // CHECK_EQ
 #include "xgboost/span.h"     // span
 
-namespace xgboost {
-namespace tree {
-namespace cuda_impl {
+namespace xgboost::tree::cuda_impl {
 void FitStump(Context const* ctx, MetaInfo const& info,
               linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out) {
   auto n_targets = out.Size();
@@ -50,7 +48,7 @@ void FitStump(Context const* ctx, MetaInfo const& info,
   thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
                         thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));
 
-  collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast<double*>(d_sum.Values().data()),
+  collective::GlobalSum(info, ctx->Device(), reinterpret_cast<double*>(d_sum.Values().data()),
                         d_sum.Size() * 2);
 
   thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
@@ -59,6 +57,4 @@ void FitStump(Context const* ctx, MetaInfo const& info,
                            CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess()));
                      });
 }
-}  // namespace cuda_impl
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree::cuda_impl
diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu
index b9a4424a5..627bf4ca4 100644
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -413,7 +413,7 @@ void GPUHistEvaluator::EvaluateSplits(
     auto const world_size = collective::GetWorldSize();
     dh::TemporaryArray<DeviceSplitCandidate> all_candidate_storage(out_splits.size() * world_size);
     auto all_candidates = dh::ToSpan(all_candidate_storage);
-    collective::AllGather(device_, out_splits.data(), all_candidates.data(),
+    collective::AllGather(device_.ordinal, out_splits.data(), all_candidates.data(),
                           out_splits.size() * sizeof(DeviceSplitCandidate));
 
     // Reduce to get the best candidate from all workers.
diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh
index 667982aa9..7c61099a1 100644
--- a/src/tree/gpu_hist/evaluate_splits.cuh
+++ b/src/tree/gpu_hist/evaluate_splits.cuh
@@ -85,7 +85,7 @@ class GPUHistEvaluator {
   std::size_t node_categorical_storage_size_ = 0;
   // Is the data split column-wise?
   bool is_column_split_ = false;
-  int32_t device_;
+  DeviceOrd device_;
 
   // Copy the categories from device to host asynchronously.
   void CopyToHost( const std::vector<bst_node_t>& nidx);
@@ -133,14 +133,14 @@ class GPUHistEvaluator {
   }
 
  public:
-  GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, int32_t device)
+  GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, DeviceOrd device)
       : tree_evaluator_{param, n_features, device}, param_{param} {}
   /**
    * \brief Reset the evaluator, should be called before any use.
    */
   void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
              bst_feature_t n_features, TrainParam const &param, bool is_column_split,
-             int32_t device);
+             DeviceOrd device);
 
   /**
    * \brief Get host category storage for nidx.  Different from the internal version, this
diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu
index 69485aa81..f862e048e 100644
--- a/src/tree/gpu_hist/evaluator.cu
+++ b/src/tree/gpu_hist/evaluator.cu
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2022 by XGBoost Contributors
+ * Copyright 2022-2023 by XGBoost Contributors
  *
  * \brief Some components of GPU Hist evaluator, this file only exist to reduce nvcc
  *        compilation time.
@@ -12,11 +12,10 @@
 #include "evaluate_splits.cuh"
 #include "xgboost/data.h"
 
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
                              bst_feature_t n_features, TrainParam const &param,
-                             bool is_column_split, int32_t device) {
+                             bool is_column_split, DeviceOrd device) {
   param_ = param;
   tree_evaluator_ = TreeEvaluator{param, n_features, device};
   has_categoricals_ = cuts.HasCategorical();
@@ -127,6 +126,4 @@ common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
                              });
   return dh::ToSpan(cat_sorted_idx_);
 }
-
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/src/tree/gpu_hist/feature_groups.cuh b/src/tree/gpu_hist/feature_groups.cuh
index 3af230c2c..671272822 100644
--- a/src/tree/gpu_hist/feature_groups.cuh
+++ b/src/tree/gpu_hist/feature_groups.cuh
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020 by XGBoost Contributors
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
  */
 #ifndef FEATURE_GROUPS_CUH_
 #define FEATURE_GROUPS_CUH_
@@ -102,11 +102,10 @@ struct FeatureGroups {
     InitSingle(cuts);
   }
 
-  FeatureGroupsAccessor DeviceAccessor(int device) const {
+  [[nodiscard]] FeatureGroupsAccessor DeviceAccessor(DeviceOrd device) const {
     feature_segments.SetDevice(device);
     bin_segments.SetDevice(device);
-    return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(),
-        max_group_bins};
+    return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(), max_group_bins};
   }
 
 private:
diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu
index 1082f8955..58add0a93 100644
--- a/src/tree/gpu_hist/gradient_based_sampler.cu
+++ b/src/tree/gpu_hist/gradient_based_sampler.cu
@@ -167,10 +167,10 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
     for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
       auto page = batch.Impl();
       if (!page_) {
-        page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense,
+        page_ = std::make_unique<EllpackPageImpl>(ctx->Device(), page->Cuts(), page->is_dense,
                                                   page->row_stride, dmat->Info().num_row_);
       }
-      size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
+      size_t num_elements = page_->Copy(ctx->Device(), page, offset);
       offset += num_elements;
     }
     page_concatenated_ = true;
@@ -228,13 +228,13 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
   auto first_page = (*batch_iterator.begin()).Impl();
   // Create a new ELLPACK page with empty rows.
   page_.reset();  // Release the device memory first before reallocating
-  page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense,
+  page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense,
                                   first_page->row_stride, sample_rows));
 
   // Compact the ELLPACK pages into the single sample page.
   thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
   for (auto& batch : batch_iterator) {
-    page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
+    page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_));
   }
 
   return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
@@ -306,13 +306,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
   auto first_page = (*batch_iterator.begin()).Impl();
   // Create a new ELLPACK page with empty rows.
   page_.reset();  // Release the device memory first before reallocating
-  page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense,
+  page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense,
                                   first_page->row_stride, sample_rows));
 
   // Compact the ELLPACK pages into the single sample page.
   thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
   for (auto& batch : batch_iterator) {
-    page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
+    page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_));
   }
 
   return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu
index 78b04883c..35b43d24b 100644
--- a/src/tree/gpu_hist/row_partitioner.cu
+++ b/src/tree/gpu_hist/row_partitioner.cu
@@ -13,15 +13,15 @@
 namespace xgboost {
 namespace tree {
 
-RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
+RowPartitioner::RowPartitioner(DeviceOrd device_idx, size_t num_rows)
     : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {
-  dh::safe_cuda(cudaSetDevice(device_idx_));
+  dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
   ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
   thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
 }
 
 RowPartitioner::~RowPartitioner() {
-  dh::safe_cuda(cudaSetDevice(device_idx_));
+  dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
 }
 
 common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) {
diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh
index 64ca540f6..fde6c4dd0 100644
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -199,7 +199,7 @@ class RowPartitioner {
   static constexpr bst_node_t kIgnoredTreePosition = -1;
 
  private:
-  int device_idx_;
+  DeviceOrd device_idx_;
   /*! \brief In here if you want to find the rows belong to a node nid, first you need to
    * get the indices segment from ridx_segments[nid], then get the row index that
    * represents position of row in input data X.  `RowPartitioner::GetRows` would be a
@@ -223,7 +223,7 @@ class RowPartitioner {
   dh::PinnedMemory pinned2_;
 
  public:
-  RowPartitioner(int device_idx, size_t num_rows);
+  RowPartitioner(DeviceOrd device_idx, size_t num_rows);
   ~RowPartitioner();
   RowPartitioner(const RowPartitioner&) = delete;
   RowPartitioner& operator=(const RowPartitioner&) = delete;
diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h
index d0267b0ed..b4612e24c 100644
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -477,7 +477,7 @@ class HistEvaluator {
       : ctx_{ctx},
         param_{param},
         column_sampler_{std::move(sampler)},
-        tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId},
+        tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), DeviceOrd::CPU()},
         is_col_split_{info.IsColumnSplit()} {
     interaction_constraints_.Configure(*param, info.num_col_);
     column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
@@ -696,7 +696,7 @@ class HistMultiEvaluator {
     stats_ = linalg::Constant(ctx_, GradientPairPrecise{}, 1, n_targets);
     gain_.resize(1);
 
-    linalg::Vector<float> weight({n_targets}, ctx_->gpu_id);
+    linalg::Vector<float> weight({n_targets}, ctx_->Device());
     CalcWeight(*param_, root_sum, weight.HostView());
     auto root_gain = CalcGainGivenWeight(*param_, root_sum, weight.HostView());
     gain_.front() = root_gain;
diff --git a/src/tree/split_evaluator.h b/src/tree/split_evaluator.h
index a3b33e757..f417ff898 100644
--- a/src/tree/split_evaluator.h
+++ b/src/tree/split_evaluator.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2018-2020 by Contributors
+/**
+ * Copyright 2018-2023 by Contributors
  * \file split_evaluator.h
  * \brief Used for implementing a loss term specific to decision trees. Useful for custom regularisation.
  * \author Henry Gouk
@@ -23,8 +23,7 @@
 #include "xgboost/host_device_vector.h"
 #include "xgboost/tree_model.h"
 
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 class TreeEvaluator {
   // hist and exact use parent id to calculate constraints.
   static constexpr bst_node_t kRootParentId =
@@ -33,13 +32,13 @@ class TreeEvaluator {
   HostDeviceVector<float> lower_bounds_;
   HostDeviceVector<float> upper_bounds_;
   HostDeviceVector<int32_t> monotone_;
-  int32_t device_;
+  DeviceOrd device_;
   bool has_constraint_;
 
  public:
-  TreeEvaluator(TrainParam const& p, bst_feature_t n_features, int32_t device) {
+  TreeEvaluator(TrainParam const& p, bst_feature_t n_features, DeviceOrd device) {
     device_ = device;
-    if (device != Context::kCpuId) {
+    if (device.IsCUDA()) {
       lower_bounds_.SetDevice(device);
       upper_bounds_.SetDevice(device);
       monotone_.SetDevice(device);
@@ -59,7 +58,7 @@ class TreeEvaluator {
       has_constraint_ = true;
     }
 
-    if (device_ != Context::kCpuId) {
+    if (device_.IsCUDA()) {
       // Pull to device early.
       lower_bounds_.ConstDeviceSpan();
       upper_bounds_.ConstDeviceSpan();
@@ -122,7 +121,7 @@ class TreeEvaluator {
     }
 
     // Fast floating point division instruction on device
-    XGBOOST_DEVICE float Divide(float a, float b) const {
+    [[nodiscard]] XGBOOST_DEVICE float Divide(float a, float b) const {
 #ifdef __CUDA_ARCH__
       return __fdividef(a, b);
 #else
@@ -154,7 +153,7 @@ class TreeEvaluator {
  public:
   /* Get a view to the evaluator that can be passed down to device. */
   template <typename ParamT = TrainParam> auto GetEvaluator() const {
-    if (device_ != Context::kCpuId) {
+    if (device_.IsCUDA()) {
       auto constraints = monotone_.ConstDevicePointer();
       return SplitEvaluator<ParamT>{constraints, lower_bounds_.ConstDevicePointer(),
                                     upper_bounds_.ConstDevicePointer(), has_constraint_};
@@ -215,7 +214,6 @@ enum SplitType {
   // partition-based categorical split
   kPart = 2
 };
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
 
 #endif  // XGBOOST_TREE_SPLIT_EVALUATOR_H_
diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc
index 3afbe3e46..7a88bd30e 100644
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -154,7 +154,7 @@ class ColMaker: public TreeUpdater {
         : param_(param),
           colmaker_train_param_{colmaker_train_param},
           ctx_{ctx},
-          tree_evaluator_(param_, column_densities.size(), Context::kCpuId),
+          tree_evaluator_(param_, column_densities.size(), DeviceOrd::CPU()),
           interaction_constraints_{std::move(_interaction_constraints)},
           column_densities_(column_densities) {}
     // update one tree, growing
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 57eec0db8..8fd3120b5 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -74,7 +74,7 @@ class DeviceHistogramStorage {
   dh::device_vector<typename GradientSumT::ValueT> overflow_;
   std::map<int, size_t> overflow_nidx_map_;
   int n_bins_;
-  int device_id_;
+  DeviceOrd device_id_;
   static constexpr size_t kNumItemsInGradientSum =
       sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
   static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
@@ -82,7 +82,7 @@ class DeviceHistogramStorage {
  public:
   // Start with about 16mb
   DeviceHistogramStorage() { data_.reserve(1 << 22); }
-  void Init(int device_id, int n_bins) {
+  void Init(DeviceOrd device_id, int n_bins) {
     this->n_bins_ = n_bins;
     this->device_id_ = device_id;
   }
@@ -196,7 +196,7 @@ struct GPUHistMakerDevice {
                      common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
                      TrainParam _param, std::shared_ptr<common::ColumnSampler> column_sampler,
                      uint32_t n_features, BatchParam batch_param, MetaInfo const& info)
-      : evaluator_{_param, n_features, ctx->gpu_id},
+      : evaluator_{_param, n_features, ctx->Device()},
         ctx_(ctx),
         feature_types{_feature_types},
         param(std::move(_param)),
@@ -211,7 +211,7 @@ struct GPUHistMakerDevice {
     }
 
     CHECK(column_sampler_);
-    monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
+    monitor.Init(std::string("GPUHistMakerDevice") + ctx_->Device().Name());
   }
 
   ~GPUHistMakerDevice() = default;
@@ -220,7 +220,7 @@ struct GPUHistMakerDevice {
     if (!feature_groups) {
       CHECK(page);
       feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense,
-                                                       dh::MaxSharedMemoryOptin(ctx_->gpu_id),
+                                                       dh::MaxSharedMemoryOptin(ctx_->Ordinal()),
                                                        sizeof(GradientPairPrecise));
     }
   }
@@ -231,7 +231,7 @@ struct GPUHistMakerDevice {
     this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
                                 param.colsample_bynode, param.colsample_bylevel,
                                 param.colsample_bytree);
-    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+    dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
 
     this->interaction_constraints.Reset();
 
@@ -246,15 +246,15 @@ struct GPUHistMakerDevice {
     gpair = sample.gpair;
 
     this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
-                           dmat->Info().IsColumnSplit(), ctx_->gpu_id);
+                           dmat->Info().IsColumnSplit(), ctx_->Device());
 
     quantiser = std::make_unique<GradientQuantiser>(this->gpair, dmat->Info());
 
     row_partitioner.reset();  // Release the device memory first before reallocating
-    row_partitioner = std::make_unique<RowPartitioner>(ctx_->gpu_id, sample.sample_rows);
+    row_partitioner = std::make_unique<RowPartitioner>(ctx_->Device(), sample.sample_rows);
 
     // Init histogram
-    hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
+    hist.Init(ctx_->Device(), page->Cuts().TotalBins());
     hist.Reset();
 
     this->InitFeatureGroupsOnce();
@@ -267,7 +267,7 @@ struct GPUHistMakerDevice {
     sampled_features->SetDevice(ctx_->Device());
     common::Span<bst_feature_t> feature_set =
         interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
-    auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
+    auto matrix = page->GetDeviceAccessor(ctx_->Device());
     EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)};
     EvaluateSplitSharedInputs shared_inputs{
         gpu_param,
@@ -289,7 +289,7 @@ struct GPUHistMakerDevice {
     dh::TemporaryArray<DeviceSplitCandidate> splits_out(2 * candidates.size());
     std::vector<bst_node_t> nidx(2 * candidates.size());
     auto h_node_inputs = pinned2.GetSpan<EvaluateSplitInputs>(2 * candidates.size());
-    auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
+    auto matrix = page->GetDeviceAccessor(ctx_->Device());
     EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param}, *quantiser, feature_types,
                                             matrix.feature_segments, matrix.gidx_fvalue_map,
                                             matrix.min_fvalue,
@@ -342,9 +342,9 @@ struct GPUHistMakerDevice {
   void BuildHist(int nidx) {
     auto d_node_hist = hist.GetNodeHistogram(nidx);
     auto d_ridx = row_partitioner->GetRows(nidx);
-    BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->gpu_id),
-                           feature_groups->DeviceAccessor(ctx_->gpu_id), gpair, d_ridx, d_node_hist,
-                           *quantiser);
+    BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()),
+                           feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx,
+                           d_node_hist, *quantiser);
   }
 
   // Attempt to do subtraction trick
@@ -413,10 +413,10 @@ struct GPUHistMakerDevice {
     });
 
     collective::AllReduce<collective::Operation::kBitwiseOR>(
-        ctx_->gpu_id, decision_storage.data().get(), decision_storage.size());
+        ctx_->Ordinal(), decision_storage.data().get(), decision_storage.size());
     collective::AllReduce<collective::Operation::kBitwiseAND>(
-        ctx_->gpu_id, missing_storage.data().get(), missing_storage.size());
-    collective::Synchronize(ctx_->gpu_id);
+        ctx_->Ordinal(), missing_storage.data().get(), missing_storage.size());
+    collective::Synchronize(ctx_->Ordinal());
 
     row_partitioner->UpdatePositionBatch(
         nidx, left_nidx, right_nidx, split_data,
@@ -454,7 +454,7 @@ struct GPUHistMakerDevice {
       CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
     }
 
-    auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
+    auto d_matrix = page->GetDeviceAccessor(ctx_->Device());
 
     if (info_.IsColumnSplit()) {
       UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx);
@@ -524,9 +524,9 @@ struct GPUHistMakerDevice {
       common::Span<FeatureType const> d_feature_types, common::Span<uint32_t const> categories,
       common::Span<RegTree::CategoricalSplitMatrix::Segment> categories_segments,
       HostDeviceVector<bst_node_t>* p_out_position) {
-    auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
+    auto d_matrix = page->GetDeviceAccessor(ctx_->Device());
     auto d_gpair = this->gpair;
-    p_out_position->SetDevice(ctx_->gpu_id);
+    p_out_position->SetDevice(ctx_->Device());
     p_out_position->Resize(row_partitioner->GetRows().size());
 
     auto new_position_op = [=] __device__(size_t row_id, int position) {
@@ -613,7 +613,7 @@ struct GPUHistMakerDevice {
     monitor.Start("AllReduce");
     auto d_node_hist = hist.GetNodeHistogram(nidx).data();
     using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT;
-    collective::GlobalSum(info_, ctx_->gpu_id, reinterpret_cast<ReduceT*>(d_node_hist),
+    collective::GlobalSum(info_, ctx_->Device(), reinterpret_cast<ReduceT*>(d_node_hist),
                           page->Cuts().TotalBins() * 2 * num_histograms);
 
     monitor.Stop("AllReduce");
@@ -855,7 +855,7 @@ class GPUHistMaker : public TreeUpdater {
   }
 
   void InitDataOnce(TrainParam const* param, DMatrix* dmat) {
-    CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device";
+    CHECK_GE(ctx_->Ordinal(), 0) << "Must have at least one device";
     info_ = &dmat->Info();
 
     // Synchronise the column sampling seed
@@ -864,8 +864,8 @@ class GPUHistMaker : public TreeUpdater {
     this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
 
     auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
-    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-    info_->feature_types.SetDevice(ctx_->gpu_id);
+    dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
+    info_->feature_types.SetDevice(ctx_->Device());
     maker = std::make_unique<GPUHistMakerDevice>(
         ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
         *param, column_sampler_, info_->num_col_, batch_param, dmat->Info());
@@ -890,7 +890,7 @@ class GPUHistMaker : public TreeUpdater {
     this->InitData(param, p_fmat, p_tree);
     monitor_.Stop("InitData");
 
-    gpair->SetDevice(ctx_->gpu_id);
+    gpair->SetDevice(ctx_->Device());
     maker->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
   }
 
@@ -1023,7 +1023,7 @@ class GPUGlobalApproxMaker : public TreeUpdater {
     this->InitData(p_fmat, p_tree);
     monitor_.Stop("InitData");
 
-    gpair->SetDevice(ctx_->gpu_id);
+    gpair->SetDevice(ctx_->Device());
     maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
   }
 
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index 34890c2e5..50943e1c4 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -518,7 +518,7 @@ class QuantileHistMaker : public TreeUpdater {
     auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; };
     if (need_copy()) {
       // allocate buffer
-      sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->gpu_id, linalg::Order::kF};
+      sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->Device(), linalg::Order::kF};
       h_sample_out = sample_out.HostView();
     }
 
diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu
index d6ed400b2..a09696c19 100644
--- a/tests/cpp/collective/test_nccl_device_communicator.cu
+++ b/tests/cpp/collective/test_nccl_device_communicator.cu
@@ -34,7 +34,7 @@ void VerifyAllReduceBitwiseAND() {
   auto const rank = collective::GetRank();
   std::bitset<64> original{};
   original[rank] = true;
-  HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
+  HostDeviceVector<uint64_t> buffer({original.to_ullong()}, DeviceOrd::CUDA(rank));
   collective::AllReduce<collective::Operation::kBitwiseAND>(rank, buffer.DevicePointer(), 1);
   collective::Synchronize(rank);
   EXPECT_EQ(buffer.HostVector()[0], 0ULL);
@@ -56,7 +56,7 @@ void VerifyAllReduceBitwiseOR() {
   auto const rank = collective::GetRank();
   std::bitset<64> original{};
   original[rank] = true;
-  HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
+  HostDeviceVector<uint64_t> buffer({original.to_ullong()}, DeviceOrd::CUDA(rank));
   collective::AllReduce<collective::Operation::kBitwiseOR>(rank, buffer.DevicePointer(), 1);
   collective::Synchronize(rank);
   EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
@@ -78,7 +78,7 @@ void VerifyAllReduceBitwiseXOR() {
   auto const rank = collective::GetRank();
   std::bitset<64> original{~0ULL};
   original[rank] = false;
-  HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
+  HostDeviceVector<uint64_t> buffer({original.to_ullong()}, DeviceOrd::CUDA(rank));
   collective::AllReduce<collective::Operation::kBitwiseXOR>(rank, buffer.DevicePointer(), 1);
   collective::Synchronize(rank);
   EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc
index 70ebecd3d..5391bc2cf 100644
--- a/tests/cpp/common/test_hist_util.cc
+++ b/tests/cpp/common/test_hist_util.cc
@@ -147,7 +147,7 @@ TEST(CutsBuilder, SearchGroupInd) {
 
   EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17));
 
-  p_mat->Info().Validate(-1);
+  p_mat->Info().Validate(DeviceOrd::CPU());
   EXPECT_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17),
                dmlc::Error);
 
@@ -330,7 +330,7 @@ TEST(HistUtil, IndexBinData) {
 void TestSketchFromWeights(bool with_group) {
   size_t constexpr kRows = 300, kCols = 20, kBins = 256;
   size_t constexpr kGroups = 10;
-  auto m = RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateDMatrix();
+  auto m = RandomDataGenerator{kRows, kCols, 0}.Device(DeviceOrd::CUDA(0)).GenerateDMatrix();
   Context ctx;
   common::HistogramCuts cuts = SketchOnDMatrix(&ctx, m.get(), kBins);
 
diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu
index 91baad981..4782f9580 100644
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -208,7 +208,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
   ASSERT_EQ(info.feature_types.Size(), n_features);
 
   HostDeviceVector<bst_row_t> cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3};
-  cuts_ptr.SetDevice(0);
+  cuts_ptr.SetDevice(DeviceOrd::CUDA(0));
 
   dh::device_vector<float> weight(n_samples * n_features, 0);
   dh::Iota(dh::ToSpan(weight));
@@ -221,7 +221,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
   thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(),
                       detail::EntryCompareOp());
 
-  detail::RemoveDuplicatedCategories(ctx.gpu_id, info, cuts_ptr.DeviceSpan(), &sorted_entries,
+  detail::RemoveDuplicatedCategories(ctx.Device(), info, cuts_ptr.DeviceSpan(), &sorted_entries,
                                      &weight, &columns_ptr);
 
   auto const& h_cptr = cuts_ptr.ConstHostVector();
@@ -363,7 +363,8 @@ template <typename Adapter>
 auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) {
   common::HistogramCuts batched_cuts;
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), 0);
+  SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(),
+                                   DeviceOrd::CUDA(0));
   MetaInfo info;
   AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size);
   sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit());
@@ -430,7 +431,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) {
   ConsoleLogger::Configure({{"verbosity", "3"}});
   common::HistogramCuts batched_cuts;
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0);
+  SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0));
   AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits<float>::quiet_NaN(),
                       &sketch_container);
   HistogramCuts cuts;
@@ -458,7 +459,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
   ConsoleLogger::Configure({{"verbosity", "3"}});
   common::HistogramCuts batched_cuts;
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0);
+  SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0));
   AdapterDeviceSketch(adapter.Value(), num_bins, info,
                       std::numeric_limits<float>::quiet_NaN(),
                       &sketch_container);
@@ -493,7 +494,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
   }
 
   ASSERT_EQ(info.feature_types.Size(), 1);
-  SketchContainer container(info.feature_types, num_bins, 1, n, 0);
+  SketchContainer container(info.feature_types, num_bins, 1, n, DeviceOrd::CUDA(0));
   AdapterDeviceSketch(adapter.Value(), num_bins, info,
                       std::numeric_limits<float>::quiet_NaN(), &container);
   HistogramCuts cuts;
@@ -566,7 +567,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {
 
 namespace {
 auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
-  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
+  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
   auto n = n_samples * n_features;
   std::vector<float> x;
   x.resize(n);
@@ -606,21 +607,21 @@ void TestGetColumnSize(std::size_t n_samples) {
   std::vector<std::size_t> h_column_size_1(column_sizes_scan.size());
 
   detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, true>(
-      ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
+      ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
   thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size.begin());
 
   detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, false>(
-      ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
+      ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
   thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
   ASSERT_EQ(h_column_size, h_column_size_1);
 
   detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, true>(
-      ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
+      ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
   thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
   ASSERT_EQ(h_column_size, h_column_size_1);
 
   detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, false>(
-      ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
+      ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
   thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
   ASSERT_EQ(h_column_size, h_column_size_1);
 }
@@ -697,9 +698,9 @@ void TestAdapterSketchFromWeights(bool with_group) {
   size_t constexpr kRows = 300, kCols = 20, kBins = 256;
   size_t constexpr kGroups = 10;
   HostDeviceVector<float> storage;
-  std::string m =
-      RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface(
-          &storage);
+  std::string m = RandomDataGenerator{kRows, kCols, 0}
+                      .Device(DeviceOrd::CUDA(0))
+                      .GenerateArrayInterface(&storage);
   MetaInfo info;
   Context ctx;
   auto& h_weights = info.weights_.HostVector();
@@ -718,14 +719,14 @@ void TestAdapterSketchFromWeights(bool with_group) {
     info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups);
   }
 
-  info.weights_.SetDevice(0);
+  info.weights_.SetDevice(DeviceOrd::CUDA(0));
   info.num_row_ = kRows;
   info.num_col_ = kCols;
 
   data::CupyAdapter adapter(m);
   auto const& batch = adapter.Value();
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_container(ft, kBins, kCols, kRows, 0);
+  SketchContainer sketch_container(ft, kBins, kCols, kRows, DeviceOrd::CUDA(0));
   AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
                       &sketch_container);
 
@@ -769,7 +770,7 @@ void TestAdapterSketchFromWeights(bool with_group) {
       // https://github.com/dmlc/xgboost/issues/7946
       h_weights[i] = (i % 2 == 0 ? 1 : 2) / static_cast<float>(kGroups);
     }
-    SketchContainer sketch_container(ft, kBins, kCols, kRows, 0);
+    SketchContainer sketch_container{ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)};
     AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
                         &sketch_container);
     sketch_container.MakeCuts(&weighted, info.IsColumnSplit());
diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu
index ade2537f9..57e945cba 100644
--- a/tests/cpp/common/test_host_device_vector.cu
+++ b/tests/cpp/common/test_host_device_vector.cu
@@ -1,7 +1,6 @@
-/*!
- * Copyright 2018 XGBoost contributors
+/**
+ * Copyright 2018-2023 XGBoost contributors
  */
-
 #include <gtest/gtest.h>
 #include <thrust/equal.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -9,14 +8,13 @@
 #include "../../../src/common/device_helpers.cuh"
 #include <xgboost/host_device_vector.h>
 
-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 namespace {
-void SetDeviceForTest(int device) {
+void SetDeviceForTest(DeviceOrd device) {
   int n_devices;
   dh::safe_cuda(cudaGetDeviceCount(&n_devices));
-  device %= n_devices;
-  dh::safe_cuda(cudaSetDevice(device));
+  device.ordinal %= n_devices;
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
 }
 }  // namespace
 
@@ -31,13 +29,13 @@ struct HostDeviceVectorSetDeviceHandler {
   }
 };
 
-void InitHostDeviceVector(size_t n, int device, HostDeviceVector<int> *v) {
+void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector<int> *v) {
   // create the vector
   v->SetDevice(device);
   v->Resize(n);
 
   ASSERT_EQ(v->Size(), n);
-  ASSERT_EQ(v->DeviceIdx(), device);
+  ASSERT_EQ(v->Device(), device);
   // ensure that the device have read-write access
   ASSERT_TRUE(v->DeviceCanRead());
   ASSERT_TRUE(v->DeviceCanWrite());
@@ -57,7 +55,7 @@ void InitHostDeviceVector(size_t n, int device, HostDeviceVector<int> *v) {
 }
 
 void PlusOne(HostDeviceVector<int> *v) {
-  int device = v->DeviceIdx();
+  auto device = v->Device();
   SetDeviceForTest(device);
   thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v),
                     [=]__device__(unsigned int a){ return a + 1; });
@@ -69,7 +67,7 @@ void CheckDevice(HostDeviceVector<int>* v,
                  unsigned int first,
                  GPUAccess access) {
   ASSERT_EQ(v->Size(), size);
-  SetDeviceForTest(v->DeviceIdx());
+  SetDeviceForTest(v->Device());
 
   ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v),
                             thrust::make_counting_iterator(first)));
@@ -100,7 +98,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
   ASSERT_FALSE(v->DeviceCanWrite());
 }
 
-void TestHostDeviceVector(size_t n, int device) {
+void TestHostDeviceVector(size_t n, DeviceOrd device) {
   HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
   HostDeviceVector<int> v;
   InitHostDeviceVector(n, device, &v);
@@ -113,13 +111,13 @@ void TestHostDeviceVector(size_t n, int device) {
 
 TEST(HostDeviceVector, Basic) {
   size_t n = 1001;
-  int device = 0;
+  DeviceOrd device = DeviceOrd::CUDA(0);
   TestHostDeviceVector(n, device);
 }
 
 TEST(HostDeviceVector, Copy) {
   size_t n = 1001;
-  int device = 0;
+  auto device = DeviceOrd::CUDA(0);
   HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
 
   HostDeviceVector<int> v;
@@ -143,15 +141,15 @@ TEST(HostDeviceVector, SetDevice) {
     h_vec[i] = i;
   }
   HostDeviceVector<int> vec (h_vec);
-  auto device = 0;
+  auto device = DeviceOrd::CUDA(0);
 
   vec.SetDevice(device);
   ASSERT_EQ(vec.Size(), h_vec.size());
   auto span = vec.DeviceSpan();  // sync to device
 
-  vec.SetDevice(-1);  // pull back to cpu.
+  vec.SetDevice(DeviceOrd::CPU());  // pull back to cpu.
   ASSERT_EQ(vec.Size(), h_vec.size());
-  ASSERT_EQ(vec.DeviceIdx(), -1);
+  ASSERT_EQ(vec.Device(), DeviceOrd::CPU());
 
   auto h_vec_1 = vec.HostVector();
   ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
@@ -159,7 +157,7 @@ TEST(HostDeviceVector, SetDevice) {
 
 TEST(HostDeviceVector, Span) {
   HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
-  vec.SetDevice(0);
+  vec.SetDevice(DeviceOrd::CUDA(0));
   auto span = vec.DeviceSpan();
   ASSERT_EQ(vec.Size(), span.size());
   ASSERT_EQ(vec.DevicePointer(), span.data());
@@ -183,5 +181,4 @@ TEST(HostDeviceVector, Empty) {
   ASSERT_FALSE(another.Empty());
   ASSERT_TRUE(vec.Empty());
 }
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
diff --git a/tests/cpp/common/test_linalg.cu b/tests/cpp/common/test_linalg.cu
index b88b8e127..4823b1191 100644
--- a/tests/cpp/common/test_linalg.cu
+++ b/tests/cpp/common/test_linalg.cu
@@ -12,7 +12,7 @@ namespace xgboost::linalg {
 namespace {
 void TestElementWiseKernel() {
   auto device = DeviceOrd::CUDA(0);
-  Tensor<float, 3> l{{2, 3, 4}, 0};
+  Tensor<float, 3> l{{2, 3, 4}, device};
   {
     /**
      * Non-contiguous
diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc
index 59d06f5d3..343f59cda 100644
--- a/tests/cpp/common/test_quantile.cc
+++ b/tests/cpp/common/test_quantile.cc
@@ -9,9 +9,7 @@
 #include "../../../src/data/adapter.h"
 #include "xgboost/context.h"
 
-namespace xgboost {
-namespace common {
-
+namespace xgboost::common {
 TEST(Quantile, LoadBalance) {
   size_t constexpr kRows = 1000, kCols = 100;
   auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
@@ -314,7 +312,7 @@ void TestSameOnAllWorkers() {
         }
 
         auto m = RandomDataGenerator{kRows, kCols, 0}
-                     .Device(Context::kCpuId)
+                     .Device(DeviceOrd::CPU())
                      .Type(ft)
                      .MaxCategory(17)
                      .Seed(rank + seed)
@@ -373,6 +371,4 @@ TEST(Quantile, SameOnAllWorkers) {
   auto constexpr kWorkers = 4;
   RunWithInMemoryCommunicator(kWorkers, TestSameOnAllWorkers);
 }
-
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu
index a5ecbb598..57c9da703 100644
--- a/tests/cpp/common/test_quantile.cu
+++ b/tests/cpp/common/test_quantile.cu
@@ -25,7 +25,7 @@ class MGPUQuantileTest : public BaseMGPUTest {};
 TEST(GPUQuantile, Basic) {
   constexpr size_t kRows = 1000, kCols = 100, kBins = 256;
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch(ft, kBins, kCols, kRows, 0);
+  SketchContainer sketch(ft, kBins, kCols, kRows, FstCU());
   dh::caching_device_vector<Entry> entries;
   dh::device_vector<bst_row_t> cuts_ptr(kCols+1);
   thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0);
@@ -38,12 +38,12 @@ void TestSketchUnique(float sparsity) {
   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](int32_t seed, size_t n_bins, MetaInfo const& info) {
     HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
+    SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
 
     HostDeviceVector<float> storage;
     std::string interface_str = RandomDataGenerator{kRows, kCols, sparsity}
                                     .Seed(seed)
-                                    .Device(0)
+                                    .Device(FstCU())
                                     .GenerateArrayInterface(&storage);
     data::CupyAdapter adapter(interface_str);
     AdapterDeviceSketch(adapter.Value(), n_bins, info,
@@ -58,7 +58,7 @@ void TestSketchUnique(float sparsity) {
         thrust::make_counting_iterator(0llu),
         [=] __device__(size_t idx) { return batch.GetElement(idx); });
     auto end = kCols * kRows;
-    detail::GetColumnSizesScan(0, kCols, n_cuts, IterSpan{batch_iter, end}, is_valid,
+    detail::GetColumnSizesScan(FstCU(), kCols, n_cuts, IterSpan{batch_iter, end}, is_valid,
                                &cut_sizes_scan, &column_sizes_scan);
     auto const& cut_sizes = cut_sizes_scan.HostVector();
     ASSERT_LE(sketch.Data().size(), cut_sizes.back());
@@ -86,9 +86,9 @@ TEST(GPUQuantile, Unique) {
 }
 
 // if with_error is true, the test tolerates floating point error
-void TestQuantileElemRank(int32_t device, Span<SketchEntry const> in,
+void TestQuantileElemRank(DeviceOrd device, Span<SketchEntry const> in,
                           Span<bst_row_t const> d_columns_ptr, bool with_error = false) {
-  dh::safe_cuda(cudaSetDevice(device));
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
   std::vector<SketchEntry> h_in(in.size());
   dh::CopyDeviceSpanToVector(&h_in, in);
   std::vector<bst_row_t> h_columns_ptr(d_columns_ptr.size());
@@ -123,13 +123,12 @@ TEST(GPUQuantile, Prune) {
   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
     HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
+    SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
 
     HostDeviceVector<float> storage;
-    std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                    .Device(0)
-                                    .Seed(seed)
-                                    .GenerateArrayInterface(&storage);
+    std::string interface_str =
+        RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
+            &storage);
     data::CupyAdapter adapter(interface_str);
     AdapterDeviceSketch(adapter.Value(), n_bins, info,
                         std::numeric_limits<float>::quiet_NaN(), &sketch);
@@ -145,7 +144,7 @@ TEST(GPUQuantile, Prune) {
     ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(),
                                   sketch.Data().data() + sketch.Data().size(),
                                   detail::SketchUnique{}));
-    TestQuantileElemRank(0, sketch.Data(), sketch.ColumnsPtr());
+    TestQuantileElemRank(FstCU(), sketch.Data(), sketch.ColumnsPtr());
   });
 }
 
@@ -153,10 +152,10 @@ TEST(GPUQuantile, MergeEmpty) {
   constexpr size_t kRows = 1000, kCols = 100;
   size_t n_bins = 10;
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0);
+  SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU());
   HostDeviceVector<float> storage_0;
   std::string interface_str_0 =
-      RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface(
+      RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).GenerateArrayInterface(
           &storage_0);
   data::CupyAdapter adapter_0(interface_str_0);
   MetaInfo info;
@@ -193,34 +192,33 @@ TEST(GPUQuantile, MergeBasic) {
   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) {
     HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0);
+    SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU());
     HostDeviceVector<float> storage_0;
     std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0}
-                                      .Device(0)
+                                      .Device(FstCU())
                                       .Seed(seed)
                                       .GenerateArrayInterface(&storage_0);
     data::CupyAdapter adapter_0(interface_str_0);
     AdapterDeviceSketch(adapter_0.Value(), n_bins, info,
                         std::numeric_limits<float>::quiet_NaN(), &sketch_0);
 
-    SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, 0);
+    SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, FstCU());
     HostDeviceVector<float> storage_1;
-    std::string interface_str_1 = RandomDataGenerator{kRows, kCols, 0}
-                                      .Device(0)
-                                      .Seed(seed)
-                                      .GenerateArrayInterface(&storage_1);
+    std::string interface_str_1 =
+        RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
+            &storage_1);
     data::CupyAdapter adapter_1(interface_str_1);
-    AdapterDeviceSketch(adapter_1.Value(), n_bins, info,
-                        std::numeric_limits<float>::quiet_NaN(), &sketch_1);
+    AdapterDeviceSketch(adapter_1.Value(), n_bins, info, std::numeric_limits<float>::quiet_NaN(),
+                        &sketch_1);
 
     size_t size_before_merge = sketch_0.Data().size();
     sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
     if (info.weights_.Size() != 0) {
-      TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), true);
+      TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), true);
       sketch_0.FixError();
-      TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), false);
+      TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), false);
     } else {
-      TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr());
+      TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr());
     }
 
     auto columns_ptr = sketch_0.ColumnsPtr();
@@ -240,24 +238,22 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
   MetaInfo info;
   int32_t seed = 0;
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch_0(ft, n_bins, cols, rows, 0);
+  SketchContainer sketch_0(ft, n_bins, cols, rows, FstCU());
   HostDeviceVector<float> storage_0;
-  std::string interface_str_0 = RandomDataGenerator{rows, cols, 0}
-                                    .Device(0)
-                                    .Seed(seed)
-                                    .GenerateArrayInterface(&storage_0);
+  std::string interface_str_0 =
+      RandomDataGenerator{rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
+          &storage_0);
   data::CupyAdapter adapter_0(interface_str_0);
   AdapterDeviceSketch(adapter_0.Value(), n_bins, info,
                       std::numeric_limits<float>::quiet_NaN(),
                       &sketch_0);
 
   size_t f_rows = rows * frac;
-  SketchContainer sketch_1(ft, n_bins, cols, f_rows, 0);
+  SketchContainer sketch_1(ft, n_bins, cols, f_rows, FstCU());
   HostDeviceVector<float> storage_1;
-  std::string interface_str_1 = RandomDataGenerator{f_rows, cols, 0}
-                                    .Device(0)
-                                    .Seed(seed)
-                                    .GenerateArrayInterface(&storage_1);
+  std::string interface_str_1 =
+      RandomDataGenerator{f_rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
+          &storage_1);
   auto data_1 = storage_1.DeviceSpan();
   auto tuple_it = thrust::make_tuple(
       thrust::make_counting_iterator<size_t>(0ul), data_1.data());
@@ -279,7 +275,7 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
 
   size_t size_before_merge = sketch_0.Data().size();
   sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
-  TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr());
+  TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr());
 
   auto columns_ptr = sketch_0.ColumnsPtr();
   std::vector<bst_row_t> h_columns_ptr(columns_ptr.size());
@@ -310,11 +306,10 @@ TEST(GPUQuantile, MergeDuplicated) {
 TEST(GPUQuantile, MultiMerge) {
   constexpr size_t kRows = 20, kCols = 1;
   int32_t world = 2;
-  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
-                                 MetaInfo const &info) {
+  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
     // Set up single node version
     HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0);
+    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, FstCU());
 
     size_t intermediate_num_cuts = std::min(
         kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
@@ -322,12 +317,12 @@ TEST(GPUQuantile, MultiMerge) {
     for (auto rank = 0; rank < world; ++rank) {
       HostDeviceVector<float> storage;
       std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                      .Device(0)
+                                      .Device(FstCU())
                                       .Seed(rank + seed)
                                       .GenerateArrayInterface(&storage);
       data::CupyAdapter adapter(interface_str);
       HostDeviceVector<FeatureType> ft;
-      containers.emplace_back(ft, n_bins, kCols, kRows, 0);
+      containers.emplace_back(ft, n_bins, kCols, kRows, FstCU());
       AdapterDeviceSketch(adapter.Value(), n_bins, info,
                           std::numeric_limits<float>::quiet_NaN(),
                           &containers.back());
@@ -337,12 +332,10 @@ TEST(GPUQuantile, MultiMerge) {
       sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data());
       sketch_on_single_node.FixError();
     }
-    TestQuantileElemRank(0, sketch_on_single_node.Data(),
-                         sketch_on_single_node.ColumnsPtr());
+    TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr());
 
     sketch_on_single_node.Unique();
-    TestQuantileElemRank(0, sketch_on_single_node.Data(),
-                         sketch_on_single_node.ColumnsPtr());
+    TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr());
   });
 }
 
@@ -351,7 +344,7 @@ void TestAllReduceBasic() {
   auto const world = collective::GetWorldSize();
   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
-    auto const device = GPUIDX;
+    auto const device = DeviceOrd::CUDA(GPUIDX);
 
     // Set up single node version;
     HostDeviceVector<FeatureType> ft({}, device);
@@ -483,7 +476,7 @@ void TestSameOnAllWorkers() {
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
                                  MetaInfo const &info) {
     auto const rank = collective::GetRank();
-    auto const device = GPUIDX;
+    auto const device = DeviceOrd::CUDA(GPUIDX);
     HostDeviceVector<FeatureType> ft({}, device);
     SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
     HostDeviceVector<float> storage({}, device);
@@ -514,9 +507,9 @@ void TestSameOnAllWorkers() {
     thrust::copy(thrust::device, local_data.data(),
                  local_data.data() + local_data.size(),
                  all_workers.begin() + local_data.size() * rank);
-    collective::AllReduce<collective::Operation::kSum>(device, all_workers.data().get(),
+    collective::AllReduce<collective::Operation::kSum>(device.ordinal, all_workers.data().get(),
                                                        all_workers.size());
-    collective::Synchronize(device);
+    collective::Synchronize(device.ordinal);
 
     auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float);
     std::vector<float> h_base_line(base_line.size());
@@ -562,7 +555,7 @@ TEST(GPUQuantile, Push) {
   columns_ptr[1] = kRows;
 
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
+  SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
   sketch.Push(dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows, {});
 
   auto sketch_data = sketch.Data();
@@ -602,7 +595,7 @@ TEST(GPUQuantile, MultiColPush) {
 
   int32_t n_bins = 16;
   HostDeviceVector<FeatureType> ft;
-  SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
+  SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
   dh::device_vector<Entry> d_entries {entries};
 
   dh::device_vector<size_t> columns_ptr(kCols + 1, 0);
diff --git a/tests/cpp/common/test_ranking_utils.cc b/tests/cpp/common/test_ranking_utils.cc
index 919102278..b57ee90cd 100644
--- a/tests/cpp/common/test_ranking_utils.cc
+++ b/tests/cpp/common/test_ranking_utils.cc
@@ -95,7 +95,7 @@ void TestRankingCache(Context const* ctx) {
   HostDeviceVector<float> predt(info.num_row_, 0);
   auto& h_predt = predt.HostVector();
   std::iota(h_predt.begin(), h_predt.end(), 0.0f);
-  predt.SetDevice(ctx->gpu_id);
+  predt.SetDevice(ctx->Device());
 
   auto rank_idx =
       cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
@@ -129,7 +129,7 @@ void TestNDCGCache(Context const* ctx) {
     auto fail = [&]() { NDCGCache cache{ctx, info, param}; };
     // empty label
     ASSERT_THROW(fail(), dmlc::Error);
-    info.labels = linalg::Matrix<float>{{0.0f, 0.1f, 0.2f}, {3}, Context::kCpuId};
+    info.labels = linalg::Matrix<float>{{0.0f, 0.1f, 0.2f}, {3}, DeviceOrd::CPU()};
     // invalid label
     ASSERT_THROW(fail(), dmlc::Error);
     auto h_labels = info.labels.HostView();
diff --git a/tests/cpp/common/test_ranking_utils.cu b/tests/cpp/common/test_ranking_utils.cu
index 86ce4b6d0..378394d67 100644
--- a/tests/cpp/common/test_ranking_utils.cu
+++ b/tests/cpp/common/test_ranking_utils.cu
@@ -35,7 +35,7 @@ void TestCalcQueriesInvIDCG() {
   auto d_scores = dh::ToSpan(scores);
   common::SegmentedSequence(&ctx, d_group_ptr, d_scores);
 
-  linalg::Vector<double> inv_IDCG({n_groups}, ctx.gpu_id);
+  linalg::Vector<double> inv_IDCG({n_groups}, ctx.Device());
 
   ltr::LambdaRankParam p;
   p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}});
@@ -70,7 +70,7 @@ void TestRankingCache(Context const* ctx) {
   HostDeviceVector<float> predt(info.num_row_, 0);
   auto& h_predt = predt.HostVector();
   std::iota(h_predt.begin(), h_predt.end(), 0.0f);
-  predt.SetDevice(ctx->gpu_id);
+  predt.SetDevice(ctx->Device());
 
   auto rank_idx =
       cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
diff --git a/tests/cpp/common/test_stats.cc b/tests/cpp/common/test_stats.cc
index e74caeb3a..b4a8f50c4 100644
--- a/tests/cpp/common/test_stats.cc
+++ b/tests/cpp/common/test_stats.cc
@@ -9,12 +9,11 @@
 #include "../../../src/common/transform_iterator.h"  // common::MakeIndexTransformIter
 #include "../helpers.h"
 
-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 TEST(Stats, Quantile) {
   Context ctx;
   {
-    linalg::Tensor<float, 1> arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, Context::kCpuId);
+    linalg::Tensor<float, 1> arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, DeviceOrd::CPU());
     std::vector<size_t> index{0, 2, 3, 4, 6};
     auto h_arr = arr.HostView();
     auto beg = MakeIndexTransformIter([&](size_t i) { return h_arr(index[i]); });
@@ -40,8 +39,8 @@ TEST(Stats, Quantile) {
 
 TEST(Stats, WeightedQuantile) {
   Context ctx;
-  linalg::Tensor<float, 1> arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, Context::kCpuId);
-  linalg::Tensor<float, 1> weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, Context::kCpuId);
+  linalg::Tensor<float, 1> arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, DeviceOrd::CPU());
+  linalg::Tensor<float, 1> weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, DeviceOrd::CPU());
 
   auto h_arr = arr.HostView();
   auto h_weight = weight.HostView();
@@ -64,7 +63,7 @@ TEST(Stats, Median) {
   Context ctx;
 
   {
-    linalg::Tensor<float, 2> values{{.0f, .0f, 1.f, 2.f}, {4}, Context::kCpuId};
+    linalg::Tensor<float, 2> values{{.0f, .0f, 1.f, 2.f}, {4}, DeviceOrd::CPU()};
     HostDeviceVector<float> weights;
     linalg::Tensor<float, 1> out;
     Median(&ctx, values, weights, &out);
@@ -83,7 +82,7 @@ TEST(Stats, Median) {
   {
     ctx = ctx.MakeCPU();
     // 4x2 matrix
-    linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.gpu_id};
+    linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.Device()};
     HostDeviceVector<float> weights;
     linalg::Tensor<float, 1> out;
     Median(&ctx, values, weights, &out);
@@ -102,14 +101,14 @@ TEST(Stats, Median) {
 namespace {
 void TestMean(Context const* ctx) {
   std::size_t n{128};
-  linalg::Vector<float> data({n}, ctx->gpu_id);
+  linalg::Vector<float> data({n}, ctx->Device());
   auto h_v = data.HostView().Values();
   std::iota(h_v.begin(), h_v.end(), .0f);
 
   auto nf = static_cast<float>(n);
   float mean = nf * (nf - 1) / 2 / n;
 
-  linalg::Vector<float> res{{1}, ctx->gpu_id};
+  linalg::Vector<float> res{{1}, ctx->Device()};
   Mean(ctx, data, &res);
   auto h_res = res.HostView();
   ASSERT_EQ(h_res.Size(), 1);
@@ -128,5 +127,4 @@ TEST(Stats, GPUMean) {
   TestMean(&ctx);
 }
 #endif  // defined(XGBOOST_USE_CUDA)
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
diff --git a/tests/cpp/common/test_stats.cu b/tests/cpp/common/test_stats.cu
index 3dc90e069..28d471423 100644
--- a/tests/cpp/common/test_stats.cu
+++ b/tests/cpp/common/test_stats.cu
@@ -20,8 +20,8 @@ namespace common {
 namespace {
 class StatsGPU : public ::testing::Test {
  private:
-  linalg::Tensor<float, 1> arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, 0};
-  linalg::Tensor<std::size_t, 1> indptr_{{0, 5, 10}, {3}, 0};
+  linalg::Tensor<float, 1> arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, FstCU()};
+  linalg::Tensor<std::size_t, 1> indptr_{{0, 5, 10}, {3}, FstCU()};
   HostDeviceVector<float> results_;
   using TestSet = std::vector<std::pair<float, float>>;
   Context ctx_;
@@ -46,7 +46,7 @@ class StatsGPU : public ::testing::Test {
     data.insert(data.cend(), seg.begin(), seg.end());
     data.insert(data.cend(), seg.begin(), seg.end());
     data.insert(data.cend(), seg.begin(), seg.end());
-    linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0};
+    linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, FstCU()};
     auto d_arr = arr.View(DeviceOrd::CUDA(0));
 
     auto key_it = dh::MakeTransformIterator<std::size_t>(
@@ -58,7 +58,7 @@ class StatsGPU : public ::testing::Test {
 
     // one alpha for each segment
     HostDeviceVector<float> alphas{0.0f, 0.5f, 1.0f};
-    alphas.SetDevice(0);
+    alphas.SetDevice(FstCU());
     auto d_alphas = alphas.ConstDeviceSpan();
     auto w_it = thrust::make_constant_iterator(0.1f);
     SegmentedWeightedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it,
@@ -80,7 +80,7 @@ class StatsGPU : public ::testing::Test {
     auto val_it =
         dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                          [=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); });
-    linalg::Tensor<float, 1> weights{{10}, 0};
+    linalg::Tensor<float, 1> weights{{10}, FstCU()};
     linalg::ElementWiseTransformDevice(weights.View(DeviceOrd::CUDA(0)),
                                        [=] XGBOOST_DEVICE(std::size_t, float) { return 1.0; });
     auto w_it = weights.Data()->ConstDevicePointer();
@@ -101,7 +101,7 @@ class StatsGPU : public ::testing::Test {
     data.insert(data.cend(), seg.begin(), seg.end());
     data.insert(data.cend(), seg.begin(), seg.end());
     data.insert(data.cend(), seg.begin(), seg.end());
-    linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0};
+    linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, FstCU()};
     auto d_arr = arr.View(DeviceOrd::CUDA(0));
 
     auto key_it = dh::MakeTransformIterator<std::size_t>(
@@ -113,7 +113,7 @@ class StatsGPU : public ::testing::Test {
 
     // one alpha for each segment
     HostDeviceVector<float> alphas{0.1f, 0.2f, 0.4f};
-    alphas.SetDevice(0);
+    alphas.SetDevice(FstCU());
     auto d_alphas = alphas.ConstDeviceSpan();
     SegmentedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it,
                       val_it + d_arr.Size(), &results_);
diff --git a/tests/cpp/common/test_transform_range.cc b/tests/cpp/common/test_transform_range.cc
index 6e3ae9d82..24d0267b6 100644
--- a/tests/cpp/common/test_transform_range.cc
+++ b/tests/cpp/common/test_transform_range.cc
@@ -11,63 +11,59 @@
 #include "../../../src/common/transform.h"
 #include "../helpers.h"
 
+namespace xgboost::common {
+namespace {
+constexpr DeviceOrd TransformDevice() {
 #if defined(__CUDACC__)
-
-#define TRANSFORM_GPU 0
-
+  return DeviceOrd::CUDA(0);
 #else
-
-#define TRANSFORM_GPU -1
-
+  return DeviceOrd::CPU();
 #endif
-
-namespace xgboost {
-namespace common {
+}
+}  // namespace
 
 template <typename T>
 struct TestTransformRange {
-  void XGBOOST_DEVICE operator()(size_t _idx,
-                                 Span<bst_float> _out, Span<const bst_float> _in) {
+  void XGBOOST_DEVICE operator()(std::size_t _idx, Span<float> _out, Span<const float> _in) {
     _out[_idx] = _in[_idx];
   }
 };
 
 TEST(Transform, DeclareUnifiedTest(Basic)) {
-  const size_t size {256};
-  std::vector<bst_float> h_in(size);
-  std::vector<bst_float> h_out(size);
+  const size_t size{256};
+  std::vector<float> h_in(size);
+  std::vector<float> h_out(size);
   std::iota(h_in.begin(), h_in.end(), 0);
-  std::vector<bst_float> h_sol(size);
+  std::vector<float> h_sol(size);
   std::iota(h_sol.begin(), h_sol.end(), 0);
 
-  const HostDeviceVector<bst_float> in_vec{h_in, TRANSFORM_GPU};
-  HostDeviceVector<bst_float> out_vec{h_out, TRANSFORM_GPU};
+  auto device = TransformDevice();
+  HostDeviceVector<float> const in_vec{h_in, device};
+  HostDeviceVector<float> out_vec{h_out, device};
   out_vec.Fill(0);
 
-  Transform<>::Init(TestTransformRange<bst_float>{},
+  Transform<>::Init(TestTransformRange<float>{},
                     Range{0, static_cast<Range::DifferenceType>(size)}, AllThreadsForTest(),
-                    TRANSFORM_GPU)
+                    TransformDevice())
       .Eval(&out_vec, &in_vec);
-  std::vector<bst_float> res = out_vec.HostVector();
+  std::vector<float> res = out_vec.HostVector();
 
   ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin()));
 }
 
 #if !defined(__CUDACC__)
 TEST(TransformDeathTest, Exception) {
-  size_t const kSize {16};
-  std::vector<bst_float> h_in(kSize);
-  const HostDeviceVector<bst_float> in_vec{h_in, -1};
+  size_t const kSize{16};
+  std::vector<float> h_in(kSize);
+  const HostDeviceVector<float> in_vec{h_in, DeviceOrd::CPU()};
   EXPECT_DEATH(
       {
         Transform<>::Init([](size_t idx, common::Span<float const> _in) { _in[idx + 1]; },
                           Range(0, static_cast<Range::DifferenceType>(kSize)), AllThreadsForTest(),
-                          -1)
+                          DeviceOrd::CPU())
             .Eval(&in_vec);
       },
       "");
 }
 #endif
-
-} // namespace common
-} // namespace xgboost
+}  // namespace xgboost::common
diff --git a/tests/cpp/common/test_transform_range.cu b/tests/cpp/common/test_transform_range.cu
new file mode 100644
index 000000000..b0fa7c102
--- /dev/null
+++ b/tests/cpp/common/test_transform_range.cu
@@ -0,0 +1,5 @@
+/**
+ * Copyright 2023 XGBoost contributors
+ */
+// Re-includes the .cc source so it is also built as CUDA, keeping the GPU tests.
+#include "test_transform_range.cc"
diff --git a/tests/cpp/data/test_device_adapter.cu b/tests/cpp/data/test_device_adapter.cu
index db70c216c..2190dbe5b 100644
--- a/tests/cpp/data/test_device_adapter.cu
+++ b/tests/cpp/data/test_device_adapter.cu
@@ -59,12 +59,12 @@ TEST(DeviceAdapter, GetRowCounts) {
   for (bst_feature_t n_features : {1, 2, 4, 64, 128, 256}) {
     HostDeviceVector<float> storage;
     auto str_arr = RandomDataGenerator{8192, n_features, 0.0}
-                       .Device(ctx.gpu_id)
+                       .Device(ctx.Device())
                        .GenerateArrayInterface(&storage);
     auto adapter = CupyAdapter{str_arr};
     HostDeviceVector<bst_row_t> offset(adapter.NumRows() + 1, 0);
-    offset.SetDevice(ctx.gpu_id);
-    auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.gpu_id,
+    offset.SetDevice(ctx.Device());
+    auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.Device(),
                                 std::numeric_limits<float>::quiet_NaN());
     ASSERT_EQ(rstride, n_features);
   }
diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu
index 4b279a1a4..ab4539fd4 100644
--- a/tests/cpp/data/test_ellpack_page.cu
+++ b/tests/cpp/data/test_ellpack_page.cu
@@ -94,7 +94,7 @@ TEST(EllpackPage, FromCategoricalBasic) {
   Context ctx{MakeCUDACtx(0)};
   auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()};
   auto ellpack = EllpackPage(&ctx, m.get(), p);
-  auto accessor = ellpack.Impl()->GetDeviceAccessor(0);
+  auto accessor = ellpack.Impl()->GetDeviceAccessor(FstCU());
   ASSERT_EQ(kCats, accessor.NumBins());
 
   auto x_copy = x;
@@ -152,13 +152,12 @@ TEST(EllpackPage, Copy) {
   auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
 
   // Create an empty result page.
-  EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride,
-                         kRows);
+  EllpackPageImpl result(FstCU(), page->Cuts(), page->is_dense, page->row_stride, kRows);
 
   // Copy batch pages into the result page.
   size_t offset = 0;
   for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
-    size_t num_elements = result.Copy(0, batch.Impl(), offset);
+    size_t num_elements = result.Copy(FstCU(), batch.Impl(), offset);
     offset += num_elements;
   }
 
@@ -172,10 +171,12 @@ TEST(EllpackPage, Copy) {
     EXPECT_EQ(impl->base_rowid, current_row);
 
     for (size_t i = 0; i < impl->Size(); i++) {
-      dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
+      dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), current_row,
+                                         row_d.data().get()));
       thrust::copy(row_d.begin(), row_d.end(), row.begin());
 
-      dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(0), current_row, row_result_d.data().get()));
+      dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(FstCU()), current_row,
+                                         row_result_d.data().get()));
       thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
 
       EXPECT_EQ(row, row_result);
@@ -199,8 +200,7 @@ TEST(EllpackPage, Compact) {
   auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
 
   // Create an empty result page.
-  EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride,
-                         kCompactedRows);
+  EllpackPageImpl result(FstCU(), page->Cuts(), page->is_dense, page->row_stride, kCompactedRows);
 
   // Compact batch pages into the result page.
   std::vector<size_t> row_indexes_h {
@@ -209,7 +209,7 @@ TEST(EllpackPage, Compact) {
   thrust::device_vector<size_t> row_indexes_d = row_indexes_h;
   common::Span<size_t> row_indexes_span(row_indexes_d.data().get(), kRows);
   for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
-    result.Compact(0, batch.Impl(), row_indexes_span);
+    result.Compact(FstCU(), batch.Impl(), row_indexes_span);
   }
 
   size_t current_row = 0;
@@ -228,13 +228,13 @@ TEST(EllpackPage, Compact) {
         continue;
       }
 
-      dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0),
+      dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()),
                                          current_row, row_d.data().get()));
       dh::safe_cuda(cudaDeviceSynchronize());
       thrust::copy(row_d.begin(), row_d.end(), row.begin());
 
       dh::LaunchN(kCols,
-                  ReadRowFunction(result.GetDeviceAccessor(0), compacted_row,
+                  ReadRowFunction(result.GetDeviceAccessor(FstCU()), compacted_row,
                                   row_result_d.data().get()));
       thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
 
diff --git a/tests/cpp/data/test_gradient_index.cc b/tests/cpp/data/test_gradient_index.cc
index f2ade711b..9fb11921c 100644
--- a/tests/cpp/data/test_gradient_index.cc
+++ b/tests/cpp/data/test_gradient_index.cc
@@ -30,7 +30,7 @@ namespace xgboost::data {
 TEST(GradientIndex, ExternalMemoryBaseRowID) {
   Context ctx;
   auto p_fmat = RandomDataGenerator{4096, 256, 0.5}
-                    .Device(ctx.gpu_id)
+                    .Device(ctx.Device())
                     .Batches(8)
                     .GenerateSparsePageDMatrix("cache", true);
 
diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu
index 6b856f3fa..f7985df45 100644
--- a/tests/cpp/data/test_iterative_dmatrix.cu
+++ b/tests/cpp/data/test_iterative_dmatrix.cu
@@ -11,9 +11,7 @@
 #include "../helpers.h"
 #include "test_iterative_dmatrix.h"
 
-namespace xgboost {
-namespace data {
-
+namespace xgboost::data {
 void TestEquivalent(float sparsity) {
   Context ctx{MakeCUDACtx(0)};
 
@@ -23,14 +21,14 @@ void TestEquivalent(float sparsity) {
   std::size_t offset = 0;
   auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl();
   std::unique_ptr<EllpackPageImpl> page_concatenated {
-    new EllpackPageImpl(0, first->Cuts(), first->is_dense,
+    new EllpackPageImpl(ctx.Device(), first->Cuts(), first->is_dense,
                         first->row_stride, 1000 * 100)};
   for (auto& batch : m.GetBatches<EllpackPage>(&ctx, {})) {
     auto page = batch.Impl();
-    size_t num_elements = page_concatenated->Copy(0, page, offset);
+    size_t num_elements = page_concatenated->Copy(ctx.Device(), page, offset);
     offset += num_elements;
   }
-  auto from_iter = page_concatenated->GetDeviceAccessor(0);
+  auto from_iter = page_concatenated->GetDeviceAccessor(ctx.Device());
   ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::Cols());
   ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::Rows());
 
@@ -40,7 +38,7 @@ void TestEquivalent(float sparsity) {
       DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 0)};
   auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
   for (auto& ellpack : dm->GetBatches<EllpackPage>(&ctx, bp)) {
-    auto from_data = ellpack.Impl()->GetDeviceAccessor(0);
+    auto from_data = ellpack.Impl()->GetDeviceAccessor(ctx.Device());
 
     std::vector<float> cuts_from_iter(from_iter.gidx_fvalue_map.size());
     std::vector<float> min_fvalues_iter(from_iter.min_fvalue.size());
@@ -152,10 +150,10 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) {
   auto impl = ellpack.Impl();
   common::CompressedIterator<uint32_t> iterator(
       impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
-  EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(0).NullValue());
-  EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(0).NullValue());
+  EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(ctx.Device()).NullValue());
+  EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(ctx.Device()).NullValue());
   // null values get placed after valid values in a row
-  EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(0).NullValue());
+  EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(ctx.Device()).NullValue());
   EXPECT_EQ(m.Info().num_col_, cols);
   EXPECT_EQ(m.Info().num_row_, rows);
   EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3);
@@ -183,5 +181,4 @@ TEST(IterativeDeviceDMatrix, Ref) {
   TestRefDMatrix<EllpackPage, CudaArrayIterForTest>(
       &ctx, [](EllpackPage const& page) { return page.Impl()->Cuts(); });
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc
index dbaffb7cd..3e96d1919 100644
--- a/tests/cpp/data/test_metainfo.cc
+++ b/tests/cpp/data/test_metainfo.cc
@@ -12,6 +12,7 @@
 #include "../helpers.h"
 #include "xgboost/base.h"
 
+namespace xgboost {
 TEST(MetaInfo, GetSet) {
   xgboost::Context ctx;
   xgboost::MetaInfo info;
@@ -236,9 +237,9 @@ TEST(MetaInfo, Validate) {
   info.num_nonzero_ = 12;
   info.num_col_ = 3;
   std::vector<xgboost::bst_group_t> groups (11);
-  xgboost::Context ctx;
+  Context ctx;
   info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, 11);
-  EXPECT_THROW(info.Validate(0), dmlc::Error);
+  EXPECT_THROW(info.Validate(FstCU()), dmlc::Error);
 
   std::vector<float> labels(info.num_row_ + 1);
   EXPECT_THROW(
@@ -261,11 +262,11 @@ TEST(MetaInfo, Validate) {
   info.group_ptr_.clear();
   labels.resize(info.num_row_);
   info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_);
-  info.labels.SetDevice(0);
-  EXPECT_THROW(info.Validate(1), dmlc::Error);
+  info.labels.SetDevice(FstCU());
+  EXPECT_THROW(info.Validate(DeviceOrd::CUDA(1)), dmlc::Error);
 
   xgboost::HostDeviceVector<xgboost::bst_group_t> d_groups{groups};
-  d_groups.SetDevice(0);
+  d_groups.SetDevice(FstCU());
   d_groups.DevicePointer();  // pull to device
   std::string arr_interface_str{ArrayInterfaceStr(xgboost::linalg::MakeVec(
       d_groups.ConstDevicePointer(), d_groups.Size(), xgboost::DeviceOrd::CUDA(0)))};
@@ -306,6 +307,5 @@ TEST(MetaInfo, HostExtend) {
   }
 }
 
-namespace xgboost {
 TEST(MetaInfo, CPUStridedData) { TestMetaInfoStridedData(DeviceOrd::CPU()); }
 }  // namespace xgboost
diff --git a/tests/cpp/data/test_proxy_dmatrix.cc b/tests/cpp/data/test_proxy_dmatrix.cc
index a6d0b2188..996836ed6 100644
--- a/tests/cpp/data/test_proxy_dmatrix.cc
+++ b/tests/cpp/data/test_proxy_dmatrix.cc
@@ -1,31 +1,27 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023, XGBoost contributors
  */
 #include <gtest/gtest.h>
-#include "../helpers.h"
-#include "../../../src/data/proxy_dmatrix.h"
-#include "../../../src/data/adapter.h"
 
-namespace xgboost {
-namespace data {
+#include "../../../src/data/adapter.h"
+#include "../../../src/data/proxy_dmatrix.h"
+#include "../helpers.h"
+
+namespace xgboost::data {
 TEST(ProxyDMatrix, HostData) {
   DMatrixProxy proxy;
   size_t constexpr kRows = 100, kCols = 10;
   std::vector<HostDeviceVector<float>> label_storage(1);
 
   HostDeviceVector<float> storage;
-  auto data = RandomDataGenerator(kRows, kCols, 0.5)
-                  .Device(0)
-                  .GenerateArrayInterface(&storage);
+  auto data =
+      RandomDataGenerator(kRows, kCols, 0.5).Device(FstCU()).GenerateArrayInterface(&storage);
 
   proxy.SetArrayData(data.c_str());
 
-  auto n_samples = HostAdapterDispatch(
-      &proxy, [](auto const &value) { return value.Size(); });
+  auto n_samples = HostAdapterDispatch(&proxy, [](auto const &value) { return value.Size(); });
   ASSERT_EQ(n_samples, kRows);
-  auto n_features = HostAdapterDispatch(
-      &proxy, [](auto const &value) { return value.NumCols(); });
+  auto n_features = HostAdapterDispatch(&proxy, [](auto const &value) { return value.NumCols(); });
   ASSERT_EQ(n_features, kCols);
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
diff --git a/tests/cpp/data/test_proxy_dmatrix.cu b/tests/cpp/data/test_proxy_dmatrix.cu
index ab38f51bb..e7780951c 100644
--- a/tests/cpp/data/test_proxy_dmatrix.cu
+++ b/tests/cpp/data/test_proxy_dmatrix.cu
@@ -15,10 +15,12 @@ namespace xgboost::data {
 TEST(ProxyDMatrix, DeviceData) {
   constexpr size_t kRows{100}, kCols{100};
   HostDeviceVector<float> storage;
-  auto data = RandomDataGenerator(kRows, kCols, 0.5).Device(0).GenerateArrayInterface(&storage);
+  auto data =
+      RandomDataGenerator(kRows, kCols, 0.5).Device(FstCU()).GenerateArrayInterface(&storage);
   std::vector<HostDeviceVector<float>> label_storage(1);
-  auto labels =
-      RandomDataGenerator(kRows, 1, 0).Device(0).GenerateColumnarArrayInterface(&label_storage);
+  auto labels = RandomDataGenerator(kRows, 1, 0)
+                    .Device(FstCU())
+                    .GenerateColumnarArrayInterface(&label_storage);
 
   DMatrixProxy proxy;
   proxy.SetCUDAArray(data.c_str());
@@ -31,7 +33,7 @@ TEST(ProxyDMatrix, DeviceData) {
 
   std::vector<HostDeviceVector<float>> columnar_storage(kCols);
   data = RandomDataGenerator(kRows, kCols, 0)
-             .Device(0)
+             .Device(FstCU())
              .GenerateColumnarArrayInterface(&columnar_storage);
   proxy.SetCUDAArray(data.c_str());
   ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr<CudfAdapter>));
diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc
index f1d588196..e4d5f2672 100644
--- a/tests/cpp/data/test_simple_dmatrix.cc
+++ b/tests/cpp/data/test_simple_dmatrix.cc
@@ -268,7 +268,7 @@ TEST(SimpleDMatrix, Slice) {
   std::iota(upper.begin(), upper.end(), 1.0f);
 
   auto& margin = p_m->Info().base_margin_;
-  margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, Context::kCpuId};
+  margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, DeviceOrd::CPU()};
 
   std::array<int32_t, 3> ridxs {1, 3, 5};
   std::unique_ptr<DMatrix> out { p_m->Slice(ridxs) };
@@ -341,7 +341,7 @@ TEST(SimpleDMatrix, SliceCol) {
   std::iota(upper.begin(), upper.end(), 1.0f);
 
   auto& margin = p_m->Info().base_margin_;
-  margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, Context::kCpuId};
+  margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, DeviceOrd::CPU()};
 
   auto constexpr kSlices {2};
   auto constexpr kSliceSize {4};
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu
index 17ed64c90..e82ca64cc 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cu
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cu
@@ -134,11 +134,11 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
   size_t offset = 0;
   for (auto& batch : dmat_ext->GetBatches<EllpackPage>(&ctx, param)) {
     if (!impl_ext) {
-      impl_ext.reset(new EllpackPageImpl(
-          batch.Impl()->gidx_buffer.DeviceIdx(), batch.Impl()->Cuts(),
-          batch.Impl()->is_dense, batch.Impl()->row_stride, kRows));
+      impl_ext = std::make_unique<EllpackPageImpl>(batch.Impl()->gidx_buffer.Device(),
+                                                   batch.Impl()->Cuts(), batch.Impl()->is_dense,
+                                                   batch.Impl()->row_stride, kRows);
     }
-    auto n_elems = impl_ext->Copy(0, batch.Impl(), offset);
+    auto n_elems = impl_ext->Copy(ctx.Device(), batch.Impl(), offset);
     offset += n_elems;
   }
   EXPECT_EQ(impl_ext->base_rowid, 0);
@@ -198,10 +198,12 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
     EXPECT_EQ(impl_ext->base_rowid, current_row);
 
     for (size_t i = 0; i < impl_ext->Size(); i++) {
-      dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
+      dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row,
+                                         row_d.data().get()));
       thrust::copy(row_d.begin(), row_d.end(), row.begin());
 
-      dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(0), current_row, row_ext_d.data().get()));
+      dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(ctx.Device()), current_row,
+                                         row_ext_d.data().get()));
       thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin());
 
       EXPECT_EQ(row, row_ext);
diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc
index d7b7e588d..dac1f1cf7 100644
--- a/tests/cpp/gbm/test_gbtree.cc
+++ b/tests/cpp/gbm/test_gbtree.cc
@@ -65,7 +65,7 @@ TEST(GBTree, PredictionCache) {
 
   gbtree.Configure({{"tree_method", "hist"}});
   auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
-  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
   gpair.Data()->Copy(GenerateRandomGradients(kRows));
 
   PredictionCacheEntry out_predictions;
@@ -156,7 +156,7 @@ TEST(GBTree, ChoosePredictor) {
 
   // pull data into device.
   data.HostVector();
-  data.SetDevice(0);
+  data.SetDevice(DeviceOrd::CUDA(0));
   data.DeviceSpan();
   ASSERT_FALSE(data.HostCanWrite());
 
@@ -215,7 +215,7 @@ TEST(GBTree, ChooseTreeMethod) {
     }
     learner->Configure();
     for (std::int32_t i = 0; i < 3; ++i) {
-      linalg::Matrix<GradientPair> gpair{{Xy->Info().num_row_}, Context::kCpuId};
+      linalg::Matrix<GradientPair> gpair{{Xy->Info().num_row_}, DeviceOrd::CPU()};
       gpair.Data()->Copy(GenerateRandomGradients(Xy->Info().num_row_));
       learner->BoostOneIter(0, Xy, &gpair);
     }
@@ -400,7 +400,7 @@ class Dart : public testing::TestWithParam<char const*> {
     if (device == "GPU") {
       ctx = MakeCUDACtx(0);
     }
-    auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.gpu_id);
+    auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.Device());
     auto array_str = rng.GenerateArrayInterface(&data);
     auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols);
 
@@ -710,7 +710,7 @@ TEST(GBTree, InplacePredictionError) {
   auto test_qdm_err = [&](std::string booster, Context const* ctx) {
     std::shared_ptr<DMatrix> p_fmat;
     bst_bin_t max_bins = 16;
-    auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->gpu_id).Bins(max_bins);
+    auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->Device()).Bins(max_bins);
     if (ctx->IsCPU()) {
       p_fmat = rng.GenerateQuantileDMatrix(true);
     } else {
diff --git a/tests/cpp/gbm/test_gbtree.cu b/tests/cpp/gbm/test_gbtree.cu
index 801c935d6..f308e3b3e 100644
--- a/tests/cpp/gbm/test_gbtree.cu
+++ b/tests/cpp/gbm/test_gbtree.cu
@@ -22,7 +22,7 @@ void TestInplaceFallback(Context const* ctx) {
   bst_feature_t n_features{32};
   HostDeviceVector<float> X_storage;
   // use a different device than the learner
-  std::int32_t data_ordinal = ctx->IsCPU() ? 0 : -1;
+  auto data_ordinal = ctx->IsCPU() ? DeviceOrd::CUDA(0) : DeviceOrd::CPU();
   auto X = RandomDataGenerator{n_samples, n_features, 0.0}
                .Device(data_ordinal)
                .GenerateArrayInterface(&X_storage);
@@ -30,7 +30,7 @@ void TestInplaceFallback(Context const* ctx) {
   auto y = RandomDataGenerator{n_samples, 1u, 0.0}.GenerateArrayInterface(&y_storage);
 
   std::shared_ptr<DMatrix> Xy;
-  if (data_ordinal == Context::kCpuId) {
+  if (data_ordinal.IsCPU()) {
     auto X_adapter = data::ArrayAdapter{StringView{X}};
     Xy.reset(DMatrix::Create(&X_adapter, std::numeric_limits<float>::quiet_NaN(), ctx->Threads()));
   } else {
@@ -49,7 +49,7 @@ void TestInplaceFallback(Context const* ctx) {
 
   std::shared_ptr<DMatrix> p_m{new data::DMatrixProxy};
   auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
-  if (data_ordinal == Context::kCpuId) {
+  if (data_ordinal.IsCPU()) {
     proxy->SetArrayData(StringView{X});
   } else {
     proxy->SetCUDAArray(X.c_str());
@@ -64,7 +64,7 @@ void TestInplaceFallback(Context const* ctx) {
 
   // test when the contexts match
   Context new_ctx = *proxy->Ctx();
-  ASSERT_NE(new_ctx.gpu_id, ctx->gpu_id);
+  ASSERT_NE(new_ctx.Ordinal(), ctx->Ordinal());
 
   learner->SetParam("device", new_ctx.DeviceName());
   HostDeviceVector<float>* out_predt_1{nullptr};
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index a9ff347ea..604c4d30a 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -119,8 +119,10 @@ void CheckObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
                       std::vector<xgboost::bst_float> out_hess) {
   xgboost::MetaInfo info;
   info.num_row_ = labels.size();
-  info.labels = xgboost::linalg::Tensor<float, 2>{
-      labels.cbegin(), labels.cend(), {labels.size(), static_cast<std::size_t>(1)}, -1};
+  info.labels = xgboost::linalg::Tensor<float, 2>{labels.cbegin(),
+                                                  labels.cend(),
+                                                  {labels.size(), static_cast<std::size_t>(1)},
+                                                  xgboost::DeviceOrd::CPU()};
   info.weights_.HostVector() = weights;
 
   CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess);
@@ -155,8 +157,10 @@ void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
                              std::vector<xgboost::bst_float> out_hess) {
   xgboost::MetaInfo info;
   info.num_row_ = labels.size();
-  info.labels = xgboost::linalg::Matrix<float>{
-      labels.cbegin(), labels.cend(), {labels.size(), static_cast<std::size_t>(1)}, -1};
+  info.labels = xgboost::linalg::Matrix<float>{labels.cbegin(),
+                                               labels.cend(),
+                                               {labels.size(), static_cast<std::size_t>(1)},
+                                               xgboost::DeviceOrd::CPU()};
   info.weights_.HostVector() = weights;
   info.group_ptr_ = groups;
 
@@ -171,8 +175,9 @@ xgboost::bst_float GetMetricEval(xgboost::Metric* metric,
                                  xgboost::DataSplitMode data_split_mode) {
   return GetMultiMetricEval(
       metric, preds,
-      xgboost::linalg::Tensor<float, 2>{labels.begin(), labels.end(), {labels.size()}, -1}, weights,
-      groups, data_split_mode);
+      xgboost::linalg::Tensor<float, 2>{
+          labels.begin(), labels.end(), {labels.size()}, xgboost::DeviceOrd::CPU()},
+      weights, groups, data_split_mode);
 }
 
 double GetMultiMetricEval(xgboost::Metric* metric,
@@ -215,7 +220,7 @@ void RandomDataGenerator::GenerateLabels(std::shared_ptr<DMatrix> p_fmat) const
       p_fmat->Info().labels.Data());
   CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_);
   p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_);
-  if (device_ != Context::kCpuId) {
+  if (device_.IsCUDA()) {
     p_fmat->Info().labels.SetDevice(device_);
   }
 }
@@ -236,7 +241,7 @@ void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
       v = dist(&lcg);
     }
   }
-  if (device_ >= 0) {
+  if (device_.IsCUDA()) {
     out->SetDevice(device_);
     out->DeviceSpan();
   }
@@ -258,7 +263,7 @@ std::string RandomDataGenerator::GenerateArrayInterface(
 
 std::pair<std::vector<std::string>, std::string> MakeArrayInterfaceBatch(
     HostDeviceVector<float> const* storage, std::size_t n_samples, bst_feature_t n_features,
-    std::size_t batches, std::int32_t device) {
+    std::size_t batches, DeviceOrd device) {
   std::vector<std::string> result(batches);
   std::vector<Json> objects;
 
@@ -267,7 +272,7 @@ std::pair<std::vector<std::string>, std::string> MakeArrayInterfaceBatch(
   auto make_interface = [storage, device, n_features](std::size_t offset, std::size_t rows) {
     Json array_interface{Object()};
     array_interface["data"] = std::vector<Json>(2);
-    if (device >= 0) {
+    if (device.IsCUDA()) {
       array_interface["data"][0] =
           Integer(reinterpret_cast<int64_t>(storage->DevicePointer() + offset));
       array_interface["stream"] = Null{};
@@ -359,7 +364,7 @@ void RandomDataGenerator::GenerateCSR(
     h_rptr.emplace_back(rptr);
   }
 
-  if (device_ >= 0) {
+  if (device_.IsCUDA()) {
     value->SetDevice(device_);
     value->DeviceSpan();
     row_ptr->SetDevice(device_);
@@ -400,7 +405,7 @@ void RandomDataGenerator::GenerateCSR(
       out->Info().labels.Reshape(this->rows_, this->n_targets_);
     }
   }
-  if (device_ >= 0) {
+  if (device_.IsCUDA()) {
     out->Info().labels.SetDevice(device_);
     out->Info().feature_types.SetDevice(device_);
     for (auto const& page : out->GetBatches<SparsePage>()) {
@@ -423,7 +428,7 @@ void RandomDataGenerator::GenerateCSR(
   CHECK_GE(this->n_batches_, 1)
       << "Must set the n_batches before generating an external memory DMatrix.";
   std::unique_ptr<ArrayIterForTest> iter;
-  if (device_ == Context::kCpuId) {
+  if (device_.IsCPU()) {
     iter = std::make_unique<NumpyArrayIterForTest>(this->sparsity_, rows_, cols_, n_batches_);
   } else {
 #if defined(XGBOOST_USE_CUDA)
@@ -487,7 +492,7 @@ int CudaArrayIterForTest::Next() {
 NumpyArrayIterForTest::NumpyArrayIterForTest(float sparsity, size_t rows, size_t cols,
                                              size_t batches)
     : ArrayIterForTest{sparsity, rows, cols, batches} {
-  rng_->Device(Context::kCpuId);
+  rng_->Device(DeviceOrd::CPU());
   std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
   this->Reset();
 }
@@ -644,8 +649,8 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(std::string name, Args kwargs,
     labels[i] = i;
   }
   p_dmat->Info().labels =
-      linalg::Tensor<float, 2>{labels.cbegin(), labels.cend(), {labels.size()}, -1};
-  linalg::Matrix<GradientPair> gpair({kRows}, ctx->Ordinal());
+      linalg::Tensor<float, 2>{labels.cbegin(), labels.cend(), {labels.size()}, DeviceOrd::CPU()};
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx->Device());
   auto h_gpair = gpair.HostView();
   for (size_t i = 0; i < kRows; ++i) {
     h_gpair(i) = GradientPair{static_cast<float>(i), 1};
@@ -674,7 +679,7 @@ ArrayIterForTest::ArrayIterForTest(Context const* ctx, HostDeviceVector<float> c
   CHECK_EQ(this->data_.Size(), rows_ * cols_ * n_batches);
   this->data_.Copy(data);
   std::tie(batches_, interface_) =
-      MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, ctx->gpu_id);
+      MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, ctx->Device());
 }
 
 ArrayIterForTest::~ArrayIterForTest() { XGDMatrixFree(proxy_); }
diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu
index 10b800fc1..db94da27a 100644
--- a/tests/cpp/helpers.cu
+++ b/tests/cpp/helpers.cu
@@ -9,7 +9,7 @@ namespace xgboost {
 CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows,
                                            size_t cols, size_t batches)
     : ArrayIterForTest{sparsity, rows, cols, batches} {
-  rng_->Device(0);
+  rng_->Device(FstCU());
   std::tie(batches_, interface_) =
       rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
   this->Reset();
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index bad15c695..a26669b7d 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -231,7 +231,7 @@ class RandomDataGenerator {
 
   bst_target_t n_targets_{1};
 
-  std::int32_t device_{Context::kCpuId};
+  DeviceOrd device_{DeviceOrd::CPU()};
   std::size_t n_batches_{0};
   std::uint64_t seed_{0};
   SimpleLCG lcg_;
@@ -256,7 +256,7 @@ class RandomDataGenerator {
     upper_ = v;
     return *this;
   }
-  RandomDataGenerator& Device(int32_t d) {
+  RandomDataGenerator& Device(DeviceOrd d) {
     device_ = d;
     return *this;
   }
@@ -391,7 +391,7 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(std::string name, Args kwargs,
  * \brief Make a context that uses CUDA if device >= 0.
  */
 inline Context MakeCUDACtx(std::int32_t device) {
-  if (device == Context::kCpuId) {
+  if (device == DeviceOrd::CPUOrdinal()) {
     return Context{};
   }
   return Context{}.MakeCUDA(device);
@@ -501,7 +501,7 @@ RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv);
  * \brief Make learner model param
  */
 inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint32_t n_groups,
-                                int32_t device = Context::kCpuId) {
+                                DeviceOrd device = DeviceOrd::CPU()) {
   size_t shape[1]{1};
   LearnerModelParam mparam(n_features, linalg::Tensor<float, 1>{{base_score}, shape, device},
                            n_groups, 1, MultiStrategy::kOneOutputPerTree);
@@ -571,4 +571,5 @@ class BaseMGPUTest : public ::testing::Test {
 
 class DeclareUnifiedDistributedTest(MetricTest) : public BaseMGPUTest{};
 
+inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); }
 }  // namespace xgboost
diff --git a/tests/cpp/histogram_helpers.h b/tests/cpp/histogram_helpers.h
index 6774f531c..496aa30f3 100644
--- a/tests/cpp/histogram_helpers.h
+++ b/tests/cpp/histogram_helpers.h
@@ -1,3 +1,8 @@
+/**
+ * Copyright 2020-2023, XGBoost contributors
+ */
+#pragma once
+
 #if defined(__CUDACC__)
 #include "../../src/data/ellpack_page.cuh"
 #endif
@@ -24,8 +29,8 @@ class HistogramCutsWrapper : public common::HistogramCuts {
 };
 }  //  anonymous namespace
 
-inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(
-    int n_rows, int n_cols, bst_float sparsity= 0) {
+inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(int n_rows, int n_cols,
+                                                         bst_float sparsity = 0) {
   auto dmat = RandomDataGenerator(n_rows, n_cols, sparsity).Seed(3).GenerateDMatrix();
   const SparsePage& batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
 
@@ -49,7 +54,7 @@ inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(
   }
 
   auto page = std::unique_ptr<EllpackPageImpl>(
-      new EllpackPageImpl(0, cmat, batch, dmat->IsDense(), row_stride, {}));
+      new EllpackPageImpl(DeviceOrd::CUDA(0), cmat, batch, dmat->IsDense(), row_stride, {}));
 
   return page;
 }
diff --git a/tests/cpp/metric/test_auc.h b/tests/cpp/metric/test_auc.h
index 0dd3dd83e..cef6d9757 100644
--- a/tests/cpp/metric/test_auc.h
+++ b/tests/cpp/metric/test_auc.h
@@ -28,7 +28,7 @@ inline void VerifyBinaryAUC(DataSplitMode data_split_mode = DataSplitMode::kRow)
   // Invalid dataset
   auto p_fmat = EmptyDMatrix();
   MetaInfo& info = p_fmat->Info();
-  info.labels = linalg::Tensor<float, 2>{{0.0f, 0.0f}, {2}, -1};
+  info.labels = linalg::Tensor<float, 2>{{0.0f, 0.0f}, {2}, DeviceOrd::CPU()};
   float auc = metric->Evaluate({1, 1}, p_fmat);
   ASSERT_TRUE(std::isnan(auc));
   *info.labels.Data() = HostDeviceVector<float>{};
diff --git a/tests/cpp/metric/test_elementwise_metric.cc b/tests/cpp/metric/test_elementwise_metric.cc
index 13021fb6a..11854ce88 100644
--- a/tests/cpp/metric/test_elementwise_metric.cc
+++ b/tests/cpp/metric/test_elementwise_metric.cc
@@ -3,8 +3,7 @@
  */
 #include "test_elementwise_metric.h"
 
-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {
 TEST(Metric, DeclareUnifiedTest(RMSE)) { VerifyRMSE(); }
 
 TEST(Metric, DeclareUnifiedTest(RMSLE)) { VerifyRMSLE(); }
@@ -104,5 +103,4 @@ TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileRowSplit) {
 TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileColumnSplit) {
   DoTest(VerifyQuantile, DataSplitMode::kCol);
 }
-}  // namespace metric
-}  // namespace xgboost
+}  // namespace xgboost::metric
diff --git a/tests/cpp/metric/test_elementwise_metric.h b/tests/cpp/metric/test_elementwise_metric.h
index a32bb0438..ef34d7651 100644
--- a/tests/cpp/metric/test_elementwise_metric.h
+++ b/tests/cpp/metric/test_elementwise_metric.h
@@ -11,9 +11,7 @@
 #include "../../../src/common/linalg_op.h"
 #include "../helpers.h"
 
-namespace xgboost {
-namespace metric {
-
+namespace xgboost::metric {
 inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) {
   auto ctx = MakeCUDACtx(device);
   std::unique_ptr<Metric> metric{Metric::Create(name.c_str(), &ctx)};
@@ -325,14 +323,14 @@ inline void VerifyPoissonNegLogLik(DataSplitMode data_split_mode = DataSplitMode
 }
 
 inline void VerifyMultiRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = MakeCUDACtx(GPUIDX);
   size_t n_samples = 32, n_targets = 8;
-  linalg::Tensor<float, 2> y{{n_samples, n_targets}, GPUIDX};
+  linalg::Tensor<float, 2> y{{n_samples, n_targets}, ctx.Device()};
   auto &h_y = y.Data()->HostVector();
   std::iota(h_y.begin(), h_y.end(), 0);
 
   HostDeviceVector<float> predt(n_samples * n_targets, 0);
 
-  auto ctx = MakeCUDACtx(GPUIDX);
   std::unique_ptr<Metric> metric{Metric::Create("rmse", &ctx)};
   metric->Configure({});
 
@@ -381,5 +379,4 @@ inline void VerifyQuantile(DataSplitMode data_split_mode = DataSplitMode::kRow)
   metric->Configure(Args{{"quantile_alpha", "[1.0]"}});
   EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, {}, {}, data_split_mode), 0.3f, 0.001f);
 }
-}  // namespace metric
-}  // namespace xgboost
+}  // namespace xgboost::metric
diff --git a/tests/cpp/metric/test_rank_metric.h b/tests/cpp/metric/test_rank_metric.h
index 2f7785689..5d5e87072 100644
--- a/tests/cpp/metric/test_rank_metric.h
+++ b/tests/cpp/metric/test_rank_metric.h
@@ -154,7 +154,7 @@ inline void VerifyNDCGExpGain(DataSplitMode data_split_mode = DataSplitMode::kRo
 
   auto p_fmat = xgboost::RandomDataGenerator{0, 0, 0}.GenerateDMatrix();
   MetaInfo& info = p_fmat->Info();
-  info.labels = linalg::Matrix<float>{{10.0f, 0.0f, 0.0f, 1.0f, 5.0f}, {5}, ctx.gpu_id};
+  info.labels = linalg::Matrix<float>{{10.0f, 0.0f, 0.0f, 1.0f, 5.0f}, {5}, ctx.Device()};
   info.num_row_ = info.labels.Shape(0);
   info.group_ptr_.resize(2);
   info.group_ptr_[0] = 0;
diff --git a/tests/cpp/objective/test_lambdarank_obj.cc b/tests/cpp/objective/test_lambdarank_obj.cc
index 963f69639..2b34cfa38 100644
--- a/tests/cpp/objective/test_lambdarank_obj.cc
+++ b/tests/cpp/objective/test_lambdarank_obj.cc
@@ -71,7 +71,7 @@ void TestNDCGGPair(Context const* ctx) {
 
   HostDeviceVector<float> predts{0, 1, 0, 1};
   MetaInfo info;
-  info.labels = linalg::Tensor<float, 2>{{0, 1, 0, 1}, {4, 1}, GPUIDX};
+  info.labels = linalg::Tensor<float, 2>{{0, 1, 0, 1}, {4, 1}, ctx->Device()};
   info.group_ptr_ = {0, 2, 4};
   info.num_row_ = 4;
   linalg::Matrix<GradientPair> gpairs;
@@ -146,7 +146,7 @@ TEST(LambdaRank, UnbiasedNDCG) {
 }
 
 void InitMakePairTest(Context const* ctx, MetaInfo* out_info, HostDeviceVector<float>* out_predt) {
-  out_predt->SetDevice(ctx->gpu_id);
+  out_predt->SetDevice(ctx->Device());
   MetaInfo& info = *out_info;
   info.num_row_ = 128;
   info.labels.ModifyInplace([&](HostDeviceVector<float>* data, common::Span<std::size_t> shape) {
@@ -243,7 +243,7 @@ void TestMAPStat(Context const* ctx) {
 
     auto p_cache = std::make_shared<ltr::MAPCache>(ctx, info, param);
 
-    predt.SetDevice(ctx->gpu_id);
+    predt.SetDevice(ctx->Device());
     auto rank_idx =
         p_cache->SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
 
@@ -280,7 +280,7 @@ void TestMAPStat(Context const* ctx) {
 
     auto p_cache = std::make_shared<ltr::MAPCache>(ctx, info, param);
 
-    predt.SetDevice(ctx->gpu_id);
+    predt.SetDevice(ctx->Device());
     auto rank_idx =
         p_cache->SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
 
diff --git a/tests/cpp/objective/test_quantile_obj.cc b/tests/cpp/objective/test_quantile_obj.cc
index b263b4a8f..5b0a981e1 100644
--- a/tests/cpp/objective/test_quantile_obj.cc
+++ b/tests/cpp/objective/test_quantile_obj.cc
@@ -45,7 +45,7 @@ TEST(Objective, DeclareUnifiedTest(QuantileIntercept)) {
   MetaInfo info;
   info.num_row_ = 10;
   info.labels.ModifyInplace([&](HostDeviceVector<float>* data, common::Span<std::size_t> shape) {
-    data->SetDevice(ctx.gpu_id);
+    data->SetDevice(ctx.Device());
     data->Resize(info.num_row_);
     shape[0] = info.num_row_;
     shape[1] = 1;
diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc
index c99360d8a..07f33d72e 100644
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -142,7 +142,7 @@ TEST(CpuPredictor, InplacePredict) {
   bst_row_t constexpr kRows{128};
   bst_feature_t constexpr kCols{64};
   Context ctx;
-  auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(ctx.gpu_id);
+  auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(ctx.Device());
   {
     HostDeviceVector<float> data;
     gen.GenerateDense(&data);
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index beeab70fa..883e6e01c 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -34,7 +34,7 @@ TEST(GPUPredictor, Basic) {
     auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
 
     auto ctx = MakeCUDACtx(0);
-    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Device())};
     gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
 
     // Test predict batch
@@ -70,7 +70,7 @@ void VerifyBasicColumnSplit(std::array<std::vector<float>, 32> const& expected_r
     auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
     std::unique_ptr<DMatrix> sliced{dmat->SliceCol(world_size, rank)};
 
-    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Device())};
     gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
 
     // Test predict batch
@@ -98,7 +98,7 @@ TEST_F(MGPUPredictorTest, BasicColumnSplit) {
     size_t n_row = i, n_col = i;
     auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
 
-    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Device())};
     gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
 
     // Test predict batch
@@ -119,8 +119,10 @@ TEST(GPUPredictor, EllpackBasic) {
   auto ctx = MakeCUDACtx(0);
   for (size_t bins = 2; bins < 258; bins += 16) {
     size_t rows = bins * 16;
-    auto p_m =
-        RandomDataGenerator{rows, kCols, 0.0}.Bins(bins).Device(0).GenerateDeviceDMatrix(false);
+    auto p_m = RandomDataGenerator{rows, kCols, 0.0}
+                   .Bins(bins)
+                   .Device(DeviceOrd::CUDA(0))
+                   .GenerateDeviceDMatrix(false);
     ASSERT_FALSE(p_m->PageExists<SparsePage>());
     TestPredictionFromGradientIndex<EllpackPage>(&ctx, rows, kCols, p_m);
     TestPredictionFromGradientIndex<EllpackPage>(&ctx, bins, kCols, p_m);
@@ -132,11 +134,11 @@ TEST(GPUPredictor, EllpackTraining) {
   size_t constexpr kRows{128}, kCols{16}, kBins{64};
   auto p_ellpack = RandomDataGenerator{kRows, kCols, 0.0}
                        .Bins(kBins)
-                       .Device(ctx.Ordinal())
+                       .Device(ctx.Device())
                        .GenerateDeviceDMatrix(false);
   HostDeviceVector<float> storage(kRows * kCols);
   auto columnar =
-      RandomDataGenerator{kRows, kCols, 0.0}.Device(ctx.Ordinal()).GenerateArrayInterface(&storage);
+      RandomDataGenerator{kRows, kCols, 0.0}.Device(ctx.Device()).GenerateArrayInterface(&storage);
   auto adapter = data::CupyAdapter(columnar);
   std::shared_ptr<DMatrix> p_full{
       DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)};
@@ -151,7 +153,7 @@ TEST(GPUPredictor, ExternalMemoryTest) {
 
   const int n_classes = 3;
   Context ctx = MakeCUDACtx(0);
-  LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.Ordinal())};
+  LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.Device())};
 
   gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx, n_classes);
   std::vector<std::unique_ptr<DMatrix>> dmats;
@@ -162,7 +164,7 @@ TEST(GPUPredictor, ExternalMemoryTest) {
 
   for (const auto& dmat: dmats) {
     dmat->Info().base_margin_ = decltype(dmat->Info().base_margin_){
-        {dmat->Info().num_row_, static_cast<size_t>(n_classes)}, 0};
+        {dmat->Info().num_row_, static_cast<size_t>(n_classes)}, DeviceOrd::CUDA(0)};
     dmat->Info().base_margin_.Data()->Fill(0.5);
     PredictionCacheEntry out_predictions;
     gpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
@@ -181,7 +183,7 @@ TEST(GPUPredictor, InplacePredictCupy) {
   auto ctx = MakeCUDACtx(0);
   size_t constexpr kRows{128}, kCols{64};
   RandomDataGenerator gen(kRows, kCols, 0.5);
-  gen.Device(ctx.Ordinal());
+  gen.Device(ctx.Device());
   HostDeviceVector<float> data;
   std::string interface_str = gen.GenerateArrayInterface(&data);
   std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
@@ -193,7 +195,7 @@ TEST(GPUPredictor, InplacePredictCuDF) {
   auto ctx = MakeCUDACtx(0);
   size_t constexpr kRows{128}, kCols{64};
   RandomDataGenerator gen(kRows, kCols, 0.5);
-  gen.Device(ctx.Ordinal());
+  gen.Device(ctx.Device());
   std::vector<HostDeviceVector<float>> storage(kCols);
   auto interface_str = gen.GenerateColumnarArrayInterface(&storage);
   std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
@@ -215,7 +217,7 @@ TEST(GPUPredictor, ShapStump) {
   cudaSetDevice(0);
 
   auto ctx = MakeCUDACtx(0);
-  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())};
+  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Device())};
   gbm::GBTreeModel model(&mparam, &ctx);
 
   std::vector<std::unique_ptr<RegTree>> trees;
@@ -241,7 +243,7 @@ TEST(GPUPredictor, ShapStump) {
 
 TEST(GPUPredictor, Shap) {
   auto ctx = MakeCUDACtx(0);
-  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())};
+  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Device())};
   gbm::GBTreeModel model(&mparam, &ctx);
 
   std::vector<std::unique_ptr<RegTree>> trees;
@@ -296,7 +298,7 @@ TEST_F(MGPUPredictorTest, CategoricalPredictionLeafColumnSplit) {
 
 TEST(GPUPredictor, PredictLeafBasic) {
   size_t constexpr kRows = 5, kCols = 5;
-  auto dmat = RandomDataGenerator(kRows, kCols, 0).Device(0).GenerateDMatrix();
+  auto dmat = RandomDataGenerator(kRows, kCols, 0).Device(DeviceOrd::CUDA(0)).GenerateDMatrix();
   auto lparam = MakeCUDACtx(GPUIDX);
   std::unique_ptr<Predictor> gpu_predictor =
       std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &lparam));
diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc
index 873c1672e..21aa483e4 100644
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -34,7 +34,7 @@ TEST(Predictor, PredictionCache) {
   // Add a cache that is immediately expired.
   auto add_cache = [&]() {
     auto p_dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
-    container.Cache(p_dmat, Context::kCpuId);
+    container.Cache(p_dmat, DeviceOrd::CPU());
     m = p_dmat.get();
   };
 
@@ -93,7 +93,7 @@ void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins,
 void TestInplacePrediction(Context const *ctx, std::shared_ptr<DMatrix> x, bst_row_t rows,
                            bst_feature_t cols) {
   std::size_t constexpr kClasses { 4 };
-  auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(ctx->gpu_id);
+  auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(ctx->Device());
   std::shared_ptr<DMatrix> m = gen.GenerateDMatrix(true, false, kClasses);
 
   std::unique_ptr<Learner> learner {
@@ -192,7 +192,7 @@ void TestPredictionDeviceAccess() {
 
   HostDeviceVector<float> from_cpu;
   {
-    ASSERT_EQ(from_cpu.DeviceIdx(), Context::kCpuId);
+    ASSERT_TRUE(from_cpu.Device().IsCPU());
     Context cpu_ctx;
     learner->SetParam("device", cpu_ctx.DeviceName());
     learner->Predict(m_test, false, &from_cpu, 0, 0);
@@ -206,7 +206,7 @@ void TestPredictionDeviceAccess() {
     Context cuda_ctx = MakeCUDACtx(0);
     learner->SetParam("device", cuda_ctx.DeviceName());
     learner->Predict(m_test, false, &from_cuda, 0, 0);
-    ASSERT_EQ(from_cuda.DeviceIdx(), 0);
+    ASSERT_EQ(from_cuda.Device(), DeviceOrd::CUDA(0));
     ASSERT_TRUE(from_cuda.DeviceCanWrite());
     ASSERT_FALSE(from_cuda.HostCanRead());
   }
@@ -351,7 +351,7 @@ void TestCategoricalPredictLeaf(bool use_gpu, bool is_column_split) {
 void TestIterationRange(Context const* ctx) {
   size_t constexpr kRows = 1000, kCols = 20, kClasses = 4, kForest = 3, kIters = 10;
   auto dmat = RandomDataGenerator(kRows, kCols, 0)
-                  .Device(ctx->gpu_id)
+                  .Device(ctx->Device())
                   .GenerateDMatrix(true, true, kClasses);
   auto learner = LearnerForTest(ctx, dmat, kIters, kForest);
 
@@ -522,7 +522,7 @@ void TestSparsePrediction(Context const *ctx, float sparsity) {
 
   if (ctx->IsCUDA()) {
     learner->SetParam("tree_method", "gpu_hist");
-    learner->SetParam("gpu_id", std::to_string(ctx->gpu_id));
+    learner->SetParam("device", ctx->Device().Name());
   }
   learner->Predict(Xy, false, &sparse_predt, 0, 0);
 
@@ -620,7 +620,7 @@ void TestVectorLeafPrediction(Context const *ctx) {
   size_t constexpr kCols = 5;
 
   LearnerModelParam mparam{static_cast<bst_feature_t>(kCols),
-                           linalg::Vector<float>{{0.5}, {1}, Context::kCpuId}, 1, 3,
+                           linalg::Vector<float>{{0.5}, {1}, DeviceOrd::CPU()}, 1, 3,
                            MultiStrategy::kMultiOutputTree};
 
   std::vector<std::unique_ptr<RegTree>> trees;
diff --git a/tests/cpp/test_context.cc b/tests/cpp/test_context.cc
index d49f7b4b2..2fdf04aa1 100644
--- a/tests/cpp/test_context.cc
+++ b/tests/cpp/test_context.cc
@@ -5,11 +5,13 @@
 #include <xgboost/base.h>
 #include <xgboost/context.h>
 
+#include <sstream>
+
 namespace xgboost {
 TEST(Context, CPU) {
   Context ctx;
   ASSERT_EQ(ctx.Device(), DeviceOrd::CPU());
-  ASSERT_EQ(ctx.Ordinal(), Context::kCpuId);
+  ASSERT_EQ(ctx.Ordinal(), DeviceOrd::CPUOrdinal());
 
   std::int32_t flag{0};
   ctx.DispatchDevice([&] { flag = -1; }, [&] { flag = 1; });
@@ -27,5 +29,20 @@ TEST(Context, CPU) {
   ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":gpu"}}), dmlc::Error);
   ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":0"}}), dmlc::Error);
   ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ""}}), dmlc::Error);
+
+  std::stringstream ss;
+  ss << ctx.Device();
+  ASSERT_EQ(ss.str(), "cpu");
+}
+
+TEST(Context, ErrorInit) {
+  Context ctx;
+  ASSERT_THROW({ ctx.Init({{"foo", "bar"}}); }, dmlc::Error);
+  try {
+    ctx.Init({{"foo", "bar"}});
+  } catch (dmlc::Error const& e) {
+    auto msg = std::string{e.what()};
+    ASSERT_NE(msg.find("foo"), std::string::npos);
+  }
 }
 }  // namespace xgboost
diff --git a/tests/cpp/test_context.cu b/tests/cpp/test_context.cu
index 035d22125..7684ff467 100644
--- a/tests/cpp/test_context.cu
+++ b/tests/cpp/test_context.cu
@@ -13,7 +13,6 @@
 namespace xgboost {
 namespace {
 void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) {
-  ASSERT_EQ(ctx.gpu_id, ord);
   ASSERT_EQ(ctx.Device().ordinal, ord);
   ASSERT_EQ(ctx.DeviceName(), "cuda:" + std::to_string(ord));
   ASSERT_EQ(ctx.Ordinal(), ord);
@@ -25,7 +24,7 @@ void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) {
   Context new_ctx;
   FromJson(jctx, &new_ctx);
   ASSERT_EQ(new_ctx.Device(), ctx.Device());
-  ASSERT_EQ(new_ctx.gpu_id, ctx.gpu_id);
+  ASSERT_EQ(new_ctx.Ordinal(), ctx.Ordinal());
 }
 }  // namespace
 
@@ -53,7 +52,7 @@ TEST(Context, DeviceOrdinal) {
 
   auto cpu_ctx = ctx.MakeCPU();
   ASSERT_TRUE(cpu_ctx.IsCPU());
-  ASSERT_EQ(cpu_ctx.Ordinal(), Context::kCpuId);
+  ASSERT_EQ(cpu_ctx.Ordinal(), DeviceOrd::CPUOrdinal());
   ASSERT_EQ(cpu_ctx.Device(), DeviceOrd::CPU());
 
   auto cuda_ctx = cpu_ctx.MakeCUDA(ctx.Ordinal());
diff --git a/tests/cpp/test_serialization.cc b/tests/cpp/test_serialization.cc
index 350744c58..0b65220ab 100644
--- a/tests/cpp/test_serialization.cc
+++ b/tests/cpp/test_serialization.cc
@@ -210,9 +210,9 @@ void TestLearnerSerialization(Args args, FeatureMap const& fmap, std::shared_ptr
     }
     // Pull data to device
     for (auto &batch : p_dmat->GetBatches<SparsePage>()) {
-      batch.data.SetDevice(0);
+      batch.data.SetDevice(DeviceOrd::CUDA(0));
       batch.data.DeviceSpan();
-      batch.offset.SetDevice(0);
+      batch.offset.SetDevice(DeviceOrd::CUDA(0));
       batch.offset.DeviceSpan();
     }
 
diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
index f4ed34bf0..7d5f15a1c 100644
--- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
+++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020-2022 by XGBoost contributors
+/**
+ * Copyright 2020-2023, XGBoost contributors
  */
 #include <gtest/gtest.h>
 #include <thrust/host_vector.h>
@@ -9,9 +9,7 @@
 #include "../../histogram_helpers.h"
 #include "../test_evaluate_splits.h"  // TestPartitionBasedSplit
 
-namespace xgboost {
-namespace tree {
-
+namespace xgboost::tree {
 namespace {
 auto ZeroParam() {
   auto args = Args{{"min_child_weight", "0"}, {"lambda", "0"}};
@@ -37,11 +35,12 @@ thrust::device_vector<GradientPairInt64> ConvertToInteger(std::vector<GradientPa
 }
 
 TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {
+  auto ctx = MakeCUDACtx(0);
   thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0};
   GPUTrainingParam param{param_};
-  cuts_.cut_ptrs_.SetDevice(0);
-  cuts_.cut_values_.SetDevice(0);
-  cuts_.min_vals_.SetDevice(0);
+  cuts_.cut_ptrs_.SetDevice(ctx.Device());
+  cuts_.cut_values_.SetDevice(ctx.Device());
+  cuts_.min_vals_.SetDevice(ctx.Device());
   thrust::device_vector<GradientPairInt64> feature_histogram{ConvertToInteger(feature_histogram_)};
 
   dh::device_vector<FeatureType> feature_types(feature_set.size(), FeatureType::kCategorical);
@@ -57,9 +56,10 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {
                                           cuts_.min_vals_.ConstDeviceSpan(),
                                           false};
 
-  GPUHistEvaluator evaluator{param_, static_cast<bst_feature_t>(feature_set.size()), 0};
+  GPUHistEvaluator evaluator{param_, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
 
-  evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false, 0);
+  evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false,
+                  ctx.Device());
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
 
   ASSERT_EQ(result.thresh, 1);
@@ -69,6 +69,7 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {
 }
 
 TEST(GpuHist, PartitionBasic) {
+  auto ctx = MakeCUDACtx(0);
   TrainParam tparam = ZeroParam();
   tparam.max_cat_to_onehot = 0;
   GPUTrainingParam param{tparam};
@@ -77,9 +78,9 @@ TEST(GpuHist, PartitionBasic) {
   cuts.cut_values_.HostVector() = std::vector<float>{0.0, 1.0, 2.0};
   cuts.cut_ptrs_.HostVector() = std::vector<uint32_t>{0, 3};
   cuts.min_vals_.HostVector() = std::vector<float>{0.0};
-  cuts.cut_ptrs_.SetDevice(0);
-  cuts.cut_values_.SetDevice(0);
-  cuts.min_vals_.SetDevice(0);
+  cuts.cut_ptrs_.SetDevice(ctx.Device());
+  cuts.cut_values_.SetDevice(ctx.Device());
+  cuts.min_vals_.SetDevice(ctx.Device());
   thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0};
 
   thrust::device_vector<int> monotonic_constraints(feature_set.size(), 0);
@@ -100,8 +101,8 @@ TEST(GpuHist, PartitionBasic) {
       false,
   };
 
-  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device());
 
   {
     // -1.0s go right
@@ -183,6 +184,7 @@ TEST(GpuHist, PartitionBasic) {
 }
 
 TEST(GpuHist, PartitionTwoFeatures) {
+  auto ctx = MakeCUDACtx(0);
   TrainParam tparam = ZeroParam();
   tparam.max_cat_to_onehot = 0;
   GPUTrainingParam param{tparam};
@@ -191,9 +193,9 @@ TEST(GpuHist, PartitionTwoFeatures) {
   cuts.cut_values_.HostVector() = std::vector<float>{0.0, 1.0, 2.0, 0.0, 1.0, 2.0};
   cuts.cut_ptrs_.HostVector() = std::vector<uint32_t>{0, 3, 6};
   cuts.min_vals_.HostVector() = std::vector<float>{0.0, 0.0};
-  cuts.cut_ptrs_.SetDevice(0);
-  cuts.cut_values_.SetDevice(0);
-  cuts.min_vals_.SetDevice(0);
+  cuts.cut_ptrs_.SetDevice(ctx.Device());
+  cuts.cut_values_.SetDevice(ctx.Device());
+  cuts.min_vals_.SetDevice(ctx.Device());
   thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0, 1};
 
   thrust::device_vector<int> monotonic_constraints(feature_set.size(), 0);
@@ -212,8 +214,8 @@ TEST(GpuHist, PartitionTwoFeatures) {
                                           cuts.min_vals_.ConstDeviceSpan(),
                                           false};
 
-  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device());
 
   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
@@ -243,6 +245,7 @@ TEST(GpuHist, PartitionTwoFeatures) {
 }
 
 TEST(GpuHist, PartitionTwoNodes) {
+  auto ctx = MakeCUDACtx(0);
   TrainParam tparam = ZeroParam();
   tparam.max_cat_to_onehot = 0;
   GPUTrainingParam param{tparam};
@@ -251,9 +254,9 @@ TEST(GpuHist, PartitionTwoNodes) {
   cuts.cut_values_.HostVector() = std::vector<float>{0.0, 1.0, 2.0};
   cuts.cut_ptrs_.HostVector() = std::vector<uint32_t>{0, 3};
   cuts.min_vals_.HostVector() = std::vector<float>{0.0};
-  cuts.cut_ptrs_.SetDevice(0);
-  cuts.cut_values_.SetDevice(0);
-  cuts.min_vals_.SetDevice(0);
+  cuts.cut_ptrs_.SetDevice(ctx.Device());
+  cuts.cut_values_.SetDevice(ctx.Device());
+  cuts.min_vals_.SetDevice(ctx.Device());
   thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0};
 
   thrust::device_vector<int> monotonic_constraints(feature_set.size(), 0);
@@ -272,8 +275,10 @@ TEST(GpuHist, PartitionTwoNodes) {
                                           cuts.min_vals_.ConstDeviceSpan(),
                                           false};
 
-  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()),
+                             ctx.Device()};
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false,
+                  ctx.Device());
 
   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
@@ -295,12 +300,14 @@ TEST(GpuHist, PartitionTwoNodes) {
 }
 
 void TestEvaluateSingleSplit(bool is_categorical) {
+  auto ctx = MakeCUDACtx(0);
   auto quantiser = DummyRoundingFactor();
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
   TrainParam tparam = ZeroParam();
   GPUTrainingParam param{tparam};
 
-  common::HistogramCuts cuts{MakeCutsForTest({1.0, 2.0, 11.0, 12.0}, {0, 2, 4}, {0.0, 0.0}, 0)};
+  common::HistogramCuts cuts{
+      MakeCutsForTest({1.0, 2.0, 11.0, 12.0}, {0, 2, 4}, {0.0, 0.0}, ctx.Device())};
   thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0, 1};
 
   // Setup gradients so that second feature gets higher gain
@@ -325,8 +332,10 @@ void TestEvaluateSingleSplit(bool is_categorical) {
                                           cuts.min_vals_.ConstDeviceSpan(),
                                           false};
 
-  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()),
+                             ctx.Device()};
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false,
+                  ctx.Device());
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
 
   EXPECT_EQ(result.findex, 1);
@@ -363,7 +372,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
                                           dh::ToSpan(feature_min_values),
                                           false};
 
-  GPUHistEvaluator evaluator(tparam, feature_set.size(), 0);
+  GPUHistEvaluator evaluator(tparam, feature_set.size(), FstCU());
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
 
   EXPECT_EQ(result.findex, 0);
@@ -375,7 +384,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
 
 TEST(GpuHist, EvaluateSingleSplitEmpty) {
   TrainParam tparam = ZeroParam();
-  GPUHistEvaluator evaluator(tparam, 1, 0);
+  GPUHistEvaluator evaluator(tparam, 1, FstCU());
   DeviceSplitCandidate result =
       evaluator
           .EvaluateSingleSplit(
@@ -410,7 +419,7 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
                                           dh::ToSpan(feature_min_values),
                                           false};
 
-  GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0);
+  GPUHistEvaluator evaluator(tparam, feature_min_values.size(), FstCU());
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
 
   EXPECT_EQ(result.findex, 1);
@@ -442,7 +451,7 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) {
                                           dh::ToSpan(feature_min_values),
                                           false};
 
-  GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0);
+  GPUHistEvaluator evaluator(tparam, feature_min_values.size(), FstCU());
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
 
   EXPECT_EQ(result.findex, 0);
@@ -477,7 +486,8 @@ TEST(GpuHist, EvaluateSplits) {
                                           dh::ToSpan(feature_min_values),
                                           false};
 
-  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_min_values.size()), 0};
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_min_values.size()),
+                             FstCU()};
   dh::device_vector<EvaluateSplitInputs> inputs =
       std::vector<EvaluateSplitInputs>{input_left, input_right};
   evaluator.LaunchEvaluateSplits(input_left.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
@@ -493,14 +503,15 @@ TEST(GpuHist, EvaluateSplits) {
 }
 
 TEST_F(TestPartitionBasedSplit, GpuHist) {
+  auto ctx = MakeCUDACtx(0);
   dh::device_vector<FeatureType> ft{std::vector<FeatureType>{FeatureType::kCategorical}};
-  GPUHistEvaluator evaluator{param_, static_cast<bst_feature_t>(info_.num_col_), 0};
+  GPUHistEvaluator evaluator{param_, static_cast<bst_feature_t>(info_.num_col_), ctx.Device()};
 
-  cuts_.cut_ptrs_.SetDevice(0);
-  cuts_.cut_values_.SetDevice(0);
-  cuts_.min_vals_.SetDevice(0);
+  cuts_.cut_ptrs_.SetDevice(ctx.Device());
+  cuts_.cut_values_.SetDevice(ctx.Device());
+  cuts_.min_vals_.SetDevice(ctx.Device());
 
-  evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, 0);
+  evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, ctx.Device());
 
   // Convert the sample histogram to fixed point
   auto quantiser = DummyRoundingFactor();
@@ -528,15 +539,16 @@ class MGPUHistTest : public BaseMGPUTest {};
 
 namespace {
 void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
+  auto ctx = MakeCUDACtx(GPUIDX);
   auto rank = collective::GetRank();
   auto quantiser = DummyRoundingFactor();
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
   TrainParam tparam = ZeroParam();
   GPUTrainingParam param{tparam};
 
-  common::HistogramCuts cuts{rank == 0
-                                 ? MakeCutsForTest({1.0, 2.0}, {0, 2, 2}, {0.0, 0.0}, GPUIDX)
-                                 : MakeCutsForTest({11.0, 12.0}, {0, 0, 2}, {0.0, 0.0}, GPUIDX)};
+  common::HistogramCuts cuts{
+      rank == 0 ? MakeCutsForTest({1.0, 2.0}, {0, 2, 2}, {0.0, 0.0}, ctx.Device())
+                : MakeCutsForTest({11.0, 12.0}, {0, 0, 2}, {0.0, 0.0}, ctx.Device())};
   thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0, 1};
 
   // Setup gradients so that second feature gets higher gain
@@ -562,8 +574,8 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
                                           cuts.min_vals_.ConstDeviceSpan(),
                                           false};
 
-  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), GPUIDX};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, GPUIDX);
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, ctx.Device());
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
 
   EXPECT_EQ(result.findex, 1) << "rank: " << rank;
@@ -583,5 +595,4 @@ TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
 TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleCategoricalSplit) {
   DoTest(VerifyColumnSplitEvaluateSingleSplit, true);
 }
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
index 26ddfd8cc..9a0304f87 100644
--- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
+++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
@@ -30,9 +30,9 @@ void VerifySampling(size_t page_size,
   for (const auto& gp : gpair.ConstHostVector()) {
     sum_gpair += gp;
   }
-  gpair.SetDevice(0);
-
   Context ctx{MakeCUDACtx(0)};
+  gpair.SetDevice(ctx.Device());
+
   auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
   auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
   if (page_size != 0) {
@@ -87,9 +87,9 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
   std::unique_ptr<DMatrix> dmat(
       CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache"));
   auto gpair = GenerateRandomGradients(kRows);
-  gpair.SetDevice(0);
-
   Context ctx{MakeCUDACtx(0)};
+  gpair.SetDevice(ctx.Device());
+
   auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
   auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
   EXPECT_NE(page->n_rows, kRows);
diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu
index 2eacd48e5..0c91cf21e 100644
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -12,9 +12,7 @@
 #include "../../categorical_helpers.h"
 #include "../../helpers.h"
 
-namespace xgboost {
-namespace tree {
-
+namespace xgboost::tree {
 void TestDeterministicHistogram(bool is_dense, int shm_size) {
   Context ctx = MakeCUDACtx(0);
   size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16;
@@ -27,22 +25,22 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
   for (auto const& batch : matrix->GetBatches<EllpackPage>(&ctx, batch_param)) {
     auto* page = batch.Impl();
 
-    tree::RowPartitioner row_partitioner(0, kRows);
+    tree::RowPartitioner row_partitioner(FstCU(), kRows);
     auto ridx = row_partitioner.GetRows(0);
 
     int num_bins = kBins * kCols;
     dh::device_vector<GradientPairInt64> histogram(num_bins);
     auto d_histogram = dh::ToSpan(histogram);
     auto gpair = GenerateRandomGradients(kRows, kLower, kUpper);
-    gpair.SetDevice(0);
+    gpair.SetDevice(FstCU());
 
     FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size,
                                  sizeof(GradientPairInt64));
 
     auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
-    BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
-                           feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx, d_histogram,
-                           quantiser);
+    BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()),
+                           feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx,
+                           d_histogram, quantiser);
 
     std::vector<GradientPairInt64> histogram_h(num_bins);
     dh::safe_cuda(cudaMemcpy(histogram_h.data(), d_histogram.data(),
@@ -54,8 +52,8 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
       auto d_new_histogram = dh::ToSpan(new_histogram);
 
       auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
-      BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
-                             feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx,
+      BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()),
+                             feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx,
                              d_new_histogram, quantiser);
 
       std::vector<GradientPairInt64> new_histogram_h(num_bins);
@@ -70,14 +68,14 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
 
     {
       auto gpair = GenerateRandomGradients(kRows, kLower, kUpper);
-      gpair.SetDevice(0);
+      gpair.SetDevice(FstCU());
 
       // Use a single feature group to compute the baseline.
       FeatureGroups single_group(page->Cuts());
 
       dh::device_vector<GradientPairInt64> baseline(num_bins);
-      BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
-                             single_group.DeviceAccessor(0), gpair.DeviceSpan(), ridx,
+      BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()),
+                             single_group.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx,
                              dh::ToSpan(baseline), quantiser);
 
       std::vector<GradientPairInt64> baseline_h(num_bins);
@@ -126,11 +124,11 @@ void TestGPUHistogramCategorical(size_t num_categories) {
   auto cat_m = GetDMatrixFromData(x, kRows, 1);
   cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
   auto batch_param = BatchParam{kBins, tree::TrainParam::DftSparseThreshold()};
-  tree::RowPartitioner row_partitioner(0, kRows);
+  tree::RowPartitioner row_partitioner(ctx.Device(), kRows);
   auto ridx = row_partitioner.GetRows(0);
   dh::device_vector<GradientPairInt64> cat_hist(num_categories);
   auto gpair = GenerateRandomGradients(kRows, 0, 2);
-  gpair.SetDevice(0);
+  gpair.SetDevice(DeviceOrd::CUDA(0));
   auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
   /**
    * Generate hist with cat data.
@@ -138,8 +136,8 @@ void TestGPUHistogramCategorical(size_t num_categories) {
   for (auto const &batch : cat_m->GetBatches<EllpackPage>(&ctx, batch_param)) {
     auto* page = batch.Impl();
     FeatureGroups single_group(page->Cuts());
-    BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
-                           single_group.DeviceAccessor(0), gpair.DeviceSpan(), ridx,
+    BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
+                           single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx,
                            dh::ToSpan(cat_hist), quantiser);
   }
 
@@ -152,8 +150,8 @@ void TestGPUHistogramCategorical(size_t num_categories) {
   for (auto const &batch : encode_m->GetBatches<EllpackPage>(&ctx, batch_param)) {
     auto* page = batch.Impl();
     FeatureGroups single_group(page->Cuts());
-    BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
-                           single_group.DeviceAccessor(0), gpair.DeviceSpan(), ridx,
+    BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
+                           single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx,
                            dh::ToSpan(encode_hist), quantiser);
   }
 
@@ -241,5 +239,4 @@ void TestAtomicAdd() {
 TEST(Histogram, AtomicAddInt64) {
   TestAtomicAdd();
 }
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
index 317728e01..14ea6fd70 100644
--- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
+++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
@@ -16,12 +16,10 @@
 #include "xgboost/task.h"
 #include "xgboost/tree_model.h"
 
-namespace xgboost {
-namespace tree {
-
+namespace xgboost::tree {
 void TestUpdatePositionBatch() {
   const int kNumRows = 10;
-  RowPartitioner rp(0, kNumRows);
+  RowPartitioner rp(FstCU(), kNumRows);
   auto rows = rp.GetRowsHost(0);
   EXPECT_EQ(rows.size(), kNumRows);
   for (auto i = 0ull; i < kNumRows; i++) {
@@ -89,12 +87,11 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
   }
 }
 
-TEST(GpuHist, SortPositionBatch) { 
-  TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 3}, {3, 6}}); 
-  TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 1}, {3, 6}}); 
+TEST(GpuHist, SortPositionBatch) {
+  TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 3}, {3, 6}});
+  TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 1}, {3, 6}});
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 6}});
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}});
 }
 
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc
index 095284a38..78fda5ce5 100644
--- a/tests/cpp/tree/hist/test_evaluate_splits.cc
+++ b/tests/cpp/tree/hist/test_evaluate_splits.cc
@@ -115,7 +115,7 @@ TEST(HistMultiEvaluator, Evaluate) {
   HistMultiEvaluator evaluator{&ctx, p_fmat->Info(), &param, sampler};
   HistMakerTrainParam hist_param;
   std::vector<BoundedHistCollection> histogram(n_targets);
-  linalg::Vector<GradientPairPrecise> root_sum({2}, Context::kCpuId);
+  linalg::Vector<GradientPairPrecise> root_sum({2}, DeviceOrd::CPU());
   for (bst_target_t t{0}; t < n_targets; ++t) {
     auto &hist = histogram[t];
     hist.Reset(n_bins * n_features, hist_param.max_cached_hist_node);
diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h
index 6cb75e23b..6506b54e8 100644
--- a/tests/cpp/tree/test_evaluate_splits.h
+++ b/tests/cpp/tree/test_evaluate_splits.h
@@ -76,7 +76,7 @@ class TestPartitionBasedSplit : public ::testing::Test {
                                                      GradientPairPrecise parent_sum) {
       int32_t best_thresh = -1;
       float best_score{-std::numeric_limits<float>::infinity()};
-      TreeEvaluator evaluator{param_, static_cast<bst_feature_t>(n_feat), -1};
+      TreeEvaluator evaluator{param_, static_cast<bst_feature_t>(n_feat), DeviceOrd::CPU()};
       auto tree_evaluator = evaluator.GetEvaluator<TrainParam>();
       GradientPairPrecise left_sum;
       auto parent_gain = tree_evaluator.CalcGain(0, param_, GradStats{total_gpair_});
@@ -111,13 +111,13 @@ class TestPartitionBasedSplit : public ::testing::Test {
 };
 
 inline auto MakeCutsForTest(std::vector<float> values, std::vector<uint32_t> ptrs,
-                            std::vector<float> min_values, int32_t device) {
+                            std::vector<float> min_values, DeviceOrd device) {
   common::HistogramCuts cuts;
   cuts.cut_values_.HostVector() = values;
   cuts.cut_ptrs_.HostVector() = ptrs;
   cuts.min_vals_.HostVector() = min_values;
 
-  if (device >= 0) {
+  if (device.IsCUDA()) {
     cuts.cut_ptrs_.SetDevice(device);
     cuts.cut_values_.SetDevice(device);
     cuts.min_vals_.SetDevice(device);
@@ -136,7 +136,7 @@ class TestCategoricalSplitWithMissing : public testing::Test {
   TrainParam param_;
 
   void SetUp() override {
-    cuts_ = MakeCutsForTest({0.0, 1.0, 2.0, 3.0}, {0, 4}, {0.0}, -1);
+    cuts_ = MakeCutsForTest({0.0, 1.0, 2.0, 3.0}, {0, 4}, {0.0}, DeviceOrd::CPU());
     auto max_cat = *std::max_element(cuts_.cut_values_.HostVector().begin(),
                                      cuts_.cut_values_.HostVector().end());
     cuts_.SetCategorical(true, max_cat);
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index f21ed1f06..accfbae08 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -29,7 +29,7 @@ TEST(GpuHist, DeviceHistogram) {
   constexpr int kNNodes = 4;
   constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
   DeviceHistogramStorage<kStopGrowing> histogram;
-  histogram.Init(0, kNBins);
+  histogram.Init(FstCU(), kNBins);
   for (int i = 0; i < kNNodes; ++i) {
     histogram.AllocateHistograms({i});
   }
@@ -102,12 +102,12 @@ void TestBuildHist(bool use_shared_memory_histograms) {
     bst_float hess = dist(&gen);
     gp = GradientPair(grad, hess);
   }
-  gpair.SetDevice(0);
+  gpair.SetDevice(DeviceOrd::CUDA(0));
 
   thrust::host_vector<common::CompressedByteT> h_gidx_buffer (page->gidx_buffer.HostVector());
-  maker.row_partitioner = std::make_unique<RowPartitioner>(0, kNRows);
+  maker.row_partitioner = std::make_unique<RowPartitioner>(FstCU(), kNRows);
 
-  maker.hist.Init(0, page->Cuts().TotalBins());
+  maker.hist.Init(FstCU(), page->Cuts().TotalBins());
   maker.hist.AllocateHistograms({0});
 
   maker.gpair = gpair.DeviceSpan();
@@ -116,8 +116,8 @@ void TestBuildHist(bool use_shared_memory_histograms) {
 
   maker.InitFeatureGroupsOnce();
 
-  BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
-                         maker.feature_groups->DeviceAccessor(0), gpair.DeviceSpan(),
+  BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(DeviceOrd::CUDA(0)),
+                         maker.feature_groups->DeviceAccessor(DeviceOrd::CUDA(0)), gpair.DeviceSpan(),
                          maker.row_partitioner->GetRows(0), maker.hist.GetNodeHistogram(0),
                          *maker.quantiser, !use_shared_memory_histograms);
 
@@ -198,7 +198,7 @@ void TestHistogramIndexImpl() {
   // histogram index
   const auto &maker = hist_maker.maker;
   auto grad = GenerateRandomGradients(kNRows);
-  grad.SetDevice(0);
+  grad.SetDevice(DeviceOrd::CUDA(0));
   maker->Reset(&grad, hist_maker_dmat.get(), kNCols);
   std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.HostVector());
 
@@ -264,17 +264,17 @@ TEST(GpuHist, UniformSampling) {
   // Create an in-memory DMatrix.
   std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));
 
-  linalg::Matrix<GradientPair> gpair({kRows}, Context{}.MakeCUDA().Ordinal());
+  linalg::Matrix<GradientPair> gpair({kRows}, Context{}.MakeCUDA().Device());
   gpair.Data()->Copy(GenerateRandomGradients(kRows));
 
   // Build a tree using the in-memory DMatrix.
   RegTree tree;
-  HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds(kRows, 0.0, DeviceOrd::CUDA(0));
   Context ctx(MakeCUDACtx(0));
   UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
   // Build another tree using sampling.
   RegTree tree_sampling;
-  HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0));
   UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample, "uniform",
              kRows);
 
@@ -295,18 +295,18 @@ TEST(GpuHist, GradientBasedSampling) {
   // Create an in-memory DMatrix.
   std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));
 
-  linalg::Matrix<GradientPair> gpair({kRows}, MakeCUDACtx(0).Ordinal());
+  linalg::Matrix<GradientPair> gpair({kRows}, MakeCUDACtx(0).Device());
   gpair.Data()->Copy(GenerateRandomGradients(kRows));
 
   // Build a tree using the in-memory DMatrix.
   RegTree tree;
-  HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds(kRows, 0.0, DeviceOrd::CUDA(0));
   Context ctx(MakeCUDACtx(0));
   UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
 
   // Build another tree using sampling.
   RegTree tree_sampling;
-  HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0));
   UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample,
              "gradient_based", kRows);
 
@@ -333,16 +333,16 @@ TEST(GpuHist, ExternalMemory) {
   std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrix(kRows, kCols, 1, tmpdir.path + "/cache"));
 
   Context ctx(MakeCUDACtx(0));
-  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
   gpair.Data()->Copy(GenerateRandomGradients(kRows));
 
   // Build a tree using the in-memory DMatrix.
   RegTree tree;
-  HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds(kRows, 0.0, DeviceOrd::CUDA(0));
   UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
   // Build another tree using multiple ELLPACK pages.
   RegTree tree_ext;
-  HostDeviceVector<bst_float> preds_ext(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds_ext(kRows, 0.0, DeviceOrd::CUDA(0));
   UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, 1.0, "uniform", kRows);
 
   // Make sure the predictions are the same.
@@ -371,20 +371,20 @@ TEST(GpuHist, ExternalMemoryWithSampling) {
       CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache"));
 
   Context ctx(MakeCUDACtx(0));
-  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
   gpair.Data()->Copy(GenerateRandomGradients(kRows));
 
   // Build a tree using the in-memory DMatrix.
   auto rng = common::GlobalRandom();
 
   RegTree tree;
-  HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds(kRows, 0.0, DeviceOrd::CUDA(0));
   UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, kSubsample, kSamplingMethod, kRows);
 
   // Build another tree using multiple ELLPACK pages.
   common::GlobalRandom() = rng;
   RegTree tree_ext;
-  HostDeviceVector<bst_float> preds_ext(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds_ext(kRows, 0.0, DeviceOrd::CUDA(0));
   UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, kSubsample,
              kSamplingMethod, kRows);
 
@@ -436,7 +436,7 @@ RegTree GetHistTree(Context const* ctx, DMatrix* dmat) {
   TrainParam param;
   param.UpdateAllowUnknown(Args{});
 
-  linalg::Matrix<GradientPair> gpair({dmat->Info().num_row_}, ctx->Ordinal());
+  linalg::Matrix<GradientPair> gpair({dmat->Info().num_row_}, ctx->Device());
   gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_));
 
   std::vector<HostDeviceVector<bst_node_t>> position(1);
@@ -486,7 +486,7 @@ RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) {
   TrainParam param;
   param.UpdateAllowUnknown(Args{});
 
-  linalg::Matrix<GradientPair> gpair({dmat->Info().num_row_}, ctx->Ordinal());
+  linalg::Matrix<GradientPair> gpair({dmat->Info().num_row_}, ctx->Device());
   gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_));
 
   std::vector<HostDeviceVector<bst_node_t>> position(1);
diff --git a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc
index e90120231..963660f59 100644
--- a/tests/cpp/tree/test_histmaker.cc
+++ b/tests/cpp/tree/test_histmaker.cc
@@ -28,7 +28,7 @@ TEST(GrowHistMaker, InteractionConstraint) {
   auto p_dmat = GenerateDMatrix(kRows, kCols);
   Context ctx;
 
-  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
   gpair.Data()->Copy(GenerateRandomGradients(kRows));
 
   ObjInfo task{ObjInfo::kRegression};
@@ -74,7 +74,7 @@ void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical,
                        RegTree const& expected_tree) {
   Context ctx;
   auto p_dmat = GenerateDMatrix(rows, cols, categorical);
-  linalg::Matrix<GradientPair> gpair({rows}, ctx.Ordinal());
+  linalg::Matrix<GradientPair> gpair({rows}, ctx.Device());
   gpair.Data()->Copy(GenerateRandomGradients(rows));
 
 
@@ -107,7 +107,7 @@ void TestColumnSplit(bool categorical) {
   {
     Context ctx;
     auto p_dmat = GenerateDMatrix(kRows, kCols, categorical);
-    linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+    linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
     gpair.Data()->Copy(GenerateRandomGradients(kRows));
     std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
     std::vector<HostDeviceVector<bst_node_t>> position(1);
diff --git a/tests/cpp/tree/test_multi_target_tree_model.cc b/tests/cpp/tree/test_multi_target_tree_model.cc
index af83ed7eb..550b8837c 100644
--- a/tests/cpp/tree/test_multi_target_tree_model.cc
+++ b/tests/cpp/tree/test_multi_target_tree_model.cc
@@ -12,9 +12,9 @@ TEST(MultiTargetTree, JsonIO) {
   bst_feature_t n_features{4};
   RegTree tree{n_targets, n_features};
   ASSERT_TRUE(tree.IsMultiTarget());
-  linalg::Vector<float> base_weight{{1.0f, 2.0f, 3.0f}, {3ul}, Context::kCpuId};
-  linalg::Vector<float> left_weight{{2.0f, 3.0f, 4.0f}, {3ul}, Context::kCpuId};
-  linalg::Vector<float> right_weight{{3.0f, 4.0f, 5.0f}, {3ul}, Context::kCpuId};
+  linalg::Vector<float> base_weight{{1.0f, 2.0f, 3.0f}, {3ul}, DeviceOrd::CPU()};
+  linalg::Vector<float> left_weight{{2.0f, 3.0f, 4.0f}, {3ul}, DeviceOrd::CPU()};
+  linalg::Vector<float> right_weight{{3.0f, 4.0f, 5.0f}, {3ul}, DeviceOrd::CPU()};
   tree.ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(),
                   left_weight.HostView(), right_weight.HostView());
   ASSERT_EQ(tree.NumNodes(), 3);
diff --git a/tests/cpp/tree/test_tree_stat.cc b/tests/cpp/tree/test_tree_stat.cc
index dc9a9c209..d112efa9d 100644
--- a/tests/cpp/tree/test_tree_stat.cc
+++ b/tests/cpp/tree/test_tree_stat.cc
@@ -33,7 +33,7 @@ class UpdaterTreeStatTest : public ::testing::Test {
     ObjInfo task{ObjInfo::kRegression};
     param.Init(Args{});
 
-    Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(Context::kCpuId));
+    Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(DeviceOrd::CPUOrdinal()));
     auto up = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
     up->Configure(Args{});
     RegTree tree{1u, kCols};
@@ -78,7 +78,7 @@ class UpdaterEtaTest : public ::testing::Test {
   void RunTest(std::string updater) {
     ObjInfo task{ObjInfo::kClassification};
 
-    Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(Context::kCpuId));
+    Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(DeviceOrd::CPUOrdinal()));
 
     float eta = 0.4;
     auto up_0 = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};

From 0080c970751cdac3f6b205553ef99a0138187599 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 21 Sep 2023 01:09:36 +0800
Subject: [PATCH 17/59] Workaround poll on macos. (#9596)

---
 rabit/src/allreduce_base.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rabit/src/allreduce_base.cc b/rabit/src/allreduce_base.cc
index 04246b5a1..416801ee2 100644
--- a/rabit/src/allreduce_base.cc
+++ b/rabit/src/allreduce_base.cc
@@ -549,7 +549,7 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_,
       break;
     }
     // select must return
-    auto poll_res = watcher.Poll(timeout_sec);
+    auto poll_res = watcher.Poll(timeout_sec, false);  // workaround: poll fails on macOS
     if (!poll_res.OK()) {
       LOG(FATAL) << poll_res.Report();
     }
@@ -814,7 +814,7 @@ AllreduceBase::TryAllgatherRing(void *sendrecvbuf_, size_t total_size,
       break;
     }
 
-    auto poll_res = watcher.Poll(timeout_sec);
+    auto poll_res = watcher.Poll(timeout_sec, false);  // workaround: poll fails on macOS
     if (!poll_res.OK()) {
       LOG(FATAL) << poll_res.Report();
     }
@@ -919,7 +919,7 @@ AllreduceBase::TryReduceScatterRing(void *sendrecvbuf_,
     if (finished) {
       break;
     }
-    auto poll_res = watcher.Poll(timeout_sec);
+    auto poll_res = watcher.Poll(timeout_sec, false);  // workaround: poll fails on macOS
     if (!poll_res.OK()) {
       LOG(FATAL) << poll_res.Report();
     }

From bbf5b9ee57620a28b44e45389faaf32ea475f94f Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Sat, 23 Sep 2023 01:28:18 +0800
Subject: [PATCH 18/59] [dask] Move dask module into directory. (#9597)

---
 .../xgboost/{dask.py => dask/__init__.py}      | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)
 rename python-package/xgboost/{dask.py => dask/__init__.py} (99%)

diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask/__init__.py
similarity index 99%
rename from python-package/xgboost/dask.py
rename to python-package/xgboost/dask/__init__.py
index f62a3e5af..a58c0f225 100644
--- a/python-package/xgboost/dask.py
+++ b/python-package/xgboost/dask/__init__.py
@@ -60,11 +60,11 @@ from typing import (
 
 import numpy
 
-from . import collective, config
-from ._typing import _T, FeatureNames, FeatureTypes, ModelIn
-from .callback import TrainingCallback
-from .compat import DataFrame, LazyLoader, concat, lazy_isinstance
-from .core import (
+from xgboost import collective, config
+from xgboost._typing import _T, FeatureNames, FeatureTypes, ModelIn
+from xgboost.callback import TrainingCallback
+from xgboost.compat import DataFrame, LazyLoader, concat, lazy_isinstance
+from xgboost.core import (
     Booster,
     DataIter,
     DMatrix,
@@ -75,8 +75,8 @@ from .core import (
     _deprecate_positional_args,
     _expect,
 )
-from .data import _is_cudf_ser, _is_cupy_array
-from .sklearn import (
+from xgboost.data import _is_cudf_ser, _is_cupy_array
+from xgboost.sklearn import (
     XGBClassifier,
     XGBClassifierBase,
     XGBClassifierMixIn,
@@ -91,8 +91,8 @@ from .sklearn import (
     _wrap_evaluation_matrices,
     xgboost_model_doc,
 )
-from .tracker import RabitTracker, get_host_ip
-from .training import train as worker_train
+from xgboost.tracker import RabitTracker, get_host_ip
+from xgboost.training import train as worker_train
 
 if TYPE_CHECKING:
     import dask

From a90d204942d3e623bd31b757160f5f8b897f03cc Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Sat, 23 Sep 2023 03:13:48 +0800
Subject: [PATCH 19/59] Use array interface for testing numpy arrays. (#9602)

---
 python-package/xgboost/core.py |  3 ++-
 python-package/xgboost/data.py | 12 ++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 486cee514..f94e60321 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -2434,6 +2434,7 @@ class Booster:
             _is_cudf_df,
             _is_cupy_array,
             _is_list,
+            _is_np_array_like,
             _is_pandas_df,
             _is_pandas_series,
             _is_tuple,
@@ -2463,7 +2464,7 @@ class Booster:
                     f"got {data.shape[1]}"
                 )
 
-        if isinstance(data, np.ndarray):
+        if _is_np_array_like(data):
             from .data import _ensure_np_dtype
 
             data, _ = _ensure_np_dtype(data, data.dtype)
diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index 428e48d10..0022a17d4 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -164,8 +164,8 @@ def _is_scipy_coo(data: DataType) -> bool:
     return isinstance(data, scipy.sparse.coo_matrix)
 
 
-def _is_numpy_array(data: DataType) -> bool:
-    return isinstance(data, (np.ndarray, np.matrix))
+def _is_np_array_like(data: DataType) -> bool:
+    return hasattr(data, "__array_interface__")
 
 
 def _ensure_np_dtype(
@@ -1071,7 +1071,7 @@ def dispatch_data_backend(
         return _from_scipy_csr(
             data.tocsr(), missing, threads, feature_names, feature_types
         )
-    if _is_numpy_array(data):
+    if _is_np_array_like(data):
         return _from_numpy_array(
             data, missing, threads, feature_names, feature_types, data_split_mode
         )
@@ -1214,7 +1214,7 @@ def dispatch_meta_backend(
     if _is_tuple(data):
         _meta_from_tuple(data, name, dtype, handle)
         return
-    if _is_numpy_array(data):
+    if _is_np_array_like(data):
         _meta_from_numpy(data, name, dtype, handle)
         return
     if _is_pandas_df(data):
@@ -1301,7 +1301,7 @@ def _proxy_transform(
         return _transform_dlpack(data), None, feature_names, feature_types
     if _is_list(data) or _is_tuple(data):
         data = np.array(data)
-    if _is_numpy_array(data):
+    if _is_np_array_like(data):
         data, _ = _ensure_np_dtype(data, data.dtype)
         return data, None, feature_names, feature_types
     if _is_scipy_csr(data):
@@ -1351,7 +1351,7 @@ def dispatch_proxy_set_data(
     if not allow_host:
         raise err
 
-    if _is_numpy_array(data):
+    if _is_np_array_like(data):
         _check_data_shape(data)
         proxy._set_data_from_array(data)  # pylint: disable=W0212
         return

From def77870f3e82c476dd96d2a0f467c8294173de3 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Fri, 22 Sep 2023 18:55:09 -0700
Subject: [PATCH 20/59] Test categorical features with column-split gpu
 quantile (#9595)

---
 src/common/quantile.cu            | 23 ++++++++++---
 src/common/quantile.h             |  8 ++---
 tests/cpp/common/test_quantile.cu | 56 ++++++++++++++++++++++++++-----
 3 files changed, 70 insertions(+), 17 deletions(-)

diff --git a/src/common/quantile.cu b/src/common/quantile.cu
index 776752476..2bf6070d5 100644
--- a/src/common/quantile.cu
+++ b/src/common/quantile.cu
@@ -634,12 +634,25 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
         });
     CHECK_EQ(num_columns_, d_in_columns_ptr.size() - 1);
     max_values.resize(d_in_columns_ptr.size() - 1);
+
+    // In some cases (e.g. column-wise data split), we may have empty columns, so we need to keep
+    // track of the unique keys (feature indices) after the `thrust::reduce_by_key` call.
+    dh::caching_device_vector<size_t> d_max_keys(d_in_columns_ptr.size() - 1);
     dh::caching_device_vector<SketchEntry> d_max_values(d_in_columns_ptr.size() - 1);
-    thrust::reduce_by_key(thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it,
-                          thrust::make_discard_iterator(), d_max_values.begin(),
-                          thrust::equal_to<bst_feature_t>{},
-                          [] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
-    dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_values));
+    auto new_end = thrust::reduce_by_key(
+        thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(),
+        d_max_values.begin(), thrust::equal_to<bst_feature_t>{},
+        [] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
+    d_max_keys.erase(new_end.first, d_max_keys.end());
+    d_max_values.erase(new_end.second, d_max_values.end());
+
+    // The device vector needs to be initialized explicitly since we may have some missing columns.
+    SketchEntry default_entry{};
+    dh::caching_device_vector<SketchEntry> d_max_results(d_in_columns_ptr.size() - 1,
+                                                         default_entry);
+    thrust::scatter(thrust::cuda::par(alloc), d_max_values.begin(), d_max_values.end(),
+                    d_max_keys.begin(), d_max_results.begin());
+    dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_results));
     auto max_it = MakeIndexTransformIter([&](auto i) {
       if (IsCat(h_feature_types, i)) {
         return max_values[i].value;
diff --git a/src/common/quantile.h b/src/common/quantile.h
index 48758b8dc..47db5f875 100644
--- a/src/common/quantile.h
+++ b/src/common/quantile.h
@@ -35,13 +35,13 @@ struct WQSummary {
   /*! \brief an entry in the sketch summary */
   struct Entry {
     /*! \brief minimum rank */
-    RType rmin;
+    RType rmin{};
     /*! \brief maximum rank */
-    RType rmax;
+    RType rmax{};
     /*! \brief maximum weight */
-    RType wmin;
+    RType wmin{};
     /*! \brief the value of data */
-    DType value;
+    DType value{};
     // constructor
     XGBOOST_DEVICE Entry() {}  // NOLINT
     // constructor
diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu
index 57c9da703..49353439f 100644
--- a/tests/cpp/common/test_quantile.cu
+++ b/tests/cpp/common/test_quantile.cu
@@ -339,6 +339,31 @@ TEST(GPUQuantile, MultiMerge) {
   });
 }
 
+TEST(GPUQuantile, MissingColumns) {
+  auto dmat = std::unique_ptr<DMatrix>{[=]() {
+    std::size_t constexpr kRows = 1000, kCols = 100;
+    auto sparsity = 0.5f;
+    std::vector<FeatureType> ft(kCols);
+    for (size_t i = 0; i < ft.size(); ++i) {
+      ft[i] = (i % 2 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical;
+    }
+    auto dmat = RandomDataGenerator{kRows, kCols, sparsity}
+                    .Seed(0)
+                    .Lower(.0f)
+                    .Upper(1.0f)
+                    .Type(ft)
+                    .MaxCategory(13)
+                    .GenerateDMatrix();
+    return dmat->SliceCol(2, 1);
+  }()};
+  dmat->Info().data_split_mode = DataSplitMode::kRow;
+
+  auto ctx = MakeCUDACtx(0);
+  std::size_t constexpr kBins = 64;
+  HistogramCuts cuts = common::DeviceSketch(&ctx, dmat.get(), kBins);
+  ASSERT_TRUE(cuts.HasCategorical());
+}
+
 namespace {
 void TestAllReduceBasic() {
   auto const world = collective::GetWorldSize();
@@ -422,18 +447,14 @@ TEST_F(MGPUQuantileTest, AllReduceBasic) {
 }
 
 namespace {
-void TestColumnSplitBasic() {
+void TestColumnSplit(DMatrix* dmat) {
   auto const world = collective::GetWorldSize();
   auto const rank = collective::GetRank();
-  std::size_t constexpr kRows = 1000, kCols = 100, kBins = 64;
-
-  auto m = std::unique_ptr<DMatrix>{[=]() {
-    auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
-    return dmat->SliceCol(world, rank);
-  }()};
+  auto m = std::unique_ptr<DMatrix>{dmat->SliceCol(world, rank)};
 
   // Generate cuts for distributed environment.
   auto ctx = MakeCUDACtx(GPUIDX);
+  std::size_t constexpr kBins = 64;
   HistogramCuts distributed_cuts = common::DeviceSketch(&ctx, m.get(), kBins);
 
   // Generate cuts for single node environment
@@ -466,7 +487,26 @@ void TestColumnSplitBasic() {
 }  // anonymous namespace
 
 TEST_F(MGPUQuantileTest, ColumnSplitBasic) {
-  DoTest(TestColumnSplitBasic);
+  std::size_t constexpr kRows = 1000, kCols = 100;
+  auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
+  DoTest(TestColumnSplit, dmat.get());
+}
+
+TEST_F(MGPUQuantileTest, ColumnSplitCategorical) {
+  std::size_t constexpr kRows = 1000, kCols = 100;
+  auto sparsity = 0.5f;
+  std::vector<FeatureType> ft(kCols);
+  for (size_t i = 0; i < ft.size(); ++i) {
+    ft[i] = (i % 2 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical;
+  }
+  auto dmat = RandomDataGenerator{kRows, kCols, sparsity}
+                  .Seed(0)
+                  .Lower(.0f)
+                  .Upper(1.0f)
+                  .Type(ft)
+                  .MaxCategory(13)
+                  .GenerateDMatrix();
+  DoTest(TestColumnSplit, dmat.get());
 }
 
 namespace {

From cac2cd2e940bc91190e7f456e15cb2b15b7f97a3 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Sat, 23 Sep 2023 21:44:03 +0800
Subject: [PATCH 21/59] [R] Set number of threads in demos and tests. (#9591)

- Restrict the number of threads in IO.
- Specify the number of threads in demos and tests.
- Add helper scripts for checks.
---
 R-package/.Rbuildignore                       |   2 +
 R-package/R/callbacks.R                       |  14 +-
 R-package/R/xgb.Booster.R                     |  26 +-
 R-package/R/xgb.DMatrix.R                     |   7 +-
 R-package/R/xgb.load.R                        |  15 +-
 R-package/R/xgb.model.dt.tree.R               |   5 +-
 R-package/R/xgb.plot.deepness.R               |   7 +-
 R-package/R/xgb.plot.importance.R             |   9 +-
 R-package/R/xgb.plot.multi.trees.R            |  11 +-
 R-package/R/xgb.plot.shap.R                   |  12 +-
 R-package/R/xgb.save.R                        |  14 +-
 R-package/R/xgb.save.raw.R                    |   9 +-
 R-package/R/xgb.train.R                       |  29 +-
 R-package/man/cb.gblinear.history.Rd          |  14 +-
 R-package/man/predict.xgb.Booster.Rd          |   7 +-
 R-package/man/xgb.DMatrix.Rd                  |   7 +-
 R-package/man/xgb.config.Rd                   |   9 +-
 R-package/man/xgb.load.Rd                     |  15 +-
 R-package/man/xgb.model.dt.tree.Rd            |   5 +-
 R-package/man/xgb.plot.deepness.Rd            |   7 +-
 R-package/man/xgb.plot.importance.Rd          |   9 +-
 R-package/man/xgb.plot.multi.trees.Rd         |  11 +-
 R-package/man/xgb.plot.shap.Rd                |  12 +-
 R-package/man/xgb.save.Rd                     |  14 +-
 R-package/man/xgb.save.raw.Rd                 |   9 +-
 R-package/man/xgb.train.Rd                    |  29 +-
 R-package/tests/helper_scripts/run-examples.R |  25 ++
 R-package/tests/testthat/test_basic.R         | 272 +++++++++++-------
 R-package/tests/testthat/test_callbacks.R     |  19 +-
 .../tests/testthat/test_custom_objective.R    |  14 +-
 R-package/tests/testthat/test_dmatrix.R       |  65 +++--
 .../tests/testthat/test_feature_weights.R     |  10 +-
 R-package/tests/testthat/test_glm.R           |  22 +-
 R-package/tests/testthat/test_helpers.R       |   6 +-
 .../testthat/test_interaction_constraints.R   |  17 +-
 R-package/tests/testthat/test_interactions.R  |  24 +-
 R-package/tests/testthat/test_io.R            |   3 +-
 .../tests/testthat/test_model_compatibility.R |   3 +-
 .../tests/testthat/test_parameter_exposure.R  |   8 +-
 .../tests/testthat/test_poisson_regression.R  |   6 +-
 R-package/tests/testthat/test_ranking.R       |  16 +-
 R-package/tests/testthat/test_unicode.R       |   1 +
 R-package/tests/testthat/test_update.R        |  19 +-
 R-package/vignettes/discoverYourData.Rmd      |  30 +-
 R-package/vignettes/xgboost.Rnw               |   9 +-
 R-package/vignettes/xgboostPresentation.Rmd   |  34 +--
 R-package/vignettes/xgboostfromJSON.Rmd       |   2 +-
 doc/contrib/coding_guide.rst                  |  18 ++
 src/data/ellpack_page_source.h                |   1 +
 src/gbm/gbtree_model.cc                       |  24 +-
 tests/ci_build/test_r_package.py              |  54 +++-
 51 files changed, 714 insertions(+), 296 deletions(-)
 create mode 100644 R-package/tests/helper_scripts/run-examples.R

diff --git a/R-package/.Rbuildignore b/R-package/.Rbuildignore
index b37d627ba..b1932e324 100644
--- a/R-package/.Rbuildignore
+++ b/R-package/.Rbuildignore
@@ -4,3 +4,5 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 README.md
+^doc$
+^Meta$
diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R
index 96bcaef32..54f821a79 100644
--- a/R-package/R/callbacks.R
+++ b/R-package/R/callbacks.R
@@ -557,14 +557,18 @@ cb.cv.predict <- function(save_models = FALSE) {
 #'
 #' @examples
 #' #### Binary classification:
-#' #
+#'
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
+#'
 #' # In the iris dataset, it is hard to linearly separate Versicolor class from the rest
 #' # without considering the 2nd order interactions:
 #' x <- model.matrix(Species ~ .^2, iris)[,-1]
 #' colnames(x)
-#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = 2)
+#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = nthread)
 #' param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc",
-#'               lambda = 0.0003, alpha = 0.0003, nthread = 2)
+#'               lambda = 0.0003, alpha = 0.0003, nthread = nthread)
 #' # For 'shotgun', which is a default linear updater, using high eta values may result in
 #' # unstable behaviour in some datasets. With this simple dataset, however, the high learning
 #' # rate does not break the convergence, but allows us to illustrate the typical pattern of
@@ -594,9 +598,9 @@ cb.cv.predict <- function(save_models = FALSE) {
 #'
 #' #### Multiclass classification:
 #' #
-#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = 1)
+#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread)
 #' param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
-#'               lambda = 0.0003, alpha = 0.0003, nthread = 1)
+#'               lambda = 0.0003, alpha = 0.0003, nthread = nthread)
 #' # For the default linear updater 'shotgun' it sometimes is helpful
 #' # to use smaller eta to reduce instability
 #' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5,
diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index 5ffbbc31c..f196b1a1d 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -267,11 +267,16 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #'
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
+#'
+#' ## Keep the number of threads to 2 for examples
+#' nthread <- 2
+#' data.table::setDTthreads(nthread)
+#'
 #' train <- agaricus.train
 #' test <- agaricus.test
 #'
 #' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-#'                eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic")
+#'                eta = 0.5, nthread = nthread, nrounds = 5, objective = "binary:logistic")
 #' # use all trees by default
 #' pred <- predict(bst, test$data)
 #' # use only the 1st tree
@@ -337,8 +342,14 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
                                 reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
   object <- xgb.Booster.complete(object, saveraw = FALSE)
 
-  if (!inherits(newdata, "xgb.DMatrix"))
-    newdata <- xgb.DMatrix(newdata, missing = missing, nthread = NVL(object$params[["nthread"]], -1))
+  if (!inherits(newdata, "xgb.DMatrix")) {
+    config <- jsonlite::fromJSON(xgb.config(object))
+    nthread <- strtoi(config$learner$generic_param$nthread)
+    newdata <- xgb.DMatrix(
+      newdata,
+      missing = missing, nthread = NVL(nthread, -1)
+    )
+  }
   if (!is.null(object[["feature_names"]]) &&
       !is.null(colnames(newdata)) &&
       !identical(object[["feature_names"]], colnames(newdata)))
@@ -628,10 +639,15 @@ xgb.attributes <- function(object) {
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
 #' train <- agaricus.train
 #'
-#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+#' bst <- xgboost(
+#'   data = train$data, label = train$label, max_depth = 2,
+#'   eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic"
+#' )
 #' config <- xgb.config(bst)
 #'
 #' @rdname xgb.config
diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R
index fc2609416..8586ae086 100644
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -18,7 +18,12 @@
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
-#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
+#' dtrain <- with(
+#'   agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread)
+#' )
 #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
 #' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
 #' if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R
index cfbf0b2d8..cbdbdacc3 100644
--- a/R-package/R/xgb.load.R
+++ b/R-package/R/xgb.load.R
@@ -22,14 +22,23 @@
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
+#'
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
+#'
 #' train <- agaricus.train
 #' test <- agaricus.test
-#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-#'                eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+#' bst <- xgboost(
+#'   data = train$data, label = train$label, max_depth = 2, eta = 1,
+#'   nthread = nthread,
+#'   nrounds = 2,
+#'   objective = "binary:logistic"
+#' )
+#'
 #' xgb.save(bst, 'xgb.model')
 #' bst <- xgb.load('xgb.model')
 #' if (file.exists('xgb.model')) file.remove('xgb.model')
-#' pred <- predict(bst, test$data)
 #' @export
 xgb.load <- function(modelfile) {
   if (is.null(modelfile))
diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R
index fa11c50fb..d69169b89 100644
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -46,9 +46,12 @@
 #' # Basic use:
 #'
 #' data(agaricus.train, package='xgboost')
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
 #'
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
-#'                eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+#'                eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic")
 #'
 #' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
 #'
diff --git a/R-package/R/xgb.plot.deepness.R b/R-package/R/xgb.plot.deepness.R
index f6230e1ab..f0fe0f134 100644
--- a/R-package/R/xgb.plot.deepness.R
+++ b/R-package/R/xgb.plot.deepness.R
@@ -45,10 +45,13 @@
 #' @examples
 #'
 #' data(agaricus.train, package='xgboost')
+#' ## Keep the number of threads to 2 for examples
+#' nthread <- 2
+#' data.table::setDTthreads(nthread)
 #'
-#' # Change max_depth to a higher number to get a more significant result
+#' ## Change max_depth to a higher number to get a more significant result
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 6,
-#'                eta = 0.1, nthread = 2, nrounds = 50, objective = "binary:logistic",
+#'                eta = 0.1, nthread = nthread, nrounds = 50, objective = "binary:logistic",
 #'                subsample = 0.5, min_child_weight = 2)
 #'
 #' xgb.plot.deepness(bst)
diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R
index 7104d701f..2c02d5a42 100644
--- a/R-package/R/xgb.plot.importance.R
+++ b/R-package/R/xgb.plot.importance.R
@@ -45,9 +45,14 @@
 #'
 #' @examples
 #' data(agaricus.train)
+#' ## Keep the number of threads to 2 for examples
+#' nthread <- 2
+#' data.table::setDTthreads(nthread)
 #'
-#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
-#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+#' bst <- xgboost(
+#'   data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
+#'   eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic"
+#' )
 #'
 #' importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)
 #'
diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R
index 63c66008d..f4d797a61 100644
--- a/R-package/R/xgb.plot.multi.trees.R
+++ b/R-package/R/xgb.plot.multi.trees.R
@@ -43,10 +43,15 @@
 #' @examples
 #'
 #' data(agaricus.train, package='xgboost')
+#' ## Keep the number of threads to 2 for examples
+#' nthread <- 2
+#' data.table::setDTthreads(nthread)
 #'
-#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
-#'                eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
-#'                min_child_weight = 50, verbose = 0)
+#' bst <- xgboost(
+#'   data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
+#'   eta = 1, nthread = nthread, nrounds = 30, objective = "binary:logistic",
+#'   min_child_weight = 50, verbose = 0
+#' )
 #'
 #' p <- xgb.plot.multi.trees(model = bst, features_keep = 3)
 #' print(p)
diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R
index 9efcb66ec..d9afd5546 100644
--- a/R-package/R/xgb.plot.shap.R
+++ b/R-package/R/xgb.plot.shap.R
@@ -74,9 +74,14 @@
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #'
-#' bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50,
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
+#' nrounds <- 20
+#'
+#' bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = nrounds,
 #'                eta = 0.1, max_depth = 3, subsample = .5,
-#'                method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0)
+#'                method = "hist", objective = "binary:logistic", nthread = nthread, verbose = 0)
 #'
 #' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
 #' contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
@@ -85,12 +90,11 @@
 #'
 #' # multiclass example - plots for each class separately:
 #' nclass <- 3
-#' nrounds <- 20
 #' x <- as.matrix(iris[, -5])
 #' set.seed(123)
 #' is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
 #' mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds,
-#'                 max_depth = 2, eta = 0.3, subsample = .5, nthread = 2,
+#'                 max_depth = 2, eta = 0.3, subsample = .5, nthread = nthread,
 #'                 objective = "multi:softprob", num_class = nclass, verbose = 0)
 #' trees0 <- seq(from=0, by=nclass, length.out=nrounds)
 #' col <- rgb(0, 0, 1, 0.5)
diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R
index 14be0f065..ab55bc4a9 100644
--- a/R-package/R/xgb.save.R
+++ b/R-package/R/xgb.save.R
@@ -25,14 +25,22 @@
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
+#'
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
+#'
 #' train <- agaricus.train
 #' test <- agaricus.test
-#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-#'                eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+#' bst <- xgboost(
+#'   data = train$data, label = train$label, max_depth = 2, eta = 1,
+#'   nthread = nthread,
+#'   nrounds = 2,
+#'   objective = "binary:logistic"
+#' )
 #' xgb.save(bst, 'xgb.model')
 #' bst <- xgb.load('xgb.model')
 #' if (file.exists('xgb.model')) file.remove('xgb.model')
-#' pred <- predict(bst, test$data)
 #' @export
 xgb.save <- function(model, fname) {
   if (typeof(fname) != "character")
diff --git a/R-package/R/xgb.save.raw.R b/R-package/R/xgb.save.raw.R
index 48fdbca45..cad0fb0e0 100644
--- a/R-package/R/xgb.save.raw.R
+++ b/R-package/R/xgb.save.raw.R
@@ -16,13 +16,18 @@
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
+#'
+#' ## Keep the number of threads to 2 for examples
+#' nthread <- 2
+#' data.table::setDTthreads(nthread)
+#'
 #' train <- agaricus.train
 #' test <- agaricus.test
 #' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-#'                eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+#'                eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic")
+#'
 #' raw <- xgb.save.raw(bst)
 #' bst <- xgb.load.raw(raw)
-#' pred <- predict(bst, test$data)
 #'
 #' @export
 xgb.save.raw <- function(model, raw_format = "deprecated") {
diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R
index 7fe64ab34..d93a0643d 100644
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -168,7 +168,8 @@
 #' than the \code{xgboost} interface.
 #'
 #' Parallelization is automatically enabled if \code{OpenMP} is present.
-#' Number of threads can also be manually specified via \code{nthread} parameter.
+#' Number of threads can also be manually specified via the \code{nthread}
+#' parameter.
 #'
 #' The evaluation metric is chosen automatically by XGBoost (according to the objective)
 #' when the \code{eval_metric} parameter is not provided.
@@ -237,17 +238,25 @@
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #'
-#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
-#' dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
+#'
+#' dtrain <- with(
+#'   agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread)
+#' )
+#' dtest <- with(
+#'   agaricus.test, xgb.DMatrix(data, label = label, nthread = nthread)
+#' )
 #' watchlist <- list(train = dtrain, eval = dtest)
 #'
 #' ## A simple xgb.train example:
-#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2,
+#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
 #'               objective = "binary:logistic", eval_metric = "auc")
 #' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
 #'
-#'
-#' ## An xgb.train example where custom objective and evaluation metric are used:
+#' ## An xgb.train example where custom objective and evaluation metric are
+#' ## used:
 #' logregobj <- function(preds, dtrain) {
 #'    labels <- getinfo(dtrain, "label")
 #'    preds <- 1/(1 + exp(-preds))
@@ -263,12 +272,12 @@
 #'
 #' # These functions could be used by passing them either:
 #' #  as 'objective' and 'eval_metric' parameters in the params list:
-#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2,
+#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
 #'               objective = logregobj, eval_metric = evalerror)
 #' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
 #'
 #' #  or through the ... arguments:
-#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2)
+#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread)
 #' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
 #'                  objective = logregobj, eval_metric = evalerror)
 #'
@@ -278,7 +287,7 @@
 #'
 #'
 #' ## An xgb.train example of using variable learning rates at each iteration:
-#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2,
+#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
 #'               objective = "binary:logistic", eval_metric = "auc")
 #' my_etas <- list(eta = c(0.5, 0.1))
 #' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
@@ -290,7 +299,7 @@
 #'
 #' ## An 'xgboost' interface example:
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
-#'                max_depth = 2, eta = 1, nthread = 2, nrounds = 2,
+#'                max_depth = 2, eta = 1, nthread = nthread, nrounds = 2,
 #'                objective = "binary:logistic")
 #' pred <- predict(bst, agaricus.test$data)
 #'
diff --git a/R-package/man/cb.gblinear.history.Rd b/R-package/man/cb.gblinear.history.Rd
index f050fc7f1..2a03c14db 100644
--- a/R-package/man/cb.gblinear.history.Rd
+++ b/R-package/man/cb.gblinear.history.Rd
@@ -35,14 +35,18 @@ Callback function expects the following values to be set in its calling frame:
 }
 \examples{
 #### Binary classification:
-#
+
+## Keep the number of threads to 1 for examples
+nthread <- 1
+data.table::setDTthreads(nthread)
+
 # In the iris dataset, it is hard to linearly separate Versicolor class from the rest
 # without considering the 2nd order interactions:
 x <- model.matrix(Species ~ .^2, iris)[,-1]
 colnames(x)
-dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = 2)
+dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = nthread)
 param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc",
-              lambda = 0.0003, alpha = 0.0003, nthread = 2)
+              lambda = 0.0003, alpha = 0.0003, nthread = nthread)
 # For 'shotgun', which is a default linear updater, using high eta values may result in
 # unstable behaviour in some datasets. With this simple dataset, however, the high learning
 # rate does not break the convergence, but allows us to illustrate the typical pattern of
@@ -72,9 +76,9 @@ matplot(xgb.gblinear.history(bst)[[3]], type = 'l')
 
 #### Multiclass classification:
 #
-dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = 1)
+dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread)
 param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
-              lambda = 0.0003, alpha = 0.0003, nthread = 1)
+              lambda = 0.0003, alpha = 0.0003, nthread = nthread)
 # For the default linear updater 'shotgun' it sometimes is helpful
 # to use smaller eta to reduce instability
 bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5,
diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd
index 87f06d451..ee3b370c4 100644
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -132,11 +132,16 @@ Note also that converting a matrix to \code{\link{xgb.DMatrix}} uses multiple th
 
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
+
+## Keep the number of threads to 2 for examples
+nthread <- 2
+data.table::setDTthreads(nthread)
+
 train <- agaricus.train
 test <- agaricus.test
 
 bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-               eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic")
+               eta = 0.5, nthread = nthread, nrounds = 5, objective = "binary:logistic")
 # use all trees by default
 pred <- predict(bst, test$data)
 # use only the 1st tree
diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd
index 742073fad..59ef0b3be 100644
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -38,7 +38,12 @@ Supported input file formats are either a LIBSVM text file or a binary file that
 }
 \examples{
 data(agaricus.train, package='xgboost')
-dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
+## Keep the number of threads to 1 for examples
+nthread <- 1
+data.table::setDTthreads(nthread)
+dtrain <- with(
+  agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread)
+)
 xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
 dtrain <- xgb.DMatrix('xgb.DMatrix.data')
 if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
diff --git a/R-package/man/xgb.config.Rd b/R-package/man/xgb.config.Rd
index a5187c8ea..35545cc77 100644
--- a/R-package/man/xgb.config.Rd
+++ b/R-package/man/xgb.config.Rd
@@ -19,10 +19,15 @@ Accessors for model parameters as JSON string.
 }
 \examples{
 data(agaricus.train, package='xgboost')
+## Keep the number of threads to 1 for examples
+nthread <- 1
+data.table::setDTthreads(nthread)
 train <- agaricus.train
 
-bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+bst <- xgboost(
+  data = train$data, label = train$label, max_depth = 2,
+  eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic"
+)
 config <- xgb.config(bst)
 
 }
diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd
index f644bc408..1a406cc21 100644
--- a/R-package/man/xgb.load.Rd
+++ b/R-package/man/xgb.load.Rd
@@ -27,14 +27,23 @@ not \code{xgb.load}.
 \examples{
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
+
+## Keep the number of threads to 1 for examples
+nthread <- 1
+data.table::setDTthreads(nthread)
+
 train <- agaricus.train
 test <- agaricus.test
-bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-               eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+bst <- xgboost(
+  data = train$data, label = train$label, max_depth = 2, eta = 1,
+  nthread = nthread,
+  nrounds = 2,
+  objective = "binary:logistic"
+)
+
 xgb.save(bst, 'xgb.model')
 bst <- xgb.load('xgb.model')
 if (file.exists('xgb.model')) file.remove('xgb.model')
-pred <- predict(bst, test$data)
 }
 \seealso{
 \code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}.
diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd
index b89d298b6..5a17f9d90 100644
--- a/R-package/man/xgb.model.dt.tree.Rd
+++ b/R-package/man/xgb.model.dt.tree.Rd
@@ -66,9 +66,12 @@ Parse a boosted tree model text dump into a \code{data.table} structure.
 # Basic use:
 
 data(agaricus.train, package='xgboost')
+## Keep the number of threads to 1 for examples
+nthread <- 1
+data.table::setDTthreads(nthread)
 
 bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
-               eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+               eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic")
 
 (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
 
diff --git a/R-package/man/xgb.plot.deepness.Rd b/R-package/man/xgb.plot.deepness.Rd
index 39e291a81..9e23ac130 100644
--- a/R-package/man/xgb.plot.deepness.Rd
+++ b/R-package/man/xgb.plot.deepness.Rd
@@ -61,10 +61,13 @@ This function was inspired by the blog post
 \examples{
 
 data(agaricus.train, package='xgboost')
+## Keep the number of threads to 2 for examples
+nthread <- 2
+data.table::setDTthreads(nthread)
 
-# Change max_depth to a higher number to get a more significant result
+## Change max_depth to a higher number to get a more significant result
 bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 6,
-               eta = 0.1, nthread = 2, nrounds = 50, objective = "binary:logistic",
+               eta = 0.1, nthread = nthread, nrounds = 50, objective = "binary:logistic",
                subsample = 0.5, min_child_weight = 2)
 
 xgb.plot.deepness(bst)
diff --git a/R-package/man/xgb.plot.importance.Rd b/R-package/man/xgb.plot.importance.Rd
index 1ee58b7ad..4dba62afe 100644
--- a/R-package/man/xgb.plot.importance.Rd
+++ b/R-package/man/xgb.plot.importance.Rd
@@ -77,9 +77,14 @@ with bar colors corresponding to different clusters that have somewhat similar i
 }
 \examples{
 data(agaricus.train)
+## Keep the number of threads to 2 for examples
+nthread <- 2
+data.table::setDTthreads(nthread)
 
-bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
-               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+bst <- xgboost(
+  data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
+  eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic"
+)
 
 importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)
 
diff --git a/R-package/man/xgb.plot.multi.trees.Rd b/R-package/man/xgb.plot.multi.trees.Rd
index 74c4a0604..4fa526b90 100644
--- a/R-package/man/xgb.plot.multi.trees.Rd
+++ b/R-package/man/xgb.plot.multi.trees.Rd
@@ -63,10 +63,15 @@ This function is inspired by this blog post:
 \examples{
 
 data(agaricus.train, package='xgboost')
+## Keep the number of threads to 2 for examples
+nthread <- 2
+data.table::setDTthreads(nthread)
 
-bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
-               eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
-               min_child_weight = 50, verbose = 0)
+bst <- xgboost(
+  data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
+  eta = 1, nthread = nthread, nrounds = 30, objective = "binary:logistic",
+  min_child_weight = 50, verbose = 0
+)
 
 p <- xgb.plot.multi.trees(model = bst, features_keep = 3)
 print(p)
diff --git a/R-package/man/xgb.plot.shap.Rd b/R-package/man/xgb.plot.shap.Rd
index a55a551de..6f2d0dfa6 100644
--- a/R-package/man/xgb.plot.shap.Rd
+++ b/R-package/man/xgb.plot.shap.Rd
@@ -124,9 +124,14 @@ a meaningful thing to do.
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 
-bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50,
+## Keep the number of threads to 1 for examples
+nthread <- 1
+data.table::setDTthreads(nthread)
+nrounds <- 20
+
+bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = nrounds,
                eta = 0.1, max_depth = 3, subsample = .5,
-               method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0)
+               method = "hist", objective = "binary:logistic", nthread = nthread, verbose = 0)
 
 xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
 contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
@@ -135,12 +140,11 @@ xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12)  # S
 
 # multiclass example - plots for each class separately:
 nclass <- 3
-nrounds <- 20
 x <- as.matrix(iris[, -5])
 set.seed(123)
 is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
 mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds,
-                max_depth = 2, eta = 0.3, subsample = .5, nthread = 2,
+                max_depth = 2, eta = 0.3, subsample = .5, nthread = nthread,
                 objective = "multi:softprob", num_class = nclass, verbose = 0)
 trees0 <- seq(from=0, by=nclass, length.out=nrounds)
 col <- rgb(0, 0, 1, 0.5)
diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd
index 235fc504c..a7e160a12 100644
--- a/R-package/man/xgb.save.Rd
+++ b/R-package/man/xgb.save.Rd
@@ -31,14 +31,22 @@ releases of XGBoost.
 \examples{
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
+
+## Keep the number of threads to 1 for examples
+nthread <- 1
+data.table::setDTthreads(nthread)
+
 train <- agaricus.train
 test <- agaricus.test
-bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-               eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+bst <- xgboost(
+  data = train$data, label = train$label, max_depth = 2, eta = 1,
+  nthread = nthread,
+  nrounds = 2,
+  objective = "binary:logistic"
+)
 xgb.save(bst, 'xgb.model')
 bst <- xgb.load('xgb.model')
 if (file.exists('xgb.model')) file.remove('xgb.model')
-pred <- predict(bst, test$data)
 }
 \seealso{
 \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd
index ad188eb83..c7c93a734 100644
--- a/R-package/man/xgb.save.raw.Rd
+++ b/R-package/man/xgb.save.raw.Rd
@@ -25,12 +25,17 @@ Save xgboost model from xgboost or xgb.train
 \examples{
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
+
+## Keep the number of threads to 2 for examples
+nthread <- 2
+data.table::setDTthreads(nthread)
+
 train <- agaricus.train
 test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-               eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+               eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic")
+
 raw <- xgb.save.raw(bst)
 bst <- xgb.load.raw(raw)
-pred <- predict(bst, test$data)
 
 }
diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd
index d2eeadfd0..105009cf8 100644
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -250,7 +250,8 @@ customized objective and evaluation metric functions, therefore it is more flexi
 than the \code{xgboost} interface.
 
 Parallelization is automatically enabled if \code{OpenMP} is present.
-Number of threads can also be manually specified via \code{nthread} parameter.
+Number of threads can also be manually specified via the \code{nthread}
+parameter.
 
 The evaluation metric is chosen automatically by XGBoost (according to the objective)
 when the \code{eval_metric} parameter is not provided.
@@ -286,17 +287,25 @@ The following callbacks are automatically created when certain parameters are se
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 
-dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
-dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))
+## Keep the number of threads to 1 for examples
+nthread <- 1
+data.table::setDTthreads(nthread)
+
+dtrain <- with(
+  agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread)
+)
+dtest <- with(
+  agaricus.test, xgb.DMatrix(data, label = label, nthread = nthread)
+)
 watchlist <- list(train = dtrain, eval = dtest)
 
 ## A simple xgb.train example:
-param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2,
+param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
               objective = "binary:logistic", eval_metric = "auc")
 bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
 
-
-## An xgb.train example where custom objective and evaluation metric are used:
+## An xgb.train example where custom objective and evaluation metric are
+## used:
 logregobj <- function(preds, dtrain) {
    labels <- getinfo(dtrain, "label")
    preds <- 1/(1 + exp(-preds))
@@ -312,12 +321,12 @@ evalerror <- function(preds, dtrain) {
 
 # These functions could be used by passing them either:
 #  as 'objective' and 'eval_metric' parameters in the params list:
-param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2,
+param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
               objective = logregobj, eval_metric = evalerror)
 bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
 
 #  or through the ... arguments:
-param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2)
+param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread)
 bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
                  objective = logregobj, eval_metric = evalerror)
 
@@ -327,7 +336,7 @@ bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
 
 
 ## An xgb.train example of using variable learning rates at each iteration:
-param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2,
+param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
               objective = "binary:logistic", eval_metric = "auc")
 my_etas <- list(eta = c(0.5, 0.1))
 bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
@@ -339,7 +348,7 @@ bst <- xgb.train(param, dtrain, nrounds = 25, watchlist,
 
 ## An 'xgboost' interface example:
 bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
-               max_depth = 2, eta = 1, nthread = 2, nrounds = 2,
+               max_depth = 2, eta = 1, nthread = nthread, nrounds = 2,
                objective = "binary:logistic")
 pred <- predict(bst, agaricus.test$data)
 
diff --git a/R-package/tests/helper_scripts/run-examples.R b/R-package/tests/helper_scripts/run-examples.R
new file mode 100644
index 000000000..08dd3d2a0
--- /dev/null
+++ b/R-package/tests/helper_scripts/run-examples.R
@@ -0,0 +1,25 @@
+## Helper script: run every man/*.Rd example and report CPU-heavy ones.
+library(pkgload)
+library(xgboost)
+
+files <- list.files("./man", pattern = "\\.Rd$")
+
+## Run the examples of one Rd file; returns list(file, time = proc.time delta).
+run_example_timeit <- function(f) {
+  path <- file.path("./man", f)
+  print(paste("Test", f))
+  flush.console()
+  t0 <- proc.time()
+  run_example(path)
+  t1 <- proc.time()
+  list(file = f, time = t1 - t0)
+}
+
+timings <- lapply(files, run_example_timeit)
+## Flag examples whose user CPU time is at least 2.5x the elapsed time.
+for (timing in timings) {
+  ratio <- timing$time[["user.self"]] / timing$time[["elapsed"]]
+  if (!is.na(ratio) && !is.infinite(ratio) && ratio >= 2.5) {
+    print(paste("Offending example:", timing$file, ratio))
+  }
+}
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index a21b03d77..b7e819738 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -1,23 +1,28 @@
 context("basic functions")
 
-data(agaricus.train, package = 'xgboost')
-data(agaricus.test, package = 'xgboost')
+data(agaricus.train, package = "xgboost")
+data(agaricus.test, package = "xgboost")
 train <- agaricus.train
 test <- agaricus.test
 set.seed(1994)
 
 # disable some tests for Win32
 windows_flag <- .Platform$OS.type == "windows" &&
-               .Machine$sizeof.pointer != 8
-solaris_flag <- (Sys.info()['sysname'] == "SunOS")
+  .Machine$sizeof.pointer != 8
+solaris_flag <- (Sys.info()["sysname"] == "SunOS")
+n_threads <- 1
+
 
 test_that("train and predict binary classification", {
   nrounds <- 2
   expect_output(
-    bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-                  eta = 1, nthread = 2, nrounds = nrounds, objective = "binary:logistic",
-                  eval_metric = "error")
-  , "train-error")
+    bst <- xgboost(
+      data = train$data, label = train$label, max_depth = 2,
+      eta = 1, nthread = n_threads, nrounds = nrounds,
+      objective = "binary:logistic", eval_metric = "error"
+    ),
+    "train-error"
+  )
   expect_equal(class(bst), "xgb.Booster")
   expect_equal(bst$niter, nrounds)
   expect_false(is.null(bst$evaluation_log))
@@ -46,26 +51,39 @@ test_that("parameter validation works", {
   d <- cbind(
     x1 = rnorm(10),
     x2 = rnorm(10),
-    x3 = rnorm(10))
+    x3 = rnorm(10)
+  )
   y <- d[, "x1"] + d[, "x2"]^2 +
     ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) +
     rnorm(10)
-  dtrain <- xgb.DMatrix(data = d, info = list(label = y))
+  dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)
 
   correct <- function() {
-    params <- list(max_depth = 2, booster = "dart",
-                   rate_drop = 0.5, one_drop = TRUE,
-                   objective = "reg:squarederror")
+    params <- list(
+      max_depth = 2,
+      booster = "dart",
+      rate_drop = 0.5,
+      one_drop = TRUE,
+      nthread = n_threads,
+      objective = "reg:squarederror"
+    )
     xgb.train(params = params, data = dtrain, nrounds = nrounds)
   }
   expect_silent(correct())
   incorrect <- function() {
-    params <- list(max_depth = 2, booster = "dart",
-                   rate_drop = 0.5, one_drop = TRUE,
-                   objective = "reg:squarederror",
-                   foo = "bar", bar = "foo")
+    params <- list(
+      max_depth = 2,
+      booster = "dart",
+      rate_drop = 0.5,
+      one_drop = TRUE,
+      objective = "reg:squarederror",
+      nthread = n_threads,
+      foo = "bar",
+      bar = "foo"
+    )
     output <- capture.output(
-      xgb.train(params = params, data = dtrain, nrounds = nrounds))
+      xgb.train(params = params, data = dtrain, nrounds = nrounds)
+    )
     print(output)
   }
   expect_output(incorrect(), '\\\\"bar\\\\", \\\\"foo\\\\"')
@@ -79,7 +97,8 @@ test_that("dart prediction works", {
   d <- cbind(
     x1 = rnorm(100),
     x2 = rnorm(100),
-    x3 = rnorm(100))
+    x3 = rnorm(100)
+  )
   y <- d[, "x1"] + d[, "x2"]^2 +
     ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) +
     rnorm(100)
@@ -93,7 +112,7 @@ test_that("dart prediction works", {
     rate_drop = 0.5,
     one_drop = TRUE,
     eta = 1,
-    nthread = 2,
+    nthread = n_threads,
     nrounds = nrounds,
     objective = "reg:squarederror"
   )
@@ -105,7 +124,7 @@ test_that("dart prediction works", {
   expect_false(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE)))
 
   set.seed(1994)
-  dtrain <- xgb.DMatrix(data = d, info = list(label = y))
+  dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)
   booster_by_train <- xgb.train(
     params = list(
       booster = "dart",
@@ -113,7 +132,7 @@ test_that("dart prediction works", {
       eta = 1,
       rate_drop = 0.5,
       one_drop = TRUE,
-      nthread = 1,
+      nthread = n_threads,
       objective = "reg:squarederror"
     ),
     data = dtrain,
@@ -132,10 +151,13 @@ test_that("train and predict softprob", {
   lb <- as.numeric(iris$Species) - 1
   set.seed(11)
   expect_output(
-    bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
-                   max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
-                   objective = "multi:softprob", num_class = 3, eval_metric = "merror")
-  , "train-merror")
+    bst <- xgboost(
+      data = as.matrix(iris[, -5]), label = lb,
+      max_depth = 3, eta = 0.5, nthread = n_threads, nrounds = 5,
+      objective = "multi:softprob", num_class = 3, eval_metric = "merror"
+    ),
+    "train-merror"
+  )
   expect_false(is.null(bst$evaluation_log))
   expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
   expect_equal(bst$niter * 3, xgb.ntree(bst))
@@ -164,9 +186,10 @@ test_that("train and predict softprob", {
     x3 = rnorm(100)
   )
   y <- sample.int(10, 100, replace = TRUE) - 1
-  dtrain <- xgb.DMatrix(data = d, info = list(label = y))
+  dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)
   booster <- xgb.train(
-    params = list(tree_method = "hist"), data = dtrain, nrounds = 4, num_class = 10,
+    params = list(tree_method = "hist", nthread = n_threads),
+    data = dtrain, nrounds = 4, num_class = 10,
     objective = "multi:softprob"
   )
   predt <- predict(booster, as.matrix(d), reshape = TRUE, strict_shape = FALSE)
@@ -178,10 +201,13 @@ test_that("train and predict softmax", {
   lb <- as.numeric(iris$Species) - 1
   set.seed(11)
   expect_output(
-    bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
-                   max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
-                   objective = "multi:softmax", num_class = 3, eval_metric = "merror")
-  , "train-merror")
+    bst <- xgboost(
+      data = as.matrix(iris[, -5]), label = lb,
+      max_depth = 3, eta = 0.5, nthread = n_threads, nrounds = 5,
+      objective = "multi:softmax", num_class = 3, eval_metric = "merror"
+    ),
+    "train-merror"
+  )
   expect_false(is.null(bst$evaluation_log))
   expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
   expect_equal(bst$niter * 3, xgb.ntree(bst))
@@ -196,16 +222,19 @@ test_that("train and predict RF", {
   set.seed(11)
   lb <- train$label
   # single iteration
-  bst <- xgboost(data = train$data, label = lb, max_depth = 5,
-                 nthread = 2, nrounds = 1, objective = "binary:logistic", eval_metric = "error",
-                 num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1)
+  bst <- xgboost(
+    data = train$data, label = lb, max_depth = 5,
+    nthread = n_threads,
+    nrounds = 1, objective = "binary:logistic", eval_metric = "error",
+    num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1
+  )
   expect_equal(bst$niter, 1)
   expect_equal(xgb.ntree(bst), 20)
 
   pred <- predict(bst, train$data)
   pred_err <- sum((pred > 0.5) != lb) / length(lb)
   expect_lt(abs(bst$evaluation_log[1, train_error] - pred_err), 10e-6)
-  #expect_lt(pred_err, 0.03)
+  # expect_lt(pred_err, 0.03)
 
   pred <- predict(bst, train$data, ntreelimit = 20)
   pred_err_20 <- sum((pred > 0.5) != lb) / length(lb)
@@ -219,11 +248,13 @@ test_that("train and predict RF with softprob", {
   lb <- as.numeric(iris$Species) - 1
   nrounds <- 15
   set.seed(11)
-  bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
-                 max_depth = 3, eta = 0.9, nthread = 2, nrounds = nrounds,
-                 objective = "multi:softprob", eval_metric = "merror",
-                 num_class = 3, verbose = 0,
-                 num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5)
+  bst <- xgboost(
+    data = as.matrix(iris[, -5]), label = lb,
+    max_depth = 3, eta = 0.9, nthread = n_threads, nrounds = nrounds,
+    objective = "multi:softprob", eval_metric = "merror",
+    num_class = 3, verbose = 0,
+    num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5
+  )
   expect_equal(bst$niter, 15)
   expect_equal(xgb.ntree(bst), 15 * 3 * 4)
   # predict for all iterations:
@@ -240,18 +271,24 @@ test_that("train and predict RF with softprob", {
 
 test_that("use of multiple eval metrics works", {
   expect_output(
-    bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-                   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic",
-                   eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss")
-  , "train-error.*train-auc.*train-logloss")
+    bst <- xgboost(
+      data = train$data, label = train$label, max_depth = 2,
+      eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+      eval_metric = "error", eval_metric = "auc", eval_metric = "logloss"
+    ),
+    "train-error.*train-auc.*train-logloss"
+  )
   expect_false(is.null(bst$evaluation_log))
   expect_equal(dim(bst$evaluation_log), c(2, 4))
   expect_equal(colnames(bst$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
   expect_output(
-    bst2 <- xgboost(data = train$data, label = train$label, max_depth = 2,
-                    eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic",
-                    eval_metric = list("error", "auc", "logloss"))
-  , "train-error.*train-auc.*train-logloss")
+    bst2 <- xgboost(
+      data = train$data, label = train$label, max_depth = 2,
+      eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+      eval_metric = list("error", "auc", "logloss")
+    ),
+    "train-error.*train-auc.*train-logloss"
+  )
   expect_false(is.null(bst2$evaluation_log))
   expect_equal(dim(bst2$evaluation_log), c(2, 4))
   expect_equal(colnames(bst2$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
@@ -259,9 +296,11 @@ test_that("use of multiple eval metrics works", {
 
 
 test_that("training continuation works", {
-  dtrain <- xgb.DMatrix(train$data, label = train$label)
+  dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads)
   watchlist <- list(train = dtrain)
-  param <- list(objective = "binary:logistic", max_depth = 2, eta = 1, nthread = 2)
+  param <- list(
+    objective = "binary:logistic", max_depth = 2, eta = 1, nthread = n_threads
+  )
 
   # for the reference, use 4 iterations at once:
   set.seed(11)
@@ -271,30 +310,33 @@ test_that("training continuation works", {
   bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
   # continue for two more:
   bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1)
-  if (!windows_flag && !solaris_flag)
+  if (!windows_flag && !solaris_flag) {
     expect_equal(bst$raw, bst2$raw)
+  }
   expect_false(is.null(bst2$evaluation_log))
   expect_equal(dim(bst2$evaluation_log), c(4, 2))
   expect_equal(bst2$evaluation_log, bst$evaluation_log)
   # test continuing from raw model data
   bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1$raw)
-  if (!windows_flag && !solaris_flag)
+  if (!windows_flag && !solaris_flag) {
     expect_equal(bst$raw, bst2$raw)
+  }
   expect_equal(dim(bst2$evaluation_log), c(2, 2))
   # test continuing from a model in file
   xgb.save(bst1, "xgboost.json")
   bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.json")
-  if (!windows_flag && !solaris_flag)
+  if (!windows_flag && !solaris_flag) {
     expect_equal(bst$raw, bst2$raw)
+  }
   expect_equal(dim(bst2$evaluation_log), c(2, 2))
   file.remove("xgboost.json")
 })
 
 test_that("model serialization works", {
   out_path <- "model_serialization"
-  dtrain <- xgb.DMatrix(train$data, label = train$label)
+  dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads)
   watchlist <- list(train = dtrain)
-  param <- list(objective = "binary:logistic")
+  param <- list(objective = "binary:logistic", nthread = n_threads)
   booster <- xgb.train(param, dtrain, nrounds = 4, watchlist)
   raw <- xgb.serialize(booster)
   saveRDS(raw, out_path)
@@ -309,11 +351,14 @@ test_that("model serialization works", {
 test_that("xgb.cv works", {
   set.seed(11)
   expect_output(
-    cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5,
-                 eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
-                 eval_metric = "error", verbose = TRUE)
-  , "train-error:")
-  expect_is(cv, 'xgb.cv.synchronous')
+    cv <- xgb.cv(
+      data = train$data, label = train$label, max_depth = 2, nfold = 5,
+      eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+      eval_metric = "error", verbose = TRUE
+    ),
+    "train-error:"
+  )
+  expect_is(cv, "xgb.cv.synchronous")
   expect_false(is.null(cv$evaluation_log))
   expect_lt(cv$evaluation_log[, min(test_error_mean)], 0.03)
   expect_lt(cv$evaluation_log[, min(test_error_std)], 0.008)
@@ -326,15 +371,19 @@ test_that("xgb.cv works", {
 })
 
 test_that("xgb.cv works with stratified folds", {
-  dtrain <- xgb.DMatrix(train$data, label = train$label)
+  dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads)
   set.seed(314159)
-  cv <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5,
-               eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
-               verbose = TRUE, stratified = FALSE)
+  cv <- xgb.cv(
+    data = dtrain, max_depth = 2, nfold = 5,
+    eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+    verbose = TRUE, stratified = FALSE
+  )
   set.seed(314159)
-  cv2 <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5,
-                eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
-                verbose = TRUE, stratified = TRUE)
+  cv2 <- xgb.cv(
+    data = dtrain, max_depth = 2, nfold = 5,
+    eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+    verbose = TRUE, stratified = TRUE
+  )
   # Stratified folds should result in a different evaluation logs
   expect_true(all(cv$evaluation_log[, test_logloss_mean] != cv2$evaluation_log[, test_logloss_mean]))
 })
@@ -342,40 +391,57 @@ test_that("xgb.cv works with stratified folds", {
 test_that("train and predict with non-strict classes", {
   # standard dense matrix input
   train_dense <- as.matrix(train$data)
-  bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
-                 eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
+  bst <- xgboost(
+    data = train_dense, label = train$label, max_depth = 2,
+    eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+    verbose = 0
+  )
   pr0 <- predict(bst, train_dense)
 
   # dense matrix-like input of non-matrix class
-  class(train_dense) <- 'shmatrix'
+  class(train_dense) <- "shmatrix"
   expect_true(is.matrix(train_dense))
   expect_error(
-    bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
-                   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
-    , regexp = NA)
+    bst <- xgboost(
+      data = train_dense, label = train$label, max_depth = 2,
+      eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+      verbose = 0
+    ),
+    regexp = NA
+  )
   expect_error(pr <- predict(bst, train_dense), regexp = NA)
   expect_equal(pr0, pr)
 
   # dense matrix-like input of non-matrix class with some inheritance
-  class(train_dense) <- c('pphmatrix', 'shmatrix')
+  class(train_dense) <- c("pphmatrix", "shmatrix")
   expect_true(is.matrix(train_dense))
   expect_error(
-    bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
-                   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
-    , regexp = NA)
+    bst <- xgboost(
+      data = train_dense, label = train$label, max_depth = 2,
+      eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+      verbose = 0
+    ),
+    regexp = NA
+  )
   expect_error(pr <- predict(bst, train_dense), regexp = NA)
   expect_equal(pr0, pr)
 
   # when someone inherits from xgb.Booster, it should still be possible to use it as xgb.Booster
-  class(bst) <- c('super.Booster', 'xgb.Booster')
+  class(bst) <- c("super.Booster", "xgb.Booster")
   expect_error(pr <- predict(bst, train_dense), regexp = NA)
   expect_equal(pr0, pr)
 })
 
 test_that("max_delta_step works", {
-  dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+  dtrain <- xgb.DMatrix(
+    agaricus.train$data, label = agaricus.train$label, nthread = n_threads
+  )
   watchlist <- list(train = dtrain)
-  param <- list(objective = "binary:logistic", eval_metric = "logloss", max_depth = 2, nthread = 2, eta = 0.5)
+  param <- list(
+    objective = "binary:logistic", eval_metric = "logloss", max_depth = 2,
+    nthread = n_threads,
+    eta = 0.5
+  )
   nrounds <- 5
   # model with no restriction on max_delta_step
   bst1 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1)
@@ -395,14 +461,16 @@ test_that("colsample_bytree works", {
   test_y <- as.numeric(rowSums(test_x) > 0)
   colnames(train_x) <- paste0("Feature_", sprintf("%03d", 1:100))
   colnames(test_x) <- paste0("Feature_", sprintf("%03d", 1:100))
-  dtrain <- xgb.DMatrix(train_x, label = train_y)
-  dtest <- xgb.DMatrix(test_x, label = test_y)
+  dtrain <- xgb.DMatrix(train_x, label = train_y, nthread = n_threads)
+  dtest <- xgb.DMatrix(test_x, label = test_y, nthread = n_threads)
   watchlist <- list(train = dtrain, eval = dtest)
   ## Use colsample_bytree = 0.01, so that roughly one out of 100 features is chosen for
   ## each tree
-  param <- list(max_depth = 2, eta = 0, nthread = 2,
-                colsample_bytree = 0.01, objective = "binary:logistic",
-                eval_metric = "auc")
+  param <- list(
+    max_depth = 2, eta = 0, nthread = n_threads,
+    colsample_bytree = 0.01, objective = "binary:logistic",
+    eval_metric = "auc"
+  )
   set.seed(2)
   bst <- xgb.train(param, dtrain, nrounds = 100, watchlist, verbose = 0)
   xgb.importance(model = bst)
@@ -412,9 +480,11 @@ test_that("colsample_bytree works", {
 })
 
 test_that("Configuration works", {
-  bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-                 eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic",
-                 eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss")
+  bst <- xgboost(
+    data = train$data, label = train$label, max_depth = 2,
+    eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+    eval_metric = "error", eval_metric = "auc", eval_metric = "logloss"
+  )
   config <- xgb.config(bst)
   xgb.config(bst) <- config
   reloaded_config <- xgb.config(bst)
@@ -451,22 +521,26 @@ test_that("strict_shape works", {
     y <- as.numeric(iris$Species) - 1
     X <- as.matrix(iris[, -5])
 
-    bst <- xgboost(data = X, label = y,
-                   max_depth = 2, nrounds = n_rounds,
-                   objective = "multi:softprob", num_class = 3, eval_metric = "merror")
+    bst <- xgboost(
+      data = X, label = y,
+      max_depth = 2, nrounds = n_rounds, nthread = n_threads,
+      objective = "multi:softprob", num_class = 3, eval_metric = "merror"
+    )
 
     test_strict_shape(bst, X, 3)
   }
 
 
   test_agaricus <- function() {
-    data(agaricus.train, package = 'xgboost')
+    data(agaricus.train, package = "xgboost")
     X <- agaricus.train$data
     y <- agaricus.train$label
 
-    bst <- xgboost(data = X, label = y, max_depth = 2,
-                   nrounds = n_rounds, objective = "binary:logistic",
-                   eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss")
+    bst <- xgboost(
+      data = X, label = y, max_depth = 2, nthread = n_threads,
+      nrounds = n_rounds, objective = "binary:logistic",
+      eval_metric = "error", eval_metric = "auc", eval_metric = "logloss"
+    )
 
     test_strict_shape(bst, X, 1)
   }
@@ -481,8 +555,10 @@ test_that("'predict' accepts CSR data", {
   x_csc <- as(X[1L, , drop = FALSE], "CsparseMatrix")
   x_csr <- as(x_csc, "RsparseMatrix")
   x_spv <- as(x_csc, "sparseVector")
-  bst <- xgboost(data = X, label = y, objective = "binary:logistic",
-                 nrounds = 5L, verbose = FALSE)
+  bst <- xgboost(
+    data = X, label = y, objective = "binary:logistic",
+    nrounds = 5L, verbose = FALSE, nthread = n_threads
+  )
   p_csc <- predict(bst, x_csc)
   p_csr <- predict(bst, x_csr)
   p_spv <- predict(bst, x_spv)
diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R
index e6fe14c6b..b5d3c5310 100644
--- a/R-package/tests/testthat/test_callbacks.R
+++ b/R-package/tests/testthat/test_callbacks.R
@@ -6,6 +6,8 @@ data(agaricus.test, package = 'xgboost')
 train <- agaricus.train
 test <- agaricus.test
 
+n_threads <- 2
+
 # add some label noise for early stopping tests
 add.noise <- function(label, frac) {
   inoise <- sample(length(label), length(label) * frac)
@@ -15,15 +17,15 @@ add.noise <- function(label, frac) {
 set.seed(11)
 ltrain <- add.noise(train$label, 0.2)
 ltest <- add.noise(test$label, 0.2)
-dtrain <- xgb.DMatrix(train$data, label = ltrain)
-dtest <- xgb.DMatrix(test$data, label = ltest)
+dtrain <- xgb.DMatrix(train$data, label = ltrain, nthread = n_threads)
+dtest <- xgb.DMatrix(test$data, label = ltest, nthread = n_threads)
 watchlist <- list(train = dtrain, test = dtest)
 
 
 err <- function(label, pr) sum((pr > 0.5) != label) / length(label)
 
 param <- list(objective = "binary:logistic", eval_metric = "error",
-              max_depth = 2, nthread = 2)
+              max_depth = 2, nthread = n_threads)
 
 
 test_that("cb.print.evaluation works as expected", {
@@ -103,7 +105,7 @@ test_that("cb.evaluation.log works as expected", {
 
 
 param <- list(objective = "binary:logistic", eval_metric = "error",
-              max_depth = 4, nthread = 2)
+              max_depth = 4, nthread = n_threads)
 
 test_that("can store evaluation_log without printing", {
   expect_silent(
@@ -179,8 +181,10 @@ test_that("cb.save.model works as expected", {
   expect_true(file.exists('xgboost_01.json'))
   expect_true(file.exists('xgboost_02.json'))
   b1 <- xgb.load('xgboost_01.json')
+  xgb.parameters(b1) <- list(nthread = n_threads)
   expect_equal(xgb.ntree(b1), 1)
   b2 <- xgb.load('xgboost_02.json')
+  xgb.parameters(b2) <- list(nthread = n_threads)
   expect_equal(xgb.ntree(b2), 2)
 
   xgb.config(b2) <- xgb.config(bst)
@@ -267,7 +271,8 @@ test_that("early stopping works with titanic", {
     objective = "binary:logistic",
     eval_metric = "auc",
     nrounds = 100,
-    early_stopping_rounds = 3
+    early_stopping_rounds = 3,
+    nthread = n_threads
   )
 
   expect_true(TRUE)  # should not crash
@@ -308,7 +313,7 @@ test_that("prediction in xgb.cv works", {
 
 test_that("prediction in xgb.cv works for gblinear too", {
   set.seed(11)
-  p <- list(booster = 'gblinear', objective = "reg:logistic", nthread = 2)
+  p <- list(booster = 'gblinear', objective = "reg:logistic", nthread = n_threads)
   cv <- xgb.cv(p, dtrain, nfold = 5, eta = 0.5, nrounds = 2, prediction = TRUE, verbose = 0)
   expect_false(is.null(cv$evaluation_log))
   expect_false(is.null(cv$pred))
@@ -341,7 +346,7 @@ test_that("prediction in xgb.cv for softprob works", {
   set.seed(11)
   expect_warning(
     cv <- xgb.cv(data = as.matrix(iris[, -5]), label = lb, nfold = 4,
-                 eta = 0.5, nrounds = 5, max_depth = 3, nthread = 2,
+                 eta = 0.5, nrounds = 5, max_depth = 3, nthread = n_threads,
                  subsample = 0.8, gamma = 2, verbose = 0,
                  prediction = TRUE, objective = "multi:softprob", num_class = 3)
   , NA)
diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R
index 42f43cede..ff8eb1d6d 100644
--- a/R-package/tests/testthat/test_custom_objective.R
+++ b/R-package/tests/testthat/test_custom_objective.R
@@ -2,10 +2,16 @@ context('Test models with custom objective')
 
 set.seed(1994)
 
+n_threads <- 2
+
 data(agaricus.train, package = 'xgboost')
 data(agaricus.test, package = 'xgboost')
-dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+dtrain <- xgb.DMatrix(
+  agaricus.train$data, label = agaricus.train$label, nthread = n_threads
+)
+dtest <- xgb.DMatrix(
+  agaricus.test$data, label = agaricus.test$label, nthread = n_threads
+)
 watchlist <- list(eval = dtest, train = dtrain)
 
 logregobj <- function(preds, dtrain) {
@@ -22,7 +28,7 @@ evalerror <- function(preds, dtrain) {
   return(list(metric = "error", value = err))
 }
 
-param <- list(max_depth = 2, eta = 1, nthread = 2,
+param <- list(max_depth = 2, eta = 1, nthread = n_threads,
               objective = logregobj, eval_metric = evalerror)
 num_round <- 2
 
@@ -67,7 +73,7 @@ test_that("custom objective using DMatrix attr works", {
 test_that("custom objective with multi-class shape", {
   data <- as.matrix(iris[, -5])
   label <-  as.numeric(iris$Species) - 1
-  dtrain <- xgb.DMatrix(data = data, label = label)
+  dtrain <- xgb.DMatrix(data = data, label = label, nthread = n_threads)
   n_classes <- 3
 
   fake_softprob <- function(preds, dtrain) {
diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R
index 8d74a0357..461b7d158 100644
--- a/R-package/tests/testthat/test_dmatrix.R
+++ b/R-package/tests/testthat/test_dmatrix.R
@@ -5,19 +5,21 @@ data(agaricus.test, package = "xgboost")
 test_data <- agaricus.test$data[1:100, ]
 test_label <- agaricus.test$label[1:100]
 
+n_threads <- 2
+
 test_that("xgb.DMatrix: basic construction", {
   # from sparse matrix
-  dtest1 <- xgb.DMatrix(test_data, label = test_label)
+  dtest1 <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)
 
   # from dense matrix
-  dtest2 <- xgb.DMatrix(as.matrix(test_data), label = test_label)
+  dtest2 <- xgb.DMatrix(as.matrix(test_data), label = test_label, nthread = n_threads)
   expect_equal(getinfo(dtest1, "label"), getinfo(dtest2, "label"))
   expect_equal(dim(dtest1), dim(dtest2))
 
   # from dense integer matrix
   int_data <- as.matrix(test_data)
   storage.mode(int_data) <- "integer"
-  dtest3 <- xgb.DMatrix(int_data, label = test_label)
+  dtest3 <- xgb.DMatrix(int_data, label = test_label, nthread = n_threads)
   expect_equal(dim(dtest1), dim(dtest3))
 
   n_samples <- 100
@@ -29,15 +31,15 @@ test_that("xgb.DMatrix: basic construction", {
   X <- matrix(X, nrow = n_samples)
   y <- rbinom(n = n_samples, size = 1, prob = 1 / 2)
 
-  fd <- xgb.DMatrix(X, label = y, missing = 1)
+  fd <- xgb.DMatrix(X, label = y, missing = 1, nthread = n_threads)
 
   dgc <- as(X, "dgCMatrix")
-  fdgc <- xgb.DMatrix(dgc, label = y, missing = 1.0)
+  fdgc <- xgb.DMatrix(dgc, label = y, missing = 1.0, nthread = n_threads)
 
   dgr <- as(X, "dgRMatrix")
-  fdgr <- xgb.DMatrix(dgr, label = y, missing = 1)
+  fdgr <- xgb.DMatrix(dgr, label = y, missing = 1, nthread = n_threads)
 
-  params <- list(tree_method = "hist")
+  params <- list(tree_method = "hist", nthread = n_threads)
   bst_fd <- xgb.train(
     params, nrounds = 8, fd, watchlist = list(train = fd)
   )
@@ -64,12 +66,12 @@ test_that("xgb.DMatrix: NA", {
   )
   x[1, "x1"] <- NA
 
-  m <- xgb.DMatrix(x)
+  m <- xgb.DMatrix(x, nthread = n_threads)
   xgb.DMatrix.save(m, "int.dmatrix")
 
   x <- matrix(as.numeric(x), nrow = n_samples, ncol = 2)
   colnames(x) <- c("x1", "x2")
-  m <- xgb.DMatrix(x)
+  m <- xgb.DMatrix(x, nthread = n_threads)
 
   xgb.DMatrix.save(m, "float.dmatrix")
 
@@ -94,7 +96,7 @@ test_that("xgb.DMatrix: NA", {
 
 test_that("xgb.DMatrix: saving, loading", {
   # save to a local file
-  dtest1 <- xgb.DMatrix(test_data, label = test_label)
+  dtest1 <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)
   tmp_file <- tempfile('xgb.DMatrix_')
   on.exit(unlink(tmp_file))
   expect_true(xgb.DMatrix.save(dtest1, tmp_file))
@@ -109,13 +111,17 @@ test_that("xgb.DMatrix: saving, loading", {
   tmp_file <- tempfile(fileext = ".libsvm")
   writeLines(tmp, tmp_file)
   expect_true(file.exists(tmp_file))
-  dtest4 <- xgb.DMatrix(paste(tmp_file, "?format=libsvm", sep = ""), silent = TRUE)
+  dtest4 <- xgb.DMatrix(
+    paste(tmp_file, "?format=libsvm", sep = ""), silent = TRUE, nthread = n_threads
+  )
   expect_equal(dim(dtest4), c(3, 4))
   expect_equal(getinfo(dtest4, 'label'), c(0, 1, 0))
 
   # check that feature info is saved
   data(agaricus.train, package = 'xgboost')
-  dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
+  dtrain <- xgb.DMatrix(
+    data = agaricus.train$data, label = agaricus.train$label, nthread = n_threads
+  )
   cnames <- colnames(dtrain)
   expect_equal(length(cnames), 126)
   tmp_file <- tempfile('xgb.DMatrix_')
@@ -129,7 +135,7 @@ test_that("xgb.DMatrix: saving, loading", {
 })
 
 test_that("xgb.DMatrix: getinfo & setinfo", {
-  dtest <- xgb.DMatrix(test_data)
+  dtest <- xgb.DMatrix(test_data, nthread = n_threads)
   expect_true(setinfo(dtest, 'label', test_label))
   labels <- getinfo(dtest, 'label')
   expect_equal(test_label, getinfo(dtest, 'label'))
@@ -156,7 +162,7 @@ test_that("xgb.DMatrix: getinfo & setinfo", {
 })
 
 test_that("xgb.DMatrix: slice, dim", {
-  dtest <- xgb.DMatrix(test_data, label = test_label)
+  dtest <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)
   expect_equal(dim(dtest), dim(test_data))
   dsub1 <- slice(dtest, 1:42)
   expect_equal(nrow(dsub1), 42)
@@ -171,16 +177,20 @@ test_that("xgb.DMatrix: slice, trailing empty rows", {
   data(agaricus.train, package = 'xgboost')
   train_data <- agaricus.train$data
   train_label <- agaricus.train$label
-  dtrain <- xgb.DMatrix(data = train_data, label = train_label)
+  dtrain <- xgb.DMatrix(
+    data = train_data, label = train_label, nthread = n_threads
+  )
   slice(dtrain, 6513L)
   train_data[6513, ] <- 0
-  dtrain <- xgb.DMatrix(data = train_data, label = train_label)
+  dtrain <- xgb.DMatrix(
+    data = train_data, label = train_label, nthread = n_threads
+  )
   slice(dtrain, 6513L)
   expect_equal(nrow(dtrain), 6513)
 })
 
 test_that("xgb.DMatrix: colnames", {
-  dtest <- xgb.DMatrix(test_data, label = test_label)
+  dtest <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)
   expect_equal(colnames(dtest), colnames(test_data))
   expect_error(colnames(dtest) <- 'asdf')
   new_names <- make.names(seq_len(ncol(test_data)))
@@ -196,7 +206,7 @@ test_that("xgb.DMatrix: nrow is correct for a very sparse matrix", {
   x <- Matrix::rsparsematrix(nr, 100, density = 0.0005)
   # we want it very sparse, so that last rows are empty
   expect_lt(max(x@i), nr)
-  dtest <- xgb.DMatrix(x)
+  dtest <- xgb.DMatrix(x, nthread = n_threads)
   expect_equal(dim(dtest), dim(x))
 })
 
@@ -205,8 +215,8 @@ test_that("xgb.DMatrix: print", {
 
     # core DMatrix with just data and labels
     dtrain <- xgb.DMatrix(
-        data = agaricus.train$data
-        , label = agaricus.train$label
+      data = agaricus.train$data, label = agaricus.train$label,
+      nthread = n_threads
     )
     txt <- capture.output({
         print(dtrain)
@@ -222,10 +232,11 @@ test_that("xgb.DMatrix: print", {
 
     # DMatrix with weights and base_margin
     dtrain <- xgb.DMatrix(
-        data = agaricus.train$data
-        , label = agaricus.train$label
-        , weight = seq_along(agaricus.train$label)
-        , base_margin = agaricus.train$label
+      data = agaricus.train$data,
+      label = agaricus.train$label,
+      weight = seq_along(agaricus.train$label),
+      base_margin = agaricus.train$label,
+      nthread = n_threads
     )
     txt <- capture.output({
         print(dtrain)
@@ -234,7 +245,8 @@ test_that("xgb.DMatrix: print", {
 
     # DMatrix with just features
     dtrain <- xgb.DMatrix(
-        data = agaricus.train$data
+      data = agaricus.train$data,
+      nthread = n_threads
     )
     txt <- capture.output({
         print(dtrain)
@@ -245,7 +257,8 @@ test_that("xgb.DMatrix: print", {
     data_no_colnames <- agaricus.train$data
     colnames(data_no_colnames) <- NULL
     dtrain <- xgb.DMatrix(
-        data = data_no_colnames
+      data = data_no_colnames,
+      nthread = n_threads
     )
     txt <- capture.output({
         print(dtrain)
diff --git a/R-package/tests/testthat/test_feature_weights.R b/R-package/tests/testthat/test_feature_weights.R
index bb3802979..4ed78c9b6 100644
--- a/R-package/tests/testthat/test_feature_weights.R
+++ b/R-package/tests/testthat/test_feature_weights.R
@@ -1,5 +1,7 @@
 context("feature weights")
 
+n_threads <- 2
+
 test_that("training with feature weights works", {
   nrows <- 1000
   ncols <- 9
@@ -10,8 +12,12 @@ test_that("training with feature weights works", {
 
   test <- function(tm) {
     names <- paste0("f", 1:ncols)
-    xy <- xgb.DMatrix(data = x, label = y, feature_weights = weights)
-    params <- list(colsample_bynode = 0.4, tree_method = tm, nthread = 1)
+    xy <- xgb.DMatrix(
+      data = x, label = y, feature_weights = weights, nthread = n_threads
+    )
+    params <- list(
+      colsample_bynode = 0.4, tree_method = tm, nthread = n_threads
+    )
     model <- xgb.train(params = params, data = xy, nrounds = 32)
     importance <- xgb.importance(model = model, feature_names = names)
     expect_equal(dim(importance), c(ncols, 4))
diff --git a/R-package/tests/testthat/test_glm.R b/R-package/tests/testthat/test_glm.R
index 2d050945a..9e0a3551f 100644
--- a/R-package/tests/testthat/test_glm.R
+++ b/R-package/tests/testthat/test_glm.R
@@ -1,13 +1,19 @@
 context('Test generalized linear models')
 
+n_threads <- 2
+
 test_that("gblinear works", {
   data(agaricus.train, package = 'xgboost')
   data(agaricus.test, package = 'xgboost')
-  dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-  dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+  dtrain <- xgb.DMatrix(
+    agaricus.train$data, label = agaricus.train$label, nthread = n_threads
+  )
+  dtest <- xgb.DMatrix(
+    agaricus.test$data, label = agaricus.test$label, nthread = n_threads
+  )
 
   param <- list(objective = "binary:logistic", eval_metric = "error", booster = "gblinear",
-                nthread = 2, eta = 0.8, alpha = 0.0001, lambda = 0.0001)
+                nthread = n_threads, eta = 0.8, alpha = 0.0001, lambda = 0.0001)
   watchlist <- list(eval = dtest, train = dtrain)
 
   n <- 5         # iterations
@@ -48,12 +54,16 @@ test_that("gblinear works", {
 test_that("gblinear early stopping works", {
   data(agaricus.train, package = 'xgboost')
   data(agaricus.test, package = 'xgboost')
-  dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-  dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+  dtrain <- xgb.DMatrix(
+    agaricus.train$data, label = agaricus.train$label, nthread = n_threads
+  )
+  dtest <- xgb.DMatrix(
+    agaricus.test$data, label = agaricus.test$label, nthread = n_threads
+  )
 
   param <- list(
     objective = "binary:logistic", eval_metric = "error", booster = "gblinear",
-    nthread = 2, eta = 0.8, alpha = 0.0001, lambda = 0.0001,
+    nthread = n_threads, eta = 0.8, alpha = 0.0001, lambda = 0.0001,
     updater = "coord_descent"
   )
 
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index f00ac0881..04e034ce1 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -171,6 +171,7 @@ test_that("SHAPs sum to predictions, with or without DART", {
     fit <- xgboost(
       params = c(
         list(
+          nthread = 2,
           booster = booster,
           objective = "reg:squarederror",
           eval_metric = "rmse"),
@@ -257,7 +258,7 @@ test_that("xgb.Booster serializing as R object works", {
   .skip_if_vcd_not_available()
   saveRDS(bst.Tree, 'xgb.model.rds')
   bst <- readRDS('xgb.model.rds')
-  dtrain <- xgb.DMatrix(sparse_matrix, label = label)
+  dtrain <- xgb.DMatrix(sparse_matrix, label = label, nthread = 2)
   expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance)
   expect_equal(xgb.dump(bst.Tree), xgb.dump(bst))
   xgb.save(bst, 'xgb.model')
@@ -363,7 +364,8 @@ test_that("xgb.importance works with and without feature names", {
     data = as.matrix(data.frame(x = c(0, 1))),
     label = c(1, 2),
     nrounds = 1,
-    base_score = 0.5
+    base_score = 0.5,
+    nthread = 2
   )
   df <- xgb.model.dt.tree(model = m)
   expect_equal(df$Feature, "Leaf")
diff --git a/R-package/tests/testthat/test_interaction_constraints.R b/R-package/tests/testthat/test_interaction_constraints.R
index 680709638..1ac804501 100644
--- a/R-package/tests/testthat/test_interaction_constraints.R
+++ b/R-package/tests/testthat/test_interaction_constraints.R
@@ -2,6 +2,8 @@ require(xgboost)
 
 context("interaction constraints")
 
+n_threads <- 2
+
 set.seed(1024)
 x1 <- rnorm(1000, 1)
 x2 <- rnorm(1000, 1)
@@ -45,11 +47,18 @@ test_that("interaction constraints scientific representation", {
   d <- matrix(rexp(rows, rate = .1), nrow = rows, ncol = cols)
   y <- rnorm(rows)
 
-  dtrain <- xgb.DMatrix(data = d, info = list(label = y))
+  dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)
   inc <- list(c(seq.int(from = 0, to = cols, by = 1)))
 
-  with_inc <- xgb.train(data = dtrain, tree_method = 'hist',
-                        interaction_constraints = inc, nrounds = 10)
-  without_inc <- xgb.train(data = dtrain, tree_method = 'hist', nrounds = 10)
+  with_inc <- xgb.train(
+    data = dtrain,
+    tree_method = 'hist',
+    interaction_constraints = inc,
+    nrounds = 10,
+    nthread = n_threads
+  )
+  without_inc <- xgb.train(
+    data = dtrain, tree_method = 'hist', nrounds = 10, nthread = n_threads
+  )
   expect_equal(xgb.save.raw(with_inc), xgb.save.raw(without_inc))
 })
diff --git a/R-package/tests/testthat/test_interactions.R b/R-package/tests/testthat/test_interactions.R
index a658fc81f..398531e0e 100644
--- a/R-package/tests/testthat/test_interactions.R
+++ b/R-package/tests/testthat/test_interactions.R
@@ -1,6 +1,7 @@
 context('Test prediction of feature interactions')
 
 set.seed(123)
+n_threads <- 2
 
 test_that("predict feature interactions works", {
   # simulate some binary data and a linear outcome with an interaction term
@@ -19,8 +20,10 @@ test_that("predict feature interactions works", {
 
   y <- f_int(X)
 
-  dm <- xgb.DMatrix(X, label = y)
-  param <- list(eta = 0.1, max_depth = 4, base_score = mean(y), lambda = 0, nthread = 2)
+  dm <- xgb.DMatrix(X, label = y, nthread = n_threads)
+  param <- list(
+    eta = 0.1, max_depth = 4, base_score = mean(y), lambda = 0, nthread = n_threads
+  )
   b <- xgb.train(param, dm, 100)
 
   pred <- predict(b, dm, outputmargin = TRUE)
@@ -99,11 +102,13 @@ test_that("SHAP contribution values are not NAN", {
     verbose = 0,
     params = list(
       objective = "reg:squarederror",
-      eval_metric = "rmse"),
+      eval_metric = "rmse",
+      nthread = n_threads
+    ),
     data = as.matrix(subset(d, fold == 2)[, ivs]),
     label = subset(d, fold == 2)$y,
-    nthread = 1,
-    nrounds = 3)
+    nrounds = 3
+  )
 
   shaps <- as.data.frame(predict(fit,
     newdata = as.matrix(subset(d, fold == 1)[, ivs]),
@@ -116,8 +121,12 @@ test_that("SHAP contribution values are not NAN", {
 
 
 test_that("multiclass feature interactions work", {
-  dm <- xgb.DMatrix(as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1)
-  param <- list(eta = 0.1, max_depth = 4, objective = 'multi:softprob', num_class = 3)
+  dm <- xgb.DMatrix(
+    as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1, nthread = n_threads
+  )
+  param <- list(
+    eta = 0.1, max_depth = 4, objective = 'multi:softprob', num_class = 3, nthread = n_threads
+  )
   b <- xgb.train(param, dm, 40)
   pred <- t(
     array(
@@ -166,6 +175,7 @@ test_that("SHAP single sample works", {
     max_depth = 2,
     nrounds = 4,
     objective = "binary:logistic",
+    nthread = n_threads
   )
 
   predt <- predict(
diff --git a/R-package/tests/testthat/test_io.R b/R-package/tests/testthat/test_io.R
index c2cb1a1a8..8cf5a9ae9 100644
--- a/R-package/tests/testthat/test_io.R
+++ b/R-package/tests/testthat/test_io.R
@@ -9,7 +9,8 @@ test_that("load/save raw works", {
   nrounds <- 8
   booster <- xgboost(
     data = train$data, label = train$label,
-    nrounds = nrounds, objective = "binary:logistic"
+    nrounds = nrounds, objective = "binary:logistic",
+    nthread = 2
   )
 
   json_bytes <- xgb.save.raw(booster, raw_format = "json")
diff --git a/R-package/tests/testthat/test_model_compatibility.R b/R-package/tests/testthat/test_model_compatibility.R
index 2f4992c06..87380e85d 100644
--- a/R-package/tests/testthat/test_model_compatibility.R
+++ b/R-package/tests/testthat/test_model_compatibility.R
@@ -66,7 +66,7 @@ test_that("Models from previous versions of XGBoost can be loaded", {
   unzip(zipfile, exdir = extract_dir, overwrite = TRUE)
   model_dir <- file.path(extract_dir, 'models')
 
-  pred_data <- xgb.DMatrix(matrix(c(0, 0, 0, 0), nrow = 1, ncol = 4))
+  pred_data <- xgb.DMatrix(matrix(c(0, 0, 0, 0), nrow = 1, ncol = 4), nthread = 2)
 
   lapply(list.files(model_dir), function (x) {
     model_file <- file.path(model_dir, x)
@@ -87,6 +87,7 @@ test_that("Models from previous versions of XGBoost can be loaded", {
         booster <- readRDS(model_file)
       } else {
         booster <- xgb.load(model_file)
+        xgb.parameters(booster) <- list(nthread = 2)
       }
       predict(booster, newdata = pred_data)
       run_booster_check(booster, name)
diff --git a/R-package/tests/testthat/test_parameter_exposure.R b/R-package/tests/testthat/test_parameter_exposure.R
index 47524fbfc..ea71ca7b7 100644
--- a/R-package/tests/testthat/test_parameter_exposure.R
+++ b/R-package/tests/testthat/test_parameter_exposure.R
@@ -3,8 +3,12 @@ context('Test model params and call are exposed to R')
 data(agaricus.train, package = 'xgboost')
 data(agaricus.test, package = 'xgboost')
 
-dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+dtrain <- xgb.DMatrix(
+  agaricus.train$data, label = agaricus.train$label, nthread = 2
+)
+dtest <- xgb.DMatrix(
+  agaricus.test$data, label = agaricus.test$label, nthread = 2
+)
 
 bst <- xgboost(data = dtrain,
                max_depth = 2,
diff --git a/R-package/tests/testthat/test_poisson_regression.R b/R-package/tests/testthat/test_poisson_regression.R
index b17c6c072..55918b57a 100644
--- a/R-package/tests/testthat/test_poisson_regression.R
+++ b/R-package/tests/testthat/test_poisson_regression.R
@@ -4,8 +4,10 @@ set.seed(1994)
 
 test_that("Poisson regression works", {
   data(mtcars)
-  bst <- xgboost(data = as.matrix(mtcars[, -11]), label = mtcars[, 11],
-                objective = 'count:poisson', nrounds = 10, verbose = 0)
+  bst <- xgboost(
+    data = as.matrix(mtcars[, -11]), label = mtcars[, 11],
+    objective = 'count:poisson', nrounds = 10, verbose = 0, nthread = 2
+  )
   expect_equal(class(bst), "xgb.Booster")
   pred <- predict(bst, as.matrix(mtcars[, -11]))
   expect_equal(length(pred), 32)
diff --git a/R-package/tests/testthat/test_ranking.R b/R-package/tests/testthat/test_ranking.R
index 9e8d0156e..b43292e18 100644
--- a/R-package/tests/testthat/test_ranking.R
+++ b/R-package/tests/testthat/test_ranking.R
@@ -1,5 +1,7 @@
 context('Learning to rank')
 
+n_threads <- 2
+
 test_that('Test ranking with unweighted data', {
   X <- Matrix::sparseMatrix(
     i = c(2, 3, 7, 9, 12, 15, 17, 18)
@@ -9,10 +11,10 @@ test_that('Test ranking with unweighted data', {
   )
   y <- c(0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0)
   group <- c(5, 5, 5, 5)
-  dtrain <- xgb.DMatrix(X, label = y, group = group)
+  dtrain <- xgb.DMatrix(X, label = y, group = group, nthread = n_threads)
 
   params <- list(eta = 1, tree_method = 'exact', objective = 'rank:pairwise', max_depth = 1,
-                 eval_metric = 'auc', eval_metric = 'aucpr')
+                 eval_metric = 'auc', eval_metric = 'aucpr', nthread = n_threads)
   bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))
   # Check if the metric is monotone increasing
   expect_true(all(diff(bst$evaluation_log$train_auc) >= 0))
@@ -29,10 +31,14 @@ test_that('Test ranking with weighted data', {
   y <- c(0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0)
   group <- c(5, 5, 5, 5)
   weight <- c(1.0, 2.0, 3.0, 4.0)
-  dtrain <- xgb.DMatrix(X, label = y, group = group, weight = weight)
+  dtrain <- xgb.DMatrix(
+    X, label = y, group = group, weight = weight, nthread = n_threads
+  )
 
-  params <- list(eta = 1, tree_method = 'exact', objective = 'rank:pairwise', max_depth = 1,
-                 eval_metric = 'auc', eval_metric = 'aucpr')
+  params <- list(
+    eta = 1, tree_method = "exact", objective = "rank:pairwise", max_depth = 1,
+    eval_metric = "auc", eval_metric = "aucpr", nthread = n_threads
+  )
   bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))
   # Check if the metric is monotone increasing
   expect_true(all(diff(bst$evaluation_log$train_auc) >= 0))
diff --git a/R-package/tests/testthat/test_unicode.R b/R-package/tests/testthat/test_unicode.R
index cac544ef9..c8a225716 100644
--- a/R-package/tests/testthat/test_unicode.R
+++ b/R-package/tests/testthat/test_unicode.R
@@ -16,6 +16,7 @@ test_that("Can save and load models with Unicode paths", {
     path <- file.path(tmpdir, x)
     xgb.save(bst, path)
     bst2 <- xgb.load(path)
+    xgb.parameters(bst2) <- list(nthread = 2)
     expect_equal(predict(bst, test$data), predict(bst2, test$data))
   })
 })
diff --git a/R-package/tests/testthat/test_update.R b/R-package/tests/testthat/test_update.R
index c961bab1a..cf8b6f007 100644
--- a/R-package/tests/testthat/test_update.R
+++ b/R-package/tests/testthat/test_update.R
@@ -2,8 +2,15 @@ context("update trees in an existing model")
 
 data(agaricus.train, package = 'xgboost')
 data(agaricus.test, package = 'xgboost')
-dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+
+n_threads <- 1
+
+dtrain <- xgb.DMatrix(
+  agaricus.train$data, label = agaricus.train$label, nthread = n_threads
+)
+dtest <- xgb.DMatrix(
+  agaricus.test$data, label = agaricus.test$label, nthread = n_threads
+)
 
 # Disable flaky tests for 32-bit Windows.
 # See https://github.com/dmlc/xgboost/issues/3720
@@ -14,7 +21,7 @@ test_that("updating the model works", {
 
   # no-subsampling
   p1 <- list(
-    objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2,
+    objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = n_threads,
     updater = "grow_colmaker,prune"
   )
   set.seed(11)
@@ -86,9 +93,11 @@ test_that("updating the model works", {
 })
 
 test_that("updating works for multiclass & multitree", {
-  dtr <- xgb.DMatrix(as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1)
+  dtr <- xgb.DMatrix(
+    as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1, nthread = n_threads
+  )
   watchlist <- list(train = dtr)
-  p0 <- list(max_depth = 2, eta = 0.5, nthread = 2, subsample = 0.6,
+  p0 <- list(max_depth = 2, eta = 0.5, nthread = n_threads, subsample = 0.6,
              objective = "multi:softprob", num_class = 3, num_parallel_tree = 2,
              base_score = 0)
   set.seed(121)
diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd
index 8b9e2e2e3..4b04f771f 100644
--- a/R-package/vignettes/discoverYourData.Rmd
+++ b/R-package/vignettes/discoverYourData.Rmd
@@ -31,6 +31,8 @@ require(data.table)
 if (!require('vcd')) {
   install.packages('vcd')
 }
+
+data.table::setDTthreads(2)
 ```
 
 > **VCD** package is used for one of its embedded dataset only.
@@ -297,23 +299,25 @@ test <- agaricus.test
 
 #Random Forest - 1000 trees
 bst <- xgboost(
-    data = train$data
-    , label = train$label
-    , max_depth = 4
-    , num_parallel_tree = 1000
-    , subsample = 0.5
-    , colsample_bytree = 0.5
-    , nrounds = 1
-    , objective = "binary:logistic"
+    data = train$data,
+    label = train$label,
+    max_depth = 4,
+    num_parallel_tree = 1000,
+    subsample = 0.5,
+    colsample_bytree = 0.5,
+    nrounds = 1,
+    objective = "binary:logistic",
+    nthread = 2
 )
 
 #Boosting - 3 rounds
 bst <- xgboost(
-    data = train$data
-    , label = train$label
-    , max_depth = 4
-    , nrounds = 3
-    , objective = "binary:logistic"
+    data = train$data,
+    label = train$label,
+    max_depth = 4,
+    nrounds = 3,
+    objective = "binary:logistic",
+    nthread = 2
 )
 ```
 
diff --git a/R-package/vignettes/xgboost.Rnw b/R-package/vignettes/xgboost.Rnw
index c9089cd6f..7edf4ace3 100644
--- a/R-package/vignettes/xgboost.Rnw
+++ b/R-package/vignettes/xgboost.Rnw
@@ -86,9 +86,10 @@ data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1,
-               nrounds = 2, objective = "binary:logistic")
+               nrounds = 2, objective = "binary:logistic", nthread = 2)
 xgb.save(bst, 'model.save')
 bst = xgb.load('model.save')
+xgb.parameters(bst) <- list(nthread = 2)
 pred <- predict(bst, test$data)
 @
 
@@ -127,7 +128,7 @@ training from initial prediction value, weighted training instance.
 
 We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
 <<xgb.DMatrix>>=
-dtrain <- xgb.DMatrix(train$data, label = train$label)
+dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = 2)
 class(dtrain)
 head(getinfo(dtrain,'label'))
 @
@@ -161,9 +162,9 @@ evalerror <- function(preds, dtrain) {
   return(list(metric = "MSE", value = err))
 }
 
-dtest <- xgb.DMatrix(test$data, label = test$label)
+dtest <- xgb.DMatrix(test$data, label = test$label, nthread = 2)
 watchlist <- list(eval = dtest, train = dtrain)
-param <- list(max_depth = 2, eta = 1)
+param <- list(max_depth = 2, eta = 1, nthread = 2)
 
 bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, logregobj, evalerror, maximize = FALSE)
 @
diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd
index 34f5963d5..1b015fab8 100644
--- a/R-package/vignettes/xgboostPresentation.Rmd
+++ b/R-package/vignettes/xgboostPresentation.Rmd
@@ -173,13 +173,13 @@ Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R**
 
 ```{r trainingDense, message=F, warning=F}
 bstDense <- xgboost(
-    data = as.matrix(train$data)
-    , label = train$label
-    , max_depth = 2
-    , eta = 1
-    , nthread = 2
-    , nrounds = 2
-    , objective = "binary:logistic"
+    data = as.matrix(train$data),
+    label = train$label,
+    max_depth = 2,
+    eta = 1,
+    nthread = 2,
+    nrounds = 2,
+    objective = "binary:logistic"
 )
 ```
 
@@ -188,14 +188,14 @@ bstDense <- xgboost(
 **XGBoost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be useful for the most advanced features we will discover later.
 
 ```{r trainingDmatrix, message=F, warning=F}
-dtrain <- xgb.DMatrix(data = train$data, label = train$label)
+dtrain <- xgb.DMatrix(data = train$data, label = train$label, nthread = 2)
 bstDMatrix <- xgboost(
-    data = dtrain
-    , max_depth = 2
-    , eta = 1
-    , nthread = 2
-    , nrounds = 2
-    , objective = "binary:logistic"
+    data = dtrain,
+    max_depth = 2,
+    eta = 1,
+    nthread = 2,
+    nrounds = 2,
+    objective = "binary:logistic"
 )
 ```
 
@@ -314,8 +314,8 @@ Most of the features below have been implemented to help you to improve your mod
 For the following advanced features, we need to put data in `xgb.DMatrix` as explained above.
 
 ```{r DMatrix, message=F, warning=F}
-dtrain <- xgb.DMatrix(data = train$data, label = train$label)
-dtest <- xgb.DMatrix(data = test$data, label = test$label)
+dtrain <- xgb.DMatrix(data = train$data, label = train$label, nthread = 2)
+dtest <- xgb.DMatrix(data = test$data, label = test$label, nthread = 2)
 ```
 
 ### Measure learning progress with xgb.train
@@ -476,6 +476,7 @@ An interesting test to see how identical our saved model is to the original one
 ```{r loadModel, message=F, warning=F}
 # load binary model to R
 bst2 <- xgb.load("xgboost.model")
+xgb.parameters(bst2) <- list(nthread = 2)
 pred2 <- predict(bst2, test$data)
 
 # And now the test
@@ -500,6 +501,7 @@ print(class(rawVec))
 
 # load binary model to R
 bst3 <- xgb.load(rawVec)
+xgb.parameters(bst3) <- list(nthread = 2)
 pred3 <- predict(bst3, test$data)
 
 # pred2 should be identical to pred
diff --git a/R-package/vignettes/xgboostfromJSON.Rmd b/R-package/vignettes/xgboostfromJSON.Rmd
index 0ea2f56cb..e7ccdf3a9 100644
--- a/R-package/vignettes/xgboostfromJSON.Rmd
+++ b/R-package/vignettes/xgboostfromJSON.Rmd
@@ -175,7 +175,7 @@ bst_preds == bst_from_json_preds
 
 None are exactly equal again.  What is going on here?  Well, since we are using the value `1` in the calculations, we have introduced a double into the calculation.  Because of this, all float values are promoted to 64-bit doubles and the 64-bit version of the exponentiation operator `exp` is also used.  On the other hand, xgboost uses the 32-bit version of the exponentiation operator in its [sigmoid function](https://github.com/dmlc/xgboost/blob/54980b8959680a0da06a3fc0ec776e47c8cbb0a1/src/common/math.h#L25-L27).
 
-How do we fix this?  We have to ensure we use the correct data types everywhere and the correct operators.  If we use only floats, the float library that we have loaded will ensure the 32-bit float exponentiation operator is applied. 
+How do we fix this?  We have to ensure we use the correct data types everywhere and the correct operators.  If we use only floats, the float library that we have loaded will ensure the 32-bit float exponentiation operator is applied.
 ```{r}
 # calculate the predictions casting doubles to floats
 bst_from_json_preds <- ifelse(
diff --git a/doc/contrib/coding_guide.rst b/doc/contrib/coding_guide.rst
index e799ad286..1169921bb 100644
--- a/doc/contrib/coding_guide.rst
+++ b/doc/contrib/coding_guide.rst
@@ -80,6 +80,24 @@ R package versioning
 ====================
 See :ref:`release`.
 
+Testing R package with different compilers
+==========================================
+
+You can change the default compiler of R by editing the configuration file in your home
+directory. For instance, if you want to test XGBoost built with clang++ instead of g++ on
+Linux, put the following in your ``~/.R/Makevars`` file:
+
+.. code-block:: sh
+
+  CC=clang-15
+  CXX17=clang++-15
+
+Be aware that the variable name should match the name used by ``R CMD``:
+
+.. code-block:: sh
+
+  R CMD config CXX17
+
 Registering native routines in R
 ================================
 According to `R extension manual <https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Registering-native-routines>`_,
diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h
index 24182e209..53cb52233 100644
--- a/src/data/ellpack_page_source.h
+++ b/src/data/ellpack_page_source.h
@@ -50,6 +50,7 @@ inline void EllpackPageSource::Fetch() {
   // silent the warning about unused variables.
   (void)(row_stride_);
   (void)(is_dense_);
+  (void)(device_);
   common::AssertGPUSupport();
 }
 #endif  // !defined(XGBOOST_USE_CUDA)
diff --git a/src/gbm/gbtree_model.cc b/src/gbm/gbtree_model.cc
index 1373e3e2b..14131865f 100644
--- a/src/gbm/gbtree_model.cc
+++ b/src/gbm/gbtree_model.cc
@@ -106,14 +106,30 @@ void GBTreeModel::Load(dmlc::Stream* fi) {
   Validate(*this);
 }
 
+namespace {
+std::int32_t IOThreads(Context const* ctx) {
+  CHECK(ctx);
+  std::int32_t n_threads = ctx->Threads();
+  // CRAN checks for number of threads used by examples, but we might not have the right
+  // number of threads when serializing/unserializing models as nthread is a booster
+  // parameter, which is only effective after booster initialization.
+  //
+  // The threshold ratio of CPU time to elapsed time for R is 2.5, so we cap the number
+  // of threads at 2.
+#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
+  n_threads = std::min(2, n_threads);
+#endif
+  return n_threads;
+}
+}  // namespace
+
 void GBTreeModel::SaveModel(Json* p_out) const {
   auto& out = *p_out;
   CHECK_EQ(param.num_trees, static_cast<int>(trees.size()));
   out["gbtree_model_param"] = ToJson(param);
   std::vector<Json> trees_json(trees.size());
 
-  CHECK(ctx_);
-  common::ParallelFor(trees.size(), ctx_->Threads(), [&](auto t) {
+  common::ParallelFor(trees.size(), IOThreads(ctx_), [&](auto t) {
     auto const& tree = trees[t];
     Json jtree{Object{}};
     tree->SaveModel(&jtree);
@@ -151,9 +167,7 @@ void GBTreeModel::LoadModel(Json const& in) {
   CHECK_EQ(tree_info_json.size(), param.num_trees);
   tree_info.resize(param.num_trees);
 
-  CHECK(ctx_);
-
-  common::ParallelFor(param.num_trees, ctx_->Threads(), [&](auto t) {
+  common::ParallelFor(param.num_trees, IOThreads(ctx_), [&](auto t) {
     auto tree_id = get<Integer const>(trees_json[t]["id"]);
     trees.at(tree_id).reset(new RegTree{});
     trees[tree_id]->LoadModel(trees_json[t]);
diff --git a/tests/ci_build/test_r_package.py b/tests/ci_build/test_r_package.py
index f1e179d43..853bf0502 100644
--- a/tests/ci_build/test_r_package.py
+++ b/tests/ci_build/test_r_package.py
@@ -3,9 +3,15 @@ import argparse
 import os
 import shutil
 import subprocess
+from io import StringIO
 from pathlib import Path
 from platform import system
 
+try:
+    import pandas as pd
+except ImportError:
+    pd = None
+
 from test_utils import R_PACKAGE, ROOT, DirectoryExcursion, cd, print_time, record_time
 
 
@@ -97,16 +103,47 @@ def build_rpackage(path: str) -> str:
     return tarball
 
 
+def check_example_timing(rcheck_dir: Path, threshold: float) -> None:
+    with open(rcheck_dir / "xgboost-Ex.timings", "r") as fd:
+        timings = fd.readlines()
+        newlines = []
+        for line in timings:
+            line = line.strip()
+            newlines.append(line)
+        con_timings = "\n".join(newlines)
+        df = pd.read_csv(StringIO(con_timings), delimiter="\t")
+        ratio_n = "user/elapsed"
+        df[ratio_n] = df["user"] / df["elapsed"]
+        offending = df[df[ratio_n] > threshold]
+
+    try:
+        # requires the tabulate package
+        df.to_markdown("timings.md")
+        offending.to_markdown("offending.md")
+    except ImportError:
+        print("failed to export markdown files.")
+        pass
+
+    if offending.shape[0] == 0:
+        return
+
+    print(offending)
+    raise ValueError("There are examples using too many threads")
+
+
 @cd(ROOT)
 @record_time
 def check_rpackage(path: str) -> None:
     env = os.environ.copy()
     print("Ncpus:", f"{os.cpu_count()}")
+    threshold = 2.5
     env.update(
         {
             "MAKEFLAGS": f"-j{os.cpu_count()}",
             # cran specific environment variables
-            "_R_CHECK_EXAMPLE_TIMING_CPU_TO_ELAPSED_THRESHOLD_": str(2.5),
+            "_R_CHECK_EXAMPLE_TIMING_CPU_TO_ELAPSED_THRESHOLD_": str(threshold),
+            "_R_CHECK_TEST_TIMING_CPU_TO_ELAPSED_THRESHOLD_": str(threshold),
+            "_R_CHECK_VIGNETTE_TIMING_CPU_TO_ELAPSED_THRESHOLD_": str(threshold),
         }
     )
 
@@ -118,11 +155,14 @@ def check_rpackage(path: str) -> None:
         CC = os.path.join(mingw_bin, "gcc.exe")
         env.update({"CC": CC, "CXX": CXX})
 
-    status = subprocess.run([R, "CMD", "check", "--as-cran", path], env=env)
-    with open(Path("xgboost.Rcheck") / "00check.log", "r") as fd:
+    status = subprocess.run(
+        [R, "CMD", "check", "--as-cran", "--timings", path], env=env
+    )
+    rcheck_dir = Path("xgboost.Rcheck")
+    with open(rcheck_dir / "00check.log", "r") as fd:
         check_log = fd.read()
 
-    with open(Path("xgboost.Rcheck") / "00install.out", "r") as fd:
+    with open(rcheck_dir / "00install.out", "r") as fd:
         install_log = fd.read()
 
     msg = f"""
@@ -144,6 +184,8 @@ def check_rpackage(path: str) -> None:
     if check_log.find("Examples with CPU time") != -1:
         print(msg)
         raise ValueError("Suspicious NOTE.")
+    if pd is not None:
+        check_example_timing(rcheck_dir, threshold)
 
 
 @cd(R_PACKAGE)
@@ -264,6 +306,8 @@ def main(args: argparse.Namespace) -> None:
             test_with_autotools()
         else:
             test_with_cmake(args)
+    elif args.task == "timings":
+        check_example_timing(Path("xgboost.Rcheck"), 2.5)
     else:
         raise ValueError("Unexpected task.")
 
@@ -279,7 +323,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--task",
         type=str,
-        choices=["pack", "build", "check", "doc"],
+        choices=["pack", "build", "check", "doc", "timings"],
         default="check",
         required=False,
     )

From 1167e6c554f2fad47183f835bf0845e2cde5cd52 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Sun, 24 Sep 2023 00:30:28 +0800
Subject: [PATCH 22/59] Limit the number of threads for external memory.
 (#9605)

---
 src/data/sparse_page_source.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h
index b32c536af..9cb0e364f 100644
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@@ -177,15 +177,15 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
     }
     // An heuristic for number of pre-fetched batches.  We can make it part of BatchParam
     // to let user adjust number of pre-fetched batches when needed.
-    uint32_t constexpr kPreFetch = 3;
-
-    size_t n_prefetch_batches = std::min(kPreFetch, n_batches_);
+    std::int32_t n_prefetches = std::max(nthreads_, 3);
+    std::int32_t n_prefetch_batches =
+        std::min(static_cast<std::uint32_t>(n_prefetches), n_batches_);
     CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_;
     std::size_t fetch_it = count_;
 
     exce_.Rethrow();
 
-    for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
+    for (std::int32_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
       fetch_it %= n_batches_;  // ring
       if (ring_->at(fetch_it).valid()) {
         continue;

From 290b17ffda0a21e4f85bc77cf8004055181ed5a0 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Mon, 25 Sep 2023 22:31:23 -0700
Subject: [PATCH 23/59] Test column sampler with column-wise data split (#9609)

---
 tests/cpp/test_learner.cc | 66 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 0ca7ee0eb..992716db5 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -718,4 +718,70 @@ INSTANTIATE_TEST_SUITE_P(ColumnSplitObjective, TestColumnSplit,
                          [](const ::testing::TestParamInfo<TestColumnSplit::ParamType>& info) {
                            return ObjTestNameGenerator(info);
                          });
+
+namespace {
+void VerifyColumnSplitColumnSampler(std::string const& tree_method, bool use_gpu,
+                                    Json const& expected_model) {
+  Json model{Object{}};
+  {
+    auto const world_size = collective::GetWorldSize();
+    auto const rank = collective::GetRank();
+    auto const objective = "reg:logistic";
+    auto p_fmat = MakeFmatForObjTest(objective);
+    std::shared_ptr<DMatrix> sliced{p_fmat->SliceCol(world_size, rank)};
+    std::unique_ptr<Learner> learner{Learner::Create({sliced})};
+    learner->SetParam("tree_method", tree_method);
+    if (use_gpu) {
+      auto gpu_id = common::AllVisibleGPUs() == 1 ? 0 : rank;
+      learner->SetParam("device", "cuda:" + std::to_string(gpu_id));
+    }
+    learner->SetParam("objective", objective);
+    learner->SetParam("colsample_bytree", "0.5");
+    learner->SetParam("colsample_bylevel", "0.6");
+    learner->SetParam("colsample_bynode", "0.7");
+    learner->UpdateOneIter(0, sliced);
+    learner->SaveModel(&model);
+  }
+  ASSERT_EQ(model, expected_model);
+}
+
+void TestColumnSplitColumnSampler(std::string const& tree_method, bool use_gpu) {
+  Json model{Object{}};
+  {
+    auto objective = "reg:logistic";
+    auto p_fmat = MakeFmatForObjTest(objective);
+    std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
+    learner->SetParam("tree_method", tree_method);
+    if (use_gpu) {
+      learner->SetParam("device", "cuda:0");
+    }
+    learner->SetParam("objective", objective);
+    learner->SetParam("colsample_bytree", "0.5");
+    learner->SetParam("colsample_bylevel", "0.6");
+    learner->SetParam("colsample_bynode", "0.7");
+    learner->UpdateOneIter(0, p_fmat);
+    learner->SaveModel(&model);
+  }
+  auto world_size{3};
+  if (use_gpu) {
+    world_size = common::AllVisibleGPUs();
+    // Simulate MPU on a single GPU.
+    if (world_size == 1) {
+      world_size = 3;
+    }
+  }
+  RunWithInMemoryCommunicator(world_size, VerifyColumnSplitColumnSampler, tree_method, use_gpu,
+                              model);
+}
+}  // anonymous namespace
+
+TEST(ColumnSplitColumnSampler, Approx) { TestColumnSplitColumnSampler("approx", false); }
+
+TEST(ColumnSplitColumnSampler, Hist) { TestColumnSplitColumnSampler("hist", false); }
+
+#if defined(XGBOOST_USE_CUDA)
+TEST(ColumnSplitColumnSampler, GPUApprox) { TestColumnSplitColumnSampler("approx", true); }
+
+TEST(ColumnSplitColumnSampler, GPUHist) { TestColumnSplitColumnSampler("hist", true); }
+#endif  // defined(XGBOOST_USE_CUDA)
 }  // namespace xgboost

From 7901a299b2029452938431a44a66f27b17af9654 Mon Sep 17 00:00:00 2001
From: Benoit Chevallier-Mames <64148533+bcm-at-zama@users.noreply.github.com>
Date: Tue, 26 Sep 2023 09:33:11 +0200
Subject: [PATCH 24/59] [doc] Add privacy-preserving Concrete ML links (#9598)
 (#9604)

---
 doc/prediction.rst | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/doc/prediction.rst b/doc/prediction.rst
index c94ddfbbf..100c82a1e 100644
--- a/doc/prediction.rst
+++ b/doc/prediction.rst
@@ -146,3 +146,48 @@ instance we might accidentally call ``clf.set_params()`` inside a predict functi
 
     with ThreadPoolExecutor(max_workers=10) as e:
         e.submit(predict_fn, ...)
+
+*****************************
+Privacy-Preserving Prediction
+*****************************
+
+`Concrete ML`_ is a third-party open-source library developed by `Zama`_ that provides gradient
+boosting classes similar to ours, but which predict directly over encrypted data, thanks to
+Fully Homomorphic Encryption. A simple example would be as follows:
+
+.. code-block:: python
+
+    from sklearn.datasets import make_classification
+    from sklearn.model_selection import train_test_split
+    from concrete.ml.sklearn import XGBClassifier
+
+    x, y = make_classification(n_samples=100, class_sep=2, n_features=30, random_state=42)
+    X_train, X_test, y_train, y_test = train_test_split(
+        x, y, test_size=10, random_state=42
+    )
+
+    # Train in the clear and quantize the weights
+    model = XGBClassifier()
+    model.fit(X_train, y_train)
+
+    # Simulate the predictions in the clear
+    y_pred_clear = model.predict(X_test)
+
+    # Compile in FHE
+    model.compile(X_train)
+
+    # Generate keys
+    model.fhe_circuit.keygen()
+
+    # Run the inference on encrypted inputs!
+    y_pred_fhe = model.predict(X_test, fhe="execute")
+
+    print("In clear  :", y_pred_clear)
+    print("In FHE    :", y_pred_fhe)
+    print(f"Similarity: {int((y_pred_fhe == y_pred_clear).mean()*100)}%")
+
+More information and examples are given in the `Concrete ML documentation`_.
+
+.. _Zama: https://www.zama.ai/
+.. _Concrete ML: https://github.com/zama-ai/concrete-ml
+.. _Concrete ML documentation: https://docs.zama.ai/concrete-ml

From c75a3bc0a9ab383fcbcd5e3ce1be61c029bd4879 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Tue, 26 Sep 2023 18:06:34 +0800
Subject: [PATCH 25/59] [breaking] [jvm-packages] Remove rabit check point.
 (#9599)

- Add `numBoostedRound` to jvm packages
- Remove rabit checkpoint version.
- Change the starting version of training continuation in JVM [breaking].
- Redefine the checkpoint version policy in jvm package. [breaking]
- Rename the Python checkpoint callback parameter. [breaking]
- Unify the checkpoint policy between Python and JVM.
---
 demo/guide-python/callbacks.py                |  4 +-
 include/xgboost/c_api.h                       | 18 -----
 .../ExternalCheckpointManagerSuite.scala      | 50 ++++++-------
 .../java/ml/dmlc/xgboost4j/java/Booster.java  | 59 ++--------------
 .../java/ExternalCheckpointManager.java       | 70 ++++++++++++-------
 .../java/ml/dmlc/xgboost4j/java/XGBoost.java  | 28 ++++----
 .../ml/dmlc/xgboost4j/java/XGBoostJNI.java    |  5 +-
 .../ml/dmlc/xgboost4j/scala/Booster.scala     |  2 +-
 .../xgboost4j/src/native/xgboost4j.cpp        | 38 +++-------
 jvm-packages/xgboost4j/src/native/xgboost4j.h | 24 +++----
 .../dmlc/xgboost4j/java/BoosterImplTest.java  |  9 +--
 python-package/xgboost/callback.py            | 27 ++++---
 src/c_api/c_api.cc                            | 27 +------
 tests/python/test_callback.py                 |  4 +-
 .../test_with_dask/test_with_dask.py          |  2 +-
 15 files changed, 138 insertions(+), 229 deletions(-)

diff --git a/demo/guide-python/callbacks.py b/demo/guide-python/callbacks.py
index be03b1693..9c12f70de 100644
--- a/demo/guide-python/callbacks.py
+++ b/demo/guide-python/callbacks.py
@@ -104,7 +104,7 @@ def check_point_callback():
         # Use callback class from xgboost.callback
         # Feel free to subclass/customize it to suit your need.
         check_point = xgb.callback.TrainingCheckPoint(
-            directory=tmpdir, iterations=rounds, name="model"
+            directory=tmpdir, interval=rounds, name="model"
         )
         xgb.train(
             {"objective": "binary:logistic"},
@@ -118,7 +118,7 @@ def check_point_callback():
         # This version of checkpoint saves everything including parameters and
         # model.  See: doc/tutorials/saving_model.rst
         check_point = xgb.callback.TrainingCheckPoint(
-            directory=tmpdir, iterations=rounds, as_pickle=True, name="model"
+            directory=tmpdir, interval=rounds, as_pickle=True, name="model"
         )
         xgb.train(
             {"objective": "binary:logistic"},
diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index 9bce616ef..5df62df55 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -1308,24 +1308,6 @@ XGB_DLL int XGBoosterSerializeToBuffer(BoosterHandle handle, bst_ulong *out_len,
 XGB_DLL int XGBoosterUnserializeFromBuffer(BoosterHandle handle,
                                            const void *buf, bst_ulong len);
 
-/*!
- * \brief Initialize the booster from rabit checkpoint.
- *  This is used in distributed training API.
- * \param handle handle
- * \param version The output version of the model.
- * \return 0 when success, -1 when failure happens
- */
-XGB_DLL int XGBoosterLoadRabitCheckpoint(BoosterHandle handle,
-                                         int* version);
-
-/*!
- * \brief Save the current checkpoint to rabit.
- * \param handle handle
- * \return 0 when success, -1 when failure happens
- */
-XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle);
-
-
 /*!
  * \brief Save XGBoost's internal configuration into a JSON document.  Currently the
  *        support is experimental, function signature may change in the future without
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala
index adc9c1068..e6835158d 100755
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014-2022 by Contributors
+ Copyright (c) 2014-2023 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -32,57 +32,53 @@ class ExternalCheckpointManagerSuite extends AnyFunSuite with TmpFolderPerSuite
   }
 
   private def createNewModels():
-    (String, XGBoostClassificationModel, XGBoostClassificationModel) = {
+      (String, XGBoostClassificationModel, XGBoostClassificationModel) = {
     val tmpPath = createTmpFolder("test").toAbsolutePath.toString
-    val (model4, model8) = {
+    val (model2, model4) = {
       val training = buildDataFrame(Classification.train)
       val paramMap = produceParamMap(tmpPath, 2)
       (new XGBoostClassifier(paramMap ++ Seq("num_round" -> 2)).fit(training),
         new XGBoostClassifier(paramMap ++ Seq("num_round" -> 4)).fit(training))
     }
-    (tmpPath, model4, model8)
+    (tmpPath, model2, model4)
   }
 
   test("test update/load models") {
-    val (tmpPath, model4, model8) = createNewModels()
+    val (tmpPath, model2, model4) = createNewModels()
     val manager = new ExternalCheckpointManager(tmpPath, FileSystem.get(sc.hadoopConfiguration))
 
-    manager.updateCheckpoint(model4._booster.booster)
+    manager.updateCheckpoint(model2._booster.booster)
     var files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath))
     assert(files.length == 1)
-    assert(files.head.getPath.getName == "4.model")
-    assert(manager.loadCheckpointAsScalaBooster().getVersion == 4)
+    assert(files.head.getPath.getName == "1.model")
+    assert(manager.loadCheckpointAsScalaBooster().getNumBoostedRound == 2)
 
-    manager.updateCheckpoint(model8._booster)
+    manager.updateCheckpoint(model4._booster)
     files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath))
     assert(files.length == 1)
-    assert(files.head.getPath.getName == "8.model")
-    assert(manager.loadCheckpointAsScalaBooster().getVersion == 8)
+    assert(files.head.getPath.getName == "3.model")
+    assert(manager.loadCheckpointAsScalaBooster().getNumBoostedRound == 4)
   }
 
   test("test cleanUpHigherVersions") {
-    val (tmpPath, model4, model8) = createNewModels()
+    val (tmpPath, model2, model4) = createNewModels()
 
     val manager = new ExternalCheckpointManager(tmpPath, FileSystem.get(sc.hadoopConfiguration))
-    manager.updateCheckpoint(model8._booster)
-    manager.cleanUpHigherVersions(8)
-    assert(new File(s"$tmpPath/8.model").exists())
+    manager.updateCheckpoint(model4._booster)
+    manager.cleanUpHigherVersions(3)
+    assert(new File(s"$tmpPath/3.model").exists())
 
-    manager.cleanUpHigherVersions(4)
-    assert(!new File(s"$tmpPath/8.model").exists())
+    manager.cleanUpHigherVersions(2)
+    assert(!new File(s"$tmpPath/3.model").exists())
   }
 
   test("test checkpoint rounds") {
     import scala.collection.JavaConverters._
-    val (tmpPath, model4, model8) = createNewModels()
+    val (tmpPath, model2, model4) = createNewModels()
     val manager = new ExternalCheckpointManager(tmpPath, FileSystem.get(sc.hadoopConfiguration))
-    assertResult(Seq(7))(
-      manager.getCheckpointRounds(0, 7).asScala)
-    assertResult(Seq(2, 4, 6, 7))(
-      manager.getCheckpointRounds(2, 7).asScala)
-    manager.updateCheckpoint(model4._booster)
-    assertResult(Seq(4, 6, 7))(
-      manager.getCheckpointRounds(2, 7).asScala)
+    assertResult(Seq(2))(manager.getCheckpointRounds(0, 0, 3).asScala)
+    assertResult(Seq(0, 2, 4, 6))(manager.getCheckpointRounds(0, 2, 7).asScala)
+    assertResult(Seq(0, 2, 4, 6, 7))(manager.getCheckpointRounds(0, 2, 8).asScala)
   }
 
 
@@ -109,8 +105,8 @@ class ExternalCheckpointManagerSuite extends AnyFunSuite with TmpFolderPerSuite
       // Check only one model is kept after training
       val files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath))
       assert(files.length == 1)
-      assert(files.head.getPath.getName == "8.model")
-      val tmpModel = SXGBoost.loadModel(s"$tmpPath/8.model")
+      assert(files.head.getPath.getName == "4.model")
+      val tmpModel = SXGBoost.loadModel(s"$tmpPath/4.model")
       // Train next model based on prev model
       val nextModel = new XGBoostClassifier(paramMap ++ Seq("num_round" -> 8)).fit(training)
       assert(error(tmpModel) >= error(prevModel._booster))
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java
index 7ed12c704..51959ce0c 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014-2022 by Contributors
+ Copyright (c) 2014-2023 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -787,35 +787,6 @@ public class Booster implements Serializable, KryoSerializable {
     return importanceMap;
   }
 
-  /**
-   * Save the model as byte array representation.
-   * Write these bytes to a file will give compatible format with other xgboost bindings.
-   *
-   * If java natively support HDFS file API, use toByteArray and write the ByteArray
-   *
-   * @param withStats Controls whether the split statistics are output.
-   * @return dumped model information
-   * @throws XGBoostError native error
-   */
-  private String[] getDumpInfo(boolean withStats) throws XGBoostError {
-    int statsFlag = 0;
-    if (withStats) {
-      statsFlag = 1;
-    }
-    String[][] modelInfos = new String[1][];
-    XGBoostJNI.checkCall(XGBoostJNI.XGBoosterDumpModelEx(handle, "", statsFlag, "text",
-            modelInfos));
-    return modelInfos[0];
-  }
-
-  public int getVersion() {
-    return this.version;
-  }
-
-  public void setVersion(int version) {
-    this.version = version;
-  }
-
   /**
    * Save model into raw byte array. Currently it's using the deprecated format as
    * default, which will be changed into `ubj` in future releases.
@@ -841,29 +812,6 @@ public class Booster implements Serializable, KryoSerializable {
     return bytes[0];
   }
 
-  /**
-   * Load the booster model from thread-local rabit checkpoint.
-   * This is only used in distributed training.
-   * @return the stored version number of the checkpoint.
-   * @throws XGBoostError
-   */
-  int loadRabitCheckpoint() throws XGBoostError {
-    int[] out = new int[1];
-    XGBoostJNI.checkCall(XGBoostJNI.XGBoosterLoadRabitCheckpoint(this.handle, out));
-    version = out[0];
-    return version;
-  }
-
-  /**
-   * Save the booster model into thread-local rabit checkpoint and increment the version.
-   * This is only used in distributed training.
-   * @throws XGBoostError
-   */
-  void saveRabitCheckpoint() throws XGBoostError {
-    XGBoostJNI.checkCall(XGBoostJNI.XGBoosterSaveRabitCheckpoint(this.handle));
-    version += 1;
-  }
-
   /**
    * Get number of model features.
    * @return the number of features.
@@ -874,6 +822,11 @@ public class Booster implements Serializable, KryoSerializable {
     XGBoostJNI.checkCall(XGBoostJNI.XGBoosterGetNumFeature(this.handle, numFeature));
     return numFeature[0];
   }
+  public int getNumBoostedRound() throws XGBoostError {
+    int[] numRound = new int[1];
+    XGBoostJNI.checkCall(XGBoostJNI.XGBoosterGetNumBoostedRound(this.handle, numRound));
+    return numRound[0];
+  }
 
   /**
    * Internal initialization function.
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/ExternalCheckpointManager.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/ExternalCheckpointManager.java
index 655b99020..3d794756d 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/ExternalCheckpointManager.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/ExternalCheckpointManager.java
@@ -1,3 +1,18 @@
+/*
+ Copyright (c) 2014-2023 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
 package ml.dmlc.xgboost4j.java;
 
 import java.io.IOException;
@@ -15,7 +30,7 @@ public class ExternalCheckpointManager {
 
   private Log logger = LogFactory.getLog("ExternalCheckpointManager");
   private String modelSuffix = ".model";
-  private Path checkpointPath;
+  private Path checkpointPath;  // directory for checkpoints
   private FileSystem fs;
 
   public ExternalCheckpointManager(String checkpointPath, FileSystem fs) throws XGBoostError {
@@ -35,6 +50,7 @@ public class ExternalCheckpointManager {
     if (!fs.exists(checkpointPath)) {
       return new ArrayList<>();
     } else {
+      // Get integer versions from a list of checkpoint files.
       return Arrays.stream(fs.listStatus(checkpointPath))
               .map(path -> path.getPath().getName())
               .filter(fileName -> fileName.endsWith(modelSuffix))
@@ -44,6 +60,11 @@ public class ExternalCheckpointManager {
     }
   }
 
+  private Integer latest(List<Integer> versions) {
+    return versions.stream()
+        .max(Comparator.comparing(Integer::valueOf)).get();
+  }
+
   public void cleanPath() throws IOException {
     fs.delete(checkpointPath, true);
   }
@@ -51,12 +72,11 @@ public class ExternalCheckpointManager {
   public Booster loadCheckpointAsBooster() throws IOException, XGBoostError {
     List<Integer> versions = getExistingVersions();
     if (versions.size() > 0) {
-      int latestVersion = versions.stream().max(Comparator.comparing(Integer::valueOf)).get();
+      int latestVersion = this.latest(versions);
       String checkpointPath = getPath(latestVersion);
       InputStream in = fs.open(new Path(checkpointPath));
       logger.info("loaded checkpoint from " + checkpointPath);
       Booster booster = XGBoost.loadModel(in);
-      booster.setVersion(latestVersion);
       return booster;
     } else {
       return null;
@@ -65,13 +85,16 @@ public class ExternalCheckpointManager {
 
   public void updateCheckpoint(Booster boosterToCheckpoint) throws IOException, XGBoostError {
     List<String> prevModelPaths = getExistingVersions().stream()
-            .map(this::getPath).collect(Collectors.toList());
-    String eventualPath = getPath(boosterToCheckpoint.getVersion());
+        .map(this::getPath).collect(Collectors.toList());
+    // checkpointing is done after update, so n_rounds - 1 is the current iteration
+    // accounting for training continuation.
+    Integer iter = boosterToCheckpoint.getNumBoostedRound() - 1;
+    String eventualPath = getPath(iter);
     String tempPath = eventualPath + "-" + UUID.randomUUID();
     try (OutputStream out = fs.create(new Path(tempPath), true)) {
       boosterToCheckpoint.saveModel(out);
       fs.rename(new Path(tempPath), new Path(eventualPath));
-      logger.info("saving checkpoint with version " + boosterToCheckpoint.getVersion());
+      logger.info("saving checkpoint with version " + iter);
       prevModelPaths.stream().forEach(path -> {
         try {
           fs.delete(new Path(path), true);
@@ -83,7 +106,7 @@ public class ExternalCheckpointManager {
   }
 
   public void cleanUpHigherVersions(int currentRound) throws IOException {
-    getExistingVersions().stream().filter(v -> v / 2 >= currentRound).forEach(v -> {
+    getExistingVersions().stream().filter(v -> v > currentRound).forEach(v -> {
       try {
         fs.delete(new Path(getPath(v)), true);
       } catch (IOException e) {
@@ -91,27 +114,26 @@ public class ExternalCheckpointManager {
       }
     });
   }
-
-  public List<Integer> getCheckpointRounds(int checkpointInterval, int numOfRounds)
+  // Get a list of iterations that need checkpointing.
+  public List<Integer> getCheckpointRounds(
+      int firstRound, int checkpointInterval, int numOfRounds)
       throws IOException {
+    int end = firstRound + numOfRounds; // exclusive
+    int lastRound = end - 1;
+    if (end - 1 < 0) {
+      throw new IllegalArgumentException("Inavlid `numOfRounds`.");
+    }
+
+    List<Integer> arr = new ArrayList<>();
     if (checkpointInterval > 0) {
-      List<Integer> prevRounds =
-              getExistingVersions().stream().map(v -> v / 2).collect(Collectors.toList());
-      prevRounds.add(0);
-      int firstCheckpointRound = prevRounds.stream()
-              .max(Comparator.comparing(Integer::valueOf)).get() + checkpointInterval;
-      List<Integer> arr = new ArrayList<>();
-      for (int i = firstCheckpointRound; i <= numOfRounds; i += checkpointInterval) {
+      for (int i = firstRound; i < end; i += checkpointInterval) {
         arr.add(i);
       }
-      arr.add(numOfRounds);
-      return arr;
-    } else if (checkpointInterval <= 0) {
-      List<Integer> l = new ArrayList<Integer>();
-      l.add(numOfRounds);
-      return l;
-    } else {
-      throw new IllegalArgumentException("parameters \"checkpoint_path\" should also be set.");
     }
+
+    if (!arr.contains(lastRound)) {
+      arr.add(lastRound);
+    }
+    return arr;
   }
 }
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java
index bcd0b1b11..2be62a343 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014,2021 by Contributors
+ Copyright (c) 2014-2023 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -133,7 +133,7 @@ public class XGBoost {
           int earlyStoppingRound) throws XGBoostError {
     return train(dtrain, params, round, watches, metrics, obj, eval, earlyStoppingRound, null);
   }
-
+  // save checkpoint if iter is in checkpointIterations
   private static void saveCheckpoint(
           Booster booster,
           int iter,
@@ -169,7 +169,6 @@ public class XGBoost {
     int bestIteration;
     List<String> names = new ArrayList<String>();
     List<DMatrix> mats = new ArrayList<DMatrix>();
-    Set<Integer> checkpointIterations = new HashSet<>();
     ExternalCheckpointManager ecm = null;
     if (checkpointPath != null) {
       ecm = new ExternalCheckpointManager(checkpointPath, fs);
@@ -203,32 +202,30 @@ public class XGBoost {
       booster = new Booster(params, allMats);
       booster.setFeatureNames(dtrain.getFeatureNames());
       booster.setFeatureTypes(dtrain.getFeatureTypes());
-      booster.loadRabitCheckpoint();
     } else {
       // Start training on an existing booster
       booster.setParams(params);
     }
 
+    Set<Integer> checkpointIterations = new HashSet<>();
     if (ecm != null) {
-      checkpointIterations = new HashSet<>(ecm.getCheckpointRounds(checkpointInterval, numRounds));
+      checkpointIterations = new HashSet<>(
+          ecm.getCheckpointRounds(booster.getNumBoostedRound(), checkpointInterval, numRounds));
     }
 
     boolean initial_best_score_flag = false;
     boolean max_direction = false;
 
     // begin to train
-    for (int iter = booster.getVersion() / 2; iter < numRounds; iter++) {
-      if (booster.getVersion() % 2 == 0) {
-        if (obj != null) {
-          booster.update(dtrain, obj);
-        } else {
-          booster.update(dtrain, iter);
-        }
-        saveCheckpoint(booster, iter, checkpointIterations, ecm);
-        booster.saveRabitCheckpoint();
+    for (int iter = 0; iter < numRounds; iter++) {
+      if (obj != null) {
+        booster.update(dtrain, iter, obj);
+      } else {
+        booster.update(dtrain, iter);
       }
+      saveCheckpoint(booster, iter, checkpointIterations, ecm);
 
-      //evaluation
+      // evaluation
       if (evalMats.length > 0) {
         float[] metricsOut = new float[evalMats.length];
         String evalInfo;
@@ -285,7 +282,6 @@ public class XGBoost {
           Communicator.communicatorPrint(evalInfo + '\n');
         }
       }
-      booster.saveRabitCheckpoint();
     }
     return booster;
   }
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java
index eabbf29ba..236d53e90 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java
@@ -140,10 +140,11 @@ class XGBoostJNI {
   public final static native int XGBoosterGetAttrNames(long handle, String[][] out_strings);
   public final static native int XGBoosterGetAttr(long handle, String key, String[] out_string);
   public final static native int XGBoosterSetAttr(long handle, String key, String value);
-  public final static native int XGBoosterLoadRabitCheckpoint(long handle, int[] out_version);
-  public final static native int XGBoosterSaveRabitCheckpoint(long handle);
+
   public final static native int XGBoosterGetNumFeature(long handle, long[] feature);
 
+  public final static native int XGBoosterGetNumBoostedRound(long handle, int[] rounds);
+
   // communicator functions
   public final static native int CommunicatorInit(String[] args);
   public final static native int CommunicatorFinalize();
diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala
index 31be86898..c288bfab1 100644
--- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala
+++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala
@@ -326,7 +326,7 @@ class Booster private[xgboost4j](private[xgboost4j] var booster: JBooster)
   @throws(classOf[XGBoostError])
   def getNumFeature: Long = booster.getNumFeature
 
-  def getVersion: Int = booster.getVersion
+  def getNumBoostedRound: Long = booster.getNumBoostedRound
 
   /**
     * Save model into a raw byte array.  Available options are "json", "ubj" and "deprecated".
diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp
index 821b1ebff..332b1a127 100644
--- a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp
+++ b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp
@@ -984,33 +984,6 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSetAttr
   return ret;
 }
 
-/*
- * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
- * Method:    XGBoosterLoadRabitCheckpoint
- * Signature: (J[I)I
- */
-JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterLoadRabitCheckpoint
-  (JNIEnv *jenv , jclass jcls, jlong jhandle, jintArray jout) {
-  BoosterHandle handle = (BoosterHandle) jhandle;
-  int version;
-  int ret = XGBoosterLoadRabitCheckpoint(handle, &version);
-  JVM_CHECK_CALL(ret);
-  jint jversion = version;
-  jenv->SetIntArrayRegion(jout, 0, 1, &jversion);
-  return ret;
-}
-
-/*
- * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
- * Method:    XGBoosterSaveRabitCheckpoint
- * Signature: (J)I
- */
-JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSaveRabitCheckpoint
-  (JNIEnv *jenv, jclass jcls, jlong jhandle) {
-  BoosterHandle handle = (BoosterHandle) jhandle;
-  return XGBoosterSaveRabitCheckpoint(handle);
-}
-
 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
  * Method:    XGBoosterGetNumFeature
@@ -1027,6 +1000,17 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetNumFea
   return ret;
 }
 
+JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetNumBoostedRound(
+    JNIEnv *jenv, jclass, jlong jhandle, jintArray jout) {
+  BoosterHandle handle = (BoosterHandle)jhandle;
+  std::int32_t n_rounds{0};
+  auto ret = XGBoosterBoostedRounds(handle, &n_rounds);
+  JVM_CHECK_CALL(ret);
+  jint jn_rounds = n_rounds;
+  jenv->SetIntArrayRegion(jout, 0, 1, &jn_rounds);
+  return ret;
+}
+
 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
  * Method:    CommunicatorInit
diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.h b/jvm-packages/xgboost4j/src/native/xgboost4j.h
index 87ff6d30d..cc4ad53d4 100644
--- a/jvm-packages/xgboost4j/src/native/xgboost4j.h
+++ b/jvm-packages/xgboost4j/src/native/xgboost4j.h
@@ -287,22 +287,6 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetAttr
 JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSetAttr
   (JNIEnv *, jclass, jlong, jstring, jstring);
 
-/*
- * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
- * Method:    XGBoosterLoadRabitCheckpoint
- * Signature: (J[I)I
- */
-JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterLoadRabitCheckpoint
-  (JNIEnv *, jclass, jlong, jintArray);
-
-/*
- * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
- * Method:    XGBoosterSaveRabitCheckpoint
- * Signature: (J)I
- */
-JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSaveRabitCheckpoint
-  (JNIEnv *, jclass, jlong);
-
 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
  * Method:    XGBoosterGetNumFeature
@@ -311,6 +295,14 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSaveRabit
 JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetNumFeature
   (JNIEnv *, jclass, jlong, jlongArray);
 
+/*
+ * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
+ * Method:    XGBoosterGetNumBoostedRound
+ * Signature: (J[I)I
+ */
+JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetNumBoostedRound
+  (JNIEnv *, jclass, jlong, jintArray);
+
 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
  * Method:    CommunicatorInit
diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java
index c7508b20d..b686ddbed 100644
--- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java
+++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014-2022 by Contributors
+ Copyright (c) 2014-2023 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -16,7 +16,6 @@
 package ml.dmlc.xgboost4j.java;
 
 import junit.framework.TestCase;
-import org.junit.Assert;
 import org.junit.Test;
 
 import java.io.ByteArrayInputStream;
@@ -31,7 +30,7 @@ import static org.junit.Assert.fail;
 
 /**
  * test cases for Booster Inplace Predict
- * 
+ *
  * @author hzx and Sovrn
  */
 public class BoosterImplTest {
@@ -845,14 +844,12 @@ public class BoosterImplTest {
     float tempBoosterError = eval.eval(tempBooster.predict(testMat, true, 0), testMat);
 
     // Save tempBooster to bytestream and load back
-    int prevVersion = tempBooster.getVersion();
     ByteArrayInputStream in = new ByteArrayInputStream(tempBooster.toByteArray());
     tempBooster = XGBoost.loadModel(in);
     in.close();
-    tempBooster.setVersion(prevVersion);
 
     // Continue training using tempBooster
-    round = 4;
+    round = 2;
     Booster booster2 = XGBoost.train(trainMat, paramMap, round, watches, null, null, null, 0, tempBooster);
     float booster2error = eval.eval(booster2.predict(testMat, true, 0), testMat);
     TestCase.assertTrue(booster1error == booster2error);
diff --git a/python-package/xgboost/callback.py b/python-package/xgboost/callback.py
index 6077aa1e3..29d880539 100644
--- a/python-package/xgboost/callback.py
+++ b/python-package/xgboost/callback.py
@@ -540,7 +540,10 @@ class EvaluationMonitor(TrainingCallback):
 
 
 class TrainingCheckPoint(TrainingCallback):
-    """Checkpointing operation.
+    """Checkpointing operation. Users are encouraged to create their own callbacks for
+    checkpoint as XGBoost doesn't handle distributed file systems. When checkpointing on
+    distributed systems, be sure to know the rank of the worker to avoid multiple
+    workers checkpointing to the same place.
 
     .. versionadded:: 1.3.0
 
@@ -553,9 +556,9 @@ class TrainingCheckPoint(TrainingCallback):
         pattern of output model file.  Models will be saved as name_0.json, name_1.json,
         name_2.json ....
     as_pickle :
-        When set to True, all training parameters will be saved in pickle format, instead
-        of saving only the model.
-    iterations :
+        When set to True, all training parameters will be saved in pickle format,
+        instead of saving only the model.
+    interval :
         Interval of checkpointing.  Checkpointing is slow so setting a larger number can
         reduce performance hit.
 
@@ -566,15 +569,20 @@ class TrainingCheckPoint(TrainingCallback):
         directory: Union[str, os.PathLike],
         name: str = "model",
         as_pickle: bool = False,
-        iterations: int = 100,
+        interval: int = 100,
     ) -> None:
         self._path = os.fspath(directory)
         self._name = name
         self._as_pickle = as_pickle
-        self._iterations = iterations
-        self._epoch = 0
+        self._iterations = interval
+        self._epoch = 0  # counter for interval
+        self._start = 0  # beginning iteration
         super().__init__()
 
+    def before_training(self, model: _Model) -> _Model:
+        self._start = model.num_boosted_rounds()
+        return model
+
     def after_iteration(
         self, model: _Model, epoch: int, evals_log: TrainingCallback.EvalsLog
     ) -> bool:
@@ -583,11 +591,12 @@ class TrainingCheckPoint(TrainingCallback):
                 self._path,
                 self._name
                 + "_"
-                + str(epoch)
+                + (str(epoch + self._start))
                 + (".pkl" if self._as_pickle else ".json"),
             )
-            self._epoch = 0
+            self._epoch = 0  # reset counter
             if collective.get_rank() == 0:
+                # checkpoint using the first worker
                 if self._as_pickle:
                     with open(path, "wb") as fd:
                         pickle.dump(model, fd)
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 2b0862d49..f6ab8d4df 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1430,36 +1430,13 @@ XGB_DLL int XGBoosterUnserializeFromBuffer(BoosterHandle handle,
   API_END();
 }
 
-XGB_DLL int XGBoosterLoadRabitCheckpoint(BoosterHandle handle,
-                                         int* version) {
-  API_BEGIN();
-  CHECK_HANDLE();
-  auto* bst = static_cast<Learner*>(handle);
-  xgboost_CHECK_C_ARG_PTR(version);
-  *version = rabit::LoadCheckPoint();
-  if (*version != 0) {
-    bst->Configure();
-  }
-  API_END();
-}
-
-XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle) {
-  API_BEGIN();
-  CHECK_HANDLE();
-  auto *learner = static_cast<Learner *>(handle);
-  learner->Configure();
-  rabit::CheckPoint();
-  API_END();
-}
-
-XGB_DLL int XGBoosterSlice(BoosterHandle handle, int begin_layer,
-                           int end_layer, int step,
+XGB_DLL int XGBoosterSlice(BoosterHandle handle, int begin_layer, int end_layer, int step,
                            BoosterHandle *out) {
   API_BEGIN();
   CHECK_HANDLE();
   xgboost_CHECK_C_ARG_PTR(out);
 
-  auto* learner = static_cast<Learner*>(handle);
+  auto *learner = static_cast<Learner *>(handle);
   bool out_of_bound = false;
   auto p_out = learner->Slice(begin_layer, end_layer, step, &out_of_bound);
   if (out_of_bound) {
diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py
index 56c9fdabd..262c09c99 100644
--- a/tests/python/test_callback.py
+++ b/tests/python/test_callback.py
@@ -443,7 +443,7 @@ class TestCallbacks:
         m = xgb.DMatrix(X, y)
         with tempfile.TemporaryDirectory() as tmpdir:
             check_point = xgb.callback.TrainingCheckPoint(
-                directory=tmpdir, iterations=1, name="model"
+                directory=tmpdir, interval=1, name="model"
             )
             xgb.train(
                 {"objective": "binary:logistic"},
@@ -456,7 +456,7 @@ class TestCallbacks:
                 assert os.path.exists(os.path.join(tmpdir, "model_" + str(i) + ".json"))
 
             check_point = xgb.callback.TrainingCheckPoint(
-                directory=tmpdir, iterations=1, as_pickle=True, name="model"
+                directory=tmpdir, interval=1, as_pickle=True, name="model"
             )
             xgb.train(
                 {"objective": "binary:logistic"},
diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py
index ae8d24139..3510dff7b 100644
--- a/tests/test_distributed/test_with_dask/test_with_dask.py
+++ b/tests/test_distributed/test_with_dask/test_with_dask.py
@@ -2238,7 +2238,7 @@ class TestDaskCallbacks:
                 y,
                 callbacks=[
                     xgb.callback.TrainingCheckPoint(
-                        directory=Path(tmpdir), iterations=1, name="model"
+                        directory=Path(tmpdir), interval=1, name="model"
                     )
                 ],
             )

From d6d14d0fb9490e3910015c8f9ef30a39db6757c7 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Tue, 26 Sep 2023 17:27:43 -0700
Subject: [PATCH 26/59] Integration tests for interaction constraints with
 column-wise data split (#9611)

---
 tests/cpp/test_learner.cc | 99 +++++++++++++++++++++++----------------
 1 file changed, 59 insertions(+), 40 deletions(-)

diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 992716db5..0526222f3 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -720,48 +720,39 @@ INSTANTIATE_TEST_SUITE_P(ColumnSplitObjective, TestColumnSplit,
                          });
 
 namespace {
-void VerifyColumnSplitColumnSampler(std::string const& tree_method, bool use_gpu,
-                                    Json const& expected_model) {
+Json GetModelWithArgs(std::shared_ptr<DMatrix> dmat, std::string const& tree_method,
+                      std::string const& device, Args const& args) {
+  std::unique_ptr<Learner> learner{Learner::Create({dmat})};
+  learner->SetParam("tree_method", tree_method);
+  learner->SetParam("device", device);
+  learner->SetParam("objective", "reg:logistic");
+  learner->SetParams(args);
+  learner->UpdateOneIter(0, dmat);
   Json model{Object{}};
-  {
-    auto const world_size = collective::GetWorldSize();
-    auto const rank = collective::GetRank();
-    auto const objective = "reg:logistic";
-    auto p_fmat = MakeFmatForObjTest(objective);
-    std::shared_ptr<DMatrix> sliced{p_fmat->SliceCol(world_size, rank)};
-    std::unique_ptr<Learner> learner{Learner::Create({sliced})};
-    learner->SetParam("tree_method", tree_method);
-    if (use_gpu) {
-      auto gpu_id = common::AllVisibleGPUs() == 1 ? 0 : rank;
-      learner->SetParam("device", "cuda:" + std::to_string(gpu_id));
-    }
-    learner->SetParam("objective", objective);
-    learner->SetParam("colsample_bytree", "0.5");
-    learner->SetParam("colsample_bylevel", "0.6");
-    learner->SetParam("colsample_bynode", "0.7");
-    learner->UpdateOneIter(0, sliced);
-    learner->SaveModel(&model);
+  learner->SaveModel(&model);
+  return model;
+}
+
+void VerifyColumnSplitWithArgs(std::string const& tree_method, bool use_gpu, Args const& args,
+                               Json const& expected_model) {
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  auto p_fmat = MakeFmatForObjTest("");
+  std::shared_ptr<DMatrix> sliced{p_fmat->SliceCol(world_size, rank)};
+  std::string device = "cpu";
+  if (use_gpu) {
+    auto gpu_id = common::AllVisibleGPUs() == 1 ? 0 : rank;
+    device = "cuda:" + std::to_string(gpu_id);
   }
+  auto model = GetModelWithArgs(sliced, tree_method, device, args);
   ASSERT_EQ(model, expected_model);
 }
 
-void TestColumnSplitColumnSampler(std::string const& tree_method, bool use_gpu) {
-  Json model{Object{}};
-  {
-    auto objective = "reg:logistic";
-    auto p_fmat = MakeFmatForObjTest(objective);
-    std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
-    learner->SetParam("tree_method", tree_method);
-    if (use_gpu) {
-      learner->SetParam("device", "cuda:0");
-    }
-    learner->SetParam("objective", objective);
-    learner->SetParam("colsample_bytree", "0.5");
-    learner->SetParam("colsample_bylevel", "0.6");
-    learner->SetParam("colsample_bynode", "0.7");
-    learner->UpdateOneIter(0, p_fmat);
-    learner->SaveModel(&model);
-  }
+void TestColumnSplitWithArgs(std::string const& tree_method, bool use_gpu, Args const& args) {
+  auto p_fmat = MakeFmatForObjTest("");
+  std::string device = use_gpu ? "cuda:0" : "cpu";
+  auto model = GetModelWithArgs(p_fmat, tree_method, device, args);
+
   auto world_size{3};
   if (use_gpu) {
     world_size = common::AllVisibleGPUs();
@@ -770,9 +761,19 @@ void TestColumnSplitColumnSampler(std::string const& tree_method, bool use_gpu)
       world_size = 3;
     }
   }
-  RunWithInMemoryCommunicator(world_size, VerifyColumnSplitColumnSampler, tree_method, use_gpu,
+  RunWithInMemoryCommunicator(world_size, VerifyColumnSplitWithArgs, tree_method, use_gpu, args,
                               model);
 }
+
+void TestColumnSplitColumnSampler(std::string const& tree_method, bool use_gpu) {
+  Args args{{"colsample_bytree", "0.5"}, {"colsample_bylevel", "0.6"}, {"colsample_bynode", "0.7"}};
+  TestColumnSplitWithArgs(tree_method, use_gpu, args);
+}
+
+void TestColumnSplitInteractionConstraints(std::string const& tree_method, bool use_gpu) {
+  Args args{{"interaction_constraints", "[[0, 5, 7], [2, 8, 9], [1, 3, 6]]"}};
+  TestColumnSplitWithArgs(tree_method, use_gpu, args);
+}
 }  // anonymous namespace
 
 TEST(ColumnSplitColumnSampler, Approx) { TestColumnSplitColumnSampler("approx", false); }
@@ -780,8 +781,26 @@ TEST(ColumnSplitColumnSampler, Approx) { TestColumnSplitColumnSampler("approx",
 TEST(ColumnSplitColumnSampler, Hist) { TestColumnSplitColumnSampler("hist", false); }
 
 #if defined(XGBOOST_USE_CUDA)
-TEST(ColumnSplitColumnSampler, GPUApprox) { TestColumnSplitColumnSampler("approx", true); }
+TEST(MGPUColumnSplitColumnSampler, GPUApprox) { TestColumnSplitColumnSampler("approx", true); }
 
-TEST(ColumnSplitColumnSampler, GPUHist) { TestColumnSplitColumnSampler("hist", true); }
+TEST(MGPUColumnSplitColumnSampler, GPUHist) { TestColumnSplitColumnSampler("hist", true); }
+#endif  // defined(XGBOOST_USE_CUDA)
+
+TEST(ColumnSplitInteractionConstraints, Approx) {
+  TestColumnSplitInteractionConstraints("approx", false);
+}
+
+TEST(ColumnSplitInteractionConstraints, Hist) {
+  TestColumnSplitInteractionConstraints("hist", false);
+}
+
+#if defined(XGBOOST_USE_CUDA)
+TEST(MGPUColumnSplitInteractionConstraints, GPUApprox) {
+  TestColumnSplitInteractionConstraints("approx", true);
+}
+
+TEST(MGPUColumnSplitInteractionConstraints, GPUHist) {
+  TestColumnSplitInteractionConstraints("hist", true);
+}
 #endif  // defined(XGBOOST_USE_CUDA)
 }  // namespace xgboost

From 7cafd41a58a1e01da0ca0fdd21a953f61a26b0d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jordan=20Fr=C3=A9ry?= <jordan.frery@gmail.com>
Date: Wed, 27 Sep 2023 20:50:01 +0200
Subject: [PATCH 27/59] [doc] Add privacy preserving tutorial (#9610)

---
 doc/tutorials/privacy_preserving.rst | 97 ++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 doc/tutorials/privacy_preserving.rst

diff --git a/doc/tutorials/privacy_preserving.rst b/doc/tutorials/privacy_preserving.rst
new file mode 100644
index 000000000..132861f7c
--- /dev/null
+++ b/doc/tutorials/privacy_preserving.rst
@@ -0,0 +1,97 @@
+#############################################
+Privacy Preserving Inference with Concrete ML
+#############################################
+
+`Concrete ML`_ is a specialized library developed by Zama that allows the execution of machine learning models on encrypted data through `Fully Homomorphic Encryption (FHE) <https://www.youtube.com/watch?v=FFox2S4uqEo>`_, thereby preserving data privacy.
+
+To use models such as XGBClassifier, use the following import:
+
+.. code:: python
+
+  from concrete.ml.sklearn import XGBClassifier
+
+***************************************
+Performing Privacy Preserving Inference
+***************************************
+
+Initialization of an XGBClassifier can be done as follows:
+
+.. code:: python
+
+  classifier = XGBClassifier(n_bits=6, [other_hyperparameters])
+
+
+where ``n_bits`` determines the precision of the input features. Note that a higher value of ``n_bits`` increases the precision of the input features and possibly the final model accuracy but also ends up with longer FHE execution time.
+
+Other hyper-parameters that exist in the XGBoost library can also be used.
+
+******************************
+Model Training and Compilation
+******************************
+
+As is common for scikit-learn-like models, the classifier can be trained with the ``.fit()`` method.
+
+.. code:: python
+
+  classifier.fit(X_train, y_train)
+
+After training, the model can be compiled with a calibration dataset, potentially a subset of the training data:
+
+.. code:: python
+
+  classifier.compile(X_calibrate)
+
+This calibration dataset, ``X_calibrate``, is used by Concrete ML to compute the precision (bit-width) of each intermediate value in the model. This is a necessary step to optimize the equivalent FHE circuit.
+
+****************************
+FHE Simulation and Execution
+****************************
+
+To verify model accuracy in encrypted computations, you can run an FHE simulation:
+
+.. code:: python
+
+  predictions = classifier.predict(X_test, fhe="simulate")
+
+This simulation can be used to evaluate the model. The resulting accuracy of this simulation step is representative of the actual FHE execution without having to pay the cost of an actual FHE execution. 
+
+When the model is ready, actual Fully Homomorphic Encryption execution can be performed:
+
+.. code:: python
+
+  predictions = classifier.predict(X_test, fhe="execute")
+
+
+Note that using ``fhe="execute"`` is a convenient way to assess the model in FHE, but for real deployment, functions to encrypt (on the client), run in FHE (on the server), and finally decrypt (on the client) have to be used for end-to-end privacy-preserving inference.
+
+Concrete ML provides a deployment API to facilitate this process, ensuring end-to-end privacy.
+
+To learn more about the deployment API, see:
+
+- the `deployment documentation <https://docs.zama.ai/concrete-ml/advanced-topics/client_server>`_
+- the `deployment notebook <https://github.com/zama-ai/concrete-ml/blob/17779ca571d20b001caff5792eb11e76fe2c19ba/docs/advanced_examples/ClientServer.ipynb>`_
+
+*******************************
+Parameter Tuning in Concrete ML
+*******************************
+
+Concrete ML is compatible with standard scikit-learn pipelines such as GridSearchCV or any other hyper-parameter tuning techniques.
+
+******************
+Examples and Demos
+******************
+
+- `Sentiment analysis (based on transformers + xgboost) <https://huggingface.co/spaces/zama-fhe/encrypted_sentiment_analysis>`_
+- `XGBoost Classifier <https://github.com/zama-ai/concrete-ml/blob/6966c84b9698d5418209b346900f81d1270c64bd/docs/advanced_examples/XGBClassifier.ipynb>`_
+- `XGBoost Regressor <https://github.com/zama-ai/concrete-ml/blob/6966c84b9698d5418209b346900f81d1270c64bd/docs/advanced_examples/XGBRegressor.ipynb>`_
+
+**********
+Conclusion
+**********
+
+Concrete ML provides a framework for executing privacy-preserving inferences by leveraging Fully Homomorphic Encryption, allowing secure and private computations on encrypted data.
+
+More information and examples are given in the `Concrete ML documentation`_.
+
+.. _Concrete ML: https://github.com/zama-ai/concrete-ml
+.. _`Concrete ML documentation`: https://docs.zama.ai/concrete-ml
\ No newline at end of file

From 3f2093fb812366c9ee03ead743d1a40029a3398a Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Wed, 27 Sep 2023 13:54:53 -0700
Subject: [PATCH 28/59] Test monotone constraints with column split (#9613)

---
 tests/cpp/test_learner.cc | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 0526222f3..692257748 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -774,6 +774,11 @@ void TestColumnSplitInteractionConstraints(std::string const& tree_method, bool
   Args args{{"interaction_constraints", "[[0, 5, 7], [2, 8, 9], [1, 3, 6]]"}};
   TestColumnSplitWithArgs(tree_method, use_gpu, args);
 }
+
+void TestColumnSplitMonotoneConstraints(std::string const& tree_method, bool use_gpu) {
+  Args args{{"monotone_constraints", "(1,-1,0,1,1,-1,-1,0,0,1)"}};
+  TestColumnSplitWithArgs(tree_method, use_gpu, args);
+}
 }  // anonymous namespace
 
 TEST(ColumnSplitColumnSampler, Approx) { TestColumnSplitColumnSampler("approx", false); }
@@ -803,4 +808,22 @@ TEST(MGPUColumnSplitInteractionConstraints, GPUHist) {
   TestColumnSplitInteractionConstraints("hist", true);
 }
 #endif  // defined(XGBOOST_USE_CUDA)
+
+TEST(ColumnSplitMonotoneConstraints, Approx) {
+  TestColumnSplitMonotoneConstraints("approx", false);
+}
+
+TEST(ColumnSplitMonotoneConstraints, Hist) {
+  TestColumnSplitMonotoneConstraints("hist", false);
+}
+
+#if defined(XGBOOST_USE_CUDA)
+TEST(MGPUColumnSplitMonotoneConstraints, GPUApprox) {
+  TestColumnSplitMonotoneConstraints("approx", true);
+}
+
+TEST(MGPUColumnSplitMonotoneConstraints, GPUHist) {
+  TestColumnSplitMonotoneConstraints("hist", true);
+}
+#endif  // defined(XGBOOST_USE_CUDA)
 }  // namespace xgboost

From 60526100e3c064adb68f68ed0c391e9cbdf99c53 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 28 Sep 2023 17:00:16 +0800
Subject: [PATCH 29/59] Support arrow through pandas ext types. (#9612)

- Use pandas extension type for pyarrow support.
- Additional support for QDM.
- Additional support for inplace_predict.
---
 doc/python/python_intro.rst     |   3 +-
 include/xgboost/c_api.h         |  18 --
 python-package/xgboost/core.py  |   4 +
 python-package/xgboost/data.py  | 127 +++++---------
 src/c_api/c_api.cc              |  29 +--
 src/data/adapter.h              | 301 --------------------------------
 src/data/arrow-cdi.h            |  66 -------
 src/data/data.cc                |   3 -
 src/data/simple_dmatrix.cc      |  74 --------
 src/data/simple_dmatrix.h       |  10 +-
 tests/python/test_with_arrow.py |  23 ++-
 11 files changed, 74 insertions(+), 584 deletions(-)
 delete mode 100644 src/data/arrow-cdi.h

diff --git a/doc/python/python_intro.rst b/doc/python/python_intro.rst
index bb74e7bc3..cc0e461e0 100644
--- a/doc/python/python_intro.rst
+++ b/doc/python/python_intro.rst
@@ -172,9 +172,8 @@ Support Matrix
 +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
 | modin.Series            | NPA       | FF                | NPA       | NPA       | FF                 |             |
 +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
-| pyarrow.Table           | T         | F                 |           | NPA       | FF                 |             |
+| pyarrow.Table           | NPA       | NPA               | NPA       | NPA       | NPA                | NPA         |
 +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
-| pyarrow.dataset.Dataset | T         | F                 |           |           | F                  |             |
 +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
 | _\_array\_\_            | NPA       | F                 | NPA       | NPA       | H                  |             |
 +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index 5df62df55..63096cb56 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -552,24 +552,6 @@ XGB_DLL int XGProxyDMatrixSetDataCSR(DMatrixHandle handle, char const *indptr,
 
 /** @} */  // End of Streaming
 
-XGB_DLL int XGImportArrowRecordBatch(DataIterHandle data_handle, void *ptr_array, void *ptr_schema);
-
-/*!
- * \brief Construct DMatrix from arrow using callbacks.  Arrow related C API is not stable
- *        and subject to change in the future.
- *
- * \param next   Callback function for fetching arrow records.
- * \param config JSON encoded configuration.  Required values are:
- *   - missing: Which value to represent missing value.
- *   - nbatch: Number of batches in arrow table.
- *   - nthread (optional): Number of threads used for initializing DMatrix.
- * \param out      The created DMatrix.
- *
- * \return 0 when success, -1 when failure happens
- */
-XGB_DLL int XGDMatrixCreateFromArrowCallback(XGDMatrixCallbackNext *next, char const *config,
-                                             DMatrixHandle *out);
-
 /*!
  * \brief create a new dmatrix from sliced content of existing matrix
  * \param handle instance of data matrix to be sliced
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index f94e60321..91c6bbd85 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -2431,6 +2431,8 @@ class Booster:
 
         from .data import (
             _array_interface,
+            _arrow_transform,
+            _is_arrow,
             _is_cudf_df,
             _is_cupy_array,
             _is_list,
@@ -2442,6 +2444,8 @@ class Booster:
         )
 
         enable_categorical = True
+        if _is_arrow(data):
+            data = _arrow_transform(data)
         if _is_pandas_series(data):
             import pandas as pd
 
diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index 0022a17d4..bfdb21c80 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -5,7 +5,7 @@ import ctypes
 import json
 import os
 import warnings
-from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, cast
+from typing import Any, Callable, List, Optional, Sequence, Tuple, cast
 
 import numpy as np
 
@@ -676,86 +676,51 @@ def _from_dt_df(
 
 
 def _is_arrow(data: DataType) -> bool:
-    try:
-        import pyarrow as pa
-        from pyarrow import dataset as arrow_dataset
-
-        return isinstance(data, (pa.Table, arrow_dataset.Dataset))
-    except ImportError:
-        return False
+    return lazy_isinstance(data, "pyarrow.lib", "Table") or lazy_isinstance(
+        data, "pyarrow._dataset", "Dataset"
+    )
 
 
-def record_batch_data_iter(data_iter: Iterator) -> Callable:
-    """Data iterator used to ingest Arrow columnar record batches. We are not using
-    class DataIter because it is only intended for building Device DMatrix and external
-    memory DMatrix.
-
-    """
-    from pyarrow.cffi import ffi
-
-    c_schemas: List[ffi.CData] = []
-    c_arrays: List[ffi.CData] = []
-
-    def _next(data_handle: int) -> int:
-        from pyarrow.cffi import ffi
-
-        try:
-            batch = next(data_iter)
-            c_schemas.append(ffi.new("struct ArrowSchema*"))
-            c_arrays.append(ffi.new("struct ArrowArray*"))
-            ptr_schema = int(ffi.cast("uintptr_t", c_schemas[-1]))
-            ptr_array = int(ffi.cast("uintptr_t", c_arrays[-1]))
-            # pylint: disable=protected-access
-            batch._export_to_c(ptr_array, ptr_schema)
-            _check_call(
-                _LIB.XGImportArrowRecordBatch(
-                    ctypes.c_void_p(data_handle),
-                    ctypes.c_void_p(ptr_array),
-                    ctypes.c_void_p(ptr_schema),
-                )
-            )
-            return 1
-        except StopIteration:
-            return 0
-
-    return _next
-
-
-def _from_arrow(
-    data: DataType,
-    missing: FloatCompatible,
-    nthread: int,
-    feature_names: Optional[FeatureNames],
-    feature_types: Optional[FeatureTypes],
-    enable_categorical: bool,
-) -> DispatchedDataBackendReturnType:
+def _arrow_transform(data: DataType) -> Any:
+    import pandas as pd
     import pyarrow as pa
+    from pyarrow.dataset import Dataset
 
-    if not all(
-        pa.types.is_integer(t) or pa.types.is_floating(t) for t in data.schema.types
-    ):
-        raise ValueError(
-            "Features in dataset can only be integers or floating point number"
-        )
-    if enable_categorical:
-        raise ValueError("categorical data in arrow is not supported yet.")
+    if isinstance(data, Dataset):
+        raise TypeError("arrow Dataset is not supported.")
 
-    batches = data.to_batches()
-    rb_iter = iter(batches)
-    it = record_batch_data_iter(rb_iter)
-    next_callback = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p)(it)
-    handle = ctypes.c_void_p()
-    config = from_pystr_to_cstr(
-        json.dumps({"missing": missing, "nthread": nthread, "nbatch": len(batches)})
-    )
-    _check_call(
-        _LIB.XGDMatrixCreateFromArrowCallback(
-            next_callback,
-            config,
-            ctypes.byref(handle),
-        )
-    )
-    return handle, feature_names, feature_types
+    data = cast(pa.Table, data)
+
+    def type_mapper(dtype: pa.DataType) -> Optional[str]:
+        """Maps pyarrow type to pandas arrow extension type."""
+        if pa.types.is_int8(dtype):
+            return pd.ArrowDtype(pa.int8())
+        if pa.types.is_int16(dtype):
+            return pd.ArrowDtype(pa.int16())
+        if pa.types.is_int32(dtype):
+            return pd.ArrowDtype(pa.int32())
+        if pa.types.is_int64(dtype):
+            return pd.ArrowDtype(pa.int64())
+        if pa.types.is_uint8(dtype):
+            return pd.ArrowDtype(pa.uint8())
+        if pa.types.is_uint16(dtype):
+            return pd.ArrowDtype(pa.uint16())
+        if pa.types.is_uint32(dtype):
+            return pd.ArrowDtype(pa.uint32())
+        if pa.types.is_uint64(dtype):
+            return pd.ArrowDtype(pa.uint64())
+        if pa.types.is_float16(dtype):
+            return pd.ArrowDtype(pa.float16())
+        if pa.types.is_float32(dtype):
+            return pd.ArrowDtype(pa.float32())
+        if pa.types.is_float64(dtype):
+            return pd.ArrowDtype(pa.float64())
+        if pa.types.is_boolean(dtype):
+            return pd.ArrowDtype(pa.bool_())
+        return None
+
+    df = data.to_pandas(types_mapper=type_mapper)
+    return df
 
 
 def _is_cudf_df(data: DataType) -> bool:
@@ -1081,6 +1046,8 @@ def dispatch_data_backend(
         return _from_list(data, missing, threads, feature_names, feature_types)
     if _is_tuple(data):
         return _from_tuple(data, missing, threads, feature_names, feature_types)
+    if _is_arrow(data):
+        data = _arrow_transform(data)
     if _is_pandas_series(data):
         import pandas as pd
 
@@ -1114,10 +1081,6 @@ def dispatch_data_backend(
         return _from_pandas_series(
             data, missing, threads, enable_categorical, feature_names, feature_types
         )
-    if _is_arrow(data):
-        return _from_arrow(
-            data, missing, threads, feature_names, feature_types, enable_categorical
-        )
     if _has_array_protocol(data):
         array = np.asarray(data)
         return _from_numpy_array(array, missing, threads, feature_names, feature_types)
@@ -1217,6 +1180,8 @@ def dispatch_meta_backend(
     if _is_np_array_like(data):
         _meta_from_numpy(data, name, dtype, handle)
         return
+    if _is_arrow(data):
+        data = _arrow_transform(data)
     if _is_pandas_df(data):
         data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
         _meta_from_numpy(data, name, dtype, handle)
@@ -1311,6 +1276,8 @@ def _proxy_transform(
         import pandas as pd
 
         data = pd.DataFrame(data)
+    if _is_arrow(data):
+        data = _arrow_transform(data)
     if _is_pandas_df(data):
         arr, feature_names, feature_types = _transform_pandas_df(
             data, enable_categorical, feature_names, feature_types
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index f6ab8d4df..f8b0aa3de 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -532,33 +532,8 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data, const char** feature_stypes,
   API_END();
 }
 
-XGB_DLL int XGImportArrowRecordBatch(DataIterHandle data_handle, void *ptr_array,
-                                     void *ptr_schema) {
-  API_BEGIN();
-  static_cast<data::RecordBatchesIterAdapter *>(data_handle)
-      ->SetData(static_cast<struct ArrowArray *>(ptr_array),
-                static_cast<struct ArrowSchema *>(ptr_schema));
-  API_END();
-}
-
-XGB_DLL int XGDMatrixCreateFromArrowCallback(XGDMatrixCallbackNext *next, char const *config,
-                                             DMatrixHandle *out) {
-  API_BEGIN();
-  xgboost_CHECK_C_ARG_PTR(config);
-  auto jconfig = Json::Load(StringView{config});
-  auto missing = GetMissing(jconfig);
-  auto n_batches = RequiredArg<Integer>(jconfig, "nbatch", __func__);
-  auto n_threads = OptionalArg<Integer, std::int64_t>(jconfig, "nthread", 0);
-  data::RecordBatchesIterAdapter adapter(next, n_batches);
-  xgboost_CHECK_C_ARG_PTR(out);
-  *out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
-  API_END();
-}
-
-XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle,
-                                  const int* idxset,
-                                  xgboost::bst_ulong len,
-                                  DMatrixHandle* out) {
+XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle, const int *idxset, xgboost::bst_ulong len,
+                                  DMatrixHandle *out) {
   xgboost_CHECK_C_ARG_PTR(out);
   return XGDMatrixSliceDMatrixEx(handle, idxset, len, out, 0);
 }
diff --git a/src/data/adapter.h b/src/data/adapter.h
index 1463a13a7..e7eaa372f 100644
--- a/src/data/adapter.h
+++ b/src/data/adapter.h
@@ -20,7 +20,6 @@
 #include "../common/error_msg.h"  // for MaxFeatureSize
 #include "../common/math.h"
 #include "array_interface.h"
-#include "arrow-cdi.h"
 #include "xgboost/base.h"
 #include "xgboost/data.h"
 #include "xgboost/logging.h"
@@ -899,306 +898,6 @@ class Column {
   const uint8_t* bitmap_;
 };
 
-// Only columns of primitive types are supported. An ArrowColumnarBatch is a
-// collection of std::shared_ptr<PrimitiveColumn>. These columns can be of different data types.
-// Hence, PrimitiveColumn is a class template; and all concrete PrimitiveColumns
-// derive from the abstract class Column.
-template <typename T>
-class PrimitiveColumn : public Column {
-  static constexpr float kNaN = std::numeric_limits<float>::quiet_NaN();
-
- public:
-  PrimitiveColumn(size_t idx, size_t length, size_t null_count,
-                  const uint8_t* bitmap, const T* data, float missing)
-    : Column{idx, length, null_count, bitmap}, data_{data}, missing_{missing} {}
-
-  COOTuple GetElement(size_t row_idx) const override {
-    CHECK(data_ && row_idx < length_) << "Column is empty or out-of-bound index of the column";
-    return { row_idx, col_idx_, IsValidElement(row_idx) ?
-                  static_cast<float>(data_[row_idx]) : kNaN };
-  }
-
-  bool IsValidElement(size_t row_idx) const override {
-    // std::isfinite needs to cast to double to prevent msvc report error
-    return IsValid(row_idx)
-            && std::isfinite(static_cast<double>(data_[row_idx]))
-            && static_cast<float>(data_[row_idx]) != missing_;
-  }
-
-  std::vector<float> AsFloatVector() const override {
-    CHECK(data_) << "Column is empty";
-    std::vector<float> fv(length_);
-    std::transform(data_, data_ + length_, fv.begin(),
-        [](T v) { return static_cast<float>(v); });
-    return fv;
-  }
-
-  std::vector<uint64_t> AsUint64Vector() const override {
-    CHECK(data_) << "Column is empty";
-    std::vector<uint64_t> iv(length_);
-    std::transform(data_, data_ + length_, iv.begin(),
-        [](T v) { return static_cast<uint64_t>(v); });
-    return iv;
-  }
-
- private:
-  const T* data_;
-  float missing_;  // user specified missing value
-};
-
-struct ColumnarMetaInfo {
-  // data type of the column
-  ColumnDType type{ColumnDType::kUnknown};
-  // location of the column in an Arrow record batch
-  int64_t loc{-1};
-};
-
-struct ArrowSchemaImporter {
-  std::vector<ColumnarMetaInfo> columns;
-
-  // map Arrow format strings to types
-  static ColumnDType FormatMap(char const* format_str) {
-    CHECK(format_str) << "Format string cannot be empty";
-    switch (format_str[0]) {
-      case 'c':
-        return ColumnDType::kInt8;
-      case 'C':
-        return ColumnDType::kUInt8;
-      case 's':
-        return ColumnDType::kInt16;
-      case 'S':
-        return ColumnDType::kUInt16;
-      case 'i':
-        return ColumnDType::kInt32;
-      case 'I':
-        return ColumnDType::kUInt32;
-      case 'l':
-        return ColumnDType::kInt64;
-      case 'L':
-        return ColumnDType::kUInt64;
-      case 'f':
-        return ColumnDType::kFloat;
-      case 'g':
-        return ColumnDType::kDouble;
-      default:
-        CHECK(false) << "Column data type not supported by XGBoost";
-        return ColumnDType::kUnknown;
-    }
-  }
-
-  void Import(struct ArrowSchema *schema) {
-    if (schema) {
-      CHECK(std::string(schema->format) == "+s"); // NOLINT
-      CHECK(columns.empty());
-      for (auto i = 0; i < schema->n_children; ++i) {
-        std::string name{schema->children[i]->name};
-        ColumnDType type = FormatMap(schema->children[i]->format);
-        ColumnarMetaInfo col_info{type, i};
-        columns.push_back(col_info);
-      }
-      if (schema->release) {
-        schema->release(schema);
-      }
-    }
-  }
-};
-
-class ArrowColumnarBatch {
- public:
-  ArrowColumnarBatch(struct ArrowArray *rb, struct ArrowSchemaImporter* schema)
-    : rb_{rb}, schema_{schema} {
-    CHECK(rb_) << "Cannot import non-existent record batch";
-    CHECK(!schema_->columns.empty()) << "Cannot import record batch without a schema";
-  }
-
-  size_t Import(float missing) {
-    auto& infov = schema_->columns;
-    for (size_t i = 0; i < infov.size(); ++i) {
-      columns_.push_back(CreateColumn(i, infov[i], missing));
-    }
-
-    // Compute the starting location for every row in this batch
-    auto batch_size = rb_->length;
-    auto num_columns = columns_.size();
-    row_offsets_.resize(batch_size + 1, 0);
-    for (auto i = 0; i < batch_size; ++i) {
-      row_offsets_[i+1] = row_offsets_[i];
-      for (size_t j = 0; j < num_columns; ++j) {
-        if (GetColumn(j).IsValidElement(i)) {
-          row_offsets_[i+1]++;
-        }
-      }
-    }
-    // return number of elements in the batch
-    return row_offsets_.back();
-  }
-
-  ArrowColumnarBatch(const ArrowColumnarBatch&) = delete;
-  ArrowColumnarBatch& operator=(const ArrowColumnarBatch&) = delete;
-  ArrowColumnarBatch(ArrowColumnarBatch&&) = delete;
-  ArrowColumnarBatch& operator=(ArrowColumnarBatch&&) = delete;
-
-  virtual ~ArrowColumnarBatch() {
-    if (rb_ && rb_->release) {
-      rb_->release(rb_);
-      rb_ = nullptr;
-    }
-    columns_.clear();
-  }
-
-  size_t Size() const { return rb_ ? rb_->length : 0; }
-
-  size_t NumColumns() const { return columns_.size(); }
-
-  size_t NumElements() const { return row_offsets_.back(); }
-
-  const Column& GetColumn(size_t col_idx) const {
-    return *columns_[col_idx];
-  }
-
-  void ShiftRowOffsets(size_t batch_offset) {
-    std::transform(row_offsets_.begin(), row_offsets_.end(), row_offsets_.begin(),
-        [=](size_t c) { return c + batch_offset; });
-  }
-
-  const std::vector<size_t>& RowOffsets() const { return row_offsets_; }
-
- private:
-  std::shared_ptr<Column> CreateColumn(size_t idx,
-                                      ColumnarMetaInfo info,
-                                      float missing) const {
-    if (info.loc < 0) {
-      return nullptr;
-    }
-
-    auto loc_in_batch = info.loc;
-    auto length = rb_->length;
-    auto null_count = rb_->null_count;
-    auto buffers0 = rb_->children[loc_in_batch]->buffers[0];
-    auto buffers1 = rb_->children[loc_in_batch]->buffers[1];
-    const uint8_t* bitmap = buffers0 ? reinterpret_cast<const uint8_t*>(buffers0) : nullptr;
-    const uint8_t* data = buffers1 ? reinterpret_cast<const uint8_t*>(buffers1) : nullptr;
-
-    // if null_count is not computed, compute it here
-    if (null_count < 0) {
-      if (!bitmap) {
-        null_count = 0;
-      } else {
-        null_count = length;
-        for (auto i = 0; i < length; ++i) {
-          if (bitmap[i/8] & (1 << (i%8))) {
-            null_count--;
-          }
-        }
-      }
-    }
-
-    switch (info.type) {
-      case ColumnDType::kInt8:
-        return std::make_shared<PrimitiveColumn<int8_t>>(
-            idx, length, null_count, bitmap,
-            reinterpret_cast<const int8_t*>(data), missing);
-      case ColumnDType::kUInt8:
-        return std::make_shared<PrimitiveColumn<uint8_t>>(
-            idx, length, null_count, bitmap, data, missing);
-      case ColumnDType::kInt16:
-        return std::make_shared<PrimitiveColumn<int16_t>>(
-            idx, length, null_count, bitmap,
-            reinterpret_cast<const int16_t*>(data), missing);
-      case ColumnDType::kUInt16:
-        return std::make_shared<PrimitiveColumn<uint16_t>>(
-            idx, length, null_count, bitmap,
-            reinterpret_cast<const uint16_t*>(data), missing);
-      case ColumnDType::kInt32:
-        return std::make_shared<PrimitiveColumn<int32_t>>(
-            idx, length, null_count, bitmap,
-            reinterpret_cast<const int32_t*>(data), missing);
-      case ColumnDType::kUInt32:
-        return std::make_shared<PrimitiveColumn<uint32_t>>(
-            idx, length, null_count, bitmap,
-            reinterpret_cast<const uint32_t*>(data), missing);
-      case ColumnDType::kInt64:
-        return std::make_shared<PrimitiveColumn<int64_t>>(
-            idx, length, null_count, bitmap,
-            reinterpret_cast<const int64_t*>(data), missing);
-      case ColumnDType::kUInt64:
-        return std::make_shared<PrimitiveColumn<uint64_t>>(
-            idx, length, null_count, bitmap,
-            reinterpret_cast<const uint64_t*>(data), missing);
-      case ColumnDType::kFloat:
-        return std::make_shared<PrimitiveColumn<float>>(
-            idx, length, null_count, bitmap,
-            reinterpret_cast<const float*>(data), missing);
-      case ColumnDType::kDouble:
-        return std::make_shared<PrimitiveColumn<double>>(
-            idx, length, null_count, bitmap,
-            reinterpret_cast<const double*>(data), missing);
-      default:
-        return nullptr;
-    }
-  }
-
-  struct ArrowArray* rb_;
-  struct ArrowSchemaImporter* schema_;
-  std::vector<std::shared_ptr<Column>> columns_;
-  std::vector<size_t> row_offsets_;
-};
-
-using ArrowColumnarBatchVec = std::vector<std::unique_ptr<ArrowColumnarBatch>>;
-class RecordBatchesIterAdapter: public dmlc::DataIter<ArrowColumnarBatchVec> {
- public:
-  RecordBatchesIterAdapter(XGDMatrixCallbackNext* next_callback, int nbatch)
-      : next_callback_{next_callback}, nbatches_{nbatch} {}
-
-  void BeforeFirst() override {
-    CHECK(at_first_) << "Cannot reset RecordBatchesIterAdapter";
-  }
-
-  bool Next() override {
-    batches_.clear();
-    while (batches_.size() < static_cast<size_t>(nbatches_) && (*next_callback_)(this) != 0) {
-      at_first_ = false;
-    }
-
-    if (batches_.size() > 0) {
-      return true;
-    } else {
-      return false;
-    }
-  }
-
-  void SetData(struct ArrowArray* rb, struct ArrowSchema* schema) {
-    // Schema is only imported once at the beginning, regardless how many
-    // baches are comming.
-    // But even schema is not imported we still need to release its C data
-    // exported from Arrow.
-    if (at_first_ && schema) {
-      schema_.Import(schema);
-    } else {
-      if (schema && schema->release) {
-        schema->release(schema);
-      }
-    }
-    if (rb) {
-      batches_.push_back(std::make_unique<ArrowColumnarBatch>(rb, &schema_));
-    }
-  }
-
-  const ArrowColumnarBatchVec& Value() const override {
-    return batches_;
-  }
-
-  size_t NumColumns() const { return schema_.columns.size(); }
-  size_t NumRows() const { return kAdapterUnknownSize; }
-
- private:
-  XGDMatrixCallbackNext *next_callback_;
-  bool at_first_{true};
-  int nbatches_;
-  struct ArrowSchemaImporter schema_;
-  ArrowColumnarBatchVec batches_;
-};
-
 class SparsePageAdapterBatch {
   HostSparsePageView page_;
 
diff --git a/src/data/arrow-cdi.h b/src/data/arrow-cdi.h
deleted file mode 100644
index 2cb061b3a..000000000
--- a/src/data/arrow-cdi.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#pragma once
-
-#include <cstdint>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ARROW_FLAG_DICTIONARY_ORDERED 1
-#define ARROW_FLAG_NULLABLE 2
-#define ARROW_FLAG_MAP_KEYS_SORTED 4
-
-struct ArrowSchema {
-  // Array type description
-  const char* format;
-  const char* name;
-  const char* metadata;
-  int64_t flags;
-  int64_t n_children;
-  struct ArrowSchema** children;
-  struct ArrowSchema* dictionary;
-
-  // Release callback
-  void (*release)(struct ArrowSchema*);
-  // Opaque producer-specific data
-  void* private_data;
-};
-
-struct ArrowArray {
-  // Array data description
-  int64_t length;
-  int64_t null_count;
-  int64_t offset;
-  int64_t n_buffers;
-  int64_t n_children;
-  const void** buffers;
-  struct ArrowArray** children;
-  struct ArrowArray* dictionary;
-
-  // Release callback
-  void (*release)(struct ArrowArray*);
-  // Opaque producer-specific data
-  void* private_data;
-};
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/src/data/data.cc b/src/data/data.cc
index 92547dafd..4a2bef6be 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -1011,9 +1011,6 @@ template DMatrix* DMatrix::Create<data::CSCArrayAdapter>(data::CSCArrayAdapter*
 template DMatrix* DMatrix::Create(
     data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
     float missing, int nthread, const std::string& cache_prefix, DataSplitMode data_split_mode);
-template DMatrix* DMatrix::Create<data::RecordBatchesIterAdapter>(
-    data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&,
-    DataSplitMode data_split_mode);
 
 SparsePage SparsePage::GetTranspose(int num_columns, int32_t n_threads) const {
   SparsePage transpose;
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index bf7b27eb7..0adf6b466 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -361,78 +361,4 @@ template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int n
 template SimpleDMatrix::SimpleDMatrix(
     IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
     float missing, int nthread, DataSplitMode data_split_mode);
-
-template <>
-SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread,
-                             DataSplitMode data_split_mode) {
-  Context ctx;
-  ctx.nthread = nthread;
-
-  auto& offset_vec = sparse_page_->offset.HostVector();
-  auto& data_vec = sparse_page_->data.HostVector();
-  uint64_t total_batch_size = 0;
-  uint64_t total_elements = 0;
-
-  adapter->BeforeFirst();
-  // Iterate over batches of input data
-  while (adapter->Next()) {
-    auto& batches = adapter->Value();
-    size_t num_elements = 0;
-    size_t num_rows = 0;
-    // Import Arrow RecordBatches
-#pragma omp parallel for reduction(+ : num_elements, num_rows) num_threads(ctx.Threads())
-    for (int i = 0; i < static_cast<int>(batches.size()); ++i) {  // NOLINT
-      num_elements += batches[i]->Import(missing);
-      num_rows += batches[i]->Size();
-    }
-    total_elements += num_elements;
-    total_batch_size += num_rows;
-    // Compute global offset for every row and starting row for every batch
-    std::vector<uint64_t> batch_offsets(batches.size());
-    for (size_t i = 0; i < batches.size(); ++i) {
-      if (i == 0) {
-        batch_offsets[i] = total_batch_size - num_rows;
-        batches[i]->ShiftRowOffsets(total_elements - num_elements);
-      } else {
-        batch_offsets[i] = batch_offsets[i - 1] + batches[i - 1]->Size();
-        batches[i]->ShiftRowOffsets(batches[i - 1]->RowOffsets().back());
-      }
-    }
-    // Pre-allocate DMatrix memory
-    data_vec.resize(total_elements);
-    offset_vec.resize(total_batch_size + 1);
-    // Copy data into DMatrix
-#pragma omp parallel num_threads(ctx.Threads())
-    {
-#pragma omp for nowait
-      for (int i = 0; i < static_cast<int>(batches.size()); ++i) {  // NOLINT
-        size_t begin = batches[i]->RowOffsets()[0];
-        for (size_t k = 0; k < batches[i]->Size(); ++k) {
-          for (size_t j = 0; j < batches[i]->NumColumns(); ++j) {
-            auto element = batches[i]->GetColumn(j).GetElement(k);
-            if (!std::isnan(element.value)) {
-              data_vec[begin++] = Entry(element.column_idx, element.value);
-            }
-          }
-        }
-      }
-#pragma omp for nowait
-      for (int i = 0; i < static_cast<int>(batches.size()); ++i) {
-        auto& offsets = batches[i]->RowOffsets();
-        std::copy(offsets.begin() + 1, offsets.end(), offset_vec.begin() + batch_offsets[i] + 1);
-      }
-    }
-  }
-  // Synchronise worker columns
-  info_.num_col_ = adapter->NumColumns();
-  info_.data_split_mode = data_split_mode;
-  ReindexFeatures(&ctx);
-  info_.SynchronizeNumberOfColumns();
-
-  info_.num_row_ = total_batch_size;
-  info_.num_nonzero_ = data_vec.size();
-  CHECK_EQ(offset_vec.back(), info_.num_nonzero_);
-
-  fmat_ctx_ = ctx;
-}
 }  // namespace xgboost::data
diff --git a/src/data/simple_dmatrix.h b/src/data/simple_dmatrix.h
index 56685c1e6..d6164894a 100644
--- a/src/data/simple_dmatrix.h
+++ b/src/data/simple_dmatrix.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2015-2022 by XGBoost Contributors
+/**
+ * Copyright 2015-2023, XGBoost Contributors
  * \file simple_dmatrix.h
  * \brief In-memory version of DMatrix.
  * \author Tianqi Chen
@@ -15,8 +15,7 @@
 
 #include "gradient_index.h"
 
-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 // Used for single batch data.
 class SimpleDMatrix : public DMatrix {
  public:
@@ -75,6 +74,5 @@ class SimpleDMatrix : public DMatrix {
   // Context used only for DMatrix initialization.
   Context fmat_ctx_;
 };
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
 #endif  // XGBOOST_DATA_SIMPLE_DMATRIX_H_
diff --git a/tests/python/test_with_arrow.py b/tests/python/test_with_arrow.py
index 8b7bce9eb..4673a688e 100644
--- a/tests/python/test_with_arrow.py
+++ b/tests/python/test_with_arrow.py
@@ -22,7 +22,7 @@ pytestmark = pytest.mark.skipif(
 dpath = "demo/data/"
 
 
-class TestArrowTable(unittest.TestCase):
+class TestArrowTable:
     def test_arrow_table(self):
         df = pd.DataFrame(
             [[0, 1, 2.0, 3.0], [1, 2, 3.0, 4.0]], columns=["a", "b", "c", "d"]
@@ -52,7 +52,8 @@ class TestArrowTable(unittest.TestCase):
         assert dm.num_row() == 4
         assert dm.num_col() == 3
 
-    def test_arrow_train(self):
+    @pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix])
+    def test_arrow_train(self, DMatrixT):
         import pandas as pd
 
         rows = 100
@@ -64,16 +65,24 @@ class TestArrowTable(unittest.TestCase):
             }
         )
         y = pd.Series(np.random.randn(rows))
+
         table = pa.Table.from_pandas(X)
-        dtrain1 = xgb.DMatrix(table)
-        dtrain1.set_label(y)
+        dtrain1 = DMatrixT(table)
+        dtrain1.set_label(pa.Table.from_pandas(pd.DataFrame(y)))
         bst1 = xgb.train({}, dtrain1, num_boost_round=10)
-        preds1 = bst1.predict(xgb.DMatrix(X))
-        dtrain2 = xgb.DMatrix(X, y)
+        preds1 = bst1.predict(DMatrixT(X))
+
+        dtrain2 = DMatrixT(X, y)
         bst2 = xgb.train({}, dtrain2, num_boost_round=10)
-        preds2 = bst2.predict(xgb.DMatrix(X))
+        preds2 = bst2.predict(DMatrixT(X))
+
         np.testing.assert_allclose(preds1, preds2)
 
+        preds3 = bst2.inplace_predict(table)
+        np.testing.assert_allclose(preds1, preds3)
+        assert bst2.feature_names == ["A", "B", "C"]
+        assert bst2.feature_types == ["int", "float", "int"]
+
     def test_arrow_survival(self):
         data = os.path.join(tm.data_dir(__file__), "veterans_lung_cancer.csv")
         table = pc.read_csv(data)

From 295f13ef09b3fb44089bac4979af64bbed612986 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jordan=20Fr=C3=A9ry?= <jordan.frery@zama.ai>
Date: Thu, 28 Sep 2023 12:53:29 +0200
Subject: [PATCH 30/59] Add privacy preserving tutorial to index.rst (#9614)

---
 doc/tutorials/index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/tutorials/index.rst b/doc/tutorials/index.rst
index 7693173e9..5d090ce65 100644
--- a/doc/tutorials/index.rst
+++ b/doc/tutorials/index.rst
@@ -30,3 +30,4 @@ See `Awesome XGBoost <https://github.com/dmlc/xgboost/tree/master/demo>`_ for mo
   input_format
   param_tuning
   custom_metric_obj
+  privacy_preserving
\ No newline at end of file

From 417c3ba47e3b60d16cecff81eb59547e9fc36d9a Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 28 Sep 2023 22:51:47 +0800
Subject: [PATCH 31/59] Workaround Apple clang issue. (#9615)

---
 src/common/io.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/common/io.h b/src/common/io.h
index 07bb60787..2eb62b094 100644
--- a/src/common/io.h
+++ b/src/common/io.h
@@ -382,7 +382,8 @@ class PrivateMmapConstStream : public AlignedResourceReadStream {
    * @param length    See the `length` parameter of `mmap` for details.
    */
   explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
-      : AlignedResourceReadStream{std::make_shared<MmapResource>(path, offset, length)} {}
+      : AlignedResourceReadStream{std::shared_ptr<MmapResource>{  // NOLINT
+            new MmapResource{std::move(path), offset, length}}} {}
   ~PrivateMmapConstStream() noexcept(false) override;
 };
 

From d95be1c38d8a3877197a8171d5bac24b26cddf35 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 29 Sep 2023 00:39:07 +0800
Subject: [PATCH 32/59] Small cleanup to jvm iter adapter. (#9616)

- Remove header dependency on c_api
- Remove remaining code for arrow.
---
 R-package/src/Makevars.in  |  1 +
 R-package/src/Makevars.win |  1 +
 src/c_api/c_api.cc         |  4 +-
 src/data/adapter.cc        | 28 +++++++++++++
 src/data/adapter.h         | 85 +++++---------------------------------
 5 files changed, 43 insertions(+), 76 deletions(-)
 create mode 100644 src/data/adapter.cc

diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index f42c94501..e3af418e3 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -62,6 +62,7 @@ OBJECTS= \
     $(PKGROOT)/src/gbm/gbtree_model.o \
     $(PKGROOT)/src/gbm/gblinear.o \
     $(PKGROOT)/src/gbm/gblinear_model.o \
+    $(PKGROOT)/src/data/adapter.o \
     $(PKGROOT)/src/data/simple_dmatrix.o \
     $(PKGROOT)/src/data/data.o \
     $(PKGROOT)/src/data/sparse_page_raw_format.o \
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index 1b620751f..8f003403f 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -62,6 +62,7 @@ OBJECTS= \
     $(PKGROOT)/src/gbm/gbtree_model.o \
     $(PKGROOT)/src/gbm/gblinear.o \
     $(PKGROOT)/src/gbm/gblinear_model.o \
+    $(PKGROOT)/src/data/adapter.o \
     $(PKGROOT)/src/data/simple_dmatrix.o \
     $(PKGROOT)/src/data/data.o \
     $(PKGROOT)/src/data/sparse_page_raw_format.o \
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index f8b0aa3de..8cead56d8 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -269,8 +269,8 @@ XGB_DLL int XGDMatrixCreateFromDataIter(
   if (cache_info != nullptr) {
     scache = cache_info;
   }
-  xgboost::data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext,
-                                 XGBoostBatchCSR> adapter(data_handle, callback);
+  xgboost::data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR> adapter(
+      data_handle, callback);
   xgboost_CHECK_C_ARG_PTR(out);
   *out = new std::shared_ptr<DMatrix> {
     DMatrix::Create(
diff --git a/src/data/adapter.cc b/src/data/adapter.cc
new file mode 100644
index 000000000..4fa171c9d
--- /dev/null
+++ b/src/data/adapter.cc
@@ -0,0 +1,28 @@
+/**
+ *  Copyright 2019-2023, XGBoost Contributors
+ */
+#include "adapter.h"
+
+#include "../c_api/c_api_error.h"  // for API_BEGIN, API_END
+#include "xgboost/c_api.h"
+
+namespace xgboost::data {
+template <typename DataIterHandle, typename XGBCallbackDataIterNext, typename XGBoostBatchCSR>
+bool IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>::Next() {
+  if ((*next_callback_)(
+          data_handle_,
+          [](void *handle, XGBoostBatchCSR batch) -> int {
+            API_BEGIN();
+            static_cast<IteratorAdapter *>(handle)->SetData(batch);
+            API_END();
+          },
+          this) != 0) {
+    at_first_ = false;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+template class IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>;
+}  // namespace xgboost::data
diff --git a/src/data/adapter.h b/src/data/adapter.h
index e7eaa372f..9e7058aba 100644
--- a/src/data/adapter.h
+++ b/src/data/adapter.h
@@ -1,5 +1,5 @@
-/*!
- *  Copyright (c) 2019~2021 by Contributors
+/**
+ *  Copyright 2019-2023, XGBoost Contributors
  * \file adapter.h
  */
 #ifndef XGBOOST_DATA_ADAPTER_H_
@@ -16,7 +16,6 @@
 #include <utility>  // std::move
 #include <vector>
 
-#include "../c_api/c_api_error.h"
 #include "../common/error_msg.h"  // for MaxFeatureSize
 #include "../common/math.h"
 #include "array_interface.h"
@@ -742,8 +741,10 @@ class FileAdapter : dmlc::DataIter<FileAdapterBatch> {
   dmlc::Parser<uint32_t>* parser_;
 };
 
-/*! \brief Data iterator that takes callback to return data, used in JVM package for
- *  accepting data iterator. */
+/**
+ * @brief Data iterator that takes callback to return data, used in JVM package for accepting data
+ *        iterator.
+ */
 template <typename DataIterHandle, typename XGBCallbackDataIterNext, typename XGBoostBatchCSR>
 class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
  public:
@@ -757,23 +758,9 @@ class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
     CHECK(at_first_) << "Cannot reset IteratorAdapter";
   }
 
-  bool Next() override {
-    if ((*next_callback_)(
-            data_handle_,
-            [](void *handle, XGBoostBatchCSR batch) -> int {
-              API_BEGIN();
-              static_cast<IteratorAdapter *>(handle)->SetData(batch);
-              API_END();
-            },
-            this) != 0) {
-      at_first_ = false;
-      return true;
-    } else {
-      return false;
-    }
-  }
+  [[nodiscard]] bool Next() override;
 
-  FileAdapterBatch const& Value() const override {
+  [[nodiscard]] FileAdapterBatch const& Value() const override {
     return *batch_.get();
   }
 
@@ -821,12 +808,12 @@ class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
     block_.index = dmlc::BeginPtr(index_);
     block_.value = dmlc::BeginPtr(value_);
 
-    batch_.reset(new FileAdapterBatch(&block_, row_offset_));
+    batch_ = std::make_unique<FileAdapterBatch>(&block_, row_offset_);
     row_offset_ += offset_.size() - 1;
   }
 
-  size_t NumColumns() const { return columns_; }
-  size_t NumRows() const { return kAdapterUnknownSize; }
+  [[nodiscard]] std::size_t NumColumns() const { return columns_; }
+  [[nodiscard]] std::size_t NumRows() const { return kAdapterUnknownSize; }
 
  private:
   std::vector<size_t> offset_;
@@ -848,56 +835,6 @@ class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
   std::unique_ptr<FileAdapterBatch> batch_;
 };
 
-enum ColumnDType : uint8_t {
-  kUnknown,
-  kInt8,
-  kUInt8,
-  kInt16,
-  kUInt16,
-  kInt32,
-  kUInt32,
-  kInt64,
-  kUInt64,
-  kFloat,
-  kDouble
-};
-
-class Column {
- public:
-  Column() = default;
-
-  Column(size_t col_idx, size_t length, size_t null_count, const uint8_t* bitmap)
-    : col_idx_{col_idx}, length_{length}, null_count_{null_count}, bitmap_{bitmap} {}
-
-  virtual ~Column() = default;
-
-  Column(const Column&) = delete;
-  Column& operator=(const Column&) = delete;
-  Column(Column&&) = delete;
-  Column& operator=(Column&&) = delete;
-
-  // whether the valid bit is set for this element
-  bool IsValid(size_t row_idx) const {
-    return (!bitmap_ || (bitmap_[row_idx/8] & (1 << (row_idx%8))));
-  }
-
-  virtual COOTuple GetElement(size_t row_idx) const = 0;
-
-  virtual bool IsValidElement(size_t row_idx) const = 0;
-
-  virtual std::vector<float> AsFloatVector() const = 0;
-
-  virtual std::vector<uint64_t> AsUint64Vector() const = 0;
-
-  size_t Length() const { return length_; }
-
- protected:
-  size_t col_idx_;
-  size_t length_;
-  size_t null_count_;
-  const uint8_t* bitmap_;
-};
-
 class SparsePageAdapterBatch {
   HostSparsePageView page_;
 

From 4d7a187cb0ebc2194abc1efe0aaee6f06af937c5 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 29 Sep 2023 02:29:33 +0800
Subject: [PATCH 33/59] Remove `XGBoosterGetModelRaw`. (#9617)

Deprecated in 1.6.
---
 include/xgboost/c_api.h |  9 ---------
 src/c_api/c_api.cc      | 23 -----------------------
 2 files changed, 32 deletions(-)

diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index 63096cb56..ce2d8bf43 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -1258,15 +1258,6 @@ XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
 XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *config, bst_ulong *out_len,
                                        char const **out_dptr);
 
-/*!
- * \brief Save booster to a buffer with in binary format.
- *
- * \deprecated since 1.6.0
- * \see XGBoosterSaveModelToBuffer()
- */
-XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, bst_ulong *out_len,
-                                 const char **out_dptr);
-
 /*!
  * \brief Memory snapshot based serialization method.  Saves everything states
  * into buffer.
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 8cead56d8..858047af8 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1348,29 +1348,6 @@ XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *json_co
   API_END();
 }
 
-XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, xgboost::bst_ulong *out_len,
-                                 const char **out_dptr) {
-  API_BEGIN();
-  CHECK_HANDLE();
-
-  auto *learner = static_cast<Learner*>(handle);
-  std::string& raw_str = learner->GetThreadLocal().ret_str;
-  raw_str.resize(0);
-
-  common::MemoryBufferStream fo(&raw_str);
-  LOG(WARNING) << error::DeprecatedFunc(__func__, "1.6.0", "XGBoosterSaveModelToBuffer");
-
-  learner->Configure();
-  learner->SaveModel(&fo);
-
-  xgboost_CHECK_C_ARG_PTR(out_dptr);
-  xgboost_CHECK_C_ARG_PTR(out_len);
-
-  *out_dptr = dmlc::BeginPtr(raw_str);
-  *out_len = static_cast<xgboost::bst_ulong>(raw_str.length());
-  API_END();
-}
-
 // The following two functions are `Load` and `Save` for memory based
 // serialization methods. E.g. Python pickle.
 XGB_DLL int XGBoosterSerializeToBuffer(BoosterHandle handle, xgboost::bst_ulong *out_len,

From 799f8485e2bb832189f0d5ea4e5d708679a1f8ab Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sat, 7 Oct 2023 20:42:09 -0500
Subject: [PATCH 34/59] [R] [CI] enforce
 lintr::function_left_parentheses_linter check (#9631)

---
 R-package/R/xgb.Booster.R                     |  6 ++---
 R-package/R/xgb.DMatrix.R                     |  2 +-
 R-package/R/xgb.load.raw.R                    |  2 +-
 R-package/R/xgb.unserialize.R                 |  2 +-
 R-package/R/xgboost.R                         |  2 +-
 R-package/demo/cross_validation.R             |  2 +-
 R-package/demo/custom_objective.R             |  4 ++--
 R-package/demo/early_stopping.R               |  2 +-
 .../tests/helper_scripts/generate_models.R    | 22 +++++++++----------
 .../tests/testthat/test_model_compatibility.R | 10 ++++-----
 R-package/tests/testthat/test_ranking.R       |  2 +-
 demo/kaggle-higgs/higgs-train.R               |  4 ++--
 demo/kaggle-higgs/speedtest.R                 |  4 ++--
 tests/ci_build/lint_r.R                       |  1 +
 14 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index f196b1a1d..37cfc199e 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -21,13 +21,13 @@ xgb.Booster.handle <- function(params, cachelist, modelfile, handle) {
       ## A memory buffer
       bst <- xgb.unserialize(modelfile, handle)
       xgb.parameters(bst) <- params
-      return (bst)
+      return(bst)
     } else if (inherits(modelfile, "xgb.Booster")) {
       ## A booster object
       bst <- xgb.Booster.complete(modelfile, saveraw = TRUE)
       bst <- xgb.unserialize(bst$raw)
       xgb.parameters(bst) <- params
-      return (bst)
+      return(bst)
     } else {
       stop("modelfile must be either character filename, or raw booster dump, or xgb.Booster object")
     }
@@ -382,7 +382,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
       cval[0] <- val
       return(cval)
     }
-    return (val)
+    return(val)
   }
 
   ## We set strict_shape to TRUE then drop the dimensions conditionally
diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R
index 8586ae086..b01e98637 100644
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -117,7 +117,7 @@ xgb.get.DMatrix <- function(data, label, missing, weight, nthread) {
       stop("xgboost: invalid input data")
     }
   }
-  return (dtrain)
+  return(dtrain)
 }
 
 
diff --git a/R-package/R/xgb.load.raw.R b/R-package/R/xgb.load.raw.R
index d531da6c9..b159e9de1 100644
--- a/R-package/R/xgb.load.raw.R
+++ b/R-package/R/xgb.load.raw.R
@@ -18,6 +18,6 @@ xgb.load.raw <- function(buffer, as_booster = FALSE) {
     booster <- xgb.Booster.complete(booster, saveraw = TRUE)
     return(booster)
   } else {
-    return (handle)
+    return(handle)
   }
 }
diff --git a/R-package/R/xgb.unserialize.R b/R-package/R/xgb.unserialize.R
index e666eb055..291d3e7da 100644
--- a/R-package/R/xgb.unserialize.R
+++ b/R-package/R/xgb.unserialize.R
@@ -37,5 +37,5 @@ xgb.unserialize <- function(buffer, handle = NULL) {
       }
     })
   class(handle) <- "xgb.Booster.handle"
-  return (handle)
+  return(handle)
 }
diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R
index db4fd67aa..e60ea2de8 100644
--- a/R-package/R/xgboost.R
+++ b/R-package/R/xgboost.R
@@ -24,7 +24,7 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
                    early_stopping_rounds = early_stopping_rounds, maximize = maximize,
                    save_period = save_period, save_name = save_name,
                    xgb_model = xgb_model, callbacks = callbacks, ...)
-  return (bst)
+  return(bst)
 }
 
 #' Training part from Mushroom Data Set
diff --git a/R-package/demo/cross_validation.R b/R-package/demo/cross_validation.R
index 33c70a3be..cf048c5ed 100644
--- a/R-package/demo/cross_validation.R
+++ b/R-package/demo/cross_validation.R
@@ -25,7 +25,7 @@ xgb.cv(param, dtrain, nrounds, nfold = 5,
 # you can also do cross validation with customized loss function
 # See custom_objective.R
 ##
-print ('running cross validation, with customized loss function')
+print('running cross validation, with customized loss function')
 
 logregobj <- function(preds, dtrain) {
   labels <- getinfo(dtrain, "label")
diff --git a/R-package/demo/custom_objective.R b/R-package/demo/custom_objective.R
index 2d0914ab5..35201332c 100644
--- a/R-package/demo/custom_objective.R
+++ b/R-package/demo/custom_objective.R
@@ -35,7 +35,7 @@ evalerror <- function(preds, dtrain) {
 
 param <- list(max_depth = 2, eta = 1, nthread  =  2, verbosity = 0,
               objective = logregobj, eval_metric = evalerror)
-print ('start training with user customized objective')
+print('start training with user customized objective')
 # training with customized objective, we can also do step by step training
 # simply look at xgboost.py's implementation of train
 bst <- xgb.train(param, dtrain, num_round, watchlist)
@@ -59,7 +59,7 @@ logregobjattr <- function(preds, dtrain) {
 }
 param <- list(max_depth = 2, eta = 1, nthread  =  2, verbosity = 0,
               objective = logregobjattr, eval_metric = evalerror)
-print ('start training with user customized objective, with additional attributes in DMatrix')
+print('start training with user customized objective, with additional attributes in DMatrix')
 # training with customized objective, we can also do step by step training
 # simply look at xgboost.py's implementation of train
 bst <- xgb.train(param, dtrain, num_round, watchlist)
diff --git a/R-package/demo/early_stopping.R b/R-package/demo/early_stopping.R
index f733dce8d..04da1382f 100644
--- a/R-package/demo/early_stopping.R
+++ b/R-package/demo/early_stopping.R
@@ -30,7 +30,7 @@ evalerror <- function(preds, dtrain) {
   err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
   return(list(metric = "error", value = err))
 }
-print ('start training with early Stopping setting')
+print('start training with early Stopping setting')
 
 bst <- xgb.train(param, dtrain, num_round, watchlist,
                  objective = logregobj, eval_metric = evalerror, maximize = FALSE,
diff --git a/R-package/tests/helper_scripts/generate_models.R b/R-package/tests/helper_scripts/generate_models.R
index 5d64fa6c5..ef2aeded6 100644
--- a/R-package/tests/helper_scripts/generate_models.R
+++ b/R-package/tests/helper_scripts/generate_models.R
@@ -19,15 +19,15 @@ w <- runif(metadata$kRows)
 version <- packageVersion('xgboost')
 target_dir <- 'models'
 
-save_booster <- function (booster, model_name) {
-  booster_bin <- function (model_name) {
-    return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.bin', sep = '')))
+save_booster <- function(booster, model_name) {
+  booster_bin <- function(model_name) {
+    return(file.path(target_dir, paste('xgboost-', version, '.', model_name, '.bin', sep = '')))
   }
-  booster_json <- function (model_name) {
-    return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.json', sep = '')))
+  booster_json <- function(model_name) {
+    return(file.path(target_dir, paste('xgboost-', version, '.', model_name, '.json', sep = '')))
   }
-  booster_rds <- function (model_name) {
-    return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.rds', sep = '')))
+  booster_rds <- function(model_name) {
+    return(file.path(target_dir, paste('xgboost-', version, '.', model_name, '.rds', sep = '')))
   }
   xgb.save(booster, booster_bin(model_name))
   saveRDS(booster, booster_rds(model_name))
@@ -36,7 +36,7 @@ save_booster <- function (booster, model_name) {
   }
 }
 
-generate_regression_model <- function () {
+generate_regression_model <- function() {
   print('Regression')
   y <- rnorm(metadata$kRows)
 
@@ -47,7 +47,7 @@ generate_regression_model <- function () {
   save_booster(booster, 'reg')
 }
 
-generate_logistic_model <- function () {
+generate_logistic_model <- function() {
   print('Binary classification with logistic loss')
   y <- sample(0:1, size = metadata$kRows, replace = TRUE)
   stopifnot(max(y) == 1, min(y) == 0)
@@ -64,7 +64,7 @@ generate_logistic_model <- function () {
   }
 }
 
-generate_classification_model <- function () {
+generate_classification_model <- function() {
   print('Multi-class classification')
   y <- sample(0:(metadata$kClasses - 1), size = metadata$kRows, replace = TRUE)
   stopifnot(max(y) == metadata$kClasses - 1, min(y) == 0)
@@ -77,7 +77,7 @@ generate_classification_model <- function () {
   save_booster(booster, 'cls')
 }
 
-generate_ranking_model <- function () {
+generate_ranking_model <- function() {
   print('Learning to rank')
   y <- sample(0:4, size = metadata$kRows, replace = TRUE)
   stopifnot(max(y) == 4, min(y) == 0)
diff --git a/R-package/tests/testthat/test_model_compatibility.R b/R-package/tests/testthat/test_model_compatibility.R
index 87380e85d..ce1725dc9 100644
--- a/R-package/tests/testthat/test_model_compatibility.R
+++ b/R-package/tests/testthat/test_model_compatibility.R
@@ -9,20 +9,20 @@ metadata <- list(
   kClasses = 3
 )
 
-run_model_param_check <- function (config) {
+run_model_param_check <- function(config) {
   testthat::expect_equal(config$learner$learner_model_param$num_feature, '4')
   testthat::expect_equal(config$learner$learner_train_param$booster, 'gbtree')
 }
 
-get_num_tree <- function (booster) {
+get_num_tree <- function(booster) {
   dump <- xgb.dump(booster)
   m <- regexec('booster\\[[0-9]+\\]', dump, perl = TRUE)
   m <- regmatches(dump, m)
   num_tree <- Reduce('+', lapply(m, length))
-  return (num_tree)
+  return(num_tree)
 }
 
-run_booster_check <- function (booster, name) {
+run_booster_check <- function(booster, name) {
   # If given a handle, we need to call xgb.Booster.complete() prior to using xgb.config().
   if (inherits(booster, "xgb.Booster") && xgboost:::is.null.handle(booster$handle)) {
     booster <- xgb.Booster.complete(booster)
@@ -68,7 +68,7 @@ test_that("Models from previous versions of XGBoost can be loaded", {
 
   pred_data <- xgb.DMatrix(matrix(c(0, 0, 0, 0), nrow = 1, ncol = 4), nthread = 2)
 
-  lapply(list.files(model_dir), function (x) {
+  lapply(list.files(model_dir), function(x) {
     model_file <- file.path(model_dir, x)
     m <- regexec("xgboost-([0-9\\.]+)\\.([a-z]+)\\.[a-z]+", model_file, perl = TRUE)
     m <- regmatches(model_file, m)[[1]]
diff --git a/R-package/tests/testthat/test_ranking.R b/R-package/tests/testthat/test_ranking.R
index b43292e18..d4102dfce 100644
--- a/R-package/tests/testthat/test_ranking.R
+++ b/R-package/tests/testthat/test_ranking.R
@@ -47,7 +47,7 @@ test_that('Test ranking with weighted data', {
     pred <- predict(bst, newdata = dtrain, ntreelimit = i)
     # is_sorted[i]: is i-th group correctly sorted by the ranking predictor?
     is_sorted <- lapply(seq(1, 20, by = 5),
-      function (k) {
+      function(k) {
         ind <- order(-pred[k:(k + 4)])
         z <- y[ind + (k - 1)]
         all(diff(z) <= 0)  # Check if z is monotone decreasing
diff --git a/demo/kaggle-higgs/higgs-train.R b/demo/kaggle-higgs/higgs-train.R
index 6f37040f8..4730d7b3d 100644
--- a/demo/kaggle-higgs/higgs-train.R
+++ b/demo/kaggle-higgs/higgs-train.R
@@ -24,8 +24,8 @@ param <- list("objective" = "binary:logitraw",
               "nthread" = 16)
 watchlist <- list("train" = xgmat)
 nrounds <- 120
-print ("loading data end, start to boost trees")
+print("loading data end, start to boost trees")
 bst <- xgb.train(param, xgmat, nrounds, watchlist)
 # save out model
 xgb.save(bst, "higgs.model")
-print ('finish training')
+print('finish training')
diff --git a/demo/kaggle-higgs/speedtest.R b/demo/kaggle-higgs/speedtest.R
index c0e96a010..c6de1511b 100644
--- a/demo/kaggle-higgs/speedtest.R
+++ b/demo/kaggle-higgs/speedtest.R
@@ -39,11 +39,11 @@ for (i in seq_along(threads)){
                   "nthread" = thread)
     watchlist <- list("train" = xgmat)
     nrounds <- 120
-    print ("loading data end, start to boost trees")
+    print("loading data end, start to boost trees")
     bst <- xgb.train(param, xgmat, nrounds, watchlist)
     # save out model
     xgb.save(bst, "higgs.model")
-    print ('finish training')
+    print('finish training')
   })
 }
 
diff --git a/tests/ci_build/lint_r.R b/tests/ci_build/lint_r.R
index ce512482d..9b55ebfce 100644
--- a/tests/ci_build/lint_r.R
+++ b/tests/ci_build/lint_r.R
@@ -28,6 +28,7 @@ my_linters <- list(
   equals_na = lintr::equals_na_linter(),
   fixed_regex = lintr::fixed_regex_linter(),
   for_loop_index = lintr::for_loop_index_linter(),
+  function_left_parentheses = lintr::function_left_parentheses_linter(),
   function_return = lintr::function_return_linter(),
   infix_spaces_linter = lintr::infix_spaces_linter(),
   is_numeric = lintr::is_numeric_linter(),

From db8d117f7e0c66e8ce6adb8a7a96f0dc6c729f73 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sat, 7 Oct 2023 22:45:20 -0500
Subject: [PATCH 35/59] [CI] standardize endif() calls in CMake scripts (#9637)

---
 CMakeLists.txt                      | 104 ++++++++++++++--------------
 cmake/Doc.cmake                     |   2 +-
 cmake/Sanitizer.cmake               |   2 +-
 cmake/Utils.cmake                   |  50 ++++++-------
 cmake/modules/FindNVML.cmake        |   2 +-
 cmake/modules/FindNccl.cmake        |   4 +-
 demo/c-api/basic/CMakeLists.txt     |   2 +-
 demo/c-api/inference/CMakeLists.txt |   2 +-
 jvm-packages/CMakeLists.txt         |   4 +-
 plugin/CMakeLists.txt               |   8 +--
 rabit/CMakeLists.txt                |   2 +-
 src/CMakeLists.txt                  |   4 +-
 tests/cpp/CMakeLists.txt            |  12 ++--
 13 files changed, 99 insertions(+), 99 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d9d2d7cc8..b7ee23d1b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,7 +10,7 @@ cmake_policy(SET CMP0063 NEW)
 
 if ((${CMAKE_VERSION} VERSION_GREATER 3.13) OR (${CMAKE_VERSION} VERSION_EQUAL 3.13))
   cmake_policy(SET CMP0077 NEW)
-endif ((${CMAKE_VERSION} VERSION_GREATER 3.13) OR (${CMAKE_VERSION} VERSION_EQUAL 3.13))
+endif()
 
 message(STATUS "CMake version ${CMAKE_VERSION}")
 
@@ -88,80 +88,80 @@ option(ADD_PKGCONFIG "Add xgboost.pc into system." ON)
 #-- Checks for building XGBoost
 if (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug)))
   message(SEND_ERROR "Do not enable `USE_DEBUG_OUTPUT' with release build.")
-endif (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug)))
+endif()
 if (USE_NCCL AND NOT (USE_CUDA))
   message(SEND_ERROR "`USE_NCCL` must be enabled with `USE_CUDA` flag.")
-endif (USE_NCCL AND NOT (USE_CUDA))
+endif()
 if (USE_DEVICE_DEBUG AND NOT (USE_CUDA))
   message(SEND_ERROR "`USE_DEVICE_DEBUG` must be enabled with `USE_CUDA` flag.")
-endif (USE_DEVICE_DEBUG AND NOT (USE_CUDA))
+endif()
 if (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
   message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable BUILD_WITH_SHARED_NCCL.")
-endif (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
+endif()
 if (JVM_BINDINGS AND R_LIB)
   message(SEND_ERROR "`R_LIB' is not compatible with `JVM_BINDINGS' as they both have customized configurations.")
-endif (JVM_BINDINGS AND R_LIB)
+endif()
 if (R_LIB AND GOOGLE_TEST)
   message(WARNING "Some C++ unittests will fail with `R_LIB` enabled,
  as R package redirects some functions to R runtime implementation.")
-endif (R_LIB AND GOOGLE_TEST)
+endif()
 if (PLUGIN_RMM AND NOT (USE_CUDA))
   message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_CUDA` flag.")
-endif (PLUGIN_RMM AND NOT (USE_CUDA))
+endif()
 if (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
   message(SEND_ERROR "`PLUGIN_RMM` must be used with GCC or Clang compiler.")
-endif (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
+endif()
 if (PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux"))
   message(SEND_ERROR "`PLUGIN_RMM` must be used with Linux.")
-endif (PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux"))
+endif()
 if (ENABLE_ALL_WARNINGS)
   if ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
     message(SEND_ERROR "ENABLE_ALL_WARNINGS is only available for Clang and GCC.")
-  endif ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
-endif (ENABLE_ALL_WARNINGS)
+  endif()
+endif()
 if (BUILD_STATIC_LIB AND (R_LIB OR JVM_BINDINGS))
   message(SEND_ERROR "Cannot build a static library libxgboost.a when R or JVM packages are enabled.")
-endif (BUILD_STATIC_LIB AND (R_LIB OR JVM_BINDINGS))
+endif()
 if (PLUGIN_FEDERATED)
   if (CMAKE_CROSSCOMPILING)
     message(SEND_ERROR "Cannot cross compile with federated learning support")
-  endif ()
+  endif()
   if (BUILD_STATIC_LIB)
     message(SEND_ERROR "Cannot build static lib with federated learning support")
-  endif ()
+  endif()
   if (R_LIB OR JVM_BINDINGS)
     message(SEND_ERROR "Cannot enable federated learning support when R or JVM packages are enabled.")
-  endif ()
+  endif()
   if (WIN32)
     message(SEND_ERROR "Federated learning not supported for Windows platform")
-  endif ()
-endif ()
+  endif()
+endif()
 
 #-- Removed options
 if (USE_AVX)
   message(SEND_ERROR  "The option `USE_AVX` is deprecated as experimental AVX features have been removed from XGBoost.")
-endif (USE_AVX)
+endif()
 if (PLUGIN_LZ4)
   message(SEND_ERROR  "The option `PLUGIN_LZ4` is removed from XGBoost.")
-endif (PLUGIN_LZ4)
+endif()
 if (RABIT_BUILD_MPI)
   message(SEND_ERROR "The option `RABIT_BUILD_MPI` has been removed from XGBoost.")
-endif (RABIT_BUILD_MPI)
+endif()
 if (USE_S3)
   message(SEND_ERROR "The option `USE_S3` has been removed from XGBoost")
-endif (USE_S3)
+endif()
 if (USE_AZURE)
   message(SEND_ERROR "The option `USE_AZURE` has been removed from XGBoost")
-endif (USE_AZURE)
+endif()
 if (USE_HDFS)
   message(SEND_ERROR "The option `USE_HDFS` has been removed from XGBoost")
-endif (USE_HDFS)
+endif()
 
 #-- Sanitizer
 if (USE_SANITIZER)
   include(cmake/Sanitizer.cmake)
   enable_sanitizers("${ENABLED_SANITIZERS}")
-endif (USE_SANITIZER)
+endif()
 
 if (USE_CUDA)
   set(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE)
@@ -178,7 +178,7 @@ if (USE_CUDA)
   add_subdirectory(${PROJECT_SOURCE_DIR}/gputreeshap)
 
   find_package(CUDAToolkit REQUIRED)
-endif (USE_CUDA)
+endif()
 
 if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
     ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR
@@ -204,11 +204,11 @@ if (USE_OPENMP)
       set(OpenMP_CXX_LIB_NAMES omp)
       set(OpenMP_omp_LIBRARY ${HOMEBREW_LIBOMP_PREFIX}/lib/libomp.dylib)
       find_package(OpenMP REQUIRED)
-    endif ()
+    endif()
   else ()
     find_package(OpenMP REQUIRED)
-  endif ()
-endif (USE_OPENMP)
+  endif()
+endif()
 #Add for IBM i
 if (${CMAKE_SYSTEM_NAME} MATCHES "OS400")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
@@ -217,21 +217,21 @@ endif()
 
 if (USE_NCCL)
   find_package(Nccl REQUIRED)
-endif (USE_NCCL)
+endif()
 
 # dmlc-core
 msvc_use_static_runtime()
 if (FORCE_SHARED_CRT)
   set(DMLC_FORCE_SHARED_CRT ON)
-endif ()
+endif()
 add_subdirectory(${xgboost_SOURCE_DIR}/dmlc-core)
 
 if (MSVC)
   if (TARGET dmlc_unit_tests)
     target_compile_options(dmlc_unit_tests PRIVATE
                            -D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE)
-  endif (TARGET dmlc_unit_tests)
-endif (MSVC)
+  endif()
+endif()
 
 # rabit
 add_subdirectory(rabit)
@@ -243,12 +243,12 @@ target_link_libraries(objxgboost PUBLIC dmlc)
 # Exports some R specific definitions and objects
 if (R_LIB)
   add_subdirectory(${xgboost_SOURCE_DIR}/R-package)
-endif (R_LIB)
+endif()
 
 # This creates its own shared library `xgboost4j'.
 if (JVM_BINDINGS)
   add_subdirectory(${xgboost_SOURCE_DIR}/jvm-packages)
-endif (JVM_BINDINGS)
+endif()
 
 # Plugin
 add_subdirectory(${xgboost_SOURCE_DIR}/plugin)
@@ -264,14 +264,14 @@ if (PLUGIN_RMM)
   list(APPEND rmm_link_libs CUDA::cudart_static)
   set_target_properties(rmm::rmm PROPERTIES INTERFACE_LINK_LIBRARIES "${rmm_link_libs}")
   get_target_property(rmm_link_libs rmm::rmm INTERFACE_LINK_LIBRARIES)
-endif (PLUGIN_RMM)
+endif()
 
 #-- library
 if (BUILD_STATIC_LIB)
   add_library(xgboost STATIC)
 else (BUILD_STATIC_LIB)
   add_library(xgboost SHARED)
-endif (BUILD_STATIC_LIB)
+endif()
 target_link_libraries(xgboost PRIVATE objxgboost)
 target_include_directories(xgboost
   INTERFACE
@@ -298,8 +298,8 @@ if (BUILD_DEPRECATED_CLI)
     set_output_directory(runxgboost ${xgboost_BINARY_DIR})
   else ()
     set_output_directory(runxgboost ${xgboost_SOURCE_DIR})
-  endif (KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR)
-endif (BUILD_DEPRECATED_CLI)
+  endif()
+endif()
 #-- End CLI for xgboost
 
 # Common setup for all targets
@@ -313,18 +313,18 @@ if (JVM_BINDINGS)
   xgboost_target_properties(xgboost4j)
   xgboost_target_link_libraries(xgboost4j)
   xgboost_target_defs(xgboost4j)
-endif (JVM_BINDINGS)
+endif()
 
 if (KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR)
   set_output_directory(xgboost ${xgboost_BINARY_DIR}/lib)
 else ()
   set_output_directory(xgboost ${xgboost_SOURCE_DIR}/lib)
-endif ()
+endif()
 
 # Ensure these two targets do not build simultaneously, as they produce outputs with conflicting names
 if (BUILD_DEPRECATED_CLI)
   add_dependencies(xgboost runxgboost)
-endif (BUILD_DEPRECATED_CLI)
+endif()
 
 #-- Installing XGBoost
 if (R_LIB)
@@ -332,18 +332,18 @@ if (R_LIB)
   set_target_properties(xgboost PROPERTIES PREFIX "")
   if (APPLE)
     set_target_properties(xgboost PROPERTIES SUFFIX ".so")
-  endif (APPLE)
+  endif()
   setup_rpackage_install_target(xgboost "${CMAKE_CURRENT_BINARY_DIR}/R-package-install")
   set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/dummy_inst")
-endif (R_LIB)
+endif()
 if (MINGW)
   set_target_properties(xgboost PROPERTIES PREFIX "")
-endif (MINGW)
+endif()
 
 if (BUILD_C_DOC)
   include(cmake/Doc.cmake)
   run_doxygen()
-endif (BUILD_C_DOC)
+endif()
 
 include(CPack)
 
@@ -364,14 +364,14 @@ if (BUILD_STATIC_LIB)
     set(INSTALL_TARGETS xgboost runxgboost objxgboost dmlc)
   else()
     set(INSTALL_TARGETS xgboost objxgboost dmlc)
-  endif (BUILD_DEPRECATED_CLI)
+  endif()
 else (BUILD_STATIC_LIB)
   if (BUILD_DEPRECATED_CLI)
     set(INSTALL_TARGETS xgboost runxgboost)
   else(BUILD_DEPRECATED_CLI)
     set(INSTALL_TARGETS xgboost)
-  endif (BUILD_DEPRECATED_CLI)
-endif (BUILD_STATIC_LIB)
+  endif()
+endif()
 
 install(TARGETS ${INSTALL_TARGETS}
   EXPORT XGBoostTargets
@@ -428,8 +428,8 @@ if (GOOGLE_TEST)
     set_tests_properties(TestXGBoostCLI
       PROPERTIES
       PASS_REGULAR_EXPRESSION ".*test-rmse:0.087.*")
-  endif (BUILD_DEPRECATED_CLI)
-endif (GOOGLE_TEST)
+  endif()
+endif()
 
 # For MSVC: Call msvc_use_static_runtime() once again to completely
 # replace /MD with /MT. See https://github.com/dmlc/xgboost/issues/4462
@@ -443,4 +443,4 @@ if (ADD_PKGCONFIG)
   install(
     FILES ${xgboost_BINARY_DIR}/xgboost.pc
     DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
-endif (ADD_PKGCONFIG)
+endif()
diff --git a/cmake/Doc.cmake b/cmake/Doc.cmake
index 2ffa005ff..7c3d05c00 100644
--- a/cmake/Doc.cmake
+++ b/cmake/Doc.cmake
@@ -3,7 +3,7 @@ function (run_doxygen)
 
   if (NOT DOXYGEN_DOT_FOUND)
     message(FATAL_ERROR "Command `dot` not found.  Please install graphviz.")
-  endif (NOT DOXYGEN_DOT_FOUND)
+  endif()
 
   configure_file(
     ${xgboost_SOURCE_DIR}/doc/Doxyfile.in
diff --git a/cmake/Sanitizer.cmake b/cmake/Sanitizer.cmake
index 77d7c93c1..919d5c695 100644
--- a/cmake/Sanitizer.cmake
+++ b/cmake/Sanitizer.cmake
@@ -14,7 +14,7 @@ macro(enable_sanitizer sanitizer)
     set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=thread")
     if (TSan_FOUND)
       link_libraries(${TSan_LIBRARY})
-    endif (TSan_FOUND)
+    endif()
 
   elseif(${sanitizer} MATCHES "leak")
     find_package(LSan)
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index f215c3eca..686a545d0 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -121,7 +121,7 @@ function(format_gencode_flags flags out)
     set(${out} "${${out}}--generate-code=arch=compute_${ver},code=compute_${ver};")
     set(${out} "${${out}}" PARENT_SCOPE)
     message(STATUS "CUDA GEN_CODE: ${GEN_CODE}")
-  endif (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
+  endif()
 endfunction(format_gencode_flags flags)
 
 # Set CUDA related flags to target.  Must be used after code `format_gencode_flags`.
@@ -136,11 +136,11 @@ function(xgboost_set_cuda_flags target)
   if (USE_PER_THREAD_DEFAULT_STREAM)
     target_compile_options(${target} PRIVATE
             $<$<COMPILE_LANGUAGE:CUDA>:--default-stream per-thread>)
-  endif (USE_PER_THREAD_DEFAULT_STREAM)
+  endif()
 
   if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
     set_property(TARGET ${target} PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES})
-  endif (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
+  endif()
 
   if (FORCE_COLORED_OUTPUT)
     if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
@@ -149,7 +149,7 @@ function(xgboost_set_cuda_flags target)
       target_compile_options(${target} PRIVATE
         $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-fdiagnostics-color=always>)
     endif()
-  endif (FORCE_COLORED_OUTPUT)
+  endif()
 
   if (USE_DEVICE_DEBUG)
     target_compile_options(${target} PRIVATE
@@ -157,11 +157,11 @@ function(xgboost_set_cuda_flags target)
   else (USE_DEVICE_DEBUG)
     target_compile_options(${target} PRIVATE
       $<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>)
-  endif (USE_DEVICE_DEBUG)
+  endif()
 
   if (USE_NVTX)
     target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1)
-  endif (USE_NVTX)
+  endif()
 
   target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1)
   target_include_directories(
@@ -172,7 +172,7 @@ function(xgboost_set_cuda_flags target)
   if (MSVC)
     target_compile_options(${target} PRIVATE
       $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8>)
-  endif (MSVC)
+  endif()
 
   set_target_properties(${target} PROPERTIES
     CUDA_STANDARD 17
@@ -190,7 +190,7 @@ macro(xgboost_link_nccl target)
     target_include_directories(${target} PRIVATE ${NCCL_INCLUDE_DIR})
     target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NCCL=1)
     target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY})
-  endif (BUILD_STATIC_LIB)
+  endif()
 endmacro(xgboost_link_nccl)
 
 # compile options
@@ -207,7 +207,7 @@ macro(xgboost_target_properties target)
       CXX_VISIBILITY_PRESET hidden
       CUDA_VISIBILITY_PRESET hidden
     )
-  endif (HIDE_CXX_SYMBOLS)
+  endif()
 
   if (ENABLE_ALL_WARNINGS)
     target_compile_options(${target} PUBLIC
@@ -215,7 +215,7 @@ macro(xgboost_target_properties target)
       -Xcompiler=-Wall -Xcompiler=-Wextra -Xcompiler=-Wno-expansion-to-defined,
       -Wall -Wextra -Wno-expansion-to-defined>
     )
-  endif(ENABLE_ALL_WARNINGS)
+  endif()
 
   target_compile_options(${target}
     PRIVATE
@@ -228,11 +228,11 @@ macro(xgboost_target_properties target)
       -D_CRT_SECURE_NO_WARNINGS
       -D_CRT_SECURE_NO_DEPRECATE
     )
-  endif (MSVC)
+  endif()
 
   if (WIN32 AND MINGW)
     target_compile_options(${target} PUBLIC -static-libstdc++)
-  endif (WIN32 AND MINGW)
+  endif()
 endmacro(xgboost_target_properties)
 
 # Custom definitions used in xgboost.
@@ -242,24 +242,24 @@ macro(xgboost_target_defs target)
       PRIVATE
       -DDMLC_LOG_CUSTOMIZE=1
       $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:_MWAITXINTRIN_H_INCLUDED>)
-  endif ()
+  endif()
   if (USE_DEBUG_OUTPUT)
     target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_DEBUG_OUTPUT=1)
-  endif (USE_DEBUG_OUTPUT)
+  endif()
   if (XGBOOST_MM_PREFETCH_PRESENT)
     target_compile_definitions(${target}
       PRIVATE
       -DXGBOOST_MM_PREFETCH_PRESENT=1)
-  endif(XGBOOST_MM_PREFETCH_PRESENT)
+  endif()
   if (XGBOOST_BUILTIN_PREFETCH_PRESENT)
     target_compile_definitions(${target}
       PRIVATE
       -DXGBOOST_BUILTIN_PREFETCH_PRESENT=1)
-  endif (XGBOOST_BUILTIN_PREFETCH_PRESENT)
+  endif()
 
   if (PLUGIN_RMM)
     target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1)
-  endif (PLUGIN_RMM)
+  endif()
 endmacro(xgboost_target_defs)
 
 # handles dependencies
@@ -268,34 +268,34 @@ macro(xgboost_target_link_libraries target)
     target_link_libraries(${target} PUBLIC Threads::Threads ${CMAKE_THREAD_LIBS_INIT})
   else()
     target_link_libraries(${target} PRIVATE Threads::Threads ${CMAKE_THREAD_LIBS_INIT})
-  endif (BUILD_STATIC_LIB)
+  endif()
 
   if (USE_OPENMP)
     if (BUILD_STATIC_LIB)
       target_link_libraries(${target} PUBLIC OpenMP::OpenMP_CXX)
     else()
       target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX)
-    endif (BUILD_STATIC_LIB)
-  endif (USE_OPENMP)
+    endif()
+  endif()
 
   if (USE_CUDA)
     xgboost_set_cuda_flags(${target})
     target_link_libraries(${target} PUBLIC CUDA::cudart_static)
-  endif (USE_CUDA)
+  endif()
 
   if (PLUGIN_RMM)
     target_link_libraries(${target} PRIVATE rmm::rmm)
-  endif (PLUGIN_RMM)
+  endif()
 
   if (USE_NCCL)
     xgboost_link_nccl(${target})
-  endif (USE_NCCL)
+  endif()
 
   if (USE_NVTX)
     target_link_libraries(${target} PRIVATE CUDA::nvToolsExt)
-  endif (USE_NVTX)
+  endif()
 
   if (MINGW)
     target_link_libraries(${target} PRIVATE wsock32 ws2_32)
-  endif (MINGW)
+  endif()
 endmacro(xgboost_target_link_libraries)
diff --git a/cmake/modules/FindNVML.cmake b/cmake/modules/FindNVML.cmake
index a4bed0019..a7061aa29 100644
--- a/cmake/modules/FindNVML.cmake
+++ b/cmake/modules/FindNVML.cmake
@@ -1,6 +1,6 @@
 if (NVML_LIBRARY)
   unset(NVML_LIBRARY CACHE)
-endif(NVML_LIBRARY)
+endif()
 
 set(NVML_LIB_NAME nvml)
 
diff --git a/cmake/modules/FindNccl.cmake b/cmake/modules/FindNccl.cmake
index 5f06f96b8..6fd2ed981 100644
--- a/cmake/modules/FindNccl.cmake
+++ b/cmake/modules/FindNccl.cmake
@@ -39,7 +39,7 @@ if (NCCL_LIBRARY)
   if(NOT USE_NCCL_LIB_PATH)
     # Don't cache NCCL_LIBRARY to enable switching between static and shared.
     unset(NCCL_LIBRARY CACHE)
-  endif(NOT USE_NCCL_LIB_PATH)
+  endif()
 endif()
 
 if (BUILD_WITH_SHARED_NCCL)
@@ -48,7 +48,7 @@ if (BUILD_WITH_SHARED_NCCL)
 else ()
   # libnccl_static.a
   set(NCCL_LIB_NAME nccl_static)
-endif (BUILD_WITH_SHARED_NCCL)
+endif()
 
 find_path(NCCL_INCLUDE_DIR
   NAMES nccl.h
diff --git a/demo/c-api/basic/CMakeLists.txt b/demo/c-api/basic/CMakeLists.txt
index 32e2bc432..f0cc8a744 100644
--- a/demo/c-api/basic/CMakeLists.txt
+++ b/demo/c-api/basic/CMakeLists.txt
@@ -7,7 +7,7 @@ if (XGBOOST_BUILD_STATIC_LIB)
   enable_language(CXX)
   # find again for those  cxx libraries.
   find_package(xgboost REQUIRED)
-endif(XGBOOST_BUILD_STATIC_LIB)
+endif()
 
 add_executable(api-demo c-api-demo.c)
 target_link_libraries(api-demo PRIVATE xgboost::xgboost)
diff --git a/demo/c-api/inference/CMakeLists.txt b/demo/c-api/inference/CMakeLists.txt
index 6aa8f1dd2..70ddb256c 100644
--- a/demo/c-api/inference/CMakeLists.txt
+++ b/demo/c-api/inference/CMakeLists.txt
@@ -8,7 +8,7 @@ if (XGBOOST_BUILD_STATIC_LIB)
   enable_language(CXX)
   # find again for those  cxx libraries.
   find_package(xgboost REQUIRED)
-endif(XGBOOST_BUILD_STATIC_LIB)
+endif()
 
 add_executable(inference-demo inference.c)
 target_link_libraries(inference-demo PRIVATE xgboost::xgboost)
diff --git a/jvm-packages/CMakeLists.txt b/jvm-packages/CMakeLists.txt
index 247c44378..f89970dea 100644
--- a/jvm-packages/CMakeLists.txt
+++ b/jvm-packages/CMakeLists.txt
@@ -7,13 +7,13 @@ list(APPEND JVM_SOURCES
 if (USE_CUDA)
   list(APPEND JVM_SOURCES
     ${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu)
-endif (USE_CUDA)
+endif()
 
 add_library(xgboost4j SHARED ${JVM_SOURCES} ${XGBOOST_OBJ_SOURCES})
 
 if (ENABLE_ALL_WARNINGS)
   target_compile_options(xgboost4j PUBLIC -Wall -Wextra)
-endif (ENABLE_ALL_WARNINGS)
+endif()
 
 target_link_libraries(xgboost4j PRIVATE objxgboost)
 target_include_directories(xgboost4j
diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt
index 7026238e3..3a1889d2e 100644
--- a/plugin/CMakeLists.txt
+++ b/plugin/CMakeLists.txt
@@ -1,6 +1,6 @@
 if (PLUGIN_DENSE_PARSER)
   target_sources(objxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/dense_parser/dense_libsvm.cc)
-endif (PLUGIN_DENSE_PARSER)
+endif()
 
 if (PLUGIN_UPDATER_ONEAPI)
   add_library(oneapi_plugin OBJECT
@@ -21,14 +21,14 @@ if (PLUGIN_UPDATER_ONEAPI)
   if (USE_OPENMP)
     find_package(OpenMP REQUIRED)
     target_link_libraries(oneapi_plugin PUBLIC OpenMP::OpenMP_CXX)
-  endif (USE_OPENMP)
+  endif()
   # Get compilation and link flags of oneapi_plugin and propagate to objxgboost
   target_link_libraries(objxgboost PUBLIC oneapi_plugin)
   # Add all objects of oneapi_plugin to objxgboost
   target_sources(objxgboost INTERFACE $<TARGET_OBJECTS:oneapi_plugin>)
-endif (PLUGIN_UPDATER_ONEAPI)
+endif()
 
 # Add the Federate Learning plugin if enabled.
 if (PLUGIN_FEDERATED)
   add_subdirectory(federated)
-endif (PLUGIN_FEDERATED)
+endif()
diff --git a/rabit/CMakeLists.txt b/rabit/CMakeLists.txt
index 977d4867e..f1adcb6e0 100644
--- a/rabit/CMakeLists.txt
+++ b/rabit/CMakeLists.txt
@@ -10,6 +10,6 @@ if (RABIT_MOCK)
   list(APPEND RABIT_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/engine_mock.cc)
 else ()
   list(APPEND RABIT_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/engine.cc)
-endif ()
+endif()
 
 set(RABIT_SOURCES ${RABIT_SOURCES} PARENT_SCOPE)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4624c643c..9b2a803ab 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -14,7 +14,7 @@ target_sources(objxgboost PRIVATE ${RABIT_SOURCES})
 if (USE_CUDA)
   file(GLOB_RECURSE CUDA_SOURCES *.cu *.cuh)
   target_sources(objxgboost PRIVATE ${CUDA_SOURCES})
-endif (USE_CUDA)
+endif()
 
 target_include_directories(objxgboost
   PRIVATE
@@ -24,7 +24,7 @@ target_include_directories(objxgboost
 
 if (LOG_CAPI_INVOCATION)
   target_compile_definitions(objxgboost PRIVATE -DLOG_CAPI_INVOCATION=1)
-endif (LOG_CAPI_INVOCATION)
+endif()
 
 # For MSVC: Call msvc_use_static_runtime() once again to completely
 # replace /MD with /MT. See https://github.com/dmlc/xgboost/issues/4462
diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt
index 51cdecd9d..1aebcc2ef 100644
--- a/tests/cpp/CMakeLists.txt
+++ b/tests/cpp/CMakeLists.txt
@@ -1,22 +1,22 @@
 if (USE_DMLC_GTEST)
   if (NOT TARGET gtest)
     message(FATAL_ERROR "USE_DMLC_GTEST=ON but dmlc-core didn't bundle gtest")
-  endif (NOT TARGET gtest)
+  endif()
   set(GTEST_LIBRARIES gtest)
 else (USE_DMLC_GTEST)
   find_package(GTest REQUIRED)
-endif (USE_DMLC_GTEST)
+endif()
 file(GLOB_RECURSE TEST_SOURCES "*.cc")
 
 if (USE_CUDA)
   file(GLOB_RECURSE CUDA_TEST_SOURCES "*.cu")
   list(APPEND TEST_SOURCES ${CUDA_TEST_SOURCES})
-endif (USE_CUDA)
+endif()
 
 file(GLOB_RECURSE ONEAPI_TEST_SOURCES "plugin/*_oneapi.cc")
 if (NOT PLUGIN_UPDATER_ONEAPI)
   list(REMOVE_ITEM TEST_SOURCES ${ONEAPI_TEST_SOURCES})
-endif (NOT PLUGIN_UPDATER_ONEAPI)
+endif()
 
 if (PLUGIN_FEDERATED)
   target_include_directories(testxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/federated)
@@ -24,14 +24,14 @@ if (PLUGIN_FEDERATED)
 else (PLUGIN_FEDERATED)
   file(GLOB_RECURSE FEDERATED_TEST_SOURCES "plugin/*_federated_*.*")
   list(REMOVE_ITEM TEST_SOURCES ${FEDERATED_TEST_SOURCES})
-endif (PLUGIN_FEDERATED)
+endif()
 
 target_sources(testxgboost PRIVATE ${TEST_SOURCES} ${xgboost_SOURCE_DIR}/plugin/example/custom_obj.cc)
 
 if (USE_CUDA AND PLUGIN_RMM)
   find_package(CUDA)
   target_include_directories(testxgboost PRIVATE ${CUDA_INCLUDE_DIRS})
-endif (USE_CUDA AND PLUGIN_RMM)
+endif()
 
 target_include_directories(testxgboost
   PRIVATE

From 4e5a7729c3f6561cfb939d4d49ef69fb460eaac1 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Mon, 9 Oct 2023 19:04:31 +0800
Subject: [PATCH 36/59] Fix lint errors. (#9634)

---
 .github/workflows/main.yml                |  16 +--
 python-package/xgboost/spark/estimator.py |   6 +-
 tests/ci_build/lint_cpp.py                | 166 ++++++++++++++++++++++
 3 files changed, 174 insertions(+), 14 deletions(-)
 create mode 100644 tests/ci_build/lint_cpp.py

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 0288b0c97..9f1bf11c9 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -144,15 +144,9 @@ jobs:
         python -m pip install wheel setuptools cpplint pylint
     - name: Run lint
       run: |
-        python3 dmlc-core/scripts/lint.py xgboost cpp R-package/src
+        python3 tests/ci_build/lint_cpp.py xgboost cpp R-package/src
 
-        python3 dmlc-core/scripts/lint.py --exclude_path \
-            python-package/xgboost/dmlc-core \
-            python-package/xgboost/include \
-            python-package/xgboost/lib \
-            python-package/xgboost/rabit \
-            python-package/xgboost/src \
-            --pylint-rc python-package/.pylintrc \
-            xgboost \
-            cpp \
-            include src python-package
+        python3 tests/ci_build/lint_cpp.py xgboost cpp include src python-package \
+            --exclude_path python-package/xgboost/dmlc-core python-package/xgboost/include \
+                           python-package/xgboost/lib python-package/xgboost/rabit \
+                           python-package/xgboost/src
diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
index b73dfba6c..193ca4b2a 100644
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -198,7 +198,7 @@ class SparkXGBRegressor(_SparkXGBEstimator):
     """
 
     @keyword_only
-    def __init__(
+    def __init__(  # pylint:disable=too-many-arguments
         self,
         *,
         features_col: Union[str, List[str]] = "features",
@@ -376,7 +376,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
     """
 
     @keyword_only
-    def __init__(
+    def __init__(  # pylint:disable=too-many-arguments
         self,
         *,
         features_col: Union[str, List[str]] = "features",
@@ -566,7 +566,7 @@ class SparkXGBRanker(_SparkXGBEstimator):
     """
 
     @keyword_only
-    def __init__(
+    def __init__(  # pylint:disable=too-many-arguments
         self,
         *,
         features_col: Union[str, List[str]] = "features",
diff --git a/tests/ci_build/lint_cpp.py b/tests/ci_build/lint_cpp.py
new file mode 100644
index 000000000..593b8f870
--- /dev/null
+++ b/tests/ci_build/lint_cpp.py
@@ -0,0 +1,166 @@
+import argparse
+import os
+import re
+import sys
+
+import cpplint
+from cpplint import _cpplint_state
+
+CXX_SUFFIX = set(["cc", "c", "cpp", "h", "cu", "hpp"])
+
+
+def filepath_enumerate(paths):
+    """Enumerate the file paths of all subfiles of the list of paths"""
+    out = []
+    for path in paths:
+        if os.path.isfile(path):
+            out.append(path)
+        else:
+            for root, dirs, files in os.walk(path):
+                for name in files:
+                    out.append(os.path.normpath(os.path.join(root, name)))
+    return out
+
+
+def get_header_guard_dmlc(filename):
+    """Get Header Guard Convention for DMLC Projects.
+
+    For headers in include, directly use the path
+    For headers in src, use project name plus path
+
+    Examples: with project-name = dmlc
+        include/dmlc/timer.h -> DMLC_TIMER_H_
+        src/io/libsvm_parser.h -> DMLC_IO_LIBSVM_PARSER_H_
+    """
+    fileinfo = cpplint.FileInfo(filename)
+    file_path_from_root = fileinfo.RepositoryName()
+    inc_list = ["include", "api", "wrapper", "contrib"]
+    if os.name == "nt":
+        inc_list.append("mshadow")
+
+    if file_path_from_root.find("src/") != -1 and _HELPER.project_name is not None:
+        idx = file_path_from_root.find("src/")
+        file_path_from_root = _HELPER.project_name + file_path_from_root[idx + 3 :]
+    else:
+        idx = file_path_from_root.find("include/")
+        if idx != -1:
+            file_path_from_root = file_path_from_root[idx + 8 :]
+        for spath in inc_list:
+            prefix = spath + "/"
+            if file_path_from_root.startswith(prefix):
+                file_path_from_root = re.sub("^" + prefix, "", file_path_from_root)
+                break
+    return re.sub(r"[-./\s]", "_", file_path_from_root).upper() + "_"
+
+
+class Lint:
+    def __init__(self):
+        self.project_name = "xgboost"
+        self.cpp_header_map = {}
+        self.cpp_src_map = {}
+        self.python_map = {}
+
+        self.pylint_cats = set(["error", "warning", "convention", "refactor"])
+        # setup cpp lint
+        cpplint_args = ["--quiet", "--extensions=" + (",".join(CXX_SUFFIX)), "."]
+        _ = cpplint.ParseArguments(cpplint_args)
+        cpplint._SetFilters(
+            ",".join(
+                [
+                    "-build/c++11",
+                    "-build/include,",
+                    "+build/namespaces",
+                    "+build/include_what_you_use",
+                    "+build/include_order",
+                ]
+            )
+        )
+        cpplint._SetCountingStyle("toplevel")
+        cpplint._line_length = 100
+
+    def process_cpp(self, path, suffix):
+        """Process a cpp file."""
+        _cpplint_state.ResetErrorCounts()
+        cpplint.ProcessFile(str(path), _cpplint_state.verbose_level)
+        _cpplint_state.PrintErrorCounts()
+        errors = _cpplint_state.errors_by_category.copy()
+
+        if suffix == "h":
+            self.cpp_header_map[str(path)] = errors
+        else:
+            self.cpp_src_map[str(path)] = errors
+
+    @staticmethod
+    def _print_summary_map(strm, result_map, ftype):
+        """Print summary of certain result map."""
+        if len(result_map) == 0:
+            return 0
+        npass = sum(1 for x in result_map.values() if len(x) == 0)
+        strm.write(f"====={npass}/{len(result_map)} {ftype} files passed check=====\n")
+        for fname, emap in result_map.items():
+            if len(emap) == 0:
+                continue
+            strm.write(
+                f"{fname}: {sum(emap.values())} Errors of {len(emap)} Categories map={str(emap)}\n"
+            )
+        return len(result_map) - npass
+
+    def print_summary(self, strm):
+        """Print summary of lint."""
+        nerr = 0
+        nerr += Lint._print_summary_map(strm, self.cpp_header_map, "cpp-header")
+        nerr += Lint._print_summary_map(strm, self.cpp_src_map, "cpp-source")
+        if nerr == 0:
+            strm.write("All passed!\n")
+        else:
+            strm.write(f"{nerr} files failed lint\n")
+        return nerr
+
+
+_HELPER = Lint()
+
+cpplint.GetHeaderGuardCPPVariable = get_header_guard_dmlc
+
+
+def process(fname, allow_type):
+    """Process a file."""
+    fname = str(fname)
+    arr = fname.rsplit(".", 1)
+    if fname.find("#") != -1 or arr[-1] not in allow_type:
+        return
+    if arr[-1] in CXX_SUFFIX:
+        _HELPER.process_cpp(fname, arr[-1])
+
+
+def main():
+    parser = argparse.ArgumentParser(description="run cpp lint")
+    parser.add_argument("path", nargs="+", help="path to traverse")
+    parser.add_argument(
+        "--exclude_path",
+        nargs="+",
+        default=[],
+        help="exclude this path, and all subfolders if path is a folder",
+    )
+    args = parser.parse_args()
+    excluded_paths = filepath_enumerate(args.exclude_path)
+
+    allow_type = []
+    allow_type += CXX_SUFFIX
+
+    for path in args.path:
+        if os.path.isfile(path):
+            normpath = os.path.normpath(path)
+            if normpath not in excluded_paths:
+                process(path, allow_type)
+        else:
+            for root, dirs, files in os.walk(path):
+                for name in files:
+                    file_path = os.path.normpath(os.path.join(root, name))
+                    if file_path not in excluded_paths:
+                        process(file_path, allow_type)
+    nerr = _HELPER.print_summary(sys.stderr)
+    sys.exit(nerr > 0)
+
+
+if __name__ == "__main__":
+    main()

From 680d53db43ffbd76023a0dec8f998d6a4c4e3c88 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Tue, 10 Oct 2023 07:15:14 +0800
Subject: [PATCH 37/59] Extract JSON utils. (#9645)

---
 include/xgboost/json.h            | 38 ----------------
 src/c_api/c_api_utils.h           | 25 +----------
 src/common/io.h                   |  2 +-
 src/common/json_utils.h           | 74 +++++++++++++++++++++++++++++++
 src/common/quantile_loss_utils.cc | 23 +++++-----
 tests/cpp/common/test_json.cc     |  3 +-
 6 files changed, 90 insertions(+), 75 deletions(-)
 create mode 100644 src/common/json_utils.h

diff --git a/include/xgboost/json.h b/include/xgboost/json.h
index b099d1c47..c2c16ef8f 100644
--- a/include/xgboost/json.h
+++ b/include/xgboost/json.h
@@ -608,44 +608,6 @@ using Boolean = JsonBoolean;
 using String  = JsonString;
 using Null    = JsonNull;
 
-// Utils tailored for XGBoost.
-namespace detail {
-template <typename Head>
-bool TypeCheckImpl(Json const& value) {
-  return IsA<Head>(value);
-}
-
-template <typename Head, typename... JT>
-std::enable_if_t<sizeof...(JT) != 0, bool> TypeCheckImpl(Json const& value) {
-  return IsA<Head>(value) || TypeCheckImpl<JT...>(value);
-}
-
-template <typename Head>
-std::string TypeCheckError() {
-  return "`" + Head{}.TypeStr() + "`";
-}
-
-template <typename Head, typename... JT>
-std::enable_if_t<sizeof...(JT) != 0, std::string> TypeCheckError() {
-  return "`" + Head{}.TypeStr() + "`, " + TypeCheckError<JT...>();
-}
-}  // namespace detail
-
-/**
- * \brief Type check for JSON-based parameters
- *
- * \tparam JT    Expected JSON types.
- * \param  value Value to be checked.
- */
-template <typename... JT>
-void TypeCheck(Json const& value, StringView name) {
-  if (!detail::TypeCheckImpl<JT...>(value)) {
-    LOG(FATAL) << "Invalid type for: `" << name << "`, expecting one of the: {`"
-               << detail::TypeCheckError<JT...>() << "}, got: `" << value.GetValue().TypeStr()
-               << "`";
-  }
-}
-
 /**
  * \brief Convert XGBoost parameter to JSON object.
  *
diff --git a/src/c_api/c_api_utils.h b/src/c_api/c_api_utils.h
index 19dd6d639..95efb5b9d 100644
--- a/src/c_api/c_api_utils.h
+++ b/src/c_api/c_api_utils.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2021-2023 by XGBoost Contributors
+ * Copyright 2021-2023, XGBoost Contributors
  */
 #ifndef XGBOOST_C_API_C_API_UTILS_H_
 #define XGBOOST_C_API_C_API_UTILS_H_
@@ -13,6 +13,7 @@
 #include <utility>  // for move
 #include <vector>
 
+#include "../common/json_utils.h"  // for TypeCheck
 #include "xgboost/c_api.h"
 #include "xgboost/data.h"         // DMatrix
 #include "xgboost/feature_map.h"  // for FeatureMap
@@ -254,28 +255,6 @@ inline void GenerateFeatureMap(Learner const *learner,
 
 void XGBBuildInfoDevice(Json* p_info);
 
-template <typename JT>
-auto const &RequiredArg(Json const &in, StringView key, StringView func) {
-  auto const &obj = get<Object const>(in);
-  auto it = obj.find(key);
-  if (it == obj.cend() || IsA<Null>(it->second)) {
-    LOG(FATAL) << "Argument `" << key << "` is required for `" << func << "`.";
-  }
-  TypeCheck<JT>(it->second, StringView{key});
-  return get<std::remove_const_t<JT> const>(it->second);
-}
-
-template <typename JT, typename T>
-auto const &OptionalArg(Json const &in, StringView key, T const &dft) {
-  auto const &obj = get<Object const>(in);
-  auto it = obj.find(key);
-  if (it != obj.cend() && !IsA<Null>(it->second)) {
-    TypeCheck<JT>(it->second, key);
-    return get<std::remove_const_t<JT> const>(it->second);
-  }
-  return dft;
-}
-
 /**
  * \brief Get shared ptr from DMatrix C handle with additional checks.
  */
diff --git a/src/common/io.h b/src/common/io.h
index 2eb62b094..5e9d27582 100644
--- a/src/common/io.h
+++ b/src/common/io.h
@@ -8,7 +8,7 @@
 #define XGBOOST_COMMON_IO_H_
 
 #include <dmlc/io.h>
-#include <rabit/rabit.h>
+#include <rabit/internal/io.h>  // for MemoryFixSizeBuffer, MemoryBufferStream
 
 #include <algorithm>    // for min, fill_n, copy_n
 #include <array>        // for array
diff --git a/src/common/json_utils.h b/src/common/json_utils.h
new file mode 100644
index 000000000..a2a8a3cae
--- /dev/null
+++ b/src/common/json_utils.h
@@ -0,0 +1,74 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ *
+ * @brief Utils tailored for XGBoost.
+ */
+#pragma once
+
+#include <string>       // for string
+#include <type_traits>  // for enable_if_t, remove_const_t
+
+#include "xgboost/json.h"
+#include "xgboost/string_view.h"  // for StringView
+
+namespace xgboost {
+namespace detail {
+template <typename Head>
+bool TypeCheckImpl(Json const &value) {
+  return IsA<Head>(value);
+}
+
+template <typename Head, typename... JT>
+std::enable_if_t<sizeof...(JT) != 0, bool> TypeCheckImpl(Json const &value) {
+  return IsA<Head>(value) || TypeCheckImpl<JT...>(value);
+}
+
+template <typename Head>
+std::string TypeCheckError() {
+  return "`" + Head{}.TypeStr() + "`";
+}
+
+template <typename Head, typename... JT>
+std::enable_if_t<sizeof...(JT) != 0, std::string> TypeCheckError() {
+  return "`" + Head{}.TypeStr() + "`, " + TypeCheckError<JT...>();
+}
+}  // namespace detail
+
+/**
+ * @brief Type check for JSON-based parameters
+ *
+ * @tparam JT    Expected JSON types.
+ * @param  value Value to be checked.
+ */
+template <typename... JT>
+void TypeCheck(Json const &value, StringView name) {
+  if (!detail::TypeCheckImpl<JT...>(value)) {
+    LOG(FATAL) << "Invalid type for: `" << name << "`, expecting one of the: {`"
+               << detail::TypeCheckError<JT...>() << "}, got: `" << value.GetValue().TypeStr()
+               << "`";
+  }
+}
+
+template <typename JT>
+auto const &RequiredArg(Json const &in, StringView key, StringView func) {
+  auto const &obj = get<Object const>(in);
+  auto it = obj.find(key);
+  if (it == obj.cend() || IsA<Null>(it->second)) {
+    LOG(FATAL) << "Argument `" << key << "` is required for `" << func << "`.";
+  }
+  TypeCheck<JT>(it->second, StringView{key});
+  return get<std::remove_const_t<JT> const>(it->second);
+}
+
+template <typename JT, typename T>
+auto const &OptionalArg(Json const &in, StringView key, T const &dft) {
+  auto const &obj = get<Object const>(in);
+  auto it = obj.find(key);
+  if (it != obj.cend() && !IsA<Null>(it->second)) {
+    TypeCheck<JT>(it->second, key);
+
+    return get<std::remove_const_t<JT> const>(it->second);
+  }
+  return dft;
+}
+}  // namespace xgboost
diff --git a/src/common/quantile_loss_utils.cc b/src/common/quantile_loss_utils.cc
index 59397b701..df2fa6edd 100644
--- a/src/common/quantile_loss_utils.cc
+++ b/src/common/quantile_loss_utils.cc
@@ -1,19 +1,19 @@
 /**
- * Copyright 2023 by XGBoost contributors
+ * Copyright 2023, XGBoost contributors
  */
 #include "quantile_loss_utils.h"
 
-#include <cctype>             // std::isspace
-#include <istream>            // std::istream
-#include <ostream>            // std::ostream
-#include <string>             // std::string
-#include <vector>             // std::vector
+#include <cctype>   // for isspace
+#include <istream>  // for istream
+#include <ostream>  // for ostream
+#include <string>   // for string
+#include <vector>   // for vector
 
-#include "xgboost/json.h"     // F32Array,TypeCheck,get,Number
-#include "xgboost/json_io.h"  // JsonWriter
+#include "../common/json_utils.h"  // for TypeCheck
+#include "xgboost/json.h"          // for F32Array, get, Number
+#include "xgboost/json_io.h"       // for JsonWriter
 
-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 std::ostream& operator<<(std::ostream& os, const ParamFloatArray& array) {
   auto const& t = array.Get();
   xgboost::F32Array arr{t.size()};
@@ -70,5 +70,4 @@ std::istream& operator>>(std::istream& is, ParamFloatArray& array) {
 }
 
 DMLC_REGISTER_PARAMETER(QuantileLossParam);
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc
index 1d1319274..d361552ce 100644
--- a/tests/cpp/common/test_json.cc
+++ b/tests/cpp/common/test_json.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2019-2023, XGBoost Contributors
+ * Copyright 2019-2023, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 
@@ -9,6 +9,7 @@
 
 #include "../../../src/common/charconv.h"
 #include "../../../src/common/io.h"
+#include "../../../src/common/json_utils.h"
 #include "../../../src/common/threading_utils.h"  // for ParallelFor
 #include "../filesystem.h"                        // dmlc::TemporaryDirectory
 #include "../helpers.h"

From b14e535e788b4d1c09de886d8a41cd5b316c6bc2 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Tue, 10 Oct 2023 10:01:14 +0800
Subject: [PATCH 38/59] [Coll] Implement get host address in libxgboost.
 (#9644)

- Port `xgboost.tracker.get_host_ip` to C++.
---
 include/xgboost/collective/socket.h  | 28 ++++++++++
 src/collective/tracker.cc            | 76 ++++++++++++++++++++++++++++
 src/collective/tracker.h             | 15 ++++++
 tests/cpp/collective/net_test.h      | 41 +++++++++++++++
 tests/cpp/collective/test_socket.cc  | 52 ++++++++-----------
 tests/cpp/collective/test_tracker.cc | 18 +++++++
 6 files changed, 199 insertions(+), 31 deletions(-)
 create mode 100644 src/collective/tracker.cc
 create mode 100644 src/collective/tracker.h
 create mode 100644 tests/cpp/collective/net_test.h
 create mode 100644 tests/cpp/collective/test_tracker.cc

diff --git a/include/xgboost/collective/socket.h b/include/xgboost/collective/socket.h
index f36cdccb2..a16dd05c0 100644
--- a/include/xgboost/collective/socket.h
+++ b/include/xgboost/collective/socket.h
@@ -658,6 +658,34 @@ class TCPSocket {
  * @brief Get the local host name.
  */
 [[nodiscard]] Result GetHostName(std::string *p_out);
+
+/**
+ * @brief Convert the first address of a host entry to its text form using inet_ntop.
+ */
+template <typename H>
+Result INetNToP(H const &host, std::string *p_out) {
+  std::string &ip = *p_out;
+  switch (host->h_addrtype) {
+    case AF_INET: {
+      auto addr = reinterpret_cast<struct in_addr *>(host->h_addr_list[0]);
+      char str[INET_ADDRSTRLEN];
+      inet_ntop(AF_INET, addr, str, INET_ADDRSTRLEN);
+      ip = str;
+      break;
+    }
+    case AF_INET6: {
+      auto addr = reinterpret_cast<struct in6_addr *>(host->h_addr_list[0]);
+      char str[INET6_ADDRSTRLEN];
+      inet_ntop(AF_INET6, addr, str, INET6_ADDRSTRLEN);
+      ip = str;
+      break;
+    }
+    default: {
+      return Fail("Invalid address type.");
+    }
+  }
+  return Success();
+}
 }  // namespace collective
 }  // namespace xgboost
 
diff --git a/src/collective/tracker.cc b/src/collective/tracker.cc
new file mode 100644
index 000000000..598b41ddd
--- /dev/null
+++ b/src/collective/tracker.cc
@@ -0,0 +1,76 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#if defined(__unix__) || defined(__APPLE__)
+#include <netdb.h>       // gethostbyname
+#include <sys/socket.h>  // socket, AF_INET6, AF_INET, connect, getsockname
+#endif                   // defined(__unix__) || defined(__APPLE__)
+
+#if !defined(NOMINMAX) && defined(_WIN32)
+#define NOMINMAX
+#endif  // !defined(NOMINMAX)
+
+#if defined(_WIN32)
+#include <winsock2.h>
+#include <ws2tcpip.h>
+#endif  // defined(_WIN32)
+
+#include <string>  // for string
+
+#include "xgboost/collective/result.h"  // for Result, Fail, Success
+#include "xgboost/collective/socket.h"  // for GetHostName, FailWithCode, MakeSockAddress, ...
+
+namespace xgboost::collective {
+[[nodiscard]] Result GetHostAddress(std::string* out) {
+  auto rc = GetHostName(out);
+  if (!rc.OK()) {
+    return rc;
+  }
+  auto host = gethostbyname(out->c_str());
+
+  // get ip address from host
+  std::string ip;
+  rc = INetNToP(host, &ip);
+  if (!rc.OK()) {
+    return rc;
+  }
+
+  if (!(ip.size() >= 4 && ip.substr(0, 4) == "127.")) {
+    // return if this is a public IP address.
+    // not entirely accurate, we have other reserved IPs
+    *out = ip;
+    return Success();
+  }
+
+  // Create a UDP socket to probe the public IP address, it's fine even if it's
+  // unreachable.
+  auto sock = socket(AF_INET, SOCK_DGRAM, 0);
+  if (sock == -1) {
+    return Fail("Failed to create socket.");
+  }
+
+  auto paddr = MakeSockAddress(StringView{"10.255.255.255"}, 1);
+  sockaddr const* addr_handle = reinterpret_cast<const sockaddr*>(&paddr.V4().Handle());
+  socklen_t addr_len{sizeof(paddr.V4().Handle())};
+  auto err = connect(sock, addr_handle, addr_len);
+  if (err != 0) {
+    return system::FailWithCode("Failed to find IP address.");
+  }
+
+  // get the IP address from socket descriptor
+  struct sockaddr_in addr;
+  socklen_t len = sizeof(addr);
+  if (getsockname(sock, reinterpret_cast<struct sockaddr*>(&addr), &len) == -1) {
+    return Fail("Failed to get sock name.");
+  }
+  ip = inet_ntoa(addr.sin_addr);
+
+  err = system::CloseSocket(sock);
+  if (err != 0) {
+    return system::FailWithCode("Failed to close socket.");
+  }
+
+  *out = ip;
+  return Success();
+}
+}  // namespace xgboost::collective
diff --git a/src/collective/tracker.h b/src/collective/tracker.h
new file mode 100644
index 000000000..ec52f6a62
--- /dev/null
+++ b/src/collective/tracker.h
@@ -0,0 +1,15 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#pragma once
+#include <string>  // for string
+
+#include "xgboost/collective/result.h"  // for Result
+
+namespace xgboost::collective {
+// Probe the public IP address of the host, need a better method.
+//
+// This is directly translated from the previous Python implementation, we should find a
+// more rigorous approach, can use some expertise in network programming.
+[[nodiscard]] Result GetHostAddress(std::string* out);
+}  // namespace xgboost::collective
diff --git a/tests/cpp/collective/net_test.h b/tests/cpp/collective/net_test.h
new file mode 100644
index 000000000..ed15ed256
--- /dev/null
+++ b/tests/cpp/collective/net_test.h
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2022-2023, XGBoost Contributors
+ */
+#pragma once
+
+#include <gtest/gtest.h>
+#include <xgboost/collective/socket.h>
+
+#include <fstream>  // ifstream
+
+#include "../helpers.h"  // for FileExists
+
+namespace xgboost::collective {
+class SocketTest : public ::testing::Test {
+ protected:
+  std::string skip_msg_{"Skipping IPv6 test"};
+
+  bool SkipTest() {
+    std::string path{"/sys/module/ipv6/parameters/disable"};
+    if (FileExists(path)) {
+      std::ifstream fin(path);
+      if (!fin) {
+        return true;
+      }
+      std::string s_value;
+      fin >> s_value;
+      auto value = std::stoi(s_value);
+      if (value != 0) {
+        return true;
+      }
+    } else {
+      return true;
+    }
+    return false;
+  }
+
+ protected:
+  void SetUp() override { system::SocketStartup(); }
+  void TearDown() override { system::SocketFinalize(); }
+};
+}  // namespace xgboost::collective
diff --git a/tests/cpp/collective/test_socket.cc b/tests/cpp/collective/test_socket.cc
index 07a7f52d0..7802acda8 100644
--- a/tests/cpp/collective/test_socket.cc
+++ b/tests/cpp/collective/test_socket.cc
@@ -1,19 +1,16 @@
 /**
- * Copyright 2022-2023 by XGBoost Contributors
+ * Copyright 2022-2023, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 #include <xgboost/collective/socket.h>
 
 #include <cerrno>        // EADDRNOTAVAIL
-#include <fstream>       // ifstream
 #include <system_error>  // std::error_code, std::system_category
 
-#include "../helpers.h"
+#include "net_test.h"  // for SocketTest
 
 namespace xgboost::collective {
-TEST(Socket, Basic) {
-  system::SocketStartup();
-
+TEST_F(SocketTest, Basic) {
   SockAddress addr{SockAddrV6::Loopback()};
   ASSERT_TRUE(addr.IsV6());
   addr = SockAddress{SockAddrV4::Loopback()};
@@ -54,34 +51,27 @@ TEST(Socket, Basic) {
 
   run_test(SockDomain::kV4);
 
-  std::string path{"/sys/module/ipv6/parameters/disable"};
-  if (FileExists(path)) {
-    std::ifstream fin(path);
-    if (!fin) {
-      GTEST_SKIP_(msg.c_str());
-    }
-    std::string s_value;
-    fin >> s_value;
-    auto value = std::stoi(s_value);
-    if (value != 0) {
-      GTEST_SKIP_(msg.c_str());
-    }
-  } else {
-    GTEST_SKIP_(msg.c_str());
+  if (SkipTest()) {
+    GTEST_SKIP_(skip_msg_.c_str());
   }
   run_test(SockDomain::kV6);
-
-  system::SocketFinalize();
 }
 
-TEST(Socket, Bind) {
-  system::SocketStartup();
-  auto any = SockAddrV4::InaddrAny().Addr();
-  auto sock = TCPSocket::Create(SockDomain::kV4);
-  std::int32_t port{0};
-  auto rc = sock.Bind(any, &port);
-  ASSERT_TRUE(rc.OK());
-  ASSERT_NE(port, 0);
-  system::SocketFinalize();
+TEST_F(SocketTest, Bind) {
+  auto run = [](SockDomain domain) {
+    auto any =
+        domain == SockDomain::kV4 ? SockAddrV4::InaddrAny().Addr() : SockAddrV6::InaddrAny().Addr();
+    auto sock = TCPSocket::Create(domain);
+    std::int32_t port{0};
+    auto rc = sock.Bind(any, &port);
+    ASSERT_TRUE(rc.OK());
+    ASSERT_NE(port, 0);
+  };
+
+  run(SockDomain::kV4);
+  if (SkipTest()) {
+    GTEST_SKIP_(skip_msg_.c_str());
+  }
+  run(SockDomain::kV6);
 }
 }  // namespace xgboost::collective
diff --git a/tests/cpp/collective/test_tracker.cc b/tests/cpp/collective/test_tracker.cc
new file mode 100644
index 000000000..0e60cfb68
--- /dev/null
+++ b/tests/cpp/collective/test_tracker.cc
@@ -0,0 +1,18 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include "../../../src/collective/tracker.h"  // for GetHostAddress
+#include "net_test.h"                         // for SocketTest
+
+namespace xgboost::collective {
+namespace {
+class TrackerTest : public SocketTest {};
+}  // namespace
+
+TEST_F(TrackerTest, GetHostAddress) {
+  std::string host;
+  auto rc = GetHostAddress(&host);
+  ASSERT_TRUE(rc.OK());
+  ASSERT_TRUE(host.find("127.") == std::string::npos);
+}
+}  // namespace xgboost::collective

From 0ecb4de9637ae1682334466d5a2cca69cb373ac8 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Tue, 10 Oct 2023 08:35:57 -0700
Subject: [PATCH 39/59] [breaking] Change DMatrix construction to be
 distributed (#9623)

* Change column-split DMatrix construction to be distributed

* remove splitting code for row split
---
 include/xgboost/data.h                |  3 +-
 src/data/data.cc                      | 56 +++------------------------
 src/data/simple_dmatrix.cc            |  4 +-
 src/data/simple_dmatrix.h             |  7 ++--
 tests/cpp/data/test_simple_dmatrix.cc | 18 +++++++++
 tests/cpp/helpers.cc                  |  7 ++--
 tests/cpp/helpers.h                   |  6 +--
 7 files changed, 36 insertions(+), 65 deletions(-)

diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index c232819f9..04b489d8b 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -559,8 +559,7 @@ class DMatrix {
    *
    * \param uri The URI of input.
    * \param silent Whether print information during loading.
-   * \param data_split_mode In distributed mode, split the input according this mode; otherwise,
-   *                        it's just an indicator on how the input was split beforehand.
+   * \param data_split_mode Indicate how the data was split beforehand.
    * \return The created DMatrix.
    */
   static DMatrix* Load(const std::string& uri, bool silent = true,
diff --git a/src/data/data.cc b/src/data/data.cc
index 4a2bef6be..3c190a90b 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -729,7 +729,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
 }
 
 void MetaInfo::SynchronizeNumberOfColumns() {
-  if (IsVerticalFederated()) {
+  if (IsColumnSplit()) {
     collective::Allreduce<collective::Operation::kSum>(&num_col_, 1);
   } else {
     collective::Allreduce<collective::Operation::kMax>(&num_col_, 1);
@@ -850,14 +850,6 @@ DMatrix* TryLoadBinary(std::string fname, bool silent) {
 }  // namespace
 
 DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode) {
-  auto need_split = false;
-  if (collective::IsFederated()) {
-    LOG(CONSOLE) << "XGBoost federated mode detected, not splitting data among workers";
-  } else if (collective::IsDistributed()) {
-    LOG(CONSOLE) << "XGBoost distributed mode detected, will split data among workers";
-    need_split = true;
-  }
-
   std::string fname, cache_file;
   auto dlm_pos = uri.find('#');
   if (dlm_pos != std::string::npos) {
@@ -865,24 +857,6 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
     fname = uri.substr(0, dlm_pos);
     CHECK_EQ(cache_file.find('#'), std::string::npos)
         << "Only one `#` is allowed in file path for cache file specification.";
-    if (need_split && data_split_mode == DataSplitMode::kRow) {
-      std::ostringstream os;
-      std::vector<std::string> cache_shards = common::Split(cache_file, ':');
-      for (size_t i = 0; i < cache_shards.size(); ++i) {
-        size_t pos = cache_shards[i].rfind('.');
-        if (pos == std::string::npos) {
-          os << cache_shards[i] << ".r" << collective::GetRank() << "-"
-             << collective::GetWorldSize();
-        } else {
-          os << cache_shards[i].substr(0, pos) << ".r" << collective::GetRank() << "-"
-             << collective::GetWorldSize() << cache_shards[i].substr(pos, cache_shards[i].length());
-        }
-        if (i + 1 != cache_shards.size()) {
-          os << ':';
-        }
-      }
-      cache_file = os.str();
-    }
   } else {
     fname = uri;
   }
@@ -894,19 +868,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
   }
 
   int partid = 0, npart = 1;
-  if (need_split && data_split_mode == DataSplitMode::kRow) {
-    partid = collective::GetRank();
-    npart = collective::GetWorldSize();
-  } else {
-    // test option to load in part
-    npart = 1;
-  }
-
-  if (npart != 1) {
-    LOG(CONSOLE) << "Load part of data " << partid << " of " << npart << " parts";
-  }
-
-  DMatrix* dmat{nullptr};
+  DMatrix* dmat{};
 
   if (cache_file.empty()) {
     fname = data::ValidateFileFormat(fname);
@@ -916,6 +878,8 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
     dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
                            cache_file, data_split_mode);
   } else {
+    CHECK(data_split_mode != DataSplitMode::kCol)
+        << "Column-wise data split is not supported for external memory.";
     data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart)};
     dmat = new data::SparsePageDMatrix{&iter,
                                        iter.Proxy(),
@@ -926,17 +890,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
                                        cache_file};
   }
 
-  if (need_split && data_split_mode == DataSplitMode::kCol) {
-    if (!cache_file.empty()) {
-      LOG(FATAL) << "Column-wise data split is not support for external memory.";
-    }
-    LOG(CONSOLE) << "Splitting data by column";
-    auto* sliced = dmat->SliceCol(npart, partid);
-    delete dmat;
-    return sliced;
-  } else {
-    return dmat;
-  }
+  return dmat;
 }
 
 template <typename DataIterHandle, typename DMatrixHandle, typename DataIterResetCallback,
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index 0adf6b466..1989f68a9 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -75,11 +75,11 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
 }
 
 void SimpleDMatrix::ReindexFeatures(Context const* ctx) {
-  if (info_.IsVerticalFederated()) {
+  if (info_.IsColumnSplit()) {
     std::vector<uint64_t> buffer(collective::GetWorldSize());
     buffer[collective::GetRank()] = info_.num_col_;
     collective::Allgather(buffer.data(), buffer.size() * sizeof(uint64_t));
-    auto offset = std::accumulate(buffer.cbegin(), buffer.cbegin() + collective::GetRank(), 0);
+    auto offset = std::accumulate(buffer.cbegin(), buffer.cbegin() + collective::GetRank(), 0ul);
     if (offset == 0) {
       return;
     }
diff --git a/src/data/simple_dmatrix.h b/src/data/simple_dmatrix.h
index d6164894a..5b5bb2bfb 100644
--- a/src/data/simple_dmatrix.h
+++ b/src/data/simple_dmatrix.h
@@ -64,9 +64,10 @@ class SimpleDMatrix : public DMatrix {
   /**
    * \brief Reindex the features based on a global view.
    *
-   * In some cases (e.g. vertical federated learning), features are loaded locally with indices
-   * starting from 0. However, all the algorithms assume the features are globally indexed, so we
-   * reindex the features based on the offset needed to obtain the global view.
+   * In some cases (e.g. column-wise data split and vertical federated learning), features are
+   * loaded locally with indices starting from 0. However, all the algorithms assume the features
+   * are globally indexed, so we reindex the features based on the offset needed to obtain the
+   * global view.
    */
   void ReindexFeatures(Context const* ctx);
 
diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc
index e4d5f2672..fa4165796 100644
--- a/tests/cpp/data/test_simple_dmatrix.cc
+++ b/tests/cpp/data/test_simple_dmatrix.cc
@@ -428,3 +428,21 @@ TEST(SimpleDMatrix, Threads) {
       DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 0, "")};
   ASSERT_EQ(p_fmat->Ctx()->Threads(), AllThreadsForTest());
 }
+
+namespace {
+void VerifyColumnSplit() {
+  size_t constexpr kRows {16};
+  size_t constexpr kCols {8};
+  auto dmat =
+      RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(false, false, 1, DataSplitMode::kCol);
+
+  ASSERT_EQ(dmat->Info().num_col_, kCols * collective::GetWorldSize());
+  ASSERT_EQ(dmat->Info().num_row_, kRows);
+  ASSERT_EQ(dmat->Info().data_split_mode, DataSplitMode::kCol);
+}
+}  // anonymous namespace
+
+TEST(SimpleDMatrix, ColumnSplit) {
+  auto constexpr kWorldSize{3};
+  RunWithInMemoryCommunicator(kWorldSize, VerifyColumnSplit);
+}
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 604c4d30a..97db9dbd8 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -378,9 +378,8 @@ void RandomDataGenerator::GenerateCSR(
   CHECK_EQ(columns->Size(), value->Size());
 }
 
-[[nodiscard]] std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDMatrix(bool with_label,
-                                                                            bool float_label,
-                                                                            size_t classes) const {
+[[nodiscard]] std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDMatrix(
+    bool with_label, bool float_label, size_t classes, DataSplitMode data_split_mode) const {
   HostDeviceVector<float> data;
   HostDeviceVector<bst_row_t> rptrs;
   HostDeviceVector<bst_feature_t> columns;
@@ -388,7 +387,7 @@ void RandomDataGenerator::GenerateCSR(
   data::CSRAdapter adapter(rptrs.HostPointer(), columns.HostPointer(), data.HostPointer(), rows_,
                            data.Size(), cols_);
   std::shared_ptr<DMatrix> out{
-      DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)};
+      DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1, "", data_split_mode)};
 
   if (with_label) {
     RandomDataGenerator gen{rows_, n_targets_, 0.0f};
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index a26669b7d..82a55450e 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -310,9 +310,9 @@ class RandomDataGenerator {
   void GenerateCSR(HostDeviceVector<float>* value, HostDeviceVector<bst_row_t>* row_ptr,
                    HostDeviceVector<bst_feature_t>* columns) const;
 
-  [[nodiscard]] std::shared_ptr<DMatrix> GenerateDMatrix(bool with_label = false,
-                                                         bool float_label = true,
-                                                         size_t classes = 1) const;
+  [[nodiscard]] std::shared_ptr<DMatrix> GenerateDMatrix(
+      bool with_label = false, bool float_label = true, size_t classes = 1,
+      DataSplitMode data_split_mode = DataSplitMode::kRow) const;
 
   [[nodiscard]] std::shared_ptr<DMatrix> GenerateSparsePageDMatrix(std::string prefix,
                                                                    bool with_label) const;

From 51e32e4905dc8f30f1ec6616aa98b155ed85793e Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Wed, 11 Oct 2023 03:04:10 -0500
Subject: [PATCH 40/59] [CI] add cmakelint to C++ linting task (#9641)

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
---
 .github/workflows/main.yml   |  4 +++-
 tests/ci_build/lint_cmake.sh | 14 ++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 tests/ci_build/lint_cmake.sh

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 9f1bf11c9..1f91afdc5 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -141,7 +141,7 @@ jobs:
         architecture: 'x64'
     - name: Install Python packages
       run: |
-        python -m pip install wheel setuptools cpplint pylint
+        python -m pip install wheel setuptools cmakelint cpplint pylint
     - name: Run lint
       run: |
         python3 tests/ci_build/lint_cpp.py xgboost cpp R-package/src
@@ -150,3 +150,5 @@ jobs:
             --exclude_path python-package/xgboost/dmlc-core python-package/xgboost/include \
                            python-package/xgboost/lib python-package/xgboost/rabit \
                            python-package/xgboost/src
+
+        sh ./tests/ci_build/lint_cmake.sh || true
diff --git a/tests/ci_build/lint_cmake.sh b/tests/ci_build/lint_cmake.sh
new file mode 100644
index 000000000..d67ecd084
--- /dev/null
+++ b/tests/ci_build/lint_cmake.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+set -e
+
+cmake_files=$(
+    find . -name CMakeLists.txt -o -path "./cmake/*.cmake" \
+    | grep -v dmlc-core \
+    | grep -v gputreeshap
+)
+cmakelint \
+    --linelength=120 \
+    --filter=-convention/filename,-package/stdargs,-readability/wonkycase \
+    ${cmake_files} \
+|| exit 1

From 084d89216c7d40d204fb0f936d582be1547d9edf Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 12 Oct 2023 09:36:36 +0800
Subject: [PATCH 41/59] Add support for cgroupv2. (#9651)

---
 src/common/threading_utils.cc | 69 ++++++++++++++++++++++++++++++-----
 src/common/threading_utils.h  |  5 ---
 2 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/src/common/threading_utils.cc b/src/common/threading_utils.cc
index 349cc0ba7..5e730e96d 100644
--- a/src/common/threading_utils.cc
+++ b/src/common/threading_utils.cc
@@ -3,14 +3,23 @@
  */
 #include "threading_utils.h"
 
-#include <fstream>
-#include <string>
+#include <algorithm>   // for max
+#include <exception>   // for exception
+#include <filesystem>  // for path, exists
+#include <fstream>     // for ifstream
+#include <string>      // for string
 
-#include "xgboost/logging.h"
+#include "common.h"  // for DivRoundUp
 
-namespace xgboost {
-namespace common {
-int32_t GetCfsCPUCount() noexcept {
+namespace xgboost::common {
+/**
+ * Modified from
+ * github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
+ *
+ * MIT License: Copyright (c) 2016 Domagoj Šarić
+ */
+std::int32_t GetCGroupV1Count(std::filesystem::path const& quota_path,
+                              std::filesystem::path const& peroid_path) {
 #if defined(__linux__)
   // https://bugs.openjdk.java.net/browse/JDK-8146115
   // http://hg.openjdk.java.net/jdk/hs/rev/7f22774a5f42
@@ -31,8 +40,8 @@ int32_t GetCfsCPUCount() noexcept {
     }
   };
   // complete fair scheduler from Linux
-  auto const cfs_quota(read_int("/sys/fs/cgroup/cpu/cpu.cfs_quota_us"));
-  auto const cfs_period(read_int("/sys/fs/cgroup/cpu/cpu.cfs_period_us"));
+  auto const cfs_quota(read_int(quota_path.c_str()));
+  auto const cfs_period(read_int(peroid_path.c_str()));
   if ((cfs_quota > 0) && (cfs_period > 0)) {
     return std::max(cfs_quota / cfs_period, 1);
   }
@@ -40,6 +49,47 @@ int32_t GetCfsCPUCount() noexcept {
   return -1;
 }
 
+std::int32_t GetCGroupV2Count(std::filesystem::path const& bandwidth_path) noexcept(true) {
+  std::int32_t cnt{-1};
+#if defined(__linux__)
+  namespace fs = std::filesystem;
+
+  std::int32_t a{0}, b{0};
+
+  auto warn = [] { LOG(WARNING) << "Invalid cgroupv2 file."; };
+  try {
+    std::ifstream fin{bandwidth_path, std::ios::in};
+    fin >> a;
+    fin >> b;
+  } catch (std::exception const&) {
+    warn();
+    return cnt;
+  }
+  if (a > 0 && b > 0) {
+    cnt = std::max(common::DivRoundUp(a, b), 1);
+  }
+#endif  //  defined(__linux__)
+  return cnt;
+}
+
+std::int32_t GetCfsCPUCount() noexcept {
+  namespace fs = std::filesystem;
+  fs::path const bandwidth_path{"/sys/fs/cgroup/cpu.max"};
+  auto has_v2 = fs::exists(bandwidth_path);
+  if (has_v2) {
+    return GetCGroupV2Count(bandwidth_path);
+  }
+
+  fs::path const quota_path{"/sys/fs/cgroup/cpu/cpu.cfs_quota_us"};
+  fs::path const peroid_path{"/sys/fs/cgroup/cpu/cpu.cfs_period_us"};
+  auto has_v1 = fs::exists(quota_path) && fs::exists(peroid_path);
+  if (has_v1) {
+    return GetCGroupV1Count(quota_path, peroid_path);
+  }
+
+  return -1;
+}
+
 std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
   // Don't use parallel if we are in a parallel region.
   if (omp_in_parallel()) {
@@ -54,5 +104,4 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
   n_threads = std::max(n_threads, 1);
   return n_threads;
 }
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h
index 4ca4ca070..ac7119035 100644
--- a/src/common/threading_utils.h
+++ b/src/common/threading_utils.h
@@ -253,11 +253,6 @@ inline std::int32_t OmpGetThreadLimit() {
  * \brief Get thread limit from CFS.
  *
  *   This function has non-trivial overhead and should not be called repeatly.
- *
- * Modified from
- * github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
- *
- * MIT License: Copyright (c) 2016 Domagoj Šarić
  */
 std::int32_t GetCfsCPUCount() noexcept;
 

From 2e42f33fc127a15be927b98667d15ababa9efdfe Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Wed, 11 Oct 2023 22:14:19 -0500
Subject: [PATCH 42/59] [CI] standardize else() and endfunction() calls in CMake
 scripts (#9653)

---
 CMakeLists.txt                     |  6 +++---
 cmake/Doc.cmake                    |  2 +-
 cmake/FindPrefetchIntrinsics.cmake |  2 +-
 cmake/Utils.cmake                  | 22 +++++++++++-----------
 cmake/Version.cmake                |  2 +-
 cmake/modules/FindLibR.cmake       |  2 +-
 tests/cpp/CMakeLists.txt           |  4 ++--
 7 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b7ee23d1b..8e1a38929 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -269,7 +269,7 @@ endif()
 #-- library
 if (BUILD_STATIC_LIB)
   add_library(xgboost STATIC)
-else (BUILD_STATIC_LIB)
+else()
   add_library(xgboost SHARED)
 endif()
 target_link_libraries(xgboost PRIVATE objxgboost)
@@ -365,10 +365,10 @@ if (BUILD_STATIC_LIB)
   else()
     set(INSTALL_TARGETS xgboost objxgboost dmlc)
   endif()
-else (BUILD_STATIC_LIB)
+else()
   if (BUILD_DEPRECATED_CLI)
     set(INSTALL_TARGETS xgboost runxgboost)
-  else(BUILD_DEPRECATED_CLI)
+  else()
     set(INSTALL_TARGETS xgboost)
   endif()
 endif()
diff --git a/cmake/Doc.cmake b/cmake/Doc.cmake
index 7c3d05c00..cf4ef26f4 100644
--- a/cmake/Doc.cmake
+++ b/cmake/Doc.cmake
@@ -13,4 +13,4 @@ function (run_doxygen)
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
     COMMENT "Generate C APIs documentation."
     VERBATIM)
-endfunction (run_doxygen)
+endfunction()
diff --git a/cmake/FindPrefetchIntrinsics.cmake b/cmake/FindPrefetchIntrinsics.cmake
index b00ff57d7..94e65de32 100644
--- a/cmake/FindPrefetchIntrinsics.cmake
+++ b/cmake/FindPrefetchIntrinsics.cmake
@@ -19,4 +19,4 @@ function (find_prefetch_intrinsics)
   " XGBOOST_BUILTIN_PREFETCH_PRESENT)
   set(XGBOOST_MM_PREFETCH_PRESENT ${XGBOOST_MM_PREFETCH_PRESENT} PARENT_SCOPE)
   set(XGBOOST_BUILTIN_PREFETCH_PRESENT ${XGBOOST_BUILTIN_PREFETCH_PRESENT} PARENT_SCOPE)
-endfunction (find_prefetch_intrinsics)
+endfunction()
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index 686a545d0..f1e9a5904 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -11,7 +11,7 @@ function(auto_source_group SOURCES)
 
       source_group("${GROUP}" FILES "${FILE}")
   endforeach()
-endfunction(auto_source_group)
+endfunction()
 
 # Force static runtime for MSVC
 function(msvc_use_static_runtime)
@@ -50,7 +50,7 @@ function(msvc_use_static_runtime)
           endif()
       endforeach()
   endif()
-endfunction(msvc_use_static_runtime)
+endfunction()
 
 # Set output directory of target, ignoring debug or release
 function(set_output_directory target dir)
@@ -70,7 +70,7 @@ function(set_output_directory target dir)
     ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${dir}
     ARCHIVE_OUTPUT_DIRECTORY_RELWITHDEBINFO ${dir}
     ARCHIVE_OUTPUT_DIRECTORY_MINSIZEREL ${dir})
-endfunction(set_output_directory)
+endfunction()
 
 # Set a default build type to release if none was specified
 function(set_default_configuration_release)
@@ -80,7 +80,7 @@ function(set_default_configuration_release)
 	  message(STATUS "Setting build type to 'Release' as none was specified.")
 	  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE )
 	endif()
-endfunction(set_default_configuration_release)
+endfunction()
 
 # Generate nvcc compiler flags given a list of architectures
 # Also generates PTX for the most recent architecture for forwards compatibility
@@ -122,7 +122,7 @@ function(format_gencode_flags flags out)
     set(${out} "${${out}}" PARENT_SCOPE)
     message(STATUS "CUDA GEN_CODE: ${GEN_CODE}")
   endif()
-endfunction(format_gencode_flags flags)
+endfunction()
 
 # Set CUDA related flags to target.  Must be used after code `format_gencode_flags`.
 function(xgboost_set_cuda_flags target)
@@ -154,7 +154,7 @@ function(xgboost_set_cuda_flags target)
   if (USE_DEVICE_DEBUG)
     target_compile_options(${target} PRIVATE
       $<$<AND:$<CONFIG:DEBUG>,$<COMPILE_LANGUAGE:CUDA>>:-G;-src-in-ptx>)
-  else (USE_DEVICE_DEBUG)
+  else()
     target_compile_options(${target} PRIVATE
       $<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>)
   endif()
@@ -179,7 +179,7 @@ function(xgboost_set_cuda_flags target)
     CUDA_STANDARD_REQUIRED ON
     CUDA_SEPARABLE_COMPILATION OFF
     CUDA_RUNTIME_LIBRARY Static)
-endfunction(xgboost_set_cuda_flags)
+endfunction()
 
 macro(xgboost_link_nccl target)
   if (BUILD_STATIC_LIB)
@@ -191,7 +191,7 @@ macro(xgboost_link_nccl target)
     target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NCCL=1)
     target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY})
   endif()
-endmacro(xgboost_link_nccl)
+endmacro()
 
 # compile options
 macro(xgboost_target_properties target)
@@ -233,7 +233,7 @@ macro(xgboost_target_properties target)
   if (WIN32 AND MINGW)
     target_compile_options(${target} PUBLIC -static-libstdc++)
   endif()
-endmacro(xgboost_target_properties)
+endmacro()
 
 # Custom definitions used in xgboost.
 macro(xgboost_target_defs target)
@@ -260,7 +260,7 @@ macro(xgboost_target_defs target)
   if (PLUGIN_RMM)
     target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1)
   endif()
-endmacro(xgboost_target_defs)
+endmacro()
 
 # handles dependencies
 macro(xgboost_target_link_libraries target)
@@ -298,4 +298,4 @@ macro(xgboost_target_link_libraries target)
   if (MINGW)
     target_link_libraries(${target} PRIVATE wsock32 ws2_32)
   endif()
-endmacro(xgboost_target_link_libraries)
+endmacro()
diff --git a/cmake/Version.cmake b/cmake/Version.cmake
index ea8c081dc..a64d1b65e 100644
--- a/cmake/Version.cmake
+++ b/cmake/Version.cmake
@@ -3,4 +3,4 @@ function (write_version)
   configure_file(
     ${xgboost_SOURCE_DIR}/cmake/version_config.h.in
     ${xgboost_SOURCE_DIR}/include/xgboost/version_config.h @ONLY)
-endfunction (write_version)
+endfunction()
diff --git a/cmake/modules/FindLibR.cmake b/cmake/modules/FindLibR.cmake
index c9d9509fa..1eb384238 100644
--- a/cmake/modules/FindLibR.cmake
+++ b/cmake/modules/FindLibR.cmake
@@ -66,7 +66,7 @@ function(create_rlib_for_msvc)
   execute_process(COMMAND ${DLLTOOL_EXE}
     "--input-def" "${CMAKE_CURRENT_BINARY_DIR}/R.def"
     "--output-lib" "${CMAKE_CURRENT_BINARY_DIR}/R.lib")
-endfunction(create_rlib_for_msvc)
+endfunction()
 
 
 # detection for OSX
diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt
index 1aebcc2ef..5b61f5328 100644
--- a/tests/cpp/CMakeLists.txt
+++ b/tests/cpp/CMakeLists.txt
@@ -3,7 +3,7 @@ if (USE_DMLC_GTEST)
     message(FATAL_ERROR "USE_DMLC_GTEST=ON but dmlc-core didn't bundle gtest")
   endif()
   set(GTEST_LIBRARIES gtest)
-else (USE_DMLC_GTEST)
+else()
   find_package(GTest REQUIRED)
 endif()
 file(GLOB_RECURSE TEST_SOURCES "*.cc")
@@ -21,7 +21,7 @@ endif()
 if (PLUGIN_FEDERATED)
   target_include_directories(testxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/federated)
   target_link_libraries(testxgboost PRIVATE federated_client)
-else (PLUGIN_FEDERATED)
+else()
   file(GLOB_RECURSE FEDERATED_TEST_SOURCES "plugin/*_federated_*.*")
   list(REMOVE_ITEM TEST_SOURCES ${FEDERATED_TEST_SOURCES})
 endif()

From 946ae1c440d4e949a267d9991763e0303a3351e7 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 12 Oct 2023 12:49:16 +0800
Subject: [PATCH 43/59] [coll] Implement a new tracker and a communicator.
 (#9650)

* [coll] Implement a new tracker and a communicator.

The new tracker and communicators communicate through the use of JSON documents. Along
with which, communicators are aware of each other.
---
 R-package/src/Makevars.in            |   4 +
 R-package/src/Makevars.win           |   4 +
 include/xgboost/collective/result.h  |   9 +
 include/xgboost/collective/socket.h  |  48 ++++-
 src/collective/allgather.cc          |  42 ++++
 src/collective/allgather.h           |  23 ++
 src/collective/comm.cc               | 302 +++++++++++++++++++++++++++
 src/collective/comm.h                | 160 ++++++++++++++
 src/collective/protocol.h            | 214 +++++++++++++++++++
 src/collective/tracker.cc            | 222 +++++++++++++++++++-
 src/collective/tracker.h             | 128 +++++++++++-
 tests/cpp/collective/test_comm.cc    |  47 +++++
 tests/cpp/collective/test_socket.cc  |   2 +-
 tests/cpp/collective/test_tracker.cc |  65 +++++-
 tests/cpp/collective/test_worker.h   |  91 ++++++++
 15 files changed, 1345 insertions(+), 16 deletions(-)
 create mode 100644 src/collective/allgather.cc
 create mode 100644 src/collective/allgather.h
 create mode 100644 src/collective/comm.cc
 create mode 100644 src/collective/comm.h
 create mode 100644 src/collective/protocol.h
 create mode 100644 tests/cpp/collective/test_comm.cc
 create mode 100644 tests/cpp/collective/test_worker.h

diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index e3af418e3..541c0fb52 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -98,9 +98,13 @@ OBJECTS= \
     $(PKGROOT)/src/context.o \
     $(PKGROOT)/src/logging.o \
     $(PKGROOT)/src/global_config.o \
+    $(PKGROOT)/src/collective/allgather.o \
+    $(PKGROOT)/src/collective/comm.o \
+    $(PKGROOT)/src/collective/tracker.o \
     $(PKGROOT)/src/collective/communicator.o \
     $(PKGROOT)/src/collective/in_memory_communicator.o \
     $(PKGROOT)/src/collective/in_memory_handler.o \
+    $(PKGROOT)/src/collective/loop.o \
     $(PKGROOT)/src/collective/socket.o \
     $(PKGROOT)/src/common/charconv.o \
     $(PKGROOT)/src/common/column_matrix.o \
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index 8f003403f..faacd6d8d 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -98,9 +98,13 @@ OBJECTS= \
     $(PKGROOT)/src/context.o \
     $(PKGROOT)/src/logging.o \
     $(PKGROOT)/src/global_config.o \
+    $(PKGROOT)/src/collective/allgather.o \
+    $(PKGROOT)/src/collective/comm.o \
+    $(PKGROOT)/src/collective/tracker.o \
     $(PKGROOT)/src/collective/communicator.o \
     $(PKGROOT)/src/collective/in_memory_communicator.o \
     $(PKGROOT)/src/collective/in_memory_handler.o \
+    $(PKGROOT)/src/collective/loop.o \
     $(PKGROOT)/src/collective/socket.o \
     $(PKGROOT)/src/common/charconv.o \
     $(PKGROOT)/src/common/column_matrix.o \
diff --git a/include/xgboost/collective/result.h b/include/xgboost/collective/result.h
index 209362505..507171dd4 100644
--- a/include/xgboost/collective/result.h
+++ b/include/xgboost/collective/result.h
@@ -157,4 +157,13 @@ struct Result {
 [[nodiscard]] inline auto Fail(std::string msg, std::error_code errc, Result&& prev) {
   return Result{std::move(msg), std::move(errc), std::forward<Result>(prev)};
 }
+
+// Chain fallible steps: `fn` runs only when `r` is OK, otherwise the failure is passed along.
+template <typename Fn>
+Result operator<<(Result&& r, Fn&& fn) {
+  if (r.OK()) {
+    return fn();
+  }
+  return std::forward<Result>(r);
+}
 }  // namespace xgboost::collective
diff --git a/include/xgboost/collective/socket.h b/include/xgboost/collective/socket.h
index a16dd05c0..5dd1b9ffa 100644
--- a/include/xgboost/collective/socket.h
+++ b/include/xgboost/collective/socket.h
@@ -380,11 +380,18 @@ class TCPSocket {
   }
   [[nodiscard]] bool NonBlocking() const { return non_blocking_; }
   [[nodiscard]] Result RecvTimeout(std::chrono::seconds timeout) {
-    timeval tv;
+    // https://stackoverflow.com/questions/2876024/linux-is-there-a-read-or-recv-from-socket-with-timeout
+#if defined(_WIN32)
+    DWORD tv = timeout.count() * 1000;
+    auto rc =
+        setsockopt(Handle(), SOL_SOCKET, SO_RCVTIMEO, reinterpret_cast<char *>(&tv), sizeof(tv));
+#else
+    struct timeval tv;
     tv.tv_sec = timeout.count();
     tv.tv_usec = 0;
     auto rc = setsockopt(Handle(), SOL_SOCKET, SO_RCVTIMEO, reinterpret_cast<char const *>(&tv),
                          sizeof(tv));
+#endif
     if (rc != 0) {
       return system::FailWithCode("Failed to set timeout on recv.");
     }
@@ -425,7 +432,12 @@ class TCPSocket {
    */
   TCPSocket Accept() {
     HandleT newfd = accept(Handle(), nullptr, nullptr);
-    if (newfd == InvalidSocket()) {
+#if defined(_WIN32)
+    auto interrupt = WSAEINTR;
+#else
+    auto interrupt = EINTR;
+#endif
+    if (newfd == InvalidSocket() && system::LastError() != interrupt) {
       system::ThrowAtError("accept");
     }
     TCPSocket newsock{newfd};
@@ -468,7 +480,7 @@ class TCPSocket {
   /**
    * \brief Bind socket to INADDR_ANY, return the port selected by the OS.
    */
-  in_port_t BindHost() {
+  [[nodiscard]] in_port_t BindHost() {
     if (Domain() == SockDomain::kV6) {
       auto addr = SockAddrV6::InaddrAny();
       auto handle = reinterpret_cast<sockaddr const *>(&addr.Handle());
@@ -539,7 +551,7 @@ class TCPSocket {
   /**
    * \brief Send data, without error then all data should be sent.
    */
-  auto SendAll(void const *buf, std::size_t len) {
+  [[nodiscard]] auto SendAll(void const *buf, std::size_t len) {
     char const *_buf = reinterpret_cast<const char *>(buf);
     std::size_t ndone = 0;
     while (ndone < len) {
@@ -558,7 +570,7 @@ class TCPSocket {
   /**
    * \brief Receive data, without error then all data should be received.
    */
-  auto RecvAll(void *buf, std::size_t len) {
+  [[nodiscard]] auto RecvAll(void *buf, std::size_t len) {
     char *_buf = reinterpret_cast<char *>(buf);
     std::size_t ndone = 0;
     while (ndone < len) {
@@ -612,7 +624,15 @@ class TCPSocket {
    */
   void Close() {
     if (InvalidSocket() != handle_) {
+#if defined(_WIN32)
+      auto rc = system::CloseSocket(handle_);
+      // it's possible that we close TCP sockets after finalizing WSA due to detached thread.
+      if (rc != 0 && system::LastError() != WSANOTINITIALISED) {
+        system::ThrowAtError("close");  // rc is only the failure flag, not an error code
+      }
+#else
       xgboost_CHECK_SYS_CALL(system::CloseSocket(handle_), 0);
+#endif
       handle_ = InvalidSocket();
     }
   }
@@ -634,6 +654,24 @@ class TCPSocket {
     socket.domain_ = domain;
 #endif  // defined(__APPLE__)
     return socket;
+#endif  // defined(xgboost_IS_MINGW)
+  }
+
+  static TCPSocket *CreatePtr(SockDomain domain) {  // create a TCP stream socket on the heap
+#if defined(xgboost_IS_MINGW)
+    MingWError();
+    return nullptr;
+#else
+    auto fd = socket(static_cast<std::int32_t>(domain), SOCK_STREAM, 0);
+    if (fd == InvalidSocket()) {
+      system::ThrowAtError("socket");
+    }
+    auto socket = new TCPSocket{fd};  // raw `new`: the caller receives an owning pointer
+
+#if defined(__APPLE__)
+    socket->domain_ = domain;  // the domain is only stored explicitly on Apple platforms
+#endif  // defined(__APPLE__)
+    return socket;
 #endif  // defined(xgboost_IS_MINGW)
   }
 };
diff --git a/src/collective/allgather.cc b/src/collective/allgather.cc
new file mode 100644
index 000000000..dba36c88c
--- /dev/null
+++ b/src/collective/allgather.cc
@@ -0,0 +1,42 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include "allgather.h"
+
+#include <algorithm>  // for min
+#include <cstddef>    // for size_t
+#include <cstdint>    // for int8_t
+#include <memory>     // for shared_ptr
+
+#include "comm.h"          // for Comm, Channel
+#include "xgboost/span.h"  // for Span
+
+namespace xgboost::collective::cpu_impl {
+Result RingAllgather(Comm const& comm, common::Span<std::int8_t> data, std::size_t segment_size,
+                     std::int32_t worker_off, std::shared_ptr<Channel> prev_ch,
+                     std::shared_ptr<Channel> next_ch) {
+  auto world = comm.World();
+  auto rank = comm.Rank();
+  CHECK_LT(worker_off, world);
+  // Segment belonging to ring position `r`, clamped so it never runs past the end of `data`.
+  auto segment_of = [&](std::int32_t r) {
+    auto beg = std::min(r * segment_size, data.size_bytes());
+    return data.subspan(beg, std::min(segment_size, data.size_bytes() - beg));
+  };
+
+  for (std::int32_t r = 0; r < world; ++r) {
+    // Queue one segment for the next worker on the ring ...
+    auto send_seg = segment_of((rank + world - r + worker_off) % world);
+    next_ch->SendAll(send_seg.data(), send_seg.size_bytes());
+    // ... and queue the receive of the preceding segment from the previous worker.
+    auto recv_seg = segment_of((rank + world - r - 1 + worker_off) % world);
+    prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes());
+    // Wait for the queued operations before starting the next round.
+    auto rc = prev_ch->Block();
+    if (!rc.OK()) {
+      return rc;
+    }
+  }
+  return Success();
+}
+}  // namespace xgboost::collective::cpu_impl
diff --git a/src/collective/allgather.h b/src/collective/allgather.h
new file mode 100644
index 000000000..31a9a36b3
--- /dev/null
+++ b/src/collective/allgather.h
@@ -0,0 +1,23 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#pragma once
+#include <cstddef>  // for size_t
+#include <cstdint>  // for int32_t
+#include <memory>   // for shared_ptr
+
+#include "comm.h"          // for Comm, Channel
+#include "xgboost/span.h"  // for Span
+
+namespace xgboost::collective {
+namespace cpu_impl {
+/**
+ * @param worker_off Segment offset. For example, if the rank 2 worker specifies worker_off
+ *                   = 1, then it owns segment index 3, i.e. (rank + worker_off) % world.
+ */
+[[nodiscard]] Result RingAllgather(Comm const& comm, common::Span<std::int8_t> data,
+                                   std::size_t segment_size, std::int32_t worker_off,
+                                   std::shared_ptr<Channel> prev_ch,
+                                   std::shared_ptr<Channel> next_ch);
+}  // namespace cpu_impl
+}  // namespace xgboost::collective
diff --git a/src/collective/comm.cc b/src/collective/comm.cc
new file mode 100644
index 000000000..7e0af9c18
--- /dev/null
+++ b/src/collective/comm.cc
@@ -0,0 +1,302 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include "comm.h"
+
+#include <algorithm>  // for copy
+#include <chrono>     // for seconds
+#include <memory>     // for shared_ptr
+#include <string>     // for string
+#include <utility>    // for move, forward
+
+#include "allgather.h"
+#include "protocol.h"                   // for kMagic
+#include "xgboost/collective/socket.h"  // for TCPSocket
+#include "xgboost/json.h"               // for Json, Object
+
+namespace xgboost::collective {
+Comm::Comm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
+           std::int32_t retry, std::string task_id)
+    : timeout_{timeout},
+      retry_{retry},
+      tracker_{host, port, -1},  // tracker address; its rank field starts out as -1
+      task_id_{std::move(task_id)},
+      loop_{std::make_shared<Loop>(timeout)} {}  // the loop is built with the same timeout
+
+Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, std::int32_t retry,
+                          std::string const& task_id, TCPSocket* out, std::int32_t rank,
+                          std::int32_t world) {
+  // Connect `out` to the tracker at `info`, run the magic check and announce this worker.
+  CHECK(!info.host.empty());
+  auto rc = Connect(info.host, info.port, retry, timeout, out);
+  if (!rc.OK()) {
+    return Fail("Failed to connect to the tracker.", std::move(rc));
+  }
+
+  TCPSocket& tracker = *out;
+  return std::move(rc)  // each step below runs only if the previous one returned OK
+      << [&] { return tracker.NonBlocking(false); }
+      << [&] { return tracker.RecvTimeout(timeout); }
+      << [&] { return proto::Magic{}.Verify(&tracker); }
+      << [&] { return proto::Connect{}.WorkerSend(&tracker, world, rank, task_id); };
+}
+
+[[nodiscard]] Result Comm::ConnectTracker(TCPSocket* out) const {  // uses this comm's settings
+  return ConnectTrackerImpl(this->TrackerInfo(), this->Timeout(), this->retry_, this->task_id_, out,
+                            this->Rank(), this->World());
+}
+
+[[nodiscard]] Result ConnectWorkers(Comm const& comm, TCPSocket* listener, std::int32_t lport,
+                                    proto::PeerInfo ninfo, std::chrono::seconds timeout,
+                                    std::int32_t retry,
+                                    std::vector<std::shared_ptr<TCPSocket>>* out_workers) {
+  // Link the ring neighbours, gather every peer's address over the ring, then connect all pairs.
+  auto next = std::make_shared<TCPSocket>();
+  auto prev = std::make_shared<TCPSocket>();
+  auto rc = Success() << [&] {
+    auto rc = Connect(ninfo.host, ninfo.port, retry, timeout, next.get());
+    if (!rc.OK()) {
+      return Fail("Bootstrap failed to connect to ring next.", std::move(rc));
+    }
+    return rc;
+  } << [&] {
+    return next->NonBlocking(true);
+  } << [&] {
+    SockAddrV4 addr;
+    return listener->Accept(prev.get(), &addr);
+  } << [&] { return prev->NonBlocking(true); };
+  if (!rc.OK()) {
+    return rc;
+  }
+
+  // exchange host name and port
+  std::vector<std::int8_t> buffer(HOST_NAME_MAX * comm.World(), 0);
+  auto s_buffer = common::Span{buffer.data(), buffer.size()};
+  auto next_host = s_buffer.subspan(HOST_NAME_MAX * comm.Rank(), HOST_NAME_MAX);
+  if (next_host.size() <= ninfo.host.size()) {  // keep room for the terminating NUL
+    return Fail("Got an invalid host name.");
+  }
+  std::copy(ninfo.host.cbegin(), ninfo.host.cend(), next_host.begin());
+
+  auto prev_ch = std::make_shared<Channel>(comm, prev);
+  auto next_ch = std::make_shared<Channel>(comm, next);
+
+  auto block = [&] {  // wait on both ring channels, returning the first failure
+    for (auto ch : {prev_ch, next_ch}) {
+      auto rc = ch->Block();
+      if (!rc.OK()) {
+        return rc;
+      }
+    }
+    return Success();
+  };
+
+  rc = std::move(rc) << [&] {
+    return cpu_impl::RingAllgather(comm, s_buffer, HOST_NAME_MAX, 0, prev_ch, next_ch);
+  } << [&] { return block(); };
+  if (!rc.OK()) {
+    return Fail("Failed to get host names from peers.", std::move(rc));
+  }
+
+  std::vector<std::int32_t> peers_port(comm.World(), -1);
+  peers_port[comm.Rank()] = ninfo.port;
+  rc = std::move(rc) << [&] {
+    auto s_ports = common::Span{reinterpret_cast<std::int8_t*>(peers_port.data()),
+                                peers_port.size() * sizeof(ninfo.port)};
+    return cpu_impl::RingAllgather(comm, s_ports, sizeof(ninfo.port), 0, prev_ch, next_ch);
+  } << [&] { return block(); };
+  if (!rc.OK()) {
+    return Fail("Failed to get the port from peers.", std::move(rc));
+  }
+
+  std::vector<proto::PeerInfo> peers(comm.World());
+  for (auto r = 0; r < comm.World(); ++r) {
+    auto nhost = s_buffer.subspan(HOST_NAME_MAX * r, HOST_NAME_MAX);
+    auto nport = peers_port[r];
+    auto nrank = BootstrapNext(r, comm.World());  // worker r contributed its successor's address
+
+    peers[nrank] = {std::string{reinterpret_cast<char const*>(nhost.data())}, nport, nrank};
+  }
+  CHECK_EQ(peers[comm.Rank()].port, lport);
+  for (auto const& p : peers) {
+    CHECK_NE(p.port, -1);
+  }
+
+  std::vector<std::shared_ptr<TCPSocket>>& workers = *out_workers;
+  workers.resize(comm.World());
+
+  for (std::int32_t r = (comm.Rank() + 1); r < comm.World(); ++r) {  // dial higher ranks
+    auto const& peer = peers[r];
+    std::shared_ptr<TCPSocket> worker{TCPSocket::CreatePtr(comm.Domain())};
+    rc = std::move(rc)
+         << [&] { return Connect(peer.host, peer.port, retry, timeout, worker.get()); }
+         << [&] { return worker->RecvTimeout(timeout); };
+    if (!rc.OK()) {
+      return rc;
+    }
+
+    auto rank = comm.Rank();
+    auto n_bytes = worker->SendAll(&rank, sizeof(comm.Rank()));
+    if (n_bytes != sizeof(comm.Rank())) {
+      return Fail("Failed to send rank.");
+    }
+    workers[r] = std::move(worker);
+  }
+
+  for (std::int32_t r = 0; r < comm.Rank(); ++r) {  // accept connections from lower ranks
+    SockAddrV4 addr;
+    auto peer = std::shared_ptr<TCPSocket>(TCPSocket::CreatePtr(comm.Domain()));
+    rc = std::move(rc) << [&] { return listener->Accept(peer.get(), &addr); }
+                       << [&] { return peer->RecvTimeout(timeout); };
+    if (!rc.OK()) {
+      return rc;
+    }
+    std::int32_t rank{-1};
+    auto n_bytes = peer->RecvAll(&rank, sizeof(rank));
+    if (n_bytes != sizeof(comm.Rank()) || rank < 0 || rank >= comm.World()) {
+      return Fail("Failed to recv rank.");
+    }
+    workers[rank] = std::move(peer);  // rank comes from the network and was range-checked above
+  }
+
+  for (std::int32_t r = 0; r < comm.World(); ++r) {
+    if (r == comm.Rank()) {
+      continue;
+    }
+    CHECK(workers[r]);
+  }
+
+  return Success();
+}
+
+RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
+                     std::int32_t retry, std::string task_id)
+    : Comm{host, port, timeout, retry, std::move(task_id)} {  // host is const&: nothing to move
+  auto rc = this->Bootstrap(timeout_, retry_, task_id_);  // connect to the tracker and the peers
+  CHECK(rc.OK()) << rc.Report();  // a failed bootstrap trips the CHECK with the error report
+}
+
+[[nodiscard]] Result RabitComm::Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
+                                          std::string task_id) {
+  TCPSocket tracker;
+  std::int32_t world{-1};  // not known yet; filled in by start.WorkerRecv below
+  auto rc = ConnectTrackerImpl(this->TrackerInfo(), timeout, retry, task_id, &tracker, this->Rank(),
+                               world);
+  if (!rc.OK()) {
+    return Fail("Bootstrap failed.", std::move(rc));
+  }
+
+  this->domain_ = tracker.Domain();
+
+  // Start command: open the listener that is later handed to ConnectWorkers.
+  TCPSocket listener = TCPSocket::Create(tracker.Domain());
+  std::int32_t lport = listener.BindHost();
+  listener.Listen();
+
+  // Create a listening socket and a background thread that waits for an error notice.
+  auto domain = tracker.Domain();
+  std::shared_ptr<TCPSocket> error_sock{TCPSocket::CreatePtr(domain)};
+  auto eport = error_sock->BindHost();
+  error_sock->Listen();
+  error_worker_ = std::thread{[this, error_sock = std::move(error_sock)] {
+    auto conn = error_sock->Accept();
+    // On Windows accept returns an invalid socket after network is shutdown.
+    if (conn.IsClosed()) {
+      return;
+    }
+    LOG(WARNING) << "Another worker is running into error.";
+    std::string scmd;
+    conn.Recv(&scmd);
+    auto jcmd = Json::Load(scmd);  // NOTE(review): parsed but not used afterwards
+    auto rc = this->Shutdown();
+    if (!rc.OK()) {
+      LOG(WARNING) << "Fail to shutdown worker:" << rc.Report();
+    }
+#if !defined(XGBOOST_STRICT_R_MODE) || XGBOOST_STRICT_R_MODE == 0
+    exit(-1);
+#else
+    LOG(FATAL) << rc.Report();
+#endif
+  }};
+  error_worker_.detach();  // NOTE(review): the detached thread keeps a captured `this`
+
+  proto::Start start;
+  rc = std::move(rc) << [&] { return start.WorkerSend(lport, &tracker, eport); }
+                     << [&] { return start.WorkerRecv(&tracker, &world); };
+  if (!rc.OK()) {
+    return rc;
+  }
+  this->world_ = world;
+
+  // get ring neighbors
+  std::string snext;
+  tracker.Recv(&snext);  // NOTE(review): the return value is not checked
+  auto jnext = Json::Load(StringView{snext});
+
+  proto::PeerInfo ninfo{jnext};
+
+  // get the rank of this worker
+  this->rank_ = BootstrapPrev(ninfo.rank, world);  // ninfo describes the ring successor
+  this->tracker_.rank = rank_;
+
+  std::vector<std::shared_ptr<TCPSocket>> workers;
+  rc = ConnectWorkers(*this, &listener, lport, ninfo, timeout, retry, &workers);
+  if (!rc.OK()) {
+    return rc;
+  }
+
+  CHECK(this->channels_.empty());
+  for (auto& w : workers) {
+    if (w) {  // presumably only the slot for this worker's own rank is null
+      w->SetNoDelay();
+      rc = w->NonBlocking(true);
+    }
+    if (!rc.OK()) {
+      return rc;
+    }
+    this->channels_.emplace_back(std::make_shared<Channel>(*this, w));  // one channel per rank
+  }
+  return rc;
+}
+
+RabitComm::~RabitComm() noexcept(false) {
+  // Only a distributed communicator performs the shutdown; a failed shutdown is just logged.
+  if (IsDistributed()) {
+    auto status = this->Shutdown();
+    if (!status.OK()) {
+      LOG(WARNING) << status.Report();
+    }
+  }
+}
+
+[[nodiscard]] Result RabitComm::Shutdown() {  // send the shutdown command to the tracker
+  TCPSocket tracker;  // fresh connection used only for this message
+  return Success() << [&] {
+    return ConnectTrackerImpl(tracker_, timeout_, retry_, task_id_, &tracker, Rank(), World());
+  } << [&] {
+    return this->Block();  // wait for operations already submitted to the loop
+  } << [&] {
+    Json jcmd{Object{}};
+    jcmd["cmd"] = Integer{static_cast<std::int32_t>(proto::CMD::kShutdown)};
+    auto scmd = Json::Dump(jcmd);
+    auto n_bytes = tracker.Send(scmd);
+    if (n_bytes != scmd.size()) {
+      return Fail("Failed to send cmd.");
+    }
+    return Success();
+  };
+}
+
+[[nodiscard]] Result RabitComm::LogTracker(std::string msg) const {  // send `msg` to the tracker
+  TCPSocket out;  // a new tracker connection is opened for every message
+  proto::Print print;
+  return Success() << [&] { return this->ConnectTracker(&out); }
+                   << [&] { return print.WorkerSend(&out, msg); };
+}
+
+[[nodiscard]] Result RabitComm::SignalError(Result const& res) {  // report `res` to the tracker
+  TCPSocket out;  // a new tracker connection is opened for the report
+  return Success() << [&] { return this->ConnectTracker(&out); }
+                   << [&] { return proto::ErrorCMD{}.WorkerSend(&out, res); };
+}
+}  // namespace xgboost::collective
diff --git a/src/collective/comm.h b/src/collective/comm.h
new file mode 100644
index 000000000..f23810034
--- /dev/null
+++ b/src/collective/comm.h
@@ -0,0 +1,160 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#pragma once
+#include <chrono>              // for seconds
+#include <condition_variable>  // for condition_variable
+#include <cstddef>             // for size_t
+#include <cstdint>             // for int32_t
+#include <memory>              // for shared_ptr
+#include <mutex>               // for mutex
+#include <queue>               // for queue
+#include <string>              // for string
+#include <thread>              // for thread
+#include <type_traits>         // for remove_const_t
+#include <utility>             // for move
+#include <vector>              // for vector
+
+#include "../common/timer.h"
+#include "loop.h"                       // for Loop
+#include "protocol.h"                   // for PeerInfo
+#include "xgboost/collective/result.h"  // for Result
+#include "xgboost/collective/socket.h"  // for TCPSocket
+#include "xgboost/span.h"               // for Span
+
+namespace xgboost::collective {
+
+inline constexpr std::int32_t DefaultTimeoutSec() { return 300; }  // 5min
+inline constexpr std::int32_t DefaultRetry() { return 3; }  // default value of Comm::retry_
+
+// indexing into the ring
+inline std::int32_t BootstrapNext(std::int32_t r, std::int32_t world) {
+  // rank that follows `r` on the ring, wrapping from the last rank back to 0
+  return (r + world + 1) % world;
+}
+
+inline std::int32_t BootstrapPrev(std::int32_t r, std::int32_t world) {
+  // rank that precedes `r` on the ring, wrapping from 0 back to the last rank
+  return (r + world - 1) % world;
+}
+
+class Channel;
+
+/**
+ * @brief Base communicator storing info about the tracker and other communicators.
+ */
+class Comm {
+ protected:
+  std::int32_t world_{1};  // number of workers; defaults to a single worker
+  std::int32_t rank_{0};   // rank of this worker
+  std::chrono::seconds timeout_{DefaultTimeoutSec()};
+  std::int32_t retry_{DefaultRetry()};
+
+  proto::PeerInfo tracker_;  // host, port and rank used when connecting to the tracker
+  SockDomain domain_{SockDomain::kV4};
+  std::thread error_worker_;
+  std::string task_id_;
+  std::vector<std::shared_ptr<Channel>> channels_;  // looked up by rank in Chan()
+  std::shared_ptr<Loop> loop_{new Loop{std::chrono::seconds{
+      DefaultTimeoutSec()}}};  // fixme: require federated comm to have a timeout
+
+ public:
+  Comm() = default;
+  Comm(std::string const& host, std::int32_t port, std::chrono::seconds timeout, std::int32_t retry,
+       std::string task_id);
+  virtual ~Comm() noexcept(false) {}  // NOLINT
+
+  Comm(Comm const& that) = delete;  // neither copyable nor movable
+  Comm& operator=(Comm const& that) = delete;
+  Comm(Comm&& that) = delete;
+  Comm& operator=(Comm&& that) = delete;
+
+  [[nodiscard]] auto TrackerInfo() const { return tracker_; }  // returns a copy
+  [[nodiscard]] Result ConnectTracker(TCPSocket* out) const;
+  [[nodiscard]] auto Domain() const { return domain_; }
+  [[nodiscard]] auto Timeout() const { return timeout_; }
+
+  [[nodiscard]] auto Rank() const { return rank_; }
+  [[nodiscard]] auto World() const { return world_; }
+  [[nodiscard]] bool IsDistributed() const { return World() > 1; }
+  void Submit(Loop::Op op) const { loop_->Submit(op); }  // hand an operation to the loop
+  [[nodiscard]] Result Block() const { return loop_->Block(); }
+
+  [[nodiscard]] virtual std::shared_ptr<Channel> Chan(std::int32_t rank) const {
+    return channels_.at(rank);  // at() throws for an out-of-range rank
+  }
+  [[nodiscard]] virtual bool IsFederated() const = 0;
+  [[nodiscard]] virtual Result LogTracker(std::string msg) const = 0;
+
+  [[nodiscard]] virtual Result SignalError(Result const&) { return Success(); }  // no-op default
+};
+
+class RabitComm : public Comm {  // non-federated communicator (IsFederated() returns false)
+  [[nodiscard]] Result Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
+                                 std::string task_id);
+  [[nodiscard]] Result Shutdown();
+
+ public:
+  // bootstrapping construction.
+  RabitComm() = default;
+  // ctor for testing where environment is known.
+  RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
+            std::int32_t retry, std::string task_id);
+  ~RabitComm() noexcept(false) override;
+
+  [[nodiscard]] bool IsFederated() const override { return false; }
+  [[nodiscard]] Result LogTracker(std::string msg) const override;
+
+  [[nodiscard]] Result SignalError(Result const&) override;  // replaces the no-op base version
+};
+
+/**
+ * @brief Communication channel between workers.
+ */
+class Channel {
+  std::shared_ptr<TCPSocket> sock_{nullptr};
+  Result rc_;  // NOTE(review): not referenced by any member function of this class
+  Comm const& comm_;  // stored by reference
+
+ public:
+  explicit Channel(Comm const& comm, std::shared_ptr<TCPSocket> sock)
+      : sock_{std::move(sock)}, comm_{comm} {}
+
+  void SendAll(std::int8_t const* ptr, std::size_t n) {  // submits a write op; does not wait
+    Loop::Op op{Loop::Op::kWrite, comm_.Rank(), const_cast<std::int8_t*>(ptr), n, sock_.get(), 0};
+    CHECK(sock_.get());
+    comm_.Submit(std::move(op));
+  }
+  void SendAll(common::Span<std::int8_t const> data) {
+    this->SendAll(data.data(), data.size_bytes());
+  }
+
+  void RecvAll(std::int8_t* ptr, std::size_t n) {  // submits a read op; does not wait
+    Loop::Op op{Loop::Op::kRead, comm_.Rank(), ptr, n, sock_.get(), 0};
+    CHECK(sock_.get());
+    comm_.Submit(std::move(op));
+  }
+  void RecvAll(common::Span<std::int8_t> data) { this->RecvAll(data.data(), data.size_bytes()); }
+
+  [[nodiscard]] auto Socket() const { return sock_; }
+  [[nodiscard]] Result Block() { return comm_.Block(); }  // delegates to the owning Comm
+};
+
+enum class Op { kMax = 0, kMin = 1, kSum = 2, kBitwiseAND = 3, kBitwiseOR = 4, kBitwiseXOR = 5 };
+
+template <typename T, typename U = std::conditional_t<std::is_const_v<T>,
+                                                      std::add_const_t<std::int8_t>, std::int8_t>>
+common::Span<U> EraseType(common::Span<T> data) {
+  // View the same memory as bytes; the constness of T carries over to the byte type U.
+  auto bytes = reinterpret_cast<std::add_pointer_t<U>>(data.data());
+  return common::Span{bytes, data.size_bytes()};
+}
+
+template <typename T, typename U>
+common::Span<T> RestoreType(common::Span<U> data) {
+  // Reinterpret a span of (possibly const) bytes as a span of T.
+  static_assert(std::is_same_v<std::remove_const_t<U>, std::int8_t>);
+  auto typed = reinterpret_cast<T*>(data.data());
+  return common::Span{typed, data.size_bytes() / sizeof(T)};
+}
+}  // namespace xgboost::collective
diff --git a/src/collective/protocol.h b/src/collective/protocol.h
new file mode 100644
index 000000000..96edf4e29
--- /dev/null
+++ b/src/collective/protocol.h
@@ -0,0 +1,214 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#pragma once
+#include <cstdint>  // for int32_t
+#include <string>   // for string
+#include <utility>  // for move
+
+#include "xgboost/collective/result.h"  // for Result
+#include "xgboost/collective/socket.h"  // for TCPSocket
+#include "xgboost/json.h"               // for Json
+
+namespace xgboost::collective::proto {
+struct PeerInfo {
+  std::string host;
+  std::int32_t port{-1};
+  std::int32_t rank{-1};
+
+  PeerInfo() = default;
+  PeerInfo(std::string host, std::int32_t port, std::int32_t rank)
+      : host{std::move(host)}, port{port}, rank{rank} {}
+
+  explicit PeerInfo(Json const& peer)
+      : host{get<String>(peer["host"])},
+        port{static_cast<std::int32_t>(get<Integer const>(peer["port"]))},
+        rank{static_cast<std::int32_t>(get<Integer const>(peer["rank"]))} {}
+
+  [[nodiscard]] Json ToJson() const {
+    Json info{Object{}};
+    info["rank"] = rank;
+    info["host"] = String{host};
+    info["port"] = Integer{port};
+    return info;
+  }
+
+  [[nodiscard]] auto HostPort() const { return host + ":" + std::to_string(this->port); }
+};
+
+struct Magic {
+  static constexpr std::int32_t kMagic = 0xff99;
+
+  [[nodiscard]] Result Verify(xgboost::collective::TCPSocket* p_sock) {
+    std::int32_t magic{kMagic};
+    auto n_bytes = p_sock->SendAll(&magic, sizeof(magic));
+    if (n_bytes != sizeof(magic)) {
+      return Fail("Failed to verify.");
+    }
+
+    magic = 0;
+    n_bytes = p_sock->RecvAll(&magic, sizeof(magic));
+    if (n_bytes != sizeof(magic)) {
+      return Fail("Failed to verify.");
+    }
+    if (magic != kMagic) {
+      return xgboost::collective::Fail("Invalid verification number.");
+    }
+    return Success();
+  }
+};
+
+enum class CMD : std::int32_t {
+  kInvalid = 0,
+  kStart = 1,
+  kShutdown = 2,
+  kError = 3,
+  kPrint = 4,
+};
+
+struct Connect {
+  [[nodiscard]] Result WorkerSend(TCPSocket* tracker, std::int32_t world, std::int32_t rank,
+                                  std::string task_id) const {
+    Json jinit{Object{}};
+    jinit["world_size"] = Integer{world};
+    jinit["rank"] = Integer{rank};
+    jinit["task_id"] = String{task_id};
+    std::string msg;
+    Json::Dump(jinit, &msg);
+    auto n_bytes = tracker->Send(msg);
+    if (n_bytes != msg.size()) {
+      return Fail("Failed to send init command from worker.");
+    }
+    return Success();
+  }
+  [[nodiscard]] Result TrackerRecv(TCPSocket* sock, std::int32_t* world, std::int32_t* rank,
+                                   std::string* task_id) const {
+    std::string init;
+    sock->Recv(&init);
+    auto jinit = Json::Load(StringView{init});
+    *world = get<Integer const>(jinit["world_size"]);
+    *rank = get<Integer const>(jinit["rank"]);
+    *task_id = get<String const>(jinit["task_id"]);
+    return Success();
+  }
+};
+
+class Start {
+ private:
+  [[nodiscard]] Result TrackerSend(std::int32_t world, TCPSocket* worker) const {
+    Json jcmd{Object{}};
+    jcmd["world_size"] = Integer{world};
+    auto scmd = Json::Dump(jcmd);
+    auto n_bytes = worker->Send(scmd);
+    if (n_bytes != scmd.size()) {
+      return Fail("Failed to send init command from tracker.");
+    }
+    return Success();
+  }
+
+ public:
+  [[nodiscard]] Result WorkerSend(std::int32_t lport, TCPSocket* tracker,
+                                  std::int32_t eport) const {
+    Json jcmd{Object{}};
+    jcmd["cmd"] = Integer{static_cast<std::int32_t>(CMD::kStart)};
+    jcmd["port"] = Integer{lport};
+    jcmd["error_port"] = Integer{eport};
+    auto scmd = Json::Dump(jcmd);
+    auto n_bytes = tracker->Send(scmd);
+    if (n_bytes != scmd.size()) {
+      return Fail("Failed to send init command from worker.");
+    }
+    return Success();
+  }
+  [[nodiscard]] Result WorkerRecv(TCPSocket* tracker, std::int32_t* p_world) const {
+    std::string scmd;
+    auto n_bytes = tracker->Recv(&scmd);
+    if (n_bytes <= 0) {
+      return Fail("Failed to recv init command from tracker.");
+    }
+    auto jcmd = Json::Load(scmd);
+    auto world = get<Integer const>(jcmd["world_size"]);
+    if (world <= 0) {
+      return Fail("Invalid world size.");
+    }
+    *p_world = world;
+    return Success();
+  }
+  [[nodiscard]] Result TrackerHandle(Json jcmd, std::int32_t* recv_world, std::int32_t world,
+                                     std::int32_t* p_port, TCPSocket* p_sock,
+                                     std::int32_t* eport) const {
+    *p_port = get<Integer const>(jcmd["port"]);
+    if (*p_port <= 0) {
+      return Fail("Invalid port.");
+    }
+    if (*recv_world != -1) {
+      return Fail("Invalid initialization sequence.");
+    }
+    *recv_world = world;
+    *eport = get<Integer const>(jcmd["error_port"]);
+    return TrackerSend(world, p_sock);
+  }
+};
+
+struct Print {
+  [[nodiscard]] Result WorkerSend(TCPSocket* tracker, std::string msg) const {
+    Json jcmd{Object{}};
+    jcmd["cmd"] = Integer{static_cast<std::int32_t>(CMD::kPrint)};
+    jcmd["msg"] = String{std::move(msg)};
+    auto scmd = Json::Dump(jcmd);
+    auto n_bytes = tracker->Send(scmd);
+    if (n_bytes != scmd.size()) {
+      return Fail("Failed to send print command from worker.");
+    }
+    return Success();
+  }
+  [[nodiscard]] Result TrackerHandle(Json jcmd, std::string* p_msg) const {
+    if (!IsA<String>(jcmd["msg"])) {
+      return Fail("Invalid print command.");
+    }
+    auto msg = get<String const>(jcmd["msg"]);
+    *p_msg = msg;
+    return Success();
+  }
+};
+
+struct ErrorCMD {
+  [[nodiscard]] Result WorkerSend(TCPSocket* tracker, Result const& res) const {
+    auto msg = res.Report();
+    auto code = res.Code().value();
+    Json jcmd{Object{}};
+    jcmd["msg"] = String{std::move(msg)};
+    jcmd["code"] = Integer{code};
+    jcmd["cmd"] = Integer{static_cast<std::int32_t>(CMD::kError)};
+    auto scmd = Json::Dump(jcmd);
+    auto n_bytes = tracker->Send(scmd);
+    if (n_bytes != scmd.size()) {
+      return Fail("Failed to send error command from worker.");
+    }
+    return Success();
+  }
+  [[nodiscard]] Result TrackerHandle(Json jcmd, std::string* p_msg, int* p_code) const {
+    if (!IsA<String>(jcmd["msg"]) || !IsA<Integer>(jcmd["code"])) {
+      return Fail("Invalid error command.");
+    }
+    auto msg = get<String const>(jcmd["msg"]);
+    auto code = get<Integer const>(jcmd["code"]);
+    *p_msg = msg;
+    *p_code = code;
+    return Success();
+  }
+};
+
+struct ShutdownCMD {
+  [[nodiscard]] Result Send(TCPSocket* peer) const {
+    Json jcmd{Object{}};
+    jcmd["cmd"] = Integer{static_cast<std::int32_t>(proto::CMD::kShutdown)};
+    auto scmd = Json::Dump(jcmd);
+    auto n_bytes = peer->Send(scmd);
+    if (n_bytes != scmd.size()) {
+      return Fail("Failed to send shutdown command from worker.");
+    }
+    return Success();
+  }
+};
+}  // namespace xgboost::collective::proto
diff --git a/src/collective/tracker.cc b/src/collective/tracker.cc
index 598b41ddd..043e93359 100644
--- a/src/collective/tracker.cc
+++ b/src/collective/tracker.cc
@@ -15,12 +15,232 @@
 #include <ws2tcpip.h>
 #endif  // defined(_WIN32)
 
-#include <string>  // for string
+#include <algorithm>  // for sort
+#include <chrono>     // for seconds
+#include <cstdint>    // for int32_t
+#include <string>     // for string
+#include <utility>    // for move, forward
 
+#include "../common/json_utils.h"
+#include "comm.h"
+#include "protocol.h"  // for kMagic, PeerInfo
+#include "tracker.h"
 #include "xgboost/collective/result.h"  // for Result, Fail, Success
 #include "xgboost/collective/socket.h"  // for GetHostName, FailWithCode, MakeSockAddress, ...
+#include "xgboost/json.h"
 
 namespace xgboost::collective {
+Tracker::Tracker(Json const& config)
+    : n_workers_{static_cast<std::int32_t>(
+          RequiredArg<Integer const>(config, "n_workers", __func__))},
+      port_{static_cast<std::int32_t>(OptionalArg<Integer const>(config, "port", Integer::Int{0}))},
+      timeout_{std::chrono::seconds{OptionalArg<Integer const>(
+          config, "timeout", static_cast<std::int64_t>(collective::DefaultTimeoutSec()))}} {}
+
+RabitTracker::WorkerProxy::WorkerProxy(std::int32_t world, TCPSocket sock, SockAddrV4 addr)
+    : sock_{std::move(sock)} {
+  auto host = addr.Addr();
+
+  std::int32_t rank{0};
+  rc_ = Success()
+        << [&] { return proto::Magic{}.Verify(&sock_); }
+        << [&] { return proto::Connect{}.TrackerRecv(&sock_, &world_, &rank, &task_id_); };
+  if (!rc_.OK()) {
+    return;
+  }
+
+  std::string cmd;
+  sock_.Recv(&cmd);
+  auto jcmd = Json::Load(StringView{cmd});
+  cmd_ = static_cast<proto::CMD>(get<Integer const>(jcmd["cmd"]));
+  std::int32_t port{0};
+  if (cmd_ == proto::CMD::kStart) {
+    proto::Start start;
+    rc_ = start.TrackerHandle(jcmd, &world_, world, &port, &sock_, &eport_);
+  } else if (cmd_ == proto::CMD::kPrint) {
+    proto::Print print;
+    rc_ = print.TrackerHandle(jcmd, &msg_);
+  } else if (cmd_ == proto::CMD::kError) {
+    proto::ErrorCMD error;
+    rc_ = error.TrackerHandle(jcmd, &msg_, &code_);
+  }
+  if (!rc_.OK()) {
+    return;
+  }
+
+  info_ = proto::PeerInfo{host, port, rank};
+}
+
+RabitTracker::RabitTracker(Json const& config) : Tracker{config} {
+  std::string self;
+  auto rc = collective::GetHostAddress(&self);
+  auto host = OptionalArg<String>(config, "host", self);
+
+  listener_ = TCPSocket::Create(SockDomain::kV4);
+  rc = listener_.Bind(host, &this->port_);
+  CHECK(rc.OK()) << rc.Report();
+  listener_.Listen();
+}
+
+Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
+  auto& workers = *p_workers;
+
+  std::sort(workers.begin(), workers.end(), WorkerCmp{});
+
+  std::vector<std::thread> bootstrap_threads;
+  for (std::int32_t r = 0; r < n_workers_; ++r) {
+    auto& worker = workers[r];
+    auto next = BootstrapNext(r, n_workers_);
+    auto const& next_w = workers[next];
+    bootstrap_threads.emplace_back([next, &worker, &next_w] {
+      auto jnext = proto::PeerInfo{next_w.Host(), next_w.Port(), next}.ToJson();
+      std::string str;
+      Json::Dump(jnext, &str);
+      worker.Send(StringView{str});
+    });
+  }
+
+  for (auto& t : bootstrap_threads) {
+    t.join();
+  }
+
+  for (auto const& w : workers) {
+    worker_error_handles_.emplace_back(w.Host(), w.ErrorPort());
+  }
+  return Success();
+}
+
+[[nodiscard]] std::future<Result> RabitTracker::Run() {
+  // a state machine to keep track of consistency.
+  struct State {
+    std::int32_t const n_workers;
+
+    std::int32_t n_shutdown{0};
+    bool during_restart{false};
+    std::vector<WorkerProxy> pending;
+
+    explicit State(std::int32_t world) : n_workers{world} {}
+    State(State const& that) = delete;
+    State& operator=(State&& that) = delete;
+
+    void Start(WorkerProxy&& worker) {
+      CHECK_LT(pending.size(), n_workers);
+      CHECK_LE(n_shutdown, n_workers);
+
+      pending.emplace_back(std::forward<WorkerProxy>(worker));
+
+      CHECK_LE(pending.size(), n_workers);
+    }
+    void Shutdown() {
+      CHECK_GE(n_shutdown, 0);
+      CHECK_LT(n_shutdown, n_workers);
+
+      ++n_shutdown;
+
+      CHECK_LE(n_shutdown, n_workers);
+    }
+    void Error() {
+      CHECK_LE(pending.size(), n_workers);
+      CHECK_LE(n_shutdown, n_workers);
+
+      during_restart = true;
+    }
+    [[nodiscard]] bool Ready() const {
+      CHECK_LE(pending.size(), n_workers);
+      return static_cast<std::int32_t>(pending.size()) == n_workers;
+    }
+    void Bootstrap() {
+      CHECK_EQ(pending.size(), n_workers);
+      CHECK_LE(n_shutdown, n_workers);
+
+      // A reset.
+      n_shutdown = 0;
+      during_restart = false;
+      pending.clear();
+    }
+    [[nodiscard]] bool ShouldContinue() const {
+      CHECK_LE(pending.size(), n_workers);
+      CHECK_LE(n_shutdown, n_workers);
+      // - Without error, we should shutdown after all workers are offline.
+      // - With error, all workers are offline, and we have during_restart as true.
+      return n_shutdown != n_workers || during_restart;
+    }
+  };
+
+  return std::async(std::launch::async, [this] {
+    State state{this->n_workers_};
+
+    while (state.ShouldContinue()) {
+      TCPSocket sock;
+      SockAddrV4 addr;
+      auto rc = listener_.Accept(&sock, &addr);
+      if (!rc.OK()) {
+        return Fail("Failed to accept connection.", std::move(rc));
+      }
+
+      auto worker = WorkerProxy{n_workers_, std::move(sock), std::move(addr)};
+      if (!worker.Status().OK()) {
+        return Fail("Failed to initialize worker proxy.", std::move(worker.Status()));
+      }
+      switch (worker.Command()) {
+        case proto::CMD::kStart: {
+          state.Start(std::move(worker));
+          if (state.Ready()) {
+            rc = this->Bootstrap(&state.pending);
+            state.Bootstrap();
+          }
+          if (!rc.OK()) {
+            return rc;
+          }
+          continue;
+        }
+        case proto::CMD::kShutdown: {
+          state.Shutdown();
+          continue;
+        }
+        case proto::CMD::kError: {
+          if (state.during_restart) {
+            continue;
+          }
+          state.Error();
+          auto msg = worker.Msg();
+          auto code = worker.Code();
+          LOG(WARNING) << "Recieved error from [" << worker.Host() << ":" << worker.Rank()
+                       << "]: " << msg << " code:" << code;
+          auto host = worker.Host();
+          // We signal all workers for the error, if they haven't aborted already.
+          for (auto& w : worker_error_handles_) {
+            if (w.first == host) {
+              continue;
+            }
+            TCPSocket out;
+            // retry is set to 1, just let the worker timeout or error. Otherwise the
+            // tracker and the worker might be waiting for each other.
+            auto rc = Connect(w.first, w.second, 1, timeout_, &out);
+            // send signal to stop the worker.
+            proto::ShutdownCMD shutdown;
+            rc = shutdown.Send(&out);
+            if (!rc.OK()) {
+              return Fail("Failed to inform workers to stop.");
+            }
+          }
+
+          continue;
+        }
+        case proto::CMD::kPrint: {
+          LOG(CONSOLE) << worker.Msg();
+          continue;
+        }
+        case proto::CMD::kInvalid:
+        default: {
+          return Fail("Invalid command received.");
+        }
+      }
+    }
+    return Success();
+  });
+}
+
 [[nodiscard]] Result GetHostAddress(std::string* out) {
   auto rc = GetHostName(out);
   if (!rc.OK()) {
diff --git a/src/collective/tracker.h b/src/collective/tracker.h
index ec52f6a62..7bbee3c8d 100644
--- a/src/collective/tracker.h
+++ b/src/collective/tracker.h
@@ -2,11 +2,137 @@
  * Copyright 2023, XGBoost Contributors
  */
 #pragma once
-#include <string>  // for string
+#include <chrono>   // for seconds
+#include <cstdint>  // for int32_t
+#include <future>   // for future
+#include <string>   // for string
+#include <utility>  // for pair
+#include <vector>   // for vector
 
+#include "protocol.h"
 #include "xgboost/collective/result.h"  // for Result
+#include "xgboost/collective/socket.h"  // for TCPSocket
+#include "xgboost/json.h"               // for Json
 
 namespace xgboost::collective {
+/**
+ *
+ * @brief Implementation of RABIT tracker.
+ *
+ * * What is a tracker
+ *
+ *   The implementation of collective follows what RABIT did in the past. It requires a
+ *   tracker to coordinate initialization and error recovery of workers. While the
+ *   original implementation attempted to attain error resilience inside the collective
+ *   module, which turned out to be too challenging due to the large amount of external
+ *   states. The new implementation here differs from RABIT in the way that neither state
+ *   recovery nor resilience is handled inside the collective, it merely provides the
+ *   mechanism to signal error to other workers through the use of a centralized tracker.
+ *
+ *   There are three major functionalities provided by a tracker, namely:
+ *   - Initialization. Share the node addresses among all workers.
+ *   - Logging.
+ *   - Signal error. If an exception is thrown in one (or many) of the workers, it can
+ *     signal an error to the tracker and the tracker will notify other workers.
+ */
+class Tracker {
+ protected:
+  std::int32_t n_workers_{0};
+  std::int32_t port_{-1};
+  std::chrono::seconds timeout_{0};
+
+ public:
+  explicit Tracker(Json const& config);
+  Tracker(std::int32_t n_worders, std::int32_t port, std::chrono::seconds timeout)
+      : n_workers_{n_worders}, port_{port}, timeout_{timeout} {}
+
+  virtual ~Tracker() noexcept(false){};  // NOLINT
+  [[nodiscard]] virtual std::future<Result> Run() = 0;
+  [[nodiscard]] virtual Json WorkerArgs() const = 0;
+  [[nodiscard]] std::chrono::seconds Timeout() const { return timeout_; }
+};
+
+class RabitTracker : public Tracker {
+  // a wrapper for connected worker socket.
+  class WorkerProxy {
+    TCPSocket sock_;
+    proto::PeerInfo info_;
+    std::int32_t eport_{0};
+    std::int32_t world_{-1};
+    std::string task_id_;
+
+    proto::CMD cmd_{proto::CMD::kInvalid};
+    std::string msg_;
+    std::int32_t code_{0};
+    Result rc_;
+
+   public:
+    explicit WorkerProxy(std::int32_t world, TCPSocket sock, SockAddrV4 addr);
+    WorkerProxy(WorkerProxy const& that) = delete;
+    WorkerProxy(WorkerProxy&& that) = default;
+    WorkerProxy& operator=(WorkerProxy const&) = delete;
+    WorkerProxy& operator=(WorkerProxy&&) = default;
+
+    [[nodiscard]] auto Host() const { return info_.host; }
+    [[nodiscard]] auto TaskID() const { return task_id_; }
+    [[nodiscard]] auto Port() const { return info_.port; }
+    [[nodiscard]] auto Rank() const { return info_.rank; }
+    [[nodiscard]] auto ErrorPort() const { return eport_; }
+    [[nodiscard]] auto Command() const { return cmd_; }
+    [[nodiscard]] auto Msg() const { return msg_; }
+    [[nodiscard]] auto Code() const { return code_; }
+
+    [[nodiscard]] Result const& Status() const { return rc_; }
+    [[nodiscard]] Result& Status() { return rc_; }
+
+    void Send(StringView value) { this->sock_.Send(value); }
+  };
+  // provide an ordering for workers, this helps us get deterministic topology.
+  struct WorkerCmp {
+    [[nodiscard]] bool operator()(WorkerProxy const& lhs, WorkerProxy const& rhs) {
+      auto const& lh = lhs.Host();
+      auto const& rh = rhs.Host();
+
+      if (lh != rh) {
+        return lh < rh;
+      }
+      return lhs.TaskID() < rhs.TaskID();
+    }
+  };
+
+ private:
+  std::string host_;
+  // record for how to reach out to workers if error happens.
+  std::vector<std::pair<std::string, std::int32_t>> worker_error_handles_;
+  // listening socket for incoming workers.
+  TCPSocket listener_;
+
+  Result Bootstrap(std::vector<WorkerProxy>* p_workers);
+
+ public:
+  explicit RabitTracker(StringView host, std::int32_t n_worders, std::int32_t port,
+                        std::chrono::seconds timeout)
+      : Tracker{n_worders, port, timeout}, host_{host.c_str(), host.size()} {
+    listener_ = TCPSocket::Create(SockDomain::kV4);
+    auto rc = listener_.Bind(host, &this->port_);
+    CHECK(rc.OK()) << rc.Report();
+    listener_.Listen();
+  }
+
+  explicit RabitTracker(Json const& config);
+  ~RabitTracker() noexcept(false) override = default;
+
+  std::future<Result> Run() override;
+
+  [[nodiscard]] std::int32_t Port() const { return port_; }
+  [[nodiscard]] Json WorkerArgs() const override {
+    Json args{Object{}};
+    args["DMLC_TRACKER_URI"] = String{host_};
+    args["DMLC_TRACKER_PORT"] = this->Port();
+    return args;
+  }
+};
+
 // Prob the public IP address of the host, need a better method.
 //
 // This is directly translated from the previous Python implementation, we should find a
diff --git a/tests/cpp/collective/test_comm.cc b/tests/cpp/collective/test_comm.cc
new file mode 100644
index 000000000..7792c4c25
--- /dev/null
+++ b/tests/cpp/collective/test_comm.cc
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+
+#include "../../../src/collective/comm.h"
+#include "test_worker.h"
+namespace xgboost::collective {
+namespace {
+class CommTest : public TrackerTest {};
+}  // namespace
+
+TEST_F(CommTest, Channel) {
+  auto n_workers = 4;
+  RabitTracker tracker{host, n_workers, 0, timeout};
+  auto fut = tracker.Run();
+
+  std::vector<std::thread> workers;
+  std::int32_t port = tracker.Port();
+
+  for (std::int32_t i = 0; i < n_workers; ++i) {
+    workers.emplace_back([=] {
+      WorkerForTest worker{host, port, timeout, n_workers, i};
+      if (i % 2 == 0) {
+        auto p_chan = worker.Comm().Chan(i + 1);
+        p_chan->SendAll(
+            EraseType(common::Span<std::int32_t const>{&i, static_cast<std::size_t>(1)}));
+        auto rc = p_chan->Block();
+        ASSERT_TRUE(rc.OK()) << rc.Report();
+      } else {
+        auto p_chan = worker.Comm().Chan(i - 1);
+        std::int32_t r{-1};
+        p_chan->RecvAll(EraseType(common::Span<std::int32_t>{&r, static_cast<std::size_t>(1)}));
+        auto rc = p_chan->Block();
+        ASSERT_TRUE(rc.OK()) << rc.Report();
+        ASSERT_EQ(r, i - 1);
+      }
+    });
+  }
+
+  for (auto &w : workers) {
+    w.join();
+  }
+
+  ASSERT_TRUE(fut.get().OK());
+}
+}  // namespace xgboost::collective
diff --git a/tests/cpp/collective/test_socket.cc b/tests/cpp/collective/test_socket.cc
index 7802acda8..ced795fef 100644
--- a/tests/cpp/collective/test_socket.cc
+++ b/tests/cpp/collective/test_socket.cc
@@ -7,7 +7,7 @@
 #include <cerrno>        // EADDRNOTAVAIL
 #include <system_error>  // std::error_code, std::system_category
 
-#include "net_test.h"  // for SocketTest
+#include "test_worker.h"  // for SocketTest
 
 namespace xgboost::collective {
 TEST_F(SocketTest, Basic) {
diff --git a/tests/cpp/collective/test_tracker.cc b/tests/cpp/collective/test_tracker.cc
index 0e60cfb68..8fc5f0b3f 100644
--- a/tests/cpp/collective/test_tracker.cc
+++ b/tests/cpp/collective/test_tracker.cc
@@ -1,18 +1,67 @@
 /**
  * Copyright 2023, XGBoost Contributors
  */
-#include "../../../src/collective/tracker.h"  // for GetHostAddress
-#include "net_test.h"                         // for SocketTest
+#include <gtest/gtest.h>
+
+#include <chrono>   // for seconds
+#include <cstdint>  // for int32_t
+#include <string>   // for string
+#include <thread>   // for thread
+#include <vector>   // for vector
+
+#include "../../../src/collective/comm.h"
+#include "test_worker.h"
 
 namespace xgboost::collective {
 namespace {
-class TrackerTest : public SocketTest {};
+class PrintWorker : public WorkerForTest {
+ public:
+  using WorkerForTest::WorkerForTest;
+
+  void Print() {
+    auto rc = comm_.LogTracker("ack:" + std::to_string(this->comm_.Rank()));
+    ASSERT_TRUE(rc.OK()) << rc.Report();
+  }
+};
 }  // namespace
 
-TEST_F(TrackerTest, GetHostAddress) {
-  std::string host;
-  auto rc = GetHostAddress(&host);
-  ASSERT_TRUE(rc.OK());
-  ASSERT_TRUE(host.find("127.") == std::string::npos);
+TEST_F(TrackerTest, Bootstrap) {
+  RabitTracker tracker{host, n_workers, 0, timeout};
+  auto fut = tracker.Run();
+
+  std::vector<std::thread> workers;
+  std::int32_t port = tracker.Port();
+
+  for (std::int32_t i = 0; i < n_workers; ++i) {
+    workers.emplace_back([=] { WorkerForTest worker{host, port, timeout, n_workers, i}; });
+  }
+  for (auto &w : workers) {
+    w.join();
+  }
+
+  ASSERT_TRUE(fut.get().OK());
 }
+
+TEST_F(TrackerTest, Print) {
+  RabitTracker tracker{host, n_workers, 0, timeout};
+  auto fut = tracker.Run();
+
+  std::vector<std::thread> workers;
+  std::int32_t port = tracker.Port();
+
+  for (std::int32_t i = 0; i < n_workers; ++i) {
+    workers.emplace_back([=] {
+      PrintWorker worker{host, port, timeout, n_workers, i};
+      worker.Print();
+    });
+  }
+
+  for (auto &w : workers) {
+    w.join();
+  }
+
+  ASSERT_TRUE(fut.get().OK());
+}
+
+TEST_F(TrackerTest, GetHostAddress) { ASSERT_TRUE(host.find("127.") == std::string::npos); }
 }  // namespace xgboost::collective
diff --git a/tests/cpp/collective/test_worker.h b/tests/cpp/collective/test_worker.h
new file mode 100644
index 000000000..3c9d02f03
--- /dev/null
+++ b/tests/cpp/collective/test_worker.h
@@ -0,0 +1,91 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+
+#include <chrono>   // for seconds
+#include <cstdint>  // for int32_t
+#include <string>   // for string
+#include <thread>   // for thread
+#include <utility>  // for move
+#include <vector>   // for vector
+
+#include "../../../src/collective/comm.h"
+#include "../../../src/collective/tracker.h"  // for GetHostAddress
+#include "../helpers.h"                       // for FileExists
+
+namespace xgboost::collective {
+class WorkerForTest {
+  std::string tracker_host_;
+  std::int32_t tracker_port_;
+  std::int32_t world_size_;
+
+ protected:
+  std::int32_t retry_{1};
+  std::string task_id_;
+  RabitComm comm_;
+
+ public:
+  WorkerForTest(std::string host, std::int32_t port, std::chrono::seconds timeout,
+                std::int32_t world, std::int32_t rank)
+      : tracker_host_{std::move(host)},
+        tracker_port_{port},
+        world_size_{world},
+        task_id_{"t:" + std::to_string(rank)},
+        comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_} {
+    CHECK_EQ(world_size_, comm_.World());
+  }
+  virtual ~WorkerForTest() = default;
+  auto& Comm() { return comm_; }
+
+  void LimitSockBuf(std::int32_t n_bytes) {
+    for (std::int32_t i = 0; i < comm_.World(); ++i) {
+      if (i != comm_.Rank()) {
+        ASSERT_TRUE(comm_.Chan(i)->Socket()->NonBlocking());
+        ASSERT_TRUE(comm_.Chan(i)->Socket()->SetBufSize(n_bytes).OK());
+      }
+    }
+  }
+};
+
+class SocketTest : public ::testing::Test {
+ protected:
+  std::string skip_msg_{"Skipping IPv6 test"};
+
+  bool SkipTest() {
+    std::string path{"/sys/module/ipv6/parameters/disable"};
+    if (FileExists(path)) {
+      std::ifstream fin(path);
+      if (!fin) {
+        return true;
+      }
+      std::string s_value;
+      fin >> s_value;
+      auto value = std::stoi(s_value);
+      if (value != 0) {
+        return true;
+      }
+    } else {
+      return true;
+    }
+    return false;
+  }
+
+ protected:
+  void SetUp() override { system::SocketStartup(); }
+  void TearDown() override { system::SocketFinalize(); }
+};
+
+class TrackerTest : public SocketTest {
+ public:
+  std::int32_t n_workers{2};
+  std::chrono::seconds timeout{1};
+  std::string host;
+
+  void SetUp() override {
+    SocketTest::SetUp();
+    auto rc = GetHostAddress(&host);
+    ASSERT_TRUE(rc.OK()) << rc.Report();
+  }
+};
+}  // namespace xgboost::collective

From d1dee4ad99788afa039cadaba3f28e4d17fba52c Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 11 Oct 2023 23:26:09 -0700
Subject: [PATCH 44/59] [CI] Update RAPIDS to latest stable (#9654)

* [CI] Update RAPIDS to latest stable

* Remove slashes from Docker tag

---------

Co-authored-by: hcho3 <hcho3@users.noreply.github.com>
Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
---
 tests/buildkite/conftest.sh |  2 +-
 tests/ci_build/ci_build.sh  | 17 +++++++++--------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh
index 9e821f0fe..3d820d727 100755
--- a/tests/buildkite/conftest.sh
+++ b/tests/buildkite/conftest.sh
@@ -24,7 +24,7 @@ set -x
 
 CUDA_VERSION=11.8.0
 NCCL_VERSION=2.16.5-1
-RAPIDS_VERSION=23.08
+RAPIDS_VERSION=23.10
 SPARK_VERSION=3.4.0
 JDK_VERSION=8
 
diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh
index a8567a89a..ef0c69183 100755
--- a/tests/ci_build/ci_build.sh
+++ b/tests/ci_build/ci_build.sh
@@ -148,10 +148,11 @@ then
     $(aws ecr get-login --no-include-email --region ${DOCKER_CACHE_ECR_REGION} --registry-ids ${DOCKER_CACHE_ECR_ID})
     # Pull pre-build container from Docker build cache,
     # if one exists for the particular branch or pull request
-    echo "docker pull --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
-    if time docker pull --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
+    DOCKER_TAG="${BRANCH_NAME//\//-}"  # Slashes are not allowed in Docker tags
+    echo "docker pull --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
+    if time docker pull --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
     then
-      CACHE_FROM_CMD="--cache-from ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
+      CACHE_FROM_CMD="--cache-from ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
     else
       # If the build cache is empty of the particular branch or pull request,
       # use the build cache associated with the master branch
@@ -185,8 +186,8 @@ if [[ -n "${DOCKER_CACHE_REPO}" ]]
 then
     # Push the container we just built to the Docker build cache
     # that is associated with the particular branch or pull request
-    echo "docker tag ${DOCKER_IMG_NAME} ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
-    docker tag "${DOCKER_IMG_NAME}" "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
+    echo "docker tag ${DOCKER_IMG_NAME} ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
+    docker tag "${DOCKER_IMG_NAME}" "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
 
     # Attempt to create Docker repository; it will fail if the repository already exists
     echo "aws ecr create-repository --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION}"
@@ -214,10 +215,10 @@ then
 EOF
     fi
 
-    echo "docker push --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
-    docker push --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
+    echo "docker push --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
+    docker push --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
     if [[ $? != "0" ]]; then
-        echo "ERROR: could not update Docker cache ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
+        echo "ERROR: could not update Docker cache ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
         exit 1
     fi
 fi

From e164d51c439806c232afcc03c335fa963936ec93 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Thu, 12 Oct 2023 08:31:43 -0700
Subject: [PATCH 45/59] Improve allgather functions (#9649)

---
 plugin/federated/federated.proto              |  12 ++
 plugin/federated/federated_client.h           |  23 +++-
 plugin/federated/federated_communicator.h     |  19 ++--
 plugin/federated/federated_server.cc          |  13 ++-
 plugin/federated/federated_server.h           |  11 +-
 src/c_api/c_api.cc                            |   4 +-
 src/collective/communicator-inl.h             | 106 ++++++++++++++----
 src/collective/communicator.h                 |  14 ++-
 .../device_communicator_adapter.cuh           |  10 +-
 src/collective/in_memory_communicator.h       |  13 ++-
 src/collective/in_memory_handler.cc           |  70 ++++++++----
 src/collective/in_memory_handler.h            |  41 ++++---
 src/collective/noop_communicator.h            |   5 +-
 src/collective/rabit_communicator.h           |  24 +++-
 src/data/simple_dmatrix.cc                    |   6 +-
 src/tree/hist/evaluate_splits.h               |  23 ++--
 .../collective/test_in_memory_communicator.cc |  20 +++-
 tests/cpp/plugin/helpers.h                    |   2 +-
 .../cpp/plugin/test_federated_communicator.cc |  28 ++++-
 tests/cpp/plugin/test_federated_server.cc     |  24 +++-
 20 files changed, 346 insertions(+), 122 deletions(-)

diff --git a/plugin/federated/federated.proto b/plugin/federated/federated.proto
index d8ef5bd92..8450659fd 100644
--- a/plugin/federated/federated.proto
+++ b/plugin/federated/federated.proto
@@ -7,6 +7,7 @@ package xgboost.federated;
 
 service Federated {
   rpc Allgather(AllgatherRequest) returns (AllgatherReply) {}
+  rpc AllgatherV(AllgatherVRequest) returns (AllgatherVReply) {}
   rpc Allreduce(AllreduceRequest) returns (AllreduceReply) {}
   rpc Broadcast(BroadcastRequest) returns (BroadcastReply) {}
 }
@@ -42,6 +43,17 @@ message AllgatherReply {
   bytes receive_buffer = 1;
 }
 
+message AllgatherVRequest {
+  // An incrementing counter that is unique to each round of operations.
+  uint64 sequence_number = 1;
+  int32 rank = 2;
+  bytes send_buffer = 3;
+}
+
+message AllgatherVReply {
+  bytes receive_buffer = 1;
+}
+
 message AllreduceRequest {
   // An incrementing counter that is unique to each round to operations.
   uint64 sequence_number = 1;
diff --git a/plugin/federated/federated_client.h b/plugin/federated/federated_client.h
index d104cb231..ac1fbd57d 100644
--- a/plugin/federated/federated_client.h
+++ b/plugin/federated/federated_client.h
@@ -44,11 +44,11 @@ class FederatedClient {
         }()},
         rank_{rank} {}
 
-  std::string Allgather(std::string const &send_buffer) {
+  std::string Allgather(std::string_view send_buffer) {
     AllgatherRequest request;
     request.set_sequence_number(sequence_number_++);
     request.set_rank(rank_);
-    request.set_send_buffer(send_buffer);
+    request.set_send_buffer(send_buffer.data(), send_buffer.size());
 
     AllgatherReply reply;
     grpc::ClientContext context;
@@ -63,6 +63,25 @@ class FederatedClient {
     }
   }
 
+  std::string AllgatherV(std::string_view send_buffer) {
+    AllgatherVRequest request;
+    request.set_sequence_number(sequence_number_++);
+    request.set_rank(rank_);
+    request.set_send_buffer(send_buffer.data(), send_buffer.size());
+
+    AllgatherVReply reply;
+    grpc::ClientContext context;
+    context.set_wait_for_ready(true);
+    grpc::Status status = stub_->AllgatherV(&context, request, &reply);
+
+    if (status.ok()) {
+      return reply.receive_buffer();
+    } else {
+      std::cout << status.error_code() << ": " << status.error_message() << '\n';
+      throw std::runtime_error("AllgatherV RPC failed");
+    }
+  }
+
   std::string Allreduce(std::string const &send_buffer, DataType data_type,
                         ReduceOperation reduce_operation) {
     AllreduceRequest request;
diff --git a/plugin/federated/federated_communicator.h b/plugin/federated/federated_communicator.h
index 996b433cb..46c6b0fda 100644
--- a/plugin/federated/federated_communicator.h
+++ b/plugin/federated/federated_communicator.h
@@ -125,14 +125,19 @@ class FederatedCommunicator : public Communicator {
   [[nodiscard]] bool IsFederated() const override { return true; }
 
   /**
-   * \brief Perform in-place allgather.
-   * \param send_receive_buffer Buffer for both sending and receiving data.
-   * \param size Number of bytes to be gathered.
+   * \brief Perform allgather.
+   * \param input Buffer for sending data.
    */
-  void AllGather(void *send_receive_buffer, std::size_t size) override {
-    std::string const send_buffer(reinterpret_cast<char const *>(send_receive_buffer), size);
-    auto const received = client_->Allgather(send_buffer);
-    received.copy(reinterpret_cast<char *>(send_receive_buffer), size);
+  std::string AllGather(std::string_view input) override {
+    return client_->Allgather(input);
+  }
+
+  /**
+   * \brief Perform variable-length allgather.
+   * \param input Buffer for sending data.
+   */
+  std::string AllGatherV(std::string_view input) override {
+    return client_->AllgatherV(input);
   }
 
   /**
diff --git a/plugin/federated/federated_server.cc b/plugin/federated/federated_server.cc
index ae42f6d28..ad6cf6022 100644
--- a/plugin/federated/federated_server.cc
+++ b/plugin/federated/federated_server.cc
@@ -19,6 +19,13 @@ grpc::Status FederatedService::Allgather(grpc::ServerContext*, AllgatherRequest
   return grpc::Status::OK;
 }
 
+grpc::Status FederatedService::AllgatherV(grpc::ServerContext*, AllgatherVRequest const* request,
+                                          AllgatherVReply* reply) {
+  handler_.AllgatherV(request->send_buffer().data(), request->send_buffer().size(),
+                      reply->mutable_receive_buffer(), request->sequence_number(), request->rank());
+  return grpc::Status::OK;
+}
+
 grpc::Status FederatedService::Allreduce(grpc::ServerContext*, AllreduceRequest const* request,
                                          AllreduceReply* reply) {
   handler_.Allreduce(request->send_buffer().data(), request->send_buffer().size(),
@@ -36,8 +43,8 @@ grpc::Status FederatedService::Broadcast(grpc::ServerContext*, BroadcastRequest
   return grpc::Status::OK;
 }
 
-void RunServer(int port, int world_size, char const* server_key_file, char const* server_cert_file,
-               char const* client_cert_file) {
+void RunServer(int port, std::size_t world_size, char const* server_key_file,
+               char const* server_cert_file, char const* client_cert_file) {
   std::string const server_address = "0.0.0.0:" + std::to_string(port);
   FederatedService service{world_size};
 
@@ -59,7 +66,7 @@ void RunServer(int port, int world_size, char const* server_key_file, char const
   server->Wait();
 }
 
-void RunInsecureServer(int port, int world_size) {
+void RunInsecureServer(int port, std::size_t world_size) {
   std::string const server_address = "0.0.0.0:" + std::to_string(port);
   FederatedService service{world_size};
 
diff --git a/plugin/federated/federated_server.h b/plugin/federated/federated_server.h
index 7738248ea..711ef5588 100644
--- a/plugin/federated/federated_server.h
+++ b/plugin/federated/federated_server.h
@@ -12,11 +12,14 @@ namespace federated {
 
 class FederatedService final : public Federated::Service {
  public:
-  explicit FederatedService(int const world_size) : handler_{world_size} {}
+  explicit FederatedService(std::size_t const world_size) : handler_{world_size} {}
 
   grpc::Status Allgather(grpc::ServerContext* context, AllgatherRequest const* request,
                          AllgatherReply* reply) override;
 
+  grpc::Status AllgatherV(grpc::ServerContext* context, AllgatherVRequest const* request,
+                          AllgatherVReply* reply) override;
+
   grpc::Status Allreduce(grpc::ServerContext* context, AllreduceRequest const* request,
                          AllreduceReply* reply) override;
 
@@ -27,10 +30,10 @@ class FederatedService final : public Federated::Service {
   xgboost::collective::InMemoryHandler handler_;
 };
 
-void RunServer(int port, int world_size, char const* server_key_file, char const* server_cert_file,
-               char const* client_cert_file);
+void RunServer(int port, std::size_t world_size, char const* server_key_file,
+               char const* server_cert_file, char const* client_cert_file);
 
-void RunInsecureServer(int port, int world_size);
+void RunInsecureServer(int port, std::size_t world_size);
 
 }  // namespace federated
 }  // namespace xgboost
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 858047af8..4fb6d90ff 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1724,7 +1724,7 @@ XGB_DLL int XGCommunicatorAllreduce(void *send_receive_buffer, size_t count, int
 }
 
 #if defined(XGBOOST_USE_FEDERATED)
-XGB_DLL int XGBRunFederatedServer(int port, int world_size, char const *server_key_path,
+XGB_DLL int XGBRunFederatedServer(int port, std::size_t world_size, char const *server_key_path,
                                   char const *server_cert_path, char const *client_cert_path) {
   API_BEGIN();
   federated::RunServer(port, world_size, server_key_path, server_cert_path, client_cert_path);
@@ -1732,7 +1732,7 @@ XGB_DLL int XGBRunFederatedServer(int port, int world_size, char const *server_k
 }
 
 // Run a server without SSL for local testing.
-XGB_DLL int XGBRunInsecureFederatedServer(int port, int world_size) {
+XGB_DLL int XGBRunInsecureFederatedServer(int port, std::size_t world_size) {
   API_BEGIN();
   federated::RunInsecureServer(port, world_size);
   API_END();
diff --git a/src/collective/communicator-inl.h b/src/collective/communicator-inl.h
index 59cc4cc45..c58a9f3bc 100644
--- a/src/collective/communicator-inl.h
+++ b/src/collective/communicator-inl.h
@@ -57,9 +57,7 @@ namespace collective {
  *   - federated_client_key: Client key file path. Only needed for the SSL mode.
  *   - federated_client_cert: Client certificate file path. Only needed for the SSL mode.
  */
-inline void Init(Json const& config) {
-  Communicator::Init(config);
-}
+inline void Init(Json const &config) { Communicator::Init(config); }
 
 /*!
  * \brief Finalize the collective communicator.
@@ -141,17 +139,89 @@ inline void Broadcast(std::string *sendrecv_data, int root) {
   }
 }
 
+/**
+ * @brief Gathers a single value from all processes and distributes the result to all processes.
+ *
+ * @param input The single value.
+ */
+template <typename T>
+inline std::vector<T> Allgather(T const &input) {
+  std::string_view str_input{reinterpret_cast<char const *>(&input), sizeof(T)};
+  auto const output = Communicator::Get()->AllGather(str_input);
+  CHECK_EQ(output.size() % sizeof(T), 0);
+  std::vector<T> result(output.size() / sizeof(T));
+  std::memcpy(reinterpret_cast<void *>(result.data()), output.data(), output.size());
+  return result;
+}
+
 /**
  * @brief Gathers data from all processes and distributes it to all processes.
  *
- * This assumes all ranks have the same size, and input data has been sliced into the
- * corresponding position.
+ * This assumes all ranks have the same size.
  *
- * @param send_receive_buffer Buffer storing the data.
- * @param size                Size of the data in bytes.
+ * @param input Buffer storing the data.
  */
-inline void Allgather(void *send_receive_buffer, std::size_t size) {
-  Communicator::Get()->AllGather(send_receive_buffer, size);
+template <typename T>
+inline std::vector<T> Allgather(std::vector<T> const &input) {
+  if (input.empty()) {
+    return input;
+  }
+  std::string_view str_input{reinterpret_cast<char const *>(input.data()),
+                             input.size() * sizeof(T)};
+  auto const output = Communicator::Get()->AllGather(str_input);
+  CHECK_EQ(output.size() % sizeof(T), 0);
+  std::vector<T> result(output.size() / sizeof(T));
+  std::memcpy(reinterpret_cast<void *>(result.data()), output.data(), output.size());
+  return result;
+}
+
+/**
+ * @brief Gathers variable-length data from all processes and distributes it to all processes.
+ * @param input Buffer storing the data.
+ */
+template <typename T>
+inline std::vector<T> AllgatherV(std::vector<T> const &input) {
+  std::string_view str_input{reinterpret_cast<char const *>(input.data()),
+                             input.size() * sizeof(T)};
+  auto const output = Communicator::Get()->AllGatherV(str_input);
+  CHECK_EQ(output.size() % sizeof(T), 0);
+  std::vector<T> result(output.size() / sizeof(T));
+  if (!output.empty()) {
+    std::memcpy(reinterpret_cast<void *>(result.data()), output.data(), output.size());
+  }
+  return result;
+}
+
+/**
+ * @brief Gathers variable-length strings from all processes and distributes them to all processes.
+ * @param input Variable-length list of variable-length strings.
+ */
+inline std::vector<std::string> AllgatherStrings(std::vector<std::string> const &input) {
+  std::size_t total_size{0};
+  for (auto const &s : input) {
+    total_size += s.length() + 1;  // +1 for null-terminators
+  }
+  std::string flat_string;
+  flat_string.reserve(total_size);
+  for (auto const &s : input) {
+    flat_string.append(s);
+    flat_string.push_back('\0');  // Append a null-terminator after each string
+  }
+
+  auto const output = Communicator::Get()->AllGatherV(flat_string);
+
+  std::vector<std::string> result;
+  std::size_t start_index = 0;
+  // Iterate through the output, find each null-terminated substring.
+  for (std::size_t i = 0; i < output.size(); i++) {
+    if (output[i] == '\0') {
+      // Construct a std::string from the char* substring
+      result.emplace_back(&output[start_index]);
+      // Move to the next substring
+      start_index = i + 1;
+    }
+  }
+  return result;
 }
 
 /*!
@@ -226,7 +296,7 @@ inline void Allreduce(double *send_receive_buffer, size_t count) {
 }
 
 template <typename T>
-struct AllgatherVResult {
+struct SpecialAllgatherVResult {
   std::vector<std::size_t> offsets;
   std::vector<std::size_t> sizes;
   std::vector<T> result;
@@ -241,14 +311,10 @@ struct AllgatherVResult {
  * @param sizes  Sizes of each input.
  */
 template <typename T>
-inline AllgatherVResult<T> AllgatherV(std::vector<T> const &inputs,
-                                      std::vector<std::size_t> const &sizes) {
-  auto num_inputs = sizes.size();
-
+inline SpecialAllgatherVResult<T> SpecialAllgatherV(std::vector<T> const &inputs,
+                                                    std::vector<std::size_t> const &sizes) {
   // Gather the sizes across all workers.
-  std::vector<std::size_t> all_sizes(num_inputs * GetWorldSize());
-  std::copy_n(sizes.cbegin(), sizes.size(), all_sizes.begin() + num_inputs * GetRank());
-  collective::Allgather(all_sizes.data(), all_sizes.size() * sizeof(std::size_t));
+  auto const all_sizes = Allgather(sizes);
 
   // Calculate input offsets (std::exclusive_scan).
   std::vector<std::size_t> offsets(all_sizes.size());
@@ -257,11 +323,7 @@ inline AllgatherVResult<T> AllgatherV(std::vector<T> const &inputs,
   }
 
   // Gather all the inputs.
-  auto total_input_size = offsets.back() + all_sizes.back();
-  std::vector<T> all_inputs(total_input_size);
-  std::copy_n(inputs.cbegin(), inputs.size(), all_inputs.begin() + offsets[num_inputs * GetRank()]);
-  // We cannot use allgather here, since each worker might have a different size.
-  Allreduce<Operation::kMax>(all_inputs.data(), all_inputs.size());
+  auto const all_inputs = AllgatherV(inputs);
 
   return {offsets, all_sizes, all_inputs};
 }
diff --git a/src/collective/communicator.h b/src/collective/communicator.h
index def961513..feb446355 100644
--- a/src/collective/communicator.h
+++ b/src/collective/communicator.h
@@ -125,13 +125,17 @@ class Communicator {
   /**
    * @brief Gathers data from all processes and distributes it to all processes.
    *
-   * This assumes all ranks have the same size, and input data has been sliced into the
-   * corresponding position.
+   * This assumes all ranks have the same size.
    *
-   * @param send_receive_buffer Buffer storing the data.
-   * @param size                Size of the data in bytes.
+   * @param input Buffer storing the data.
    */
-  virtual void AllGather(void *send_receive_buffer, std::size_t size) = 0;
+  virtual std::string AllGather(std::string_view input) = 0;
+
+  /**
+   * @brief Gathers variable-length data from all processes and distributes it to all processes.
+   * @param input Buffer storing the data.
+   */
+  virtual std::string AllGatherV(std::string_view input) = 0;
 
   /**
    * @brief Combines values from all processes and distributes the result back to all processes.
diff --git a/src/collective/device_communicator_adapter.cuh b/src/collective/device_communicator_adapter.cuh
index d10b10486..7d3e836a0 100644
--- a/src/collective/device_communicator_adapter.cuh
+++ b/src/collective/device_communicator_adapter.cuh
@@ -40,12 +40,10 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
     }
 
     dh::safe_cuda(cudaSetDevice(device_ordinal_));
-    host_buffer_.resize(send_size * world_size_);
-    dh::safe_cuda(cudaMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
-                             cudaMemcpyDefault));
-    Allgather(host_buffer_.data(), host_buffer_.size());
-    dh::safe_cuda(
-        cudaMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), cudaMemcpyDefault));
+    host_buffer_.resize(send_size);
+    dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_buffer, send_size, cudaMemcpyDefault));
+    auto const output = Allgather(host_buffer_);
+    dh::safe_cuda(cudaMemcpy(receive_buffer, output.data(), output.size(), cudaMemcpyDefault));
   }
 
   void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
diff --git a/src/collective/in_memory_communicator.h b/src/collective/in_memory_communicator.h
index f41029af1..c712d32a8 100644
--- a/src/collective/in_memory_communicator.h
+++ b/src/collective/in_memory_communicator.h
@@ -60,11 +60,16 @@ class InMemoryCommunicator : public Communicator {
   bool IsDistributed() const override { return true; }
   bool IsFederated() const override { return false; }
 
-  void AllGather(void* in_out, std::size_t size) override {
+  std::string AllGather(std::string_view input) override {
     std::string output;
-    handler_.Allgather(static_cast<const char*>(in_out), size, &output, sequence_number_++,
-                       GetRank());
-    output.copy(static_cast<char*>(in_out), size);
+    handler_.Allgather(input.data(), input.size(), &output, sequence_number_++, GetRank());
+    return output;
+  }
+
+  std::string AllGatherV(std::string_view input) override {
+    std::string output;
+    handler_.AllgatherV(input.data(), input.size(), &output, sequence_number_++, GetRank());
+    return output;
   }
 
   void AllReduce(void* in_out, std::size_t size, DataType data_type, Operation operation) override {
diff --git a/src/collective/in_memory_handler.cc b/src/collective/in_memory_handler.cc
index a45fe3e7d..944e5077b 100644
--- a/src/collective/in_memory_handler.cc
+++ b/src/collective/in_memory_handler.cc
@@ -16,23 +16,49 @@ class AllgatherFunctor {
  public:
   std::string const name{"Allgather"};
 
-  AllgatherFunctor(int world_size, int rank) : world_size_{world_size}, rank_{rank} {}
+  AllgatherFunctor(std::size_t world_size, std::size_t rank)
+      : world_size_{world_size}, rank_{rank} {}
 
   void operator()(char const* input, std::size_t bytes, std::string* buffer) const {
     if (buffer->empty()) {
-      // Copy the input if this is the first request.
-      buffer->assign(input, bytes);
-    } else {
-      // Splice the input into the common buffer.
-      auto const per_rank = bytes / world_size_;
-      auto const index = rank_ * per_rank;
-      buffer->replace(index, per_rank, input + index, per_rank);
+      // Resize the buffer if this is the first request.
+      buffer->resize(bytes * world_size_);
+    }
+
+    // Splice the input into the common buffer.
+    buffer->replace(rank_ * bytes, bytes, input, bytes);
+  }
+
+ private:
+  std::size_t world_size_;
+  std::size_t rank_;
+};
+
+/**
+ * @brief Functor for variable-length allgather.
+ */
+class AllgatherVFunctor {
+ public:
+  std::string const name{"AllgatherV"};
+
+  AllgatherVFunctor(std::size_t world_size, std::size_t rank,
+                    std::map<std::size_t, std::string_view>* data)
+      : world_size_{world_size}, rank_{rank}, data_{data} {}
+
+  void operator()(char const* input, std::size_t bytes, std::string* buffer) const {
+    data_->emplace(rank_, std::string_view{input, bytes});
+    if (data_->size() == world_size_) {
+      for (auto const& kv : *data_) {
+        buffer->append(kv.second);
+      }
+      data_->clear();
     }
   }
 
  private:
-  int world_size_;
-  int rank_;
+  std::size_t world_size_;
+  std::size_t rank_;
+  std::map<std::size_t, std::string_view>* data_;
 };
 
 /**
@@ -154,7 +180,7 @@ class BroadcastFunctor {
  public:
   std::string const name{"Broadcast"};
 
-  BroadcastFunctor(int rank, int root) : rank_{rank}, root_{root} {}
+  BroadcastFunctor(std::size_t rank, std::size_t root) : rank_{rank}, root_{root} {}
 
   void operator()(char const* input, std::size_t bytes, std::string* buffer) const {
     if (rank_ == root_) {
@@ -164,11 +190,11 @@ class BroadcastFunctor {
   }
 
  private:
-  int rank_;
-  int root_;
+  std::size_t rank_;
+  std::size_t root_;
 };
 
-void InMemoryHandler::Init(int world_size, int) {
+void InMemoryHandler::Init(std::size_t world_size, std::size_t) {
   CHECK(world_size_ < world_size) << "In memory handler already initialized.";
 
   std::unique_lock<std::mutex> lock(mutex_);
@@ -178,7 +204,7 @@ void InMemoryHandler::Init(int world_size, int) {
   cv_.notify_all();
 }
 
-void InMemoryHandler::Shutdown(uint64_t sequence_number, int) {
+void InMemoryHandler::Shutdown(uint64_t sequence_number, std::size_t) {
   CHECK(world_size_ > 0) << "In memory handler already shutdown.";
 
   std::unique_lock<std::mutex> lock(mutex_);
@@ -194,24 +220,30 @@ void InMemoryHandler::Shutdown(uint64_t sequence_number, int) {
 }
 
 void InMemoryHandler::Allgather(char const* input, std::size_t bytes, std::string* output,
-                                std::size_t sequence_number, int rank) {
+                                std::size_t sequence_number, std::size_t rank) {
   Handle(input, bytes, output, sequence_number, rank, AllgatherFunctor{world_size_, rank});
 }
 
+void InMemoryHandler::AllgatherV(char const* input, std::size_t bytes, std::string* output,
+                                 std::size_t sequence_number, std::size_t rank) {
+  Handle(input, bytes, output, sequence_number, rank, AllgatherVFunctor{world_size_, rank, &aux_});
+}
+
 void InMemoryHandler::Allreduce(char const* input, std::size_t bytes, std::string* output,
-                                std::size_t sequence_number, int rank, DataType data_type,
+                                std::size_t sequence_number, std::size_t rank, DataType data_type,
                                 Operation op) {
   Handle(input, bytes, output, sequence_number, rank, AllreduceFunctor{data_type, op});
 }
 
 void InMemoryHandler::Broadcast(char const* input, std::size_t bytes, std::string* output,
-                                std::size_t sequence_number, int rank, int root) {
+                                std::size_t sequence_number, std::size_t rank, std::size_t root) {
   Handle(input, bytes, output, sequence_number, rank, BroadcastFunctor{rank, root});
 }
 
 template <class HandlerFunctor>
 void InMemoryHandler::Handle(char const* input, std::size_t bytes, std::string* output,
-                             std::size_t sequence_number, int rank, HandlerFunctor const& functor) {
+                             std::size_t sequence_number, std::size_t rank,
+                             HandlerFunctor const& functor) {
   // Pass through if there is only 1 client.
   if (world_size_ == 1) {
     if (input != output->data()) {
diff --git a/src/collective/in_memory_handler.h b/src/collective/in_memory_handler.h
index 4182c7b3d..f9ac52007 100644
--- a/src/collective/in_memory_handler.h
+++ b/src/collective/in_memory_handler.h
@@ -3,6 +3,7 @@
  */
 #pragma once
 #include <condition_variable>
+#include <map>
 #include <string>
 
 #include "communicator.h"
@@ -31,7 +32,7 @@ class InMemoryHandler {
    *
    * This is used when the handler only needs to be initialized once with a known world size.
    */
-  explicit InMemoryHandler(int worldSize) : world_size_{worldSize} {}
+  explicit InMemoryHandler(std::size_t worldSize) : world_size_{worldSize} {}
 
   /**
    * @brief Initialize the handler with the world size and rank.
@@ -41,7 +42,7 @@ class InMemoryHandler {
    * This is used when multiple objects/threads are accessing the same handler and need to
    * initialize it collectively.
    */
-  void Init(int world_size, int rank);
+  void Init(std::size_t world_size, std::size_t rank);
 
   /**
    * @brief Shut down the handler.
@@ -51,7 +52,7 @@ class InMemoryHandler {
    * This is used when multiple objects/threads are accessing the same handler and need to
    * shut it down collectively.
    */
-  void Shutdown(uint64_t sequence_number, int rank);
+  void Shutdown(uint64_t sequence_number, std::size_t rank);
 
   /**
    * @brief Perform allgather.
@@ -62,7 +63,18 @@ class InMemoryHandler {
    * @param rank Index of the worker.
    */
   void Allgather(char const* input, std::size_t bytes, std::string* output,
-                 std::size_t sequence_number, int rank);
+                 std::size_t sequence_number, std::size_t rank);
+
+  /**
+   * @brief Perform variable-length allgather.
+   * @param input The input buffer.
+   * @param bytes Number of bytes in the input buffer.
+   * @param output The output buffer.
+   * @param sequence_number Call sequence number.
+   * @param rank Index of the worker.
+   */
+  void AllgatherV(char const* input, std::size_t bytes, std::string* output,
+                  std::size_t sequence_number, std::size_t rank);
 
   /**
    * @brief Perform allreduce.
@@ -75,7 +87,7 @@ class InMemoryHandler {
    * @param op The reduce operation.
    */
   void Allreduce(char const* input, std::size_t bytes, std::string* output,
-                 std::size_t sequence_number, int rank, DataType data_type, Operation op);
+                 std::size_t sequence_number, std::size_t rank, DataType data_type, Operation op);
 
   /**
    * @brief Perform broadcast.
@@ -87,7 +99,7 @@ class InMemoryHandler {
    * @param root Index of the worker to broadcast from.
    */
   void Broadcast(char const* input, std::size_t bytes, std::string* output,
-                 std::size_t sequence_number, int rank, int root);
+                 std::size_t sequence_number, std::size_t rank, std::size_t root);
 
  private:
   /**
@@ -102,15 +114,16 @@ class InMemoryHandler {
    */
   template <class HandlerFunctor>
   void Handle(char const* input, std::size_t size, std::string* output, std::size_t sequence_number,
-              int rank, HandlerFunctor const& functor);
+              std::size_t rank, HandlerFunctor const& functor);
 
-  int world_size_{};                    /// Number of workers.
-  int received_{};                      /// Number of calls received with the current sequence.
-  int sent_{};                          /// Number of calls completed with the current sequence.
-  std::string buffer_{};                /// A shared common buffer.
-  uint64_t sequence_number_{};          /// Call sequence number.
-  mutable std::mutex mutex_;            /// Lock.
-  mutable std::condition_variable cv_;  /// Conditional variable to wait on.
+  std::size_t world_size_{};  /// Number of workers.
+  std::size_t received_{};    /// Number of calls received with the current sequence.
+  std::size_t sent_{};        /// Number of calls completed with the current sequence.
+  std::string buffer_{};      /// A shared common buffer.
+  std::map<std::size_t, std::string_view> aux_{};  /// A shared auxiliary map.
+  uint64_t sequence_number_{};                     /// Call sequence number.
+  mutable std::mutex mutex_;                       /// Lock.
+  mutable std::condition_variable cv_;             /// Conditional variable to wait on.
 };
 
 }  // namespace collective
diff --git a/src/collective/noop_communicator.h b/src/collective/noop_communicator.h
index 28a0a1cad..2d88fd802 100644
--- a/src/collective/noop_communicator.h
+++ b/src/collective/noop_communicator.h
@@ -17,10 +17,11 @@ class NoOpCommunicator : public Communicator {
   NoOpCommunicator() : Communicator(1, 0) {}
   bool IsDistributed() const override { return false; }
   bool IsFederated() const override { return false; }
-  void AllGather(void *, std::size_t) override {}
+  std::string AllGather(std::string_view) override { return {}; }
+  std::string AllGatherV(std::string_view) override { return {}; }
   void AllReduce(void *, std::size_t, DataType, Operation) override {}
   void Broadcast(void *, std::size_t, int) override {}
-  std::string GetProcessorName() override { return ""; }
+  std::string GetProcessorName() override { return {}; }
   void Print(const std::string &message) override { LOG(CONSOLE) << message; }
 
  protected:
diff --git a/src/collective/rabit_communicator.h b/src/collective/rabit_communicator.h
index 9b79624a2..59a4bbbd8 100644
--- a/src/collective/rabit_communicator.h
+++ b/src/collective/rabit_communicator.h
@@ -7,6 +7,7 @@
 #include <string>
 #include <vector>
 
+#include "communicator-inl.h"
 #include "communicator.h"
 #include "xgboost/json.h"
 
@@ -55,10 +56,33 @@ class RabitCommunicator : public Communicator {
 
   bool IsFederated() const override { return false; }
 
-  void AllGather(void *send_receive_buffer, std::size_t size) override {
-    auto const per_rank = size / GetWorldSize();
+  /**
+   * @brief Gathers equally sized input from every worker into one buffer of size * world bytes.
+   */
+  std::string AllGather(std::string_view input) override {
+    auto const per_rank = input.size();
+    auto const total_size = per_rank * GetWorldSize();
     auto const index = per_rank * GetRank();
-    rabit::Allgather(static_cast<char *>(send_receive_buffer), size, index, per_rank, per_rank);
+    std::string result(total_size, '\0');
+    // The exchange operates on `result` itself, so the local slice must be placed at this
+    // rank's offset first (as AllGatherV does); otherwise only zero bytes are contributed.
+    result.replace(index, per_rank, input);
+    rabit::Allgather(result.data(), total_size, index, per_rank, per_rank);
+    return result;
+  }
+
+  std::string AllGatherV(std::string_view input) override {
+    auto const size_node_slice = input.size();
+    auto const all_sizes = collective::Allgather(size_node_slice);
+    auto const total_size = std::accumulate(all_sizes.cbegin(), all_sizes.cend(), 0ul);
+    auto const begin_index =
+        std::accumulate(all_sizes.cbegin(), all_sizes.cbegin() + GetRank(), 0ul);
+    auto const size_prev_slice = GetRank() == 0 ? 0 : all_sizes[GetRank() - 1];
+
+    std::string result(total_size, '\0');
+    result.replace(begin_index, size_node_slice, input);
+    rabit::Allgather(result.data(), total_size, begin_index, size_node_slice, size_prev_slice);
+    return result;
   }
 
   void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index 1989f68a9..48e764986 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -76,10 +76,8 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
 
 void SimpleDMatrix::ReindexFeatures(Context const* ctx) {
   if (info_.IsColumnSplit()) {
-    std::vector<uint64_t> buffer(collective::GetWorldSize());
-    buffer[collective::GetRank()] = info_.num_col_;
-    collective::Allgather(buffer.data(), buffer.size() * sizeof(uint64_t));
-    auto offset = std::accumulate(buffer.cbegin(), buffer.cbegin() + collective::GetRank(), 0ul);
+    auto const cols = collective::Allgather(info_.num_col_);
+    auto const offset = std::accumulate(cols.cbegin(), cols.cbegin() + collective::GetRank(), 0ul);
     if (offset == 0) {
       return;
     }
diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h
index b4612e24c..680c50398 100644
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -292,20 +292,19 @@ class HistEvaluator {
    */
   std::vector<CPUExpandEntry> Allgather(std::vector<CPUExpandEntry> const &entries) {
     auto const world = collective::GetWorldSize();
-    auto const rank = collective::GetRank();
     auto const num_entries = entries.size();
 
     // First, gather all the primitive fields.
-    std::vector<CPUExpandEntry> all_entries(num_entries * world);
+    std::vector<CPUExpandEntry> local_entries(num_entries);
     std::vector<uint32_t> cat_bits;
     std::vector<std::size_t> cat_bits_sizes;
     for (std::size_t i = 0; i < num_entries; i++) {
-      all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes);
+      local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes);
     }
-    collective::Allgather(all_entries.data(), all_entries.size() * sizeof(CPUExpandEntry));
+    auto all_entries = collective::Allgather(local_entries);
 
     // Gather all the cat_bits.
-    auto gathered = collective::AllgatherV(cat_bits, cat_bits_sizes);
+    auto gathered = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes);
 
     common::ParallelFor(num_entries * world, ctx_->Threads(), [&] (auto i) {
       // Copy the cat_bits back into all expand entries.
@@ -579,28 +578,24 @@ class HistMultiEvaluator {
    */
   std::vector<MultiExpandEntry> Allgather(std::vector<MultiExpandEntry> const &entries) {
     auto const world = collective::GetWorldSize();
-    auto const rank = collective::GetRank();
     auto const num_entries = entries.size();
 
     // First, gather all the primitive fields.
-    std::vector<MultiExpandEntry> all_entries(num_entries * world);
+    std::vector<MultiExpandEntry> local_entries(num_entries);
     std::vector<uint32_t> cat_bits;
     std::vector<std::size_t> cat_bits_sizes;
     std::vector<GradientPairPrecise> gradients;
     for (std::size_t i = 0; i < num_entries; i++) {
-      all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes,
-                                                         &gradients);
+      local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes, &gradients);
     }
-    collective::Allgather(all_entries.data(), all_entries.size() * sizeof(MultiExpandEntry));
+    auto all_entries = collective::Allgather(local_entries);
 
     // Gather all the cat_bits.
-    auto gathered_cat_bits = collective::AllgatherV(cat_bits, cat_bits_sizes);
+    auto gathered_cat_bits = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes);
 
     // Gather all the gradients.
     auto const num_gradients = gradients.size();
-    std::vector<GradientPairPrecise> all_gradients(num_gradients * world);
-    std::copy_n(gradients.cbegin(), num_gradients, all_gradients.begin() + num_gradients * rank);
-    collective::Allgather(all_gradients.data(), all_gradients.size() * sizeof(GradientPairPrecise));
+    auto const all_gradients = collective::Allgather(gradients);
 
     auto const total_entries = num_entries * world;
     auto const gradients_per_entry = num_gradients / num_entries;
diff --git a/tests/cpp/collective/test_in_memory_communicator.cc b/tests/cpp/collective/test_in_memory_communicator.cc
index f36e30e33..69c427a4e 100644
--- a/tests/cpp/collective/test_in_memory_communicator.cc
+++ b/tests/cpp/collective/test_in_memory_communicator.cc
@@ -29,6 +29,11 @@ class InMemoryCommunicatorTest : public ::testing::Test {
     VerifyAllgather(comm, rank);
   }
 
+  static void AllgatherV(int rank) {
+    InMemoryCommunicator comm{kWorldSize, rank};
+    VerifyAllgatherV(comm, rank);
+  }
+
   static void AllreduceMax(int rank) {
     InMemoryCommunicator comm{kWorldSize, rank};
     VerifyAllreduceMax(comm, rank);
@@ -80,14 +85,19 @@ class InMemoryCommunicatorTest : public ::testing::Test {
 
  protected:
   static void VerifyAllgather(InMemoryCommunicator &comm, int rank) {
-    char buffer[kWorldSize] = {'a', 'b', 'c'};
-    buffer[rank] = '0' + rank;
-    comm.AllGather(buffer, kWorldSize);
+    std::string input{static_cast<char>('0' + rank)};
+    auto output = comm.AllGather(input);
     for (auto i = 0; i < kWorldSize; i++) {
-      EXPECT_EQ(buffer[i], '0' + i);
+      EXPECT_EQ(output[i], static_cast<char>('0' + i));
     }
   }
 
+  static void VerifyAllgatherV(InMemoryCommunicator &comm, int rank) {
+    std::vector<std::string_view> inputs{"a", "bb", "ccc"};
+    auto output = comm.AllGatherV(inputs[rank]);
+    EXPECT_EQ(output, "abbccc");
+  }
+
   static void VerifyAllreduceMax(InMemoryCommunicator &comm, int rank) {
     int buffer[] = {1 + rank, 2 + rank, 3 + rank, 4 + rank, 5 + rank};
     comm.AllReduce(buffer, sizeof(buffer) / sizeof(buffer[0]), DataType::kInt32, Operation::kMax);
@@ -205,6 +215,8 @@ TEST(InMemoryCommunicatorSimpleTest, IsDistributed) {
 
 TEST_F(InMemoryCommunicatorTest, Allgather) { Verify(&Allgather); }
 
+TEST_F(InMemoryCommunicatorTest, AllgatherV) { Verify(&AllgatherV); }
+
 TEST_F(InMemoryCommunicatorTest, AllreduceMax) { Verify(&AllreduceMax); }
 
 TEST_F(InMemoryCommunicatorTest, AllreduceMin) { Verify(&AllreduceMin); }
diff --git a/tests/cpp/plugin/helpers.h b/tests/cpp/plugin/helpers.h
index 20b4afc30..b756adefd 100644
--- a/tests/cpp/plugin/helpers.h
+++ b/tests/cpp/plugin/helpers.h
@@ -23,7 +23,7 @@ class ServerForTest {
   std::unique_ptr<grpc::Server> server_;
 
  public:
-  explicit ServerForTest(std::int32_t world_size) {
+  explicit ServerForTest(std::size_t world_size) {
     server_thread_.reset(new std::thread([this, world_size] {
       grpc::ServerBuilder builder;
       xgboost::federated::FederatedService service{world_size};
diff --git a/tests/cpp/plugin/test_federated_communicator.cc b/tests/cpp/plugin/test_federated_communicator.cc
index 8b0e1039a..68b112f1c 100644
--- a/tests/cpp/plugin/test_federated_communicator.cc
+++ b/tests/cpp/plugin/test_federated_communicator.cc
@@ -19,6 +19,11 @@ class FederatedCommunicatorTest : public BaseFederatedTest {
     CheckAllgather(comm, rank);
   }
 
+  static void VerifyAllgatherV(int rank, const std::string &server_address) {
+    FederatedCommunicator comm{kWorldSize, rank, server_address};
+    CheckAllgatherV(comm, rank);
+  }
+
   static void VerifyAllreduce(int rank, const std::string &server_address) {
     FederatedCommunicator comm{kWorldSize, rank, server_address};
     CheckAllreduce(comm);
@@ -31,14 +36,19 @@ class FederatedCommunicatorTest : public BaseFederatedTest {
 
  protected:
   static void CheckAllgather(FederatedCommunicator &comm, int rank) {
-    int buffer[kWorldSize] = {0, 0};
-    buffer[rank] = rank;
-    comm.AllGather(buffer, sizeof(buffer));
+    std::string input{static_cast<char>('0' + rank)};
+    auto output = comm.AllGather(input);
     for (auto i = 0; i < kWorldSize; i++) {
-      EXPECT_EQ(buffer[i], i);
+      EXPECT_EQ(output[i], static_cast<char>('0' + i));
     }
   }
 
+  static void CheckAllgatherV(FederatedCommunicator &comm, int rank) {
+    std::vector<std::string_view> inputs{"Federated", " Learning!!!"};
+    auto output = comm.AllGatherV(inputs[rank]);
+    EXPECT_EQ(output, "Federated Learning!!!");
+  }
+
   static void CheckAllreduce(FederatedCommunicator &comm) {
     int buffer[] = {1, 2, 3, 4, 5};
     comm.AllReduce(buffer, sizeof(buffer) / sizeof(buffer[0]), DataType::kInt32, Operation::kSum);
@@ -119,6 +129,16 @@ TEST_F(FederatedCommunicatorTest, Allgather) {
   }
 }
 
+TEST_F(FederatedCommunicatorTest, AllgatherV) {
+  std::vector<std::thread> threads;
+  for (auto rank = 0; rank < kWorldSize; rank++) {
+    threads.emplace_back(&FederatedCommunicatorTest::VerifyAllgatherV, rank, server_->Address());
+  }
+  for (auto &thread : threads) {
+    thread.join();
+  }
+}
+
 TEST_F(FederatedCommunicatorTest, Allreduce) {
   std::vector<std::thread> threads;
   for (auto rank = 0; rank < kWorldSize; rank++) {
diff --git a/tests/cpp/plugin/test_federated_server.cc b/tests/cpp/plugin/test_federated_server.cc
index 633d64df1..c40e58fa3 100644
--- a/tests/cpp/plugin/test_federated_server.cc
+++ b/tests/cpp/plugin/test_federated_server.cc
@@ -18,6 +18,11 @@ class FederatedServerTest : public BaseFederatedTest {
     CheckAllgather(client, rank);
   }
 
+  static void VerifyAllgatherV(int rank, const std::string& server_address) {
+    federated::FederatedClient client{server_address, rank};
+    CheckAllgatherV(client, rank);
+  }
+
   static void VerifyAllreduce(int rank, const std::string& server_address) {
     federated::FederatedClient client{server_address, rank};
     CheckAllreduce(client);
@@ -39,8 +44,7 @@ class FederatedServerTest : public BaseFederatedTest {
 
  protected:
   static void CheckAllgather(federated::FederatedClient& client, int rank) {
-    int data[kWorldSize] = {0, 0};
-    data[rank] = rank;
+    int data[] = {rank};
     std::string send_buffer(reinterpret_cast<char const*>(data), sizeof(data));
     auto reply = client.Allgather(send_buffer);
     auto const* result = reinterpret_cast<int const*>(reply.data());
@@ -49,6 +53,12 @@ class FederatedServerTest : public BaseFederatedTest {
     }
   }
 
+  static void CheckAllgatherV(federated::FederatedClient& client, int rank) {
+    std::vector<std::string_view> inputs{"Hello,", " World!"};
+    auto reply = client.AllgatherV(inputs[rank]);
+    EXPECT_EQ(reply, "Hello, World!");
+  }
+
   static void CheckAllreduce(federated::FederatedClient& client) {
     int data[] = {1, 2, 3, 4, 5};
     std::string send_buffer(reinterpret_cast<char const*>(data), sizeof(data));
@@ -80,6 +90,16 @@ TEST_F(FederatedServerTest, Allgather) {
   }
 }
 
+TEST_F(FederatedServerTest, AllgatherV) {
+  std::vector<std::thread> threads;
+  for (auto rank = 0; rank < kWorldSize; rank++) {
+    threads.emplace_back(&FederatedServerTest::VerifyAllgatherV, rank, server_->Address());
+  }
+  for (auto& thread : threads) {
+    thread.join();
+  }
+}
+
 TEST_F(FederatedServerTest, Allreduce) {
   std::vector<std::thread> threads;
   for (auto rank = 0; rank < kWorldSize; rank++) {

From cd8760cba3fdb5c43fb7337a2d1262f896f8ac41 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 13 Oct 2023 09:07:01 +0800
Subject: [PATCH 46/59] [doc] Update document about running tests. [skip ci]
 (#9658)

---
 doc/contrib/unit_tests.rst | 58 +++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 22 deletions(-)

diff --git a/doc/contrib/unit_tests.rst b/doc/contrib/unit_tests.rst
index ef4ad1480..2aa97e812 100644
--- a/doc/contrib/unit_tests.rst
+++ b/doc/contrib/unit_tests.rst
@@ -16,7 +16,14 @@ Adding a new unit test
 
 Python package: pytest
 ======================
-Add your test under the directory `tests/python/ <https://github.com/dmlc/xgboost/tree/master/tests/python>`_ or `tests/python-gpu/ <https://github.com/dmlc/xgboost/tree/master/tests/python-gpu>`_ (if you are testing GPU code). Refer to `the PyTest tutorial <https://docs.pytest.org/en/latest/getting-started.html>`_ to learn how to write tests for Python code.
+Add your test under the directories
+
+- `tests/python/ <https://github.com/dmlc/xgboost/tree/master/tests/python>`_
+- `tests/python-gpu/ <https://github.com/dmlc/xgboost/tree/master/tests/python-gpu>`_ (if you are testing GPU code)
+- `tests/test_distributed/ <https://github.com/dmlc/xgboost/tree/master/tests/test_distributed>`_ (if a distributed framework is used)
+
+Refer to `the PyTest tutorial <https://docs.pytest.org/en/latest/getting-started.html>`_
+to learn how to write tests for Python code.
 
 You may try running your test by following instructions in :ref:`this section <running_pytest>`.
 
@@ -56,19 +63,26 @@ Run
 
 .. code-block:: bash
 
-  make Rcheck
+  python ./tests/ci_build/test_r_package.py --task=check
 
-at the root of the project directory.
+at the root of the project directory. The command builds and checks the XGBoost
+R package. Alternatively, if you want to just run the tests, you can use the following
+commands after installing XGBoost:
+
+.. code-block:: bash
+
+  cd R-package/tests/
+  Rscript testthat.R
 
 .. _running_jvm_tests:
 
 JVM packages
 ============
-As part of the building process, tests are run:
+Maven is used to run the tests:
 
 .. code-block:: bash
 
-  mvn package
+  mvn test
 
 .. _running_pytest:
 
@@ -99,6 +113,14 @@ In addition, to test CUDA code, run:
 
 (For this step, you should have compiled XGBoost with CUDA enabled.)
 
+For testing with distributed frameworks like ``Dask`` and ``PySpark``:
+
+.. code:: bash
+
+  # Tell Python where to find XGBoost module
+  export PYTHONPATH=./python-package
+  pytest -v -s --fulltrace tests/test_distributed
+
 .. _running_gtest:
 
 C++: Google Test
@@ -110,21 +132,13 @@ To build and run C++ unit tests enable tests while running CMake:
 
   mkdir build
   cd build
-  cmake -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON  ..
-  make
-  make test
+  cmake -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_CUDA=ON -DUSE_NCCL=ON ..
+  ninja
+  ./testxgboost
 
-To enable tests for CUDA code, add ``-DUSE_CUDA=ON`` and ``-DUSE_NCCL=ON`` (CUDA toolkit required):
-
-.. code-block:: bash
-
-  mkdir build
-  cd build
-  cmake -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_CUDA=ON -DUSE_NCCL=ON ..
-  make
-  make test
-
-One can also run all unit test using ctest tool which provides higher flexibility. For example:
+Flags like ``USE_CUDA``, ``USE_DMLC_GTEST`` are optional. For more info about how to build
+XGBoost from source, see :doc:`/build`. One can also run all unit tests using the ctest
+tool, which provides higher flexibility. For example:
 
 .. code-block:: bash
 
@@ -157,14 +171,14 @@ sanitizer is not compatible with the other two sanitizers.
 
 .. code-block:: bash
 
-  cmake -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address;leak" /path/to/xgboost
+  cmake -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address;undefined" /path/to/xgboost
 
 By default, CMake will search regular system paths for sanitizers, you can also
 supply a specified SANITIZER_PATH.
 
 .. code-block:: bash
 
-  cmake -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address;leak" \
+  cmake -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address;undefined" \
   -DSANITIZER_PATH=/path/to/sanitizers /path/to/xgboost
 
 How to use sanitizers with CUDA support
@@ -181,7 +195,7 @@ environment variable:
 Other sanitizer runtime options
 ===============================
 
-By default undefined sanitizer doesn't print out the backtrace.  You can enable it by
+By default undefined sanitizer doesn't print out the backtrace. You can enable it by
 exporting environment variable:
 
 .. code-block::

From a5e07a01f83e3ddd13cfdbec5b4f627f93321610 Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Date: Thu, 12 Oct 2023 21:11:54 -0700
Subject: [PATCH 47/59] [CI] Pull CentOS 7 images from NGC (#9666)

---
 tests/ci_build/Dockerfile.gpu_build_centos7   | 2 +-
 tests/ci_build/Dockerfile.gpu_build_r_centos7 | 2 +-
 tests/ci_build/Dockerfile.jvm_gpu_build       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7
index 6134d49aa..98a0a7033 100644
--- a/tests/ci_build/Dockerfile.gpu_build_centos7
+++ b/tests/ci_build/Dockerfile.gpu_build_centos7
@@ -1,5 +1,5 @@
 ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
+FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
 ARG CUDA_VERSION_ARG
 ARG NCCL_VERSION_ARG
 ARG RAPIDS_VERSION_ARG
diff --git a/tests/ci_build/Dockerfile.gpu_build_r_centos7 b/tests/ci_build/Dockerfile.gpu_build_r_centos7
index 6cfd30fe5..b73cf5adb 100644
--- a/tests/ci_build/Dockerfile.gpu_build_r_centos7
+++ b/tests/ci_build/Dockerfile.gpu_build_r_centos7
@@ -1,5 +1,5 @@
 ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
+FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
 ARG CUDA_VERSION_ARG
 
 # Install all basic requirements
diff --git a/tests/ci_build/Dockerfile.jvm_gpu_build b/tests/ci_build/Dockerfile.jvm_gpu_build
index d4a580495..86ce7e72a 100644
--- a/tests/ci_build/Dockerfile.jvm_gpu_build
+++ b/tests/ci_build/Dockerfile.jvm_gpu_build
@@ -1,5 +1,5 @@
 ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
+FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
 ARG CUDA_VERSION_ARG
 ARG NCCL_VERSION_ARG
 

From 81a059864aafafa49f5d6bbc27560e74a722f939 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 13 Oct 2023 14:35:14 +0800
Subject: [PATCH 48/59] Skip check for pollhup. (#9661)

---
 rabit/include/rabit/internal/socket.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/rabit/include/rabit/internal/socket.h b/rabit/include/rabit/internal/socket.h
index f1a6699fb..89e324482 100644
--- a/rabit/include/rabit/internal/socket.h
+++ b/rabit/include/rabit/internal/socket.h
@@ -100,9 +100,6 @@ std::enable_if_t<std::is_integral_v<E>, xgboost::collective::Result> PollError(E
   if ((revents & POLLNVAL) != 0) {
     return xgboost::system::FailWithCode("Invalid polling request.");
   }
-  if ((revents & POLLHUP) != 0) {
-    return xgboost::system::FailWithCode("Poll hung up.");
-  }
   return xgboost::collective::Success();
 }
 

From 53049b16b840adf5b03a9640a303baca6a1ade48 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Sat, 14 Oct 2023 09:34:37 +0800
Subject: [PATCH 49/59] [coll] Broadcast. (#9659)

---
 R-package/src/Makevars.in              |  1 +
 R-package/src/Makevars.win             |  1 +
 src/collective/broadcast.cc            | 83 ++++++++++++++++++++++++++
 src/collective/broadcast.h             | 26 ++++++++
 src/common/bitfield.h                  | 53 ++++++++++++----
 src/common/common.h                    |  3 +-
 tests/cpp/collective/test_broadcast.cc | 68 +++++++++++++++++++++
 7 files changed, 220 insertions(+), 15 deletions(-)
 create mode 100644 src/collective/broadcast.cc
 create mode 100644 src/collective/broadcast.h
 create mode 100644 tests/cpp/collective/test_broadcast.cc

diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index 541c0fb52..808960319 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -99,6 +99,7 @@ OBJECTS= \
     $(PKGROOT)/src/logging.o \
     $(PKGROOT)/src/global_config.o \
     $(PKGROOT)/src/collective/allgather.o \
+    $(PKGROOT)/src/collective/broadcast.o \
     $(PKGROOT)/src/collective/comm.o \
     $(PKGROOT)/src/collective/tracker.o \
     $(PKGROOT)/src/collective/communicator.o \
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index faacd6d8d..43bfcf7c1 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -99,6 +99,7 @@ OBJECTS= \
     $(PKGROOT)/src/logging.o \
     $(PKGROOT)/src/global_config.o \
     $(PKGROOT)/src/collective/allgather.o \
+    $(PKGROOT)/src/collective/broadcast.o \
     $(PKGROOT)/src/collective/comm.o \
     $(PKGROOT)/src/collective/tracker.o \
     $(PKGROOT)/src/collective/communicator.o \
diff --git a/src/collective/broadcast.cc b/src/collective/broadcast.cc
new file mode 100644
index 000000000..be7e8f55f
--- /dev/null
+++ b/src/collective/broadcast.cc
@@ -0,0 +1,83 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include "broadcast.h"
+
+#include <cstdint>  // for int32_t, int8_t
+#include <utility>  // for move
+
+#include "../common/bitfield.h"         // for TrailingZeroBits, RBitField32
+#include "comm.h"                       // for Comm
+#include "xgboost/collective/result.h"  // for Result
+#include "xgboost/span.h"               // for Span
+
+namespace xgboost::collective::cpu_impl {
+namespace {
+std::int32_t ShiftedParentRank(std::int32_t shifted_rank, std::int32_t depth) {
+  std::uint32_t mask{std::uint32_t{0} - 1};  // 0xff...
+  RBitField32 maskbits{common::Span<std::uint32_t>{&mask, 1}};
+  RBitField32 rankbits{
+      common::Span<std::uint32_t>{reinterpret_cast<std::uint32_t*>(&shifted_rank), 1}};
+  // prepare for counting trailing zeros.
+  for (std::int32_t i = 0; i < depth + 1; ++i) {
+    if (rankbits.Check(i)) {
+      maskbits.Set(i);
+    } else {
+      maskbits.Clear(i);
+    }
+  }
+
+  CHECK_NE(mask, 0);
+  auto k = TrailingZeroBits(mask);
+  auto shifted_parent = shifted_rank - (1 << k);
+  return shifted_parent;
+}
+
+// Shift the root node to rank 0
+std::int32_t ShiftLeft(std::int32_t rank, std::int32_t world, std::int32_t root) {
+  auto shifted_rank = (rank + world - root) % world;
+  return shifted_rank;
+}
+// shift back to the original rank
+std::int32_t ShiftRight(std::int32_t rank, std::int32_t world, std::int32_t root) {
+  auto orig = (rank + root) % world;
+  return orig;
+}
+}  // namespace
+
+Result Broadcast(Comm const& comm, common::Span<std::int8_t> data, std::int32_t root) {
+  // Binomial tree broadcast
+  // * Wiki
+  // https://en.wikipedia.org/wiki/Broadcast_(parallel_pattern)#Binomial_Tree_Broadcast
+  // * Impl
+  // https://people.mpi-inf.mpg.de/~mehlhorn/ftp/NewToolbox/collective.pdf
+
+  auto rank = comm.Rank();
+  auto world = comm.World();
+
+  // shift root to rank 0
+  auto shifted_rank = ShiftLeft(rank, world, root);
+  std::int32_t depth = std::ceil(std::log2(static_cast<double>(world))) - 1;
+
+  if (shifted_rank != 0) {  // not root
+    auto parent = ShiftRight(ShiftedParentRank(shifted_rank, depth), world, root);
+    comm.Chan(parent)->RecvAll(data);
+    auto rc = comm.Chan(parent)->Block();
+    if (!rc.OK()) {
+      return Fail("broadcast failed.", std::move(rc));
+    }
+  }
+
+  for (std::int32_t i = depth; i >= 0; --i) {
+    CHECK_GE((i + 1), 0);  // weird clang-tidy error that i might be negative
+    if (shifted_rank % (1 << (i + 1)) == 0 && shifted_rank + (1 << i) < world) {
+      auto sft_peer = shifted_rank + (1 << i);
+      auto peer = ShiftRight(sft_peer, world, root);
+      CHECK_NE(peer, root);
+      comm.Chan(peer)->SendAll(data);
+    }
+  }
+
+  return comm.Block();
+}
+}  // namespace xgboost::collective::cpu_impl
diff --git a/src/collective/broadcast.h b/src/collective/broadcast.h
new file mode 100644
index 000000000..28db83815
--- /dev/null
+++ b/src/collective/broadcast.h
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#pragma once
+#include <cstdint>  // for int32_t, int8_t
+
+#include "comm.h"                       // for Comm
+#include "xgboost/collective/result.h"  // for Result
+#include "xgboost/span.h"               // for Span
+
+namespace xgboost::collective {
+namespace cpu_impl {
+Result Broadcast(Comm const& comm, common::Span<std::int8_t> data, std::int32_t root);
+}
+
+/**
+ * @brief binomial tree broadcast is used on CPU with the default implementation.
+ */
+template <typename T>
+[[nodiscard]] Result Broadcast(Comm const& comm, common::Span<T> data, std::int32_t root) {
+  auto n_total_bytes = data.size_bytes();
+  auto erased =
+      common::Span<std::int8_t>{reinterpret_cast<std::int8_t*>(data.data()), n_total_bytes};
+  return cpu_impl::Broadcast(comm, erased, root);
+}
+}  // namespace xgboost::collective
diff --git a/src/common/bitfield.h b/src/common/bitfield.h
index 6cdf4412e..efabaa834 100644
--- a/src/common/bitfield.h
+++ b/src/common/bitfield.h
@@ -5,22 +5,21 @@
 #ifndef XGBOOST_COMMON_BITFIELD_H_
 #define XGBOOST_COMMON_BITFIELD_H_
 
-#include <algorithm>
-#include <bitset>
-#include <cinttypes>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <vector>
+#include <algorithm>    // for min
+#include <bitset>       // for bitset
+#include <cstdint>      // for uint32_t, uint64_t, uint8_t
+#include <ostream>      // for ostream
+#include <type_traits>  // for conditional_t, is_signed_v
 
 #if defined(__CUDACC__)
 #include <thrust/copy.h>
 #include <thrust/device_ptr.h>
+
 #include "device_helpers.cuh"
 #endif  // defined(__CUDACC__)
 
-#include "xgboost/span.h"
 #include "common.h"
+#include "xgboost/span.h"  // for Span
 
 namespace xgboost {
 
@@ -75,7 +74,7 @@ struct BitFieldContainer {
  private:
   value_type* bits_{nullptr};
   size_type n_values_{0};
-  static_assert(!std::is_signed<VT>::value, "Must use an unsiged type as the underlying storage.");
+  static_assert(!std::is_signed_v<VT>, "Must use an unsiged type as the underlying storage.");
 
  public:
   XGBOOST_DEVICE static Pos ToBitPos(index_type pos) {
@@ -240,11 +239,39 @@ struct RBitsPolicy : public BitFieldContainer<VT, RBitsPolicy<VT>> {
 
 // Format: <Const><Direction>BitField<size of underlying type in bits>, underlying type
 // must be unsigned.
-using LBitField64 = BitFieldContainer<uint64_t, LBitsPolicy<uint64_t>>;
-using RBitField8 = BitFieldContainer<uint8_t, RBitsPolicy<unsigned char>>;
+using LBitField64 = BitFieldContainer<std::uint64_t, LBitsPolicy<std::uint64_t>>;
+using RBitField8 = BitFieldContainer<std::uint8_t, RBitsPolicy<unsigned char>>;
 
-using LBitField32 = BitFieldContainer<uint32_t, LBitsPolicy<uint32_t>>;
-using CLBitField32 = BitFieldContainer<uint32_t, LBitsPolicy<uint32_t, true>, true>;
+using LBitField32 = BitFieldContainer<std::uint32_t, LBitsPolicy<std::uint32_t>>;
+using CLBitField32 = BitFieldContainer<std::uint32_t, LBitsPolicy<std::uint32_t, true>, true>;
+using RBitField32 = BitFieldContainer<std::uint32_t, RBitsPolicy<std::uint32_t>>;
+
+namespace detail {
+inline std::uint32_t TrailingZeroBitsImpl(std::uint32_t value) {
+  auto n = sizeof(value) * 8;
+  std::uint32_t cnt{0};
+  for (decltype(n) i = 0; i < n; i++) {
+    if ((value >> i) & 1) {
+      break;
+    }
+    cnt++;
+  }
+  return cnt;
+}
+}  // namespace detail
+
+inline std::uint32_t TrailingZeroBits(std::uint32_t value) {
+  if (value == 0) {
+    return sizeof(value) * 8;
+  }
+#if defined(__GNUC__)
+  return __builtin_ctz(value);
+#elif defined(_MSC_VER)
+  return _tzcnt_u32(value);
+#else
+  return detail::TrailingZeroBitsImpl(value);
+#endif  //  __GNUC__
+}
 }       // namespace xgboost
 
 #endif  // XGBOOST_COMMON_BITFIELD_H_
diff --git a/src/common/common.h b/src/common/common.h
index bedff80b3..2abb34cb2 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -6,7 +6,6 @@
 #ifndef XGBOOST_COMMON_COMMON_H_
 #define XGBOOST_COMMON_COMMON_H_
 
-#include <algorithm>  // for max
 #include <array>      // for array
 #include <cmath>      // for ceil
 #include <cstddef>    // for size_t
@@ -181,7 +180,7 @@ inline void SetDevice(std::int32_t device) {
 #endif
 
 /**
- * Last index of a group in a CSR style of index pointer.
+ * @brief Last index of a group in a CSR style of index pointer.
  */
 template <typename Indexable>
 XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
diff --git a/tests/cpp/collective/test_broadcast.cc b/tests/cpp/collective/test_broadcast.cc
new file mode 100644
index 000000000..485f6dcdf
--- /dev/null
+++ b/tests/cpp/collective/test_broadcast.cc
@@ -0,0 +1,68 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/collective/socket.h>
+
+#include <cstdint>  // for int32_t
+#include <string>   // for string
+#include <thread>   // for thread
+#include <vector>   // for vector
+
+#include "../../../src/collective/broadcast.h"  // for Broadcast
+#include "../../../src/collective/tracker.h"    // for GetHostAddress, Tracker
+#include "test_worker.h"                        // for WorkerForTest
+
+namespace xgboost::collective {
+namespace {
+class Worker : public WorkerForTest {
+ public:
+  using WorkerForTest::WorkerForTest;
+
+  void Run() {
+    for (std::int32_t r = 0; r < comm_.World(); ++r) {
+      // basic test
+      std::vector<std::int32_t> data(1, comm_.Rank());
+      auto rc = Broadcast(this->comm_, common::Span{data.data(), data.size()}, r);
+      ASSERT_TRUE(rc.OK()) << rc.Report();
+      ASSERT_EQ(data[0], r);
+    }
+
+    for (std::int32_t r = 0; r < comm_.World(); ++r) {
+      std::vector<std::int32_t> data(1 << 16, comm_.Rank());
+      auto rc = Broadcast(this->comm_, common::Span{data.data(), data.size()}, r);
+      ASSERT_TRUE(rc.OK()) << rc.Report();
+      ASSERT_EQ(data[0], r);
+    }
+  }
+};
+
+class BroadcastTest : public SocketTest {};
+}  // namespace
+
+TEST_F(BroadcastTest, Basic) {
+  std::int32_t n_workers = std::min(24u, std::thread::hardware_concurrency());
+  std::chrono::seconds timeout{3};
+
+  std::string host;
+  ASSERT_TRUE(GetHostAddress(&host).OK());
+  RabitTracker tracker{StringView{host}, n_workers, 0, timeout};
+  auto fut = tracker.Run();
+
+  std::vector<std::thread> workers;
+  std::int32_t port = tracker.Port();
+
+  for (std::int32_t i = 0; i < n_workers; ++i) {
+    workers.emplace_back([=] {
+      Worker worker{host, port, timeout, n_workers, i};
+      worker.Run();
+    });
+  }
+
+  for (auto& t : workers) {
+    t.join();
+  }
+
+  ASSERT_TRUE(fut.get().OK());
+}
+}  // namespace xgboost::collective

From eb562d38296ccb0e9640669edb5a3ff9d2a2a76a Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Fri, 13 Oct 2023 23:46:07 -0500
Subject: [PATCH 50/59] [CI] address cmakelint warnings about whitespace
 (#9674)

---
 CMakeLists.txt                      | 128 ++++++++++++++--------------
 cmake/Doc.cmake                     |   7 +-
 cmake/FindPrefetchIntrinsics.cmake  |   2 +-
 cmake/Sanitizer.cmake               |  12 +--
 cmake/Utils.cmake                   |  66 +++++++-------
 cmake/Version.cmake                 |   2 +-
 cmake/modules/FindNVML.cmake        |   2 +-
 cmake/modules/FindNccl.cmake        |   6 +-
 demo/c-api/basic/CMakeLists.txt     |   2 +-
 demo/c-api/inference/CMakeLists.txt |   2 +-
 jvm-packages/CMakeLists.txt         |   4 +-
 plugin/CMakeLists.txt               |   8 +-
 rabit/CMakeLists.txt                |   4 +-
 src/CMakeLists.txt                  |   4 +-
 tests/cpp/CMakeLists.txt            |  12 +--
 15 files changed, 133 insertions(+), 128 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e1a38929..7e5f2c556 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ cmake_policy(SET CMP0076 NEW)
 set(CMAKE_POLICY_DEFAULT_CMP0063 NEW)
 cmake_policy(SET CMP0063 NEW)
 
-if ((${CMAKE_VERSION} VERSION_GREATER 3.13) OR (${CMAKE_VERSION} VERSION_EQUAL 3.13))
+if((${CMAKE_VERSION} VERSION_GREATER 3.13) OR (${CMAKE_VERSION} VERSION_EQUAL 3.13))
   cmake_policy(SET CMP0077 NEW)
 endif()
 
@@ -86,91 +86,93 @@ option(PLUGIN_UPDATER_ONEAPI "DPC++ updater" OFF)
 option(ADD_PKGCONFIG "Add xgboost.pc into system." ON)
 
 #-- Checks for building XGBoost
-if (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug)))
+if(USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug)))
   message(SEND_ERROR "Do not enable `USE_DEBUG_OUTPUT' with release build.")
 endif()
-if (USE_NCCL AND NOT (USE_CUDA))
+if(USE_NCCL AND NOT (USE_CUDA))
   message(SEND_ERROR "`USE_NCCL` must be enabled with `USE_CUDA` flag.")
 endif()
-if (USE_DEVICE_DEBUG AND NOT (USE_CUDA))
+if(USE_DEVICE_DEBUG AND NOT (USE_CUDA))
   message(SEND_ERROR "`USE_DEVICE_DEBUG` must be enabled with `USE_CUDA` flag.")
 endif()
-if (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
+if(BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
   message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable BUILD_WITH_SHARED_NCCL.")
 endif()
-if (JVM_BINDINGS AND R_LIB)
+if(JVM_BINDINGS AND R_LIB)
   message(SEND_ERROR "`R_LIB' is not compatible with `JVM_BINDINGS' as they both have customized configurations.")
 endif()
-if (R_LIB AND GOOGLE_TEST)
-  message(WARNING "Some C++ unittests will fail with `R_LIB` enabled,
- as R package redirects some functions to R runtime implementation.")
+if(R_LIB AND GOOGLE_TEST)
+  message(
+    WARNING
+    "Some C++ tests will fail with `R_LIB` enabled, as R package redirects some functions to R runtime implementation."
+  )
 endif()
-if (PLUGIN_RMM AND NOT (USE_CUDA))
+if(PLUGIN_RMM AND NOT (USE_CUDA))
   message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_CUDA` flag.")
 endif()
-if (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
+if(PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
   message(SEND_ERROR "`PLUGIN_RMM` must be used with GCC or Clang compiler.")
 endif()
-if (PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux"))
+if(PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux"))
   message(SEND_ERROR "`PLUGIN_RMM` must be used with Linux.")
 endif()
-if (ENABLE_ALL_WARNINGS)
-  if ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
+if(ENABLE_ALL_WARNINGS)
+  if((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
     message(SEND_ERROR "ENABLE_ALL_WARNINGS is only available for Clang and GCC.")
   endif()
 endif()
-if (BUILD_STATIC_LIB AND (R_LIB OR JVM_BINDINGS))
+if(BUILD_STATIC_LIB AND (R_LIB OR JVM_BINDINGS))
   message(SEND_ERROR "Cannot build a static library libxgboost.a when R or JVM packages are enabled.")
 endif()
-if (PLUGIN_FEDERATED)
-  if (CMAKE_CROSSCOMPILING)
+if(PLUGIN_FEDERATED)
+  if(CMAKE_CROSSCOMPILING)
     message(SEND_ERROR "Cannot cross compile with federated learning support")
   endif()
-  if (BUILD_STATIC_LIB)
+  if(BUILD_STATIC_LIB)
     message(SEND_ERROR "Cannot build static lib with federated learning support")
   endif()
-  if (R_LIB OR JVM_BINDINGS)
+  if(R_LIB OR JVM_BINDINGS)
     message(SEND_ERROR "Cannot enable federated learning support when R or JVM packages are enabled.")
   endif()
-  if (WIN32)
+  if(WIN32)
     message(SEND_ERROR "Federated learning not supported for Windows platform")
   endif()
 endif()
 
 #-- Removed options
-if (USE_AVX)
+if(USE_AVX)
   message(SEND_ERROR  "The option `USE_AVX` is deprecated as experimental AVX features have been removed from XGBoost.")
 endif()
-if (PLUGIN_LZ4)
+if(PLUGIN_LZ4)
   message(SEND_ERROR  "The option `PLUGIN_LZ4` is removed from XGBoost.")
 endif()
-if (RABIT_BUILD_MPI)
+if(RABIT_BUILD_MPI)
   message(SEND_ERROR "The option `RABIT_BUILD_MPI` has been removed from XGBoost.")
 endif()
-if (USE_S3)
+if(USE_S3)
   message(SEND_ERROR "The option `USE_S3` has been removed from XGBoost")
 endif()
-if (USE_AZURE)
+if(USE_AZURE)
   message(SEND_ERROR "The option `USE_AZURE` has been removed from XGBoost")
 endif()
-if (USE_HDFS)
+if(USE_HDFS)
   message(SEND_ERROR "The option `USE_HDFS` has been removed from XGBoost")
 endif()
 
 #-- Sanitizer
-if (USE_SANITIZER)
+if(USE_SANITIZER)
   include(cmake/Sanitizer.cmake)
   enable_sanitizers("${ENABLED_SANITIZERS}")
 endif()
 
-if (USE_CUDA)
+if(USE_CUDA)
   set(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE)
   # `export CXX=' is ignored by CMake CUDA.
   set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
   message(STATUS "Configured CUDA host compiler: ${CMAKE_CUDA_HOST_COMPILER}")
 
   enable_language(CUDA)
-  if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.0)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.0)
     message(FATAL_ERROR "CUDA version must be at least 11.0!")
   endif()
   set(GEN_CODE "")
@@ -180,7 +182,7 @@ if (USE_CUDA)
   find_package(CUDAToolkit REQUIRED)
 endif()
 
-if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
+if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
     ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR
       (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")))
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always")
@@ -188,10 +190,10 @@ endif()
 
 find_package(Threads REQUIRED)
 
-if (USE_OPENMP)
-  if (APPLE)
+if(USE_OPENMP)
+  if(APPLE)
     find_package(OpenMP)
-    if (NOT OpenMP_FOUND)
+    if(NOT OpenMP_FOUND)
       # Try again with extra path info; required for libomp 15+ from Homebrew
       execute_process(COMMAND brew --prefix libomp
                       OUTPUT_VARIABLE HOMEBREW_LIBOMP_PREFIX
@@ -205,31 +207,33 @@ if (USE_OPENMP)
       set(OpenMP_omp_LIBRARY ${HOMEBREW_LIBOMP_PREFIX}/lib/libomp.dylib)
       find_package(OpenMP REQUIRED)
     endif()
-  else ()
+  else()
     find_package(OpenMP REQUIRED)
   endif()
 endif()
 #Add for IBM i
-if (${CMAKE_SYSTEM_NAME} MATCHES "OS400")
+if(${CMAKE_SYSTEM_NAME} MATCHES "OS400")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
   set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> -X64 qc <TARGET> <OBJECTS>")
 endif()
 
-if (USE_NCCL)
+if(USE_NCCL)
   find_package(Nccl REQUIRED)
 endif()
 
 # dmlc-core
 msvc_use_static_runtime()
-if (FORCE_SHARED_CRT)
+if(FORCE_SHARED_CRT)
   set(DMLC_FORCE_SHARED_CRT ON)
 endif()
 add_subdirectory(${xgboost_SOURCE_DIR}/dmlc-core)
 
-if (MSVC)
-  if (TARGET dmlc_unit_tests)
-    target_compile_options(dmlc_unit_tests PRIVATE
-                           -D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE)
+if(MSVC)
+  if(TARGET dmlc_unit_tests)
+    target_compile_options(
+        dmlc_unit_tests PRIVATE
+        -D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE
+    )
   endif()
 endif()
 
@@ -241,19 +245,19 @@ add_subdirectory(${xgboost_SOURCE_DIR}/src)
 target_link_libraries(objxgboost PUBLIC dmlc)
 
 # Exports some R specific definitions and objects
-if (R_LIB)
+if(R_LIB)
   add_subdirectory(${xgboost_SOURCE_DIR}/R-package)
 endif()
 
 # This creates its own shared library `xgboost4j'.
-if (JVM_BINDINGS)
+if(JVM_BINDINGS)
   add_subdirectory(${xgboost_SOURCE_DIR}/jvm-packages)
 endif()
 
 # Plugin
 add_subdirectory(${xgboost_SOURCE_DIR}/plugin)
 
-if (PLUGIN_RMM)
+if(PLUGIN_RMM)
   find_package(rmm REQUIRED)
 
   # Patch the rmm targets so they reference the static cudart
@@ -267,7 +271,7 @@ if (PLUGIN_RMM)
 endif()
 
 #-- library
-if (BUILD_STATIC_LIB)
+if(BUILD_STATIC_LIB)
   add_library(xgboost STATIC)
 else()
   add_library(xgboost SHARED)
@@ -280,7 +284,7 @@ target_include_directories(xgboost
 #-- End shared library
 
 #-- CLI for xgboost
-if (BUILD_DEPRECATED_CLI)
+if(BUILD_DEPRECATED_CLI)
   add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc)
   target_link_libraries(runxgboost PRIVATE objxgboost)
   target_include_directories(runxgboost
@@ -294,9 +298,9 @@ if (BUILD_DEPRECATED_CLI)
   xgboost_target_link_libraries(runxgboost)
   xgboost_target_defs(runxgboost)
 
-  if (KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR)
+  if(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR)
     set_output_directory(runxgboost ${xgboost_BINARY_DIR})
-  else ()
+  else()
     set_output_directory(runxgboost ${xgboost_SOURCE_DIR})
   endif()
 endif()
@@ -309,38 +313,38 @@ foreach(target xgboost objxgboost dmlc)
   xgboost_target_defs(${target})
 endforeach()
 
-if (JVM_BINDINGS)
+if(JVM_BINDINGS)
   xgboost_target_properties(xgboost4j)
   xgboost_target_link_libraries(xgboost4j)
   xgboost_target_defs(xgboost4j)
 endif()
 
-if (KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR)
+if(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR)
   set_output_directory(xgboost ${xgboost_BINARY_DIR}/lib)
-else ()
+else()
   set_output_directory(xgboost ${xgboost_SOURCE_DIR}/lib)
 endif()
 
 # Ensure these two targets do not build simultaneously, as they produce outputs with conflicting names
-if (BUILD_DEPRECATED_CLI)
+if(BUILD_DEPRECATED_CLI)
   add_dependencies(xgboost runxgboost)
 endif()
 
 #-- Installing XGBoost
-if (R_LIB)
+if(R_LIB)
   include(cmake/RPackageInstallTargetSetup.cmake)
   set_target_properties(xgboost PROPERTIES PREFIX "")
-  if (APPLE)
+  if(APPLE)
     set_target_properties(xgboost PROPERTIES SUFFIX ".so")
   endif()
   setup_rpackage_install_target(xgboost "${CMAKE_CURRENT_BINARY_DIR}/R-package-install")
   set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/dummy_inst")
 endif()
-if (MINGW)
+if(MINGW)
   set_target_properties(xgboost PROPERTIES PREFIX "")
 endif()
 
-if (BUILD_C_DOC)
+if(BUILD_C_DOC)
   include(cmake/Doc.cmake)
   run_doxygen()
 endif()
@@ -359,14 +363,14 @@ install(DIRECTORY ${xgboost_SOURCE_DIR}/include/xgboost
 #  > in any export set.
 #
 # https://github.com/dmlc/xgboost/issues/6085
-if (BUILD_STATIC_LIB)
-  if (BUILD_DEPRECATED_CLI)
+if(BUILD_STATIC_LIB)
+  if(BUILD_DEPRECATED_CLI)
     set(INSTALL_TARGETS xgboost runxgboost objxgboost dmlc)
   else()
     set(INSTALL_TARGETS xgboost objxgboost dmlc)
   endif()
 else()
-  if (BUILD_DEPRECATED_CLI)
+  if(BUILD_DEPRECATED_CLI)
     set(INSTALL_TARGETS xgboost runxgboost)
   else()
     set(INSTALL_TARGETS xgboost)
@@ -400,7 +404,7 @@ install(
   DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/xgboost)
 
 #-- Test
-if (GOOGLE_TEST)
+if(GOOGLE_TEST)
   enable_testing()
   # Unittests.
   add_executable(testxgboost)
@@ -420,7 +424,7 @@ if (GOOGLE_TEST)
     ${xgboost_SOURCE_DIR}/tests/cli/machine.conf.in
     ${xgboost_BINARY_DIR}/tests/cli/machine.conf
     @ONLY)
-  if (BUILD_DEPRECATED_CLI)
+  if(BUILD_DEPRECATED_CLI)
     add_test(
       NAME TestXGBoostCLI
       COMMAND runxgboost ${xgboost_BINARY_DIR}/tests/cli/machine.conf
@@ -437,7 +441,7 @@ endif()
 msvc_use_static_runtime()
 
 # Add xgboost.pc
-if (ADD_PKGCONFIG)
+if(ADD_PKGCONFIG)
   configure_file(${xgboost_SOURCE_DIR}/cmake/xgboost.pc.in ${xgboost_BINARY_DIR}/xgboost.pc @ONLY)
 
   install(
diff --git a/cmake/Doc.cmake b/cmake/Doc.cmake
index cf4ef26f4..41c4fc961 100644
--- a/cmake/Doc.cmake
+++ b/cmake/Doc.cmake
@@ -1,14 +1,15 @@
-function (run_doxygen)
+function(run_doxygen)
   find_package(Doxygen REQUIRED)
 
-  if (NOT DOXYGEN_DOT_FOUND)
+  if(NOT DOXYGEN_DOT_FOUND)
     message(FATAL_ERROR "Command `dot` not found.  Please install graphviz.")
   endif()
 
   configure_file(
     ${xgboost_SOURCE_DIR}/doc/Doxyfile.in
     ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY)
-  add_custom_target( doc_doxygen ALL
+  add_custom_target(
+    doc_doxygen ALL
     COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
     COMMENT "Generate C APIs documentation."
diff --git a/cmake/FindPrefetchIntrinsics.cmake b/cmake/FindPrefetchIntrinsics.cmake
index 94e65de32..71ae5899f 100644
--- a/cmake/FindPrefetchIntrinsics.cmake
+++ b/cmake/FindPrefetchIntrinsics.cmake
@@ -1,4 +1,4 @@
-function (find_prefetch_intrinsics)
+function(find_prefetch_intrinsics)
   include(CheckCXXSourceCompiles)
   check_cxx_source_compiles("
   #include <xmmintrin.h>
diff --git a/cmake/Sanitizer.cmake b/cmake/Sanitizer.cmake
index 919d5c695..176d967d0 100644
--- a/cmake/Sanitizer.cmake
+++ b/cmake/Sanitizer.cmake
@@ -12,7 +12,7 @@ macro(enable_sanitizer sanitizer)
   elseif(${sanitizer} MATCHES "thread")
     find_package(TSan)
     set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=thread")
-    if (TSan_FOUND)
+    if(TSan_FOUND)
       link_libraries(${TSan_LIBRARY})
     endif()
 
@@ -33,16 +33,16 @@ macro(enable_sanitizers SANITIZERS)
   # Check sanitizers compatibility.
   # Idealy, we should use if(san IN_LIST SANITIZERS) ... endif()
   # But I haven't figure out how to make it work.
-  foreach ( _san ${SANITIZERS} )
+  foreach( _san ${SANITIZERS} )
     string(TOLOWER ${_san} _san)
-    if (_san MATCHES "thread")
-      if (${_use_other_sanitizers})
+    if(_san MATCHES "thread")
+      if(${_use_other_sanitizers})
         message(FATAL_ERROR
           "thread sanitizer is not compatible with ${_san} sanitizer.")
       endif()
       set(_use_thread_sanitizer 1)
-    else ()
-      if (${_use_thread_sanitizer})
+    else()
+      if(${_use_thread_sanitizer})
         message(FATAL_ERROR
           "${_san} sanitizer is not compatible with thread sanitizer.")
       endif()
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index f1e9a5904..586e32ee5 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -76,10 +76,10 @@ endfunction()
 function(set_default_configuration_release)
     if(CMAKE_CONFIGURATION_TYPES STREQUAL "Debug;Release;MinSizeRel;RelWithDebInfo") # multiconfig generator?
         set(CMAKE_CONFIGURATION_TYPES Release CACHE STRING "" FORCE)
-	elseif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
-	  message(STATUS "Setting build type to 'Release' as none was specified.")
-	  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE )
-	endif()
+    elseif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+      message(STATUS "Setting build type to 'Release' as none was specified.")
+      set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
+    endif()
 endfunction()
 
 # Generate nvcc compiler flags given a list of architectures
@@ -90,9 +90,9 @@ function(format_gencode_flags flags out)
   endif()
   # Set up architecture flags
   if(NOT flags)
-    if (CUDA_VERSION VERSION_GREATER_EQUAL "11.8")
+    if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8")
       set(flags "50;60;70;80;90")
-    elseif (CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
+    elseif(CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
       set(flags "50;60;70;80")
     elseif(CUDA_VERSION VERSION_GREATER_EQUAL "10.0")
       set(flags "35;50;60;70")
@@ -103,7 +103,7 @@ function(format_gencode_flags flags out)
     endif()
   endif()
 
-  if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
+  if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
     cmake_policy(SET CMP0104 NEW)
     list(GET flags -1 latest_arch)
     list(TRANSFORM flags APPEND "-real")
@@ -133,17 +133,17 @@ function(xgboost_set_cuda_flags target)
     $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=${OpenMP_CXX_FLAGS}>
     $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all>)
 
-  if (USE_PER_THREAD_DEFAULT_STREAM)
+  if(USE_PER_THREAD_DEFAULT_STREAM)
     target_compile_options(${target} PRIVATE
             $<$<COMPILE_LANGUAGE:CUDA>:--default-stream per-thread>)
   endif()
 
-  if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
+  if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
     set_property(TARGET ${target} PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES})
   endif()
 
-  if (FORCE_COLORED_OUTPUT)
-    if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
+  if(FORCE_COLORED_OUTPUT)
+    if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
         ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR
           (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")))
       target_compile_options(${target} PRIVATE
@@ -151,7 +151,7 @@ function(xgboost_set_cuda_flags target)
     endif()
   endif()
 
-  if (USE_DEVICE_DEBUG)
+  if(USE_DEVICE_DEBUG)
     target_compile_options(${target} PRIVATE
       $<$<AND:$<CONFIG:DEBUG>,$<COMPILE_LANGUAGE:CUDA>>:-G;-src-in-ptx>)
   else()
@@ -159,7 +159,7 @@ function(xgboost_set_cuda_flags target)
       $<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>)
   endif()
 
-  if (USE_NVTX)
+  if(USE_NVTX)
     target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1)
   endif()
 
@@ -169,7 +169,7 @@ function(xgboost_set_cuda_flags target)
     ${xgboost_SOURCE_DIR}/gputreeshap
     ${CUDAToolkit_INCLUDE_DIRS})
 
-  if (MSVC)
+  if(MSVC)
     target_compile_options(${target} PRIVATE
       $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8>)
   endif()
@@ -182,11 +182,11 @@ function(xgboost_set_cuda_flags target)
 endfunction()
 
 macro(xgboost_link_nccl target)
-  if (BUILD_STATIC_LIB)
+  if(BUILD_STATIC_LIB)
     target_include_directories(${target} PUBLIC ${NCCL_INCLUDE_DIR})
     target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_NCCL=1)
     target_link_libraries(${target} PUBLIC ${NCCL_LIBRARY})
-  else ()
+  else()
     target_include_directories(${target} PRIVATE ${NCCL_INCLUDE_DIR})
     target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NCCL=1)
     target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY})
@@ -200,7 +200,7 @@ macro(xgboost_target_properties target)
     CXX_STANDARD_REQUIRED ON
     POSITION_INDEPENDENT_CODE ON)
 
-  if (HIDE_CXX_SYMBOLS)
+  if(HIDE_CXX_SYMBOLS)
     #-- Hide all C++ symbols
     set_target_properties(${target} PROPERTIES
       C_VISIBILITY_PRESET hidden
@@ -209,7 +209,7 @@ macro(xgboost_target_properties target)
     )
   endif()
 
-  if (ENABLE_ALL_WARNINGS)
+  if(ENABLE_ALL_WARNINGS)
     target_compile_options(${target} PUBLIC
       $<IF:$<COMPILE_LANGUAGE:CUDA>,
       -Xcompiler=-Wall -Xcompiler=-Wextra -Xcompiler=-Wno-expansion-to-defined,
@@ -222,7 +222,7 @@ macro(xgboost_target_properties target)
     $<$<AND:$<CXX_COMPILER_ID:MSVC>,$<COMPILE_LANGUAGE:CXX>>:/MP>
     $<$<AND:$<NOT:$<CXX_COMPILER_ID:MSVC>>,$<COMPILE_LANGUAGE:CXX>>:-funroll-loops>)
 
-  if (MSVC)
+  if(MSVC)
     target_compile_options(${target} PRIVATE
       $<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>
       -D_CRT_SECURE_NO_WARNINGS
@@ -230,72 +230,72 @@ macro(xgboost_target_properties target)
     )
   endif()
 
-  if (WIN32 AND MINGW)
+  if(WIN32 AND MINGW)
     target_compile_options(${target} PUBLIC -static-libstdc++)
   endif()
 endmacro()
 
 # Custom definitions used in xgboost.
 macro(xgboost_target_defs target)
-  if (NOT ${target} STREQUAL "dmlc") # skip dmlc core for custom logging.
+  if(NOT ${target} STREQUAL "dmlc") # skip dmlc core for custom logging.
     target_compile_definitions(${target}
       PRIVATE
       -DDMLC_LOG_CUSTOMIZE=1
       $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:_MWAITXINTRIN_H_INCLUDED>)
   endif()
-  if (USE_DEBUG_OUTPUT)
+  if(USE_DEBUG_OUTPUT)
     target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_DEBUG_OUTPUT=1)
   endif()
-  if (XGBOOST_MM_PREFETCH_PRESENT)
+  if(XGBOOST_MM_PREFETCH_PRESENT)
     target_compile_definitions(${target}
       PRIVATE
       -DXGBOOST_MM_PREFETCH_PRESENT=1)
   endif()
-  if (XGBOOST_BUILTIN_PREFETCH_PRESENT)
+  if(XGBOOST_BUILTIN_PREFETCH_PRESENT)
     target_compile_definitions(${target}
       PRIVATE
       -DXGBOOST_BUILTIN_PREFETCH_PRESENT=1)
   endif()
 
-  if (PLUGIN_RMM)
+  if(PLUGIN_RMM)
     target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1)
   endif()
 endmacro()
 
 # handles dependencies
 macro(xgboost_target_link_libraries target)
-  if (BUILD_STATIC_LIB)
+  if(BUILD_STATIC_LIB)
     target_link_libraries(${target} PUBLIC Threads::Threads ${CMAKE_THREAD_LIBS_INIT})
   else()
     target_link_libraries(${target} PRIVATE Threads::Threads ${CMAKE_THREAD_LIBS_INIT})
   endif()
 
-  if (USE_OPENMP)
-    if (BUILD_STATIC_LIB)
+  if(USE_OPENMP)
+    if(BUILD_STATIC_LIB)
       target_link_libraries(${target} PUBLIC OpenMP::OpenMP_CXX)
     else()
       target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX)
     endif()
   endif()
 
-  if (USE_CUDA)
+  if(USE_CUDA)
     xgboost_set_cuda_flags(${target})
     target_link_libraries(${target} PUBLIC CUDA::cudart_static)
   endif()
 
-  if (PLUGIN_RMM)
+  if(PLUGIN_RMM)
     target_link_libraries(${target} PRIVATE rmm::rmm)
   endif()
 
-  if (USE_NCCL)
+  if(USE_NCCL)
     xgboost_link_nccl(${target})
   endif()
 
-  if (USE_NVTX)
+  if(USE_NVTX)
     target_link_libraries(${target} PRIVATE CUDA::nvToolsExt)
   endif()
 
-  if (MINGW)
+  if(MINGW)
     target_link_libraries(${target} PRIVATE wsock32 ws2_32)
   endif()
 endmacro()
diff --git a/cmake/Version.cmake b/cmake/Version.cmake
index a64d1b65e..4af6b27d6 100644
--- a/cmake/Version.cmake
+++ b/cmake/Version.cmake
@@ -1,4 +1,4 @@
-function (write_version)
+function(write_version)
   message(STATUS "xgboost VERSION: ${xgboost_VERSION}")
   configure_file(
     ${xgboost_SOURCE_DIR}/cmake/version_config.h.in
diff --git a/cmake/modules/FindNVML.cmake b/cmake/modules/FindNVML.cmake
index a7061aa29..f0e72f371 100644
--- a/cmake/modules/FindNVML.cmake
+++ b/cmake/modules/FindNVML.cmake
@@ -1,4 +1,4 @@
-if (NVML_LIBRARY)
+if(NVML_LIBRARY)
   unset(NVML_LIBRARY CACHE)
 endif()
 
diff --git a/cmake/modules/FindNccl.cmake b/cmake/modules/FindNccl.cmake
index 6fd2ed981..02ee731a1 100644
--- a/cmake/modules/FindNccl.cmake
+++ b/cmake/modules/FindNccl.cmake
@@ -35,17 +35,17 @@
 #
 # This module assumes that the user has already called find_package(CUDA)
 
-if (NCCL_LIBRARY)
+if(NCCL_LIBRARY)
   if(NOT USE_NCCL_LIB_PATH)
     # Don't cache NCCL_LIBRARY to enable switching between static and shared.
     unset(NCCL_LIBRARY CACHE)
   endif()
 endif()
 
-if (BUILD_WITH_SHARED_NCCL)
+if(BUILD_WITH_SHARED_NCCL)
   # libnccl.so
   set(NCCL_LIB_NAME nccl)
-else ()
+else()
   # libnccl_static.a
   set(NCCL_LIB_NAME nccl_static)
 endif()
diff --git a/demo/c-api/basic/CMakeLists.txt b/demo/c-api/basic/CMakeLists.txt
index f0cc8a744..74567c9c8 100644
--- a/demo/c-api/basic/CMakeLists.txt
+++ b/demo/c-api/basic/CMakeLists.txt
@@ -3,7 +3,7 @@ find_package(xgboost REQUIRED)
 
 # xgboost is built as static libraries, all cxx dependencies need to be linked into the
 # executable.
-if (XGBOOST_BUILD_STATIC_LIB)
+if(XGBOOST_BUILD_STATIC_LIB)
   enable_language(CXX)
   # find again for those  cxx libraries.
   find_package(xgboost REQUIRED)
diff --git a/demo/c-api/inference/CMakeLists.txt b/demo/c-api/inference/CMakeLists.txt
index 70ddb256c..5eded43e2 100644
--- a/demo/c-api/inference/CMakeLists.txt
+++ b/demo/c-api/inference/CMakeLists.txt
@@ -4,7 +4,7 @@ find_package(xgboost REQUIRED)
 
 # xgboost is built as static libraries, all cxx dependencies need to be linked into the
 # executable.
-if (XGBOOST_BUILD_STATIC_LIB)
+if(XGBOOST_BUILD_STATIC_LIB)
   enable_language(CXX)
   # find again for those  cxx libraries.
   find_package(xgboost REQUIRED)
diff --git a/jvm-packages/CMakeLists.txt b/jvm-packages/CMakeLists.txt
index f89970dea..d87301753 100644
--- a/jvm-packages/CMakeLists.txt
+++ b/jvm-packages/CMakeLists.txt
@@ -4,14 +4,14 @@ list(APPEND JVM_SOURCES
   ${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j/src/native/xgboost4j.cpp
   ${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cpp)
 
-if (USE_CUDA)
+if(USE_CUDA)
   list(APPEND JVM_SOURCES
     ${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu)
 endif()
 
 add_library(xgboost4j SHARED ${JVM_SOURCES} ${XGBOOST_OBJ_SOURCES})
 
-if (ENABLE_ALL_WARNINGS)
+if(ENABLE_ALL_WARNINGS)
   target_compile_options(xgboost4j PUBLIC -Wall -Wextra)
 endif()
 
diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt
index 3a1889d2e..6089ae486 100644
--- a/plugin/CMakeLists.txt
+++ b/plugin/CMakeLists.txt
@@ -1,8 +1,8 @@
-if (PLUGIN_DENSE_PARSER)
+if(PLUGIN_DENSE_PARSER)
   target_sources(objxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/dense_parser/dense_libsvm.cc)
 endif()
 
-if (PLUGIN_UPDATER_ONEAPI)
+if(PLUGIN_UPDATER_ONEAPI)
   add_library(oneapi_plugin OBJECT
     ${xgboost_SOURCE_DIR}/plugin/updater_oneapi/regression_obj_oneapi.cc
     ${xgboost_SOURCE_DIR}/plugin/updater_oneapi/predictor_oneapi.cc)
@@ -18,7 +18,7 @@ if (PLUGIN_UPDATER_ONEAPI)
     CXX_STANDARD 17
     CXX_STANDARD_REQUIRED ON
     POSITION_INDEPENDENT_CODE ON)
-  if (USE_OPENMP)
+  if(USE_OPENMP)
     find_package(OpenMP REQUIRED)
     target_link_libraries(oneapi_plugin PUBLIC OpenMP::OpenMP_CXX)
   endif()
@@ -29,6 +29,6 @@ if (PLUGIN_UPDATER_ONEAPI)
 endif()
 
 # Add the Federate Learning plugin if enabled.
-if (PLUGIN_FEDERATED)
+if(PLUGIN_FEDERATED)
   add_subdirectory(federated)
 endif()
diff --git a/rabit/CMakeLists.txt b/rabit/CMakeLists.txt
index f1adcb6e0..4562f864f 100644
--- a/rabit/CMakeLists.txt
+++ b/rabit/CMakeLists.txt
@@ -6,9 +6,9 @@ set(RABIT_SOURCES
   ${CMAKE_CURRENT_LIST_DIR}/src/allreduce_base.cc
   ${CMAKE_CURRENT_LIST_DIR}/src/rabit_c_api.cc)
 
-if (RABIT_MOCK)
+if(RABIT_MOCK)
   list(APPEND RABIT_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/engine_mock.cc)
-else ()
+else()
   list(APPEND RABIT_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/engine.cc)
 endif()
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9b2a803ab..f0dfe061f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -11,7 +11,7 @@ set_source_files_properties(
   PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON)
 target_sources(objxgboost PRIVATE ${RABIT_SOURCES})
 
-if (USE_CUDA)
+if(USE_CUDA)
   file(GLOB_RECURSE CUDA_SOURCES *.cu *.cuh)
   target_sources(objxgboost PRIVATE ${CUDA_SOURCES})
 endif()
@@ -22,7 +22,7 @@ target_include_directories(objxgboost
   ${xgboost_SOURCE_DIR}/dmlc-core/include
   ${xgboost_SOURCE_DIR}/rabit/include)
 
-if (LOG_CAPI_INVOCATION)
+if(LOG_CAPI_INVOCATION)
   target_compile_definitions(objxgboost PRIVATE -DLOG_CAPI_INVOCATION=1)
 endif()
 
diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt
index 5b61f5328..cf34caf81 100644
--- a/tests/cpp/CMakeLists.txt
+++ b/tests/cpp/CMakeLists.txt
@@ -1,5 +1,5 @@
-if (USE_DMLC_GTEST)
-  if (NOT TARGET gtest)
+if(USE_DMLC_GTEST)
+  if(NOT TARGET gtest)
     message(FATAL_ERROR "USE_DMLC_GTEST=ON but dmlc-core didn't bundle gtest")
   endif()
   set(GTEST_LIBRARIES gtest)
@@ -8,17 +8,17 @@ else()
 endif()
 file(GLOB_RECURSE TEST_SOURCES "*.cc")
 
-if (USE_CUDA)
+if(USE_CUDA)
   file(GLOB_RECURSE CUDA_TEST_SOURCES "*.cu")
   list(APPEND TEST_SOURCES ${CUDA_TEST_SOURCES})
 endif()
 
 file(GLOB_RECURSE ONEAPI_TEST_SOURCES "plugin/*_oneapi.cc")
-if (NOT PLUGIN_UPDATER_ONEAPI)
+if(NOT PLUGIN_UPDATER_ONEAPI)
   list(REMOVE_ITEM TEST_SOURCES ${ONEAPI_TEST_SOURCES})
 endif()
 
-if (PLUGIN_FEDERATED)
+if(PLUGIN_FEDERATED)
   target_include_directories(testxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/federated)
   target_link_libraries(testxgboost PRIVATE federated_client)
 else()
@@ -28,7 +28,7 @@ endif()
 
 target_sources(testxgboost PRIVATE ${TEST_SOURCES} ${xgboost_SOURCE_DIR}/plugin/example/custom_obj.cc)
 
-if (USE_CUDA AND PLUGIN_RMM)
+if(USE_CUDA AND PLUGIN_RMM)
   find_package(CUDA)
   target_include_directories(testxgboost PRIVATE ${CUDA_INCLUDE_DIRS})
 endif()

From eee7cdf07ef0c62b87078ef9ffcf202334ffaa77 Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Date: Fri, 13 Oct 2023 22:07:49 -0700
Subject: [PATCH 51/59] Fix build for GCC 8.x (#9670) (#9675)

---
 CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7e5f2c556..460327385 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -244,6 +244,11 @@ add_subdirectory(rabit)
 add_subdirectory(${xgboost_SOURCE_DIR}/src)
 target_link_libraries(objxgboost PUBLIC dmlc)
 
+# Link -lstdc++fs for GCC 8.x
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.0")
+  target_link_libraries(objxgboost PUBLIC stdc++fs)
+endif()
+
 # Exports some R specific definitions and objects
 if(R_LIB)
   add_subdirectory(${xgboost_SOURCE_DIR}/R-package)

From 83191f0839ad4b729b61fee0e12e7349ade84a62 Mon Sep 17 00:00:00 2001
From: Thomas Lynn <32374143+lynntf@users.noreply.github.com>
Date: Sat, 14 Oct 2023 10:38:58 +0200
Subject: [PATCH 52/59] Update learning_to_rank.rst; Correct qid sort in
 snippet (#9673)

---
 doc/tutorials/learning_to_rank.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/tutorials/learning_to_rank.rst b/doc/tutorials/learning_to_rank.rst
index c562dc2df..015f736e0 100644
--- a/doc/tutorials/learning_to_rank.rst
+++ b/doc/tutorials/learning_to_rank.rst
@@ -58,6 +58,7 @@ Notice that the samples are sorted based on their query index in a non-decreasin
   sorted_idx = np.argsort(qid)
   X = X[sorted_idx, :]
   y = y[sorted_idx]
+  qid = qid[sorted_idx]
 
 The simplest way to train a ranking model is by using the scikit-learn estimator interface. Continuing the previous snippet, we can train a simple ranking model without tuning:
 

From 4d1607eefd0c3321f9ab7f442d3e845b977f8a6d Mon Sep 17 00:00:00 2001
From: Bobby Wang <wbo4958@gmail.com>
Date: Tue, 17 Oct 2023 10:35:39 +0800
Subject: [PATCH 53/59] [pyspark] Support stage-level scheduling for training
 (#9519)

---
 python-package/xgboost/spark/core.py  | 220 ++++++++++++++++++++------
 python-package/xgboost/spark/utils.py |   7 +
 2 files changed, 176 insertions(+), 51 deletions(-)

diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index 6b1d2faaa..9fe73005a 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -22,7 +22,7 @@ from typing import (
 
 import numpy as np
 import pandas as pd
-from pyspark import SparkContext, cloudpickle
+from pyspark import RDD, SparkContext, cloudpickle
 from pyspark.ml import Estimator, Model
 from pyspark.ml.functions import array_to_vector, vector_to_array
 from pyspark.ml.linalg import VectorUDT
@@ -44,6 +44,7 @@ from pyspark.ml.util import (
     MLWritable,
     MLWriter,
 )
+from pyspark.resource import ResourceProfileBuilder, TaskResourceRequests
 from pyspark.sql import Column, DataFrame
 from pyspark.sql.functions import col, countDistinct, pandas_udf, rand, struct
 from pyspark.sql.types import (
@@ -88,6 +89,7 @@ from .utils import (
     _get_rabit_args,
     _get_spark_session,
     _is_local,
+    _is_standalone_or_localcluster,
     deserialize_booster,
     deserialize_xgb_model,
     get_class_name,
@@ -342,6 +344,54 @@ class _SparkXGBParams(
                 predict_params[param.name] = self.getOrDefault(param)
         return predict_params
 
+    def _validate_gpu_params(self) -> None:
+        """Validate the gpu parameters and gpu configurations"""
+
+        if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
+            ss = _get_spark_session()
+            sc = ss.sparkContext
+
+            if _is_local(sc):
+                # Support GPU training in Spark local mode is just for debugging
+                # purposes, so it's okay for printing the below warning instead of
+                # checking the real gpu numbers and raising the exception.
+                get_logger(self.__class__.__name__).warning(
+                    "You have enabled GPU in spark local mode. Please make sure your"
+                    " local node has at least %d GPUs",
+                    self.getOrDefault(self.num_workers),
+                )
+            else:
+                executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
+                if executor_gpus is None:
+                    raise ValueError(
+                        "The `spark.executor.resource.gpu.amount` is required for training"
+                        " on GPU."
+                    )
+
+                if not (ss.version >= "3.4.0" and _is_standalone_or_localcluster(sc)):
+                    # We will enable stage-level scheduling in spark 3.4.0+ which doesn't
+                    # require spark.task.resource.gpu.amount to be set explicitly
+                    gpu_per_task = sc.getConf().get("spark.task.resource.gpu.amount")
+                    if gpu_per_task is not None:
+                        if float(gpu_per_task) < 1.0:
+                            raise ValueError(
+                                "XGBoost doesn't support GPU fractional configurations. "
+                                "Please set `spark.task.resource.gpu.amount=spark.executor"
+                                ".resource.gpu.amount`"
+                            )
+
+                        if float(gpu_per_task) > 1.0:
+                            get_logger(self.__class__.__name__).warning(
+                                "%s GPUs for each Spark task is configured, but each "
+                                "XGBoost training task uses only 1 GPU.",
+                                gpu_per_task,
+                            )
+                    else:
+                        raise ValueError(
+                            "The `spark.task.resource.gpu.amount` is required for training"
+                            " on GPU."
+                        )
+
     def _validate_params(self) -> None:
         # pylint: disable=too-many-branches
         init_model = self.getOrDefault("xgb_model")
@@ -421,53 +471,7 @@ class _SparkXGBParams(
                     "`pyspark.ml.linalg.Vector` type."
                 )
 
-        if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
-            gpu_per_task = (
-                _get_spark_session()
-                .sparkContext.getConf()
-                .get("spark.task.resource.gpu.amount")
-            )
-
-            is_local = _is_local(_get_spark_session().sparkContext)
-
-            if is_local:
-                # checking spark local mode.
-                if gpu_per_task is not None:
-                    raise RuntimeError(
-                        "The spark local mode does not support gpu configuration."
-                        "Please remove spark.executor.resource.gpu.amount and "
-                        "spark.task.resource.gpu.amount"
-                    )
-
-                # Support GPU training in Spark local mode is just for debugging
-                # purposes, so it's okay for printing the below warning instead of
-                # checking the real gpu numbers and raising the exception.
-                get_logger(self.__class__.__name__).warning(
-                    "You have enabled GPU in spark local mode. Please make sure your"
-                    " local node has at least %d GPUs",
-                    self.getOrDefault(self.num_workers),
-                )
-            else:
-                # checking spark non-local mode.
-                if gpu_per_task is not None:
-                    if float(gpu_per_task) < 1.0:
-                        raise ValueError(
-                            "XGBoost doesn't support GPU fractional configurations. "
-                            "Please set `spark.task.resource.gpu.amount=spark.executor"
-                            ".resource.gpu.amount`"
-                        )
-
-                    if float(gpu_per_task) > 1.0:
-                        get_logger(self.__class__.__name__).warning(
-                            "%s GPUs for each Spark task is configured, but each "
-                            "XGBoost training task uses only 1 GPU.",
-                            gpu_per_task,
-                        )
-                else:
-                    raise ValueError(
-                        "The `spark.task.resource.gpu.amount` is required for training"
-                        " on GPU."
-                    )
+        self._validate_gpu_params()
 
 
 def _validate_and_convert_feature_col_as_float_col_list(
@@ -592,6 +596,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
             arbitrary_params_dict={},
         )
 
+        self.logger = get_logger(self.__class__.__name__)
+
     def setParams(self, **kwargs: Any) -> None:  # pylint: disable=invalid-name
         """
         Set params for the estimator.
@@ -894,6 +900,116 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
 
         return booster_params, train_call_kwargs_params, dmatrix_kwargs
 
+    def _skip_stage_level_scheduling(self) -> bool:
+        # pylint: disable=too-many-return-statements
+        """Decide whether stage-level scheduling should be skipped for this fit.
+        Return True to skip it (CPU training or an unsuitable Spark setup)."""
+
+        if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
+            ss = _get_spark_session()
+            sc = ss.sparkContext
+            # Compare numerically: as strings, "3.10.0" sorts before "3.4.0".
+            if tuple(int(v) for v in ss.version.split(".")[:2]) < (3, 4):
+                self.logger.info(
+                    "Stage-level scheduling in xgboost requires spark version 3.4.0+"
+                )
+                return True
+
+            if not _is_standalone_or_localcluster(sc):
+                self.logger.info(
+                    "Stage-level scheduling in xgboost requires spark standalone or "
+                    "local-cluster mode"
+                )
+                return True
+
+            executor_cores = sc.getConf().get("spark.executor.cores")
+            executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
+            if executor_cores is None or executor_gpus is None:
+                self.logger.info(
+                    "Stage-level scheduling in xgboost requires spark.executor.cores, "
+                    "spark.executor.resource.gpu.amount to be set."
+                )
+                return True
+
+            if int(executor_cores) == 1:
+                # With a single core, only 1 task runs at any time.
+                self.logger.info(
+                    "Stage-level scheduling in xgboost requires spark.executor.cores > 1 "
+                )
+                return True
+
+            if int(executor_gpus) > 1:
+                # With spark.executor.resource.gpu.amount > 1 we assume the user
+                # knows how to configure Spark so that xgboost runs successfully,
+                # so no resource profile is applied.
+                self.logger.info(
+                    "Stage-level scheduling in xgboost will not work "
+                    "when spark.executor.resource.gpu.amount>1"
+                )
+                return True
+
+            task_gpu_amount = sc.getConf().get("spark.task.resource.gpu.amount")
+
+            if task_gpu_amount is None:
+                # ETL tasks do not grab a gpu when spark.task.resource.gpu.amount is
+                # unset; stage-level scheduling lets the training tasks grab it.
+                return False
+
+            if float(task_gpu_amount) == float(executor_gpus):
+                # spark.executor.resource.gpu.amount == spark.task.resource.gpu.amount
+                # results in only 1 task running at a time, which may cause perf issue.
+                return True
+
+            # We can enable stage-level scheduling
+            return False
+
+        # CPU training doesn't require stage-level scheduling
+        return True
+
+    def _try_stage_level_scheduling(self, rdd: RDD) -> RDD:
+        """Return ``rdd`` with a training resource profile, or as-is if skipped."""
+
+        if self._skip_stage_level_scheduling():
+            return rdd
+
+        ss = _get_spark_session()
+
+        # _skip_stage_level_scheduling() returned False, so executor_cores is set.
+        executor_cores = ss.sparkContext.getConf().get("spark.executor.cores")
+        assert executor_cores is not None
+
+        # Spark-rapids is a project to leverage GPUs to accelerate spark SQL.
+        # If it is enabled, a training task takes every executor core so that no
+        # ETL gpu task can run alongside it and cause a GPU OOM.
+        spark_plugins = ss.conf.get("spark.plugins", " ")
+        assert spark_plugins is not None
+        spark_rapids_sql_enabled = ss.conf.get("spark.rapids.sql.enabled", "true")
+        assert spark_rapids_sql_enabled is not None
+
+        task_cores = (
+            int(executor_cores)
+            if "com.nvidia.spark.SQLPlugin" in spark_plugins
+            and "true" == spark_rapids_sql_enabled.lower()
+            else (int(executor_cores) // 2) + 1
+        )
+
+        # Without spark-rapids each training task takes executor_cores // 2 + 1
+        # cores, i.e. more than half of an executor, so two never share an executor.
+        #
+        # Please note that we can't use GPU to limit the concurrent tasks because of
+        # https://issues.apache.org/jira/browse/SPARK-45527.
+
+        task_gpus = 1.0
+        treqs = TaskResourceRequests().cpus(task_cores).resource("gpu", task_gpus)
+        rp = ResourceProfileBuilder().require(treqs).build
+
+        self.logger.info(
+            "XGBoost training tasks require the resource(cores=%s, gpu=%s).",
+            task_cores,
+            task_gpus,
+        )
+        return rdd.withResources(rp)
+
     def _fit(self, dataset: DataFrame) -> "_SparkXGBModel":
         # pylint: disable=too-many-statements, too-many-locals
         self._validate_params()
@@ -994,14 +1110,16 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
                 )
 
         def _run_job() -> Tuple[str, str]:
-            ret = (
+            rdd = (
                 dataset.mapInPandas(
-                    _train_booster, schema="config string, booster string"  # type: ignore
+                    _train_booster,  # type: ignore
+                    schema="config string, booster string",
                 )
                 .rdd.barrier()
                 .mapPartitions(lambda x: x)
-                .collect()[0]
             )
+            rdd_with_resource = self._try_stage_level_scheduling(rdd)
+            ret = rdd_with_resource.collect()[0]
             return ret[0], ret[1]
 
         get_logger("XGBoost-PySpark").info(
diff --git a/python-package/xgboost/spark/utils.py b/python-package/xgboost/spark/utils.py
index 66d7ca454..395865386 100644
--- a/python-package/xgboost/spark/utils.py
+++ b/python-package/xgboost/spark/utils.py
@@ -129,6 +129,13 @@ def _is_local(spark_context: SparkContext) -> bool:
     return spark_context._jsc.sc().isLocal()
 
 
+def _is_standalone_or_localcluster(spark_context: SparkContext) -> bool:
+    """Return True if ``spark.master`` is a standalone or local-cluster URL."""
+    master = spark_context.getConf().get("spark.master")
+    prefixes = ("spark://", "local-cluster")
+    return master is not None and master.startswith(prefixes)
+
+
 def _get_gpu_id(task_context: TaskContext) -> int:
     """Get the gpu id from the task resources"""
     if task_context is None:

From da6803b75b235bfc359fbcae540288a21a7aadb2 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Mon, 16 Oct 2023 21:16:39 -0700
Subject: [PATCH 54/59] Support column-wise data split with in-memory inputs
 (#9628)

---------

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
---
 include/xgboost/c_api.h                    | 19 ++++-
 python-package/xgboost/core.py             | 24 +++++-
 python-package/xgboost/data.py             | 60 +++++++++++---
 python-package/xgboost/testing/__init__.py | 21 +++++
 src/c_api/c_api.cc                         | 19 ++++-
 src/collective/rabit_communicator.h        |  4 +-
 src/data/data.cc                           | 27 ++++--
 src/data/simple_dmatrix.cc                 |  2 +-
 tests/cpp/c_api/test_c_api.cc              |  3 +
 tests/cpp/data/test_metainfo.cc            | 43 ++++++++++
 tests/python/test_dmatrix.py               | 96 ++++++++++++++++++++++
 tests/python/test_with_arrow.py            | 16 ++++
 12 files changed, 307 insertions(+), 27 deletions(-)

diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index ce2d8bf43..d28b5098b 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -144,9 +144,7 @@ XGB_DLL int XGDMatrixCreateFromFile(const char *fname, int silent, DMatrixHandle
  *            See :doc:`/tutorials/input_format` for more info.
  *          \endverbatim
  *   - silent (optional): Whether to print message during loading. Default to true.
- *   - data_split_mode (optional): Whether to split by row or column. In distributed mode, the
- *     file is split accordingly; otherwise this is only an indicator on how the file was split
- *     beforehand. Default to row.
+ *   - data_split_mode (optional): Whether the file was split by row or column beforehand for distributed computing. Default to row.
  * \param out a loaded data matrix
  * \return 0 when success, -1 when failure happens
  */
@@ -174,6 +172,7 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t *indptr, const unsigned *indic
  * \param config  JSON encoded configuration.  Required values are:
  *   - missing: Which value to represent missing value.
  *   - nthread (optional): Number of threads used for initializing DMatrix.
+ *   - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row.
  * \param out created dmatrix
  * \return 0 when success, -1 when failure happens
  */
@@ -186,6 +185,7 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, char const *indices, char
  * \param config JSON encoded configuration.  Required values are:
  *   - missing: Which value to represent missing value.
  *   - nthread (optional): Number of threads used for initializing DMatrix.
+ *   - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row.
  * \param out created dmatrix
  * \return 0 when success, -1 when failure happens
  */
@@ -200,6 +200,7 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data, char const *config, DMatr
  * \param config  JSON encoded configuration.  Supported values are:
  *   - missing: Which value to represent missing value.
  *   - nthread (optional): Number of threads used for initializing DMatrix.
+ *   - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row.
  * \param out created dmatrix
  * \return 0 when success, -1 when failure happens
  */
@@ -266,6 +267,7 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data,
  * \param config JSON encoded configuration.  Required values are:
  *   - missing: Which value to represent missing value.
  *   - nthread (optional): Number of threads used for initializing DMatrix.
+ *   - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row.
  * \param out created dmatrix
  * \return 0 when success, -1 when failure happens
  */
@@ -278,6 +280,7 @@ XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *data, char const *config
  * \param config JSON encoded configuration.  Required values are:
  *   - missing: Which value to represent missing value.
  *   - nthread (optional): Number of threads used for initializing DMatrix.
+ *   - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row.
  * \param out created dmatrix
  * \return 0 when success, -1 when failure happens
  */
@@ -790,6 +793,16 @@ XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle, bst_ulong *out);
  */
 XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out);
 
+/*!
+ * \brief Get the data split mode from DMatrix.
+ *
+ * \param handle the handle to the DMatrix
+ * \param out The output of the data split mode
+ *
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGDMatrixDataSplitMode(DMatrixHandle handle, bst_ulong *out);
+
 /**
  * \brief Get the predictors from DMatrix as CSR matrix for testing.  If this is a
  *        quantized DMatrix, quantized values are returned instead.
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 91c6bbd85..648851b31 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -303,14 +303,14 @@ def _check_distributed_params(kwargs: Dict[str, Any]) -> None:
 
 
 def _validate_feature_info(
-    feature_info: Sequence[str], n_features: int, name: str
+    feature_info: Sequence[str], n_features: int, is_column_split: bool, name: str
 ) -> List[str]:
     if isinstance(feature_info, str) or not isinstance(feature_info, Sequence):
         raise TypeError(
             f"Expecting a sequence of strings for {name}, got: {type(feature_info)}"
         )
     feature_info = list(feature_info)
-    if len(feature_info) != n_features and n_features != 0:
+    if len(feature_info) != n_features and n_features != 0 and not is_column_split:
         msg = (
             f"{name} must have the same length as the number of data columns, ",
             f"expected {n_features}, got {len(feature_info)}",
@@ -1231,6 +1231,16 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m
         _check_call(_LIB.XGDMatrixNumNonMissing(self.handle, ctypes.byref(ret)))
         return ret.value
 
+    def data_split_mode(self) -> DataSplitMode:
+        """Get the data split mode of the DMatrix.
+
+        .. versionadded:: 2.1.0
+
+        """
+        ret = c_bst_ulong()
+        _check_call(_LIB.XGDMatrixDataSplitMode(self.handle, ctypes.byref(ret)))
+        return DataSplitMode(ret.value)
+
     def slice(
         self, rindex: Union[List[int], np.ndarray], allow_groups: bool = False
     ) -> "DMatrix":
@@ -1298,7 +1308,10 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m
 
         # validate feature name
         feature_names = _validate_feature_info(
-            feature_names, self.num_col(), "feature names"
+            feature_names,
+            self.num_col(),
+            self.data_split_mode() == DataSplitMode.COL,
+            "feature names",
         )
         if len(feature_names) != len(set(feature_names)):
             values, counts = np.unique(
@@ -1371,7 +1384,10 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m
             return
 
         feature_types = _validate_feature_info(
-            feature_types, self.num_col(), "feature types"
+            feature_types,
+            self.num_col(),
+            self.data_split_mode() == DataSplitMode.COL,
+            "feature types",
         )
 
         feature_types_bytes = [bytes(f, encoding="utf-8") for f in feature_types]
diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index bfdb21c80..49287d817 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -107,6 +107,7 @@ def _from_scipy_csr(
     nthread: int,
     feature_names: Optional[FeatureNames],
     feature_types: Optional[FeatureTypes],
+    data_split_mode: DataSplitMode = DataSplitMode.ROW,
 ) -> DispatchedDataBackendReturnType:
     """Initialize data from a CSR matrix."""
 
@@ -118,7 +119,11 @@ def _from_scipy_csr(
             _array_interface(data.indices),
             _array_interface(data.data),
             c_bst_ulong(data.shape[1]),
-            make_jcargs(missing=float(missing), nthread=int(nthread)),
+            make_jcargs(
+                missing=float(missing),
+                nthread=int(nthread),
+                data_split_mode=int(data_split_mode),
+            ),
             ctypes.byref(handle),
         )
     )
@@ -139,6 +144,7 @@ def _from_scipy_csc(
     nthread: int,
     feature_names: Optional[FeatureNames],
     feature_types: Optional[FeatureTypes],
+    data_split_mode: DataSplitMode = DataSplitMode.ROW,
 ) -> DispatchedDataBackendReturnType:
     """Initialize data from a CSC matrix."""
     handle = ctypes.c_void_p()
@@ -149,7 +155,11 @@ def _from_scipy_csc(
             _array_interface(data.indices),
             _array_interface(data.data),
             c_bst_ulong(data.shape[0]),
-            make_jcargs(missing=float(missing), nthread=int(nthread)),
+            make_jcargs(
+                missing=float(missing),
+                nthread=int(nthread),
+                data_split_mode=int(data_split_mode),
+            ),
             ctypes.byref(handle),
         )
     )
@@ -518,11 +528,14 @@ def _from_pandas_df(
     nthread: int,
     feature_names: Optional[FeatureNames],
     feature_types: Optional[FeatureTypes],
+    data_split_mode: DataSplitMode = DataSplitMode.ROW,
 ) -> DispatchedDataBackendReturnType:
     data, feature_names, feature_types = _transform_pandas_df(
         data, enable_categorical, feature_names, feature_types
     )
-    return _from_numpy_array(data, missing, nthread, feature_names, feature_types)
+    return _from_numpy_array(
+        data, missing, nthread, feature_names, feature_types, data_split_mode
+    )
 
 
 def _is_pandas_series(data: DataType) -> bool:
@@ -970,10 +983,13 @@ def _from_list(
     n_threads: int,
     feature_names: Optional[FeatureNames],
     feature_types: Optional[FeatureTypes],
+    data_split_mode: DataSplitMode = DataSplitMode.ROW,
 ) -> DispatchedDataBackendReturnType:
     array = np.array(data)
     _check_data_shape(data)
-    return _from_numpy_array(array, missing, n_threads, feature_names, feature_types)
+    return _from_numpy_array(
+        array, missing, n_threads, feature_names, feature_types, data_split_mode
+    )
 
 
 def _is_tuple(data: DataType) -> bool:
@@ -986,8 +1002,11 @@ def _from_tuple(
     n_threads: int,
     feature_names: Optional[FeatureNames],
     feature_types: Optional[FeatureTypes],
+    data_split_mode: DataSplitMode = DataSplitMode.ROW,
 ) -> DispatchedDataBackendReturnType:
-    return _from_list(data, missing, n_threads, feature_names, feature_types)
+    return _from_list(
+        data, missing, n_threads, feature_names, feature_types, data_split_mode
+    )
 
 
 def _is_iter(data: DataType) -> bool:
@@ -1029,12 +1048,21 @@ def dispatch_data_backend(
     if not _is_cudf_ser(data) and not _is_pandas_series(data):
         _check_data_shape(data)
     if _is_scipy_csr(data):
-        return _from_scipy_csr(data, missing, threads, feature_names, feature_types)
+        return _from_scipy_csr(
+            data, missing, threads, feature_names, feature_types, data_split_mode
+        )
     if _is_scipy_csc(data):
-        return _from_scipy_csc(data, missing, threads, feature_names, feature_types)
+        return _from_scipy_csc(
+            data, missing, threads, feature_names, feature_types, data_split_mode
+        )
     if _is_scipy_coo(data):
         return _from_scipy_csr(
-            data.tocsr(), missing, threads, feature_names, feature_types
+            data.tocsr(),
+            missing,
+            threads,
+            feature_names,
+            feature_types,
+            data_split_mode,
         )
     if _is_np_array_like(data):
         return _from_numpy_array(
@@ -1043,9 +1071,13 @@ def dispatch_data_backend(
     if _is_uri(data):
         return _from_uri(data, missing, feature_names, feature_types, data_split_mode)
     if _is_list(data):
-        return _from_list(data, missing, threads, feature_names, feature_types)
+        return _from_list(
+            data, missing, threads, feature_names, feature_types, data_split_mode
+        )
     if _is_tuple(data):
-        return _from_tuple(data, missing, threads, feature_names, feature_types)
+        return _from_tuple(
+            data, missing, threads, feature_names, feature_types, data_split_mode
+        )
     if _is_arrow(data):
         data = _arrow_transform(data)
     if _is_pandas_series(data):
@@ -1054,7 +1086,13 @@ def dispatch_data_backend(
         data = pd.DataFrame(data)
     if _is_pandas_df(data):
         return _from_pandas_df(
-            data, enable_categorical, missing, threads, feature_names, feature_types
+            data,
+            enable_categorical,
+            missing,
+            threads,
+            feature_names,
+            feature_types,
+            data_split_mode,
         )
     if _is_cudf_df(data) or _is_cudf_ser(data):
         return _from_cudf_df(
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index 2e0933a43..391f2bf9f 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -10,6 +10,7 @@ import os
 import platform
 import socket
 import sys
+import threading
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from io import StringIO
@@ -34,6 +35,7 @@ import pytest
 from scipy import sparse
 
 import xgboost as xgb
+from xgboost import RabitTracker
 from xgboost.core import ArrayLike
 from xgboost.sklearn import SklObjective
 from xgboost.testing.data import (
@@ -938,3 +940,36 @@ def load_agaricus(path: str) -> Tuple[xgb.DMatrix, xgb.DMatrix]:
 
 def project_root(path: str) -> str:
     return normpath(os.path.join(demo_dir(path), os.path.pardir))
+
+
+def run_with_rabit(world_size: int, test_fn: Callable) -> None:
+    """Run ``test_fn`` on ``world_size`` threads joined by a local rabit tracker.
+
+    Each thread enters its own ``CommunicatorContext`` before calling ``test_fn``.
+    An exception raised inside a thread (a failed ``assert`` included) would
+    otherwise only be printed by ``threading``, so the first one recorded is
+    re-raised in the calling thread once all workers and the tracker have joined.
+    """
+    tracker = RabitTracker(host_ip="127.0.0.1", n_workers=world_size)
+    tracker.start(world_size)
+    # list.append is atomic in CPython, so the workers can share this list.
+    errors: list = []
+
+    def run_worker(rabit_env: Dict[str, Union[str, int]]) -> None:
+        try:
+            with xgb.collective.CommunicatorContext(**rabit_env):
+                test_fn()
+        except BaseException as err:  # pylint: disable=broad-except
+            errors.append(err)
+
+    workers = []
+    for _ in range(world_size):
+        worker = threading.Thread(target=run_worker, args=(tracker.worker_envs(),))
+        workers.append(worker)
+        worker.start()
+    for worker in workers:
+        worker.join()
+
+    tracker.join()
+    if errors:
+        raise errors[0]
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 4fb6d90ff..8975bfb2e 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -445,8 +445,11 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, char const *indices, char
   auto config = Json::Load(StringView{c_json_config});
   float missing = GetMissing(config);
   auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", 0);
+  auto data_split_mode =
+      static_cast<DataSplitMode>(OptionalArg<Integer, int64_t>(config, "data_split_mode", 0));
   xgboost_CHECK_C_ARG_PTR(out);
-  *out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
+  *out = new std::shared_ptr<DMatrix>(
+      DMatrix::Create(&adapter, missing, n_threads, "", data_split_mode));
   API_END();
 }
 
@@ -481,8 +484,11 @@ XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char
   auto config = Json::Load(StringView{c_json_config});
   float missing = GetMissing(config);
   auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", common::OmpGetNumThreads(0));
+  auto data_split_mode =
+      static_cast<DataSplitMode>(OptionalArg<Integer, int64_t>(config, "data_split_mode", 0));
   xgboost_CHECK_C_ARG_PTR(out);
-  *out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
+  *out = new std::shared_ptr<DMatrix>(
+      DMatrix::Create(&adapter, missing, n_threads, "", data_split_mode));
 
   API_END();
 }
@@ -722,6 +728,15 @@ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle const handle, xgboost::bst_ulon
   API_END();
 }
 
+XGB_DLL int XGDMatrixDataSplitMode(DMatrixHandle handle, bst_ulong *out) {
+  API_BEGIN();
+  CHECK_HANDLE();
+  auto p_m = CastDMatrixHandle(handle);
+  xgboost_CHECK_C_ARG_PTR(out);
+  *out = static_cast<xgboost::bst_ulong>(p_m->Info().data_split_mode);
+  API_END();
+}
+
 XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config,
                                   xgboost::bst_ulong *out_indptr, unsigned *out_indices,
                                   float *out_data) {
diff --git a/src/collective/rabit_communicator.h b/src/collective/rabit_communicator.h
index 59a4bbbd8..452e9ad9c 100644
--- a/src/collective/rabit_communicator.h
+++ b/src/collective/rabit_communicator.h
@@ -61,6 +61,7 @@ class RabitCommunicator : public Communicator {
     auto const total_size = per_rank * GetWorldSize();
     auto const index = per_rank * GetRank();
     std::string result(total_size, '\0');
+    result.replace(index, per_rank, input);
     rabit::Allgather(result.data(), total_size, index, per_rank, per_rank);
     return result;
   }
@@ -71,7 +72,8 @@ class RabitCommunicator : public Communicator {
     auto const total_size = std::accumulate(all_sizes.cbegin(), all_sizes.cend(), 0ul);
     auto const begin_index =
         std::accumulate(all_sizes.cbegin(), all_sizes.cbegin() + GetRank(), 0ul);
-    auto const size_prev_slice = GetRank() == 0 ? 0 : all_sizes[GetRank() - 1];
+    auto const size_prev_slice =
+        GetRank() == 0 ? all_sizes[GetWorldSize() - 1] : all_sizes[GetRank() - 1];
 
     std::string result(total_size, '\0');
     result.replace(begin_index, size_node_slice, input);
diff --git a/src/data/data.cc b/src/data/data.cc
index 3c190a90b..7e70fff3f 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -635,22 +635,39 @@ void MetaInfo::GetInfo(char const* key, bst_ulong* out_len, DataType dtype,
 }
 
 void MetaInfo::SetFeatureInfo(const char* key, const char **info, const bst_ulong size) {
-  if (size != 0 && this->num_col_ != 0) {
+  if (size != 0 && this->num_col_ != 0 && !IsColumnSplit()) {
     CHECK_EQ(size, this->num_col_) << "Length of " << key << " must be equal to number of columns.";
     CHECK(info);
   }
   if (!std::strcmp(key, "feature_type")) {
     feature_type_names.clear();
-    auto& h_feature_types = feature_types.HostVector();
     for (size_t i = 0; i < size; ++i) {
       auto elem = info[i];
       feature_type_names.emplace_back(elem);
     }
+    if (IsColumnSplit()) {
+      feature_type_names = collective::AllgatherStrings(feature_type_names);
+      CHECK_EQ(feature_type_names.size(), num_col_)
+          << "Length of " << key << " must be equal to number of columns.";
+    }
+    auto& h_feature_types = feature_types.HostVector();
     LoadFeatureType(feature_type_names, &h_feature_types);
   } else if (!std::strcmp(key, "feature_name")) {
-    feature_names.clear();
-    for (size_t i = 0; i < size; ++i) {
-      feature_names.emplace_back(info[i]);
+    if (IsColumnSplit()) {
+      std::vector<std::string> local_feature_names{};
+      auto const rank = collective::GetRank();
+      for (std::size_t i = 0; i < size; ++i) {
+        auto elem = std::to_string(rank) + "." + info[i];
+        local_feature_names.emplace_back(elem);
+      }
+      feature_names = collective::AllgatherStrings(local_feature_names);
+      CHECK_EQ(feature_names.size(), num_col_)
+        << "Length of " << key << " must be equal to number of columns.";
+    } else {
+      feature_names.clear();
+      for (size_t i = 0; i < size; ++i) {
+        feature_names.emplace_back(info[i]);
+      }
     }
   } else {
     LOG(FATAL) << "Unknown feature info name: " << key;
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index 48e764986..3814d74d2 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -75,7 +75,7 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
 }
 
 void SimpleDMatrix::ReindexFeatures(Context const* ctx) {
-  if (info_.IsColumnSplit()) {
+  if (info_.IsColumnSplit() && collective::GetWorldSize() > 1) {
     auto const cols = collective::Allgather(info_.num_col_);
     auto const offset = std::accumulate(cols.cbegin(), cols.cbegin() + collective::GetRank(), 0ul);
     if (offset == 0) {
diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc
index 7fcab199e..4491dee92 100644
--- a/tests/cpp/c_api/test_c_api.cc
+++ b/tests/cpp/c_api/test_c_api.cc
@@ -108,6 +108,7 @@ TEST(CAPI, XGDMatrixCreateFromCSR) {
   Json::Dump(data_arr, &sdata);
   Json config{Object{}};
   config["missing"] = Number{std::numeric_limits<float>::quiet_NaN()};
+  config["data_split_mode"] = Integer{static_cast<int64_t>(DataSplitMode::kCol)};
   Json::Dump(config, &sconfig);
 
   DMatrixHandle handle;
@@ -120,6 +121,8 @@ TEST(CAPI, XGDMatrixCreateFromCSR) {
   ASSERT_EQ(n, 3);
   ASSERT_EQ(XGDMatrixNumNonMissing(handle, &n), 0);
   ASSERT_EQ(n, 3);
+  ASSERT_EQ(XGDMatrixDataSplitMode(handle, &n), 0);
+  ASSERT_EQ(n, static_cast<int64_t>(DataSplitMode::kCol));
 
   std::shared_ptr<xgboost::DMatrix> *pp_fmat =
       static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc
index 3e96d1919..67c5b39a4 100644
--- a/tests/cpp/data/test_metainfo.cc
+++ b/tests/cpp/data/test_metainfo.cc
@@ -74,6 +74,49 @@ TEST(MetaInfo, GetSetFeature) {
   // Other conditions are tested in `SaveLoadBinary`.
 }
 
+namespace {
+void VerifyGetSetFeatureColumnSplit() {
+  xgboost::MetaInfo info;
+  info.data_split_mode = DataSplitMode::kCol;
+  auto const world_size = collective::GetWorldSize();
+
+  auto constexpr kCols{2};
+  std::vector<std::string> types{u8"float", u8"c"};
+  std::vector<char const *> c_types(kCols);
+  std::transform(types.cbegin(), types.cend(), c_types.begin(),
+                 [](auto const &str) { return str.c_str(); });
+  info.num_col_ = kCols;
+  EXPECT_THROW(info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()), dmlc::Error);
+  info.num_col_ = kCols * world_size;
+  EXPECT_NO_THROW(info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()));
+  std::vector<std::string> expected_type_names{u8"float", u8"c",     u8"float",
+                                               u8"c",     u8"float", u8"c"};
+  EXPECT_EQ(info.feature_type_names, expected_type_names);
+  std::vector<xgboost::FeatureType> expected_types{
+      xgboost::FeatureType::kNumerical, xgboost::FeatureType::kCategorical,
+      xgboost::FeatureType::kNumerical, xgboost::FeatureType::kCategorical,
+      xgboost::FeatureType::kNumerical, xgboost::FeatureType::kCategorical};
+  EXPECT_EQ(info.feature_types.HostVector(), expected_types);
+
+  std::vector<std::string> names{u8"feature0", u8"feature1"};
+  std::vector<char const *> c_names(kCols);
+  std::transform(names.cbegin(), names.cend(), c_names.begin(),
+                 [](auto const &str) { return str.c_str(); });
+  info.num_col_ = kCols;
+  EXPECT_THROW(info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size()), dmlc::Error);
+  info.num_col_ = kCols * world_size;
+  EXPECT_NO_THROW(info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size()));
+  std::vector<std::string> expected_names{u8"0.feature0", u8"0.feature1", u8"1.feature0",
+                                          u8"1.feature1", u8"2.feature0", u8"2.feature1"};
+  EXPECT_EQ(info.feature_names, expected_names);
+}
+}  // anonymous namespace
+
+TEST(MetaInfo, GetSetFeatureColumnSplit) {
+  auto constexpr kWorldSize{3};
+  RunWithInMemoryCommunicator(kWorldSize, VerifyGetSetFeatureColumnSplit);
+}
+
 TEST(MetaInfo, SaveLoadBinary) {
   xgboost::MetaInfo info;
   xgboost::Context ctx;
diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py
index 73e2055b7..51bee5669 100644
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -1,4 +1,5 @@
 import os
+import sys
 import tempfile
 
 import numpy as np
@@ -9,6 +10,7 @@ from scipy.sparse import csr_matrix, rand
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.core import DataSplitMode
 from xgboost.testing.data import np_dtypes
 
 rng = np.random.RandomState(1)
@@ -467,3 +469,97 @@ class TestDMatrix:
             m0 = xgb.DMatrix(orig)
             m1 = xgb.DMatrix(x)
             assert tm.predictor_equal(m0, m1)
+
+
+class TestDMatrixColumnSplit:
+    """DMatrix construction with column-wise data split in a world of 3 workers."""
+
+    def test_numpy(self):
+        """Dense numpy input without feature names or types."""
+        def check():
+            dense = np.random.randn(5, 5)
+            dmat = xgb.DMatrix(dense, data_split_mode=DataSplitMode.COL)
+            assert dmat.num_row() == 5
+            assert dmat.num_col() == 5 * xgb.collective.get_world_size()
+            assert dmat.feature_names is None
+            assert dmat.feature_types is None
+
+        tm.run_with_rabit(world_size=3, test_fn=check)
+
+    def test_numpy_feature_names(self):
+        """Dense numpy input with feature names and types supplied by each worker."""
+        def check():
+            world_size = xgb.collective.get_world_size()
+            dense = np.random.randn(5, 5)
+            names = [f"feature{i}" for i in range(5)]
+            dmat = xgb.DMatrix(
+                dense,
+                feature_names=names,
+                feature_types=["float"] * 5,
+                data_split_mode=DataSplitMode.COL,
+            )
+            assert dmat.num_row() == 5
+            assert dmat.num_col() == 5 * world_size
+            assert len(dmat.feature_names) == 5 * world_size
+            assert len(dmat.feature_types) == 5 * world_size
+
+        tm.run_with_rabit(world_size=3, test_fn=check)
+
+    def test_csr(self):
+        """scipy CSR input built from (data, indices, indptr)."""
+        def check():
+            indptr = np.array([0, 2, 3, 6])
+            indices = np.array([0, 2, 2, 0, 1, 2])
+            values = np.array([1, 2, 3, 4, 5, 6])
+            csr = scipy.sparse.csr_matrix((values, indices, indptr), shape=(3, 3))
+            dmat = xgb.DMatrix(csr, data_split_mode=DataSplitMode.COL)
+            assert dmat.num_row() == 3
+            assert dmat.num_col() == 3 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=check)
+
+    def test_csc(self):
+        """scipy CSC input built from (data, (row, col))."""
+        def check():
+            row = np.array([0, 2, 2, 0, 1, 2])
+            col = np.array([0, 0, 1, 2, 2, 2])
+            values = np.array([1, 2, 3, 4, 5, 6])
+            csc = scipy.sparse.csc_matrix((values, (row, col)), shape=(3, 3))
+            dmat = xgb.DMatrix(csc, data_split_mode=DataSplitMode.COL)
+            assert dmat.num_row() == 3
+            assert dmat.num_col() == 3 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=check)
+
+    def test_coo(self):
+        """scipy COO input built from (data, (row, col))."""
+        def check():
+            row = np.array([0, 2, 2, 0, 1, 2])
+            col = np.array([0, 0, 1, 2, 2, 2])
+            values = np.array([1, 2, 3, 4, 5, 6])
+            coo = scipy.sparse.coo_matrix((values, (row, col)), shape=(3, 3))
+            dmat = xgb.DMatrix(coo, data_split_mode=DataSplitMode.COL)
+            assert dmat.num_row() == 3
+            assert dmat.num_col() == 3 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=check)
+
+    def test_list(self):
+        """Nested Python list input holding the integers 1..25."""
+        def check():
+            rows = [list(range(lo, lo + 5)) for lo in range(1, 26, 5)]  # 5x5
+            dmat = xgb.DMatrix(rows, data_split_mode=DataSplitMode.COL)
+            assert dmat.num_row() == 5
+            assert dmat.num_col() == 5 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=check)
+
+    def test_tuple(self):
+        """Nested Python tuple input holding the integers 1..25."""
+        def check():
+            rows = tuple(tuple(range(lo, lo + 5)) for lo in range(1, 26, 5))  # 5x5
+            dmat = xgb.DMatrix(rows, data_split_mode=DataSplitMode.COL)
+            assert dmat.num_row() == 5
+            assert dmat.num_col() == 5 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=check)
diff --git a/tests/python/test_with_arrow.py b/tests/python/test_with_arrow.py
index 4673a688e..fdc4c7dbe 100644
--- a/tests/python/test_with_arrow.py
+++ b/tests/python/test_with_arrow.py
@@ -1,4 +1,5 @@
 import os
+import sys
 import unittest
 
 import numpy as np
@@ -6,6 +7,7 @@ import pytest
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.core import DataSplitMode
 
 try:
     import pandas as pd
@@ -97,3 +99,17 @@ class TestArrowTable:
         y_np_low = dtrain.get_float_info("label_lower_bound")
         np.testing.assert_equal(y_np_up, y_upper_bound.to_pandas().values)
         np.testing.assert_equal(y_np_low, y_lower_bound.to_pandas().values)
+
+
+class TestArrowTableColumnSplit:
+    def test_arrow_table(self):
+        """Check the DMatrix shape for a pyarrow table with column-wise split (world size 3)."""
+        def check_shape():
+            rows = [[0, 1, 2.0, 3.0], [1, 2, 3.0, 4.0]]
+            frame = pd.DataFrame(rows, columns=["a", "b", "c", "d"])
+            arrow_table = pa.Table.from_pandas(frame)
+            dmat = xgb.DMatrix(arrow_table, data_split_mode=DataSplitMode.COL)
+            assert dmat.num_row() == 2
+            assert dmat.num_col() == 4 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=check_shape)

From 48ac9b6cbe95d932f33616e5b2c1df0aeedad216 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Tue, 17 Oct 2023 13:57:14 +0800
Subject: [PATCH 55/59] [coll] Allreduce. (#9679)

---
 R-package/src/Makevars.in              |   1 +
 R-package/src/Makevars.win             |   1 +
 src/collective/allreduce.cc            |  90 ++++++++++++++++++
 src/collective/allreduce.h             |  39 ++++++++
 src/data/array_interface.h             | 123 ++++++++++++++-----------
 tests/cpp/collective/test_allreduce.cc |  72 +++++++++++++++
 tests/cpp/collective/test_broadcast.cc |  33 ++-----
 tests/cpp/collective/test_worker.h     |  23 +++++
 8 files changed, 301 insertions(+), 81 deletions(-)
 create mode 100644 src/collective/allreduce.cc
 create mode 100644 src/collective/allreduce.h
 create mode 100644 tests/cpp/collective/test_allreduce.cc

diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index 808960319..37511ec62 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -99,6 +99,7 @@ OBJECTS= \
     $(PKGROOT)/src/logging.o \
     $(PKGROOT)/src/global_config.o \
     $(PKGROOT)/src/collective/allgather.o \
+    $(PKGROOT)/src/collective/allreduce.o \
     $(PKGROOT)/src/collective/broadcast.o \
     $(PKGROOT)/src/collective/comm.o \
     $(PKGROOT)/src/collective/tracker.o \
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index 43bfcf7c1..611cff874 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -99,6 +99,7 @@ OBJECTS= \
     $(PKGROOT)/src/logging.o \
     $(PKGROOT)/src/global_config.o \
     $(PKGROOT)/src/collective/allgather.o \
+    $(PKGROOT)/src/collective/allreduce.o \
     $(PKGROOT)/src/collective/broadcast.o \
     $(PKGROOT)/src/collective/comm.o \
     $(PKGROOT)/src/collective/tracker.o \
diff --git a/src/collective/allreduce.cc b/src/collective/allreduce.cc
new file mode 100644
index 000000000..6948f6758
--- /dev/null
+++ b/src/collective/allreduce.cc
@@ -0,0 +1,90 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include "allreduce.h"
+
+#include <algorithm>  // for min
+#include <cstddef>    // for size_t
+#include <cstdint>    // for int32_t, int8_t
+#include <vector>     // for vector
+
+#include "../data/array_interface.h"    // for Type, DispatchDType
+#include "allgather.h"                  // for RingAllgather
+#include "comm.h"                       // for Comm
+#include "xgboost/collective/result.h"  // for Result
+#include "xgboost/span.h"               // for Span
+
+namespace xgboost::collective::cpu_impl {
+// Scatter-reduce step of the ring allreduce. In each of the `world - 1` rounds a worker
+// sends one segment of `data` to the next worker, receives another segment from the previous
+// worker, and lets `op` accumulate the received bytes into its own copy of that segment.
+template <typename T>
+Result RingScatterReduceTyped(Comm const& comm, common::Span<std::int8_t> data,
+                              std::size_t n_bytes_in_seg, Func const& op) {
+  auto rank = comm.Rank();
+  auto world = comm.World();
+  auto dst_rank = BootstrapNext(rank, world);
+  auto src_rank = BootstrapPrev(rank, world);
+  auto next_ch = comm.Chan(dst_rank);
+  auto prev_ch = comm.Chan(src_rank);
+
+  // Scratch space for incoming bytes; a segment never exceeds `n_bytes_in_seg`.
+  std::vector<std::int8_t> recv_buffer(n_bytes_in_seg, 0);
+  auto scratch = common::Span{recv_buffer.data(), recv_buffer.size()};
+  // idx-th segment of `data`; clamping lets trailing segments be short or empty.
+  auto segment_of = [&](auto idx) {
+    auto offset = std::min(idx * n_bytes_in_seg, data.size_bytes());
+    return data.subspan(offset, std::min(data.size_bytes() - offset, n_bytes_in_seg));
+  };
+
+  for (std::int32_t step = 0; step < world - 1; ++step) {
+    // send to ring next
+    auto send_seg = segment_of((rank + world - step) % world);
+    next_ch->SendAll(send_seg);
+
+    // receive from ring prev
+    auto recv_seg = segment_of((rank + world - step - 1) % world);
+    CHECK_EQ(recv_seg.size_bytes() % sizeof(T), 0);
+    auto incoming = scratch.subspan(0, recv_seg.size());
+    prev_ch->RecvAll(incoming);
+    auto rc = prev_ch->Block();
+    if (!rc.OK()) {
+      return rc;
+    }
+
+    // accumulate to recv_seg
+    CHECK_EQ(incoming.size(), recv_seg.size());
+    op(incoming, recv_seg);
+  }
+
+  return Success();
+}
+
+// Ring allreduce on type-erased bytes: scatter-reduce the segments, then allgather them.
+// `type` names the element type, which keeps every segment a whole number of elements.
+Result RingAllreduce(Comm const& comm, common::Span<std::int8_t> data, Func const& op,
+                     ArrayInterfaceHandler::Type type) {
+  return DispatchDType(type, [&](auto elem) {
+    using T = decltype(elem);
+    // Divide the data into segments according to the number of workers.
+    CHECK_EQ(data.size_bytes() % sizeof(T), 0);
+    auto n_elems = data.size_bytes() / sizeof(T);
+    auto n_bytes_in_seg = common::DivRoundUp(n_elems, comm.World()) * sizeof(T);
+    // Scatter-reduce the segments around the ring.
+    auto rc = RingScatterReduceTyped<T>(comm, data, n_bytes_in_seg, op);
+    if (!rc.OK()) {
+      return rc;
+    }
+    // Then run the ring allgather over the same segmentation.
+    auto prev = BootstrapPrev(comm.Rank(), comm.World());
+    auto next = BootstrapNext(comm.Rank(), comm.World());
+    auto prev_ch = comm.Chan(prev);
+    auto next_ch = comm.Chan(next);
+    rc = RingAllgather(comm, data, n_bytes_in_seg, 1, prev_ch, next_ch);
+    if (!rc.OK()) {
+      return rc;
+    }
+    return comm.Block();
+  });
+}
+}  // namespace xgboost::collective::cpu_impl
diff --git a/src/collective/allreduce.h b/src/collective/allreduce.h
new file mode 100644
index 000000000..e3f8ab5b8
--- /dev/null
+++ b/src/collective/allreduce.h
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#pragma once
+#include <cstdint>      // for int8_t
+#include <functional>   // for function
+#include <type_traits>  // for is_invocable_v
+
+#include "../data/array_interface.h"    // for ArrayInterfaceHandler
+#include "comm.h"                       // for Comm, RestoreType
+#include "xgboost/collective/result.h"  // for Result
+#include "xgboost/span.h"               // for Span
+
+namespace xgboost::collective {
+namespace cpu_impl {
+using Func =  // byte-level reduction callback taking (lhs, out)
+    std::function<void(common::Span<std::int8_t const> lhs, common::Span<std::int8_t> out)>;
+// Ring allreduce over raw bytes; `type` identifies the element type stored in `data`.
+Result RingAllreduce(Comm const& comm, common::Span<std::int8_t> data, Func const& op,
+                     ArrayInterfaceHandler::Type type);
+}  // namespace cpu_impl
+
+// Allreduce `data` over `comm`; `redop(lhs, out)` is expected to accumulate `lhs` into `out`.
+template <typename T, typename Fn>
+std::enable_if_t<std::is_invocable_v<Fn, common::Span<T const>, common::Span<T>>, Result> Allreduce(
+    Comm const& comm, common::Span<T> data, Fn redop) {
+  auto erased = EraseType(data);
+  auto type = ToDType<T>::kType;
+  // Adapter restoring the element type for `redop`; `type` itself is not needed in here.
+  auto erased_fn = [redop](common::Span<std::int8_t const> lhs, common::Span<std::int8_t> out) {
+    CHECK_EQ(lhs.size(), out.size()) << "Invalid input for reduction.";
+    auto lhs_t = RestoreType<T const>(lhs);
+    auto rhs_t = RestoreType<T>(out);
+    redop(lhs_t, rhs_t);
+  };
+
+  return cpu_impl::RingAllreduce(comm, erased, erased_fn, type);
+}
+}  // namespace xgboost::collective
diff --git a/src/data/array_interface.h b/src/data/array_interface.h
index c62a5cef2..0170e6a84 100644
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -16,7 +16,7 @@
 #include <utility>
 #include <vector>
 
-#include "../common/bitfield.h"
+#include "../common/bitfield.h"  // for RBitField8
 #include "../common/common.h"
 #include "../common/error_msg.h"  // for NoF128
 #include "xgboost/base.h"
@@ -104,7 +104,20 @@ struct ArrayInterfaceErrors {
  */
 class ArrayInterfaceHandler {
  public:
-  enum Type : std::int8_t { kF2, kF4, kF8, kF16, kI1, kI2, kI4, kI8, kU1, kU2, kU4, kU8 };
+  enum Type : std::int8_t {
+    kF2 = 0,    // dispatched as __half (CUDA builds only)
+    kF4 = 1,    // dispatched as float
+    kF8 = 2,    // dispatched as double
+    kF16 = 3,   // dispatched as long double, which must be 16 bytes
+    kI1 = 4,    // dispatched as std::int8_t
+    kI2 = 5,    // dispatched as std::int16_t
+    kI4 = 6,    // dispatched as std::int32_t
+    kI8 = 7,    // dispatched as std::int64_t
+    kU1 = 8,    // dispatched as std::uint8_t
+    kU2 = 9,    // dispatched as std::uint16_t
+    kU4 = 10,   // dispatched as std::uint32_t
+    kU8 = 11,   // dispatched as std::uint64_t
+  };
 
   template <typename PtrType>
   static PtrType GetPtrFromArrayData(Object::Map const &obj) {
@@ -587,6 +600,57 @@ class ArrayInterface {
   ArrayInterfaceHandler::Type type{ArrayInterfaceHandler::kF16};
 };
 
+/**
+ * @brief Call `dispatch` with a value-initialised object of the C++ type matching `dtype`.
+ *
+ * @param dtype    Runtime type tag to translate into a static type.
+ * @param dispatch Generic callable; only the type of its argument carries information.
+ *
+ * @return The value returned by `dispatch` for the selected type.
+ */
+template <typename Fn>
+auto DispatchDType(ArrayInterfaceHandler::Type dtype, Fn dispatch) {
+  switch (dtype) {
+    case ArrayInterfaceHandler::kF2: {
+#if defined(XGBOOST_USE_CUDA)
+      return dispatch(__half{});
+#else
+      LOG(FATAL) << "half type is only supported for CUDA input.";
+      break;
+#endif
+    }
+    case ArrayInterfaceHandler::kF4:
+      return dispatch(float{});
+    case ArrayInterfaceHandler::kF8:
+      return dispatch(double{});
+    case ArrayInterfaceHandler::kF16: {
+      using T = long double;
+      CHECK(sizeof(T) == 16) << error::NoF128();
+      return dispatch(T{});
+    }
+    case ArrayInterfaceHandler::kI1:
+      return dispatch(std::int8_t{});
+    case ArrayInterfaceHandler::kI2:
+      return dispatch(std::int16_t{});
+    case ArrayInterfaceHandler::kI4:
+      return dispatch(std::int32_t{});
+    case ArrayInterfaceHandler::kI8:
+      return dispatch(std::int64_t{});
+    case ArrayInterfaceHandler::kU1:
+      return dispatch(std::uint8_t{});
+    case ArrayInterfaceHandler::kU2:
+      return dispatch(std::uint16_t{});
+    case ArrayInterfaceHandler::kU4:
+      return dispatch(std::uint32_t{});
+    case ArrayInterfaceHandler::kU8:
+      return dispatch(std::uint64_t{});
+  }
+  // Only reached when no case returned: the non-CUDA kF2 branch after LOG(FATAL), or a value
+  // outside the enum. std::invoke_result_t is used because std::result_of_t is deprecated in
+  // C++17 and removed in C++20.
+  return std::invoke_result_t<Fn, std::int8_t>();
+}
+
 template <std::int32_t D, typename Fn>
 void DispatchDType(ArrayInterface<D> const array, DeviceOrd device, Fn fn) {
   // Only used for cuDF at the moment.
@@ -602,60 +666,7 @@ void DispatchDType(ArrayInterface<D> const array, DeviceOrd device, Fn fn) {
                                                       std::numeric_limits<std::size_t>::max()},
                                 array.shape, array.strides, device});
   };
-  switch (array.type) {
-    case ArrayInterfaceHandler::kF2: {
-#if defined(XGBOOST_USE_CUDA)
-      dispatch(__half{});
-#endif
-      break;
-    }
-    case ArrayInterfaceHandler::kF4: {
-      dispatch(float{});
-      break;
-    }
-    case ArrayInterfaceHandler::kF8: {
-      dispatch(double{});
-      break;
-    }
-    case ArrayInterfaceHandler::kF16: {
-      using T = long double;
-      CHECK(sizeof(long double) == 16) << error::NoF128();
-      dispatch(T{});
-      break;
-    }
-    case ArrayInterfaceHandler::kI1: {
-      dispatch(std::int8_t{});
-      break;
-    }
-    case ArrayInterfaceHandler::kI2: {
-      dispatch(std::int16_t{});
-      break;
-    }
-    case ArrayInterfaceHandler::kI4: {
-      dispatch(std::int32_t{});
-      break;
-    }
-    case ArrayInterfaceHandler::kI8: {
-      dispatch(std::int64_t{});
-      break;
-    }
-    case ArrayInterfaceHandler::kU1: {
-      dispatch(std::uint8_t{});
-      break;
-    }
-    case ArrayInterfaceHandler::kU2: {
-      dispatch(std::uint16_t{});
-      break;
-    }
-    case ArrayInterfaceHandler::kU4: {
-      dispatch(std::uint32_t{});
-      break;
-    }
-    case ArrayInterfaceHandler::kU8: {
-      dispatch(std::uint64_t{});
-      break;
-    }
-  }
+  DispatchDType(array.type, dispatch);
 }
 
 /**
diff --git a/tests/cpp/collective/test_allreduce.cc b/tests/cpp/collective/test_allreduce.cc
new file mode 100644
index 000000000..62b87e411
--- /dev/null
+++ b/tests/cpp/collective/test_allreduce.cc
@@ -0,0 +1,72 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+
+#include "../../../src/collective/allreduce.h"
+#include "../../../src/collective/tracker.h"
+#include "test_worker.h"  // for WorkerForTest, TestDistributed
+
+namespace xgboost::collective {
+
+namespace {
+// Test worker that sum-reduces doubles and checks both the returned status and the values.
+class AllreduceWorker : public WorkerForTest {
+  // Element-wise sum of `data` across all workers, in place; the caller checks the status.
+  [[nodiscard]] Result SumAllreduce(std::vector<double>* data) {
+    return Allreduce(comm_, common::Span{data->data(), data->size()}, [](auto lhs, auto rhs) {
+      for (std::size_t i = 0; i < rhs.size(); ++i) {
+        rhs[i] += lhs[i];
+      }
+    });
+  }
+
+ public:
+  using WorkerForTest::WorkerForTest;
+
+  void Basic() {
+    {
+      std::vector<double> data(13, 0.0);
+      auto rc = this->SumAllreduce(&data);
+      ASSERT_TRUE(rc.OK()) << rc.Report();
+      ASSERT_EQ(std::accumulate(data.cbegin(), data.cend(), 0.0), 0.0);
+    }
+    {
+      std::vector<double> data(1, 1.0);
+      auto rc = this->SumAllreduce(&data);
+      ASSERT_TRUE(rc.OK()) << rc.Report();
+      ASSERT_EQ(data[0], static_cast<double>(comm_.World()));
+    }
+  }
+
+  void Acc() {
+    std::vector<double> data(314, 1.5);
+    auto rc = this->SumAllreduce(&data);
+    ASSERT_TRUE(rc.OK()) << rc.Report();
+    for (std::size_t i = 0; i < data.size(); ++i) {
+      ASSERT_EQ(data[i], 1.5 * static_cast<double>(comm_.World())) << i;
+    }
+  }
+};
+
+class AllreduceTest : public SocketTest {};
+}  // namespace
+
+// Runs AllreduceWorker::Basic on up to 7 workers; a reported core count of 0 still gets one.
+TEST_F(AllreduceTest, Basic) {
+  std::int32_t n_workers = std::min(7u, std::max(1u, std::thread::hardware_concurrency()));
+  TestDistributed(n_workers, [=](auto host, auto port, auto timeout, auto r) {
+    AllreduceWorker worker{host, port, timeout, n_workers, r};
+    worker.Basic();
+  });
+}
+
+// Runs AllreduceWorker::Acc on up to 7 workers; a reported core count of 0 still gets one.
+TEST_F(AllreduceTest, Sum) {
+  std::int32_t n_workers = std::min(7u, std::max(1u, std::thread::hardware_concurrency()));
+  TestDistributed(n_workers, [=](auto host, auto port, auto timeout, auto r) {
+    AllreduceWorker worker{host, port, timeout, n_workers, r};
+    worker.Acc();
+  });
+}
+}  // namespace xgboost::collective
diff --git a/tests/cpp/collective/test_broadcast.cc b/tests/cpp/collective/test_broadcast.cc
index 485f6dcdf..0ade86567 100644
--- a/tests/cpp/collective/test_broadcast.cc
+++ b/tests/cpp/collective/test_broadcast.cc
@@ -10,8 +10,8 @@
 #include <vector>   // for vector
 
 #include "../../../src/collective/broadcast.h"  // for Broadcast
-#include "../../../src/collective/tracker.h"    // for GetHostAddress, Tracker
-#include "test_worker.h"                        // for WorkerForTest
+#include "../../../src/collective/tracker.h"    // for GetHostAddress
+#include "test_worker.h"                        // for WorkerForTest, TestDistributed
 
 namespace xgboost::collective {
 namespace {
@@ -41,28 +41,11 @@ class BroadcastTest : public SocketTest {};
 }  // namespace
 
 TEST_F(BroadcastTest, Basic) {
-  std::int32_t n_workers = std::min(24u, std::thread::hardware_concurrency());
-  std::chrono::seconds timeout{3};
-
-  std::string host;
-  ASSERT_TRUE(GetHostAddress(&host).OK());
-  RabitTracker tracker{StringView{host}, n_workers, 0, timeout};
-  auto fut = tracker.Run();
-
-  std::vector<std::thread> workers;
-  std::int32_t port = tracker.Port();
-
-  for (std::int32_t i = 0; i < n_workers; ++i) {
-    workers.emplace_back([=] {
-      Worker worker{host, port, timeout, n_workers, i};
-      worker.Run();
-    });
-  }
-
-  for (auto& t : workers) {
-    t.join();
-  }
-
-  ASSERT_TRUE(fut.get().OK());
+  std::int32_t n_workers = std::min(7u, std::thread::hardware_concurrency());
+  TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
+                                 std::int32_t r) {
+    Worker worker{host, port, timeout, n_workers, r};
+    worker.Run();
+  });
 }
 }  // namespace xgboost::collective
diff --git a/tests/cpp/collective/test_worker.h b/tests/cpp/collective/test_worker.h
index 3c9d02f03..a3d6de875 100644
--- a/tests/cpp/collective/test_worker.h
+++ b/tests/cpp/collective/test_worker.h
@@ -88,4 +88,27 @@ class TrackerTest : public SocketTest {
     ASSERT_TRUE(rc.OK()) << rc.Report();
   }
 };
+
+// Start a RabitTracker for `n_workers` workers, call `worker_fn(host, port, timeout, i)` on
+// one thread per worker index `i`, join the threads, and assert that the tracker finished OK.
+template <typename WorkerFn>
+void TestDistributed(std::int32_t n_workers, WorkerFn worker_fn) {
+  std::chrono::seconds timeout{1};
+
+  std::string host;
+  ASSERT_TRUE(GetHostAddress(&host).OK());
+  RabitTracker tracker{StringView{host}, n_workers, 0, timeout};
+  auto fut = tracker.Run();
+  std::int32_t port = tracker.Port();
+
+  std::vector<std::thread> workers;
+  for (std::int32_t idx = 0; idx < n_workers; ++idx) {
+    workers.emplace_back([=] { worker_fn(host, port, timeout, idx); });
+  }
+  for (auto& worker : workers) {
+    worker.join();
+  }
+
+  ASSERT_TRUE(fut.get().OK());
+}
 }  // namespace xgboost::collective

From 4c0e4422d0e2115928fd68e29f7c1c67cc969854 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Wed, 18 Oct 2023 10:22:18 +0800
Subject: [PATCH 56/59] [coll] allgather. (#9681)

---
 src/collective/allgather.h             | 20 ++++++++
 tests/cpp/collective/test_allgather.cc | 71 ++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100644 tests/cpp/collective/test_allgather.cc

diff --git a/src/collective/allgather.h b/src/collective/allgather.h
index 31a9a36b3..5dcb4ebdd 100644
--- a/src/collective/allgather.h
+++ b/src/collective/allgather.h
@@ -20,4 +20,24 @@ namespace cpu_impl {
                                    std::shared_ptr<Channel> prev_ch,
                                    std::shared_ptr<Channel> next_ch);
 }  // namespace cpu_impl
+
+// Typed ring allgather; `size` is the number of elements in a single worker's segment.
+template <typename T>
+[[nodiscard]] Result RingAllgather(Comm const& comm, common::Span<T> data, std::size_t size) {
+  // View the typed buffer as raw bytes for the type-erased implementation.
+  auto bytes = reinterpret_cast<std::int8_t*>(data.data());
+  auto erased = common::Span<std::int8_t>{bytes, data.size_bytes()};
+  auto n_bytes_in_seg = sizeof(T) * size;
+
+  auto rank = comm.Rank();
+  auto prev = BootstrapPrev(rank, comm.World());
+  auto next = BootstrapNext(rank, comm.World());
+  auto prev_ch = comm.Chan(prev);
+  auto next_ch = comm.Chan(next);
+  auto rc = cpu_impl::RingAllgather(comm, erased, n_bytes_in_seg, 0, prev_ch, next_ch);
+  if (rc.OK()) {
+    return comm.Block();
+  }
+  return rc;
+}
 }  // namespace xgboost::collective
diff --git a/tests/cpp/collective/test_allgather.cc b/tests/cpp/collective/test_allgather.cc
new file mode 100644
index 000000000..49ba591d0
--- /dev/null
+++ b/tests/cpp/collective/test_allgather.cc
@@ -0,0 +1,71 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/span.h>  // for Span
+
+#include <cstdint>  // for int32_t
+#include <numeric>  // for iota
+#include <string>   // for string
+#include <thread>   // for thread
+#include <vector>   // for vector
+
+#include "../../../src/collective/allgather.h"
+#include "../../../src/collective/tracker.h"  // for GetHostAddress, Tracker
+#include "test_worker.h"                      // for TestDistributed
+
+namespace xgboost::collective {
+namespace {
+class AllgatherTest : public TrackerTest {};
+
+// Worker exercising the typed RingAllgather with 1 and then 8192 elements per worker.
+class Worker : public WorkerForTest {
+ public:
+  using WorkerForTest::WorkerForTest;
+
+  void Run() {
+    {
+      // basic test: one element per worker, holding that worker's rank
+      std::vector<std::int32_t> gathered(comm_.World(), 0);
+      gathered[comm_.Rank()] = comm_.Rank();
+      auto rc = RingAllgather(this->comm_, common::Span{gathered.data(), gathered.size()}, 1);
+      ASSERT_TRUE(rc.OK()) << rc.Report();
+
+      for (std::int32_t r = 0; r < comm_.World(); ++r) {
+        ASSERT_EQ(gathered[r], r);
+      }
+    }
+    {
+      // test for limited socket buffer
+      this->LimitSockBuf(4096);
+
+      std::size_t n = 8192;  // n_bytes = 8192 * sizeof(int)
+      std::vector<std::int32_t> gathered(comm_.World() * n, 0);
+      auto all = common::Span{gathered.data(), gathered.size()};
+      // This worker fills only its own segment: rank, rank + 1, ...
+      auto mine = all.subspan(comm_.Rank() * n, n);
+      std::iota(mine.begin(), mine.end(), comm_.Rank());
+
+      auto rc = RingAllgather(comm_, all, n);
+      ASSERT_TRUE(rc.OK()) << rc.Report();
+
+      for (std::int32_t r = 0; r < comm_.World(); ++r) {
+        auto theirs = all.subspan(r * n, n);
+        for (std::int32_t i = 0; i < static_cast<std::int32_t>(theirs.size()); ++i) {
+          ASSERT_EQ(theirs[i], r + i);
+        }
+      }
+    }
+  }
+};
+}  // namespace
+
+// Runs Worker::Run on up to 7 workers; a reported core count of 0 still gets one worker.
+TEST_F(AllgatherTest, Basic) {
+  std::int32_t n_workers = std::min(7u, std::max(1u, std::thread::hardware_concurrency()));
+  TestDistributed(n_workers, [=](auto host, auto port, auto timeout, auto r) {
+    Worker worker{host, port, timeout, n_workers, r};
+    worker.Run();
+  });
+}
+}  // namespace xgboost::collective

From ea9f09716bd426f235ce785d6286e0ad26350ce1 Mon Sep 17 00:00:00 2001
From: Dmitry Razdoburdin <dmitry.razdoburdin@intel.com>
Date: Wed, 18 Oct 2023 04:55:33 +0200
Subject: [PATCH 57/59] Reorder if-else statements to allow using of cpu
 branches for sycl-devices (#9682)

---
 include/xgboost/linalg.h         |  4 +--
 src/common/linalg_op.cuh         |  2 +-
 src/common/linalg_op.h           |  2 +-
 src/common/numeric.cc            |  5 +--
 src/common/optional_weight.h     |  2 +-
 src/common/ranking_utils.h       | 42 +++++++++++++-------------
 src/common/stats.cc              |  6 ++--
 src/learner.cc                   |  4 +--
 src/metric/elementwise_metric.cu | 52 ++++++++++++++++----------------
 src/objective/adaptive.h         |  8 ++---
 src/objective/lambdarank_obj.cc  | 10 +++---
 src/objective/quantile_obj.cu    | 48 ++++++++++++++---------------
 src/objective/regression_obj.cu  |  8 ++---
 src/tree/fit_stump.cc            |  4 +--
 14 files changed, 99 insertions(+), 98 deletions(-)

diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h
index d56170aec..901c9ae91 100644
--- a/include/xgboost/linalg.h
+++ b/include/xgboost/linalg.h
@@ -603,13 +603,13 @@ auto MakeTensorView(Context const *ctx, Order order, common::Span<T> data, S &&.
 
 template <typename T, typename... S>
 auto MakeTensorView(Context const *ctx, HostDeviceVector<T> *data, S &&...shape) {
-  auto span = ctx->IsCPU() ? data->HostSpan() : data->DeviceSpan();
+  auto span = ctx->IsCUDA() ? data->DeviceSpan() : data->HostSpan();
   return MakeTensorView(ctx->Device(), span, std::forward<S>(shape)...);
 }
 
 template <typename T, typename... S>
 auto MakeTensorView(Context const *ctx, HostDeviceVector<T> const *data, S &&...shape) {
-  auto span = ctx->IsCPU() ? data->ConstHostSpan() : data->ConstDeviceSpan();
+  auto span = ctx->IsCUDA() ? data->ConstDeviceSpan() : data->ConstHostSpan();
   return MakeTensorView(ctx->Device(), span, std::forward<S>(shape)...);
 }
 
diff --git a/src/common/linalg_op.cuh b/src/common/linalg_op.cuh
index 5d52e4100..361f926e9 100644
--- a/src/common/linalg_op.cuh
+++ b/src/common/linalg_op.cuh
@@ -42,7 +42,7 @@ void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_
 
 template <typename T, int32_t D, typename Fn>
 void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn) {
-  ctx->IsCPU() ? ElementWiseKernelHost(t, ctx->Threads(), fn) : ElementWiseKernelDevice(t, fn);
+  ctx->IsCUDA() ? ElementWiseKernelDevice(t, fn) : ElementWiseKernelHost(t, ctx->Threads(), fn);
 }
 }  // namespace linalg
 }  // namespace xgboost
diff --git a/src/common/linalg_op.h b/src/common/linalg_op.h
index f55927402..d89e5a736 100644
--- a/src/common/linalg_op.h
+++ b/src/common/linalg_op.h
@@ -55,7 +55,7 @@ void ElementWiseTransformDevice(linalg::TensorView<T, D>, Fn&&, void* = nullptr)
 
 template <typename T, int32_t D, typename Fn>
 void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn) {
-  if (!ctx->IsCPU()) {
+  if (ctx->IsCUDA()) {
     common::AssertGPUSupport();
   }
   ElementWiseKernelHost(t, ctx->Threads(), fn);
diff --git a/src/common/numeric.cc b/src/common/numeric.cc
index 240e0234a..f19932311 100644
--- a/src/common/numeric.cc
+++ b/src/common/numeric.cc
@@ -11,13 +11,14 @@
 namespace xgboost {
 namespace common {
 double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
-  if (ctx->IsCPU()) {
+  if (ctx->IsCUDA()) {
+    return cuda_impl::Reduce(ctx, values);
+  } else {
     auto const& h_values = values.ConstHostVector();
     auto result = cpu_impl::Reduce(ctx, h_values.cbegin(), h_values.cend(), 0.0);
     static_assert(std::is_same<decltype(result), double>::value);
     return result;
   }
-  return cuda_impl::Reduce(ctx, values);
 }
 }  // namespace common
 }  // namespace xgboost
diff --git a/src/common/optional_weight.h b/src/common/optional_weight.h
index 997f3fad1..bbfd365c8 100644
--- a/src/common/optional_weight.h
+++ b/src/common/optional_weight.h
@@ -26,7 +26,7 @@ inline OptionalWeights MakeOptionalWeights(Context const* ctx,
   if (ctx->IsCUDA()) {
     weights.SetDevice(ctx->Device());
   }
-  return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()};
+  return OptionalWeights{ctx->IsCUDA() ? weights.ConstDeviceSpan() : weights.ConstHostSpan()};
 }
 }  // namespace xgboost::common
 #endif  // XGBOOST_COMMON_OPTIONAL_WEIGHT_H_
diff --git a/src/common/ranking_utils.h b/src/common/ranking_utils.h
index 31531a597..e6b87ed4b 100644
--- a/src/common/ranking_utils.h
+++ b/src/common/ranking_utils.h
@@ -197,10 +197,10 @@ class RankingCache {
       CHECK_EQ(info.group_ptr_.back(), info.labels.Size())
           << error::GroupSize() << "the size of label.";
     }
-    if (ctx->IsCPU()) {
-      this->InitOnCPU(ctx, info);
-    } else {
+    if (ctx->IsCUDA()) {
       this->InitOnCUDA(ctx, info);
+    } else {
+      this->InitOnCPU(ctx, info);
     }
     if (!info.weights_.Empty()) {
       CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight();
@@ -218,7 +218,7 @@ class RankingCache {
   // Constructed as [1, n_samples] if group ptr is not supplied by the user
   common::Span<bst_group_t const> DataGroupPtr(Context const* ctx) const {
     group_ptr_.SetDevice(ctx->Device());
-    return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan();
+    return ctx->IsCUDA() ? group_ptr_.ConstDeviceSpan() : group_ptr_.ConstHostSpan();
   }
 
   [[nodiscard]] auto const& Param() const { return param_; }
@@ -231,10 +231,10 @@ class RankingCache {
       sorted_idx_cache_.SetDevice(ctx->Device());
       sorted_idx_cache_.Resize(predt.size());
     }
-    if (ctx->IsCPU()) {
-      return this->MakeRankOnCPU(ctx, predt);
-    } else {
+    if (ctx->IsCUDA()) {
       return this->MakeRankOnCUDA(ctx, predt);
+    } else {
+      return this->MakeRankOnCPU(ctx, predt);
     }
   }
   // The function simply returns a uninitialized buffer as this is only used by the
@@ -307,10 +307,10 @@ class NDCGCache : public RankingCache {
  public:
   NDCGCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
       : RankingCache{ctx, info, p} {
-    if (ctx->IsCPU()) {
-      this->InitOnCPU(ctx, info);
-    } else {
+    if (ctx->IsCUDA()) {
       this->InitOnCUDA(ctx, info);
+    } else {
+      this->InitOnCPU(ctx, info);
     }
   }
 
@@ -318,7 +318,7 @@ class NDCGCache : public RankingCache {
     return inv_idcg_.View(ctx->Device());
   }
   common::Span<double const> Discount(Context const* ctx) const {
-    return ctx->IsCPU() ? discounts_.ConstHostSpan() : discounts_.ConstDeviceSpan();
+    return ctx->IsCUDA() ? discounts_.ConstDeviceSpan() : discounts_.ConstHostSpan();
   }
   linalg::VectorView<double> Dcg(Context const* ctx) {
     if (dcg_.Size() == 0) {
@@ -387,10 +387,10 @@ class PreCache : public RankingCache {
  public:
   PreCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
       : RankingCache{ctx, info, p} {
-    if (ctx->IsCPU()) {
-      this->InitOnCPU(ctx, info);
-    } else {
+    if (ctx->IsCUDA()) {
       this->InitOnCUDA(ctx, info);
+    } else {
+      this->InitOnCPU(ctx, info);
     }
   }
 
@@ -399,7 +399,7 @@ class PreCache : public RankingCache {
       pre_.SetDevice(ctx->Device());
       pre_.Resize(this->Groups());
     }
-    return ctx->IsCPU() ? pre_.HostSpan() : pre_.DeviceSpan();
+    return ctx->IsCUDA() ? pre_.DeviceSpan() : pre_.HostSpan();
   }
 };
 
@@ -418,10 +418,10 @@ class MAPCache : public RankingCache {
  public:
   MAPCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
       : RankingCache{ctx, info, p}, n_samples_{static_cast<std::size_t>(info.num_row_)} {
-    if (ctx->IsCPU()) {
-      this->InitOnCPU(ctx, info);
-    } else {
+    if (ctx->IsCUDA()) {
       this->InitOnCUDA(ctx, info);
+    } else {
+      this->InitOnCPU(ctx, info);
     }
   }
 
@@ -430,21 +430,21 @@ class MAPCache : public RankingCache {
       n_rel_.SetDevice(ctx->Device());
       n_rel_.Resize(n_samples_);
     }
-    return ctx->IsCPU() ? n_rel_.HostSpan() : n_rel_.DeviceSpan();
+    return ctx->IsCUDA() ? n_rel_.DeviceSpan() : n_rel_.HostSpan();
   }
   common::Span<double> Acc(Context const* ctx) {
     if (acc_.Empty()) {
       acc_.SetDevice(ctx->Device());
       acc_.Resize(n_samples_);
     }
-    return ctx->IsCPU() ? acc_.HostSpan() : acc_.DeviceSpan();
+    return ctx->IsCUDA() ? acc_.DeviceSpan() : acc_.HostSpan();
   }
   common::Span<double> Map(Context const* ctx) {
     if (map_.Empty()) {
       map_.SetDevice(ctx->Device());
       map_.Resize(this->Groups());
     }
-    return ctx->IsCPU() ? map_.HostSpan() : map_.DeviceSpan();
+    return ctx->IsCUDA() ? map_.DeviceSpan() : map_.HostSpan();
   }
 };
 
diff --git a/src/common/stats.cc b/src/common/stats.cc
index aa73a07c3..bbf969fcc 100644
--- a/src/common/stats.cc
+++ b/src/common/stats.cc
@@ -49,7 +49,9 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
   out->SetDevice(ctx->Device());
   out->Reshape(1);
 
-  if (ctx->IsCPU()) {
+  if (ctx->IsCUDA()) {
+    cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
+  } else {
     auto h_v = v.HostView();
     float n = v.Size();
     MemStackAllocator<float, DefaultMaxThreads()> tloc(ctx->Threads(), 0.0f);
@@ -57,8 +59,6 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
                 [&](auto i) { tloc[omp_get_thread_num()] += h_v(i) / n; });
     auto ret = std::accumulate(tloc.cbegin(), tloc.cend(), .0f);
     out->HostView()(0) = ret;
-  } else {
-    cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
   }
 }
 }  // namespace xgboost::common
diff --git a/src/learner.cc b/src/learner.cc
index d2b297dca..08c59ba60 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -278,7 +278,7 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
   std::swap(base_score_, base_margin);
   // Make sure read access everywhere for thread-safe prediction.
   std::as_const(base_score_).HostView();
-  if (!ctx->IsCPU()) {
+  if (ctx->IsCUDA()) {
     std::as_const(base_score_).View(ctx->Device());
   }
   CHECK(std::as_const(base_score_).Data()->HostCanRead());
@@ -287,7 +287,7 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
 linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(DeviceOrd device) const {
   // multi-class is not yet supported.
   CHECK_EQ(base_score_.Size(), 1) << ModelNotFitted();
-  if (device.IsCPU()) {
+  if (!device.IsCUDA()) {
     // Make sure that we won't run into race condition.
     CHECK(base_score_.Data()->HostCanRead());
     return base_score_.HostView();
diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu
index e16f9f8cc..feabedfab 100644
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -46,7 +46,26 @@ template <typename Fn>
 PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
   PackedReduceResult result;
   auto labels = info.labels.View(ctx->Device());
-  if (ctx->IsCPU()) {
+  if (ctx->IsCUDA()) {
+#if defined(XGBOOST_USE_CUDA)
+    dh::XGBCachingDeviceAllocator<char> alloc;
+    thrust::counting_iterator<size_t> begin(0);
+    thrust::counting_iterator<size_t> end = begin + labels.Size();
+    result = thrust::transform_reduce(
+        thrust::cuda::par(alloc), begin, end,
+        [=] XGBOOST_DEVICE(size_t i) {
+          auto idx = linalg::UnravelIndex(i, labels.Shape());
+          auto sample_id = std::get<0>(idx);
+          auto target_id = std::get<1>(idx);
+          auto res = loss(i, sample_id, target_id);
+          float v{std::get<0>(res)}, wt{std::get<1>(res)};
+          return PackedReduceResult{v, wt};
+        },
+        PackedReduceResult{}, thrust::plus<PackedReduceResult>());
+#else
+    common::AssertGPUSupport();
+#endif  //  defined(XGBOOST_USE_CUDA)
+  } else {
     auto n_threads = ctx->Threads();
     std::vector<double> score_tloc(n_threads, 0.0);
     std::vector<double> weight_tloc(n_threads, 0.0);
@@ -69,25 +88,6 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
     double residue_sum = std::accumulate(score_tloc.cbegin(), score_tloc.cend(), 0.0);
     double weights_sum = std::accumulate(weight_tloc.cbegin(), weight_tloc.cend(), 0.0);
     result = PackedReduceResult{residue_sum, weights_sum};
-  } else {
-#if defined(XGBOOST_USE_CUDA)
-    dh::XGBCachingDeviceAllocator<char> alloc;
-    thrust::counting_iterator<size_t> begin(0);
-    thrust::counting_iterator<size_t> end = begin + labels.Size();
-    result = thrust::transform_reduce(
-        thrust::cuda::par(alloc), begin, end,
-        [=] XGBOOST_DEVICE(size_t i) {
-          auto idx = linalg::UnravelIndex(i, labels.Shape());
-          auto sample_id = std::get<0>(idx);
-          auto target_id = std::get<1>(idx);
-          auto res = loss(i, sample_id, target_id);
-          float v{std::get<0>(res)}, wt{std::get<1>(res)};
-          return PackedReduceResult{v, wt};
-        },
-        PackedReduceResult{}, thrust::plus<PackedReduceResult>());
-#else
-    common::AssertGPUSupport();
-#endif  //  defined(XGBOOST_USE_CUDA)
   }
   return result;
 }
@@ -185,10 +185,10 @@ class PseudoErrorLoss : public MetricNoCache {
     CHECK_EQ(info.labels.Shape(0), info.num_row_);
     auto labels = info.labels.View(ctx_->Device());
     preds.SetDevice(ctx_->Device());
-    auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
+    auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan();
     info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
-                                                     : info.weights_.ConstDeviceSpan());
+    common::OptionalWeights weights(ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
+                                                   : info.weights_.ConstHostSpan());
     float slope = this->param_.huber_slope;
     CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0.";
     PackedReduceResult result =
@@ -351,10 +351,10 @@ struct EvalEWiseBase : public MetricNoCache {
     }
     auto labels = info.labels.View(ctx_->Device());
     info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
-                                                     : info.weights_.ConstDeviceSpan());
+    common::OptionalWeights weights(ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
+                                                   : info.weights_.ConstHostSpan());
     preds.SetDevice(ctx_->Device());
-    auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
+    auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan();
 
     auto d_policy = policy_;
     auto result =
diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h
index c49c5b6fb..a64f37f63 100644
--- a/src/objective/adaptive.h
+++ b/src/objective/adaptive.h
@@ -96,13 +96,13 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
 inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> const& position,
                            std::int32_t group_idx, MetaInfo const& info, float learning_rate,
                            HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
-  if (ctx->IsCPU()) {
-    detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
-                               predt, alpha, p_tree);
-  } else {
+  if (ctx->IsCUDA()) {
     position.SetDevice(ctx->Device());
     detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate,
                                  predt, alpha, p_tree);
+  } else {
+    detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
+                               predt, alpha, p_tree);
   }
 }
 }  // namespace obj
diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc
index bd41aebb6..0c9d1262a 100644
--- a/src/objective/lambdarank_obj.cc
+++ b/src/objective/lambdarank_obj.cc
@@ -108,14 +108,14 @@ class LambdaRankObj : public FitIntercept {
     li_.SetDevice(ctx_->Device());
     lj_.SetDevice(ctx_->Device());
 
-    if (ctx_->IsCPU()) {
-      cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
-                                             lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
-                                             &li_, &lj_, p_cache_);
-    } else {
+    if (ctx_->IsCUDA()) {
       cuda_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
                                               lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
                                               &li_, &lj_, p_cache_);
+    } else {
+      cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
+                                             lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
+                                             &li_, &lj_, p_cache_);
     }
 
     li_full_.Data()->Fill(0.0);
diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu
index d8571ed17..57f432c7f 100644
--- a/src/objective/quantile_obj.cu
+++ b/src/objective/quantile_obj.cu
@@ -71,15 +71,15 @@ class QuantileRegression : public ObjFunction {
     auto gpair = out_gpair->View(ctx_->Device());
 
     info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
-                                                 : info.weights_.ConstDeviceSpan()};
+    common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
+                                                  : info.weights_.ConstHostSpan()};
 
     preds.SetDevice(ctx_->Device());
     auto predt = linalg::MakeVec(&preds);
     auto n_samples = info.num_row_;
 
     alpha_.SetDevice(ctx_->Device());
-    auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
+    auto alpha = ctx_->IsCUDA() ? alpha_.ConstDeviceSpan() : alpha_.ConstHostSpan();
 
     linalg::ElementWiseKernel(
         ctx_, gpair, [=] XGBOOST_DEVICE(std::size_t i, GradientPair const&) mutable {
@@ -107,27 +107,7 @@ class QuantileRegression : public ObjFunction {
     base_score->Reshape(n_targets);
 
     double sw{0};
-    if (ctx_->IsCPU()) {
-      auto quantiles = base_score->HostView();
-      auto h_weights = info.weights_.ConstHostVector();
-      if (info.weights_.Empty()) {
-        sw = info.num_row_;
-      } else {
-        sw = std::accumulate(std::cbegin(h_weights), std::cend(h_weights), 0.0);
-      }
-      for (bst_target_t t{0}; t < n_targets; ++t) {
-        auto alpha = param_.quantile_alpha[t];
-        auto h_labels = info.labels.HostView();
-        if (h_weights.empty()) {
-          quantiles(t) =
-              common::Quantile(ctx_, alpha, linalg::cbegin(h_labels), linalg::cend(h_labels));
-        } else {
-          CHECK_EQ(h_weights.size(), h_labels.Size());
-          quantiles(t) = common::WeightedQuantile(ctx_, alpha, linalg::cbegin(h_labels),
-                                                  linalg::cend(h_labels), std::cbegin(h_weights));
-        }
-      }
-    } else {
+    if (ctx_->IsCUDA()) {
 #if defined(XGBOOST_USE_CUDA)
       alpha_.SetDevice(ctx_->Device());
       auto d_alpha = alpha_.ConstDeviceSpan();
@@ -164,6 +144,26 @@ class QuantileRegression : public ObjFunction {
 #else
       common::AssertGPUSupport();
 #endif  // defined(XGBOOST_USE_CUDA)
+    } else {
+      auto quantiles = base_score->HostView();
+      auto h_weights = info.weights_.ConstHostVector();
+      if (info.weights_.Empty()) {
+        sw = info.num_row_;
+      } else {
+        sw = std::accumulate(std::cbegin(h_weights), std::cend(h_weights), 0.0);
+      }
+      for (bst_target_t t{0}; t < n_targets; ++t) {
+        auto alpha = param_.quantile_alpha[t];
+        auto h_labels = info.labels.HostView();
+        if (h_weights.empty()) {
+          quantiles(t) =
+              common::Quantile(ctx_, alpha, linalg::cbegin(h_labels), linalg::cend(h_labels));
+        } else {
+          CHECK_EQ(h_weights.size(), h_labels.Size());
+          quantiles(t) = common::WeightedQuantile(ctx_, alpha, linalg::cbegin(h_labels),
+                                                  linalg::cend(h_labels), std::cbegin(h_weights));
+        }
+      }
     }
 
     // For multiple quantiles, we should extend the base score to a vector instead of
diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu
index cf774db93..7f498c5f1 100644
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -254,8 +254,8 @@ class PseudoHuberRegression : public FitIntercept {
     auto predt = linalg::MakeVec(&preds);
 
     info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
-                                                 : info.weights_.ConstDeviceSpan()};
+    common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
+                                                  : info.weights_.ConstHostSpan()};
 
     linalg::ElementWiseKernel(ctx_, labels, [=] XGBOOST_DEVICE(size_t i, float const y) mutable {
       auto sample_id = std::get<0>(linalg::UnravelIndex(i, labels.Shape()));
@@ -714,8 +714,8 @@ class MeanAbsoluteError : public ObjFunction {
     preds.SetDevice(ctx_->Device());
     auto predt = linalg::MakeVec(&preds);
     info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
-                                                 : info.weights_.ConstDeviceSpan()};
+    common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
+                                                  : info.weights_.ConstHostSpan()};
 
     linalg::ElementWiseKernel(ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, float y) mutable {
       auto sign = [](auto x) {
diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc
index b80293fae..21a050536 100644
--- a/src/tree/fit_stump.cc
+++ b/src/tree/fit_stump.cc
@@ -72,7 +72,7 @@ void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientP
 
   gpair.SetDevice(ctx->Device());
   auto gpair_t = gpair.View(ctx->Device());
-  ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
-      : cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()));
+  ctx->IsCUDA() ? cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()))
+                : cpu_impl::FitStump(ctx, info, gpair_t, out->HostView());
 }
 }  // namespace xgboost::tree

From 5d1bcde7196d34ef7ac030f4463e0b45d35a6f3d Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 19 Oct 2023 03:13:50 +0800
Subject: [PATCH 58/59] [coll] allgatherv. (#9688)

---
 src/collective/allgather.cc            | 54 +++++++++++++++++++--
 src/collective/allgather.h             | 45 ++++++++++++++----
 src/collective/broadcast.cc            |  1 +
 src/collective/comm.cc                 |  2 +
 src/collective/comm.h                  | 22 ++++-----
 src/common/host_device_vector.cc       |  1 +
 src/common/host_device_vector.cu       |  1 +
 tests/cpp/collective/test_allgather.cc | 66 ++++++++++++++++++++++----
 8 files changed, 157 insertions(+), 35 deletions(-)

diff --git a/src/collective/allgather.cc b/src/collective/allgather.cc
index dba36c88c..378a06911 100644
--- a/src/collective/allgather.cc
+++ b/src/collective/allgather.cc
@@ -3,13 +3,16 @@
  */
 #include "allgather.h"
 
-#include <algorithm>  // for min
+#include <algorithm>  // for min, copy_n
 #include <cstddef>    // for size_t
-#include <cstdint>    // for int8_t
+#include <cstdint>    // for int8_t, int32_t, int64_t
 #include <memory>     // for shared_ptr
+#include <numeric>    // for partial_sum
+#include <vector>     // for vector
 
-#include "comm.h"          // for Comm, Channel
-#include "xgboost/span.h"  // for Span
+#include "comm.h"                       // for Comm, Channel
+#include "xgboost/collective/result.h"  // for Result
+#include "xgboost/span.h"               // for Span
 
 namespace xgboost::collective::cpu_impl {
 Result RingAllgather(Comm const& comm, common::Span<std::int8_t> data, std::size_t segment_size,
@@ -39,4 +42,63 @@ Result RingAllgather(Comm const& comm, common::Span<std::int8_t> data, std::size
 
   return Success();
 }
+
+/**
+ * Ring allgather for segments of differing sizes, operating on type-erased bytes.
+ *
+ * @param comm          Communicator supplying world size, rank and the peer channels.
+ * @param sizes         Size in bytes of each worker's segment, indexed by rank.
+ * @param data          Segment contributed by the calling worker.
+ * @param erased_result Output buffer; the segment of rank r starts at sizes[0] + ... + sizes[r-1].
+ *
+ * @return The first failure returned by blocking on the receiving channel, otherwise the
+ *         result of comm.Block().
+ */
+[[nodiscard]] Result RingAllgatherV(Comm const& comm, common::Span<std::int64_t const> sizes,
+                                    common::Span<std::int8_t const> data,
+                                    common::Span<std::int8_t> erased_result) {
+  auto world = comm.World();
+  auto rank = comm.Rank();
+  // `offset` below has world + 1 slots; reject a `sizes` of any other length before the
+  // partial sum can write past the end of it.
+  CHECK_EQ(sizes.size(), static_cast<std::size_t>(world));
+
+  auto prev = BootstrapPrev(rank, world);
+  auto next = BootstrapNext(rank, world);
+
+  auto prev_ch = comm.Chan(prev);
+  auto next_ch = comm.Chan(next);
+
+  // get worker offset: offset[r] is the byte position where the segment of rank r starts
+  std::vector<std::int64_t> offset(world + 1, 0);
+  std::partial_sum(sizes.cbegin(), sizes.cend(), offset.begin() + 1);
+  CHECK_EQ(*offset.cbegin(), 0);
+
+  // copy data: place the local segment into its own slot of the output
+  auto current = erased_result.subspan(offset[rank], data.size_bytes());
+  auto erased_data = EraseType(data);
+  std::copy_n(erased_data.data(), erased_data.size(), current.data());
+
+  for (std::int32_t r = 0; r < world; ++r) {
+    // forward the segment of rank (rank - r) mod world to the next worker ...
+    auto send_rank = (rank + world - r) % world;
+    auto send_off = offset[send_rank];
+    auto send_size = sizes[send_rank];
+    auto send_seg = erased_result.subspan(send_off, send_size);
+    next_ch->SendAll(send_seg);
+
+    // ... and take the segment of rank (rank - r - 1) mod world from the previous worker
+    auto recv_rank = (rank + world - r - 1) % world;
+    auto recv_off = offset[recv_rank];
+    auto recv_size = sizes[recv_rank];
+    auto recv_seg = erased_result.subspan(recv_off, recv_size);
+    prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes());
+
+    auto rc = prev_ch->Block();
+    if (!rc.OK()) {
+      return rc;
+    }
+  }
+  return comm.Block();
+}
 }  // namespace xgboost::collective::cpu_impl
diff --git a/src/collective/allgather.h b/src/collective/allgather.h
index 5dcb4ebdd..cb5f5b8af 100644
--- a/src/collective/allgather.h
+++ b/src/collective/allgather.h
@@ -2,12 +2,16 @@
  * Copyright 2023, XGBoost Contributors
  */
 #pragma once
-#include <cstddef>  // for size_t
-#include <cstdint>  // for int32_t
-#include <memory>   // for shared_ptr
+#include <cstddef>      // for size_t
+#include <cstdint>      // for int32_t
+#include <memory>       // for shared_ptr
+#include <numeric>      // for accumulate
+#include <type_traits>  // for remove_cv_t
+#include <vector>       // for vector
 
-#include "comm.h"          // for Comm, Channel
-#include "xgboost/span.h"  // for Span
+#include "comm.h"                       // for Comm, Channel, EraseType
+#include "xgboost/collective/result.h"  // for Result
+#include "xgboost/span.h"               // for Span
 
 namespace xgboost::collective {
 namespace cpu_impl {
@@ -19,14 +23,16 @@ namespace cpu_impl {
                                    std::size_t segment_size, std::int32_t worker_off,
                                    std::shared_ptr<Channel> prev_ch,
                                    std::shared_ptr<Channel> next_ch);
+
+[[nodiscard]] Result RingAllgatherV(Comm const& comm, common::Span<std::int64_t const> sizes,
+                                    common::Span<std::int8_t const> data,
+                                    common::Span<std::int8_t> erased_result);
 }  // namespace cpu_impl
 
 template <typename T>
 [[nodiscard]] Result RingAllgather(Comm const& comm, common::Span<T> data, std::size_t size) {
-  auto n_total_bytes = data.size_bytes();
   auto n_bytes = sizeof(T) * size;
-  auto erased =
-      common::Span<std::int8_t>{reinterpret_cast<std::int8_t*>(data.data()), n_total_bytes};
+  auto erased = EraseType(data);
 
   auto rank = comm.Rank();
   auto prev = BootstrapPrev(rank, comm.World());
@@ -40,4 +46,42 @@ template <typename T>
   }
   return comm.Block();
 }
+
+/**
+ * Allgather where each worker may contribute a different number of elements.
+ *
+ * The byte size of every worker's input is first exchanged with RingAllgather, then the
+ * output is resized to hold all inputs and filled by cpu_impl::RingAllgatherV.
+ *
+ * @param comm  Communicator supplying the world size and the rank of the caller.
+ * @param data  Elements contributed by the calling worker.
+ * @param p_out Output vector, resized to the total number of gathered elements.
+ *
+ * @return The failure of the size exchange if it fails, otherwise the result of the gather.
+ */
+template <typename T>
+[[nodiscard]] Result RingAllgatherV(Comm const& comm, common::Span<T> data,
+                                    std::vector<std::remove_cv_t<T>>* p_out) {
+  auto world = comm.World();
+  auto rank = comm.Rank();
+
+  // Each worker writes only its own slot before the sizes are exchanged.
+  std::vector<std::int64_t> sizes(world, 0);
+  sizes[rank] = data.size_bytes();
+  auto rc = RingAllgather(comm, common::Span{sizes.data(), sizes.size()}, 1);
+  if (!rc.OK()) {
+    return rc;
+  }
+
+  // Name the element type of `*p_out` exactly: std::vector<T> differs from it when T is const.
+  std::vector<std::remove_cv_t<T>>& result = *p_out;
+  // A 64-bit initial value keeps std::accumulate from summing the byte sizes in `int`.
+  auto n_total_bytes = std::accumulate(sizes.cbegin(), sizes.cend(), std::int64_t{0});
+  result.resize(n_total_bytes / sizeof(T));
+  auto h_result = common::Span{result.data(), result.size()};
+  auto erased_result = EraseType(h_result);
+  auto erased_data = EraseType(data);
+
+  return cpu_impl::RingAllgatherV(comm, sizes, erased_data, erased_result);
+}
 }  // namespace xgboost::collective
diff --git a/src/collective/broadcast.cc b/src/collective/broadcast.cc
index be7e8f55f..660bb9130 100644
--- a/src/collective/broadcast.cc
+++ b/src/collective/broadcast.cc
@@ -3,6 +3,7 @@
  */
 #include "broadcast.h"
 
+#include <cmath>    // for ceil, log2
 #include <cstdint>  // for int32_t, int8_t
 #include <utility>  // for move
 
diff --git a/src/collective/comm.cc b/src/collective/comm.cc
index 7e0af9c18..9ee1e0e6a 100644
--- a/src/collective/comm.cc
+++ b/src/collective/comm.cc
@@ -11,8 +11,10 @@
 
 #include "allgather.h"
 #include "protocol.h"                   // for kMagic
+#include "xgboost/base.h"               // for XGBOOST_STRICT_R_MODE
 #include "xgboost/collective/socket.h"  // for TCPSocket
 #include "xgboost/json.h"               // for Json, Object
+#include "xgboost/string_view.h"        // for StringView
 
 namespace xgboost::collective {
 Comm::Comm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
diff --git a/src/collective/comm.h b/src/collective/comm.h
index f23810034..b501fcddd 100644
--- a/src/collective/comm.h
+++ b/src/collective/comm.h
@@ -2,20 +2,16 @@
  * Copyright 2023, XGBoost Contributors
  */
 #pragma once
-#include <chrono>              // for seconds
-#include <condition_variable>  // for condition_variable
-#include <cstddef>             // for size_t
-#include <cstdint>             // for int32_t
-#include <memory>              // for shared_ptr
-#include <mutex>               // for mutex
-#include <queue>               // for queue
-#include <string>              // for string
-#include <thread>              // for thread
-#include <type_traits>         // for remove_const_t
-#include <utility>             // for move
-#include <vector>              // for vector
+#include <chrono>       // for seconds
+#include <cstddef>      // for size_t
+#include <cstdint>      // for int32_t
+#include <memory>       // for shared_ptr
+#include <string>       // for string
+#include <thread>       // for thread
+#include <type_traits>  // for remove_const_t
+#include <utility>      // for move
+#include <vector>       // for vector
 
-#include "../common/timer.h"
 #include "loop.h"                       // for Loop
 #include "protocol.h"                   // for PeerInfo
 #include "xgboost/collective/result.h"  // for Result
diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc
index 66d8024bd..a7a996c6c 100644
--- a/src/common/host_device_vector.cc
+++ b/src/common/host_device_vector.cc
@@ -175,6 +175,7 @@ template class HostDeviceVector<GradientPair>;
 template class HostDeviceVector<GradientPairPrecise>;
 template class HostDeviceVector<int32_t>;   // bst_node_t
 template class HostDeviceVector<uint8_t>;
+template class HostDeviceVector<int8_t>;
 template class HostDeviceVector<FeatureType>;
 template class HostDeviceVector<Entry>;
 template class HostDeviceVector<uint64_t>;  // bst_row_t
diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index 5f7b71043..4933a4b11 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -409,6 +409,7 @@ template class HostDeviceVector<GradientPair>;
 template class HostDeviceVector<GradientPairPrecise>;
 template class HostDeviceVector<int32_t>;   // bst_node_t
 template class HostDeviceVector<uint8_t>;
+template class HostDeviceVector<int8_t>;
 template class HostDeviceVector<FeatureType>;
 template class HostDeviceVector<Entry>;
 template class HostDeviceVector<uint64_t>;  // bst_row_t
diff --git a/tests/cpp/collective/test_allgather.cc b/tests/cpp/collective/test_allgather.cc
index 49ba591d0..a74b9f149 100644
--- a/tests/cpp/collective/test_allgather.cc
+++ b/tests/cpp/collective/test_allgather.cc
@@ -1,18 +1,23 @@
 /**
  * Copyright 2023, XGBoost Contributors
  */
-#include <gtest/gtest.h>
-#include <xgboost/span.h>  // for Span
+#include <gtest/gtest.h>   // for ASSERT_EQ
+#include <xgboost/span.h>  // for Span, oper...
 
-#include <cstdint>  // for int32_t
-#include <numeric>  // for iota
-#include <string>   // for string
-#include <thread>   // for thread
-#include <vector>   // for vector
+#include <algorithm>  // for min
+#include <chrono>     // for seconds
+#include <cstddef>    // for size_t
+#include <cstdint>    // for int32_t
+#include <numeric>    // for iota
+#include <string>     // for string
+#include <thread>     // for thread
+#include <vector>     // for vector
 
-#include "../../../src/collective/allgather.h"
-#include "../../../src/collective/tracker.h"  // for GetHostAddress, Tracker
-#include "test_worker.h"                      // for TestDistributed==
+#include "../../../src/collective/allgather.h"  // for RingAllgather
+#include "../../../src/collective/comm.h"       // for RabitComm
+#include "gtest/gtest.h"                        // for AssertionR...
+#include "test_worker.h"                        // for TestDistri...
+#include "xgboost/collective/result.h"          // for Result
 
 namespace xgboost::collective {
 namespace {
@@ -57,6 +62,38 @@ class Worker : public WorkerForTest {
       }
     }
   }
+
+  // Checks RingAllgatherV with one element per worker and with rank-dependent input sizes.
+  void TestV() {
+    {
+      // basic test: every worker contributes its own rank
+      std::int32_t n{comm_.Rank()};
+      std::vector<std::int32_t> result;
+      auto rc = RingAllgatherV(comm_, common::Span{&n, 1}, &result);
+      ASSERT_TRUE(rc.OK()) << rc.Report();
+      ASSERT_EQ(result.size(), static_cast<std::size_t>(comm_.World()));
+      for (std::int32_t i = 0; i < comm_.World(); ++i) {
+        ASSERT_EQ(result[i], i);
+      }
+    }
+
+    {
+      // V test: worker r contributes (r + 1) copies of r; checked on every worker
+      std::vector<std::int32_t> data(comm_.Rank() + 1, comm_.Rank());
+      std::vector<std::int32_t> result;
+      auto rc = RingAllgatherV(comm_, common::Span{data.data(), data.size()}, &result);
+      ASSERT_TRUE(rc.OK()) << rc.Report();
+      ASSERT_EQ(result.size(), static_cast<std::size_t>((1 + comm_.World()) * comm_.World() / 2));
+      std::size_t k{0};
+      for (std::int32_t r = 0; r < comm_.World(); ++r) {
+        auto seg = common::Span{result.data(), result.size()}.subspan(k, (r + 1));
+        for (auto v : seg) {
+          ASSERT_EQ(v, r);
+        }
+        k += seg.size();
+      }
+    }
+  }
 };
 }  // namespace
 
@@ -68,4 +105,14 @@ TEST_F(AllgatherTest, Basic) {
     worker.Run();
   });
 }
+
+// Runs Worker::TestV through TestDistributed on min(7, hardware threads) workers.
+TEST_F(AllgatherTest, V) {
+  std::int32_t n_workers = std::min(7u, std::thread::hardware_concurrency());
+  TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
+                                 std::int32_t r) {
+    Worker worker{host, port, timeout, n_workers, r};
+    worker.TestV();
+  });
+}
 }  // namespace xgboost::collective

From 3b86260b506154762f3fbd3daaf71c08843ecaf0 Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Date: Wed, 18 Oct 2023 12:27:21 -0700
Subject: [PATCH 59/59] Fix build for AppleClang 11 (#9684) (#9693)

---
 src/common/ref_resource_view.h | 2 +-
 tests/cpp/common/test_io.cc    | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/common/ref_resource_view.h b/src/common/ref_resource_view.h
index 0fadf846d..d4f82e615 100644
--- a/src/common/ref_resource_view.h
+++ b/src/common/ref_resource_view.h
@@ -76,7 +76,7 @@ class RefResourceView {
 
   [[nodiscard]] size_type size() const { return size_; }  // NOLINT
   [[nodiscard]] size_type size_bytes() const {            // NOLINT
-    return Span{data(), size()}.size_bytes();
+    return Span<const value_type>{data(), size()}.size_bytes();
   }
   [[nodiscard]] value_type* data() { return ptr_; };              // NOLINT
   [[nodiscard]] value_type const* data() const { return ptr_; };  // NOLINT
diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc
index e4d65c1f4..4c4d4efe0 100644
--- a/tests/cpp/common/test_io.cc
+++ b/tests/cpp/common/test_io.cc
@@ -144,7 +144,8 @@ TEST(IO, Resource) {
     fout << 1.0 << std::endl;
     fout.close();
 
-    auto resource = std::make_shared<MmapResource>(path, 0, sizeof(double));
+    auto resource = std::shared_ptr<MmapResource>{
+      new MmapResource{path, 0, sizeof(double)}};
     ASSERT_EQ(resource->Size(), sizeof(double));
     ASSERT_EQ(resource->Type(), ResourceHandler::kMmap);
     ASSERT_EQ(resource->DataAs<double>()[0], val);