From 093b6758380e4663d18729dd87d54a7845695a4b Mon Sep 17 00:00:00 2001
From: Bobby Wang
Date: Fri, 3 Nov 2023 18:19:28 +0800
Subject: [PATCH 01/32] [Doc] update the tutorial of xgboost4j-spark-gpu
 (#9752)

---------

Co-authored-by: Jiaming Yuan
---
 doc/jvm/xgboost4j_spark_gpu_tutorial.rst | 37 +++++++++++++-----------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/doc/jvm/xgboost4j_spark_gpu_tutorial.rst b/doc/jvm/xgboost4j_spark_gpu_tutorial.rst
index 7b80286ef..3b2f92c6f 100644
--- a/doc/jvm/xgboost4j_spark_gpu_tutorial.rst
+++ b/doc/jvm/xgboost4j_spark_gpu_tutorial.rst
@@ -18,9 +18,9 @@ Build an ML Application with XGBoost4J-Spark-GPU
 Add XGBoost to Your Project
 ===========================
 
-Before we go into the tour of how to use XGBoost4J-Spark-GPU, you should first consult
-:ref:`Installation from Maven repository ` in order to add XGBoost4J-Spark-GPU as
-a dependency for your project. We provide both stable releases and snapshots.
+Prior to delving into the tutorial on using XGBoost4J-Spark-GPU, refer to
+:ref:`Installation from Maven repository ` for instructions on adding XGBoost4J-Spark-GPU
+as a project dependency. We offer both stable releases and snapshots.
 
 Data Preparation
 ================
@@ -54,7 +54,7 @@ Read Dataset with Spark's Built-In Reader
       .schema(schema)
       .csv(dataPath)
 
-In the first line, we create an instance of a `SparkSession `_
+First, we create an instance of a `SparkSession `_
 which is the entry point of any Spark application working with DataFrames. The
 ``schema`` variable defines the schema of the DataFrame wrapping Iris data. With this
 explicitly set schema, we can define the column names as well as their types; otherwise the
 column names would be
@@ -112,7 +112,7 @@ models.
 Although we use the Iris dataset in this tutorial to show how we use
 ``XGBoost/XGBoost4J-Spark-GPU`` to resolve a multi-class classification problem, the
 usage in Regression is very similar to classification.
 
-To train a XGBoost model for classification, we need to claim a XGBoostClassifier first:
+To train an XGBoost model for classification, we need to define an XGBoostClassifier first:
 
 .. code-block:: scala
 
@@ -130,9 +130,13 @@ To train a XGBoost model for classification, we need to claim a XGBoostClassifie
       .setFeaturesCol(featuresNames)
       .setLabelCol(labelName)
 
-The ``device`` parameter is for informing XGBoost that CUDA devices should be used instead of CPU. Unlike the single-node mode, GPUs are managed by spark instead of by XGBoost. Therefore, explicitly specified device ordinal like ``cuda:1`` is not support.
+The ``device`` parameter is for informing XGBoost that CUDA devices should be used instead of CPU.
+Unlike the single-node mode, GPUs are managed by Spark instead of by XGBoost. Therefore, an
+explicitly specified device ordinal like ``cuda:1`` is not supported.
 
-The available parameters for training a XGBoost model can be found in :doc:`here `. Similar to the XGBoost4J-Spark package, in addition to the default set of parameters, XGBoost4J-Spark-GPU also supports the camel-case variant of these parameters to be consistent with Spark's MLlib naming convention.
+The available parameters for training an XGBoost model can be found :doc:`here `.
+Similar to the XGBoost4J-Spark package, in addition to the default set of parameters,
+XGBoost4J-Spark-GPU also supports the camel-case variant of these parameters to be consistent with Spark's MLlib naming convention.
Specifically, each parameter in :doc:`this page ` has its equivalent form in
XGBoost4J-Spark-GPU with camel case. For example, to set ``max_depth`` for each tree, you
@@ -211,32 +215,31 @@ and the prediction for each instance.
 Submit the application
 **********************
 
-Here’s an example to submit an end-to-end XGBoost-4j-Spark-GPU Spark application to an
-Apache Spark Standalone cluster, assuming the application main class is Iris and the
-application jar is iris-1.0.0.jar
+Assuming that the application main class is "Iris" and the application jar is "iris-1.0.0.jar",
+provided below is an example demonstrating how to submit the XGBoost application to an Apache
+Spark Standalone cluster.
 
 .. code-block:: bash
 
-  cudf_version=22.02.0
-  rapids_version=22.02.0
-  xgboost_version=1.6.1
+  rapids_version=23.10.0
+  xgboost_version=2.0.1
   main_class=Iris
   app_jar=iris-1.0.0.jar
 
   spark-submit \
    --master $master \
-   --packages ai.rapids:cudf:${cudf_version},com.nvidia:rapids-4-spark_2.12:${rapids_version},ml.dmlc:xgboost4j-gpu_2.12:${xgboost_version},ml.dmlc:xgboost4j-spark-gpu_2.12:${xgboost_version} \
+   --packages com.nvidia:rapids-4-spark_2.12:${rapids_version},ml.dmlc:xgboost4j-gpu_2.12:${xgboost_version},ml.dmlc:xgboost4j-spark-gpu_2.12:${xgboost_version} \
    --conf spark.executor.cores=12 \
-   --conf spark.task.cpus=1 \
+   --conf spark.task.cpus=12 \
    --conf spark.executor.resource.gpu.amount=1 \
-   --conf spark.task.resource.gpu.amount=0.08 \
+   --conf spark.task.resource.gpu.amount=1 \
    --conf spark.rapids.sql.csv.read.double.enabled=true \
    --conf spark.rapids.sql.hasNans=false \
    --conf spark.plugins=com.nvidia.spark.SQLPlugin \
    --class ${main_class} \
    ${app_jar}
 
-* First, we need to specify the ``RAPIDS Accelerator, cudf, xgboost4j-gpu, xgboost4j-spark-gpu`` packages by ``--packages``
+* First, we need to specify the ``RAPIDS Accelerator, xgboost4j-gpu, xgboost4j-spark-gpu`` packages by ``--packages``
 * Second, ``RAPIDS Accelerator`` is a Spark plugin, so we need to configure it by specifying ``spark.plugins=com.nvidia.spark.SQLPlugin``
 
 For details about other ``RAPIDS Accelerator`` configurations, please refer to the `configuration `_.

From 98238d63fabe3470f558e1705b47a53e3e27649c Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 7 Nov 2023 02:44:39 +0800
Subject: [PATCH 02/32] [dask] Change document to avoid using default import.
 (#9742)

This aligns dask with pyspark; users need to explicitly call:

```
from xgboost.dask import DaskXGBClassifier
from xgboost import dask as dxgb
```

In future releases, we might stop using the default import and remove the
lazy loader.
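For context, a minimal end-to-end sketch of the explicit-import style the demos
and documents below are migrated to. The local-cluster setup here is purely
illustrative and not part of this patch:

```python
from dask import array as da
from dask.distributed import Client, LocalCluster

from xgboost import dask as dxgb
from xgboost.dask import DaskDMatrix

with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
    X = da.random.random(size=(10_000, 10), chunks=(1_000, 10))
    y = da.random.random(size=(10_000,), chunks=(1_000,))
    dtrain = DaskDMatrix(client, X, y)
    # Train and predict through the explicit `dxgb` alias instead of `xgb.dask`.
    output = dxgb.train(client, {"tree_method": "hist"}, dtrain, num_boost_round=4)
    prediction = dxgb.predict(client, output, dtrain)
```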
--- demo/dask/cpu_survival.py | 6 +-- demo/dask/cpu_training.py | 6 +-- demo/dask/dask_callbacks.py | 3 +- demo/dask/gpu_training.py | 9 ++--- demo/dask/sklearn_cpu_training.py | 4 +- demo/dask/sklearn_gpu_training.py | 4 +- doc/tutorials/dask.rst | 67 ++++++++++++++++--------------- 7 files changed, 51 insertions(+), 48 deletions(-) diff --git a/demo/dask/cpu_survival.py b/demo/dask/cpu_survival.py index 7fe0570de..8bf464ce2 100644 --- a/demo/dask/cpu_survival.py +++ b/demo/dask/cpu_survival.py @@ -9,7 +9,7 @@ import os import dask.dataframe as dd from dask.distributed import Client, LocalCluster -import xgboost as xgb +from xgboost import dask as dxgb from xgboost.dask import DaskDMatrix @@ -48,14 +48,14 @@ def main(client): "lambda": 0.01, "alpha": 0.02, } - output = xgb.dask.train( + output = dxgb.train( client, params, dtrain, num_boost_round=100, evals=[(dtrain, "train")] ) bst = output["booster"] history = output["history"] # you can pass output directly into `predict` too. - prediction = xgb.dask.predict(client, bst, dtrain) + prediction = dxgb.predict(client, bst, dtrain) print("Evaluation history: ", history) # Uncomment the following line to save the model to the disk diff --git a/demo/dask/cpu_training.py b/demo/dask/cpu_training.py index 811af5cd3..0f3316741 100644 --- a/demo/dask/cpu_training.py +++ b/demo/dask/cpu_training.py @@ -6,7 +6,7 @@ Example of training with Dask on CPU from dask import array as da from dask.distributed import Client, LocalCluster -import xgboost as xgb +from xgboost import dask as dxgb from xgboost.dask import DaskDMatrix @@ -25,7 +25,7 @@ def main(client): # distributed version of train returns a dictionary containing the # resulting booster and evaluation history obtained from # evaluation metrics. - output = xgb.dask.train( + output = dxgb.train( client, {"verbosity": 1, "tree_method": "hist"}, dtrain, @@ -36,7 +36,7 @@ def main(client): history = output["history"] # you can pass output directly into `predict` too. - prediction = xgb.dask.predict(client, bst, dtrain) + prediction = dxgb.predict(client, bst, dtrain) print("Evaluation history:", history) return prediction diff --git a/demo/dask/dask_callbacks.py b/demo/dask/dask_callbacks.py index 408297d9e..a4b0f5648 100644 --- a/demo/dask/dask_callbacks.py +++ b/demo/dask/dask_callbacks.py @@ -8,6 +8,7 @@ from dask_ml.datasets import make_regression from dask_ml.model_selection import train_test_split import xgboost as xgb +import xgboost.dask as dxgb from xgboost.dask import DaskDMatrix @@ -61,7 +62,7 @@ def main(client): dtrain = DaskDMatrix(client, X_train, y_train) dtest = DaskDMatrix(client, X_test, y_test) - output = xgb.dask.train( + output = dxgb.train( client, { "verbosity": 1, diff --git a/demo/dask/gpu_training.py b/demo/dask/gpu_training.py index 6eea00692..fd5b35bf3 100644 --- a/demo/dask/gpu_training.py +++ b/demo/dask/gpu_training.py @@ -8,7 +8,6 @@ from dask import dataframe as dd from dask.distributed import Client from dask_cuda import LocalCUDACluster -import xgboost as xgb from xgboost import dask as dxgb from xgboost.dask import DaskDMatrix @@ -21,7 +20,7 @@ def using_dask_matrix(client: Client, X: da.Array, y: da.Array) -> da.Array: # Use train method from xgboost.dask instead of xgboost. This distributed version # of train returns a dictionary containing the resulting booster and evaluation # history obtained from evaluation metrics. 
- output = xgb.dask.train( + output = dxgb.train( client, { "verbosity": 2, @@ -37,7 +36,7 @@ def using_dask_matrix(client: Client, X: da.Array, y: da.Array) -> da.Array: history = output["history"] # you can pass output directly into `predict` too. - prediction = xgb.dask.predict(client, bst, dtrain) + prediction = dxgb.predict(client, bst, dtrain) print("Evaluation history:", history) return prediction @@ -56,14 +55,14 @@ def using_quantile_device_dmatrix(client: Client, X: da.Array, y: da.Array) -> d # be used for anything else other than training unless a reference is specified. See # the `ref` argument of `DaskQuantileDMatrix`. dtrain = dxgb.DaskQuantileDMatrix(client, X, y) - output = xgb.dask.train( + output = dxgb.train( client, {"verbosity": 2, "tree_method": "hist", "device": "cuda"}, dtrain, num_boost_round=4, ) - prediction = xgb.dask.predict(client, output, X) + prediction = dxgb.predict(client, output, X) return prediction diff --git a/demo/dask/sklearn_cpu_training.py b/demo/dask/sklearn_cpu_training.py index 12d55493c..38ea25e61 100644 --- a/demo/dask/sklearn_cpu_training.py +++ b/demo/dask/sklearn_cpu_training.py @@ -5,7 +5,7 @@ Use scikit-learn regressor interface with CPU histogram tree method from dask import array as da from dask.distributed import Client, LocalCluster -import xgboost +from xgboost import dask as dxgb def main(client): @@ -16,7 +16,7 @@ def main(client): X = da.random.random((m, n), partition_size) y = da.random.random(m, partition_size) - regressor = xgboost.dask.DaskXGBRegressor(verbosity=1, n_estimators=2) + regressor = dxgb.DaskXGBRegressor(verbosity=1, n_estimators=2) regressor.set_params(tree_method="hist") # assigning client here is optional regressor.client = client diff --git a/demo/dask/sklearn_gpu_training.py b/demo/dask/sklearn_gpu_training.py index 32a994464..768690995 100644 --- a/demo/dask/sklearn_gpu_training.py +++ b/demo/dask/sklearn_gpu_training.py @@ -9,7 +9,7 @@ from dask.distributed import Client # It's recommended to use dask_cuda for GPU assignment from dask_cuda import LocalCUDACluster -import xgboost +from xgboost import dask as dxgb def main(client): @@ -20,7 +20,7 @@ def main(client): X = da.random.random((m, n), partition_size) y = da.random.random(m, partition_size) - regressor = xgboost.dask.DaskXGBRegressor(verbosity=1) + regressor = dxgb.DaskXGBRegressor(verbosity=1) # set the device to CUDA regressor.set_params(tree_method="hist", device="cuda") # assigning client here is optional diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst index 7ab251bcf..148230fe6 100644 --- a/doc/tutorials/dask.rst +++ b/doc/tutorials/dask.rst @@ -39,7 +39,8 @@ on a dask cluster: .. code-block:: python - import xgboost as xgb + from xgboost import dask as dxgb + import dask.array as da import dask.distributed @@ -53,11 +54,11 @@ on a dask cluster: X = da.random.random(size=(num_obs, num_features), chunks=(1000, num_features)) y = da.random.random(size=(num_obs, 1), chunks=(1000, 1)) - dtrain = xgb.dask.DaskDMatrix(client, X, y) + dtrain = dxgb.DaskDMatrix(client, X, y) # or - # dtrain = xgb.dask.DaskQuantileDMatrix(client, X, y) + # dtrain = dxgb.DaskQuantileDMatrix(client, X, y) - output = xgb.dask.train( + output = dxgb.train( client, {"verbosity": 2, "tree_method": "hist", "objective": "reg:squarederror"}, dtrain, @@ -87,25 +88,27 @@ returns a model and the computation history as a Python dictionary: .. 
code-block:: python - {'booster': Booster, - 'history': dict} + { + "booster": Booster, + "history": dict, + } For prediction, pass the ``output`` returned by ``train`` into :py:func:`xgboost.dask.predict`: .. code-block:: python - prediction = xgb.dask.predict(client, output, dtrain) + prediction = dxgb.predict(client, output, dtrain) # Or equivalently, pass ``output['booster']``: - prediction = xgb.dask.predict(client, output['booster'], dtrain) + prediction = dxgb.predict(client, output['booster'], dtrain) Eliminating the construction of DaskDMatrix is also possible, this can make the computation a bit faster when meta information like ``base_margin`` is not needed: .. code-block:: python - prediction = xgb.dask.predict(client, output, X) + prediction = dxgb.predict(client, output, X) # Use inplace version. - prediction = xgb.dask.inplace_predict(client, output, X) + prediction = dxgb.inplace_predict(client, output, X) Here ``prediction`` is a dask ``Array`` object containing predictions from model if input is a ``DaskDMatrix`` or ``da.Array``. When putting dask collection directly into the @@ -134,14 +137,14 @@ both memory usage and prediction time. .. code-block:: python # dtrain is the DaskDMatrix defined above. - prediction = xgb.dask.predict(client, booster, dtrain) + prediction = dxgb.predict(client, booster, dtrain) or equivalently: .. code-block:: python # where X is a dask DataFrame or dask Array. - prediction = xgb.dask.predict(client, booster, X) + prediction = dxgb.predict(client, booster, X) Also for inplace prediction: @@ -149,7 +152,7 @@ Also for inplace prediction: # where X is a dask DataFrame or dask Array backed by cupy or cuDF. booster.set_param({"device": "cuda"}) - prediction = xgb.dask.inplace_predict(client, booster, X) + prediction = dxgb.inplace_predict(client, booster, X) When input is ``da.Array`` object, output is always ``da.Array``. However, if the input type is ``dd.DataFrame``, output can be ``dd.Series``, ``dd.DataFrame`` or ``da.Array``, @@ -174,7 +177,7 @@ One simple optimization for running consecutive predictions is using futures = [] for X in dataset: # Here we pass in a future instead of concrete booster - shap_f = xgb.dask.predict(client, booster_f, X, pred_contribs=True) + shap_f = dxgb.predict(client, booster_f, X, pred_contribs=True) futures.append(shap_f) results = client.gather(futures) @@ -186,7 +189,7 @@ Scikit-Learn wrapper object: .. code-block:: python - cls = xgb.dask.DaskXGBClassifier() + cls = dxgb.DaskXGBClassifier() cls.fit(X, y) booster = cls.get_booster() @@ -207,12 +210,12 @@ collection. .. 
code-block:: python from distributed import LocalCluster, Client - import xgboost as xgb + from xgboost import dask as dxgb def main(client: Client) -> None: X, y = load_data() - clf = xgb.dask.DaskXGBClassifier(n_estimators=100, tree_method="hist") + clf = dxgb.DaskXGBClassifier(n_estimators=100, tree_method="hist") clf.client = client # assign the client clf.fit(X, y, eval_set=[(X, y)]) proba = clf.predict_proba(X) @@ -242,7 +245,7 @@ In the example below, a ``KubeCluster`` is used for `deploying Dask on Kubernete from dask_kubernetes import KubeCluster # Need to install the ``dask-kubernetes`` package from dask.distributed import Client - import xgboost as xgb + from xgboost import dask as dxgb import dask import dask.array as da @@ -265,7 +268,7 @@ In the example below, a ``KubeCluster`` is used for `deploying Dask on Kubernete X = da.random.random(size=(m, n), chunks=100) y = da.random.random(size=(m, ), chunks=100) - regressor = xgb.dask.DaskXGBRegressor(n_estimators=10, missing=0.0) + regressor = dxgb.DaskXGBRegressor(n_estimators=10, missing=0.0) regressor.client = client regressor.set_params(tree_method='hist', device="cuda") regressor.fit(X, y, eval_set=[(X, y)]) @@ -298,7 +301,7 @@ threads in each process for training. But if ``nthread`` parameter is set: .. code-block:: python - output = xgb.dask.train( + output = dxgb.train( client, {"verbosity": 1, "nthread": 8, "tree_method": "hist"}, dtrain, @@ -330,12 +333,12 @@ Functional interface: async with dask.distributed.Client(scheduler_address, asynchronous=True) as client: X, y = generate_array() - m = await xgb.dask.DaskDMatrix(client, X, y) - output = await xgb.dask.train(client, {}, dtrain=m) + m = await dxgb.DaskDMatrix(client, X, y) + output = await dxgb.train(client, {}, dtrain=m) - with_m = await xgb.dask.predict(client, output, m) - with_X = await xgb.dask.predict(client, output, X) - inplace = await xgb.dask.inplace_predict(client, output, X) + with_m = await dxgb.predict(client, output, m) + with_X = await dxgb.predict(client, output, X) + inplace = await dxgb.inplace_predict(client, output, X) # Use ``client.compute`` instead of the ``compute`` method from dask collection print(await client.compute(with_m)) @@ -349,7 +352,7 @@ actual computation will return a coroutine and hence require awaiting: async with dask.distributed.Client(scheduler_address, asynchronous=True) as client: X, y = generate_array() - regressor = await xgb.dask.DaskXGBRegressor(verbosity=1, n_estimators=2) + regressor = await dxgb.DaskXGBRegressor(verbosity=1, n_estimators=2) regressor.set_params(tree_method='hist') # trivial method, synchronous operation regressor.client = client # accessing attribute, synchronous operation regressor = await regressor.fit(X, y, eval_set=[(X, y)]) @@ -371,7 +374,7 @@ To enable early stopping, pass one or more validation sets containing ``DaskDMat .. 
code-block:: python import dask.array as da - import xgboost as xgb + from xgboost import dask as dxgb num_rows = 1e6 num_features = 100 @@ -398,19 +401,19 @@ To enable early stopping, pass one or more validation sets containing ``DaskDMat chunks=(rows_per_chunk, 1) ) - dtrain = xgb.dask.DaskDMatrix( + dtrain = dxgb.DaskDMatrix( client=client, data=data, label=labels ) - dvalid = xgb.dask.DaskDMatrix( + dvalid = dxgb.DaskDMatrix( client=client, data=X_eval, label=y_eval ) - result = xgb.dask.train( + result = dxgb.train( client=client, params={ "objective": "reg:squarederror", @@ -421,7 +424,7 @@ To enable early stopping, pass one or more validation sets containing ``DaskDMat early_stopping_rounds=3 ) -When validation sets are provided to ``xgb.dask.train()`` in this way, the model object returned by ``xgb.dask.train()`` contains a history of evaluation metrics for each validation set, across all boosting rounds. +When validation sets are provided to :py:func:`xgboost.dask.train` in this way, the model object returned by :py:func:`xgboost.dask.train` contains a history of evaluation metrics for each validation set, across all boosting rounds. .. code-block:: python @@ -463,7 +466,7 @@ interface, including callback functions, custom evaluation metric and objective: save_best=True, ) - booster = xgb.dask.train( + booster = dxgb.train( client, params={ "objective": "binary:logistic", From 82828621d020299d1a1226b3131444298fe92278 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 7 Nov 2023 05:03:30 +0800 Subject: [PATCH 03/32] [doc] Add doc for linters and simplify c++ lint script. (#9750) --- .github/workflows/main.yml | 8 +------- doc/contrib/coding_guide.rst | 32 ++++++++++++++++++++++++++++---- tests/ci_build/lint_cpp.py | 9 ++++++++- 3 files changed, 37 insertions(+), 12 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3fd39bc36..67e77ad6e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -144,11 +144,5 @@ jobs: python -m pip install wheel setuptools cmakelint cpplint pylint - name: Run lint run: | - python3 tests/ci_build/lint_cpp.py xgboost cpp R-package/src - - python3 tests/ci_build/lint_cpp.py xgboost cpp include src python-package \ - --exclude_path python-package/xgboost/dmlc-core python-package/xgboost/include \ - python-package/xgboost/lib python-package/xgboost/rabit \ - python-package/xgboost/src - + python3 tests/ci_build/lint_cpp.py sh ./tests/ci_build/lint_cmake.sh diff --git a/doc/contrib/coding_guide.rst b/doc/contrib/coding_guide.rst index 1169921bb..f7ed88b6c 100644 --- a/doc/contrib/coding_guide.rst +++ b/doc/contrib/coding_guide.rst @@ -118,16 +118,40 @@ two automatic checks to enforce coding style conventions. To expedite the code r Linter ====== -We use `pylint `_ and `cpplint `_ to enforce style convention and find potential errors. Linting is especially useful for Python, as we can catch many errors that would have otherwise occurred at run-time. +We use a combination of linters to enforce style convention and find potential errors. Linting is especially useful for scripting languages like Python, as we can catch many errors that would have otherwise occurred at run-time. -To run this check locally, run the following command from the top level source tree: +For Python scripts, `pylint `_, `black `__ and `isort `__ are used for providing guidance on coding style, and `mypy `__ is required for type checking. For C++, `cpplint `_ is used along with ``clang-tidy``. For R, ``lintr`` is used. 
+
+To run checks for Python locally, install the checkers mentioned previously and run:
 
 .. code-block:: bash
 
   cd /path/to/xgboost/
-  make lint
+  python ./tests/ci_build/lint_python.py --fix
+
+To run checks for R:
+
+.. code-block:: bash
+
+  cd /path/to/xgboost/
+  Rscript tests/ci_build/lint_r.R $(pwd)
+
+To run cpplint locally:
+
+.. code-block:: bash
+
+  cd /path/to/xgboost/
+  python ./tests/ci_build/lint_cpp.py
+
+
+See the next section for clang-tidy. For CMake scripts:
+
+.. code-block:: bash
+
+  bash ./tests/ci_build/lint_cmake.sh
+
+Lastly, the linter for jvm-packages is integrated into the Maven build process.
 
-This command requires the Python packages pylint and cpplint.
 
 Clang-tidy
 ==========
diff --git a/tests/ci_build/lint_cpp.py b/tests/ci_build/lint_cpp.py
index 593b8f870..6ec2b4e7f 100644
--- a/tests/ci_build/lint_cpp.py
+++ b/tests/ci_build/lint_cpp.py
@@ -134,7 +134,12 @@ def process(fname, allow_type):
 
 def main():
     parser = argparse.ArgumentParser(description="run cpp lint")
-    parser.add_argument("path", nargs="+", help="path to traverse")
+    parser.add_argument(
+        "path",
+        nargs="*",
+        help="Path to traverse",
+        default=["src", "include", os.path.join("R-package", "src"), "python-package"],
+    )
     parser.add_argument(
         "--exclude_path",
         nargs="+",
@@ -148,6 +153,8 @@ def main():
         allow_type += CXX_SUFFIX
 
     for path in args.path:
+        if not os.path.exists(path):
+            raise ValueError(f"Unknown path: {path}")
         if os.path.isfile(path):
             normpath = os.path.normpath(path)
             if normpath not in excluded_paths:

From c3a0622b499526ef684c6dde855b925a34911742 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 7 Nov 2023 07:29:11 +0800
Subject: [PATCH 04/32] Fix using categorical data with the score function of
 ranker. (#9753)

---
 python-package/xgboost/sklearn.py         | 12 ++++++++++-
 python-package/xgboost/testing/ranking.py | 25 +++++++++++++++++++++++
 tests/python-gpu/test_gpu_with_sklearn.py |  7 ++++++-
 tests/python/test_with_sklearn.py         |  7 ++++++-
 4 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index d5e20439a..3906973a8 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -2099,7 +2099,17 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
         """
         X, qid = _get_qid(X, None)
-        Xyq = DMatrix(X, y, qid=qid)
+        # fixme(jiamingy): base margin and group weight are not yet supported. We might
+        # need to make extra special fields in the dataframe.
+ Xyq = DMatrix( + X, + y, + qid=qid, + missing=self.missing, + enable_categorical=self.enable_categorical, + nthread=self.n_jobs, + feature_types=self.feature_types, + ) if callable(self.eval_metric): metric = ltr_metric_decorator(self.eval_metric, self.n_jobs) result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric) diff --git a/python-package/xgboost/testing/ranking.py b/python-package/xgboost/testing/ranking.py index 7c75012c2..a11eb3e03 100644 --- a/python-package/xgboost/testing/ranking.py +++ b/python-package/xgboost/testing/ranking.py @@ -75,3 +75,28 @@ def run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None: with pytest.raises(ValueError, match="Either `group` or `qid`."): ranker.fit(df, y, eval_set=[(X, y)]) + + +def run_ranking_categorical(device: str) -> None: + """Test LTR with categorical features.""" + from sklearn.model_selection import cross_val_score + + X, y = tm.make_categorical( + n_samples=512, n_features=10, n_categories=3, onehot=False + ) + rng = np.random.default_rng(1994) + qid = rng.choice(3, size=y.shape[0]) + qid = np.sort(qid) + X["qid"] = qid + + ltr = xgb.XGBRanker(enable_categorical=True, device=device) + ltr.fit(X, y) + score = ltr.score(X, y) + assert score > 0.9 + + ltr = xgb.XGBRanker(enable_categorical=True, device=device) + + # test using the score function inside sklearn. + scores = cross_val_score(ltr, X, y) + for s in scores: + assert s > 0.7 diff --git a/tests/python-gpu/test_gpu_with_sklearn.py b/tests/python-gpu/test_gpu_with_sklearn.py index 9f902ce32..650c0a047 100644 --- a/tests/python-gpu/test_gpu_with_sklearn.py +++ b/tests/python-gpu/test_gpu_with_sklearn.py @@ -10,7 +10,7 @@ import pytest import xgboost as xgb from xgboost import testing as tm -from xgboost.testing.ranking import run_ranking_qid_df +from xgboost.testing.ranking import run_ranking_categorical, run_ranking_qid_df sys.path.append("tests/python") import test_with_sklearn as twskl # noqa @@ -256,6 +256,11 @@ def test_ranking_qid_df(): run_ranking_qid_df(cudf, "gpu_hist") +@pytest.mark.skipif(**tm.no_pandas()) +def test_ranking_categorical() -> None: + run_ranking_categorical(device="cuda") + + @pytest.mark.skipif(**tm.no_cupy()) @pytest.mark.mgpu def test_device_ordinal() -> None: diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index c919a01ad..16f7ab9d1 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -12,7 +12,7 @@ from sklearn.utils.estimator_checks import parametrize_with_checks import xgboost as xgb from xgboost import testing as tm -from xgboost.testing.ranking import run_ranking_qid_df +from xgboost.testing.ranking import run_ranking_categorical, run_ranking_qid_df from xgboost.testing.shared import get_feature_weights, validate_data_initialization from xgboost.testing.updater import get_basescore @@ -173,6 +173,11 @@ def test_ranking(): np.testing.assert_almost_equal(pred, pred_orig) +@pytest.mark.skipif(**tm.no_pandas()) +def test_ranking_categorical() -> None: + run_ranking_categorical(device="cpu") + + def test_ranking_metric() -> None: from sklearn.metrics import roc_auc_score From 6c0a190f6d12d2ba6a1cabd7741881ea1913d433 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 7 Nov 2023 11:12:31 +0800 Subject: [PATCH 05/32] [coll] Add comm group. (#9759) - Implement `CommGroup` for double dispatching. - Small cleanup to tracker for handling abort. 
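Roughly, the double dispatch works as follows: `CommGroup` owns a CPU
communicator/backend pair and lazily materializes their CUDA variants the first
time a CUDA device is requested. A usage sketch adapted from the new
`tests/cpp/collective/test_comm_group.cc` in this patch (config values are
illustrative; the xgboost internal headers used by that test are assumed):

```cpp
// Sketch only; mirrors the new comm-group tests rather than a public API.
Json config{Object{}};
config["dmlc_communicator"] = std::string{"rabit"};  // or "federated"
config["DMLC_TASK_ID"] = std::string{"0"};
std::unique_ptr<CommGroup> group{CommGroup::Create(config)};

Context ctx;  // CPU context; a CUDA context would select the GPU variants.
// First dispatch: the collective backend for the device. CUDA backends are
// created on demand via MakeCUDAVar() and cached inside the group.
std::shared_ptr<Coll> backend = group->Backend(ctx.Device());
// Second dispatch: the communicator bound to the device.
Comm const& comm = group->Ctx(&ctx, ctx.Device());
```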
--- plugin/federated/federated_comm.cc | 10 +- plugin/federated/federated_comm.cu | 2 + plugin/federated/federated_comm.h | 7 +- src/collective/comm.cc | 20 ++- src/collective/comm_group.cc | 125 ++++++++++++++++ src/collective/comm_group.h | 53 +++++++ src/collective/tracker.cc | 134 +++++++++++------- tests/cpp/collective/test_comm_group.cc | 63 ++++++++ tests/cpp/collective/test_worker.h | 3 +- tests/cpp/common/test_linalg.cc | 14 ++ .../plugin/federated/test_federated_coll.cu | 3 + .../plugin/federated/test_federated_comm.cc | 13 +- .../federated/test_federated_comm_group.cc | 22 +++ .../federated/test_federated_comm_group.cu | 22 +++ tests/cpp/plugin/federated/test_worker.h | 50 ++++++- 15 files changed, 462 insertions(+), 79 deletions(-) create mode 100644 src/collective/comm_group.cc create mode 100644 src/collective/comm_group.h create mode 100644 tests/cpp/collective/test_comm_group.cc create mode 100644 tests/cpp/plugin/federated/test_federated_comm_group.cc create mode 100644 tests/cpp/plugin/federated/test_federated_comm_group.cu diff --git a/plugin/federated/federated_comm.cc b/plugin/federated/federated_comm.cc index 8a649340f..ec1287413 100644 --- a/plugin/federated/federated_comm.cc +++ b/plugin/federated/federated_comm.cc @@ -60,7 +60,8 @@ void FederatedComm::Init(std::string const& host, std::int32_t port, std::int32_ } } -FederatedComm::FederatedComm(Json const& config) { +FederatedComm::FederatedComm(std::int32_t retry, std::chrono::seconds timeout, std::string task_id, + Json const& config) { /** * Topology */ @@ -93,6 +94,13 @@ FederatedComm::FederatedComm(Json const& config) { CHECK_NE(world_size, 0) << "Parameter `federated_world_size` is required."; CHECK(!server_address.empty()) << "Parameter `federated_server_address` is required."; + /** + * Basic config + */ + this->retry_ = retry; + this->timeout_ = timeout; + this->task_id_ = task_id; + /** * Certificates */ diff --git a/plugin/federated/federated_comm.cu b/plugin/federated/federated_comm.cu index b05d38b1b..3eb8eb4f7 100644 --- a/plugin/federated/federated_comm.cu +++ b/plugin/federated/federated_comm.cu @@ -11,6 +11,8 @@ namespace xgboost::collective { CUDAFederatedComm::CUDAFederatedComm(Context const* ctx, std::shared_ptr impl) : FederatedComm{impl}, stream_{ctx->CUDACtx()->Stream()} { CHECK(impl); + CHECK(ctx->IsCUDA()); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); } Comm* FederatedComm::MakeCUDAVar(Context const* ctx, std::shared_ptr) const { diff --git a/plugin/federated/federated_comm.h b/plugin/federated/federated_comm.h index fb97a78b0..a24798626 100644 --- a/plugin/federated/federated_comm.h +++ b/plugin/federated/federated_comm.h @@ -27,6 +27,10 @@ class FederatedComm : public Comm { this->rank_ = that->Rank(); this->world_ = that->World(); + this->retry_ = that->Retry(); + this->timeout_ = that->Timeout(); + this->task_id_ = that->TaskID(); + this->tracker_ = that->TrackerInfo(); } @@ -41,7 +45,8 @@ class FederatedComm : public Comm { * - federated_client_key_path * - federated_client_cert_path */ - explicit FederatedComm(Json const& config); + explicit FederatedComm(std::int32_t retry, std::chrono::seconds timeout, std::string task_id, + Json const& config); explicit FederatedComm(std::string const& host, std::int32_t port, std::int32_t world, std::int32_t rank) { this->Init(host, port, world, rank, {}, {}, {}); diff --git a/src/collective/comm.cc b/src/collective/comm.cc index 241dca2ce..964137ff1 100644 --- a/src/collective/comm.cc +++ b/src/collective/comm.cc @@ -5,13 +5,17 @@ #include // 
for copy #include // for seconds +#include // for exit #include // for shared_ptr +#include // for unique_lock #include // for string #include // for move, forward #include "../common/common.h" // for AssertGPUSupport +#include "../common/json_utils.h" // for OptionalArg #include "allgather.h" // for RingAllgather #include "protocol.h" // for kMagic +#include "tracker.h" // for GetHostAddress #include "xgboost/base.h" // for XGBOOST_STRICT_R_MODE #include "xgboost/collective/socket.h" // for TCPSocket #include "xgboost/json.h" // for Json, Object @@ -209,24 +213,18 @@ RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::se std::shared_ptr error_sock{TCPSocket::CreatePtr(domain)}; auto eport = error_sock->BindHost(); error_sock->Listen(); - error_worker_ = std::thread{[this, error_sock = std::move(error_sock)] { + error_worker_ = std::thread{[error_sock = std::move(error_sock)] { auto conn = error_sock->Accept(); - // On Windows accept returns an invalid socket after network is shutdown. + // On Windows, accept returns a closed socket after finalize. if (conn.IsClosed()) { return; } LOG(WARNING) << "Another worker is running into error."; - std::string scmd; - conn.Recv(&scmd); - auto jcmd = Json::Load(scmd); - auto rc = this->Shutdown(); - if (!rc.OK()) { - LOG(WARNING) << "Fail to shutdown worker:" << rc.Report(); - } #if !defined(XGBOOST_STRICT_R_MODE) || XGBOOST_STRICT_R_MODE == 0 - exit(-1); + // exit is nicer than abort as the former performs cleanups. + std::exit(-1); #else - LOG(FATAL) << rc.Report(); + LOG(FATAL) << "abort"; #endif }}; error_worker_.detach(); diff --git a/src/collective/comm_group.cc b/src/collective/comm_group.cc new file mode 100644 index 000000000..570500843 --- /dev/null +++ b/src/collective/comm_group.cc @@ -0,0 +1,125 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include "comm_group.h" + +#include // for transform +#include // for seconds +#include // for int32_t +#include // for shared_ptr, unique_ptr +#include // for string +#include // for vector + +#include "../common/json_utils.h" // for OptionalArg +#include "coll.h" // for Coll +#include "comm.h" // for Comm +#include "tracker.h" // for GetHostAddress +#include "xgboost/collective/result.h" // for Result +#include "xgboost/context.h" // for DeviceOrd +#include "xgboost/json.h" // for Json + +#if defined(XGBOOST_USE_FEDERATED) +#include "../../plugin/federated/federated_coll.h" +#include "../../plugin/federated/federated_comm.h" +#endif + +namespace xgboost::collective { +[[nodiscard]] std::shared_ptr CommGroup::Backend(DeviceOrd device) const { + if (device.IsCUDA()) { + if (!gpu_coll_) { + gpu_coll_.reset(backend_->MakeCUDAVar()); + } + return gpu_coll_; + } + return backend_; +} + +[[nodiscard]] Comm const& CommGroup::Ctx(Context const* ctx, DeviceOrd device) const { + if (device.IsCUDA()) { + CHECK(ctx->IsCUDA()); + if (!gpu_comm_) { + gpu_comm_.reset(comm_->MakeCUDAVar(ctx, backend_)); + } + return *gpu_comm_; + } + return *comm_; +} + +CommGroup::CommGroup() + : comm_{std::shared_ptr(new RabitComm{})}, // NOLINT + backend_{std::shared_ptr(new Coll{})} {} // NOLINT + +[[nodiscard]] CommGroup* CommGroup::Create(Json config) { + if (IsA(config)) { + return new CommGroup; + } + + std::string type = OptionalArg(config, "dmlc_communicator", std::string{"rabit"}); + std::vector keys; + // Try both lower and upper case for compatibility + auto get_param = [&](std::string name, auto dft, auto t) { + std::string upper; + std::transform(name.cbegin(), name.cend(), 
std::back_inserter(upper), + [](char c) { return std::toupper(c); }); + std::transform(name.cbegin(), name.cend(), name.begin(), + [](char c) { return std::tolower(c); }); + keys.push_back(upper); + keys.push_back(name); + + auto const& obj = get(config); + auto it = obj.find(upper); + if (it != obj.cend()) { + return OptionalArg(config, upper, dft); + } else { + return OptionalArg(config, name, dft); + } + }; + // Common args + auto retry = + OptionalArg(config, "dmlc_retry", static_cast(DefaultRetry())); + auto timeout = OptionalArg(config, "dmlc_timeout_sec", + static_cast(DefaultTimeoutSec())); + auto task_id = get_param("dmlc_task_id", std::string{}, String{}); + + if (type == "rabit") { + auto host = get_param("dmlc_tracker_uri", std::string{}, String{}); + auto port = get_param("dmlc_tracker_port", static_cast(0), Integer{}); + auto ptr = + new CommGroup{std::shared_ptr{new RabitComm{ // NOLINT + host, static_cast(port), std::chrono::seconds{timeout}, + static_cast(retry), task_id}}, + std::shared_ptr(new Coll{})}; // NOLINT + return ptr; + } else if (type == "federated") { +#if defined(XGBOOST_USE_FEDERATED) + auto ptr = new CommGroup{ + std::make_shared(retry, std::chrono::seconds{timeout}, task_id, config), + std::make_shared()}; + return ptr; +#endif // defined(XGBOOST_USE_FEDERATED) + } else { + LOG(FATAL) << "Invalid communicator type"; + } + + return nullptr; +} + +std::unique_ptr& GlobalCommGroup() { + static std::unique_ptr sptr; + if (!sptr) { + Json config{Null{}}; + sptr.reset(CommGroup::Create(config)); + } + return sptr; +} + +void GlobalCommGroupInit(Json config) { + auto& sptr = GlobalCommGroup(); + sptr.reset(CommGroup::Create(std::move(config))); +} + +void GlobalCommGroupFinalize() { + auto& sptr = GlobalCommGroup(); + sptr.reset(); +} +} // namespace xgboost::collective diff --git a/src/collective/comm_group.h b/src/collective/comm_group.h new file mode 100644 index 000000000..62f3e565f --- /dev/null +++ b/src/collective/comm_group.h @@ -0,0 +1,53 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#pragma once +#include // for shared_ptr, unique_ptr +#include // for string +#include // for move + +#include "coll.h" // for Comm +#include "comm.h" // for Coll +#include "xgboost/collective/result.h" // for Result +#include "xgboost/collective/socket.h" // for GetHostName + +namespace xgboost::collective { +/** + * @brief Communicator group used for double dispatching between communicators and + * collective implementations. 
+ */ +class CommGroup { + std::shared_ptr comm_; + mutable std::shared_ptr gpu_comm_; + + std::shared_ptr backend_; + mutable std::shared_ptr gpu_coll_; // lazy initialization + + CommGroup(std::shared_ptr comm, std::shared_ptr coll) + : comm_{std::move(comm)}, backend_{std::move(coll)} {} + + public: + CommGroup(); + + [[nodiscard]] auto World() const { return comm_->World(); } + [[nodiscard]] auto Rank() const { return comm_->Rank(); } + [[nodiscard]] bool IsDistributed() const { return comm_->IsDistributed(); } + + [[nodiscard]] static CommGroup* Create(Json config); + + [[nodiscard]] std::shared_ptr Backend(DeviceOrd device) const; + [[nodiscard]] Comm const& Ctx(Context const* ctx, DeviceOrd device) const; + [[nodiscard]] Result SignalError(Result const& res) { return comm_->SignalError(res); } + + [[nodiscard]] Result ProcessorName(std::string* out) const { + auto rc = GetHostName(out); + return rc; + } +}; + +std::unique_ptr& GlobalCommGroup(); + +void GlobalCommGroupInit(Json config); + +void GlobalCommGroupFinalize(); +} // namespace xgboost::collective diff --git a/src/collective/tracker.cc b/src/collective/tracker.cc index 4837e2ace..88c51d8a9 100644 --- a/src/collective/tracker.cc +++ b/src/collective/tracker.cc @@ -58,36 +58,35 @@ Result Tracker::WaitUntilReady() const { RabitTracker::WorkerProxy::WorkerProxy(std::int32_t world, TCPSocket sock, SockAddrV4 addr) : sock_{std::move(sock)} { - auto host = addr.Addr(); - std::int32_t rank{0}; - rc_ = Success() - << [&] { return proto::Magic{}.Verify(&sock_); } - << [&] { return proto::Connect{}.TrackerRecv(&sock_, &world_, &rank, &task_id_); }; - if (!rc_.OK()) { - return; - } - - std::string cmd; - sock_.Recv(&cmd); - auto jcmd = Json::Load(StringView{cmd}); - cmd_ = static_cast(get(jcmd["cmd"])); + Json jcmd; std::int32_t port{0}; - if (cmd_ == proto::CMD::kStart) { - proto::Start start; - rc_ = start.TrackerHandle(jcmd, &world_, world, &port, &sock_, &eport_); - } else if (cmd_ == proto::CMD::kPrint) { - proto::Print print; - rc_ = print.TrackerHandle(jcmd, &msg_); - } else if (cmd_ == proto::CMD::kError) { - proto::ErrorCMD error; - rc_ = error.TrackerHandle(jcmd, &msg_, &code_); - } - if (!rc_.OK()) { - return; - } - info_ = proto::PeerInfo{host, port, rank}; + rc_ = Success() << [&] { return proto::Magic{}.Verify(&sock_); } << [&] { + return proto::Connect{}.TrackerRecv(&sock_, &world_, &rank, &task_id_); + } << [&] { + std::string cmd; + sock_.Recv(&cmd); + jcmd = Json::Load(StringView{cmd}); + cmd_ = static_cast(get(jcmd["cmd"])); + return Success(); + } << [&] { + if (cmd_ == proto::CMD::kStart) { + proto::Start start; + return start.TrackerHandle(jcmd, &world_, world, &port, &sock_, &eport_); + } else if (cmd_ == proto::CMD::kPrint) { + proto::Print print; + return print.TrackerHandle(jcmd, &msg_); + } else if (cmd_ == proto::CMD::kError) { + proto::ErrorCMD error; + return error.TrackerHandle(jcmd, &msg_, &code_); + } + return Success(); + } << [&] { + auto host = addr.Addr(); + info_ = proto::PeerInfo{host, port, rank}; + return Success(); + }; } RabitTracker::RabitTracker(Json const& config) : Tracker{config} { @@ -137,15 +136,18 @@ Result RabitTracker::Bootstrap(std::vector* p_workers) { std::int32_t n_shutdown{0}; bool during_restart{false}; + bool running{false}; std::vector pending; explicit State(std::int32_t world) : n_workers{world} {} State(State const& that) = delete; State& operator=(State&& that) = delete; + // modifiers void Start(WorkerProxy&& worker) { CHECK_LT(pending.size(), n_workers); 
CHECK_LE(n_shutdown, n_workers);
+      CHECK(!running);
 
       pending.emplace_back(std::forward<WorkerProxy>(worker));
 
@@ -155,6 +157,7 @@ Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
       CHECK_GE(n_shutdown, 0);
       CHECK_LT(n_shutdown, n_workers);
+      running = false;
 
       ++n_shutdown;
       CHECK_LE(n_shutdown, n_workers);
@@ -163,21 +166,26 @@
       CHECK_LE(pending.size(), n_workers);
       CHECK_LE(n_shutdown, n_workers);
+      running = false;
       during_restart = true;
     }
-    [[nodiscard]] bool Ready() const {
-      CHECK_LE(pending.size(), n_workers);
-      return static_cast<std::int32_t>(pending.size()) == n_workers;
-    }
     void Bootstrap() {
       CHECK_EQ(pending.size(), n_workers);
       CHECK_LE(n_shutdown, n_workers);
+      running = true;
 
+      // A reset.
       n_shutdown = 0;
       during_restart = false;
       pending.clear();
     }
+
+    // observers
+    [[nodiscard]] bool Ready() const {
+      CHECK_LE(pending.size(), n_workers);
+      return static_cast<std::int32_t>(pending.size()) == n_workers;
+    }
     [[nodiscard]] bool ShouldContinue() const {
       CHECK_LE(pending.size(), n_workers);
       CHECK_LE(n_shutdown, n_workers);
@@ -187,7 +195,31 @@
     }
   };
 
-  return std::async(std::launch::async, [this] {
+  auto handle_error = [&](WorkerProxy const& worker) {
+    auto msg = worker.Msg();
+    auto code = worker.Code();
+    LOG(WARNING) << "Received error from [" << worker.Host() << ":" << worker.Rank() << "]: " << msg
+                 << " code:" << code;
+    auto host = worker.Host();
+    // We signal all workers for the error, if they haven't aborted already.
+    for (auto& w : worker_error_handles_) {
+      if (w.first == host) {
+        continue;
+      }
+      TCPSocket out;
+      // Connecting to the error port as a signal for exit.
+      //
+      // retry is set to 1, just let the worker timeout or error. Otherwise the
+      // tracker and the worker might be waiting for each other.
+      auto rc = Connect(w.first, w.second, 1, timeout_, &out);
+      if (!rc.OK()) {
+        return Fail("Failed to inform workers to stop.");
+      }
+    }
+    return Success();
+  };
+
+  return std::async(std::launch::async, [this, handle_error] {
     State state{this->n_workers_};
 
     while (state.ShouldContinue()) {
@@ -205,6 +237,16 @@
       }
       switch (worker.Command()) {
         case proto::CMD::kStart: {
+          if (state.running) {
+            // Something went wrong with one of the workers. It got disconnected without
+            // notice.
+            state.Error();
+            rc = handle_error(worker);
+            if (!rc.OK()) {
+              return Fail("Failed to handle abort.", std::move(rc));
+            }
+          }
+
           state.Start(std::move(worker));
           if (state.Ready()) {
             rc = this->Bootstrap(&state.pending);
@@ -216,36 +258,20 @@
           continue;
         }
         case proto::CMD::kShutdown: {
+          if (state.during_restart) {
+            // The worker can still send shutdown after call to `std::exit`.
+            continue;
+          }
           state.Shutdown();
           continue;
         }
         case proto::CMD::kError: {
          if (state.during_restart) {
+            // Ignore further errors.
            continue;
          }
          state.Error();
-          auto msg = worker.Msg();
-          auto code = worker.Code();
-          LOG(WARNING) << "Recieved error from [" << worker.Host() << ":" << worker.Rank()
-                       << "]: " << msg << " code:" << code;
-          auto host = worker.Host();
-          // We signal all workers for the error, if they haven't aborted already.
-          for (auto& w : worker_error_handles_) {
-            if (w.first == host) {
-              continue;
-            }
-            TCPSocket out;
-            // retry is set to 1, just let the worker timeout or error. Otherwise the
-            // tracker and the worker might be waiting for each other.
- auto rc = Connect(w.first, w.second, 1, timeout_, &out); - // send signal to stop the worker. - proto::ShutdownCMD shutdown; - rc = shutdown.Send(&out); - if (!rc.OK()) { - return Fail("Failed to inform workers to stop."); - } - } - + rc = handle_error(worker); continue; } case proto::CMD::kPrint: { diff --git a/tests/cpp/collective/test_comm_group.cc b/tests/cpp/collective/test_comm_group.cc new file mode 100644 index 000000000..0f6bc23a2 --- /dev/null +++ b/tests/cpp/collective/test_comm_group.cc @@ -0,0 +1,63 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include +#include // for Json + +#include // for seconds +#include // for int32_t +#include // for string +#include // for thread + +#include "../../../src/collective/comm.h" +#include "../../../src/collective/comm_group.h" +#include "../../../src/common/common.h" // for AllVisibleGPUs +#include "../helpers.h" // for MakeCUDACtx +#include "test_worker.h" // for TestDistributed + +namespace xgboost::collective { +namespace { +auto MakeConfig(std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { + Json config{Object{}}; + config["dmlc_communicator"] = std::string{"rabit"}; + config["DMLC_TRACKER_URI"] = host; + config["DMLC_TRACKER_PORT"] = port; + config["dmlc_timeout_sec"] = static_cast(timeout.count()); + config["DMLC_TASK_ID"] = std::to_string(r); + config["dmlc_retry"] = 2; + return config; +} + +class CommGroupTest : public SocketTest {}; +} // namespace + +TEST_F(CommGroupTest, Basic) { + std::int32_t n_workers = std::min(std::thread::hardware_concurrency(), 5u); + TestDistributed(n_workers, [&](std::string host, std::int32_t port, std::chrono::seconds timeout, + std::int32_t r) { + Context ctx; + auto config = MakeConfig(host, port, timeout, r); + std::unique_ptr ptr{CommGroup::Create(config)}; + ASSERT_TRUE(ptr->IsDistributed()); + ASSERT_EQ(ptr->World(), n_workers); + auto const& comm = ptr->Ctx(&ctx, DeviceOrd::CPU()); + ASSERT_EQ(comm.TaskID(), std::to_string(r)); + ASSERT_EQ(comm.Retry(), 2); + }); +} + +#if defined(XGBOOST_USE_NCCL) +TEST_F(CommGroupTest, BasicGPU) { + std::int32_t n_workers = common::AllVisibleGPUs(); + TestDistributed(n_workers, [&](std::string host, std::int32_t port, std::chrono::seconds timeout, + std::int32_t r) { + auto ctx = MakeCUDACtx(r); + auto config = MakeConfig(host, port, timeout, r); + std::unique_ptr ptr{CommGroup::Create(config)}; + auto const& comm = ptr->Ctx(&ctx, DeviceOrd::CUDA(0)); + ASSERT_EQ(comm.TaskID(), std::to_string(r)); + ASSERT_EQ(comm.Retry(), 2); + }); +} +#endif // for defined(XGBOOST_USE_NCCL) +} // namespace xgboost::collective diff --git a/tests/cpp/collective/test_worker.h b/tests/cpp/collective/test_worker.h index 6578ff142..ad3213e81 100644 --- a/tests/cpp/collective/test_worker.h +++ b/tests/cpp/collective/test_worker.h @@ -95,7 +95,8 @@ void TestDistributed(std::int32_t n_workers, WorkerFn worker_fn) { std::chrono::seconds timeout{1}; std::string host; - ASSERT_TRUE(GetHostAddress(&host).OK()); + auto rc = GetHostAddress(&host); + ASSERT_TRUE(rc.OK()) << rc.Report(); RabitTracker tracker{StringView{host}, n_workers, 0, timeout}; auto fut = tracker.Run(); diff --git a/tests/cpp/common/test_linalg.cc b/tests/cpp/common/test_linalg.cc index f345b3a78..21c5ad30d 100644 --- a/tests/cpp/common/test_linalg.cc +++ b/tests/cpp/common/test_linalg.cc @@ -15,6 +15,15 @@ namespace xgboost::linalg { namespace { DeviceOrd CPU() { return DeviceOrd::CPU(); } + +template +void ConstView(linalg::VectorView v1, linalg::VectorView> v2) { + // 
compile test for being able to pass non-const view to const view. + auto s = v1.Slice(linalg::All()); + ASSERT_EQ(s.Size(), v1.Size()); + auto s2 = v2.Slice(linalg::All()); + ASSERT_EQ(s2.Size(), v2.Size()); +} } // namespace auto MakeMatrixFromTest(HostDeviceVector *storage, std::size_t n_rows, std::size_t n_cols) { @@ -206,6 +215,11 @@ TEST(Linalg, TensorView) { ASSERT_TRUE(t.FContiguous()); ASSERT_FALSE(t.CContiguous()); } + { + // const + TensorView t{data, {data.size()}, CPU()}; + ConstView(t, t); + } } TEST(Linalg, Tensor) { diff --git a/tests/cpp/plugin/federated/test_federated_coll.cu b/tests/cpp/plugin/federated/test_federated_coll.cu index 44211f8d7..a6ec7e352 100644 --- a/tests/cpp/plugin/federated/test_federated_coll.cu +++ b/tests/cpp/plugin/federated/test_federated_coll.cu @@ -124,6 +124,9 @@ TEST_F(FederatedCollTestGPU, Allgather) { TEST_F(FederatedCollTestGPU, AllgatherV) { std::int32_t n_workers = 2; + if (common::AllVisibleGPUs() < n_workers) { + GTEST_SKIP_("At least 2 GPUs are required for the test."); + } TestFederated(n_workers, [=](std::shared_ptr comm, std::int32_t rank) { TestAllgatherV(comm, rank); }); diff --git a/tests/cpp/plugin/federated/test_federated_comm.cc b/tests/cpp/plugin/federated/test_federated_comm.cc index b45b00910..0d0692b5f 100644 --- a/tests/cpp/plugin/federated/test_federated_comm.cc +++ b/tests/cpp/plugin/federated/test_federated_comm.cc @@ -1,6 +1,7 @@ /** * Copyright 2022-2023, XGBoost contributors */ +#include #include #include // for string @@ -19,12 +20,14 @@ class FederatedCommTest : public SocketTest {}; TEST_F(FederatedCommTest, ThrowOnWorldSizeTooSmall) { auto construct = [] { FederatedComm comm{"localhost", 0, 0, 0}; }; - ExpectThrow("Invalid world size.", construct); + ASSERT_THAT(construct, + ::testing::ThrowsMessage(::testing::HasSubstr("Invalid world size"))); } TEST_F(FederatedCommTest, ThrowOnRankTooSmall) { auto construct = [] { FederatedComm comm{"localhost", 0, 1, -1}; }; - ExpectThrow("Invalid worker rank.", construct); + ASSERT_THAT(construct, + ::testing::ThrowsMessage(::testing::HasSubstr("Invalid worker rank."))); } TEST_F(FederatedCommTest, ThrowOnRankTooBig) { @@ -38,7 +41,7 @@ TEST_F(FederatedCommTest, ThrowOnWorldSizeNotInteger) { config["federated_server_address"] = std::string("localhost:0"); config["federated_world_size"] = std::string("1"); config["federated_rank"] = Integer(0); - FederatedComm comm(config); + FederatedComm comm{DefaultRetry(), std::chrono::seconds{DefaultTimeoutSec()}, "", config}; }; ExpectThrow("got: `String`", construct); } @@ -49,7 +52,7 @@ TEST_F(FederatedCommTest, ThrowOnRankNotInteger) { config["federated_server_address"] = std::string("localhost:0"); config["federated_world_size"] = 1; config["federated_rank"] = std::string("0"); - FederatedComm comm(config); + FederatedComm comm(DefaultRetry(), std::chrono::seconds{DefaultTimeoutSec()}, "", config); }; ExpectThrow("got: `String`", construct); } @@ -59,7 +62,7 @@ TEST_F(FederatedCommTest, GetWorldSizeAndRank) { config["federated_world_size"] = 6; config["federated_rank"] = 3; config["federated_server_address"] = String{"localhost:0"}; - FederatedComm comm{config}; + FederatedComm comm{DefaultRetry(), std::chrono::seconds{DefaultTimeoutSec()}, "", config}; EXPECT_EQ(comm.World(), 6); EXPECT_EQ(comm.Rank(), 3); } diff --git a/tests/cpp/plugin/federated/test_federated_comm_group.cc b/tests/cpp/plugin/federated/test_federated_comm_group.cc new file mode 100644 index 000000000..9bfbdd3ae --- /dev/null +++ 
b/tests/cpp/plugin/federated/test_federated_comm_group.cc @@ -0,0 +1,22 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include +#include // for Json + +#include "../../../../src/collective/comm_group.h" +#include "../../helpers.h" +#include "test_worker.h" + +namespace xgboost::collective { +TEST(CommGroup, Federated) { + std::int32_t n_workers = common::AllVisibleGPUs(); + TestFederatedGroup(n_workers, [&](std::shared_ptr comm_group, std::int32_t r) { + Context ctx; + ASSERT_EQ(comm_group->Rank(), r); + auto const& comm = comm_group->Ctx(&ctx, DeviceOrd::CPU()); + ASSERT_EQ(comm.TaskID(), std::to_string(r)); + ASSERT_EQ(comm.Retry(), 2); + }); +} +} // namespace xgboost::collective diff --git a/tests/cpp/plugin/federated/test_federated_comm_group.cu b/tests/cpp/plugin/federated/test_federated_comm_group.cu new file mode 100644 index 000000000..747adb6fd --- /dev/null +++ b/tests/cpp/plugin/federated/test_federated_comm_group.cu @@ -0,0 +1,22 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include +#include // for Json + +#include "../../../../src/collective/comm_group.h" +#include "../../helpers.h" +#include "test_worker.h" + +namespace xgboost::collective { +TEST(CommGroup, FederatedGPU) { + std::int32_t n_workers = common::AllVisibleGPUs(); + TestFederatedGroup(n_workers, [&](std::shared_ptr comm_group, std::int32_t r) { + Context ctx = MakeCUDACtx(0); + auto const& comm = comm_group->Ctx(&ctx, DeviceOrd::CUDA(0)); + ASSERT_EQ(comm_group->Rank(), r); + ASSERT_EQ(comm.TaskID(), std::to_string(r)); + ASSERT_EQ(comm.Retry(), 2); + }); +} +} // namespace xgboost::collective diff --git a/tests/cpp/plugin/federated/test_worker.h b/tests/cpp/plugin/federated/test_worker.h index 38bc32c60..d0edecc15 100644 --- a/tests/cpp/plugin/federated/test_worker.h +++ b/tests/cpp/plugin/federated/test_worker.h @@ -5,10 +5,12 @@ #include -#include // for ms +#include // for ms, seconds +#include // for shared_ptr #include // for thread #include "../../../../plugin/federated/federated_tracker.h" +#include "../../../../src/collective/comm_group.h" #include "federated_comm.h" // for FederatedComm #include "xgboost/json.h" // for Json @@ -23,9 +25,8 @@ void TestFederated(std::int32_t n_workers, WorkerFn&& fn) { std::vector workers; using namespace std::chrono_literals; - while (tracker.Port() == 0) { - std::this_thread::sleep_for(100ms); - } + auto rc = tracker.WaitUntilReady(); + ASSERT_TRUE(rc.OK()) << rc.Report(); std::int32_t port = tracker.Port(); for (std::int32_t i = 0; i < n_workers; ++i) { @@ -34,7 +35,8 @@ void TestFederated(std::int32_t n_workers, WorkerFn&& fn) { config["federated_world_size"] = n_workers; config["federated_rank"] = i; config["federated_server_address"] = "0.0.0.0:" + std::to_string(port); - auto comm = std::make_shared(config); + auto comm = std::make_shared( + DefaultRetry(), std::chrono::seconds{DefaultTimeoutSec()}, std::to_string(i), config); fn(comm, i); }); @@ -44,7 +46,43 @@ void TestFederated(std::int32_t n_workers, WorkerFn&& fn) { t.join(); } - auto rc = tracker.Shutdown(); + rc = tracker.Shutdown(); + ASSERT_TRUE(rc.OK()) << rc.Report(); + ASSERT_TRUE(fut.get().OK()); +} + +template +void TestFederatedGroup(std::int32_t n_workers, WorkerFn&& fn) { + Json config{Object()}; + config["federated_secure"] = Boolean{false}; + config["n_workers"] = Integer{n_workers}; + FederatedTracker tracker{config}; + auto fut = tracker.Run(); + + std::vector workers; + auto rc = tracker.WaitUntilReady(); + ASSERT_TRUE(rc.OK()) << rc.Report(); + std::int32_t port = 
tracker.Port();
+
+  for (std::int32_t i = 0; i < n_workers; ++i) {
+    workers.emplace_back([=] {
+      Json config{Object{}};
+      config["dmlc_communicator"] = std::string{"federated"};
+      config["dmlc_task_id"] = std::to_string(i);
+      config["dmlc_retry"] = 2;
+      config["federated_world_size"] = n_workers;
+      config["federated_rank"] = i;
+      config["federated_server_address"] = "0.0.0.0:" + std::to_string(port);
+      std::shared_ptr<CommGroup> comm_group{CommGroup::Create(config)};
+      fn(comm_group, i);
+    });
+  }
+
+  for (auto& t : workers) {
+    t.join();
+  }
+
+  rc = tracker.Shutdown();
   ASSERT_TRUE(rc.OK()) << rc.Report();
   ASSERT_TRUE(fut.get().OK());
 }

From 06bdc15e9b72179c4bcdd01d43ca452ed72d5753 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 8 Nov 2023 09:54:05 +0800
Subject: [PATCH 06/32] [coll] Pass context to various functions. (#9772)

* [coll] Pass context to various functions.

In the future, the `Context` object will be required for collective
operations; this PR passes the context object to some of the required
functions to prepare for swapping out the implementation.
---
 include/xgboost/data.h | 2 +-
 include/xgboost/linalg.h | 12 +-
 plugin/federated/federated_coll.cc | 6 +-
 src/collective/allreduce.cc | 3 +
 src/collective/comm.cu | 3 +-
 src/collective/comm_group.cc | 2 +-
 src/common/device_helpers.cuh | 3 +-
 src/common/hist_util.cc | 4 +-
 src/common/hist_util.cu | 2 +-
 src/common/quantile.cc | 27 ++--
 src/common/quantile.cu | 13 +-
 src/common/quantile.cuh | 4 +-
 src/common/quantile.h | 9 +-
 src/data/data.cc | 2 +-
 src/data/iterative_dmatrix.cc | 12 +-
 src/data/iterative_dmatrix.cu | 4 +-
 src/data/simple_dmatrix.cc | 2 +-
 src/data/simple_dmatrix.cu | 2 +-
 src/data/sparse_page_dmatrix.cc | 2 +-
 src/learner.cc | 4 +-
 src/metric/auc.cu | 28 ++---
 src/metric/elementwise_metric.cu | 6 +-
 src/metric/rank_metric.cc | 16 +--
 src/metric/survival_metric.cu | 2 +-
 src/predictor/cpu_predictor.cc | 35 +++---
 src/predictor/gpu_predictor.cu | 14 +--
 src/tree/gpu_hist/evaluate_splits.cu | 33 ++---
 src/tree/gpu_hist/evaluate_splits.cuh | 4 +-
 src/tree/gpu_hist/histogram.cu | 11 +-
 src/tree/gpu_hist/histogram.cuh | 14 ++-
 src/tree/hist/histogram.h | 9 +-
 src/tree/hist/param.cc | 2 +-
 src/tree/hist/param.h | 2 +-
 src/tree/updater_approx.cc | 4 +-
 src/tree/updater_gpu_hist.cu | 10 +-
 src/tree/updater_quantile_hist.cc | 10 +-
 tests/cpp/common/test_hist_util.cu | 49 +++++---
 tests/cpp/common/test_quantile.cc | 8 +-
 tests/cpp/common/test_quantile.cu | 6 +-
 tests/cpp/plugin/helpers.h | 1 +
 tests/cpp/test_learner.cc | 5 +-
 tests/cpp/tree/gpu_hist/test_evaluate_splits.cu | 115 ++++++++++--------
 tests/cpp/tree/gpu_hist/test_histogram.cu | 6 +-
 tests/cpp/tree/hist/test_histogram.cc | 14 +--
 tests/cpp/tree/test_gpu_hist.cu | 8 +-
 45 files changed, 275 insertions(+), 255 deletions(-)

diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index 04b489d8b..69176994b 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -178,7 +178,7 @@ class MetaInfo {
    * in vertical federated learning, since each worker loads its own list of columns,
    * we need to sum them.
    */
-  void SynchronizeNumberOfColumns();
+  void SynchronizeNumberOfColumns(Context const* ctx);
 
   /*! \brief Whether the data is split row-wise.
*/ bool IsRowSplit() const { diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index 901c9ae91..8806818fb 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -582,20 +582,20 @@ auto MakeTensorView(Context const *ctx, Container &data, S &&...shape) { // NOL return TensorView{data, in_shape, ctx->Device()}; } -template -LINALG_HD auto MakeTensorView(DeviceOrd device, common::Span data, S &&...shape) { +template +LINALG_HD auto MakeTensorView(DeviceOrd device, common::Span data, S &&...shape) { std::size_t in_shape[sizeof...(S)]; detail::IndexToArr(in_shape, std::forward(shape)...); return TensorView{data, in_shape, device}; } -template -auto MakeTensorView(Context const *ctx, common::Span data, S &&...shape) { +template +auto MakeTensorView(Context const *ctx, common::Span data, S &&...shape) { return MakeTensorView(ctx->Device(), data, std::forward(shape)...); } -template -auto MakeTensorView(Context const *ctx, Order order, common::Span data, S &&...shape) { +template +auto MakeTensorView(Context const *ctx, Order order, common::Span data, S &&...shape) { std::size_t in_shape[sizeof...(S)]; detail::IndexToArr(in_shape, std::forward(shape)...); return TensorView{data, in_shape, ctx->Device(), order}; diff --git a/plugin/federated/federated_coll.cc b/plugin/federated/federated_coll.cc index 7c25eeba5..980992d61 100644 --- a/plugin/federated/federated_coll.cc +++ b/plugin/federated/federated_coll.cc @@ -29,7 +29,7 @@ namespace { auto stub = fed->Handle(); BroadcastRequest request; - request.set_sequence_number(*sequence_number++); + request.set_sequence_number((*sequence_number)++); request.set_rank(comm.Rank()); if (comm.Rank() != root) { request.set_send_buffer(nullptr, 0); @@ -90,9 +90,9 @@ Coll *FederatedColl::MakeCUDAVar() { [[nodiscard]] Result FederatedColl::Broadcast(Comm const &comm, common::Span data, std::int32_t root) { if (comm.Rank() == root) { - return BroadcastImpl(comm, &sequence_number_, data, root); + return BroadcastImpl(comm, &this->sequence_number_, data, root); } else { - return BroadcastImpl(comm, &sequence_number_, data, root); + return BroadcastImpl(comm, &this->sequence_number_, data, root); } } diff --git a/src/collective/allreduce.cc b/src/collective/allreduce.cc index 6948f6758..65c066868 100644 --- a/src/collective/allreduce.cc +++ b/src/collective/allreduce.cc @@ -62,6 +62,9 @@ Result RingScatterReduceTyped(Comm const& comm, common::Span data, Result RingAllreduce(Comm const& comm, common::Span data, Func const& op, ArrayInterfaceHandler::Type type) { + if (comm.World() == 1) { + return Success(); + } return DispatchDType(type, [&](auto t) { using T = decltype(t); // Divide the data into segments according to the number of workers. 
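The new early return in the allreduce.cc hunk above makes `RingAllreduce` a no-op when `comm.World() == 1`, and the surviving comment notes that the buffer is divided into segments according to the number of workers. A minimal host-side sketch of those two ideas follows; the names (`DivideSegments`, `AllreduceSketch`) and the remainder-distribution rule are illustrative assumptions, not the XGBoost API:

```
// Sketch only: shows the single-worker shortcut and the per-worker segment
// arithmetic a ring scatter-reduce starts from; not the real implementation.
#include <cstddef>
#include <iostream>
#include <vector>

struct Segment {
  std::size_t offset;
  std::size_t size;
};

// Split n elements into `world` contiguous segments whose sizes differ by at
// most one (here the first n % world segments take the extra element).
std::vector<Segment> DivideSegments(std::size_t n, std::size_t world) {
  std::vector<Segment> segs(world);
  std::size_t base = n / world, rem = n % world, offset = 0;
  for (std::size_t r = 0; r < world; ++r) {
    segs[r] = {offset, base + (r < rem ? 1 : 0)};
    offset += segs[r].size;
  }
  return segs;
}

bool AllreduceSketch(std::size_t world, std::vector<double>* data) {
  if (world == 1) {
    return true;  // Same shortcut as the patch: one worker already holds the result.
  }
  auto segs = DivideSegments(data->size(), world);
  // A real ring would now run world - 1 scatter-reduce steps (each step
  // reducing one segment) followed by world - 1 allgather steps.
  (void)segs;
  return true;
}

int main() {
  for (auto s : DivideSegments(10, 4)) {
    std::cout << s.offset << "+" << s.size << " ";  // prints: 0+3 3+3 6+2 8+2
  }
}
```

With a single worker the local buffer already equals the global result, so returning `Success()` before any segment exchange avoids running a degenerate zero-step ring.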
diff --git a/src/collective/comm.cu b/src/collective/comm.cu index 09faf31cd..09edc522d 100644 --- a/src/collective/comm.cu +++ b/src/collective/comm.cu @@ -10,6 +10,7 @@ #include // for stringstream #include // for vector +#include "../common/cuda_context.cuh" // for CUDAContext #include "../common/device_helpers.cuh" // for DefaultStream #include "../common/type.h" // for EraseType #include "broadcast.h" // for Broadcast @@ -60,7 +61,7 @@ Comm* Comm::MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const { NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr pimpl) : Comm{root.TrackerInfo().host, root.TrackerInfo().port, root.Timeout(), root.Retry(), root.TaskID()}, - stream_{dh::DefaultStream()} { + stream_{ctx->CUDACtx()->Stream()} { this->world_ = root.World(); this->rank_ = root.Rank(); this->domain_ = root.Domain(); diff --git a/src/collective/comm_group.cc b/src/collective/comm_group.cc index 570500843..3d2e24492 100644 --- a/src/collective/comm_group.cc +++ b/src/collective/comm_group.cc @@ -105,7 +105,7 @@ CommGroup::CommGroup() } std::unique_ptr& GlobalCommGroup() { - static std::unique_ptr sptr; + static thread_local std::unique_ptr sptr; if (!sptr) { Json config{Null{}}; sptr.reset(CommGroup::Create(config)); diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 89b3ad2e6..74336ac61 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -480,7 +480,8 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { cub::CachingDeviceAllocator& GetGlobalCachingAllocator() { // Configure allocator with maximum cached bin size of ~1GB and no limit on // maximum cached bytes - thread_local cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29); + thread_local std::unique_ptr allocator{ + std::make_unique(2, 9, 29)}; return *allocator; } pointer allocate(size_t n) { // NOLINT diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 65ab18630..f10124792 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -51,7 +51,7 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins for (auto const &page : m->GetBatches()) { container.PushRowPage(page, info, hessian); } - container.MakeCuts(m->Info(), &out); + container.MakeCuts(ctx, m->Info(), &out); } else { SortedSketchContainer container{ctx, max_bins, @@ -61,7 +61,7 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins for (auto const &page : m->GetBatches(ctx)) { container.PushColPage(page, info, hessian); } - container.MakeCuts(m->Info(), &out); + container.MakeCuts(ctx, m->Info(), &out); } return out; diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index 1f06c2a6f..fbe6356bf 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -359,7 +359,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b } } - sketch_container.MakeCuts(&cuts, p_fmat->Info().IsColumnSplit()); + sketch_container.MakeCuts(ctx, &cuts, p_fmat->Info().IsColumnSplit()); return cuts; } } // namespace xgboost::common diff --git a/src/common/quantile.cc b/src/common/quantile.cc index 5250abd0f..0490add26 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -11,9 +11,7 @@ #include "categorical.h" #include "hist_util.h" -namespace xgboost { -namespace common { - +namespace xgboost::common { template SketchContainerImpl::SketchContainerImpl(Context const *ctx, std::vector columns_size, @@ -129,7 +127,7 @@ 
struct QuantileAllreduce { * \param rank rank of target worker * \param fidx feature idx */ - auto Values(int32_t rank, bst_feature_t fidx) const { + [[nodiscard]] auto Values(int32_t rank, bst_feature_t fidx) const { // get span for worker auto wsize = worker_indptr[rank + 1] - worker_indptr[rank]; auto worker_values = global_values.subspan(worker_indptr[rank], wsize); @@ -145,7 +143,7 @@ struct QuantileAllreduce { template void SketchContainerImpl::GatherSketchInfo( - MetaInfo const& info, + Context const *, MetaInfo const &info, std::vector const &reduced, std::vector *p_worker_segments, std::vector *p_sketches_scan, std::vector *p_global_sketches) { @@ -206,7 +204,7 @@ void SketchContainerImpl::GatherSketchInfo( } template -void SketchContainerImpl::AllreduceCategories(MetaInfo const& info) { +void SketchContainerImpl::AllreduceCategories(Context const*, MetaInfo const& info) { auto world_size = collective::GetWorldSize(); auto rank = collective::GetRank(); if (world_size == 1 || info.IsColumnSplit()) { @@ -274,16 +272,15 @@ void SketchContainerImpl::AllreduceCategories(MetaInfo const& info) { template void SketchContainerImpl::AllReduce( - MetaInfo const& info, - std::vector *p_reduced, - std::vector* p_num_cuts) { + Context const *ctx, MetaInfo const &info, + std::vector *p_reduced, std::vector *p_num_cuts) { monitor_.Start(__func__); size_t n_columns = sketches_.size(); collective::Allreduce(&n_columns, 1); CHECK_EQ(n_columns, sketches_.size()) << "Number of columns differs across workers"; - AllreduceCategories(info); + AllreduceCategories(ctx, info); auto& num_cuts = *p_num_cuts; CHECK_EQ(num_cuts.size(), 0); @@ -324,7 +321,7 @@ void SketchContainerImpl::AllReduce( std::vector sketches_scan((n_columns + 1) * world, 0); std::vector global_sketches; - this->GatherSketchInfo(info, reduced, &worker_segments, &sketches_scan, &global_sketches); + this->GatherSketchInfo(ctx, info, reduced, &worker_segments, &sketches_scan, &global_sketches); std::vector final_sketches(n_columns); @@ -383,11 +380,12 @@ auto AddCategories(std::set const &categories, HistogramCuts *cuts) { } template -void SketchContainerImpl::MakeCuts(MetaInfo const &info, HistogramCuts *p_cuts) { +void SketchContainerImpl::MakeCuts(Context const *ctx, MetaInfo const &info, + HistogramCuts *p_cuts) { monitor_.Start(__func__); std::vector reduced; std::vector num_cuts; - this->AllReduce(info, &reduced, &num_cuts); + this->AllReduce(ctx, info, &reduced, &num_cuts); p_cuts->min_vals_.HostVector().resize(sketches_.size(), 0.0f); std::vector final_summaries(reduced.size()); @@ -496,5 +494,4 @@ void SortedSketchContainer::PushColPage(SparsePage const &page, MetaInfo const & }); monitor_.Stop(__func__); } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/quantile.cu b/src/common/quantile.cu index 2bf6070d5..4b110f5e0 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -22,9 +22,7 @@ #include "transform_iterator.h" // MakeIndexTransformIter #include "xgboost/span.h" -namespace xgboost { -namespace common { - +namespace xgboost::common { using WQSketch = HostSketchContainer::WQSketch; using SketchEntry = WQSketch::Entry; @@ -501,7 +499,7 @@ void SketchContainer::FixError() { }); } -void SketchContainer::AllReduce(bool is_column_split) { +void SketchContainer::AllReduce(Context const*, bool is_column_split) { dh::safe_cuda(cudaSetDevice(device_.ordinal)); auto world = collective::GetWorldSize(); if (world == 1 || is_column_split) { @@ -582,13 +580,13 @@ struct 
InvalidCatOp { }; } // anonymous namespace -void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) { +void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool is_column_split) { timer_.Start(__func__); dh::safe_cuda(cudaSetDevice(device_.ordinal)); p_cuts->min_vals_.Resize(num_columns_); // Sync between workers. - this->AllReduce(is_column_split); + this->AllReduce(ctx, is_column_split); // Prune to final number of bins. this->Prune(num_bins_ + 1); @@ -731,5 +729,4 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) { p_cuts->SetCategorical(this->has_categorical_, max_cat); timer_.Stop(__func__); } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index b47834782..f7124f079 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -151,9 +151,9 @@ class SketchContainer { Span that); /* \brief Merge quantiles from other GPU workers. */ - void AllReduce(bool is_column_split); + void AllReduce(Context const* ctx, bool is_column_split); /* \brief Create the final histogram cut values. */ - void MakeCuts(HistogramCuts* cuts, bool is_column_split); + void MakeCuts(Context const* ctx, HistogramCuts* cuts, bool is_column_split); Span Data() const { return {this->Current().data().get(), this->Current().size()}; diff --git a/src/common/quantile.h b/src/common/quantile.h index 47db5f875..0af93a03e 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -827,13 +827,14 @@ class SketchContainerImpl { return group_ind; } // Gather sketches from all workers. - void GatherSketchInfo(MetaInfo const& info, + void GatherSketchInfo(Context const *ctx, MetaInfo const &info, std::vector const &reduced, std::vector *p_worker_segments, std::vector *p_sketches_scan, std::vector *p_global_sketches); // Merge sketches from all workers. - void AllReduce(MetaInfo const& info, std::vector *p_reduced, + void AllReduce(Context const *ctx, MetaInfo const &info, + std::vector *p_reduced, std::vector *p_num_cuts); template @@ -887,11 +888,11 @@ class SketchContainerImpl { /* \brief Push a CSR matrix. */ void PushRowPage(SparsePage const &page, MetaInfo const &info, Span hessian = {}); - void MakeCuts(MetaInfo const& info, HistogramCuts* cuts); + void MakeCuts(Context const *ctx, MetaInfo const &info, HistogramCuts *cuts); private: // Merge all categories from other workers. 
- void AllreduceCategories(MetaInfo const& info); + void AllreduceCategories(Context const* ctx, MetaInfo const& info); }; class HostSketchContainer : public SketchContainerImpl> { diff --git a/src/data/data.cc b/src/data/data.cc index 7e70fff3f..50f64a406 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -745,7 +745,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col } } -void MetaInfo::SynchronizeNumberOfColumns() { +void MetaInfo::SynchronizeNumberOfColumns(Context const*) { if (IsColumnSplit()) { collective::Allreduce(&num_col_, 1); } else { diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index 45f6286fb..e5aa98278 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -95,7 +95,7 @@ void GetCutsFromRef(Context const* ctx, std::shared_ptr ref, bst_featur namespace { // Synchronize feature type in case of empty DMatrix -void SyncFeatureType(std::vector* p_h_ft) { +void SyncFeatureType(Context const*, std::vector* p_h_ft) { if (!collective::IsDistributed()) { return; } @@ -193,7 +193,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, // From here on Info() has the correct data shape Info().num_row_ = accumulated_rows; Info().num_nonzero_ = nnz; - Info().SynchronizeNumberOfColumns(); + Info().SynchronizeNumberOfColumns(ctx); CHECK(std::none_of(column_sizes.cbegin(), column_sizes.cend(), [&](auto f) { return f > accumulated_rows; })) << "Something went wrong during iteration."; @@ -213,9 +213,9 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, while (iter.Next()) { if (!p_sketch) { h_ft = proxy->Info().feature_types.ConstHostVector(); - SyncFeatureType(&h_ft); - p_sketch.reset(new common::HostSketchContainer{ctx, p.max_bin, h_ft, column_sizes, - !proxy->Info().group_ptr_.empty()}); + SyncFeatureType(ctx, &h_ft); + p_sketch = std::make_unique(ctx, p.max_bin, h_ft, column_sizes, + !proxy->Info().group_ptr_.empty()); } HostAdapterDispatch(proxy, [&](auto const& batch) { proxy->Info().num_nonzero_ = batch_nnz[i]; @@ -230,7 +230,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, CHECK_EQ(accumulated_rows, Info().num_row_); CHECK(p_sketch); - p_sketch->MakeCuts(Info(), &cuts); + p_sketch->MakeCuts(ctx, Info(), &cuts); } if (!h_ft.empty()) { CHECK_EQ(h_ft.size(), n_features); diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 2fffd516b..09a3976d7 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -105,7 +105,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, sketch_containers.clear(); sketch_containers.shrink_to_fit(); - final_sketch.MakeCuts(&cuts, this->info_.IsColumnSplit()); + final_sketch.MakeCuts(ctx, &cuts, this->info_.IsColumnSplit()); } else { GetCutsFromRef(ctx, ref, Info().num_col_, p, &cuts); } @@ -167,7 +167,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, iter.Reset(); // Synchronise worker columns - info_.SynchronizeNumberOfColumns(); + info_.SynchronizeNumberOfColumns(ctx); } BatchSet IterativeDMatrix::GetEllpackBatches(Context const* ctx, diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 3814d74d2..2bf81892f 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -283,7 +283,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread, // Synchronise worker columns info_.data_split_mode = data_split_mode; 
ReindexFeatures(&ctx); - info_.SynchronizeNumberOfColumns(); + info_.SynchronizeNumberOfColumns(&ctx); if (adapter->NumRows() == kAdapterUnknownSize) { using IteratorAdapterT = diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu index e41d59394..e5b4d18f7 100644 --- a/src/data/simple_dmatrix.cu +++ b/src/data/simple_dmatrix.cu @@ -42,7 +42,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr info_.num_row_ = adapter->NumRows(); // Synchronise worker columns info_.data_split_mode = data_split_mode; - info_.SynchronizeNumberOfColumns(); + info_.SynchronizeNumberOfColumns(&ctx); this->fmat_ctx_ = ctx; } diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc index 042a75c56..f1754c1b5 100644 --- a/src/data/sparse_page_dmatrix.cc +++ b/src/data/sparse_page_dmatrix.cc @@ -97,7 +97,7 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p this->info_.num_col_ = n_features; this->info_.num_nonzero_ = nnz; - info_.SynchronizeNumberOfColumns(); + info_.SynchronizeNumberOfColumns(&ctx); CHECK_NE(info_.num_col_, 0); fmat_ctx_ = ctx; diff --git a/src/learner.cc b/src/learner.cc index 08c59ba60..6b0fd7e4b 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -209,7 +209,7 @@ struct LearnerModelParamLegacy : public dmlc::Parameter return dmlc::Parameter::UpdateAllowUnknown(kwargs); } // sanity check - void Validate() { + void Validate(Context const*) { if (!collective::IsDistributed()) { return; } @@ -434,7 +434,7 @@ class LearnerConfiguration : public Learner { } // Update the shared model parameter this->ConfigureModelParamWithoutBaseScore(); - mparam_.Validate(); + mparam_.Validate(&ctx_); } CHECK(!std::isnan(mparam_.base_score)); CHECK(!std::isinf(mparam_.base_score)); diff --git a/src/metric/auc.cu b/src/metric/auc.cu index a4838d783..8b8349e1b 100644 --- a/src/metric/auc.cu +++ b/src/metric/auc.cu @@ -199,9 +199,9 @@ void Transpose(common::Span in, common::Span out, size_t m, }); } -double ScaleClasses(common::Span results, common::Span local_area, - common::Span tp, common::Span auc, size_t n_classes) { - dh::XGBDeviceAllocator alloc; +double ScaleClasses(Context const *ctx, common::Span results, + common::Span local_area, common::Span tp, + common::Span auc, size_t n_classes) { if (collective::IsDistributed()) { int32_t device = dh::CurrentDevice(); CHECK_EQ(dh::CudaGetPointerDevice(results.data()), device); @@ -218,8 +218,8 @@ double ScaleClasses(common::Span results, common::Span local_are double tp_sum; double auc_sum; thrust::tie(auc_sum, tp_sum) = - thrust::reduce(thrust::cuda::par(alloc), reduce_in, reduce_in + n_classes, - Pair{0.0, 0.0}, PairPlus{}); + thrust::reduce(ctx->CUDACtx()->CTP(), reduce_in, reduce_in + n_classes, Pair{0.0, 0.0}, + PairPlus{}); if (tp_sum != 0 && !std::isnan(auc_sum)) { auc_sum /= tp_sum; } else { @@ -309,10 +309,10 @@ void SegmentedReduceAUC(common::Span d_unique_idx, * up each class in all kernels. */ template -double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device, +double GPUMultiClassAUCOVR(Context const *ctx, MetaInfo const &info, common::Span d_class_ptr, size_t n_classes, std::shared_ptr cache, Fn area_fn) { - dh::safe_cuda(cudaSetDevice(device.ordinal)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); /** * Sorted idx */ @@ -320,7 +320,7 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device, // Index is sorted within class. 
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx); - auto labels = info.labels.View(device); + auto labels = info.labels.View(ctx->Device()); auto weights = info.weights_.ConstDeviceSpan(); size_t n_samples = labels.Shape(0); @@ -328,12 +328,11 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device, if (n_samples == 0) { dh::TemporaryArray resutls(n_classes * 4, 0.0f); auto d_results = dh::ToSpan(resutls); - dh::LaunchN(n_classes * 4, - [=] XGBOOST_DEVICE(size_t i) { d_results[i] = 0.0f; }); + dh::LaunchN(n_classes * 4, [=] XGBOOST_DEVICE(size_t i) { d_results[i] = 0.0f; }); auto local_area = d_results.subspan(0, n_classes); auto tp = d_results.subspan(2 * n_classes, n_classes); auto auc = d_results.subspan(3 * n_classes, n_classes); - return ScaleClasses(d_results, local_area, tp, auc, n_classes); + return ScaleClasses(ctx, d_results, local_area, tp, auc, n_classes); } /** @@ -437,7 +436,7 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device, tp[c] = 1.0f; } }); - return ScaleClasses(d_results, local_area, tp, auc, n_classes); + return ScaleClasses(ctx, d_results, local_area, tp, auc, n_classes); } void MultiClassSortedIdx(Context const *ctx, common::Span predts, @@ -472,8 +471,7 @@ double GPUMultiClassROCAUC(Context const *ctx, common::Span predts, size_t /*class_id*/) { return TrapezoidArea(fp_prev, fp, tp_prev, tp); }; - return GPUMultiClassAUCOVR(info, ctx->Device(), dh::ToSpan(class_ptr), n_classes, cache, - fn); + return GPUMultiClassAUCOVR(ctx, info, dh::ToSpan(class_ptr), n_classes, cache, fn); } namespace { @@ -697,7 +695,7 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span predts, return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp, d_totals[class_id].first); }; - return GPUMultiClassAUCOVR(info, ctx->Device(), d_class_ptr, n_classes, cache, fn); + return GPUMultiClassAUCOVR(ctx, info, d_class_ptr, n_classes, cache, fn); } template diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu index feabedfab..f245f3e06 100644 --- a/src/metric/elementwise_metric.cu +++ b/src/metric/elementwise_metric.cu @@ -215,7 +215,7 @@ struct EvalError { has_param_ = false; } } - const char *Name() const { + [[nodiscard]] const char *Name() const { static thread_local std::string name; if (has_param_) { std::ostringstream os; @@ -228,7 +228,7 @@ struct EvalError { } } - XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float pred) const { + [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float pred) const { // assume label is in [0,1] return pred > threshold_ ? 
1.0f - label : label; } @@ -370,7 +370,7 @@ struct EvalEWiseBase : public MetricNoCache { return Policy::GetFinal(dat[0], dat[1]); } - const char* Name() const override { return policy_.Name(); } + [[nodiscard]] const char* Name() const override { return policy_.Name(); } private: Policy policy_; diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc index 41495164c..6762aec32 100644 --- a/src/metric/rank_metric.cc +++ b/src/metric/rank_metric.cc @@ -162,7 +162,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig { return collective::GlobalRatio(info, sum_metric, static_cast(ngroups)); } - const char* Name() const override { + [[nodiscard]] const char* Name() const override { return name.c_str(); } @@ -294,7 +294,7 @@ class EvalRankWithCache : public Metric { }; namespace { -double Finalize(MetaInfo const& info, double score, double sw) { +double Finalize(Context const*, MetaInfo const& info, double score, double sw) { std::array dat{score, sw}; collective::GlobalSum(info, &dat); std::tie(score, sw) = std::tuple_cat(dat); @@ -323,7 +323,7 @@ class EvalPrecision : public EvalRankWithCache { if (ctx_->IsCUDA()) { auto pre = cuda_impl::PreScore(ctx_, info, predt, p_cache); - return Finalize(info, pre.Residue(), pre.Weights()); + return Finalize(ctx_, info, pre.Residue(), pre.Weights()); } auto gptr = p_cache->DataGroupPtr(ctx_); @@ -352,7 +352,7 @@ class EvalPrecision : public EvalRankWithCache { } auto sum = std::accumulate(pre.cbegin(), pre.cend(), 0.0); - return Finalize(info, sum, sw); + return Finalize(ctx_, info, sum, sw); } }; @@ -369,7 +369,7 @@ class EvalNDCG : public EvalRankWithCache { std::shared_ptr p_cache) override { if (ctx_->IsCUDA()) { auto ndcg = cuda_impl::NDCGScore(ctx_, info, preds, minus_, p_cache); - return Finalize(info, ndcg.Residue(), ndcg.Weights()); + return Finalize(ctx_, info, ndcg.Residue(), ndcg.Weights()); } // group local ndcg @@ -415,7 +415,7 @@ class EvalNDCG : public EvalRankWithCache { sum_w = std::accumulate(weights.weights.cbegin(), weights.weights.cend(), 0.0); } auto ndcg = std::accumulate(linalg::cbegin(ndcg_gloc), linalg::cend(ndcg_gloc), 0.0); - return Finalize(info, ndcg, sum_w); + return Finalize(ctx_, info, ndcg, sum_w); } }; @@ -427,7 +427,7 @@ class EvalMAPScore : public EvalRankWithCache { std::shared_ptr p_cache) override { if (ctx_->IsCUDA()) { auto map = cuda_impl::MAPScore(ctx_, info, predt, minus_, p_cache); - return Finalize(info, map.Residue(), map.Weights()); + return Finalize(ctx_, info, map.Residue(), map.Weights()); } auto gptr = p_cache->DataGroupPtr(ctx_); @@ -469,7 +469,7 @@ class EvalMAPScore : public EvalRankWithCache { sw += weight[i]; } auto sum = std::accumulate(map_gloc.cbegin(), map_gloc.cend(), 0.0); - return Finalize(info, sum, sw); + return Finalize(ctx_, info, sum, sw); } }; diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu index 0625af25a..c13702a19 100644 --- a/src/metric/survival_metric.cu +++ b/src/metric/survival_metric.cu @@ -217,7 +217,7 @@ struct EvalEWiseSurvivalBase : public MetricNoCache { return Policy::GetFinal(dat[0], dat[1]); } - const char* Name() const override { + [[nodiscard]] const char* Name() const override { return policy_.Name(); } diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index 26d8f3440..20305850a 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -189,7 +189,7 @@ struct SparsePageView { explicit SparsePageView(SparsePage const *p) : base_rowid{p->base_rowid} { view = 
p->GetView(); } SparsePage::Inst operator[](size_t i) { return view[i]; } - size_t Size() const { return view.Size(); } + [[nodiscard]] size_t Size() const { return view.Size(); } }; struct SingleInstanceView { @@ -250,7 +250,7 @@ struct GHistIndexMatrixView { } return ret; } - size_t Size() const { return page_.Size(); } + [[nodiscard]] size_t Size() const { return page_.Size(); } }; template @@ -290,7 +290,7 @@ class AdapterView { return ret; } - size_t Size() const { return adapter_->NumRows(); } + [[nodiscard]] size_t Size() const { return adapter_->NumRows(); } bst_row_t const static base_rowid = 0; // NOLINT }; @@ -408,31 +408,33 @@ class ColumnSplitHelper { ColumnSplitHelper(ColumnSplitHelper &&) noexcept = delete; ColumnSplitHelper &operator=(ColumnSplitHelper &&) noexcept = delete; - void PredictDMatrix(DMatrix *p_fmat, std::vector *out_preds) { + void PredictDMatrix(Context const *ctx, DMatrix *p_fmat, std::vector *out_preds) { CHECK(xgboost::collective::IsDistributed()) << "column-split prediction is only supported for distributed training"; for (auto const &batch : p_fmat->GetBatches()) { CHECK_EQ(out_preds->size(), p_fmat->Info().num_row_ * model_.learner_model_param->num_output_group); - PredictBatchKernel(SparsePageView{&batch}, out_preds); + PredictBatchKernel(ctx, SparsePageView{&batch}, out_preds); } } - void PredictInstance(SparsePage::Inst const &inst, std::vector *out_preds) { + void PredictInstance(Context const *ctx, SparsePage::Inst const &inst, + std::vector *out_preds) { CHECK(xgboost::collective::IsDistributed()) << "column-split prediction is only supported for distributed training"; - PredictBatchKernel(SingleInstanceView{inst}, out_preds); + PredictBatchKernel(ctx, SingleInstanceView{inst}, out_preds); } - void PredictLeaf(DMatrix *p_fmat, std::vector *out_preds) { + void PredictLeaf(Context const* ctx, DMatrix *p_fmat, std::vector *out_preds) { CHECK(xgboost::collective::IsDistributed()) << "column-split prediction is only supported for distributed training"; for (auto const &batch : p_fmat->GetBatches()) { CHECK_EQ(out_preds->size(), p_fmat->Info().num_row_ * (tree_end_ - tree_begin_)); - PredictBatchKernel(SparsePageView{&batch}, out_preds); + PredictBatchKernel(ctx, SparsePageView{&batch}, + out_preds); } } @@ -453,12 +455,13 @@ class ColumnSplitHelper { std::fill(missing_storage_.begin(), missing_storage_.end(), 0); } - std::size_t BitIndex(std::size_t tree_id, std::size_t row_id, std::size_t node_id) const { + [[nodiscard]] std::size_t BitIndex(std::size_t tree_id, std::size_t row_id, + std::size_t node_id) const { size_t tree_index = tree_id - tree_begin_; return tree_offsets_[tree_index] * n_rows_ + row_id * tree_sizes_[tree_index] + node_id; } - void AllreduceBitVectors() { + void AllreduceBitVectors(Context const*) { collective::Allreduce(decision_storage_.data(), decision_storage_.size()); collective::Allreduce(missing_storage_.data(), @@ -547,7 +550,7 @@ class ColumnSplitHelper { } template - void PredictBatchKernel(DataView batch, std::vector *out_preds) { + void PredictBatchKernel(Context const* ctx, DataView batch, std::vector *out_preds) { auto const num_group = model_.learner_model_param->num_output_group; // parallel over local batch @@ -568,7 +571,7 @@ class ColumnSplitHelper { FVecDrop(block_size, fvec_offset, &feat_vecs_); }); - AllreduceBitVectors(); + AllreduceBitVectors(ctx); // auto block_id has the same type as `n_blocks`. 
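    // At this point the decision/missing bit vectors have been merged across
    // all workers, so the pass below can walk each tree with a complete view
    // of the split decisions and emit the final predictions.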
common::ParallelFor(n_blocks, n_threads_, [&](auto block_id) { @@ -646,7 +649,7 @@ class CPUPredictor : public Predictor { << "Predict DMatrix with column split" << MTNotImplemented(); ColumnSplitHelper helper(this->ctx_->Threads(), model, tree_begin, tree_end); - helper.PredictDMatrix(p_fmat, out_preds); + helper.PredictDMatrix(ctx_, p_fmat, out_preds); return; } @@ -779,7 +782,7 @@ class CPUPredictor : public Predictor { << "Predict instance with column split" << MTNotImplemented(); ColumnSplitHelper helper(this->ctx_->Threads(), model, 0, ntree_limit); - helper.PredictInstance(inst, out_preds); + helper.PredictInstance(ctx_, inst, out_preds); return; } @@ -811,7 +814,7 @@ class CPUPredictor : public Predictor { << "Predict leaf with column split" << MTNotImplemented(); ColumnSplitHelper helper(n_threads, model, 0, ntree_limit); - helper.PredictLeaf(p_fmat, &preds); + helper.PredictLeaf(ctx_, p_fmat, &preds); return; } diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index e41248e29..7fad07397 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -62,9 +62,7 @@ struct TreeView { cats.node_ptr = tree_cat_ptrs; } - __device__ bool HasCategoricalSplit() const { - return !cats.categories.empty(); - } + [[nodiscard]] __device__ bool HasCategoricalSplit() const { return !cats.categories.empty(); } }; struct SparsePageView { @@ -77,7 +75,7 @@ struct SparsePageView { common::Span row_ptr, bst_feature_t num_features) : d_data{data}, d_row_ptr{row_ptr}, num_features(num_features) {} - __device__ float GetElement(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ float GetElement(size_t ridx, size_t fidx) const { // Binary search auto begin_ptr = d_data.begin() + d_row_ptr[ridx]; auto end_ptr = d_data.begin() + d_row_ptr[ridx + 1]; @@ -105,8 +103,8 @@ struct SparsePageView { // Value is missing return nanf(""); } - XGBOOST_DEVICE size_t NumRows() const { return d_row_ptr.size() - 1; } - XGBOOST_DEVICE size_t NumCols() const { return num_features; } + [[nodiscard]] XGBOOST_DEVICE size_t NumRows() const { return d_row_ptr.size() - 1; } + [[nodiscard]] XGBOOST_DEVICE size_t NumCols() const { return num_features; } }; struct SparsePageLoader { @@ -137,7 +135,7 @@ struct SparsePageLoader { __syncthreads(); } } - __device__ float GetElement(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ float GetElement(size_t ridx, size_t fidx) const { if (use_shared) { return smem[threadIdx.x * data.num_features + fidx]; } else { @@ -151,7 +149,7 @@ struct EllpackLoader { XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor const& m, bool, bst_feature_t, bst_row_t, size_t, float) : matrix{m} {} - __device__ __forceinline__ float GetElement(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ __forceinline__ float GetElement(size_t ridx, size_t fidx) const { auto gidx = matrix.GetBinIndex(ridx, fidx); if (gidx == -1) { return nan(""); diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 627bf4ca4..ceb322c28 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -395,11 +395,11 @@ void GPUHistEvaluator::CopyToHost(const std::vector &nidx) { } } -void GPUHistEvaluator::EvaluateSplits( - const std::vector &nidx, bst_feature_t max_active_features, - common::Span d_inputs, - EvaluateSplitSharedInputs shared_inputs, - common::Span out_entries) { +void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector &nidx, + bst_feature_t 
max_active_features, + common::Span d_inputs, + EvaluateSplitSharedInputs shared_inputs, + common::Span out_entries) { auto evaluator = this->tree_evaluator_.template GetEvaluator(); dh::TemporaryArray splits_out_storage(d_inputs.size()); @@ -417,19 +417,20 @@ void GPUHistEvaluator::EvaluateSplits( out_splits.size() * sizeof(DeviceSplitCandidate)); // Reduce to get the best candidate from all workers. - dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) { - out_splits[i] = all_candidates[i]; - for (auto rank = 1; rank < world_size; rank++) { - out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i]; - } - }); + dh::LaunchN(out_splits.size(), ctx->CUDACtx()->Stream(), + [world_size, all_candidates, out_splits] __device__(size_t i) { + out_splits[i] = all_candidates[i]; + for (auto rank = 1; rank < world_size; rank++) { + out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i]; + } + }); } auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()); auto d_entries = out_entries; auto device_cats_accessor = this->DeviceCatStorage(nidx); // turn candidate into entry, along with handling sort based split. - dh::LaunchN(d_inputs.size(), [=] __device__(size_t i) mutable { + dh::LaunchN(d_inputs.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t i) mutable { auto const input = d_inputs[i]; auto &split = out_splits[i]; // Subtract parent gain here @@ -464,12 +465,12 @@ void GPUHistEvaluator::EvaluateSplits( this->CopyToHost(nidx); } -GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit( - EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs) { +GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input, + EvaluateSplitSharedInputs shared_inputs) { dh::device_vector inputs = std::vector{input}; dh::TemporaryArray out_entries(1); - this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs, - dh::ToSpan(out_entries)); + this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), + shared_inputs, dh::ToSpan(out_entries)); GPUExpandEntry root_entry; dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry), cudaMemcpyDeviceToHost)); diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 7c61099a1..8c387f632 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -193,7 +193,7 @@ class GPUHistEvaluator { /** * \brief Evaluate splits for left and right nodes. */ - void EvaluateSplits(const std::vector &nidx, + void EvaluateSplits(Context const* ctx, const std::vector &nidx, bst_feature_t max_active_features, common::Span d_inputs, EvaluateSplitSharedInputs shared_inputs, @@ -201,7 +201,7 @@ class GPUHistEvaluator { /** * \brief Evaluate splits for root node. 
*/ - GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs input, + GPUExpandEntry EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs); }; } // namespace tree diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 22eb7ab81..c473c9269 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -16,8 +16,7 @@ #include "row_partitioner.cuh" #include "xgboost/base.h" -namespace xgboost { -namespace tree { +namespace xgboost::tree { namespace { struct Pair { GradientPair first; @@ -53,7 +52,8 @@ struct Clip : public thrust::unary_function { * * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree. */ -GradientQuantiser::GradientQuantiser(common::Span gpair, MetaInfo const& info) { +GradientQuantiser::GradientQuantiser(Context const*, common::Span gpair, + MetaInfo const& info) { using GradientSumT = GradientPairPrecise; using T = typename GradientSumT::ValueT; dh::XGBCachingDeviceAllocator alloc; @@ -99,7 +99,6 @@ GradientQuantiser::GradientQuantiser(common::Span gpair, Met static_cast(1) / to_floating_point_.GetHess()); } - XGBOOST_DEV_INLINE void AtomicAddGpairShared(xgboost::GradientPairInt64 *dest, xgboost::GradientPairInt64 const &gpair) { @@ -314,6 +313,4 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& dh::safe_cuda(cudaGetLastError()); } - -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index c693e2e62..925c54893 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -39,18 +39,20 @@ private: GradientPairPrecise to_floating_point_; public: - GradientQuantiser(common::Span gpair, MetaInfo const& info); - XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const { + GradientQuantiser(Context const* ctx, common::Span gpair, MetaInfo const& info); + [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const { auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(), - gpair.GetHess() * to_fixed_point_.GetHess()); + gpair.GetHess() * to_fixed_point_.GetHess()); return adjusted; } - XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPairPrecise const& gpair) const { + [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 + ToFixedPoint(GradientPairPrecise const& gpair) const { auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(), - gpair.GetHess() * to_fixed_point_.GetHess()); + gpair.GetHess() * to_fixed_point_.GetHess()); return adjusted; } - XGBOOST_DEVICE GradientPairPrecise ToFloatingPoint(const GradientPairInt64&gpair) const { + [[nodiscard]] XGBOOST_DEVICE GradientPairPrecise + ToFloatingPoint(const GradientPairInt64& gpair) const { auto g = gpair.GetQuantisedGrad() * to_floating_point_.GetGrad(); auto h = gpair.GetQuantisedHess() * to_floating_point_.GetHess(); return {g,h}; diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h index f378c7808..033d2221e 100644 --- a/src/tree/hist/histogram.h +++ b/src/tree/hist/histogram.h @@ -171,7 +171,8 @@ class HistogramBuilder { } } - void SyncHistogram(RegTree const *p_tree, std::vector const &nodes_to_build, + void SyncHistogram(Context const *, RegTree const *p_tree, + std::vector const &nodes_to_build, std::vector const &nodes_to_trick) { auto n_total_bins = buffer_.TotalBins(); common::BlockedSpace2d space( @@ -277,14 +278,14 
@@ class MultiHistogramBuilder { } for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) { - this->target_builders_[t].SyncHistogram(p_tree, nodes, dummy_sub); + this->target_builders_[t].SyncHistogram(ctx_, p_tree, nodes, dummy_sub); } } /** * @brief Build histogram for left and right child of valid candidates */ template - void BuildHistLeftRight(DMatrix *p_fmat, RegTree const *p_tree, + void BuildHistLeftRight(Context const *ctx, DMatrix *p_fmat, RegTree const *p_tree, std::vector const &partitioners, std::vector const &valid_candidates, linalg::MatrixView gpair, BatchParam const ¶m, @@ -318,7 +319,7 @@ class MultiHistogramBuilder { } for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) { - this->target_builders_[t].SyncHistogram(p_tree, nodes_to_build, nodes_to_sub); + this->target_builders_[t].SyncHistogram(ctx, p_tree, nodes_to_build, nodes_to_sub); } } diff --git a/src/tree/hist/param.cc b/src/tree/hist/param.cc index 602566cd3..bd8d7a85c 100644 --- a/src/tree/hist/param.cc +++ b/src/tree/hist/param.cc @@ -12,7 +12,7 @@ namespace xgboost::tree { DMLC_REGISTER_PARAMETER(HistMakerTrainParam); -void HistMakerTrainParam::CheckTreesSynchronized(RegTree const* local_tree) const { +void HistMakerTrainParam::CheckTreesSynchronized(Context const*, RegTree const* local_tree) const { if (!this->debug_synchronize) { return; } diff --git a/src/tree/hist/param.h b/src/tree/hist/param.h index 8757b65e6..aa9d8cedf 100644 --- a/src/tree/hist/param.h +++ b/src/tree/hist/param.h @@ -15,7 +15,7 @@ struct HistMakerTrainParam : public XGBoostParameter { bool debug_synchronize{false}; std::size_t max_cached_hist_node{DefaultNodes()}; - void CheckTreesSynchronized(RegTree const* local_tree) const; + void CheckTreesSynchronized(Context const* ctx, RegTree const* local_tree) const; // declare parameters DMLC_DECLARE_PARAMETER(HistMakerTrainParam) { diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index 17e020ced..3c37556e1 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -140,7 +140,7 @@ class GloablApproxBuilder { std::vector const &gpair, common::Span hess) { monitor_->Start(__func__); this->histogram_builder_.BuildHistLeftRight( - p_fmat, p_tree, partitioner_, valid_candidates, + ctx_, p_fmat, p_tree, partitioner_, valid_candidates, linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1), BatchSpec(*param_, hess)); monitor_->Stop(__func__); } @@ -300,7 +300,7 @@ class GlobalApproxUpdater : public TreeUpdater { std::size_t t_idx = 0; for (auto p_tree : trees) { this->pimpl_->UpdateTree(m, s_gpair, hess, p_tree, &out_position[t_idx]); - hist_param_.CheckTreesSynchronized(p_tree); + hist_param_.CheckTreesSynchronized(ctx_, p_tree); ++t_idx; } } diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 6db201dd5..3c9c61f88 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -246,7 +246,7 @@ struct GPUHistMakerDevice { this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, dmat->Info().IsColumnSplit(), ctx_->Device()); - quantiser = std::make_unique(this->gpair, dmat->Info()); + quantiser = std::make_unique(ctx_, this->gpair, dmat->Info()); row_partitioner.reset(); // Release the device memory first before reallocating row_partitioner = std::make_unique(ctx_->Device(), sample.sample_rows); @@ -276,7 +276,7 @@ struct GPUHistMakerDevice { matrix.min_fvalue, matrix.is_dense && !collective::IsDistributed() }; - auto split = this->evaluator_.EvaluateSingleSplit(inputs, shared_inputs); + auto 
split = this->evaluator_.EvaluateSingleSplit(ctx_, inputs, shared_inputs); return split; } @@ -329,7 +329,7 @@ struct GPUHistMakerDevice { d_node_inputs.data().get(), h_node_inputs.data(), h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault)); - this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs), + this->evaluator_.EvaluateSplits(ctx_, nidx, max_active_features, dh::ToSpan(d_node_inputs), shared_inputs, dh::ToSpan(entries)); dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(), entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), @@ -842,7 +842,7 @@ class GPUHistMaker : public TreeUpdater { std::size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]); - this->hist_maker_param_.CheckTreesSynchronized(tree); + this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree); ++t_idx; } dh::safe_cuda(cudaGetLastError()); @@ -985,7 +985,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { std::size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { this->UpdateTree(gpair->Data(), p_fmat, tree, &out_position[t_idx]); - this->hist_maker_param_.CheckTreesSynchronized(tree); + this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree); ++t_idx; } diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 50943e1c4..375b24cfa 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -228,8 +228,8 @@ class MultiTargetHistBuilder { std::vector const &valid_candidates, linalg::MatrixView gpair) { monitor_->Start(__func__); - histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates, gpair, - HistBatch(param_)); + histogram_builder_->BuildHistLeftRight(ctx_, p_fmat, p_tree, partitioner_, valid_candidates, + gpair, HistBatch(param_)); monitor_->Stop(__func__); } @@ -436,8 +436,8 @@ class HistUpdater { std::vector const &valid_candidates, linalg::MatrixView gpair) { monitor_->Start(__func__); - this->histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates, - gpair, HistBatch(param_)); + this->histogram_builder_->BuildHistLeftRight(ctx_, p_fmat, p_tree, partitioner_, + valid_candidates, gpair, HistBatch(param_)); monitor_->Stop(__func__); } @@ -537,7 +537,7 @@ class QuantileHistMaker : public TreeUpdater { h_out_position, *tree_it); } - hist_param_.CheckTreesSynchronized(*tree_it); + hist_param_.CheckTreesSynchronized(ctx_, *tree_it); } } diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index 4782f9580..92d8ff753 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -360,25 +360,27 @@ TEST(HistUtil, DeviceSketchExternalMemoryWithWeights) { } template -auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) { +auto MakeUnweightedCutsForTest(Context const* ctx, Adapter adapter, int32_t num_bins, float missing, + size_t batch_size = 0) { common::HistogramCuts batched_cuts; HostDeviceVector ft; SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), DeviceOrd::CUDA(0)); MetaInfo info; AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size); - sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit()); + sketch_container.MakeCuts(ctx, &batched_cuts, info.IsColumnSplit()); return batched_cuts; } template -void ValidateBatchedCuts(Adapter adapter, int num_bins, DMatrix* 
dmat, size_t batch_size = 0) { +void ValidateBatchedCuts(Context const* ctx, Adapter adapter, int num_bins, DMatrix* dmat, size_t batch_size = 0) { common::HistogramCuts batched_cuts = MakeUnweightedCutsForTest( - adapter, num_bins, std::numeric_limits::quiet_NaN(), batch_size); + ctx, adapter, num_bins, std::numeric_limits::quiet_NaN(), batch_size); ValidateCuts(batched_cuts, dmat, num_bins); } TEST(HistUtil, AdapterDeviceSketch) { + auto ctx = MakeCUDACtx(0); int rows = 5; int cols = 1; int num_bins = 4; @@ -391,8 +393,8 @@ TEST(HistUtil, AdapterDeviceSketch) { data::CupyAdapter adapter(str); - auto device_cuts = MakeUnweightedCutsForTest(adapter, num_bins, missing); - Context ctx; + auto device_cuts = MakeUnweightedCutsForTest(&ctx, adapter, num_bins, missing); + ctx = ctx.MakeCPU(); auto host_cuts = GetHostCuts(&ctx, &adapter, num_bins, missing); EXPECT_EQ(device_cuts.Values(), host_cuts.Values()); @@ -401,6 +403,7 @@ TEST(HistUtil, AdapterDeviceSketch) { } TEST(HistUtil, AdapterDeviceSketchMemory) { + auto ctx = MakeCUDACtx(0); int num_columns = 100; int num_rows = 1000; int num_bins = 256; @@ -410,7 +413,8 @@ TEST(HistUtil, AdapterDeviceSketchMemory) { dh::GlobalMemoryLogger().Clear(); ConsoleLogger::Configure({{"verbosity", "3"}}); - auto cuts = MakeUnweightedCutsForTest(adapter, num_bins, std::numeric_limits::quiet_NaN()); + auto cuts = + MakeUnweightedCutsForTest(&ctx, adapter, num_bins, std::numeric_limits::quiet_NaN()); ConsoleLogger::Configure({{"verbosity", "0"}}); size_t bytes_required = detail::RequiredMemory( num_rows, num_columns, num_rows * num_columns, num_bins, false); @@ -419,6 +423,7 @@ TEST(HistUtil, AdapterDeviceSketchMemory) { } TEST(HistUtil, AdapterSketchSlidingWindowMemory) { + auto ctx = MakeCUDACtx(0); int num_columns = 100; int num_rows = 1000; int num_bins = 256; @@ -435,7 +440,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) { AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &sketch_container); HistogramCuts cuts; - sketch_container.MakeCuts(&cuts, info.IsColumnSplit()); + sketch_container.MakeCuts(&ctx, &cuts, info.IsColumnSplit()); size_t bytes_required = detail::RequiredMemory( num_rows, num_columns, num_rows * num_columns, num_bins, false); EXPECT_LE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required * 1.05); @@ -444,6 +449,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) { } TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) { + auto ctx = MakeCUDACtx(0); int num_columns = 100; int num_rows = 1000; int num_bins = 256; @@ -465,7 +471,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) { &sketch_container); HistogramCuts cuts; - sketch_container.MakeCuts(&cuts, info.IsColumnSplit()); + sketch_container.MakeCuts(&ctx, &cuts, info.IsColumnSplit()); ConsoleLogger::Configure({{"verbosity", "0"}}); size_t bytes_required = detail::RequiredMemory( num_rows, num_columns, num_rows * num_columns, num_bins, true); @@ -475,6 +481,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) { void TestCategoricalSketchAdapter(size_t n, size_t num_categories, int32_t num_bins, bool weighted) { + auto ctx = MakeCUDACtx(0); auto h_x = GenerateRandomCategoricalSingleColumn(n, num_categories); thrust::device_vector x(h_x); auto adapter = AdapterFromData(x, n, 1); @@ -498,7 +505,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories, AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &container); HistogramCuts cuts; - container.MakeCuts(&cuts, 
info.IsColumnSplit()); + container.MakeCuts(&ctx, &cuts, info.IsColumnSplit()); thrust::sort(x.begin(), x.end()); auto n_uniques = thrust::unique(x.begin(), x.end()) - x.begin(); @@ -522,6 +529,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories, TEST(HistUtil, AdapterDeviceSketchCategorical) { auto categorical_sizes = {2, 6, 8, 12}; int num_bins = 256; + auto ctx = MakeCUDACtx(0); auto sizes = {25, 100, 1000}; for (auto n : sizes) { for (auto num_categories : categorical_sizes) { @@ -529,7 +537,7 @@ TEST(HistUtil, AdapterDeviceSketchCategorical) { auto dmat = GetDMatrixFromData(x, n, 1); auto x_device = thrust::device_vector(x); auto adapter = AdapterFromData(x_device, n, 1); - ValidateBatchedCuts(adapter, num_bins, dmat.get()); + ValidateBatchedCuts(&ctx, adapter, num_bins, dmat.get()); TestCategoricalSketchAdapter(n, num_categories, num_bins, true); TestCategoricalSketchAdapter(n, num_categories, num_bins, false); } @@ -540,13 +548,14 @@ TEST(HistUtil, AdapterDeviceSketchMultipleColumns) { auto bin_sizes = {2, 16, 256, 512}; auto sizes = {100, 1000, 1500}; int num_columns = 5; + auto ctx = MakeCUDACtx(0); for (auto num_rows : sizes) { auto x = GenerateRandom(num_rows, num_columns); auto dmat = GetDMatrixFromData(x, num_rows, num_columns); auto x_device = thrust::device_vector(x); for (auto num_bins : bin_sizes) { auto adapter = AdapterFromData(x_device, num_rows, num_columns); - ValidateBatchedCuts(adapter, num_bins, dmat.get()); + ValidateBatchedCuts(&ctx, adapter, num_bins, dmat.get()); } } } @@ -556,12 +565,13 @@ TEST(HistUtil, AdapterDeviceSketchBatches) { int num_rows = 5000; auto batch_sizes = {0, 100, 1500, 6000}; int num_columns = 5; + auto ctx = MakeCUDACtx(0); for (auto batch_size : batch_sizes) { auto x = GenerateRandom(num_rows, num_columns); auto dmat = GetDMatrixFromData(x, num_rows, num_columns); auto x_device = thrust::device_vector(x); auto adapter = AdapterFromData(x_device, num_rows, num_columns); - ValidateBatchedCuts(adapter, num_bins, dmat.get(), batch_size); + ValidateBatchedCuts(&ctx, adapter, num_bins, dmat.get(), batch_size); } } @@ -647,12 +657,12 @@ TEST(HistUtil, SketchingEquivalent) { auto x_device = thrust::device_vector(x); auto adapter = AdapterFromData(x_device, num_rows, num_columns); common::HistogramCuts adapter_cuts = MakeUnweightedCutsForTest( - adapter, num_bins, std::numeric_limits::quiet_NaN()); + &ctx, adapter, num_bins, std::numeric_limits::quiet_NaN()); EXPECT_EQ(dmat_cuts.Values(), adapter_cuts.Values()); EXPECT_EQ(dmat_cuts.Ptrs(), adapter_cuts.Ptrs()); EXPECT_EQ(dmat_cuts.MinValues(), adapter_cuts.MinValues()); - ValidateBatchedCuts(adapter, num_bins, dmat.get()); + ValidateBatchedCuts(&ctx, adapter, num_bins, dmat.get()); } } } @@ -702,7 +712,7 @@ void TestAdapterSketchFromWeights(bool with_group) { .Device(DeviceOrd::CUDA(0)) .GenerateArrayInterface(&storage); MetaInfo info; - Context ctx; + auto ctx = MakeCUDACtx(0); auto& h_weights = info.weights_.HostVector(); if (with_group) { h_weights.resize(kGroups); @@ -731,7 +741,7 @@ void TestAdapterSketchFromWeights(bool with_group) { &sketch_container); common::HistogramCuts cuts; - sketch_container.MakeCuts(&cuts, info.IsColumnSplit()); + sketch_container.MakeCuts(&ctx, &cuts, info.IsColumnSplit()); auto dmat = GetDMatrixFromData(storage.HostVector(), kRows, kCols); if (with_group) { @@ -744,10 +754,9 @@ void TestAdapterSketchFromWeights(bool with_group) { ASSERT_EQ(cuts.Ptrs().size(), kCols + 1); ValidateCuts(cuts, dmat.get(), kBins); - auto cuda_ctx = MakeCUDACtx(0); if 
(with_group) { dmat->Info().weights_ = decltype(dmat->Info().weights_)(); // remove weight - HistogramCuts non_weighted = DeviceSketch(&cuda_ctx, dmat.get(), kBins, 0); + HistogramCuts non_weighted = DeviceSketch(&ctx, dmat.get(), kBins, 0); for (size_t i = 0; i < cuts.Values().size(); ++i) { ASSERT_EQ(cuts.Values()[i], non_weighted.Values()[i]); } @@ -773,7 +782,7 @@ void TestAdapterSketchFromWeights(bool with_group) { SketchContainer sketch_container{ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)}; AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), &sketch_container); - sketch_container.MakeCuts(&weighted, info.IsColumnSplit()); + sketch_container.MakeCuts(&ctx, &weighted, info.IsColumnSplit()); ValidateCuts(weighted, dmat.get(), kBins); } } diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc index 343f59cda..9fa1566ea 100644 --- a/tests/cpp/common/test_quantile.cc +++ b/tests/cpp/common/test_quantile.cc @@ -86,7 +86,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) { } HistogramCuts distributed_cuts; - sketch_distributed.MakeCuts(m->Info(), &distributed_cuts); + sketch_distributed.MakeCuts(&ctx, m->Info(), &distributed_cuts); // Generate cuts for single node environment collective::Finalize(); @@ -117,7 +117,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) { } HistogramCuts single_node_cuts; - sketch_on_single_node.MakeCuts(m->Info(), &single_node_cuts); + sketch_on_single_node.MakeCuts(&ctx, m->Info(), &single_node_cuts); auto const& sptrs = single_node_cuts.Ptrs(); auto const& dptrs = distributed_cuts.Ptrs(); @@ -220,7 +220,7 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) { } } - sketch_distributed.MakeCuts(m->Info(), &distributed_cuts); + sketch_distributed.MakeCuts(&ctx, m->Info(), &distributed_cuts); } // Generate cuts for single node environment @@ -243,7 +243,7 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) { } } - sketch_on_single_node.MakeCuts(m->Info(), &single_node_cuts); + sketch_on_single_node.MakeCuts(&ctx, m->Info(), &single_node_cuts); } auto const& sptrs = single_node_cuts.Ptrs(); diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 49353439f..26bd05524 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -370,6 +370,7 @@ void TestAllReduceBasic() { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { auto const device = DeviceOrd::CUDA(GPUIDX); + auto ctx = MakeCUDACtx(device.ordinal); // Set up single node version; HostDeviceVector ft({}, device); @@ -413,7 +414,7 @@ void TestAllReduceBasic() { AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_distributed); - sketch_distributed.AllReduce(false); + sketch_distributed.AllReduce(&ctx, false); sketch_distributed.Unique(); ASSERT_EQ(sketch_distributed.ColumnsPtr().size(), @@ -517,6 +518,7 @@ void TestSameOnAllWorkers() { MetaInfo const &info) { auto const rank = collective::GetRank(); auto const device = DeviceOrd::CUDA(GPUIDX); + Context ctx = MakeCUDACtx(device.ordinal); HostDeviceVector ft({}, device); SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); HostDeviceVector storage({}, device); @@ -528,7 +530,7 @@ void TestSameOnAllWorkers() { AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_distributed); - sketch_distributed.AllReduce(false); + 
sketch_distributed.AllReduce(&ctx, false); sketch_distributed.Unique(); TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true); diff --git a/tests/cpp/plugin/helpers.h b/tests/cpp/plugin/helpers.h index 3dd0c3a1f..85f2e014b 100644 --- a/tests/cpp/plugin/helpers.h +++ b/tests/cpp/plugin/helpers.h @@ -73,6 +73,7 @@ void RunWithFederatedCommunicator(int32_t world_size, std::string const& server_ auto run = [&](auto rank) { Json config{JsonObject()}; config["xgboost_communicator"] = String("federated"); + config["federated_secure"] = false; config["federated_server_address"] = String(server_address); config["federated_world_size"] = world_size; config["federated_rank"] = rank; diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index 692257748..04f1d35b4 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -2,6 +2,7 @@ * Copyright (c) 2017-2023, XGBoost contributors */ #include +#include #include // for Learner #include // for LogCheck_NE, CHECK_NE, LogCheck_EQ #include // for ObjFunction @@ -81,7 +82,9 @@ TEST(Learner, ParameterValidation) { // whitespace learner->SetParam("tree method", "exact"); - EXPECT_THROW(learner->Configure(), dmlc::Error); + EXPECT_THAT([&] { learner->Configure(); }, + ::testing::ThrowsMessage( + ::testing::HasSubstr(R"("tree method" contains whitespace)"))); } TEST(Learner, CheckGroup) { diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 7d5f15a1c..862bc6bfc 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -19,14 +19,15 @@ auto ZeroParam() { } } // anonymous namespace -inline GradientQuantiser DummyRoundingFactor() { +inline GradientQuantiser DummyRoundingFactor(Context const* ctx) { thrust::device_vector gpair(1); gpair[0] = {1000.f, 1000.f}; // Tests should not exceed sum of 1000 - return {dh::ToSpan(gpair), MetaInfo()}; + return {ctx, dh::ToSpan(gpair), MetaInfo()}; } -thrust::device_vector ConvertToInteger(std::vector x) { - auto r = DummyRoundingFactor(); +thrust::device_vector ConvertToInteger(Context const* ctx, + std::vector x) { + auto r = DummyRoundingFactor(ctx); std::vector y(x.size()); for (std::size_t i = 0; i < x.size(); i++) { y[i] = r.ToFixedPoint(GradientPair(x[i])); @@ -41,11 +42,12 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) { cuts_.cut_ptrs_.SetDevice(ctx.Device()); cuts_.cut_values_.SetDevice(ctx.Device()); cuts_.min_vals_.SetDevice(ctx.Device()); - thrust::device_vector feature_histogram{ConvertToInteger(feature_histogram_)}; + thrust::device_vector feature_histogram{ + ConvertToInteger(&ctx, feature_histogram_)}; dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); auto d_feature_types = dh::ToSpan(feature_types); - auto quantiser = DummyRoundingFactor(); + auto quantiser = DummyRoundingFactor(&ctx); EvaluateSplitInputs input{1, 0, quantiser.ToFixedPoint(parent_sum_), dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; EvaluateSplitSharedInputs shared_inputs{param, @@ -60,7 +62,7 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) { evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false, ctx.Device()); - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; ASSERT_EQ(result.thresh, 1); this->CheckResult(result.loss_chg, 
result.findex, result.fvalue, result.is_cat, @@ -90,7 +92,7 @@ TEST(GpuHist, PartitionBasic) { *std::max_element(cuts.cut_values_.HostVector().begin(), cuts.cut_values_.HostVector().end()); cuts.SetCategorical(true, max_cat); d_feature_types = dh::ToSpan(feature_types); - auto quantiser = DummyRoundingFactor(); + auto quantiser = DummyRoundingFactor(&ctx); EvaluateSplitSharedInputs shared_inputs{ param, quantiser, @@ -108,10 +110,10 @@ TEST(GpuHist, PartitionBasic) { // -1.0s go right // -3.0s go left auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-5.0, 3.0}); - auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}); + auto feature_histogram = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}); EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); EXPECT_EQ(result.dir, kLeftDir); EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000")); @@ -122,10 +124,10 @@ TEST(GpuHist, PartitionBasic) { // -1.0s go right // -3.0s go left auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-7.0, 3.0}); - auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-3.0, 1.0}, {-3.0, 1.0}}); + auto feature_histogram = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-3.0, 1.0}, {-3.0, 1.0}}); EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); EXPECT_EQ(result.dir, kLeftDir); EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000")); @@ -134,10 +136,10 @@ TEST(GpuHist, PartitionBasic) { { // All -1.0, gain from splitting should be 0.0 auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-3.0, 3.0}); - auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}); + auto feature_histogram = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}); EvaluateSplitInputs input{2, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; EXPECT_EQ(result.dir, kLeftDir); EXPECT_FLOAT_EQ(result.loss_chg, 0.0f); EXPECT_EQ(result.left_sum + result.right_sum, parent_sum); @@ -147,10 +149,10 @@ TEST(GpuHist, PartitionBasic) { // value { auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 6.0}); - auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}); + auto feature_histogram = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}); EvaluateSplitInputs input{3, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000")); 
EXPECT_EQ(result.dir, kLeftDir); @@ -160,10 +162,10 @@ TEST(GpuHist, PartitionBasic) { // -1.0s go right // -3.0s go left auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-5.0, 3.0}); - auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-3.0, 1.0}, {-1.0, 1.0}}); + auto feature_histogram = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-3.0, 1.0}, {-1.0, 1.0}}); EvaluateSplitInputs input{4, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); EXPECT_EQ(result.dir, kLeftDir); EXPECT_EQ(cats, std::bitset<32>("10100000000000000000000000000000")); @@ -173,10 +175,10 @@ TEST(GpuHist, PartitionBasic) { // -1.0s go right // -3.0s go left auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-5.0, 3.0}); - auto feature_histogram = ConvertToInteger({{-3.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}); + auto feature_histogram = ConvertToInteger(&ctx, {{-3.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}); EvaluateSplitInputs input{5, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); EXPECT_EQ(cats, std::bitset<32>("01000000000000000000000000000000")); EXPECT_EQ(result.left_sum + result.right_sum, parent_sum); @@ -205,7 +207,7 @@ TEST(GpuHist, PartitionTwoFeatures) { *std::max_element(cuts.cut_values_.HostVector().begin(), cuts.cut_values_.HostVector().end()); cuts.SetCategorical(true, max_cat); - auto quantiser = DummyRoundingFactor(); + auto quantiser = DummyRoundingFactor(&ctx); EvaluateSplitSharedInputs shared_inputs{param, quantiser, d_feature_types, @@ -220,10 +222,10 @@ TEST(GpuHist, PartitionTwoFeatures) { { auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0}); auto feature_histogram = ConvertToInteger( - {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}); + &ctx, {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}); EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); EXPECT_EQ(result.findex, 1); EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000")); @@ -233,10 +235,10 @@ TEST(GpuHist, PartitionTwoFeatures) { { auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0}); auto feature_histogram = ConvertToInteger( - {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}}); + &ctx, {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}}); EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); 
EXPECT_EQ(result.findex, 1); EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000")); @@ -266,7 +268,7 @@ TEST(GpuHist, PartitionTwoNodes) { *std::max_element(cuts.cut_values_.HostVector().begin(), cuts.cut_values_.HostVector().end()); cuts.SetCategorical(true, max_cat); - auto quantiser = DummyRoundingFactor(); + auto quantiser = DummyRoundingFactor(&ctx); EvaluateSplitSharedInputs shared_inputs{param, quantiser, d_feature_types, @@ -283,15 +285,16 @@ TEST(GpuHist, PartitionTwoNodes) { { auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0}); auto feature_histogram_a = ConvertToInteger( - {{-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}); + &ctx, {{-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}); thrust::device_vector inputs(2); inputs[0] = EvaluateSplitInputs{0, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram_a)}; - auto feature_histogram_b = ConvertToInteger({{-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}); + auto feature_histogram_b = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}); inputs[1] = EvaluateSplitInputs{1, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram_b)}; thrust::device_vector results(2); - evaluator.EvaluateSplits({0, 1}, 1, dh::ToSpan(inputs), shared_inputs, dh::ToSpan(results)); + evaluator.EvaluateSplits(&ctx, {0, 1}, 1, dh::ToSpan(inputs), shared_inputs, + dh::ToSpan(results)); EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(0)[0]), std::bitset<32>("10000000000000000000000000000000")); EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(1)[0]), @@ -301,7 +304,7 @@ TEST(GpuHist, PartitionTwoNodes) { void TestEvaluateSingleSplit(bool is_categorical) { auto ctx = MakeCUDACtx(0); - auto quantiser = DummyRoundingFactor(); + auto quantiser = DummyRoundingFactor(&ctx); auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0}); TrainParam tparam = ZeroParam(); GPUTrainingParam param{tparam}; @@ -311,7 +314,8 @@ void TestEvaluateSingleSplit(bool is_categorical) { thrust::device_vector feature_set = std::vector{0, 1}; // Setup gradients so that second feature gets higher gain - auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}}); + auto feature_histogram = + ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}}); dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); common::Span d_feature_types; @@ -336,7 +340,7 @@ void TestEvaluateSingleSplit(bool is_categorical) { ctx.Device()}; evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device()); - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; EXPECT_EQ(result.findex, 1); if (is_categorical) { @@ -352,7 +356,8 @@ TEST(GpuHist, EvaluateSingleSplit) { TestEvaluateSingleSplit(false); } TEST(GpuHist, EvaluateSingleCategoricalSplit) { TestEvaluateSingleSplit(true); } TEST(GpuHist, EvaluateSingleSplitMissing) { - auto quantiser = DummyRoundingFactor(); + auto ctx = MakeCUDACtx(0); + auto quantiser = DummyRoundingFactor(&ctx); auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{1.0, 1.5}); TrainParam tparam = ZeroParam(); GPUTrainingParam param{tparam}; @@ -361,7 +366,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { thrust::device_vector feature_segments = std::vector{0, 2}; thrust::device_vector feature_values = 
std::vector{1.0, 2.0}; thrust::device_vector feature_min_values = std::vector{0.0}; - auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}}); + auto feature_histogram = ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}}); EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; EvaluateSplitSharedInputs shared_inputs{param, @@ -373,7 +378,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { false}; GPUHistEvaluator evaluator(tparam, feature_set.size(), FstCU()); - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; EXPECT_EQ(result.findex, 0); EXPECT_EQ(result.fvalue, 1.0); @@ -383,14 +388,15 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { } TEST(GpuHist, EvaluateSingleSplitEmpty) { + auto ctx = MakeCUDACtx(0); TrainParam tparam = ZeroParam(); GPUHistEvaluator evaluator(tparam, 1, FstCU()); DeviceSplitCandidate result = evaluator .EvaluateSingleSplit( - EvaluateSplitInputs{}, + &ctx, EvaluateSplitInputs{}, EvaluateSplitSharedInputs{ - GPUTrainingParam(tparam), DummyRoundingFactor(), {}, {}, {}, {}, false}) + GPUTrainingParam(tparam), DummyRoundingFactor(&ctx), {}, {}, {}, {}, false}) .split; EXPECT_EQ(result.findex, -1); EXPECT_LT(result.loss_chg, 0.0f); @@ -398,7 +404,8 @@ TEST(GpuHist, EvaluateSingleSplitEmpty) { // Feature 0 has a better split, but the algorithm must select feature 1 TEST(GpuHist, EvaluateSingleSplitFeatureSampling) { - auto quantiser = DummyRoundingFactor(); + auto ctx = MakeCUDACtx(0); + auto quantiser = DummyRoundingFactor(&ctx); auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0}); TrainParam tparam = ZeroParam(); tparam.UpdateAllowUnknown(Args{}); @@ -408,7 +415,8 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) { thrust::device_vector feature_segments = std::vector{0, 2, 4}; thrust::device_vector feature_values = std::vector{1.0, 2.0, 11.0, 12.0}; thrust::device_vector feature_min_values = std::vector{0.0, 10.0}; - auto feature_histogram = ConvertToInteger({{-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}); + auto feature_histogram = + ConvertToInteger(&ctx, {{-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}); EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; EvaluateSplitSharedInputs shared_inputs{param, @@ -420,7 +428,7 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) { false}; GPUHistEvaluator evaluator(tparam, feature_min_values.size(), FstCU()); - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; EXPECT_EQ(result.findex, 1); EXPECT_EQ(result.fvalue, 11.0); @@ -430,7 +438,8 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) { // Features 0 and 1 have identical gain, the algorithm must select 0 TEST(GpuHist, EvaluateSingleSplitBreakTies) { - auto quantiser = DummyRoundingFactor(); + auto ctx = MakeCUDACtx(0); + auto quantiser = DummyRoundingFactor(&ctx); auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0}); TrainParam tparam = ZeroParam(); tparam.UpdateAllowUnknown(Args{}); @@ -440,7 +449,8 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) { thrust::device_vector feature_segments = std::vector{0, 2, 4}; thrust::device_vector feature_values = std::vector{1.0, 2.0, 11.0, 12.0}; thrust::device_vector feature_min_values = std::vector{0.0, 10.0}; - 
auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}); + auto feature_histogram = + ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}); EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; EvaluateSplitSharedInputs shared_inputs{param, @@ -452,15 +462,16 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) { false}; GPUHistEvaluator evaluator(tparam, feature_min_values.size(), FstCU()); - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; EXPECT_EQ(result.findex, 0); EXPECT_EQ(result.fvalue, 1.0); } TEST(GpuHist, EvaluateSplits) { + auto ctx = MakeCUDACtx(0); thrust::device_vector out_splits(2); - auto quantiser = DummyRoundingFactor(); + auto quantiser = DummyRoundingFactor(&ctx); auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0}); TrainParam tparam = ZeroParam(); tparam.UpdateAllowUnknown(Args{}); @@ -471,9 +482,9 @@ TEST(GpuHist, EvaluateSplits) { thrust::device_vector feature_values = std::vector{1.0, 2.0, 11.0, 12.0}; thrust::device_vector feature_min_values = std::vector{0.0, 0.0}; auto feature_histogram_left = - ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}}); + ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}}); auto feature_histogram_right = - ConvertToInteger({{-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}); + ConvertToInteger(&ctx, {{-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}); EvaluateSplitInputs input_left{1, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram_left)}; EvaluateSplitInputs input_right{2, 0, parent_sum, dh::ToSpan(feature_set), @@ -514,7 +525,7 @@ TEST_F(TestPartitionBasedSplit, GpuHist) { evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, ctx.Device()); // Convert the sample histogram to fixed point - auto quantiser = DummyRoundingFactor(); + auto quantiser = DummyRoundingFactor(&ctx); thrust::host_vector h_hist; for (auto e : hist_[0]) { h_hist.push_back(quantiser.ToFixedPoint(e)); @@ -531,7 +542,7 @@ TEST_F(TestPartitionBasedSplit, GpuHist) { cuts_.cut_values_.ConstDeviceSpan(), cuts_.min_vals_.ConstDeviceSpan(), false}; - auto split = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + auto split = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; ASSERT_NEAR(split.loss_chg, best_score_, 1e-2); } @@ -541,7 +552,7 @@ namespace { void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) { auto ctx = MakeCUDACtx(GPUIDX); auto rank = collective::GetRank(); - auto quantiser = DummyRoundingFactor(); + auto quantiser = DummyRoundingFactor(&ctx); auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0}); TrainParam tparam = ZeroParam(); GPUTrainingParam param{tparam}; @@ -552,8 +563,8 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) { thrust::device_vector feature_set = std::vector{0, 1}; // Setup gradients so that second feature gets higher gain - auto feature_histogram = rank == 0 ? ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}}) - : ConvertToInteger({{-1.0, 0.5}, {1.0, 0.5}}); + auto feature_histogram = rank == 0 ? 
ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}}) + : ConvertToInteger(&ctx, {{-1.0, 0.5}, {1.0, 0.5}}); dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); common::Span d_feature_types; @@ -576,7 +587,7 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) { GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, ctx.Device()); - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; EXPECT_EQ(result.findex, 1) << "rank: " << rank; if (is_categorical) { diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 0c91cf21e..f7f2e27ea 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -37,7 +37,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) { FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64)); - auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo()); + auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()), feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx, d_histogram, quantiser); @@ -51,7 +51,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) { dh::device_vector new_histogram(num_bins); auto d_new_histogram = dh::ToSpan(new_histogram); - auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo()); + auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()), feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx, d_new_histogram, quantiser); @@ -129,7 +129,7 @@ void TestGPUHistogramCategorical(size_t num_categories) { dh::device_vector cat_hist(num_categories); auto gpair = GenerateRandomGradients(kRows, 0, 2); gpair.SetDevice(DeviceOrd::CUDA(0)); - auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo()); + auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); /** * Generate hist with cat data. 
*/ diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc index 8949b5f4b..25a800367 100644 --- a/tests/cpp/tree/hist/test_histogram.cc +++ b/tests/cpp/tree/hist/test_histogram.cc @@ -181,7 +181,7 @@ void TestSyncHist(bool is_distributed) { histogram.Buffer().Reset(1, n_nodes, space, target_hists); // sync hist - histogram.SyncHistogram(&tree, nodes_for_explicit_hist_build, nodes_for_subtraction_trick); + histogram.SyncHistogram(&ctx, &tree, nodes_for_explicit_hist_build, nodes_for_subtraction_trick); using GHistRowT = common::GHistRow; auto check_hist = [](const GHistRowT parent, const GHistRowT left, const GHistRowT right, @@ -266,7 +266,7 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_ histogram.BuildHist(0, space, gidx, row_set_collection, nodes_to_build, linalg::MakeTensorView(&ctx, gpair, gpair.size()), force_read_by_column); } - histogram.SyncHistogram(&tree, nodes_to_build, {}); + histogram.SyncHistogram(&ctx, &tree, nodes_to_build, {}); // Check if number of histogram bins is correct ASSERT_EQ(histogram.Histogram()[nid].size(), gmat.cut.Ptrs().back()); @@ -366,7 +366,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) { linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()), force_read_by_column); } - cat_hist.SyncHistogram(&tree, nodes_to_build, {}); + cat_hist.SyncHistogram(&ctx, &tree, nodes_to_build, {}); /** * Generate hist with one hot encoded data. @@ -382,7 +382,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) { linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()), force_read_by_column); } - onehot_hist.SyncHistogram(&tree, nodes_to_build, {}); + onehot_hist.SyncHistogram(&ctx, &tree, nodes_to_build, {}); auto cat = cat_hist.Histogram()[0]; auto onehot = onehot_hist.Histogram()[0]; @@ -451,7 +451,7 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo force_read_by_column); ++page_idx; } - multi_build.SyncHistogram(&tree, nodes, {}); + multi_build.SyncHistogram(ctx, &tree, nodes, {}); multi_page = multi_build.Histogram()[RegTree::kRoot]; } @@ -480,7 +480,7 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo single_build.BuildHist(0, space, gmat, row_set_collection, nodes, linalg::MakeTensorView(ctx, h_gpair, h_gpair.size()), force_read_by_column); - single_build.SyncHistogram(&tree, nodes, {}); + single_build.SyncHistogram(ctx, &tree, nodes, {}); single_page = single_build.Histogram()[RegTree::kRoot]; } @@ -570,7 +570,7 @@ class OverflowTest : public ::testing::TestWithParam> { CHECK_NE(partitioners.front()[tree.RightChild(best.nid)].Size(), 0); hist_builder.BuildHistLeftRight( - Xy.get(), &tree, partitioners, valid_candidates, + &ctx, Xy.get(), &tree, partitioners, valid_candidates, linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size(), 1), batch); if (limit) { diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index accfbae08..6f937351e 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -111,7 +111,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { maker.hist.AllocateHistograms({0}); maker.gpair = gpair.DeviceSpan(); - maker.quantiser = std::make_unique(maker.gpair, MetaInfo()); + maker.quantiser = std::make_unique(&ctx, maker.gpair, MetaInfo()); maker.page = page.get(); maker.InitFeatureGroupsOnce(); @@ -162,12 +162,6 @@ HistogramCutsWrapper GetHostCutMatrix () { 
  return cmat;
}

-inline GradientQuantiser DummyRoundingFactor() {
-  thrust::device_vector gpair(1);
-  gpair[0] = {1000.f, 1000.f};  // Tests should not exceed sum of 1000
-  return {dh::ToSpan(gpair), MetaInfo()};
-}
-
 void TestHistogramIndexImpl() {
   // Test if the compressed histogram index matches when using a sparse
   // dmatrix with and without using external memory

From 44099f585dcb3f54567aad03c5d136614d553dc3 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 8 Nov 2023 18:17:14 +0800
Subject: [PATCH 07/32] [coll] Add C API for the tracker. (#9773)

---
 include/xgboost/c_api.h                 | 77 +++++++++++++++
 src/c_api/coll_c_api.cc                 | 119 ++++++++++++++++++++++++
 src/collective/tracker.h                | 3 +
 src/common/error_msg.h                  | 2 +
 tests/cpp/collective/test_coll_c_api.cc | 63 +++++++++++++
 5 files changed, 264 insertions(+)
 create mode 100644 src/c_api/coll_c_api.cc
 create mode 100644 tests/cpp/collective/test_coll_c_api.cc

diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index d28b5098b..ffa3a6c79 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -1508,6 +1508,83 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *config,
 * @{
 */

+/**
+ * @brief Handle to tracker.
+ *
+ * There are currently two types of tracker in XGBoost: the first one is `rabit`, while
+ * the other one is `federated`.
+ *
+ * This is still under development.
+ */
+typedef void *TrackerHandle; /* NOLINT */
+
+/**
+ * @brief Create a new tracker.
+ *
+ * @param config JSON encoded parameters.
+ *
+ * - dmlc_communicator: String, the type of tracker to create. Available options are `rabit`
+ *   and `federated`.
+ * - n_workers: Integer, the number of workers.
+ * - port: (Optional) Integer, the port this tracker should listen to.
+ * - timeout: (Optional) Integer, timeout in seconds for various networking operations.
+ *
+ * Some configurations are `rabit` specific:
+ * - host: (Optional) String, used by the `rabit` tracker to specify the address of the host.
+ *
+ * Some `federated` specific configurations:
+ * - federated_secure: Boolean, whether this is a secure server.
+ * - server_key_path: Path to the server key. Used only if this is a secure server.
+ * - server_cert_path: Path to the server certificate. Used only if this is a secure server.
+ * - client_cert_path: Path to the client certificate. Used only if this is a secure server.
+ *
+ * @param handle The handle to the created tracker.
+ *
+ * @return 0 for success, -1 for failure.
+ */
+XGB_DLL int XGTrackerCreate(char const *config, TrackerHandle *handle);
+
+/**
+ * @brief Get the arguments needed for running workers. This should be called after
+ *        XGTrackerRun() and XGTrackerWait().
+ *
+ * @param handle The handle to the tracker.
+ * @param args The arguments returned as a JSON document.
+ *
+ * @return 0 for success, -1 for failure.
+ */
+XGB_DLL int XGTrackerWorkerArgs(TrackerHandle handle, char const **args);
+
+/**
+ * @brief Run the tracker.
+ *
+ * @param handle The handle to the tracker.
+ *
+ * @return 0 for success, -1 for failure.
+ */
+XGB_DLL int XGTrackerRun(TrackerHandle handle);
+
+/**
+ * @brief Wait for the tracker to finish; this should be called after XGTrackerRun().
+ *
+ * @param handle The handle to the tracker.
+ * @param config JSON encoded configuration. No argument is required yet, preserved for
+ *        the future.
+ *
+ * @return 0 for success, -1 for failure.
+ */
+XGB_DLL int XGTrackerWait(TrackerHandle handle, char const *config);
+
+/**
+ * @brief Free a tracker instance.
XGTrackerWait() is called internally. If the tracker + * cannot close properly, manual interruption is required. + * + * @param handle The handle to the tracker. + * + * @return 0 for success, -1 for failure. + */ +XGB_DLL int XGTrackerFree(TrackerHandle handle); + /*! * \brief Initialize the collective communicator. * diff --git a/src/c_api/coll_c_api.cc b/src/c_api/coll_c_api.cc new file mode 100644 index 000000000..01713dbad --- /dev/null +++ b/src/c_api/coll_c_api.cc @@ -0,0 +1,119 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include // for seconds +#include // for size_t +#include // for future +#include // for unique_ptr +#include // for string +#include // for is_same_v, remove_pointer_t +#include // for pair + +#include "../collective/tracker.h" // for RabitTracker +#include "c_api_error.h" // for API_BEGIN +#include "xgboost/c_api.h" +#include "xgboost/collective/result.h" // for Result +#include "xgboost/json.h" // for Json +#include "xgboost/string_view.h" // for StringView + +#if defined(XGBOOST_USE_FEDERATED) +#include "../../plugin/federated/federated_tracker.h" // for FederatedTracker +#else +#include "../common/error_msg.h" // for NoFederated +#endif + +using namespace xgboost; // NOLINT + +namespace { +using TrackerHandleT = + std::pair, std::shared_future>; + +TrackerHandleT *GetTrackerHandle(TrackerHandle handle) { + xgboost_CHECK_C_ARG_PTR(handle); + auto *ptr = static_cast(handle); + CHECK(ptr); + return ptr; +} + +struct CollAPIEntry { + std::string ret_str; +}; +using CollAPIThreadLocalStore = dmlc::ThreadLocalStore; + +void WaitImpl(TrackerHandleT *ptr) { + std::chrono::seconds wait_for{100}; + auto fut = ptr->second; + while (fut.valid()) { + auto res = fut.wait_for(wait_for); + CHECK(res != std::future_status::deferred); + if (res == std::future_status::ready) { + auto const &rc = ptr->second.get(); + CHECK(rc.OK()) << rc.Report(); + break; + } + } +} +} // namespace + +XGB_DLL int XGTrackerCreate(char const *config, TrackerHandle *handle) { + API_BEGIN(); + xgboost_CHECK_C_ARG_PTR(config); + + Json jconfig = Json::Load(config); + + auto type = RequiredArg(jconfig, "dmlc_communicator", __func__); + std::unique_ptr tptr; + if (type == "federated") { +#if defined(XGBOOST_USE_FEDERATED) + tptr = std::make_unique(jconfig); +#else + LOG(FATAL) << error::NoFederated(); +#endif // defined(XGBOOST_USE_FEDERATED) + } else if (type == "rabit") { + tptr = std::make_unique(jconfig); + } else { + LOG(FATAL) << "Unknown communicator:" << type; + } + + auto ptr = new TrackerHandleT{std::move(tptr), std::future{}}; + static_assert(std::is_same_v, TrackerHandleT>); + + xgboost_CHECK_C_ARG_PTR(handle); + *handle = ptr; + API_END(); +} + +XGB_DLL int XGTrackerWorkerArgs(TrackerHandle handle, char const **args) { + API_BEGIN(); + auto *ptr = GetTrackerHandle(handle); + auto &local = *CollAPIThreadLocalStore::Get(); + local.ret_str = Json::Dump(ptr->first->WorkerArgs()); + xgboost_CHECK_C_ARG_PTR(args); + *args = local.ret_str.c_str(); + API_END(); +} + +XGB_DLL int XGTrackerRun(TrackerHandle handle) { + API_BEGIN(); + auto *ptr = GetTrackerHandle(handle); + CHECK(!ptr->second.valid()) << "Tracker is already running."; + ptr->second = ptr->first->Run(); + API_END(); +} + +XGB_DLL int XGTrackerWait(TrackerHandle handle, char const *config) { + API_BEGIN(); + auto *ptr = GetTrackerHandle(handle); + xgboost_CHECK_C_ARG_PTR(config); + auto jconfig = Json::Load(StringView{config}); + WaitImpl(ptr); + API_END(); +} + +XGB_DLL int XGTrackerFree(TrackerHandle handle) { + 
API_BEGIN(); + auto *ptr = GetTrackerHandle(handle); + WaitImpl(ptr); + delete ptr; + API_END(); +} diff --git a/src/collective/tracker.h b/src/collective/tracker.h index 24e47bb4e..f336a82f9 100644 --- a/src/collective/tracker.h +++ b/src/collective/tracker.h @@ -114,6 +114,9 @@ class RabitTracker : public Tracker { // record for how to reach out to workers if error happens. std::vector> worker_error_handles_; // listening socket for incoming workers. + // + // At the moment, the listener calls accept without first polling. We can add an + // additional unix domain socket to allow cancelling the accept. TCPSocket listener_; Result Bootstrap(std::vector* p_workers); diff --git a/src/common/error_msg.h b/src/common/error_msg.h index 94703fd15..995fe11d5 100644 --- a/src/common/error_msg.h +++ b/src/common/error_msg.h @@ -97,5 +97,7 @@ constexpr StringView InvalidCUDAOrdinal() { } void MismatchedDevices(Context const* booster, Context const* data); + +inline auto NoFederated() { return "XGBoost is not compiled with federated learning support."; } } // namespace xgboost::error #endif // XGBOOST_COMMON_ERROR_MSG_H_ diff --git a/tests/cpp/collective/test_coll_c_api.cc b/tests/cpp/collective/test_coll_c_api.cc new file mode 100644 index 000000000..d80fbc140 --- /dev/null +++ b/tests/cpp/collective/test_coll_c_api.cc @@ -0,0 +1,63 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include +#include + +#include // for ""s +#include // for thread + +#include "../../../src/collective/tracker.h" +#include "test_worker.h" // for SocketTest +#include "xgboost/json.h" // for Json + +namespace xgboost::collective { +namespace { +class TrackerAPITest : public SocketTest {}; +} // namespace + +TEST_F(TrackerAPITest, CAPI) { + TrackerHandle handle; + Json config{Object{}}; + config["dmlc_communicator"] = String{"rabit"}; + config["n_workers"] = 2; + config["timeout"] = 1; + auto config_str = Json::Dump(config); + auto rc = XGTrackerCreate(config_str.c_str(), &handle); + ASSERT_EQ(rc, 0); + rc = XGTrackerRun(handle); + ASSERT_EQ(rc, 0); + + std::thread bg_wait{[&] { + Json config{Object{}}; + auto config_str = Json::Dump(config); + auto rc = XGTrackerWait(handle, config_str.c_str()); + ASSERT_EQ(rc, 0); + }}; + + char const* cargs; + rc = XGTrackerWorkerArgs(handle, &cargs); + ASSERT_EQ(rc, 0); + auto args = Json::Load(StringView{cargs}); + + std::string host; + ASSERT_TRUE(GetHostAddress(&host).OK()); + ASSERT_EQ(host, get(args["DMLC_TRACKER_URI"])); + auto port = get(args["DMLC_TRACKER_PORT"]); + ASSERT_NE(port, 0); + + std::vector workers; + using namespace std::chrono_literals; // NOLINT + for (std::int32_t r = 0; r < 2; ++r) { + workers.emplace_back([=] { WorkerForTest w{host, static_cast(port), 1s, 2, r}; }); + } + for (auto& w : workers) { + w.join(); + } + + rc = XGTrackerFree(handle); + ASSERT_EQ(rc, 0); + + bg_wait.join(); +} +} // namespace xgboost::collective From 6fd4a306670fa82da06b42ef217705eefd97cbcd Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 9 Nov 2023 05:26:40 +0800 Subject: [PATCH 08/32] [coll] Increase timeout for allgather test. 
(#9777) --- tests/cpp/collective/test_allgather.cc | 2 +- tests/cpp/collective/test_worker.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/cpp/collective/test_allgather.cc b/tests/cpp/collective/test_allgather.cc index bdfadc0c7..decad8786 100644 --- a/tests/cpp/collective/test_allgather.cc +++ b/tests/cpp/collective/test_allgather.cc @@ -47,7 +47,7 @@ class Worker : public WorkerForTest { std::size_t n = 8192; // n_bytes = 8192 * sizeof(int) std::vector data(comm_.World() * n, 0); - auto s_data = common::Span{data.data(), data.size()}; + auto s_data = common::Span{data}; auto seg = s_data.subspan(comm_.Rank() * n, n); std::iota(seg.begin(), seg.end(), comm_.Rank()); diff --git a/tests/cpp/collective/test_worker.h b/tests/cpp/collective/test_worker.h index ad3213e81..490cdf13c 100644 --- a/tests/cpp/collective/test_worker.h +++ b/tests/cpp/collective/test_worker.h @@ -92,11 +92,12 @@ class TrackerTest : public SocketTest { template void TestDistributed(std::int32_t n_workers, WorkerFn worker_fn) { - std::chrono::seconds timeout{1}; + std::chrono::seconds timeout{2}; std::string host; auto rc = GetHostAddress(&host); ASSERT_TRUE(rc.OK()) << rc.Report(); + LOG(INFO) << "Using " << n_workers << " workers for test."; RabitTracker tracker{StringView{host}, n_workers, 0, timeout}; auto fut = tracker.Run(); From 162da7b52b6b88de7b95319b4ded746b148b9030 Mon Sep 17 00:00:00 2001 From: Ken Geis Date: Sun, 12 Nov 2023 11:09:06 -0800 Subject: [PATCH 09/32] fix typo in Parameters doc (#9781) --- doc/parameter.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/parameter.rst b/doc/parameter.rst index 88a712a5a..d2471dfd9 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -470,7 +470,7 @@ Parameter for using Pseudo-Huber (``reg:pseudohubererror``) Parameter for using Quantile Loss (``reg:quantileerror``) ========================================================= -* ``quantile_alpha``: A scala or a list of targeted quantiles. +* ``quantile_alpha``: A scalar or a list of targeted quantiles. .. 
versionadded:: 2.0.0 From 36a552ac981f4936714c4cd9128ba33f2a63d2eb Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Tue, 14 Nov 2023 08:59:45 +0800 Subject: [PATCH 10/32] [jvm-packages] support stage-level scheduling (#9775) --- .../rapids/spark/GpuXGBoostGeneralSuite.scala | 2 +- .../dmlc/xgboost4j/scala/spark/XGBoost.scala | 153 ++++++++++++++++-- .../scala/spark/params/BoosterParams.scala | 6 +- .../xgboost4j/scala/spark/XGBoostSuite.scala | 150 +++++++++++++++++ 4 files changed, 298 insertions(+), 13 deletions(-) create mode 100644 jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostGeneralSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostGeneralSuite.scala index c731afb1d..746e03bb6 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostGeneralSuite.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostGeneralSuite.scala @@ -206,7 +206,7 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite { .setDevice("cuda:1") .fit(trainingDf) } - assert(thrown.getMessage.contains("`cuda` or `gpu`")) + assert(thrown.getMessage.contains("device given invalid value cuda:1")) } } } diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index d12431479..5a1af886f 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -31,7 +31,8 @@ import org.apache.commons.logging.LogFactory import org.apache.hadoop.fs.FileSystem import org.apache.spark.rdd.RDD -import org.apache.spark.{SparkContext, TaskContext} +import org.apache.spark.resource.{ResourceProfileBuilder, TaskResourceRequests} +import org.apache.spark.{SparkConf, SparkContext, TaskContext} import org.apache.spark.sql.SparkSession /** @@ -72,7 +73,8 @@ private[scala] case class XGBoostExecutionParams( device: Option[String], isLocal: Boolean, featureNames: Option[Array[String]], - featureTypes: Option[Array[String]]) { + featureTypes: Option[Array[String]], + runOnGpu: Boolean) { private var rawParamMap: Map[String, Any] = _ @@ -186,14 +188,15 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s .asInstanceOf[Boolean] val treeMethod: Option[String] = overridedParams.get("tree_method").map(_.toString) - // back-compatible with "gpu_hist" - val device: Option[String] = if (treeMethod.exists(_ == "gpu_hist")) { - Some("cuda") - } else overridedParams.get("device").map(_.toString) + val device: Option[String] = overridedParams.get("device").map(_.toString) + val deviceIsGpu = device.exists(_ == "cuda") - require(!(treeMethod.exists(_ == "approx") && device.exists(_ == "cuda")), + require(!(treeMethod.exists(_ == "approx") && deviceIsGpu), "The tree method \"approx\" is not yet supported for Spark GPU cluster") + // back-compatible with "gpu_hist" + val runOnGpu = treeMethod.exists(_ == "gpu_hist") || deviceIsGpu + val trackerConf = overridedParams.get("tracker_conf") match { case None => TrackerConf() case Some(conf: TrackerConf) => conf @@ -228,7 +231,8 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s device, 
isLocal, featureNames, - featureTypes + featureTypes, + runOnGpu ) xgbExecParam.setRawParamMap(overridedParams) xgbExecParam @@ -253,7 +257,132 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s ) } -object XGBoost extends Serializable { +/** + * A trait to manage stage-level scheduling + */ +private[spark] trait XGBoostStageLevel extends Serializable { + private val logger = LogFactory.getLog("XGBoostSpark") + + private[spark] def isStandaloneOrLocalCluster(conf: SparkConf): Boolean = { + val master = conf.get("spark.master") + master != null && (master.startsWith("spark://") || master.startsWith("local-cluster")) + } + + /** + * To determine if stage-level scheduling should be skipped according to the spark version + * and spark configurations + * + * @param sparkVersion spark version + * @param runOnGpu if xgboost training run on GPUs + * @param conf spark configurations + * @return Boolean to skip stage-level scheduling or not + */ + private[spark] def skipStageLevelScheduling( + sparkVersion: String, + runOnGpu: Boolean, + conf: SparkConf): Boolean = { + if (runOnGpu) { + if (sparkVersion < "3.4.0") { + logger.info("Stage-level scheduling in xgboost requires spark version 3.4.0+") + return true + } + + if (!isStandaloneOrLocalCluster(conf)) { + logger.info("Stage-level scheduling in xgboost requires spark standalone or " + + "local-cluster mode") + return true + } + + val executorCores = conf.getInt("spark.executor.cores", -1) + val executorGpus = conf.getInt("spark.executor.resource.gpu.amount", -1) + if (executorCores == -1 || executorGpus == -1) { + logger.info("Stage-level scheduling in xgboost requires spark.executor.cores, " + + "spark.executor.resource.gpu.amount to be set.") + return true + } + + if (executorCores == 1) { + logger.info("Stage-level scheduling in xgboost requires spark.executor.cores > 1") + return true + } + + if (executorGpus > 1) { + logger.info("Stage-level scheduling in xgboost will not work " + + "when spark.executor.resource.gpu.amount > 1") + return true + } + + val taskGpuAmount = conf.getDouble("spark.task.resource.gpu.amount", -1.0).toFloat + + if (taskGpuAmount == -1.0) { + // The ETL tasks will not grab a gpu when spark.task.resource.gpu.amount is not set, + // but with stage-level scheduling, we can make training task grab the gpu. + return false + } + + if (taskGpuAmount == executorGpus.toFloat) { + // spark.executor.resource.gpu.amount = spark.task.resource.gpu.amount + // results in only 1 task running at a time, which may cause perf issue. + return true + } + // We can enable stage-level scheduling + false + } else true // Skip stage-level scheduling for cpu training. + } + + /** + * Attempt to modify the task resources so that only one task can be executed + * on a single executor simultaneously. 
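+ * For example, with spark.executor.cores=4 and the spark-rapids plugin not
+ * enabled, each training task requests 4 / 2 + 1 = 3 CPU cores, so at most one
+ * training task can run on a 4-core executor at any time.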
+ * + * @param sc the spark context + * @param rdd which rdd to be applied with new resource profile + * @return the original rdd or the changed rdd + */ + private[spark] def tryStageLevelScheduling( + sc: SparkContext, + xgbExecParams: XGBoostExecutionParams, + rdd: RDD[(Booster, Map[String, Array[Float]])] + ): RDD[(Booster, Map[String, Array[Float]])] = { + + val conf = sc.getConf + if (skipStageLevelScheduling(sc.version, xgbExecParams.runOnGpu, conf)) { + return rdd + } + + // Ensure executor_cores is not None + val executor_cores = conf.getInt("spark.executor.cores", -1) + if (executor_cores == -1) { + throw new RuntimeException("Wrong spark.executor.cores") + } + + // Spark-rapids is a GPU-acceleration project for Spark SQL. + // When spark-rapids is enabled, we prevent concurrent execution of other ETL tasks + // that utilize GPUs alongside training tasks in order to avoid GPU out-of-memory errors. + val spark_plugins = conf.get("spark.plugins", " ") + val spark_rapids_sql_enabled = conf.get("spark.rapids.sql.enabled", "true") + + // Determine the number of cores required for each task. + val task_cores = if (spark_plugins.contains("com.nvidia.spark.SQLPlugin") && + spark_rapids_sql_enabled.toLowerCase == "true") { + executor_cores + } else { + (executor_cores / 2) + 1 + } + + // Each training task requires cpu cores > total executor cores//2 + 1 to + // ensure tasks are sent to different executors. + // Note: We cannot use GPUs to limit concurrent tasks + // due to https://issues.apache.org/jira/browse/SPARK-45527. + val task_gpus = 1.0 + val treqs = new TaskResourceRequests().cpus(task_cores).resource("gpu", task_gpus) + val rp = new ResourceProfileBuilder().require(treqs).build() + + logger.info(s"XGBoost training tasks require the resource(cores=$task_cores, gpu=$task_gpus).") + rdd.withResources(rp) + } +} + +object XGBoost extends XGBoostStageLevel { private val logger = LogFactory.getLog("XGBoostSpark") def getGPUAddrFromResources: Int = { @@ -315,7 +444,7 @@ object XGBoost extends Serializable { val externalCheckpointParams = xgbExecutionParam.checkpointParam var params = xgbExecutionParam.toMap - if (xgbExecutionParam.device.exists(m => (m == "cuda" || m == "gpu"))) { + if (xgbExecutionParam.runOnGpu) { val gpuId = if (xgbExecutionParam.isLocal) { // For local mode, force gpu id to primary device 0 @@ -413,10 +542,12 @@ object XGBoost extends Serializable { }} + val boostersAndMetricsWithRes = tryStageLevelScheduling(sc, xgbExecParams, + boostersAndMetrics) // The repartition step is to make training stage as ShuffleMapStage, so that when one // of the training task fails the training stage can retry. ResultStage won't retry when // it fails. 
- val (booster, metrics) = boostersAndMetrics.repartition(1).collect()(0) + val (booster, metrics) = boostersAndMetricsWithRes.repartition(1).collect()(0) val trackerReturnVal = tracker.waitFor(0L) logger.info(s"Rabit returns with exit code $trackerReturnVal") if (trackerReturnVal != 0) { diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala index 61efc2865..b64ad9385 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala @@ -154,11 +154,13 @@ private[spark] trait BoosterParams extends Params { (value: String) => BoosterParams.supportedTreeMethods.contains(value)) final def getTreeMethod: String = $(treeMethod) + /** * The device for running XGBoost algorithms, options: cpu, cuda */ final val device = new Param[String]( - this, "device", "The device for running XGBoost algorithms, options: cpu, cuda" + this, "device", "The device for running XGBoost algorithms, options: cpu, cuda", + (value: String) => BoosterParams.supportedDevices.contains(value) ) final def getDevice: String = $(device) @@ -288,4 +290,6 @@ private[scala] object BoosterParams { val supportedSampleType = HashSet("uniform", "weighted") val supportedNormalizeType = HashSet("tree", "forest") + + val supportedDevices = HashSet("cpu", "cuda") } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala new file mode 100644 index 000000000..9622c9b2d --- /dev/null +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala @@ -0,0 +1,150 @@ +/* + Copyright (c) 2023 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +package ml.dmlc.xgboost4j.scala.spark + +import ml.dmlc.xgboost4j.scala.Booster +import org.apache.spark.SparkConf +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import org.scalatest.funsuite.AnyFunSuite + +class XGBoostSuite extends AnyFunSuite with PerTest { + + // Do not create spark context + override def beforeEach(): Unit = {} + + test("XGBoost execution parameters") { + var xgbExecutionParams = new XGBoostExecutionParamsFactory( + Map("device" -> "cpu", "num_workers" -> 1, "num_round" -> 1), sc) + .buildXGBRuntimeParams + assert(!xgbExecutionParams.runOnGpu) + + xgbExecutionParams = new XGBoostExecutionParamsFactory( + Map("device" -> "cuda", "num_workers" -> 1, "num_round" -> 1), sc) + .buildXGBRuntimeParams + assert(xgbExecutionParams.runOnGpu) + + xgbExecutionParams = new XGBoostExecutionParamsFactory( + Map("device" -> "cpu", "tree_method" -> "gpu_hist", "num_workers" -> 1, "num_round" -> 1), sc) + .buildXGBRuntimeParams + assert(xgbExecutionParams.runOnGpu) + + xgbExecutionParams = new XGBoostExecutionParamsFactory( + Map("device" -> "cuda", "tree_method" -> "gpu_hist", + "num_workers" -> 1, "num_round" -> 1), sc) + .buildXGBRuntimeParams + assert(xgbExecutionParams.runOnGpu) + } + + test("skip stage-level scheduling") { + val conf = new SparkConf() + .setMaster("spark://foo") + .set("spark.executor.cores", "12") + .set("spark.task.cpus", "1") + .set("spark.executor.resource.gpu.amount", "1") + .set("spark.task.resource.gpu.amount", "0.08") + + // the correct configurations should not skip stage-level scheduling + assert(!XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, conf)) + + // spark version < 3.4.0 + assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.3.0", runOnGpu = true, conf)) + + // not run on GPU + assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = false, conf)) + + // spark.executor.cores is not set + var badConf = conf.clone().remove("spark.executor.cores") + assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf)) + + // spark.executor.cores=1 + badConf = conf.clone().set("spark.executor.cores", "1") + assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf)) + + // spark.executor.resource.gpu.amount is not set + badConf = conf.clone().remove("spark.executor.resource.gpu.amount") + assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf)) + + // spark.executor.resource.gpu.amount>1 + badConf = conf.clone().set("spark.executor.resource.gpu.amount", "2") + assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf)) + + // spark.task.resource.gpu.amount is not set + badConf = conf.clone().remove("spark.task.resource.gpu.amount") + assert(!XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf)) + + // spark.task.resource.gpu.amount=1 + badConf = conf.clone().set("spark.task.resource.gpu.amount", "1") + assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf)) + + // yarn + badConf = conf.clone().setMaster("yarn") + assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf)) + + // k8s + badConf = conf.clone().setMaster("k8s://") + assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf)) + } + + + object FakedXGBoost extends XGBoostStageLevel { + + // Do not skip stage-level scheduling for testing purposes. 
+ override private[spark] def skipStageLevelScheduling( + sparkVersion: String, + runOnGpu: Boolean, + conf: SparkConf) = false + } + + test("try stage-level scheduling without spark-rapids") { + + val builder = SparkSession.builder() + .master(s"local-cluster[1, 4, 1024]") + .appName("XGBoostSuite") + .config("spark.ui.enabled", false) + .config("spark.driver.memory", "512m") + .config("spark.barrier.sync.timeout", 10) + .config("spark.task.cpus", 1) + .config("spark.executor.cores", 4) + .config("spark.executor.resource.gpu.amount", 1) + .config("spark.task.resource.gpu.amount", 0.25) + + val ss = builder.getOrCreate() + + try { + val df = ss.range(1, 10) + val rdd = df.rdd + + val xgbExecutionParams = new XGBoostExecutionParamsFactory( + Map("device" -> "cuda", "num_workers" -> 1, "num_round" -> 1), sc) + .buildXGBRuntimeParams + assert(xgbExecutionParams.runOnGpu) + + val finalRDD = FakedXGBoost.tryStageLevelScheduling(ss.sparkContext, xgbExecutionParams, + rdd.asInstanceOf[RDD[(Booster, Map[String, Array[Float]])]]) + + val taskResources = finalRDD.getResourceProfile().taskResources + assert(taskResources.contains("cpus")) + assert(taskResources.get("cpus").get.amount == 3) + + assert(taskResources.contains("gpu")) + assert(taskResources.get("gpu").get.amount == 1.0) + } finally { + ss.stop() + } + } +} From ada377c57eec006889484d10e5ce83e4ac46c971 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 15 Nov 2023 14:16:19 +0800 Subject: [PATCH 11/32] [coll] Reduce the scope of lock in the event loop. (#9784) --- include/xgboost/collective/socket.h | 23 ++++--- rabit/src/allreduce_base.cc | 4 +- src/collective/allreduce.cc | 11 ++-- src/collective/comm.cc | 35 ++++++---- src/collective/loop.cc | 88 ++++++++++++++++++-------- src/collective/loop.h | 17 ++--- tests/cpp/collective/test_allreduce.cc | 9 ++- 7 files changed, 117 insertions(+), 70 deletions(-) diff --git a/include/xgboost/collective/socket.h b/include/xgboost/collective/socket.h index 5dd1b9ffa..844534110 100644 --- a/include/xgboost/collective/socket.h +++ b/include/xgboost/collective/socket.h @@ -412,19 +412,24 @@ class TCPSocket { return Success(); } - void SetKeepAlive() { + [[nodiscard]] Result SetKeepAlive() { std::int32_t keepalive = 1; - xgboost_CHECK_SYS_CALL(setsockopt(handle_, SOL_SOCKET, SO_KEEPALIVE, - reinterpret_cast(&keepalive), sizeof(keepalive)), - 0); + auto rc = setsockopt(handle_, SOL_SOCKET, SO_KEEPALIVE, reinterpret_cast(&keepalive), + sizeof(keepalive)); + if (rc != 0) { + return system::FailWithCode("Failed to set TCP keeaplive."); + } + return Success(); } - void SetNoDelay() { + [[nodiscard]] Result SetNoDelay() { std::int32_t tcp_no_delay = 1; - xgboost_CHECK_SYS_CALL( - setsockopt(handle_, IPPROTO_TCP, TCP_NODELAY, reinterpret_cast(&tcp_no_delay), - sizeof(tcp_no_delay)), - 0); + auto rc = setsockopt(handle_, IPPROTO_TCP, TCP_NODELAY, reinterpret_cast(&tcp_no_delay), + sizeof(tcp_no_delay)); + if (rc != 0) { + return system::FailWithCode("Failed to set TCP no delay."); + } + return Success(); } /** diff --git a/rabit/src/allreduce_base.cc b/rabit/src/allreduce_base.cc index 5cab4ae32..b99eb3763 100644 --- a/rabit/src/allreduce_base.cc +++ b/rabit/src/allreduce_base.cc @@ -417,9 +417,9 @@ void AllreduceBase::SetParam(const char *name, const char *val) { utils::Assert(!all_link.sock.BadSocket(), "ReConnectLink: bad socket"); // set the socket to non-blocking mode, enable TCP keepalive CHECK(all_link.sock.NonBlocking(true).OK()); - all_link.sock.SetKeepAlive(); + 
CHECK(all_link.sock.SetKeepAlive().OK()); if (rabit_enable_tcp_no_delay) { - all_link.sock.SetNoDelay(); + CHECK(all_link.sock.SetNoDelay().OK()); } if (tree_neighbors.count(all_link.rank) != 0) { if (all_link.rank == parent_rank) { diff --git a/src/collective/allreduce.cc b/src/collective/allreduce.cc index 65c066868..f95a9a9f1 100644 --- a/src/collective/allreduce.cc +++ b/src/collective/allreduce.cc @@ -6,6 +6,7 @@ #include // for min #include // for size_t #include // for int32_t, int8_t +#include // for move #include // for vector #include "../data/array_interface.h" // for Type, DispatchDType @@ -47,7 +48,7 @@ Result RingScatterReduceTyped(Comm const& comm, common::Span data, auto seg = s_buf.subspan(0, recv_seg.size()); prev_ch->RecvAll(seg); - auto rc = prev_ch->Block(); + auto rc = comm.Block(); if (!rc.OK()) { return rc; } @@ -83,11 +84,9 @@ Result RingAllreduce(Comm const& comm, common::Span data, Func cons auto prev_ch = comm.Chan(prev); auto next_ch = comm.Chan(next); - rc = RingAllgather(comm, data, n_bytes_in_seg, 1, prev_ch, next_ch); - if (!rc.OK()) { - return rc; - } - return comm.Block(); + return std::move(rc) << [&] { + return RingAllgather(comm, data, n_bytes_in_seg, 1, prev_ch, next_ch); + } << [&] { return comm.Block(); }; }); } } // namespace xgboost::collective::cpu_impl diff --git a/src/collective/comm.cc b/src/collective/comm.cc index 964137ff1..9da9083f8 100644 --- a/src/collective/comm.cc +++ b/src/collective/comm.cc @@ -33,19 +33,28 @@ Comm::Comm(std::string const& host, std::int32_t port, std::chrono::seconds time Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, std::int32_t retry, std::string const& task_id, TCPSocket* out, std::int32_t rank, std::int32_t world) { - // get information from tracker + // Get information from the tracker CHECK(!info.host.empty()); - auto rc = Connect(info.host, info.port, retry, timeout, out); - if (!rc.OK()) { - return Fail("Failed to connect to the tracker.", std::move(rc)); - } - TCPSocket& tracker = *out; - return std::move(rc) - << [&] { return tracker.NonBlocking(false); } - << [&] { return tracker.RecvTimeout(timeout); } - << [&] { return proto::Magic{}.Verify(&tracker); } - << [&] { return proto::Connect{}.WorkerSend(&tracker, world, rank, task_id); }; + return Success() << [&] { + auto rc = Connect(info.host, info.port, retry, timeout, out); + if (rc.OK()) { + return rc; + } else { + return Fail("Failed to connect to the tracker.", std::move(rc)); + } + } << [&] { + return tracker.NonBlocking(false); + } << [&] { + return tracker.RecvTimeout(timeout); + } << [&] { + return proto::Magic{}.Verify(&tracker); + } << [&] { + return proto::Connect{}.WorkerSend(&tracker, world, rank, task_id); + } << [&] { + LOG(INFO) << "Task " << task_id << " connected to the tracker"; + return Success(); + }; } [[nodiscard]] Result Comm::ConnectTracker(TCPSocket* out) const { @@ -257,8 +266,8 @@ RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::se CHECK(this->channels_.empty()); for (auto& w : workers) { if (w) { - w->SetNoDelay(); - rc = w->NonBlocking(true); + rc = std::move(rc) << [&] { return w->SetNoDelay(); } << [&] { return w->NonBlocking(true); } + << [&] { return w->SetKeepAlive(); }; } if (!rc.OK()) { return rc; diff --git a/src/collective/loop.cc b/src/collective/loop.cc index 95a1019ac..10fce0516 100644 --- a/src/collective/loop.cc +++ b/src/collective/loop.cc @@ -10,21 +10,26 @@ #include "xgboost/logging.h" // for CHECK namespace xgboost::collective { -Result 
Loop::EmptyQueue() { +Result Loop::EmptyQueue(std::queue* p_queue) const { timer_.Start(__func__); - auto error = [this] { - this->stop_ = true; + auto error = [this] { timer_.Stop(__func__); }; + + if (stop_) { timer_.Stop(__func__); - }; + return Success(); + } - while (!queue_.empty() && !stop_) { - std::queue qcopy; + auto& qcopy = *p_queue; + + // clear the copied queue + while (!qcopy.empty()) { rabit::utils::PollHelper poll; + std::size_t n_ops = qcopy.size(); - // watch all ops - while (!queue_.empty()) { - auto op = queue_.front(); - queue_.pop(); + // Iterate through all the ops for poll + for (std::size_t i = 0; i < n_ops; ++i) { + auto op = qcopy.front(); + qcopy.pop(); switch (op.code) { case Op::kRead: { @@ -40,6 +45,7 @@ Result Loop::EmptyQueue() { return Fail("Invalid socket operation."); } } + qcopy.push(op); } @@ -51,10 +57,12 @@ Result Loop::EmptyQueue() { error(); return rc; } + // we wonldn't be here if the queue is empty. CHECK(!qcopy.empty()); - while (!qcopy.empty() && !stop_) { + // Iterate through all the ops for performing the operations + for (std::size_t i = 0; i < n_ops; ++i) { auto op = qcopy.front(); qcopy.pop(); @@ -81,20 +89,21 @@ Result Loop::EmptyQueue() { } if (n_bytes_done == -1 && !system::LastErrorWouldBlock()) { - stop_ = true; auto rc = system::FailWithCode("Invalid socket output."); error(); return rc; } + op.off += n_bytes_done; CHECK_LE(op.off, op.n); if (op.off != op.n) { // not yet finished, push back to queue for next round. - queue_.push(op); + qcopy.push(op); } } } + timer_.Stop(__func__); return Success(); } @@ -107,22 +116,42 @@ void Loop::Process() { if (stop_) { break; } - CHECK(!mu_.try_lock()); - this->rc_ = this->EmptyQueue(); - if (!rc_.OK()) { - stop_ = true; + auto unlock_notify = [&](bool is_blocking) { + if (!is_blocking) { + return; + } + lock.unlock(); cv_.notify_one(); - break; + }; + + // move the queue + std::queue qcopy; + bool is_blocking = false; + while (!queue_.empty()) { + auto op = queue_.front(); + queue_.pop(); + if (op.code == Op::kBlock) { + is_blocking = true; + } else { + qcopy.push(op); + } + } + // unblock the queue + if (!is_blocking) { + lock.unlock(); + } + // clear the queue + auto rc = this->EmptyQueue(&qcopy); + // Handle error + if (!rc.OK()) { + this->rc_ = std::move(rc); + unlock_notify(is_blocking); + return; } - CHECK(queue_.empty()); - CHECK(!mu_.try_lock()); - cv_.notify_one(); - } - - if (rc_.OK()) { - CHECK(queue_.empty()); + CHECK(qcopy.empty()); + unlock_notify(is_blocking); } } @@ -140,6 +169,15 @@ Result Loop::Stop() { return Success(); } +[[nodiscard]] Result Loop::Block() { + this->Submit(Op{Op::kBlock}); + { + std::unique_lock lock{mu_}; + cv_.wait(lock, [this] { return (this->queue_.empty()) || stop_; }); + } + return std::move(rc_); +} + Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} { timer_.Init(__func__); worker_ = std::thread{[this] { diff --git a/src/collective/loop.h b/src/collective/loop.h index 0bccbc0d0..4f5cb12b3 100644 --- a/src/collective/loop.h +++ b/src/collective/loop.h @@ -20,13 +20,14 @@ namespace xgboost::collective { class Loop { public: struct Op { - enum Code : std::int8_t { kRead = 0, kWrite = 1 } code; + enum Code : std::int8_t { kRead = 0, kWrite = 1, kBlock = 2 } code; std::int32_t rank{-1}; std::int8_t* ptr{nullptr}; std::size_t n{0}; TCPSocket* sock{nullptr}; std::size_t off{0}; + explicit Op(Code c) : code{c} { CHECK(c == kBlock); } Op(Code c, std::int32_t rank, std::int8_t* ptr, std::size_t n, TCPSocket* sock, std::size_t off) : code{c}, 
rank{rank}, ptr{ptr}, n{n}, sock{sock}, off{off} {} Op(Op const&) = default; @@ -44,9 +45,9 @@ class Loop { Result rc_; bool stop_{false}; std::exception_ptr curr_exce_{nullptr}; - common::Monitor timer_; + common::Monitor mutable timer_; - Result EmptyQueue(); + Result EmptyQueue(std::queue* p_queue) const; void Process(); public: @@ -60,15 +61,7 @@ class Loop { cv_.notify_one(); } - [[nodiscard]] Result Block() { - { - std::unique_lock lock{mu_}; - cv_.notify_all(); - } - std::unique_lock lock{mu_}; - cv_.wait(lock, [this] { return this->queue_.empty() || stop_; }); - return std::move(rc_); - } + [[nodiscard]] Result Block(); explicit Loop(std::chrono::seconds timeout); diff --git a/tests/cpp/collective/test_allreduce.cc b/tests/cpp/collective/test_allreduce.cc index 744608dec..21b4d9fd0 100644 --- a/tests/cpp/collective/test_allreduce.cc +++ b/tests/cpp/collective/test_allreduce.cc @@ -18,31 +18,34 @@ class AllreduceWorker : public WorkerForTest { void Basic() { { std::vector data(13, 0.0); - Allreduce(comm_, common::Span{data.data(), data.size()}, [](auto lhs, auto rhs) { + auto rc = Allreduce(comm_, common::Span{data.data(), data.size()}, [](auto lhs, auto rhs) { for (std::size_t i = 0; i < rhs.size(); ++i) { rhs[i] += lhs[i]; } }); + ASSERT_TRUE(rc.OK()); ASSERT_EQ(std::accumulate(data.cbegin(), data.cend(), 0.0), 0.0); } { std::vector data(1, 1.0); - Allreduce(comm_, common::Span{data.data(), data.size()}, [](auto lhs, auto rhs) { + auto rc = Allreduce(comm_, common::Span{data.data(), data.size()}, [](auto lhs, auto rhs) { for (std::size_t i = 0; i < rhs.size(); ++i) { rhs[i] += lhs[i]; } }); + ASSERT_TRUE(rc.OK()); ASSERT_EQ(data[0], static_cast(comm_.World())); } } void Acc() { std::vector data(314, 1.5); - Allreduce(comm_, common::Span{data.data(), data.size()}, [](auto lhs, auto rhs) { + auto rc = Allreduce(comm_, common::Span{data.data(), data.size()}, [](auto lhs, auto rhs) { for (std::size_t i = 0; i < rhs.size(); ++i) { rhs[i] += lhs[i]; } }); + ASSERT_TRUE(rc.OK()); for (std::size_t i = 0; i < data.size(); ++i) { auto v = data[i]; ASSERT_EQ(v, 1.5 * static_cast(comm_.World())) << i; From 178cfe70a8bf6ca0081032cf0de47e5219d0cf08 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Thu, 16 Nov 2023 18:15:59 +0800 Subject: [PATCH 12/32] [pyspark][doc] Test and doc for stage-level scheduling. (#9786) --- doc/jvm/xgboost4j_spark_gpu_tutorial.rst | 20 +++- python-package/xgboost/spark/core.py | 31 +++-- python-package/xgboost/spark/utils.py | 6 +- .../test_with_spark/test_spark_local.py | 108 ++++++++++++++++++ 4 files changed, 144 insertions(+), 21 deletions(-) diff --git a/doc/jvm/xgboost4j_spark_gpu_tutorial.rst b/doc/jvm/xgboost4j_spark_gpu_tutorial.rst index 3b2f92c6f..edabe8a92 100644 --- a/doc/jvm/xgboost4j_spark_gpu_tutorial.rst +++ b/doc/jvm/xgboost4j_spark_gpu_tutorial.rst @@ -215,6 +215,22 @@ and the prediction for each instance. Submit the application ********************** +Assuming you have configured the Spark standalone cluster with GPU support. Otherwise, please +refer to `spark standalone configuration with GPU support `_. + +Starting from XGBoost 2.1.0, stage-level scheduling is automatically enabled. Therefore, +if you are using Spark standalone cluster version 3.4.0 or higher, we strongly recommend +configuring the ``"spark.task.resource.gpu.amount"`` as a fractional value. This will +enable running multiple tasks in parallel during the ETL phase. An example configuration +would be ``"spark.task.resource.gpu.amount=1/spark.executor.cores"``. 
However, if you are
+using an XGBoost version earlier than 2.1.0 or a Spark standalone cluster version below 3.4.0,
+you still need to set ``"spark.task.resource.gpu.amount"`` equal to ``"spark.executor.resource.gpu.amount"``.
+
+.. note::
+
+   As of now, the stage-level scheduling feature in XGBoost is limited to the Spark standalone cluster mode.
+   However, we have plans to expand its compatibility to YARN and Kubernetes once Spark 3.5.1 is officially released.
+
 Assuming that the application main class is "Iris" and the application jar is "iris-1.0.0.jar",
 provided below is an example demonstrating how to submit the XGBoost application to an Apache
 Spark Standalone cluster.
@@ -230,9 +246,9 @@ Spark Standalone cluster.
       --master $master \
       --packages com.nvidia:rapids-4-spark_2.12:${rapids_version},ml.dmlc:xgboost4j-gpu_2.12:${xgboost_version},ml.dmlc:xgboost4j-spark-gpu_2.12:${xgboost_version} \
       --conf spark.executor.cores=12 \
-      --conf spark.task.cpus=12 \
+      --conf spark.task.cpus=1 \
       --conf spark.executor.resource.gpu.amount=1 \
-      --conf spark.task.resource.gpu.amount=1 \
+      --conf spark.task.resource.gpu.amount=0.08 \
      --conf spark.rapids.sql.csv.read.double.enabled=true \
      --conf spark.rapids.sql.hasNans=false \
      --conf spark.plugins=com.nvidia.spark.SQLPlugin \
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index bad3a2382..aa8c5b998 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -22,7 +22,7 @@ from typing import (
 import numpy as np
 import pandas as pd
 
-from pyspark import RDD, SparkContext, cloudpickle
+from pyspark import RDD, SparkConf, SparkContext, cloudpickle
 from pyspark.ml import Estimator, Model
 from pyspark.ml.functions import array_to_vector, vector_to_array
 from pyspark.ml.linalg import VectorUDT
@@ -368,7 +368,10 @@ class _SparkXGBParams(
                     " on GPU."
) - if not (ss.version >= "3.4.0" and _is_standalone_or_localcluster(sc)): + if not ( + ss.version >= "3.4.0" + and _is_standalone_or_localcluster(sc.getConf()) + ): # We will enable stage-level scheduling in spark 3.4.0+ which doesn't # require spark.task.resource.gpu.amount to be set explicitly gpu_per_task = sc.getConf().get("spark.task.resource.gpu.amount") @@ -907,30 +910,27 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): return booster_params, train_call_kwargs_params, dmatrix_kwargs - def _skip_stage_level_scheduling(self) -> bool: + def _skip_stage_level_scheduling(self, spark_version: str, conf: SparkConf) -> bool: # pylint: disable=too-many-return-statements """Check if stage-level scheduling is not needed, return true to skip stage-level scheduling""" if self._run_on_gpu(): - ss = _get_spark_session() - sc = ss.sparkContext - - if ss.version < "3.4.0": + if spark_version < "3.4.0": self.logger.info( "Stage-level scheduling in xgboost requires spark version 3.4.0+" ) return True - if not _is_standalone_or_localcluster(sc): + if not _is_standalone_or_localcluster(conf): self.logger.info( "Stage-level scheduling in xgboost requires spark standalone or " "local-cluster mode" ) return True - executor_cores = sc.getConf().get("spark.executor.cores") - executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount") + executor_cores = conf.get("spark.executor.cores") + executor_gpus = conf.get("spark.executor.resource.gpu.amount") if executor_cores is None or executor_gpus is None: self.logger.info( "Stage-level scheduling in xgboost requires spark.executor.cores, " @@ -955,7 +955,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): ) return True - task_gpu_amount = sc.getConf().get("spark.task.resource.gpu.amount") + task_gpu_amount = conf.get("spark.task.resource.gpu.amount") if task_gpu_amount is None: # The ETL tasks will not grab a gpu when spark.task.resource.gpu.amount is not set, @@ -975,14 +975,13 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): def _try_stage_level_scheduling(self, rdd: RDD) -> RDD: """Try to enable stage-level scheduling""" - - if self._skip_stage_level_scheduling(): + ss = _get_spark_session() + conf = ss.sparkContext.getConf() + if self._skip_stage_level_scheduling(ss.version, conf): return rdd - ss = _get_spark_session() - # executor_cores will not be None - executor_cores = ss.sparkContext.getConf().get("spark.executor.cores") + executor_cores = conf.get("spark.executor.cores") assert executor_cores is not None # Spark-rapids is a project to leverage GPUs to accelerate spark SQL. 
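
The refactored ``_try_stage_level_scheduling`` ultimately attaches a dedicated
``ResourceProfile`` to the training RDD. For readers unfamiliar with the PySpark
API involved, a minimal sketch follows (illustrative only, not part of this
patch; ``executor_cores`` is a placeholder for ``spark.executor.cores``, and the
exact number of cores requested per training task is an implementation detail):

```python
from pyspark import RDD
from pyspark.resource import ResourceProfileBuilder, TaskResourceRequests


def with_training_resources(rdd: RDD, executor_cores: int) -> RDD:
    # Request the executor's cores and a whole GPU for each training task, so
    # only one barrier task runs per executor even though the ETL stages use a
    # fractional spark.task.resource.gpu.amount.
    treqs = TaskResourceRequests().cpus(executor_cores).resource("gpu", 1.0)
    rprof = ResourceProfileBuilder().require(treqs).build  # `build` is a property
    return rdd.withResources(rprof)
```
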
diff --git a/python-package/xgboost/spark/utils.py b/python-package/xgboost/spark/utils.py index 395865386..805aa5c10 100644 --- a/python-package/xgboost/spark/utils.py +++ b/python-package/xgboost/spark/utils.py @@ -10,7 +10,7 @@ from threading import Thread from typing import Any, Callable, Dict, Optional, Set, Type import pyspark -from pyspark import BarrierTaskContext, SparkContext, SparkFiles, TaskContext +from pyspark import BarrierTaskContext, SparkConf, SparkContext, SparkFiles, TaskContext from pyspark.sql.session import SparkSession from xgboost import Booster, XGBModel, collective @@ -129,8 +129,8 @@ def _is_local(spark_context: SparkContext) -> bool: return spark_context._jsc.sc().isLocal() -def _is_standalone_or_localcluster(spark_context: SparkContext) -> bool: - master = spark_context.getConf().get("spark.master") +def _is_standalone_or_localcluster(conf: SparkConf) -> bool: + master = conf.get("spark.master") return master is not None and ( master.startswith("spark://") or master.startswith("local-cluster") ) diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py index 2c5ee3690..406174542 100644 --- a/tests/test_distributed/test_with_spark/test_spark_local.py +++ b/tests/test_distributed/test_with_spark/test_spark_local.py @@ -8,6 +8,7 @@ from typing import Generator, Sequence, Type import numpy as np import pytest +from pyspark import SparkConf import xgboost as xgb from xgboost import testing as tm @@ -932,6 +933,113 @@ class TestPySparkLocal: model_loaded.set_device("cuda") assert model_loaded._run_on_gpu() + def test_skip_stage_level_scheduling(self) -> None: + conf = ( + SparkConf() + .setMaster("spark://foo") + .set("spark.executor.cores", "12") + .set("spark.task.cpus", "1") + .set("spark.executor.resource.gpu.amount", "1") + .set("spark.task.resource.gpu.amount", "0.08") + ) + + classifer_on_cpu = SparkXGBClassifier(use_gpu=False) + classifer_on_gpu = SparkXGBClassifier(use_gpu=True) + + # the correct configurations should not skip stage-level scheduling + assert not classifer_on_gpu._skip_stage_level_scheduling("3.4.0", conf) + + # spark version < 3.4.0 + assert classifer_on_gpu._skip_stage_level_scheduling("3.3.0", conf) + + # not run on GPU + assert classifer_on_cpu._skip_stage_level_scheduling("3.4.0", conf) + + # spark.executor.cores is not set + badConf = ( + SparkConf() + .setMaster("spark://foo") + .set("spark.task.cpus", "1") + .set("spark.executor.resource.gpu.amount", "1") + .set("spark.task.resource.gpu.amount", "0.08") + ) + assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf) + + # spark.executor.cores=1 + badConf = ( + SparkConf() + .setMaster("spark://foo") + .set("spark.executor.cores", "1") + .set("spark.task.cpus", "1") + .set("spark.executor.resource.gpu.amount", "1") + .set("spark.task.resource.gpu.amount", "0.08") + ) + assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf) + + # spark.executor.resource.gpu.amount is not set + badConf = ( + SparkConf() + .setMaster("spark://foo") + .set("spark.executor.cores", "12") + .set("spark.task.cpus", "1") + .set("spark.task.resource.gpu.amount", "0.08") + ) + assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf) + + # spark.executor.resource.gpu.amount>1 + badConf = ( + SparkConf() + .setMaster("spark://foo") + .set("spark.executor.cores", "12") + .set("spark.task.cpus", "1") + .set("spark.executor.resource.gpu.amount", "2") + .set("spark.task.resource.gpu.amount", "0.08") + 
) + assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf) + + # spark.task.resource.gpu.amount is not set + badConf = ( + SparkConf() + .setMaster("spark://foo") + .set("spark.executor.cores", "12") + .set("spark.task.cpus", "1") + .set("spark.executor.resource.gpu.amount", "1") + ) + assert not classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf) + + # spark.task.resource.gpu.amount=1 + badConf = ( + SparkConf() + .setMaster("spark://foo") + .set("spark.executor.cores", "12") + .set("spark.task.cpus", "1") + .set("spark.executor.resource.gpu.amount", "1") + .set("spark.task.resource.gpu.amount", "1") + ) + assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf) + + # yarn + badConf = ( + SparkConf() + .setMaster("yarn") + .set("spark.executor.cores", "12") + .set("spark.task.cpus", "1") + .set("spark.executor.resource.gpu.amount", "1") + .set("spark.task.resource.gpu.amount", "1") + ) + assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf) + + # k8s + badConf = ( + SparkConf() + .setMaster("k8s://") + .set("spark.executor.cores", "12") + .set("spark.task.cpus", "1") + .set("spark.executor.resource.gpu.amount", "1") + .set("spark.task.resource.gpu.amount", "1") + ) + assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf) + class XgboostLocalTest(SparkTestCase): def setUp(self): From fedd9674c8441a26b94f9d4061b11a1be4cb0e7c Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 17 Nov 2023 04:29:08 +0800 Subject: [PATCH 13/32] Implement column sampler in CUDA. (#9785) - CUDA implementation. - Extract the broadcasting logic, we will need the context parameter after revamping the collective implementation. - Some changes to the event loop for fixing a deadlock in CI. - Move argsort into algorithms.cuh, add support for cuda stream. 
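
For reference, the sampling trick behind the new CUDA column sampler can be
sketched in a few lines of NumPy (illustration only, not part of the patch):

```python
import numpy as np


def weighted_sample_without_replacement(weights: np.ndarray, n: int, rng) -> np.ndarray:
    # Efraimidis-Spirakis: each item draws a key log(u) / w with u ~ U(0, 1);
    # the n items with the largest keys form the weighted sample.
    u = rng.uniform(size=weights.size)
    keys = np.log(u) / np.maximum(weights, np.finfo(np.float32).eps)
    return np.argsort(keys)[::-1][:n]


rng = np.random.default_rng(0)
# A heavily weighted feature should be selected in almost every draw.
print(weighted_sample_without_replacement(np.array([1.0, 1.0, 1.0, 50.0]), 2, rng))
```
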
--- src/collective/loop.cc | 25 +++-- src/collective/loop.h | 3 + src/common/algorithm.cuh | 91 +++++++++++++++-- src/common/device_helpers.cuh | 82 +++------------ src/common/random.cc | 43 +++++--- src/common/random.cu | 106 ++++++++++++++++++++ src/common/random.h | 76 ++++++++------ src/metric/auc.cc | 15 +-- src/metric/auc.cu | 52 +++++----- src/metric/auc.h | 20 ++-- src/objective/adaptive.cu | 10 +- src/tree/gpu_hist/evaluator.cu | 2 +- src/tree/updater_approx.cc | 6 +- src/tree/updater_colmaker.cc | 13 ++- src/tree/updater_quantile_hist.cc | 9 +- tests/cpp/common/test_algorithm.cu | 8 +- tests/cpp/common/test_hist_util.cu | 3 +- tests/cpp/common/test_random.cc | 91 ++++++++++++----- tests/cpp/tree/hist/test_evaluate_splits.cc | 12 +-- tests/cpp/tree/test_constraints.cc | 12 +-- 20 files changed, 447 insertions(+), 232 deletions(-) create mode 100644 src/common/random.cu diff --git a/src/collective/loop.cc b/src/collective/loop.cc index 10fce0516..5cfb0034d 100644 --- a/src/collective/loop.cc +++ b/src/collective/loop.cc @@ -117,11 +117,14 @@ void Loop::Process() { break; } - auto unlock_notify = [&](bool is_blocking) { + auto unlock_notify = [&](bool is_blocking, bool stop) { if (!is_blocking) { - return; + std::lock_guard guard{mu_}; + stop_ = stop; + } else { + stop_ = stop; + lock.unlock(); } - lock.unlock(); cv_.notify_one(); }; @@ -145,13 +148,14 @@ void Loop::Process() { auto rc = this->EmptyQueue(&qcopy); // Handle error if (!rc.OK()) { + unlock_notify(is_blocking, true); + std::lock_guard guard{rc_lock_}; this->rc_ = std::move(rc); - unlock_notify(is_blocking); return; } CHECK(qcopy.empty()); - unlock_notify(is_blocking); + unlock_notify(is_blocking, false); } } @@ -170,12 +174,21 @@ Result Loop::Stop() { } [[nodiscard]] Result Loop::Block() { + { + std::lock_guard guard{rc_lock_}; + if (!rc_.OK()) { + return std::move(rc_); + } + } this->Submit(Op{Op::kBlock}); { std::unique_lock lock{mu_}; cv_.wait(lock, [this] { return (this->queue_.empty()) || stop_; }); } - return std::move(rc_); + { + std::lock_guard lock{rc_lock_}; + return std::move(rc_); + } } Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} { diff --git a/src/collective/loop.h b/src/collective/loop.h index 4f5cb12b3..0c1fdcbfe 100644 --- a/src/collective/loop.h +++ b/src/collective/loop.h @@ -42,7 +42,10 @@ class Loop { std::mutex mu_; std::queue queue_; std::chrono::seconds timeout_; + Result rc_; + std::mutex rc_lock_; // lock for transferring error info. 
+ bool stop_{false}; std::exception_ptr curr_exce_{nullptr}; common::Monitor mutable timer_; diff --git a/src/common/algorithm.cuh b/src/common/algorithm.cuh index 53acc65e1..5f0986d5b 100644 --- a/src/common/algorithm.cuh +++ b/src/common/algorithm.cuh @@ -23,8 +23,7 @@ #include "xgboost/logging.h" // CHECK #include "xgboost/span.h" // Span,byte -namespace xgboost { -namespace common { +namespace xgboost::common { namespace detail { // Wrapper around cub sort to define is_decending template group_ptr, template void SegmentedArgSort(Context const *ctx, Span values, Span group_ptr, Span sorted_idx) { + auto cuctx = ctx->CUDACtx(); CHECK_GE(group_ptr.size(), 1ul); std::size_t n_groups = group_ptr.size() - 1; std::size_t bytes = 0; if (per_seg_index) { SegmentedSequence(ctx, group_ptr, sorted_idx); } else { - dh::Iota(sorted_idx); + dh::Iota(sorted_idx, cuctx->Stream()); } dh::TemporaryArray> values_out(values.size()); dh::TemporaryArray> sorted_idx_out(sorted_idx.size()); @@ -141,15 +141,16 @@ void SegmentedArgSort(Context const *ctx, Span values, Span group_ptr, detail::DeviceSegmentedRadixSortPair( nullptr, bytes, values.data(), values_out.data().get(), sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(), - group_ptr.data() + 1, ctx->CUDACtx()->Stream()); + group_ptr.data() + 1, cuctx->Stream()); dh::TemporaryArray temp_storage(bytes); detail::DeviceSegmentedRadixSortPair( temp_storage.data().get(), bytes, values.data(), values_out.data().get(), sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(), - group_ptr.data() + 1, ctx->CUDACtx()->Stream()); + group_ptr.data() + 1, cuctx->Stream()); dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(), - sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice)); + sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice, + cuctx->Stream())); } /** @@ -159,11 +160,12 @@ void SegmentedArgSort(Context const *ctx, Span values, Span group_ptr, template void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, ValIt val_begin, ValIt val_end, dh::device_vector *p_sorted_idx) { + auto cuctx = ctx->CUDACtx(); using Tup = thrust::tuple; auto &sorted_idx = *p_sorted_idx; std::size_t n = std::distance(val_begin, val_end); sorted_idx.resize(n); - dh::Iota(dh::ToSpan(sorted_idx)); + dh::Iota(dh::ToSpan(sorted_idx), cuctx->Stream()); dh::device_vector keys(sorted_idx.size()); auto key_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> Tup { @@ -177,7 +179,7 @@ void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, V return thrust::make_tuple(seg_idx, residue); }); thrust::copy(ctx->CUDACtx()->CTP(), key_it, key_it + keys.size(), keys.begin()); - thrust::stable_sort_by_key(ctx->CUDACtx()->TP(), keys.begin(), keys.end(), sorted_idx.begin(), + thrust::stable_sort_by_key(cuctx->TP(), keys.begin(), keys.end(), sorted_idx.begin(), [=] XGBOOST_DEVICE(Tup const &l, Tup const &r) { if (thrust::get<0>(l) != thrust::get<0>(r)) { return thrust::get<0>(l) < thrust::get<0>(r); // segment index @@ -185,6 +187,75 @@ void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, V return thrust::get<1>(l) < thrust::get<1>(r); // residue }); } -} // namespace common -} // namespace xgboost + +template +void ArgSort(xgboost::Context const *ctx, xgboost::common::Span keys, + xgboost::common::Span sorted_idx) { + std::size_t bytes = 0; + auto cuctx = ctx->CUDACtx(); + 
dh::Iota(sorted_idx, cuctx->Stream()); + + using KeyT = typename decltype(keys)::value_type; + using ValueT = std::remove_const_t; + + dh::TemporaryArray out(keys.size()); + cub::DoubleBuffer d_keys(const_cast(keys.data()), out.data().get()); + dh::TemporaryArray sorted_idx_out(sorted_idx.size()); + cub::DoubleBuffer d_values(const_cast(sorted_idx.data()), + sorted_idx_out.data().get()); + + // track https://github.com/NVIDIA/cub/pull/340 for 64bit length support + using OffsetT = std::conditional_t; + CHECK_LE(sorted_idx.size(), std::numeric_limits::max()); + if (accending) { + void *d_temp_storage = nullptr; +#if THRUST_MAJOR_VERSION >= 2 + dh::safe_cuda((cub::DispatchRadixSort::Dispatch( + d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false, + cuctx->Stream()))); +#else + dh::safe_cuda((cub::DispatchRadixSort::Dispatch( + d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false, + nullptr, false))); +#endif + dh::TemporaryArray storage(bytes); + d_temp_storage = storage.data().get(); +#if THRUST_MAJOR_VERSION >= 2 + dh::safe_cuda((cub::DispatchRadixSort::Dispatch( + d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false, + cuctx->Stream()))); +#else + dh::safe_cuda((cub::DispatchRadixSort::Dispatch( + d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false, + nullptr, false))); +#endif + } else { + void *d_temp_storage = nullptr; +#if THRUST_MAJOR_VERSION >= 2 + dh::safe_cuda((cub::DispatchRadixSort::Dispatch( + d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false, + cuctx->Stream()))); +#else + dh::safe_cuda((cub::DispatchRadixSort::Dispatch( + d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false, + nullptr, false))); +#endif + dh::TemporaryArray storage(bytes); + d_temp_storage = storage.data().get(); +#if THRUST_MAJOR_VERSION >= 2 + dh::safe_cuda((cub::DispatchRadixSort::Dispatch( + d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false, + cuctx->Stream()))); +#else + dh::safe_cuda((cub::DispatchRadixSort::Dispatch( + d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false, + nullptr, false))); +#endif + } + + dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(), + sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice, + cuctx->Stream())); +} +} // namespace xgboost::common #endif // XGBOOST_COMMON_ALGORITHM_CUH_ diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 74336ac61..066f8a3e6 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -313,8 +313,8 @@ inline void LaunchN(size_t n, L lambda) { } template -void Iota(Container array) { - LaunchN(array.size(), [=] __device__(size_t i) { array[i] = i; }); +void Iota(Container array, cudaStream_t stream) { + LaunchN(array.size(), stream, [=] __device__(size_t i) { array[i] = i; }); } namespace detail { @@ -597,6 +597,16 @@ class DoubleBuffer { T *Other() { return buff.Alternate(); } }; +template +xgboost::common::Span LazyResize(xgboost::Context const *ctx, + xgboost::HostDeviceVector *buffer, std::size_t n) { + buffer->SetDevice(ctx->Device()); + if (buffer->Size() < n) { + buffer->Resize(n); + } + return buffer->DeviceSpan().subspan(0, n); +} + /** * \brief Copies device span to std::vector. 
* @@ -1060,74 +1070,6 @@ void InclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items) InclusiveScan(d_in, d_out, cub::Sum(), num_items); } -template -void ArgSort(xgboost::common::Span keys, xgboost::common::Span sorted_idx) { - size_t bytes = 0; - Iota(sorted_idx); - - using KeyT = typename decltype(keys)::value_type; - using ValueT = std::remove_const_t; - - TemporaryArray out(keys.size()); - cub::DoubleBuffer d_keys(const_cast(keys.data()), - out.data().get()); - TemporaryArray sorted_idx_out(sorted_idx.size()); - cub::DoubleBuffer d_values(const_cast(sorted_idx.data()), - sorted_idx_out.data().get()); - - // track https://github.com/NVIDIA/cub/pull/340 for 64bit length support - using OffsetT = std::conditional_t; - CHECK_LE(sorted_idx.size(), std::numeric_limits::max()); - if (accending) { - void *d_temp_storage = nullptr; -#if THRUST_MAJOR_VERSION >= 2 - safe_cuda((cub::DispatchRadixSort::Dispatch( - d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, - sizeof(KeyT) * 8, false, nullptr))); -#else - safe_cuda((cub::DispatchRadixSort::Dispatch( - d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, - sizeof(KeyT) * 8, false, nullptr, false))); -#endif - TemporaryArray storage(bytes); - d_temp_storage = storage.data().get(); -#if THRUST_MAJOR_VERSION >= 2 - safe_cuda((cub::DispatchRadixSort::Dispatch( - d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, - sizeof(KeyT) * 8, false, nullptr))); -#else - safe_cuda((cub::DispatchRadixSort::Dispatch( - d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, - sizeof(KeyT) * 8, false, nullptr, false))); -#endif - } else { - void *d_temp_storage = nullptr; -#if THRUST_MAJOR_VERSION >= 2 - safe_cuda((cub::DispatchRadixSort::Dispatch( - d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, - sizeof(KeyT) * 8, false, nullptr))); -#else - safe_cuda((cub::DispatchRadixSort::Dispatch( - d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, - sizeof(KeyT) * 8, false, nullptr, false))); -#endif - TemporaryArray storage(bytes); - d_temp_storage = storage.data().get(); -#if THRUST_MAJOR_VERSION >= 2 - safe_cuda((cub::DispatchRadixSort::Dispatch( - d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, - sizeof(KeyT) * 8, false, nullptr))); -#else - safe_cuda((cub::DispatchRadixSort::Dispatch( - d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, - sizeof(KeyT) * 8, false, nullptr, false))); -#endif - } - - safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(), - sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice)); -} - class CUDAStreamView; class CUDAEvent { diff --git a/src/common/random.cc b/src/common/random.cc index d0e75729d..e0d1a2255 100644 --- a/src/common/random.cc +++ b/src/common/random.cc @@ -1,32 +1,50 @@ -/*! 
- * Copyright 2020 by XGBoost Contributors - * \file random.cc +/** + * Copyright 2020-2023, XGBoost Contributors */ #include "random.h" -namespace xgboost { -namespace common { +#include // for sort, max, copy +#include // for shared_ptr + +#include "xgboost/host_device_vector.h" // for HostDeviceVector + +namespace xgboost::common { std::shared_ptr> ColumnSampler::ColSample( std::shared_ptr> p_features, float colsample) { if (colsample == 1.0f) { return p_features; } + + int n = std::max(1, static_cast(colsample * p_features->Size())); + auto p_new_features = std::make_shared>(); + + if (ctx_->IsCUDA()) { +#if defined(XGBOOST_USE_CUDA) + cuda_impl::SampleFeature(ctx_, n, p_features, p_new_features, this->feature_weights_, + &this->weight_buffer_, &this->idx_buffer_, &rng_); + return p_new_features; +#else + AssertGPUSupport(); + return nullptr; +#endif // defined(XGBOOST_USE_CUDA) + } + const auto &features = p_features->HostVector(); CHECK_GT(features.size(), 0); - int n = std::max(1, static_cast(colsample * features.size())); - auto p_new_features = std::make_shared>(); auto &new_features = *p_new_features; - if (feature_weights_.size() != 0) { + if (!feature_weights_.Empty()) { auto const &h_features = p_features->HostVector(); - std::vector weights(h_features.size()); + auto const &h_feature_weight = feature_weights_.ConstHostVector(); + auto &weight = this->weight_buffer_.HostVector(); + weight.resize(h_features.size()); for (size_t i = 0; i < h_features.size(); ++i) { - weights[i] = feature_weights_[h_features[i]]; + weight[i] = h_feature_weight[h_features[i]]; } CHECK(ctx_); new_features.HostVector() = - WeightedSamplingWithoutReplacement(ctx_, p_features->HostVector(), weights, n); + WeightedSamplingWithoutReplacement(ctx_, p_features->HostVector(), weight, n); } else { new_features.Resize(features.size()); std::copy(features.begin(), features.end(), new_features.HostVector().begin()); @@ -36,5 +54,4 @@ std::shared_ptr> ColumnSampler::ColSample( std::sort(new_features.HostVector().begin(), new_features.HostVector().end()); return p_new_features; } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/random.cu b/src/common/random.cu new file mode 100644 index 000000000..f5811d924 --- /dev/null +++ b/src/common/random.cu @@ -0,0 +1,106 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include // for shuffle + +#include // for shared_ptr + +#include "algorithm.cuh" // for ArgSort +#include "cuda_context.cuh" // for CUDAContext +#include "device_helpers.cuh" +#include "random.h" +#include "xgboost/base.h" // for bst_feature_t +#include "xgboost/context.h" // for Context +#include "xgboost/host_device_vector.h" // for HostDeviceVector + +namespace xgboost::common::cuda_impl { +// GPU implementation for sampling without replacement, see the CPU version for references. 
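+// Sketch of the idea: each candidate draws a key log(u) / w with
+// u ~ Uniform(0, 1); after the argsort below, the entries with the largest
+// keys form the weighted sample without replacement (Efraimidis-Spirakis).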
+void WeightedSamplingWithoutReplacement(Context const *ctx, common::Span array, + common::Span weights, + common::Span results, + HostDeviceVector *sorted_idx, + GlobalRandomEngine *grng) { + CUDAContext const *cuctx = ctx->CUDACtx(); + CHECK_EQ(array.size(), weights.size()); + // Sampling keys + dh::caching_device_vector keys(weights.size()); + + auto d_keys = dh::ToSpan(keys); + + auto seed = (*grng)(); + constexpr auto kEps = kRtEps; // avoid CUDA compilation error + thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), array.size(), + [=] XGBOOST_DEVICE(std::size_t i) { + thrust::default_random_engine rng; + rng.seed(seed); + rng.discard(i); + thrust::uniform_real_distribution dist; + + auto w = std::max(weights[i], kEps); + auto u = dist(rng); + auto k = std::log(u) / w; + d_keys[i] = k; + }); + // Allocate buffer for sorted index. + auto d_idx = dh::LazyResize(ctx, sorted_idx, keys.size()); + + ArgSort(ctx, d_keys, d_idx); + + // Filter the result according to sorted index. + auto it = thrust::make_permutation_iterator(dh::tbegin(array), dh::tbegin(d_idx)); + // |array| == |weights| == |keys| == |sorted_idx| >= |results| + for (auto size : {array.size(), weights.size(), keys.size()}) { + CHECK_EQ(size, d_idx.size()); + } + CHECK_GE(array.size(), results.size()); + thrust::copy_n(cuctx->CTP(), it, results.size(), dh::tbegin(results)); +} + +void SampleFeature(Context const *ctx, bst_feature_t n_features, + std::shared_ptr> p_features, + std::shared_ptr> p_new_features, + HostDeviceVector const &feature_weights, + HostDeviceVector *weight_buffer, + HostDeviceVector *idx_buffer, GlobalRandomEngine *grng) { + CUDAContext const *cuctx = ctx->CUDACtx(); + auto &new_features = *p_new_features; + new_features.SetDevice(ctx->Device()); + p_features->SetDevice(ctx->Device()); + CHECK_LE(n_features, p_features->Size()); + + if (!feature_weights.Empty()) { + CHECK_LE(p_features->Size(), feature_weights.Size()); + idx_buffer->SetDevice(ctx->Device()); + feature_weights.SetDevice(ctx->Device()); + + auto d_old_features = p_features->DeviceSpan(); + auto d_weight_buffer = dh::LazyResize(ctx, weight_buffer, d_old_features.size()); + // Filter weights according to the existing feature index. 
+ auto d_feature_weight = feature_weights.ConstDeviceSpan(); + auto it = thrust::make_permutation_iterator(dh::tcbegin(d_feature_weight), + dh::tcbegin(d_old_features)); + thrust::copy_n(cuctx->CTP(), it, d_old_features.size(), dh::tbegin(d_weight_buffer)); + new_features.Resize(n_features); + WeightedSamplingWithoutReplacement(ctx, d_old_features, d_weight_buffer, + new_features.DeviceSpan(), idx_buffer, grng); + } else { + new_features.Resize(p_features->Size()); + new_features.Copy(*p_features); + auto d_feat = new_features.DeviceSpan(); + thrust::default_random_engine rng; + rng.seed((*grng)()); + thrust::shuffle(cuctx->CTP(), dh::tbegin(d_feat), dh::tend(d_feat), rng); + new_features.Resize(n_features); + } + + auto d_new_features = new_features.DeviceSpan(); + thrust::sort(cuctx->CTP(), dh::tbegin(d_new_features), dh::tend(d_new_features)); +} + +void InitFeatureSet(Context const *ctx, + std::shared_ptr> p_features) { + CUDAContext const *cuctx = ctx->CUDACtx(); + auto d_features = p_features->DeviceSpan(); + thrust::sequence(cuctx->CTP(), dh::tbegin(d_features), dh::tend(d_features), 0); +} +} // namespace xgboost::common::cuda_impl diff --git a/src/common/random.h b/src/common/random.h index 5efdb486d..2a94123a3 100644 --- a/src/common/random.h +++ b/src/common/random.h @@ -1,5 +1,5 @@ -/*! - * Copyright 2015-2020 by Contributors +/** + * Copyright 2015-2020, XGBoost Contributors * \file random.h * \brief Utility related to random. * \author Tianqi Chen @@ -25,8 +25,7 @@ #include "xgboost/context.h" // Context #include "xgboost/host_device_vector.h" -namespace xgboost { -namespace common { +namespace xgboost::common { /*! * \brief Define mt19937 as default type Random Engine. */ @@ -113,6 +112,18 @@ std::vector WeightedSamplingWithoutReplacement(Context const* ctx, std::vecto return results; } +namespace cuda_impl { +void SampleFeature(Context const* ctx, bst_feature_t n_features, + std::shared_ptr> p_features, + std::shared_ptr> p_new_features, + HostDeviceVector const& feature_weights, + HostDeviceVector* weight_buffer, + HostDeviceVector* idx_buffer, GlobalRandomEngine* grng); + +void InitFeatureSet(Context const* ctx, + std::shared_ptr> p_features); +} // namespace cuda_impl + /** * \class ColumnSampler * @@ -123,46 +134,37 @@ std::vector WeightedSamplingWithoutReplacement(Context const* ctx, std::vecto class ColumnSampler { std::shared_ptr> feature_set_tree_; std::map>> feature_set_level_; - std::vector feature_weights_; + HostDeviceVector feature_weights_; float colsample_bylevel_{1.0f}; float colsample_bytree_{1.0f}; float colsample_bynode_{1.0f}; GlobalRandomEngine rng_; Context const* ctx_; + // Used for weighted sampling. + HostDeviceVector idx_buffer_; + HostDeviceVector weight_buffer_; + public: std::shared_ptr> ColSample( std::shared_ptr> p_features, float colsample); /** - * \brief Column sampler constructor. - * \note This constructor manually sets the rng seed + * @brief Column sampler constructor. + * @note This constructor manually sets the rng seed */ - explicit ColumnSampler(uint32_t seed) { - rng_.seed(seed); - } + explicit ColumnSampler(std::uint32_t seed) { rng_.seed(seed); } /** - * \brief Column sampler constructor. - * \note This constructor synchronizes the RNG seed across processes. - */ - ColumnSampler() { - uint32_t seed = common::GlobalRandom()(); - collective::Broadcast(&seed, sizeof(seed), 0); - rng_.seed(seed); - } - - /** - * \brief Initialise this object before use. + * @brief Initialise this object before use. 
* - * \param num_col - * \param colsample_bynode - * \param colsample_bylevel - * \param colsample_bytree - * \param skip_index_0 (Optional) True to skip index 0. + * @param num_col + * @param colsample_bynode Sampling rate for node. + * @param colsample_bylevel Sampling rate for tree level. + * @param colsample_bytree Sampling rate for tree. */ void Init(Context const* ctx, int64_t num_col, std::vector feature_weights, float colsample_bynode, float colsample_bylevel, float colsample_bytree) { - feature_weights_ = std::move(feature_weights); + feature_weights_.HostVector() = std::move(feature_weights); colsample_bylevel_ = colsample_bylevel; colsample_bytree_ = colsample_bytree; colsample_bynode_ = colsample_bynode; @@ -173,8 +175,17 @@ class ColumnSampler { } Reset(); + feature_set_tree_->SetDevice(ctx->Device()); feature_set_tree_->Resize(num_col); - std::iota(feature_set_tree_->HostVector().begin(), feature_set_tree_->HostVector().end(), 0); + if (ctx->IsCPU()) { + std::iota(feature_set_tree_->HostVector().begin(), feature_set_tree_->HostVector().end(), 0); + } else { +#if defined(XGBOOST_USE_CUDA) + cuda_impl::InitFeatureSet(ctx, feature_set_tree_); +#else + AssertGPUSupport(); +#endif + } feature_set_tree_ = ColSample(feature_set_tree_, colsample_bytree_); } @@ -216,6 +227,11 @@ class ColumnSampler { } }; -} // namespace common -} // namespace xgboost +inline auto MakeColumnSampler(Context const*) { + std::uint32_t seed = common::GlobalRandomEngine()(); + collective::Broadcast(&seed, sizeof(seed), 0); + auto cs = std::make_shared(seed); + return cs; +} +} // namespace xgboost::common #endif // XGBOOST_COMMON_RANDOM_H_ diff --git a/src/metric/auc.cc b/src/metric/auc.cc index 2e5c88174..4a8aa8a4b 100644 --- a/src/metric/auc.cc +++ b/src/metric/auc.cc @@ -360,7 +360,7 @@ class EvalROCAUC : public EvalAUC { common::OptionalWeights{info.weights_.ConstHostSpan()}); } else { std::tie(fp, tp, auc) = - GPUBinaryROCAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_); + GPUBinaryROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_); } return std::make_tuple(fp, tp, auc); } @@ -376,8 +376,9 @@ XGBOOST_REGISTER_METRIC(EvalAUC, "auc") .set_body([](const char*) { return new EvalROCAUC(); }); #if !defined(XGBOOST_USE_CUDA) -std::tuple GPUBinaryROCAUC(common::Span, MetaInfo const &, - DeviceOrd, std::shared_ptr *) { +std::tuple GPUBinaryROCAUC(Context const *, common::Span, + MetaInfo const &, + std::shared_ptr *) { common::AssertGPUSupport(); return {}; } @@ -409,8 +410,7 @@ class EvalPRAUC : public EvalAUC { BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0), common::OptionalWeights{info.weights_.ConstHostSpan()}); } else { - std::tie(pr, re, auc) = - GPUBinaryPRAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_); + std::tie(pr, re, auc) = GPUBinaryPRAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_); } return std::make_tuple(pr, re, auc); } @@ -453,8 +453,9 @@ XGBOOST_REGISTER_METRIC(AUCPR, "aucpr") .set_body([](char const *) { return new EvalPRAUC{}; }); #if !defined(XGBOOST_USE_CUDA) -std::tuple GPUBinaryPRAUC(common::Span, MetaInfo const &, - DeviceOrd, std::shared_ptr *) { +std::tuple GPUBinaryPRAUC(Context const *, common::Span, + MetaInfo const &, + std::shared_ptr *) { common::AssertGPUSupport(); return {}; } diff --git a/src/metric/auc.cu b/src/metric/auc.cu index 8b8349e1b..4ce10d094 100644 --- a/src/metric/auc.cu +++ b/src/metric/auc.cu @@ -83,13 +83,14 @@ void InitCacheOnce(common::Span predts, 
std::shared_ptr -std::tuple -GPUBinaryAUC(common::Span predts, MetaInfo const &info, - DeviceOrd device, common::Span d_sorted_idx, - Fn area_fn, std::shared_ptr cache) { - auto labels = info.labels.View(device); +std::tuple GPUBinaryAUC(Context const *ctx, + common::Span predts, + MetaInfo const &info, + common::Span d_sorted_idx, Fn area_fn, + std::shared_ptr cache) { + auto labels = info.labels.View(ctx->Device()); auto weights = info.weights_.ConstDeviceSpan(); - dh::safe_cuda(cudaSetDevice(device.ordinal)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); CHECK_NE(labels.Size(), 0); CHECK_EQ(labels.Size(), predts.size()); @@ -115,7 +116,7 @@ GPUBinaryAUC(common::Span predts, MetaInfo const &info, dh::XGBDeviceAllocator alloc; auto d_unique_idx = dh::ToSpan(cache->unique_idx); - dh::Iota(d_unique_idx); + dh::Iota(d_unique_idx, ctx->CUDACtx()->Stream()); auto uni_key = dh::MakeTransformIterator( thrust::make_counting_iterator(0), @@ -167,8 +168,9 @@ GPUBinaryAUC(common::Span predts, MetaInfo const &info, return std::make_tuple(last.first, last.second, auc); } -std::tuple GPUBinaryROCAUC(common::Span predts, - MetaInfo const &info, DeviceOrd device, +std::tuple GPUBinaryROCAUC(Context const *ctx, + common::Span predts, + MetaInfo const &info, std::shared_ptr *p_cache) { auto &cache = *p_cache; InitCacheOnce(predts, p_cache); @@ -177,10 +179,10 @@ std::tuple GPUBinaryROCAUC(common::Span pre * Create sorted index for each class */ auto d_sorted_idx = dh::ToSpan(cache->sorted_idx); - dh::ArgSort(predts, d_sorted_idx); + common::ArgSort(ctx, predts, d_sorted_idx); // Create lambda to avoid pass function pointer. return GPUBinaryAUC( - predts, info, device, d_sorted_idx, + ctx, predts, info, d_sorted_idx, [] XGBOOST_DEVICE(double x0, double x1, double y0, double y1) -> double { return TrapezoidArea(x0, x1, y0, y1); }, @@ -361,7 +363,7 @@ double GPUMultiClassAUCOVR(Context const *ctx, MetaInfo const &info, */ dh::XGBDeviceAllocator alloc; auto d_unique_idx = dh::ToSpan(cache->unique_idx); - dh::Iota(d_unique_idx); + dh::Iota(d_unique_idx, ctx->CUDACtx()->Stream()); auto uni_key = dh::MakeTransformIterator>( thrust::make_counting_iterator(0), [=] XGBOOST_DEVICE(size_t i) { uint32_t class_id = i / n_samples; @@ -603,8 +605,9 @@ std::pair GPURankingAUC(Context const *ctx, common::Span< return std::make_pair(auc, n_valid); } -std::tuple GPUBinaryPRAUC(common::Span predts, - MetaInfo const &info, DeviceOrd device, +std::tuple GPUBinaryPRAUC(Context const *ctx, + common::Span predts, + MetaInfo const &info, std::shared_ptr *p_cache) { auto& cache = *p_cache; InitCacheOnce(predts, p_cache); @@ -613,9 +616,9 @@ std::tuple GPUBinaryPRAUC(common::Span pred * Create sorted index for each class */ auto d_sorted_idx = dh::ToSpan(cache->sorted_idx); - dh::ArgSort(predts, d_sorted_idx); + common::ArgSort(ctx, predts, d_sorted_idx); - auto labels = info.labels.View(device); + auto labels = info.labels.View(ctx->Device()); auto d_weights = info.weights_.ConstDeviceSpan(); auto get_weight = common::OptionalWeights{d_weights}; auto it = dh::MakeTransformIterator( @@ -639,7 +642,7 @@ std::tuple GPUBinaryPRAUC(common::Span pred return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp, total_pos); }; double fp, tp, auc; - std::tie(fp, tp, auc) = GPUBinaryAUC(predts, info, device, d_sorted_idx, fn, cache); + std::tie(fp, tp, auc) = GPUBinaryAUC(ctx, predts, info, d_sorted_idx, fn, cache); return std::make_tuple(1.0, 1.0, auc); } @@ -699,16 +702,17 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span predts, } 
template -std::pair -GPURankingPRAUCImpl(common::Span predts, MetaInfo const &info, - common::Span d_group_ptr, DeviceOrd device, - std::shared_ptr cache, Fn area_fn) { +std::pair GPURankingPRAUCImpl(Context const *ctx, + common::Span predts, + MetaInfo const &info, + common::Span d_group_ptr, + std::shared_ptr cache, Fn area_fn) { /** * Sorted idx */ auto d_sorted_idx = dh::ToSpan(cache->sorted_idx); - auto labels = info.labels.View(device); + auto labels = info.labels.View(ctx->Device()); auto weights = info.weights_.ConstDeviceSpan(); uint32_t n_groups = static_cast(info.group_ptr_.size() - 1); @@ -739,7 +743,7 @@ GPURankingPRAUCImpl(common::Span predts, MetaInfo const &info, */ dh::XGBDeviceAllocator alloc; auto d_unique_idx = dh::ToSpan(cache->unique_idx); - dh::Iota(d_unique_idx); + dh::Iota(d_unique_idx, ctx->CUDACtx()->Stream()); auto uni_key = dh::MakeTransformIterator>( thrust::make_counting_iterator(0), [=] XGBOOST_DEVICE(size_t i) { auto idx = d_sorted_idx[i]; @@ -882,7 +886,7 @@ std::pair GPURankingPRAUC(Context const *ctx, return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp, d_totals[group_id].first); }; - return GPURankingPRAUCImpl(predts, info, d_group_ptr, ctx->Device(), cache, fn); + return GPURankingPRAUCImpl(ctx, predts, info, d_group_ptr, cache, fn); } } // namespace metric } // namespace xgboost diff --git a/src/metric/auc.h b/src/metric/auc.h index fce1cc757..4fe2ecec4 100644 --- a/src/metric/auc.h +++ b/src/metric/auc.h @@ -1,5 +1,5 @@ -/*! - * Copyright 2021 by XGBoost Contributors +/** + * Copyright 2021-2023, XGBoost Contributors */ #ifndef XGBOOST_METRIC_AUC_H_ #define XGBOOST_METRIC_AUC_H_ @@ -18,8 +18,7 @@ #include "xgboost/metric.h" #include "xgboost/span.h" -namespace xgboost { -namespace metric { +namespace xgboost::metric { /*********** * ROC AUC * ***********/ @@ -29,8 +28,9 @@ XGBOOST_DEVICE inline double TrapezoidArea(double x0, double x1, double y0, doub struct DeviceAUCCache; -std::tuple GPUBinaryROCAUC(common::Span predts, - MetaInfo const &info, DeviceOrd, +std::tuple GPUBinaryROCAUC(Context const *ctx, + common::Span predts, + MetaInfo const &info, std::shared_ptr *p_cache); double GPUMultiClassROCAUC(Context const *ctx, common::Span predts, @@ -44,8 +44,9 @@ std::pair GPURankingAUC(Context const *ctx, common::Span< /********** * PR AUC * **********/ -std::tuple GPUBinaryPRAUC(common::Span predts, - MetaInfo const &info, DeviceOrd, +std::tuple GPUBinaryPRAUC(Context const *ctx, + common::Span predts, + MetaInfo const &info, std::shared_ptr *p_cache); double GPUMultiClassPRAUC(Context const *ctx, common::Span predts, @@ -111,6 +112,5 @@ struct PRAUCLabelInvalid { inline void InvalidLabels() { LOG(FATAL) << "PR-AUC supports only binary relevance for learning to rank."; } -} // namespace metric -} // namespace xgboost +} // namespace xgboost::metric #endif // XGBOOST_METRIC_AUC_H_ diff --git a/src/objective/adaptive.cu b/src/objective/adaptive.cu index cea211622..07644146b 100644 --- a/src/objective/adaptive.cu +++ b/src/objective/adaptive.cu @@ -13,9 +13,7 @@ #include "adaptive.h" #include "xgboost/context.h" -namespace xgboost { -namespace obj { -namespace detail { +namespace xgboost::obj::detail { void EncodeTreeLeafDevice(Context const* ctx, common::Span position, dh::device_vector* p_ridx, HostDeviceVector* p_nptr, HostDeviceVector* p_nidx, RegTree const& tree) { @@ -28,7 +26,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span pos position.size_bytes(), cudaMemcpyDeviceToDevice, cuctx->Stream())); p_ridx->resize(position.size()); - 
dh::Iota(dh::ToSpan(*p_ridx)); + dh::Iota(dh::ToSpan(*p_ridx), cuctx->Stream()); // sort row index according to node index thrust::stable_sort_by_key(cuctx->TP(), sorted_position.begin(), sorted_position.begin() + n_samples, p_ridx->begin()); @@ -190,6 +188,4 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span pos }); UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate, p_tree); } -} // namespace detail -} // namespace obj -} // namespace xgboost +} // namespace xgboost::obj::detail diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index f862e048e..6eed74c56 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -72,7 +72,7 @@ common::Span GPUHistEvaluator::SortHistogram( TreeEvaluator::SplitEvaluator evaluator) { dh::XGBCachingDeviceAllocator alloc; auto sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()); - dh::Iota(sorted_idx); + dh::Iota(sorted_idx, dh::DefaultStream()); auto data = this->SortInput(d_inputs.size(), shared_inputs.feature_values.size()); auto it = thrust::make_counting_iterator(0u); auto d_feature_idx = dh::ToSpan(feature_idx_); diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index 3c37556e1..94e7547ee 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -248,8 +248,7 @@ class GlobalApproxUpdater : public TreeUpdater { std::unique_ptr pimpl_; // pointer to the last DMatrix, used for update prediction cache. DMatrix *cached_{nullptr}; - std::shared_ptr column_sampler_ = - std::make_shared(); + std::shared_ptr column_sampler_; ObjInfo const *task_; HistMakerTrainParam hist_param_; @@ -284,6 +283,9 @@ class GlobalApproxUpdater : public TreeUpdater { common::Span> out_position, const std::vector &trees) override { CHECK(hist_param_.GetInitialised()); + if (!column_sampler_) { + column_sampler_ = common::MakeColumnSampler(ctx_); + } pimpl_ = std::make_unique(param, &hist_param_, m->Info(), ctx_, column_sampler_, task_, &monitor_); diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 7a88bd30e..e366811f7 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -225,9 +225,12 @@ class ColMaker: public TreeUpdater { } } { - column_sampler_.Init(ctx_, fmat.Info().num_col_, - fmat.Info().feature_weights.ConstHostVector(), param_.colsample_bynode, - param_.colsample_bylevel, param_.colsample_bytree); + if (!column_sampler_) { + column_sampler_ = common::MakeColumnSampler(ctx_); + } + column_sampler_->Init( + ctx_, fmat.Info().num_col_, fmat.Info().feature_weights.ConstHostVector(), + param_.colsample_bynode, param_.colsample_bylevel, param_.colsample_bytree); } { // setup temp space for each thread @@ -467,7 +470,7 @@ class ColMaker: public TreeUpdater { RegTree *p_tree) { auto evaluator = tree_evaluator_.GetEvaluator(); - auto feat_set = column_sampler_.GetFeatureSet(depth); + auto feat_set = column_sampler_->GetFeatureSet(depth); for (const auto &batch : p_fmat->GetBatches(ctx_)) { this->UpdateSolution(batch, feat_set->HostVector(), gpair, p_fmat); } @@ -586,7 +589,7 @@ class ColMaker: public TreeUpdater { const ColMakerTrainParam& colmaker_train_param_; // number of omp thread used during training Context const* ctx_; - common::ColumnSampler column_sampler_; + std::shared_ptr column_sampler_; // Instance Data: current node position in the tree of each instance std::vector position_; // PerThread x PerTreeNode: statistics for per thread construction diff --git 
a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 375b24cfa..2bb5b0b49 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023 by XGBoost Contributors + * Copyright 2017-2023, XGBoost Contributors * \file updater_quantile_hist.cc * \brief use quantized feature values to construct a tree * \author Philip Cho, Tianqi Checn, Egor Smirnov @@ -470,8 +470,7 @@ class HistUpdater { class QuantileHistMaker : public TreeUpdater { std::unique_ptr p_impl_{nullptr}; std::unique_ptr p_mtimpl_{nullptr}; - std::shared_ptr column_sampler_ = - std::make_shared(); + std::shared_ptr column_sampler_; common::Monitor monitor_; ObjInfo const *task_{nullptr}; HistMakerTrainParam hist_param_; @@ -495,6 +494,10 @@ class QuantileHistMaker : public TreeUpdater { void Update(TrainParam const *param, linalg::Matrix *gpair, DMatrix *p_fmat, common::Span> out_position, const std::vector &trees) override { + if (!column_sampler_) { + column_sampler_ = common::MakeColumnSampler(ctx_); + } + if (trees.front()->IsMultiTarget()) { CHECK(hist_param_.GetInitialised()); CHECK(param->monotone_constraints.empty()) << "monotone constraint" << MTNotImplemented(); diff --git a/tests/cpp/common/test_algorithm.cu b/tests/cpp/common/test_algorithm.cu index c36073397..8f857ff50 100644 --- a/tests/cpp/common/test_algorithm.cu +++ b/tests/cpp/common/test_algorithm.cu @@ -57,13 +57,13 @@ TEST(Algorithm, GpuArgSort) { auto ctx = MakeCUDACtx(0); dh::device_vector values(20); - dh::Iota(dh::ToSpan(values)); // accending + dh::Iota(dh::ToSpan(values), ctx.CUDACtx()->Stream()); // accending dh::device_vector sorted_idx(20); - dh::ArgSort(dh::ToSpan(values), dh::ToSpan(sorted_idx)); // sort to descending - ASSERT_TRUE(thrust::is_sorted(thrust::device, sorted_idx.begin(), sorted_idx.end(), + ArgSort(&ctx, dh::ToSpan(values), dh::ToSpan(sorted_idx)); // sort to descending + ASSERT_TRUE(thrust::is_sorted(ctx.CUDACtx()->CTP(), sorted_idx.begin(), sorted_idx.end(), thrust::greater{})); - dh::Iota(dh::ToSpan(values)); + dh::Iota(dh::ToSpan(values), ctx.CUDACtx()->Stream()); dh::device_vector groups(3); groups[0] = 0; groups[1] = 10; diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index 92d8ff753..c0d5c5ddc 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -16,6 +16,7 @@ #include // for vector #include "../../../include/xgboost/logging.h" +#include "../../../src/common/cuda_context.cuh" #include "../../../src/common/device_helpers.cuh" #include "../../../src/common/hist_util.cuh" #include "../../../src/common/hist_util.h" @@ -211,7 +212,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) { cuts_ptr.SetDevice(DeviceOrd::CUDA(0)); dh::device_vector weight(n_samples * n_features, 0); - dh::Iota(dh::ToSpan(weight)); + dh::Iota(dh::ToSpan(weight), ctx.CUDACtx()->Stream()); dh::caching_device_vector columns_ptr(4); for (std::size_t i = 0; i < columns_ptr.size(); ++i) { diff --git a/tests/cpp/common/test_random.cc b/tests/cpp/common/test_random.cc index e2ecd0990..a51776475 100644 --- a/tests/cpp/common/test_random.cc +++ b/tests/cpp/common/test_random.cc @@ -1,19 +1,20 @@ -#include +/** + * Copyright 2018-2023, XGBoost Contributors + */ #include "../../../src/common/random.h" #include "../helpers.h" #include "gtest/gtest.h" -#include "xgboost/context.h" // Context +#include "xgboost/context.h" // for Context -namespace xgboost { -namespace common { -TEST(ColumnSampler, Test) { - 
Context ctx; +namespace xgboost::common { +namespace { +void TestBasic(Context const* ctx) { int n = 128; - ColumnSampler cs; + ColumnSampler cs{1u}; std::vector feature_weights; // No node sampling - cs.Init(&ctx, n, feature_weights, 1.0f, 0.5f, 0.5f); + cs.Init(ctx, n, feature_weights, 1.0f, 0.5f, 0.5f); auto set0 = cs.GetFeatureSet(0); ASSERT_EQ(set0->Size(), 32); @@ -26,7 +27,7 @@ TEST(ColumnSampler, Test) { ASSERT_EQ(set2->Size(), 32); // Node sampling - cs.Init(&ctx, n, feature_weights, 0.5f, 1.0f, 0.5f); + cs.Init(ctx, n, feature_weights, 0.5f, 1.0f, 0.5f); auto set3 = cs.GetFeatureSet(0); ASSERT_EQ(set3->Size(), 32); @@ -36,21 +37,33 @@ TEST(ColumnSampler, Test) { ASSERT_EQ(set4->Size(), 32); // No level or node sampling, should be the same at different depth - cs.Init(&ctx, n, feature_weights, 1.0f, 1.0f, 0.5f); - ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(), - cs.GetFeatureSet(1)->HostVector()); + cs.Init(ctx, n, feature_weights, 1.0f, 1.0f, 0.5f); + ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(), cs.GetFeatureSet(1)->HostVector()); - cs.Init(&ctx, n, feature_weights, 1.0f, 1.0f, 1.0f); + cs.Init(ctx, n, feature_weights, 1.0f, 1.0f, 1.0f); auto set5 = cs.GetFeatureSet(0); ASSERT_EQ(set5->Size(), n); - cs.Init(&ctx, n, feature_weights, 1.0f, 1.0f, 1.0f); + cs.Init(ctx, n, feature_weights, 1.0f, 1.0f, 1.0f); auto set6 = cs.GetFeatureSet(0); ASSERT_EQ(set5->HostVector(), set6->HostVector()); // Should always be a minimum of one feature - cs.Init(&ctx, n, feature_weights, 1e-16f, 1e-16f, 1e-16f); + cs.Init(ctx, n, feature_weights, 1e-16f, 1e-16f, 1e-16f); ASSERT_EQ(cs.GetFeatureSet(0)->Size(), 1); } +} // namespace + +TEST(ColumnSampler, Test) { + Context ctx; + TestBasic(&ctx); +} + +#if defined(XGBOOST_USE_CUDA) +TEST(ColumnSampler, GPUTest) { + auto ctx = MakeCUDACtx(0); + TestBasic(&ctx); +} +#endif // defined(XGBOOST_USE_CUDA) // Test if different threads using the same seed produce the same result TEST(ColumnSampler, ThreadSynchronisation) { @@ -81,16 +94,16 @@ TEST(ColumnSampler, ThreadSynchronisation) { ASSERT_TRUE(success); } -TEST(ColumnSampler, WeightedSampling) { - auto test_basic = [](int first) { - Context ctx; +namespace { +void TestWeightedSampling(Context const* ctx) { + auto test_basic = [ctx](int first) { std::vector feature_weights(2); feature_weights[0] = std::abs(first - 1.0f); feature_weights[1] = first - 0.0f; ColumnSampler cs{0}; - cs.Init(&ctx, 2, feature_weights, 1.0, 1.0, 0.5); + cs.Init(ctx, 2, feature_weights, 1.0, 1.0, 0.5); auto feature_sets = cs.GetFeatureSet(0); - auto const &h_feat_set = feature_sets->HostVector(); + auto const& h_feat_set = feature_sets->HostVector(); ASSERT_EQ(h_feat_set.size(), 1); ASSERT_EQ(h_feat_set[0], first - 0); }; @@ -104,8 +117,7 @@ TEST(ColumnSampler, WeightedSampling) { SimpleRealUniformDistribution dist(.0f, 12.0f); std::generate(feature_weights.begin(), feature_weights.end(), [&]() { return dist(&rng); }); ColumnSampler cs{0}; - Context ctx; - cs.Init(&ctx, kCols, feature_weights, 0.5f, 1.0f, 1.0f); + cs.Init(ctx, kCols, feature_weights, 0.5f, 1.0f, 1.0f); std::vector features(kCols); std::iota(features.begin(), features.end(), 0); std::vector freq(kCols, 0); @@ -131,8 +143,22 @@ TEST(ColumnSampler, WeightedSampling) { EXPECT_NEAR(freq[i], feature_weights[i], 1e-2); } } +} // namespace -TEST(ColumnSampler, WeightedMultiSampling) { +TEST(ColumnSampler, WeightedSampling) { + Context ctx; + TestWeightedSampling(&ctx); +} + +#if defined(XGBOOST_USE_CUDA) +TEST(ColumnSampler, GPUWeightedSampling) { + auto ctx = MakeCUDACtx(0); 
+  TestWeightedSampling(&ctx);
+}
+#endif  // defined(XGBOOST_USE_CUDA)
+
+namespace {
+void TestWeightedMultiSampling(Context const* ctx) {
   size_t constexpr kCols = 32;
   std::vector feature_weights(kCols, 0);
   for (size_t i = 0; i < feature_weights.size(); ++i) {
@@ -140,13 +166,24 @@
   }
   ColumnSampler cs{0};
   float bytree{0.5}, bylevel{0.5}, bynode{0.5};
-  Context ctx;
-  cs.Init(&ctx, feature_weights.size(), feature_weights, bytree, bylevel, bynode);
+  cs.Init(ctx, feature_weights.size(), feature_weights, bytree, bylevel, bynode);
   auto feature_set = cs.GetFeatureSet(0);
   size_t n_sampled = kCols * bytree * bylevel * bynode;
   ASSERT_EQ(feature_set->Size(), n_sampled);
   feature_set = cs.GetFeatureSet(1);
   ASSERT_EQ(feature_set->Size(), n_sampled);
 }
-}  // namespace common
-}  // namespace xgboost
+}  // namespace
+
+TEST(ColumnSampler, WeightedMultiSampling) {
+  Context ctx;
+  TestWeightedMultiSampling(&ctx);
+}
+
+#if defined(XGBOOST_USE_CUDA)
+TEST(ColumnSampler, GPUWeightedMultiSampling) {
+  auto ctx = MakeCUDACtx(0);
+  TestWeightedMultiSampling(&ctx);
+}
+#endif  // defined(XGBOOST_USE_CUDA)
+}  // namespace xgboost::common
diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc
index 78fda5ce5..329379b5b 100644
--- a/tests/cpp/tree/hist/test_evaluate_splits.cc
+++ b/tests/cpp/tree/hist/test_evaluate_splits.cc
@@ -28,7 +28,7 @@ void TestEvaluateSplits(bool force_read_by_column) {
   Context ctx;
   ctx.nthread = 4;
   int static constexpr kRows = 8, kCols = 16;
-  auto sampler = std::make_shared();
+  auto sampler = std::make_shared(1u);
   TrainParam param;
   param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}});
@@ -102,7 +102,7 @@ TEST(HistMultiEvaluator, Evaluate) {
   TrainParam param;
   param.Init(Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}});
-  auto sampler = std::make_shared();
+  auto sampler = std::make_shared(1u);
   std::size_t n_samples = 3;
   bst_feature_t n_features = 2;
@@ -166,7 +166,7 @@ TEST(HistEvaluator, Apply) {
   TrainParam param;
   param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0.0"}});
   auto dmat = RandomDataGenerator(kNRows, kNCols, 0).Seed(3).GenerateDMatrix();
-  auto sampler = std::make_shared();
+  auto sampler = std::make_shared(1u);
   auto evaluator_ = HistEvaluator{&ctx, &param, dmat->Info(), sampler};
   CPUExpandEntry entry{0, 0};
@@ -194,7 +194,7 @@ TEST_F(TestPartitionBasedSplit, CPUHist) {
   Context ctx;
   // check the evaluator is returning the optimal split
   std::vector ft{FeatureType::kCategorical};
-  auto sampler = std::make_shared();
+  auto sampler = std::make_shared(1u);
   HistEvaluator evaluator{&ctx, &param_, info_, sampler};
   evaluator.InitRoot(GradStats{total_gpair_});
   RegTree tree;
@@ -224,7 +224,7 @@ auto CompareOneHotAndPartition(bool onehot) {
   auto dmat =
       RandomDataGenerator(kRows, kCols, 0).Seed(3).Type(ft).MaxCategory(n_cats).GenerateDMatrix();
-  auto sampler = std::make_shared();
+  auto sampler = std::make_shared(1u);
   auto evaluator = HistEvaluator{&ctx, &param, dmat->Info(), sampler};
   std::vector entries(1);
   HistMakerTrainParam hist_param;
@@ -271,7 +271,7 @@ TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
   ASSERT_EQ(node_hist.size(), feature_histogram_.size());
   std::copy(feature_histogram_.cbegin(), feature_histogram_.cend(), node_hist.begin());
-  auto sampler = std::make_shared();
+  auto sampler = std::make_shared(1u);
   MetaInfo info;
   info.num_col_ = 1;
   info.feature_types = {FeatureType::kCategorical};
diff --git a/tests/cpp/tree/test_constraints.cc b/tests/cpp/tree/test_constraints.cc
index 912d608a3..4f810102d 100644
--- a/tests/cpp/tree/test_constraints.cc
+++ b/tests/cpp/tree/test_constraints.cc
@@ -1,3 +1,6 @@
+/**
+ * Copyright 2019-2023, XGBoost Contributors
+ */
 #include
 #include
 #include
@@ -9,9 +12,7 @@
 #include "../../../src/tree/hist/evaluate_splits.h"
 #include "../helpers.h"
-namespace xgboost {
-namespace tree {
-
+namespace xgboost::tree {
 TEST(CPUFeatureInteractionConstraint, Empty) {
   TrainParam param;
   param.UpdateAllowUnknown(Args{});
@@ -77,7 +78,7 @@ TEST(CPUMonoConstraint, Basic) {
   param.UpdateAllowUnknown(Args{{"monotone_constraints", str_mono}});
   auto Xy = RandomDataGenerator{kRows, kCols, 0.0}.GenerateDMatrix(true);
-  auto sampler = std::make_shared();
+  auto sampler = std::make_shared(1u);
   HistEvaluator evalutor{&ctx, &param, Xy->Info(), sampler};
   evalutor.InitRoot(GradStats{2.0, 2.0});
@@ -90,5 +91,4 @@ TEST(CPUMonoConstraint, Basic) {
   ASSERT_TRUE(evalutor.Evaluator().has_constraint);
 }
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree

From 0715ab3c1094651fb371199d1a5be288e62dd748 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 22 Nov 2023 19:27:31 +0800
Subject: [PATCH 14/32] Use `dlopen` to load NCCL. (#9796)

This PR adds optional support for loading NCCL with `dlopen` as an alternative
to compile-time linking. This is to address the size bloat issue with the PyPI
binary release.

- Add a CMake option to load `nccl` at runtime.
- Add an NCCL stub.

After this, `nccl` will be fetched from PyPI when using pip to install XGBoost,
either by a user or by `pyproject.toml`. Others who want to link NCCL at compile
time can continue to do so without any change.

At the moment, this is Linux only since we only support MNMG on Linux.
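For reviewers unfamiliar with the technique, below is a minimal standalone sketch of
the `dlopen`/`dlsym` pattern that the new stub builds on. It is illustrative only and
not part of the patch: the real stub wraps every NCCL entry point it needs and reports
failures through XGBoost's `Result` type rather than writing to stderr.

```
// Minimal sketch (not XGBoost code): load NCCL at runtime and query its version.
#include <dlfcn.h>  // for dlopen, dlsym, dlclose, dlerror

#include <iostream>

int main() {
  void* handle = dlopen("libnccl.so.2", RTLD_LAZY);
  if (!handle) {
    std::cerr << "Failed to load NCCL: " << dlerror() << "\n";
    return 1;
  }
  // The real signature is `ncclResult_t ncclGetVersion(int*)`; ncclResult_t is an
  // enum with ncclSuccess == 0, so a plain int return type suffices for this sketch.
  using GetVersionFn = int (*)(int*);
  auto get_version = reinterpret_cast<GetVersionFn>(dlsym(handle, "ncclGetVersion"));
  if (!get_version) {
    std::cerr << "Missing symbol `ncclGetVersion`: " << dlerror() << "\n";
    dlclose(handle);
    return 1;
  }
  int v{0};
  if (get_version(&v) == 0) {
    // Same decoding as the stub: major * 10000 + minor * 100 + patch (NCCL >= 2.9).
    std::cout << "NCCL " << v / 10000 << "." << (v / 100) % 100 << "." << v % 100 << "\n";
  }
  dlclose(handle);
  return 0;
}
```

With `RTLD_LAZY`, unresolved references inside the library are only bound on first
use; the stub still checks every `dlsym` result up front so that a missing symbol
fails fast with a clear error message.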
---
 CMakeLists.txt                              | 10 ++
 cmake/Utils.cmake                           | 17 ++-
 cmake/modules/FindNccl.cmake                | 29 +++--
 doc/tutorials/dask.rst                      | 31 +++++
 include/xgboost/c_api.h                     |  2 +
 include/xgboost/string_view.h               | 50 ++++----
 jvm-packages/create_jni.py                  |  1 +
 plugin/federated/federated_comm.cuh         |  6 +
 plugin/federated/federated_comm.h           |  6 +-
 python-package/packager/build_config.py     |  2 +
 python-package/pyproject.toml               |  3 +-
 python-package/xgboost/collective.py        | 30 ++++-
 python-package/xgboost/core.py              | 12 +-
 src/c_api/c_api.cc                          |  6 +-
 src/c_api/c_api.cu                          |  8 ++
 src/collective/coll.cu                      | 62 +++++-----
 src/collective/coll.cuh                     |  3 +-
 src/collective/comm.cc                      | 24 ++--
 src/collective/comm.cu                      | 29 +++--
 src/collective/comm.cuh                     | 47 +++++++-
 src/collective/comm.h                       | 22 +++-
 src/collective/comm_group.cc                | 15 +--
 src/collective/comm_group.h                 |  6 +-
 src/collective/communicator.cc              |  5 +
 src/collective/communicator.cu              |  6 +-
 src/collective/communicator.h               |  1 +
 src/collective/nccl_device_communicator.cu  | 49 +++++---
 src/collective/nccl_device_communicator.cuh |  8 +-
 src/collective/nccl_stub.cc                 | 109 ++++++++++++++++++
 src/collective/nccl_stub.h                  | 94 +++++++++++++++
 src/common/device_helpers.cuh               | 24 ----
 tests/buildkite/build-cuda-with-rmm.sh      | 17 ++-
 tests/buildkite/build-cuda.sh               | 16 ++-
 tests/buildkite/test-cpp-gpu.sh             |  1 +
 tests/buildkite/test-cpp-mgpu.sh            |  1 +
 tests/buildkite/test-python-gpu.sh          |  3 +-
 tests/ci_build/Dockerfile.gpu               |  5 +-
 tests/ci_build/Dockerfile.gpu_build_centos7 |  2 +-
 tests/ci_build/prune_libnccl.sh             | 35 ------
 tests/ci_build/rename_whl.py                | 40 +++----
 tests/cpp/collective/test_allgather.cu      |  6 +-
 tests/cpp/collective/test_allreduce.cu      |  8 +-
 .../test_nccl_device_communicator.cu        | 13 +--
 tests/cpp/collective/test_worker.h          |  2 +-
 .../test_gpu_with_dask/test_gpu_with_dask.py | 60 ++++++++++
 45 files changed, 658 insertions(+), 268 deletions(-)
 create mode 100644 src/collective/nccl_stub.cc
 create mode 100644 src/collective/nccl_stub.h
 delete mode 100755 tests/ci_build/prune_libnccl.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e93427ed9..bf8f0cf62 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,7 +69,10 @@ option(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR "Output build artifacts in CMake binar
 option(USE_CUDA  "Build with GPU acceleration" OFF)
 option(USE_PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" ON)
 option(USE_NCCL  "Build with NCCL to enable distributed GPU support." OFF)
+# This is specifically designed for the PyPI binary release and should be disabled for most cases.
+option(USE_DLOPEN_NCCL "Whether to load nccl dynamically." OFF)
 option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)
+
 if(USE_CUDA)
   if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT DEFINED ENV{CUDAARCHS})
     set(GPU_COMPUTE_VER "" CACHE STRING
@@ -80,6 +83,7 @@ if(USE_CUDA)
     unset(GPU_COMPUTE_VER CACHE)
   endif()
 endif()
+
 # CUDA device LTO was introduced in CMake v3.25 and requires host LTO to also be enabled but can still
 # be explicitly disabled allowing for LTO on host only, host and device, or neither, but device-only LTO
 # is not a supported configuration
@@ -115,6 +119,12 @@ endif()
 if(BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
   message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable BUILD_WITH_SHARED_NCCL.")
 endif()
+if(USE_DLOPEN_NCCL AND (NOT USE_NCCL))
+  message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable USE_DLOPEN_NCCL.")
+endif()
+if(USE_DLOPEN_NCCL AND (NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux")))
+  message(SEND_ERROR "`USE_DLOPEN_NCCL` supports only Linux at the moment.")
+endif()
 if(JVM_BINDINGS AND R_LIB)
   message(SEND_ERROR "`R_LIB' is not compatible with `JVM_BINDINGS' as they both have customized configurations.")
 endif()
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index eafd829fc..9c373bb01 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -171,17 +171,24 @@ function(xgboost_set_cuda_flags target)
   endif()
 endfunction()

-macro(xgboost_link_nccl target)
+function(xgboost_link_nccl target)
+  set(xgboost_nccl_flags -DXGBOOST_USE_NCCL=1)
+  if(USE_DLOPEN_NCCL)
+    list(APPEND xgboost_nccl_flags -DXGBOOST_USE_DLOPEN_NCCL=1)
+  endif()
+
   if(BUILD_STATIC_LIB)
     target_include_directories(${target} PUBLIC ${NCCL_INCLUDE_DIR})
-    target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_NCCL=1)
+    target_compile_definitions(${target} PUBLIC ${xgboost_nccl_flags})
     target_link_libraries(${target} PUBLIC ${NCCL_LIBRARY})
   else()
     target_include_directories(${target} PRIVATE ${NCCL_INCLUDE_DIR})
-    target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NCCL=1)
-    target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY})
+    target_compile_definitions(${target} PRIVATE ${xgboost_nccl_flags})
+    if(NOT USE_DLOPEN_NCCL)
+      target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY})
+    endif()
   endif()
-endmacro()
+endfunction()

 # compile options
 macro(xgboost_target_properties target)
diff --git a/cmake/modules/FindNccl.cmake b/cmake/modules/FindNccl.cmake
index 02ee731a1..fa3ed0866 100644
--- a/cmake/modules/FindNccl.cmake
+++ b/cmake/modules/FindNccl.cmake
@@ -54,17 +54,24 @@
 find_path(NCCL_INCLUDE_DIR
   NAMES nccl.h
   HINTS ${NCCL_ROOT}/include $ENV{NCCL_ROOT}/include)

-find_library(NCCL_LIBRARY
-  NAMES ${NCCL_LIB_NAME}
-  HINTS ${NCCL_ROOT}/lib $ENV{NCCL_ROOT}/lib/)
+if(USE_DLOPEN_NCCL)
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(Nccl DEFAULT_MSG NCCL_INCLUDE_DIR)

-message(STATUS "Using nccl library: ${NCCL_LIBRARY}")
+  mark_as_advanced(NCCL_INCLUDE_DIR)
+else()
+  find_library(NCCL_LIBRARY
+    NAMES ${NCCL_LIB_NAME}
+    HINTS ${NCCL_ROOT}/lib $ENV{NCCL_ROOT}/lib/)

-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(Nccl DEFAULT_MSG
-  NCCL_INCLUDE_DIR NCCL_LIBRARY)
+  message(STATUS "Using nccl library: ${NCCL_LIBRARY}")

-mark_as_advanced(
-  NCCL_INCLUDE_DIR
-  NCCL_LIBRARY
-)
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(Nccl DEFAULT_MSG
+    NCCL_INCLUDE_DIR NCCL_LIBRARY)
+
+  mark_as_advanced(
+    NCCL_INCLUDE_DIR
+    NCCL_LIBRARY
+  )
+endif()
diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst
index 148230fe6..4b145f9a9 100644
--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -536,6 +536,37 @@ Troubleshooting
 - MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error
   message that includes `Multiple processes within a communication group ...` upon initialization.

+.. _nccl-load:
+
+- Starting from version 2.1.0, to reduce the size of the binary wheel, the XGBoost package
+  (installed using pip) loads NCCL from the environment instead of bundling it
+  directly. This means that if you encounter an error message like
+  "Failed to load nccl ...", it indicates that NCCL is not installed or not properly
+  configured in your environment.
+
+  To resolve this issue, you can install NCCL using pip:
+
+  .. code-block:: sh
+
+    pip install nvidia-nccl-cu12  # (or with any compatible CUDA version)
+
+  The default conda installation of XGBoost should not encounter this error. If you are
+  using a customized XGBoost, please make sure one of the following is true:
+
+  + XGBoost is NOT compiled with the `USE_DLOPEN_NCCL` flag.
+  + The `dmlc_nccl_path` parameter is set to the full NCCL path when initializing the collective.
+
+  Here are some additional tips for troubleshooting NCCL dependency issues:
+
+  + Check the NCCL installation path and verify that it's installed correctly. We try to
+    find NCCL by using ``from nvidia.nccl import lib`` in Python when XGBoost is installed
+    using pip.
+  + Ensure that you have the correct CUDA version installed. NCCL requires a compatible
+    CUDA version to function properly.
+  + If you are not using distributed training with XGBoost and yet see this error, please
+    open an issue on GitHub.
+  + If you continue to encounter NCCL dependency issues, please open an issue on GitHub.
+
 ************
 IPv6 Support
 ************
diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index ffa3a6c79..59d4d0881 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -1613,6 +1613,8 @@ XGB_DLL int XGTrackerFree(TrackerHandle handle);
  *   - DMLC_TRACKER_PORT: Port number of the tracker.
  *   - DMLC_TASK_ID: ID of the current task, can be used to obtain deterministic rank assignment.
  *   - DMLC_WORKER_CONNECT_RETRY: Number of retries to connect to the tracker.
+ *   - dmlc_nccl_path: The path to the NCCL shared object. Only used if XGBoost is compiled with
+ *     `USE_DLOPEN_NCCL`.
  *   Only applicable to the Federated communicator (use upper case for environment variables, use
  *   lower case for runtime configuration):
  *   - federated_server_address: Address of the federated server.
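A hedged sketch of how this runtime configuration can be supplied follows. It assumes
the `XGCommunicatorInit`/`XGCommunicatorFinalize` entry points declared in this header;
the tracker address, port, task id, and NCCL path below are placeholder values, not
defaults.

```
// Hypothetical usage sketch: pass an explicit NCCL path to the collective at
// initialization time so the dlopen-based stub knows which library to load.
#include <xgboost/c_api.h>

#include <cstdio>

int main() {
  char const* config = R"({
    "dmlc_communicator": "rabit",
    "dmlc_tracker_uri": "127.0.0.1",
    "dmlc_tracker_port": 9091,
    "dmlc_task_id": "0",
    "dmlc_nccl_path": "/usr/lib/x86_64-linux-gnu/libnccl.so.2"
  })";
  if (XGCommunicatorInit(config) != 0) {
    std::fprintf(stderr, "Init failed: %s\n", XGBGetLastError());
    return 1;
  }
  // ... distributed GPU training would happen here ...
  return XGCommunicatorFinalize();
}
```

When the parameter is omitted, the stub falls back to the default name `libnccl.so.2`,
and the Python wrapper additionally tries the NCCL wheel via `from nvidia.nccl import
lib`, as described in the Dask tutorial change above.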
diff --git a/include/xgboost/string_view.h b/include/xgboost/string_view.h index ba0d9f368..463558363 100644 --- a/include/xgboost/string_view.h +++ b/include/xgboost/string_view.h @@ -1,23 +1,24 @@ /** - * Copyright 2021-2023 by XGBoost Contributors + * Copyright 2021-2023, XGBoost Contributors */ #ifndef XGBOOST_STRING_VIEW_H_ #define XGBOOST_STRING_VIEW_H_ #include // CHECK_LT #include // Span -#include // std::equal,std::min -#include // std::reverse_iterator -#include // std::ostream -#include // std::char_traits,std::string +#include // for equal, min +#include // for size_t +#include // for reverse_iterator +#include // for ostream +#include // for char_traits, string namespace xgboost { struct StringView { private: - using CharT = char; // unsigned char + using CharT = char; using Traits = std::char_traits; CharT const* str_{nullptr}; - size_t size_{0}; + std::size_t size_{0}; public: using value_type = CharT; // NOLINT @@ -28,40 +29,41 @@ struct StringView { public: constexpr StringView() = default; - constexpr StringView(CharT const* str, std::size_t size) : str_{str}, size_{size} {} + constexpr StringView(value_type const* str, std::size_t size) : str_{str}, size_{size} {} StringView(std::string const& str) : str_{str.c_str()}, size_{str.size()} {} // NOLINT - constexpr StringView(CharT const* str) // NOLINT + constexpr StringView(value_type const* str) // NOLINT : str_{str}, size_{str == nullptr ? 0ul : Traits::length(str)} {} - CharT const& operator[](size_t p) const { return str_[p]; } - CharT const& at(size_t p) const { // NOLINT + [[nodiscard]] value_type const& operator[](std::size_t p) const { return str_[p]; } + [[nodiscard]] explicit operator std::string() const { return {this->c_str(), this->size()}; } + [[nodiscard]] value_type const& at(std::size_t p) const { // NOLINT CHECK_LT(p, size_); return str_[p]; } - constexpr std::size_t size() const { return size_; } // NOLINT - constexpr bool empty() const { return size() == 0; } // NOLINT - StringView substr(size_t beg, size_t n) const { // NOLINT + [[nodiscard]] constexpr std::size_t size() const { return size_; } // NOLINT + [[nodiscard]] constexpr bool empty() const { return size() == 0; } // NOLINT + [[nodiscard]] StringView substr(std::size_t beg, std::size_t n) const { // NOLINT CHECK_LE(beg, size_); - size_t len = std::min(n, size_ - beg); + std::size_t len = std::min(n, size_ - beg); return {str_ + beg, len}; } - CharT const* c_str() const { return str_; } // NOLINT + [[nodiscard]] value_type const* c_str() const { return str_; } // NOLINT - constexpr CharT const* cbegin() const { return str_; } // NOLINT - constexpr CharT const* cend() const { return str_ + size(); } // NOLINT - constexpr CharT const* begin() const { return str_; } // NOLINT - constexpr CharT const* end() const { return str_ + size(); } // NOLINT + [[nodiscard]] constexpr const_iterator cbegin() const { return str_; } // NOLINT + [[nodiscard]] constexpr const_iterator cend() const { return str_ + size(); } // NOLINT + [[nodiscard]] constexpr iterator begin() const { return str_; } // NOLINT + [[nodiscard]] constexpr iterator end() const { return str_ + size(); } // NOLINT - const_reverse_iterator rbegin() const noexcept { // NOLINT + [[nodiscard]] const_reverse_iterator rbegin() const noexcept { // NOLINT return const_reverse_iterator(this->end()); } - const_reverse_iterator crbegin() const noexcept { // NOLINT + [[nodiscard]] const_reverse_iterator crbegin() const noexcept { // NOLINT return const_reverse_iterator(this->end()); } - 
const_reverse_iterator rend() const noexcept { // NOLINT + [[nodiscard]] const_reverse_iterator rend() const noexcept { // NOLINT return const_reverse_iterator(this->begin()); } - const_reverse_iterator crend() const noexcept { // NOLINT + [[nodiscard]] const_reverse_iterator crend() const noexcept { // NOLINT return const_reverse_iterator(this->begin()); } }; diff --git a/jvm-packages/create_jni.py b/jvm-packages/create_jni.py index 18908fc1c..3692cb13c 100755 --- a/jvm-packages/create_jni.py +++ b/jvm-packages/create_jni.py @@ -103,6 +103,7 @@ if __name__ == "__main__": if cli_args.use_cuda == 'ON': CONFIG['USE_CUDA'] = 'ON' CONFIG['USE_NCCL'] = 'ON' + CONFIG["USE_DLOPEN_NCCL"] = "OFF" args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()] diff --git a/plugin/federated/federated_comm.cuh b/plugin/federated/federated_comm.cuh index df9127644..58c52f67e 100644 --- a/plugin/federated/federated_comm.cuh +++ b/plugin/federated/federated_comm.cuh @@ -5,9 +5,11 @@ #include // for shared_ptr +#include "../../src/collective/coll.h" // for Coll #include "../../src/common/device_helpers.cuh" // for CUDAStreamView #include "federated_comm.h" // for FederatedComm #include "xgboost/context.h" // for Context +#include "xgboost/logging.h" namespace xgboost::collective { class CUDAFederatedComm : public FederatedComm { @@ -16,5 +18,9 @@ class CUDAFederatedComm : public FederatedComm { public: explicit CUDAFederatedComm(Context const* ctx, std::shared_ptr impl); [[nodiscard]] auto Stream() const { return stream_; } + Comm* MakeCUDAVar(Context const*, std::shared_ptr) const override { + LOG(FATAL) << "[Internal Error]: Invalid request for CUDA variant."; + return nullptr; + } }; } // namespace xgboost::collective diff --git a/plugin/federated/federated_comm.h b/plugin/federated/federated_comm.h index a24798626..750d94abd 100644 --- a/plugin/federated/federated_comm.h +++ b/plugin/federated/federated_comm.h @@ -10,12 +10,12 @@ #include // for unique_ptr #include // for string -#include "../../src/collective/comm.h" // for Comm +#include "../../src/collective/comm.h" // for HostComm #include "../../src/common/json_utils.h" // for OptionalArg #include "xgboost/json.h" namespace xgboost::collective { -class FederatedComm : public Comm { +class FederatedComm : public HostComm { std::shared_ptr stub_; void Init(std::string const& host, std::int32_t port, std::int32_t world, std::int32_t rank, @@ -64,6 +64,6 @@ class FederatedComm : public Comm { [[nodiscard]] bool IsFederated() const override { return true; } [[nodiscard]] federated::Federated::Stub* Handle() const { return stub_.get(); } - Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const override; + [[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const override; }; } // namespace xgboost::collective diff --git a/python-package/packager/build_config.py b/python-package/packager/build_config.py index 26392a897..d3733d628 100644 --- a/python-package/packager/build_config.py +++ b/python-package/packager/build_config.py @@ -15,6 +15,8 @@ class BuildConfiguration: # pylint: disable=R0902 use_cuda: bool = False # Whether to enable NCCL use_nccl: bool = False + # Whether to load nccl dynamically + use_dlopen_nccl: bool = False # Whether to enable HDFS use_hdfs: bool = False # Whether to enable Azure Storage diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 199e0f06c..3bd642cc7 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -29,7 +29,8 @@ 
classifiers = [ ] dependencies = [ "numpy", - "scipy" + "scipy", + "nvidia-nccl-cu12 ; platform_system == 'Linux' and platform_machine != 'aarch64'" ] [project.urls] diff --git a/python-package/xgboost/collective.py b/python-package/xgboost/collective.py index 4c67ccbfc..4eb5ea2ab 100644 --- a/python-package/xgboost/collective.py +++ b/python-package/xgboost/collective.py @@ -2,14 +2,15 @@ import ctypes import json import logging +import os import pickle from enum import IntEnum, unique -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import numpy as np from ._typing import _T -from .core import _LIB, _check_call, c_str, from_pystr_to_cstr, py_str +from .core import _LIB, _check_call, build_info, c_str, from_pystr_to_cstr, py_str LOGGER = logging.getLogger("[xgboost.collective]") @@ -250,6 +251,31 @@ class CommunicatorContext: def __init__(self, **args: Any) -> None: self.args = args + key = "dmlc_nccl_path" + if args.get(key, None) is not None: + return + + binfo = build_info() + if not binfo["USE_DLOPEN_NCCL"]: + return + + try: + # PyPI package of NCCL. + from nvidia.nccl import lib + + # There are two versions of nvidia-nccl, one is from PyPI, another one from + # nvidia-pyindex. We support only the first one as the second one is too old + # (2.9.8 as of writing). + if lib.__file__ is not None: + dirname: Optional[str] = os.path.dirname(lib.__file__) + else: + dirname = None + + if dirname: + path = os.path.join(dirname, "libnccl.so.2") + self.args[key] = path + except ImportError: + pass def __enter__(self) -> Dict[str, Any]: init(**self.args) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 648851b31..bfc94aa04 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -184,6 +184,13 @@ def _py_version() -> str: return f.read().strip() +def _register_log_callback(lib: ctypes.CDLL) -> None: + lib.XGBGetLastError.restype = ctypes.c_char_p + lib.callback = _get_log_callback_func() # type: ignore + if lib.XGBRegisterLogCallback(lib.callback) != 0: + raise XGBoostError(lib.XGBGetLastError()) + + def _load_lib() -> ctypes.CDLL: """Load xgboost Library.""" lib_paths = find_lib_path() @@ -228,10 +235,7 @@ Likely causes: Error message(s): {os_error_list} """ ) - lib.XGBGetLastError.restype = ctypes.c_char_p - lib.callback = _get_log_callback_func() # type: ignore - if lib.XGBRegisterLogCallback(lib.callback) != 0: - raise XGBoostError(lib.XGBGetLastError()) + _register_log_callback(lib) def parse(ver: str) -> Tuple[int, int, int]: """Avoid dependency on packaging (PEP 440).""" diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 8975bfb2e..22f03640e 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -7,8 +7,6 @@ #include // for strtoimax #include // for nan #include // for strcmp -#include // for operator<<, basic_ostream, ios, stringstream -#include // for less #include // for numeric_limits #include // for operator!=, _Rb_tree_const_iterator, _Rb_tre... #include // for shared_ptr, allocator, __shared_ptr_access @@ -22,7 +20,6 @@ #include "../common/charconv.h" // for from_chars, to_chars, NumericLimits, from_ch... #include "../common/hist_util.h" // for HistogramCuts #include "../common/io.h" // for FileExtension, LoadSequentialFile, MemoryBuf... -#include "../common/linalg_op.h" // for ElementWiseTransformHost #include "../common/threading_utils.h" // for OmpGetNumThreads, ParallelFor #include "../data/adapter.h" // for ArrayAdapter, DenseAdapter, RecordBatchesIte... 
#include "../data/ellpack_page.h" // for EllpackPage @@ -35,14 +32,12 @@ #include "dmlc/parameter.h" // for FieldAccessEntry, FieldEntry, ParamManager #include "dmlc/thread_local.h" // for ThreadLocalStore #include "rabit/c_api.h" // for RabitLinkTag -#include "rabit/rabit.h" // for CheckPoint, LoadCheckPoint #include "xgboost/base.h" // for bst_ulong, bst_float, GradientPair, bst_feat... #include "xgboost/context.h" // for Context #include "xgboost/data.h" // for DMatrix, MetaInfo, DataType, ExtSparsePage #include "xgboost/feature_map.h" // for FeatureMap #include "xgboost/global_config.h" // for GlobalConfiguration, GlobalConfigThreadLocal... #include "xgboost/host_device_vector.h" // for HostDeviceVector -#include "xgboost/intrusive_ptr.h" // for xgboost #include "xgboost/json.h" // for Json, get, Integer, IsA, Boolean, String #include "xgboost/learner.h" // for Learner, PredictionType #include "xgboost/logging.h" // for LOG_FATAL, LogMessageFatal, CHECK, LogCheck_EQ @@ -79,6 +74,7 @@ void XGBBuildInfoDevice(Json *p_info) { info["USE_CUDA"] = Boolean{false}; info["USE_NCCL"] = Boolean{false}; info["USE_RMM"] = Boolean{false}; + info["USE_DLOPEN_NCCL"] = Boolean{false}; } } // namespace xgboost #endif diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu index 84a371558..4ace8b7cc 100644 --- a/src/c_api/c_api.cu +++ b/src/c_api/c_api.cu @@ -33,8 +33,16 @@ void XGBBuildInfoDevice(Json *p_info) { info["USE_NCCL"] = Boolean{true}; v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}}; info["NCCL_VERSION"] = v; + +#if defined(XGBOOST_USE_DLOPEN_NCCL) + info["USE_DLOPEN_NCCL"] = Boolean{true}; +#else + info["USE_DLOPEN_NCCL"] = Boolean{false}; +#endif // defined(XGBOOST_USE_DLOPEN_NCCL) + #else info["USE_NCCL"] = Boolean{false}; + info["USE_DLOPEN_NCCL"] = Boolean{false}; #endif #if defined(XGBOOST_USE_RMM) diff --git a/src/collective/coll.cu b/src/collective/coll.cu index bac9fb094..60072b6a5 100644 --- a/src/collective/coll.cu +++ b/src/collective/coll.cu @@ -19,25 +19,6 @@ Coll* Coll::MakeCUDAVar() { return new NCCLColl{}; } NCCLColl::~NCCLColl() = default; namespace { -Result GetNCCLResult(ncclResult_t code) { - if (code == ncclSuccess) { - return Success(); - } - - std::stringstream ss; - ss << "NCCL failure: " << ncclGetErrorString(code) << "."; - if (code == ncclUnhandledCudaError) { - // nccl usually preserves the last error so we can get more details. - auto err = cudaPeekAtLastError(); - ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n"; - } else if (code == ncclSystemError) { - ss << " This might be caused by a network configuration issue. Please consider specifying " - "the network interface for NCCL via environment variables listed in its reference: " - "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n"; - } - return Fail(ss.str()); -} - auto GetNCCLType(ArrayInterfaceHandler::Type type) { auto fatal = [] { LOG(FATAL) << "Invalid type for NCCL operation."; @@ -94,11 +75,12 @@ void RunBitwiseAllreduce(dh::CUDAStreamView stream, common::Span ou common::Span data, Op op) { dh::device_vector buffer(data.size() * pcomm->World()); auto* device_buffer = buffer.data().get(); + auto stub = pcomm->Stub(); // First gather data from all the workers. 
CHECK(handle); - auto rc = GetNCCLResult( - ncclAllGather(data.data(), device_buffer, data.size(), ncclInt8, handle, pcomm->Stream())); + auto rc = GetNCCLResult(stub, stub->Allgather(data.data(), device_buffer, data.size(), ncclInt8, + handle, pcomm->Stream())); if (!rc.OK()) { return rc; } @@ -149,6 +131,8 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) { } auto nccl = dynamic_cast(&comm); CHECK(nccl); + auto stub = nccl->Stub(); + return Success() << [&] { if (IsBitwiseOp(op)) { return BitwiseAllReduce(nccl, nccl->Handle(), data, op); @@ -156,9 +140,9 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) { return DispatchDType(type, [=](auto t) { using T = decltype(t); auto rdata = common::RestoreType(data); - auto rc = ncclAllReduce(data.data(), data.data(), rdata.size(), GetNCCLType(type), - GetNCCLRedOp(op), nccl->Handle(), nccl->Stream()); - return GetNCCLResult(rc); + auto rc = stub->Allreduce(data.data(), data.data(), rdata.size(), GetNCCLType(type), + GetNCCLRedOp(op), nccl->Handle(), nccl->Stream()); + return GetNCCLResult(stub, rc); }); } } << [&] { return nccl->Block(); }; @@ -171,9 +155,11 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) { } auto nccl = dynamic_cast(&comm); CHECK(nccl); + auto stub = nccl->Stub(); + return Success() << [&] { - return GetNCCLResult(ncclBroadcast(data.data(), data.data(), data.size_bytes(), ncclInt8, root, - nccl->Handle(), nccl->Stream())); + return GetNCCLResult(stub, stub->Broadcast(data.data(), data.data(), data.size_bytes(), + ncclInt8, root, nccl->Handle(), nccl->Stream())); } << [&] { return nccl->Block(); }; } @@ -184,10 +170,12 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) { } auto nccl = dynamic_cast(&comm); CHECK(nccl); + auto stub = nccl->Stub(); + auto send = data.subspan(comm.Rank() * size, size); return Success() << [&] { - return GetNCCLResult( - ncclAllGather(send.data(), data.data(), size, ncclInt8, nccl->Handle(), nccl->Stream())); + return GetNCCLResult(stub, stub->Allgather(send.data(), data.data(), size, ncclInt8, + nccl->Handle(), nccl->Stream())); } << [&] { return nccl->Block(); }; } @@ -199,19 +187,20 @@ namespace cuda_impl { */ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span data, common::Span sizes, common::Span recv) { - return Success() << [] { return GetNCCLResult(ncclGroupStart()); } << [&] { + auto stub = comm->Stub(); + return Success() << [&stub] { return GetNCCLResult(stub, stub->GroupStart()); } << [&] { std::size_t offset = 0; for (std::int32_t r = 0; r < comm->World(); ++r) { auto as_bytes = sizes[r]; - auto rc = ncclBroadcast(data.data(), recv.subspan(offset, as_bytes).data(), as_bytes, - ncclInt8, r, comm->Handle(), dh::DefaultStream()); + auto rc = stub->Broadcast(data.data(), recv.subspan(offset, as_bytes).data(), as_bytes, + ncclInt8, r, comm->Handle(), dh::DefaultStream()); if (rc != ncclSuccess) { - return GetNCCLResult(rc); + return GetNCCLResult(stub, rc); } offset += as_bytes; } return Success(); - } << [] { return GetNCCLResult(ncclGroupEnd()); }; + } << [&] { return GetNCCLResult(stub, stub->GroupEnd()); }; } } // namespace cuda_impl @@ -224,10 +213,11 @@ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span if (!comm.IsDistributed()) { return Success(); } + auto stub = nccl->Stub(); switch (algo) { case AllgatherVAlgo::kRing: { - return Success() << [] { return GetNCCLResult(ncclGroupStart()); } << [&] { + return Success() << [&] { return GetNCCLResult(stub, stub->GroupStart()); } << [&] { // get worker offset detail::AllgatherVOffset(sizes, recv_segments); // copy data @@ -237,8 +227,8 @@ Result 
BroadcastAllgatherV(NCCLComm const* comm, common::Span cudaMemcpyDeviceToDevice, nccl->Stream())); } return detail::RingAllgatherV(comm, sizes, recv_segments, recv); - } << [] { - return GetNCCLResult(ncclGroupEnd()); + } << [&] { + return GetNCCLResult(stub, stub->GroupEnd()); } << [&] { return nccl->Block(); }; } case AllgatherVAlgo::kBcast: { diff --git a/src/collective/coll.cuh b/src/collective/coll.cuh index 87fb46711..6ededd101 100644 --- a/src/collective/coll.cuh +++ b/src/collective/coll.cuh @@ -8,7 +8,8 @@ #include "../data/array_interface.h" // for ArrayInterfaceHandler #include "coll.h" // for Coll #include "comm.h" // for Comm -#include "xgboost/span.h" // for Span +#include "nccl_stub.h" +#include "xgboost/span.h" // for Span namespace xgboost::collective { class NCCLColl : public Coll { diff --git a/src/collective/comm.cc b/src/collective/comm.cc index 9da9083f8..783278b65 100644 --- a/src/collective/comm.cc +++ b/src/collective/comm.cc @@ -7,15 +7,12 @@ #include // for seconds #include // for exit #include // for shared_ptr -#include // for unique_lock #include // for string #include // for move, forward #include "../common/common.h" // for AssertGPUSupport -#include "../common/json_utils.h" // for OptionalArg #include "allgather.h" // for RingAllgather #include "protocol.h" // for kMagic -#include "tracker.h" // for GetHostAddress #include "xgboost/base.h" // for XGBOOST_STRICT_R_MODE #include "xgboost/collective/socket.h" // for TCPSocket #include "xgboost/json.h" // for Json, Object @@ -62,14 +59,6 @@ Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, st this->Rank(), this->World()); } -#if !defined(XGBOOST_USE_NCCL) -Comm* Comm::MakeCUDAVar(Context const*, std::shared_ptr) const { - common::AssertGPUSupport(); - common::AssertNCCLSupport(); - return nullptr; -} -#endif // !defined(XGBOOST_USE_NCCL) - [[nodiscard]] Result ConnectWorkers(Comm const& comm, TCPSocket* listener, std::int32_t lport, proto::PeerInfo ninfo, std::chrono::seconds timeout, std::int32_t retry, @@ -194,12 +183,21 @@ Comm* Comm::MakeCUDAVar(Context const*, std::shared_ptr) const { } RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout, - std::int32_t retry, std::string task_id) - : Comm{std::move(host), port, timeout, retry, std::move(task_id)} { + std::int32_t retry, std::string task_id, StringView nccl_path) + : HostComm{std::move(host), port, timeout, retry, std::move(task_id)}, + nccl_path_{std::move(nccl_path)} { auto rc = this->Bootstrap(timeout_, retry_, task_id_); CHECK(rc.OK()) << rc.Report(); } +#if !defined(XGBOOST_USE_NCCL) +Comm* RabitComm::MakeCUDAVar(Context const*, std::shared_ptr) const { + common::AssertGPUSupport(); + common::AssertNCCLSupport(); + return nullptr; +} +#endif // !defined(XGBOOST_USE_NCCL) + [[nodiscard]] Result RabitComm::Bootstrap(std::chrono::seconds timeout, std::int32_t retry, std::string task_id) { TCPSocket tracker; diff --git a/src/collective/comm.cu b/src/collective/comm.cu index 09edc522d..cc67def0a 100644 --- a/src/collective/comm.cu +++ b/src/collective/comm.cu @@ -13,19 +13,21 @@ #include "../common/cuda_context.cuh" // for CUDAContext #include "../common/device_helpers.cuh" // for DefaultStream #include "../common/type.h" // for EraseType -#include "broadcast.h" // for Broadcast #include "comm.cuh" // for NCCLComm #include "comm.h" // for Comm +#include "nccl_stub.h" // for NcclStub #include "xgboost/collective/result.h" // for Result #include "xgboost/span.h" // for Span namespace 
xgboost::collective { namespace { -Result GetUniqueId(Comm const& comm, std::shared_ptr coll, ncclUniqueId* pid) { +Result GetUniqueId(Comm const& comm, std::shared_ptr stub, std::shared_ptr coll, + ncclUniqueId* pid) { static const int kRootRank = 0; ncclUniqueId id; if (comm.Rank() == kRootRank) { - dh::safe_nccl(ncclGetUniqueId(&id)); + auto rc = GetNCCLResult(stub, stub->GetUniqueId(&id)); + CHECK(rc.OK()) << rc.Report(); } auto rc = coll->Broadcast( comm, common::Span{reinterpret_cast(&id), sizeof(ncclUniqueId)}, kRootRank); @@ -54,11 +56,12 @@ static std::string PrintUUID(xgboost::common::Span c } } // namespace -Comm* Comm::MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const { - return new NCCLComm{ctx, *this, pimpl}; +Comm* RabitComm::MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const { + return new NCCLComm{ctx, *this, pimpl, StringView{this->nccl_path_}}; } -NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr pimpl) +NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr pimpl, + StringView nccl_path) : Comm{root.TrackerInfo().host, root.TrackerInfo().port, root.Timeout(), root.Retry(), root.TaskID()}, stream_{ctx->CUDACtx()->Stream()} { @@ -70,6 +73,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr p } dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); + stub_ = std::make_shared(nccl_path); std::vector uuids(root.World() * kUuidLength, 0); auto s_uuid = xgboost::common::Span{uuids.data(), uuids.size()}; @@ -95,19 +99,24 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr p << "Multiple processes within communication group running on same CUDA " << "device is not supported. " << PrintUUID(s_this_uuid) << "\n"; - rc = GetUniqueId(root, pimpl, &nccl_unique_id_); + rc = std::move(rc) << [&] { + return GetUniqueId(root, this->stub_, pimpl, &nccl_unique_id_); + } << [&] { + return GetNCCLResult(this->stub_, this->stub_->CommInitRank(&nccl_comm_, root.World(), + nccl_unique_id_, root.Rank())); + }; CHECK(rc.OK()) << rc.Report(); - dh::safe_nccl(ncclCommInitRank(&nccl_comm_, root.World(), nccl_unique_id_, root.Rank())); for (std::int32_t r = 0; r < root.World(); ++r) { this->channels_.emplace_back( - std::make_shared(root, r, nccl_comm_, dh::DefaultStream())); + std::make_shared(root, r, nccl_comm_, stub_, dh::DefaultStream())); } } NCCLComm::~NCCLComm() { if (nccl_comm_) { - dh::safe_nccl(ncclCommDestroy(nccl_comm_)); + auto rc = GetNCCLResult(stub_, stub_->CommDestroy(nccl_comm_)); + CHECK(rc.OK()) << rc.Report(); } } } // namespace xgboost::collective diff --git a/src/collective/comm.cuh b/src/collective/comm.cuh index ea15c50f3..ef537b5a9 100644 --- a/src/collective/comm.cuh +++ b/src/collective/comm.cuh @@ -6,9 +6,13 @@ #ifdef XGBOOST_USE_NCCL #include "nccl.h" #endif // XGBOOST_USE_NCCL + +#include // for move + #include "../common/device_helpers.cuh" #include "coll.h" #include "comm.h" +#include "nccl_stub.h" // for NcclStub #include "xgboost/context.h" namespace xgboost::collective { @@ -21,15 +25,20 @@ inline Result GetCUDAResult(cudaError rc) { return Fail(msg); } +#if defined(XGBOOST_USE_NCCL) class NCCLComm : public Comm { ncclComm_t nccl_comm_{nullptr}; + std::shared_ptr stub_; ncclUniqueId nccl_unique_id_{}; dh::CUDAStreamView stream_; + std::string nccl_path_; public: [[nodiscard]] ncclComm_t Handle() const { return nccl_comm_; } + auto Stub() const { return stub_; } - explicit NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr pimpl); + explicit 
NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr pimpl, + StringView nccl_path); [[nodiscard]] Result LogTracker(std::string) const override { LOG(FATAL) << "Device comm is used for logging."; return Fail("Undefined."); @@ -43,25 +52,53 @@ class NCCLComm : public Comm { } }; +inline Result GetNCCLResult(std::shared_ptr stub, ncclResult_t code) { + if (code == ncclSuccess) { + return Success(); + } + + std::stringstream ss; + ss << "NCCL failure: " << stub->GetErrorString(code) << "."; + if (code == ncclUnhandledCudaError) { + // nccl usually preserves the last error so we can get more details. + auto err = cudaPeekAtLastError(); + ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n"; + } else if (code == ncclSystemError) { + ss << " This might be caused by a network configuration issue. Please consider specifying " + "the network interface for NCCL via environment variables listed in its reference: " + "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n"; + } + return Fail(ss.str()); +} + class NCCLChannel : public Channel { std::int32_t rank_{-1}; ncclComm_t nccl_comm_{}; + std::shared_ptr stub_; dh::CUDAStreamView stream_; public: explicit NCCLChannel(Comm const& comm, std::int32_t rank, ncclComm_t nccl_comm, - dh::CUDAStreamView stream) - : rank_{rank}, nccl_comm_{nccl_comm}, Channel{comm, nullptr}, stream_{stream} {} + std::shared_ptr stub, dh::CUDAStreamView stream) + : rank_{rank}, + nccl_comm_{nccl_comm}, + stub_{std::move(stub)}, + Channel{comm, nullptr}, + stream_{stream} {} void SendAll(std::int8_t const* ptr, std::size_t n) override { - dh::safe_nccl(ncclSend(ptr, n, ncclInt8, rank_, nccl_comm_, stream_)); + auto rc = GetNCCLResult(stub_, stub_->Send(ptr, n, ncclInt8, rank_, nccl_comm_, stream_)); + CHECK(rc.OK()) << rc.Report(); } void RecvAll(std::int8_t* ptr, std::size_t n) override { - dh::safe_nccl(ncclRecv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_)); + auto rc = GetNCCLResult(stub_, stub_->Recv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_)); + CHECK(rc.OK()) << rc.Report(); } [[nodiscard]] Result Block() override { auto rc = stream_.Sync(false); return GetCUDAResult(rc); } }; + +#endif // defined(XGBOOST_USE_NCCL) } // namespace xgboost::collective diff --git a/src/collective/comm.h b/src/collective/comm.h index 76ab479d7..b2f519e3d 100644 --- a/src/collective/comm.h +++ b/src/collective/comm.h @@ -34,6 +34,8 @@ inline std::int32_t BootstrapPrev(std::int32_t r, std::int32_t world) { return nrank; } +inline StringView DefaultNcclName() { return "libnccl.so.2"; } + class Channel; class Coll; @@ -86,11 +88,21 @@ class Comm : public std::enable_shared_from_this { [[nodiscard]] virtual Result LogTracker(std::string msg) const = 0; [[nodiscard]] virtual Result SignalError(Result const&) { return Success(); } - - virtual Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const; }; -class RabitComm : public Comm { +/** + * @brief Base class for CPU-based communicator. + */ +class HostComm : public Comm { + public: + using Comm::Comm; + [[nodiscard]] virtual Comm* MakeCUDAVar(Context const* ctx, + std::shared_ptr pimpl) const = 0; +}; + +class RabitComm : public HostComm { + std::string nccl_path_ = std::string{DefaultNcclName()}; + [[nodiscard]] Result Bootstrap(std::chrono::seconds timeout, std::int32_t retry, std::string task_id); [[nodiscard]] Result Shutdown(); @@ -100,13 +112,15 @@ class RabitComm : public Comm { RabitComm() = default; // ctor for testing where environment is known. 
RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout, - std::int32_t retry, std::string task_id); + std::int32_t retry, std::string task_id, StringView nccl_path); ~RabitComm() noexcept(false) override; [[nodiscard]] bool IsFederated() const override { return false; } [[nodiscard]] Result LogTracker(std::string msg) const override; [[nodiscard]] Result SignalError(Result const&) override; + + [[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const override; }; /** diff --git a/src/collective/comm_group.cc b/src/collective/comm_group.cc index 3d2e24492..f7bbba754 100644 --- a/src/collective/comm_group.cc +++ b/src/collective/comm_group.cc @@ -37,7 +37,7 @@ namespace xgboost::collective { [[nodiscard]] Comm const& CommGroup::Ctx(Context const* ctx, DeviceOrd device) const { if (device.IsCUDA()) { CHECK(ctx->IsCUDA()); - if (!gpu_comm_) { + if (!gpu_comm_ || gpu_comm_->World() != comm_->World()) { gpu_comm_.reset(comm_->MakeCUDAVar(ctx, backend_)); } return *gpu_comm_; @@ -55,7 +55,6 @@ CommGroup::CommGroup() } std::string type = OptionalArg(config, "dmlc_communicator", std::string{"rabit"}); - std::vector keys; // Try both lower and upper case for compatibility auto get_param = [&](std::string name, auto dft, auto t) { std::string upper; @@ -63,8 +62,6 @@ CommGroup::CommGroup() [](char c) { return std::toupper(c); }); std::transform(name.cbegin(), name.cend(), name.begin(), [](char c) { return std::tolower(c); }); - keys.push_back(upper); - keys.push_back(name); auto const& obj = get(config); auto it = obj.find(upper); @@ -75,19 +72,19 @@ CommGroup::CommGroup() } }; // Common args - auto retry = - OptionalArg(config, "dmlc_retry", static_cast(DefaultRetry())); - auto timeout = OptionalArg(config, "dmlc_timeout_sec", - static_cast(DefaultTimeoutSec())); + auto retry = get_param("dmlc_retry", static_cast(DefaultRetry()), Integer{}); + auto timeout = + get_param("dmlc_timeout_sec", static_cast(DefaultTimeoutSec()), Integer{}); auto task_id = get_param("dmlc_task_id", std::string{}, String{}); if (type == "rabit") { auto host = get_param("dmlc_tracker_uri", std::string{}, String{}); auto port = get_param("dmlc_tracker_port", static_cast(0), Integer{}); + auto nccl = get_param("dmlc_nccl_path", std::string{DefaultNcclName()}, String{}); auto ptr = new CommGroup{std::shared_ptr{new RabitComm{ // NOLINT host, static_cast(port), std::chrono::seconds{timeout}, - static_cast(retry), task_id}}, + static_cast(retry), task_id, nccl}}, std::shared_ptr(new Coll{})}; // NOLINT return ptr; } else if (type == "federated") { diff --git a/src/collective/comm_group.h b/src/collective/comm_group.h index 62f3e565f..2f6f91d73 100644 --- a/src/collective/comm_group.h +++ b/src/collective/comm_group.h @@ -17,14 +17,16 @@ namespace xgboost::collective { * collective implementations. 
*/ class CommGroup { - std::shared_ptr comm_; + std::shared_ptr comm_; mutable std::shared_ptr gpu_comm_; std::shared_ptr backend_; mutable std::shared_ptr gpu_coll_; // lazy initialization CommGroup(std::shared_ptr comm, std::shared_ptr coll) - : comm_{std::move(comm)}, backend_{std::move(coll)} {} + : comm_{std::dynamic_pointer_cast(comm)}, backend_{std::move(coll)} { + CHECK(comm_); + } public: CommGroup(); diff --git a/src/collective/communicator.cc b/src/collective/communicator.cc index 6ac9ff58e..7fabe50b4 100644 --- a/src/collective/communicator.cc +++ b/src/collective/communicator.cc @@ -3,6 +3,7 @@ */ #include "communicator.h" +#include "comm.h" #include "in_memory_communicator.h" #include "noop_communicator.h" #include "rabit_communicator.h" @@ -14,8 +15,12 @@ namespace xgboost::collective { thread_local std::unique_ptr Communicator::communicator_{new NoOpCommunicator()}; thread_local CommunicatorType Communicator::type_{}; +thread_local std::string Communicator::nccl_path_{}; void Communicator::Init(Json const& config) { + auto nccl = OptionalArg(config, "dmlc_nccl_path", std::string{DefaultNcclName()}); + nccl_path_ = nccl; + auto type = GetTypeFromEnv(); auto const arg = GetTypeFromConfig(config); if (arg != CommunicatorType::kUnknown) { diff --git a/src/collective/communicator.cu b/src/collective/communicator.cu index a80eab6d5..a7552d356 100644 --- a/src/collective/communicator.cu +++ b/src/collective/communicator.cu @@ -31,17 +31,17 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) { #ifdef XGBOOST_USE_NCCL switch (type_) { case CommunicatorType::kRabit: - device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false)); + device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false, nccl_path_)); break; case CommunicatorType::kFederated: case CommunicatorType::kInMemory: device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal)); break; case CommunicatorType::kInMemoryNccl: - device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true)); + device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true, nccl_path_)); break; default: - device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false)); + device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false, nccl_path_)); } #else device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal)); diff --git a/src/collective/communicator.h b/src/collective/communicator.h index feb446355..b6910b80f 100644 --- a/src/collective/communicator.h +++ b/src/collective/communicator.h @@ -234,6 +234,7 @@ class Communicator { static thread_local std::unique_ptr communicator_; static thread_local CommunicatorType type_; + static thread_local std::string nccl_path_; #if defined(XGBOOST_USE_CUDA) static thread_local std::unique_ptr device_communicator_; #endif diff --git a/src/collective/nccl_device_communicator.cu b/src/collective/nccl_device_communicator.cu index 3d4905cb1..25b198bde 100644 --- a/src/collective/nccl_device_communicator.cu +++ b/src/collective/nccl_device_communicator.cu @@ -2,12 +2,14 @@ * Copyright 2023 XGBoost contributors */ #if defined(XGBOOST_USE_NCCL) +#include "comm.cuh" #include "nccl_device_communicator.cuh" namespace xgboost { namespace collective { -NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync) +NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync, + StringView nccl_path) : device_ordinal_{device_ordinal}, 
needs_sync_{needs_sync}, world_size_{GetWorldSize()}, @@ -18,6 +20,7 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy if (world_size_ == 1) { return; } + stub_ = std::make_shared(std::move(nccl_path)); std::vector uuids(world_size_ * kUuidLength, 0); auto s_uuid = xgboost::common::Span{uuids.data(), uuids.size()}; @@ -43,7 +46,9 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy nccl_unique_id_ = GetUniqueId(); dh::safe_cuda(cudaSetDevice(device_ordinal_)); - dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_)); + auto rc = + GetNCCLResult(stub_, stub_->CommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_)); + CHECK(rc.OK()) << rc.Report(); } NcclDeviceCommunicator::~NcclDeviceCommunicator() { @@ -51,7 +56,8 @@ NcclDeviceCommunicator::~NcclDeviceCommunicator() { return; } if (nccl_comm_) { - dh::safe_nccl(ncclCommDestroy(nccl_comm_)); + auto rc = GetNCCLResult(stub_, stub_->CommDestroy(nccl_comm_)); + CHECK(rc.OK()) << rc.Report(); } if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { LOG(CONSOLE) << "======== NCCL Statistics========"; @@ -137,8 +143,10 @@ void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::si auto *device_buffer = buffer.data().get(); // First gather data from all the workers. - dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type), + auto rc = GetNCCLResult( + stub_, stub_->Allgather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type), nccl_comm_, dh::DefaultStream())); + CHECK(rc.OK()) << rc.Report(); if (needs_sync_) { dh::DefaultStream().Sync(); } @@ -170,9 +178,10 @@ void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t co if (IsBitwiseOp(op)) { BitwiseAllReduce(send_receive_buffer, count, data_type, op); } else { - dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count, - GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_, - dh::DefaultStream())); + auto rc = GetNCCLResult(stub_, stub_->Allreduce(send_receive_buffer, send_receive_buffer, count, + GetNcclDataType(data_type), GetNcclRedOp(op), + nccl_comm_, dh::DefaultStream())); + CHECK(rc.OK()) << rc.Report(); } allreduce_bytes_ += count * GetTypeSize(data_type); allreduce_calls_ += 1; @@ -185,8 +194,9 @@ void NcclDeviceCommunicator::AllGather(void const *send_buffer, void *receive_bu } dh::safe_cuda(cudaSetDevice(device_ordinal_)); - dh::safe_nccl(ncclAllGather(send_buffer, receive_buffer, send_size, ncclInt8, nccl_comm_, - dh::DefaultStream())); + auto rc = GetNCCLResult(stub_, stub_->Allgather(send_buffer, receive_buffer, send_size, ncclInt8, + nccl_comm_, dh::DefaultStream())); + CHECK(rc.OK()) << rc.Report(); } void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes, @@ -206,14 +216,19 @@ void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_b receive_buffer->resize(total_bytes); size_t offset = 0; - dh::safe_nccl(ncclGroupStart()); - for (int32_t i = 0; i < world_size_; ++i) { - size_t as_bytes = segments->at(i); - dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes, - ncclChar, i, nccl_comm_, dh::DefaultStream())); - offset += as_bytes; - } - dh::safe_nccl(ncclGroupEnd()); + auto rc = Success() << [&] { return GetNCCLResult(stub_, stub_->GroupStart()); } << [&] { + for (int32_t i = 0; i < world_size_; ++i) { + size_t as_bytes = segments->at(i); + auto rc 
= GetNCCLResult(
+          stub_, stub_->Broadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
+                                  ncclChar, i, nccl_comm_, dh::DefaultStream()));
+      if (!rc.OK()) {
+        return rc;
+      }
+      offset += as_bytes;
+    }
+    return Success();
+  } << [&] { return GetNCCLResult(stub_, stub_->GroupEnd()); };
 }

 void NcclDeviceCommunicator::Synchronize() {
diff --git a/src/collective/nccl_device_communicator.cuh b/src/collective/nccl_device_communicator.cuh
index 084db2046..a194b4ef2 100644
--- a/src/collective/nccl_device_communicator.cuh
+++ b/src/collective/nccl_device_communicator.cuh
@@ -4,8 +4,10 @@
 #pragma once

 #include "../common/device_helpers.cuh"
+#include "comm.cuh"
 #include "communicator.h"
 #include "device_communicator.cuh"
+#include "nccl_stub.h"

 namespace xgboost {
 namespace collective {
@@ -25,7 +27,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
    * needed. The in-memory communicator is used in tests with multiple threads, each thread
    * representing a rank/worker, so the additional synchronization is needed to avoid deadlocks.
    */
-  explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync);
+  explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync, StringView nccl_path);
   ~NcclDeviceCommunicator() override;
   void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
                  Operation op) override;
@@ -64,7 +66,8 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
     static const int kRootRank = 0;
     ncclUniqueId id;
     if (rank_ == kRootRank) {
-      dh::safe_nccl(ncclGetUniqueId(&id));
+      auto rc = GetNCCLResult(stub_, stub_->GetUniqueId(&id));
+      CHECK(rc.OK()) << rc.Report();
     }
     Broadcast(static_cast(&id), sizeof(ncclUniqueId), static_cast(kRootRank));
     return id;
@@ -78,6 +81,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
   int const world_size_;
   int const rank_;
   ncclComm_t nccl_comm_{};
+  std::shared_ptr stub_;
   ncclUniqueId nccl_unique_id_{};
   size_t allreduce_bytes_{0};  // Keep statistics of the number of bytes communicated.
   size_t allreduce_calls_{0};  // Keep statistics of the number of reduce calls.
diff --git a/src/collective/nccl_stub.cc b/src/collective/nccl_stub.cc
new file mode 100644
index 000000000..f4705a46e
--- /dev/null
+++ b/src/collective/nccl_stub.cc
@@ -0,0 +1,109 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#if defined(XGBOOST_USE_NCCL)
+#include "nccl_stub.h"
+
+#include   // for CUDA_VERSION
+#include   // for dlclose, dlsym, dlopen
+#include
+
+#include   // for int32_t
+#include   // for stringstream
+#include   // for string
+#include   // for move
+
+#include "xgboost/logging.h"
+
+namespace xgboost::collective {
+NcclStub::NcclStub(StringView path) : path_{std::move(path)} {
+#if defined(XGBOOST_USE_DLOPEN_NCCL)
+  CHECK(!path_.empty()) << "Empty path for NCCL.";
+
+  auto cu_major = (CUDA_VERSION) / 1000;
+  std::stringstream ss;
+  ss << R"m(
+
+If XGBoost is installed from PyPI with pip, the error can be fixed by:
+
+- Run `pip install nvidia-nccl-cu)m"
+     << cu_major << "` (Or with any CUDA version that's compatible with " << cu_major << ").";
+  ss << R"m(
+
+Otherwise, please refer to:
+
+  https://xgboost.readthedocs.io/en/stable/tutorials/dask.html#troubleshooting
+
+for more info, or open an issue on GitHub. Starting from XGBoost 2.1.0, the PyPI package
+no longer bundles NCCL in the binary wheel.
+
+)m";
+  auto help = ss.str();
+  std::string msg{"Failed to load NCCL from path: `" + path_ + "`.
Error:\n "}; + + auto safe_load = [&](auto t, StringView name) { + std::stringstream errs; + auto ptr = reinterpret_cast(dlsym(handle_, name.c_str())); + if (!ptr) { + errs << "Failed to load NCCL symbol `" << name << "` from " << path_ << ". Error:\n " + << dlerror() << help; + LOG(FATAL) << errs.str(); + } + return ptr; + }; + + handle_ = dlopen(path_.c_str(), RTLD_LAZY); + if (!handle_) { + LOG(FATAL) << msg << dlerror() << help; + } + + allreduce_ = safe_load(allreduce_, "ncclAllReduce"); + broadcast_ = safe_load(broadcast_, "ncclBroadcast"); + allgather_ = safe_load(allgather_, "ncclAllGather"); + comm_init_rank_ = safe_load(comm_init_rank_, "ncclCommInitRank"); + comm_destroy_ = safe_load(comm_destroy_, "ncclCommDestroy"); + get_uniqueid_ = safe_load(get_uniqueid_, "ncclGetUniqueId"); + send_ = safe_load(send_, "ncclSend"); + recv_ = safe_load(recv_, "ncclRecv"); + group_start_ = safe_load(group_start_, "ncclGroupStart"); + group_end_ = safe_load(group_end_, "ncclGroupEnd"); + get_error_string_ = safe_load(get_error_string_, "ncclGetErrorString"); + get_version_ = safe_load(get_version_, "ncclGetVersion"); + + std::int32_t v; + CHECK_EQ(get_version_(&v), ncclSuccess); + auto patch = v % 100; + auto minor = (v / 100) % 100; + auto major = v / 10000; + + LOG(INFO) << "Loaded shared NCCL " << major << "." << minor << "." << patch << ":`" << path_ + << "`" << std::endl; +#else + allreduce_ = ncclAllReduce; + broadcast_ = ncclBroadcast; + allgather_ = ncclAllGather; + comm_init_rank_ = ncclCommInitRank; + comm_destroy_ = ncclCommDestroy; + get_uniqueid_ = ncclGetUniqueId; + send_ = ncclSend; + recv_ = ncclRecv; + group_start_ = ncclGroupStart; + group_end_ = ncclGroupEnd; + get_error_string_ = ncclGetErrorString; + get_version_ = ncclGetVersion; +#endif +}; + +NcclStub::~NcclStub() { // NOLINT +#if defined(XGBOOST_USE_DLOPEN_NCCL) + if (handle_) { + auto rc = dlclose(handle_); + if (rc != 0) { + LOG(WARNING) << "Failed to close NCCL handle:" << dlerror(); + } + } + handle_ = nullptr; +#endif // defined(XGBOOST_USE_DLOPEN_NCCL) +} +} // namespace xgboost::collective +#endif // defined(XGBOOST_USE_NCCL) diff --git a/src/collective/nccl_stub.h b/src/collective/nccl_stub.h new file mode 100644 index 000000000..a003a6f22 --- /dev/null +++ b/src/collective/nccl_stub.h @@ -0,0 +1,94 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#pragma once +#if defined(XGBOOST_USE_NCCL) +#include +#include + +#include // for string + +#include "xgboost/string_view.h" // for StringView + +namespace xgboost::collective { +class NcclStub { +#if defined(XGBOOST_USE_DLOPEN_NCCL) + void* handle_{nullptr}; +#endif // defined(XGBOOST_USE_DLOPEN_NCCL) + std::string path_; + + decltype(ncclAllReduce)* allreduce_{nullptr}; + decltype(ncclBroadcast)* broadcast_{nullptr}; + decltype(ncclAllGather)* allgather_{nullptr}; + decltype(ncclCommInitRank)* comm_init_rank_{nullptr}; + decltype(ncclCommDestroy)* comm_destroy_{nullptr}; + decltype(ncclGetUniqueId)* get_uniqueid_{nullptr}; + decltype(ncclSend)* send_{nullptr}; + decltype(ncclRecv)* recv_{nullptr}; + decltype(ncclGroupStart)* group_start_{nullptr}; + decltype(ncclGroupEnd)* group_end_{nullptr}; + decltype(ncclGetErrorString)* get_error_string_{nullptr}; + decltype(ncclGetVersion)* get_version_{nullptr}; + + public: + explicit NcclStub(StringView path); + ~NcclStub(); + + [[nodiscard]] ncclResult_t Allreduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) const { + 
CHECK(allreduce_); + return this->allreduce_(sendbuff, recvbuff, count, datatype, op, comm, stream); + } + [[nodiscard]] ncclResult_t Broadcast(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, + cudaStream_t stream) const { + CHECK(broadcast_); + return this->broadcast_(sendbuff, recvbuff, count, datatype, root, comm, stream); + } + [[nodiscard]] ncclResult_t Allgather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, + cudaStream_t stream) const { + CHECK(allgather_); + return this->allgather_(sendbuff, recvbuff, sendcount, datatype, comm, stream); + } + [[nodiscard]] ncclResult_t CommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, + int rank) const { + CHECK(comm_init_rank_); + return this->comm_init_rank_(comm, nranks, commId, rank); + } + [[nodiscard]] ncclResult_t CommDestroy(ncclComm_t comm) const { + CHECK(comm_destroy_); + return this->comm_destroy_(comm); + } + + [[nodiscard]] ncclResult_t GetUniqueId(ncclUniqueId* uniqueId) const { + CHECK(get_uniqueid_); + return this->get_uniqueid_(uniqueId); + } + [[nodiscard]] ncclResult_t Send(const void* sendbuff, size_t count, ncclDataType_t datatype, + int peer, ncclComm_t comm, cudaStream_t stream) { + CHECK(send_); + return send_(sendbuff, count, datatype, peer, comm, stream); + } + [[nodiscard]] ncclResult_t Recv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream) const { + CHECK(recv_); + return recv_(recvbuff, count, datatype, peer, comm, stream); + } + [[nodiscard]] ncclResult_t GroupStart() const { + CHECK(group_start_); + return group_start_(); + } + [[nodiscard]] ncclResult_t GroupEnd() const { + CHECK(group_end_); + return group_end_(); + } + + [[nodiscard]] const char* GetErrorString(ncclResult_t result) const { + return get_error_string_(result); + } +}; +} // namespace xgboost::collective + +#endif // defined(XGBOOST_USE_NCCL) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 066f8a3e6..89ec42f2b 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -115,30 +115,6 @@ XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) { // NOLINT } namespace dh { -#ifdef XGBOOST_USE_NCCL -#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__) - -inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) { - if (code != ncclSuccess) { - std::stringstream ss; - ss << "NCCL failure: " << ncclGetErrorString(code) << "."; - ss << " " << file << "(" << line << ")\n"; - if (code == ncclUnhandledCudaError) { - // nccl usually preserves the last error so we can get more details. - auto err = cudaPeekAtLastError(); - ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n"; - } else if (code == ncclSystemError) { - ss << " This might be caused by a network configuration issue. 
Please consider specifying " - "the network interface for NCCL via environment variables listed in its reference: " - "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n"; - } - LOG(FATAL) << ss.str(); - } - - return code; -} -#endif - inline int32_t CudaGetPointerDevice(void const *ptr) { int32_t device = -1; cudaPointerAttributes attr; diff --git a/tests/buildkite/build-cuda-with-rmm.sh b/tests/buildkite/build-cuda-with-rmm.sh index 46bc98028..615608249 100755 --- a/tests/buildkite/build-cuda-with-rmm.sh +++ b/tests/buildkite/build-cuda-with-rmm.sh @@ -21,11 +21,18 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/prune_libnccl.sh -$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm" \ - -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \ - -DPLUGIN_RMM=ON -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \ - -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag} +$command_wrapper tests/ci_build/build_via_cmake.sh \ + -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DPLUGIN_RMM=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + ${arch_flag} echo "--- Build binary wheel" $command_wrapper bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/" diff --git a/tests/buildkite/build-cuda.sh b/tests/buildkite/build-cuda.sh index 1926754b8..7bd3492a2 100755 --- a/tests/buildkite/build-cuda.sh +++ b/tests/buildkite/build-cuda.sh @@ -21,11 +21,17 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/prune_libnccl.sh -$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc" \ - -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \ - -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \ - -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag} +$command_wrapper tests/ci_build/build_via_cmake.sh \ + -DCMAKE_PREFIX_PATH="/opt/grpc" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + ${arch_flag} echo "--- Build binary wheel" $command_wrapper bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/"
diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh
index 58d250308..36f54cd3d 100755
--- a/tests/buildkite/test-cpp-gpu.sh
+++ b/tests/buildkite/test-cpp-gpu.sh
@@ -10,6 +10,7 @@ chmod +x build/testxgboost
 tests/ci_build/ci_build.sh gpu nvidia-docker \
   --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
   --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
+  --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \
   build/testxgboost

 echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled"
diff --git a/tests/buildkite/test-cpp-mgpu.sh b/tests/buildkite/test-cpp-mgpu.sh
index 935a301a6..2aac47407 100755
--- a/tests/buildkite/test-cpp-mgpu.sh
+++ b/tests/buildkite/test-cpp-mgpu.sh
@@ -13,4 +13,5 @@ chmod +x build/testxgboost
 tests/ci_build/ci_build.sh gpu nvidia-docker \
   --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
   --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
+  --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \
   build/testxgboost --gtest_filter=*MGPU*
diff --git a/tests/buildkite/test-python-gpu.sh b/tests/buildkite/test-python-gpu.sh
index a575878d3..c2376c021 100755
--- a/tests/buildkite/test-python-gpu.sh
+++ b/tests/buildkite/test-python-gpu.sh
@@ -24,7 +24,8 @@ export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'

 command_wrapper="tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg "`
                 `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
-                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
+                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "`
+                `"NCCL_VERSION_ARG=$NCCL_VERSION"

 # Run specified test suite
 case "$suite" in
diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu
index 0822767c5..0a5adb6ea 100644
--- a/tests/ci_build/Dockerfile.gpu
+++ b/tests/ci_build/Dockerfile.gpu
@@ -2,6 +2,7 @@ ARG CUDA_VERSION_ARG
 FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04
 ARG CUDA_VERSION_ARG
 ARG RAPIDS_VERSION_ARG
+ARG NCCL_VERSION_ARG

 # Environment
 ENV DEBIAN_FRONTEND noninteractive
@@ -23,7 +24,9 @@ RUN \
     conda install -c conda-forge mamba && \
     mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
         python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
-        dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
+        nccl>=$(cut -d "-" -f 1 <<< $NCCL_VERSION_ARG) \
+        dask \
+        dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
         numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
         pyspark>=3.4.0 cloudpickle cuda-python && \
     mamba clean --all && \
diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7
index 98a0a7033..16445de2a 100644
--- a/tests/ci_build/Dockerfile.gpu_build_centos7
+++ b/tests/ci_build/Dockerfile.gpu_build_centos7
@@ -27,7 +27,7 @@ RUN \
     wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
     rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
     yum -y update && \
-    yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT} && \
+    yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} && \
     rm -f nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm;

 ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:$PATH
diff --git a/tests/ci_build/prune_libnccl.sh
b/tests/ci_build/prune_libnccl.sh deleted file mode 100755 index c5a0d8123..000000000 --- a/tests/ci_build/prune_libnccl.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash -set -e - -rm -rf tmp_nccl - -mkdir tmp_nccl -pushd tmp_nccl - -set -x - -cat << EOF > test.cu -int main(void) { return 0; } -EOF - -cat << EOF > CMakeLists.txt -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) -project(gencode_extractor CXX C) -cmake_policy(SET CMP0104 NEW) -set(CMAKE_CUDA_HOST_COMPILER \${CMAKE_CXX_COMPILER}) -enable_language(CUDA) -include(../cmake/Utils.cmake) -compute_cmake_cuda_archs("") -add_library(test OBJECT test.cu) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -EOF - -cmake . -GNinja -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -gen_code=$(grep -o -- '--generate-code=\S*' compile_commands.json | paste -sd ' ') - -nvprune ${gen_code} /usr/lib64/libnccl_static.a -o ../libnccl_static.a - -popd -rm -rf tmp_nccl - -set +x diff --git a/tests/ci_build/rename_whl.py b/tests/ci_build/rename_whl.py index 766c88a2f..2da7db8de 100644 --- a/tests/ci_build/rename_whl.py +++ b/tests/ci_build/rename_whl.py @@ -1,22 +1,10 @@ import os import sys -from contextlib import contextmanager - - -@contextmanager -def cd(path): - path = os.path.normpath(path) - cwd = os.getcwd() - os.chdir(path) - print("cd " + path) - try: - yield path - finally: - os.chdir(cwd) +from test_utils import DirectoryExcursion if len(sys.argv) != 4: - print('Usage: {} [wheel to rename] [commit id] [platform tag]'.format(sys.argv[0])) + print("Usage: {} [wheel to rename] [commit id] [platform tag]".format(sys.argv[0])) sys.exit(1) @@ -26,20 +14,26 @@ platform_tag = sys.argv[3] dirname, basename = os.path.dirname(whl_path), os.path.basename(whl_path) -with cd(dirname): - tokens = basename.split('-') +with DirectoryExcursion(dirname): + tokens = basename.split("-") assert len(tokens) == 5 - version = tokens[1].split('+')[0] - keywords = {'pkg_name': tokens[0], - 'version': version, - 'commit_id': commit_id, - 'platform_tag': platform_tag} - new_name = '{pkg_name}-{version}+{commit_id}-py3-none-{platform_tag}.whl'.format(**keywords) - print('Renaming {} to {}...'.format(basename, new_name)) + version = tokens[1].split("+")[0] + keywords = { + "pkg_name": tokens[0], + "version": version, + "commit_id": commit_id, + "platform_tag": platform_tag, + } + new_name = "{pkg_name}-{version}+{commit_id}-py3-none-{platform_tag}.whl".format( + **keywords + ) + print("Renaming {} to {}...".format(basename, new_name)) if os.path.isfile(new_name): os.remove(new_name) os.rename(basename, new_name) filesize = os.path.getsize(new_name) / 1024 / 1024 # MB + print(f"Wheel size: {filesize}") + msg = f"Limit of wheel size set by PyPI is exceeded. 
{new_name}: {filesize}" assert filesize <= 300, msg diff --git a/tests/cpp/collective/test_allgather.cu b/tests/cpp/collective/test_allgather.cu index 48f7c2615..236108198 100644 --- a/tests/cpp/collective/test_allgather.cu +++ b/tests/cpp/collective/test_allgather.cu @@ -90,10 +90,10 @@ class Worker : public NCCLWorkerForTest { } }; -class AllgatherTestGPU : public SocketTest {}; +class MGPUAllgatherTest : public SocketTest {}; } // namespace -TEST_F(AllgatherTestGPU, MGPUTestVRing) { +TEST_F(MGPUAllgatherTest, MGPUTestVRing) { auto n_workers = common::AllVisibleGPUs(); TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { @@ -104,7 +104,7 @@ TEST_F(AllgatherTestGPU, MGPUTestVRing) { }); } -TEST_F(AllgatherTestGPU, MGPUTestVBcast) { +TEST_F(MGPUAllgatherTest, MGPUTestVBcast) { auto n_workers = common::AllVisibleGPUs(); TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { diff --git a/tests/cpp/collective/test_allreduce.cu b/tests/cpp/collective/test_allreduce.cu index af9a4e58f..04ec9f773 100644 --- a/tests/cpp/collective/test_allreduce.cu +++ b/tests/cpp/collective/test_allreduce.cu @@ -5,17 +5,15 @@ #include #include // for host_vector -#include "../../../src/collective/coll.h" // for Coll #include "../../../src/common/common.h" #include "../../../src/common/device_helpers.cuh" // for ToSpan, device_vector #include "../../../src/common/type.h" // for EraseType -#include "../helpers.h" // for MakeCUDACtx #include "test_worker.cuh" // for NCCLWorkerForTest #include "test_worker.h" // for WorkerForTest, TestDistributed namespace xgboost::collective { namespace { -class AllreduceTestGPU : public SocketTest {}; +class MGPUAllreduceTest : public SocketTest {}; class Worker : public NCCLWorkerForTest { public: @@ -47,7 +45,7 @@ class Worker : public NCCLWorkerForTest { }; } // namespace -TEST_F(AllreduceTestGPU, BitOr) { +TEST_F(MGPUAllreduceTest, BitOr) { auto n_workers = common::AllVisibleGPUs(); TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { @@ -57,7 +55,7 @@ TEST_F(AllreduceTestGPU, BitOr) { }); } -TEST_F(AllreduceTestGPU, Sum) { +TEST_F(MGPUAllreduceTest, Sum) { auto n_workers = common::AllVisibleGPUs(); TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu index a09696c19..3d7b1efc8 100644 --- a/tests/cpp/collective/test_nccl_device_communicator.cu +++ b/tests/cpp/collective/test_nccl_device_communicator.cu @@ -8,6 +8,7 @@ #include #include // for string +#include "../../../src/collective/comm.cuh" #include "../../../src/collective/communicator-inl.cuh" #include "../../../src/collective/nccl_device_communicator.cuh" #include "../helpers.h" @@ -16,17 +17,15 @@ namespace xgboost { namespace collective { TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidDeviceOrdinal) { - auto construct = []() { NcclDeviceCommunicator comm{-1, false}; }; + auto construct = []() { NcclDeviceCommunicator comm{-1, false, DefaultNcclName()}; }; EXPECT_THROW(construct(), dmlc::Error); } TEST(NcclDeviceCommunicatorSimpleTest, SystemError) { - try { - dh::safe_nccl(ncclSystemError); - } catch (dmlc::Error const& e) { - auto str = std::string{e.what()}; - ASSERT_TRUE(str.find("environment variables") != std::string::npos); - } + auto stub = 
std::make_shared(DefaultNcclName()); + auto rc = GetNCCLResult(stub, ncclSystemError); + auto msg = rc.Report(); + ASSERT_TRUE(msg.find("environment variables") != std::string::npos); } namespace { diff --git a/tests/cpp/collective/test_worker.h b/tests/cpp/collective/test_worker.h index 490cdf13c..acee0f297 100644 --- a/tests/cpp/collective/test_worker.h +++ b/tests/cpp/collective/test_worker.h @@ -33,7 +33,7 @@ class WorkerForTest { tracker_port_{port}, world_size_{world}, task_id_{"t:" + std::to_string(rank)}, - comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_} { + comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_, DefaultNcclName()} { CHECK_EQ(world_size_, comm_.World()); } virtual ~WorkerForTest() = default; diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py index 893582ee1..883dbbaf2 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py @@ -12,6 +12,7 @@ from hypothesis._settings import duration import xgboost as xgb from xgboost import testing as tm +from xgboost.collective import CommunicatorContext from xgboost.testing.params import hist_parameter_strategy pytestmark = [ @@ -572,6 +573,65 @@ def test_with_asyncio(local_cuda_client: Client) -> None: assert isinstance(output["history"], dict) +def test_invalid_nccl(local_cuda_client: Client) -> None: + client = local_cuda_client + workers = tm.get_client_workers(client) + args = client.sync( + dxgb._get_rabit_args, len(workers), dxgb._get_dask_config(), client + ) + + def run(wid: int) -> None: + ctx = CommunicatorContext(dmlc_nccl_path="foo", **args) + X, y, w = tm.make_regression(n_samples=10, n_features=10, use_cupy=True) + + with ctx: + with pytest.raises(ValueError, match=r"pip install"): + xgb.QuantileDMatrix(X, y, weight=w) + + futures = client.map(run, range(len(workers)), workers=workers) + client.gather(futures) + + +@pytest.mark.parametrize("tree_method", ["hist", "approx"]) +def test_nccl_load(local_cuda_client: Client, tree_method: str) -> None: + X, y, w = tm.make_regression(128, 16, use_cupy=True) + + def make_model() -> None: + xgb.XGBRegressor( + device="cuda", + tree_method=tree_method, + objective="reg:quantileerror", + verbosity=2, + quantile_alpha=[0.2, 0.8], + ).fit(X, y, sample_weight=w) + + # no nccl load when using single-node. 
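+    # A single-process fit never initializes the collective, so the NCCL stub
+    # should not be loaded and the captured logs must not mention it.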
+ with tm.captured_output() as (out, err): + make_model() + assert out.getvalue().find("NCCL") == -1 + assert err.getvalue().find("NCCL") == -1 + + client = local_cuda_client + workers = tm.get_client_workers(client) + args = client.sync( + dxgb._get_rabit_args, len(workers), dxgb._get_dask_config(), client + ) + + # nccl is loaded + def run(wid: int) -> None: + # FIXME(jiamingy): https://github.com/dmlc/xgboost/issues/9147 + from xgboost.core import _LIB, _register_log_callback + _register_log_callback(_LIB) + + with CommunicatorContext(**args): + with tm.captured_output() as (out, err): + make_model() + assert out.getvalue().find("Loaded shared NCCL") != -1, out.getvalue() + + futures = client.map(run, range(len(workers)), workers=workers) + client.gather(futures) + + async def run_from_dask_array_asyncio(scheduler_address: str) -> dxgb.TrainReturnT: async with Client(scheduler_address, asynchronous=True) as client: import cupy as cp From 1877cb8e832af362fb810d947434f2fb70b3e7ad Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 22 Nov 2023 21:17:48 +0800 Subject: [PATCH 15/32] Change default metric for gamma regression to deviance. (#9757) * Change default metric for gamma regression to deviance. - Cleanup the gamma implementation. - Use deviance instead since the objective is derived from deviance. --- src/objective/regression_loss.h | 22 +++++++-- src/objective/regression_obj.cu | 85 ++------------------------------- 2 files changed, 21 insertions(+), 86 deletions(-) diff --git a/src/objective/regression_loss.h b/src/objective/regression_loss.h index 1ef7106cf..d2710d35a 100644 --- a/src/objective/regression_loss.h +++ b/src/objective/regression_loss.h @@ -13,9 +13,7 @@ #include "xgboost/logging.h" #include "xgboost/task.h" // ObjInfo -namespace xgboost { -namespace obj { -// common regressions +namespace xgboost::obj { // linear regression struct LinearSquareLoss { XGBOOST_DEVICE static bst_float PredTransform(bst_float x) { return x; } @@ -106,7 +104,21 @@ struct LogisticRaw : public LogisticRegression { static ObjInfo Info() { return ObjInfo::kRegression; } }; -} // namespace obj -} // namespace xgboost +// gamma deviance loss. 
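+//
+// For label y and prediction p = exp(margin), half of the gamma deviance is
+//   l(y, p) = log(p / y) + y / p - 1.
+// Differentiating with respect to the margin gives the gradient 1 - y / p and
+// the hessian y / p computed below.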
+class GammaDeviance { + public: + XGBOOST_DEVICE static float PredTransform(float x) { return std::exp(x); } + XGBOOST_DEVICE static float ProbToMargin(float x) { return std::log(x); } + XGBOOST_DEVICE static float FirstOrderGradient(float p, float y) { + return 1.0f - y / p; + } + XGBOOST_DEVICE static float SecondOrderGradient(float p, float y) { return y / p; } + static ObjInfo Info() { return ObjInfo::kRegression; } + static const char* Name() { return "reg:gamma"; } + static const char* DefaultEvalMetric() { return "gamma-deviance"; } + XGBOOST_DEVICE static bool CheckLabel(float x) { return x > 0.0f; } + static const char* LabelErrorMsg() { return "label must be positive for gamma regression."; } +}; +} // namespace xgboost::obj #endif // XGBOOST_OBJECTIVE_REGRESSION_LOSS_H_ diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu index 7f498c5f1..f74d01acc 100644 --- a/src/objective/regression_obj.cu +++ b/src/objective/regression_obj.cu @@ -221,6 +221,10 @@ XGBOOST_REGISTER_OBJECTIVE(LogisticRaw, LogisticRaw::Name()) "before logistic transformation.") .set_body([]() { return new RegLossObj(); }); +XGBOOST_REGISTER_OBJECTIVE(GammaRegression, GammaDeviance::Name()) + .describe("Gamma regression using the gamma deviance loss with log link.") + .set_body([]() { return new RegLossObj(); }); + // Deprecated functions XGBOOST_REGISTER_OBJECTIVE(LinearRegression, "reg:linear") .describe("Regression with squared error.") @@ -501,87 +505,6 @@ XGBOOST_REGISTER_OBJECTIVE(CoxRegression, "survival:cox") .describe("Cox regression for censored survival data (negative labels are considered censored).") .set_body([]() { return new CoxRegression(); }); -// gamma regression -class GammaRegression : public FitIntercept { - public: - void Configure(Args const&) override {} - [[nodiscard]] ObjInfo Task() const override { return ObjInfo::kRegression; } - - void GetGradient(const HostDeviceVector& preds, const MetaInfo& info, std::int32_t, - linalg::Matrix* out_gpair) override { - CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty"; - CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided"; - const size_t ndata = preds.Size(); - auto device = ctx_->Device(); - out_gpair->SetDevice(ctx_->Device()); - out_gpair->Reshape(info.num_row_, this->Targets(info)); - label_correct_.Resize(1); - label_correct_.Fill(1); - - const bool is_null_weight = info.weights_.Size() == 0; - if (!is_null_weight) { - CHECK_EQ(info.weights_.Size(), ndata) - << "Number of weights should be equal to number of data points."; - } - common::Transform<>::Init( - [=] XGBOOST_DEVICE(size_t _idx, - common::Span _label_correct, - common::Span _out_gpair, - common::Span _preds, - common::Span _labels, - common::Span _weights) { - bst_float p = _preds[_idx]; - bst_float w = is_null_weight ? 
1.0f : _weights[_idx]; - bst_float y = _labels[_idx]; - if (y <= 0.0f) { - _label_correct[0] = 0; - } - _out_gpair[_idx] = GradientPair((1 - y / expf(p)) * w, y / expf(p) * w); - }, - common::Range{0, static_cast(ndata)}, this->ctx_->Threads(), device).Eval( - &label_correct_, out_gpair->Data(), &preds, info.labels.Data(), &info.weights_); - - // copy "label correct" flags back to host - std::vector& label_correct_h = label_correct_.HostVector(); - for (auto const flag : label_correct_h) { - if (flag == 0) { - LOG(FATAL) << "GammaRegression: label must be positive."; - } - } - } - void PredTransform(HostDeviceVector *io_preds) const override { - common::Transform<>::Init( - [] XGBOOST_DEVICE(size_t _idx, common::Span _preds) { - _preds[_idx] = expf(_preds[_idx]); - }, - common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->Device()) - .Eval(io_preds); - } - void EvalTransform(HostDeviceVector *io_preds) override { - PredTransform(io_preds); - } - [[nodiscard]] float ProbToMargin(bst_float base_score) const override { - return std::log(base_score); - } - [[nodiscard]] const char* DefaultEvalMetric() const override { - return "gamma-nloglik"; - } - void SaveConfig(Json* p_out) const override { - auto& out = *p_out; - out["name"] = String("reg:gamma"); - } - void LoadConfig(Json const&) override {} - - private: - HostDeviceVector label_correct_; -}; - -// register the objective functions -XGBOOST_REGISTER_OBJECTIVE(GammaRegression, "reg:gamma") -.describe("Gamma regression for severity data.") -.set_body([]() { return new GammaRegression(); }); - // declare parameter struct TweedieRegressionParam : public XGBoostParameter { From e9260de3f30708af5992009468432fbfe788fc42 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 23 Nov 2023 00:12:39 +0800 Subject: [PATCH 16/32] [breaking] Remove dense libsvm parser plugin. (#9799) --- .github/workflows/main.yml | 2 +- CMakeLists.txt | 4 +- plugin/CMakeLists.txt | 4 -- plugin/README.md | 2 - plugin/dense_parser/dense_libsvm.cc | 87 ------------------------- python-package/packager/build_config.py | 12 ++-- tests/buildkite/build-cpu.sh | 2 +- 7 files changed, 9 insertions(+), 104 deletions(-) delete mode 100644 plugin/dense_parser/dense_libsvm.cc diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 67e77ad6e..8f1252806 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -29,7 +29,7 @@ jobs: run: | mkdir build cd build - cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_DENSE_PARSER=ON -GNinja -DBUILD_DEPRECATED_CLI=ON + cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON ninja -v - name: Run gtest binary run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index bf8f0cf62..a9c6f7410 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,7 +99,6 @@ set(ENABLED_SANITIZERS "address" "leak" CACHE STRING "Semicolon separated list of sanitizer names. E.g 'address;leak'. Supported sanitizers are address, leak, undefined and thread.") ## Plugins -option(PLUGIN_DENSE_PARSER "Build dense parser plugin" OFF) option(PLUGIN_RMM "Build with RAPIDS Memory Manager (RMM)" OFF) option(PLUGIN_FEDERATED "Build with Federated Learning" OFF) ## TODO: 1. 
Add check if DPC++ compiler is used for building @@ -185,6 +184,9 @@ endif() if(USE_HDFS) message(SEND_ERROR "The option `USE_HDFS` has been removed from XGBoost") endif() +if(PLUGIN_DENSE_PARSER) + message(SEND_ERROR "The option `PLUGIN_DENSE_PARSER` has been removed from XGBoost.") +endif() #-- Sanitizer if(USE_SANITIZER) diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt index 6089ae486..58b31053f 100644 --- a/plugin/CMakeLists.txt +++ b/plugin/CMakeLists.txt @@ -1,7 +1,3 @@ -if(PLUGIN_DENSE_PARSER) - target_sources(objxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/dense_parser/dense_libsvm.cc) -endif() - if(PLUGIN_UPDATER_ONEAPI) add_library(oneapi_plugin OBJECT ${xgboost_SOURCE_DIR}/plugin/updater_oneapi/regression_obj_oneapi.cc diff --git a/plugin/README.md b/plugin/README.md index 6e115c465..008f4ad49 100644 --- a/plugin/README.md +++ b/plugin/README.md @@ -36,5 +36,3 @@ The register macros available to plugin writers are: And from dmlc-core: - DMLC_REGISTER_PARAMETER - Register a set of parameter for a specific usecase - - DMLC_REGISTER_DATA_PARSER - Register a data parser where the data can be - represented by a URL. This is used by DMatrix. diff --git a/plugin/dense_parser/dense_libsvm.cc b/plugin/dense_parser/dense_libsvm.cc deleted file mode 100644 index 0dd2d0419..000000000 --- a/plugin/dense_parser/dense_libsvm.cc +++ /dev/null @@ -1,87 +0,0 @@ -/*! - * Copyright 2015 by Contributors - * \file dense_libsvm.cc - * \brief Plugin to load in libsvm, but fill all the missing entries with zeros. - * This plugin is mainly used for benchmark purposes and do not need to be included. - */ -#include -#include -#include - -namespace dmlc { -namespace data { - -template -class DensifyParser : public dmlc::Parser { - public: - DensifyParser(dmlc::Parser* parser, uint32_t num_col) - : parser_(parser), num_col_(num_col) { - } - - void BeforeFirst() override { - parser_->BeforeFirst(); - } - - bool Next() override { - if (!parser_->Next()) return false; - const RowBlock& batch = parser_->Value(); - LOG(INFO) << batch.size; - dense_index_.resize(num_col_ * batch.size); - dense_value_.resize(num_col_ * batch.size); - std::fill(dense_value_.begin(), dense_value_.end(), 0.0); - offset_.resize(batch.size + 1); - offset_[0] = 0; - - for (size_t i = 0; i < batch.size; ++i) { - offset_[i + 1] = (i + 1) * num_col_; - Row row = batch[i]; - for (uint32_t j = 0; j < num_col_; ++j) { - dense_index_[i * num_col_ + j] = j; - } - for (unsigned k = 0; k < row.length; ++k) { - uint32_t index = row.get_index(k); - CHECK_LT(index, num_col_) - << "Featuere index larger than num_col"; - dense_value_[i * num_col_ + index] = row.get_value(k); - } - } - out_ = batch; - out_.index = dmlc::BeginPtr(dense_index_); - out_.value = dmlc::BeginPtr(dense_value_); - out_.offset = dmlc::BeginPtr(offset_); - return true; - } - - const dmlc::RowBlock& Value() const override { - return out_; - } - - size_t BytesRead() const override { - return parser_->BytesRead(); - } - - private: - RowBlock out_; - std::unique_ptr > parser_; - uint32_t num_col_; - std::vector offset_; - std::vector dense_index_; - std::vector dense_value_; -}; - -template -Parser * -CreateDenseLibSVMParser(const std::string& path, - const std::map& args, - unsigned part_index, - unsigned num_parts) { - CHECK_NE(args.count("num_col"), 0) << "expect num_col in dense_libsvm"; - return new DensifyParser( - Parser::Create(path.c_str(), part_index, num_parts, "libsvm"), - uint32_t(atoi(args.at("num_col").c_str()))); -} -} // namespace data - 
-DMLC_REGISTER_DATA_PARSER(uint32_t, real_t, dense_libsvm, - data::CreateDenseLibSVMParser); -} // namespace dmlc diff --git a/python-package/packager/build_config.py b/python-package/packager/build_config.py index d3733d628..933bfdce2 100644 --- a/python-package/packager/build_config.py +++ b/python-package/packager/build_config.py @@ -17,14 +17,10 @@ class BuildConfiguration: # pylint: disable=R0902 use_nccl: bool = False # Whether to load nccl dynamically use_dlopen_nccl: bool = False - # Whether to enable HDFS - use_hdfs: bool = False - # Whether to enable Azure Storage - use_azure: bool = False - # Whether to enable AWS S3 - use_s3: bool = False - # Whether to enable the dense parser plugin - plugin_dense_parser: bool = False + # Whether to enable federated learning + plugin_federated: bool = False + # Whether to enable rmm support + plugin_rmm: bool = False # Special option: See explanation below use_system_libxgboost: bool = False diff --git a/tests/buildkite/build-cpu.sh b/tests/buildkite/build-cpu.sh index 88da7d395..73e88d8aa 100755 --- a/tests/buildkite/build-cpu.sh +++ b/tests/buildkite/build-cpu.sh @@ -15,7 +15,7 @@ $command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h # include/dmlc/build_config_default.h. echo "--- Build libxgboost from the source" $command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc \ - -DPLUGIN_DENSE_PARSER=ON -DPLUGIN_FEDERATED=ON + -DPLUGIN_FEDERATED=ON echo "--- Run Google Test" $command_wrapper bash -c "cd build && ctest --extra-verbose" echo "--- Stash XGBoost CLI executable" From 8fe1a2213c063b633272d217f3b020671f462953 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 25 Nov 2023 09:10:56 +0800 Subject: [PATCH 17/32] Cleanup code for distributed training. (#9805) * Cleanup code for distributed training. - Merge `GetNcclResult` into nccl stub. - Split up utilities from the main dask module. - Let Channel return `Result` to accommodate nccl channel. - Remove old `use_label_encoder` parameter. 
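For reviewers unfamiliar with the chaining style this change leans on, here is a
minimal self-contained sketch of how `Result`-returning steps compose and
short-circuit on the first failure. The real `Result`, `Success`, and `Fail` live
in `xgboost/collective/result.h`; the stand-in below only mimics their observable
behavior and is not the actual implementation:

```c++
#include <string>
#include <utility>

// Toy stand-in for xgboost::collective::Result: empty message means success.
struct Result {
  std::string msg;
  bool OK() const { return msg.empty(); }
};
inline Result Success() { return {}; }
inline Result Fail(std::string m) { return Result{std::move(m)}; }

// Run `next` only when everything before it succeeded; otherwise propagate
// the earlier failure unchanged.
template <typename Fn>
Result operator<<(Result&& prev, Fn&& next) {
  return prev.OK() ? next() : std::move(prev);
}

int main() {
  auto rc = Success() << [] { return Fail("send failed"); }
                      << [] { return Success(); };  // skipped: prior step failed
  return rc.OK() ? 0 : 1;
}
```

This is why `SendAll`/`RecvAll` now return `Result` instead of `void`: each network
step can slot into a chain such as
`Success() << [&] { return ch->SendAll(data); } << [&] { return ch->Block(); }`.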
--- python-package/xgboost/dask/__init__.py | 73 +++++++++-------- python-package/xgboost/dask/utils.py | 24 ++++++ python-package/xgboost/sklearn.py | 1 - python-package/xgboost/spark/core.py | 1 - src/c_api/c_api.cu | 5 +- src/collective/allgather.cc | 51 ++++++------ src/collective/allreduce.cc | 8 +- src/collective/broadcast.cc | 9 ++- src/collective/coll.cu | 29 ++++--- src/collective/comm.cu | 14 ++-- src/collective/comm.cuh | 29 +------ src/collective/comm.h | 14 ++-- src/collective/nccl_device_communicator.cu | 29 +++---- src/collective/nccl_device_communicator.cuh | 2 +- src/collective/nccl_stub.cc | 26 +++++- src/collective/nccl_stub.h | 79 ++++++++----------- src/common/device_helpers.cuh | 4 - tests/cpp/collective/test_comm.cc | 13 +-- .../test_nccl_device_communicator.cu | 2 +- 19 files changed, 221 insertions(+), 192 deletions(-) create mode 100644 python-package/xgboost/dask/utils.py diff --git a/python-package/xgboost/dask/__init__.py b/python-package/xgboost/dask/__init__.py index a58c0f225..068b1e6ea 100644 --- a/python-package/xgboost/dask/__init__.py +++ b/python-package/xgboost/dask/__init__.py @@ -94,6 +94,8 @@ from xgboost.sklearn import ( from xgboost.tracker import RabitTracker, get_host_ip from xgboost.training import train as worker_train +from .utils import get_n_threads + if TYPE_CHECKING: import dask import distributed @@ -908,6 +910,34 @@ async def _check_workers_are_alive( raise RuntimeError(f"Missing required workers: {missing_workers}") +def _get_dmatrices( + train_ref: dict, + train_id: int, + *refs: dict, + evals_id: Sequence[int], + evals_name: Sequence[str], + n_threads: int, +) -> Tuple[DMatrix, List[Tuple[DMatrix, str]]]: + Xy = _dmatrix_from_list_of_parts(**train_ref, nthread=n_threads) + evals: List[Tuple[DMatrix, str]] = [] + for i, ref in enumerate(refs): + if evals_id[i] == train_id: + evals.append((Xy, evals_name[i])) + continue + if ref.get("ref", None) is not None: + if ref["ref"] != train_id: + raise ValueError( + "The training DMatrix should be used as a reference to evaluation" + " `QuantileDMatrix`." 
+ ) + del ref["ref"] + eval_Xy = _dmatrix_from_list_of_parts(**ref, nthread=n_threads, ref=Xy) + else: + eval_Xy = _dmatrix_from_list_of_parts(**ref, nthread=n_threads) + evals.append((eval_Xy, evals_name[i])) + return Xy, evals + + async def _train_async( client: "distributed.Client", global_config: Dict[str, Any], @@ -940,41 +970,20 @@ async def _train_async( ) -> Optional[TrainReturnT]: worker = distributed.get_worker() local_param = parameters.copy() - n_threads = 0 - # dask worker nthreads, "state" is available in 2022.6.1 - dwnt = worker.state.nthreads if hasattr(worker, "state") else worker.nthreads - for p in ["nthread", "n_jobs"]: - if ( - local_param.get(p, None) is not None - and local_param.get(p, dwnt) != dwnt - ): - LOGGER.info("Overriding `nthreads` defined in dask worker.") - n_threads = local_param[p] - break - if n_threads == 0 or n_threads is None: - n_threads = dwnt + n_threads = get_n_threads(local_param, worker) local_param.update({"nthread": n_threads, "n_jobs": n_threads}) + local_history: TrainingCallback.EvalsLog = {} + with CommunicatorContext(**rabit_args), config.config_context(**global_config): - Xy = _dmatrix_from_list_of_parts(**train_ref, nthread=n_threads) - evals: List[Tuple[DMatrix, str]] = [] - for i, ref in enumerate(refs): - if evals_id[i] == train_id: - evals.append((Xy, evals_name[i])) - continue - if ref.get("ref", None) is not None: - if ref["ref"] != train_id: - raise ValueError( - "The training DMatrix should be used as a reference" - " to evaluation `QuantileDMatrix`." - ) - del ref["ref"] - eval_Xy = _dmatrix_from_list_of_parts( - **ref, nthread=n_threads, ref=Xy - ) - else: - eval_Xy = _dmatrix_from_list_of_parts(**ref, nthread=n_threads) - evals.append((eval_Xy, evals_name[i])) + Xy, evals = _get_dmatrices( + train_ref, + train_id, + *refs, + evals_id=evals_id, + evals_name=evals_name, + n_threads=n_threads, + ) booster = worker_train( params=local_param, diff --git a/python-package/xgboost/dask/utils.py b/python-package/xgboost/dask/utils.py new file mode 100644 index 000000000..98e6029b5 --- /dev/null +++ b/python-package/xgboost/dask/utils.py @@ -0,0 +1,24 @@ +"""Utilities for the XGBoost Dask interface.""" +import logging +from typing import TYPE_CHECKING, Any, Dict + +LOGGER = logging.getLogger("[xgboost.dask]") + + +if TYPE_CHECKING: + import distributed + + +def get_n_threads(local_param: Dict[str, Any], worker: "distributed.Worker") -> int: + """Get the number of threads from a worker and the user-supplied parameters.""" + # dask worker nthreads, "state" is available in 2022.6.1 + dwnt = worker.state.nthreads if hasattr(worker, "state") else worker.nthreads + n_threads = None + for p in ["nthread", "n_jobs"]: + if local_param.get(p, None) is not None and local_param.get(p, dwnt) != dwnt: + LOGGER.info("Overriding `nthreads` defined in dask worker.") + n_threads = local_param[p] + break + if n_threads == 0 or n_threads is None: + n_threads = dwnt + return n_threads diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 3906973a8..ea309bd94 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -808,7 +808,6 @@ class XGBModel(XGBModelBase): "kwargs", "missing", "n_estimators", - "use_label_encoder", "enable_categorical", "early_stopping_rounds", "callbacks", diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index aa8c5b998..7ac01ff07 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ 
-138,7 +138,6 @@ _inverse_pyspark_param_alias_map = {v: k for k, v in _pyspark_param_alias_map.it _unsupported_xgb_params = [ "gpu_id", # we have "device" pyspark param instead. "enable_categorical", # Use feature_types param to specify categorical feature instead - "use_label_encoder", "n_jobs", # Do not allow user to set it, will use `spark.task.cpus` value instead. "nthread", # Ditto ] diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu index 4ace8b7cc..47868f466 100644 --- a/src/c_api/c_api.cu +++ b/src/c_api/c_api.cu @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023 by XGBoost Contributors + * Copyright 2019-2023, XGBoost Contributors */ #include // for transform @@ -15,6 +15,9 @@ #include "xgboost/data.h" #include "xgboost/json.h" #include "xgboost/learner.h" +#if defined(XGBOOST_USE_NCCL) +#include +#endif namespace xgboost { void XGBBuildInfoDevice(Json *p_info) { diff --git a/src/collective/allgather.cc b/src/collective/allgather.cc index fa369a9da..148cb6cd2 100644 --- a/src/collective/allgather.cc +++ b/src/collective/allgather.cc @@ -26,18 +26,19 @@ Result RingAllgather(Comm const& comm, common::Span data, std::size } for (std::int32_t r = 0; r < world; ++r) { - auto send_rank = (rank + world - r + worker_off) % world; - auto send_off = send_rank * segment_size; - send_off = std::min(send_off, data.size_bytes()); - auto send_seg = data.subspan(send_off, std::min(segment_size, data.size_bytes() - send_off)); - next_ch->SendAll(send_seg.data(), send_seg.size_bytes()); - - auto recv_rank = (rank + world - r - 1 + worker_off) % world; - auto recv_off = recv_rank * segment_size; - recv_off = std::min(recv_off, data.size_bytes()); - auto recv_seg = data.subspan(recv_off, std::min(segment_size, data.size_bytes() - recv_off)); - prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes()); - auto rc = prev_ch->Block(); + auto rc = Success() << [&] { + auto send_rank = (rank + world - r + worker_off) % world; + auto send_off = send_rank * segment_size; + send_off = std::min(send_off, data.size_bytes()); + auto send_seg = data.subspan(send_off, std::min(segment_size, data.size_bytes() - send_off)); + return next_ch->SendAll(send_seg.data(), send_seg.size_bytes()); + } << [&] { + auto recv_rank = (rank + world - r - 1 + worker_off) % world; + auto recv_off = recv_rank * segment_size; + recv_off = std::min(recv_off, data.size_bytes()); + auto recv_seg = data.subspan(recv_off, std::min(segment_size, data.size_bytes() - recv_off)); + return prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes()); + } << [&] { return prev_ch->Block(); }; if (!rc.OK()) { return rc; } @@ -78,19 +79,19 @@ namespace detail { auto next_ch = comm.Chan(next); for (std::int32_t r = 0; r < world; ++r) { - auto send_rank = (rank + world - r) % world; - auto send_off = offset[send_rank]; - auto send_size = sizes[send_rank]; - auto send_seg = erased_result.subspan(send_off, send_size); - next_ch->SendAll(send_seg); - - auto recv_rank = (rank + world - r - 1) % world; - auto recv_off = offset[recv_rank]; - auto recv_size = sizes[recv_rank]; - auto recv_seg = erased_result.subspan(recv_off, recv_size); - prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes()); - - auto rc = prev_ch->Block(); + auto rc = Success() << [&] { + auto send_rank = (rank + world - r) % world; + auto send_off = offset[send_rank]; + auto send_size = sizes[send_rank]; + auto send_seg = erased_result.subspan(send_off, send_size); + return next_ch->SendAll(send_seg); + } << [&] { + auto recv_rank = (rank + world - r - 1) % world; + auto recv_off = 
offset[recv_rank]; + auto recv_size = sizes[recv_rank]; + auto recv_seg = erased_result.subspan(recv_off, recv_size); + return prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes()); + } << [&] { return prev_ch->Block(); }; if (!rc.OK()) { return rc; } diff --git a/src/collective/allreduce.cc b/src/collective/allreduce.cc index f95a9a9f1..93b76355f 100644 --- a/src/collective/allreduce.cc +++ b/src/collective/allreduce.cc @@ -37,7 +37,10 @@ Result RingScatterReduceTyped(Comm const& comm, common::Span data, auto seg_nbytes = std::min(data.size_bytes() - send_off, n_bytes_in_seg); auto send_seg = data.subspan(send_off, seg_nbytes); - next_ch->SendAll(send_seg); + auto rc = next_ch->SendAll(send_seg); + if (!rc.OK()) { + return rc; + } // receive from ring prev auto recv_off = ((rank + world - r - 1) % world) * n_bytes_in_seg; @@ -47,8 +50,7 @@ Result RingScatterReduceTyped(Comm const& comm, common::Span data, auto recv_seg = data.subspan(recv_off, seg_nbytes); auto seg = s_buf.subspan(0, recv_seg.size()); - prev_ch->RecvAll(seg); - auto rc = comm.Block(); + rc = std::move(rc) << [&] { return prev_ch->RecvAll(seg); } << [&] { return comm.Block(); }; if (!rc.OK()) { return rc; } diff --git a/src/collective/broadcast.cc b/src/collective/broadcast.cc index 660bb9130..e1ef60f86 100644 --- a/src/collective/broadcast.cc +++ b/src/collective/broadcast.cc @@ -62,8 +62,8 @@ Result Broadcast(Comm const& comm, common::Span data, std::int32_t if (shifted_rank != 0) { // not root auto parent = ShiftRight(ShiftedParentRank(shifted_rank, depth), world, root); - comm.Chan(parent)->RecvAll(data); - auto rc = comm.Chan(parent)->Block(); + auto rc = Success() << [&] { return comm.Chan(parent)->RecvAll(data); } + << [&] { return comm.Chan(parent)->Block(); }; if (!rc.OK()) { return Fail("broadcast failed.", std::move(rc)); } @@ -75,7 +75,10 @@ Result Broadcast(Comm const& comm, common::Span data, std::int32_t auto sft_peer = shifted_rank + (1 << i); auto peer = ShiftRight(sft_peer, world, root); CHECK_NE(peer, root); - comm.Chan(peer)->SendAll(data); + auto rc = comm.Chan(peer)->SendAll(data); + if (!rc.OK()) { + return rc; + } } } diff --git a/src/collective/coll.cu b/src/collective/coll.cu index 60072b6a5..d1b66a8ce 100644 --- a/src/collective/coll.cu +++ b/src/collective/coll.cu @@ -79,8 +79,8 @@ void RunBitwiseAllreduce(dh::CUDAStreamView stream, common::Span ou // First gather data from all the workers. 
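 // (NCCL offers no bitwise reduction op, so the buffers are gathered from all
 // workers first and the bitwise operator is then applied locally.)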
CHECK(handle); - auto rc = GetNCCLResult(stub, stub->Allgather(data.data(), device_buffer, data.size(), ncclInt8, - handle, pcomm->Stream())); + auto rc = + stub->Allgather(data.data(), device_buffer, data.size(), ncclInt8, handle, pcomm->Stream()); if (!rc.OK()) { return rc; } @@ -140,9 +140,8 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) { return DispatchDType(type, [=](auto t) { using T = decltype(t); auto rdata = common::RestoreType(data); - auto rc = stub->Allreduce(data.data(), data.data(), rdata.size(), GetNCCLType(type), - GetNCCLRedOp(op), nccl->Handle(), nccl->Stream()); - return GetNCCLResult(stub, rc); + return stub->Allreduce(data.data(), data.data(), rdata.size(), GetNCCLType(type), + GetNCCLRedOp(op), nccl->Handle(), nccl->Stream()); }); } } << [&] { return nccl->Block(); }; @@ -158,8 +157,8 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) { auto stub = nccl->Stub(); return Success() << [&] { - return GetNCCLResult(stub, stub->Broadcast(data.data(), data.data(), data.size_bytes(), - ncclInt8, root, nccl->Handle(), nccl->Stream())); + return stub->Broadcast(data.data(), data.data(), data.size_bytes(), ncclInt8, root, + nccl->Handle(), nccl->Stream()); } << [&] { return nccl->Block(); }; } @@ -174,8 +173,8 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) { auto send = data.subspan(comm.Rank() * size, size); return Success() << [&] { - return GetNCCLResult(stub, stub->Allgather(send.data(), data.data(), size, ncclInt8, - nccl->Handle(), nccl->Stream())); + return stub->Allgather(send.data(), data.data(), size, ncclInt8, nccl->Handle(), + nccl->Stream()); } << [&] { return nccl->Block(); }; } @@ -188,19 +187,19 @@ namespace cuda_impl { Result BroadcastAllgatherV(NCCLComm const* comm, common::Span data, common::Span sizes, common::Span recv) { auto stub = comm->Stub(); - return Success() << [&stub] { return GetNCCLResult(stub, stub->GroupStart()); } << [&] { + return Success() << [&stub] { return stub->GroupStart(); } << [&] { std::size_t offset = 0; for (std::int32_t r = 0; r < comm->World(); ++r) { auto as_bytes = sizes[r]; auto rc = stub->Broadcast(data.data(), recv.subspan(offset, as_bytes).data(), as_bytes, ncclInt8, r, comm->Handle(), dh::DefaultStream()); - if (rc != ncclSuccess) { - return GetNCCLResult(stub, rc); + if (!rc.OK()) { + return rc; } offset += as_bytes; } return Success(); - } << [&] { return GetNCCLResult(stub, stub->GroupEnd()); }; + } << [&] { return stub->GroupEnd(); }; } } // namespace cuda_impl @@ -217,7 +216,7 @@ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span switch (algo) { case AllgatherVAlgo::kRing: { - return Success() << [&] { return GetNCCLResult(stub, stub->GroupStart()); } << [&] { + return Success() << [&] { return stub->GroupStart(); } << [&] { // get worker offset detail::AllgatherVOffset(sizes, recv_segments); // copy data @@ -228,7 +227,7 @@ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span } return detail::RingAllgatherV(comm, sizes, recv_segments, recv); } << [&] { - return GetNCCLResult(stub, stub->GroupEnd()); + return stub->GroupEnd(); } << [&] { return nccl->Block(); }; } case AllgatherVAlgo::kBcast: { diff --git a/src/collective/comm.cu b/src/collective/comm.cu index cc67def0a..56681253c 100644 --- a/src/collective/comm.cu +++ b/src/collective/comm.cu @@ -26,7 +26,7 @@ Result GetUniqueId(Comm const& comm, std::shared_ptr stub, std::shared static const int kRootRank = 0; ncclUniqueId id; if (comm.Rank() == kRootRank) { - auto rc = GetNCCLResult(stub, stub->GetUniqueId(&id)); + auto rc = stub->GetUniqueId(&id); CHECK(rc.OK()) 
<< rc.Report(); } auto rc = coll->Broadcast( @@ -99,12 +99,10 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr p << "Multiple processes within communication group running on same CUDA " << "device is not supported. " << PrintUUID(s_this_uuid) << "\n"; - rc = std::move(rc) << [&] { - return GetUniqueId(root, this->stub_, pimpl, &nccl_unique_id_); - } << [&] { - return GetNCCLResult(this->stub_, this->stub_->CommInitRank(&nccl_comm_, root.World(), - nccl_unique_id_, root.Rank())); - }; + rc = std::move(rc) << [&] { return GetUniqueId(root, this->stub_, pimpl, &nccl_unique_id_); } << + [&] { + return this->stub_->CommInitRank(&nccl_comm_, root.World(), nccl_unique_id_, root.Rank()); + }; CHECK(rc.OK()) << rc.Report(); for (std::int32_t r = 0; r < root.World(); ++r) { @@ -115,7 +113,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr p NCCLComm::~NCCLComm() { if (nccl_comm_) { - auto rc = GetNCCLResult(stub_, stub_->CommDestroy(nccl_comm_)); + auto rc = stub_->CommDestroy(nccl_comm_); CHECK(rc.OK()) << rc.Report(); } } diff --git a/src/collective/comm.cuh b/src/collective/comm.cuh index ef537b5a9..a818d95f8 100644 --- a/src/collective/comm.cuh +++ b/src/collective/comm.cuh @@ -52,25 +52,6 @@ class NCCLComm : public Comm { } }; -inline Result GetNCCLResult(std::shared_ptr stub, ncclResult_t code) { - if (code == ncclSuccess) { - return Success(); - } - - std::stringstream ss; - ss << "NCCL failure: " << stub->GetErrorString(code) << "."; - if (code == ncclUnhandledCudaError) { - // nccl usually preserves the last error so we can get more details. - auto err = cudaPeekAtLastError(); - ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n"; - } else if (code == ncclSystemError) { - ss << " This might be caused by a network configuration issue. 
Please consider specifying " - "the network interface for NCCL via environment variables listed in its reference: " - "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n"; - } - return Fail(ss.str()); -} - class NCCLChannel : public Channel { std::int32_t rank_{-1}; ncclComm_t nccl_comm_{}; @@ -86,13 +67,11 @@ class NCCLChannel : public Channel { Channel{comm, nullptr}, stream_{stream} {} - void SendAll(std::int8_t const* ptr, std::size_t n) override { - auto rc = GetNCCLResult(stub_, stub_->Send(ptr, n, ncclInt8, rank_, nccl_comm_, stream_)); - CHECK(rc.OK()) << rc.Report(); + [[nodiscard]] Result SendAll(std::int8_t const* ptr, std::size_t n) override { + return stub_->Send(ptr, n, ncclInt8, rank_, nccl_comm_, stream_); } - void RecvAll(std::int8_t* ptr, std::size_t n) override { - auto rc = GetNCCLResult(stub_, stub_->Recv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_)); - CHECK(rc.OK()) << rc.Report(); + [[nodiscard]] Result RecvAll(std::int8_t* ptr, std::size_t n) override { + return stub_->Recv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_); } [[nodiscard]] Result Block() override { auto rc = stream_.Sync(false); diff --git a/src/collective/comm.h b/src/collective/comm.h index b2f519e3d..82aa2c45e 100644 --- a/src/collective/comm.h +++ b/src/collective/comm.h @@ -135,21 +135,25 @@ class Channel { explicit Channel(Comm const& comm, std::shared_ptr sock) : sock_{std::move(sock)}, comm_{comm} {} - virtual void SendAll(std::int8_t const* ptr, std::size_t n) { + [[nodiscard]] virtual Result SendAll(std::int8_t const* ptr, std::size_t n) { Loop::Op op{Loop::Op::kWrite, comm_.Rank(), const_cast(ptr), n, sock_.get(), 0}; CHECK(sock_.get()); comm_.Submit(std::move(op)); + return Success(); } - void SendAll(common::Span data) { - this->SendAll(data.data(), data.size_bytes()); + [[nodiscard]] Result SendAll(common::Span data) { + return this->SendAll(data.data(), data.size_bytes()); } - virtual void RecvAll(std::int8_t* ptr, std::size_t n) { + [[nodiscard]] virtual Result RecvAll(std::int8_t* ptr, std::size_t n) { Loop::Op op{Loop::Op::kRead, comm_.Rank(), ptr, n, sock_.get(), 0}; CHECK(sock_.get()); comm_.Submit(std::move(op)); + return Success(); + } + [[nodiscard]] Result RecvAll(common::Span data) { + return this->RecvAll(data.data(), data.size_bytes()); } - void RecvAll(common::Span data) { this->RecvAll(data.data(), data.size_bytes()); } [[nodiscard]] auto Socket() const { return sock_; } [[nodiscard]] virtual Result Block() { return comm_.Block(); } diff --git a/src/collective/nccl_device_communicator.cu b/src/collective/nccl_device_communicator.cu index 25b198bde..31c2d394d 100644 --- a/src/collective/nccl_device_communicator.cu +++ b/src/collective/nccl_device_communicator.cu @@ -46,8 +46,7 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy nccl_unique_id_ = GetUniqueId(); dh::safe_cuda(cudaSetDevice(device_ordinal_)); - auto rc = - GetNCCLResult(stub_, stub_->CommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_)); + auto rc = stub_->CommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_); CHECK(rc.OK()) << rc.Report(); } @@ -56,7 +55,7 @@ NcclDeviceCommunicator::~NcclDeviceCommunicator() { return; } if (nccl_comm_) { - auto rc = GetNCCLResult(stub_, stub_->CommDestroy(nccl_comm_)); + auto rc = stub_->CommDestroy(nccl_comm_); CHECK(rc.OK()) << rc.Report(); } if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { @@ -143,9 +142,8 @@ void NcclDeviceCommunicator::BitwiseAllReduce(void 
*send_receive_buffer, std::si auto *device_buffer = buffer.data().get(); // First gather data from all the workers. - auto rc = GetNCCLResult( - stub_, stub_->Allgather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type), - nccl_comm_, dh::DefaultStream())); + auto rc = stub_->Allgather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type), + nccl_comm_, dh::DefaultStream()); CHECK(rc.OK()) << rc.Report(); if (needs_sync_) { dh::DefaultStream().Sync(); @@ -178,9 +176,9 @@ void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t co if (IsBitwiseOp(op)) { BitwiseAllReduce(send_receive_buffer, count, data_type, op); } else { - auto rc = GetNCCLResult(stub_, stub_->Allreduce(send_receive_buffer, send_receive_buffer, count, - GetNcclDataType(data_type), GetNcclRedOp(op), - nccl_comm_, dh::DefaultStream())); + auto rc = stub_->Allreduce(send_receive_buffer, send_receive_buffer, count, + GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_, + dh::DefaultStream()); CHECK(rc.OK()) << rc.Report(); } allreduce_bytes_ += count * GetTypeSize(data_type); @@ -194,8 +192,8 @@ void NcclDeviceCommunicator::AllGather(void const *send_buffer, void *receive_bu } dh::safe_cuda(cudaSetDevice(device_ordinal_)); - auto rc = GetNCCLResult(stub_, stub_->Allgather(send_buffer, receive_buffer, send_size, ncclInt8, - nccl_comm_, dh::DefaultStream())); + auto rc = stub_->Allgather(send_buffer, receive_buffer, send_size, ncclInt8, nccl_comm_, + dh::DefaultStream()); CHECK(rc.OK()) << rc.Report(); } @@ -216,19 +214,18 @@ void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_b receive_buffer->resize(total_bytes); size_t offset = 0; - auto rc = Success() << [&] { return GetNCCLResult(stub_, stub_->GroupStart()); } << [&] { + auto rc = Success() << [&] { return stub_->GroupStart(); } << [&] { for (int32_t i = 0; i < world_size_; ++i) { size_t as_bytes = segments->at(i); - auto rc = GetNCCLResult( - stub_, stub_->Broadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes, - ncclChar, i, nccl_comm_, dh::DefaultStream())); + auto rc = stub_->Broadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes, + ncclChar, i, nccl_comm_, dh::DefaultStream()); if (!rc.OK()) { return rc; } offset += as_bytes; } return Success(); - } << [&] { return GetNCCLResult(stub_, stub_->GroupEnd()); }; + } << [&] { return stub_->GroupEnd(); }; } void NcclDeviceCommunicator::Synchronize() { diff --git a/src/collective/nccl_device_communicator.cuh b/src/collective/nccl_device_communicator.cuh index a194b4ef2..ef431b571 100644 --- a/src/collective/nccl_device_communicator.cuh +++ b/src/collective/nccl_device_communicator.cuh @@ -66,7 +66,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator { static const int kRootRank = 0; ncclUniqueId id; if (rank_ == kRootRank) { - auto rc = GetNCCLResult(stub_, stub_->GetUniqueId(&id)); + auto rc = stub_->GetUniqueId(&id); CHECK(rc.OK()) << rc.Report(); } Broadcast(static_cast(&id), sizeof(ncclUniqueId), static_cast(kRootRank)); diff --git a/src/collective/nccl_stub.cc b/src/collective/nccl_stub.cc index f4705a46e..fea3f2755 100644 --- a/src/collective/nccl_stub.cc +++ b/src/collective/nccl_stub.cc @@ -4,9 +4,12 @@ #if defined(XGBOOST_USE_NCCL) #include "nccl_stub.h" -#include // for CUDA_VERSION -#include // for dlclose, dlsym, dlopen +#include // for CUDA_VERSION +#include // for cudaPeekAtLastError +#include // for dlclose, dlsym, dlopen #include +#include // for cuda_category +#include // for 
system_error #include // for int32_t #include // for stringstream @@ -16,6 +19,25 @@ #include "xgboost/logging.h" namespace xgboost::collective { +Result NcclStub::GetNcclResult(ncclResult_t code) const { + if (code == ncclSuccess) { + return Success(); + } + + std::stringstream ss; + ss << "NCCL failure: " << this->GetErrorString(code) << "."; + if (code == ncclUnhandledCudaError) { + // nccl usually preserves the last error so we can get more details. + auto err = cudaPeekAtLastError(); + ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n"; + } else if (code == ncclSystemError) { + ss << " This might be caused by a network configuration issue. Please consider specifying " + "the network interface for NCCL via environment variables listed in its reference: " + "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n"; + } + return Fail(ss.str()); +} + NcclStub::NcclStub(StringView path) : path_{std::move(path)} { #if defined(XGBOOST_USE_DLOPEN_NCCL) CHECK(!path_.empty()) << "Empty path for NCCL."; diff --git a/src/collective/nccl_stub.h b/src/collective/nccl_stub.h index a003a6f22..5281b736d 100644 --- a/src/collective/nccl_stub.h +++ b/src/collective/nccl_stub.h @@ -8,9 +8,13 @@ #include // for string -#include "xgboost/string_view.h" // for StringView +#include "xgboost/collective/result.h" // for Result +#include "xgboost/string_view.h" // for StringView namespace xgboost::collective { +/** + * @brief A stub for NCCL to facilitate dynamic loading. + */ class NcclStub { #if defined(XGBOOST_USE_DLOPEN_NCCL) void* handle_{nullptr}; @@ -30,61 +34,48 @@ class NcclStub { decltype(ncclGetErrorString)* get_error_string_{nullptr}; decltype(ncclGetVersion)* get_version_{nullptr}; + public: + Result GetNcclResult(ncclResult_t code) const; + public: explicit NcclStub(StringView path); ~NcclStub(); - [[nodiscard]] ncclResult_t Allreduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - cudaStream_t stream) const { - CHECK(allreduce_); - return this->allreduce_(sendbuff, recvbuff, count, datatype, op, comm, stream); + [[nodiscard]] Result Allreduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) const { + return this->GetNcclResult(allreduce_(sendbuff, recvbuff, count, datatype, op, comm, stream)); } - [[nodiscard]] ncclResult_t Broadcast(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, int root, ncclComm_t comm, - cudaStream_t stream) const { - CHECK(broadcast_); - return this->broadcast_(sendbuff, recvbuff, count, datatype, root, comm, stream); + [[nodiscard]] Result Broadcast(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, + cudaStream_t stream) const { + return this->GetNcclResult(broadcast_(sendbuff, recvbuff, count, datatype, root, comm, stream)); } - [[nodiscard]] ncclResult_t Allgather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, - cudaStream_t stream) const { - CHECK(allgather_); - return this->allgather_(sendbuff, recvbuff, sendcount, datatype, comm, stream); + [[nodiscard]] Result Allgather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, + cudaStream_t stream) const { + return this->GetNcclResult(allgather_(sendbuff, recvbuff, sendcount, datatype, comm, stream)); } - [[nodiscard]] ncclResult_t 
CommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, - int rank) const { - CHECK(comm_init_rank_); - return this->comm_init_rank_(comm, nranks, commId, rank); + [[nodiscard]] Result CommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, + int rank) const { + return this->GetNcclResult(this->comm_init_rank_(comm, nranks, commId, rank)); } - [[nodiscard]] ncclResult_t CommDestroy(ncclComm_t comm) const { - CHECK(comm_destroy_); - return this->comm_destroy_(comm); + [[nodiscard]] Result CommDestroy(ncclComm_t comm) const { + return this->GetNcclResult(comm_destroy_(comm)); } - - [[nodiscard]] ncclResult_t GetUniqueId(ncclUniqueId* uniqueId) const { - CHECK(get_uniqueid_); - return this->get_uniqueid_(uniqueId); + [[nodiscard]] Result GetUniqueId(ncclUniqueId* uniqueId) const { + return this->GetNcclResult(get_uniqueid_(uniqueId)); } - [[nodiscard]] ncclResult_t Send(const void* sendbuff, size_t count, ncclDataType_t datatype, - int peer, ncclComm_t comm, cudaStream_t stream) { - CHECK(send_); - return send_(sendbuff, count, datatype, peer, comm, stream); + [[nodiscard]] Result Send(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream) { + return this->GetNcclResult(send_(sendbuff, count, datatype, peer, comm, stream)); } - [[nodiscard]] ncclResult_t Recv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream) const { - CHECK(recv_); - return recv_(recvbuff, count, datatype, peer, comm, stream); + [[nodiscard]] Result Recv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream) const { + return this->GetNcclResult(recv_(recvbuff, count, datatype, peer, comm, stream)); } - [[nodiscard]] ncclResult_t GroupStart() const { - CHECK(group_start_); - return group_start_(); - } - [[nodiscard]] ncclResult_t GroupEnd() const { - CHECK(group_end_); - return group_end_(); - } - + [[nodiscard]] Result GroupStart() const { return this->GetNcclResult(group_start_()); } + [[nodiscard]] Result GroupEnd() const { return this->GetNcclResult(group_end_()); } [[nodiscard]] const char* GetErrorString(ncclResult_t result) const { return get_error_string_(result); } diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 89ec42f2b..ffe61800e 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -36,10 +36,6 @@ #include "xgboost/logging.h" #include "xgboost/span.h" -#ifdef XGBOOST_USE_NCCL -#include "nccl.h" -#endif // XGBOOST_USE_NCCL - #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 #include "rmm/mr/device/per_device_resource.hpp" #include "rmm/mr/device/thrust_allocator_adaptor.hpp" diff --git a/tests/cpp/collective/test_comm.cc b/tests/cpp/collective/test_comm.cc index 52fec7b5d..8e69b2f8e 100644 --- a/tests/cpp/collective/test_comm.cc +++ b/tests/cpp/collective/test_comm.cc @@ -25,15 +25,18 @@ TEST_F(CommTest, Channel) { WorkerForTest worker{host, port, timeout, n_workers, i}; if (i % 2 == 0) { auto p_chan = worker.Comm().Chan(i + 1); - p_chan->SendAll( - EraseType(common::Span{&i, static_cast(1)})); - auto rc = p_chan->Block(); + auto rc = Success() << [&] { + return p_chan->SendAll( + EraseType(common::Span{&i, static_cast(1)})); + } << [&] { return p_chan->Block(); }; ASSERT_TRUE(rc.OK()) << rc.Report(); } else { auto p_chan = worker.Comm().Chan(i - 1); std::int32_t r{-1}; - p_chan->RecvAll(EraseType(common::Span{&r, static_cast(1)})); - auto rc = p_chan->Block(); + auto rc = 
Success() << [&] { + return p_chan->RecvAll( + EraseType(common::Span{&r, static_cast(1)})); + } << [&] { return p_chan->Block(); }; ASSERT_TRUE(rc.OK()) << rc.Report(); ASSERT_EQ(r, i - 1); } diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu index 3d7b1efc8..47e86220d 100644 --- a/tests/cpp/collective/test_nccl_device_communicator.cu +++ b/tests/cpp/collective/test_nccl_device_communicator.cu @@ -23,7 +23,7 @@ TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidDeviceOrdinal) { TEST(NcclDeviceCommunicatorSimpleTest, SystemError) { auto stub = std::make_shared(DefaultNcclName()); - auto rc = GetNCCLResult(stub, ncclSystemError); + auto rc = stub->GetNcclResult(ncclSystemError); auto msg = rc.Report(); ASSERT_TRUE(msg.find("environment variables") != std::string::npos); } From 3f4e22015a6796a0807650d7b2f97b108f8760df Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 25 Nov 2023 11:25:47 +0800 Subject: [PATCH 18/32] Mark NCCL python test optional. (#9804) Skip the tests if XGBoost is not compiled with dlopen. --- .../test_gpu_with_dask/test_gpu_with_dask.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py index 883dbbaf2..f25ac9fb0 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py @@ -573,6 +573,10 @@ def test_with_asyncio(local_cuda_client: Client) -> None: assert isinstance(output["history"], dict) +@pytest.mark.skipif( + condition=not xgb.build_info()["USE_DLOPEN_NCCL"], + reason="Not compiled with dlopen.", +) def test_invalid_nccl(local_cuda_client: Client) -> None: client = local_cuda_client workers = tm.get_client_workers(client) @@ -592,6 +596,10 @@ def test_invalid_nccl(local_cuda_client: Client) -> None: client.gather(futures) +@pytest.mark.skipif( + condition=not xgb.build_info()["USE_DLOPEN_NCCL"], + reason="Not compiled with dlopen.", +) @pytest.mark.parametrize("tree_method", ["hist", "approx"]) def test_nccl_load(local_cuda_client: Client, tree_method: str) -> None: X, y, w = tm.make_regression(128, 16, use_cupy=True) From e9f149481e64d7d97cf37e95433ea6f6674d4dbf Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 27 Nov 2023 17:19:01 +0800 Subject: [PATCH 19/32] [sklearn] Fix loading model attributes. 
(#9808) --- python-package/xgboost/dask/__init__.py | 7 +-- python-package/xgboost/sklearn.py | 54 +++++++++---------- tests/python/test_with_sklearn.py | 22 ++++---- .../test_with_dask/test_with_dask.py | 7 +++ 4 files changed, 48 insertions(+), 42 deletions(-) diff --git a/python-package/xgboost/dask/__init__.py b/python-package/xgboost/dask/__init__.py index 068b1e6ea..046a2c982 100644 --- a/python-package/xgboost/dask/__init__.py +++ b/python-package/xgboost/dask/__init__.py @@ -79,7 +79,6 @@ from xgboost.data import _is_cudf_ser, _is_cupy_array from xgboost.sklearn import ( XGBClassifier, XGBClassifierBase, - XGBClassifierMixIn, XGBModel, XGBRanker, XGBRankerMixIn, @@ -1863,7 +1862,7 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase): "Implementation of the scikit-learn API for XGBoost classification.", ["estimators", "model"], ) -class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBase): +class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase): # pylint: disable=missing-class-docstring async def _fit_async( self, @@ -2045,10 +2044,6 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBa preds = da.map_blocks(_argmax, pred_probs, drop_axis=1) return preds - def load_model(self, fname: ModelIn) -> None: - super().load_model(fname) - self._load_model_attributes(self.get_booster()) - @xgboost_model_doc( """Implementation of the Scikit-Learn API for XGBoost Ranking. diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index ea309bd94..748b26cf6 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -43,19 +43,6 @@ from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array, _is_pandas_df from .training import train -class XGBClassifierMixIn: # pylint: disable=too-few-public-methods - """MixIn for classification.""" - - def __init__(self, *args: Any, **kwargs: Any) -> None: - super().__init__(*args, **kwargs) - - def _load_model_attributes(self, booster: Booster) -> None: - config = json.loads(booster.save_config()) - self.n_classes_ = int(config["learner"]["learner_model_param"]["num_class"]) - # binary classification is treated as regression in XGBoost. - self.n_classes_ = 2 if self.n_classes_ < 2 else self.n_classes_ - - class XGBRankerMixIn: # pylint: disable=too-few-public-methods """MixIn for ranking, defines the _estimator_type usually defined in scikit-learn base classes. @@ -850,21 +837,38 @@ class XGBModel(XGBModelBase): self.get_booster().load_model(fname) meta_str = self.get_booster().attr("scikit_learn") - if meta_str is None: - return + if meta_str is not None: + meta = json.loads(meta_str) + t = meta.get("_estimator_type", None) + if t is not None and t != self._get_type(): + raise TypeError( + "Loading an estimator with different type. Expecting: " + f"{self._get_type()}, got: {t}" + ) - meta = json.loads(meta_str) - t = meta.get("_estimator_type", None) - if t is not None and t != self._get_type(): - raise TypeError( - "Loading an estimator with different type. 
Expecting: " - f"{self._get_type()}, got: {t}" - ) self.feature_types = self.get_booster().feature_types self.get_booster().set_attr(scikit_learn=None) + config = json.loads(self.get_booster().save_config()) + self._load_model_attributes(config) load_model.__doc__ = f"""{Booster.load_model.__doc__}""" + def _load_model_attributes(self, config: dict) -> None: + """Load model attributes without hyper-parameters.""" + from sklearn.base import is_classifier + + booster = self.get_booster() + + self.objective = config["learner"]["objective"]["name"] + self.booster = config["learner"]["gradient_booster"]["name"] + self.base_score = config["learner"]["learner_model_param"]["base_score"] + self.feature_types = booster.feature_types + + if is_classifier(self): + self.n_classes_ = int(config["learner"]["learner_model_param"]["num_class"]) + # binary classification is treated as regression in XGBoost. + self.n_classes_ = 2 if self.n_classes_ < 2 else self.n_classes_ + # pylint: disable=too-many-branches def _configure_fit( self, @@ -1414,7 +1418,7 @@ def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) -> Number of boosting rounds. """, ) -class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase): +class XGBClassifier(XGBModel, XGBClassifierBase): # pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes @_deprecate_positional_args def __init__( @@ -1642,10 +1646,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase): def classes_(self) -> np.ndarray: return np.arange(self.n_classes_) - def load_model(self, fname: ModelIn) -> None: - super().load_model(fname) - self._load_model_attributes(self.get_booster()) - @xgboost_model_doc( "scikit-learn API for XGBoost random forest classification.", diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 16f7ab9d1..1e49ed053 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -944,6 +944,7 @@ def save_load_model(model_path): predt_0 = clf.predict(X) clf.save_model(model_path) clf.load_model(model_path) + assert clf.booster == "gblinear" predt_1 = clf.predict(X) np.testing.assert_allclose(predt_0, predt_1) assert clf.best_iteration == best_iteration @@ -959,25 +960,26 @@ def save_load_model(model_path): def test_save_load_model(): with tempfile.TemporaryDirectory() as tempdir: - model_path = os.path.join(tempdir, 'digits.model') + model_path = os.path.join(tempdir, "digits.model") save_load_model(model_path) with tempfile.TemporaryDirectory() as tempdir: - model_path = os.path.join(tempdir, 'digits.model.json') + model_path = os.path.join(tempdir, "digits.model.json") save_load_model(model_path) from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split with tempfile.TemporaryDirectory() as tempdir: - model_path = os.path.join(tempdir, 'digits.model.ubj') + model_path = os.path.join(tempdir, "digits.model.ubj") digits = load_digits(n_class=2) - y = digits['target'] - X = digits['data'] - booster = xgb.train({'tree_method': 'hist', - 'objective': 'binary:logistic'}, - dtrain=xgb.DMatrix(X, y), - num_boost_round=4) + y = digits["target"] + X = digits["data"] + booster = xgb.train( + {"tree_method": "hist", "objective": "binary:logistic"}, + dtrain=xgb.DMatrix(X, y), + num_boost_round=4, + ) predt_0 = booster.predict(xgb.DMatrix(X)) booster.save_model(model_path) cls = xgb.XGBClassifier() @@ -1011,6 +1013,8 @@ def test_save_load_model(): clf = xgb.XGBClassifier() 
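        # load_model() now restores the scikit-learn attributes (objective,
        # booster, n_classes_, ...) from the saved config, so the assertions
        # below work on a freshly constructed, unfitted estimator.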
clf.load_model(model_path) assert clf.classes_.size == 10 + assert clf.objective == "multi:softprob" + np.testing.assert_equal(clf.classes_, np.arange(10)) assert clf.n_classes_ == 10 diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index 3510dff7b..d380f0dee 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -1931,6 +1931,7 @@ class TestWithDask: cls.client = client cls.fit(X, y) predt_0 = cls.predict(X) + proba_0 = cls.predict_proba(X) with tempfile.TemporaryDirectory() as tmpdir: path = os.path.join(tmpdir, "model.pkl") @@ -1940,7 +1941,9 @@ class TestWithDask: with open(path, "rb") as fd: cls = pickle.load(fd) predt_1 = cls.predict(X) + proba_1 = cls.predict_proba(X) np.testing.assert_allclose(predt_0.compute(), predt_1.compute()) + np.testing.assert_allclose(proba_0.compute(), proba_1.compute()) path = os.path.join(tmpdir, "cls.json") cls.save_model(path) @@ -1949,16 +1952,20 @@ class TestWithDask: cls.load_model(path) assert cls.n_classes_ == 10 predt_2 = cls.predict(X) + proba_2 = cls.predict_proba(X) np.testing.assert_allclose(predt_0.compute(), predt_2.compute()) + np.testing.assert_allclose(proba_0.compute(), proba_2.compute()) # Use single node to load cls = xgb.XGBClassifier() cls.load_model(path) assert cls.n_classes_ == 10 predt_3 = cls.predict(X_) + proba_3 = cls.predict_proba(X_) np.testing.assert_allclose(predt_0.compute(), predt_3) + np.testing.assert_allclose(proba_0.compute(), proba_3) def test_dask_unsupported_features(client: "Client") -> None: From 34a261669614e63c8095df1a01842e5920330b30 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 27 Nov 2023 20:09:25 +0800 Subject: [PATCH 20/32] [jvm-packages] Update dependencies. 
(#9809) - scalatest: 3.2.17 - maven-checkstyle-plugin: 3.3.1 - maven-surefire-plugin: 3.2.2 - maven-project-info-reports-plugin: 3.5.0 - maven-javadoc-plugin: 3.6.2 --- jvm-packages/pom.xml | 8 ++++---- jvm-packages/xgboost4j-gpu/pom.xml | 2 +- jvm-packages/xgboost4j/pom.xml | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 5469773c5..39d4bc444 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -46,7 +46,7 @@ 23.08.0 23.08.1 cuda11 - 3.2.16 + 3.2.17 2.11.0 @@ -381,7 +381,7 @@ org.apache.maven.plugins maven-checkstyle-plugin - 3.3.0 + 3.3.1 checkstyle.xml true @@ -434,7 +434,7 @@ org.apache.maven.plugins maven-surefire-plugin - 3.1.2 + 3.2.2 false false @@ -457,7 +457,7 @@ maven-project-info-reports-plugin - 3.4.5 + 3.5.0 net.alchim31.maven diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml index c08988ac8..2fab78126 100644 --- a/jvm-packages/xgboost4j-gpu/pom.xml +++ b/jvm-packages/xgboost4j-gpu/pom.xml @@ -72,7 +72,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.5.0 + 3.6.2 protected true diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml index 46ee9158f..e05bbcf48 100644 --- a/jvm-packages/xgboost4j/pom.xml +++ b/jvm-packages/xgboost4j/pom.xml @@ -60,7 +60,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.5.0 + 3.6.2 protected true From bfa1252fca4587151d5ac7cf79875cc130bba057 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 28 Nov 2023 22:42:41 +0100 Subject: [PATCH 21/32] [R][doc] Update docs about fitting from CSR (#9818) --- R-package/R/xgb.DMatrix.R | 2 +- R-package/man/xgb.DMatrix.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index b01e98637..8e19e87b0 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -5,7 +5,7 @@ #' \code{\link{xgb.DMatrix.save}}). #' #' @param data a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, -#' a \code{dgRMatrix} object (only when making predictions from a fitted model), +#' a \code{dgRMatrix} object, #' a \code{dsparseVector} object (only when making predictions from a fitted model, will be #' interpreted as a row vector), or a character string representing a filename. #' @param info a named list of additional information to store in the \code{xgb.DMatrix} object. diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index 59ef0b3be..38a65c638 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -15,7 +15,7 @@ xgb.DMatrix( } \arguments{ \item{data}{a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, -a \code{dgRMatrix} object (only when making predictions from a fitted model), +a \code{dgRMatrix} object, a \code{dsparseVector} object (only when making predictions from a fitted model, will be interpreted as a row vector), or a character string representing a filename.} From 59684b2db648563211465a7a60c6899362ba9956 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 29 Nov 2023 13:13:40 +0800 Subject: [PATCH 22/32] [doc] Draft for language binding consistency. 
[skip ci] (#9755)
---
 doc/contrib/consistency.rst | 62 +++++++++++++++++++++++++++++++++++++
 doc/contrib/index.rst       |  1 +
 2 files changed, 63 insertions(+)
 create mode 100644 doc/contrib/consistency.rst

diff --git a/doc/contrib/consistency.rst b/doc/contrib/consistency.rst
new file mode 100644
index 000000000..a268820eb
--- /dev/null
+++ b/doc/contrib/consistency.rst
@@ -0,0 +1,62 @@
+#################################
+Consistency for Language Bindings
+#################################
+
+XGBoost has many different language bindings developed over the years; some live in the main repository while others are maintained independently. Many features and interfaces are inconsistent with each other, so this document provides guidelines and actionable items for language binding designers.
+
+*******************
+Model Serialization
+*******************
+
+The XGBoost C API exposes a couple of functions for serializing a model to persistent storage. These saved files are backward compatible, meaning one can load an older XGBoost model with a newer XGBoost version. If the model format changes, we place a deprecation notice inside the C++ implementation and open a public issue to track the status. See :doc:`/tutorials/saving_model` for details.
+
+As a result, these formats are considered stable and should work across language bindings. For instance, a model trained in R should be fully functional in C or Python. Please don't append anything to the output file or buffer.
+
+If there are extra fields that must be saved:
+
+- First, review whether the attribute can be derived from known properties of the model. For instance, the :py:attr:`~xgboost.XGBClassifier.classes_` attribute of the scikit-learn interface :py:class:`~xgboost.XGBClassifier` can be obtained through `numpy.arange(n_classes)` and doesn't need to be saved into the model. Preserving version compatibility is not a trivial task and we still spend a significant amount of time maintaining it, so please don't add complexity unless it's necessary.
+
+- Then consider whether the field is universal. For instance, we added `feature_types` to the model serialization for categorical features (a feature new in 1.6); the attribute is useful, or will be useful, regardless of the language binding.
+
+- If the field is small, we can save it as a model attribute (a key-value structure). These attributes are ignored by all other language bindings and are mostly ad-hoc storage.
+
+- Lastly, we should use UBJSON as the default output format whenever given a chance (so as not to be burdened by the old binary format).
+
+*********************
+Training Continuation
+*********************
+
+There are cases where we want to train a model based on a previous model; for boosted trees, that means either adding new trees or modifying the existing ones. This can be a normal model update, error recovery, or other special cases we don't know of yet. When it happens, the training iteration should start from 0, not from the last boosted round of the model. 0 is a special iteration number: during that iteration we perform extra checks, such as whether the label is valid. These checks can be expensive but are necessary for eliminating silent errors. Starting the iteration from zero allows us to perform these checks only once for each input data.
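+
+For illustration, here is a minimal sketch of training continuation through the Python binding; the ``xgb_model`` argument shown here is the Python-specific entry point, and other bindings expose their own:
+
+.. code-block:: python
+
+    import numpy as np
+    import xgboost as xgb
+
+    # Synthetic data, just for illustration.
+    X = np.random.randn(128, 4)
+    y = np.random.randint(0, 2, size=128)
+    dtrain = xgb.DMatrix(X, label=y)
+    params = {"objective": "binary:logistic", "max_depth": 3}
+
+    # Initial model with 10 trees.
+    booster = xgb.train(params, dtrain, num_boost_round=10)
+    # Continue training: 10 more trees are added to the same model. The
+    # internal training iteration restarts from 0, so the iteration-0
+    # checks (e.g. label validation) run once more for this input data.
+    booster = xgb.train(params, dtrain, num_boost_round=10, xgb_model=booster)
+    assert booster.num_boosted_rounds() == 20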
+
+*********
+Inference
+*********
+
+The inference function is quite inconsistent among language bindings at the time of writing, due to historical reasons, which makes it all the more important for us to keep consistency in mind in future development.
+
+- First, there's the output shape. A relatively new parameter called ``strict_shape`` exists in XGBoost but is rarely used. We want to make it the default behavior but couldn't due to compatibility concerns. See :doc:`/prediction` for details. In short, when specified, the XGBoost C++ implementation outputs the prediction with the correct shape instead of leaving the language binding to handle it.
+- The policy around early stopping is at the moment inconsistent between various interfaces. Some consider the ``best_iteration`` attribute while others don't. We should formalize that, in the future, all interfaces use ``best_iteration`` during inference unless the user has explicitly specified the ``iteration_range`` parameter.
+
+****************
+Parameter naming
+****************
+
+There are many parameter naming conventions out there, and some XGBoost interfaces try to align with the larger communities. For example, the R package might support parameter names like ``max.depth=3``, while the Spark package might support ``MaxDepth=3``. These are fine; it's better for users to keep their pipelines consistent. However, while supporting naming variants, the normal XGBoost way of naming should also be supported, meaning ``max_depth=3`` should be a valid parameter no matter what language one is using. If someone were to pass duplicated parameters like ``max.depth=3, max_depth=3``, a clear error is preferable to silently prioritizing one over the other.
+
+******************
+Default Parameters
+******************
+
+Like many other machine learning libraries, all parameters in XGBoost either can be inferred from the data or have default values. Bindings should not make copies of these default values but should let the XGBoost core decide. When a parameter key is not passed into the C++ core, XGBoost picks the default accordingly. These defaults are not necessarily optimal, but they are there for consistency. If there's a better choice of default for a parameter, we can change it inside the core and it will be automatically propagated to all bindings. Given the same set of parameters and data, the various bindings should strive to produce the same model. One exception is `num_boost_rounds`, which exists only in high-level bindings and has various aliases such as ``n_estimators``. Its default value is close to arbitrary at the moment; we haven't been able to settle on a good default yet.
+
+*******
+Logging
+*******
+
+XGBoost has a built-in default logger that can wrap a binding-specific logging facility. For instance, the Python binding registers a callback that uses the Python :py:mod:`warnings` module and the :py:func:`print` function to emit logging. We want to keep logging native to the larger communities instead of using ``std::cerr`` from C++.
+
+***********************************
+Minimum Amount of Data Manipulation
+***********************************
+
+XGBoost is mostly a machine learning library providing a boosting algorithm implementation. Some other implementations might perform some data manipulation implicitly, like deciding the coding of the data or transforming the data according to some heuristic before training. We prefer to keep these operations based on necessity instead of convenience, to keep the scope of the project well-defined.
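+
+As a hedged illustration (a sketch, not a prescribed recipe), the encoding decision can stay with the user or a third-party library such as scikit-learn's ``OrdinalEncoder``, while XGBoost only consumes the already-encoded matrix:
+
+.. code-block:: python
+
+    import pandas as pd
+    import xgboost as xgb
+    from sklearn.preprocessing import OrdinalEncoder
+
+    # Toy data, just for illustration.
+    df = pd.DataFrame({"color": ["red", "green", "red"], "size": [1.0, 2.0, 3.0]})
+    y = [0, 1, 0]
+
+    # The encoding decision is made outside of XGBoost...
+    df["color"] = OrdinalEncoder().fit_transform(df[["color"]]).ravel()
+
+    # ...so the library itself only sees numeric features.
+    clf = xgb.XGBClassifier(n_estimators=4)
+    clf.fit(df, y)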
Whenever possible, we should leave these features to 3-party libraries and consider how a user can compose their pipeline. For instance, XGBoost itself should not perform ordinal encoding for categorical data, users will pick an encoder that fits their use cases (like out-of-core implementation, distributed implementation, known mapping, etc). If some transformations are decided to be part of the algorithm, we can have it inside the core instead of the language binding. Examples would be target-encoding or sketching the response variables. If we were to support them, we could have it inside the core implementation as part of the ML algorithm. This aligns with the same principles of default parameters, various bindings should provide similar (if not the same) results given the same set of parameters and data. \ No newline at end of file diff --git a/doc/contrib/index.rst b/doc/contrib/index.rst index 6a36cb108..feac865fb 100644 --- a/doc/contrib/index.rst +++ b/doc/contrib/index.rst @@ -23,6 +23,7 @@ Here are guidelines for contributing to various aspect of the XGBoost project: Community Guideline donate coding_guide + consistency python_packaging unit_tests Docs and Examples From c0ef2f8dce0528bb4ee4b9e4235ce6d2c4b973f4 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 29 Nov 2023 06:14:17 +0100 Subject: [PATCH 23/32] [R] Fix potential memory leaks in case of R allocation failures (#9817) --- R-package/src/xgboost_R.cc | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 2938d4b6e..982350e95 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -82,11 +82,11 @@ XGB_DLL SEXP XGBGetGlobalConfig_R() { } XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) { - SEXP ret; + SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); DMatrixHandle handle; CHECK_CALL(XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent), &handle)); - ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); + R_SetExternalPtrAddr(ret, handle); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); R_API_END(); UNPROTECT(1); @@ -158,7 +158,7 @@ void CreateFromSparse(SEXP indptr, SEXP indices, SEXP data, std::string *indptr_ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP num_row, SEXP missing, SEXP n_threads) { - SEXP ret; + SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); std::int32_t threads = asInteger(n_threads); @@ -180,8 +180,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP CHECK_CALL(XGDMatrixCreateFromCSC(sindptr.c_str(), sindices.c_str(), sdata.c_str(), nrow, config.c_str(), &handle)); - ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); - + R_SetExternalPtrAddr(ret, handle); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); R_API_END(); UNPROTECT(1); @@ -190,7 +189,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP num_col, SEXP missing, SEXP n_threads) { - SEXP ret; + SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); std::int32_t threads = asInteger(n_threads); @@ -211,8 +210,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP Json::Dump(jconfig, &config); CHECK_CALL(XGDMatrixCreateFromCSR(sindptr.c_str(), 
sindices.c_str(), sdata.c_str(), ncol, config.c_str(), &handle)); - ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); - + R_SetExternalPtrAddr(ret, handle); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); R_API_END(); UNPROTECT(1); @@ -220,7 +218,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP } XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) { - SEXP ret; + SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); int len = length(idxset); std::vector idxvec(len); @@ -232,7 +230,7 @@ XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) { BeginPtr(idxvec), len, &res, 0)); - ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue)); + R_SetExternalPtrAddr(ret, res); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); R_API_END(); UNPROTECT(1); @@ -351,7 +349,7 @@ void _BoosterFinalizer(SEXP ext) { } XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) { - SEXP ret; + SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); int len = length(dmats); std::vector dvec; @@ -360,7 +358,7 @@ XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) { } BoosterHandle handle; CHECK_CALL(XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle)); - ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); + R_SetExternalPtrAddr(ret, handle); R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE); R_API_END(); UNPROTECT(1); From e2e089ce12793744295bbc598592f2d5b8cd3014 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 29 Nov 2023 15:51:07 +0800 Subject: [PATCH 24/32] [jvm-packages] Bump rapids version. (#9820) --- jvm-packages/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 39d4bc444..a06653e73 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -43,8 +43,8 @@ 5 OFF OFF - 23.08.0 - 23.08.1 + 23.10.0 + 23.10.0 cuda11 3.2.17 2.11.0 From da3d55db5b8a1e79908f27dd083e4b207854e40d Mon Sep 17 00:00:00 2001 From: "Yuan (Terry) Tang" Date: Wed, 29 Nov 2023 14:27:05 -0500 Subject: [PATCH 25/32] Update affiliation (#9822) --- CONTRIBUTORS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 29d21e6a8..f96a7dc0d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -10,8 +10,8 @@ The Project Management Committee(PMC) consists group of active committers that m - Tianqi is a Ph.D. student working on large-scale machine learning. He is the creator of the project. * [Michael Benesty](https://github.com/pommedeterresautee) - Michael is a lawyer and data scientist in France. He is the creator of XGBoost interactive analysis module in R. -* [Yuan Tang](https://github.com/terrytangyuan), Akuity - - Yuan is a founding engineer at Akuity. He contributed mostly in R and Python packages. +* [Yuan Tang](https://github.com/terrytangyuan), Red Hat + - Yuan is a principal software engineer at Red Hat. He contributed mostly in R and Python packages. * [Nan Zhu](https://github.com/CodingCat), Uber - Nan is a software engineer in Uber. He contributed mostly in JVM packages. 
* [Jiaming Yuan](https://github.com/trivialfis) From 37da66f865f4f771a0198b478799db39ed55cd88 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 29 Nov 2023 21:35:05 +0100 Subject: [PATCH 26/32] [R] Use array interface for dense DMatrix creation (#9816) --------- Co-authored-by: Jiaming Yuan --- R-package/src/xgboost_R.cc | 105 +++++++++++++++--------- R-package/tests/testthat/test_dmatrix.R | 32 ++++++++ 2 files changed, 100 insertions(+), 37 deletions(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 982350e95..8742a2271 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -21,6 +22,67 @@ #include "./xgboost_R.h" // Must follow other includes. +namespace { +[[nodiscard]] std::string MakeArrayInterfaceFromRMat(SEXP R_mat) { + SEXP mat_dims = Rf_getAttrib(R_mat, R_DimSymbol); + const int *ptr_mat_dims = INTEGER(mat_dims); + + // Lambda for type dispatch. + auto make_matrix = [=](auto const *ptr) { + using namespace xgboost; // NOLINT + using T = std::remove_pointer_t; + + auto m = linalg::MatrixView{ + common::Span{ptr, + static_cast(ptr_mat_dims[0]) * static_cast(ptr_mat_dims[1])}, + {ptr_mat_dims[0], ptr_mat_dims[1]}, // Shape + DeviceOrd::CPU(), + linalg::Order::kF // R uses column-major + }; + CHECK(m.FContiguous()); + return linalg::ArrayInterfaceStr(m); + }; + + const SEXPTYPE arr_type = TYPEOF(R_mat); + switch (arr_type) { + case REALSXP: + return make_matrix(REAL(R_mat)); + case INTSXP: + return make_matrix(INTEGER(R_mat)); + case LGLSXP: + return make_matrix(LOGICAL(R_mat)); + default: + LOG(FATAL) << "Array or matrix has unsupported type."; + } + + LOG(FATAL) << "Not reachable"; + return ""; +} + +[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) { + using namespace ::xgboost; // NOLINT + Json jconfig{Object{}}; + + const SEXPTYPE missing_type = TYPEOF(missing); + if (Rf_isNull(missing) || (missing_type == REALSXP && ISNAN(Rf_asReal(missing))) || + (missing_type == LGLSXP && Rf_asLogical(missing) == R_NaInt) || + (missing_type == INTSXP && Rf_asInteger(missing) == R_NaInt)) { + // missing is not specified + if (arr_type == REALSXP) { + jconfig["missing"] = std::numeric_limits::quiet_NaN(); + } else { + jconfig["missing"] = R_NaInt; + } + } else { + // missing specified + jconfig["missing"] = Rf_asReal(missing); + } + + jconfig["nthread"] = Rf_asInteger(n_threads); + return Json::Dump(jconfig); +} +} // namespace + /*! 
* \brief macro to annotate begin of api */ @@ -94,47 +156,16 @@ XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) { } XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) { - SEXP ret; + SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); - SEXP dim = getAttrib(mat, R_DimSymbol); - size_t nrow = static_cast(INTEGER(dim)[0]); - size_t ncol = static_cast(INTEGER(dim)[1]); - const bool is_int = TYPEOF(mat) == INTSXP; - double *din; - int *iin; - if (is_int) { - iin = INTEGER(mat); - } else { - din = REAL(mat); - } - std::vector data(nrow * ncol); - xgboost::Context ctx; - ctx.nthread = asInteger(n_threads); - std::int32_t threads = ctx.Threads(); - if (is_int) { - xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) { - for (size_t j = 0; j < ncol; ++j) { - auto v = iin[i + nrow * j]; - if (v == NA_INTEGER) { - data[i * ncol + j] = std::numeric_limits::quiet_NaN(); - } else { - data[i * ncol + j] = static_cast(v); - } - } - }); - } else { - xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) { - for (size_t j = 0; j < ncol; ++j) { - data[i * ncol + j] = din[i + nrow * j]; - } - }); - } + auto array_str = MakeArrayInterfaceFromRMat(mat); + auto config_str = MakeJsonConfigForArray(missing, n_threads, TYPEOF(mat)); DMatrixHandle handle; - CHECK_CALL(XGDMatrixCreateFromMat_omp(BeginPtr(data), nrow, ncol, - asReal(missing), &handle, threads)); - ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); + CHECK_CALL(XGDMatrixCreateFromDense(array_str.c_str(), config_str.c_str(), &handle)); + + R_SetExternalPtrAddr(ret, handle); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); R_API_END(); UNPROTECT(1); diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 461b7d158..a0cf90088 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -265,3 +265,35 @@ test_that("xgb.DMatrix: print", { }) expect_equal(txt, "xgb.DMatrix dim: 6513 x 126 info: NA colnames: no") }) + +test_that("xgb.DMatrix: Inf as missing", { + x_inf <- matrix(as.numeric(1:10), nrow = 5) + x_inf[2, 1] <- Inf + + x_nan <- x_inf + x_nan[2, 1] <- NA_real_ + + m_inf <- xgb.DMatrix(x_inf, nthread = n_threads, missing = Inf) + xgb.DMatrix.save(m_inf, "inf.dmatrix") + + m_nan <- xgb.DMatrix(x_nan, nthread = n_threads, missing = NA_real_) + xgb.DMatrix.save(m_nan, "nan.dmatrix") + + infconn <- file("inf.dmatrix", "rb") + nanconn <- file("nan.dmatrix", "rb") + + expect_equal(file.size("inf.dmatrix"), file.size("nan.dmatrix")) + + bytes <- file.size("inf.dmatrix") + infdmatrix <- readBin(infconn, "raw", n = bytes) + nandmatrix <- readBin(nanconn, "raw", n = bytes) + + expect_equal(length(infdmatrix), length(nandmatrix)) + expect_equal(infdmatrix, nandmatrix) + + close(infconn) + close(nanconn) + + file.remove("inf.dmatrix") + file.remove("nan.dmatrix") +}) From 95af5c074be363bddc439fa3d1c1460a29cc818e Mon Sep 17 00:00:00 2001 From: david-cortes Date: Thu, 30 Nov 2023 17:06:59 +0100 Subject: [PATCH 27/32] more usage of array interface, fix potential memory leaks of std::string (#9824) --- R-package/src/xgboost_R.cc | 61 +++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 8742a2271..a82913819 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -59,6 +59,32 @@ namespace { return ""; } +[[nodiscard]] 
std::string MakeArrayInterfaceFromRVector(SEXP R_vec) { + const size_t vec_len = Rf_xlength(R_vec); + + // Lambda for type dispatch. + auto make_vec = [=](auto const *ptr) { + using namespace xgboost; // NOLINT + auto v = linalg::MakeVec(ptr, vec_len); + return linalg::ArrayInterfaceStr(v); + }; + + const SEXPTYPE arr_type = TYPEOF(R_vec); + switch (arr_type) { + case REALSXP: + return make_vec(REAL(R_vec)); + case INTSXP: + return make_vec(INTEGER(R_vec)); + case LGLSXP: + return make_vec(LOGICAL(R_vec)); + default: + LOG(FATAL) << "Array or matrix has unsupported type."; + } + + LOG(FATAL) << "Not reachable"; + return ""; +} + [[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) { using namespace ::xgboost; // NOLINT Json jconfig{Object{}}; @@ -159,12 +185,15 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) { SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); - auto array_str = MakeArrayInterfaceFromRMat(mat); - auto config_str = MakeJsonConfigForArray(missing, n_threads, TYPEOF(mat)); - DMatrixHandle handle; - CHECK_CALL(XGDMatrixCreateFromDense(array_str.c_str(), config_str.c_str(), &handle)); + int res_code; + { + auto array_str = MakeArrayInterfaceFromRMat(mat); + auto config_str = MakeJsonConfigForArray(missing, n_threads, TYPEOF(mat)); + res_code = XGDMatrixCreateFromDense(array_str.c_str(), config_str.c_str(), &handle); + } + CHECK_CALL(res_code); R_SetExternalPtrAddr(ret, handle); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); R_API_END(); @@ -279,23 +308,15 @@ XGB_DLL SEXP XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) { XGB_DLL SEXP XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) { R_API_BEGIN(); - int len = length(array); - const char *name = CHAR(asChar(field)); - auto ctx = DMatrixCtx(R_ExternalPtrAddr(handle)); - if (!strcmp("group", name)) { - std::vector vec(len); - xgboost::common::ParallelFor(len, ctx->Threads(), [&](xgboost::omp_ulong i) { - vec[i] = static_cast(INTEGER(array)[i]); - }); - CHECK_CALL( - XGDMatrixSetUIntInfo(R_ExternalPtrAddr(handle), CHAR(asChar(field)), BeginPtr(vec), len)); - } else { - std::vector vec(len); - xgboost::common::ParallelFor(len, ctx->Threads(), - [&](xgboost::omp_ulong i) { vec[i] = REAL(array)[i]; }); - CHECK_CALL( - XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle), CHAR(asChar(field)), BeginPtr(vec), len)); + SEXP field_ = PROTECT(Rf_asChar(field)); + int res_code; + { + const std::string array_str = MakeArrayInterfaceFromRVector(array); + res_code = XGDMatrixSetInfoFromInterface( + R_ExternalPtrAddr(handle), CHAR(field_), array_str.c_str()); } + CHECK_CALL(res_code); + UNPROTECT(1); R_API_END(); return R_NilValue; } From 2d8c67d6dcebad31fa6e81e0fa9e42704ddd1d6f Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 2 Dec 2023 07:34:56 +0800 Subject: [PATCH 28/32] [jvm-packages] Bump dependencies. 
(#9827) - #9811 - #9814 - #9826 - #9830 - #9833 - #9832 - #9831 - #9834 --- jvm-packages/pom.xml | 4 ++-- jvm-packages/xgboost4j-gpu/pom.xml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index a06653e73..de778a995 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -33,7 +33,7 @@ UTF-8 1.8 1.8 - 1.17.1 + 1.18.0 4.13.2 3.4.1 3.4.1 @@ -481,7 +481,7 @@ commons-logging commons-logging - 1.2 + 1.3.0 org.scalatest diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml index 2fab78126..13f9797cd 100644 --- a/jvm-packages/xgboost4j-gpu/pom.xml +++ b/jvm-packages/xgboost4j-gpu/pom.xml @@ -63,7 +63,7 @@ org.apache.commons commons-lang3 - 3.13.0 + 3.14.0 From e78b46046edbd22aca59302a0e593099499e2109 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 2 Dec 2023 11:03:17 +0800 Subject: [PATCH 29/32] [CI] Update R version on Linux. (#9835) --- tests/buildkite/build-gpu-rpkg.sh | 4 +++- tests/buildkite/conftest.sh | 1 + tests/ci_build/Dockerfile.gpu_build_r_centos7 | 21 +++++++++++-------- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/tests/buildkite/build-gpu-rpkg.sh b/tests/buildkite/build-gpu-rpkg.sh index 5e0de9f8c..585dc79ae 100755 --- a/tests/buildkite/build-gpu-rpkg.sh +++ b/tests/buildkite/build-gpu-rpkg.sh @@ -7,7 +7,9 @@ source tests/buildkite/conftest.sh echo "--- Build XGBoost R package with CUDA" tests/ci_build/ci_build.sh gpu_build_r_centos7 docker \ - --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} tests/ci_build/build_r_pkg_with_cuda.sh \ + --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ + --build-arg R_VERSION_ARG=${R_VERSION} \ + tests/ci_build/build_r_pkg_with_cuda.sh \ ${BUILDKITE_COMMIT} if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh index 3d820d727..881f98672 100755 --- a/tests/buildkite/conftest.sh +++ b/tests/buildkite/conftest.sh @@ -27,6 +27,7 @@ NCCL_VERSION=2.16.5-1 RAPIDS_VERSION=23.10 SPARK_VERSION=3.4.0 JDK_VERSION=8 +R_VERSION=4.3.2 if [[ -z ${BUILDKITE:-} ]] then diff --git a/tests/ci_build/Dockerfile.gpu_build_r_centos7 b/tests/ci_build/Dockerfile.gpu_build_r_centos7 index b73cf5adb..7c95f09b5 100644 --- a/tests/ci_build/Dockerfile.gpu_build_r_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_r_centos7 @@ -1,6 +1,7 @@ ARG CUDA_VERSION_ARG FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7 ARG CUDA_VERSION_ARG +ARG R_VERSION_ARG # Install all basic requirements RUN \ @@ -11,26 +12,28 @@ RUN \ yum -y update && \ yum install -y tar unzip wget xz git which ninja-build readline-devel libX11-devel libXt-devel \ xorg-x11-server-devel openssl-devel zlib-devel bzip2-devel xz-devel \ - pcre-devel libcurl-devel texlive-* \ + pcre2-devel libcurl-devel texlive-* \ devtoolset-9-gcc devtoolset-9-binutils devtoolset-9-gcc-c++ \ devtoolset-9-gcc-gfortran devtoolset-9-libquadmath-devel \ devtoolset-9-runtime devtoolset-9-libstdc++-devel -ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:/opt/software/packages/bin:/opt/R/3.3.0/bin:$PATH -ENV LD_LIBRARY_PATH=/opt/software/packages/lib:/opt/R/3.3.0/lib64:$LD_LIBRARY_PATH +ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:/opt/software/packages/bin:/opt/R/$R_VERSION_ARG/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/software/packages/lib:/opt/R/$R_VERSION_ARG/lib64:$LD_LIBRARY_PATH ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++ ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp ENV 
F77=/opt/rh/devtoolset-9/root/usr/bin/gfortran +ENV FC=/opt/rh/devtoolset-9/root/usr/bin/gfortran -# R 3.3.0 RUN \ - wget -nv -nc https://cran.r-project.org/src/base/R-3/R-3.3.0.tar.gz && \ - tar xf R-3.3.0.tar.gz && \ - cd R-3.3.0 && \ - ./configure --prefix=/opt/R/3.3.0 --enable-R-shlib && \ + wget -nv -nc https://cran.r-project.org/src/base/R-4/R-$R_VERSION_ARG.tar.gz && \ + tar xf R-$R_VERSION_ARG.tar.gz && \ + cd R-$R_VERSION_ARG && \ + ./configure --prefix=/opt/R/$R_VERSION_ARG --enable-R-shlib --with-pcrel && \ make -j$(nproc) && \ - make install && \ + make install + +run \ # Python wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \ bash conda.sh -b -p /opt/mambaforge && \ From 7196c9d95e706566424ae0b7ea0046fe9c5a3849 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sat, 2 Dec 2023 06:43:50 +0100 Subject: [PATCH 30/32] [R] Fix memory safety issues (#9823) --- R-package/src/xgboost_R.cc | 355 +++++++++++++++++++++++-------------- 1 file changed, 226 insertions(+), 129 deletions(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index a82913819..b267d7da6 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -23,6 +23,31 @@ #include "./xgboost_R.h" // Must follow other includes. namespace { + +struct ErrorWithUnwind : public std::exception {}; + +void ThrowExceptionFromRError(void *unused, Rboolean jump) { + if (jump) { + throw ErrorWithUnwind(); + } +} + +struct PtrToConstChar { + const char *ptr; +}; + +SEXP WrappedMkChar(void *void_ptr) { + return Rf_mkChar(static_cast(void_ptr)->ptr); +} + +SEXP SafeMkChar(const char *c_str, SEXP continuation_token) { + PtrToConstChar ptr_struct{c_str}; + return R_UnwindProtect( + WrappedMkChar, static_cast(&ptr_struct), + ThrowExceptionFromRError, nullptr, + continuation_token); +} + [[nodiscard]] std::string MakeArrayInterfaceFromRMat(SEXP R_mat) { SEXP mat_dims = Rf_getAttrib(R_mat, R_DimSymbol); const int *ptr_mat_dims = INTEGER(mat_dims); @@ -208,8 +233,8 @@ void CreateFromSparse(SEXP indptr, SEXP indices, SEXP data, std::string *indptr_ const int *p_indices = INTEGER(indices); const double *p_data = REAL(data); - auto nindptr = static_cast(length(indptr)); - auto ndata = static_cast(length(data)); + auto nindptr = static_cast(Rf_xlength(indptr)); + auto ndata = static_cast(Rf_xlength(data)); CHECK_EQ(ndata, p_indptr[nindptr - 1]); xgboost::detail::MakeSparseFromPtr(p_indptr, p_indices, p_data, nindptr, indptr_str, indices_str, data_str); @@ -221,24 +246,27 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); std::int32_t threads = asInteger(n_threads); - - using xgboost::Integer; - using xgboost::Json; - using xgboost::Object; - - std::string sindptr, sindices, sdata; - CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata); - auto nrow = static_cast(INTEGER(num_row)[0]); - DMatrixHandle handle; - Json jconfig{Object{}}; - // Construct configuration - jconfig["nthread"] = Integer{threads}; - jconfig["missing"] = xgboost::Number{asReal(missing)}; - std::string config; - Json::Dump(jconfig, &config); - CHECK_CALL(XGDMatrixCreateFromCSC(sindptr.c_str(), sindices.c_str(), sdata.c_str(), nrow, - config.c_str(), &handle)); + + int res_code; + { + using xgboost::Integer; + using xgboost::Json; + using xgboost::Object; + std::string sindptr, sindices, sdata; + CreateFromSparse(indptr, indices, 
data, &sindptr, &sindices, &sdata); + auto nrow = static_cast(INTEGER(num_row)[0]); + + Json jconfig{Object{}}; + // Construct configuration + jconfig["nthread"] = Integer{threads}; + jconfig["missing"] = xgboost::Number{asReal(missing)}; + std::string config; + Json::Dump(jconfig, &config); + res_code = XGDMatrixCreateFromCSC(sindptr.c_str(), sindices.c_str(), sdata.c_str(), nrow, + config.c_str(), &handle); + } + CHECK_CALL(res_code); R_SetExternalPtrAddr(ret, handle); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); @@ -252,24 +280,27 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); std::int32_t threads = asInteger(n_threads); - - using xgboost::Integer; - using xgboost::Json; - using xgboost::Object; - - std::string sindptr, sindices, sdata; - CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata); - auto ncol = static_cast(INTEGER(num_col)[0]); - DMatrixHandle handle; - Json jconfig{Object{}}; - // Construct configuration - jconfig["nthread"] = Integer{threads}; - jconfig["missing"] = xgboost::Number{asReal(missing)}; - std::string config; - Json::Dump(jconfig, &config); - CHECK_CALL(XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol, - config.c_str(), &handle)); + + int res_code; + { + using xgboost::Integer; + using xgboost::Json; + using xgboost::Object; + + std::string sindptr, sindices, sdata; + CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata); + auto ncol = static_cast(INTEGER(num_col)[0]); + + Json jconfig{Object{}}; + // Construct configuration + jconfig["nthread"] = Integer{threads}; + jconfig["missing"] = xgboost::Number{asReal(missing)}; + std::string config; + Json::Dump(jconfig, &config); + res_code = XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol, + config.c_str(), &handle); + } R_SetExternalPtrAddr(ret, handle); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); R_API_END(); @@ -280,16 +311,22 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) { SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); - int len = length(idxset); - std::vector idxvec(len); - for (int i = 0; i < len; ++i) { - idxvec[i] = INTEGER(idxset)[i] - 1; - } + R_xlen_t len = Rf_xlength(idxset); + const int *idxset_ = INTEGER(idxset); DMatrixHandle res; - CHECK_CALL(XGDMatrixSliceDMatrixEx(R_ExternalPtrAddr(handle), - BeginPtr(idxvec), len, - &res, - 0)); + + int res_code; + { + std::vector idxvec(len); + for (R_xlen_t i = 0; i < len; ++i) { + idxvec[i] = idxset_[i] - 1; + } + res_code = XGDMatrixSliceDMatrixEx(R_ExternalPtrAddr(handle), + BeginPtr(idxvec), len, + &res, + 0); + } + CHECK_CALL(res_code); R_SetExternalPtrAddr(ret, res); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); R_API_END(); @@ -325,18 +362,29 @@ XGB_DLL SEXP XGDMatrixSetStrFeatureInfo_R(SEXP handle, SEXP field, SEXP array) { R_API_BEGIN(); size_t len{0}; if (!isNull(array)) { - len = length(array); + len = Rf_xlength(array); } - const char *name = CHAR(asChar(field)); - std::vector str_info; + SEXP str_info_holder = PROTECT(Rf_allocVector(VECSXP, len)); for (size_t i = 0; i < len; ++i) { - str_info.emplace_back(CHAR(asChar(VECTOR_ELT(array, i)))); + SET_VECTOR_ELT(str_info_holder, i, Rf_asChar(VECTOR_ELT(array, i))); } - std::vector vec(len); - std::transform(str_info.cbegin(), 
str_info.cend(), vec.begin(), - [](std::string const &str) { return str.c_str(); }); - CHECK_CALL(XGDMatrixSetStrFeatureInfo(R_ExternalPtrAddr(handle), name, vec.data(), len)); + + SEXP field_ = PROTECT(Rf_asChar(field)); + const char *name = CHAR(field_); + int res_code; + { + std::vector str_info; + for (size_t i = 0; i < len; ++i) { + str_info.emplace_back(CHAR(VECTOR_ELT(str_info_holder, i))); + } + std::vector vec(len); + std::transform(str_info.cbegin(), str_info.cend(), vec.begin(), + [](std::string const &str) { return str.c_str(); }); + res_code = XGDMatrixSetStrFeatureInfo(R_ExternalPtrAddr(handle), name, vec.data(), len); + } + CHECK_CALL(res_code); + UNPROTECT(2); R_API_END(); return R_NilValue; } @@ -369,8 +417,9 @@ XGB_DLL SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) { const float *res; CHECK_CALL(XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle), CHAR(asChar(field)), &olen, &res)); ret = PROTECT(allocVector(REALSXP, olen)); + double *ret_ = REAL(ret); for (size_t i = 0; i < olen; ++i) { - REAL(ret)[i] = res[i]; + ret_[i] = res[i]; } R_API_END(); UNPROTECT(1); @@ -403,13 +452,18 @@ void _BoosterFinalizer(SEXP ext) { XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) { SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); - int len = length(dmats); - std::vector dvec; - for (int i = 0; i < len; ++i) { - dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i))); - } + R_xlen_t len = Rf_xlength(dmats); BoosterHandle handle; - CHECK_CALL(XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle)); + + int res_code; + { + std::vector dvec; + for (R_xlen_t i = 0; i < len; ++i) { + dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i))); + } + res_code = XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle); + } + CHECK_CALL(res_code); R_SetExternalPtrAddr(ret, handle); R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE); R_API_END(); @@ -419,13 +473,18 @@ XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) { XGB_DLL SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle) { R_API_BEGIN(); - int len = length(dmats); - std::vector dvec; - for (int i = 0; i < len; ++i) { - dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i))); - } + R_xlen_t len = Rf_xlength(dmats); BoosterHandle handle; - CHECK_CALL(XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle)); + + int res_code; + { + std::vector dvec; + for (R_xlen_t i = 0; i < len; ++i) { + dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i))); + } + res_code = XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle); + } + CHECK_CALL(res_code); R_SetExternalPtrAddr(R_handle, handle); R_RegisterCFinalizerEx(R_handle, _BoosterFinalizer, TRUE); R_API_END(); @@ -434,9 +493,12 @@ XGB_DLL SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle) { XGB_DLL SEXP XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) { R_API_BEGIN(); + SEXP name_ = PROTECT(Rf_asChar(name)); + SEXP val_ = PROTECT(Rf_asChar(val)); CHECK_CALL(XGBoosterSetParam(R_ExternalPtrAddr(handle), - CHAR(asChar(name)), - CHAR(asChar(val)))); + CHAR(name_), + CHAR(val_))); + UNPROTECT(2); R_API_END(); return R_NilValue; } @@ -452,7 +514,7 @@ XGB_DLL SEXP XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) { XGB_DLL SEXP XGBoosterTrainOneIter_R(SEXP handle, SEXP dtrain, SEXP iter, SEXP grad, SEXP hess) { R_API_BEGIN(); - CHECK_EQ(length(grad), length(hess)) << "gradient and hess must have same length."; + CHECK_EQ(Rf_xlength(grad), Rf_xlength(hess)) << "gradient and hess must have same length."; SEXP gdim = getAttrib(grad, R_DimSymbol); auto n_samples 
= static_cast(INTEGER(gdim)[0]); auto n_targets = static_cast(INTEGER(gdim)[1]); @@ -463,11 +525,15 @@ XGB_DLL SEXP XGBoosterTrainOneIter_R(SEXP handle, SEXP dtrain, SEXP iter, SEXP g double const *d_grad = REAL(grad); double const *d_hess = REAL(hess); - auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle)); - auto [s_grad, s_hess] = xgboost::detail::MakeGradientInterface( - ctx, d_grad, d_hess, xgboost::linalg::kF, n_samples, n_targets); - CHECK_CALL(XGBoosterTrainOneIter(R_ExternalPtrAddr(handle), R_ExternalPtrAddr(dtrain), - asInteger(iter), s_grad.c_str(), s_hess.c_str())); + int res_code; + { + auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle)); + auto [s_grad, s_hess] = xgboost::detail::MakeGradientInterface( + ctx, d_grad, d_hess, xgboost::linalg::kF, n_samples, n_targets); + res_code = XGBoosterTrainOneIter(R_ExternalPtrAddr(handle), R_ExternalPtrAddr(dtrain), + asInteger(iter), s_grad.c_str(), s_hess.c_str()); + } + CHECK_CALL(res_code); R_API_END(); return R_NilValue; @@ -476,24 +542,34 @@ XGB_DLL SEXP XGBoosterTrainOneIter_R(SEXP handle, SEXP dtrain, SEXP iter, SEXP g XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) { const char *ret; R_API_BEGIN(); - CHECK_EQ(length(dmats), length(evnames)) + CHECK_EQ(Rf_xlength(dmats), Rf_xlength(evnames)) << "dmats and evnams must have same length"; - int len = length(dmats); - std::vector vec_dmats; - std::vector vec_names; - std::vector vec_sptr; - for (int i = 0; i < len; ++i) { - vec_dmats.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i))); - vec_names.emplace_back(CHAR(asChar(VECTOR_ELT(evnames, i)))); + R_xlen_t len = Rf_xlength(dmats); + SEXP evnames_lst = PROTECT(Rf_allocVector(VECSXP, len)); + for (R_xlen_t i = 0; i < len; i++) { + SET_VECTOR_ELT(evnames_lst, i, Rf_asChar(VECTOR_ELT(evnames, i))); } - for (int i = 0; i < len; ++i) { - vec_sptr.push_back(vec_names[i].c_str()); + + int res_code; + { + std::vector vec_dmats; + std::vector vec_names; + std::vector vec_sptr; + for (R_xlen_t i = 0; i < len; ++i) { + vec_dmats.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i))); + vec_names.emplace_back(CHAR(VECTOR_ELT(evnames_lst, i))); + } + for (R_xlen_t i = 0; i < len; ++i) { + vec_sptr.push_back(vec_names[i].c_str()); + } + res_code = XGBoosterEvalOneIter(R_ExternalPtrAddr(handle), + asInteger(iter), + BeginPtr(vec_dmats), + BeginPtr(vec_sptr), + len, &ret); } - CHECK_CALL(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle), - asInteger(iter), - BeginPtr(vec_dmats), - BeginPtr(vec_sptr), - len, &ret)); + CHECK_CALL(res_code); + UNPROTECT(1); R_API_END(); return mkString(ret); } @@ -501,10 +577,11 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) { SEXP r_out_shape; SEXP r_out_result; - SEXP r_out; + SEXP r_out = PROTECT(allocVector(VECSXP, 2)); + SEXP json_config_ = PROTECT(Rf_asChar(json_config)); R_API_BEGIN(); - char const *c_json_config = CHAR(asChar(json_config)); + char const *c_json_config = CHAR(json_config_); bst_ulong out_dim; bst_ulong const *out_shape; @@ -515,23 +592,23 @@ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_con r_out_shape = PROTECT(allocVector(INTSXP, out_dim)); size_t len = 1; + int *r_out_shape_ = INTEGER(r_out_shape); for (size_t i = 0; i < out_dim; ++i) { - INTEGER(r_out_shape)[i] = out_shape[i]; + r_out_shape_[i] = out_shape[i]; len *= out_shape[i]; } r_out_result = PROTECT(allocVector(REALSXP, len)); 
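   // The result pointer is hoisted out of the parallel loop below because
   // REAL() is an R API call and the R API is not guaranteed to be
   // thread-safe; worker threads then only write through a raw pointer.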
auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle)); + double *r_out_result_ = REAL(r_out_result); xgboost::common::ParallelFor(len, ctx->Threads(), [&](xgboost::omp_ulong i) { - REAL(r_out_result)[i] = out_result[i]; + r_out_result_[i] = out_result[i]; }); - r_out = PROTECT(allocVector(VECSXP, 2)); - SET_VECTOR_ELT(r_out, 0, r_out_shape); SET_VECTOR_ELT(r_out, 1, r_out_result); R_API_END(); - UNPROTECT(3); + UNPROTECT(4); return r_out; } @@ -554,7 +631,7 @@ XGB_DLL SEXP XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw) { R_API_BEGIN(); CHECK_CALL(XGBoosterLoadModelFromBuffer(R_ExternalPtrAddr(handle), RAW(raw), - length(raw))); + Rf_xlength(raw))); R_API_END(); return R_NilValue; } @@ -612,45 +689,54 @@ XGB_DLL SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw) { R_API_BEGIN(); CHECK_CALL(XGBoosterUnserializeFromBuffer(R_ExternalPtrAddr(handle), RAW(raw), - length(raw))); + Rf_xlength(raw))); R_API_END(); return R_NilValue; } XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format) { SEXP out; + SEXP continuation_token = PROTECT(R_MakeUnwindCont()); + SEXP dump_format_ = PROTECT(Rf_asChar(dump_format)); + SEXP fmap_ = PROTECT(Rf_asChar(fmap)); R_API_BEGIN(); bst_ulong olen; const char **res; - const char *fmt = CHAR(asChar(dump_format)); + const char *fmt = CHAR(dump_format_); CHECK_CALL(XGBoosterDumpModelEx(R_ExternalPtrAddr(handle), - CHAR(asChar(fmap)), + CHAR(fmap_), asInteger(with_stats), fmt, &olen, &res)); out = PROTECT(allocVector(STRSXP, olen)); - if (!strcmp("json", fmt)) { - std::stringstream stream; - stream << "[\n"; - for (size_t i = 0; i < olen; ++i) { - stream << res[i]; - if (i < olen - 1) { - stream << ",\n"; - } else { - stream << "\n"; + try { + if (!strcmp("json", fmt)) { + std::stringstream stream; + stream << "[\n"; + for (size_t i = 0; i < olen; ++i) { + stream << res[i]; + if (i < olen - 1) { + stream << ",\n"; + } else { + stream << "\n"; + } + } + stream << "]"; + const std::string temp_str = stream.str(); + SET_STRING_ELT(out, 0, SafeMkChar(temp_str.c_str(), continuation_token)); + } else { + for (size_t i = 0; i < olen; ++i) { + std::stringstream stream; + stream << "booster[" << i <<"]\n" << res[i]; + const std::string temp_str = stream.str(); + SET_STRING_ELT(out, i, SafeMkChar(temp_str.c_str(), continuation_token)); } } - stream << "]"; - SET_STRING_ELT(out, 0, mkChar(stream.str().c_str())); - } else { - for (size_t i = 0; i < olen; ++i) { - std::stringstream stream; - stream << "booster[" << i <<"]\n" << res[i]; - SET_STRING_ELT(out, i, mkChar(stream.str().c_str())); - } + } catch (ErrorWithUnwind &e) { + R_ContinueUnwind(continuation_token); } R_API_END(); - UNPROTECT(1); + UNPROTECT(4); return out; } @@ -676,9 +762,19 @@ XGB_DLL SEXP XGBoosterGetAttr_R(SEXP handle, SEXP name) { XGB_DLL SEXP XGBoosterSetAttr_R(SEXP handle, SEXP name, SEXP val) { R_API_BEGIN(); - const char *v = isNull(val) ? 
nullptr : CHAR(asChar(val)); + const char *v = nullptr; + SEXP name_ = PROTECT(Rf_asChar(name)); + SEXP val_; + int n_protected = 1; + if (!Rf_isNull(val)) { + val_ = PROTECT(Rf_asChar(val)); + n_protected++; + v = CHAR(val_); + } + CHECK_CALL(XGBoosterSetAttr(R_ExternalPtrAddr(handle), - CHAR(asChar(name)), v)); + CHAR(name_), v)); + UNPROTECT(n_protected); R_API_END(); return R_NilValue; } @@ -707,7 +803,7 @@ XGB_DLL SEXP XGBoosterFeatureScore_R(SEXP handle, SEXP json_config) { SEXP out_features_sexp; SEXP out_scores_sexp; SEXP out_shape_sexp; - SEXP r_out; + SEXP r_out = PROTECT(allocVector(VECSXP, 3)); R_API_BEGIN(); char const *c_json_config = CHAR(asChar(json_config)); @@ -723,23 +819,24 @@ XGB_DLL SEXP XGBoosterFeatureScore_R(SEXP handle, SEXP json_config) { &out_dim, &out_shape, &out_scores)); out_shape_sexp = PROTECT(allocVector(INTSXP, out_dim)); size_t len = 1; + int *out_shape_sexp_ = INTEGER(out_shape_sexp); for (size_t i = 0; i < out_dim; ++i) { - INTEGER(out_shape_sexp)[i] = out_shape[i]; + out_shape_sexp_[i] = out_shape[i]; len *= out_shape[i]; } - out_scores_sexp = PROTECT(allocVector(REALSXP, len)); - auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle)); - xgboost::common::ParallelFor(len, ctx->Threads(), [&](xgboost::omp_ulong i) { - REAL(out_scores_sexp)[i] = out_scores[i]; - }); - out_features_sexp = PROTECT(allocVector(STRSXP, out_n_features)); for (size_t i = 0; i < out_n_features; ++i) { SET_STRING_ELT(out_features_sexp, i, mkChar(out_features[i])); } - r_out = PROTECT(allocVector(VECSXP, 3)); + out_scores_sexp = PROTECT(allocVector(REALSXP, len)); + auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle)); + double *out_scores_sexp_ = REAL(out_scores_sexp); + xgboost::common::ParallelFor(len, ctx->Threads(), [&](xgboost::omp_ulong i) { + out_scores_sexp_[i] = out_scores[i]; + }); + SET_VECTOR_ELT(r_out, 0, out_features_sexp); SET_VECTOR_ELT(r_out, 1, out_shape_sexp); SET_VECTOR_ELT(r_out, 2, out_scores_sexp); From 381f1d3dc993cea02b17718c48cd46e68006684c Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin Date: Mon, 4 Dec 2023 09:15:57 +0100 Subject: [PATCH 31/32] Add support inference on SYCL devices (#9800) --------- Co-authored-by: Dmitry Razdoburdin <> Co-authored-by: Nikolay Petrov Co-authored-by: Alexandra --- .github/workflows/main.yml | 39 ++ .github/workflows/python_tests.yml | 41 ++ CMakeLists.txt | 18 +- include/xgboost/context.h | 13 +- include/xgboost/predictor.h | 4 +- plugin/CMakeLists.txt | 28 +- plugin/sycl/README.md | 40 ++ plugin/sycl/data.h | 256 ++++++++++ plugin/sycl/device_manager.cc | 124 +++++ plugin/sycl/device_manager.h | 47 ++ plugin/sycl/predictor/predictor.cc | 342 ++++++++++++++ plugin/updater_oneapi/README.md | 42 -- plugin/updater_oneapi/predictor_oneapi.cc | 447 ------------------ .../updater_oneapi/regression_loss_oneapi.h | 145 ------ .../updater_oneapi/regression_obj_oneapi.cc | 182 ------- src/CMakeLists.txt | 4 + src/common/common.h | 8 +- src/gbm/gbtree.cc | 44 +- src/gbm/gbtree.h | 6 +- tests/ci_build/conda_env/linux_sycl_test.yml | 20 + tests/ci_build/lint_cpp.py | 2 +- tests/ci_build/lint_python.py | 1 + tests/cpp/CMakeLists.txt | 6 +- tests/cpp/plugin/test_predictor_oneapi.cc | 168 ------- .../cpp/plugin/test_regression_obj_oneapi.cc | 176 ------- tests/cpp/plugin/test_sycl_predictor.cc | 101 ++++ tests/cpp/predictor/test_cpu_predictor.cc | 92 +--- tests/cpp/predictor/test_gpu_predictor.cu | 6 +- tests/cpp/predictor/test_predictor.cc | 90 +++- tests/cpp/predictor/test_predictor.h | 6 +- 
tests/python-sycl/test_sycl_prediction.py | 165 +++++++ 31 files changed, 1369 insertions(+), 1294 deletions(-) create mode 100755 plugin/sycl/README.md create mode 100644 plugin/sycl/data.h create mode 100644 plugin/sycl/device_manager.cc create mode 100644 plugin/sycl/device_manager.h create mode 100755 plugin/sycl/predictor/predictor.cc delete mode 100755 plugin/updater_oneapi/README.md delete mode 100755 plugin/updater_oneapi/predictor_oneapi.cc delete mode 100755 plugin/updater_oneapi/regression_loss_oneapi.h delete mode 100755 plugin/updater_oneapi/regression_obj_oneapi.cc create mode 100644 tests/ci_build/conda_env/linux_sycl_test.yml delete mode 100755 tests/cpp/plugin/test_predictor_oneapi.cc delete mode 100755 tests/cpp/plugin/test_regression_obj_oneapi.cc create mode 100755 tests/cpp/plugin/test_sycl_predictor.cc create mode 100644 tests/python-sycl/test_sycl_prediction.py diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8f1252806..20e91a5d9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -63,6 +63,45 @@ jobs: cd build ctest --extra-verbose + gtest-cpu-sycl: + name: Test Google C++ unittest (CPU SYCL) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.8"] + steps: + - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0 + with: + submodules: 'true' + - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14 + with: + cache-downloads: true + cache-env: true + environment-name: linux_sycl_test + environment-file: tests/ci_build/conda_env/linux_sycl_test.yml + + - name: Display Conda env + run: | + conda info + conda list + - name: Build and install XGBoost + shell: bash -l {0} + run: | + mkdir build + cd build + cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX + make -j$(nproc) + - name: Run gtest binary for SYCL + run: | + cd build + ./testxgboost --gtest_filter=Sycl* + - name: Run gtest binary for non SYCL + run: | + cd build + ./testxgboost --gtest_filter=-Sycl* + c-api-demo: name: Test installing XGBoost lib + building the C API demo runs-on: ${{ matrix.os }} diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index e9704c75d..0fca76673 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -256,6 +256,47 @@ jobs: run: | pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark + python-sycl-tests-on-ubuntu: + name: Test XGBoost Python package with SYCL on ${{ matrix.config.os }} + runs-on: ${{ matrix.config.os }} + timeout-minutes: 90 + strategy: + matrix: + config: + - {os: ubuntu-latest, python-version: "3.8"} + + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + + - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14 + with: + cache-downloads: true + cache-env: true + environment-name: linux_sycl_test + environment-file: tests/ci_build/conda_env/linux_sycl_test.yml + + - name: Display Conda env + run: | + conda info + conda list + - name: Build XGBoost on Ubuntu + run: | + mkdir build + cd build + cmake .. -DPLUGIN_SYCL=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + make -j$(nproc) + - name: Install Python package + run: | + cd python-package + python --version + pip install -v . 
+ - name: Test Python package + run: | + pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ + + python-system-installation-on-ubuntu: name: Test XGBoost Python package System Installation on ${{ matrix.os }} runs-on: ${{ matrix.os }} diff --git a/CMakeLists.txt b/CMakeLists.txt index a9c6f7410..dbfa1cdc2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,11 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) + +if(PLUGIN_SYCL) + set(CMAKE_CXX_COMPILER "g++") + set(CMAKE_C_COMPILER "gcc") + string(REPLACE " -isystem ${CONDA_PREFIX}/include" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +endif() + project(xgboost LANGUAGES CXX C VERSION 2.1.0) include(cmake/Utils.cmake) list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules") @@ -102,7 +109,7 @@ address, leak, undefined and thread.") option(PLUGIN_RMM "Build with RAPIDS Memory Manager (RMM)" OFF) option(PLUGIN_FEDERATED "Build with Federated Learning" OFF) ## TODO: 1. Add check if DPC++ compiler is used for building -option(PLUGIN_UPDATER_ONEAPI "DPC++ updater" OFF) +option(PLUGIN_SYCL "SYCL plugin" OFF) option(ADD_PKGCONFIG "Add xgboost.pc into system." ON) #-- Checks for building XGBoost @@ -313,6 +320,15 @@ if(PLUGIN_RMM) get_target_property(rmm_link_libs rmm::rmm INTERFACE_LINK_LIBRARIES) endif() +if(PLUGIN_SYCL) + set(CMAKE_CXX_LINK_EXECUTABLE + "icpx -qopenmp -o ") + set(CMAKE_CXX_CREATE_SHARED_LIBRARY + "icpx -qopenmp \ + , \ + -o ") +endif() + #-- library if(BUILD_STATIC_LIB) add_library(xgboost STATIC) diff --git a/include/xgboost/context.h b/include/xgboost/context.h index 6745bcb60..f32a07a03 100644 --- a/include/xgboost/context.h +++ b/include/xgboost/context.h @@ -250,9 +250,15 @@ struct Context : public XGBoostParameter { default: // Do not use the device name as this is likely an internal error, the name // wouldn't be valid. - LOG(FATAL) << "Unknown device type:" - << static_cast>(this->Device().device); - break; + if (this->Device().IsSycl()) { + LOG(WARNING) << "The requested feature doesn't have SYCL specific implementation yet. " + << "CPU implementation is used"; + return cpu_fn(); + } else { + LOG(FATAL) << "Unknown device type:" + << static_cast>(this->Device().device); + break; + } } return std::invoke_result_t(); } @@ -262,7 +268,6 @@ struct Context : public XGBoostParameter { */ template decltype(auto) DispatchDevice(CPUFn&& cpu_fn, CUDAFn&& cuda_fn, SYCLFn&& sycl_fn) const { - static_assert(std::is_same_v, std::invoke_result_t>); static_assert(std::is_same_v, std::invoke_result_t>); if (this->Device().IsSycl()) { return sycl_fn(); diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h index 25571213d..6a38d6496 100644 --- a/include/xgboost/predictor.h +++ b/include/xgboost/predictor.h @@ -92,8 +92,8 @@ class Predictor { * \param out_predt Prediction vector to be initialized. * \param model Tree model used for prediction. */ - void InitOutPredictions(const MetaInfo& info, HostDeviceVector* out_predt, - const gbm::GBTreeModel& model) const; + virtual void InitOutPredictions(const MetaInfo& info, HostDeviceVector* out_predt, + const gbm::GBTreeModel& model) const; /** * \brief Generate batch predictions for a given feature matrix. 
May use
diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt
index 58b31053f..0fecb4fb2 100644
--- a/plugin/CMakeLists.txt
+++ b/plugin/CMakeLists.txt
@@ -1,27 +1,29 @@
-if(PLUGIN_UPDATER_ONEAPI)
-  add_library(oneapi_plugin OBJECT
-    ${xgboost_SOURCE_DIR}/plugin/updater_oneapi/regression_obj_oneapi.cc
-    ${xgboost_SOURCE_DIR}/plugin/updater_oneapi/predictor_oneapi.cc)
-  target_include_directories(oneapi_plugin
+if(PLUGIN_SYCL)
+  set(CMAKE_CXX_COMPILER "icpx")
+  add_library(plugin_sycl OBJECT
+    ${xgboost_SOURCE_DIR}/plugin/sycl/device_manager.cc
+    ${xgboost_SOURCE_DIR}/plugin/sycl/predictor/predictor.cc)
+  target_include_directories(plugin_sycl
     PRIVATE
     ${xgboost_SOURCE_DIR}/include
     ${xgboost_SOURCE_DIR}/dmlc-core/include
     ${xgboost_SOURCE_DIR}/rabit/include)
-  target_compile_definitions(oneapi_plugin PUBLIC -DXGBOOST_USE_ONEAPI=1)
-  target_link_libraries(oneapi_plugin PUBLIC -fsycl)
-  set_target_properties(oneapi_plugin PROPERTIES
+  target_compile_definitions(plugin_sycl PUBLIC -DXGBOOST_USE_SYCL=1)
+  target_link_libraries(plugin_sycl PUBLIC -fsycl)
+  set_target_properties(plugin_sycl PROPERTIES
     COMPILE_FLAGS -fsycl
     CXX_STANDARD 17
     CXX_STANDARD_REQUIRED ON
     POSITION_INDEPENDENT_CODE ON)
   if(USE_OPENMP)
     find_package(OpenMP REQUIRED)
-    target_link_libraries(oneapi_plugin PUBLIC OpenMP::OpenMP_CXX)
+    set_target_properties(plugin_sycl PROPERTIES
+      COMPILE_FLAGS "-fsycl -qopenmp")
   endif()
-  # Get compilation and link flags of oneapi_plugin and propagate to objxgboost
-  target_link_libraries(objxgboost PUBLIC oneapi_plugin)
-  # Add all objects of oneapi_plugin to objxgboost
-  target_sources(objxgboost INTERFACE $<TARGET_OBJECTS:oneapi_plugin>)
+  # Get compilation and link flags of plugin_sycl and propagate to objxgboost
+  target_link_libraries(objxgboost PUBLIC plugin_sycl)
+  # Add all objects of plugin_sycl to objxgboost
+  target_sources(objxgboost INTERFACE $<TARGET_OBJECTS:plugin_sycl>)
 endif()
 
 # Add the Federate Learning plugin if enabled.
diff --git a/plugin/sycl/README.md b/plugin/sycl/README.md
new file mode 100755
index 000000000..b5dc07a1a
--- /dev/null
+++ b/plugin/sycl/README.md
@@ -0,0 +1,40 @@
+
+
+# SYCL-based Algorithm for Tree Construction
+This plugin adds support for the SYCL programming model to XGBoost's prediction algorithms.
+
+## Usage
+Specify the 'device' parameter as described in the table below to offload model training and inference to a SYCL device.
+
+### Devices
+| device | Description |
+| --- | --- |
+| sycl | use default sycl device |
+| sycl:gpu | use default sycl gpu |
+| sycl:cpu | use default sycl cpu |
+| sycl:gpu:N | use sycl gpu number N |
+| sycl:cpu:N | use sycl cpu number N |
+
+Python example:
+```python
+param['device'] = 'sycl:gpu:0'
+```
+Note: 'sycl:cpu' devices are fully functional but do not deliver competitive performance. We recommend using 'sycl:cpu' devices for testing purposes only.
+Note: if the device is specified as 'sycl', the device type is chosen automatically. If the system has both a SYCL GPU and a SYCL CPU, the GPU will be used.
+
+## Dependencies
+To build and use the plugin, install the [Intel® oneAPI DPC++/C++ Compiler](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compiler.html).
+See also the [Intel® oneAPI Programming Guide](https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/2024-0/overview.html).
+
+## Build
+From the ``xgboost`` directory, run:
+
+```bash
+$ mkdir build
+$ cd build
+$ cmake .. -DPLUGIN_SYCL=ON
+$ make -j
+```
\ No newline at end of file
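For reference, here is a minimal end-to-end sketch of the usage described in the README above (synthetic data; names and values are illustrative). Note that with this patch only prediction is offloaded to the SYCL device; other stages fall back to the CPU implementation with a warning, per the `Context::DispatchDevice` change earlier in this patch.

```python
# Minimal sketch, assuming a working SYCL runtime and an XGBoost build
# configured with -DPLUGIN_SYCL=ON.
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.random((256, 16), dtype=np.float32)
y = (X[:, 0] > 0.5).astype(np.float32)

dtrain = xgb.DMatrix(X, label=y)
params = {
    "objective": "binary:logistic",
    "tree_method": "hist",
    "device": "sycl:gpu:0",  # any value from the table above
}
booster = xgb.train(params, dtrain, num_boost_round=10)
preds = booster.predict(dtrain)  # served by the SYCL predictor
```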
diff --git a/plugin/sycl/data.h b/plugin/sycl/data.h
new file mode 100644
index 000000000..179c7cd1f
--- /dev/null
+++ b/plugin/sycl/data.h
@@ -0,0 +1,256 @@
+/*!
+ * Copyright by Contributors 2017-2023
+ */
+#ifndef PLUGIN_SYCL_DATA_H_
+#define PLUGIN_SYCL_DATA_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "xgboost/base.h"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#pragma GCC diagnostic ignored "-W#pragma-messages"
+#include "xgboost/data.h"
+#pragma GCC diagnostic pop
+#include "xgboost/logging.h"
+#include "xgboost/host_device_vector.h"
+
+#include "../../src/common/threading_utils.h"
+
+#include "CL/sycl.hpp"
+
+namespace xgboost {
+namespace sycl {
+enum class MemoryType { shared, on_device };
+
+
+template <typename T>
+class USMDeleter {
+ public:
+  explicit USMDeleter(::sycl::queue qu) : qu_(qu) {}
+
+  void operator()(T* data) const {
+    ::sycl::free(data, qu_);
+  }
+
+ private:
+  ::sycl::queue qu_;
+};
+
+/* Vector-like wrapper over SYCL USM allocations; `memory_type` selects
+ * between host-visible shared memory and device-only memory. */
+template <typename T, MemoryType memory_type = MemoryType::shared>
+class USMVector {
+  static_assert(std::is_standard_layout<T>::value, "USMVector admits only POD types");
+
+  std::shared_ptr<T> allocate_memory_(::sycl::queue* qu, size_t size) {
+    if constexpr (memory_type == MemoryType::shared) {
+      return std::shared_ptr<T>(::sycl::malloc_shared<T>(size_, *qu), USMDeleter<T>(*qu));
+    } else {
+      return std::shared_ptr<T>(::sycl::malloc_device<T>(size_, *qu), USMDeleter<T>(*qu));
+    }
+  }
+
+  void copy_vector_to_memory_(::sycl::queue* qu, const std::vector<T> &vec) {
+    if constexpr (memory_type == MemoryType::shared) {
+      std::copy(vec.begin(), vec.end(), data_.get());
+    } else {
+      qu->memcpy(data_.get(), vec.data(), size_ * sizeof(T));
+    }
+  }
+
+
+ public:
+  USMVector() : size_(0), capacity_(0), data_(nullptr) {}
+
+  USMVector(::sycl::queue* qu, size_t size) : size_(size), capacity_(size) {
+    data_ = allocate_memory_(qu, size_);
+  }
+
+  USMVector(::sycl::queue* qu, size_t size, T v) : size_(size), capacity_(size) {
+    data_ = allocate_memory_(qu, size_);
+    qu->fill(data_.get(), v, size_).wait();
+  }
+
+  USMVector(::sycl::queue* qu, const std::vector<T> &vec) {
+    size_ = vec.size();
+    capacity_ = size_;
+    data_ = allocate_memory_(qu, size_);
+    copy_vector_to_memory_(qu, vec);
+  }
+
+  ~USMVector() {
+  }
+
+  USMVector& operator=(const USMVector& other) {
+    size_ = other.size_;
+    capacity_ = other.capacity_;
+    data_ = other.data_;
+    return *this;
+  }
+
+  T* Data() { return data_.get(); }
+  const T* DataConst() const { return data_.get(); }
+
+  size_t Size() const { return size_; }
+
+  size_t Capacity() const { return capacity_; }
+
+  T& operator[] (size_t i) { return data_.get()[i]; }
+  const T& operator[] (size_t i) const { return data_.get()[i]; }
+
+  T* Begin() const { return data_.get(); }
+  T* End() const { return data_.get() + size_; }
+
+  bool Empty() const { return (size_ == 0); }
+
+  void Clear() {
+    data_.reset();
+    size_ = 0;
+    capacity_ = 0;
+  }
+
+  void Resize(::sycl::queue* qu, size_t size_new) {
+    if (size_new <= capacity_) {
+      size_ = size_new;
+    } else {
+      size_t size_old = size_;
+      auto data_old = data_;
+      size_ = size_new;
+      capacity_ = size_new;
+      data_ = allocate_memory_(qu, size_);
+      if (size_old > 0) {
+        qu->memcpy(data_.get(), data_old.get(), sizeof(T) * size_old).wait();
+      }
+    }
+  }
+
+  void Resize(::sycl::queue* qu, size_t size_new, T v) {
+    if (size_new <= size_) {
+      size_ = size_new;
+    } else if (size_new <= capacity_) {
+      qu->fill(data_.get() + size_, v, size_new - size_).wait();
+      size_ = size_new;
+    } else {
+      // Grow path: allocate a fresh USM buffer, copy the old contents,
+      // then fill the newly appended tail with v.
+      size_t size_old = size_;
+      auto data_old = data_;
+      size_ = size_new;
+      capacity_ = size_new;
+      data_ = allocate_memory_(qu, size_);
+      if (size_old > 0) {
+        qu->memcpy(data_.get(), data_old.get(), sizeof(T) * size_old).wait();
+      }
+      qu->fill(data_.get() + size_old, v, size_new - size_old).wait();
+    }
+  }
+
+  ::sycl::event ResizeAsync(::sycl::queue* qu, size_t size_new, T v) {
+    if (size_new <= size_) {
+      size_ = size_new;
+      return ::sycl::event();
+    } else if (size_new <= capacity_) {
+      auto event = qu->fill(data_.get() + size_, v, size_new - size_);
+      size_ = size_new;
+      return event;
+    } else {
+      size_t size_old = size_;
+      auto data_old = data_;
+      size_ = size_new;
+      capacity_ = size_new;
+      data_ = allocate_memory_(qu, size_);
+      ::sycl::event event;
+      if (size_old > 0) {
+        event = qu->memcpy(data_.get(), data_old.get(), sizeof(T) * size_old);
+      }
+      return qu->fill(data_.get() + size_old, v, size_new - size_old, event);
+    }
+  }
+
+  ::sycl::event ResizeAndFill(::sycl::queue* qu, size_t size_new, int v) {
+    if (size_new <= size_) {
+      size_ = size_new;
+      return qu->memset(data_.get(), v, size_new * sizeof(T));
+    } else if (size_new <= capacity_) {
+      size_ = size_new;
+      return qu->memset(data_.get(), v, size_new * sizeof(T));
+    } else {
+      size_t size_old = size_;
+      auto data_old = data_;
+      size_ = size_new;
+      capacity_ = size_new;
+      data_ = allocate_memory_(qu, size_);
+      return qu->memset(data_.get(), v, size_new * sizeof(T));
+    }
+  }
+
+  ::sycl::event Fill(::sycl::queue* qu, T v) {
+    return qu->fill(data_.get(), v, size_);
+  }
+
+  void Init(::sycl::queue* qu, const std::vector<T> &vec) {
+    size_ = vec.size();
+    capacity_ = size_;
+    data_ = allocate_memory_(qu, size_);
+    copy_vector_to_memory_(qu, vec);
+  }
+
+  using value_type = T;  // NOLINT
+
+ private:
+  size_t size_;
+  size_t capacity_;
+  std::shared_ptr<T> data_;
+};
+
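+/* A minimal usage sketch (illustrative only; assumes a SYCL runtime is
+ * available at run time):
+ *
+ *   ::sycl::queue qu{::sycl::default_selector_v};
+ *   USMVector<float> vec(&qu, 8, 0.0f);  // 8 zero-initialised floats in shared USM
+ *   vec.Resize(&qu, 16, 1.0f);           // grow; the new tail is filled with 1.0f
+ *   auto event = vec.Fill(&qu, 2.0f);    // asynchronous fill, returns a ::sycl::event
+ *   event.wait();
+ */
+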
+/* Wrapper for DMatrix which stores all batches in a single USM buffer */
+struct DeviceMatrix {
+  DMatrix* p_mat;  // Pointer to the original matrix on the host
+  ::sycl::queue qu_;
+  USMVector<size_t> row_ptr;
+  USMVector<Entry> data;
+  size_t total_offset;
+
+  DeviceMatrix(::sycl::queue qu, DMatrix* dmat) : p_mat(dmat), qu_(qu) {
+    size_t num_row = 0;
+    size_t num_nonzero = 0;
+    for (auto &batch : dmat->GetBatches<SparsePage>()) {
+      const auto& data_vec = batch.data.HostVector();
+      const auto& offset_vec = batch.offset.HostVector();
+      num_nonzero += data_vec.size();
+      num_row += batch.Size();
+    }
+
+    row_ptr.Resize(&qu_, num_row + 1);
+    data.Resize(&qu_, num_nonzero);
+
+    size_t data_offset = 0;
+    for (auto &batch : dmat->GetBatches<SparsePage>()) {
+      const auto& data_vec = batch.data.HostVector();
+      const auto& offset_vec = batch.offset.HostVector();
+      size_t batch_size = batch.Size();
+      if (batch_size > 0) {
+        std::copy(offset_vec.data(), offset_vec.data() + batch_size,
+                  row_ptr.Data() + batch.base_rowid);
+        if (batch.base_rowid > 0) {
+          for (size_t i = 0; i < batch_size; i++)
+            row_ptr[i + batch.base_rowid] += batch.base_rowid;
+        }
+        std::copy(data_vec.data(), data_vec.data() + offset_vec[batch_size],
+                  data.Data() + data_offset);
+        data_offset += offset_vec[batch_size];
+      }
+    }
+    row_ptr[num_row] = data_offset;
+    total_offset = data_offset;
+  }
+
+  ~DeviceMatrix() {
+  }
+};
+}  // namespace sycl
+}  // namespace xgboost
+
+#endif  // PLUGIN_SYCL_DATA_H_
diff --git a/plugin/sycl/device_manager.cc b/plugin/sycl/device_manager.cc
new file mode 100644
index 000000000..0254cdd6a
--- /dev/null
+++ b/plugin/sycl/device_manager.cc
@@ -0,0 +1,124 @@
+/*!
+ * Copyright 2017-2023 by Contributors + * \file device_manager.cc + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include +#pragma GCC diagnostic pop + +#include "../sycl/device_manager.h" + +namespace xgboost { +namespace sycl { + +::sycl::device DeviceManager::GetDevice(const DeviceOrd& device_spec) const { + if (!device_spec.IsSycl()) { + LOG(WARNING) << "Sycl kernel is executed with non-sycl context: " + << device_spec.Name() << ". " + << "Default sycl device_selector will be used."; + } + + bool not_use_default_selector = (device_spec.ordinal != kDefaultOrdinal) || + (rabit::IsDistributed()); + if (not_use_default_selector) { + DeviceRegister& device_register = GetDevicesRegister(); + const int device_idx = rabit::IsDistributed() ? rabit::GetRank() : device_spec.ordinal; + if (device_spec.IsSyclDefault()) { + auto& devices = device_register.devices; + CHECK_LT(device_idx, devices.size()); + return devices[device_idx]; + } else if (device_spec.IsSyclCPU()) { + auto& cpu_devices = device_register.cpu_devices; + CHECK_LT(device_idx, cpu_devices.size()); + return cpu_devices[device_idx]; + } else { + auto& gpu_devices = device_register.gpu_devices; + CHECK_LT(device_idx, gpu_devices.size()); + return gpu_devices[device_idx]; + } + } else { + if (device_spec.IsSyclCPU()) { + return ::sycl::device(::sycl::cpu_selector_v); + } else if (device_spec.IsSyclGPU()) { + return ::sycl::device(::sycl::gpu_selector_v); + } else { + return ::sycl::device(::sycl::default_selector_v); + } + } +} + +::sycl::queue DeviceManager::GetQueue(const DeviceOrd& device_spec) const { + if (!device_spec.IsSycl()) { + LOG(WARNING) << "Sycl kernel is executed with non-sycl context: " + << device_spec.Name() << ". " + << "Default sycl device_selector will be used."; + } + + QueueRegister_t& queue_register = GetQueueRegister(); + if (queue_register.count(device_spec.Name()) > 0) { + return queue_register.at(device_spec.Name()); + } + + bool not_use_default_selector = (device_spec.ordinal != kDefaultOrdinal) || + (rabit::IsDistributed()); + std::lock_guard guard(queue_registering_mutex); + if (not_use_default_selector) { + DeviceRegister& device_register = GetDevicesRegister(); + const int device_idx = rabit::IsDistributed() ? 
rabit::GetRank() : device_spec.ordinal; + if (device_spec.IsSyclDefault()) { + auto& devices = device_register.devices; + CHECK_LT(device_idx, devices.size()); + queue_register[device_spec.Name()] = ::sycl::queue(devices[device_idx]); + } else if (device_spec.IsSyclCPU()) { + auto& cpu_devices = device_register.cpu_devices; + CHECK_LT(device_idx, cpu_devices.size()); + queue_register[device_spec.Name()] = ::sycl::queue(cpu_devices[device_idx]);; + } else if (device_spec.IsSyclGPU()) { + auto& gpu_devices = device_register.gpu_devices; + CHECK_LT(device_idx, gpu_devices.size()); + queue_register[device_spec.Name()] = ::sycl::queue(gpu_devices[device_idx]); + } + } else { + if (device_spec.IsSyclCPU()) { + queue_register[device_spec.Name()] = ::sycl::queue(::sycl::cpu_selector_v); + } else if (device_spec.IsSyclGPU()) { + queue_register[device_spec.Name()] = ::sycl::queue(::sycl::gpu_selector_v); + } else { + queue_register[device_spec.Name()] = ::sycl::queue(::sycl::default_selector_v); + } + } + return queue_register.at(device_spec.Name()); +} + +DeviceManager::DeviceRegister& DeviceManager::GetDevicesRegister() const { + static DeviceRegister device_register; + + if (device_register.devices.size() == 0) { + std::lock_guard guard(device_registering_mutex); + std::vector<::sycl::device> devices = ::sycl::device::get_devices(); + for (size_t i = 0; i < devices.size(); i++) { + LOG(INFO) << "device_index = " << i << ", name = " + << devices[i].get_info<::sycl::info::device::name>(); + } + + for (size_t i = 0; i < devices.size(); i++) { + device_register.devices.push_back(devices[i]); + if (devices[i].is_cpu()) { + device_register.cpu_devices.push_back(devices[i]); + } else if (devices[i].is_gpu()) { + device_register.gpu_devices.push_back(devices[i]); + } + } + } + return device_register; +} + +DeviceManager::QueueRegister_t& DeviceManager::GetQueueRegister() const { + static QueueRegister_t queue_register; + return queue_register; +} + +} // namespace sycl +} // namespace xgboost diff --git a/plugin/sycl/device_manager.h b/plugin/sycl/device_manager.h new file mode 100644 index 000000000..0ae2ee9fe --- /dev/null +++ b/plugin/sycl/device_manager.h @@ -0,0 +1,47 @@ +/*! + * Copyright 2017-2023 by Contributors + * \file device_manager.h + */ +#ifndef PLUGIN_SYCL_DEVICE_MANAGER_H_ +#define PLUGIN_SYCL_DEVICE_MANAGER_H_ + +#include +#include +#include +#include + +#include + +#include "xgboost/context.h" + +namespace xgboost { +namespace sycl { + +class DeviceManager { + public: + ::sycl::queue GetQueue(const DeviceOrd& device_spec) const; + + ::sycl::device GetDevice(const DeviceOrd& device_spec) const; + + private: + using QueueRegister_t = std::unordered_map; + constexpr static int kDefaultOrdinal = -1; + + struct DeviceRegister { + std::vector<::sycl::device> devices; + std::vector<::sycl::device> cpu_devices; + std::vector<::sycl::device> gpu_devices; + }; + + QueueRegister_t& GetQueueRegister() const; + + DeviceRegister& GetDevicesRegister() const; + + mutable std::mutex queue_registering_mutex; + mutable std::mutex device_registering_mutex; +}; + +} // namespace sycl +} // namespace xgboost + +#endif // PLUGIN_SYCL_DEVICE_MANAGER_H_ diff --git a/plugin/sycl/predictor/predictor.cc b/plugin/sycl/predictor/predictor.cc new file mode 100755 index 000000000..3ceb99f1e --- /dev/null +++ b/plugin/sycl/predictor/predictor.cc @@ -0,0 +1,342 @@ +/*! 
+ * Copyright by Contributors 2017-2023 + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include +#pragma GCC diagnostic pop + +#include +#include +#include + +#include + +#include "../data.h" + +#include "dmlc/registry.h" + +#include "xgboost/tree_model.h" +#include "xgboost/predictor.h" +#include "xgboost/tree_updater.h" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#include "../../src/data/adapter.h" +#pragma GCC diagnostic pop +#include "../../src/common/math.h" +#include "../../src/gbm/gbtree_model.h" + +#include "../device_manager.h" + +namespace xgboost { +namespace sycl { +namespace predictor { + +DMLC_REGISTRY_FILE_TAG(predictor_sycl); + +/* Wrapper for descriptor of a tree node */ +struct DeviceNode { + DeviceNode() + : fidx(-1), left_child_idx(-1), right_child_idx(-1) {} + + union NodeValue { + float leaf_weight; + float fvalue; + }; + + int fidx; + int left_child_idx; + int right_child_idx; + NodeValue val; + + explicit DeviceNode(const RegTree::Node& n) { + this->left_child_idx = n.LeftChild(); + this->right_child_idx = n.RightChild(); + this->fidx = n.SplitIndex(); + if (n.DefaultLeft()) { + fidx |= (1U << 31); + } + + if (n.IsLeaf()) { + this->val.leaf_weight = n.LeafValue(); + } else { + this->val.fvalue = n.SplitCond(); + } + } + + bool IsLeaf() const { return left_child_idx == -1; } + + int GetFidx() const { return fidx & ((1U << 31) - 1U); } + + bool MissingLeft() const { return (fidx >> 31) != 0; } + + int MissingIdx() const { + if (MissingLeft()) { + return this->left_child_idx; + } else { + return this->right_child_idx; + } + } + + float GetFvalue() const { return val.fvalue; } + + float GetWeight() const { return val.leaf_weight; } +}; + +/* SYCL implementation of a device model, + * storing tree structure in USM buffers to provide access from device kernels + */ +class DeviceModel { + public: + ::sycl::queue qu_; + USMVector nodes_; + USMVector tree_segments_; + USMVector tree_group_; + size_t tree_beg_; + size_t tree_end_; + int num_group_; + + DeviceModel() {} + + ~DeviceModel() {} + + void Init(::sycl::queue qu, const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end) { + qu_ = qu; + + tree_segments_.Resize(&qu_, (tree_end - tree_begin) + 1); + int sum = 0; + tree_segments_[0] = sum; + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + if (model.trees[tree_idx]->HasCategoricalSplit()) { + LOG(FATAL) << "Categorical features are not yet supported by sycl"; + } + sum += model.trees[tree_idx]->GetNodes().size(); + tree_segments_[tree_idx - tree_begin + 1] = sum; + } + + nodes_.Resize(&qu_, sum); + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + auto& src_nodes = model.trees[tree_idx]->GetNodes(); + for (size_t node_idx = 0; node_idx < src_nodes.size(); node_idx++) + nodes_[node_idx + tree_segments_[tree_idx - tree_begin]] = + static_cast(src_nodes[node_idx]); + } + + tree_group_.Resize(&qu_, model.tree_info.size()); + for (size_t tree_idx = 0; tree_idx < model.tree_info.size(); tree_idx++) + tree_group_[tree_idx] = model.tree_info[tree_idx]; + + tree_beg_ = tree_begin; + tree_end_ = tree_end; + num_group_ = model.learner_model_param->num_output_group; + } +}; + +float GetFvalue(int ridx, int fidx, Entry* data, size_t* row_ptr, bool* is_missing) { + // Binary search + auto begin_ptr = data + row_ptr[ridx]; + auto end_ptr = data + row_ptr[ridx + 1]; + Entry* 
previous_middle = nullptr; + while (end_ptr != begin_ptr) { + auto middle = begin_ptr + (end_ptr - begin_ptr) / 2; + if (middle == previous_middle) { + break; + } else { + previous_middle = middle; + } + + if (middle->index == fidx) { + *is_missing = false; + return middle->fvalue; + } else if (middle->index < fidx) { + begin_ptr = middle; + } else { + end_ptr = middle; + } + } + *is_missing = true; + return 0.0; +} + +float GetLeafWeight(int ridx, const DeviceNode* tree, Entry* data, size_t* row_ptr) { + DeviceNode n = tree[0]; + int node_id = 0; + bool is_missing; + while (!n.IsLeaf()) { + float fvalue = GetFvalue(ridx, n.GetFidx(), data, row_ptr, &is_missing); + // Missing value + if (is_missing) { + n = tree[n.MissingIdx()]; + } else { + if (fvalue < n.GetFvalue()) { + node_id = n.left_child_idx; + n = tree[n.left_child_idx]; + } else { + node_id = n.right_child_idx; + n = tree[n.right_child_idx]; + } + } + } + return n.GetWeight(); +} + +void DevicePredictInternal(::sycl::queue qu, + sycl::DeviceMatrix* dmat, + HostDeviceVector* out_preds, + const gbm::GBTreeModel& model, + size_t tree_begin, + size_t tree_end) { + if (tree_end - tree_begin == 0) return; + if (out_preds->HostVector().size() == 0) return; + + DeviceModel device_model; + device_model.Init(qu, model, tree_begin, tree_end); + + auto& out_preds_vec = out_preds->HostVector(); + + DeviceNode* nodes = device_model.nodes_.Data(); + ::sycl::buffer out_preds_buf(out_preds_vec.data(), out_preds_vec.size()); + size_t* tree_segments = device_model.tree_segments_.Data(); + int* tree_group = device_model.tree_group_.Data(); + size_t* row_ptr = dmat->row_ptr.Data(); + Entry* data = dmat->data.Data(); + int num_features = dmat->p_mat->Info().num_col_; + int num_rows = dmat->row_ptr.Size() - 1; + int num_group = model.learner_model_param->num_output_group; + + qu.submit([&](::sycl::handler& cgh) { + auto out_predictions = out_preds_buf.template get_access<::sycl::access::mode::read_write>(cgh); + cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::id<1> pid) { + int global_idx = pid[0]; + if (global_idx >= num_rows) return; + if (num_group == 1) { + float sum = 0.0; + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + const DeviceNode* tree = nodes + tree_segments[tree_idx - tree_begin]; + sum += GetLeafWeight(global_idx, tree, data, row_ptr); + } + out_predictions[global_idx] += sum; + } else { + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + const DeviceNode* tree = nodes + tree_segments[tree_idx - tree_begin]; + int out_prediction_idx = global_idx * num_group + tree_group[tree_idx]; + out_predictions[out_prediction_idx] += GetLeafWeight(global_idx, tree, data, row_ptr); + } + } + }); + }).wait(); +} + +class Predictor : public xgboost::Predictor { + protected: + void InitOutPredictions(const MetaInfo& info, + HostDeviceVector* out_preds, + const gbm::GBTreeModel& model) const override { + CHECK_NE(model.learner_model_param->num_output_group, 0); + size_t n = model.learner_model_param->num_output_group * info.num_row_; + const auto& base_margin = info.base_margin_.Data()->HostVector(); + out_preds->Resize(n); + std::vector& out_preds_h = out_preds->HostVector(); + if (base_margin.size() == n) { + CHECK_EQ(out_preds->Size(), n); + std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin()); + } else { + auto base_score = model.learner_model_param->BaseScore(ctx_)(0); + if (!base_margin.empty()) { + std::ostringstream oss; + oss << "Ignoring the base margin, since it has 
incorrect length. " + << "The base margin must be an array of length "; + if (model.learner_model_param->num_output_group > 1) { + oss << "[num_class] * [number of data points], i.e. " + << model.learner_model_param->num_output_group << " * " << info.num_row_ + << " = " << n << ". "; + } else { + oss << "[number of data points], i.e. " << info.num_row_ << ". "; + } + oss << "Instead, all data points will use " + << "base_score = " << base_score; + LOG(WARNING) << oss.str(); + } + std::fill(out_preds_h.begin(), out_preds_h.end(), base_score); + } + } + + public: + explicit Predictor(Context const* context) : + xgboost::Predictor::Predictor{context}, + cpu_predictor(xgboost::Predictor::Create("cpu_predictor", context)) {} + + void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts, + const gbm::GBTreeModel &model, uint32_t tree_begin, + uint32_t tree_end = 0) const override { + ::sycl::queue qu = device_manager.GetQueue(ctx_->Device()); + // TODO(razdoburdin): remove temporary workaround after cache fix + sycl::DeviceMatrix device_matrix(qu, dmat); + + auto* out_preds = &predts->predictions; + if (tree_end == 0) { + tree_end = model.trees.size(); + } + + if (tree_begin < tree_end) { + DevicePredictInternal(qu, &device_matrix, out_preds, model, tree_begin, tree_end); + } + } + + bool InplacePredict(std::shared_ptr p_m, + const gbm::GBTreeModel &model, float missing, + PredictionCacheEntry *out_preds, uint32_t tree_begin, + unsigned tree_end) const override { + LOG(WARNING) << "InplacePredict is not yet implemented for SYCL. CPU Predictor is used."; + return cpu_predictor->InplacePredict(p_m, model, missing, out_preds, tree_begin, tree_end); + } + + void PredictInstance(const SparsePage::Inst& inst, + std::vector* out_preds, + const gbm::GBTreeModel& model, unsigned ntree_limit, + bool is_column_split) const override { + LOG(WARNING) << "PredictInstance is not yet implemented for SYCL. CPU Predictor is used."; + cpu_predictor->PredictInstance(inst, out_preds, model, ntree_limit, is_column_split); + } + + void PredictLeaf(DMatrix* p_fmat, HostDeviceVector* out_preds, + const gbm::GBTreeModel& model, unsigned ntree_limit) const override { + LOG(WARNING) << "PredictLeaf is not yet implemented for SYCL. CPU Predictor is used."; + cpu_predictor->PredictLeaf(p_fmat, out_preds, model, ntree_limit); + } + + void PredictContribution(DMatrix* p_fmat, HostDeviceVector* out_contribs, + const gbm::GBTreeModel& model, uint32_t ntree_limit, + const std::vector* tree_weights, + bool approximate, int condition, + unsigned condition_feature) const override { + LOG(WARNING) << "PredictContribution is not yet implemented for SYCL. CPU Predictor is used."; + cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit, tree_weights, + approximate, condition, condition_feature); + } + + void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector* out_contribs, + const gbm::GBTreeModel& model, unsigned ntree_limit, + const std::vector* tree_weights, + bool approximate) const override { + LOG(WARNING) << "PredictInteractionContributions is not yet implemented for SYCL. 
" + << "CPU Predictor is used."; + cpu_predictor->PredictInteractionContributions(p_fmat, out_contribs, model, ntree_limit, + tree_weights, approximate); + } + + private: + DeviceManager device_manager; + + std::unique_ptr cpu_predictor; +}; + +XGBOOST_REGISTER_PREDICTOR(Predictor, "sycl_predictor") +.describe("Make predictions using SYCL.") +.set_body([](Context const* ctx) { return new Predictor(ctx); }); + +} // namespace predictor +} // namespace sycl +} // namespace xgboost diff --git a/plugin/updater_oneapi/README.md b/plugin/updater_oneapi/README.md deleted file mode 100755 index c2faf6574..000000000 --- a/plugin/updater_oneapi/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# DPC++-based Algorithm for Tree Construction -This plugin adds support of OneAPI programming model for tree construction and prediction algorithms to XGBoost. - -## Usage -Specify the 'objective' parameter as one of the following options to offload computation of objective function on OneAPI device. - -### Algorithms -| objective | Description | -| --- | --- | -reg:squarederror_oneapi | regression with squared loss | -reg:squaredlogerror_oneapi | regression with root mean squared logarithmic loss | -reg:logistic_oneapi | logistic regression for probability regression task | -binary:logistic_oneapi | logistic regression for binary classification task | -binary:logitraw_oneapi | logistic regression for classification, output score before logistic transformation | - -Specify the 'predictor' parameter as one of the following options to offload prediction stage on OneAPI device. - -### Algorithms -| predictor | Description | -| --- | --- | -predictor_oneapi | prediction using OneAPI device | - -Please note that parameter names are not finalized and can be changed during further integration of OneAPI support. - -Python example: -```python -param['predictor'] = 'predictor_oneapi' -param['objective'] = 'reg:squarederror_oneapi' -``` - -## Dependencies -Building the plugin requires Data Parallel C++ Compiler (https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/dpc-compiler.html) - -## Build -From the command line on Linux starting from the xgboost directory: - -```bash -$ mkdir build -$ cd build -$ EXPORT CXX=dpcpp && cmake .. -DPLUGIN_UPDATER_ONEAPI=ON -$ make -j -``` diff --git a/plugin/updater_oneapi/predictor_oneapi.cc b/plugin/updater_oneapi/predictor_oneapi.cc deleted file mode 100755 index 25a14186c..000000000 --- a/plugin/updater_oneapi/predictor_oneapi.cc +++ /dev/null @@ -1,447 +0,0 @@ -/*! - * Copyright by Contributors 2017-2020 - */ -#include // for any -#include -#include -#include - -#include "../../src/common/math.h" -#include "../../src/data/adapter.h" -#include "../../src/gbm/gbtree_model.h" -#include "CL/sycl.hpp" -#include "xgboost/base.h" -#include "xgboost/data.h" -#include "xgboost/host_device_vector.h" -#include "xgboost/logging.h" -#include "xgboost/predictor.h" -#include "xgboost/tree_model.h" -#include "xgboost/tree_updater.h" - -namespace xgboost { -namespace predictor { - -DMLC_REGISTRY_FILE_TAG(predictor_oneapi); - -/*! \brief Element from a sparse vector */ -struct EntryOneAPI { - /*! \brief feature index */ - bst_feature_t index; - /*! \brief feature value */ - bst_float fvalue; - /*! \brief default constructor */ - EntryOneAPI() = default; - /*! - * \brief constructor with index and value - * \param index The feature or row index. - * \param fvalue The feature value. 
- */ - EntryOneAPI(bst_feature_t index, bst_float fvalue) : index(index), fvalue(fvalue) {} - - EntryOneAPI(const Entry& entry) : index(entry.index), fvalue(entry.fvalue) {} - - /*! \brief reversely compare feature values */ - inline static bool CmpValue(const EntryOneAPI& a, const EntryOneAPI& b) { - return a.fvalue < b.fvalue; - } - inline bool operator==(const EntryOneAPI& other) const { - return (this->index == other.index && this->fvalue == other.fvalue); - } -}; - -struct DeviceMatrixOneAPI { - DMatrix* p_mat; // Pointer to the original matrix on the host - cl::sycl::queue qu_; - size_t* row_ptr; - size_t row_ptr_size; - EntryOneAPI* data; - - DeviceMatrixOneAPI(DMatrix* dmat, cl::sycl::queue qu) : p_mat(dmat), qu_(qu) { - size_t num_row = 0; - size_t num_nonzero = 0; - for (auto &batch : dmat->GetBatches()) { - const auto& data_vec = batch.data.HostVector(); - const auto& offset_vec = batch.offset.HostVector(); - num_nonzero += data_vec.size(); - num_row += batch.Size(); - } - - row_ptr = cl::sycl::malloc_shared(num_row + 1, qu_); - data = cl::sycl::malloc_shared(num_nonzero, qu_); - - size_t data_offset = 0; - for (auto &batch : dmat->GetBatches()) { - const auto& data_vec = batch.data.HostVector(); - const auto& offset_vec = batch.offset.HostVector(); - size_t batch_size = batch.Size(); - if (batch_size > 0) { - std::copy(offset_vec.data(), offset_vec.data() + batch_size, - row_ptr + batch.base_rowid); - if (batch.base_rowid > 0) { - for(size_t i = 0; i < batch_size; i++) - row_ptr[i + batch.base_rowid] += batch.base_rowid; - } - std::copy(data_vec.data(), data_vec.data() + offset_vec[batch_size], - data + data_offset); - data_offset += offset_vec[batch_size]; - } - } - row_ptr[num_row] = data_offset; - row_ptr_size = num_row + 1; - } - - ~DeviceMatrixOneAPI() { - if (row_ptr) { - cl::sycl::free(row_ptr, qu_); - } - if (data) { - cl::sycl::free(data, qu_); - } - } -}; - -struct DeviceNodeOneAPI { - DeviceNodeOneAPI() - : fidx(-1), left_child_idx(-1), right_child_idx(-1) {} - - union NodeValue { - float leaf_weight; - float fvalue; - }; - - int fidx; - int left_child_idx; - int right_child_idx; - NodeValue val; - - DeviceNodeOneAPI(const RegTree::Node& n) { // NOLINT - this->left_child_idx = n.LeftChild(); - this->right_child_idx = n.RightChild(); - this->fidx = n.SplitIndex(); - if (n.DefaultLeft()) { - fidx |= (1U << 31); - } - - if (n.IsLeaf()) { - this->val.leaf_weight = n.LeafValue(); - } else { - this->val.fvalue = n.SplitCond(); - } - } - - bool IsLeaf() const { return left_child_idx == -1; } - - int GetFidx() const { return fidx & ((1U << 31) - 1U); } - - bool MissingLeft() const { return (fidx >> 31) != 0; } - - int MissingIdx() const { - if (MissingLeft()) { - return this->left_child_idx; - } else { - return this->right_child_idx; - } - } - - float GetFvalue() const { return val.fvalue; } - - float GetWeight() const { return val.leaf_weight; } -}; - -class DeviceModelOneAPI { - public: - cl::sycl::queue qu_; - DeviceNodeOneAPI* nodes; - size_t* tree_segments; - int* tree_group; - size_t tree_beg_; - size_t tree_end_; - int num_group; - - DeviceModelOneAPI() : nodes(nullptr), tree_segments(nullptr), tree_group(nullptr) {} - - ~DeviceModelOneAPI() { - Reset(); - } - - void Reset() { - if (nodes) - cl::sycl::free(nodes, qu_); - if (tree_segments) - cl::sycl::free(tree_segments, qu_); - if (tree_group) - cl::sycl::free(tree_group, qu_); - } - - void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, cl::sycl::queue qu) { - qu_ = qu; - 
CHECK_EQ(model.param.size_leaf_vector, 0); - Reset(); - - tree_segments = cl::sycl::malloc_shared((tree_end - tree_begin) + 1, qu_); - int sum = 0; - tree_segments[0] = sum; - for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { - sum += model.trees[tree_idx]->GetNodes().size(); - tree_segments[tree_idx - tree_begin + 1] = sum; - } - - nodes = cl::sycl::malloc_shared(sum, qu_); - for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { - auto& src_nodes = model.trees[tree_idx]->GetNodes(); - for (size_t node_idx = 0; node_idx < src_nodes.size(); node_idx++) - nodes[node_idx + tree_segments[tree_idx - tree_begin]] = src_nodes[node_idx]; - } - - tree_group = cl::sycl::malloc_shared(model.tree_info.size(), qu_); - for (size_t tree_idx = 0; tree_idx < model.tree_info.size(); tree_idx++) - tree_group[tree_idx] = model.tree_info[tree_idx]; - - tree_beg_ = tree_begin; - tree_end_ = tree_end; - num_group = model.learner_model_param->num_output_group; - } -}; - -float GetFvalue(int ridx, int fidx, EntryOneAPI* data, size_t* row_ptr, bool& is_missing) { - // Binary search - auto begin_ptr = data + row_ptr[ridx]; - auto end_ptr = data + row_ptr[ridx + 1]; - EntryOneAPI* previous_middle = nullptr; - while (end_ptr != begin_ptr) { - auto middle = begin_ptr + (end_ptr - begin_ptr) / 2; - if (middle == previous_middle) { - break; - } else { - previous_middle = middle; - } - - if (middle->index == fidx) { - is_missing = false; - return middle->fvalue; - } else if (middle->index < fidx) { - begin_ptr = middle; - } else { - end_ptr = middle; - } - } - is_missing = true; - return 0.0; -} - -float GetLeafWeight(int ridx, const DeviceNodeOneAPI* tree, EntryOneAPI* data, size_t* row_ptr) { - DeviceNodeOneAPI n = tree[0]; - int node_id = 0; - bool is_missing; - while (!n.IsLeaf()) { - float fvalue = GetFvalue(ridx, n.GetFidx(), data, row_ptr, is_missing); - // Missing value - if (is_missing) { - n = tree[n.MissingIdx()]; - } else { - if (fvalue < n.GetFvalue()) { - node_id = n.left_child_idx; - n = tree[n.left_child_idx]; - } else { - node_id = n.right_child_idx; - n = tree[n.right_child_idx]; - } - } - } - return n.GetWeight(); -} - -class PredictorOneAPI : public Predictor { - protected: - void InitOutPredictions(const MetaInfo& info, - HostDeviceVector* out_preds, - const gbm::GBTreeModel& model) const { - CHECK_NE(model.learner_model_param->num_output_group, 0); - size_t n = model.learner_model_param->num_output_group * info.num_row_; - const auto& base_margin = info.base_margin_.HostVector(); - out_preds->Resize(n); - std::vector& out_preds_h = out_preds->HostVector(); - if (base_margin.size() == n) { - CHECK_EQ(out_preds->Size(), n); - std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin()); - } else { - if (!base_margin.empty()) { - std::ostringstream oss; - oss << "Ignoring the base margin, since it has incorrect length. " - << "The base margin must be an array of length "; - if (model.learner_model_param->num_output_group > 1) { - oss << "[num_class] * [number of data points], i.e. " - << model.learner_model_param->num_output_group << " * " << info.num_row_ - << " = " << n << ". "; - } else { - oss << "[number of data points], i.e. " << info.num_row_ << ". 
"; - } - oss << "Instead, all data points will use " - << "base_score = " << model.learner_model_param->base_score; - LOG(WARNING) << oss.str(); - } - std::fill(out_preds_h.begin(), out_preds_h.end(), - model.learner_model_param->base_score); - } - } - - void DevicePredictInternal(DeviceMatrixOneAPI* dmat, HostDeviceVector* out_preds, - const gbm::GBTreeModel& model, size_t tree_begin, - size_t tree_end) { - if (tree_end - tree_begin == 0) { - return; - } - model_.Init(model, tree_begin, tree_end, qu_); - - auto& out_preds_vec = out_preds->HostVector(); - - DeviceNodeOneAPI* nodes = model_.nodes; - cl::sycl::buffer out_preds_buf(out_preds_vec.data(), out_preds_vec.size()); - size_t* tree_segments = model_.tree_segments; - int* tree_group = model_.tree_group; - size_t* row_ptr = dmat->row_ptr; - EntryOneAPI* data = dmat->data; - int num_features = dmat->p_mat->Info().num_col_; - int num_rows = dmat->row_ptr_size - 1; - int num_group = model.learner_model_param->num_output_group; - - qu_.submit([&](cl::sycl::handler& cgh) { - auto out_predictions = out_preds_buf.get_access(cgh); - cgh.parallel_for(cl::sycl::range<1>(num_rows), [=](cl::sycl::id<1> pid) { - int global_idx = pid[0]; - if (global_idx >= num_rows) return; - if (num_group == 1) { - float sum = 0.0; - for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { - const DeviceNodeOneAPI* tree = nodes + tree_segments[tree_idx - tree_begin]; - sum += GetLeafWeight(global_idx, tree, data, row_ptr); - } - out_predictions[global_idx] += sum; - } else { - for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { - const DeviceNodeOneAPI* tree = nodes + tree_segments[tree_idx - tree_begin]; - int out_prediction_idx = global_idx * num_group + tree_group[tree_idx]; - out_predictions[out_prediction_idx] += GetLeafWeight(global_idx, tree, data, row_ptr); - } - } - }); - }).wait(); - } - - public: - explicit PredictorOneAPI(Context const* generic_param) : - Predictor::Predictor{generic_param}, cpu_predictor(Predictor::Create("cpu_predictor", generic_param)) { - cl::sycl::default_selector selector; - qu_ = cl::sycl::queue(selector); - } - - // ntree_limit is a very problematic parameter, as it's ambiguous in the context of - // multi-output and forest. Same problem exists for tree_begin - void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts, - const gbm::GBTreeModel& model, int tree_begin, - uint32_t const ntree_limit = 0) override { - if (this->device_matrix_cache_.find(dmat) == - this->device_matrix_cache_.end()) { - this->device_matrix_cache_.emplace( - dmat, std::unique_ptr( - new DeviceMatrixOneAPI(dmat, qu_))); - } - DeviceMatrixOneAPI* device_matrix = device_matrix_cache_.find(dmat)->second.get(); - - // tree_begin is not used, right now we just enforce it to be 0. - CHECK_EQ(tree_begin, 0); - auto* out_preds = &predts->predictions; - CHECK_GE(predts->version, tree_begin); - if (out_preds->Size() == 0 && dmat->Info().num_row_ != 0) { - CHECK_EQ(predts->version, 0); - } - if (predts->version == 0) { - // out_preds->Size() can be non-zero as it's initialized here before any tree is - // built at the 0^th iterator. 
- this->InitOutPredictions(dmat->Info(), out_preds, model); - } - - uint32_t const output_groups = model.learner_model_param->num_output_group; - CHECK_NE(output_groups, 0); - // Right now we just assume ntree_limit provided by users means number of tree layers - // in the context of multi-output model - uint32_t real_ntree_limit = ntree_limit * output_groups; - if (real_ntree_limit == 0 || real_ntree_limit > model.trees.size()) { - real_ntree_limit = static_cast(model.trees.size()); - } - - uint32_t const end_version = (tree_begin + real_ntree_limit) / output_groups; - // When users have provided ntree_limit, end_version can be lesser, cache is violated - if (predts->version > end_version) { - CHECK_NE(ntree_limit, 0); - this->InitOutPredictions(dmat->Info(), out_preds, model); - predts->version = 0; - } - uint32_t const beg_version = predts->version; - CHECK_LE(beg_version, end_version); - - if (beg_version < end_version) { - DevicePredictInternal(device_matrix, out_preds, model, - beg_version * output_groups, - end_version * output_groups); - } - - // delta means {size of forest} * {number of newly accumulated layers} - uint32_t delta = end_version - beg_version; - CHECK_LE(delta, model.trees.size()); - predts->Update(delta); - - CHECK(out_preds->Size() == output_groups * dmat->Info().num_row_ || - out_preds->Size() == dmat->Info().num_row_); - } - - void InplacePredict(std::any const& x, const gbm::GBTreeModel& model, float missing, - PredictionCacheEntry* out_preds, uint32_t tree_begin, - unsigned tree_end) const override { - cpu_predictor->InplacePredict(x, model, missing, out_preds, tree_begin, tree_end); - } - - void PredictInstance(const SparsePage::Inst& inst, - std::vector* out_preds, - const gbm::GBTreeModel& model, unsigned ntree_limit) override { - cpu_predictor->PredictInstance(inst, out_preds, model, ntree_limit); - } - - void PredictLeaf(DMatrix* p_fmat, std::vector* out_preds, - const gbm::GBTreeModel& model, unsigned ntree_limit) override { - cpu_predictor->PredictLeaf(p_fmat, out_preds, model, ntree_limit); - } - - void PredictContribution(DMatrix* p_fmat, std::vector* out_contribs, - const gbm::GBTreeModel& model, uint32_t ntree_limit, - std::vector* tree_weights, - bool approximate, int condition, - unsigned condition_feature) override { - cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit, tree_weights, approximate, condition, condition_feature); - } - - void PredictInteractionContributions(DMatrix* p_fmat, std::vector* out_contribs, - const gbm::GBTreeModel& model, unsigned ntree_limit, - std::vector* tree_weights, - bool approximate) override { - cpu_predictor->PredictInteractionContributions(p_fmat, out_contribs, model, ntree_limit, tree_weights, approximate); - } - - private: - cl::sycl::queue qu_; - DeviceModelOneAPI model_; - - std::mutex lock_; - std::unique_ptr cpu_predictor; - - std::unordered_map> - device_matrix_cache_; -}; - -XGBOOST_REGISTER_PREDICTOR(PredictorOneAPI, "oneapi_predictor") -.describe("Make predictions using DPC++.") -.set_body([](Context const* generic_param) { - return new PredictorOneAPI(generic_param); - }); -} // namespace predictor -} // namespace xgboost diff --git a/plugin/updater_oneapi/regression_loss_oneapi.h b/plugin/updater_oneapi/regression_loss_oneapi.h deleted file mode 100755 index b0299ff7f..000000000 --- a/plugin/updater_oneapi/regression_loss_oneapi.h +++ /dev/null @@ -1,145 +0,0 @@ -/*! 
- * Copyright 2017-2020 XGBoost contributors
- */
-#ifndef XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_
-#define XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_
-
-#include
-#include
-#include
-
-#include "CL/sycl.hpp"
-
-namespace xgboost {
-namespace obj {
-
-/*!
- * \brief calculate the sigmoid of the input.
- * \param x input parameter
- * \return the transformed value.
- */
-inline float SigmoidOneAPI(float x) {
-  return 1.0f / (1.0f + cl::sycl::exp(-x));
-}
-
-// common regressions
-// linear regression
-struct LinearSquareLossOneAPI {
-  static bst_float PredTransform(bst_float x) { return x; }
-  static bool CheckLabel(bst_float x) { return true; }
-  static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
-    return predt - label;
-  }
-  static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
-    return 1.0f;
-  }
-  static bst_float ProbToMargin(bst_float base_score) { return base_score; }
-  static const char* LabelErrorMsg() { return ""; }
-  static const char* DefaultEvalMetric() { return "rmse"; }
-
-  static const char* Name() { return "reg:squarederror_oneapi"; }
-};
-
-// TODO: DPC++ does not fully support std math inside offloaded kernels
-struct SquaredLogErrorOneAPI {
-  static bst_float PredTransform(bst_float x) { return x; }
-  static bool CheckLabel(bst_float label) {
-    return label > -1;
-  }
-  static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
-    predt = std::max(predt, (bst_float)(-1 + 1e-6));  // ensure correct value for log1p
-    return (cl::sycl::log1p(predt) - cl::sycl::log1p(label)) / (predt + 1);
-  }
-  static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
-    predt = std::max(predt, (bst_float)(-1 + 1e-6));
-    float res = (-cl::sycl::log1p(predt) + cl::sycl::log1p(label) + 1) /
-                cl::sycl::pow(predt + 1, (bst_float)2);
-    res = std::max(res, (bst_float)1e-6f);
-    return res;
-  }
-  static bst_float ProbToMargin(bst_float base_score) { return base_score; }
-  static const char* LabelErrorMsg() {
-    return "label must be greater than -1 for rmsle so that log(label + 1) can be valid.";
-  }
-  static const char* DefaultEvalMetric() { return "rmsle"; }
-
-  static const char* Name() { return "reg:squaredlogerror_oneapi"; }
-};
-
-// logistic loss for probability regression task
-struct LogisticRegressionOneAPI {
-  // duplication is necessary, as __device__ specifier
-  // cannot be made conditional on template parameter
-  static bst_float PredTransform(bst_float x) { return SigmoidOneAPI(x); }
-  static bool CheckLabel(bst_float x) { return x >= 0.0f && x <= 1.0f; }
-  static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
-    return predt - label;
-  }
-  static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
-    const bst_float eps = 1e-16f;
-    return std::max(predt * (1.0f - predt), eps);
-  }
-  template <typename T>
-  static T PredTransform(T x) { return SigmoidOneAPI(x); }
-  template <typename T>
-  static T FirstOrderGradient(T predt, T label) { return predt - label; }
-  template <typename T>
-  static T SecondOrderGradient(T predt, T label) {
-    const T eps = T(1e-16f);
-    return std::max(predt * (T(1.0f) - predt), eps);
-  }
-  static bst_float ProbToMargin(bst_float base_score) {
-    CHECK(base_score > 0.0f && base_score < 1.0f)
-        << "base_score must be in (0,1) for logistic loss, got: " << base_score;
-    return -logf(1.0f / base_score - 1.0f);
-  }
-  static const char* LabelErrorMsg() {
-    return "label must be in [0,1] for logistic regression";
-  }
-  static const char* DefaultEvalMetric() { return "rmse"; }
-
-  static const char* Name() { return "reg:logistic_oneapi"; }
-};
-
-// logistic loss for binary classification task
-struct LogisticClassificationOneAPI : public LogisticRegressionOneAPI {
-  static const char* DefaultEvalMetric() { return "logloss"; }
-  static const char* Name() { return "binary:logistic_oneapi"; }
-};
-
-// logistic loss, but predict un-transformed margin
-struct LogisticRawOneAPI : public LogisticRegressionOneAPI {
-  // duplication is necessary, as __device__ specifier
-  // cannot be made conditional on template parameter
-  static bst_float PredTransform(bst_float x) { return x; }
-  static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
-    predt = SigmoidOneAPI(predt);
-    return predt - label;
-  }
-  static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
-    const bst_float eps = 1e-16f;
-    predt = SigmoidOneAPI(predt);
-    return std::max(predt * (1.0f - predt), eps);
-  }
-  template <typename T>
-  static T PredTransform(T x) { return x; }
-  template <typename T>
-  static T FirstOrderGradient(T predt, T label) {
-    predt = SigmoidOneAPI(predt);
-    return predt - label;
-  }
-  template <typename T>
-  static T SecondOrderGradient(T predt, T label) {
-    const T eps = T(1e-16f);
-    predt = SigmoidOneAPI(predt);
-    return std::max(predt * (T(1.0f) - predt), eps);
-  }
-  static const char* DefaultEvalMetric() { return "logloss"; }
-
-  static const char* Name() { return "binary:logitraw_oneapi"; }
-};
-
-}  // namespace obj
-}  // namespace xgboost
-
-#endif  // XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_
diff --git a/plugin/updater_oneapi/regression_obj_oneapi.cc b/plugin/updater_oneapi/regression_obj_oneapi.cc
deleted file mode 100755
index 3ee5741e7..000000000
--- a/plugin/updater_oneapi/regression_obj_oneapi.cc
+++ /dev/null
@@ -1,182 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-
-#include "xgboost/host_device_vector.h"
-#include "xgboost/json.h"
-#include "xgboost/parameter.h"
-#include "xgboost/span.h"
-
-#include "../../src/common/transform.h"
-#include "../../src/common/common.h"
-#include "./regression_loss_oneapi.h"
-
-#include "CL/sycl.hpp"
-
-namespace xgboost {
-namespace obj {
-
-DMLC_REGISTRY_FILE_TAG(regression_obj_oneapi);
-
-struct RegLossParamOneAPI : public XGBoostParameter<RegLossParamOneAPI> {
-  float scale_pos_weight;
-  // declare parameters
-  DMLC_DECLARE_PARAMETER(RegLossParamOneAPI) {
-    DMLC_DECLARE_FIELD(scale_pos_weight).set_default(1.0f).set_lower_bound(0.0f)
-      .describe("Scale the weight of positive examples by this factor");
-  }
-};
-
-template <typename Loss>
-class RegLossObjOneAPI : public ObjFunction {
- protected:
-  HostDeviceVector<int> label_correct_;
-
- public:
-  RegLossObjOneAPI() = default;
-
-  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
-    param_.UpdateAllowUnknown(args);
-
-    cl::sycl::default_selector selector;
-    qu_ = cl::sycl::queue(selector);
-  }
-
-  void GetGradient(const HostDeviceVector<bst_float>& preds,
-                   const MetaInfo& info,
-                   int iter,
-                   HostDeviceVector<GradientPair>* out_gpair) override {
-    if (info.labels_.Size() == 0U) {
-      LOG(WARNING) << "Label set is empty.";
-    }
-    CHECK_EQ(preds.Size(), info.labels_.Size())
-        << " " << "labels are not correctly provided"
-        << "preds.size=" << preds.Size() << ", label.size=" << info.labels_.Size() << ", "
-        << "Loss: " << Loss::Name();
-
-    size_t const ndata = preds.Size();
-    out_gpair->Resize(ndata);
-
-    // TODO: add label_correct check
-    label_correct_.Resize(1);
-    label_correct_.Fill(1);
-
-    bool is_null_weight = info.weights_.Size() == 0;
-
-    cl::sycl::buffer<bst_float, 1> preds_buf(preds.HostPointer(), preds.Size());
-    cl::sycl::buffer<bst_float, 1> labels_buf(info.labels_.HostPointer(),
-                                              info.labels_.Size());
-    cl::sycl::buffer<GradientPair, 1> out_gpair_buf(out_gpair->HostPointer(), out_gpair->Size());
-    cl::sycl::buffer<bst_float, 1> weights_buf(is_null_weight ? NULL : info.weights_.HostPointer(),
-                                               is_null_weight ? 1 : info.weights_.Size());
-
-    cl::sycl::buffer<int, 1> additional_input_buf(1);
-    {
-      auto additional_input_acc = additional_input_buf.get_access();
-      additional_input_acc[0] = 1;  // Fill the label_correct flag
-    }
-
-    auto scale_pos_weight = param_.scale_pos_weight;
-    if (!is_null_weight) {
-      CHECK_EQ(info.weights_.Size(), ndata)
-          << "Number of weights should be equal to number of data points.";
-    }
-
-    qu_.submit([&](cl::sycl::handler& cgh) {
-      auto preds_acc = preds_buf.get_access(cgh);
-      auto labels_acc = labels_buf.get_access(cgh);
-      auto weights_acc = weights_buf.get_access(cgh);
-      auto out_gpair_acc = out_gpair_buf.get_access(cgh);
-      auto additional_input_acc = additional_input_buf.get_access(cgh);
-      cgh.parallel_for<>(cl::sycl::range<1>(ndata), [=](cl::sycl::id<1> pid) {
-        int idx = pid[0];
-        bst_float p = Loss::PredTransform(preds_acc[idx]);
-        bst_float w = is_null_weight ? 1.0f : weights_acc[idx];
-        bst_float label = labels_acc[idx];
-        if (label == 1.0f) {
-          w *= scale_pos_weight;
-        }
-        if (!Loss::CheckLabel(label)) {
-          // If there is an incorrect label, the host code will know.
-          additional_input_acc[0] = 0;
-        }
-        out_gpair_acc[idx] = GradientPair(Loss::FirstOrderGradient(p, label) * w,
-                                          Loss::SecondOrderGradient(p, label) * w);
-      });
-    }).wait();
-
-    int flag = 1;
-    {
-      auto additional_input_acc = additional_input_buf.get_access();
-      flag = additional_input_acc[0];
-    }
-
-    if (flag == 0) {
-      LOG(FATAL) << Loss::LabelErrorMsg();
-    }
-  }
-
- public:
-  const char* DefaultEvalMetric() const override {
-    return Loss::DefaultEvalMetric();
-  }
-
-  void PredTransform(HostDeviceVector<bst_float>* io_preds) override {
-    size_t const ndata = io_preds->Size();
-
-    cl::sycl::buffer<bst_float, 1> io_preds_buf(io_preds->HostPointer(), io_preds->Size());
-
-    qu_.submit([&](cl::sycl::handler& cgh) {
-      auto io_preds_acc = io_preds_buf.get_access(cgh);
-      cgh.parallel_for<>(cl::sycl::range<1>(ndata), [=](cl::sycl::id<1> pid) {
-        int idx = pid[0];
-        io_preds_acc[idx] = Loss::PredTransform(io_preds_acc[idx]);
-      });
-    }).wait();
-  }
-
-  float ProbToMargin(float base_score) const override {
-    return Loss::ProbToMargin(base_score);
-  }
-
-  void SaveConfig(Json* p_out) const override {
-    auto& out = *p_out;
-    out["name"] = String(Loss::Name());
-    out["reg_loss_param"] = ToJson(param_);
-  }
-
-  void LoadConfig(Json const& in) override {
-    FromJson(in["reg_loss_param"], &param_);
-  }
-
- protected:
-  RegLossParamOneAPI param_;
-
-  cl::sycl::queue qu_;
-};
-
-// register the objective functions
-DMLC_REGISTER_PARAMETER(RegLossParamOneAPI);
-
-// TODO: Find a better way to dispatch names of DPC++ kernels with various template parameters of loss function
-XGBOOST_REGISTER_OBJECTIVE(SquaredLossRegressionOneAPI, LinearSquareLossOneAPI::Name())
-.describe("Regression with squared error with DPC++ backend.")
-.set_body([]() { return new RegLossObjOneAPI<LinearSquareLossOneAPI>(); });
-XGBOOST_REGISTER_OBJECTIVE(SquareLogErrorOneAPI, SquaredLogErrorOneAPI::Name())
-.describe("Regression with root mean squared logarithmic error with DPC++ backend.")
-.set_body([]() { return new RegLossObjOneAPI<SquaredLogErrorOneAPI>(); });
-XGBOOST_REGISTER_OBJECTIVE(LogisticRegressionOneAPI, LogisticRegressionOneAPI::Name())
-.describe("Logistic regression for probability regression task with DPC++ backend.")
-.set_body([]() { return new RegLossObjOneAPI<LogisticRegressionOneAPI>(); });
-XGBOOST_REGISTER_OBJECTIVE(LogisticClassificationOneAPI, LogisticClassificationOneAPI::Name())
-.describe("Logistic regression for binary classification task with DPC++ backend.")
-.set_body([]() { return new RegLossObjOneAPI<LogisticClassificationOneAPI>(); });
-XGBOOST_REGISTER_OBJECTIVE(LogisticRawOneAPI, LogisticRawOneAPI::Name())
-.describe("Logistic regression for classification, output score "
-          "before logistic transformation with DPC++ backend.")
-.set_body([]() { return new RegLossObjOneAPI<LogisticRawOneAPI>(); });
-
-}  // namespace obj
-}  // namespace xgboost
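The registration block at the end of the deleted file is the plugin's single extension point:
DMLC's registry maps an objective name to a factory, and every loss type needs its own macro
invocation because each template instantiation produces a distinct DPC++ kernel (the TODO in
the deleted code notes this). A minimal sketch of how one more loss would have been wired in;
MyLoss is hypothetical, shown only to illustrate the pattern, and relies on the
RegLossObjOneAPI template defined above:

    // Hypothetical loss; not part of this patch. Each static method supplies
    // one piece of the objective: transform, label check, gradients, metric.
    struct MyLoss {
      static bst_float PredTransform(bst_float x) { return x; }
      static bool CheckLabel(bst_float) { return true; }
      static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
        return predt - label;
      }
      static bst_float SecondOrderGradient(bst_float, bst_float) { return 1.0f; }
      static bst_float ProbToMargin(bst_float base_score) { return base_score; }
      static const char* LabelErrorMsg() { return ""; }
      static const char* DefaultEvalMetric() { return "rmse"; }
      static const char* Name() { return "reg:myloss_oneapi"; }
    };

    // One registration line per instantiation, since each kernel name must be
    // unique in the global registry.
    XGBOOST_REGISTER_OBJECTIVE(MyLossOneAPI, MyLoss::Name())
    .describe("Hypothetical loss illustrating the registration pattern.")
    .set_body([]() { return new RegLossObjOneAPI<MyLoss>(); });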
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f0dfe061f..161889f9e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,6 +16,10 @@ if(USE_CUDA)
   target_sources(objxgboost PRIVATE ${CUDA_SOURCES})
 endif()
 
+if(PLUGIN_SYCL)
+  target_compile_definitions(objxgboost PRIVATE -DXGBOOST_USE_SYCL=1)
+endif()
+
 target_include_directories(objxgboost
   PRIVATE
   ${xgboost_SOURCE_DIR}/include
diff --git a/src/common/common.h b/src/common/common.h
index ed6ceceb8..4b20ce7c2 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -169,10 +169,10 @@ inline void AssertNCCLSupport() {
 #endif  // !defined(XGBOOST_USE_NCCL)
 }
 
-inline void AssertOneAPISupport() {
-#ifndef XGBOOST_USE_ONEAPI
-  LOG(FATAL) << "XGBoost version not compiled with OneAPI support.";
-#endif  // XGBOOST_USE_ONEAPI
+inline void AssertSYCLSupport() {
+#ifndef XGBOOST_USE_SYCL
+  LOG(FATAL) << "XGBoost version not compiled with SYCL support.";
+#endif  // XGBOOST_USE_SYCL
 }
 
 void SetDevice(std::int32_t device);
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index b0327da15..9ff4abb4d 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -113,13 +113,13 @@ void GBTree::Configure(Args const& cfg) {
   }
 #endif  // defined(XGBOOST_USE_CUDA)
 
-#if defined(XGBOOST_USE_ONEAPI)
-  if (!oneapi_predictor_) {
-    oneapi_predictor_ =
-        std::unique_ptr<Predictor>(Predictor::Create("oneapi_predictor", this->ctx_));
+#if defined(XGBOOST_USE_SYCL)
+  if (!sycl_predictor_) {
+    sycl_predictor_ =
+        std::unique_ptr<Predictor>(Predictor::Create("sycl_predictor", this->ctx_));
   }
-  oneapi_predictor_->Configure(cfg);
-#endif  // defined(XGBOOST_USE_ONEAPI)
+  sycl_predictor_->Configure(cfg);
+#endif  // defined(XGBOOST_USE_SYCL)
 
   // `updater` parameter was manually specified
   specified_updater_ =
@@ -553,6 +553,11 @@ void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
       },
       [&, begin = tree_begin, end = tree_end] {
         return this->gpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
+#if defined(XGBOOST_USE_SYCL)
+      },
+      [&, begin = tree_begin, end = tree_end] {
+        return this->sycl_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
+#endif  // defined(XGBOOST_USE_SYCL)
       });
   if (!known_type) {
     auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
@@ -568,10 +573,16 @@ void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
   if (f_dmat && !f_dmat->SingleColBlock()) {
     if (ctx_->IsCPU()) {
       return cpu_predictor_;
-    } else {
+    } else if (ctx_->IsCUDA()) {
       common::AssertGPUSupport();
       CHECK(gpu_predictor_);
       return gpu_predictor_;
+    } else {
+#if defined(XGBOOST_USE_SYCL)
+      common::AssertSYCLSupport();
+      CHECK(sycl_predictor_);
+      return sycl_predictor_;
+#endif  // defined(XGBOOST_USE_SYCL)
     }
   }
 
@@ -606,10 +617,16 @@ void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
 
   if (ctx_->IsCPU()) {
     return cpu_predictor_;
-  } else {
+  } else if (ctx_->IsCUDA()) {
     common::AssertGPUSupport();
     CHECK(gpu_predictor_);
     return gpu_predictor_;
+  } else {
+#if defined(XGBOOST_USE_SYCL)
+    common::AssertSYCLSupport();
+    CHECK(sycl_predictor_);
+    return sycl_predictor_;
+#endif  // defined(XGBOOST_USE_SYCL)
   }
 
   return cpu_predictor_;
@@ -814,6 +831,11 @@ class Dart : public GBTree {
         },
         [&] {
           return gpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
+#if defined(XGBOOST_USE_SYCL)
+        },
+        [&] {
+          return sycl_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
+#endif  // defined(XGBOOST_USE_SYCL)
         });
     CHECK(success) << msg;
   };
@@ -830,6 +852,12 @@ class Dart : public GBTree {
         [&] {
           this->gpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
                                                    model_);
+#if defined(XGBOOST_USE_SYCL)
+        },
+        [&] {
+          this->sycl_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
+                                                    model_);
+#endif  // defined(XGBOOST_USE_SYCL)
         });
   }
   // Multiple the tree weight
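The gbtree.cc hunks above turn the former CPU/CUDA pair into a three-way dispatch, with SYCL as
the compile-time-guarded fallback branch. A standalone sketch of that control flow, where
DeviceKind and ChoosePredictor are illustrative names rather than symbols from this patch:

    #include <stdexcept>
    #include <string>

    enum class DeviceKind { kCPU, kCUDA, kSYCL };

    // Mirrors the order used in the hunks above: CPU first, CUDA by an
    // explicit check, and everything else falls through to the SYCL branch,
    // which exists only when XGBOOST_USE_SYCL is defined at build time.
    std::string ChoosePredictor(DeviceKind device) {
      if (device == DeviceKind::kCPU) {
        return "cpu_predictor";
      } else if (device == DeviceKind::kCUDA) {
        return "gpu_predictor";
      } else {
    #if defined(XGBOOST_USE_SYCL)
        return "sycl_predictor";
    #else
        throw std::runtime_error("XGBoost version not compiled with SYCL support.");
    #endif
      }
    }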
diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h
index 827d85217..a2d84d848 100644
--- a/src/gbm/gbtree.h
+++ b/src/gbm/gbtree.h
@@ -349,9 +349,9 @@ class GBTree : public GradientBooster {
   // Predictors
   std::unique_ptr<Predictor> cpu_predictor_;
   std::unique_ptr<Predictor> gpu_predictor_{nullptr};
-#if defined(XGBOOST_USE_ONEAPI)
-  std::unique_ptr<Predictor> oneapi_predictor_;
-#endif  // defined(XGBOOST_USE_ONEAPI)
+#if defined(XGBOOST_USE_SYCL)
+  std::unique_ptr<Predictor> sycl_predictor_;
+#endif  // defined(XGBOOST_USE_SYCL)
   common::Monitor monitor_;
 };
 
diff --git a/tests/ci_build/conda_env/linux_sycl_test.yml b/tests/ci_build/conda_env/linux_sycl_test.yml
new file mode 100644
index 000000000..bb14c1e77
--- /dev/null
+++ b/tests/ci_build/conda_env/linux_sycl_test.yml
@@ -0,0 +1,20 @@
+name: linux_sycl_test
+channels:
+- conda-forge
+- intel
+dependencies:
+- python=3.8
+- cmake
+- c-compiler
+- cxx-compiler
+- pip
+- wheel
+- numpy
+- scipy
+- scikit-learn
+- pandas
+- hypothesis>=6.46
+- pytest
+- pytest-timeout
+- pytest-cov
+- dpcpp_linux-64
diff --git a/tests/ci_build/lint_cpp.py b/tests/ci_build/lint_cpp.py
index 6ec2b4e7f..d4775d6b6 100644
--- a/tests/ci_build/lint_cpp.py
+++ b/tests/ci_build/lint_cpp.py
@@ -138,7 +138,7 @@ def main():
         "path",
         nargs="*",
         help="Path to traverse",
-        default=["src", "include", os.path.join("R-package", "src"), "python-package"],
+        default=["src", "include", os.path.join("R-package", "src"), "python-package", "plugin/sycl"],
     )
     parser.add_argument(
         "--exclude_path",
diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py
index e0d16efd4..fdd643da0 100644
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -33,6 +33,7 @@ class LintersPaths:
         "tests/python-gpu/test_gpu_pickling.py",
         "tests/python-gpu/test_gpu_eval_metrics.py",
         "tests/python-gpu/test_gpu_with_sklearn.py",
+        "tests/python-sycl/test_sycl_prediction.py",
         "tests/test_distributed/test_with_spark/",
         "tests/test_distributed/test_gpu_with_spark/",
         # demo
diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt
index ab82b6494..08862feee 100644
--- a/tests/cpp/CMakeLists.txt
+++ b/tests/cpp/CMakeLists.txt
@@ -13,9 +13,9 @@ if(USE_CUDA)
   list(APPEND TEST_SOURCES ${CUDA_TEST_SOURCES})
 endif()
 
-file(GLOB_RECURSE ONEAPI_TEST_SOURCES "plugin/*_oneapi.cc")
-if(NOT PLUGIN_UPDATER_ONEAPI)
-  list(REMOVE_ITEM TEST_SOURCES ${ONEAPI_TEST_SOURCES})
+file(GLOB_RECURSE SYCL_TEST_SOURCES "plugin/test_sycl_*.cc")
+if(NOT PLUGIN_SYCL)
+  list(REMOVE_ITEM TEST_SOURCES ${SYCL_TEST_SOURCES})
 endif()
 
 if(PLUGIN_FEDERATED)
diff --git a/tests/cpp/plugin/test_predictor_oneapi.cc b/tests/cpp/plugin/test_predictor_oneapi.cc
deleted file mode 100755
index 52edd4a12..000000000
--- a/tests/cpp/plugin/test_predictor_oneapi.cc
+++ /dev/null
@@ -1,168 +0,0 @@
-/*!
- * Copyright 2017-2020 XGBoost contributors
- */
-#include
-#include
-
-#include "../../../src/data/adapter.h"
-#include "../../../src/gbm/gbtree_model.h"
-#include "../filesystem.h"  // dmlc::TemporaryDirectory
-#include "../helpers.h"
-#include "../predictor/test_predictor.h"
-
-namespace xgboost {
-TEST(Plugin, OneAPIPredictorBasic) {
-  auto lparam = MakeCUDACtx(0);
-  std::unique_ptr<Predictor> oneapi_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("oneapi_predictor", &lparam));
-
-  int kRows = 5;
-  int kCols = 5;
-
-  LearnerModelParam param;
-  param.num_feature = kCols;
-  param.base_score = 0.0;
-  param.num_output_group = 1;
-
-  gbm::GBTreeModel model = CreateTestModel(&param);
-
-  auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
-
-  // Test predict batch
-  PredictionCacheEntry out_predictions;
-  oneapi_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
-  ASSERT_EQ(model.trees.size(), out_predictions.version);
-  std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
-  for (size_t i = 0; i < out_predictions.predictions.Size(); i++) {
-    ASSERT_EQ(out_predictions_h[i], 1.5);
-  }
-
-  // Test predict instance
-  auto const& batch = *dmat->GetBatches<SparsePage>().begin();
-  for (size_t i = 0; i < batch.Size(); i++) {
-    std::vector<float> instance_out_predictions;
-    oneapi_predictor->PredictInstance(batch[i], &instance_out_predictions, model);
-    ASSERT_EQ(instance_out_predictions[0], 1.5);
-  }
-
-  // Test predict leaf
-  std::vector<float> leaf_out_predictions;
-  oneapi_predictor->PredictLeaf(dmat.get(), &leaf_out_predictions, model);
-  for (auto v : leaf_out_predictions) {
-    ASSERT_EQ(v, 0);
-  }
-
-  // Test predict contribution
-  std::vector<float> out_contribution;
-  oneapi_predictor->PredictContribution(dmat.get(), &out_contribution, model);
-  ASSERT_EQ(out_contribution.size(), kRows * (kCols + 1));
-  for (size_t i = 0; i < out_contribution.size(); ++i) {
-    auto const& contri = out_contribution[i];
-    // shift 1 for bias, as test tree is a decision dump, only global bias is filled with LeafValue().
-    if ((i+1) % (kCols+1) == 0) {
-      ASSERT_EQ(out_contribution.back(), 1.5f);
-    } else {
-      ASSERT_EQ(contri, 0);
-    }
-  }
-  // Test predict contribution (approximate method)
-  oneapi_predictor->PredictContribution(dmat.get(), &out_contribution, model, 0, nullptr, true);
-  for (size_t i = 0; i < out_contribution.size(); ++i) {
-    auto const& contri = out_contribution[i];
-    // shift 1 for bias, as test tree is a decision dump, only global bias is filled with LeafValue().
-    if ((i+1) % (kCols+1) == 0) {
-      ASSERT_EQ(out_contribution.back(), 1.5f);
-    } else {
-      ASSERT_EQ(contri, 0);
-    }
-  }
-}
-
-TEST(Plugin, OneAPIPredictorExternalMemory) {
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/big.libsvm";
-  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(12, 64, filename);
-  auto lparam = MakeCUDACtx(0);
-
-  std::unique_ptr<Predictor> oneapi_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("oneapi_predictor", &lparam));
-
-  LearnerModelParam param;
-  param.base_score = 0;
-  param.num_feature = dmat->Info().num_col_;
-  param.num_output_group = 1;
-
-  gbm::GBTreeModel model = CreateTestModel(&param);
-
-  // Test predict batch
-  PredictionCacheEntry out_predictions;
-  oneapi_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
-  std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
-  ASSERT_EQ(out_predictions.predictions.Size(), dmat->Info().num_row_);
-  for (const auto& v : out_predictions_h) {
-    ASSERT_EQ(v, 1.5);
-  }
-
-  // Test predict leaf
-  std::vector<float> leaf_out_predictions;
-  oneapi_predictor->PredictLeaf(dmat.get(), &leaf_out_predictions, model);
-  ASSERT_EQ(leaf_out_predictions.size(), dmat->Info().num_row_);
-  for (const auto& v : leaf_out_predictions) {
-    ASSERT_EQ(v, 0);
-  }
-
-  // Test predict contribution
-  std::vector<float> out_contribution;
-  oneapi_predictor->PredictContribution(dmat.get(), &out_contribution, model);
-  ASSERT_EQ(out_contribution.size(), dmat->Info().num_row_ * (dmat->Info().num_col_ + 1));
-  for (size_t i = 0; i < out_contribution.size(); ++i) {
-    auto const& contri = out_contribution[i];
-    // shift 1 for bias, as test tree is a decision dump, only global bias is filled with LeafValue().
-    if ((i + 1) % (dmat->Info().num_col_ + 1) == 0) {
-      ASSERT_EQ(out_contribution.back(), 1.5f);
-    } else {
-      ASSERT_EQ(contri, 0);
-    }
-  }
-
-  // Test predict contribution (approximate method)
-  std::vector<float> out_contribution_approximate;
-  oneapi_predictor->PredictContribution(dmat.get(), &out_contribution_approximate, model, 0, nullptr, true);
-  ASSERT_EQ(out_contribution_approximate.size(),
-            dmat->Info().num_row_ * (dmat->Info().num_col_ + 1));
-  for (size_t i = 0; i < out_contribution.size(); ++i) {
-    auto const& contri = out_contribution[i];
-    // shift 1 for bias, as test tree is a decision dump, only global bias is filled with LeafValue().
-    if ((i + 1) % (dmat->Info().num_col_ + 1) == 0) {
-      ASSERT_EQ(out_contribution.back(), 1.5f);
-    } else {
-      ASSERT_EQ(contri, 0);
-    }
-  }
-}
-
-TEST(Plugin, OneAPIPredictorInplacePredict) {
-  bst_row_t constexpr kRows{128};
-  bst_feature_t constexpr kCols{64};
-  auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(-1);
-  {
-    HostDeviceVector<float> data;
-    gen.GenerateDense(&data);
-    ASSERT_EQ(data.Size(), kRows * kCols);
-    std::shared_ptr<data::DenseAdapter> x{
-        new data::DenseAdapter(data.HostPointer(), kRows, kCols)};
-    TestInplacePrediction(x, "oneapi_predictor", kRows, kCols, -1);
-  }
-
-  {
-    HostDeviceVector<float> data;
-    HostDeviceVector<bst_row_t> rptrs;
-    HostDeviceVector<bst_feature_t> columns;
-    gen.GenerateCSR(&data, &rptrs, &columns);
-    std::shared_ptr<data::CSRAdapter> x{new data::CSRAdapter(
-        rptrs.HostPointer(), columns.HostPointer(), data.HostPointer(), kRows,
-        data.Size(), kCols)};
-    TestInplacePrediction(x, "oneapi_predictor", kRows, kCols, -1);
-  }
-}
-}  // namespace xgboost
diff --git a/tests/cpp/plugin/test_regression_obj_oneapi.cc b/tests/cpp/plugin/test_regression_obj_oneapi.cc
deleted file mode 100755
index c01d9d951..000000000
--- a/tests/cpp/plugin/test_regression_obj_oneapi.cc
+++ /dev/null
@@ -1,176 +0,0 @@
-/*!
- * Copyright 2017-2019 XGBoost contributors
- */
-#include
-#include
-#include
-#include
-#include "../helpers.h"
-namespace xgboost {
-
-TEST(Plugin, LinearRegressionGPairOneAPI) {
-  Context tparam = MakeCUDACtx(0);
-  std::vector<std::pair<std::string, std::string>> args;
-
-  std::unique_ptr<ObjFunction> obj {
-    ObjFunction::Create("reg:squarederror_oneapi", &tparam)
-  };
-
-  obj->Configure(args);
-  CheckObjFunction(obj,
-                   {0, 0.1f, 0.9f, 1, 0, 0.1f, 0.9f, 1},
-                   {0, 0, 0, 0, 1, 1, 1, 1},
-                   {1, 1, 1, 1, 1, 1, 1, 1},
-                   {0, 0.1f, 0.9f, 1.0f, -1.0f, -0.9f, -0.1f, 0},
-                   {1, 1, 1, 1, 1, 1, 1, 1});
-  CheckObjFunction(obj,
-                   {0, 0.1f, 0.9f, 1, 0, 0.1f, 0.9f, 1},
-                   {0, 0, 0, 0, 1, 1, 1, 1},
-                   {},  // empty weight
-                   {0, 0.1f, 0.9f, 1.0f, -1.0f, -0.9f, -0.1f, 0},
-                   {1, 1, 1, 1, 1, 1, 1, 1});
-  ASSERT_NO_THROW(obj->DefaultEvalMetric());
-}
-
-TEST(Plugin, SquaredLogOneAPI) {
-  Context tparam = MakeCUDACtx(0);
-  std::vector<std::pair<std::string, std::string>> args;
-
-  std::unique_ptr<ObjFunction> obj { ObjFunction::Create("reg:squaredlogerror_oneapi", &tparam) };
-  obj->Configure(args);
-  CheckConfigReload(obj, "reg:squaredlogerror_oneapi");
-
-  CheckObjFunction(obj,
-                   {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},  // pred
-                   {1.0f, 1.0f, 1.0f, 1.0f, 1.0f},  // labels
-                   {1.0f, 1.0f, 1.0f, 1.0f, 1.0f},  // weights
-                   {-0.5435f, -0.4257f, -0.25475f, -0.05855f, 0.1009f},
-                   { 1.3205f,  1.0492f,  0.69215f,  0.34115f, 0.1091f});
-  CheckObjFunction(obj,
-                   {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},  // pred
-                   {1.0f, 1.0f, 1.0f, 1.0f, 1.0f},  // labels
-                   {},                              // empty weights
-                   {-0.5435f, -0.4257f, -0.25475f, -0.05855f, 0.1009f},
-                   { 1.3205f,  1.0492f,  0.69215f,  0.34115f, 0.1091f});
-  ASSERT_EQ(obj->DefaultEvalMetric(), std::string{"rmsle"});
-}
-
-TEST(Plugin, LogisticRegressionGPairOneAPI) {
-  Context tparam = MakeCUDACtx(0);
-  std::vector<std::pair<std::string, std::string>> args;
-  std::unique_ptr<ObjFunction> obj { ObjFunction::Create("reg:logistic_oneapi", &tparam) };
-
-  obj->Configure(args);
-  CheckConfigReload(obj, "reg:logistic_oneapi");
-
-  CheckObjFunction(obj,
-                   {   0,  0.1f,  0.9f,     1,     0,   0.1f,   0.9f,      1},  // preds
-                   {   0,     0,     0,     0,     1,      1,      1,      1},  // labels
-                   {   1,     1,     1,     1,     1,      1,      1,      1},  // weights
-                   { 0.5f, 0.52f, 0.71f, 0.73f, -0.5f, -0.47f, -0.28f, -0.26f},  // out_grad
-                   {0.25f, 0.24f, 0.20f, 0.19f, 0.25f,  0.24f,  0.20f,  0.19f});  // out_hess
-}
-
-TEST(Plugin, LogisticRegressionBasicOneAPI) {
-  Context lparam = MakeCUDACtx(0);
-  std::vector<std::pair<std::string, std::string>> args;
-  std::unique_ptr<ObjFunction> obj {
-    ObjFunction::Create("reg:logistic_oneapi", &lparam)
-  };
-
-  obj->Configure(args);
-  CheckConfigReload(obj, "reg:logistic_oneapi");
-
-  // test label validation
-  EXPECT_ANY_THROW(CheckObjFunction(obj, {0}, {10}, {1}, {0}, {0}))
-      << "Expected error when label not in range [0,1f] for LogisticRegression";
-
-  // test ProbToMargin
-  EXPECT_NEAR(obj->ProbToMargin(0.1f), -2.197f, 0.01f);
-  EXPECT_NEAR(obj->ProbToMargin(0.5f), 0, 0.01f);
-  EXPECT_NEAR(obj->ProbToMargin(0.9f), 2.197f, 0.01f);
-  EXPECT_ANY_THROW(obj->ProbToMargin(10))
-      << "Expected error when base_score not in range [0,1f] for LogisticRegression";
-
-  // test PredTransform
-  HostDeviceVector<bst_float> io_preds = {0, 0.1f, 0.5f, 0.9f, 1};
-  std::vector<bst_float> out_preds = {0.5f, 0.524f, 0.622f, 0.710f, 0.731f};
-  obj->PredTransform(&io_preds);
-  auto& preds = io_preds.HostVector();
-  for (int i = 0; i < static_cast<int>(io_preds.Size()); ++i) {
-    EXPECT_NEAR(preds[i], out_preds[i], 0.01f);
-  }
-}
-
-TEST(Plugin, LogisticRawGPairOneAPI) {
-  Context lparam = MakeCUDACtx(0);
-  std::vector<std::pair<std::string, std::string>> args;
-  std::unique_ptr<ObjFunction> obj {
-    ObjFunction::Create("binary:logitraw_oneapi", &lparam)
-  };
-
-  obj->Configure(args);
-
-  CheckObjFunction(obj,
-                   {   0,  0.1f,  0.9f,     1,     0,   0.1f,   0.9f,      1},
-                   {   0,     0,     0,     0,     1,      1,      1,      1},
-                   {   1,     1,     1,     1,     1,      1,      1,      1},
-                   { 0.5f, 0.52f, 0.71f, 0.73f, -0.5f, -0.47f, -0.28f, -0.26f},
-                   {0.25f, 0.24f, 0.20f, 0.19f, 0.25f,  0.24f,  0.20f,  0.19f});
-}
-
-TEST(Plugin, CPUvsOneAPI) {
-  Context ctx = MakeCUDACtx(0);
-
-  ObjFunction * obj_cpu =
-      ObjFunction::Create("reg:squarederror", &ctx);
-  ObjFunction * obj_oneapi =
-      ObjFunction::Create("reg:squarederror_oneapi", &ctx);
-  HostDeviceVector<GradientPair> cpu_out_preds;
-  HostDeviceVector<GradientPair> oneapi_out_preds;
-
-  constexpr size_t kRows = 400;
-  constexpr size_t kCols = 100;
-  auto pdmat = RandomDataGenerator(kRows, kCols, 0).Seed(0).GenerateDMatrix();
-  HostDeviceVector<bst_float> preds;
-  preds.Resize(kRows);
-  auto& h_preds = preds.HostVector();
-  for (size_t i = 0; i < h_preds.size(); ++i) {
-    h_preds[i] = static_cast<bst_float>(i);
-  }
-  auto& info = pdmat->Info();
-
-  info.labels.Reshape(kRows, 1);
-  auto& h_labels = info.labels.Data()->HostVector();
-  for (size_t i = 0; i < h_labels.size(); ++i) {
-    h_labels[i] = 1 / static_cast<bst_float>(i+1);
-  }
-
-  {
-    // CPU
-    ctx = ctx.MakeCPU();
-    obj_cpu->GetGradient(preds, info, 0, &cpu_out_preds);
-  }
-  {
-    // oneapi
-    ctx.gpu_id = 0;
-    obj_oneapi->GetGradient(preds, info, 0, &oneapi_out_preds);
-  }
-
-  auto& h_cpu_out = cpu_out_preds.HostVector();
-  auto& h_oneapi_out = oneapi_out_preds.HostVector();
-
-  float sgrad = 0;
-  float shess = 0;
-  for (size_t i = 0; i < kRows; ++i) {
-    sgrad += std::pow(h_cpu_out[i].GetGrad() - h_oneapi_out[i].GetGrad(), 2);
-    shess += std::pow(h_cpu_out[i].GetHess() - h_oneapi_out[i].GetHess(), 2);
-  }
-  ASSERT_NEAR(sgrad, 0.0f, kRtEps);
-  ASSERT_NEAR(shess, 0.0f, kRtEps);
-
-  delete obj_cpu;
-  delete obj_oneapi;
-}
-
-}  // namespace xgboost
diff --git a/tests/cpp/plugin/test_sycl_predictor.cc b/tests/cpp/plugin/test_sycl_predictor.cc
new file mode 100755
index 000000000..f82a9f33d
--- /dev/null
+++ b/tests/cpp/plugin/test_sycl_predictor.cc
@@ -0,0 +1,101 @@
+/*!
+ * Copyright 2017-2023 XGBoost contributors
+ */
+#include
+#include
+
+#include "../../../src/data/adapter.h"
+#include "../../../src/data/proxy_dmatrix.h"
+#include "../../../src/gbm/gbtree.h"
+#include "../../../src/gbm/gbtree_model.h"
+#include "../filesystem.h"  // dmlc::TemporaryDirectory
+#include "../helpers.h"
+#include "../predictor/test_predictor.h"
+
+namespace xgboost {
+
+TEST(SyclPredictor, Basic) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+
+  size_t constexpr kRows = 5;
+  size_t constexpr kCols = 5;
+  auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
+  TestBasic(dmat.get(), &ctx);
+}
+
+TEST(SyclPredictor, ExternalMemory) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+
+  size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
+  size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
+  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);
+  TestBasic(dmat.get(), &ctx);
+}
+
+TEST(SyclPredictor, InplacePredict) {
+  bst_row_t constexpr kRows{128};
+  bst_feature_t constexpr kCols{64};
+  Context ctx;
+  auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(ctx.Device());
+  {
+    HostDeviceVector<float> data;
+    gen.GenerateDense(&data);
+    ASSERT_EQ(data.Size(), kRows * kCols);
+    Context ctx;
+    ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+    std::shared_ptr<data::DMatrixProxy> x{new data::DMatrixProxy{}};
+    auto array_interface = GetArrayInterface(&data, kRows, kCols);
+    std::string arr_str;
+    Json::Dump(array_interface, &arr_str);
+    x->SetArrayData(arr_str.data());
+    TestInplacePrediction(&ctx, x, kRows, kCols);
+  }
+}
+
+TEST(SyclPredictor, IterationRange) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+  TestIterationRange(&ctx);
+}
+
+TEST(SyclPredictor, GHistIndexTraining) {
+  size_t constexpr kRows{128}, kCols{16}, kBins{64};
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+  auto p_hist = RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).GenerateDMatrix(false);
+  HostDeviceVector<float> storage(kRows * kCols);
+  auto columnar = RandomDataGenerator{kRows, kCols, 0.0}.GenerateArrayInterface(&storage);
+  auto adapter = data::ArrayAdapter(columnar.c_str());
+  std::shared_ptr<DMatrix> p_full{
+      DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)};
+  TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_hist);
+}
+
+TEST(SyclPredictor, CategoricalPredictLeaf) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+  TestCategoricalPredictLeaf(&ctx, false);
+}
+
+TEST(SyclPredictor, LesserFeatures) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+  TestPredictionWithLesserFeatures(&ctx);
+}
+
+TEST(SyclPredictor, Sparse) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+  TestSparsePrediction(&ctx, 0.2);
+  TestSparsePrediction(&ctx, 0.8);
+}
+
+TEST(SyclPredictor, Multi) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+  TestVectorLeafPrediction(&ctx);
+}
+
+}  // namespace xgboost
\ No newline at end of file
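Every test in the new file selects the backend the same way: construct a default Context, then
set the device string. A minimal sketch of that idiom, assuming only the Context and Args types
already used in the file above (the helper name MakeSyclCtx is illustrative):

    #include "xgboost/context.h"

    // "sycl" is parsed by the same "device" parameter that accepts "cpu" and
    // "cuda[:ordinal]"; UpdateAllowUnknown does not error on unrecognized keys.
    xgboost::Context MakeSyclCtx() {
      xgboost::Context ctx;
      ctx.UpdateAllowUnknown(xgboost::Args{{"device", "sycl"}});
      return ctx;
    }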
diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc
index 07f33d72e..8f3955c05 100644
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -18,92 +18,17 @@
 
 namespace xgboost {
 
-namespace {
-void TestBasic(DMatrix* dmat) {
-  Context ctx;
-  std::unique_ptr<Predictor> cpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &ctx));
-
-  size_t const kRows = dmat->Info().num_row_;
-  size_t const kCols = dmat->Info().num_col_;
-
-  LearnerModelParam mparam{MakeMP(kCols, .0, 1)};
-
-  ctx.UpdateAllowUnknown(Args{});
-  gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
-
-  // Test predict batch
-  PredictionCacheEntry out_predictions;
-  cpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
-  cpu_predictor->PredictBatch(dmat, &out_predictions, model, 0);
-
-  std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
-  for (size_t i = 0; i < out_predictions.predictions.Size(); i++) {
-    ASSERT_EQ(out_predictions_h[i], 1.5);
-  }
-
-  // Test predict instance
-  auto const& batch = *dmat->GetBatches<SparsePage>().begin();
-  auto page = batch.GetView();
-  for (size_t i = 0; i < batch.Size(); i++) {
-    std::vector<float> instance_out_predictions;
-    cpu_predictor->PredictInstance(page[i], &instance_out_predictions, model, 0,
-                                   dmat->Info().IsColumnSplit());
-    ASSERT_EQ(instance_out_predictions[0], 1.5);
-  }
-
-  // Test predict leaf
-  HostDeviceVector<float> leaf_out_predictions;
-  cpu_predictor->PredictLeaf(dmat, &leaf_out_predictions, model);
-  auto const& h_leaf_out_predictions = leaf_out_predictions.ConstHostVector();
-  for (auto v : h_leaf_out_predictions) {
-    ASSERT_EQ(v, 0);
-  }
-
-  if (dmat->Info().IsColumnSplit()) {
-    // Predict contribution is not supported for column split.
-    return;
-  }
-
-  // Test predict contribution
-  HostDeviceVector<float> out_contribution_hdv;
-  auto& out_contribution = out_contribution_hdv.HostVector();
-  cpu_predictor->PredictContribution(dmat, &out_contribution_hdv, model);
-  ASSERT_EQ(out_contribution.size(), kRows * (kCols + 1));
-  for (size_t i = 0; i < out_contribution.size(); ++i) {
-    auto const& contri = out_contribution[i];
-    // shift 1 for bias, as test tree is a decision dump, only global bias is
-    // filled with LeafValue().
-    if ((i + 1) % (kCols + 1) == 0) {
-      ASSERT_EQ(out_contribution.back(), 1.5f);
-    } else {
-      ASSERT_EQ(contri, 0);
-    }
-  }
-  // Test predict contribution (approximate method)
-  cpu_predictor->PredictContribution(dmat, &out_contribution_hdv, model, 0, nullptr, true);
-  for (size_t i = 0; i < out_contribution.size(); ++i) {
-    auto const& contri = out_contribution[i];
-    // shift 1 for bias, as test tree is a decision dump, only global bias is
-    // filled with LeafValue().
-    if ((i + 1) % (kCols + 1) == 0) {
-      ASSERT_EQ(out_contribution.back(), 1.5f);
-    } else {
-      ASSERT_EQ(contri, 0);
-    }
-  }
-}
-}  // anonymous namespace
-
 TEST(CpuPredictor, Basic) {
+  Context ctx;
   size_t constexpr kRows = 5;
   size_t constexpr kCols = 5;
   auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
-  TestBasic(dmat.get());
+  TestBasic(dmat.get(), &ctx);
 }
 
 namespace {
 void TestColumnSplit() {
+  Context ctx;
   size_t constexpr kRows = 5;
   size_t constexpr kCols = 5;
   auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
@@ -112,7 +37,7 @@ void TestColumnSplit() {
   auto const rank = collective::GetRank();
   dmat = std::unique_ptr<DMatrix>{dmat->SliceCol(world_size, rank)};
 
-  TestBasic(dmat.get());
+  TestBasic(dmat.get(), &ctx);
 }
 }  // anonymous namespace
 
@@ -132,10 +57,11 @@ TEST(CpuPredictor, IterationRangeColmnSplit) {
 }
 
 TEST(CpuPredictor, ExternalMemory) {
+  Context ctx;
   size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
   size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
   std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);
-  TestBasic(dmat.get());
+  TestBasic(dmat.get(), &ctx);
 }
 
 TEST(CpuPredictor, InplacePredict) {
@@ -235,12 +161,14 @@ TEST(CPUPredictor, CategoricalPredictionColumnSplit) {
 }
 
 TEST(CPUPredictor, CategoricalPredictLeaf) {
-  TestCategoricalPredictLeaf(false, false);
+  Context ctx;
+  TestCategoricalPredictLeaf(&ctx, false);
 }
 
 TEST(CPUPredictor, CategoricalPredictLeafColumnSplit) {
   auto constexpr kWorldSize = 2;
-  RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPredictLeaf, false, true);
+  Context ctx;
+  RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPredictLeaf, &ctx, true);
 }
 
 TEST(CpuPredictor, UpdatePredictionCache) {
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index 883e6e01c..50e036b90 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -289,11 +289,13 @@ TEST_F(MGPUPredictorTest, CategoricalPredictionColumnSplit) {
 }
 
 TEST(GPUPredictor, CategoricalPredictLeaf) {
-  TestCategoricalPredictLeaf(true, false);
+  auto ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+  TestCategoricalPredictLeaf(&ctx, false);
 }
 
 TEST_F(MGPUPredictorTest, CategoricalPredictionLeafColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, TestCategoricalPredictLeaf, true, true);
+  auto ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+  RunWithInMemoryCommunicator(world_size_, TestCategoricalPredictLeaf, &ctx, true);
 }
 
 TEST(GPUPredictor, PredictLeafBasic) {
diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc
index 21aa483e4..6ee34ae69 100644
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -26,6 +26,79 @@
 #include "xgboost/tree_model.h"  // for RegTree
 
 namespace xgboost {
+
+void TestBasic(DMatrix* dmat, Context const *ctx) {
+  auto predictor = std::unique_ptr<Predictor>(CreatePredictorForTest(ctx));
+
+  size_t const kRows = dmat->Info().num_row_;
+  size_t const kCols = dmat->Info().num_col_;
+
+  LearnerModelParam mparam{MakeMP(kCols, .0, 1)};
+
+  gbm::GBTreeModel model = CreateTestModel(&mparam, ctx);
+
+  // Test predict batch
+  PredictionCacheEntry out_predictions;
+  predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
+  predictor->PredictBatch(dmat, &out_predictions, model, 0);
+
+  std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
+  for (size_t i = 0; i < out_predictions.predictions.Size(); i++) {
+    ASSERT_EQ(out_predictions_h[i], 1.5);
+  }
+
+  // Test predict instance
+  auto const& batch = *dmat->GetBatches<SparsePage>().begin();
+  auto page = batch.GetView();
+  for (size_t i = 0; i < batch.Size(); i++) {
+    std::vector<float> instance_out_predictions;
+    predictor->PredictInstance(page[i], &instance_out_predictions, model, 0,
+                               dmat->Info().IsColumnSplit());
+    ASSERT_EQ(instance_out_predictions[0], 1.5);
+  }
+
+  // Test predict leaf
+  HostDeviceVector<float> leaf_out_predictions;
+  predictor->PredictLeaf(dmat, &leaf_out_predictions, model);
+  auto const& h_leaf_out_predictions = leaf_out_predictions.ConstHostVector();
+  for (auto v : h_leaf_out_predictions) {
+    ASSERT_EQ(v, 0);
+  }
+
+  if (dmat->Info().IsColumnSplit()) {
+    // Predict contribution is not supported for column split.
+    return;
+  }
+
+  // Test predict contribution
+  HostDeviceVector<float> out_contribution_hdv;
+  auto& out_contribution = out_contribution_hdv.HostVector();
+  predictor->PredictContribution(dmat, &out_contribution_hdv, model);
+  ASSERT_EQ(out_contribution.size(), kRows * (kCols + 1));
+  for (size_t i = 0; i < out_contribution.size(); ++i) {
+    auto const& contri = out_contribution[i];
+    // shift 1 for bias, as test tree is a decision dump, only global bias is
+    // filled with LeafValue().
+    if ((i + 1) % (kCols + 1) == 0) {
+      ASSERT_EQ(out_contribution.back(), 1.5f);
+    } else {
+      ASSERT_EQ(contri, 0);
+    }
+  }
+  // Test predict contribution (approximate method)
+  predictor->PredictContribution(dmat, &out_contribution_hdv, model, 0, nullptr, true);
+  for (size_t i = 0; i < out_contribution.size(); ++i) {
+    auto const& contri = out_contribution[i];
+    // shift 1 for bias, as test tree is a decision dump, only global bias is
+    // filled with LeafValue().
+    if ((i + 1) % (kCols + 1) == 0) {
+      ASSERT_EQ(out_contribution.back(), 1.5f);
+    } else {
+      ASSERT_EQ(contri, 0);
+    }
+  }
+}
+
 TEST(Predictor, PredictionCache) {
   size_t constexpr kRows = 16, kCols = 4;
 
@@ -64,7 +137,7 @@ void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins,
                              {"num_feature", std::to_string(kCols)},
                              {"num_class", std::to_string(kClasses)},
                              {"max_bin", std::to_string(bins)},
-                             {"device", ctx->DeviceName()}});
+                             {"device", ctx->IsSycl() ? "cpu" : ctx->DeviceName()}});
   learner->Configure();
 
   for (size_t i = 0; i < kIters; ++i) {
@@ -151,7 +224,7 @@ std::unique_ptr<Learner> LearnerForTest(Context const *ctx, std::shared_ptr<DMa
   std::unique_ptr<Learner> learner{Learner::Create({dmat})};
   learner->SetParams(
-      Args{{"num_parallel_tree", std::to_string(forest)}, {"device", ctx->DeviceName()}});
+      Args{{"num_parallel_tree", std::to_string(forest)}, {"device", ctx->IsSycl() ? "cpu" : ctx->DeviceName()}});
   for (size_t i = 0; i < iters; ++i) {
     learner->UpdateOneIter(i, dmat);
   }
@@ -305,11 +378,7 @@ void TestCategoricalPrediction(bool use_gpu, bool is_column_split) {
   ASSERT_EQ(out_predictions.predictions.HostVector()[0], left_weight + score);
 }
 
-void TestCategoricalPredictLeaf(bool use_gpu, bool is_column_split) {
-  Context ctx;
-  if (use_gpu) {
-    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
-  }
+void TestCategoricalPredictLeaf(Context const *ctx, bool is_column_split) {
   size_t constexpr kCols = 10;
   PredictionCacheEntry out_predictions;
 
@@ -320,10 +389,10 @@ void TestCategoricalPredictLeaf(Context const *ctx, bool is_column_split) {
   float left_weight = 1.3f;
   float right_weight = 1.7f;
 
-  gbm::GBTreeModel model(&mparam, &ctx);
+  gbm::GBTreeModel model(&mparam, ctx);
   GBTreeModelForTest(&model, split_ind, split_cat, left_weight, right_weight);
 
-  std::unique_ptr<Predictor> predictor{CreatePredictorForTest(&ctx)};
+  std::unique_ptr<Predictor> predictor{CreatePredictorForTest(ctx)};
 
   std::vector<float> row(kCols);
   row[split_ind] = split_cat;
@@ -363,7 +432,6 @@ void TestIterationRange(Context const* ctx) {
   HostDeviceVector<float> out_predt_sliced;
   HostDeviceVector<float> out_predt_ranged;
 
-  // margin
   {
     sliced->Predict(dmat, true, &out_predt_sliced, 0, 0, false, false, false, false, false);
     learner->Predict(dmat, true, &out_predt_ranged, 0, lend, false, false, false, false, false);
@@ -519,6 +587,8 @@ void TestSparsePrediction(Context const *ctx, float sparsity) {
     learner.reset(Learner::Create({Xy}));
     learner->LoadModel(model);
 
+    learner->SetParam("device", ctx->DeviceName());
+    learner->Configure();
     if (ctx->IsCUDA()) {
       learner->SetParam("tree_method", "gpu_hist");
diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h
index 9e0891d56..c2b28883a 100644
--- a/tests/cpp/predictor/test_predictor.h
+++ b/tests/cpp/predictor/test_predictor.h
@@ -34,6 +34,8 @@ inline gbm::GBTreeModel CreateTestModel(LearnerModelParam const* param, Context
 inline auto CreatePredictorForTest(Context const* ctx) {
   if (ctx->IsCPU()) {
     return Predictor::Create("cpu_predictor", ctx);
+  } else if (ctx->IsSycl()) {
+    return Predictor::Create("sycl_predictor", ctx);
   } else {
     return Predictor::Create("gpu_predictor", ctx);
   }
@@ -83,6 +85,8 @@ void TestPredictionFromGradientIndex(Context const* ctx, size_t rows, size_t col
   }
 }
 
+void TestBasic(DMatrix* dmat, Context const * ctx);
+
 // p_full and p_hist should come from the same data set.
 void TestTrainingPrediction(Context const* ctx, size_t rows, size_t bins,
                             std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_hist);
@@ -98,7 +102,7 @@ void TestCategoricalPrediction(bool use_gpu, bool is_column_split);
 
 void TestPredictionWithLesserFeaturesColumnSplit(bool use_gpu);
 
-void TestCategoricalPredictLeaf(bool use_gpu, bool is_column_split);
+void TestCategoricalPredictLeaf(Context const *ctx, bool is_column_split);
 
 void TestIterationRange(Context const* ctx);
diff --git a/tests/python-sycl/test_sycl_prediction.py b/tests/python-sycl/test_sycl_prediction.py
new file mode 100644
index 000000000..06167c6c0
--- /dev/null
+++ b/tests/python-sycl/test_sycl_prediction.py
@@ -0,0 +1,165 @@
+import sys
+import unittest
+import pytest
+
+import numpy as np
+import xgboost as xgb
+from hypothesis import given, strategies, assume, settings, note
+
+from xgboost import testing as tm
+
+rng = np.random.RandomState(1994)
+
+shap_parameter_strategy = strategies.fixed_dictionaries(
+    {
+        "max_depth": strategies.integers(1, 11),
+        "max_leaves": strategies.integers(0, 256),
+        "num_parallel_tree": strategies.sampled_from([1, 10]),
+    }
+).filter(lambda x: x["max_depth"] > 0 or x["max_leaves"] > 0)
+
+
+class TestSYCLPredict(unittest.TestCase):
+    def test_predict(self):
+        iterations = 10
+        np.random.seed(1)
+        test_num_rows = [10, 1000, 5000]
+        test_num_cols = [10, 50, 500]
+        for num_rows in test_num_rows:
+            for num_cols in test_num_cols:
+                dtrain = xgb.DMatrix(
+                    np.random.randn(num_rows, num_cols),
+                    label=[0, 1] * int(num_rows / 2),
+                )
+                dval = xgb.DMatrix(
+                    np.random.randn(num_rows, num_cols),
+                    label=[0, 1] * int(num_rows / 2),
+                )
+                dtest = xgb.DMatrix(
+                    np.random.randn(num_rows, num_cols),
+                    label=[0, 1] * int(num_rows / 2),
+                )
+                watchlist = [(dtrain, "train"), (dval, "validation")]
+                res = {}
+                param = {
+                    "objective": "binary:logistic",
+                    "eval_metric": "logloss",
+                    "tree_method": "hist",
+                    "device": "cpu",
+                    "max_depth": 1,
+                    "verbosity": 0,
+                }
+                bst = xgb.train(
+                    param, dtrain, iterations, evals=watchlist, evals_result=res
+                )
+                assert tm.non_increasing(res["train"]["logloss"])
+                cpu_pred_train = bst.predict(dtrain, output_margin=True)
+                cpu_pred_test = bst.predict(dtest, output_margin=True)
+                cpu_pred_val = bst.predict(dval, output_margin=True)
+
+                bst.set_param({"device": "sycl"})
+                sycl_pred_train = bst.predict(dtrain, output_margin=True)
+                sycl_pred_test = bst.predict(dtest, output_margin=True)
+                sycl_pred_val = bst.predict(dval, output_margin=True)
+
+                np.testing.assert_allclose(cpu_pred_train, sycl_pred_train, rtol=1e-6)
+                np.testing.assert_allclose(cpu_pred_val, sycl_pred_val, rtol=1e-6)
+                np.testing.assert_allclose(cpu_pred_test, sycl_pred_test, rtol=1e-6)
+
+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_multi_predict(self):
+        from sklearn.datasets import make_regression
+        from sklearn.model_selection import train_test_split
+
+        n = 1000
+        X, y = make_regression(n, random_state=rng)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
+        dtrain = xgb.DMatrix(X_train, label=y_train)
+        dtest = xgb.DMatrix(X_test)
+
+        params = {}
+        params["tree_method"] = "hist"
+        params["device"] = "cpu"
+
+        bst = xgb.train(params, dtrain)
+        cpu_predict = bst.predict(dtest)
+
+        bst.set_param({"device": "sycl"})
+
+        predict0 = bst.predict(dtest)
+        predict1 = bst.predict(dtest)
+
+        assert np.allclose(predict0, predict1)
+        assert np.allclose(predict0, cpu_predict)
+
+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_sklearn(self):
+        m, n = 15000, 14
+        tr_size = 2500
+        X = np.random.rand(m, n)
+        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
+        X_train, y_train = X[:tr_size, :], y[:tr_size]
+        X_test, y_test = X[tr_size:, :], y[tr_size:]
+
+        # First with cpu_predictor
+        params = {
+            "tree_method": "hist",
+            "device": "cpu",
+            "n_jobs": -1,
+            "verbosity": 0,
+            "seed": 123,
+        }
+        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
+        cpu_train_score = m.score(X_train, y_train)
+        cpu_test_score = m.score(X_test, y_test)
+
+        # Now with sycl_predictor
+        params["device"] = "sycl"
+        m.set_params(**params)
+
+        sycl_train_score = m.score(X_train, y_train)
+        sycl_test_score = m.score(X_test, y_test)
+
+        assert np.allclose(cpu_train_score, sycl_train_score)
+        assert np.allclose(cpu_test_score, sycl_test_score)
+
+    @given(
+        strategies.integers(1, 10), tm.make_dataset_strategy(), shap_parameter_strategy
+    )
+    @settings(deadline=None)
+    def test_shap(self, num_rounds, dataset, param):
+        if dataset.name.endswith("-l1"):  # not supported by the exact tree method
+            return
+        param.update({"tree_method": "hist", "device": "cpu"})
+        param = dataset.set_params(param)
+        dmat = dataset.get_dmat()
+        bst = xgb.train(param, dmat, num_rounds)
+        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin)
+        bst.set_param({"device": "sycl"})
+        shap = bst.predict(test_dmat, pred_contribs=True)
+        margin = bst.predict(test_dmat, output_margin=True)
+        assume(len(dataset.y) > 0)
+        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin, 1e-3, 1e-3)
+
+    @given(
+        strategies.integers(1, 10), tm.make_dataset_strategy(), shap_parameter_strategy
+    )
+    @settings(deadline=None, max_examples=20)
+    def test_shap_interactions(self, num_rounds, dataset, param):
+        if dataset.name.endswith("-l1"):  # not supported by the exact tree method
+            return
+        param.update({"tree_method": "hist", "device": "cpu"})
+        param = dataset.set_params(param)
+        dmat = dataset.get_dmat()
+        bst = xgb.train(param, dmat, num_rounds)
+        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin)
+        bst.set_param({"device": "sycl"})
+        shap = bst.predict(test_dmat, pred_interactions=True)
+        margin = bst.predict(test_dmat, output_margin=True)
+        assume(len(dataset.y) > 0)
+        assert np.allclose(
+            np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
+            margin,
+            1e-3,
+            1e-3,
+        )
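The two SHAP tests above lean on the additivity (local accuracy) property of tree SHAP values:
for each row, the per-feature contributions plus the bias column must sum to the raw margin,

    \sum_{j=0}^{M} \phi_{i,j} = \text{margin}_i

and the interaction values satisfy the same identity when summed over both trailing axes. Any
device-specific drift in the SYCL prediction kernels would therefore surface directly in these
np.allclose checks, without needing precomputed reference SHAP values.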
From 9c56916fd7e8cd4d7c7aada9e627217d2adb8ae2 Mon Sep 17 00:00:00 2001
From: david-cortes
Date: Mon, 4 Dec 2023 11:40:45 +0100
Subject: [PATCH 32/32] [R] Very small performance tweaks (#9837)

---
 R-package/src/xgboost_R.cc | 40 ++++++++++++++------------------------
 1 file changed, 15 insertions(+), 25 deletions(-)

diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc
index b267d7da6..8da00aa58 100644
--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@@ -160,13 +160,6 @@ SEXP SafeMkChar(const char *c_str, SEXP continuation_token) {
 
 using dmlc::BeginPtr;
 
-xgboost::Context const *DMatrixCtx(DMatrixHandle handle) {
-  CHECK_HANDLE();
-  auto p_m = static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
-  CHECK(p_m);
-  return p_m->get()->Ctx();
-}
-
 XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle) {
   return ScalarLogical(R_ExternalPtrAddr(handle) == NULL);
 }
@@ -318,6 +311,9 @@ XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
   int res_code;
   {
     std::vector<int> idxvec(len);
+    #ifndef _MSC_VER
+    #pragma omp simd
+    #endif
     for (R_xlen_t i = 0; i < len; ++i) {
       idxvec[i] = idxset_[i] - 1;
     }
@@ -375,6 +371,7 @@ XGB_DLL SEXP XGDMatrixSetStrFeatureInfo_R(SEXP handle, SEXP field, SEXP array) {
   int res_code;
   {
     std::vector<std::string> str_info;
+    str_info.reserve(len);
     for (size_t i = 0; i < len; ++i) {
      str_info.emplace_back(CHAR(VECTOR_ELT(str_info_holder, i)));
     }
@@ -457,9 +454,9 @@ XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) {
 
   int res_code;
   {
-    std::vector<void*> dvec;
+    std::vector<void*> dvec(len);
     for (R_xlen_t i = 0; i < len; ++i) {
-      dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
+      dvec[i] = R_ExternalPtrAddr(VECTOR_ELT(dmats, i));
     }
     res_code = XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle);
   }
@@ -478,9 +475,9 @@ XGB_DLL SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle) {
 
   int res_code;
   {
-    std::vector<void*> dvec;
+    std::vector<void*> dvec(len);
    for (R_xlen_t i = 0; i < len; ++i) {
-      dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
+      dvec[i] = R_ExternalPtrAddr(VECTOR_ELT(dmats, i));
     }
     res_code = XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle);
   }
@@ -552,15 +549,16 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
 
   int res_code;
   {
-    std::vector<void*> vec_dmats;
+    std::vector<void*> vec_dmats(len);
     std::vector<std::string> vec_names;
-    std::vector<const char*> vec_sptr;
+    vec_names.reserve(len);
+    std::vector<const char*> vec_sptr(len);
     for (R_xlen_t i = 0; i < len; ++i) {
-      vec_dmats.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
+      vec_dmats[i] = R_ExternalPtrAddr(VECTOR_ELT(dmats, i));
       vec_names.emplace_back(CHAR(VECTOR_ELT(evnames_lst, i)));
     }
     for (R_xlen_t i = 0; i < len; ++i) {
-      vec_sptr.push_back(vec_names[i].c_str());
+      vec_sptr[i] = vec_names[i].c_str();
     }
     res_code = XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
                                     asInteger(iter),
@@ -598,11 +596,7 @@ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) {
     len *= out_shape[i];
   }
   r_out_result = PROTECT(allocVector(REALSXP, len));
-  auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle));
-  double *r_out_result_ = REAL(r_out_result);
-  xgboost::common::ParallelFor(len, ctx->Threads(), [&](xgboost::omp_ulong i) {
-    r_out_result_[i] = out_result[i];
-  });
+  std::copy(out_result, out_result + len, REAL(r_out_result));
 
   SET_VECTOR_ELT(r_out, 0, r_out_shape);
   SET_VECTOR_ELT(r_out, 1, r_out_result);
@@ -831,11 +825,7 @@ XGB_DLL SEXP XGBoosterFeatureScore_R(SEXP handle, SEXP json_config) {
   }
 
   out_scores_sexp = PROTECT(allocVector(REALSXP, len));
-  auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle));
-  double *out_scores_sexp_ = REAL(out_scores_sexp);
-  xgboost::common::ParallelFor(len, ctx->Threads(), [&](xgboost::omp_ulong i) {
-    out_scores_sexp_[i] = out_scores[i];
-  });
+  std::copy(out_scores, out_scores + len, REAL(out_scores_sexp));
 
   SET_VECTOR_ELT(r_out, 0, out_features_sexp);
   SET_VECTOR_ELT(r_out, 1, out_shape_sexp);
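The last two hunks replace a thread-pooled ParallelFor over the output buffer with a plain
std::copy. A self-contained illustration (not XGBoost code) of the widening float-to-double
copy involved:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      // libxgboost hands back float predictions; R numeric vectors store double.
      std::vector<float> out_result{1.5f, 2.5f, 3.5f};
      std::vector<double> r_out(out_result.size());
      // std::copy performs the element-wise float -> double conversion, just
      // like the removed OpenMP loop, without spinning up a thread pool.
      std::copy(out_result.begin(), out_result.end(), r_out.begin());
      std::printf("%f %f %f\n", r_out[0], r_out[1], r_out[2]);
      return 0;
    }

For a memcpy-sized loop like this, the per-call overhead of dispatching work to OpenMP threads
typically exceeds the copy itself, which is presumably why the patch drops the parallelism.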