diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml
index 8efcdc2ec..a2d8bb69a 100644
--- a/.github/workflows/jvm_tests.yml
+++ b/.github/workflows/jvm_tests.yml
@@ -40,7 +40,7 @@ jobs:
         key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
         restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
 
-    - name: Test XGBoost4J
+    - name: Test XGBoost4J (Core)
       run: |
         cd jvm-packages
         mvn test -B -pl :xgboost4j_2.12
@@ -67,7 +67,7 @@ jobs:
         AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
 
 
-    - name: Test XGBoost4J-Spark
+    - name: Test XGBoost4J (Core, Spark, Examples)
       run: |
         rm -rfv build/
         cd jvm-packages
diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml
index 0d8e6d653..78a17d3f7 100644
--- a/.github/workflows/python_tests.yml
+++ b/.github/workflows/python_tests.yml
@@ -65,7 +65,7 @@ jobs:
       run: |
         cd python-package
         python --version
-        python setup.py sdist
+        python -m build --sdist
         pip install -v ./dist/xgboost-*.tar.gz
         cd ..
         python -c 'import xgboost'
@@ -92,6 +92,9 @@ jobs:
         auto-update-conda: true
         python-version: ${{ matrix.python-version }}
         activate-environment: test
+    - name: Install build
+      run: |
+        conda install -c conda-forge python-build
     - name: Display Conda env
       run: |
         conda info
@@ -100,7 +103,7 @@ jobs:
       run: |
         cd python-package
         python --version
-        python setup.py sdist
+        python -m build --sdist
         pip install -v ./dist/xgboost-*.tar.gz
         cd ..
         python -c 'import xgboost'
@@ -147,7 +150,7 @@ jobs:
       run: |
         cd python-package
         python --version
-        python setup.py install
+        pip install -v .
 
     - name: Test Python package
       run: |
@@ -194,7 +197,7 @@ jobs:
       run: |
         cd python-package
         python --version
-        python setup.py bdist_wheel --universal
+        pip wheel -v . --wheel-dir dist/
         pip install ./dist/*.whl
 
     - name: Test Python package
@@ -238,7 +241,7 @@ jobs:
       run: |
         cd python-package
         python --version
-        python setup.py install
+        pip install -v .
 
     - name: Test Python package
       run: |
diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml
index 0ec95ace1..640ebce81 100644
--- a/.github/workflows/r_tests.yml
+++ b/.github/workflows/r_tests.yml
@@ -54,7 +54,7 @@ jobs:
       matrix:
         config:
           - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'}
-          - {os: windows-latest, r: 'release', compiler: 'msvc', build: 'cmake'}
+          - {os: windows-latest, r: '4.2.0', compiler: 'msvc', build: 'cmake'}
     env:
       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
       RSPM: ${{ matrix.config.rspm }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4cc47fa6a..2d3fdc728 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,7 @@ option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
 set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
 option(RABIT_MOCK "Build rabit with mock" OFF)
 option(HIDE_CXX_SYMBOLS "Build shared library and hide all C++ symbols" OFF)
+option(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR "Output build artifacts in CMake binary dir" OFF)
 ## CUDA
 option(USE_CUDA  "Build with GPU acceleration" OFF)
 option(USE_NCCL  "Build with NCCL to enable distributed GPU support." OFF)
@@ -312,8 +313,13 @@ if (JVM_BINDINGS)
   xgboost_target_defs(xgboost4j)
 endif (JVM_BINDINGS)
 
-set_output_directory(runxgboost ${xgboost_SOURCE_DIR})
-set_output_directory(xgboost ${xgboost_SOURCE_DIR}/lib)
+if (KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR)
+  set_output_directory(runxgboost ${xgboost_BINARY_DIR})
+  set_output_directory(xgboost ${xgboost_BINARY_DIR}/lib)
+else ()
+  set_output_directory(runxgboost ${xgboost_SOURCE_DIR})
+  set_output_directory(xgboost ${xgboost_SOURCE_DIR}/lib)
+endif ()
 # Ensure these two targets do not build simultaneously, as they produce outputs with conflicting names
 add_dependencies(xgboost runxgboost)
 
diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index 743bf0a66..a84459db9 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -32,7 +32,7 @@ OBJECTS= \
     $(PKGROOT)/src/objective/objective.o \
     $(PKGROOT)/src/objective/regression_obj.o \
     $(PKGROOT)/src/objective/multiclass_obj.o \
-    $(PKGROOT)/src/objective/rank_obj.o \
+    $(PKGROOT)/src/objective/lambdarank_obj.o \
     $(PKGROOT)/src/objective/hinge.o \
     $(PKGROOT)/src/objective/aft_obj.o \
     $(PKGROOT)/src/objective/adaptive.o \
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index a32d2fd2e..25c577e3a 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -32,7 +32,7 @@ OBJECTS= \
     $(PKGROOT)/src/objective/objective.o \
     $(PKGROOT)/src/objective/regression_obj.o \
     $(PKGROOT)/src/objective/multiclass_obj.o \
-    $(PKGROOT)/src/objective/rank_obj.o \
+    $(PKGROOT)/src/objective/lambdarank_obj.o \
     $(PKGROOT)/src/objective/hinge.o \
     $(PKGROOT)/src/objective/aft_obj.o \
     $(PKGROOT)/src/objective/adaptive.o \
diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R
index 1d8cb0f23..21d39f255 100644
--- a/R-package/tests/testthat/test_dmatrix.R
+++ b/R-package/tests/testthat/test_dmatrix.R
@@ -72,7 +72,7 @@ test_that("xgb.DMatrix: saving, loading", {
   tmp <- c("0 1:1 2:1", "1 3:1", "0 1:1")
   tmp_file <- tempfile(fileext = ".libsvm")
   writeLines(tmp, tmp_file)
-  dtest4 <- xgb.DMatrix(tmp_file, silent = TRUE)
+  dtest4 <- xgb.DMatrix(paste(tmp_file, "?format=libsvm", sep = ""), silent = TRUE)
   expect_equal(dim(dtest4), c(3, 4))
   expect_equal(getinfo(dtest4, 'label'), c(0, 1, 0))
 
diff --git a/demo/CLI/binary_classification/mushroom.conf b/demo/CLI/binary_classification/mushroom.conf
index 3cf865465..d78199cd7 100644
--- a/demo/CLI/binary_classification/mushroom.conf
+++ b/demo/CLI/binary_classification/mushroom.conf
@@ -20,10 +20,10 @@ num_round = 2
 # 0 means do not save any model except the final round model
 save_period = 2
 # The path of training data
-data = "agaricus.txt.train"
+data = "agaricus.txt.train?format=libsvm"
 # The path of validation data, used to monitor training process, here [test] sets name of the validation set
-eval[test] = "agaricus.txt.test"
+eval[test] = "agaricus.txt.test?format=libsvm"
 # evaluate on training data as well each round
 eval_train = 1
 # The path of test data
-test:data = "agaricus.txt.test"
+test:data = "agaricus.txt.test?format=libsvm"
diff --git a/demo/CLI/regression/machine.conf b/demo/CLI/regression/machine.conf
index 4ba8437d5..42e2b1227 100644
--- a/demo/CLI/regression/machine.conf
+++ b/demo/CLI/regression/machine.conf
@@ -21,8 +21,8 @@ num_round = 2
 # 0 means do not save any model except the final round model
 save_period = 0
 # The path of training data
-data = "machine.txt.train"
+data = "machine.txt.train?format=libsvm"
 # The path of validation data, used to monitor training process, here [test] sets name of the validation set
-eval[test] = "machine.txt.test"
+eval[test] = "machine.txt.test?format=libsvm"
 # The path of test data
-test:data = "machine.txt.test"
+test:data = "machine.txt.test?format=libsvm"
diff --git a/demo/c-api/basic/c-api-demo.c b/demo/c-api/basic/c-api-demo.c
index ca6e689aa..15a224e9e 100644
--- a/demo/c-api/basic/c-api-demo.c
+++ b/demo/c-api/basic/c-api-demo.c
@@ -42,8 +42,8 @@ int main() {
 
   // load the data
   DMatrixHandle dtrain, dtest;
-  safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.train", silent, &dtrain));
-  safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.test", silent, &dtest));
+  safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.train?format=libsvm", silent, &dtrain));
+  safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.test?format=libsvm", silent, &dtest));
 
   // create the booster
   BoosterHandle booster;
diff --git a/demo/guide-python/boost_from_prediction.py b/demo/guide-python/boost_from_prediction.py
index 53a45549a..13f91d7c8 100644
--- a/demo/guide-python/boost_from_prediction.py
+++ b/demo/guide-python/boost_from_prediction.py
@@ -7,15 +7,19 @@ import os
 import xgboost as xgb
 
 CURRENT_DIR = os.path.dirname(__file__)
-dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
-dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
-watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+dtrain = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
+)
+dtest = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
+)
+watchlist = [(dtest, "eval"), (dtrain, "train")]
 ###
 # advanced: start from a initial base prediction
 #
-print('start running example to start from a initial prediction')
+print("start running example to start from a initial prediction")
 # specify parameters via map, definition are same as c++ version
-param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
+param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
 # train xgboost for 1 round
 bst = xgb.train(param, dtrain, 1, watchlist)
 # Note: we need the margin value instead of transformed prediction in
@@ -27,5 +31,5 @@ ptest = bst.predict(dtest, output_margin=True)
 dtrain.set_base_margin(ptrain)
 dtest.set_base_margin(ptest)
 
-print('this is result of running from initial prediction')
+print("this is result of running from initial prediction")
 bst = xgb.train(param, dtrain, 1, watchlist)
diff --git a/demo/guide-python/cross_validation.py b/demo/guide-python/cross_validation.py
index 2565b02c9..4e537108a 100644
--- a/demo/guide-python/cross_validation.py
+++ b/demo/guide-python/cross_validation.py
@@ -10,27 +10,45 @@ import xgboost as xgb
 
 # load data in do training
 CURRENT_DIR = os.path.dirname(__file__)
-dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
-param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic'}
+dtrain = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
+)
+param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
 num_round = 2
 
-print('running cross validation')
+print("running cross validation")
 # do cross validation, this will print result out as
 # [iteration]  metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
-xgb.cv(param, dtrain, num_round, nfold=5,
-       metrics={'error'}, seed=0,
-       callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)])
+xgb.cv(
+    param,
+    dtrain,
+    num_round,
+    nfold=5,
+    metrics={"error"},
+    seed=0,
+    callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)],
+)
 
-print('running cross validation, disable standard deviation display')
+print("running cross validation, disable standard deviation display")
 # do cross validation, this will print result out as
 # [iteration]  metric_name:mean_value
-res = xgb.cv(param, dtrain, num_boost_round=10, nfold=5,
-             metrics={'error'}, seed=0,
-             callbacks=[xgb.callback.EvaluationMonitor(show_stdv=False),
-                        xgb.callback.EarlyStopping(3)])
+res = xgb.cv(
+    param,
+    dtrain,
+    num_boost_round=10,
+    nfold=5,
+    metrics={"error"},
+    seed=0,
+    callbacks=[
+        xgb.callback.EvaluationMonitor(show_stdv=False),
+        xgb.callback.EarlyStopping(3),
+    ],
+)
 print(res)
-print('running cross validation, with preprocessing function')
+print("running cross validation, with preprocessing function")
+
+
 # define the preprocessing function
 # used to return the preprocessed training, test data, and parameter
 # we can use this to do weight rescale, etc.
@@ -38,32 +56,36 @@ print('running cross validation, with preprocessing function')
 def fpreproc(dtrain, dtest, param):
     label = dtrain.get_label()
     ratio = float(np.sum(label == 0)) / np.sum(label == 1)
-    param['scale_pos_weight'] = ratio
+    param["scale_pos_weight"] = ratio
     return (dtrain, dtest, param)
 
+
 # do cross validation, for each fold
 # the dtrain, dtest, param will be passed into fpreproc
 # then the return value of fpreproc will be used to generate
 # results of that fold
-xgb.cv(param, dtrain, num_round, nfold=5,
-       metrics={'auc'}, seed=0, fpreproc=fpreproc)
+xgb.cv(param, dtrain, num_round, nfold=5, metrics={"auc"}, seed=0, fpreproc=fpreproc)
 
 ###
 # you can also do cross validation with customized loss function
 # See custom_objective.py
 ##
-print('running cross validation, with customized loss function')
+print("running cross validation, with customized loss function")
+
+
 def logregobj(preds, dtrain):
     labels = dtrain.get_label()
     preds = 1.0 / (1.0 + np.exp(-preds))
     grad = preds - labels
     hess = preds * (1.0 - preds)
     return grad, hess
+
+
 def evalerror(preds, dtrain):
     labels = dtrain.get_label()
-    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
+    return "error", float(sum(labels != (preds > 0.0))) / len(labels)
 
-param = {'max_depth':2, 'eta':1}
+
+param = {"max_depth": 2, "eta": 1}
 # train with customized objective
-xgb.cv(param, dtrain, num_round, nfold=5, seed=0,
-       obj=logregobj, feval=evalerror)
+xgb.cv(param, dtrain, num_round, nfold=5, seed=0, obj=logregobj, feval=evalerror)
diff --git a/demo/guide-python/evals_result.py b/demo/guide-python/evals_result.py
index bba8862f5..7b9da96da 100644
--- a/demo/guide-python/evals_result.py
+++ b/demo/guide-python/evals_result.py
@@ -7,28 +7,37 @@ import os
 import xgboost as xgb
 
 CURRENT_DIR = os.path.dirname(__file__)
-dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
-dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
+dtrain = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
+)
+dtest = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
+)
 
-param = [('max_depth', 2), ('objective', 'binary:logistic'), ('eval_metric', 'logloss'), ('eval_metric', 'error')]
+param = [
+    ("max_depth", 2),
+    ("objective", "binary:logistic"),
+    ("eval_metric", "logloss"),
+    ("eval_metric", "error"),
+]
 
 num_round = 2
-watchlist = [(dtest,'eval'), (dtrain,'train')]
+watchlist = [(dtest, "eval"), (dtrain, "train")]
 
 evals_result = {}
 bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result)
 
-print('Access logloss metric directly from evals_result:')
-print(evals_result['eval']['logloss'])
+print("Access logloss metric directly from evals_result:")
+print(evals_result["eval"]["logloss"])
 
-print('')
-print('Access metrics through a loop:')
+print("")
+print("Access metrics through a loop:")
 for e_name, e_mtrs in evals_result.items():
-    print('- {}'.format(e_name))
+    print("- {}".format(e_name))
     for e_mtr_name, e_mtr_vals in e_mtrs.items():
-        print('   - {}'.format(e_mtr_name))
-        print('      - {}'.format(e_mtr_vals))
+        print("   - {}".format(e_mtr_name))
+        print("      - {}".format(e_mtr_vals))
 
-print('')
-print('Access complete dictionary:')
+print("")
+print("Access complete dictionary:")
 print(evals_result)
diff --git a/demo/guide-python/generalized_linear_model.py b/demo/guide-python/generalized_linear_model.py
index 976428f13..3387b1982 100644
--- a/demo/guide-python/generalized_linear_model.py
+++ b/demo/guide-python/generalized_linear_model.py
@@ -11,14 +11,22 @@ import xgboost as xgb
 #  basically, we are using linear model, instead of tree for our boosters
 ##
 CURRENT_DIR = os.path.dirname(__file__)
-dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
-dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
+dtrain = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
+)
+dtest = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
+)
 # change booster to gblinear, so that we are fitting a linear model
 # alpha is the L1 regularizer
 # lambda is the L2 regularizer
 # you can also set lambda_bias which is L2 regularizer on the bias term
-param = {'objective':'binary:logistic', 'booster':'gblinear',
-         'alpha': 0.0001, 'lambda': 1}
+param = {
+    "objective": "binary:logistic",
+    "booster": "gblinear",
+    "alpha": 0.0001,
+    "lambda": 1,
+}
 
 # normally, you do not need to set eta (step_size)
 # XGBoost uses a parallel coordinate descent algorithm (shotgun),
@@ -29,9 +37,15 @@ param = {'objective':'binary:logistic', 'booster':'gblinear',
 ##
 # the rest of settings are the same
 ##
-watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+watchlist = [(dtest, "eval"), (dtrain, "train")]
 num_round = 4
 bst = xgb.train(param, dtrain, num_round, watchlist)
 preds = bst.predict(dtest)
 labels = dtest.get_label()
-print('error=%f' % (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))))
+print(
+    "error=%f"
+    % (
+        sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i])
+        / float(len(preds))
+    )
+)
diff --git a/demo/guide-python/predict_first_ntree.py b/demo/guide-python/predict_first_ntree.py
index 55f7c61af..78137b4e1 100644
--- a/demo/guide-python/predict_first_ntree.py
+++ b/demo/guide-python/predict_first_ntree.py
@@ -16,8 +16,8 @@ test = os.path.join(CURRENT_DIR, "../data/agaricus.txt.test")
 
 def native_interface():
     # load data in do training
-    dtrain = xgb.DMatrix(train)
-    dtest = xgb.DMatrix(test)
+    dtrain = xgb.DMatrix(train + "?format=libsvm")
+    dtest = xgb.DMatrix(test + "?format=libsvm")
     param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
     watchlist = [(dtest, "eval"), (dtrain, "train")]
     num_round = 3
diff --git a/demo/guide-python/predict_leaf_indices.py b/demo/guide-python/predict_leaf_indices.py
index 45cc8fa7f..627619724 100644
--- a/demo/guide-python/predict_leaf_indices.py
+++ b/demo/guide-python/predict_leaf_indices.py
@@ -8,14 +8,18 @@ import xgboost as xgb
 
 # load data in do training
 CURRENT_DIR = os.path.dirname(__file__)
-dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
-dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
-param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
-watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+dtrain = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
+)
+dtest = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
+)
+param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
+watchlist = [(dtest, "eval"), (dtrain, "train")]
 num_round = 3
 bst = xgb.train(param, dtrain, num_round, watchlist)
 
-print('start testing predict the leaf indices')
+print("start testing predict the leaf indices")
 # predict using first 2 tree
 leafindex = bst.predict(
     dtest, iteration_range=(0, 2), pred_leaf=True, strict_shape=True
diff --git a/demo/nvflare/README.md b/demo/nvflare/README.md
index 328dd7212..93f388208 100644
--- a/demo/nvflare/README.md
+++ b/demo/nvflare/README.md
@@ -3,61 +3,12 @@
 This directory contains a demo of Federated Learning using
 [NVFlare](https://nvidia.github.io/NVFlare/).
 
-## Training with CPU only
+## Horizontal Federated XGBoost
 
-To run the demo, first build XGBoost with the federated learning plugin enabled (see the
-[README](../../plugin/federated/README.md)).
+For horizontal federated learning using XGBoost (data is split row-wise), check out the `horizontal` directory
+(see the [README](horizontal/README.md)).
 
-Install NVFlare (note that currently NVFlare only supports Python 3.8):
-```shell
-pip install nvflare
-```
+## Vertical Federated XGBoost
 
-Prepare the data:
-```shell
-./prepare_data.sh
-```
-
-Start the NVFlare federated server:
-```shell
-/tmp/nvflare/poc/server/startup/start.sh
-```
-
-In another terminal, start the first worker:
-```shell
-/tmp/nvflare/poc/site-1/startup/start.sh
-```
-
-And the second worker:
-```shell
-/tmp/nvflare/poc/site-2/startup/start.sh
-```
-
-Then start the admin CLI:
-```shell
-/tmp/nvflare/poc/admin/startup/fl_admin.sh
-```
-
-In the admin CLI, run the following command:
-```shell
-submit_job hello-xgboost
-```
-
-Once the training finishes, the model file should be written into
-`/tmp/nvlfare/poc/site-1/run_1/test.model.json` and `/tmp/nvflare/poc/site-2/run_1/test.model.json`
-respectively.
-
-Finally, shutdown everything from the admin CLI, using `admin` as password:
-```shell
-shutdown client
-shutdown server
-```
-
-## Training with GPUs
-
-To demo with Federated Learning using GPUs, make sure your machine has at least 2 GPUs.
-Build XGBoost with the federated learning plugin enabled along with CUDA, but with NCCL
-turned off (see the [README](../../plugin/federated/README.md)).
-
-Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
-above.
+For vertical federated learning using XGBoost (data is split column-wise), check out the `vertical` directory
+(see the [README](vertical/README.md)).
diff --git a/demo/nvflare/config/config_fed_client.json b/demo/nvflare/config/config_fed_client.json
deleted file mode 100755
index c15a1997c..000000000
--- a/demo/nvflare/config/config_fed_client.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "format_version": 2,
-  "executors": [
-    {
-      "tasks": [
-        "train"
-      ],
-      "executor": {
-        "path": "trainer.XGBoostTrainer",
-        "args": {
-          "server_address": "localhost:9091",
-          "world_size": 2,
-          "server_cert_path": "server-cert.pem",
-          "client_key_path": "client-key.pem",
-          "client_cert_path": "client-cert.pem",
-          "use_gpus": "false"
-        }
-      }
-    }
-  ],
-  "task_result_filters": [],
-  "task_data_filters": []
-}
diff --git a/demo/nvflare/config/config_fed_server.json b/demo/nvflare/config/config_fed_server.json
deleted file mode 100755
index 32993b652..000000000
--- a/demo/nvflare/config/config_fed_server.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-  "format_version": 2,
-  "server": {
-    "heart_beat_timeout": 600
-  },
-  "task_data_filters": [],
-  "task_result_filters": [],
-  "workflows": [
-    {
-      "id": "server_workflow",
-      "path": "controller.XGBoostController",
-      "args": {
-        "port": 9091,
-        "world_size": 2,
-        "server_key_path": "server-key.pem",
-        "server_cert_path": "server-cert.pem",
-        "client_cert_path": "client-cert.pem"
-      }
-    }
-  ],
-  "components": []
-}
diff --git a/demo/nvflare/horizontal/README.md b/demo/nvflare/horizontal/README.md
new file mode 100644
index 000000000..93ea3794c
--- /dev/null
+++ b/demo/nvflare/horizontal/README.md
@@ -0,0 +1,63 @@
+# Experimental Support of Horizontal Federated XGBoost using NVFlare
+
+This directory contains a demo of Horizontal Federated Learning using
+[NVFlare](https://nvidia.github.io/NVFlare/).
+
+## Training with CPU only
+
+To run the demo, first build XGBoost with the federated learning plugin enabled (see the
+[README](../../../plugin/federated/README.md)).
+
+Install NVFlare (note that currently NVFlare only supports Python 3.8):
+```shell
+pip install nvflare
+```
+
+Prepare the data:
+```shell
+./prepare_data.sh
+```
+
+Start the NVFlare federated server:
+```shell
+/tmp/nvflare/poc/server/startup/start.sh
+```
+
+In another terminal, start the first worker:
+```shell
+/tmp/nvflare/poc/site-1/startup/start.sh
+```
+
+And the second worker:
+```shell
+/tmp/nvflare/poc/site-2/startup/start.sh
+```
+
+Then start the admin CLI:
+```shell
+/tmp/nvflare/poc/admin/startup/fl_admin.sh
+```
+
+In the admin CLI, run the following command:
+```shell
+submit_job horizontal-xgboost
+```
+
+Once the training finishes, the model file should be written into
+`/tmp/nvflare/poc/site-1/run_1/test.model.json` and `/tmp/nvflare/poc/site-2/run_1/test.model.json`
+respectively.
+
+Finally, shut down everything from the admin CLI, using `admin` as password:
+```shell
+shutdown client
+shutdown server
+```
+
+## Training with GPUs
+
+To demo with Federated Learning using GPUs, make sure your machine has at least 2 GPUs.
+Build XGBoost with the federated learning plugin enabled along with CUDA, but with NCCL
+turned off (see the [README](../../../plugin/federated/README.md)).
+
+Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
+above.
diff --git a/demo/nvflare/custom/controller.py b/demo/nvflare/horizontal/custom/controller.py
similarity index 100%
rename from demo/nvflare/custom/controller.py
rename to demo/nvflare/horizontal/custom/controller.py
diff --git a/demo/nvflare/custom/trainer.py b/demo/nvflare/horizontal/custom/trainer.py
similarity index 100%
rename from demo/nvflare/custom/trainer.py
rename to demo/nvflare/horizontal/custom/trainer.py
diff --git a/demo/nvflare/prepare_data.sh b/demo/nvflare/horizontal/prepare_data.sh
similarity index 88%
rename from demo/nvflare/prepare_data.sh
rename to demo/nvflare/horizontal/prepare_data.sh
index 1c88c65fe..6a32008f8 100755
--- a/demo/nvflare/prepare_data.sh
+++ b/demo/nvflare/horizontal/prepare_data.sh
@@ -15,8 +15,8 @@ split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.train ag
 split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.test agaricus.txt.test-site-
 
 nvflare poc -n 2 --prepare
-mkdir -p /tmp/nvflare/poc/admin/transfer/hello-xgboost
-cp -fr config custom /tmp/nvflare/poc/admin/transfer/hello-xgboost
+mkdir -p /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
+cp -fr config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
 cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
 for id in $(eval echo "{1..$world_size}"); do
   cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$id"/
diff --git a/demo/nvflare/vertical/README.md b/demo/nvflare/vertical/README.md
new file mode 100644
index 000000000..83c3111b6
--- /dev/null
+++ b/demo/nvflare/vertical/README.md
@@ -0,0 +1,59 @@
+# Experimental Support of Vertical Federated XGBoost using NVFlare
+
+This directory contains a demo of Vertical Federated Learning using
+[NVFlare](https://nvidia.github.io/NVFlare/).
+
+## Training with CPU only
+
+To run the demo, first build XGBoost with the federated learning plugin enabled (see the
+[README](../../../plugin/federated/README.md)).
+
+Install NVFlare (note that currently NVFlare only supports Python 3.8):
+```shell
+pip install nvflare
+```
+
+Prepare the data (note that this step will download the HIGGS dataset, which is 2.6GB compressed, and 7.5GB
+uncompressed, so make sure you have enough disk space and are on a fast internet connection):
+```shell
+./prepare_data.sh
+```
+
+Start the NVFlare federated server:
+```shell
+/tmp/nvflare/poc/server/startup/start.sh
+```
+
+In another terminal, start the first worker:
+```shell
+/tmp/nvflare/poc/site-1/startup/start.sh
+```
+
+And the second worker:
+```shell
+/tmp/nvflare/poc/site-2/startup/start.sh
+```
+
+Then start the admin CLI:
+```shell
+/tmp/nvflare/poc/admin/startup/fl_admin.sh
+```
+
+In the admin CLI, run the following command:
+```shell
+submit_job vertical-xgboost
+```
+
+Once the training finishes, the model file should be written into
+`/tmp/nvflare/poc/site-1/run_1/test.model.json` and `/tmp/nvflare/poc/site-2/run_1/test.model.json`
+respectively.
+
+Finally, shut down everything from the admin CLI, using `admin` as password:
+```shell
+shutdown client
+shutdown server
+```
+
+## Training with GPUs
+
+Currently GPUs are not yet supported by vertical federated XGBoost.
diff --git a/demo/nvflare/vertical/custom/controller.py b/demo/nvflare/vertical/custom/controller.py
new file mode 100644
index 000000000..dd3e39f46
--- /dev/null
+++ b/demo/nvflare/vertical/custom/controller.py
@@ -0,0 +1,68 @@
+"""
+Example of training controller with NVFlare
+===========================================
+"""
+import multiprocessing
+
+from nvflare.apis.client import Client
+from nvflare.apis.fl_context import FLContext
+from nvflare.apis.impl.controller import Controller, Task
+from nvflare.apis.shareable import Shareable
+from nvflare.apis.signal import Signal
+from trainer import SupportedTasks
+
+import xgboost.federated
+
+
+class XGBoostController(Controller):
+    def __init__(self, port: int, world_size: int, server_key_path: str,
+                 server_cert_path: str, client_cert_path: str):
+        """Controller for federated XGBoost.
+
+        Args:
+            port: the port for the gRPC server to listen on.
+            world_size: the number of sites.
+            server_key_path: the path to the server key file.
+            server_cert_path: the path to the server certificate file.
+            client_cert_path: the path to the client certificate file.
+        """
+        super().__init__()
+        self._port = port
+        self._world_size = world_size
+        self._server_key_path = server_key_path
+        self._server_cert_path = server_cert_path
+        self._client_cert_path = client_cert_path
+        self._server = None
+
+    def start_controller(self, fl_ctx: FLContext):
+        self._server = multiprocessing.Process(
+            target=xgboost.federated.run_federated_server,
+            args=(self._port, self._world_size, self._server_key_path,
+                  self._server_cert_path, self._client_cert_path))
+        self._server.start()
+
+    def stop_controller(self, fl_ctx: FLContext):
+        if self._server:
+            self._server.terminate()
+
+    def process_result_of_unknown_task(self, client: Client, task_name: str,
+                                       client_task_id: str, result: Shareable,
+                                       fl_ctx: FLContext):
+        self.log_warning(fl_ctx, f"Unknown task: {task_name} from client {client.name}.")
+
+    def control_flow(self, abort_signal: Signal, fl_ctx: FLContext):
+        self.log_info(fl_ctx, "XGBoost training control flow started.")
+        if abort_signal.triggered:
+            return
+        task = Task(name=SupportedTasks.TRAIN, data=Shareable())
+        self.broadcast_and_wait(
+            task=task,
+            min_responses=self._world_size,
+            fl_ctx=fl_ctx,
+            wait_time_after_min_received=1,
+            abort_signal=abort_signal,
+        )
+        if abort_signal.triggered:
+            return
+
+        self.log_info(fl_ctx, "XGBoost training control flow finished.")
diff --git a/demo/nvflare/vertical/custom/trainer.py b/demo/nvflare/vertical/custom/trainer.py
new file mode 100644
index 000000000..cd420129c
--- /dev/null
+++ b/demo/nvflare/vertical/custom/trainer.py
@@ -0,0 +1,97 @@
+import os
+
+from nvflare.apis.executor import Executor
+from nvflare.apis.fl_constant import FLContextKey, ReturnCode
+from nvflare.apis.fl_context import FLContext
+from nvflare.apis.shareable import Shareable, make_reply
+from nvflare.apis.signal import Signal
+
+import xgboost as xgb
+from xgboost import callback
+
+
+class SupportedTasks:  # task names understood by XGBoostTrainer.execute()
+    TRAIN = "train"
+
+
+class XGBoostTrainer(Executor):
+    def __init__(self, server_address: str, world_size: int, server_cert_path: str,
+                 client_key_path: str, client_cert_path: str):
+        """Trainer for federated XGBoost.
+
+        Args:
+            server_address: address for the gRPC server to connect to.
+            world_size: the number of sites.
+            server_cert_path: the path to the server certificate file.
+            client_key_path: the path to the client key file.
+            client_cert_path: the path to the client certificate file.
+        """
+        super().__init__()
+        self._server_address = server_address
+        self._world_size = world_size
+        self._server_cert_path = server_cert_path
+        self._client_key_path = client_key_path
+        self._client_cert_path = client_cert_path
+
+    def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
+                abort_signal: Signal) -> Shareable:
+        """Run the requested task and reply with a ReturnCode describing the outcome."""
+        self.log_info(fl_ctx, f"Executing {task_name}")
+        try:
+            if task_name == SupportedTasks.TRAIN:
+                self._do_training(fl_ctx)
+                return make_reply(ReturnCode.OK)
+            self.log_error(fl_ctx, f"{task_name} is not a supported task.")
+            return make_reply(ReturnCode.TASK_UNKNOWN)
+        except Exception as e:
+            # Exception (not BaseException) so KeyboardInterrupt/SystemExit propagate.
+            self.log_exception(fl_ctx, f"Task {task_name} failed. Exception: {e}")
+            return make_reply(ReturnCode.EXECUTION_EXCEPTION)
+
+    def _do_training(self, fl_ctx: FLContext):
+        """Train on this site's data via the federated communicator and save the model."""
+        client_name = fl_ctx.get_prop(FLContextKey.CLIENT_NAME)
+        # The name is parsed as "<prefix>-<N>" with N starting at 1; rank is zero-based.
+        rank = int(client_name.split('-')[1]) - 1
+        communicator_env = {
+            'xgboost_communicator': 'federated',
+            'federated_server_address': self._server_address,
+            'federated_world_size': self._world_size,
+            'federated_rank': rank,
+            'federated_server_cert': self._server_cert_path,
+            'federated_client_key': self._client_key_path,
+            'federated_client_cert': self._client_cert_path
+        }
+        with xgb.collective.CommunicatorContext(**communicator_env):
+            # Load file, file will not be sharded in federated mode.
+            # Only the rank-0 file is read with a label column (column 0).
+            label = '&label_column=0' if rank == 0 else ''
+            dtrain = xgb.DMatrix(f'higgs.train.csv?format=csv{label}', data_split_mode=1)
+            dtest = xgb.DMatrix(f'higgs.test.csv?format=csv{label}', data_split_mode=1)
+
+            # specify parameters via map
+            param = {
+                'validate_parameters': True,
+                'eta': 0.1,
+                'gamma': 1.0,
+                'max_depth': 8,
+                'min_child_weight': 100,
+                'tree_method': 'approx',
+                'grow_policy': 'depthwise',
+                'objective': 'binary:logistic',
+                'eval_metric': 'auc',
+            }
+
+            # specify validations set to watch performance
+            watchlist = [(dtest, "eval"), (dtrain, "train")]
+            # number of boosting rounds
+            num_round = 10
+
+            bst = xgb.train(param, dtrain, num_round, evals=watchlist, early_stopping_rounds=2)
+
+            # Save the model into the run directory of the current NVFlare run.
+            workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT)
+            run_number = fl_ctx.get_prop(FLContextKey.CURRENT_RUN)
+            run_dir = workspace.get_run_dir(run_number)
+            bst.save_model(os.path.join(run_dir, "higgs.model.federated.vertical.json"))
+            xgb.collective.communicator_print("Finished training\n")
diff --git a/demo/nvflare/vertical/prepare_data.sh b/demo/nvflare/vertical/prepare_data.sh
new file mode 100755
index 000000000..86ec3dfa2
--- /dev/null
+++ b/demo/nvflare/vertical/prepare_data.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+# Prepare certificates, data and an NVFlare PoC workspace for the vertical demo.
+set -e
+
+rm -fr ./*.pem /tmp/nvflare/poc  # start clean: old certificates and PoC workspace
+
+world_size=2  # number of federated sites; also passed to "nvflare poc" below
+
+# Generate server and client certificates.
+openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout server-key.pem -out server-cert.pem -subj "/C=US/CN=localhost"
+openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout client-key.pem -out client-cert.pem -subj "/C=US/CN=localhost"
+
+# Download HIGGS dataset.
+if [ -f "HIGGS.csv" ]; then
+  echo "HIGGS.csv exists, skipping download."
+else
+  echo "Downloading HIGGS dataset."
+  wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz
+  gunzip HIGGS.csv.gz
+fi
+
+# Split into train/test.
+if [[ -f higgs.train.csv && -f higgs.test.csv ]]; then
+  echo "higgs.train.csv and higgs.test.csv exist, skipping split."
+else
+  echo "Splitting HIGGS dataset into train/test."
+  head -n 10450000 HIGGS.csv > higgs.train.csv
+  tail -n 550000 HIGGS.csv > higgs.test.csv
+fi
+
+# Split train and test files by column to simulate a federated environment.
+site_files=(higgs.{train,test}.csv-site-*)
+if [ ${#site_files[@]} -eq $((world_size*2)) ]; then
+  echo "Site files exist, skipping split."
+else
+  echo "Splitting train/test into site files."
+  total_cols=28  # feature columns only; the label is one extra column (column 1)
+  cols=$((total_cols/world_size))
+  echo "Columns per site: $cols"
+  for (( site=1; site<=world_size; site++ )); do
+    if (( site == 1 )); then
+      start=$((cols*(site-1)+1))
+    else
+      start=$((cols*(site-1)+2))
+    fi
+    if (( site == world_size )); then
+      end=$((total_cols+1))
+    else
+      end=$((cols*site+1))
+    fi
+    echo "Site $site, columns $start-$end"
+    cut -d, -f${start}-${end} higgs.train.csv > higgs.train.csv-site-"${site}"
+    cut -d, -f${start}-${end} higgs.test.csv > higgs.test.csv-site-"${site}"
+  done
+fi
+
+nvflare poc -n "${world_size}" --prepare  # one PoC site per federated participant
+mkdir -p /tmp/nvflare/poc/admin/transfer/vertical-xgboost
+cp -fr config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost
+cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
+for (( site=1; site<=world_size; site++ )); do
+  cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"${site}"/
+  ln -s "${PWD}"/higgs.train.csv-site-"${site}" /tmp/nvflare/poc/site-"${site}"/higgs.train.csv
+  ln -s "${PWD}"/higgs.test.csv-site-"${site}" /tmp/nvflare/poc/site-"${site}"/higgs.test.csv
+done
diff --git a/dev/release-artifacts.py b/dev/release-artifacts.py
index 18c317a91..eab64ff0c 100644
--- a/dev/release-artifacts.py
+++ b/dev/release-artifacts.py
@@ -105,7 +105,7 @@ def make_pysrc_wheel(release: str, outdir: str) -> None:
         os.mkdir(dist)
 
     with DirectoryExcursion(os.path.join(ROOT, "python-package")):
-        subprocess.check_call(["python", "setup.py", "sdist"])
+        subprocess.check_call(["python", "-m", "build", "--sdist"])
         src = os.path.join(DIST, f"xgboost-{release}.tar.gz")
         subprocess.check_call(["twine", "check", src])
         shutil.move(src, os.path.join(dist, f"xgboost-{release}.tar.gz"))
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index b159ef172..e24d67282 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.8
+# Doxyfile 1.9.1
 
 # This file describes the settings to be used by the documentation system
 # doxygen (www.doxygen.org) for a project.
@@ -17,11 +17,11 @@
 # Project related configuration options
 #---------------------------------------------------------------------------
 
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
 # The default value is: UTF-8.
 
 DOXYFILE_ENCODING      = UTF-8
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING      = UTF-8
 # title of most generated pages and in a few other places.
 # The default value is: My Project.
 
-PROJECT_NAME           = "xgboost"
+PROJECT_NAME           = xgboost
 
 # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
 # could be handy for archiving the generated documentation or if some version
@@ -46,10 +46,10 @@ PROJECT_NUMBER         = @XGBOOST_VERSION@
 
 PROJECT_BRIEF          =
 
-# With the PROJECT_LOGO tag one can specify an logo or icon that is included in
-# the documentation. The maximum height of the logo should not exceed 55 pixels
-# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo
-# to the output directory.
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
 
 PROJECT_LOGO           =
 
@@ -60,7 +60,7 @@ PROJECT_LOGO           =
 
 OUTPUT_DIRECTORY       = @PROJECT_BINARY_DIR@/doc_doxygen
 
-# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub-
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
 # directories (in 2 levels) under the output directory of each output format and
 # will distribute the generated files over these directories. Enabling this
 # option can be useful when feeding doxygen a huge amount of source files, where
@@ -76,7 +76,7 @@ CREATE_SUBDIRS         = NO
 # U+3044.
 # The default value is: NO.
 
-#ALLOW_UNICODE_NAMES    = NO
+ALLOW_UNICODE_NAMES    = NO
 
 # The OUTPUT_LANGUAGE tag is used to specify the language in which all
 # documentation generated by doxygen is written. Doxygen will use this
@@ -93,14 +93,22 @@ CREATE_SUBDIRS         = NO
 
 OUTPUT_LANGUAGE        = English
 
-# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member
+# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all generated output in the proper direction.
+# Possible values are: None, LTR, RTL and Context.
+# The default value is: None.
+
+OUTPUT_TEXT_DIRECTION  = None
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
 # descriptions after the members that are listed in the file and class
 # documentation (similar to Javadoc). Set to NO to disable this.
 # The default value is: YES.
 
 BRIEF_MEMBER_DESC      = YES
 
-# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
 # description of a member or function before the detailed description
 #
 # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
@@ -135,7 +143,7 @@ ALWAYS_DETAILED_SEC    = NO
 
 INLINE_INHERITED_MEMB  = NO
 
-# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
 # before files name in the file list and in the header files. If set to NO the
 # shortest path that makes the file name unique will be used
 # The default value is: YES.
@@ -179,6 +187,16 @@ SHORT_NAMES            = NO
 
 JAVADOC_AUTOBRIEF      = NO
 
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER         = NO
+
 # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
 # line (until the first dot) of a Qt-style comment as the brief description. If
 # set to NO, the Qt-style will behave just like regular Qt-style comments (thus
@@ -199,15 +217,23 @@ QT_AUTOBRIEF           = NO
 
 MULTILINE_CPP_IS_BRIEF = NO
 
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO,
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks are shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING       = YES
+
 # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
 # documentation from any documented member that it re-implements.
 # The default value is: YES.
 
 INHERIT_DOCS           = YES
 
-# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a
-# new page for each member. If set to NO, the documentation of a member will be
-# part of the file/class/namespace that contains it.
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
 # The default value is: NO.
 
 SEPARATE_MEMBER_PAGES  = NO
@@ -226,16 +252,15 @@ TAB_SIZE               = 8
 # will allow you to put the command \sideeffect (or @sideeffect) in the
 # documentation, which will result in a user-defined paragraph with heading
 # "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
+# newlines (in the resulting output). You can put ^^ in the value part of an
+# alias to insert a newline as if a physical newline was in the original file.
+# When you need a literal { or } or , in the value part of an alias you have to
+# escape them by means of a backslash (\). This can lead to conflicts with the
+# commands \{ and \}; for these it is advised to use the versions @{ and @} or
+# a double escape (\\{ and \\}).
 
 ALIASES                =
 
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              =
-
 # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
 # only. Doxygen will then generate output that is more tailored for C. For
 # instance, some of the names that are used will be different. The list of all
@@ -264,42 +289,63 @@ OPTIMIZE_FOR_FORTRAN   = NO
 
 OPTIMIZE_OUTPUT_VHDL   = NO
 
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE  = NO
+
 # Doxygen selects the parser to use depending on the extension of the files it
 # parses. With this tag you can assign which parser to use for a given
 # extension. Doxygen has a built-in mapping, but you can override or extend it
 # using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL,
+# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
 #
-# Note For files without extension you can use no_extension as a placeholder.
+# Note: For files without extension you can use no_extension as a placeholder.
 #
 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note: see also the list of default file extension mappings.
 
 EXTENSION_MAPPING      =
 
 # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
 # according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
 # The output of markdown processing is further processed by doxygen, so you can
 # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
 # case of backward compatibilities issues.
 # The default value is: YES.
 
-#MARKDOWN_SUPPORT       = YES
+MARKDOWN_SUPPORT       = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS   = 5
 
 # When enabled doxygen tries to link words that correspond to documented
 # classes, or namespaces to their corresponding documentation. Such a link can
-# be prevented in individual cases by by putting a % sign in front of the word
-# or globally by setting AUTOLINK_SUPPORT to NO.
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
 # The default value is: YES.
 
-#AUTOLINK_SUPPORT       = YES
+AUTOLINK_SUPPORT       = YES
 
 # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
 # to include (a tag file for) the STL sources as input, then you should set this
@@ -318,7 +364,7 @@ BUILTIN_STL_SUPPORT    = NO
 CPP_CLI_SUPPORT        = NO
 
 # Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
 # will parse them like normal C++ but will assume all classes use public instead
 # of private inheritance when no explicit protection keyword is present.
 # The default value is: NO.
@@ -336,13 +382,20 @@ SIP_SUPPORT            = NO
 IDL_PROPERTY_SUPPORT   = YES
 
 # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES, then doxygen will reuse the documentation of the first
+# tag is set to YES then doxygen will reuse the documentation of the first
 # member in the group (if any) for the other members of the group. By default
 # all members of a group must be documented explicitly.
 # The default value is: NO.
 
 DISTRIBUTE_GROUP_DOC   = NO
 
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
 # Set the SUBGROUPING tag to YES to allow class member groups of the same type
 # (for instance a group of public functions) to be put as a subgroup of that
 # type (e.g. under the Public Functions section). Set it to NO to prevent
@@ -397,11 +450,24 @@ TYPEDEF_HIDES_STRUCT   = NO
 
 LOOKUP_CACHE_SIZE      = 0
 
+# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will base this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which effectively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS       = 1
+
 #---------------------------------------------------------------------------
 # Build related configuration options
 #---------------------------------------------------------------------------
 
-# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
 # documentation are documented, even if no documentation was available. Private
 # class members and static file members will be hidden unless the
 # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
@@ -411,35 +477,41 @@ LOOKUP_CACHE_SIZE      = 0
 
 EXTRACT_ALL            = YES
 
-# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
 # be included in the documentation.
 # The default value is: NO.
 
 EXTRACT_PRIVATE        = NO
 
-# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL   = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
 # scope will be included in the documentation.
 # The default value is: NO.
 
-#EXTRACT_PACKAGE        = NO
+EXTRACT_PACKAGE        = NO
 
-# If the EXTRACT_STATIC tag is set to YES all static members of a file will be
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
 # included in the documentation.
 # The default value is: NO.
 
 EXTRACT_STATIC         = NO
 
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined
-# locally in source files will be included in the documentation. If set to NO
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
 # only classes defined in header files are included. Does not have any effect
 # for Java sources.
 # The default value is: YES.
 
 EXTRACT_LOCAL_CLASSES  = YES
 
-# This flag is only useful for Objective-C code. When set to YES local methods,
+# This flag is only useful for Objective-C code. If set to YES, local methods,
 # which are defined in the implementation section but not in the interface are
-# included in the documentation. If set to NO only methods in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
 # included.
 # The default value is: NO.
 
@@ -454,6 +526,13 @@ EXTRACT_LOCAL_METHODS  = NO
 
 EXTRACT_ANON_NSPACES   = NO
 
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
 # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
 # undocumented members inside documented classes or files. If set to NO these
 # members will be included in the various overviews, but no documentation
@@ -464,21 +543,21 @@ HIDE_UNDOC_MEMBERS     = NO
 
 # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
 # undocumented classes that are normally visible in the class hierarchy. If set
-# to NO these classes will be included in the various overviews. This option has
-# no effect if EXTRACT_ALL is enabled.
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
 # The default value is: NO.
 
 HIDE_UNDOC_CLASSES     = NO
 
 # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO these declarations will be
-# included in the documentation.
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
 # The default value is: NO.
 
 HIDE_FRIEND_COMPOUNDS  = NO
 
 # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
-# documentation blocks found inside the body of a function. If set to NO these
+# documentation blocks found inside the body of a function. If set to NO, these
 # blocks will be appended to the function's detailed documentation block.
 # The default value is: NO.
 
@@ -491,22 +570,36 @@ HIDE_IN_BODY_DOCS      = NO
 
 INTERNAL_DOCS          = NO
 
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
 # The default value is: system dependent.
 
 CASE_SENSE_NAMES       = YES
 
 # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
-# their full class and namespace scopes in the documentation. If set to YES the
+# their full class and namespace scopes in the documentation. If set to YES, the
 # scope will be hidden.
 # The default value is: NO.
 
 HIDE_SCOPE_NAMES       = NO
 
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
 # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
 # the files that are included by a file in the documentation of that file.
 # The default value is: YES.
@@ -518,7 +611,7 @@ SHOW_INCLUDE_FILES     = YES
 # which file to include in order to use the member.
 # The default value is: NO.
 
-#SHOW_GROUPED_MEMB_INC  = NO
+SHOW_GROUPED_MEMB_INC  = NO
 
 # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
 # files with double quotes in the documentation rather than with sharp brackets.
@@ -534,14 +627,14 @@ INLINE_INFO            = YES
 
 # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
 # (detailed) documentation of file and class members alphabetically by member
-# name. If set to NO the members will appear in declaration order.
+# name. If set to NO, the members will appear in declaration order.
 # The default value is: YES.
 
 SORT_MEMBER_DOCS       = YES
 
 # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
 # descriptions of file, namespace and class members alphabetically by member
-# name. If set to NO the members will appear in declaration order. Note that
+# name. If set to NO, the members will appear in declaration order. Note that
 # this will also influence the order of the classes in the class list.
 # The default value is: NO.
 
@@ -586,27 +679,25 @@ SORT_BY_SCOPE_NAME     = NO
 
 STRICT_PROTO_MATCHING  = NO
 
-# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the
-# todo list. This list is created by putting \todo commands in the
-# documentation.
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
 # The default value is: YES.
 
 GENERATE_TODOLIST      = YES
 
-# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the
-# test list. This list is created by putting \test commands in the
-# documentation.
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
 # The default value is: YES.
 
 GENERATE_TESTLIST      = YES
 
-# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
 # list. This list is created by putting \bug commands in the documentation.
 # The default value is: YES.
 
 GENERATE_BUGLIST       = YES
 
-# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO)
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
 # the deprecated list. This list is created by putting \deprecated commands in
 # the documentation.
 # The default value is: YES.
@@ -631,8 +722,8 @@ ENABLED_SECTIONS       =
 MAX_INITIALIZER_LINES  = 30
 
 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
-# the bottom of the documentation of classes and structs. If set to YES the list
-# will mention the files that were used to generate the documentation.
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
 # The default value is: YES.
 
 SHOW_USED_FILES        = YES
@@ -677,7 +768,7 @@ LAYOUT_FILE            =
 # The CITE_BIB_FILES tag can be used to specify one or more bib files containing
 # the reference definitions. This must be a list of .bib files. The .bib
 # extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
 # For LaTeX the style of the bibliography can be controlled using
 # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
 # search path. See also \cite for info how to create references.
@@ -696,7 +787,7 @@ CITE_BIB_FILES         =
 QUIET                  = NO
 
 # The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
 # this implies that the warnings are on.
 #
 # Tip: Turn warnings on while writing the documentation.
@@ -704,7 +795,7 @@ QUIET                  = NO
 
 WARNINGS               = YES
 
-# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
 # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
 # will automatically be disabled.
 # The default value is: YES.
@@ -721,12 +812,22 @@ WARN_IF_DOC_ERROR      = YES
 
 # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
 # are documented, but have no documentation for their parameters or return
-# value. If set to NO doxygen will only warn about wrong or incomplete parameter
-# documentation, but not about the absence of documentation.
+# value. If set to NO, doxygen will only warn about wrong or incomplete
+# parameter documentation, but not about the absence of documentation. If
+# EXTRACT_ALL is set to YES then this flag will automatically be disabled.
 # The default value is: NO.
 
 WARN_NO_PARAMDOC       = YES
 
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
+# The default value is: NO.
+
+WARN_AS_ERROR          = NO
+
 # The WARN_FORMAT tag determines the format of the warning messages that doxygen
 # can produce. The string should contain the $file, $line, and $text tags, which
 # will be replaced by the file and line number from which the warning originated
@@ -750,7 +851,7 @@ WARN_LOGFILE           =
 # The INPUT tag is used to specify the files and/or directories that contain
 # documented source files. You may enter file names like myfile.cpp or
 # directories like /usr/src/myproject. Separate the files or directories with
-# spaces.
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING.
 # Note: If this tag is empty the current directory is searched.
 
 INPUT                  = @PROJECT_SOURCE_DIR@/include
@@ -758,20 +859,29 @@ INPUT                  = @PROJECT_SOURCE_DIR@/include
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
 # libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
 # The default value is: UTF-8.
 
 INPUT_ENCODING         = UTF-8
 
 # If the value of the INPUT tag contains directories, you can use the
 # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank the
-# following patterns are tested:*.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii,
-# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp,
-# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown,
-# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
-# *.qsf, *.as and *.js.
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
+# If left blank the following patterns are tested: *.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
+# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment),
+# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl,
+# *.ucf, *.qsf and *.ice.
 
 FILE_PATTERNS          = *.h
 
@@ -858,6 +968,10 @@ IMAGE_PATH             =
 # Note that the filter must not add or remove lines; it is applied before the
 # code is scanned, but not when the output code is generated. If lines are added
 # or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 INPUT_FILTER           =
 
@@ -867,11 +981,15 @@ INPUT_FILTER           =
 # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
 # filters are used. If the FILTER_PATTERNS tag is empty or if none of the
 # patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 FILTER_PATTERNS        =
 
 # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER ) will also be used to filter the input files that are used for
+# INPUT_FILTER) will also be used to filter the input files that are used for
 # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
 # The default value is: NO.
 
@@ -890,7 +1008,7 @@ FILTER_SOURCE_PATTERNS =
 # (index.html). This can be useful if you have a project on for instance GitHub
 # and want to reuse the introduction page also for the doxygen output.
 
-#USE_MDFILE_AS_MAINPAGE =
+USE_MDFILE_AS_MAINPAGE =
 
 #---------------------------------------------------------------------------
 # Configuration options related to source browsing
@@ -919,7 +1037,7 @@ INLINE_SOURCES         = NO
 STRIP_CODE_COMMENTS    = YES
 
 # If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
+# entity all documented functions referencing it will be listed.
 # The default value is: NO.
 
 REFERENCED_BY_RELATION = NO
@@ -931,7 +1049,7 @@ REFERENCED_BY_RELATION = NO
 REFERENCES_RELATION    = NO
 
 # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
-# to YES, then the hyperlinks from functions in REFERENCES_RELATION and
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
 # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
 # link to the documentation.
 # The default value is: YES.
@@ -946,17 +1064,17 @@ REFERENCES_LINK_SOURCE = YES
 # The default value is: YES.
 # This tag requires that the tag SOURCE_BROWSER is set to YES.
 
-#SOURCE_TOOLTIPS        = YES
+SOURCE_TOOLTIPS        = YES
 
 # If the USE_HTAGS tag is set to YES then the references to source code will
 # point to the HTML generated by the htags(1) tool instead of doxygen built-in
 # source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
+# (see https://www.gnu.org/software/global/global.html). You will need version
 # 4.8.6 or higher.
 #
 # To use it do the following:
 # - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
 # - Make sure the INPUT points to the root of the source tree
 # - Run doxygen as normal
 #
@@ -978,16 +1096,22 @@ USE_HTAGS              = NO
 
 VERBATIM_HEADERS       = YES
 
-# If the CLANG_ASSISTED_PARSING tag is set to YES, then doxygen will use the
-# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
-# cost of reduced performance. This can be particularly helpful with template
-# rich C++ code for which doxygen's built-in parser lacks the necessary type
-# information.
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see:
+# http://clang.llvm.org/) for more accurate parsing at the cost of reduced
+# performance. This can be particularly helpful with template rich C++ code for
+# which doxygen's built-in parser lacks the necessary type information.
 # Note: The availability of this option depends on whether or not doxygen was
-# compiled with the --with-libclang option.
+# generated with the -Duse_libclang=ON option for CMake.
 # The default value is: NO.
 
-#CLANG_ASSISTED_PARSING = NO
+CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled and the CLANG_ADD_INC_PATHS tag is set to
+# YES then doxygen will add the directory of each input to the include path.
+# The default value is: YES.
+
+CLANG_ADD_INC_PATHS    = YES
 
 # If clang assisted parsing is enabled you can provide the compiler with command
 # line options that you would normally use when invoking the compiler. Note that
@@ -995,7 +1119,20 @@ VERBATIM_HEADERS       = YES
 # specified with INPUT and INCLUDE_PATH.
 # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
 
-#CLANG_OPTIONS          =
+CLANG_OPTIONS          =
+
+# If clang assisted parsing is enabled you can provide the clang parser with the
+# path to the directory containing a file called compile_commands.json. This
+# file is the compilation database (see:
+# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the
+# options used when the source files were built. This is equivalent to
+# specifying the -p option to a clang tool, such as clang-check. These options
+# will then be passed to the parser. Any options specified with CLANG_OPTIONS
+# will be added as well.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+
+CLANG_DATABASE_PATH    =
 
 #---------------------------------------------------------------------------
 # Configuration options related to the alphabetical class index
@@ -1008,13 +1145,6 @@ VERBATIM_HEADERS       = YES
 
 ALPHABETICAL_INDEX     = YES
 
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
 # In case all classes in a project start with a common prefix, all classes will
 # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
 # can be used to specify a prefix (or a list of prefixes) that should be ignored
@@ -1027,7 +1157,7 @@ IGNORE_PREFIX          =
 # Configuration options related to the HTML output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
 # The default value is: YES.
 
 GENERATE_HTML          = YES
@@ -1093,14 +1223,14 @@ HTML_STYLESHEET        =
 # cascading style sheets that are included after the standard style sheets
 # created by doxygen. Using this option one can overrule certain style aspects.
 # This is preferred over using HTML_STYLESHEET since it does not replace the
-# standard style sheet and is therefor more robust against future updates.
+# standard style sheet and is therefore more robust against future updates.
 # Doxygen will copy the style sheet files to the output directory.
-# Note: The order of the extra stylesheet files is of importance (e.g. the last
-# stylesheet in the list overrules the setting of the previous ones in the
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
 # list). For an example see the documentation.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-#HTML_EXTRA_STYLESHEET  =
+HTML_EXTRA_STYLESHEET  =
 
 # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
 # other source files which should be copied to the HTML output directory. Note
@@ -1113,9 +1243,9 @@ HTML_STYLESHEET        =
 HTML_EXTRA_FILES       =
 
 # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
-# will adjust the colors in the stylesheet and background images according to
+# will adjust the colors in the style sheet and background images according to
 # this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
 # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
 # purple, and 360 is red again.
 # Minimum value: 0, maximum value: 359, default value: 220.
@@ -1144,12 +1274,24 @@ HTML_COLORSTYLE_GAMMA  = 80
 
 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
 # page will contain the date and time when the page was generated. Setting this
-# to NO can help when comparing the output of multiple runs.
-# The default value is: YES.
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
 HTML_TIMESTAMP         = YES
 
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consist of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS     = YES
+
 # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
 # documentation will contain sections that can be hidden and shown after the
 # page has loaded.
@@ -1169,17 +1311,18 @@ HTML_DYNAMIC_SECTIONS  = NO
 # Minimum value: 0, maximum value: 9999, default value: 100.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-#HTML_INDEX_NUM_ENTRIES = 100
+HTML_INDEX_NUM_ENTRIES = 100
 
 # If the GENERATE_DOCSET tag is set to YES, additional index files will be
 # generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
 # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
@@ -1218,8 +1361,8 @@ DOCSET_PUBLISHER_NAME  = Publisher
 # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
 # additional HTML index files: index.hhp, index.hhc, and index.hhk. The
 # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
+# (see:
+# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows.
 #
 # The HTML Help Workshop contains a compiler that can convert all HTML output
 # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
@@ -1241,28 +1384,28 @@ GENERATE_HTMLHELP      = NO
 CHM_FILE               =
 
 # The HHC_LOCATION tag can be used to specify the location (absolute path
-# including file name) of the HTML help compiler ( hhc.exe). If non-empty
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
 # doxygen will try to run the HTML help compiler on the generated index.hhp.
 # The file has to be specified with full path.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 HHC_LOCATION           =
 
-# The GENERATE_CHI flag controls if a separate .chi index file is generated (
-# YES) or that it should be included in the master .chm file ( NO).
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the main .chm file (NO).
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 GENERATE_CHI           = NO
 
-# The CHM_INDEX_ENCODING is used to encode HtmlHelp index ( hhk), content ( hhc)
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
 # and project file content.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 CHM_INDEX_ENCODING     =
 
-# The BINARY_TOC flag controls whether a binary table of contents is generated (
-# YES) or a normal table of contents ( NO) in the .chm file. Furthermore it
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
 # enables the Previous and Next buttons.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
@@ -1294,7 +1437,8 @@ QCH_FILE               =
 
 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
 # Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
 # The default value is: org.doxygen.Project.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1302,8 +1446,8 @@ QHP_NAMESPACE          = org.doxygen.Project
 
 # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
 # Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
 # The default value is: doc.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1311,30 +1455,30 @@ QHP_VIRTUAL_FOLDER     = doc
 
 # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
 # filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_NAME   =
 
 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
 # custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_ATTRS  =
 
 # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
 # project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_SECT_FILTER_ATTRS  =
 
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHG_LOCATION           =
@@ -1376,7 +1520,7 @@ DISABLE_INDEX          = NO
 # index structure (just like the one that is generated for HTML Help). For this
 # to work a browser that supports JavaScript, DHTML, CSS and frames is required
 # (i.e. any modern browser). Windows users are probably better off using the
-# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
 # further fine-tune the look of the index. As an example, the default style
 # sheet generated by doxygen has an example that shows how to put an image at
 # the root of the tree instead of the PROJECT_NAME. Since the tree basically has
@@ -1404,13 +1548,24 @@ ENUM_VALUES_PER_LINE   = 4
 
 TREEVIEW_WIDTH         = 250
 
-# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
 # external symbols imported via tag files in a separate window.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
 EXT_LINKS_IN_WINDOW    = NO
 
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT    = png
+
 # Use this tag to change the font size of LaTeX formulas included as images in
 # the HTML documentation. When you change the font size after a successful
 # doxygen run you need to manually remove any form_*.png images from the HTML
@@ -1420,7 +1575,7 @@ EXT_LINKS_IN_WINDOW    = NO
 
 FORMULA_FONTSIZE       = 10
 
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
 # generated for formulas are transparent PNGs. Transparent PNGs are not
 # supported properly for IE 6.0, but are supported on all modern browsers.
 #
@@ -1431,9 +1586,15 @@ FORMULA_FONTSIZE       = 10
 
 FORMULA_TRANSPARENT    = YES
 
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE      =
+
 # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
-# instead of using prerendered bitmaps. Use this if you do not have LaTeX
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
 # installed or if you want to formulas look prettier in the HTML output. When
 # enabled you may also need to install MathJax separately and configure the path
 # to it using the MATHJAX_RELPATH option.
@@ -1444,13 +1605,13 @@ USE_MATHJAX            = NO
 
 # When MathJax is enabled you can set the default output format to be used for
 # the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details.
 # Possible values are: HTML-CSS (which is slower, but has the best
 # compatibility), NativeMML (i.e. MathML) and SVG.
 # The default value is: HTML-CSS.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
-#MATHJAX_FORMAT         = HTML-CSS
+MATHJAX_FORMAT         = HTML-CSS
 
 # When MathJax is enabled you need to specify the location relative to the HTML
 # output directory using the MATHJAX_RELPATH option. The destination directory
@@ -1459,8 +1620,8 @@ USE_MATHJAX            = NO
 # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
 # Content Delivery Network so you can quickly see the result without installing
 # MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# MathJax from https://www.mathjax.org before deployment.
+# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_RELPATH        = http://www.mathjax.org/mathjax
@@ -1474,11 +1635,12 @@ MATHJAX_EXTENSIONS     =
 
 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
 # example see the documentation.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
-#MATHJAX_CODEFILE       =
+MATHJAX_CODEFILE       =
 
 # When the SEARCHENGINE tag is enabled doxygen will generate a search box for
 # the HTML output. The underlying search engine uses javascript and DHTML and
@@ -1502,7 +1664,7 @@ MATHJAX_EXTENSIONS     =
 SEARCHENGINE           = YES
 
 # When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
+# implemented using a web server instead of a web client using JavaScript. There
 # are two flavors of web server based searching depending on the EXTERNAL_SEARCH
 # setting. When disabled, doxygen will generate a PHP script for searching and
 # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
@@ -1519,26 +1681,28 @@ SERVER_BASED_SEARCH    = NO
 # external search engine pointed to by the SEARCHENGINE_URL option to obtain the
 # search results.
 #
-# Doxygen ships with an example indexer ( doxyindexer) and search engine
+# Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
+# Xapian (see:
+# https://xapian.org/).
 #
 # See the section "External Indexing and Searching" for details.
 # The default value is: NO.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
-#EXTERNAL_SEARCH        = NO
+EXTERNAL_SEARCH        = NO
 
 # The SEARCHENGINE_URL should point to a search engine hosted by a web server
 # which will return the search results when EXTERNAL_SEARCH is enabled.
 #
-# Doxygen ships with an example indexer ( doxyindexer) and search engine
+# Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
-#SEARCHENGINE_URL       =
+SEARCHENGINE_URL       =
 
 # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
 # search data is written to a file for indexing by an external tool. With the
@@ -1546,7 +1710,7 @@ SERVER_BASED_SEARCH    = NO
 # The default file is: searchdata.xml.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
-#SEARCHDATA_FILE        = searchdata.xml
+SEARCHDATA_FILE        = searchdata.xml
 
 # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
 # EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
@@ -1554,7 +1718,7 @@ SERVER_BASED_SEARCH    = NO
 # projects and redirect the results back to the right project.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
-#EXTERNAL_SEARCH_ID     =
+EXTERNAL_SEARCH_ID     =
 
 # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
 # projects other than the one defined by this configuration file, but that are
@@ -1564,13 +1728,13 @@ SERVER_BASED_SEARCH    = NO
 # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
-#EXTRA_SEARCH_MAPPINGS  =
+EXTRA_SEARCH_MAPPINGS  =
 
 #---------------------------------------------------------------------------
 # Configuration options related to the LaTeX output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_LATEX tag is set to YES doxygen will generate LaTeX output.
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
 # The default value is: YES.
 
 GENERATE_LATEX         = YES
@@ -1586,22 +1750,36 @@ LATEX_OUTPUT           = latex
 # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
 # invoked.
 #
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
+# Note that when USE_PDFLATEX is not enabled the default is latex; when
+# USE_PDFLATEX is enabled the default is pdflatex, and if latex is chosen in
+# the latter case it is overridden by pdflatex. For specific output languages
+# the default may have been set differently; this depends on the implementation
+# of the output language.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_CMD_NAME         = latex
 
 # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
 # index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
 # The default file is: makeindex.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 MAKEINDEX_CMD_NAME     = makeindex
 
-# If the COMPACT_LATEX tag is set to YES doxygen generates more compact LaTeX
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD    = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
 # documents. This may be useful for small projects and may help to save some
 # trees in general.
 # The default value is: NO.
@@ -1619,9 +1797,12 @@ COMPACT_LATEX          = NO
 PAPER_TYPE             = a4
 
 # The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
-# that should be included in the LaTeX output. To get the times font for
-# instance you can specify
-# EXTRA_PACKAGES=times
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify:
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
 # If left blank no extra packages will be included.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
@@ -1636,9 +1817,9 @@ EXTRA_PACKAGES         =
 # Note: Only use a user-defined header if you know what you are doing! The
 # following commands have a special meaning inside the header: $title,
 # $datetime, $date, $doxygenversion, $projectname, $projectnumber,
-# $projectbrief, $projectlogo. Doxygen will replace $title with the empy string,
-# for the replacement values of the other commands the user is refered to
-# HTML_HEADER.
+# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
+# string, for the replacement values of the other commands the user is referred
+# to HTML_HEADER.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_HEADER           =
@@ -1654,13 +1835,24 @@ LATEX_HEADER           =
 
 LATEX_FOOTER           =
 
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
 # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
 # other source files which should be copied to the LATEX_OUTPUT output
 # directory. Note that the files will be copied as-is; there are no commands or
 # markers available.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
-#LATEX_EXTRA_FILES      =
+LATEX_EXTRA_FILES      =
 
 # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
 # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
@@ -1671,9 +1863,11 @@ LATEX_FOOTER           =
 
 PDF_HYPERLINKS         = YES
 
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES to get a
-# higher quality PDF documentation.
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES to get higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
 # The default value is: YES.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
@@ -1707,17 +1901,33 @@ LATEX_SOURCE_CODE      = NO
 
 # The LATEX_BIB_STYLE tag can be used to specify the style to use for the
 # bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
 # The default value is: plain.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_BIB_STYLE        = plain
 
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP        = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY  =
+
 #---------------------------------------------------------------------------
 # Configuration options related to the RTF output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_RTF tag is set to YES doxygen will generate RTF output. The
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
 # RTF output is optimized for Word 97 and may not look too pretty with other RTF
 # readers/editors.
 # The default value is: NO.
@@ -1732,7 +1942,7 @@ GENERATE_RTF           = NO
 
 RTF_OUTPUT             = rtf
 
-# If the COMPACT_RTF tag is set to YES doxygen generates more compact RTF
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
 # documents. This may be useful for small projects and may help to save some
 # trees in general.
 # The default value is: NO.
@@ -1752,9 +1962,9 @@ COMPACT_RTF            = NO
 
 RTF_HYPERLINKS         = NO
 
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
 #
 # See also section "Doxygen usage" for information on how to generate the
 # default style sheet that doxygen normally uses.
@@ -1763,17 +1973,27 @@ RTF_HYPERLINKS         = NO
 RTF_STYLESHEET_FILE    =
 
 # Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
 # This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_EXTENSIONS_FILE    =
 
+# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
+# with syntax highlighting in the RTF output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_SOURCE_CODE        = NO
+
 #---------------------------------------------------------------------------
 # Configuration options related to the man page output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_MAN tag is set to YES doxygen will generate man pages for
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
 # classes and files.
 # The default value is: NO.
 
@@ -1802,7 +2022,7 @@ MAN_EXTENSION          = .3
 # MAN_EXTENSION with the initial . removed.
 # This tag requires that the tag GENERATE_MAN is set to YES.
 
-#MAN_SUBDIR             =
+MAN_SUBDIR             =
 
 # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
 # will generate one additional man file for each entity documented in the real
@@ -1817,7 +2037,7 @@ MAN_LINKS              = NO
 # Configuration options related to the XML output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_XML tag is set to YES doxygen will generate an XML file that
+# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
 # captures the structure of the code including all documentation.
 # The default value is: NO.
 
@@ -1831,7 +2051,7 @@ GENERATE_XML           = YES
 
 XML_OUTPUT             = xml
 
-# If the XML_PROGRAMLISTING tag is set to YES doxygen will dump the program
+# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
 # listings (including syntax highlighting and cross-referencing information) to
 # the XML output. Note that enabling this will significantly increase the size
 # of the XML output.
@@ -1840,15 +2060,22 @@ XML_OUTPUT             = xml
 
 XML_PROGRAMLISTING     = YES
 
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
 #---------------------------------------------------------------------------
 # Configuration options related to the DOCBOOK output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_DOCBOOK tag is set to YES doxygen will generate Docbook files
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
 # that can be used to generate PDF.
 # The default value is: NO.
 
-#GENERATE_DOCBOOK       = NO
+GENERATE_DOCBOOK       = NO
 
 # The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
 # If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
@@ -1856,25 +2083,25 @@ XML_PROGRAMLISTING     = YES
 # The default directory is: docbook.
 # This tag requires that the tag GENERATE_DOCBOOK is set to YES.
 
-#DOCBOOK_OUTPUT         = docbook
+DOCBOOK_OUTPUT         = docbook
 
-# If the DOCBOOK_PROGRAMLISTING tag is set to YES doxygen will include the
+# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
 # program listings (including syntax highlighting and cross-referencing
 # information) to the DOCBOOK output. Note that enabling this will significantly
 # increase the size of the DOCBOOK output.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_DOCBOOK is set to YES.
 
-#DOCBOOK_PROGRAMLISTING = NO
+DOCBOOK_PROGRAMLISTING = NO
 
 #---------------------------------------------------------------------------
 # Configuration options for the AutoGen Definitions output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_AUTOGEN_DEF tag is set to YES doxygen will generate an AutoGen
-# Definitions (see http://autogen.sf.net) file that captures the structure of
-# the code including all documentation. Note that this feature is still
-# experimental and incomplete at the moment.
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
 # The default value is: NO.
 
 GENERATE_AUTOGEN_DEF   = NO
@@ -1883,7 +2110,7 @@ GENERATE_AUTOGEN_DEF   = NO
 # Configuration options related to the Perl module output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_PERLMOD tag is set to YES doxygen will generate a Perl module
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
 # file that captures the structure of the code including all documentation.
 #
 # Note that this feature is still experimental and incomplete at the moment.
@@ -1891,7 +2118,7 @@ GENERATE_AUTOGEN_DEF   = NO
 
 GENERATE_PERLMOD       = NO
 
-# If the PERLMOD_LATEX tag is set to YES doxygen will generate the necessary
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
 # Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
 # output from the Perl module output.
 # The default value is: NO.
@@ -1899,9 +2126,9 @@ GENERATE_PERLMOD       = NO
 
 PERLMOD_LATEX          = NO
 
-# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be nicely
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
 # formatted so it can be parsed by a human reader. This is useful if you want to
-# understand what is going on. On the other hand, if this tag is set to NO the
+# understand what is going on. On the other hand, if this tag is set to NO, the
 # size of the Perl module output will be much smaller and Perl will parse it
 # just the same.
 # The default value is: YES.
@@ -1921,14 +2148,14 @@ PERLMOD_MAKEVAR_PREFIX =
 # Configuration options related to the preprocessor
 #---------------------------------------------------------------------------
 
-# If the ENABLE_PREPROCESSING tag is set to YES doxygen will evaluate all
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
 # C-preprocessor directives found in the sources and include files.
 # The default value is: YES.
 
 ENABLE_PREPROCESSING   = YES
 
-# If the MACRO_EXPANSION tag is set to YES doxygen will expand all macro names
-# in the source code. If set to NO only conditional compilation will be
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
 # performed. Macro expansion can be done in a controlled way by setting
 # EXPAND_ONLY_PREDEF to YES.
 # The default value is: NO.
@@ -1944,7 +2171,7 @@ MACRO_EXPANSION        = YES
 
 EXPAND_ONLY_PREDEF     = YES
 
-# If the SEARCH_INCLUDES tag is set to YES the includes files in the
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
 # INCLUDE_PATH will be searched if a #include is found.
 # The default value is: YES.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
@@ -1975,8 +2202,8 @@ INCLUDE_FILE_PATTERNS  =
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 PREDEFINED             = DMLC_USE_CXX11 \
-                         "XGB_DLL=" \
-                         "XGB_EXTERN_C="
+                         XGB_DLL= \
+                         XGB_EXTERN_C=
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
 # tag can be used to specify a list of macro names that should be expanded. The
@@ -2022,37 +2249,32 @@ TAGFILES               =
 
 GENERATE_TAGFILE       =
 
-# If the ALLEXTERNALS tag is set to YES all external class will be listed in the
-# class index. If set to NO only the inherited external classes will be listed.
+# If the ALLEXTERNALS tag is set to YES, all external classes will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
 # The default value is: NO.
 
 ALLEXTERNALS           = NO
 
-# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed in
-# the modules index. If set to NO, only the current project's groups will be
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
 # listed.
 # The default value is: YES.
 
 EXTERNAL_GROUPS        = YES
 
-# If the EXTERNAL_PAGES tag is set to YES all external pages will be listed in
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
 # the related pages index. If set to NO, only the current project's pages will
 # be listed.
 # The default value is: YES.
 
-#EXTERNAL_PAGES         = YES
-
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH              = /usr/bin/perl
+EXTERNAL_PAGES         = YES
 
 #---------------------------------------------------------------------------
 # Configuration options related to the dot tool
 #---------------------------------------------------------------------------
 
-# If the CLASS_DIAGRAMS tag is set to YES doxygen will generate a class diagram
+# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
 # (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
 # NO turns the diagrams off. Note that this option also works with HAVE_DOT
 # disabled, but it is recommended to install and use dot, since it yields more
@@ -2061,23 +2283,14 @@ PERL_PATH              = /usr/bin/perl
 
 CLASS_DIAGRAMS         = YES
 
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            =
-
 # You can include diagrams made with dia in doxygen documentation. Doxygen will
 # then run dia to produce the diagram and insert it in the documentation. The
 # DIA_PATH tag allows you to specify the directory where the dia binary resides.
 # If left empty dia is assumed to be found in the default search path.
 
-#DIA_PATH               =
+DIA_PATH               =
 
-# If set to YES, the inheritance and collaboration graphs will hide inheritance
+# If set to YES the inheritance and collaboration graphs will hide inheritance
 # and usage relations if the target is undocumented or is not a class.
 # The default value is: YES.
 
@@ -2150,7 +2363,7 @@ COLLABORATION_GRAPH    = YES
 
 GROUP_GRAPHS           = YES
 
-# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
 # collaboration diagrams in a style similar to the OMG's Unified Modeling
 # Language.
 # The default value is: NO.
@@ -2167,9 +2380,31 @@ UML_LOOK               = YES
 # but if the number exceeds 15, the total amount of fields shown is limited to
 # 10.
 # Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS        = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will be wrapped across multiple lines. Some heuristics are
+# applied to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-#UML_LIMIT_NUM_FIELDS   = 10
+DOT_WRAP_THRESHOLD     = 17
 
 # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
 # collaboration graphs will show the relations between templates and their
@@ -2202,7 +2437,8 @@ INCLUDED_BY_GRAPH      = YES
 #
 # Note that enabling this option will significantly increase the time of a run.
 # So in most cases it will be better to enable call graphs for selected
-# functions only using the \callgraph command.
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
 # The default value is: NO.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
@@ -2213,7 +2449,8 @@ CALL_GRAPH             = NO
 #
 # Note that enabling this option will significantly increase the time of a run.
 # So in most cases it will be better to enable caller graphs for selected
-# functions only using the \callergraph command.
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
 # The default value is: NO.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
@@ -2236,13 +2473,17 @@ GRAPHICAL_HIERARCHY    = YES
 DIRECTORY_GRAPH        = YES
 
 # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
-# generated by dot.
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
 # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
 # to make the SVG files visible in IE 9+ (other browsers do not have this
 # requirement).
 # Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
 # png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
-# gif:cairo:gd, gif:gd, gif:gd:gd and svg.
+# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
 # The default value is: png.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
@@ -2283,16 +2524,25 @@ MSCFILE_DIRS           =
 # contain dia files that are included in the documentation (see the \diafile
 # command).
 
-#DIAFILE_DIRS           =
+DIAFILE_DIRS           =
 
 # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
 # path where java can find the plantuml.jar file. If left blank, it is assumed
 # PlantUML is not used or called during a preprocessing step. Doxygen will
 # generate a warning when it encounters a \startuml command in this case and
 # will not generate output for the diagram.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
-#PLANTUML_JAR_PATH      =
+PLANTUML_JAR_PATH      =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE      =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH  =
 
 # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
 # that will be shown in the graph. If the number of nodes in a graph becomes
@@ -2330,7 +2580,7 @@ MAX_DOT_GRAPH_DEPTH    = 0
 
 DOT_TRANSPARENT        = NO
 
-# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
 # files in one run (i.e. multiple -o and -T options on the command line). This
 # makes dot run faster, but since only newer versions of dot (>1.8.10) support
 # this, this feature is disabled by default.
@@ -2347,9 +2597,11 @@ DOT_MULTI_TARGETS      = YES
 
 GENERATE_LEGEND        = YES
 
-# If the DOT_CLEANUP tag is set to YES doxygen will remove the intermediate dot
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
 # files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc and
+# plantuml temporary files.
 # The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_CLEANUP            = YES
diff --git a/doc/build.rst b/doc/build.rst
index 53d9a3209..e78d2d2f4 100644
--- a/doc/build.rst
+++ b/doc/build.rst
@@ -12,6 +12,7 @@ systems.  If the instructions do not work for you, please feel free to ask quest
   Consider installing XGBoost from a pre-built binary, to avoid the trouble of building XGBoost from the source.  Checkout :doc:`Installation Guide </install>`.
 
 .. contents:: Contents
+  :local:
 
 .. _get_source:
 
@@ -152,11 +153,11 @@ On Windows, run CMake as follows:
 
   mkdir build
   cd build
-  cmake .. -G"Visual Studio 14 2015 Win64" -DUSE_CUDA=ON
+  cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON
 
 (Change the ``-G`` option appropriately if you have a different version of Visual Studio installed.)
 
-The above cmake configuration run will create an ``xgboost.sln`` solution file in the build directory. Build this solution in release mode as a x64 build, either from Visual studio or from command line:
+The above cmake configuration run will create an ``xgboost.sln`` solution file in the build directory. Build this solution in Release mode, either from Visual Studio or from the command line:
 
 .. code-block:: bash
 
@@ -176,111 +177,104 @@ Building Python Package with Default Toolchains
 ===============================================
 There are several ways to build and install the package from source:
 
-1. Use Python setuptools directly
+1. Build C++ core with CMake first
 
-  The XGBoost Python package supports most of the setuptools commands, here is a list of tested commands:
+  You can first build the C++ library using CMake as described in :ref:`build_shared_lib`.
+  After compilation, a shared library will appear in the ``lib/`` directory.
+  On Linux distributions, the shared library is ``lib/libxgboost.so``.
+  The install script ``pip install .`` will reuse the shared library instead of compiling
+  it from scratch, making it quite fast to run.
+
+  .. code-block:: console
+
+    $ cd python-package/
+    $ pip install .  # Will re-use lib/libxgboost.so
+
+2. Install the Python package directly
+
+  You can navigate to ``python-package/`` directory and install the Python package directly
+  by running
+
+  .. code-block:: console
+
+    $ cd python-package/
+    $ pip install -v .
+
+  which will compile XGBoost's native (C++) code using default CMake flags.
+  To enable additional compilation options, pass corresponding ``--config-settings``:
+
+  .. code-block:: console
+
+    $ pip install -v . --config-settings use_cuda=True --config-settings use_nccl=True
+
+  The ``--config-settings`` option requires pip 22.1 or later.
+
+  Here are the available options for ``--config-settings``:
+
+  .. literalinclude:: ../python-package/packager/build_config.py
+    :language: python
+    :start-at: @dataclasses.dataclass
+    :end-before: def _set_config_setting(
+
+  ``use_system_libxgboost`` is a special option. See Item 4 below for
+  a detailed description.
+
+  .. note:: Verbose flag recommended
+
+    As ``pip install .`` will build C++ code, it will take a while to complete.
+    To ensure that the build is progressing successfully, we suggest that
+    you add the verbose flag (``-v``) when invoking ``pip install``.
+
+
+3. Editable installation
+
+  To further enable rapid development and iteration, we provide an **editable installation**.
+  In an editable installation, the installed package is simply a symbolic link to your
+  working copy of the XGBoost source code. So every change you make to your source
+  directory will be immediately visible to the Python interpreter. Here is how to
+  install XGBoost as an editable installation:
 
   .. code-block:: bash
 
-    python setup.py install  # Install the XGBoost to your current Python environment.
-    python setup.py build    # Build the Python package.
-    python setup.py build_ext # Build only the C++ core.
-    python setup.py sdist     # Create a source distribution
-    python setup.py bdist     # Create a binary distribution
-    python setup.py bdist_wheel # Create a binary distribution with wheel format
-
-  Running ``python setup.py install`` will compile XGBoost using default CMake flags.  For
-  passing additional compilation options, append the flags to the command.  For example,
-  to enable CUDA acceleration and NCCL (distributed GPU) support:
-
-  .. code-block:: bash
-
-    python setup.py install --use-cuda --use-nccl
-
-  Please refer to ``setup.py`` for a complete list of available options.  Some other
-  options used for development are only available for using CMake directly.  See next
-  section on how to use CMake with setuptools manually.
-
-  You can install the created distribution packages using pip. For example, after running
-  ``sdist`` setuptools command, a tar ball similar to ``xgboost-1.0.0.tar.gz`` will be
-  created under the ``dist`` directory.  Then you can install it by invoking the following
-  command under ``dist`` directory:
-
-  .. code-block:: bash
-
-    # under python-package directory
-    cd dist
-    pip install ./xgboost-1.0.0.tar.gz
-
-
-  For details about these commands, please refer to the official document of `setuptools
-  <https://setuptools.readthedocs.io/en/latest/>`_, or just Google "how to install Python
-  package from source".  XGBoost Python package follows the general convention.
-  Setuptools is usually available with your Python distribution, if not you can install it
-  via system command.  For example on Debian or Ubuntu:
-
-  .. code-block:: bash
-
-    sudo apt-get install python-setuptools
-
-
-  For cleaning up the directory after running above commands, ``python setup.py clean`` is
-  not sufficient.  After copying out the build result, simply running ``git clean -xdf``
-  under ``python-package`` is an efficient way to remove generated cache files.  If you
-  find weird behaviors in Python build or running linter, it might be caused by those
-  cached files.
-
-  For using develop command (editable installation), see next section.
-
-  .. code-block::
-
-    python setup.py develop   # Create a editable installation.
-    pip install -e .          # Same as above, but carried out by pip.
-
-
-2. Build C++ core with CMake first
-
-  This is mostly for C++ developers who don't want to go through the hooks in Python
-  setuptools.  You can build C++ library directly using CMake as described in above
-  sections.  After compilation, a shared object (or called dynamic linked library, jargon
-  depending on your platform) will appear in XGBoost's source tree under ``lib/``
-  directory.  On Linux distributions it's ``lib/libxgboost.so``.  From there all Python
-  setuptools commands will reuse that shared object instead of compiling it again.  This
-  is especially convenient if you are using the editable installation, where the installed
-  package is simply a link to the source tree.  We can perform rapid testing during
-  development.  Here is a simple bash script does that:
-
-  .. code-block:: bash
-
-    # Under xgboost source tree.
+    # Under xgboost source directory
     mkdir build
     cd build
-    cmake ..
-    make -j$(nproc)
+    # Build shared library libxgboost.so
+    cmake .. -GNinja
+    ninja
+    # Install as editable installation
     cd ../python-package
-    pip install -e .  # or equivalently python setup.py develop
+    pip install -e .
 
-3. Use ``libxgboost.so`` on system path.
+4. Use ``libxgboost.so`` on system path.
 
-  This is for distributing xgboost in a language independent manner, where
-  ``libxgboost.so`` is separately packaged with Python package.  Assuming `libxgboost.so`
-  is already presented in system library path, which can be queried via:
+  This option is useful for package managers that wish to separately package
+  ``libxgboost.so`` and the XGBoost Python package. For example, Conda
+  publishes ``libxgboost`` (for the shared library) and ``py-xgboost``
+  (for the Python package).
+
+  To use this option, first make sure that ``libxgboost.so`` exists in the system library path:
 
   .. code-block:: python
 
     import sys
-    import os
-    os.path.join(sys.prefix, 'lib')
+    import pathlib
+    libpath = pathlib.Path(sys.prefix).joinpath("lib", "libxgboost.so")
+    assert libpath.exists()
 
-  Then one only needs to provide an user option when installing Python package to reuse the
-  shared object in system path:
+  Then pass ``use_system_libxgboost=True`` option to ``pip install``:
 
   .. code-block:: bash
 
-    cd xgboost/python-package
-    python setup.py install --use-system-libxgboost
+    cd python-package
+    pip install . --config-settings use_system_libxgboost=True
 
 
+.. note::
+
+  See :doc:`contrib/python_packaging` for instructions on packaging
+  and distributing XGBoost as Python distributions.
+
 .. _python_mingw:
 
 Building Python Package for Windows with MinGW-w64 (Advanced)
@@ -297,7 +291,7 @@ So you may want to build XGBoost with GCC own your own risk. This presents some
 2. ``-O3`` is OK.
 3. ``-mtune=native`` is also OK.
 4. Don't use ``-march=native`` gcc flag. Using it causes the Python interpreter to crash if the DLL was actually used.
-5. You may need to provide the lib with the runtime libs. If ``mingw32/bin`` is not in ``PATH``, build a wheel (``python setup.py bdist_wheel``), open it with an archiver and put the needed dlls to the directory where ``xgboost.dll`` is situated. Then you can install the wheel with ``pip``.
+5. You may need to provide the lib with the runtime libs. If ``mingw32/bin`` is not in ``PATH``, build a wheel (``pip wheel``), open it with an archiver and put the needed dlls to the directory where ``xgboost.dll`` is situated. Then you can install the wheel with ``pip``.
 
 ******************************
 Building R Package From Source
diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst
index 6073e646a..76e06de35 100644
--- a/doc/contrib/ci.rst
+++ b/doc/contrib/ci.rst
@@ -35,8 +35,9 @@ calls ``cibuildwheel`` to build the wheel. The ``cibuildwheel`` is a library tha
 suitable Python environment for each OS and processor target. Since we don't have Apple Silion
 machine in GitHub Actions, cross-compilation is needed; ``cibuildwheel`` takes care of the complex
 task of cross-compiling a Python wheel. (Note that ``cibuildwheel`` will call
-``setup.py bdist_wheel``. Since XGBoost has a native library component, ``setup.py`` contains
-a glue code to call CMake and a C++ compiler to build the native library on the fly.)
+``pip wheel``. Since XGBoost has a native library component, we created a customized build
+backend that hooks into ``pip``. The customized backend contains the glue code to compile the native
+library on the fly.)
 
 *********************************************************
 Reproduce CI testing environments using Docker containers
diff --git a/doc/contrib/index.rst b/doc/contrib/index.rst
index c9c5f93a2..6a36cb108 100644
--- a/doc/contrib/index.rst
+++ b/doc/contrib/index.rst
@@ -23,6 +23,7 @@ Here are guidelines for contributing to various aspect of the XGBoost project:
   Community Guideline <community>
   donate
   coding_guide
+  python_packaging
   unit_tests
   Docs and Examples <docs>
   git_guide
diff --git a/doc/contrib/python_packaging.rst b/doc/contrib/python_packaging.rst
new file mode 100644
index 000000000..5cf085685
--- /dev/null
+++ b/doc/contrib/python_packaging.rst
@@ -0,0 +1,83 @@
+###########################################
+Notes on packaging XGBoost's Python package
+###########################################
+
+
+.. contents:: Contents
+  :local:
+
+.. _packaging_python_xgboost:
+
+***************************************************
+How to build binary wheels and source distributions
+***************************************************
+
+Wheels and source distributions (sdist for short) are the two main
+mechanisms for packaging and distributing Python packages.
+
+* A **source distribution** (sdist) is a tarball (``.tar.gz`` extension) that
+  contains the source code.
+* A **wheel** is a ZIP-compressed archive (with ``.whl`` extension)
+  representing a *built* distribution. Unlike an sdist, a wheel can contain
+  compiled components. The compiled components are compiled prior to distribution,
+  making it more convenient for end-users to install a wheel. Wheels containing
+  compiled components are referred to as **binary wheels**.
+
+See `Python Packaging User Guide <https://packaging.python.org/en/latest/>`_
+to learn more about how Python packages in general are packaged and
+distributed.
+
+For the remainder of this document, we will focus on packaging and
+distributing XGBoost.
+
+Building sdists
+===============
+
+In the case of XGBoost, an sdist contains both the Python code as well as
+the C++ code, so that the core part of XGBoost can be compiled into the
+shared library ``libxgboost.so`` [#shared_lib_name]_.
+
+You can obtain an sdist as follows:
+
+.. code-block:: console
+
+  $ python -m build --sdist .
+
+(You'll need to install the ``build`` package first:
+``pip install build`` or ``conda install python-build``.)
+
+Running ``pip install`` with an sdist will launch CMake and a C++ compiler
+to compile the bundled C++ code into ``libxgboost.so``:
+
+.. code-block:: console
+
+  $ pip install -v xgboost-2.0.0.tar.gz  # Add -v to show build progress
+
+Building binary wheels
+======================
+
+You can also build a wheel as follows:
+
+.. code-block:: console
+
+   $ pip wheel --no-deps -v .
+
+Notably, the resulting wheel contains a copy of the shared library
+``libxgboost.so`` [#shared_lib_name]_. The wheel is a **binary wheel**,
+since it contains a compiled binary.
+
+
+Running ``pip install`` with the binary wheel will extract the content of
+the wheel into the current Python environment. Since the wheel already
+contains a pre-built copy of ``libxgboost.so``, it does not have to be
+built at the time of install. So ``pip install`` with the binary wheel
+completes quickly:
+
+.. code-block:: console
+  
+  $ pip install xgboost-2.0.0-py3-none-linux_x86_64.whl  # Completes quickly
+
+.. rubric:: Footnotes
+
+.. [#shared_lib_name] The name of the shared library file will differ
+   depending on the operating system in use. See :ref:`build_shared_lib`.
diff --git a/doc/install.rst b/doc/install.rst
index 03daf465f..0e155f647 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -16,15 +16,28 @@ Stable Release
 Python
 ------
 
-Pre-built binary are uploaded to PyPI (Python Package Index) for each release.  Supported platforms are Linux (x86_64, aarch64), Windows (x86_64) and MacOS (x86_64, Apple Silicon).
+Pre-built binary wheels are uploaded to PyPI (Python Package Index) for each release. Supported platforms are Linux (x86_64, aarch64), Windows (x86_64) and MacOS (x86_64, Apple Silicon).
 
 .. code-block:: bash
 
+  # Pip 21.3+ is required
   pip install xgboost
 
 
 You might need to run the command with ``--user`` flag or use ``virtualenv`` if you run
-into permission errors.  Python pre-built binary capability for each platform:
+into permission errors.
+
+.. note:: Windows users need to install Visual C++ Redistributable
+
+  XGBoost requires DLLs from `Visual C++ Redistributable
+  <https://www.microsoft.com/en-us/download/details.aspx?id=48145>`_
+  in order to function, so make sure to install it. Exception: If
+  you have Visual Studio installed, you already have access to
+  necessary libraries and thus don't need to install Visual C++
+  Redistributable.
+
+
+Capabilities of binary wheels for each platform:
 
 .. |tick| unicode:: U+2714
 .. |cross| unicode:: U+2718
diff --git a/doc/jvm/index.rst b/doc/jvm/index.rst
index 6721908f9..2b476781b 100644
--- a/doc/jvm/index.rst
+++ b/doc/jvm/index.rst
@@ -41,3 +41,7 @@ Contents
   XGBoost4J Scala API <scaladocs/xgboost4j/index>
   XGBoost4J-Spark Scala API <scaladocs/xgboost4j-spark/index>
   XGBoost4J-Flink Scala API <scaladocs/xgboost4j-flink/index>
+
+.. note::
+
+  Please note that the Flink interface is still under construction.
diff --git a/doc/model.schema b/doc/model.schema
index b9e2da305..103d9d9e4 100644
--- a/doc/model.schema
+++ b/doc/model.schema
@@ -219,6 +219,16 @@
         "num_pairsample": { "type": "string" },
         "fix_list_weight": { "type": "string" }
       }
+    },
+    "lambdarank_param": {
+      "type": "object",
+      "properties": {
+        "lambdarank_num_pair_per_sample": { "type": "string" },
+        "lambdarank_pair_method": { "type": "string" },
+        "lambdarank_unbiased": {"type": "string" },
+        "lambdarank_bias_norm": {"type": "string" },
+        "ndcg_exp_gain": {"type": "string"}
+      }
     }
   },
   "type": "object",
@@ -477,22 +487,22 @@
               "type": "object",
               "properties": {
                 "name": { "const": "rank:pairwise" },
-                "lambda_rank_param": { "$ref": "#/definitions/lambda_rank_param"}
+                "lambdarank_param": { "$ref": "#/definitions/lambdarank_param"}
               },
               "required": [
                 "name",
-                "lambda_rank_param"
+                "lambdarank_param"
               ]
             },
             {
               "type": "object",
               "properties": {
                 "name": { "const": "rank:ndcg" },
-                "lambda_rank_param": { "$ref": "#/definitions/lambda_rank_param"}
+                "lambdarank_param": { "$ref": "#/definitions/lambdarank_param"}
               },
               "required": [
                 "name",
-                "lambda_rank_param"
+                "lambdarank_param"
               ]
             },
             {
diff --git a/doc/parameter.rst b/doc/parameter.rst
index c070e7018..8c7cadcdc 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -233,7 +233,7 @@ Parameters for Tree Booster
   .. note:: This parameter is working-in-progress.
 
   - The strategy used for training multi-target models, including multi-target regression
-  and multi-class classification. See :doc:`/tutorials/multioutput` for more information.
+    and multi-class classification. See :doc:`/tutorials/multioutput` for more information.
 
     - ``one_output_per_tree``: One model for each target.
     - ``multi_output_tree``:  Use multi-target trees.
@@ -380,9 +380,9 @@ Specify the learning task and the corresponding learning objective. The objectiv
     See :doc:`/tutorials/aft_survival_analysis` for details.
   - ``multi:softmax``: set XGBoost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes)
   - ``multi:softprob``: same as softmax, but output a vector of ``ndata * nclass``, which can be further reshaped to ``ndata * nclass`` matrix. The result contains predicted probability of each data point belonging to each class.
-  - ``rank:pairwise``: Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized
-  - ``rank:ndcg``: Use LambdaMART to perform list-wise ranking where `Normalized Discounted Cumulative Gain (NDCG) <http://en.wikipedia.org/wiki/NDCG>`_ is maximized
-  - ``rank:map``: Use LambdaMART to perform list-wise ranking where `Mean Average Precision (MAP) <http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision>`_ is maximized
+  - ``rank:ndcg``: Use LambdaMART to perform pair-wise ranking where `Normalized Discounted Cumulative Gain (NDCG) <http://en.wikipedia.org/wiki/NDCG>`_ is maximized. This objective supports position debiasing for click data.
+  - ``rank:map``: Use LambdaMART to perform pair-wise ranking where `Mean Average Precision (MAP) <http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision>`_ is maximized
+  - ``rank:pairwise``: Use LambdaRank to perform pair-wise ranking using the `ranknet` objective.
   - ``reg:gamma``: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be `gamma-distributed <https://en.wikipedia.org/wiki/Gamma_distribution#Occurrence_and_applications>`_.
   - ``reg:tweedie``: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be `Tweedie-distributed <https://en.wikipedia.org/wiki/Tweedie_distribution#Occurrence_and_applications>`_.
 
@@ -395,8 +395,9 @@ Specify the learning task and the corresponding learning objective. The objectiv
 
 * ``eval_metric`` [default according to objective]
 
-  - Evaluation metrics for validation data, a default metric will be assigned according to objective (rmse for regression, and logloss for classification, mean average precision for ranking)
-  - User can add multiple evaluation metrics. Python users: remember to pass the metrics in as list of parameters pairs instead of map, so that latter ``eval_metric`` won't override previous one
+  - Evaluation metrics for validation data, a default metric will be assigned according to objective (rmse for regression, and logloss for classification, `mean average precision` for ``rank:map``, etc.)
+  - User can add multiple evaluation metrics. Python users: remember to pass the metrics in as a list of parameter pairs instead of a map, so that a later ``eval_metric`` won't override previous ones
+
   - The choices are listed below:
 
     - ``rmse``: `root mean square error <http://en.wikipedia.org/wiki/Root_mean_square_error>`_
@@ -480,6 +481,36 @@ Parameter for using AFT Survival Loss (``survival:aft``) and Negative Log Likeli
 
 * ``aft_loss_distribution``: Probability Density Function, ``normal``, ``logistic``, or ``extreme``.
 
+.. _ltr-param:
+
+Parameters for learning to rank (``rank:ndcg``, ``rank:map``, ``rank:pairwise``)
+================================================================================
+
+These are parameters specific to the learning to rank task. See :doc:`Learning to Rank </tutorials/learning_to_rank>` for an in-depth explanation.
+
+* ``lambdarank_pair_method`` [default = ``mean``]
+
+  How to construct pairs for pair-wise learning.
+
+  - ``mean``: Sample ``lambdarank_num_pair_per_sample`` pairs for each document in the query list.
+  - ``topk``: Focus on top-``lambdarank_num_pair_per_sample`` documents. Construct :math:`|query|` pairs for each document at the top-``lambdarank_num_pair_per_sample`` ranked by the model.
+
+* ``lambdarank_num_pair_per_sample`` [range = :math:`[1, \infty)`]
+
+  It specifies the number of pairs sampled for each document when pair method is ``mean``, or the truncation level for queries when the pair method is ``topk``. For example, to train with ``ndcg@6``, set ``lambdarank_num_pair_per_sample`` to :math:`6` and ``lambdarank_pair_method`` to ``topk``.
+
+* ``lambdarank_unbiased`` [default = ``false``]
+
+  Specify whether we need to debias input click data.
+
+* ``lambdarank_bias_norm`` [default = 2.0]
+
+  :math:`L_p` normalization for position debiasing, default is :math:`L_2`. Only relevant when ``lambdarank_unbiased`` is set to true.
+
+* ``ndcg_exp_gain`` [default = ``true``]
+
+  Whether we should use exponential gain function for ``NDCG``. There are two forms of gain function for ``NDCG``, one is using relevance value directly while the other is using :math:`2^{rel} - 1` to emphasize retrieving relevant documents. When ``ndcg_exp_gain`` is true (the default), relevance degree cannot be greater than 31.
+
 ***********************
 Command Line Parameters
 ***********************
diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst
index c33a90c81..888683975 100644
--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -23,7 +23,7 @@ Requirements
 
 Dask can be installed using either pip or conda (see the dask `installation
 documentation <https://docs.dask.org/en/latest/install.html>`_ for more information).  For
-accelerating XGBoost with GPUs, `dask-cuda <https://github.com/rapidsai/dask-cuda>`_ is
+accelerating XGBoost with GPUs, `dask-cuda <https://github.com/rapidsai/dask-cuda>`__ is
 recommended for creating GPU clusters.
 
 
diff --git a/doc/tutorials/external_memory.rst b/doc/tutorials/external_memory.rst
index 3b96cfe92..006d63b43 100644
--- a/doc/tutorials/external_memory.rst
+++ b/doc/tutorials/external_memory.rst
@@ -77,7 +77,7 @@ The external memory version takes in the following `URI <https://en.wikipedia.or
 
 .. code-block:: none
 
-  filename#cacheprefix
+  filename?format=libsvm#cacheprefix
 
 The ``filename`` is the normal path to LIBSVM format file you want to load in, and
 ``cacheprefix`` is a path to a cache file that XGBoost will use for caching preprocessed
@@ -97,13 +97,13 @@ you have a dataset stored in a file similar to ``agaricus.txt.train`` with LIBSV
 
 .. code-block:: python
 
-  dtrain = DMatrix('../data/agaricus.txt.train#dtrain.cache')
+  dtrain = DMatrix('../data/agaricus.txt.train?format=libsvm#dtrain.cache')
 
 XGBoost will first load ``agaricus.txt.train`` in, preprocess it, then write to a new file named
 ``dtrain.cache`` as an on disk cache for storing preprocessed data in an internal binary format.  For
 more notes about text input formats, see :doc:`/tutorials/input_format`.
 
-For CLI version, simply add the cache suffix, e.g. ``"../data/agaricus.txt.train#dtrain.cache"``.
+For CLI version, simply add the cache suffix, e.g. ``"../data/agaricus.txt.train?format=libsvm#dtrain.cache"``.
 
 
 **********************************
diff --git a/doc/tutorials/input_format.rst b/doc/tutorials/input_format.rst
index 923a82650..ab0158e13 100644
--- a/doc/tutorials/input_format.rst
+++ b/doc/tutorials/input_format.rst
@@ -2,10 +2,15 @@
 Text Input Format of DMatrix
 ############################
 
+.. _basic_input_format:
+
+Here we will briefly describe the text input formats for XGBoost. However, for users with access to a supported language environment like Python or R, it's recommended to use data parsers from that ecosystem instead. For instance, see :py:func:`sklearn.datasets.load_svmlight_file`.
+
 ******************
 Basic Input Format
 ******************
-XGBoost currently supports two text formats for ingesting data: LIBSVM and CSV. The rest of this document will describe the LIBSVM format. (See `this Wikipedia article <https://en.wikipedia.org/wiki/Comma-separated_values>`_ for a description of the CSV format.).  Please be careful that, XGBoost does **not** understand file extensions, nor try to guess the file format, as there is no universal agreement upon file extension of LIBSVM or CSV.  Instead it employs `URI <https://en.wikipedia.org/wiki/Uniform_Resource_Identifier>`_ format for specifying the precise input file type.  For example if you provide a `csv` file ``./data.train.csv`` as input, XGBoost will blindly use the default LIBSVM parser to digest it and generate a parser error.  Instead, users need to provide an URI in the form of ``train.csv?format=csv``.  For external memory input, the URI should of a form similar to ``train.csv?format=csv#dtrain.cache``.  See :ref:`python_data_interface` and :doc:`/tutorials/external_memory` also.
+
+XGBoost currently supports two text formats for ingesting data: LIBSVM and CSV. The rest of this document will describe the LIBSVM format. (See `this Wikipedia article <https://en.wikipedia.org/wiki/Comma-separated_values>`_ for a description of the CSV format.)  Please note that XGBoost does **not** understand file extensions, nor try to guess the file format, as there is no universal agreement upon file extension of LIBSVM or CSV.  Instead it employs `URI <https://en.wikipedia.org/wiki/Uniform_Resource_Identifier>`_ format for specifying the precise input file type.  For example if you provide a `csv` file ``./data.train.csv`` as input, XGBoost will blindly use the default LIBSVM parser to digest it and generate a parser error.  Instead, users need to provide a URI in the form of ``train.csv?format=csv`` or ``train.csv?format=libsvm``.  For external memory input, the URI should be of a form similar to ``train.csv?format=csv#dtrain.cache``.  See :ref:`python_data_interface` and :doc:`/tutorials/external_memory` also.
 
 For training or predicting, XGBoost takes an instance file with the format as below:
 
diff --git a/doc/tutorials/spark_estimator.rst b/doc/tutorials/spark_estimator.rst
index 02ddb60ea..fb69b70e1 100644
--- a/doc/tutorials/spark_estimator.rst
+++ b/doc/tutorials/spark_estimator.rst
@@ -108,8 +108,8 @@ virtualenv and pip:
   python -m venv xgboost_env
   source xgboost_env/bin/activate
   pip install pyarrow pandas venv-pack xgboost
-  # https://rapids.ai/pip.html#install
-  pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
+  # https://docs.rapids.ai/install#pip-install
+  pip install cudf-cu11 --extra-index-url=https://pypi.nvidia.com
   venv-pack -o xgboost_env.tar.gz
 
 With Conda:
@@ -241,7 +241,7 @@ additional spark configurations and dependencies:
     --master spark://<master-ip>:7077 \
     --conf spark.executor.resource.gpu.amount=1 \
     --conf spark.task.resource.gpu.amount=1 \
-    --packages com.nvidia:rapids-4-spark_2.12:22.08.0 \
+    --packages com.nvidia:rapids-4-spark_2.12:23.04.0 \
     --conf spark.plugins=com.nvidia.spark.SQLPlugin \
     --conf spark.sql.execution.arrow.maxRecordsPerBatch=1000000 \
     --archives xgboost_env.tar.gz#environment \
diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index 2233336e9..4b9d37335 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -38,7 +38,7 @@ typedef uint64_t bst_ulong;  // NOLINT(*)
  */
 
 /**
- * @defgroup Library
+ * @defgroup Library Library
  *
  * These functions are used to obtain general information about XGBoost including version,
  * build info and current global configuration.
@@ -112,7 +112,7 @@ XGB_DLL int XGBGetGlobalConfig(char const **out_config);
 /**@}*/
 
 /**
- * @defgroup DMatrix
+ * @defgroup DMatrix DMatrix
  *
  * @brief DMatrix is the baisc data storage for XGBoost used by all XGBoost algorithms
  *        including both training, prediction and explanation. There are a few variants of
@@ -138,7 +138,11 @@ XGB_DLL int XGDMatrixCreateFromFile(const char *fname, int silent, DMatrixHandle
 /*!
  * \brief load a data matrix
  * \param config JSON encoded parameters for DMatrix construction.  Accepted fields are:
- *   - uri: The URI of the input file.
+
+ *   - uri: The URI of the input file. The URI parameter `format` is required when loading text data.
+ *          \verbatim embed:rst:leading-asterisk
+ *            See :doc:`/tutorials/input_format` for more info.
+ *          \endverbatim
  *   - silent (optional): Whether to print message during loading. Default to true.
  *   - data_split_mode (optional): Whether to split by row or column. In distributed mode, the
  *     file is split accordingly; otherwise this is only an indicator on how the file was split
@@ -200,7 +204,7 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data, char const *config, DMatr
  * \return 0 when success, -1 when failure happens
  */
 XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char const *data,
-                                   bst_ulong nrow, char const *c_json_config, DMatrixHandle *out);
+                                   bst_ulong nrow, char const *config, DMatrixHandle *out);
 
 /*!
  * \brief create a matrix content from CSC format
@@ -281,7 +285,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data, char const *
                                                   DMatrixHandle *out);
 
 /**
- * @defgroup Streaming
+ * @defgroup Streaming Streaming
  * @ingroup DMatrix
  *
  * @brief Quantile DMatrix and external memory DMatrix can be created from batches of
@@ -431,7 +435,7 @@ XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLIN
  * - Step 0: Define a data iterator with 2 methods `reset`, and `next`.
  * - Step 1: Create a DMatrix proxy by \ref XGProxyDMatrixCreate and hold the handle.
  * - Step 2: Pass the iterator handle, proxy handle and 2 methods into
- *           `XGDMatrixCreateFromCallback`, along with other parameters encoded as a JSON object.
+ *           \ref XGDMatrixCreateFromCallback, along with other parameters encoded as a JSON object.
  * - Step 3: Call appropriate data setters in `next` functions.
  *
  * \param iter    A handle to external data iterator.
@@ -830,7 +834,7 @@ XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config
 /** @} */  // End of DMatrix
 
 /**
- * @defgroup Booster
+ * @defgroup Booster Booster
  *
  * @brief The `Booster` class is the gradient-boosted model for XGBoost.
  * @{
@@ -953,7 +957,7 @@ XGB_DLL int XGBoosterEvalOneIter(BoosterHandle handle, int iter, DMatrixHandle d
  */
 
 /**
- * @defgroup Prediction
+ * @defgroup Prediction Prediction
  * @ingroup Booster
  *
  * @brief These functions are used for running prediction and explanation algorithms.
@@ -1155,7 +1159,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *v
 
 
 /**
- * @defgroup Serialization
+ * @defgroup Serialization Serialization
  * @ingroup Booster
  *
  * @brief There are multiple ways to serialize a Booster object depending on the use case.
@@ -1490,7 +1494,7 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *config,
 /**@}*/  // End of Booster
 
 /**
- * @defgroup Collective
+ * @defgroup Collective Collective
  *
  * @brief Experimental support for exposing internal communicator in XGBoost.
  *
diff --git a/include/xgboost/context.h b/include/xgboost/context.h
index aaa1e3eb8..f1cd391df 100644
--- a/include/xgboost/context.h
+++ b/include/xgboost/context.h
@@ -50,7 +50,19 @@ struct Context : public XGBoostParameter<Context> {
 
   bool IsCPU() const { return gpu_id == kCpuId; }
   bool IsCUDA() const { return !IsCPU(); }
+
   CUDAContext const* CUDACtx() const;
+  // Make a CUDA context based on the current context.
+  Context MakeCUDA(std::int32_t device = 0) const {
+    Context ctx = *this;
+    ctx.gpu_id = device;
+    return ctx;
+  }
+  Context MakeCPU() const {
+    Context ctx = *this;
+    ctx.gpu_id = kCpuId;
+    return ctx;
+  }
 
   // declare parameters
   DMLC_DECLARE_PARAMETER(Context) {
diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index 4af306859..6305abff8 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright (c) 2015-2022 by XGBoost Contributors
+/**
+ * Copyright 2015-2023 by XGBoost Contributors
  * \file data.h
  * \brief The input data structure of xgboost.
  * \author Tianqi Chen
@@ -196,6 +196,14 @@ class MetaInfo {
    */
   bool IsVerticalFederated() const;
 
+  /*!
+   * \brief A convenient method to check if the MetaInfo should contain labels.
+   *
+   * Normally we assume labels are available everywhere. The only exception is in vertical federated
+   * learning where labels are only available on worker 0.
+   */
+  bool ShouldHaveLabels() const;
+
  private:
   void SetInfoFromHost(Context const& ctx, StringView key, Json arr);
   void SetInfoFromCUDA(Context const& ctx, StringView key, Json arr);
@@ -230,44 +238,72 @@ struct Entry {
   }
 };
 
-/*!
- * \brief Parameters for constructing batches.
+/**
+ * \brief Parameters for constructing histogram index batches.
  */
 struct BatchParam {
-  /*! \brief The GPU device to use. */
-  int gpu_id {-1};
-  /*! \brief Maximum number of bins per feature for histograms. */
+  /**
+   * \brief Maximum number of bins per feature for histograms.
+   */
   bst_bin_t max_bin{0};
-  /*! \brief Hessian, used for sketching with future approx implementation. */
+  /**
+   * \brief Hessian, used for sketching with future approx implementation.
+   */
   common::Span<float> hess;
-  /*! \brief Whether should DMatrix regenerate the batch.  Only used for GHistIndex. */
-  bool regen {false};
-  /*! \brief Parameter used to generate column matrix for hist. */
+  /**
+   * \brief Whether we should force DMatrix to regenerate the batch.  Only used for
+   *        GHistIndex.
+   */
+  bool regen{false};
+  /**
+   * \brief Forbid regenerating the gradient index. Used for internal validation.
+   */
+  bool forbid_regen{false};
+  /**
+   * \brief Parameter used to generate column matrix for hist.
+   */
   double sparse_thresh{std::numeric_limits<double>::quiet_NaN()};
 
+  /**
+   * \brief Exact or others that don't need histogram.
+   */
   BatchParam() = default;
-  // GPU Hist
-  BatchParam(int32_t device, bst_bin_t max_bin)
-      : gpu_id{device}, max_bin{max_bin} {}
-  // Hist
+  /**
+   * \brief Used by the hist tree method.
+   */
   BatchParam(bst_bin_t max_bin, double sparse_thresh)
       : max_bin{max_bin}, sparse_thresh{sparse_thresh} {}
-  // Approx
   /**
-   * \brief Get batch with sketch weighted by hessian.  The batch will be regenerated if
-   *        the span is changed, so caller should keep the span for each iteration.
+   * \brief Used by the approx tree method.
+   *
+   *   Get batch with sketch weighted by hessian.  The batch will be regenerated if the
+   *   span is changed, so caller should keep the span for each iteration.
    */
   BatchParam(bst_bin_t max_bin, common::Span<float> hessian, bool regenerate)
       : max_bin{max_bin}, hess{hessian}, regen{regenerate} {}
 
-  bool operator!=(BatchParam const& other) const {
-    if (hess.empty() && other.hess.empty()) {
-      return gpu_id != other.gpu_id || max_bin != other.max_bin;
-    }
-    return gpu_id != other.gpu_id || max_bin != other.max_bin || hess.data() != other.hess.data();
+  bool ParamNotEqual(BatchParam const& other) const {
+    // Check non-floating parameters.
+    bool cond = max_bin != other.max_bin;
+    // Check sparse thresh.
+    bool l_nan = std::isnan(sparse_thresh);
+    bool r_nan = std::isnan(other.sparse_thresh);
+    bool st_chg = (l_nan != r_nan) || (!l_nan && !r_nan && (sparse_thresh != other.sparse_thresh));
+    cond |= st_chg;
+
+    return cond;
   }
-  bool operator==(BatchParam const& other) const {
-    return !(*this != other);
+  bool Initialized() const { return max_bin != 0; }
+  /**
+   * \brief Make a copy of self for DMatrix to describe how its existing index was generated.
+   */
+  BatchParam MakeCache() const {
+    auto p = *this;
+    // These parameters have nothing to do with how the gradient index was generated in the
+    // first place.
+    p.regen = false;
+    p.forbid_regen = false;
+    return p;
   }
 };
 
@@ -427,7 +463,7 @@ class EllpackPage {
    * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
    * in CSR format.
    */
-  explicit EllpackPage(DMatrix* dmat, const BatchParam& param);
+  explicit EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param);
 
   /*! \brief Destructor. */
   ~EllpackPage();
@@ -543,7 +579,9 @@ class DMatrix {
   template <typename T>
   BatchSet<T> GetBatches();
   template <typename T>
-  BatchSet<T> GetBatches(const BatchParam& param);
+  BatchSet<T> GetBatches(Context const* ctx);
+  template <typename T>
+  BatchSet<T> GetBatches(Context const* ctx, const BatchParam& param);
   template <typename T>
   bool PageExists() const;
 
@@ -558,21 +596,17 @@ class DMatrix {
     return Info().num_nonzero_ == Info().num_row_ * Info().num_col_;
   }
 
-  /*!
+  /**
    * \brief Load DMatrix from URI.
+   *
    * \param uri The URI of input.
    * \param silent Whether print information during loading.
    * \param data_split_mode In distributed mode, split the input according this mode; otherwise,
    *                        it's just an indicator on how the input was split beforehand.
-   * \param file_format The format type of the file, used for dmlc::Parser::Create.
-   *   By default "auto" will be able to load in both local binary file.
-   * \param page_size Page size for external memory.
    * \return The created DMatrix.
    */
-  static DMatrix* Load(const std::string& uri,
-                       bool silent = true,
-                       DataSplitMode data_split_mode = DataSplitMode::kRow,
-                       const std::string& file_format = "auto");
+  static DMatrix* Load(const std::string& uri, bool silent = true,
+                       DataSplitMode data_split_mode = DataSplitMode::kRow);
 
   /**
    * \brief Creates a new DMatrix from an external data adapter.
@@ -654,18 +688,19 @@ class DMatrix {
 
  protected:
   virtual BatchSet<SparsePage> GetRowBatches() = 0;
-  virtual BatchSet<CSCPage> GetColumnBatches() = 0;
-  virtual BatchSet<SortedCSCPage> GetSortedColumnBatches() = 0;
-  virtual BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) = 0;
-  virtual BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam& param) = 0;
-  virtual BatchSet<ExtSparsePage> GetExtBatches(BatchParam const& param) = 0;
+  virtual BatchSet<CSCPage> GetColumnBatches(Context const* ctx) = 0;
+  virtual BatchSet<SortedCSCPage> GetSortedColumnBatches(Context const* ctx) = 0;
+  virtual BatchSet<EllpackPage> GetEllpackBatches(Context const* ctx, BatchParam const& param) = 0;
+  virtual BatchSet<GHistIndexMatrix> GetGradientIndex(Context const* ctx,
+                                                      BatchParam const& param) = 0;
+  virtual BatchSet<ExtSparsePage> GetExtBatches(Context const* ctx, BatchParam const& param) = 0;
 
   virtual bool EllpackExists() const = 0;
   virtual bool GHistIndexExists() const = 0;
   virtual bool SparsePageExists() const = 0;
 };
 
-template<>
+template <>
 inline BatchSet<SparsePage> DMatrix::GetBatches() {
   return GetRowBatches();
 }
@@ -680,34 +715,39 @@ inline bool DMatrix::PageExists<GHistIndexMatrix>() const {
   return this->GHistIndexExists();
 }
 
-template<>
+template <>
 inline bool DMatrix::PageExists<SparsePage>() const {
   return this->SparsePageExists();
 }
 
-template<>
-inline BatchSet<CSCPage> DMatrix::GetBatches() {
-  return GetColumnBatches();
-}
-
-template<>
-inline BatchSet<SortedCSCPage> DMatrix::GetBatches() {
-  return GetSortedColumnBatches();
-}
-
-template<>
-inline BatchSet<EllpackPage> DMatrix::GetBatches(const BatchParam& param) {
-  return GetEllpackBatches(param);
+template <>
+inline BatchSet<SparsePage> DMatrix::GetBatches(Context const*) {
+  return GetRowBatches();
 }
 
 template <>
-inline BatchSet<GHistIndexMatrix> DMatrix::GetBatches(const BatchParam& param) {
-  return GetGradientIndex(param);
+inline BatchSet<CSCPage> DMatrix::GetBatches(Context const* ctx) {
+  return GetColumnBatches(ctx);
 }
 
 template <>
-inline BatchSet<ExtSparsePage> DMatrix::GetBatches() {
-  return GetExtBatches(BatchParam{});
+inline BatchSet<SortedCSCPage> DMatrix::GetBatches(Context const* ctx) {
+  return GetSortedColumnBatches(ctx);
+}
+
+template <>
+inline BatchSet<EllpackPage> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
+  return GetEllpackBatches(ctx, param);
+}
+
+template <>
+inline BatchSet<GHistIndexMatrix> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
+  return GetGradientIndex(ctx, param);
+}
+
+template <>
+inline BatchSet<ExtSparsePage> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
+  return GetExtBatches(ctx, param);
 }
 }  // namespace xgboost
 
diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h
index 61dd94302..393dda59c 100644
--- a/include/xgboost/tree_model.h
+++ b/include/xgboost/tree_model.h
@@ -567,7 +567,7 @@ class RegTree : public Model {
      * \brief drop the trace after fill, must be called after fill.
      * \param inst The sparse instance to drop.
      */
-    void Drop(const SparsePage::Inst& inst);
+    void Drop();
     /*!
      * \brief returns the size of the feature vector
      * \return the size of the feature vector
@@ -807,13 +807,10 @@ inline void RegTree::FVec::Fill(const SparsePage::Inst& inst) {
   has_missing_ = data_.size() != feature_count;
 }
 
-inline void RegTree::FVec::Drop(const SparsePage::Inst& inst) {
-  for (auto const& entry : inst) {
-    if (entry.index >= data_.size()) {
-      continue;
-    }
-    data_[entry.index].flag = -1;
-  }
+inline void RegTree::FVec::Drop() {
+  Entry e{};
+  e.flag = -1;  // NOTE(review): -1 presumably marks a slot as missing; confirm against Entry
+  std::fill_n(data_.data(), data_.size(), e);  // overwrite every slot of data_ with e
+  has_missing_ = true;
 }
 
diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index facb955ce..4903b8f38 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -33,16 +33,16 @@
         <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
         <maven.compiler.source>1.8</maven.compiler.source>
         <maven.compiler.target>1.8</maven.compiler.target>
-        <flink.version>1.8.3</flink.version>
-        <spark.version>3.1.1</spark.version>
-        <scala.version>2.12.8</scala.version>
+        <flink.version>1.17.0</flink.version>
+        <spark.version>3.4.0</spark.version>
+        <scala.version>2.12.17</scala.version>
         <scala.binary.version>2.12</scala.binary.version>
         <hadoop.version>3.3.5</hadoop.version>
         <maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count>
         <log.capi.invocation>OFF</log.capi.invocation>
         <use.cuda>OFF</use.cuda>
-        <cudf.version>22.12.0</cudf.version>
-        <spark.rapids.version>22.12.0</spark.rapids.version>
+        <cudf.version>23.04.0</cudf.version>
+        <spark.rapids.version>23.04.0</spark.rapids.version>
         <cudf.classifier>cuda11</cudf.classifier>
     </properties>
     <repositories>
@@ -374,7 +374,7 @@
             <plugin>
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-checkstyle-plugin</artifactId>
-                <version>3.2.1</version>
+                <version>3.2.2</version>
                 <configuration>
                     <configLocation>checkstyle.xml</configLocation>
                     <failOnViolation>true</failOnViolation>
@@ -450,7 +450,7 @@
         <plugins>
             <plugin>
                 <artifactId>maven-project-info-reports-plugin</artifactId>
-                <version>3.4.2</version>
+                <version>3.4.3</version>
             </plugin>
             <plugin>
                 <groupId>net.alchim31.maven</groupId>
@@ -469,7 +469,7 @@
         <dependency>
             <groupId>com.esotericsoftware</groupId>
             <artifactId>kryo</artifactId>
-            <version>5.4.0</version>
+            <version>5.5.0</version>
         </dependency>
         <dependency>
             <groupId>org.scala-lang</groupId>
@@ -477,11 +477,6 @@
             <version>${scala.version}</version>
             <scope>provided</scope>
         </dependency>
-        <dependency>
-            <groupId>org.scala-lang</groupId>
-            <artifactId>scala-reflect</artifactId>
-            <version>${scala.version}</version>
-        </dependency>
         <dependency>
             <groupId>org.scala-lang</groupId>
             <artifactId>scala-library</artifactId>
@@ -495,13 +490,13 @@
         <dependency>
             <groupId>org.scalatest</groupId>
             <artifactId>scalatest_${scala.binary.version}</artifactId>
-            <version>3.0.8</version>
+            <version>3.2.15</version>
             <scope>test</scope>
         </dependency>
         <dependency>
             <groupId>org.scalactic</groupId>
             <artifactId>scalactic_${scala.binary.version}</artifactId>
-            <version>3.0.8</version>
+            <version>3.2.15</version>
             <scope>test</scope>
         </dependency>
     </dependencies>
diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml
index d08e4f409..40c9c72a4 100644
--- a/jvm-packages/xgboost4j-example/pom.xml
+++ b/jvm-packages/xgboost4j-example/pom.xml
@@ -26,7 +26,7 @@
         <dependency>
             <groupId>ml.dmlc</groupId>
             <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
-            <version>2.0.0-SNAPSHOT</version>
+            <version>${project.version}</version>
         </dependency>
         <dependency>
             <groupId>org.apache.spark</groupId>
@@ -37,12 +37,7 @@
         <dependency>
             <groupId>ml.dmlc</groupId>
             <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
-            <version>2.0.0-SNAPSHOT</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.commons</groupId>
-            <artifactId>commons-lang3</artifactId>
-            <version>3.12.0</version>
+            <version>${project.version}</version>
         </dependency>
     </dependencies>
 </project>
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java
index 7e4fe6806..8a74b74da 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014-2021 by Contributors
+ Copyright (c) 2014-2023 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -62,8 +62,8 @@ public class BasicWalkThrough {
 
   public static void main(String[] args) throws IOException, XGBoostError {
     // load file from text file, also binary buffer generated by xgboost4j
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm");
+    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm");
 
     HashMap<String, Object> params = new HashMap<String, Object>();
     params.put("eta", 1.0);
@@ -112,7 +112,8 @@ public class BasicWalkThrough {
 
     System.out.println("start build dmatrix from csr sparse data ...");
     //build dmatrix from CSR Sparse Matrix
-    DataLoader.CSRSparseData spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train");
+    DataLoader.CSRSparseData spData =
+        DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm");
 
     DMatrix trainMat2 = new DMatrix(spData.rowHeaders, spData.colIndex, spData.data,
                                     DMatrix.SparseType.CSR, 127);
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java
index 7eb9e99f0..fe5db0465 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java
@@ -32,8 +32,8 @@ public class BoostFromPrediction {
     System.out.println("start running example to start from a initial prediction");
 
     // load file from text file, also binary buffer generated by xgboost4j
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm");
+    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm");
 
     //specify parameters
     HashMap<String, Object> params = new HashMap<String, Object>();
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java
index dbe5f368c..3577be226 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java
@@ -30,7 +30,7 @@ import ml.dmlc.xgboost4j.java.XGBoostError;
 public class CrossValidation {
   public static void main(String[] args) throws IOException, XGBoostError {
     //load train mat
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm");
 
     //set params
     HashMap<String, Object> params = new HashMap<String, Object>();
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java
index 6d529974c..c631dc01a 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java
@@ -139,9 +139,9 @@ public class CustomObjective {
 
   public static void main(String[] args) throws XGBoostError {
     //load train mat (svmlight format)
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm");
     //load valid mat (svmlight format)
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm");
 
     HashMap<String, Object> params = new HashMap<String, Object>();
     params.put("eta", 1.0);
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java
index 61e752f85..9e52c12fd 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java
@@ -29,9 +29,9 @@ import ml.dmlc.xgboost4j.java.example.util.DataLoader;
 public class EarlyStopping {
   public static void main(String[] args) throws IOException, XGBoostError {
     DataLoader.CSRSparseData trainCSR =
-        DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train");
+        DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm");
     DataLoader.CSRSparseData testCSR =
-        DataLoader.loadSVMFile("../../demo/data/agaricus.txt.test");
+        DataLoader.loadSVMFile("../../demo/data/agaricus.txt.test?format=libsvm");
 
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java
index 349098ae1..70b2b85b5 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java
@@ -32,8 +32,8 @@ public class ExternalMemory {
     //this is the only difference, add a # followed by a cache prefix name
     //several cache file with the prefix will be generated
     //currently only support convert from libsvm file
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train#dtrain.cache");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test#dtest.cache");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm#dtrain.cache");
+    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm#dtest.cache");
 
     //specify parameters
     HashMap<String, Object> params = new HashMap<String, Object>();
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java
index 422cdea6a..09cc91c7f 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java
@@ -32,8 +32,8 @@ import ml.dmlc.xgboost4j.java.example.util.CustomEval;
 public class GeneralizedLinearModel {
   public static void main(String[] args) throws XGBoostError {
     // load file from text file, also binary buffer generated by xgboost4j
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm");
+    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm");
 
     //specify parameters
     //change booster to gblinear, so that we are fitting a linear model
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java
index c98534a93..9038502bd 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java
@@ -31,8 +31,8 @@ import ml.dmlc.xgboost4j.java.example.util.CustomEval;
 public class PredictFirstNtree {
   public static void main(String[] args) throws XGBoostError {
     // load file from text file, also binary buffer generated by xgboost4j
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm");
+    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm");
 
     //specify parameters
     HashMap<String, Object> params = new HashMap<String, Object>();
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java
index 0fcfb39de..7b1dfcb28 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java
@@ -31,8 +31,8 @@ import ml.dmlc.xgboost4j.java.XGBoostError;
 public class PredictLeafIndices {
   public static void main(String[] args) throws XGBoostError {
     // load file from text file, also binary buffer generated by xgboost4j
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm");
+    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm");
 
     //specify parameters
     HashMap<String, Object> params = new HashMap<String, Object>();
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/flink/DistTrainWithFlinkExample.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/flink/DistTrainWithFlinkExample.java
new file mode 100644
index 000000000..94e5cdab5
--- /dev/null
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/flink/DistTrainWithFlinkExample.java
@@ -0,0 +1,107 @@
+/*
+ Copyright (c) 2014-2021 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package ml.dmlc.xgboost4j.java.example.flink;
+
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.flink.api.common.typeinfo.TypeHint;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.java.DataSet;
+import org.apache.flink.api.java.ExecutionEnvironment;
+import org.apache.flink.api.java.operators.MapOperator;
+import org.apache.flink.api.java.tuple.Tuple13;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.api.java.utils.DataSetUtils;
+import org.apache.flink.ml.linalg.DenseVector;
+import org.apache.flink.ml.linalg.Vector;
+import org.apache.flink.ml.linalg.Vectors;
+
+import ml.dmlc.xgboost4j.java.flink.XGBoost;
+import ml.dmlc.xgboost4j.java.flink.XGBoostModel;
+
+
+public class DistTrainWithFlinkExample {
+  /** Trains on rows whose index is below the percentage cutoff and predicts on the rest. */
+  static Tuple2<XGBoostModel, DataSet<Float[]>> runPrediction(
+      ExecutionEnvironment env,
+      java.nio.file.Path trainPath,
+      int percentage) throws Exception {
+    // read the CSV and pair every parsed row with an index via zipWithIndex
+    final DataSet<Tuple2<Long, Tuple2<Vector, Double>>> data =
+        DataSetUtils.zipWithIndex(parseCsv(env, trainPath));
+    final long size = data.count();
+    final long trainCount = Math.round(size * 0.01 * percentage);  // percentage% of size
+    final DataSet<Tuple2<Vector, Double>> trainData =
+        data
+          .filter(item -> item.f0 < trainCount)
+          .map(t -> t.f1)
+          .returns(TypeInformation.of(new TypeHint<Tuple2<Vector, Double>>(){}));
+    final DataSet<Vector> testData =
+        data
+          .filter(tuple -> tuple.f0 >= trainCount)
+          .map(t -> t.f1.f0)
+          .returns(TypeInformation.of(new TypeHint<Vector>(){}));
+
+    // training parameters handed to XGBoost.train
+    HashMap<String, Object> paramMap = new HashMap<String, Object>(3);
+    paramMap.put("eta", 0.1);
+    paramMap.put("max_depth", 2);
+    paramMap.put("objective", "binary:logistic");
+
+    // number of iterations
+    final int round = 2;
+    // train on trainData, then predict on the held-out testData
+    XGBoostModel model = XGBoost.train(trainData, paramMap, round);
+    DataSet<Float[]> predTest = model.predict(testData);
+    return new Tuple2<XGBoostModel, DataSet<Float[]>>(model, predTest);
+  }
+
+  private static MapOperator<Tuple13<Double, String, Double, Double, Double, Integer, Integer,
+      Integer, Integer, Integer, Integer, Integer, Integer>,
+      Tuple2<Vector, Double>> parseCsv(ExecutionEnvironment env, Path trainPath) {
+    return env.readCsvFile(trainPath.toString())
+      .ignoreFirstLine()  // skip line 1 (presumably a header row; confirm against the CSV)
+      .types(Double.class, String.class, Double.class, Double.class, Double.class,
+        Integer.class, Integer.class, Integer.class, Integer.class, Integer.class,
+        Integer.class, Integer.class, Integer.class)
+      .map(DistTrainWithFlinkExample::mapFunction);
+  }
+
+  private static Tuple2<Vector, Double> mapFunction(Tuple13<Double, String, Double, Double, Double,
+      Integer, Integer, Integer, Integer, Integer, Integer, Integer, Integer> tuple) {
+    final DenseVector dense = Vectors.dense(tuple.f2, tuple.f3, tuple.f4, tuple.f5, tuple.f6,
+        tuple.f7, tuple.f8, tuple.f9, tuple.f10, tuple.f11, tuple.f12);  // f2..f12: features
+    if (tuple.f1.contains("inf")) {  // label is 1.0 when f1 contains "inf", otherwise 0.0
+      return new Tuple2<Vector, Double>(dense, 1.0);
+    } else {
+      return new Tuple2<Vector, Double>(dense, 0.0);
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    final java.nio.file.Path parentPath = java.nio.file.Paths.get(Arrays.stream(args)
+        .findFirst().orElse("."));  // first CLI argument, or "." when none is given
+    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
+    Tuple2<XGBoostModel, DataSet<Float[]>> tuple2 = runPrediction(
+        env, parentPath.resolve("veterans_lung_cancer.csv"), 70  // 70% of rows for training
+    );
+    List<Float[]> list = tuple2.f1.collect();
+    System.out.println(list.size());  // number of predictions collected
+  }
+}
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala
index e8481b047..1893288b4 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014 by Contributors
+ Copyright (c) 2014-2023 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -36,8 +36,8 @@ object BasicWalkThrough {
   }
 
   def main(args: Array[String]): Unit = {
-    val trainMax = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMax = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMax = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMax = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
 
     val params = new mutable.HashMap[String, Any]()
     params += "eta" -> 1.0
@@ -76,7 +76,7 @@ object BasicWalkThrough {
 
     // build dmatrix from CSR Sparse Matrix
     println("start build dmatrix from csr sparse data ...")
-    val spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train")
+    val spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm")
     val trainMax2 = new DMatrix(spData.rowHeaders, spData.colIndex, spData.data,
       JDMatrix.SparseType.CSR)
     trainMax2.setLabel(spData.labels)
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala
index b894532fa..09b72fc50 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala
@@ -24,8 +24,8 @@ object BoostFromPrediction {
   def main(args: Array[String]): Unit = {
     println("start running example to start from a initial prediction")
 
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
 
     val params = new mutable.HashMap[String, Any]()
     params += "eta" -> 1.0
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala
index 62f8b461a..6083209ec 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala
@@ -21,7 +21,7 @@ import ml.dmlc.xgboost4j.scala.{XGBoost, DMatrix}
 
 object CrossValidation {
   def main(args: Array[String]): Unit = {
-    val trainMat: DMatrix = new DMatrix("../../demo/data/agaricus.txt.train")
+    val trainMat: DMatrix = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
 
     // set params
     val params = new mutable.HashMap[String, Any]
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala
index fe88423e7..8cc49c90d 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala
@@ -138,8 +138,8 @@ object CustomObjective {
   }
 
   def main(args: Array[String]): Unit = {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val params = new mutable.HashMap[String, Any]()
     params += "eta" -> 1.0
     params += "max_depth" -> 2
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala
index 447c98295..c7f3d8bbb 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala
@@ -25,8 +25,8 @@ object ExternalMemory {
     // this is the only difference, add a # followed by a cache prefix name
     // several cache file with the prefix will be generated
     // currently only support convert from libsvm file
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train#dtrain.cache")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test#dtest.cache")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm#dtrain.cache")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm#dtest.cache")
 
     val params = new mutable.HashMap[String, Any]()
     params += "eta" -> 1.0
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala
index 27ed98eca..e370010b6 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala
@@ -27,8 +27,8 @@ import ml.dmlc.xgboost4j.scala.example.util.CustomEval
  */
 object GeneralizedLinearModel {
   def main(args: Array[String]): Unit = {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
 
     // specify parameters
     // change booster to gblinear, so that we are fitting a linear model
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala
index 5395e3638..40a5ffc44 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala
@@ -23,8 +23,8 @@ import ml.dmlc.xgboost4j.scala.{XGBoost, DMatrix}
 object PredictFirstNTree {
 
   def main(args: Array[String]): Unit = {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
 
     val params = new mutable.HashMap[String, Any]()
     params += "eta" -> 1.0
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala
index f40a8aac6..7ae2e6520 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala
@@ -25,8 +25,8 @@ import ml.dmlc.xgboost4j.scala.{XGBoost, DMatrix}
 object PredictLeafIndices {
 
   def main(args: Array[String]): Unit = {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
 
     val params = new mutable.HashMap[String, Any]()
     params += "eta" -> 1.0
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala
index 74b24ac35..cb859f62d 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014 by Contributors
+ Copyright (c) 2014 - 2023 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -15,27 +15,84 @@
  */
 package ml.dmlc.xgboost4j.scala.example.flink
 
-import ml.dmlc.xgboost4j.scala.flink.XGBoost
-import org.apache.flink.api.scala.{ExecutionEnvironment, _}
-import org.apache.flink.ml.MLUtils
+import java.lang.{Double => JDouble, Long => JLong}
+import java.nio.file.{Path, Paths}
+import org.apache.flink.api.java.tuple.{Tuple13, Tuple2}
+import org.apache.flink.api.java.{DataSet, ExecutionEnvironment}
+import org.apache.flink.ml.linalg.{Vector, Vectors}
+import ml.dmlc.xgboost4j.java.flink.{XGBoost, XGBoostModel}
+import org.apache.flink.api.common.typeinfo.{TypeHint, TypeInformation}
+import org.apache.flink.api.java.utils.DataSetUtils
+
 
 object DistTrainWithFlink {
-  def main(args: Array[String]) {
-    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
-    // read trainining data
-    val trainData =
-      MLUtils.readLibSVM(env, "/path/to/data/agaricus.txt.train")
-    val testData = MLUtils.readLibSVM(env, "/path/to/data/agaricus.txt.test")
-    // define parameters
-    val paramMap = List(
-      "eta" -> 0.1,
-      "max_depth" -> 2,
-      "objective" -> "binary:logistic").toMap
+  import scala.jdk.CollectionConverters._
+  private val rowTypeHint = TypeInformation.of(new TypeHint[Tuple2[Vector, JDouble]]{})
+  private val testDataTypeHint = TypeInformation.of(classOf[Vector])
+
+  private[flink] def parseCsv(trainPath: Path)(implicit env: ExecutionEnvironment):
+      DataSet[Tuple2[JLong, Tuple2[Vector, JDouble]]] = {
+    DataSetUtils.zipWithIndex(
+    env
+      .readCsvFile(trainPath.toString)
+      .ignoreFirstLine
+      .types(
+        classOf[Double], classOf[String], classOf[Double], classOf[Double], classOf[Double],
+        classOf[Integer], classOf[Integer], classOf[Integer], classOf[Integer],
+        classOf[Integer], classOf[Integer], classOf[Integer], classOf[Integer]
+      )
+      .map((row: Tuple13[Double, String, Double, Double, Double,
+        Integer, Integer, Integer, Integer, Integer, Integer, Integer, Integer]) => {
+        val dense = Vectors.dense(row.f2, row.f3, row.f4,
+          row.f5.toDouble, row.f6.toDouble, row.f7.toDouble, row.f8.toDouble,
+          row.f9.toDouble, row.f10.toDouble, row.f11.toDouble, row.f12.toDouble)
+        val label = if (row.f1.contains("inf")) {
+          JDouble.valueOf(1.0)
+        } else {
+          JDouble.valueOf(0.0)
+        }
+        new Tuple2[Vector, JDouble](dense, label)
+      })
+      .returns(rowTypeHint)
+    )
+  }
+
+  private[flink] def runPrediction(trainPath: Path, percentage: Int)
+                                  (implicit env: ExecutionEnvironment):
+    (XGBoostModel, DataSet[Array[Float]]) = {
+    // read training data
+    val data: DataSet[Tuple2[JLong, Tuple2[Vector, JDouble]]] = parseCsv(trainPath)
+    val trainSize = Math.round(0.01 * percentage * data.count())
+    val trainData: DataSet[Tuple2[Vector, JDouble]] =
+      data.filter(d => d.f0 < trainSize).map(_.f1).returns(rowTypeHint)
+
+
+    val testData: DataSet[Vector] =
+        data
+          .filter(d => d.f0 >= trainSize)
+          .map(_.f1.f0)
+          .returns(testDataTypeHint)
+
+    val paramMap = mapAsJavaMap(Map(
+      ("eta", "0.1".asInstanceOf[AnyRef]),
+      ("max_depth", "2"),
+      ("objective", "binary:logistic"),
+      ("verbosity", "1")
+    ))
+
     // number of iterations
     val round = 2
     // train the model
     val model = XGBoost.train(trainData, paramMap, round)
-    val predTest = model.predict(testData.map{x => x.vector})
-    model.saveModelAsHadoopFile("file:///path/to/xgboost.model")
+    val result = model.predict(testData).map(prediction => prediction.map(Float.unbox))
+    (model, result)
+  }
+
+  def main(args: Array[String]): Unit = {
+    implicit val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
+    val parentPath = Paths.get(args.headOption.getOrElse("."))
+    val (_, predTest) = runPrediction(parentPath.resolve("veterans_lung_cancer.csv"), 70)
+    val list = predTest.collect().asScala
+    println(list.length)
   }
 }
diff --git a/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/java/example/flink/DistTrainWithFlinkExampleTest.scala b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/java/example/flink/DistTrainWithFlinkExampleTest.scala
new file mode 100644
index 000000000..b9929639f
--- /dev/null
+++ b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/java/example/flink/DistTrainWithFlinkExampleTest.scala
@@ -0,0 +1,36 @@
+/*
+ Copyright (c) 2014-2023 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package ml.dmlc.xgboost4j.java.example.flink
+
+import org.apache.flink.api.java.ExecutionEnvironment
+import org.scalatest.Inspectors._
+import org.scalatest.funsuite.AnyFunSuite
+import org.scalatest.matchers.should.Matchers._
+
+import java.nio.file.Paths
+
+class DistTrainWithFlinkExampleTest extends AnyFunSuite {
+  private val parentPath = Paths.get("../../").resolve("demo").resolve("data")
+  private val data = parentPath.resolve("veterans_lung_cancer.csv")
+
+  test("Smoke test for scala flink example") {
+    val env = ExecutionEnvironment.createLocalEnvironment(1)
+    val tuple2 = DistTrainWithFlinkExample.runPrediction(env, data, 70)
+    val results = tuple2.f1.collect()
+    results should have size 41
+    forEvery(results)(item => item should have size 1)
+  }
+}
diff --git a/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlinkSuite.scala b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlinkSuite.scala
new file mode 100644
index 000000000..d9e98d81c
--- /dev/null
+++ b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlinkSuite.scala
@@ -0,0 +1,37 @@
+/*
+ Copyright (c) 2014-2023 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package ml.dmlc.xgboost4j.scala.example.flink
+
+import org.apache.flink.api.java.ExecutionEnvironment
+import org.scalatest.Inspectors._
+import org.scalatest.funsuite.AnyFunSuite
+import org.scalatest.matchers.should.Matchers._
+
+import java.nio.file.Paths
+import scala.jdk.CollectionConverters._
+
+class DistTrainWithFlinkSuite extends AnyFunSuite {
+  private val parentPath = Paths.get("../../").resolve("demo").resolve("data")
+  private val data = parentPath.resolve("veterans_lung_cancer.csv")
+
+  test("Smoke test for scala flink example") {
+    implicit val env: ExecutionEnvironment = ExecutionEnvironment.createLocalEnvironment(1)
+    val (_, result) = DistTrainWithFlink.runPrediction(data, 70)
+    val results = result.collect().asScala
+    results should have size 41
+    forEvery(results)(item => item should have size 1)
+  }
+}
diff --git a/jvm-packages/xgboost4j-flink/pom.xml b/jvm-packages/xgboost4j-flink/pom.xml
index b8b757eae..a9a80e29a 100644
--- a/jvm-packages/xgboost4j-flink/pom.xml
+++ b/jvm-packages/xgboost4j-flink/pom.xml
@@ -8,8 +8,11 @@
         <artifactId>xgboost-jvm_2.12</artifactId>
         <version>2.0.0-SNAPSHOT</version>
     </parent>
-    <artifactId>xgboost4j-flink_2.12</artifactId>
+    <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
     <version>2.0.0-SNAPSHOT</version>
+    <properties>
+      <flink-ml.version>2.2.0</flink-ml.version>
+    </properties>
     <build>
         <plugins>
             <plugin>
@@ -26,32 +29,22 @@
         <dependency>
             <groupId>ml.dmlc</groupId>
             <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-            <version>2.0.0-SNAPSHOT</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.commons</groupId>
-            <artifactId>commons-lang3</artifactId>
-            <version>3.12.0</version>
+            <version>${project.version}</version>
         </dependency>
         <dependency>
             <groupId>org.apache.flink</groupId>
-            <artifactId>flink-scala_${scala.binary.version}</artifactId>
+            <artifactId>flink-clients</artifactId>
             <version>${flink.version}</version>
         </dependency>
         <dependency>
             <groupId>org.apache.flink</groupId>
-            <artifactId>flink-clients_${scala.binary.version}</artifactId>
-            <version>${flink.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.flink</groupId>
-            <artifactId>flink-ml_${scala.binary.version}</artifactId>
-            <version>${flink.version}</version>
+            <artifactId>flink-ml-servable-core</artifactId>
+            <version>${flink-ml.version}</version>
         </dependency>
         <dependency>
             <groupId>org.apache.hadoop</groupId>
             <artifactId>hadoop-common</artifactId>
-            <version>3.3.5</version>
+            <version>${hadoop.version}</version>
         </dependency>
     </dependencies>
 
diff --git a/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoost.java b/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoost.java
new file mode 100644
index 000000000..7a5e3ac68
--- /dev/null
+++ b/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoost.java
@@ -0,0 +1,187 @@
+/*
+ Copyright (c) 2014-2023 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package ml.dmlc.xgboost4j.java.flink;
+
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Optional;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+import org.apache.flink.api.common.functions.RichMapPartitionFunction;
+import org.apache.flink.api.java.DataSet;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.ml.linalg.SparseVector;
+import org.apache.flink.ml.linalg.Vector;
+import org.apache.flink.util.Collector;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import ml.dmlc.xgboost4j.LabeledPoint;
+import ml.dmlc.xgboost4j.java.Booster;
+import ml.dmlc.xgboost4j.java.Communicator;
+import ml.dmlc.xgboost4j.java.DMatrix;
+import ml.dmlc.xgboost4j.java.RabitTracker;
+import ml.dmlc.xgboost4j.java.XGBoostError;
+
+
+public class XGBoost {
+  private static final Logger logger = LoggerFactory.getLogger(XGBoost.class);
+
+  private static class MapFunction
+      extends RichMapPartitionFunction<Tuple2<Vector, Double>, XGBoostModel> {
+
+    private final Map<String, Object> params;
+    private final int round;
+    private final Map<String, String> workerEnvs;
+
+    public MapFunction(Map<String, Object> params, int round, Map<String, String> workerEnvs) {
+      this.params = params;
+      this.round = round;
+      this.workerEnvs = workerEnvs;
+    }
+
+    public void mapPartition(java.lang.Iterable<Tuple2<Vector, Double>> it,
+                             Collector<XGBoostModel> collector) throws XGBoostError {
+      workerEnvs.put(
+          "DMLC_TASK_ID",
+          String.valueOf(this.getRuntimeContext().getIndexOfThisSubtask())
+      );
+
+      if (logger.isInfoEnabled()) {
+        logger.info("start with env: {}", workerEnvs.entrySet().stream()
+            .map(e -> String.format("\"%s\": \"%s\"", e.getKey(), e.getValue()))
+            .collect(Collectors.joining(", "))
+        );
+      }
+
+      final Iterator<LabeledPoint> dataIter =
+          StreamSupport
+            .stream(it.spliterator(), false)
+            .map(VectorToPointMapper.INSTANCE)
+            .iterator();
+
+      if (dataIter.hasNext()) {
+        final DMatrix trainMat = new DMatrix(dataIter, null);
+        int numEarlyStoppingRounds =
+            Optional.ofNullable(params.get("numEarlyStoppingRounds"))
+              .map(x -> Integer.parseInt(x.toString()))
+              .orElse(0);
+
+        final Booster booster = trainBooster(trainMat, numEarlyStoppingRounds);
+        collector.collect(new XGBoostModel(booster));
+      } else {
+        logger.warn("Nothing to train with.");
+      }
+    }
+
+    private Booster trainBooster(DMatrix trainMat,
+                                 int numEarlyStoppingRounds) throws XGBoostError {
+      Booster booster;
+      final Map<String, DMatrix> watches =
+          new HashMap<String, DMatrix>() {{ put("train", trainMat); }};
+      try {
+        Communicator.init(workerEnvs);
+        booster = ml.dmlc.xgboost4j.java.XGBoost
+          .train(
+            trainMat,
+            params,
+            round,
+            watches,
+            null,
+            null,
+            null,
+            numEarlyStoppingRounds);
+      } catch (XGBoostError xgbException) {
+        final String identifier = String.valueOf(this.getRuntimeContext().getIndexOfThisSubtask());
+        logger.warn(
+            String.format("XGBooster worker %s has failed due to", identifier),
+            xgbException
+        );
+        throw xgbException;
+      } finally {
+        Communicator.shutdown();
+      }
+      return booster;
+    }
+
+    /**
+     * Converts a Flink (features, label) tuple into an XGBoost LabeledPoint; the features
+     * are read in sparse form and each value is narrowed from double to float.
+     */
+    private static class VectorToPointMapper
+        implements Function<Tuple2<Vector, Double>, LabeledPoint> {
+      public static final VectorToPointMapper INSTANCE = new VectorToPointMapper();
+      @Override
+      public LabeledPoint apply(Tuple2<Vector, Double> tuple) {
+        final SparseVector vector = tuple.f0.toSparse();
+        final double[] values = vector.values;
+        final int size = values.length;
+        final float[] array = new float[size];
+        for (int i = 0; i < size; i++) {
+          array[i] = (float) values[i];
+        }
+        return new LabeledPoint(tuple.f1.floatValue(), vector.size(), vector.indices, array);
+      }
+    }
+  }
+
+  /**
+   * Load XGBoost model from path, using Hadoop Filesystem API.
+   *
+   * @param modelPath The path that is accessible by hadoop filesystem API.
+   * @return The loaded model
+   */
+  public static XGBoostModel loadModelFromHadoopFile(final String modelPath) throws Exception {
+    final FileSystem fileSystem = FileSystem.get(new Configuration());
+    final Path f = new Path(modelPath);
+
+    try (FSDataInputStream opened = fileSystem.open(f)) {
+      return new XGBoostModel(ml.dmlc.xgboost4j.java.XGBoost.loadModel(opened));
+    }
+  }
+
+  /**
+   * Train an XGBoost model with Flink.
+   *
+   * @param dtrain The training data.
+   * @param params XGBoost parameters.
+   * @param numBoostRound  Number of rounds to train.
+   */
+  public static XGBoostModel train(DataSet<Tuple2<Vector, Double>> dtrain,
+                                   Map<String, Object> params,
+                                   int numBoostRound) throws Exception {
+    final RabitTracker tracker =
+        new RabitTracker(dtrain.getExecutionEnvironment().getParallelism());
+    if (tracker.start(0L)) {
+      return dtrain
+        .mapPartition(new MapFunction(params, numBoostRound, tracker.getWorkerEnvs()))
+        .reduce((x, y) -> x)
+        .collect()
+        .get(0);
+    } else {
+      throw new Error("Tracker cannot be started");
+    }
+  }
+}
diff --git a/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoostModel.java b/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoostModel.java
new file mode 100644
index 000000000..03de50482
--- /dev/null
+++ b/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoostModel.java
@@ -0,0 +1,136 @@
+/*
+ Copyright (c) 2014-2023 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package ml.dmlc.xgboost4j.java.flink;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.stream.StreamSupport;
+
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.flink.api.common.functions.MapPartitionFunction;
+import org.apache.flink.api.java.DataSet;
+import org.apache.flink.ml.linalg.SparseVector;
+import org.apache.flink.ml.linalg.Vector;
+import org.apache.flink.util.Collector;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import ml.dmlc.xgboost4j.LabeledPoint;
+import ml.dmlc.xgboost4j.java.Booster;
+import ml.dmlc.xgboost4j.java.DMatrix;
+import ml.dmlc.xgboost4j.java.XGBoostError;
+
+
+public class XGBoostModel implements Serializable {
+  private static final org.slf4j.Logger logger =
+      org.slf4j.LoggerFactory.getLogger(XGBoostModel.class);
+
+  private final Booster booster;
+  private final PredictorFunction predictorFunction;
+
+
+  public XGBoostModel(Booster booster) {
+    this.booster = booster;
+    this.predictorFunction = new PredictorFunction(booster);
+  }
+
+  /**
+   * Save the model as a Hadoop filesystem file.
+   *
+   * @param modelPath The model path as in Hadoop path.
+   */
+  public void saveModelAsHadoopFile(String modelPath) throws IOException, XGBoostError {
+    booster.saveModel(FileSystem.get(new Configuration()).create(new Path(modelPath)));
+  }
+
+  public byte[] toByteArray(String format) throws XGBoostError {
+    return booster.toByteArray(format);
+  }
+
+  /**
+   * Save the model as a Hadoop filesystem file.
+   *
+   * @param modelPath The model path as in Hadoop path.
+   * @param format The model format (ubj, json, deprecated)
+   * @throws XGBoostError internal error
+   * @throws IOException save error
+   */
+  public void saveModelAsHadoopFile(String modelPath, String format)
+      throws IOException, XGBoostError {
+    booster.saveModel(FileSystem.get(new Configuration()).create(new Path(modelPath)), format);
+  }
+
+  /**
+   * predict with the given DMatrix
+   *
+   * @param testSet the local test set represented as DMatrix
+   * @return prediction result
+   */
+  public float[][] predict(DMatrix testSet) throws XGBoostError {
+    return booster.predict(testSet, true, 0);
+  }
+
+  /**
+   * Predict given vector dataset.
+   *
+   * @param data The dataset to be predicted.
+   * @return The prediction result.
+   */
+  public DataSet<Float[]> predict(DataSet<Vector> data) {
+    return data.mapPartition(predictorFunction);
+  }
+
+
+  private static class PredictorFunction implements MapPartitionFunction<Vector, Float[]> {
+
+    private final Booster booster;
+
+    public PredictorFunction(Booster booster) {
+      this.booster = booster;
+    }
+
+    // Predicts one partition: rows are batched into a single DMatrix and every row of the
+    // result is emitted as a boxed Float[]. treeLimit is 0 so the full model is used.
+    @Override
+    public void mapPartition(Iterable<Vector> it, Collector<Float[]> out) throws Exception {
+      final Iterator<LabeledPoint> dataIter = StreamSupport.stream(it.spliterator(), false)
+          .map(Vector::toSparse).map(PredictorFunction::fromVector)
+          .iterator();
+
+      if (dataIter.hasNext()) {
+        final DMatrix data = new DMatrix(dataIter, null);
+        float[][] predictions = booster.predict(data, true, 0); // 0 = no tree limit
+        Arrays.stream(predictions).map(ArrayUtils::toObject).forEach(out::collect);
+      } else {
+        logger.debug("Empty partition");
+      }
+    }
+
+    private static LabeledPoint fromVector(SparseVector vector) {
+      final int[] index = vector.indices;
+      final double[] value = vector.values;
+      int size = value.length;
+      final float[] values = new float[size];
+      for (int i = 0; i < size; i++) {
+        values[i] = (float) value[i];
+      }
+      return new LabeledPoint(0.0f, vector.size(), index, values);
+    }
+  }
+}
diff --git a/jvm-packages/xgboost4j-flink/src/main/scala/ml/dmlc/xgboost4j/scala/flink/XGBoost.scala b/jvm-packages/xgboost4j-flink/src/main/scala/ml/dmlc/xgboost4j/scala/flink/XGBoost.scala
deleted file mode 100644
index 6878f1865..000000000
--- a/jvm-packages/xgboost4j-flink/src/main/scala/ml/dmlc/xgboost4j/scala/flink/XGBoost.scala
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.flink
-
-import scala.collection.JavaConverters.asScalaIteratorConverter
-
-import ml.dmlc.xgboost4j.LabeledPoint
-import ml.dmlc.xgboost4j.java.{Communicator, RabitTracker}
-import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => XGBoostScala}
-
-import org.apache.commons.logging.LogFactory
-import org.apache.flink.api.common.functions.RichMapPartitionFunction
-import org.apache.flink.api.scala.{DataSet, _}
-import org.apache.flink.ml.common.LabeledVector
-import org.apache.flink.util.Collector
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileSystem, Path}
-
-object XGBoost {
-  /**
-    * Helper map function to start the job.
-    *
-    * @param workerEnvs
-    */
-  private class MapFunction(paramMap: Map[String, Any],
-                            round: Int,
-                            workerEnvs: java.util.Map[String, String])
-    extends RichMapPartitionFunction[LabeledVector, XGBoostModel] {
-    val logger = LogFactory.getLog(this.getClass)
-
-    def mapPartition(it: java.lang.Iterable[LabeledVector],
-                     collector: Collector[XGBoostModel]): Unit = {
-      workerEnvs.put("DMLC_TASK_ID", String.valueOf(this.getRuntimeContext.getIndexOfThisSubtask))
-      logger.info("start with env" + workerEnvs.toString)
-      Communicator.init(workerEnvs)
-      val mapper = (x: LabeledVector) => {
-        val (index, value) = x.vector.toSeq.unzip
-        LabeledPoint(x.label.toFloat, x.vector.size, index.toArray, value.map(_.toFloat).toArray)
-      }
-      val dataIter = for (x <- it.iterator().asScala) yield mapper(x)
-      val trainMat = new DMatrix(dataIter, null)
-      val watches = List("train" -> trainMat).toMap
-      val round = 2
-      val numEarlyStoppingRounds = paramMap.get("numEarlyStoppingRounds")
-          .map(_.toString.toInt).getOrElse(0)
-      val booster = XGBoostScala.train(trainMat, paramMap, round, watches,
-        earlyStoppingRound = numEarlyStoppingRounds)
-      Communicator.shutdown()
-      collector.collect(new XGBoostModel(booster))
-    }
-  }
-
-  val logger = LogFactory.getLog(this.getClass)
-
-  /**
-    * Load XGBoost model from path, using Hadoop Filesystem API.
-    *
-    * @param modelPath The path that is accessible by hadoop filesystem API.
-    * @return The loaded model
-    */
-  def loadModelFromHadoopFile(modelPath: String) : XGBoostModel = {
-    new XGBoostModel(
-      XGBoostScala.loadModel(FileSystem.get(new Configuration).open(new Path(modelPath))))
-  }
-
-  /**
-    * Train a xgboost model with link.
-    *
-    * @param dtrain The training data.
-    * @param params The parameters to XGBoost.
-    * @param round Number of rounds to train.
-    */
-  def train(dtrain: DataSet[LabeledVector], params: Map[String, Any], round: Int):
-      XGBoostModel = {
-    val tracker = new RabitTracker(dtrain.getExecutionEnvironment.getParallelism)
-    if (tracker.start(0L)) {
-      dtrain
-        .mapPartition(new MapFunction(params, round, tracker.getWorkerEnvs))
-        .reduce((x, y) => x).collect().head
-    } else {
-      throw new Error("Tracker cannot be started")
-      null
-    }
-  }
-}
diff --git a/jvm-packages/xgboost4j-flink/src/main/scala/ml/dmlc/xgboost4j/scala/flink/XGBoostModel.scala b/jvm-packages/xgboost4j-flink/src/main/scala/ml/dmlc/xgboost4j/scala/flink/XGBoostModel.scala
deleted file mode 100644
index 71b376974..000000000
--- a/jvm-packages/xgboost4j-flink/src/main/scala/ml/dmlc/xgboost4j/scala/flink/XGBoostModel.scala
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.flink
-
-import ml.dmlc.xgboost4j.LabeledPoint
-import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
-
-import org.apache.flink.api.scala.{DataSet, _}
-import org.apache.flink.ml.math.Vector
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileSystem, Path}
-
-class XGBoostModel (booster: Booster) extends Serializable {
-  /**
-    * Save the model as a Hadoop filesystem file.
-    *
-    * @param modelPath The model path as in Hadoop path.
-    */
-  def saveModelAsHadoopFile(modelPath: String): Unit = {
-    booster.saveModel(FileSystem
-      .get(new Configuration)
-      .create(new Path(modelPath)))
-  }
-
-  /**
-   * predict with the given DMatrix
-   * @param testSet the local test set represented as DMatrix
-   * @return prediction result
-   */
-  def predict(testSet: DMatrix): Array[Array[Float]] = {
-    booster.predict(testSet, true, 0)
-  }
-
-  /**
-    * Predict given vector dataset.
-    *
-    * @param data The dataset to be predicted.
-    * @return The prediction result.
-    */
-  def predict(data: DataSet[Vector]) : DataSet[Array[Float]] = {
-    val predictMap: Iterator[Vector] => Traversable[Array[Float]] =
-      (it: Iterator[Vector]) => {
-        val mapper = (x: Vector) => {
-          val (index, value) = x.toSeq.unzip
-          LabeledPoint(0.0f, x.size, index.toArray, value.map(_.toFloat).toArray)
-        }
-        val dataIter = for (x <- it) yield mapper(x)
-        val dmat = new DMatrix(dataIter, null)
-        this.booster.predict(dmat)
-      }
-    data.mapPartition(predictMap)
-  }
-}
diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml
index 167635209..1d7a06708 100644
--- a/jvm-packages/xgboost4j-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-gpu/pom.xml
@@ -38,22 +38,10 @@
             <version>4.13.2</version>
             <scope>test</scope>
         </dependency>
-        <dependency>
-            <groupId>com.typesafe.akka</groupId>
-            <artifactId>akka-actor_${scala.binary.version}</artifactId>
-            <version>2.6.20</version>
-            <scope>compile</scope>
-        </dependency>
-        <dependency>
-            <groupId>com.typesafe.akka</groupId>
-            <artifactId>akka-testkit_${scala.binary.version}</artifactId>
-            <version>2.6.20</version>
-            <scope>test</scope>
-        </dependency>
         <dependency>
             <groupId>org.scalatest</groupId>
             <artifactId>scalatest_${scala.binary.version}</artifactId>
-            <version>3.0.5</version>
+            <version>3.2.15</version>
             <scope>provided</scope>
         </dependency>
         <dependency>
diff --git a/jvm-packages/xgboost4j-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala b/jvm-packages/xgboost4j-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala
index ba8c5fa9a..28ac2207a 100644
--- a/jvm-packages/xgboost4j-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala
+++ b/jvm-packages/xgboost4j-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala
@@ -19,10 +19,10 @@ package ml.dmlc.xgboost4j.scala
 import scala.collection.mutable.ArrayBuffer
 
 import ai.rapids.cudf.Table
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 import ml.dmlc.xgboost4j.gpu.java.CudfColumnBatch
 
-class QuantileDMatrixSuite extends FunSuite {
+class QuantileDMatrixSuite extends AnyFunSuite {
 
   test("QuantileDMatrix test") {
 
diff --git a/jvm-packages/xgboost4j-spark-gpu/pom.xml b/jvm-packages/xgboost4j-spark-gpu/pom.xml
index b1932f3cc..bcb7edb2a 100644
--- a/jvm-packages/xgboost4j-spark-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml
@@ -44,13 +44,6 @@
             <version>${spark.version}</version>
             <scope>provided</scope>
         </dependency>
-        <dependency>
-          <groupId>ai.rapids</groupId>
-          <artifactId>cudf</artifactId>
-          <version>${cudf.version}</version>
-          <classifier>${cudf.classifier}</classifier>
-          <scope>provided</scope>
-        </dependency>
         <dependency>
           <groupId>com.nvidia</groupId>
           <artifactId>rapids-4-spark_${scala.binary.version}</artifactId>
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala
index 175e00b39..2a355e160 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala
@@ -20,14 +20,15 @@ import java.nio.file.{Files, Path}
 import java.sql.{Date, Timestamp}
 import java.util.{Locale, TimeZone}
 
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.funsuite.AnyFunSuite
 
 import org.apache.spark.{GpuTestUtils, SparkConf}
 import org.apache.spark.internal.Logging
 import org.apache.spark.network.util.JavaUtils
 import org.apache.spark.sql.{Row, SparkSession}
 
-trait GpuTestSuite extends FunSuite with TmpFolderSuite {
+trait GpuTestSuite extends AnyFunSuite with TmpFolderSuite {
   import SparkSessionHolder.withSparkSession
 
   protected def getResourcePath(resource: String): String = {
@@ -200,7 +201,7 @@ trait GpuTestSuite extends FunSuite with TmpFolderSuite {
 
 }
 
-trait TmpFolderSuite extends BeforeAndAfterAll { self: FunSuite =>
+trait TmpFolderSuite extends BeforeAndAfterAll { self: AnyFunSuite =>
   protected var tempDir: Path = _
 
   override def beforeAll(): Unit = {
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala
index 176a54832..31d58224b 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2021-2022 by Contributors
+ Copyright (c) 2021-2023 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -22,7 +22,6 @@ import java.util.ServiceLoader
 import scala.collection.JavaConverters._
 import scala.collection.{AbstractIterator, Iterator, mutable}
 
-import ml.dmlc.xgboost4j.java.Communicator
 import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
 import ml.dmlc.xgboost4j.scala.spark.util.DataUtils.PackedParams
 import ml.dmlc.xgboost4j.scala.spark.params.XGBoostEstimatorCommon
@@ -35,7 +34,6 @@ import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
 import org.apache.commons.logging.LogFactory
 
 import org.apache.spark.TaskContext
-import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.linalg.Vector
 import org.apache.spark.sql.types.{ArrayType, FloatType, StructField, StructType}
@@ -263,12 +261,6 @@ object PreXGBoost extends PreXGBoostProvider {
         private var batchCnt = 0
 
         private val batchIterImpl = rowIterator.grouped(inferBatchSize).flatMap { batchRow =>
-          if (batchCnt == 0) {
-            val rabitEnv = Array(
-              "DMLC_TASK_ID" -> TaskContext.getPartitionId().toString).toMap
-            Communicator.init(rabitEnv.asJava)
-          }
-
           val features = batchRow.iterator.map(row => row.getAs[Vector](featuresCol))
 
           import ml.dmlc.xgboost4j.scala.spark.util.DataUtils._
@@ -295,13 +287,8 @@ object PreXGBoost extends PreXGBoostProvider {
 
         override def hasNext: Boolean = batchIterImpl.hasNext
 
-        override def next(): Row = {
-          val ret = batchIterImpl.next()
-          if (!batchIterImpl.hasNext) {
-            Communicator.shutdown()
-          }
-          ret
-        }
+        override def next(): Row = batchIterImpl.next()
+
       }
     }
 
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
index 281997295..0aeae791a 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014-2022 by Contributors
+ Copyright (c) 2014-2023 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -23,7 +23,6 @@ import scala.util.Random
 import scala.collection.JavaConverters._
 
 import ml.dmlc.xgboost4j.java.{Communicator, IRabitTracker, XGBoostError, RabitTracker => PyRabitTracker}
-import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
 import ml.dmlc.xgboost4j.scala.spark.params.LearningTaskParams
 import ml.dmlc.xgboost4j.scala.ExternalCheckpointManager
 import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
@@ -44,21 +43,16 @@ import org.apache.spark.sql.SparkSession
  *                                Use a finite, non-zero timeout value to prevent tracker from
  *                                hanging indefinitely (in milliseconds)
- *                                (supported by "scala" implementation only.)
+ *                                (previously honored only by the removed "scala" tracker.)
- * @param trackerImpl Choice between "python" or "scala". The former utilizes the Java wrapper of
- *                    the Python Rabit tracker (in dmlc_core), whereas the latter is implemented
- *                    in Scala without Python components, and with full support of timeouts.
- *                    The Scala implementation is currently experimental, use at your own risk.
- *
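- * @param hostIp The Rabit Tracker host IP address which is only used for python implementation.
+ * @param hostIp The Rabit Tracker host IP address.
  *               This is only needed if the host IP cannot be automatically guessed.
- * @param pythonExec The python executed path for Rabit Tracker,
- *                   which is only used for python implementation.
+ * @param pythonExec The path of the Python executable
+ *                   for the Rabit Tracker.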
  */
-case class TrackerConf(workerConnectionTimeout: Long, trackerImpl: String,
+case class TrackerConf(workerConnectionTimeout: Long,
   hostIp: String = "", pythonExec: String = "")
 
 object TrackerConf {
-  def apply(): TrackerConf = TrackerConf(0L, "python")
+  def apply(): TrackerConf = TrackerConf(0L)
 }
 
 private[scala] case class XGBoostExecutionEarlyStoppingParams(numEarlyStoppingRounds: Int,
@@ -349,11 +343,13 @@ object XGBoost extends Serializable {
 
-  /** visiable for testing */
+  /**
+   * Build the Rabit tracker for the given number of workers. Visible for testing.
+   *
+   * @param nWorkers    number of workers handed to the tracker constructor
+   * @param trackerConf tracker configuration; only hostIp and pythonExec are read here
+   * @return the Java wrapper of the Python Rabit tracker, typed as [[IRabitTracker]]
+   */
   private[scala] def getTracker(nWorkers: Int, trackerConf: TrackerConf): IRabitTracker = {
-    val tracker: IRabitTracker = trackerConf.trackerImpl match {
-      case "scala" => new RabitTracker(nWorkers)
-      case "python" => new PyRabitTracker(nWorkers, trackerConf.hostIp, trackerConf.pythonExec)
-      case _ => new PyRabitTracker(nWorkers)
-    }
-    tracker
+    new PyRabitTracker(nWorkers, trackerConf.hostIp, trackerConf.pythonExec)
   }
 
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala
index 579e3dd37..5445cd1bf 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala
@@ -22,11 +22,10 @@ import scala.util.Random
 
 import ml.dmlc.xgboost4j.java.{Communicator, RabitTracker => PyRabitTracker}
 import ml.dmlc.xgboost4j.java.IRabitTracker.TrackerStatus
-import ml.dmlc.xgboost4j.scala.rabit.{RabitTracker => ScalaRabitTracker}
 import ml.dmlc.xgboost4j.scala.DMatrix
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 
-class CommunicatorRobustnessSuite extends FunSuite with PerTest {
+class CommunicatorRobustnessSuite extends AnyFunSuite with PerTest {
 
   private def getXGBoostExecutionParams(paramMap: Map[String, Any]): XGBoostExecutionParams = {
     val classifier = new XGBoostClassifier(paramMap)
@@ -40,7 +39,7 @@ class CommunicatorRobustnessSuite extends FunSuite with PerTest {
 
     val paramMap = Map(
       "num_workers" -> numWorkers,
-      "tracker_conf" -> TrackerConf(0L, "python", hostIp))
+      "tracker_conf" -> TrackerConf(0L, hostIp))
     val xgbExecParams = getXGBoostExecutionParams(paramMap)
     val tracker = XGBoost.getTracker(xgbExecParams.numWorkers, xgbExecParams.trackerConf)
     tracker match {
@@ -53,7 +52,7 @@ class CommunicatorRobustnessSuite extends FunSuite with PerTest {
 
     val paramMap1 = Map(
       "num_workers" -> numWorkers,
-      "tracker_conf" -> TrackerConf(0L, "python", "", pythonExec))
+      "tracker_conf" -> TrackerConf(0L, "", pythonExec))
     val xgbExecParams1 = getXGBoostExecutionParams(paramMap1)
     val tracker1 = XGBoost.getTracker(xgbExecParams1.numWorkers, xgbExecParams1.trackerConf)
     tracker1 match {
@@ -66,7 +65,7 @@ class CommunicatorRobustnessSuite extends FunSuite with PerTest {
 
     val paramMap2 = Map(
       "num_workers" -> numWorkers,
-      "tracker_conf" -> TrackerConf(0L, "python", hostIp, pythonExec))
+      "tracker_conf" -> TrackerConf(0L, hostIp, pythonExec))
     val xgbExecParams2 = getXGBoostExecutionParams(paramMap2)
     val tracker2 = XGBoost.getTracker(xgbExecParams2.numWorkers, xgbExecParams2.trackerConf)
     tracker2 match {
@@ -78,58 +77,6 @@ class CommunicatorRobustnessSuite extends FunSuite with PerTest {
     }
   }
 
-  test("training with Scala-implemented Rabit tracker") {
-    val eval = new EvalError()
-    val training = buildDataFrame(Classification.train)
-    val testDM = new DMatrix(Classification.test.iterator)
-    val paramMap = Map("eta" -> "1", "max_depth" -> "6",
-      "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers,
-      "tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala"))
-    val model = new XGBoostClassifier(paramMap).fit(training)
-    assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1)
-  }
-
-  test("test Communicator allreduce to validate Scala-implemented Rabit tracker") {
-    val vectorLength = 100
-    val rdd = sc.parallelize(
-      (1 to numWorkers * vectorLength).toArray.map { _ => Random.nextFloat() }, numWorkers).cache()
-
-    val tracker = new ScalaRabitTracker(numWorkers)
-    tracker.start(0)
-    val trackerEnvs = tracker.getWorkerEnvs
-    val collectedAllReduceResults = new LinkedBlockingDeque[Array[Float]]()
-
-    val rawData = rdd.mapPartitions { iter =>
-      Iterator(iter.toArray)
-    }.collect()
-
-    val maxVec = (0 until vectorLength).toArray.map { j =>
-      (0 until numWorkers).toArray.map { i => rawData(i)(j) }.max
-    }
-
-    val allReduceResults = rdd.mapPartitions { iter =>
-      Communicator.init(trackerEnvs)
-      val arr = iter.toArray
-      val results = Communicator.allReduce(arr, Communicator.OpType.MAX)
-      Communicator.shutdown()
-      Iterator(results)
-    }.cache()
-
-    val sparkThread = new Thread() {
-      override def run(): Unit = {
-        allReduceResults.foreachPartition(() => _)
-        val byPartitionResults = allReduceResults.collect()
-        assert(byPartitionResults(0).length == vectorLength)
-        collectedAllReduceResults.put(byPartitionResults(0))
-      }
-    }
-    sparkThread.start()
-    assert(tracker.waitFor(0L) == 0)
-    sparkThread.join()
-
-    assert(collectedAllReduceResults.poll().sameElements(maxVec))
-  }
-
   test("test Java RabitTracker wrapper's exception handling: it should not hang forever.") {
     /*
       Deliberately create new instances of SparkContext in each unit test to avoid reusing the
@@ -193,68 +140,6 @@ class CommunicatorRobustnessSuite extends FunSuite with PerTest {
     assert(tracker.waitFor(0) != 0)
   }
 
-  test("test Scala RabitTracker's exception handling: it should not hang forever.") {
-    val rdd = sc.parallelize(1 to numWorkers, numWorkers).cache()
-
-    val tracker = new ScalaRabitTracker(numWorkers)
-    tracker.start(0)
-    val trackerEnvs = tracker.getWorkerEnvs
-
-    val workerCount: Int = numWorkers
-    val dummyTasks = rdd.mapPartitions { iter =>
-      Communicator.init(trackerEnvs)
-      val index = iter.next()
-      Thread.sleep(100 + index * 10)
-      if (index == workerCount) {
-        // kill the worker by throwing an exception
-        throw new RuntimeException("Worker exception.")
-      }
-      Communicator.shutdown()
-      Iterator(index)
-    }.cache()
-
-    val sparkThread = new Thread() {
-      override def run(): Unit = {
-        // forces a Spark job.
-        dummyTasks.foreachPartition(() => _)
-      }
-    }
-    sparkThread.setUncaughtExceptionHandler(tracker)
-    sparkThread.start()
-    assert(tracker.waitFor(0L) == TrackerStatus.FAILURE.getStatusCode)
-  }
-
-  test("test Scala RabitTracker's workerConnectionTimeout") {
-    val rdd = sc.parallelize(1 to numWorkers, numWorkers).cache()
-
-    val tracker = new ScalaRabitTracker(numWorkers)
-    tracker.start(500)
-    val trackerEnvs = tracker.getWorkerEnvs
-
-    val dummyTasks = rdd.mapPartitions { iter =>
-      val index = iter.next()
-      // simulate that the first worker cannot connect to tracker due to network issues.
-      if (index != 1) {
-        Communicator.init(trackerEnvs)
-        Thread.sleep(1000)
-        Communicator.shutdown()
-      }
-
-      Iterator(index)
-    }.cache()
-
-    val sparkThread = new Thread() {
-      override def run(): Unit = {
-        // forces a Spark job.
-        dummyTasks.foreachPartition(() => _)
-      }
-    }
-    sparkThread.setUncaughtExceptionHandler(tracker)
-    sparkThread.start()
-    // should fail due to connection timeout
-    assert(tracker.waitFor(0L) == TrackerStatus.FAILURE.getStatusCode)
-  }
-
   test("should allow the dataframe containing communicator calls to be partially evaluated for" +
     " multiple times (ISSUE-4406)") {
     val paramMap = Map(
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala
index 61766b755..8d9723bb6 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala
@@ -17,13 +17,13 @@
 package ml.dmlc.xgboost4j.scala.spark
 
 import org.apache.spark.ml.linalg.Vectors
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 import ml.dmlc.xgboost4j.scala.spark.util.DataUtils
 import ml.dmlc.xgboost4j.scala.spark.util.DataUtils.PackedParams
 
 import org.apache.spark.sql.functions._
 
-class DeterministicPartitioningSuite extends FunSuite with TmpFolderPerSuite with PerTest {
+class DeterministicPartitioningSuite extends AnyFunSuite with TmpFolderPerSuite with PerTest {
 
   test("perform deterministic partitioning when checkpointInternal and" +
     " checkpointPath is set (Classifier)") {
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala
index cdcfd76f5..adc9c1068 100755
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala
@@ -19,10 +19,10 @@ package ml.dmlc.xgboost4j.scala.spark
 import java.io.File
 
 import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, ExternalCheckpointManager, XGBoost => SXGBoost}
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 import org.apache.hadoop.fs.{FileSystem, Path}
 
-class ExternalCheckpointManagerSuite extends FunSuite with TmpFolderPerSuite with PerTest {
+class ExternalCheckpointManagerSuite extends AnyFunSuite with TmpFolderPerSuite with PerTest {
 
   private def produceParamMap(checkpointPath: String, checkpointInterval: Int):
   Map[String, Any] = {
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala
index e0151dde3..789fd162b 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala
@@ -18,12 +18,12 @@ package ml.dmlc.xgboost4j.scala.spark
 
 import org.apache.spark.Partitioner
 import org.apache.spark.ml.feature.VectorAssembler
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 import org.apache.spark.sql.functions._
 
 import scala.util.Random
 
-class FeatureSizeValidatingSuite extends FunSuite with PerTest {
+class FeatureSizeValidatingSuite extends AnyFunSuite with PerTest {
 
   test("transform throwing exception if feature size of dataset is greater than model's") {
     val modelPath = getClass.getResource("/model/0.82/model").getPath
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala
index 5863e2ace..6a7f7129d 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala
@@ -19,12 +19,12 @@ package ml.dmlc.xgboost4j.scala.spark
 import org.apache.spark.ml.feature.VectorAssembler
 import org.apache.spark.ml.linalg.Vectors
 import org.apache.spark.sql.DataFrame
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 import scala.util.Random
 
 import org.apache.spark.SparkException
 
-class MissingValueHandlingSuite extends FunSuite with PerTest {
+class MissingValueHandlingSuite extends AnyFunSuite with PerTest {
   test("dense vectors containing missing value") {
     def buildDenseDataFrame(): DataFrame = {
       val numRows = 100
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala
index e3468b811..11b60e74d 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala
@@ -16,12 +16,13 @@
 
 package ml.dmlc.xgboost4j.scala.spark
 
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.funsuite.AnyFunSuite
 
 import org.apache.spark.SparkException
 import org.apache.spark.ml.param.ParamMap
 
-class ParameterSuite extends FunSuite with PerTest with BeforeAndAfterAll {
+class ParameterSuite extends AnyFunSuite with PerTest with BeforeAndAfterAll {
 
   test("XGBoost and Spark parameters synchronize correctly") {
     val xgbParamMap = Map("eta" -> "1", "objective" -> "binary:logistic",
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala
index e96618c51..24bc00e18 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala
@@ -22,13 +22,14 @@ import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
 
 import org.apache.spark.SparkContext
 import org.apache.spark.sql._
-import org.scalatest.{BeforeAndAfterEach, FunSuite}
+import org.scalatest.BeforeAndAfterEach
+import org.scalatest.funsuite.AnyFunSuite
 import scala.math.min
 import scala.util.Random
 
 import org.apache.commons.io.IOUtils
 
-trait PerTest extends BeforeAndAfterEach { self: FunSuite =>
+trait PerTest extends BeforeAndAfterEach { self: AnyFunSuite =>
 
   protected val numWorkers: Int = min(Runtime.getRuntime.availableProcessors(), 4)
 
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala
index cf8dcca57..5425b8647 100755
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala
@@ -25,9 +25,9 @@ import scala.util.Random
 import org.apache.spark.ml.feature._
 import org.apache.spark.ml.{Pipeline, PipelineModel}
 import org.apache.spark.sql.functions._
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 
-class PersistenceSuite extends FunSuite with TmpFolderPerSuite with PerTest {
+class PersistenceSuite extends AnyFunSuite with TmpFolderPerSuite with PerTest {
 
   test("test persistence of XGBoostClassifier and XGBoostClassificationModel") {
     val eval = new EvalError()
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TmpFolderPerSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TmpFolderPerSuite.scala
index 96b74d679..bb523ffdf 100755
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TmpFolderPerSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TmpFolderPerSuite.scala
@@ -19,9 +19,10 @@ package ml.dmlc.xgboost4j.scala.spark
 import java.nio.file.{Files, Path}
 
 import org.apache.spark.network.util.JavaUtils
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.funsuite.AnyFunSuite
 
-trait TmpFolderPerSuite extends BeforeAndAfterAll { self: FunSuite =>
+trait TmpFolderPerSuite extends BeforeAndAfterAll { self: AnyFunSuite =>
   protected var tempDir: Path = _
 
   override def beforeAll(): Unit = {
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
index f31207b9f..0031be9c7 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
@@ -22,13 +22,13 @@ import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
 
 import org.apache.spark.ml.linalg._
 import org.apache.spark.sql._
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 import org.apache.commons.io.IOUtils
 
 import org.apache.spark.Partitioner
 import org.apache.spark.ml.feature.VectorAssembler
 
-class XGBoostClassifierSuite extends FunSuite with PerTest with TmpFolderPerSuite {
+class XGBoostClassifierSuite extends AnyFunSuite with PerTest with TmpFolderPerSuite {
 
   protected val treeMethod: String = "auto"
 
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostCommunicatorRegressionSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostCommunicatorRegressionSuite.scala
index a7310f1ab..86b82e63c 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostCommunicatorRegressionSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostCommunicatorRegressionSuite.scala
@@ -21,11 +21,11 @@ import ml.dmlc.xgboost4j.scala.Booster
 import scala.collection.JavaConverters._
 
 import org.apache.spark.sql._
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 
 import org.apache.spark.SparkException
 
-class XGBoostCommunicatorRegressionSuite extends FunSuite with PerTest {
+class XGBoostCommunicatorRegressionSuite extends AnyFunSuite with PerTest {
   val predictionErrorMin = 0.00001f
   val maxFailure = 2;
 
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala
index 7d588d97c..086fda2d7 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala
@@ -19,9 +19,9 @@ package ml.dmlc.xgboost4j.scala.spark
 import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
 
 import org.apache.spark.sql._
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 
-class XGBoostConfigureSuite extends FunSuite with PerTest {
+class XGBoostConfigureSuite extends AnyFunSuite with PerTest {
 
   override def sparkSessionBuilder: SparkSession.Builder = super.sparkSessionBuilder
       .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
index 0bf8c2fbb..c1e34224c 100755
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
@@ -22,12 +22,12 @@ import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
 import ml.dmlc.xgboost4j.scala.DMatrix
 
 import org.apache.spark.{SparkException, TaskContext}
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 
 import org.apache.spark.ml.feature.VectorAssembler
 import org.apache.spark.sql.functions.lit
 
-class XGBoostGeneralSuite extends FunSuite with TmpFolderPerSuite with PerTest {
+class XGBoostGeneralSuite extends AnyFunSuite with TmpFolderPerSuite with PerTest {
 
   test("distributed training with the specified worker number") {
     val trainingRDD = sc.parallelize(Classification.train)
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala
index 4e3d59b25..efcb38cf6 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala
@@ -23,11 +23,11 @@ import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
 import org.apache.spark.ml.linalg.{Vector, Vectors}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.{DataFrame, Row}
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 
 import org.apache.spark.ml.feature.VectorAssembler
 
-class XGBoostRegressorSuite extends FunSuite with PerTest with TmpFolderPerSuite {
+class XGBoostRegressorSuite extends AnyFunSuite with PerTest with TmpFolderPerSuite {
   protected val treeMethod: String = "auto"
 
   test("XGBoost-Spark XGBoostRegressor output should match XGBoost4j") {
diff --git a/jvm-packages/xgboost4j-tester/generate_pom.py b/jvm-packages/xgboost4j-tester/generate_pom.py
index edc9759bd..06372e9b2 100644
--- a/jvm-packages/xgboost4j-tester/generate_pom.py
+++ b/jvm-packages/xgboost4j-tester/generate_pom.py
@@ -69,7 +69,7 @@ pom_template = """
     <dependency>
       <groupId>org.scalactic</groupId>
       <artifactId>scalactic_${{scala.binary.version}}</artifactId>
-      <version>3.0.8</version>
+      <version>3.2.15</version>
       <scope>test</scope>
     </dependency>
     <dependency>
diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml
index aa8694751..3a1c4b2cf 100644
--- a/jvm-packages/xgboost4j/pom.xml
+++ b/jvm-packages/xgboost4j/pom.xml
@@ -31,22 +31,10 @@
             <version>4.13.2</version>
             <scope>test</scope>
         </dependency>
-        <dependency>
-            <groupId>com.typesafe.akka</groupId>
-            <artifactId>akka-actor_${scala.binary.version}</artifactId>
-            <version>2.6.20</version>
-            <scope>compile</scope>
-        </dependency>
-        <dependency>
-            <groupId>com.typesafe.akka</groupId>
-            <artifactId>akka-testkit_${scala.binary.version}</artifactId>
-            <version>2.6.20</version>
-            <scope>test</scope>
-        </dependency>
         <dependency>
           <groupId>org.scalatest</groupId>
           <artifactId>scalatest_${scala.binary.version}</artifactId>
-          <version>3.0.5</version>
+          <version>3.2.15</version>
           <scope>provided</scope>
         </dependency>
     </dependencies>
diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/RabitTracker.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/RabitTracker.scala
deleted file mode 100644
index fb388d083..000000000
--- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/RabitTracker.scala
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.rabit
-
-import java.net.{InetAddress, InetSocketAddress}
-
-import akka.actor.ActorSystem
-import akka.pattern.ask
-import ml.dmlc.xgboost4j.java.{IRabitTracker, TrackerProperties}
-import ml.dmlc.xgboost4j.scala.rabit.handler.RabitTrackerHandler
-
-import scala.concurrent.duration._
-import scala.concurrent.{Await, Future}
-import scala.util.{Failure, Success, Try}
-
-/**
-  * Scala implementation of the Rabit tracker interface without Python dependency.
-  * The Scala Rabit tracker fully implements the timeout logic, effectively preventing the tracker
-  * (and thus any distributed tasks) to hang indefinitely due to network issues or worker node
-  * failures.
-  *
-  * Note that this implementation is currently experimental, and should be used at your own risk.
-  *
-  * Example usage:
-  * {{{
-  *   import scala.concurrent.duration._
-  *
-  *   val tracker = new RabitTracker(32)
-  *   // allow up to 10 minutes for all workers to connect to the tracker.
-  *   tracker.start(10 minutes)
-  *
-  *   /* ...
-  *      launching workers in parallel
-  *      ...
-  *   */
-  *
-  *   // wait for worker execution up to 6 hours.
-  *   // providing a finite timeout prevents a long-running task from hanging forever in
-  *   // catastrophic events, like the loss of an executor during model training.
-  *   tracker.waitFor(6 hours)
-  * }}}
-  *
-  * @param numWorkers Number of distributed workers from which the tracker expects connections.
-  * @param port The minimum port number that the tracker binds to.
-  *             If port is omitted, or given as None, a random ephemeral port is chosen at runtime.
-  * @param maxPortTrials The maximum number of trials of socket binding, by sequentially
-  *                      increasing the port number.
-  */
-private[scala] class RabitTracker(numWorkers: Int, port: Option[Int] = None,
-                                  maxPortTrials: Int = 1000)
-  extends IRabitTracker {
-
-  import scala.collection.JavaConverters._
-
-  require(numWorkers >=1, "numWorkers must be greater than or equal to one (1).")
-
-  val system = ActorSystem.create("RabitTracker")
-  val handler = system.actorOf(RabitTrackerHandler.props(numWorkers), "Handler")
-  implicit val askTimeout: akka.util.Timeout = akka.util.Timeout(30 seconds)
-  private[this] val tcpBindingTimeout: Duration = 1 minute
-
-  var workerEnvs: Map[String, String] = Map.empty
-
-  override def uncaughtException(t: Thread, e: Throwable): Unit = {
-    handler ? RabitTrackerHandler.InterruptTracker(e)
-  }
-
-  /**
-    * Start the Rabit tracker.
-    *
-    * @param timeout The timeout for awaiting connections from worker nodes.
-    *      Note that when used in Spark applications, because all Spark transformations are
-    *      lazily executed, the I/O time for loading RDDs/DataFrames from external sources
-    *      (local dist, HDFS, S3 etc.) must be taken into account for the timeout value.
-    *      If the timeout value is too small, the Rabit tracker will likely timeout before workers
-    *      establishing connections to the tracker, due to the overhead of loading data.
-    *      Using a finite timeout is encouraged, as it prevents the tracker (thus the Spark driver
-    *      running it) from hanging indefinitely due to worker connection issues (e.g. firewall.)
-    * @return Boolean flag indicating if the Rabit tracker starts successfully.
-    */
-  private def start(timeout: Duration): Boolean = {
-    val hostAddress = Option(TrackerProperties.getInstance().getHostIp)
-      .map(InetAddress.getByName).getOrElse(InetAddress.getLocalHost)
-
-    handler ? RabitTrackerHandler.StartTracker(
-      new InetSocketAddress(hostAddress, port.getOrElse(0)), maxPortTrials, timeout)
-
-    // block by waiting for the actor to bind to a port
-    Try(Await.result(handler ? RabitTrackerHandler.RequestBoundFuture, askTimeout.duration)
-      .asInstanceOf[Future[Map[String, String]]]) match {
-      case Success(futurePortBound) =>
-        // The success of the Future is contingent on binding to an InetSocketAddress.
-        val isBound = Try(Await.ready(futurePortBound, tcpBindingTimeout)).isSuccess
-        if (isBound) {
-          workerEnvs = Await.result(futurePortBound, 0 nano)
-        }
-        isBound
-      case Failure(ex: Throwable) =>
-        false
-    }
-  }
-
-  /**
-    * Start the Rabit tracker.
-    *
-    * @param connectionTimeoutMillis Timeout, in milliseconds, for the tracker to wait for worker
-    *                                connections. If a non-positive value is provided, the tracker
-    *                                waits for incoming worker connections indefinitely.
-    * @return Boolean flag indicating if the Rabit tracker starts successfully.
-    */
-  def start(connectionTimeoutMillis: Long): Boolean = {
-    if (connectionTimeoutMillis <= 0) {
-      start(Duration.Inf)
-    } else {
-      start(Duration.fromNanos(connectionTimeoutMillis * 1e6))
-    }
-  }
-
-  def stop(): Unit = {
-    system.terminate()
-  }
-
-  /**
-    * Get a Map of necessary environment variables to initiate Rabit workers.
-    *
-    * @return HashMap containing tracker information.
-    */
-  def getWorkerEnvs: java.util.Map[String, String] = {
-    new java.util.HashMap((workerEnvs ++ Map(
-        "DMLC_NUM_WORKER" -> numWorkers.toString,
-        "DMLC_NUM_SERVER" -> "0"
-    )).asJava)
-  }
-
-  /**
-    * Await workers to complete assigned tasks for at most 'atMostMillis' milliseconds.
-    * This method blocks until timeout or task completion.
-    *
-    * @param atMost the maximum execution time for the workers. By default,
-    *     the tracker waits for the workers indefinitely.
-    * @return 0 if the tasks complete successfully, and non-zero otherwise.
-    */
-  private def waitFor(atMost: Duration): Int = {
-    // request the completion Future from the tracker actor
-    Try(Await.result(handler ? RabitTrackerHandler.RequestCompletionFuture, askTimeout.duration)
-      .asInstanceOf[Future[Int]]) match {
-      case Success(futureCompleted) =>
-        // wait for all workers to complete synchronously.
-        val statusCode = Try(Await.result(futureCompleted, atMost)) match {
-          case Success(n) if n == numWorkers =>
-            IRabitTracker.TrackerStatus.SUCCESS.getStatusCode
-          case Success(n) if n < numWorkers =>
-            IRabitTracker.TrackerStatus.TIMEOUT.getStatusCode
-          case Failure(e) =>
-            IRabitTracker.TrackerStatus.FAILURE.getStatusCode
-        }
-        system.terminate()
-        statusCode
-      case Failure(ex: Throwable) =>
-        system.terminate()
-        IRabitTracker.TrackerStatus.FAILURE.getStatusCode
-    }
-  }
-
-  /**
-    * Await workers to complete assigned tasks for at most 'atMostMillis' milliseconds.
-    * This method blocks until timeout or task completion.
-    *
-    * @param atMostMillis Number of milliseconds for the tracker to wait for workers. If a
-    *                     non-positive number is given, the tracker waits indefinitely.
-    * @return 0 if the tasks complete successfully, and non-zero otherwise
-    */
-  def waitFor(atMostMillis: Long): Int = {
-    if (atMostMillis <= 0) {
-      waitFor(Duration.Inf)
-    } else {
-      waitFor(Duration.fromNanos(atMostMillis * 1e6))
-    }
-  }
-}
-
diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitTrackerHandler.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitTrackerHandler.scala
deleted file mode 100644
index f9de71746..000000000
--- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitTrackerHandler.scala
+++ /dev/null
@@ -1,361 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.rabit.handler
-
-import java.net.InetSocketAddress
-import java.util.UUID
-
-import scala.concurrent.duration._
-import scala.collection.mutable
-import scala.concurrent.{Promise, TimeoutException}
-import akka.io.{IO, Tcp}
-import akka.actor._
-import ml.dmlc.xgboost4j.java.XGBoostError
-import ml.dmlc.xgboost4j.scala.rabit.util.{AssignedRank, LinkMap}
-
-import scala.util.{Failure, Random, Success, Try}
-
-/** The Akka actor for handling and coordinating Rabit worker connections.
-  * This is the main actor for handling socket connections, interacting with the synchronous
-  * tracker interface, and resolving tree/ring/parent dependencies between workers.
-  *
-  * @param numWorkers Number of workers to track.
-  */
-private[scala] class RabitTrackerHandler(numWorkers: Int)
-  extends Actor with ActorLogging {
-
-  import context.system
-  import RabitWorkerHandler._
-  import RabitTrackerHandler._
-
-  private[this] val promisedWorkerEnvs = Promise[Map[String, String]]()
-  private[this] val promisedShutdownWorkers = Promise[Int]()
-  private[this] val tcpManager = IO(Tcp)
-
-  // resolves worker connection dependency.
-  val resolver = context.actorOf(Props(classOf[WorkerDependencyResolver], self), "Resolver")
-
-  // workers that have sent "shutdown" signal
-  private[this] val shutdownWorkers = mutable.Set.empty[Int]
-  private[this] val jobToRankMap = mutable.HashMap.empty[String, Int]
-  private[this] val actorRefToHost = mutable.HashMap.empty[ActorRef, String]
-  private[this] val ranksToAssign = mutable.ListBuffer(0 until numWorkers: _*)
-  private[this] var maxPortTrials = 0
-  private[this] var workerConnectionTimeout: Duration = Duration.Inf
-  private[this] var portTrials = 0
-  private[this] val startedWorkers = mutable.Set.empty[Int]
-
-  val linkMap = new LinkMap(numWorkers)
-
-  def decideRank(rank: Int, jobId: String = "NULL"): Option[Int] = {
-    rank match {
-      case r if r >= 0 => Some(r)
-      case _ =>
-        jobId match {
-          case "NULL" => None
-          case jid => jobToRankMap.get(jid)
-        }
-    }
-  }
-
-  /**
-    * Handler for all Akka Tcp connection/binding events. Read/write over the socket is handled
-    * by the RabitWorkerHandler.
-    *
-    * @param event Generic Tcp.Event
-    */
-  private def handleTcpEvents(event: Tcp.Event): Unit = event match {
-    case Tcp.Bound(local) =>
-      // expect all workers to connect within timeout
-      log.info(s"Tracker listening @ ${local.getAddress.getHostAddress}:${local.getPort}")
-      log.info(s"Worker connection timeout is $workerConnectionTimeout.")
-
-      context.setReceiveTimeout(workerConnectionTimeout)
-      promisedWorkerEnvs.success(Map(
-        "DMLC_TRACKER_URI" -> local.getAddress.getHostAddress,
-        "DMLC_TRACKER_PORT" -> local.getPort.toString,
-        // not required because the world size will be communicated to the
-        // worker node after the rank is assigned.
-        "rabit_world_size" -> numWorkers.toString
-      ))
-
-    case Tcp.CommandFailed(cmd: Tcp.Bind) =>
-      if (portTrials < maxPortTrials) {
-        portTrials += 1
-        tcpManager ! Tcp.Bind(self,
-          new InetSocketAddress(cmd.localAddress.getAddress, cmd.localAddress.getPort + 1),
-          backlog = 256)
-      }
-
-    case Tcp.Connected(remote, local) =>
-      log.debug(s"Incoming connection from worker @ ${remote.getAddress.getHostAddress}")
-      // revoke timeout if all workers have connected.
-      val workerHandler = context.actorOf(RabitWorkerHandler.props(
-        remote.getAddress.getHostAddress, numWorkers, self, sender()
-      ), s"ConnectionHandler-${UUID.randomUUID().toString}")
-      val connection = sender()
-      connection ! Tcp.Register(workerHandler, keepOpenOnPeerClosed = true)
-
-      actorRefToHost.put(workerHandler, remote.getAddress.getHostName)
-  }
-
-  /**
-    * Handles external tracker control messages sent by RabitTracker (usually in ask patterns)
-    * to interact with the tracker interface.
-    *
-    * @param trackerMsg control messages sent by RabitTracker class.
-    */
-  private def handleTrackerControlMessage(trackerMsg: TrackerControlMessage): Unit =
-    trackerMsg match {
-
-    case msg: StartTracker =>
-      maxPortTrials = msg.maxPortTrials
-      workerConnectionTimeout = msg.connectionTimeout
-
-      // if the port number is missing, try binding to a random ephemeral port.
-      if (msg.addr.getPort == 0) {
-        tcpManager ! Tcp.Bind(self,
-          new InetSocketAddress(msg.addr.getAddress, new Random().nextInt(61000 - 32768) + 32768),
-          backlog = 256)
-      } else {
-        tcpManager ! Tcp.Bind(self, msg.addr, backlog = 256)
-      }
-      sender() ! true
-
-    case RequestBoundFuture =>
-      sender() ! promisedWorkerEnvs.future
-
-    case RequestCompletionFuture =>
-      sender() ! promisedShutdownWorkers.future
-
-    case InterruptTracker(e) =>
-      log.error(e, "Uncaught exception thrown by worker.")
-      // make sure that waitFor() does not hang indefinitely.
-      promisedShutdownWorkers.failure(e)
-      context.stop(self)
-  }
-
-  /**
-    * Handles messages sent by child actors representing connecting Rabit workers, by brokering
-    * messages to the dependency resolver, and processing worker commands.
-    *
-    * @param workerMsg Message sent by RabitWorkerHandler actors.
-    */
-  private def handleRabitWorkerMessage(workerMsg: RabitWorkerRequest): Unit = workerMsg match {
-    case req @ RequestAwaitConnWorkers(_, _) =>
-      // since the requester may request to connect to other workers
-      // that have not fully set up, delegate this request to the
-      // dependency resolver which handles the dependencies properly.
-      resolver forward req
-
-    // ---- Rabit worker commands: start/recover/shutdown/print ----
-    case WorkerTrackerPrint(_, _, _, msg) =>
-      log.info(msg.trim)
-
-    case WorkerShutdown(rank, _, _) =>
-      assert(rank >= 0, "Invalid rank.")
-      assert(!shutdownWorkers.contains(rank))
-      shutdownWorkers.add(rank)
-
-      log.info(s"Received shutdown signal from $rank")
-
-      if (shutdownWorkers.size == numWorkers) {
-        promisedShutdownWorkers.success(shutdownWorkers.size)
-      }
-
-    case WorkerRecover(prevRank, worldSize, jobId) =>
-      assert(prevRank >= 0)
-      sender() ! linkMap.assignRank(prevRank)
-
-    case WorkerStart(rank, worldSize, jobId) =>
-      assert(worldSize == numWorkers || worldSize == -1,
-        s"Purported worldSize ($worldSize) does not match worker count ($numWorkers)."
-      )
-
-      Try(decideRank(rank, jobId).getOrElse(ranksToAssign.remove(0))) match {
-        case Success(wkRank) =>
-          if (jobId != "NULL") {
-            jobToRankMap.put(jobId, wkRank)
-          }
-
-          val assignedRank = linkMap.assignRank(wkRank)
-          sender() ! assignedRank
-          resolver ! assignedRank
-
-          log.info("Received start signal from " +
-            s"${actorRefToHost.getOrElse(sender(), "")} [rank: $wkRank]")
-
-        case Failure(ex: IndexOutOfBoundsException) =>
-          // More than worldSize workers have connected, likely due to executor loss.
-          // Since Rabit currently does not support crash recovery (because the Allreduce results
-          // are not cached by the tracker, and because existing workers cannot reestablish
-          // connections to newly spawned executor/worker), the most reasonble action here is to
-          // interrupt the tracker immediate with failure state.
-          log.error("Received invalid start signal from " +
-            s"${actorRefToHost.getOrElse(sender(), "")}: all $worldSize workers have started."
-          )
-          promisedShutdownWorkers.failure(new XGBoostError("Invalid start signal" +
-            " received from worker, likely due to executor loss."))
-
-        case Failure(ex) =>
-          log.error(ex, "Unexpected error")
-          promisedShutdownWorkers.failure(ex)
-      }
-
-
-    // ---- Dependency resolving related messages ----
-    case msg @ WorkerStarted(host, rank, awaitingAcceptance) =>
-      log.info(s"Worker $host (rank: $rank) has started.")
-      resolver forward msg
-
-      startedWorkers.add(rank)
-      if (startedWorkers.size == numWorkers) {
-        log.info("All workers have started.")
-      }
-
-    case req @ DropFromWaitingList(_) =>
-      // all peer workers in dependency link map have connected;
-      // forward message to resolver to update dependencies.
-      resolver forward req
-
-    case _ =>
-  }
-
-  def receive: Actor.Receive = {
-    case tcpEvent: Tcp.Event => handleTcpEvents(tcpEvent)
-    case trackerMsg: TrackerControlMessage => handleTrackerControlMessage(trackerMsg)
-    case workerMsg: RabitWorkerRequest => handleRabitWorkerMessage(workerMsg)
-
-    case akka.actor.ReceiveTimeout =>
-      if (startedWorkers.size < numWorkers) {
-        promisedShutdownWorkers.failure(
-          new TimeoutException("Timed out waiting for workers to connect: " +
-            s"${numWorkers - startedWorkers.size} of $numWorkers did not start/connect.")
-        )
-        context.stop(self)
-      }
-
-      context.setReceiveTimeout(Duration.Undefined)
-  }
-}
-
-/**
-  * Resolve the dependency between nodes as they connect to the tracker.
-  * The dependency is enforced that a worker of rank K depends on its neighbors (from the treeMap
-  * and ringMap) whose ranks are smaller than K. Since ranks are assigned in the order of
-  * connections by workers, this dependency constraint assumes that a worker node connects first
-  * is likely to finish setup first.
-  */
-private[rabit] class WorkerDependencyResolver(handler: ActorRef) extends Actor with ActorLogging {
-  import RabitWorkerHandler._
-
-  context.watch(handler)
-
-  case class Fulfillment(toConnectSet: Set[Int], promise: Promise[AwaitingConnections])
-
-  // worker nodes that have connected, but have not send WorkerStarted message.
-  private val dependencyMap = mutable.Map.empty[Int, Set[Int]]
-  private val startedWorkers = mutable.Set.empty[Int]
-  // worker nodes that have started, and await for connections.
-  private val awaitConnWorkers = mutable.Map.empty[Int, ActorRef]
-  private val pendingFulfillment = mutable.Map.empty[Int, Fulfillment]
-
-  def awaitingWorkers(linkSet: Set[Int]): AwaitingConnections = {
-    val connSet = awaitConnWorkers.toMap
-      .filterKeys(k => linkSet.contains(k))
-    AwaitingConnections(connSet, linkSet.size - connSet.size)
-  }
-
-  def receive: Actor.Receive = {
-    // a copy of the AssignedRank message that is also sent to the worker
-    case AssignedRank(rank, tree_neighbors, ring, parent) =>
-      // the workers that the worker of given `rank` depends on:
-      // worker of rank K only depends on workers with rank smaller than K.
-      val dependentWorkers = (tree_neighbors.toSet ++ Set(ring._1, ring._2))
-        .filter{ r => r != -1 && r < rank}
-
-      log.debug(s"Rank $rank connected, dependencies: $dependentWorkers")
-      dependencyMap.put(rank, dependentWorkers)
-
-    case RequestAwaitConnWorkers(rank, toConnectSet) =>
-      val promise = Promise[AwaitingConnections]()
-
-      assert(dependencyMap.contains(rank))
-
-      val updatedDependency = dependencyMap(rank) diff startedWorkers
-      if (updatedDependency.isEmpty) {
-        // all dependencies are satisfied
-        log.debug(s"Rank $rank has all dependencies satisfied.")
-        promise.success(awaitingWorkers(toConnectSet))
-      } else {
-        log.debug(s"Rank $rank's request for AwaitConnWorkers is pending fulfillment.")
-        // promise is pending fulfillment due to unresolved dependency
-        pendingFulfillment.put(rank, Fulfillment(toConnectSet, promise))
-      }
-
-      sender() ! promise.future
-
-    case WorkerStarted(_, started, awaitingAcceptance) =>
-      startedWorkers.add(started)
-      if (awaitingAcceptance > 0) {
-        awaitConnWorkers.put(started, sender())
-      }
-
-      // remove the started rank from all dependencies.
-      dependencyMap.remove(started)
-      dependencyMap.foreach { case (r, dset) =>
-        val updatedDependency = dset diff startedWorkers
-        // fulfill the future if all dependencies are met (started.)
-        if (updatedDependency.isEmpty) {
-          log.debug(s"Rank $r has all dependencies satisfied.")
-          pendingFulfillment.remove(r).map{
-            case Fulfillment(toConnectSet, promise) =>
-              promise.success(awaitingWorkers(toConnectSet))
-          }
-        }
-
-        dependencyMap.update(r, updatedDependency)
-      }
-
-    case DropFromWaitingList(rank) =>
-      assert(awaitConnWorkers.remove(rank).isDefined)
-
-    case Terminated(ref) =>
-      if (ref.equals(handler)) {
-        context.stop(self)
-      }
-  }
-}
-
-private[scala] object RabitTrackerHandler {
-  // Messages sent by RabitTracker to this RabitTrackerHandler actor
-  trait TrackerControlMessage
-  case object RequestCompletionFuture extends TrackerControlMessage
-  case object RequestBoundFuture extends TrackerControlMessage
-  // Start the Rabit tracker at given socket address awaiting worker connections.
-  // All workers must connect to the tracker before connectionTimeout, otherwise the tracker will
-  // shut down due to timeout.
-  case class StartTracker(addr: InetSocketAddress,
-                          maxPortTrials: Int,
-                          connectionTimeout: Duration) extends TrackerControlMessage
-  // To interrupt the tracker handler due to uncaught exception thrown by the thread acting as
-  // driver for the distributed training.
-  case class InterruptTracker(e: Throwable) extends TrackerControlMessage
-
-  def props(numWorkers: Int): Props =
-    Props(new RabitTrackerHandler(numWorkers))
-}
diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitWorkerHandler.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitWorkerHandler.scala
deleted file mode 100644
index 234c4d25a..000000000
--- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitWorkerHandler.scala
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.rabit.handler
-
-import java.nio.{ByteBuffer, ByteOrder}
-
-import akka.io.Tcp
-import akka.actor._
-import akka.util.ByteString
-import ml.dmlc.xgboost4j.scala.rabit.util.{AssignedRank, RabitTrackerHelpers}
-
-import scala.concurrent.{Await, Future}
-import scala.concurrent.duration._
-import scala.util.Try
-
-/**
-  * Actor to handle socket communication from worker node.
-  * To handle fragmentation in received data, this class acts like a FSM
-  * (finite-state machine) to keep track of the internal states.
-  *
-  * @param host IP address of the remote worker
-  * @param worldSize number of total workers
-  * @param tracker the RabitTrackerHandler actor reference
-  */
-private[scala] class RabitWorkerHandler(host: String, worldSize: Int, tracker: ActorRef,
-                                        connection: ActorRef)
-  extends FSM[RabitWorkerHandler.State, RabitWorkerHandler.DataStruct]
-    with ActorLogging with Stash {
-
-  import RabitWorkerHandler._
-  import RabitTrackerHelpers._
-
-  private[this] var rank: Int = 0
-  private[this] var port: Int = 0
-
-  // indicate if the connection is transient (like "print" or "shutdown")
-  private[this] var transient: Boolean = false
-  private[this] var peerClosed: Boolean = false
-
-  // number of workers pending acceptance of current worker
-  private[this] var awaitingAcceptance: Int = 0
-  private[this] var neighboringWorkers = Set.empty[Int]
-
-  // TODO: use a single memory allocation to host all buffers,
-  // including the transient ones for writing.
-  private[this] val readBuffer = ByteBuffer.allocate(4096)
-    .order(ByteOrder.nativeOrder())
-  // in case the received message is longer than needed,
-  // stash the spilled over part in this buffer, and send
-  // to self when transition occurs.
-  private[this] val spillOverBuffer = ByteBuffer.allocate(4096)
-    .order(ByteOrder.nativeOrder())
-  // when setup is complete, need to notify peer handlers
-  // to reduce the awaiting-connection counter.
-  private[this] var pendingAcknowledgement: Option[AcknowledgeAcceptance] = None
-
-  private def resetBuffers(): Unit = {
-    readBuffer.clear()
-    if (spillOverBuffer.position() > 0) {
-      spillOverBuffer.flip()
-      self ! Tcp.Received(ByteString.fromByteBuffer(spillOverBuffer))
-      spillOverBuffer.clear()
-    }
-  }
-
-  private def stashSpillOver(buf: ByteBuffer): Unit = {
-    if (buf.remaining() > 0) spillOverBuffer.put(buf)
-  }
-
-  def getNeighboringWorkers: Set[Int] = neighboringWorkers
-
-  def decodeCommand(buffer: ByteBuffer): TrackerCommand = {
-    val readBuffer = buffer.duplicate().order(ByteOrder.nativeOrder())
-    readBuffer.flip()
-
-    val rank = readBuffer.getInt()
-    val worldSize = readBuffer.getInt()
-    val jobId = readBuffer.getString
-
-    val command = readBuffer.getString
-    val trackerCommand = command match {
-      case "start" => WorkerStart(rank, worldSize, jobId)
-      case "shutdown" =>
-        transient = true
-        WorkerShutdown(rank, worldSize, jobId)
-      case "recover" =>
-        require(rank >= 0, "Invalid rank for recovering worker.")
-        WorkerRecover(rank, worldSize, jobId)
-      case "print" =>
-        transient = true
-        WorkerTrackerPrint(rank, worldSize, jobId, readBuffer.getString)
-    }
-
-    stashSpillOver(readBuffer)
-    trackerCommand
-  }
-
-  startWith(AwaitingHandshake, DataStruct())
-
-  when(AwaitingHandshake) {
-    case Event(Tcp.Received(magic), _) =>
-      assert(magic.length == 4)
-      val purportedMagic = magic.asNativeOrderByteBuffer.getInt
-      assert(purportedMagic == MAGIC_NUMBER, s"invalid magic number $purportedMagic from $host")
-
-      // echo back the magic number
-      connection ! Tcp.Write(magic)
-      goto(AwaitingCommand) using StructTrackerCommand
-  }
-
-  when(AwaitingCommand) {
-    case Event(Tcp.Received(bytes), validator) =>
-      bytes.asByteBuffers.foreach { buf => readBuffer.put(buf) }
-      if (validator.verify(readBuffer)) {
-        Try(decodeCommand(readBuffer)) match {
-          case scala.util.Success(decodedCommand) =>
-            tracker ! decodedCommand
-          case scala.util.Failure(th: java.nio.BufferUnderflowException) =>
-            // BufferUnderflowException would occur if the message to print has not arrived yet.
-            // Do nothing, wait for next Tcp.Received event
-          case scala.util.Failure(th: Throwable) => throw th
-        }
-      }
-
-      stay
-    // when rank for a worker is assigned, send encoded rank information
-    // back to worker over Tcp socket.
-    case Event(aRank @ AssignedRank(assignedRank, neighbors, ring, parent), _) =>
-      log.debug(s"Assigned rank [$assignedRank] for $host, T: $neighbors, R: $ring, P: $parent")
-
-      rank = assignedRank
-      // ranks from the ring
-      val ringRanks = List(
-        // ringPrev
-        if (ring._1 != -1 && ring._1 != rank) ring._1 else -1,
-        // ringNext
-        if (ring._2 != -1 && ring._2 != rank) ring._2 else -1
-      )
-
-      // update the set of all linked workers to current worker.
-      neighboringWorkers = neighbors.toSet ++ ringRanks.filterNot(_ == -1).toSet
-
-      connection ! Tcp.Write(ByteString.fromByteBuffer(aRank.toByteBuffer(worldSize)))
-      // to prevent reading before state transition
-      connection ! Tcp.SuspendReading
-      goto(BuildingLinkMap) using StructNodes
-  }
-
-  when(BuildingLinkMap) {
-    case Event(Tcp.Received(bytes), validator) =>
-      bytes.asByteBuffers.foreach { buf =>
-        readBuffer.put(buf)
-      }
-
-      if (validator.verify(readBuffer)) {
-        readBuffer.flip()
-        // for a freshly started worker, numConnected should be 0.
-        val numConnected = readBuffer.getInt()
-        val toConnectSet = neighboringWorkers.diff(
-          (0 until numConnected).map { index => readBuffer.getInt() }.toSet)
-
-        // check which workers are currently awaiting connections
-        tracker ! RequestAwaitConnWorkers(rank, toConnectSet)
-      }
-      stay
-
-    // got a Future from the tracker (resolver) about workers that are
-    // currently awaiting connections (particularly from this node.)
-    case Event(future: Future[_], _) =>
-      // blocks execution until all dependencies for current worker is resolved.
-      Await.result(future, 1 minute).asInstanceOf[AwaitingConnections] match {
-        // numNotReachable is the number of workers that currently
-        // cannot be connected to (pending connection or setup). Instead, this worker will AWAIT
-        // connections from those currently non-reachable nodes in the future.
-        case AwaitingConnections(waitConnNodes, numNotReachable) =>
-          log.debug(s"Rank $rank needs to connect to: $waitConnNodes, # bad: $numNotReachable")
-          val buf = ByteBuffer.allocate(8).order(ByteOrder.nativeOrder())
-          buf.putInt(waitConnNodes.size).putInt(numNotReachable)
-          buf.flip()
-
-          // cache this message until the final state (SetupComplete)
-          pendingAcknowledgement = Some(AcknowledgeAcceptance(
-            waitConnNodes, numNotReachable))
-
-          connection ! Tcp.Write(ByteString.fromByteBuffer(buf))
-          if (waitConnNodes.isEmpty) {
-            connection ! Tcp.SuspendReading
-            goto(AwaitingErrorCount)
-          }
-          else {
-            waitConnNodes.foreach { case (peerRank, peerRef) =>
-              peerRef ! RequestWorkerHostPort
-            }
-
-            // a countdown for DivulgedHostPort messages.
-            stay using DataStruct(Seq.empty[DataField], waitConnNodes.size - 1)
-          }
-      }
-
-    case Event(DivulgedWorkerHostPort(peerRank, peerHost, peerPort), data) =>
-      val hostBytes = peerHost.getBytes()
-      val buffer = ByteBuffer.allocate(4 * 3 + hostBytes.length)
-        .order(ByteOrder.nativeOrder())
-      buffer.putInt(peerHost.length).put(hostBytes)
-        .putInt(peerPort).putInt(peerRank)
-
-      buffer.flip()
-      connection ! Tcp.Write(ByteString.fromByteBuffer(buffer))
-
-      if (data.counter == 0) {
-        // to prevent reading before state transition
-        connection ! Tcp.SuspendReading
-        goto(AwaitingErrorCount)
-      }
-      else {
-        stay using data.decrement()
-      }
-  }
-
-  when(AwaitingErrorCount) {
-    case Event(Tcp.Received(numErrors), _) =>
-      val buf = numErrors.asNativeOrderByteBuffer
-
-      buf.getInt match {
-        case 0 =>
-          stashSpillOver(buf)
-          goto(AwaitingPortNumber)
-        case _ =>
-          stashSpillOver(buf)
-          goto(BuildingLinkMap) using StructNodes
-      }
-  }
-
-  when(AwaitingPortNumber) {
-    case Event(Tcp.Received(assignedPort), _) =>
-      assert(assignedPort.length == 4)
-      port = assignedPort.asNativeOrderByteBuffer.getInt
-      log.debug(s"Rank $rank listening @ $host:$port")
-      // wait until the worker closes connection.
-      if (peerClosed) goto(SetupComplete) else stay
-
-    case Event(Tcp.PeerClosed, _) =>
-      peerClosed = true
-      if (port == 0) stay else goto(SetupComplete)
-  }
-
-  when(SetupComplete) {
-    case Event(ReduceWaitCount(count: Int), _) =>
-      awaitingAcceptance -= count
-      // check peerClosed to avoid prematurely stopping this actor (which sends RST to worker)
-      if (awaitingAcceptance == 0 && peerClosed) {
-        tracker ! DropFromWaitingList(rank)
-        // no longer needed.
-        context.stop(self)
-      }
-      stay
-
-    case Event(AcknowledgeAcceptance(peers, numBad), _) =>
-      awaitingAcceptance = numBad
-      tracker ! WorkerStarted(host, rank, awaitingAcceptance)
-      peers.values.foreach { peer =>
-        peer ! ReduceWaitCount(1)
-      }
-
-      if (awaitingAcceptance == 0 && peerClosed) self ! PoisonPill
-
-      stay
-
-    // can only divulge the complete host and port information
-    // when this worker is declared fully connected (otherwise
-    // port information is still missing.)
-    case Event(RequestWorkerHostPort, _) =>
-      sender() ! DivulgedWorkerHostPort(rank, host, port)
-      stay
-  }
-
-  onTransition {
-    // reset buffer when state transitions as data becomes stale
-    case _ -> SetupComplete =>
-      connection ! Tcp.ResumeReading
-      resetBuffers()
-      if (pendingAcknowledgement.isDefined) {
-        self ! pendingAcknowledgement.get
-      }
-    case _ =>
-      connection ! Tcp.ResumeReading
-      resetBuffers()
-  }
-
-  // default message handler
-  whenUnhandled {
-    case Event(Tcp.PeerClosed, _) =>
-      peerClosed = true
-      if (transient) context.stop(self)
-      stay
-  }
-}
-
-private[scala] object RabitWorkerHandler {
-  val MAGIC_NUMBER = 0xff99
-
-  // Finite states of this actor, which acts like a FSM.
-  // The following states are defined in order as the FSM progresses.
-  sealed trait State
-
-  // [1] Initial state, awaiting worker to send magic number per protocol.
-  case object AwaitingHandshake extends State
-  // [2] Awaiting worker to send command (start/print/recover/shutdown etc.)
-  case object AwaitingCommand extends State
-  // [3] Brokers connections between workers per ring/tree/parent link map.
-  case object BuildingLinkMap extends State
-  // [4] A transient state in which the worker reports the number of errors in establishing
-  // connections to other peer workers. If no errors, transition to next state.
-  case object AwaitingErrorCount extends State
-  // [5] Awaiting the worker to report its port number for accepting connections from peer workers.
-  // This port number information is later forwarded to linked workers.
-  case object AwaitingPortNumber extends State
-  // [6] Final state after completing the setup with the connecting worker. At this stage, the
-  // worker will have closed the Tcp connection. The actor remains alive to handle messages from
-  // peer actors representing workers with pending setups.
-  case object SetupComplete extends State
-
-  sealed trait DataField
-  case object IntField extends DataField
-  // an integer preceding the actual string
-  case object StringField extends DataField
-  case object IntSeqField extends DataField
-
-  object DataStruct {
-    def apply(): DataStruct = DataStruct(Seq.empty[DataField], 0)
-  }
-
-  // Internal data pertaining to individual state, used to verify the validity of packets sent by
-  // workers.
-  case class DataStruct(fields: Seq[DataField], counter: Int) {
-    /**
-      * Validate whether the provided buffer is complete (i.e., contains
-      * all data fields specified for this DataStruct.)
- *
-      * @param buf a byte buffer containing received data.
-      */
-    def verify(buf: ByteBuffer): Boolean = {
-      if (fields.isEmpty) return true
-
-      val dupBuf = buf.duplicate().order(ByteOrder.nativeOrder())
-      dupBuf.flip()
-
-      Try(fields.foldLeft(true) {
-        case (complete, field) =>
-          val remBytes = dupBuf.remaining()
-          complete && (remBytes > 0) && (remBytes >= (field match {
-            case IntField =>
-              dupBuf.position(dupBuf.position() + 4)
-              4
-            case StringField =>
-              val strLen = dupBuf.getInt
-              dupBuf.position(dupBuf.position() + strLen)
-              4 + strLen
-            case IntSeqField =>
-              val seqLen = dupBuf.getInt
-              dupBuf.position(dupBuf.position() + seqLen * 4)
-              4 + seqLen * 4
-          }))
-      }).getOrElse(false)
-    }
-
-    def increment(): DataStruct = DataStruct(fields, counter + 1)
-    def decrement(): DataStruct = DataStruct(fields, counter - 1)
-  }
-
-  val StructNodes = DataStruct(List(IntSeqField), 0)
-  val StructTrackerCommand = DataStruct(List(
-    IntField, IntField, StringField, StringField
-  ), 0)
-
-  // ---- Messages between RabitTrackerHandler and RabitTrackerConnectionHandler ----
-
-  // RabitWorkerHandler --> RabitTrackerHandler
-  sealed trait RabitWorkerRequest
-  // RabitWorkerHandler <-- RabitTrackerHandler
-  sealed trait RabitWorkerResponse
-
-  // Representations of decoded worker commands.
-  abstract class TrackerCommand(val command: String) extends RabitWorkerRequest {
-    def rank: Int
-    def worldSize: Int
-    def jobId: String
-
-    def encode: ByteString = {
-      val buf = ByteBuffer.allocate(4 * 4 + jobId.length + command.length)
-        .order(ByteOrder.nativeOrder())
-
-      buf.putInt(rank).putInt(worldSize).putInt(jobId.length).put(jobId.getBytes())
-        .putInt(command.length).put(command.getBytes()).flip()
-
-      ByteString.fromByteBuffer(buf)
-    }
-  }
-
-  case class WorkerStart(rank: Int, worldSize: Int, jobId: String)
-    extends TrackerCommand("start")
-  case class WorkerShutdown(rank: Int, worldSize: Int, jobId: String)
-    extends TrackerCommand("shutdown")
-  case class WorkerRecover(rank: Int, worldSize: Int, jobId: String)
-    extends TrackerCommand("recover")
-  case class WorkerTrackerPrint(rank: Int, worldSize: Int, jobId: String, msg: String)
-    extends TrackerCommand("print") {
-
-    override def encode: ByteString = {
-      val buf = ByteBuffer.allocate(4 * 5 + jobId.length + command.length + msg.length)
-        .order(ByteOrder.nativeOrder())
-
-      buf.putInt(rank).putInt(worldSize).putInt(jobId.length).put(jobId.getBytes())
-        .putInt(command.length).put(command.getBytes())
-        .putInt(msg.length).put(msg.getBytes()).flip()
-
-      ByteString.fromByteBuffer(buf)
-    }
-  }
-
-  // Request to remove the worker of given rank from the list of workers awaiting peer connections.
-  case class DropFromWaitingList(rank: Int) extends RabitWorkerRequest
-  // Notify the tracker that the worker of given rank has finished setup and started.
-  case class WorkerStarted(host: String, rank: Int, awaitingAcceptance: Int)
-    extends RabitWorkerRequest
-  // Request the set of workers to connect to, according to the LinkMap structure.
-  case class RequestAwaitConnWorkers(rank: Int, toConnectSet: Set[Int])
-    extends RabitWorkerRequest
-
-  // Request, from the tracker, the set of nodes to connect.
-  case class AwaitingConnections(workers: Map[Int, ActorRef], numBad: Int)
-    extends RabitWorkerResponse
-
-  // ---- Messages between ConnectionHandler actors ----
-  sealed trait IntraWorkerMessage
-
-  // Notify neighboring workers to decrease the counter of awaiting workers by `count`.
-  case class ReduceWaitCount(count: Int) extends IntraWorkerMessage
-  // Request host and port information from peer ConnectionHandler actors (acting on behave of
-  // connecting workers.) This message will be brokered by RabitTrackerHandler.
-  case object RequestWorkerHostPort extends IntraWorkerMessage
-  // Response to the above request
-  case class DivulgedWorkerHostPort(rank: Int, host: String, port: Int) extends IntraWorkerMessage
-  // A reminder to send ReduceWaitCount messages once the actor is in state "SetupComplete".
-  case class AcknowledgeAcceptance(peers: Map[Int, ActorRef], numBad: Int)
-    extends IntraWorkerMessage
-
-  // ---- End of message definitions ----
-
-  def props(host: String, worldSize: Int, tracker: ActorRef, connection: ActorRef): Props = {
-    Props(new RabitWorkerHandler(host, worldSize, tracker, connection))
-  }
-}
diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/util/LinkMap.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/util/LinkMap.scala
deleted file mode 100644
index edec4931b..000000000
--- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/util/LinkMap.scala
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.rabit.util
-
-import java.nio.{ByteBuffer, ByteOrder}
-
-/**
-  * The assigned rank to a connecting Rabit worker, along with the information of the ranks of
-  * its linked peer workers, which are critical to perform Allreduce.
-  * When RabitWorkerHandler delegates "start" or "recover" commands from the connecting worker
-  * client, RabitTrackerHandler utilizes LinkMap to figure out linkage relationships, and respond
-  * with this class as a message, which is later encoded to byte string, and sent over socket
-  * connection to the worker client.
-  *
-  * @param rank assigned rank (ranked by worker connection order: first worker connecting to the
-  *             tracker is assigned rank 0, second with rank 1, etc.)
-  * @param neighbors ranks of neighboring workers in a tree map.
-  * @param ring ranks of neighboring workers in a ring map.
-  * @param parent rank of the parent worker.
-  */
-private[rabit] case class AssignedRank(rank: Int, neighbors: Seq[Int],
-                                       ring: (Int, Int), parent: Int) {
-  /**
-    * Encode the AssignedRank message into byte sequence for socket communication with Rabit worker
-    * client.
-    * @param worldSize the number of total distributed workers. Must match `numWorkers` used in
-    *                  LinkMap.
-    * @return a ByteBuffer containing encoded data.
-    */
-  def toByteBuffer(worldSize: Int): ByteBuffer = {
-    val buffer = ByteBuffer.allocate(4 * (neighbors.length + 6)).order(ByteOrder.nativeOrder())
-    buffer.putInt(rank).putInt(parent).putInt(worldSize).putInt(neighbors.length)
-    // neighbors in tree structure
-    neighbors.foreach { n => buffer.putInt(n) }
-    buffer.putInt(if (ring._1 != -1 && ring._1 != rank) ring._1 else -1)
-    buffer.putInt(if (ring._2 != -1 && ring._2 != rank) ring._2 else -1)
-
-    buffer.flip()
-    buffer
-  }
-}
-
-private[rabit] class LinkMap(numWorkers: Int) {
-  private def getNeighbors(rank: Int): Seq[Int] = {
-    val rank1 = rank + 1
-    Vector(rank1 / 2 - 1, rank1 * 2 - 1, rank1 * 2).filter { r =>
-      r >= 0 && r < numWorkers
-    }
-  }
-
-  /**
-    * Construct a ring structure that tends to share nodes with the tree.
-    *
-    * @param treeMap
-    * @param parentMap
-    * @param rank
-    * @return Seq[Int] instance starting from rank.
-    */
-  private def constructShareRing(treeMap: Map[Int, Seq[Int]],
-                                 parentMap: Map[Int, Int],
-                                 rank: Int = 0): Seq[Int] = {
-    treeMap(rank).toSet - parentMap(rank) match {
-      case emptySet if emptySet.isEmpty =>
-        List(rank)
-      case connectionSet =>
-        connectionSet.zipWithIndex.foldLeft(List(rank)) {
-          case (ringSeq, (v, cnt)) =>
-            val vConnSeq = constructShareRing(treeMap, parentMap, v)
-            vConnSeq match {
-              case vconn if vconn.size == cnt + 1 =>
-                ringSeq ++ vconn.reverse
-              case vconn =>
-                ringSeq ++ vconn
-            }
-        }
-    }
-  }
-  /**
-    * Construct a ring connection used to recover local data.
-    *
-    * @param treeMap
-    * @param parentMap
-    */
-  private def constructRingMap(treeMap: Map[Int, Seq[Int]], parentMap: Map[Int, Int]) = {
-    assert(parentMap(0) == -1)
-
-    val sharedRing = constructShareRing(treeMap, parentMap, 0).toVector
-    assert(sharedRing.length == treeMap.size)
-
-    (0 until numWorkers).map { r =>
-      val rPrev = (r + numWorkers - 1) % numWorkers
-      val rNext = (r + 1) % numWorkers
-      sharedRing(r) -> (sharedRing(rPrev), sharedRing(rNext))
-    }.toMap
-  }
-
-  private[this] val treeMap_ = (0 until numWorkers).map { r => r -> getNeighbors(r) }.toMap
-  private[this] val parentMap_ = (0 until numWorkers).map{ r => r -> ((r + 1) / 2 - 1) }.toMap
-  private[this] val ringMap_ = constructRingMap(treeMap_, parentMap_)
-  val rMap_ = (0 until (numWorkers - 1)).foldLeft((Map(0 -> 0), 0)) {
-    case ((rmap, k), i) =>
-      val kNext = ringMap_(k)._2
-      (rmap ++ Map(kNext -> (i + 1)), kNext)
-  }._1
-
-  val ringMap = ringMap_.map {
-    case (k, (v0, v1)) => rMap_(k) -> (rMap_(v0), rMap_(v1))
-  }
-  val treeMap = treeMap_.map {
-    case (k, vSeq) => rMap_(k) -> vSeq.map{ v => rMap_(v) }
-  }
-  val parentMap = parentMap_.map {
-    case (k, v) if k == 0 =>
-      rMap_(k) -> -1
-    case (k, v) =>
-      rMap_(k) -> rMap_(v)
-  }
-
-  def assignRank(rank: Int): AssignedRank = {
-    AssignedRank(rank, treeMap(rank), ringMap(rank), parentMap(rank))
-  }
-}
diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/util/RabitTrackerHelpers.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/util/RabitTrackerHelpers.scala
deleted file mode 100644
index 3d7be618d..000000000
--- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/util/RabitTrackerHelpers.scala
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.rabit.util
-
-import java.nio.{ByteOrder, ByteBuffer}
-import akka.util.ByteString
-
-private[rabit] object RabitTrackerHelpers {
-  implicit class ByteStringHelplers(bs: ByteString) {
-    // Java by default uses big endian. Enforce native endian so that
-    // the byte order is consistent with the workers.
-    def asNativeOrderByteBuffer: ByteBuffer = {
-      bs.asByteBuffer.order(ByteOrder.nativeOrder())
-    }
-  }
-
-  implicit class ByteBufferHelpers(buf: ByteBuffer) {
-    def getString: String = {
-      val len = buf.getInt()
-      val stringBuffer = ByteBuffer.allocate(len).order(ByteOrder.nativeOrder())
-      buf.get(stringBuffer.array(), 0, len)
-      new String(stringBuffer.array(), "utf-8")
-    }
-  }
-}
diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java
index cce1254d0..20a243f5b 100644
--- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java
+++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java
@@ -30,8 +30,8 @@ import org.junit.Test;
  * @author hzx
  */
 public class BoosterImplTest {
-  private String train_uri = "../../demo/data/agaricus.txt.train?indexing_mode=1";
-  private String test_uri = "../../demo/data/agaricus.txt.test?indexing_mode=1";
+  private String train_uri = "../../demo/data/agaricus.txt.train?indexing_mode=1&format=libsvm";
+  private String test_uri = "../../demo/data/agaricus.txt.test?indexing_mode=1&format=libsvm";
 
   public static class EvalError implements IEvaluation {
     @Override
diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
index cf174c6dd..d658c5529 100644
--- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
+++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
@@ -4,7 +4,7 @@
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
-    
+
  http://www.apache.org/licenses/LICENSE-2.0
 
  Unless required by applicable law or agreed to in writing, software
@@ -88,7 +88,7 @@ public class DMatrixTest {
   public void testCreateFromFile() throws XGBoostError {
     //create DMatrix from file
     String filePath = writeResourceIntoTempFile("/agaricus.txt.test");
-    DMatrix dmat = new DMatrix(filePath);
+    DMatrix dmat = new DMatrix(filePath + "?format=libsvm");
     //get label
     float[] labels = dmat.getLabel();
     //check length
diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala
index 05200f49e..53325effa 100644
--- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala
+++ b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala
@@ -20,12 +20,12 @@ import java.util.Arrays
 
 import scala.util.Random
 
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
 
-class DMatrixSuite extends FunSuite {
+class DMatrixSuite extends AnyFunSuite {
   test("create DMatrix from File") {
-    val dmat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val dmat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     // get label
     val labels: Array[Float] = dmat.getLabel
     // check length
diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala
index 157971f82..2eda1fa2d 100644
--- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala
+++ b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala
@@ -20,11 +20,11 @@ import java.io.{FileOutputStream, FileInputStream, File}
 
 import junit.framework.TestCase
 import org.apache.commons.logging.LogFactory
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 
 import ml.dmlc.xgboost4j.java.XGBoostError
 
-class ScalaBoosterImplSuite extends FunSuite {
+class ScalaBoosterImplSuite extends AnyFunSuite {
 
   private class EvalError extends EvalTrait {
 
@@ -95,8 +95,8 @@ class ScalaBoosterImplSuite extends FunSuite {
   }
 
   test("basic operation of booster") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
 
     val booster = trainBooster(trainMat, testMat)
     val predicts = booster.predict(testMat, true)
@@ -106,8 +106,8 @@ class ScalaBoosterImplSuite extends FunSuite {
 
   test("save/load model with path") {
 
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val eval = new EvalError
     val booster = trainBooster(trainMat, testMat)
     // save and load
@@ -123,8 +123,8 @@ class ScalaBoosterImplSuite extends FunSuite {
   }
 
   test("save/load model with stream") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val eval = new EvalError
     val booster = trainBooster(trainMat, testMat)
     // save and load
@@ -139,7 +139,7 @@ class ScalaBoosterImplSuite extends FunSuite {
   }
 
   test("cross validation") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
     val params = List("eta" -> "1.0", "max_depth" -> "3", "silent" -> "1", "nthread" -> "6",
       "objective" -> "binary:logistic", "gamma" -> "1.0", "eval_metric" -> "error").toMap
     val round = 2
@@ -148,8 +148,8 @@ class ScalaBoosterImplSuite extends FunSuite {
   }
 
   test("test with quantile histo depthwise") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val paramMap = List("max_depth" -> "3", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "depthwise", "eval_metric" -> "auc").toMap
@@ -158,8 +158,8 @@ class ScalaBoosterImplSuite extends FunSuite {
   }
 
   test("test with quantile histo lossguide") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val paramMap = List("max_depth" -> "3", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "lossguide", "max_leaves" -> "8", "eval_metric" -> "auc").toMap
@@ -168,8 +168,8 @@ class ScalaBoosterImplSuite extends FunSuite {
   }
 
   test("test with quantile histo lossguide with max bin") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val paramMap = List("max_depth" -> "3", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "lossguide", "max_leaves" -> "8", "max_bin" -> "16",
@@ -179,8 +179,8 @@ class ScalaBoosterImplSuite extends FunSuite {
   }
 
   test("test with quantile histo depthwidth with max depth") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val paramMap = List("max_depth" -> "0", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "depthwise", "max_leaves" -> "8", "max_depth" -> "2",
@@ -190,8 +190,8 @@ class ScalaBoosterImplSuite extends FunSuite {
   }
 
   test("test with quantile histo depthwidth with max depth and max bin") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val paramMap = List("max_depth" -> "0", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2",
@@ -201,7 +201,7 @@ class ScalaBoosterImplSuite extends FunSuite {
   }
 
   test("test training from existing model in scala") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
     val paramMap = List("max_depth" -> "0", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2",
@@ -213,8 +213,8 @@ class ScalaBoosterImplSuite extends FunSuite {
   }
 
   test("test getting number of features from a booster") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val booster = trainBooster(trainMat, testMat)
 
     TestCase.assertEquals(booster.getNumFeature, 127)
diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/rabit/RabitTrackerConnectionHandlerTest.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/rabit/RabitTrackerConnectionHandlerTest.scala
deleted file mode 100644
index cd9016812..000000000
--- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/rabit/RabitTrackerConnectionHandlerTest.scala
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.rabit
-
-import java.nio.{ByteBuffer, ByteOrder}
-
-import akka.actor.{ActorRef, ActorSystem}
-import akka.io.Tcp
-import akka.testkit.{ImplicitSender, TestFSMRef, TestKit, TestProbe}
-import akka.util.ByteString
-import ml.dmlc.xgboost4j.scala.rabit.handler.RabitWorkerHandler
-import ml.dmlc.xgboost4j.scala.rabit.handler.RabitWorkerHandler._
-import ml.dmlc.xgboost4j.scala.rabit.util.LinkMap
-import org.junit.runner.RunWith
-import org.scalatest.junit.JUnitRunner
-import org.scalatest.{FlatSpecLike, Matchers}
-
-import scala.concurrent.Promise
-
-object RabitTrackerConnectionHandlerTest {
-  def intSeqToByteString(seq: Seq[Int]): ByteString = {
-    val buf = ByteBuffer.allocate(seq.length * 4).order(ByteOrder.nativeOrder())
-    seq.foreach { i => buf.putInt(i) }
-    buf.flip()
-    ByteString.fromByteBuffer(buf)
-  }
-}
-
-@RunWith(classOf[JUnitRunner])
-class RabitTrackerConnectionHandlerTest
-  extends TestKit(ActorSystem("RabitTrackerConnectionHandlerTest"))
-    with FlatSpecLike with Matchers with ImplicitSender {
-
-  import RabitTrackerConnectionHandlerTest._
-
-  val magic = intSeqToByteString(List(0xff99))
-
-  "RabitTrackerConnectionHandler" should "handle Rabit client 'start' command properly" in {
-    val trackerProbe = TestProbe()
-    val connProbe = TestProbe()
-
-    val worldSize = 4
-
-    val fsm = TestFSMRef(new RabitWorkerHandler("localhost", worldSize,
-      trackerProbe.ref, connProbe.ref))
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingHandshake
-
-    // send mock magic number
-    fsm ! Tcp.Received(magic)
-    connProbe.expectMsg(Tcp.Write(magic))
-
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingCommand
-    fsm.stateData shouldEqual RabitWorkerHandler.StructTrackerCommand
-    // ResumeReading should be seen once state transitions
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    // send mock tracker command in fragments: the handler should be able to handle it.
-    val bufRank = ByteBuffer.allocate(8).order(ByteOrder.nativeOrder())
-    bufRank.putInt(0).putInt(worldSize).flip()
-
-    val bufJobId = ByteBuffer.allocate(5).order(ByteOrder.nativeOrder())
-    bufJobId.putInt(1).put(Array[Byte]('0')).flip()
-
-    val bufCmd = ByteBuffer.allocate(9).order(ByteOrder.nativeOrder())
-    bufCmd.putInt(5).put("start".getBytes()).flip()
-
-    fsm ! Tcp.Received(ByteString.fromByteBuffer(bufRank))
-    fsm ! Tcp.Received(ByteString.fromByteBuffer(bufJobId))
-
-    // the state should not change for incomplete command data.
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingCommand
-
-    // send the last fragment, and expect message at tracker actor.
-    fsm ! Tcp.Received(ByteString.fromByteBuffer(bufCmd))
-    trackerProbe.expectMsg(WorkerStart(0, worldSize, "0"))
-
-    val linkMap = new LinkMap(worldSize)
-    val assignedRank = linkMap.assignRank(0)
-    trackerProbe.reply(assignedRank)
-
-    connProbe.expectMsg(Tcp.Write(ByteString.fromByteBuffer(
-      assignedRank.toByteBuffer(worldSize)
-    )))
-
-    // reading should be suspended upon transitioning to BuildingLinkMap
-    connProbe.expectMsg(Tcp.SuspendReading)
-    // state should transition with according state data changes.
-    fsm.stateName shouldEqual RabitWorkerHandler.BuildingLinkMap
-    fsm.stateData shouldEqual RabitWorkerHandler.StructNodes
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    // since the connection handler in test has rank 0, it will not have any nodes to connect to.
-    fsm ! Tcp.Received(intSeqToByteString(List(0)))
-    trackerProbe.expectMsg(RequestAwaitConnWorkers(0, fsm.underlyingActor.getNeighboringWorkers))
-
-    // return mock response to the connection handler
-    val awaitConnPromise = Promise[AwaitingConnections]()
-    awaitConnPromise.success(AwaitingConnections(Map.empty[Int, ActorRef],
-      fsm.underlyingActor.getNeighboringWorkers.size
-    ))
-    fsm ! awaitConnPromise.future
-    connProbe.expectMsg(Tcp.Write(
-      intSeqToByteString(List(0, fsm.underlyingActor.getNeighboringWorkers.size))
-    ))
-    connProbe.expectMsg(Tcp.SuspendReading)
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingErrorCount
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    // send mock error count (0)
-    fsm ! Tcp.Received(intSeqToByteString(List(0)))
-
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingPortNumber
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    // simulate Tcp.PeerClosed event first, then Tcp.Received to test handling of async events.
-    fsm ! Tcp.PeerClosed
-    // state should not transition
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingPortNumber
-    fsm ! Tcp.Received(intSeqToByteString(List(32768)))
-
-    fsm.stateName shouldEqual RabitWorkerHandler.SetupComplete
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    trackerProbe.expectMsg(RabitWorkerHandler.WorkerStarted("localhost", 0, 2))
-
-    val handlerStopProbe = TestProbe()
-    handlerStopProbe watch fsm
-
-    // simulate connections from other workers by mocking ReduceWaitCount commands
-    fsm ! RabitWorkerHandler.ReduceWaitCount(1)
-    fsm.stateName shouldEqual RabitWorkerHandler.SetupComplete
-    fsm ! RabitWorkerHandler.ReduceWaitCount(1)
-    trackerProbe.expectMsg(RabitWorkerHandler.DropFromWaitingList(0))
-    handlerStopProbe.expectTerminated(fsm)
-
-    // all done.
-  }
-
-  it should "forward print command to tracker" in {
-    val trackerProbe = TestProbe()
-    val connProbe = TestProbe()
-
-    val fsm = TestFSMRef(new RabitWorkerHandler("localhost", 4,
-      trackerProbe.ref, connProbe.ref))
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingHandshake
-
-    fsm ! Tcp.Received(magic)
-    connProbe.expectMsg(Tcp.Write(magic))
-
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingCommand
-    fsm.stateData shouldEqual RabitWorkerHandler.StructTrackerCommand
-    // ResumeReading should be seen once state transitions
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    val printCmd = WorkerTrackerPrint(0, 4, "print", "hello world!")
-    fsm ! Tcp.Received(printCmd.encode)
-
-    trackerProbe.expectMsg(printCmd)
-  }
-
-  it should "handle fragmented print command without throwing exception" in {
-    val trackerProbe = TestProbe()
-    val connProbe = TestProbe()
-
-    val fsm = TestFSMRef(new RabitWorkerHandler("localhost", 4,
-      trackerProbe.ref, connProbe.ref))
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingHandshake
-
-    fsm ! Tcp.Received(magic)
-    connProbe.expectMsg(Tcp.Write(magic))
-
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingCommand
-    fsm.stateData shouldEqual RabitWorkerHandler.StructTrackerCommand
-    // ResumeReading should be seen once state transitions
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    val printCmd = WorkerTrackerPrint(0, 4, "0", "fragmented!")
-    // 4 (rank: Int) + 4 (worldSize: Int) + (4+1) (jobId: String) + (4+5) (command: String) = 22
-    val (partialMessage, remainder) = printCmd.encode.splitAt(22)
-
-    // make sure that the partialMessage in itself is a valid command
-    val partialMsgBuf = ByteBuffer.allocate(22).order(ByteOrder.nativeOrder())
-    partialMsgBuf.put(partialMessage.asByteBuffer)
-    RabitWorkerHandler.StructTrackerCommand.verify(partialMsgBuf) shouldBe true
-
-    fsm ! Tcp.Received(partialMessage)
-    fsm ! Tcp.Received(remainder)
-
-    trackerProbe.expectMsg(printCmd)
-  }
-
-  it should "handle spill-over Tcp data correctly between state transition" in {
-    val trackerProbe = TestProbe()
-    val connProbe = TestProbe()
-
-    val worldSize = 4
-
-    val fsm = TestFSMRef(new RabitWorkerHandler("localhost", worldSize,
-      trackerProbe.ref, connProbe.ref))
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingHandshake
-
-    // send mock magic number
-    fsm ! Tcp.Received(magic)
-    connProbe.expectMsg(Tcp.Write(magic))
-
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingCommand
-    fsm.stateData shouldEqual RabitWorkerHandler.StructTrackerCommand
-    // ResumeReading should be seen once state transitions
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    // send mock tracker command in fragments: the handler should be able to handle it.
-    val bufCmd = ByteBuffer.allocate(26).order(ByteOrder.nativeOrder())
-    bufCmd.putInt(0).putInt(worldSize).putInt(1).put(Array[Byte]('0'))
-      .putInt(5).put("start".getBytes())
-      // spilled-over data
-      .putInt(0).flip()
-
-    // send data with 4 extra bytes corresponding to the next state.
-    fsm ! Tcp.Received(ByteString.fromByteBuffer(bufCmd))
-
-    trackerProbe.expectMsg(WorkerStart(0, worldSize, "0"))
-
-    val linkMap = new LinkMap(worldSize)
-    val assignedRank = linkMap.assignRank(0)
-    trackerProbe.reply(assignedRank)
-
-    connProbe.expectMsg(Tcp.Write(ByteString.fromByteBuffer(
-      assignedRank.toByteBuffer(worldSize)
-    )))
-
-    // reading should be suspended upon transitioning to BuildingLinkMap
-    connProbe.expectMsg(Tcp.SuspendReading)
-    // state should transition with according state data changes.
-    fsm.stateName shouldEqual RabitWorkerHandler.BuildingLinkMap
-    fsm.stateData shouldEqual RabitWorkerHandler.StructNodes
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    // the handler should be able to handle spill-over data, and stash it until state transition.
-    trackerProbe.expectMsg(RequestAwaitConnWorkers(0, fsm.underlyingActor.getNeighboringWorkers))
-  }
-}
diff --git a/plugin/federated/README.md b/plugin/federated/README.md
index d83db6be1..631c44cee 100644
--- a/plugin/federated/README.md
+++ b/plugin/federated/README.md
@@ -19,7 +19,7 @@ cmake .. -GNinja \
  -DUSE_NCCL=ON
 ninja
 cd ../python-package
-pip install -e .  # or equivalently python setup.py develop
+pip install -e .
 ```
 If CMake fails to locate gRPC, you may need to pass `-DCMAKE_PREFIX_PATH=<grpc path>` to CMake.
 
diff --git a/python-package/MANIFEST.in b/python-package/MANIFEST.in
deleted file mode 100644
index 23f2684c2..000000000
--- a/python-package/MANIFEST.in
+++ /dev/null
@@ -1,56 +0,0 @@
-include README.rst
-include xgboost/LICENSE
-include xgboost/VERSION
-include xgboost/CMakeLists.txt
-
-include xgboost/py.typed
-recursive-include xgboost *.py
-recursive-include xgboost/cmake *
-exclude xgboost/cmake/RPackageInstall.cmake.in
-exclude xgboost/cmake/RPackageInstallTargetSetup.cmake
-exclude xgboost/cmake/Sanitizer.cmake
-exclude xgboost/cmake/modules/FindASan.cmake
-exclude xgboost/cmake/modules/FindLSan.cmake
-exclude xgboost/cmake/modules/FindLibR.cmake
-exclude xgboost/cmake/modules/FindTSan.cmake
-exclude xgboost/cmake/modules/FindUBSan.cmake
-recursive-include xgboost/include *
-recursive-include xgboost/plugin *
-recursive-include xgboost/src *
-
-recursive-include xgboost/gputreeshap/GPUTreeShap *
-
-include xgboost/rabit/CMakeLists.txt
-recursive-include xgboost/rabit/include *
-recursive-include xgboost/rabit/src *
-prune xgboost/rabit/doc
-prune xgboost/rabit/guide
-
-include xgboost/dmlc-core/CMakeLists.txt
-
-recursive-include xgboost/dmlc-core/cmake *
-exclude xgboost/dmlc-core/cmake/gtest_cmake.in
-exclude xgboost/dmlc-core/cmake/lint.cmake
-exclude xgboost/dmlc-core/cmake/Sanitizer.cmake
-exclude xgboost/dmlc-core/cmake/Modules/FindASan.cmake
-exclude xgboost/dmlc-core/cmake/Modules/FindLSan.cmake
-exclude xgboost/dmlc-core/cmake/Modules/FindTSan.cmake
-exclude xgboost/dmlc-core/cmake/Modules/FindUBSan.cmake
-
-recursive-include xgboost/dmlc-core/include *
-recursive-include xgboost/dmlc-core/include *
-recursive-include xgboost/dmlc-core/make *
-recursive-include xgboost/dmlc-core/src *
-include xgboost/dmlc-core/tracker/dmlc-submit
-recursive-include xgboost/dmlc-core/tracker/dmlc_tracker *.py
-include xgboost/dmlc-core/tracker/yarn/build.bat
-include xgboost/dmlc-core/tracker/yarn/build.sh
-include xgboost/dmlc-core/tracker/yarn/pom.xml
-recursive-include xgboost/dmlc-core/tracker/yarn/src *
-include xgboost/dmlc-core/windows/dmlc.sln
-include xgboost/dmlc-core/windows/dmlc/dmlc.vcxproj
-
-prune xgboost/dmlc-core/doc
-prune xgboost/dmlc-core/scripts/
-
-global-exclude *.py[oc]
diff --git a/python-package/hatch_build.py b/python-package/hatch_build.py
new file mode 100644
index 000000000..696787fa2
--- /dev/null
+++ b/python-package/hatch_build.py
@@ -0,0 +1,22 @@
+"""
+Custom hook to customize the behavior of Hatchling.
+Here, we customize the tag of the generated wheels.
+"""
+import sysconfig
+from typing import Any, Dict
+
+from hatchling.builders.hooks.plugin.interface import BuildHookInterface
+
+
+def get_tag() -> str:
+    """Return the wheel tag for this platform, with '-' and '.' mapped to '_'"""
+    platform_part = sysconfig.get_platform().translate(str.maketrans("-.", "__"))
+    return f"py3-none-{platform_part}"
+
+
+class CustomBuildHook(BuildHookInterface):
+    """Hatchling build hook that overrides the tag of the generated wheel"""
+
+    def initialize(self, version: str, build_data: Dict[str, Any]) -> None:
+        """This step occurs immediately before each build; it sets the wheel tag."""
+        build_data["tag"] = get_tag()
diff --git a/python-package/packager/__init__.py b/python-package/packager/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python-package/packager/build_config.py b/python-package/packager/build_config.py
new file mode 100644
index 000000000..290cf15db
--- /dev/null
+++ b/python-package/packager/build_config.py
@@ -0,0 +1,56 @@
+"""Build configuration"""
+import dataclasses
+from typing import Any, Dict, List, Optional
+
+
+@dataclasses.dataclass
+class BuildConfiguration:  # pylint: disable=R0902
+    """Configuration used when building libxgboost; every option is a boolean flag"""
+
+    # Whether to hide C++ symbols in libxgboost.so
+    hide_cxx_symbols: bool = True
+    # Whether to enable OpenMP
+    use_openmp: bool = True
+    # Whether to enable CUDA
+    use_cuda: bool = False
+    # Whether to enable NCCL
+    use_nccl: bool = False
+    # Whether to enable HDFS
+    use_hdfs: bool = False
+    # Whether to enable Azure Storage
+    use_azure: bool = False
+    # Whether to enable AWS S3
+    use_s3: bool = False
+    # Whether to enable the dense parser plugin
+    plugin_dense_parser: bool = False
+    # Special option: packaging-side switch; it is never forwarded to CMake
+    use_system_libxgboost: bool = False
+
+    def _set_config_setting(
+        self, config_settings: Dict[str, Any], field_name: str
+    ) -> None:
+        # Only declared fields are accepted; every value is parsed as a boolean flag
+        if field_name not in [x.name for x in dataclasses.fields(self)]:
+            raise ValueError(f"Field {field_name} is not a valid config_settings")
+        flag = config_settings[field_name].lower() in ["true", "1", "on"]
+        setattr(self, field_name, flag)
+
+    def update(self, config_settings: Optional[Dict[str, Any]]) -> None:
+        """Parse config_settings from Pip (or other PEP 517 frontend).
+
+        Only the supplied options are overridden; unknown names raise ValueError.
+        """
+        if config_settings is not None:
+            for field_name in config_settings:
+                self._set_config_setting(config_settings, field_name)
+
+    def get_cmake_args(self) -> List[str]:
+        """Convert build configuration to CMake args (one -D<NAME>=ON/OFF per field)"""
+        cmake_args = []
+        for field_name in [x.name for x in dataclasses.fields(self)]:
+            if field_name in ["use_system_libxgboost"]:  # not a CMake option
+                continue
+            cmake_option = field_name.upper()
+            cmake_value = "ON" if getattr(self, field_name) is True else "OFF"
+            cmake_args.append(f"-D{cmake_option}={cmake_value}")
+        return cmake_args
diff --git a/python-package/packager/nativelib.py b/python-package/packager/nativelib.py
new file mode 100644
index 000000000..f7f5b4e79
--- /dev/null
+++ b/python-package/packager/nativelib.py
@@ -0,0 +1,157 @@
+"""
+Functions for building libxgboost
+"""
+import logging
+import os
+import pathlib
+import shutil
+import subprocess
+import sys
+from platform import system
+from typing import Optional
+
+from .build_config import BuildConfiguration
+
+
+def _lib_name() -> str:
+    """Return the file name of the XGBoost shared library on this platform."""
+    platform_name = system()
+    if platform_name in ("Linux", "OS400") or platform_name.upper().endswith("BSD"):
+        return "libxgboost.so"
+    if platform_name == "Darwin":
+        return "libxgboost.dylib"
+    if platform_name == "Windows":
+        return "xgboost.dll"
+    # Any other platform has no known library file name
+    raise NotImplementedError(f"System {platform_name} not supported")
+
+
+def build_libxgboost(
+    cpp_src_dir: pathlib.Path,
+    build_dir: pathlib.Path,
+    build_config: BuildConfiguration,
+) -> pathlib.Path:
+    """Build libxgboost with CMake inside build_dir and return the path to the built library"""
+    logger = logging.getLogger("xgboost.packager.build_libxgboost")
+
+    if not cpp_src_dir.is_dir():
+        raise RuntimeError(f"Expected {cpp_src_dir} to be a directory")
+    logger.info(
+        "Building %s from the C++ source files in %s...", _lib_name(), str(cpp_src_dir)
+    )
+
+    def _build(*, generator: str) -> None:  # configure with CMake, then compile
+        cmake_cmd = [
+            "cmake",
+            str(cpp_src_dir),
+            generator,
+            "-DKEEP_BUILD_ARTIFACTS_IN_BINARY_DIR=ON",
+        ]
+        cmake_cmd.extend(build_config.get_cmake_args())
+
+        # Flag for cross-compiling for Apple Silicon
+        # We use environment variable because it's the only way to pass down custom flags
+        # through the cibuildwheel package, which calls `pip wheel` command.
+        if "CIBW_TARGET_OSX_ARM64" in os.environ:
+            cmake_cmd.append("-DCMAKE_OSX_ARCHITECTURES=arm64")
+
+        logger.info("CMake args: %s", str(cmake_cmd))
+        subprocess.check_call(cmake_cmd, cwd=build_dir)
+
+        if system() == "Windows":
+            subprocess.check_call(
+                ["cmake", "--build", ".", "--config", "Release"], cwd=build_dir
+            )
+        else:
+            nproc = os.cpu_count() or 1  # cpu_count() may return None
+            assert build_tool is not None
+            subprocess.check_call([build_tool, f"-j{nproc}"], cwd=build_dir)
+
+    if system() == "Windows":
+        supported_generators = (
+            "-GVisual Studio 17 2022",
+            "-GVisual Studio 16 2019",
+            "-GVisual Studio 15 2017",
+            "-GMinGW Makefiles",
+        )
+        for generator in supported_generators:
+            try:
+                _build(generator=generator)
+                logger.info(
+                    "Successfully built %s using generator %s", _lib_name(), generator
+                )
+                break
+            except subprocess.CalledProcessError as e:
+                logger.info(
+                    "Tried building with generator %s but failed with exception %s",
+                    generator,
+                    str(e),
+                )
+                # Empty build directory
+                shutil.rmtree(build_dir)
+                build_dir.mkdir()
+        else:  # for-else: reached only when no generator succeeded
+            raise RuntimeError(
+                "None of the supported generators produced a successful build! "
+                f"Supported generators: {supported_generators}"
+            )
+    else:
+        build_tool = "ninja" if shutil.which("ninja") else "make"
+        generator = "-GNinja" if build_tool == "ninja" else "-GUnix Makefiles"
+        try:
+            _build(generator=generator)
+        except subprocess.CalledProcessError as e:  # retry once with OpenMP off
+            logger.info("Failed to build with OpenMP. Exception: %s", str(e))
+            build_config.use_openmp = False
+            _build(generator=generator)
+
+    return build_dir / "lib" / _lib_name()
+
+
+def locate_local_libxgboost(
+    toplevel_dir: pathlib.Path,
+    logger: logging.Logger,
+) -> Optional[pathlib.Path]:
+    """
+    Return the libxgboost found in lib/ of toplevel_dir's parent, or None if absent.
+    """
+    candidate = toplevel_dir.parent / "lib" / _lib_name()
+    if not candidate.exists():
+        return None
+    logger.info("Found %s at %s", candidate.name, str(candidate.parent))
+    return candidate
+
+
+def locate_or_build_libxgboost(
+    toplevel_dir: pathlib.Path,
+    build_dir: pathlib.Path,
+    build_config: BuildConfiguration,
+) -> pathlib.Path:
+    """Locate a pre-built libxgboost; if none is found, build it from the C++ sources"""
+    logger = logging.getLogger("xgboost.packager.locate_or_build_libxgboost")
+
+    libxgboost = locate_local_libxgboost(toplevel_dir, logger=logger)
+    if libxgboost is not None:  # a local pre-built library is checked first
+        return libxgboost
+    if build_config.use_system_libxgboost:
+        # Use the libxgboost under <sys.prefix>/lib; fail if it is not there
+        sys_prefix = pathlib.Path(sys.prefix).absolute().resolve()
+        libxgboost = sys_prefix / "lib" / _lib_name()
+        if not libxgboost.exists():
+            raise RuntimeError(
+                f"use_system_libxgboost was specified but {_lib_name()} is "
+                f"not found in {libxgboost.parent}"
+            )
+
+        logger.info("Using system XGBoost: %s", str(libxgboost))
+        return libxgboost
+
+    if toplevel_dir.joinpath("cpp_src").exists():
+        # Source distribution; all C++ source files to be found in cpp_src/
+        cpp_src_dir = toplevel_dir.joinpath("cpp_src")
+    else:
+        # Probably running "pip install ." from python-package/
+        cpp_src_dir = toplevel_dir.parent
+        if not cpp_src_dir.joinpath("CMakeLists.txt").exists():
+            raise RuntimeError(f"Did not find CMakeLists.txt from {cpp_src_dir}")
+    return build_libxgboost(cpp_src_dir, build_dir=build_dir, build_config=build_config)
diff --git a/python-package/packager/pep517.py b/python-package/packager/pep517.py
new file mode 100644
index 000000000..56583e117
--- /dev/null
+++ b/python-package/packager/pep517.py
@@ -0,0 +1,157 @@
+"""
+Custom build backend for XGBoost Python package.
+Builds source distribution and binary wheels, following PEP 517 / PEP 660.
+Reuses components of Hatchling (https://github.com/pypa/hatch/tree/master/backend) for the sake
+of brevity.
+"""
+import dataclasses
+import logging
+import os
+import pathlib
+import tempfile
+from contextlib import contextmanager
+from typing import Any, Dict, Iterator, Optional, Union
+
+import hatchling.build
+
+from .build_config import BuildConfiguration
+from .nativelib import locate_local_libxgboost, locate_or_build_libxgboost
+from .sdist import copy_cpp_src_tree
+from .util import copy_with_logging, copytree_with_logging
+
+
+@contextmanager
+def cd(path: Union[str, pathlib.Path]) -> Iterator[str]:  # pylint: disable=C0103
+    """
+    Run the enclosed block with `path` as the working directory, then switch back.
+    TODO(hcho3): Remove this once we adopt Python 3.11, which implements contextlib.chdir.
+    """
+    # The resolved (symlink-free) location is what gets yielded to the caller
+    target_dir = os.path.realpath(str(path))
+    original_dir = os.getcwd()
+    os.chdir(target_dir)
+    try:
+        yield target_dir
+    finally:
+        os.chdir(original_dir)
+
+
+TOPLEVEL_DIR = pathlib.Path(__file__).parent.parent.absolute().resolve()
+logging.basicConfig(level=logging.INFO)
+
+
+# Aliases
+get_requires_for_build_sdist = hatchling.build.get_requires_for_build_sdist
+get_requires_for_build_wheel = hatchling.build.get_requires_for_build_wheel
+get_requires_for_build_editable = hatchling.build.get_requires_for_build_editable
+
+
+def build_wheel(
+    wheel_directory: str,
+    config_settings: Optional[Dict[str, Any]] = None,
+    metadata_directory: Optional[str] = None,
+) -> str:
+    """Build a wheel from a temporary workspace holding the Python package and libxgboost"""
+    logger = logging.getLogger("xgboost.packager.build_wheel")
+
+    build_config = BuildConfiguration()
+    build_config.update(config_settings)
+    logger.info("Parsed build configuration: %s", dataclasses.asdict(build_config))
+
+    # Create tempdir with Python package + libxgboost
+    with tempfile.TemporaryDirectory() as td:
+        td_path = pathlib.Path(td)
+        build_dir = td_path / "libbuild"
+        build_dir.mkdir()
+
+        workspace = td_path / "whl_workspace"
+        workspace.mkdir()
+        logger.info("Copying project files to temporary directory %s", str(workspace))
+
+        copy_with_logging(TOPLEVEL_DIR / "pyproject.toml", workspace, logger=logger)
+        copy_with_logging(TOPLEVEL_DIR / "hatch_build.py", workspace, logger=logger)
+        copy_with_logging(TOPLEVEL_DIR / "README.rst", workspace, logger=logger)
+
+        pkg_path = workspace / "xgboost"
+        copytree_with_logging(TOPLEVEL_DIR / "xgboost", pkg_path, logger=logger)
+        lib_path = pkg_path / "lib"
+        lib_path.mkdir()
+        libxgboost = locate_or_build_libxgboost(
+            TOPLEVEL_DIR, build_dir=build_dir, build_config=build_config
+        )
+        copy_with_logging(libxgboost, lib_path, logger=logger)
+
+        with cd(workspace):  # presumably Hatchling builds from the working directory
+            wheel_name = hatchling.build.build_wheel(
+                wheel_directory, config_settings, metadata_directory
+            )
+    return wheel_name
+
+
+def build_sdist(
+    sdist_directory: str,
+    config_settings: Optional[Dict[str, Any]] = None,
+) -> str:
+    """Build a source distribution bundling the Python package and the C++ sources"""
+    logger = logging.getLogger("xgboost.packager.build_sdist")
+
+    if config_settings:
+        raise NotImplementedError(
+            "XGBoost's custom build backend doesn't support config_settings option "
+            f"when building sdist. {config_settings=}"
+        )
+
+    cpp_src_dir = TOPLEVEL_DIR.parent
+    if not cpp_src_dir.joinpath("CMakeLists.txt").exists():
+        raise RuntimeError(f"Did not find CMakeLists.txt from {cpp_src_dir}")
+
+    # Create tempdir with Python package + C++ sources
+    with tempfile.TemporaryDirectory() as td:
+        td_path = pathlib.Path(td)
+
+        workspace = td_path / "sdist_workspace"
+        workspace.mkdir()
+        logger.info("Copying project files to temporary directory %s", str(workspace))
+
+        copy_with_logging(TOPLEVEL_DIR / "pyproject.toml", workspace, logger=logger)
+        copy_with_logging(TOPLEVEL_DIR / "hatch_build.py", workspace, logger=logger)
+        copy_with_logging(TOPLEVEL_DIR / "README.rst", workspace, logger=logger)
+
+        copytree_with_logging(
+            TOPLEVEL_DIR / "xgboost", workspace / "xgboost", logger=logger
+        )
+        copytree_with_logging(
+            TOPLEVEL_DIR / "packager", workspace / "packager", logger=logger
+        )
+
+        temp_cpp_src_dir = workspace / "cpp_src"
+        copy_cpp_src_tree(cpp_src_dir, target_dir=temp_cpp_src_dir, logger=logger)
+
+        with cd(workspace):  # presumably Hatchling builds from the working directory
+            sdist_name = hatchling.build.build_sdist(sdist_directory, config_settings)
+    return sdist_name
+
+
+def build_editable(
+    wheel_directory: str,
+    config_settings: Optional[Dict[str, Any]] = None,
+    metadata_directory: Optional[str] = None,
+) -> str:
+    """Build an editable install via Hatchling; requires a pre-built local libxgboost."""
+    logger = logging.getLogger("xgboost.packager.build_editable")
+
+    if config_settings:
+        raise NotImplementedError(
+            "XGBoost's custom build backend doesn't support config_settings option "
+            f"when building editable installation. {config_settings=}"
+        )
+
+    if locate_local_libxgboost(TOPLEVEL_DIR, logger=logger) is None:
+        raise RuntimeError(
+            "To use the editable installation, first build libxgboost with CMake. "
+            "See https://xgboost.readthedocs.io/en/latest/build.html for detailed instructions."
+        )
+
+    return hatchling.build.build_editable(
+        wheel_directory, config_settings, metadata_directory
+    )
diff --git a/python-package/packager/sdist.py b/python-package/packager/sdist.py
new file mode 100644
index 000000000..af9fbca0d
--- /dev/null
+++ b/python-package/packager/sdist.py
@@ -0,0 +1,27 @@
+"""
+Functions for building sdist
+"""
+import logging
+import pathlib
+
+from .util import copy_with_logging, copytree_with_logging
+
+
+def copy_cpp_src_tree(
+    cpp_src_dir: pathlib.Path, target_dir: pathlib.Path, logger: logging.Logger
+) -> None:
+    """Copy the listed C++ source directories and top-level files into target_dir"""
+    source_subdirs = (
+        "src",
+        "include",
+        "dmlc-core",
+        "gputreeshap",
+        "rabit",
+        "cmake",
+        "plugin",
+    )
+    for name in source_subdirs:
+        copytree_with_logging(cpp_src_dir / name, target_dir / name, logger=logger)
+
+    for toplevel_file in ("CMakeLists.txt", "LICENSE"):
+        copy_with_logging(cpp_src_dir / toplevel_file, target_dir, logger=logger)
diff --git a/python-package/packager/util.py b/python-package/packager/util.py
new file mode 100644
index 000000000..0fff062d7
--- /dev/null
+++ b/python-package/packager/util.py
@@ -0,0 +1,25 @@
+"""
+Utility functions for implementing PEP 517 backend
+"""
+import logging
+import pathlib
+import shutil
+
+
+def copytree_with_logging(
+    src: pathlib.Path, dest: pathlib.Path, logger: logging.Logger
+) -> None:
+    """Log the copy at INFO level, then call shutil.copytree(src, dest)"""
+    logger.info("Copying %s -> %s", str(src), str(dest))
+    shutil.copytree(src, dest)
+
+
+def copy_with_logging(
+    src: pathlib.Path, dest: pathlib.Path, logger: logging.Logger
+) -> None:
+    """Copy one file with shutil.copy(), logging the path the file will end up at"""
+    final_path = dest
+    if dest.is_dir():
+        final_path = dest / src.name
+    logger.info("Copying %s -> %s", str(src), str(final_path))
+    shutil.copy(src, dest)
diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml
new file mode 100644
index 000000000..8f120df5d
--- /dev/null
+++ b/python-package/pyproject.toml
@@ -0,0 +1,42 @@
+[build-system]
+requires = [
+    "hatchling>=1.12.1"
+]
+backend-path = ["."]
+build-backend = "packager.pep517"
+
+[project]
+name = "xgboost"
+version = "2.0.0-dev"
+authors = [
+    {name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu"},
+    {name = "Jiaming Yuan", email = "jm.yuan@outlook.com"}
+]
+description = "XGBoost Python Package"
+readme = {file = "README.rst", content-type = "text/x-rst"}
+requires-python = ">=3.8"
+license = {text = "Apache-2.0"}
+classifiers = [
+    "License :: OSI Approved :: Apache Software License",
+    "Development Status :: 5 - Production/Stable",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10"
+]
+dependencies = [
+    "numpy",
+    "scipy"
+]
+
+[project.optional-dependencies]
+pandas = ["pandas"]
+scikit-learn = ["scikit-learn"]
+dask = ["dask", "pandas", "distributed"]
+datatable = ["datatable"]
+plotting = ["graphviz", "matplotlib"]
+pyspark = ["pyspark", "scikit-learn", "cloudpickle"]
+
+[tool.hatch.build.targets.wheel.hooks.custom]
diff --git a/python-package/xgboost/config.py b/python-package/xgboost/config.py
index c08a13150..1691d473f 100644
--- a/python-package/xgboost/config.py
+++ b/python-package/xgboost/config.py
@@ -16,7 +16,7 @@ def config_doc(
     extra_note: Optional[str] = None,
     parameters: Optional[str] = None,
     returns: Optional[str] = None,
-    see_also: Optional[str] = None
+    see_also: Optional[str] = None,
 ) -> Callable[[_F], _F]:
     """Decorator to format docstring for config functions.
 
diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py
index 88bd1c819..35c5c009f 100644
--- a/python-package/xgboost/dask.py
+++ b/python-package/xgboost/dask.py
@@ -73,6 +73,7 @@ from .core import (
     _deprecate_positional_args,
     _expect,
 )
+from .data import _is_cudf_ser, _is_cupy_array
 from .sklearn import (
     XGBClassifier,
     XGBClassifierBase,
@@ -1894,10 +1895,15 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBa
         )
 
         # pylint: disable=attribute-defined-outside-init
-        if isinstance(y, (da.Array)):
+        if isinstance(y, da.Array):
             self.classes_ = await self.client.compute(da.unique(y))
         else:
             self.classes_ = await self.client.compute(y.drop_duplicates())
+        if _is_cudf_ser(self.classes_):
+            self.classes_ = self.classes_.to_cupy()
+        if _is_cupy_array(self.classes_):
+            self.classes_ = self.classes_.get()
+        self.classes_ = numpy.array(self.classes_)
         self.n_classes_ = len(self.classes_)
 
         if self.n_classes_ > 2:
diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py
index 71058e8c9..d9eb14d0f 100644
--- a/python-package/xgboost/plotting.py
+++ b/python-package/xgboost/plotting.py
@@ -30,7 +30,7 @@ def plot_importance(
     grid: bool = True,
     show_values: bool = True,
     values_format: str = "{v}",
-    **kwargs: Any
+    **kwargs: Any,
 ) -> Axes:
     """Plot importance based on fitted trees.
 
@@ -155,7 +155,7 @@ def to_graphviz(
     no_color: Optional[str] = None,
     condition_node_params: Optional[dict] = None,
     leaf_node_params: Optional[dict] = None,
-    **kwargs: Any
+    **kwargs: Any,
 ) -> GraphvizSource:
     """Convert specified tree to graphviz instance. IPython can automatically plot
     the returned graphviz instance. Otherwise, you should call .render() method
@@ -250,7 +250,7 @@ def plot_tree(
     num_trees: int = 0,
     rankdir: Optional[str] = None,
     ax: Optional[Axes] = None,
-    **kwargs: Any
+    **kwargs: Any,
 ) -> Axes:
     """Plot specified tree.
 
diff --git a/python-package/xgboost/spark/data.py b/python-package/xgboost/spark/data.py
index f2c5e1197..8f84459d7 100644
--- a/python-package/xgboost/spark/data.py
+++ b/python-package/xgboost/spark/data.py
@@ -219,7 +219,9 @@ def create_dmatrix_from_partitions(  # pylint: disable=too-many-arguments
                 array: Optional[np.ndarray] = part[feature_cols]
             elif part[name].shape[0] > 0:
                 array = part[name]
-                array = stack_series(array)
+                if name == alias.data:
+                    # For the array/vector typed case.
+                    array = stack_series(array)
             else:
                 array = None
 
diff --git a/python-package/xgboost/spark/params.py b/python-package/xgboost/spark/params.py
index 78a35eee0..7c3231431 100644
--- a/python-package/xgboost/spark/params.py
+++ b/python-package/xgboost/spark/params.py
@@ -1,4 +1,6 @@
 """Xgboost pyspark integration submodule for params."""
+from typing import Dict
+
 # pylint: disable=too-few-public-methods
 from pyspark.ml.param import TypeConverters
 from pyspark.ml.param.shared import Param, Params
@@ -11,7 +13,7 @@ class HasArbitraryParamsDict(Params):
     input.
     """
 
-    arbitrary_params_dict: Param[dict] = Param(
+    arbitrary_params_dict: "Param[Dict]" = Param(
         Params._dummy(),
         "arbitrary_params_dict",
         "arbitrary_params_dict This parameter holds all of the additional parameters which are "
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index 20a4c681e..7bf3cf45b 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -317,13 +317,15 @@ class TestDataset:
             enable_categorical=True,
         )
 
-    def get_device_dmat(self) -> xgb.QuantileDMatrix:
+    def get_device_dmat(self, max_bin: Optional[int]) -> xgb.QuantileDMatrix:
         import cupy as cp
 
         w = None if self.w is None else cp.array(self.w)
         X = cp.array(self.X, dtype=np.float32)
         y = cp.array(self.y, dtype=np.float32)
-        return xgb.QuantileDMatrix(X, y, weight=w, base_margin=self.margin)
+        return xgb.QuantileDMatrix(
+            X, y, weight=w, base_margin=self.margin, max_bin=max_bin
+        )
 
     def get_external_dmat(self) -> xgb.DMatrix:
         n_samples = self.X.shape[0]
@@ -431,8 +433,11 @@ def make_ltr(
     """Make a dataset for testing LTR."""
     rng = np.random.default_rng(1994)
     X = rng.normal(0, 1.0, size=n_samples * n_features).reshape(n_samples, n_features)
-    y = rng.integers(0, max_rel, size=n_samples)
-    qid = rng.integers(0, n_query_groups, size=n_samples)
+    y = np.sum(X, axis=1)
+    y -= y.min()
+    y = np.round(y / y.max() * max_rel).astype(np.int32)
+
+    qid = rng.integers(0, n_query_groups, size=n_samples, dtype=np.int32)
     w = rng.normal(0, 1.0, size=n_query_groups)
     w -= np.min(w)
     w /= np.max(w)
@@ -879,5 +884,12 @@ def data_dir(path: str) -> str:
     return os.path.join(demo_dir(path), "data")
 
 
+def load_agaricus(path: str) -> Tuple[xgb.DMatrix, xgb.DMatrix]:
+    dpath = data_dir(path)
+    dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train?format=libsvm"))
+    dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test?format=libsvm"))
+    return dtrain, dtest
+
+
 def project_root(path: str) -> str:
     return normpath(os.path.join(demo_dir(path), os.path.pardir))
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 74a0107e1..a09a5499c 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -3,30 +3,50 @@
  */
 #include "xgboost/c_api.h"
 
-#include <rabit/c_api.h>
+#include <algorithm>                         // for copy
+#include <cinttypes>                         // for strtoimax
+#include <cmath>                             // for nan
+#include <cstring>                           // for strcmp
+#include <fstream>                           // for operator<<, basic_ostream, ios, stringstream
+#include <functional>                        // for less
+#include <limits>                            // for numeric_limits
+#include <map>                               // for operator!=, _Rb_tree_const_iterator, _Rb_tre...
+#include <memory>                            // for shared_ptr, allocator, __shared_ptr_access
+#include <string>                            // for char_traits, basic_string, operator==, string
+#include <system_error>                      // for errc
+#include <utility>                           // for pair
+#include <vector>                            // for vector
 
-#include <cstring>
-#include <fstream>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "../collective/communicator-inl.h"
-#include "../common/api_entry.h"  // XGBAPIThreadLocalEntry
-#include "../common/charconv.h"
-#include "../common/io.h"
-#include "../data/adapter.h"
-#include "../data/simple_dmatrix.h"
-#include "c_api_utils.h"
-#include "xgboost/base.h"
-#include "xgboost/data.h"
-#include "xgboost/global_config.h"
-#include "xgboost/host_device_vector.h"
-#include "xgboost/json.h"
-#include "xgboost/learner.h"
-#include "xgboost/logging.h"
-#include "xgboost/string_view.h"  // StringView
-#include "xgboost/version_config.h"
+#include "../collective/communicator-inl.h"  // for Allreduce, Broadcast, Finalize, GetProcessor...
+#include "../common/api_entry.h"             // for XGBAPIThreadLocalEntry
+#include "../common/charconv.h"              // for from_chars, to_chars, NumericLimits, from_ch...
+#include "../common/io.h"                    // for FileExtension, LoadSequentialFile, MemoryBuf...
+#include "../common/threading_utils.h"       // for OmpGetNumThreads, ParallelFor
+#include "../data/adapter.h"                 // for ArrayAdapter, DenseAdapter, RecordBatchesIte...
+#include "../data/proxy_dmatrix.h"           // for DMatrixProxy
+#include "../data/simple_dmatrix.h"          // for SimpleDMatrix
+#include "c_api_error.h"                     // for xgboost_CHECK_C_ARG_PTR, API_END, API_BEGIN
+#include "c_api_utils.h"                     // for RequiredArg, OptionalArg, GetMissing, CastDM...
+#include "dmlc/base.h"                       // for BeginPtr, DMLC_ATTRIBUTE_UNUSED
+#include "dmlc/io.h"                         // for Stream
+#include "dmlc/parameter.h"                  // for FieldAccessEntry, FieldEntry, ParamManager
+#include "dmlc/thread_local.h"               // for ThreadLocalStore
+#include "rabit/c_api.h"                     // for RabitLinkTag
+#include "rabit/rabit.h"                     // for CheckPoint, LoadCheckPoint
+#include "xgboost/base.h"                    // for bst_ulong, bst_float, GradientPair, bst_feat...
+#include "xgboost/context.h"                 // for Context
+#include "xgboost/data.h"                    // for DMatrix, MetaInfo, DataType, ExtSparsePage
+#include "xgboost/feature_map.h"             // for FeatureMap
+#include "xgboost/global_config.h"           // for GlobalConfiguration, GlobalConfigThreadLocal...
+#include "xgboost/host_device_vector.h"      // for HostDeviceVector
+#include "xgboost/intrusive_ptr.h"           // for xgboost
+#include "xgboost/json.h"                    // for Json, get, Integer, IsA, Boolean, String
+#include "xgboost/learner.h"                 // for Learner, PredictionType
+#include "xgboost/logging.h"                 // for LOG_FATAL, LogMessageFatal, CHECK, LogCheck_EQ
+#include "xgboost/predictor.h"               // for PredictionCacheEntry
+#include "xgboost/span.h"                    // for Span
+#include "xgboost/string_view.h"             // for StringView, operator<<
+#include "xgboost/version_config.h"          // for XGBOOST_VER_MAJOR, XGBOOST_VER_MINOR, XGBOOS...
 
 #if defined(XGBOOST_USE_FEDERATED)
 #include "../../plugin/federated/federated_server.h"
@@ -343,10 +363,10 @@ XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHand
   API_END();
 }
 
-XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle* out) {
+XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle *out) {
   API_BEGIN();
   xgboost_CHECK_C_ARG_PTR(out);
-  *out = new std::shared_ptr<xgboost::DMatrix>(new xgboost::data::DMatrixProxy);;
+  *out = new std::shared_ptr<xgboost::DMatrix>(new xgboost::data::DMatrixProxy);
   API_END();
 }
 
@@ -748,7 +768,7 @@ XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config
 
   CHECK_LE(p_m->Info().num_col_, std::numeric_limits<unsigned>::max());
 
-  for (auto const &page : p_m->GetBatches<ExtSparsePage>()) {
+  for (auto const &page : p_m->GetBatches<ExtSparsePage>(p_m->Ctx(), BatchParam{})) {
     CHECK(page.page);
     auto const &h_offset = page.page->offset.ConstHostVector();
     std::copy(h_offset.cbegin(), h_offset.cend(), out_indptr);
diff --git a/src/collective/aggregator.h b/src/collective/aggregator.h
new file mode 100644
index 000000000..b33ca28ef
--- /dev/null
+++ b/src/collective/aggregator.h
@@ -0,0 +1,127 @@
+/**
+ * Copyright 2023 by XGBoost contributors
+ *
+ * Higher level functions built on top of the Communicator API, taking care of behavioral
+ * differences between row-split vs column-split distributed training, and horizontal vs vertical
+ * federated learning.
+ */
+#pragma once
+#include <xgboost/data.h>
+
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "communicator-inl.h"
+
+namespace xgboost {
+namespace collective {
+
+/**
+ * @brief Apply the given function where the labels are.
+ *
+ * Normally all the workers have access to the labels, so the function is just applied locally. In
+ * vertical federated learning, we assume labels are only available on worker 0, so the function is
+ * applied there, with the results broadcast to other workers.
+ *
+ * @tparam Function The function used to calculate the results.
+ *                  It is invoked with no arguments.
+ * @param info MetaInfo about the DMatrix.
+ * @param buffer The buffer storing the results.
+ * @param size The size of the buffer.
+ * @param function The function used to calculate the results.
+ */
+template <typename Function>
+void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&& function) {
+  if (info.IsVerticalFederated()) {
+    // We assume labels are only available on worker 0, so the calculation is done there and result
+    // broadcast to other workers.
+    std::string message;
+    if (collective::GetRank() == 0) {
+      try {
+        std::forward<Function>(function)();
+      } catch (dmlc::Error& e) {
+        message = e.what();  // keep the error text so every worker can fail together
+      }
+    }
+
+    collective::Broadcast(&message, 0);  // worker 0's error text; empty on success
+    if (message.empty()) {
+      collective::Broadcast(buffer, size, 0);
+    } else {
+      LOG(FATAL) << &message[0];
+    }
+  } else {
+    std::forward<Function>(function)();
+  }
+}
+
+/**
+ * @brief Find the global max of the given value across all workers.
+ *
+ * This only applies when the data is split row-wise (horizontally). When data is split
+ * column-wise (vertically), the local value is returned.
+ *
+ * @tparam T The type of the value.
+ * @param info MetaInfo about the DMatrix.
+ * @param value The input for finding the global max.
+ * @return The global max of the input.
+ */
+template <typename T>
+T GlobalMax(MetaInfo const& info, T value) {
+  // Without a row-wise split the local value is already the answer.
+  if (!info.IsRowSplit()) { return value; }
+  collective::Allreduce<collective::Operation::kMax>(&value, 1);
+  return value;
+}
+
+/**
+ * @brief Find the global sum of the given values across all workers.
+ *
+ * This only applies when the data is split row-wise (horizontally). When data is split
+ * column-wise (vertically), the original values are returned.
+ *
+ * @tparam T The type of the values.
+ * @param info MetaInfo about the DMatrix.
+ * @param values Pointer to the inputs to sum.
+ * @param size Number of values to sum.
+ */
+template <typename T>
+void GlobalSum(MetaInfo const& info, T* values, size_t size) {
+  // Without a row-wise split the values are left exactly as passed in.
+  if (!info.IsRowSplit()) { return; }
+  collective::Allreduce<collective::Operation::kSum>(values, size);
+}
+
+template <typename Container>
+void GlobalSum(MetaInfo const& info, Container* values) {
+  GlobalSum(info, values->data(), values->size());  // forward to the pointer/size overload
+}
+
+/**
+ * @brief Find the global ratio of the given two values across all workers.
+ *
+ * This only applies when the data is split row-wise (horizontally). When data is split
+ * column-wise (vertically), the local ratio is returned.
+ *
+ * @tparam T The type of the values.
+ * @param info MetaInfo about the DMatrix.
+ * @param dividend The dividend of the ratio.
+ * @param divisor The divisor of the ratio.
+ * @return The global ratio of the two inputs.
+ */
+template <typename T>
+T GlobalRatio(MetaInfo const& info, T dividend, T divisor) {
+  // A plain array keeps this header self-contained: no <array> or <tuple> is required.
+  T sums[2] = {dividend, divisor};
+  GlobalSum(info, sums, 2);  // sums[0] = total dividend, sums[1] = total divisor
+  // A non-positive total divisor has no meaningful ratio.
+  if (sums[1] <= 0) {
+    return std::numeric_limits<T>::quiet_NaN();
+  }
+  return sums[0] / sums[1];
+}
+
+}  // namespace collective
+}  // namespace xgboost
diff --git a/src/common/error_msg.h b/src/common/error_msg.h
index 3dbb7f52c..3f57a63a3 100644
--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@@ -24,5 +24,14 @@ constexpr StringView LabelScoreSize() {
 constexpr StringView InfInData() {
   return "Input data contains `inf` or a value too large, while `missing` is not set to `inf`";
 }
+
+constexpr StringView NoF128() {  // error text: no 128-bit float support on this platform
+  return "128-bit floating point is not supported on current platform.";
+}
+
+constexpr StringView InconsistentMaxBin() {  // error text: `max_bin` mismatch
+  return "Inconsistent `max_bin`. `max_bin` should be the same across different QuantileDMatrix, "
+         "and consistent with the Booster being trained.";
+}
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index a99ed4f10..c9b50792d 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -2,15 +2,18 @@
  * Copyright 2017-2023 by XGBoost Contributors
  * \file hist_util.cc
  */
+#include "hist_util.h"
+
 #include <dmlc/timer.h>
 
 #include <vector>
 
-#include "xgboost/base.h"
 #include "../common/common.h"
-#include "hist_util.h"
 #include "column_matrix.h"
 #include "quantile.h"
+#include "xgboost/base.h"
+#include "xgboost/context.h"  // Context
+#include "xgboost/data.h"     // SparsePage, SortedCSCPage
 
 #if defined(XGBOOST_MM_PREFETCH_PRESENT)
   #include <xmmintrin.h>
@@ -28,10 +31,11 @@ HistogramCuts::HistogramCuts() {
   cut_ptrs_.HostVector().emplace_back(0);
 }
 
-HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, bool use_sorted,
+HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins, bool use_sorted,
                               Span<float> const hessian) {
   HistogramCuts out;
-  auto const& info = m->Info();
+  auto const &info = m->Info();
+  auto n_threads = ctx->Threads();
   std::vector<bst_row_t> reduced(info.num_col_, 0);
   for (auto const &page : m->GetBatches<SparsePage>()) {
     auto const &entries_per_column =
@@ -44,21 +48,22 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
   }
 
   if (!use_sorted) {
-    HostSketchContainer container(max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
-                                  HostSketchContainer::UseGroup(info),
-                                  m->Info().IsColumnSplit(), n_threads);
-    for (auto const& page : m->GetBatches<SparsePage>()) {
+    HostSketchContainer container(ctx, max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
+                                  HostSketchContainer::UseGroup(info));
+    for (auto const &page : m->GetBatches<SparsePage>()) {
       container.PushRowPage(page, info, hessian);
     }
-    container.MakeCuts(&out);
+    container.MakeCuts(m->Info(), &out);
   } else {
-    SortedSketchContainer container{max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
-                                    HostSketchContainer::UseGroup(info),
-                                    m->Info().IsColumnSplit(), n_threads};
-    for (auto const& page : m->GetBatches<SortedCSCPage>()) {
+    SortedSketchContainer container{ctx,
+                                    max_bins,
+                                    m->Info().feature_types.ConstHostSpan(),
+                                    reduced,
+                                    HostSketchContainer::UseGroup(info)};
+    for (auto const &page : m->GetBatches<SortedCSCPage>(ctx)) {
       container.PushColPage(page, info, hessian);
     }
-    container.MakeCuts(&out);
+    container.MakeCuts(m->Info(), &out);
   }
 
   return out;
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index d95d405eb..6380952d7 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -170,7 +170,7 @@ class HistogramCuts {
  * \param use_sorted Whether should we use SortedCSC for sketching, it's more efficient
  *                   but consumes more memory.
  */
-HistogramCuts SketchOnDMatrix(DMatrix* m, int32_t max_bins, int32_t n_threads,
+HistogramCuts SketchOnDMatrix(Context const* ctx, DMatrix* m, bst_bin_t max_bins,
                               bool use_sorted = false, Span<float> const hessian = {});
 
 enum BinTypeSize : uint8_t {
diff --git a/src/common/math.h b/src/common/math.h
index 62c609f0b..9987c4ebb 100644
--- a/src/common/math.h
+++ b/src/common/math.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2015 by Contributors
+/**
+ * Copyright 2015-2023 by XGBoost Contributors
  * \file math.h
  * \brief additional math utils
  * \author Tianqi Chen
@@ -7,16 +7,19 @@
 #ifndef XGBOOST_COMMON_MATH_H_
 #define XGBOOST_COMMON_MATH_H_
 
-#include <xgboost/base.h>
+#include <xgboost/base.h>  // for XGBOOST_DEVICE
 
-#include <algorithm>
-#include <cmath>
-#include <limits>
-#include <utility>
-#include <vector>
+#include <algorithm>    // for max
+#include <cmath>        // for exp, abs, log, lgamma
+#include <limits>       // for numeric_limits
+#include <type_traits>  // for is_floating_point, conditional, is_signed, is_same, declval, enable_if
+#include <utility>      // for pair
 
 namespace xgboost {
 namespace common {
+
+template <typename T> XGBOOST_DEVICE T Sqr(T const &w) { return w * w; }  // square of w
+
 /*!
  * \brief calculate the sigmoid of the input.
  * \param x input parameter
@@ -30,9 +33,11 @@ XGBOOST_DEVICE inline float Sigmoid(float x) {
   return y;
 }
 
-template <typename T>
-XGBOOST_DEVICE inline static T Sqr(T a) { return a * a; }
-
+XGBOOST_DEVICE inline double Sigmoid(double x) {
+  // Logistic function 1 / (1 + e^-x), evaluated in double precision.
+  double const exp_neg_x = std::exp(-x);
+  return 1.0 / (exp_neg_x + 1.0);
+}
 /*!
  * \brief Equality test for both integer and floating point.
  */
@@ -134,10 +139,6 @@ inline static bool CmpFirst(const std::pair<float, unsigned> &a,
                             const std::pair<float, unsigned> &b) {
   return a.first > b.first;
 }
-inline static bool CmpSecond(const std::pair<float, unsigned> &a,
-                             const std::pair<float, unsigned> &b) {
-  return a.second > b.second;
-}
 
 // Redefined here to workaround a VC bug that doesn't support overloading for integer
 // types.
diff --git a/src/common/quantile.cc b/src/common/quantile.cc
index aaf271934..a93184b95 100644
--- a/src/common/quantile.cc
+++ b/src/common/quantile.cc
@@ -6,6 +6,7 @@
 #include <limits>
 #include <utility>
 
+#include "../collective/aggregator.h"
 #include "../collective/communicator-inl.h"
 #include "../data/adapter.h"
 #include "categorical.h"
@@ -15,17 +16,16 @@ namespace xgboost {
 namespace common {
 
 template <typename WQSketch>
-SketchContainerImpl<WQSketch>::SketchContainerImpl(std::vector<bst_row_t> columns_size,
+SketchContainerImpl<WQSketch>::SketchContainerImpl(Context const *ctx,
+                                                   std::vector<bst_row_t> columns_size,
                                                    int32_t max_bins,
                                                    Span<FeatureType const> feature_types,
-                                                   bool use_group, bool col_split,
-                                                   int32_t n_threads)
+                                                   bool use_group)
     : feature_types_(feature_types.cbegin(), feature_types.cend()),
       columns_size_{std::move(columns_size)},
       max_bins_{max_bins},
       use_group_ind_{use_group},
-      col_split_{col_split},
-      n_threads_{n_threads} {
+      n_threads_{ctx->Threads()} {
   monitor_.Init(__func__);
   CHECK_NE(columns_size_.size(), 0);
   sketches_.resize(columns_size_.size());
@@ -202,10 +202,10 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
 }
 
 template <typename WQSketch>
-void SketchContainerImpl<WQSketch>::AllreduceCategories() {
+void SketchContainerImpl<WQSketch>::AllreduceCategories(MetaInfo const& info) {
   auto world_size = collective::GetWorldSize();
   auto rank = collective::GetRank();
-  if (world_size == 1 || col_split_) {
+  if (world_size == 1 || info.IsColumnSplit()) {
     return;
   }
 
@@ -273,6 +273,7 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories() {
 
 template <typename WQSketch>
 void SketchContainerImpl<WQSketch>::AllReduce(
+    MetaInfo const& info,
     std::vector<typename WQSketch::SummaryContainer> *p_reduced,
     std::vector<int32_t>* p_num_cuts) {
   monitor_.Start(__func__);
@@ -281,7 +282,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
   collective::Allreduce<collective::Operation::kMax>(&n_columns, 1);
   CHECK_EQ(n_columns, sketches_.size()) << "Number of columns differs across workers";
 
-  AllreduceCategories();
+  AllreduceCategories(info);
 
   auto& num_cuts = *p_num_cuts;
   CHECK_EQ(num_cuts.size(), 0);
@@ -292,10 +293,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
 
   // Prune the intermediate num cuts for synchronization.
   std::vector<bst_row_t> global_column_size(columns_size_);
-  if (!col_split_) {
-    collective::Allreduce<collective::Operation::kSum>(global_column_size.data(),
-                                                       global_column_size.size());
-  }
+  collective::GlobalSum(info, &global_column_size);
 
   ParallelFor(sketches_.size(), n_threads_, [&](size_t i) {
     int32_t intermediate_num_cuts = static_cast<int32_t>(
@@ -316,7 +314,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
   });
 
   auto world = collective::GetWorldSize();
-  if (world == 1 || col_split_) {
+  if (world == 1 || info.IsColumnSplit()) {
     monitor_.Stop(__func__);
     return;
   }
@@ -382,13 +380,13 @@ auto AddCategories(std::set<float> const &categories, HistogramCuts *cuts) {
 }
 
 template <typename WQSketch>
-void SketchContainerImpl<WQSketch>::MakeCuts(HistogramCuts* cuts) {
+void SketchContainerImpl<WQSketch>::MakeCuts(MetaInfo const &info, HistogramCuts *p_cuts) {
   monitor_.Start(__func__);
   std::vector<typename WQSketch::SummaryContainer> reduced;
   std::vector<int32_t> num_cuts;
-  this->AllReduce(&reduced, &num_cuts);
+  this->AllReduce(info, &reduced, &num_cuts);
 
-  cuts->min_vals_.HostVector().resize(sketches_.size(), 0.0f);
+  p_cuts->min_vals_.HostVector().resize(sketches_.size(), 0.0f);
   std::vector<typename WQSketch::SummaryContainer> final_summaries(reduced.size());
 
   ParallelFor(reduced.size(), n_threads_, Sched::Guided(), [&](size_t fidx) {
@@ -403,48 +401,48 @@ void SketchContainerImpl<WQSketch>::MakeCuts(HistogramCuts* cuts) {
       a.SetPrune(reduced[fidx], max_num_bins + 1);
       CHECK(a.data && reduced[fidx].data);
       const bst_float mval = a.data[0].value;
-      cuts->min_vals_.HostVector()[fidx] = mval - fabs(mval) - 1e-5f;
+      p_cuts->min_vals_.HostVector()[fidx] = mval - fabs(mval) - 1e-5f;
     } else {
       // Empty column.
       const float mval = 1e-5f;
-      cuts->min_vals_.HostVector()[fidx] = mval;
+      p_cuts->min_vals_.HostVector()[fidx] = mval;
     }
   });
 
   float max_cat{-1.f};
   for (size_t fid = 0; fid < reduced.size(); ++fid) {
     size_t max_num_bins = std::min(num_cuts[fid], max_bins_);
-    typename WQSketch::SummaryContainer const& a = final_summaries[fid];
+    typename WQSketch::SummaryContainer const &a = final_summaries[fid];
     if (IsCat(feature_types_, fid)) {
-      max_cat = std::max(max_cat, AddCategories(categories_.at(fid), cuts));
+      max_cat = std::max(max_cat, AddCategories(categories_.at(fid), p_cuts));
     } else {
-      AddCutPoint<WQSketch>(a, max_num_bins, cuts);
+      AddCutPoint<WQSketch>(a, max_num_bins, p_cuts);
       // push a value that is greater than anything
       const bst_float cpt =
-          (a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid];
+          (a.size > 0) ? a.data[a.size - 1].value : p_cuts->min_vals_.HostVector()[fid];
       // this must be bigger than last value in a scale
       const bst_float last = cpt + (fabs(cpt) + 1e-5f);
-      cuts->cut_values_.HostVector().push_back(last);
+      p_cuts->cut_values_.HostVector().push_back(last);
     }
 
     // Ensure that every feature gets at least one quantile point
-    CHECK_LE(cuts->cut_values_.HostVector().size(), std::numeric_limits<uint32_t>::max());
-    auto cut_size = static_cast<uint32_t>(cuts->cut_values_.HostVector().size());
-    CHECK_GT(cut_size, cuts->cut_ptrs_.HostVector().back());
-    cuts->cut_ptrs_.HostVector().push_back(cut_size);
+    CHECK_LE(p_cuts->cut_values_.HostVector().size(), std::numeric_limits<uint32_t>::max());
+    auto cut_size = static_cast<uint32_t>(p_cuts->cut_values_.HostVector().size());
+    CHECK_GT(cut_size, p_cuts->cut_ptrs_.HostVector().back());
+    p_cuts->cut_ptrs_.HostVector().push_back(cut_size);
   }
 
-  cuts->SetCategorical(this->has_categorical_, max_cat);
+  p_cuts->SetCategorical(this->has_categorical_, max_cat);
   monitor_.Stop(__func__);
 }
 
 template class SketchContainerImpl<WQuantileSketch<float, float>>;
 template class SketchContainerImpl<WXQuantileSketch<float, float>>;
 
-HostSketchContainer::HostSketchContainer(int32_t max_bins, common::Span<FeatureType const> ft,
-                                         std::vector<size_t> columns_size, bool use_group,
-                                         bool col_split, int32_t n_threads)
-    : SketchContainerImpl{columns_size, max_bins, ft, use_group, col_split, n_threads} {
+HostSketchContainer::HostSketchContainer(Context const *ctx, bst_bin_t max_bins,
+                                         common::Span<FeatureType const> ft,
+                                         std::vector<size_t> columns_size, bool use_group)
+    : SketchContainerImpl{ctx, columns_size, max_bins, ft, use_group} {
   monitor_.Init(__func__);
   ParallelFor(sketches_.size(), n_threads_, Sched::Auto(), [&](auto i) {
     auto n_bins = std::min(static_cast<size_t>(max_bins_), columns_size_[i]);
diff --git a/src/common/quantile.h b/src/common/quantile.h
index a19b4bbb0..0a82f7c90 100644
--- a/src/common/quantile.h
+++ b/src/common/quantile.h
@@ -789,7 +789,6 @@ class SketchContainerImpl {
   std::vector<bst_row_t> columns_size_;
   int32_t max_bins_;
   bool use_group_ind_{false};
-  bool col_split_;
   int32_t n_threads_;
   bool has_categorical_{false};
   Monitor monitor_;
@@ -801,9 +800,8 @@ class SketchContainerImpl {
    * \param max_bins maximum number of bins for each feature.
    * \param use_group whether is assigned to group to data instance.
    */
-  SketchContainerImpl(std::vector<bst_row_t> columns_size, int32_t max_bins,
-                      common::Span<FeatureType const> feature_types, bool use_group, bool col_split,
-                      int32_t n_threads);
+  SketchContainerImpl(Context const *ctx, std::vector<bst_row_t> columns_size, int32_t max_bins,
+                      common::Span<FeatureType const> feature_types, bool use_group);
 
   static bool UseGroup(MetaInfo const &info) {
     size_t const num_groups =
@@ -829,7 +827,7 @@ class SketchContainerImpl {
                         std::vector<bst_row_t> *p_sketches_scan,
                         std::vector<typename WQSketch::Entry> *p_global_sketches);
   // Merge sketches from all workers.
-  void AllReduce(std::vector<typename WQSketch::SummaryContainer> *p_reduced,
+  void AllReduce(MetaInfo const& info, std::vector<typename WQSketch::SummaryContainer> *p_reduced,
                  std::vector<int32_t> *p_num_cuts);
 
   template <typename Batch, typename IsValid>
@@ -883,11 +881,11 @@ class SketchContainerImpl {
   /* \brief Push a CSR matrix. */
   void PushRowPage(SparsePage const &page, MetaInfo const &info, Span<float const> hessian = {});
 
-  void MakeCuts(HistogramCuts* cuts);
+  void MakeCuts(MetaInfo const& info, HistogramCuts* cuts);
 
  private:
   // Merge all categories from other workers.
-  void AllreduceCategories();
+  void AllreduceCategories(MetaInfo const& info);
 };
 
 class HostSketchContainer : public SketchContainerImpl<WQuantileSketch<float, float>> {
@@ -895,9 +893,8 @@ class HostSketchContainer : public SketchContainerImpl<WQuantileSketch<float, fl
   using WQSketch = WQuantileSketch<float, float>;
 
  public:
-  HostSketchContainer(int32_t max_bins, common::Span<FeatureType const> ft,
-                      std::vector<size_t> columns_size, bool use_group, bool col_split,
-                      int32_t n_threads);
+  HostSketchContainer(Context const *ctx, bst_bin_t max_bins, common::Span<FeatureType const> ft,
+                      std::vector<size_t> columns_size, bool use_group);
 
   template <typename Batch>
   void PushAdapterBatch(Batch const &batch, size_t base_rowid, MetaInfo const &info, float missing);
@@ -992,10 +989,10 @@ class SortedSketchContainer : public SketchContainerImpl<WXQuantileSketch<float,
   using Super = SketchContainerImpl<WXQuantileSketch<float, float>>;
 
  public:
-  explicit SortedSketchContainer(int32_t max_bins, common::Span<FeatureType const> ft,
-                                 std::vector<size_t> columns_size, bool use_group, bool col_split,
-                                 int32_t n_threads)
-      : SketchContainerImpl{columns_size, max_bins, ft, use_group, col_split, n_threads} {
+  explicit SortedSketchContainer(Context const *ctx, int32_t max_bins,
+                                 common::Span<FeatureType const> ft,
+                                 std::vector<size_t> columns_size, bool use_group)
+      : SketchContainerImpl{ctx, columns_size, max_bins, ft, use_group} {
     monitor_.Init(__func__);
     sketches_.resize(columns_size.size());
     size_t i = 0;
diff --git a/src/common/ranking_utils.h b/src/common/ranking_utils.h
index bc071c2d6..dd823a0d6 100644
--- a/src/common/ranking_utils.h
+++ b/src/common/ranking_utils.h
@@ -70,7 +70,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
   // pairs
   // should be accessed by getter for auto configuration.
   // nolint so that we can keep the string name.
-  PairMethod lambdarank_pair_method{PairMethod::kMean};  // NOLINT
+  PairMethod lambdarank_pair_method{PairMethod::kTopK};  // NOLINT
   std::size_t lambdarank_num_pair_per_sample{NotSet()};  // NOLINT
 
  public:
@@ -78,7 +78,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
 
   // unbiased
   bool lambdarank_unbiased{false};
-  double lambdarank_bias_norm{2.0};
+  double lambdarank_bias_norm{1.0};
   // ndcg
   bool ndcg_exp_gain{true};
 
@@ -135,7 +135,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
         .set_default(false)
         .describe("Unbiased lambda mart. Use extended IPW to debias click position");
     DMLC_DECLARE_FIELD(lambdarank_bias_norm)
-        .set_default(2.0)
+        .set_default(1.0)
         .set_lower_bound(0.0)
         .describe("Lp regularization for unbiased lambdarank.");
     DMLC_DECLARE_FIELD(ndcg_exp_gain)
diff --git a/src/data/array_interface.h b/src/data/array_interface.h
index 2a078ed60..d62936e90 100644
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -7,8 +7,9 @@
 #define XGBOOST_DATA_ARRAY_INTERFACE_H_
 
 #include <algorithm>
-#include <cstddef>  // std::size_t
+#include <cstddef>  // for size_t
 #include <cstdint>
+#include <limits>  // for numeric_limits
 #include <map>
 #include <string>
 #include <type_traits>  // std::alignment_of,std::remove_pointer_t
@@ -17,6 +18,7 @@
 
 #include "../common/bitfield.h"
 #include "../common/common.h"
+#include "../common/error_msg.h"  // for NoF128
 #include "xgboost/base.h"
 #include "xgboost/data.h"
 #include "xgboost/json.h"
@@ -454,9 +456,8 @@ class ArrayInterface {
   void AssignType(StringView typestr) {
     using T = ArrayInterfaceHandler::Type;
     if (typestr.size() == 4 && typestr[1] == 'f' && typestr[2] == '1' && typestr[3] == '6') {
+      CHECK(sizeof(long double) == 16) << error::NoF128();
       type = T::kF16;
-      CHECK(sizeof(long double) == 16)
-          << "128-bit floating point is not supported on current platform.";
     } else if (typestr[1] == 'f' && typestr[2] == '2') {
 #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
       type = T::kF2;
@@ -572,19 +573,103 @@ class ArrayInterface {
   // Used only by columnar format.
   RBitField8 valid;
   // Array stride
-  size_t strides[D]{0};
+  std::size_t strides[D]{0};
   // Array shape
-  size_t shape[D]{0};
+  std::size_t shape[D]{0};
   // Type earsed pointer referencing the data.
   void const *data{nullptr};
   // Total number of items
-  size_t n{0};
+  std::size_t n{0};
   // Whether the memory is c-contiguous
   bool is_contiguous{false};
   // RTTI, initialized to the f16 to avoid masking potential bugs in initialization.
   ArrayInterfaceHandler::Type type{ArrayInterfaceHandler::kF16};
 };
 
+/**
+ * \brief Dispatch a callable on the element type recorded in an array interface.
+ *
+ * The input is wrapped in a linalg::TensorView typed according to `array.type`, then
+ * `fn` is invoked with that view.
+ *
+ * \param array  Array interface to view; its validity mask must be empty.
+ * \param device Device ordinal forwarded to the tensor view.
+ * \param fn     Callable accepting a linalg::TensorView<T const, D> for any supported T.
+ */
+template <std::int32_t D, typename Fn>
+void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
+  // Only used for cuDF at the moment.
+  CHECK_EQ(array.valid.Size(), 0);
+  auto dispatch = [&](auto t) {
+    using T = std::remove_const_t<decltype(t)> const;
+    // Set the data size to max as we don't know the original size of a sliced array:
+    //
+    // Slicing an array A with shape (4, 2, 3) and stride (6, 3, 1) by [:, 1, :] results
+    // in an array B with shape (4, 3) and strides (6, 1). We can't calculate the original
+    // size 24 based on the slice.
+    fn(linalg::TensorView<T, D>{common::Span<T const>{static_cast<T *>(array.data),
+                                                      std::numeric_limits<std::size_t>::max()},
+                                array.shape, array.strides, device});
+  };
+  switch (array.type) {
+    case ArrayInterfaceHandler::kF2: {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+      dispatch(__half{});
+#else
+      // Fail loudly instead of silently skipping `fn` and leaving the output untouched.
+      LOG(FATAL) << "Half precision (f2) input is not supported in this build.";
+#endif
+      break;
+    }
+    case ArrayInterfaceHandler::kF4: {
+      dispatch(float{});
+      break;
+    }
+    case ArrayInterfaceHandler::kF8: {
+      dispatch(double{});
+      break;
+    }
+    case ArrayInterfaceHandler::kF16: {
+      using T = long double;
+      CHECK(sizeof(long double) == 16) << error::NoF128();
+      dispatch(T{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI1: {
+      dispatch(std::int8_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI2: {
+      dispatch(std::int16_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI4: {
+      dispatch(std::int32_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI8: {
+      dispatch(std::int64_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU1: {
+      dispatch(std::uint8_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU2: {
+      dispatch(std::uint16_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU4: {
+      dispatch(std::uint32_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU8: {
+      dispatch(std::uint64_t{});
+      break;
+    }
+  }
+}
+
 /**
  * \brief Helper for type casting.
  */
diff --git a/src/data/batch_utils.h b/src/data/batch_utils.h
new file mode 100644
index 000000000..f75d24ffd
--- /dev/null
+++ b/src/data/batch_utils.h
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#ifndef XGBOOST_DATA_BATCH_UTILS_H_
+#define XGBOOST_DATA_BATCH_UTILS_H_
+
+#include "xgboost/data.h"  // for BatchParam
+
+namespace xgboost::data::detail {
+// Check that at least one of the two batch parameters is initialized.
+inline void CheckEmpty(BatchParam const& l, BatchParam const& r) {
+  if (!l.Initialized()) {
+    CHECK(r.Initialized()) << "Batch parameter is not initialized.";
+  }
+}
+
+/**
+ * \brief Should we regenerate the gradient index?
+ *
+ * \param old Parameter stored in DMatrix.
+ * \param p   New parameter passed in by caller.
+ */
+inline bool RegenGHist(BatchParam old, BatchParam p) {
+  // Parameter is renewed or caller requests a regen
+  if (!p.Initialized()) {
+    // Empty parameter is passed in, don't regenerate so that we can use gindex in
+    // predictor, which doesn't have any training parameter.
+    return false;
+  }
+  return p.regen || old.ParamNotEqual(p);
+}
+}  // namespace xgboost::data::detail
+#endif  // XGBOOST_DATA_BATCH_UTILS_H_
diff --git a/src/data/data.cc b/src/data/data.cc
index 557c6b8bf..bd34309d6 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -427,10 +427,13 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<T
     return;
   }
   p_out->Reshape(array.shape);
-  auto t = p_out->View(Context::kCpuId);
-  CHECK(t.CContiguous());
-  linalg::ElementWiseTransformHost(t, ctx.Threads(), [&](auto i, auto) {
-    return linalg::detail::Apply(TypedIndex<T, D>{array}, linalg::UnravelIndex<D>(i, t.Shape()));
+  auto t_out = p_out->View(Context::kCpuId);
+  CHECK(t_out.CContiguous());
+  auto const shape = t_out.Shape();
+  DispatchDType(array, Context::kCpuId, [&](auto&& in) {
+    linalg::ElementWiseTransformHost(t_out, ctx.Threads(), [&](auto i, auto) {
+      return std::apply(in, linalg::UnravelIndex<D>(i, shape));
+    });
   });
 }
 }  // namespace
@@ -774,6 +777,10 @@ bool MetaInfo::IsVerticalFederated() const {
   return collective::IsFederated() && IsColumnSplit();
 }
 
+bool MetaInfo::ShouldHaveLabels() const {
+  return !IsVerticalFederated() || collective::GetRank() == 0;
+}
+
 using DMatrixThreadLocal =
     dmlc::ThreadLocalStore<std::map<DMatrix const *, XGBAPIThreadLocalEntry>>;
 
@@ -812,8 +819,7 @@ DMatrix *TryLoadBinary(std::string fname, bool silent) {
   return nullptr;
 }
 
-DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode,
-                       const std::string& file_format) {
+DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode) {
   auto need_split = false;
   if (collective::IsFederated()) {
     LOG(CONSOLE) << "XGBoost federated mode detected, not splitting data among workers";
@@ -855,11 +861,9 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
   }
 
   // legacy handling of binary data loading
-  if (file_format == "auto") {
-    DMatrix* loaded = TryLoadBinary(fname, silent);
-    if (loaded) {
-      return loaded;
-    }
+  DMatrix* loaded = TryLoadBinary(fname, silent);
+  if (loaded) {
+    return loaded;
   }
 
   int partid = 0, npart = 1;
@@ -875,47 +879,24 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
     LOG(CONSOLE) << "Load part of data " << partid << " of " << npart << " parts";
   }
 
+  data::ValidateFileFormat(fname);
   DMatrix* dmat {nullptr};
-  try {
-    if (cache_file.empty()) {
-      std::unique_ptr<dmlc::Parser<uint32_t>> parser(
-          dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, file_format.c_str()));
-      data::FileAdapter adapter(parser.get());
-      dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
-                             cache_file, data_split_mode);
-    } else {
-      data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart),
-                              file_format};
-      dmat = new data::SparsePageDMatrix{&iter,
-                                         iter.Proxy(),
-                                         data::fileiter::Reset,
-                                         data::fileiter::Next,
-                                         std::numeric_limits<float>::quiet_NaN(),
-                                         1,
-                                         cache_file};
-    }
-  } catch (dmlc::Error& e) {
-    std::vector<std::string> splited = common::Split(fname, '#');
-    std::vector<std::string> args = common::Split(splited.front(), '?');
-    std::string format {file_format};
-    if (args.size() == 1 && file_format == "auto") {
-      auto extension = common::Split(args.front(), '.').back();
-      if (extension == "csv" || extension == "libsvm") {
-        format = extension;
-      }
-      if (format == extension) {
-        LOG(WARNING)
-            << "No format parameter is provided in input uri, but found file extension: "
-            << format << " .  "
-            << "Consider providing a uri parameter: filename?format=" << format;
-      } else {
-        LOG(WARNING)
-            << "No format parameter is provided in input uri.  "
-            << "Choosing default parser in dmlc-core.  "
-            << "Consider providing a uri parameter like: filename?format=csv";
-      }
-    }
-    LOG(FATAL) << "Encountered parser error:\n" << e.what();
+
+  if (cache_file.empty()) {
+    std::unique_ptr<dmlc::Parser<uint32_t>> parser(
+        dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, "auto"));
+    data::FileAdapter adapter(parser.get());
+    dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
+                           cache_file, data_split_mode);
+  } else {
+    data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart)};
+    dmat = new data::SparsePageDMatrix{&iter,
+                                       iter.Proxy(),
+                                       data::fileiter::Reset,
+                                       data::fileiter::Next,
+                                       std::numeric_limits<float>::quiet_NaN(),
+                                       1,
+                                       cache_file};
   }
 
   if (need_split && data_split_mode == DataSplitMode::kCol) {
diff --git a/src/data/ellpack_page.cc b/src/data/ellpack_page.cc
index 6199c1b21..f561ea97e 100644
--- a/src/data/ellpack_page.cc
+++ b/src/data/ellpack_page.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
  */
 #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 
@@ -12,7 +12,7 @@ class EllpackPageImpl {};
 
 EllpackPage::EllpackPage() = default;
 
-EllpackPage::EllpackPage(DMatrix*, const BatchParam&) {
+EllpackPage::EllpackPage(Context const*, DMatrix*, const BatchParam&) {
   LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
                 "EllpackPage is required";
 }
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index c1a964348..f2674aec0 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -21,8 +21,8 @@ namespace xgboost {
 
 EllpackPage::EllpackPage() : impl_{new EllpackPageImpl()} {}
 
-EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param)
-    : impl_{new EllpackPageImpl(dmat, param)} {}
+EllpackPage::EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param)
+    : impl_{new EllpackPageImpl{ctx, dmat, param}} {}
 
 EllpackPage::~EllpackPage() = default;
 
@@ -114,14 +114,13 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
 }
 
 // Construct an ELLPACK matrix in memory.
-EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
+EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
     : is_dense(dmat->IsDense()) {
   monitor_.Init("ellpack_page");
-
 #if defined(XGBOOST_USE_CUDA)
-  dh::safe_cuda(cudaSetDevice(param.gpu_id));
+  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
 #elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(param.gpu_id));
+  dh::safe_cuda(hipSetDevice(ctx->gpu_id));
 #endif
 
   n_rows = dmat->Info().num_row_;
@@ -129,19 +128,19 @@ EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
   monitor_.Start("Quantiles");
   // Create the quantile sketches for the dmatrix and initialize HistogramCuts.
   row_stride = GetRowStride(dmat);
-  cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
+  cuts_ = common::DeviceSketch(ctx->gpu_id, dmat, param.max_bin);
   monitor_.Stop("Quantiles");
 
   monitor_.Start("InitCompressedData");
-  this->InitCompressedData(param.gpu_id);
+  this->InitCompressedData(ctx->gpu_id);
   monitor_.Stop("InitCompressedData");
 
-  dmat->Info().feature_types.SetDevice(param.gpu_id);
+  dmat->Info().feature_types.SetDevice(ctx->gpu_id);
   auto ft = dmat->Info().feature_types.ConstDeviceSpan();
   monitor_.Start("BinningCompression");
   CHECK(dmat->SingleColBlock());
   for (const auto& batch : dmat->GetBatches<SparsePage>()) {
-    CreateHistIndices(param.gpu_id, batch, ft);
+    CreateHistIndices(ctx->gpu_id, batch, ft);
   }
   monitor_.Stop("BinningCompression");
 }
diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh
index faf44b3b6..ee6a2c221 100644
--- a/src/data/ellpack_page.cuh
+++ b/src/data/ellpack_page.cuh
@@ -155,7 +155,7 @@ class EllpackPageImpl {
    * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
    * in CSR format.
    */
-  explicit EllpackPageImpl(DMatrix* dmat, const BatchParam& parm);
+  explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);
 
   template <typename AdapterBatch>
   explicit EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense,
diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu
index c9a79dfda..f7889cf50 100644
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019-2022 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
  */
 #include <memory>
 #include <utility>
@@ -11,9 +11,9 @@ namespace xgboost {
 namespace data {
 void EllpackPageSource::Fetch() {
 #if defined(XGBOOST_USE_CUDA)
-  dh::safe_cuda(cudaSetDevice(param_.gpu_id));
+  dh::safe_cuda(cudaSetDevice(device_));
 #elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(param_.gpu_id));
+  dh::safe_cuda(hipSetDevice(device_));
 #endif
   if (!this->ReadCache()) {
     if (count_ != 0 && !sync_) {
@@ -26,8 +26,7 @@ void EllpackPageSource::Fetch() {
     auto const &csr = source_->Page();
     this->page_.reset(new EllpackPage{});
     auto *impl = this->page_->Impl();
-    *impl = EllpackPageImpl(param_.gpu_id, *cuts_, *csr, is_dense_, row_stride_,
-                            feature_types_);
+    *impl = EllpackPageImpl(device_, *cuts_, *csr, is_dense_, row_stride_, feature_types_);
     page_->SetBaseRowId(csr->base_rowid);
     this->WriteCache();
   }
diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h
index 9ac513ec3..3e8857521 100644
--- a/src/data/ellpack_page_source.h
+++ b/src/data/ellpack_page_source.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019-2022 by XGBoost Contributors
+/**
+ * Copyright 2019-2023, XGBoost Contributors
  */
 
 #ifndef XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_
@@ -23,19 +23,21 @@ class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
   BatchParam param_;
   common::Span<FeatureType const> feature_types_;
   std::unique_ptr<common::HistogramCuts> cuts_;
+  std::int32_t device_;
 
  public:
   EllpackPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches,
                     std::shared_ptr<Cache> cache, BatchParam param,
                     std::unique_ptr<common::HistogramCuts> cuts, bool is_dense, size_t row_stride,
                     common::Span<FeatureType const> feature_types,
-                    std::shared_ptr<SparsePageSource> source)
+                    std::shared_ptr<SparsePageSource> source, std::int32_t device)
       : PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache, false),
         is_dense_{is_dense},
         row_stride_{row_stride},
         param_{std::move(param)},
         feature_types_{feature_types},
-        cuts_{std::move(cuts)} {
+        cuts_{std::move(cuts)},
+        device_{device} {
     this->source_ = source;
     this->Fetch();
   }
diff --git a/src/data/file_iterator.h b/src/data/file_iterator.h
index 96f0e09d4..4d7239677 100644
--- a/src/data/file_iterator.h
+++ b/src/data/file_iterator.h
@@ -1,22 +1,62 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023, XGBoost contributors
  */
 #ifndef XGBOOST_DATA_FILE_ITERATOR_H_
 #define XGBOOST_DATA_FILE_ITERATOR_H_
 
-#include <string>
+#include <map>
 #include <memory>
-#include <vector>
+#include <sstream>  // for istringstream
+#include <string>
 #include <utility>
+#include <vector>
 
+#include "array_interface.h"
 #include "dmlc/data.h"
 #include "xgboost/c_api.h"
 #include "xgboost/json.h"
 #include "xgboost/linalg.h"
-#include "array_interface.h"
 
 namespace xgboost {
 namespace data {
+/**
+ * \brief Validate that a data URI carries an explicit `format` parameter.
+ *
+ * The URI is split on `#` (cache file suffix) and `?` (argument list); the argument
+ * list is parsed as `key=value` pairs joined by `&`. A failed check or fatal log is
+ * raised when the URI is malformed or when no `format` key is present.
+ *
+ * \param uri File URI, e.g. `filename?format=csv`.
+ */
+inline void ValidateFileFormat(std::string const& uri) {
+  StringView msg{"URI parameter `format` is required for loading text data: filename?format=csv"};
+
+  std::vector<std::string> name_cache = common::Split(uri, '#');
+  // Guard the `name_cache[0]` access below against an empty split result.
+  CHECK(!name_cache.empty()) << msg;
+  CHECK_LE(name_cache.size(), 2)
+      << "Only one `#` is allowed in file path for cachefile specification";
+
+  std::vector<std::string> name_args = common::Split(name_cache[0], '?');
+  CHECK_LE(name_args.size(), 2) << "only one `?` is allowed in file path.";
+  CHECK_EQ(name_args.size(), 2) << msg;
+
+  std::map<std::string, std::string> args;
+  std::vector<std::string> arg_list = common::Split(name_args[1], '&');
+  for (size_t i = 0; i < arg_list.size(); ++i) {
+    std::istringstream is(arg_list[i]);
+    std::pair<std::string, std::string> kv;
+    CHECK(std::getline(is, kv.first, '=')) << "Invalid uri argument format"
+                                           << " for key in arg " << i + 1;
+    CHECK(std::getline(is, kv.second)) << "Invalid uri argument format"
+                                       << " for value in arg " << i + 1;
+    args.insert(kv);
+  }
+  if (args.find("format") == args.cend()) {
+    LOG(FATAL) << msg;
+  }
+}
+
 /**
  * An iterator for implementing external memory support with file inputs.  Users of
  * external memory are encouraged to define their own file parsers/loaders so this one is
@@ -31,8 +59,6 @@ class FileIterator {
   uint32_t part_idx_;
   // Equals to total number of workers.
   uint32_t n_parts_;
-  // Format of the input file, like "libsvm".
-  std::string type_;
 
   DMatrixHandle proxy_;
 
@@ -45,10 +71,9 @@ class FileIterator {
   std::string indices_;
 
  public:
-  FileIterator(std::string uri, unsigned part_index, unsigned num_parts,
-               std::string type)
-      : uri_{std::move(uri)}, part_idx_{part_index}, n_parts_{num_parts},
-        type_{std::move(type)} {
+  FileIterator(std::string uri, unsigned part_index, unsigned num_parts)
+      : uri_{std::move(uri)}, part_idx_{part_index}, n_parts_{num_parts} {
+    ValidateFileFormat(uri_);
     XGProxyDMatrixCreate(&proxy_);
   }
   ~FileIterator() {
@@ -94,9 +119,7 @@ class FileIterator {
   auto Proxy() -> decltype(proxy_) { return proxy_; }
 
   void Reset() {
-    CHECK(!type_.empty());
-    parser_.reset(dmlc::Parser<uint32_t>::Create(uri_.c_str(), part_idx_,
-                                                 n_parts_, type_.c_str()));
+    parser_.reset(dmlc::Parser<uint32_t>::Create(uri_.c_str(), part_idx_, n_parts_, "auto"));
   }
 };
 
diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc
index 4d7dbe9b5..d1f2659a3 100644
--- a/src/data/gradient_index.cc
+++ b/src/data/gradient_index.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2017-2022 by XGBoost Contributors
+/**
+ * Copyright 2017-2023, XGBoost Contributors
  * \brief Data type for fast histogram aggregation.
  */
 #include "gradient_index.h"
@@ -19,18 +19,18 @@ namespace xgboost {
 
 GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnMatrix>()} {}
 
-GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
-                                   double sparse_thresh, bool sorted_sketch, int32_t n_threads,
+GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
+                                   double sparse_thresh, bool sorted_sketch,
                                    common::Span<float> hess)
     : max_numeric_bins_per_feat{max_bins_per_feat} {
   CHECK(p_fmat->SingleColBlock());
   // We use sorted sketching for approx tree method since it's more efficient in
   // computation time (but higher memory usage).
-  cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);
+  cut = common::SketchOnDMatrix(ctx, p_fmat, max_bins_per_feat, sorted_sketch, hess);
 
   const uint32_t nbins = cut.Ptrs().back();
   hit_count.resize(nbins, 0);
-  hit_count_tloc_.resize(n_threads * nbins, 0);
+  hit_count_tloc_.resize(ctx->Threads() * nbins, 0);
 
   size_t new_size = 1;
   for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
@@ -45,7 +45,7 @@ GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
   auto ft = p_fmat->Info().feature_types.ConstHostSpan();
 
   for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
-    this->PushBatch(batch, ft, n_threads);
+    this->PushBatch(batch, ft, ctx->Threads());
   }
   this->columns_ = std::make_unique<common::ColumnMatrix>();
 
@@ -54,7 +54,7 @@ GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
     // hist
     CHECK(!sorted_sketch);
     for (auto const &page : p_fmat->GetBatches<SparsePage>()) {
-      this->columns_->InitFromSparse(page, *this, sparse_thresh, n_threads);
+      this->columns_->InitFromSparse(page, *this, sparse_thresh, ctx->Threads());
     }
   }
 }
@@ -166,6 +166,12 @@ float GHistIndexMatrix::GetFvalue(size_t ridx, size_t fidx, bool is_cat) const {
   auto const &values = cut.Values();
   auto const &mins = cut.MinValues();
   auto const &ptrs = cut.Ptrs();
+  return this->GetFvalue(ptrs, values, mins, ridx, fidx, is_cat);
+}
+
+float GHistIndexMatrix::GetFvalue(std::vector<std::uint32_t> const &ptrs,
+                                  std::vector<float> const &values, std::vector<float> const &mins,
+                                  bst_row_t ridx, bst_feature_t fidx, bool is_cat) const {
   if (is_cat) {
     auto gidx = GetGindex(ridx, fidx);
     if (gidx == -1) {
@@ -181,24 +187,27 @@ float GHistIndexMatrix::GetFvalue(size_t ridx, size_t fidx, bool is_cat) const {
     }
     return common::HistogramCuts::NumericBinValue(ptrs, values, mins, fidx, bin_idx);
   };
-
-  if (columns_->GetColumnType(fidx) == common::kDenseColumn) {
-    if (columns_->AnyMissing()) {
+  switch (columns_->GetColumnType(fidx)) {
+    case common::kDenseColumn: {
+      if (columns_->AnyMissing()) {
+        return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
+          auto column = columns_->DenseColumn<decltype(dtype), true>(fidx);
+          return get_bin_val(column);
+        });
+      } else {
+        return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
+          auto column = columns_->DenseColumn<decltype(dtype), false>(fidx);
+          auto bin_idx = column[ridx];
+          return common::HistogramCuts::NumericBinValue(ptrs, values, mins, fidx, bin_idx);
+        });
+      }
+    }
+    case common::kSparseColumn: {
       return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
-        auto column = columns_->DenseColumn<decltype(dtype), true>(fidx);
-        return get_bin_val(column);
-      });
-    } else {
-      return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
-        auto column = columns_->DenseColumn<decltype(dtype), false>(fidx);
+        auto column = columns_->SparseColumn<decltype(dtype)>(fidx, 0);
         return get_bin_val(column);
       });
     }
-  } else {
-    return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
-      auto column = columns_->SparseColumn<decltype(dtype)>(fidx, 0);
-      return get_bin_val(column);
-    });
   }
 
   SPAN_CHECK(false);
diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h
index 3cb0709bd..d36373d6b 100644
--- a/src/data/gradient_index.h
+++ b/src/data/gradient_index.h
@@ -19,7 +19,6 @@
 #include "../common/threading_utils.h"
 #include "../common/transform_iterator.h"  // for MakeIndexTransformIter
 #include "adapter.h"
-#include "proxy_dmatrix.h"
 #include "xgboost/base.h"
 #include "xgboost/data.h"
 
@@ -155,8 +154,8 @@ class GHistIndexMatrix {
   /**
    * \brief Constrcutor for SimpleDMatrix.
    */
-  GHistIndexMatrix(DMatrix* x, bst_bin_t max_bins_per_feat, double sparse_thresh,
-                   bool sorted_sketch, int32_t n_threads, common::Span<float> hess = {});
+  GHistIndexMatrix(Context const* ctx, DMatrix* x, bst_bin_t max_bins_per_feat,
+                   double sparse_thresh, bool sorted_sketch, common::Span<float> hess = {});
   /**
    * \brief Constructor for Iterative DMatrix. Initialize basic information and prepare
    *        for push batch.
@@ -239,6 +238,9 @@ class GHistIndexMatrix {
   bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
 
   float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
+  float GetFvalue(std::vector<std::uint32_t> const& ptrs, std::vector<float> const& values,
+                  std::vector<float> const& mins, bst_row_t ridx, bst_feature_t fidx,
+                  bool is_cat) const;
 
  private:
   std::unique_ptr<common::ColumnMatrix> columns_;
@@ -292,28 +294,5 @@ void AssignColumnBinIndex(GHistIndexMatrix const& page, Fn&& assign) {
     }
   });
 }
-
-/**
- * \brief Should we regenerate the gradient index?
- *
- * \param old Parameter stored in DMatrix.
- * \param p   New parameter passed in by caller.
- */
-inline bool RegenGHist(BatchParam old, BatchParam p) {
-  // parameter is renewed or caller requests a regen
-  if (p == BatchParam{}) {
-    // empty parameter is passed in, don't regenerate so that we can use gindex in
-    // predictor, which doesn't have any training parameter.
-    return false;
-  }
-
-  // Avoid comparing nan values.
-  bool l_nan = std::isnan(old.sparse_thresh);
-  bool r_nan = std::isnan(p.sparse_thresh);
-  // regenerate if parameter is changed.
-  bool st_chg = (l_nan != r_nan) || (!l_nan && !r_nan && (old.sparse_thresh != p.sparse_thresh));
-  bool param_chg = old.gpu_id != p.gpu_id || old.max_bin != p.max_bin;
-  return p.regen || param_chg || st_chg;
-}
 }      // namespace xgboost
 #endif  // XGBOOST_DATA_GRADIENT_INDEX_H_
diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc
index 1bf755915..8eb1c2034 100644
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -1,25 +1,26 @@
-/*!
- * Copyright 2022 XGBoost contributors
+/**
+ * Copyright 2022-2023, XGBoost contributors
  */
 #include "iterative_dmatrix.h"
 
-#include <algorithm>    // std::copy
-#include <cstddef>      // std::size_t
-#include <type_traits>  // std::underlying_type_t
-#include <vector>       // std::vector
+#include <algorithm>    // for copy
+#include <cstddef>      // for size_t
+#include <memory>       // for shared_ptr
+#include <type_traits>  // for underlying_type_t
+#include <vector>       // for vector
 
 #include "../collective/communicator-inl.h"
 #include "../common/categorical.h"  // common::IsCat
 #include "../common/column_matrix.h"
-#include "../tree/param.h"  // FIXME(jiamingy): Find a better way to share this parameter.
+#include "../tree/param.h"          // FIXME(jiamingy): Find a better way to share this parameter.
+#include "batch_utils.h"            // for RegenGHist
 #include "gradient_index.h"
 #include "proxy_dmatrix.h"
 #include "simple_batch_iterator.h"
-#include "xgboost/data.h"  // FeatureType
+#include "xgboost/data.h"  // for FeatureType, DMatrix
 #include "xgboost/logging.h"
 
-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy,
                                    std::shared_ptr<DMatrix> ref, DataIterResetCallback* reset,
                                    XGDMatrixCallbackNext* next, float missing, int nthread,
@@ -34,60 +35,61 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
 
   auto d = MakeProxy(proxy_)->DeviceIdx();
 
-  StringView msg{"All batch should be on the same device."};
-  if (batch_param_.gpu_id != Context::kCpuId) {
-    CHECK_EQ(d, batch_param_.gpu_id) << msg;
-  }
-
-  batch_param_ = BatchParam{d, max_bin};
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}});
   // hardcoded parameter.
-  batch_param_.sparse_thresh = tree::TrainParam::DftSparseThreshold();
+  BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};
 
-  ctx_.UpdateAllowUnknown(
-      Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}});
-  if (ctx_.IsCPU()) {
-    this->InitFromCPU(iter_handle, missing, ref);
+  if (ctx.IsCPU()) {
+    this->InitFromCPU(&ctx, p, iter_handle, missing, ref);
   } else {
-    this->InitFromCUDA(iter_handle, missing, ref);
+    this->InitFromCUDA(&ctx, p, iter_handle, missing, ref);
   }
+
+  this->fmat_ctx_ = ctx;
+  this->batch_ = p;
 }
 
-void GetCutsFromRef(std::shared_ptr<DMatrix> ref_, bst_feature_t n_features, BatchParam p,
-                    common::HistogramCuts* p_cuts) {
-  CHECK(ref_);
+void GetCutsFromRef(Context const* ctx, std::shared_ptr<DMatrix> ref, bst_feature_t n_features,
+                    BatchParam p, common::HistogramCuts* p_cuts) {
+  CHECK(ref);
   CHECK(p_cuts);
-  auto csr = [&]() {
-    for (auto const& page : ref_->GetBatches<GHistIndexMatrix>(p)) {
+  p.forbid_regen = true;
+  // Fetch cuts from the first gradient index (GHistIndexMatrix) page.
+  auto csr = [&] {
+    for (auto const& page : ref->GetBatches<GHistIndexMatrix>(ctx, p)) {
       *p_cuts = page.cut;
       break;
     }
   };
-  auto ellpack = [&]() {
-    // workaround ellpack being initialized from CPU.
-    if (p.gpu_id == Context::kCpuId) {
-      p.gpu_id = ref_->Ctx()->gpu_id;
-    }
-    if (p.gpu_id == Context::kCpuId) {
-      p.gpu_id = 0;
-    }
-    for (auto const& page : ref_->GetBatches<EllpackPage>(p)) {
+  // Fetch cuts from Ellpack.
+  auto ellpack = [&] {
+    for (auto const& page : ref->GetBatches<EllpackPage>(ctx, p)) {
       GetCutsFromEllpack(page, p_cuts);
       break;
     }
   };
 
-  if (ref_->PageExists<GHistIndexMatrix>()) {
+  if (ref->PageExists<GHistIndexMatrix>() && ref->PageExists<EllpackPage>()) {
+    // Both page types exist; use the one that matches the requesting context's device.
+    if (ctx->IsCPU()) {
+      csr();
+    } else {
+      ellpack();
+    }
+  } else if (ref->PageExists<GHistIndexMatrix>()) {
     csr();
-  } else if (ref_->PageExists<EllpackPage>()) {
+  } else if (ref->PageExists<EllpackPage>()) {
     ellpack();
   } else {
-    if (p.gpu_id == Context::kCpuId) {
+    // Neither page type exists; request the one that matches the requesting context's device.
+    if (ctx->IsCPU()) {
       csr();
     } else {
       ellpack();
     }
   }
-  CHECK_EQ(ref_->Info().num_col_, n_features)
+  CHECK_EQ(ref->Info().num_col_, n_features)
       << "Invalid ref DMatrix, different number of features.";
 }
 
@@ -112,7 +114,8 @@ void SyncFeatureType(std::vector<FeatureType>* p_h_ft) {
 }
 }  // anonymous namespace
 
-void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
+void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
+                                   DataIterHandle iter_handle, float missing,
                                    std::shared_ptr<DMatrix> ref) {
   DMatrixProxy* proxy = MakeProxy(proxy_);
   CHECK(proxy);
@@ -133,7 +136,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   auto const is_valid = data::IsValidFunctor{missing};
   auto nnz_cnt = [&]() {
     return HostAdapterDispatch(proxy, [&](auto const& value) {
-      size_t n_threads = ctx_.Threads();
+      size_t n_threads = ctx->Threads();
       size_t n_features = column_sizes.size();
       linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, Context::kCpuId);
       column_sizes_tloc.Data()->Fill(0ul);
@@ -158,10 +161,10 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
     });
   };
 
-  size_t n_features = 0;
-  size_t n_batches = 0;
-  size_t accumulated_rows{0};
-  size_t nnz{0};
+  std::uint64_t n_features = 0;
+  std::size_t n_batches = 0;
+  std::uint64_t accumulated_rows{0};
+  std::uint64_t nnz{0};
 
   /**
    * CPU impl needs an additional loop for accumulating the column size.
@@ -203,7 +206,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   accumulated_rows = 0;
   std::vector<FeatureType> h_ft;
   if (ref) {
-    GetCutsFromRef(ref, Info().num_col_, batch_param_, &cuts);
+    GetCutsFromRef(ctx, ref, Info().num_col_, p, &cuts);
     h_ft = ref->Info().feature_types.HostVector();
   } else {
     size_t i = 0;
@@ -211,9 +214,8 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
       if (!p_sketch) {
         h_ft = proxy->Info().feature_types.ConstHostVector();
         SyncFeatureType(&h_ft);
-        p_sketch.reset(new common::HostSketchContainer{
-            batch_param_.max_bin, h_ft, column_sizes, !proxy->Info().group_ptr_.empty(),
-            proxy->Info().IsColumnSplit(), ctx_.Threads()});
+        p_sketch.reset(new common::HostSketchContainer{ctx, p.max_bin, h_ft, column_sizes,
+                                                       !proxy->Info().group_ptr_.empty()});
       }
       HostAdapterDispatch(proxy, [&](auto const& batch) {
         proxy->Info().num_nonzero_ = batch_nnz[i];
@@ -228,7 +230,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
     CHECK_EQ(accumulated_rows, Info().num_row_);
 
     CHECK(p_sketch);
-    p_sketch->MakeCuts(&cuts);
+    p_sketch->MakeCuts(Info(), &cuts);
   }
   if (!h_ft.empty()) {
     CHECK_EQ(h_ft.size(), n_features);
@@ -237,15 +239,15 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   /**
    * Generate gradient index.
    */
-  this->ghist_ = std::make_unique<GHistIndexMatrix>(Info(), std::move(cuts), batch_param_.max_bin);
+  this->ghist_ = std::make_unique<GHistIndexMatrix>(Info(), std::move(cuts), p.max_bin);
   size_t rbegin = 0;
   size_t prev_sum = 0;
   size_t i = 0;
   while (iter.Next()) {
     HostAdapterDispatch(proxy, [&](auto const& batch) {
       proxy->Info().num_nonzero_ = batch_nnz[i];
-      this->ghist_->PushAdapterBatch(&ctx_, rbegin, prev_sum, batch, missing, h_ft,
-                                     batch_param_.sparse_thresh, Info().num_row_);
+      this->ghist_->PushAdapterBatch(ctx, rbegin, prev_sum, batch, missing, h_ft, p.sparse_thresh,
+                                     Info().num_row_);
     });
     if (n_batches != 1) {
       this->info_.Extend(std::move(proxy->Info()), false, true);
@@ -265,7 +267,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   accumulated_rows = 0;
   while (iter.Next()) {
     HostAdapterDispatch(proxy, [&](auto const& batch) {
-      this->ghist_->PushAdapterBatchColumns(&ctx_, batch, missing, accumulated_rows);
+      this->ghist_->PushAdapterBatchColumns(ctx, batch, missing, accumulated_rows);
     });
     accumulated_rows += num_rows();
   }
@@ -282,11 +284,27 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   Info().feature_types.HostVector() = h_ft;
 }
 
-BatchSet<GHistIndexMatrix> IterativeDMatrix::GetGradientIndex(BatchParam const& param) {
-  CheckParam(param);
+BatchSet<GHistIndexMatrix> IterativeDMatrix::GetGradientIndex(Context const* ctx,
+                                                              BatchParam const& param) {
+  if (param.Initialized()) {
+    CheckParam(param);
+    CHECK(!detail::RegenGHist(param, batch_)) << error::InconsistentMaxBin();
+  }
+  if (!ellpack_ && !ghist_) {
+    LOG(FATAL) << "`QuantileDMatrix` not initialized.";
+  }
+
   if (!ghist_) {
-    CHECK(ellpack_);
-    ghist_ = std::make_shared<GHistIndexMatrix>(&ctx_, Info(), *ellpack_, param);
+    if (ctx->IsCPU()) {
+      ghist_ = std::make_shared<GHistIndexMatrix>(ctx, Info(), *ellpack_, param);
+    } else if (fmat_ctx_.IsCPU()) {
+      ghist_ = std::make_shared<GHistIndexMatrix>(&fmat_ctx_, Info(), *ellpack_, param);
+    } else {
+      // Can happen when QDM is initialized on GPU, but a CPU version is queried by a different QDM
+      // for cut reference.
+      auto cpu_ctx = ctx->MakeCPU();
+      ghist_ = std::make_shared<GHistIndexMatrix>(&cpu_ctx, Info(), *ellpack_, param);
+    }
   }
 
   if (!std::isnan(param.sparse_thresh) &&
@@ -300,8 +318,9 @@ BatchSet<GHistIndexMatrix> IterativeDMatrix::GetGradientIndex(BatchParam const&
   return BatchSet<GHistIndexMatrix>(begin_iter);
 }
 
-BatchSet<ExtSparsePage> IterativeDMatrix::GetExtBatches(BatchParam const& param) {
-  for (auto const& page : this->GetGradientIndex(param)) {
+BatchSet<ExtSparsePage> IterativeDMatrix::GetExtBatches(Context const* ctx,
+                                                        BatchParam const& param) {
+  for (auto const& page : this->GetGradientIndex(ctx, param)) {
     auto p_out = std::make_shared<SparsePage>();
     p_out->data.Resize(this->Info().num_nonzero_);
     p_out->offset.Resize(this->Info().num_row_ + 1);
@@ -336,5 +355,26 @@ BatchSet<ExtSparsePage> IterativeDMatrix::GetExtBatches(BatchParam const& param)
       BatchIterator<ExtSparsePage>(new SimpleBatchIteratorImpl<ExtSparsePage>(nullptr));
   return BatchSet<ExtSparsePage>(begin_iter);
 }
-}  // namespace data
-}  // namespace xgboost
+
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
+inline void IterativeDMatrix::InitFromCUDA(Context const*, BatchParam const&, DataIterHandle, float,
+                                           std::shared_ptr<DMatrix>) {
+  // Silence the warning about unused variables in builds without GPU support.
+  (void)(proxy_);
+  (void)(reset_);
+  (void)(next_);
+  common::AssertGPUSupport();
+}
+
+inline BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const*,
+                                                                 BatchParam const&) {
+  common::AssertGPUSupport();
+  auto begin_iter = BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(ellpack_));
+  return BatchSet<EllpackPage>(begin_iter);
+}
+
+inline void GetCutsFromEllpack(EllpackPage const&, common::HistogramCuts*) {
+  common::AssertGPUSupport();
+}
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
+}  // namespace xgboost::data
diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu
index 0cdffa124..ad968b7f1 100644
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -1,22 +1,24 @@
-/*!
- * Copyright 2020-2022 XGBoost contributors
+/**
+ * Copyright 2020-2023, XGBoost contributors
  */
 #include <algorithm>
 #include <memory>
 #include <type_traits>
 
 #include "../common/hist_util.cuh"
+#include "batch_utils.h"  // for RegenGHist
 #include "device_adapter.cuh"
 #include "ellpack_page.cuh"
+#include "gradient_index.h"
 #include "iterative_dmatrix.h"
 #include "proxy_dmatrix.cuh"
 #include "proxy_dmatrix.h"
 #include "simple_batch_iterator.h"
 #include "sparse_page_source.h"
 
-namespace xgboost {
-namespace data {
-void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
+namespace xgboost::data {
+void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
+                                    DataIterHandle iter_handle, float missing,
                                     std::shared_ptr<DMatrix> ref) {
   // A handle passed to external iterator.
   DMatrixProxy* proxy = MakeProxy(proxy_);
@@ -52,7 +54,7 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
 #endif
 
   auto get_device = [&]() -> int32_t {
-    int32_t d = (ctx_.gpu_id == Context::kCpuId) ? current_device : ctx_.gpu_id;
+    std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
     CHECK_NE(d, Context::kCpuId);
     return d;
   };
@@ -63,7 +65,7 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
   common::HistogramCuts cuts;
   do {
     // We use do while here as the first batch is fetched in ctor
-    ctx_.gpu_id = proxy->DeviceIdx();
-    CHECK_LT(ctx_.gpu_id, common::AllVisibleGPUs());
+    // The target device comes from the caller-supplied context; make sure it is a visible GPU.
+    CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
 
 #if defined(XGBOOST_USE_CUDA)
@@ -80,12 +82,12 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
       CHECK_EQ(cols, num_cols()) << "Inconsistent number of columns.";
     }
     if (!ref) {
-      sketch_containers.emplace_back(proxy->Info().feature_types, batch_param_.max_bin, cols,
-                                     num_rows(), get_device());
+      sketch_containers.emplace_back(proxy->Info().feature_types, p.max_bin, cols, num_rows(),
+                                     get_device());
       auto* p_sketch = &sketch_containers.back();
       proxy->Info().weights_.SetDevice(get_device());
       Dispatch(proxy, [&](auto const& value) {
-        common::AdapterDeviceSketch(value, batch_param_.max_bin, proxy->Info(), missing, p_sketch);
+        common::AdapterDeviceSketch(value, p.max_bin, proxy->Info(), missing, p_sketch);
       });
     }
     auto batch_rows = num_rows();
@@ -118,8 +120,8 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
   if (!ref) {
     HostDeviceVector<FeatureType> ft;
     common::SketchContainer final_sketch(
-        sketch_containers.empty() ? ft : sketch_containers.front().FeatureTypes(),
-        batch_param_.max_bin, cols, accumulated_rows, get_device());
+        sketch_containers.empty() ? ft : sketch_containers.front().FeatureTypes(), p.max_bin, cols,
+        accumulated_rows, get_device());
     for (auto const& sketch : sketch_containers) {
       final_sketch.Merge(sketch.ColumnsPtr(), sketch.Data());
       final_sketch.FixError();
@@ -129,7 +131,7 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
 
     final_sketch.MakeCuts(&cuts);
   } else {
-    GetCutsFromRef(ref, Info().num_col_, batch_param_, &cuts);
+    GetCutsFromRef(ctx, ref, Info().num_col_, p, &cuts);
   }
 
   this->info_.num_row_ = accumulated_rows;
@@ -198,24 +200,34 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
   info_.SynchronizeNumberOfColumns();
 }
 
-BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(BatchParam const& param) {
-  CheckParam(param);
+BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,
+                                                          BatchParam const& param) {
+  if (param.Initialized()) {
+    CheckParam(param);
+    CHECK(!detail::RegenGHist(param, batch_)) << error::InconsistentMaxBin();
+  }
   if (!ellpack_ && !ghist_) {
     LOG(FATAL) << "`QuantileDMatrix` not initialized.";
   }
-  if (!ellpack_ && ghist_) {
+
+  if (!ellpack_) {
     ellpack_.reset(new EllpackPage());
-    // Evaluation QuantileDMatrix initialized from CPU data might not have the correct GPU
-    // ID.
-    if (this->ctx_.IsCPU()) {
-      this->ctx_.gpu_id = param.gpu_id;
+    if (ctx->IsCUDA()) {
+      this->Info().feature_types.SetDevice(ctx->gpu_id);
+      *ellpack_->Impl() =
+          EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
+    } else if (fmat_ctx_.IsCUDA()) {
+      this->Info().feature_types.SetDevice(fmat_ctx_.gpu_id);
+      *ellpack_->Impl() =
+          EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
+    } else {
+      // Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM
+      // for cut reference.
+      auto cuda_ctx = ctx->MakeCUDA();
+      this->Info().feature_types.SetDevice(cuda_ctx.gpu_id);
+      *ellpack_->Impl() =
+          EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
     }
-    if (this->ctx_.IsCPU()) {
-      this->ctx_.gpu_id = dh::CurrentDevice();
-    }
-    this->Info().feature_types.SetDevice(this->ctx_.gpu_id);
-    *ellpack_->Impl() =
-        EllpackPageImpl(&ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
   }
   CHECK(ellpack_);
   auto begin_iter = BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(ellpack_));
@@ -225,5 +237,4 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(BatchParam const& para
 void GetCutsFromEllpack(EllpackPage const& page, common::HistogramCuts* cuts) {
   *cuts = page.Impl()->Cuts();
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
diff --git a/src/data/iterative_dmatrix.h b/src/data/iterative_dmatrix.h
index d3ee62696..bcaa5b63c 100644
--- a/src/data/iterative_dmatrix.h
+++ b/src/data/iterative_dmatrix.h
@@ -1,6 +1,8 @@
-/*!
- * Copyright 2020-2022 by Contributors
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
  * \file iterative_dmatrix.h
+ *
+ * \brief Implementation of the higher-level `QuantileDMatrix`.
  */
 #ifndef XGBOOST_DATA_ITERATIVE_DMATRIX_H_
 #define XGBOOST_DATA_ITERATIVE_DMATRIX_H_
@@ -10,10 +12,12 @@
 #include <utility>
 #include <vector>
 
+#include "../common/error_msg.h"
 #include "proxy_dmatrix.h"
 #include "simple_batch_iterator.h"
 #include "xgboost/base.h"
 #include "xgboost/c_api.h"
+#include "xgboost/context.h"  // for Context
 #include "xgboost/data.h"
 
 namespace xgboost {
@@ -43,21 +47,17 @@ namespace data {
  */
 class IterativeDMatrix : public DMatrix {
   MetaInfo info_;
-  Context ctx_;
-  BatchParam batch_param_;
   std::shared_ptr<EllpackPage> ellpack_;
   std::shared_ptr<GHistIndexMatrix> ghist_;
+  BatchParam batch_;
 
   DMatrixHandle proxy_;
   DataIterResetCallback *reset_;
   XGDMatrixCallbackNext *next_;
+  Context fmat_ctx_;
 
   void CheckParam(BatchParam const &param) {
-    // FIXME(Jiamingy): https://github.com/dmlc/xgboost/issues/7976
-    if (param.max_bin != batch_param_.max_bin && param.max_bin != 0) {
-      LOG(WARNING) << "Inconsistent max_bin between Quantile DMatrix and Booster:" << param.max_bin
-                   << " vs. " << batch_param_.max_bin;
-    }
+    CHECK_EQ(param.max_bin, batch_.max_bin) << error::InconsistentMaxBin();
     CHECK(!param.regen && param.hess.empty())
         << "Only `hist` and `gpu_hist` tree method can use `QuantileDMatrix`.";
   }
@@ -68,8 +68,10 @@ class IterativeDMatrix : public DMatrix {
     return BatchSet<Page>(BatchIterator<Page>(nullptr));
   }
 
-  void InitFromCUDA(DataIterHandle iter, float missing, std::shared_ptr<DMatrix> ref);
-  void InitFromCPU(DataIterHandle iter_handle, float missing, std::shared_ptr<DMatrix> ref);
+  void InitFromCUDA(Context const *ctx, BatchParam const &p, DataIterHandle iter_handle,
+                    float missing, std::shared_ptr<DMatrix> ref);
+  void InitFromCPU(Context const *ctx, BatchParam const &p, DataIterHandle iter_handle,
+                   float missing, std::shared_ptr<DMatrix> ref);
 
  public:
   explicit IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy,
@@ -94,51 +96,40 @@ class IterativeDMatrix : public DMatrix {
     LOG(FATAL) << "Not implemented.";
     return BatchSet<SparsePage>(BatchIterator<SparsePage>(nullptr));
   }
-  BatchSet<CSCPage> GetColumnBatches() override { return InvalidTreeMethod<CSCPage>(); }
-  BatchSet<SortedCSCPage> GetSortedColumnBatches() override {
+  BatchSet<CSCPage> GetColumnBatches(Context const *) override {
+    return InvalidTreeMethod<CSCPage>();
+  }
+  BatchSet<SortedCSCPage> GetSortedColumnBatches(Context const *) override {
     return InvalidTreeMethod<SortedCSCPage>();
   }
-  BatchSet<GHistIndexMatrix> GetGradientIndex(BatchParam const &param) override;
+  BatchSet<GHistIndexMatrix> GetGradientIndex(Context const *ctx, BatchParam const &param) override;
 
-  BatchSet<EllpackPage> GetEllpackBatches(const BatchParam &param) override;
-  BatchSet<ExtSparsePage> GetExtBatches(BatchParam const& param) override;
+  BatchSet<EllpackPage> GetEllpackBatches(Context const *ctx, const BatchParam &param) override;
+  BatchSet<ExtSparsePage> GetExtBatches(Context const *ctx, BatchParam const &param) override;
 
   bool SingleColBlock() const override { return true; }
 
   MetaInfo &Info() override { return info_; }
   MetaInfo const &Info() const override { return info_; }
 
-  Context const *Ctx() const override { return &ctx_; }
+  Context const *Ctx() const override { return &fmat_ctx_; }
 };
 
 /**
- * \brief Get quantile cuts from reference Quantile DMatrix.
+ * \brief Get quantile cuts from reference (Quantile)DMatrix.
+ *
+ * \param ctx The context of the new DMatrix.
+ * \param ref The reference DMatrix.
+ * \param n_features Number of features, used for validation only.
+ * \param p Batch parameter for the new DMatrix.
+ * \param p_cuts Output quantile cuts.
  */
-void GetCutsFromRef(std::shared_ptr<DMatrix> ref_, bst_feature_t n_features, BatchParam p,
-                    common::HistogramCuts *p_cuts);
+void GetCutsFromRef(Context const *ctx, std::shared_ptr<DMatrix> ref, bst_feature_t n_features,
+                    BatchParam p, common::HistogramCuts *p_cuts);
 /**
  * \brief Get quantile cuts from ellpack page.
  */
 void GetCutsFromEllpack(EllpackPage const &page, common::HistogramCuts *cuts);
-
-#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
-inline void IterativeDMatrix::InitFromCUDA(DataIterHandle, float, std::shared_ptr<DMatrix>) {
-  // silent the warning about unused variables.
-  (void)(proxy_);
-  (void)(reset_);
-  (void)(next_);
-  common::AssertGPUSupport();
-}
-inline BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(const BatchParam &) {
-  common::AssertGPUSupport();
-  auto begin_iter = BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(ellpack_));
-  return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));
-}
-
-inline void GetCutsFromEllpack(EllpackPage const &, common::HistogramCuts *) {
-  common::AssertGPUSupport();
-}
-#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 }  // namespace data
 }  // namespace xgboost
 
diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h
index 587510bd2..e885b471f 100644
--- a/src/data/proxy_dmatrix.h
+++ b/src/data/proxy_dmatrix.h
@@ -25,16 +25,11 @@ class DataIterProxy {
   NextFn* next_;
 
  public:
-  DataIterProxy(DataIterHandle iter, ResetFn* reset, NextFn* next) :
-      iter_{iter},
-      reset_{reset}, next_{next} {}
+  DataIterProxy(DataIterHandle iter, ResetFn* reset, NextFn* next)
+      : iter_{iter}, reset_{reset}, next_{next} {}
 
-  bool Next() {
-    return next_(iter_);
-  }
-  void Reset() {
-    reset_(iter_);
-  }
+  bool Next() { return next_(iter_); }
+  void Reset() { reset_(iter_); }
 };
 
 /*
@@ -68,9 +63,8 @@ class DMatrixProxy : public DMatrix {
   }
 
   void SetArrayData(char const* c_interface);
-  void SetCSRData(char const *c_indptr, char const *c_indices,
-                  char const *c_values, bst_feature_t n_features,
-                  bool on_host);
+  void SetCSRData(char const* c_indptr, char const* c_indices, char const* c_values,
+                  bst_feature_t n_features, bool on_host);
 
   MetaInfo& Info() override { return info_; }
   MetaInfo const& Info() const override { return info_; }
@@ -81,6 +75,12 @@ class DMatrixProxy : public DMatrix {
   bool GHistIndexExists() const override { return false; }
   bool SparsePageExists() const override { return false; }
 
+  template <typename Page>
+  BatchSet<Page> NoBatch() {
+    LOG(FATAL) << "Proxy DMatrix cannot return data batch.";
+    return BatchSet<Page>(BatchIterator<Page>(nullptr));
+  }
+
   DMatrix* Slice(common::Span<int32_t const> /*ridxs*/) override {
     LOG(FATAL) << "Slicing DMatrix is not supported for Proxy DMatrix.";
     return nullptr;
@@ -89,29 +89,19 @@ class DMatrixProxy : public DMatrix {
     LOG(FATAL) << "Slicing DMatrix columns is not supported for Proxy DMatrix.";
     return nullptr;
   }
-  BatchSet<SparsePage> GetRowBatches() override {
-    LOG(FATAL) << "Not implemented.";
-    return BatchSet<SparsePage>(BatchIterator<SparsePage>(nullptr));
+  BatchSet<SparsePage> GetRowBatches() override { return NoBatch<SparsePage>(); }
+  BatchSet<CSCPage> GetColumnBatches(Context const*) override { return NoBatch<CSCPage>(); }
+  BatchSet<SortedCSCPage> GetSortedColumnBatches(Context const*) override {
+    return NoBatch<SortedCSCPage>();
   }
-  BatchSet<CSCPage> GetColumnBatches() override {
-    LOG(FATAL) << "Not implemented.";
-    return BatchSet<CSCPage>(BatchIterator<CSCPage>(nullptr));
+  BatchSet<EllpackPage> GetEllpackBatches(Context const*, BatchParam const&) override {
+    return NoBatch<EllpackPage>();
   }
-  BatchSet<SortedCSCPage> GetSortedColumnBatches() override {
-    LOG(FATAL) << "Not implemented.";
-    return BatchSet<SortedCSCPage>(BatchIterator<SortedCSCPage>(nullptr));
+  BatchSet<GHistIndexMatrix> GetGradientIndex(Context const*, BatchParam const&) override {
+    return NoBatch<GHistIndexMatrix>();
   }
-  BatchSet<EllpackPage> GetEllpackBatches(const BatchParam&) override {
-    LOG(FATAL) << "Not implemented.";
-    return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(nullptr));
-  }
-  BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam&) override {
-    LOG(FATAL) << "Not implemented.";
-    return BatchSet<GHistIndexMatrix>(BatchIterator<GHistIndexMatrix>(nullptr));
-  }
-  BatchSet<ExtSparsePage> GetExtBatches(BatchParam const&) override {
-    LOG(FATAL) << "Not implemented.";
-    return BatchSet<ExtSparsePage>(BatchIterator<ExtSparsePage>(nullptr));
+  BatchSet<ExtSparsePage> GetExtBatches(Context const*, BatchParam const&) override {
+    return NoBatch<ExtSparsePage>();
   }
   std::any Adapter() const { return batch_; }
 };
@@ -144,8 +134,7 @@ decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_
     } else {
       LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
     }
-    return std::result_of_t<Fn(
-        decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
+    return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
   }
 }
 }  // namespace xgboost::data
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index e916311a5..ab75cf03e 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -11,10 +11,12 @@
 #include <type_traits>
 #include <vector>
 
+#include "../common/error_msg.h"  // for InconsistentMaxBin
 #include "../common/random.h"
 #include "../common/threading_utils.h"
 #include "./simple_batch_iterator.h"
 #include "adapter.h"
+#include "batch_utils.h"  // for CheckEmpty, RegenGHist
 #include "gradient_index.h"
 #include "xgboost/c_api.h"
 #include "xgboost/data.h"
@@ -28,7 +30,7 @@ const MetaInfo& SimpleDMatrix::Info() const { return info_; }
 DMatrix* SimpleDMatrix::Slice(common::Span<int32_t const> ridxs) {
   auto out = new SimpleDMatrix;
   SparsePage& out_page = *out->sparse_page_;
-  for (auto const &page : this->GetBatches<SparsePage>()) {
+  for (auto const& page : this->GetBatches<SparsePage>()) {
     auto batch = page.GetView();
     auto& h_data = out_page.data.HostVector();
     auto& h_offset = out_page.offset.HostVector();
@@ -42,7 +44,7 @@ DMatrix* SimpleDMatrix::Slice(common::Span<int32_t const> ridxs) {
     out->Info() = this->Info().Slice(ridxs);
     out->Info().num_nonzero_ = h_offset.back();
   }
-  out->ctx_ = this->ctx_;
+  out->fmat_ctx_ = this->fmat_ctx_;
   return out;
 }
 
@@ -52,7 +54,7 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
   auto const slice_size = info_.num_col_ / num_slices;
   auto const slice_start = slice_size * slice_id;
   auto const slice_end = (slice_id == num_slices - 1) ? info_.num_col_ : slice_start + slice_size;
-  for (auto const &page : this->GetBatches<SparsePage>()) {
+  for (auto const& page : this->GetBatches<SparsePage>()) {
     auto batch = page.GetView();
     auto& h_data = out_page.data.HostVector();
     auto& h_offset = out_page.offset.HostVector();
@@ -60,9 +62,8 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
     for (bst_row_t i = 0; i < this->Info().num_row_; i++) {
       auto inst = batch[i];
       auto prev_size = h_data.size();
-      std::copy_if(inst.begin(), inst.end(), std::back_inserter(h_data), [&](Entry e) {
-        return e.index >= slice_start && e.index < slice_end;
-      });
+      std::copy_if(inst.begin(), inst.end(), std::back_inserter(h_data),
+                   [&](Entry e) { return e.index >= slice_start && e.index < slice_end; });
       rptr += h_data.size() - prev_size;
       h_offset.emplace_back(rptr);
     }
@@ -73,7 +74,7 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
   return out;
 }
 
-void SimpleDMatrix::ReindexFeatures() {
+void SimpleDMatrix::ReindexFeatures(Context const* ctx) {
   if (info_.IsVerticalFederated()) {
     std::vector<uint64_t> buffer(collective::GetWorldSize());
     buffer[collective::GetRank()] = info_.num_col_;
@@ -82,72 +83,115 @@ void SimpleDMatrix::ReindexFeatures() {
     if (offset == 0) {
       return;
     }
-    sparse_page_->Reindex(offset, ctx_.Threads());
+    sparse_page_->Reindex(offset, ctx->Threads());
   }
 }
 
 BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
   // since csr is the default data structure so `source_` is always available.
-  auto begin_iter = BatchIterator<SparsePage>(
-      new SimpleBatchIteratorImpl<SparsePage>(sparse_page_));
+  auto begin_iter =
+      BatchIterator<SparsePage>(new SimpleBatchIteratorImpl<SparsePage>(sparse_page_));
   return BatchSet<SparsePage>(begin_iter);
 }
 
-BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches() {
+BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches(Context const* ctx) {
   // column page doesn't exist, generate it
   if (!column_page_) {
-    column_page_.reset(new CSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx_.Threads())));
+    column_page_.reset(new CSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
   }
-  auto begin_iter =
-      BatchIterator<CSCPage>(new SimpleBatchIteratorImpl<CSCPage>(column_page_));
+  auto begin_iter = BatchIterator<CSCPage>(new SimpleBatchIteratorImpl<CSCPage>(column_page_));
   return BatchSet<CSCPage>(begin_iter);
 }
 
-BatchSet<SortedCSCPage> SimpleDMatrix::GetSortedColumnBatches() {
+BatchSet<SortedCSCPage> SimpleDMatrix::GetSortedColumnBatches(Context const* ctx) {
   // Sorted column page doesn't exist, generate it
   if (!sorted_column_page_) {
     sorted_column_page_.reset(
-        new SortedCSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx_.Threads())));
-    sorted_column_page_->SortRows(ctx_.Threads());
+        new SortedCSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
+    sorted_column_page_->SortRows(ctx->Threads());
   }
-  auto begin_iter = BatchIterator<SortedCSCPage>(
-      new SimpleBatchIteratorImpl<SortedCSCPage>(sorted_column_page_));
+  auto begin_iter =
+      BatchIterator<SortedCSCPage>(new SimpleBatchIteratorImpl<SortedCSCPage>(sorted_column_page_));
   return BatchSet<SortedCSCPage>(begin_iter);
 }
 
-namespace {
-void CheckEmpty(BatchParam const& l, BatchParam const& r) {
-  if (l == BatchParam{}) {
-    CHECK(r != BatchParam{}) << "Batch parameter is not initialized.";
+BatchSet<EllpackPage> SimpleDMatrix::GetEllpackBatches(Context const* ctx,
+                                                       const BatchParam& param) {
+  detail::CheckEmpty(batch_param_, param);
+  if (ellpack_page_ && param.Initialized() && param.forbid_regen) {
+    if (detail::RegenGHist(batch_param_, param)) {
+      CHECK_EQ(batch_param_.max_bin, param.max_bin) << error::InconsistentMaxBin();
+    }
+    CHECK(!detail::RegenGHist(batch_param_, param));
   }
-}
-}  // anonymous namespace
-
-BatchSet<EllpackPage> SimpleDMatrix::GetEllpackBatches(const BatchParam& param) {
-  // ELLPACK page doesn't exist, generate it
-  CheckEmpty(batch_param_, param);
-  if (!ellpack_page_ || RegenGHist(batch_param_, param)) {
-    CHECK_GE(param.gpu_id, 0);
+  if (!ellpack_page_ || detail::RegenGHist(batch_param_, param)) {
+    // ELLPACK page doesn't exist, generate it
+    LOG(INFO) << "Generating new Ellpack page.";
+    // These places can ask for an ellpack page:
+    // - GPU hist: the ctx must be on CUDA.
+    // - IterativeDMatrix::InitFromCUDA: The ctx must be on CUDA.
+    // - IterativeDMatrix::InitFromCPU: It asks for ellpack only if it exists. It should
+    //   not regen, otherwise it indicates a mismatched parameter like max_bin.
     CHECK_GE(param.max_bin, 2);
-    ellpack_page_.reset(new EllpackPage(this, param));
-    batch_param_ = param;
+    if (ctx->IsCUDA()) {
+      // The context passed in is on GPU, we pick it first since we prioritize the context
+      // in Booster.
+      ellpack_page_.reset(new EllpackPage(ctx, this, param));
+    } else if (fmat_ctx_.IsCUDA()) {
+      // DMatrix was initialized on GPU, we use the context from initialization.
+      ellpack_page_.reset(new EllpackPage(&fmat_ctx_, this, param));
+    } else {
+      // Mismatched parameter: the user set a new max_bin during training.
+      auto cuda_ctx = ctx->MakeCUDA();
+      ellpack_page_.reset(new EllpackPage(&cuda_ctx, this, param));
+    }
+
+    batch_param_ = param.MakeCache();
   }
   auto begin_iter =
       BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(ellpack_page_));
   return BatchSet<EllpackPage>(begin_iter);
 }
 
-BatchSet<GHistIndexMatrix> SimpleDMatrix::GetGradientIndex(const BatchParam& param) {
-  CheckEmpty(batch_param_, param);
-  if (!gradient_index_ || RegenGHist(batch_param_, param)) {
+BatchSet<GHistIndexMatrix> SimpleDMatrix::GetGradientIndex(Context const* ctx,
+                                                           const BatchParam& param) {
+  detail::CheckEmpty(batch_param_, param);
+  // Check whether we can regenerate the gradient index. This is to keep the consistency
+  // between evaluation data and training data.
+  if (gradient_index_ && param.Initialized() && param.forbid_regen) {
+    if (detail::RegenGHist(batch_param_, param)) {
+      CHECK_EQ(batch_param_.max_bin, param.max_bin) << error::InconsistentMaxBin();
+    }
+    CHECK(!detail::RegenGHist(batch_param_, param)) << "Inconsistent sparse threshold.";
+  }
+  if (!gradient_index_ || detail::RegenGHist(batch_param_, param)) {
+    // GIDX page doesn't exist, generate it
     LOG(INFO) << "Generating new Gradient Index.";
+    // These places can ask for a CSR gidx:
+    // - CPU Hist: the ctx must be on CPU.
+    // - IterativeDMatrix::InitFromCPU: The ctx must be on CPU.
+    // - IterativeDMatrix::InitFromCUDA: It asks for gidx only if it exists. It should not
+    //   regen, otherwise it indicates a mismatched parameter like max_bin.
     CHECK_GE(param.max_bin, 2);
-    CHECK_EQ(param.gpu_id, -1);
     // Used only by approx.
     auto sorted_sketch = param.regen;
-    gradient_index_.reset(new GHistIndexMatrix(this, param.max_bin, param.sparse_thresh,
-                                               sorted_sketch, this->ctx_.Threads(), param.hess));
-    batch_param_ = param;
+    if (ctx->IsCPU()) {
+      // The context passed in is on CPU, we pick it first since we prioritize the context
+      // in Booster.
+      gradient_index_.reset(new GHistIndexMatrix{ctx, this, param.max_bin, param.sparse_thresh,
+                                                 sorted_sketch, param.hess});
+    } else if (fmat_ctx_.IsCPU()) {
+      // DMatrix was initialized on CPU, we use the context from initialization.
+      gradient_index_.reset(new GHistIndexMatrix{&fmat_ctx_, this, param.max_bin,
+                                                 param.sparse_thresh, sorted_sketch, param.hess});
+    } else {
+      // Mismatched parameter: the user set a new max_bin during training.
+      auto cpu_ctx = ctx->MakeCPU();
+      gradient_index_.reset(new GHistIndexMatrix{&cpu_ctx, this, param.max_bin, param.sparse_thresh,
+                                                 sorted_sketch, param.hess});
+    }
+
+    batch_param_ = param.MakeCache();
     CHECK_EQ(batch_param_.hess.data(), param.hess.data());
   }
   auto begin_iter = BatchIterator<GHistIndexMatrix>(
@@ -155,7 +199,7 @@ BatchSet<GHistIndexMatrix> SimpleDMatrix::GetGradientIndex(const BatchParam& par
   return BatchSet<GHistIndexMatrix>(begin_iter);
 }
 
-BatchSet<ExtSparsePage> SimpleDMatrix::GetExtBatches(BatchParam const&) {
+BatchSet<ExtSparsePage> SimpleDMatrix::GetExtBatches(Context const*, BatchParam const&) {
   auto casted = std::make_shared<ExtSparsePage>(sparse_page_);
   CHECK(casted);
   auto begin_iter =
@@ -166,7 +210,8 @@ BatchSet<ExtSparsePage> SimpleDMatrix::GetExtBatches(BatchParam const&) {
 template <typename AdapterT>
 SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
                              DataSplitMode data_split_mode) {
-  this->ctx_.nthread = nthread;
+  Context ctx;
+  ctx.Init(Args{{"nthread", std::to_string(nthread)}});
 
   std::vector<uint64_t> qids;
   uint64_t default_max = std::numeric_limits<uint64_t>::max();
@@ -176,13 +221,13 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
   auto& data_vec = sparse_page_->data.HostVector();
   uint64_t inferred_num_columns = 0;
   uint64_t total_batch_size = 0;
-    // batch_size is either number of rows or cols, depending on data layout
+  // batch_size is either number of rows or cols, depending on data layout
 
   adapter->BeforeFirst();
   // Iterate over batches of input data
   while (adapter->Next()) {
     auto& batch = adapter->Value();
-    auto batch_max_columns = sparse_page_->Push(batch, missing, ctx_.Threads());
+    auto batch_max_columns = sparse_page_->Push(batch, missing, ctx.Threads());
     inferred_num_columns = std::max(batch_max_columns, inferred_num_columns);
     total_batch_size += batch.Size();
     // Append meta information if available
@@ -229,19 +274,18 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
     info_.num_col_ = adapter->NumColumns();
   }
 
-
   // Synchronise worker columns
   info_.data_split_mode = data_split_mode;
-  ReindexFeatures();
+  ReindexFeatures(&ctx);
   info_.SynchronizeNumberOfColumns();
 
   if (adapter->NumRows() == kAdapterUnknownSize) {
-    using IteratorAdapterT
-      = IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>;
+    using IteratorAdapterT =
+        IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>;
     // If AdapterT is either IteratorAdapter or FileAdapter type, use the total batch size to
     // determine the correct number of rows, as offset_vec may be too short
-    if (std::is_same<AdapterT, IteratorAdapterT>::value
-        || std::is_same<AdapterT, FileAdapter>::value) {
+    if (std::is_same<AdapterT, IteratorAdapterT>::value ||
+        std::is_same<AdapterT, FileAdapter>::value) {
       info_.num_row_ = total_batch_size;
       // Ensure offset_vec.size() - 1 == [number of rows]
       while (offset_vec.size() - 1 < total_batch_size) {
@@ -265,9 +309,11 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
   info_.num_nonzero_ = data_vec.size();
 
   // Sort the index for row partitioners used by variuos tree methods.
-  if (!sparse_page_->IsIndicesSorted(this->ctx_.Threads())) {
-    sparse_page_->SortIndices(this->ctx_.Threads());
+  if (!sparse_page_->IsIndicesSorted(ctx.Threads())) {
+    sparse_page_->SortIndices(ctx.Threads());
   }
+
+  this->fmat_ctx_ = ctx;
 }
 
 SimpleDMatrix::SimpleDMatrix(dmlc::Stream* in_stream) {
@@ -280,12 +326,12 @@ SimpleDMatrix::SimpleDMatrix(dmlc::Stream* in_stream) {
 }
 
 void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
-    std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname.c_str(), "w"));
-    int tmagic = kMagic;
-    fo->Write(tmagic);
-    info_.SaveBinary(fo.get());
-    fo->Write(sparse_page_->offset.HostVector());
-    fo->Write(sparse_page_->data.HostVector());
+  std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname.c_str(), "w"));
+  int tmagic = kMagic;
+  fo->Write(tmagic);
+  info_.SaveBinary(fo.get());
+  fo->Write(sparse_page_->offset.HostVector());
+  fo->Write(sparse_page_->data.HostVector());
 }
 
 template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing, int nthread,
@@ -305,14 +351,14 @@ template SimpleDMatrix::SimpleDMatrix(DataTableAdapter* adapter, float missing,
 template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int nthread,
                                       DataSplitMode data_split_mode);
 template SimpleDMatrix::SimpleDMatrix(
-    IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>
-        *adapter,
+    IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
     float missing, int nthread, DataSplitMode data_split_mode);
 
 template <>
 SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread,
                              DataSplitMode data_split_mode) {
-    ctx_.nthread = nthread;
+  Context ctx;
+  ctx.nthread = nthread;
 
   auto& offset_vec = sparse_page_->offset.HostVector();
   auto& data_vec = sparse_page_->data.HostVector();
@@ -326,7 +372,7 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
     size_t num_elements = 0;
     size_t num_rows = 0;
     // Import Arrow RecordBatches
-#pragma omp parallel for reduction(+ : num_elements, num_rows) num_threads(ctx_.Threads())
+#pragma omp parallel for reduction(+ : num_elements, num_rows) num_threads(ctx.Threads())
     for (int i = 0; i < static_cast<int>(batches.size()); ++i) {  // NOLINT
       num_elements += batches[i]->Import(missing);
       num_rows += batches[i]->Size();
@@ -348,7 +394,7 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
     data_vec.resize(total_elements);
     offset_vec.resize(total_batch_size + 1);
     // Copy data into DMatrix
-#pragma omp parallel num_threads(ctx_.Threads())
+#pragma omp parallel num_threads(ctx.Threads())
     {
 #pragma omp for nowait
       for (int i = 0; i < static_cast<int>(batches.size()); ++i) {  // NOLINT
@@ -372,12 +418,14 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
   // Synchronise worker columns
   info_.num_col_ = adapter->NumColumns();
   info_.data_split_mode = data_split_mode;
-  ReindexFeatures();
+  ReindexFeatures(&ctx);
   info_.SynchronizeNumberOfColumns();
 
   info_.num_row_ = total_batch_size;
   info_.num_nonzero_ = data_vec.size();
   CHECK_EQ(offset_vec.back(), info_.num_nonzero_);
+
+  fmat_ctx_ = ctx;
 }
 }  // namespace data
 }  // namespace xgboost
diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu
index b52333fe6..7aa6979c4 100644
--- a/src/data/simple_dmatrix.cu
+++ b/src/data/simple_dmatrix.cu
@@ -1,12 +1,14 @@
-/*!
- * Copyright 2019-2021 by XGBoost Contributors
+/**
+ * Copyright 2019-2023, XGBoost Contributors
  * \file simple_dmatrix.cu
  */
 #include <thrust/copy.h>
-#include <xgboost/data.h>
+
+#include "device_adapter.cuh"  // for CurrentDevice
 #include "simple_dmatrix.cuh"
 #include "simple_dmatrix.h"
-#include "device_adapter.cuh"
+#include "xgboost/context.h"  // for Context
+#include "xgboost/data.h"
 
 namespace xgboost {
 namespace data {
@@ -15,7 +17,7 @@ namespace data {
 // Current implementation assumes a single batch. More batches can
 // be supported in future. Does not currently support inferring row/column size
 template <typename AdapterT>
-SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread*/,
+SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthread,
                              DataSplitMode data_split_mode) {
   CHECK(data_split_mode != DataSplitMode::kCol)
       << "Column-wise data split is currently not supported on the GPU.";
@@ -29,6 +31,9 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread
   dh::safe_cuda(hipSetDevice(device));
 #endif
 
+  Context ctx;
+  ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(device)}});
+
   CHECK(adapter->NumRows() != kAdapterUnknownSize);
   CHECK(adapter->NumColumns() != kAdapterUnknownSize);
 
@@ -38,13 +43,14 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread
   // Enforce single batch
   CHECK(!adapter->Next());
 
-  info_.num_nonzero_ =
-      CopyToSparsePage(adapter->Value(), device, missing, sparse_page_.get());
+  info_.num_nonzero_ = CopyToSparsePage(adapter->Value(), device, missing, sparse_page_.get());
   info_.num_col_ = adapter->NumColumns();
   info_.num_row_ = adapter->NumRows();
   // Synchronise worker columns
   info_.data_split_mode = data_split_mode;
   info_.SynchronizeNumberOfColumns();
+
+  this->fmat_ctx_ = ctx;
 }
 
 template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
diff --git a/src/data/simple_dmatrix.h b/src/data/simple_dmatrix.h
index 853e765af..56685c1e6 100644
--- a/src/data/simple_dmatrix.h
+++ b/src/data/simple_dmatrix.h
@@ -32,7 +32,7 @@ class SimpleDMatrix : public DMatrix {
 
   MetaInfo& Info() override;
   const MetaInfo& Info() const override;
-  Context const* Ctx() const override { return &ctx_; }
+  Context const* Ctx() const override { return &fmat_ctx_; }
 
   bool SingleColBlock() const override { return true; }
   DMatrix* Slice(common::Span<int32_t const> ridxs) override;
@@ -43,11 +43,11 @@ class SimpleDMatrix : public DMatrix {
 
  protected:
   BatchSet<SparsePage> GetRowBatches() override;
-  BatchSet<CSCPage> GetColumnBatches() override;
-  BatchSet<SortedCSCPage> GetSortedColumnBatches() override;
-  BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) override;
-  BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam& param) override;
-  BatchSet<ExtSparsePage> GetExtBatches(BatchParam const& param) override;
+  BatchSet<CSCPage> GetColumnBatches(Context const* ctx) override;
+  BatchSet<SortedCSCPage> GetSortedColumnBatches(Context const* ctx) override;
+  BatchSet<EllpackPage> GetEllpackBatches(Context const* ctx, const BatchParam& param) override;
+  BatchSet<GHistIndexMatrix> GetGradientIndex(Context const* ctx, const BatchParam& param) override;
+  BatchSet<ExtSparsePage> GetExtBatches(Context const* ctx, BatchParam const& param) override;
 
   MetaInfo info_;
   // Primary storage type
@@ -69,10 +69,11 @@ class SimpleDMatrix : public DMatrix {
    * starting from 0. However, all the algorithms assume the features are globally indexed, so we
    * reindex the features based on the offset needed to obtain the global view.
    */
-  void ReindexFeatures();
+  void ReindexFeatures(Context const* ctx);
 
  private:
-  Context ctx_;
+  // Context from DMatrix initialization; exposed via Ctx() and used as a fallback for pages.
+  Context fmat_ctx_;
 };
 }  // namespace data
 }  // namespace xgboost
diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc
index f9b74ebcf..2cf32a115 100644
--- a/src/data/sparse_page_dmatrix.cc
+++ b/src/data/sparse_page_dmatrix.cc
@@ -1,6 +1,7 @@
-/*!
- * Copyright 2014-2022 by Contributors
+/**
+ * Copyright 2014-2023 by XGBoost Contributors
  * \file sparse_page_dmatrix.cc
+ *
  * \brief The external memory version of Page Iterator.
  * \author Tianqi Chen
  */
@@ -8,11 +9,10 @@
 
 #include "../collective/communicator-inl.h"
 #include "./simple_batch_iterator.h"
+#include "batch_utils.h"  // for RegenGHist
 #include "gradient_index.h"
 
-namespace xgboost {
-namespace data {
-
+namespace xgboost::data {
 MetaInfo &SparsePageDMatrix::Info() { return info_; }
 
 const MetaInfo &SparsePageDMatrix::Info() const { return info_; }
@@ -46,7 +46,9 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
                                      int32_t nthreads, std::string cache_prefix)
     : proxy_{proxy_handle}, iter_{iter_handle}, reset_{reset}, next_{next}, missing_{missing},
       cache_prefix_{std::move(cache_prefix)} {
-  ctx_.nthread = nthreads;
+  Context ctx;
+  ctx.nthread = nthreads;
+
   cache_prefix_ = cache_prefix_.empty() ? "DMatrix" : cache_prefix_;
   if (collective::IsDistributed()) {
     cache_prefix_ += ("-r" + std::to_string(collective::GetRank()));
@@ -81,7 +83,7 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
 
   // the proxy is iterated together with the sparse page source so we can obtain all
   // information in 1 pass.
-  for (auto const &page : this->GetRowBatchesImpl()) {
+  for (auto const &page : this->GetRowBatchesImpl(&ctx)) {
     this->info_.Extend(std::move(proxy->Info()), false, false);
     n_features = std::max(n_features, num_cols());
     n_samples += num_rows();
@@ -98,9 +100,11 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
 
   info_.SynchronizeNumberOfColumns();
   CHECK_NE(info_.num_col_, 0);
+
+  fmat_ctx_ = ctx;
 }
 
-void SparsePageDMatrix::InitializeSparsePage() {
+void SparsePageDMatrix::InitializeSparsePage(Context const *ctx) {
   auto id = MakeCache(this, ".row.page", cache_prefix_, &cache_info_);
   // Don't use proxy DMatrix once this is already initialized, this allows users to
   // release the iterator and data.
@@ -110,33 +114,33 @@ void SparsePageDMatrix::InitializeSparsePage() {
     return;
   }
 
-  auto iter = DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{
-      iter_, reset_, next_};
+  auto iter = DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{iter_, reset_, next_};
   DMatrixProxy *proxy = MakeProxy(proxy_);
   sparse_page_source_.reset();  // clear before creating new one to prevent conflicts.
-  sparse_page_source_ = std::make_shared<SparsePageSource>(
-      iter, proxy, this->missing_, this->ctx_.Threads(), this->info_.num_col_,
-      this->n_batches_, cache_info_.at(id));
+  sparse_page_source_ = std::make_shared<SparsePageSource>(iter, proxy, this->missing_,
+                                                           ctx->Threads(), this->info_.num_col_,
+                                                           this->n_batches_, cache_info_.at(id));
 }
 
-BatchSet<SparsePage> SparsePageDMatrix::GetRowBatchesImpl() {
-  this->InitializeSparsePage();
+BatchSet<SparsePage> SparsePageDMatrix::GetRowBatchesImpl(Context const* ctx) {
+  this->InitializeSparsePage(ctx);
   auto begin_iter = BatchIterator<SparsePage>(sparse_page_source_);
   return BatchSet<SparsePage>(BatchIterator<SparsePage>(begin_iter));
 }
 
 BatchSet<SparsePage> SparsePageDMatrix::GetRowBatches() {
-  return this->GetRowBatchesImpl();
+  // Use context from initialization for the default row page.
+  return this->GetRowBatchesImpl(&fmat_ctx_);
 }
 
-BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches() {
+BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches(Context const *ctx) {
   auto id = MakeCache(this, ".col.page", cache_prefix_, &cache_info_);
   CHECK_NE(this->Info().num_col_, 0);
-  this->InitializeSparsePage();
+  this->InitializeSparsePage(ctx);
   if (!column_source_) {
-    column_source_ = std::make_shared<CSCPageSource>(
-        this->missing_, this->ctx_.Threads(), this->Info().num_col_,
-        this->n_batches_, cache_info_.at(id), sparse_page_source_);
+    column_source_ =
+        std::make_shared<CSCPageSource>(this->missing_, ctx->Threads(), this->Info().num_col_,
+                                        this->n_batches_, cache_info_.at(id), sparse_page_source_);
   } else {
     column_source_->Reset();
   }
@@ -144,14 +148,14 @@ BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches() {
   return BatchSet<CSCPage>(BatchIterator<CSCPage>(begin_iter));
 }
 
-BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches() {
+BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches(Context const *ctx) {
   auto id = MakeCache(this, ".sorted.col.page", cache_prefix_, &cache_info_);
   CHECK_NE(this->Info().num_col_, 0);
-  this->InitializeSparsePage();
+  this->InitializeSparsePage(ctx);
   if (!sorted_column_source_) {
     sorted_column_source_ = std::make_shared<SortedCSCPageSource>(
-        this->missing_, this->ctx_.Threads(), this->Info().num_col_,
-        this->n_batches_, cache_info_.at(id), sparse_page_source_);
+        this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
+        sparse_page_source_);
   } else {
     sorted_column_source_->Reset();
   }
@@ -159,27 +163,27 @@ BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches() {
   return BatchSet<SortedCSCPage>(BatchIterator<SortedCSCPage>(begin_iter));
 }
 
-BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(const BatchParam &param) {
+BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(Context const *ctx,
+                                                               const BatchParam &param) {
   CHECK_GE(param.max_bin, 2);
   auto id = MakeCache(this, ".gradient_index.page", cache_prefix_, &cache_info_);
-  this->InitializeSparsePage();
-  if (!cache_info_.at(id)->written || RegenGHist(batch_param_, param)) {
+  this->InitializeSparsePage(ctx);
+  if (!cache_info_.at(id)->written || detail::RegenGHist(batch_param_, param)) {
     cache_info_.erase(id);
     MakeCache(this, ".gradient_index.page", cache_prefix_, &cache_info_);
     LOG(INFO) << "Generating new Gradient Index.";
     // Use sorted sketch for approx.
     auto sorted_sketch = param.regen;
-    auto cuts =
-        common::SketchOnDMatrix(this, param.max_bin, ctx_.Threads(), sorted_sketch, param.hess);
-    this->InitializeSparsePage();  // reset after use.
+    auto cuts = common::SketchOnDMatrix(ctx, this, param.max_bin, sorted_sketch, param.hess);
+    this->InitializeSparsePage(ctx);  // reset after use.
 
     batch_param_ = param;
     ghist_index_source_.reset();
     CHECK_NE(cuts.Values().size(), 0);
     auto ft = this->info_.feature_types.ConstHostSpan();
     ghist_index_source_.reset(new GradientIndexPageSource(
-        this->missing_, this->ctx_.Threads(), this->Info().num_col_, this->n_batches_,
-        cache_info_.at(id), param, std::move(cuts), this->IsDense(), ft, sparse_page_source_));
+        this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
+        param, std::move(cuts), this->IsDense(), ft, sparse_page_source_));
   } else {
     CHECK(ghist_index_source_);
     ghist_index_source_->Reset();
@@ -189,11 +193,10 @@ BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(const BatchParam
 }
 
 #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
-BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam &) {
+BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const *, const BatchParam &) {
   common::AssertGPUSupport();
   auto begin_iter = BatchIterator<EllpackPage>(ellpack_page_source_);
   return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));
 }
 #endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 }  // namespace data
-}  // namespace xgboost
diff --git a/src/data/sparse_page_dmatrix.cu b/src/data/sparse_page_dmatrix.cu
index b36a0e2a3..0a4cde43d 100644
--- a/src/data/sparse_page_dmatrix.cu
+++ b/src/data/sparse_page_dmatrix.cu
@@ -1,42 +1,40 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023 by XGBoost contributors
  */
-#include "sparse_page_source.h"
 #include "../common/hist_util.cuh"
+#include "batch_utils.h"  // for CheckEmpty, RegenGHist
 #include "ellpack_page.cuh"
 #include "sparse_page_dmatrix.h"
+#include "sparse_page_source.h"
 
-namespace xgboost {
-namespace data {
-BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& param) {
-  CHECK_GE(param.gpu_id, 0);
+namespace xgboost::data {
+BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
+                                                           const BatchParam& param) {
+  CHECK(ctx->IsCUDA());
   CHECK_GE(param.max_bin, 2);
-  if (!(batch_param_ != BatchParam{})) {
-    CHECK(param != BatchParam{}) << "Batch parameter is not initialized.";
-  }
+  detail::CheckEmpty(batch_param_, param);
   auto id = MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
   size_t row_stride = 0;
-  this->InitializeSparsePage();
-  if (!cache_info_.at(id)->written || RegenGHist(batch_param_, param)) {
+  this->InitializeSparsePage(ctx);
+  if (!cache_info_.at(id)->written || detail::RegenGHist(batch_param_, param)) {
     // reinitialize the cache
     cache_info_.erase(id);
     MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
     std::unique_ptr<common::HistogramCuts> cuts;
-    cuts.reset(new common::HistogramCuts{
-        common::DeviceSketch(param.gpu_id, this, param.max_bin, 0)});
-    this->InitializeSparsePage();  // reset after use.
+    cuts.reset(
+        new common::HistogramCuts{common::DeviceSketch(ctx->gpu_id, this, param.max_bin, 0)});
+    this->InitializeSparsePage(ctx);  // reset after use.
 
     row_stride = GetRowStride(this);
-    this->InitializeSparsePage();  // reset after use.
+    this->InitializeSparsePage(ctx);  // reset after use.
     CHECK_NE(row_stride, 0);
     batch_param_ = param;
 
     auto ft = this->info_.feature_types.ConstDeviceSpan();
     ellpack_page_source_.reset();  // release resources.
     ellpack_page_source_.reset(new EllpackPageSource(
-        this->missing_, this->ctx_.Threads(), this->Info().num_col_,
-        this->n_batches_, cache_info_.at(id), param, std::move(cuts),
-        this->IsDense(), row_stride, ft, sparse_page_source_));
+        this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
+        param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id));
   } else {
     CHECK(sparse_page_source_);
     ellpack_page_source_->Reset();
@@ -45,5 +43,4 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& par
   auto begin_iter = BatchIterator<EllpackPage>(ellpack_page_source_);
   return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
diff --git a/src/data/sparse_page_dmatrix.h b/src/data/sparse_page_dmatrix.h
index aa0be6984..02aa9a5c0 100644
--- a/src/data/sparse_page_dmatrix.h
+++ b/src/data/sparse_page_dmatrix.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2015-2021 by Contributors
+/**
+ * Copyright 2015-2023, XGBoost Contributors
  * \file sparse_page_dmatrix.h
  * \brief External-memory version of DMatrix.
  * \author Tianqi Chen
@@ -9,12 +9,13 @@
 
 #include <xgboost/data.h>
 #include <xgboost/logging.h>
+
 #include <algorithm>
+#include <map>
 #include <memory>
 #include <string>
 #include <utility>
 #include <vector>
-#include <map>
 
 #include "ellpack_page_source.h"
 #include "gradient_index_page_source.h"
@@ -69,19 +70,18 @@ class SparsePageDMatrix : public DMatrix {
   XGDMatrixCallbackNext *next_;
 
   float missing_;
-  Context ctx_;
+  Context fmat_ctx_;
   std::string cache_prefix_;
-  uint32_t n_batches_ {0};
+  uint32_t n_batches_{0};
   // sparse page is the source to other page types, we make a special member function.
-  void InitializeSparsePage();
+  void InitializeSparsePage(Context const *ctx);
   // Non-virtual version that can be used in constructor
-  BatchSet<SparsePage> GetRowBatchesImpl();
+  BatchSet<SparsePage> GetRowBatchesImpl(Context const *ctx);
 
  public:
-  explicit SparsePageDMatrix(DataIterHandle iter, DMatrixHandle proxy,
-                             DataIterResetCallback *reset,
-                             XGDMatrixCallbackNext *next, float missing,
-                             int32_t nthreads, std::string cache_prefix);
+  explicit SparsePageDMatrix(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
+                             XGDMatrixCallbackNext *next, float missing, int32_t nthreads,
+                             std::string cache_prefix);
 
   ~SparsePageDMatrix() override {
     // Clear out all resources before deleting the cache file.
@@ -98,9 +98,9 @@ class SparsePageDMatrix : public DMatrix {
     }
   }
 
-  MetaInfo& Info() override;
-  const MetaInfo& Info() const override;
-  Context const* Ctx() const override { return &ctx_; }
+  MetaInfo &Info() override;
+  const MetaInfo &Info() const override;
+  Context const *Ctx() const override { return &fmat_ctx_; }
 
   bool SingleColBlock() const override { return false; }
   DMatrix *Slice(common::Span<int32_t const>) override {
@@ -114,11 +114,11 @@ class SparsePageDMatrix : public DMatrix {
 
  private:
   BatchSet<SparsePage> GetRowBatches() override;
-  BatchSet<CSCPage> GetColumnBatches() override;
-  BatchSet<SortedCSCPage> GetSortedColumnBatches() override;
-  BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) override;
-  BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam&) override;
-  BatchSet<ExtSparsePage> GetExtBatches(BatchParam const &) override {
+  BatchSet<CSCPage> GetColumnBatches(Context const *ctx) override;
+  BatchSet<SortedCSCPage> GetSortedColumnBatches(Context const *ctx) override;
+  BatchSet<EllpackPage> GetEllpackBatches(Context const *ctx, const BatchParam &param) override;
+  BatchSet<GHistIndexMatrix> GetGradientIndex(Context const *ctx, const BatchParam &) override;
+  BatchSet<ExtSparsePage> GetExtBatches(Context const *, BatchParam const &) override {
     LOG(FATAL) << "Can not obtain a single CSR page for external memory DMatrix";
     return BatchSet<ExtSparsePage>(BatchIterator<ExtSparsePage>(nullptr));
   }
@@ -141,9 +141,8 @@ inline std::string MakeId(std::string prefix, SparsePageDMatrix *ptr) {
   return prefix + "-" + ss.str();
 }
 
-inline std::string
-MakeCache(SparsePageDMatrix *ptr, std::string format, std::string prefix,
-          std::map<std::string, std::shared_ptr<Cache>> *out) {
+inline std::string MakeCache(SparsePageDMatrix *ptr, std::string format, std::string prefix,
+                             std::map<std::string, std::shared_ptr<Cache>> *out) {
   auto &cache_info = *out;
   auto name = MakeId(prefix, ptr);
   auto id = name + format;
diff --git a/src/learner.cc b/src/learner.cc
index 1150a2355..78297404b 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -34,6 +34,7 @@
 #include <utility>                        // for pair, as_const, move, swap
 #include <vector>                         // for vector
 
+#include "collective/aggregator.h"        // for ApplyWithLabels
 #include "collective/communicator-inl.h"  // for Allreduce, Broadcast, GetRank, IsDistributed
 #include "collective/communicator.h"      // for Operation
 #include "common/api_entry.h"             // for XGBAPIThreadLocalEntry
@@ -859,22 +860,10 @@ class LearnerConfiguration : public Learner {
   }
 
   void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) {
-    // Special handling for vertical federated learning.
-    if (info.IsVerticalFederated()) {
-      // We assume labels are only available on worker 0, so the estimation is calculated there
-      // and broadcast to other workers.
-      if (collective::GetRank() == 0) {
-        UsePtr(obj_)->InitEstimation(info, base_score);
-        collective::Broadcast(base_score->Data()->HostPointer(),
-                              sizeof(bst_float) * base_score->Size(), 0);
-      } else {
-        base_score->Reshape(1);
-        collective::Broadcast(base_score->Data()->HostPointer(),
-                              sizeof(bst_float) * base_score->Size(), 0);
-      }
-    } else {
-      UsePtr(obj_)->InitEstimation(info, base_score);
-    }
+    base_score->Reshape(1);
+    collective::ApplyWithLabels(info, base_score->Data()->HostPointer(),
+                                sizeof(bst_float) * base_score->Size(),
+                                [&] { UsePtr(obj_)->InitEstimation(info, base_score); });
   }
 };
 
@@ -1486,24 +1475,10 @@ class LearnerImpl : public LearnerIO {
  private:
   void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info, int iteration,
                    HostDeviceVector<GradientPair>* out_gpair) {
-    // Special handling for vertical federated learning.
-    if (info.IsVerticalFederated()) {
-      // We assume labels are only available on worker 0, so the gradients are calculated there
-      // and broadcast to other workers.
-      if (collective::GetRank() == 0) {
-        obj_->GetGradient(preds, info, iteration, out_gpair);
-        collective::Broadcast(out_gpair->HostPointer(), out_gpair->Size() * sizeof(GradientPair),
-                              0);
-      } else {
-        CHECK_EQ(info.labels.Size(), 0)
-            << "In vertical federated learning, labels should only be on the first worker";
-        out_gpair->Resize(preds.Size());
-        collective::Broadcast(out_gpair->HostPointer(), out_gpair->Size() * sizeof(GradientPair),
-                              0);
-      }
-    } else {
-      obj_->GetGradient(preds, info, iteration, out_gpair);
-    }
+    out_gpair->Resize(preds.Size());
+    collective::ApplyWithLabels(info, out_gpair->HostPointer(),
+                                out_gpair->Size() * sizeof(GradientPair),
+                                [&] { obj_->GetGradient(preds, info, iteration, out_gpair); });
   }
 
   /*! \brief random number transformation seed. */
diff --git a/src/linear/coordinate_common.h b/src/linear/coordinate_common.h
index f61c423f0..f08856bd1 100644
--- a/src/linear/coordinate_common.h
+++ b/src/linear/coordinate_common.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2018 by Contributors
+/**
+ * Copyright 2018-2023 by XGBoost Contributors
  * \author Rory Mitchell
  */
 #pragma once
@@ -78,11 +78,12 @@ inline double CoordinateDeltaBias(double sum_grad, double sum_hess) {
  *
  * \return  The gradient and diagonal Hessian entry for a given feature.
  */
-inline std::pair<double, double> GetGradient(int group_idx, int num_group, int fidx,
-                                             const std::vector<GradientPair> &gpair,
+inline std::pair<double, double> GetGradient(Context const *ctx, int group_idx, int num_group,
+                                             bst_feature_t fidx,
+                                             std::vector<GradientPair> const &gpair,
                                              DMatrix *p_fmat) {
   double sum_grad = 0.0, sum_hess = 0.0;
-  for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
+  for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx)) {
     auto page = batch.GetView();
     auto col = page[fidx];
     const auto ndata = static_cast<bst_omp_uint>(col.size());
@@ -115,7 +116,7 @@ inline std::pair<double, double> GetGradientParallel(Context const *ctx, int gro
   std::vector<double> sum_grad_tloc(ctx->Threads(), 0.0);
   std::vector<double> sum_hess_tloc(ctx->Threads(), 0.0);
 
-  for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
+  for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx)) {
     auto page = batch.GetView();
     auto col = page[fidx];
     const auto ndata = static_cast<bst_omp_uint>(col.size());
@@ -177,16 +178,16 @@ inline std::pair<double, double> GetBiasGradientParallel(int group_idx, int num_
  * \param in_gpair  The gradient vector to be updated.
  * \param p_fmat    The input feature matrix.
  */
-inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
-                                   float dw, std::vector<GradientPair> *in_gpair,
-                                   DMatrix *p_fmat, int32_t n_threads) {
+inline void UpdateResidualParallel(Context const *ctx, bst_feature_t fidx, int group_idx,
+                                   int num_group, float dw, std::vector<GradientPair> *in_gpair,
+                                   DMatrix *p_fmat) {
   if (dw == 0.0f) return;
-  for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
+  for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx)) {
     auto page = batch.GetView();
     auto col = page[fidx];
     // update grad value
     const auto num_row = static_cast<bst_omp_uint>(col.size());
-    common::ParallelFor(num_row, n_threads, [&](auto j) {
+    common::ParallelFor(num_row, ctx->Threads(), [&](auto j) {
       GradientPair &p = (*in_gpair)[col[j].index * num_group + group_idx];
       if (p.GetHess() < 0.0f) return;
       p += GradientPair(p.GetHess() * col[j].fvalue * dw, 0);
@@ -203,12 +204,12 @@ inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
  * \param in_gpair  The gradient vector to be updated.
  * \param p_fmat    The input feature matrix.
  */
-inline void UpdateBiasResidualParallel(int group_idx, int num_group, float dbias,
-                                       std::vector<GradientPair> *in_gpair, DMatrix *p_fmat,
-                                       int32_t n_threads) {
+inline void UpdateBiasResidualParallel(Context const *ctx, int group_idx, int num_group,
+                                       float dbias, std::vector<GradientPair> *in_gpair,
+                                       DMatrix *p_fmat) {
   if (dbias == 0.0f) return;
   const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
-  common::ParallelFor(ndata, n_threads, [&](auto i) {
+  common::ParallelFor(ndata, ctx->Threads(), [&](auto i) {
     GradientPair &g = (*in_gpair)[i * num_group + group_idx];
     if (g.GetHess() < 0.0f) return;
     g += GradientPair(g.GetHess() * dbias, 0);
@@ -220,18 +221,16 @@ inline void UpdateBiasResidualParallel(int group_idx, int num_group, float dbias
  *        in coordinate descent algorithms.
  */
 class FeatureSelector {
- protected:
-  int32_t n_threads_{-1};
-
  public:
-  explicit FeatureSelector(int32_t n_threads) : n_threads_{n_threads} {}
+  FeatureSelector() = default;
   /*! \brief factory method */
-  static FeatureSelector *Create(int choice, int32_t n_threads);
+  static FeatureSelector *Create(int choice);
   /*! \brief virtual destructor */
   virtual ~FeatureSelector() = default;
   /**
    * \brief Setting up the selector state prior to looping through features.
    *
+   * \param ctx    The booster context.
    * \param model  The model.
    * \param gpair  The gpair.
    * \param p_fmat The feature matrix.
@@ -239,13 +238,12 @@ class FeatureSelector {
    * \param lambda Regularisation lambda.
    * \param param  A parameter with algorithm-dependent use.
    */
-  virtual void Setup(const gbm::GBLinearModel &,
-                     const std::vector<GradientPair> &,
-                     DMatrix *,
-                     float , float , int ) {}
+  virtual void Setup(Context const *, const gbm::GBLinearModel &,
+                     const std::vector<GradientPair> &, DMatrix *, float, float, int) {}
   /**
    * \brief Select next coordinate to update.
    *
+   * \param ctx       Booster context
    * \param iteration The iteration in a loop through features
    * \param model     The model.
    * \param group_idx Zero-based index of the group.
@@ -256,11 +254,9 @@ class FeatureSelector {
    *
    * \return  The index of the selected feature. -1 indicates none selected.
    */
-  virtual int NextFeature(int iteration,
-                          const gbm::GBLinearModel &model,
-                          int group_idx,
-                          const std::vector<GradientPair> &gpair,
-                          DMatrix *p_fmat, float alpha, float lambda) = 0;
+  virtual int NextFeature(Context const *ctx, int iteration, const gbm::GBLinearModel &model,
+                          int group_idx, const std::vector<GradientPair> &gpair, DMatrix *p_fmat,
+                          float alpha, float lambda) = 0;
 };
 
 /**
@@ -269,9 +265,8 @@ class FeatureSelector {
 class CyclicFeatureSelector : public FeatureSelector {
  public:
   using FeatureSelector::FeatureSelector;
-  int NextFeature(int iteration, const gbm::GBLinearModel &model,
-                  int , const std::vector<GradientPair> &,
-                  DMatrix *, float, float) override {
+  int NextFeature(Context const *, int iteration, const gbm::GBLinearModel &model, int,
+                  const std::vector<GradientPair> &, DMatrix *, float, float) override {
     return iteration % model.learner_model_param->num_feature;
   }
 };
@@ -283,8 +278,7 @@ class CyclicFeatureSelector : public FeatureSelector {
 class ShuffleFeatureSelector : public FeatureSelector {
  public:
   using FeatureSelector::FeatureSelector;
-  void Setup(const gbm::GBLinearModel &model,
-             const std::vector<GradientPair>&,
+  void Setup(Context const *, const gbm::GBLinearModel &model, const std::vector<GradientPair> &,
              DMatrix *, float, float, int) override {
     if (feat_index_.size() == 0) {
       feat_index_.resize(model.learner_model_param->num_feature);
@@ -293,9 +287,8 @@ class ShuffleFeatureSelector : public FeatureSelector {
     std::shuffle(feat_index_.begin(), feat_index_.end(), common::GlobalRandom());
   }
 
-  int NextFeature(int iteration, const gbm::GBLinearModel &model,
-                  int, const std::vector<GradientPair> &,
-                  DMatrix *, float, float) override {
+  int NextFeature(Context const *, int iteration, const gbm::GBLinearModel &model, int,
+                  const std::vector<GradientPair> &, DMatrix *, float, float) override {
     return feat_index_[iteration % model.learner_model_param->num_feature];
   }
 
@@ -310,9 +303,8 @@ class ShuffleFeatureSelector : public FeatureSelector {
 class RandomFeatureSelector : public FeatureSelector {
  public:
   using FeatureSelector::FeatureSelector;
-  int NextFeature(int, const gbm::GBLinearModel &model,
-                  int, const std::vector<GradientPair> &,
-                  DMatrix *, float, float) override {
+  int NextFeature(Context const *, int, const gbm::GBLinearModel &model, int,
+                  const std::vector<GradientPair> &, DMatrix *, float, float) override {
     return common::GlobalRandom()() % model.learner_model_param->num_feature;
   }
 };
@@ -329,8 +321,7 @@ class RandomFeatureSelector : public FeatureSelector {
 class GreedyFeatureSelector : public FeatureSelector {
  public:
   using FeatureSelector::FeatureSelector;
-  void Setup(const gbm::GBLinearModel &model,
-             const std::vector<GradientPair> &,
+  void Setup(Context const *, const gbm::GBLinearModel &model, const std::vector<GradientPair> &,
              DMatrix *, float, float, int param) override {
     top_k_ = static_cast<bst_uint>(param);
     const bst_uint ngroup = model.learner_model_param->num_output_group;
@@ -344,7 +335,7 @@ class GreedyFeatureSelector : public FeatureSelector {
     }
   }
 
-  int NextFeature(int, const gbm::GBLinearModel &model,
+  int NextFeature(Context const* ctx, int, const gbm::GBLinearModel &model,
                   int group_idx, const std::vector<GradientPair> &gpair,
                   DMatrix *p_fmat, float alpha, float lambda) override {
     // k-th selected feature for a group
@@ -356,9 +347,9 @@ class GreedyFeatureSelector : public FeatureSelector {
     const bst_omp_uint nfeat = model.learner_model_param->num_feature;
     // Calculate univariate gradient sums
     std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
-    for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
+    for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx)) {
       auto page = batch.GetView();
-      common::ParallelFor(nfeat, this->n_threads_, [&](bst_omp_uint i) {
+      common::ParallelFor(nfeat, ctx->Threads(), [&](bst_omp_uint i) {
         const auto col = page[i];
         const bst_uint ndata = col.size();
         auto &sums = gpair_sums_[group_idx * nfeat + i];
@@ -406,9 +397,10 @@ class GreedyFeatureSelector : public FeatureSelector {
 class ThriftyFeatureSelector : public FeatureSelector {
  public:
   using FeatureSelector::FeatureSelector;
-  void Setup(const gbm::GBLinearModel &model,
-             const std::vector<GradientPair> &gpair,
-             DMatrix *p_fmat, float alpha, float lambda, int param) override {
+
+  void Setup(Context const *ctx, const gbm::GBLinearModel &model,
+             const std::vector<GradientPair> &gpair, DMatrix *p_fmat, float alpha, float lambda,
+             int param) override {
     top_k_ = static_cast<bst_uint>(param);
     if (param <= 0) top_k_ = std::numeric_limits<bst_uint>::max();
     const bst_uint ngroup = model.learner_model_param->num_output_group;
@@ -422,10 +414,10 @@ class ThriftyFeatureSelector : public FeatureSelector {
     }
     // Calculate univariate gradient sums
     std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
-    for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
+    for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx)) {
       auto page = batch.GetView();
       // column-parallel is usually fastaer than row-parallel
-      common::ParallelFor(nfeat, this->n_threads_, [&](auto i) {
+      common::ParallelFor(nfeat, ctx->Threads(), [&](auto i) {
         const auto col = page[i];
         const bst_uint ndata = col.size();
         for (bst_uint gid = 0u; gid < ngroup; ++gid) {
@@ -462,9 +454,8 @@ class ThriftyFeatureSelector : public FeatureSelector {
     }
   }
 
-  int NextFeature(int, const gbm::GBLinearModel &model,
-                  int group_idx, const std::vector<GradientPair> &,
-                  DMatrix *, float, float) override {
+  int NextFeature(Context const *, int, const gbm::GBLinearModel &model, int group_idx,
+                  const std::vector<GradientPair> &, DMatrix *, float, float) override {
     // k-th selected feature for a group
     auto k = counter_[group_idx]++;
     // stop after either reaching top-N or going through all the features in a group
@@ -482,18 +473,18 @@ class ThriftyFeatureSelector : public FeatureSelector {
   std::vector<std::pair<double, double>> gpair_sums_;
 };
 
-inline FeatureSelector *FeatureSelector::Create(int choice, int32_t n_threads) {
+inline FeatureSelector *FeatureSelector::Create(int choice) {
   switch (choice) {
     case kCyclic:
-      return new CyclicFeatureSelector(n_threads);
+      return new CyclicFeatureSelector;
     case kShuffle:
-      return new ShuffleFeatureSelector(n_threads);
+      return new ShuffleFeatureSelector;
     case kThrifty:
-      return new ThriftyFeatureSelector(n_threads);
+      return new ThriftyFeatureSelector;
     case kGreedy:
-      return new GreedyFeatureSelector(n_threads);
+      return new GreedyFeatureSelector;
     case kRandom:
-      return new RandomFeatureSelector(n_threads);
+      return new RandomFeatureSelector;
     default:
       LOG(FATAL) << "unknown coordinate selector: " << choice;
   }
diff --git a/src/linear/updater_coordinate.cc b/src/linear/updater_coordinate.cc
index 29ba5451b..84f15d706 100644
--- a/src/linear/updater_coordinate.cc
+++ b/src/linear/updater_coordinate.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2018 by Contributors
+/**
+ * Copyright 2018-2023 by XGBoost Contributors
  * \author Rory Mitchell
  */
 
@@ -30,7 +30,7 @@ class CoordinateUpdater : public LinearUpdater {
       tparam_.UpdateAllowUnknown(args)
     };
     cparam_.UpdateAllowUnknown(rest);
-    selector_.reset(FeatureSelector::Create(tparam_.feature_selector, ctx_->Threads()));
+    selector_.reset(FeatureSelector::Create(tparam_.feature_selector));
     monitor_.Init("CoordinateUpdater");
   }
 
@@ -56,19 +56,17 @@ class CoordinateUpdater : public LinearUpdater {
       auto dbias = static_cast<float>(tparam_.learning_rate *
                                       CoordinateDeltaBias(grad.first, grad.second));
       model->Bias()[group_idx] += dbias;
-      UpdateBiasResidualParallel(group_idx, ngroup, dbias, &in_gpair->HostVector(), p_fmat,
-                                 ctx_->Threads());
+      UpdateBiasResidualParallel(ctx_, group_idx, ngroup, dbias, &in_gpair->HostVector(), p_fmat);
     }
     // prepare for updating the weights
-    selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
-                    tparam_.reg_alpha_denorm,
-                    tparam_.reg_lambda_denorm, cparam_.top_k);
+    selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm,
+                     tparam_.reg_lambda_denorm, cparam_.top_k);
     // update weights
     for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
       for (unsigned i = 0U; i < model->learner_model_param->num_feature; i++) {
-        int fidx = selector_->NextFeature
-          (i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
-           tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
+        int fidx =
+            selector_->NextFeature(ctx_, i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
+                                   tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
         if (fidx < 0) break;
         this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), p_fmat, model);
       }
@@ -76,8 +74,8 @@ class CoordinateUpdater : public LinearUpdater {
     monitor_.Stop("UpdateFeature");
   }
 
-  inline void UpdateFeature(int fidx, int group_idx, std::vector<GradientPair> *in_gpair,
-                            DMatrix *p_fmat, gbm::GBLinearModel *model) {
+  void UpdateFeature(int fidx, int group_idx, std::vector<GradientPair> *in_gpair, DMatrix *p_fmat,
+                     gbm::GBLinearModel *model) {
     const int ngroup = model->learner_model_param->num_output_group;
     bst_float &w = (*model)[fidx][group_idx];
     auto gradient = GetGradientParallel(ctx_, group_idx, ngroup, fidx,
@@ -87,8 +85,7 @@ class CoordinateUpdater : public LinearUpdater {
         CoordinateDelta(gradient.first, gradient.second, w, tparam_.reg_alpha_denorm,
                         tparam_.reg_lambda_denorm));
     w += dw;
-    UpdateResidualParallel(fidx, group_idx, ngroup, dw, in_gpair, p_fmat,
-                           ctx_->Threads());
+    UpdateResidualParallel(ctx_, fidx, group_idx, ngroup, dw, in_gpair, p_fmat);
   }
 
  private:
diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu
index 2f8e3b992..709a7d277 100644
--- a/src/linear/updater_gpu_coordinate.cu
+++ b/src/linear/updater_gpu_coordinate.cu
@@ -32,7 +32,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
   void Configure(Args const& args) override {
     tparam_.UpdateAllowUnknown(args);
     coord_param_.UpdateAllowUnknown(args);
-    selector_.reset(FeatureSelector::Create(tparam_.feature_selector, ctx_->Threads()));
+    selector_.reset(FeatureSelector::Create(tparam_.feature_selector));
     monitor_.Init("GPUCoordinateUpdater");
   }
 
@@ -53,7 +53,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
     num_row_ = static_cast<size_t>(p_fmat->Info().num_row_);
 
     CHECK(p_fmat->SingleColBlock());
-    SparsePage const& batch = *(p_fmat->GetBatches<CSCPage>().begin());
+    SparsePage const &batch = *(p_fmat->GetBatches<CSCPage>(ctx_).begin());
     auto page = batch.GetView();
 
     if (IsEmpty()) {
@@ -125,16 +125,15 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
     this->UpdateBias(model);
     monitor_.Stop("UpdateBias");
     // prepare for updating the weights
-    selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
-                     tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm,
-                     coord_param_.top_k);
+    selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm,
+                     tparam_.reg_lambda_denorm, coord_param_.top_k);
     monitor_.Start("UpdateFeature");
     for (uint32_t group_idx = 0; group_idx < model->learner_model_param->num_output_group;
          ++group_idx) {
       for (auto i = 0U; i < model->learner_model_param->num_feature; i++) {
-        auto fidx = selector_->NextFeature(
-            i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
-            tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
+        auto fidx =
+            selector_->NextFeature(ctx_, i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
+                                   tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
         if (fidx < 0) break;
         this->UpdateFeature(fidx, group_idx, model);
       }
diff --git a/src/linear/updater_shotgun.cc b/src/linear/updater_shotgun.cc
index d8592f1cf..18b747f64 100644
--- a/src/linear/updater_shotgun.cc
+++ b/src/linear/updater_shotgun.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2018 by Contributors
+/**
+ * Copyright 2018-2023 by XGBoost Contributors
  * \author Tianqi Chen, Rory Mitchell
  */
 
@@ -21,7 +21,7 @@ class ShotgunUpdater : public LinearUpdater {
       LOG(FATAL) << "Unsupported feature selector for shotgun updater.\n"
                  << "Supported options are: {cyclic, shuffle}";
     }
-    selector_.reset(FeatureSelector::Create(param_.feature_selector, ctx_->Threads()));
+    selector_.reset(FeatureSelector::Create(param_.feature_selector));
   }
   void LoadConfig(Json const& in) override {
     auto const& config = get<Object const>(in);
@@ -45,18 +45,17 @@ class ShotgunUpdater : public LinearUpdater {
       auto dbias = static_cast<bst_float>(param_.learning_rate *
                                CoordinateDeltaBias(grad.first, grad.second));
       model->Bias()[gid] += dbias;
-      UpdateBiasResidualParallel(gid, ngroup, dbias, &in_gpair->HostVector(), p_fmat,
-                                 ctx_->Threads());
+      UpdateBiasResidualParallel(ctx_, gid, ngroup, dbias, &in_gpair->HostVector(), p_fmat);
     }
 
     // lock-free parallel updates of weights
-    selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
-                     param_.reg_alpha_denorm, param_.reg_lambda_denorm, 0);
-    for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
+    selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, param_.reg_alpha_denorm,
+                     param_.reg_lambda_denorm, 0);
+    for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx_)) {
       auto page = batch.GetView();
       const auto nfeat = static_cast<bst_omp_uint>(batch.Size());
       common::ParallelFor(nfeat, ctx_->Threads(), [&](auto i) {
-        int ii = selector_->NextFeature(i, *model, 0, in_gpair->ConstHostVector(), p_fmat,
+        int ii = selector_->NextFeature(ctx_, i, *model, 0, in_gpair->ConstHostVector(), p_fmat,
                                         param_.reg_alpha_denorm, param_.reg_lambda_denorm);
         if (ii < 0) return;
         const bst_uint fid = ii;
diff --git a/src/metric/auc.cc b/src/metric/auc.cc
index d8a32d201..63dc2b0a1 100644
--- a/src/metric/auc.cc
+++ b/src/metric/auc.cc
@@ -116,8 +116,7 @@ double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaI
 
   // we have 2 averages going in here, first is among workers, second is among
   // classes. allreduce sums up fp/tp auc for each class.
-  collective::Allreduce<collective::Operation::kSum>(results.Values().data(),
-                                                     results.Values().size());
+  collective::GlobalSum(info, &results.Values());
   double auc_sum{0};
   double tp_sum{0};
   for (size_t c = 0; c < n_classes; ++c) {
@@ -268,7 +267,9 @@ class EvalAUC : public MetricNoCache {
     }
     //  We use the global size to handle empty dataset.
     std::array<size_t, 2> meta{info.labels.Size(), preds.Size()};
-    collective::Allreduce<collective::Operation::kMax>(meta.data(), meta.size());
+    if (!info.IsVerticalFederated()) {
+      collective::Allreduce<collective::Operation::kMax>(meta.data(), meta.size());
+    }
     if (meta[0] == 0) {
       // Empty across all workers, which is not supported.
       auc = std::numeric_limits<double>::quiet_NaN();
@@ -289,15 +290,8 @@ class EvalAUC : public MetricNoCache {
         InvalidGroupAUC();
       }
 
-      std::array<double, 2> results{auc, static_cast<double>(valid_groups)};
-      collective::Allreduce<collective::Operation::kSum>(results.data(), results.size());
-      auc = results[0];
-      valid_groups = static_cast<uint32_t>(results[1]);
-
-      if (valid_groups <= 0) {
-        auc = std::numeric_limits<double>::quiet_NaN();
-      } else {
-        auc /= valid_groups;
+      auc = collective::GlobalRatio(info, auc, static_cast<double>(valid_groups));
+      if (!std::isnan(auc)) {
         CHECK_LE(auc, 1) << "Total AUC across groups: " << auc * valid_groups
                          << ", valid groups: " << valid_groups;
       }
@@ -317,17 +311,9 @@ class EvalAUC : public MetricNoCache {
         std::tie(fp, tp, auc) =
             static_cast<Curve *>(this)->EvalBinary(preds, info);
       }
-      double local_area = fp * tp;
-      std::array<double, 2> result{auc, local_area};
-      collective::Allreduce<collective::Operation::kSum>(result.data(), result.size());
-      std::tie(auc, local_area) = common::UnpackArr(std::move(result));
-      if (local_area <= 0) {
-        // the dataset across all workers have only positive or negative sample
-        auc = std::numeric_limits<double>::quiet_NaN();
-      } else {
-        CHECK_LE(auc, local_area);
-        // normalization
-        auc = auc / local_area;
+      auc = collective::GlobalRatio(info, auc, fp * tp);
+      if (!std::isnan(auc)) {
+        CHECK_LE(auc, 1.0);
       }
     }
     if (std::isnan(auc)) {
diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu
index fb85cca8a..9f50ac124 100644
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -8,6 +8,7 @@
  */
 #include <dmlc/registry.h>
 
+#include <array>
 #include <cmath>
 
 #include "../collective/communicator-inl.h"
@@ -213,10 +214,8 @@ class PseudoErrorLoss : public MetricNoCache {
           auto v = common::Sqr(slope) * (std::sqrt((1 + common::Sqr(a / slope))) - 1) * wt;
           return std::make_tuple(v, wt);
         });
-    double dat[2]{result.Residue(), result.Weights()};
-    if (collective::IsDistributed()) {
-      collective::Allreduce<collective::Operation::kSum>(dat, 2);
-    }
+    std::array<double, 2> dat{result.Residue(), result.Weights()};
+    collective::GlobalSum(info, &dat);
     return EvalRowMAPE::GetFinal(dat[0], dat[1]);
   }
 };
@@ -233,7 +232,7 @@ struct EvalError {
     }
   }
   const char *Name() const {
-    static std::string name;
+    static thread_local std::string name;
     if (has_param_) {
       std::ostringstream os;
       os << "error";
@@ -331,7 +330,7 @@ struct EvalTweedieNLogLik {
         << "tweedie variance power must be in interval [1, 2)";
   }
   const char *Name() const {
-    static std::string name;
+    static thread_local std::string name;
     std::ostringstream os;
     os << "tweedie-nloglik@" << rho_;
     name = os.str();
@@ -382,8 +381,8 @@ struct EvalEWiseBase : public MetricNoCache {
           return std::make_tuple(residue, wt);
         });
 
-    double dat[2]{result.Residue(), result.Weights()};
-    collective::Allreduce<collective::Operation::kSum>(dat, 2);
+    std::array<double, 2> dat{result.Residue(), result.Weights()};
+    collective::GlobalSum(info, &dat);
     return Policy::GetFinal(dat[0], dat[1]);
   }
 
@@ -454,8 +453,8 @@ class QuantileError : public MetricNoCache {
     CHECK(!alpha_.Empty());
     if (info.num_row_ == 0) {
       // empty DMatrix on distributed env
-      double dat[2]{0.0, 0.0};
-      collective::Allreduce<collective::Operation::kSum>(dat, 2);
+      std::array<double, 2> dat{0.0, 0.0};
+      collective::GlobalSum(info, &dat);
       CHECK_GT(dat[1], 0);
       return dat[0] / dat[1];
     }
@@ -492,8 +491,8 @@ class QuantileError : public MetricNoCache {
               loss(y_predt(sample_id, quantile_id, target_id), y_true(sample_id, target_id)) * w;
           return std::make_tuple(l, w);
         });
-    double dat[2]{result.Residue(), result.Weights()};
-    collective::Allreduce<collective::Operation::kSum>(dat, 2);
+    std::array<double, 2> dat{result.Residue(), result.Weights()};
+    collective::GlobalSum(info, &dat);
     CHECK_GT(dat[1], 0);
     return dat[0] / dat[1];
   }
diff --git a/src/metric/metric_common.h b/src/metric/metric_common.h
index 5fbd6f256..a6fad7158 100644
--- a/src/metric/metric_common.h
+++ b/src/metric/metric_common.h
@@ -9,6 +9,8 @@
 #include <memory>  // shared_ptr
 #include <string>
 
+#include "../collective/aggregator.h"
+#include "../collective/communicator-inl.h"
 #include "../common/common.h"
 #include "xgboost/metric.h"
 
@@ -20,7 +22,12 @@ class MetricNoCache : public Metric {
   virtual double Eval(HostDeviceVector<float> const &predts, MetaInfo const &info) = 0;
 
   double Evaluate(HostDeviceVector<float> const &predts, std::shared_ptr<DMatrix> p_fmat) final {
-    return this->Eval(predts, p_fmat->Info());
+    double result{0.0};  // filled in by the callback passed to ApplyWithLabels
+    auto const& info = p_fmat->Info();
+    collective::ApplyWithLabels(info, &result, sizeof(double), [&] {
+      result = this->Eval(predts, info);
+    });  // NOTE(review): presumably also propagates `result` to workers without labels - confirm
+    return result;
   }
 };
 
diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu
index c6cd80ae6..6c27f4100 100644
--- a/src/metric/multiclass_metric.cu
+++ b/src/metric/multiclass_metric.cu
@@ -6,6 +6,7 @@
  */
 #include <xgboost/metric.h>
 
+#include <array>
 #include <atomic>
 #include <cmath>
 
@@ -196,7 +197,7 @@ struct EvalMClassBase : public MetricNoCache {
     } else {
       CHECK(preds.Size() % info.labels.Size() == 0) << "label and prediction size not match";
     }
-    double dat[2] { 0.0, 0.0 };
+    std::array<double, 2> dat{0.0, 0.0};
     if (info.labels.Size() != 0) {
       const size_t nclass = preds.Size() / info.labels.Size();
       CHECK_GE(nclass, 1U)
@@ -208,7 +209,7 @@ struct EvalMClassBase : public MetricNoCache {
       dat[0] = result.Residue();
       dat[1] = result.Weights();
     }
-    collective::Allreduce<collective::Operation::kSum>(dat, 2);
+    collective::GlobalSum(info, &dat);
     return Derived::GetFinal(dat[0], dat[1]);
   }
   /*!
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index a84d0edb1..c4549458d 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -28,9 +28,8 @@
 #include <algorithm>                         // for stable_sort, copy, fill_n, min, max
 #include <array>                             // for array
 #include <cmath>                             // for log, sqrt
-#include <cstddef>                           // for size_t, std
-#include <cstdint>                           // for uint32_t
 #include <functional>                        // for less, greater
+#include <limits>                            // for numeric_limits
 #include <map>                               // for operator!=, _Rb_tree_const_iterator
 #include <memory>                            // for allocator, unique_ptr, shared_ptr, __shared_...
 #include <numeric>                           // for accumulate
@@ -39,15 +38,11 @@
 #include <utility>                           // for pair, make_pair
 #include <vector>                            // for vector
 
-#include "../collective/communicator-inl.h"  // for IsDistributed, Allreduce
-#include "../collective/communicator.h"      // for Operation
+#include "../collective/aggregator.h"        // for ApplyWithLabels
 #include "../common/algorithm.h"             // for ArgSort, Sort
 #include "../common/linalg_op.h"             // for cbegin, cend
 #include "../common/math.h"                  // for CmpFirst
 #include "../common/optional_weight.h"       // for OptionalWeights, MakeOptionalWeights
-#include "../common/ranking_utils.h"         // for LambdaRankParam, NDCGCache, ParseMetricName
-#include "../common/threading_utils.h"       // for ParallelFor
-#include "../common/transform_iterator.h"    // for IndexTransformIter
 #include "dmlc/common.h"                     // for OMPException
 #include "metric_common.h"                   // for MetricNoCache, GPUMetric, PackedReduceResult
 #include "xgboost/base.h"                    // for bst_float, bst_omp_uint, bst_group_t, Args
@@ -59,7 +54,6 @@
 #include "xgboost/linalg.h"                  // for Tensor, TensorView, Range, VectorView, MakeT...
 #include "xgboost/logging.h"                 // for CHECK, ConsoleLogger, LOG_INFO, CHECK_EQ
 #include "xgboost/metric.h"                  // for MetricReg, XGBOOST_REGISTER_METRIC, Metric
-#include "xgboost/span.h"                    // for Span, operator!=
 #include "xgboost/string_view.h"             // for StringView
 
 namespace {
@@ -244,14 +238,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig {
       exc.Rethrow();
     }
 
-    if (collective::IsDistributed()) {
-      double dat[2]{sum_metric, static_cast<double>(ngroups)};
-      // approximately estimate the metric using mean
-      collective::Allreduce<collective::Operation::kSum>(dat, 2);
-      return dat[0] / dat[1];
-    } else {
-      return sum_metric / ngroups;
-    }
+    return collective::GlobalRatio(info, sum_metric, static_cast<double>(ngroups));
   }
 
   const char* Name() const override {
@@ -385,15 +372,19 @@ class EvalRankWithCache : public Metric {
   }
 
   double Evaluate(HostDeviceVector<float> const& preds, std::shared_ptr<DMatrix> p_fmat) override {
+    double result{0.0};  // filled in by the callback passed to ApplyWithLabels
     auto const& info = p_fmat->Info();
-    auto p_cache = cache_.CacheItem(p_fmat, ctx_, info, param_);
-    if (p_cache->Param() != param_) {
-      p_cache = cache_.ResetItem(p_fmat, ctx_, info, param_);
-    }
-    CHECK(p_cache->Param() == param_);
-    CHECK_EQ(preds.Size(), info.labels.Size());
+    collective::ApplyWithLabels(info, &result, sizeof(double), [&] {
+      auto p_cache = cache_.CacheItem(p_fmat, ctx_, info, param_);
+      if (p_cache->Param() != param_) {  // cached entry was built with different parameters
+        p_cache = cache_.ResetItem(p_fmat, ctx_, info, param_);
+      }
+      CHECK(p_cache->Param() == param_);
+      CHECK_EQ(preds.Size(), info.labels.Size());
 
-    return this->Eval(preds, info, p_cache);
+      result = this->Eval(preds, info, p_cache);  // cache lookup and evaluation share one callback
+    });
+    return result;
   }
 
   virtual double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info,
@@ -401,9 +392,10 @@ class EvalRankWithCache : public Metric {
 };
 
 namespace {
-double Finalize(double score, double sw) {
+double Finalize(MetaInfo const& info, double score, double sw) {
   std::array<double, 2> dat{score, sw};
-  collective::Allreduce<collective::Operation::kSum>(dat.data(), dat.size());
+  collective::GlobalSum(info, &dat);
+  std::tie(score, sw) = std::tuple_cat(dat);
   if (sw > 0.0) {
     score = score / sw;
   }
@@ -430,7 +422,7 @@ class EvalNDCG : public EvalRankWithCache<ltr::NDCGCache> {
               std::shared_ptr<ltr::NDCGCache> p_cache) override {
     if (ctx_->IsCUDA()) {
       auto ndcg = cuda_impl::NDCGScore(ctx_, info, preds, minus_, p_cache);
-      return Finalize(ndcg.Residue(), ndcg.Weights());
+      return Finalize(info, ndcg.Residue(), ndcg.Weights());
     }
 
     // group local ndcg
@@ -476,7 +468,7 @@ class EvalNDCG : public EvalRankWithCache<ltr::NDCGCache> {
       sum_w = std::accumulate(weights.weights.cbegin(), weights.weights.cend(), 0.0);
     }
     auto ndcg = std::accumulate(linalg::cbegin(ndcg_gloc), linalg::cend(ndcg_gloc), 0.0);
-    return Finalize(ndcg, sum_w);
+    return Finalize(info, ndcg, sum_w);
   }
 };
 
@@ -489,7 +481,7 @@ class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
               std::shared_ptr<ltr::MAPCache> p_cache) override {
     if (ctx_->IsCUDA()) {
       auto map = cuda_impl::MAPScore(ctx_, info, predt, minus_, p_cache);
-      return Finalize(map.Residue(), map.Weights());
+      return Finalize(info, map.Residue(), map.Weights());
     }
 
     auto gptr = p_cache->DataGroupPtr(ctx_);
@@ -501,7 +493,6 @@ class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
     auto rank_idx = p_cache->SortedIdx(ctx_, predt.ConstHostSpan());
 
     common::ParallelFor(p_cache->Groups(), ctx_->Threads(), [&](auto g) {
-      auto g_predt = h_predt.Slice(linalg::Range(gptr[g], gptr[g + 1]));
       auto g_label = h_label.Slice(linalg::Range(gptr[g], gptr[g + 1]));
       auto g_rank = rank_idx.subspan(gptr[g]);
 
@@ -532,7 +523,7 @@ class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
       sw += weight[i];
     }
     auto sum = std::accumulate(map_gloc.cbegin(), map_gloc.cend(), 0.0);
-    return Finalize(sum, sw);
+    return Finalize(info, sum, sw);
   }
 };
 
diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu
index 793337b96..e4accc436 100644
--- a/src/metric/survival_metric.cu
+++ b/src/metric/survival_metric.cu
@@ -7,6 +7,7 @@
 
 #include <dmlc/registry.h>
 
+#include <array>
 #include <memory>
 #include <vector>
 
@@ -234,8 +235,8 @@ struct EvalEWiseSurvivalBase : public MetricNoCache {
     auto result = reducer_.Reduce(*ctx_, info.weights_, info.labels_lower_bound_,
                                   info.labels_upper_bound_, preds);
 
-    double dat[2]{result.Residue(), result.Weights()};
-    collective::Allreduce<collective::Operation::kSum>(dat, 2);
+    std::array<double, 2> dat{result.Residue(), result.Weights()};
+    collective::GlobalSum(info, &dat);
     return Policy::GetFinal(dat[0], dat[1]);
   }
 
diff --git a/src/objective/adaptive.cc b/src/objective/adaptive.cc
index bd8609d67..53676a4b8 100644
--- a/src/objective/adaptive.cc
+++ b/src/objective/adaptive.cc
@@ -99,44 +99,40 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
   auto h_predt = linalg::MakeTensorView(ctx, predt.ConstHostSpan(), info.num_row_,
                                         predt.Size() / info.num_row_);
 
-  if (!info.IsVerticalFederated() || collective::GetRank() == 0) {
-    // loop over each leaf
-    common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) {
-      auto nidx = h_node_idx[k];
-      CHECK(tree[nidx].IsLeaf());
-      CHECK_LT(k + 1, h_node_ptr.size());
-      size_t n = h_node_ptr[k + 1] - h_node_ptr[k];
-      auto h_row_set = common::Span<size_t const>{ridx}.subspan(h_node_ptr[k], n);
+  collective::ApplyWithLabels(
+      info, static_cast<void*>(quantiles.data()), quantiles.size() * sizeof(float), [&] {
+        // loop over each leaf
+        common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) {
+          auto nidx = h_node_idx[k];
+          CHECK(tree[nidx].IsLeaf());
+          CHECK_LT(k + 1, h_node_ptr.size());
+          size_t n = h_node_ptr[k + 1] - h_node_ptr[k];
+          auto h_row_set = common::Span<size_t const>{ridx}.subspan(h_node_ptr[k], n);
 
-      auto h_labels = info.labels.HostView().Slice(linalg::All(), IdxY(info, group_idx));
-      auto h_weights = linalg::MakeVec(&info.weights_);
+          auto h_labels = info.labels.HostView().Slice(linalg::All(), IdxY(info, group_idx));
+          auto h_weights = linalg::MakeVec(&info.weights_);
 
-      auto iter = common::MakeIndexTransformIter([&](size_t i) -> float {
-        auto row_idx = h_row_set[i];
-        return h_labels(row_idx) - h_predt(row_idx, group_idx);
+          auto iter = common::MakeIndexTransformIter([&](size_t i) -> float {
+            auto row_idx = h_row_set[i];
+            return h_labels(row_idx) - h_predt(row_idx, group_idx);
+          });
+          auto w_it = common::MakeIndexTransformIter([&](size_t i) -> float {
+            auto row_idx = h_row_set[i];
+            return h_weights(row_idx);
+          });
+
+          float q{0};
+          if (info.weights_.Empty()) {
+            q = common::Quantile(ctx, alpha, iter, iter + h_row_set.size());
+          } else {
+            q = common::WeightedQuantile(ctx, alpha, iter, iter + h_row_set.size(), w_it);
+          }
+          if (std::isnan(q)) {
+            CHECK(h_row_set.empty());
+          }
+          quantiles.at(k) = q;
+        });
       });
-      auto w_it = common::MakeIndexTransformIter([&](size_t i) -> float {
-        auto row_idx = h_row_set[i];
-        return h_weights(row_idx);
-      });
-
-      float q{0};
-      if (info.weights_.Empty()) {
-        q = common::Quantile(ctx, alpha, iter, iter + h_row_set.size());
-      } else {
-        q = common::WeightedQuantile(ctx, alpha, iter, iter + h_row_set.size(), w_it);
-      }
-      if (std::isnan(q)) {
-        CHECK(h_row_set.empty());
-      }
-      quantiles.at(k) = q;
-    });
-  }
-
-  if (info.IsVerticalFederated()) {
-    collective::Broadcast(static_cast<void*>(quantiles.data()), quantiles.size() * sizeof(float),
-                          0);
-  }
 
   UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree);
 }
diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h
index 7494bceb1..ffd3ddec7 100644
--- a/src/objective/adaptive.h
+++ b/src/objective/adaptive.h
@@ -6,8 +6,9 @@
 #include <algorithm>
 #include <cstdint>  // std::int32_t
 #include <limits>
-#include <vector>  // std::vector
+#include <vector>   // std::vector
 
+#include "../collective/aggregator.h"
 #include "../collective/communicator-inl.h"
 #include "../common/common.h"
 #include "xgboost/base.h"                // bst_node_t
@@ -41,10 +42,7 @@ inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_no
   auto& quantiles = *p_quantiles;
   auto const& h_node_idx = nidx;
 
-  size_t n_leaf{h_node_idx.size()};
-  if (info.IsRowSplit()) {
-    collective::Allreduce<collective::Operation::kMax>(&n_leaf, 1);
-  }
+  size_t n_leaf = collective::GlobalMax(info, h_node_idx.size());
   CHECK(quantiles.empty() || quantiles.size() == n_leaf);
   if (quantiles.empty()) {
     quantiles.resize(n_leaf, std::numeric_limits<float>::quiet_NaN());
@@ -54,16 +52,12 @@ inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_no
   std::vector<int32_t> n_valids(quantiles.size());
   std::transform(quantiles.cbegin(), quantiles.cend(), n_valids.begin(),
                  [](float q) { return static_cast<int32_t>(!std::isnan(q)); });
-  if (info.IsRowSplit()) {
-    collective::Allreduce<collective::Operation::kSum>(n_valids.data(), n_valids.size());
-  }
+  collective::GlobalSum(info, &n_valids);
   // convert to 0 for all reduce
   std::replace_if(
       quantiles.begin(), quantiles.end(), [](float q) { return std::isnan(q); }, 0.f);
   // use the mean value
-  if (info.IsRowSplit()) {
-    collective::Allreduce<collective::Operation::kSum>(quantiles.data(), quantiles.size());
-  }
+  collective::GlobalSum(info, &quantiles);
   for (size_t i = 0; i < n_leaf; ++i) {
     if (n_valids[i] > 0) {
       quantiles[i] /= static_cast<float>(n_valids[i]);
diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc
new file mode 100644
index 000000000..d0ff5bda5
--- /dev/null
+++ b/src/objective/lambdarank_obj.cc
@@ -0,0 +1,633 @@
+/**
+ * Copyright (c) 2023, XGBoost contributors
+ */
+#include "lambdarank_obj.h"
+
+#include <dmlc/registry.h>                 // for DMLC_REGISTRY_FILE_TAG
+
+#include <algorithm>                       // for transform, copy, fill_n, min, max
+#include <cmath>                           // for pow, log2
+#include <cstddef>                         // for size_t
+#include <cstdint>                         // for int32_t
+#include <map>                             // for operator!=
+#include <memory>                          // for shared_ptr, __shared_ptr_access, allocator
+#include <ostream>                         // for operator<<, basic_ostream
+#include <string>                          // for char_traits, operator<, basic_string, string
+#include <tuple>                           // for apply, make_tuple
+#include <type_traits>                     // for is_floating_point
+#include <utility>                         // for pair, swap
+#include <vector>                          // for vector
+
+#include "../common/error_msg.h"           // for GroupWeight, LabelScoreSize
+#include "../common/linalg_op.h"           // for begin, cbegin, cend
+#include "../common/optional_weight.h"     // for MakeOptionalWeights, OptionalWeights
+#include "../common/ranking_utils.h"       // for RankingCache, LambdaRankParam, MAPCache, NDCGC...
+#include "../common/threading_utils.h"     // for ParallelFor, Sched
+#include "../common/transform_iterator.h"  // for IndexTransformIter
+#include "init_estimation.h"               // for FitIntercept
+#include "xgboost/base.h"                  // for bst_group_t, GradientPair, kRtEps, GradientPai...
+#include "xgboost/context.h"               // for Context
+#include "xgboost/data.h"                  // for MetaInfo
+#include "xgboost/host_device_vector.h"    // for HostDeviceVector
+#include "xgboost/json.h"                  // for Json, get, Value, ToJson, F32Array, FromJson, IsA
+#include "xgboost/linalg.h"                // for Vector, Range, TensorView, VectorView, All
+#include "xgboost/logging.h"               // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_LE, CHE...
+#include "xgboost/objective.h"             // for ObjFunctionReg, XGBOOST_REGISTER_OBJECTIVE
+#include "xgboost/span.h"                  // for Span, operator!=
+#include "xgboost/string_view.h"           // for operator<<, StringView
+#include "xgboost/task.h"                  // for ObjInfo
+
+namespace xgboost::obj {
+namespace cpu_impl {
+void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double const> li_full,
+                                  linalg::VectorView<double const> lj_full,
+                                  linalg::Vector<double>* p_ti_plus,
+                                  linalg::Vector<double>* p_tj_minus, linalg::Vector<double>* p_li,
+                                  linalg::Vector<double>* p_lj,
+                                  std::shared_ptr<ltr::RankingCache> p_cache) {
+  auto ti_plus = p_ti_plus->HostView();
+  auto tj_minus = p_tj_minus->HostView();
+  auto li = p_li->HostView();
+  auto lj = p_lj->HostView();
+  // Cache-provided group layout and the exponent applied to the normalized losses below.
+  auto gptr = p_cache->DataGroupPtr(ctx);
+  auto n_groups = p_cache->Groups();
+  auto regularizer = p_cache->Param().Regularizer();
+
+  // Aggregate over query groups: add each group's leading entries to the per-position sums.
+  for (bst_group_t g{0}; g < n_groups; ++g) {
+    auto begin = gptr[g];
+    auto end = gptr[g + 1];
+    std::size_t group_size = end - begin;
+    auto n = std::min(group_size, p_cache->MaxPositionSize());
+
+    auto g_li = li_full.Slice(linalg::Range(begin, end));
+    auto g_lj = lj_full.Slice(linalg::Range(begin, end));
+
+    for (std::size_t i{0}; i < n; ++i) {
+      li(i) += g_li(i);
+      lj(i) += g_lj(i);
+    }
+  }
+  // Refresh the ratios; one is left untouched while its position-0 sum is below Eps64().
+  // The ti+ is not guaranteed to decrease since it depends on the |\delta Z|
+  //
+  // The update normalizes the ti+ to make ti+(0) equal to 1, which breaks the probability
+  // meaning. The reasoning behind the normalization is not clear, here we are just
+  // following the authors.
+  for (std::size_t i = 0; i < ti_plus.Size(); ++i) {
+    if (li(0) >= Eps64()) {
+      ti_plus(i) = std::pow(li(i) / li(0), regularizer);  // eq.30
+    }
+    if (lj(0) >= Eps64()) {
+      tj_minus(i) = std::pow(lj(i) / lj(0), regularizer);  // eq.31
+    }
+    assert(!std::isinf(ti_plus(i)));   // debug-only sanity check (compiled out under NDEBUG)
+    assert(!std::isinf(tj_minus(i)));  // debug-only sanity check (compiled out under NDEBUG)
+  }
+}
+}  // namespace cpu_impl
+
+/**
+ * \brief Base class for pair-wise learning to rank.
+ *
+ *   See `From RankNet to LambdaRank to LambdaMART: An Overview` for a description of the
+ *   algorithm.
+ *
+ *   In addition to ranking, this also implements `Unbiased LambdaMART: An Unbiased
+ *   Pairwise Learning-to-Rank Algorithm`.
+ */
+template <typename Loss, typename Cache>
+class LambdaRankObj : public FitIntercept {
+  MetaInfo const* p_info_{nullptr};  // MetaInfo the cache was built from; compared by address
+
+  // Update the position bias for unbiased click data, then clear the loss accumulators.
+  void UpdatePositionBias() {
+    li_full_.SetDevice(ctx_->gpu_id);
+    lj_full_.SetDevice(ctx_->gpu_id);
+    li_.SetDevice(ctx_->gpu_id);
+    lj_.SetDevice(ctx_->gpu_id);
+
+    if (ctx_->IsCPU()) {
+      cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->gpu_id),
+                                             lj_full_.View(ctx_->gpu_id), &ti_plus_, &tj_minus_,
+                                             &li_, &lj_, p_cache_);
+    } else {
+      cuda_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->gpu_id),
+                                              lj_full_.View(ctx_->gpu_id), &ti_plus_, &tj_minus_,
+                                              &li_, &lj_, p_cache_);
+    }
+    // Zero all four loss buffers once the ratios have been refreshed.
+    li_full_.Data()->Fill(0.0);
+    lj_full_.Data()->Fill(0.0);
+
+    li_.Data()->Fill(0.0);
+    lj_.Data()->Fill(0.0);
+  }
+
+ protected:
+  // per-position sum of cost / tj- (eq. 30)
+  linalg::Vector<double> li_;
+  // per-position sum of cost / ti+ (eq. 31)
+  linalg::Vector<double> lj_;
+  // position bias ratio for relevant doc, ti+ (eq. 30)
+  linalg::Vector<double> ti_plus_;
+  // position bias ratio for irrelevant doc, tj- (eq. 31)
+  linalg::Vector<double> tj_minus_;
+  // li buffer for all samples, sized info.num_row_
+  linalg::Vector<double> li_full_;
+  // lj buffer for all samples, sized info.num_row_
+  linalg::Vector<double> lj_full_;
+
+  ltr::LambdaRankParam param_;
+  // cache
+  std::shared_ptr<ltr::RankingCache> p_cache_;
+  // Downcast the shared ranking cache to the concrete Cache type; CHECK-fails on null.
+  [[nodiscard]] std::shared_ptr<Cache> GetCache() const {
+    auto ptr = std::static_pointer_cast<Cache>(p_cache_);
+    CHECK(ptr);
+    return ptr;
+  }
+
+  // View into li/lj for group g: its row range when unbiased, otherwise the whole vector.
+  linalg::VectorView<double> GroupLoss(bst_group_t g, linalg::Vector<double>* v) const {
+    auto gptr = p_cache_->DataGroupPtr(ctx_);
+    auto begin = gptr[g];
+    auto end = gptr[g + 1];
+    if (param_.lambdarank_unbiased) {
+      return v->HostView().Slice(linalg::Range(begin, end));
+    }
+    return v->HostView();
+  }
+
+  // Calculate lambda gradient for each group on CPU.
+  template <bool unbiased, typename Delta>
+  void CalcLambdaForGroup(std::int32_t iter, common::Span<float const> g_predt,
+                          linalg::VectorView<float const> g_label, float w,
+                          common::Span<std::size_t const> g_rank, bst_group_t g, Delta delta,
+                          common::Span<GradientPair> g_gpair) {
+    std::fill_n(g_gpair.data(), g_gpair.size(), GradientPair{});  // reset before accumulating
+    auto p_gpair = g_gpair.data();
+
+    auto ti_plus = ti_plus_.HostView();
+    auto tj_minus = tj_minus_.HostView();
+
+    auto li = GroupLoss(g, &li_full_);
+    auto lj = GroupLoss(g, &lj_full_);
+
+    // Normalization, first used by LightGBM.
+    // https://github.com/microsoft/LightGBM/pull/2331#issuecomment-523259298
+    double sum_lambda{0.0};
+
+    auto delta_op = [&](auto const&... args) { return delta(args..., g); };
+    // Callback handed to MakePairs below; presumably invoked once per sampled (i, j) pair.
+    auto loop = [&](std::size_t i, std::size_t j) {
+      // higher/lower on the target ranked list
+      std::size_t rank_high = i, rank_low = j;
+      if (g_label(g_rank[rank_high]) == g_label(g_rank[rank_low])) {
+        return;
+      }
+      if (g_label(g_rank[rank_high]) < g_label(g_rank[rank_low])) {
+        std::swap(rank_high, rank_low);
+      }
+
+      double cost;  // passed to LambdaGrad by pointer
+      auto pg = LambdaGrad<unbiased>(g_label, g_predt, g_rank, rank_high, rank_low, delta_op,
+                                     ti_plus, tj_minus, &cost);
+      auto ng = Repulse(pg);
+
+      std::size_t idx_high = g_rank[rank_high];
+      std::size_t idx_low = g_rank[rank_low];
+      p_gpair[idx_high] += pg;
+      p_gpair[idx_low] += ng;
+
+      if (unbiased) {
+        auto k = ti_plus.Size();
+        // We can probably use all the positions. If we skip the update due to having
+        // high/low > k, we might be losing out too many pairs. On the other hand, if we
+        // cap the position, then we might be accumulating too many tail bias into the
+        // last tracked position.
+        // We use `idx_high` since it represents the original position from the label
+        // list, and label list is assumed to be sorted.
+        if (idx_high < k && idx_low < k) {
+          if (tj_minus(idx_low) >= Eps64()) {
+            li(idx_high) += cost / tj_minus(idx_low);  // eq.30
+          }
+          if (ti_plus(idx_high) >= Eps64()) {
+            lj(idx_low) += cost / ti_plus(idx_high);  // eq.31
+          }
+        }
+      }
+
+      sum_lambda += -2.0 * static_cast<double>(pg.GetGrad());
+    };
+    // Generate the pairs, then rescale the group's gradients by log2(1 + sum) / sum.
+    MakePairs(ctx_, iter, p_cache_, g, g_label, g_rank, loop);
+    if (sum_lambda > 0.0) {
+      double norm = std::log2(1.0 + sum_lambda) / sum_lambda;
+      std::transform(g_gpair.data(), g_gpair.data() + g_gpair.size(), g_gpair.data(),
+                     [norm](GradientPair const& g) { return g * norm; });
+    }
+    // Finally apply the group weight and the cache-provided weight normalization.
+    auto w_norm = p_cache_->WeightNorm();
+    std::transform(g_gpair.begin(), g_gpair.end(), g_gpair.begin(),
+                   [&](GradientPair const& gpair) { return gpair * w * w_norm; });
+  }
+
+ public:
+  void Configure(Args const& args) override { param_.UpdateAllowUnknown(args); }
+  void SaveConfig(Json* p_out) const override {
+    auto& out = *p_out;
+    out["name"] = String(Loss::Name());
+    out["lambdarank_param"] = ToJson(param_);
+    // Helper: resize the F32Array held by `out` and copy the bias values into it.
+    auto save_bias = [](linalg::Vector<double> const& in, Json out) {
+      auto& out_array = get<F32Array>(out);
+      out_array.resize(in.Size());
+      auto h_in = in.HostView();
+      std::copy(linalg::cbegin(h_in), linalg::cend(h_in), out_array.begin());
+    };
+    // The position-bias ratios are only saved for the unbiased variant.
+    if (param_.lambdarank_unbiased) {
+      out["ti+"] = F32Array();
+      save_bias(ti_plus_, out["ti+"]);
+      out["tj-"] = F32Array();
+      save_bias(tj_minus_, out["tj-"]);
+    }
+  }
+  void LoadConfig(Json const& in) override {
+    auto const& obj = get<Object const>(in);
+    if (obj.find("lambdarank_param") != obj.cend()) {
+      FromJson(in["lambdarank_param"], &param_);
+    }
+    // Restore the bias ratios when the (possibly just loaded) parameter enables unbiased mode.
+    if (param_.lambdarank_unbiased) {
+      auto load_bias = [](Json in, linalg::Vector<double>* out) {
+        if (IsA<F32Array>(in)) {
+          // JSON
+          auto const& array = get<F32Array>(in);
+          out->Reshape(array.size());
+          auto h_out = out->HostView();
+          std::copy(array.cbegin(), array.cend(), linalg::begin(h_out));
+        } else {
+          // UBJSON
+          auto const& array = get<Array>(in);
+          out->Reshape(array.size());
+          auto h_out = out->HostView();
+          std::transform(array.cbegin(), array.cend(), linalg::begin(h_out),
+                         [](Json const& v) { return get<Number const>(v); });
+        }
+      };
+      load_bias(in["ti+"], &ti_plus_);
+      load_bias(in["tj-"], &tj_minus_);
+    }
+  }
+
+  [[nodiscard]] ObjInfo Task() const override { return ObjInfo{ObjInfo::kRanking}; }
+
+  [[nodiscard]] bst_target_t Targets(MetaInfo const& info) const override {
+    CHECK_LE(info.labels.Shape(1), 1) << "multi-output for LTR is not yet supported.";
+    return 1;
+  }
+  // The result points into a thread_local string that the next call on this thread overwrites.
+  [[nodiscard]] const char* RankEvalMetric(StringView metric) const {
+    static thread_local std::string name;
+    if (param_.HasTruncation()) {
+      name = ltr::MakeMetricName(metric, param_.NumPair(), false);
+    } else {
+      name = ltr::MakeMetricName(metric, param_.NotSet(), false);
+    }
+    return name.c_str();
+  }
+  // Entry point: (re)build the cache, lazily create the bias buffers, then defer to Loss.
+  void GetGradient(HostDeviceVector<float> const& predt, MetaInfo const& info, std::int32_t iter,
+                   HostDeviceVector<GradientPair>* out_gpair) override {
+    CHECK_EQ(info.labels.Size(), predt.Size()) << error::LabelScoreSize();
+
+    // init/renew cache when it is missing, `info` is a different object, or the parameter changed
+    if (!p_cache_ || p_info_ != &info || p_cache_->Param() != param_) {
+      p_cache_ = std::make_shared<Cache>(ctx_, info, param_);
+      p_info_ = &info;
+    }
+    auto n_groups = p_cache_->Groups();
+    if (!info.weights_.Empty()) {
+      CHECK_EQ(info.weights_.Size(), n_groups) << error::GroupWeight();
+    }
+    // Unbiased mode with no ratios yet: allocate the ratio (ones) and loss (zeros) buffers.
+    if (ti_plus_.Size() == 0 && param_.lambdarank_unbiased) {
+      CHECK_EQ(iter, 0);
+      ti_plus_ = linalg::Constant<double>(ctx_, 1.0, p_cache_->MaxPositionSize());
+      tj_minus_ = linalg::Constant<double>(ctx_, 1.0, p_cache_->MaxPositionSize());
+
+      li_ = linalg::Zeros<double>(ctx_, p_cache_->MaxPositionSize());
+      lj_ = linalg::Zeros<double>(ctx_, p_cache_->MaxPositionSize());
+
+      li_full_ = linalg::Zeros<double>(ctx_, info.num_row_);
+      lj_full_ = linalg::Zeros<double>(ctx_, info.num_row_);
+    }
+    static_cast<Loss*>(this)->GetGradientImpl(iter, predt, info, out_gpair);  // CRTP dispatch
+
+    if (param_.lambdarank_unbiased) {
+      this->UpdatePositionBias();
+    }
+  }
+};
+
+class LambdaRankNDCG : public LambdaRankObj<LambdaRankNDCG, ltr::NDCGCache> {
+ public:
+  template <bool unbiased, bool exp_gain>
+  void CalcLambdaForGroupNDCG(std::int32_t iter, common::Span<float const> g_predt,
+                              linalg::VectorView<float const> g_label, float w,
+                              common::Span<std::size_t const> g_rank,
+                              common::Span<GradientPair> g_gpair,
+                              linalg::VectorView<double const> inv_IDCG,
+                              common::Span<double const> discount, bst_group_t g) {
+    auto delta = [&](auto y_high, auto y_low, std::size_t rank_high, std::size_t rank_low,
+                     bst_group_t g) {
+      static_assert(std::is_floating_point<decltype(y_high)>::value);
+      return DeltaNDCG<exp_gain>(y_high, y_low, rank_high, rank_low, inv_IDCG(g), discount);
+    };
+    this->CalcLambdaForGroup<unbiased>(iter, g_predt, g_label, w, g_rank, g, delta, g_gpair);
+  }
+  // Compute gradients for all groups; the CUDA path is delegated to cuda_impl.
+  void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
+                       const MetaInfo& info, HostDeviceVector<GradientPair>* out_gpair) {
+    if (ctx_->IsCUDA()) {
+      cuda_impl::LambdaRankGetGradientNDCG(
+          ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id),
+          tj_minus_.View(ctx_->gpu_id), li_full_.View(ctx_->gpu_id), lj_full_.View(ctx_->gpu_id),
+          out_gpair);
+      return;
+    }
+    // CPU path: one ParallelFor iteration per query group.
+    bst_group_t n_groups = p_cache_->Groups();
+    auto gptr = p_cache_->DataGroupPtr(ctx_);
+
+    out_gpair->Resize(info.num_row_);
+    auto h_gpair = out_gpair->HostSpan();
+    auto h_predt = predt.ConstHostSpan();
+    auto h_label = info.labels.HostView();
+    auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);
+    auto make_range = [&](bst_group_t g) { return linalg::Range(gptr[g], gptr[g + 1]); };
+
+    auto dct = GetCache()->Discount(ctx_);
+    auto rank_idx = p_cache_->SortedIdx(ctx_, h_predt);
+    auto inv_IDCG = GetCache()->InvIDCG(ctx_);
+
+    common::ParallelFor(n_groups, ctx_->Threads(), common::Sched::Guided(), [&](auto g) {
+      std::size_t cnt = gptr[g + 1] - gptr[g];
+      auto w = h_weight[g];
+      auto g_predt = h_predt.subspan(gptr[g], cnt);
+      auto g_gpair = h_gpair.subspan(gptr[g], cnt);
+      auto g_label = h_label.Slice(make_range(g), 0);
+      auto g_rank = rank_idx.subspan(gptr[g], cnt);
+
+      auto args =
+          std::make_tuple(this, iter, g_predt, g_label, w, g_rank, g_gpair, inv_IDCG, dct, g);
+      // Map the two runtime flags onto the matching template instantiation.
+      if (param_.lambdarank_unbiased) {
+        if (param_.ndcg_exp_gain) {
+          std::apply(&LambdaRankNDCG::CalcLambdaForGroupNDCG<true, true>, args);
+        } else {
+          std::apply(&LambdaRankNDCG::CalcLambdaForGroupNDCG<true, false>, args);
+        }
+      } else {
+        if (param_.ndcg_exp_gain) {
+          std::apply(&LambdaRankNDCG::CalcLambdaForGroupNDCG<false, true>, args);
+        } else {
+          std::apply(&LambdaRankNDCG::CalcLambdaForGroupNDCG<false, false>, args);
+        }
+      }
+    });
+  }
+  // Objective name, and the default metric derived from the ranking parameters.
+  static char const* Name() { return "rank:ndcg"; }
+  [[nodiscard]] const char* DefaultEvalMetric() const override {
+    return this->RankEvalMetric("ndcg");
+  }
+  [[nodiscard]] Json DefaultMetricConfig() const override {
+    Json config{Object{}};
+    config["name"] = String{DefaultEvalMetric()};
+    config["lambdarank_param"] = ToJson(param_);
+    return config;
+  }
+};
+
+namespace cuda_impl {
+#if !defined(XGBOOST_USE_CUDA)
+void LambdaRankGetGradientNDCG(Context const*, std::int32_t, HostDeviceVector<float> const&,
+                               const MetaInfo&, std::shared_ptr<ltr::NDCGCache>,
+                               linalg::VectorView<double const>,  // input bias ratio
+                               linalg::VectorView<double const>,  // input bias ratio
+                               linalg::VectorView<double>, linalg::VectorView<double>,
+                               HostDeviceVector<GradientPair>*) {
+  common::AssertGPUSupport();  // stub: only compiled when XGBOOST_USE_CUDA is undefined
+}
+// Stub for builds without XGBOOST_USE_CUDA; every argument is ignored.
+void LambdaRankUpdatePositionBias(Context const*, linalg::VectorView<double const>,
+                                  linalg::VectorView<double const>, linalg::Vector<double>*,
+                                  linalg::Vector<double>*, linalg::Vector<double>*,
+                                  linalg::Vector<double>*, std::shared_ptr<ltr::RankingCache>) {
+  common::AssertGPUSupport();
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
+}  // namespace cuda_impl
+
+namespace cpu_impl {
+void MAPStat(Context const* ctx, linalg::VectorView<float const> label,
+             common::Span<std::size_t const> rank_idx, std::shared_ptr<ltr::MAPCache> p_cache) {
+  // For every query group, fill the cache with two running sums taken along the rank list.
+  auto h_n_rel = p_cache->NumRelevant(ctx);
+  auto gptr = p_cache->DataGroupPtr(ctx);
+  CHECK_EQ(h_n_rel.size(), gptr.back());
+  CHECK_EQ(h_n_rel.size(), label.Size());
+
+  auto h_acc = p_cache->Acc(ctx);
+
+  common::ParallelFor(p_cache->Groups(), ctx->Threads(), [&](auto g) {
+    auto cnt = gptr[g + 1] - gptr[g];
+    if (cnt == 0) { return; }  // empty group: element 0 below does not exist
+    auto g_n_rel = h_n_rel.subspan(gptr[g], cnt);
+    auto g_rank = rank_idx.subspan(gptr[g], cnt);
+    auto g_label = label.Slice(linalg::Range(gptr[g], gptr[g + 1]));
+
+    // Running count of relevant documents: n_rel[k] = \sum_{i<=k} l_i
+    g_n_rel[0] = g_label(g_rank[0]);
+    for (std::size_t k = 1; k < g_rank.size(); ++k) {
+      g_n_rel[k] = g_n_rel[k - 1] + g_label(g_rank[k]);
+    }
+
+    // Running sum of label over rank: acc[k] = \sum_{i<=k} l_i / (i + 1), i is 0-based
+    auto g_acc = h_acc.subspan(gptr[g], cnt);
+    g_acc[0] = g_label(g_rank[0]) / 1.0;
+    for (std::size_t k = 1; k < g_rank.size(); ++k) {
+      g_acc[k] = g_acc[k - 1] + (g_label(g_rank[k]) / static_cast<double>(k + 1));
+    }
+  });
+}
+}  // namespace cpu_impl
+
+class LambdaRankMAP : public LambdaRankObj<LambdaRankMAP, ltr::MAPCache> {
+ public:
+  void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
+                       const MetaInfo& info, HostDeviceVector<GradientPair>* out_gpair) {
+    CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the MAP objective.";
+    if (ctx_->IsCUDA()) {  // GPU: everything is delegated to the CUDA implementation
+      return cuda_impl::LambdaRankGetGradientMAP(
+          ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id),
+          tj_minus_.View(ctx_->gpu_id), li_full_.View(ctx_->gpu_id), lj_full_.View(ctx_->gpu_id),
+          out_gpair);
+    }
+    // CPU implementation: gradients are computed group by group in parallel.
+    auto gptr = p_cache_->DataGroupPtr(ctx_).data();  // raw pointer to the group offsets
+    bst_group_t n_groups = p_cache_->Groups();
+
+    out_gpair->Resize(info.num_row_);
+    auto h_gpair = out_gpair->HostSpan();
+    auto h_label = info.labels.HostView().Slice(linalg::All(), 0);
+    auto h_predt = predt.ConstHostSpan();
+    auto rank_idx = p_cache_->SortedIdx(ctx_, h_predt);
+    auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);
+
+    auto make_range = [&](bst_group_t g) { return linalg::Range(gptr[g], gptr[g + 1]); };
+    // Recompute the per-group MAP statistics (n_rel, acc) for the current rank list.
+    cpu_impl::MAPStat(ctx_, h_label, rank_idx, GetCache());
+    auto n_rel = GetCache()->NumRelevant(ctx_);
+    auto acc = GetCache()->Acc(ctx_);
+    // Delta of a pair in group g; the pair is reordered so that rank_high <= rank_low.
+    auto delta_map = [&](auto y_high, auto y_low, std::size_t rank_high, std::size_t rank_low,
+                         bst_group_t g) {
+      if (rank_high > rank_low) {
+        std::swap(rank_high, rank_low);
+        std::swap(y_high, y_low);
+      }
+      auto cnt = gptr[g + 1] - gptr[g];
+      // In a hot loop, so the group spans are built directly from the data pointers.
+      auto g_n_rel = common::Span<double const>{n_rel.data() + gptr[g], cnt};
+      auto g_acc = common::Span<double const>{acc.data() + gptr[g], cnt};
+      auto d = DeltaMAP(y_high, y_low, rank_high, rank_low, g_n_rel, g_acc);
+      return d;
+    };
+    using D = decltype(delta_map);
+
+    common::ParallelFor(n_groups, ctx_->Threads(), [&](auto g) {
+      auto cnt = gptr[g + 1] - gptr[g];
+      auto w = h_weight[g];  // one weight per group
+      auto g_predt = h_predt.subspan(gptr[g], cnt);
+      auto g_gpair = h_gpair.subspan(gptr[g], cnt);
+      auto g_label = h_label.Slice(make_range(g));
+      auto g_rank = rank_idx.subspan(gptr[g], cnt);
+
+      auto args = std::make_tuple(this, iter, g_predt, g_label, w, g_rank, g, delta_map, g_gpair);
+      // The unbiased flag is a template argument, so it is dispatched at run time here.
+      if (param_.lambdarank_unbiased) {
+        std::apply(&LambdaRankMAP::CalcLambdaForGroup<true, D>, args);
+      } else {
+        std::apply(&LambdaRankMAP::CalcLambdaForGroup<false, D>, args);
+      }
+    });
+  }
+  static char const* Name() { return "rank:map"; }  // name the objective is registered under
+  [[nodiscard]] const char* DefaultEvalMetric() const override {
+    return this->RankEvalMetric("map");  // metric name is produced by RankEvalMetric
+  }
+};
+
+#if !defined(XGBOOST_USE_CUDA)
+namespace cuda_impl {
+void MAPStat(Context const*, MetaInfo const&, common::Span<std::size_t const>,
+             std::shared_ptr<ltr::MAPCache>) {
+  common::AssertGPUSupport();  // stub compiled only when XGBOOST_USE_CUDA is not defined
+}
+
+void LambdaRankGetGradientMAP(Context const*, std::int32_t, HostDeviceVector<float> const&,
+                              const MetaInfo&, std::shared_ptr<ltr::MAPCache>,
+                              linalg::VectorView<double const>,  // ti_plus: input bias ratio
+                              linalg::VectorView<double const>,  // tj_minus: input bias ratio
+                              linalg::VectorView<double>, linalg::VectorView<double>,
+                              HostDeviceVector<GradientPair>*) {
+  common::AssertGPUSupport();  // stub compiled only when XGBOOST_USE_CUDA is not defined
+}
+}  // namespace cuda_impl
+#endif  // !defined(XGBOOST_USE_CUDA)
+
+/**
+ * \brief The RankNet loss: pairwise LambdaRank in which every pair has a delta of 1.
+ */
+class LambdaRankPairwise : public LambdaRankObj<LambdaRankPairwise, ltr::RankingCache> {
+ public:
+  void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
+                       const MetaInfo& info, HostDeviceVector<GradientPair>* out_gpair) {
+    CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the pairwise objective.";
+    if (ctx_->IsCUDA()) {  // GPU: everything is delegated to the CUDA implementation
+      return cuda_impl::LambdaRankGetGradientPairwise(
+          ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id),
+          tj_minus_.View(ctx_->gpu_id), li_full_.View(ctx_->gpu_id), lj_full_.View(ctx_->gpu_id),
+          out_gpair);
+    }
+    // CPU implementation: gradients are computed group by group in parallel.
+    auto gptr = p_cache_->DataGroupPtr(ctx_);
+    bst_group_t n_groups = p_cache_->Groups();
+
+    out_gpair->Resize(info.num_row_);
+    auto h_gpair = out_gpair->HostSpan();
+    auto h_label = info.labels.HostView().Slice(linalg::All(), 0);
+    auto h_predt = predt.ConstHostSpan();
+    auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);
+
+    auto make_range = [&](bst_group_t g) { return linalg::Range(gptr[g], gptr[g + 1]); };
+    auto rank_idx = p_cache_->SortedIdx(ctx_, h_predt);
+    // The delta ignores all of its arguments and is the same for every pair.
+    auto delta = [](auto...) { return 1.0; };
+    using D = decltype(delta);
+
+    common::ParallelFor(n_groups, ctx_->Threads(), [&](auto g) {
+      auto cnt = gptr[g + 1] - gptr[g];
+      auto w = h_weight[g];  // one weight per group
+      auto g_predt = h_predt.subspan(gptr[g], cnt);
+      auto g_gpair = h_gpair.subspan(gptr[g], cnt);
+      auto g_label = h_label.Slice(make_range(g));
+      auto g_rank = rank_idx.subspan(gptr[g], cnt);
+      // The unbiased flag is a template argument, so it is dispatched at run time here.
+      auto args = std::make_tuple(this, iter, g_predt, g_label, w, g_rank, g, delta, g_gpair);
+      if (param_.lambdarank_unbiased) {
+        std::apply(&LambdaRankPairwise::CalcLambdaForGroup<true, D>, args);
+      } else {
+        std::apply(&LambdaRankPairwise::CalcLambdaForGroup<false, D>, args);
+      }
+    });
+  }
+
+  static char const* Name() { return "rank:pairwise"; }  // name used for registration
+  [[nodiscard]] const char* DefaultEvalMetric() const override {
+    return this->RankEvalMetric("ndcg");  // metric name is produced by RankEvalMetric
+  }
+};
+
+#if !defined(XGBOOST_USE_CUDA)
+namespace cuda_impl {
+void LambdaRankGetGradientPairwise(Context const*, std::int32_t, HostDeviceVector<float> const&,
+                                   const MetaInfo&, std::shared_ptr<ltr::RankingCache>,
+                                   linalg::VectorView<double const>,  // ti_plus: input bias ratio
+                                   linalg::VectorView<double const>,  // tj_minus: input bias ratio
+                                   linalg::VectorView<double>, linalg::VectorView<double>,
+                                   HostDeviceVector<GradientPair>*) {
+  common::AssertGPUSupport();  // stub compiled only when XGBOOST_USE_CUDA is not defined
+}
+}  // namespace cuda_impl
+#endif  // !defined(XGBOOST_USE_CUDA)
+
+XGBOOST_REGISTER_OBJECTIVE(LambdaRankNDCG, LambdaRankNDCG::Name())
+    .describe("LambdaRank with NDCG loss as objective")
+    .set_body([]() { return new LambdaRankNDCG{}; });
+
+XGBOOST_REGISTER_OBJECTIVE(LambdaRankPairwise, LambdaRankPairwise::Name())
+    .describe("LambdaRank with RankNet loss as objective")
+    .set_body([]() { return new LambdaRankPairwise{}; });
+
+XGBOOST_REGISTER_OBJECTIVE(LambdaRankMAP, LambdaRankMAP::Name())
+    .describe("LambdaRank with MAP loss as objective.")
+    .set_body([]() { return new LambdaRankMAP{}; });
+
+DMLC_REGISTRY_FILE_TAG(lambdarank_obj);
+}  // namespace xgboost::obj
diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu
index eb82b17b4..110e4ae87 100644
--- a/src/objective/lambdarank_obj.cu
+++ b/src/objective/lambdarank_obj.cu
@@ -37,6 +37,312 @@ namespace xgboost::obj {
 DMLC_REGISTRY_FILE_TAG(lambdarank_obj_cu);
 
 namespace cuda_impl {
+namespace {
+/**
+ * \brief Minimum |t_plus| (d_min[0]) and |tj_minus| (d_min[1]) for floating point truncation.
+ */
+void MinBias(Context const* ctx, std::shared_ptr<ltr::RankingCache> p_cache,
+             linalg::VectorView<double const> t_plus, linalg::VectorView<double const> tj_minus,
+             common::Span<double> d_min) {
+  CHECK_EQ(d_min.size(), 2);
+  auto cuctx = ctx->CUDACtx();
+
+  auto k = t_plus.Size();
+  CHECK_EQ(k, tj_minus.Size());  // val_it reads tj_minus(i - k) for i in [k, 2k)
+  CHECK_GT(k, 0);
+  CHECK_EQ(k, p_cache->MaxPositionSize());
+  // Two segments of length k: [0, k) covers t_plus and [k, 2k) covers tj_minus.
+  auto key_it = dh::MakeTransformIterator<std::size_t>(
+      thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { return i * k; });
+  auto val_it = dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul),
+                                                  [=] XGBOOST_DEVICE(std::size_t i) {
+                                                    if (i >= k) {
+                                                      return std::abs(tj_minus(i - k));
+                                                    }
+                                                    return std::abs(t_plus(i));
+                                                  });
+  std::size_t bytes{0};  // the call with a null buffer only reports the temporary size
+  cub::DeviceSegmentedReduce::Min(nullptr, bytes, val_it, d_min.data(), 2, key_it, key_it + 1,
+                                  cuctx->Stream());
+  dh::TemporaryArray<char> temp(bytes);
+  cub::DeviceSegmentedReduce::Min(temp.data().get(), bytes, val_it, d_min.data(), 2, key_it,
+                                  key_it + 1, cuctx->Stream());
+}
+
+/**
+ * \brief Type for gradient statistic. (Gradient, cost for unbiased LTR, normalization factor)
+ */
+using GradCostNorm = thrust::tuple<GradientPair, double, double>;
+
+/**
+ * \brief Obtain and update the gradient for one pair.
+ */
+template <bool unbiased, bool has_truncation, typename Delta>
+struct GetGradOp {
+  MakePairsOp<has_truncation> make_pair;
+  Delta delta;
+  // false: only return the statistics of the pair; true: also write gradients and costs.
+  bool need_update;
+
+  auto __device__ operator()(std::size_t idx) -> GradCostNorm {
+    auto const& args = make_pair.args;
+    auto g = dh::SegmentId(args.d_threads_group_ptr, idx);
+
+    auto data_group_begin = static_cast<std::size_t>(args.d_group_ptr[g]);
+    std::size_t n_data = args.d_group_ptr[g + 1] - data_group_begin;
+    // obtain group segment data.
+    auto g_label = args.labels.Slice(linalg::Range(data_group_begin, data_group_begin + n_data), 0);
+    auto g_predt = args.predts.subspan(data_group_begin, n_data);
+    auto g_gpair = args.gpairs.subspan(data_group_begin, n_data).data();
+    auto g_rank = args.d_sorted_idx.subspan(data_group_begin, n_data);
+
+    auto [i, j] = make_pair(idx, g);
+    // Pairs with equal labels contribute nothing; otherwise rank_high gets the larger label.
+    std::size_t rank_high = i, rank_low = j;
+    if (g_label(g_rank[i]) == g_label(g_rank[j])) {
+      return thrust::make_tuple(GradientPair{}, 0.0, 0.0);
+    }
+    if (g_label(g_rank[i]) < g_label(g_rank[j])) {
+      thrust::swap(rank_high, rank_low);
+    }
+
+    double cost{0};
+    // Bind the group index so that delta can be called without it.
+    auto delta_op = [&](auto const&... args) { return delta(args..., g); };
+    GradientPair pg = LambdaGrad<unbiased>(g_label, g_predt, g_rank, rank_high, rank_low, delta_op,
+                                           args.ti_plus, args.tj_minus, &cost);
+
+    std::size_t idx_high = g_rank[rank_high];
+    std::size_t idx_low = g_rank[rank_low];
+
+    if (need_update) {
+      // second run, update the gradient
+
+      auto ng = Repulse(pg);
+      // per-group rounding factor, applied to both gradients before they are accumulated
+      auto gr = args.d_roundings(g);
+      // positive gradient truncated
+      auto pgt = GradientPair{common::TruncateWithRounding(gr.GetGrad(), pg.GetGrad()),
+                              common::TruncateWithRounding(gr.GetHess(), pg.GetHess())};
+      // negative gradient truncated
+      auto ngt = GradientPair{common::TruncateWithRounding(gr.GetGrad(), ng.GetGrad()),
+                              common::TruncateWithRounding(gr.GetHess(), ng.GetHess())};
+
+      dh::AtomicAddGpair(g_gpair + idx_high, pgt);
+      dh::AtomicAddGpair(g_gpair + idx_low, ngt);
+    }
+
+    if (unbiased && need_update) {
+      // second run, update the cost
+      assert(args.tj_minus.Size() == args.ti_plus.Size() && "Invalid size of position bias");
+
+      auto g_li = args.li.Slice(linalg::Range(data_group_begin, data_group_begin + n_data));
+      auto g_lj = args.lj.Slice(linalg::Range(data_group_begin, data_group_begin + n_data));
+      // Only samples that fall inside the position bias vectors are updated.
+      if (idx_high < args.ti_plus.Size() && idx_low < args.ti_plus.Size()) {
+        if (args.tj_minus(idx_low) >= Eps64()) {
+          // eq.30
+          atomicAdd(&g_li(idx_high), common::TruncateWithRounding(args.d_cost_rounding[0],
+                                                                  cost / args.tj_minus(idx_low)));
+        }
+        if (args.ti_plus(idx_high) >= Eps64()) {
+          // eq.31
+          atomicAdd(&g_lj(idx_low), common::TruncateWithRounding(args.d_cost_rounding[0],
+                                                                 cost / args.ti_plus(idx_high)));
+        }
+      }
+    }
+    return thrust::make_tuple(GradientPair{std::abs(pg.GetGrad()), std::abs(pg.GetHess())},
+                              std::abs(cost), -2.0 * static_cast<double>(pg.GetGrad()));
+  }
+};
+
+template <bool unbiased, bool has_truncation, typename Delta>
+struct MakeGetGrad {  // factory creating GetGradOp functors from one set of kernel inputs
+  MakePairsOp<has_truncation> make_pair;
+  Delta delta;
+  // The kernel inputs are stored inside make_pair.
+  [[nodiscard]] KernelInputs const& Args() const { return make_pair.args; }
+
+  MakeGetGrad(KernelInputs args, Delta d) : make_pair{args}, delta{std::move(d)} {}
+  // need_update is forwarded to the functor and decides whether it writes any output.
+  GetGradOp<unbiased, has_truncation, Delta> operator()(bool need_update) {
+    return GetGradOp<unbiased, has_truncation, Delta>{make_pair, delta, need_update};
+  }
+};
+
+/**
+ * \brief Calculate gradient for all pairs using update op created by make_get_grad.
+ *
+ * We need to run gradient calculation twice, the first time gathers information like
+ * maximum gradient, maximum cost, and the normalization term using reduction. The second
+ * time performs the actual update.
+ *
+ * Without normalization, we only need to run it once since we can manually calculate
+ * the bounds of gradient (NDCG \in [0, 1], delta_NDCG \in [0, 1], ti+/tj- are from the
+ * previous iteration so the bound can be calculated for current iteration). However, if
+ * normalization is used, the delta score is un-bounded and we need to obtain the sum
+ * gradient. As a tradeoff, we simply run the kernel twice, once as reduction, second
+ * one as for_each.
+ *
+ * Alternatively, we can bound the delta score by limiting the output of the model using
+ * sigmoid for binary output and some normalization for multi-level. But the effect on the
+ * accuracy is not known yet, and it's only used by GPU.
+ *
+ * For performance, the segmented sort for sorted scores is the bottleneck and takes up
+ * about half of the time, while the reduction and for_each takes up the second half.
+ */
+template <bool unbiased, bool has_truncation, typename Delta>
+void CalcGrad(Context const* ctx, MetaInfo const& info, std::shared_ptr<ltr::RankingCache> p_cache,
+              MakeGetGrad<unbiased, has_truncation, Delta> make_get_grad) {
+  auto n_groups = p_cache->Groups();
+  auto d_threads_group_ptr = p_cache->CUDAThreadsGroupPtr();
+  auto d_gptr = p_cache->DataGroupPtr(ctx);
+  auto d_gpair = make_get_grad.Args().gpairs;
+
+  /**
+   * First pass, gather info for normalization and rounding factor.
+   */
+  auto val_it = dh::MakeTransformIterator<GradCostNorm>(thrust::make_counting_iterator(0ul),
+                                                        make_get_grad(false));
+  auto reduction_op = [] XGBOOST_DEVICE(GradCostNorm const& l,
+                                        GradCostNorm const& r) -> GradCostNorm {
+    // get maximum gradient for each group, along with cost and the normalization term
+    auto const& lg = thrust::get<0>(l);
+    auto const& rg = thrust::get<0>(r);
+    auto grad = std::max(lg.GetGrad(), rg.GetGrad());
+    auto hess = std::max(lg.GetHess(), rg.GetHess());
+    auto cost = std::max(thrust::get<1>(l), thrust::get<1>(r));
+    double sum_lambda = thrust::get<2>(l) + thrust::get<2>(r);  // summed, not maximized
+    return thrust::make_tuple(GradientPair{std::abs(grad), std::abs(hess)}, cost, sum_lambda);
+  };
+  auto init = thrust::make_tuple(GradientPair{0.0f, 0.0f}, 0.0, 0.0);
+  common::Span<GradCostNorm> d_max_lambdas = p_cache->MaxLambdas<GradCostNorm>(ctx, n_groups);
+  CHECK_EQ(n_groups * sizeof(GradCostNorm), d_max_lambdas.size_bytes());
+  // One reduction segment per group; the call with a null buffer only reports the size.
+  std::size_t bytes;
+  cub::DeviceSegmentedReduce::Reduce(nullptr, bytes, val_it, d_max_lambdas.data(), n_groups,
+                                     d_threads_group_ptr.data(), d_threads_group_ptr.data() + 1,
+                                     reduction_op, init, ctx->CUDACtx()->Stream());
+  dh::TemporaryArray<char> temp(bytes);
+  cub::DeviceSegmentedReduce::Reduce(
+      temp.data().get(), bytes, val_it, d_max_lambdas.data(), n_groups, d_threads_group_ptr.data(),
+      d_threads_group_ptr.data() + 1, reduction_op, init, ctx->CUDACtx()->Stream());
+
+  dh::TemporaryArray<double> min_bias(2);  // only filled in when unbiased is true
+  auto d_min_bias = dh::ToSpan(min_bias);
+  if (unbiased) {
+    MinBias(ctx, p_cache, make_get_grad.Args().ti_plus, make_get_grad.Args().tj_minus, d_min_bias);
+  }
+  /**
+   * Create rounding factors
+   */
+  auto d_cost_rounding = p_cache->CUDACostRounding(ctx);
+  auto d_rounding = p_cache->CUDARounding(ctx);
+  dh::LaunchN(n_groups, ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t g) mutable {
+    auto group_size = d_gptr[g + 1] - d_gptr[g];
+    auto const& max_grad = thrust::get<0>(d_max_lambdas[g]);
+    // float group size
+    auto fgs = static_cast<float>(group_size);
+    auto grad = common::CreateRoundingFactor(fgs * max_grad.GetGrad(), group_size);
+    auto hess = common::CreateRoundingFactor(fgs * max_grad.GetHess(), group_size);
+    d_rounding(g) = GradientPair{grad, hess};
+    // NOTE(review): every group writes d_cost_rounding[0] below, the last write wins - confirm.
+    auto cost = thrust::get<1>(d_max_lambdas[g]);
+    if (unbiased) {
+      cost /= std::min(d_min_bias[0], d_min_bias[1]);
+      d_cost_rounding[0] = common::CreateRoundingFactor(fgs * cost, group_size);
+    }
+  });
+
+  /**
+   * Second pass, actual update to gradient and bias.
+   */
+  thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul),
+                     p_cache->CUDAThreads(), make_get_grad(true));
+
+  /**
+   * Lastly, normalization and weight.
+   */
+  auto d_weights = common::MakeOptionalWeights(ctx, info.weights_);
+  auto w_norm = p_cache->WeightNorm();
+  thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), d_gpair.size(),
+                     [=] XGBOOST_DEVICE(std::size_t i) {
+                       auto g = dh::SegmentId(d_gptr, i);
+                       auto sum_lambda = thrust::get<2>(d_max_lambdas[g]);
+                       // Normalization, skipped when the group has no positive lambda sum
+                       if (sum_lambda > 0.0) {
+                         double norm = std::log2(1.0 + sum_lambda) / sum_lambda;
+                         d_gpair[i] *= norm;
+                       }
+                       d_gpair[i] *= (d_weights[g] * w_norm);
+                     });
+}
+
+/**
+ * \brief Handles boilerplate code like getting device span.
+ */
+template <typename Delta>
+void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const& preds,
+            const MetaInfo& info, std::shared_ptr<ltr::RankingCache> p_cache, Delta delta,
+            linalg::VectorView<double const> ti_plus,   // input bias ratio
+            linalg::VectorView<double const> tj_minus,  // input bias ratio
+            linalg::VectorView<double> li, linalg::VectorView<double> lj,
+            HostDeviceVector<GradientPair>* out_gpair) {
+  // Validate the cache before it is dereferenced for the first time.
+  CHECK(p_cache);
+  // boilerplate
+  std::int32_t device_id = ctx->gpu_id;
+  dh::safe_cuda(cudaSetDevice(device_id));
+
+  info.labels.SetDevice(device_id);
+  preds.SetDevice(device_id);
+  out_gpair->SetDevice(device_id);
+  out_gpair->Resize(preds.Size());
+
+  // The rounding factors are stored in the cache; CalcGrad writes them before they are used.
+  auto d_rounding = p_cache->CUDARounding(ctx);
+  auto d_cost_rounding = p_cache->CUDACostRounding(ctx);
+
+  CHECK_NE(d_rounding.Size(), 0);
+
+  auto label = info.labels.View(ctx->gpu_id);
+  auto predts = preds.ConstDeviceSpan();
+  auto gpairs = out_gpair->DeviceSpan();
+  thrust::fill_n(ctx->CUDACtx()->CTP(), gpairs.data(), gpairs.size(), GradientPair{0.0f, 0.0f});
+  // Gradients start from zero because the kernels accumulate into them atomically.
+  auto const d_threads_group_ptr = p_cache->CUDAThreadsGroupPtr();
+  auto const d_gptr = p_cache->DataGroupPtr(ctx);
+  auto const rank_idx = p_cache->SortedIdx(ctx, predts);
+
+  auto const unbiased = p_cache->Param().lambdarank_unbiased;
+  // The result of SortY is only requested when no truncation is configured.
+  common::Span<std::size_t const> d_y_sorted_idx;
+  if (!p_cache->Param().HasTruncation()) {
+    d_y_sorted_idx = SortY(ctx, info, rank_idx, p_cache);
+  }
+
+  KernelInputs args{ti_plus,        tj_minus, li,     lj,     d_gptr,     d_threads_group_ptr,
+                    rank_idx,       label,    predts, gpairs, d_rounding, d_cost_rounding.data(),
+                    d_y_sorted_idx, iter};
+
+  // dispatch based on unbiased and truncation
+  if (p_cache->Param().HasTruncation()) {
+    if (unbiased) {
+      CalcGrad(ctx, info, p_cache, MakeGetGrad<true, true, Delta>{args, delta});
+    } else {
+      CalcGrad(ctx, info, p_cache, MakeGetGrad<false, true, Delta>{args, delta});
+    }
+  } else {
+    if (unbiased) {
+      CalcGrad(ctx, info, p_cache, MakeGetGrad<true, false, Delta>{args, delta});
+    } else {
+      CalcGrad(ctx, info, p_cache, MakeGetGrad<false, false, Delta>{args, delta});
+    }
+  }
+}
+}  // anonymous namespace
+
 common::Span<std::size_t const> SortY(Context const* ctx, MetaInfo const& info,
                                       common::Span<std::size_t const> d_rank,
                                       std::shared_ptr<ltr::RankingCache> p_cache) {
@@ -58,5 +364,222 @@ common::Span<std::size_t const> SortY(Context const* ctx, MetaInfo const& info,
   common::SegmentedArgSort<false, true>(ctx, d_y_ranked, d_group_ptr, d_y_sorted_idx);
   return d_y_sorted_idx;
 }
+
+void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter,
+                               const HostDeviceVector<float>& preds, const MetaInfo& info,
+                               std::shared_ptr<ltr::NDCGCache> p_cache,
+                               linalg::VectorView<double const> ti_plus,   // input bias ratio
+                               linalg::VectorView<double const> tj_minus,  // input bias ratio
+                               linalg::VectorView<double> li, linalg::VectorView<double> lj,
+                               HostDeviceVector<GradientPair>* out_gpair) {
+  // boilerplate
+  std::int32_t device_id = ctx->gpu_id;
+  dh::safe_cuda(cudaSetDevice(device_id));
+  auto const d_inv_IDCG = p_cache->InvIDCG(ctx);
+  auto const discount = p_cache->Discount(ctx);
+  // NOTE(review): p_cache is dereferenced above without CHECK(p_cache) - confirm non-null.
+  info.labels.SetDevice(device_id);
+  preds.SetDevice(device_id);
+  // The gain type is a template argument of DeltaNDCG, so it is selected inside the functor.
+  auto const exp_gain = p_cache->Param().ndcg_exp_gain;
+  auto delta_ndcg = [=] XGBOOST_DEVICE(float y_high, float y_low, std::size_t rank_high,
+                                       std::size_t rank_low, bst_group_t g) {
+    return exp_gain ? DeltaNDCG<true>(y_high, y_low, rank_high, rank_low, d_inv_IDCG(g), discount)
+                    : DeltaNDCG<false>(y_high, y_low, rank_high, rank_low, d_inv_IDCG(g), discount);
+  };
+  Launch(ctx, iter, preds, info, p_cache, delta_ndcg, ti_plus, tj_minus, li, lj, out_gpair);
+}
+
+void MAPStat(Context const* ctx, MetaInfo const& info, common::Span<std::size_t const> d_rank_idx,
+             std::shared_ptr<ltr::MAPCache> p_cache) {
+  common::Span<double> out_n_rel = p_cache->NumRelevant(ctx);
+  common::Span<double> out_acc = p_cache->Acc(ctx);
+  // Both outputs hold one value per row.
+  CHECK_EQ(out_n_rel.size(), info.num_row_);
+  CHECK_EQ(out_acc.size(), info.num_row_);
+  // The scan key of a row is the index of the group it belongs to.
+  auto group_ptr = p_cache->DataGroupPtr(ctx);
+  auto key_it = dh::MakeTransformIterator<std::size_t>(
+      thrust::make_counting_iterator(0ul),
+      [=] XGBOOST_DEVICE(std::size_t i) -> std::size_t { return dh::SegmentId(group_ptr, i); });
+  auto label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
+  auto const* cuctx = ctx->CUDACtx();
+
+  {
+    // calculate number of relevant documents: scan of the labels taken in rank order
+    auto val_it = dh::MakeTransformIterator<double>(
+        thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> double {
+          auto g = dh::SegmentId(group_ptr, i);
+          auto g_label = label.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]));
+          auto idx_in_group = i - group_ptr[g];
+          auto g_sorted_idx = d_rank_idx.subspan(group_ptr[g], group_ptr[g + 1] - group_ptr[g]);
+          return static_cast<double>(g_label(g_sorted_idx[idx_in_group]));
+        });
+    thrust::inclusive_scan_by_key(cuctx->CTP(), key_it, key_it + info.num_row_, val_it,
+                                  out_n_rel.data());
+  }
+  {
+    // \sum l_k/k, where k is the 1-based rank inside the group
+    auto val_it = dh::MakeTransformIterator<double>(
+        thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> double {
+          auto g = dh::SegmentId(group_ptr, i);
+          auto g_label = label.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]));
+          auto g_sorted_idx = d_rank_idx.subspan(group_ptr[g], group_ptr[g + 1] - group_ptr[g]);
+          auto idx_in_group = i - group_ptr[g];
+          double rank_in_group = idx_in_group + 1.0;
+          return static_cast<double>(g_label(g_sorted_idx[idx_in_group])) / rank_in_group;
+        });
+    thrust::inclusive_scan_by_key(cuctx->CTP(), key_it, key_it + info.num_row_, val_it,
+                                  out_acc.data());
+  }
+}
+
+void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter,
+                              HostDeviceVector<float> const& predt, const MetaInfo& info,
+                              std::shared_ptr<ltr::MAPCache> p_cache,
+                              linalg::VectorView<double const> ti_plus,   // input bias ratio
+                              linalg::VectorView<double const> tj_minus,  // input bias ratio
+                              linalg::VectorView<double> li, linalg::VectorView<double> lj,
+                              HostDeviceVector<GradientPair>* out_gpair) {
+  std::int32_t device_id = ctx->gpu_id;
+  dh::safe_cuda(cudaSetDevice(device_id));
+
+  info.labels.SetDevice(device_id);
+  predt.SetDevice(device_id);
+
+  CHECK(p_cache);
+
+  auto d_predt = predt.ConstDeviceSpan();
+  auto const d_sorted_idx = p_cache->SortedIdx(ctx, d_predt);
+  // Recompute the MAP statistics (n_rel, acc) for the current rank list.
+  MAPStat(ctx, info, d_sorted_idx, p_cache);
+  auto d_n_rel = p_cache->NumRelevant(ctx);
+  auto d_acc = p_cache->Acc(ctx);
+  auto d_gptr = p_cache->DataGroupPtr(ctx).data();  // raw pointer to the group offsets
+  // Delta of a pair in group g; the pair is reordered so that rank_high <= rank_low.
+  auto delta_map = [=] XGBOOST_DEVICE(float y_high, float y_low, std::size_t rank_high,
+                                      std::size_t rank_low, bst_group_t g) {
+    if (rank_high > rank_low) {
+      thrust::swap(rank_high, rank_low);
+      thrust::swap(y_high, y_low);
+    }
+    auto cnt = d_gptr[g + 1] - d_gptr[g];
+    auto g_n_rel = d_n_rel.subspan(d_gptr[g], cnt);
+    auto g_acc = d_acc.subspan(d_gptr[g], cnt);
+    auto d = DeltaMAP(y_high, y_low, rank_high, rank_low, g_n_rel, g_acc);
+    return d;
+  };
+
+  Launch(ctx, iter, predt, info, p_cache, delta_map, ti_plus, tj_minus, li, lj, out_gpair);
+}
+
+void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter,
+                                   HostDeviceVector<float> const& predt, const MetaInfo& info,
+                                   std::shared_ptr<ltr::RankingCache> p_cache,
+                                   linalg::VectorView<double const> ti_plus,   // input bias ratio
+                                   linalg::VectorView<double const> tj_minus,  // input bias ratio
+                                   linalg::VectorView<double> li, linalg::VectorView<double> lj,
+                                   HostDeviceVector<GradientPair>* out_gpair) {
+  std::int32_t device_id = ctx->gpu_id;
+  dh::safe_cuda(cudaSetDevice(device_id));
+
+  info.labels.SetDevice(device_id);
+  predt.SetDevice(device_id);
+  // NOTE(review): d_sorted_idx is not used below and Launch calls SortedIdx again - confirm.
+  auto d_predt = predt.ConstDeviceSpan();
+  auto const d_sorted_idx = p_cache->SortedIdx(ctx, d_predt);
+  // The delta ignores all of its arguments and is the same for every pair.
+  auto delta = [] XGBOOST_DEVICE(float, float, std::size_t, std::size_t, bst_group_t) {
+    return 1.0;
+  };
+
+  Launch(ctx, iter, predt, info, p_cache, delta, ti_plus, tj_minus, li, lj, out_gpair);
+}
+
+namespace {
+struct ReduceOp {  // element-wise sum of the first two elements of two tuples
+  template <typename Tup>
+  Tup XGBOOST_DEVICE operator()(Tup const& l, Tup const& r) {
+    return thrust::make_tuple(thrust::get<0>(l) + thrust::get<0>(r),
+                              thrust::get<1>(l) + thrust::get<1>(r));
+  }
+};
+}  // namespace
+
+void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double const> li_full,
+                                  linalg::VectorView<double const> lj_full,
+                                  linalg::Vector<double>* p_ti_plus,
+                                  linalg::Vector<double>* p_tj_minus,
+                                  linalg::Vector<double>* p_li,  // loss
+                                  linalg::Vector<double>* p_lj,
+                                  std::shared_ptr<ltr::RankingCache> p_cache) {
+  auto const d_group_ptr = p_cache->DataGroupPtr(ctx);
+  auto n_groups = d_group_ptr.size() - 1;  // the group pointer has one entry more than groups
+
+  auto ti_plus = p_ti_plus->View(ctx->gpu_id);
+  auto tj_minus = p_tj_minus->View(ctx->gpu_id);
+
+  auto li = p_li->View(ctx->gpu_id);
+  auto lj = p_lj->View(ctx->gpu_id);
+  CHECK_EQ(li.Size(), ti_plus.Size());
+
+  auto const& param = p_cache->Param();
+  auto regularizer = param.Regularizer();
+  std::size_t k = p_cache->MaxPositionSize();
+
+  CHECK_EQ(li.Size(), k);
+  CHECK_EQ(lj.Size(), k);
+  // Reduce li_full/lj_full to li/lj: for each rank position, sum over all groups.
+  auto make_iter = [&](linalg::VectorView<double const> l_full) {
+    auto l_it = [=] XGBOOST_DEVICE(std::size_t i) {
+      // group index
+      auto g = i % n_groups;
+      // rank is the position within a group, also the segment index
+      auto r = i / n_groups;
+
+      auto begin = d_group_ptr[g];
+      std::size_t group_size = d_group_ptr[g + 1] - begin;
+      auto n = std::min(group_size, k);
+      // r can be greater than n since we allocate threads based on truncation level
+      // instead of actual group size.
+      if (r >= n) {
+        return 0.0;
+      }
+      return l_full(r + begin);
+    };
+    return l_it;
+  };
+  auto li_it =
+      dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), make_iter(li_full));
+  auto lj_it =
+      dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), make_iter(lj_full));
+  // k segments, each segment has size n_groups.
+  auto key_it = dh::MakeTransformIterator<std::size_t>(
+      thrust::make_counting_iterator(0ul),
+      [=] XGBOOST_DEVICE(std::size_t i) { return i * n_groups; });
+  auto val_it = thrust::make_zip_iterator(thrust::make_tuple(li_it, lj_it));
+  auto out_it =
+      thrust::make_zip_iterator(thrust::make_tuple(li.Values().data(), lj.Values().data()));
+  // li and lj are reduced together; the call with a null buffer only reports the size.
+  auto init = thrust::make_tuple(0.0, 0.0);
+  std::size_t bytes;
+  cub::DeviceSegmentedReduce::Reduce(nullptr, bytes, val_it, out_it, k, key_it, key_it + 1,
+                                     ReduceOp{}, init, ctx->CUDACtx()->Stream());
+  dh::TemporaryArray<char> temp(bytes);
+  cub::DeviceSegmentedReduce::Reduce(temp.data().get(), bytes, val_it, out_it, k, key_it,
+                                     key_it + 1, ReduceOp{}, init, ctx->CUDACtx()->Stream());
+  // Normalize by the first position and use the regularizer as exponent; skipped near zero.
+  thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), li.Size(),
+                     [=] XGBOOST_DEVICE(std::size_t i) mutable {
+                       if (li(0) >= Eps64()) {
+                         ti_plus(i) = std::pow(li(i) / li(0), regularizer);
+                       }
+                       if (lj(0) >= Eps64()) {
+                         tj_minus(i) = std::pow(lj(i) / lj(0), regularizer);
+                       }
+                       assert(!std::isinf(ti_plus(i)));
+                       assert(!std::isinf(tj_minus(i)));
+                     });
+}
 }  // namespace cuda_impl
 }  // namespace xgboost::obj
diff --git a/src/objective/lambdarank_obj.h b/src/objective/lambdarank_obj.h
index 3adb27a2e..c2222c028 100644
--- a/src/objective/lambdarank_obj.h
+++ b/src/objective/lambdarank_obj.h
@@ -1,5 +1,15 @@
 /**
- * Copyright 2023 XGBoost contributors
+ * Copyright 2023, XGBoost contributors
+ *
+ * Vocabulary explanation:
+ *
+ * There are two different lists we need to handle in the objective. The first is the list
+ * of labels (relevance degrees) provided by the user; its order has no particular meaning
+ * when bias estimation is NOT used. The other is generated by our model: the sample indices
+ * sorted by prediction score. `rank_high` is the position in the model rank list that is
+ * ranked higher than `rank_low`, while `idx_high` refers to where the `rank_high` sample
+ * comes from. Simply put, `rank_high` indexes into the rank list obtained from the model,
+ * while `idx_high` indexes into the user provided sample list.
  */
 #ifndef XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_H_
 #define XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_H_
@@ -25,14 +35,19 @@
 #include "xgboost/span.h"                  // for Span
 
 namespace xgboost::obj {
+double constexpr Eps64() { return 1e-16; }
+
 template <bool exp>
-XGBOOST_DEVICE double DeltaNDCG(float y_high, float y_low, std::size_t r_high, std::size_t r_low,
-                                double inv_IDCG, common::Span<double const> discount) {
+XGBOOST_DEVICE double DeltaNDCG(float y_high, float y_low, std::size_t rank_high,
+                                std::size_t rank_low, double inv_IDCG,
+                                common::Span<double const> discount) {
+  // Use rank_high instead of idx_high as we are calculating discount based on ranks
+  // provided by the model.
   double gain_high = exp ? ltr::CalcDCGGain(y_high) : y_high;
-  double discount_high = discount[r_high];
+  double discount_high = discount[rank_high];
 
   double gain_low = exp ? ltr::CalcDCGGain(y_low) : y_low;
-  double discount_low = discount[r_low];
+  double discount_low = discount[rank_low];
 
   double original = gain_high * discount_high + gain_low * discount_low;
   double changed = gain_low * discount_high + gain_high * discount_low;
@@ -70,9 +85,9 @@ template <bool unbiased, typename Delta>
 XGBOOST_DEVICE GradientPair
 LambdaGrad(linalg::VectorView<float const> labels, common::Span<float const> predts,
            common::Span<size_t const> sorted_idx,
-           std::size_t rank_high,                     // cordiniate
-           std::size_t rank_low,                      // cordiniate
-           Delta delta,                               // delta score
+           std::size_t rank_high,                     // higher index on the model rank list
+           std::size_t rank_low,                      // lower index on the model rank list
+           Delta delta,                               // function to calculate delta score
            linalg::VectorView<double const> t_plus,   // input bias ratio
            linalg::VectorView<double const> t_minus,  // input bias ratio
            double* p_cost) {
@@ -95,30 +110,34 @@ LambdaGrad(linalg::VectorView<float const> labels, common::Span<float const> pre
 
   // Use double whenever possible as we are working on the exp space.
   double delta_score = std::abs(s_high - s_low);
-  double sigmoid = common::Sigmoid(s_high - s_low);
+  double const sigmoid = common::Sigmoid(s_high - s_low);
   // Change in metric score like \delta NDCG or \delta MAP
   double delta_metric = std::abs(delta(y_high, y_low, rank_high, rank_low));
 
   if (best_score != worst_score) {
-    delta_metric /= (delta_score + kRtEps);
+    delta_metric /= (delta_score + 0.01);
   }
 
   if (unbiased) {
     *p_cost = std::log(1.0 / (1.0 - sigmoid)) * delta_metric;
   }
 
-  constexpr double kEps = 1e-16;
   auto lambda_ij = (sigmoid - 1.0) * delta_metric;
-  auto hessian_ij = std::max(sigmoid * (1.0 - sigmoid), kEps) * delta_metric * 2.0;
+  auto hessian_ij = std::max(sigmoid * (1.0 - sigmoid), Eps64()) * delta_metric * 2.0;
 
   auto k = t_plus.Size();
   assert(t_minus.Size() == k && "Invalid size of position bias");
 
-  if (unbiased && idx_high < k && idx_low < k) {
-    lambda_ij /= (t_minus(idx_low) * t_plus(idx_high) + kRtEps);
-    hessian_ij /= (t_minus(idx_low) * t_plus(idx_high) + kRtEps);
+  // We need to skip samples that exceed the maximum number of tracked positions, and
+  // samples that have low probability and might bring us floating point issues.
+  if (unbiased && idx_high < k && idx_low < k && t_minus(idx_low) >= Eps64() &&
+      t_plus(idx_high) >= Eps64()) {
+    // The index should be ranks[idx_low]; since we assume the label is sorted, this
+    // reduces to `idx_low`, which represents the position on the input list, as explained
+    // in the file header.
+    lambda_ij /= (t_plus(idx_high) * t_minus(idx_low));
+    hessian_ij /= (t_plus(idx_high) * t_minus(idx_low));
   }
-
   auto pg = GradientPair{static_cast<float>(lambda_ij), static_cast<float>(hessian_ij)};
   return pg;
 }
diff --git a/src/objective/objective.cc b/src/objective/objective.cc
index 925456fd0..7addf957a 100644
--- a/src/objective/objective.cc
+++ b/src/objective/objective.cc
@@ -47,13 +47,14 @@ DMLC_REGISTRY_LINK_TAG(regression_obj_gpu);
 DMLC_REGISTRY_LINK_TAG(quantile_obj_gpu);
 DMLC_REGISTRY_LINK_TAG(hinge_obj_gpu);
 DMLC_REGISTRY_LINK_TAG(multiclass_obj_gpu);
-DMLC_REGISTRY_LINK_TAG(rank_obj_gpu);
+DMLC_REGISTRY_LINK_TAG(lambdarank_obj);
+DMLC_REGISTRY_LINK_TAG(lambdarank_obj_cu);
 #else
 DMLC_REGISTRY_LINK_TAG(regression_obj);
 DMLC_REGISTRY_LINK_TAG(quantile_obj);
 DMLC_REGISTRY_LINK_TAG(hinge_obj);
 DMLC_REGISTRY_LINK_TAG(multiclass_obj);
-DMLC_REGISTRY_LINK_TAG(rank_obj);
+DMLC_REGISTRY_LINK_TAG(lambdarank_obj);
 #endif  // XGBOOST_USE_CUDA, XGBOOST_USE_HIP
 }  // namespace obj
 }  // namespace xgboost
diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu
index e14f448eb..75e8faa4c 100644
--- a/src/objective/quantile_obj.cu
+++ b/src/objective/quantile_obj.cu
@@ -1,6 +1,7 @@
 /**
  * Copyright 2023 by XGBoost contributors
  */
+#include <array>                            // std::array
 #include <cstddef>                          // std::size_t
 #include <cstdint>                          // std::int32_t
 #include <vector>                           // std::vector
@@ -35,7 +36,7 @@ class QuantileRegression : public ObjFunction {
   bst_target_t Targets(MetaInfo const& info) const override {
     auto const& alpha = param_.quantile_alpha.Get();
     CHECK_EQ(alpha.size(), alpha_.Size()) << "The objective is not yet configured.";
-    if (!info.IsVerticalFederated() || collective::GetRank() == 0) {
+    if (info.ShouldHaveLabels()) {
       CHECK_EQ(info.labels.Shape(1), 1)
           << "Multi-target is not yet supported by the quantile loss.";
     }
@@ -170,10 +171,9 @@ class QuantileRegression : public ObjFunction {
     common::Mean(ctx_, *base_score, &temp);
     double meanq = temp(0) * sw;
 
-    if (info.IsRowSplit()) {
-      collective::Allreduce<collective::Operation::kSum>(&meanq, 1);
-      collective::Allreduce<collective::Operation::kSum>(&sw, 1);
-    }
+    std::array<double, 2> dat{meanq, sw};
+    collective::GlobalSum(info, &dat);
+    std::tie(meanq, sw) = std::tuple_cat(dat);
     meanq /= (sw + kRtEps);
     base_score->Reshape(1);
     base_score->Data()->Fill(meanq);
diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu
index 2658d780d..53f235017 100644
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -728,10 +728,8 @@ class MeanAbsoluteError : public ObjFunction {
     std::transform(linalg::cbegin(out), linalg::cend(out), linalg::begin(out),
                    [w](float v) { return v * w; });
 
-    if (info.IsRowSplit()) {
-      collective::Allreduce<collective::Operation::kSum>(out.Values().data(), out.Values().size());
-      collective::Allreduce<collective::Operation::kSum>(&w, 1);
-    }
+    collective::GlobalSum(info, &out.Values());
+    collective::GlobalSum(info, &w, 1);
 
     if (common::CloseTo(w, 0.0)) {
       // Mostly for handling empty dataset test.
diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc
index 2b7a96d9c..aa8972989 100644
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -75,7 +75,7 @@ bst_float PredValue(const SparsePage::Inst &inst,
       psum += (*trees[i])[nidx].LeafValue();
     }
   }
-  p_feats->Drop(inst);
+  p_feats->Drop();
   return psum;
 }
 
@@ -172,13 +172,11 @@ void FVecFill(const size_t block_size, const size_t batch_offset, const int num_
   }
 }
 
-template <typename DataView>
-void FVecDrop(const size_t block_size, const size_t batch_offset, DataView *batch,
-              const size_t fvec_offset, std::vector<RegTree::FVec> *p_feats) {
+void FVecDrop(std::size_t const block_size, std::size_t const fvec_offset,
+              std::vector<RegTree::FVec> *p_feats) {
   for (size_t i = 0; i < block_size; ++i) {
     RegTree::FVec &feats = (*p_feats)[fvec_offset + i];
-    const SparsePage::Inst inst = (*batch)[batch_offset + i];
-    feats.Drop(inst);
+    feats.Drop();
   }
 }
 
@@ -196,11 +194,15 @@ struct SparsePageView {
 struct GHistIndexMatrixView {
  private:
   GHistIndexMatrix const &page_;
-  uint64_t n_features_;
+  std::uint64_t const n_features_;
   common::Span<FeatureType const> ft_;
   common::Span<Entry> workspace_;
   std::vector<size_t> current_unroll_;
 
+  std::vector<std::uint32_t> const& ptrs_;
+  std::vector<float> const& mins_;
+  std::vector<float> const& values_;
+
  public:
   size_t base_rowid;
 
@@ -213,6 +215,9 @@ struct GHistIndexMatrixView {
         ft_{ft},
         workspace_{workplace},
         current_unroll_(n_threads > 0 ? n_threads : 1, 0),
+        ptrs_{_page.cut.Ptrs()},
+        mins_{_page.cut.MinValues()},
+        values_{_page.cut.Values()},
         base_rowid{_page.base_rowid} {}
 
   SparsePage::Inst operator[](size_t r) {
@@ -221,7 +226,7 @@ struct GHistIndexMatrixView {
     size_t non_missing{static_cast<std::size_t>(beg)};
 
     for (bst_feature_t c = 0; c < n_features_; ++c) {
-      float f = page_.GetFvalue(r, c, common::IsCat(ft_, c));
+      float f = page_.GetFvalue(ptrs_, values_, mins_, r, c, common::IsCat(ft_, c));
       if (!common::CheckNAN(f)) {
         workspace_[non_missing] = Entry{c, f};
         ++non_missing;
@@ -301,7 +306,7 @@ void PredictBatchByBlockOfRowsKernel(DataView batch, gbm::GBTreeModel const &mod
     // process block of rows through all trees to keep cache locality
     PredictByAllTrees(model, tree_begin, tree_end, batch_offset + batch.base_rowid, thread_temp,
                       fvec_offset, block_size, out_predt);
-    FVecDrop(block_size, batch_offset, &batch, fvec_offset, p_thread_temp);
+    FVecDrop(block_size, fvec_offset, p_thread_temp);
   });
 }
 
@@ -529,7 +534,7 @@ class ColumnSplitHelper {
 
       FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, &feat_vecs_);
       MaskAllTrees(batch_offset, fvec_offset, block_size);
-      FVecDrop(block_size, batch_offset, &batch, fvec_offset, &feat_vecs_);
+      FVecDrop(block_size, fvec_offset, &feat_vecs_);
     });
 
     AllreduceBitVectors();
@@ -629,7 +634,7 @@ class CPUPredictor : public Predictor {
     if (!p_fmat->PageExists<SparsePage>()) {
       std::vector<Entry> workspace(p_fmat->Info().num_col_ * kUnroll * n_threads);
       auto ft = p_fmat->Info().feature_types.ConstHostVector();
-      for (auto const &batch : p_fmat->GetBatches<GHistIndexMatrix>({})) {
+      for (auto const &batch : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, {})) {
         if (blocked) {
           PredictBatchByBlockOfRowsKernel<GHistIndexMatrixView, kBlockOfRowsSize>(
               GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads}, model,
@@ -780,7 +785,7 @@ class CPUPredictor : public Predictor {
           }
           preds[ridx * ntree_limit + j] = static_cast<bst_float>(nidx);
         }
-        feats.Drop(page[i]);
+        feats.Drop();
       });
     }
   }
@@ -853,7 +858,7 @@ class CPUPredictor : public Predictor {
                   (tree_weights == nullptr ? 1 : (*tree_weights)[j]);
             }
           }
-          feats.Drop(page[i]);
+          feats.Drop();
           // add base margin to BIAS
           if (base_margin.Size() != 0) {
             CHECK_EQ(base_margin.Shape(1), ngroup);
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index b50bcf399..cf951add4 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -750,7 +750,7 @@ class GPUPredictor : public xgboost::Predictor {
       }
     } else {
       size_t batch_offset = 0;
-      for (auto const& page : dmat->GetBatches<EllpackPage>(BatchParam{})) {
+      for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
         dmat->Info().feature_types.SetDevice(ctx_->gpu_id);
         auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
         this->PredictInternal(
@@ -1047,7 +1047,7 @@ class GPUPredictor : public xgboost::Predictor {
         batch_offset += batch.Size();
       }
     } else {
-      for (auto const& batch : p_fmat->GetBatches<EllpackPage>(BatchParam{})) {
+      for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
         bst_row_t batch_offset = 0;
         EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->gpu_id)};
         size_t num_rows = batch.Size();
diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc
index 47b1c6f91..0dca65e97 100644
--- a/src/tree/fit_stump.cc
+++ b/src/tree/fit_stump.cc
@@ -8,6 +8,7 @@
 #include <cinttypes>  // std::int32_t
 #include <cstddef>    // std::size_t
 
+#include "../collective/aggregator.h"
 #include "../collective/communicator-inl.h"
 #include "../common/common.h"              // AssertGPUSupport
 #include "../common/numeric.h"             // cpu_impl::Reduce
@@ -45,10 +46,7 @@ void FitStump(Context const* ctx, MetaInfo const& info,
   }
   CHECK(h_sum.CContiguous());
 
-  if (info.IsRowSplit()) {
-    collective::Allreduce<collective::Operation::kSum>(
-        reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2);
-  }
+  collective::GlobalSum(info, reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2);
 
   for (std::size_t i = 0; i < h_sum.Size(); ++i) {
     out(i) = static_cast<float>(CalcUnregularizedWeight(h_sum(i).GetGrad(), h_sum(i).GetHess()));
diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu
index 676497336..f22fa172f 100644
--- a/src/tree/gpu_hist/gradient_based_sampler.cu
+++ b/src/tree/gpu_hist/gradient_based_sampler.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019-2021 by XGBoost Contributors
+/**
+ * Copyright 2019-2023 by XGBoost Contributors
  */
 #include <thrust/functional.h>
 #include <thrust/random.h>
@@ -12,6 +12,7 @@
 #include <utility>
 
 #include "../../common/compressed_iterator.h"
+#include "../../common/cuda_context.cuh"  // for CUDAContext
 #include "../../common/random.h"
 #include "../param.h"
 #include "gradient_based_sampler.cuh"
@@ -147,25 +148,26 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
 
 NoSampling::NoSampling(EllpackPageImpl const* page) : page_(page) {}
 
-GradientBasedSample NoSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
+GradientBasedSample NoSampling::Sample(Context const*, common::Span<GradientPair> gpair,
+                                       DMatrix* dmat) {
   return {dmat->Info().num_row_, page_, gpair};
 }
 
-ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl const* page,
-                                                   size_t n_rows,
-                                                   const BatchParam& batch_param)
-    : batch_param_(batch_param),
-      page_(new EllpackPageImpl(batch_param.gpu_id, page->Cuts(), page->is_dense,
-                                page->row_stride, n_rows)) {}
+ExternalMemoryNoSampling::ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page,
+                                                   size_t n_rows, BatchParam batch_param)
+    : batch_param_{std::move(batch_param)},
+      page_(new EllpackPageImpl(ctx->gpu_id, page->Cuts(), page->is_dense, page->row_stride,
+                                n_rows)) {}
 
-GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span<GradientPair> gpair,
+GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
+                                                     common::Span<GradientPair> gpair,
                                                      DMatrix* dmat) {
   if (!page_concatenated_) {
     // Concatenate all the external memory ELLPACK pages into a single in-memory page.
     size_t offset = 0;
-    for (auto& batch : dmat->GetBatches<EllpackPage>(batch_param_)) {
+    for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
       auto page = batch.Impl();
-      size_t num_elements = page_->Copy(batch_param_.gpu_id, page, offset);
+      size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
       offset += num_elements;
     }
     page_concatenated_ = true;
@@ -176,12 +178,13 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span<GradientPair>
 UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample)
     : page_(page), subsample_(subsample) {}
 
-GradientBasedSample UniformSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
+GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                                            DMatrix* dmat) {
   // Set gradient pair to 0 with p = 1 - subsample
-  thrust::replace_if(dh::tbegin(gpair), dh::tend(gpair),
-                     thrust::counting_iterator<size_t>(0),
-                     BernoulliTrial(common::GlobalRandom()(), subsample_),
-                     GradientPair());
+  auto cuctx = ctx->CUDACtx();
+  thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
+                     thrust::counting_iterator<std::size_t>(0),
+                     BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair());
   return {dmat->Info().num_row_, page_, gpair};
 }
 
@@ -192,7 +195,8 @@ ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(size_t n_rows,
       subsample_(subsample),
       sample_row_index_(n_rows) {}
 
-GradientBasedSample ExternalMemoryUniformSampling::Sample(common::Span<GradientPair> gpair,
+GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
+                                                          common::Span<GradientPair> gpair,
                                                           DMatrix* dmat) {
   // Set gradient pair to 0 with p = 1 - subsample
   thrust::replace_if(dh::tbegin(gpair), dh::tend(gpair),
@@ -216,18 +220,17 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(common::Span<GradientP
                     sample_row_index_.begin(),
                     ClearEmptyRows());
 
-  auto batch_iterator = dmat->GetBatches<EllpackPage>(batch_param_);
+  auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
   auto first_page = (*batch_iterator.begin()).Impl();
   // Create a new ELLPACK page with empty rows.
   page_.reset();  // Release the device memory first before reallocating
-  page_.reset(new EllpackPageImpl(
-      batch_param_.gpu_id, first_page->Cuts(), first_page->is_dense,
-                           first_page->row_stride, sample_rows));
+  page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense,
+                                  first_page->row_stride, sample_rows));
 
   // Compact the ELLPACK pages into the single sample page.
   thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
   for (auto& batch : batch_iterator) {
-    page_->Compact(batch_param_.gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
+    page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
   }
 
   return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
@@ -242,18 +245,17 @@ GradientBasedSampling::GradientBasedSampling(EllpackPageImpl const* page,
       threshold_(n_rows + 1, 0.0f),
       grad_sum_(n_rows, 0.0f) {}
 
-GradientBasedSample GradientBasedSampling::Sample(common::Span<GradientPair> gpair,
-                                                  DMatrix* dmat) {
+GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
+                                                  common::Span<GradientPair> gpair, DMatrix* dmat) {
+  auto cuctx = ctx->CUDACtx();
   size_t n_rows = dmat->Info().num_row_;
   size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
       gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);
 
   // Perform Poisson sampling in place.
-  thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
-                    thrust::counting_iterator<size_t>(0),
-                    dh::tbegin(gpair),
-                    PoissonSampling(dh::ToSpan(threshold_),
-                                    threshold_index,
+  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
+                    thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
+                    PoissonSampling(dh::ToSpan(threshold_), threshold_index,
                                     RandomWeight(common::GlobalRandom()())));
   return {n_rows, page_, gpair};
 }
@@ -268,7 +270,8 @@ ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
       grad_sum_(n_rows, 0.0f),
       sample_row_index_(n_rows) {}
 
-GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::Span<GradientPair> gpair,
+GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* ctx,
+                                                                common::Span<GradientPair> gpair,
                                                                 DMatrix* dmat) {
   size_t n_rows = dmat->Info().num_row_;
   size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
@@ -298,28 +301,25 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::Span<Gra
                     sample_row_index_.begin(),
                     ClearEmptyRows());
 
-  auto batch_iterator = dmat->GetBatches<EllpackPage>(batch_param_);
+  auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
   auto first_page = (*batch_iterator.begin()).Impl();
   // Create a new ELLPACK page with empty rows.
   page_.reset();  // Release the device memory first before reallocating
-  page_.reset(new EllpackPageImpl(batch_param_.gpu_id, first_page->Cuts(),
-                                  first_page->is_dense,
+  page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense,
                                   first_page->row_stride, sample_rows));
 
   // Compact the ELLPACK pages into the single sample page.
   thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
   for (auto& batch : batch_iterator) {
-    page_->Compact(batch_param_.gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
+    page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
   }
 
   return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
 }
 
-GradientBasedSampler::GradientBasedSampler(EllpackPageImpl const* page,
-                                           size_t n_rows,
-                                           const BatchParam& batch_param,
-                                           float subsample,
-                                           int sampling_method) {
+GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page,
+                                           size_t n_rows, const BatchParam& batch_param,
+                                           float subsample, int sampling_method) {
   monitor_.Init("gradient_based_sampler");
 
   bool is_sampling = subsample < 1.0;
@@ -346,7 +346,7 @@ GradientBasedSampler::GradientBasedSampler(EllpackPageImpl const* page,
     }
   } else {
     if (is_external_memory) {
-      strategy_.reset(new ExternalMemoryNoSampling(page, n_rows, batch_param));
+      strategy_.reset(new ExternalMemoryNoSampling(ctx, page, n_rows, batch_param));
     } else {
       strategy_.reset(new NoSampling(page));
     }
@@ -354,10 +354,10 @@ GradientBasedSampler::GradientBasedSampler(EllpackPageImpl const* page,
 }
 
 // Sample a DMatrix based on the given gradient pairs.
-GradientBasedSample GradientBasedSampler::Sample(common::Span<GradientPair> gpair,
-                                                 DMatrix* dmat) {
+GradientBasedSample GradientBasedSampler::Sample(Context const* ctx,
+                                                 common::Span<GradientPair> gpair, DMatrix* dmat) {
   monitor_.Start("Sample");
-  GradientBasedSample sample = strategy_->Sample(gpair, dmat);
+  GradientBasedSample sample = strategy_->Sample(ctx, gpair, dmat);
   monitor_.Stop("Sample");
   return sample;
 }
diff --git a/src/tree/gpu_hist/gradient_based_sampler.cuh b/src/tree/gpu_hist/gradient_based_sampler.cuh
index 5be6c71de..dafb98cfd 100644
--- a/src/tree/gpu_hist/gradient_based_sampler.cuh
+++ b/src/tree/gpu_hist/gradient_based_sampler.cuh
@@ -24,7 +24,8 @@ struct GradientBasedSample {
 class SamplingStrategy {
  public:
   /*! \brief Sample from a DMatrix based on the given gradient pairs. */
-  virtual GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) = 0;
+  virtual GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                                     DMatrix* dmat) = 0;
   virtual ~SamplingStrategy() = default;
 };
 
@@ -32,7 +33,8 @@ class SamplingStrategy {
 class NoSampling : public SamplingStrategy {
  public:
   explicit NoSampling(EllpackPageImpl const* page);
-  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                             DMatrix* dmat) override;
 
  private:
   EllpackPageImpl const* page_;
@@ -41,10 +43,10 @@ class NoSampling : public SamplingStrategy {
 /*! \brief No sampling in external memory mode. */
 class ExternalMemoryNoSampling : public SamplingStrategy {
  public:
-  ExternalMemoryNoSampling(EllpackPageImpl const* page,
-                           size_t n_rows,
-                           const BatchParam& batch_param);
-  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+  ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
+                           BatchParam batch_param);
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                             DMatrix* dmat) override;
 
  private:
   BatchParam batch_param_;
@@ -56,7 +58,8 @@ class ExternalMemoryNoSampling : public SamplingStrategy {
 class UniformSampling : public SamplingStrategy {
  public:
   UniformSampling(EllpackPageImpl const* page, float subsample);
-  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                             DMatrix* dmat) override;
 
  private:
   EllpackPageImpl const* page_;
@@ -66,10 +69,9 @@ class UniformSampling : public SamplingStrategy {
 /*! \brief No sampling in external memory mode. */
 class ExternalMemoryUniformSampling : public SamplingStrategy {
  public:
-  ExternalMemoryUniformSampling(size_t n_rows,
-                                BatchParam batch_param,
-                                float subsample);
-  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+  ExternalMemoryUniformSampling(size_t n_rows, BatchParam batch_param, float subsample);
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                             DMatrix* dmat) override;
 
  private:
   BatchParam batch_param_;
@@ -82,11 +84,10 @@ class ExternalMemoryUniformSampling : public SamplingStrategy {
 /*! \brief Gradient-based sampling in in-memory mode.. */
 class GradientBasedSampling : public SamplingStrategy {
  public:
-  GradientBasedSampling(EllpackPageImpl const* page,
-                        size_t n_rows,
-                        const BatchParam& batch_param,
+  GradientBasedSampling(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param,
                         float subsample);
-  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                             DMatrix* dmat) override;
 
  private:
   EllpackPageImpl const* page_;
@@ -98,10 +99,9 @@ class GradientBasedSampling : public SamplingStrategy {
 /*! \brief Gradient-based sampling in external memory mode.. */
 class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
  public:
-  ExternalMemoryGradientBasedSampling(size_t n_rows,
-                                      BatchParam batch_param,
-                                      float subsample);
-  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+  ExternalMemoryGradientBasedSampling(size_t n_rows, BatchParam batch_param, float subsample);
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                             DMatrix* dmat) override;
 
  private:
   BatchParam batch_param_;
@@ -124,14 +124,11 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
  */
 class GradientBasedSampler {
  public:
-  GradientBasedSampler(EllpackPageImpl const* page,
-                       size_t n_rows,
-                       const BatchParam& batch_param,
-                       float subsample,
-                       int sampling_method);
+  GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
+                       const BatchParam& batch_param, float subsample, int sampling_method);
 
   /*! \brief Sample from a DMatrix based on the given gradient pairs. */
-  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat);
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair, DMatrix* dmat);
 
   /*! \brief Calculate the threshold used to normalize sampling probabilities. */
   static size_t CalculateThresholdIndex(common::Span<GradientPair> gpair,
diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc
index d22e8f679..f637427ad 100644
--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -7,6 +7,7 @@
 #include <memory>
 #include <vector>
 
+#include "../collective/aggregator.h"
 #include "../common/random.h"
 #include "../data/gradient_index.h"
 #include "common_row_partitioner.h"
@@ -65,7 +66,7 @@ class GloablApproxBuilder {
     partitioner_.clear();
     // Generating the GHistIndexMatrix is quite slow, is there a way to speed it up?
     for (auto const &page :
-         p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess, *task_))) {
+         p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess, *task_))) {
       if (n_total_bins == 0) {
         n_total_bins = page.cut.TotalBins();
         feature_values_ = page.cut;
@@ -92,13 +93,11 @@ class GloablApproxBuilder {
     for (auto const &g : gpair) {
       root_sum.Add(g);
     }
-    if (p_fmat->Info().IsRowSplit()) {
-      collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&root_sum), 2);
-    }
+    collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&root_sum), 2);
     std::vector<CPUExpandEntry> nodes{best};
     size_t i = 0;
     auto space = ConstructHistSpace(partitioner_, nodes);
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess))) {
       histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(), nodes,
                                    {}, gpair);
       i++;
@@ -149,7 +148,7 @@ class GloablApproxBuilder {
 
     size_t i = 0;
     auto space = ConstructHistSpace(partitioner_, nodes_to_build);
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess))) {
       histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
                                    nodes_to_build, nodes_to_sub, gpair);
       i++;
@@ -215,7 +214,8 @@ class GloablApproxBuilder {
 
       monitor_->Start("UpdatePosition");
       size_t page_id = 0;
-      for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess))) {
+      for (auto const &page :
+           p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess))) {
         partitioner_.at(page_id).UpdatePosition(ctx_, page, applied, p_tree);
         page_id++;
       }
diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc
index 02edfa74a..bda9b4dfa 100644
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -76,7 +76,7 @@ class ColMaker: public TreeUpdater {
     // Finds densities if we don't already have them
     if (column_densities_.empty()) {
       std::vector<size_t> column_size(dmat->Info().num_col_);
-      for (const auto &batch : dmat->GetBatches<SortedCSCPage>()) {
+      for (const auto &batch : dmat->GetBatches<SortedCSCPage>(ctx_)) {
         auto page = batch.GetView();
         for (auto i = 0u; i < batch.Size(); i++) {
           column_size[i] += page[i].size();
@@ -467,7 +467,7 @@ class ColMaker: public TreeUpdater {
       auto evaluator = tree_evaluator_.GetEvaluator();
 
       auto feat_set = column_sampler_.GetFeatureSet(depth);
-      for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>()) {
+      for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>(ctx_)) {
         this->UpdateSolution(batch, feat_set->HostVector(), gpair, p_fmat);
       }
       // after this each thread's stemp will get the best candidates, aggregate results
@@ -546,7 +546,7 @@ class ColMaker: public TreeUpdater {
       }
       std::sort(fsplits.begin(), fsplits.end());
       fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
-      for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>()) {
+      for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>(ctx_)) {
         auto page = batch.GetView();
         for (auto fid : fsplits) {
           auto col = page[fid];
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index fcd4d4ef2..13cc5cc65 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -219,7 +219,7 @@ struct GPUHistMakerDevice {
         column_sampler(column_sampler_seed),
         interaction_constraints(param, n_features),
         batch_param(std::move(_batch_param)) {
-    sampler.reset(new GradientBasedSampler(page, _n_rows, batch_param, param.subsample,
+    sampler.reset(new GradientBasedSampler(ctx, page, _n_rows, batch_param, param.subsample,
                                            param.sampling_method));
     if (!param.monotone_constraints.empty()) {
       // Copy assigning an empty vector causes an exception in MSVC debug builds
@@ -275,7 +275,7 @@ struct GPUHistMakerDevice {
         dh_gpair->Size() * sizeof(GradientPair), hipMemcpyDeviceToDevice));
 #endif
 
-    auto sample = sampler->Sample(dh::ToSpan(d_gpair), dmat);
+    auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
     page = sample.page;
     gpair = sample.gpair;
 
@@ -872,11 +872,8 @@ class GPUHistMaker : public TreeUpdater {
     uint32_t column_sampling_seed = common::GlobalRandom()();
     collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
 
-    BatchParam batch_param{
-        ctx_->gpu_id,
-        param->max_bin,
-    };
-    auto page = (*dmat->GetBatches<EllpackPage>(batch_param).begin()).Impl();
+    auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
+    auto page = (*dmat->GetBatches<EllpackPage>(ctx_, batch_param).begin()).Impl();
 #if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
 #elif defined(XGBOOST_USE_HIP)
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index 4906a21b7..f0dd3dd12 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -134,7 +134,7 @@ class MultiTargetHistBuilder {
                       std::vector<MultiExpandEntry> const &applied) {
     monitor_->Start(__func__);
     std::size_t page_id{0};
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(this->param_))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(this->param_))) {
       this->partitioner_.at(page_id).UpdatePosition(this->ctx_, page, applied, p_tree);
       page_id++;
     }
@@ -152,7 +152,7 @@ class MultiTargetHistBuilder {
     std::size_t page_id = 0;
     bst_bin_t n_total_bins = 0;
     partitioner_.clear();
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       if (n_total_bins == 0) {
         n_total_bins = page.cut.TotalBins();
       } else {
@@ -206,7 +206,7 @@ class MultiTargetHistBuilder {
     std::vector<MultiExpandEntry> nodes{best};
     std::size_t i = 0;
     auto space = ConstructHistSpace(partitioner_, nodes);
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       for (bst_target_t t{0}; t < n_targets; ++t) {
         auto t_gpair = gpair.Slice(linalg::All(), t);
         histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
@@ -225,7 +225,7 @@ class MultiTargetHistBuilder {
     for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
       hists.push_back(&histogram_builder_[t].Histogram());
     }
-    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, &nodes);
       break;
     }
@@ -263,7 +263,7 @@ class MultiTargetHistBuilder {
 
     std::size_t i = 0;
     auto space = ConstructHistSpace(partitioner_, nodes_to_build);
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       for (std::size_t t = 0; t < p_tree->NumTargets(); ++t) {
         auto t_gpair = gpair.Slice(linalg::All(), t);
         // Make sure the gradient matrix is f-order.
@@ -283,7 +283,7 @@ class MultiTargetHistBuilder {
     for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
       hists.push_back(&histogram_builder_[t].Histogram());
     }
-    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, best_splits);
       break;
     }
@@ -294,6 +294,7 @@ class MultiTargetHistBuilder {
                      std::vector<bst_node_t> *p_out_position) {
     monitor_->Start(__func__);
     if (!task_->UpdateTreeLeaf()) {
+      monitor_->Stop(__func__);
       return;
     }
     for (auto const &part : partitioner_) {
@@ -382,7 +383,7 @@ class HistBuilder {
     std::size_t page_id{0};
     bst_bin_t n_total_bins{0};
     partitioner_.clear();
-    for (auto const &page : fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &page : fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       if (n_total_bins == 0) {
         n_total_bins = page.cut.TotalBins();
       } else {
@@ -397,6 +398,7 @@ class HistBuilder {
     evaluator_ = std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx_, this->param_, fmat->Info(),
                                                                  col_sampler_);
     p_last_tree_ = p_tree;
+    monitor_->Stop(__func__);
   }
 
   void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
@@ -404,7 +406,7 @@ class HistBuilder {
     monitor_->Start(__func__);
     auto const &histograms = histogram_builder_->Histogram();
     auto ft = p_fmat->Info().feature_types.ConstHostSpan();
-    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       evaluator_->EvaluateSplits(histograms, gmat.cut, ft, *p_tree, best_splits);
       break;
     }
@@ -421,7 +423,7 @@ class HistBuilder {
 
     std::size_t page_id = 0;
     auto space = ConstructHistSpace(partitioner_, {node});
-    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       std::vector<CPUExpandEntry> nodes_to_build{node};
       std::vector<CPUExpandEntry> nodes_to_sub;
       this->histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
@@ -437,7 +439,7 @@ class HistBuilder {
          * Specialized code for dense data: For dense data (with no missing value), the sum
          * of gradient histogram is equal to snode[nid]
          */
-        auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_)).begin());
+        auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_)).begin());
         std::vector<std::uint32_t> const &row_ptr = gmat.cut.Ptrs();
         CHECK_GE(row_ptr.size(), 2);
         std::uint32_t const ibegin = row_ptr[0];
@@ -465,7 +467,7 @@ class HistBuilder {
       std::vector<CPUExpandEntry> entries{node};
       monitor_->Start("EvaluateSplits");
       auto ft = p_fmat->Info().feature_types.ConstHostSpan();
-      for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+      for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
         evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree,
                                    &entries);
         break;
@@ -501,7 +503,7 @@ class HistBuilder {
 
     std::size_t page_id{0};
     auto space = ConstructHistSpace(partitioner_, nodes_to_build);
-    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
                                     partitioner_.at(page_id).Partitions(), nodes_to_build,
                                     nodes_to_sub, gpair.Values());
@@ -513,7 +515,7 @@ class HistBuilder {
                       std::vector<CPUExpandEntry> const &applied) {
     monitor_->Start(__func__);
     std::size_t page_id{0};
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(this->param_))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       this->partitioner_.at(page_id).UpdatePosition(this->ctx_, page, applied, p_tree);
       page_id++;
     }
diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc
index 17c565490..448492de0 100644
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -79,7 +79,7 @@ class TreeRefresher : public TreeUpdater {
                      dmlc::BeginPtr(stemp[tid]) + offset);
             offset += tree->NumNodes();
           }
-          feats.Drop(inst);
+          feats.Drop();
         });
       }
       // aggregate the statistics
diff --git a/tests/buildkite/build-cpu-arm64.sh b/tests/buildkite/build-cpu-arm64.sh
index 1a95a880a..fd00a7971 100755
--- a/tests/buildkite/build-cpu-arm64.sh
+++ b/tests/buildkite/build-cpu-arm64.sh
@@ -18,7 +18,7 @@ $command_wrapper bash -c "cd build && ctest --extra-verbose"
 
 echo "--- Build binary wheel"
 $command_wrapper bash -c \
-  "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
+  "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/"
 $command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \
   ${BUILDKITE_COMMIT} ${WHEEL_TAG}
 
diff --git a/tests/buildkite/build-cuda.sh b/tests/buildkite/build-cuda.sh
index b25345b1b..c180695e8 100755
--- a/tests/buildkite/build-cuda.sh
+++ b/tests/buildkite/build-cuda.sh
@@ -27,7 +27,7 @@ $command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc
   -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag}
 echo "--- Build binary wheel"
 $command_wrapper bash -c \
-  "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
+  "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/"
 $command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \
   ${BUILDKITE_COMMIT} ${WHEEL_TAG}
 
diff --git a/tests/buildkite/build-win64-gpu.ps1 b/tests/buildkite/build-win64-gpu.ps1
index 05d7aefb9..32cd2806a 100644
--- a/tests/buildkite/build-win64-gpu.ps1
+++ b/tests/buildkite/build-win64-gpu.ps1
@@ -24,21 +24,17 @@ if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
 Write-Host "--- Build binary wheel"
 cd ../python-package
 conda activate
-& python setup.py bdist_wheel --universal
+& pip install --user -v "pip>=23"
+& pip --version
+& pip wheel --no-deps -v . --wheel-dir dist/
 Get-ChildItem . -Filter dist/*.whl |
 Foreach-Object {
   & python ../tests/ci_build/rename_whl.py $_.FullName $Env:BUILDKITE_COMMIT win_amd64
   if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
 }
 
-Write-Host "--- Insert vcomp140.dll (OpenMP runtime) into the wheel"
-cd dist
-Copy-Item -Path ../../tests/ci_build/insert_vcomp140.py -Destination .
-& python insert_vcomp140.py *.whl
-if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
-
 Write-Host "--- Upload Python wheel"
-cd ../..
+cd ..
 Get-ChildItem . -Filter python-package/dist/*.whl |
 Foreach-Object {
   & buildkite-agent artifact upload python-package/dist/$_
diff --git a/tests/ci_build/build_python_wheels.sh b/tests/ci_build/build_python_wheels.sh
index d91df2286..205b3b695 100644
--- a/tests/ci_build/build_python_wheels.sh
+++ b/tests/ci_build/build_python_wheels.sh
@@ -26,7 +26,7 @@ if [[ "$platform_id" == macosx_* ]]; then
         # cibuildwheel will take care of cross-compilation.
         wheel_tag=macosx_12_0_arm64
         cpython_ver=38
-        setup_env_var='CIBW_TARGET_OSX_ARM64=1'  # extra flag to be passed to setup.py
+        setup_env_var='CIBW_TARGET_OSX_ARM64=1'  # extra flag to be passed to xgboost.packager backend
         export PYTHON_CROSSENV=1
         export MACOSX_DEPLOYMENT_TARGET=12.0
         #OPENMP_URL="https://anaconda.org/conda-forge/llvm-openmp/11.1.0/download/osx-arm64/llvm-openmp-11.1.0-hf3c4609_1.tar.bz2"
diff --git a/tests/ci_build/change_version.py b/tests/ci_build/change_version.py
index 62cb894dc..25561859c 100644
--- a/tests/ci_build/change_version.py
+++ b/tests/ci_build/change_version.py
@@ -40,14 +40,24 @@ def pypkg(
     major: int, minor: int, patch: int, rc: int, is_rc: bool, is_dev: bool
 ) -> None:
     version = f"{major}.{minor}.{patch}"
-    pyver_path = os.path.join("xgboost", "VERSION")
     pyver = version
     if is_rc:
         pyver = pyver + f"rc{rc}"
     if is_dev:
         pyver = pyver + "-dev"
+
+    pyver_path = os.path.join("xgboost", "VERSION")
     with open(pyver_path, "w") as fd:
-        fd.write(pyver)
+        fd.write(pyver + "\n")
+
+    pyprj_path = os.path.join("pyproject.toml")
+    with open(pyprj_path, "r") as fd:
+        pyprj = fd.read()
+    matched = re.search('version = "' + r"([0-9]+\.[0-9]+\.[0-9]+.*)" + '"', pyprj)
+    assert matched, "Couldn't find version string in pyproject.toml."
+    pyprj = pyprj[: matched.start(1)] + pyver + pyprj[matched.end(1) :]
+    with open(pyprj_path, "w") as fd:
+        fd.write(pyprj)
 
 
 @cd(R_PACKAGE)
diff --git a/tests/ci_build/conda_env/python_lint.yml b/tests/ci_build/conda_env/python_lint.yml
index a64f649a2..3d42dfaf3 100644
--- a/tests/ci_build/conda_env/python_lint.yml
+++ b/tests/ci_build/conda_env/python_lint.yml
@@ -18,6 +18,7 @@ dependencies:
 - cloudpickle
 - pytest
 - hypothesis
+- hatchling
 - pip:
   # TODO: Replace it with pyspark>=3.4 once 3.4 released.
   - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz
diff --git a/tests/ci_build/conda_env/sdist_test.yml b/tests/ci_build/conda_env/sdist_test.yml
index acc4607ad..67a9324f7 100644
--- a/tests/ci_build/conda_env/sdist_test.yml
+++ b/tests/ci_build/conda_env/sdist_test.yml
@@ -8,5 +8,6 @@ dependencies:
 - wheel
 - cmake
 - ninja
+- python-build
 - c-compiler
 - cxx-compiler
diff --git a/tests/ci_build/insert_vcomp140.py b/tests/ci_build/insert_vcomp140.py
deleted file mode 100644
index cfa8d792d..000000000
--- a/tests/ci_build/insert_vcomp140.py
+++ /dev/null
@@ -1,102 +0,0 @@
-import argparse
-import base64
-import glob
-import hashlib
-import os
-import pathlib
-import re
-import shutil
-import tempfile
-
-VCOMP140_PATH = "C:\\Windows\\System32\\vcomp140.dll"
-
-
-def get_sha256sum(path):
-    return (
-        base64.urlsafe_b64encode(hashlib.sha256(open(path, "rb").read()).digest())
-        .decode("latin1")
-        .rstrip("=")
-    )
-
-
-def update_record(*, wheel_content_dir, xgboost_version):
-    vcomp140_size = os.path.getsize(VCOMP140_PATH)
-    vcomp140_hash = get_sha256sum(VCOMP140_PATH)
-
-    record_path = wheel_content_dir / pathlib.Path(
-        f"xgboost-{xgboost_version}.dist-info/RECORD"
-    )
-    with open(record_path, "r") as f:
-        record_content = f.read()
-    record_content += f"xgboost-{xgboost_version}.data/data/xgboost/vcomp140.dll,"
-    record_content += f"sha256={vcomp140_hash},{vcomp140_size}\n"
-    with open(record_path, "w") as f:
-        f.write(record_content)
-
-
-def main(args):
-    candidates = list(sorted(glob.glob(args.wheel_path)))
-    for wheel_path in candidates:
-        print(f"Processing wheel {wheel_path}")
-        m = re.search(r"xgboost-(.*)\+.*-py3", wheel_path)
-        if not m:
-            raise ValueError(f"Wheel {wheel_path} has unexpected name")
-        version = m.group(1)
-        print(f"  Detected version for {wheel_path}: {version}")
-        print(f"  Inserting vcomp140.dll into {wheel_path}...")
-        with tempfile.TemporaryDirectory() as tempdir:
-            wheel_content_dir = pathlib.Path(tempdir) / "wheel_content"
-            print(f"    Extract {wheel_path} into {wheel_content_dir}")
-            shutil.unpack_archive(
-                wheel_path, extract_dir=wheel_content_dir, format="zip"
-            )
-            data_dir = wheel_content_dir / pathlib.Path(
-                f"xgboost-{version}.data/data/xgboost"
-            )
-            data_dir.mkdir(parents=True, exist_ok=True)
-
-            print(f"    Copy {VCOMP140_PATH} -> {data_dir}")
-            shutil.copy(VCOMP140_PATH, data_dir)
-
-            print(f"    Update RECORD")
-            update_record(wheel_content_dir=wheel_content_dir, xgboost_version=version)
-
-            print(f"    Content of {wheel_content_dir}:")
-            for e in sorted(wheel_content_dir.rglob("*")):
-                if e.is_file():
-                    r = e.relative_to(wheel_content_dir)
-                    print(f"      {r}")
-
-            print(f"    Create new wheel...")
-            new_wheel_tmp_path = pathlib.Path(tempdir) / "new_wheel"
-            shutil.make_archive(
-                str(new_wheel_tmp_path.resolve()),
-                format="zip",
-                root_dir=wheel_content_dir,
-            )
-            new_wheel_tmp_path = new_wheel_tmp_path.resolve().with_suffix(".zip")
-            new_wheel_tmp_path = new_wheel_tmp_path.rename(
-                new_wheel_tmp_path.with_suffix(".whl")
-            )
-            print(f"    Created new wheel {new_wheel_tmp_path}")
-
-            # Rename the old wheel with suffix .bak
-            # The new wheel takes the name of the old wheel
-            wheel_path_obj = pathlib.Path(wheel_path).resolve()
-            backup_path = wheel_path_obj.with_suffix(".whl.bak")
-            print(f"    Rename {wheel_path_obj} -> {backup_path}")
-            wheel_path_obj.replace(backup_path)
-            print(f"    Rename {new_wheel_tmp_path} -> {wheel_path_obj}")
-            new_wheel_tmp_path.replace(wheel_path_obj)
-
-            shutil.rmtree(wheel_content_dir)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "wheel_path", type=str, help="Path to wheel (wildcard permitted)"
-    )
-    args = parser.parse_args()
-
-    main(args)
diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py
index 00791e19d..3f553da9f 100644
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -198,7 +198,7 @@ def main(args: argparse.Namespace) -> None:
             run_mypy(path)
             for path in [
                 # core
-                "python-package/xgboost/",
+                "python-package/",
                 # demo
                 "demo/json-model/json_parser.py",
                 "demo/guide-python/external_memory.py",
diff --git a/tests/ci_build/test_python.sh b/tests/ci_build/test_python.sh
index 7375b4c9f..a70b27961 100755
--- a/tests/ci_build/test_python.sh
+++ b/tests/ci_build/test_python.sh
@@ -28,7 +28,7 @@ function install_xgboost {
   then
     pushd .
     cd python-package
-    python setup.py install --user
+    pip install --user -v .
     popd
   fi
 }
diff --git a/tests/cpp/common/test_column_matrix.cc b/tests/cpp/common/test_column_matrix.cc
index de7b9a258..b49350702 100644
--- a/tests/cpp/common/test_column_matrix.cc
+++ b/tests/cpp/common/test_column_matrix.cc
@@ -14,11 +14,12 @@ TEST(DenseColumn, Test) {
   int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
                             static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
                             static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   BinTypeSize last{kUint8BinsTypeSize};
   for (int32_t max_num_bin : max_num_bins) {
     auto dmat = RandomDataGenerator(100, 10, 0.0).GenerateDMatrix();
     auto sparse_thresh = 0.2;
-    GHistIndexMatrix gmat{dmat.get(), max_num_bin, sparse_thresh, false, AllThreadsForTest()};
+    GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, sparse_thresh, false};
     ColumnMatrix column_matrix;
     for (auto const& page : dmat->GetBatches<SparsePage>()) {
       column_matrix.InitFromSparse(page, gmat, sparse_thresh, AllThreadsForTest());
@@ -62,9 +63,10 @@ TEST(SparseColumn, Test) {
   int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
                             static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
                             static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   for (int32_t max_num_bin : max_num_bins) {
     auto dmat = RandomDataGenerator(100, 1, 0.85).GenerateDMatrix();
-    GHistIndexMatrix gmat{dmat.get(), max_num_bin, 0.5f, false, AllThreadsForTest()};
+    GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, 0.5f, false};
     ColumnMatrix column_matrix;
     for (auto const& page : dmat->GetBatches<SparsePage>()) {
       column_matrix.InitFromSparse(page, gmat, 1.0, AllThreadsForTest());
@@ -90,9 +92,10 @@ TEST(DenseColumnWithMissing, Test) {
   int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
                             static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
                             static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   for (int32_t max_num_bin : max_num_bins) {
     auto dmat = RandomDataGenerator(100, 1, 0.5).GenerateDMatrix();
-    GHistIndexMatrix gmat(dmat.get(), max_num_bin, 0.2, false, AllThreadsForTest());
+    GHistIndexMatrix gmat(&ctx, dmat.get(), max_num_bin, 0.2, false);
     ColumnMatrix column_matrix;
     for (auto const& page : dmat->GetBatches<SparsePage>()) {
       column_matrix.InitFromSparse(page, gmat, 0.2, AllThreadsForTest());
diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc
index 41c728f35..69ec2cc82 100644
--- a/tests/cpp/common/test_hist_util.cc
+++ b/tests/cpp/common/test_hist_util.cc
@@ -156,6 +156,7 @@ TEST(CutsBuilder, SearchGroupInd) {
 }
 
 TEST(HistUtil, DenseCutsCategorical) {
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
    int categorical_sizes[] = {2, 6, 8, 12};
    int num_bins = 256;
    int sizes[] = {25, 100, 1000};
@@ -165,7 +166,7 @@ TEST(HistUtil, DenseCutsCategorical) {
        std::vector<float> x_sorted(x);
        std::sort(x_sorted.begin(), x_sorted.end());
        auto dmat = GetDMatrixFromData(x, n, 1);
-       HistogramCuts cuts = SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest());
+       HistogramCuts cuts = SketchOnDMatrix(&ctx, dmat.get(), num_bins);
        auto cuts_from_sketch = cuts.Values();
        EXPECT_LT(cuts.MinValues()[0], x_sorted.front());
        EXPECT_GT(cuts_from_sketch.front(), x_sorted.front());
@@ -176,6 +177,7 @@ TEST(HistUtil, DenseCutsCategorical) {
 }
 
 TEST(HistUtil, DenseCutsAccuracyTest) {
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   int bin_sizes[] = {2, 16, 256, 512};
   int sizes[] = {100};
   int num_columns = 5;
@@ -183,7 +185,7 @@ TEST(HistUtil, DenseCutsAccuracyTest) {
     auto x = GenerateRandom(num_rows, num_columns);
     auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
     for (auto num_bins : bin_sizes) {
-      HistogramCuts cuts = SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest());
+      HistogramCuts cuts = SketchOnDMatrix(&ctx, dmat.get(), num_bins);
       ValidateCuts(cuts, dmat.get(), num_bins);
     }
   }
@@ -193,6 +195,7 @@ TEST(HistUtil, DenseCutsAccuracyTestWeights) {
   int bin_sizes[] = {2, 16, 256, 512};
   int sizes[] = {100, 1000, 1500};
   int num_columns = 5;
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   for (auto num_rows : sizes) {
     auto x = GenerateRandom(num_rows, num_columns);
     auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
@@ -200,11 +203,11 @@ TEST(HistUtil, DenseCutsAccuracyTestWeights) {
     dmat->Info().weights_.HostVector() = w;
     for (auto num_bins : bin_sizes) {
       {
-        HistogramCuts cuts = SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest(), true);
+        HistogramCuts cuts = SketchOnDMatrix(&ctx, dmat.get(), num_bins, true);
         ValidateCuts(cuts, dmat.get(), num_bins);
       }
       {
-        HistogramCuts cuts = SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest(), false);
+        HistogramCuts cuts = SketchOnDMatrix(&ctx, dmat.get(), num_bins, false);
         ValidateCuts(cuts, dmat.get(), num_bins);
       }
     }
@@ -215,6 +218,7 @@ void TestQuantileWithHessian(bool use_sorted) {
   int bin_sizes[] = {2, 16, 256, 512};
   int sizes[] = {1000, 1500};
   int num_columns = 5;
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   for (auto num_rows : sizes) {
     auto x = GenerateRandom(num_rows, num_columns);
     auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
@@ -225,15 +229,13 @@ void TestQuantileWithHessian(bool use_sorted) {
     dmat->Info().weights_.HostVector() = w;
 
     for (auto num_bins : bin_sizes) {
-      HistogramCuts cuts_hess =
-          SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest(), use_sorted, hessian);
+      HistogramCuts cuts_hess = SketchOnDMatrix(&ctx, dmat.get(), num_bins, use_sorted, hessian);
       for (size_t i = 0; i < w.size(); ++i) {
         dmat->Info().weights_.HostVector()[i] = w[i] * hessian[i];
       }
       ValidateCuts(cuts_hess, dmat.get(), num_bins);
 
-      HistogramCuts cuts_wh =
-          SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest(), use_sorted);
+      HistogramCuts cuts_wh = SketchOnDMatrix(&ctx, dmat.get(), num_bins, use_sorted);
       ValidateCuts(cuts_wh, dmat.get(), num_bins);
 
       ASSERT_EQ(cuts_hess.Values().size(), cuts_wh.Values().size());
@@ -255,12 +257,13 @@ TEST(HistUtil, DenseCutsExternalMemory) {
   int bin_sizes[] = {2, 16, 256, 512};
   int sizes[] = {100, 1000, 1500};
   int num_columns = 5;
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   for (auto num_rows : sizes) {
     auto x = GenerateRandom(num_rows, num_columns);
     dmlc::TemporaryDirectory tmpdir;
     auto dmat = GetExternalMemoryDMatrixFromData(x, num_rows, num_columns, tmpdir);
     for (auto num_bins : bin_sizes) {
-      HistogramCuts cuts = SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest());
+      HistogramCuts cuts = SketchOnDMatrix(&ctx, dmat.get(), num_bins);
       ValidateCuts(cuts, dmat.get(), num_bins);
     }
   }
@@ -275,12 +278,12 @@ TEST(HistUtil, IndexBinBound) {
                                            kUint32BinsTypeSize};
   size_t constexpr kRows = 100;
   size_t constexpr kCols = 10;
-
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   size_t bin_id = 0;
   for (auto max_bin : bin_sizes) {
     auto p_fmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
 
-    GHistIndexMatrix hmat(p_fmat.get(), max_bin, 0.5, false, AllThreadsForTest());
+    GHistIndexMatrix hmat(&ctx, p_fmat.get(), max_bin, 0.5, false);
     EXPECT_EQ(hmat.index.Size(), kRows*kCols);
     EXPECT_EQ(expected_bin_type_sizes[bin_id++], hmat.index.GetBinTypeSize());
   }
@@ -300,10 +303,11 @@ TEST(HistUtil, IndexBinData) {
                                      static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2 };
   size_t constexpr kRows = 100;
   size_t constexpr kCols = 10;
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
 
   for (auto max_bin : kBinSizes) {
     auto p_fmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
-    GHistIndexMatrix hmat(p_fmat.get(), max_bin, 0.5, false, AllThreadsForTest());
+    GHistIndexMatrix hmat(&ctx, p_fmat.get(), max_bin, 0.5, false);
     uint32_t const* offsets = hmat.index.Offset();
     EXPECT_EQ(hmat.index.Size(), kRows*kCols);
     switch (max_bin) {
@@ -327,10 +331,10 @@ void TestSketchFromWeights(bool with_group) {
   size_t constexpr kRows = 300, kCols = 20, kBins = 256;
   size_t constexpr kGroups = 10;
   auto m = RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateDMatrix();
-  common::HistogramCuts cuts = SketchOnDMatrix(m.get(), kBins, AllThreadsForTest());
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+  common::HistogramCuts cuts = SketchOnDMatrix(&ctx, m.get(), kBins);
 
   MetaInfo info;
-  Context ctx;
   auto& h_weights = info.weights_.HostVector();
   if (with_group) {
     h_weights.resize(kGroups);
@@ -363,7 +367,7 @@ void TestSketchFromWeights(bool with_group) {
 
   if (with_group) {
     m->Info().weights_ = decltype(m->Info().weights_)();  // remove weight
-    HistogramCuts non_weighted = SketchOnDMatrix(m.get(), kBins, AllThreadsForTest());
+    HistogramCuts non_weighted = SketchOnDMatrix(&ctx, m.get(), kBins);
     for (size_t i = 0; i < cuts.Values().size(); ++i) {
       EXPECT_EQ(cuts.Values()[i], non_weighted.Values()[i]);
     }
@@ -382,7 +386,7 @@ void TestSketchFromWeights(bool with_group) {
     for (size_t i = 0; i < h_weights.size(); ++i) {
       h_weights[i] = static_cast<float>(i + 1) / static_cast<float>(kGroups);
     }
-    HistogramCuts weighted = SketchOnDMatrix(m.get(), kBins, AllThreadsForTest());
+    HistogramCuts weighted = SketchOnDMatrix(&ctx, m.get(), kBins);
     ValidateCuts(weighted, m.get(), kBins);
   }
 }
@@ -393,11 +397,12 @@ TEST(HistUtil, SketchFromWeights) {
 }
 
 TEST(HistUtil, SketchCategoricalFeatures) {
-  TestCategoricalSketch(1000, 256, 32, false, [](DMatrix* p_fmat, int32_t num_bins) {
-    return SketchOnDMatrix(p_fmat, num_bins, AllThreadsForTest());
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+  TestCategoricalSketch(1000, 256, 32, false, [&ctx](DMatrix* p_fmat, int32_t num_bins) {
+    return SketchOnDMatrix(&ctx, p_fmat, num_bins);
   });
-  TestCategoricalSketch(1000, 256, 32, true, [](DMatrix* p_fmat, int32_t num_bins) {
-    return SketchOnDMatrix(p_fmat, num_bins, AllThreadsForTest());
+  TestCategoricalSketch(1000, 256, 32, true, [&ctx](DMatrix* p_fmat, int32_t num_bins) {
+    return SketchOnDMatrix(&ctx, p_fmat, num_bins);
   });
 }
 }  // namespace common
diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu
index b91cf0b33..4f8bc3975 100644
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -25,9 +25,9 @@ namespace xgboost {
 namespace common {
 
 template <typename AdapterT>
-HistogramCuts GetHostCuts(AdapterT *adapter, int num_bins, float missing) {
+HistogramCuts GetHostCuts(Context const* ctx, AdapterT* adapter, int num_bins, float missing) {
   data::SimpleDMatrix dmat(adapter, missing, 1);
-  HistogramCuts cuts = SketchOnDMatrix(&dmat, num_bins, AllThreadsForTest());
+  HistogramCuts cuts = SketchOnDMatrix(ctx, &dmat, num_bins);
   return cuts;
 }
 
@@ -39,7 +39,9 @@ TEST(HistUtil, DeviceSketch) {
   auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
 
   auto device_cuts = DeviceSketch(0, dmat.get(), num_bins);
-  HistogramCuts host_cuts = SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest());
+
+  Context ctx;
+  HistogramCuts host_cuts = SketchOnDMatrix(&ctx, dmat.get(), num_bins);
 
   EXPECT_EQ(device_cuts.Values(), host_cuts.Values());
   EXPECT_EQ(device_cuts.Ptrs(), host_cuts.Ptrs());
@@ -314,7 +316,8 @@ TEST(HistUtil, AdapterDeviceSketch) {
   data::CupyAdapter adapter(str);
 
   auto device_cuts = MakeUnweightedCutsForTest(adapter, num_bins, missing);
-  auto host_cuts = GetHostCuts(&adapter, num_bins, missing);
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+  auto host_cuts = GetHostCuts(&ctx, &adapter, num_bins, missing);
 
   EXPECT_EQ(device_cuts.Values(), host_cuts.Values());
   EXPECT_EQ(device_cuts.Ptrs(), host_cuts.Ptrs());
diff --git a/tests/cpp/common/test_hist_util.h b/tests/cpp/common/test_hist_util.h
index f368dfd5a..d31df0811 100644
--- a/tests/cpp/common/test_hist_util.h
+++ b/tests/cpp/common/test_hist_util.h
@@ -88,7 +88,8 @@ inline std::shared_ptr<DMatrix> GetExternalMemoryDMatrixFromData(
     fo << row_data.str() << "\n";
   }
   fo.close();
-  return std::shared_ptr<DMatrix>(DMatrix::Load(tmp_file + "#" + tmp_file + ".cache"));
+  return std::shared_ptr<DMatrix>(
+      DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache"));
 }
 
 // Test that elements are approximately equally distributed among bins
diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc
index 3cd32ea0c..a65969a6c 100644
--- a/tests/cpp/common/test_quantile.cc
+++ b/tests/cpp/common/test_quantile.cc
@@ -16,7 +16,8 @@ TEST(Quantile, LoadBalance) {
   size_t constexpr kRows = 1000, kCols = 100;
   auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
   std::vector<bst_feature_t> cols_ptr;
-  for (auto const& page : m->GetBatches<SparsePage>()) {
+  Context ctx;
+  for (auto const& page : m->GetBatches<SparsePage>(&ctx)) {
     data::SparsePageAdapterBatch adapter{page.GetView()};
     cols_ptr = LoadBalance(adapter, page.data.Size(), kCols, 13, [](auto) { return true; });
   }
@@ -43,6 +44,7 @@ void PushPage(HostSketchContainer* container, SparsePage const& page, MetaInfo c
 
 template <bool use_column>
 void DoTestDistributedQuantile(size_t rows, size_t cols) {
+  Context ctx;
   auto const world = collective::GetWorldSize();
   std::vector<MetaInfo> infos(2);
   auto& h_weights = infos.front().weights_.HostVector();
@@ -51,7 +53,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) {
   SimpleRealUniformDistribution<float> dist(3, 1000);
   std::generate(h_weights.begin(), h_weights.end(), [&]() { return dist(&lcg); });
   std::vector<bst_row_t> column_size(cols, rows);
-  size_t n_bins = 64;
+  bst_bin_t n_bins = 64;
 
   // Generate cuts for distributed environment.
   auto sparsity = 0.5f;
@@ -72,29 +74,29 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) {
   std::vector<float> hessian(rows, 1.0);
   auto hess = Span<float const>{hessian};
 
-  ContainerType<use_column> sketch_distributed(n_bins, m->Info().feature_types.ConstHostSpan(),
-                                               column_size, false, false, AllThreadsForTest());
+  ContainerType<use_column> sketch_distributed(
+      &ctx, n_bins, m->Info().feature_types.ConstHostSpan(), column_size, false);
 
   if (use_column) {
-    for (auto const& page : m->GetBatches<SortedCSCPage>()) {
+    for (auto const& page : m->GetBatches<SortedCSCPage>(&ctx)) {
       PushPage(&sketch_distributed, page, m->Info(), hess);
     }
   } else {
-    for (auto const& page : m->GetBatches<SparsePage>()) {
+    for (auto const& page : m->GetBatches<SparsePage>(&ctx)) {
       PushPage(&sketch_distributed, page, m->Info(), hess);
     }
   }
 
   HistogramCuts distributed_cuts;
-  sketch_distributed.MakeCuts(&distributed_cuts);
+  sketch_distributed.MakeCuts(m->Info(), &distributed_cuts);
 
   // Generate cuts for single node environment
   collective::Finalize();
   CHECK_EQ(collective::GetWorldSize(), 1);
   std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; });
   m->Info().num_row_ = world * rows;
-  ContainerType<use_column> sketch_on_single_node(n_bins, m->Info().feature_types.ConstHostSpan(),
-                                                  column_size, false, false, AllThreadsForTest());
+  ContainerType<use_column> sketch_on_single_node(
+      &ctx, n_bins, m->Info().feature_types.ConstHostSpan(), column_size, false);
   m->Info().num_row_ = rows;
 
   for (auto rank = 0; rank < world; ++rank) {
@@ -106,7 +108,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) {
                  .Upper(1.0f)
                  .GenerateDMatrix();
     if (use_column) {
-      for (auto const& page : m->GetBatches<SortedCSCPage>()) {
+      for (auto const& page : m->GetBatches<SortedCSCPage>(&ctx)) {
         PushPage(&sketch_on_single_node, page, m->Info(), hess);
       }
     } else {
@@ -117,7 +119,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) {
   }
 
   HistogramCuts single_node_cuts;
-  sketch_on_single_node.MakeCuts(&single_node_cuts);
+  sketch_on_single_node.MakeCuts(m->Info(), &single_node_cuts);
 
   auto const& sptrs = single_node_cuts.Ptrs();
   auto const& dptrs = distributed_cuts.Ptrs();
@@ -172,6 +174,7 @@ TEST(Quantile, SortedDistributed) {
 namespace {
 template <bool use_column>
 void DoTestColSplitQuantile(size_t rows, size_t cols) {
+  Context ctx;
   auto const world = collective::GetWorldSize();
   auto const rank = collective::GetRank();
 
@@ -204,22 +207,22 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) {
   // Generate cuts for distributed environment.
   HistogramCuts distributed_cuts;
   {
-    ContainerType<use_column> sketch_distributed(n_bins, m->Info().feature_types.ConstHostSpan(),
-                                                 column_size, false, true, AllThreadsForTest());
+    ContainerType<use_column> sketch_distributed(
+        &ctx, n_bins, m->Info().feature_types.ConstHostSpan(), column_size, false);
 
     std::vector<float> hessian(rows, 1.0);
     auto hess = Span<float const>{hessian};
     if (use_column) {
-      for (auto const& page : m->GetBatches<SortedCSCPage>()) {
+      for (auto const& page : m->GetBatches<SortedCSCPage>(&ctx)) {
         PushPage(&sketch_distributed, page, m->Info(), hess);
       }
     } else {
-      for (auto const& page : m->GetBatches<SparsePage>()) {
+      for (auto const& page : m->GetBatches<SparsePage>(&ctx)) {
         PushPage(&sketch_distributed, page, m->Info(), hess);
       }
     }
 
-    sketch_distributed.MakeCuts(&distributed_cuts);
+    sketch_distributed.MakeCuts(m->Info(), &distributed_cuts);
   }
 
   // Generate cuts for single node environment
@@ -227,22 +230,22 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) {
   CHECK_EQ(collective::GetWorldSize(), 1);
   HistogramCuts single_node_cuts;
   {
-    ContainerType<use_column> sketch_on_single_node(n_bins, m->Info().feature_types.ConstHostSpan(),
-                                                    column_size, false, false, AllThreadsForTest());
+    ContainerType<use_column> sketch_on_single_node(
+        &ctx, n_bins, m->Info().feature_types.ConstHostSpan(), column_size, false);
 
     std::vector<float> hessian(rows, 1.0);
     auto hess = Span<float const>{hessian};
     if (use_column) {
-      for (auto const& page : m->GetBatches<SortedCSCPage>()) {
+      for (auto const& page : m->GetBatches<SortedCSCPage>(&ctx)) {
         PushPage(&sketch_on_single_node, page, m->Info(), hess);
       }
     } else {
-      for (auto const& page : m->GetBatches<SparsePage>()) {
+      for (auto const& page : m->GetBatches<SparsePage>(&ctx)) {
         PushPage(&sketch_on_single_node, page, m->Info(), hess);
       }
     }
 
-    sketch_on_single_node.MakeCuts(&single_node_cuts);
+    sketch_on_single_node.MakeCuts(m->Info(), &single_node_cuts);
   }
 
   auto const& sptrs = single_node_cuts.Ptrs();
@@ -299,8 +302,10 @@ namespace {
 void TestSameOnAllWorkers() {
   auto const world = collective::GetWorldSize();
   constexpr size_t kRows = 1000, kCols = 100;
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+
   RunWithSeedsAndBins(
-      kRows, [=](int32_t seed, size_t n_bins, MetaInfo const&) {
+      kRows, [=, &ctx](int32_t seed, size_t n_bins, MetaInfo const&) {
         auto rank = collective::GetRank();
         HostDeviceVector<float> storage;
         std::vector<FeatureType> ft(kCols);
@@ -314,7 +319,7 @@ void TestSameOnAllWorkers() {
                      .MaxCategory(17)
                      .Seed(rank + seed)
                      .GenerateDMatrix();
-        auto cuts = SketchOnDMatrix(m.get(), n_bins, AllThreadsForTest());
+        auto cuts = SketchOnDMatrix(&ctx, m.get(), n_bins);
         std::vector<float> cut_values(cuts.Values().size() * world, 0);
         std::vector<
             typename std::remove_reference_t<decltype(cuts.Ptrs())>::value_type>
diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu
index ee40a6430..356c84bb0 100644
--- a/tests/cpp/data/test_ellpack_page.cu
+++ b/tests/cpp/data/test_ellpack_page.cu
@@ -1,17 +1,17 @@
-/*!
- * Copyright 2019-2020 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
  */
 #include <xgboost/base.h>
 
 #include <utility>
 
-#include "../helpers.h"
-#include "../histogram_helpers.h"
-#include "gtest/gtest.h"
-
 #include "../../../src/common/categorical.h"
 #include "../../../src/common/hist_util.h"
 #include "../../../src/data/ellpack_page.cuh"
+#include "../../../src/tree/param.h"  // TrainParam
+#include "../helpers.h"
+#include "../histogram_helpers.h"
+#include "gtest/gtest.h"
 
 namespace xgboost {
 
@@ -19,7 +19,10 @@ TEST(EllpackPage, EmptyDMatrix) {
   constexpr int kNRows = 0, kNCols = 0, kMaxBin = 256;
   constexpr float kSparsity = 0;
   auto dmat = RandomDataGenerator(kNRows, kNCols, kSparsity).GenerateDMatrix();
-  auto& page = *dmat->GetBatches<EllpackPage>({0, kMaxBin}).begin();
+  Context ctx{MakeCUDACtx(0)};
+  auto& page = *dmat->GetBatches<EllpackPage>(
+                        &ctx, BatchParam{kMaxBin, tree::TrainParam::DftSparseThreshold()})
+                    .begin();
   auto impl = page.Impl();
   ASSERT_EQ(impl->row_stride, 0);
   ASSERT_EQ(impl->Cuts().TotalBins(), 0);
@@ -87,8 +90,9 @@ TEST(EllpackPage, FromCategoricalBasic) {
   auto& h_ft = m->Info().feature_types.HostVector();
   h_ft.resize(kCols, FeatureType::kCategorical);
 
-  BatchParam p{0, max_bins};
-  auto ellpack = EllpackPage(m.get(), p);
+  Context ctx{MakeCUDACtx(0)};
+  auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()};
+  auto ellpack = EllpackPage(&ctx, m.get(), p);
   auto accessor = ellpack.Impl()->GetDeviceAccessor(0);
   ASSERT_EQ(kCats, accessor.NumBins());
 
@@ -142,8 +146,9 @@ TEST(EllpackPage, Copy) {
   dmlc::TemporaryDirectory tmpdir;
   std::unique_ptr<DMatrix>
       dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
-  BatchParam param{0, 256};
-  auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
+  auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
 
   // Create an empty result page.
   EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride,
@@ -151,7 +156,7 @@ TEST(EllpackPage, Copy) {
 
   // Copy batch pages into the result page.
   size_t offset = 0;
-  for (auto& batch : dmat->GetBatches<EllpackPage>(param)) {
+  for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
     size_t num_elements = result.Copy(0, batch.Impl(), offset);
     offset += num_elements;
   }
@@ -161,7 +166,7 @@ TEST(EllpackPage, Copy) {
   thrust::device_vector<bst_float> row_result_d(kCols);
   std::vector<bst_float> row(kCols);
   std::vector<bst_float> row_result(kCols);
-  for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
+  for (auto& page : dmat->GetBatches<EllpackPage>(&ctx, param)) {
     auto impl = page.Impl();
     EXPECT_EQ(impl->base_rowid, current_row);
 
@@ -186,10 +191,11 @@ TEST(EllpackPage, Compact) {
 
   // Create a DMatrix with multiple batches.
   dmlc::TemporaryDirectory tmpdir;
-  std::unique_ptr<DMatrix>
-      dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
-  BatchParam param{0, 256};
-  auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
+  std::unique_ptr<DMatrix> dmat(
+      CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
+  auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
 
   // Create an empty result page.
   EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride,
@@ -201,7 +207,7 @@ TEST(EllpackPage, Compact) {
     SIZE_MAX};
   thrust::device_vector<size_t> row_indexes_d = row_indexes_h;
   common::Span<size_t> row_indexes_span(row_indexes_d.data().get(), kRows);
-  for (auto& batch : dmat->GetBatches<EllpackPage>(param)) {
+  for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
     result.Compact(0, batch.Impl(), row_indexes_span);
   }
 
@@ -210,7 +216,7 @@ TEST(EllpackPage, Compact) {
   thrust::device_vector<bst_float> row_result_d(kCols);
   std::vector<bst_float> row(kCols);
   std::vector<bst_float> row_result(kCols);
-  for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
+  for (auto& page : dmat->GetBatches<EllpackPage>(&ctx, param)) {
     auto impl = page.Impl();
     ASSERT_EQ(impl->base_rowid, current_row);
 
@@ -249,15 +255,17 @@ class EllpackPageTest : public testing::TestWithParam<float> {
     // device.
     size_t n_samples{128}, n_features{13};
     Context ctx;
-    ctx.gpu_id = 0;
+    Context gpu_ctx{MakeCUDACtx(0)};
     auto Xy = RandomDataGenerator{n_samples, n_features, sparsity}.GenerateDMatrix(true);
     std::unique_ptr<EllpackPageImpl> from_ghist;
     ASSERT_TRUE(Xy->SingleColBlock());
-    for (auto const& page : Xy->GetBatches<GHistIndexMatrix>(BatchParam{17, 0.6})) {
-      from_ghist.reset(new EllpackPageImpl{&ctx, page, {}});
+
+    for (auto const& page : Xy->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{17, 0.6})) {
+      from_ghist.reset(new EllpackPageImpl{&gpu_ctx, page, {}});
     }
 
-    for (auto const& page : Xy->GetBatches<EllpackPage>(BatchParam{0, 17})) {
+    for (auto const& page : Xy->GetBatches<EllpackPage>(
+             &gpu_ctx, BatchParam{17, tree::TrainParam::DftSparseThreshold()})) {
       auto from_sparse_page = page.Impl();
       ASSERT_EQ(from_sparse_page->is_dense, from_ghist->is_dense);
       ASSERT_EQ(from_sparse_page->base_rowid, 0);
diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu
index 92b4acf4b..66d4024ec 100644
--- a/tests/cpp/data/test_ellpack_page_raw_format.cu
+++ b/tests/cpp/data/test_ellpack_page_raw_format.cu
@@ -1,17 +1,21 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023, XGBoost contributors
  */
 #include <gtest/gtest.h>
 #include <xgboost/data.h>
 
 #include "../../../src/data/ellpack_page.cuh"
 #include "../../../src/data/sparse_page_source.h"
-#include "../filesystem.h"  // dmlc::TemporaryDirectory
+#include "../../../src/tree/param.h"  // TrainParam
+#include "../filesystem.h"            // dmlc::TemporaryDirectory
 #include "../helpers.h"
 
 namespace xgboost {
 namespace data {
 TEST(EllpackPageRawFormat, IO) {
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
+
   std::unique_ptr<SparsePageFormat<EllpackPage>> format{CreatePageFormat<EllpackPage>("raw")};
 
   auto m = RandomDataGenerator{100, 14, 0.5}.GenerateDMatrix();
@@ -20,7 +24,7 @@ TEST(EllpackPageRawFormat, IO) {
 
   {
     std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
-    for (auto const &ellpack : m->GetBatches<EllpackPage>({0, 256})) {
+    for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
       format->Write(ellpack, fo.get());
     }
   }
@@ -29,7 +33,7 @@ TEST(EllpackPageRawFormat, IO) {
   std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
   format->Read(&page, fi.get());
 
-  for (auto const &ellpack : m->GetBatches<EllpackPage>({0, 256})) {
+  for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
     auto loaded = page.Impl();
     auto orig = ellpack.Impl();
     ASSERT_EQ(loaded->Cuts().Ptrs(), orig->Cuts().Ptrs());
diff --git a/tests/cpp/data/test_file_iterator.cc b/tests/cpp/data/test_file_iterator.cc
index 31da2c1fa..bd8c4b9c2 100644
--- a/tests/cpp/data/test_file_iterator.cc
+++ b/tests/cpp/data/test_file_iterator.cc
@@ -29,16 +29,16 @@ TEST(FileIterator, Basic) {
   {
     auto zpath = tmpdir.path + "/0-based.svm";
     CreateBigTestData(zpath, 3 * 64, true);
-    zpath += "?indexing_mode=0";
-    FileIterator iter{zpath, 0, 1, "libsvm"};
+    zpath += "?indexing_mode=0&format=libsvm";
+    FileIterator iter{zpath, 0, 1};
     check_n_features(&iter);
   }
 
   {
     auto opath = tmpdir.path + "/1-based.svm";
     CreateBigTestData(opath, 3 * 64, false);
-    opath += "?indexing_mode=1";
-    FileIterator iter{opath, 0, 1, "libsvm"};
+    opath += "?indexing_mode=1&format=libsvm";
+    FileIterator iter{opath, 0, 1};
     check_n_features(&iter);
   }
 }
diff --git a/tests/cpp/data/test_gradient_index.cc b/tests/cpp/data/test_gradient_index.cc
index c623ecfae..b9dd1a640 100644
--- a/tests/cpp/data/test_gradient_index.cc
+++ b/tests/cpp/data/test_gradient_index.cc
@@ -2,20 +2,38 @@
  * Copyright 2021-2023 by XGBoost contributors
  */
 #include <gtest/gtest.h>
-#include <xgboost/data.h>
+#include <xgboost/data.h>                       // for BatchIterator, BatchSet, DMatrix, BatchParam
 
-#include "../../../src/common/column_matrix.h"
-#include "../../../src/common/io.h"  // MemoryBufferStream
-#include "../../../src/data/gradient_index.h"
-#include "../helpers.h"
+#include <algorithm>                            // for sort, unique
+#include <cmath>                                // for isnan
+#include <cstddef>                              // for size_t
+#include <limits>                               // for numeric_limits
+#include <memory>                               // for shared_ptr, __shared_ptr_access, unique_ptr
+#include <string>                               // for string
+#include <tuple>                                // for make_tuple, tie, tuple
+#include <utility>                              // for move
+#include <vector>                               // for vector
+
+#include "../../../src/common/categorical.h"    // for AsCat
+#include "../../../src/common/column_matrix.h"  // for ColumnMatrix
+#include "../../../src/common/hist_util.h"      // for Index, HistogramCuts, SketchOnDMatrix
+#include "../../../src/common/io.h"             // for MemoryBufferStream
+#include "../../../src/data/adapter.h"          // for SparsePageAdapterBatch
+#include "../../../src/data/gradient_index.h"   // for GHistIndexMatrix
+#include "../../../src/tree/param.h"            // for TrainParam
+#include "../helpers.h"                         // for CreateEmptyGenericParam, GenerateRandomCa...
+#include "xgboost/base.h"                       // for bst_bin_t
+#include "xgboost/context.h"                    // for Context
+#include "xgboost/host_device_vector.h"         // for HostDeviceVector
 
 namespace xgboost {
 namespace data {
 TEST(GradientIndex, ExternalMemory) {
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(10000);
   std::vector<size_t> base_rowids;
   std::vector<float> hessian(dmat->Info().num_row_, 1);
-  for (auto const &page : dmat->GetBatches<GHistIndexMatrix>({64, hessian, true})) {
+  for (auto const &page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, true})) {
     base_rowids.push_back(page.base_rowid);
   }
   size_t i = 0;
@@ -24,9 +42,8 @@ TEST(GradientIndex, ExternalMemory) {
     ++i;
   }
 
-
   base_rowids.clear();
-  for (auto const &page : dmat->GetBatches<GHistIndexMatrix>({64, hessian, false})) {
+  for (auto const &page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, false})) {
     base_rowids.push_back(page.base_rowid);
   }
   i = 0;
@@ -41,12 +58,13 @@ TEST(GradientIndex, FromCategoricalBasic) {
   size_t max_bins = 8;
   auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats);
   auto m = GetDMatrixFromData(x, kRows, 1);
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
 
   auto &h_ft = m->Info().feature_types.HostVector();
   h_ft.resize(kCols, FeatureType::kCategorical);
 
   BatchParam p(max_bins, 0.8);
-  GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, AllThreadsForTest(), {});
+  GHistIndexMatrix gidx(&ctx, m.get(), max_bins, p.sparse_thresh, false, {});
 
   auto x_copy = x;
   std::sort(x_copy.begin(), x_copy.end());
@@ -80,11 +98,11 @@ TEST(GradientIndex, FromCategoricalLarge) {
 
   BatchParam p{max_bins, 0.8};
   {
-    GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, AllThreadsForTest(), {});
+    GHistIndexMatrix gidx{&ctx, m.get(), max_bins, p.sparse_thresh, false, {}};
     ASSERT_TRUE(gidx.index.GetBinTypeSize() == common::kUint16BinsTypeSize);
   }
   {
-    for (auto const &page : m->GetBatches<GHistIndexMatrix>(p)) {
+    for (auto const &page : m->GetBatches<GHistIndexMatrix>(&ctx, p)) {
       common::HistogramCuts cut = page.cut;
       GHistIndexMatrix gidx{m->Info(), std::move(cut), max_bins};
       ASSERT_EQ(gidx.MaxNumBinPerFeat(), kCats);
@@ -96,10 +114,11 @@ TEST(GradientIndex, PushBatch) {
   size_t constexpr kRows = 64, kCols = 4;
   bst_bin_t max_bins = 64;
   float st = 0.5;
+  Context ctx;
 
   auto test = [&](float sparisty) {
     auto m = RandomDataGenerator{kRows, kCols, sparisty}.GenerateDMatrix(true);
-    auto cuts = common::SketchOnDMatrix(m.get(), max_bins, AllThreadsForTest(), false, {});
+    auto cuts = common::SketchOnDMatrix(&ctx, m.get(), max_bins, false, {});
     common::HistogramCuts copy_cuts = cuts;
 
     ASSERT_EQ(m->Info().num_row_, kRows);
@@ -112,7 +131,7 @@ TEST(GradientIndex, PushBatch) {
                             m->Info().num_row_);
       gmat.PushAdapterBatchColumns(m->Ctx(), batch, std::numeric_limits<float>::quiet_NaN(), 0);
     }
-    for (auto const &page : m->GetBatches<GHistIndexMatrix>(BatchParam{max_bins, st})) {
+    for (auto const &page : m->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{max_bins, st})) {
       for (size_t i = 0; i < kRows; ++i) {
         for (size_t j = 0; j < kCols; ++j) {
           auto v0 = gmat.GetFvalue(i, j, false);
@@ -143,17 +162,19 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo
     // device.
     size_t n_samples{128}, n_features{13};
     Context ctx;
-    ctx.gpu_id = 0;
     auto Xy = RandomDataGenerator{n_samples, n_features, 1 - density}.GenerateDMatrix(true);
     std::unique_ptr<GHistIndexMatrix> from_ellpack;
     ASSERT_TRUE(Xy->SingleColBlock());
     bst_bin_t constexpr kBins{17};
     auto p = BatchParam{kBins, threshold};
-    for (auto const &page : Xy->GetBatches<EllpackPage>(BatchParam{0, kBins})) {
+    Context gpu_ctx;
+    gpu_ctx.gpu_id = 0;
+    for (auto const &page : Xy->GetBatches<EllpackPage>(
+             &gpu_ctx, BatchParam{kBins, tree::TrainParam::DftSparseThreshold()})) {
       from_ellpack.reset(new GHistIndexMatrix{&ctx, Xy->Info(), page, p});
     }
 
-    for (auto const &from_sparse_page : Xy->GetBatches<GHistIndexMatrix>(p)) {
+    for (auto const &from_sparse_page : Xy->GetBatches<GHistIndexMatrix>(&ctx, p)) {
       ASSERT_EQ(from_sparse_page.IsDense(), from_ellpack->IsDense());
       ASSERT_EQ(from_sparse_page.base_rowid, 0);
       ASSERT_EQ(from_sparse_page.base_rowid, from_ellpack->base_rowid);
diff --git a/tests/cpp/data/test_gradient_index_page_raw_format.cc b/tests/cpp/data/test_gradient_index_page_raw_format.cc
index fa1a10faa..570d1dbca 100644
--- a/tests/cpp/data/test_gradient_index_page_raw_format.cc
+++ b/tests/cpp/data/test_gradient_index_page_raw_format.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023, XGBoost contributors
  */
 #include <gtest/gtest.h>
 
@@ -11,6 +11,8 @@
 namespace xgboost {
 namespace data {
 TEST(GHistIndexPageRawFormat, IO) {
+  Context ctx;
+
   std::unique_ptr<SparsePageFormat<GHistIndexMatrix>> format{
       CreatePageFormat<GHistIndexMatrix>("raw")};
   auto m = RandomDataGenerator{100, 14, 0.5}.GenerateDMatrix();
@@ -20,7 +22,7 @@ TEST(GHistIndexPageRawFormat, IO) {
 
   {
     std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
-    for (auto const &index : m->GetBatches<GHistIndexMatrix>(batch)) {
+    for (auto const &index : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
       format->Write(index, fo.get());
     }
   }
@@ -29,7 +31,7 @@ TEST(GHistIndexPageRawFormat, IO) {
   std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
   format->Read(&page, fi.get());
 
-  for (auto const &gidx : m->GetBatches<GHistIndexMatrix>(batch)) {
+  for (auto const &gidx : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
     auto const &loaded = gidx;
     ASSERT_EQ(loaded.cut.Ptrs(), page.cut.Ptrs());
     ASSERT_EQ(loaded.cut.MinValues(), page.cut.MinValues());
@@ -43,5 +45,6 @@ TEST(GHistIndexPageRawFormat, IO) {
-    ASSERT_EQ(loaded.Transpose().GetTypeSize(), loaded.Transpose().GetTypeSize());
+    // Check against `page` (read back via format->Read), not `loaded` against itself.
+    ASSERT_EQ(loaded.Transpose().GetTypeSize(), page.Transpose().GetTypeSize());
   }
 }
-} // namespace data
-} // namespace xgboost
+}  // namespace data
+}  // namespace xgboost
diff --git a/tests/cpp/data/test_iterative_dmatrix.cc b/tests/cpp/data/test_iterative_dmatrix.cc
index f95f7c03c..74a69e109 100644
--- a/tests/cpp/data/test_iterative_dmatrix.cc
+++ b/tests/cpp/data/test_iterative_dmatrix.cc
@@ -15,8 +15,9 @@
 namespace xgboost {
 namespace data {
 TEST(IterativeDMatrix, Ref) {
+  Context ctx;
   TestRefDMatrix<GHistIndexMatrix, NumpyArrayIterForTest>(
-      [&](GHistIndexMatrix const& page) { return page.cut; });
+      &ctx, [&](GHistIndexMatrix const& page) { return page.cut; });
 }
 
 TEST(IterativeDMatrix, IsDense) {
diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu
index be97a3f6a..2f2f1f84f 100644
--- a/tests/cpp/data/test_iterative_dmatrix.cu
+++ b/tests/cpp/data/test_iterative_dmatrix.cu
@@ -1,11 +1,12 @@
-/*!
- * Copyright 2020-2022 XGBoost contributors
+/**
+ * Copyright 2020-2023, XGBoost contributors
  */
 #include <gtest/gtest.h>
 
 #include "../../../src/data/device_adapter.cuh"
 #include "../../../src/data/ellpack_page.cuh"
 #include "../../../src/data/iterative_dmatrix.h"
+#include "../../../src/tree/param.h"  // TrainParam
 #include "../helpers.h"
 #include "test_iterative_dmatrix.h"
 
@@ -13,15 +14,17 @@ namespace xgboost {
 namespace data {
 
 void TestEquivalent(float sparsity) {
+  Context ctx{MakeCUDACtx(0)};
+
   CudaArrayIterForTest iter{sparsity};
   IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next,
                      std::numeric_limits<float>::quiet_NaN(), 0, 256);
-  size_t offset = 0;
-  auto first = (*m.GetEllpackBatches({}).begin()).Impl();
+  std::size_t offset = 0;
+  auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl();
   std::unique_ptr<EllpackPageImpl> page_concatenated {
     new EllpackPageImpl(0, first->Cuts(), first->is_dense,
                         first->row_stride, 1000 * 100)};
-  for (auto& batch : m.GetBatches<EllpackPage>({})) {
+  for (auto& batch : m.GetBatches<EllpackPage>(&ctx, {})) {
     auto page = batch.Impl();
     size_t num_elements = page_concatenated->Copy(0, page, offset);
     offset += num_elements;
@@ -34,8 +37,8 @@ void TestEquivalent(float sparsity) {
   auto adapter = CupyAdapter(interface_str);
   std::unique_ptr<DMatrix> dm{
       DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 0)};
-  BatchParam bp {0, 256};
-  for (auto& ellpack : dm->GetBatches<EllpackPage>(bp)) {
+  auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
+  for (auto& ellpack : dm->GetBatches<EllpackPage>(&ctx, bp)) {
     auto from_data = ellpack.Impl()->GetDeviceAccessor(0);
 
     std::vector<float> cuts_from_iter(from_iter.gidx_fvalue_map.size());
@@ -92,7 +95,8 @@ TEST(IterativeDeviceDMatrix, RowMajor) {
                      std::numeric_limits<float>::quiet_NaN(), 0, 256);
   size_t n_batches = 0;
   std::string interface_str = iter.AsArray();
-  for (auto& ellpack : m.GetBatches<EllpackPage>({})) {
+  Context ctx{MakeCUDACtx(0)};
+  for (auto& ellpack : m.GetBatches<EllpackPage>(&ctx, {})) {
     n_batches ++;
     auto impl = ellpack.Impl();
     common::CompressedIterator<uint32_t> iterator(
@@ -140,7 +144,10 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) {
 
   IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next,
                      std::numeric_limits<float>::quiet_NaN(), 0, 256);
-  auto &ellpack = *m.GetBatches<EllpackPage>({0, 256}).begin();
+  auto ctx = MakeCUDACtx(0);
+  auto& ellpack =
+      *m.GetBatches<EllpackPage>(&ctx, BatchParam{256, tree::TrainParam::DftSparseThreshold()})
+           .begin();
   auto impl = ellpack.Impl();
   common::CompressedIterator<uint32_t> iterator(
       impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
@@ -171,8 +178,9 @@ TEST(IterativeDeviceDMatrix, IsDense) {
 }
 
 TEST(IterativeDeviceDMatrix, Ref) {
+  Context ctx{MakeCUDACtx(0)};
   TestRefDMatrix<EllpackPage, CudaArrayIterForTest>(
-      [](EllpackPage const& page) { return page.Impl()->Cuts(); });
+      &ctx, [](EllpackPage const& page) { return page.Impl()->Cuts(); });
 }
 }  // namespace data
 }  // namespace xgboost
diff --git a/tests/cpp/data/test_iterative_dmatrix.h b/tests/cpp/data/test_iterative_dmatrix.h
index 588d2b3be..ed8e2da77 100644
--- a/tests/cpp/data/test_iterative_dmatrix.h
+++ b/tests/cpp/data/test_iterative_dmatrix.h
@@ -1,8 +1,11 @@
-/*!
- * Copyright 2022 XGBoost contributors
+/**
+ * Copyright 2022-2023, XGBoost contributors
  */
 #pragma once
-#include <memory>  // std::make_shared
+#include <xgboost/context.h>  // for Context
+
+#include <limits>             // for numeric_limits
+#include <memory>             // for make_shared
 
 #include "../../../src/data/iterative_dmatrix.h"
 #include "../helpers.h"
@@ -10,7 +13,7 @@
 namespace xgboost {
 namespace data {
 template <typename Page, typename Iter, typename Cuts>
-void TestRefDMatrix(Cuts&& get_cuts) {
+void TestRefDMatrix(Context const* ctx, Cuts&& get_cuts) {
   int n_bins = 256;
   Iter iter(0.3, 2048);
   auto m = std::make_shared<IterativeDMatrix>(&iter, iter.Proxy(), nullptr, Reset, Next,
@@ -20,8 +23,8 @@ void TestRefDMatrix(Cuts&& get_cuts) {
   auto m_1 = std::make_shared<IterativeDMatrix>(&iter_1, iter_1.Proxy(), m, Reset, Next,
                                                 std::numeric_limits<float>::quiet_NaN(), 0, n_bins);
 
-  for (auto const& page_0 : m->template GetBatches<Page>({})) {
-    for (auto const& page_1 : m_1->template GetBatches<Page>({})) {
+  for (auto const& page_0 : m->template GetBatches<Page>(ctx, {})) {
+    for (auto const& page_1 : m_1->template GetBatches<Page>(ctx, {})) {
       auto const& cuts_0 = get_cuts(page_0);
       auto const& cuts_1 = get_cuts(page_1);
       ASSERT_EQ(cuts_0.Values(), cuts_1.Values());
@@ -32,8 +35,8 @@ void TestRefDMatrix(Cuts&& get_cuts) {
 
   m_1 = std::make_shared<IterativeDMatrix>(&iter_1, iter_1.Proxy(), nullptr, Reset, Next,
                                            std::numeric_limits<float>::quiet_NaN(), 0, n_bins);
-  for (auto const& page_0 : m->template GetBatches<Page>({})) {
-    for (auto const& page_1 : m_1->template GetBatches<Page>({})) {
+  for (auto const& page_0 : m->template GetBatches<Page>(ctx, {})) {
+    for (auto const& page_1 : m_1->template GetBatches<Page>(ctx, {})) {
       auto const& cuts_0 = get_cuts(page_0);
       auto const& cuts_1 = get_cuts(page_1);
       ASSERT_NE(cuts_0.Values(), cuts_1.Values());
@@ -45,8 +48,8 @@ void TestRefDMatrix(Cuts&& get_cuts) {
   auto dm = RandomDataGenerator(2048, Iter::Cols(), 0.5).GenerateDMatrix(true);
   auto dqm = std::make_shared<IterativeDMatrix>(&iter_1, iter_1.Proxy(), dm, Reset, Next,
                                                 std::numeric_limits<float>::quiet_NaN(), 0, n_bins);
-  for (auto const& page_0 : dm->template GetBatches<Page>({})) {
-    for (auto const& page_1 : dqm->template GetBatches<Page>({})) {
+  for (auto const& page_0 : dm->template GetBatches<Page>(ctx, {})) {
+    for (auto const& page_1 : dqm->template GetBatches<Page>(ctx, {})) {
       auto const& cuts_0 = get_cuts(page_0);
       auto const& cuts_1 = get_cuts(page_1);
       ASSERT_EQ(cuts_0.Values(), cuts_1.Values());
diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc
index 1d0d0d340..dd22da593 100644
--- a/tests/cpp/data/test_metainfo.cc
+++ b/tests/cpp/data/test_metainfo.cc
@@ -157,8 +157,7 @@ TEST(MetaInfo, LoadQid) {
   dmlc::TemporaryDirectory tempdir;
   std::string tmp_file = tempdir.path + "/qid_test.libsvm";
   {
-    std::unique_ptr<dmlc::Stream> fs(
-      dmlc::Stream::Create(tmp_file.c_str(), "w"));
+    std::unique_ptr<dmlc::Stream> fs(dmlc::Stream::Create(tmp_file.c_str(), "w"));
     dmlc::ostream os(fs.get());
     os << R"qid(3 qid:1 1:1 2:1 3:0 4:0.2 5:0
                 2 qid:1 1:0 2:0 3:1 4:0.1 5:1
@@ -175,7 +174,7 @@ TEST(MetaInfo, LoadQid) {
     os.set_stream(nullptr);
   }
   std::unique_ptr<xgboost::DMatrix> dmat(
-    xgboost::DMatrix::Load(tmp_file, true, xgboost::DataSplitMode::kRow, "libsvm"));
+      xgboost::DMatrix::Load(tmp_file + "?format=libsvm", true, xgboost::DataSplitMode::kRow));
 
   const xgboost::MetaInfo& info = dmat->Info();
   const std::vector<xgboost::bst_uint> expected_group_ptr{0, 4, 8, 12};
diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc
index a37352626..43d0877d3 100644
--- a/tests/cpp/data/test_simple_dmatrix.cc
+++ b/tests/cpp/data/test_simple_dmatrix.cc
@@ -17,11 +17,15 @@
 
 using namespace xgboost;  // NOLINT
 
+namespace {
+std::string UriSVM(std::string name) { return name + "?format=libsvm"; }
+}  // namespace
+
 TEST(SimpleDMatrix, MetaInfo) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file);
+  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file));
 
   // Test the metadata that was parsed
   EXPECT_EQ(dmat->Info().num_row_, 2);
@@ -37,7 +41,7 @@ TEST(SimpleDMatrix, RowAccess) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file, false);
+  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file), false);
 
   // Loop over the batches and count the records
   int64_t row_count = 0;
@@ -57,16 +61,17 @@ TEST(SimpleDMatrix, RowAccess) {
 }
 
 TEST(SimpleDMatrix, ColAccessWithoutBatches) {
+  Context ctx;
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file);
+  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file));
 
   ASSERT_TRUE(dmat->SingleColBlock());
 
   // Loop over the batches and assert the data is as expected
   int64_t num_col_batch = 0;
-  for (const auto &batch : dmat->GetBatches<xgboost::SortedCSCPage>()) {
+  for (const auto &batch : dmat->GetBatches<xgboost::SortedCSCPage>(&ctx)) {
     num_col_batch += 1;
     EXPECT_EQ(batch.Size(), dmat->Info().num_col_)
         << "Expected batch size = number of cells as #batches is 1.";
@@ -387,7 +392,7 @@ TEST(SimpleDMatrix, SaveLoadBinary) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file);
+  xgboost::DMatrix * dmat = xgboost::DMatrix::Load(UriSVM(tmp_file));
   data::SimpleDMatrix *simple_dmat = dynamic_cast<data::SimpleDMatrix*>(dmat);
 
   const std::string tmp_binfile = tempdir.path + "/csr_source.binary";
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc
index 24dc40949..4cbbe6dc9 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cc
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cc
@@ -16,14 +16,19 @@
 #include "../helpers.h"
 
 using namespace xgboost;  // NOLINT
+namespace {
+std::string UriSVM(std::string name, std::string cache) {
+  return name + "?format=libsvm" + "#" + cache + ".cache";
+}
+}  // namespace
 
 template <typename Page>
-void TestSparseDMatrixLoadFile() {
+void TestSparseDMatrixLoadFile(Context const* ctx) {
   dmlc::TemporaryDirectory tmpdir;
   auto opath = tmpdir.path + "/1-based.svm";
   CreateBigTestData(opath, 3 * 64, false);
-  opath += "?indexing_mode=1";
-  data::FileIterator iter{opath, 0, 1, "libsvm"};
+  opath += "?indexing_mode=1&format=libsvm";
+  data::FileIterator iter{opath, 0, 1};
   auto n_threads = 0;
   data::SparsePageDMatrix m{&iter,
                             iter.Proxy(),
@@ -43,7 +48,7 @@ void TestSparseDMatrixLoadFile() {
   data::SimpleDMatrix simple{&adapter, std::numeric_limits<float>::quiet_NaN(),
                              1};
   Page out;
-  for (auto const& page : m.GetBatches<Page>()) {
+  for (auto const &page : m.GetBatches<Page>(ctx)) {
     if (std::is_same<Page, SparsePage>::value) {
       out.Push(page);
     } else {
@@ -53,7 +58,7 @@ void TestSparseDMatrixLoadFile() {
   ASSERT_EQ(m.Info().num_col_, simple.Info().num_col_);
   ASSERT_EQ(m.Info().num_row_, simple.Info().num_row_);
 
-  for (auto const& page : simple.GetBatches<Page>()) {
+  for (auto const& page : simple.GetBatches<Page>(ctx)) {
     ASSERT_EQ(page.offset.HostVector(), out.offset.HostVector());
     for (size_t i = 0; i < page.data.Size(); ++i) {
       ASSERT_EQ(page.data.HostVector()[i].fvalue, out.data.HostVector()[i].fvalue);
@@ -62,16 +67,18 @@ void TestSparseDMatrixLoadFile() {
 }
 
 TEST(SparsePageDMatrix, LoadFile) {
-  TestSparseDMatrixLoadFile<SparsePage>();
-  TestSparseDMatrixLoadFile<CSCPage>();
-  TestSparseDMatrixLoadFile<SortedCSCPage>();
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+  TestSparseDMatrixLoadFile<SparsePage>(&ctx);
+  TestSparseDMatrixLoadFile<CSCPage>(&ctx);
+  TestSparseDMatrixLoadFile<SortedCSCPage>(&ctx);
 }
 
 // allow caller to retain pages so they can process multiple pages at the same time.
 template <typename Page>
 void TestRetainPage() {
   auto m = CreateSparsePageDMatrix(10000);
-  auto batches = m->GetBatches<Page>();
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+  auto batches = m->GetBatches<Page>(&ctx);
   auto begin = batches.begin();
   auto end = batches.end();
 
@@ -95,7 +102,7 @@ void TestRetainPage() {
   }
 
   // make sure it's const and the caller can not modify the content of page.
-  for (auto& page : m->GetBatches<Page>()) {
+  for (auto &page : m->GetBatches<Page>(&ctx)) {
     static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value);
   }
 }
@@ -112,15 +119,13 @@ TEST(SparsePageDMatrix, MetaInfo) {
   size_t constexpr kEntries = 24;
   CreateBigTestData(tmp_file, kEntries);
 
-  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", false);
+  std::unique_ptr<DMatrix> dmat{xgboost::DMatrix::Load(UriSVM(tmp_file, tmp_file), false)};
 
   // Test the metadata that was parsed
   EXPECT_EQ(dmat->Info().num_row_, 8ul);
   EXPECT_EQ(dmat->Info().num_col_, 5ul);
   EXPECT_EQ(dmat->Info().num_nonzero_, kEntries);
   EXPECT_EQ(dmat->Info().labels.Size(), dmat->Info().num_row_);
-
-  delete dmat;
 }
 
 TEST(SparsePageDMatrix, RowAccess) {
@@ -139,11 +144,12 @@ TEST(SparsePageDMatrix, ColAccess) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache");
+  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file, tmp_file));
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
 
   // Loop over the batches and assert the data is as expected
   size_t iter = 0;
-  for (auto const &col_batch : dmat->GetBatches<xgboost::SortedCSCPage>()) {
+  for (auto const &col_batch : dmat->GetBatches<xgboost::SortedCSCPage>(&ctx)) {
     auto col_page = col_batch.GetView();
     ASSERT_EQ(col_page.Size(), dmat->Info().num_col_);
     if (iter == 1) {
@@ -161,7 +167,7 @@ TEST(SparsePageDMatrix, ColAccess) {
 
   // Loop over the batches and assert the data is as expected
   iter = 0;
-  for (auto const &col_batch : dmat->GetBatches<xgboost::CSCPage>()) {
+  for (auto const &col_batch : dmat->GetBatches<xgboost::CSCPage>(&ctx)) {
     auto col_page = col_batch.GetView();
     EXPECT_EQ(col_page.Size(), dmat->Info().num_col_);
     if (iter == 0) {
@@ -179,9 +185,9 @@ TEST(SparsePageDMatrix, ColAccess) {
 TEST(SparsePageDMatrix, ThreadSafetyException) {
   size_t constexpr kEntriesPerCol = 3;
   size_t constexpr kEntries = 64 * kEntriesPerCol * 2;
+  Context ctx;
 
-  std::unique_ptr<xgboost::DMatrix> dmat =
-      xgboost::CreateSparsePageDMatrix(kEntries);
+  std::unique_ptr<xgboost::DMatrix> dmat = xgboost::CreateSparsePageDMatrix(kEntries);
 
   int threads = 1000;
 
@@ -218,7 +224,8 @@ TEST(SparsePageDMatrix, ColAccessBatches) {
   // Create multiple sparse pages
   std::unique_ptr<xgboost::DMatrix> dmat{xgboost::CreateSparsePageDMatrix(kEntries)};
   ASSERT_EQ(dmat->Ctx()->Threads(), AllThreadsForTest());
-  for (auto const &page : dmat->GetBatches<xgboost::CSCPage>()) {
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+  for (auto const &page : dmat->GetBatches<xgboost::CSCPage>(&ctx)) {
     ASSERT_EQ(dmat->Info().num_col_, page.Size());
   }
 }
@@ -231,7 +238,7 @@ auto TestSparsePageDMatrixDeterminism(int32_t threads) {
   std::string filename = tempdir.path + "/simple.libsvm";
   CreateBigTestData(filename, 1 << 16);
 
-  data::FileIterator iter(filename, 0, 1, "auto");
+  data::FileIterator iter(filename + "?format=libsvm", 0, 1);
   std::unique_ptr<DMatrix> sparse{
       new data::SparsePageDMatrix{&iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next,
                                   std::numeric_limits<float>::quiet_NaN(), threads, filename}};
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu
index bb562ffb7..846fe7f63 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cu
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cu
@@ -1,23 +1,28 @@
 /**
  * Copyright 2019-2023 by XGBoost Contributors
  */
+#include <xgboost/data.h>  // for DMatrix
+
 #include "../../../src/common/compressed_iterator.h"
 #include "../../../src/data/ellpack_page.cuh"
 #include "../../../src/data/sparse_page_dmatrix.h"
-#include "../filesystem.h"  // dmlc::TemporaryDirectory
+#include "../../../src/tree/param.h"  // TrainParam
+#include "../filesystem.h"            // dmlc::TemporaryDirectory
 #include "../helpers.h"
 
 namespace xgboost {
 
 TEST(SparsePageDMatrix, EllpackPage) {
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  DMatrix* dmat = DMatrix::Load(tmp_file + "#" + tmp_file + ".cache");
+  DMatrix* dmat = DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache");
 
   // Loop over the batches and assert the data is as expected
   size_t n = 0;
-  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256})) {
+  for (const auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
     n += batch.Size();
   }
   EXPECT_EQ(n, dmat->Info().num_row_);
@@ -37,6 +42,8 @@ TEST(SparsePageDMatrix, EllpackPage) {
 }
 
 TEST(SparsePageDMatrix, MultipleEllpackPages) {
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
   dmlc::TemporaryDirectory tmpdir;
   std::string filename = tmpdir.path + "/big.libsvm";
   size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
@@ -46,7 +53,7 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) {
   // Loop over the batches and count the records
   int64_t batch_count = 0;
   int64_t row_count = 0;
-  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256})) {
+  for (const auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
     EXPECT_LT(batch.Size(), dmat->Info().num_row_);
     batch_count++;
     row_count += batch.Size();
@@ -61,8 +68,11 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) {
 }
 
 TEST(SparsePageDMatrix, RetainEllpackPage) {
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{32, tree::TrainParam::DftSparseThreshold()};
   auto m = CreateSparsePageDMatrix(10000);
-  auto batches = m->GetBatches<EllpackPage>({0, 32});
+
+  auto batches = m->GetBatches<EllpackPage>(&ctx, param);
   auto begin = batches.begin();
   auto end = batches.end();
 
@@ -87,7 +97,7 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
   }
 
   // make sure it's const and the caller can not modify the content of page.
-  for (auto& page : m->GetBatches<EllpackPage>({0, 32})) {
+  for (auto& page : m->GetBatches<EllpackPage>(&ctx, param)) {
     static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value);
   }
 
@@ -98,6 +108,7 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
 }
 
 TEST(SparsePageDMatrix, EllpackPageContent) {
+  Context ctx{MakeCUDACtx(0)};
   constexpr size_t kRows = 6;
   constexpr size_t kCols = 2;
   constexpr size_t kPageSize = 1;
@@ -110,8 +121,8 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
   std::unique_ptr<DMatrix>
       dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
 
-  BatchParam param{0, 2};
-  auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
+  auto param = BatchParam{2, tree::TrainParam::DftSparseThreshold()};
+  auto impl = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
   EXPECT_EQ(impl->base_rowid, 0);
   EXPECT_EQ(impl->n_rows, kRows);
   EXPECT_FALSE(impl->is_dense);
@@ -120,7 +131,7 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
 
   std::unique_ptr<EllpackPageImpl> impl_ext;
   size_t offset = 0;
-  for (auto& batch : dmat_ext->GetBatches<EllpackPage>(param)) {
+  for (auto& batch : dmat_ext->GetBatches<EllpackPage>(&ctx, param)) {
     if (!impl_ext) {
       impl_ext.reset(new EllpackPageImpl(
           batch.Impl()->gidx_buffer.DeviceIdx(), batch.Impl()->Cuts(),
@@ -170,8 +181,9 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
   std::unique_ptr<DMatrix>
       dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
 
-  BatchParam param{0, kMaxBins};
-  auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{kMaxBins, tree::TrainParam::DftSparseThreshold()};
+  auto impl = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
   EXPECT_EQ(impl->base_rowid, 0);
   EXPECT_EQ(impl->n_rows, kRows);
 
@@ -180,7 +192,7 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
   thrust::device_vector<bst_float> row_ext_d(kCols);
   std::vector<bst_float> row(kCols);
   std::vector<bst_float> row_ext(kCols);
-  for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
+  for (auto& page : dmat_ext->GetBatches<EllpackPage>(&ctx, param)) {
     auto impl_ext = page.Impl();
     EXPECT_EQ(impl_ext->base_rowid, current_row);
 
@@ -211,10 +223,11 @@ TEST(SparsePageDMatrix, EllpackPageMultipleLoops) {
   std::unique_ptr<DMatrix>
       dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
 
-  BatchParam param{0, kMaxBins};
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{kMaxBins, tree::TrainParam::DftSparseThreshold()};
 
   size_t current_row = 0;
-  for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
+  for (auto& page : dmat_ext->GetBatches<EllpackPage>(&ctx, param)) {
     auto impl_ext = page.Impl();
     EXPECT_EQ(impl_ext->base_rowid, current_row);
     current_row += impl_ext->n_rows;
diff --git a/tests/cpp/data/test_sparse_page_raw_format.cc b/tests/cpp/data/test_sparse_page_raw_format.cc
index 5743c4223..722655880 100644
--- a/tests/cpp/data/test_sparse_page_raw_format.cc
+++ b/tests/cpp/data/test_sparse_page_raw_format.cc
@@ -1,17 +1,24 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023, XGBoost contributors
  */
 #include <gtest/gtest.h>
-#include <xgboost/data.h>
+#include <xgboost/data.h>                          // for CSCPage, SortedCSCPage, SparsePage
 
-#include "../../../src/data/sparse_page_source.h"
-#include "../filesystem.h"  // dmlc::TemporaryDirectory
-#include "../helpers.h"
+#include <memory>                                  // for unique_ptr
+#include <string>                                  // for char_traits, operator+, basic_string
+
+#include "../../../src/data/sparse_page_writer.h"  // for CreatePageFormat
+#include "../helpers.h"                            // for RandomDataGenerator
+#include "dmlc/filesystem.h"                       // for TemporaryDirectory
+#include "dmlc/io.h"                               // for SeekStream, Stream
+#include "gtest/gtest_pred_impl.h"                 // for Test, AssertionResult, ASSERT_EQ, TEST
+#include "xgboost/context.h"                       // for Context
 
 namespace xgboost {
 namespace data {
 template <typename S> void TestSparsePageRawFormat() {
   std::unique_ptr<SparsePageFormat<S>> format{CreatePageFormat<S>("raw")};
+  Context ctx;
 
   auto m = RandomDataGenerator{100, 14, 0.5}.GenerateDMatrix();
   ASSERT_TRUE(m->SingleColBlock());
@@ -21,7 +28,7 @@ template <typename S> void TestSparsePageRawFormat() {
   {
     // block code to flush the stream
     std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
-    for (auto const &page : m->GetBatches<S>()) {
+    for (auto const &page : m->GetBatches<S>(&ctx)) {
       orig.Push(page);
       format->Write(page, fo.get());
     }
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 7b246d4ab..24ff55889 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -167,18 +167,20 @@ xgboost::bst_float GetMetricEval(xgboost::Metric* metric,
                                  xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
                                  std::vector<xgboost::bst_float> labels,
                                  std::vector<xgboost::bst_float> weights,
-                                 std::vector<xgboost::bst_uint> groups) {
+                                 std::vector<xgboost::bst_uint> groups,
+                                 xgboost::DataSplitMode data_split_mode) {
   return GetMultiMetricEval(
       metric, preds,
       xgboost::linalg::Tensor<float, 2>{labels.begin(), labels.end(), {labels.size()}, -1}, weights,
-      groups);
+      groups, data_split_mode);
 }
 
 double GetMultiMetricEval(xgboost::Metric* metric,
                           xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
                           xgboost::linalg::Tensor<float, 2> const& labels,
                           std::vector<xgboost::bst_float> weights,
-                          std::vector<xgboost::bst_uint> groups) {
+                          std::vector<xgboost::bst_uint> groups,
+                          xgboost::DataSplitMode data_split_mode) {
   std::shared_ptr<xgboost::DMatrix> p_fmat{xgboost::RandomDataGenerator{0, 0, 0}.GenerateDMatrix()};
   auto& info = p_fmat->Info();
   info.num_row_ = labels.Shape(0);
@@ -186,7 +188,10 @@ double GetMultiMetricEval(xgboost::Metric* metric,
   info.labels.Data()->Copy(*labels.Data());
   info.weights_.HostVector() = weights;
   info.group_ptr_ = groups;
-
+  info.data_split_mode = data_split_mode;
+  if (info.IsVerticalFederated() && xgboost::collective::GetRank() != 0) {
+    info.labels.Reshape(0);
+  }
   return metric->Evaluate(preds, p_fmat);
 }
 
@@ -543,7 +548,7 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
   }
   fo.close();
 
-  std::string uri = tmp_file;
+  std::string uri = tmp_file + "?format=libsvm";
   if (page_size > 0) {
     uri += "#" + tmp_file + ".cache";
   }
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index d63db3c8d..bcd27c568 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -39,6 +39,18 @@
 #define GPUIDX -1
 #endif
 
+#if defined(__CUDACC__)
+#define DeclareUnifiedDistributedTest(name) MGPU ## name
+#else
+#define DeclareUnifiedDistributedTest(name) name
+#endif
+
+#if defined(__CUDACC__)
+#define WORLD_SIZE_FOR_TEST (xgboost::common::AllVisibleGPUs())
+#else
+#define WORLD_SIZE_FOR_TEST (3)
+#endif
+
 namespace xgboost {
 class ObjFunction;
 class Metric;
@@ -92,13 +104,15 @@ xgboost::bst_float GetMetricEval(
   xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
   std::vector<xgboost::bst_float> labels,
   std::vector<xgboost::bst_float> weights = std::vector<xgboost::bst_float>(),
-  std::vector<xgboost::bst_uint> groups = std::vector<xgboost::bst_uint>());
+  std::vector<xgboost::bst_uint> groups = std::vector<xgboost::bst_uint>(),
+  xgboost::DataSplitMode data_split_mode = xgboost::DataSplitMode::kRow);
 
 double GetMultiMetricEval(xgboost::Metric* metric,
                           xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
                           xgboost::linalg::Tensor<float, 2> const& labels,
                           std::vector<xgboost::bst_float> weights = {},
-                          std::vector<xgboost::bst_uint> groups = {});
+                          std::vector<xgboost::bst_uint> groups = {},
+                          xgboost::DataSplitMode data_split_mode = xgboost::DataSplitMode::kRow);
 
 namespace xgboost {
 
@@ -374,6 +388,11 @@ inline Context CreateEmptyGenericParam(int gpu_id) {
   return tparam;
 }
 
+/**
+ * \brief Make a context that uses CUDA.
+ */
+inline Context MakeCUDACtx(std::int32_t device) { return Context{}.MakeCUDA(device); }
+
 inline HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_rows,
                                                               float lower= 0.0f, float upper = 1.0f) {
   xgboost::SimpleLCG gen;
@@ -496,4 +515,17 @@ void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&&
     thread.join();
   }
 }
+
+class DeclareUnifiedDistributedTest(MetricTest) : public ::testing::Test {
+ protected:
+  int world_size_;
+
+  void SetUp() override {
+    world_size_ = WORLD_SIZE_FOR_TEST;
+    if (world_size_ <= 1) {
+      GTEST_SKIP() << "Skipping MGPU test with # GPUs = " << world_size_;
+    }
+  }
+};
+
 }  // namespace xgboost
diff --git a/tests/cpp/metric/test_auc.cc b/tests/cpp/metric/test_auc.cc
index 2a6738899..de42bba53 100644
--- a/tests/cpp/metric/test_auc.cc
+++ b/tests/cpp/metric/test_auc.cc
@@ -1,261 +1,68 @@
+#include "test_auc.h"
+
 #include <xgboost/metric.h>
-#include "../helpers.h"
 
 namespace xgboost {
 namespace metric {
 
-TEST(Metric, DeclareUnifiedTest(BinaryAUC)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  std::unique_ptr<Metric> uni_ptr {Metric::Create("auc", &ctx)};
-  Metric * metric = uni_ptr.get();
-  ASSERT_STREQ(metric->Name(), "auc");
+TEST(Metric, DeclareUnifiedTest(BinaryAUC)) { VerifyBinaryAUC(); }
 
-  // Binary
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1.0f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {1, 0}), 0.0f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {0, 0}, {0, 1}), 0.5f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {1, 1}, {0, 1}), 0.5f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {0, 0}, {1, 0}), 0.5f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {1, 1}, {1, 0}), 0.5f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {1, 0, 0}, {0, 0, 1}), 0.25f, 1e-10);
+TEST(Metric, DeclareUnifiedTest(MultiClassAUC)) { VerifyMultiClassAUC(); }
 
-  // Invalid dataset
-  auto p_fmat = EmptyDMatrix();
-  MetaInfo& info = p_fmat->Info();
-  info.labels = linalg::Tensor<float, 2>{{0.0f, 0.0f}, {2}, -1};
-  float auc = metric->Evaluate({1, 1}, p_fmat);
-  ASSERT_TRUE(std::isnan(auc));
-  *info.labels.Data() = HostDeviceVector<float>{};
-  auc = metric->Evaluate(HostDeviceVector<float>{}, p_fmat);
-  ASSERT_TRUE(std::isnan(auc));
+TEST(Metric, DeclareUnifiedTest(RankingAUC)) { VerifyRankingAUC(); }
 
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1, 0, 1}, {0, 1, 0, 1}), 1.0f, 1e-10);
+TEST(Metric, DeclareUnifiedTest(PRAUC)) { VerifyPRAUC(); }
 
-  // AUC with instance weights
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.9f, 0.1f, 0.4f, 0.3f},
-                            {0,    0,    1,    1},
-                            {1.0f, 3.0f, 2.0f, 4.0f}),
-              0.75f, 0.001f);
+TEST(Metric, DeclareUnifiedTest(MultiClassPRAUC)) { VerifyMultiClassPRAUC(); }
 
-  // regression test case
-  ASSERT_NEAR(GetMetricEval(
-                  metric,
-                  {0.79523796, 0.5201713,  0.79523796, 0.24273258, 0.53452194,
-                   0.53452194, 0.24273258, 0.5201713,  0.79523796, 0.53452194,
-                   0.24273258, 0.53452194, 0.79523796, 0.5201713,  0.24273258,
-                   0.5201713,  0.5201713,  0.53452194, 0.5201713,  0.53452194},
-                  {0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0}),
-              0.5, 1e-10);
+TEST(Metric, DeclareUnifiedTest(RankingPRAUC)) { VerifyRankingPRAUC(); }
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), BinaryAUCRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyBinaryAUC, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(MultiClassAUC)) {
-  auto ctx = CreateEmptyGenericParam(GPUIDX);
-  std::unique_ptr<Metric> uni_ptr{
-      Metric::Create("auc", &ctx)};
-  auto metric = uni_ptr.get();
-
-  // MultiClass
-  // 3x3
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {
-                                1.0f, 0.0f, 0.0f, // p_0
-                                0.0f, 1.0f, 0.0f, // p_1
-                                0.0f, 0.0f, 1.0f  // p_2
-                            },
-                            {0, 1, 2}),
-              1.0f, 1e-10);
-
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {
-                                1.0f, 0.0f, 0.0f, // p_0
-                                0.0f, 1.0f, 0.0f, // p_1
-                                0.0f, 0.0f, 1.0f  // p_2
-                            },
-                            {0, 1, 2},
-                            {1.0f, 1.0f, 1.0f}),
-              1.0f, 1e-10);
-
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {
-                                1.0f, 0.0f, 0.0f, // p_0
-                                0.0f, 1.0f, 0.0f, // p_1
-                                0.0f, 0.0f, 1.0f  // p_2
-                            },
-                            {2, 1, 0}),
-              0.5f, 1e-10);
-
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {
-                                1.0f, 0.0f, 0.0f, // p_0
-                                0.0f, 1.0f, 0.0f, // p_1
-                                0.0f, 0.0f, 1.0f  // p_2
-                            },
-                            {2, 0, 1}),
-              0.25f, 1e-10);
-
-  // invalid dataset
-  float auc = GetMetricEval(metric,
-                            {
-                                1.0f, 0.0f, 0.0f, // p_0
-                                0.0f, 1.0f, 0.0f, // p_1
-                                0.0f, 0.0f, 1.0f  // p_2
-                            },
-                            {0, 1, 1});  // no class 2.
-  EXPECT_TRUE(std::isnan(auc)) << auc;
-
-  HostDeviceVector<float> predts{
-    0.0f, 1.0f, 0.0f,
-    1.0f, 0.0f, 0.0f,
-    0.0f, 0.0f, 1.0f,
-    0.0f, 0.0f, 1.0f,
-  };
-  std::vector<float> labels {1.0f, 0.0f, 2.0f, 1.0f};
-  auc = GetMetricEval(metric, predts, labels, {1.0f, 2.0f, 3.0f, 4.0f});
-  ASSERT_GT(auc, 0.714);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), BinaryAUCColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyBinaryAUC, DataSplitMode::kCol);
 }
 
-TEST(Metric, DeclareUnifiedTest(RankingAUC)) {
-  auto ctx = CreateEmptyGenericParam(GPUIDX);
-  std::unique_ptr<Metric> metric{Metric::Create("auc", &ctx)};
-
-  // single group
-  EXPECT_NEAR(GetMetricEval(metric.get(), {0.7f, 0.2f, 0.3f, 0.6f},
-                            {1.0f, 0.8f, 0.4f, 0.2f}, /*weights=*/{},
-                            {0, 4}),
-              0.5f, 1e-10);
-
-  // multi group
-  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1, 2, 0, 1, 2},
-                            {0, 1, 2, 0, 1, 2}, /*weights=*/{}, {0, 3, 6}),
-              1.0f, 1e-10);
-
-  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1, 2, 0, 1, 2},
-                            {0, 1, 2, 0, 1, 2}, /*weights=*/{1.0f, 2.0f},
-                            {0, 3, 6}),
-              1.0f, 1e-10);
-
-  // AUC metric for grouped datasets - exception scenarios
-  ASSERT_TRUE(std::isnan(
-      GetMetricEval(metric.get(), {0, 1, 2}, {0, 0, 0}, {}, {0, 2, 3})));
-
-  // regression case
-  HostDeviceVector<float> predt{0.33935383, 0.5149714,  0.32138085, 1.4547751,
-                                1.2010975,  0.42651367, 0.23104341, 0.83610827,
-                                0.8494239,  0.07136688, 0.5623144,  0.8086237,
-                                1.5066161,  -4.094787,  0.76887935, -2.4082742};
-  std::vector<bst_group_t> groups{0, 7, 16};
-  std::vector<float> labels{1., 0., 0., 1., 2., 1., 0., 0.,
-                            0., 0., 0., 0., 1., 0., 1., 0.};
-
-  EXPECT_NEAR(GetMetricEval(metric.get(), std::move(predt), labels,
-                            /*weights=*/{}, groups),
-              0.769841f, 1e-6);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassAUCRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassAUC, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(PRAUC)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-
-  xgboost::Metric *metric = xgboost::Metric::Create("aucpr", &ctx);
-  ASSERT_STREQ(metric->Name(), "aucpr");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 0, 1, 1}, {0, 0, 1, 1}), 1, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {0.1f, 0.9f, 0.1f, 0.9f}, {0, 0, 1, 1}),
-              0.5f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(
-                  metric,
-                  {0.4f, 0.2f, 0.9f, 0.1f, 0.2f, 0.4f, 0.1f, 0.1f, 0.2f, 0.1f},
-                  {0, 0, 0, 0, 0, 1, 0, 0, 1, 1}),
-              0.2908445f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(
-                  metric, {0.87f, 0.31f, 0.40f, 0.42f, 0.25f, 0.66f, 0.95f,
-                           0.09f, 0.10f, 0.97f, 0.76f, 0.69f, 0.15f, 0.20f,
-                           0.30f, 0.14f, 0.07f, 0.58f, 0.61f, 0.08f},
-                  {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1}),
-              0.2769199f, 0.001f);
-  auto auc = GetMetricEval(metric, {0, 1}, {});
-  ASSERT_TRUE(std::isnan(auc));
-
-  // AUCPR with instance weights
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.29f, 0.52f, 0.11f, 0.21f, 0.219f, 0.93f, 0.493f,
-                             0.17f, 0.47f, 0.13f, 0.43f, 0.59f, 0.87f, 0.007f},
-                            {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0},
-                            {1, 2, 7, 4, 5, 2.2f, 3.2f, 5, 6, 1, 2, 1.1f, 3.2f,
-                             4.5f}), // weights
-              0.694435f, 0.001f);
-
-  // Both groups contain only pos or neg samples.
-  auc = GetMetricEval(metric,
-                      {0, 0.1f, 0.3f, 0.5f, 0.7f},
-                      {1, 1, 0, 0, 0},
-                      {},
-                      {0, 2, 5});
-  ASSERT_TRUE(std::isnan(auc));
-  delete metric;
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassAUCColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassAUC, DataSplitMode::kCol);
 }
 
-TEST(Metric, DeclareUnifiedTest(MultiClassPRAUC)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-
-  std::unique_ptr<Metric> metric{Metric::Create("aucpr", &ctx)};
-
-  float auc = 0;
-  std::vector<float> labels {1.0f, 0.0f, 2.0f};
-  HostDeviceVector<float> predts{
-    0.0f, 1.0f, 0.0f,
-    1.0f, 0.0f, 0.0f,
-    0.0f, 0.0f, 1.0f,
-  };
-  auc = GetMetricEval(metric.get(), predts, labels, {});
-  EXPECT_EQ(auc, 1.0f);
-
-  auc = GetMetricEval(metric.get(), predts, labels, {1.0f, 1.0f, 1.0f});
-  EXPECT_EQ(auc, 1.0f);
-
-  predts.HostVector() =  {
-    0.0f, 1.0f, 0.0f,
-    1.0f, 0.0f, 0.0f,
-    0.0f, 0.0f, 1.0f,
-    0.0f, 0.0f, 1.0f,
-  };
-  labels = {1.0f, 0.0f, 2.0f, 1.0f};
-  auc = GetMetricEval(metric.get(), predts, labels, {1.0f, 2.0f, 3.0f, 4.0f});
-  ASSERT_GT(auc, 0.699);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingAUCRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRankingAUC, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(RankingPRAUC)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingAUCColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRankingAUC, DataSplitMode::kCol);
+}
 
-  std::unique_ptr<Metric> metric{Metric::Create("aucpr", &ctx)};
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), PRAUCRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyPRAUC, DataSplitMode::kRow);
+}
 
-  std::vector<float> labels {1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f};
-  std::vector<uint32_t> groups {0, 2, 6};
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), PRAUCColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyPRAUC, DataSplitMode::kCol);
+}
 
-  float auc = 0;
-  auc = GetMetricEval(metric.get(), {1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f}, labels, {}, groups);
-  EXPECT_EQ(auc, 1.0f);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassPRAUCRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassPRAUC, DataSplitMode::kRow);
+}
 
-  auc = GetMetricEval(metric.get(), {1.0f, 0.5f, 0.8f, 0.3f, 0.2f, 1.0f}, labels, {}, groups);
-  EXPECT_EQ(auc, 1.0f);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassPRAUCColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassPRAUC, DataSplitMode::kCol);
+}
 
-  auc = GetMetricEval(metric.get(), {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f},
-                      {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {}, groups);
-  ASSERT_TRUE(std::isnan(auc));
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingPRAUCRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRankingPRAUC, DataSplitMode::kRow);
+}
 
-  // Incorrect label
-  ASSERT_THROW(GetMetricEval(metric.get(), {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f},
-                             {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 3.0f}, {}, groups),
-               dmlc::Error);
-
-  // AUCPR with groups and no weights
-  EXPECT_NEAR(GetMetricEval(
-      metric.get(), {0.87f, 0.31f, 0.40f, 0.42f, 0.25f, 0.66f, 0.95f,
-                     0.09f, 0.10f, 0.97f, 0.76f, 0.69f, 0.15f, 0.20f,
-                     0.30f, 0.14f, 0.07f, 0.58f, 0.61f, 0.08f},
-                  {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1},
-                  {},  // weights
-                  {0, 2, 5, 9, 14, 20}),  // group info
-              0.556021f, 0.001f);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingPRAUCColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRankingPRAUC, DataSplitMode::kCol);
 }
 }  // namespace metric
 }  // namespace xgboost
diff --git a/tests/cpp/metric/test_auc.h b/tests/cpp/metric/test_auc.h
new file mode 100644
index 000000000..3baa53290
--- /dev/null
+++ b/tests/cpp/metric/test_auc.h
@@ -0,0 +1,249 @@
+/*!
+ * Copyright (c) 2023 by XGBoost Contributors
+ */
+#pragma once
+
+#include <xgboost/metric.h>
+
+#include "../helpers.h"
+
+namespace xgboost {
+namespace metric {
+// Checks the "auc" metric on binary labels: exact values, weights, and degenerate (NaN) inputs.
+inline void VerifyBinaryAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> uni_ptr{Metric::Create("auc", &ctx)};
+  Metric* metric = uni_ptr.get();
+  ASSERT_STREQ(metric->Name(), "auc");
+
+  // Binary labels: perfect, inverted, and tied predictions, then a partially wrong ranking.
+  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}, {}, {}, data_split_mode), 1.0f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {1, 0}, {}, {}, data_split_mode), 0.0f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {0, 0}, {0, 1}, {}, {}, data_split_mode), 0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {1, 1}, {0, 1}, {}, {}, data_split_mode), 0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {0, 0}, {1, 0}, {}, {}, data_split_mode), 0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {1, 1}, {1, 0}, {}, {}, data_split_mode), 0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {1, 0, 0}, {0, 0, 1}, {}, {}, data_split_mode), 0.25f, 1e-10);
+
+  // Invalid dataset (labels {0, 0}, then empty) must give NaN; data_split_mode is not passed here.
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  info.labels = linalg::Tensor<float, 2>{{0.0f, 0.0f}, {2}, -1};
+  float auc = metric->Evaluate({1, 1}, p_fmat);
+  ASSERT_TRUE(std::isnan(auc));
+  *info.labels.Data() = HostDeviceVector<float>{};
+  auc = metric->Evaluate(HostDeviceVector<float>{}, p_fmat);
+  ASSERT_TRUE(std::isnan(auc));
+
+  EXPECT_NEAR(GetMetricEval(metric, {0, 1, 0, 1}, {0, 1, 0, 1}, {}, {}, data_split_mode), 1.0f,
+              1e-10);
+
+  // AUC with instance weights (fourth argument of GetMetricEval)
+  EXPECT_NEAR(GetMetricEval(metric, {0.9f, 0.1f, 0.4f, 0.3f}, {0, 0, 1, 1},
+                            {1.0f, 3.0f, 2.0f, 4.0f}, {}, data_split_mode),
+              0.75f, 0.001f);
+
+  // Regression test case: 20 samples with many tied predictions, expected AUC is 0.5.
+  ASSERT_NEAR(GetMetricEval(metric, {0.79523796, 0.5201713,  0.79523796, 0.24273258, 0.53452194,
+                                     0.53452194, 0.24273258, 0.5201713,  0.79523796, 0.53452194,
+                                     0.24273258, 0.53452194, 0.79523796, 0.5201713,  0.24273258,
+                                     0.5201713,  0.5201713,  0.53452194, 0.5201713,  0.53452194},
+                            {0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0}, {}, {},
+                            data_split_mode),
+              0.5, 1e-10);
+}
+// Checks the "auc" metric on 3-class inputs: exact values, a missing class (NaN), and weights.
+inline void VerifyMultiClassAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> uni_ptr{Metric::Create("auc", &ctx)};
+  auto metric = uni_ptr.get();
+
+  // MultiClass: 9 prediction values evaluated against 3 labels drawn from classes 0..2.
+  // 3x3
+  EXPECT_NEAR(GetMetricEval(metric,
+                            {
+                                1.0f, 0.0f, 0.0f,  // p_0
+                                0.0f, 1.0f, 0.0f,  // p_1
+                                0.0f, 0.0f, 1.0f   // p_2
+                            },
+                            {0, 1, 2}, {}, {}, data_split_mode),
+              1.0f, 1e-10);
+
+  EXPECT_NEAR(GetMetricEval(metric,
+                            {
+                                1.0f, 0.0f, 0.0f,  // p_0
+                                0.0f, 1.0f, 0.0f,  // p_1
+                                0.0f, 0.0f, 1.0f   // p_2
+                            },
+                            {0, 1, 2}, {1.0f, 1.0f, 1.0f}, {}, data_split_mode),
+              1.0f, 1e-10);
+
+  EXPECT_NEAR(GetMetricEval(metric,
+                            {
+                                1.0f, 0.0f, 0.0f,  // p_0
+                                0.0f, 1.0f, 0.0f,  // p_1
+                                0.0f, 0.0f, 1.0f   // p_2
+                            },
+                            {2, 1, 0}, {}, {}, data_split_mode),
+              0.5f, 1e-10);
+
+  EXPECT_NEAR(GetMetricEval(metric,
+                            {
+                                1.0f, 0.0f, 0.0f,  // p_0
+                                0.0f, 1.0f, 0.0f,  // p_1
+                                0.0f, 0.0f, 1.0f   // p_2
+                            },
+                            {2, 0, 1}, {}, {}, data_split_mode),
+              0.25f, 1e-10);
+
+  // Invalid dataset: label 2 never appears, so the result is expected to be NaN.
+  float auc = GetMetricEval(metric,
+                            {
+                                1.0f, 0.0f, 0.0f,                 // p_0
+                                0.0f, 1.0f, 0.0f,                 // p_1
+                                0.0f, 0.0f, 1.0f                  // p_2
+                            },
+                            {0, 1, 1}, {}, {}, data_split_mode);  // no class 2.
+  EXPECT_TRUE(std::isnan(auc)) << auc;
+
+  HostDeviceVector<float> predts{
+      0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f,
+  };
+  std::vector<float> labels{1.0f, 0.0f, 2.0f, 1.0f};
+  auc = GetMetricEval(metric, predts, labels, {1.0f, 2.0f, 3.0f, 4.0f}, {}, data_split_mode);
+  ASSERT_GT(auc, 0.714);  // weighted case: only a lower bound is asserted
+}
+// Checks the "auc" metric on grouped (ranking) data: single group, multiple groups, NaN case.
+inline void VerifyRankingAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{Metric::Create("auc", &ctx)};
+
+  // single group
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0.7f, 0.2f, 0.3f, 0.6f}, {1.0f, 0.8f, 0.4f, 0.2f},
+                            /*weights=*/{}, {0, 4}, data_split_mode),
+              0.5f, 1e-10);
+
+  // multi group, first without weights and then with two weights
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1, 2, 0, 1, 2}, {0, 1, 2, 0, 1, 2}, /*weights=*/{},
+                            {0, 3, 6}, data_split_mode),
+              1.0f, 1e-10);
+
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1, 2, 0, 1, 2}, {0, 1, 2, 0, 1, 2},
+                            /*weights=*/{1.0f, 2.0f}, {0, 3, 6}, data_split_mode),
+              1.0f, 1e-10);
+
+  // AUC metric for grouped datasets - all labels are zero, so the result is expected to be NaN
+  ASSERT_TRUE(std::isnan(
+      GetMetricEval(metric.get(), {0, 1, 2}, {0, 0, 0}, {}, {0, 2, 3}, data_split_mode)));
+
+  // regression case: 16 samples in two groups, pinned to a previously computed value
+  HostDeviceVector<float> predt{
+      0.33935383, 0.5149714,  0.32138085, 1.4547751, 1.2010975, 0.42651367, 0.23104341, 0.83610827,
+      0.8494239,  0.07136688, 0.5623144,  0.8086237, 1.5066161, -4.094787,  0.76887935, -2.4082742};
+  std::vector<bst_group_t> groups{0, 7, 16};
+  std::vector<float> labels{1., 0., 0., 1., 2., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0.};
+
+  EXPECT_NEAR(GetMetricEval(metric.get(), std::move(predt), labels,
+                            /*weights=*/{}, groups, data_split_mode),
+              0.769841f, 1e-6);
+}
+// Checks the "aucpr" metric on binary labels: exact values, weights, and degenerate (NaN) inputs.
+inline void VerifyPRAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  // Owned by unique_ptr: a failing ASSERT_* returns early and would skip a manual delete.
+  std::unique_ptr<xgboost::Metric> uni_ptr{xgboost::Metric::Create("aucpr", &ctx)};
+  xgboost::Metric* metric = uni_ptr.get();
+  ASSERT_STREQ(metric->Name(), "aucpr");
+  EXPECT_NEAR(GetMetricEval(metric, {0, 0, 1, 1}, {0, 0, 1, 1}, {}, {}, data_split_mode), 1, 1e-10);
+  EXPECT_NEAR(
+      GetMetricEval(metric, {0.1f, 0.9f, 0.1f, 0.9f}, {0, 0, 1, 1}, {}, {}, data_split_mode), 0.5f,
+      0.001f);
+  EXPECT_NEAR(GetMetricEval(metric, {0.4f, 0.2f, 0.9f, 0.1f, 0.2f, 0.4f, 0.1f, 0.1f, 0.2f, 0.1f},
+                            {0, 0, 0, 0, 0, 1, 0, 0, 1, 1}, {}, {}, data_split_mode),
+              0.2908445f, 0.001f);
+  EXPECT_NEAR(
+      GetMetricEval(metric, {0.87f, 0.31f, 0.40f, 0.42f, 0.25f, 0.66f, 0.95f, 0.09f, 0.10f, 0.97f,
+                             0.76f, 0.69f, 0.15f, 0.20f, 0.30f, 0.14f, 0.07f, 0.58f, 0.61f, 0.08f},
+                    {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1}, {}, {},
+                    data_split_mode),
+      0.2769199f, 0.001f);
+  auto auc = GetMetricEval(metric, {0, 1}, {}, {}, {}, data_split_mode);
+  ASSERT_TRUE(std::isnan(auc));
+
+  // AUCPR with instance weights
+  EXPECT_NEAR(GetMetricEval(metric,
+                            {0.29f, 0.52f, 0.11f, 0.21f, 0.219f, 0.93f, 0.493f, 0.17f, 0.47f, 0.13f,
+                             0.43f, 0.59f, 0.87f, 0.007f},
+                            {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0},
+                            {1, 2, 7, 4, 5, 2.2f, 3.2f, 5, 6, 1, 2, 1.1f, 3.2f, 4.5f},  // weights
+                            {}, data_split_mode),
+              0.694435f, 0.001f);
+
+  // Both groups contain only pos or neg samples.
+  auc = GetMetricEval(metric, {0, 0.1f, 0.3f, 0.5f, 0.7f}, {1, 1, 0, 0, 0}, {}, {0, 2, 5},
+                      data_split_mode);
+  ASSERT_TRUE(std::isnan(auc));
+}
+// Checks the "aucpr" metric on 3-class inputs, both with and without sample weights.
+inline void VerifyMultiClassPRAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+
+  std::unique_ptr<Metric> metric{Metric::Create("aucpr", &ctx)};
+
+  float auc = 0;
+  std::vector<float> labels{1.0f, 0.0f, 2.0f};
+  HostDeviceVector<float> predts{
+      0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f,
+  };
+  auc = GetMetricEval(metric.get(), predts, labels, {}, {}, data_split_mode);
+  EXPECT_EQ(auc, 1.0f);
+
+  auc = GetMetricEval(metric.get(), predts, labels, {1.0f, 1.0f, 1.0f}, {}, data_split_mode);
+  EXPECT_EQ(auc, 1.0f);  // uniform weights give the same result as no weights
+
+  predts.HostVector() = {
+      0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f,
+  };
+  labels = {1.0f, 0.0f, 2.0f, 1.0f};
+  auc = GetMetricEval(metric.get(), predts, labels, {1.0f, 2.0f, 3.0f, 4.0f}, {}, data_split_mode);
+  ASSERT_GT(auc, 0.699);  // weighted case: only a lower bound is asserted
+}
+// Checks the "aucpr" metric on grouped data: exact values, a NaN case, and an invalid label.
+inline void VerifyRankingPRAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+
+  std::unique_ptr<Metric> metric{Metric::Create("aucpr", &ctx)};
+
+  std::vector<float> labels{1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f};
+  std::vector<uint32_t> groups{0, 2, 6};
+
+  float auc = 0;
+  auc = GetMetricEval(metric.get(), {1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f}, labels, {}, groups,
+                      data_split_mode);
+  EXPECT_EQ(auc, 1.0f);
+
+  auc = GetMetricEval(metric.get(), {1.0f, 0.5f, 0.8f, 0.3f, 0.2f, 1.0f}, labels, {}, groups,
+                      data_split_mode);
+  EXPECT_EQ(auc, 1.0f);
+
+  auc = GetMetricEval(metric.get(), {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+                      {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {}, groups, data_split_mode);
+  ASSERT_TRUE(std::isnan(auc));
+
+  // Incorrect label: a label of 3.0 is expected to raise dmlc::Error.
+  ASSERT_THROW(GetMetricEval(metric.get(), {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+                             {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 3.0f}, {}, groups, data_split_mode),
+               dmlc::Error);
+
+  // AUCPR with groups and no weights
+  EXPECT_NEAR(
+      GetMetricEval(metric.get(),
+                    {0.87f, 0.31f, 0.40f, 0.42f, 0.25f, 0.66f, 0.95f, 0.09f, 0.10f, 0.97f,
+                     0.76f, 0.69f, 0.15f, 0.20f, 0.30f, 0.14f, 0.07f, 0.58f, 0.61f, 0.08f},
+                    {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1}, {},  // weights
+                    {0, 2, 5, 9, 14, 20},                                              // group info
+                    data_split_mode),
+      0.556021f, 0.001f);
+}
+}  // namespace metric
+}  // namespace xgboost
diff --git a/tests/cpp/metric/test_elementwise_metric.cc b/tests/cpp/metric/test_elementwise_metric.cc
index 9000cfc09..2407dde39 100644
--- a/tests/cpp/metric/test_elementwise_metric.cc
+++ b/tests/cpp/metric/test_elementwise_metric.cc
@@ -1,347 +1,108 @@
 /**
  * Copyright 2018-2023 by XGBoost contributors
  */
-#include <xgboost/json.h>
-#include <xgboost/metric.h>
-
-#include <map>
-#include <memory>
-
-#include "../../../src/common/linalg_op.h"
-#include "../helpers.h"
-
-namespace xgboost {
-namespace {
-inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) {
-  auto ctx = CreateEmptyGenericParam(device);
-  std::unique_ptr<Metric> metric{Metric::Create(name.c_str(), &ctx)};
-
-  HostDeviceVector<float> predts;
-  size_t n_samples = 2048;
-
-  auto p_fmat = EmptyDMatrix();
-  MetaInfo& info = p_fmat->Info();
-  info.labels.Reshape(n_samples, 1);
-  info.num_row_ = n_samples;
-  auto &h_labels = info.labels.Data()->HostVector();
-  auto &h_predts = predts.HostVector();
-
-  SimpleLCG lcg;
-  SimpleRealUniformDistribution<float> dist{0.0f, 1.0f};
-
-  h_labels.resize(n_samples);
-  h_predts.resize(n_samples);
-
-  for (size_t i = 0; i < n_samples; ++i) {
-    h_predts[i] = dist(&lcg);
-    h_labels[i] = dist(&lcg);
-  }
-
-  auto result = metric->Evaluate(predts, p_fmat);
-  for (size_t i = 0; i < 8; ++i) {
-    ASSERT_EQ(metric->Evaluate(predts, p_fmat), result);
-  }
-}
-}  // anonymous namespace
-}  // namespace xgboost
+#include "test_elementwise_metric.h"
 
 namespace xgboost {
 namespace metric {
+TEST(Metric, DeclareUnifiedTest(RMSE)) { VerifyRMSE(); }
 
-TEST(Metric, DeclareUnifiedTest(RMSE)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("rmse", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "rmse");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.6403f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            { -1,   1,   9,  -9}),
-              2.8284f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            {  1,   2,   9,   8}),
-              0.6708f, 0.001f);
-  delete metric;
+TEST(Metric, DeclareUnifiedTest(RMSLE)) { VerifyRMSLE(); }
 
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"rmse"}, GPUIDX);
+TEST(Metric, DeclareUnifiedTest(MAE)) { VerifyMAE(); }
+
+TEST(Metric, DeclareUnifiedTest(MAPE)) { VerifyMAPE(); }
+
+TEST(Metric, DeclareUnifiedTest(MPHE)) { VerifyMPHE(); }
+
+TEST(Metric, DeclareUnifiedTest(LogLoss)) { VerifyLogLoss(); }
+
+TEST(Metric, DeclareUnifiedTest(Error)) { VerifyError(); }
+
+TEST(Metric, DeclareUnifiedTest(PoissonNegLogLik)) { VerifyPoissonNegLogLik(); }
+
+TEST(Metric, DeclareUnifiedTest(MultiRMSE)) { VerifyMultiRMSE(); }
+
+TEST(Metric, DeclareUnifiedTest(Quantile)) { VerifyQuantile(); }
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSERowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRMSE, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(RMSLE)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("rmsle", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "rmsle");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},
-                            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f}),
-              0.4063f, 1e-4);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},
-                            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
-                            {   0,   -1,    1,    -9,   9}),
-              0.6212f, 1e-4);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},
-                            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
-                            {   0,    1,    2,    9,    8}),
-              0.2415f, 1e-4);
-  delete metric;
-
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"rmsle"}, GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSEColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRMSE, DataSplitMode::kCol);
 }
 
-TEST(Metric, DeclareUnifiedTest(MAE)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("mae", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "mae");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.5f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            { -1,   1,   9,  -9}),
-              8.0f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            {  1,   2,   9,   8}),
-              0.54f, 0.001f);
-  delete metric;
-
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"mae"}, GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSLERowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRMSLE, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(MAPE)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("mape", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "mape");
-  EXPECT_NEAR(GetMetricEval(metric, {150, 300}, {100, 200}), 0.5f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {50, 400, 500, 4000},
-                            {100, 200, 500, 1000}),
-              1.125f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {50, 400, 500, 4000},
-                            {100, 200, 500, 1000},
-                            { -1,   1,   9,  -9}),
-              -26.5f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {50, 400, 500, 4000},
-                            {100, 200, 500, 1000},
-                            {  1,   2,   9,   8}),
-              1.3250f, 0.001f);
-  delete metric;
-
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"mape"}, GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSLEColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRMSLE, DataSplitMode::kCol);
 }
 
-TEST(Metric, DeclareUnifiedTest(MPHE)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("mphe", &ctx)};
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "mphe");
-  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric.get(),
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.1751f, 1e-4);
-  EXPECT_NEAR(GetMetricEval(metric.get(),
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            { -1,   1,   9,  -9}),
-              3.4037f, 1e-4);
-  EXPECT_NEAR(GetMetricEval(metric.get(),
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            {  1,   2,   9,   8}),
-              0.1922f, 1e-4);
-
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"mphe"}, GPUIDX);
-
-  metric->Configure({{"huber_slope", "0.1"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(),
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            {  1,   2,   9,   8}),
-              0.0461686f, 1e-4);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAERowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMAE, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(LogLoss)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("logloss", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "logloss");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.5f, 1e-17f, 1.0f+1e-17f, 0.9f},
-                            {   0,      0,           1,    1}),
-              0.1996f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              1.2039f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            { -1,   1,   9,  -9}),
-              21.9722f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            {  1,   2,   9,   8}),
-              1.3138f, 0.001f);
-  delete metric;
-
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"logloss"}, GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAEColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMAE, DataSplitMode::kCol);
 }
 
-TEST(Metric, DeclareUnifiedTest(Error)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("error", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "error");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.5f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                           {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            { -1,   1,   9,  -9}),
-              10.0f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            {  1,   2,   9,   8}),
-              0.55f, 0.001f);
-
-  EXPECT_ANY_THROW(xgboost::Metric::Create("error@abc", &ctx));
-  delete metric;
-
-  metric = xgboost::Metric::Create("error@0.5f", &ctx);
-  metric->Configure({});
-  EXPECT_STREQ(metric->Name(), "error");
-
-  delete metric;
-
-  metric = xgboost::Metric::Create("error@0.1", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "error@0.1");
-  EXPECT_STREQ(metric->Name(), "error@0.1");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {-0.1f, -0.9f, 0.1f, 0.9f},
-                            {   0,    0,   1,   1}),
-              0.25f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {-0.1f, -0.9f, 0.1f, 0.9f},
-                            {   0,    0,   1,   1},
-                            { -1,   1,   9,  -9}),
-              9.0f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {-0.1f, -0.9f, 0.1f, 0.9f},
-                            {   0,    0,   1,   1},
-                            {  1,   2,   9,   8}),
-              0.45f, 0.001f);
-  delete metric;
-
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"error@0.5"}, GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPERowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMAPE, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(PoissionNegLogLik)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("poisson-nloglik", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "poisson-nloglik");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0.5f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.5f, 1e-17f, 1.0f+1e-17f, 0.9f},
-                            {   0,      0,           1,    1}),
-              0.6263f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              1.1019f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            { -1,   1,   9,  -9}),
-              13.3750f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            {  1,   2,   9,   8}),
-              1.5783f, 0.001f);
-  delete metric;
-
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"poisson-nloglik"}, GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPEColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMAPE, DataSplitMode::kCol);
 }
 
-TEST(Metric, DeclareUnifiedTest(MultiRMSE)) {
-  size_t n_samples = 32, n_targets = 8;
-  linalg::Tensor<float, 2> y{{n_samples, n_targets}, GPUIDX};
-  auto &h_y = y.Data()->HostVector();
-  std::iota(h_y.begin(), h_y.end(), 0);
-
-  HostDeviceVector<float> predt(n_samples * n_targets, 0);
-
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  std::unique_ptr<Metric> metric{Metric::Create("rmse", &ctx)};
-  metric->Configure({});
-
-  auto loss = GetMultiMetricEval(metric.get(), predt, y);
-  std::vector<float> weights(n_samples, 1);
-  auto loss_w = GetMultiMetricEval(metric.get(), predt, y, weights);
-
-  std::transform(h_y.cbegin(), h_y.cend(), h_y.begin(), [](auto &v) { return v * v; });
-  auto ret = std::sqrt(std::accumulate(h_y.cbegin(), h_y.cend(), 1.0, std::plus<>{}) / h_y.size());
-  ASSERT_FLOAT_EQ(ret, loss);
-  ASSERT_FLOAT_EQ(ret, loss_w);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MPHERowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMPHE, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(Quantile)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  std::unique_ptr<Metric> metric{Metric::Create("quantile", &ctx)};
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MPHEColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMPHE, DataSplitMode::kCol);
+}
 
-  HostDeviceVector<float> predts{0.1f, 0.9f, 0.1f, 0.9f};
-  std::vector<float> labels{0.5f, 0.5f, 0.9f, 0.1f};
-  std::vector<float> weights{0.2f,  0.4f,0.6f, 0.8f};
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), LogLossRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyLogLoss, DataSplitMode::kRow);
+}
 
-  metric->Configure(Args{{"quantile_alpha", "[0.0]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, weights), 0.400f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[0.2]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, weights), 0.376f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[0.4]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, weights), 0.352f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[0.8]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, weights), 0.304f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[1.0]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, weights), 0.28f, 0.001f);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), LogLossColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyLogLoss, DataSplitMode::kCol);
+}
 
-  metric->Configure(Args{{"quantile_alpha", "[0.0]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels), 0.3f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[0.2]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels), 0.3f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[0.4]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels), 0.3f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[0.8]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels), 0.3f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[1.0]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels), 0.3f, 0.001f);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), ErrorRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyError, DataSplitMode::kRow);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), ErrorColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyError, DataSplitMode::kCol);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), PoissonNegLogLikRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyPoissonNegLogLik, DataSplitMode::kRow);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), PoissonNegLogLikColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyPoissonNegLogLik, DataSplitMode::kCol);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiRMSERowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiRMSE, DataSplitMode::kRow);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiRMSEColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiRMSE, DataSplitMode::kCol);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyQuantile, DataSplitMode::kRow);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyQuantile, DataSplitMode::kCol);
 }
 }  // namespace metric
 }  // namespace xgboost
diff --git a/tests/cpp/metric/test_elementwise_metric.h b/tests/cpp/metric/test_elementwise_metric.h
new file mode 100644
index 000000000..1b06194fe
--- /dev/null
+++ b/tests/cpp/metric/test_elementwise_metric.h
@@ -0,0 +1,385 @@
+/**
+ * Copyright 2018-2023 by XGBoost contributors
+ */
+#pragma once
+#include <xgboost/json.h>
+#include <xgboost/metric.h>
+
+#include <map>
+#include <memory>
+
+#include "../../../src/common/linalg_op.h"
+#include "../helpers.h"
+
+namespace xgboost {
+namespace metric {
+
+/** Evaluates `name` repeatedly on one generated data set; every result must be identical. */
+inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) {
+  auto ctx = CreateEmptyGenericParam(device);
+  std::unique_ptr<Metric> metric{Metric::Create(name.c_str(), &ctx)};
+  size_t n_samples = 2048;
+
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  info.labels.Reshape(n_samples, 1);
+  info.num_row_ = n_samples;
+  auto &h_labels = info.labels.Data()->HostVector();
+  h_labels.resize(n_samples);
+
+  HostDeviceVector<float> predts;
+  auto &h_predts = predts.HostVector();
+  h_predts.resize(n_samples);
+
+  // One generator feeds both vectors: prediction first, then label, for each row.
+  SimpleLCG rng;
+  SimpleRealUniformDistribution<float> uniform{0.0f, 1.0f};
+  for (size_t row = 0; row < n_samples; ++row) {
+    h_predts[row] = uniform(&rng);
+    h_labels[row] = uniform(&rng);
+  }
+
+  auto const reference = metric->Evaluate(predts, p_fmat);
+  for (int repeat = 0; repeat < 8; ++repeat) {
+    ASSERT_EQ(metric->Evaluate(predts, p_fmat), reference);
+  }
+}
+
+/** Checks "rmse" on small fixed inputs for the given split mode, then its determinism. */
+inline void VerifyRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("rmse", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "rmse");  // fatal: returns early; unique_ptr frees metric
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.6403f, 0.001f);
+  auto expected = 2.8284f;  // sqrt(8); the weights below sum to 0
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected = sqrt(8.0f * collective::GetWorldSize());  // presumably summed over workers
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            { -1,   1,   9,  -9}, {}, data_split_mode),
+              expected, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            {  1,   2,   9,   8}, {}, data_split_mode),
+              0.6708f, 0.001f);
+
+  CheckDeterministicMetricElementWise(StringView{"rmse"}, GPUIDX);
+}
+
+/** Checks "rmsle" on small fixed inputs for the given split mode, then its determinism. */
+inline void VerifyRMSLE(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("rmsle", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "rmsle");  // fatal: returns early; unique_ptr frees metric
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},
+                            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, {}, {}, data_split_mode),
+              0.4063f, 1e-4);
+  auto expected = 0.6212f;  // sqrt(0.3859); the weights below sum to 0
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected = sqrt(0.3859f * collective::GetWorldSize());  // presumably summed over workers
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},
+                            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
+                            {   0,   -1,    1,    -9,   9}, {}, data_split_mode),
+              expected, 1e-4);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},
+                            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
+                            {   0,    1,    2,    9,    8}, {}, data_split_mode),
+              0.2415f, 1e-4);
+
+  CheckDeterministicMetricElementWise(StringView{"rmsle"}, GPUIDX);
+}
+
+/** Checks "mae" on small fixed inputs for the given split mode, then its determinism. */
+inline void VerifyMAE(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("mae", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "mae");  // fatal: returns early; unique_ptr frees metric
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.5f, 0.001f);
+  auto expected = 8.0f;  // the weights below sum to 0
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected *= collective::GetWorldSize();  // presumably summed over workers
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            { -1,   1,   9,  -9}, {}, data_split_mode),
+              expected, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            {  1,   2,   9,   8}, {}, data_split_mode),
+              0.54f, 0.001f);
+
+  CheckDeterministicMetricElementWise(StringView{"mae"}, GPUIDX);
+}
+
+/** Checks "mape" on small fixed inputs for the given split mode, then its determinism. */
+inline void VerifyMAPE(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("mape", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "mape");  // fatal: returns early; unique_ptr frees metric
+  EXPECT_NEAR(GetMetricEval(metric.get(), {150, 300}, {100, 200}, {}, {}, data_split_mode), 0.5f,
+              1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {50, 400, 500, 4000},
+                            {100, 200, 500, 1000}, {}, {}, data_split_mode),
+              1.125f, 0.001f);
+  auto expected = -26.5f;  // the weights below sum to 0
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected *= collective::GetWorldSize();  // presumably summed over workers
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {50, 400, 500, 4000},
+                            {100, 200, 500, 1000},
+                            { -1,   1,   9,  -9}, {}, data_split_mode),
+              expected, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {50, 400, 500, 4000},
+                            {100, 200, 500, 1000},
+                            {  1,   2,   9,   8}, {}, data_split_mode),
+              1.3250f, 0.001f);
+  CheckDeterministicMetricElementWise(StringView{"mape"}, GPUIDX);
+}
+
+/** Checks "mphe" with an empty configuration, then again with huber_slope set to 0.1. */
+inline void VerifyMPHE(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("mphe", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "mphe");
+
+  // All four-row cases share these predictions and labels; only the weights change.
+  HostDeviceVector<float> predts{0.1f, 0.9f, 0.1f, 0.9f};
+  std::vector<float> labels{0, 0, 1, 1};
+  std::vector<float> zero_sum_weights{-1, 1, 9, -9};
+  std::vector<float> positive_weights{1, 2, 9, 8};
+
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, {}, {}, data_split_mode),
+              0.1751f, 1e-4);
+
+  // With the zero-sum weights the expectation scales with the world size under row split.
+  float expected = 3.40375f;
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected *= collective::GetWorldSize();
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, zero_sum_weights, {}, data_split_mode),
+              expected, 1e-4);
+  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, positive_weights, {}, data_split_mode),
+              0.1922f, 1e-4);
+
+  CheckDeterministicMetricElementWise(StringView{"mphe"}, GPUIDX);
+
+  // Re-configure the same metric object with an explicit slope.
+  metric->Configure({{"huber_slope", "0.1"}});
+  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, positive_weights, {}, data_split_mode),
+              0.0461686f, 1e-4);
+}
+
+/** Checks "logloss" on small fixed inputs for the given split mode, then its determinism. */
+inline void VerifyLogLoss(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("logloss", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "logloss");  // fatal: returns early; unique_ptr frees metric
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.5f, 1e-17f, 1.0f+1e-17f, 0.9f},
+                            {   0,      0,           1,    1}, {}, {}, data_split_mode),
+              0.1996f, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              1.2039f, 0.001f);
+  auto expected = 21.9722f;  // the weights below sum to 0
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected *= collective::GetWorldSize();  // presumably summed over workers
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            { -1,   1,   9,  -9}, {}, data_split_mode),
+              expected, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            {  1,   2,   9,   8}, {}, data_split_mode),
+              1.3138f, 0.001f);
+
+  CheckDeterministicMetricElementWise(StringView{"logloss"}, GPUIDX);
+}
+
+/** Checks "error" and "error@0.1" for the given split mode, then determinism of "error@0.5". */
+inline void VerifyError(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  // Owned by unique_ptr: a fatal ASSERT_* returns early and must not leak the metric.
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("error", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "error");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.5f, 0.001f);
+  auto expected = 10.0f;  // the weights below sum to 0
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected *= collective::GetWorldSize();  // presumably summed over workers
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            { -1,   1,   9,  -9}, {}, data_split_mode),
+              expected, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            {  1,   2,   9,   8}, {}, data_split_mode),
+              0.55f, 0.001f);
+
+  // A non-numeric value after '@' is rejected at creation.
+  EXPECT_ANY_THROW(xgboost::Metric::Create("error@abc", &ctx));
+
+  // "error@0.5f" is accepted and reports the plain name "error".
+  metric.reset(xgboost::Metric::Create("error@0.5f", &ctx));
+  metric->Configure({});
+  EXPECT_STREQ(metric->Name(), "error");
+
+  // The explicit value 0.1 (presumably the decision threshold) is kept in the name.
+  metric.reset(xgboost::Metric::Create("error@0.1", &ctx));
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "error@0.1");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {-0.1f, -0.9f, 0.1f, 0.9f},
+                            {   0,    0,   1,   1}, {}, {}, data_split_mode),
+              0.25f, 0.001f);
+  expected = 9.0f;
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected *= collective::GetWorldSize();
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {-0.1f, -0.9f, 0.1f, 0.9f},
+                            {   0,    0,   1,   1},
+                            { -1,   1,   9,  -9}, {}, data_split_mode),
+              expected, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {-0.1f, -0.9f, 0.1f, 0.9f},
+                            {   0,    0,   1,   1},
+                            {  1,   2,   9,   8}, {}, data_split_mode),
+              0.45f, 0.001f);
+
+  CheckDeterministicMetricElementWise(StringView{"error@0.5"}, GPUIDX);
+}
+
+/** Checks "poisson-nloglik" on small fixed inputs for the given split mode, then determinism. */
+inline void VerifyPoissonNegLogLik(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("poisson-nloglik", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "poisson-nloglik");  // fatal: early return; unique_ptr frees
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.5f, 1e-17f, 1.0f+1e-17f, 0.9f},
+                            {   0,      0,           1,    1}, {}, {}, data_split_mode),
+              0.6263f, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              1.1019f, 0.001f);
+  auto expected = 13.3750f;  // the weights below sum to 0
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected *= collective::GetWorldSize();  // presumably summed over workers
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            { -1,   1,   9,  -9}, {}, data_split_mode),
+              expected, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            {  1,   2,   9,   8}, {}, data_split_mode),
+              1.5783f, 0.001f);
+
+  CheckDeterministicMetricElementWise(StringView{"poisson-nloglik"}, GPUIDX);
+}
+
+/** Checks multi-target "rmse" for all-zero predictions against sqrt(mean(y^2)). */
+inline void VerifyMultiRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  size_t n_samples = 32, n_targets = 8;
+  linalg::Tensor<float, 2> y{{n_samples, n_targets}, GPUIDX};
+  auto &h_y = y.Data()->HostVector();
+  std::iota(h_y.begin(), h_y.end(), 0);
+  HostDeviceVector<float> predt(n_samples * n_targets, 0);
+
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{Metric::Create("rmse", &ctx)};
+  metric->Configure({});
+
+  auto loss = GetMultiMetricEval(metric.get(), predt, y, {}, {}, data_split_mode);
+  std::vector<float> weights(n_samples, 1);
+  auto loss_w = GetMultiMetricEval(metric.get(), predt, y, weights, {}, data_split_mode);
+  // Reference value: square the labels in place and average; the sum must start at 0, not 1.
+  std::transform(h_y.cbegin(), h_y.cend(), h_y.begin(), [](auto &v) { return v * v; });
+  auto ret = std::sqrt(std::accumulate(h_y.cbegin(), h_y.cend(), 0.0, std::plus<>{}) / h_y.size());
+  ASSERT_FLOAT_EQ(ret, loss);
+  ASSERT_FLOAT_EQ(ret, loss_w);
+}
+
+/** Checks the "quantile" metric for several quantile_alpha values, with and without weights. */
+inline void VerifyQuantile(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{Metric::Create("quantile", &ctx)};
+
+  HostDeviceVector<float> predts{0.1f, 0.9f, 0.1f, 0.9f};
+  std::vector<float> labels{0.5f, 0.5f, 0.9f, 0.1f};
+  std::vector<float> weights{0.2f, 0.4f, 0.6f, 0.8f};
+
+  // One entry per quantile_alpha setting, paired with the expectation for the weighted run.
+  struct AlphaCase {
+    char const *alpha;
+    float weighted;
+  };
+  AlphaCase const cases[] = {
+      {"[0.0]", 0.400f},
+      {"[0.2]", 0.376f},
+      {"[0.4]", 0.352f},
+      {"[0.8]", 0.304f},
+      {"[1.0]", 0.28f},
+  };
+
+  // Weighted evaluations come first, in the listed alpha order.
+  for (auto const &c : cases) {
+    metric->Configure(Args{{"quantile_alpha", c.alpha}});
+    EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, weights, {}, data_split_mode),
+                c.weighted, 0.001f);
+  }
+  // Without weights every alpha is expected to give 0.3.
+  for (auto const &c : cases) {
+    metric->Configure(Args{{"quantile_alpha", c.alpha}});
+    EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, {}, {}, data_split_mode), 0.3f,
+                0.001f);
+  }
+}
+}  // namespace metric
+}  // namespace xgboost
diff --git a/tests/cpp/metric/test_multiclass_metric.cc b/tests/cpp/metric/test_multiclass_metric.cc
index 2465b11c8..bfb638924 100644
--- a/tests/cpp/metric/test_multiclass_metric.cc
+++ b/tests/cpp/metric/test_multiclass_metric.cc
@@ -1,87 +1,29 @@
 // Copyright by Contributors
-#include <xgboost/metric.h>
+#include "test_multiclass_metric.h"
+
 #include <string>
 
-#include "../helpers.h"
-
 namespace xgboost {
-inline void CheckDeterministicMetricMultiClass(StringView name, int32_t device) {
-  auto ctx = CreateEmptyGenericParam(device);
-  std::unique_ptr<Metric> metric{Metric::Create(name.c_str(), &ctx)};
+namespace metric {
 
-  HostDeviceVector<float> predts;
-  auto p_fmat = EmptyDMatrix();
-  MetaInfo& info = p_fmat->Info();
-  auto &h_predts = predts.HostVector();
+TEST(Metric, DeclareUnifiedTest(MultiClassError)) { VerifyMultiClassError(); }
 
-  SimpleLCG lcg;
+TEST(Metric, DeclareUnifiedTest(MultiClassLogLoss)) { VerifyMultiClassLogLoss(); }
 
-  size_t n_samples = 2048, n_classes = 4;
-
-  info.labels.Reshape(n_samples);
-  auto &h_labels = info.labels.Data()->HostVector();
-  h_predts.resize(n_samples * n_classes);
-
-  {
-    SimpleRealUniformDistribution<float> dist{0.0f, static_cast<float>(n_classes)};
-    for (size_t i = 0; i < n_samples; ++i) {
-      h_labels[i] = dist(&lcg);
-    }
-  }
-
-  {
-    SimpleRealUniformDistribution<float> dist{0.0f, 1.0f};
-    for (size_t i = 0; i < n_samples * n_classes; ++i) {
-      h_predts[i] = dist(&lcg);
-    }
-  }
-
-  auto result = metric->Evaluate(predts, p_fmat);
-  for (size_t i = 0; i < 8; ++i) {
-    ASSERT_EQ(metric->Evaluate(predts, p_fmat), result);
-  }
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassErrorRowSplit) {  // row split
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassError, DataSplitMode::kRow);
 }
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassErrorColumnSplit) {  // col split
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassError, DataSplitMode::kCol);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassLogLossRowSplit) {  // row split
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassLogLoss, DataSplitMode::kRow);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassLogLossColumnSplit) {  // col split
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassLogLoss, DataSplitMode::kCol);
+}
+}  // namespace metric
 }  // namespace xgboost
-
-inline void TestMultiClassError(int device) {
-  auto ctx = xgboost::CreateEmptyGenericParam(device);
-  ctx.gpu_id = device;
-  xgboost::Metric * metric = xgboost::Metric::Create("merror", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "merror");
-  EXPECT_ANY_THROW(GetMetricEval(metric, {0}, {0, 0}));
-  EXPECT_NEAR(GetMetricEval(
-      metric, {1, 0, 0, 0, 1, 0, 0, 0, 1}, {0, 1, 2}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f},
-                            {0, 1, 2}),
-              0.666f, 0.001f);
-  delete metric;
-}
-
-TEST(Metric, DeclareUnifiedTest(MultiClassError)) {
-  TestMultiClassError(GPUIDX);
-  xgboost::CheckDeterministicMetricMultiClass(xgboost::StringView{"merror"}, GPUIDX);
-}
-
-inline void TestMultiClassLogLoss(int device) {
-  auto ctx = xgboost::CreateEmptyGenericParam(device);
-  ctx.gpu_id = device;
-  xgboost::Metric * metric = xgboost::Metric::Create("mlogloss", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "mlogloss");
-  EXPECT_ANY_THROW(GetMetricEval(metric, {0}, {0, 0}));
-  EXPECT_NEAR(GetMetricEval(
-    metric, {1, 0, 0, 0, 1, 0, 0, 0, 1}, {0, 1, 2}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f},
-                            {0, 1, 2}),
-              2.302f, 0.001f);
-
-  delete metric;
-}
-
-TEST(Metric, DeclareUnifiedTest(MultiClassLogLoss)) {
-  TestMultiClassLogLoss(GPUIDX);
-  xgboost::CheckDeterministicMetricMultiClass(xgboost::StringView{"mlogloss"}, GPUIDX);
-}
diff --git a/tests/cpp/metric/test_multiclass_metric.h b/tests/cpp/metric/test_multiclass_metric.h
new file mode 100644
index 000000000..cd2b142fc
--- /dev/null
+++ b/tests/cpp/metric/test_multiclass_metric.h
@@ -0,0 +1,94 @@
+// Copyright by Contributors
+#pragma once
+#include <xgboost/metric.h>
+
+#include <memory>
+#include <string>
+
+#include "../helpers.h"
+
+namespace xgboost {
+namespace metric {
+
+/** Evaluates `name` repeatedly on one generated data set; all results must be identical. */
+inline void CheckDeterministicMetricMultiClass(StringView name, int32_t device) {
+  auto ctx = CreateEmptyGenericParam(device);
+  std::unique_ptr<Metric> metric{Metric::Create(name.c_str(), &ctx)};
+  size_t n_samples = 2048, n_classes = 4;
+
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  info.labels.Reshape(n_samples);
+  auto &h_labels = info.labels.Data()->HostVector();
+
+  // One prediction value per (sample, class) pair.
+  HostDeviceVector<float> predts;
+  auto &h_predts = predts.HostVector();
+  h_predts.resize(n_samples * n_classes);
+
+  // A single generator is shared; labels are drawn first, then predictions.
+  SimpleLCG rng;
+
+  // Labels use the distribution built from {0, n_classes}.
+  SimpleRealUniformDistribution<float> label_dist{0.0f, static_cast<float>(n_classes)};
+  for (size_t row = 0; row < n_samples; ++row) {
+    h_labels[row] = label_dist(&rng);
+  }
+
+  // Predictions use the distribution built from {0, 1}.
+  SimpleRealUniformDistribution<float> score_dist{0.0f, 1.0f};
+  for (size_t cell = 0; cell < n_samples * n_classes; ++cell) {
+    h_predts[cell] = score_dist(&rng);
+  }
+
+  auto const reference = metric->Evaluate(predts, p_fmat);
+  for (int repeat = 0; repeat < 8; ++repeat) {
+    ASSERT_EQ(metric->Evaluate(predts, p_fmat), reference);
+  }
+}
+
+/** Checks "merror" on small fixed inputs; one prediction with two labels must throw. */
+inline void TestMultiClassError(int device, DataSplitMode data_split_mode) {
+  auto ctx = xgboost::CreateEmptyGenericParam(device);
+  ctx.gpu_id = device;
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("merror", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "merror");  // fatal: returns early; unique_ptr frees metric
+  EXPECT_ANY_THROW(GetMetricEval(metric.get(), {0}, {0, 0}, {}, {}, data_split_mode));
+  EXPECT_NEAR(GetMetricEval(
+      metric.get(), {1, 0, 0, 0, 1, 0, 0, 0, 1}, {0, 1, 2}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f},
+                            {0, 1, 2}, {}, {}, data_split_mode),
+              0.666f, 0.001f);
+}
+
+inline void VerifyMultiClassError(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  TestMultiClassError(GPUIDX, data_split_mode);  // value checks for "merror"
+  CheckDeterministicMetricMultiClass(StringView{"merror"}, GPUIDX);  // repeatability check
+}
+
+/** Checks "mlogloss" on small fixed inputs for the given split mode. */
+inline void TestMultiClassLogLoss(int device, DataSplitMode data_split_mode) {
+  auto ctx = xgboost::CreateEmptyGenericParam(device);
+  ctx.gpu_id = device;
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("mlogloss", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "mlogloss");  // fatal: returns early; unique_ptr frees metric
+  // One prediction paired with two labels is expected to throw.
+  EXPECT_ANY_THROW(GetMetricEval(metric.get(), {0}, {0, 0}, {}, {}, data_split_mode));
+  EXPECT_NEAR(GetMetricEval(
+    metric.get(), {1, 0, 0, 0, 1, 0, 0, 0, 1}, {0, 1, 2}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f},
+                            {0, 1, 2}, {}, {}, data_split_mode),
+              2.302f, 0.001f);
+}
+
+inline void VerifyMultiClassLogLoss(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  TestMultiClassLogLoss(GPUIDX, data_split_mode);  // value checks for "mlogloss"
+  CheckDeterministicMetricMultiClass(StringView{"mlogloss"}, GPUIDX);  // repeatability check
+}
+
+}  // namespace metric
+}  // namespace xgboost
diff --git a/tests/cpp/metric/test_rank_metric.cc b/tests/cpp/metric/test_rank_metric.cc
index fa506a412..c30d361f0 100644
--- a/tests/cpp/metric/test_rank_metric.cc
+++ b/tests/cpp/metric/test_rank_metric.cc
@@ -11,16 +11,20 @@
 #include <memory>                        // for unique_ptr
 #include <vector>                        // for vector
 
+#include "test_rank_metric.h"
 #include "../helpers.h"                  // for GetMetricEval, CreateEmptyGe...
 #include "xgboost/base.h"                // for bst_float, kRtEps
 #include "xgboost/host_device_vector.h"  // for HostDeviceVector
 #include "xgboost/json.h"                // for Json, String, Object
 
+namespace xgboost {
+namespace metric {
+
 #if !defined(__CUDACC__) && !defined(__HIP_PLATFORM_AMD__)
 TEST(Metric, AMS) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  EXPECT_ANY_THROW(xgboost::Metric::Create("ams", &ctx));
-  xgboost::Metric* metric = xgboost::Metric::Create("ams@0.5f", &ctx);
+  auto ctx = CreateEmptyGenericParam(GPUIDX);
+  EXPECT_ANY_THROW(Metric::Create("ams", &ctx));
+  Metric* metric = Metric::Create("ams@0.5f", &ctx);
   ASSERT_STREQ(metric->Name(), "ams@0.5");
   EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0.311f, 0.001f);
   EXPECT_NEAR(GetMetricEval(metric,
@@ -29,7 +33,7 @@ TEST(Metric, AMS) {
               0.29710f, 0.001f);
 
   delete metric;
-  metric = xgboost::Metric::Create("ams@0", &ctx);
+  metric = Metric::Create("ams@0", &ctx);
   ASSERT_STREQ(metric->Name(), "ams@0");
   EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0.311f, 0.001f);
 
@@ -37,172 +41,44 @@ TEST(Metric, AMS) {
 }
 #endif
 
-TEST(Metric, DeclareUnifiedTest(Precision)) {
-  // When the limit for precision is not given, it takes the limit at
-  // std::numeric_limits<unsigned>::max(); hence all values are very small
-  // NOTE(AbdealiJK): Maybe this should be fixed to be num_row by default.
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("pre", &ctx);
-  ASSERT_STREQ(metric->Name(), "pre");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-7);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0, 1e-7);
+TEST(Metric, DeclareUnifiedTest(Precision)) { VerifyPrecision(); }
 
-  delete metric;
-  metric = xgboost::Metric::Create("pre@2", &ctx);
-  ASSERT_STREQ(metric->Name(), "pre@2");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0.5f, 1e-7);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.5f, 0.001f);
+TEST(Metric, DeclareUnifiedTest(NDCG)) { VerifyNDCG(); }
 
-  EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {}));
+TEST(Metric, DeclareUnifiedTest(MAP)) { VerifyMAP(); }
 
-  delete metric;
+TEST(Metric, DeclareUnifiedTest(NDCGExpGain)) { VerifyNDCGExpGain(); }
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), PrecisionRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyPrecision, DataSplitMode::kRow);
 }
 
-namespace xgboost {
-namespace metric {
-TEST(Metric, DeclareUnifiedTest(NDCG)) {
-  auto ctx = CreateEmptyGenericParam(GPUIDX);
-  Metric * metric = xgboost::Metric::Create("ndcg", &ctx);
-  ASSERT_STREQ(metric->Name(), "ndcg");
-  EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {}));
-  ASSERT_NEAR(GetMetricEval(metric,
-                            xgboost::HostDeviceVector<xgboost::bst_float>{},
-                            {}), 1, 1e-10);
-  ASSERT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.6509f, 0.001f);
-
-  delete metric;
-  metric = xgboost::Metric::Create("ndcg@2", &ctx);
-  ASSERT_STREQ(metric->Name(), "ndcg@2");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.3868f, 0.001f);
-
-  delete metric;
-  metric = xgboost::Metric::Create("ndcg@-", &ctx);
-  ASSERT_STREQ(metric->Name(), "ndcg-");
-  EXPECT_NEAR(GetMetricEval(metric,
-                            xgboost::HostDeviceVector<xgboost::bst_float>{},
-                            {}), 0, 1e-10);
-  ASSERT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1.f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.6509f, 0.001f);
-  delete metric;
-  metric = xgboost::Metric::Create("ndcg-", &ctx);
-  ASSERT_STREQ(metric->Name(), "ndcg-");
-  EXPECT_NEAR(GetMetricEval(metric,
-                            xgboost::HostDeviceVector<xgboost::bst_float>{},
-                            {}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1.f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-               0.6509f, 0.001f);
-
-  delete metric;
-  metric = xgboost::Metric::Create("ndcg@2-", &ctx);
-  ASSERT_STREQ(metric->Name(), "ndcg@2-");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1.f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              1.f - 0.3868f, 1.f - 0.001f);
-
-  delete metric;
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), PrecisionColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyPrecision, DataSplitMode::kCol);
 }
 
-TEST(Metric, DeclareUnifiedTest(MAP)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  Metric * metric = xgboost::Metric::Create("map", &ctx);
-  ASSERT_STREQ(metric->Name(), "map");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, kRtEps);
-
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.5f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            xgboost::HostDeviceVector<xgboost::bst_float>{},
-                            std::vector<xgboost::bst_float>{}), 1, 1e-10);
-
-  // Rank metric with group info
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.2f, 0.8f, 0.4f, 1.7f},
-                            {1, 1, 1, 0, 1, 0},  // Labels
-                            {},  // Weights
-                            {0, 2, 5, 6}),  // Group info
-              0.8611f, 0.001f);
-
-  delete metric;
-  metric = xgboost::Metric::Create("map@-", &ctx);
-  ASSERT_STREQ(metric->Name(), "map-");
-  EXPECT_NEAR(GetMetricEval(metric,
-                            xgboost::HostDeviceVector<xgboost::bst_float>{},
-                            {}), 0, 1e-10);
-
-  delete metric;
-  metric = xgboost::Metric::Create("map-", &ctx);
-  ASSERT_STREQ(metric->Name(), "map-");
-  EXPECT_NEAR(GetMetricEval(metric,
-                            xgboost::HostDeviceVector<xgboost::bst_float>{},
-                            {}), 0, 1e-10);
-
-  delete metric;
-  metric = xgboost::Metric::Create("map@2", &ctx);
-  ASSERT_STREQ(metric->Name(), "map@2");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.25f, 0.001f);
-  delete metric;
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyNDCG, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(NDCGExpGain)) {
-  Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyNDCG, DataSplitMode::kCol);
+}
 
-  auto p_fmat = xgboost::RandomDataGenerator{0, 0, 0}.GenerateDMatrix();
-  MetaInfo& info = p_fmat->Info();
-  info.labels = linalg::Matrix<float>{{10.0f, 0.0f, 0.0f, 1.0f, 5.0f}, {5}, ctx.gpu_id};
-  info.num_row_ = info.labels.Shape(0);
-  info.group_ptr_.resize(2);
-  info.group_ptr_[0] = 0;
-  info.group_ptr_[1] = info.num_row_;
-  HostDeviceVector<float> predt{{0.1f, 0.2f, 0.3f, 4.0f, 70.0f}};
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMAP, DataSplitMode::kRow);
+}
 
-  std::unique_ptr<Metric> metric{Metric::Create("ndcg", &ctx)};
-  Json config{Object{}};
-  config["name"] = String{"ndcg"};
-  config["lambdarank_param"] = Object{};
-  config["lambdarank_param"]["ndcg_exp_gain"] = String{"true"};
-  config["lambdarank_param"]["lambdarank_num_pair_per_sample"] = String{"32"};
-  metric->LoadConfig(config);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMAP, DataSplitMode::kCol);
+}
 
-  auto ndcg = metric->Evaluate(predt, p_fmat);
-  ASSERT_NEAR(ndcg, 0.409738f, kRtEps);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGExpGainRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyNDCGExpGain, DataSplitMode::kRow);
+}
 
-  config["lambdarank_param"]["ndcg_exp_gain"] = String{"false"};
-  metric->LoadConfig(config);
-
-  ndcg = metric->Evaluate(predt, p_fmat);
-  ASSERT_NEAR(ndcg, 0.695694f, kRtEps);
-
-  predt.HostVector() = info.labels.Data()->HostVector();
-  ndcg = metric->Evaluate(predt, p_fmat);
-  ASSERT_NEAR(ndcg, 1.0, kRtEps);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGExpGainColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyNDCGExpGain, DataSplitMode::kCol);
 }
 }  // namespace metric
 }  // namespace xgboost
diff --git a/tests/cpp/metric/test_rank_metric.h b/tests/cpp/metric/test_rank_metric.h
new file mode 100644
index 000000000..318de961b
--- /dev/null
+++ b/tests/cpp/metric/test_rank_metric.h
@@ -0,0 +1,191 @@
+/**
+ * Copyright 2016-2023 by XGBoost Contributors
+ */
+#pragma once
+#include <gtest/gtest.h>                 // for Test, EXPECT_NEAR, ASSERT_STREQ
+#include <xgboost/context.h>             // for Context
+#include <xgboost/data.h>                // for MetaInfo, DMatrix
+#include <xgboost/linalg.h>              // for Matrix
+#include <xgboost/metric.h>              // for Metric
+
+#include <algorithm>                     // for max
+#include <memory>                        // for unique_ptr
+#include <vector>                        // for vector
+
+#include "../helpers.h"                  // for GetMetricEval, CreateEmptyGe...
+#include "xgboost/base.h"                // for bst_float, kRtEps
+#include "xgboost/host_device_vector.h"  // for HostDeviceVector
+#include "xgboost/json.h"                // for Json, String, Object
+
+namespace xgboost {
+namespace metric {
+
+inline void VerifyPrecision(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  // When the limit for precision is not given, it takes the limit at
+  // std::numeric_limits<unsigned>::max(); hence all values are very small
+  // NOTE(AbdealiJK): Maybe this should be fixed to be num_row by default.
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("pre", &ctx)};
+  ASSERT_STREQ(metric->Name(), "pre");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-7);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0, 1e-7);
+
+  // Same inputs evaluated with an explicit cut-off ("pre@2").
+  metric.reset(xgboost::Metric::Create("pre@2", &ctx));
+  ASSERT_STREQ(metric->Name(), "pre@2");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0.5f, 1e-7);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.5f, 0.001f);
+
+  // Predictions passed with an empty label list must make the evaluation throw.
+  EXPECT_ANY_THROW(GetMetricEval(metric.get(), {0, 1}, {}, {}, {}, data_split_mode));
+  // No manual delete: std::unique_ptr frees the metric even when an ASSERT_* returns early.
+}
+
+inline void VerifyNDCG(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{xgboost::Metric::Create("ndcg", &ctx)};
+  ASSERT_STREQ(metric->Name(), "ndcg");
+  EXPECT_ANY_THROW(GetMetricEval(metric.get(), {0, 1}, {}, {}, {}, data_split_mode));
+  ASSERT_NEAR(GetMetricEval(metric.get(),
+                            xgboost::HostDeviceVector<xgboost::bst_float>{},
+                            {}, {}, {}, data_split_mode), 1, 1e-10);
+  ASSERT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 1, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.6509f, 0.001f);
+
+  // Truncated variant: "ndcg@2".
+  metric.reset(xgboost::Metric::Create("ndcg@2", &ctx));
+  ASSERT_STREQ(metric->Name(), "ndcg@2");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 1, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.3868f, 0.001f);
+
+  // "ndcg@-" reports its name as "ndcg-"; empty input is expected to score 0 instead of 1.
+  metric.reset(xgboost::Metric::Create("ndcg@-", &ctx));
+  ASSERT_STREQ(metric->Name(), "ndcg-");
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            xgboost::HostDeviceVector<xgboost::bst_float>{},
+                            {}, {}, {}, data_split_mode), 0, 1e-10);
+  ASSERT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 1.f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.6509f, 0.001f);
+  // "ndcg-" spelled directly: same expectations as "ndcg@-" above.
+  metric.reset(xgboost::Metric::Create("ndcg-", &ctx));
+  ASSERT_STREQ(metric->Name(), "ndcg-");
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            xgboost::HostDeviceVector<xgboost::bst_float>{},
+                            {}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 1.f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.6509f, 0.001f);
+
+  // Truncated variant combined with the "-" suffix.
+  metric.reset(xgboost::Metric::Create("ndcg@2-", &ctx));
+  ASSERT_STREQ(metric->Name(), "ndcg@2-");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 1.f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              1.f - 0.3868f, 1.f - 0.001f);
+  // NOTE(review): the tolerance above is 1 - 0.001 = 0.999, so this check accepts nearly any
+  // value; confirm the intended expectation before tightening it.
+}
+
+inline void VerifyMAP(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{xgboost::Metric::Create("map", &ctx)};
+  ASSERT_STREQ(metric->Name(), "map");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 1, kRtEps);
+
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.5f, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            xgboost::HostDeviceVector<xgboost::bst_float>{},
+                            std::vector<xgboost::bst_float>{}, {}, {}, data_split_mode), 1, 1e-10);
+
+  // Rank metric with group info
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.2f, 0.8f, 0.4f, 1.7f},
+                            {1, 1, 1, 0, 1, 0},  // Labels
+                            {},  // Weights
+                            {0, 2, 5, 6},  // Group info
+                            data_split_mode),
+              0.8611f, 0.001f);
+
+  // "map@-" reports its name as "map-"; empty input is expected to score 0 instead of 1.
+  metric.reset(xgboost::Metric::Create("map@-", &ctx));
+  ASSERT_STREQ(metric->Name(), "map-");
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            xgboost::HostDeviceVector<xgboost::bst_float>{},
+                            {}, {}, {}, data_split_mode), 0, 1e-10);
+
+  // "map-" spelled directly: same expectation as "map@-" above.
+  metric.reset(xgboost::Metric::Create("map-", &ctx));
+  ASSERT_STREQ(metric->Name(), "map-");
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            xgboost::HostDeviceVector<xgboost::bst_float>{},
+                            {}, {}, {}, data_split_mode), 0, 1e-10);
+
+  // Truncated variant: "map@2".
+  metric.reset(xgboost::Metric::Create("map@2", &ctx));
+  ASSERT_STREQ(metric->Name(), "map@2");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 1, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.25f, 0.001f);
+  // No manual delete: std::unique_ptr frees the metric even when an ASSERT_* returns early.
+}
+
+inline void VerifyNDCGExpGain(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  // Attach five labels to the generated DMatrix and place all of them in one query group.
+  auto p_fmat = xgboost::RandomDataGenerator{0, 0, 0}.GenerateDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  info.labels = linalg::Matrix<float>{{10.0f, 0.0f, 0.0f, 1.0f, 5.0f}, {5}, ctx.gpu_id};
+  info.num_row_ = info.labels.Shape(0);
+  info.group_ptr_.resize(2);
+  info.group_ptr_[0] = 0;
+  info.group_ptr_[1] = info.num_row_;
+  info.data_split_mode = data_split_mode;
+  HostDeviceVector<float> predt{{0.1f, 0.2f, 0.3f, 4.0f, 70.0f}};
+
+  std::unique_ptr<Metric> metric{Metric::Create("ndcg", &ctx)};
+  Json config{Object{}};
+  config["name"] = String{"ndcg"};
+  config["lambdarank_param"] = Object{};
+  config["lambdarank_param"]["ndcg_exp_gain"] = String{"true"};
+  config["lambdarank_param"]["lambdarank_num_pair_per_sample"] = String{"32"};
+  metric->LoadConfig(config);
+  // Score with ndcg_exp_gain=true.
+  auto ndcg = metric->Evaluate(predt, p_fmat);
+  ASSERT_NEAR(ndcg, 0.409738f, kRtEps);
+  // Reload with ndcg_exp_gain=false: the same predictions now have a different expected score.
+  config["lambdarank_param"]["ndcg_exp_gain"] = String{"false"};
+  metric->LoadConfig(config);
+
+  ndcg = metric->Evaluate(predt, p_fmat);
+  ASSERT_NEAR(ndcg, 0.695694f, kRtEps);
+  // Predictions equal to the labels are expected to score 1.
+  predt.HostVector() = info.labels.Data()->HostVector();
+  ndcg = metric->Evaluate(predt, p_fmat);
+  ASSERT_NEAR(ndcg, 1.0, kRtEps);
+}
+}  // namespace metric
+}  // namespace xgboost
diff --git a/tests/cpp/metric/test_survival_metric.cu b/tests/cpp/metric/test_survival_metric.cu
index 80d6b72e6..723f306e4 100644
--- a/tests/cpp/metric/test_survival_metric.cu
+++ b/tests/cpp/metric/test_survival_metric.cu
@@ -2,105 +2,31 @@
  * Copyright (c) by Contributors 2020
  */
 #include <gtest/gtest.h>
-#include <cmath>
+#include "test_survival_metric.h"
 #include "xgboost/metric.h"
-#include "../helpers.h"
-#include "../../../src/common/survival_util.h"
 
 /** Tests for Survival metrics that should run both on CPU and GPU **/
 
 namespace xgboost {
 namespace common {
-namespace {
-inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) {
-  auto ctx = CreateEmptyGenericParam(device);
-  std::unique_ptr<Metric> metric{Metric::Create(name.c_str(), &ctx)};
-  metric->Configure(Args{});
+TEST(Metric, DeclareUnifiedTest(AFTNegLogLik)) { VerifyAFTNegLogLik(); }
 
-  HostDeviceVector<float> predts;
-  auto p_fmat = EmptyDMatrix();
-  MetaInfo& info = p_fmat->Info();
-  auto &h_predts = predts.HostVector();
-
-  SimpleLCG lcg;
-  SimpleRealUniformDistribution<float> dist{0.0f, 1.0f};
-
-  size_t n_samples = 2048;
-  h_predts.resize(n_samples);
-
-  for (size_t i = 0; i < n_samples; ++i) {
-    h_predts[i] = dist(&lcg);
-  }
-
-  auto &h_upper = info.labels_upper_bound_.HostVector();
-  auto &h_lower = info.labels_lower_bound_.HostVector();
-  h_lower.resize(n_samples);
-  h_upper.resize(n_samples);
-  for (size_t i = 0; i < n_samples; ++i) {
-    h_lower[i] = 1;
-    h_upper[i] = 10;
-  }
-
-  auto result = metric->Evaluate(predts, p_fmat);
-  for (size_t i = 0; i < 8; ++i) {
-    ASSERT_EQ(metric->Evaluate(predts, p_fmat), result);
-  }
-}
-}  // anonymous namespace
-
-TEST(Metric, DeclareUnifiedTest(AFTNegLogLik)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-
-  /**
-   * Test aggregate output from the AFT metric over a small test data set.
-   * This is unlike AFTLoss.* tests, which verify metric values over individual data points.
-   **/
-  auto p_fmat = EmptyDMatrix();
-  MetaInfo& info = p_fmat->Info();
-  info.num_row_ = 4;
-  info.labels_lower_bound_.HostVector()
-    = { 100.0f, 0.0f, 60.0f, 16.0f };
-  info.labels_upper_bound_.HostVector()
-    = { 100.0f, 20.0f, std::numeric_limits<bst_float>::infinity(), 200.0f };
-  info.weights_.HostVector() = std::vector<bst_float>();
-  HostDeviceVector<bst_float> preds(4, std::log(64));
-
-  struct TestCase {
-    std::string dist_type;
-    bst_float reference_value;
-  };
-  for (const auto& test_case : std::vector<TestCase>{ {"normal", 2.1508f}, {"logistic", 2.1804f},
-                                                      {"extreme", 2.0706f} }) {
-    std::unique_ptr<Metric> metric(Metric::Create("aft-nloglik", &ctx));
-    metric->Configure({ {"aft_loss_distribution", test_case.dist_type},
-                        {"aft_loss_distribution_scale", "1.0"} });
-    EXPECT_NEAR(metric->Evaluate(preds, p_fmat), test_case.reference_value, 1e-4);
-  }
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), AFTNegLogLikRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyAFTNegLogLik, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(IntervalRegressionAccuracy)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), AFTNegLogLikColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyAFTNegLogLik, DataSplitMode::kCol);
+}
 
-  auto p_fmat = EmptyDMatrix();
-  MetaInfo& info = p_fmat->Info();
-  info.num_row_ = 4;
-  info.labels_lower_bound_.HostVector() = { 20.0f, 0.0f, 60.0f, 16.0f };
-  info.labels_upper_bound_.HostVector() = { 80.0f, 20.0f, 80.0f, 200.0f };
-  info.weights_.HostVector() = std::vector<bst_float>();
-  HostDeviceVector<bst_float> preds(4, std::log(60.0f));
+TEST(Metric, DeclareUnifiedTest(IntervalRegressionAccuracy)) { VerifyIntervalRegressionAccuracy(); }
 
-  std::unique_ptr<Metric> metric(Metric::Create("interval-regression-accuracy", &ctx));
-  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.75f);
-  info.labels_lower_bound_.HostVector()[2] = 70.0f;
-  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.50f);
-  info.labels_upper_bound_.HostVector()[2] = std::numeric_limits<bst_float>::infinity();
-  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.50f);
-  info.labels_upper_bound_.HostVector()[3] = std::numeric_limits<bst_float>::infinity();
-  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.50f);
-  info.labels_lower_bound_.HostVector()[0] = 70.0f;
-  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.25f);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), IntervalRegressionAccuracyRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyIntervalRegressionAccuracy, DataSplitMode::kRow);
+}
 
-  CheckDeterministicMetricElementWise(StringView{"interval-regression-accuracy"}, GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), IntervalRegressionAccuracyColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyIntervalRegressionAccuracy, DataSplitMode::kCol);
 }
 
 // Test configuration of AFT metric
@@ -118,6 +44,5 @@ TEST(AFTNegLogLikMetric, DeclareUnifiedTest(Configuration)) {
 
   CheckDeterministicMetricElementWise(StringView{"aft-nloglik"}, GPUIDX);
 }
-
 }  // namespace common
 }  // namespace xgboost
diff --git a/tests/cpp/metric/test_survival_metric.h b/tests/cpp/metric/test_survival_metric.h
new file mode 100644
index 000000000..75414733d
--- /dev/null
+++ b/tests/cpp/metric/test_survival_metric.h
@@ -0,0 +1,107 @@
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
+ */
+#pragma once
+#include <gtest/gtest.h>
+
+#include <cmath>
+
+#include "../../../src/common/survival_util.h"
+#include "../helpers.h"
+#include "xgboost/metric.h"
+
+namespace xgboost {
+namespace common {
+inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) {
+  auto ctx = CreateEmptyGenericParam(device);
+  std::unique_ptr<Metric> metric{Metric::Create(name.c_str(), &ctx)};
+  metric->Configure(Args{});
+  // Purpose: evaluate metric `name` repeatedly on fixed inputs and require identical results.
+  HostDeviceVector<float> predts;
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  auto &h_predts = predts.HostVector();
+
+  SimpleLCG lcg;
+  SimpleRealUniformDistribution<float> dist{0.0f, 1.0f};
+
+  size_t n_samples = 2048;
+  h_predts.resize(n_samples);
+  // Predictions are drawn from `dist` using the default-constructed generator `lcg`.
+  for (size_t i = 0; i < n_samples; ++i) {
+    h_predts[i] = dist(&lcg);
+  }
+  // Every sample gets the same label interval: lower bound 1, upper bound 10.
+  auto &h_upper = info.labels_upper_bound_.HostVector();
+  auto &h_lower = info.labels_lower_bound_.HostVector();
+  h_lower.resize(n_samples);
+  h_upper.resize(n_samples);
+  for (size_t i = 0; i < n_samples; ++i) {
+    h_lower[i] = 1;
+    h_upper[i] = 10;
+  }
+  // The first evaluation is the reference; eight further evaluations must equal it exactly.
+  auto result = metric->Evaluate(predts, p_fmat);
+  for (size_t i = 0; i < 8; ++i) {
+    ASSERT_EQ(metric->Evaluate(predts, p_fmat), result);
+  }
+}
+
+inline void VerifyAFTNegLogLik(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+
+  /**
+   * Test aggregate output from the AFT metric over a small test data set.
+   * This is unlike AFTLoss.* tests, which verify metric values over individual data points.
+   **/
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  info.num_row_ = 4;
+  info.labels_lower_bound_.HostVector()
+      = { 100.0f, 0.0f, 60.0f, 16.0f };
+  info.labels_upper_bound_.HostVector()
+      = { 100.0f, 20.0f, std::numeric_limits<bst_float>::infinity(), 200.0f };
+  info.weights_.HostVector() = std::vector<bst_float>();  // no sample weights
+  info.data_split_mode = data_split_mode;
+  HostDeviceVector<bst_float> preds(4, std::log(64));
+  // Expected "aft-nloglik" per distribution; every case is configured with scale "1.0".
+  struct TestCase {
+    std::string dist_type;
+    bst_float reference_value;
+  };
+  for (const auto& test_case : std::vector<TestCase>{ {"normal", 2.1508f}, {"logistic", 2.1804f},
+                                                     {"extreme", 2.0706f} }) {
+    std::unique_ptr<Metric> metric(Metric::Create("aft-nloglik", &ctx));
+    metric->Configure({ {"aft_loss_distribution", test_case.dist_type},
+                       {"aft_loss_distribution_scale", "1.0"} });
+    EXPECT_NEAR(metric->Evaluate(preds, p_fmat), test_case.reference_value, 1e-4);
+  }
+}
+
+inline void VerifyIntervalRegressionAccuracy(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  // Four rows with lower/upper label bounds; the prediction vector is built from log(60).
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  info.num_row_ = 4;
+  info.labels_lower_bound_.HostVector() = { 20.0f, 0.0f, 60.0f, 16.0f };
+  info.labels_upper_bound_.HostVector() = { 80.0f, 20.0f, 80.0f, 200.0f };
+  info.weights_.HostVector() = std::vector<bst_float>();
+  info.data_split_mode = data_split_mode;
+  HostDeviceVector<bst_float> preds(4, std::log(60.0f));
+  // Baseline first; each later step edits a single bound and re-evaluates the accuracy.
+  std::unique_ptr<Metric> metric(Metric::Create("interval-regression-accuracy", &ctx));
+  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.75f);
+  info.labels_lower_bound_.HostVector()[2] = 70.0f;
+  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.50f);
+  info.labels_upper_bound_.HostVector()[2] = std::numeric_limits<bst_float>::infinity();
+  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.50f);
+  info.labels_upper_bound_.HostVector()[3] = std::numeric_limits<bst_float>::infinity();
+  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.50f);
+  info.labels_lower_bound_.HostVector()[0] = 70.0f;
+  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.25f);
+  // Also run the repeated-evaluation check for this metric on the same device.
+  CheckDeterministicMetricElementWise(StringView{"interval-regression-accuracy"}, GPUIDX);
+}
+}  // namespace common
+}  // namespace xgboost
diff --git a/tests/cpp/objective/test_lambdarank_obj.cc b/tests/cpp/objective/test_lambdarank_obj.cc
index 11cbf6bec..c808e97f0 100644
--- a/tests/cpp/objective/test_lambdarank_obj.cc
+++ b/tests/cpp/objective/test_lambdarank_obj.cc
@@ -5,6 +5,7 @@
 
 #include <gtest/gtest.h>                        // for Test, Message, TestPartResult, CmpHel...
 
+#include <algorithm>                            // for sort
 #include <cstddef>                              // for size_t
 #include <initializer_list>                     // for initializer_list
 #include <map>                                  // for map
@@ -13,7 +14,6 @@
 #include <string>                               // for char_traits, basic_string, string
 #include <vector>                               // for vector
 
-#include "../../../src/common/ranking_utils.h"  // for LambdaRankParam
 #include "../../../src/common/ranking_utils.h"  // for NDCGCache, LambdaRankParam
 #include "../helpers.h"                         // for CheckRankingObjFunction, CheckConfigReload
 #include "xgboost/base.h"                       // for GradientPair, bst_group_t, Args
@@ -25,6 +25,126 @@
 #include "xgboost/span.h"                       // for Span
 
 namespace xgboost::obj {
+TEST(LambdaRank, NDCGJsonIO) {
+  Context ctx;
+  TestNDCGJsonIO(&ctx);
+}
+
+void TestNDCGGPair(Context const* ctx) {
+  {
+    std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:ndcg", ctx)};
+    obj->Configure(Args{{"lambdarank_pair_method", "topk"}});
+    CheckConfigReload(obj, "rank:ndcg");
+
+    // No gain in swapping 2 documents.
+    CheckRankingObjFunction(obj,
+                            {1, 1, 1, 1},
+                            {1, 1, 1, 1},
+                            {1.0f, 1.0f},
+                            {0, 2, 4},
+                            {0.0f, -0.0f, 0.0f, 0.0f},
+                            {0.0f, 0.0f, 0.0f, 0.0f});
+  }
+  {
+    std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:ndcg", ctx)};
+    obj->Configure(Args{{"lambdarank_pair_method", "topk"}});
+    // Second query group has weight 0: its expected gradients and hessians are all 0.
+    CheckRankingObjFunction(obj,
+                            {0, 0.1f, 0, 0.1f},
+                            {0,   1, 0, 1},
+                            {2.0f, 0.0f},
+                            {0, 2, 4},
+                            {2.06611f, -2.06611f, 0.0f, 0.0f},
+                            {2.169331f, 2.169331f, 0.0f, 0.0f});
+    // Both groups weighted 2: the second group's expected values now mirror the first.
+    CheckRankingObjFunction(obj,
+                            {0, 0.1f, 0, 0.1f},
+                            {0,   1, 0, 1},
+                            {2.0f, 2.0f},
+                            {0, 2, 4},
+                            {2.06611f, -2.06611f, 2.06611f, -2.06611f},
+                            {2.169331f, 2.169331f, 2.169331f, 2.169331f});
+  }
+  // Direct GetGradient checks on two groups of two documents each.
+  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:ndcg", ctx)};
+  obj->Configure(Args{{"lambdarank_pair_method", "topk"}});
+
+  HostDeviceVector<float> predts{0, 1, 0, 1};
+  MetaInfo info;
+  info.labels = linalg::Tensor<float, 2>{{0, 1, 0, 1}, {4, 1}, GPUIDX};
+  info.group_ptr_ = {0, 2, 4};
+  info.num_row_ = 4;
+  HostDeviceVector<GradientPair> gpairs;
+  obj->GetGradient(predts, info, 0, &gpairs);
+  ASSERT_EQ(gpairs.Size(), predts.Size());
+  // Predictions that score the label-0 documents above the label-1 documents.
+  {
+    predts = {1, 0, 1, 0};
+    HostDeviceVector<GradientPair> gpairs;
+    obj->GetGradient(predts, info, 0, &gpairs);
+    for (size_t i = 0; i < gpairs.Size(); ++i) {
+      ASSERT_GT(gpairs.HostSpan()[i].GetHess(), 0);
+    }
+    ASSERT_LT(gpairs.HostSpan()[1].GetGrad(), 0);
+    ASSERT_LT(gpairs.HostSpan()[3].GetGrad(), 0);
+
+    ASSERT_GT(gpairs.HostSpan()[0].GetGrad(), 0);
+    ASSERT_GT(gpairs.HostSpan()[2].GetGrad(), 0);
+    // Per-group weights 2 and 3 must scale the gradient and hessian of that group's rows.
+    info.weights_ = {2, 3};
+    HostDeviceVector<GradientPair> weighted_gpairs;
+    obj->GetGradient(predts, info, 0, &weighted_gpairs);
+    auto const& h_gpairs = gpairs.ConstHostSpan();
+    auto const& h_weighted_gpairs = weighted_gpairs.ConstHostSpan();
+    for (size_t i : {0ul, 1ul}) {
+      ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetGrad(), h_gpairs[i].GetGrad() * 2.0f);
+      ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetHess(), h_gpairs[i].GetHess() * 2.0f);
+    }
+    for (size_t i : {2ul, 3ul}) {
+      ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetGrad(), h_gpairs[i].GetGrad() * 3.0f);
+      ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetHess(), h_gpairs[i].GetHess() * 3.0f);
+    }
+  }
+
+  ASSERT_NO_THROW(obj->DefaultEvalMetric());
+}
+
+TEST(LambdaRank, NDCGGPair) {
+  Context ctx;
+  TestNDCGGPair(&ctx);
+}
+
+void TestUnbiasedNDCG(Context const* ctx) {
+  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:ndcg", ctx)};
+  obj->Configure(Args{{"lambdarank_pair_method", "topk"},
+                      {"lambdarank_unbiased", "true"},
+                      {"lambdarank_bias_norm", "0"}});
+  std::shared_ptr<DMatrix> p_fmat{RandomDataGenerator{10, 1, 0.0f}.GenerateDMatrix(true, false, 2)};
+  auto h_label = p_fmat->Info().labels.HostView().Values();
+  // Move clicked samples to the beginning.
+  std::sort(h_label.begin(), h_label.end(), std::greater<>{});
+  HostDeviceVector<float> predt(p_fmat->Info().num_row_, 1.0f);
+  // NOTE(review): presumably this call updates the "ti+"/"tj-" values saved below - confirm.
+  HostDeviceVector<GradientPair> out_gpair;
+  obj->GetGradient(predt, p_fmat->Info(), 0, &out_gpair);
+  // Read the "ti+" and "tj-" arrays back through the saved JSON configuration.
+  Json config{Object{}};
+  obj->SaveConfig(&config);
+  auto ti_plus = get<F32Array const>(config["ti+"]);
+  ASSERT_FLOAT_EQ(ti_plus[0], 1.0);
+  // bias is non-increasing when prediction is constant. (constant cost on swapping documents)
+  for (std::size_t i = 1; i < ti_plus.size(); ++i) {
+    ASSERT_LE(ti_plus[i], ti_plus[i - 1]);
+  }
+  auto tj_minus = get<F32Array const>(config["tj-"]);
+  ASSERT_FLOAT_EQ(tj_minus[0], 1.0);
+}
+
+TEST(LambdaRank, UnbiasedNDCG) {
+  Context ctx;
+  TestUnbiasedNDCG(&ctx);
+}
+
 void InitMakePairTest(Context const* ctx, MetaInfo* out_info, HostDeviceVector<float>* out_predt) {
   out_predt->SetDevice(ctx->gpu_id);
   MetaInfo& info = *out_info;
@@ -103,4 +223,125 @@ TEST(LambdaRank, MakePair) {
     ASSERT_EQ(n_pairs, info.num_row_ * param.NumPair());
   }
 }
+
+void TestMAPStat(Context const* ctx) {
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  ltr::LambdaRankParam param;
+  param.UpdateAllowUnknown(Args{});
+  // Case 1: six labels; predictions are strictly decreasing (5, 4, ..., 0).
+  {
+    std::vector<float> h_data{1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f};
+    info.labels.Reshape(h_data.size(), 1);
+    info.labels.Data()->HostVector() = h_data;
+    info.num_row_ = h_data.size();
+
+    HostDeviceVector<float> predt;
+    auto& h_predt = predt.HostVector();
+    h_predt.resize(h_data.size());
+    std::iota(h_predt.rbegin(), h_predt.rend(), 0.0f);
+
+    auto p_cache = std::make_shared<ltr::MAPCache>(ctx, info, param);
+
+    predt.SetDevice(ctx->gpu_id);
+    auto rank_idx =
+        p_cache->SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
+    // Dispatch to the CPU or CUDA implementation depending on the context.
+    if (ctx->IsCPU()) {
+      obj::cpu_impl::MAPStat(ctx, info.labels.HostView().Slice(linalg::All(), 0), rank_idx,
+                             p_cache);
+    } else {
+      obj::cuda_impl::MAPStat(ctx, info, rank_idx, p_cache);
+    }
+    // Query the cached statistics through a separate default-constructed Context.
+    Context cpu_ctx;
+    auto n_rel = p_cache->NumRelevant(&cpu_ctx);
+    auto acc = p_cache->Acc(&cpu_ctx);
+
+    ASSERT_EQ(n_rel[0], 1.0);
+    ASSERT_EQ(acc[0], 1.0);
+
+    ASSERT_EQ(n_rel.back(), h_data.size() - 1.0);
+    ASSERT_NEAR(acc.back(), 1.95 + (1.0 / h_data.size()), kRtEps);
+  }
+  {
+    info.labels.Reshape(16);
+    auto& h_label = info.labels.Data()->HostVector();
+    info.group_ptr_ = {0, 8, 16};
+    info.num_row_ = info.labels.Shape(0);
+    // Case 2: two groups of eight; first group labelled all 1, second group all 0.
+    std::fill_n(h_label.begin(), 8, 1.0f);
+    std::fill_n(h_label.begin() + 8, 8, 0.0f);
+    HostDeviceVector<float> predt;
+    auto& h_predt = predt.HostVector();
+    h_predt.resize(h_label.size());
+    std::iota(h_predt.rbegin(), h_predt.rbegin() + 8, 0.0f);
+    std::iota(h_predt.rbegin() + 8, h_predt.rend(), 0.0f);
+
+    auto p_cache = std::make_shared<ltr::MAPCache>(ctx, info, param);
+
+    predt.SetDevice(ctx->gpu_id);
+    auto rank_idx =
+        p_cache->SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
+    // Dispatch to the CPU or CUDA implementation depending on the context.
+    if (ctx->IsCPU()) {
+      obj::cpu_impl::MAPStat(ctx, info.labels.HostView().Slice(linalg::All(), 0), rank_idx,
+                             p_cache);
+    } else {
+      obj::cuda_impl::MAPStat(ctx, info, rank_idx, p_cache);
+    }
+
+    Context cpu_ctx;
+    auto n_rel = p_cache->NumRelevant(&cpu_ctx);
+    ASSERT_EQ(n_rel[7], 8);      // first group
+    ASSERT_EQ(n_rel.back(), 0);  // second group
+  }
+}
+
+TEST(LambdaRank, MAPStat) {
+  Context ctx;
+  TestMAPStat(&ctx);
+}
+
+void TestMAPGPair(Context const* ctx) {
+  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:map", ctx)};
+  Args args;
+  obj->Configure(args);
+  // The default-configured objective must survive a config save/reload round trip.
+  CheckConfigReload(obj, "rank:map");
+  // Both query groups weighted 2: expected values are identical for the two groups.
+  CheckRankingObjFunction(obj,                                                 // obj
+                          {0, 0.1f, 0, 0.1f},                                  // score
+                          {0, 1, 0, 1},                                        // label
+                          {2.0f, 2.0f},                                        // weight
+                          {0, 2, 4},                                           // group
+                          {1.2054923f, -1.2054923f, 1.2054923f, -1.2054923f},  // out grad
+                          {1.2657166f, 1.2657166f, 1.2657166f, 1.2657166f});   // out hess (assumed)
+  // disable the second query group with 0 weight
+  CheckRankingObjFunction(obj,                                  // obj
+                          {0, 0.1f, 0, 0.1f},                   // score
+                          {0, 1, 0, 1},                         // label
+                          {2.0f, 0.0f},                         // weight
+                          {0, 2, 4},                            // group
+                          {1.2054923f, -1.2054923f, .0f, .0f},  // out grad
+                          {1.2657166f, 1.2657166f, .0f, .0f});  // out hess (assumed)
+}
+
+TEST(LambdaRank, MAPGPair) {
+  Context ctx;
+  TestMAPGPair(&ctx);
+}
+
+void TestPairWiseGPair(Context const* ctx) {
+  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:pairwise", ctx)};
+  Args args;
+  obj->Configure(args);
+  // NOTE(review): `args` is changed after Configure() and never read again; no assertions.
+  args.emplace_back("lambdarank_unbiased", "true");
+}
+
+TEST(LambdaRank, Pairwise) {
+  Context ctx;
+  TestPairWiseGPair(&ctx);
+}
 }  // namespace xgboost::obj
diff --git a/tests/cpp/objective/test_lambdarank_obj.cu b/tests/cpp/objective/test_lambdarank_obj.cu
index 03ccdef8b..d0f448993 100644
--- a/tests/cpp/objective/test_lambdarank_obj.cu
+++ b/tests/cpp/objective/test_lambdarank_obj.cu
@@ -12,6 +12,24 @@
 #include "test_lambdarank_obj.h"
 
 namespace xgboost::obj {
+TEST(LambdaRank, GPUNDCGJsonIO) {
+  Context ctx;
+  ctx.gpu_id = 0;
+  TestNDCGJsonIO(&ctx);
+}
+
+TEST(LambdaRank, GPUMAPStat) {
+  Context ctx;
+  ctx.gpu_id = 0;
+  TestMAPStat(&ctx);
+}
+
+TEST(LambdaRank, GPUNDCGGPair) {
+  Context ctx;
+  ctx.gpu_id = 0;
+  TestNDCGGPair(&ctx);
+}
+
 void TestGPUMakePair() {
   Context ctx;
   ctx.gpu_id = 0;
@@ -107,6 +125,12 @@ void TestGPUMakePair() {
 
 TEST(LambdaRank, GPUMakePair) { TestGPUMakePair(); }
 
+TEST(LambdaRank, GPUUnbiasedNDCG) {
+  Context ctx;
+  ctx.gpu_id = 0;
+  TestUnbiasedNDCG(&ctx);
+}
+
 template <typename CountFunctor>
 void RankItemCountImpl(std::vector<std::uint32_t> const &sorted_items, CountFunctor f,
                        std::uint32_t find_val, std::uint32_t exp_val) {
@@ -135,4 +159,10 @@ TEST(LambdaRank, RankItemCountOnRight) {
   RankItemCountImpl(sorted_items, wrapper, 1, static_cast<uint32_t>(1));
   RankItemCountImpl(sorted_items, wrapper, 0, static_cast<uint32_t>(0));
 }
+
+TEST(LambdaRank, GPUMAPGPair) {
+  Context ctx;
+  ctx.gpu_id = 0;
+  TestMAPGPair(&ctx);
+}
 }  // namespace xgboost::obj
diff --git a/tests/cpp/objective/test_lambdarank_obj.h b/tests/cpp/objective/test_lambdarank_obj.h
index 8dd238d2b..9539f1a30 100644
--- a/tests/cpp/objective/test_lambdarank_obj.h
+++ b/tests/cpp/objective/test_lambdarank_obj.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2023, XGBoost Contributors
+ * Copyright (c) 2023, XGBoost Contributors
  */
 #ifndef XGBOOST_OBJECTIVE_TEST_LAMBDARANK_OBJ_H_
 #define XGBOOST_OBJECTIVE_TEST_LAMBDARANK_OBJ_H_
@@ -18,6 +18,29 @@
 #include "../helpers.h"                             // for EmptyDMatrix
 
 namespace xgboost::obj {
+void TestMAPStat(Context const* ctx);
+
+inline void TestNDCGJsonIO(Context const* ctx) {
+  std::unique_ptr<xgboost::ObjFunction> obj{ObjFunction::Create("rank:ndcg", ctx)};
+  // Configure with no arguments, save to JSON, then check the name and two saved parameters.
+  obj->Configure(Args{});
+  Json j_obj{Object()};
+  obj->SaveConfig(&j_obj);
+
+  ASSERT_EQ(get<String>(j_obj["name"]), "rank:ndcg");
+  auto const& j_param = j_obj["lambdarank_param"];
+
+  ASSERT_EQ(get<String>(j_param["ndcg_exp_gain"]), "1");  // saved as the string "1"
+  ASSERT_EQ(get<String>(j_param["lambdarank_num_pair_per_sample"]),
+            std::to_string(ltr::LambdaRankParam::NotSet()));
+}
+
+void TestNDCGGPair(Context const* ctx);
+
+void TestUnbiasedNDCG(Context const* ctx);
+
+void TestMAPGPair(Context const* ctx);
+
 /**
  * \brief Initialize test data for make pair tests.
  */
diff --git a/tests/cpp/objective/test_ranking_obj.cc b/tests/cpp/objective/test_ranking_obj.cc
deleted file mode 100644
index a007750e3..000000000
--- a/tests/cpp/objective/test_ranking_obj.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright by Contributors
-#include <xgboost/context.h>
-#include <xgboost/json.h>
-#include <xgboost/objective.h>
-
-#include "../helpers.h"
-
-namespace xgboost {
-
-TEST(Objective, DeclareUnifiedTest(PairwiseRankingGPair)) {
-  std::vector<std::pair<std::string, std::string>> args;
-  xgboost::Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-
-  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:pairwise", &ctx)};
-  obj->Configure(args);
-  CheckConfigReload(obj, "rank:pairwise");
-
-  // Test with setting sample weight to second query group
-  CheckRankingObjFunction(obj,
-                          {0, 0.1f, 0, 0.1f},
-                          {0,   1, 0, 1},
-                          {2.0f, 0.0f},
-                          {0, 2, 4},
-                          {1.9f, -1.9f, 0.0f, 0.0f},
-                          {1.995f, 1.995f, 0.0f, 0.0f});
-
-  CheckRankingObjFunction(obj,
-                          {0, 0.1f, 0, 0.1f},
-                          {0,   1, 0, 1},
-                          {1.0f, 1.0f},
-                          {0, 2, 4},
-                          {0.95f, -0.95f,  0.95f, -0.95f},
-                          {0.9975f, 0.9975f, 0.9975f, 0.9975f});
-
-  ASSERT_NO_THROW(obj->DefaultEvalMetric());
-}
-
-TEST(Objective, DeclareUnifiedTest(NDCG_JsonIO)) {
-  xgboost::Context ctx;
-  ctx.UpdateAllowUnknown(Args{});
-
-  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:ndcg", &ctx)};
-
-  obj->Configure(Args{});
-  Json j_obj {Object()};
-  obj->SaveConfig(&j_obj);
-
-  ASSERT_EQ(get<String>(j_obj["name"]), "rank:ndcg");;
-
-  auto const& j_param = j_obj["lambda_rank_param"];
-
-  ASSERT_EQ(get<String>(j_param["num_pairsample"]), "1");
-  ASSERT_EQ(get<String>(j_param["fix_list_weight"]), "0");
-}
-
-TEST(Objective, DeclareUnifiedTest(PairwiseRankingGPairSameLabels)) {
-  std::vector<std::pair<std::string, std::string>> args;
-  xgboost::Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-
-  std::unique_ptr<ObjFunction> obj{ObjFunction::Create("rank:pairwise", &ctx)};
-  obj->Configure(args);
-  // No computation of gradient/hessian, as there is no diversity in labels
-  CheckRankingObjFunction(obj,
-                          {0, 0.1f, 0, 0.1f},
-                          {1,   1, 1, 1},
-                          {2.0f, 0.0f},
-                          {0, 2, 4},
-                          {0.0f, 0.0f, 0.0f, 0.0f},
-                          {0.0f, 0.0f, 0.0f, 0.0f});
-
-  ASSERT_NO_THROW(obj->DefaultEvalMetric());
-}
-
-TEST(Objective, DeclareUnifiedTest(NDCGRankingGPair)) {
-  std::vector<std::pair<std::string, std::string>> args;
-  xgboost::Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-
-  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:ndcg", &ctx)};
-  obj->Configure(args);
-  CheckConfigReload(obj, "rank:ndcg");
-
-  // Test with setting sample weight to second query group
-  CheckRankingObjFunction(obj,
-                          {0, 0.1f, 0, 0.1f},
-                          {0,   1, 0, 1},
-                          {2.0f, 0.0f},
-                          {0, 2, 4},
-                          {0.7f, -0.7f, 0.0f, 0.0f},
-                          {0.74f, 0.74f, 0.0f, 0.0f});
-
-  CheckRankingObjFunction(obj,
-                          {0, 0.1f, 0, 0.1f},
-                          {0,   1, 0, 1},
-                          {1.0f, 1.0f},
-                          {0, 2, 4},
-                          {0.35f, -0.35f,  0.35f, -0.35f},
-                          {0.368f, 0.368f, 0.368f, 0.368f});
-  ASSERT_NO_THROW(obj->DefaultEvalMetric());
-}
-
-TEST(Objective, DeclareUnifiedTest(MAPRankingGPair)) {
-  std::vector<std::pair<std::string, std::string>> args;
-  xgboost::Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-
-  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:map", &ctx)};
-  obj->Configure(args);
-  CheckConfigReload(obj, "rank:map");
-
-  // Test with setting sample weight to second query group
-  CheckRankingObjFunction(obj,
-                          {0, 0.1f, 0, 0.1f},
-                          {0,   1, 0, 1},
-                          {2.0f, 0.0f},
-                          {0, 2, 4},
-                          {0.95f, -0.95f,  0.0f, 0.0f},
-                          {0.9975f, 0.9975f, 0.0f, 0.0f});
-
-  CheckRankingObjFunction(obj,
-                          {0, 0.1f, 0, 0.1f},
-                          {0,   1, 0, 1},
-                          {1.0f, 1.0f},
-                          {0, 2, 4},
-                          {0.475f, -0.475f,  0.475f, -0.475f},
-                          {0.4988f, 0.4988f, 0.4988f, 0.4988f});
-  ASSERT_NO_THROW(obj->DefaultEvalMetric());
-}
-
-}  // namespace xgboost
diff --git a/tests/cpp/objective/test_ranking_obj_gpu.cu b/tests/cpp/objective/test_ranking_obj_gpu.cu
deleted file mode 100644
index 540560c1f..000000000
--- a/tests/cpp/objective/test_ranking_obj_gpu.cu
+++ /dev/null
@@ -1,231 +0,0 @@
-/*!
- * Copyright 2019-2021 by XGBoost Contributors
- */
-#include <thrust/host_vector.h>
-
-#include "test_ranking_obj.cc"
-#include "../../../src/objective/rank_obj.cu"
-
-namespace xgboost {
-
-template <typename T = uint32_t, typename Comparator = thrust::greater<T>>
-std::unique_ptr<dh::SegmentSorter<T>>
-RankSegmentSorterTestImpl(const std::vector<uint32_t> &group_indices,
-                          const std::vector<T> &hlabels,
-                          const std::vector<T> &expected_sorted_hlabels,
-                          const std::vector<uint32_t> &expected_orig_pos
-                          ) {
-  std::unique_ptr<dh::SegmentSorter<T>> seg_sorter_ptr(new dh::SegmentSorter<T>);
-  dh::SegmentSorter<T> &seg_sorter(*seg_sorter_ptr);
-
-  // Create a bunch of unsorted labels on the device and sort it via the segment sorter
-  dh::device_vector<T> dlabels(hlabels);
-  seg_sorter.SortItems(dlabels.data().get(), dlabels.size(), group_indices, Comparator());
-
-  auto num_items = seg_sorter.GetItemsSpan().size();
-  EXPECT_EQ(num_items, group_indices.back());
-  EXPECT_EQ(seg_sorter.GetNumGroups(), group_indices.size() - 1);
-
-  // Check the labels
-  dh::device_vector<T> sorted_dlabels(num_items);
-  sorted_dlabels.assign(dh::tcbegin(seg_sorter.GetItemsSpan()),
-                        dh::tcend(seg_sorter.GetItemsSpan()));
-  thrust::host_vector<T> sorted_hlabels(sorted_dlabels);
-  EXPECT_EQ(expected_sorted_hlabels, sorted_hlabels);
-
-  // Check the indices
-  dh::device_vector<uint32_t> dorig_pos(num_items);
-  dorig_pos.assign(dh::tcbegin(seg_sorter.GetOriginalPositionsSpan()),
-                   dh::tcend(seg_sorter.GetOriginalPositionsSpan()));
-  dh::device_vector<uint32_t> horig_pos(dorig_pos);
-  EXPECT_EQ(expected_orig_pos, horig_pos);
-
-  return seg_sorter_ptr;
-}
-
-TEST(Objective, RankSegmentSorterTest) {
-  RankSegmentSorterTestImpl({0, 2, 4, 7, 10, 14, 18, 22, 26},  // Groups
-                            {1, 1,                             // Labels
-                             1, 2,
-                             3, 2, 1,
-                             1, 2, 1,
-                             1, 3, 4, 2,
-                             1, 2, 1, 1,
-                             1, 2, 2, 3,
-                             3, 3, 1, 2},
-                            {1, 1,                             // Expected sorted labels
-                             2, 1,
-                             3, 2, 1,
-                             2, 1, 1,
-                             4, 3, 2, 1,
-                             2, 1, 1, 1,
-                             3, 2, 2, 1,
-                             3, 3, 2, 1},
-                            {0, 1,                             // Expected original positions
-                             3, 2,
-                             4, 5, 6,
-                             8, 7, 9,
-                             12, 11, 13, 10,
-                             15, 14, 16, 17,
-                             21, 19, 20, 18,
-                             22, 23, 25, 24});
-}
-
-TEST(Objective, RankSegmentSorterSingleGroupTest) {
-  RankSegmentSorterTestImpl({0, 7},                  // Groups
-                            {6, 1, 4, 3, 0, 5, 2},   // Labels
-                            {6, 5, 4, 3, 2, 1, 0},   // Expected sorted labels
-                            {0, 5, 2, 3, 6, 1, 4});  // Expected original positions
-}
-
-TEST(Objective, RankSegmentSorterAscendingTest) {
-  RankSegmentSorterTestImpl<uint32_t, thrust::less<uint32_t>>(
-                                                    {0, 4, 7},    // Groups
-                                                    {3, 1, 4, 2,  // Labels
-                                                     6, 5, 7},
-                                                    {1, 2, 3, 4,  // Expected sorted labels
-                                                     5, 6, 7},
-                                                    {1, 3, 0, 2,  // Expected original positions
-                                                     5, 4, 6});
-}
-
-TEST(Objective, NDCGLambdaWeightComputerTest) {
-  std::vector<float> hlabels = {3.1f, 1.2f, 2.3f, 4.4f,        // Labels
-                                7.8f, 5.01f, 6.96f,
-                                10.3f, 8.7f, 11.4f, 9.45f, 11.4f};
-  dh::device_vector<bst_float> dlabels(hlabels);
-
-  auto segment_label_sorter = RankSegmentSorterTestImpl<float>(
-    {0, 4, 7, 12},                  // Groups
-    hlabels,
-    {4.4f, 3.1f, 2.3f, 1.2f,        // Expected sorted labels
-     7.8f, 6.96f, 5.01f,
-     11.4f, 11.4f, 10.3f, 9.45f, 8.7f},
-    {3, 0, 2, 1,                    // Expected original positions
-     4, 6, 5,
-     9, 11, 7, 10, 8});
-
-  // Created segmented predictions for the labels from above
-  std::vector<bst_float> hpreds{-9.78f, 24.367f, 0.908f, -11.47f,
-                                -1.03f, -2.79f, -3.1f,
-                                104.22f, 103.1f, -101.7f, 100.5f, 45.1f};
-  dh::device_vector<bst_float> dpreds(hpreds);
-
-  xgboost::obj::NDCGLambdaWeightComputer ndcg_lw_computer(dpreds.data().get(),
-                                                          dlabels.data().get(),
-                                                          *segment_label_sorter);
-
-  // Where will the predictions move from its current position, if they were sorted
-  // descendingly?
-  auto dsorted_pred_pos = ndcg_lw_computer.GetPredictionSorter().GetIndexableSortedPositionsSpan();
-  std::vector<uint32_t> hsorted_pred_pos(segment_label_sorter->GetNumItems());
-  dh::CopyDeviceSpanToVector(&hsorted_pred_pos, dsorted_pred_pos);
-  std::vector<uint32_t> expected_sorted_pred_pos{2, 0, 1, 3,
-                                                 4, 5, 6,
-                                                 7, 8, 11, 9, 10};
-  EXPECT_EQ(expected_sorted_pred_pos, hsorted_pred_pos);
-
-  // Check group DCG values
-  std::vector<float> hgroup_dcgs(segment_label_sorter->GetNumGroups());
-  dh::CopyDeviceSpanToVector(&hgroup_dcgs, ndcg_lw_computer.GetGroupDcgsSpan());
-  std::vector<uint32_t> hgroups(segment_label_sorter->GetNumGroups() + 1);
-  dh::CopyDeviceSpanToVector(&hgroups, segment_label_sorter->GetGroupsSpan());
-  EXPECT_EQ(hgroup_dcgs.size(), segment_label_sorter->GetNumGroups());
-  std::vector<float> hsorted_labels(segment_label_sorter->GetNumItems());
-  dh::CopyDeviceSpanToVector(&hsorted_labels, segment_label_sorter->GetItemsSpan());
-  for (size_t i = 0; i < hgroup_dcgs.size(); ++i) {
-    // Compute group DCG value on CPU and compare
-    auto gbegin = hgroups[i];
-    auto gend = hgroups[i + 1];
-    EXPECT_NEAR(
-      hgroup_dcgs[i],
-      xgboost::obj::NDCGLambdaWeightComputer::ComputeGroupDCGWeight(&hsorted_labels[gbegin],
-                                                                    gend - gbegin),
-      0.01f);
-  }
-}
-
-TEST(Objective, IndexableSortedItemsTest) {
-  std::vector<float> hlabels = {3.1f, 1.2f, 2.3f, 4.4f,        // Labels
-                                7.8f, 5.01f, 6.96f,
-                                10.3f, 8.7f, 11.4f, 9.45f, 11.4f};
-  dh::device_vector<bst_float> dlabels(hlabels);
-
-  auto segment_label_sorter = RankSegmentSorterTestImpl<float>(
-    {0, 4, 7, 12},                  // Groups
-    hlabels,
-    {4.4f, 3.1f, 2.3f, 1.2f,        // Expected sorted labels
-     7.8f, 6.96f, 5.01f,
-     11.4f, 11.4f, 10.3f, 9.45f, 8.7f},
-    {3, 0, 2, 1,                    // Expected original positions
-     4, 6, 5,
-     9, 11, 7, 10, 8});
-
-  segment_label_sorter->CreateIndexableSortedPositions();
-  std::vector<uint32_t> sorted_indices(segment_label_sorter->GetNumItems());
-  dh::CopyDeviceSpanToVector(&sorted_indices,
-                             segment_label_sorter->GetIndexableSortedPositionsSpan());
-  std::vector<uint32_t> expected_sorted_indices = {
-    1, 3, 2, 0,
-    4, 6, 5,
-    9, 11, 7, 10, 8};
-  EXPECT_EQ(expected_sorted_indices, sorted_indices);
-}
-
-TEST(Objective, ComputeAndCompareMAPStatsTest) {
-  std::vector<float> hlabels = {3.1f, 0.0f, 2.3f, 4.4f,        // Labels
-                                0.0f, 5.01f, 0.0f,
-                                10.3f, 0.0f, 11.4f, 9.45f, 11.4f};
-  dh::device_vector<bst_float> dlabels(hlabels);
-
-  auto segment_label_sorter = RankSegmentSorterTestImpl<float>(
-    {0, 4, 7, 12},                  // Groups
-    hlabels,
-    {4.4f, 3.1f, 2.3f, 0.0f,        // Expected sorted labels
-     5.01f, 0.0f, 0.0f,
-     11.4f, 11.4f, 10.3f, 9.45f, 0.0f},
-    {3, 0, 2, 1,                    // Expected original positions
-     5, 4, 6,
-     9, 11, 7, 10, 8});
-
-  // Create MAP stats on the device first using the objective
-  std::vector<bst_float> hpreds{-9.78f, 24.367f, 0.908f, -11.47f,
-                                -1.03f, -2.79f, -3.1f,
-                                104.22f, 103.1f, -101.7f, 100.5f, 45.1f};
-  dh::device_vector<bst_float> dpreds(hpreds);
-
-  xgboost::obj::MAPLambdaWeightComputer map_lw_computer(dpreds.data().get(),
-                                                        dlabels.data().get(),
-                                                        *segment_label_sorter);
-
-  // Get the device MAP stats on host
-  std::vector<xgboost::obj::MAPLambdaWeightComputer::MAPStats> dmap_stats(
-    segment_label_sorter->GetNumItems());
-  dh::CopyDeviceSpanToVector(&dmap_stats, map_lw_computer.GetMapStatsSpan());
-
-  // Compute the MAP stats on host next to compare
-  std::vector<uint32_t> hgroups(segment_label_sorter->GetNumGroups() + 1);
-  dh::CopyDeviceSpanToVector(&hgroups, segment_label_sorter->GetGroupsSpan());
-
-  for (size_t i = 0; i < hgroups.size() - 1; ++i) {
-    auto gbegin = hgroups[i];
-    auto gend = hgroups[i + 1];
-    std::vector<xgboost::obj::ListEntry> lst_entry;
-    for (auto j = gbegin; j < gend; ++j) {
-      lst_entry.emplace_back(hpreds[j], hlabels[j], j);
-    }
-    std::stable_sort(lst_entry.begin(), lst_entry.end(), xgboost::obj::ListEntry::CmpPred);
-
-    // Compute the MAP stats with this list and compare with the ones computed on the device
-    std::vector<xgboost::obj::MAPLambdaWeightComputer::MAPStats> hmap_stats;
-    xgboost::obj::MAPLambdaWeightComputer::GetMAPStats(lst_entry, &hmap_stats);
-    for (auto j = gbegin; j < gend; ++j) {
-      EXPECT_EQ(dmap_stats[j].hits, hmap_stats[j - gbegin].hits);
-      EXPECT_NEAR(dmap_stats[j].ap_acc, hmap_stats[j - gbegin].ap_acc, 0.01f);
-      EXPECT_NEAR(dmap_stats[j].ap_acc_miss, hmap_stats[j - gbegin].ap_acc_miss, 0.01f);
-      EXPECT_NEAR(dmap_stats[j].ap_acc_add, hmap_stats[j - gbegin].ap_acc_add, 0.01f);
-    }
-  }
-}
-
-}  // namespace xgboost
diff --git a/tests/cpp/plugin/helpers.h b/tests/cpp/plugin/helpers.h
index 10ba68b49..0dbdeeca4 100644
--- a/tests/cpp/plugin/helpers.h
+++ b/tests/cpp/plugin/helpers.h
@@ -13,25 +13,6 @@
 #include "../../../plugin/federated/federated_server.h"
 #include "../../../src/collective/communicator-inl.h"
 
-inline int GenerateRandomPort(int low, int high) {
-  using namespace std::chrono_literals;
-  // Ensure unique timestamp by introducing a small artificial delay
-  std::this_thread::sleep_for(100ms);
-  auto timestamp = static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::milliseconds>(
-                                             std::chrono::system_clock::now().time_since_epoch())
-                                             .count());
-  std::mt19937_64 rng(timestamp);
-  std::uniform_int_distribution<int> dist(low, high);
-  int port = dist(rng);
-  return port;
-}
-
-inline std::string GetServerAddress() {
-  int port = GenerateRandomPort(50000, 60000);
-  std::string address = std::string("localhost:") + std::to_string(port);
-  return address;
-}
-
 namespace xgboost {
 
 class ServerForTest {
@@ -41,13 +22,14 @@ class ServerForTest {
 
  public:
   explicit ServerForTest(std::int32_t world_size) {
-    server_address_ = GetServerAddress();
     server_thread_.reset(new std::thread([this, world_size] {
       grpc::ServerBuilder builder;
       xgboost::federated::FederatedService service{world_size};
-      builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials());
+      int selected_port;
+      builder.AddListeningPort("localhost:0", grpc::InsecureServerCredentials(), &selected_port);
       builder.RegisterService(&service);
       server_ = builder.BuildAndStart();
+      server_address_ = std::string("localhost:") + std::to_string(selected_port);
       server_->Wait();
     }));
   }
@@ -56,7 +38,14 @@ class ServerForTest {
     server_->Shutdown();
     server_thread_->join();
   }
-  auto Address() const { return server_address_; }
+
+  auto Address() const {
+    using namespace std::chrono_literals;
+    while (server_address_.empty()) {
+      std::this_thread::sleep_for(100ms);
+    }
+    return server_address_;
+  }
 };
 
 class BaseFederatedTest : public ::testing::Test {
@@ -65,7 +54,7 @@ class BaseFederatedTest : public ::testing::Test {
 
   void TearDown() override { server_.reset(nullptr); }
 
-  static int const kWorldSize{3};
+  static int constexpr kWorldSize{3};
   std::unique_ptr<ServerForTest> server_;
 };
 
diff --git a/tests/cpp/plugin/test_federated_communicator.cc b/tests/cpp/plugin/test_federated_communicator.cc
index 340849606..62f33d5ee 100644
--- a/tests/cpp/plugin/test_federated_communicator.cc
+++ b/tests/cpp/plugin/test_federated_communicator.cc
@@ -62,34 +62,24 @@ class FederatedCommunicatorTest : public BaseFederatedTest {
 };
 
 TEST(FederatedCommunicatorSimpleTest, ThrowOnWorldSizeTooSmall) {
-  std::string server_address{GetServerAddress()};
-  auto construct = [server_address]() {
-    FederatedCommunicator comm{0, 0, server_address, "", "", ""};
-  };
+  auto construct = [] { FederatedCommunicator comm{0, 0, "localhost:0", "", "", ""}; };
   EXPECT_THROW(construct(), dmlc::Error);
 }
 
 TEST(FederatedCommunicatorSimpleTest, ThrowOnRankTooSmall) {
-  std::string server_address{GetServerAddress()};
-  auto construct = [server_address]() {
-    FederatedCommunicator comm{1, -1, server_address, "", "", ""};
-  };
+  auto construct = [] { FederatedCommunicator comm{1, -1, "localhost:0", "", "", ""}; };
   EXPECT_THROW(construct(), dmlc::Error);
 }
 
 TEST(FederatedCommunicatorSimpleTest, ThrowOnRankTooBig) {
-  std::string server_address{GetServerAddress()};
-  auto construct = [server_address]() {
-    FederatedCommunicator comm{1, 1, server_address, "", "", ""};
-  };
+  auto construct = [] { FederatedCommunicator comm{1, 1, "localhost:0", "", "", ""}; };
   EXPECT_THROW(construct(), dmlc::Error);
 }
 
 TEST(FederatedCommunicatorSimpleTest, ThrowOnWorldSizeNotInteger) {
-  std::string server_address{GetServerAddress()};
-  auto construct = [server_address]() {
+  auto construct = [] {
     Json config{JsonObject()};
-    config["federated_server_address"] = server_address;
+    config["federated_server_address"] = std::string("localhost:0");
     config["federated_world_size"] = std::string("1");
     config["federated_rank"] = Integer(0);
     FederatedCommunicator::Create(config);
@@ -98,10 +88,9 @@ TEST(FederatedCommunicatorSimpleTest, ThrowOnWorldSizeNotInteger) {
 }
 
 TEST(FederatedCommunicatorSimpleTest, ThrowOnRankNotInteger) {
-  std::string server_address{GetServerAddress()};
-  auto construct = [server_address]() {
+  auto construct = [] {
     Json config{JsonObject()};
-    config["federated_server_address"] = server_address;
+    config["federated_server_address"] = std::string("localhost:0");
     config["federated_world_size"] = 1;
     config["federated_rank"] = std::string("0");
     FederatedCommunicator::Create(config);
@@ -110,15 +99,13 @@ TEST(FederatedCommunicatorSimpleTest, ThrowOnRankNotInteger) {
 }
 
 TEST(FederatedCommunicatorSimpleTest, GetWorldSizeAndRank) {
-  std::string server_address{GetServerAddress()};
-  FederatedCommunicator comm{6, 3, server_address};
+  FederatedCommunicator comm{6, 3, "localhost:0"};
   EXPECT_EQ(comm.GetWorldSize(), 6);
   EXPECT_EQ(comm.GetRank(), 3);
 }
 
 TEST(FederatedCommunicatorSimpleTest, IsDistributed) {
-  std::string server_address{GetServerAddress()};
-  FederatedCommunicator comm{2, 1, server_address};
+  FederatedCommunicator comm{2, 1, "localhost:0"};
   EXPECT_TRUE(comm.IsDistributed());
 }
 
diff --git a/tests/cpp/plugin/test_federated_learner.cc b/tests/cpp/plugin/test_federated_learner.cc
index 85d0a2b7d..b7066b6a0 100644
--- a/tests/cpp/plugin/test_federated_learner.cc
+++ b/tests/cpp/plugin/test_federated_learner.cc
@@ -70,7 +70,7 @@ void VerifyObjective(size_t rows, size_t cols, float expected_base_score, Json e
 
 class FederatedLearnerTest : public ::testing::TestWithParam<std::string> {
   std::unique_ptr<ServerForTest> server_;
-  static int const kWorldSize{3};
+  static int constexpr kWorldSize{3};
 
  protected:
   void SetUp() override { server_ = std::make_unique<ServerForTest>(kWorldSize); }
diff --git a/tests/cpp/plugin/test_federated_metrics.cc b/tests/cpp/plugin/test_federated_metrics.cc
new file mode 100644
index 000000000..1bdda567f
--- /dev/null
+++ b/tests/cpp/plugin/test_federated_metrics.cc
@@ -0,0 +1,243 @@
+/*!
+ * Copyright 2023 XGBoost contributors
+ */
+#include <gtest/gtest.h>
+
+#include "../metric/test_auc.h"
+#include "../metric/test_elementwise_metric.h"
+#include "../metric/test_multiclass_metric.h"
+#include "../metric/test_rank_metric.h"
+#include "../metric/test_survival_metric.h"
+#include "helpers.h"
+
+namespace {
+class FederatedMetricTest : public xgboost::BaseFederatedTest {};
+}  // anonymous namespace
+
+namespace xgboost {
+namespace metric {
+TEST_F(FederatedMetricTest, BinaryAUCRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyBinaryAUC,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, BinaryAUCColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyBinaryAUC,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MultiClassAUCRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassAUC,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MultiClassAUCColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassAUC,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, RankingAUCRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRankingAUC,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, RankingAUCColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRankingAUC,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, PRAUCRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyPRAUC, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, PRAUCColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyPRAUC, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MultiClassPRAUCRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassPRAUC,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MultiClassPRAUCColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassPRAUC,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, RankingPRAUCRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRankingPRAUC,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, RankingPRAUCColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRankingPRAUC,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, RMSERowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRMSE, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, RMSEColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRMSE, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, RMSLERowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRMSLE, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, RMSLEColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRMSLE, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MAERowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMAE, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MAEColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMAE, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MAPERowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMAPE, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MAPEColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMAPE, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MPHERowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMPHE, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MPHEColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMPHE, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, LogLossRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyLogLoss, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, LogLossColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyLogLoss, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, ErrorRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyError, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, ErrorColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyError, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, PoissonNegLogLikRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyPoissonNegLogLik,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, PoissonNegLogLikColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyPoissonNegLogLik,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MultiRMSERowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiRMSE,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MultiRMSEColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiRMSE,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, QuantileRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyQuantile,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, QuantileColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyQuantile,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MultiClassErrorRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassError,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MultiClassErrorColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassError,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MultiClassLogLossRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassLogLoss,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MultiClassLogLossColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassLogLoss,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, PrecisionRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyPrecision,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, PrecisionColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyPrecision,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, NDCGRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyNDCG, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, NDCGColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyNDCG, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MAPRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMAP, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MAPColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMAP, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, NDCGExpGainRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyNDCGExpGain,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, NDCGExpGainColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyNDCGExpGain,
+                               DataSplitMode::kCol);
+}
+}  // namespace metric
+}  // namespace xgboost
+
+namespace xgboost {
+namespace common {
+TEST_F(FederatedMetricTest, AFTNegLogLikRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyAFTNegLogLik,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, AFTNegLogLikColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyAFTNegLogLik,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, IntervalRegressionAccuracyRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyIntervalRegressionAccuracy,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, IntervalRegressionAccuracyColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyIntervalRegressionAccuracy,
+                               DataSplitMode::kCol);
+}
+}  // namespace common
+}  // namespace xgboost
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 38d4136c9..9d8248dfd 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -126,7 +126,8 @@ TEST(Learner, SLOW_CheckMultiBatch) {  // NOLINT
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/big.libsvm";
   CreateBigTestData(tmp_file, 50000);
-  std::shared_ptr<DMatrix> dmat(xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache"));
+  std::shared_ptr<DMatrix> dmat(
+      xgboost::DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache"));
   EXPECT_FALSE(dmat->SingleColBlock());
   size_t num_row = dmat->Info().num_row_;
   std::vector<bst_float> labels(num_row);
diff --git a/tests/cpp/test_serialization.cc b/tests/cpp/test_serialization.cc
index 2724e58c4..dbe9825b7 100644
--- a/tests/cpp/test_serialization.cc
+++ b/tests/cpp/test_serialization.cc
@@ -203,7 +203,11 @@ void TestLearnerSerialization(Args args, FeatureMap const& fmap, std::shared_ptr
     learner->Save(&mem_out);
     ASSERT_EQ(model_at_kiter, serialised_model_tmp);
 
-    learner->SetParam("gpu_id", "0");
+    for (auto const& [key, value] : args) {
+      if (key == "tree_method" && value == "gpu_hist") {
+        learner->SetParam("gpu_id", "0");
+      }
+    }
     // Pull data to device
     for (auto &batch : p_dmat->GetBatches<SparsePage>()) {
       batch.data.SetDevice(0);
diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
index e211fe70a..95ae02aee 100644
--- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
+++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
@@ -1,12 +1,12 @@
-/*!
- * Copyright 2020-2021 by XGBoost Contributors
+/**
+ * Copyright 2020-2023, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 
 #include "../../../../src/data/ellpack_page.cuh"
 #include "../../../../src/tree/gpu_hist/gradient_based_sampler.cuh"
-#include "../../../../src/tree/param.h"
-#include "../../filesystem.h"  // dmlc::TemporaryDirectory
+#include "../../../../src/tree/param.h"  // TrainParam
+#include "../../filesystem.h"            // dmlc::TemporaryDirectory
 #include "../../helpers.h"
 
 namespace xgboost {
@@ -31,14 +31,15 @@ void VerifySampling(size_t page_size,
   }
   gpair.SetDevice(0);
 
-  BatchParam param{0, 256};
-  auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
+  auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
   if (page_size != 0) {
     EXPECT_NE(page->n_rows, kRows);
   }
 
-  GradientBasedSampler sampler(page, kRows, param, subsample, sampling_method);
-  auto sample = sampler.Sample(gpair.DeviceSpan(), dmat.get());
+  GradientBasedSampler sampler(&ctx, page, kRows, param, subsample, sampling_method);
+  auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get());
 
   if (fixed_size_sampling) {
     EXPECT_EQ(sample.sample_rows, kRows);
@@ -86,12 +87,13 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
   auto gpair = GenerateRandomGradients(kRows);
   gpair.SetDevice(0);
 
-  BatchParam param{0, 256};
-  auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
+  auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
   EXPECT_NE(page->n_rows, kRows);
 
-  GradientBasedSampler sampler(page, kRows, param, kSubsample, TrainParam::kUniform);
-  auto sample = sampler.Sample(gpair.DeviceSpan(), dmat.get());
+  GradientBasedSampler sampler(&ctx, page, kRows, param, kSubsample, TrainParam::kUniform);
+  auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get());
   auto sampled_page = sample.page;
   EXPECT_EQ(sample.sample_rows, kRows);
   EXPECT_EQ(sample.gpair.size(), gpair.Size());
@@ -103,7 +105,7 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
       ci(buffer.data(), sampled_page->NumSymbols());
 
   size_t offset = 0;
-  for (auto& batch : dmat->GetBatches<EllpackPage>(param)) {
+  for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
     auto page = batch.Impl();
     std::vector<common::CompressedByteT> page_buffer(page->gidx_buffer.HostVector());
     common::CompressedIterator<common::CompressedByteT>
diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu
index 6f7700b6a..1f93ddff2 100644
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -1,9 +1,14 @@
+/**
+ * Copyright 2020-2023, XGBoost Contributors
+ */
 #include <gtest/gtest.h>
+
 #include <vector>
 
 #include "../../../../src/common/categorical.h"
 #include "../../../../src/tree/gpu_hist/histogram.cuh"
 #include "../../../../src/tree/gpu_hist/row_partitioner.cuh"
+#include "../../../../src/tree/param.h"  // TrainParam
 #include "../../categorical_helpers.h"
 #include "../../helpers.h"
 
@@ -11,15 +16,15 @@ namespace xgboost {
 namespace tree {
 
 void TestDeterministicHistogram(bool is_dense, int shm_size) {
-  Context ctx = CreateEmptyGenericParam(0);
+  Context ctx = MakeCUDACtx(0);
   size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16;
   float constexpr kLower = -1e-2, kUpper = 1e2;
 
   float sparsity = is_dense ? 0.0f : 0.5f;
   auto matrix = RandomDataGenerator(kRows, kCols, sparsity).GenerateDMatrix();
-  BatchParam batch_param{0, static_cast<int32_t>(kBins)};
+  auto batch_param = BatchParam{kBins, tree::TrainParam::DftSparseThreshold()};
 
-  for (auto const& batch : matrix->GetBatches<EllpackPage>(batch_param)) {
+  for (auto const& batch : matrix->GetBatches<EllpackPage>(&ctx, batch_param)) {
     auto* page = batch.Impl();
 
     tree::RowPartitioner row_partitioner(0, kRows);
@@ -132,13 +137,13 @@ void ValidateCategoricalHistogram(size_t n_categories, common::Span<GradientPair
 
 // Test 1 vs rest categorical histogram is equivalent to one hot encoded data.
 void TestGPUHistogramCategorical(size_t num_categories) {
-  auto ctx = CreateEmptyGenericParam(0);
+  auto ctx = MakeCUDACtx(0);
   size_t constexpr kRows = 340;
   size_t constexpr kBins = 256;
   auto x = GenerateRandomCategoricalSingleColumn(kRows, num_categories);
   auto cat_m = GetDMatrixFromData(x, kRows, 1);
   cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
-  BatchParam batch_param{0, static_cast<int32_t>(kBins)};
+  auto batch_param = BatchParam{kBins, tree::TrainParam::DftSparseThreshold()};
   tree::RowPartitioner row_partitioner(0, kRows);
   auto ridx = row_partitioner.GetRows(0);
   dh::device_vector<GradientPairInt64> cat_hist(num_categories);
@@ -148,7 +153,7 @@ void TestGPUHistogramCategorical(size_t num_categories) {
   /**
    * Generate hist with cat data.
    */
-  for (auto const &batch : cat_m->GetBatches<EllpackPage>(batch_param)) {
+  for (auto const &batch : cat_m->GetBatches<EllpackPage>(&ctx, batch_param)) {
     auto* page = batch.Impl();
     FeatureGroups single_group(page->Cuts());
     BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
@@ -162,7 +167,7 @@ void TestGPUHistogramCategorical(size_t num_categories) {
   auto x_encoded = OneHotEncodeFeature(x, num_categories);
   auto encode_m = GetDMatrixFromData(x_encoded, kRows, num_categories);
   dh::device_vector<GradientPairInt64> encode_hist(2 * num_categories);
-  for (auto const &batch : encode_m->GetBatches<EllpackPage>(batch_param)) {
+  for (auto const &batch : encode_m->GetBatches<EllpackPage>(&ctx, batch_param)) {
     auto* page = batch.Impl();
     FeatureGroups single_group(page->Cuts());
     BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc
index dcd04f68a..c53d9d90b 100644
--- a/tests/cpp/tree/hist/test_evaluate_splits.cc
+++ b/tests/cpp/tree/hist/test_evaluate_splits.cc
@@ -41,7 +41,7 @@ void TestEvaluateSplits(bool force_read_by_column) {
 
   size_t constexpr kMaxBins = 4;
   // dense, no missing values
-  GHistIndexMatrix gmat(dmat.get(), kMaxBins, 0.5, false, AllThreadsForTest());
+  GHistIndexMatrix gmat(&ctx, dmat.get(), kMaxBins, 0.5, false);
   common::RowSetCollection row_set_collection;
   std::vector<size_t> &row_indices = *row_set_collection.Data();
   row_indices.resize(kRows);
@@ -228,7 +228,7 @@ auto CompareOneHotAndPartition(bool onehot) {
   auto evaluator = HistEvaluator<CPUExpandEntry>{&ctx, &param, dmat->Info(), sampler};
   std::vector<CPUExpandEntry> entries(1);
 
-  for (auto const &gmat : dmat->GetBatches<GHistIndexMatrix>({32, param.sparse_threshold})) {
+  for (auto const &gmat : dmat->GetBatches<GHistIndexMatrix>(&ctx, {32, param.sparse_threshold})) {
     common::HistCollection hist;
 
     entries.front().nid = 0;
diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc
index 3b354bebb..2e620fd10 100644
--- a/tests/cpp/tree/hist/test_histogram.cc
+++ b/tests/cpp/tree/hist/test_histogram.cc
@@ -25,6 +25,7 @@ void InitRowPartitionForTest(common::RowSetCollection *row_set, size_t n_samples
 }  // anonymous namespace
 
 void TestAddHistRows(bool is_distributed) {
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   std::vector<CPUExpandEntry> nodes_for_explicit_hist_build_;
   std::vector<CPUExpandEntry> nodes_for_subtraction_trick_;
   int starting_index = std::numeric_limits<int>::max();
@@ -32,9 +33,9 @@ void TestAddHistRows(bool is_distributed) {
 
   size_t constexpr kNRows = 8, kNCols = 16;
   int32_t constexpr kMaxBins = 4;
-  auto p_fmat =
-      RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
-  auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(BatchParam{kMaxBins, 0.5}).begin());
+  auto p_fmat = RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
+  auto const &gmat =
+      *(p_fmat->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{kMaxBins, 0.5}).begin());
 
   RegTree tree;
 
@@ -73,6 +74,7 @@ TEST(CPUHistogram, AddRows) {
 void TestSyncHist(bool is_distributed) {
   size_t constexpr kNRows = 8, kNCols = 16;
   int32_t constexpr kMaxBins = 4;
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
 
   std::vector<CPUExpandEntry> nodes_for_explicit_hist_build_;
   std::vector<CPUExpandEntry> nodes_for_subtraction_trick_;
@@ -80,9 +82,9 @@ void TestSyncHist(bool is_distributed) {
   int sync_count = 0;
   RegTree tree;
 
-  auto p_fmat =
-      RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
-  auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(BatchParam{kMaxBins, 0.5}).begin());
+  auto p_fmat = RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
+  auto const &gmat =
+      *(p_fmat->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{kMaxBins, 0.5}).begin());
 
   HistogramBuilder<CPUExpandEntry> histogram;
   uint32_t total_bins = gmat.cut.Ptrs().back();
@@ -227,12 +229,15 @@ TEST(CPUHistogram, SyncHist) {
 void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_col_split) {
   size_t constexpr kNRows = 8, kNCols = 16;
   int32_t constexpr kMaxBins = 4;
-  auto p_fmat = RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+  auto p_fmat =
+      RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
   if (is_col_split) {
     p_fmat = std::shared_ptr<DMatrix>{
         p_fmat->SliceCol(collective::GetWorldSize(), collective::GetRank())};
   }
-  auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(BatchParam{kMaxBins, 0.5}).begin());
+  auto const &gmat =
+      *(p_fmat->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{kMaxBins, 0.5}).begin());
   uint32_t total_bins = gmat.cut.Ptrs().back();
 
   static double constexpr kEps = 1e-6;
@@ -257,9 +262,9 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_
   CPUExpandEntry node{RegTree::kRoot, tree.GetDepth(0)};
   std::vector<CPUExpandEntry> nodes_for_explicit_hist_build;
   nodes_for_explicit_hist_build.push_back(node);
-  for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>({kMaxBins, 0.5})) {
-    histogram.BuildHist(0, gidx, &tree, row_set_collection,
-                        nodes_for_explicit_hist_build, {}, gpair, force_read_by_column);
+  for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(&ctx, {kMaxBins, 0.5})) {
+    histogram.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
+                        gpair, force_read_by_column);
   }
 
   // Check if number of histogram bins is correct
@@ -325,6 +330,8 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
   auto x = GenerateRandomCategoricalSingleColumn(kRows, n_categories);
   auto cat_m = GetDMatrixFromData(x, kRows, 1);
   cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+
   BatchParam batch_param{0, static_cast<int32_t>(kBins)};
 
   RegTree tree;
@@ -345,12 +352,11 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
    * Generate hist with cat data.
    */
   HistogramBuilder<CPUExpandEntry> cat_hist;
-  for (auto const &gidx : cat_m->GetBatches<GHistIndexMatrix>({kBins, 0.5})) {
+  for (auto const &gidx : cat_m->GetBatches<GHistIndexMatrix>(&ctx, {kBins, 0.5})) {
     auto total_bins = gidx.cut.TotalBins();
     cat_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false, false);
-    cat_hist.BuildHist(0, gidx, &tree, row_set_collection,
-                        nodes_for_explicit_hist_build, {}, gpair.HostVector(),
-                        force_read_by_column);
+    cat_hist.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
+                       gpair.HostVector(), force_read_by_column);
   }
 
   /**
@@ -359,12 +365,11 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
   auto x_encoded = OneHotEncodeFeature(x, n_categories);
   auto encode_m = GetDMatrixFromData(x_encoded, kRows, n_categories);
   HistogramBuilder<CPUExpandEntry> onehot_hist;
-  for (auto const &gidx : encode_m->GetBatches<GHistIndexMatrix>({kBins, 0.5})) {
+  for (auto const &gidx : encode_m->GetBatches<GHistIndexMatrix>(&ctx, {kBins, 0.5})) {
     auto total_bins = gidx.cut.TotalBins();
     onehot_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false, false);
     onehot_hist.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
-                          gpair.HostVector(),
-                          force_read_by_column);
+                          gpair.HostVector(), force_read_by_column);
   }
 
   auto cat = cat_hist.Histogram()[0];
@@ -382,8 +387,8 @@ TEST(CPUHistogram, Categorical) {
   }
 }
 namespace {
-void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool force_read_by_column) {
-  Context ctx;
+void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, bool is_approx,
+                                 bool force_read_by_column) {
   size_t constexpr kEntries = 1 << 16;
   auto m = CreateSparsePageDMatrix(kEntries, "cache");
 
@@ -410,7 +415,7 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool fo
      * Multi page
      */
     std::vector<common::RowSetCollection> rows_set;
-    for (auto const &page : m->GetBatches<GHistIndexMatrix>(batch_param)) {
+    for (auto const &page : m->GetBatches<GHistIndexMatrix>(ctx, batch_param)) {
       CHECK_LT(page.base_rowid, m->Info().num_row_);
       auto n_rows_in_node = page.Size();
       partition_size[0] = std::max(partition_size[0], n_rows_in_node);
@@ -426,12 +431,12 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool fo
         1, [&](size_t nidx_in_set) { return partition_size.at(nidx_in_set); },
         256};
 
-    multi_build.Reset(total_bins, batch_param, ctx.Threads(), rows_set.size(), false, false);
+    multi_build.Reset(total_bins, batch_param, ctx->Threads(), rows_set.size(), false, false);
 
     size_t page_idx{0};
-    for (auto const &page : m->GetBatches<GHistIndexMatrix>(batch_param)) {
-      multi_build.BuildHist(page_idx, space, page, &tree, rows_set.at(page_idx), nodes, {},
-                            h_gpair, force_read_by_column);
+    for (auto const &page : m->GetBatches<GHistIndexMatrix>(ctx, batch_param)) {
+      multi_build.BuildHist(page_idx, space, page, &tree, rows_set.at(page_idx), nodes, {}, h_gpair,
+                            force_read_by_column);
       ++page_idx;
     }
     ASSERT_EQ(page_idx, 2);
@@ -447,16 +452,16 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool fo
     common::RowSetCollection row_set_collection;
     InitRowPartitionForTest(&row_set_collection, n_samples);
 
-    single_build.Reset(total_bins, batch_param, ctx.Threads(), 1, false, false);
+    single_build.Reset(total_bins, batch_param, ctx->Threads(), 1, false, false);
     SparsePage concat;
     std::vector<float> hess(m->Info().num_row_, 1.0f);
     for (auto const& page : m->GetBatches<SparsePage>()) {
       concat.Push(page);
     }
 
-    auto cut = common::SketchOnDMatrix(m.get(), batch_param.max_bin, ctx.Threads(), false, hess);
+    auto cut = common::SketchOnDMatrix(ctx, m.get(), batch_param.max_bin, false, hess);
     GHistIndexMatrix gmat(concat, {}, cut, batch_param.max_bin, false,
-                          std::numeric_limits<double>::quiet_NaN(), ctx.Threads());
+                          std::numeric_limits<double>::quiet_NaN(), ctx->Threads());
     single_build.BuildHist(0, gmat, &tree, row_set_collection, nodes, {}, h_gpair, force_read_by_column);
     single_page = single_build.Histogram()[0];
   }
@@ -470,16 +475,17 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool fo
 
 TEST(CPUHistogram, ExternalMemory) {
   int32_t constexpr kBins = 256;
-  TestHistogramExternalMemory(BatchParam{kBins, common::Span<float>{}, false}, true, false);
-  TestHistogramExternalMemory(BatchParam{kBins, common::Span<float>{}, false}, true, true);
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+
+  TestHistogramExternalMemory(&ctx, BatchParam{kBins, common::Span<float>{}, false}, true, false);
+  TestHistogramExternalMemory(&ctx, BatchParam{kBins, common::Span<float>{}, false}, true, true);
 
   float sparse_thresh{0.5};
-  TestHistogramExternalMemory({kBins, sparse_thresh}, false, false);
-  TestHistogramExternalMemory({kBins, sparse_thresh}, false, true);
+  TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, false);
+  TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, true);
   sparse_thresh = std::numeric_limits<float>::quiet_NaN();
-  TestHistogramExternalMemory({kBins, sparse_thresh}, false, false);
-  TestHistogramExternalMemory({kBins, sparse_thresh}, false, true);
-
+  TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, false);
+  TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, true);
 }
 }  // namespace tree
 }  // namespace xgboost
diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc
index 6f2b83511..38da629b1 100644
--- a/tests/cpp/tree/test_approx.cc
+++ b/tests/cpp/tree/test_approx.cc
@@ -34,7 +34,7 @@ TEST(Approx, Partitioner) {
   std::vector<CPUExpandEntry> candidates{{0, 0}};
   candidates.front().split.loss_chg = 0.4;
 
-  for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({64, hess, true})) {
+  for (auto const& page : Xy->GetBatches<GHistIndexMatrix>(&ctx, {64, hess, true})) {
     bst_feature_t const split_ind = 0;
     {
       auto min_value = page.cut.MinValues()[split_ind];
@@ -84,7 +84,7 @@ void TestColumnSplitPartitioner(size_t n_samples, size_t base_rowid, std::shared
 
   Context ctx;
   ctx.InitAllowUnknown(Args{});
-  for (auto const& page : dmat->GetBatches<GHistIndexMatrix>({64, *hess, true})) {
+  for (auto const& page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, *hess, true})) {
     {
       RegTree tree;
       CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true};
@@ -133,7 +133,7 @@ TEST(Approx, PartitionerColSplit) {
   Context ctx;
   ctx.InitAllowUnknown(Args{});
   CommonRowPartitioner mid_partitioner{&ctx, n_samples, base_rowid, false};
-  for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({64, hess, true})) {
+  for (auto const& page : Xy->GetBatches<GHistIndexMatrix>(&ctx, {64, hess, true})) {
     bst_feature_t const split_ind = 0;
     min_value = page.cut.MinValues()[split_ind];
 
diff --git a/tests/cpp/tree/test_common_partitioner.cc b/tests/cpp/tree/test_common_partitioner.cc
index 7e47ec289..116802c6a 100644
--- a/tests/cpp/tree/test_common_partitioner.cc
+++ b/tests/cpp/tree/test_common_partitioner.cc
@@ -43,7 +43,7 @@ void TestLeafPartition(size_t n_samples) {
 
   std::vector<size_t> h_nptr;
   float split_value{0};
-  for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({Context::kCpuId, 64})) {
+  for (auto const& page : Xy->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{64, 0.2})) {
     bst_feature_t const split_ind = 0;
     auto ptr = page.cut.Ptrs()[split_ind + 1];
     split_value = page.cut.Values().at(ptr / 2);
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index d960a6090..1a32a1ee9 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -218,17 +218,16 @@ TEST(GpuHist, TestHistogramIndex) {
   TestHistogramIndexImpl();
 }
 
-void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
-                size_t gpu_page_size, RegTree* tree,
-                HostDeviceVector<bst_float>* preds, float subsample = 1.0f,
-                const std::string& sampling_method = "uniform",
+void UpdateTree(Context const* ctx, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
+                size_t gpu_page_size, RegTree* tree, HostDeviceVector<bst_float>* preds,
+                float subsample = 1.0f, const std::string& sampling_method = "uniform",
                 int max_bin = 2) {
-
   if (gpu_page_size > 0) {
     // Loop over the batches and count the records
     int64_t batch_count = 0;
     int64_t row_count = 0;
-    for (const auto& batch : dmat->GetBatches<EllpackPage>({0, max_bin})) {
+    for (const auto& batch : dmat->GetBatches<EllpackPage>(
+             ctx, BatchParam{max_bin, TrainParam::DftSparseThreshold()})) {
       EXPECT_LT(batch.Size(), dmat->Info().num_row_);
       batch_count++;
       row_count += batch.Size();
@@ -249,14 +248,13 @@ void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
   TrainParam param;
   param.UpdateAllowUnknown(args);
 
-  Context ctx(CreateEmptyGenericParam(0));
   ObjInfo task{ObjInfo::kRegression};
-  tree::GPUHistMaker hist_maker{&ctx, &task};
+  tree::GPUHistMaker hist_maker{ctx, &task};
 
   std::vector<HostDeviceVector<bst_node_t>> position(1);
   hist_maker.Update(&param, gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
                     {tree});
-  auto cache = linalg::MakeTensorView(&ctx, preds->DeviceSpan(), preds->Size(), 1);
+  auto cache = linalg::MakeTensorView(ctx, preds->DeviceSpan(), preds->Size(), 1);
   hist_maker.UpdatePredictionCache(dmat, cache);
 }
 
@@ -274,12 +272,13 @@ TEST(GpuHist, UniformSampling) {
   // Build a tree using the in-memory DMatrix.
   RegTree tree;
   HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
+  Context ctx(CreateEmptyGenericParam(0));
+  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
   // Build another tree using sampling.
   RegTree tree_sampling;
   HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample,
-             "uniform", kRows);
+  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample, "uniform",
+             kRows);
 
   // Make sure the predictions are the same.
   auto preds_h = preds.ConstHostVector();
@@ -303,12 +302,13 @@ TEST(GpuHist, GradientBasedSampling) {
   // Build a tree using the in-memory DMatrix.
   RegTree tree;
   HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
+  Context ctx(CreateEmptyGenericParam(0));
+  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
 
   // Build another tree using sampling.
   RegTree tree_sampling;
   HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample,
+  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample,
              "gradient_based", kRows);
 
   // Make sure the predictions are the same.
@@ -337,12 +337,13 @@ TEST(GpuHist, ExternalMemory) {
 
   // Build a tree using the in-memory DMatrix.
   RegTree tree;
+  Context ctx(CreateEmptyGenericParam(0));
   HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
+  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
   // Build another tree using multiple ELLPACK pages.
   RegTree tree_ext;
   HostDeviceVector<bst_float> preds_ext(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, 1.0, "uniform", kRows);
+  UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, 1.0, "uniform", kRows);
 
   // Make sure the predictions are the same.
   auto preds_h = preds.ConstHostVector();
@@ -374,17 +375,17 @@ TEST(GpuHist, ExternalMemoryWithSampling) {
   // Build a tree using the in-memory DMatrix.
   auto rng = common::GlobalRandom();
 
+  Context ctx(CreateEmptyGenericParam(0));
   RegTree tree;
   HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat.get(), 0, &tree, &preds, kSubsample, kSamplingMethod,
-             kRows);
+  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, kSubsample, kSamplingMethod, kRows);
 
   // Build another tree using multiple ELLPACK pages.
   common::GlobalRandom() = rng;
   RegTree tree_ext;
   HostDeviceVector<bst_float> preds_ext(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext,
-             kSubsample, kSamplingMethod, kRows);
+  UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, kSubsample,
+             kSamplingMethod, kRows);
 
   // Make sure the predictions are the same.
   auto preds_h = preds.ConstHostVector();
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc
index 2aa1b8f47..e5ce75585 100644
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -36,7 +36,7 @@ void TestPartitioner(bst_target_t n_targets) {
   std::vector<ExpandEntry> candidates{{0, 0}};
   candidates.front().split.loss_chg = 0.4;
 
-  auto cuts = common::SketchOnDMatrix(Xy.get(), 64, ctx.Threads());
+  auto cuts = common::SketchOnDMatrix(&ctx, Xy.get(), 64);
 
   for (auto const& page : Xy->GetBatches<SparsePage>()) {
     GHistIndexMatrix gmat(page, {}, cuts, 64, true, 0.5, ctx.Threads());
diff --git a/tests/cpp/tree/test_regen.cc b/tests/cpp/tree/test_regen.cc
index 24884b1cf..e9b3637a7 100644
--- a/tests/cpp/tree/test_regen.cc
+++ b/tests/cpp/tree/test_regen.cc
@@ -15,16 +15,17 @@ class DMatrixForTest : public data::SimpleDMatrix {
 
  public:
   using SimpleDMatrix::SimpleDMatrix;
-  BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam& param) override {
+  BatchSet<GHistIndexMatrix> GetGradientIndex(Context const* ctx,
+                                              const BatchParam& param) override {
     auto backup = this->gradient_index_;
-    auto iter = SimpleDMatrix::GetGradientIndex(param);
+    auto iter = SimpleDMatrix::GetGradientIndex(ctx, param);
     n_regen_ += (backup != this->gradient_index_);
     return iter;
   }
 
-  BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) override {
+  BatchSet<EllpackPage> GetEllpackBatches(Context const* ctx, const BatchParam& param) override {
     auto backup = this->ellpack_page_;
-    auto iter = SimpleDMatrix::GetEllpackBatches(param);
+    auto iter = SimpleDMatrix::GetEllpackBatches(ctx, param);
     n_regen_ += (backup != this->ellpack_page_);
     return iter;
   }
@@ -50,8 +51,8 @@ class RegenTest : public ::testing::Test {
     HostDeviceVector<float> storage;
     auto dense = RandomDataGenerator{kRows, kCols, 0.5}.GenerateArrayInterface(&storage);
     auto adapter = data::ArrayAdapter(StringView{dense});
-    p_fmat_ = std::shared_ptr<DMatrix>(new DMatrixForTest{
-        &adapter, std::numeric_limits<float>::quiet_NaN(), AllThreadsForTest()});
+    p_fmat_ = std::shared_ptr<DMatrix>(
+        new DMatrixForTest{&adapter, std::numeric_limits<float>::quiet_NaN(), AllThreadsForTest()});
 
     p_fmat_->Info().labels.Reshape(256, 1);
     auto labels = p_fmat_->Info().labels.Data();
@@ -74,7 +75,7 @@ class RegenTest : public ::testing::Test {
     auto for_test = dynamic_cast<DMatrixForTest*>(p_fmat_.get());
     CHECK(for_test);
     auto backup = for_test->NumRegen();
-    for_test->GetBatches<Page>(BatchParam{});
+    for_test->GetBatches<Page>(p_fmat_->Ctx(), BatchParam{});
     CHECK_EQ(for_test->NumRegen(), backup);
 
     if (reset) {
diff --git a/tests/python-gpu/test_device_quantile_dmatrix.py b/tests/python-gpu/test_device_quantile_dmatrix.py
index 3cd65e30f..c5b7e4fc5 100644
--- a/tests/python-gpu/test_device_quantile_dmatrix.py
+++ b/tests/python-gpu/test_device_quantile_dmatrix.py
@@ -18,6 +18,7 @@ class TestQuantileDMatrix:
     @pytest.mark.skipif(**tm.no_cupy())
     def test_dmatrix_feature_weights(self) -> None:
         import cupy as cp
+
         rng = cp.random.RandomState(1994)
         data = rng.randn(5, 5)
         m = xgb.DMatrix(data)
@@ -26,23 +27,91 @@ class TestQuantileDMatrix:
         m.set_info(feature_weights=feature_weights)
 
         cp.testing.assert_array_equal(
-            cp.array(m.get_float_info('feature_weights')),
-            feature_weights.astype(np.float32))
+            cp.array(m.get_float_info("feature_weights")),
+            feature_weights.astype(np.float32),
+        )
 
     @pytest.mark.skipif(**tm.no_cupy())
     def test_dmatrix_cupy_init(self) -> None:
         import cupy as cp
+
         data = cp.random.randn(5, 5)
         xgb.QuantileDMatrix(data, cp.ones(5, dtype=np.float64))
 
+    @pytest.mark.skipif(**tm.no_cupy())
+    @pytest.mark.parametrize(
+        "on_device,tree_method",
+        [(True, "hist"), (False, "gpu_hist"), (False, "hist"), (True, "gpu_hist")],
+    )
+    def test_initialization(self, on_device: bool, tree_method: str) -> None:
+        """Cuts queried through ``ref`` must be consistent for host/device inputs."""
+        n_samples, n_features, max_bin = 64, 3, 16
+        X, y, w = tm.make_batches(
+            n_samples, n_features=n_features, n_batches=1, use_cupy=on_device
+        )
+
+        # Init SparsePage
+        Xy = xgb.DMatrix(X[0], y[0], weight=w[0])
+        # Init GIDX/Ellpack
+        xgb.train(
+            {"tree_method": tree_method, "max_bin": max_bin},
+            Xy,
+            num_boost_round=1,
+        )
+        # query cuts from GIDX/Ellpack
+        qXy = xgb.QuantileDMatrix(X[0], y[0], weight=w[0], max_bin=max_bin, ref=Xy)
+        tm.predictor_equal(Xy, qXy)
+        with pytest.raises(ValueError, match="Inconsistent"):
+            # max_bin changed.
+            xgb.QuantileDMatrix(X[0], y[0], weight=w[0], max_bin=max_bin - 1, ref=Xy)
+
+        # No error, DMatrix can be modified for different training session.
+        xgb.train(
+            {"tree_method": tree_method, "max_bin": max_bin - 1},
+            Xy,
+            num_boost_round=1,
+        )
+
+        # Init Ellpack/GIDX
+        Xy = xgb.QuantileDMatrix(X[0], y[0], weight=w[0], max_bin=max_bin)
+        # Init GIDX/Ellpack
+        xgb.train(
+            {"tree_method": tree_method, "max_bin": max_bin},
+            Xy,
+            num_boost_round=1,
+        )
+        # query cuts from GIDX/Ellpack
+        qXy = xgb.QuantileDMatrix(X[0], y[0], weight=w[0], max_bin=max_bin, ref=Xy)
+        tm.predictor_equal(Xy, qXy)
+        with pytest.raises(ValueError, match="Inconsistent"):
+            # max_bin changed.
+            xgb.QuantileDMatrix(X[0], y[0], weight=w[0], max_bin=max_bin - 1, ref=Xy)
+
+        # hist and gpu_hist models must agree on a QuantileDMatrix built from ``Xy``.
+        Xy = xgb.DMatrix(X[0], y[0], weight=w[0])
+        booster0 = xgb.train(
+            {"tree_method": "hist", "max_bin": max_bin, "max_depth": 4},
+            Xy,
+            num_boost_round=1,
+        )
+        booster1 = xgb.train(
+            {"tree_method": "gpu_hist", "max_bin": max_bin, "max_depth": 4},
+            Xy,
+            num_boost_round=1,
+        )
+        qXy = xgb.QuantileDMatrix(X[0], y[0], weight=w[0], max_bin=max_bin, ref=Xy)
+        predt0 = booster0.predict(qXy)
+        predt1 = booster1.predict(qXy)
+        np.testing.assert_allclose(predt0, predt1)
+
     @pytest.mark.skipif(**tm.no_cupy())
     @pytest.mark.parametrize(
-        "tree_method,max_bin", [
-            ("hist", 16), ("gpu_hist", 16), ("hist", 64), ("gpu_hist", 64)
-        ]
+        "tree_method,max_bin",
+        [("hist", 16), ("gpu_hist", 16), ("hist", 64), ("gpu_hist", 64)],
     )
     def test_interoperability(self, tree_method: str, max_bin: int) -> None:
         import cupy as cp
+
         n_samples = 64
         n_features = 3
         X, y, w = tm.make_batches(
@@ -75,6 +144,7 @@ class TestQuantileDMatrix:
     @pytest.mark.skipif(**tm.no_cupy())
     def test_metainfo(self) -> None:
         import cupy as cp
+
         rng = cp.random.RandomState(1994)
 
         rows = 10
@@ -98,6 +168,7 @@ class TestQuantileDMatrix:
     @pytest.mark.skipif(**tm.no_cudf())
     def test_ref_dmatrix(self) -> None:
         import cupy as cp
+
         rng = cp.random.RandomState(1994)
         self.cputest.run_ref_dmatrix(rng, "gpu_hist", False)
 
@@ -158,5 +229,6 @@ class TestQuantileDMatrix:
     @pytest.mark.skipif(**tm.no_cupy())
     def test_check_inf(self) -> None:
         import cupy as cp
+
         rng = cp.random.default_rng(1994)
         check_inf(rng)
diff --git a/tests/python-gpu/test_gpu_eval_metrics.py b/tests/python-gpu/test_gpu_eval_metrics.py
index 6d16aa44e..1e9d1a282 100644
--- a/tests/python-gpu/test_gpu_eval_metrics.py
+++ b/tests/python-gpu/test_gpu_eval_metrics.py
@@ -1,3 +1,4 @@
+import json
 import sys
 
 import pytest
@@ -36,19 +37,16 @@ class TestGPUEvalMetrics:
 
         Xy = xgboost.DMatrix(X, y, group=group)
 
-        cpu = xgboost.train(
+        booster = xgboost.train(
             {"tree_method": "hist", "eval_metric": "auc", "objective": "rank:ndcg"},
             Xy,
             num_boost_round=10,
         )
-        cpu_auc = float(cpu.eval(Xy).split(":")[1])
-
-        gpu = xgboost.train(
-            {"tree_method": "gpu_hist", "eval_metric": "auc", "objective": "rank:ndcg"},
-            Xy,
-            num_boost_round=10,
-        )
-        gpu_auc = float(gpu.eval(Xy).split(":")[1])
+        cpu_auc = float(booster.eval(Xy).split(":")[1])
+        booster.set_param({"gpu_id": "0"})
+        assert json.loads(booster.save_config())["learner"]["generic_param"]["gpu_id"] == "0"
+        gpu_auc = float(booster.eval(Xy).split(":")[1])
+        assert json.loads(booster.save_config())["learner"]["generic_param"]["gpu_id"] == "0"
 
         np.testing.assert_allclose(cpu_auc, gpu_auc)
 
diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py
index ea8d5dcb5..75e403dbe 100644
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -153,12 +153,18 @@ class TestGPUUpdaters:
         tm.dataset_strategy
     )
     @settings(deadline=None, max_examples=20, print_blob=True)
-    def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
+    def test_gpu_hist_device_dmatrix(
+        self, param: dict, num_rounds: int, dataset: tm.TestDataset
+    ) -> None:
         # We cannot handle empty dataset yet
         assume(len(dataset.y) > 0)
         param['tree_method'] = 'gpu_hist'
         param = dataset.set_params(param)
-        result = train_result(param, dataset.get_device_dmat(), num_rounds)
+        result = train_result(
+            param,
+            dataset.get_device_dmat(max_bin=param.get("max_bin", None)),
+            num_rounds
+        )
         note(result)
         assert tm.non_increasing(result['train'][dataset.metric], tolerance=1e-3)
 
diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py
index fab2a6eca..e512e4bc6 100644
--- a/tests/python/test_basic.py
+++ b/tests/python/test_basic.py
@@ -21,8 +21,7 @@ class TestBasic:
         assert not lazy_isinstance(a, 'numpy', 'dataframe')
 
     def test_basic(self):
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        dtrain, dtest = tm.load_agaricus(__file__)
         param = {'max_depth': 2, 'eta': 1,
                  'objective': 'binary:logistic'}
         # specify validations set to watch performance
@@ -61,8 +60,7 @@ class TestBasic:
     def test_metric_config(self):
         # Make sure that the metric configuration happens in booster so the
         # string `['error', 'auc']` doesn't get passed down to core.
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        dtrain, dtest = tm.load_agaricus(__file__)
         param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                  'objective': 'binary:logistic', 'eval_metric': ['error', 'auc']}
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
@@ -78,8 +76,7 @@ class TestBasic:
             np.testing.assert_allclose(predt_0, predt_1)
 
     def test_multiclass(self):
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        dtrain, dtest = tm.load_agaricus(__file__)
         param = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'num_class': 2}
         # specify validations set to watch performance
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
@@ -188,7 +185,7 @@ class TestBasic:
             assert dm.num_col() == cols
 
     def test_cv(self):
-        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dm, _ = tm.load_agaricus(__file__)
         params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                   'objective': 'binary:logistic'}
 
@@ -198,7 +195,7 @@ class TestBasic:
         assert len(cv) == (4)
 
     def test_cv_no_shuffle(self):
-        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dm, _ = tm.load_agaricus(__file__)
         params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                   'objective': 'binary:logistic'}
 
@@ -209,7 +206,7 @@ class TestBasic:
         assert len(cv) == (4)
 
     def test_cv_explicit_fold_indices(self):
-        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dm, _ = tm.load_agaricus(__file__)
         params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective':
                   'binary:logistic'}
         folds = [
@@ -268,8 +265,7 @@ class TestBasicPathLike:
 
     def test_DMatrix_init_from_path(self):
         """Initialization from the data path."""
-        dpath = Path('demo/data')
-        dtrain = xgb.DMatrix(dpath / 'agaricus.txt.train')
+        dtrain, _ = tm.load_agaricus(__file__)
         assert dtrain.num_row() == 6513
         assert dtrain.num_col() == 127
 
diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py
index d76205593..610a9236e 100644
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -42,8 +42,7 @@ class TestModels:
         param = {'verbosity': 0, 'objective': 'binary:logistic',
                  'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1,
                  'nthread': 1}
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
         num_round = 4
         bst = xgb.train(param, dtrain, num_round, watchlist)
@@ -55,8 +54,7 @@ class TestModels:
         assert err < 0.2
 
     def test_dart(self):
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         param = {'max_depth': 5, 'objective': 'binary:logistic',
                  'eval_metric': 'logloss', 'booster': 'dart', 'verbosity': 1}
         # specify validations set to watch performance
@@ -122,7 +120,7 @@ class TestModels:
 
     def test_boost_from_prediction(self):
         # Re-construct dtrain here to avoid modification
-        margined = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        margined, _ = tm.load_agaricus(__file__)
         bst = xgb.train({'tree_method': 'hist'}, margined, 1)
         predt_0 = bst.predict(margined, output_margin=True)
         margined.set_base_margin(predt_0)
@@ -130,13 +128,13 @@ class TestModels:
         predt_1 = bst.predict(margined)
 
         assert np.any(np.abs(predt_1 - predt_0) > 1e-6)
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        dtrain, _ = tm.load_agaricus(__file__)
         bst = xgb.train({'tree_method': 'hist'}, dtrain, 2)
         predt_2 = bst.predict(dtrain)
         assert np.all(np.abs(predt_2 - predt_1) < 1e-6)
 
     def test_boost_from_existing_model(self):
-        X = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        X, _ = tm.load_agaricus(__file__)
         booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4)
         assert booster.num_boosted_rounds() == 4
         booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4,
@@ -156,8 +154,7 @@ class TestModels:
             'objective': 'reg:logistic',
             "tree_method": tree_method
         }
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
         num_round = 10
 
@@ -203,8 +200,7 @@ class TestModels:
         self.run_custom_objective()
 
     def test_multi_eval_metric(self):
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
         param = {'max_depth': 2, 'eta': 0.2, 'verbosity': 1,
                  'objective': 'binary:logistic'}
@@ -226,7 +222,7 @@ class TestModels:
             param['scale_pos_weight'] = ratio
             return (dtrain, dtest, param)
 
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        dtrain, _ = tm.load_agaricus(__file__)
         xgb.cv(param, dtrain, num_round, nfold=5,
                metrics={'auc'}, seed=0, fpreproc=fpreproc)
 
@@ -234,7 +230,7 @@ class TestModels:
         param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                  'objective': 'binary:logistic'}
         num_round = 2
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        dtrain, _ = tm.load_agaricus(__file__)
         xgb.cv(param, dtrain, num_round, nfold=5,
                metrics={'error'}, seed=0, show_stdv=False)
 
@@ -392,7 +388,7 @@ class TestModels:
         os.remove(model_path)
 
         try:
-            dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+            dtrain, _ = tm.load_agaricus(__file__)
             xgb.train({'objective': 'foo'}, dtrain, num_boost_round=1)
         except ValueError as e:
             e_str = str(e)
diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py
index e8375aa5e..d3ec05e6e 100644
--- a/tests/python/test_callback.py
+++ b/tests/python/test_callback.py
@@ -275,9 +275,7 @@ class TestCallbacks:
         """Test learning rate scheduler, used by both CPU and GPU tests."""
         scheduler = xgb.callback.LearningRateScheduler
 
-        dpath = tm.data_dir(__file__)
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
 
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
         num_round = 4
@@ -361,9 +359,7 @@ class TestCallbacks:
         num_round = 4
         scheduler = xgb.callback.LearningRateScheduler
 
-        dpath = tm.data_dir(__file__)
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
 
         param = {
diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py
index ef56ff656..bcc089afb 100644
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -283,7 +283,7 @@ class TestDMatrix:
             assert m0.feature_types == m1.feature_types
 
     def test_get_info(self):
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dtrain, _ = tm.load_agaricus(__file__)
         dtrain.get_float_info('label')
         dtrain.get_float_info('weight')
         dtrain.get_float_info('base_margin')
@@ -432,7 +432,9 @@ class TestDMatrix:
     def test_uri_categorical(self):
         path = os.path.join(dpath, 'agaricus.txt.train')
         feature_types = ["q"] * 5 + ["c"] + ["q"] * 120
-        Xy = xgb.DMatrix(path + "?indexing_mode=1", feature_types=feature_types)
+        Xy = xgb.DMatrix(
+            path + "?indexing_mode=1&format=libsvm", feature_types=feature_types
+        )
         np.testing.assert_equal(np.array(Xy.feature_types), np.array(feature_types))
 
     def test_base_margin(self):
diff --git a/tests/python/test_interaction_constraints.py b/tests/python/test_interaction_constraints.py
index 96d2ba7dc..5eaaf1f8c 100644
--- a/tests/python/test_interaction_constraints.py
+++ b/tests/python/test_interaction_constraints.py
@@ -88,8 +88,12 @@ class TestInteractionConstraints:
     def training_accuracy(self, tree_method):
         """Test accuracy, reused by GPU tests."""
         from sklearn.metrics import accuracy_score
-        dtrain = xgboost.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1')
-        dtest = xgboost.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1')
+        dtrain = xgboost.DMatrix(
+            dpath + "agaricus.txt.train?indexing_mode=1&format=libsvm"
+        )
+        dtest = xgboost.DMatrix(
+            dpath + "agaricus.txt.test?indexing_mode=1&format=libsvm"
+        )
         params = {
             'eta': 1,
             'max_depth': 6,
diff --git a/tests/python/test_monotone_constraints.py b/tests/python/test_monotone_constraints.py
index 4dbfaa60d..a3785f1cb 100644
--- a/tests/python/test_monotone_constraints.py
+++ b/tests/python/test_monotone_constraints.py
@@ -134,8 +134,8 @@ class TestMonotoneConstraints:
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_training_accuracy(self):
         from sklearn.metrics import accuracy_score
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1')
+        dtrain = xgb.DMatrix(dpath + "agaricus.txt.train?indexing_mode=1&format=libsvm")
+        dtest = xgb.DMatrix(dpath + "agaricus.txt.test?indexing_mode=1&format=libsvm")
         params = {'eta': 1, 'max_depth': 6, 'objective': 'binary:logistic',
                   'tree_method': 'hist', 'monotone_constraints': '(1, 0)'}
         num_boost_round = 5
diff --git a/tests/python/test_openmp.py b/tests/python/test_openmp.py
index c53363736..82b0ba270 100644
--- a/tests/python/test_openmp.py
+++ b/tests/python/test_openmp.py
@@ -13,9 +13,7 @@ pytestmark = tm.timeout(10)
 
 class TestOMP:
     def test_omp(self):
-        dpath = 'demo/data/'
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        dtrain, dtest = tm.load_agaricus(__file__)
 
         param = {'booster': 'gbtree',
                  'objective': 'binary:logistic',
diff --git a/tests/python/test_parse_tree.py b/tests/python/test_parse_tree.py
index 885c0f1e2..9d80d0f6f 100644
--- a/tests/python/test_parse_tree.py
+++ b/tests/python/test_parse_tree.py
@@ -13,7 +13,7 @@ rng = np.random.RandomState(1994)
 
 class TestTreesToDataFrame:
     def build_model(self, max_depth, num_round):
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dtrain, _ = tm.load_agaricus(__file__)
         param = {'max_depth': max_depth, 'objective': 'binary:logistic',
                  'verbosity': 1}
         num_round = num_round
diff --git a/tests/python/test_plotting.py b/tests/python/test_plotting.py
index dc45cd254..303c7c8c1 100644
--- a/tests/python/test_plotting.py
+++ b/tests/python/test_plotting.py
@@ -17,12 +17,10 @@ except ImportError:
 pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_matplotlib(),
                                                  tm.no_graphviz()))
 
-dpath = 'demo/data/agaricus.txt.train'
-
 
 class TestPlotting:
     def test_plotting(self):
-        m = xgb.DMatrix(dpath)
+        m, _ = tm.load_agaricus(__file__)
         booster = xgb.train({'max_depth': 2, 'eta': 1,
                              'objective': 'binary:logistic'}, m,
                             num_boost_round=2)
diff --git a/tests/python/test_shap.py b/tests/python/test_shap.py
index 4d861ad6e..2585da088 100644
--- a/tests/python/test_shap.py
+++ b/tests/python/test_shap.py
@@ -46,8 +46,8 @@ class TestSHAP:
         fscores = bst.get_fscore()
         assert scores1 == fscores
 
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train?format=libsvm')
+        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test?format=libsvm')
 
         def fn(max_depth, num_rounds):
             # train
diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py
index dd710f6a4..78097a4ea 100644
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -154,9 +154,7 @@ class TestTreeMethod:
 
     def test_hist_categorical(self):
         # hist must be same as exact on all-categorial data
-        dpath = 'demo/data/'
-        ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        ag_dtrain, ag_dtest = tm.load_agaricus(__file__)
         ag_param = {'max_depth': 2,
                     'tree_method': 'hist',
                     'eta': 1,
diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py
index 07295eb6c..f8a21b6ab 100644
--- a/tests/python/test_with_pandas.py
+++ b/tests/python/test_with_pandas.py
@@ -222,7 +222,7 @@ class TestPandas:
         set_base_margin_info(pd.DataFrame, xgb.DMatrix, "hist")
 
     def test_cv_as_pandas(self):
-        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dm, _ = tm.load_agaricus(__file__)
         params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                   'objective': 'binary:logistic', 'eval_metric': 'error'}
 
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 67620e6dd..e0d3d680b 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -176,7 +176,7 @@ def test_ranking():
 def test_ranking_metric() -> None:
     from sklearn.metrics import roc_auc_score
 
-    X, y, qid, w = tm.make_ltr(512, 4, 3, 2)
+    X, y, qid, w = tm.make_ltr(512, 4, 3, 1)
     # use auc for test as ndcg_score in sklearn works only on label gain instead of exp
     # gain.
     # note that the auc in sklearn is different from the one in XGBoost. The one in
diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py
index 0bf952025..5e9303a46 100644
--- a/tests/test_distributed/test_with_dask/test_with_dask.py
+++ b/tests/test_distributed/test_with_dask/test_with_dask.py
@@ -192,6 +192,25 @@ def deterministic_repartition(
     return X, y, m
 
 
+@pytest.mark.parametrize("to_frame", [True, False])
+def test_xgbclassifier_classes_type_and_value(to_frame: bool, client: "Client"):
+    X, y = make_classification(n_samples=1000, n_features=4, random_state=123)
+    if to_frame:  # inputs built via dd.from_pandas (frame columns + target column)
+        import pandas as pd
+        feats = [f"var_{i}" for i in range(4)]
+        df = pd.DataFrame(X, columns=feats)
+        df["target"] = y
+        df = dd.from_pandas(df, npartitions=1)
+        X, y = df[feats], df["target"]
+    else:  # inputs built via da.from_array
+        X = da.from_array(X)
+        y = da.from_array(y)
+
+    est = xgb.dask.DaskXGBClassifier(n_estimators=10).fit(X, y)
+    assert isinstance(est.classes_, np.ndarray)  # must be a concrete NumPy array
+    np.testing.assert_array_equal(est.classes_, np.array([0, 1]))  # labels 0 and 1
+
+
 def test_from_dask_dataframe() -> None:
     with LocalCluster(n_workers=kWorkers, dashboard_address=":0") as cluster:
         with Client(cluster) as client:
diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py
index a5e0f028a..6d88323ac 100644
--- a/tests/test_distributed/test_with_spark/test_spark_local.py
+++ b/tests/test_distributed/test_with_spark/test_spark_local.py
@@ -1343,61 +1343,94 @@ class XgboostLocalTest(SparkTestCase):
             SparkXGBClassifier(evals_result={})
 
 
-class XgboostRankerLocalTest(SparkTestCase):
-    def setUp(self):
-        self.session.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "8")
-        self.ranker_df_train = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.0, 2.0, 3.0), 0, 0),
-                (Vectors.dense(4.0, 5.0, 6.0), 1, 0),
-                (Vectors.dense(9.0, 4.0, 8.0), 2, 0),
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 0, 1),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 1),
-                (Vectors.sparse(3, {1: 8.0, 2: 9.5}), 2, 1),
-            ],
-            ["features", "label", "qid"],
-        )
-        self.ranker_df_test = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.5, 2.0, 3.0), 0, -1.87988),
-                (Vectors.dense(4.5, 5.0, 6.0), 0, 0.29556),
-                (Vectors.dense(9.0, 4.5, 8.0), 0, 2.36570),
-                (Vectors.sparse(3, {1: 1.0, 2: 6.0}), 1, -1.87988),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.0}), 1, -0.30612),
-                (Vectors.sparse(3, {1: 8.0, 2: 10.5}), 1, 2.44826),
-            ],
-            ["features", "qid", "expected_prediction"],
-        )
-        self.ranker_df_train_1 = self.session.createDataFrame(
-            [
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 0, 9),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 9),
-                (Vectors.sparse(3, {1: 8.0, 2: 9.5}), 2, 9),
-                (Vectors.dense(1.0, 2.0, 3.0), 0, 8),
-                (Vectors.dense(4.0, 5.0, 6.0), 1, 8),
-                (Vectors.dense(9.0, 4.0, 8.0), 2, 8),
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 0, 7),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 7),
-                (Vectors.sparse(3, {1: 8.0, 2: 9.5}), 2, 7),
-                (Vectors.dense(1.0, 2.0, 3.0), 0, 6),
-                (Vectors.dense(4.0, 5.0, 6.0), 1, 6),
-                (Vectors.dense(9.0, 4.0, 8.0), 2, 6),
-            ]
-            * 4,
-            ["features", "label", "qid"],
-        )
+LTRData = namedtuple("LTRData", ("df_train", "df_test", "df_train_1"))
 
-    def test_ranker(self):
-        ranker = SparkXGBRanker(qid_col="qid")
+
+@pytest.fixture  # Spark DataFrames for the ranker tests: train / test / train_1
+def ltr_data(spark: SparkSession) -> Generator[LTRData, None, None]:
+    spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "8")
+    ranker_df_train = spark.createDataFrame(  # two query groups, three rows each
+        [
+            (Vectors.dense(1.0, 2.0, 3.0), 0, 0),
+            (Vectors.dense(4.0, 5.0, 6.0), 1, 0),
+            (Vectors.dense(9.0, 4.0, 8.0), 2, 0),
+            (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 0, 1),
+            (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 1),
+            (Vectors.sparse(3, {1: 8.0, 2: 9.5}), 2, 1),
+        ],
+        ["features", "label", "qid"],
+    )
+    X_train = np.array(  # NumPy mirror of ranker_df_train; unset sparse slot -> NaN
+        [
+            [1.0, 2.0, 3.0],
+            [4.0, 5.0, 6.0],
+            [9.0, 4.0, 8.0],
+            [np.nan, 1.0, 5.5],
+            [np.nan, 6.0, 7.5],
+            [np.nan, 8.0, 9.5],
+        ]
+    )
+    qid_train = np.array([0, 0, 0, 1, 1, 1])  # same qid column as ranker_df_train
+    y_train = np.array([0, 1, 2, 0, 1, 2])  # same label column as ranker_df_train
+
+    X_test = np.array(  # NumPy mirror of the ranker_df_test feature vectors
+        [
+            [1.5, 2.0, 3.0],
+            [4.5, 5.0, 6.0],
+            [9.0, 4.5, 8.0],
+            [np.nan, 1.0, 6.0],
+            [np.nan, 6.0, 7.0],
+            [np.nan, 8.0, 10.5],
+        ]
+    )
+
+    ltr = xgb.XGBRanker(tree_method="approx", objective="rank:pairwise")
+    ltr.fit(X_train, y_train, qid=qid_train)
+    predt = ltr.predict(X_test)  # becomes the expected_prediction column below
+
+    ranker_df_test = spark.createDataFrame(
+        [
+            (Vectors.dense(1.5, 2.0, 3.0), 0, float(predt[0])),
+            (Vectors.dense(4.5, 5.0, 6.0), 0, float(predt[1])),
+            (Vectors.dense(9.0, 4.5, 8.0), 0, float(predt[2])),
+            (Vectors.sparse(3, {1: 1.0, 2: 6.0}), 1, float(predt[3])),
+            (Vectors.sparse(3, {1: 6.0, 2: 7.0}), 1, float(predt[4])),
+            (Vectors.sparse(3, {1: 8.0, 2: 10.5}), 1, float(predt[5])),
+        ],
+        ["features", "qid", "expected_prediction"],
+    )
+    ranker_df_train_1 = spark.createDataFrame(
+        [
+            (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 0, 9),
+            (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 9),
+            (Vectors.sparse(3, {1: 8.0, 2: 9.5}), 2, 9),
+            (Vectors.dense(1.0, 2.0, 3.0), 0, 8),
+            (Vectors.dense(4.0, 5.0, 6.0), 1, 8),
+            (Vectors.dense(9.0, 4.0, 8.0), 2, 8),
+            (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 0, 7),
+            (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 7),
+            (Vectors.sparse(3, {1: 8.0, 2: 9.5}), 2, 7),
+            (Vectors.dense(1.0, 2.0, 3.0), 0, 6),
+            (Vectors.dense(4.0, 5.0, 6.0), 1, 6),
+            (Vectors.dense(9.0, 4.0, 8.0), 2, 6),
+        ]
+        * 4,  # the 12 rows above, repeated four times
+        ["features", "label", "qid"],
+    )
+    yield LTRData(ranker_df_train, ranker_df_test, ranker_df_train_1)
+
+
+class TestPySparkLocalLETOR:
+    def test_ranker(self, ltr_data: LTRData) -> None:
+        ranker = SparkXGBRanker(qid_col="qid", objective="rank:pairwise")
         assert ranker.getOrDefault(ranker.objective) == "rank:pairwise"
-        model = ranker.fit(self.ranker_df_train)
-        pred_result = model.transform(self.ranker_df_test).collect()
-
+        model = ranker.fit(ltr_data.df_train)
+        pred_result = model.transform(ltr_data.df_test).collect()
         for row in pred_result:
             assert np.isclose(row.prediction, row.expected_prediction, rtol=1e-3)
 
-    def test_ranker_qid_sorted(self):
-        ranker = SparkXGBRanker(qid_col="qid", num_workers=4)
-        assert ranker.getOrDefault(ranker.objective) == "rank:pairwise"
-        model = ranker.fit(self.ranker_df_train_1)
-        model.transform(self.ranker_df_test).collect()
+    def test_ranker_qid_sorted(self, ltr_data: LTRData) -> None:
+        ranker = SparkXGBRanker(qid_col="qid", num_workers=4, objective="rank:ndcg")
+        assert ranker.getOrDefault(ranker.objective) == "rank:ndcg"
+        model = ranker.fit(ltr_data.df_train_1)
+        model.transform(ltr_data.df_test).collect()