diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..5c71e130e --- /dev/null +++ b/.gitattributes @@ -0,0 +1,18 @@ +* text=auto + +*.c text eol=lf +*.h text eol=lf +*.cc text eol=lf +*.cuh text eol=lf +*.cu text eol=lf +*.py text eol=lf +*.txt text eol=lf +*.R text eol=lf +*.scala text eol=lf +*.java text eol=lf + +*.sh text eol=lf + +*.rst text eol=lf +*.md text eol=lf +*.csv text eol=lf \ No newline at end of file diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ac50b744b..ab2a58fe9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -156,40 +156,3 @@ jobs: xgboost \ cpp \ include src python-package - - sphinx: - runs-on: ubuntu-latest - name: Build docs using Sphinx - steps: - - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0 - with: - submodules: 'true' - - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0 - with: - python-version: "3.8" - architecture: 'x64' - - name: Install system packages - run: | - sudo apt-get install -y --no-install-recommends graphviz doxygen ninja-build - python -m pip install wheel setuptools awscli - python -m pip install -r doc/requirements.txt - - name: Extract branch name - shell: bash - run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" - id: extract_branch - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') - - name: Run Sphinx - run: | - make -C doc html - env: - SPHINX_GIT_BRANCH: ${{ steps.extract_branch.outputs.branch }} - READTHEDOCS: "True" - - - name: Publish - run: | - tar cvjf ${{ steps.extract_branch.outputs.branch }}.tar.bz2 doxygen/doc_doxygen/ - python -m awscli s3 cp ./${{ steps.extract_branch.outputs.branch }}.tar.bz2 s3://xgboost-docs/doxygen/ --acl public-read - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} diff --git a/README.md b/README.md index 219831114..2fae68ac5 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ - eXtreme Gradient Boosting + eXtreme Gradient Boosting =========== [![Build Status](https://xgboost-ci.net/job/xgboost/job/master/badge/icon)](https://xgboost-ci.net/blue/organizations/jenkins/xgboost/activity) [![XGBoost-CI](https://github.com/dmlc/xgboost/workflows/XGBoost-CI/badge.svg?branch=master)](https://github.com/dmlc/xgboost/actions) diff --git a/demo/guide-python/multioutput_regression.py b/demo/guide-python/multioutput_regression.py index 375377e4e..078ec6b7d 100644 --- a/demo/guide-python/multioutput_regression.py +++ b/demo/guide-python/multioutput_regression.py @@ -7,6 +7,12 @@ The demo is adopted from scikit-learn: https://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py See :doc:`/tutorials/multioutput` for more information. + +.. note:: + + The feature is experimental. For the `multi_output_tree` strategy, many features are + missing. 
+
 """

 import argparse
@@ -40,11 +46,18 @@ def gen_circle() -> Tuple[np.ndarray, np.ndarray]:
     return X, y


-def rmse_model(plot_result: bool):
+def rmse_model(plot_result: bool, strategy: str):
     """Draw a circle with 2-dim coordinate as target variables."""
     X, y = gen_circle()
     # Train a regressor on it
-    reg = xgb.XGBRegressor(tree_method="hist", n_estimators=64)
+    reg = xgb.XGBRegressor(
+        tree_method="hist",
+        n_estimators=128,
+        n_jobs=16,
+        max_depth=8,
+        multi_strategy=strategy,
+        subsample=0.6,
+    )
     reg.fit(X, y, eval_set=[(X, y)])

     y_predt = reg.predict(X)
@@ -52,7 +65,7 @@ def rmse_model(plot_result: bool):
         plot_predt(y, y_predt, "multi")


-def custom_rmse_model(plot_result: bool) -> None:
+def custom_rmse_model(plot_result: bool, strategy: str) -> None:
     """Train using Python implementation of Squared Error."""

     # As the experimental support status, custom objective doesn't support matrix as
@@ -88,9 +101,10 @@ def custom_rmse_model(plot_result: bool) -> None:
         {
             "tree_method": "hist",
             "num_target": y.shape[1],
+            "multi_strategy": strategy,
         },
         dtrain=Xy,
-        num_boost_round=100,
+        num_boost_round=128,
         obj=squared_log,
         evals=[(Xy, "Train")],
         evals_result=results,
@@ -107,6 +121,16 @@ if __name__ == "__main__":
     parser.add_argument("--plot", choices=[0, 1], type=int, default=1)
     args = parser.parse_args()
     # Train with builtin RMSE objective
-    rmse_model(args.plot == 1)
+    # - One model per output.
+    rmse_model(args.plot == 1, "one_output_per_tree")
+
+    # - One model for all outputs; this is still a work in progress, and many features
+    #   are missing.
+    rmse_model(args.plot == 1, "multi_output_tree")
+
     # Train with custom objective.
-    custom_rmse_model(args.plot == 1)
+    # - One model per output.
+    custom_rmse_model(args.plot == 1, "one_output_per_tree")
+    # - One model for all outputs; this is still a work in progress, and many features
+    #   are missing.
+    custom_rmse_model(args.plot == 1, "multi_output_tree")
diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py
index 5890987f9..cf33e959a 100644
--- a/demo/guide-python/sklearn_examples.py
+++ b/demo/guide-python/sklearn_examples.py
@@ -2,6 +2,9 @@
 Collection of examples for using sklearn interface
 ==================================================

+For an introduction to XGBoost's scikit-learn estimator interface, see
+:doc:`/python/sklearn_estimator`.
+
 Created on 1 Apr 2015

 @author: Jamie Hall
diff --git a/doc/c++.rst b/doc/c++.rst
index 4a045fc42..ce30bbefa 100644
--- a/doc/c++.rst
+++ b/doc/c++.rst
@@ -8,5 +8,5 @@ As a result it's changing quite often and we don't maintain its stability. Alon
 plugin system (see ``plugin/example`` in XGBoost's source tree), users can utilize some
 existing c++ headers for gaining more access to the internal of XGBoost.

-* `C++ interface documentation (latest master branch) `_
+* `C++ interface documentation (latest master branch) <./dev/files.html>`_
 * `C++ interface documentation (last stable release) `_
diff --git a/doc/c.rst b/doc/c.rst
index 02581b874..d63e779e1 100644
--- a/doc/c.rst
+++ b/doc/c.rst
@@ -10,7 +10,7 @@ simply look at function comments in ``include/xgboost/c_api.h``. The reference i
 to sphinx with the help of breathe, which doesn't contain links to examples but might be
 easier to read.
For the original doxygen pages please visit: -* `C API documentation (latest master branch) `_ +* `C API documentation (latest master branch) <./dev/c__api_8h.html>`_ * `C API documentation (last stable release) `_ *************** diff --git a/doc/conf.py b/doc/conf.py index 7d585e420..73fe48acc 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -13,53 +13,106 @@ # serve to show the default. import os import re +import shutil import subprocess import sys +import tarfile import urllib.request +import warnings from subprocess import call from urllib.error import HTTPError from sh.contrib import git -git_branch = os.getenv('SPHINX_GIT_BRANCH', default=None) +CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +PROJECT_ROOT = os.path.normpath(os.path.join(CURR_PATH, os.path.pardir)) +TMP_DIR = os.path.join(CURR_PATH, "tmp") +DOX_DIR = "doxygen" + + +def run_doxygen(): + """Run the doxygen make command in the designated folder.""" + curdir = os.path.normpath(os.path.abspath(os.path.curdir)) + if os.path.exists(TMP_DIR): + print(f"Delete directory {TMP_DIR}") + shutil.rmtree(TMP_DIR) + else: + print(f"Create directory {TMP_DIR}") + os.mkdir(TMP_DIR) + try: + os.chdir(PROJECT_ROOT) + if not os.path.exists(DOX_DIR): + os.mkdir(DOX_DIR) + os.chdir(os.path.join(PROJECT_ROOT, DOX_DIR)) + print( + "Build doxygen at {}".format( + os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen") + ) + ) + subprocess.check_call(["cmake", "..", "-DBUILD_C_DOC=ON", "-GNinja"]) + subprocess.check_call(["ninja", "doc_doxygen"]) + + src = os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen", "html") + dest = os.path.join(TMP_DIR, "dev") + print(f"Copy directory {src} -> {dest}") + shutil.copytree(src, dest) + except OSError as e: + sys.stderr.write("doxygen execution failed: %s" % e) + finally: + os.chdir(curdir) + + +def is_readthedocs_build(): + if os.environ.get("READTHEDOCS", None) == "True": + return True + warnings.warn( + "Skipping Doxygen build... You won't have documentation for C/C++ functions. " + "Set environment variable READTHEDOCS=True if you want to build Doxygen. 
" + "(If you do opt in, make sure to install Doxygen, Graphviz, CMake, and C++ compiler " + "on your system.)" + ) + return False + + +if is_readthedocs_build(): + run_doxygen() + + +git_branch = os.getenv("SPHINX_GIT_BRANCH", default=None) if not git_branch: # If SPHINX_GIT_BRANCH environment variable is not given, run git # to determine branch name git_branch = [ - re.sub(r'origin/', '', x.lstrip(' ')) for x in str( - git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n') + re.sub(r"origin/", "", x.lstrip(" ")) + for x in str(git.branch("-r", "--contains", "HEAD")).rstrip("\n").split("\n") ] - git_branch = [x for x in git_branch if 'HEAD' not in x] + git_branch = [x for x in git_branch if "HEAD" not in x] else: git_branch = [git_branch] -print('git_branch = {}'.format(git_branch[0])) +print("git_branch = {}".format(git_branch[0])) try: filename, _ = urllib.request.urlretrieve( - 'https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format( - git_branch[0])) - call( - 'if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}' - .format(filename), - shell=True) + f"https://s3-us-west-2.amazonaws.com/xgboost-docs/{git_branch[0]}.tar.bz2" + ) + if not os.path.exists(TMP_DIR): + print(f"Create directory {TMP_DIR}") + os.mkdir(TMP_DIR) + jvm_doc_dir = os.path.join(TMP_DIR, "jvm") + if os.path.exists(jvm_doc_dir): + print(f"Delete directory {jvm_doc_dir}") + shutil.rmtree(jvm_doc_dir) + print(f"Create directory {jvm_doc_dir}") + os.mkdir(jvm_doc_dir) + + with tarfile.open(filename, "r:bz2") as t: + t.extractall(jvm_doc_dir) except HTTPError: - print('JVM doc not found. Skipping...') -try: - filename, _ = urllib.request.urlretrieve( - 'https://s3-us-west-2.amazonaws.com/xgboost-docs/doxygen/{}.tar.bz2'. - format(git_branch[0])) - call( - 'mkdir -p tmp/dev; cd tmp/dev; tar xvf {}; mv doc_doxygen/html/* .; rm -rf doc_doxygen' - .format(filename), - shell=True) -except HTTPError: - print('C API doc not found. Skipping...') + print("JVM doc not found. Skipping...") # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) -PROJECT_ROOT = os.path.normpath(os.path.join(CURR_PATH, os.path.pardir)) libpath = os.path.join(PROJECT_ROOT, "python-package/") sys.path.insert(0, libpath) sys.path.insert(0, CURR_PATH) @@ -82,50 +135,56 @@ release = xgboost.__version__ # Add any Sphinx extension module names here, as strings. 
They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones extensions = [ - 'matplotlib.sphinxext.plot_directive', - 'sphinx.ext.autodoc', - 'sphinx.ext.napoleon', - 'sphinx.ext.mathjax', - 'sphinx.ext.intersphinx', + "matplotlib.sphinxext.plot_directive", + "sphinxcontrib.jquery", + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.mathjax", + "sphinx.ext.intersphinx", "sphinx_gallery.gen_gallery", - 'breathe', - 'recommonmark' + "breathe", + "recommonmark", ] sphinx_gallery_conf = { # path to your example scripts "examples_dirs": ["../demo/guide-python", "../demo/dask", "../demo/aft_survival"], # path to where to save gallery generated output - "gallery_dirs": ["python/examples", "python/dask-examples", "python/survival-examples"], + "gallery_dirs": [ + "python/examples", + "python/dask-examples", + "python/survival-examples", + ], "matplotlib_animations": True, } autodoc_typehints = "description" -graphviz_output_format = 'png' -plot_formats = [('svg', 300), ('png', 100), ('hires.png', 300)] +graphviz_output_format = "png" +plot_formats = [("svg", 300), ("png", 100), ("hires.png", 300)] plot_html_show_source_link = False plot_html_show_formats = False # Breathe extension variables -DOX_DIR = "doxygen" -breathe_projects = { - "xgboost": os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen/xml") -} +breathe_projects = {} +if is_readthedocs_build(): + breathe_projects = { + "xgboost": os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen/xml") + } breathe_default_project = "xgboost" # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: -source_suffix = ['.rst', '.md'] +source_suffix = [".rst", ".md"] # The encoding of source files. # source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -134,7 +193,7 @@ master_doc = 'index' # Usually you set "language" from the command line for these cases. language = "en" -autoclass_content = 'both' +autoclass_content = "both" # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: @@ -144,8 +203,10 @@ autoclass_content = 'both' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] -html_extra_path = ['./tmp'] +exclude_patterns = ["_build"] +html_extra_path = [] +if is_readthedocs_build(): + html_extra_path = [TMP_DIR] # The reST default role (used for this markup: `text`) to use for all # documents. @@ -163,7 +224,7 @@ html_extra_path = ['./tmp'] # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -186,27 +247,24 @@ html_logo = "https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/lo html_css_files = ["css/custom.css"] -html_sidebars = { - '**': ['logo-text.html', 'globaltoc.html', 'searchbox.html'] -} +html_sidebars = {"**": ["logo-text.html", "globaltoc.html", "searchbox.html"]} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]

 # Output file base name for HTML help builder.
-htmlhelp_basename = project + 'doc'
+htmlhelp_basename = project + "doc"

 # -- Options for LaTeX output ---------------------------------------------

-latex_elements = {
-}
+latex_elements = {}

 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, '%s.tex' % project, project, author, 'manual'),
+    (master_doc, "%s.tex" % project, project, author, "manual"),
 ]

 intersphinx_mapping = {
@@ -221,30 +279,5 @@ intersphinx_mapping = {
 }


-# hook for doxygen
-def run_doxygen():
-    """Run the doxygen make command in the designated folder."""
-    curdir = os.path.normpath(os.path.abspath(os.path.curdir))
-    try:
-        os.chdir(PROJECT_ROOT)
-        if not os.path.exists(DOX_DIR):
-            os.mkdir(DOX_DIR)
-        os.chdir(os.path.join(PROJECT_ROOT, DOX_DIR))
-        subprocess.check_call(["cmake", "..", "-DBUILD_C_DOC=ON", "-GNinja"])
-        subprocess.check_call(["ninja", "doc_doxygen"])
-    except OSError as e:
-        sys.stderr.write("doxygen execution failed: %s" % e)
-    finally:
-        os.chdir(curdir)
-
-
-def generate_doxygen_xml(app):
-    """Run the doxygen make commands if we're on the ReadTheDocs server"""
-    read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
-    if read_the_docs_build:
-        run_doxygen()
-
-
 def setup(app):
-    app.add_css_file('custom.css')
-    app.connect("builder-inited", generate_doxygen_xml)
+    app.add_css_file("custom.css")
diff --git a/doc/parameter.rst b/doc/parameter.rst
index 99d6f0585..1e703dacd 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -226,6 +226,18 @@ Parameters for Tree Booster
   list is a group of indices of features that are allowed to interact with each other.
   See :doc:`/tutorials/feature_interaction_constraint` for more information.

+* ``multi_strategy``, [default = ``one_output_per_tree``]
+
+  .. versionadded:: 2.0.0
+
+  .. note:: This parameter is a work in progress.
+
+  - The strategy used for training multi-target models, including multi-target regression
+    and multi-class classification. See :doc:`/tutorials/multioutput` for more information.
+
+  - ``one_output_per_tree``: One model for each target.
+  - ``multi_output_tree``: Use multi-target trees.
+
 .. _cat-param:

 Parameters for Categorical Feature
@@ -408,8 +420,17 @@ Specify the learning task and the corresponding learning objective. The objectiv

   - ``ndcg``: `Normalized Discounted Cumulative Gain <https://en.wikipedia.org/wiki/NDCG>`_
   - ``map``: `Mean Average Precision <https://en.wikipedia.org/wiki/Mean_average_precision>`_
-  - ``ndcg@n``, ``map@n``: 'n' can be assigned as an integer to cut off the top positions in the lists for evaluation.
-  - ``ndcg-``, ``map-``, ``ndcg@n-``, ``map@n-``: In XGBoost, NDCG and MAP will evaluate the score of a list without any positive samples as 1. By adding "-" in the evaluation metric XGBoost will evaluate these score as 0 to be consistent under some conditions.
+
+    The `average precision` is defined as:
+
+    .. math::
+
+       AP@l = \frac{1}{\min{(l, N)}}\sum^l_{k=1}P@k \cdot I_{(k)}
+
+    where :math:`I_{(k)}` is an indicator function that equals :math:`1` when the
+    document at :math:`k` is relevant and :math:`0` otherwise. The :math:`P@k` is the
+    precision at :math:`k`, and :math:`N` is the total number of relevant documents.
+    Lastly, the `mean average precision` is defined as the weighted average across all
+    queries.
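+
+    As a quick worked example with illustrative numbers: if the :math:`N = 2` relevant
+    documents sit at positions :math:`1` and :math:`3` of a ranked list cut off at
+    :math:`l = 3`, then :math:`P@1 = 1` and :math:`P@3 = 2/3`, so
+
+    .. math::
+
+       AP@3 = \frac{1}{\min{(3, 2)}}\left(1 + \frac{2}{3}\right) = \frac{5}{6}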
+
+  - ``ndcg@n``, ``map@n``: :math:`n` can be assigned as an integer to cut off the top
+    positions in the lists for evaluation.
+  - ``ndcg-``, ``map-``, ``ndcg@n-``, ``map@n-``: In XGBoost, the NDCG and MAP evaluate
+    the score of a list without any positive samples as :math:`1`. By appending "-" to
+    the evaluation metric name, we can ask XGBoost to evaluate these scores as :math:`0`
+    to be consistent under some conditions.
   - ``poisson-nloglik``: negative log-likelihood for Poisson regression
   - ``gamma-nloglik``: negative log-likelihood for gamma regression
   - ``cox-nloglik``: negative partial log-likelihood for Cox proportional hazards regression
diff --git a/doc/python/index.rst b/doc/python/index.rst
index 60608700b..fd34e0d43 100644
--- a/doc/python/index.rst
+++ b/doc/python/index.rst
@@ -10,6 +10,7 @@ Contents
 .. toctree::

   python_intro
+  sklearn_estimator
   python_api
   callbacks
   model
diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst
index b27542a8b..0cbf63456 100644
--- a/doc/python/python_api.rst
+++ b/doc/python/python_api.rst
@@ -41,6 +41,7 @@ Learning API
 Scikit-Learn API
 ----------------

+.. automodule:: xgboost.sklearn

 .. autoclass:: xgboost.XGBRegressor
     :members:
diff --git a/doc/python/python_intro.rst b/doc/python/python_intro.rst
index c36db91ff..505556383 100644
--- a/doc/python/python_intro.rst
+++ b/doc/python/python_intro.rst
@@ -305,7 +305,8 @@ Scikit-Learn interface
 ----------------------

 XGBoost provides an easy to use scikit-learn interface for some pre-defined models
-including regression, classification and ranking.
+including regression, classification and ranking. See :doc:`/python/sklearn_estimator`
+for more info.

 .. code-block:: python
diff --git a/doc/python/sklearn_estimator.rst b/doc/python/sklearn_estimator.rst
new file mode 100644
index 000000000..9748dbebd
--- /dev/null
+++ b/doc/python/sklearn_estimator.rst
@@ -0,0 +1,162 @@
+##########################################
+Using the Scikit-Learn Estimator Interface
+##########################################
+
+**Contents**
+
+.. contents::
+  :backlinks: none
+  :local:
+
+********
+Overview
+********
+
+In addition to the native interface, XGBoost features a sklearn estimator interface that
+conforms to the `sklearn estimator guideline
+<https://scikit-learn.org/stable/developers/develop.html>`__. It supports regression,
+classification, and learning to rank. Survival training for the sklearn estimator
+interface is still a work in progress.
+
+You can find some quick start examples at
+:ref:`sphx_glr_python_examples_sklearn_examples.py`. The main advantage of using the
+sklearn interface is that it works with most of the utilities provided by sklearn, like
+:py:func:`sklearn.model_selection.cross_validate`. Also, many other libraries recognize
+the sklearn estimator interface thanks to its popularity.
+
+With the sklearn estimator interface, we can train a classification model in only a
+couple of lines of Python code:
+
+.. code-block:: python
+
+    from sklearn.datasets import load_breast_cancer
+    from sklearn.model_selection import train_test_split
+
+    import xgboost as xgb
+
+    X, y = load_breast_cancer(return_X_y=True)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=94)
+
+    # Use "hist" for constructing the trees, with early stopping enabled.
+    clf = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=2)
+    # Fit the model, test sets are used for early stopping.
+    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
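+    # As a small illustrative addition (not part of the original example): the fitted
+    # model can predict right away; with early stopping enabled, the best iteration is
+    # used automatically.
+    y_pred = clf.predict(X_test)
+    # Save model into JSON format.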
+    clf.save_model("clf.json")
+
+The ``tree_method`` parameter specifies the method to use for constructing the trees, and
+the ``early_stopping_rounds`` parameter enables early stopping. Early stopping can help
+prevent overfitting and save time during training.
+
+**************
+Early Stopping
+**************
+
+As demonstrated in the previous example, early stopping can be enabled by the parameter
+``early_stopping_rounds``. Alternatively, there's the callback function
+:py:class:`xgboost.callback.EarlyStopping`, which can be used to specify more details
+about the behavior of early stopping, including whether XGBoost should return the best
+model instead of the full stack of trees:
+
+.. code-block:: python
+
+    early_stop = xgb.callback.EarlyStopping(
+        rounds=2, metric_name='logloss', data_name='Validation_0', save_best=True
+    )
+    clf = xgb.XGBClassifier(tree_method="hist", callbacks=[early_stop])
+    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+
+At present, XGBoost doesn't implement data splitting logic within the estimator and relies
+on the ``eval_set`` parameter of the :py:meth:`xgboost.XGBModel.fit` method. If you want
+to use early stopping to prevent overfitting, you'll need to manually split your data into
+training and testing sets with the :py:func:`sklearn.model_selection.train_test_split`
+function from the `sklearn` library. Some other machine learning algorithms, like those in
+`sklearn`, include early stopping as part of the estimator and may work with cross
+validation. However, using early stopping during cross validation may not be a perfect
+approach, because it changes the model's number of trees for each validation fold, leading
+to a different model for each fold. A better approach is to retrain the model after cross
+validation using the best hyperparameters along with early stopping. If you want to
+experiment with the idea of using cross validation with early stopping, here is a snippet
+to begin with:
+
+.. code-block:: python
+
+    from sklearn.base import clone
+    from sklearn.datasets import load_breast_cancer
+    from sklearn.model_selection import StratifiedKFold, cross_validate
+
+    import xgboost as xgb
+
+    X, y = load_breast_cancer(return_X_y=True)
+
+
+    def fit_and_score(estimator, X_train, X_test, y_train, y_test):
+        """Fit the estimator on the train set and score it on both sets."""
+        estimator.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+
+        train_score = estimator.score(X_train, y_train)
+        test_score = estimator.score(X_test, y_test)
+
+        return estimator, train_score, test_score
+
+
+    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=94)
+
+    clf = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=3)
+
+    results = {}
+
+    for train, test in cv.split(X, y):
+        X_train = X[train]
+        X_test = X[test]
+        y_train = y[train]
+        y_test = y[test]
+        est, train_score, test_score = fit_and_score(
+            clone(clf), X_train, X_test, y_train, y_test
+        )
+        results[est] = (train_score, test_score)
+
+
+***********************************
+Obtaining the native booster object
+***********************************
+
+The sklearn estimator interface primarily facilitates training and doesn't implement all
+features available in XGBoost. For instance, in order to have cached predictions,
+:py:class:`xgboost.DMatrix` needs to be used with :py:meth:`xgboost.Booster.predict`. One
+can obtain the booster object from the sklearn interface using
+:py:meth:`xgboost.XGBModel.get_booster`:
+
+.. code-block:: python
+
+    booster = clf.get_booster()
+    print(booster.num_boosted_rounds())
+
+
+**********
+Prediction
+**********
+
+When early stopping is enabled, prediction functions including the
+:py:meth:`xgboost.XGBModel.predict`, :py:meth:`xgboost.XGBModel.score`, and
+:py:meth:`xgboost.XGBModel.apply` methods will use the best model automatically, meaning
+that the :py:attr:`xgboost.XGBModel.best_iteration` is used to specify the range of trees
+used in prediction.
+
+To have cached results for incremental prediction, please use the
+:py:meth:`xgboost.Booster.predict` method instead.
+
+
+**************************
+Number of parallel threads
+**************************
+
+When working with XGBoost and other sklearn tools, you can specify how many threads you
+want to use via the ``n_jobs`` parameter. By default, XGBoost uses all the available
+threads on your computer, which can lead to some interesting consequences when combined
+with other sklearn functions like :py:func:`sklearn.model_selection.cross_validate`. If
+both XGBoost and sklearn are set to use all threads, your computer may start to slow down
+significantly due to something called "thread thrashing". To avoid this, you can simply
+set the ``n_jobs`` parameter for XGBoost to `None` (which uses all threads) and the
+``n_jobs`` parameter for sklearn to `1`. This way, both libraries can work together
+smoothly without contending for the same resources, as the sketch below shows.
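+
+.. code-block:: python
+
+    from sklearn.model_selection import cross_validate
+
+    # A minimal sketch of the setting described above: XGBoost parallelizes
+    # internally while sklearn runs the folds on a single thread.
+    clf = xgb.XGBClassifier(tree_method="hist", n_jobs=None)
+    results = cross_validate(clf, X, y, cv=5, n_jobs=1)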
diff --git a/doc/tutorials/c_api_tutorial.rst b/doc/tutorials/c_api_tutorial.rst
index ca121e1d2..090743a0f 100644
--- a/doc/tutorials/c_api_tutorial.rst
+++ b/doc/tutorials/c_api_tutorial.rst
@@ -134,7 +134,7 @@ c. Assertion technique: It works both in C/ C++. If expression evaluates to 0 (f
     // do something with booster

     //free the memory
-    XGBoosterFree(booster)
+    XGBoosterFree(booster);

     DMatrixHandle DMatrixHandle_param;
@@ -156,7 +156,7 @@ c. Assertion technique: It works both in C/ C++. If expression evaluates to 0 (f
 .. code-block:: c

     BoosterHandle booster;
-    XGBoosterSetParam(booster, "paramter_name", "0.1");
+    XGBoosterSetParam(booster, "parameter_name", "0.1");

 **************************************************************
diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst
index c010aa0e2..c66c6131f 100644
--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -190,9 +190,9 @@ Scikit-Learn wrapper object:

     booster = cls.get_booster()

-**********************
-Scikit-Learn interface
-**********************
+********************************
+Scikit-Learn Estimator Interface
+********************************

 As mentioned previously, there's another interface that mimics the scikit-learn
 estimators with higher level of of abstraction. The interface is easier to use compared to the
@@ -488,12 +488,13 @@ with dask and optuna.
 Troubleshooting
 ***************

-.. versionadded:: 1.6.0

-In some environments XGBoost might fail to resolve the IP address of the scheduler, a
-symptom is user receiving ``OSError: [Errno 99] Cannot assign requested address`` error
-during training. A quick workaround is to specify the address explicitly. To do that
-dask config is used:
+- In some environments XGBoost might fail to resolve the IP address of the scheduler; a
+  symptom is the user receiving an ``OSError: [Errno 99] Cannot assign requested address``
+  error during training. A quick workaround is to specify the address explicitly. To do
+  that, the dask config is used:
+
+  .. versionadded:: 1.6.0

 .. code-block:: python
@@ -511,10 +512,20 @@ dask config is used:

         reg = dxgb.DaskXGBRegressor()

-Please note that XGBoost requires a different port than dask. By default, on a unix-like
-system XGBoost uses the port 0 to find available ports, which may fail if a user is
-running in a restricted docker environment. In this case, please open additional ports in
-the container and specify it as in the above snippet.
+- Please note that XGBoost requires a different port than dask. By default, on a unix-like
+  system XGBoost uses port 0 to find available ports, which may fail if a user is running
+  in a restricted docker environment. In this case, please open additional ports in the
+  container and specify them as in the above snippet.
+
+- If you encounter an NCCL system error while training with GPU enabled, which usually
+  includes the error message `NCCL failure: unhandled system error`, you can specify the
+  network configuration using one of the environment variables listed in the `NCCL
+  documentation <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html>`__,
+  such as ``NCCL_SOCKET_IFNAME``. In addition, you can use ``NCCL_DEBUG`` to obtain debug
+  logs.
+
+- MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error message
+  that includes `Multiple processes within a communication group ...` upon initialization.

 ************
 IPv6 Support
 ************
@@ -564,6 +575,69 @@ computations, one can explicitly wait for results of input data before construct
 Also dask's `diagnostics dashboard `_ can be used to monitor what operations are currently being performed.

+********************
+Reproducible Results
+********************
+
+In single-node mode, we can always expect the same training result between runs as long
+as the underlying platforms are the same. However, it's difficult to obtain reproducible
+results in a distributed environment, since the tasks might get different machine
+allocations or have different amounts of available resources during different
+sessions. There are heuristics and guidelines on how to achieve it, but no proven method
+for guaranteeing such deterministic behavior. The Dask interface in XGBoost tries to
+provide reproducible results on a best-effort basis. This section highlights some known
+criteria and tries to share some insights into the issue.
+
+There are primarily two different tasks for XGBoost to carry out: training and
+inference. Inference is reproducible given the same software and hardware along with the
+same run-time configurations. The remainder of this section will focus on training.
+
+Many of the challenges come from the fact that we are using approximation algorithms: the
+sketching algorithm used to find histogram bins is an approximation to the exact quantile
+algorithm, the `AUC` metric in a distributed environment is an approximation to the exact
+`AUC` score, and floating-point numbers are approximations to real numbers. Floating-point
+arithmetic is an issue because its summation is not associative, meaning :math:`(a + b) + c`
+does not necessarily equal :math:`a + (b + c)`, even though this property holds for real
+numbers. As a result, whenever we change the order of a summation, the result can
+differ. This imposes the requirement that, in order to have reproducible output from
+XGBoost, the entire pipeline needs to be reproducible.
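+
+As a quick illustration of the non-associativity (a standalone sketch, not part of
+XGBoost):
+
+.. code-block:: python
+
+    a, b, c = 1e16, -1e16, 1.0
+    print((a + b) + c)  # 1.0
+    print(a + (b + c))  # 0.0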
+
+- The software stack is the same for each run. This goes without saying. XGBoost might
+  generate different outputs between different versions. This is expected, as we might
+  change the default value of a hyper-parameter, or the parallel strategy that generates
+  different floating-point results. We guarantee the correctness of the algorithms, but
+  there is plenty of wiggle room in the final output. The situation is similar for many
+  dependencies; for instance, the random number generator might differ from platform to
+  platform.
+
+- The hardware stack is the same for each run. This includes the number of workers and
+  the amount of available resources on each worker. XGBoost can generate different results
+  using different numbers of workers. This is caused by the approximation issue mentioned
+  previously.
+
+- Similar to the hardware constraint, the network topology is also a factor in the final
+  output. If we change the topology, the workers might be ordered differently, leading to
+  a different ordering of floating-point operations.
+
+- The random seed used in various places of the pipeline needs to be fixed.
+
+- The partitioning of data needs to be reproducible. This is related to the available
+  resources on each worker. Dask might partition the data differently for each run
+  according to its own scheduling policy. For instance, if there are some additional tasks
+  in the cluster while you are running the second training session for XGBoost, some of
+  the workers might have constrained memory and Dask may not push the training data for
+  XGBoost to that worker. This change in data partitioning can lead to different output
+  models. If you are using a shared Dask cluster, then the result is likely to vary
+  between runs.
+
+- The operations performed on dataframes need to be reproducible. Some operations, like
+  `DataFrame.merge`, are not deterministic on parallel hardware like GPUs, where the
+  order of the index might differ from run to run.
+
+Due to the aforementioned criteria, it is expected that training in a distributed
+environment will yield different results than training with a single node.
+
+
 ************
 Memory Usage
 ************
diff --git a/doc/tutorials/multioutput.rst b/doc/tutorials/multioutput.rst
index 280fb106f..983002aed 100644
--- a/doc/tutorials/multioutput.rst
+++ b/doc/tutorials/multioutput.rst
@@ -11,7 +11,11 @@ can be simultaneously classified as both sci-fi and comedy. For detailed explan
 terminologies related to different multi-output models please refer to the
 :doc:`scikit-learn user guide `.

-Internally, XGBoost builds one model for each target similar to sklearn meta estimators,
+**********************************
+Training with One-Model-Per-Target
+**********************************
+
+By default, XGBoost builds one model for each target, similar to the sklearn meta estimators,
 with the added benefit of reusing data and other integrated features like SHAP. For a
 worked example of regression, see
 :ref:`sphx_glr_python_examples_multioutput_regression.py`. For multi-label classification,
@@ -36,3 +40,26 @@ dense matrix for labels.

 The feature is still under development with limited support from objectives and metrics.
+
+*************************
+Training with Vector Leaf
+*************************
+
+.. versionadded:: 2.0
+
+.. note::
+
+   This is still a work in progress, and many features are missing.
+
+XGBoost can optionally build multi-output trees with the size of a leaf equal to the
+number of targets when the tree method `hist` is used.
+The behavior can be controlled by the ``multi_strategy`` training parameter, which can
+take the value `one_output_per_tree` (the default) for building one model per target, or
+`multi_output_tree` for building multi-output trees.
+
+.. code-block:: python
+
+  clf = xgb.XGBClassifier(tree_method="hist", multi_strategy="multi_output_tree")
+
+See :ref:`sphx_glr_python_examples_multioutput_regression.py` for a worked example with
+regression.
diff --git a/include/xgboost/cache.h b/include/xgboost/cache.h
index 781f45b1c..32e1b21ac 100644
--- a/include/xgboost/cache.h
+++ b/include/xgboost/cache.h
@@ -116,6 +116,18 @@ class DMatrixCache {
    * \param cache_size Maximum size of the cache.
    */
   explicit DMatrixCache(std::size_t cache_size) : max_size_{cache_size} {}
+
+  DMatrixCache& operator=(DMatrixCache&& that) {
+    CHECK(lock_.try_lock());
+    lock_.unlock();
+    CHECK(that.lock_.try_lock());
+    that.lock_.unlock();
+    std::swap(this->container_, that.container_);
+    std::swap(this->queue_, that.queue_);
+    std::swap(this->max_size_, that.max_size_);
+    return *this;
+  }
+
   /**
    * \brief Cache a new DMatrix if it's not in the cache already.
    *
@@ -149,6 +161,26 @@ class DMatrixCache {
     }
     return container_.at(key).value;
   }
+  /**
+   * \brief Re-initialize the item in cache.
+   *
+   * Since the shared_ptr is used to hold the item, any reference that lives outside of
+   * the cache can no longer be reached from the cache.
+   *
+   * We use reset instead of erase to avoid walking through the whole cache for renewing
+   * a single item (the cache is FIFO and needs to maintain insertion order).
+   */
+  template <typename... Args>
+  std::shared_ptr<CacheT> ResetItem(std::shared_ptr<DMatrix> m, Args const&... args) {
+    std::lock_guard guard{lock_};
+    CheckConsistent();
+    auto key = Key{m.get(), std::this_thread::get_id()};
+    auto it = container_.find(key);
+    CHECK(it != container_.cend());
+    it->second = {m, std::make_shared<CacheT>(args...)};
+    CheckConsistent();
+    return it->second.value;
+  }
   /**
    * \brief Get a const reference to the underlying hash map. Clear expired caches before
    * returning.
diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index ec78c588d..57f8a0e36 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -171,6 +171,15 @@ class MetaInfo {
    */
   void Extend(MetaInfo const& that, bool accumulate_rows, bool check_column);

+  /**
+   * @brief Synchronize the number of columns across all workers.
+   *
+   * Normally we just need to find the maximum number of columns across all workers, but
+   * in vertical federated learning, since each worker loads its own list of columns,
+   * we need to sum them.
+   */
+  void SynchronizeNumberOfColumns();
+
  private:
   void SetInfoFromHost(Context const& ctx, StringView key, Json arr);
   void SetInfoFromCUDA(Context const& ctx, StringView key, Json arr);
@@ -325,6 +334,10 @@ class SparsePage {
    * \brief Check wether the column index is sorted.
    */
   bool IsIndicesSorted(int32_t n_threads) const;
+  /**
+   * \brief Reindex the column index with an offset.
+   */
+  void Reindex(uint64_t feature_offset, int32_t n_threads);

   void SortRows(int32_t n_threads);
@@ -559,17 +572,18 @@ class DMatrix {
    * \brief Creates a new DMatrix from an external data adapter.
    *
    * \tparam AdapterT Type of the adapter.
-   * \param [in,out] adapter View onto an external data.
-   * \param missing Values to count as missing.
-   * \param nthread Number of threads for construction.
-   * \param cache_prefix (Optional) The cache prefix for external memory.
-   * \param page_size (Optional) Size of the page.
+   * \param [in,out] adapter View onto an external data.
+ * \param missing Values to count as missing. + * \param nthread Number of threads for construction. + * \param cache_prefix (Optional) The cache prefix for external memory. + * \param data_split_mode (Optional) Data split mode. * * \return a Created DMatrix. */ template static DMatrix* Create(AdapterT* adapter, float missing, int nthread, - const std::string& cache_prefix = ""); + const std::string& cache_prefix = "", + DataSplitMode data_split_mode = DataSplitMode::kRow); /** * \brief Create a new Quantile based DMatrix used for histogram based algorithm. diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h index d00f9ceaf..07758a524 100644 --- a/include/xgboost/gbm.h +++ b/include/xgboost/gbm.h @@ -9,7 +9,6 @@ #define XGBOOST_GBM_H_ #include -#include #include #include #include diff --git a/include/xgboost/json_io.h b/include/xgboost/json_io.h index e11545b04..3a73d170a 100644 --- a/include/xgboost/json_io.h +++ b/include/xgboost/json_io.h @@ -1,5 +1,5 @@ -/*! - * Copyright (c) by Contributors 2019-2022 +/** + * Copyright 2019-2023, XGBoost Contributors */ #ifndef XGBOOST_JSON_IO_H_ #define XGBOOST_JSON_IO_H_ @@ -17,44 +17,26 @@ #include namespace xgboost { -namespace detail { -// Whether char is signed is undefined, as a result we might or might not need -// static_cast and std::to_string. -template ::value>* = nullptr> -std::string CharToStr(Char c) { - static_assert(std::is_same::value); - return std::string{c}; -} - -template ::value>* = nullptr> -std::string CharToStr(Char c) { - static_assert(std::is_same::value); - return (c <= static_cast(127) ? std::string{c} : std::to_string(c)); -} -} // namespace detail - -/* +/** * \brief A json reader, currently error checking and utf-8 is not fully supported. */ class JsonReader { + public: + using Char = std::int8_t; + protected: - size_t constexpr static kMaxNumLength = - std::numeric_limits::max_digits10 + 1; + size_t constexpr static kMaxNumLength = std::numeric_limits::max_digits10 + 1; struct SourceLocation { private: - size_t pos_ { 0 }; // current position in raw_str_ + std::size_t pos_{0}; // current position in raw_str_ public: SourceLocation() = default; - size_t Pos() const { return pos_; } + size_t Pos() const { return pos_; } - void Forward() { - pos_++; - } - void Forward(uint32_t n) { - pos_ += n; - } + void Forward() { pos_++; } + void Forward(uint32_t n) { pos_ += n; } } cursor_; StringView raw_str_; @@ -62,7 +44,7 @@ class JsonReader { protected: void SkipSpaces(); - char GetNextChar() { + Char GetNextChar() { if (XGBOOST_EXPECT((cursor_.Pos() == raw_str_.size()), false)) { return -1; } @@ -71,24 +53,24 @@ class JsonReader { return ch; } - char PeekNextChar() { + Char PeekNextChar() { if (cursor_.Pos() == raw_str_.size()) { return -1; } - char ch = raw_str_[cursor_.Pos()]; + Char ch = raw_str_[cursor_.Pos()]; return ch; } /* \brief Skip spaces and consume next character. */ - char GetNextNonSpaceChar() { + Char GetNextNonSpaceChar() { SkipSpaces(); return GetNextChar(); } /* \brief Consume next character without first skipping empty space, throw when the next * character is not the expected one. 
*/ - char GetConsecutiveChar(char expected_char) { - char result = GetNextChar(); + Char GetConsecutiveChar(char expected_char) { + Char result = GetNextChar(); if (XGBOOST_EXPECT(result != expected_char, false)) { Expect(expected_char, result); } return result; } @@ -96,7 +78,7 @@ class JsonReader { void Error(std::string msg) const; // Report expected character - void Expect(char c, char got) { + void Expect(Char c, Char got) { std::string msg = "Expecting: \""; msg += c; msg += "\", got: \""; @@ -105,7 +87,7 @@ class JsonReader { } else if (got == 0) { msg += "\\0\""; } else { - msg += detail::CharToStr(got) + " \""; + msg += std::to_string(got) + " \""; } Error(msg); } diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h index 1d4e35a94..08e1ded09 100644 --- a/include/xgboost/learner.h +++ b/include/xgboost/learner.h @@ -286,8 +286,8 @@ struct LearnerModelParamLegacy; * \brief Strategy for building multi-target models. */ enum class MultiStrategy : std::int32_t { - kComposite = 0, - kMonolithic = 1, + kOneOutputPerTree = 0, + kMultiOutputTree = 1, }; /** @@ -317,7 +317,7 @@ struct LearnerModelParam { /** * \brief Strategy for building multi-target models. */ - MultiStrategy multi_strategy{MultiStrategy::kComposite}; + MultiStrategy multi_strategy{MultiStrategy::kOneOutputPerTree}; LearnerModelParam() = default; // As the old `LearnerModelParamLegacy` is still used by binary IO, we keep @@ -338,7 +338,7 @@ struct LearnerModelParam { void Copy(LearnerModelParam const& that); [[nodiscard]] bool IsVectorLeaf() const noexcept { - return multi_strategy == MultiStrategy::kMonolithic; + return multi_strategy == MultiStrategy::kMultiOutputTree; } [[nodiscard]] bst_target_t OutputLength() const noexcept { return this->num_output_group; } [[nodiscard]] bst_target_t LeafLength() const noexcept { diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index 91aeb189c..65e9de6ba 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -30,11 +30,11 @@ // decouple it from xgboost. 
#ifndef LINALG_HD -#if defined(__CUDA__) || defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__) +#if defined(__CUDA__) || defined(__NVCC__) #define LINALG_HD __host__ __device__ #else #define LINALG_HD -#endif // defined (__CUDA__) || defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__) +#endif // defined (__CUDA__) || defined(__NVCC__) #endif // LINALG_HD namespace xgboost::linalg { @@ -118,9 +118,9 @@ using IndexToTag = std::conditional_t>::value, template LINALG_HD constexpr auto UnrollLoop(Fn fn) { -#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__) +#if defined __CUDA_ARCH__ #pragma unroll n -#endif // defined __CUDA_ARCH__ || defined(__HIP_PLATFORM_AMD__) +#endif // defined __CUDA_ARCH__ for (int32_t i = 0; i < n; ++i) { fn(i); } @@ -136,7 +136,7 @@ int32_t NativePopc(T v) { inline LINALG_HD int Popc(uint32_t v) { #if defined(__CUDA_ARCH__) return __popc(v); -#elif defined(__GNUC__) || defined(__clang__) || defined(__HIP_PLATFORM_AMD__) +#elif defined(__GNUC__) || defined(__clang__) return __builtin_popcount(v); #elif defined(_MSC_VER) return __popcnt(v); @@ -148,7 +148,7 @@ inline LINALG_HD int Popc(uint32_t v) { inline LINALG_HD int Popc(uint64_t v) { #if defined(__CUDA_ARCH__) return __popcll(v); -#elif defined(__GNUC__) || defined(__clang__) || defined(__HIP_PLATFORM_AMD__) +#elif defined(__GNUC__) || defined(__clang__) return __builtin_popcountll(v); #elif defined(_MSC_VER) && _defined(_M_X64) return __popcnt64(v); @@ -530,17 +530,17 @@ class TensorView { /** * \brief Number of items in the tensor. */ - LINALG_HD std::size_t Size() const { return size_; } + [[nodiscard]] LINALG_HD std::size_t Size() const { return size_; } /** * \brief Whether this is a contiguous array, both C and F contiguous returns true. */ - LINALG_HD bool Contiguous() const { + [[nodiscard]] LINALG_HD bool Contiguous() const { return data_.size() == this->Size() || this->CContiguous() || this->FContiguous(); } /** * \brief Whether it's a c-contiguous array. */ - LINALG_HD bool CContiguous() const { + [[nodiscard]] LINALG_HD bool CContiguous() const { StrideT stride; static_assert(std::is_same::value); // It's contiguous if the stride can be calculated from shape. @@ -550,7 +550,7 @@ class TensorView { /** * \brief Whether it's a f-contiguous array. */ - LINALG_HD bool FContiguous() const { + [[nodiscard]] LINALG_HD bool FContiguous() const { StrideT stride; static_assert(std::is_same::value); // It's contiguous if the stride can be calculated from shape. diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h index dc24e882d..61dd94302 100644 --- a/include/xgboost/tree_model.h +++ b/include/xgboost/tree_model.h @@ -29,11 +29,6 @@ namespace xgboost { class Json; -#if defined(XGBOOST_USE_HIP) -#define XGBOOST_NODISCARD -#else -#define XGBOOST_NODISCARD [[nodiscard]] -#endif // FIXME(trivialfis): Once binary IO is gone, make this parameter internal as it should // not be configured by users. /*! \brief meta parameters of the tree */ @@ -64,7 +59,7 @@ struct TreeParam : public dmlc::Parameter { // Swap byte order for all fields. Useful for transporting models between machines with different // endianness (big endian vs little endian) - XGBOOST_NODISCARD TreeParam ByteSwap() const { + [[nodiscard]] TreeParam ByteSwap() const { TreeParam x = *this; dmlc::ByteSwap(&x.deprecated_num_roots, sizeof(x.deprecated_num_roots), 1); dmlc::ByteSwap(&x.num_nodes, sizeof(x.num_nodes), 1); @@ -117,7 +112,7 @@ struct RTreeNodeStat { } // Swap byte order for all fields. 
Useful for transporting models between machines with different // endianness (big endian vs little endian) - XGBOOST_NODISCARD RTreeNodeStat ByteSwap() const { + [[nodiscard]] RTreeNodeStat ByteSwap() const { RTreeNodeStat x = *this; dmlc::ByteSwap(&x.loss_chg, sizeof(x.loss_chg), 1); dmlc::ByteSwap(&x.sum_hess, sizeof(x.sum_hess), 1); @@ -183,51 +178,33 @@ class RegTree : public Model { } /*! \brief index of left child */ - XGBOOST_DEVICE XGBOOST_NODISCARD int LeftChild() const { - return this->cleft_; - } + [[nodiscard]] XGBOOST_DEVICE int LeftChild() const { return this->cleft_; } /*! \brief index of right child */ - XGBOOST_DEVICE XGBOOST_NODISCARD int RightChild() const { - return this->cright_; - } + [[nodiscard]] XGBOOST_DEVICE int RightChild() const { return this->cright_; } /*! \brief index of default child when feature is missing */ - XGBOOST_DEVICE XGBOOST_NODISCARD int DefaultChild() const { + [[nodiscard]] XGBOOST_DEVICE int DefaultChild() const { return this->DefaultLeft() ? this->LeftChild() : this->RightChild(); } /*! \brief feature index of split condition */ - XGBOOST_DEVICE XGBOOST_NODISCARD unsigned SplitIndex() const { + [[nodiscard]] XGBOOST_DEVICE unsigned SplitIndex() const { return sindex_ & ((1U << 31) - 1U); } /*! \brief when feature is unknown, whether goes to left child */ - XGBOOST_DEVICE XGBOOST_NODISCARD bool DefaultLeft() const { - return (sindex_ >> 31) != 0; - } + [[nodiscard]] XGBOOST_DEVICE bool DefaultLeft() const { return (sindex_ >> 31) != 0; } /*! \brief whether current node is leaf node */ - XGBOOST_DEVICE XGBOOST_NODISCARD bool IsLeaf() const { - return cleft_ == kInvalidNodeId; - } + [[nodiscard]] XGBOOST_DEVICE bool IsLeaf() const { return cleft_ == kInvalidNodeId; } /*! \return get leaf value of leaf node */ - XGBOOST_DEVICE XGBOOST_NODISCARD float LeafValue() const { - return (this->info_).leaf_value; - } + [[nodiscard]] XGBOOST_DEVICE float LeafValue() const { return (this->info_).leaf_value; } /*! \return get split condition of the node */ - XGBOOST_DEVICE XGBOOST_NODISCARD SplitCondT SplitCond() const { - return (this->info_).split_cond; - } + [[nodiscard]] XGBOOST_DEVICE SplitCondT SplitCond() const { return (this->info_).split_cond; } /*! \brief get parent of the node */ - XGBOOST_DEVICE XGBOOST_NODISCARD int Parent() const { - return parent_ & ((1U << 31) - 1); - } + [[nodiscard]] XGBOOST_DEVICE int Parent() const { return parent_ & ((1U << 31) - 1); } /*! \brief whether current node is left child */ - XGBOOST_DEVICE XGBOOST_NODISCARD bool IsLeftChild() const { - return (parent_ & (1U << 31)) != 0; - } + [[nodiscard]] XGBOOST_DEVICE bool IsLeftChild() const { return (parent_ & (1U << 31)) != 0; } /*! \brief whether this node is deleted */ - XGBOOST_DEVICE XGBOOST_NODISCARD bool IsDeleted() const { - return sindex_ == kDeletedNodeMarker; - } + [[nodiscard]] XGBOOST_DEVICE bool IsDeleted() const { return sindex_ == kDeletedNodeMarker; } /*! \brief whether current node is root */ - XGBOOST_DEVICE XGBOOST_NODISCARD bool IsRoot() const { return parent_ == kInvalidNodeId; } + [[nodiscard]] XGBOOST_DEVICE bool IsRoot() const { return parent_ == kInvalidNodeId; } /*! 
* \brief set the left child * \param nid node id to right child @@ -284,7 +261,7 @@ class RegTree : public Model { info_.leaf_value == b.info_.leaf_value; } - XGBOOST_NODISCARD Node ByteSwap() const { + [[nodiscard]] Node ByteSwap() const { Node x = *this; dmlc::ByteSwap(&x.parent_, sizeof(x.parent_), 1); dmlc::ByteSwap(&x.cleft_, sizeof(x.cleft_), 1); @@ -342,15 +319,13 @@ class RegTree : public Model { this->ChangeToLeaf(rid, value); } - /*! \brief model parameter */ - TreeParam param; RegTree() { - param.Init(Args{}); - nodes_.resize(param.num_nodes); - stats_.resize(param.num_nodes); - split_types_.resize(param.num_nodes, FeatureType::kNumerical); - split_categories_segments_.resize(param.num_nodes); - for (int i = 0; i < param.num_nodes; i++) { + param_.Init(Args{}); + nodes_.resize(param_.num_nodes); + stats_.resize(param_.num_nodes); + split_types_.resize(param_.num_nodes, FeatureType::kNumerical); + split_categories_segments_.resize(param_.num_nodes); + for (int i = 0; i < param_.num_nodes; i++) { nodes_[i].SetLeaf(0.0f); nodes_[i].SetParent(kInvalidNodeId); } @@ -359,10 +334,10 @@ class RegTree : public Model { * \brief Constructor that initializes the tree model with shape. */ explicit RegTree(bst_target_t n_targets, bst_feature_t n_features) : RegTree{} { - param.num_feature = n_features; - param.size_leaf_vector = n_targets; + param_.num_feature = n_features; + param_.size_leaf_vector = n_targets; if (n_targets > 1) { - this->p_mt_tree_.reset(new MultiTargetTree{¶m}); + this->p_mt_tree_.reset(new MultiTargetTree{¶m_}); } } @@ -376,17 +351,17 @@ class RegTree : public Model { } /*! \brief get const reference to nodes */ - XGBOOST_NODISCARD const std::vector& GetNodes() const { return nodes_; } + [[nodiscard]] const std::vector& GetNodes() const { return nodes_; } /*! \brief get const reference to stats */ - XGBOOST_NODISCARD const std::vector& GetStats() const { return stats_; } + [[nodiscard]] const std::vector& GetStats() const { return stats_; } /*! \brief get node statistics given nid */ RTreeNodeStat& Stat(int nid) { return stats_[nid]; } /*! \brief get node statistics given nid */ - XGBOOST_NODISCARD const RTreeNodeStat& Stat(int nid) const { + [[nodiscard]] const RTreeNodeStat& Stat(int nid) const { return stats_[nid]; } @@ -406,7 +381,7 @@ class RegTree : public Model { bool operator==(const RegTree& b) const { return nodes_ == b.nodes_ && stats_ == b.stats_ && - deleted_nodes_ == b.deleted_nodes_ && param == b.param; + deleted_nodes_ == b.deleted_nodes_ && param_ == b.param_; } /* \brief Iterate through all nodes in this tree. * @@ -439,7 +414,7 @@ class RegTree : public Model { * * \param b The other tree. */ - XGBOOST_NODISCARD bool Equal(const RegTree& b) const; + [[nodiscard]] bool Equal(const RegTree& b) const; /** * \brief Expands a leaf node into two additional leaf nodes. @@ -464,7 +439,9 @@ class RegTree : public Model { bst_float loss_change, float sum_hess, float left_sum, float right_sum, bst_node_t leaf_right_child = kInvalidNodeId); - + /** + * \brief Expands a leaf node into two additional leaf nodes for a multi-target tree. 
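+   *
+   * \param nidx The node to expand.
+   * \param split_index Feature index of the split condition.
+   * \param split_cond Split condition on the feature value.
+   * \param default_left Whether missing values go to the left child.
+   * \param base_weight Leaf weight vector of the node being expanded.
+   * \param left_weight Leaf weight vector assigned to the new left child.
+   * \param right_weight Leaf weight vector assigned to the new right child.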
+ */ void ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split_cond, bool default_left, linalg::VectorView base_weight, linalg::VectorView left_weight, @@ -490,25 +467,54 @@ class RegTree : public Model { bst_float base_weight, bst_float left_leaf_weight, bst_float right_leaf_weight, bst_float loss_change, float sum_hess, float left_sum, float right_sum); - - XGBOOST_NODISCARD bool HasCategoricalSplit() const { - return !split_categories_.empty(); - } + /** + * \brief Whether this tree has categorical split. + */ + [[nodiscard]] bool HasCategoricalSplit() const { return !split_categories_.empty(); } /** * \brief Whether this is a multi-target tree. */ - XGBOOST_NODISCARD bool IsMultiTarget() const { return static_cast(p_mt_tree_); } - XGBOOST_NODISCARD bst_target_t NumTargets() const { return param.size_leaf_vector; } - XGBOOST_NODISCARD auto GetMultiTargetTree() const { + [[nodiscard]] bool IsMultiTarget() const { return static_cast(p_mt_tree_); } + /** + * \brief The size of leaf weight. + */ + [[nodiscard]] bst_target_t NumTargets() const { return param_.size_leaf_vector; } + /** + * \brief Get the underlying implementaiton of multi-target tree. + */ + [[nodiscard]] auto GetMultiTargetTree() const { CHECK(IsMultiTarget()); return p_mt_tree_.get(); } + /** + * \brief Get the number of features. + */ + [[nodiscard]] bst_feature_t NumFeatures() const noexcept { return param_.num_feature; } + /** + * \brief Get the total number of nodes including deleted ones in this tree. + */ + [[nodiscard]] bst_node_t NumNodes() const noexcept { return param_.num_nodes; } + /** + * \brief Get the total number of valid nodes in this tree. + */ + [[nodiscard]] bst_node_t NumValidNodes() const noexcept { + return param_.num_nodes - param_.num_deleted; + } + /** + * \brief number of extra nodes besides the root + */ + [[nodiscard]] bst_node_t NumExtraNodes() const noexcept { + return param_.num_nodes - 1 - param_.num_deleted; + } + /* \brief Count number of leaves in tree. */ + [[nodiscard]] bst_node_t GetNumLeaves() const; + [[nodiscard]] bst_node_t GetNumSplitNodes() const; /*! * \brief get current depth * \param nid node id */ - XGBOOST_NODISCARD std::int32_t GetDepth(bst_node_t nid) const { + [[nodiscard]] std::int32_t GetDepth(bst_node_t nid) const { if (IsMultiTarget()) { return this->p_mt_tree_->Depth(nid); } @@ -519,6 +525,9 @@ class RegTree : public Model { } return depth; } + /** + * \brief Set the leaf weight for a multi-target tree. + */ void SetLeaf(bst_node_t nidx, linalg::VectorView weight) { CHECK(IsMultiTarget()); return this->p_mt_tree_->SetLeaf(nidx, weight); @@ -528,27 +537,15 @@ class RegTree : public Model { * \brief get maximum depth * \param nid node id */ - XGBOOST_NODISCARD int MaxDepth(int nid) const { + [[nodiscard]] int MaxDepth(int nid) const { if (nodes_[nid].IsLeaf()) return 0; - return std::max(MaxDepth(nodes_[nid].LeftChild())+1, - MaxDepth(nodes_[nid].RightChild())+1); + return std::max(MaxDepth(nodes_[nid].LeftChild()) + 1, MaxDepth(nodes_[nid].RightChild()) + 1); } /*! * \brief get maximum depth */ - int MaxDepth() { - return MaxDepth(0); - } - - /*! \brief number of extra nodes besides the root */ - XGBOOST_NODISCARD int NumExtraNodes() const { - return param.num_nodes - 1 - param.num_deleted; - } - - /* \brief Count number of leaves in tree. */ - XGBOOST_NODISCARD bst_node_t GetNumLeaves() const; - XGBOOST_NODISCARD bst_node_t GetNumSplitNodes() const; + int MaxDepth() { return MaxDepth(0); } /*! 
* \brief dense feature vector that can be taken by RegTree
@@ -575,20 +572,20 @@ class RegTree : public Model { * \brief returns the size of the feature vector * \return the size of the feature vector */ - XGBOOST_NODISCARD size_t Size() const; + [[nodiscard]] size_t Size() const; /*! * \brief get ith value * \param i feature index. * \return the i-th feature value */ - XGBOOST_NODISCARD bst_float GetFvalue(size_t i) const; + [[nodiscard]] bst_float GetFvalue(size_t i) const; /*! * \brief check whether i-th entry is missing * \param i feature index. * \return whether i-th value is missing. */ - XGBOOST_NODISCARD bool IsMissing(size_t i) const; - XGBOOST_NODISCARD bool HasMissing() const; + [[nodiscard]] bool IsMissing(size_t i) const; + [[nodiscard]] bool HasMissing() const; private:
@@ -619,34 +616,34 @@ class RegTree : public Model { * \param format the format to dump the model in * \return the string of dumped model */ - XGBOOST_NODISCARD std::string DumpModel(const FeatureMap& fmap, bool with_stats, + [[nodiscard]] std::string DumpModel(const FeatureMap& fmap, bool with_stats, std::string format) const; /*! * \brief Get split type for a node. * \param nidx Index of node. * \return The type of this split. For leaf node it's always kNumerical. */ - XGBOOST_NODISCARD FeatureType NodeSplitType(bst_node_t nidx) const { return split_types_.at(nidx); } + [[nodiscard]] FeatureType NodeSplitType(bst_node_t nidx) const { return split_types_.at(nidx); } /*! * \brief Get split types for all nodes. */ - XGBOOST_NODISCARD std::vector<FeatureType> const& GetSplitTypes() const { + [[nodiscard]] std::vector<FeatureType> const& GetSplitTypes() const { return split_types_; } - XGBOOST_NODISCARD common::Span<uint32_t const> GetSplitCategories() const { + [[nodiscard]] common::Span<uint32_t const> GetSplitCategories() const { return split_categories_; } /*! * \brief Get the bit storage for categories */ - XGBOOST_NODISCARD common::Span<uint32_t const> NodeCats(bst_node_t nidx) const { + [[nodiscard]] common::Span<uint32_t const> NodeCats(bst_node_t nidx) const { auto node_ptr = GetCategoriesMatrix().node_ptr; auto categories = GetCategoriesMatrix().categories; auto segment = node_ptr[nidx]; auto node_cats = categories.subspan(segment.beg, segment.size); return node_cats; } - XGBOOST_NODISCARD auto const& GetSplitCategoriesPtr() const { return split_categories_segments_; } + [[nodiscard]] auto const& GetSplitCategoriesPtr() const { return split_categories_segments_; } /** * \brief CSR-like matrix for categorical splits.
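The bitset view exposed by NodeCats and GetCategoriesMatrix above is what the predictors consult when routing a row through a categorical split. Below is a minimal sketch of that bit test, assuming xgboost's convention that a set bit sends the matching category to the right child; the helper name GoLeftCategorical and the fallback for out-of-range categories are illustrative, not part of this patch:

  #include <cstddef>
  #include <cstdint>

  // Route one category value through a categorical split, given the node's
  // bitset as returned by NodeCats(nidx): one bit per category, 32 per word.
  bool GoLeftCategorical(std::uint32_t const* bits, std::size_t n_words,
                         std::uint32_t category) {
    std::size_t word = category / 32;   // which word holds this category's bit
    std::uint32_t bit = category % 32;  // bit position inside that word
    if (word >= n_words) {
      // Assumption: a category never seen during training follows the left branch.
      return true;
    }
    // A set bit marks a category that goes to the right child.
    return ((bits[word] >> bit) & 1U) == 0;
  }

In terms of the CategoricalSplitMatrix view below, bits and n_words for node nidx correspond to categories.subspan(segment.beg, segment.size), exactly as NodeCats computes them.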
@@ -665,7 +662,7 @@ class RegTree : public Model { common::Span<Segment const> node_ptr; }; - XGBOOST_NODISCARD CategoricalSplitMatrix GetCategoriesMatrix() const { + [[nodiscard]] CategoricalSplitMatrix GetCategoriesMatrix() const { CategoricalSplitMatrix view; view.split_type = common::Span<FeatureType const>(this->GetSplitTypes()); view.categories = this->GetSplitCategories();
@@ -673,55 +670,55 @@ return view; } - XGBOOST_NODISCARD bst_feature_t SplitIndex(bst_node_t nidx) const { + [[nodiscard]] bst_feature_t SplitIndex(bst_node_t nidx) const { if (IsMultiTarget()) { return this->p_mt_tree_->SplitIndex(nidx); } return (*this)[nidx].SplitIndex(); } - XGBOOST_NODISCARD float SplitCond(bst_node_t nidx) const { + [[nodiscard]] float SplitCond(bst_node_t nidx) const { if (IsMultiTarget()) { return this->p_mt_tree_->SplitCond(nidx); } return (*this)[nidx].SplitCond(); } - XGBOOST_NODISCARD bool DefaultLeft(bst_node_t nidx) const { + [[nodiscard]] bool DefaultLeft(bst_node_t nidx) const { if (IsMultiTarget()) { return this->p_mt_tree_->DefaultLeft(nidx); } return (*this)[nidx].DefaultLeft(); } - XGBOOST_NODISCARD bool IsRoot(bst_node_t nidx) const { + [[nodiscard]] bool IsRoot(bst_node_t nidx) const { if (IsMultiTarget()) { return nidx == kRoot; } return (*this)[nidx].IsRoot(); } - XGBOOST_NODISCARD bool IsLeaf(bst_node_t nidx) const { + [[nodiscard]] bool IsLeaf(bst_node_t nidx) const { if (IsMultiTarget()) { return this->p_mt_tree_->IsLeaf(nidx); } return (*this)[nidx].IsLeaf(); } - XGBOOST_NODISCARD bst_node_t Parent(bst_node_t nidx) const { + [[nodiscard]] bst_node_t Parent(bst_node_t nidx) const { if (IsMultiTarget()) { return this->p_mt_tree_->Parent(nidx); } return (*this)[nidx].Parent(); } - XGBOOST_NODISCARD bst_node_t LeftChild(bst_node_t nidx) const { + [[nodiscard]] bst_node_t LeftChild(bst_node_t nidx) const { if (IsMultiTarget()) { return this->p_mt_tree_->LeftChild(nidx); } return (*this)[nidx].LeftChild(); } - XGBOOST_NODISCARD bst_node_t RightChild(bst_node_t nidx) const { + [[nodiscard]] bst_node_t RightChild(bst_node_t nidx) const { if (IsMultiTarget()) { return this->p_mt_tree_->RightChild(nidx); } return (*this)[nidx].RightChild(); } - XGBOOST_NODISCARD bool IsLeftChild(bst_node_t nidx) const { + [[nodiscard]] bool IsLeftChild(bst_node_t nidx) const { if (IsMultiTarget()) { CHECK_NE(nidx, kRoot); auto p = this->p_mt_tree_->Parent(nidx);
@@ -729,7 +726,7 @@ class RegTree : public Model { } return (*this)[nidx].IsLeftChild(); } - XGBOOST_NODISCARD bst_node_t Size() const { + [[nodiscard]] bst_node_t Size() const { if (IsMultiTarget()) { return this->p_mt_tree_->Size(); }
@@ -740,6 +737,8 @@ template <bool typed> void LoadCategoricalSplit(Json const& in); void SaveCategoricalSplit(Json* p_out) const; + /*! \brief model parameter */ + TreeParam param_; // vector of nodes std::vector<Node> nodes_; // free node space, used during training process
@@ -757,20 +756,20 @@ // allocate a new node, // !!!!!!
NOTE: may cause BUG here, nodes.resize bst_node_t AllocNode() { - if (param.num_deleted != 0) { + if (param_.num_deleted != 0) { int nid = deleted_nodes_.back(); deleted_nodes_.pop_back(); nodes_[nid].Reuse(); - --param.num_deleted; + --param_.num_deleted; return nid; } - int nd = param.num_nodes++; - CHECK_LT(param.num_nodes, std::numeric_limits::max()) + int nd = param_.num_nodes++; + CHECK_LT(param_.num_nodes, std::numeric_limits::max()) << "number of nodes in the tree exceed 2^31"; - nodes_.resize(param.num_nodes); - stats_.resize(param.num_nodes); - split_types_.resize(param.num_nodes, FeatureType::kNumerical); - split_categories_segments_.resize(param.num_nodes); + nodes_.resize(param_.num_nodes); + stats_.resize(param_.num_nodes); + split_types_.resize(param_.num_nodes, FeatureType::kNumerical); + split_categories_segments_.resize(param_.num_nodes); return nd; } // delete a tree node, keep the parent field to allow trace back @@ -785,7 +784,7 @@ class RegTree : public Model { deleted_nodes_.push_back(nid); nodes_[nid].MarkDelete(); - ++param.num_deleted; + ++param_.num_deleted; } }; diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 852cf7f69..a5d219040 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -37,7 +37,7 @@ 3.1.1 2.12.8 2.12 - 3.3.4 + 3.3.5 5 OFF OFF @@ -118,7 +118,7 @@ org.apache.maven.plugins maven-release-plugin - 2.5.3 + 3.0.0 true false @@ -427,7 +427,7 @@ org.apache.maven.plugins maven-surefire-plugin - 2.22.2 + 3.0.0 false false diff --git a/jvm-packages/xgboost4j-example/README.md b/jvm-packages/xgboost4j-example/README.md index 4718f212f..50f268e83 100644 --- a/jvm-packages/xgboost4j-example/README.md +++ b/jvm-packages/xgboost4j-example/README.md @@ -1,30 +1,30 @@ -XGBoost4J Code Examples -======================= - -## Java API -* [Basic walkthrough of wrappers](src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java) -* [Customize loss function, and evaluation metric](src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java) -* [Boosting from existing prediction](src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java) -* [Predicting using first n trees](src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java) -* [Generalized Linear Model](src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java) -* [Cross validation](src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java) -* [Predicting leaf indices](src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java) -* [External Memory](src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java) -* [Early Stopping](src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java) - -## Scala API - -* [Basic walkthrough of wrappers](src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala) -* [Customize loss function, and evaluation metric](src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala) -* [Boosting from existing prediction](src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala) -* [Predicting using first n trees](src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala) -* [Generalized Linear Model](src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala) -* [Cross validation](src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala) -* [Predicting leaf indices](src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala) -* [External 
Memory](src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala) - -## Spark API -* [Distributed Training with Spark](src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala) - -## Flink API -* [Distributed Training with Flink](src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala) +XGBoost4J Code Examples +======================= + +## Java API +* [Basic walkthrough of wrappers](src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java) +* [Customize loss function, and evaluation metric](src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java) +* [Boosting from existing prediction](src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java) +* [Predicting using first n trees](src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java) +* [Generalized Linear Model](src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java) +* [Cross validation](src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java) +* [Predicting leaf indices](src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java) +* [External Memory](src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java) +* [Early Stopping](src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java) + +## Scala API + +* [Basic walkthrough of wrappers](src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala) +* [Customize loss function, and evaluation metric](src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala) +* [Boosting from existing prediction](src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala) +* [Predicting using first n trees](src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala) +* [Generalized Linear Model](src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala) +* [Cross validation](src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala) +* [Predicting leaf indices](src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala) +* [External Memory](src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala) + +## Spark API +* [Distributed Training with Spark](src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala) + +## Flink API +* [Distributed Training with Flink](src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala) diff --git a/jvm-packages/xgboost4j-flink/pom.xml b/jvm-packages/xgboost4j-flink/pom.xml index e48feb876..b8b757eae 100644 --- a/jvm-packages/xgboost4j-flink/pom.xml +++ b/jvm-packages/xgboost4j-flink/pom.xml @@ -51,7 +51,7 @@ org.apache.hadoop hadoop-common - 3.3.4 + 3.3.5 diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml index 4d35d2e76..1da88c3cc 100644 --- a/jvm-packages/xgboost4j-gpu/pom.xml +++ b/jvm-packages/xgboost4j-gpu/pom.xml @@ -41,13 +41,13 @@ com.typesafe.akka akka-actor_${scala.binary.version} - 2.7.0 + 2.6.20 compile com.typesafe.akka akka-testkit_${scala.binary.version} - 2.7.0 + 2.6.20 test diff --git a/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java b/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java index 49d17b6be..25705fd1b 100644 --- a/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java +++ b/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java @@ -84,9 +84,10 @@ public class BoosterTest { }; try (Table tmpTable = 
Table.readCSV(schema, opts, new File(trainingDataPath))) { - ColumnVector[] df = new ColumnVector[12]; - for (int i = 0; i < 12; ++i) { - df[i] = tmpTable.getColumn(i); + ColumnVector[] df = new ColumnVector[10]; + // exclude the first two columns, they are label bounds and contain inf. + for (int i = 2; i < 12; ++i) { + df[i - 2] = tmpTable.getColumn(i); } try (Table X = new Table(df);) { ColumnVector[] labels = new ColumnVector[1]; diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostClassifierSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostClassifierSuite.scala index fc26b2985..7e24fe0dd 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostClassifierSuite.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostClassifierSuite.scala @@ -21,7 +21,7 @@ import java.io.File import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier} import org.apache.spark.ml.feature.VectorAssembler -import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.functions.{col, udf, when} import org.apache.spark.sql.types.{FloatType, StructField, StructType} class GpuXGBoostClassifierSuite extends GpuTestSuite { @@ -47,7 +47,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite { "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist", "features_cols" -> featureNames, "label_col" -> labelName) val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema) - .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1) + .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0)) + .randomSplit(Array(0.7, 0.3), seed = 1) // Get a model val model = new XGBoostClassifier(xgbParam) .fit(originalDf) @@ -64,7 +65,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite { "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist", "features_cols" -> featureNames, "label_col" -> labelName) val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema) - .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1) + .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0)) + .randomSplit(Array(0.7, 0.3), seed = 1) val getWeightFromF1 = udf({ f1: Float => if (f1.toInt % 2 == 0) 1.0f else 0.001f }) val dfWithWeight = originalDf.withColumn("weight", getWeightFromF1(col("f1"))) @@ -87,7 +89,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite { val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic", "num_round" -> 10, "num_workers" -> 1) val Array(rawInput, testDf) = spark.read.option("header", "true").schema(schema) - .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1) + .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0)) + .randomSplit(Array(0.7, 0.3), seed = 1) val classifier = new XGBoostClassifier(xgbParam) .setFeaturesCol(featureNames) @@ -122,7 +125,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite { val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic", "num_round" -> 10, "num_workers" -> 1) val Array(rawInput, _) = spark.read.option("header", "true").schema(schema) - .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1) + .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0)) + .randomSplit(Array(0.7, 0.3), seed = 1) val 
vectorAssembler = new VectorAssembler() .setHandleInvalid("keep") @@ -144,7 +148,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite { // transform on GPU withGpuSparkSession() { spark => val Array(_, testDf) = spark.read.option("header", "true").schema(schema) - .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1) + .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0)) + .randomSplit(Array(0.7, 0.3), seed = 1) // Since CPU model does not know the information about the features cols that GPU transform // pipeline requires. End user needs to setFeaturesCol(features: Array[String]) in the model @@ -174,7 +179,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite { val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic", "num_round" -> 10, "num_workers" -> 1) val Array(rawInput, _) = spark.read.option("header", "true").schema(schema) - .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1) + .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0)) + .randomSplit(Array(0.7, 0.3), seed = 1) val classifier = new XGBoostClassifier(xgbParam) .setFeaturesCol(featureNames) @@ -190,7 +196,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite { // transform on CPU withCpuSparkSession() { spark => val Array(_, rawInput) = spark.read.option("header", "true").schema(schema) - .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1) + .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0)) + .randomSplit(Array(0.7, 0.3), seed = 1) val featureColName = "feature_col" val vectorAssembler = new VectorAssembler() diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/rank.test.csv b/jvm-packages/xgboost4j-spark/src/test/resources/rank.test.csv index 83bf8b080..729732e5b 100644 --- a/jvm-packages/xgboost4j-spark/src/test/resources/rank.test.csv +++ b/jvm-packages/xgboost4j-spark/src/test/resources/rank.test.csv @@ -1,66 +1,66 @@ -0,10.0229017899,7.30178495562,0.118115020017,1 -0,9.93639621859,9.93102159291,0.0435030004396,1 -0,10.1301737265,0.00411765220572,2.4165878053,1 -1,9.87828587087,0.608588414992,0.111262590883,1 -0,10.1373430048,0.47764012225,0.991553052194,1 -0,10.0523814718,4.72152505167,0.672978832666,1 -0,10.0449715742,8.40373928536,0.384457573667,1 -1,996.398498791,941.976309154,0.230269231292,2 -0,1005.11269468,900.093680877,0.265031528873,2 -0,997.160349441,891.331101688,2.19362017313,2 -0,993.754139031,44.8000165317,1.03868009875,2 -1,994.831299184,241.959208453,0.667631827024,2 -0,995.948333283,7.94326917112,0.750490877118,3 -0,989.733981273,7.52077625436,0.0126335967282,3 -0,1003.54086516,6.48177510564,1.19441696788,3 -0,996.56177804,9.71959812613,1.33082465111,3 -0,1005.61382467,0.234339369309,1.17987797356,3 -1,980.215758708,6.85554542926,2.63965085259,3 -1,987.776408872,2.23354609991,0.841885278028,3 -0,1006.54260396,8.12142049834,2.26639471174,3 -0,1009.87927639,6.40028519044,0.775155669615,3 -0,9.95006244393,928.76896718,234.948458244,4 -1,10.0749152258,255.294574476,62.9728604166,4 -1,10.1916541988,312.682867085,92.299413677,4 -0,9.95646724484,742.263188416,53.3310473654,4 -0,9.86211293222,996.237023866,2.00760301168,4 -1,9.91801019468,303.971783709,50.3147230679,4 -0,996.983996934,9.52188222766,1.33588120981,5 -0,995.704388126,9.49260524915,0.908498516541,5 -0,987.86480767,0.0870786716821,0.108859297837,5 -0,1000.99561307,2.85272694575,0.171134518956,5 -0,1011.05508066,7.55336771768,1.04950084825,5 -1,985.52199365,0.763305780608,1.7402424375,5 
-0,10.0430321467,813.185427181,4.97728254185,6 -0,10.0812334228,258.297288417,0.127477670549,6 -0,9.84210504292,887.205815261,0.991689193955,6 -1,9.94625332613,0.298622762132,0.147881353231,6 -0,9.97800659954,727.619819757,0.0718361141866,6 -1,9.8037938472,957.385549617,0.0618862028941,6 -0,10.0880634741,185.024638577,1.7028095095,6 -0,9.98630799154,109.10631473,0.681117359751,6 -0,9.91671416638,166.248076588,122.538291094,7 -0,10.1206910464,88.1539468531,141.189859069,7 -1,10.1767160518,1.02960996847,172.02256237,7 -0,9.93025147233,391.196641942,58.040338247,7 -0,9.84850936037,474.63346537,17.5627875397,7 -1,9.8162731343,61.9199554213,30.6740972851,7 -0,10.0403482984,987.50416929,73.0472906209,7 -1,997.019228359,133.294717663,0.0572254083186,8 -0,973.303999107,1.79080888849,0.100478717048,8 -0,1008.28808825,342.282350685,0.409806485495,8 -0,1014.55621524,0.680510407082,0.929530602495,8 -1,1012.74370325,823.105266455,0.0894693730585,8 -0,1003.63554038,727.334432075,0.58206275756,8 -0,10.1560432436,740.35938307,11.6823378533,9 -0,9.83949099701,512.828227154,138.206666681,9 -1,10.1837395682,179.287126088,185.479062365,9 -1,9.9761881495,12.1093388336,9.1264604171,9 -1,9.77402180766,318.561317743,80.6005221355,9 -0,1011.15705381,0.215825852155,1.34429667906,10 -0,1005.60353229,727.202346126,1.47146041005,10 -1,1013.93702961,58.7312725205,0.421041560754,10 -0,1004.86813074,757.693204258,0.566055205344,10 -0,999.996324692,813.12386828,0.864428279513,10 -0,996.55255931,918.760056995,0.43365051974,10 -1,1004.1394132,464.371823646,0.312492288321,10 +0,10.0229017899,7.30178495562,0.118115020017,1 +0,9.93639621859,9.93102159291,0.0435030004396,1 +0,10.1301737265,0.00411765220572,2.4165878053,1 +1,9.87828587087,0.608588414992,0.111262590883,1 +0,10.1373430048,0.47764012225,0.991553052194,1 +0,10.0523814718,4.72152505167,0.672978832666,1 +0,10.0449715742,8.40373928536,0.384457573667,1 +1,996.398498791,941.976309154,0.230269231292,2 +0,1005.11269468,900.093680877,0.265031528873,2 +0,997.160349441,891.331101688,2.19362017313,2 +0,993.754139031,44.8000165317,1.03868009875,2 +1,994.831299184,241.959208453,0.667631827024,2 +0,995.948333283,7.94326917112,0.750490877118,3 +0,989.733981273,7.52077625436,0.0126335967282,3 +0,1003.54086516,6.48177510564,1.19441696788,3 +0,996.56177804,9.71959812613,1.33082465111,3 +0,1005.61382467,0.234339369309,1.17987797356,3 +1,980.215758708,6.85554542926,2.63965085259,3 +1,987.776408872,2.23354609991,0.841885278028,3 +0,1006.54260396,8.12142049834,2.26639471174,3 +0,1009.87927639,6.40028519044,0.775155669615,3 +0,9.95006244393,928.76896718,234.948458244,4 +1,10.0749152258,255.294574476,62.9728604166,4 +1,10.1916541988,312.682867085,92.299413677,4 +0,9.95646724484,742.263188416,53.3310473654,4 +0,9.86211293222,996.237023866,2.00760301168,4 +1,9.91801019468,303.971783709,50.3147230679,4 +0,996.983996934,9.52188222766,1.33588120981,5 +0,995.704388126,9.49260524915,0.908498516541,5 +0,987.86480767,0.0870786716821,0.108859297837,5 +0,1000.99561307,2.85272694575,0.171134518956,5 +0,1011.05508066,7.55336771768,1.04950084825,5 +1,985.52199365,0.763305780608,1.7402424375,5 +0,10.0430321467,813.185427181,4.97728254185,6 +0,10.0812334228,258.297288417,0.127477670549,6 +0,9.84210504292,887.205815261,0.991689193955,6 +1,9.94625332613,0.298622762132,0.147881353231,6 +0,9.97800659954,727.619819757,0.0718361141866,6 +1,9.8037938472,957.385549617,0.0618862028941,6 +0,10.0880634741,185.024638577,1.7028095095,6 +0,9.98630799154,109.10631473,0.681117359751,6 
+0,9.91671416638,166.248076588,122.538291094,7 +0,10.1206910464,88.1539468531,141.189859069,7 +1,10.1767160518,1.02960996847,172.02256237,7 +0,9.93025147233,391.196641942,58.040338247,7 +0,9.84850936037,474.63346537,17.5627875397,7 +1,9.8162731343,61.9199554213,30.6740972851,7 +0,10.0403482984,987.50416929,73.0472906209,7 +1,997.019228359,133.294717663,0.0572254083186,8 +0,973.303999107,1.79080888849,0.100478717048,8 +0,1008.28808825,342.282350685,0.409806485495,8 +0,1014.55621524,0.680510407082,0.929530602495,8 +1,1012.74370325,823.105266455,0.0894693730585,8 +0,1003.63554038,727.334432075,0.58206275756,8 +0,10.1560432436,740.35938307,11.6823378533,9 +0,9.83949099701,512.828227154,138.206666681,9 +1,10.1837395682,179.287126088,185.479062365,9 +1,9.9761881495,12.1093388336,9.1264604171,9 +1,9.77402180766,318.561317743,80.6005221355,9 +0,1011.15705381,0.215825852155,1.34429667906,10 +0,1005.60353229,727.202346126,1.47146041005,10 +1,1013.93702961,58.7312725205,0.421041560754,10 +0,1004.86813074,757.693204258,0.566055205344,10 +0,999.996324692,813.12386828,0.864428279513,10 +0,996.55255931,918.760056995,0.43365051974,10 +1,1004.1394132,464.371823646,0.312492288321,10 diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/rank.train.csv b/jvm-packages/xgboost4j-spark/src/test/resources/rank.train.csv index ebe232b51..bec3b034c 100644 --- a/jvm-packages/xgboost4j-spark/src/test/resources/rank.train.csv +++ b/jvm-packages/xgboost4j-spark/src/test/resources/rank.train.csv @@ -1,149 +1,149 @@ -0,985.574005058,320.223538037,0.621236086198,1 -0,1010.52917943,635.535543082,2.14984030531,1 -0,1012.91900422,132.387300057,0.488761066665,1 -0,990.829194034,135.102081162,0.747701610673,1 -0,1007.05103629,154.289183562,0.464118249201,1 -0,994.9573036,317.483732878,0.0313685555674,1 -0,987.8071541,731.349178363,0.244616944245,1 -1,10.0349544469,2.29750906143,36.4949974282,2 -0,9.92953881383,5.39134047297,120.041297548,2 -0,10.0909866713,9.06191026312,138.807825798,2 -1,10.2090970614,0.0784495944448,58.207703565,2 -0,9.85695905893,9.99500727713,56.8610243778,2 -1,10.0805758547,0.0410805760559,222.102302076,2 -0,10.1209914486,9.9729127088,171.888238763,2 -0,10.0331939798,0.853339303793,311.181328375,3 -0,9.93901762951,2.72757449146,78.4859514413,3 -0,10.0752365346,9.18695328235,49.8520256553,3 -1,10.0456548902,0.270936043122,123.462958597,3 -0,10.0568923673,0.82997113263,44.9391426001,3 -0,9.8214143472,0.277538931578,15.4217659578,3 -0,9.95258604431,8.69564346094,255.513470671,3 -0,9.91934976357,7.72809741413,82.171591817,3 -0,10.043239582,8.64168255553,38.9657919329,3 -1,10.0236147929,0.0496662263659,4.40889812286,3 -1,1001.85585324,3.75646886071,0.0179224994842,4 -0,1014.25578571,0.285765311201,0.510329864983,4 -1,1002.81422786,9.77676280375,0.433705951912,4 -1,998.072711553,2.82100686538,0.889829076909,4 -0,1003.77395036,2.55916592114,0.0359402151496,4 -1,10.0807877782,4.98513959013,47.5266363559,5 -0,10.0015013081,9.94302478763,78.3697486277,5 -1,10.0441936789,0.305091816635,56.8213984987,5 -0,9.94257106618,7.23909568913,442.463339039,5 -1,9.86479307916,6.41701315844,55.1365304834,5 -0,10.0428628516,9.98466447697,0.391632812588,5 -0,9.94445884566,9.99970945878,260.438436534,5 -1,9.84641392823,225.78051312,1.00525978847,6 -1,9.86907690608,26.8971083147,0.577959255991,6 -0,10.0177314626,0.110585342313,2.30545043031,6 -0,10.0688190907,412.023866234,1.22421542264,6 -0,10.1251769646,13.8212202925,0.129171734504,6 -0,10.0840758802,407.359097187,0.477000870705,6 
-0,10.1007458705,987.183625145,0.149385677415,6 -0,9.86472656059,169.559640615,0.147221652519,6 -0,9.94207419238,507.290053755,0.41996207214,6 -0,9.9671005502,1.62610457716,0.408173666788,6 -0,1010.57126596,9.06673707562,0.672092284372,7 -0,1001.6718262,9.53203990055,4.7364050044,7 -0,995.777341384,4.43847316256,2.07229073634,7 -0,1002.95701386,5.51711016665,1.24294450546,7 -0,1016.0988238,0.626468941906,0.105627919134,7 -0,1013.67571419,0.042315529666,0.717619310322,7 -1,994.747747892,6.01989364024,0.772910130015,7 -1,991.654593872,7.35575736952,1.19822091548,7 -0,1008.47101732,8.28240754909,0.229582481359,7 -0,1000.81975227,1.52448354056,0.096441660362,7 -0,10.0900922344,322.656649307,57.8149073088,8 -1,10.0868337371,2.88652339174,54.8865514572,8 -0,10.0988984137,979.483832657,52.6809830901,8 -0,9.97678959238,665.770979738,481.069628909,8 -0,9.78554312773,257.309358658,47.7324475232,8 -0,10.0985967566,935.896512941,138.937052808,8 -0,10.0522252319,876.376299607,6.00373510669,8 -1,9.88065229501,9.99979825653,0.0674603696149,9 -0,10.0483244098,0.0653852316381,0.130679349938,9 -1,9.99685215607,1.76602542774,0.2551321159,9 -0,9.99750159428,1.01591534436,0.145445506504,9 -1,9.97380908941,0.940048645571,0.411805696316,9 -0,9.99977678382,6.91329929641,5.57858201258,9 -0,978.876096381,933.775364741,0.579170824236,10 -0,998.381016406,220.940470582,2.01491778565,10 -0,987.917644594,8.74667873567,0.364006099758,10 -0,1000.20994892,25.2945450565,3.5684398964,10 -0,1014.57141264,675.593540733,0.164174055535,10 -0,998.867283535,765.452750642,0.818425293238,10 -0,10.2143092481,273.576539531,137.111774354,11 -0,10.0366658918,842.469052609,2.32134375927,11 -0,10.1281202091,395.654057342,35.4184893063,11 -0,10.1443721289,960.058461049,272.887070637,11 -0,10.1353234784,535.51304462,2.15393842032,11 -1,10.0451640374,216.733858424,55.6533298016,11 -1,9.94254592171,44.5985537358,304.614176871,11 -0,10.1319257181,613.545504487,5.42391587912,11 -0,1020.63622468,997.476744201,0.509425590461,12 -0,986.304585519,822.669937965,0.605133561808,12 -1,1012.66863221,26.7185759069,0.0875458784828,12 -0,995.387656321,81.8540176995,0.691999430068,12 -0,1020.6587198,848.826964547,0.540159430526,12 -1,1003.81573853,379.84350931,0.0083682925194,12 -0,1021.60921516,641.376951467,1.12339054807,12 -0,1000.17585041,122.107138713,1.09906375372,12 -1,987.64802348,5.98448541152,0.124241987204,12 -1,9.94610136583,346.114985897,0.387708236565,13 -0,9.96812192337,313.278109696,0.00863026595671,13 -0,10.0181739194,36.7378924562,2.92179879835,13 -0,9.89000102695,164.273723971,0.685222591968,13 -0,10.1555212436,320.451459462,2.01341536261,13 -0,10.0085727613,999.767117646,0.462294934168,13 -1,9.93099658724,5.17478203909,0.213855205032,13 -0,10.0629454957,663.088181857,0.049022351462,13 -0,10.1109732417,734.904569784,1.6998450094,13 -0,1006.6015266,505.023453703,1.90870566777,14 -0,991.865769489,245.437343115,0.475109744256,14 -0,998.682734072,950.041057232,1.9256314201,14 -0,1005.02207209,2.9619314197,0.0517146822357,14 -0,1002.54526214,860.562681899,0.915687092848,14 -0,1000.38847359,808.416525088,0.209690673808,14 -1,992.557818382,373.889409453,0.107571728577,14 -0,1002.07722137,997.329626371,1.06504260496,14 -0,1000.40504333,949.832139189,0.539159980327,14 -0,10.1460179902,8.86082969819,135.953842715,15 -1,9.98529296553,2.87366448495,1.74249892194,15 -0,9.88942676744,9.4031821056,149.473066381,15 -1,10.0192953341,1.99685737576,1.79502473397,15 -0,10.0110654379,8.13112593726,87.7765628103,15 
-0,997.148677047,733.936190093,1.49298494242,16 -0,1008.70465919,957.121652078,0.217414013634,16 -1,997.356154278,541.599587807,0.100855972216,16 -0,999.615897283,943.700501824,0.862874175879,16 -1,997.36859077,0.200859940848,0.13601892182,16 -0,10.0423255624,1.73855202168,0.956695338485,17 -1,9.88440755486,9.9994600678,0.305080529665,17 -0,10.0891026412,3.28031719474,0.364450973697,17 -0,9.90078644258,8.77839663617,0.456660574479,17 -1,9.79380029711,8.77220326156,0.527292005175,17 -0,9.93613887011,9.76270841268,1.40865693823,17 -0,10.0009239007,7.29056178263,0.498015866607,17 -0,9.96603319905,5.12498000925,0.517492532783,17 -0,10.0923827222,2.76652583955,1.56571226159,17 -1,10.0983782035,587.788120694,0.031756483687,18 -1,9.91397225464,994.527496819,3.72092164978,18 -0,10.1057472738,2.92894440088,0.683506438532,18 -0,10.1014053354,959.082038017,1.07039624129,18 -0,10.1433253044,322.515119317,0.51408278993,18 -1,9.82832510699,637.104433908,0.250272776427,18 -0,1000.49729075,2.75336888111,0.576634423274,19 -1,984.90338088,0.0295435794035,1.26273339929,19 -0,1001.53811442,4.64164410861,0.0293389959504,19 -1,995.875898395,5.08223403205,0.382330566779,19 -0,996.405937252,6.26395190757,0.453645816611,19 -0,10.0165140779,340.126072514,0.220794603312,20 -0,9.93482824816,951.672000448,0.124406293612,20 -0,10.1700278554,0.0140985961008,0.252452256311,20 -0,9.99825079542,950.382643896,0.875382402062,20 -0,9.87316410028,686.788257829,0.215886999825,20 -0,10.2893240654,89.3947931451,0.569578232133,20 -0,9.98689192703,0.430107535413,2.99869831728,20 -0,10.1365175107,972.279245093,0.0865099386744,20 -0,9.90744703306,50.810461183,3.00863325197,20 +0,985.574005058,320.223538037,0.621236086198,1 +0,1010.52917943,635.535543082,2.14984030531,1 +0,1012.91900422,132.387300057,0.488761066665,1 +0,990.829194034,135.102081162,0.747701610673,1 +0,1007.05103629,154.289183562,0.464118249201,1 +0,994.9573036,317.483732878,0.0313685555674,1 +0,987.8071541,731.349178363,0.244616944245,1 +1,10.0349544469,2.29750906143,36.4949974282,2 +0,9.92953881383,5.39134047297,120.041297548,2 +0,10.0909866713,9.06191026312,138.807825798,2 +1,10.2090970614,0.0784495944448,58.207703565,2 +0,9.85695905893,9.99500727713,56.8610243778,2 +1,10.0805758547,0.0410805760559,222.102302076,2 +0,10.1209914486,9.9729127088,171.888238763,2 +0,10.0331939798,0.853339303793,311.181328375,3 +0,9.93901762951,2.72757449146,78.4859514413,3 +0,10.0752365346,9.18695328235,49.8520256553,3 +1,10.0456548902,0.270936043122,123.462958597,3 +0,10.0568923673,0.82997113263,44.9391426001,3 +0,9.8214143472,0.277538931578,15.4217659578,3 +0,9.95258604431,8.69564346094,255.513470671,3 +0,9.91934976357,7.72809741413,82.171591817,3 +0,10.043239582,8.64168255553,38.9657919329,3 +1,10.0236147929,0.0496662263659,4.40889812286,3 +1,1001.85585324,3.75646886071,0.0179224994842,4 +0,1014.25578571,0.285765311201,0.510329864983,4 +1,1002.81422786,9.77676280375,0.433705951912,4 +1,998.072711553,2.82100686538,0.889829076909,4 +0,1003.77395036,2.55916592114,0.0359402151496,4 +1,10.0807877782,4.98513959013,47.5266363559,5 +0,10.0015013081,9.94302478763,78.3697486277,5 +1,10.0441936789,0.305091816635,56.8213984987,5 +0,9.94257106618,7.23909568913,442.463339039,5 +1,9.86479307916,6.41701315844,55.1365304834,5 +0,10.0428628516,9.98466447697,0.391632812588,5 +0,9.94445884566,9.99970945878,260.438436534,5 +1,9.84641392823,225.78051312,1.00525978847,6 +1,9.86907690608,26.8971083147,0.577959255991,6 +0,10.0177314626,0.110585342313,2.30545043031,6 
+0,10.0688190907,412.023866234,1.22421542264,6 +0,10.1251769646,13.8212202925,0.129171734504,6 +0,10.0840758802,407.359097187,0.477000870705,6 +0,10.1007458705,987.183625145,0.149385677415,6 +0,9.86472656059,169.559640615,0.147221652519,6 +0,9.94207419238,507.290053755,0.41996207214,6 +0,9.9671005502,1.62610457716,0.408173666788,6 +0,1010.57126596,9.06673707562,0.672092284372,7 +0,1001.6718262,9.53203990055,4.7364050044,7 +0,995.777341384,4.43847316256,2.07229073634,7 +0,1002.95701386,5.51711016665,1.24294450546,7 +0,1016.0988238,0.626468941906,0.105627919134,7 +0,1013.67571419,0.042315529666,0.717619310322,7 +1,994.747747892,6.01989364024,0.772910130015,7 +1,991.654593872,7.35575736952,1.19822091548,7 +0,1008.47101732,8.28240754909,0.229582481359,7 +0,1000.81975227,1.52448354056,0.096441660362,7 +0,10.0900922344,322.656649307,57.8149073088,8 +1,10.0868337371,2.88652339174,54.8865514572,8 +0,10.0988984137,979.483832657,52.6809830901,8 +0,9.97678959238,665.770979738,481.069628909,8 +0,9.78554312773,257.309358658,47.7324475232,8 +0,10.0985967566,935.896512941,138.937052808,8 +0,10.0522252319,876.376299607,6.00373510669,8 +1,9.88065229501,9.99979825653,0.0674603696149,9 +0,10.0483244098,0.0653852316381,0.130679349938,9 +1,9.99685215607,1.76602542774,0.2551321159,9 +0,9.99750159428,1.01591534436,0.145445506504,9 +1,9.97380908941,0.940048645571,0.411805696316,9 +0,9.99977678382,6.91329929641,5.57858201258,9 +0,978.876096381,933.775364741,0.579170824236,10 +0,998.381016406,220.940470582,2.01491778565,10 +0,987.917644594,8.74667873567,0.364006099758,10 +0,1000.20994892,25.2945450565,3.5684398964,10 +0,1014.57141264,675.593540733,0.164174055535,10 +0,998.867283535,765.452750642,0.818425293238,10 +0,10.2143092481,273.576539531,137.111774354,11 +0,10.0366658918,842.469052609,2.32134375927,11 +0,10.1281202091,395.654057342,35.4184893063,11 +0,10.1443721289,960.058461049,272.887070637,11 +0,10.1353234784,535.51304462,2.15393842032,11 +1,10.0451640374,216.733858424,55.6533298016,11 +1,9.94254592171,44.5985537358,304.614176871,11 +0,10.1319257181,613.545504487,5.42391587912,11 +0,1020.63622468,997.476744201,0.509425590461,12 +0,986.304585519,822.669937965,0.605133561808,12 +1,1012.66863221,26.7185759069,0.0875458784828,12 +0,995.387656321,81.8540176995,0.691999430068,12 +0,1020.6587198,848.826964547,0.540159430526,12 +1,1003.81573853,379.84350931,0.0083682925194,12 +0,1021.60921516,641.376951467,1.12339054807,12 +0,1000.17585041,122.107138713,1.09906375372,12 +1,987.64802348,5.98448541152,0.124241987204,12 +1,9.94610136583,346.114985897,0.387708236565,13 +0,9.96812192337,313.278109696,0.00863026595671,13 +0,10.0181739194,36.7378924562,2.92179879835,13 +0,9.89000102695,164.273723971,0.685222591968,13 +0,10.1555212436,320.451459462,2.01341536261,13 +0,10.0085727613,999.767117646,0.462294934168,13 +1,9.93099658724,5.17478203909,0.213855205032,13 +0,10.0629454957,663.088181857,0.049022351462,13 +0,10.1109732417,734.904569784,1.6998450094,13 +0,1006.6015266,505.023453703,1.90870566777,14 +0,991.865769489,245.437343115,0.475109744256,14 +0,998.682734072,950.041057232,1.9256314201,14 +0,1005.02207209,2.9619314197,0.0517146822357,14 +0,1002.54526214,860.562681899,0.915687092848,14 +0,1000.38847359,808.416525088,0.209690673808,14 +1,992.557818382,373.889409453,0.107571728577,14 +0,1002.07722137,997.329626371,1.06504260496,14 +0,1000.40504333,949.832139189,0.539159980327,14 +0,10.1460179902,8.86082969819,135.953842715,15 +1,9.98529296553,2.87366448495,1.74249892194,15 
+0,9.88942676744,9.4031821056,149.473066381,15 +1,10.0192953341,1.99685737576,1.79502473397,15 +0,10.0110654379,8.13112593726,87.7765628103,15 +0,997.148677047,733.936190093,1.49298494242,16 +0,1008.70465919,957.121652078,0.217414013634,16 +1,997.356154278,541.599587807,0.100855972216,16 +0,999.615897283,943.700501824,0.862874175879,16 +1,997.36859077,0.200859940848,0.13601892182,16 +0,10.0423255624,1.73855202168,0.956695338485,17 +1,9.88440755486,9.9994600678,0.305080529665,17 +0,10.0891026412,3.28031719474,0.364450973697,17 +0,9.90078644258,8.77839663617,0.456660574479,17 +1,9.79380029711,8.77220326156,0.527292005175,17 +0,9.93613887011,9.76270841268,1.40865693823,17 +0,10.0009239007,7.29056178263,0.498015866607,17 +0,9.96603319905,5.12498000925,0.517492532783,17 +0,10.0923827222,2.76652583955,1.56571226159,17 +1,10.0983782035,587.788120694,0.031756483687,18 +1,9.91397225464,994.527496819,3.72092164978,18 +0,10.1057472738,2.92894440088,0.683506438532,18 +0,10.1014053354,959.082038017,1.07039624129,18 +0,10.1433253044,322.515119317,0.51408278993,18 +1,9.82832510699,637.104433908,0.250272776427,18 +0,1000.49729075,2.75336888111,0.576634423274,19 +1,984.90338088,0.0295435794035,1.26273339929,19 +0,1001.53811442,4.64164410861,0.0293389959504,19 +1,995.875898395,5.08223403205,0.382330566779,19 +0,996.405937252,6.26395190757,0.453645816611,19 +0,10.0165140779,340.126072514,0.220794603312,20 +0,9.93482824816,951.672000448,0.124406293612,20 +0,10.1700278554,0.0140985961008,0.252452256311,20 +0,9.99825079542,950.382643896,0.875382402062,20 +0,9.87316410028,686.788257829,0.215886999825,20 +0,10.2893240654,89.3947931451,0.569578232133,20 +0,9.98689192703,0.430107535413,2.99869831728,20 +0,10.1365175107,972.279245093,0.0865099386744,20 +0,9.90744703306,50.810461183,3.00863325197,20 diff --git a/jvm-packages/xgboost4j-tester/generate_pom.py b/jvm-packages/xgboost4j-tester/generate_pom.py index ff651a4f7..edc9759bd 100644 --- a/jvm-packages/xgboost4j-tester/generate_pom.py +++ b/jvm-packages/xgboost4j-tester/generate_pom.py @@ -51,13 +51,13 @@ pom_template = """ com.typesafe.akka akka-actor_${{scala.binary.version}} - 2.7.0 + 2.6.20 compile com.typesafe.akka akka-testkit_${{scala.binary.version}} - 2.7.0 + 2.6.20 test diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml index dcc4bf60c..946b11108 100644 --- a/jvm-packages/xgboost4j/pom.xml +++ b/jvm-packages/xgboost4j/pom.xml @@ -34,13 +34,13 @@ com.typesafe.akka akka-actor_${scala.binary.version} - 2.7.0 + 2.6.20 compile com.typesafe.akka akka-testkit_${scala.binary.version} - 2.7.0 + 2.6.20 test diff --git a/plugin/updater_oneapi/predictor_oneapi.cc b/plugin/updater_oneapi/predictor_oneapi.cc index eafe83e19..25a14186c 100755 --- a/plugin/updater_oneapi/predictor_oneapi.cc +++ b/plugin/updater_oneapi/predictor_oneapi.cc @@ -1,448 +1,447 @@ -/*! - * Copyright by Contributors 2017-2020 - */ -#include -#include -#include - -#include "xgboost/base.h" -#include "xgboost/data.h" -#include "xgboost/predictor.h" -#include "xgboost/tree_model.h" -#include "xgboost/tree_updater.h" -#include "xgboost/logging.h" -#include "xgboost/host_device_vector.h" - -#include "../../src/data/adapter.h" -#include "../../src/common/math.h" -#include "../../src/gbm/gbtree_model.h" - -#include "CL/sycl.hpp" - -namespace xgboost { -namespace predictor { - -DMLC_REGISTRY_FILE_TAG(predictor_oneapi); - -/*! \brief Element from a sparse vector */ -struct EntryOneAPI { - /*! \brief feature index */ - bst_feature_t index; - /*! 
\brief feature value */ - bst_float fvalue; - /*! \brief default constructor */ - EntryOneAPI() = default; - /*! - * \brief constructor with index and value - * \param index The feature or row index. - * \param fvalue The feature value. - */ - EntryOneAPI(bst_feature_t index, bst_float fvalue) : index(index), fvalue(fvalue) {} - - EntryOneAPI(const Entry& entry) : index(entry.index), fvalue(entry.fvalue) {} - - /*! \brief reversely compare feature values */ - inline static bool CmpValue(const EntryOneAPI& a, const EntryOneAPI& b) { - return a.fvalue < b.fvalue; - } - inline bool operator==(const EntryOneAPI& other) const { - return (this->index == other.index && this->fvalue == other.fvalue); - } -}; - -struct DeviceMatrixOneAPI { - DMatrix* p_mat; // Pointer to the original matrix on the host - cl::sycl::queue qu_; - size_t* row_ptr; - size_t row_ptr_size; - EntryOneAPI* data; - - DeviceMatrixOneAPI(DMatrix* dmat, cl::sycl::queue qu) : p_mat(dmat), qu_(qu) { - size_t num_row = 0; - size_t num_nonzero = 0; - for (auto &batch : dmat->GetBatches()) { - const auto& data_vec = batch.data.HostVector(); - const auto& offset_vec = batch.offset.HostVector(); - num_nonzero += data_vec.size(); - num_row += batch.Size(); - } - - row_ptr = cl::sycl::malloc_shared(num_row + 1, qu_); - data = cl::sycl::malloc_shared(num_nonzero, qu_); - - size_t data_offset = 0; - for (auto &batch : dmat->GetBatches()) { - const auto& data_vec = batch.data.HostVector(); - const auto& offset_vec = batch.offset.HostVector(); - size_t batch_size = batch.Size(); - if (batch_size > 0) { - std::copy(offset_vec.data(), offset_vec.data() + batch_size, - row_ptr + batch.base_rowid); - if (batch.base_rowid > 0) { - for(size_t i = 0; i < batch_size; i++) - row_ptr[i + batch.base_rowid] += batch.base_rowid; - } - std::copy(data_vec.data(), data_vec.data() + offset_vec[batch_size], - data + data_offset); - data_offset += offset_vec[batch_size]; - } - } - row_ptr[num_row] = data_offset; - row_ptr_size = num_row + 1; - } - - ~DeviceMatrixOneAPI() { - if (row_ptr) { - cl::sycl::free(row_ptr, qu_); - } - if (data) { - cl::sycl::free(data, qu_); - } - } -}; - -struct DeviceNodeOneAPI { - DeviceNodeOneAPI() - : fidx(-1), left_child_idx(-1), right_child_idx(-1) {} - - union NodeValue { - float leaf_weight; - float fvalue; - }; - - int fidx; - int left_child_idx; - int right_child_idx; - NodeValue val; - - DeviceNodeOneAPI(const RegTree::Node& n) { // NOLINT - this->left_child_idx = n.LeftChild(); - this->right_child_idx = n.RightChild(); - this->fidx = n.SplitIndex(); - if (n.DefaultLeft()) { - fidx |= (1U << 31); - } - - if (n.IsLeaf()) { - this->val.leaf_weight = n.LeafValue(); - } else { - this->val.fvalue = n.SplitCond(); - } - } - - bool IsLeaf() const { return left_child_idx == -1; } - - int GetFidx() const { return fidx & ((1U << 31) - 1U); } - - bool MissingLeft() const { return (fidx >> 31) != 0; } - - int MissingIdx() const { - if (MissingLeft()) { - return this->left_child_idx; - } else { - return this->right_child_idx; - } - } - - float GetFvalue() const { return val.fvalue; } - - float GetWeight() const { return val.leaf_weight; } -}; - -class DeviceModelOneAPI { - public: - cl::sycl::queue qu_; - DeviceNodeOneAPI* nodes; - size_t* tree_segments; - int* tree_group; - size_t tree_beg_; - size_t tree_end_; - int num_group; - - DeviceModelOneAPI() : nodes(nullptr), tree_segments(nullptr), tree_group(nullptr) {} - - ~DeviceModelOneAPI() { - Reset(); - } - - void Reset() { - if (nodes) - cl::sycl::free(nodes, qu_); - if 
(tree_segments) - cl::sycl::free(tree_segments, qu_); - if (tree_group) - cl::sycl::free(tree_group, qu_); - } - - void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, cl::sycl::queue qu) { - qu_ = qu; - CHECK_EQ(model.param.size_leaf_vector, 0); - Reset(); - - tree_segments = cl::sycl::malloc_shared((tree_end - tree_begin) + 1, qu_); - int sum = 0; - tree_segments[0] = sum; - for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { - sum += model.trees[tree_idx]->GetNodes().size(); - tree_segments[tree_idx - tree_begin + 1] = sum; - } - - nodes = cl::sycl::malloc_shared(sum, qu_); - for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { - auto& src_nodes = model.trees[tree_idx]->GetNodes(); - for (size_t node_idx = 0; node_idx < src_nodes.size(); node_idx++) - nodes[node_idx + tree_segments[tree_idx - tree_begin]] = src_nodes[node_idx]; - } - - tree_group = cl::sycl::malloc_shared(model.tree_info.size(), qu_); - for (size_t tree_idx = 0; tree_idx < model.tree_info.size(); tree_idx++) - tree_group[tree_idx] = model.tree_info[tree_idx]; - - tree_beg_ = tree_begin; - tree_end_ = tree_end; - num_group = model.learner_model_param->num_output_group; - } -}; - -float GetFvalue(int ridx, int fidx, EntryOneAPI* data, size_t* row_ptr, bool& is_missing) { - // Binary search - auto begin_ptr = data + row_ptr[ridx]; - auto end_ptr = data + row_ptr[ridx + 1]; - EntryOneAPI* previous_middle = nullptr; - while (end_ptr != begin_ptr) { - auto middle = begin_ptr + (end_ptr - begin_ptr) / 2; - if (middle == previous_middle) { - break; - } else { - previous_middle = middle; - } - - if (middle->index == fidx) { - is_missing = false; - return middle->fvalue; - } else if (middle->index < fidx) { - begin_ptr = middle; - } else { - end_ptr = middle; - } - } - is_missing = true; - return 0.0; -} - -float GetLeafWeight(int ridx, const DeviceNodeOneAPI* tree, EntryOneAPI* data, size_t* row_ptr) { - DeviceNodeOneAPI n = tree[0]; - int node_id = 0; - bool is_missing; - while (!n.IsLeaf()) { - float fvalue = GetFvalue(ridx, n.GetFidx(), data, row_ptr, is_missing); - // Missing value - if (is_missing) { - n = tree[n.MissingIdx()]; - } else { - if (fvalue < n.GetFvalue()) { - node_id = n.left_child_idx; - n = tree[n.left_child_idx]; - } else { - node_id = n.right_child_idx; - n = tree[n.right_child_idx]; - } - } - } - return n.GetWeight(); -} - -class PredictorOneAPI : public Predictor { - protected: - void InitOutPredictions(const MetaInfo& info, - HostDeviceVector* out_preds, - const gbm::GBTreeModel& model) const { - CHECK_NE(model.learner_model_param->num_output_group, 0); - size_t n = model.learner_model_param->num_output_group * info.num_row_; - const auto& base_margin = info.base_margin_.HostVector(); - out_preds->Resize(n); - std::vector& out_preds_h = out_preds->HostVector(); - if (base_margin.size() == n) { - CHECK_EQ(out_preds->Size(), n); - std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin()); - } else { - if (!base_margin.empty()) { - std::ostringstream oss; - oss << "Ignoring the base margin, since it has incorrect length. " - << "The base margin must be an array of length "; - if (model.learner_model_param->num_output_group > 1) { - oss << "[num_class] * [number of data points], i.e. " - << model.learner_model_param->num_output_group << " * " << info.num_row_ - << " = " << n << ". "; - } else { - oss << "[number of data points], i.e. " << info.num_row_ << ". 
"; - } - oss << "Instead, all data points will use " - << "base_score = " << model.learner_model_param->base_score; - LOG(WARNING) << oss.str(); - } - std::fill(out_preds_h.begin(), out_preds_h.end(), - model.learner_model_param->base_score); - } - } - - void DevicePredictInternal(DeviceMatrixOneAPI* dmat, HostDeviceVector* out_preds, - const gbm::GBTreeModel& model, size_t tree_begin, - size_t tree_end) { - if (tree_end - tree_begin == 0) { - return; - } - model_.Init(model, tree_begin, tree_end, qu_); - - auto& out_preds_vec = out_preds->HostVector(); - - DeviceNodeOneAPI* nodes = model_.nodes; - cl::sycl::buffer out_preds_buf(out_preds_vec.data(), out_preds_vec.size()); - size_t* tree_segments = model_.tree_segments; - int* tree_group = model_.tree_group; - size_t* row_ptr = dmat->row_ptr; - EntryOneAPI* data = dmat->data; - int num_features = dmat->p_mat->Info().num_col_; - int num_rows = dmat->row_ptr_size - 1; - int num_group = model.learner_model_param->num_output_group; - - qu_.submit([&](cl::sycl::handler& cgh) { - auto out_predictions = out_preds_buf.get_access(cgh); - cgh.parallel_for(cl::sycl::range<1>(num_rows), [=](cl::sycl::id<1> pid) { - int global_idx = pid[0]; - if (global_idx >= num_rows) return; - if (num_group == 1) { - float sum = 0.0; - for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { - const DeviceNodeOneAPI* tree = nodes + tree_segments[tree_idx - tree_begin]; - sum += GetLeafWeight(global_idx, tree, data, row_ptr); - } - out_predictions[global_idx] += sum; - } else { - for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { - const DeviceNodeOneAPI* tree = nodes + tree_segments[tree_idx - tree_begin]; - int out_prediction_idx = global_idx * num_group + tree_group[tree_idx]; - out_predictions[out_prediction_idx] += GetLeafWeight(global_idx, tree, data, row_ptr); - } - } - }); - }).wait(); - } - - public: - explicit PredictorOneAPI(Context const* generic_param) : - Predictor::Predictor{generic_param}, cpu_predictor(Predictor::Create("cpu_predictor", generic_param)) { - cl::sycl::default_selector selector; - qu_ = cl::sycl::queue(selector); - } - - // ntree_limit is a very problematic parameter, as it's ambiguous in the context of - // multi-output and forest. Same problem exists for tree_begin - void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts, - const gbm::GBTreeModel& model, int tree_begin, - uint32_t const ntree_limit = 0) override { - if (this->device_matrix_cache_.find(dmat) == - this->device_matrix_cache_.end()) { - this->device_matrix_cache_.emplace( - dmat, std::unique_ptr( - new DeviceMatrixOneAPI(dmat, qu_))); - } - DeviceMatrixOneAPI* device_matrix = device_matrix_cache_.find(dmat)->second.get(); - - // tree_begin is not used, right now we just enforce it to be 0. - CHECK_EQ(tree_begin, 0); - auto* out_preds = &predts->predictions; - CHECK_GE(predts->version, tree_begin); - if (out_preds->Size() == 0 && dmat->Info().num_row_ != 0) { - CHECK_EQ(predts->version, 0); - } - if (predts->version == 0) { - // out_preds->Size() can be non-zero as it's initialized here before any tree is - // built at the 0^th iterator. 
- this->InitOutPredictions(dmat->Info(), out_preds, model); - } - - uint32_t const output_groups = model.learner_model_param->num_output_group; - CHECK_NE(output_groups, 0); - // Right now we just assume ntree_limit provided by users means number of tree layers - // in the context of multi-output model - uint32_t real_ntree_limit = ntree_limit * output_groups; - if (real_ntree_limit == 0 || real_ntree_limit > model.trees.size()) { - real_ntree_limit = static_cast(model.trees.size()); - } - - uint32_t const end_version = (tree_begin + real_ntree_limit) / output_groups; - // When users have provided ntree_limit, end_version can be lesser, cache is violated - if (predts->version > end_version) { - CHECK_NE(ntree_limit, 0); - this->InitOutPredictions(dmat->Info(), out_preds, model); - predts->version = 0; - } - uint32_t const beg_version = predts->version; - CHECK_LE(beg_version, end_version); - - if (beg_version < end_version) { - DevicePredictInternal(device_matrix, out_preds, model, - beg_version * output_groups, - end_version * output_groups); - } - - // delta means {size of forest} * {number of newly accumulated layers} - uint32_t delta = end_version - beg_version; - CHECK_LE(delta, model.trees.size()); - predts->Update(delta); - - CHECK(out_preds->Size() == output_groups * dmat->Info().num_row_ || - out_preds->Size() == dmat->Info().num_row_); - } - - void InplacePredict(dmlc::any const &x, const gbm::GBTreeModel &model, - float missing, PredictionCacheEntry *out_preds, - uint32_t tree_begin, unsigned tree_end) const override { - cpu_predictor->InplacePredict(x, model, missing, out_preds, tree_begin, tree_end); - } - - void PredictInstance(const SparsePage::Inst& inst, - std::vector* out_preds, - const gbm::GBTreeModel& model, unsigned ntree_limit) override { - cpu_predictor->PredictInstance(inst, out_preds, model, ntree_limit); - } - - void PredictLeaf(DMatrix* p_fmat, std::vector* out_preds, - const gbm::GBTreeModel& model, unsigned ntree_limit) override { - cpu_predictor->PredictLeaf(p_fmat, out_preds, model, ntree_limit); - } - - void PredictContribution(DMatrix* p_fmat, std::vector* out_contribs, - const gbm::GBTreeModel& model, uint32_t ntree_limit, - std::vector* tree_weights, - bool approximate, int condition, - unsigned condition_feature) override { - cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit, tree_weights, approximate, condition, condition_feature); - } - - void PredictInteractionContributions(DMatrix* p_fmat, std::vector* out_contribs, - const gbm::GBTreeModel& model, unsigned ntree_limit, - std::vector* tree_weights, - bool approximate) override { - cpu_predictor->PredictInteractionContributions(p_fmat, out_contribs, model, ntree_limit, tree_weights, approximate); - } - - private: - cl::sycl::queue qu_; - DeviceModelOneAPI model_; - - std::mutex lock_; - std::unique_ptr cpu_predictor; - - std::unordered_map> - device_matrix_cache_; -}; - -XGBOOST_REGISTER_PREDICTOR(PredictorOneAPI, "oneapi_predictor") -.describe("Make predictions using DPC++.") -.set_body([](Context const* generic_param) { - return new PredictorOneAPI(generic_param); - }); -} // namespace predictor -} // namespace xgboost +/*! 
+ * Copyright by Contributors 2017-2020 + */ +#include // for any +#include +#include +#include + +#include "../../src/common/math.h" +#include "../../src/data/adapter.h" +#include "../../src/gbm/gbtree_model.h" +#include "CL/sycl.hpp" +#include "xgboost/base.h" +#include "xgboost/data.h" +#include "xgboost/host_device_vector.h" +#include "xgboost/logging.h" +#include "xgboost/predictor.h" +#include "xgboost/tree_model.h" +#include "xgboost/tree_updater.h" + +namespace xgboost { +namespace predictor { + +DMLC_REGISTRY_FILE_TAG(predictor_oneapi); + +/*! \brief Element from a sparse vector */ +struct EntryOneAPI { + /*! \brief feature index */ + bst_feature_t index; + /*! \brief feature value */ + bst_float fvalue; + /*! \brief default constructor */ + EntryOneAPI() = default; + /*! + * \brief constructor with index and value + * \param index The feature or row index. + * \param fvalue The feature value. + */ + EntryOneAPI(bst_feature_t index, bst_float fvalue) : index(index), fvalue(fvalue) {} + + EntryOneAPI(const Entry& entry) : index(entry.index), fvalue(entry.fvalue) {} + + /*! \brief reversely compare feature values */ + inline static bool CmpValue(const EntryOneAPI& a, const EntryOneAPI& b) { + return a.fvalue < b.fvalue; + } + inline bool operator==(const EntryOneAPI& other) const { + return (this->index == other.index && this->fvalue == other.fvalue); + } +}; + +struct DeviceMatrixOneAPI { + DMatrix* p_mat; // Pointer to the original matrix on the host + cl::sycl::queue qu_; + size_t* row_ptr; + size_t row_ptr_size; + EntryOneAPI* data; + + DeviceMatrixOneAPI(DMatrix* dmat, cl::sycl::queue qu) : p_mat(dmat), qu_(qu) { + size_t num_row = 0; + size_t num_nonzero = 0; + for (auto &batch : dmat->GetBatches()) { + const auto& data_vec = batch.data.HostVector(); + const auto& offset_vec = batch.offset.HostVector(); + num_nonzero += data_vec.size(); + num_row += batch.Size(); + } + + row_ptr = cl::sycl::malloc_shared(num_row + 1, qu_); + data = cl::sycl::malloc_shared(num_nonzero, qu_); + + size_t data_offset = 0; + for (auto &batch : dmat->GetBatches()) { + const auto& data_vec = batch.data.HostVector(); + const auto& offset_vec = batch.offset.HostVector(); + size_t batch_size = batch.Size(); + if (batch_size > 0) { + std::copy(offset_vec.data(), offset_vec.data() + batch_size, + row_ptr + batch.base_rowid); + if (batch.base_rowid > 0) { + for(size_t i = 0; i < batch_size; i++) + row_ptr[i + batch.base_rowid] += batch.base_rowid; + } + std::copy(data_vec.data(), data_vec.data() + offset_vec[batch_size], + data + data_offset); + data_offset += offset_vec[batch_size]; + } + } + row_ptr[num_row] = data_offset; + row_ptr_size = num_row + 1; + } + + ~DeviceMatrixOneAPI() { + if (row_ptr) { + cl::sycl::free(row_ptr, qu_); + } + if (data) { + cl::sycl::free(data, qu_); + } + } +}; + +struct DeviceNodeOneAPI { + DeviceNodeOneAPI() + : fidx(-1), left_child_idx(-1), right_child_idx(-1) {} + + union NodeValue { + float leaf_weight; + float fvalue; + }; + + int fidx; + int left_child_idx; + int right_child_idx; + NodeValue val; + + DeviceNodeOneAPI(const RegTree::Node& n) { // NOLINT + this->left_child_idx = n.LeftChild(); + this->right_child_idx = n.RightChild(); + this->fidx = n.SplitIndex(); + if (n.DefaultLeft()) { + fidx |= (1U << 31); + } + + if (n.IsLeaf()) { + this->val.leaf_weight = n.LeafValue(); + } else { + this->val.fvalue = n.SplitCond(); + } + } + + bool IsLeaf() const { return left_child_idx == -1; } + + int GetFidx() const { return fidx & ((1U << 31) - 1U); } + + bool MissingLeft() 
const { return (fidx >> 31) != 0; } + + int MissingIdx() const { + if (MissingLeft()) { + return this->left_child_idx; + } else { + return this->right_child_idx; + } + } + + float GetFvalue() const { return val.fvalue; } + + float GetWeight() const { return val.leaf_weight; } +}; + +class DeviceModelOneAPI { + public: + cl::sycl::queue qu_; + DeviceNodeOneAPI* nodes; + size_t* tree_segments; + int* tree_group; + size_t tree_beg_; + size_t tree_end_; + int num_group; + + DeviceModelOneAPI() : nodes(nullptr), tree_segments(nullptr), tree_group(nullptr) {} + + ~DeviceModelOneAPI() { + Reset(); + } + + void Reset() { + if (nodes) + cl::sycl::free(nodes, qu_); + if (tree_segments) + cl::sycl::free(tree_segments, qu_); + if (tree_group) + cl::sycl::free(tree_group, qu_); + } + + void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, cl::sycl::queue qu) { + qu_ = qu; + CHECK_EQ(model.param.size_leaf_vector, 0); + Reset(); + + tree_segments = cl::sycl::malloc_shared((tree_end - tree_begin) + 1, qu_); + int sum = 0; + tree_segments[0] = sum; + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + sum += model.trees[tree_idx]->GetNodes().size(); + tree_segments[tree_idx - tree_begin + 1] = sum; + } + + nodes = cl::sycl::malloc_shared(sum, qu_); + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + auto& src_nodes = model.trees[tree_idx]->GetNodes(); + for (size_t node_idx = 0; node_idx < src_nodes.size(); node_idx++) + nodes[node_idx + tree_segments[tree_idx - tree_begin]] = src_nodes[node_idx]; + } + + tree_group = cl::sycl::malloc_shared(model.tree_info.size(), qu_); + for (size_t tree_idx = 0; tree_idx < model.tree_info.size(); tree_idx++) + tree_group[tree_idx] = model.tree_info[tree_idx]; + + tree_beg_ = tree_begin; + tree_end_ = tree_end; + num_group = model.learner_model_param->num_output_group; + } +}; + +float GetFvalue(int ridx, int fidx, EntryOneAPI* data, size_t* row_ptr, bool& is_missing) { + // Binary search + auto begin_ptr = data + row_ptr[ridx]; + auto end_ptr = data + row_ptr[ridx + 1]; + EntryOneAPI* previous_middle = nullptr; + while (end_ptr != begin_ptr) { + auto middle = begin_ptr + (end_ptr - begin_ptr) / 2; + if (middle == previous_middle) { + break; + } else { + previous_middle = middle; + } + + if (middle->index == fidx) { + is_missing = false; + return middle->fvalue; + } else if (middle->index < fidx) { + begin_ptr = middle; + } else { + end_ptr = middle; + } + } + is_missing = true; + return 0.0; +} + +float GetLeafWeight(int ridx, const DeviceNodeOneAPI* tree, EntryOneAPI* data, size_t* row_ptr) { + DeviceNodeOneAPI n = tree[0]; + int node_id = 0; + bool is_missing; + while (!n.IsLeaf()) { + float fvalue = GetFvalue(ridx, n.GetFidx(), data, row_ptr, is_missing); + // Missing value + if (is_missing) { + n = tree[n.MissingIdx()]; + } else { + if (fvalue < n.GetFvalue()) { + node_id = n.left_child_idx; + n = tree[n.left_child_idx]; + } else { + node_id = n.right_child_idx; + n = tree[n.right_child_idx]; + } + } + } + return n.GetWeight(); +} + +class PredictorOneAPI : public Predictor { + protected: + void InitOutPredictions(const MetaInfo& info, + HostDeviceVector* out_preds, + const gbm::GBTreeModel& model) const { + CHECK_NE(model.learner_model_param->num_output_group, 0); + size_t n = model.learner_model_param->num_output_group * info.num_row_; + const auto& base_margin = info.base_margin_.HostVector(); + out_preds->Resize(n); + std::vector& out_preds_h = out_preds->HostVector(); + if (base_margin.size() == n) { + 
CHECK_EQ(out_preds->Size(), n); + std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin()); + } else { + if (!base_margin.empty()) { + std::ostringstream oss; + oss << "Ignoring the base margin, since it has incorrect length. " + << "The base margin must be an array of length "; + if (model.learner_model_param->num_output_group > 1) { + oss << "[num_class] * [number of data points], i.e. " + << model.learner_model_param->num_output_group << " * " << info.num_row_ + << " = " << n << ". "; + } else { + oss << "[number of data points], i.e. " << info.num_row_ << ". "; + } + oss << "Instead, all data points will use " + << "base_score = " << model.learner_model_param->base_score; + LOG(WARNING) << oss.str(); + } + std::fill(out_preds_h.begin(), out_preds_h.end(), + model.learner_model_param->base_score); + } + } + + void DevicePredictInternal(DeviceMatrixOneAPI* dmat, HostDeviceVector* out_preds, + const gbm::GBTreeModel& model, size_t tree_begin, + size_t tree_end) { + if (tree_end - tree_begin == 0) { + return; + } + model_.Init(model, tree_begin, tree_end, qu_); + + auto& out_preds_vec = out_preds->HostVector(); + + DeviceNodeOneAPI* nodes = model_.nodes; + cl::sycl::buffer out_preds_buf(out_preds_vec.data(), out_preds_vec.size()); + size_t* tree_segments = model_.tree_segments; + int* tree_group = model_.tree_group; + size_t* row_ptr = dmat->row_ptr; + EntryOneAPI* data = dmat->data; + int num_features = dmat->p_mat->Info().num_col_; + int num_rows = dmat->row_ptr_size - 1; + int num_group = model.learner_model_param->num_output_group; + + qu_.submit([&](cl::sycl::handler& cgh) { + auto out_predictions = out_preds_buf.get_access(cgh); + cgh.parallel_for(cl::sycl::range<1>(num_rows), [=](cl::sycl::id<1> pid) { + int global_idx = pid[0]; + if (global_idx >= num_rows) return; + if (num_group == 1) { + float sum = 0.0; + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + const DeviceNodeOneAPI* tree = nodes + tree_segments[tree_idx - tree_begin]; + sum += GetLeafWeight(global_idx, tree, data, row_ptr); + } + out_predictions[global_idx] += sum; + } else { + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + const DeviceNodeOneAPI* tree = nodes + tree_segments[tree_idx - tree_begin]; + int out_prediction_idx = global_idx * num_group + tree_group[tree_idx]; + out_predictions[out_prediction_idx] += GetLeafWeight(global_idx, tree, data, row_ptr); + } + } + }); + }).wait(); + } + + public: + explicit PredictorOneAPI(Context const* generic_param) : + Predictor::Predictor{generic_param}, cpu_predictor(Predictor::Create("cpu_predictor", generic_param)) { + cl::sycl::default_selector selector; + qu_ = cl::sycl::queue(selector); + } + + // ntree_limit is a very problematic parameter, as it's ambiguous in the context of + // multi-output and forest. Same problem exists for tree_begin + void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts, + const gbm::GBTreeModel& model, int tree_begin, + uint32_t const ntree_limit = 0) override { + if (this->device_matrix_cache_.find(dmat) == + this->device_matrix_cache_.end()) { + this->device_matrix_cache_.emplace( + dmat, std::unique_ptr( + new DeviceMatrixOneAPI(dmat, qu_))); + } + DeviceMatrixOneAPI* device_matrix = device_matrix_cache_.find(dmat)->second.get(); + + // tree_begin is not used, right now we just enforce it to be 0. 
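+    // For example (illustrative): with output_groups == 3 and ntree_limit == 2,
+    // real_ntree_limit below becomes 6 trees, i.e. two boosting layers; end_version
+    // is (0 + 6) / 3 == 2, so only layers [predts->version, 2) still need to be
+    // accumulated on top of the cached predictions.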
+    CHECK_EQ(tree_begin, 0);
+    auto* out_preds = &predts->predictions;
+    CHECK_GE(predts->version, tree_begin);
+    if (out_preds->Size() == 0 && dmat->Info().num_row_ != 0) {
+      CHECK_EQ(predts->version, 0);
+    }
+    if (predts->version == 0) {
+      // out_preds->Size() can be non-zero as it's initialized here before any tree is
+      // built at the 0^th iteration.
+      this->InitOutPredictions(dmat->Info(), out_preds, model);
+    }
+
+    uint32_t const output_groups = model.learner_model_param->num_output_group;
+    CHECK_NE(output_groups, 0);
+    // Right now we just assume ntree_limit provided by users means number of tree layers
+    // in the context of multi-output model
+    uint32_t real_ntree_limit = ntree_limit * output_groups;
+    if (real_ntree_limit == 0 || real_ntree_limit > model.trees.size()) {
+      real_ntree_limit = static_cast<uint32_t>(model.trees.size());
+    }
+
+    uint32_t const end_version = (tree_begin + real_ntree_limit) / output_groups;
+    // When users have provided ntree_limit, end_version can be smaller and the cache is
+    // invalidated.
+    if (predts->version > end_version) {
+      CHECK_NE(ntree_limit, 0);
+      this->InitOutPredictions(dmat->Info(), out_preds, model);
+      predts->version = 0;
+    }
+    uint32_t const beg_version = predts->version;
+    CHECK_LE(beg_version, end_version);
+
+    if (beg_version < end_version) {
+      DevicePredictInternal(device_matrix, out_preds, model,
+                            beg_version * output_groups,
+                            end_version * output_groups);
+    }
+
+    // delta means {size of forest} * {number of newly accumulated layers}
+    uint32_t delta = end_version - beg_version;
+    CHECK_LE(delta, model.trees.size());
+    predts->Update(delta);
+
+    CHECK(out_preds->Size() == output_groups * dmat->Info().num_row_ ||
+          out_preds->Size() == dmat->Info().num_row_);
+  }
+
+  void InplacePredict(std::any const& x, const gbm::GBTreeModel& model, float missing,
+                      PredictionCacheEntry* out_preds, uint32_t tree_begin,
+                      unsigned tree_end) const override {
+    cpu_predictor->InplacePredict(x, model, missing, out_preds, tree_begin, tree_end);
+  }
+
+  void PredictInstance(const SparsePage::Inst& inst,
+                       std::vector<bst_float>* out_preds,
+                       const gbm::GBTreeModel& model, unsigned ntree_limit) override {
+    cpu_predictor->PredictInstance(inst, out_preds, model, ntree_limit);
+  }
+
+  void PredictLeaf(DMatrix* p_fmat, std::vector<bst_float>* out_preds,
+                   const gbm::GBTreeModel& model, unsigned ntree_limit) override {
+    cpu_predictor->PredictLeaf(p_fmat, out_preds, model, ntree_limit);
+  }
+
+  void PredictContribution(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
+                           const gbm::GBTreeModel& model, uint32_t ntree_limit,
+                           std::vector<bst_float>* tree_weights,
+                           bool approximate, int condition,
+                           unsigned condition_feature) override {
+    cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit, tree_weights, approximate, condition, condition_feature);
+  }
+
+  void PredictInteractionContributions(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
+                                       const gbm::GBTreeModel& model, unsigned ntree_limit,
+                                       std::vector<bst_float>* tree_weights,
+                                       bool approximate) override {
+    cpu_predictor->PredictInteractionContributions(p_fmat, out_contribs, model, ntree_limit, tree_weights, approximate);
+  }
+
+ private:
+  cl::sycl::queue qu_;
+  DeviceModelOneAPI model_;
+
+  std::mutex lock_;
+  std::unique_ptr<Predictor> cpu_predictor;
+
+  std::unordered_map<DMatrix*, std::unique_ptr<DeviceMatrixOneAPI>>
+      device_matrix_cache_;
+};
+
+XGBOOST_REGISTER_PREDICTOR(PredictorOneAPI, "oneapi_predictor")
+.describe("Make predictions using DPC++.")
+.set_body([](Context const* generic_param) {
+            return new PredictorOneAPI(generic_param);
+          });
+}  // namespace predictor
+}  // namespace xgboost
diff
--git a/plugin/updater_oneapi/regression_loss_oneapi.h b/plugin/updater_oneapi/regression_loss_oneapi.h index 4759f5c3f..b0299ff7f 100755 --- a/plugin/updater_oneapi/regression_loss_oneapi.h +++ b/plugin/updater_oneapi/regression_loss_oneapi.h @@ -1,145 +1,145 @@ -/*! - * Copyright 2017-2020 XGBoost contributors - */ -#ifndef XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_ -#define XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_ - -#include -#include -#include - -#include "CL/sycl.hpp" - -namespace xgboost { -namespace obj { - -/*! - * \brief calculate the sigmoid of the input. - * \param x input parameter - * \return the transformed value. - */ -inline float SigmoidOneAPI(float x) { - return 1.0f / (1.0f + cl::sycl::exp(-x)); -} - -// common regressions -// linear regression -struct LinearSquareLossOneAPI { - static bst_float PredTransform(bst_float x) { return x; } - static bool CheckLabel(bst_float x) { return true; } - static bst_float FirstOrderGradient(bst_float predt, bst_float label) { - return predt - label; - } - static bst_float SecondOrderGradient(bst_float predt, bst_float label) { - return 1.0f; - } - static bst_float ProbToMargin(bst_float base_score) { return base_score; } - static const char* LabelErrorMsg() { return ""; } - static const char* DefaultEvalMetric() { return "rmse"; } - - static const char* Name() { return "reg:squarederror_oneapi"; } -}; - -// TODO: DPC++ does not fully support std math inside offloaded kernels -struct SquaredLogErrorOneAPI { - static bst_float PredTransform(bst_float x) { return x; } - static bool CheckLabel(bst_float label) { - return label > -1; - } - static bst_float FirstOrderGradient(bst_float predt, bst_float label) { - predt = std::max(predt, (bst_float)(-1 + 1e-6)); // ensure correct value for log1p - return (cl::sycl::log1p(predt) - cl::sycl::log1p(label)) / (predt + 1); - } - static bst_float SecondOrderGradient(bst_float predt, bst_float label) { - predt = std::max(predt, (bst_float)(-1 + 1e-6)); - float res = (-cl::sycl::log1p(predt) + cl::sycl::log1p(label) + 1) / - cl::sycl::pow(predt + 1, (bst_float)2); - res = std::max(res, (bst_float)1e-6f); - return res; - } - static bst_float ProbToMargin(bst_float base_score) { return base_score; } - static const char* LabelErrorMsg() { - return "label must be greater than -1 for rmsle so that log(label + 1) can be valid."; - } - static const char* DefaultEvalMetric() { return "rmsle"; } - - static const char* Name() { return "reg:squaredlogerror_oneapi"; } -}; - -// logistic loss for probability regression task -struct LogisticRegressionOneAPI { - // duplication is necessary, as __device__ specifier - // cannot be made conditional on template parameter - static bst_float PredTransform(bst_float x) { return SigmoidOneAPI(x); } - static bool CheckLabel(bst_float x) { return x >= 0.0f && x <= 1.0f; } - static bst_float FirstOrderGradient(bst_float predt, bst_float label) { - return predt - label; - } - static bst_float SecondOrderGradient(bst_float predt, bst_float label) { - const bst_float eps = 1e-16f; - return std::max(predt * (1.0f - predt), eps); - } - template - static T PredTransform(T x) { return SigmoidOneAPI(x); } - template - static T FirstOrderGradient(T predt, T label) { return predt - label; } - template - static T SecondOrderGradient(T predt, T label) { - const T eps = T(1e-16f); - return std::max(predt * (T(1.0f) - predt), eps); - } - static bst_float ProbToMargin(bst_float base_score) { - CHECK(base_score > 0.0f && base_score < 1.0f) - << "base_score must be in (0,1) for logistic 
loss, got: " << base_score; - return -logf(1.0f / base_score - 1.0f); - } - static const char* LabelErrorMsg() { - return "label must be in [0,1] for logistic regression"; - } - static const char* DefaultEvalMetric() { return "rmse"; } - - static const char* Name() { return "reg:logistic_oneapi"; } -}; - -// logistic loss for binary classification task -struct LogisticClassificationOneAPI : public LogisticRegressionOneAPI { - static const char* DefaultEvalMetric() { return "logloss"; } - static const char* Name() { return "binary:logistic_oneapi"; } -}; - -// logistic loss, but predict un-transformed margin -struct LogisticRawOneAPI : public LogisticRegressionOneAPI { - // duplication is necessary, as __device__ specifier - // cannot be made conditional on template parameter - static bst_float PredTransform(bst_float x) { return x; } - static bst_float FirstOrderGradient(bst_float predt, bst_float label) { - predt = SigmoidOneAPI(predt); - return predt - label; - } - static bst_float SecondOrderGradient(bst_float predt, bst_float label) { - const bst_float eps = 1e-16f; - predt = SigmoidOneAPI(predt); - return std::max(predt * (1.0f - predt), eps); - } - template - static T PredTransform(T x) { return x; } - template - static T FirstOrderGradient(T predt, T label) { - predt = SigmoidOneAPI(predt); - return predt - label; - } - template - static T SecondOrderGradient(T predt, T label) { - const T eps = T(1e-16f); - predt = SigmoidOneAPI(predt); - return std::max(predt * (T(1.0f) - predt), eps); - } - static const char* DefaultEvalMetric() { return "logloss"; } - - static const char* Name() { return "binary:logitraw_oneapi"; } -}; - -} // namespace obj -} // namespace xgboost - -#endif // XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_ +/*! + * Copyright 2017-2020 XGBoost contributors + */ +#ifndef XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_ +#define XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_ + +#include +#include +#include + +#include "CL/sycl.hpp" + +namespace xgboost { +namespace obj { + +/*! + * \brief calculate the sigmoid of the input. + * \param x input parameter + * \return the transformed value. 
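+ * \note For reference (illustrative): SigmoidOneAPI(0.0f) == 0.5f, and the
+ *       output saturates towards 1.0f (0.0f) for large positive (negative) x.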
+ */ +inline float SigmoidOneAPI(float x) { + return 1.0f / (1.0f + cl::sycl::exp(-x)); +} + +// common regressions +// linear regression +struct LinearSquareLossOneAPI { + static bst_float PredTransform(bst_float x) { return x; } + static bool CheckLabel(bst_float x) { return true; } + static bst_float FirstOrderGradient(bst_float predt, bst_float label) { + return predt - label; + } + static bst_float SecondOrderGradient(bst_float predt, bst_float label) { + return 1.0f; + } + static bst_float ProbToMargin(bst_float base_score) { return base_score; } + static const char* LabelErrorMsg() { return ""; } + static const char* DefaultEvalMetric() { return "rmse"; } + + static const char* Name() { return "reg:squarederror_oneapi"; } +}; + +// TODO: DPC++ does not fully support std math inside offloaded kernels +struct SquaredLogErrorOneAPI { + static bst_float PredTransform(bst_float x) { return x; } + static bool CheckLabel(bst_float label) { + return label > -1; + } + static bst_float FirstOrderGradient(bst_float predt, bst_float label) { + predt = std::max(predt, (bst_float)(-1 + 1e-6)); // ensure correct value for log1p + return (cl::sycl::log1p(predt) - cl::sycl::log1p(label)) / (predt + 1); + } + static bst_float SecondOrderGradient(bst_float predt, bst_float label) { + predt = std::max(predt, (bst_float)(-1 + 1e-6)); + float res = (-cl::sycl::log1p(predt) + cl::sycl::log1p(label) + 1) / + cl::sycl::pow(predt + 1, (bst_float)2); + res = std::max(res, (bst_float)1e-6f); + return res; + } + static bst_float ProbToMargin(bst_float base_score) { return base_score; } + static const char* LabelErrorMsg() { + return "label must be greater than -1 for rmsle so that log(label + 1) can be valid."; + } + static const char* DefaultEvalMetric() { return "rmsle"; } + + static const char* Name() { return "reg:squaredlogerror_oneapi"; } +}; + +// logistic loss for probability regression task +struct LogisticRegressionOneAPI { + // duplication is necessary, as __device__ specifier + // cannot be made conditional on template parameter + static bst_float PredTransform(bst_float x) { return SigmoidOneAPI(x); } + static bool CheckLabel(bst_float x) { return x >= 0.0f && x <= 1.0f; } + static bst_float FirstOrderGradient(bst_float predt, bst_float label) { + return predt - label; + } + static bst_float SecondOrderGradient(bst_float predt, bst_float label) { + const bst_float eps = 1e-16f; + return std::max(predt * (1.0f - predt), eps); + } + template + static T PredTransform(T x) { return SigmoidOneAPI(x); } + template + static T FirstOrderGradient(T predt, T label) { return predt - label; } + template + static T SecondOrderGradient(T predt, T label) { + const T eps = T(1e-16f); + return std::max(predt * (T(1.0f) - predt), eps); + } + static bst_float ProbToMargin(bst_float base_score) { + CHECK(base_score > 0.0f && base_score < 1.0f) + << "base_score must be in (0,1) for logistic loss, got: " << base_score; + return -logf(1.0f / base_score - 1.0f); + } + static const char* LabelErrorMsg() { + return "label must be in [0,1] for logistic regression"; + } + static const char* DefaultEvalMetric() { return "rmse"; } + + static const char* Name() { return "reg:logistic_oneapi"; } +}; + +// logistic loss for binary classification task +struct LogisticClassificationOneAPI : public LogisticRegressionOneAPI { + static const char* DefaultEvalMetric() { return "logloss"; } + static const char* Name() { return "binary:logistic_oneapi"; } +}; + +// logistic loss, but predict un-transformed margin +struct LogisticRawOneAPI 
: public LogisticRegressionOneAPI { + // duplication is necessary, as __device__ specifier + // cannot be made conditional on template parameter + static bst_float PredTransform(bst_float x) { return x; } + static bst_float FirstOrderGradient(bst_float predt, bst_float label) { + predt = SigmoidOneAPI(predt); + return predt - label; + } + static bst_float SecondOrderGradient(bst_float predt, bst_float label) { + const bst_float eps = 1e-16f; + predt = SigmoidOneAPI(predt); + return std::max(predt * (1.0f - predt), eps); + } + template + static T PredTransform(T x) { return x; } + template + static T FirstOrderGradient(T predt, T label) { + predt = SigmoidOneAPI(predt); + return predt - label; + } + template + static T SecondOrderGradient(T predt, T label) { + const T eps = T(1e-16f); + predt = SigmoidOneAPI(predt); + return std::max(predt * (T(1.0f) - predt), eps); + } + static const char* DefaultEvalMetric() { return "logloss"; } + + static const char* Name() { return "binary:logitraw_oneapi"; } +}; + +} // namespace obj +} // namespace xgboost + +#endif // XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_ diff --git a/plugin/updater_oneapi/regression_obj_oneapi.cc b/plugin/updater_oneapi/regression_obj_oneapi.cc index 4a1bd7229..3ee5741e7 100755 --- a/plugin/updater_oneapi/regression_obj_oneapi.cc +++ b/plugin/updater_oneapi/regression_obj_oneapi.cc @@ -1,182 +1,182 @@ -#include -#include -#include -#include -#include - -#include "xgboost/host_device_vector.h" -#include "xgboost/json.h" -#include "xgboost/parameter.h" -#include "xgboost/span.h" - -#include "../../src/common/transform.h" -#include "../../src/common/common.h" -#include "./regression_loss_oneapi.h" - -#include "CL/sycl.hpp" - -namespace xgboost { -namespace obj { - -DMLC_REGISTRY_FILE_TAG(regression_obj_oneapi); - -struct RegLossParamOneAPI : public XGBoostParameter { - float scale_pos_weight; - // declare parameters - DMLC_DECLARE_PARAMETER(RegLossParamOneAPI) { - DMLC_DECLARE_FIELD(scale_pos_weight).set_default(1.0f).set_lower_bound(0.0f) - .describe("Scale the weight of positive examples by this factor"); - } -}; - -template -class RegLossObjOneAPI : public ObjFunction { - protected: - HostDeviceVector label_correct_; - - public: - RegLossObjOneAPI() = default; - - void Configure(const std::vector >& args) override { - param_.UpdateAllowUnknown(args); - - cl::sycl::default_selector selector; - qu_ = cl::sycl::queue(selector); - } - - void GetGradient(const HostDeviceVector& preds, - const MetaInfo &info, - int iter, - HostDeviceVector* out_gpair) override { - if (info.labels_.Size() == 0U) { - LOG(WARNING) << "Label set is empty."; - } - CHECK_EQ(preds.Size(), info.labels_.Size()) - << " " << "labels are not correctly provided" - << "preds.size=" << preds.Size() << ", label.size=" << info.labels_.Size() << ", " - << "Loss: " << Loss::Name(); - - size_t const ndata = preds.Size(); - out_gpair->Resize(ndata); - - // TODO: add label_correct check - label_correct_.Resize(1); - label_correct_.Fill(1); - - bool is_null_weight = info.weights_.Size() == 0; - - cl::sycl::buffer preds_buf(preds.HostPointer(), preds.Size()); - cl::sycl::buffer labels_buf(info.labels_.HostPointer(), info.labels_.Size()); - cl::sycl::buffer out_gpair_buf(out_gpair->HostPointer(), out_gpair->Size()); - cl::sycl::buffer weights_buf(is_null_weight ? NULL : info.weights_.HostPointer(), - is_null_weight ? 
1 : info.weights_.Size()); - - cl::sycl::buffer additional_input_buf(1); - { - auto additional_input_acc = additional_input_buf.get_access(); - additional_input_acc[0] = 1; // Fill the label_correct flag - } - - auto scale_pos_weight = param_.scale_pos_weight; - if (!is_null_weight) { - CHECK_EQ(info.weights_.Size(), ndata) - << "Number of weights should be equal to number of data points."; - } - - qu_.submit([&](cl::sycl::handler& cgh) { - auto preds_acc = preds_buf.get_access(cgh); - auto labels_acc = labels_buf.get_access(cgh); - auto weights_acc = weights_buf.get_access(cgh); - auto out_gpair_acc = out_gpair_buf.get_access(cgh); - auto additional_input_acc = additional_input_buf.get_access(cgh); - cgh.parallel_for<>(cl::sycl::range<1>(ndata), [=](cl::sycl::id<1> pid) { - int idx = pid[0]; - bst_float p = Loss::PredTransform(preds_acc[idx]); - bst_float w = is_null_weight ? 1.0f : weights_acc[idx]; - bst_float label = labels_acc[idx]; - if (label == 1.0f) { - w *= scale_pos_weight; - } - if (!Loss::CheckLabel(label)) { - // If there is an incorrect label, the host code will know. - additional_input_acc[0] = 0; - } - out_gpair_acc[idx] = GradientPair(Loss::FirstOrderGradient(p, label) * w, - Loss::SecondOrderGradient(p, label) * w); - }); - }).wait(); - - int flag = 1; - { - auto additional_input_acc = additional_input_buf.get_access(); - flag = additional_input_acc[0]; - } - - if (flag == 0) { - LOG(FATAL) << Loss::LabelErrorMsg(); - } - - } - - public: - const char* DefaultEvalMetric() const override { - return Loss::DefaultEvalMetric(); - } - - void PredTransform(HostDeviceVector *io_preds) override { - size_t const ndata = io_preds->Size(); - - cl::sycl::buffer io_preds_buf(io_preds->HostPointer(), io_preds->Size()); - - qu_.submit([&](cl::sycl::handler& cgh) { - auto io_preds_acc = io_preds_buf.get_access(cgh); - cgh.parallel_for<>(cl::sycl::range<1>(ndata), [=](cl::sycl::id<1> pid) { - int idx = pid[0]; - io_preds_acc[idx] = Loss::PredTransform(io_preds_acc[idx]); - }); - }).wait(); - } - - float ProbToMargin(float base_score) const override { - return Loss::ProbToMargin(base_score); - } - - void SaveConfig(Json* p_out) const override { - auto& out = *p_out; - out["name"] = String(Loss::Name()); - out["reg_loss_param"] = ToJson(param_); - } - - void LoadConfig(Json const& in) override { - FromJson(in["reg_loss_param"], ¶m_); - } - - protected: - RegLossParamOneAPI param_; - - cl::sycl::queue qu_; -}; - -// register the objective functions -DMLC_REGISTER_PARAMETER(RegLossParamOneAPI); - -// TODO: Find a better way to dispatch names of DPC++ kernels with various template parameters of loss function -XGBOOST_REGISTER_OBJECTIVE(SquaredLossRegressionOneAPI, LinearSquareLossOneAPI::Name()) -.describe("Regression with squared error with DPC++ backend.") -.set_body([]() { return new RegLossObjOneAPI(); }); -XGBOOST_REGISTER_OBJECTIVE(SquareLogErrorOneAPI, SquaredLogErrorOneAPI::Name()) -.describe("Regression with root mean squared logarithmic error with DPC++ backend.") -.set_body([]() { return new RegLossObjOneAPI(); }); -XGBOOST_REGISTER_OBJECTIVE(LogisticRegressionOneAPI, LogisticRegressionOneAPI::Name()) -.describe("Logistic regression for probability regression task with DPC++ backend.") -.set_body([]() { return new RegLossObjOneAPI(); }); -XGBOOST_REGISTER_OBJECTIVE(LogisticClassificationOneAPI, LogisticClassificationOneAPI::Name()) -.describe("Logistic regression for binary classification task with DPC++ backend.") -.set_body([]() { return new RegLossObjOneAPI(); }); 
-XGBOOST_REGISTER_OBJECTIVE(LogisticRawOneAPI, LogisticRawOneAPI::Name()) -.describe("Logistic regression for classification, output score " - "before logistic transformation with DPC++ backend.") -.set_body([]() { return new RegLossObjOneAPI(); }); - -} // namespace obj -} // namespace xgboost +#include +#include +#include +#include +#include + +#include "xgboost/host_device_vector.h" +#include "xgboost/json.h" +#include "xgboost/parameter.h" +#include "xgboost/span.h" + +#include "../../src/common/transform.h" +#include "../../src/common/common.h" +#include "./regression_loss_oneapi.h" + +#include "CL/sycl.hpp" + +namespace xgboost { +namespace obj { + +DMLC_REGISTRY_FILE_TAG(regression_obj_oneapi); + +struct RegLossParamOneAPI : public XGBoostParameter { + float scale_pos_weight; + // declare parameters + DMLC_DECLARE_PARAMETER(RegLossParamOneAPI) { + DMLC_DECLARE_FIELD(scale_pos_weight).set_default(1.0f).set_lower_bound(0.0f) + .describe("Scale the weight of positive examples by this factor"); + } +}; + +template +class RegLossObjOneAPI : public ObjFunction { + protected: + HostDeviceVector label_correct_; + + public: + RegLossObjOneAPI() = default; + + void Configure(const std::vector >& args) override { + param_.UpdateAllowUnknown(args); + + cl::sycl::default_selector selector; + qu_ = cl::sycl::queue(selector); + } + + void GetGradient(const HostDeviceVector& preds, + const MetaInfo &info, + int iter, + HostDeviceVector* out_gpair) override { + if (info.labels_.Size() == 0U) { + LOG(WARNING) << "Label set is empty."; + } + CHECK_EQ(preds.Size(), info.labels_.Size()) + << " " << "labels are not correctly provided" + << "preds.size=" << preds.Size() << ", label.size=" << info.labels_.Size() << ", " + << "Loss: " << Loss::Name(); + + size_t const ndata = preds.Size(); + out_gpair->Resize(ndata); + + // TODO: add label_correct check + label_correct_.Resize(1); + label_correct_.Fill(1); + + bool is_null_weight = info.weights_.Size() == 0; + + cl::sycl::buffer preds_buf(preds.HostPointer(), preds.Size()); + cl::sycl::buffer labels_buf(info.labels_.HostPointer(), info.labels_.Size()); + cl::sycl::buffer out_gpair_buf(out_gpair->HostPointer(), out_gpair->Size()); + cl::sycl::buffer weights_buf(is_null_weight ? NULL : info.weights_.HostPointer(), + is_null_weight ? 1 : info.weights_.Size()); + + cl::sycl::buffer additional_input_buf(1); + { + auto additional_input_acc = additional_input_buf.get_access(); + additional_input_acc[0] = 1; // Fill the label_correct flag + } + + auto scale_pos_weight = param_.scale_pos_weight; + if (!is_null_weight) { + CHECK_EQ(info.weights_.Size(), ndata) + << "Number of weights should be equal to number of data points."; + } + + qu_.submit([&](cl::sycl::handler& cgh) { + auto preds_acc = preds_buf.get_access(cgh); + auto labels_acc = labels_buf.get_access(cgh); + auto weights_acc = weights_buf.get_access(cgh); + auto out_gpair_acc = out_gpair_buf.get_access(cgh); + auto additional_input_acc = additional_input_buf.get_access(cgh); + cgh.parallel_for<>(cl::sycl::range<1>(ndata), [=](cl::sycl::id<1> pid) { + int idx = pid[0]; + bst_float p = Loss::PredTransform(preds_acc[idx]); + bst_float w = is_null_weight ? 1.0f : weights_acc[idx]; + bst_float label = labels_acc[idx]; + if (label == 1.0f) { + w *= scale_pos_weight; + } + if (!Loss::CheckLabel(label)) { + // If there is an incorrect label, the host code will know. 
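+          // Device code can neither log nor throw, so the failure is recorded in
+          // this shared buffer and raised as LOG(FATAL) on the host once the
+          // kernel has finished.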
+ additional_input_acc[0] = 0; + } + out_gpair_acc[idx] = GradientPair(Loss::FirstOrderGradient(p, label) * w, + Loss::SecondOrderGradient(p, label) * w); + }); + }).wait(); + + int flag = 1; + { + auto additional_input_acc = additional_input_buf.get_access(); + flag = additional_input_acc[0]; + } + + if (flag == 0) { + LOG(FATAL) << Loss::LabelErrorMsg(); + } + + } + + public: + const char* DefaultEvalMetric() const override { + return Loss::DefaultEvalMetric(); + } + + void PredTransform(HostDeviceVector *io_preds) override { + size_t const ndata = io_preds->Size(); + + cl::sycl::buffer io_preds_buf(io_preds->HostPointer(), io_preds->Size()); + + qu_.submit([&](cl::sycl::handler& cgh) { + auto io_preds_acc = io_preds_buf.get_access(cgh); + cgh.parallel_for<>(cl::sycl::range<1>(ndata), [=](cl::sycl::id<1> pid) { + int idx = pid[0]; + io_preds_acc[idx] = Loss::PredTransform(io_preds_acc[idx]); + }); + }).wait(); + } + + float ProbToMargin(float base_score) const override { + return Loss::ProbToMargin(base_score); + } + + void SaveConfig(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String(Loss::Name()); + out["reg_loss_param"] = ToJson(param_); + } + + void LoadConfig(Json const& in) override { + FromJson(in["reg_loss_param"], ¶m_); + } + + protected: + RegLossParamOneAPI param_; + + cl::sycl::queue qu_; +}; + +// register the objective functions +DMLC_REGISTER_PARAMETER(RegLossParamOneAPI); + +// TODO: Find a better way to dispatch names of DPC++ kernels with various template parameters of loss function +XGBOOST_REGISTER_OBJECTIVE(SquaredLossRegressionOneAPI, LinearSquareLossOneAPI::Name()) +.describe("Regression with squared error with DPC++ backend.") +.set_body([]() { return new RegLossObjOneAPI(); }); +XGBOOST_REGISTER_OBJECTIVE(SquareLogErrorOneAPI, SquaredLogErrorOneAPI::Name()) +.describe("Regression with root mean squared logarithmic error with DPC++ backend.") +.set_body([]() { return new RegLossObjOneAPI(); }); +XGBOOST_REGISTER_OBJECTIVE(LogisticRegressionOneAPI, LogisticRegressionOneAPI::Name()) +.describe("Logistic regression for probability regression task with DPC++ backend.") +.set_body([]() { return new RegLossObjOneAPI(); }); +XGBOOST_REGISTER_OBJECTIVE(LogisticClassificationOneAPI, LogisticClassificationOneAPI::Name()) +.describe("Logistic regression for binary classification task with DPC++ backend.") +.set_body([]() { return new RegLossObjOneAPI(); }); +XGBOOST_REGISTER_OBJECTIVE(LogisticRawOneAPI, LogisticRawOneAPI::Name()) +.describe("Logistic regression for classification, output score " + "before logistic transformation with DPC++ backend.") +.set_body([]() { return new RegLossObjOneAPI(); }); + +} // namespace obj +} // namespace xgboost diff --git a/python-package/xgboost/callback.py b/python-package/xgboost/callback.py index 5be6a058a..6569f7e3d 100644 --- a/python-package/xgboost/callback.py +++ b/python-package/xgboost/callback.py @@ -324,7 +324,7 @@ class EarlyStopping(TrainingCallback): es = xgboost.callback.EarlyStopping( rounds=2, - abs_tol=1e-3, + min_delta=1e-3, save_best=True, maximize=False, data_name="validation_0", diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 3204f5a2a..52175981a 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -312,6 +312,19 @@ __model_doc = f""" needs to be set to have categorical feature support. See :doc:`Categorical Data ` and :ref:`cat-param` for details. + multi_strategy : Optional[str] + + .. versionadded:: 2.0.0 + + .. 
note:: This parameter is work in progress.
+
+        The strategy used for training multi-target models, including multi-target
+        regression and multi-class classification. See :doc:`/tutorials/multioutput` for
+        more information.
+
+        - ``one_output_per_tree``: One model for each target.
+        - ``multi_output_tree``: Use multi-target trees.
+
     eval_metric : Optional[Union[str, List[str], Callable]]

         .. versionadded:: 1.6.0
@@ -355,18 +368,21 @@ __model_doc = f"""

         .. versionadded:: 1.6.0

-        Activates early stopping. Validation metric needs to improve at least once in
-        every **early_stopping_rounds** round(s) to continue training. Requires at least
-        one item in **eval_set** in :py:meth:`fit`.
+        - Activates early stopping. Validation metric needs to improve at least once in
+          every **early_stopping_rounds** round(s) to continue training. Requires at
+          least one item in **eval_set** in :py:meth:`fit`.

-        The method returns the model from the last iteration (not the best one). If
-        there's more than one item in **eval_set**, the last entry will be used for early
-        stopping. If there's more than one metric in **eval_metric**, the last metric
-        will be used for early stopping.
+        - The method returns the model from the last iteration, not the best one; use a
+          callback :py:class:`xgboost.callback.EarlyStopping` if returning the best
+          model is preferred.

-        If early stopping occurs, the model will have three additional fields:
-        :py:attr:`best_score`, :py:attr:`best_iteration` and
-        :py:attr:`best_ntree_limit`.
+        - If there's more than one item in **eval_set**, the last entry will be used for
+          early stopping. If there's more than one metric in **eval_metric**, the last
+          metric will be used for early stopping.
+
+        - If early stopping occurs, the model will have three additional fields:
+          :py:attr:`best_score`, :py:attr:`best_iteration` and
+          :py:attr:`best_ntree_limit`.

         .. note::

@@ -466,7 +482,9 @@ Parameters
     doc.extend([get_doc(i) for i in items])
     if end_note:
         doc.append(end_note)
-    full_doc = [header + "\n\n"]
+    full_doc = [
+        header + "\nSee :doc:`/python/sklearn_estimator` for more information.\n"
+    ]
     full_doc.extend(doc)
     cls.__doc__ = "".join(full_doc)
     return cls
@@ -624,6 +642,7 @@ class XGBModel(XGBModelBase):
         feature_types: Optional[FeatureTypes] = None,
         max_cat_to_onehot: Optional[int] = None,
         max_cat_threshold: Optional[int] = None,
+        multi_strategy: Optional[str] = None,
         eval_metric: Optional[Union[str, List[str], Callable]] = None,
         early_stopping_rounds: Optional[int] = None,
         callbacks: Optional[List[TrainingCallback]] = None,
@@ -670,6 +689,7 @@ class XGBModel(XGBModelBase):
         self.feature_types = feature_types
         self.max_cat_to_onehot = max_cat_to_onehot
         self.max_cat_threshold = max_cat_threshold
+        self.multi_strategy = multi_strategy
         self.eval_metric = eval_metric
         self.early_stopping_rounds = early_stopping_rounds
         self.callbacks = callbacks
@@ -1131,10 +1151,10 @@ class XGBModel(XGBModelBase):
         base_margin: Optional[ArrayLike] = None,
         iteration_range: Optional[Tuple[int, int]] = None,
     ) -> ArrayLike:
-        """Predict with `X`. If the model is trained with early stopping, then `best_iteration`
-        is used automatically. For tree models, when data is on GPU, like cupy array or
-        cuDF dataframe and `predictor` is not specified, the prediction is run on GPU
-        automatically, otherwise it will run on CPU.
+        """Predict with `X`. If the model is trained with early stopping, then
+        :py:attr:`best_iteration` is used automatically.
For tree models, when data is + on GPU, like cupy array or cuDF dataframe and `predictor` is not specified, the + prediction is run on GPU automatically, otherwise it will run on CPU. .. note:: This function is only thread safe for `gbtree` and `dart`. @@ -1209,8 +1229,8 @@ class XGBModel(XGBModelBase): ntree_limit: int = 0, iteration_range: Optional[Tuple[int, int]] = None, ) -> np.ndarray: - """Return the predicted leaf every tree for each sample. If the model is trained with - early stopping, then `best_iteration` is used automatically. + """Return the predicted leaf every tree for each sample. If the model is trained + with early stopping, then :py:attr:`best_iteration` is used automatically. Parameters ---------- @@ -1620,7 +1640,9 @@ class XGBClassifier(XGBModel, XGBClassifierBase): base_margin: Optional[ArrayLike] = None, iteration_range: Optional[Tuple[int, int]] = None, ) -> np.ndarray: - """Predict the probability of each `X` example being of a given class. + """Predict the probability of each `X` example being of a given class. If the + model is trained with early stopping, then :py:attr:`best_iteration` is used + automatically. .. note:: This function is only thread safe for `gbtree` and `dart`. @@ -1646,6 +1668,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase): prediction : a numpy array of shape array-like of shape (n_samples, n_classes) with the probability of each data example being of a given class. + """ # custom obj: Do nothing as we don't know what to do. # softprob: Do nothing, output is proba. @@ -2107,11 +2130,13 @@ class XGBRanker(XGBModel, XGBRankerMixIn): return super().apply(X, ntree_limit, iteration_range) def score(self, X: ArrayLike, y: ArrayLike) -> float: - """Evaluate score for data using the last evaluation metric. + """Evaluate score for data using the last evaluation metric. If the model is + trained with early stopping, then :py:attr:`best_iteration` is used + automatically. Parameters ---------- - X : pd.DataFrame|cudf.DataFrame + X : Union[pd.DataFrame, cudf.DataFrame] Feature matrix. A DataFrame with a special `qid` column. 
y : diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 3b33e8774..20a4c681e 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -10,7 +10,6 @@ import os import platform import socket import sys -import zipfile from concurrent.futures import ThreadPoolExecutor from contextlib import contextmanager from io import StringIO @@ -28,7 +27,6 @@ from typing import ( TypedDict, Union, ) -from urllib import request import numpy as np import pytest @@ -37,6 +35,13 @@ from scipy import sparse import xgboost as xgb from xgboost.core import ArrayLike from xgboost.sklearn import SklObjective +from xgboost.testing.data import ( + get_california_housing, + get_cancer, + get_digits, + get_sparse, + memory, +) hypothesis = pytest.importorskip("hypothesis") @@ -44,13 +49,8 @@ hypothesis = pytest.importorskip("hypothesis") from hypothesis import strategies from hypothesis.extra.numpy import arrays -joblib = pytest.importorskip("joblib") datasets = pytest.importorskip("sklearn.datasets") -Memory = joblib.Memory - -memory = Memory("./cachedir", verbose=0) - PytestSkip = TypedDict("PytestSkip", {"condition": bool, "reason": str}) @@ -352,137 +352,6 @@ class TestDataset: return self.name -@memory.cache -def get_california_housing() -> Tuple[np.ndarray, np.ndarray]: - data = datasets.fetch_california_housing() - return data.data, data.target - - -@memory.cache -def get_digits() -> Tuple[np.ndarray, np.ndarray]: - data = datasets.load_digits() - return data.data, data.target - - -@memory.cache -def get_cancer() -> Tuple[np.ndarray, np.ndarray]: - return datasets.load_breast_cancer(return_X_y=True) - - -@memory.cache -def get_sparse() -> Tuple[np.ndarray, np.ndarray]: - rng = np.random.RandomState(199) - n = 2000 - sparsity = 0.75 - X, y = datasets.make_regression(n, random_state=rng) - flag = rng.binomial(1, sparsity, X.shape) - for i in range(X.shape[0]): - for j in range(X.shape[1]): - if flag[i, j]: - X[i, j] = np.nan - return X, y - - -@memory.cache -def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]: - """ - Number of samples: 1460 - Number of features: 20 - Number of categorical features: 10 - Number of numerical features: 10 - """ - from sklearn.datasets import fetch_openml - - X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True) - - categorical_columns_subset: List[str] = [ - "BldgType", # 5 cats, no nan - "GarageFinish", # 3 cats, nan - "LotConfig", # 5 cats, no nan - "Functional", # 7 cats, no nan - "MasVnrType", # 4 cats, nan - "HouseStyle", # 8 cats, no nan - "FireplaceQu", # 5 cats, nan - "ExterCond", # 5 cats, no nan - "ExterQual", # 4 cats, no nan - "PoolQC", # 3 cats, nan - ] - - numerical_columns_subset: List[str] = [ - "3SsnPorch", - "Fireplaces", - "BsmtHalfBath", - "HalfBath", - "GarageCars", - "TotRmsAbvGrd", - "BsmtFinSF1", - "BsmtFinSF2", - "GrLivArea", - "ScreenPorch", - ] - - X = X[categorical_columns_subset + numerical_columns_subset] - X[categorical_columns_subset] = X[categorical_columns_subset].astype("category") - return X, y - - -@memory.cache -def get_mq2008( - dpath: str, -) -> Tuple[ - sparse.csr_matrix, - np.ndarray, - np.ndarray, - sparse.csr_matrix, - np.ndarray, - np.ndarray, - sparse.csr_matrix, - np.ndarray, - np.ndarray, -]: - from sklearn.datasets import load_svmlight_files - - src = "https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip" - target = dpath + "/MQ2008.zip" - if not os.path.exists(target): - request.urlretrieve(url=src, 
filename=target) - - with zipfile.ZipFile(target, "r") as f: - f.extractall(path=dpath) - - ( - x_train, - y_train, - qid_train, - x_test, - y_test, - qid_test, - x_valid, - y_valid, - qid_valid, - ) = load_svmlight_files( - ( - dpath + "MQ2008/Fold1/train.txt", - dpath + "MQ2008/Fold1/test.txt", - dpath + "MQ2008/Fold1/vali.txt", - ), - query_id=True, - zero_based=False, - ) - - return ( - x_train, - y_train, - qid_train, - x_test, - y_test, - qid_test, - x_valid, - y_valid, - qid_valid, - ) - - # pylint: disable=too-many-arguments,too-many-locals @memory.cache def make_categorical( @@ -737,20 +606,7 @@ _unweighted_datasets_strategy = strategies.sampled_from( TestDataset( "calif_housing-l1", get_california_housing, "reg:absoluteerror", "mae" ), - TestDataset("digits", get_digits, "multi:softmax", "mlogloss"), TestDataset("cancer", get_cancer, "binary:logistic", "logloss"), - TestDataset( - "mtreg", - lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3), - "reg:squarederror", - "rmse", - ), - TestDataset( - "mtreg-l1", - lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3), - "reg:absoluteerror", - "mae", - ), TestDataset("sparse", get_sparse, "reg:squarederror", "rmse"), TestDataset("sparse-l1", get_sparse, "reg:absoluteerror", "mae"), TestDataset( @@ -763,37 +619,71 @@ _unweighted_datasets_strategy = strategies.sampled_from( ) -@strategies.composite -def _dataset_weight_margin(draw: Callable) -> TestDataset: - data: TestDataset = draw(_unweighted_datasets_strategy) - if draw(strategies.booleans()): - data.w = draw( - arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0)) - ) - if draw(strategies.booleans()): - num_class = 1 - if data.objective == "multi:softmax": - num_class = int(np.max(data.y) + 1) - elif data.name.startswith("mtreg"): - num_class = data.y.shape[1] +def make_datasets_with_margin( + unweighted_strategy: strategies.SearchStrategy, +) -> Callable: + """Factory function for creating strategies that generates datasets with weight and + base margin. - data.margin = draw( - arrays( - np.float64, - (data.y.shape[0] * num_class), - elements=strategies.floats(0.5, 1.0), + """ + + @strategies.composite + def weight_margin(draw: Callable) -> TestDataset: + data: TestDataset = draw(unweighted_strategy) + if draw(strategies.booleans()): + data.w = draw( + arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0)) ) - ) - assert data.margin is not None - if num_class != 1: - data.margin = data.margin.reshape(data.y.shape[0], num_class) + if draw(strategies.booleans()): + num_class = 1 + if data.objective == "multi:softmax": + num_class = int(np.max(data.y) + 1) + elif data.name.startswith("mtreg"): + num_class = data.y.shape[1] - return data + data.margin = draw( + arrays( + np.float64, + (data.y.shape[0] * num_class), + elements=strategies.floats(0.5, 1.0), + ) + ) + assert data.margin is not None + if num_class != 1: + data.margin = data.margin.reshape(data.y.shape[0], num_class) + + return data + + return weight_margin -# A strategy for drawing from a set of example datasets -# May add random weights to the dataset -dataset_strategy = _dataset_weight_margin() +# A strategy for drawing from a set of example datasets. 
May add random weights to the +# dataset +dataset_strategy = make_datasets_with_margin(_unweighted_datasets_strategy)() + + +_unweighted_multi_datasets_strategy = strategies.sampled_from( + [ + TestDataset("digits", get_digits, "multi:softmax", "mlogloss"), + TestDataset( + "mtreg", + lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3), + "reg:squarederror", + "rmse", + ), + TestDataset( + "mtreg-l1", + lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3), + "reg:absoluteerror", + "mae", + ), + ] +) + +# A strategy for drawing from a set of multi-target/multi-class datasets. +multi_dataset_strategy = make_datasets_with_margin( + _unweighted_multi_datasets_strategy +)() def non_increasing(L: Sequence[float], tolerance: float = 1e-4) -> bool: diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py index 4f79d7358..477d0cf3d 100644 --- a/python-package/xgboost/testing/data.py +++ b/python-package/xgboost/testing/data.py @@ -1,10 +1,20 @@ """Utilities for data generation.""" -from typing import Any, Generator, Tuple, Union +import os +import zipfile +from typing import Any, Generator, List, Tuple, Union +from urllib import request import numpy as np +import pytest +from numpy.random import Generator as RNG +from scipy import sparse +import xgboost from xgboost.data import pandas_pyarrow_mapper +joblib = pytest.importorskip("joblib") +memory = joblib.Memory("./cachedir", verbose=0) + def np_dtypes( n_samples: int, n_features: int @@ -179,3 +189,154 @@ def pd_arrow_dtypes() -> Generator: dtype=pd.ArrowDtype(pa.bool_()), ) yield orig, df + + +def check_inf(rng: RNG) -> None: + """Validate there's no inf in X.""" + X = rng.random(size=32).reshape(8, 4) + y = rng.random(size=8) + X[5, 2] = np.inf + + with pytest.raises(ValueError, match="Input data contains `inf`"): + xgboost.QuantileDMatrix(X, y) + + with pytest.raises(ValueError, match="Input data contains `inf`"): + xgboost.DMatrix(X, y) + + +@memory.cache +def get_california_housing() -> Tuple[np.ndarray, np.ndarray]: + """Fetch the California housing dataset from sklearn.""" + datasets = pytest.importorskip("sklearn.datasets") + data = datasets.fetch_california_housing() + return data.data, data.target + + +@memory.cache +def get_digits() -> Tuple[np.ndarray, np.ndarray]: + """Fetch the digits dataset from sklearn.""" + datasets = pytest.importorskip("sklearn.datasets") + data = datasets.load_digits() + return data.data, data.target + + +@memory.cache +def get_cancer() -> Tuple[np.ndarray, np.ndarray]: + """Fetch the breast cancer dataset from sklearn.""" + datasets = pytest.importorskip("sklearn.datasets") + return datasets.load_breast_cancer(return_X_y=True) + + +@memory.cache +def get_sparse() -> Tuple[np.ndarray, np.ndarray]: + """Generate a sparse dataset.""" + datasets = pytest.importorskip("sklearn.datasets") + rng = np.random.RandomState(199) + n = 2000 + sparsity = 0.75 + X, y = datasets.make_regression(n, random_state=rng) + flag = rng.binomial(1, sparsity, X.shape) + for i in range(X.shape[0]): + for j in range(X.shape[1]): + if flag[i, j]: + X[i, j] = np.nan + return X, y + + +@memory.cache +def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]: + """ + Number of samples: 1460 + Number of features: 20 + Number of categorical features: 10 + Number of numerical features: 10 + """ + datasets = pytest.importorskip("sklearn.datasets") + X, y = datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True) + + categorical_columns_subset: List[str] = 
[ + "BldgType", # 5 cats, no nan + "GarageFinish", # 3 cats, nan + "LotConfig", # 5 cats, no nan + "Functional", # 7 cats, no nan + "MasVnrType", # 4 cats, nan + "HouseStyle", # 8 cats, no nan + "FireplaceQu", # 5 cats, nan + "ExterCond", # 5 cats, no nan + "ExterQual", # 4 cats, no nan + "PoolQC", # 3 cats, nan + ] + + numerical_columns_subset: List[str] = [ + "3SsnPorch", + "Fireplaces", + "BsmtHalfBath", + "HalfBath", + "GarageCars", + "TotRmsAbvGrd", + "BsmtFinSF1", + "BsmtFinSF2", + "GrLivArea", + "ScreenPorch", + ] + + X = X[categorical_columns_subset + numerical_columns_subset] + X[categorical_columns_subset] = X[categorical_columns_subset].astype("category") + return X, y + + +@memory.cache +def get_mq2008( + dpath: str, +) -> Tuple[ + sparse.csr_matrix, + np.ndarray, + np.ndarray, + sparse.csr_matrix, + np.ndarray, + np.ndarray, + sparse.csr_matrix, + np.ndarray, + np.ndarray, +]: + """Fetch the mq2008 dataset.""" + datasets = pytest.importorskip("sklearn.datasets") + src = "https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip" + target = os.path.join(dpath, "MQ2008.zip") + if not os.path.exists(target): + request.urlretrieve(url=src, filename=target) + + with zipfile.ZipFile(target, "r") as f: + f.extractall(path=dpath) + + ( + x_train, + y_train, + qid_train, + x_test, + y_test, + qid_test, + x_valid, + y_valid, + qid_valid, + ) = datasets.load_svmlight_files( + ( + os.path.join(dpath, "MQ2008/Fold1/train.txt"), + os.path.join(dpath, "MQ2008/Fold1/test.txt"), + os.path.join(dpath, "MQ2008/Fold1/vali.txt"), + ), + query_id=True, + zero_based=False, + ) + + return ( + x_train, + y_train, + qid_train, + x_test, + y_test, + qid_test, + x_valid, + y_valid, + qid_valid, + ) diff --git a/python-package/xgboost/testing/params.py b/python-package/xgboost/testing/params.py index 3af3306da..e6ba73e1f 100644 --- a/python-package/xgboost/testing/params.py +++ b/python-package/xgboost/testing/params.py @@ -4,8 +4,8 @@ from typing import cast import pytest -hypothesis = pytest.importorskip("hypothesis") -from hypothesis import strategies # pylint:disable=wrong-import-position +strategies = pytest.importorskip("hypothesis.strategies") + exact_parameter_strategy = strategies.fixed_dictionaries( { @@ -41,6 +41,26 @@ hist_parameter_strategy = strategies.fixed_dictionaries( and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide") ) +hist_multi_parameter_strategy = strategies.fixed_dictionaries( + { + "max_depth": strategies.integers(1, 11), + "max_leaves": strategies.integers(0, 1024), + "max_bin": strategies.integers(2, 512), + "multi_strategy": strategies.sampled_from( + ["multi_output_tree", "one_output_per_tree"] + ), + "grow_policy": strategies.sampled_from(["lossguide", "depthwise"]), + "min_child_weight": strategies.floats(0.5, 2.0), + # We cannot enable subsampling as the training loss can increase + # 'subsample': strategies.floats(0.5, 1.0), + "colsample_bytree": strategies.floats(0.5, 1.0), + "colsample_bylevel": strategies.floats(0.5, 1.0), + } +).filter( + lambda x: (cast(int, x["max_depth"]) > 0 or cast(int, x["max_leaves"]) > 0) + and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide") +) + cat_parameter_strategy = strategies.fixed_dictionaries( { "max_cat_to_onehot": strategies.integers(1, 128), diff --git a/python-package/xgboost/testing/ranking.py b/python-package/xgboost/testing/ranking.py index fe4fc8404..7c75012c2 100644 --- a/python-package/xgboost/testing/ranking.py +++ b/python-package/xgboost/testing/ranking.py @@ -48,7 +48,12 @@ def 
run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None: def neg_mse(*args: Any, **kwargs: Any) -> float: return -float(mean_squared_error(*args, **kwargs)) - ranker = xgb.XGBRanker(n_estimators=3, eval_metric=neg_mse, tree_method=tree_method) + ranker = xgb.XGBRanker( + n_estimators=3, + eval_metric=neg_mse, + tree_method=tree_method, + disable_default_eval_metric=True, + ) ranker.fit(df, y, eval_set=[(valid_df, y)]) score = ranker.score(valid_df, y) assert np.isclose(score, ranker.evals_result()["validation_0"]["neg_mse"][-1]) diff --git a/src/c_api/c_api_utils.h b/src/c_api/c_api_utils.h index c0ee65e00..d0bf00cad 100644 --- a/src/c_api/c_api_utils.h +++ b/src/c_api/c_api_utils.h @@ -55,6 +55,7 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows *out_dim = 2; shape.resize(*out_dim); shape.front() = rows; + // chunksize can be 1 if it's softmax shape.back() = std::min(groups, chunksize); } break; diff --git a/src/common/algorithm.h b/src/common/algorithm.h index 739a84968..a34010cd0 100644 --- a/src/common/algorithm.h +++ b/src/common/algorithm.h @@ -14,7 +14,7 @@ // clang with libstdc++ works as well #if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__sun) && !defined(sun) && \ - !defined(__APPLE__) && __has_include() + !defined(__APPLE__) && __has_include() && __has_include() #define GCC_HAS_PARALLEL 1 #endif // GLIC_VERSION diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index b1d165c42..956c9cf04 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -121,17 +121,20 @@ namespace dh { #ifdef XGBOOST_USE_NCCL #define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__) -inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, - int line) { +inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) { if (code != ncclSuccess) { std::stringstream ss; - ss << "NCCL failure :" << ncclGetErrorString(code); + ss << "NCCL failure: " << ncclGetErrorString(code) << "."; + ss << " " << file << "(" << line << ")\n"; if (code == ncclUnhandledCudaError) { // nccl usually preserves the last error so we can get more details. auto err = cudaPeekAtLastError(); - ss << " " << thrust::system_error(err, thrust::cuda_category()).what(); + ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n"; + } else if (code == ncclSystemError) { + ss << " This might be caused by a network configuration issue. 
Please consider specifying "
+            "the network interface for NCCL via environment variables listed in its reference: "
+            "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
+    }
-    ss << " " << file << "(" << line << ")";
     LOG(FATAL) << ss.str();
   }
diff --git a/src/common/device_helpers.hip.h b/src/common/device_helpers.hip.h
index 365126465..38bc29f91 100644
--- a/src/common/device_helpers.hip.h
+++ b/src/common/device_helpers.hip.h
@@ -2,6 +2,9 @@
  * Copyright 2017-2023 XGBoost contributors
  */
 #pragma once
+
+#if defined(XGBOOST_USE_CUDA)
+
 #include <thrust/binary_search.h>  // thrust::upper_bound
 #include
 #include
@@ -95,20 +98,23 @@ XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) {  // NOLINT
 }

 namespace dh {
-#ifdef XGBOOST_USE_NCCL
+#ifdef XGBOOST_USE_RCCL
 #define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)

-inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
-                                     int line) {
+inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
   if (code != ncclSuccess) {
     std::stringstream ss;
-    ss << "NCCL failure :" << ncclGetErrorString(code);
+    ss << "RCCL failure: " << ncclGetErrorString(code) << ".";
+    ss << " " << file << "(" << line << ")\n";
     if (code == ncclUnhandledCudaError) {
       // nccl usually preserves the last error so we can get more details.
       auto err = hipPeekAtLastError();
-      ss << " " << thrust::system_error(err, thrust::hip_category()).what();
+      ss << " HIP error: " << thrust::system_error(err, thrust::hip_category()).what() << "\n";
+    } else if (code == ncclSystemError) {
+      ss << " This might be caused by a network configuration issue. Please consider specifying "
+            "the network interface for NCCL via environment variables listed in its reference: "
+            "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
     }
-    ss << " " << file << "(" << line << ")";
     LOG(FATAL) << ss.str();
   }
diff --git a/src/common/error_msg.h b/src/common/error_msg.h
index 48a2c92a4..3dbb7f52c 100644
--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@@ -20,5 +20,9 @@ constexpr StringView GroupSize() {
 constexpr StringView LabelScoreSize() {
   return "The size of label doesn't match the size of prediction.";
 }
+
+constexpr StringView InfInData() {
+  return "Input data contains `inf` or a value too large, while `missing` is not set to `inf`";
+}
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index c09e5c71a..d95d405eb 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -7,23 +7,22 @@
 #ifndef XGBOOST_COMMON_HIST_UTIL_H_
 #define XGBOOST_COMMON_HIST_UTIL_H_

-#include
-
 #include
+#include <cstdint>  // for uint32_t
 #include
 #include
 #include
 #include
 #include

-#include "algorithm.h"  // SegmentId
 #include "categorical.h"
 #include "common.h"
 #include "quantile.h"
 #include "row_set.h"
 #include "threading_utils.h"
 #include "timer.h"
-#include "xgboost/base.h"  // bst_feature_t, bst_bin_t
+#include "xgboost/base.h"  // for bst_feature_t, bst_bin_t
+#include "xgboost/data.h"

 namespace xgboost {
 class GHistIndexMatrix;
@@ -392,15 +391,18 @@ class HistCollection {
   }

   // have we computed a histogram for i-th node?
- bool RowExists(bst_uint nid) const { + [[nodiscard]] bool RowExists(bst_uint nid) const { const uint32_t k_max = std::numeric_limits::max(); return (nid < row_ptr_.size() && row_ptr_[nid] != k_max); } - - // initialize histogram collection - void Init(uint32_t nbins) { - if (nbins_ != nbins) { - nbins_ = nbins; + /** + * \brief Initialize histogram collection. + * + * \param n_total_bins Number of bins across all features. + */ + void Init(std::uint32_t n_total_bins) { + if (nbins_ != n_total_bins) { + nbins_ = n_total_bins; // quite expensive operation, so let's do this only once data_.clear(); } diff --git a/src/common/json.cc b/src/common/json.cc index 8e2dd05ff..c3d61b47d 100644 --- a/src/common/json.cc +++ b/src/common/json.cc @@ -333,7 +333,7 @@ size_t constexpr JsonReader::kMaxNumLength; Json JsonReader::Parse() { while (true) { SkipSpaces(); - char c = PeekNextChar(); + auto c = PeekNextChar(); if (c == -1) { break; } if (c == '{') { @@ -408,13 +408,13 @@ void JsonReader::Error(std::string msg) const { } namespace { -bool IsSpace(char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; } +bool IsSpace(JsonReader::Char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; } } // anonymous namespace // Json class void JsonReader::SkipSpaces() { while (cursor_.Pos() < raw_str_.size()) { - char c = raw_str_[cursor_.Pos()]; + Char c = raw_str_[cursor_.Pos()]; if (IsSpace(c)) { cursor_.Forward(); } else { @@ -436,12 +436,12 @@ void ParseStr(std::string const& str) { } Json JsonReader::ParseString() { - char ch { GetConsecutiveChar('\"') }; // NOLINT + Char ch { GetConsecutiveChar('\"') }; // NOLINT std::string str; while (true) { ch = GetNextChar(); if (ch == '\\') { - char next = static_cast(GetNextChar()); + Char next{GetNextChar()}; switch (next) { case 'r': str += u8"\r"; break; case 'n': str += u8"\n"; break; @@ -466,8 +466,8 @@ Json JsonReader::ParseString() { } Json JsonReader::ParseNull() { - char ch = GetNextNonSpaceChar(); - std::string buffer{ch}; + Char ch = GetNextNonSpaceChar(); + std::string buffer{static_cast(ch)}; for (size_t i = 0; i < 3; ++i) { buffer.push_back(GetNextChar()); } @@ -480,7 +480,7 @@ Json JsonReader::ParseNull() { Json JsonReader::ParseArray() { std::vector data; - char ch { GetConsecutiveChar('[') }; // NOLINT + Char ch { GetConsecutiveChar('[') }; // NOLINT while (true) { if (PeekNextChar() == ']') { GetConsecutiveChar(']'); @@ -503,7 +503,7 @@ Json JsonReader::ParseObject() { Object::Map data; SkipSpaces(); - char ch = PeekNextChar(); + auto ch = PeekNextChar(); if (ch == '}') { GetConsecutiveChar('}'); @@ -652,7 +652,7 @@ Json JsonReader::ParseNumber() { Json JsonReader::ParseBoolean() { bool result = false; - char ch = GetNextNonSpaceChar(); + Char ch = GetNextNonSpaceChar(); std::string const t_value = u8"true"; std::string const f_value = u8"false"; @@ -737,7 +737,8 @@ Json UBJReader::ParseArray() { case 'L': return ParseTypedArray(n); default: - LOG(FATAL) << "`" + std::string{type} + "` is not supported for typed array."; // NOLINT + LOG(FATAL) << "`" + std::string{static_cast(type)} + // NOLINT + "` is not supported for typed array."; } } std::vector results; @@ -794,7 +795,7 @@ Json UBJReader::Load() { Json UBJReader::Parse() { while (true) { - char c = PeekNextChar(); + auto c = PeekNextChar(); if (c == -1) { break; } diff --git a/src/common/numeric.h b/src/common/numeric.h index 9d255e9af..3cd6db0e1 100644 --- a/src/common/numeric.h +++ b/src/common/numeric.h @@ -1,13 +1,15 @@ -/*! - * Copyright 2022, XGBoost contributors. 
+/** + * Copyright 2022-2023 by XGBoost contributors. */ #ifndef XGBOOST_COMMON_NUMERIC_H_ #define XGBOOST_COMMON_NUMERIC_H_ #include // OMPException -#include // std::max -#include // std::iterator_traits +#include // for std::max +#include // for size_t +#include // for int32_t +#include // for iterator_traits #include #include "common.h" // AssertGPUSupport @@ -15,8 +17,7 @@ #include "xgboost/context.h" // Context #include "xgboost/host_device_vector.h" // HostDeviceVector -namespace xgboost { -namespace common { +namespace xgboost::common { /** * \brief Run length encode on CPU, input must be sorted. @@ -111,11 +112,11 @@ inline double Reduce(Context const*, HostDeviceVector const&) { namespace cpu_impl { template V Reduce(Context const* ctx, It first, It second, V const& init) { - size_t n = std::distance(first, second); - common::MemStackAllocator result_tloc(ctx->Threads(), init); - common::ParallelFor(n, ctx->Threads(), - [&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; }); - auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + ctx->Threads(), init); + std::size_t n = std::distance(first, second); + auto n_threads = static_cast(std::min(n, static_cast(ctx->Threads()))); + common::MemStackAllocator result_tloc(n_threads, init); + common::ParallelFor(n, n_threads, [&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; }); + auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + n_threads, init); return result; } } // namespace cpu_impl @@ -144,7 +145,6 @@ void Iota(Context const* ctx, It first, It last, }); } } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common #endif // XGBOOST_COMMON_NUMERIC_H_ diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h index 9a9c162d2..e5e6971e5 100644 --- a/src/common/partition_builder.h +++ b/src/common/partition_builder.h @@ -1,391 +1,386 @@ -/*! 
- * Copyright 2021-2022 by Contributors - * \file row_set.h - * \brief Quick Utility to compute subset of rows - * \author Philip Cho, Tianqi Chen - */ -#ifndef XGBOOST_COMMON_PARTITION_BUILDER_H_ -#define XGBOOST_COMMON_PARTITION_BUILDER_H_ - -#include - -#include -#include -#include -#include -#include - -#include "../tree/hist/expand_entry.h" -#include "categorical.h" -#include "column_matrix.h" -#include "xgboost/context.h" -#include "xgboost/tree_model.h" - -namespace xgboost { -namespace common { - -// The builder is required for samples partition to left and rights children for set of nodes -// Responsible for: -// 1) Effective memory allocation for intermediate results for multi-thread work -// 2) Merging partial results produced by threads into original row set (row_set_collection_) -// BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature -template -class PartitionBuilder { - using BitVector = RBitField8; - - public: - template - void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) { - left_right_nodes_sizes_.resize(n_nodes); - blocks_offsets_.resize(n_nodes+1); - - blocks_offsets_[0] = 0; - for (size_t i = 1; i < n_nodes+1; ++i) { - blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTask(i-1); - } - - if (n_tasks > max_n_tasks_) { - mem_blocks_.resize(n_tasks); - max_n_tasks_ = n_tasks; - } - } - - // split row indexes (rid_span) to 2 parts (left_part, right_part) depending - // on comparison of indexes values (idx_span) and split point (split_cond) - // Handle dense columns - // Analog of std::stable_partition, but in no-inplace manner - template - inline std::pair PartitionKernel(ColumnType* p_column, - common::Span row_indices, - common::Span left_part, - common::Span right_part, - size_t base_rowid, Predicate&& pred) { - auto& column = *p_column; - size_t* p_left_part = left_part.data(); - size_t* p_right_part = right_part.data(); - size_t nleft_elems = 0; - size_t nright_elems = 0; - - auto p_row_indices = row_indices.data(); - auto n_samples = row_indices.size(); - - for (size_t i = 0; i < n_samples; ++i) { - auto rid = p_row_indices[i]; - const int32_t bin_id = column[rid - base_rowid]; - if (any_missing && bin_id == ColumnType::kMissingId) { - if (default_left) { - p_left_part[nleft_elems++] = rid; - } else { - p_right_part[nright_elems++] = rid; - } - } else { - if (pred(rid, bin_id)) { - p_left_part[nleft_elems++] = rid; - } else { - p_right_part[nright_elems++] = rid; - } - } - } - - return {nleft_elems, nright_elems}; - } - - template - inline std::pair PartitionRangeKernel(common::Span ridx, - common::Span left_part, - common::Span right_part, - Pred pred) { - size_t* p_left_part = left_part.data(); - size_t* p_right_part = right_part.data(); - size_t nleft_elems = 0; - size_t nright_elems = 0; - for (auto row_id : ridx) { - if (pred(row_id)) { - p_left_part[nleft_elems++] = row_id; - } else { - p_right_part[nright_elems++] = row_id; - } - } - return {nleft_elems, nright_elems}; - } - - template - void Partition(const size_t node_in_set, std::vector const &nodes, - const common::Range1d range, - const bst_bin_t split_cond, GHistIndexMatrix const& gmat, - const common::ColumnMatrix& column_matrix, - const RegTree& tree, const size_t* rid) { - common::Span rid_span(rid + range.begin(), rid + range.end()); - common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); - common::Span right = GetRightBuffer(node_in_set, range.begin(), range.end()); - std::size_t nid = nodes[node_in_set].nid; - bst_feature_t fid = 
tree[nid].SplitIndex(); - bool default_left = tree[nid].DefaultLeft(); - bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical; - auto node_cats = tree.NodeCats(nid); - auto const& cut_values = gmat.cut.Values(); - - auto pred_hist = [&](auto ridx, auto bin_id) { - if (any_cat && is_cat) { - auto gidx = gmat.GetGindex(ridx, fid); - bool go_left = default_left; - if (gidx > -1) { - go_left = Decision(node_cats, cut_values[gidx]); - } - return go_left; - } else { - return bin_id <= split_cond; - } - }; - - auto pred_approx = [&](auto ridx) { - auto gidx = gmat.GetGindex(ridx, fid); - bool go_left = default_left; - if (gidx > -1) { - if (is_cat) { - go_left = Decision(node_cats, cut_values[gidx]); - } else { - go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value; - } - } - return go_left; - }; - - std::pair child_nodes_sizes; - if (!column_matrix.IsInitialized()) { - child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx); - } else { - if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) { - auto column = column_matrix.DenseColumn(fid); - if (default_left) { - child_nodes_sizes = PartitionKernel(&column, rid_span, left, right, - gmat.base_rowid, pred_hist); - } else { - child_nodes_sizes = PartitionKernel(&column, rid_span, left, right, - gmat.base_rowid, pred_hist); - } - } else { - CHECK_EQ(any_missing, true); - auto column = - column_matrix.SparseColumn(fid, rid_span.front() - gmat.base_rowid); - if (default_left) { - child_nodes_sizes = PartitionKernel(&column, rid_span, left, right, - gmat.base_rowid, pred_hist); - } else { - child_nodes_sizes = PartitionKernel(&column, rid_span, left, right, - gmat.base_rowid, pred_hist); - } - } - } - - const size_t n_left = child_nodes_sizes.first; - const size_t n_right = child_nodes_sizes.second; - - SetNLeftElems(node_in_set, range.begin(), n_left); - SetNRightElems(node_in_set, range.begin(), n_right); - } - - /** - * @brief When data is split by column, we don't have all the features locally on the current - * worker, so we go through all the rows and mark the bit vectors on whether the decision is made - * to go right, or if the feature value used for the split is missing. - */ - void MaskRows(const size_t node_in_set, std::vector const &nodes, - const common::Range1d range, GHistIndexMatrix const& gmat, - const common::ColumnMatrix& column_matrix, - const RegTree& tree, const size_t* rid, - BitVector* decision_bits, BitVector* missing_bits) { - common::Span rid_span(rid + range.begin(), rid + range.end()); - std::size_t nid = nodes[node_in_set].nid; - bst_feature_t fid = tree[nid].SplitIndex(); - bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical; - auto node_cats = tree.NodeCats(nid); - auto const& cut_values = gmat.cut.Values(); - - if (!column_matrix.IsInitialized()) { - for (auto row_id : rid_span) { - auto gidx = gmat.GetGindex(row_id, fid); - if (gidx > -1) { - bool go_left = false; - if (is_cat) { - go_left = Decision(node_cats, cut_values[gidx]); - } else { - go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value; - } - if (go_left) { - decision_bits->Set(row_id - gmat.base_rowid); - } - } else { - missing_bits->Set(row_id - gmat.base_rowid); - } - } - } else { - LOG(FATAL) << "Column data split is only supported for the `approx` tree method"; - } - } - - /** - * @brief Once we've aggregated the decision and missing bits from all the workers, we can then - * use them to partition the rows accordingly. 
- */ - void PartitionByMask(const size_t node_in_set, - std::vector const& nodes, - const common::Range1d range, GHistIndexMatrix const& gmat, - const common::ColumnMatrix& column_matrix, const RegTree& tree, - const size_t* rid, BitVector const& decision_bits, - BitVector const& missing_bits) { - common::Span rid_span(rid + range.begin(), rid + range.end()); - common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); - common::Span right = GetRightBuffer(node_in_set, range.begin(), range.end()); - std::size_t nid = nodes[node_in_set].nid; - bool default_left = tree[nid].DefaultLeft(); - - auto pred_approx = [&](auto ridx) { - bool go_left = default_left; - bool is_missing = missing_bits.Check(ridx - gmat.base_rowid); - if (!is_missing) { - go_left = decision_bits.Check(ridx - gmat.base_rowid); - } - return go_left; - }; - - std::pair child_nodes_sizes; - if (!column_matrix.IsInitialized()) { - child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx); - } else { - LOG(FATAL) << "Column data split is only supported for the `approx` tree method"; - } - - const size_t n_left = child_nodes_sizes.first; - const size_t n_right = child_nodes_sizes.second; - - SetNLeftElems(node_in_set, range.begin(), n_left); - SetNRightElems(node_in_set, range.begin(), n_right); - } - - // allocate thread local memory, should be called for each specific task - void AllocateForTask(size_t id) { - if (mem_blocks_[id].get() == nullptr) { - BlockInfo* local_block_ptr = new BlockInfo; - CHECK_NE(local_block_ptr, (BlockInfo*)nullptr); - mem_blocks_[id].reset(local_block_ptr); - } - } - - common::Span GetLeftBuffer(int nid, size_t begin, size_t end) { - const size_t task_idx = GetTaskIdx(nid, begin); - return { mem_blocks_.at(task_idx)->Left(), end - begin }; - } - - common::Span GetRightBuffer(int nid, size_t begin, size_t end) { - const size_t task_idx = GetTaskIdx(nid, begin); - return { mem_blocks_.at(task_idx)->Right(), end - begin }; - } - - void SetNLeftElems(int nid, size_t begin, size_t n_left) { - size_t task_idx = GetTaskIdx(nid, begin); - mem_blocks_.at(task_idx)->n_left = n_left; - } - - void SetNRightElems(int nid, size_t begin, size_t n_right) { - size_t task_idx = GetTaskIdx(nid, begin); - mem_blocks_.at(task_idx)->n_right = n_right; - } - - - size_t GetNLeftElems(int nid) const { - return left_right_nodes_sizes_[nid].first; - } - - size_t GetNRightElems(int nid) const { - return left_right_nodes_sizes_[nid].second; - } - - // Each thread has partial results for some set of tree-nodes - // The function decides order of merging partial results into final row set - void CalculateRowOffsets() { - for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) { - size_t n_left = 0; - for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) { - mem_blocks_[j]->n_offset_left = n_left; - n_left += mem_blocks_[j]->n_left; - } - size_t n_right = 0; - for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) { - mem_blocks_[j]->n_offset_right = n_left + n_right; - n_right += mem_blocks_[j]->n_right; - } - left_right_nodes_sizes_[i] = {n_left, n_right}; - } - } - - void MergeToArray(int nid, size_t begin, size_t* rows_indexes) { - size_t task_idx = GetTaskIdx(nid, begin); - - size_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left; - size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right; - - const size_t* left = mem_blocks_[task_idx]->Left(); - const size_t* right = mem_blocks_[task_idx]->Right(); - - std::copy_n(left, 
mem_blocks_[task_idx]->n_left, left_result); - std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result); - } - - size_t GetTaskIdx(int nid, size_t begin) { - return blocks_offsets_[nid] + begin / BlockSize; - } - - // Copy row partitions into global cache for reuse in objective - template - void LeafPartition(Context const* ctx, RegTree const& tree, RowSetCollection const& row_set, - std::vector* p_position, Sampledp sampledp) const { - auto& h_pos = *p_position; - h_pos.resize(row_set.Data()->size(), std::numeric_limits::max()); - - auto p_begin = row_set.Data()->data(); - ParallelFor(row_set.Size(), ctx->Threads(), [&](size_t i) { - auto const& node = row_set[i]; - if (node.node_id < 0) { - return; - } - CHECK(tree[node.node_id].IsLeaf()); - if (node.begin) { // guard for empty node. - size_t ptr_offset = node.end - p_begin; - CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id; - for (auto idx = node.begin; idx != node.end; ++idx) { - h_pos[*idx] = sampledp(*idx) ? ~node.node_id : node.node_id; - } - } - }); - } - - protected: - struct BlockInfo{ - size_t n_left; - size_t n_right; - - size_t n_offset_left; - size_t n_offset_right; - - size_t* Left() { - return &left_data_[0]; - } - - size_t* Right() { - return &right_data_[0]; - } - private: - size_t left_data_[BlockSize]; - size_t right_data_[BlockSize]; - }; - std::vector> left_right_nodes_sizes_; - std::vector blocks_offsets_; - std::vector> mem_blocks_; - size_t max_n_tasks_ = 0; -}; - -} // namespace common -} // namespace xgboost - -#endif // XGBOOST_COMMON_PARTITION_BUILDER_H_ +/** + * Copyright 2021-2023 by Contributors + * \file row_set.h + * \brief Quick Utility to compute subset of rows + * \author Philip Cho, Tianqi Chen + */ +#ifndef XGBOOST_COMMON_PARTITION_BUILDER_H_ +#define XGBOOST_COMMON_PARTITION_BUILDER_H_ + +#include + +#include +#include // for size_t +#include +#include +#include +#include + +#include "../tree/hist/expand_entry.h" +#include "categorical.h" +#include "column_matrix.h" +#include "xgboost/context.h" +#include "xgboost/tree_model.h" + +namespace xgboost::common { +// The builder is required for samples partition to left and rights children for set of nodes +// Responsible for: +// 1) Effective memory allocation for intermediate results for multi-thread work +// 2) Merging partial results produced by threads into original row set (row_set_collection_) +// BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature +template +class PartitionBuilder { + using BitVector = RBitField8; + + public: + template + void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) { + left_right_nodes_sizes_.resize(n_nodes); + blocks_offsets_.resize(n_nodes+1); + + blocks_offsets_[0] = 0; + for (size_t i = 1; i < n_nodes+1; ++i) { + blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTask(i-1); + } + + if (n_tasks > max_n_tasks_) { + mem_blocks_.resize(n_tasks); + max_n_tasks_ = n_tasks; + } + } + + // split row indexes (rid_span) to 2 parts (left_part, right_part) depending + // on comparison of indexes values (idx_span) and split point (split_cond) + // Handle dense columns + // Analog of std::stable_partition, but in no-inplace manner + template + inline std::pair PartitionKernel(ColumnType* p_column, + common::Span row_indices, + common::Span left_part, + common::Span right_part, + size_t base_rowid, Predicate&& pred) { + auto& column = *p_column; + size_t* p_left_part = left_part.data(); + size_t* p_right_part = right_part.data(); + size_t nleft_elems = 0; + size_t 
nright_elems = 0; + + auto p_row_indices = row_indices.data(); + auto n_samples = row_indices.size(); + + for (size_t i = 0; i < n_samples; ++i) { + auto rid = p_row_indices[i]; + const int32_t bin_id = column[rid - base_rowid]; + if (any_missing && bin_id == ColumnType::kMissingId) { + if (default_left) { + p_left_part[nleft_elems++] = rid; + } else { + p_right_part[nright_elems++] = rid; + } + } else { + if (pred(rid, bin_id)) { + p_left_part[nleft_elems++] = rid; + } else { + p_right_part[nright_elems++] = rid; + } + } + } + + return {nleft_elems, nright_elems}; + } + + template + inline std::pair PartitionRangeKernel(common::Span ridx, + common::Span left_part, + common::Span right_part, + Pred pred) { + size_t* p_left_part = left_part.data(); + size_t* p_right_part = right_part.data(); + size_t nleft_elems = 0; + size_t nright_elems = 0; + for (auto row_id : ridx) { + if (pred(row_id)) { + p_left_part[nleft_elems++] = row_id; + } else { + p_right_part[nright_elems++] = row_id; + } + } + return {nleft_elems, nright_elems}; + } + + template + void Partition(const size_t node_in_set, std::vector const& nodes, + const common::Range1d range, const bst_bin_t split_cond, + GHistIndexMatrix const& gmat, const common::ColumnMatrix& column_matrix, + const RegTree& tree, const size_t* rid) { + common::Span rid_span(rid + range.begin(), rid + range.end()); + common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); + common::Span right = GetRightBuffer(node_in_set, range.begin(), range.end()); + std::size_t nid = nodes[node_in_set].nid; + bst_feature_t fid = tree.SplitIndex(nid); + bool default_left = tree.DefaultLeft(nid); + bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical; + auto node_cats = tree.NodeCats(nid); + auto const& cut_values = gmat.cut.Values(); + + auto pred_hist = [&](auto ridx, auto bin_id) { + if (any_cat && is_cat) { + auto gidx = gmat.GetGindex(ridx, fid); + bool go_left = default_left; + if (gidx > -1) { + go_left = Decision(node_cats, cut_values[gidx]); + } + return go_left; + } else { + return bin_id <= split_cond; + } + }; + + auto pred_approx = [&](auto ridx) { + auto gidx = gmat.GetGindex(ridx, fid); + bool go_left = default_left; + if (gidx > -1) { + if (is_cat) { + go_left = Decision(node_cats, cut_values[gidx]); + } else { + go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value; + } + } + return go_left; + }; + + std::pair child_nodes_sizes; + if (!column_matrix.IsInitialized()) { + child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx); + } else { + if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) { + auto column = column_matrix.DenseColumn(fid); + if (default_left) { + child_nodes_sizes = PartitionKernel(&column, rid_span, left, right, + gmat.base_rowid, pred_hist); + } else { + child_nodes_sizes = PartitionKernel(&column, rid_span, left, right, + gmat.base_rowid, pred_hist); + } + } else { + CHECK_EQ(any_missing, true); + auto column = + column_matrix.SparseColumn(fid, rid_span.front() - gmat.base_rowid); + if (default_left) { + child_nodes_sizes = PartitionKernel(&column, rid_span, left, right, + gmat.base_rowid, pred_hist); + } else { + child_nodes_sizes = PartitionKernel(&column, rid_span, left, right, + gmat.base_rowid, pred_hist); + } + } + } + + const size_t n_left = child_nodes_sizes.first; + const size_t n_right = child_nodes_sizes.second; + + SetNLeftElems(node_in_set, range.begin(), n_left); + SetNRightElems(node_in_set, range.begin(), n_right); + } + + /** + * 
@brief When data is split by column, we don't have all the features locally on the current + * worker, so we go through all the rows and mark the bit vectors on whether the decision is made + * to go right, or if the feature value used for the split is missing. + */ + template + void MaskRows(const size_t node_in_set, std::vector const& nodes, + const common::Range1d range, GHistIndexMatrix const& gmat, + const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid, + BitVector* decision_bits, BitVector* missing_bits) { + common::Span rid_span(rid + range.begin(), rid + range.end()); + std::size_t nid = nodes[node_in_set].nid; + bst_feature_t fid = tree[nid].SplitIndex(); + bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical; + auto node_cats = tree.NodeCats(nid); + auto const& cut_values = gmat.cut.Values(); + + if (!column_matrix.IsInitialized()) { + for (auto row_id : rid_span) { + auto gidx = gmat.GetGindex(row_id, fid); + if (gidx > -1) { + bool go_left = false; + if (is_cat) { + go_left = Decision(node_cats, cut_values[gidx]); + } else { + go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value; + } + if (go_left) { + decision_bits->Set(row_id - gmat.base_rowid); + } + } else { + missing_bits->Set(row_id - gmat.base_rowid); + } + } + } else { + LOG(FATAL) << "Column data split is only supported for the `approx` tree method"; + } + } + + /** + * @brief Once we've aggregated the decision and missing bits from all the workers, we can then + * use them to partition the rows accordingly. + */ + template + void PartitionByMask(const size_t node_in_set, std::vector const& nodes, + const common::Range1d range, GHistIndexMatrix const& gmat, + const common::ColumnMatrix& column_matrix, const RegTree& tree, + const size_t* rid, BitVector const& decision_bits, + BitVector const& missing_bits) { + common::Span rid_span(rid + range.begin(), rid + range.end()); + common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); + common::Span right = GetRightBuffer(node_in_set, range.begin(), range.end()); + std::size_t nid = nodes[node_in_set].nid; + bool default_left = tree[nid].DefaultLeft(); + + auto pred_approx = [&](auto ridx) { + bool go_left = default_left; + bool is_missing = missing_bits.Check(ridx - gmat.base_rowid); + if (!is_missing) { + go_left = decision_bits.Check(ridx - gmat.base_rowid); + } + return go_left; + }; + + std::pair child_nodes_sizes; + if (!column_matrix.IsInitialized()) { + child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx); + } else { + LOG(FATAL) << "Column data split is only supported for the `approx` tree method"; + } + + const size_t n_left = child_nodes_sizes.first; + const size_t n_right = child_nodes_sizes.second; + + SetNLeftElems(node_in_set, range.begin(), n_left); + SetNRightElems(node_in_set, range.begin(), n_right); + } + + // allocate thread local memory, should be called for each specific task + void AllocateForTask(size_t id) { + if (mem_blocks_[id].get() == nullptr) { + BlockInfo* local_block_ptr = new BlockInfo; + CHECK_NE(local_block_ptr, (BlockInfo*)nullptr); + mem_blocks_[id].reset(local_block_ptr); + } + } + + common::Span GetLeftBuffer(int nid, size_t begin, size_t end) { + const size_t task_idx = GetTaskIdx(nid, begin); + return { mem_blocks_.at(task_idx)->Left(), end - begin }; + } + + common::Span GetRightBuffer(int nid, size_t begin, size_t end) { + const size_t task_idx = GetTaskIdx(nid, begin); + return { mem_blocks_.at(task_idx)->Right(), end - begin }; 
+ } + + void SetNLeftElems(int nid, size_t begin, size_t n_left) { + size_t task_idx = GetTaskIdx(nid, begin); + mem_blocks_.at(task_idx)->n_left = n_left; + } + + void SetNRightElems(int nid, size_t begin, size_t n_right) { + size_t task_idx = GetTaskIdx(nid, begin); + mem_blocks_.at(task_idx)->n_right = n_right; + } + + + [[nodiscard]] std::size_t GetNLeftElems(int nid) const { + return left_right_nodes_sizes_[nid].first; + } + + [[nodiscard]] std::size_t GetNRightElems(int nid) const { + return left_right_nodes_sizes_[nid].second; + } + + // Each thread has partial results for some set of tree-nodes + // The function decides order of merging partial results into final row set + void CalculateRowOffsets() { + for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) { + size_t n_left = 0; + for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) { + mem_blocks_[j]->n_offset_left = n_left; + n_left += mem_blocks_[j]->n_left; + } + size_t n_right = 0; + for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) { + mem_blocks_[j]->n_offset_right = n_left + n_right; + n_right += mem_blocks_[j]->n_right; + } + left_right_nodes_sizes_[i] = {n_left, n_right}; + } + } + + void MergeToArray(int nid, size_t begin, size_t* rows_indexes) { + size_t task_idx = GetTaskIdx(nid, begin); + + size_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left; + size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right; + + const size_t* left = mem_blocks_[task_idx]->Left(); + const size_t* right = mem_blocks_[task_idx]->Right(); + + std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result); + std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result); + } + + size_t GetTaskIdx(int nid, size_t begin) { + return blocks_offsets_[nid] + begin / BlockSize; + } + + // Copy row partitions into global cache for reuse in objective + template + void LeafPartition(Context const* ctx, RegTree const& tree, RowSetCollection const& row_set, + std::vector* p_position, Sampledp sampledp) const { + auto& h_pos = *p_position; + h_pos.resize(row_set.Data()->size(), std::numeric_limits::max()); + + auto p_begin = row_set.Data()->data(); + ParallelFor(row_set.Size(), ctx->Threads(), [&](size_t i) { + auto const& node = row_set[i]; + if (node.node_id < 0) { + return; + } + CHECK(tree.IsLeaf(node.node_id)); + if (node.begin) { // guard for empty node. + size_t ptr_offset = node.end - p_begin; + CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id; + for (auto idx = node.begin; idx != node.end; ++idx) { + h_pos[*idx] = sampledp(*idx) ? 
~node.node_id : node.node_id; + } + } + }); + } + + protected: + struct BlockInfo{ + size_t n_left; + size_t n_right; + + size_t n_offset_left; + size_t n_offset_right; + + size_t* Left() { + return &left_data_[0]; + } + + size_t* Right() { + return &right_data_[0]; + } + private: + size_t left_data_[BlockSize]; + size_t right_data_[BlockSize]; + }; + std::vector> left_right_nodes_sizes_; + std::vector blocks_offsets_; + std::vector> mem_blocks_; + size_t max_n_tasks_ = 0; +}; +} // namespace xgboost::common +#endif // XGBOOST_COMMON_PARTITION_BUILDER_H_ diff --git a/src/common/quantile.cc b/src/common/quantile.cc index 87eb0ec20..aaf271934 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -359,6 +359,7 @@ void AddCutPoint(typename SketchType::SummaryContainer const &summary, int max_b HistogramCuts *cuts) { size_t required_cuts = std::min(summary.size, static_cast(max_bin)); auto &cut_values = cuts->cut_values_.HostVector(); + // we use the min_value as the first (0th) element, hence starting from 1. for (size_t i = 1; i < required_cuts; ++i) { bst_float cpt = summary.data[i].value; if (i == 1 || cpt > cut_values.back()) { @@ -419,8 +420,8 @@ void SketchContainerImpl::MakeCuts(HistogramCuts* cuts) { } else { AddCutPoint(a, max_num_bins, cuts); // push a value that is greater than anything - const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value - : cuts->min_vals_.HostVector()[fid]; + const bst_float cpt = + (a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid]; // this must be bigger than last value in a scale const bst_float last = cpt + (fabs(cpt) + 1e-5f); cuts->cut_values_.HostVector().push_back(last); diff --git a/src/common/quantile.h b/src/common/quantile.h index c8dcf6ada..a19b4bbb0 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -352,19 +352,6 @@ struct WQSummary { prev_rmax = data[i].rmax; } } - // check consistency of the summary - inline bool Check(const char *msg) const { - const float tol = 10.0f; - for (size_t i = 0; i < this->size; ++i) { - if (data[i].rmin + data[i].wmin > data[i].rmax + tol || - data[i].rmin < -1e-6f || data[i].rmax < -1e-6f) { - LOG(INFO) << "---------- WQSummary::Check did not pass ----------"; - this->Print(); - return false; - } - } - return true; - } }; /*! 
\brief try to do efficient pruning */ diff --git a/src/common/ranking_utils.cc b/src/common/ranking_utils.cc index 8fad9a206..d831b551c 100644 --- a/src/common/ranking_utils.cc +++ b/src/common/ranking_utils.cc @@ -6,9 +6,7 @@ #include // for copy_n, max, min, none_of, all_of #include // for size_t #include // for sscanf -#include // for exception #include // for greater -#include // for reverse_iterator #include // for char_traits, string #include "algorithm.h" // for ArgSort @@ -18,12 +16,113 @@ #include "xgboost/base.h" // for bst_group_t #include "xgboost/context.h" // for Context #include "xgboost/data.h" // for MetaInfo -#include "xgboost/linalg.h" // for All, TensorView, Range, Tensor, Vector -#include "xgboost/logging.h" // for Error, LogCheck_EQ, CHECK_EQ +#include "xgboost/linalg.h" // for All, TensorView, Range +#include "xgboost/logging.h" // for CHECK_EQ namespace xgboost::ltr { +void RankingCache::InitOnCPU(Context const* ctx, MetaInfo const& info) { + if (info.group_ptr_.empty()) { + group_ptr_.Resize(2, 0); + group_ptr_.HostVector()[1] = info.num_row_; + } else { + group_ptr_.HostVector() = info.group_ptr_; + } + + auto const& gptr = group_ptr_.ConstHostVector(); + for (std::size_t i = 1; i < gptr.size(); ++i) { + std::size_t n = gptr[i] - gptr[i - 1]; + max_group_size_ = std::max(max_group_size_, n); + } + + double sum_weights = 0; + auto n_groups = Groups(); + auto weight = common::MakeOptionalWeights(ctx, info.weights_); + for (bst_omp_uint k = 0; k < n_groups; ++k) { + sum_weights += weight[k]; + } + weight_norm_ = static_cast(n_groups) / sum_weights; +} + +common::Span RankingCache::MakeRankOnCPU(Context const* ctx, + common::Span predt) { + auto gptr = this->DataGroupPtr(ctx); + auto rank = this->sorted_idx_cache_.HostSpan(); + CHECK_EQ(rank.size(), predt.size()); + + common::ParallelFor(this->Groups(), ctx->Threads(), [&](auto g) { + auto cnt = gptr[g + 1] - gptr[g]; + auto g_predt = predt.subspan(gptr[g], cnt); + auto g_rank = rank.subspan(gptr[g], cnt); + auto sorted_idx = common::ArgSort( + ctx, g_predt.data(), g_predt.data() + g_predt.size(), std::greater<>{}); + CHECK_EQ(g_rank.size(), sorted_idx.size()); + std::copy_n(sorted_idx.data(), sorted_idx.size(), g_rank.data()); + }); + + return rank; +} + +#if !defined(XGBOOST_USE_CUDA) +void RankingCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); } +common::Span RankingCache::MakeRankOnCUDA(Context const*, + common::Span) { + common::AssertGPUSupport(); + return {}; +} +#endif // !defined() + +void NDCGCache::InitOnCPU(Context const* ctx, MetaInfo const& info) { + auto const h_group_ptr = this->DataGroupPtr(ctx); + + discounts_.Resize(MaxGroupSize(), 0); + auto& h_discounts = discounts_.HostVector(); + for (std::size_t i = 0; i < MaxGroupSize(); ++i) { + h_discounts[i] = CalcDCGDiscount(i); + } + + auto n_groups = h_group_ptr.size() - 1; + auto h_labels = info.labels.HostView().Slice(linalg::All(), 0); + + CheckNDCGLabels(this->Param(), h_labels, + [](auto beg, auto end, auto op) { return std::none_of(beg, end, op); }); + + inv_idcg_.Reshape(n_groups); + auto h_inv_idcg = inv_idcg_.HostView(); + std::size_t topk = this->Param().TopK(); + auto const exp_gain = this->Param().ndcg_exp_gain; + + common::ParallelFor(n_groups, ctx->Threads(), [&](auto g) { + auto g_labels = h_labels.Slice(linalg::Range(h_group_ptr[g], h_group_ptr[g + 1])); + auto sorted_idx = common::ArgSort(ctx, linalg::cbegin(g_labels), + linalg::cend(g_labels), std::greater<>{}); + + double idcg{0.0}; + for (std::size_t i 
= 0; i < std::min(g_labels.Size(), topk); ++i) { + if (exp_gain) { + idcg += h_discounts[i] * CalcDCGGain(g_labels(sorted_idx[i])); + } else { + idcg += h_discounts[i] * g_labels(sorted_idx[i]); + } + } + h_inv_idcg(g) = CalcInvIDCG(idcg); + }); +} + +#if !defined(XGBOOST_USE_CUDA) +void NDCGCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); } +#endif // !defined(XGBOOST_USE_CUDA) + DMLC_REGISTER_PARAMETER(LambdaRankParam); +void MAPCache::InitOnCPU(Context const*, MetaInfo const& info) { + auto const& h_label = info.labels.HostView().Slice(linalg::All(), 0); + CheckMapLabels(h_label, [](auto beg, auto end, auto op) { return std::all_of(beg, end, op); }); +} + +#if !defined(XGBOOST_USE_CUDA) +void MAPCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); } +#endif // !defined(XGBOOST_USE_CUDA) + std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus) { std::string out_name; if (!param.empty()) { diff --git a/src/common/ranking_utils.cu b/src/common/ranking_utils.cu new file mode 100644 index 000000000..8fbf89818 --- /dev/null +++ b/src/common/ranking_utils.cu @@ -0,0 +1,212 @@ +/** + * Copyright 2023 by XGBoost Contributors + */ +#include // for maximum +#include // for make_counting_iterator +#include // for none_of, all_of +#include // for pair, make_pair +#include // for reduce +#include // for inclusive_scan + +#include // for size_t + +#include "algorithm.cuh" // for SegmentedArgSort +#include "cuda_context.cuh" // for CUDAContext +#include "device_helpers.cuh" // for MakeTransformIterator, LaunchN +#include "optional_weight.h" // for MakeOptionalWeights, OptionalWeights +#include "ranking_utils.cuh" // for ThreadsForMean +#include "ranking_utils.h" +#include "threading_utils.cuh" // for SegmentedTrapezoidThreads +#include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t +#include "xgboost/context.h" // for Context +#include "xgboost/linalg.h" // for VectorView, All, Range +#include "xgboost/logging.h" // for CHECK +#include "xgboost/span.h" // for Span + +namespace xgboost::ltr { +namespace cuda_impl { +void CalcQueriesDCG(Context const* ctx, linalg::VectorView d_labels, + common::Span d_sorted_idx, bool exp_gain, + common::Span d_group_ptr, std::size_t k, + linalg::VectorView out_dcg) { + CHECK_EQ(d_group_ptr.size() - 1, out_dcg.Size()); + using IdxGroup = thrust::pair; + auto group_it = dh::MakeTransformIterator( + thrust::make_counting_iterator(0ull), [=] XGBOOST_DEVICE(std::size_t idx) { + return thrust::make_pair(idx, dh::SegmentId(d_group_ptr, idx)); // NOLINT + }); + auto value_it = dh::MakeTransformIterator( + group_it, + [exp_gain, d_labels, d_group_ptr, k, + d_sorted_idx] XGBOOST_DEVICE(IdxGroup const& l) -> double { + auto g_begin = d_group_ptr[l.second]; + auto g_size = d_group_ptr[l.second + 1] - g_begin; + + auto idx_in_group = l.first - g_begin; + if (idx_in_group >= k) { + return 0.0; + } + double gain{0.0}; + auto g_sorted_idx = d_sorted_idx.subspan(g_begin, g_size); + auto g_labels = d_labels.Slice(linalg::Range(g_begin, g_begin + g_size)); + + if (exp_gain) { + gain = ltr::CalcDCGGain(g_labels(g_sorted_idx[idx_in_group])); + } else { + gain = g_labels(g_sorted_idx[idx_in_group]); + } + double discount = CalcDCGDiscount(idx_in_group); + return gain * discount; + }); + + CHECK(out_dcg.Contiguous()); + std::size_t bytes; + cub::DeviceSegmentedReduce::Sum(nullptr, bytes, value_it, out_dcg.Values().data(), + d_group_ptr.size() - 1, d_group_ptr.data(), + d_group_ptr.data() + 
1, ctx->CUDACtx()->Stream()); + dh::TemporaryArray temp(bytes); + cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, value_it, out_dcg.Values().data(), + d_group_ptr.size() - 1, d_group_ptr.data(), + d_group_ptr.data() + 1, ctx->CUDACtx()->Stream()); +} + +void CalcQueriesInvIDCG(Context const* ctx, linalg::VectorView d_labels, + common::Span d_group_ptr, + linalg::VectorView out_inv_IDCG, ltr::LambdaRankParam const& p) { + CHECK_GE(d_group_ptr.size(), 2ul); + size_t n_groups = d_group_ptr.size() - 1; + CHECK_EQ(out_inv_IDCG.Size(), n_groups); + dh::device_vector sorted_idx(d_labels.Size()); + auto d_sorted_idx = dh::ToSpan(sorted_idx); + common::SegmentedArgSort(ctx, d_labels.Values(), d_group_ptr, d_sorted_idx); + CalcQueriesDCG(ctx, d_labels, d_sorted_idx, p.ndcg_exp_gain, d_group_ptr, p.TopK(), out_inv_IDCG); + dh::LaunchN(out_inv_IDCG.Size(), ctx->CUDACtx()->Stream(), + [out_inv_IDCG] XGBOOST_DEVICE(size_t idx) mutable { + double idcg = out_inv_IDCG(idx); + out_inv_IDCG(idx) = CalcInvIDCG(idcg); + }); +} +} // namespace cuda_impl + +namespace { +struct CheckNDCGOp { + CUDAContext const* cuctx; + template + bool operator()(It beg, It end, Op op) { + return thrust::none_of(cuctx->CTP(), beg, end, op); + } +}; +struct CheckMAPOp { + CUDAContext const* cuctx; + template + bool operator()(It beg, It end, Op op) { + return thrust::all_of(cuctx->CTP(), beg, end, op); + } +}; + +struct ThreadGroupOp { + common::Span d_group_ptr; + std::size_t n_pairs; + + common::Span out_thread_group_ptr; + + XGBOOST_DEVICE void operator()(std::size_t i) { + out_thread_group_ptr[i + 1] = + cuda_impl::ThreadsForMean(d_group_ptr[i + 1] - d_group_ptr[i], n_pairs); + } +}; + +struct GroupSizeOp { + common::Span d_group_ptr; + + XGBOOST_DEVICE auto operator()(std::size_t i) -> std::size_t { + return d_group_ptr[i + 1] - d_group_ptr[i]; + } +}; + +struct WeightOp { + common::OptionalWeights d_weight; + XGBOOST_DEVICE auto operator()(std::size_t i) -> double { return d_weight[i]; } +}; +} // anonymous namespace + +void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) { + CUDAContext const* cuctx = ctx->CUDACtx(); + + group_ptr_.SetDevice(ctx->gpu_id); + if (info.group_ptr_.empty()) { + group_ptr_.Resize(2, 0); + group_ptr_.HostVector()[1] = info.num_row_; + } else { + auto const& h_group_ptr = info.group_ptr_; + group_ptr_.Resize(h_group_ptr.size()); + auto d_group_ptr = group_ptr_.DeviceSpan(); + dh::safe_cuda(cudaMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(), + cudaMemcpyHostToDevice, cuctx->Stream())); + } + + auto d_group_ptr = DataGroupPtr(ctx); + std::size_t n_groups = Groups(); + + auto it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), + GroupSizeOp{d_group_ptr}); + max_group_size_ = + thrust::reduce(cuctx->CTP(), it, it + n_groups, 0ul, thrust::maximum{}); + + threads_group_ptr_.SetDevice(ctx->gpu_id); + threads_group_ptr_.Resize(n_groups + 1, 0); + auto d_threads_group_ptr = threads_group_ptr_.DeviceSpan(); + if (param_.HasTruncation()) { + n_cuda_threads_ = + common::SegmentedTrapezoidThreads(d_group_ptr, d_threads_group_ptr, Param().NumPair()); + } else { + auto n_pairs = Param().NumPair(); + dh::LaunchN(n_groups, cuctx->Stream(), + ThreadGroupOp{d_group_ptr, n_pairs, d_threads_group_ptr}); + thrust::inclusive_scan(cuctx->CTP(), dh::tcbegin(d_threads_group_ptr), + dh::tcend(d_threads_group_ptr), dh::tbegin(d_threads_group_ptr)); + n_cuda_threads_ = info.num_row_ * param_.NumPair(); + } + + sorted_idx_cache_.SetDevice(ctx->gpu_id); 
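  // The cache below is sized once to the full label vector and reused by
  // every SortedIdx() call; weight_norm_ is then computed so that the mean
  // group weight becomes 1 (n_groups / sum of weights), matching InitOnCPU.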
+ sorted_idx_cache_.Resize(info.labels.Size(), 0); + + auto weight = common::MakeOptionalWeights(ctx, info.weights_); + auto w_it = + dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), WeightOp{weight}); + weight_norm_ = static_cast(n_groups) / thrust::reduce(w_it, w_it + n_groups); +} + +common::Span RankingCache::MakeRankOnCUDA(Context const* ctx, + common::Span predt) { + auto d_sorted_idx = sorted_idx_cache_.DeviceSpan(); + auto d_group_ptr = DataGroupPtr(ctx); + common::SegmentedArgSort(ctx, predt, d_group_ptr, d_sorted_idx); + return d_sorted_idx; +} + +void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) { + CUDAContext const* cuctx = ctx->CUDACtx(); + auto labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0); + CheckNDCGLabels(this->Param(), labels, CheckNDCGOp{cuctx}); + + auto d_group_ptr = this->DataGroupPtr(ctx); + + std::size_t n_groups = d_group_ptr.size() - 1; + inv_idcg_ = linalg::Zeros(ctx, n_groups); + auto d_inv_idcg = inv_idcg_.View(ctx->gpu_id); + cuda_impl::CalcQueriesInvIDCG(ctx, labels, d_group_ptr, d_inv_idcg, this->Param()); + CHECK_GE(this->Param().NumPair(), 1ul); + + discounts_.SetDevice(ctx->gpu_id); + discounts_.Resize(MaxGroupSize()); + auto d_discount = discounts_.DeviceSpan(); + dh::LaunchN(MaxGroupSize(), cuctx->Stream(), + [=] XGBOOST_DEVICE(std::size_t i) { d_discount[i] = CalcDCGDiscount(i); }); +} + +void MAPCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) { + auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0); + CheckMapLabels(d_label, CheckMAPOp{ctx->CUDACtx()}); +} +} // namespace xgboost::ltr diff --git a/src/common/ranking_utils.cuh b/src/common/ranking_utils.cuh new file mode 100644 index 000000000..297f5157e --- /dev/null +++ b/src/common/ranking_utils.cuh @@ -0,0 +1,40 @@ +/** + * Copyright 2023 by XGBoost Contributors + */ +#ifndef XGBOOST_COMMON_RANKING_UTILS_CUH_ +#define XGBOOST_COMMON_RANKING_UTILS_CUH_ + +#include // for size_t + +#include "ranking_utils.h" // for LambdaRankParam +#include "xgboost/base.h" // for bst_group_t, XGBOOST_DEVICE +#include "xgboost/context.h" // for Context +#include "xgboost/linalg.h" // for VectorView +#include "xgboost/span.h" // for Span + +namespace xgboost { +namespace ltr { +namespace cuda_impl { +void CalcQueriesDCG(Context const *ctx, linalg::VectorView d_labels, + common::Span d_sorted_idx, bool exp_gain, + common::Span d_group_ptr, std::size_t k, + linalg::VectorView out_dcg); + +void CalcQueriesInvIDCG(Context const *ctx, linalg::VectorView d_labels, + common::Span d_group_ptr, + linalg::VectorView out_inv_IDCG, ltr::LambdaRankParam const &p); + +// Functions for creating number of threads for CUDA, and getting back the number of pairs +// from the number of threads. 
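// For example (an illustration derived from the definitions below): a query
// group of 100 documents with n_pairs = 2 is assigned
// ThreadsForMean(100, 2) == 200 CUDA threads for mean pairing, and
// PairsForGroup(200, 100) == 2 recovers the pair count per document.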
+XGBOOST_DEVICE __forceinline__ std::size_t ThreadsForMean(std::size_t group_size, + std::size_t n_pairs) { + return group_size * n_pairs; +} +XGBOOST_DEVICE __forceinline__ std::size_t PairsForGroup(std::size_t n_threads, + std::size_t group_size) { + return n_threads / group_size; +} +} // namespace cuda_impl +} // namespace ltr +} // namespace xgboost +#endif // XGBOOST_COMMON_RANKING_UTILS_CUH_ diff --git a/src/common/ranking_utils.h b/src/common/ranking_utils.h index 631de4d70..727f918f2 100644 --- a/src/common/ranking_utils.h +++ b/src/common/ranking_utils.h @@ -11,7 +11,6 @@ #include // for char_traits, string #include // for vector -#include "./math.h" // for CloseTo #include "dmlc/parameter.h" // for FieldEntry, DMLC_DECLARE_FIELD #include "error_msg.h" // for GroupWeight, GroupSize #include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t @@ -19,7 +18,7 @@ #include "xgboost/data.h" // for MetaInfo #include "xgboost/host_device_vector.h" // for HostDeviceVector #include "xgboost/linalg.h" // for Vector, VectorView, Tensor -#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK +#include "xgboost/logging.h" // for CHECK_EQ, CHECK #include "xgboost/parameter.h" // for XGBoostParameter #include "xgboost/span.h" // for Span #include "xgboost/string_view.h" // for StringView @@ -34,6 +33,25 @@ using rel_degree_t = std::uint32_t; // NOLINT */ using position_t = std::uint32_t; // NOLINT +/** + * \brief Maximum relevance degree for NDCG + */ +constexpr std::size_t MaxRel() { return sizeof(rel_degree_t) * 8 - 1; } +static_assert(MaxRel() == 31); + +XGBOOST_DEVICE inline double CalcDCGGain(rel_degree_t label) { + return static_cast((1u << label) - 1); +} + +XGBOOST_DEVICE inline double CalcDCGDiscount(std::size_t idx) { + return 1.0 / std::log2(static_cast(idx) + 2.0); +} + +XGBOOST_DEVICE inline double CalcInvIDCG(double idcg) { + auto inv_idcg = (idcg == 0.0 ? 0.0 : (1.0 / idcg)); // handle irrelevant document + return inv_idcg; +} + enum class PairMethod : std::int32_t { kTopK = 0, kMean = 1, @@ -115,7 +133,7 @@ struct LambdaRankParam : public XGBoostParameter { .describe("Number of pairs for each sample in the list."); DMLC_DECLARE_FIELD(lambdarank_unbiased) .set_default(false) - .describe("Unbiased lambda mart. Use IPW to debias click position"); + .describe("Unbiased lambda mart. Use extended IPW to debias click position"); DMLC_DECLARE_FIELD(lambdarank_bias_norm) .set_default(2.0) .set_lower_bound(0.0) @@ -126,6 +144,285 @@ struct LambdaRankParam : public XGBoostParameter { } }; +/** + * \brief Common cached items for ranking tasks. + */ +class RankingCache { + private: + void InitOnCPU(Context const* ctx, MetaInfo const& info); + void InitOnCUDA(Context const* ctx, MetaInfo const& info); + // Cached parameter + LambdaRankParam param_; + // offset to data groups. + HostDeviceVector group_ptr_; + // store the sorted index of prediction. + HostDeviceVector sorted_idx_cache_; + // Maximum size of group + std::size_t max_group_size_{0}; + // Normalization for weight + double weight_norm_{1.0}; + /** + * CUDA cache + */ + // offset to threads assigned to each group for gradient calculation + HostDeviceVector threads_group_ptr_; + // Sorted index of label for finding buckets. 
+ HostDeviceVector y_sorted_idx_cache_; + // Cached labels sorted by the model + HostDeviceVector y_ranked_by_model_; + // store rounding factor for objective for each group + linalg::Vector roundings_; + // rounding factor for cost + HostDeviceVector cost_rounding_; + // temporary storage for creating rounding factors. Stored as byte to avoid having cuda + // data structure in here. + HostDeviceVector max_lambdas_; + // total number of cuda threads used for gradient calculation + std::size_t n_cuda_threads_{0}; + + // Create model rank list on GPU + common::Span MakeRankOnCUDA(Context const* ctx, + common::Span predt); + // Create model rank list on CPU + common::Span MakeRankOnCPU(Context const* ctx, + common::Span predt); + + protected: + [[nodiscard]] std::size_t MaxGroupSize() const { return max_group_size_; } + + public: + RankingCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p) : param_{p} { + CHECK(param_.GetInitialised()); + if (!info.group_ptr_.empty()) { + CHECK_EQ(info.group_ptr_.back(), info.labels.Size()) + << error::GroupSize() << "the size of label."; + } + if (ctx->IsCPU()) { + this->InitOnCPU(ctx, info); + } else { + this->InitOnCUDA(ctx, info); + } + if (!info.weights_.Empty()) { + CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight(); + } + } + [[nodiscard]] std::size_t MaxPositionSize() const { + // Use truncation level as bound. + if (param_.HasTruncation()) { + return param_.NumPair(); + } + // Hardcoded maximum size of positions to track. We don't need too many of them as the + // bias decreases exponentially. + return std::min(max_group_size_, static_cast(32)); + } + // Constructed as [1, n_samples] if group ptr is not supplied by the user + common::Span DataGroupPtr(Context const* ctx) const { + group_ptr_.SetDevice(ctx->gpu_id); + return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan(); + } + + [[nodiscard]] auto const& Param() const { return param_; } + [[nodiscard]] std::size_t Groups() const { return group_ptr_.Size() - 1; } + [[nodiscard]] double WeightNorm() const { return weight_norm_; } + + // Create a rank list by model prediction + common::Span SortedIdx(Context const* ctx, common::Span predt) { + if (sorted_idx_cache_.Empty()) { + sorted_idx_cache_.SetDevice(ctx->gpu_id); + sorted_idx_cache_.Resize(predt.size()); + } + if (ctx->IsCPU()) { + return this->MakeRankOnCPU(ctx, predt); + } else { + return this->MakeRankOnCUDA(ctx, predt); + } + } + // The function simply returns a uninitialized buffer as this is only used by the + // objective for creating pairs. + common::Span SortedIdxY(Context const* ctx, std::size_t n_samples) { + CHECK(ctx->IsCUDA()); + if (y_sorted_idx_cache_.Empty()) { + y_sorted_idx_cache_.SetDevice(ctx->gpu_id); + y_sorted_idx_cache_.Resize(n_samples); + } + return y_sorted_idx_cache_.DeviceSpan(); + } + common::Span RankedY(Context const* ctx, std::size_t n_samples) { + CHECK(ctx->IsCUDA()); + if (y_ranked_by_model_.Empty()) { + y_ranked_by_model_.SetDevice(ctx->gpu_id); + y_ranked_by_model_.Resize(n_samples); + } + return y_ranked_by_model_.DeviceSpan(); + } + + // CUDA cache getters, the cache is shared between metric and objective, some of these + // fields are lazy initialized to avoid unnecessary allocation. 
+  [[nodiscard]] common::Span<std::size_t const> CUDAThreadsGroupPtr() const {
+    CHECK(!threads_group_ptr_.Empty());
+    return threads_group_ptr_.ConstDeviceSpan();
+  }
+  [[nodiscard]] std::size_t CUDAThreads() const { return n_cuda_threads_; }
+
+  linalg::VectorView<GradientPair> CUDARounding(Context const* ctx) {
+    if (roundings_.Size() == 0) {
+      roundings_.SetDevice(ctx->gpu_id);
+      roundings_.Reshape(Groups());
+    }
+    return roundings_.View(ctx->gpu_id);
+  }
+  common::Span<double> CUDACostRounding(Context const* ctx) {
+    if (cost_rounding_.Size() == 0) {
+      cost_rounding_.SetDevice(ctx->gpu_id);
+      cost_rounding_.Resize(1);
+    }
+    return cost_rounding_.DeviceSpan();
+  }
+  template <typename Type>
+  common::Span<Type> MaxLambdas(Context const* ctx, std::size_t n) {
+    max_lambdas_.SetDevice(ctx->gpu_id);
+    std::size_t bytes = n * sizeof(Type);
+    if (bytes != max_lambdas_.Size()) {
+      max_lambdas_.Resize(bytes);
+    }
+    return common::Span<Type>{reinterpret_cast<Type*>(max_lambdas_.DevicePointer()), n};
+  }
+};
+
+class NDCGCache : public RankingCache {
+  // NDCG discount
+  HostDeviceVector<double> discounts_;
+  // 1.0 / IDCG
+  linalg::Vector<double> inv_idcg_;
+  /**
+   * CUDA cache
+   */
+  // store the intermediate DCG calculation result for metric
+  linalg::Vector<double> dcg_;
+
+ public:
+  void InitOnCPU(Context const* ctx, MetaInfo const& info);
+  void InitOnCUDA(Context const* ctx, MetaInfo const& info);
+
+ public:
+  NDCGCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
+      : RankingCache{ctx, info, p} {
+    if (ctx->IsCPU()) {
+      this->InitOnCPU(ctx, info);
+    } else {
+      this->InitOnCUDA(ctx, info);
+    }
+  }
+
+  linalg::VectorView<double const> InvIDCG(Context const* ctx) const {
+    return inv_idcg_.View(ctx->gpu_id);
+  }
+  common::Span<double const> Discount(Context const* ctx) const {
+    return ctx->IsCPU() ? discounts_.ConstHostSpan() : discounts_.ConstDeviceSpan();
+  }
+  linalg::VectorView<double> Dcg(Context const* ctx) {
+    if (dcg_.Size() == 0) {
+      dcg_.SetDevice(ctx->gpu_id);
+      dcg_.Reshape(this->Groups());
+    }
+    return dcg_.View(ctx->gpu_id);
+  }
+};
+
+/**
+ * \brief Validate label for NDCG
+ *
+ * \tparam NoneOf Implementation of std::none_of. Specified as a parameter to reuse the
+ *                check for both CPU and GPU.
+ */
+template <typename NoneOf>
+void CheckNDCGLabels(ltr::LambdaRankParam const& p, linalg::VectorView<float const> labels,
+                     NoneOf none_of) {
+  auto d_labels = labels.Values();
+  if (p.ndcg_exp_gain) {
+    auto label_is_integer =
+        none_of(d_labels.data(), d_labels.data() + d_labels.size(), [] XGBOOST_DEVICE(float v) {
+          auto l = std::floor(v);
+          return std::fabs(l - v) > kRtEps || v < 0.0f;
+        });
+    CHECK(label_is_integer)
+        << "When using relevance degree as target, the label must be 0 or a positive integer.";
+  }
+
+  if (p.ndcg_exp_gain) {
+    auto label_is_valid = none_of(d_labels.data(), d_labels.data() + d_labels.size(),
+                                  [] XGBOOST_DEVICE(ltr::rel_degree_t v) { return v > MaxRel(); });
+    CHECK(label_is_valid) << "Relevance degrees must be less than or equal to " << MaxRel()
+                          << " when the exponential NDCG gain function is used. "
+                          << "Set `ndcg_exp_gain` to false to use custom DCG gain.";
+  }
+}
+
+template <typename AllOf>
+bool IsBinaryRel(linalg::VectorView<float const> label, AllOf all_of) {
+  auto s_label = label.Values();
+  return all_of(s_label.data(), s_label.data() + s_label.size(), [] XGBOOST_DEVICE(float y) {
+    return std::abs(y - 1.0f) < kRtEps || std::abs(y - 0.0f) < kRtEps;
+  });
+}
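// A worked example of the quantities MAPCache below caches per query (a
// sketch of the textbook formula, not the library's kernel): with binary
// labels ranked by model score, `n_rel_` counts relevant documents seen so
// far and `acc_` accumulates precision@k at each relevant hit.

#include <cstddef>
#include <vector>

inline double AveragePrecision(std::vector<int> const& sorted_labels) {
  double n_rel = 0.0;
  double acc = 0.0;
  for (std::size_t k = 0; k < sorted_labels.size(); ++k) {
    if (sorted_labels[k] == 1) {
      n_rel += 1.0;
      acc += n_rel / static_cast<double>(k + 1);  // precision at rank k + 1
    }
  }
  return n_rel > 0.0 ? acc / n_rel : 0.0;
}

// E.g. a ranked label list {1, 0, 1} yields (1/1 + 2/3) / 2 = 5/6.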
+ */ +template +void CheckMapLabels(linalg::VectorView label, AllOf all_of) { + auto s_label = label.Values(); + auto is_binary = IsBinaryRel(label, all_of); + CHECK(is_binary) << "MAP can only be used with binary labels."; +} + +class MAPCache : public RankingCache { + // Total number of relevant documents for each group + HostDeviceVector n_rel_; + // \sum l_k/k + HostDeviceVector acc_; + HostDeviceVector map_; + // Number of samples in this dataset. + std::size_t n_samples_{0}; + + void InitOnCPU(Context const* ctx, MetaInfo const& info); + void InitOnCUDA(Context const* ctx, MetaInfo const& info); + + public: + MAPCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p) + : RankingCache{ctx, info, p}, n_samples_{static_cast(info.num_row_)} { + if (ctx->IsCPU()) { + this->InitOnCPU(ctx, info); + } else { + this->InitOnCUDA(ctx, info); + } + } + + common::Span NumRelevant(Context const* ctx) { + if (n_rel_.Empty()) { + n_rel_.SetDevice(ctx->gpu_id); + n_rel_.Resize(n_samples_); + } + return ctx->IsCPU() ? n_rel_.HostSpan() : n_rel_.DeviceSpan(); + } + common::Span Acc(Context const* ctx) { + if (acc_.Empty()) { + acc_.SetDevice(ctx->gpu_id); + acc_.Resize(n_samples_); + } + return ctx->IsCPU() ? acc_.HostSpan() : acc_.DeviceSpan(); + } + common::Span Map(Context const* ctx) { + if (map_.Empty()) { + map_.SetDevice(ctx->gpu_id); + map_.Resize(this->Groups()); + } + return ctx->IsCPU() ? map_.HostSpan() : map_.DeviceSpan(); + } +}; + /** * \brief Parse name for ranking metric given parameters. * diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h index a52695e02..d80008cc0 100644 --- a/src/common/threading_utils.h +++ b/src/common/threading_utils.h @@ -8,9 +8,11 @@ #include #include -#include // std::int32_t +#include // for int32_t +#include // for malloc, free #include -#include // std::is_signed +#include // for bad_alloc +#include // for is_signed #include #include "xgboost/logging.h" @@ -266,7 +268,7 @@ class MemStackAllocator { if (MaxStackSize >= required_size_) { ptr_ = stack_mem_; } else { - ptr_ = reinterpret_cast(malloc(required_size_ * sizeof(T))); + ptr_ = reinterpret_cast(std::malloc(required_size_ * sizeof(T))); } if (!ptr_) { throw std::bad_alloc{}; @@ -278,7 +280,7 @@ class MemStackAllocator { ~MemStackAllocator() { if (required_size_ > MaxStackSize) { - free(ptr_); + std::free(ptr_); } } T& operator[](size_t i) { return ptr_[i]; } diff --git a/src/data/data.cc b/src/data/data.cc index b61534ce4..829c385b7 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -10,13 +10,16 @@ #include #include "../collective/communicator-inl.h" -#include "../common/algorithm.h" // StableSort -#include "../common/api_entry.h" // XGBAPIThreadLocalEntry +#include "../collective/communicator.h" +#include "../common/common.h" +#include "../common/algorithm.h" // for StableSort +#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry +#include "../common/error_msg.h" // for InfInData #include "../common/group_data.h" #include "../common/io.h" #include "../common/linalg_op.h" #include "../common/math.h" -#include "../common/numeric.h" // Iota +#include "../common/numeric.h" // for Iota #include "../common/threading_utils.h" #include "../common/version.h" #include "../data/adapter.h" @@ -700,6 +703,14 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col } } +void MetaInfo::SynchronizeNumberOfColumns() { + if (collective::IsFederated() && data_split_mode == DataSplitMode::kCol) { + collective::Allreduce(&num_col_, 1); + } 
@@ -867,7 +878,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
         dmlc::Parser<std::uint32_t>::Create(fname.c_str(), partid, npart, file_format.c_str()));
     data::FileAdapter adapter(parser.get());
     dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
-                           cache_file);
+                           cache_file, data_split_mode);
   } else {
     data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart),
                             file_format};
@@ -903,11 +914,6 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
     LOG(FATAL) << "Encountered parser error:\n" << e.what();
   }
 
-  /* sync up number of features after matrix loaded.
-   * partitioned data will fail the train/val validation check
-   * since partitioned data not knowing the real number of features. */
-  collective::Allreduce<collective::Operation::kMax>(&dmat->Info().num_col_, 1);
-
   if (need_split && data_split_mode == DataSplitMode::kCol) {
     if (!cache_file.empty()) {
       LOG(FATAL) << "Column-wise data split is not supported for external memory.";
     }
@@ -917,7 +923,6 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
     delete dmat;
     return sliced;
   } else {
-    dmat->Info().data_split_mode = data_split_mode;
     return dmat;
   }
 }
@@ -954,39 +959,49 @@ template DMatrix *DMatrix::Create
 template <typename AdapterT>
-DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, const std::string&) {
-  return new data::SimpleDMatrix(adapter, missing, nthread);
+DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, const std::string&,
+                         DataSplitMode data_split_mode) {
+  return new data::SimpleDMatrix(adapter, missing, nthread, data_split_mode);
 }
 
 template DMatrix* DMatrix::Create<data::DenseAdapter>(data::DenseAdapter* adapter, float missing,
                                                       std::int32_t nthread,
-                                                      const std::string& cache_prefix);
+                                                      const std::string& cache_prefix,
+                                                      DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::ArrayAdapter>(data::ArrayAdapter* adapter, float missing,
                                                       std::int32_t nthread,
-                                                      const std::string& cache_prefix);
+                                                      const std::string& cache_prefix,
+                                                      DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::CSRAdapter>(data::CSRAdapter* adapter, float missing,
                                                     std::int32_t nthread,
-                                                    const std::string& cache_prefix);
+                                                    const std::string& cache_prefix,
+                                                    DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::CSCAdapter>(data::CSCAdapter* adapter, float missing,
                                                     std::int32_t nthread,
-                                                    const std::string& cache_prefix);
+                                                    const std::string& cache_prefix,
+                                                    DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::DataTableAdapter>(data::DataTableAdapter* adapter,
                                                           float missing, std::int32_t nthread,
-                                                          const std::string& cache_prefix);
+                                                          const std::string& cache_prefix,
+                                                          DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::FileAdapter>(data::FileAdapter* adapter, float missing,
                                                      std::int32_t nthread,
-                                                     const std::string& cache_prefix);
+                                                     const std::string& cache_prefix,
+                                                     DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::CSRArrayAdapter>(data::CSRArrayAdapter* adapter,
                                                          float missing, std::int32_t nthread,
-                                                         const std::string& cache_prefix);
+                                                         const std::string& cache_prefix,
+                                                         DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::CSCArrayAdapter>(data::CSCArrayAdapter* adapter,
                                                          float missing, std::int32_t nthread,
-                                                         const std::string& cache_prefix);
+                                                         const std::string& cache_prefix,
+                                                         DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<
     data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>>(
     data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>*
adapter, - float missing, int nthread, const std::string& cache_prefix); + float missing, int nthread, const std::string& cache_prefix, DataSplitMode data_split_mode); template DMatrix* DMatrix::Create( - data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&); + data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&, + DataSplitMode data_split_mode); SparsePage SparsePage::GetTranspose(int num_columns, int32_t n_threads) const { SparsePage transpose; @@ -1048,6 +1063,13 @@ void SparsePage::SortIndices(int32_t n_threads) { }); } +void SparsePage::Reindex(uint64_t feature_offset, int32_t n_threads) { + auto& h_data = this->data.HostVector(); + common::ParallelFor(h_data.size(), n_threads, [&](auto i) { + h_data[i].index += feature_offset; + }); +} + void SparsePage::SortRows(int32_t n_threads) { auto& h_offset = this->offset.HostVector(); auto& h_data = this->data.HostVector(); @@ -1144,7 +1166,7 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread }); } exec.Rethrow(); - CHECK(valid) << "Input data contains `inf` or `nan`"; + CHECK(valid) << error::InfInData(); for (const auto & max : max_columns_vector) { max_columns = std::max(max_columns, max[0]); } diff --git a/src/data/data.cu b/src/data/data.cu index 08a4f05fd..fe6f8c8cf 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -208,17 +208,17 @@ void MetaInfo::SetInfoFromCUDA(Context const& ctx, StringView key, Json array) { template DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, - const std::string& cache_prefix) { + const std::string& cache_prefix, DataSplitMode data_split_mode) { CHECK_EQ(cache_prefix.size(), 0) << "Device memory construction is not currently supported with external " "memory."; - return new data::SimpleDMatrix(adapter, missing, nthread); + return new data::SimpleDMatrix(adapter, missing, nthread, data_split_mode); } template DMatrix* DMatrix::Create( data::CudfAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix); + const std::string& cache_prefix, DataSplitMode data_split_mode); template DMatrix* DMatrix::Create( data::CupyAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix); + const std::string& cache_prefix, DataSplitMode data_split_mode); } // namespace xgboost diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh index 5eeb5fd5c..97b1e8874 100644 --- a/src/data/device_adapter.cuh +++ b/src/data/device_adapter.cuh @@ -4,7 +4,10 @@ */ #ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_ #define XGBOOST_DATA_DEVICE_ADAPTER_H_ -#include // for size_t +#include // for make_counting_iterator +#include // for none_of + +#include // for size_t #include #include #include @@ -240,6 +243,20 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span offset, return row_stride; } + +/** + * \brief Check there's no inf in data. 
+ */
+template <typename AdapterBatchT>
+bool HasInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) {
+  auto counting = thrust::make_counting_iterator(0llu);
+  auto value_iter = dh::MakeTransformIterator<float>(
+      counting, [=] XGBOOST_DEVICE(std::size_t idx) { return batch.GetElement(idx).value; });
+  auto valid =
+      thrust::none_of(value_iter, value_iter + batch.Size(),
+                      [is_valid] XGBOOST_DEVICE(float v) { return is_valid(v) && std::isinf(v); });
+  return valid;
+}
 };  // namespace data
 }  // namespace xgboost
 #endif  // XGBOOST_DATA_DEVICE_ADAPTER_H_
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index fc46df4a7..c1a964348 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019-2022 XGBoost contributors
+/**
+ * Copyright 2019-2023 by XGBoost contributors
 */
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/transform_output_iterator.h>
@@ -9,7 +9,7 @@
 #include "../common/random.h"
 #include "../common/transform_iterator.h"  // MakeIndexTransformIter
 #include "./ellpack_page.cuh"
-#include "device_adapter.cuh"
+#include "device_adapter.cuh"  // for HasInfInData
 #include "gradient_index.h"
 #include "xgboost/data.h"
 
@@ -203,9 +203,8 @@ struct TupleScanOp {
 // Here the data is already correctly ordered and simply needs to be compacted
 // to remove missing data
 template <typename AdapterBatchT>
-void CopyDataToEllpack(const AdapterBatchT &batch,
-                       common::Span<FeatureType const> feature_types,
-                       EllpackPageImpl *dst, int device_idx, float missing) {
+void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType const> feature_types,
+                       EllpackPageImpl* dst, int device_idx, float missing) {
   // Some witchcraft happens here
   // The goal is to copy valid elements out of the input to an ELLPACK matrix
   // with a given row stride, using no extra working memory. Standard stream
   // compaction is adapted so each valid element lands at the
   // correct output position
   auto counting = thrust::make_counting_iterator(0llu);
   data::IsValidFunctor is_valid(missing);
+  bool valid = data::HasInfInData(batch, is_valid);
+  CHECK(valid) << error::InfInData();
+
   auto key_iter = dh::MakeTransformIterator<size_t>(
       counting,
       [=] __device__(size_t idx) {
@@ -255,9 +257,9 @@ void CopyDataToEllpack(const AdapterBatchT& batch,
       cub::DispatchScan<decltype(key_value_index_iter), decltype(out), TupleScanOp<Tuple>,
                         cub::NullType, int64_t>;
 #if THRUST_MAJOR_VERSION >= 2
-  DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out,
-                         TupleScanOp<Tuple>(), cub::NullType(), batch.Size(),
-                         nullptr);
+  dh::safe_cuda(DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out,
+                                       TupleScanOp<Tuple>(), cub::NullType(), batch.Size(),
+                                       nullptr));
 #else
   DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out,
                          TupleScanOp<Tuple>(), cub::NullType(), batch.Size(),
                          nullptr);
 #endif
   dh::TemporaryArray<char> temp_storage(temp_storage_bytes);
 #if THRUST_MAJOR_VERSION >= 2
-  DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes,
-                         key_value_index_iter, out, TupleScanOp<Tuple>(),
-                         cub::NullType(), batch.Size(), nullptr);
+  dh::safe_cuda(DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes,
+                                       key_value_index_iter, out, TupleScanOp<Tuple>(),
+                                       cub::NullType(), batch.Size(), nullptr));
 #else
   DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes,
                          key_value_index_iter, out, TupleScanOp<Tuple>(),
diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h
index 9eba9637f..3cb0709bd 100644
--- a/src/data/gradient_index.h
+++ b/src/data/gradient_index.h
@@ -1,21 +1,23 @@
-/*!
- * Copyright 2017-2022 by XGBoost Contributors +/** + * Copyright 2017-2023 by XGBoost Contributors * \brief Data type for fast histogram aggregation. */ #ifndef XGBOOST_DATA_GRADIENT_INDEX_H_ #define XGBOOST_DATA_GRADIENT_INDEX_H_ -#include // std::min -#include // std::uint32_t -#include // std::size_t +#include // for min +#include // for atomic +#include // for uint32_t +#include // for size_t #include #include #include "../common/categorical.h" +#include "../common/error_msg.h" // for InfInData #include "../common/hist_util.h" #include "../common/numeric.h" #include "../common/threading_utils.h" -#include "../common/transform_iterator.h" // common::MakeIndexTransformIter +#include "../common/transform_iterator.h" // for MakeIndexTransformIter #include "adapter.h" #include "proxy_dmatrix.h" #include "xgboost/base.h" @@ -62,6 +64,7 @@ class GHistIndexMatrix { BinIdxType* index_data = index_data_span.data(); auto const& ptrs = cut.Ptrs(); auto const& values = cut.Values(); + std::atomic valid{true}; common::ParallelFor(batch_size, batch_threads, [&](size_t i) { auto line = batch.GetLine(i); size_t ibegin = row_ptr[rbegin + i]; // index of first entry for current block @@ -70,6 +73,9 @@ class GHistIndexMatrix { for (size_t j = 0; j < line.Size(); ++j) { data::COOTuple elem = line.GetElement(j); if (is_valid(elem)) { + if (XGBOOST_EXPECT((std::isinf(elem.value)), false)) { + valid = false; + } bst_bin_t bin_idx{-1}; if (common::IsCat(ft, elem.column_idx)) { bin_idx = cut.SearchCatBin(elem.value, elem.column_idx, ptrs, values); @@ -82,6 +88,8 @@ class GHistIndexMatrix { } } }); + + CHECK(valid) << error::InfInData(); } // Gather hit_count from all threads diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index ae0cfc4a4..dc6fb55e8 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -190,7 +190,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing, // From here on Info() has the correct data shape Info().num_row_ = accumulated_rows; Info().num_nonzero_ = nnz; - collective::Allreduce(&info_.num_col_, 1); + Info().SynchronizeNumberOfColumns(); CHECK(std::none_of(column_sizes.cbegin(), column_sizes.cend(), [&](auto f) { return f > accumulated_rows; })) << "Something went wrong during iteration."; @@ -257,6 +257,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing, } iter.Reset(); CHECK_EQ(rbegin, Info().num_row_); + CHECK_EQ(this->ghist_->Features(), Info().num_col_); /** * Generate column matrix diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 976fcc832..0cdffa124 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -195,7 +195,7 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing, iter.Reset(); // Synchronise worker columns - collective::Allreduce(&info_.num_col_, 1); + info_.SynchronizeNumberOfColumns(); } BatchSet IterativeDMatrix::GetEllpackBatches(BatchParam const& param) { diff --git a/src/data/proxy_dmatrix.cuh b/src/data/proxy_dmatrix.cuh index 38cbffe50..6ea858e7e 100644 --- a/src/data/proxy_dmatrix.cuh +++ b/src/data/proxy_dmatrix.cuh @@ -1,27 +1,24 @@ -/*! 
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023 XGBoost contributors
  */
+#include <any>  // for any, any_cast
+
 #include "device_adapter.cuh"
 #include "proxy_dmatrix.h"
 
-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 template <typename Fn>
 decltype(auto) Dispatch(DMatrixProxy const* proxy, Fn fn) {
   if (proxy->Adapter().type() == typeid(std::shared_ptr<CudfAdapter>)) {
-    auto value = dmlc::get<std::shared_ptr<CudfAdapter>>(
-        proxy->Adapter())->Value();
+    auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
     return fn(value);
   } else if (proxy->Adapter().type() == typeid(std::shared_ptr<CupyAdapter>)) {
-    auto value = dmlc::get<std::shared_ptr<CupyAdapter>>(
-        proxy->Adapter())->Value();
+    auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
     return fn(value);
   } else {
     LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
-    auto value = dmlc::get<std::shared_ptr<CudfAdapter>>(
-        proxy->Adapter())->Value();
+    auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
     return fn(value);
   }
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h
index fa2901c47..587510bd2 100644
--- a/src/data/proxy_dmatrix.h
+++ b/src/data/proxy_dmatrix.h
@@ -1,11 +1,10 @@
-/*!
- * Copyright 2020-2022, XGBoost contributors
+/**
+ * Copyright 2020-2023, XGBoost contributors
  */
 #ifndef XGBOOST_DATA_PROXY_DMATRIX_H_
 #define XGBOOST_DATA_PROXY_DMATRIX_H_
 
-#include <dmlc/any.h>
-
+#include <any>  // for any, any_cast
 #include <memory>
 #include <string>
 #include <utility>
@@ -15,8 +14,7 @@
 #include "xgboost/context.h"
 #include "xgboost/data.h"
 
-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 /*
  * \brief A proxy to external iterator.
  */
@@ -44,7 +42,7 @@ class DataIterProxy {
  */
 class DMatrixProxy : public DMatrix {
   MetaInfo info_;
-  dmlc::any batch_;
+  std::any batch_;
   Context ctx_;
 
 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
@@ -115,9 +113,7 @@
     LOG(FATAL) << "Not implemented.";
     return BatchSet(BatchIterator(nullptr));
   }
-  dmlc::any Adapter() const {
-    return batch_;
-  }
+  std::any Adapter() const { return batch_; }
 };
 
 inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
@@ -131,15 +127,13 @@ inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
 template <typename Fn>
 decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_error = nullptr) {
   if (proxy->Adapter().type() == typeid(std::shared_ptr<CSRArrayAdapter>)) {
-    auto value =
-        dmlc::get<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
+    auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
     if (type_error) {
       *type_error = false;
     }
     return fn(value);
   } else if (proxy->Adapter().type() == typeid(std::shared_ptr<ArrayAdapter>)) {
-    auto value = dmlc::get<std::shared_ptr<ArrayAdapter>>(
-        proxy->Adapter())->Value();
+    auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
     if (type_error) {
       *type_error = false;
     }
@@ -154,6 +148,5 @@ decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_
       decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
   }
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
 #endif  // XGBOOST_DATA_PROXY_DMATRIX_H_
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index 014b57282..098c3c4f2 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -73,6 +73,19 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
   return out;
 }
 
+void SimpleDMatrix::ReindexFeatures() {
+  if (collective::IsFederated() && info_.data_split_mode == DataSplitMode::kCol) {
+    std::vector<uint64_t> buffer(collective::GetWorldSize());
+    buffer[collective::GetRank()] = info_.num_col_;
+    collective::Allgather(buffer.data(), buffer.size() * sizeof(uint64_t));
+    auto offset = std::accumulate(buffer.cbegin(), buffer.cbegin() + collective::GetRank(), 0);
+    if (offset == 0) {
+      return;
+    }
+    sparse_page_->Reindex(offset, ctx_.Threads());
+  }
+}
+
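Editorial aside, not part of the patch: the offset above is an exclusive prefix sum of the gathered per-worker feature widths. A hypothetical standalone helper, with names invented here for illustration:

#include <cstddef>  // for size_t
#include <cstdint>  // for uint64_t
#include <numeric>  // for accumulate
#include <vector>   // for vector

// Hypothetical mirror of the offset computation in ReindexFeatures().
std::uint64_t FeatureOffset(std::vector<std::uint64_t> const& widths, std::size_t rank) {
  // Sum the widths of all workers with a lower rank than ours.
  return std::accumulate(widths.cbegin(), widths.cbegin() + rank, std::uint64_t{0});
}
// With local widths {3, 2, 4}: rank 0 keeps its indices, rank 1 shifts by 3,
// rank 2 shifts by 5, giving every worker indices into the global range [0, 9).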
 BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
   // since csr is the default data structure so `source_` is always available.
   auto begin_iter = BatchIterator<SparsePage>(
@@ -151,7 +164,8 @@ BatchSet<ExtSparsePage> SimpleDMatrix::GetExtBatches(BatchParam const&) {
 }
 
 template <typename AdapterT>
-SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
+SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
+                             DataSplitMode data_split_mode) {
   this->ctx_.nthread = nthread;
 
   std::vector<uint64_t> qids;
@@ -217,7 +231,9 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
 
   // Synchronise worker columns
-  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
+  info_.data_split_mode = data_split_mode;
+  ReindexFeatures();
+  info_.SynchronizeNumberOfColumns();
 
   if (adapter->NumRows() == kAdapterUnknownSize) {
     using IteratorAdapterT
         = IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>;
@@ -272,22 +288,31 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
   fo->Write(sparse_page_->data.HostVector());
 }
 
-template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing, int nthread);
-template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing, int nthread);
-template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing, int nthread);
-template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing, int nthread);
-template SimpleDMatrix::SimpleDMatrix(CSCArrayAdapter* adapter, float missing, int nthread);
-template SimpleDMatrix::SimpleDMatrix(CSCAdapter* adapter, float missing, int nthread);
-template SimpleDMatrix::SimpleDMatrix(DataTableAdapter* adapter, float missing, int nthread);
-template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int nthread);
+template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
+template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
+template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
+template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
+template SimpleDMatrix::SimpleDMatrix(CSCArrayAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
+template SimpleDMatrix::SimpleDMatrix(CSCAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
+template SimpleDMatrix::SimpleDMatrix(DataTableAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
+template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
 template SimpleDMatrix::SimpleDMatrix(
     IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
-    float missing, int nthread);
+    float missing, int nthread, DataSplitMode data_split_mode);
 
 template <>
-SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread) {
-  ctx_.nthread = nthread;
+SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread,
+                             DataSplitMode data_split_mode) {
+  ctx_.nthread = nthread;
 
   auto& offset_vec = sparse_page_->offset.HostVector();
   auto& data_vec = sparse_page_->data.HostVector();
@@ -346,7 +371,10 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
   }
 
   // 
Synchronise worker columns info_.num_col_ = adapter->NumColumns(); - collective::Allreduce(&info_.num_col_, 1); + info_.data_split_mode = data_split_mode; + ReindexFeatures(); + info_.SynchronizeNumberOfColumns(); + info_.num_row_ = total_batch_size; info_.num_nonzero_ = data_vec.size(); CHECK_EQ(offset_vec.back(), info_.num_nonzero_); diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu index 421e14575..b52333fe6 100644 --- a/src/data/simple_dmatrix.cu +++ b/src/data/simple_dmatrix.cu @@ -15,7 +15,10 @@ namespace data { // Current implementation assumes a single batch. More batches can // be supported in future. Does not currently support inferring row/column size template -SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread*/) { +SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread*/, + DataSplitMode data_split_mode) { + CHECK(data_split_mode != DataSplitMode::kCol) + << "Column-wise data split is currently not supported on the GPU."; auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice() : adapter->DeviceIdx(); CHECK_GE(device, 0); @@ -40,12 +43,13 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread info_.num_col_ = adapter->NumColumns(); info_.num_row_ = adapter->NumRows(); // Synchronise worker columns - collective::Allreduce(&info_.num_col_, 1); + info_.data_split_mode = data_split_mode; + info_.SynchronizeNumberOfColumns(); } template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing, - int nthread); + int nthread, DataSplitMode data_split_mode); template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing, - int nthread); + int nthread, DataSplitMode data_split_mode); } // namespace data } // namespace xgboost diff --git a/src/data/simple_dmatrix.cuh b/src/data/simple_dmatrix.cuh index 961e2d5d0..c72af07b6 100644 --- a/src/data/simple_dmatrix.cuh +++ b/src/data/simple_dmatrix.cuh @@ -1,14 +1,13 @@ -/*! 
- * Copyright 2019-2021 by XGBoost Contributors +/** + * Copyright 2019-2023 by XGBoost Contributors * \file simple_dmatrix.cuh */ #ifndef XGBOOST_DATA_SIMPLE_DMATRIX_CUH_ #define XGBOOST_DATA_SIMPLE_DMATRIX_CUH_ #include -#include #include -#include "device_adapter.cuh" +#include #if defined(XGBOOST_USE_CUDA) #include "../common/device_helpers.cuh" @@ -16,8 +15,10 @@ #include "../common/device_helpers.hip.h" #endif -namespace xgboost { -namespace data { +#include "../common/error_msg.h" // for InfInData +#include "device_adapter.cuh" // for HasInfInData + +namespace xgboost::data { #if defined(XGBOOST_USE_CUDA) template @@ -94,7 +95,11 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span offset, } template -size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing, SparsePage* page) { +size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing, + SparsePage* page) { + bool valid = HasInfInData(batch, IsValidFunctor{missing}); + CHECK(valid) << error::InfInData(); + page->offset.SetDevice(device); page->data.SetDevice(device); page->offset.Resize(batch.NumRows() + 1); @@ -106,6 +111,5 @@ size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missin return num_nonzero_; } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data #endif // XGBOOST_DATA_SIMPLE_DMATRIX_CUH_ diff --git a/src/data/simple_dmatrix.h b/src/data/simple_dmatrix.h index 897abfcf0..853e765af 100644 --- a/src/data/simple_dmatrix.h +++ b/src/data/simple_dmatrix.h @@ -22,7 +22,8 @@ class SimpleDMatrix : public DMatrix { public: SimpleDMatrix() = default; template - explicit SimpleDMatrix(AdapterT* adapter, float missing, int nthread); + explicit SimpleDMatrix(AdapterT* adapter, float missing, int nthread, + DataSplitMode data_split_mode = DataSplitMode::kRow); explicit SimpleDMatrix(dmlc::Stream* in_stream); ~SimpleDMatrix() override = default; @@ -61,6 +62,15 @@ class SimpleDMatrix : public DMatrix { bool GHistIndexExists() const override { return static_cast(gradient_index_); } bool SparsePageExists() const override { return true; } + /** + * \brief Reindex the features based on a global view. + * + * In some cases (e.g. vertical federated learning), features are loaded locally with indices + * starting from 0. However, all the algorithms assume the features are globally indexed, so we + * reindex the features based on the offset needed to obtain the global view. 
+ */ + void ReindexFeatures(); + private: Context ctx_; }; diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc index ccd780618..f9b74ebcf 100644 --- a/src/data/sparse_page_dmatrix.cc +++ b/src/data/sparse_page_dmatrix.cc @@ -96,7 +96,7 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p this->info_.num_col_ = n_features; this->info_.num_nonzero_ = nnz; - collective::Allreduce(&info_.num_col_, 1); + info_.SynchronizeNumberOfColumns(); CHECK_NE(info_.num_col_, 0); } diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 7c2c59688..3c0b269a5 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -10,6 +10,7 @@ #include #include +#include // for uint32_t #include #include #include @@ -27,9 +28,11 @@ #include "xgboost/host_device_vector.h" #include "xgboost/json.h" #include "xgboost/logging.h" +#include "xgboost/model.h" #include "xgboost/objective.h" #include "xgboost/predictor.h" -#include "xgboost/string_view.h" +#include "xgboost/string_view.h" // for StringView +#include "xgboost/tree_model.h" // for RegTree #include "xgboost/tree_updater.h" namespace xgboost::gbm { @@ -131,6 +134,12 @@ void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) { // set, since only experts are expected to do so. return; } + if (model_.learner_model_param->IsVectorLeaf()) { + CHECK(tparam_.tree_method == TreeMethod::kHist) + << "Only the hist tree method is supported for building multi-target trees with vector " + "leaf."; + } + // tparam_ is set before calling this function. if (tparam_.tree_method != TreeMethod::kAuto) { return; @@ -175,12 +184,12 @@ void GBTree::ConfigureUpdaters() { case TreeMethod::kExact: tparam_.updater_seq = "grow_colmaker,prune"; break; - case TreeMethod::kHist: - LOG(INFO) << - "Tree method is selected to be 'hist', which uses a " - "single updater grow_quantile_histmaker."; + case TreeMethod::kHist: { + LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater " + "grow_quantile_histmaker."; tparam_.updater_seq = "grow_quantile_histmaker"; break; + } case TreeMethod::kGPUHist: { common::AssertGPUSupport(); tparam_.updater_seq = "grow_gpu_hist"; @@ -209,11 +218,9 @@ void CopyGradient(HostDeviceVector const* in_gpair, int32_t n_thre GPUCopyGradient(in_gpair, n_groups, group_id, out_gpair); } else { std::vector &tmp_h = out_gpair->HostVector(); - auto nsize = static_cast(out_gpair->Size()); - const auto &gpair_h = in_gpair->ConstHostVector(); - common::ParallelFor(nsize, n_threads, [&](bst_omp_uint i) { - tmp_h[i] = gpair_h[i * n_groups + group_id]; - }); + const auto& gpair_h = in_gpair->ConstHostVector(); + common::ParallelFor(out_gpair->Size(), n_threads, + [&](auto i) { tmp_h[i] = gpair_h[i * n_groups + group_id]; }); } } @@ -234,6 +241,7 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector const CHECK_EQ(model_.param.num_parallel_tree, trees.size()); CHECK_EQ(model_.param.num_parallel_tree, 1) << "Boosting random forest is not supported for current objective."; + CHECK(!trees.front()->IsMultiTarget()) << "Update tree leaf" << MTNotImplemented(); CHECK_EQ(trees.size(), model_.param.num_parallel_tree); for (std::size_t tree_idx = 0; tree_idx < trees.size(); ++tree_idx) { auto const& position = node_position.at(tree_idx); @@ -245,17 +253,18 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector const void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector* in_gpair, PredictionCacheEntry* predt, ObjFunction const* obj) { std::vector>> new_trees; - const int ngroup = 
model_.learner_model_param->num_output_group;
+  const int ngroup = model_.learner_model_param->OutputLength();
   ConfigureWithKnownData(this->cfg_, p_fmat);
   monitor_.Start("BoostNewTrees");
+
   // Weird case that tree method is cpu-based but gpu_id is set.  Ideally we should let
   // `gpu_id` be the single source of determining what algorithms to run, but that will
   // break a lot of existing code.
   auto device = tparam_.tree_method != TreeMethod::kGPUHist ? Context::kCpuId : ctx_->gpu_id;
-  auto out = linalg::TensorView<float, 2>{
+  auto out = linalg::MakeTensorView(
+      device,
       device == Context::kCpuId ? predt->predictions.HostSpan() : predt->predictions.DeviceSpan(),
-      {static_cast<std::size_t>(p_fmat->Info().num_row_), static_cast<std::size_t>(ngroup)},
-      device};
+      p_fmat->Info().num_row_, model_.learner_model_param->OutputLength());
   CHECK_NE(ngroup, 0);
 
   if (!p_fmat->SingleColBlock() && obj->Task().UpdateTreeLeaf()) {
@@ -266,7 +275,13 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
   // position is negated if the row is sampled out.
   std::vector<HostDeviceVector<bst_node_t>> node_position;
 
-  if (ngroup == 1) {
+  if (model_.learner_model_param->IsVectorLeaf()) {
+    std::vector<std::unique_ptr<RegTree>> ret;
+    BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret);
+    UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret);
+    // No update prediction cache yet.
+    new_trees.push_back(std::move(ret));
+  } else if (model_.learner_model_param->OutputLength() == 1) {
     std::vector<std::unique_ptr<RegTree>> ret;
     BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret);
     UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret);
@@ -360,8 +375,8 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
         << "Set `process_type` to `update` if you want to update existing "
           "trees.";
     // create new tree
-    std::unique_ptr<RegTree> ptr(new RegTree());
-    ptr->param.UpdateAllowUnknown(this->cfg_);
+    std::unique_ptr<RegTree> ptr(new RegTree{this->model_.learner_model_param->LeafLength(),
+                                             this->model_.learner_model_param->num_feature});
     new_trees.push_back(ptr.get());
     ret->push_back(std::move(ptr));
   } else if (tparam_.process_type == TreeProcessType::kUpdate) {
@@ -383,11 +398,15 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
   }
 
   // update the trees
-  CHECK_EQ(gpair->Size(), p_fmat->Info().num_row_)
-      << "Mismatching size between number of rows from input data and size of "
-        "gradient vector.";
+  auto n_out = model_.learner_model_param->OutputLength() * p_fmat->Info().num_row_;
+  StringView msg{
+      "Mismatching size between number of rows from input data and size of gradient vector."};
+  if (!model_.learner_model_param->IsVectorLeaf() && p_fmat->Info().num_row_ != 0) {
+    CHECK_EQ(n_out % gpair->Size(), 0) << msg;
+  } else {
+    CHECK_EQ(gpair->Size(), n_out) << msg;
+  }
 
-  CHECK(out_position);
   out_position->resize(new_trees.size());
 
   // Rescale learning rate according to the size of trees
@@ -402,8 +421,12 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
 
 void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& new_trees) {
   monitor_.Start("CommitModel");
-  for (uint32_t gid = 0; gid < model_.learner_model_param->num_output_group; ++gid) {
-    model_.CommitModel(std::move(new_trees[gid]), gid);
+  if (this->model_.learner_model_param->IsVectorLeaf()) {
+    model_.CommitModel(std::move(new_trees[0]), 0);
+  } else {
+    for (std::uint32_t gid = 0; gid < model_.learner_model_param->OutputLength(); ++gid) {
+      model_.CommitModel(std::move(new_trees[gid]), gid);
+    }
   }
   monitor_.Stop("CommitModel");
 }
@@ -564,11 +587,10 @@ void GBTree::PredictBatch(DMatrix* p_fmat,
   if 
(out_preds->version == 0) { // out_preds->Size() can be non-zero as it's initialized here before any // tree is built at the 0^th iterator. - predictor->InitOutPredictions(p_fmat->Info(), &out_preds->predictions, - model_); + predictor->InitOutPredictions(p_fmat->Info(), &out_preds->predictions, model_); } - uint32_t tree_begin, tree_end; + std::uint32_t tree_begin, tree_end; std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end); CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees."; if (tree_end > tree_begin) { @@ -577,7 +599,7 @@ void GBTree::PredictBatch(DMatrix* p_fmat, if (reset) { out_preds->version = 0; } else { - uint32_t delta = layer_end - out_preds->version; + std::uint32_t delta = layer_end - out_preds->version; out_preds->Update(delta); } } @@ -770,6 +792,7 @@ class Dart : public GBTree { void PredictBatchImpl(DMatrix *p_fmat, PredictionCacheEntry *p_out_preds, bool training, unsigned layer_begin, unsigned layer_end) const { + CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented(); auto &predictor = this->GetPredictor(&p_out_preds->predictions, p_fmat); CHECK(predictor); predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, @@ -830,6 +853,7 @@ class Dart : public GBTree { void InplacePredict(std::shared_ptr p_fmat, float missing, PredictionCacheEntry* p_out_preds, uint32_t layer_begin, unsigned layer_end) const override { + CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented(); uint32_t tree_begin, tree_end; std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end); auto n_groups = model_.learner_model_param->num_output_group; @@ -996,8 +1020,9 @@ class Dart : public GBTree { } // set normalization factors - inline size_t NormalizeTrees(size_t size_new_trees) { - float lr = 1.0 * dparam_.learning_rate / size_new_trees; + std::size_t NormalizeTrees(size_t size_new_trees) { + CHECK(tree_param_.GetInitialised()); + float lr = 1.0 * tree_param_.learning_rate / size_new_trees; size_t num_drop = idx_drop_.size(); if (num_drop == 0) { for (size_t i = 0; i < size_new_trees; ++i) { diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h index 177f1ca44..157b3b84e 100644 --- a/src/gbm/gbtree.h +++ b/src/gbm/gbtree.h @@ -111,8 +111,6 @@ struct DartTrainParam : public XGBoostParameter { bool one_drop; /*! \brief probability of skipping the dropout during an iteration */ float skip_drop; - /*! \brief learning step size for a time */ - float learning_rate; // declare parameters DMLC_DECLARE_PARAMETER(DartTrainParam) { DMLC_DECLARE_FIELD(sample_type) @@ -136,24 +134,27 @@ struct DartTrainParam : public XGBoostParameter { .set_range(0.0f, 1.0f) .set_default(0.0f) .describe("Probability of skipping the dropout during a boosting iteration."); - DMLC_DECLARE_FIELD(learning_rate) - .set_lower_bound(0.0f) - .set_default(0.3f) - .describe("Learning rate(step size) of update."); - DMLC_DECLARE_ALIAS(learning_rate, eta); } }; namespace detail { // From here on, layer becomes concrete trees. 
-inline std::pair<uint32_t, uint32_t> LayerToTree(gbm::GBTreeModel const &model,
-                                                 size_t layer_begin,
-                                                 size_t layer_end) {
-  bst_group_t groups = model.learner_model_param->num_output_group;
-  uint32_t tree_begin = layer_begin * groups * model.param.num_parallel_tree;
-  uint32_t tree_end = layer_end * groups * model.param.num_parallel_tree;
+inline std::pair<std::uint32_t, std::uint32_t> LayerToTree(gbm::GBTreeModel const& model,
+                                                           std::uint32_t layer_begin,
+                                                           std::uint32_t layer_end) {
+  std::uint32_t tree_begin;
+  std::uint32_t tree_end;
+  if (model.learner_model_param->IsVectorLeaf()) {
+    tree_begin = layer_begin * model.param.num_parallel_tree;
+    tree_end = layer_end * model.param.num_parallel_tree;
+  } else {
+    bst_group_t groups = model.learner_model_param->OutputLength();
+    tree_begin = layer_begin * groups * model.param.num_parallel_tree;
+    tree_end = layer_end * groups * model.param.num_parallel_tree;
+  }
+
   if (tree_end == 0) {
-    tree_end = static_cast<uint32_t>(model.trees.size());
+    tree_end = model.trees.size();
   }
   if (model.trees.size() != 0) {
     CHECK_LE(tree_begin, tree_end);
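Editorial aside, not part of the patch: a worked example of the index arithmetic above, with a hypothetical standalone helper.

#include <cstdint>  // for uint32_t
#include <utility>  // for pair

// Hypothetical mirror of LayerToTree()'s begin/end computation.
std::pair<std::uint32_t, std::uint32_t> LayerRange(std::uint32_t layer_begin,
                                                   std::uint32_t layer_end,
                                                   std::uint32_t n_groups,
                                                   std::uint32_t num_parallel_tree,
                                                   bool vector_leaf) {
  // A vector-leaf (multi_output_tree) model emits num_parallel_tree trees per
  // layer; otherwise each output group gets its own tree in every layer.
  std::uint32_t trees_per_layer = vector_leaf ? num_parallel_tree : n_groups * num_parallel_tree;
  return {layer_begin * trees_per_layer, layer_end * trees_per_layer};
}
// LayerRange(1, 3, /*n_groups=*/3, /*num_parallel_tree=*/1, false) == {3, 9}
// LayerRange(1, 3, /*n_groups=*/3, /*num_parallel_tree=*/1, true)  == {1, 3}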
@@ -241,22 +242,25 @@ class GBTree : public GradientBooster {
   void LoadModel(Json const& in) override;
 
   // Number of trees per layer.
-  auto LayerTrees() const {
-    auto n_trees = model_.learner_model_param->num_output_group * model_.param.num_parallel_tree;
-    return n_trees;
+  [[nodiscard]] std::uint32_t LayerTrees() const {
+    if (model_.learner_model_param->IsVectorLeaf()) {
+      return model_.param.num_parallel_tree;
+    }
+    return model_.param.num_parallel_tree * model_.learner_model_param->OutputLength();
   }
 
   // slice the trees, out must be already allocated
   void Slice(int32_t layer_begin, int32_t layer_end, int32_t step,
              GradientBooster *out, bool* out_of_bound) const override;
 
-  int32_t BoostedRounds() const override {
+  [[nodiscard]] std::int32_t BoostedRounds() const override {
     CHECK_NE(model_.param.num_parallel_tree, 0);
     CHECK_NE(model_.learner_model_param->num_output_group, 0);
+
     return model_.trees.size() / this->LayerTrees();
   }
 
-  bool ModelFitted() const override {
+  [[nodiscard]] bool ModelFitted() const override {
     return !model_.trees.empty() || !model_.trees_to_update.empty();
   }
diff --git a/src/learner.cc b/src/learner.cc
index 454855355..50d54c9fc 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -326,7 +326,7 @@ struct LearnerTrainParam : public XGBoostParameter<LearnerTrainParam> {
   std::string booster;
   std::string objective;
   // This is a training parameter and is not saved (nor loaded) in the model.
-  MultiStrategy multi_strategy{MultiStrategy::kComposite};
+  MultiStrategy multi_strategy{MultiStrategy::kOneOutputPerTree};
 
   // declare parameters
   DMLC_DECLARE_PARAMETER(LearnerTrainParam) {
@@ -339,12 +339,12 @@ struct LearnerTrainParam : public XGBoostParameter<LearnerTrainParam> {
         .set_default("reg:squarederror")
         .describe("Objective function used for obtaining gradient.");
     DMLC_DECLARE_FIELD(multi_strategy)
-        .add_enum("composite", MultiStrategy::kComposite)
-        .add_enum("monolithic", MultiStrategy::kMonolithic)
-        .set_default(MultiStrategy::kComposite)
+        .add_enum("one_output_per_tree", MultiStrategy::kOneOutputPerTree)
+        .add_enum("multi_output_tree", MultiStrategy::kMultiOutputTree)
+        .set_default(MultiStrategy::kOneOutputPerTree)
         .describe(
-            "Strategy used for training multi-target models. `mono` means building one single tree "
-            "for all targets.");
+            "Strategy used for training multi-target models. `multi_output_tree` means building "
+            "a single tree for all targets.");
   }
 };
@@ -440,7 +440,7 @@ class LearnerConfiguration : public Learner {
     info.Validate(Ctx()->gpu_id);
     // We estimate it from input data.
     linalg::Tensor<float, 1> base_score;
-    UsePtr(obj_)->InitEstimation(info, &base_score);
+    InitEstimation(info, &base_score);
     CHECK_EQ(base_score.Size(), 1);
     mparam_.base_score = base_score(0);
     CHECK(!std::isnan(mparam_.base_score));
@@ -775,8 +775,6 @@ class LearnerConfiguration : public Learner {
     }
     CHECK_NE(mparam_.num_feature, 0)
        << "0 feature is supplied. Are you using raw Booster interface?";
-    // Remove these once binary IO is gone.
-    cfg_["num_feature"] = common::ToString(mparam_.num_feature);
   }
 
   void ConfigureGBM(LearnerTrainParam const& old, Args const& args) {
@@ -859,17 +857,37 @@ class LearnerConfiguration : public Learner {
       mparam_.num_target = n_targets;
     }
   }
+
+  void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) {
+    // Special handling for vertical federated learning.
+    if (collective::IsFederated() && info.data_split_mode == DataSplitMode::kCol) {
+      // We assume labels are only available on worker 0, so the estimation is calculated there
+      // and broadcast to the other workers.
+      if (collective::GetRank() == 0) {
+        UsePtr(obj_)->InitEstimation(info, base_score);
+        collective::Broadcast(base_score->Data()->HostPointer(),
+                              sizeof(bst_float) * base_score->Size(), 0);
+      } else {
+        base_score->Reshape(1);
+        collective::Broadcast(base_score->Data()->HostPointer(),
+                              sizeof(bst_float) * base_score->Size(), 0);
+      }
+    } else {
+      UsePtr(obj_)->InitEstimation(info, base_score);
+    }
+  }
 };
 
 std::string const LearnerConfiguration::kEvalMetric {"eval_metric"};  // NOLINT
 
 class LearnerIO : public LearnerConfiguration {
  private:
-  std::set<std::string> saved_configs_ = {"num_round"};
   // Used to identify the offset of the JSON string during model serialisation.
   // Will be removed once JSON takes over. Right now we still load some RDS files from R.
   std::string const serialisation_header_ { u8"CONFIG-offset:" };
 
+  void ClearCaches() { this->prediction_container_ = PredictionContainer{}; }
+
  public:
   explicit LearnerIO(std::vector<std::shared_ptr<DMatrix>> cache) : LearnerConfiguration{cache} {}
@@ -922,6 +940,7 @@ class LearnerIO : public LearnerConfiguration {
     }
 
     this->need_configuration_ = true;
+    this->ClearCaches();
   }
 
   void SaveModel(Json* p_out) const override {
@@ -1015,21 +1034,11 @@ class LearnerIO : public LearnerConfiguration {
     CHECK(fi->Read(&tparam_.booster)) << "BoostLearner: wrong model format";
 
     obj_.reset(ObjFunction::Create(tparam_.objective, &ctx_));
-    gbm_.reset(GradientBooster::Create(tparam_.booster, &ctx_,
-                                       &learner_model_param_));
+    gbm_.reset(GradientBooster::Create(tparam_.booster, &ctx_, &learner_model_param_));
     gbm_->Load(fi);
     if (mparam_.contain_extra_attrs != 0) {
       std::vector<std::pair<std::string, std::string>> attr;
       fi->Read(&attr);
-      for (auto& kv : attr) {
-        const std::string prefix = "SAVED_PARAM_";
-        if (kv.first.find(prefix) == 0) {
-          const std::string saved_param = kv.first.substr(prefix.length());
-          if (saved_configs_.find(saved_param) != saved_configs_.end()) {
-            cfg_[saved_param] = kv.second;
-          }
-        }
-      }
       attributes_ = std::map<std::string, std::string>(attr.begin(), attr.end());
     }
     bool warn_old_model { false };
@@ -1098,6 +1107,7 @@ class LearnerIO : public LearnerConfiguration {
     cfg_.insert(n.cbegin(), n.cend());
 
     this->need_configuration_ = true;
+    this->ClearCaches();
  }
 
  // Save model into binary format. 
The code is about to be deprecated by more robust @@ -1111,16 +1121,6 @@ class LearnerIO : public LearnerConfiguration { std::vector > extra_attr; mparam.contain_extra_attrs = 1; - { - std::vector saved_params; - for (const auto& key : saved_params) { - auto it = cfg_.find(key); - if (it != cfg_.end()) { - mparam.contain_extra_attrs = 1; - extra_attr.emplace_back("SAVED_PARAM_" + key, it->second); - } - } - } { // Similar to JSON model IO, we save the objective. Json j_obj { Object() }; @@ -1305,7 +1305,7 @@ class LearnerImpl : public LearnerIO { monitor_.Stop("PredictRaw"); monitor_.Start("GetGradient"); - obj_->GetGradient(predt.predictions, train->Info(), iter, &gpair_); + GetGradient(predt.predictions, train->Info(), iter, &gpair_); monitor_.Stop("GetGradient"); TrainingObserver::Instance().Observe(gpair_, "Gradients"); @@ -1484,6 +1484,28 @@ class LearnerImpl : public LearnerIO { } private: + void GetGradient(HostDeviceVector const& preds, MetaInfo const& info, int iteration, + HostDeviceVector* out_gpair) { + // Special handling for vertical federated learning. + if (collective::IsFederated() && info.data_split_mode == DataSplitMode::kCol) { + // We assume labels are only available on worker 0, so the gradients are calculated there + // and broadcast to other workers. + if (collective::GetRank() == 0) { + obj_->GetGradient(preds, info, iteration, out_gpair); + collective::Broadcast(out_gpair->HostPointer(), out_gpair->Size() * sizeof(GradientPair), + 0); + } else { + CHECK_EQ(info.labels.Size(), 0) + << "In vertical federated learning, labels should only be on the first worker"; + out_gpair->Resize(preds.Size()); + collective::Broadcast(out_gpair->HostPointer(), out_gpair->Size() * sizeof(GradientPair), + 0); + } + } else { + obj_->GetGradient(preds, info, iteration, out_gpair); + } + } + /*! \brief random number transformation seed. */ static int32_t constexpr kRandSeedMagic = 127; // gradient pairs diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc index 69e6e24cd..3a1416b0f 100644 --- a/src/metric/rank_metric.cc +++ b/src/metric/rank_metric.cc @@ -20,23 +20,51 @@ // corresponding headers that brings in those function declaration can't be included with CUDA). // This precludes the CPU and GPU logic to coexist inside a .cu file +#include "rank_metric.h" + +#include #include -#include -#include -#include +#include // for stable_sort, copy, fill_n, min, max +#include // for array +#include // for log, sqrt +#include // for size_t, std +#include // for uint32_t +#include // for less, greater +#include // for operator!=, _Rb_tree_const_iterator +#include // for allocator, unique_ptr, shared_ptr, __shared_... 
+#include // for accumulate +#include // for operator<<, basic_ostream, ostringstream +#include // for char_traits, operator<, basic_string, to_string +#include // for pair, make_pair +#include // for vector -#include "../collective/communicator-inl.h" -#include "../common/algorithm.h" // Sort -#include "../common/math.h" -#include "../common/ranking_utils.h" // MakeMetricName -#include "../common/threading_utils.h" -#include "metric_common.h" -#include "xgboost/host_device_vector.h" +#include "../collective/communicator-inl.h" // for IsDistributed, Allreduce +#include "../collective/communicator.h" // for Operation +#include "../common/algorithm.h" // for ArgSort, Sort +#include "../common/linalg_op.h" // for cbegin, cend +#include "../common/math.h" // for CmpFirst +#include "../common/optional_weight.h" // for OptionalWeights, MakeOptionalWeights +#include "../common/ranking_utils.h" // for LambdaRankParam, NDCGCache, ParseMetricName +#include "../common/threading_utils.h" // for ParallelFor +#include "../common/transform_iterator.h" // for IndexTransformIter +#include "dmlc/common.h" // for OMPException +#include "metric_common.h" // for MetricNoCache, GPUMetric, PackedReduceResult +#include "xgboost/base.h" // for bst_float, bst_omp_uint, bst_group_t, Args +#include "xgboost/cache.h" // for DMatrixCache +#include "xgboost/context.h" // for Context +#include "xgboost/data.h" // for MetaInfo, DMatrix +#include "xgboost/host_device_vector.h" // for HostDeviceVector +#include "xgboost/json.h" // for Json, FromJson, IsA, ToJson, get, Null, Object +#include "xgboost/linalg.h" // for Tensor, TensorView, Range, VectorView, MakeT... +#include "xgboost/logging.h" // for CHECK, ConsoleLogger, LOG_INFO, CHECK_EQ +#include "xgboost/metric.h" // for MetricReg, XGBOOST_REGISTER_METRIC, Metric +#include "xgboost/span.h" // for Span, operator!= +#include "xgboost/string_view.h" // for StringView namespace { -using PredIndPair = std::pair; +using PredIndPair = std::pair; using PredIndPairContainer = std::vector; /* @@ -87,8 +115,7 @@ class PerGroupWeightPolicy { } // anonymous namespace -namespace xgboost { -namespace metric { +namespace xgboost::metric { // tag the this file, used by force static link later. DMLC_REGISTRY_FILE_TAG(rank_metric); @@ -257,71 +284,6 @@ struct EvalPrecision : public EvalRank { } }; -/*! \brief NDCG: Normalized Discounted Cumulative Gain at N */ -struct EvalNDCG : public EvalRank { - private: - double CalcDCG(const PredIndPairContainer &rec) const { - double sumdcg = 0.0; - for (size_t i = 0; i < rec.size() && i < this->topn; ++i) { - const unsigned rel = rec[i].second; - if (rel != 0) { - sumdcg += ((1 << rel) - 1) / std::log2(i + 2.0); - } - } - return sumdcg; - } - - public: - explicit EvalNDCG(const char* name, const char* param) : EvalRank(name, param) {} - - double EvalGroup(PredIndPairContainer *recptr) const override { - PredIndPairContainer &rec(*recptr); - std::stable_sort(rec.begin(), rec.end(), common::CmpFirst); - double dcg = CalcDCG(rec); - std::stable_sort(rec.begin(), rec.end(), common::CmpSecond); - double idcg = CalcDCG(rec); - if (idcg == 0.0f) { - if (this->minus) { - return 0.0f; - } else { - return 1.0f; - } - } - return dcg/idcg; - } -}; - -/*! 
\brief Mean Average Precision at N, for both classification and rank */
-struct EvalMAP : public EvalRank {
- public:
-  explicit EvalMAP(const char* name, const char* param) : EvalRank(name, param) {}
-
-  double EvalGroup(PredIndPairContainer *recptr) const override {
-    PredIndPairContainer &rec(*recptr);
-    std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
-    unsigned nhits = 0;
-    double sumap = 0.0;
-    for (size_t i = 0; i < rec.size(); ++i) {
-      if (rec[i].second != 0) {
-        nhits += 1;
-        if (i < this->topn) {
-          sumap += static_cast<double>(nhits) / (i + 1);
-        }
-      }
-    }
-    if (nhits != 0) {
-      sumap /= nhits;
-      return sumap;
-    } else {
-      if (this->minus) {
-        return 0.0;
-      } else {
-        return 1.0;
-      }
-    }
-  }
-};
-
 /*! \brief Cox: Partial likelihood of the Cox proportional hazards model */
 struct EvalCox : public MetricNoCache {
  public:
@@ -377,16 +339,213 @@ XGBOOST_REGISTER_METRIC(Precision, "pre")
 .describe("precision@k for rank.")
 .set_body([](const char* param) { return new EvalPrecision("pre", param); });
 
-XGBOOST_REGISTER_METRIC(NDCG, "ndcg")
-.describe("ndcg@k for rank.")
-.set_body([](const char* param) { return new EvalNDCG("ndcg", param); });
-
-XGBOOST_REGISTER_METRIC(MAP, "map")
-.describe("map@k for rank.")
-.set_body([](const char* param) { return new EvalMAP("map", param); });
-
 XGBOOST_REGISTER_METRIC(Cox, "cox-nloglik")
 .describe("Negative log partial likelihood of Cox proportional hazards model.")
 .set_body([](const char*) { return new EvalCox(); });
+
+// ranking metrics that require a cache
+template <typename Cache>
+class EvalRankWithCache : public Metric {
+ protected:
+  ltr::LambdaRankParam param_;
+  bool minus_{false};
+  std::string name_;
+
+  DMatrixCache<Cache> cache_{DMatrixCache<Cache>::DefaultSize()};
+
+ public:
+  EvalRankWithCache(StringView name, const char* param) {
+    auto constexpr kMax = ltr::LambdaRankParam::NotSet();
+    std::uint32_t topn{kMax};
+    this->name_ = ltr::ParseMetricName(name, param, &topn, &minus_);
+    if (topn != kMax) {
+      param_.UpdateAllowUnknown(Args{{"lambdarank_num_pair_per_sample", std::to_string(topn)},
+                                     {"lambdarank_pair_method", "topk"}});
+    }
+    param_.UpdateAllowUnknown(Args{});
+  }
+  void Configure(Args const&) override {
+    // do not configure, otherwise the ndcg param will be forced into the same as the one in
+    // the objective.
+  }
+  void LoadConfig(Json const& in) override {
+    if (IsA<Null>(in)) {
+      return;
+    }
+    auto const& obj = get<Object const>(in);
+    auto it = obj.find("lambdarank_param");
+    if (it != obj.cend()) {
+      FromJson(it->second, &param_);
+    }
+  }
+
+  void SaveConfig(Json* p_out) const override {
+    auto& out = *p_out;
+    out["name"] = String{this->Name()};
+    out["lambdarank_param"] = ToJson(param_);
+  }
+
+  double Evaluate(HostDeviceVector<float> const& preds, std::shared_ptr<DMatrix> p_fmat) override {
+    auto const& info = p_fmat->Info();
+    auto p_cache = cache_.CacheItem(p_fmat, ctx_, info, param_);
+    if (p_cache->Param() != param_) {
+      p_cache = cache_.ResetItem(p_fmat, ctx_, info, param_);
+    }
+    CHECK(p_cache->Param() == param_);
+    CHECK_EQ(preds.Size(), info.labels.Size());
+
+    return this->Eval(preds, info, p_cache);
+  }
+
+  virtual double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info,
+                      std::shared_ptr<Cache> p_cache) = 0;
+};
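Editorial aside, not part of the patch: assuming ParseMetricName follows the name@topn[-] convention suggested by its out-parameters, the constructor above maps metric strings to configuration roughly as follows.

//   "ndcg"     -> topn = NotSet(), minus = false  (score the whole list)
//   "ndcg@4"   -> topn = 4,        minus = false  (truncate the list at 4)
//   "ndcg@4-"  -> topn = 4,        minus = true   (degenerate groups score 0, not 1)
// A concrete topn is forwarded to LambdaRankParam as
// lambdarank_num_pair_per_sample together with the "topk" pair method.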
+
+namespace {
+double Finalize(double score, double sw) {
+  std::array<double, 2> dat{score, sw};
+  collective::Allreduce<collective::Operation::kSum>(dat.data(), dat.size());
+  if (sw > 0.0) {
+    score = score / sw;
+  }
+
+  CHECK_LE(score, 1.0 + kRtEps)
+      << "Invalid output score, might be caused by invalid query group weight.";
+  score = std::min(1.0, score);
+
+  return score;
+}
+}  // namespace
+
+/**
+ * \brief Implement the NDCG score function for learning to rank.
+ *
+ * Ties are ignored, which can lead to different results from other implementations.
+ */
+class EvalNDCG : public EvalRankWithCache<ltr::NDCGCache> {
+ public:
+  using EvalRankWithCache::EvalRankWithCache;
+  const char* Name() const override { return name_.c_str(); }
+
+  double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info,
+              std::shared_ptr<ltr::NDCGCache> p_cache) override {
+    if (ctx_->IsCUDA()) {
+      auto ndcg = cuda_impl::NDCGScore(ctx_, info, preds, minus_, p_cache);
+      return Finalize(ndcg.Residue(), ndcg.Weights());
+    }
+
+    // group-local ndcg
+    auto group_ptr = p_cache->DataGroupPtr(ctx_);
+    bst_group_t n_groups = group_ptr.size() - 1;
+    auto ndcg_gloc = p_cache->Dcg(ctx_);
+    std::fill_n(ndcg_gloc.Values().data(), ndcg_gloc.Size(), 0.0);
+
+    auto h_inv_idcg = p_cache->InvIDCG(ctx_);
+    auto p_discount = p_cache->Discount(ctx_).data();
+
+    auto h_label = info.labels.HostView();
+    auto h_predt = linalg::MakeTensorView(ctx_, &preds, preds.Size());
+    auto weights = common::MakeOptionalWeights(ctx_, info.weights_);
+
+    common::ParallelFor(n_groups, ctx_->Threads(), [&](auto g) {
+      auto g_predt = h_predt.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]));
+      auto g_labels = h_label.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]), 0);
+      auto sorted_idx = common::ArgSort<std::size_t>(ctx_, linalg::cbegin(g_predt),
+                                                     linalg::cend(g_predt), std::greater<>{});
+      double ndcg{.0};
+      double inv_idcg = h_inv_idcg(g);
+      if (inv_idcg <= 0.0) {
+        ndcg_gloc(g) = minus_ ? 0.0 : 1.0;
+        return;
+      }
+      std::size_t n{std::min(sorted_idx.size(), static_cast<std::size_t>(param_.TopK()))};
+      if (param_.ndcg_exp_gain) {
+        for (std::size_t i = 0; i < n; ++i) {
+          ndcg += p_discount[i] * ltr::CalcDCGGain(g_labels(sorted_idx[i])) * inv_idcg;
+        }
+      } else {
+        for (std::size_t i = 0; i < n; ++i) {
+          ndcg += p_discount[i] * g_labels(sorted_idx[i]) * inv_idcg;
+        }
+      }
+      ndcg_gloc(g) += ndcg * weights[g];
+    });
+    double sum_w{0};
+    if (weights.Empty()) {
+      sum_w = n_groups;
+    } else {
+      sum_w = std::accumulate(weights.weights.cbegin(), weights.weights.cend(), 0.0);
+    }
+    auto ndcg = std::accumulate(linalg::cbegin(ndcg_gloc), linalg::cend(ndcg_gloc), 0.0);
+    return Finalize(ndcg, sum_w);
+  }
+};
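Editorial aside, not part of the patch: NDCG@3 worked by hand for one group, using the exponential gain (2^rel - 1) and 1/log2(i + 2) discount that the implementation above reads from its cache.

#include <algorithm>  // for min
#include <cmath>      // for log2, pow
#include <cstddef>    // for size_t
#include <vector>     // for vector

double DcgAtK(std::vector<double> const& rels_in_rank_order, std::size_t k) {
  double dcg = 0.0;
  for (std::size_t i = 0; i < std::min(k, rels_in_rank_order.size()); ++i) {
    // Exponential gain, positional discount.
    dcg += (std::pow(2.0, rels_in_rank_order[i]) - 1.0) / std::log2(static_cast<double>(i) + 2.0);
  }
  return dcg;
}

// Labels in prediction order {1, 0, 2}: DCG@3 = 1/log2(2) + 0 + 3/log2(4) = 2.5.
// Ideal order {2, 1, 0}:               IDCG@3 = 3/log2(2) + 1/log2(3) ~= 3.6309.
// NDCG@3 = 2.5 / 3.6309 ~= 0.6885, i.e. dcg * inv_idcg in the loop above.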
+
+class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
+ public:
+  using EvalRankWithCache::EvalRankWithCache;
+  const char* Name() const override { return name_.c_str(); }
+
+  double Eval(HostDeviceVector<float> const& predt, MetaInfo const& info,
+              std::shared_ptr<ltr::MAPCache> p_cache) override {
+    if (ctx_->IsCUDA()) {
+      auto map = cuda_impl::MAPScore(ctx_, info, predt, minus_, p_cache);
+      return Finalize(map.Residue(), map.Weights());
+    }
+
+    auto gptr = p_cache->DataGroupPtr(ctx_);
+    auto h_label = info.labels.HostView().Slice(linalg::All(), 0);
+    auto h_predt = linalg::MakeTensorView(ctx_, &predt, predt.Size());
+
+    auto map_gloc = p_cache->Map(ctx_);
+    std::fill_n(map_gloc.data(), map_gloc.size(), 0.0);
+    auto rank_idx = p_cache->SortedIdx(ctx_, predt.ConstHostSpan());
+
+    common::ParallelFor(p_cache->Groups(), ctx_->Threads(), [&](auto g) {
+      auto g_predt = h_predt.Slice(linalg::Range(gptr[g], gptr[g + 1]));
+      auto g_label = h_label.Slice(linalg::Range(gptr[g], gptr[g + 1]));
+      auto g_rank = rank_idx.subspan(gptr[g]);
+
+      auto n = std::min(static_cast<std::size_t>(param_.TopK()), g_label.Size());
+      double n_hits{0.0};
+      for (std::size_t i = 0; i < n; ++i) {
+        auto p = g_label(g_rank[i]);
+        n_hits += p;
+        map_gloc[g] += n_hits / static_cast<double>((i + 1)) * p;
+      }
+      for (std::size_t i = n; i < g_label.Size(); ++i) {
+        n_hits += g_label(g_rank[i]);
+      }
+      if (n_hits > 0.0) {
+        map_gloc[g] /= std::min(n_hits, static_cast<double>(param_.TopK()));
+      } else {
+        map_gloc[g] = minus_ ? 0.0 : 1.0;
+      }
+    });
+
+    auto sw = 0.0;
+    auto weight = common::MakeOptionalWeights(ctx_, info.weights_);
+    if (!weight.Empty()) {
+      CHECK_EQ(weight.weights.size(), p_cache->Groups());
+    }
+    for (std::size_t i = 0; i < map_gloc.size(); ++i) {
+      map_gloc[i] = map_gloc[i] * weight[i];
+      sw += weight[i];
+    }
+    auto sum = std::accumulate(map_gloc.cbegin(), map_gloc.cend(), 0.0);
+    return Finalize(sum, sw);
+  }
+};
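Editorial aside, not part of the patch: AP@k worked by hand for one group with binary labels, mirroring the two loops above (helper name invented here).

#include <algorithm>  // for min
#include <cstddef>    // for size_t
#include <vector>     // for vector

double AveragePrecisionAtK(std::vector<double> const& labels_in_rank_order, std::size_t k) {
  double n_hits = 0.0, ap = 0.0;
  for (std::size_t i = 0; i < std::min(k, labels_in_rank_order.size()); ++i) {
    n_hits += labels_in_rank_order[i];
    // Precision at position i + 1, counted only on hits.
    ap += n_hits / static_cast<double>(i + 1) * labels_in_rank_order[i];
  }
  for (std::size_t i = k; i < labels_in_rank_order.size(); ++i) {
    n_hits += labels_in_rank_order[i];  // hits outside top-k still enter the normaliser
  }
  return n_hits > 0.0 ? ap / std::min(n_hits, static_cast<double>(k)) : 1.0;
}
// Labels in prediction order {1, 0, 1} with k = 3:
//   hits at ranks 1 and 3 -> (1/1 + 2/3) / min(2, 3) = 5/6 ~= 0.8333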
+
+XGBOOST_REGISTER_METRIC(EvalMAP, "map")
+    .describe("map@k for ranking.")
+    .set_body([](char const* param) {
+      return new EvalMAPScore{"map", param};
+    });
+
+XGBOOST_REGISTER_METRIC(EvalNDCG, "ndcg")
+    .describe("ndcg@k for ranking.")
+    .set_body([](char const* param) {
+      return new EvalNDCG{"ndcg", param};
+    });
+}  // namespace xgboost::metric
diff --git a/src/metric/rank_metric.cu b/src/metric/rank_metric.cu
index b19571559..113857439 100644
--- a/src/metric/rank_metric.cu
+++ b/src/metric/rank_metric.cu
@@ -2,22 +2,29 @@
  * Copyright 2020-2023 by XGBoost Contributors
  */
 #include <dmlc/registry.h>
-#include <thrust/iterator/counting_iterator.h>  // make_counting_iterator
-#include <thrust/reduce.h>                      // reduce
-#include <thrust/sort.h>
+#include <thrust/iterator/counting_iterator.h>  // for make_counting_iterator
+#include <thrust/reduce.h>                      // for reduce
 
-#include <cstddef>  // std::size_t
-#include <memory>   // std::shared_ptr
+#include <algorithm>  // for transform
+#include <cstddef>    // for size_t
+#include <memory>     // for shared_ptr
+#include <vector>     // for vector
 
-#include "../common/cuda_context.cuh"  // CUDAContext
+#include "../common/cuda_context.cuh"    // for CUDAContext
+#include "../common/device_helpers.cuh"  // for MakeTransformIterator
+#include "../common/optional_weight.h"   // for MakeOptionalWeights
+#include "../common/ranking_utils.cuh"   // for CalcQueriesDCG, NDCGCache
 #include "metric_common.h"
-#include "xgboost/base.h"                // XGBOOST_DEVICE
-#include "xgboost/context.h"             // Context
-#include "xgboost/data.h"                // MetaInfo
-#include "xgboost/host_device_vector.h"  // HostDeviceVector
+#include "rank_metric.h"
+#include "xgboost/base.h"                // for XGBOOST_DEVICE
+#include "xgboost/context.h"             // for Context
+#include "xgboost/data.h"                // for MetaInfo
+#include "xgboost/host_device_vector.h"  // for HostDeviceVector
+#include "xgboost/linalg.h"              // for MakeTensorView
+#include "xgboost/logging.h"             // for CHECK
+#include "xgboost/metric.h"
 
-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {
 // tag this file, used by force static link later.
 DMLC_REGISTRY_FILE_TAG(rank_metric_gpu);
 
@@ -134,200 +141,125 @@ struct EvalPrecisionGpu {
   }
 };
 
-/*! \brief NDCG: Normalized Discounted Cumulative Gain at N */
-struct EvalNDCGGpu {
- public:
-  static void ComputeDCG(const dh::SegmentSorter<float> &pred_sorter,
-                         const float *dlabels,
-                         const EvalRankConfig &ecfg,
-                         // The order in which labels have to be accessed. 
The order is determined - // by sorting the predictions or the labels for the entire dataset - const xgboost::common::Span &dlabels_sort_order, - dh::caching_device_vector *dcgptr) { - dh::caching_device_vector &dcgs(*dcgptr); - // Group info on device - const auto &dgroups = pred_sorter.GetGroupsSpan(); - const auto &dgroup_idx = pred_sorter.GetGroupSegmentsSpan(); - - // First, determine non zero labels in the dataset individually - auto DetermineNonTrivialLabelLambda = [=] __device__(uint32_t idx) { - return (static_cast(dlabels[dlabels_sort_order[idx]])); - }; // NOLINT - - // Find each group's DCG value - const auto nitems = pred_sorter.GetNumItems(); - auto *ddcgs = dcgs.data().get(); - - int device_id = -1; - -#if defined(XGBOOST_USE_CUDA) - dh::safe_cuda(cudaGetDevice(&device_id)); -#elif defined(XGBOOST_USE_HIP) - dh::safe_cuda(hipGetDevice(&device_id)); -#endif - - // For each group item compute the aggregated precision - dh::LaunchN(nitems, nullptr, [=] __device__(uint32_t idx) { - const auto group_idx = dgroup_idx[idx]; - const auto group_begin = dgroups[group_idx]; - const auto ridx = idx - group_begin; - auto label = DetermineNonTrivialLabelLambda(idx); - if (ridx < ecfg.topn && label) { - atomicAdd(&ddcgs[group_idx], ((1 << label) - 1) / std::log2(ridx + 2.0)); - } - }); - } - - static double EvalMetric(const dh::SegmentSorter &pred_sorter, - const float *dlabels, - const EvalRankConfig &ecfg) { - // Sort the labels and compute IDCG - dh::SegmentSorter segment_label_sorter; - segment_label_sorter.SortItems(dlabels, pred_sorter.GetNumItems(), - pred_sorter.GetGroupSegmentsSpan()); - - uint32_t ngroups = pred_sorter.GetNumGroups(); - - dh::caching_device_vector idcg(ngroups, 0); - ComputeDCG(pred_sorter, dlabels, ecfg, segment_label_sorter.GetOriginalPositionsSpan(), &idcg); - - // Compute the DCG values next - dh::caching_device_vector dcg(ngroups, 0); - ComputeDCG(pred_sorter, dlabels, ecfg, pred_sorter.GetOriginalPositionsSpan(), &dcg); - - double *ddcg = dcg.data().get(); - double *didcg = idcg.data().get(); - - int device_id = -1; - -#if defined(XGBOOST_USE_CUDA) - dh::safe_cuda(cudaGetDevice(&device_id)); -#elif defined(XGBOOST_USE_HIP) - dh::safe_cuda(hipGetDevice(&device_id)); -#endif - - // Compute the group's DCG and reduce it across all groups - dh::LaunchN(ngroups, nullptr, [=] __device__(uint32_t gidx) { - if (didcg[gidx] == 0.0f) { - ddcg[gidx] = (ecfg.minus) ? 0.0f : 1.0f; - } else { - ddcg[gidx] /= didcg[gidx]; - } - }); - - // Allocator to be used for managing space overhead while performing reductions - dh::XGBCachingDeviceAllocator alloc; - -#if defined(XGBOOST_USE_CUDA) - return thrust::reduce(thrust::cuda::par(alloc), dcg.begin(), dcg.end()); -#elif defined(XGBOOST_USE_HIP) - return thrust::reduce(thrust::hip::par(alloc), dcg.begin(), dcg.end()); -#endif - } -}; - -/*! 
\brief Mean Average Precision at N, for both classification and rank */ -struct EvalMAPGpu { - public: - static double EvalMetric(const dh::SegmentSorter &pred_sorter, - const float *dlabels, - const EvalRankConfig &ecfg) { - // Group info on device - const auto &dgroups = pred_sorter.GetGroupsSpan(); - const auto ngroups = pred_sorter.GetNumGroups(); - const auto &dgroup_idx = pred_sorter.GetGroupSegmentsSpan(); - - // Original positions of the predictions after they have been sorted - const auto &dpreds_orig_pos = pred_sorter.GetOriginalPositionsSpan(); - - // First, determine non zero labels in the dataset individually - const auto nitems = pred_sorter.GetNumItems(); - dh::caching_device_vector hits(nitems, 0); - auto DetermineNonTrivialLabelLambda = [=] __device__(uint32_t idx) { - return (static_cast(dlabels[dpreds_orig_pos[idx]]) != 0) ? 1 : 0; - }; // NOLINT - - thrust::transform(thrust::make_counting_iterator(static_cast(0)), - thrust::make_counting_iterator(nitems), - hits.begin(), - DetermineNonTrivialLabelLambda); - - // Allocator to be used by sort for managing space overhead while performing prefix scans - dh::XGBCachingDeviceAllocator alloc; - - // Next, prefix scan the nontrivial labels that are segmented to accumulate them. - // This is required for computing the metric sum - // Data segmented into different groups... -#if defined(XGBOOST_USE_CUDA) - thrust::inclusive_scan_by_key(thrust::cuda::par(alloc), - dh::tcbegin(dgroup_idx), dh::tcend(dgroup_idx), - hits.begin(), // Input value - hits.begin()); // In-place scan -#elif defined(XGBOOST_USE_HIP) - thrust::inclusive_scan_by_key(thrust::hip::par(alloc), - dh::tcbegin(dgroup_idx), dh::tcend(dgroup_idx), - hits.begin(), // Input value - hits.begin()); // In-place scan -#endif - - // Find each group's metric sum - dh::caching_device_vector sumap(ngroups, 0); - auto *dsumap = sumap.data().get(); - const auto *dhits = hits.data().get(); - - int device_id = -1; - -#if defined(XGBOOST_USE_CUDA) - dh::safe_cuda(cudaGetDevice(&device_id)); -#elif defined(XGBOOST_USE_HIP) - dh::safe_cuda(hipGetDevice(&device_id)); -#endif - - // For each group item compute the aggregated precision - dh::LaunchN(nitems, nullptr, [=] __device__(uint32_t idx) { - if (DetermineNonTrivialLabelLambda(idx)) { - const auto group_idx = dgroup_idx[idx]; - const auto group_begin = dgroups[group_idx]; - const auto ridx = idx - group_begin; - if (ridx < ecfg.topn) { - atomicAdd(&dsumap[group_idx], - static_cast(dhits[idx]) / (ridx + 1)); - } - } - }); - - // Aggregate the group's item precisions - dh::LaunchN(ngroups, nullptr, [=] __device__(uint32_t gidx) { - auto nhits = dgroups[gidx + 1] ? 
dhits[dgroups[gidx + 1] - 1] : 0; - if (nhits != 0) { - dsumap[gidx] /= nhits; - } else { - if (ecfg.minus) { - dsumap[gidx] = 0; - } else { - dsumap[gidx] = 1; - } - } - }); - -#if defined(XGBOOST_USE_CUDA) - return thrust::reduce(thrust::cuda::par(alloc), sumap.begin(), sumap.end()); -#elif defined(XGBOOST_USE_HIP) - return thrust::reduce(thrust::hip::par(alloc), sumap.begin(), sumap.end()); -#endif - } -}; - XGBOOST_REGISTER_GPU_METRIC(PrecisionGpu, "pre") .describe("precision@k for rank computed on GPU.") .set_body([](const char* param) { return new EvalRankGpu("pre", param); }); -XGBOOST_REGISTER_GPU_METRIC(NDCGGpu, "ndcg") -.describe("ndcg@k for rank computed on GPU.") -.set_body([](const char* param) { return new EvalRankGpu("ndcg", param); }); +namespace cuda_impl { +PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info, + HostDeviceVector const &predt, bool minus, + std::shared_ptr p_cache) { + CHECK(p_cache); -XGBOOST_REGISTER_GPU_METRIC(MAPGpu, "map") -.describe("map@k for rank computed on GPU.") -.set_body([](const char* param) { return new EvalRankGpu("map", param); }); -} // namespace metric -} // namespace xgboost + auto const &p = p_cache->Param(); + auto d_weight = common::MakeOptionalWeights(ctx, info.weights_); + if (!d_weight.Empty()) { + CHECK_EQ(d_weight.weights.size(), p_cache->Groups()); + } + auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0); + predt.SetDevice(ctx->gpu_id); + auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size()); + + auto d_group_ptr = p_cache->DataGroupPtr(ctx); + + auto d_inv_idcg = p_cache->InvIDCG(ctx); + auto d_sorted_idx = p_cache->SortedIdx(ctx, d_predt.Values()); + auto d_out_dcg = p_cache->Dcg(ctx); + + ltr::cuda_impl::CalcQueriesDCG(ctx, d_label, d_sorted_idx, p.ndcg_exp_gain, d_group_ptr, p.TopK(), + d_out_dcg); + auto it = dh::MakeTransformIterator( + thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { + if (d_inv_idcg(i) <= 0.0) { + return PackedReduceResult{minus ? 
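// A group whose inverse ideal DCG is not positive (no relevant labels) carries
// no ranking signal: it contributes 0 under "ndcg-" and 1 otherwise, while its
// weight still enters the denominator used by Finalize.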
0.0 : 1.0, static_cast(d_weight[i])}; + } + return PackedReduceResult{d_out_dcg(i) * d_inv_idcg(i) * d_weight[i], + static_cast(d_weight[i])}; + }); + auto pair = thrust::reduce(ctx->CUDACtx()->CTP(), it, it + d_out_dcg.Size(), + PackedReduceResult{0.0, 0.0}); + return pair; +} + +PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info, + HostDeviceVector const &predt, bool minus, + std::shared_ptr p_cache) { + auto d_group_ptr = p_cache->DataGroupPtr(ctx); + auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0); + + predt.SetDevice(ctx->gpu_id); + auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan()); + auto key_it = dh::MakeTransformIterator( + thrust::make_counting_iterator(0ul), + [=] XGBOOST_DEVICE(std::size_t i) { return dh::SegmentId(d_group_ptr, i); }); + + auto get_label = [=] XGBOOST_DEVICE(std::size_t i) { + auto g = key_it[i]; + auto g_begin = d_group_ptr[g]; + auto g_end = d_group_ptr[g + 1]; + i -= g_begin; + auto g_label = d_label.Slice(linalg::Range(g_begin, g_end)); + auto g_rank = d_rank_idx.subspan(g_begin, g_end - g_begin); + return g_label(g_rank[i]); + }; + auto it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), get_label); + + auto cuctx = ctx->CUDACtx(); + auto n_rel = p_cache->NumRelevant(ctx); + thrust::inclusive_scan_by_key(cuctx->CTP(), key_it, key_it + d_label.Size(), it, n_rel.data()); + + double topk = p_cache->Param().TopK(); + auto map = p_cache->Map(ctx); + thrust::fill_n(cuctx->CTP(), map.data(), map.size(), 0.0); + { + auto val_it = dh::MakeTransformIterator( + thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { + auto g = key_it[i]; + auto g_begin = d_group_ptr[g]; + auto g_end = d_group_ptr[g + 1]; + i -= g_begin; + if (i >= topk) { + return 0.0; + } + + auto g_label = d_label.Slice(linalg::Range(g_begin, g_end)); + auto g_rank = d_rank_idx.subspan(g_begin, g_end - g_begin); + auto label = g_label(g_rank[i]); + + auto g_n_rel = n_rel.subspan(g_begin, g_end - g_begin); + auto nhits = g_n_rel[i]; + return nhits / static_cast(i + 1) * label; + }); + + std::size_t bytes; + cub::DeviceSegmentedReduce::Sum(nullptr, bytes, val_it, map.data(), p_cache->Groups(), + d_group_ptr.data(), d_group_ptr.data() + 1, cuctx->Stream()); + dh::TemporaryArray temp(bytes); + cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, val_it, map.data(), p_cache->Groups(), + d_group_ptr.data(), d_group_ptr.data() + 1, cuctx->Stream()); + } + + PackedReduceResult result{0.0, 0.0}; + { + auto d_weight = common::MakeOptionalWeights(ctx, info.weights_); + if (!d_weight.Empty()) { + CHECK_EQ(d_weight.weights.size(), p_cache->Groups()); + } + auto val_it = dh::MakeTransformIterator( + thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t g) { + auto g_begin = d_group_ptr[g]; + auto g_end = d_group_ptr[g + 1]; + auto g_n_rel = n_rel.subspan(g_begin, g_end - g_begin); + if (!g_n_rel.empty() && g_n_rel.back() > 0.0) { + return PackedReduceResult{map[g] * d_weight[g] / std::min(g_n_rel.back(), topk), + static_cast(d_weight[g])}; + } + return PackedReduceResult{minus ? 
0.0 : 1.0, static_cast<double>(d_weight[g])};
+        });
+    result =
+        thrust::reduce(cuctx->CTP(), val_it, val_it + map.size(), PackedReduceResult{0.0, 0.0});
+  }
+  return result;
+}
+}  // namespace cuda_impl
+}  // namespace xgboost::metric
diff --git a/src/metric/rank_metric.h b/src/metric/rank_metric.h
new file mode 100644
index 000000000..b3b121973
--- /dev/null
+++ b/src/metric/rank_metric.h
@@ -0,0 +1,44 @@
+#ifndef XGBOOST_METRIC_RANK_METRIC_H_
+#define XGBOOST_METRIC_RANK_METRIC_H_
+/**
+ * Copyright 2023 by XGBoost Contributors
+ */
+#include <memory>  // for shared_ptr
+
+#include "../common/common.h"            // for AssertGPUSupport
+#include "../common/ranking_utils.h"     // for NDCGCache, MAPCache
+#include "metric_common.h"               // for PackedReduceResult
+#include "xgboost/context.h"             // for Context
+#include "xgboost/data.h"                // for MetaInfo
+#include "xgboost/host_device_vector.h"  // for HostDeviceVector
+
+namespace xgboost {
+namespace metric {
+namespace cuda_impl {
+PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
+                             HostDeviceVector<float> const &predt, bool minus,
+                             std::shared_ptr<ltr::NDCGCache> p_cache);
+
+PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
+                            HostDeviceVector<float> const &predt, bool minus,
+                            std::shared_ptr<ltr::MAPCache> p_cache);
+
+#if !defined(XGBOOST_USE_CUDA)
+inline PackedReduceResult NDCGScore(Context const *, MetaInfo const &,
+                                    HostDeviceVector<float> const &, bool,
+                                    std::shared_ptr<ltr::NDCGCache>) {
+  common::AssertGPUSupport();
+  return {};
+}
+
+inline PackedReduceResult MAPScore(Context const *, MetaInfo const &,
+                                   HostDeviceVector<float> const &, bool,
+                                   std::shared_ptr<ltr::MAPCache>) {
+  common::AssertGPUSupport();
+  return {};
+}
+#endif
+}  // namespace cuda_impl
+}  // namespace metric
+}  // namespace xgboost
+#endif  // XGBOOST_METRIC_RANK_METRIC_H_
diff --git a/src/objective/init_estimation.cc b/src/objective/init_estimation.cc
index 96fd5d653..938ceb59d 100644
--- a/src/objective/init_estimation.cc
+++ b/src/objective/init_estimation.cc
@@ -33,7 +33,7 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* b
   new_obj->GetGradient(dummy_predt, info, 0, &gpair);
   bst_target_t n_targets = this->Targets(info);
   linalg::Vector<float> leaf_weight;
-  tree::FitStump(this->ctx_, gpair, n_targets, &leaf_weight);
+  tree::FitStump(this->ctx_, info, gpair, n_targets, &leaf_weight);
   // workaround, we don't support multi-target due to binary model serialization for
   // base margin.
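
For reference, the per-group quantity that the CPU implementation and the CUDA
kernels above both compute is standard NDCG@k, with the exponential gain
2^label - 1 (when ndcg_exp_gain is set) and the discount 1 / log2(rank + 2);
the minus flag, i.e. the trailing "-" in metric names such as "ndcg-", decides
whether a group without any relevant document scores 0 or 1. A minimal
standalone C++ sketch of that computation, with hypothetical names and no claim
to match the library implementation:

#include <algorithm>  // for min
#include <cmath>      // for exp2, log2
#include <cstddef>    // for size_t
#include <vector>     // for vector

// NDCG@k for one query group. `by_rank` holds relevance labels ordered by the
// model's predicted ranking; `ideal` holds the same labels sorted descending.
double NDCGAtK(std::vector<double> const& by_rank, std::vector<double> const& ideal,
               std::size_t k, bool minus) {
  auto gain = [](double y) { return std::exp2(y) - 1.0; };                // 2^label - 1
  auto discount = [](std::size_t i) { return 1.0 / std::log2(i + 2.0); };
  double dcg = 0.0, idcg = 0.0;
  for (std::size_t i = 0; i < std::min(k, by_rank.size()); ++i) {
    dcg += gain(by_rank[i]) * discount(i);
    idcg += gain(ideal[i]) * discount(i);
  }
  // Degenerate group without relevant documents: mirror the minus handling above.
  if (idcg <= 0.0) { return minus ? 0.0 : 1.0; }
  return dcg / idcg;
}
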
diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index 288dc5fb0..3d5dfbd67 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -1,52 +1,64 @@ /** * Copyright 2017-2023 by XGBoost Contributors */ -#include -#include +#include // for max, fill, min +#include // for any, any_cast +#include // for assert +#include // for size_t +#include // for uint32_t, int32_t, uint64_t +#include // for unique_ptr, shared_ptr +#include // for char_traits, operator<<, basic_ostream +#include // for type_info +#include // for vector -#include -#include -#include +#include "../collective/communicator-inl.h" // for Allreduce, IsDistributed +#include "../collective/communicator.h" // for Operation +#include "../common/bitfield.h" // for RBitField8 +#include "../common/categorical.h" // for IsCat, Decision +#include "../common/common.h" // for DivRoundUp +#include "../common/math.h" // for CheckNAN +#include "../common/threading_utils.h" // for ParallelFor +#include "../data/adapter.h" // for ArrayAdapter, CSRAdapter, CSRArrayAdapter +#include "../data/gradient_index.h" // for GHistIndexMatrix +#include "../data/proxy_dmatrix.h" // for DMatrixProxy +#include "../gbm/gbtree_model.h" // for GBTreeModel, GBTreeModelParam +#include "cpu_treeshap.h" // for CalculateContributions +#include "dmlc/registry.h" // for DMLC_REGISTRY_FILE_TAG +#include "predict_fn.h" // for GetNextNode, GetNextNodeMulti +#include "xgboost/base.h" // for bst_float, bst_node_t, bst_omp_uint, bst_fe... +#include "xgboost/context.h" // for Context +#include "xgboost/data.h" // for Entry, DMatrix, MetaInfo, SparsePage, Batch... +#include "xgboost/host_device_vector.h" // for HostDeviceVector +#include "xgboost/learner.h" // for LearnerModelParam +#include "xgboost/linalg.h" // for TensorView, All, VectorView, Tensor +#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_NE +#include "xgboost/multi_target_tree_model.h" // for MultiTargetTree +#include "xgboost/predictor.h" // for PredictionCacheEntry, Predictor, PredictorReg +#include "xgboost/span.h" // for Span +#include "xgboost/tree_model.h" // for RegTree, MTNotImplemented, RTreeNodeStat -#include "../collective/communicator-inl.h" -#include "../common/categorical.h" -#include "../common/math.h" -#include "../common/threading_utils.h" -#include "../data/adapter.h" -#include "../data/gradient_index.h" -#include "../gbm/gbtree_model.h" -#include "cpu_treeshap.h" // CalculateContributions -#include "predict_fn.h" -#include "xgboost/base.h" -#include "xgboost/data.h" -#include "xgboost/host_device_vector.h" -#include "xgboost/logging.h" -#include "xgboost/predictor.h" -#include "xgboost/tree_model.h" - -namespace xgboost { -namespace predictor { +namespace xgboost::predictor { DMLC_REGISTRY_FILE_TAG(cpu_predictor); +namespace scalar { template bst_node_t GetLeafIndex(RegTree const &tree, const RegTree::FVec &feat, - RegTree::CategoricalSplitMatrix const& cats) { - bst_node_t nid = 0; - while (!tree[nid].IsLeaf()) { - unsigned split_index = tree[nid].SplitIndex(); + RegTree::CategoricalSplitMatrix const &cats) { + bst_node_t nidx{0}; + while (!tree[nidx].IsLeaf()) { + bst_feature_t split_index = tree[nidx].SplitIndex(); auto fvalue = feat.GetFvalue(split_index); - nid = GetNextNode( - tree[nid], nid, fvalue, has_missing && feat.IsMissing(split_index), cats); + nidx = GetNextNode( + tree[nidx], nidx, fvalue, has_missing && feat.IsMissing(split_index), cats); } - return nid; + return nidx; } bst_float PredValue(const 
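// (GetLeafIndex above walks from the root: missing values take the node's
//  default child, categorical splits test set membership through the
//  CategoricalSplitMatrix, and numerical splits compare fvalue against the
//  split condition; PredValue below sums the reached leaf values over the
//  requested tree range.)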
SparsePage::Inst &inst, const std::vector> &trees, - const std::vector &tree_info, int bst_group, - RegTree::FVec *p_feats, unsigned tree_begin, - unsigned tree_end) { + const std::vector &tree_info, std::int32_t bst_group, + RegTree::FVec *p_feats, std::uint32_t tree_begin, std::uint32_t tree_end) { bst_float psum = 0.0f; p_feats->Fill(inst); for (size_t i = tree_begin; i < tree_end; ++i) { @@ -68,36 +80,80 @@ bst_float PredValue(const SparsePage::Inst &inst, } template -bst_float -PredValueByOneTree(const RegTree::FVec &p_feats, RegTree const &tree, - RegTree::CategoricalSplitMatrix const& cats) { - const bst_node_t leaf = p_feats.HasMissing() ? - GetLeafIndex(tree, p_feats, cats) : - GetLeafIndex(tree, p_feats, cats); +bst_float PredValueByOneTree(const RegTree::FVec &p_feats, RegTree const &tree, + RegTree::CategoricalSplitMatrix const &cats) { + const bst_node_t leaf = p_feats.HasMissing() + ? GetLeafIndex(tree, p_feats, cats) + : GetLeafIndex(tree, p_feats, cats); return tree[leaf].LeafValue(); } +} // namespace scalar -void PredictByAllTrees(gbm::GBTreeModel const &model, const size_t tree_begin, - const size_t tree_end, std::vector *out_preds, - const size_t predict_offset, const size_t num_group, - const std::vector &thread_temp, - const size_t offset, const size_t block_size) { - std::vector &preds = *out_preds; - for (size_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) { - const size_t gid = model.tree_info[tree_id]; - auto const &tree = *model.trees[tree_id]; - auto const& cats = tree.GetCategoriesMatrix(); - auto has_categorical = tree.HasCategoricalSplit(); +namespace multi { +template +bst_node_t GetLeafIndex(MultiTargetTree const &tree, const RegTree::FVec &feat, + RegTree::CategoricalSplitMatrix const &cats) { + bst_node_t nidx{0}; + while (!tree.IsLeaf(nidx)) { + unsigned split_index = tree.SplitIndex(nidx); + auto fvalue = feat.GetFvalue(split_index); + nidx = GetNextNodeMulti( + tree, nidx, fvalue, has_missing && feat.IsMissing(split_index), cats); + } + return nidx; +} - if (has_categorical) { - for (size_t i = 0; i < block_size; ++i) { - preds[(predict_offset + i) * num_group + gid] += - PredValueByOneTree(thread_temp[offset + i], tree, cats); +template +void PredValueByOneTree(RegTree::FVec const &p_feats, MultiTargetTree const &tree, + RegTree::CategoricalSplitMatrix const &cats, + linalg::VectorView out_predt) { + bst_node_t const leaf = p_feats.HasMissing() + ? 
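// HasMissing() selects the template instantiation: the <false, ...> variant
// elides the per-feature IsMissing() check entirely for fully dense rows.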
GetLeafIndex(tree, p_feats, cats) + : GetLeafIndex(tree, p_feats, cats); + auto leaf_value = tree.LeafValue(leaf); + assert(out_predt.Shape(0) == leaf_value.Shape(0) && "shape mismatch."); + for (size_t i = 0; i < leaf_value.Size(); ++i) { + out_predt(i) += leaf_value(i); + } +} +} // namespace multi + +namespace { +void PredictByAllTrees(gbm::GBTreeModel const &model, std::uint32_t const tree_begin, + std::uint32_t const tree_end, std::size_t const predict_offset, + std::vector const &thread_temp, std::size_t const offset, + std::size_t const block_size, linalg::MatrixView out_predt) { + for (std::uint32_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) { + auto const &tree = *model.trees.at(tree_id); + auto const &cats = tree.GetCategoriesMatrix(); + bool has_categorical = tree.HasCategoricalSplit(); + + if (tree.IsMultiTarget()) { + if (has_categorical) { + for (std::size_t i = 0; i < block_size; ++i) { + auto t_predts = out_predt.Slice(predict_offset + i, linalg::All()); + multi::PredValueByOneTree(thread_temp[offset + i], *tree.GetMultiTargetTree(), cats, + t_predts); + } + } else { + for (std::size_t i = 0; i < block_size; ++i) { + auto t_predts = out_predt.Slice(predict_offset + i, linalg::All()); + multi::PredValueByOneTree(thread_temp[offset + i], *tree.GetMultiTargetTree(), + cats, t_predts); + } } } else { - for (size_t i = 0; i < block_size; ++i) { - preds[(predict_offset + i) * num_group + gid] += - PredValueByOneTree(thread_temp[offset + i], tree, cats); + auto const gid = model.tree_info[tree_id]; + if (has_categorical) { + for (std::size_t i = 0; i < block_size; ++i) { + out_predt(predict_offset + i, gid) += + scalar::PredValueByOneTree(thread_temp[offset + i], tree, cats); + } + } else { + for (std::size_t i = 0; i < block_size; ++i) { + out_predt(predict_offset + i, gid) += + scalar::PredValueByOneTree(thread_temp[offset + i], tree, cats); + } } } } @@ -105,7 +161,7 @@ void PredictByAllTrees(gbm::GBTreeModel const &model, const size_t tree_begin, template void FVecFill(const size_t block_size, const size_t batch_offset, const int num_feature, - DataView* batch, const size_t fvec_offset, std::vector* p_feats) { + DataView *batch, const size_t fvec_offset, std::vector *p_feats) { for (size_t i = 0; i < block_size; ++i) { RegTree::FVec &feats = (*p_feats)[fvec_offset + i]; if (feats.Size() == 0) { @@ -117,8 +173,8 @@ void FVecFill(const size_t block_size, const size_t batch_offset, const int num_ } template -void FVecDrop(const size_t block_size, const size_t batch_offset, DataView* batch, - const size_t fvec_offset, std::vector* p_feats) { +void FVecDrop(const size_t block_size, const size_t batch_offset, DataView *batch, + const size_t fvec_offset, std::vector *p_feats) { for (size_t i = 0; i < block_size; ++i) { RegTree::FVec &feats = (*p_feats)[fvec_offset + i]; const SparsePage::Inst inst = (*batch)[batch_offset + i]; @@ -126,9 +182,7 @@ void FVecDrop(const size_t block_size, const size_t batch_offset, DataView* batc } } -namespace { -static size_t constexpr kUnroll = 8; -} // anonymous namespace +static std::size_t constexpr kUnroll = 8; struct SparsePageView { bst_row_t base_rowid; @@ -227,15 +281,13 @@ class AdapterView { }; template -void PredictBatchByBlockOfRowsKernel( - DataView batch, std::vector *out_preds, - gbm::GBTreeModel const &model, int32_t tree_begin, int32_t tree_end, - std::vector *p_thread_temp, int32_t n_threads) { +void PredictBatchByBlockOfRowsKernel(DataView batch, gbm::GBTreeModel const &model, + std::uint32_t tree_begin, std::uint32_t 
tree_end, + std::vector *p_thread_temp, int32_t n_threads, + linalg::TensorView out_predt) { auto &thread_temp = *p_thread_temp; - int32_t const num_group = model.learner_model_param->num_output_group; - CHECK_EQ(model.param.size_leaf_vector, 0) - << "size_leaf_vector is enforced to 0 so far"; + CHECK_EQ(model.param.size_leaf_vector, 0) << "size_leaf_vector is enforced to 0 so far"; // parallel over local batch const auto nsize = static_cast(batch.Size()); const int num_feature = model.learner_model_param->num_feature; @@ -243,16 +295,13 @@ void PredictBatchByBlockOfRowsKernel( common::ParallelFor(n_blocks, n_threads, [&](bst_omp_uint block_id) { const size_t batch_offset = block_id * block_of_rows_size; - const size_t block_size = - std::min(nsize - batch_offset, block_of_rows_size); + const size_t block_size = std::min(nsize - batch_offset, block_of_rows_size); const size_t fvec_offset = omp_get_thread_num() * block_of_rows_size; - FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, - p_thread_temp); + FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, p_thread_temp); // process block of rows through all trees to keep cache locality - PredictByAllTrees(model, tree_begin, tree_end, out_preds, - batch_offset + batch.base_rowid, num_group, thread_temp, - fvec_offset, block_size); + PredictByAllTrees(model, tree_begin, tree_end, batch_offset + batch.base_rowid, thread_temp, + fvec_offset, block_size, out_predt); FVecDrop(block_size, batch_offset, &batch, fvec_offset, p_thread_temp); }); } @@ -275,7 +324,7 @@ float FillNodeMeanValues(RegTree const *tree, bst_node_t nidx, std::vector* mean_values) { - size_t num_nodes = tree->param.num_nodes; + size_t num_nodes = tree->NumNodes(); if (mean_values->size() == num_nodes) { return; } @@ -283,7 +332,6 @@ void FillNodeMeanValues(RegTree const* tree, std::vector* mean_values) { FillNodeMeanValues(tree, 0, mean_values); } -namespace { // init thread buffers static void InitThreadTemp(int nthread, std::vector *out) { int prev_thread_temp_size = out->size(); @@ -557,33 +605,6 @@ class ColumnSplitHelper { class CPUPredictor : public Predictor { protected: - void PredictGHistIndex(DMatrix *p_fmat, gbm::GBTreeModel const &model, int32_t tree_begin, - int32_t tree_end, std::vector *out_preds) const { - auto const n_threads = this->ctx_->Threads(); - - constexpr double kDensityThresh = .5; - size_t total = - std::max(p_fmat->Info().num_row_ * p_fmat->Info().num_col_, static_cast(1)); - double density = static_cast(p_fmat->Info().num_nonzero_) / static_cast(total); - bool blocked = density > kDensityThresh; - - std::vector feat_vecs; - InitThreadTemp(n_threads * (blocked ? 
kBlockOfRowsSize : 1), &feat_vecs); - std::vector workspace(p_fmat->Info().num_col_ * kUnroll * n_threads); - auto ft = p_fmat->Info().feature_types.ConstHostVector(); - for (auto const &batch : p_fmat->GetBatches({})) { - if (blocked) { - PredictBatchByBlockOfRowsKernel( - GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads}, - out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads); - } else { - PredictBatchByBlockOfRowsKernel( - GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads}, - out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads); - } - } - } - void PredictDMatrix(DMatrix *p_fmat, std::vector *out_preds, gbm::GBTreeModel const &model, int32_t tree_begin, int32_t tree_end) const { if (p_fmat->IsColumnSplit()) { @@ -592,11 +613,6 @@ class CPUPredictor : public Predictor { return; } - if (!p_fmat->PageExists()) { - this->PredictGHistIndex(p_fmat, model, tree_begin, tree_end, out_preds); - return; - } - auto const n_threads = this->ctx_->Threads(); constexpr double kDensityThresh = .5; size_t total = @@ -606,16 +622,38 @@ class CPUPredictor : public Predictor { std::vector feat_vecs; InitThreadTemp(n_threads * (blocked ? kBlockOfRowsSize : 1), &feat_vecs); - for (auto const &batch : p_fmat->GetBatches()) { - CHECK_EQ(out_preds->size(), - p_fmat->Info().num_row_ * model.learner_model_param->num_output_group); - if (blocked) { - PredictBatchByBlockOfRowsKernel( - SparsePageView{&batch}, out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads); - } else { - PredictBatchByBlockOfRowsKernel( - SparsePageView{&batch}, out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads); + std::size_t n_samples = p_fmat->Info().num_row_; + std::size_t n_groups = model.learner_model_param->OutputLength(); + CHECK_EQ(out_preds->size(), n_samples * n_groups); + linalg::TensorView out_predt{*out_preds, {n_samples, n_groups}, ctx_->gpu_id}; + + if (!p_fmat->PageExists()) { + std::vector workspace(p_fmat->Info().num_col_ * kUnroll * n_threads); + auto ft = p_fmat->Info().feature_types.ConstHostVector(); + for (auto const &batch : p_fmat->GetBatches({})) { + if (blocked) { + PredictBatchByBlockOfRowsKernel( + GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads}, model, + tree_begin, tree_end, &feat_vecs, n_threads, out_predt); + } else { + PredictBatchByBlockOfRowsKernel( + GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads}, model, + tree_begin, tree_end, &feat_vecs, n_threads, out_predt); + } + } + } else { + for (auto const &batch : p_fmat->GetBatches()) { + if (blocked) { + PredictBatchByBlockOfRowsKernel( + SparsePageView{&batch}, model, tree_begin, tree_end, &feat_vecs, n_threads, + out_predt); + + } else { + PredictBatchByBlockOfRowsKernel(SparsePageView{&batch}, model, + tree_begin, tree_end, &feat_vecs, + n_threads, out_predt); + } } } } @@ -623,26 +661,24 @@ class CPUPredictor : public Predictor { public: explicit CPUPredictor(Context const *ctx) : Predictor::Predictor{ctx} {} - void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts, - const gbm::GBTreeModel &model, uint32_t tree_begin, - uint32_t tree_end = 0) const override { - auto* out_preds = &predts->predictions; + void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts, const gbm::GBTreeModel &model, + uint32_t tree_begin, uint32_t tree_end = 0) const override { + auto *out_preds = &predts->predictions; // This is actually already handled in gbm, but large amount of tests rely on the // behaviour. 
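// (A tree_end of zero means "use every tree in the model"; normalizing it here
//  keeps direct callers of the predictor working without going through gbm,
//  which applies the same adjustment.)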
if (tree_end == 0) { tree_end = model.trees.size(); } - this->PredictDMatrix(dmat, &out_preds->HostVector(), model, tree_begin, - tree_end); + this->PredictDMatrix(dmat, &out_preds->HostVector(), model, tree_begin, tree_end); } template - void DispatchedInplacePredict(dmlc::any const &x, std::shared_ptr p_m, + void DispatchedInplacePredict(std::any const &x, std::shared_ptr p_m, const gbm::GBTreeModel &model, float missing, - PredictionCacheEntry *out_preds, - uint32_t tree_begin, uint32_t tree_end) const { + PredictionCacheEntry *out_preds, uint32_t tree_begin, + uint32_t tree_end) const { auto const n_threads = this->ctx_->Threads(); - auto m = dmlc::get>(x); + auto m = std::any_cast>(x); CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature) << "Number of columns in data must equal to trained model."; if (p_m) { @@ -653,13 +689,16 @@ class CPUPredictor : public Predictor { info.num_row_ = m->NumRows(); this->InitOutPredictions(info, &(out_preds->predictions), model); } + std::vector workspace(m->NumColumns() * kUnroll * n_threads); auto &predictions = out_preds->predictions.HostVector(); std::vector thread_temp; InitThreadTemp(n_threads * kBlockSize, &thread_temp); + std::size_t n_groups = model.learner_model_param->OutputLength(); + linalg::TensorView out_predt{predictions, {m->NumRows(), n_groups}, Context::kCpuId}; PredictBatchByBlockOfRowsKernel, kBlockSize>( - AdapterView(m.get(), missing, common::Span{workspace}, n_threads), - &predictions, model, tree_begin, tree_end, &thread_temp, n_threads); + AdapterView(m.get(), missing, common::Span{workspace}, n_threads), model, + tree_begin, tree_end, &thread_temp, n_threads, out_predt); } bool InplacePredict(std::shared_ptr p_m, const gbm::GBTreeModel &model, float missing, @@ -689,6 +728,7 @@ class CPUPredictor : public Predictor { void PredictInstance(const SparsePage::Inst& inst, std::vector* out_preds, const gbm::GBTreeModel& model, unsigned ntree_limit) const override { + CHECK(!model.learner_model_param->IsVectorLeaf()) << "predict instance" << MTNotImplemented(); std::vector feat_vecs; feat_vecs.resize(1, RegTree::FVec()); feat_vecs[0].Init(model.learner_model_param->num_feature); @@ -701,31 +741,30 @@ class CPUPredictor : public Predictor { auto base_score = model.learner_model_param->BaseScore(ctx_)(0); // loop over output groups for (uint32_t gid = 0; gid < model.learner_model_param->num_output_group; ++gid) { - (*out_preds)[gid] = - PredValue(inst, model.trees, model.tree_info, gid, &feat_vecs[0], 0, ntree_limit) + - base_score; + (*out_preds)[gid] = scalar::PredValue(inst, model.trees, model.tree_info, gid, &feat_vecs[0], + 0, ntree_limit) + + base_score; } } - void PredictLeaf(DMatrix* p_fmat, HostDeviceVector* out_preds, - const gbm::GBTreeModel& model, unsigned ntree_limit) const override { + void PredictLeaf(DMatrix *p_fmat, HostDeviceVector *out_preds, + const gbm::GBTreeModel &model, unsigned ntree_limit) const override { auto const n_threads = this->ctx_->Threads(); std::vector feat_vecs; const int num_feature = model.learner_model_param->num_feature; InitThreadTemp(n_threads, &feat_vecs); - const MetaInfo& info = p_fmat->Info(); + const MetaInfo &info = p_fmat->Info(); // number of valid trees if (ntree_limit == 0 || ntree_limit > model.trees.size()) { ntree_limit = static_cast(model.trees.size()); } - std::vector& preds = out_preds->HostVector(); + std::vector &preds = out_preds->HostVector(); preds.resize(info.num_row_ * ntree_limit); // start collecting the prediction for (const auto &batch : 
p_fmat->GetBatches()) { // parallel over local batch auto page = batch.GetView(); - const auto nsize = static_cast(batch.Size()); - common::ParallelFor(nsize, n_threads, [&](bst_omp_uint i) { + common::ParallelFor(page.Size(), n_threads, [&](auto i) { const int tid = omp_get_thread_num(); auto ridx = static_cast(batch.base_rowid + i); RegTree::FVec &feats = feat_vecs[tid]; @@ -733,23 +772,28 @@ class CPUPredictor : public Predictor { feats.Init(num_feature); } feats.Fill(page[i]); - for (unsigned j = 0; j < ntree_limit; ++j) { - auto const& tree = *model.trees[j]; - auto const& cats = tree.GetCategoriesMatrix(); - bst_node_t tid = GetLeafIndex(tree, feats, cats); - preds[ridx * ntree_limit + j] = static_cast(tid); + for (std::uint32_t j = 0; j < ntree_limit; ++j) { + auto const &tree = *model.trees[j]; + auto const &cats = tree.GetCategoriesMatrix(); + bst_node_t nidx; + if (tree.IsMultiTarget()) { + nidx = multi::GetLeafIndex(*tree.GetMultiTargetTree(), feats, cats); + } else { + nidx = scalar::GetLeafIndex(tree, feats, cats); + } + preds[ridx * ntree_limit + j] = static_cast(nidx); } feats.Drop(page[i]); }); } } - void PredictContribution(DMatrix *p_fmat, - HostDeviceVector *out_contribs, + void PredictContribution(DMatrix *p_fmat, HostDeviceVector *out_contribs, const gbm::GBTreeModel &model, uint32_t ntree_limit, - std::vector const *tree_weights, - bool approximate, int condition, - unsigned condition_feature) const override { + std::vector const *tree_weights, bool approximate, + int condition, unsigned condition_feature) const override { + CHECK(!model.learner_model_param->IsVectorLeaf()) + << "Predict contribution" << MTNotImplemented(); auto const n_threads = this->ctx_->Threads(); const int num_feature = model.learner_model_param->num_feature; std::vector feat_vecs; @@ -825,11 +869,12 @@ class CPUPredictor : public Predictor { } } - void PredictInteractionContributions( - DMatrix *p_fmat, HostDeviceVector *out_contribs, - const gbm::GBTreeModel &model, unsigned ntree_limit, - std::vector const *tree_weights, - bool approximate) const override { + void PredictInteractionContributions(DMatrix *p_fmat, HostDeviceVector *out_contribs, + const gbm::GBTreeModel &model, unsigned ntree_limit, + std::vector const *tree_weights, + bool approximate) const override { + CHECK(!model.learner_model_param->IsVectorLeaf()) + << "Predict interaction contribution" << MTNotImplemented(); const MetaInfo& info = p_fmat->Info(); const int ngroup = model.learner_model_param->num_output_group; size_t const ncolumns = model.learner_model_param->num_feature; @@ -884,5 +929,4 @@ class CPUPredictor : public Predictor { XGBOOST_REGISTER_PREDICTOR(CPUPredictor, "cpu_predictor") .describe("Make predictions using CPU.") .set_body([](Context const *ctx) { return new CPUPredictor(ctx); }); -} // namespace predictor -} // namespace xgboost +} // namespace xgboost::predictor diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 46c342040..0ab587693 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -9,6 +9,7 @@ #include #include +#include // for any, any_cast #include #include "../common/bitfield.h" @@ -431,7 +432,7 @@ class DeviceModel { this->tree_beg_ = tree_begin; this->tree_end_ = tree_end; - this->num_group = model.learner_model_param->num_output_group; + this->num_group = model.learner_model_param->OutputLength(); } }; @@ -792,13 +793,13 @@ class GPUPredictor : public xgboost::Predictor { } template - void DispatchedInplacePredict(dmlc::any const 
&x, std::shared_ptr p_m, - const gbm::GBTreeModel &model, float missing, - PredictionCacheEntry *out_preds, - uint32_t tree_begin, uint32_t tree_end) const { + void DispatchedInplacePredict(std::any const& x, std::shared_ptr p_m, + const gbm::GBTreeModel& model, float missing, + PredictionCacheEntry* out_preds, uint32_t tree_begin, + uint32_t tree_end) const { uint32_t const output_groups = model.learner_model_param->num_output_group; - auto m = dmlc::get>(x); + auto m = std::any_cast>(x); CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature) << "Number of columns in data must equal to trained model."; CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx()) diff --git a/src/predictor/predict_fn.h b/src/predictor/predict_fn.h index 5d0c175fc..dbaf4a75e 100644 --- a/src/predictor/predict_fn.h +++ b/src/predictor/predict_fn.h @@ -1,13 +1,12 @@ -/*! - * Copyright 2021 by XGBoost Contributors +/** + * Copyright 2021-2023 by XGBoost Contributors */ #ifndef XGBOOST_PREDICTOR_PREDICT_FN_H_ #define XGBOOST_PREDICTOR_PREDICT_FN_H_ #include "../common/categorical.h" #include "xgboost/tree_model.h" -namespace xgboost { -namespace predictor { +namespace xgboost::predictor { template inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bst_node_t nid, float fvalue, bool is_missing, @@ -24,6 +23,25 @@ inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bs } } } -} // namespace predictor -} // namespace xgboost + +template +inline XGBOOST_DEVICE bst_node_t GetNextNodeMulti(MultiTargetTree const &tree, + bst_node_t const nidx, float fvalue, + bool is_missing, + RegTree::CategoricalSplitMatrix const &cats) { + if (has_missing && is_missing) { + return tree.DefaultChild(nidx); + } else { + if (has_categorical && common::IsCat(cats.split_type, nidx)) { + auto node_categories = + cats.categories.subspan(cats.node_ptr[nidx].beg, cats.node_ptr[nidx].size); + return common::Decision(node_categories, fvalue) ? tree.LeftChild(nidx) + : tree.RightChild(nidx); + } else { + return tree.LeftChild(nidx) + !(fvalue < tree.SplitCond(nidx)); + } + } +} + +} // namespace xgboost::predictor #endif // XGBOOST_PREDICTOR_PREDICT_FN_H_ diff --git a/src/tree/common_row_partitioner.h b/src/tree/common_row_partitioner.h index 3a46a168a..ba69d8921 100644 --- a/src/tree/common_row_partitioner.h +++ b/src/tree/common_row_partitioner.h @@ -1,22 +1,26 @@ -/*! - * Copyright 2021-2022 XGBoost contributors +/** + * Copyright 2021-2023 XGBoost contributors * \file common_row_partitioner.h * \brief Common partitioner logic for hist and approx methods. 
*/ #ifndef XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_ #define XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_ +#include // std::all_of +#include // std::uint32_t #include // std::numeric_limits #include #include "../collective/communicator-inl.h" +#include "../common/linalg_op.h" // cbegin #include "../common/numeric.h" // Iota #include "../common/partition_builder.h" #include "hist/expand_entry.h" // CPUExpandEntry +#include "xgboost/base.h" #include "xgboost/context.h" // Context +#include "xgboost/linalg.h" // TensorView -namespace xgboost { -namespace tree { +namespace xgboost::tree { static constexpr size_t kPartitionBlockSize = 2048; @@ -34,9 +38,10 @@ class ColumnSplitHelper { missing_bits_ = BitVector(common::Span(missing_storage_)); } + template void Partition(common::BlockedSpace2d const& space, std::int32_t n_threads, GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix, - std::vector const& nodes, RegTree const* p_tree) { + std::vector const& nodes, RegTree const* p_tree) { // When data is split by column, we don't have all the feature values in the local worker, so // we first collect all the decisions and whether the feature is missing into bit vectors. std::fill(decision_storage_.begin(), decision_storage_.end(), 0); @@ -97,41 +102,47 @@ class CommonRowPartitioner { } } - void FindSplitConditions(const std::vector& nodes, const RegTree& tree, + template + void FindSplitConditions(const std::vector& nodes, const RegTree& tree, const GHistIndexMatrix& gmat, std::vector* split_conditions) { - for (size_t i = 0; i < nodes.size(); ++i) { - const int32_t nid = nodes[i].nid; - const bst_uint fid = tree[nid].SplitIndex(); - const bst_float split_pt = tree[nid].SplitCond(); - const uint32_t lower_bound = gmat.cut.Ptrs()[fid]; - const uint32_t upper_bound = gmat.cut.Ptrs()[fid + 1]; + auto const& ptrs = gmat.cut.Ptrs(); + auto const& vals = gmat.cut.Values(); + + for (std::size_t i = 0; i < nodes.size(); ++i) { + bst_node_t const nidx = nodes[i].nid; + bst_feature_t const fidx = tree.SplitIndex(nidx); + float const split_pt = tree.SplitCond(nidx); + std::uint32_t const lower_bound = ptrs[fidx]; + std::uint32_t const upper_bound = ptrs[fidx + 1]; bst_bin_t split_cond = -1; // convert floating-point split_pt into corresponding bin_id // split_cond = -1 indicates that split_pt is less than all known cut points CHECK_LT(upper_bound, static_cast(std::numeric_limits::max())); for (auto bound = lower_bound; bound < upper_bound; ++bound) { - if (split_pt == gmat.cut.Values()[bound]) { - split_cond = static_cast(bound); + if (split_pt == vals[bound]) { + split_cond = static_cast(bound); } } - (*split_conditions).at(i) = split_cond; + (*split_conditions)[i] = split_cond; } } - void AddSplitsToRowSet(const std::vector& nodes, RegTree const* p_tree) { + template + void AddSplitsToRowSet(const std::vector& nodes, RegTree const* p_tree) { const size_t n_nodes = nodes.size(); for (unsigned int i = 0; i < n_nodes; ++i) { - const int32_t nid = nodes[i].nid; + const int32_t nidx = nodes[i].nid; const size_t n_left = partition_builder_.GetNLeftElems(i); const size_t n_right = partition_builder_.GetNRightElems(i); - CHECK_EQ((*p_tree)[nid].LeftChild() + 1, (*p_tree)[nid].RightChild()); - row_set_collection_.AddSplit(nid, (*p_tree)[nid].LeftChild(), (*p_tree)[nid].RightChild(), - n_left, n_right); + CHECK_EQ(p_tree->LeftChild(nidx) + 1, p_tree->RightChild(nidx)); + row_set_collection_.AddSplit(nidx, p_tree->LeftChild(nidx), p_tree->RightChild(nidx), n_left, + n_right); } } + template void 
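// Aside on FindSplitConditions above: cut.Values() is sorted within each
// feature's [ptrs[fidx], ptrs[fidx + 1]) range, so the linear probe for the
// bin matching split_pt could equally be a binary search; a hypothetical
// sketch, not the library code:
//
//   auto beg = vals.cbegin() + lower_bound, end = vals.cbegin() + upper_bound;
//   auto it = std::lower_bound(beg, end, split_pt);
//   if (it != end && *it == split_pt)
//     split_cond = static_cast<bst_bin_t>(std::distance(vals.cbegin(), it));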
UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat, - std::vector const& nodes, RegTree const* p_tree) { + std::vector const& nodes, RegTree const* p_tree) { auto const& column_matrix = gmat.Transpose(); if (column_matrix.IsInitialized()) { if (gmat.cut.HasCategorical()) { @@ -149,10 +160,10 @@ class CommonRowPartitioner { } } - template + template void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat, const common::ColumnMatrix& column_matrix, - std::vector const& nodes, RegTree const* p_tree) { + std::vector const& nodes, RegTree const* p_tree) { if (column_matrix.AnyMissing()) { this->template UpdatePosition(ctx, gmat, column_matrix, nodes, p_tree); } else { @@ -160,33 +171,21 @@ class CommonRowPartitioner { } } - template + template void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat, const common::ColumnMatrix& column_matrix, - std::vector const& nodes, RegTree const* p_tree) { - switch (column_matrix.GetTypeSize()) { - case common::kUint8BinsTypeSize: - this->template UpdatePosition(ctx, gmat, column_matrix, - nodes, p_tree); - break; - case common::kUint16BinsTypeSize: - this->template UpdatePosition(ctx, gmat, column_matrix, - nodes, p_tree); - break; - case common::kUint32BinsTypeSize: - this->template UpdatePosition(ctx, gmat, column_matrix, - nodes, p_tree); - break; - default: - // no default behavior - CHECK(false) << column_matrix.GetTypeSize(); - } + std::vector const& nodes, RegTree const* p_tree) { + common::DispatchBinType(column_matrix.GetTypeSize(), [&](auto t) { + using T = decltype(t); + this->template UpdatePosition(ctx, gmat, column_matrix, nodes, + p_tree); + }); } - template + template void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat, const common::ColumnMatrix& column_matrix, - std::vector const& nodes, RegTree const* p_tree) { + std::vector const& nodes, RegTree const* p_tree) { // 1. 
Find split condition for each split size_t n_nodes = nodes.size(); @@ -248,9 +247,9 @@ class CommonRowPartitioner { AddSplitsToRowSet(nodes, p_tree); } - auto const& Partitions() const { return row_set_collection_; } + [[nodiscard]] auto const& Partitions() const { return row_set_collection_; } - size_t Size() const { + [[nodiscard]] std::size_t Size() const { return std::distance(row_set_collection_.begin(), row_set_collection_.end()); } @@ -263,12 +262,29 @@ class CommonRowPartitioner { [&](size_t idx) -> bool { return hess[idx] - .0f == .0f; }); } + void LeafPartition(Context const* ctx, RegTree const& tree, + linalg::TensorView gpair, + std::vector* p_out_position) const { + if (gpair.Shape(1) > 1) { + partition_builder_.LeafPartition( + ctx, tree, this->Partitions(), p_out_position, [&](std::size_t idx) -> bool { + auto sample = gpair.Slice(idx, linalg::All()); + return std::all_of(linalg::cbegin(sample), linalg::cend(sample), + [](GradientPair const& g) { return g.GetHess() - .0f == .0f; }); + }); + } else { + auto s = gpair.Slice(linalg::All(), 0); + partition_builder_.LeafPartition( + ctx, tree, this->Partitions(), p_out_position, + [&](std::size_t idx) -> bool { return s(idx).GetHess() - .0f == .0f; }); + } + } void LeafPartition(Context const* ctx, RegTree const& tree, common::Span gpair, std::vector* p_out_position) const { partition_builder_.LeafPartition( ctx, tree, this->Partitions(), p_out_position, - [&](size_t idx) -> bool { return gpair[idx].GetHess() - .0f == .0f; }); + [&](std::size_t idx) -> bool { return gpair[idx].GetHess() - .0f == .0f; }); } private: @@ -278,6 +294,5 @@ class CommonRowPartitioner { ColumnSplitHelper column_split_helper_; }; -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree #endif // XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_ diff --git a/src/tree/driver.h b/src/tree/driver.h index a4a0dd4a6..c3189a70c 100644 --- a/src/tree/driver.h +++ b/src/tree/driver.h @@ -1,111 +1,111 @@ -/*! - * Copyright 2021 by XGBoost Contributors - */ -#ifndef XGBOOST_TREE_DRIVER_H_ -#define XGBOOST_TREE_DRIVER_H_ -#include -#include -#include -#include "./param.h" - -namespace xgboost { -namespace tree { - -template -inline bool DepthWise(const ExpandEntryT& lhs, const ExpandEntryT& rhs) { - return lhs.GetNodeId() > rhs.GetNodeId(); // favor small depth -} - -template -inline bool LossGuide(const ExpandEntryT& lhs, const ExpandEntryT& rhs) { - if (lhs.GetLossChange() == rhs.GetLossChange()) { - return lhs.GetNodeId() > rhs.GetNodeId(); // favor small timestamp - } else { - return lhs.GetLossChange() < rhs.GetLossChange(); // favor large loss_chg - } -} - -// Drives execution of tree building on device -template -class Driver { - using ExpandQueue = - std::priority_queue, - std::function>; - - public: - explicit Driver(TrainParam param, std::size_t max_node_batch_size = 256) - : param_(param), - max_node_batch_size_(max_node_batch_size), - queue_(param.grow_policy == TrainParam::kDepthWise ? DepthWise - : LossGuide) {} - template - void Push(EntryIterT begin, EntryIterT end) { - for (auto it = begin; it != end; ++it) { - const ExpandEntryT& e = *it; - if (e.split.loss_chg > kRtEps) { - queue_.push(e); - } - } - } - void Push(const std::vector &entries) { - this->Push(entries.begin(), entries.end()); - } - void Push(ExpandEntryT const& e) { queue_.push(e); } - - bool IsEmpty() { - return queue_.empty(); - } - - // Can a child of this entry still be expanded? 
- // can be used to avoid extra work - bool IsChildValid(ExpandEntryT const& parent_entry) { - if (param_.max_depth > 0 && parent_entry.depth + 1 >= param_.max_depth) return false; - if (param_.max_leaves > 0 && num_leaves_ >= param_.max_leaves) return false; - return true; - } - - // Return the set of nodes to be expanded - // This set has no dependencies between entries so they may be expanded in - // parallel or asynchronously - std::vector Pop() { - if (queue_.empty()) return {}; - // Return a single entry for loss guided mode - if (param_.grow_policy == TrainParam::kLossGuide) { - ExpandEntryT e = queue_.top(); - queue_.pop(); - - if (e.IsValid(param_, num_leaves_)) { - num_leaves_++; - return {e}; - } else { - return {}; - } - } - // Return nodes on same level for depth wise - std::vector result; - ExpandEntryT e = queue_.top(); - int level = e.depth; - while (e.depth == level && !queue_.empty() && result.size() < max_node_batch_size_) { - queue_.pop(); - if (e.IsValid(param_, num_leaves_)) { - num_leaves_++; - result.emplace_back(e); - } - - if (!queue_.empty()) { - e = queue_.top(); - } - } - return result; - } - - private: - TrainParam param_; - bst_node_t num_leaves_ = 1; - std::size_t max_node_batch_size_; - ExpandQueue queue_; -}; -} // namespace tree -} // namespace xgboost - -#endif // XGBOOST_TREE_DRIVER_H_ +/*! + * Copyright 2021 by XGBoost Contributors + */ +#ifndef XGBOOST_TREE_DRIVER_H_ +#define XGBOOST_TREE_DRIVER_H_ +#include +#include +#include +#include "./param.h" + +namespace xgboost { +namespace tree { + +template +inline bool DepthWise(const ExpandEntryT& lhs, const ExpandEntryT& rhs) { + return lhs.GetNodeId() > rhs.GetNodeId(); // favor small depth +} + +template +inline bool LossGuide(const ExpandEntryT& lhs, const ExpandEntryT& rhs) { + if (lhs.GetLossChange() == rhs.GetLossChange()) { + return lhs.GetNodeId() > rhs.GetNodeId(); // favor small timestamp + } else { + return lhs.GetLossChange() < rhs.GetLossChange(); // favor large loss_chg + } +} + +// Drives execution of tree building on device +template +class Driver { + using ExpandQueue = + std::priority_queue, + std::function>; + + public: + explicit Driver(TrainParam param, std::size_t max_node_batch_size = 256) + : param_(param), + max_node_batch_size_(max_node_batch_size), + queue_(param.grow_policy == TrainParam::kDepthWise ? DepthWise + : LossGuide) {} + template + void Push(EntryIterT begin, EntryIterT end) { + for (auto it = begin; it != end; ++it) { + const ExpandEntryT& e = *it; + if (e.split.loss_chg > kRtEps) { + queue_.push(e); + } + } + } + void Push(const std::vector &entries) { + this->Push(entries.begin(), entries.end()); + } + void Push(ExpandEntryT const& e) { queue_.push(e); } + + bool IsEmpty() { + return queue_.empty(); + } + + // Can a child of this entry still be expanded? 
+ // can be used to avoid extra work + bool IsChildValid(ExpandEntryT const& parent_entry) { + if (param_.max_depth > 0 && parent_entry.depth + 1 >= param_.max_depth) return false; + if (param_.max_leaves > 0 && num_leaves_ >= param_.max_leaves) return false; + return true; + } + + // Return the set of nodes to be expanded + // This set has no dependencies between entries so they may be expanded in + // parallel or asynchronously + std::vector Pop() { + if (queue_.empty()) return {}; + // Return a single entry for loss guided mode + if (param_.grow_policy == TrainParam::kLossGuide) { + ExpandEntryT e = queue_.top(); + queue_.pop(); + + if (e.IsValid(param_, num_leaves_)) { + num_leaves_++; + return {e}; + } else { + return {}; + } + } + // Return nodes on same level for depth wise + std::vector result; + ExpandEntryT e = queue_.top(); + int level = e.depth; + while (e.depth == level && !queue_.empty() && result.size() < max_node_batch_size_) { + queue_.pop(); + if (e.IsValid(param_, num_leaves_)) { + num_leaves_++; + result.emplace_back(e); + } + + if (!queue_.empty()) { + e = queue_.top(); + } + } + return result; + } + + private: + TrainParam param_; + bst_node_t num_leaves_ = 1; + std::size_t max_node_batch_size_; + ExpandQueue queue_; +}; +} // namespace tree +} // namespace xgboost + +#endif // XGBOOST_TREE_DRIVER_H_ diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc index 4213e74ad..dde1fec96 100644 --- a/src/tree/fit_stump.cc +++ b/src/tree/fit_stump.cc @@ -21,7 +21,8 @@ namespace xgboost { namespace tree { namespace cpu_impl { -void FitStump(Context const* ctx, linalg::TensorView gpair, +void FitStump(Context const* ctx, MetaInfo const& info, + linalg::TensorView gpair, linalg::VectorView out) { auto n_targets = out.Size(); CHECK_EQ(n_targets, gpair.Shape(1)); @@ -43,8 +44,12 @@ void FitStump(Context const* ctx, linalg::TensorView gpai } } CHECK(h_sum.CContiguous()); - collective::Allreduce( - reinterpret_cast(h_sum.Values().data()), h_sum.Size() * 2); + + // In vertical federated learning, only worker 0 needs to call this, no need to do an allreduce. + if (!collective::IsFederated() || info.data_split_mode != DataSplitMode::kCol) { + collective::Allreduce( + reinterpret_cast(h_sum.Values().data()), h_sum.Size() * 2); + } for (std::size_t i = 0; i < h_sum.Size(); ++i) { out(i) = static_cast(CalcUnregularizedWeight(h_sum(i).GetGrad(), h_sum(i).GetHess())); @@ -64,7 +69,7 @@ inline void FitStump(Context const*, linalg::TensorView, #endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP) } // namespace cuda_impl -void FitStump(Context const* ctx, HostDeviceVector const& gpair, +void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector const& gpair, bst_target_t n_targets, linalg::Vector* out) { out->SetDevice(ctx->gpu_id); out->Reshape(n_targets); @@ -72,7 +77,7 @@ void FitStump(Context const* ctx, HostDeviceVector const& gpair, gpair.SetDevice(ctx->gpu_id); auto gpair_t = linalg::MakeTensorView(ctx, &gpair, n_samples, n_targets); - ctx->IsCPU() ? cpu_impl::FitStump(ctx, gpair_t, out->HostView()) + ctx->IsCPU() ? 
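// Dispatch: the CPU path now receives MetaInfo so it can skip the gradient-sum
// Allreduce under vertical federated learning, where labels (and therefore
// valid gradients) live only on worker 0.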
cpu_impl::FitStump(ctx, info, gpair_t, out->HostView()) : cuda_impl::FitStump(ctx, gpair_t, out->View(ctx->gpu_id)); } } // namespace tree diff --git a/src/tree/fit_stump.h b/src/tree/fit_stump.h index 1f5cd60b4..4778ecfc5 100644 --- a/src/tree/fit_stump.h +++ b/src/tree/fit_stump.h @@ -16,6 +16,7 @@ #include "../common/common.h" // AssertGPUSupport #include "xgboost/base.h" // GradientPair #include "xgboost/context.h" // Context +#include "xgboost/data.h" // MetaInfo #include "xgboost/host_device_vector.h" // HostDeviceVector #include "xgboost/linalg.h" // TensorView @@ -30,7 +31,7 @@ XGBOOST_DEVICE inline double CalcUnregularizedWeight(T sum_grad, T sum_hess) { /** * @brief Fit a tree stump as an estimation of base_score. */ -void FitStump(Context const* ctx, HostDeviceVector const& gpair, +void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector const& gpair, bst_target_t n_targets, linalg::Vector* out); } // namespace tree } // namespace xgboost diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index 31a61fb9d..925a5fb76 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -4,22 +4,25 @@ #ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_ #define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_ -#include -#include // for size_t -#include -#include -#include -#include -#include +#include // for copy +#include // for size_t +#include // for numeric_limits +#include // for shared_ptr +#include // for accumulate +#include // for move +#include // for vector -#include "../../common/categorical.h" -#include "../../common/hist_util.h" -#include "../../common/random.h" -#include "../../data/gradient_index.h" -#include "../constraints.h" -#include "../param.h" // for TrainParam -#include "../split_evaluator.h" -#include "xgboost/context.h" +#include "../../common/categorical.h" // for CatBitField +#include "../../common/hist_util.h" // for GHistRow, HistogramCuts +#include "../../common/linalg_op.h" // for cbegin, cend, begin +#include "../../common/random.h" // for ColumnSampler +#include "../constraints.h" // for FeatureInteractionConstraintHost +#include "../param.h" // for TrainParam +#include "../split_evaluator.h" // for TreeEvaluator +#include "expand_entry.h" // for MultiExpandEntry +#include "xgboost/base.h" // for bst_node_t, bst_target_t, bst_feature_t +#include "xgboost/context.h" // for COntext +#include "xgboost/linalg.h" // for Constants, Vector namespace xgboost::tree { template @@ -410,8 +413,6 @@ class HistEvaluator { tree[candidate.nid].SplitIndex(), left_weight, right_weight); - auto max_node = std::max(left_child, tree[candidate.nid].RightChild()); - max_node = std::max(candidate.nid, max_node); snode_.resize(tree.GetNodes().size()); snode_.at(left_child).stats = candidate.split.left_sum; snode_.at(left_child).root_gain = @@ -456,6 +457,216 @@ class HistEvaluator { } }; +class HistMultiEvaluator { + std::vector gain_; + linalg::Matrix stats_; + TrainParam const *param_; + FeatureInteractionConstraintHost interaction_constraints_; + std::shared_ptr column_sampler_; + Context const *ctx_; + + private: + static double MultiCalcSplitGain(TrainParam const ¶m, + linalg::VectorView left_sum, + linalg::VectorView right_sum, + linalg::VectorView left_weight, + linalg::VectorView right_weight) { + CalcWeight(param, left_sum, left_weight); + CalcWeight(param, right_sum, right_weight); + + auto left_gain = CalcGainGivenWeight(param, left_sum, left_weight); + auto right_gain = CalcGainGivenWeight(param, right_sum, right_weight); 
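// The multi-target split gain is the per-target children gains summed; the
// caller subtracts the cached parent gain to obtain loss_chg.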
+ return left_gain + right_gain; + } + + template + bool EnumerateSplit(common::HistogramCuts const &cut, bst_feature_t fidx, + common::Span hist, + linalg::VectorView parent_sum, double parent_gain, + SplitEntryContainer> *p_best) const { + auto const &cut_ptr = cut.Ptrs(); + auto const &cut_val = cut.Values(); + auto const &min_val = cut.MinValues(); + + auto sum = linalg::Empty(ctx_, 2, hist.size()); + auto left_sum = sum.Slice(0, linalg::All()); + auto right_sum = sum.Slice(1, linalg::All()); + + bst_bin_t ibegin, iend; + if (d_step > 0) { + ibegin = static_cast(cut_ptr[fidx]); + iend = static_cast(cut_ptr[fidx + 1]); + } else { + ibegin = static_cast(cut_ptr[fidx + 1]) - 1; + iend = static_cast(cut_ptr[fidx]) - 1; + } + const auto imin = static_cast(cut_ptr[fidx]); + + auto n_targets = hist.size(); + auto weight = linalg::Empty(ctx_, 2, n_targets); + auto left_weight = weight.Slice(0, linalg::All()); + auto right_weight = weight.Slice(1, linalg::All()); + + for (bst_bin_t i = ibegin; i != iend; i += d_step) { + for (bst_target_t t = 0; t < n_targets; ++t) { + auto t_hist = hist[t]; + auto t_p = parent_sum(t); + left_sum(t) += t_hist[i]; + right_sum(t) = t_p - left_sum(t); + } + + if (d_step > 0) { + auto split_pt = cut_val[i]; + auto loss_chg = + MultiCalcSplitGain(*param_, right_sum, left_sum, right_weight, left_weight) - + parent_gain; + p_best->Update(loss_chg, fidx, split_pt, d_step == -1, false, left_sum, right_sum); + } else { + float split_pt; + if (i == imin) { + split_pt = min_val[fidx]; + } else { + split_pt = cut_val[i - 1]; + } + auto loss_chg = + MultiCalcSplitGain(*param_, right_sum, left_sum, left_weight, right_weight) - + parent_gain; + p_best->Update(loss_chg, fidx, split_pt, d_step == -1, false, right_sum, left_sum); + } + } + // Return true if there are missing values; doesn't handle floating-point error well.
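+    // Rationale, assuming the forward pass visits every bin of this feature: the
+    // cumulative left_sum then covers exactly the rows with a recorded value, so any
+    // difference from parent_sum must come from rows with missing values, and the
+    // backward pass (missing values placed on the left) becomes worth evaluating.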
+ if (d_step == +1) { + return !std::equal(linalg::cbegin(left_sum), linalg::cend(left_sum), + linalg::cbegin(parent_sum)); + } + return false; + } + + public: + void EvaluateSplits(RegTree const &tree, common::Span hist, + common::HistogramCuts const &cut, std::vector *p_entries) { + auto &entries = *p_entries; + std::vector>> features(entries.size()); + + for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { + auto nidx = entries[nidx_in_set].nid; + features[nidx_in_set] = column_sampler_->GetFeatureSet(tree.GetDepth(nidx)); + } + CHECK(!features.empty()); + + std::int32_t n_threads = ctx_->Threads(); + std::size_t const grain_size = std::max(1, features.front()->Size() / n_threads); + common::BlockedSpace2d space( + entries.size(), [&](std::size_t nidx_in_set) { return features[nidx_in_set]->Size(); }, + grain_size); + + std::vector tloc_candidates(n_threads * entries.size()); + for (std::size_t i = 0; i < entries.size(); ++i) { + for (std::int32_t j = 0; j < n_threads; ++j) { + tloc_candidates[i * n_threads + j] = entries[i]; + } + } + common::ParallelFor2d(space, n_threads, [&](std::size_t nidx_in_set, common::Range1d r) { + auto tidx = omp_get_thread_num(); + auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx]; + auto best = &entry->split; + auto parent_sum = stats_.Slice(entry->nid, linalg::All()); + std::vector node_hist; + for (auto t_hist : hist) { + node_hist.push_back((*t_hist)[entry->nid]); + } + auto features_set = features[nidx_in_set]->ConstHostSpan(); + + for (auto fidx_in_set = r.begin(); fidx_in_set < r.end(); fidx_in_set++) { + auto fidx = features_set[fidx_in_set]; + if (!interaction_constraints_.Query(entry->nid, fidx)) { + continue; + } + auto parent_gain = gain_[entry->nid]; + bool missing = + this->EnumerateSplit<+1>(cut, fidx, node_hist, parent_sum, parent_gain, best); + if (missing) { + this->EnumerateSplit<-1>(cut, fidx, node_hist, parent_sum, parent_gain, best); + } + } + }); + + for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { + for (auto tidx = 0; tidx < n_threads; ++tidx) { + entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split); + } + } + } + + linalg::Vector InitRoot(linalg::VectorView root_sum) { + auto n_targets = root_sum.Size(); + stats_ = linalg::Constant(ctx_, GradientPairPrecise{}, 1, n_targets); + gain_.resize(1); + + linalg::Vector weight({n_targets}, ctx_->gpu_id); + CalcWeight(*param_, root_sum, weight.HostView()); + auto root_gain = CalcGainGivenWeight(*param_, root_sum, weight.HostView()); + gain_.front() = root_gain; + + auto h_stats = stats_.HostView(); + std::copy(linalg::cbegin(root_sum), linalg::cend(root_sum), linalg::begin(h_stats)); + + return weight; + } + + void ApplyTreeSplit(MultiExpandEntry const &candidate, RegTree *p_tree) { + auto n_targets = p_tree->NumTargets(); + auto parent_sum = stats_.Slice(candidate.nid, linalg::All()); + + auto weight = linalg::Empty(ctx_, 3, n_targets); + auto base_weight = weight.Slice(0, linalg::All()); + CalcWeight(*param_, parent_sum, base_weight); + + auto left_weight = weight.Slice(1, linalg::All()); + auto left_sum = + linalg::MakeVec(candidate.split.left_sum.data(), candidate.split.left_sum.size()); + CalcWeight(*param_, left_sum, param_->learning_rate, left_weight); + + auto right_weight = weight.Slice(2, linalg::All()); + auto right_sum = + linalg::MakeVec(candidate.split.right_sum.data(), candidate.split.right_sum.size()); + CalcWeight(*param_, right_sum, param_->learning_rate, 
right_weight); + + p_tree->ExpandNode(candidate.nid, candidate.split.SplitIndex(), candidate.split.split_value, + candidate.split.DefaultLeft(), base_weight, left_weight, right_weight); + CHECK(p_tree->IsMultiTarget()); + auto left_child = p_tree->LeftChild(candidate.nid); + CHECK_GT(left_child, candidate.nid); + auto right_child = p_tree->RightChild(candidate.nid); + CHECK_GT(right_child, candidate.nid); + + std::size_t n_nodes = p_tree->Size(); + gain_.resize(n_nodes); + gain_[left_child] = CalcGainGivenWeight(*param_, left_sum, left_weight); + gain_[right_child] = CalcGainGivenWeight(*param_, right_sum, right_weight); + + if (n_nodes >= stats_.Shape(0)) { + stats_.Reshape(n_nodes * 2, stats_.Shape(1)); + } + CHECK_EQ(stats_.Shape(1), n_targets); + auto left_sum_stat = stats_.Slice(left_child, linalg::All()); + std::copy(candidate.split.left_sum.cbegin(), candidate.split.left_sum.cend(), + linalg::begin(left_sum_stat)); + auto right_sum_stat = stats_.Slice(right_child, linalg::All()); + std::copy(candidate.split.right_sum.cbegin(), candidate.split.right_sum.cend(), + linalg::begin(right_sum_stat)); + } + + explicit HistMultiEvaluator(Context const *ctx, MetaInfo const &info, TrainParam const *param, + std::shared_ptr sampler) + : param_{param}, column_sampler_{std::move(sampler)}, ctx_{ctx} { + interaction_constraints_.Configure(*param, info.num_col_); + column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(), + param_->colsample_bynode, param_->colsample_bylevel, + param_->colsample_bytree); + } +}; + /** * \brief CPU implementation of update prediction cache, which calculates the leaf value * for the last tree and accumulates it to prediction vector. diff --git a/src/tree/hist/expand_entry.h b/src/tree/hist/expand_entry.h index 885a109bf..acd6edf2b 100644 --- a/src/tree/hist/expand_entry.h +++ b/src/tree/hist/expand_entry.h @@ -1,29 +1,51 @@ -/*! - * Copyright 2021 XGBoost contributors +/** + * Copyright 2021-2023 XGBoost contributors */ #ifndef XGBOOST_TREE_HIST_EXPAND_ENTRY_H_ #define XGBOOST_TREE_HIST_EXPAND_ENTRY_H_ -#include -#include "../param.h" +#include // for all_of +#include // for ostream +#include // for move +#include // for vector -namespace xgboost { -namespace tree { +#include "../param.h" // for SplitEntry, SplitEntryContainer, TrainParam +#include "xgboost/base.h" // for GradientPairPrecise, bst_node_t -struct CPUExpandEntry { - int nid; - int depth; - SplitEntry split; - CPUExpandEntry() = default; - XGBOOST_DEVICE - CPUExpandEntry(int nid, int depth, SplitEntry split) - : nid(nid), depth(depth), split(std::move(split)) {} - CPUExpandEntry(int nid, int depth, float loss_chg) - : nid(nid), depth(depth) { - split.loss_chg = loss_chg; +namespace xgboost::tree { +/** + * \brief Structure for storing tree split candidate. 
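+ * The entry uses CRTP: the concrete type supplies the `split` member and
+ * IsValidImpl(), while this base provides the bookkeeping shared by the scalar
+ * (CPUExpandEntry) and multi-target (MultiExpandEntry) candidates.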
+ */ +template +struct ExpandEntryImpl { + bst_node_t nid; + bst_node_t depth; + + [[nodiscard]] float GetLossChange() const { + return static_cast(this)->split.loss_chg; + } + [[nodiscard]] bst_node_t GetNodeId() const { return nid; } + + static bool ChildIsValid(TrainParam const& param, bst_node_t depth, bst_node_t num_leaves) { + if (param.max_depth > 0 && depth >= param.max_depth) return false; + if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false; + return true; } - bool IsValid(const TrainParam& param, int num_leaves) const { + [[nodiscard]] bool IsValid(TrainParam const& param, bst_node_t num_leaves) const { + return static_cast(this)->IsValidImpl(param, num_leaves); + } +}; + +struct CPUExpandEntry : public ExpandEntryImpl { + SplitEntry split; + + CPUExpandEntry() = default; + CPUExpandEntry(bst_node_t nidx, bst_node_t depth, SplitEntry split) + : ExpandEntryImpl{nidx, depth}, split(std::move(split)) {} + CPUExpandEntry(bst_node_t nidx, bst_node_t depth) : ExpandEntryImpl{nidx, depth} {} + + [[nodiscard]] bool IsValidImpl(TrainParam const& param, bst_node_t num_leaves) const { if (split.loss_chg <= kRtEps) return false; if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) { return false; @@ -40,16 +62,7 @@ struct CPUExpandEntry { return true; } - float GetLossChange() const { return split.loss_chg; } - bst_node_t GetNodeId() const { return nid; } - - static bool ChildIsValid(const TrainParam& param, int depth, int num_leaves) { - if (param.max_depth > 0 && depth >= param.max_depth) return false; - if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false; - return true; - } - - friend std::ostream& operator<<(std::ostream& os, const CPUExpandEntry& e) { + friend std::ostream& operator<<(std::ostream& os, CPUExpandEntry const& e) { os << "ExpandEntry:\n"; os << "nidx: " << e.nid << "\n"; os << "depth: " << e.depth << "\n"; @@ -58,6 +71,54 @@ struct CPUExpandEntry { return os; } }; -} // namespace tree -} // namespace xgboost + +struct MultiExpandEntry : public ExpandEntryImpl { + SplitEntryContainer> split; + + MultiExpandEntry() = default; + MultiExpandEntry(bst_node_t nidx, bst_node_t depth) : ExpandEntryImpl{nidx, depth} {} + + [[nodiscard]] bool IsValidImpl(TrainParam const& param, bst_node_t num_leaves) const { + if (split.loss_chg <= kRtEps) return false; + auto is_zero = [](auto const& sum) { + return std::all_of(sum.cbegin(), sum.cend(), + [&](auto const& g) { return g.GetHess() - .0 == .0; }); + }; + if (is_zero(split.left_sum) || is_zero(split.right_sum)) { + return false; + } + if (split.loss_chg < param.min_split_loss) { + return false; + } + if (param.max_depth > 0 && depth == param.max_depth) { + return false; + } + if (param.max_leaves > 0 && num_leaves == param.max_leaves) { + return false; + } + return true; + } + + friend std::ostream& operator<<(std::ostream& os, MultiExpandEntry const& e) { + os << "ExpandEntry: \n"; + os << "nidx: " << e.nid << "\n"; + os << "depth: " << e.depth << "\n"; + os << "loss: " << e.split.loss_chg << "\n"; + os << "split cond:" << e.split.split_value << "\n"; + os << "split ind:" << e.split.SplitIndex() << "\n"; + os << "left_sum: ["; + for (auto v : e.split.left_sum) { + os << v << ", "; + } + os << "]\n"; + + os << "right_sum: ["; + for (auto v : e.split.right_sum) { + os << v << ", "; + } + os << "]\n"; + return os; + } +}; +} // namespace xgboost::tree #endif // XGBOOST_TREE_HIST_EXPAND_ENTRY_H_ diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h index 
50b90f244..562a0b2d4 100644 --- a/src/tree/hist/histogram.h +++ b/src/tree/hist/histogram.h @@ -306,9 +306,9 @@ class HistogramBuilder { // Construct a work space for building histogram. Eventually we should move this // function into histogram builder once hist tree method supports external memory. -template +template common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners, - std::vector const &nodes_to_build) { + std::vector const &nodes_to_build) { std::vector partition_size(nodes_to_build.size(), 0); for (auto const &partition : partitioners) { size_t k = 0; diff --git a/src/tree/param.h b/src/tree/param.h index 98895e5a2..0d59a5c35 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -14,10 +14,12 @@ #include #include -#include "xgboost/parameter.h" -#include "xgboost/data.h" #include "../common/categorical.h" +#include "../common/linalg_op.h" #include "../common/math.h" +#include "xgboost/data.h" +#include "xgboost/linalg.h" +#include "xgboost/parameter.h" namespace xgboost { namespace tree { @@ -197,12 +199,11 @@ struct TrainParam : public XGBoostParameter { } /*! \brief given the loss change, whether we need to invoke pruning */ - bool NeedPrune(double loss_chg, int depth) const { - return loss_chg < this->min_split_loss || - (this->max_depth != 0 && depth > this->max_depth); + [[nodiscard]] bool NeedPrune(double loss_chg, int depth) const { + return loss_chg < this->min_split_loss || (this->max_depth != 0 && depth > this->max_depth); } - bst_node_t MaxNodes() const { + [[nodiscard]] bst_node_t MaxNodes() const { if (this->max_depth == 0 && this->max_leaves == 0) { LOG(FATAL) << "Max leaves and max depth cannot both be unconstrained."; } @@ -292,6 +293,34 @@ XGBOOST_DEVICE inline float CalcWeight(const TrainingParams &p, GpairT sum_grad) return CalcWeight(p, sum_grad.GetGrad(), sum_grad.GetHess()); } +/** + * \brief multi-target weight, calculated with learning rate. + */ +inline void CalcWeight(TrainParam const &p, linalg::VectorView grad_sum, + float eta, linalg::VectorView out_w) { + for (bst_target_t i = 0; i < out_w.Size(); ++i) { + out_w(i) = CalcWeight(p, grad_sum(i).GetGrad(), grad_sum(i).GetHess()) * eta; + } +} + +/** + * \brief multi-target weight + */ +inline void CalcWeight(TrainParam const &p, linalg::VectorView grad_sum, + linalg::VectorView out_w) { + return CalcWeight(p, grad_sum, 1.0f, out_w); +} + +inline double CalcGainGivenWeight(TrainParam const &p, + linalg::VectorView sum_grad, + linalg::VectorView weight) { + double gain{0}; + for (bst_target_t i = 0; i < weight.Size(); ++i) { + gain += -weight(i) * ThresholdL1(sum_grad(i).GetGrad(), p.reg_alpha); + } + return gain; +} + /*! \brief core statistics used for tree construction */ struct XGBOOST_ALIGNAS(16) GradStats { using GradType = double; @@ -301,8 +330,8 @@ struct XGBOOST_ALIGNAS(16) GradStats { GradType sum_hess { 0 }; public: - XGBOOST_DEVICE GradType GetGrad() const { return sum_grad; } - XGBOOST_DEVICE GradType GetHess() const { return sum_hess; } + [[nodiscard]] XGBOOST_DEVICE GradType GetGrad() const { return sum_grad; } + [[nodiscard]] XGBOOST_DEVICE GradType GetHess() const { return sum_hess; } friend std::ostream& operator<<(std::ostream& os, GradStats s) { os << s.GetGrad() << "/" << s.GetHess(); @@ -340,7 +369,7 @@ struct XGBOOST_ALIGNAS(16) GradStats { sum_hess = a.sum_hess - b.sum_hess; } /*! \return whether the statistics is not used yet */ - inline bool Empty() const { return sum_hess == 0.0; } + [[nodiscard]] bool Empty() const { return sum_hess == 0.0; } /*! 
\brief add statistics to the data */ inline void Add(GradType grad, GradType hess) { sum_grad += grad; @@ -348,6 +377,19 @@ struct XGBOOST_ALIGNAS(16) GradStats { } }; +// Helper functions for copying gradient statistic, one for vector leaf, another for normal scalar. +template +std::vector &CopyStats(linalg::VectorView const &src, std::vector *dst) { // NOLINT + dst->resize(src.Size()); + std::copy(linalg::cbegin(src), linalg::cend(src), dst->begin()); + return *dst; +} + +inline GradStats &CopyStats(GradStats const &src, GradStats *dst) { // NOLINT + *dst = src; + return *dst; +} + /*! * \brief statistics that is helpful to store * and represent a split solution for the tree @@ -378,9 +420,9 @@ struct SplitEntryContainer { return os; } /*!\return feature index to split on */ - bst_feature_t SplitIndex() const { return sindex & ((1U << 31) - 1U); } + [[nodiscard]] bst_feature_t SplitIndex() const { return sindex & ((1U << 31) - 1U); } /*!\return whether missing value goes to left branch */ - bool DefaultLeft() const { return (sindex >> 31) != 0; } + [[nodiscard]] bool DefaultLeft() const { return (sindex >> 31) != 0; } /*! * \brief decides whether we can replace current entry with the given statistics * @@ -391,10 +433,10 @@ struct SplitEntryContainer { * \param new_loss_chg the loss reduction get through the split * \param split_index the feature index where the split is on */ - bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const { + [[nodiscard]] bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const { if (std::isinf(new_loss_chg)) { // in some cases new_loss_chg can be NaN or Inf, - // for example when lambda = 0 & min_child_weight = 0 - // skip value in this case + // for example when lambda = 0 & min_child_weight = 0 + // skip value in this case return false; } else if (this->SplitIndex() <= split_index) { return new_loss_chg > this->loss_chg; @@ -429,9 +471,10 @@ struct SplitEntryContainer { * \param default_left whether the missing value goes to left * \return whether the proposed split is better and can replace current split */ - bool Update(bst_float new_loss_chg, unsigned split_index, - bst_float new_split_value, bool default_left, bool is_cat, - const GradientT &left_sum, const GradientT &right_sum) { + template + bool Update(bst_float new_loss_chg, unsigned split_index, bst_float new_split_value, + bool default_left, bool is_cat, GradientSumT const &left_sum, + GradientSumT const &right_sum) { if (this->NeedReplace(new_loss_chg, split_index)) { this->loss_chg = new_loss_chg; if (default_left) { @@ -440,8 +483,8 @@ struct SplitEntryContainer { this->sindex = split_index; this->split_value = new_split_value; this->is_cat = is_cat; - this->left_sum = left_sum; - this->right_sum = right_sum; + CopyStats(left_sum, &this->left_sum); + CopyStats(right_sum, &this->right_sum); return true; } else { return false; diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index 0891ec3b2..7550904b5 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -815,9 +815,9 @@ void RegTree::ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split linalg::VectorView left_weight, linalg::VectorView right_weight) { CHECK(IsMultiTarget()); - CHECK_LT(split_index, this->param.num_feature); + CHECK_LT(split_index, this->param_.num_feature); CHECK(this->p_mt_tree_); - CHECK_GT(param.size_leaf_vector, 1); + CHECK_GT(param_.size_leaf_vector, 1); this->p_mt_tree_->Expand(nidx, split_index, split_cond, default_left, base_weight, left_weight, 
right_weight); @@ -826,7 +826,7 @@ void RegTree::ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split split_categories_segments_.resize(this->Size()); this->split_types_.at(nidx) = FeatureType::kNumerical; - this->param.num_nodes = this->p_mt_tree_->Size(); + this->param_.num_nodes = this->p_mt_tree_->Size(); } void RegTree::ExpandCategorical(bst_node_t nid, bst_feature_t split_index, @@ -850,13 +850,13 @@ void RegTree::ExpandCategorical(bst_node_t nid, bst_feature_t split_index, } void RegTree::Load(dmlc::Stream* fi) { - CHECK_EQ(fi->Read(¶m, sizeof(TreeParam)), sizeof(TreeParam)); + CHECK_EQ(fi->Read(¶m_, sizeof(TreeParam)), sizeof(TreeParam)); if (!DMLC_IO_NO_ENDIAN_SWAP) { - param = param.ByteSwap(); + param_ = param_.ByteSwap(); } - nodes_.resize(param.num_nodes); - stats_.resize(param.num_nodes); - CHECK_NE(param.num_nodes, 0); + nodes_.resize(param_.num_nodes); + stats_.resize(param_.num_nodes); + CHECK_NE(param_.num_nodes, 0); CHECK_EQ(fi->Read(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size()), sizeof(Node) * nodes_.size()); if (!DMLC_IO_NO_ENDIAN_SWAP) { @@ -873,29 +873,31 @@ void RegTree::Load(dmlc::Stream* fi) { } // chg deleted nodes deleted_nodes_.resize(0); - for (int i = 1; i < param.num_nodes; ++i) { + for (int i = 1; i < param_.num_nodes; ++i) { if (nodes_[i].IsDeleted()) { deleted_nodes_.push_back(i); } } - CHECK_EQ(static_cast(deleted_nodes_.size()), param.num_deleted); + CHECK_EQ(static_cast(deleted_nodes_.size()), param_.num_deleted); - split_types_.resize(param.num_nodes, FeatureType::kNumerical); - split_categories_segments_.resize(param.num_nodes); + split_types_.resize(param_.num_nodes, FeatureType::kNumerical); + split_categories_segments_.resize(param_.num_nodes); } void RegTree::Save(dmlc::Stream* fo) const { - CHECK_EQ(param.num_nodes, static_cast(nodes_.size())); - CHECK_EQ(param.num_nodes, static_cast(stats_.size())); - CHECK_EQ(param.deprecated_num_roots, 1); - CHECK_NE(param.num_nodes, 0); + CHECK_EQ(param_.num_nodes, static_cast(nodes_.size())); + CHECK_EQ(param_.num_nodes, static_cast(stats_.size())); + CHECK_EQ(param_.deprecated_num_roots, 1); + CHECK_NE(param_.num_nodes, 0); + CHECK(!IsMultiTarget()) + << "Please use JSON/UBJSON for saving models with multi-target trees."; CHECK(!HasCategoricalSplit()) << "Please use JSON/UBJSON for saving models with categorical splits."; if (DMLC_IO_NO_ENDIAN_SWAP) { - fo->Write(¶m, sizeof(TreeParam)); + fo->Write(¶m_, sizeof(TreeParam)); } else { - TreeParam x = param.ByteSwap(); + TreeParam x = param_.ByteSwap(); fo->Write(&x, sizeof(x)); } @@ -1081,7 +1083,7 @@ void RegTree::LoadModel(Json const& in) { bool typed = IsA(in[tf::kParent]); auto const& in_obj = get(in); // basic properties - FromJson(in["tree_param"], ¶m); + FromJson(in["tree_param"], ¶m_); // categorical splits bool has_cat = in_obj.find("split_type") != in_obj.cend(); if (has_cat) { @@ -1092,55 +1094,55 @@ void RegTree::LoadModel(Json const& in) { } } // multi-target - if (param.size_leaf_vector > 1) { - this->p_mt_tree_.reset(new MultiTargetTree{¶m}); + if (param_.size_leaf_vector > 1) { + this->p_mt_tree_.reset(new MultiTargetTree{¶m_}); this->GetMultiTargetTree()->LoadModel(in); return; } bool feature_is_64 = IsA(in["split_indices"]); if (typed && feature_is_64) { - LoadModelImpl(in, param, &stats_, &nodes_); + LoadModelImpl(in, param_, &stats_, &nodes_); } else if (typed && !feature_is_64) { - LoadModelImpl(in, param, &stats_, &nodes_); + LoadModelImpl(in, param_, &stats_, &nodes_); } else if (!typed && feature_is_64) { - 
LoadModelImpl(in, param, &stats_, &nodes_); + LoadModelImpl(in, param_, &stats_, &nodes_); } else { - LoadModelImpl(in, param, &stats_, &nodes_); + LoadModelImpl(in, param_, &stats_, &nodes_); } if (!has_cat) { - this->split_categories_segments_.resize(this->param.num_nodes); - this->split_types_.resize(this->param.num_nodes); + this->split_categories_segments_.resize(this->param_.num_nodes); + this->split_types_.resize(this->param_.num_nodes); std::fill(split_types_.begin(), split_types_.end(), FeatureType::kNumerical); } deleted_nodes_.clear(); - for (bst_node_t i = 1; i < param.num_nodes; ++i) { + for (bst_node_t i = 1; i < param_.num_nodes; ++i) { if (nodes_[i].IsDeleted()) { deleted_nodes_.push_back(i); } } // easier access to [] operator auto& self = *this; - for (auto nid = 1; nid < param.num_nodes; ++nid) { + for (auto nid = 1; nid < param_.num_nodes; ++nid) { auto parent = self[nid].Parent(); CHECK_NE(parent, RegTree::kInvalidNodeId); self[nid].SetParent(self[nid].Parent(), self[parent].LeftChild() == nid); } - CHECK_EQ(static_cast(deleted_nodes_.size()), param.num_deleted); - CHECK_EQ(this->split_categories_segments_.size(), param.num_nodes); + CHECK_EQ(static_cast(deleted_nodes_.size()), param_.num_deleted); + CHECK_EQ(this->split_categories_segments_.size(), param_.num_nodes); } void RegTree::SaveModel(Json* p_out) const { auto& out = *p_out; // basic properties - out["tree_param"] = ToJson(param); + out["tree_param"] = ToJson(param_); // categorical splits this->SaveCategoricalSplit(p_out); // multi-target if (this->IsMultiTarget()) { - CHECK_GT(param.size_leaf_vector, 1); + CHECK_GT(param_.size_leaf_vector, 1); this->GetMultiTargetTree()->SaveModel(p_out); return; } @@ -1150,11 +1152,11 @@ void RegTree::SaveModel(Json* p_out) const { * pruner, and this pruner can be used inside another updater so leaf are not necessary * at the end of node array. 
*/ - CHECK_EQ(param.num_nodes, static_cast(nodes_.size())); - CHECK_EQ(param.num_nodes, static_cast(stats_.size())); + CHECK_EQ(param_.num_nodes, static_cast(nodes_.size())); + CHECK_EQ(param_.num_nodes, static_cast(stats_.size())); - CHECK_EQ(get(out["tree_param"]["num_nodes"]), std::to_string(param.num_nodes)); - auto n_nodes = param.num_nodes; + CHECK_EQ(get(out["tree_param"]["num_nodes"]), std::to_string(param_.num_nodes)); + auto n_nodes = param_.num_nodes; // stats F32Array loss_changes(n_nodes); @@ -1168,7 +1170,7 @@ void RegTree::SaveModel(Json* p_out) const { F32Array conds(n_nodes); U8Array default_left(n_nodes); - CHECK_EQ(this->split_types_.size(), param.num_nodes); + CHECK_EQ(this->split_types_.size(), param_.num_nodes); namespace tf = tree_field; @@ -1189,7 +1191,7 @@ void RegTree::SaveModel(Json* p_out) const { default_left.Set(i, static_cast(!!n.DefaultLeft())); } }; - if (this->param.num_feature > static_cast(std::numeric_limits::max())) { + if (this->param_.num_feature > static_cast(std::numeric_limits::max())) { I64Array indices_64(n_nodes); save_tree(&indices_64); out[tf::kSplitIdx] = std::move(indices_64); diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index 5af2721a6..fd636d3a3 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -226,8 +226,8 @@ class GloablApproxBuilder { for (auto const &candidate : valid_candidates) { int left_child_nidx = tree[candidate.nid].LeftChild(); int right_child_nidx = tree[candidate.nid].RightChild(); - CPUExpandEntry l_best{left_child_nidx, tree.GetDepth(left_child_nidx), {}}; - CPUExpandEntry r_best{right_child_nidx, tree.GetDepth(right_child_nidx), {}}; + CPUExpandEntry l_best{left_child_nidx, tree.GetDepth(left_child_nidx)}; + CPUExpandEntry r_best{right_child_nidx, tree.GetDepth(right_child_nidx)}; best_splits.push_back(l_best); best_splits.push_back(r_best); } diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 06579c429..02edfa74a 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -190,7 +190,7 @@ class ColMaker: public TreeUpdater { (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate); } // remember auxiliary statistics in the tree node - for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) { + for (int nid = 0; nid < p_tree->NumNodes(); ++nid) { p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg; p_tree->Stat(nid).base_weight = snode_[nid].weight; p_tree->Stat(nid).sum_hess = static_cast(snode_[nid].stats.sum_hess); @@ -255,9 +255,9 @@ class ColMaker: public TreeUpdater { { // setup statistics space for each tree node for (auto& i : stemp_) { - i.resize(tree.param.num_nodes, ThreadEntry()); + i.resize(tree.NumNodes(), ThreadEntry()); } - snode_.resize(tree.param.num_nodes, NodeEntry()); + snode_.resize(tree.NumNodes(), NodeEntry()); } const MetaInfo& info = fmat.Info(); // setup position diff --git a/src/tree/updater_prune.cc b/src/tree/updater_prune.cc index 0970d2f79..29f9917ba 100644 --- a/src/tree/updater_prune.cc +++ b/src/tree/updater_prune.cc @@ -72,7 +72,7 @@ class TreePruner : public TreeUpdater { void DoPrune(TrainParam const* param, RegTree* p_tree) { auto& tree = *p_tree; bst_node_t npruned = 0; - for (int nid = 0; nid < tree.param.num_nodes; ++nid) { + for (int nid = 0; nid < tree.NumNodes(); ++nid) { if (tree[nid].IsLeaf() && !tree[nid].IsDeleted()) { npruned = this->TryPruneLeaf(param, p_tree, nid, tree.GetDepth(nid), npruned); } diff --git a/src/tree/updater_quantile_hist.cc 
b/src/tree/updater_quantile_hist.cc index 76c402ff5..012b8e781 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -4,263 +4,368 @@ * \brief use quantized feature values to construct a tree * \author Philip Cho, Tianqi Chen, Egor Smirnov */ -#include "./updater_quantile_hist.h" +#include // for max, copy, transform +#include // for size_t +#include // for uint32_t, int32_t +#include // for unique_ptr, allocator, make_unique, shared_ptr +#include // for accumulate +#include // for basic_ostream, char_traits, operator<< +#include // for move, swap +#include // for vector -#include -#include -#include -#include -#include -#include +#include "../collective/communicator-inl.h" // for Allreduce, IsDistributed +#include "../collective/communicator.h" // for Operation +#include "../common/hist_util.h" // for HistogramCuts, HistCollection +#include "../common/linalg_op.h" // for begin, cbegin, cend +#include "../common/random.h" // for ColumnSampler +#include "../common/threading_utils.h" // for ParallelFor +#include "../common/timer.h" // for Monitor +#include "../common/transform_iterator.h" // for IndexTransformIter, MakeIndexTransformIter +#include "../data/gradient_index.h" // for GHistIndexMatrix +#include "common_row_partitioner.h" // for CommonRowPartitioner +#include "dmlc/omp.h" // for omp_get_thread_num +#include "dmlc/registry.h" // for DMLC_REGISTRY_FILE_TAG +#include "driver.h" // for Driver +#include "hist/evaluate_splits.h" // for HistEvaluator, HistMultiEvaluator, UpdatePre... +#include "hist/expand_entry.h" // for MultiExpandEntry, CPUExpandEntry +#include "hist/histogram.h" // for HistogramBuilder, ConstructHistSpace +#include "hist/sampler.h" // for SampleGradient +#include "param.h" // for TrainParam, SplitEntryContainer, GradStats +#include "xgboost/base.h" // for GradientPairInternal, GradientPair, bst_targ... +#include "xgboost/context.h" // for Context +#include "xgboost/data.h" // for BatchIterator, BatchSet, DMatrix, MetaInfo +#include "xgboost/host_device_vector.h" // for HostDeviceVector +#include "xgboost/linalg.h" // for All, MatrixView, TensorView, Matrix, Empty +#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_GE +#include "xgboost/span.h" // for Span, operator!=, SpanIterator +#include "xgboost/string_view.h" // for operator<< +#include "xgboost/task.h" // for ObjInfo +#include "xgboost/tree_model.h" // for RegTree, MTNotImplemented, RTreeNodeStat +#include "xgboost/tree_updater.h" // for TreeUpdater, TreeUpdaterReg, XGBOOST_REGISTE...
-#include "common_row_partitioner.h" -#include "constraints.h" -#include "hist/evaluate_splits.h" -#include "hist/histogram.h" -#include "hist/sampler.h" -#include "param.h" -#include "xgboost/linalg.h" -#include "xgboost/logging.h" -#include "xgboost/tree_updater.h" - -namespace xgboost { -namespace tree { +namespace xgboost::tree { DMLC_REGISTRY_FILE_TAG(updater_quantile_hist); -void QuantileHistMaker::Update(TrainParam const *param, HostDeviceVector *gpair, - DMatrix *dmat, - common::Span> out_position, - const std::vector &trees) { - // build tree - const size_t n_trees = trees.size(); - if (!pimpl_) { - pimpl_.reset(new Builder(n_trees, param, dmat, *task_, ctx_)); - } +BatchParam HistBatch(TrainParam const *param) { return {param->max_bin, param->sparse_threshold}; } - size_t t_idx{0}; - for (auto p_tree : trees) { - auto &t_row_position = out_position[t_idx]; - this->pimpl_->UpdateTree(gpair, dmat, p_tree, &t_row_position); - ++t_idx; - } -} - -bool QuantileHistMaker::UpdatePredictionCache(const DMatrix *data, - linalg::VectorView out_preds) { - if (pimpl_) { - return pimpl_->UpdatePredictionCache(data, out_preds); - } else { - return false; - } -} - -CPUExpandEntry QuantileHistMaker::Builder::InitRoot( - DMatrix *p_fmat, RegTree *p_tree, const std::vector &gpair_h) { - CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0), 0.0f); - - size_t page_id = 0; - auto space = ConstructHistSpace(partitioner_, {node}); - for (auto const &gidx : p_fmat->GetBatches(HistBatch(param_))) { - std::vector nodes_to_build{node}; - std::vector nodes_to_sub; - this->histogram_builder_->BuildHist(page_id, space, gidx, p_tree, - partitioner_.at(page_id).Partitions(), nodes_to_build, - nodes_to_sub, gpair_h); - ++page_id; - } - - { - GradientPairPrecise grad_stat; - if (p_fmat->IsDense()) { - /** - * Specialized code for dense data: For dense data (with no missing value), the sum - * of gradient histogram is equal to snode[nid] - */ - auto const &gmat = *(p_fmat->GetBatches(HistBatch(param_)).begin()); - std::vector const &row_ptr = gmat.cut.Ptrs(); - CHECK_GE(row_ptr.size(), 2); - uint32_t const ibegin = row_ptr[0]; - uint32_t const iend = row_ptr[1]; - auto hist = this->histogram_builder_->Histogram()[RegTree::kRoot]; - auto begin = hist.data(); - for (uint32_t i = ibegin; i < iend; ++i) { - GradientPairPrecise const &et = begin[i]; - grad_stat.Add(et.GetGrad(), et.GetHess()); - } - } else { - for (auto const &grad : gpair_h) { - grad_stat.Add(grad.GetGrad(), grad.GetHess()); - } - collective::Allreduce(reinterpret_cast(&grad_stat), 2); - } - - auto weight = evaluator_->InitRoot(GradStats{grad_stat}); - p_tree->Stat(RegTree::kRoot).sum_hess = grad_stat.GetHess(); - p_tree->Stat(RegTree::kRoot).base_weight = weight; - (*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight); - - std::vector entries{node}; - monitor_->Start("EvaluateSplits"); - auto ft = p_fmat->Info().feature_types.ConstHostSpan(); - for (auto const &gmat : p_fmat->GetBatches(HistBatch(param_))) { - evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree, &entries); - break; - } - monitor_->Stop("EvaluateSplits"); - node = entries.front(); - } - - return node; -} - -void QuantileHistMaker::Builder::BuildHistogram(DMatrix *p_fmat, RegTree *p_tree, - std::vector const &valid_candidates, - std::vector const &gpair) { - std::vector nodes_to_build(valid_candidates.size()); - std::vector nodes_to_sub(valid_candidates.size()); - - size_t n_idx = 0; - for (auto const &c : valid_candidates) { - auto left_nidx 
= (*p_tree)[c.nid].LeftChild(); - auto right_nidx = (*p_tree)[c.nid].RightChild(); - auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess(); - - auto build_nidx = left_nidx; - auto subtract_nidx = right_nidx; - if (fewer_right) { - std::swap(build_nidx, subtract_nidx); - } - nodes_to_build[n_idx] = CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}}; - nodes_to_sub[n_idx] = CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}}; - n_idx++; - } - - size_t page_id{0}; - auto space = ConstructHistSpace(partitioner_, nodes_to_build); - for (auto const &gidx : p_fmat->GetBatches(HistBatch(param_))) { - histogram_builder_->BuildHist(page_id, space, gidx, p_tree, - partitioner_.at(page_id).Partitions(), nodes_to_build, - nodes_to_sub, gpair); - ++page_id; - } -} - -void QuantileHistMaker::Builder::LeafPartition(RegTree const &tree, - common::Span gpair, - std::vector *p_out_position) { +template +void UpdateTree(common::Monitor *monitor_, linalg::MatrixView gpair, + Updater *updater, DMatrix *p_fmat, TrainParam const *param, + HostDeviceVector *p_out_position, RegTree *p_tree) { monitor_->Start(__func__); - if (!task_.UpdateTreeLeaf()) { - return; - } - for (auto const &part : partitioner_) { - part.LeafPartition(ctx_, tree, gpair, p_out_position); - } - monitor_->Stop(__func__); -} + updater->InitData(p_fmat, p_tree); -void QuantileHistMaker::Builder::ExpandTree(DMatrix *p_fmat, RegTree *p_tree, - const std::vector &gpair_h, - HostDeviceVector *p_out_position) { - monitor_->Start(__func__); - - Driver driver(*param_); - driver.Push(this->InitRoot(p_fmat, p_tree, gpair_h)); + Driver driver{*param}; auto const &tree = *p_tree; + driver.Push(updater->InitRoot(p_fmat, gpair, p_tree)); auto expand_set = driver.Pop(); + /** + * Note on updating position: + * Root: + * Not applied: No need to update position, as initialization already has all the rows ordered. + * Applied: Update position is run on applied nodes so the rows are partitioned. + * Non-root: + * Not applied: That node is the root of the subtree; same rule as root. + * Applied: Ditto. + */ while (!expand_set.empty()) { // candidates that can be further split. - std::vector valid_candidates; // candidates that can be applied.
- std::vector applied; - int32_t depth = expand_set.front().depth + 1; - for (auto const& candidate : expand_set) { - evaluator_->ApplyTreeSplit(candidate, p_tree); + std::vector applied; + for (auto const &candidate : expand_set) { + updater->ApplyTreeSplit(candidate, p_tree); + CHECK_GT(p_tree->LeftChild(candidate.nid), candidate.nid); applied.push_back(candidate); if (driver.IsChildValid(candidate)) { valid_candidates.emplace_back(candidate); } } - monitor_->Start("UpdatePosition"); - size_t page_id{0}; - for (auto const &page : p_fmat->GetBatches(HistBatch(param_))) { - partitioner_.at(page_id).UpdatePosition(ctx_, page, applied, p_tree); - ++page_id; - } - monitor_->Stop("UpdatePosition"); + updater->UpdatePosition(p_fmat, p_tree, applied); - std::vector best_splits; + std::vector best_splits; if (!valid_candidates.empty()) { - this->BuildHistogram(p_fmat, p_tree, valid_candidates, gpair_h); + updater->BuildHistogram(p_fmat, p_tree, valid_candidates, gpair); for (auto const &candidate : valid_candidates) { - int left_child_nidx = tree[candidate.nid].LeftChild(); - int right_child_nidx = tree[candidate.nid].RightChild(); - CPUExpandEntry l_best{left_child_nidx, depth, 0.0}; - CPUExpandEntry r_best{right_child_nidx, depth, 0.0}; + auto left_child_nidx = tree.LeftChild(candidate.nid); + auto right_child_nidx = tree.RightChild(candidate.nid); + ExpandEntry l_best{left_child_nidx, tree.GetDepth(left_child_nidx)}; + ExpandEntry r_best{right_child_nidx, tree.GetDepth(right_child_nidx)}; best_splits.push_back(l_best); best_splits.push_back(r_best); } - auto const &histograms = histogram_builder_->Histogram(); - auto ft = p_fmat->Info().feature_types.ConstHostSpan(); - for (auto const &gmat : p_fmat->GetBatches(HistBatch(param_))) { - evaluator_->EvaluateSplits(histograms, gmat.cut, ft, *p_tree, &best_splits); - break; - } + updater->EvaluateSplits(p_fmat, p_tree, &best_splits); } driver.Push(best_splits.begin(), best_splits.end()); expand_set = driver.Pop(); } auto &h_out_position = p_out_position->HostVector(); - this->LeafPartition(tree, gpair_h, &h_out_position); + updater->LeafPartition(tree, gpair, &h_out_position); monitor_->Stop(__func__); } -void QuantileHistMaker::Builder::UpdateTree(HostDeviceVector *gpair, DMatrix *p_fmat, - RegTree *p_tree, - HostDeviceVector *p_out_position) { - monitor_->Start(__func__); +/** + * \brief Updater for building multi-target trees. The implementation simply iterates over + * each target. + */ +class MultiTargetHistBuilder { + private: + common::Monitor *monitor_{nullptr}; + TrainParam const *param_{nullptr}; + std::shared_ptr col_sampler_; + std::unique_ptr evaluator_; + // Histogram builder for each target. + std::vector> histogram_builder_; + Context const *ctx_{nullptr}; + // Partitioner for each data batch. + std::vector partitioner_; + // Pointer to last updated tree, used for update prediction cache. 
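+  // Note: the cache is not consumed yet; QuantileHistMaker::UpdatePredictionCache
+  // below returns false for the multi-target builder.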
+ RegTree const *p_last_tree_{nullptr}; - std::vector *gpair_ptr = &(gpair->HostVector()); - // in case 'num_parallel_trees != 1' no posibility to change initial gpair - if (GetNumberOfTrees() != 1) { - gpair_local_.resize(gpair_ptr->size()); - gpair_local_ = *gpair_ptr; - gpair_ptr = &gpair_local_; + ObjInfo const *task_{nullptr}; + + public: + void UpdatePosition(DMatrix *p_fmat, RegTree const *p_tree, + std::vector const &applied) { + monitor_->Start(__func__); + std::size_t page_id{0}; + for (auto const &page : p_fmat->GetBatches(HistBatch(this->param_))) { + this->partitioner_.at(page_id).UpdatePosition(this->ctx_, page, applied, p_tree); + page_id++; + } + monitor_->Stop(__func__); } - this->InitData(p_fmat, *p_tree, gpair_ptr); - - ExpandTree(p_fmat, p_tree, *gpair_ptr, p_out_position); - monitor_->Stop(__func__); -} - -bool QuantileHistMaker::Builder::UpdatePredictionCache(DMatrix const *data, - linalg::VectorView out_preds) const { - // p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in - // conjunction with Update(). - if (!p_last_fmat_ || !p_last_tree_ || data != p_last_fmat_) { - return false; + void ApplyTreeSplit(MultiExpandEntry const &candidate, RegTree *p_tree) { + this->evaluator_->ApplyTreeSplit(candidate, p_tree); } - monitor_->Start(__func__); - CHECK_EQ(out_preds.Size(), data->Info().num_row_); - UpdatePredictionCacheImpl(ctx_, p_last_tree_, partitioner_, out_preds); - monitor_->Stop(__func__); - return true; -} -size_t QuantileHistMaker::Builder::GetNumberOfTrees() { return n_trees_; } + void InitData(DMatrix *p_fmat, RegTree const *p_tree) { + monitor_->Start(__func__); -void QuantileHistMaker::Builder::InitData(DMatrix *fmat, const RegTree &tree, - std::vector *gpair) { - monitor_->Start(__func__); - const auto& info = fmat->Info(); + std::size_t page_id = 0; + bst_bin_t n_total_bins = 0; + partitioner_.clear(); + for (auto const &page : p_fmat->GetBatches(HistBatch(param_))) { + if (n_total_bins == 0) { + n_total_bins = page.cut.TotalBins(); + } else { + CHECK_EQ(n_total_bins, page.cut.TotalBins()); + } + partitioner_.emplace_back(ctx_, page.Size(), page.base_rowid, p_fmat->IsColumnSplit()); + page_id++; + } - { - size_t page_id{0}; - int32_t n_total_bins{0}; + bst_target_t n_targets = p_tree->NumTargets(); + histogram_builder_.clear(); + for (std::size_t i = 0; i < n_targets; ++i) { + histogram_builder_.emplace_back(); + histogram_builder_.back().Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id, + collective::IsDistributed(), p_fmat->IsColumnSplit()); + } + + evaluator_ = std::make_unique(ctx_, p_fmat->Info(), param_, col_sampler_); + p_last_tree_ = p_tree; + monitor_->Stop(__func__); + } + + MultiExpandEntry InitRoot(DMatrix *p_fmat, linalg::MatrixView gpair, + RegTree *p_tree) { + monitor_->Start(__func__); + MultiExpandEntry best; + best.nid = RegTree::kRoot; + best.depth = 0; + + auto n_targets = p_tree->NumTargets(); + linalg::Matrix root_sum_tloc = + linalg::Empty(ctx_, ctx_->Threads(), n_targets); + CHECK_EQ(root_sum_tloc.Shape(1), gpair.Shape(1)); + auto h_root_sum_tloc = root_sum_tloc.HostView(); + common::ParallelFor(gpair.Shape(0), ctx_->Threads(), [&](auto i) { + for (bst_target_t t{0}; t < n_targets; ++t) { + h_root_sum_tloc(omp_get_thread_num(), t) += GradientPairPrecise{gpair(i, t)}; + } + }); + // Aggregate to the first row. 
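+    // Row 0 of the thread-local buffer absorbs the other rows, yielding the
+    // per-worker total; the Allreduce below then sums across workers, treating each
+    // GradientPairPrecise as two contiguous doubles.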
+ auto root_sum = h_root_sum_tloc.Slice(0, linalg::All()); + for (std::int32_t tidx{1}; tidx < ctx_->Threads(); ++tidx) { + for (bst_target_t t{0}; t < n_targets; ++t) { + root_sum(t) += h_root_sum_tloc(tidx, t); + } + } + CHECK(root_sum.CContiguous()); + collective::Allreduce( + reinterpret_cast(root_sum.Values().data()), root_sum.Size() * 2); + + std::vector nodes{best}; + std::size_t i = 0; + auto space = ConstructHistSpace(partitioner_, nodes); + for (auto const &page : p_fmat->GetBatches(HistBatch(param_))) { + for (bst_target_t t{0}; t < n_targets; ++t) { + auto t_gpair = gpair.Slice(linalg::All(), t); + histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(), + nodes, {}, t_gpair.Values()); + } + i++; + } + + auto weight = evaluator_->InitRoot(root_sum); + auto weight_t = weight.HostView(); + std::transform(linalg::cbegin(weight_t), linalg::cend(weight_t), linalg::begin(weight_t), + [&](float w) { return w * param_->learning_rate; }); + + p_tree->SetLeaf(RegTree::kRoot, weight_t); + std::vector hists; + for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) { + hists.push_back(&histogram_builder_[t].Histogram()); + } + for (auto const &gmat : p_fmat->GetBatches(HistBatch(param_))) { + evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, &nodes); + break; + } + monitor_->Stop(__func__); + + return nodes.front(); + } + + void BuildHistogram(DMatrix *p_fmat, RegTree const *p_tree, + std::vector const &valid_candidates, + linalg::MatrixView gpair) { + monitor_->Start(__func__); + std::vector nodes_to_build; + std::vector nodes_to_sub; + + for (auto const &c : valid_candidates) { + auto left_nidx = p_tree->LeftChild(c.nid); + auto right_nidx = p_tree->RightChild(c.nid); + + auto build_nidx = left_nidx; + auto subtract_nidx = right_nidx; + auto lit = + common::MakeIndexTransformIter([&](auto i) { return c.split.left_sum[i].GetHess(); }); + auto left_sum = std::accumulate(lit, lit + c.split.left_sum.size(), .0); + auto rit = + common::MakeIndexTransformIter([&](auto i) { return c.split.right_sum[i].GetHess(); }); + auto right_sum = std::accumulate(rit, rit + c.split.right_sum.size(), .0); + auto fewer_right = right_sum < left_sum; + if (fewer_right) { + std::swap(build_nidx, subtract_nidx); + } + nodes_to_build.emplace_back(build_nidx, p_tree->GetDepth(build_nidx)); + nodes_to_sub.emplace_back(subtract_nidx, p_tree->GetDepth(subtract_nidx)); + } + + std::size_t i = 0; + auto space = ConstructHistSpace(partitioner_, nodes_to_build); + for (auto const &page : p_fmat->GetBatches(HistBatch(param_))) { + for (std::size_t t = 0; t < p_tree->NumTargets(); ++t) { + auto t_gpair = gpair.Slice(linalg::All(), t); + // Make sure the gradient matrix is f-order. 
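+        // With an F-order (column-major) gradient matrix, slicing out one target
+        // yields a contiguous column, so Values() can be handed to BuildHist without
+        // a copy.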
+ CHECK(t_gpair.Contiguous()); + histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(), + nodes_to_build, nodes_to_sub, t_gpair.Values()); + } + i++; + } + monitor_->Stop(__func__); + } + + void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree, + std::vector *best_splits) { + monitor_->Start(__func__); + std::vector hists; + for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) { + hists.push_back(&histogram_builder_[t].Histogram()); + } + for (auto const &gmat : p_fmat->GetBatches(HistBatch(param_))) { + evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, best_splits); + break; + } + monitor_->Stop(__func__); + } + + void LeafPartition(RegTree const &tree, linalg::MatrixView gpair, + std::vector *p_out_position) { + monitor_->Start(__func__); + if (!task_->UpdateTreeLeaf()) { + return; + } + for (auto const &part : partitioner_) { + part.LeafPartition(ctx_, tree, gpair, p_out_position); + } + monitor_->Stop(__func__); + } + + public: + explicit MultiTargetHistBuilder(Context const *ctx, MetaInfo const &info, TrainParam const *param, + std::shared_ptr column_sampler, + ObjInfo const *task, common::Monitor *monitor) + : monitor_{monitor}, + param_{param}, + col_sampler_{std::move(column_sampler)}, + evaluator_{std::make_unique(ctx, info, param, col_sampler_)}, + ctx_{ctx}, + task_{task} { + monitor_->Init(__func__); + } +}; + +class HistBuilder { + private: + common::Monitor *monitor_; + TrainParam const *param_; + std::shared_ptr col_sampler_; + std::unique_ptr> evaluator_; + std::vector partitioner_; + + // back pointers to tree and data matrix + const RegTree *p_last_tree_{nullptr}; + DMatrix const *const p_last_fmat_{nullptr}; + + std::unique_ptr> histogram_builder_; + ObjInfo const *task_{nullptr}; + // Context for number of threads + Context const *ctx_{nullptr}; + + public: + explicit HistBuilder(Context const *ctx, std::shared_ptr column_sampler, + TrainParam const *param, DMatrix const *fmat, ObjInfo const *task, + common::Monitor *monitor) + : monitor_{monitor}, + param_{param}, + col_sampler_{std::move(column_sampler)}, + evaluator_{std::make_unique>(ctx, param, fmat->Info(), + col_sampler_)}, + p_last_fmat_(fmat), + histogram_builder_{new HistogramBuilder}, + task_{task}, + ctx_{ctx} { + monitor_->Init(__func__); + } + + bool UpdatePredictionCache(DMatrix const *data, linalg::VectorView out_preds) const { + // p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in + // conjunction with Update(). 
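+    // The pointer identity check is intentional: the cached partitions describe only
+    // the rows of the DMatrix seen by the last Update() call, so any other matrix
+    // falls back to a full prediction pass.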
+ if (!p_last_fmat_ || !p_last_tree_ || data != p_last_fmat_) { + return false; + } + monitor_->Start(__func__); + CHECK_EQ(out_preds.Size(), data->Info().num_row_); + UpdatePredictionCacheImpl(ctx_, p_last_tree_, partitioner_, out_preds); + monitor_->Stop(__func__); + return true; + } + + public: + // initialize temp data structure + void InitData(DMatrix *fmat, RegTree const *p_tree) { + monitor_->Start(__func__); + std::size_t page_id{0}; + bst_bin_t n_total_bins{0}; partitioner_.clear(); for (auto const &page : fmat->GetBatches(HistBatch(param_))) { if (n_total_bins == 0) { @@ -273,22 +378,227 @@ void QuantileHistMaker::Builder::InitData(DMatrix *fmat, const RegTree &tree, } histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id, collective::IsDistributed(), fmat->IsColumnSplit()); - - auto m_gpair = linalg::MakeTensorView(ctx_, *gpair, gpair->size(), static_cast(1)); - SampleGradient(ctx_, *param_, m_gpair); + evaluator_ = std::make_unique>(ctx_, this->param_, fmat->Info(), + col_sampler_); + p_last_tree_ = p_tree; } - // store a pointer to the tree - p_last_tree_ = &tree; - evaluator_.reset(new HistEvaluator{ctx_, param_, info, column_sampler_}); + void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree, + std::vector *best_splits) { + monitor_->Start(__func__); + auto const &histograms = histogram_builder_->Histogram(); + auto ft = p_fmat->Info().feature_types.ConstHostSpan(); + for (auto const &gmat : p_fmat->GetBatches(HistBatch(param_))) { + evaluator_->EvaluateSplits(histograms, gmat.cut, ft, *p_tree, best_splits); + break; + } + monitor_->Stop(__func__); + } - monitor_->Stop(__func__); -} + void ApplyTreeSplit(CPUExpandEntry const &candidate, RegTree *p_tree) { + this->evaluator_->ApplyTreeSplit(candidate, p_tree); + } + + CPUExpandEntry InitRoot(DMatrix *p_fmat, linalg::MatrixView gpair, + RegTree *p_tree) { + CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0)); + + std::size_t page_id = 0; + auto space = ConstructHistSpace(partitioner_, {node}); + for (auto const &gidx : p_fmat->GetBatches(HistBatch(param_))) { + std::vector nodes_to_build{node}; + std::vector nodes_to_sub; + this->histogram_builder_->BuildHist(page_id, space, gidx, p_tree, + partitioner_.at(page_id).Partitions(), nodes_to_build, + nodes_to_sub, gpair.Slice(linalg::All(), 0).Values()); + ++page_id; + } + + { + GradientPairPrecise grad_stat; + if (p_fmat->IsDense()) { + /** + * Specialized code for dense data: For dense data (with no missing value), the sum + * of gradient histogram is equal to snode[nid] + */ + auto const &gmat = *(p_fmat->GetBatches(HistBatch(param_)).begin()); + std::vector const &row_ptr = gmat.cut.Ptrs(); + CHECK_GE(row_ptr.size(), 2); + std::uint32_t const ibegin = row_ptr[0]; + std::uint32_t const iend = row_ptr[1]; + auto hist = this->histogram_builder_->Histogram()[RegTree::kRoot]; + auto begin = hist.data(); + for (std::uint32_t i = ibegin; i < iend; ++i) { + GradientPairPrecise const &et = begin[i]; + grad_stat.Add(et.GetGrad(), et.GetHess()); + } + } else { + auto gpair_h = gpair.Slice(linalg::All(), 0).Values(); + for (auto const &grad : gpair_h) { + grad_stat.Add(grad.GetGrad(), grad.GetHess()); + } + collective::Allreduce(reinterpret_cast(&grad_stat), + 2); + } + + auto weight = evaluator_->InitRoot(GradStats{grad_stat}); + p_tree->Stat(RegTree::kRoot).sum_hess = grad_stat.GetHess(); + p_tree->Stat(RegTree::kRoot).base_weight = weight; + (*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight); + + std::vector entries{node}; + 
monitor_->Start("EvaluateSplits"); + auto ft = p_fmat->Info().feature_types.ConstHostSpan(); + for (auto const &gmat : p_fmat->GetBatches(HistBatch(param_))) { + evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree, + &entries); + break; + } + monitor_->Stop("EvaluateSplits"); + node = entries.front(); + } + + return node; + } + + void BuildHistogram(DMatrix *p_fmat, RegTree *p_tree, + std::vector const &valid_candidates, + linalg::MatrixView gpair) { + std::vector nodes_to_build(valid_candidates.size()); + std::vector nodes_to_sub(valid_candidates.size()); + + std::size_t n_idx = 0; + for (auto const &c : valid_candidates) { + auto left_nidx = (*p_tree)[c.nid].LeftChild(); + auto right_nidx = (*p_tree)[c.nid].RightChild(); + auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess(); + + auto build_nidx = left_nidx; + auto subtract_nidx = right_nidx; + if (fewer_right) { + std::swap(build_nidx, subtract_nidx); + } + nodes_to_build[n_idx] = CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}}; + nodes_to_sub[n_idx] = CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}}; + n_idx++; + } + + std::size_t page_id{0}; + auto space = ConstructHistSpace(partitioner_, nodes_to_build); + for (auto const &gidx : p_fmat->GetBatches(HistBatch(param_))) { + histogram_builder_->BuildHist(page_id, space, gidx, p_tree, + partitioner_.at(page_id).Partitions(), nodes_to_build, + nodes_to_sub, gpair.Values()); + ++page_id; + } + } + + void UpdatePosition(DMatrix *p_fmat, RegTree const *p_tree, + std::vector const &applied) { + monitor_->Start(__func__); + std::size_t page_id{0}; + for (auto const &page : p_fmat->GetBatches(HistBatch(this->param_))) { + this->partitioner_.at(page_id).UpdatePosition(this->ctx_, page, applied, p_tree); + page_id++; + } + monitor_->Stop(__func__); + } + + void LeafPartition(RegTree const &tree, linalg::MatrixView gpair, + std::vector *p_out_position) { + monitor_->Start(__func__); + if (!task_->UpdateTreeLeaf()) { + return; + } + for (auto const &part : partitioner_) { + part.LeafPartition(ctx_, tree, gpair, p_out_position); + } + monitor_->Stop(__func__); + } +}; + +/*! 
\brief construct a tree using quantized feature values */ +class QuantileHistMaker : public TreeUpdater { + std::unique_ptr p_impl_{nullptr}; + std::unique_ptr p_mtimpl_{nullptr}; + std::shared_ptr column_sampler_ = + std::make_shared(); + common::Monitor monitor_; + ObjInfo const *task_{nullptr}; + + public: + explicit QuantileHistMaker(Context const *ctx, ObjInfo const *task) + : TreeUpdater{ctx}, task_{task} {} + void Configure(const Args &) override {} + + void LoadConfig(Json const &) override {} + void SaveConfig(Json *) const override {} + + [[nodiscard]] char const *Name() const override { return "grow_quantile_histmaker"; } + + void Update(TrainParam const *param, HostDeviceVector *gpair, DMatrix *p_fmat, + common::Span> out_position, + const std::vector &trees) override { + if (trees.front()->IsMultiTarget()) { + CHECK(param->monotone_constraints.empty()) << "monotone constraint" << MTNotImplemented(); + if (!p_mtimpl_) { + this->p_mtimpl_ = std::make_unique( + ctx_, p_fmat->Info(), param, column_sampler_, task_, &monitor_); + } + } else { + if (!p_impl_) { + p_impl_ = + std::make_unique(ctx_, column_sampler_, param, p_fmat, task_, &monitor_); + } + } + + bst_target_t n_targets = trees.front()->NumTargets(); + auto h_gpair = + linalg::MakeTensorView(ctx_, gpair->HostSpan(), p_fmat->Info().num_row_, n_targets); + + linalg::Matrix sample_out; + auto h_sample_out = h_gpair; + auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; }; + if (need_copy()) { + // allocate buffer + sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->gpu_id, linalg::Order::kF}; + h_sample_out = sample_out.HostView(); + } + + for (auto tree_it = trees.begin(); tree_it != trees.end(); ++tree_it) { + if (need_copy()) { + // Copy gradient into buffer for sampling. This converts C-order to F-order. + std::copy(linalg::cbegin(h_gpair), linalg::cend(h_gpair), linalg::begin(h_sample_out)); + } + SampleGradient(ctx_, *param, h_sample_out); + auto *h_out_position = &out_position[tree_it - trees.begin()]; + if ((*tree_it)->IsMultiTarget()) { + UpdateTree(&monitor_, h_sample_out, p_mtimpl_.get(), p_fmat, param, + h_out_position, *tree_it); + } else { + UpdateTree(&monitor_, h_sample_out, p_impl_.get(), p_fmat, param, + h_out_position, *tree_it); + } + } + } + + bool UpdatePredictionCache(const DMatrix *data, linalg::VectorView out_preds) override { + if (p_impl_) { + return p_impl_->UpdatePredictionCache(data, out_preds); + } else if (p_mtimpl_) { + // Not yet supported. + return false; + } else { + return false; + } + } + + [[nodiscard]] bool HasNodePosition() const override { return true; } +}; XGBOOST_REGISTER_TREE_UPDATER(QuantileHistMaker, "grow_quantile_histmaker") .describe("Grow tree using quantized histogram.") .set_body([](Context const *ctx, ObjInfo const *task) { - return new QuantileHistMaker(ctx, task); + return new QuantileHistMaker{ctx, task}; }); -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h deleted file mode 100644 index 138d5646a..000000000 --- a/src/tree/updater_quantile_hist.h +++ /dev/null @@ -1,133 +0,0 @@ -/*! 
- * Copyright 2017-2022 by XGBoost Contributors - * \file updater_quantile_hist.h - * \brief use quantized feature values to construct a tree - * \author Philip Cho, Tianqi Chen, Egor Smirnov - */ -#ifndef XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_ -#define XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_ - -#include - -#include -#include -#include -#include -#include -#include - -#include "xgboost/base.h" -#include "xgboost/data.h" -#include "xgboost/json.h" - -#include "hist/evaluate_splits.h" -#include "hist/histogram.h" -#include "hist/expand_entry.h" - -#include "common_row_partitioner.h" -#include "constraints.h" -#include "./param.h" -#include "./driver.h" -#include "../common/random.h" -#include "../common/timer.h" -#include "../common/hist_util.h" -#include "../common/row_set.h" -#include "../common/partition_builder.h" -#include "../common/column_matrix.h" - -namespace xgboost::tree { -inline BatchParam HistBatch(TrainParam const* param) { - return {param->max_bin, param->sparse_threshold}; -} - -/*! \brief construct a tree using quantized feature values */ -class QuantileHistMaker: public TreeUpdater { - public: - explicit QuantileHistMaker(Context const* ctx, ObjInfo const* task) - : TreeUpdater(ctx), task_{task} {} - void Configure(const Args&) override {} - - void Update(TrainParam const* param, HostDeviceVector* gpair, DMatrix* dmat, - common::Span> out_position, - const std::vector& trees) override; - - bool UpdatePredictionCache(const DMatrix *data, - linalg::VectorView out_preds) override; - - void LoadConfig(Json const&) override {} - void SaveConfig(Json*) const override {} - - [[nodiscard]] char const* Name() const override { return "grow_quantile_histmaker"; } - [[nodiscard]] bool HasNodePosition() const override { return true; } - - protected: - // actual builder that runs the algorithm - struct Builder { - public: - // constructor - explicit Builder(const size_t n_trees, TrainParam const* param, DMatrix const* fmat, - ObjInfo task, Context const* ctx) - : n_trees_(n_trees), - param_(param), - p_last_fmat_(fmat), - histogram_builder_{new HistogramBuilder}, - task_{task}, - ctx_{ctx}, - monitor_{std::make_unique()} { - monitor_->Init("Quantile::Builder"); - } - // update one tree, growing - void UpdateTree(HostDeviceVector* gpair, DMatrix* p_fmat, RegTree* p_tree, - HostDeviceVector* p_out_position); - - bool UpdatePredictionCache(DMatrix const* data, linalg::VectorView out_preds) const; - - private: - // initialize temp data structure - void InitData(DMatrix* fmat, const RegTree& tree, std::vector* gpair); - - size_t GetNumberOfTrees(); - - CPUExpandEntry InitRoot(DMatrix* p_fmat, RegTree* p_tree, - const std::vector& gpair_h); - - void BuildHistogram(DMatrix* p_fmat, RegTree* p_tree, - std::vector const& valid_candidates, - std::vector const& gpair); - - void LeafPartition(RegTree const& tree, common::Span gpair, - std::vector* p_out_position); - - void ExpandTree(DMatrix* p_fmat, RegTree* p_tree, const std::vector& gpair_h, - HostDeviceVector* p_out_position); - - private: - const size_t n_trees_; - TrainParam const* param_; - std::shared_ptr column_sampler_{ - std::make_shared()}; - - std::vector gpair_local_; - - std::unique_ptr> evaluator_; - std::vector partitioner_; - - // back pointers to tree and data matrix - const RegTree* p_last_tree_{nullptr}; - DMatrix const* const p_last_fmat_; - - std::unique_ptr> histogram_builder_; - ObjInfo task_; - // Context for number of threads - Context const* ctx_; - - std::unique_ptr monitor_; - }; - - protected: - std::unique_ptr pimpl_; - 
ObjInfo const* task_; -}; -} // namespace xgboost::tree - -#endif // XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_ diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc index 4bfe603e0..17c565490 100644 --- a/src/tree/updater_refresh.cc +++ b/src/tree/updater_refresh.cc @@ -50,11 +50,11 @@ class TreeRefresher : public TreeUpdater { int tid = omp_get_thread_num(); int num_nodes = 0; for (auto tree : trees) { - num_nodes += tree->param.num_nodes; + num_nodes += tree->NumNodes(); } stemp[tid].resize(num_nodes, GradStats()); std::fill(stemp[tid].begin(), stemp[tid].end(), GradStats()); - fvec_temp[tid].Init(trees[0]->param.num_feature); + fvec_temp[tid].Init(trees[0]->NumFeatures()); }); } exc.Rethrow(); @@ -77,7 +77,7 @@ class TreeRefresher : public TreeUpdater { for (auto tree : trees) { AddStats(*tree, feats, gpair_h, info, ridx, dmlc::BeginPtr(stemp[tid]) + offset); - offset += tree->param.num_nodes; + offset += tree->NumNodes(); } feats.Drop(inst); }); @@ -96,7 +96,7 @@ class TreeRefresher : public TreeUpdater { int offset = 0; for (auto tree : trees) { this->Refresh(param, dmlc::BeginPtr(stemp[0]) + offset, 0, tree); - offset += tree->param.num_nodes; + offset += tree->NumNodes(); } } diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh index 75a600d7a..7c8f5e505 100755 --- a/tests/buildkite/test-cpp-gpu.sh +++ b/tests/buildkite/test-cpp-gpu.sh @@ -12,13 +12,12 @@ tests/ci_build/ci_build.sh gpu nvidia-docker \ --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ build/testxgboost -# Disabled until https://github.com/dmlc/xgboost/issues/8619 is resolved -# echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" -# rm -rfv build/ -# buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm -# chmod +x build/testxgboost -# tests/ci_build/ci_build.sh rmm nvidia-docker \ -# --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ -# --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION bash -c \ -# --build-arg NCCL_VERSION_ARG=$NCCL_VERSION bash -c \ -# "source activate gpu_test && build/testxgboost --use-rmm-pool" +echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" +rm -rfv build/ +buildkite-agent artifact download "build/testxgboost" . 
--step build-cuda-with-rmm +chmod +x build/testxgboost +tests/ci_build/ci_build.sh rmm nvidia-docker \ + --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ + --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ + --build-arg NCCL_VERSION_ARG=$NCCL_VERSION bash -c \ + "source activate gpu_test && build/testxgboost --use-rmm-pool" diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index 8d601f355..b7864bb50 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -3,7 +3,7 @@ import os import subprocess import sys from multiprocessing import Pool, cpu_count -from typing import Dict, Optional, Tuple +from typing import Dict, Tuple from pylint import epylint from test_utils import PY_PACKAGE, ROOT, cd, print_time, record_time @@ -15,8 +15,11 @@ SRCPATH = os.path.normpath( @record_time -def run_black(rel_path: str) -> bool: - cmd = ["black", "-q", "--check", rel_path] +def run_black(rel_path: str, fix: bool) -> bool: + if fix: + cmd = ["black", "-q", rel_path] + else: + cmd = ["black", "-q", "--check", rel_path] ret = subprocess.run(cmd).returncode if ret != 0: subprocess.run(["black", "--version"]) @@ -31,8 +34,11 @@ Please run the following command on your machine to address the formatting error @record_time -def run_isort(rel_path: str) -> bool: - cmd = ["isort", f"--src={SRCPATH}", "--check", "--profile=black", rel_path] +def run_isort(rel_path: str, fix: bool) -> bool: + if fix: + cmd = ["isort", f"--src={SRCPATH}", "--profile=black", rel_path] + else: + cmd = ["isort", f"--src={SRCPATH}", "--check", "--profile=black", rel_path] ret = subprocess.run(cmd).returncode if ret != 0: subprocess.run(["isort", "--version"]) @@ -132,7 +138,7 @@ def run_pylint() -> bool: def main(args: argparse.Namespace) -> None: if args.format == 1: black_results = [ - run_black(path) + run_black(path, args.fix) for path in [ # core "python-package/", @@ -166,7 +172,7 @@ def main(args: argparse.Namespace) -> None: sys.exit(-1) isort_results = [ - run_isort(path) + run_isort(path, args.fix) for path in [ # core "python-package/", @@ -230,6 +236,11 @@ if __name__ == "__main__": parser.add_argument("--format", type=int, choices=[0, 1], default=1) parser.add_argument("--type-check", type=int, choices=[0, 1], default=1) parser.add_argument("--pylint", type=int, choices=[0, 1], default=1) + parser.add_argument( + "--fix", + action="store_true", + help="Fix the formatting issues instead of emitting an error.", + ) args = parser.parse_args() try: main(args) diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu index 47de054c6..8ce877aef 100644 --- a/tests/cpp/collective/test_nccl_device_communicator.cu +++ b/tests/cpp/collective/test_nccl_device_communicator.cu @@ -1,10 +1,12 @@ -/*! 
- * Copyright 2022 XGBoost contributors +/** + * Copyright 2022-2023, XGBoost contributors */ #ifdef XGBOOST_USE_NCCL #include +#include // for string + #include "../../../src/collective/nccl_device_communicator.cuh" namespace xgboost { @@ -20,7 +22,15 @@ TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) { EXPECT_THROW(construct(), dmlc::Error); } +TEST(NcclDeviceCommunicatorSimpleTest, SystemError) { + try { + dh::safe_nccl(ncclSystemError); + } catch (dmlc::Error const& e) { + auto str = std::string{e.what()}; + ASSERT_TRUE(str.find("environment variables") != std::string::npos); + } +} } // namespace collective } // namespace xgboost -#endif +#endif // XGBOOST_USE_NCCL diff --git a/tests/cpp/common/test_partition_builder.cc b/tests/cpp/common/test_partition_builder.cc index 093f87708..08dd345f2 100644 --- a/tests/cpp/common/test_partition_builder.cc +++ b/tests/cpp/common/test_partition_builder.cc @@ -1,79 +1,79 @@ -#include -#include -#include -#include - -#include "../../../src/common/row_set.h" -#include "../../../src/common/partition_builder.h" -#include "../helpers.h" - -namespace xgboost { -namespace common { - -TEST(PartitionBuilder, BasicTest) { - constexpr size_t kBlockSize = 16; - constexpr size_t kNodes = 5; - constexpr size_t kTasks = 3 + 5 + 10 + 1 + 2; - - std::vector tasks = { 3, 5, 10, 1, 2 }; - - PartitionBuilder builder; - builder.Init(kTasks, kNodes, [&](size_t i) { - return tasks[i]; - }); - - std::vector rows_for_left_node = { 2, 12, 0, 16, 8 }; - - for(size_t nid = 0; nid < kNodes; ++nid) { - size_t value_left = 0; - size_t value_right = 0; - - size_t left_total = tasks[nid] * rows_for_left_node[nid]; - - for(size_t j = 0; j < tasks[nid]; ++j) { - size_t begin = kBlockSize*j; - size_t end = kBlockSize*(j+1); - const size_t id = builder.GetTaskIdx(nid, begin); - builder.AllocateForTask(id); - - auto left = builder.GetLeftBuffer(nid, begin, end); - auto right = builder.GetRightBuffer(nid, begin, end); - - size_t n_left = rows_for_left_node[nid]; - size_t n_right = kBlockSize - rows_for_left_node[nid]; - - for(size_t i = 0; i < n_left; i++) { - left[i] = value_left++; - } - - for(size_t i = 0; i < n_right; i++) { - right[i] = left_total + value_right++; - } - - builder.SetNLeftElems(nid, begin, n_left); - builder.SetNRightElems(nid, begin, n_right); - } - } - builder.CalculateRowOffsets(); - - std::vector v(*std::max_element(tasks.begin(), tasks.end()) * kBlockSize); - - for(size_t nid = 0; nid < kNodes; ++nid) { - - for(size_t j = 0; j < tasks[nid]; ++j) { - builder.MergeToArray(nid, kBlockSize*j, v.data()); - } - - for(size_t j = 0; j < tasks[nid] * kBlockSize; ++j) { - ASSERT_EQ(v[j], j); - } - size_t n_left = builder.GetNLeftElems(nid); - size_t n_right = builder.GetNRightElems(nid); - - ASSERT_EQ(n_left, rows_for_left_node[nid] * tasks[nid]); - ASSERT_EQ(n_right, (kBlockSize - rows_for_left_node[nid]) * tasks[nid]); - } -} - -} // namespace common -} // namespace xgboost +/** + * Copyright 2020-2023 by XGBoost contributors + */ +#include + +#include +#include +#include + +#include "../../../src/common/partition_builder.h" +#include "../../../src/common/row_set.h" +#include "../helpers.h" + +namespace xgboost::common { +TEST(PartitionBuilder, BasicTest) { + constexpr size_t kBlockSize = 16; + constexpr size_t kNodes = 5; + constexpr size_t kTasks = 3 + 5 + 10 + 1 + 2; + + std::vector tasks = { 3, 5, 10, 1, 2 }; + + PartitionBuilder builder; + builder.Init(kTasks, kNodes, [&](size_t i) { + return tasks[i]; + }); + + std::vector rows_for_left_node = 
{ 2, 12, 0, 16, 8 }; + + for(size_t nid = 0; nid < kNodes; ++nid) { + size_t value_left = 0; + size_t value_right = 0; + + size_t left_total = tasks[nid] * rows_for_left_node[nid]; + + for(size_t j = 0; j < tasks[nid]; ++j) { + size_t begin = kBlockSize*j; + size_t end = kBlockSize*(j+1); + const size_t id = builder.GetTaskIdx(nid, begin); + builder.AllocateForTask(id); + + auto left = builder.GetLeftBuffer(nid, begin, end); + auto right = builder.GetRightBuffer(nid, begin, end); + + size_t n_left = rows_for_left_node[nid]; + size_t n_right = kBlockSize - rows_for_left_node[nid]; + + for(size_t i = 0; i < n_left; i++) { + left[i] = value_left++; + } + + for(size_t i = 0; i < n_right; i++) { + right[i] = left_total + value_right++; + } + + builder.SetNLeftElems(nid, begin, n_left); + builder.SetNRightElems(nid, begin, n_right); + } + } + builder.CalculateRowOffsets(); + + std::vector v(*std::max_element(tasks.begin(), tasks.end()) * kBlockSize); + + for(size_t nid = 0; nid < kNodes; ++nid) { + + for(size_t j = 0; j < tasks[nid]; ++j) { + builder.MergeToArray(nid, kBlockSize*j, v.data()); + } + + for(size_t j = 0; j < tasks[nid] * kBlockSize; ++j) { + ASSERT_EQ(v[j], j); + } + size_t n_left = builder.GetNLeftElems(nid); + size_t n_right = builder.GetNRightElems(nid); + + ASSERT_EQ(n_left, rows_for_left_node[nid] * tasks[nid]); + ASSERT_EQ(n_right, (kBlockSize - rows_for_left_node[nid]) * tasks[nid]); + } +} +} // namespace xgboost::common diff --git a/tests/cpp/common/test_ranking_utils.cc b/tests/cpp/common/test_ranking_utils.cc index c73cffed7..919102278 100644 --- a/tests/cpp/common/test_ranking_utils.cc +++ b/tests/cpp/common/test_ranking_utils.cc @@ -1,16 +1,25 @@ /** * Copyright 2023 by XGBoost Contributors */ -#include // for Test, AssertionResult, Message, TestPartR... -#include // for ASSERT_NEAR, ASSERT_T... -#include // for Args +#include "test_ranking_utils.h" + +#include +#include // for Args, bst_group_t, kRtEps #include // for Context +#include // for MetaInfo, DMatrix +#include // for HostDeviceVector +#include // for Error #include // for StringView +#include // for size_t #include // for uint32_t -#include // for pair +#include // for iota +#include // for move +#include // for vector +#include "../../../src/common/numeric.h" // for Iota #include "../../../src/common/ranking_utils.h" // for LambdaRankParam, ParseMetricName, MakeMet... +#include "../helpers.h" // for EmptyDMatrix namespace xgboost::ltr { TEST(RankingUtils, LambdaRankParam) { @@ -66,4 +75,138 @@ TEST(RankingUtils, MakeMetricName) { name = MakeMetricName("map", 2, false); ASSERT_EQ(name, "map@2"); } + +void TestRankingCache(Context const* ctx) { + auto p_fmat = EmptyDMatrix(); + MetaInfo& info = p_fmat->Info(); + + info.num_row_ = 16; + info.labels.Reshape(info.num_row_); + auto& h_label = info.labels.Data()->HostVector(); + for (std::size_t i = 0; i < h_label.size(); ++i) { + h_label[i] = i % 2; + } + + LambdaRankParam param; + param.UpdateAllowUnknown(Args{}); + + RankingCache cache{ctx, info, param}; + + HostDeviceVector predt(info.num_row_, 0); + auto& h_predt = predt.HostVector(); + std::iota(h_predt.begin(), h_predt.end(), 0.0f); + predt.SetDevice(ctx->gpu_id); + + auto rank_idx = + cache.SortedIdx(ctx, ctx->IsCPU() ? 
predt.ConstHostSpan() : predt.ConstDeviceSpan()); + + for (std::size_t i = 0; i < rank_idx.size(); ++i) { + ASSERT_EQ(rank_idx[i], rank_idx.size() - i - 1); + } +} + +TEST(RankingCache, InitFromCPU) { + Context ctx; + TestRankingCache(&ctx); +} + +void TestNDCGCache(Context const* ctx) { + auto p_fmat = EmptyDMatrix(); + MetaInfo& info = p_fmat->Info(); + LambdaRankParam param; + param.UpdateAllowUnknown(Args{}); + + { + // empty + NDCGCache cache{ctx, info, param}; + ASSERT_EQ(cache.DataGroupPtr(ctx).size(), 2); + } + + info.num_row_ = 3; + info.group_ptr_ = {static_cast(0), static_cast(info.num_row_)}; + + { + auto fail = [&]() { NDCGCache cache{ctx, info, param}; }; + // empty label + ASSERT_THROW(fail(), dmlc::Error); + info.labels = linalg::Matrix{{0.0f, 0.1f, 0.2f}, {3}, Context::kCpuId}; + // invalid label + ASSERT_THROW(fail(), dmlc::Error); + auto h_labels = info.labels.HostView(); + for (std::size_t i = 0; i < h_labels.Size(); ++i) { + h_labels(i) *= 10; + } + param.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}}); + NDCGCache cache{ctx, info, param}; + Context cpuctx; + auto inv_idcg = cache.InvIDCG(&cpuctx); + ASSERT_EQ(inv_idcg.Size(), 1); + ASSERT_NEAR(1.0 / inv_idcg(0), 2.63093, kRtEps); + } + + { + param.UpdateAllowUnknown(Args{{"lambdarank_unbiased", "false"}}); + + std::vector h_data(32); + + common::Iota(ctx, h_data.begin(), h_data.end(), 0.0f); + info.labels.Reshape(h_data.size()); + info.num_row_ = h_data.size(); + info.group_ptr_.back() = info.num_row_; + info.labels.Data()->HostVector() = std::move(h_data); + + { + NDCGCache cache{ctx, info, param}; + Context cpuctx; + auto inv_idcg = cache.InvIDCG(&cpuctx); + ASSERT_NEAR(inv_idcg(0), 0.00551782, kRtEps); + } + + param.UpdateAllowUnknown( + Args{{"lambdarank_num_pair_per_sample", "3"}, {"lambdarank_pair_method", "topk"}}); + { + NDCGCache cache{ctx, info, param}; + Context cpuctx; + auto inv_idcg = cache.InvIDCG(&cpuctx); + ASSERT_NEAR(inv_idcg(0), 0.01552123, kRtEps); + } + } +} + +TEST(NDCGCache, InitFromCPU) { + Context ctx; + TestNDCGCache(&ctx); +} + +void TestMAPCache(Context const* ctx) { + auto p_fmat = EmptyDMatrix(); + MetaInfo& info = p_fmat->Info(); + LambdaRankParam param; + param.UpdateAllowUnknown(Args{}); + + std::vector h_data(32); + + common::Iota(ctx, h_data.begin(), h_data.end(), 0.0f); + info.labels.Reshape(h_data.size()); + info.num_row_ = h_data.size(); + info.labels.Data()->HostVector() = std::move(h_data); + + auto fail = [&]() { std::make_shared(ctx, info, param); }; + // binary label + ASSERT_THROW(fail(), dmlc::Error); + + h_data = std::vector(32, 0.0f); + h_data[1] = 1.0f; + info.labels.Data()->HostVector() = h_data; + auto p_cache = std::make_shared(ctx, info, param); + + ASSERT_EQ(p_cache->Acc(ctx).size(), info.num_row_); + ASSERT_EQ(p_cache->NumRelevant(ctx).size(), info.num_row_); +} + +TEST(MAPCache, InitFromCPU) { + Context ctx; + ctx.Init(Args{}); + TestMAPCache(&ctx); +} } // namespace xgboost::ltr diff --git a/tests/cpp/common/test_ranking_utils.cu b/tests/cpp/common/test_ranking_utils.cu new file mode 100644 index 000000000..db0ff3b66 --- /dev/null +++ b/tests/cpp/common/test_ranking_utils.cu @@ -0,0 +1,104 @@ +/** + * Copyright 2023 by XGBoost Contributors + */ +#include +#include // for Args, XGBOOST_DEVICE, bst_group_t, kRtEps +#include // for Context +#include // for MakeTensorView, Vector + +#include // for size_t +#include // for shared_ptr +#include // for iota +#include // for vector + +#include "../../../src/common/algorithm.cuh" // for SegmentedSequence +#include 
"../../../src/common/cuda_context.cuh" // for CUDAContext +#include "../../../src/common/device_helpers.cuh" // for device_vector, ToSpan +#include "../../../src/common/ranking_utils.cuh" // for CalcQueriesInvIDCG +#include "../../../src/common/ranking_utils.h" // for LambdaRankParam, RankingCache +#include "../helpers.h" // for EmptyDMatrix +#include "test_ranking_utils.h" // for TestNDCGCache +#include "xgboost/data.h" // for MetaInfo +#include "xgboost/host_device_vector.h" // for HostDeviceVector + +namespace xgboost::ltr { +void TestCalcQueriesInvIDCG() { + Context ctx; + ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}}); + std::size_t n_groups = 5, n_samples_per_group = 32; + + dh::device_vector scores(n_samples_per_group * n_groups); + dh::device_vector group_ptr(n_groups + 1); + auto d_group_ptr = dh::ToSpan(group_ptr); + dh::LaunchN(d_group_ptr.size(), ctx.CUDACtx()->Stream(), + [=] XGBOOST_DEVICE(std::size_t i) { d_group_ptr[i] = i * n_samples_per_group; }); + + auto d_scores = dh::ToSpan(scores); + common::SegmentedSequence(&ctx, d_group_ptr, d_scores); + + linalg::Vector inv_IDCG({n_groups}, ctx.gpu_id); + + ltr::LambdaRankParam p; + p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}}); + + cuda_impl::CalcQueriesInvIDCG(&ctx, linalg::MakeTensorView(&ctx, d_scores, d_scores.size()), + dh::ToSpan(group_ptr), inv_IDCG.View(ctx.gpu_id), p); + for (std::size_t i = 0; i < n_groups; ++i) { + double inv_idcg = inv_IDCG(i); + ASSERT_NEAR(inv_idcg, 0.00551782, kRtEps); + } +} + +TEST(RankingUtils, CalcQueriesInvIDCG) { TestCalcQueriesInvIDCG(); } + +namespace { +void TestRankingCache(Context const* ctx) { + auto p_fmat = EmptyDMatrix(); + MetaInfo& info = p_fmat->Info(); + + info.num_row_ = 16; + info.labels.Reshape(info.num_row_); + auto& h_label = info.labels.Data()->HostVector(); + for (std::size_t i = 0; i < h_label.size(); ++i) { + h_label[i] = i % 2; + } + + LambdaRankParam param; + param.UpdateAllowUnknown(Args{}); + + RankingCache cache{ctx, info, param}; + + HostDeviceVector predt(info.num_row_, 0); + auto& h_predt = predt.HostVector(); + std::iota(h_predt.begin(), h_predt.end(), 0.0f); + predt.SetDevice(ctx->gpu_id); + + auto rank_idx = + cache.SortedIdx(ctx, ctx->IsCPU() ? 
predt.ConstHostSpan() : predt.ConstDeviceSpan()); + + std::vector h_rank_idx(rank_idx.size()); + dh::CopyDeviceSpanToVector(&h_rank_idx, rank_idx); + for (std::size_t i = 0; i < rank_idx.size(); ++i) { + ASSERT_EQ(h_rank_idx[i], h_rank_idx.size() - i - 1); + } +} +} // namespace + +TEST(RankingCache, InitFromGPU) { + Context ctx; + ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}}); + TestRankingCache(&ctx); +} + +TEST(NDCGCache, InitFromGPU) { + Context ctx; + ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}}); + TestNDCGCache(&ctx); +} + +TEST(MAPCache, InitFromGPU) { + Context ctx; + ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}}); + TestMAPCache(&ctx); +} +} // namespace xgboost::ltr diff --git a/tests/cpp/common/test_ranking_utils.h b/tests/cpp/common/test_ranking_utils.h new file mode 100644 index 000000000..8ff92df9a --- /dev/null +++ b/tests/cpp/common/test_ranking_utils.h @@ -0,0 +1,11 @@ +/** + * Copyright 2023 by XGBoost Contributors + */ +#pragma once +#include // for Context + +namespace xgboost::ltr { +void TestNDCGCache(Context const* ctx); + +void TestMAPCache(Context const* ctx); +} // namespace xgboost::ltr diff --git a/tests/cpp/data/test_data.cc b/tests/cpp/data/test_data.cc index c37328192..99cd72cc0 100644 --- a/tests/cpp/data/test_data.cc +++ b/tests/cpp/data/test_data.cc @@ -112,31 +112,12 @@ TEST(SparsePage, SortIndices) { } TEST(DMatrix, Uri) { - size_t constexpr kRows {16}; - size_t constexpr kCols {8}; - std::vector data (kRows * kCols); - - for (size_t i = 0; i < kRows * kCols; ++i) { - data[i] = i; - } + auto constexpr kRows {16}; + auto constexpr kCols {8}; dmlc::TemporaryDirectory tmpdir; - std::string path = tmpdir.path + "/small.csv"; - - std::ofstream fout(path); - size_t i = 0; - for (size_t r = 0; r < kRows; ++r) { - for (size_t c = 0; c < kCols; ++c) { - fout << data[i]; - i++; - if (c != kCols - 1) { - fout << ","; - } - } - fout << "\n"; - } - fout.flush(); - fout.close(); + auto const path = tmpdir.path + "/small.csv"; + CreateTestCSV(path, kRows, kCols); std::unique_ptr dmat; // FIXME(trivialfis): Enable the following test by restricting csv parser in dmlc-core. diff --git a/tests/cpp/data/test_file_iterator.cc b/tests/cpp/data/test_file_iterator.cc index 21029620b..31da2c1fa 100644 --- a/tests/cpp/data/test_file_iterator.cc +++ b/tests/cpp/data/test_file_iterator.cc @@ -1,8 +1,9 @@ -/*! 
- * Copyright 2021 XGBoost contributors +/** + * Copyright 2021-2023 XGBoost contributors */ #include +#include // for any_cast #include #include "../../../src/data/adapter.h" @@ -11,15 +12,14 @@ #include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" -namespace xgboost { -namespace data { +namespace xgboost::data { TEST(FileIterator, Basic) { auto check_n_features = [](FileIterator *iter) { size_t n_features = 0; iter->Reset(); while (iter->Next()) { auto proxy = MakeProxy(iter->Proxy()); - auto csr = dmlc::get>(proxy->Adapter()); + auto csr = std::any_cast>(proxy->Adapter()); n_features = std::max(n_features, csr->NumColumns()); } ASSERT_EQ(n_features, 5); @@ -42,5 +42,4 @@ TEST(FileIterator, Basic) { check_n_features(&iter); } } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/tests/cpp/data/test_proxy_dmatrix.cu b/tests/cpp/data/test_proxy_dmatrix.cu index e13cb54f1..ab38f51bb 100644 --- a/tests/cpp/data/test_proxy_dmatrix.cu +++ b/tests/cpp/data/test_proxy_dmatrix.cu @@ -1,23 +1,24 @@ +/** + * Copyright 2020-2023 XGBoost contributors + */ #include #include + +#include // for any_cast #include -#include "../helpers.h" #include "../../../src/data/device_adapter.cuh" #include "../../../src/data/proxy_dmatrix.h" +#include "../helpers.h" -namespace xgboost { -namespace data { +namespace xgboost::data { TEST(ProxyDMatrix, DeviceData) { constexpr size_t kRows{100}, kCols{100}; HostDeviceVector storage; - auto data = RandomDataGenerator(kRows, kCols, 0.5) - .Device(0) - .GenerateArrayInterface(&storage); + auto data = RandomDataGenerator(kRows, kCols, 0.5).Device(0).GenerateArrayInterface(&storage); std::vector> label_storage(1); - auto labels = RandomDataGenerator(kRows, 1, 0) - .Device(0) - .GenerateColumnarArrayInterface(&label_storage); + auto labels = + RandomDataGenerator(kRows, 1, 0).Device(0).GenerateColumnarArrayInterface(&label_storage); DMatrixProxy proxy; proxy.SetCUDAArray(data.c_str()); @@ -25,23 +26,16 @@ TEST(ProxyDMatrix, DeviceData) { ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr)); ASSERT_EQ(proxy.Info().labels.Size(), kRows); - ASSERT_EQ(dmlc::get>(proxy.Adapter())->NumRows(), - kRows); - ASSERT_EQ( - dmlc::get>(proxy.Adapter())->NumColumns(), - kCols); + ASSERT_EQ(std::any_cast>(proxy.Adapter())->NumRows(), kRows); + ASSERT_EQ(std::any_cast>(proxy.Adapter())->NumColumns(), kCols); std::vector> columnar_storage(kCols); data = RandomDataGenerator(kRows, kCols, 0) - .Device(0) - .GenerateColumnarArrayInterface(&columnar_storage); + .Device(0) + .GenerateColumnarArrayInterface(&columnar_storage); proxy.SetCUDAArray(data.c_str()); ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr)); - ASSERT_EQ(dmlc::get>(proxy.Adapter())->NumRows(), - kRows); - ASSERT_EQ( - dmlc::get>(proxy.Adapter())->NumColumns(), - kCols); + ASSERT_EQ(std::any_cast>(proxy.Adapter())->NumRows(), kRows); + ASSERT_EQ(std::any_cast>(proxy.Adapter())->NumColumns(), kCols); } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index c99adc06e..916c126d4 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -412,7 +412,7 @@ std::pair TestModelSlice(std::string booster) { j++; } - // CHECK sliced model doesn't have dependency on old one + // CHECK sliced model doesn't have dependency on the old one learner.reset(); CHECK_EQ(sliced->GetNumFeature(), kCols); diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc 
index e2d645f93..ff27da5eb 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -65,6 +65,29 @@ void CreateBigTestData(const std::string& filename, size_t n_entries, bool zero_ } } +void CreateTestCSV(std::string const& path, size_t rows, size_t cols) { + std::vector data(rows * cols); + + for (size_t i = 0; i < rows * cols; ++i) { + data[i] = i; + } + + std::ofstream fout(path); + size_t i = 0; + for (size_t r = 0; r < rows; ++r) { + for (size_t c = 0; c < cols; ++c) { + fout << data[i]; + i++; + if (c != cols - 1) { + fout << ","; + } + } + fout << "\n"; + } + fout.flush(); + fout.close(); +} + void CheckObjFunctionImpl(std::unique_ptr const& obj, std::vector preds, std::vector labels, @@ -224,19 +247,18 @@ std::string RandomDataGenerator::GenerateArrayInterface( return out; } -std::pair, std::string> -RandomDataGenerator::GenerateArrayInterfaceBatch( - HostDeviceVector *storage, size_t batches) const { - this->GenerateDense(storage); +std::pair, std::string> MakeArrayInterfaceBatch( + HostDeviceVector const* storage, std::size_t n_samples, bst_feature_t n_features, + std::size_t batches, std::int32_t device) { std::vector result(batches); std::vector objects; - size_t const rows_per_batch = rows_ / batches; + size_t const rows_per_batch = n_samples / batches; - auto make_interface = [storage, this](size_t offset, size_t rows) { + auto make_interface = [storage, device, n_features](std::size_t offset, std::size_t rows) { Json array_interface{Object()}; array_interface["data"] = std::vector(2); - if (device_ >= 0) { + if (device >= 0) { array_interface["data"][0] = Integer(reinterpret_cast(storage->DevicePointer() + offset)); array_interface["stream"] = Null{}; @@ -249,22 +271,22 @@ RandomDataGenerator::GenerateArrayInterfaceBatch( array_interface["shape"] = std::vector(2); array_interface["shape"][0] = rows; - array_interface["shape"][1] = cols_; + array_interface["shape"][1] = n_features; array_interface["typestr"] = String(", std::string> RandomDataGenerator::GenerateArrayInterfaceBatch( + HostDeviceVector* storage, size_t batches) const { + this->GenerateDense(storage); + return MakeArrayInterfaceBatch(storage, rows_, cols_, batches, device_); +} + std::string RandomDataGenerator::GenerateColumnarArrayInterface( std::vector> *data) const { CHECK(data); @@ -400,11 +428,14 @@ int NumpyArrayIterForTest::Next() { return 1; } -std::shared_ptr -GetDMatrixFromData(const std::vector &x, int num_rows, int num_columns){ +std::shared_ptr GetDMatrixFromData(const std::vector& x, std::size_t num_rows, + bst_feature_t num_columns) { data::DenseAdapter adapter(x.data(), num_rows, num_columns); - return std::shared_ptr(new data::SimpleDMatrix( - &adapter, std::numeric_limits::quiet_NaN(), 1)); + auto p_fmat = std::shared_ptr( + new data::SimpleDMatrix(&adapter, std::numeric_limits::quiet_NaN(), 1)); + CHECK_EQ(p_fmat->Info().num_row_, num_rows); + CHECK_EQ(p_fmat->Info().num_col_, num_columns); + return p_fmat; } std::unique_ptr CreateSparsePageDMatrix(bst_row_t n_samples, bst_feature_t n_features, @@ -572,12 +603,23 @@ std::unique_ptr CreateTrainedGBM(std::string name, Args kwargs, return gbm; } -ArrayIterForTest::ArrayIterForTest(float sparsity, size_t rows, size_t cols, - size_t batches) : rows_{rows}, cols_{cols}, n_batches_{batches} { +ArrayIterForTest::ArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches) + : rows_{rows}, cols_{cols}, n_batches_{batches} { XGProxyDMatrixCreate(&proxy_); rng_.reset(new RandomDataGenerator{rows_, cols_, sparsity}); + 
std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_); +} + +ArrayIterForTest::ArrayIterForTest(Context const* ctx, HostDeviceVector const& data, + std::size_t n_samples, bst_feature_t n_features, + std::size_t n_batches) + : rows_{n_samples}, cols_{n_features}, n_batches_{n_batches} { + XGProxyDMatrixCreate(&proxy_); + this->data_.Resize(data.Size()); + CHECK_EQ(this->data_.Size(), rows_ * cols_ * n_batches); + this->data_.Copy(data); std::tie(batches_, interface_) = - rng_->GenerateArrayInterfaceBatch(&data_, n_batches_); + MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, ctx->gpu_id); } ArrayIterForTest::~ArrayIterForTest() { XGDMatrixFree(proxy_); } diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 1baa096cf..7f1720068 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -59,6 +59,8 @@ void CreateSimpleTestData(const std::string& filename); // 0-based indexing. void CreateBigTestData(const std::string& filename, size_t n_entries, bool zero_based = true); +void CreateTestCSV(std::string const& path, size_t rows, size_t cols); + void CheckObjFunction(std::unique_ptr const& obj, std::vector preds, std::vector labels, @@ -188,7 +190,7 @@ class SimpleRealUniformDistribution { }; template -Json GetArrayInterface(HostDeviceVector *storage, size_t rows, size_t cols) { +Json GetArrayInterface(HostDeviceVector const* storage, size_t rows, size_t cols) { Json array_interface{Object()}; array_interface["data"] = std::vector(2); if (storage->DeviceCanRead()) { @@ -318,8 +320,8 @@ GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) { return x; } -std::shared_ptr GetDMatrixFromData(const std::vector &x, - int num_rows, int num_columns); +std::shared_ptr GetDMatrixFromData(const std::vector& x, std::size_t num_rows, + bst_feature_t num_columns); /** * \brief Create Sparse Page using data iterator. @@ -394,7 +396,7 @@ typedef void *DMatrixHandle; // NOLINT(*); class ArrayIterForTest { protected: HostDeviceVector data_; - size_t iter_ {0}; + size_t iter_{0}; DMatrixHandle proxy_; std::unique_ptr rng_; @@ -418,6 +420,11 @@ class ArrayIterForTest { auto Proxy() -> decltype(proxy_) { return proxy_; } explicit ArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches); + /** + * \brief Create iterator with user provided data. 
+ */ + explicit ArrayIterForTest(Context const* ctx, HostDeviceVector const& data, + std::size_t n_samples, bst_feature_t n_features, std::size_t n_batches); virtual ~ArrayIterForTest(); }; @@ -433,6 +440,10 @@ class NumpyArrayIterForTest : public ArrayIterForTest { public: explicit NumpyArrayIterForTest(float sparsity, size_t rows = Rows(), size_t cols = Cols(), size_t batches = Batches()); + explicit NumpyArrayIterForTest(Context const* ctx, HostDeviceVector const& data, + std::size_t n_samples, bst_feature_t n_features, + std::size_t n_batches) + : ArrayIterForTest{ctx, data, n_samples, n_features, n_batches} {} int Next() override; ~NumpyArrayIterForTest() override = default; }; @@ -462,7 +473,7 @@ inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint int32_t device = Context::kCpuId) { size_t shape[1]{1}; LearnerModelParam mparam(n_features, linalg::Tensor{{base_score}, shape, device}, - n_groups, 1, MultiStrategy::kComposite); + n_groups, 1, MultiStrategy::kOneOutputPerTree); return mparam; } diff --git a/tests/cpp/metric/test_rank_metric.cc b/tests/cpp/metric/test_rank_metric.cc index faad00455..fa506a412 100644 --- a/tests/cpp/metric/test_rank_metric.cc +++ b/tests/cpp/metric/test_rank_metric.cc @@ -1,7 +1,20 @@ -// Copyright by Contributors -#include +/** + * Copyright 2016-2023 by XGBoost Contributors + */ +#include // for Test, EXPECT_NEAR, ASSERT_STREQ +#include // for Context +#include // for MetaInfo, DMatrix +#include // for Matrix +#include // for Metric -#include "../helpers.h" +#include // for max +#include // for unique_ptr +#include // for vector + +#include "../helpers.h" // for GetMetricEval, CreateEmptyGe... +#include "xgboost/base.h" // for bst_float, kRtEps +#include "xgboost/host_device_vector.h" // for HostDeviceVector +#include "xgboost/json.h" // for Json, String, Object #if !defined(__CUDACC__) && !defined(__HIP_PLATFORM_AMD__) TEST(Metric, AMS) { @@ -51,15 +64,17 @@ TEST(Metric, DeclareUnifiedTest(Precision)) { delete metric; } +namespace xgboost { +namespace metric { TEST(Metric, DeclareUnifiedTest(NDCG)) { - auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX); - xgboost::Metric * metric = xgboost::Metric::Create("ndcg", &ctx); + auto ctx = CreateEmptyGenericParam(GPUIDX); + Metric * metric = xgboost::Metric::Create("ndcg", &ctx); ASSERT_STREQ(metric->Name(), "ndcg"); EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {})); - EXPECT_NEAR(GetMetricEval(metric, + ASSERT_NEAR(GetMetricEval(metric, xgboost::HostDeviceVector{}, {}), 1, 1e-10); - EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10); + ASSERT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10); EXPECT_NEAR(GetMetricEval(metric, {0.1f, 0.9f, 0.1f, 0.9f}, { 0, 0, 1, 1}), @@ -80,7 +95,7 @@ TEST(Metric, DeclareUnifiedTest(NDCG)) { EXPECT_NEAR(GetMetricEval(metric, xgboost::HostDeviceVector{}, {}), 0, 1e-10); - EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10); + ASSERT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1.f, 1e-10); EXPECT_NEAR(GetMetricEval(metric, {0.1f, 0.9f, 0.1f, 0.9f}, { 0, 0, 1, 1}), @@ -91,29 +106,30 @@ TEST(Metric, DeclareUnifiedTest(NDCG)) { EXPECT_NEAR(GetMetricEval(metric, xgboost::HostDeviceVector{}, {}), 0, 1e-10); - EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10); + EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1.f, 1e-10); EXPECT_NEAR(GetMetricEval(metric, {0.1f, 0.9f, 0.1f, 0.9f}, { 0, 0, 1, 1}), - 0.6509f, 0.001f); + 0.6509f, 0.001f); delete metric; metric = xgboost::Metric::Create("ndcg@2-", &ctx); 
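// Per the XGBoost docs, the trailing "-" in a ranking metric name ("ndcg@2-") makes NDCG/MAP score a list without any positive samples as 0 instead of the default 1.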
ASSERT_STREQ(metric->Name(), "ndcg@2-"); - EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10); + EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1.f, 1e-10); EXPECT_NEAR(GetMetricEval(metric, {0.1f, 0.9f, 0.1f, 0.9f}, { 0, 0, 1, 1}), - 0.3868f, 0.001f); + 1.f - 0.3868f, 0.001f); delete metric; } TEST(Metric, DeclareUnifiedTest(MAP)) { auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX); - xgboost::Metric * metric = xgboost::Metric::Create("map", &ctx); + Metric * metric = xgboost::Metric::Create("map", &ctx); ASSERT_STREQ(metric->Name(), "map"); - EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10); + EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, kRtEps); + EXPECT_NEAR(GetMetricEval(metric, {0.1f, 0.9f, 0.1f, 0.9f}, { 0, 0, 1, 1}), @@ -125,7 +141,7 @@ TEST(Metric, DeclareUnifiedTest(MAP)) { // Rank metric with group info EXPECT_NEAR(GetMetricEval(metric, {0.1f, 0.9f, 0.2f, 0.8f, 0.4f, 1.7f}, - {2, 7, 1, 0, 5, 0}, // Labels + {1, 1, 1, 0, 1, 0}, // Labels {}, // Weights {0, 2, 5, 6}), // Group info 0.8611f, 0.001f); @@ -154,3 +170,39 @@ TEST(Metric, DeclareUnifiedTest(MAP)) { 0.25f, 0.001f); delete metric; } + +TEST(Metric, DeclareUnifiedTest(NDCGExpGain)) { + Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX); + + auto p_fmat = xgboost::RandomDataGenerator{0, 0, 0}.GenerateDMatrix(); + MetaInfo& info = p_fmat->Info(); + info.labels = linalg::Matrix<float>{{10.0f, 0.0f, 0.0f, 1.0f, 5.0f}, {5}, ctx.gpu_id}; + info.num_row_ = info.labels.Shape(0); + info.group_ptr_.resize(2); + info.group_ptr_[0] = 0; + info.group_ptr_[1] = info.num_row_; + HostDeviceVector<float> predt{{0.1f, 0.2f, 0.3f, 4.0f, 70.0f}}; + + std::unique_ptr<Metric> metric{Metric::Create("ndcg", &ctx)}; + Json config{Object{}}; + config["name"] = String{"ndcg"}; + config["lambdarank_param"] = Object{}; + config["lambdarank_param"]["ndcg_exp_gain"] = String{"true"}; + config["lambdarank_param"]["lambdarank_num_pair_per_sample"] = String{"32"}; + metric->LoadConfig(config); + + auto ndcg = metric->Evaluate(predt, p_fmat); + ASSERT_NEAR(ndcg, 0.409738f, kRtEps); + + config["lambdarank_param"]["ndcg_exp_gain"] = String{"false"}; + metric->LoadConfig(config); + + ndcg = metric->Evaluate(predt, p_fmat); + ASSERT_NEAR(ndcg, 0.695694f, kRtEps); + + predt.HostVector() = info.labels.Data()->HostVector(); + ndcg = metric->Evaluate(predt, p_fmat); + ASSERT_NEAR(ndcg, 1.0, kRtEps); +} +} // namespace metric +} // namespace xgboost diff --git a/tests/cpp/plugin/helpers.cc b/tests/cpp/plugin/helpers.cc deleted file mode 100644 index a70479b1b..000000000 --- a/tests/cpp/plugin/helpers.cc +++ /dev/null @@ -1,19 +0,0 @@ -#include -#include -#include -#include - -#include "helpers.h" - -using namespace std::chrono_literals; - -int GenerateRandomPort(int low, int high) { - // Ensure unique timestamp by introducing a small artificial delay - std::this_thread::sleep_for(100ms); - auto timestamp = static_cast(std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()).count()); - std::mt19937_64 rng(timestamp); - std::uniform_int_distribution dist(low, high); - int port = dist(rng); - return port; -} diff --git a/tests/cpp/plugin/helpers.h b/tests/cpp/plugin/helpers.h index ea72f1538..0ac6746f8 100644 --- a/tests/cpp/plugin/helpers.h +++ b/tests/cpp/plugin/helpers.h @@ -1,10 +1,69 @@ /*!
- * Copyright 2022 XGBoost contributors + * Copyright 2022-2023 XGBoost contributors */ +#pragma once -#ifndef XGBOOST_TESTS_CPP_PLUGIN_HELPERS_H_ -#define XGBOOST_TESTS_CPP_PLUGIN_HELPERS_H_ +#include +#include +#include -int GenerateRandomPort(int low, int high); +#include -#endif // XGBOOST_TESTS_CPP_PLUGIN_HELPERS_H_ +#include "../../../plugin/federated/federated_server.h" +#include "../../../src/collective/communicator-inl.h" + +inline int GenerateRandomPort(int low, int high) { + using namespace std::chrono_literals; + // Ensure unique timestamp by introducing a small artificial delay + std::this_thread::sleep_for(100ms); + auto timestamp = static_cast(std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count()); + std::mt19937_64 rng(timestamp); + std::uniform_int_distribution dist(low, high); + int port = dist(rng); + return port; +} + +inline std::string GetServerAddress() { + int port = GenerateRandomPort(50000, 60000); + std::string address = std::string("localhost:") + std::to_string(port); + return address; +} + +namespace xgboost { + +class BaseFederatedTest : public ::testing::Test { + protected: + void SetUp() override { + server_address_ = GetServerAddress(); + server_thread_.reset(new std::thread([this] { + grpc::ServerBuilder builder; + xgboost::federated::FederatedService service{kWorldSize}; + builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials()); + builder.RegisterService(&service); + server_ = builder.BuildAndStart(); + server_->Wait(); + })); + } + + void TearDown() override { + server_->Shutdown(); + server_thread_->join(); + } + + void InitCommunicator(int rank) { + Json config{JsonObject()}; + config["xgboost_communicator"] = String("federated"); + config["federated_server_address"] = String(server_address_); + config["federated_world_size"] = kWorldSize; + config["federated_rank"] = rank; + xgboost::collective::Init(config); + } + + static int const kWorldSize{3}; + std::string server_address_; + std::unique_ptr server_thread_; + std::unique_ptr server_; +}; +} // namespace xgboost diff --git a/tests/cpp/plugin/test_federated_adapter.cu b/tests/cpp/plugin/test_federated_adapter.cu index 794c60909..c4816ff18 100644 --- a/tests/cpp/plugin/test_federated_adapter.cu +++ b/tests/cpp/plugin/test_federated_adapter.cu @@ -1,56 +1,20 @@ /*! 
* Copyright 2022 XGBoost contributors */ -#include #include #include +#include #include #include -#include -#include "./helpers.h" #include "../../../plugin/federated/federated_communicator.h" -#include "../../../plugin/federated/federated_server.h" #include "../../../src/collective/device_communicator_adapter.cuh" +#include "./helpers.h" -namespace { +namespace xgboost::collective { -std::string GetServerAddress() { - int port = GenerateRandomPort(50000, 60000); - std::string address = std::string("localhost:") + std::to_string(port); - return address; -} - -} // anonymous namespace - -namespace xgboost { -namespace collective { - -class FederatedAdapterTest : public ::testing::Test { - protected: - void SetUp() override { - server_address_ = GetServerAddress(); - server_thread_.reset(new std::thread([this] { - grpc::ServerBuilder builder; - federated::FederatedService service{kWorldSize}; - builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials()); - builder.RegisterService(&service); - server_ = builder.BuildAndStart(); - server_->Wait(); - })); - } - - void TearDown() override { - server_->Shutdown(); - server_thread_->join(); - } - - static int const kWorldSize{2}; - std::string server_address_; - std::unique_ptr server_thread_; - std::unique_ptr server_; -}; +class FederatedAdapterTest : public BaseFederatedTest {}; TEST(FederatedAdapterSimpleTest, ThrowOnInvalidDeviceOrdinal) { auto construct = []() { DeviceCommunicatorAdapter adapter{-1, nullptr}; }; @@ -65,20 +29,20 @@ TEST(FederatedAdapterSimpleTest, ThrowOnInvalidCommunicator) { TEST_F(FederatedAdapterTest, DeviceAllReduceSum) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back(std::thread([rank, server_address=server_address_] { + threads.emplace_back([rank, server_address = server_address_] { FederatedCommunicator comm{kWorldSize, rank, server_address}; // Assign device 0 to all workers, since we run gtest in a single-GPU machine DeviceCommunicatorAdapter adapter{0, &comm}; - int const count = 3; + int count = 3; thrust::device_vector buffer(count, 0); thrust::sequence(buffer.begin(), buffer.end()); adapter.AllReduceSum(buffer.data().get(), count); thrust::host_vector host_buffer = buffer; EXPECT_EQ(host_buffer.size(), count); for (auto i = 0; i < count; i++) { - EXPECT_EQ(host_buffer[i], i * 2); + EXPECT_EQ(host_buffer[i], i * kWorldSize); } - })); + }); } for (auto& thread : threads) { thread.join(); @@ -88,7 +52,7 @@ TEST_F(FederatedAdapterTest, DeviceAllReduceSum) { TEST_F(FederatedAdapterTest, DeviceAllGatherV) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back(std::thread([rank, server_address=server_address_] { + threads.emplace_back([rank, server_address = server_address_] { FederatedCommunicator comm{kWorldSize, rank, server_address}; // Assign device 0 to all workers, since we run gtest in a single-GPU machine DeviceCommunicatorAdapter adapter{0, &comm}; @@ -104,17 +68,16 @@ TEST_F(FederatedAdapterTest, DeviceAllGatherV) { EXPECT_EQ(segments[0], 2); EXPECT_EQ(segments[1], 3); thrust::host_vector host_buffer = receive_buffer; - EXPECT_EQ(host_buffer.size(), 5); - int expected[] = {0, 1, 0, 1, 2}; - for (auto i = 0; i < 5; i++) { + EXPECT_EQ(host_buffer.size(), 9); + int expected[] = {0, 1, 0, 1, 2, 0, 1, 2, 3}; + for (auto i = 0; i < 9; i++) { EXPECT_EQ(host_buffer[i], expected[i]); } - })); + }); } for (auto& thread : threads) { thread.join(); } } -} // namespace collective -} // namespace xgboost +} // namespace 
xgboost::collective diff --git a/tests/cpp/plugin/test_federated_communicator.cc b/tests/cpp/plugin/test_federated_communicator.cc index f5d72e5f4..5177187c5 100644 --- a/tests/cpp/plugin/test_federated_communicator.cc +++ b/tests/cpp/plugin/test_federated_communicator.cc @@ -2,65 +2,34 @@ * Copyright 2022 XGBoost contributors */ #include -#include #include #include #include -#include -#include "helpers.h" #include "../../../plugin/federated/federated_communicator.h" -#include "../../../plugin/federated/federated_server.h" +#include "helpers.h" -namespace { +namespace xgboost::collective { -std::string GetServerAddress() { - int port = GenerateRandomPort(50000, 60000); - std::string address = std::string("localhost:") + std::to_string(port); - return address; -} - -} // anonymous namespace - -namespace xgboost { -namespace collective { - -class FederatedCommunicatorTest : public ::testing::Test { +class FederatedCommunicatorTest : public BaseFederatedTest { public: - static void VerifyAllgather(int rank, const std::string& server_address) { + static void VerifyAllgather(int rank, const std::string &server_address) { FederatedCommunicator comm{kWorldSize, rank, server_address}; CheckAllgather(comm, rank); } - static void VerifyAllreduce(int rank, const std::string& server_address) { + static void VerifyAllreduce(int rank, const std::string &server_address) { FederatedCommunicator comm{kWorldSize, rank, server_address}; CheckAllreduce(comm); } - static void VerifyBroadcast(int rank, const std::string& server_address) { + static void VerifyBroadcast(int rank, const std::string &server_address) { FederatedCommunicator comm{kWorldSize, rank, server_address}; CheckBroadcast(comm, rank); } protected: - void SetUp() override { - server_address_ = GetServerAddress(); - server_thread_.reset(new std::thread([this] { - grpc::ServerBuilder builder; - federated::FederatedService service{kWorldSize}; - builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials()); - builder.RegisterService(&service); - server_ = builder.BuildAndStart(); - server_->Wait(); - })); - } - - void TearDown() override { - server_->Shutdown(); - server_thread_->join(); - } - static void CheckAllgather(FederatedCommunicator &comm, int rank) { int buffer[kWorldSize] = {0, 0, 0}; buffer[rank] = rank; @@ -90,11 +59,6 @@ class FederatedCommunicatorTest : public ::testing::Test { EXPECT_EQ(buffer, "hello"); } } - - static int const kWorldSize{3}; - std::string server_address_; - std::unique_ptr server_thread_; - std::unique_ptr server_; }; TEST(FederatedCommunicatorSimpleTest, ThrowOnWorldSizeTooSmall) { @@ -161,8 +125,7 @@ TEST(FederatedCommunicatorSimpleTest, IsDistributed) { TEST_F(FederatedCommunicatorTest, Allgather) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back( - std::thread(&FederatedCommunicatorTest::VerifyAllgather, rank, server_address_)); + threads.emplace_back(&FederatedCommunicatorTest::VerifyAllgather, rank, server_address_); } for (auto &thread : threads) { thread.join(); @@ -172,8 +135,7 @@ TEST_F(FederatedCommunicatorTest, Allgather) { TEST_F(FederatedCommunicatorTest, Allreduce) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back( - std::thread(&FederatedCommunicatorTest::VerifyAllreduce, rank, server_address_)); + threads.emplace_back(&FederatedCommunicatorTest::VerifyAllreduce, rank, server_address_); } for (auto &thread : threads) { thread.join(); @@ -183,12 +145,10 @@ TEST_F(FederatedCommunicatorTest, 
Allreduce) { TEST_F(FederatedCommunicatorTest, Broadcast) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back( - std::thread(&FederatedCommunicatorTest::VerifyBroadcast, rank, server_address_)); + threads.emplace_back(&FederatedCommunicatorTest::VerifyBroadcast, rank, server_address_); } for (auto &thread : threads) { thread.join(); } } -} // namespace collective -} // namespace xgboost +} // namespace xgboost::collective diff --git a/tests/cpp/plugin/test_federated_data.cc b/tests/cpp/plugin/test_federated_data.cc new file mode 100644 index 000000000..8ac89e887 --- /dev/null +++ b/tests/cpp/plugin/test_federated_data.cc @@ -0,0 +1,65 @@ +/*! + * Copyright 2023 XGBoost contributors + */ +#include +#include +#include + +#include +#include +#include + +#include "../../../plugin/federated/federated_server.h" +#include "../../../src/collective/communicator-inl.h" +#include "../filesystem.h" +#include "../helpers.h" +#include "helpers.h" + +namespace xgboost { + +class FederatedDataTest : public BaseFederatedTest { + public: + void VerifyLoadUri(int rank) { + InitCommunicator(rank); + + size_t constexpr kRows{16}; + size_t const kCols = 8 + rank; + + dmlc::TemporaryDirectory tmpdir; + std::string path = tmpdir.path + "/small" + std::to_string(rank) + ".csv"; + CreateTestCSV(path, kRows, kCols); + + std::unique_ptr dmat; + std::string uri = path + "?format=csv"; + dmat.reset(DMatrix::Load(uri, false, DataSplitMode::kCol)); + + ASSERT_EQ(dmat->Info().num_col_, 8 * kWorldSize + 3); + ASSERT_EQ(dmat->Info().num_row_, kRows); + + for (auto const& page : dmat->GetBatches()) { + auto entries = page.GetView().data; + auto index = 0; + int offsets[] = {0, 8, 17}; + int offset = offsets[rank]; + for (auto row = 0; row < kRows; row++) { + for (auto col = 0; col < kCols; col++) { + EXPECT_EQ(entries[index].index, col + offset); + index++; + } + } + } + + xgboost::collective::Finalize(); + } +}; + +TEST_F(FederatedDataTest, LoadUri) { + std::vector threads; + for (auto rank = 0; rank < kWorldSize; rank++) { + threads.emplace_back(&FederatedDataTest_LoadUri_Test::VerifyLoadUri, this, rank); + } + for (auto& thread : threads) { + thread.join(); + } +} +} // namespace xgboost diff --git a/tests/cpp/plugin/test_federated_server.cc b/tests/cpp/plugin/test_federated_server.cc index fa9c272d2..79e06bf5f 100644 --- a/tests/cpp/plugin/test_federated_server.cc +++ b/tests/cpp/plugin/test_federated_server.cc @@ -1,30 +1,17 @@ /*! 
* Copyright 2017-2020 XGBoost contributors */ -#include #include -#include #include #include #include "federated_client.h" -#include "federated_server.h" #include "helpers.h" -namespace { - -std::string GetServerAddress() { - int port = GenerateRandomPort(50000, 60000); - std::string address = std::string("localhost:") + std::to_string(port); - return address; -} - -} // anonymous namespace - namespace xgboost { -class FederatedServerTest : public ::testing::Test { +class FederatedServerTest : public BaseFederatedTest { public: static void VerifyAllgather(int rank, const std::string& server_address) { federated::FederatedClient client{server_address, rank}; @@ -51,23 +38,6 @@ class FederatedServerTest : public ::testing::Test { } protected: - void SetUp() override { - server_address_ = GetServerAddress(); - server_thread_.reset(new std::thread([this] { - grpc::ServerBuilder builder; - federated::FederatedService service{kWorldSize}; - builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials()); - builder.RegisterService(&service); - server_ = builder.BuildAndStart(); - server_->Wait(); - })); - } - - void TearDown() override { - server_->Shutdown(); - server_thread_->join(); - } - static void CheckAllgather(federated::FederatedClient& client, int rank) { int data[kWorldSize] = {0, 0, 0}; data[rank] = rank; @@ -98,17 +68,12 @@ class FederatedServerTest : public ::testing::Test { auto reply = client.Broadcast(send_buffer, 0); EXPECT_EQ(reply, "hello broadcast") << "rank " << rank; } - - static int const kWorldSize{3}; - std::string server_address_; - std::unique_ptr server_thread_; - std::unique_ptr server_; }; TEST_F(FederatedServerTest, Allgather) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back(std::thread(&FederatedServerTest::VerifyAllgather, rank, server_address_)); + threads.emplace_back(&FederatedServerTest::VerifyAllgather, rank, server_address_); } for (auto& thread : threads) { thread.join(); @@ -118,7 +83,7 @@ TEST_F(FederatedServerTest, Allgather) { TEST_F(FederatedServerTest, Allreduce) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back(std::thread(&FederatedServerTest::VerifyAllreduce, rank, server_address_)); + threads.emplace_back(&FederatedServerTest::VerifyAllreduce, rank, server_address_); } for (auto& thread : threads) { thread.join(); @@ -128,7 +93,7 @@ TEST_F(FederatedServerTest, Allreduce) { TEST_F(FederatedServerTest, Broadcast) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back(std::thread(&FederatedServerTest::VerifyBroadcast, rank, server_address_)); + threads.emplace_back(&FederatedServerTest::VerifyBroadcast, rank, server_address_); } for (auto& thread : threads) { thread.join(); @@ -138,7 +103,7 @@ TEST_F(FederatedServerTest, Broadcast) { TEST_F(FederatedServerTest, Mixture) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back(std::thread(&FederatedServerTest::VerifyMixture, rank, server_address_)); + threads.emplace_back(&FederatedServerTest::VerifyMixture, rank, server_address_); } for (auto& thread : threads) { thread.join(); diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index 9a0ebee18..401d33c4d 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -305,4 +305,10 @@ TEST(CpuPredictor, Sparse) { TestSparsePrediction(0.2, "cpu_predictor"); TestSparsePrediction(0.8, 
"cpu_predictor"); } + +TEST(CpuPredictor, Multi) { + Context ctx; + ctx.nthread = 1; + TestVectorLeafPrediction(&ctx); +} } // namespace xgboost diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc index 7ab8946f7..92c661f35 100644 --- a/tests/cpp/predictor/test_predictor.cc +++ b/tests/cpp/predictor/test_predictor.cc @@ -1,28 +1,34 @@ -/*! - * Copyright 2020-2021 by Contributors +/** + * Copyright 2020-2023 by XGBoost Contributors */ - #include "test_predictor.h" #include -#include -#include -#include -#include +#include // for Context +#include // for DMatrix, BatchIterator, BatchSet, MetaInfo +#include // for HostDeviceVector +#include // for PredictionCacheEntry, Predictor, Predic... -#include "../../../src/common/bitfield.h" -#include "../../../src/common/categorical.h" -#include "../../../src/common/io.h" -#include "../../../src/data/adapter.h" -#include "../../../src/data/proxy_dmatrix.h" -#include "../helpers.h" +#include // for max +#include // for numeric_limits +#include // for unordered_map + +#include "../../../src/common/bitfield.h" // for LBitField32 +#include "../../../src/data/iterative_dmatrix.h" // for IterativeDMatrix +#include "../../../src/data/proxy_dmatrix.h" // for DMatrixProxy +#include "../helpers.h" // for GetDMatrixFromData, RandomDataGenerator +#include "xgboost/json.h" // for Json, Object, get, String +#include "xgboost/linalg.h" // for MakeVec, Tensor, TensorView, Vector +#include "xgboost/logging.h" // for CHECK +#include "xgboost/span.h" // for operator!=, SpanIterator, Span +#include "xgboost/tree_model.h" // for RegTree namespace xgboost { TEST(Predictor, PredictionCache) { size_t constexpr kRows = 16, kCols = 4; PredictionContainer container; - DMatrix* m; + DMatrix *m; // Add a cache that is immediately expired. 
auto add_cache = [&]() { auto p_dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(); @@ -412,4 +418,101 @@ void TestSparsePrediction(float sparsity, std::string predictor) { } } } + +void TestVectorLeafPrediction(Context const *ctx) { + std::unique_ptr cpu_predictor = + std::unique_ptr(Predictor::Create("cpu_predictor", ctx)); + + size_t constexpr kRows = 5; + size_t constexpr kCols = 5; + + LearnerModelParam mparam{static_cast(kCols), + linalg::Vector{{0.5}, {1}, Context::kCpuId}, 1, 3, + MultiStrategy::kMultiOutputTree}; + + std::vector> trees; + trees.emplace_back(new RegTree{mparam.LeafLength(), mparam.num_feature}); + + std::vector p_w(mparam.LeafLength(), 0.0f); + std::vector l_w(mparam.LeafLength(), 1.0f); + std::vector r_w(mparam.LeafLength(), 2.0f); + + auto &tree = trees.front(); + tree->ExpandNode(0, static_cast(1), 2.0, true, + linalg::MakeVec(p_w.data(), p_w.size()), linalg::MakeVec(l_w.data(), l_w.size()), + linalg::MakeVec(r_w.data(), r_w.size())); + ASSERT_TRUE(tree->IsMultiTarget()); + ASSERT_TRUE(mparam.IsVectorLeaf()); + + gbm::GBTreeModel model{&mparam, ctx}; + model.CommitModel(std::move(trees), 0); + + auto run_test = [&](float expected, HostDeviceVector *p_data) { + { + auto p_fmat = GetDMatrixFromData(p_data->ConstHostVector(), kRows, kCols); + PredictionCacheEntry predt_cache; + cpu_predictor->InitOutPredictions(p_fmat->Info(), &predt_cache.predictions, model); + ASSERT_EQ(predt_cache.predictions.Size(), kRows * mparam.LeafLength()); + cpu_predictor->PredictBatch(p_fmat.get(), &predt_cache, model, 0, 1); + auto const &h_predt = predt_cache.predictions.HostVector(); + for (auto v : h_predt) { + ASSERT_EQ(v, expected); + } + } + + { + // inplace + PredictionCacheEntry predt_cache; + auto p_fmat = GetDMatrixFromData(p_data->ConstHostVector(), kRows, kCols); + cpu_predictor->InitOutPredictions(p_fmat->Info(), &predt_cache.predictions, model); + auto arr = GetArrayInterface(p_data, kRows, kCols); + std::string str; + Json::Dump(arr, &str); + auto proxy = std::shared_ptr(new data::DMatrixProxy{}); + dynamic_cast(proxy.get())->SetArrayData(str.data()); + cpu_predictor->InplacePredict(proxy, model, std::numeric_limits::quiet_NaN(), + &predt_cache, 0, 1); + auto const &h_predt = predt_cache.predictions.HostVector(); + for (auto v : h_predt) { + ASSERT_EQ(v, expected); + } + } + + { + // ghist + PredictionCacheEntry predt_cache; + auto &h_data = p_data->HostVector(); + // give it at least two bins, otherwise the histogram cuts only have min and max values. + for (std::size_t i = 0; i < 5; ++i) { + h_data[i] = 1.0; + } + auto p_fmat = GetDMatrixFromData(p_data->ConstHostVector(), kRows, kCols); + + cpu_predictor->InitOutPredictions(p_fmat->Info(), &predt_cache.predictions, model); + + auto iter = NumpyArrayIterForTest{ctx, *p_data, kRows, static_cast(kCols), + static_cast(1)}; + p_fmat = + std::make_shared(&iter, iter.Proxy(), nullptr, Reset, Next, + std::numeric_limits::quiet_NaN(), 0, 256); + + cpu_predictor->InitOutPredictions(p_fmat->Info(), &predt_cache.predictions, model); + cpu_predictor->PredictBatch(p_fmat.get(), &predt_cache, model, 0, 1); + auto const &h_predt = predt_cache.predictions.HostVector(); + // the smallest v uses the min_value from histogram cuts, which leads to a left leaf + // during prediction. 
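+ // The first five rows were overwritten with 1.0 above and therefore take the left branch, so the check below starts at index 5.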
+ for (std::size_t i = 5; i < h_predt.size(); ++i) {
+ ASSERT_EQ(h_predt[i], expected) << i;
+ }
+ }
+ };
+
+ // go to right: base_score (0.5) + right leaf weight (2.0) = 2.5
+ HostDeviceVector<float> data(kRows * kCols, model.trees.front()->SplitCond(RegTree::kRoot) + 1.0);
+ run_test(2.5, &data);
+
+ // go to left: base_score (0.5) + left leaf weight (1.0) = 1.5
+ data.HostVector().assign(data.Size(), model.trees.front()->SplitCond(RegTree::kRoot) - 1.0);
+ run_test(1.5, &data);
+}
} // namespace xgboost
diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h index 61b05b31b..56c1523a1 100644 --- a/tests/cpp/predictor/test_predictor.h +++ b/tests/cpp/predictor/test_predictor.h
@@ -1,9 +1,16 @@
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
+ */
#ifndef XGBOOST_TEST_PREDICTOR_H_
#define XGBOOST_TEST_PREDICTOR_H_
+#include <xgboost/context.h> // for Context
#include
-#include
+
#include
+#include
+
+#include "../../../src/gbm/gbtree_model.h" // for GBTreeModel
#include "../helpers.h"
namespace xgboost {
@@ -48,7 +55,7 @@ void TestPredictionFromGradientIndex(std::string name, size_t rows, size_t cols,
PredictionCacheEntry precise_out_predictions;
predictor->InitOutPredictions(p_dmat->Info(), &precise_out_predictions.predictions, model);
predictor->PredictBatch(p_dmat.get(), &precise_out_predictions, model, 0);
- ASSERT_FALSE(p_dmat->PageExists());
+ CHECK(!p_dmat->PageExists());
}
}
@@ -69,6 +76,8 @@ void TestCategoricalPredictLeaf(StringView name);
void TestIterationRange(std::string name);
void TestSparsePrediction(float sparsity, std::string predictor);
+
+void TestVectorLeafPrediction(Context const* ctx);
} // namespace xgboost
#endif // XGBOOST_TEST_PREDICTOR_H_
diff --git a/tests/cpp/test_multi_target.cc b/tests/cpp/test_multi_target.cc index e96c2eb06..2331098e0 100644 --- a/tests/cpp/test_multi_target.cc +++ b/tests/cpp/test_multi_target.cc
@@ -124,11 +124,11 @@ TEST(MultiStrategy, Configure) {
auto p_fmat = RandomDataGenerator{12ul, 3ul, 0.0}.GenerateDMatrix();
p_fmat->Info().labels.Reshape(p_fmat->Info().num_row_, 2);
std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
- learner->SetParams(Args{{"multi_strategy", "monolithic"}, {"num_target", "2"}});
+ learner->SetParams(Args{{"multi_strategy", "multi_output_tree"}, {"num_target", "2"}});
learner->Configure();
ASSERT_EQ(learner->Groups(), 2);
- learner->SetParams(Args{{"multi_strategy", "monolithic"}, {"num_target", "0"}});
+ learner->SetParams(Args{{"multi_strategy", "multi_output_tree"}, {"num_target", "0"}});
ASSERT_THROW({ learner->Configure(); }, dmlc::Error);
}
} // namespace xgboost
diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 4582f546a..f1317fc02 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -304,7 +304,7 @@ void TestEvaluateSingleSplit(bool is_categorical) {
thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0, 1};
// Setup gradients so that second feature gets higher gain
- auto feature_histogram = ConvertToInteger({ {-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
+ auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
dh::device_vector<FeatureType> feature_types(feature_set.size(), FeatureType::kCategorical);
diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index fc94f3130..dcd04f68a 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc
@@ -1,18 +1,27 @@
/**
* Copyright 2021-2023 by XGBoost Contributors
*/
-#include
-#include
-
-#include "../../../../src/common/hist_util.h" -#include "../../../../src/tree/common_row_partitioner.h" -#include "../../../../src/tree/hist/evaluate_splits.h" #include "../test_evaluate_splits.h" -#include "../../helpers.h" -#include "xgboost/context.h" // Context -namespace xgboost { -namespace tree { +#include +#include // for GradientPairPrecise, Args, Gradie... +#include // for Context +#include // for FeatureType, DMatrix, MetaInfo +#include // for CHECK_EQ +#include // for RegTree, RTreeNodeStat + +#include // for make_shared, shared_ptr, addressof + +#include "../../../../src/common/hist_util.h" // for HistCollection, HistogramCuts +#include "../../../../src/common/random.h" // for ColumnSampler +#include "../../../../src/common/row_set.h" // for RowSetCollection +#include "../../../../src/data/gradient_index.h" // for GHistIndexMatrix +#include "../../../../src/tree/hist/evaluate_splits.h" // for HistEvaluator +#include "../../../../src/tree/hist/expand_entry.h" // for CPUExpandEntry +#include "../../../../src/tree/param.h" // for GradStats, TrainParam +#include "../../helpers.h" // for RandomDataGenerator, AllThreadsFo... + +namespace xgboost::tree { void TestEvaluateSplits(bool force_read_by_column) { Context ctx; ctx.nthread = 4; @@ -87,6 +96,68 @@ TEST(HistEvaluator, Evaluate) { TestEvaluateSplits(true); } +TEST(HistMultiEvaluator, Evaluate) { + Context ctx; + ctx.nthread = 1; + + TrainParam param; + param.Init(Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}}); + auto sampler = std::make_shared(); + + std::size_t n_samples = 3; + bst_feature_t n_features = 2; + bst_target_t n_targets = 2; + bst_bin_t n_bins = 2; + + auto p_fmat = + RandomDataGenerator{n_samples, n_features, 0.5}.Targets(n_targets).GenerateDMatrix(true); + + HistMultiEvaluator evaluator{&ctx, p_fmat->Info(), ¶m, sampler}; + std::vector histogram(n_targets); + linalg::Vector root_sum({2}, Context::kCpuId); + for (bst_target_t t{0}; t < n_targets; ++t) { + auto &hist = histogram[t]; + hist.Init(n_bins * n_features); + hist.AddHistRow(0); + hist.AllocateAllData(); + auto node_hist = hist[0]; + node_hist[0] = {-0.5, 0.5}; + node_hist[1] = {2.0, 0.5}; + node_hist[2] = {0.5, 0.5}; + node_hist[3] = {1.0, 0.5}; + + root_sum(t) += node_hist[0]; + root_sum(t) += node_hist[1]; + } + + RegTree tree{n_targets, n_features}; + auto weight = evaluator.InitRoot(root_sum.HostView()); + tree.SetLeaf(RegTree::kRoot, weight.HostView()); + auto w = weight.HostView(); + ASSERT_EQ(w.Size(), n_targets); + ASSERT_EQ(w(0), -1.5); + ASSERT_EQ(w(1), -1.5); + + common::HistogramCuts cuts; + cuts.cut_ptrs_ = {0, 2, 4}; + cuts.cut_values_ = {0.5, 1.0, 2.0, 3.0}; + cuts.min_vals_ = {-0.2, 1.8}; + + std::vector entries(1, {/*nidx=*/0, /*depth=*/0}); + + std::vector ptrs; + std::transform(histogram.cbegin(), histogram.cend(), std::back_inserter(ptrs), + [](auto const &h) { return std::addressof(h); }); + + evaluator.EvaluateSplits(tree, ptrs, cuts, &entries); + + ASSERT_EQ(entries.front().split.loss_chg, 12.5); + ASSERT_EQ(entries.front().split.split_value, 0.5); + ASSERT_EQ(entries.front().split.SplitIndex(), 0); + + ASSERT_EQ(sampler->GetFeatureSet(0)->Size(), n_features); +} + TEST(HistEvaluator, Apply) { Context ctx; ctx.nthread = 4; @@ -98,7 +169,8 @@ TEST(HistEvaluator, Apply) { auto sampler = std::make_shared(); auto evaluator_ = HistEvaluator{&ctx, ¶m, dmat->Info(), sampler}; - CPUExpandEntry entry{0, 0, 10.0f}; + CPUExpandEntry entry{0, 0}; + entry.split.loss_chg = 10.0f; entry.split.left_sum = GradStats{0.4, 0.6f}; 
entry.split.right_sum = GradStats{0.5, 0.5f};
@@ -210,12 +282,11 @@ TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
std::vector<CPUExpandEntry> entries(1);
RegTree tree;
evaluator.EvaluateSplits(hist, cuts_, info.feature_types.ConstHostSpan(), tree, &entries);
- auto const& split = entries.front().split;
+ auto const &split = entries.front().split;
this->CheckResult(split.loss_chg, split.SplitIndex(), split.split_value, split.is_cat,
split.DefaultLeft(),
GradientPairPrecise{split.left_sum.GetGrad(), split.left_sum.GetHess()},
GradientPairPrecise{split.right_sum.GetGrad(), split.right_sum.GetHess()});
}
-} // namespace tree
-} // namespace xgboost
+} // namespace xgboost::tree
diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc index 8462fa7d5..3b354bebb 100644 --- a/tests/cpp/tree/hist/test_histogram.cc +++ b/tests/cpp/tree/hist/test_histogram.cc
@@ -41,10 +41,10 @@ void TestAddHistRows(bool is_distributed) {
tree.ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
tree.ExpandNode(tree[0].LeftChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
tree.ExpandNode(tree[0].RightChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
- nodes_for_explicit_hist_build_.emplace_back(3, tree.GetDepth(3), 0.0f);
- nodes_for_explicit_hist_build_.emplace_back(4, tree.GetDepth(4), 0.0f);
- nodes_for_subtraction_trick_.emplace_back(5, tree.GetDepth(5), 0.0f);
- nodes_for_subtraction_trick_.emplace_back(6, tree.GetDepth(6), 0.0f);
+ nodes_for_explicit_hist_build_.emplace_back(3, tree.GetDepth(3));
+ nodes_for_explicit_hist_build_.emplace_back(4, tree.GetDepth(4));
+ nodes_for_subtraction_trick_.emplace_back(5, tree.GetDepth(5));
+ nodes_for_subtraction_trick_.emplace_back(6, tree.GetDepth(6));
HistogramBuilder histogram_builder;
histogram_builder.Reset(gmat.cut.TotalBins(), {kMaxBins, 0.5}, omp_get_max_threads(), 1,
@@ -98,7 +98,7 @@ void TestSyncHist(bool is_distributed) {
}
// level 0
- nodes_for_explicit_hist_build_.emplace_back(0, tree.GetDepth(0), 0.0f);
+ nodes_for_explicit_hist_build_.emplace_back(0, tree.GetDepth(0));
histogram.AddHistRows(&starting_index, &sync_count, nodes_for_explicit_hist_build_,
nodes_for_subtraction_trick_, &tree);
@@ -108,10 +108,8 @@
nodes_for_subtraction_trick_.clear();
// level 1
- nodes_for_explicit_hist_build_.emplace_back(tree[0].LeftChild(),
- tree.GetDepth(1), 0.0f);
- nodes_for_subtraction_trick_.emplace_back(tree[0].RightChild(),
- tree.GetDepth(2), 0.0f);
+ nodes_for_explicit_hist_build_.emplace_back(tree[0].LeftChild(), tree.GetDepth(1));
+ nodes_for_subtraction_trick_.emplace_back(tree[0].RightChild(), tree.GetDepth(2));
histogram.AddHistRows(&starting_index, &sync_count,
nodes_for_explicit_hist_build_,
@@ -123,10 +121,10 @@
nodes_for_explicit_hist_build_.clear();
nodes_for_subtraction_trick_.clear();
// level 2
- nodes_for_explicit_hist_build_.emplace_back(3, tree.GetDepth(3), 0.0f);
- nodes_for_subtraction_trick_.emplace_back(4, tree.GetDepth(4), 0.0f);
- nodes_for_explicit_hist_build_.emplace_back(5, tree.GetDepth(5), 0.0f);
- nodes_for_subtraction_trick_.emplace_back(6, tree.GetDepth(6), 0.0f);
+ nodes_for_explicit_hist_build_.emplace_back(3, tree.GetDepth(3));
+ nodes_for_subtraction_trick_.emplace_back(4, tree.GetDepth(4));
+ nodes_for_explicit_hist_build_.emplace_back(5, tree.GetDepth(5));
+ nodes_for_subtraction_trick_.emplace_back(6, tree.GetDepth(6));
histogram.AddHistRows(&starting_index, &sync_count,
nodes_for_explicit_hist_build_,
@@ -256,7 +254,7 @@ void
TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_
std::iota(row_indices.begin(), row_indices.end(), 0);
row_set_collection.Init();
- CPUExpandEntry node(RegTree::kRoot, tree.GetDepth(0), 0.0f);
+ CPUExpandEntry node{RegTree::kRoot, tree.GetDepth(0)};
std::vector<CPUExpandEntry> nodes_for_explicit_hist_build;
nodes_for_explicit_hist_build.push_back(node);
for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>({kMaxBins, 0.5})) {
@@ -330,7 +328,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
BatchParam batch_param{0, static_cast<int32_t>(kBins)};
RegTree tree;
- CPUExpandEntry node(RegTree::kRoot, tree.GetDepth(0), 0.0f);
+ CPUExpandEntry node{RegTree::kRoot, tree.GetDepth(0)};
std::vector<CPUExpandEntry> nodes_for_explicit_hist_build;
nodes_for_explicit_hist_build.push_back(node);
@@ -403,7 +401,7 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool fo
RegTree tree;
std::vector<CPUExpandEntry> nodes;
- nodes.emplace_back(0, tree.GetDepth(0), 0.0f);
+ nodes.emplace_back(0, tree.GetDepth(0));
common::GHistRow multi_page;
HistogramBuilder multi_build;
diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc index cae76c373..6f2b83511 100644 --- a/tests/cpp/tree/test_approx.cc +++ b/tests/cpp/tree/test_approx.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2021-2022, XGBoost contributors.
+/**
+ * Copyright 2021-2023 by XGBoost contributors.
*/
#include <gtest/gtest.h>
@@ -10,7 +10,6 @@
namespace xgboost {
namespace tree {
-
namespace {
std::vector<float> GenerateHess(size_t n_samples) {
auto grad = GenerateRandomGradients(n_samples);
@@ -32,7 +31,8 @@ TEST(Approx, Partitioner) {
auto const Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
auto hess = GenerateHess(n_samples);
- std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
+ std::vector<CPUExpandEntry> candidates{{0, 0}};
+ candidates.front().split.loss_chg = 0.4;
for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({64, hess, true})) {
bst_feature_t const split_ind = 0;
@@ -79,7 +79,9 @@ void TestColumnSplitPartitioner(size_t n_samples, size_t base_rowid, std::shared
CommonRowPartitioner const& expected_mid_partitioner) {
auto dmat =
std::unique_ptr<DMatrix>{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
- std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
+ std::vector<CPUExpandEntry> candidates{{0, 0}};
+ candidates.front().split.loss_chg = 0.4;
+
Context ctx;
ctx.InitAllowUnknown(Args{});
for (auto const& page : dmat->GetBatches<GHistIndexMatrix>({64, *hess, true})) {
@@ -124,7 +126,8 @@ TEST(Approx, PartitionerColSplit) {
size_t n_samples = 1024, n_features = 16, base_rowid = 0;
auto const Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
auto hess = GenerateHess(n_samples);
- std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
+ std::vector<CPUExpandEntry> candidates{{0, 0}};
+ candidates.front().split.loss_chg = 0.4;
float min_value, mid_value;
Context ctx;
@@ -145,77 +148,5 @@
RunWithInMemoryCommunicator(kWorkers, TestColumnSplitPartitioner, n_samples, base_rowid, Xy,
&hess, min_value, mid_value, mid_partitioner);
}
-
-namespace {
-void TestLeafPartition(size_t n_samples) {
- size_t const n_features = 2, base_rowid = 0;
- Context ctx;
- common::RowSetCollection row_set;
- CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
-
- auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
- std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
- RegTree tree;
- std::vector<float> hess(n_samples, 0);
- // emulate sampling
- auto not_sampled = [](size_t i) {
- size_t const kSampleFactor{3};
- return i % kSampleFactor != 0;
- };
- for (size_t i =
0; i < hess.size(); ++i) { - if (not_sampled(i)) { - hess[i] = 1.0f; - } - } - - std::vector h_nptr; - float split_value{0}; - for (auto const& page : Xy->GetBatches({Context::kCpuId, 64})) { - bst_feature_t const split_ind = 0; - auto ptr = page.cut.Ptrs()[split_ind + 1]; - split_value = page.cut.Values().at(ptr / 2); - GetSplit(&tree, split_value, &candidates); - partitioner.UpdatePosition(&ctx, page, candidates, &tree); - std::vector position; - partitioner.LeafPartition(&ctx, tree, hess, &position); - std::sort(position.begin(), position.end()); - size_t beg = std::distance( - position.begin(), - std::find_if(position.begin(), position.end(), [&](bst_node_t nidx) { return nidx >= 0; })); - std::vector nptr; - common::RunLengthEncode(position.cbegin() + beg, position.cend(), &nptr); - std::transform(nptr.begin(), nptr.end(), nptr.begin(), [&](size_t x) { return x + beg; }); - auto n_uniques = std::unique(position.begin() + beg, position.end()) - (position.begin() + beg); - ASSERT_EQ(nptr.size(), n_uniques + 1); - ASSERT_EQ(nptr[0], beg); - ASSERT_EQ(nptr.back(), n_samples); - - h_nptr = nptr; - } - - if (h_nptr.front() == n_samples) { - return; - } - - ASSERT_GE(h_nptr.size(), 2); - - for (auto const& page : Xy->GetBatches()) { - auto batch = page.GetView(); - size_t left{0}; - for (size_t i = 0; i < batch.Size(); ++i) { - if (not_sampled(i) && batch[i].front().fvalue < split_value) { - left++; - } - } - ASSERT_EQ(left, h_nptr[1] - h_nptr[0]); // equal to number of sampled assigned to left - } -} -} // anonymous namespace - -TEST(Approx, LeafPartition) { - for (auto n_samples : {0ul, 1ul, 128ul, 256ul}) { - TestLeafPartition(n_samples); - } -} } // namespace tree } // namespace xgboost diff --git a/tests/cpp/tree/test_common_partitioner.cc b/tests/cpp/tree/test_common_partitioner.cc new file mode 100644 index 000000000..7e47ec289 --- /dev/null +++ b/tests/cpp/tree/test_common_partitioner.cc @@ -0,0 +1,93 @@ +/** + * Copyright 2022-2023 by XGBoost contributors. 
+ */
+#include <gtest/gtest.h>
+#include <xgboost/base.h> // for bst_node_t
+#include <xgboost/context.h> // for Context
+
+#include <algorithm> // for transform
+#include <iterator> // for distance
+#include <vector> // for vector
+
+#include "../../../src/common/numeric.h" // for RunLengthEncode
+#include "../../../src/common/row_set.h" // for RowSetCollection
+#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix
+#include "../../../src/tree/common_row_partitioner.h"
+#include "../../../src/tree/hist/expand_entry.h" // for CPUExpandEntry
+#include "../helpers.h" // for RandomDataGenerator
+#include "test_partitioner.h" // for GetSplit
+
+namespace xgboost::tree {
+namespace {
+void TestLeafPartition(size_t n_samples) {
+ size_t const n_features = 2, base_rowid = 0;
+ Context ctx;
+ common::RowSetCollection row_set;
+ CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
+
+ auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
+ std::vector<CPUExpandEntry> candidates{{0, 0}};
+ candidates.front().split.loss_chg = 0.4;
+ RegTree tree;
+ std::vector<float> hess(n_samples, 0);
+ // emulate sampling
+ auto not_sampled = [](size_t i) {
+ size_t const kSampleFactor{3};
+ return i % kSampleFactor != 0;
+ };
+ for (size_t i = 0; i < hess.size(); ++i) {
+ if (not_sampled(i)) {
+ hess[i] = 1.0f;
+ }
+ }
+
+ std::vector<size_t> h_nptr;
+ float split_value{0};
+ for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({Context::kCpuId, 64})) {
+ bst_feature_t const split_ind = 0;
+ auto ptr = page.cut.Ptrs()[split_ind + 1];
+ split_value = page.cut.Values().at(ptr / 2);
+ GetSplit(&tree, split_value, &candidates);
+ partitioner.UpdatePosition(&ctx, page, candidates, &tree);
+ std::vector<bst_node_t> position;
+ partitioner.LeafPartition(&ctx, tree, hess, &position);
+ std::sort(position.begin(), position.end());
+ size_t beg = std::distance(
+ position.begin(),
+ std::find_if(position.begin(), position.end(), [&](bst_node_t nidx) { return nidx >= 0; }));
+ std::vector<size_t> nptr;
+ common::RunLengthEncode(position.cbegin() + beg, position.cend(), &nptr);
+ std::transform(nptr.begin(), nptr.end(), nptr.begin(), [&](size_t x) { return x + beg; });
+ auto n_uniques = std::unique(position.begin() + beg, position.end()) - (position.begin() + beg);
+ ASSERT_EQ(nptr.size(), n_uniques + 1);
+ ASSERT_EQ(nptr[0], beg);
+ ASSERT_EQ(nptr.back(), n_samples);
+
+ h_nptr = nptr;
+ }
+
+ if (h_nptr.front() == n_samples) {
+ return;
+ }
+
+ ASSERT_GE(h_nptr.size(), 2);
+
+ for (auto const& page : Xy->GetBatches<SparsePage>()) {
+ auto batch = page.GetView();
+ size_t left{0};
+ for (size_t i = 0; i < batch.Size(); ++i) {
+ if (not_sampled(i) && batch[i].front().fvalue < split_value) {
+ left++;
+ }
+ }
+ ASSERT_EQ(left, h_nptr[1] - h_nptr[0]); // equal to the number of sampled rows assigned to the left node
+ }
+}
+} // anonymous namespace
+
+TEST(CommonRowPartitioner, LeafPartition) {
+ for (auto n_samples : {0ul, 1ul, 128ul, 256ul}) {
+ TestLeafPartition(n_samples);
+ }
+}
+} // namespace xgboost::tree
diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h index a74739faa..a7e8972e5 100644 --- a/tests/cpp/tree/test_evaluate_splits.h +++ b/tests/cpp/tree/test_evaluate_splits.h
@@ -2,15 +2,26 @@
* Copyright 2022-2023 by XGBoost Contributors
*/
#include <gtest/gtest.h>
-#include
+#include <xgboost/base.h> // for GradientPairInternal, GradientPairPrecise
+#include <xgboost/data.h> // for MetaInfo
+#include <xgboost/host_device_vector.h> // for HostDeviceVector
+#include <xgboost/span.h> // for operator!=, Span, SpanIterator
-#include <algorithm> // next_permutation
-#include <numeric> // iota
+#include <algorithm> // for max, max_element, next_permutation, copy
+#include <cmath> // for isnan
+#include <cstddef> // for size_t
+#include <cstdint> // for
int32_t, uint64_t, uint32_t
+#include <limits> // for numeric_limits
+#include <numeric> // for iota
+#include <tuple> // for make_tuple, tie, tuple
+#include <utility> // for pair
+#include <vector> // for vector
-#include "../../../src/common/hist_util.h" // HistogramCuts,HistCollection
-#include "../../../src/tree/param.h" // TrainParam
-#include "../../../src/tree/split_evaluator.h"
-#include "../helpers.h"
+#include "../../../src/common/hist_util.h" // for HistogramCuts, HistCollection, GHistRow
+#include "../../../src/tree/param.h" // for TrainParam, GradStats
+#include "../../../src/tree/split_evaluator.h" // for TreeEvaluator
+#include "../helpers.h" // for SimpleLCG, SimpleRealUniformDistribution
+#include "gtest/gtest_pred_impl.h" // for AssertionResult, ASSERT_EQ, ASSERT_TRUE
namespace xgboost::tree {
/**
diff --git a/tests/cpp/tree/test_fit_stump.cc b/tests/cpp/tree/test_fit_stump.cc index 7fdb6f6ea..c9327d411 100644 --- a/tests/cpp/tree/test_fit_stump.cc +++ b/tests/cpp/tree/test_fit_stump.cc
@@ -21,7 +21,8 @@ void TestFitStump(Context const *ctx) {
}
}
linalg::Vector<float> out;
- FitStump(ctx, gpair, kTargets, &out);
+ MetaInfo info;
+ FitStump(ctx, info, gpair, kTargets, &out);
auto h_out = out.HostView();
for (auto it = linalg::cbegin(h_out); it != linalg::cend(h_out); ++it) {
// sum_hess == kRows
diff --git a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc index aa6a18797..881de57e1 100644 --- a/tests/cpp/tree/test_histmaker.cc +++ b/tests/cpp/tree/test_histmaker.cc
@@ -40,8 +40,7 @@ TEST(GrowHistMaker, InteractionConstraint)
ObjInfo task{ObjInfo::kRegression};
{
// With constraints
- RegTree tree;
- tree.param.num_feature = kCols;
+ RegTree tree{1, kCols};
std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
TrainParam param;
@@ -58,8 +57,7 @@
}
{
// Without constraints
- RegTree tree;
- tree.param.num_feature = kCols;
+ RegTree tree{1u, kCols};
std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
std::vector<HostDeviceVector<bst_node_t>> position(1);
@@ -76,7 +74,7 @@
}
namespace {
-void TestColumnSplit(int32_t rows, int32_t cols, RegTree const& expected_tree) {
+void TestColumnSplit(int32_t rows, bst_feature_t cols, RegTree const& expected_tree) {
auto p_dmat = GenerateDMatrix(rows, cols);
auto p_gradients = GenerateGradients(rows);
Context ctx;
@@ -87,8 +85,7 @@
std::unique_ptr<DMatrix> sliced{
p_dmat->SliceCol(collective::GetWorldSize(), collective::GetRank())};
- RegTree tree;
- tree.param.num_feature = cols;
+ RegTree tree{1u, cols};
TrainParam param;
param.Init(Args{});
updater->Update(&param, p_gradients.get(), sliced.get(), position, {&tree});
@@ -107,8 +104,7 @@ TEST(GrowHistMaker, ColumnSplit) {
auto constexpr kRows = 32;
auto constexpr kCols = 16;
- RegTree expected_tree;
- expected_tree.param.num_feature = kCols;
+ RegTree expected_tree{1u, kCols};
ObjInfo task{ObjInfo::kRegression};
{
auto p_dmat = GenerateDMatrix(kRows, kCols);
diff --git a/tests/cpp/tree/test_multi_target_tree_model.cc b/tests/cpp/tree/test_multi_target_tree_model.cc index 7d2bd9c7c..af83ed7eb 100644 --- a/tests/cpp/tree/test_multi_target_tree_model.cc +++ b/tests/cpp/tree/test_multi_target_tree_model.cc
@@ -17,8 +17,8 @@ TEST(MultiTargetTree, JsonIO) {
linalg::Vector<float> right_weight{{3.0f, 4.0f, 5.0f}, {3ul}, Context::kCpuId};
tree.ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(), left_weight.HostView(),
right_weight.HostView());
- ASSERT_EQ(tree.param.num_nodes, 3);
- ASSERT_EQ(tree.param.size_leaf_vector, 3);
+ ASSERT_EQ(tree.NumNodes(), 3);
+ ASSERT_EQ(tree.NumTargets(), 3);
ASSERT_EQ(tree.GetMultiTargetTree()->Size(), 3);
ASSERT_EQ(tree.Size(), 3);
@@ -26,20 +26,19 @@
tree.SaveModel(&jtree);
auto check_jtree = [](Json jtree, RegTree const& tree) {
- ASSERT_EQ(get<String>(jtree["tree_param"]["num_nodes"]),
- std::to_string(tree.param.num_nodes));
+ ASSERT_EQ(get<String>(jtree["tree_param"]["num_nodes"]), std::to_string(tree.NumNodes()));
ASSERT_EQ(get<F32Array>(jtree["base_weights"]).size(),
- tree.param.num_nodes * tree.param.size_leaf_vector);
- ASSERT_EQ(get<I32Array>(jtree["parents"]).size(), tree.param.num_nodes);
- ASSERT_EQ(get<I32Array>(jtree["left_children"]).size(), tree.param.num_nodes);
- ASSERT_EQ(get<I32Array>(jtree["right_children"]).size(), tree.param.num_nodes);
+ tree.NumNodes() * tree.NumTargets());
+ ASSERT_EQ(get<I32Array>(jtree["parents"]).size(), tree.NumNodes());
+ ASSERT_EQ(get<I32Array>(jtree["left_children"]).size(), tree.NumNodes());
+ ASSERT_EQ(get<I32Array>(jtree["right_children"]).size(), tree.NumNodes());
};
check_jtree(jtree, tree);
RegTree loaded;
loaded.LoadModel(jtree);
ASSERT_TRUE(loaded.IsMultiTarget());
- ASSERT_EQ(loaded.param.num_nodes, 3);
+ ASSERT_EQ(loaded.NumNodes(), 3);
Json jtree1{Object{}};
loaded.SaveModel(&jtree1);
diff --git a/tests/cpp/tree/test_partitioner.h b/tests/cpp/tree/test_partitioner.h index 093aa69eb..fbd98ddf9 100644 --- a/tests/cpp/tree/test_partitioner.h +++ b/tests/cpp/tree/test_partitioner.h
@@ -1,17 +1,20 @@
-/*!
- * Copyright 2021-2022, XGBoost contributors.
+/**
+ * Copyright 2021-2023 by XGBoost contributors.
*/
#ifndef XGBOOST_TESTS_CPP_TREE_TEST_PARTITIONER_H_
#define XGBOOST_TESTS_CPP_TREE_TEST_PARTITIONER_H_
-#include
+#include <xgboost/context.h> // for Context
+#include <xgboost/linalg.h> // for Constant, Vector
+#include <xgboost/logging.h> // for CHECK
+#include <xgboost/tree_model.h> // for RegTree
-#include
+#include <vector> // for vector
-#include "../../../src/tree/hist/expand_entry.h"
+#include "../../../src/tree/hist/expand_entry.h" // for CPUExpandEntry, MultiExpandEntry
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
inline void GetSplit(RegTree *tree, float split_value, std::vector<CPUExpandEntry> *candidates) {
+ CHECK(!tree->IsMultiTarget());
tree->ExpandNode(
/*nid=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value,
/*default_left=*/true, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
@@ -21,6 +24,22 @@ inline void GetSplit(RegTree *tree, float split_value, std::vector
candidates->front().split.sindex = 0;
candidates->front().split.sindex |= (1U << 31);
}
-} // namespace tree
-} // namespace xgboost
+
+inline void GetMultiSplitForTest(RegTree *tree, float split_value,
+ std::vector<MultiExpandEntry> *candidates) {
+ CHECK(tree->IsMultiTarget());
+ auto n_targets = tree->NumTargets();
+ Context ctx;
+ linalg::Vector<float> base_weight{linalg::Constant(&ctx, 0.0f, n_targets)};
+ linalg::Vector<float> left_weight{linalg::Constant(&ctx, 0.0f, n_targets)};
+ linalg::Vector<float> right_weight{linalg::Constant(&ctx, 0.0f, n_targets)};
+
+ tree->ExpandNode(/*nidx=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value,
+ /*default_left=*/true, base_weight.HostView(), left_weight.HostView(),
+ right_weight.HostView());
+ candidates->front().split.split_value = split_value;
+ candidates->front().split.sindex = 0;
+ candidates->front().split.sindex |= (1U << 31);
+}
+} // namespace xgboost::tree
#endif // XGBOOST_TESTS_CPP_TREE_TEST_PARTITIONER_H_
diff --git a/tests/cpp/tree/test_prune.cc b/tests/cpp/tree/test_prune.cc index 063816def..78161cac9 100644 --- a/tests/cpp/tree/test_prune.cc +++
b/tests/cpp/tree/test_prune.cc
@@ -32,8 +32,7 @@ TEST(Updater, Prune) {
auto ctx = CreateEmptyGenericParam(GPUIDX);
// prepare tree
- RegTree tree = RegTree();
- tree.param.UpdateAllowUnknown(cfg);
+ RegTree tree = RegTree{1u, kCols};
std::vector<RegTree*> trees {&tree};
// prepare pruner
TrainParam param;
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index ad98d1d6b..2aa1b8f47 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -1,25 +1,29 @@
-/*!
- * Copyright 2018-2022 by XGBoost Contributors
+/**
+ * Copyright 2018-2023 by XGBoost Contributors
*/
#include
#include
#include
#include
+#include <cstddef> // for size_t
#include
#include
+#include "../../../src/tree/common_row_partitioner.h"
+#include "../../../src/tree/hist/expand_entry.h" // for MultiExpandEntry, CPUExpandEntry
#include "../../../src/tree/param.h"
#include "../../../src/tree/split_evaluator.h"
-#include "../../../src/tree/common_row_partitioner.h"
#include "../helpers.h"
#include "test_partitioner.h"
#include "xgboost/data.h"
-namespace xgboost {
-namespace tree {
-TEST(QuantileHist, Partitioner) {
- size_t n_samples = 1024, n_features = 1, base_rowid = 0;
+namespace xgboost::tree {
+template <typename ExpandEntry>
+void TestPartitioner(bst_target_t n_targets) {
+ std::size_t n_samples = 1024, base_rowid = 0;
+ bst_feature_t n_features = 1;
+
Context ctx;
ctx.InitAllowUnknown(Args{});
@@ -29,7 +33,8 @@
ASSERT_EQ(partitioner.Partitions()[0].Size(), n_samples);
auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
- std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
+ std::vector<ExpandEntry> candidates{{0, 0}};
+ candidates.front().split.loss_chg = 0.4;
auto cuts = common::SketchOnDMatrix(Xy.get(), 64, ctx.Threads());
@@ -40,9 +45,13 @@
column_indices.InitFromSparse(page, gmat, 0.5, ctx.Threads());
{
auto min_value = gmat.cut.MinValues()[split_ind];
- RegTree tree;
+ RegTree tree{n_targets, n_features};
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
- GetSplit(&tree, min_value, &candidates);
+ if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
+ GetSplit(&tree, min_value, &candidates);
+ } else {
+ GetMultiSplitForTest(&tree, min_value, &candidates);
+ }
partitioner.UpdatePosition(&ctx, gmat, column_indices, candidates, &tree);
ASSERT_EQ(partitioner.Size(), 3);
ASSERT_EQ(partitioner[1].Size(), 0);
@@ -52,9 +61,13 @@
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
auto ptr = gmat.cut.Ptrs()[split_ind + 1];
float split_value = gmat.cut.Values().at(ptr / 2);
- RegTree tree;
- GetSplit(&tree, split_value, &candidates);
- auto left_nidx = tree[RegTree::kRoot].LeftChild();
+ RegTree tree{n_targets, n_features};
+ if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
+ GetSplit(&tree, split_value, &candidates);
+ } else {
+ GetMultiSplitForTest(&tree, split_value, &candidates);
+ }
+ auto left_nidx = tree.LeftChild(RegTree::kRoot);
partitioner.UpdatePosition(&ctx, gmat, column_indices, candidates, &tree);
auto elem = partitioner[left_nidx];
@@ -64,14 +77,17 @@
auto value = gmat.cut.Values().at(gmat.index[*it]);
ASSERT_LE(value, split_value);
}
- auto right_nidx = tree[RegTree::kRoot].RightChild();
+ auto right_nidx = tree.RightChild(RegTree::kRoot);
elem = partitioner[right_nidx];
for (auto it = elem.begin; it != elem.end; ++it) {
auto value = gmat.cut.Values().at(gmat.index[*it]);
- ASSERT_GT(value, split_value) << *it;
+ ASSERT_GT(value,
split_value);
}
}
}
}
+
+TEST(QuantileHist, Partitioner) { TestPartitioner<CPUExpandEntry>(1); }
+
+TEST(QuantileHist, MultiPartitioner) { TestPartitioner<MultiExpandEntry>(3); }
+} // namespace xgboost::tree
diff --git a/tests/cpp/tree/test_refresh.cc b/tests/cpp/tree/test_refresh.cc index 80a0cbe6f..f46ec2880 100644 --- a/tests/cpp/tree/test_refresh.cc +++ b/tests/cpp/tree/test_refresh.cc
@@ -28,9 +28,8 @@ TEST(Updater, Refresh) {
{"num_feature", std::to_string(kCols)},
{"reg_lambda", "1"}};
- RegTree tree = RegTree();
+ RegTree tree = RegTree{1u, kCols};
auto ctx = CreateEmptyGenericParam(GPUIDX);
- tree.param.UpdateAllowUnknown(cfg);
std::vector<RegTree*> trees{&tree};
ObjInfo task{ObjInfo::kRegression};
diff --git a/tests/cpp/tree/test_tree_model.cc b/tests/cpp/tree/test_tree_model.cc index 130a0ef70..44708ebd1 100644 --- a/tests/cpp/tree/test_tree_model.cc +++ b/tests/cpp/tree/test_tree_model.cc
@@ -11,9 +11,8 @@ namespace xgboost {
TEST(Tree, ModelShape) {
bst_feature_t n_features = std::numeric_limits<bst_feature_t>::max();
- RegTree tree;
- tree.param.UpdateAllowUnknown(Args{{"num_feature", std::to_string(n_features)}});
- ASSERT_EQ(tree.param.num_feature, n_features);
+ RegTree tree{1u, n_features};
+ ASSERT_EQ(tree.NumFeatures(), n_features);
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/tree.model";
@@ -27,7 +26,7 @@
RegTree new_tree;
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(tmp_file.c_str(), "r"));
new_tree.Load(fi.get());
- ASSERT_EQ(new_tree.param.num_feature, n_features);
+ ASSERT_EQ(new_tree.NumFeatures(), n_features);
}
{
// json
@@ -39,7 +38,7 @@
auto j_loaded = Json::Load(StringView{dumped.data(), dumped.size()});
new_tree.LoadModel(j_loaded);
- ASSERT_EQ(new_tree.param.num_feature, n_features);
+ ASSERT_EQ(new_tree.NumFeatures(), n_features);
}
{
// ubjson
@@ -51,7 +50,7 @@
auto j_loaded = Json::Load(StringView{dumped.data(), dumped.size()}, std::ios::binary);
new_tree.LoadModel(j_loaded);
- ASSERT_EQ(new_tree.param.num_feature, n_features);
+ ASSERT_EQ(new_tree.NumFeatures(), n_features);
}
}
@@ -488,8 +487,7 @@ TEST(Tree, JsonIO) {
RegTree loaded_tree;
loaded_tree.LoadModel(j_tree);
- ASSERT_EQ(loaded_tree.param.num_nodes, 3);
-
+ ASSERT_EQ(loaded_tree.NumNodes(), 3);
ASSERT_TRUE(loaded_tree == tree);
auto left = tree[0].LeftChild();
diff --git a/tests/cpp/tree/test_tree_stat.cc b/tests/cpp/tree/test_tree_stat.cc index eab34f752..f5fe53165 100644 --- a/tests/cpp/tree/test_tree_stat.cc +++ b/tests/cpp/tree/test_tree_stat.cc
@@ -37,8 +37,7 @@ class UpdaterTreeStatTest : public ::testing::Test {
: CreateEmptyGenericParam(Context::kCpuId));
auto up = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
up->Configure(Args{});
- RegTree tree;
- tree.param.num_feature = kCols;
+ RegTree tree{1u, kCols};
std::vector<HostDeviceVector<bst_node_t>> position(1);
up->Update(&param, &gpairs_, p_dmat_.get(), position, {&tree});
@@ -95,16 +94,14 @@ class UpdaterEtaTest : public ::testing::Test {
param1.Init(Args{{"eta", "1.0"}});
for (size_t iter = 0; iter < 4; ++iter) {
- RegTree tree_0;
+ RegTree tree_0{1u, kCols};
{
- tree_0.param.num_feature = kCols;
std::vector<HostDeviceVector<bst_node_t>> position(1);
up_0->Update(&param0, &gpairs_, p_dmat_.get(), position, {&tree_0});
}
- RegTree tree_1;
+ RegTree tree_1{1u, kCols};
{
- tree_1.param.num_feature = kCols;
std::vector<HostDeviceVector<bst_node_t>> position(1);
up_1->Update(&param1, &gpairs_, p_dmat_.get(), position, {&tree_1});
}
diff --git a/tests/python-gpu/test_device_quantile_dmatrix.py
b/tests/python-gpu/test_device_quantile_dmatrix.py index 0250cea3f..3cd65e30f 100644 --- a/tests/python-gpu/test_device_quantile_dmatrix.py +++ b/tests/python-gpu/test_device_quantile_dmatrix.py @@ -6,6 +6,7 @@ from hypothesis import given, settings, strategies import xgboost as xgb from xgboost import testing as tm +from xgboost.testing.data import check_inf sys.path.append("tests/python") import test_quantile_dmatrix as tqd @@ -153,3 +154,9 @@ class TestQuantileDMatrix: from_qdm = xgb.QuantileDMatrix(X, weight=w, ref=Xy_qdm) assert tm.predictor_equal(from_qdm, from_dm) + + @pytest.mark.skipif(**tm.no_cupy()) + def test_check_inf(self) -> None: + import cupy as cp + rng = cp.random.default_rng(1994) + check_inf(rng) diff --git a/tests/python-gpu/test_gpu_ranking.py b/tests/python-gpu/test_gpu_ranking.py index d86c1aa14..50bbc3f1c 100644 --- a/tests/python-gpu/test_gpu_ranking.py +++ b/tests/python-gpu/test_gpu_ranking.py @@ -1,194 +1,130 @@ -import itertools import os -import shutil -import urllib.request -import zipfile +from typing import Dict import numpy as np +import pytest import xgboost from xgboost import testing as tm -pytestmark = tm.timeout(10) +pytestmark = tm.timeout(30) -class TestRanking: - @classmethod - def setup_class(cls): - """ - Download and setup the test fixtures - """ - from sklearn.datasets import load_svmlight_files +def comp_training_with_rank_objective( + dtrain: xgboost.DMatrix, + dtest: xgboost.DMatrix, + rank_objective: str, + metric_name: str, + tolerance: float = 1e-02, +) -> None: + """Internal method that trains the dataset using the rank objective on GPU and CPU, + evaluates the metric and determines if the delta between the metric is within the + tolerance level. - # download the test data - cls.dpath = os.path.join(tm.demo_dir(__file__), "rank/") - src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip' - target = os.path.join(cls.dpath, "MQ2008.zip") + """ + # specify validations set to watch performance + watchlist = [(dtest, "eval"), (dtrain, "train")] - if os.path.exists(cls.dpath) and os.path.exists(target): - print("Skipping dataset download...") - else: - urllib.request.urlretrieve(url=src, filename=target) - with zipfile.ZipFile(target, 'r') as f: - f.extractall(path=cls.dpath) + params = { + "booster": "gbtree", + "tree_method": "gpu_hist", + "gpu_id": 0, + "predictor": "gpu_predictor", + } - (x_train, y_train, qid_train, x_test, y_test, qid_test, - x_valid, y_valid, qid_valid) = load_svmlight_files( - (cls.dpath + "MQ2008/Fold1/train.txt", - cls.dpath + "MQ2008/Fold1/test.txt", - cls.dpath + "MQ2008/Fold1/vali.txt"), - query_id=True, zero_based=False) - # instantiate the matrices - cls.dtrain = xgboost.DMatrix(x_train, y_train) - cls.dvalid = xgboost.DMatrix(x_valid, y_valid) - cls.dtest = xgboost.DMatrix(x_test, y_test) - # set the group counts from the query IDs - cls.dtrain.set_group([len(list(items)) - for _key, items in itertools.groupby(qid_train)]) - cls.dtest.set_group([len(list(items)) - for _key, items in itertools.groupby(qid_test)]) - cls.dvalid.set_group([len(list(items)) - for _key, items in itertools.groupby(qid_valid)]) - # save the query IDs for testing - cls.qid_train = qid_train - cls.qid_test = qid_test - cls.qid_valid = qid_valid + num_trees = 100 + check_metric_improvement_rounds = 10 - def setup_weighted(x, y, groups): - # Setup weighted data - data = xgboost.DMatrix(x, y) - groups_segment = [len(list(items)) - for _key, items in itertools.groupby(groups)] - data.set_group(groups_segment) - n_groups = 
len(groups_segment) - weights = np.ones((n_groups,)) - data.set_weight(weights) - return data + evals_result: Dict[str, Dict] = {} + params["objective"] = rank_objective + params["eval_metric"] = metric_name + bst = xgboost.train( + params, + dtrain, + num_boost_round=num_trees, + early_stopping_rounds=check_metric_improvement_rounds, + evals=watchlist, + evals_result=evals_result, + ) + gpu_scores = evals_result["train"][metric_name][-1] - cls.dtrain_w = setup_weighted(x_train, y_train, qid_train) - cls.dtest_w = setup_weighted(x_test, y_test, qid_test) - cls.dvalid_w = setup_weighted(x_valid, y_valid, qid_valid) + evals_result = {} - # model training parameters - cls.params = {'booster': 'gbtree', - 'tree_method': 'gpu_hist', - 'gpu_id': 0, - 'predictor': 'gpu_predictor'} - cls.cpu_params = {'booster': 'gbtree', - 'tree_method': 'hist', - 'gpu_id': -1, - 'predictor': 'cpu_predictor'} + cpu_params = { + "booster": "gbtree", + "tree_method": "hist", + "gpu_id": -1, + "predictor": "cpu_predictor", + } + cpu_params["objective"] = rank_objective + cpu_params["eval_metric"] = metric_name + bstc = xgboost.train( + cpu_params, + dtrain, + num_boost_round=num_trees, + early_stopping_rounds=check_metric_improvement_rounds, + evals=watchlist, + evals_result=evals_result, + ) + cpu_scores = evals_result["train"][metric_name][-1] - @classmethod - def teardown_class(cls): - """ - Cleanup test artifacts from download and unpacking - :return: - """ - os.remove(os.path.join(cls.dpath, "MQ2008.zip")) - shutil.rmtree(os.path.join(cls.dpath, "MQ2008")) + info = (rank_objective, metric_name) + assert np.allclose(gpu_scores, cpu_scores, tolerance, tolerance), info + assert np.allclose(bst.best_score, bstc.best_score, tolerance, tolerance), info - @classmethod - def __test_training_with_rank_objective(cls, rank_objective, metric_name, tolerance=1e-02): - """ - Internal method that trains the dataset using the rank objective on GPU and CPU, evaluates - the metric and determines if the delta between the metric is within the tolerance level - :return: - """ - # specify validations set to watch performance - watchlist = [(cls.dtest, 'eval'), (cls.dtrain, 'train')] + evals_result_weighted: Dict[str, Dict] = {} + dtest.set_weight(np.ones((dtest.get_group().size,))) + dtrain.set_weight(np.ones((dtrain.get_group().size,))) + watchlist = [(dtest, "eval"), (dtrain, "train")] + bst_w = xgboost.train( + params, + dtrain, + num_boost_round=num_trees, + early_stopping_rounds=check_metric_improvement_rounds, + evals=watchlist, + evals_result=evals_result_weighted, + ) + weighted_metric = evals_result_weighted["train"][metric_name][-1] - num_trees = 100 - check_metric_improvement_rounds = 10 + tolerance = 1e-5 + assert np.allclose(bst_w.best_score, bst.best_score, tolerance, tolerance) + assert np.allclose(weighted_metric, gpu_scores, tolerance, tolerance) - evals_result = {} - cls.params['objective'] = rank_objective - cls.params['eval_metric'] = metric_name - bst = xgboost.train( - cls.params, cls.dtrain, num_boost_round=num_trees, - early_stopping_rounds=check_metric_improvement_rounds, - evals=watchlist, evals_result=evals_result) - gpu_map_metric = evals_result['train'][metric_name][-1] - evals_result = {} - cls.cpu_params['objective'] = rank_objective - cls.cpu_params['eval_metric'] = metric_name - bstc = xgboost.train( - cls.cpu_params, cls.dtrain, num_boost_round=num_trees, - early_stopping_rounds=check_metric_improvement_rounds, - evals=watchlist, evals_result=evals_result) - cpu_map_metric = 
evals_result['train'][metric_name][-1] +@pytest.mark.parametrize( + "objective,metric", + [ + ("rank:pairwise", "auc"), + ("rank:pairwise", "ndcg"), + ("rank:pairwise", "map"), + ("rank:ndcg", "auc"), + ("rank:ndcg", "ndcg"), + ("rank:ndcg", "map"), + ("rank:map", "auc"), + ("rank:map", "ndcg"), + ("rank:map", "map"), + ], +) +def test_with_mq2008(objective, metric) -> None: + ( + x_train, + y_train, + qid_train, + x_test, + y_test, + qid_test, + x_valid, + y_valid, + qid_valid, + ) = tm.data.get_mq2008(os.path.join(os.path.join(tm.demo_dir(__file__), "rank"))) - assert np.allclose(gpu_map_metric, cpu_map_metric, tolerance, - tolerance) - assert np.allclose(bst.best_score, bstc.best_score, tolerance, - tolerance) + if metric.find("map") != -1 or objective.find("map") != -1: + y_train[y_train <= 1] = 0.0 + y_train[y_train > 1] = 1.0 + y_test[y_test <= 1] = 0.0 + y_test[y_test > 1] = 1.0 - evals_result_weighted = {} - watchlist = [(cls.dtest_w, 'eval'), (cls.dtrain_w, 'train')] - bst_w = xgboost.train( - cls.params, cls.dtrain_w, num_boost_round=num_trees, - early_stopping_rounds=check_metric_improvement_rounds, - evals=watchlist, evals_result=evals_result_weighted) - weighted_metric = evals_result_weighted['train'][metric_name][-1] - # GPU Ranking is not deterministic due to `AtomicAddGpair`, - # remove tolerance once the issue is resolved. - # https://github.com/dmlc/xgboost/issues/5561 - assert np.allclose(bst_w.best_score, bst.best_score, - tolerance, tolerance) - assert np.allclose(weighted_metric, gpu_map_metric, - tolerance, tolerance) + dtrain = xgboost.DMatrix(x_train, y_train, qid=qid_train) + dtest = xgboost.DMatrix(x_test, y_test, qid=qid_test) - def test_training_rank_pairwise_map_metric(self): - """ - Train an XGBoost ranking model with pairwise objective function and compare map metric - """ - self.__test_training_with_rank_objective('rank:pairwise', 'map') - - def test_training_rank_pairwise_auc_metric(self): - """ - Train an XGBoost ranking model with pairwise objective function and compare auc metric - """ - self.__test_training_with_rank_objective('rank:pairwise', 'auc') - - def test_training_rank_pairwise_ndcg_metric(self): - """ - Train an XGBoost ranking model with pairwise objective function and compare ndcg metric - """ - self.__test_training_with_rank_objective('rank:pairwise', 'ndcg') - - def test_training_rank_ndcg_map(self): - """ - Train an XGBoost ranking model with ndcg objective function and compare map metric - """ - self.__test_training_with_rank_objective('rank:ndcg', 'map') - - def test_training_rank_ndcg_auc(self): - """ - Train an XGBoost ranking model with ndcg objective function and compare auc metric - """ - self.__test_training_with_rank_objective('rank:ndcg', 'auc') - - def test_training_rank_ndcg_ndcg(self): - """ - Train an XGBoost ranking model with ndcg objective function and compare ndcg metric - """ - self.__test_training_with_rank_objective('rank:ndcg', 'ndcg') - - def test_training_rank_map_map(self): - """ - Train an XGBoost ranking model with map objective function and compare map metric - """ - self.__test_training_with_rank_objective('rank:map', 'map') - - def test_training_rank_map_auc(self): - """ - Train an XGBoost ranking model with map objective function and compare auc metric - """ - self.__test_training_with_rank_objective('rank:map', 'auc') - - def test_training_rank_map_ndcg(self): - """ - Train an XGBoost ranking model with map objective function and compare ndcg metric - """ - 
self.__test_training_with_rank_objective('rank:map', 'ndcg') + comp_training_with_rank_objective(dtrain, dtest, objective, metric) diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index 6b28296b2..ea8d5dcb5 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -32,6 +32,19 @@ def train_result(param, dmat: xgb.DMatrix, num_rounds: int) -> dict: return result +class TestGPUUpdatersMulti: + @given( + hist_parameter_strategy, strategies.integers(1, 20), tm.multi_dataset_strategy + ) + @settings(deadline=None, max_examples=50, print_blob=True) + def test_hist(self, param, num_rounds, dataset): + param["tree_method"] = "gpu_hist" + param = dataset.set_params(param) + result = train_result(param, dataset.get_dmat(), num_rounds) + note(result) + assert tm.non_increasing(result["train"][dataset.metric]) + + class TestGPUUpdaters: cputest = test_up.TestTreeMethod() @@ -101,7 +114,7 @@ class TestGPUUpdaters: ) -> None: cat_parameters.update(hist_parameters) dataset = tm.TestDataset( - "ames_housing", tm.get_ames_housing, "reg:squarederror", "rmse" + "ames_housing", tm.data.get_ames_housing, "reg:squarederror", "rmse" ) cat_parameters["tree_method"] = "gpu_hist" results = train_result(cat_parameters, dataset.get_dmat(), 16) diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py index 06f666da1..d03ce142b 100644 --- a/tests/python/test_basic_models.py +++ b/tests/python/test_basic_models.py @@ -15,13 +15,17 @@ rng = np.random.RandomState(1994) def json_model(model_path: str, parameters: dict) -> dict: - X = np.random.random((10, 3)) - y = np.random.randint(2, size=(10,)) + datasets = pytest.importorskip("sklearn.datasets") + + X, y = datasets.make_classification(64, n_features=8, n_classes=3, n_informative=6) + if parameters.get("objective", None) == "multi:softmax": + parameters["num_class"] = 3 dm1 = xgb.DMatrix(X, y) bst = xgb.train(parameters, dm1) bst.save_model(model_path) + if model_path.endswith("ubj"): import ubjson with open(model_path, "rb") as ubjfd: @@ -234,6 +238,27 @@ class TestModels: xgb.cv(param, dtrain, num_round, nfold=5, metrics={'error'}, seed=0, show_stdv=False) + def test_prediction_cache(self) -> None: + X, y = tm.make_sparse_regression(512, 4, 0.5, as_dense=False) + Xy = xgb.DMatrix(X, y) + param = {"max_depth": 8} + booster = xgb.train(param, Xy, num_boost_round=1) + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "model.json") + booster.save_model(path) + + predt_0 = booster.predict(Xy) + + param["max_depth"] = 2 + + booster = xgb.train(param, Xy, num_boost_round=1) + predt_1 = booster.predict(Xy) + assert not np.isclose(predt_0, predt_1).all() + + booster.load_model(path) + predt_2 = booster.predict(Xy) + np.testing.assert_allclose(predt_0, predt_2) + def test_feature_names_validation(self): X = np.random.random((10, 3)) y = np.random.randint(2, size=(10,)) @@ -305,24 +330,43 @@ class TestModels: from_ubjraw = xgb.Booster() from_ubjraw.load_model(ubj_raw) - old_from_json = from_jraw.save_raw(raw_format="deprecated") - old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated") + if parameters.get("multi_strategy", None) != "multi_output_tree": + # old binary model is not supported. 
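# A minimal sketch of the round-trip that does work for such models, given the guard
# above (the legacy "deprecated" binary format does not support them); `bst` is the
# booster trained earlier in this test:
#
#   raw = bst.save_raw(raw_format="json")  # or raw_format="ubj"
#   loaded = xgb.Booster(model_file=bytearray(raw))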
+ old_from_json = from_jraw.save_raw(raw_format="deprecated") + old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated") - assert old_from_json == old_from_ubj + assert old_from_json == old_from_ubj raw_json = bst.save_raw(raw_format="json") pretty = json.dumps(json.loads(raw_json), indent=2) + "\n\n" bst.load_model(bytearray(pretty, encoding="ascii")) - old_from_json = from_jraw.save_raw(raw_format="deprecated") - old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated") + if parameters.get("multi_strategy", None) != "multi_output_tree": + # old binary model is not supported. + old_from_json = from_jraw.save_raw(raw_format="deprecated") + old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated") - assert old_from_json == old_from_ubj + assert old_from_json == old_from_ubj + + rng = np.random.default_rng() + X = rng.random(size=from_jraw.num_features() * 10).reshape( + (10, from_jraw.num_features()) + ) + predt_from_jraw = from_jraw.predict(xgb.DMatrix(X)) + predt_from_bst = bst.predict(xgb.DMatrix(X)) + np.testing.assert_allclose(predt_from_jraw, predt_from_bst) @pytest.mark.parametrize("ext", ["json", "ubj"]) def test_model_json_io(self, ext: str) -> None: parameters = {"booster": "gbtree", "tree_method": "hist"} self.run_model_json_io(parameters, ext) + parameters = { + "booster": "gbtree", + "tree_method": "hist", + "multi_strategy": "multi_output_tree", + "objective": "multi:softmax", + } + self.run_model_json_io(parameters, ext) parameters = {"booster": "gblinear"} self.run_model_json_io(parameters, ext) parameters = {"booster": "dart", "tree_method": "hist"} diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py index fabf8672e..e8375aa5e 100644 --- a/tests/python/test_callback.py +++ b/tests/python/test_callback.py @@ -465,7 +465,7 @@ class TestCallbacks: assert os.path.exists(os.path.join(tmpdir, "model_" + str(i) + ".pkl")) def test_callback_list(self): - X, y = tm.get_california_housing() + X, y = tm.data.get_california_housing() m = xgb.DMatrix(X, y) callbacks = [xgb.callback.EarlyStopping(rounds=10)] for i in range(4): diff --git a/tests/python/test_quantile_dmatrix.py b/tests/python/test_quantile_dmatrix.py index 316d0e5f6..537910725 100644 --- a/tests/python/test_quantile_dmatrix.py +++ b/tests/python/test_quantile_dmatrix.py @@ -15,7 +15,7 @@ from xgboost.testing import ( make_sparse_regression, predictor_equal, ) -from xgboost.testing.data import np_dtypes +from xgboost.testing.data import check_inf, np_dtypes class TestQuantileDMatrix: @@ -244,6 +244,10 @@ class TestQuantileDMatrix: from_dm = xgb.QuantileDMatrix(X, weight=w, ref=Xy) assert predictor_equal(from_qdm, from_dm) + def test_check_inf(self) -> None: + rng = np.random.default_rng(1994) + check_inf(rng) + # we don't test empty Quantile DMatrix in single node construction. 
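# check_inf, imported above from xgboost.testing.data, presumably follows this shape
# (a sketch, not the actual helper): build input containing np.inf and assert that
# QuantileDMatrix construction rejects it.
#
#   X = rng.random((32, 8))
#   X[0, 0] = np.inf
#   with pytest.raises(ValueError):
#       xgb.QuantileDMatrix(X, rng.random(32))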
@given( strategies.integers(1, 1000), diff --git a/tests/python/test_ranking.py b/tests/python/test_ranking.py index 239271ec7..30de920f7 100644 --- a/tests/python/test_ranking.py +++ b/tests/python/test_ranking.py @@ -82,7 +82,7 @@ class TestRanking: """ cls.dpath = 'demo/rank/' (x_train, y_train, qid_train, x_test, y_test, qid_test, - x_valid, y_valid, qid_valid) = tm.get_mq2008(cls.dpath) + x_valid, y_valid, qid_valid) = tm.data.get_mq2008(cls.dpath) # instantiate the matrices cls.dtrain = xgboost.DMatrix(x_train, y_train) diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index be72793e7..dd710f6a4 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -11,6 +11,7 @@ from xgboost import testing as tm from xgboost.testing.params import ( cat_parameter_strategy, exact_parameter_strategy, + hist_multi_parameter_strategy, hist_parameter_strategy, ) from xgboost.testing.updater import check_init_estimation, check_quantile_loss @@ -18,11 +19,70 @@ from xgboost.testing.updater import check_init_estimation, check_quantile_loss def train_result(param, dmat, num_rounds): result = {} - xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False, - evals_result=result) + booster = xgb.train( + param, + dmat, + num_rounds, + [(dmat, "train")], + verbose_eval=False, + evals_result=result, + ) + assert booster.num_features() == dmat.num_col() + assert booster.num_boosted_rounds() == num_rounds + assert booster.feature_names == dmat.feature_names + assert booster.feature_types == dmat.feature_types + return result +class TestTreeMethodMulti: + @given( + exact_parameter_strategy, strategies.integers(1, 20), tm.multi_dataset_strategy + ) + @settings(deadline=None, print_blob=True) + def test_exact(self, param: dict, num_rounds: int, dataset: tm.TestDataset) -> None: + if dataset.name.endswith("-l1"): + return + param["tree_method"] = "exact" + param = dataset.set_params(param) + result = train_result(param, dataset.get_dmat(), num_rounds) + assert tm.non_increasing(result["train"][dataset.metric]) + + @given( + exact_parameter_strategy, + hist_parameter_strategy, + strategies.integers(1, 20), + tm.multi_dataset_strategy, + ) + @settings(deadline=None, print_blob=True) + def test_approx(self, param, hist_param, num_rounds, dataset): + param["tree_method"] = "approx" + param = dataset.set_params(param) + param.update(hist_param) + result = train_result(param, dataset.get_dmat(), num_rounds) + note(result) + assert tm.non_increasing(result["train"][dataset.metric]) + + @given( + exact_parameter_strategy, + hist_multi_parameter_strategy, + strategies.integers(1, 20), + tm.multi_dataset_strategy, + ) + @settings(deadline=None, print_blob=True) + def test_hist( + self, param: dict, hist_param: dict, num_rounds: int, dataset: tm.TestDataset + ) -> None: + if dataset.name.endswith("-l1"): + return + param["tree_method"] = "hist" + param = dataset.set_params(param) + param.update(hist_param) + result = train_result(param, dataset.get_dmat(), num_rounds) + note(result) + assert tm.non_increasing(result["train"][dataset.metric]) + + class TestTreeMethod: USE_ONEHOT = np.iinfo(np.int32).max USE_PART = 1 @@ -77,10 +137,14 @@ class TestTreeMethod: # Second prune should not change the tree assert after_prune == second_prune - @given(exact_parameter_strategy, hist_parameter_strategy, strategies.integers(1, 20), - tm.dataset_strategy) + @given( + exact_parameter_strategy, + hist_parameter_strategy, + strategies.integers(1, 20), + tm.dataset_strategy + ) 
@settings(deadline=None, print_blob=True) - def test_hist(self, param, hist_param, num_rounds, dataset): + def test_hist(self, param: dict, hist_param: dict, num_rounds: int, dataset: tm.TestDataset) -> None: param['tree_method'] = 'hist' param = dataset.set_params(param) param.update(hist_param) @@ -88,23 +152,6 @@ class TestTreeMethod: note(result) assert tm.non_increasing(result['train'][dataset.metric]) - @given(tm.sparse_datasets_strategy) - @settings(deadline=None, print_blob=True) - def test_sparse(self, dataset): - param = {"tree_method": "hist", "max_bin": 64} - hist_result = train_result(param, dataset.get_dmat(), 16) - note(hist_result) - assert tm.non_increasing(hist_result['train'][dataset.metric]) - - param = {"tree_method": "approx", "max_bin": 64} - approx_result = train_result(param, dataset.get_dmat(), 16) - note(approx_result) - assert tm.non_increasing(approx_result['train'][dataset.metric]) - - np.testing.assert_allclose( - hist_result["train"]["rmse"], approx_result["train"]["rmse"] - ) - def test_hist_categorical(self): # hist must be same as exact on all-categorial data dpath = 'demo/data/' @@ -143,6 +190,23 @@ class TestTreeMethod: w = [0, 0, 1, 0] model.fit(X, y, sample_weight=w) + @given(tm.sparse_datasets_strategy) + @settings(deadline=None, print_blob=True) + def test_sparse(self, dataset): + param = {"tree_method": "hist", "max_bin": 64} + hist_result = train_result(param, dataset.get_dmat(), 16) + note(hist_result) + assert tm.non_increasing(hist_result['train'][dataset.metric]) + + param = {"tree_method": "approx", "max_bin": 64} + approx_result = train_result(param, dataset.get_dmat(), 16) + note(approx_result) + assert tm.non_increasing(approx_result['train'][dataset.metric]) + + np.testing.assert_allclose( + hist_result["train"]["rmse"], approx_result["train"]["rmse"] + ) + def run_invalid_category(self, tree_method: str) -> None: rng = np.random.default_rng() # too large @@ -365,7 +429,7 @@ class TestTreeMethod: ) -> None: cat_parameters.update(hist_parameters) dataset = tm.TestDataset( - "ames_housing", tm.get_ames_housing, "reg:squarederror", "rmse" + "ames_housing", tm.data.get_ames_housing, "reg:squarederror", "rmse" ) cat_parameters["tree_method"] = tree_method results = train_result(cat_parameters, dataset.get_dmat(), 16) diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index baef690ee..c34b7d2d1 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -128,12 +128,23 @@ def test_ranking(): x_test = np.random.rand(100, 10) - params = {'tree_method': 'exact', 'objective': 'rank:pairwise', - 'learning_rate': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1, - 'max_depth': 6, 'n_estimators': 4} + params = { + "tree_method": "exact", + "learning_rate": 0.1, + "gamma": 1.0, + "min_child_weight": 0.1, + "max_depth": 6, + "eval_metric": "ndcg", + "n_estimators": 4, + } model = xgb.sklearn.XGBRanker(**params) - model.fit(x_train, y_train, group=train_group, - eval_set=[(x_valid, y_valid)], eval_group=[valid_group]) + model.fit( + x_train, + y_train, + group=train_group, + eval_set=[(x_valid, y_valid)], + eval_group=[valid_group], + ) assert model.evals_result() pred = model.predict(x_test) @@ -145,11 +156,18 @@ def test_ranking(): assert train_data.get_label().shape[0] == x_train.shape[0] valid_data.set_group(valid_group) - params_orig = {'tree_method': 'exact', 'objective': 'rank:pairwise', - 'eta': 0.1, 'gamma': 1.0, - 'min_child_weight': 0.1, 'max_depth': 6} - xgb_model_orig = 
xgb.train(params_orig, train_data, num_boost_round=4, - evals=[(valid_data, 'validation')]) + params_orig = { + "tree_method": "exact", + "objective": "rank:pairwise", + "eta": 0.1, + "gamma": 1.0, + "min_child_weight": 0.1, + "max_depth": 6, + "eval_metric": "ndcg", + } + xgb_model_orig = xgb.train( + params_orig, train_data, num_boost_round=4, evals=[(valid_data, "validation")] + ) pred_orig = xgb_model_orig.predict(test_data) np.testing.assert_almost_equal(pred, pred_orig) @@ -165,7 +183,11 @@ def test_ranking_metric() -> None: # sklearn compares the number of mis-classified docs, while the one in xgboost # compares the number of mis-classified pairs. ltr = xgb.XGBRanker( - eval_metric=roc_auc_score, n_estimators=10, tree_method="hist", max_depth=2 + eval_metric=roc_auc_score, + n_estimators=10, + tree_method="hist", + max_depth=2, + objective="rank:pairwise", ) ltr.fit( X, diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index 369dcd421..0bf952025 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -1168,7 +1168,7 @@ def test_dask_aft_survival() -> None: def test_dask_ranking(client: "Client") -> None: dpath = "demo/rank/" - mq2008 = tm.get_mq2008(dpath) + mq2008 = tm.data.get_mq2008(dpath) data = [] for d in mq2008: if isinstance(d, scipy.sparse.csr_matrix):
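# For reference, the qid-based ranking setup used by these tests in a self-contained
# form (synthetic data; shapes are illustrative only):
#
#   import numpy as np
#   import xgboost
#
#   rng = np.random.default_rng(0)
#   X = rng.random((100, 10))
#   y = rng.integers(0, 4, size=100).astype(float)
#   qid = np.sort(rng.integers(0, 10, size=100))  # rows must be grouped by query id
#   dtrain = xgboost.DMatrix(X, y, qid=qid)
#   bst = xgboost.train({"objective": "rank:ndcg", "eval_metric": "ndcg"}, dtrain, 10)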