initial merge

amdsc21 2023-03-25 04:31:55 +01:00
commit 7fbc561e17
146 changed files with 6730 additions and 4082 deletions

.gitattributes vendored Normal file

@ -0,0 +1,18 @@
* text=auto
*.c text eol=lf
*.h text eol=lf
*.cc text eol=lf
*.cuh text eol=lf
*.cu text eol=lf
*.py text eol=lf
*.txt text eol=lf
*.R text eol=lf
*.scala text eol=lf
*.java text eol=lf
*.sh text eol=lf
*.rst text eol=lf
*.md text eol=lf
*.csv text eol=lf


@ -156,40 +156,3 @@ jobs:
xgboost \ xgboost \
cpp \ cpp \
include src python-package include src python-package
sphinx:
runs-on: ubuntu-latest
name: Build docs using Sphinx
steps:
- uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
with:
submodules: 'true'
- uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0
with:
python-version: "3.8"
architecture: 'x64'
- name: Install system packages
run: |
sudo apt-get install -y --no-install-recommends graphviz doxygen ninja-build
python -m pip install wheel setuptools awscli
python -m pip install -r doc/requirements.txt
- name: Extract branch name
shell: bash
run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
id: extract_branch
if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
- name: Run Sphinx
run: |
make -C doc html
env:
SPHINX_GIT_BRANCH: ${{ steps.extract_branch.outputs.branch }}
READTHEDOCS: "True"
- name: Publish
run: |
tar cvjf ${{ steps.extract_branch.outputs.branch }}.tar.bz2 doxygen/doc_doxygen/
python -m awscli s3 cp ./${{ steps.extract_branch.outputs.branch }}.tar.bz2 s3://xgboost-docs/doxygen/ --acl public-read
if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}


@ -1,4 +1,4 @@
<img src=https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/logo-m/xgboost.png width=135/> eXtreme Gradient Boosting <img src="https://xgboost.ai/images/logo/xgboost-logo.svg" width=135/> eXtreme Gradient Boosting
=========== ===========
[![Build Status](https://xgboost-ci.net/job/xgboost/job/master/badge/icon)](https://xgboost-ci.net/blue/organizations/jenkins/xgboost/activity) [![Build Status](https://xgboost-ci.net/job/xgboost/job/master/badge/icon)](https://xgboost-ci.net/blue/organizations/jenkins/xgboost/activity)
[![XGBoost-CI](https://github.com/dmlc/xgboost/workflows/XGBoost-CI/badge.svg?branch=master)](https://github.com/dmlc/xgboost/actions) [![XGBoost-CI](https://github.com/dmlc/xgboost/workflows/XGBoost-CI/badge.svg?branch=master)](https://github.com/dmlc/xgboost/actions)


@ -7,6 +7,12 @@ The demo is adopted from scikit-learn:
https://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py https://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py
See :doc:`/tutorials/multioutput` for more information. See :doc:`/tutorials/multioutput` for more information.
.. note::
The feature is experimental. For the `multi_output_tree` strategy, many features are
missing.
""" """
import argparse import argparse
@ -40,11 +46,18 @@ def gen_circle() -> Tuple[np.ndarray, np.ndarray]:
return X, y return X, y
def rmse_model(plot_result: bool): def rmse_model(plot_result: bool, strategy: str):
"""Draw a circle with 2-dim coordinate as target variables.""" """Draw a circle with 2-dim coordinate as target variables."""
X, y = gen_circle() X, y = gen_circle()
# Train a regressor on it # Train a regressor on it
reg = xgb.XGBRegressor(tree_method="hist", n_estimators=64) reg = xgb.XGBRegressor(
tree_method="hist",
n_estimators=128,
n_jobs=16,
max_depth=8,
multi_strategy=strategy,
subsample=0.6,
)
reg.fit(X, y, eval_set=[(X, y)]) reg.fit(X, y, eval_set=[(X, y)])
y_predt = reg.predict(X) y_predt = reg.predict(X)
@ -52,7 +65,7 @@ def rmse_model(plot_result: bool):
plot_predt(y, y_predt, "multi") plot_predt(y, y_predt, "multi")
def custom_rmse_model(plot_result: bool) -> None: def custom_rmse_model(plot_result: bool, strategy: str) -> None:
"""Train using Python implementation of Squared Error.""" """Train using Python implementation of Squared Error."""
# As the experimental support status, custom objective doesn't support matrix as # As the experimental support status, custom objective doesn't support matrix as
@ -88,9 +101,10 @@ def custom_rmse_model(plot_result: bool) -> None:
{ {
"tree_method": "hist", "tree_method": "hist",
"num_target": y.shape[1], "num_target": y.shape[1],
"multi_strategy": strategy,
}, },
dtrain=Xy, dtrain=Xy,
num_boost_round=100, num_boost_round=128,
obj=squared_log, obj=squared_log,
evals=[(Xy, "Train")], evals=[(Xy, "Train")],
evals_result=results, evals_result=results,
@ -107,6 +121,16 @@ if __name__ == "__main__":
parser.add_argument("--plot", choices=[0, 1], type=int, default=1) parser.add_argument("--plot", choices=[0, 1], type=int, default=1)
args = parser.parse_args() args = parser.parse_args()
# Train with builtin RMSE objective # Train with builtin RMSE objective
rmse_model(args.plot == 1) # - One model per output.
rmse_model(args.plot == 1, "one_output_per_tree")
# - One model for all outputs. This is still a work in progress; many features are
# missing.
rmse_model(args.plot == 1, "multi_output_tree")
# Train with custom objective. # Train with custom objective.
custom_rmse_model(args.plot == 1) # - One model per output.
custom_rmse_model(args.plot == 1, "one_output_per_tree")
# - One model for all outputs. This is still a work in progress; many features are
# missing.
custom_rmse_model(args.plot == 1, "multi_output_tree")


@ -2,6 +2,9 @@
Collection of examples for using sklearn interface Collection of examples for using sklearn interface
================================================== ==================================================
For an introduction to XGBoost's scikit-learn estimator interface, see
:doc:`/python/sklearn_estimator`.
Created on 1 Apr 2015 Created on 1 Apr 2015
@author: Jamie Hall @author: Jamie Hall


@ -8,5 +8,5 @@ As a result it's changing quite often and we don't maintain its stability. Alon
plugin system (see ``plugin/example`` in XGBoost's source tree), users can utilize some plugin system (see ``plugin/example`` in XGBoost's source tree), users can utilize some
existing c++ headers for gaining more access to the internal of XGBoost. existing c++ headers for gaining more access to the internal of XGBoost.
* `C++ interface documentation (latest master branch) <https://xgboost.readthedocs.io/en/latest/dev/files.html>`_ * `C++ interface documentation (latest master branch) <./dev/files.html>`_
* `C++ interface documentation (last stable release) <https://xgboost.readthedocs.io/en/stable/dev/files.html>`_ * `C++ interface documentation (last stable release) <https://xgboost.readthedocs.io/en/stable/dev/files.html>`_


@ -10,7 +10,7 @@ simply look at function comments in ``include/xgboost/c_api.h``. The reference i
to sphinx with the help of breathe, which doesn't contain links to examples but might be to sphinx with the help of breathe, which doesn't contain links to examples but might be
easier to read. For the original doxygen pages please visit: easier to read. For the original doxygen pages please visit:
* `C API documentation (latest master branch) <https://xgboost.readthedocs.io/en/latest/dev/c__api_8h.html>`_ * `C API documentation (latest master branch) <./dev/c__api_8h.html>`_
* `C API documentation (last stable release) <https://xgboost.readthedocs.io/en/stable/dev/c__api_8h.html>`_ * `C API documentation (last stable release) <https://xgboost.readthedocs.io/en/stable/dev/c__api_8h.html>`_
*************** ***************


@ -13,53 +13,106 @@
# serve to show the default. # serve to show the default.
import os import os
import re import re
import shutil
import subprocess import subprocess
import sys import sys
import tarfile
import urllib.request import urllib.request
import warnings
from subprocess import call from subprocess import call
from urllib.error import HTTPError from urllib.error import HTTPError
from sh.contrib import git from sh.contrib import git
git_branch = os.getenv('SPHINX_GIT_BRANCH', default=None) CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
PROJECT_ROOT = os.path.normpath(os.path.join(CURR_PATH, os.path.pardir))
TMP_DIR = os.path.join(CURR_PATH, "tmp")
DOX_DIR = "doxygen"
def run_doxygen():
"""Run the doxygen make command in the designated folder."""
curdir = os.path.normpath(os.path.abspath(os.path.curdir))
if os.path.exists(TMP_DIR):
print(f"Delete directory {TMP_DIR}")
shutil.rmtree(TMP_DIR)
else:
print(f"Create directory {TMP_DIR}")
os.mkdir(TMP_DIR)
try:
os.chdir(PROJECT_ROOT)
if not os.path.exists(DOX_DIR):
os.mkdir(DOX_DIR)
os.chdir(os.path.join(PROJECT_ROOT, DOX_DIR))
print(
"Build doxygen at {}".format(
os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen")
)
)
subprocess.check_call(["cmake", "..", "-DBUILD_C_DOC=ON", "-GNinja"])
subprocess.check_call(["ninja", "doc_doxygen"])
src = os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen", "html")
dest = os.path.join(TMP_DIR, "dev")
print(f"Copy directory {src} -> {dest}")
shutil.copytree(src, dest)
except OSError as e:
sys.stderr.write("doxygen execution failed: %s" % e)
finally:
os.chdir(curdir)
def is_readthedocs_build():
if os.environ.get("READTHEDOCS", None) == "True":
return True
warnings.warn(
"Skipping Doxygen build... You won't have documentation for C/C++ functions. "
"Set environment variable READTHEDOCS=True if you want to build Doxygen. "
"(If you do opt in, make sure to install Doxygen, Graphviz, CMake, and C++ compiler "
"on your system.)"
)
return False
if is_readthedocs_build():
run_doxygen()
git_branch = os.getenv("SPHINX_GIT_BRANCH", default=None)
if not git_branch: if not git_branch:
# If SPHINX_GIT_BRANCH environment variable is not given, run git # If SPHINX_GIT_BRANCH environment variable is not given, run git
# to determine branch name # to determine branch name
git_branch = [ git_branch = [
re.sub(r'origin/', '', x.lstrip(' ')) for x in str( re.sub(r"origin/", "", x.lstrip(" "))
git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n') for x in str(git.branch("-r", "--contains", "HEAD")).rstrip("\n").split("\n")
] ]
git_branch = [x for x in git_branch if 'HEAD' not in x] git_branch = [x for x in git_branch if "HEAD" not in x]
else: else:
git_branch = [git_branch] git_branch = [git_branch]
print('git_branch = {}'.format(git_branch[0])) print("git_branch = {}".format(git_branch[0]))
try: try:
filename, _ = urllib.request.urlretrieve( filename, _ = urllib.request.urlretrieve(
'https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format( f"https://s3-us-west-2.amazonaws.com/xgboost-docs/{git_branch[0]}.tar.bz2"
git_branch[0])) )
call( if not os.path.exists(TMP_DIR):
'if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}' print(f"Create directory {TMP_DIR}")
.format(filename), os.mkdir(TMP_DIR)
shell=True) jvm_doc_dir = os.path.join(TMP_DIR, "jvm")
if os.path.exists(jvm_doc_dir):
print(f"Delete directory {jvm_doc_dir}")
shutil.rmtree(jvm_doc_dir)
print(f"Create directory {jvm_doc_dir}")
os.mkdir(jvm_doc_dir)
with tarfile.open(filename, "r:bz2") as t:
t.extractall(jvm_doc_dir)
except HTTPError: except HTTPError:
print('JVM doc not found. Skipping...') print("JVM doc not found. Skipping...")
try:
filename, _ = urllib.request.urlretrieve(
'https://s3-us-west-2.amazonaws.com/xgboost-docs/doxygen/{}.tar.bz2'.
format(git_branch[0]))
call(
'mkdir -p tmp/dev; cd tmp/dev; tar xvf {}; mv doc_doxygen/html/* .; rm -rf doc_doxygen'
.format(filename),
shell=True)
except HTTPError:
print('C API doc not found. Skipping...')
# If extensions (or modules to document with autodoc) are in another directory, # If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the # add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here. # documentation root, use os.path.abspath to make it absolute, like shown here.
CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
PROJECT_ROOT = os.path.normpath(os.path.join(CURR_PATH, os.path.pardir))
libpath = os.path.join(PROJECT_ROOT, "python-package/") libpath = os.path.join(PROJECT_ROOT, "python-package/")
sys.path.insert(0, libpath) sys.path.insert(0, libpath)
sys.path.insert(0, CURR_PATH) sys.path.insert(0, CURR_PATH)
@ -82,50 +135,56 @@ release = xgboost.__version__
# Add any Sphinx extension module names here, as strings. They can be # Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones
extensions = [ extensions = [
'matplotlib.sphinxext.plot_directive', "matplotlib.sphinxext.plot_directive",
'sphinx.ext.autodoc', "sphinxcontrib.jquery",
'sphinx.ext.napoleon', "sphinx.ext.autodoc",
'sphinx.ext.mathjax', "sphinx.ext.napoleon",
'sphinx.ext.intersphinx', "sphinx.ext.mathjax",
"sphinx.ext.intersphinx",
"sphinx_gallery.gen_gallery", "sphinx_gallery.gen_gallery",
'breathe', "breathe",
'recommonmark' "recommonmark",
] ]
sphinx_gallery_conf = { sphinx_gallery_conf = {
# path to your example scripts # path to your example scripts
"examples_dirs": ["../demo/guide-python", "../demo/dask", "../demo/aft_survival"], "examples_dirs": ["../demo/guide-python", "../demo/dask", "../demo/aft_survival"],
# path to where to save gallery generated output # path to where to save gallery generated output
"gallery_dirs": ["python/examples", "python/dask-examples", "python/survival-examples"], "gallery_dirs": [
"python/examples",
"python/dask-examples",
"python/survival-examples",
],
"matplotlib_animations": True, "matplotlib_animations": True,
} }
autodoc_typehints = "description" autodoc_typehints = "description"
graphviz_output_format = 'png' graphviz_output_format = "png"
plot_formats = [('svg', 300), ('png', 100), ('hires.png', 300)] plot_formats = [("svg", 300), ("png", 100), ("hires.png", 300)]
plot_html_show_source_link = False plot_html_show_source_link = False
plot_html_show_formats = False plot_html_show_formats = False
# Breathe extension variables # Breathe extension variables
DOX_DIR = "doxygen" breathe_projects = {}
if is_readthedocs_build():
breathe_projects = { breathe_projects = {
"xgboost": os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen/xml") "xgboost": os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen/xml")
} }
breathe_default_project = "xgboost" breathe_default_project = "xgboost"
# Add any paths that contain templates here, relative to this directory. # Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates'] templates_path = ["_templates"]
# The suffix(es) of source filenames. # The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string: # You can specify multiple suffix as a list of string:
source_suffix = ['.rst', '.md'] source_suffix = [".rst", ".md"]
# The encoding of source files. # The encoding of source files.
# source_encoding = 'utf-8-sig' # source_encoding = 'utf-8-sig'
# The master toctree document. # The master toctree document.
master_doc = 'index' master_doc = "index"
# The language for content autogenerated by Sphinx. Refer to documentation # The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages. # for a list of supported languages.
@ -134,7 +193,7 @@ master_doc = 'index'
# Usually you set "language" from the command line for these cases. # Usually you set "language" from the command line for these cases.
language = "en" language = "en"
autoclass_content = 'both' autoclass_content = "both"
# There are two options for replacing |today|: either, you set today to some # There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used: # non-false value, then it is used:
@ -144,8 +203,10 @@ autoclass_content = 'both'
# List of patterns, relative to source directory, that match files and # List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files. # directories to ignore when looking for source files.
exclude_patterns = ['_build'] exclude_patterns = ["_build"]
html_extra_path = ['./tmp'] html_extra_path = []
if is_readthedocs_build():
html_extra_path = [TMP_DIR]
# The reST default role (used for this markup: `text`) to use for all # The reST default role (used for this markup: `text`) to use for all
# documents. # documents.
@ -163,7 +224,7 @@ html_extra_path = ['./tmp']
# show_authors = False # show_authors = False
# The name of the Pygments (syntax highlighting) style to use. # The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx' pygments_style = "sphinx"
# A list of ignored prefixes for module index sorting. # A list of ignored prefixes for module index sorting.
# modindex_common_prefix = [] # modindex_common_prefix = []
@ -186,27 +247,24 @@ html_logo = "https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/lo
html_css_files = ["css/custom.css"] html_css_files = ["css/custom.css"]
html_sidebars = { html_sidebars = {"**": ["logo-text.html", "globaltoc.html", "searchbox.html"]}
'**': ['logo-text.html', 'globaltoc.html', 'searchbox.html']
}
# Add any paths that contain custom static files (such as style sheets) here, # Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files, # relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css". # so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static'] html_static_path = ["_static"]
# Output file base name for HTML help builder. # Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc' htmlhelp_basename = project + "doc"
# -- Options for LaTeX output --------------------------------------------- # -- Options for LaTeX output ---------------------------------------------
latex_elements = { latex_elements = {}
}
# Grouping the document tree into LaTeX files. List of tuples # Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, # (source start file, target name, title,
# author, documentclass [howto, manual, or own class]). # author, documentclass [howto, manual, or own class]).
latex_documents = [ latex_documents = [
(master_doc, '%s.tex' % project, project, author, 'manual'), (master_doc, "%s.tex" % project, project, author, "manual"),
] ]
intersphinx_mapping = { intersphinx_mapping = {
@ -221,30 +279,5 @@ intersphinx_mapping = {
} }
# hook for doxygen
def run_doxygen():
"""Run the doxygen make command in the designated folder."""
curdir = os.path.normpath(os.path.abspath(os.path.curdir))
try:
os.chdir(PROJECT_ROOT)
if not os.path.exists(DOX_DIR):
os.mkdir(DOX_DIR)
os.chdir(os.path.join(PROJECT_ROOT, DOX_DIR))
subprocess.check_call(["cmake", "..", "-DBUILD_C_DOC=ON", "-GNinja"])
subprocess.check_call(["ninja", "doc_doxygen"])
except OSError as e:
sys.stderr.write("doxygen execution failed: %s" % e)
finally:
os.chdir(curdir)
def generate_doxygen_xml(app):
"""Run the doxygen make commands if we're on the ReadTheDocs server"""
read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
if read_the_docs_build:
run_doxygen()
def setup(app): def setup(app):
app.add_css_file('custom.css') app.add_css_file("custom.css")
app.connect("builder-inited", generate_doxygen_xml)


@ -226,6 +226,18 @@ Parameters for Tree Booster
list is a group of indices of features that are allowed to interact with each other. list is a group of indices of features that are allowed to interact with each other.
See :doc:`/tutorials/feature_interaction_constraint` for more information. See :doc:`/tutorials/feature_interaction_constraint` for more information.
* ``multi_strategy``, [default = ``one_output_per_tree``]
.. versionadded:: 2.0.0
.. note:: This parameter is a work in progress.
- The strategy used for training multi-target models, including multi-target regression
and multi-class classification. See :doc:`/tutorials/multioutput` for more information.
- ``one_output_per_tree``: One model for each target.
- ``multi_output_tree``: Use multi-target trees.
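Below is a minimal sketch of setting the parameter through the native training interface; the toy data is made up purely for illustration and assumes XGBoost >= 2.0.

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    # Hypothetical toy data with two targets per sample.
    rng = np.random.default_rng(0)
    X = rng.normal(size=(256, 16))
    y = np.stack([X[:, 0] ** 2, X[:, 1] + X[:, 2]], axis=1)

    Xy = xgb.DMatrix(X, label=y)
    booster = xgb.train(
        {
            "tree_method": "hist",
            # Build multi-output trees instead of one model per target.
            "multi_strategy": "multi_output_tree",
        },
        dtrain=Xy,
        num_boost_round=8,
    )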
.. _cat-param: .. _cat-param:
Parameters for Categorical Feature Parameters for Categorical Feature
@ -408,8 +420,17 @@ Specify the learning task and the corresponding learning objective. The objectiv
- ``ndcg``: `Normalized Discounted Cumulative Gain <http://en.wikipedia.org/wiki/NDCG>`_ - ``ndcg``: `Normalized Discounted Cumulative Gain <http://en.wikipedia.org/wiki/NDCG>`_
- ``map``: `Mean Average Precision <http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision>`_ - ``map``: `Mean Average Precision <http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision>`_
- ``ndcg@n``, ``map@n``: 'n' can be assigned as an integer to cut off the top positions in the lists for evaluation.
- ``ndcg-``, ``map-``, ``ndcg@n-``, ``map@n-``: In XGBoost, NDCG and MAP will evaluate the score of a list without any positive samples as 1. By adding "-" in the evaluation metric XGBoost will evaluate these score as 0 to be consistent under some conditions. The `average precision` is defined as:
.. math::
AP@l = \frac{1}{\min{(l, N)}}\sum^l_{k=1}P@k \cdot I_{(k)}
where :math:`I_{(k)}` is an indicator function that equals :math:`1` when the document at position :math:`k` is relevant and :math:`0` otherwise, :math:`P@k` is the precision at :math:`k`, and :math:`N` is the total number of relevant documents. Lastly, the `mean average precision` is defined as the weighted average across all queries (see the short sketch after this list).
- ``ndcg@n``, ``map@n``: :math:`n` can be assigned as an integer to cut off the top positions in the lists for evaluation.
- ``ndcg-``, ``map-``, ``ndcg@n-``, ``map@n-``: In XGBoost, the NDCG and MAP evaluate the score of a list without any positive samples as :math:`1`. By appending "-" to the evaluation metric name, we can ask XGBoost to evaluate these scores as :math:`0` to be consistent under some conditions.
- ``poisson-nloglik``: negative log-likelihood for Poisson regression - ``poisson-nloglik``: negative log-likelihood for Poisson regression
- ``gamma-nloglik``: negative log-likelihood for gamma regression - ``gamma-nloglik``: negative log-likelihood for gamma regression
- ``cox-nloglik``: negative partial log-likelihood for Cox proportional hazards regression - ``cox-nloglik``: negative partial log-likelihood for Cox proportional hazards regression
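A short NumPy sketch of :math:`AP@l` for a single query, referenced above; the relevance vector and the cutoff are made-up illustrative values, not part of XGBoost's API:

.. code-block:: python

    import numpy as np

    def average_precision_at(relevance: np.ndarray, l: int) -> float:
        """AP@l for one query; ``relevance`` is a 0/1 vector sorted by predicted score."""
        top = relevance[:l]
        # P@k for k = 1..l over the top-ranked documents.
        precision_at_k = np.cumsum(top) / np.arange(1, top.size + 1)
        n_relevant = int(relevance.sum())
        return float((precision_at_k * top).sum() / min(l, n_relevant))

    # Documents ranked by the model; 1 marks a relevant document.
    print(average_precision_at(np.array([1, 0, 1, 0, 0]), l=3))  # 0.8333...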


@ -10,6 +10,7 @@ Contents
.. toctree:: .. toctree::
python_intro python_intro
sklearn_estimator
python_api python_api
callbacks callbacks
model model


@ -41,6 +41,7 @@ Learning API
Scikit-Learn API Scikit-Learn API
---------------- ----------------
.. automodule:: xgboost.sklearn .. automodule:: xgboost.sklearn
.. autoclass:: xgboost.XGBRegressor .. autoclass:: xgboost.XGBRegressor
:members: :members:


@ -305,7 +305,8 @@ Scikit-Learn interface
---------------------- ----------------------
XGBoost provides an easy to use scikit-learn interface for some pre-defined models XGBoost provides an easy to use scikit-learn interface for some pre-defined models
including regression, classification and ranking. including regression, classification and ranking. See :doc:`/python/sklearn_estimator`
for more info.
.. code-block:: python .. code-block:: python


@ -0,0 +1,162 @@
##########################################
Using the Scikit-Learn Estimator Interface
##########################################
**Contents**
.. contents::
:backlinks: none
:local:
********
Overview
********
In addition to the native interface, XGBoost features a sklearn estimator interface that
conforms to the `sklearn estimator guideline
<https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator>`__. It
supports regression, classification, and learning to rank. Survival training for the
sklearn estimator interface is still a work in progress.
You can find some quick start examples at
:ref:`sphx_glr_python_examples_sklearn_examples.py`. The main advantage of using the
sklearn interface is that it works with most of the utilities provided by sklearn, such as
:py:func:`sklearn.model_selection.cross_validate`. Also, many other libraries recognize
the sklearn estimator interface thanks to its popularity.
With the sklearn estimator interface, we can train a classification model in only a
couple of lines of Python code:
.. code-block:: python
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import xgboost as xgb
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=94)
# Use "hist" for constructing the trees, with early stopping enabled.
clf = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=2)
# Fit the model, test sets are used for early stopping.
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
# Save model into JSON format.
clf.save_model("clf.json")
The ``tree_method`` parameter specifies the method used for constructing the trees, and
the ``early_stopping_rounds`` parameter enables early stopping. Early stopping can help
prevent overfitting and save time during training.
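As a small follow-up (not part of the original snippet), the saved JSON model can be
loaded back into a fresh estimator before prediction:

.. code-block:: python

    # Hypothetical continuation of the example above.
    clf2 = xgb.XGBClassifier()
    clf2.load_model("clf.json")
    print(clf2.predict(X_test)[:5])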
**************
Early Stopping
**************
As demonstrated in the previous example, early stopping can be enabled by the
``early_stopping_rounds`` parameter. Alternatively, the
:py:class:`xgboost.callback.EarlyStopping` callback can be used to specify more details
about the behavior of early stopping, including whether XGBoost should return the best
model instead of the full stack of trees:
.. code-block:: python
early_stop = xgb.callback.EarlyStopping(
rounds=2, metric_name='logloss', data_name='Validation_0', save_best=True
)
clf = xgb.XGBClassifier(tree_method="hist", callbacks=[early_stop])
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
At present, XGBoost doesn't implement data splitting logic within the estimator and relies
on the ``eval_set`` parameter of the :py:meth:`xgboost.XGBModel.fit` method. If you want
to use early stopping to prevent overfitting, you'll need to manually split your data into
training and testing sets using the :py:func:`sklearn.model_selection.train_test_split`
function from the `sklearn` library. Some other machine learning algorithms, like those in
`sklearn`, include early stopping as part of the estimator and may work with cross
validation. However, using early stopping during cross validation may not be a perfect
approach because it changes the model's number of trees for each validation fold, leading
to different models. A better approach is to retrain the model after cross validation using
the best hyperparameters along with early stopping. If you want to experiment with the
idea of using cross validation with early stopping, here is a snippet to begin with:
.. code-block:: python
from sklearn.base import clone
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold, cross_validate
import xgboost as xgb
X, y = load_breast_cancer(return_X_y=True)
def fit_and_score(estimator, X_train, X_test, y_train, y_test):
"""Fit the estimator on the train set and score it on both sets"""
estimator.fit(X_train, y_train, eval_set=[(X_test, y_test)])
train_score = estimator.score(X_train, y_train)
test_score = estimator.score(X_test, y_test)
return estimator, train_score, test_score
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=94)
clf = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=3)
results = {}
for train, test in cv.split(X, y):
X_train = X[train]
X_test = X[test]
y_train = y[train]
y_test = y[test]
est, train_score, test_score = fit_and_score(
clone(clf), X_train, X_test, y_train, y_test
)
results[est] = (train_score, test_score)
***********************************
Obtaining the native booster object
***********************************
The sklearn estimator interface primarily facilitates training and doesn't implement all
features available in XGBoost. For instance, in order to have cached predictions,
:py:class:`xgboost.DMatrix` needs to be used with :py:meth:`xgboost.Booster.predict`. One
can obtain the booster object from the sklearn interface using
:py:meth:`xgboost.XGBModel.get_booster`:
.. code-block:: python
booster = clf.get_booster()
print(booster.num_boosted_rounds())
**********
Prediction
**********
When early stopping is enabled, prediction functions including the
:py:meth:`xgboost.XGBModel.predict`, :py:meth:`xgboost.XGBModel.score`, and
:py:meth:`xgboost.XGBModel.apply` methods will use the best model automatically, meaning
that :py:attr:`xgboost.XGBModel.best_iteration` is used to specify the range of trees used
in prediction.
To have cached results for incremental prediction, please use the
:py:meth:`xgboost.Booster.predict` method instead.
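A minimal sketch of the difference, assuming the classifier from the earlier example was
trained with early stopping enabled:

.. code-block:: python

    # The sklearn wrapper slices to the best iteration automatically.
    y_pred = clf.predict(X_test)

    # With the native booster, the iteration range is specified explicitly.
    booster = clf.get_booster()
    y_pred_native = booster.predict(
        xgb.DMatrix(X_test), iteration_range=(0, clf.best_iteration + 1)
    )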
**************************
Number of parallel threads
**************************
When working with XGBoost and other sklearn tools, you can specify how many threads to
use with the ``n_jobs`` parameter. By default, XGBoost uses all the available threads on
your machine, which can lead to surprising consequences when combined with other sklearn
functions like :py:func:`sklearn.model_selection.cross_validate`. If both XGBoost and
sklearn try to use all threads, your machine may slow down significantly due to thread
thrashing. To avoid this, set the ``n_jobs`` parameter for XGBoost to `None` (which uses
all threads) and the ``n_jobs`` parameter for sklearn to `1`. This way, the two libraries
can work together without oversubscribing the CPU.
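A sketch of that division of labor with :py:func:`sklearn.model_selection.cross_validate`;
the dataset and fold count are arbitrary:

.. code-block:: python

    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import cross_validate

    import xgboost as xgb

    X, y = load_breast_cancer(return_X_y=True)

    # Let XGBoost use all threads and keep sklearn single-threaded to avoid
    # oversubscribing the CPU.
    clf = xgb.XGBClassifier(tree_method="hist", n_jobs=None)
    scores = cross_validate(clf, X, y, cv=5, n_jobs=1)
    print(scores["test_score"])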


@ -134,7 +134,7 @@ c. Assertion technique: It works both in C/ C++. If expression evaluates to 0 (f
// do something with booster // do something with booster
//free the memory //free the memory
XGBoosterFree(booster) XGBoosterFree(booster);
DMatrixHandle DMatrixHandle_param; DMatrixHandle DMatrixHandle_param;
@ -156,7 +156,7 @@ c. Assertion technique: It works both in C/ C++. If expression evaluates to 0 (f
.. code-block:: c .. code-block:: c
BoosterHandle booster; BoosterHandle booster;
XGBoosterSetParam(booster, "paramter_name", "0.1"); XGBoosterSetParam(booster, "parameter_name", "0.1");
************************************************************** **************************************************************


@ -190,9 +190,9 @@ Scikit-Learn wrapper object:
booster = cls.get_booster() booster = cls.get_booster()
********************** ********************************
Scikit-Learn interface Scikit-Learn Estimator Interface
********************** ********************************
As mentioned previously, there's another interface that mimics the scikit-learn estimators As mentioned previously, there's another interface that mimics the scikit-learn estimators
with a higher level of abstraction. The interface is easier to use compared to the with a higher level of abstraction. The interface is easier to use compared to the
@ -488,13 +488,14 @@ with dask and optuna.
Troubleshooting Troubleshooting
*************** ***************
.. versionadded:: 1.6.0
In some environments XGBoost might fail to resolve the IP address of the scheduler, a - In some environments XGBoost might fail to resolve the IP address of the scheduler, a
symptom is user receiving ``OSError: [Errno 99] Cannot assign requested address`` error symptom is user receiving ``OSError: [Errno 99] Cannot assign requested address`` error
during training. A quick workaround is to specify the address explicitly. To do that during training. A quick workaround is to specify the address explicitly. To do that
dask config is used: dask config is used:
.. versionadded:: 1.6.0
.. code-block:: python .. code-block:: python
import dask import dask
@ -511,10 +512,20 @@ dask config is used:
reg = dxgb.DaskXGBRegressor() reg = dxgb.DaskXGBRegressor()
Please note that XGBoost requires a different port than dask. By default, on a unix-like - Please note that XGBoost requires a different port than dask. By default, on a unix-like
system XGBoost uses the port 0 to find available ports, which may fail if a user is system XGBoost uses the port 0 to find available ports, which may fail if a user is
running in a restricted docker environment. In this case, please open additional ports in running in a restricted docker environment. In this case, please open additional ports
the container and specify it as in the above snippet. in the container and specify it as in the above snippet.
- If you encounter an NCCL system error while training with GPU enabled, which usually
  includes the error message `NCCL failure: unhandled system error`, you can specify the
  network configuration using one of the environment variables listed in the `NCCL
  documentation <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html>`__,
  such as ``NCCL_SOCKET_IFNAME``. In addition, you can use ``NCCL_DEBUG`` to obtain debug
  logs. A short sketch follows this list.
- MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error message
that includes `Multiple processes within a communication group ...` upon initialization.
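As a rough illustration of the NCCL item above, the relevant environment variables can be
set before training; the interface name here is an assumption and must match your
cluster's network device. In a Dask cluster the variables need to be present in the
environment of every worker process, not only on the client.

.. code-block:: python

    import os

    # Hypothetical values: bind NCCL to a specific network interface and turn on
    # verbose logging for diagnosis.
    os.environ["NCCL_SOCKET_IFNAME"] = "eth0"
    os.environ["NCCL_DEBUG"] = "INFO"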
************ ************
IPv6 Support IPv6 Support
@ -564,6 +575,69 @@ computations, one can explicitly wait for results of input data before construct
Also dask's `diagnostics dashboard <https://distributed.dask.org/en/latest/web.html>`_ can be used to Also dask's `diagnostics dashboard <https://distributed.dask.org/en/latest/web.html>`_ can be used to
monitor what operations are currently being performed. monitor what operations are currently being performed.
*******************
Reproducible Result
*******************
In single-node mode, we can always expect the same training result between runs as long
as the underlying platform is the same. However, it's difficult to obtain reproducible
results in a distributed environment, since tasks may receive different machine
allocations or different amounts of available resources during different sessions. There
are heuristics and guidelines on how to achieve reproducibility, but there is no proven
method for guaranteeing such deterministic behavior. The Dask interface in XGBoost tries
to provide reproducible results on a best-effort basis. This section highlights some known
criteria and shares some insights into the issue.
There are primarily two different tasks for XGBoost to carry out: training and
inference. Inference is reproducible given the same software and hardware along with the
same run-time configuration. The remainder of this section focuses on training.
Many of the challenges come from the fact that we are using approximation algorithms: the
sketching algorithm used to find histogram bins is an approximation to the exact quantile
algorithm, the `AUC` metric in a distributed environment is an approximation to the exact
`AUC` score, and floating-point numbers are an approximation to real numbers. Floating
point is an issue because its summation is not associative, meaning :math:`(a + b) + c`
does not necessarily equal :math:`a + (b + c)`, even though this property holds for real
numbers. As a result, whenever we change the order of a summation, the result can
differ. This imposes the requirement that, in order to have reproducible output from
XGBoost, the entire pipeline needs to be reproducible.
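A two-line demonstration of the non-associativity of floating-point summation:

.. code-block:: python

    a, b, c = 0.1, 0.2, 0.3
    print((a + b) + c == a + (b + c))  # False: 0.6000000000000001 vs 0.6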
- The software stack is the same for each run. This goes without saying: XGBoost might
  generate different outputs between different versions. This is expected, as we might
  change the default value of a hyper-parameter or the parallel strategy, which produces a
  different floating-point result. We guarantee the correctness of the algorithms, but
  there is plenty of wiggle room in the final output. The situation is similar for many
  dependencies; for instance, the random number generator might differ from platform to
  platform.
- The hardware stack is the same for each run. This includes the number of workers and
  the amount of available resources on each worker. XGBoost can generate different results
  with a different number of workers. This is caused by the approximation issue mentioned
  previously.
- Similar to the hardware constraint, the network topology is also a factor in the final
  output. If we change the topology, the workers might be ordered differently, leading to
  a different ordering of floating-point operations.
- The random seeds used in various places of the pipeline are the same.
- The partitioning of data needs to be reproducible. This is related to the available
resources on each worker. Dask might partition the data differently for each run
according to its own scheduling policy. For instance, if there are some additional tasks
in the cluster while you are running the second training session for XGBoost, some of
  the workers might have constrained memory and Dask may not push the training data for
  XGBoost to those workers. This change in data partitioning can lead to different output
models. If you are using a shared Dask cluster, then the result is likely to vary
between runs.
- The operations performed on dataframes need to be reproducible. Some operations, like
  `DataFrame.merge`, are not deterministic on parallel hardware such as GPUs, where the
  order of the index might differ from run to run.
Due to the aforementioned criteria, it is expected that training the model in a
distributed environment will produce results that differ from training on a single node.
************ ************
Memory Usage Memory Usage
************ ************


@ -11,7 +11,11 @@ can be simultaneously classified as both sci-fi and comedy. For detailed explan
terminologies related to different multi-output models please refer to the terminologies related to different multi-output models please refer to the
:doc:`scikit-learn user guide <sklearn:modules/multiclass>`. :doc:`scikit-learn user guide <sklearn:modules/multiclass>`.
Internally, XGBoost builds one model for each target similar to sklearn meta estimators, **********************************
Training with One-Model-Per-Target
**********************************
By default, XGBoost builds one model for each target similar to sklearn meta estimators,
with the added benefit of reusing data and other integrated features like SHAP. For a with the added benefit of reusing data and other integrated features like SHAP. For a
worked example of regression, see worked example of regression, see
:ref:`sphx_glr_python_examples_multioutput_regression.py`. For multi-label classification, :ref:`sphx_glr_python_examples_multioutput_regression.py`. For multi-label classification,
@ -36,3 +40,26 @@ dense matrix for labels.
The feature is still under development with limited support from objectives and metrics. The feature is still under development with limited support from objectives and metrics.
*************************
Training with Vector Leaf
*************************
.. versionadded:: 2.0
.. note::
This is still a work in progress, and many features are missing.
XGBoost can optionally build multi-output trees with the size of the leaf equal to the
number of targets when the tree method `hist` is used. The behavior can be controlled by
the ``multi_strategy`` training parameter, which can take the value `one_output_per_tree`
(the default) for building one model per target, or `multi_output_tree` for building
multi-output trees.
.. code-block:: python
clf = xgb.XGBClassifier(tree_method="hist", multi_strategy="multi_output_tree")
See :ref:`sphx_glr_python_examples_multioutput_regression.py` for a worked example with
regression.


@ -116,6 +116,18 @@ class DMatrixCache {
* \param cache_size Maximum size of the cache. * \param cache_size Maximum size of the cache.
*/ */
explicit DMatrixCache(std::size_t cache_size) : max_size_{cache_size} {} explicit DMatrixCache(std::size_t cache_size) : max_size_{cache_size} {}
DMatrixCache& operator=(DMatrixCache&& that) {
CHECK(lock_.try_lock());
lock_.unlock();
CHECK(that.lock_.try_lock());
that.lock_.unlock();
std::swap(this->container_, that.container_);
std::swap(this->queue_, that.queue_);
std::swap(this->max_size_, that.max_size_);
return *this;
}
/** /**
* \brief Cache a new DMatrix if it's not in the cache already. * \brief Cache a new DMatrix if it's not in the cache already.
* *
@ -149,6 +161,26 @@ class DMatrixCache {
} }
return container_.at(key).value; return container_.at(key).value;
} }
/**
* \brief Re-initialize the item in cache.
*
* Since the shared_ptr is used to hold the item, any reference that lives outside of
* the cache can no longer be reached from the cache.
*
* We use reset instead of erase to avoid walking through the whole cache for renewing
* a single item (the cache is FIFO and needs to maintain its order).
*/
template <typename... Args>
std::shared_ptr<CacheT> ResetItem(std::shared_ptr<DMatrix> m, Args const&... args) {
std::lock_guard<std::mutex> guard{lock_};
CheckConsistent();
auto key = Key{m.get(), std::this_thread::get_id()};
auto it = container_.find(key);
CHECK(it != container_.cend());
it->second = {m, std::make_shared<CacheT>(args...)};
CheckConsistent();
return it->second.value;
}
/** /**
* \brief Get a const reference to the underlying hash map. Clear expired caches before * \brief Get a const reference to the underlying hash map. Clear expired caches before
* returning. * returning.


@ -171,6 +171,15 @@ class MetaInfo {
*/ */
void Extend(MetaInfo const& that, bool accumulate_rows, bool check_column); void Extend(MetaInfo const& that, bool accumulate_rows, bool check_column);
/**
* @brief Synchronize the number of columns across all workers.
*
* Normally we just need to find the maximum number of columns across all workers, but
* in vertical federated learning, since each worker loads its own list of columns,
* we need to sum them.
*/
void SynchronizeNumberOfColumns();
private: private:
void SetInfoFromHost(Context const& ctx, StringView key, Json arr); void SetInfoFromHost(Context const& ctx, StringView key, Json arr);
void SetInfoFromCUDA(Context const& ctx, StringView key, Json arr); void SetInfoFromCUDA(Context const& ctx, StringView key, Json arr);
@ -325,6 +334,10 @@ class SparsePage {
* \brief Check whether the column index is sorted. * \brief Check whether the column index is sorted.
*/ */
bool IsIndicesSorted(int32_t n_threads) const; bool IsIndicesSorted(int32_t n_threads) const;
/**
* \brief Reindex the column index with an offset.
*/
void Reindex(uint64_t feature_offset, int32_t n_threads);
void SortRows(int32_t n_threads); void SortRows(int32_t n_threads);
@ -563,13 +576,14 @@ class DMatrix {
* \param missing Values to count as missing. * \param missing Values to count as missing.
* \param nthread Number of threads for construction. * \param nthread Number of threads for construction.
* \param cache_prefix (Optional) The cache prefix for external memory. * \param cache_prefix (Optional) The cache prefix for external memory.
* \param page_size (Optional) Size of the page. * \param data_split_mode (Optional) Data split mode.
* *
* \return a Created DMatrix. * \return a Created DMatrix.
*/ */
template <typename AdapterT> template <typename AdapterT>
static DMatrix* Create(AdapterT* adapter, float missing, int nthread, static DMatrix* Create(AdapterT* adapter, float missing, int nthread,
const std::string& cache_prefix = ""); const std::string& cache_prefix = "",
DataSplitMode data_split_mode = DataSplitMode::kRow);
/** /**
* \brief Create a new Quantile based DMatrix used for histogram based algorithm. * \brief Create a new Quantile based DMatrix used for histogram based algorithm.


@ -9,7 +9,6 @@
#define XGBOOST_GBM_H_ #define XGBOOST_GBM_H_
#include <dmlc/registry.h> #include <dmlc/registry.h>
#include <dmlc/any.h>
#include <xgboost/base.h> #include <xgboost/base.h>
#include <xgboost/data.h> #include <xgboost/data.h>
#include <xgboost/host_device_vector.h> #include <xgboost/host_device_vector.h>


@ -1,5 +1,5 @@
/*! /**
* Copyright (c) by Contributors 2019-2022 * Copyright 2019-2023, XGBoost Contributors
*/ */
#ifndef XGBOOST_JSON_IO_H_ #ifndef XGBOOST_JSON_IO_H_
#define XGBOOST_JSON_IO_H_ #define XGBOOST_JSON_IO_H_
@ -17,44 +17,26 @@
#include <vector> #include <vector>
namespace xgboost { namespace xgboost {
namespace detail { /**
// Whether char is signed is undefined, as a result we might or might not need
// static_cast and std::to_string.
template <typename Char, std::enable_if_t<std::is_signed<Char>::value>* = nullptr>
std::string CharToStr(Char c) {
static_assert(std::is_same<Char, char>::value);
return std::string{c};
}
template <typename Char, std::enable_if_t<!std::is_signed<Char>::value>* = nullptr>
std::string CharToStr(Char c) {
static_assert(std::is_same<Char, char>::value);
return (c <= static_cast<char>(127) ? std::string{c} : std::to_string(c));
}
} // namespace detail
/*
* \brief A json reader, currently error checking and utf-8 is not fully supported. * \brief A json reader, currently error checking and utf-8 is not fully supported.
*/ */
class JsonReader { class JsonReader {
public:
using Char = std::int8_t;
protected: protected:
size_t constexpr static kMaxNumLength = size_t constexpr static kMaxNumLength = std::numeric_limits<double>::max_digits10 + 1;
std::numeric_limits<double>::max_digits10 + 1;
struct SourceLocation { struct SourceLocation {
private: private:
size_t pos_ { 0 }; // current position in raw_str_ std::size_t pos_{0}; // current position in raw_str_
public: public:
SourceLocation() = default; SourceLocation() = default;
size_t Pos() const { return pos_; } size_t Pos() const { return pos_; }
void Forward() { void Forward() { pos_++; }
pos_++; void Forward(uint32_t n) { pos_ += n; }
}
void Forward(uint32_t n) {
pos_ += n;
}
} cursor_; } cursor_;
StringView raw_str_; StringView raw_str_;
@ -62,7 +44,7 @@ class JsonReader {
protected: protected:
void SkipSpaces(); void SkipSpaces();
char GetNextChar() { Char GetNextChar() {
if (XGBOOST_EXPECT((cursor_.Pos() == raw_str_.size()), false)) { if (XGBOOST_EXPECT((cursor_.Pos() == raw_str_.size()), false)) {
return -1; return -1;
} }
@ -71,24 +53,24 @@ class JsonReader {
return ch; return ch;
} }
char PeekNextChar() { Char PeekNextChar() {
if (cursor_.Pos() == raw_str_.size()) { if (cursor_.Pos() == raw_str_.size()) {
return -1; return -1;
} }
char ch = raw_str_[cursor_.Pos()]; Char ch = raw_str_[cursor_.Pos()];
return ch; return ch;
} }
/* \brief Skip spaces and consume next character. */ /* \brief Skip spaces and consume next character. */
char GetNextNonSpaceChar() { Char GetNextNonSpaceChar() {
SkipSpaces(); SkipSpaces();
return GetNextChar(); return GetNextChar();
} }
/* \brief Consume next character without first skipping empty space, throw when the next /* \brief Consume next character without first skipping empty space, throw when the next
* character is not the expected one. * character is not the expected one.
*/ */
char GetConsecutiveChar(char expected_char) { Char GetConsecutiveChar(char expected_char) {
char result = GetNextChar(); Char result = GetNextChar();
if (XGBOOST_EXPECT(result != expected_char, false)) { Expect(expected_char, result); } if (XGBOOST_EXPECT(result != expected_char, false)) { Expect(expected_char, result); }
return result; return result;
} }
@ -96,7 +78,7 @@ class JsonReader {
void Error(std::string msg) const; void Error(std::string msg) const;
// Report expected character // Report expected character
void Expect(char c, char got) { void Expect(Char c, Char got) {
std::string msg = "Expecting: \""; std::string msg = "Expecting: \"";
msg += c; msg += c;
msg += "\", got: \""; msg += "\", got: \"";
@ -105,7 +87,7 @@ class JsonReader {
} else if (got == 0) { } else if (got == 0) {
msg += "\\0\""; msg += "\\0\"";
} else { } else {
msg += detail::CharToStr(got) + " \""; msg += std::to_string(got) + " \"";
} }
Error(msg); Error(msg);
} }


@ -286,8 +286,8 @@ struct LearnerModelParamLegacy;
* \brief Strategy for building multi-target models. * \brief Strategy for building multi-target models.
*/ */
enum class MultiStrategy : std::int32_t { enum class MultiStrategy : std::int32_t {
kComposite = 0, kOneOutputPerTree = 0,
kMonolithic = 1, kMultiOutputTree = 1,
}; };
/** /**
@ -317,7 +317,7 @@ struct LearnerModelParam {
/** /**
* \brief Strategy for building multi-target models. * \brief Strategy for building multi-target models.
*/ */
MultiStrategy multi_strategy{MultiStrategy::kComposite}; MultiStrategy multi_strategy{MultiStrategy::kOneOutputPerTree};
LearnerModelParam() = default; LearnerModelParam() = default;
// As the old `LearnerModelParamLegacy` is still used by binary IO, we keep // As the old `LearnerModelParamLegacy` is still used by binary IO, we keep
@ -338,7 +338,7 @@ struct LearnerModelParam {
void Copy(LearnerModelParam const& that); void Copy(LearnerModelParam const& that);
[[nodiscard]] bool IsVectorLeaf() const noexcept { [[nodiscard]] bool IsVectorLeaf() const noexcept {
return multi_strategy == MultiStrategy::kMonolithic; return multi_strategy == MultiStrategy::kMultiOutputTree;
} }
[[nodiscard]] bst_target_t OutputLength() const noexcept { return this->num_output_group; } [[nodiscard]] bst_target_t OutputLength() const noexcept { return this->num_output_group; }
[[nodiscard]] bst_target_t LeafLength() const noexcept { [[nodiscard]] bst_target_t LeafLength() const noexcept {


@ -30,11 +30,11 @@
// decouple it from xgboost. // decouple it from xgboost.
#ifndef LINALG_HD #ifndef LINALG_HD
#if defined(__CUDA__) || defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__) #if defined(__CUDA__) || defined(__NVCC__)
#define LINALG_HD __host__ __device__ #define LINALG_HD __host__ __device__
#else #else
#define LINALG_HD #define LINALG_HD
#endif // defined (__CUDA__) || defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__) #endif // defined (__CUDA__) || defined(__NVCC__)
#endif // LINALG_HD #endif // LINALG_HD
namespace xgboost::linalg { namespace xgboost::linalg {
@ -118,9 +118,9 @@ using IndexToTag = std::conditional_t<std::is_integral<RemoveCRType<S>>::value,
template <int32_t n, typename Fn> template <int32_t n, typename Fn>
LINALG_HD constexpr auto UnrollLoop(Fn fn) { LINALG_HD constexpr auto UnrollLoop(Fn fn) {
#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__) #if defined __CUDA_ARCH__
#pragma unroll n #pragma unroll n
#endif // defined __CUDA_ARCH__ || defined(__HIP_PLATFORM_AMD__) #endif // defined __CUDA_ARCH__
for (int32_t i = 0; i < n; ++i) { for (int32_t i = 0; i < n; ++i) {
fn(i); fn(i);
} }
@ -136,7 +136,7 @@ int32_t NativePopc(T v) {
inline LINALG_HD int Popc(uint32_t v) { inline LINALG_HD int Popc(uint32_t v) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
return __popc(v); return __popc(v);
#elif defined(__GNUC__) || defined(__clang__) || defined(__HIP_PLATFORM_AMD__) #elif defined(__GNUC__) || defined(__clang__)
return __builtin_popcount(v); return __builtin_popcount(v);
#elif defined(_MSC_VER) #elif defined(_MSC_VER)
return __popcnt(v); return __popcnt(v);
@ -148,7 +148,7 @@ inline LINALG_HD int Popc(uint32_t v) {
inline LINALG_HD int Popc(uint64_t v) { inline LINALG_HD int Popc(uint64_t v) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
return __popcll(v); return __popcll(v);
#elif defined(__GNUC__) || defined(__clang__) || defined(__HIP_PLATFORM_AMD__) #elif defined(__GNUC__) || defined(__clang__)
return __builtin_popcountll(v); return __builtin_popcountll(v);
#elif defined(_MSC_VER) && _defined(_M_X64) #elif defined(_MSC_VER) && _defined(_M_X64)
return __popcnt64(v); return __popcnt64(v);
@ -530,17 +530,17 @@ class TensorView {
/** /**
* \brief Number of items in the tensor. * \brief Number of items in the tensor.
*/ */
LINALG_HD std::size_t Size() const { return size_; } [[nodiscard]] LINALG_HD std::size_t Size() const { return size_; }
/** /**
* \brief Whether this is a contiguous array, both C and F contiguous returns true. * \brief Whether this is a contiguous array, both C and F contiguous returns true.
*/ */
LINALG_HD bool Contiguous() const { [[nodiscard]] LINALG_HD bool Contiguous() const {
return data_.size() == this->Size() || this->CContiguous() || this->FContiguous(); return data_.size() == this->Size() || this->CContiguous() || this->FContiguous();
} }
/** /**
* \brief Whether it's a c-contiguous array. * \brief Whether it's a c-contiguous array.
*/ */
LINALG_HD bool CContiguous() const { [[nodiscard]] LINALG_HD bool CContiguous() const {
StrideT stride; StrideT stride;
static_assert(std::is_same<decltype(stride), decltype(stride_)>::value); static_assert(std::is_same<decltype(stride), decltype(stride_)>::value);
// It's contiguous if the stride can be calculated from shape. // It's contiguous if the stride can be calculated from shape.
@ -550,7 +550,7 @@ class TensorView {
/** /**
* \brief Whether it's a f-contiguous array. * \brief Whether it's a f-contiguous array.
*/ */
LINALG_HD bool FContiguous() const { [[nodiscard]] LINALG_HD bool FContiguous() const {
StrideT stride; StrideT stride;
static_assert(std::is_same<decltype(stride), decltype(stride_)>::value); static_assert(std::is_same<decltype(stride), decltype(stride_)>::value);
// It's contiguous if the stride can be calculated from shape. // It's contiguous if the stride can be calculated from shape.


@ -29,11 +29,6 @@
namespace xgboost { namespace xgboost {
class Json; class Json;
#if defined(XGBOOST_USE_HIP)
#define XGBOOST_NODISCARD
#else
#define XGBOOST_NODISCARD [[nodiscard]]
#endif
// FIXME(trivialfis): Once binary IO is gone, make this parameter internal as it should // FIXME(trivialfis): Once binary IO is gone, make this parameter internal as it should
// not be configured by users. // not be configured by users.
/*! \brief meta parameters of the tree */ /*! \brief meta parameters of the tree */
@ -64,7 +59,7 @@ struct TreeParam : public dmlc::Parameter<TreeParam> {
// Swap byte order for all fields. Useful for transporting models between machines with different // Swap byte order for all fields. Useful for transporting models between machines with different
// endianness (big endian vs little endian) // endianness (big endian vs little endian)
XGBOOST_NODISCARD TreeParam ByteSwap() const { [[nodiscard]] TreeParam ByteSwap() const {
TreeParam x = *this; TreeParam x = *this;
dmlc::ByteSwap(&x.deprecated_num_roots, sizeof(x.deprecated_num_roots), 1); dmlc::ByteSwap(&x.deprecated_num_roots, sizeof(x.deprecated_num_roots), 1);
dmlc::ByteSwap(&x.num_nodes, sizeof(x.num_nodes), 1); dmlc::ByteSwap(&x.num_nodes, sizeof(x.num_nodes), 1);
@ -117,7 +112,7 @@ struct RTreeNodeStat {
} }
// Swap byte order for all fields. Useful for transporting models between machines with different // Swap byte order for all fields. Useful for transporting models between machines with different
// endianness (big endian vs little endian) // endianness (big endian vs little endian)
XGBOOST_NODISCARD RTreeNodeStat ByteSwap() const { [[nodiscard]] RTreeNodeStat ByteSwap() const {
RTreeNodeStat x = *this; RTreeNodeStat x = *this;
dmlc::ByteSwap(&x.loss_chg, sizeof(x.loss_chg), 1); dmlc::ByteSwap(&x.loss_chg, sizeof(x.loss_chg), 1);
dmlc::ByteSwap(&x.sum_hess, sizeof(x.sum_hess), 1); dmlc::ByteSwap(&x.sum_hess, sizeof(x.sum_hess), 1);
@ -183,51 +178,33 @@ class RegTree : public Model {
} }
/*! \brief index of left child */ /*! \brief index of left child */
XGBOOST_DEVICE XGBOOST_NODISCARD int LeftChild() const { [[nodiscard]] XGBOOST_DEVICE int LeftChild() const { return this->cleft_; }
return this->cleft_;
}
/*! \brief index of right child */ /*! \brief index of right child */
XGBOOST_DEVICE XGBOOST_NODISCARD int RightChild() const { [[nodiscard]] XGBOOST_DEVICE int RightChild() const { return this->cright_; }
return this->cright_;
}
/*! \brief index of default child when feature is missing */ /*! \brief index of default child when feature is missing */
XGBOOST_DEVICE XGBOOST_NODISCARD int DefaultChild() const { [[nodiscard]] XGBOOST_DEVICE int DefaultChild() const {
return this->DefaultLeft() ? this->LeftChild() : this->RightChild(); return this->DefaultLeft() ? this->LeftChild() : this->RightChild();
} }
/*! \brief feature index of split condition */ /*! \brief feature index of split condition */
XGBOOST_DEVICE XGBOOST_NODISCARD unsigned SplitIndex() const { [[nodiscard]] XGBOOST_DEVICE unsigned SplitIndex() const {
return sindex_ & ((1U << 31) - 1U); return sindex_ & ((1U << 31) - 1U);
} }
/*! \brief when feature is unknown, whether goes to left child */ /*! \brief when feature is unknown, whether goes to left child */
XGBOOST_DEVICE XGBOOST_NODISCARD bool DefaultLeft() const { [[nodiscard]] XGBOOST_DEVICE bool DefaultLeft() const { return (sindex_ >> 31) != 0; }
return (sindex_ >> 31) != 0;
}
/*! \brief whether current node is leaf node */ /*! \brief whether current node is leaf node */
XGBOOST_DEVICE XGBOOST_NODISCARD bool IsLeaf() const { [[nodiscard]] XGBOOST_DEVICE bool IsLeaf() const { return cleft_ == kInvalidNodeId; }
return cleft_ == kInvalidNodeId;
}
/*! \return get leaf value of leaf node */ /*! \return get leaf value of leaf node */
XGBOOST_DEVICE XGBOOST_NODISCARD float LeafValue() const { [[nodiscard]] XGBOOST_DEVICE float LeafValue() const { return (this->info_).leaf_value; }
return (this->info_).leaf_value;
}
/*! \return get split condition of the node */ /*! \return get split condition of the node */
XGBOOST_DEVICE XGBOOST_NODISCARD SplitCondT SplitCond() const { [[nodiscard]] XGBOOST_DEVICE SplitCondT SplitCond() const { return (this->info_).split_cond; }
return (this->info_).split_cond;
}
/*! \brief get parent of the node */ /*! \brief get parent of the node */
XGBOOST_DEVICE XGBOOST_NODISCARD int Parent() const { [[nodiscard]] XGBOOST_DEVICE int Parent() const { return parent_ & ((1U << 31) - 1); }
return parent_ & ((1U << 31) - 1);
}
/*! \brief whether current node is left child */ /*! \brief whether current node is left child */
XGBOOST_DEVICE XGBOOST_NODISCARD bool IsLeftChild() const { [[nodiscard]] XGBOOST_DEVICE bool IsLeftChild() const { return (parent_ & (1U << 31)) != 0; }
return (parent_ & (1U << 31)) != 0;
}
/*! \brief whether this node is deleted */ /*! \brief whether this node is deleted */
XGBOOST_DEVICE XGBOOST_NODISCARD bool IsDeleted() const { [[nodiscard]] XGBOOST_DEVICE bool IsDeleted() const { return sindex_ == kDeletedNodeMarker; }
return sindex_ == kDeletedNodeMarker;
}
/*! \brief whether current node is root */ /*! \brief whether current node is root */
XGBOOST_DEVICE XGBOOST_NODISCARD bool IsRoot() const { return parent_ == kInvalidNodeId; } [[nodiscard]] XGBOOST_DEVICE bool IsRoot() const { return parent_ == kInvalidNodeId; }
/*! /*!
* \brief set the left child * \brief set the left child
* \param nid node id to right child * \param nid node id to right child
@ -284,7 +261,7 @@ class RegTree : public Model {
info_.leaf_value == b.info_.leaf_value; info_.leaf_value == b.info_.leaf_value;
} }
XGBOOST_NODISCARD Node ByteSwap() const { [[nodiscard]] Node ByteSwap() const {
Node x = *this; Node x = *this;
dmlc::ByteSwap(&x.parent_, sizeof(x.parent_), 1); dmlc::ByteSwap(&x.parent_, sizeof(x.parent_), 1);
dmlc::ByteSwap(&x.cleft_, sizeof(x.cleft_), 1); dmlc::ByteSwap(&x.cleft_, sizeof(x.cleft_), 1);
@ -342,15 +319,13 @@ class RegTree : public Model {
this->ChangeToLeaf(rid, value); this->ChangeToLeaf(rid, value);
} }
/*! \brief model parameter */
TreeParam param;
RegTree() { RegTree() {
param.Init(Args{}); param_.Init(Args{});
nodes_.resize(param.num_nodes); nodes_.resize(param_.num_nodes);
stats_.resize(param.num_nodes); stats_.resize(param_.num_nodes);
split_types_.resize(param.num_nodes, FeatureType::kNumerical); split_types_.resize(param_.num_nodes, FeatureType::kNumerical);
split_categories_segments_.resize(param.num_nodes); split_categories_segments_.resize(param_.num_nodes);
for (int i = 0; i < param.num_nodes; i++) { for (int i = 0; i < param_.num_nodes; i++) {
nodes_[i].SetLeaf(0.0f); nodes_[i].SetLeaf(0.0f);
nodes_[i].SetParent(kInvalidNodeId); nodes_[i].SetParent(kInvalidNodeId);
} }
@ -359,10 +334,10 @@ class RegTree : public Model {
* \brief Constructor that initializes the tree model with shape. * \brief Constructor that initializes the tree model with shape.
*/ */
explicit RegTree(bst_target_t n_targets, bst_feature_t n_features) : RegTree{} { explicit RegTree(bst_target_t n_targets, bst_feature_t n_features) : RegTree{} {
param.num_feature = n_features; param_.num_feature = n_features;
param.size_leaf_vector = n_targets; param_.size_leaf_vector = n_targets;
if (n_targets > 1) { if (n_targets > 1) {
this->p_mt_tree_.reset(new MultiTargetTree{&param}); this->p_mt_tree_.reset(new MultiTargetTree{&param_});
} }
} }
@ -376,17 +351,17 @@ class RegTree : public Model {
} }
/*! \brief get const reference to nodes */ /*! \brief get const reference to nodes */
XGBOOST_NODISCARD const std::vector<Node>& GetNodes() const { return nodes_; } [[nodiscard]] const std::vector<Node>& GetNodes() const { return nodes_; }
/*! \brief get const reference to stats */ /*! \brief get const reference to stats */
XGBOOST_NODISCARD const std::vector<RTreeNodeStat>& GetStats() const { return stats_; } [[nodiscard]] const std::vector<RTreeNodeStat>& GetStats() const { return stats_; }
/*! \brief get node statistics given nid */ /*! \brief get node statistics given nid */
RTreeNodeStat& Stat(int nid) { RTreeNodeStat& Stat(int nid) {
return stats_[nid]; return stats_[nid];
} }
/*! \brief get node statistics given nid */ /*! \brief get node statistics given nid */
XGBOOST_NODISCARD const RTreeNodeStat& Stat(int nid) const { [[nodiscard]] const RTreeNodeStat& Stat(int nid) const {
return stats_[nid]; return stats_[nid];
} }
@ -406,7 +381,7 @@ class RegTree : public Model {
bool operator==(const RegTree& b) const { bool operator==(const RegTree& b) const {
return nodes_ == b.nodes_ && stats_ == b.stats_ && return nodes_ == b.nodes_ && stats_ == b.stats_ &&
deleted_nodes_ == b.deleted_nodes_ && param == b.param; deleted_nodes_ == b.deleted_nodes_ && param_ == b.param_;
} }
/* \brief Iterate through all nodes in this tree. /* \brief Iterate through all nodes in this tree.
* *
@ -439,7 +414,7 @@ class RegTree : public Model {
* *
* \param b The other tree. * \param b The other tree.
*/ */
XGBOOST_NODISCARD bool Equal(const RegTree& b) const; [[nodiscard]] bool Equal(const RegTree& b) const;
/** /**
* \brief Expands a leaf node into two additional leaf nodes. * \brief Expands a leaf node into two additional leaf nodes.
@ -464,7 +439,9 @@ class RegTree : public Model {
bst_float loss_change, float sum_hess, float left_sum, bst_float loss_change, float sum_hess, float left_sum,
float right_sum, float right_sum,
bst_node_t leaf_right_child = kInvalidNodeId); bst_node_t leaf_right_child = kInvalidNodeId);
/**
* \brief Expands a leaf node into two additional leaf nodes for a multi-target tree.
*/
void ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split_cond, bool default_left, void ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split_cond, bool default_left,
linalg::VectorView<float const> base_weight, linalg::VectorView<float const> base_weight,
linalg::VectorView<float const> left_weight, linalg::VectorView<float const> left_weight,
@ -490,25 +467,54 @@ class RegTree : public Model {
bst_float base_weight, bst_float left_leaf_weight, bst_float base_weight, bst_float left_leaf_weight,
bst_float right_leaf_weight, bst_float loss_change, float sum_hess, bst_float right_leaf_weight, bst_float loss_change, float sum_hess,
float left_sum, float right_sum); float left_sum, float right_sum);
/**
XGBOOST_NODISCARD bool HasCategoricalSplit() const { * \brief Whether this tree has categorical split.
return !split_categories_.empty(); */
} [[nodiscard]] bool HasCategoricalSplit() const { return !split_categories_.empty(); }
/** /**
* \brief Whether this is a multi-target tree. * \brief Whether this is a multi-target tree.
*/ */
XGBOOST_NODISCARD bool IsMultiTarget() const { return static_cast<bool>(p_mt_tree_); } [[nodiscard]] bool IsMultiTarget() const { return static_cast<bool>(p_mt_tree_); }
XGBOOST_NODISCARD bst_target_t NumTargets() const { return param.size_leaf_vector; } /**
XGBOOST_NODISCARD auto GetMultiTargetTree() const { * \brief The size of leaf weight.
*/
[[nodiscard]] bst_target_t NumTargets() const { return param_.size_leaf_vector; }
/**
   * \brief Get the underlying implementation of the multi-target tree.
*/
[[nodiscard]] auto GetMultiTargetTree() const {
CHECK(IsMultiTarget()); CHECK(IsMultiTarget());
return p_mt_tree_.get(); return p_mt_tree_.get();
} }
/**
* \brief Get the number of features.
*/
[[nodiscard]] bst_feature_t NumFeatures() const noexcept { return param_.num_feature; }
/**
* \brief Get the total number of nodes including deleted ones in this tree.
*/
[[nodiscard]] bst_node_t NumNodes() const noexcept { return param_.num_nodes; }
/**
* \brief Get the total number of valid nodes in this tree.
*/
[[nodiscard]] bst_node_t NumValidNodes() const noexcept {
return param_.num_nodes - param_.num_deleted;
}
/**
* \brief number of extra nodes besides the root
*/
[[nodiscard]] bst_node_t NumExtraNodes() const noexcept {
return param_.num_nodes - 1 - param_.num_deleted;
}
/* \brief Count number of leaves in tree. */
[[nodiscard]] bst_node_t GetNumLeaves() const;
[[nodiscard]] bst_node_t GetNumSplitNodes() const;
/*! /*!
* \brief get current depth * \brief get current depth
* \param nid node id * \param nid node id
*/ */
XGBOOST_NODISCARD std::int32_t GetDepth(bst_node_t nid) const { [[nodiscard]] std::int32_t GetDepth(bst_node_t nid) const {
if (IsMultiTarget()) { if (IsMultiTarget()) {
return this->p_mt_tree_->Depth(nid); return this->p_mt_tree_->Depth(nid);
} }
@ -519,6 +525,9 @@ class RegTree : public Model {
} }
return depth; return depth;
} }
/**
* \brief Set the leaf weight for a multi-target tree.
*/
void SetLeaf(bst_node_t nidx, linalg::VectorView<float const> weight) { void SetLeaf(bst_node_t nidx, linalg::VectorView<float const> weight) {
CHECK(IsMultiTarget()); CHECK(IsMultiTarget());
return this->p_mt_tree_->SetLeaf(nidx, weight); return this->p_mt_tree_->SetLeaf(nidx, weight);
@ -528,27 +537,15 @@ class RegTree : public Model {
* \brief get maximum depth * \brief get maximum depth
* \param nid node id * \param nid node id
*/ */
XGBOOST_NODISCARD int MaxDepth(int nid) const { [[nodiscard]] int MaxDepth(int nid) const {
if (nodes_[nid].IsLeaf()) return 0; if (nodes_[nid].IsLeaf()) return 0;
return std::max(MaxDepth(nodes_[nid].LeftChild())+1, return std::max(MaxDepth(nodes_[nid].LeftChild()) + 1, MaxDepth(nodes_[nid].RightChild()) + 1);
MaxDepth(nodes_[nid].RightChild())+1);
} }
/*! /*!
* \brief get maximum depth * \brief get maximum depth
*/ */
int MaxDepth() { int MaxDepth() { return MaxDepth(0); }
return MaxDepth(0);
}
/*! \brief number of extra nodes besides the root */
XGBOOST_NODISCARD int NumExtraNodes() const {
return param.num_nodes - 1 - param.num_deleted;
}
/* \brief Count number of leaves in tree. */
XGBOOST_NODISCARD bst_node_t GetNumLeaves() const;
XGBOOST_NODISCARD bst_node_t GetNumSplitNodes() const;
/*! /*!
* \brief dense feature vector that can be taken by RegTree * \brief dense feature vector that can be taken by RegTree
@ -575,20 +572,20 @@ class RegTree : public Model {
* \brief returns the size of the feature vector * \brief returns the size of the feature vector
* \return the size of the feature vector * \return the size of the feature vector
*/ */
XGBOOST_NODISCARD size_t Size() const; [[nodiscard]] size_t Size() const;
/*! /*!
* \brief get ith value * \brief get ith value
* \param i feature index. * \param i feature index.
* \return the i-th feature value * \return the i-th feature value
*/ */
XGBOOST_NODISCARD bst_float GetFvalue(size_t i) const; [[nodiscard]] bst_float GetFvalue(size_t i) const;
/*! /*!
* \brief check whether i-th entry is missing * \brief check whether i-th entry is missing
* \param i feature index. * \param i feature index.
* \return whether i-th value is missing. * \return whether i-th value is missing.
*/ */
XGBOOST_NODISCARD bool IsMissing(size_t i) const; [[nodiscard]] bool IsMissing(size_t i) const;
XGBOOST_NODISCARD bool HasMissing() const; [[nodiscard]] bool HasMissing() const;
private: private:
@ -619,34 +616,34 @@ class RegTree : public Model {
* \param format the format to dump the model in * \param format the format to dump the model in
* \return the string of dumped model * \return the string of dumped model
*/ */
XGBOOST_NODISCARD std::string DumpModel(const FeatureMap& fmap, bool with_stats, [[nodiscard]] std::string DumpModel(const FeatureMap& fmap, bool with_stats,
std::string format) const; std::string format) const;
/*! /*!
* \brief Get split type for a node. * \brief Get split type for a node.
* \param nidx Index of node. * \param nidx Index of node.
* \return The type of this split. For leaf node it's always kNumerical. * \return The type of this split. For leaf node it's always kNumerical.
*/ */
XGBOOST_NODISCARD FeatureType NodeSplitType(bst_node_t nidx) const { return split_types_.at(nidx); } [[nodiscard]] FeatureType NodeSplitType(bst_node_t nidx) const { return split_types_.at(nidx); }
/*! /*!
* \brief Get split types for all nodes. * \brief Get split types for all nodes.
*/ */
XGBOOST_NODISCARD std::vector<FeatureType> const& GetSplitTypes() const { [[nodiscard]] std::vector<FeatureType> const& GetSplitTypes() const {
return split_types_; return split_types_;
} }
XGBOOST_NODISCARD common::Span<uint32_t const> GetSplitCategories() const { [[nodiscard]] common::Span<uint32_t const> GetSplitCategories() const {
return split_categories_; return split_categories_;
} }
/*! /*!
* \brief Get the bit storage for categories * \brief Get the bit storage for categories
*/ */
XGBOOST_NODISCARD common::Span<uint32_t const> NodeCats(bst_node_t nidx) const { [[nodiscard]] common::Span<uint32_t const> NodeCats(bst_node_t nidx) const {
auto node_ptr = GetCategoriesMatrix().node_ptr; auto node_ptr = GetCategoriesMatrix().node_ptr;
auto categories = GetCategoriesMatrix().categories; auto categories = GetCategoriesMatrix().categories;
auto segment = node_ptr[nidx]; auto segment = node_ptr[nidx];
auto node_cats = categories.subspan(segment.beg, segment.size); auto node_cats = categories.subspan(segment.beg, segment.size);
return node_cats; return node_cats;
} }
XGBOOST_NODISCARD auto const& GetSplitCategoriesPtr() const { return split_categories_segments_; } [[nodiscard]] auto const& GetSplitCategoriesPtr() const { return split_categories_segments_; }
/** /**
* \brief CSR-like matrix for categorical splits. * \brief CSR-like matrix for categorical splits.
@ -665,7 +662,7 @@ class RegTree : public Model {
common::Span<Segment const> node_ptr; common::Span<Segment const> node_ptr;
}; };
XGBOOST_NODISCARD CategoricalSplitMatrix GetCategoriesMatrix() const { [[nodiscard]] CategoricalSplitMatrix GetCategoriesMatrix() const {
CategoricalSplitMatrix view; CategoricalSplitMatrix view;
view.split_type = common::Span<FeatureType const>(this->GetSplitTypes()); view.split_type = common::Span<FeatureType const>(this->GetSplitTypes());
view.categories = this->GetSplitCategories(); view.categories = this->GetSplitCategories();
@ -673,55 +670,55 @@ class RegTree : public Model {
return view; return view;
} }
XGBOOST_NODISCARD bst_feature_t SplitIndex(bst_node_t nidx) const { [[nodiscard]] bst_feature_t SplitIndex(bst_node_t nidx) const {
if (IsMultiTarget()) { if (IsMultiTarget()) {
return this->p_mt_tree_->SplitIndex(nidx); return this->p_mt_tree_->SplitIndex(nidx);
} }
return (*this)[nidx].SplitIndex(); return (*this)[nidx].SplitIndex();
} }
XGBOOST_NODISCARD float SplitCond(bst_node_t nidx) const { [[nodiscard]] float SplitCond(bst_node_t nidx) const {
if (IsMultiTarget()) { if (IsMultiTarget()) {
return this->p_mt_tree_->SplitCond(nidx); return this->p_mt_tree_->SplitCond(nidx);
} }
return (*this)[nidx].SplitCond(); return (*this)[nidx].SplitCond();
} }
XGBOOST_NODISCARD bool DefaultLeft(bst_node_t nidx) const { [[nodiscard]] bool DefaultLeft(bst_node_t nidx) const {
if (IsMultiTarget()) { if (IsMultiTarget()) {
return this->p_mt_tree_->DefaultLeft(nidx); return this->p_mt_tree_->DefaultLeft(nidx);
} }
return (*this)[nidx].DefaultLeft(); return (*this)[nidx].DefaultLeft();
} }
XGBOOST_NODISCARD bool IsRoot(bst_node_t nidx) const { [[nodiscard]] bool IsRoot(bst_node_t nidx) const {
if (IsMultiTarget()) { if (IsMultiTarget()) {
return nidx == kRoot; return nidx == kRoot;
} }
return (*this)[nidx].IsRoot(); return (*this)[nidx].IsRoot();
} }
XGBOOST_NODISCARD bool IsLeaf(bst_node_t nidx) const { [[nodiscard]] bool IsLeaf(bst_node_t nidx) const {
if (IsMultiTarget()) { if (IsMultiTarget()) {
return this->p_mt_tree_->IsLeaf(nidx); return this->p_mt_tree_->IsLeaf(nidx);
} }
return (*this)[nidx].IsLeaf(); return (*this)[nidx].IsLeaf();
} }
XGBOOST_NODISCARD bst_node_t Parent(bst_node_t nidx) const { [[nodiscard]] bst_node_t Parent(bst_node_t nidx) const {
if (IsMultiTarget()) { if (IsMultiTarget()) {
return this->p_mt_tree_->Parent(nidx); return this->p_mt_tree_->Parent(nidx);
} }
return (*this)[nidx].Parent(); return (*this)[nidx].Parent();
} }
XGBOOST_NODISCARD bst_node_t LeftChild(bst_node_t nidx) const { [[nodiscard]] bst_node_t LeftChild(bst_node_t nidx) const {
if (IsMultiTarget()) { if (IsMultiTarget()) {
return this->p_mt_tree_->LeftChild(nidx); return this->p_mt_tree_->LeftChild(nidx);
} }
return (*this)[nidx].LeftChild(); return (*this)[nidx].LeftChild();
} }
XGBOOST_NODISCARD bst_node_t RightChild(bst_node_t nidx) const { [[nodiscard]] bst_node_t RightChild(bst_node_t nidx) const {
if (IsMultiTarget()) { if (IsMultiTarget()) {
return this->p_mt_tree_->RightChild(nidx); return this->p_mt_tree_->RightChild(nidx);
} }
return (*this)[nidx].RightChild(); return (*this)[nidx].RightChild();
} }
XGBOOST_NODISCARD bool IsLeftChild(bst_node_t nidx) const { [[nodiscard]] bool IsLeftChild(bst_node_t nidx) const {
if (IsMultiTarget()) { if (IsMultiTarget()) {
CHECK_NE(nidx, kRoot); CHECK_NE(nidx, kRoot);
auto p = this->p_mt_tree_->Parent(nidx); auto p = this->p_mt_tree_->Parent(nidx);
@ -729,7 +726,7 @@ class RegTree : public Model {
} }
return (*this)[nidx].IsLeftChild(); return (*this)[nidx].IsLeftChild();
} }
XGBOOST_NODISCARD bst_node_t Size() const { [[nodiscard]] bst_node_t Size() const {
if (IsMultiTarget()) { if (IsMultiTarget()) {
return this->p_mt_tree_->Size(); return this->p_mt_tree_->Size();
} }
@ -740,6 +737,8 @@ class RegTree : public Model {
template <bool typed> template <bool typed>
void LoadCategoricalSplit(Json const& in); void LoadCategoricalSplit(Json const& in);
void SaveCategoricalSplit(Json* p_out) const; void SaveCategoricalSplit(Json* p_out) const;
/*! \brief model parameter */
TreeParam param_;
// vector of nodes // vector of nodes
std::vector<Node> nodes_; std::vector<Node> nodes_;
// free node space, used during training process // free node space, used during training process
@ -757,20 +756,20 @@ class RegTree : public Model {
// allocate a new node, // allocate a new node,
// !!!!!! NOTE: may cause BUG here, nodes.resize // !!!!!! NOTE: may cause BUG here, nodes.resize
bst_node_t AllocNode() { bst_node_t AllocNode() {
if (param.num_deleted != 0) { if (param_.num_deleted != 0) {
int nid = deleted_nodes_.back(); int nid = deleted_nodes_.back();
deleted_nodes_.pop_back(); deleted_nodes_.pop_back();
nodes_[nid].Reuse(); nodes_[nid].Reuse();
--param.num_deleted; --param_.num_deleted;
return nid; return nid;
} }
int nd = param.num_nodes++; int nd = param_.num_nodes++;
CHECK_LT(param.num_nodes, std::numeric_limits<int>::max()) CHECK_LT(param_.num_nodes, std::numeric_limits<int>::max())
<< "number of nodes in the tree exceed 2^31"; << "number of nodes in the tree exceed 2^31";
nodes_.resize(param.num_nodes); nodes_.resize(param_.num_nodes);
stats_.resize(param.num_nodes); stats_.resize(param_.num_nodes);
split_types_.resize(param.num_nodes, FeatureType::kNumerical); split_types_.resize(param_.num_nodes, FeatureType::kNumerical);
split_categories_segments_.resize(param.num_nodes); split_categories_segments_.resize(param_.num_nodes);
return nd; return nd;
} }
// delete a tree node, keep the parent field to allow trace back // delete a tree node, keep the parent field to allow trace back
@ -785,7 +784,7 @@ class RegTree : public Model {
deleted_nodes_.push_back(nid); deleted_nodes_.push_back(nid);
nodes_[nid].MarkDelete(); nodes_[nid].MarkDelete();
++param.num_deleted; ++param_.num_deleted;
} }
}; };

View File

@ -37,7 +37,7 @@
<spark.version>3.1.1</spark.version> <spark.version>3.1.1</spark.version>
<scala.version>2.12.8</scala.version> <scala.version>2.12.8</scala.version>
<scala.binary.version>2.12</scala.binary.version> <scala.binary.version>2.12</scala.binary.version>
<hadoop.version>3.3.4</hadoop.version> <hadoop.version>3.3.5</hadoop.version>
<maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count> <maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count>
<log.capi.invocation>OFF</log.capi.invocation> <log.capi.invocation>OFF</log.capi.invocation>
<use.cuda>OFF</use.cuda> <use.cuda>OFF</use.cuda>
@ -118,7 +118,7 @@
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId> <artifactId>maven-release-plugin</artifactId>
<version>2.5.3</version> <version>3.0.0</version>
<configuration> <configuration>
<autoVersionSubmodules>true</autoVersionSubmodules> <autoVersionSubmodules>true</autoVersionSubmodules>
<useReleaseProfile>false</useReleaseProfile> <useReleaseProfile>false</useReleaseProfile>
@ -427,7 +427,7 @@
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId> <artifactId>maven-surefire-plugin</artifactId>
<version>2.22.2</version> <version>3.0.0</version>
<configuration> <configuration>
<skipTests>false</skipTests> <skipTests>false</skipTests>
<useSystemClassLoader>false</useSystemClassLoader> <useSystemClassLoader>false</useSystemClassLoader>

View File

@ -51,7 +51,7 @@
<dependency> <dependency>
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId> <artifactId>hadoop-common</artifactId>
<version>3.3.4</version> <version>3.3.5</version>
</dependency> </dependency>
</dependencies> </dependencies>

View File

@ -41,13 +41,13 @@
<dependency> <dependency>
<groupId>com.typesafe.akka</groupId> <groupId>com.typesafe.akka</groupId>
<artifactId>akka-actor_${scala.binary.version}</artifactId> <artifactId>akka-actor_${scala.binary.version}</artifactId>
<version>2.7.0</version> <version>2.6.20</version>
<scope>compile</scope> <scope>compile</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.typesafe.akka</groupId> <groupId>com.typesafe.akka</groupId>
<artifactId>akka-testkit_${scala.binary.version}</artifactId> <artifactId>akka-testkit_${scala.binary.version}</artifactId>
<version>2.7.0</version> <version>2.6.20</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency> <dependency>

View File

@ -84,9 +84,10 @@ public class BoosterTest {
}; };
try (Table tmpTable = Table.readCSV(schema, opts, new File(trainingDataPath))) { try (Table tmpTable = Table.readCSV(schema, opts, new File(trainingDataPath))) {
ColumnVector[] df = new ColumnVector[12]; ColumnVector[] df = new ColumnVector[10];
for (int i = 0; i < 12; ++i) { // exclude the first two columns, they are label bounds and contain inf.
df[i] = tmpTable.getColumn(i); for (int i = 2; i < 12; ++i) {
df[i - 2] = tmpTable.getColumn(i);
} }
try (Table X = new Table(df);) { try (Table X = new Table(df);) {
ColumnVector[] labels = new ColumnVector[1]; ColumnVector[] labels = new ColumnVector[1];

View File

@ -21,7 +21,7 @@ import java.io.File
import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier} import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}
import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.functions.{col, udf, when}
import org.apache.spark.sql.types.{FloatType, StructField, StructType} import org.apache.spark.sql.types.{FloatType, StructField, StructType}
class GpuXGBoostClassifierSuite extends GpuTestSuite { class GpuXGBoostClassifierSuite extends GpuTestSuite {
@ -47,7 +47,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
"num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist", "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist",
"features_cols" -> featureNames, "label_col" -> labelName) "features_cols" -> featureNames, "label_col" -> labelName)
val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema) val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema)
.csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1) .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0))
.randomSplit(Array(0.7, 0.3), seed = 1)
// Get a model // Get a model
val model = new XGBoostClassifier(xgbParam) val model = new XGBoostClassifier(xgbParam)
.fit(originalDf) .fit(originalDf)
@ -64,7 +65,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
"num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist", "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist",
"features_cols" -> featureNames, "label_col" -> labelName) "features_cols" -> featureNames, "label_col" -> labelName)
val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema) val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema)
.csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1) .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0))
.randomSplit(Array(0.7, 0.3), seed = 1)
val getWeightFromF1 = udf({ f1: Float => if (f1.toInt % 2 == 0) 1.0f else 0.001f }) val getWeightFromF1 = udf({ f1: Float => if (f1.toInt % 2 == 0) 1.0f else 0.001f })
val dfWithWeight = originalDf.withColumn("weight", getWeightFromF1(col("f1"))) val dfWithWeight = originalDf.withColumn("weight", getWeightFromF1(col("f1")))
@ -87,7 +89,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic", val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
"num_round" -> 10, "num_workers" -> 1) "num_round" -> 10, "num_workers" -> 1)
val Array(rawInput, testDf) = spark.read.option("header", "true").schema(schema) val Array(rawInput, testDf) = spark.read.option("header", "true").schema(schema)
.csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1) .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0))
.randomSplit(Array(0.7, 0.3), seed = 1)
val classifier = new XGBoostClassifier(xgbParam) val classifier = new XGBoostClassifier(xgbParam)
.setFeaturesCol(featureNames) .setFeaturesCol(featureNames)
@ -122,7 +125,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic", val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
"num_round" -> 10, "num_workers" -> 1) "num_round" -> 10, "num_workers" -> 1)
val Array(rawInput, _) = spark.read.option("header", "true").schema(schema) val Array(rawInput, _) = spark.read.option("header", "true").schema(schema)
.csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1) .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0))
.randomSplit(Array(0.7, 0.3), seed = 1)
val vectorAssembler = new VectorAssembler() val vectorAssembler = new VectorAssembler()
.setHandleInvalid("keep") .setHandleInvalid("keep")
@ -144,7 +148,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
// transform on GPU // transform on GPU
withGpuSparkSession() { spark => withGpuSparkSession() { spark =>
val Array(_, testDf) = spark.read.option("header", "true").schema(schema) val Array(_, testDf) = spark.read.option("header", "true").schema(schema)
.csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1) .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0))
.randomSplit(Array(0.7, 0.3), seed = 1)
// Since CPU model does not know the information about the features cols that GPU transform // Since CPU model does not know the information about the features cols that GPU transform
// pipeline requires. End user needs to setFeaturesCol(features: Array[String]) in the model // pipeline requires. End user needs to setFeaturesCol(features: Array[String]) in the model
@ -174,7 +179,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic", val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
"num_round" -> 10, "num_workers" -> 1) "num_round" -> 10, "num_workers" -> 1)
val Array(rawInput, _) = spark.read.option("header", "true").schema(schema) val Array(rawInput, _) = spark.read.option("header", "true").schema(schema)
.csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1) .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0))
.randomSplit(Array(0.7, 0.3), seed = 1)
val classifier = new XGBoostClassifier(xgbParam) val classifier = new XGBoostClassifier(xgbParam)
.setFeaturesCol(featureNames) .setFeaturesCol(featureNames)
@ -190,7 +196,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
// transform on CPU // transform on CPU
withCpuSparkSession() { spark => withCpuSparkSession() { spark =>
val Array(_, rawInput) = spark.read.option("header", "true").schema(schema) val Array(_, rawInput) = spark.read.option("header", "true").schema(schema)
.csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1) .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0))
.randomSplit(Array(0.7, 0.3), seed = 1)
val featureColName = "feature_col" val featureColName = "feature_col"
val vectorAssembler = new VectorAssembler() val vectorAssembler = new VectorAssembler()

View File

@ -51,13 +51,13 @@ pom_template = """
<dependency> <dependency>
<groupId>com.typesafe.akka</groupId> <groupId>com.typesafe.akka</groupId>
<artifactId>akka-actor_${{scala.binary.version}}</artifactId> <artifactId>akka-actor_${{scala.binary.version}}</artifactId>
<version>2.7.0</version> <version>2.6.20</version>
<scope>compile</scope> <scope>compile</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.typesafe.akka</groupId> <groupId>com.typesafe.akka</groupId>
<artifactId>akka-testkit_${{scala.binary.version}}</artifactId> <artifactId>akka-testkit_${{scala.binary.version}}</artifactId>
<version>2.7.0</version> <version>2.6.20</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency> <dependency>

View File

@ -34,13 +34,13 @@
<dependency> <dependency>
<groupId>com.typesafe.akka</groupId> <groupId>com.typesafe.akka</groupId>
<artifactId>akka-actor_${scala.binary.version}</artifactId> <artifactId>akka-actor_${scala.binary.version}</artifactId>
<version>2.7.0</version> <version>2.6.20</version>
<scope>compile</scope> <scope>compile</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.typesafe.akka</groupId> <groupId>com.typesafe.akka</groupId>
<artifactId>akka-testkit_${scala.binary.version}</artifactId> <artifactId>akka-testkit_${scala.binary.version}</artifactId>
<version>2.7.0</version> <version>2.6.20</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency> <dependency>

View File

@ -1,23 +1,22 @@
/*! /*!
* Copyright by Contributors 2017-2020 * Copyright by Contributors 2017-2020
*/ */
#include <any> // for any
#include <cstddef> #include <cstddef>
#include <limits> #include <limits>
#include <mutex> #include <mutex>
#include "../../src/common/math.h"
#include "../../src/data/adapter.h"
#include "../../src/gbm/gbtree_model.h"
#include "CL/sycl.hpp"
#include "xgboost/base.h" #include "xgboost/base.h"
#include "xgboost/data.h" #include "xgboost/data.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/logging.h"
#include "xgboost/predictor.h" #include "xgboost/predictor.h"
#include "xgboost/tree_model.h" #include "xgboost/tree_model.h"
#include "xgboost/tree_updater.h" #include "xgboost/tree_updater.h"
#include "xgboost/logging.h"
#include "xgboost/host_device_vector.h"
#include "../../src/data/adapter.h"
#include "../../src/common/math.h"
#include "../../src/gbm/gbtree_model.h"
#include "CL/sycl.hpp"
namespace xgboost { namespace xgboost {
namespace predictor { namespace predictor {
@ -396,9 +395,9 @@ class PredictorOneAPI : public Predictor {
out_preds->Size() == dmat->Info().num_row_); out_preds->Size() == dmat->Info().num_row_);
} }
void InplacePredict(dmlc::any const &x, const gbm::GBTreeModel &model, void InplacePredict(std::any const& x, const gbm::GBTreeModel& model, float missing,
float missing, PredictionCacheEntry *out_preds, PredictionCacheEntry* out_preds, uint32_t tree_begin,
uint32_t tree_begin, unsigned tree_end) const override { unsigned tree_end) const override {
cpu_predictor->InplacePredict(x, model, missing, out_preds, tree_begin, tree_end); cpu_predictor->InplacePredict(x, model, missing, out_preds, tree_begin, tree_end);
} }

View File

@ -324,7 +324,7 @@ class EarlyStopping(TrainingCallback):
es = xgboost.callback.EarlyStopping( es = xgboost.callback.EarlyStopping(
rounds=2, rounds=2,
abs_tol=1e-3, min_delta=1e-3,
save_best=True, save_best=True,
maximize=False, maximize=False,
data_name="validation_0", data_name="validation_0",

View File

@ -312,6 +312,19 @@ __model_doc = f"""
needs to be set to have categorical feature support. See :doc:`Categorical Data needs to be set to have categorical feature support. See :doc:`Categorical Data
</tutorials/categorical>` and :ref:`cat-param` for details. </tutorials/categorical>` and :ref:`cat-param` for details.
multi_strategy : Optional[str]
.. versionadded:: 2.0.0
        .. note:: This parameter is a work in progress.
The strategy used for training multi-target models, including multi-target
regression and multi-class classification. See :doc:`/tutorials/multioutput` for
more information.
- ``one_output_per_tree``: One model for each target.
- ``multi_output_tree``: Use multi-target trees.
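To make the two options above concrete, here is a minimal, hedged sketch (not part of this commit) of training a multi-target regressor with the experimental ``multi_output_tree`` strategy; the data shapes and hyperparameters are illustrative assumptions, and ``tree_method="hist"`` is assumed here since the multi-target tree strategy is built on the hist tree method.

            import numpy as np
            import xgboost as xgb

            rng = np.random.default_rng(0)
            X = rng.normal(size=(128, 4))
            y = rng.normal(size=(128, 3))  # three regression targets

            reg = xgb.XGBRegressor(
                tree_method="hist",
                multi_strategy="multi_output_tree",  # one tree emits all three targets
                n_estimators=8,
            )
            reg.fit(X, y)
            assert reg.predict(X).shape == (128, 3)

With ``one_output_per_tree`` the same code would instead build three independent sets of trees, one per target.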
eval_metric : Optional[Union[str, List[str], Callable]] eval_metric : Optional[Union[str, List[str], Callable]]
.. versionadded:: 1.6.0 .. versionadded:: 1.6.0
@ -355,16 +368,19 @@ __model_doc = f"""
.. versionadded:: 1.6.0 .. versionadded:: 1.6.0
Activates early stopping. Validation metric needs to improve at least once in - Activates early stopping. Validation metric needs to improve at least once in
every **early_stopping_rounds** round(s) to continue training. Requires at least every **early_stopping_rounds** round(s) to continue training. Requires at
one item in **eval_set** in :py:meth:`fit`. least one item in **eval_set** in :py:meth:`fit`.
The method returns the model from the last iteration (not the best one). If - The method returns the model from the last iteration, not the best one, use a
there's more than one item in **eval_set**, the last entry will be used for early callback :py:class:`xgboost.callback.EarlyStopping` if returning the best
stopping. If there's more than one metric in **eval_metric**, the last metric model is preferred.
will be used for early stopping.
If early stopping occurs, the model will have three additional fields: - If there's more than one item in **eval_set**, the last entry will be used for
early stopping. If there's more than one metric in **eval_metric**, the last
metric will be used for early stopping.
- If early stopping occurs, the model will have three additional fields:
:py:attr:`best_score`, :py:attr:`best_iteration` and :py:attr:`best_score`, :py:attr:`best_iteration` and
:py:attr:`best_ntree_limit`. :py:attr:`best_ntree_limit`.
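A small, hedged example (assumed data and parameter values, not part of this diff) of the constructor-based early stopping described above:

            import xgboost as xgb
            from sklearn.datasets import make_regression
            from sklearn.model_selection import train_test_split

            X, y = make_regression(n_samples=512, n_features=8, random_state=0)
            X_tr, X_va, y_tr, y_va = train_test_split(X, y, random_state=0)

            reg = xgb.XGBRegressor(
                n_estimators=200,
                eval_metric="rmse",
                early_stopping_rounds=5,  # stop if rmse does not improve for 5 rounds
            )
            reg.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
            print(reg.best_iteration, reg.best_score)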
@ -466,7 +482,9 @@ Parameters
doc.extend([get_doc(i) for i in items]) doc.extend([get_doc(i) for i in items])
if end_note: if end_note:
doc.append(end_note) doc.append(end_note)
full_doc = [header + "\n\n"] full_doc = [
header + "\nSee :doc:`/python/sklearn_estimator` for more information.\n"
]
full_doc.extend(doc) full_doc.extend(doc)
cls.__doc__ = "".join(full_doc) cls.__doc__ = "".join(full_doc)
return cls return cls
@ -624,6 +642,7 @@ class XGBModel(XGBModelBase):
feature_types: Optional[FeatureTypes] = None, feature_types: Optional[FeatureTypes] = None,
max_cat_to_onehot: Optional[int] = None, max_cat_to_onehot: Optional[int] = None,
max_cat_threshold: Optional[int] = None, max_cat_threshold: Optional[int] = None,
multi_strategy: Optional[str] = None,
eval_metric: Optional[Union[str, List[str], Callable]] = None, eval_metric: Optional[Union[str, List[str], Callable]] = None,
early_stopping_rounds: Optional[int] = None, early_stopping_rounds: Optional[int] = None,
callbacks: Optional[List[TrainingCallback]] = None, callbacks: Optional[List[TrainingCallback]] = None,
@ -670,6 +689,7 @@ class XGBModel(XGBModelBase):
self.feature_types = feature_types self.feature_types = feature_types
self.max_cat_to_onehot = max_cat_to_onehot self.max_cat_to_onehot = max_cat_to_onehot
self.max_cat_threshold = max_cat_threshold self.max_cat_threshold = max_cat_threshold
self.multi_strategy = multi_strategy
self.eval_metric = eval_metric self.eval_metric = eval_metric
self.early_stopping_rounds = early_stopping_rounds self.early_stopping_rounds = early_stopping_rounds
self.callbacks = callbacks self.callbacks = callbacks
@ -1131,10 +1151,10 @@ class XGBModel(XGBModelBase):
base_margin: Optional[ArrayLike] = None, base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None, iteration_range: Optional[Tuple[int, int]] = None,
) -> ArrayLike: ) -> ArrayLike:
"""Predict with `X`. If the model is trained with early stopping, then `best_iteration` """Predict with `X`. If the model is trained with early stopping, then
is used automatically. For tree models, when data is on GPU, like cupy array or :py:attr:`best_iteration` is used automatically. For tree models, when data is
cuDF dataframe and `predictor` is not specified, the prediction is run on GPU on GPU, like cupy array or cuDF dataframe and `predictor` is not specified, the
automatically, otherwise it will run on CPU. prediction is run on GPU automatically, otherwise it will run on CPU.
.. note:: This function is only thread safe for `gbtree` and `dart`. .. note:: This function is only thread safe for `gbtree` and `dart`.
@ -1209,8 +1229,8 @@ class XGBModel(XGBModelBase):
ntree_limit: int = 0, ntree_limit: int = 0,
iteration_range: Optional[Tuple[int, int]] = None, iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray: ) -> np.ndarray:
"""Return the predicted leaf every tree for each sample. If the model is trained with """Return the predicted leaf every tree for each sample. If the model is trained
early stopping, then `best_iteration` is used automatically. with early stopping, then :py:attr:`best_iteration` is used automatically.
Parameters Parameters
---------- ----------
@ -1620,7 +1640,9 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
base_margin: Optional[ArrayLike] = None, base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None, iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray: ) -> np.ndarray:
"""Predict the probability of each `X` example being of a given class. """Predict the probability of each `X` example being of a given class. If the
model is trained with early stopping, then :py:attr:`best_iteration` is used
automatically.
.. note:: This function is only thread safe for `gbtree` and `dart`. .. note:: This function is only thread safe for `gbtree` and `dart`.
@ -1646,6 +1668,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
prediction : prediction :
a numpy array of shape array-like of shape (n_samples, n_classes) with the a numpy array of shape array-like of shape (n_samples, n_classes) with the
probability of each data example being of a given class. probability of each data example being of a given class.
""" """
# custom obj: Do nothing as we don't know what to do. # custom obj: Do nothing as we don't know what to do.
# softprob: Do nothing, output is proba. # softprob: Do nothing, output is proba.
@ -2107,11 +2130,13 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
return super().apply(X, ntree_limit, iteration_range) return super().apply(X, ntree_limit, iteration_range)
def score(self, X: ArrayLike, y: ArrayLike) -> float: def score(self, X: ArrayLike, y: ArrayLike) -> float:
"""Evaluate score for data using the last evaluation metric. """Evaluate score for data using the last evaluation metric. If the model is
trained with early stopping, then :py:attr:`best_iteration` is used
automatically.
Parameters Parameters
---------- ----------
X : pd.DataFrame|cudf.DataFrame X : Union[pd.DataFrame, cudf.DataFrame]
Feature matrix. A DataFrame with a special `qid` column. Feature matrix. A DataFrame with a special `qid` column.
y : y :
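As a hedged illustration of the ``score`` method documented above (column names, data, and metric are assumptions, not taken from this commit):

            import numpy as np
            import pandas as pd
            import xgboost as xgb

            rng = np.random.default_rng(0)
            df = pd.DataFrame(rng.normal(size=(100, 3)), columns=["f0", "f1", "f2"])
            df["qid"] = np.repeat(np.arange(10), 10)  # ten queries, ten documents each
            y = rng.integers(0, 4, size=100)          # relevance labels

            ranker = xgb.XGBRanker(n_estimators=4, eval_metric="ndcg")
            ranker.fit(df, y, eval_set=[(df, y)], verbose=False)
            print(ranker.score(df, y))                # evaluated with the last metric, "ndcg"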

View File

@ -10,7 +10,6 @@ import os
import platform import platform
import socket import socket
import sys import sys
import zipfile
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from contextlib import contextmanager from contextlib import contextmanager
from io import StringIO from io import StringIO
@ -28,7 +27,6 @@ from typing import (
TypedDict, TypedDict,
Union, Union,
) )
from urllib import request
import numpy as np import numpy as np
import pytest import pytest
@ -37,6 +35,13 @@ from scipy import sparse
import xgboost as xgb import xgboost as xgb
from xgboost.core import ArrayLike from xgboost.core import ArrayLike
from xgboost.sklearn import SklObjective from xgboost.sklearn import SklObjective
from xgboost.testing.data import (
get_california_housing,
get_cancer,
get_digits,
get_sparse,
memory,
)
hypothesis = pytest.importorskip("hypothesis") hypothesis = pytest.importorskip("hypothesis")
@ -44,13 +49,8 @@ hypothesis = pytest.importorskip("hypothesis")
from hypothesis import strategies from hypothesis import strategies
from hypothesis.extra.numpy import arrays from hypothesis.extra.numpy import arrays
joblib = pytest.importorskip("joblib")
datasets = pytest.importorskip("sklearn.datasets") datasets = pytest.importorskip("sklearn.datasets")
Memory = joblib.Memory
memory = Memory("./cachedir", verbose=0)
PytestSkip = TypedDict("PytestSkip", {"condition": bool, "reason": str}) PytestSkip = TypedDict("PytestSkip", {"condition": bool, "reason": str})
@ -352,137 +352,6 @@ class TestDataset:
return self.name return self.name
@memory.cache
def get_california_housing() -> Tuple[np.ndarray, np.ndarray]:
data = datasets.fetch_california_housing()
return data.data, data.target
@memory.cache
def get_digits() -> Tuple[np.ndarray, np.ndarray]:
data = datasets.load_digits()
return data.data, data.target
@memory.cache
def get_cancer() -> Tuple[np.ndarray, np.ndarray]:
return datasets.load_breast_cancer(return_X_y=True)
@memory.cache
def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
rng = np.random.RandomState(199)
n = 2000
sparsity = 0.75
X, y = datasets.make_regression(n, random_state=rng)
flag = rng.binomial(1, sparsity, X.shape)
for i in range(X.shape[0]):
for j in range(X.shape[1]):
if flag[i, j]:
X[i, j] = np.nan
return X, y
@memory.cache
def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
"""
Number of samples: 1460
Number of features: 20
Number of categorical features: 10
Number of numerical features: 10
"""
from sklearn.datasets import fetch_openml
X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
categorical_columns_subset: List[str] = [
"BldgType", # 5 cats, no nan
"GarageFinish", # 3 cats, nan
"LotConfig", # 5 cats, no nan
"Functional", # 7 cats, no nan
"MasVnrType", # 4 cats, nan
"HouseStyle", # 8 cats, no nan
"FireplaceQu", # 5 cats, nan
"ExterCond", # 5 cats, no nan
"ExterQual", # 4 cats, no nan
"PoolQC", # 3 cats, nan
]
numerical_columns_subset: List[str] = [
"3SsnPorch",
"Fireplaces",
"BsmtHalfBath",
"HalfBath",
"GarageCars",
"TotRmsAbvGrd",
"BsmtFinSF1",
"BsmtFinSF2",
"GrLivArea",
"ScreenPorch",
]
X = X[categorical_columns_subset + numerical_columns_subset]
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
return X, y
@memory.cache
def get_mq2008(
dpath: str,
) -> Tuple[
sparse.csr_matrix,
np.ndarray,
np.ndarray,
sparse.csr_matrix,
np.ndarray,
np.ndarray,
sparse.csr_matrix,
np.ndarray,
np.ndarray,
]:
from sklearn.datasets import load_svmlight_files
src = "https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip"
target = dpath + "/MQ2008.zip"
if not os.path.exists(target):
request.urlretrieve(url=src, filename=target)
with zipfile.ZipFile(target, "r") as f:
f.extractall(path=dpath)
(
x_train,
y_train,
qid_train,
x_test,
y_test,
qid_test,
x_valid,
y_valid,
qid_valid,
) = load_svmlight_files(
(
dpath + "MQ2008/Fold1/train.txt",
dpath + "MQ2008/Fold1/test.txt",
dpath + "MQ2008/Fold1/vali.txt",
),
query_id=True,
zero_based=False,
)
return (
x_train,
y_train,
qid_train,
x_test,
y_test,
qid_test,
x_valid,
y_valid,
qid_valid,
)
# pylint: disable=too-many-arguments,too-many-locals # pylint: disable=too-many-arguments,too-many-locals
@memory.cache @memory.cache
def make_categorical( def make_categorical(
@ -737,20 +606,7 @@ _unweighted_datasets_strategy = strategies.sampled_from(
TestDataset( TestDataset(
"calif_housing-l1", get_california_housing, "reg:absoluteerror", "mae" "calif_housing-l1", get_california_housing, "reg:absoluteerror", "mae"
), ),
TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
TestDataset("cancer", get_cancer, "binary:logistic", "logloss"), TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
TestDataset(
"mtreg",
lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
"reg:squarederror",
"rmse",
),
TestDataset(
"mtreg-l1",
lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
"reg:absoluteerror",
"mae",
),
TestDataset("sparse", get_sparse, "reg:squarederror", "rmse"), TestDataset("sparse", get_sparse, "reg:squarederror", "rmse"),
TestDataset("sparse-l1", get_sparse, "reg:absoluteerror", "mae"), TestDataset("sparse-l1", get_sparse, "reg:absoluteerror", "mae"),
TestDataset( TestDataset(
@ -763,9 +619,17 @@ _unweighted_datasets_strategy = strategies.sampled_from(
) )
def make_datasets_with_margin(
unweighted_strategy: strategies.SearchStrategy,
) -> Callable:
"""Factory function for creating strategies that generates datasets with weight and
base margin.
"""
@strategies.composite @strategies.composite
def _dataset_weight_margin(draw: Callable) -> TestDataset: def weight_margin(draw: Callable) -> TestDataset:
data: TestDataset = draw(_unweighted_datasets_strategy) data: TestDataset = draw(unweighted_strategy)
if draw(strategies.booleans()): if draw(strategies.booleans()):
data.w = draw( data.w = draw(
arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0)) arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0))
@ -790,10 +654,36 @@ def _dataset_weight_margin(draw: Callable) -> TestDataset:
return data return data
return weight_margin
# A strategy for drawing from a set of example datasets
# May add random weights to the dataset # A strategy for drawing from a set of example datasets. May add random weights to the
dataset_strategy = _dataset_weight_margin() # dataset
dataset_strategy = make_datasets_with_margin(_unweighted_datasets_strategy)()
_unweighted_multi_datasets_strategy = strategies.sampled_from(
[
TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
TestDataset(
"mtreg",
lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
"reg:squarederror",
"rmse",
),
TestDataset(
"mtreg-l1",
lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
"reg:absoluteerror",
"mae",
),
]
)
# A strategy for drawing from a set of multi-target/multi-class datasets.
multi_dataset_strategy = make_datasets_with_margin(
_unweighted_multi_datasets_strategy
)()
def non_increasing(L: Sequence[float], tolerance: float = 1e-4) -> bool: def non_increasing(L: Sequence[float], tolerance: float = 1e-4) -> bool:

View File

@ -1,10 +1,20 @@
"""Utilities for data generation.""" """Utilities for data generation."""
from typing import Any, Generator, Tuple, Union import os
import zipfile
from typing import Any, Generator, List, Tuple, Union
from urllib import request
import numpy as np import numpy as np
import pytest
from numpy.random import Generator as RNG
from scipy import sparse
import xgboost
from xgboost.data import pandas_pyarrow_mapper from xgboost.data import pandas_pyarrow_mapper
joblib = pytest.importorskip("joblib")
memory = joblib.Memory("./cachedir", verbose=0)
def np_dtypes( def np_dtypes(
n_samples: int, n_features: int n_samples: int, n_features: int
@ -179,3 +189,154 @@ def pd_arrow_dtypes() -> Generator:
dtype=pd.ArrowDtype(pa.bool_()), dtype=pd.ArrowDtype(pa.bool_()),
) )
yield orig, df yield orig, df
def check_inf(rng: RNG) -> None:
"""Validate there's no inf in X."""
X = rng.random(size=32).reshape(8, 4)
y = rng.random(size=8)
X[5, 2] = np.inf
with pytest.raises(ValueError, match="Input data contains `inf`"):
xgboost.QuantileDMatrix(X, y)
with pytest.raises(ValueError, match="Input data contains `inf`"):
xgboost.DMatrix(X, y)
@memory.cache
def get_california_housing() -> Tuple[np.ndarray, np.ndarray]:
"""Fetch the California housing dataset from sklearn."""
datasets = pytest.importorskip("sklearn.datasets")
data = datasets.fetch_california_housing()
return data.data, data.target
@memory.cache
def get_digits() -> Tuple[np.ndarray, np.ndarray]:
"""Fetch the digits dataset from sklearn."""
datasets = pytest.importorskip("sklearn.datasets")
data = datasets.load_digits()
return data.data, data.target
@memory.cache
def get_cancer() -> Tuple[np.ndarray, np.ndarray]:
"""Fetch the breast cancer dataset from sklearn."""
datasets = pytest.importorskip("sklearn.datasets")
return datasets.load_breast_cancer(return_X_y=True)
@memory.cache
def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
"""Generate a sparse dataset."""
datasets = pytest.importorskip("sklearn.datasets")
rng = np.random.RandomState(199)
n = 2000
sparsity = 0.75
X, y = datasets.make_regression(n, random_state=rng)
flag = rng.binomial(1, sparsity, X.shape)
for i in range(X.shape[0]):
for j in range(X.shape[1]):
if flag[i, j]:
X[i, j] = np.nan
return X, y
@memory.cache
def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
"""
Number of samples: 1460
Number of features: 20
Number of categorical features: 10
Number of numerical features: 10
"""
datasets = pytest.importorskip("sklearn.datasets")
X, y = datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
categorical_columns_subset: List[str] = [
"BldgType", # 5 cats, no nan
"GarageFinish", # 3 cats, nan
"LotConfig", # 5 cats, no nan
"Functional", # 7 cats, no nan
"MasVnrType", # 4 cats, nan
"HouseStyle", # 8 cats, no nan
"FireplaceQu", # 5 cats, nan
"ExterCond", # 5 cats, no nan
"ExterQual", # 4 cats, no nan
"PoolQC", # 3 cats, nan
]
numerical_columns_subset: List[str] = [
"3SsnPorch",
"Fireplaces",
"BsmtHalfBath",
"HalfBath",
"GarageCars",
"TotRmsAbvGrd",
"BsmtFinSF1",
"BsmtFinSF2",
"GrLivArea",
"ScreenPorch",
]
X = X[categorical_columns_subset + numerical_columns_subset]
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
return X, y
@memory.cache
def get_mq2008(
dpath: str,
) -> Tuple[
sparse.csr_matrix,
np.ndarray,
np.ndarray,
sparse.csr_matrix,
np.ndarray,
np.ndarray,
sparse.csr_matrix,
np.ndarray,
np.ndarray,
]:
"""Fetch the mq2008 dataset."""
datasets = pytest.importorskip("sklearn.datasets")
src = "https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip"
target = os.path.join(dpath, "MQ2008.zip")
if not os.path.exists(target):
request.urlretrieve(url=src, filename=target)
with zipfile.ZipFile(target, "r") as f:
f.extractall(path=dpath)
(
x_train,
y_train,
qid_train,
x_test,
y_test,
qid_test,
x_valid,
y_valid,
qid_valid,
) = datasets.load_svmlight_files(
(
os.path.join(dpath, "MQ2008/Fold1/train.txt"),
os.path.join(dpath, "MQ2008/Fold1/test.txt"),
os.path.join(dpath, "MQ2008/Fold1/vali.txt"),
),
query_id=True,
zero_based=False,
)
return (
x_train,
y_train,
qid_train,
x_test,
y_test,
qid_test,
x_valid,
y_valid,
qid_valid,
)

View File

@ -4,8 +4,8 @@ from typing import cast
import pytest import pytest
hypothesis = pytest.importorskip("hypothesis") strategies = pytest.importorskip("hypothesis.strategies")
from hypothesis import strategies # pylint:disable=wrong-import-position
exact_parameter_strategy = strategies.fixed_dictionaries( exact_parameter_strategy = strategies.fixed_dictionaries(
{ {
@ -41,6 +41,26 @@ hist_parameter_strategy = strategies.fixed_dictionaries(
and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide") and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide")
) )
hist_multi_parameter_strategy = strategies.fixed_dictionaries(
{
"max_depth": strategies.integers(1, 11),
"max_leaves": strategies.integers(0, 1024),
"max_bin": strategies.integers(2, 512),
"multi_strategy": strategies.sampled_from(
["multi_output_tree", "one_output_per_tree"]
),
"grow_policy": strategies.sampled_from(["lossguide", "depthwise"]),
"min_child_weight": strategies.floats(0.5, 2.0),
# We cannot enable subsampling as the training loss can increase
# 'subsample': strategies.floats(0.5, 1.0),
"colsample_bytree": strategies.floats(0.5, 1.0),
"colsample_bylevel": strategies.floats(0.5, 1.0),
}
).filter(
lambda x: (cast(int, x["max_depth"]) > 0 or cast(int, x["max_leaves"]) > 0)
and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide")
)
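For context, a hypothetical test (not part of this commit) showing how a strategy dictionary such as ``hist_multi_parameter_strategy`` above is typically consumed; the test name and assertions are assumptions:

            from hypothesis import given, settings

            @given(params=hist_multi_parameter_strategy)  # defined above
            @settings(deadline=None, max_examples=5)
            def test_sampled_params_are_consistent(params: dict) -> None:
                # guaranteed by the .filter(...) applied to the strategy
                assert params["max_depth"] > 0 or params["max_leaves"] > 0
                assert params["multi_strategy"] in {"multi_output_tree", "one_output_per_tree"}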
cat_parameter_strategy = strategies.fixed_dictionaries( cat_parameter_strategy = strategies.fixed_dictionaries(
{ {
"max_cat_to_onehot": strategies.integers(1, 128), "max_cat_to_onehot": strategies.integers(1, 128),

View File

@ -48,7 +48,12 @@ def run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None:
def neg_mse(*args: Any, **kwargs: Any) -> float: def neg_mse(*args: Any, **kwargs: Any) -> float:
return -float(mean_squared_error(*args, **kwargs)) return -float(mean_squared_error(*args, **kwargs))
ranker = xgb.XGBRanker(n_estimators=3, eval_metric=neg_mse, tree_method=tree_method) ranker = xgb.XGBRanker(
n_estimators=3,
eval_metric=neg_mse,
tree_method=tree_method,
disable_default_eval_metric=True,
)
ranker.fit(df, y, eval_set=[(valid_df, y)]) ranker.fit(df, y, eval_set=[(valid_df, y)])
score = ranker.score(valid_df, y) score = ranker.score(valid_df, y)
assert np.isclose(score, ranker.evals_result()["validation_0"]["neg_mse"][-1]) assert np.isclose(score, ranker.evals_result()["validation_0"]["neg_mse"][-1])

View File

@ -55,6 +55,7 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows
*out_dim = 2; *out_dim = 2;
shape.resize(*out_dim); shape.resize(*out_dim);
shape.front() = rows; shape.front() = rows;
// chunksize can be 1 if it's softmax
shape.back() = std::min(groups, chunksize); shape.back() = std::min(groups, chunksize);
} }
break; break;

View File

@ -14,7 +14,7 @@
// clang with libstdc++ works as well // clang with libstdc++ works as well
#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__sun) && !defined(sun) && \ #if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__sun) && !defined(sun) && \
!defined(__APPLE__) && __has_include(<omp.h>) !defined(__APPLE__) && __has_include(<omp.h>) && __has_include(<parallel/algorithm>)
#define GCC_HAS_PARALLEL 1 #define GCC_HAS_PARALLEL 1
#endif // GLIC_VERSION #endif // GLIC_VERSION

View File

@ -121,17 +121,20 @@ namespace dh {
#ifdef XGBOOST_USE_NCCL #ifdef XGBOOST_USE_NCCL
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__) #define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
int line) {
if (code != ncclSuccess) { if (code != ncclSuccess) {
std::stringstream ss; std::stringstream ss;
ss << "NCCL failure :" << ncclGetErrorString(code); ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
ss << " " << file << "(" << line << ")\n";
if (code == ncclUnhandledCudaError) { if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details. // nccl usually preserves the last error so we can get more details.
auto err = cudaPeekAtLastError(); auto err = cudaPeekAtLastError();
ss << " " << thrust::system_error(err, thrust::cuda_category()).what(); ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
} else if (code == ncclSystemError) {
ss << " This might be caused by a network configuration issue. Please consider specifying "
"the network interface for NCCL via environment variables listed in its reference: "
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
} }
ss << " " << file << "(" << line << ")";
LOG(FATAL) << ss.str(); LOG(FATAL) << ss.str();
} }
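The new hint above points users at NCCL's environment variables. A minimal sketch of the most common remedy, set before the distributed job initializes NCCL; the interface name here is only an example and is site-specific:

import os
# Pin NCCL to a specific network interface; replace "eth0" with the interface
# used by your cluster's data plane.
os.environ["NCCL_SOCKET_IFNAME"] = "eth0"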

View File

@ -2,6 +2,9 @@
* Copyright 2017-2023 XGBoost contributors * Copyright 2017-2023 XGBoost contributors
*/ */
#pragma once #pragma once
#if defined(XGBOOST_USE_CUDA)
#include <thrust/binary_search.h> // thrust::upper_bound #include <thrust/binary_search.h> // thrust::upper_bound
#include <thrust/device_malloc_allocator.h> #include <thrust/device_malloc_allocator.h>
#include <thrust/device_ptr.h> #include <thrust/device_ptr.h>
@ -95,20 +98,23 @@ XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) { // NOLINT
} }
namespace dh { namespace dh {
#ifdef XGBOOST_USE_NCCL #ifdef XGBOOST_USE_RCCL
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__) #define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
int line) {
if (code != ncclSuccess) { if (code != ncclSuccess) {
std::stringstream ss; std::stringstream ss;
ss << "NCCL failure :" << ncclGetErrorString(code); ss << "RCCL failure: " << ncclGetErrorString(code) << ".";
ss << " " << file << "(" << line << ")\n";
if (code == ncclUnhandledCudaError) { if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details. // nccl usually preserves the last error so we can get more details.
auto err = hipPeekAtLastError(); auto err = hipPeekAtLastError();
ss << " " << thrust::system_error(err, thrust::hip_category()).what(); ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
} else if (code == ncclSystemError) {
ss << " This might be caused by a network configuration issue. Please consider specifying "
"the network interface for NCCL via environment variables listed in its reference: "
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
} }
ss << " " << file << "(" << line << ")";
LOG(FATAL) << ss.str(); LOG(FATAL) << ss.str();
} }

View File

@ -20,5 +20,9 @@ constexpr StringView GroupSize() {
constexpr StringView LabelScoreSize() { constexpr StringView LabelScoreSize() {
return "The size of label doesn't match the size of prediction."; return "The size of label doesn't match the size of prediction.";
} }
constexpr StringView InfInData() {
return "Input data contains `inf` or a value too large, while `missing` is not set to `inf`";
}
} // namespace xgboost::error } // namespace xgboost::error
#endif // XGBOOST_COMMON_ERROR_MSG_H_ #endif // XGBOOST_COMMON_ERROR_MSG_H_
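A short sketch of the situation the new InfInData message describes, using the Python API: an `inf` entry in the input is rejected unless `missing` is explicitly set to `inf`, in which case the entry is treated as missing. The data below is made up.

import numpy as np
import xgboost as xgb

X = np.array([[1.0, np.inf], [2.0, 3.0]])
y = np.array([0.0, 1.0])

xgb.DMatrix(X, label=y, missing=np.inf)  # inf entries are treated as missing
# xgb.DMatrix(X, label=y)                # would trip the check above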

View File

@ -7,23 +7,22 @@
#ifndef XGBOOST_COMMON_HIST_UTIL_H_ #ifndef XGBOOST_COMMON_HIST_UTIL_H_
#define XGBOOST_COMMON_HIST_UTIL_H_ #define XGBOOST_COMMON_HIST_UTIL_H_
#include <xgboost/data.h>
#include <algorithm> #include <algorithm>
#include <cstdint> // for uint32_t
#include <limits> #include <limits>
#include <map> #include <map>
#include <memory> #include <memory>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "algorithm.h" // SegmentId
#include "categorical.h" #include "categorical.h"
#include "common.h" #include "common.h"
#include "quantile.h" #include "quantile.h"
#include "row_set.h" #include "row_set.h"
#include "threading_utils.h" #include "threading_utils.h"
#include "timer.h" #include "timer.h"
#include "xgboost/base.h" // bst_feature_t, bst_bin_t #include "xgboost/base.h" // for bst_feature_t, bst_bin_t
#include "xgboost/data.h"
namespace xgboost { namespace xgboost {
class GHistIndexMatrix; class GHistIndexMatrix;
@ -392,15 +391,18 @@ class HistCollection {
} }
// have we computed a histogram for i-th node? // have we computed a histogram for i-th node?
bool RowExists(bst_uint nid) const { [[nodiscard]] bool RowExists(bst_uint nid) const {
const uint32_t k_max = std::numeric_limits<uint32_t>::max(); const uint32_t k_max = std::numeric_limits<uint32_t>::max();
return (nid < row_ptr_.size() && row_ptr_[nid] != k_max); return (nid < row_ptr_.size() && row_ptr_[nid] != k_max);
} }
/**
// initialize histogram collection * \brief Initialize histogram collection.
void Init(uint32_t nbins) { *
if (nbins_ != nbins) { * \param n_total_bins Number of bins across all features.
nbins_ = nbins; */
void Init(std::uint32_t n_total_bins) {
if (nbins_ != n_total_bins) {
nbins_ = n_total_bins;
// quite expensive operation, so let's do this only once // quite expensive operation, so let's do this only once
data_.clear(); data_.clear();
} }

View File

@ -333,7 +333,7 @@ size_t constexpr JsonReader::kMaxNumLength;
Json JsonReader::Parse() { Json JsonReader::Parse() {
while (true) { while (true) {
SkipSpaces(); SkipSpaces();
char c = PeekNextChar(); auto c = PeekNextChar();
if (c == -1) { break; } if (c == -1) { break; }
if (c == '{') { if (c == '{') {
@ -408,13 +408,13 @@ void JsonReader::Error(std::string msg) const {
} }
namespace { namespace {
bool IsSpace(char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; } bool IsSpace(JsonReader::Char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; }
} // anonymous namespace } // anonymous namespace
// Json class // Json class
void JsonReader::SkipSpaces() { void JsonReader::SkipSpaces() {
while (cursor_.Pos() < raw_str_.size()) { while (cursor_.Pos() < raw_str_.size()) {
char c = raw_str_[cursor_.Pos()]; Char c = raw_str_[cursor_.Pos()];
if (IsSpace(c)) { if (IsSpace(c)) {
cursor_.Forward(); cursor_.Forward();
} else { } else {
@ -436,12 +436,12 @@ void ParseStr(std::string const& str) {
} }
Json JsonReader::ParseString() { Json JsonReader::ParseString() {
char ch { GetConsecutiveChar('\"') }; // NOLINT Char ch { GetConsecutiveChar('\"') }; // NOLINT
std::string str; std::string str;
while (true) { while (true) {
ch = GetNextChar(); ch = GetNextChar();
if (ch == '\\') { if (ch == '\\') {
char next = static_cast<char>(GetNextChar()); Char next{GetNextChar()};
switch (next) { switch (next) {
case 'r': str += u8"\r"; break; case 'r': str += u8"\r"; break;
case 'n': str += u8"\n"; break; case 'n': str += u8"\n"; break;
@ -466,8 +466,8 @@ Json JsonReader::ParseString() {
} }
Json JsonReader::ParseNull() { Json JsonReader::ParseNull() {
char ch = GetNextNonSpaceChar(); Char ch = GetNextNonSpaceChar();
std::string buffer{ch}; std::string buffer{static_cast<char>(ch)};
for (size_t i = 0; i < 3; ++i) { for (size_t i = 0; i < 3; ++i) {
buffer.push_back(GetNextChar()); buffer.push_back(GetNextChar());
} }
@ -480,7 +480,7 @@ Json JsonReader::ParseNull() {
Json JsonReader::ParseArray() { Json JsonReader::ParseArray() {
std::vector<Json> data; std::vector<Json> data;
char ch { GetConsecutiveChar('[') }; // NOLINT Char ch { GetConsecutiveChar('[') }; // NOLINT
while (true) { while (true) {
if (PeekNextChar() == ']') { if (PeekNextChar() == ']') {
GetConsecutiveChar(']'); GetConsecutiveChar(']');
@ -503,7 +503,7 @@ Json JsonReader::ParseObject() {
Object::Map data; Object::Map data;
SkipSpaces(); SkipSpaces();
char ch = PeekNextChar(); auto ch = PeekNextChar();
if (ch == '}') { if (ch == '}') {
GetConsecutiveChar('}'); GetConsecutiveChar('}');
@ -652,7 +652,7 @@ Json JsonReader::ParseNumber() {
Json JsonReader::ParseBoolean() { Json JsonReader::ParseBoolean() {
bool result = false; bool result = false;
char ch = GetNextNonSpaceChar(); Char ch = GetNextNonSpaceChar();
std::string const t_value = u8"true"; std::string const t_value = u8"true";
std::string const f_value = u8"false"; std::string const f_value = u8"false";
@ -737,7 +737,8 @@ Json UBJReader::ParseArray() {
case 'L': case 'L':
return ParseTypedArray<I64Array>(n); return ParseTypedArray<I64Array>(n);
default: default:
LOG(FATAL) << "`" + std::string{type} + "` is not supported for typed array."; // NOLINT LOG(FATAL) << "`" + std::string{static_cast<char>(type)} + // NOLINT
"` is not supported for typed array.";
} }
} }
std::vector<Json> results; std::vector<Json> results;
@ -794,7 +795,7 @@ Json UBJReader::Load() {
Json UBJReader::Parse() { Json UBJReader::Parse() {
while (true) { while (true) {
char c = PeekNextChar(); auto c = PeekNextChar();
if (c == -1) { if (c == -1) {
break; break;
} }

View File

@ -1,13 +1,15 @@
/*! /**
* Copyright 2022, XGBoost contributors. * Copyright 2022-2023 by XGBoost contributors.
*/ */
#ifndef XGBOOST_COMMON_NUMERIC_H_ #ifndef XGBOOST_COMMON_NUMERIC_H_
#define XGBOOST_COMMON_NUMERIC_H_ #define XGBOOST_COMMON_NUMERIC_H_
#include <dmlc/common.h> // OMPException #include <dmlc/common.h> // OMPException
#include <algorithm> // std::max #include <algorithm> // for std::max
#include <iterator> // std::iterator_traits #include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <iterator> // for iterator_traits
#include <vector> #include <vector>
#include "common.h" // AssertGPUSupport #include "common.h" // AssertGPUSupport
@ -15,8 +17,7 @@
#include "xgboost/context.h" // Context #include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h" // HostDeviceVector #include "xgboost/host_device_vector.h" // HostDeviceVector
namespace xgboost { namespace xgboost::common {
namespace common {
/** /**
* \brief Run length encode on CPU, input must be sorted. * \brief Run length encode on CPU, input must be sorted.
@ -111,11 +112,11 @@ inline double Reduce(Context const*, HostDeviceVector<float> const&) {
namespace cpu_impl { namespace cpu_impl {
template <typename It, typename V = typename It::value_type> template <typename It, typename V = typename It::value_type>
V Reduce(Context const* ctx, It first, It second, V const& init) { V Reduce(Context const* ctx, It first, It second, V const& init) {
size_t n = std::distance(first, second); std::size_t n = std::distance(first, second);
common::MemStackAllocator<V, common::DefaultMaxThreads()> result_tloc(ctx->Threads(), init); auto n_threads = static_cast<std::size_t>(std::min(n, static_cast<std::size_t>(ctx->Threads())));
common::ParallelFor(n, ctx->Threads(), common::MemStackAllocator<V, common::DefaultMaxThreads()> result_tloc(n_threads, init);
[&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; }); common::ParallelFor(n, n_threads, [&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; });
auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + ctx->Threads(), init); auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + n_threads, init);
return result; return result;
} }
} // namespace cpu_impl } // namespace cpu_impl
@ -144,7 +145,6 @@ void Iota(Context const* ctx, It first, It last,
}); });
} }
} }
} // namespace common } // namespace xgboost::common
} // namespace xgboost
#endif // XGBOOST_COMMON_NUMERIC_H_ #endif // XGBOOST_COMMON_NUMERIC_H_

View File

@ -1,5 +1,5 @@
/*! /**
* Copyright 2021-2022 by Contributors * Copyright 2021-2023 by Contributors
* \file row_set.h * \file row_set.h
* \brief Quick Utility to compute subset of rows * \brief Quick Utility to compute subset of rows
* \author Philip Cho, Tianqi Chen * \author Philip Cho, Tianqi Chen
@ -10,6 +10,7 @@
#include <xgboost/data.h> #include <xgboost/data.h>
#include <algorithm> #include <algorithm>
#include <cstddef> // for size_t
#include <limits> #include <limits>
#include <memory> #include <memory>
#include <utility> #include <utility>
@ -21,9 +22,7 @@
#include "xgboost/context.h" #include "xgboost/context.h"
#include "xgboost/tree_model.h" #include "xgboost/tree_model.h"
namespace xgboost { namespace xgboost::common {
namespace common {
// The builder is required for samples partition to left and rights children for set of nodes // The builder is required for samples partition to left and rights children for set of nodes
// Responsible for: // Responsible for:
// 1) Effective memory allocation for intermediate results for multi-thread work // 1) Effective memory allocation for intermediate results for multi-thread work
@ -109,18 +108,17 @@ class PartitionBuilder {
return {nleft_elems, nright_elems}; return {nleft_elems, nright_elems};
} }
template <typename BinIdxType, bool any_missing, bool any_cat> template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
void Partition(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes, void Partition(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
const common::Range1d range, const common::Range1d range, const bst_bin_t split_cond,
const bst_bin_t split_cond, GHistIndexMatrix const& gmat, GHistIndexMatrix const& gmat, const common::ColumnMatrix& column_matrix,
const common::ColumnMatrix& column_matrix,
const RegTree& tree, const size_t* rid) { const RegTree& tree, const size_t* rid) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end()); common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end()); common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end()); common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
std::size_t nid = nodes[node_in_set].nid; std::size_t nid = nodes[node_in_set].nid;
bst_feature_t fid = tree[nid].SplitIndex(); bst_feature_t fid = tree.SplitIndex(nid);
bool default_left = tree[nid].DefaultLeft(); bool default_left = tree.DefaultLeft(nid);
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical; bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
auto node_cats = tree.NodeCats(nid); auto node_cats = tree.NodeCats(nid);
auto const& cut_values = gmat.cut.Values(); auto const& cut_values = gmat.cut.Values();
@ -190,10 +188,10 @@ class PartitionBuilder {
* worker, so we go through all the rows and mark the bit vectors on whether the decision is made * worker, so we go through all the rows and mark the bit vectors on whether the decision is made
* to go right, or if the feature value used for the split is missing. * to go right, or if the feature value used for the split is missing.
*/ */
void MaskRows(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes, template <typename ExpandEntry>
void MaskRows(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
const common::Range1d range, GHistIndexMatrix const& gmat, const common::Range1d range, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix, const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid,
const RegTree& tree, const size_t* rid,
BitVector* decision_bits, BitVector* missing_bits) { BitVector* decision_bits, BitVector* missing_bits) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end()); common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
std::size_t nid = nodes[node_in_set].nid; std::size_t nid = nodes[node_in_set].nid;
@ -228,8 +226,8 @@ class PartitionBuilder {
* @brief Once we've aggregated the decision and missing bits from all the workers, we can then * @brief Once we've aggregated the decision and missing bits from all the workers, we can then
* use them to partition the rows accordingly. * use them to partition the rows accordingly.
*/ */
void PartitionByMask(const size_t node_in_set, template <typename ExpandEntry>
std::vector<xgboost::tree::CPUExpandEntry> const& nodes, void PartitionByMask(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
const common::Range1d range, GHistIndexMatrix const& gmat, const common::Range1d range, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix, const RegTree& tree, const common::ColumnMatrix& column_matrix, const RegTree& tree,
const size_t* rid, BitVector const& decision_bits, const size_t* rid, BitVector const& decision_bits,
@ -293,11 +291,11 @@ class PartitionBuilder {
} }
size_t GetNLeftElems(int nid) const { [[nodiscard]] std::size_t GetNLeftElems(int nid) const {
return left_right_nodes_sizes_[nid].first; return left_right_nodes_sizes_[nid].first;
} }
size_t GetNRightElems(int nid) const { [[nodiscard]] std::size_t GetNRightElems(int nid) const {
return left_right_nodes_sizes_[nid].second; return left_right_nodes_sizes_[nid].second;
} }
@ -349,7 +347,7 @@ class PartitionBuilder {
if (node.node_id < 0) { if (node.node_id < 0) {
return; return;
} }
CHECK(tree[node.node_id].IsLeaf()); CHECK(tree.IsLeaf(node.node_id));
if (node.begin) { // guard for empty node. if (node.begin) { // guard for empty node.
size_t ptr_offset = node.end - p_begin; size_t ptr_offset = node.end - p_begin;
CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id; CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id;
@ -384,8 +382,5 @@ class PartitionBuilder {
std::vector<std::shared_ptr<BlockInfo>> mem_blocks_; std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
size_t max_n_tasks_ = 0; size_t max_n_tasks_ = 0;
}; };
} // namespace xgboost::common
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_PARTITION_BUILDER_H_ #endif // XGBOOST_COMMON_PARTITION_BUILDER_H_

View File

@ -359,6 +359,7 @@ void AddCutPoint(typename SketchType::SummaryContainer const &summary, int max_b
HistogramCuts *cuts) { HistogramCuts *cuts) {
size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin)); size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin));
auto &cut_values = cuts->cut_values_.HostVector(); auto &cut_values = cuts->cut_values_.HostVector();
// we use the min_value as the first (0th) element, hence starting from 1.
for (size_t i = 1; i < required_cuts; ++i) { for (size_t i = 1; i < required_cuts; ++i) {
bst_float cpt = summary.data[i].value; bst_float cpt = summary.data[i].value;
if (i == 1 || cpt > cut_values.back()) { if (i == 1 || cpt > cut_values.back()) {
@ -419,8 +420,8 @@ void SketchContainerImpl<WQSketch>::MakeCuts(HistogramCuts* cuts) {
} else { } else {
AddCutPoint<WQSketch>(a, max_num_bins, cuts); AddCutPoint<WQSketch>(a, max_num_bins, cuts);
// push a value that is greater than anything // push a value that is greater than anything
const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value const bst_float cpt =
: cuts->min_vals_.HostVector()[fid]; (a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid];
// this must be bigger than last value in a scale // this must be bigger than last value in a scale
const bst_float last = cpt + (fabs(cpt) + 1e-5f); const bst_float last = cpt + (fabs(cpt) + 1e-5f);
cuts->cut_values_.HostVector().push_back(last); cuts->cut_values_.HostVector().push_back(last);
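To make the cut-point layout above concrete, here is a hedged Python sketch (a simplification, not the actual implementation): the per-feature minimum conceptually takes the 0th slot, interior quantile values are appended starting from index 1 while skipping duplicates, and a sentinel strictly greater than the last value closes the final bin.

def make_cuts(summary_values, min_value):
    """summary_values: sorted quantile-sketch values for one feature."""
    cuts = []
    for i, v in enumerate(summary_values):
        if i == 0:
            continue  # slot 0 plays the role of min_value, so start from 1
        if i == 1 or v > cuts[-1]:
            cuts.append(v)
    last = summary_values[-1]
    cuts.append(last + abs(last) + 1e-5)  # bigger than anything in the scale
    return min_value, cuts

print(make_cuts([0.1, 0.5, 0.9, 1.3], min_value=0.05))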

View File

@ -352,19 +352,6 @@ struct WQSummary {
prev_rmax = data[i].rmax; prev_rmax = data[i].rmax;
} }
} }
// check consistency of the summary
inline bool Check(const char *msg) const {
const float tol = 10.0f;
for (size_t i = 0; i < this->size; ++i) {
if (data[i].rmin + data[i].wmin > data[i].rmax + tol ||
data[i].rmin < -1e-6f || data[i].rmax < -1e-6f) {
LOG(INFO) << "---------- WQSummary::Check did not pass ----------";
this->Print();
return false;
}
}
return true;
}
}; };
/*! \brief try to do efficient pruning */ /*! \brief try to do efficient pruning */

View File

@ -6,9 +6,7 @@
#include <algorithm> // for copy_n, max, min, none_of, all_of #include <algorithm> // for copy_n, max, min, none_of, all_of
#include <cstddef> // for size_t #include <cstddef> // for size_t
#include <cstdio> // for sscanf #include <cstdio> // for sscanf
#include <exception> // for exception
#include <functional> // for greater #include <functional> // for greater
#include <iterator> // for reverse_iterator
#include <string> // for char_traits, string #include <string> // for char_traits, string
#include "algorithm.h" // for ArgSort #include "algorithm.h" // for ArgSort
@ -18,12 +16,113 @@
#include "xgboost/base.h" // for bst_group_t #include "xgboost/base.h" // for bst_group_t
#include "xgboost/context.h" // for Context #include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo #include "xgboost/data.h" // for MetaInfo
#include "xgboost/linalg.h" // for All, TensorView, Range, Tensor, Vector #include "xgboost/linalg.h" // for All, TensorView, Range
#include "xgboost/logging.h" // for Error, LogCheck_EQ, CHECK_EQ #include "xgboost/logging.h" // for CHECK_EQ
namespace xgboost::ltr { namespace xgboost::ltr {
void RankingCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
if (info.group_ptr_.empty()) {
group_ptr_.Resize(2, 0);
group_ptr_.HostVector()[1] = info.num_row_;
} else {
group_ptr_.HostVector() = info.group_ptr_;
}
auto const& gptr = group_ptr_.ConstHostVector();
for (std::size_t i = 1; i < gptr.size(); ++i) {
std::size_t n = gptr[i] - gptr[i - 1];
max_group_size_ = std::max(max_group_size_, n);
}
double sum_weights = 0;
auto n_groups = Groups();
auto weight = common::MakeOptionalWeights(ctx, info.weights_);
for (bst_omp_uint k = 0; k < n_groups; ++k) {
sum_weights += weight[k];
}
weight_norm_ = static_cast<double>(n_groups) / sum_weights;
}
common::Span<std::size_t const> RankingCache::MakeRankOnCPU(Context const* ctx,
common::Span<float const> predt) {
auto gptr = this->DataGroupPtr(ctx);
auto rank = this->sorted_idx_cache_.HostSpan();
CHECK_EQ(rank.size(), predt.size());
common::ParallelFor(this->Groups(), ctx->Threads(), [&](auto g) {
auto cnt = gptr[g + 1] - gptr[g];
auto g_predt = predt.subspan(gptr[g], cnt);
auto g_rank = rank.subspan(gptr[g], cnt);
auto sorted_idx = common::ArgSort<std::size_t>(
ctx, g_predt.data(), g_predt.data() + g_predt.size(), std::greater<>{});
CHECK_EQ(g_rank.size(), sorted_idx.size());
std::copy_n(sorted_idx.data(), sorted_idx.size(), g_rank.data());
});
return rank;
}
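MakeRankOnCPU above builds, for every query group, the argsort of the predictions in decreasing order. A small NumPy sketch of the same idea; group_ptr and the predictions are made up:

import numpy as np

def make_rank(predt, group_ptr):
    rank = np.empty(len(predt), dtype=np.int64)
    for g in range(len(group_ptr) - 1):
        beg, end = group_ptr[g], group_ptr[g + 1]
        # argsort in decreasing order of prediction within the group
        rank[beg:end] = np.argsort(-np.asarray(predt[beg:end]), kind="stable")
    return rank

print(make_rank([0.2, 0.9, 0.1, 0.5, 0.4], group_ptr=[0, 3, 5]))  # [1 0 2 0 1]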
#if !defined(XGBOOST_USE_CUDA)
void RankingCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const*,
common::Span<float const>) {
common::AssertGPUSupport();
return {};
}
#endif // !defined(XGBOOST_USE_CUDA)
void NDCGCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
auto const h_group_ptr = this->DataGroupPtr(ctx);
discounts_.Resize(MaxGroupSize(), 0);
auto& h_discounts = discounts_.HostVector();
for (std::size_t i = 0; i < MaxGroupSize(); ++i) {
h_discounts[i] = CalcDCGDiscount(i);
}
auto n_groups = h_group_ptr.size() - 1;
auto h_labels = info.labels.HostView().Slice(linalg::All(), 0);
CheckNDCGLabels(this->Param(), h_labels,
[](auto beg, auto end, auto op) { return std::none_of(beg, end, op); });
inv_idcg_.Reshape(n_groups);
auto h_inv_idcg = inv_idcg_.HostView();
std::size_t topk = this->Param().TopK();
auto const exp_gain = this->Param().ndcg_exp_gain;
common::ParallelFor(n_groups, ctx->Threads(), [&](auto g) {
auto g_labels = h_labels.Slice(linalg::Range(h_group_ptr[g], h_group_ptr[g + 1]));
auto sorted_idx = common::ArgSort<std::size_t>(ctx, linalg::cbegin(g_labels),
linalg::cend(g_labels), std::greater<>{});
double idcg{0.0};
for (std::size_t i = 0; i < std::min(g_labels.Size(), topk); ++i) {
if (exp_gain) {
idcg += h_discounts[i] * CalcDCGGain(g_labels(sorted_idx[i]));
} else {
idcg += h_discounts[i] * g_labels(sorted_idx[i]);
}
}
h_inv_idcg(g) = CalcInvIDCG(idcg);
});
}
#if !defined(XGBOOST_USE_CUDA)
void NDCGCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
#endif // !defined(XGBOOST_USE_CUDA)
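The IDCG computation above can be summarized in a few lines of Python; this is a hedged sketch rather than the implementation. Labels are sorted in decreasing order, the (optionally exponential) gain 2^rel - 1 is discounted by 1/log2(i + 2) up to the truncation level, and the result is inverted, with 0 mapped to 0 for groups without relevant documents.

import math

def inv_idcg(labels, topk=None, exp_gain=True):
    k = len(labels) if topk is None else min(topk, len(labels))
    idcg = 0.0
    for i, rel in enumerate(sorted(labels, reverse=True)[:k]):
        gain = (2 ** rel - 1) if exp_gain else rel
        idcg += gain / math.log2(i + 2)
    return 0.0 if idcg == 0.0 else 1.0 / idcg

print(inv_idcg([3, 2, 3, 0, 1, 2], topk=4))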
DMLC_REGISTER_PARAMETER(LambdaRankParam); DMLC_REGISTER_PARAMETER(LambdaRankParam);
void MAPCache::InitOnCPU(Context const*, MetaInfo const& info) {
auto const& h_label = info.labels.HostView().Slice(linalg::All(), 0);
CheckMapLabels(h_label, [](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
}
#if !defined(XGBOOST_USE_CUDA)
void MAPCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
#endif // !defined(XGBOOST_USE_CUDA)
std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus) { std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus) {
std::string out_name; std::string out_name;
if (!param.empty()) { if (!param.empty()) {

212
src/common/ranking_utils.cu Normal file
View File

@ -0,0 +1,212 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
#include <thrust/functional.h> // for maximum
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include <thrust/logical.h> // for none_of, all_of
#include <thrust/pair.h> // for pair, make_pair
#include <thrust/reduce.h> // for reduce
#include <thrust/scan.h> // for inclusive_scan
#include <cstddef> // for size_t
#include "algorithm.cuh" // for SegmentedArgSort
#include "cuda_context.cuh" // for CUDAContext
#include "device_helpers.cuh" // for MakeTransformIterator, LaunchN
#include "optional_weight.h" // for MakeOptionalWeights, OptionalWeights
#include "ranking_utils.cuh" // for ThreadsForMean
#include "ranking_utils.h"
#include "threading_utils.cuh" // for SegmentedTrapezoidThreads
#include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t
#include "xgboost/context.h" // for Context
#include "xgboost/linalg.h" // for VectorView, All, Range
#include "xgboost/logging.h" // for CHECK
#include "xgboost/span.h" // for Span
namespace xgboost::ltr {
namespace cuda_impl {
void CalcQueriesDCG(Context const* ctx, linalg::VectorView<float const> d_labels,
common::Span<std::size_t const> d_sorted_idx, bool exp_gain,
common::Span<bst_group_t const> d_group_ptr, std::size_t k,
linalg::VectorView<double> out_dcg) {
CHECK_EQ(d_group_ptr.size() - 1, out_dcg.Size());
using IdxGroup = thrust::pair<std::size_t, std::size_t>;
auto group_it = dh::MakeTransformIterator<IdxGroup>(
thrust::make_counting_iterator(0ull), [=] XGBOOST_DEVICE(std::size_t idx) {
return thrust::make_pair(idx, dh::SegmentId(d_group_ptr, idx)); // NOLINT
});
auto value_it = dh::MakeTransformIterator<double>(
group_it,
[exp_gain, d_labels, d_group_ptr, k,
d_sorted_idx] XGBOOST_DEVICE(IdxGroup const& l) -> double {
auto g_begin = d_group_ptr[l.second];
auto g_size = d_group_ptr[l.second + 1] - g_begin;
auto idx_in_group = l.first - g_begin;
if (idx_in_group >= k) {
return 0.0;
}
double gain{0.0};
auto g_sorted_idx = d_sorted_idx.subspan(g_begin, g_size);
auto g_labels = d_labels.Slice(linalg::Range(g_begin, g_begin + g_size));
if (exp_gain) {
gain = ltr::CalcDCGGain(g_labels(g_sorted_idx[idx_in_group]));
} else {
gain = g_labels(g_sorted_idx[idx_in_group]);
}
double discount = CalcDCGDiscount(idx_in_group);
return gain * discount;
});
CHECK(out_dcg.Contiguous());
std::size_t bytes;
cub::DeviceSegmentedReduce::Sum(nullptr, bytes, value_it, out_dcg.Values().data(),
d_group_ptr.size() - 1, d_group_ptr.data(),
d_group_ptr.data() + 1, ctx->CUDACtx()->Stream());
dh::TemporaryArray<char> temp(bytes);
cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, value_it, out_dcg.Values().data(),
d_group_ptr.size() - 1, d_group_ptr.data(),
d_group_ptr.data() + 1, ctx->CUDACtx()->Stream());
}
void CalcQueriesInvIDCG(Context const* ctx, linalg::VectorView<float const> d_labels,
common::Span<bst_group_t const> d_group_ptr,
linalg::VectorView<double> out_inv_IDCG, ltr::LambdaRankParam const& p) {
CHECK_GE(d_group_ptr.size(), 2ul);
size_t n_groups = d_group_ptr.size() - 1;
CHECK_EQ(out_inv_IDCG.Size(), n_groups);
dh::device_vector<std::size_t> sorted_idx(d_labels.Size());
auto d_sorted_idx = dh::ToSpan(sorted_idx);
common::SegmentedArgSort<false, true>(ctx, d_labels.Values(), d_group_ptr, d_sorted_idx);
CalcQueriesDCG(ctx, d_labels, d_sorted_idx, p.ndcg_exp_gain, d_group_ptr, p.TopK(), out_inv_IDCG);
dh::LaunchN(out_inv_IDCG.Size(), ctx->CUDACtx()->Stream(),
[out_inv_IDCG] XGBOOST_DEVICE(size_t idx) mutable {
double idcg = out_inv_IDCG(idx);
out_inv_IDCG(idx) = CalcInvIDCG(idcg);
});
}
} // namespace cuda_impl
namespace {
struct CheckNDCGOp {
CUDAContext const* cuctx;
template <typename It, typename Op>
bool operator()(It beg, It end, Op op) {
return thrust::none_of(cuctx->CTP(), beg, end, op);
}
};
struct CheckMAPOp {
CUDAContext const* cuctx;
template <typename It, typename Op>
bool operator()(It beg, It end, Op op) {
return thrust::all_of(cuctx->CTP(), beg, end, op);
}
};
struct ThreadGroupOp {
common::Span<bst_group_t const> d_group_ptr;
std::size_t n_pairs;
common::Span<std::size_t> out_thread_group_ptr;
XGBOOST_DEVICE void operator()(std::size_t i) {
out_thread_group_ptr[i + 1] =
cuda_impl::ThreadsForMean(d_group_ptr[i + 1] - d_group_ptr[i], n_pairs);
}
};
struct GroupSizeOp {
common::Span<bst_group_t const> d_group_ptr;
XGBOOST_DEVICE auto operator()(std::size_t i) -> std::size_t {
return d_group_ptr[i + 1] - d_group_ptr[i];
}
};
struct WeightOp {
common::OptionalWeights d_weight;
XGBOOST_DEVICE auto operator()(std::size_t i) -> double { return d_weight[i]; }
};
} // anonymous namespace
void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
CUDAContext const* cuctx = ctx->CUDACtx();
group_ptr_.SetDevice(ctx->gpu_id);
if (info.group_ptr_.empty()) {
group_ptr_.Resize(2, 0);
group_ptr_.HostVector()[1] = info.num_row_;
} else {
auto const& h_group_ptr = info.group_ptr_;
group_ptr_.Resize(h_group_ptr.size());
auto d_group_ptr = group_ptr_.DeviceSpan();
dh::safe_cuda(cudaMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
cudaMemcpyHostToDevice, cuctx->Stream()));
}
auto d_group_ptr = DataGroupPtr(ctx);
std::size_t n_groups = Groups();
auto it = dh::MakeTransformIterator<std::size_t>(thrust::make_counting_iterator(0ul),
GroupSizeOp{d_group_ptr});
max_group_size_ =
thrust::reduce(cuctx->CTP(), it, it + n_groups, 0ul, thrust::maximum<std::size_t>{});
threads_group_ptr_.SetDevice(ctx->gpu_id);
threads_group_ptr_.Resize(n_groups + 1, 0);
auto d_threads_group_ptr = threads_group_ptr_.DeviceSpan();
if (param_.HasTruncation()) {
n_cuda_threads_ =
common::SegmentedTrapezoidThreads(d_group_ptr, d_threads_group_ptr, Param().NumPair());
} else {
auto n_pairs = Param().NumPair();
dh::LaunchN(n_groups, cuctx->Stream(),
ThreadGroupOp{d_group_ptr, n_pairs, d_threads_group_ptr});
thrust::inclusive_scan(cuctx->CTP(), dh::tcbegin(d_threads_group_ptr),
dh::tcend(d_threads_group_ptr), dh::tbegin(d_threads_group_ptr));
n_cuda_threads_ = info.num_row_ * param_.NumPair();
}
sorted_idx_cache_.SetDevice(ctx->gpu_id);
sorted_idx_cache_.Resize(info.labels.Size(), 0);
auto weight = common::MakeOptionalWeights(ctx, info.weights_);
auto w_it =
dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), WeightOp{weight});
weight_norm_ = static_cast<double>(n_groups) / thrust::reduce(w_it, w_it + n_groups);
}
common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const* ctx,
common::Span<float const> predt) {
auto d_sorted_idx = sorted_idx_cache_.DeviceSpan();
auto d_group_ptr = DataGroupPtr(ctx);
common::SegmentedArgSort<false, true>(ctx, predt, d_group_ptr, d_sorted_idx);
return d_sorted_idx;
}
void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
CUDAContext const* cuctx = ctx->CUDACtx();
auto labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
CheckNDCGLabels(this->Param(), labels, CheckNDCGOp{cuctx});
auto d_group_ptr = this->DataGroupPtr(ctx);
std::size_t n_groups = d_group_ptr.size() - 1;
inv_idcg_ = linalg::Zeros<double>(ctx, n_groups);
auto d_inv_idcg = inv_idcg_.View(ctx->gpu_id);
cuda_impl::CalcQueriesInvIDCG(ctx, labels, d_group_ptr, d_inv_idcg, this->Param());
CHECK_GE(this->Param().NumPair(), 1ul);
discounts_.SetDevice(ctx->gpu_id);
discounts_.Resize(MaxGroupSize());
auto d_discount = discounts_.DeviceSpan();
dh::LaunchN(MaxGroupSize(), cuctx->Stream(),
[=] XGBOOST_DEVICE(std::size_t i) { d_discount[i] = CalcDCGDiscount(i); });
}
void MAPCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
CheckMapLabels(d_label, CheckMAPOp{ctx->CUDACtx()});
}
} // namespace xgboost::ltr

View File

@ -0,0 +1,40 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_RANKING_UTILS_CUH_
#define XGBOOST_COMMON_RANKING_UTILS_CUH_
#include <cstddef> // for size_t
#include "ranking_utils.h" // for LambdaRankParam
#include "xgboost/base.h" // for bst_group_t, XGBOOST_DEVICE
#include "xgboost/context.h" // for Context
#include "xgboost/linalg.h" // for VectorView
#include "xgboost/span.h" // for Span
namespace xgboost {
namespace ltr {
namespace cuda_impl {
void CalcQueriesDCG(Context const *ctx, linalg::VectorView<float const> d_labels,
common::Span<std::size_t const> d_sorted_idx, bool exp_gain,
common::Span<bst_group_t const> d_group_ptr, std::size_t k,
linalg::VectorView<double> out_dcg);
void CalcQueriesInvIDCG(Context const *ctx, linalg::VectorView<float const> d_labels,
common::Span<bst_group_t const> d_group_ptr,
linalg::VectorView<double> out_inv_IDCG, ltr::LambdaRankParam const &p);
// Functions for computing the number of CUDA threads for each group, and for recovering the
// number of pairs back from the number of threads.
XGBOOST_DEVICE __forceinline__ std::size_t ThreadsForMean(std::size_t group_size,
std::size_t n_pairs) {
return group_size * n_pairs;
}
XGBOOST_DEVICE __forceinline__ std::size_t PairsForGroup(std::size_t n_threads,
std::size_t group_size) {
return n_threads / group_size;
}
} // namespace cuda_impl
} // namespace ltr
} // namespace xgboost
#endif // XGBOOST_COMMON_RANKING_UTILS_CUH_
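ThreadsForMean and PairsForGroup are simple bookkeeping: with the mean pair method each document in a group receives n_pairs sampled pairs, so the CUDA thread count for the group is group_size * n_pairs, and the pair count can be recovered from a thread budget by integer division. A tiny sketch:

def threads_for_mean(group_size, n_pairs):
    return group_size * n_pairs

def pairs_for_group(n_threads, group_size):
    return n_threads // group_size

assert pairs_for_group(threads_for_mean(8, 4), 8) == 4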

View File

@ -11,7 +11,6 @@
#include <string> // for char_traits, string #include <string> // for char_traits, string
#include <vector> // for vector #include <vector> // for vector
#include "./math.h" // for CloseTo
#include "dmlc/parameter.h" // for FieldEntry, DMLC_DECLARE_FIELD #include "dmlc/parameter.h" // for FieldEntry, DMLC_DECLARE_FIELD
#include "error_msg.h" // for GroupWeight, GroupSize #include "error_msg.h" // for GroupWeight, GroupSize
#include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t #include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t
@ -19,7 +18,7 @@
#include "xgboost/data.h" // for MetaInfo #include "xgboost/data.h" // for MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector #include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/linalg.h" // for Vector, VectorView, Tensor #include "xgboost/linalg.h" // for Vector, VectorView, Tensor
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK #include "xgboost/logging.h" // for CHECK_EQ, CHECK
#include "xgboost/parameter.h" // for XGBoostParameter #include "xgboost/parameter.h" // for XGBoostParameter
#include "xgboost/span.h" // for Span #include "xgboost/span.h" // for Span
#include "xgboost/string_view.h" // for StringView #include "xgboost/string_view.h" // for StringView
@ -34,6 +33,25 @@ using rel_degree_t = std::uint32_t; // NOLINT
*/ */
using position_t = std::uint32_t; // NOLINT using position_t = std::uint32_t; // NOLINT
/**
* \brief Maximum relevance degree for NDCG
*/
constexpr std::size_t MaxRel() { return sizeof(rel_degree_t) * 8 - 1; }
static_assert(MaxRel() == 31);
XGBOOST_DEVICE inline double CalcDCGGain(rel_degree_t label) {
return static_cast<double>((1u << label) - 1);
}
XGBOOST_DEVICE inline double CalcDCGDiscount(std::size_t idx) {
return 1.0 / std::log2(static_cast<double>(idx) + 2.0);
}
XGBOOST_DEVICE inline double CalcInvIDCG(double idcg) {
auto inv_idcg = (idcg == 0.0 ? 0.0 : (1.0 / idcg)); // handle irrelevant document
return inv_idcg;
}
enum class PairMethod : std::int32_t { enum class PairMethod : std::int32_t {
kTopK = 0, kTopK = 0,
kMean = 1, kMean = 1,
@ -115,7 +133,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
.describe("Number of pairs for each sample in the list."); .describe("Number of pairs for each sample in the list.");
DMLC_DECLARE_FIELD(lambdarank_unbiased) DMLC_DECLARE_FIELD(lambdarank_unbiased)
.set_default(false) .set_default(false)
.describe("Unbiased lambda mart. Use IPW to debias click position"); .describe("Unbiased lambda mart. Use extended IPW to debias click position");
DMLC_DECLARE_FIELD(lambdarank_bias_norm) DMLC_DECLARE_FIELD(lambdarank_bias_norm)
.set_default(2.0) .set_default(2.0)
.set_lower_bound(0.0) .set_lower_bound(0.0)
@ -126,6 +144,285 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
} }
}; };
/**
* \brief Common cached items for ranking tasks.
*/
class RankingCache {
private:
void InitOnCPU(Context const* ctx, MetaInfo const& info);
void InitOnCUDA(Context const* ctx, MetaInfo const& info);
// Cached parameter
LambdaRankParam param_;
// offset to data groups.
HostDeviceVector<bst_group_t> group_ptr_;
// store the sorted index of prediction.
HostDeviceVector<std::size_t> sorted_idx_cache_;
// Maximum size of group
std::size_t max_group_size_{0};
// Normalization for weight
double weight_norm_{1.0};
/**
* CUDA cache
*/
// offset to threads assigned to each group for gradient calculation
HostDeviceVector<std::size_t> threads_group_ptr_;
// Sorted index of label for finding buckets.
HostDeviceVector<std::size_t> y_sorted_idx_cache_;
// Cached labels sorted by the model
HostDeviceVector<float> y_ranked_by_model_;
// store rounding factor for objective for each group
linalg::Vector<GradientPair> roundings_;
// rounding factor for cost
HostDeviceVector<double> cost_rounding_;
// temporary storage for creating rounding factors. Stored as bytes to avoid having a CUDA
// data structure in this header.
HostDeviceVector<std::uint8_t> max_lambdas_;
// total number of cuda threads used for gradient calculation
std::size_t n_cuda_threads_{0};
// Create model rank list on GPU
common::Span<std::size_t const> MakeRankOnCUDA(Context const* ctx,
common::Span<float const> predt);
// Create model rank list on CPU
common::Span<std::size_t const> MakeRankOnCPU(Context const* ctx,
common::Span<float const> predt);
protected:
[[nodiscard]] std::size_t MaxGroupSize() const { return max_group_size_; }
public:
RankingCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p) : param_{p} {
CHECK(param_.GetInitialised());
if (!info.group_ptr_.empty()) {
CHECK_EQ(info.group_ptr_.back(), info.labels.Size())
<< error::GroupSize() << "the size of label.";
}
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
this->InitOnCUDA(ctx, info);
}
if (!info.weights_.Empty()) {
CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight();
}
}
[[nodiscard]] std::size_t MaxPositionSize() const {
// Use truncation level as bound.
if (param_.HasTruncation()) {
return param_.NumPair();
}
// Hardcoded maximum size of positions to track. We don't need too many of them as the
// bias decreases exponentially.
return std::min(max_group_size_, static_cast<std::size_t>(32));
}
// Constructed as [0, n_samples] if a group ptr is not supplied by the user
common::Span<bst_group_t const> DataGroupPtr(Context const* ctx) const {
group_ptr_.SetDevice(ctx->gpu_id);
return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan();
}
[[nodiscard]] auto const& Param() const { return param_; }
[[nodiscard]] std::size_t Groups() const { return group_ptr_.Size() - 1; }
[[nodiscard]] double WeightNorm() const { return weight_norm_; }
// Create a rank list by model prediction
common::Span<std::size_t const> SortedIdx(Context const* ctx, common::Span<float const> predt) {
if (sorted_idx_cache_.Empty()) {
sorted_idx_cache_.SetDevice(ctx->gpu_id);
sorted_idx_cache_.Resize(predt.size());
}
if (ctx->IsCPU()) {
return this->MakeRankOnCPU(ctx, predt);
} else {
return this->MakeRankOnCUDA(ctx, predt);
}
}
// The function simply returns an uninitialized buffer, as this is only used by the
// objective for creating pairs.
common::Span<std::size_t> SortedIdxY(Context const* ctx, std::size_t n_samples) {
CHECK(ctx->IsCUDA());
if (y_sorted_idx_cache_.Empty()) {
y_sorted_idx_cache_.SetDevice(ctx->gpu_id);
y_sorted_idx_cache_.Resize(n_samples);
}
return y_sorted_idx_cache_.DeviceSpan();
}
common::Span<float> RankedY(Context const* ctx, std::size_t n_samples) {
CHECK(ctx->IsCUDA());
if (y_ranked_by_model_.Empty()) {
y_ranked_by_model_.SetDevice(ctx->gpu_id);
y_ranked_by_model_.Resize(n_samples);
}
return y_ranked_by_model_.DeviceSpan();
}
// CUDA cache getters, the cache is shared between metric and objective, some of these
// fields are lazy initialized to avoid unnecessary allocation.
[[nodiscard]] common::Span<std::size_t const> CUDAThreadsGroupPtr() const {
CHECK(!threads_group_ptr_.Empty());
return threads_group_ptr_.ConstDeviceSpan();
}
[[nodiscard]] std::size_t CUDAThreads() const { return n_cuda_threads_; }
linalg::VectorView<GradientPair> CUDARounding(Context const* ctx) {
if (roundings_.Size() == 0) {
roundings_.SetDevice(ctx->gpu_id);
roundings_.Reshape(Groups());
}
return roundings_.View(ctx->gpu_id);
}
common::Span<double> CUDACostRounding(Context const* ctx) {
if (cost_rounding_.Size() == 0) {
cost_rounding_.SetDevice(ctx->gpu_id);
cost_rounding_.Resize(1);
}
return cost_rounding_.DeviceSpan();
}
template <typename Type>
common::Span<Type> MaxLambdas(Context const* ctx, std::size_t n) {
max_lambdas_.SetDevice(ctx->gpu_id);
std::size_t bytes = n * sizeof(Type);
if (bytes != max_lambdas_.Size()) {
max_lambdas_.Resize(bytes);
}
return common::Span<Type>{reinterpret_cast<Type*>(max_lambdas_.DevicePointer()), n};
}
};
class NDCGCache : public RankingCache {
// NDCG discount
HostDeviceVector<double> discounts_;
// 1.0 / IDCG
linalg::Vector<double> inv_idcg_;
/**
* CUDA cache
*/
// store the intermediate DCG calculation result for metric
linalg::Vector<double> dcg_;
public:
void InitOnCPU(Context const* ctx, MetaInfo const& info);
void InitOnCUDA(Context const* ctx, MetaInfo const& info);
public:
NDCGCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
: RankingCache{ctx, info, p} {
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
this->InitOnCUDA(ctx, info);
}
}
linalg::VectorView<double const> InvIDCG(Context const* ctx) const {
return inv_idcg_.View(ctx->gpu_id);
}
common::Span<double const> Discount(Context const* ctx) const {
return ctx->IsCPU() ? discounts_.ConstHostSpan() : discounts_.ConstDeviceSpan();
}
linalg::VectorView<double> Dcg(Context const* ctx) {
if (dcg_.Size() == 0) {
dcg_.SetDevice(ctx->gpu_id);
dcg_.Reshape(this->Groups());
}
return dcg_.View(ctx->gpu_id);
}
};
/**
* \brief Validate label for NDCG
*
* \tparam NoneOf Implementation of std::none_of. Specified as a parameter to reuse the
* check for both CPU and GPU.
*/
template <typename NoneOf>
void CheckNDCGLabels(ltr::LambdaRankParam const& p, linalg::VectorView<float const> labels,
NoneOf none_of) {
auto d_labels = labels.Values();
if (p.ndcg_exp_gain) {
auto label_is_integer =
none_of(d_labels.data(), d_labels.data() + d_labels.size(), [] XGBOOST_DEVICE(float v) {
auto l = std::floor(v);
return std::fabs(l - v) > kRtEps || v < 0.0f;
});
CHECK(label_is_integer)
<< "When using relevance degree as target, label must be either 0 or positive integer.";
}
if (p.ndcg_exp_gain) {
auto label_is_valid = none_of(d_labels.data(), d_labels.data() + d_labels.size(),
[] XGBOOST_DEVICE(ltr::rel_degree_t v) { return v > MaxRel(); });
CHECK(label_is_valid) << "Relevance degrees must be less than or equal to " << MaxRel()
<< " when the exponential NDCG gain function is used. "
<< "Set `ndcg_exp_gain` to false to use custom DCG gain.";
}
}
template <typename AllOf>
bool IsBinaryRel(linalg::VectorView<float const> label, AllOf all_of) {
auto s_label = label.Values();
return all_of(s_label.data(), s_label.data() + s_label.size(), [] XGBOOST_DEVICE(float y) {
return std::abs(y - 1.0f) < kRtEps || std::abs(y - 0.0f) < kRtEps;
});
}
/**
* \brief Validate label for MAP
*
* \tparam AllOf Implementation of std::all_of. Specified as a parameter to reuse the check for
* both CPU and GPU.
*/
template <typename AllOf>
void CheckMapLabels(linalg::VectorView<float const> label, AllOf all_of) {
auto s_label = label.Values();
auto is_binary = IsBinaryRel(label, all_of);
CHECK(is_binary) << "MAP can only be used with binary labels.";
}
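A hedged summary of the two label checks above, in plain Python: exponential-gain NDCG requires non-negative integer relevance degrees no larger than MaxRel() (31), while MAP accepts only binary relevance. The helper names below are made up.

def ndcg_labels_ok(labels, exp_gain=True, max_rel=31):
    if not exp_gain:
        return True
    return all(float(v).is_integer() and 0 <= v <= max_rel for v in labels)

def map_labels_ok(labels):
    return all(v in (0.0, 1.0) for v in labels)

assert ndcg_labels_ok([0, 1, 3, 2])
assert not ndcg_labels_ok([0.5, 1.0])
assert map_labels_ok([0.0, 1.0, 1.0])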
class MAPCache : public RankingCache {
// Total number of relevant documents for each group
HostDeviceVector<double> n_rel_;
// \sum l_k/k
HostDeviceVector<double> acc_;
HostDeviceVector<double> map_;
// Number of samples in this dataset.
std::size_t n_samples_{0};
void InitOnCPU(Context const* ctx, MetaInfo const& info);
void InitOnCUDA(Context const* ctx, MetaInfo const& info);
public:
MAPCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
: RankingCache{ctx, info, p}, n_samples_{static_cast<std::size_t>(info.num_row_)} {
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
this->InitOnCUDA(ctx, info);
}
}
common::Span<double> NumRelevant(Context const* ctx) {
if (n_rel_.Empty()) {
n_rel_.SetDevice(ctx->gpu_id);
n_rel_.Resize(n_samples_);
}
return ctx->IsCPU() ? n_rel_.HostSpan() : n_rel_.DeviceSpan();
}
common::Span<double> Acc(Context const* ctx) {
if (acc_.Empty()) {
acc_.SetDevice(ctx->gpu_id);
acc_.Resize(n_samples_);
}
return ctx->IsCPU() ? acc_.HostSpan() : acc_.DeviceSpan();
}
common::Span<double> Map(Context const* ctx) {
if (map_.Empty()) {
map_.SetDevice(ctx->gpu_id);
map_.Resize(this->Groups());
}
return ctx->IsCPU() ? map_.HostSpan() : map_.DeviceSpan();
}
};
/** /**
* \brief Parse name for ranking metric given parameters. * \brief Parse name for ranking metric given parameters.
* *

View File

@ -8,9 +8,11 @@
#include <dmlc/omp.h> #include <dmlc/omp.h>
#include <algorithm> #include <algorithm>
#include <cstdint> // std::int32_t #include <cstdint> // for int32_t
#include <cstdlib> // for malloc, free
#include <limits> #include <limits>
#include <type_traits> // std::is_signed #include <new> // for bad_alloc
#include <type_traits> // for is_signed
#include <vector> #include <vector>
#include "xgboost/logging.h" #include "xgboost/logging.h"
@ -266,7 +268,7 @@ class MemStackAllocator {
if (MaxStackSize >= required_size_) { if (MaxStackSize >= required_size_) {
ptr_ = stack_mem_; ptr_ = stack_mem_;
} else { } else {
ptr_ = reinterpret_cast<T*>(malloc(required_size_ * sizeof(T))); ptr_ = reinterpret_cast<T*>(std::malloc(required_size_ * sizeof(T)));
} }
if (!ptr_) { if (!ptr_) {
throw std::bad_alloc{}; throw std::bad_alloc{};
@ -278,7 +280,7 @@ class MemStackAllocator {
~MemStackAllocator() { ~MemStackAllocator() {
if (required_size_ > MaxStackSize) { if (required_size_ > MaxStackSize) {
free(ptr_); std::free(ptr_);
} }
} }
T& operator[](size_t i) { return ptr_[i]; } T& operator[](size_t i) { return ptr_[i]; }

View File

@ -10,13 +10,16 @@
#include <cstring> #include <cstring>
#include "../collective/communicator-inl.h" #include "../collective/communicator-inl.h"
#include "../common/algorithm.h" // StableSort #include "../collective/communicator.h"
#include "../common/api_entry.h" // XGBAPIThreadLocalEntry #include "../common/common.h"
#include "../common/algorithm.h" // for StableSort
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../common/error_msg.h" // for InfInData
#include "../common/group_data.h" #include "../common/group_data.h"
#include "../common/io.h" #include "../common/io.h"
#include "../common/linalg_op.h" #include "../common/linalg_op.h"
#include "../common/math.h" #include "../common/math.h"
#include "../common/numeric.h" // Iota #include "../common/numeric.h" // for Iota
#include "../common/threading_utils.h" #include "../common/threading_utils.h"
#include "../common/version.h" #include "../common/version.h"
#include "../data/adapter.h" #include "../data/adapter.h"
@ -700,6 +703,14 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
} }
} }
void MetaInfo::SynchronizeNumberOfColumns() {
if (collective::IsFederated() && data_split_mode == DataSplitMode::kCol) {
collective::Allreduce<collective::Operation::kSum>(&num_col_, 1);
} else {
collective::Allreduce<collective::Operation::kMax>(&num_col_, 1);
}
}
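SynchronizeNumberOfColumns above chooses the reduction based on how the data is split: under a federated column-wise split each worker holds a different slice of features, so the global width is the sum; otherwise every worker should report the same width and a max() suffices. A toy sketch of that decision (worker counts are made up, and the real code only sums in the federated column-split case):

def synchronize_num_cols(local_cols, column_split):
    # Column split: workers hold disjoint feature slices, so add them up.
    # Row split (or no split): all workers see the same features, take the max.
    return sum(local_cols) if column_split else max(local_cols)

assert synchronize_num_cols([3, 4, 5], column_split=True) == 12
assert synchronize_num_cols([12, 12, 12], column_split=False) == 12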
void MetaInfo::Validate(std::int32_t device) const { void MetaInfo::Validate(std::int32_t device) const {
if (group_ptr_.size() != 0 && weights_.Size() != 0) { if (group_ptr_.size() != 0 && weights_.Size() != 0) {
CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) CHECK_EQ(group_ptr_.size(), weights_.Size() + 1)
@ -867,7 +878,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, file_format.c_str())); dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, file_format.c_str()));
data::FileAdapter adapter(parser.get()); data::FileAdapter adapter(parser.get());
dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(), dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
cache_file); cache_file, data_split_mode);
} else { } else {
data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart), data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart),
file_format}; file_format};
@ -903,11 +914,6 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
LOG(FATAL) << "Encountered parser error:\n" << e.what(); LOG(FATAL) << "Encountered parser error:\n" << e.what();
} }
/* sync up number of features after matrix loaded.
* partitioned data will fail the train/val validation check
* since partitioned data not knowing the real number of features. */
collective::Allreduce<collective::Operation::kMax>(&dmat->Info().num_col_, 1);
if (need_split && data_split_mode == DataSplitMode::kCol) { if (need_split && data_split_mode == DataSplitMode::kCol) {
if (!cache_file.empty()) { if (!cache_file.empty()) {
LOG(FATAL) << "Column-wise data split is not support for external memory."; LOG(FATAL) << "Column-wise data split is not support for external memory.";
@ -917,7 +923,6 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
delete dmat; delete dmat;
return sliced; return sliced;
} else { } else {
dmat->Info().data_split_mode = data_split_mode;
return dmat; return dmat;
} }
} }
@ -954,39 +959,49 @@ template DMatrix *DMatrix::Create<DataIterHandle, DMatrixHandle,
XGDMatrixCallbackNext *next, float missing, int32_t n_threads, std::string); XGDMatrixCallbackNext *next, float missing, int32_t n_threads, std::string);
template <typename AdapterT> template <typename AdapterT>
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, const std::string&) { DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, const std::string&,
return new data::SimpleDMatrix(adapter, missing, nthread); DataSplitMode data_split_mode) {
return new data::SimpleDMatrix(adapter, missing, nthread, data_split_mode);
} }
template DMatrix* DMatrix::Create<data::DenseAdapter>(data::DenseAdapter* adapter, float missing, template DMatrix* DMatrix::Create<data::DenseAdapter>(data::DenseAdapter* adapter, float missing,
std::int32_t nthread, std::int32_t nthread,
const std::string& cache_prefix); const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::ArrayAdapter>(data::ArrayAdapter* adapter, float missing, template DMatrix* DMatrix::Create<data::ArrayAdapter>(data::ArrayAdapter* adapter, float missing,
std::int32_t nthread, std::int32_t nthread,
const std::string& cache_prefix); const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::CSRAdapter>(data::CSRAdapter* adapter, float missing, template DMatrix* DMatrix::Create<data::CSRAdapter>(data::CSRAdapter* adapter, float missing,
std::int32_t nthread, std::int32_t nthread,
const std::string& cache_prefix); const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::CSCAdapter>(data::CSCAdapter* adapter, float missing, template DMatrix* DMatrix::Create<data::CSCAdapter>(data::CSCAdapter* adapter, float missing,
std::int32_t nthread, std::int32_t nthread,
const std::string& cache_prefix); const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::DataTableAdapter>(data::DataTableAdapter* adapter, template DMatrix* DMatrix::Create<data::DataTableAdapter>(data::DataTableAdapter* adapter,
float missing, std::int32_t nthread, float missing, std::int32_t nthread,
const std::string& cache_prefix); const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::FileAdapter>(data::FileAdapter* adapter, float missing, template DMatrix* DMatrix::Create<data::FileAdapter>(data::FileAdapter* adapter, float missing,
std::int32_t nthread, std::int32_t nthread,
const std::string& cache_prefix); const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::CSRArrayAdapter>(data::CSRArrayAdapter* adapter, template DMatrix* DMatrix::Create<data::CSRArrayAdapter>(data::CSRArrayAdapter* adapter,
float missing, std::int32_t nthread, float missing, std::int32_t nthread,
const std::string& cache_prefix); const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::CSCArrayAdapter>(data::CSCArrayAdapter* adapter, template DMatrix* DMatrix::Create<data::CSCArrayAdapter>(data::CSCArrayAdapter* adapter,
float missing, std::int32_t nthread, float missing, std::int32_t nthread,
const std::string& cache_prefix); const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create( template DMatrix* DMatrix::Create(
data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter, data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
float missing, int nthread, const std::string& cache_prefix); float missing, int nthread, const std::string& cache_prefix, DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::RecordBatchesIterAdapter>( template DMatrix* DMatrix::Create<data::RecordBatchesIterAdapter>(
data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&); data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&,
DataSplitMode data_split_mode);
SparsePage SparsePage::GetTranspose(int num_columns, int32_t n_threads) const { SparsePage SparsePage::GetTranspose(int num_columns, int32_t n_threads) const {
SparsePage transpose; SparsePage transpose;
@ -1048,6 +1063,13 @@ void SparsePage::SortIndices(int32_t n_threads) {
}); });
} }
void SparsePage::Reindex(uint64_t feature_offset, int32_t n_threads) {
auto& h_data = this->data.HostVector();
common::ParallelFor(h_data.size(), n_threads, [&](auto i) {
h_data[i].index += feature_offset;
});
}
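Reindex above shifts every entry's feature index by a fixed offset, which is what lets column slices from different workers be concatenated without index collisions. A hedged sketch with made-up entries:

def reindex(entries, feature_offset):
    # entries: list of (feature_index, value) pairs from one worker's slice
    return [(idx + feature_offset, value) for idx, value in entries]

assert reindex([(0, 1.5), (2, 0.3)], feature_offset=10) == [(10, 1.5), (12, 0.3)]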
void SparsePage::SortRows(int32_t n_threads) { void SparsePage::SortRows(int32_t n_threads) {
auto& h_offset = this->offset.HostVector(); auto& h_offset = this->offset.HostVector();
auto& h_data = this->data.HostVector(); auto& h_data = this->data.HostVector();
@ -1144,7 +1166,7 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
}); });
} }
exec.Rethrow(); exec.Rethrow();
CHECK(valid) << "Input data contains `inf` or `nan`"; CHECK(valid) << error::InfInData();
for (const auto & max : max_columns_vector) { for (const auto & max : max_columns_vector) {
max_columns = std::max(max_columns, max[0]); max_columns = std::max(max_columns, max[0]);
} }

View File

@ -208,17 +208,17 @@ void MetaInfo::SetInfoFromCUDA(Context const& ctx, StringView key, Json array) {
template <typename AdapterT> template <typename AdapterT>
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
const std::string& cache_prefix) { const std::string& cache_prefix, DataSplitMode data_split_mode) {
CHECK_EQ(cache_prefix.size(), 0) CHECK_EQ(cache_prefix.size(), 0)
<< "Device memory construction is not currently supported with external " << "Device memory construction is not currently supported with external "
"memory."; "memory.";
return new data::SimpleDMatrix(adapter, missing, nthread); return new data::SimpleDMatrix(adapter, missing, nthread, data_split_mode);
} }
template DMatrix* DMatrix::Create<data::CudfAdapter>( template DMatrix* DMatrix::Create<data::CudfAdapter>(
data::CudfAdapter* adapter, float missing, int nthread, data::CudfAdapter* adapter, float missing, int nthread,
const std::string& cache_prefix); const std::string& cache_prefix, DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::CupyAdapter>( template DMatrix* DMatrix::Create<data::CupyAdapter>(
data::CupyAdapter* adapter, float missing, int nthread, data::CupyAdapter* adapter, float missing, int nthread,
const std::string& cache_prefix); const std::string& cache_prefix, DataSplitMode data_split_mode);
} // namespace xgboost } // namespace xgboost

View File

@ -4,6 +4,9 @@
*/ */
#ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_ #ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_
#define XGBOOST_DATA_DEVICE_ADAPTER_H_ #define XGBOOST_DATA_DEVICE_ADAPTER_H_
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include <thrust/logical.h> // for none_of
#include <cstddef> // for size_t #include <cstddef> // for size_t
#include <limits> #include <limits>
#include <memory> #include <memory>
@ -240,6 +243,20 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
return row_stride; return row_stride;
} }
/**
* \brief Check there's no inf in data.
*/
template <typename AdapterBatchT>
bool HasInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) {
auto counting = thrust::make_counting_iterator(0llu);
auto value_iter = dh::MakeTransformIterator<float>(
counting, [=] XGBOOST_DEVICE(std::size_t idx) { return batch.GetElement(idx).value; });
auto valid =
thrust::none_of(value_iter, value_iter + batch.Size(),
[is_valid] XGBOOST_DEVICE(float v) { return is_valid(v) && std::isinf(v); });
return valid;
}
}; // namespace data }; // namespace data
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_DATA_DEVICE_ADAPTER_H_ #endif // XGBOOST_DATA_DEVICE_ADAPTER_H_
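Note on the new `HasInfInData` helper added above: despite the name, it returns true when none of the non-missing values is infinite (it is the result of `thrust::none_of`), which is why the call sites assert `CHECK(valid) << error::InfInData()`. A minimal host-side sketch of the same invariant, assuming a hypothetical `SimpleBatch` type standing in for the device adapter batch:

```cpp
// Host-side sketch of the invariant enforced by the new device-side check:
// every non-missing value must be finite. SimpleBatch and Element are
// illustrative stand-ins, not part of the XGBoost code base.
#include <cassert>
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

struct Element { std::size_t row, col; float value; };

struct SimpleBatch {
  std::vector<Element> elements;
  std::size_t Size() const { return elements.size(); }
  Element GetElement(std::size_t i) const { return elements[i]; }
};

// Returns true when no valid (non-missing) element is +/-inf, mirroring the
// CHECK(valid) << error::InfInData() pattern used in the diff above.
bool NoInfInDataHost(SimpleBatch const& batch, float missing) {
  for (std::size_t i = 0; i < batch.Size(); ++i) {
    float v = batch.GetElement(i).value;
    bool is_valid = !std::isnan(v) && v != missing;  // simplified IsValidFunctor
    if (is_valid && std::isinf(v)) {
      return false;
    }
  }
  return true;
}

int main() {
  SimpleBatch batch{{{0, 0, 1.0f}, {0, 1, std::numeric_limits<float>::infinity()}}};
  // With missing encoded as NaN, the inf entry counts as valid and must be rejected.
  assert(!NoInfInDataHost(batch, std::numeric_limits<float>::quiet_NaN()));
  return 0;
}
```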

View File

@ -1,5 +1,5 @@
/*! /**
* Copyright 2019-2022 XGBoost contributors * Copyright 2019-2023 by XGBoost contributors
*/ */
#include <thrust/iterator/discard_iterator.h> #include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_output_iterator.h> #include <thrust/iterator/transform_output_iterator.h>
@ -9,7 +9,7 @@
#include "../common/random.h" #include "../common/random.h"
#include "../common/transform_iterator.h" // MakeIndexTransformIter #include "../common/transform_iterator.h" // MakeIndexTransformIter
#include "./ellpack_page.cuh" #include "./ellpack_page.cuh"
#include "device_adapter.cuh" #include "device_adapter.cuh" // for HasInfInData
#include "gradient_index.h" #include "gradient_index.h"
#include "xgboost/data.h" #include "xgboost/data.h"
@ -203,8 +203,7 @@ struct TupleScanOp {
// Here the data is already correctly ordered and simply needs to be compacted // Here the data is already correctly ordered and simply needs to be compacted
// to remove missing data // to remove missing data
template <typename AdapterBatchT> template <typename AdapterBatchT>
void CopyDataToEllpack(const AdapterBatchT &batch, void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType const> feature_types,
common::Span<FeatureType const> feature_types,
EllpackPageImpl* dst, int device_idx, float missing) { EllpackPageImpl* dst, int device_idx, float missing) {
// Some witchcraft happens here // Some witchcraft happens here
// The goal is to copy valid elements out of the input to an ELLPACK matrix // The goal is to copy valid elements out of the input to an ELLPACK matrix
@ -215,6 +214,9 @@ void CopyDataToEllpack(const AdapterBatchT &batch,
// correct output position // correct output position
auto counting = thrust::make_counting_iterator(0llu); auto counting = thrust::make_counting_iterator(0llu);
data::IsValidFunctor is_valid(missing); data::IsValidFunctor is_valid(missing);
bool valid = data::HasInfInData(batch, is_valid);
CHECK(valid) << error::InfInData();
auto key_iter = dh::MakeTransformIterator<size_t>( auto key_iter = dh::MakeTransformIterator<size_t>(
counting, counting,
[=] __device__(size_t idx) { [=] __device__(size_t idx) {
@ -255,9 +257,9 @@ void CopyDataToEllpack(const AdapterBatchT &batch,
cub::DispatchScan<decltype(key_value_index_iter), decltype(out), cub::DispatchScan<decltype(key_value_index_iter), decltype(out),
TupleScanOp<Tuple>, cub::NullType, int64_t>; TupleScanOp<Tuple>, cub::NullType, int64_t>;
#if THRUST_MAJOR_VERSION >= 2 #if THRUST_MAJOR_VERSION >= 2
DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out, dh::safe_cuda(DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out,
TupleScanOp<Tuple>(), cub::NullType(), batch.Size(), TupleScanOp<Tuple>(), cub::NullType(), batch.Size(),
nullptr); nullptr));
#else #else
DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out, DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out,
TupleScanOp<Tuple>(), cub::NullType(), batch.Size(), TupleScanOp<Tuple>(), cub::NullType(), batch.Size(),
@ -265,9 +267,9 @@ void CopyDataToEllpack(const AdapterBatchT &batch,
#endif #endif
dh::TemporaryArray<char> temp_storage(temp_storage_bytes); dh::TemporaryArray<char> temp_storage(temp_storage_bytes);
#if THRUST_MAJOR_VERSION >= 2 #if THRUST_MAJOR_VERSION >= 2
DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes, dh::safe_cuda(DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes,
key_value_index_iter, out, TupleScanOp<Tuple>(), key_value_index_iter, out, TupleScanOp<Tuple>(),
cub::NullType(), batch.Size(), nullptr); cub::NullType(), batch.Size(), nullptr));
#else #else
DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes, DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes,
key_value_index_iter, out, TupleScanOp<Tuple>(), key_value_index_iter, out, TupleScanOp<Tuple>(),

View File

@ -1,21 +1,23 @@
/*! /**
* Copyright 2017-2022 by XGBoost Contributors * Copyright 2017-2023 by XGBoost Contributors
* \brief Data type for fast histogram aggregation. * \brief Data type for fast histogram aggregation.
*/ */
#ifndef XGBOOST_DATA_GRADIENT_INDEX_H_ #ifndef XGBOOST_DATA_GRADIENT_INDEX_H_
#define XGBOOST_DATA_GRADIENT_INDEX_H_ #define XGBOOST_DATA_GRADIENT_INDEX_H_
#include <algorithm> // std::min #include <algorithm> // for min
#include <cinttypes> // std::uint32_t #include <atomic> // for atomic
#include <cstddef> // std::size_t #include <cinttypes> // for uint32_t
#include <cstddef> // for size_t
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "../common/categorical.h" #include "../common/categorical.h"
#include "../common/error_msg.h" // for InfInData
#include "../common/hist_util.h" #include "../common/hist_util.h"
#include "../common/numeric.h" #include "../common/numeric.h"
#include "../common/threading_utils.h" #include "../common/threading_utils.h"
#include "../common/transform_iterator.h" // common::MakeIndexTransformIter #include "../common/transform_iterator.h" // for MakeIndexTransformIter
#include "adapter.h" #include "adapter.h"
#include "proxy_dmatrix.h" #include "proxy_dmatrix.h"
#include "xgboost/base.h" #include "xgboost/base.h"
@ -62,6 +64,7 @@ class GHistIndexMatrix {
BinIdxType* index_data = index_data_span.data(); BinIdxType* index_data = index_data_span.data();
auto const& ptrs = cut.Ptrs(); auto const& ptrs = cut.Ptrs();
auto const& values = cut.Values(); auto const& values = cut.Values();
std::atomic<bool> valid{true};
common::ParallelFor(batch_size, batch_threads, [&](size_t i) { common::ParallelFor(batch_size, batch_threads, [&](size_t i) {
auto line = batch.GetLine(i); auto line = batch.GetLine(i);
size_t ibegin = row_ptr[rbegin + i]; // index of first entry for current block size_t ibegin = row_ptr[rbegin + i]; // index of first entry for current block
@ -70,6 +73,9 @@ class GHistIndexMatrix {
for (size_t j = 0; j < line.Size(); ++j) { for (size_t j = 0; j < line.Size(); ++j) {
data::COOTuple elem = line.GetElement(j); data::COOTuple elem = line.GetElement(j);
if (is_valid(elem)) { if (is_valid(elem)) {
if (XGBOOST_EXPECT((std::isinf(elem.value)), false)) {
valid = false;
}
bst_bin_t bin_idx{-1}; bst_bin_t bin_idx{-1};
if (common::IsCat(ft, elem.column_idx)) { if (common::IsCat(ft, elem.column_idx)) {
bin_idx = cut.SearchCatBin(elem.value, elem.column_idx, ptrs, values); bin_idx = cut.SearchCatBin(elem.value, elem.column_idx, ptrs, values);
@ -82,6 +88,8 @@ class GHistIndexMatrix {
} }
} }
}); });
CHECK(valid) << error::InfInData();
} }
// Gather hit_count from all threads // Gather hit_count from all threads

View File

@ -190,7 +190,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
// From here on Info() has the correct data shape // From here on Info() has the correct data shape
Info().num_row_ = accumulated_rows; Info().num_row_ = accumulated_rows;
Info().num_nonzero_ = nnz; Info().num_nonzero_ = nnz;
collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1); Info().SynchronizeNumberOfColumns();
CHECK(std::none_of(column_sizes.cbegin(), column_sizes.cend(), [&](auto f) { CHECK(std::none_of(column_sizes.cbegin(), column_sizes.cend(), [&](auto f) {
return f > accumulated_rows; return f > accumulated_rows;
})) << "Something went wrong during iteration."; })) << "Something went wrong during iteration.";
@ -257,6 +257,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
} }
iter.Reset(); iter.Reset();
CHECK_EQ(rbegin, Info().num_row_); CHECK_EQ(rbegin, Info().num_row_);
CHECK_EQ(this->ghist_->Features(), Info().num_col_);
/** /**
* Generate column matrix * Generate column matrix

View File

@ -195,7 +195,7 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
iter.Reset(); iter.Reset();
// Synchronise worker columns // Synchronise worker columns
collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1); info_.SynchronizeNumberOfColumns();
} }
BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(BatchParam const& param) { BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(BatchParam const& param) {

View File

@ -1,27 +1,24 @@
/*! /**
* Copyright 2021 XGBoost contributors * Copyright 2021-2023 XGBoost contributors
*/ */
#include <any> // for any, any_cast
#include "device_adapter.cuh" #include "device_adapter.cuh"
#include "proxy_dmatrix.h" #include "proxy_dmatrix.h"
namespace xgboost { namespace xgboost::data {
namespace data {
template <typename Fn> template <typename Fn>
decltype(auto) Dispatch(DMatrixProxy const* proxy, Fn fn) { decltype(auto) Dispatch(DMatrixProxy const* proxy, Fn fn) {
if (proxy->Adapter().type() == typeid(std::shared_ptr<CupyAdapter>)) { if (proxy->Adapter().type() == typeid(std::shared_ptr<CupyAdapter>)) {
auto value = dmlc::get<std::shared_ptr<CupyAdapter>>( auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
proxy->Adapter())->Value();
return fn(value); return fn(value);
} else if (proxy->Adapter().type() == typeid(std::shared_ptr<CudfAdapter>)) { } else if (proxy->Adapter().type() == typeid(std::shared_ptr<CudfAdapter>)) {
auto value = dmlc::get<std::shared_ptr<CudfAdapter>>( auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
proxy->Adapter())->Value();
return fn(value); return fn(value);
} else { } else {
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name(); LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
auto value = dmlc::get<std::shared_ptr<CudfAdapter>>( auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
proxy->Adapter())->Value();
return fn(value); return fn(value);
} }
} }
} // namespace data } // namespace xgboost::data
} // namespace xgboost

View File

@ -1,11 +1,10 @@
/*! /**
* Copyright 2020-2022, XGBoost contributors * Copyright 2020-2023, XGBoost contributors
*/ */
#ifndef XGBOOST_DATA_PROXY_DMATRIX_H_ #ifndef XGBOOST_DATA_PROXY_DMATRIX_H_
#define XGBOOST_DATA_PROXY_DMATRIX_H_ #define XGBOOST_DATA_PROXY_DMATRIX_H_
#include <dmlc/any.h> #include <any> // for any, any_cast
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility> #include <utility>
@ -15,8 +14,7 @@
#include "xgboost/context.h" #include "xgboost/context.h"
#include "xgboost/data.h" #include "xgboost/data.h"
namespace xgboost { namespace xgboost::data {
namespace data {
/* /*
* \brief A proxy to external iterator. * \brief A proxy to external iterator.
*/ */
@ -44,7 +42,7 @@ class DataIterProxy {
*/ */
class DMatrixProxy : public DMatrix { class DMatrixProxy : public DMatrix {
MetaInfo info_; MetaInfo info_;
dmlc::any batch_; std::any batch_;
Context ctx_; Context ctx_;
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
@ -115,9 +113,7 @@ class DMatrixProxy : public DMatrix {
LOG(FATAL) << "Not implemented."; LOG(FATAL) << "Not implemented.";
return BatchSet<ExtSparsePage>(BatchIterator<ExtSparsePage>(nullptr)); return BatchSet<ExtSparsePage>(BatchIterator<ExtSparsePage>(nullptr));
} }
dmlc::any Adapter() const { std::any Adapter() const { return batch_; }
return batch_;
}
}; };
inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) { inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
@ -131,15 +127,13 @@ inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
template <typename Fn> template <typename Fn>
decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_error = nullptr) { decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_error = nullptr) {
if (proxy->Adapter().type() == typeid(std::shared_ptr<CSRArrayAdapter>)) { if (proxy->Adapter().type() == typeid(std::shared_ptr<CSRArrayAdapter>)) {
auto value = auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
dmlc::get<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
if (type_error) { if (type_error) {
*type_error = false; *type_error = false;
} }
return fn(value); return fn(value);
} else if (proxy->Adapter().type() == typeid(std::shared_ptr<ArrayAdapter>)) { } else if (proxy->Adapter().type() == typeid(std::shared_ptr<ArrayAdapter>)) {
auto value = dmlc::get<std::shared_ptr<ArrayAdapter>>( auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
proxy->Adapter())->Value();
if (type_error) { if (type_error) {
*type_error = false; *type_error = false;
} }
@ -154,6 +148,5 @@ decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_
decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>(); decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
} }
} }
} // namespace data } // namespace xgboost::data
} // namespace xgboost
#endif // XGBOOST_DATA_PROXY_DMATRIX_H_ #endif // XGBOOST_DATA_PROXY_DMATRIX_H_

View File

@ -73,6 +73,19 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
return out; return out;
} }
void SimpleDMatrix::ReindexFeatures() {
if (collective::IsFederated() && info_.data_split_mode == DataSplitMode::kCol) {
std::vector<uint64_t> buffer(collective::GetWorldSize());
buffer[collective::GetRank()] = info_.num_col_;
collective::Allgather(buffer.data(), buffer.size() * sizeof(uint64_t));
auto offset = std::accumulate(buffer.cbegin(), buffer.cbegin() + collective::GetRank(), 0);
if (offset == 0) {
return;
}
sparse_page_->Reindex(offset, ctx_.Threads());
}
}
BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() { BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
// since csr is the default data structure so `source_` is always available. // since csr is the default data structure so `source_` is always available.
auto begin_iter = BatchIterator<SparsePage>( auto begin_iter = BatchIterator<SparsePage>(
@ -151,7 +164,8 @@ BatchSet<ExtSparsePage> SimpleDMatrix::GetExtBatches(BatchParam const&) {
} }
template <typename AdapterT> template <typename AdapterT>
SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) { SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
DataSplitMode data_split_mode) {
this->ctx_.nthread = nthread; this->ctx_.nthread = nthread;
std::vector<uint64_t> qids; std::vector<uint64_t> qids;
@ -217,7 +231,9 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
// Synchronise worker columns // Synchronise worker columns
collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1); info_.data_split_mode = data_split_mode;
ReindexFeatures();
info_.SynchronizeNumberOfColumns();
if (adapter->NumRows() == kAdapterUnknownSize) { if (adapter->NumRows() == kAdapterUnknownSize) {
using IteratorAdapterT using IteratorAdapterT
@ -272,21 +288,30 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
fo->Write(sparse_page_->data.HostVector()); fo->Write(sparse_page_->data.HostVector());
} }
template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing, int nthread); template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing, int nthread,
template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing, int nthread); DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing, int nthread); template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing, int nthread,
template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing, int nthread); DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(CSCArrayAdapter* adapter, float missing, int nthread); template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing, int nthread,
template SimpleDMatrix::SimpleDMatrix(CSCAdapter* adapter, float missing, int nthread); DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(DataTableAdapter* adapter, float missing, int nthread); template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing, int nthread,
template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int nthread); DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(CSCArrayAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(CSCAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(DataTableAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix( template SimpleDMatrix::SimpleDMatrix(
IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR> IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>
*adapter, *adapter,
float missing, int nthread); float missing, int nthread, DataSplitMode data_split_mode);
template <> template <>
SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread) { SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode) {
ctx_.nthread = nthread; ctx_.nthread = nthread;
auto& offset_vec = sparse_page_->offset.HostVector(); auto& offset_vec = sparse_page_->offset.HostVector();
@ -346,7 +371,10 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
} }
// Synchronise worker columns // Synchronise worker columns
info_.num_col_ = adapter->NumColumns(); info_.num_col_ = adapter->NumColumns();
collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1); info_.data_split_mode = data_split_mode;
ReindexFeatures();
info_.SynchronizeNumberOfColumns();
info_.num_row_ = total_batch_size; info_.num_row_ = total_batch_size;
info_.num_nonzero_ = data_vec.size(); info_.num_nonzero_ = data_vec.size();
CHECK_EQ(offset_vec.back(), info_.num_nonzero_); CHECK_EQ(offset_vec.back(), info_.num_nonzero_);

View File

@ -15,7 +15,10 @@ namespace data {
// Current implementation assumes a single batch. More batches can // Current implementation assumes a single batch. More batches can
// be supported in future. Does not currently support inferring row/column size // be supported in future. Does not currently support inferring row/column size
template <typename AdapterT> template <typename AdapterT>
SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread*/) { SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread*/,
DataSplitMode data_split_mode) {
CHECK(data_split_mode != DataSplitMode::kCol)
<< "Column-wise data split is currently not supported on the GPU.";
auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice() auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice()
: adapter->DeviceIdx(); : adapter->DeviceIdx();
CHECK_GE(device, 0); CHECK_GE(device, 0);
@ -40,12 +43,13 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread
info_.num_col_ = adapter->NumColumns(); info_.num_col_ = adapter->NumColumns();
info_.num_row_ = adapter->NumRows(); info_.num_row_ = adapter->NumRows();
// Synchronise worker columns // Synchronise worker columns
collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1); info_.data_split_mode = data_split_mode;
info_.SynchronizeNumberOfColumns();
} }
template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing, template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
int nthread); int nthread, DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing, template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing,
int nthread); int nthread, DataSplitMode data_split_mode);
} // namespace data } // namespace data
} // namespace xgboost } // namespace xgboost

View File

@ -1,14 +1,13 @@
/*! /**
* Copyright 2019-2021 by XGBoost Contributors * Copyright 2019-2023 by XGBoost Contributors
* \file simple_dmatrix.cuh * \file simple_dmatrix.cuh
*/ */
#ifndef XGBOOST_DATA_SIMPLE_DMATRIX_CUH_ #ifndef XGBOOST_DATA_SIMPLE_DMATRIX_CUH_
#define XGBOOST_DATA_SIMPLE_DMATRIX_CUH_ #define XGBOOST_DATA_SIMPLE_DMATRIX_CUH_
#include <thrust/copy.h> #include <thrust/copy.h>
#include <thrust/scan.h>
#include <thrust/execution_policy.h> #include <thrust/execution_policy.h>
#include "device_adapter.cuh" #include <thrust/scan.h>
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
#include "../common/device_helpers.cuh" #include "../common/device_helpers.cuh"
@ -16,8 +15,10 @@
#include "../common/device_helpers.hip.h" #include "../common/device_helpers.hip.h"
#endif #endif
namespace xgboost { #include "../common/error_msg.h" // for InfInData
namespace data { #include "device_adapter.cuh" // for HasInfInData
namespace xgboost::data {
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
template <typename AdapterBatchT> template <typename AdapterBatchT>
@ -94,7 +95,11 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
} }
template <typename AdapterBatchT> template <typename AdapterBatchT>
size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing, SparsePage* page) { size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing,
SparsePage* page) {
bool valid = HasInfInData(batch, IsValidFunctor{missing});
CHECK(valid) << error::InfInData();
page->offset.SetDevice(device); page->offset.SetDevice(device);
page->data.SetDevice(device); page->data.SetDevice(device);
page->offset.Resize(batch.NumRows() + 1); page->offset.Resize(batch.NumRows() + 1);
@ -106,6 +111,5 @@ size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missin
return num_nonzero_; return num_nonzero_;
} }
} // namespace data } // namespace xgboost::data
} // namespace xgboost
#endif // XGBOOST_DATA_SIMPLE_DMATRIX_CUH_ #endif // XGBOOST_DATA_SIMPLE_DMATRIX_CUH_

View File

@ -22,7 +22,8 @@ class SimpleDMatrix : public DMatrix {
public: public:
SimpleDMatrix() = default; SimpleDMatrix() = default;
template <typename AdapterT> template <typename AdapterT>
explicit SimpleDMatrix(AdapterT* adapter, float missing, int nthread); explicit SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
DataSplitMode data_split_mode = DataSplitMode::kRow);
explicit SimpleDMatrix(dmlc::Stream* in_stream); explicit SimpleDMatrix(dmlc::Stream* in_stream);
~SimpleDMatrix() override = default; ~SimpleDMatrix() override = default;
@ -61,6 +62,15 @@ class SimpleDMatrix : public DMatrix {
bool GHistIndexExists() const override { return static_cast<bool>(gradient_index_); } bool GHistIndexExists() const override { return static_cast<bool>(gradient_index_); }
bool SparsePageExists() const override { return true; } bool SparsePageExists() const override { return true; }
/**
* \brief Reindex the features based on a global view.
*
* In some cases (e.g. vertical federated learning), features are loaded locally with indices
* starting from 0. However, all the algorithms assume the features are globally indexed, so we
* reindex the features based on the offset needed to obtain the global view.
*/
void ReindexFeatures();
private: private:
Context ctx_; Context ctx_;
}; };
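The `ReindexFeatures` doc comment and implementation above shift each worker's local feature indices by the total number of columns owned by all lower-ranked workers. A single-process sketch of that arithmetic, with the result of `collective::Allgather` replaced by a pre-filled vector (names are illustrative only):

```cpp
// Sketch of the feature-reindexing arithmetic for column-wise (vertical) splits:
// offset = sum of column counts of workers with a lower rank, then every local
// column index is shifted by that offset, as SparsePage::Reindex does.
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // Column counts as they would be gathered from workers 0..3.
  std::vector<std::uint64_t> cols_per_worker{4, 3, 5, 2};
  int rank = 2;  // this worker

  auto offset = std::accumulate(cols_per_worker.cbegin(),
                                cols_per_worker.cbegin() + rank, std::uint64_t{0});

  // Local feature indices of this worker's entries (0-based locally).
  std::vector<std::uint64_t> local_feature_idx{0, 1, 4, 2};
  for (auto& idx : local_feature_idx) {
    idx += offset;  // global view: 7, 8, 11, 9
  }

  for (auto idx : local_feature_idx) {
    std::cout << idx << " ";
  }
  std::cout << "\n";
  return 0;
}
```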

View File

@ -96,7 +96,7 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
this->info_.num_col_ = n_features; this->info_.num_col_ = n_features;
this->info_.num_nonzero_ = nnz; this->info_.num_nonzero_ = nnz;
collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1); info_.SynchronizeNumberOfColumns();
CHECK_NE(info_.num_col_, 0); CHECK_NE(info_.num_col_, 0);
} }
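The repeated replacement of the `kMax` all-reduce with `info_.SynchronizeNumberOfColumns()` (here and in the simple and iterative DMatrix constructors above) centralises the column-count synchronisation. The body of that function is not part of this diff, so the behaviour sketched below is an assumption based on the call sites: a row-wise split takes the maximum of the per-worker counts, while a column-wise split sums them.

```cpp
// Assumed behaviour of the column-count synchronisation (illustrative only):
// row split    -> every worker already sees all columns, take the max;
// column split -> workers own disjoint column slices, take the sum.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

enum class SplitMode { kRow, kCol };

std::uint64_t SyncColumnsSketch(std::vector<std::uint64_t> const& per_worker, SplitMode mode) {
  if (mode == SplitMode::kCol) {
    return std::accumulate(per_worker.cbegin(), per_worker.cend(), std::uint64_t{0});
  }
  return *std::max_element(per_worker.cbegin(), per_worker.cend());
}

int main() {
  std::vector<std::uint64_t> cols{4, 3, 5};
  std::cout << SyncColumnsSketch(cols, SplitMode::kRow) << "\n";  // 5
  std::cout << SyncColumnsSketch(cols, SplitMode::kCol) << "\n";  // 12
  return 0;
}
```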

View File

@ -10,6 +10,7 @@
#include <dmlc/parameter.h> #include <dmlc/parameter.h>
#include <algorithm> #include <algorithm>
#include <cinttypes> // for uint32_t
#include <limits> #include <limits>
#include <memory> #include <memory>
#include <string> #include <string>
@ -27,9 +28,11 @@
#include "xgboost/host_device_vector.h" #include "xgboost/host_device_vector.h"
#include "xgboost/json.h" #include "xgboost/json.h"
#include "xgboost/logging.h" #include "xgboost/logging.h"
#include "xgboost/model.h"
#include "xgboost/objective.h" #include "xgboost/objective.h"
#include "xgboost/predictor.h" #include "xgboost/predictor.h"
#include "xgboost/string_view.h" #include "xgboost/string_view.h" // for StringView
#include "xgboost/tree_model.h" // for RegTree
#include "xgboost/tree_updater.h" #include "xgboost/tree_updater.h"
namespace xgboost::gbm { namespace xgboost::gbm {
@ -131,6 +134,12 @@ void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
// set, since only experts are expected to do so. // set, since only experts are expected to do so.
return; return;
} }
if (model_.learner_model_param->IsVectorLeaf()) {
CHECK(tparam_.tree_method == TreeMethod::kHist)
<< "Only the hist tree method is supported for building multi-target trees with vector "
"leaf.";
}
// tparam_ is set before calling this function. // tparam_ is set before calling this function.
if (tparam_.tree_method != TreeMethod::kAuto) { if (tparam_.tree_method != TreeMethod::kAuto) {
return; return;
@ -175,12 +184,12 @@ void GBTree::ConfigureUpdaters() {
case TreeMethod::kExact: case TreeMethod::kExact:
tparam_.updater_seq = "grow_colmaker,prune"; tparam_.updater_seq = "grow_colmaker,prune";
break; break;
case TreeMethod::kHist: case TreeMethod::kHist: {
LOG(INFO) << LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater "
"Tree method is selected to be 'hist', which uses a " "grow_quantile_histmaker.";
"single updater grow_quantile_histmaker.";
tparam_.updater_seq = "grow_quantile_histmaker"; tparam_.updater_seq = "grow_quantile_histmaker";
break; break;
}
case TreeMethod::kGPUHist: { case TreeMethod::kGPUHist: {
common::AssertGPUSupport(); common::AssertGPUSupport();
tparam_.updater_seq = "grow_gpu_hist"; tparam_.updater_seq = "grow_gpu_hist";
@ -209,11 +218,9 @@ void CopyGradient(HostDeviceVector<GradientPair> const* in_gpair, int32_t n_thre
GPUCopyGradient(in_gpair, n_groups, group_id, out_gpair); GPUCopyGradient(in_gpair, n_groups, group_id, out_gpair);
} else { } else {
std::vector<GradientPair> &tmp_h = out_gpair->HostVector(); std::vector<GradientPair> &tmp_h = out_gpair->HostVector();
auto nsize = static_cast<bst_omp_uint>(out_gpair->Size());
const auto& gpair_h = in_gpair->ConstHostVector(); const auto& gpair_h = in_gpair->ConstHostVector();
common::ParallelFor(nsize, n_threads, [&](bst_omp_uint i) { common::ParallelFor(out_gpair->Size(), n_threads,
tmp_h[i] = gpair_h[i * n_groups + group_id]; [&](auto i) { tmp_h[i] = gpair_h[i * n_groups + group_id]; });
});
} }
} }
@ -234,6 +241,7 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
CHECK_EQ(model_.param.num_parallel_tree, trees.size()); CHECK_EQ(model_.param.num_parallel_tree, trees.size());
CHECK_EQ(model_.param.num_parallel_tree, 1) CHECK_EQ(model_.param.num_parallel_tree, 1)
<< "Boosting random forest is not supported for current objective."; << "Boosting random forest is not supported for current objective.";
CHECK(!trees.front()->IsMultiTarget()) << "Update tree leaf" << MTNotImplemented();
CHECK_EQ(trees.size(), model_.param.num_parallel_tree); CHECK_EQ(trees.size(), model_.param.num_parallel_tree);
for (std::size_t tree_idx = 0; tree_idx < trees.size(); ++tree_idx) { for (std::size_t tree_idx = 0; tree_idx < trees.size(); ++tree_idx) {
auto const& position = node_position.at(tree_idx); auto const& position = node_position.at(tree_idx);
@ -245,17 +253,18 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair, void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
PredictionCacheEntry* predt, ObjFunction const* obj) { PredictionCacheEntry* predt, ObjFunction const* obj) {
std::vector<std::vector<std::unique_ptr<RegTree>>> new_trees; std::vector<std::vector<std::unique_ptr<RegTree>>> new_trees;
const int ngroup = model_.learner_model_param->num_output_group; const int ngroup = model_.learner_model_param->OutputLength();
ConfigureWithKnownData(this->cfg_, p_fmat); ConfigureWithKnownData(this->cfg_, p_fmat);
monitor_.Start("BoostNewTrees"); monitor_.Start("BoostNewTrees");
// Weird case that tree method is cpu-based but gpu_id is set. Ideally we should let // Weird case that tree method is cpu-based but gpu_id is set. Ideally we should let
// `gpu_id` be the single source of determining what algorithms to run, but that will // `gpu_id` be the single source of determining what algorithms to run, but that will
// break a lots of existing code. // break a lots of existing code.
auto device = tparam_.tree_method != TreeMethod::kGPUHist ? Context::kCpuId : ctx_->gpu_id; auto device = tparam_.tree_method != TreeMethod::kGPUHist ? Context::kCpuId : ctx_->gpu_id;
auto out = linalg::TensorView<float, 2>{ auto out = linalg::MakeTensorView(
device,
device == Context::kCpuId ? predt->predictions.HostSpan() : predt->predictions.DeviceSpan(), device == Context::kCpuId ? predt->predictions.HostSpan() : predt->predictions.DeviceSpan(),
{static_cast<size_t>(p_fmat->Info().num_row_), static_cast<size_t>(ngroup)}, p_fmat->Info().num_row_, model_.learner_model_param->OutputLength());
device};
CHECK_NE(ngroup, 0); CHECK_NE(ngroup, 0);
if (!p_fmat->SingleColBlock() && obj->Task().UpdateTreeLeaf()) { if (!p_fmat->SingleColBlock() && obj->Task().UpdateTreeLeaf()) {
@ -266,7 +275,13 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
// position is negated if the row is sampled out. // position is negated if the row is sampled out.
std::vector<HostDeviceVector<bst_node_t>> node_position; std::vector<HostDeviceVector<bst_node_t>> node_position;
if (ngroup == 1) { if (model_.learner_model_param->IsVectorLeaf()) {
std::vector<std::unique_ptr<RegTree>> ret;
BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret);
UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret);
// No update prediction cache yet.
new_trees.push_back(std::move(ret));
} else if (model_.learner_model_param->OutputLength() == 1) {
std::vector<std::unique_ptr<RegTree>> ret; std::vector<std::unique_ptr<RegTree>> ret;
BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret); BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret);
UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret); UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret);
@ -360,8 +375,8 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
<< "Set `process_type` to `update` if you want to update existing " << "Set `process_type` to `update` if you want to update existing "
"trees."; "trees.";
// create new tree // create new tree
std::unique_ptr<RegTree> ptr(new RegTree()); std::unique_ptr<RegTree> ptr(new RegTree{this->model_.learner_model_param->LeafLength(),
ptr->param.UpdateAllowUnknown(this->cfg_); this->model_.learner_model_param->num_feature});
new_trees.push_back(ptr.get()); new_trees.push_back(ptr.get());
ret->push_back(std::move(ptr)); ret->push_back(std::move(ptr));
} else if (tparam_.process_type == TreeProcessType::kUpdate) { } else if (tparam_.process_type == TreeProcessType::kUpdate) {
@ -383,11 +398,15 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
} }
// update the trees // update the trees
CHECK_EQ(gpair->Size(), p_fmat->Info().num_row_) auto n_out = model_.learner_model_param->OutputLength() * p_fmat->Info().num_row_;
<< "Mismatching size between number of rows from input data and size of " StringView msg{
"gradient vector."; "Mismatching size between number of rows from input data and size of gradient vector."};
if (!model_.learner_model_param->IsVectorLeaf() && p_fmat->Info().num_row_ != 0) {
CHECK_EQ(n_out % gpair->Size(), 0) << msg;
} else {
CHECK_EQ(gpair->Size(), n_out) << msg;
}
CHECK(out_position);
out_position->resize(new_trees.size()); out_position->resize(new_trees.size());
// Rescale learning rate according to the size of trees // Rescale learning rate according to the size of trees
@ -402,9 +421,13 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& new_trees) { void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& new_trees) {
monitor_.Start("CommitModel"); monitor_.Start("CommitModel");
for (uint32_t gid = 0; gid < model_.learner_model_param->num_output_group; ++gid) { if (this->model_.learner_model_param->IsVectorLeaf()) {
model_.CommitModel(std::move(new_trees[0]), 0);
} else {
for (std::uint32_t gid = 0; gid < model_.learner_model_param->OutputLength(); ++gid) {
model_.CommitModel(std::move(new_trees[gid]), gid); model_.CommitModel(std::move(new_trees[gid]), gid);
} }
}
monitor_.Stop("CommitModel"); monitor_.Stop("CommitModel");
} }
@ -564,11 +587,10 @@ void GBTree::PredictBatch(DMatrix* p_fmat,
if (out_preds->version == 0) { if (out_preds->version == 0) {
// out_preds->Size() can be non-zero as it's initialized here before any // out_preds->Size() can be non-zero as it's initialized here before any
// tree is built at the 0^th iterator. // tree is built at the 0^th iterator.
predictor->InitOutPredictions(p_fmat->Info(), &out_preds->predictions, predictor->InitOutPredictions(p_fmat->Info(), &out_preds->predictions, model_);
model_);
} }
uint32_t tree_begin, tree_end; std::uint32_t tree_begin, tree_end;
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end); std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees."; CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
if (tree_end > tree_begin) { if (tree_end > tree_begin) {
@ -577,7 +599,7 @@ void GBTree::PredictBatch(DMatrix* p_fmat,
if (reset) { if (reset) {
out_preds->version = 0; out_preds->version = 0;
} else { } else {
uint32_t delta = layer_end - out_preds->version; std::uint32_t delta = layer_end - out_preds->version;
out_preds->Update(delta); out_preds->Update(delta);
} }
} }
@ -770,6 +792,7 @@ class Dart : public GBTree {
void PredictBatchImpl(DMatrix *p_fmat, PredictionCacheEntry *p_out_preds, void PredictBatchImpl(DMatrix *p_fmat, PredictionCacheEntry *p_out_preds,
bool training, unsigned layer_begin, bool training, unsigned layer_begin,
unsigned layer_end) const { unsigned layer_end) const {
CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented();
auto &predictor = this->GetPredictor(&p_out_preds->predictions, p_fmat); auto &predictor = this->GetPredictor(&p_out_preds->predictions, p_fmat);
CHECK(predictor); CHECK(predictor);
predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
@ -830,6 +853,7 @@ class Dart : public GBTree {
void InplacePredict(std::shared_ptr<DMatrix> p_fmat, float missing, void InplacePredict(std::shared_ptr<DMatrix> p_fmat, float missing,
PredictionCacheEntry* p_out_preds, uint32_t layer_begin, PredictionCacheEntry* p_out_preds, uint32_t layer_begin,
unsigned layer_end) const override { unsigned layer_end) const override {
CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented();
uint32_t tree_begin, tree_end; uint32_t tree_begin, tree_end;
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end); std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
auto n_groups = model_.learner_model_param->num_output_group; auto n_groups = model_.learner_model_param->num_output_group;
@ -996,8 +1020,9 @@ class Dart : public GBTree {
} }
// set normalization factors // set normalization factors
inline size_t NormalizeTrees(size_t size_new_trees) { std::size_t NormalizeTrees(size_t size_new_trees) {
float lr = 1.0 * dparam_.learning_rate / size_new_trees; CHECK(tree_param_.GetInitialised());
float lr = 1.0 * tree_param_.learning_rate / size_new_trees;
size_t num_drop = idx_drop_.size(); size_t num_drop = idx_drop_.size();
if (num_drop == 0) { if (num_drop == 0) {
for (size_t i = 0; i < size_new_trees; ++i) { for (size_t i = 0; i < size_new_trees; ++i) {

View File

@ -111,8 +111,6 @@ struct DartTrainParam : public XGBoostParameter<DartTrainParam> {
bool one_drop; bool one_drop;
/*! \brief probability of skipping the dropout during an iteration */ /*! \brief probability of skipping the dropout during an iteration */
float skip_drop; float skip_drop;
/*! \brief learning step size for a time */
float learning_rate;
// declare parameters // declare parameters
DMLC_DECLARE_PARAMETER(DartTrainParam) { DMLC_DECLARE_PARAMETER(DartTrainParam) {
DMLC_DECLARE_FIELD(sample_type) DMLC_DECLARE_FIELD(sample_type)
@ -136,24 +134,27 @@ struct DartTrainParam : public XGBoostParameter<DartTrainParam> {
.set_range(0.0f, 1.0f) .set_range(0.0f, 1.0f)
.set_default(0.0f) .set_default(0.0f)
.describe("Probability of skipping the dropout during a boosting iteration."); .describe("Probability of skipping the dropout during a boosting iteration.");
DMLC_DECLARE_FIELD(learning_rate)
.set_lower_bound(0.0f)
.set_default(0.3f)
.describe("Learning rate(step size) of update.");
DMLC_DECLARE_ALIAS(learning_rate, eta);
} }
}; };
namespace detail { namespace detail {
// From here on, layer becomes concrete trees. // From here on, layer becomes concrete trees.
inline std::pair<uint32_t, uint32_t> LayerToTree(gbm::GBTreeModel const& model, inline std::pair<uint32_t, uint32_t> LayerToTree(gbm::GBTreeModel const& model,
size_t layer_begin, std::uint32_t layer_begin,
size_t layer_end) { std::uint32_t layer_end) {
bst_group_t groups = model.learner_model_param->num_output_group; std::uint32_t tree_begin;
uint32_t tree_begin = layer_begin * groups * model.param.num_parallel_tree; std::uint32_t tree_end;
uint32_t tree_end = layer_end * groups * model.param.num_parallel_tree; if (model.learner_model_param->IsVectorLeaf()) {
tree_begin = layer_begin * model.param.num_parallel_tree;
tree_end = layer_end * model.param.num_parallel_tree;
} else {
bst_group_t groups = model.learner_model_param->OutputLength();
tree_begin = layer_begin * groups * model.param.num_parallel_tree;
tree_end = layer_end * groups * model.param.num_parallel_tree;
}
if (tree_end == 0) { if (tree_end == 0) {
tree_end = static_cast<uint32_t>(model.trees.size()); tree_end = model.trees.size();
} }
if (model.trees.size() != 0) { if (model.trees.size() != 0) {
CHECK_LE(tree_begin, tree_end); CHECK_LE(tree_begin, tree_end);
@ -241,22 +242,25 @@ class GBTree : public GradientBooster {
void LoadModel(Json const& in) override; void LoadModel(Json const& in) override;
// Number of trees per layer. // Number of trees per layer.
auto LayerTrees() const { [[nodiscard]] std::uint32_t LayerTrees() const {
auto n_trees = model_.learner_model_param->num_output_group * model_.param.num_parallel_tree; if (model_.learner_model_param->IsVectorLeaf()) {
return n_trees; return model_.param.num_parallel_tree;
}
return model_.param.num_parallel_tree * model_.learner_model_param->OutputLength();
} }
// slice the trees, out must be already allocated // slice the trees, out must be already allocated
void Slice(int32_t layer_begin, int32_t layer_end, int32_t step, void Slice(int32_t layer_begin, int32_t layer_end, int32_t step,
GradientBooster *out, bool* out_of_bound) const override; GradientBooster *out, bool* out_of_bound) const override;
int32_t BoostedRounds() const override { [[nodiscard]] std::int32_t BoostedRounds() const override {
CHECK_NE(model_.param.num_parallel_tree, 0); CHECK_NE(model_.param.num_parallel_tree, 0);
CHECK_NE(model_.learner_model_param->num_output_group, 0); CHECK_NE(model_.learner_model_param->num_output_group, 0);
return model_.trees.size() / this->LayerTrees(); return model_.trees.size() / this->LayerTrees();
} }
bool ModelFitted() const override { [[nodiscard]] bool ModelFitted() const override {
return !model_.trees.empty() || !model_.trees_to_update.empty(); return !model_.trees.empty() || !model_.trees_to_update.empty();
} }
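For vector-leaf (`multi_output_tree`) models, the revised `LayerToTree` and `LayerTrees` above count only `num_parallel_tree` trees per boosting layer instead of one tree per output group. A worked example of the two mappings (values are illustrative):

```cpp
// Layer-to-tree mapping: with num_parallel_tree = 2 and 3 output targets, a
// scalar-leaf model stores 2 * 3 = 6 trees per layer, a vector-leaf model 2.
#include <cstdint>
#include <iostream>
#include <utility>

std::pair<std::uint32_t, std::uint32_t> LayerToTreeSketch(bool is_vector_leaf,
                                                          std::uint32_t n_groups,
                                                          std::uint32_t num_parallel_tree,
                                                          std::uint32_t layer_begin,
                                                          std::uint32_t layer_end) {
  std::uint32_t per_layer = is_vector_leaf ? num_parallel_tree : n_groups * num_parallel_tree;
  return {layer_begin * per_layer, layer_end * per_layer};
}

int main() {
  auto scalar_leaf = LayerToTreeSketch(false, 3, 2, 1, 2);  // layer 1 -> trees [6, 12)
  auto vector_leaf = LayerToTreeSketch(true, 3, 2, 1, 2);   // layer 1 -> trees [2, 4)
  std::cout << scalar_leaf.first << "-" << scalar_leaf.second << " "
            << vector_leaf.first << "-" << vector_leaf.second << "\n";
  return 0;
}
```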

View File

@ -326,7 +326,7 @@ struct LearnerTrainParam : public XGBoostParameter<LearnerTrainParam> {
std::string booster; std::string booster;
std::string objective; std::string objective;
// This is a training parameter and is not saved (nor loaded) in the model. // This is a training parameter and is not saved (nor loaded) in the model.
MultiStrategy multi_strategy{MultiStrategy::kComposite}; MultiStrategy multi_strategy{MultiStrategy::kOneOutputPerTree};
// declare parameters // declare parameters
DMLC_DECLARE_PARAMETER(LearnerTrainParam) { DMLC_DECLARE_PARAMETER(LearnerTrainParam) {
@ -339,12 +339,12 @@ struct LearnerTrainParam : public XGBoostParameter<LearnerTrainParam> {
.set_default("reg:squarederror") .set_default("reg:squarederror")
.describe("Objective function used for obtaining gradient."); .describe("Objective function used for obtaining gradient.");
DMLC_DECLARE_FIELD(multi_strategy) DMLC_DECLARE_FIELD(multi_strategy)
.add_enum("composite", MultiStrategy::kComposite) .add_enum("one_output_per_tree", MultiStrategy::kOneOutputPerTree)
.add_enum("monolithic", MultiStrategy::kMonolithic) .add_enum("multi_output_tree", MultiStrategy::kMultiOutputTree)
.set_default(MultiStrategy::kComposite) .set_default(MultiStrategy::kOneOutputPerTree)
.describe( .describe(
"Strategy used for training multi-target models. `mono` means building one single tree " "Strategy used for training multi-target models. `multi_output_tree` means building "
"for all targets."); "one single tree for all targets.");
} }
}; };
@ -440,7 +440,7 @@ class LearnerConfiguration : public Learner {
info.Validate(Ctx()->gpu_id); info.Validate(Ctx()->gpu_id);
// We estimate it from input data. // We estimate it from input data.
linalg::Tensor<float, 1> base_score; linalg::Tensor<float, 1> base_score;
UsePtr(obj_)->InitEstimation(info, &base_score); InitEstimation(info, &base_score);
CHECK_EQ(base_score.Size(), 1); CHECK_EQ(base_score.Size(), 1);
mparam_.base_score = base_score(0); mparam_.base_score = base_score(0);
CHECK(!std::isnan(mparam_.base_score)); CHECK(!std::isnan(mparam_.base_score));
@ -775,8 +775,6 @@ class LearnerConfiguration : public Learner {
} }
CHECK_NE(mparam_.num_feature, 0) CHECK_NE(mparam_.num_feature, 0)
<< "0 feature is supplied. Are you using raw Booster interface?"; << "0 feature is supplied. Are you using raw Booster interface?";
// Remove these once binary IO is gone.
cfg_["num_feature"] = common::ToString(mparam_.num_feature);
} }
void ConfigureGBM(LearnerTrainParam const& old, Args const& args) { void ConfigureGBM(LearnerTrainParam const& old, Args const& args) {
@ -859,17 +857,37 @@ class LearnerConfiguration : public Learner {
mparam_.num_target = n_targets; mparam_.num_target = n_targets;
} }
} }
void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) {
// Special handling for vertical federated learning.
if (collective::IsFederated() && info.data_split_mode == DataSplitMode::kCol) {
// We assume labels are only available on worker 0, so the estimation is calculated there
// and broadcast to other workers.
if (collective::GetRank() == 0) {
UsePtr(obj_)->InitEstimation(info, base_score);
collective::Broadcast(base_score->Data()->HostPointer(),
sizeof(bst_float) * base_score->Size(), 0);
} else {
base_score->Reshape(1);
collective::Broadcast(base_score->Data()->HostPointer(),
sizeof(bst_float) * base_score->Size(), 0);
}
} else {
UsePtr(obj_)->InitEstimation(info, base_score);
}
}
}; };
std::string const LearnerConfiguration::kEvalMetric {"eval_metric"}; // NOLINT std::string const LearnerConfiguration::kEvalMetric {"eval_metric"}; // NOLINT
class LearnerIO : public LearnerConfiguration { class LearnerIO : public LearnerConfiguration {
private: private:
std::set<std::string> saved_configs_ = {"num_round"};
// Used to identify the offset of JSON string when // Used to identify the offset of JSON string when
// Will be removed once JSON takes over. Right now we still loads some RDS files from R. // Will be removed once JSON takes over. Right now we still loads some RDS files from R.
std::string const serialisation_header_ { u8"CONFIG-offset:" }; std::string const serialisation_header_ { u8"CONFIG-offset:" };
void ClearCaches() { this->prediction_container_ = PredictionContainer{}; }
public: public:
explicit LearnerIO(std::vector<std::shared_ptr<DMatrix>> cache) : LearnerConfiguration{cache} {} explicit LearnerIO(std::vector<std::shared_ptr<DMatrix>> cache) : LearnerConfiguration{cache} {}
@ -922,6 +940,7 @@ class LearnerIO : public LearnerConfiguration {
} }
this->need_configuration_ = true; this->need_configuration_ = true;
this->ClearCaches();
} }
void SaveModel(Json* p_out) const override { void SaveModel(Json* p_out) const override {
@ -1015,21 +1034,11 @@ class LearnerIO : public LearnerConfiguration {
CHECK(fi->Read(&tparam_.booster)) << "BoostLearner: wrong model format"; CHECK(fi->Read(&tparam_.booster)) << "BoostLearner: wrong model format";
obj_.reset(ObjFunction::Create(tparam_.objective, &ctx_)); obj_.reset(ObjFunction::Create(tparam_.objective, &ctx_));
gbm_.reset(GradientBooster::Create(tparam_.booster, &ctx_, gbm_.reset(GradientBooster::Create(tparam_.booster, &ctx_, &learner_model_param_));
&learner_model_param_));
gbm_->Load(fi); gbm_->Load(fi);
if (mparam_.contain_extra_attrs != 0) { if (mparam_.contain_extra_attrs != 0) {
std::vector<std::pair<std::string, std::string> > attr; std::vector<std::pair<std::string, std::string> > attr;
fi->Read(&attr); fi->Read(&attr);
for (auto& kv : attr) {
const std::string prefix = "SAVED_PARAM_";
if (kv.first.find(prefix) == 0) {
const std::string saved_param = kv.first.substr(prefix.length());
if (saved_configs_.find(saved_param) != saved_configs_.end()) {
cfg_[saved_param] = kv.second;
}
}
}
attributes_ = std::map<std::string, std::string>(attr.begin(), attr.end()); attributes_ = std::map<std::string, std::string>(attr.begin(), attr.end());
} }
bool warn_old_model { false }; bool warn_old_model { false };
@ -1098,6 +1107,7 @@ class LearnerIO : public LearnerConfiguration {
cfg_.insert(n.cbegin(), n.cend()); cfg_.insert(n.cbegin(), n.cend());
this->need_configuration_ = true; this->need_configuration_ = true;
this->ClearCaches();
} }
// Save model into binary format. The code is about to be deprecated by more robust // Save model into binary format. The code is about to be deprecated by more robust
@ -1111,16 +1121,6 @@ class LearnerIO : public LearnerConfiguration {
std::vector<std::pair<std::string, std::string> > extra_attr; std::vector<std::pair<std::string, std::string> > extra_attr;
mparam.contain_extra_attrs = 1; mparam.contain_extra_attrs = 1;
{
std::vector<std::string> saved_params;
for (const auto& key : saved_params) {
auto it = cfg_.find(key);
if (it != cfg_.end()) {
mparam.contain_extra_attrs = 1;
extra_attr.emplace_back("SAVED_PARAM_" + key, it->second);
}
}
}
{ {
// Similar to JSON model IO, we save the objective. // Similar to JSON model IO, we save the objective.
Json j_obj { Object() }; Json j_obj { Object() };
@ -1305,7 +1305,7 @@ class LearnerImpl : public LearnerIO {
monitor_.Stop("PredictRaw"); monitor_.Stop("PredictRaw");
monitor_.Start("GetGradient"); monitor_.Start("GetGradient");
obj_->GetGradient(predt.predictions, train->Info(), iter, &gpair_); GetGradient(predt.predictions, train->Info(), iter, &gpair_);
monitor_.Stop("GetGradient"); monitor_.Stop("GetGradient");
TrainingObserver::Instance().Observe(gpair_, "Gradients"); TrainingObserver::Instance().Observe(gpair_, "Gradients");
@ -1484,6 +1484,28 @@ class LearnerImpl : public LearnerIO {
} }
private: private:
void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info, int iteration,
HostDeviceVector<GradientPair>* out_gpair) {
// Special handling for vertical federated learning.
if (collective::IsFederated() && info.data_split_mode == DataSplitMode::kCol) {
// We assume labels are only available on worker 0, so the gradients are calculated there
// and broadcast to other workers.
if (collective::GetRank() == 0) {
obj_->GetGradient(preds, info, iteration, out_gpair);
collective::Broadcast(out_gpair->HostPointer(), out_gpair->Size() * sizeof(GradientPair),
0);
} else {
CHECK_EQ(info.labels.Size(), 0)
<< "In vertical federated learning, labels should only be on the first worker";
out_gpair->Resize(preds.Size());
collective::Broadcast(out_gpair->HostPointer(), out_gpair->Size() * sizeof(GradientPair),
0);
}
} else {
obj_->GetGradient(preds, info, iteration, out_gpair);
}
}
/*! \brief random number transformation seed. */ /*! \brief random number transformation seed. */
static int32_t constexpr kRandSeedMagic = 127; static int32_t constexpr kRandSeedMagic = 127;
// gradient pairs // gradient pairs
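Both `InitEstimation` and `GetGradient` above follow the same vertical-federated pattern: worker 0, which holds the labels, computes the values, and `collective::Broadcast` sends them to the other workers, which only allocate the receive buffer. A single-process sketch of that pattern, with `FakeBroadcast` standing in for the real collective call:

```cpp
// "Rank 0 computes, everyone receives" sketch. FakeBroadcast is a stand-in for
// collective::Broadcast and simply copies rank 0's buffer to the other ranks.
#include <cstddef>
#include <iostream>
#include <vector>

struct GradientPairSketch { float grad, hess; };

void FakeBroadcast(std::vector<std::vector<GradientPairSketch>>* all_workers, int root) {
  for (std::size_t r = 0; r < all_workers->size(); ++r) {
    if (static_cast<int>(r) != root) {
      (*all_workers)[r] = (*all_workers)[root];
    }
  }
}

int main() {
  int world_size = 3;
  std::size_t n_rows = 4;
  std::vector<std::vector<GradientPairSketch>> gpair(world_size);

  // Only worker 0 holds labels, so only it computes the gradients.
  gpair[0].resize(n_rows);
  for (std::size_t i = 0; i < n_rows; ++i) {
    gpair[0][i] = {0.5f - static_cast<float>(i) * 0.1f, 1.0f};
  }
  // The other workers just size the buffer, like out_gpair->Resize(preds.Size()).
  for (int r = 1; r < world_size; ++r) {
    gpair[r].resize(n_rows);
  }

  FakeBroadcast(&gpair, /*root=*/0);
  std::cout << "worker 2, row 3 grad: " << gpair[2][3].grad << "\n";  // 0.2
  return 0;
}
```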

View File

@ -20,23 +20,51 @@
// corresponding headers that brings in those function declaration can't be included with CUDA). // corresponding headers that brings in those function declaration can't be included with CUDA).
// This precludes the CPU and GPU logic to coexist inside a .cu file // This precludes the CPU and GPU logic to coexist inside a .cu file
#include "rank_metric.h"
#include <dmlc/omp.h>
#include <dmlc/registry.h> #include <dmlc/registry.h>
#include <xgboost/metric.h>
#include <cmath> #include <algorithm> // for stable_sort, copy, fill_n, min, max
#include <vector> #include <array> // for array
#include <cmath> // for log, sqrt
#include <cstddef> // for size_t, std
#include <cstdint> // for uint32_t
#include <functional> // for less, greater
#include <map> // for operator!=, _Rb_tree_const_iterator
#include <memory> // for allocator, unique_ptr, shared_ptr, __shared_...
#include <numeric> // for accumulate
#include <ostream> // for operator<<, basic_ostream, ostringstream
#include <string> // for char_traits, operator<, basic_string, to_string
#include <utility> // for pair, make_pair
#include <vector> // for vector
#include "../collective/communicator-inl.h" #include "../collective/communicator-inl.h" // for IsDistributed, Allreduce
#include "../common/algorithm.h" // Sort #include "../collective/communicator.h" // for Operation
#include "../common/math.h" #include "../common/algorithm.h" // for ArgSort, Sort
#include "../common/ranking_utils.h" // MakeMetricName #include "../common/linalg_op.h" // for cbegin, cend
#include "../common/threading_utils.h" #include "../common/math.h" // for CmpFirst
#include "metric_common.h" #include "../common/optional_weight.h" // for OptionalWeights, MakeOptionalWeights
#include "xgboost/host_device_vector.h" #include "../common/ranking_utils.h" // for LambdaRankParam, NDCGCache, ParseMetricName
#include "../common/threading_utils.h" // for ParallelFor
#include "../common/transform_iterator.h" // for IndexTransformIter
#include "dmlc/common.h" // for OMPException
#include "metric_common.h" // for MetricNoCache, GPUMetric, PackedReduceResult
#include "xgboost/base.h" // for bst_float, bst_omp_uint, bst_group_t, Args
#include "xgboost/cache.h" // for DMatrixCache
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo, DMatrix
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/json.h" // for Json, FromJson, IsA, ToJson, get, Null, Object
#include "xgboost/linalg.h" // for Tensor, TensorView, Range, VectorView, MakeT...
#include "xgboost/logging.h" // for CHECK, ConsoleLogger, LOG_INFO, CHECK_EQ
#include "xgboost/metric.h" // for MetricReg, XGBOOST_REGISTER_METRIC, Metric
#include "xgboost/span.h" // for Span, operator!=
#include "xgboost/string_view.h" // for StringView
namespace { namespace {
using PredIndPair = std::pair<xgboost::bst_float, uint32_t>; using PredIndPair = std::pair<xgboost::bst_float, xgboost::ltr::rel_degree_t>;
using PredIndPairContainer = std::vector<PredIndPair>; using PredIndPairContainer = std::vector<PredIndPair>;
/* /*
@ -87,8 +115,7 @@ class PerGroupWeightPolicy {
} // anonymous namespace } // anonymous namespace
namespace xgboost { namespace xgboost::metric {
namespace metric {
// tag the this file, used by force static link later. // tag the this file, used by force static link later.
DMLC_REGISTRY_FILE_TAG(rank_metric); DMLC_REGISTRY_FILE_TAG(rank_metric);
@ -257,71 +284,6 @@ struct EvalPrecision : public EvalRank {
} }
}; };
/*! \brief NDCG: Normalized Discounted Cumulative Gain at N */
struct EvalNDCG : public EvalRank {
private:
double CalcDCG(const PredIndPairContainer &rec) const {
double sumdcg = 0.0;
for (size_t i = 0; i < rec.size() && i < this->topn; ++i) {
const unsigned rel = rec[i].second;
if (rel != 0) {
sumdcg += ((1 << rel) - 1) / std::log2(i + 2.0);
}
}
return sumdcg;
}
public:
explicit EvalNDCG(const char* name, const char* param) : EvalRank(name, param) {}
double EvalGroup(PredIndPairContainer *recptr) const override {
PredIndPairContainer &rec(*recptr);
std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
double dcg = CalcDCG(rec);
std::stable_sort(rec.begin(), rec.end(), common::CmpSecond);
double idcg = CalcDCG(rec);
if (idcg == 0.0f) {
if (this->minus) {
return 0.0f;
} else {
return 1.0f;
}
}
return dcg/idcg;
}
};
/*! \brief Mean Average Precision at N, for both classification and rank */
struct EvalMAP : public EvalRank {
public:
explicit EvalMAP(const char* name, const char* param) : EvalRank(name, param) {}
double EvalGroup(PredIndPairContainer *recptr) const override {
PredIndPairContainer &rec(*recptr);
std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
unsigned nhits = 0;
double sumap = 0.0;
for (size_t i = 0; i < rec.size(); ++i) {
if (rec[i].second != 0) {
nhits += 1;
if (i < this->topn) {
sumap += static_cast<double>(nhits) / (i + 1);
}
}
}
if (nhits != 0) {
sumap /= nhits;
return sumap;
} else {
if (this->minus) {
return 0.0;
} else {
return 1.0;
}
}
}
};
/*! \brief Cox: Partial likelihood of the Cox proportional hazards model */ /*! \brief Cox: Partial likelihood of the Cox proportional hazards model */
struct EvalCox : public MetricNoCache { struct EvalCox : public MetricNoCache {
public: public:
@ -377,16 +339,213 @@ XGBOOST_REGISTER_METRIC(Precision, "pre")
.describe("precision@k for rank.") .describe("precision@k for rank.")
.set_body([](const char* param) { return new EvalPrecision("pre", param); }); .set_body([](const char* param) { return new EvalPrecision("pre", param); });
XGBOOST_REGISTER_METRIC(NDCG, "ndcg")
.describe("ndcg@k for rank.")
.set_body([](const char* param) { return new EvalNDCG("ndcg", param); });
XGBOOST_REGISTER_METRIC(MAP, "map")
.describe("map@k for rank.")
.set_body([](const char* param) { return new EvalMAP("map", param); });
XGBOOST_REGISTER_METRIC(Cox, "cox-nloglik") XGBOOST_REGISTER_METRIC(Cox, "cox-nloglik")
.describe("Negative log partial likelihood of Cox proportional hazards model.") .describe("Negative log partial likelihood of Cox proportional hazards model.")
.set_body([](const char*) { return new EvalCox(); }); .set_body([](const char*) { return new EvalCox(); });
} // namespace metric
} // namespace xgboost // ranking metrics that requires cache
template <typename Cache>
class EvalRankWithCache : public Metric {
protected:
ltr::LambdaRankParam param_;
bool minus_{false};
std::string name_;
DMatrixCache<Cache> cache_{DMatrixCache<Cache>::DefaultSize()};
public:
EvalRankWithCache(StringView name, const char* param) {
auto constexpr kMax = ltr::LambdaRankParam::NotSet();
std::uint32_t topn{kMax};
this->name_ = ltr::ParseMetricName(name, param, &topn, &minus_);
if (topn != kMax) {
param_.UpdateAllowUnknown(Args{{"lambdarank_num_pair_per_sample", std::to_string(topn)},
{"lambdarank_pair_method", "topk"}});
}
param_.UpdateAllowUnknown(Args{});
}
void Configure(Args const&) override {
// Do not configure; otherwise the ndcg param would be forced to match the one in the
// objective.
}
void LoadConfig(Json const& in) override {
if (IsA<Null>(in)) {
return;
}
auto const& obj = get<Object const>(in);
auto it = obj.find("lambdarank_param");
if (it != obj.cend()) {
FromJson(it->second, &param_);
}
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["name"] = String{this->Name()};
out["lambdarank_param"] = ToJson(param_);
}
double Evaluate(HostDeviceVector<float> const& preds, std::shared_ptr<DMatrix> p_fmat) override {
auto const& info = p_fmat->Info();
auto p_cache = cache_.CacheItem(p_fmat, ctx_, info, param_);
if (p_cache->Param() != param_) {
p_cache = cache_.ResetItem(p_fmat, ctx_, info, param_);
}
CHECK(p_cache->Param() == param_);
CHECK_EQ(preds.Size(), info.labels.Size());
return this->Eval(preds, info, p_cache);
}
virtual double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info,
std::shared_ptr<Cache> p_cache) = 0;
};
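
Note: the constructor above relies on ltr::ParseMetricName to split a metric name such as "ndcg@10-" into a base name, a top-k cutoff, and a trailing '-' flag. A stand-alone sketch of that naming convention, using a hypothetical ParseTopK helper rather than the library function:

#include <cstdint>
#include <iostream>
#include <string>

// Hypothetical helper mirroring the "name@k-" convention: returns the base name,
// fills *topn with k (if present) and *minus when the name ends in '-'.
std::string ParseTopK(std::string const& name, std::uint32_t* topn, bool* minus) {
  std::string base = name;
  if (!base.empty() && base.back() == '-') {
    *minus = true;
    base.pop_back();
  }
  auto at = base.find('@');
  if (at != std::string::npos) {
    *topn = static_cast<std::uint32_t>(std::stoul(base.substr(at + 1)));
    base = base.substr(0, at);
  }
  return base;
}

int main() {
  std::uint32_t topn = 0;
  bool minus = false;
  std::cout << ParseTopK("ndcg@10-", &topn, &minus) << " " << topn << " " << minus << "\n";
  // prints: ndcg 10 1
}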
namespace {
double Finalize(double score, double sw) {
std::array<double, 2> dat{score, sw};
collective::Allreduce<collective::Operation::kSum>(dat.data(), dat.size());
if (sw > 0.0) {
score = score / sw;
}
CHECK_LE(score, 1.0 + kRtEps)
<< "Invalid output score, might be caused by invalid query group weight.";
score = std::min(1.0, score);
return score;
}
} // namespace
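
Finalize averages the locally accumulated (weighted score, weight) pair across workers and clamps the result; collective::Allreduce with kSum is assumed to behave like an element-wise sum over all workers. A single-process sketch of the same arithmetic:

#include <algorithm>
#include <array>
#include <iostream>
#include <vector>

int main() {
  // Pretend these pairs came from three workers: {sum of weighted scores, sum of weights}.
  std::vector<std::array<double, 2>> workers{{1.8, 2.0}, {0.9, 1.0}, {2.7, 3.0}};
  std::array<double, 2> dat{0.0, 0.0};  // what the allreduce would leave on every worker
  for (auto const& w : workers) {
    dat[0] += w[0];
    dat[1] += w[1];
  }
  double score = dat[1] > 0.0 ? dat[0] / dat[1] : dat[0];
  score = std::min(1.0, score);  // clamp, matching the CHECK_LE guard above
  std::cout << score << "\n";   // (1.8 + 0.9 + 2.7) / (2 + 1 + 3) = 0.9
}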
/**
* \brief Implement the NDCG score function for learning to rank.
*
* Ties are ignored, which can lead to results that differ from other implementations.
*/
class EvalNDCG : public EvalRankWithCache<ltr::NDCGCache> {
public:
using EvalRankWithCache::EvalRankWithCache;
const char* Name() const override { return name_.c_str(); }
double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info,
std::shared_ptr<ltr::NDCGCache> p_cache) override {
if (ctx_->IsCUDA()) {
auto ndcg = cuda_impl::NDCGScore(ctx_, info, preds, minus_, p_cache);
return Finalize(ndcg.Residue(), ndcg.Weights());
}
// group local ndcg
auto group_ptr = p_cache->DataGroupPtr(ctx_);
bst_group_t n_groups = group_ptr.size() - 1;
auto ndcg_gloc = p_cache->Dcg(ctx_);
std::fill_n(ndcg_gloc.Values().data(), ndcg_gloc.Size(), 0.0);
auto h_inv_idcg = p_cache->InvIDCG(ctx_);
auto p_discount = p_cache->Discount(ctx_).data();
auto h_label = info.labels.HostView();
auto h_predt = linalg::MakeTensorView(ctx_, &preds, preds.Size());
auto weights = common::MakeOptionalWeights(ctx_, info.weights_);
common::ParallelFor(n_groups, ctx_->Threads(), [&](auto g) {
auto g_predt = h_predt.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]));
auto g_labels = h_label.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]), 0);
auto sorted_idx = common::ArgSort<std::size_t>(ctx_, linalg::cbegin(g_predt),
linalg::cend(g_predt), std::greater<>{});
double ndcg{.0};
double inv_idcg = h_inv_idcg(g);
if (inv_idcg <= 0.0) {
ndcg_gloc(g) = minus_ ? 0.0 : 1.0;
return;
}
std::size_t n{std::min(sorted_idx.size(), static_cast<std::size_t>(param_.TopK()))};
if (param_.ndcg_exp_gain) {
for (std::size_t i = 0; i < n; ++i) {
ndcg += p_discount[i] * ltr::CalcDCGGain(g_labels(sorted_idx[i])) * inv_idcg;
}
} else {
for (std::size_t i = 0; i < n; ++i) {
ndcg += p_discount[i] * g_labels(sorted_idx[i]) * inv_idcg;
}
}
ndcg_gloc(g) += ndcg * weights[g];
});
double sum_w{0};
if (weights.Empty()) {
sum_w = n_groups;
} else {
sum_w = std::accumulate(weights.weights.cbegin(), weights.weights.cend(), 0.0);
}
auto ndcg = std::accumulate(linalg::cbegin(ndcg_gloc), linalg::cend(ndcg_gloc), 0.0);
return Finalize(ndcg, sum_w);
}
};
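
For reference, the per-group quantity assembled above is NDCG@k: documents are ranked by prediction, DCG@k = sum_{i < k} gain(rel_i) / log2(i + 2) with gain(r) = 2^r - 1 when ndcg_exp_gain is on (plain r otherwise), and the result is normalised by the ideal DCG. A small stand-alone sketch, not the library code:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

double DCG(std::vector<double> const& rel, std::size_t topk, bool exp_gain) {
  double dcg = 0.0;
  for (std::size_t i = 0; i < std::min(topk, rel.size()); ++i) {
    double gain = exp_gain ? std::exp2(rel[i]) - 1.0 : rel[i];
    dcg += gain / std::log2(static_cast<double>(i) + 2.0);
  }
  return dcg;
}

int main() {
  // Relevance labels in the order produced by sorting on the model's predictions.
  std::vector<double> ranked{3, 2, 3, 0, 1};
  std::vector<double> ideal = ranked;
  std::sort(ideal.begin(), ideal.end(), std::greater<>{});
  double idcg = DCG(ideal, 5, /*exp_gain=*/true);
  // An all-zero group has idcg == 0 and scores 1 (or 0 with the minus flag), as in the class above.
  std::cout << (idcg > 0.0 ? DCG(ranked, 5, true) / idcg : 1.0) << "\n";
}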
class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
public:
using EvalRankWithCache::EvalRankWithCache;
const char* Name() const override { return name_.c_str(); }
double Eval(HostDeviceVector<float> const& predt, MetaInfo const& info,
std::shared_ptr<ltr::MAPCache> p_cache) override {
if (ctx_->IsCUDA()) {
auto map = cuda_impl::MAPScore(ctx_, info, predt, minus_, p_cache);
return Finalize(map.Residue(), map.Weights());
}
auto gptr = p_cache->DataGroupPtr(ctx_);
auto h_label = info.labels.HostView().Slice(linalg::All(), 0);
auto h_predt = linalg::MakeTensorView(ctx_, &predt, predt.Size());
auto map_gloc = p_cache->Map(ctx_);
std::fill_n(map_gloc.data(), map_gloc.size(), 0.0);
auto rank_idx = p_cache->SortedIdx(ctx_, predt.ConstHostSpan());
common::ParallelFor(p_cache->Groups(), ctx_->Threads(), [&](auto g) {
auto g_predt = h_predt.Slice(linalg::Range(gptr[g], gptr[g + 1]));
auto g_label = h_label.Slice(linalg::Range(gptr[g], gptr[g + 1]));
auto g_rank = rank_idx.subspan(gptr[g]);
auto n = std::min(static_cast<std::size_t>(param_.TopK()), g_label.Size());
double n_hits{0.0};
for (std::size_t i = 0; i < n; ++i) {
auto p = g_label(g_rank[i]);
n_hits += p;
map_gloc[g] += n_hits / static_cast<double>((i + 1)) * p;
}
for (std::size_t i = n; i < g_label.Size(); ++i) {
n_hits += g_label(g_rank[i]);
}
if (n_hits > 0.0) {
map_gloc[g] /= std::min(n_hits, static_cast<double>(param_.TopK()));
} else {
map_gloc[g] = minus_ ? 0.0 : 1.0;
}
});
auto sw = 0.0;
auto weight = common::MakeOptionalWeights(ctx_, info.weights_);
if (!weight.Empty()) {
CHECK_EQ(weight.weights.size(), p_cache->Groups());
}
for (std::size_t i = 0; i < map_gloc.size(); ++i) {
map_gloc[i] = map_gloc[i] * weight[i];
sw += weight[i];
}
auto sum = std::accumulate(map_gloc.cbegin(), map_gloc.cend(), 0.0);
return Finalize(sum, sw);
}
};
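
The per-group value built above is average precision: with binary relevance, AP@k sums precision@(i+1) over the relevant positions i < k and divides by min(number of relevant documents, k); a group with no relevant documents scores 1 (or 0 when the minus flag is set). A minimal sketch under that reading:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

double AveragePrecision(std::vector<int> const& rel_ranked, std::size_t topk) {
  double hits = 0.0, ap = 0.0;
  for (std::size_t i = 0; i < std::min(topk, rel_ranked.size()); ++i) {
    if (rel_ranked[i] > 0) {
      hits += 1.0;
      ap += hits / static_cast<double>(i + 1);  // precision at this relevant position
    }
  }
  double total_rel = 0.0;
  for (int r : rel_ranked) total_rel += (r > 0);
  if (total_rel == 0.0) return 1.0;  // the minus flag in the metric flips this to 0
  return ap / std::min(total_rel, static_cast<double>(topk));
}

int main() {
  std::cout << AveragePrecision({1, 0, 1, 1, 0}, 5) << "\n";  // (1/1 + 2/3 + 3/4) / 3
}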
XGBOOST_REGISTER_METRIC(EvalMAP, "map")
.describe("map@k for ranking.")
.set_body([](char const* param) {
return new EvalMAPScore{"map", param};
});
XGBOOST_REGISTER_METRIC(EvalNDCG, "ndcg")
.describe("ndcg@k for ranking.")
.set_body([](char const* param) {
return new EvalNDCG{"ndcg", param};
});
} // namespace xgboost::metric

View File

@ -2,22 +2,29 @@
* Copyright 2020-2023 by XGBoost Contributors * Copyright 2020-2023 by XGBoost Contributors
*/ */
#include <dmlc/registry.h> #include <dmlc/registry.h>
#include <thrust/iterator/counting_iterator.h> // make_counting_iterator #include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include <thrust/reduce.h> // reduce #include <thrust/reduce.h> // for reduce
#include <xgboost/metric.h>
#include <cstddef> // std::size_t #include <algorithm> // for transform
#include <memory> // std::shared_ptr #include <cstddef> // for size_t
#include <memory> // for shared_ptr
#include <vector> // for vector
#include "../common/cuda_context.cuh" // CUDAContext #include "../common/cuda_context.cuh" // for CUDAContext
#include "../common/device_helpers.cuh" // for MakeTransformIterator
#include "../common/optional_weight.h" // for MakeOptionalWeights
#include "../common/ranking_utils.cuh" // for CalcQueriesDCG, NDCGCache
#include "metric_common.h" #include "metric_common.h"
#include "xgboost/base.h" // XGBOOST_DEVICE #include "rank_metric.h"
#include "xgboost/context.h" // Context #include "xgboost/base.h" // for XGBOOST_DEVICE
#include "xgboost/data.h" // MetaInfo #include "xgboost/context.h" // for Context
#include "xgboost/host_device_vector.h" // HostDeviceVector #include "xgboost/data.h" // for MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/linalg.h" // for MakeTensorView
#include "xgboost/logging.h" // for CHECK
#include "xgboost/metric.h"
namespace xgboost { namespace xgboost::metric {
namespace metric {
// tag this file, used to force static linking later. // tag this file, used to force static linking later.
DMLC_REGISTRY_FILE_TAG(rank_metric_gpu); DMLC_REGISTRY_FILE_TAG(rank_metric_gpu);
@ -134,200 +141,125 @@ struct EvalPrecisionGpu {
} }
}; };
/*! \brief NDCG: Normalized Discounted Cumulative Gain at N */
struct EvalNDCGGpu {
public:
static void ComputeDCG(const dh::SegmentSorter<float> &pred_sorter,
const float *dlabels,
const EvalRankConfig &ecfg,
// The order in which labels have to be accessed. The order is determined
// by sorting the predictions or the labels for the entire dataset
const xgboost::common::Span<const uint32_t> &dlabels_sort_order,
dh::caching_device_vector<double> *dcgptr) {
dh::caching_device_vector<double> &dcgs(*dcgptr);
// Group info on device
const auto &dgroups = pred_sorter.GetGroupsSpan();
const auto &dgroup_idx = pred_sorter.GetGroupSegmentsSpan();
// First, determine non zero labels in the dataset individually
auto DetermineNonTrivialLabelLambda = [=] __device__(uint32_t idx) {
return (static_cast<unsigned>(dlabels[dlabels_sort_order[idx]]));
}; // NOLINT
// Find each group's DCG value
const auto nitems = pred_sorter.GetNumItems();
auto *ddcgs = dcgs.data().get();
int device_id = -1;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&device_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&device_id));
#endif
// For each group item compute the aggregated precision
dh::LaunchN(nitems, nullptr, [=] __device__(uint32_t idx) {
const auto group_idx = dgroup_idx[idx];
const auto group_begin = dgroups[group_idx];
const auto ridx = idx - group_begin;
auto label = DetermineNonTrivialLabelLambda(idx);
if (ridx < ecfg.topn && label) {
atomicAdd(&ddcgs[group_idx], ((1 << label) - 1) / std::log2(ridx + 2.0));
}
});
}
static double EvalMetric(const dh::SegmentSorter<float> &pred_sorter,
const float *dlabels,
const EvalRankConfig &ecfg) {
// Sort the labels and compute IDCG
dh::SegmentSorter<float> segment_label_sorter;
segment_label_sorter.SortItems(dlabels, pred_sorter.GetNumItems(),
pred_sorter.GetGroupSegmentsSpan());
uint32_t ngroups = pred_sorter.GetNumGroups();
dh::caching_device_vector<double> idcg(ngroups, 0);
ComputeDCG(pred_sorter, dlabels, ecfg, segment_label_sorter.GetOriginalPositionsSpan(), &idcg);
// Compute the DCG values next
dh::caching_device_vector<double> dcg(ngroups, 0);
ComputeDCG(pred_sorter, dlabels, ecfg, pred_sorter.GetOriginalPositionsSpan(), &dcg);
double *ddcg = dcg.data().get();
double *didcg = idcg.data().get();
int device_id = -1;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&device_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&device_id));
#endif
// Compute the group's DCG and reduce it across all groups
dh::LaunchN(ngroups, nullptr, [=] __device__(uint32_t gidx) {
if (didcg[gidx] == 0.0f) {
ddcg[gidx] = (ecfg.minus) ? 0.0f : 1.0f;
} else {
ddcg[gidx] /= didcg[gidx];
}
});
// Allocator to be used for managing space overhead while performing reductions
dh::XGBCachingDeviceAllocator<char> alloc;
#if defined(XGBOOST_USE_CUDA)
return thrust::reduce(thrust::cuda::par(alloc), dcg.begin(), dcg.end());
#elif defined(XGBOOST_USE_HIP)
return thrust::reduce(thrust::hip::par(alloc), dcg.begin(), dcg.end());
#endif
}
};
/*! \brief Mean Average Precision at N, for both classification and rank */
struct EvalMAPGpu {
public:
static double EvalMetric(const dh::SegmentSorter<float> &pred_sorter,
const float *dlabels,
const EvalRankConfig &ecfg) {
// Group info on device
const auto &dgroups = pred_sorter.GetGroupsSpan();
const auto ngroups = pred_sorter.GetNumGroups();
const auto &dgroup_idx = pred_sorter.GetGroupSegmentsSpan();
// Original positions of the predictions after they have been sorted
const auto &dpreds_orig_pos = pred_sorter.GetOriginalPositionsSpan();
// First, determine non zero labels in the dataset individually
const auto nitems = pred_sorter.GetNumItems();
dh::caching_device_vector<uint32_t> hits(nitems, 0);
auto DetermineNonTrivialLabelLambda = [=] __device__(uint32_t idx) {
return (static_cast<unsigned>(dlabels[dpreds_orig_pos[idx]]) != 0) ? 1 : 0;
}; // NOLINT
thrust::transform(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
thrust::make_counting_iterator(nitems),
hits.begin(),
DetermineNonTrivialLabelLambda);
// Allocator to be used by sort for managing space overhead while performing prefix scans
dh::XGBCachingDeviceAllocator<char> alloc;
// Next, prefix scan the nontrivial labels that are segmented to accumulate them.
// This is required for computing the metric sum
// Data segmented into different groups...
#if defined(XGBOOST_USE_CUDA)
thrust::inclusive_scan_by_key(thrust::cuda::par(alloc),
dh::tcbegin(dgroup_idx), dh::tcend(dgroup_idx),
hits.begin(), // Input value
hits.begin()); // In-place scan
#elif defined(XGBOOST_USE_HIP)
thrust::inclusive_scan_by_key(thrust::hip::par(alloc),
dh::tcbegin(dgroup_idx), dh::tcend(dgroup_idx),
hits.begin(), // Input value
hits.begin()); // In-place scan
#endif
// Find each group's metric sum
dh::caching_device_vector<double> sumap(ngroups, 0);
auto *dsumap = sumap.data().get();
const auto *dhits = hits.data().get();
int device_id = -1;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&device_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&device_id));
#endif
// For each group item compute the aggregated precision
dh::LaunchN(nitems, nullptr, [=] __device__(uint32_t idx) {
if (DetermineNonTrivialLabelLambda(idx)) {
const auto group_idx = dgroup_idx[idx];
const auto group_begin = dgroups[group_idx];
const auto ridx = idx - group_begin;
if (ridx < ecfg.topn) {
atomicAdd(&dsumap[group_idx],
static_cast<double>(dhits[idx]) / (ridx + 1));
}
}
});
// Aggregate the group's item precisions
dh::LaunchN(ngroups, nullptr, [=] __device__(uint32_t gidx) {
auto nhits = dgroups[gidx + 1] ? dhits[dgroups[gidx + 1] - 1] : 0;
if (nhits != 0) {
dsumap[gidx] /= nhits;
} else {
if (ecfg.minus) {
dsumap[gidx] = 0;
} else {
dsumap[gidx] = 1;
}
}
});
#if defined(XGBOOST_USE_CUDA)
return thrust::reduce(thrust::cuda::par(alloc), sumap.begin(), sumap.end());
#elif defined(XGBOOST_USE_HIP)
return thrust::reduce(thrust::hip::par(alloc), sumap.begin(), sumap.end());
#endif
}
};
XGBOOST_REGISTER_GPU_METRIC(PrecisionGpu, "pre") XGBOOST_REGISTER_GPU_METRIC(PrecisionGpu, "pre")
.describe("precision@k for rank computed on GPU.") .describe("precision@k for rank computed on GPU.")
.set_body([](const char* param) { return new EvalRankGpu<EvalPrecisionGpu>("pre", param); }); .set_body([](const char* param) { return new EvalRankGpu<EvalPrecisionGpu>("pre", param); });
XGBOOST_REGISTER_GPU_METRIC(NDCGGpu, "ndcg") namespace cuda_impl {
.describe("ndcg@k for rank computed on GPU.") PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
.set_body([](const char* param) { return new EvalRankGpu<EvalNDCGGpu>("ndcg", param); }); HostDeviceVector<float> const &predt, bool minus,
std::shared_ptr<ltr::NDCGCache> p_cache) {
CHECK(p_cache);
XGBOOST_REGISTER_GPU_METRIC(MAPGpu, "map") auto const &p = p_cache->Param();
.describe("map@k for rank computed on GPU.") auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
.set_body([](const char* param) { return new EvalRankGpu<EvalMAPGpu>("map", param); }); if (!d_weight.Empty()) {
} // namespace metric CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
} // namespace xgboost }
auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());
auto d_group_ptr = p_cache->DataGroupPtr(ctx);
auto d_inv_idcg = p_cache->InvIDCG(ctx);
auto d_sorted_idx = p_cache->SortedIdx(ctx, d_predt.Values());
auto d_out_dcg = p_cache->Dcg(ctx);
ltr::cuda_impl::CalcQueriesDCG(ctx, d_label, d_sorted_idx, p.ndcg_exp_gain, d_group_ptr, p.TopK(),
d_out_dcg);
auto it = dh::MakeTransformIterator<PackedReduceResult>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) {
if (d_inv_idcg(i) <= 0.0) {
return PackedReduceResult{minus ? 0.0 : 1.0, static_cast<double>(d_weight[i])};
}
return PackedReduceResult{d_out_dcg(i) * d_inv_idcg(i) * d_weight[i],
static_cast<double>(d_weight[i])};
});
auto pair = thrust::reduce(ctx->CUDACtx()->CTP(), it, it + d_out_dcg.Size(),
PackedReduceResult{0.0, 0.0});
return pair;
}
PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt, bool minus,
std::shared_ptr<ltr::MAPCache> p_cache) {
auto d_group_ptr = p_cache->DataGroupPtr(ctx);
auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
auto key_it = dh::MakeTransformIterator<std::size_t>(
thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) { return dh::SegmentId(d_group_ptr, i); });
auto get_label = [=] XGBOOST_DEVICE(std::size_t i) {
auto g = key_it[i];
auto g_begin = d_group_ptr[g];
auto g_end = d_group_ptr[g + 1];
i -= g_begin;
auto g_label = d_label.Slice(linalg::Range(g_begin, g_end));
auto g_rank = d_rank_idx.subspan(g_begin, g_end - g_begin);
return g_label(g_rank[i]);
};
auto it = dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), get_label);
auto cuctx = ctx->CUDACtx();
auto n_rel = p_cache->NumRelevant(ctx);
thrust::inclusive_scan_by_key(cuctx->CTP(), key_it, key_it + d_label.Size(), it, n_rel.data());
double topk = p_cache->Param().TopK();
auto map = p_cache->Map(ctx);
thrust::fill_n(cuctx->CTP(), map.data(), map.size(), 0.0);
{
auto val_it = dh::MakeTransformIterator<double>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) {
auto g = key_it[i];
auto g_begin = d_group_ptr[g];
auto g_end = d_group_ptr[g + 1];
i -= g_begin;
if (i >= topk) {
return 0.0;
}
auto g_label = d_label.Slice(linalg::Range(g_begin, g_end));
auto g_rank = d_rank_idx.subspan(g_begin, g_end - g_begin);
auto label = g_label(g_rank[i]);
auto g_n_rel = n_rel.subspan(g_begin, g_end - g_begin);
auto nhits = g_n_rel[i];
return nhits / static_cast<double>(i + 1) * label;
});
std::size_t bytes;
cub::DeviceSegmentedReduce::Sum(nullptr, bytes, val_it, map.data(), p_cache->Groups(),
d_group_ptr.data(), d_group_ptr.data() + 1, cuctx->Stream());
dh::TemporaryArray<char> temp(bytes);
cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, val_it, map.data(), p_cache->Groups(),
d_group_ptr.data(), d_group_ptr.data() + 1, cuctx->Stream());
}
PackedReduceResult result{0.0, 0.0};
{
auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
if (!d_weight.Empty()) {
CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
}
auto val_it = dh::MakeTransformIterator<PackedReduceResult>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t g) {
auto g_begin = d_group_ptr[g];
auto g_end = d_group_ptr[g + 1];
auto g_n_rel = n_rel.subspan(g_begin, g_end - g_begin);
if (!g_n_rel.empty() && g_n_rel.back() > 0.0) {
return PackedReduceResult{map[g] * d_weight[g] / std::min(g_n_rel.back(), topk),
static_cast<double>(d_weight[g])};
}
return PackedReduceResult{minus ? 0.0 : 1.0, static_cast<double>(d_weight[g])};
});
result =
thrust::reduce(cuctx->CTP(), val_it, val_it + map.size(), PackedReduceResult{0.0, 0.0});
}
return result;
}
} // namespace cuda_impl
} // namespace xgboost::metric
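
A note on the MAPScore kernel above: cub::DeviceSegmentedReduce::Sum collapses the per-document AP terms into one value per query group, using the group pointer as segment offsets (with CUB's usual two-pass call, first with a null buffer to size the temporary storage). A plain C++ sketch of what the segmented sum computes, with group_ptr standing in for the device offsets:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Per-document values (e.g. the AP terms) and CSR-style group offsets.
  std::vector<double> values{0.5, 0.25, 0.0, 1.0, 0.5};
  std::vector<std::size_t> group_ptr{0, 3, 5};  // two groups: [0, 3) and [3, 5)

  std::vector<double> per_group(group_ptr.size() - 1, 0.0);
  for (std::size_t g = 0; g + 1 < group_ptr.size(); ++g) {
    for (std::size_t i = group_ptr[g]; i < group_ptr[g + 1]; ++i) {
      per_group[g] += values[i];  // what DeviceSegmentedReduce::Sum produces per segment
    }
  }
  for (double v : per_group) std::cout << v << "\n";  // 0.75 and 1.5
}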

44
src/metric/rank_metric.h Normal file
View File

@ -0,0 +1,44 @@
#ifndef XGBOOST_METRIC_RANK_METRIC_H_
#define XGBOOST_METRIC_RANK_METRIC_H_
/**
* Copyright 2023 by XGBoost Contributors
*/
#include <memory> // for shared_ptr
#include "../common/common.h" // for AssertGPUSupport
#include "../common/ranking_utils.h" // for NDCGCache, MAPCache
#include "metric_common.h" // for PackedReduceResult
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
namespace xgboost {
namespace metric {
namespace cuda_impl {
PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt, bool minus,
std::shared_ptr<ltr::NDCGCache> p_cache);
PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt, bool minus,
std::shared_ptr<ltr::MAPCache> p_cache);
#if !defined(XGBOOST_USE_CUDA)
inline PackedReduceResult NDCGScore(Context const *, MetaInfo const &,
HostDeviceVector<float> const &, bool,
std::shared_ptr<ltr::NDCGCache>) {
common::AssertGPUSupport();
return {};
}
inline PackedReduceResult MAPScore(Context const *, MetaInfo const &,
HostDeviceVector<float> const &, bool,
std::shared_ptr<ltr::MAPCache>) {
common::AssertGPUSupport();
return {};
}
#endif
} // namespace cuda_impl
} // namespace metric
} // namespace xgboost
#endif // XGBOOST_METRIC_RANK_METRIC_H_
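
The header above follows a common pattern in this code base: declare the CUDA implementations, and when XGBOOST_USE_CUDA is not defined, provide inline stubs that assert GPU support (common::AssertGPUSupport) and return a default value so CPU-only builds still link. A generic sketch of the idiom; the names here are illustrative, not the library's, and the stub throws where the real code aborts with a message:

#include <stdexcept>

struct Result { double value{0.0}; };

#if defined(MYLIB_USE_CUDA)
Result GpuScore();  // real implementation lives in a .cu file
#else
inline Result GpuScore() {
  // CPU-only build: reaching the GPU path is a configuration error.
  throw std::runtime_error("compiled without CUDA support");
}
#endif

int main() {
  try {
    GpuScore();
  } catch (std::exception const&) {
    // reached in a CPU-only build
  }
  return 0;
}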

View File

@ -33,7 +33,7 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* b
new_obj->GetGradient(dummy_predt, info, 0, &gpair); new_obj->GetGradient(dummy_predt, info, 0, &gpair);
bst_target_t n_targets = this->Targets(info); bst_target_t n_targets = this->Targets(info);
linalg::Vector<float> leaf_weight; linalg::Vector<float> leaf_weight;
tree::FitStump(this->ctx_, gpair, n_targets, &leaf_weight); tree::FitStump(this->ctx_, info, gpair, n_targets, &leaf_weight);
// workaround, we don't support multi-target due to binary model serialization for // workaround, we don't support multi-target due to binary model serialization for
// base margin. // base margin.

View File

@ -1,52 +1,64 @@
/** /**
* Copyright 2017-2023 by XGBoost Contributors * Copyright 2017-2023 by XGBoost Contributors
*/ */
#include <dmlc/any.h> #include <algorithm> // for max, fill, min
#include <dmlc/omp.h> #include <any> // for any, any_cast
#include <cassert> // for assert
#include <cstddef> // for size_t
#include <cstdint> // for uint32_t, int32_t, uint64_t
#include <memory> // for unique_ptr, shared_ptr
#include <ostream> // for char_traits, operator<<, basic_ostream
#include <typeinfo> // for type_info
#include <vector> // for vector
#include <cstddef> #include "../collective/communicator-inl.h" // for Allreduce, IsDistributed
#include <limits> #include "../collective/communicator.h" // for Operation
#include <mutex> #include "../common/bitfield.h" // for RBitField8
#include "../common/categorical.h" // for IsCat, Decision
#include "../common/common.h" // for DivRoundUp
#include "../common/math.h" // for CheckNAN
#include "../common/threading_utils.h" // for ParallelFor
#include "../data/adapter.h" // for ArrayAdapter, CSRAdapter, CSRArrayAdapter
#include "../data/gradient_index.h" // for GHistIndexMatrix
#include "../data/proxy_dmatrix.h" // for DMatrixProxy
#include "../gbm/gbtree_model.h" // for GBTreeModel, GBTreeModelParam
#include "cpu_treeshap.h" // for CalculateContributions
#include "dmlc/registry.h" // for DMLC_REGISTRY_FILE_TAG
#include "predict_fn.h" // for GetNextNode, GetNextNodeMulti
#include "xgboost/base.h" // for bst_float, bst_node_t, bst_omp_uint, bst_fe...
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for Entry, DMatrix, MetaInfo, SparsePage, Batch...
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/learner.h" // for LearnerModelParam
#include "xgboost/linalg.h" // for TensorView, All, VectorView, Tensor
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_NE
#include "xgboost/multi_target_tree_model.h" // for MultiTargetTree
#include "xgboost/predictor.h" // for PredictionCacheEntry, Predictor, PredictorReg
#include "xgboost/span.h" // for Span
#include "xgboost/tree_model.h" // for RegTree, MTNotImplemented, RTreeNodeStat
#include "../collective/communicator-inl.h" namespace xgboost::predictor {
#include "../common/categorical.h"
#include "../common/math.h"
#include "../common/threading_utils.h"
#include "../data/adapter.h"
#include "../data/gradient_index.h"
#include "../gbm/gbtree_model.h"
#include "cpu_treeshap.h" // CalculateContributions
#include "predict_fn.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/logging.h"
#include "xgboost/predictor.h"
#include "xgboost/tree_model.h"
namespace xgboost {
namespace predictor {
DMLC_REGISTRY_FILE_TAG(cpu_predictor); DMLC_REGISTRY_FILE_TAG(cpu_predictor);
namespace scalar {
template <bool has_missing, bool has_categorical> template <bool has_missing, bool has_categorical>
bst_node_t GetLeafIndex(RegTree const &tree, const RegTree::FVec &feat, bst_node_t GetLeafIndex(RegTree const &tree, const RegTree::FVec &feat,
RegTree::CategoricalSplitMatrix const &cats) { RegTree::CategoricalSplitMatrix const &cats) {
bst_node_t nid = 0; bst_node_t nidx{0};
while (!tree[nid].IsLeaf()) { while (!tree[nidx].IsLeaf()) {
unsigned split_index = tree[nid].SplitIndex(); bst_feature_t split_index = tree[nidx].SplitIndex();
auto fvalue = feat.GetFvalue(split_index); auto fvalue = feat.GetFvalue(split_index);
nid = GetNextNode<has_missing, has_categorical>( nidx = GetNextNode<has_missing, has_categorical>(
tree[nid], nid, fvalue, has_missing && feat.IsMissing(split_index), cats); tree[nidx], nidx, fvalue, has_missing && feat.IsMissing(split_index), cats);
} }
return nid; return nidx;
} }
bst_float PredValue(const SparsePage::Inst &inst, bst_float PredValue(const SparsePage::Inst &inst,
const std::vector<std::unique_ptr<RegTree>> &trees, const std::vector<std::unique_ptr<RegTree>> &trees,
const std::vector<int> &tree_info, int bst_group, const std::vector<int> &tree_info, std::int32_t bst_group,
RegTree::FVec *p_feats, unsigned tree_begin, RegTree::FVec *p_feats, std::uint32_t tree_begin, std::uint32_t tree_end) {
unsigned tree_end) {
bst_float psum = 0.0f; bst_float psum = 0.0f;
p_feats->Fill(inst); p_feats->Fill(inst);
for (size_t i = tree_begin; i < tree_end; ++i) { for (size_t i = tree_begin; i < tree_end; ++i) {
@ -68,36 +80,80 @@ bst_float PredValue(const SparsePage::Inst &inst,
} }
template <bool has_categorical> template <bool has_categorical>
bst_float bst_float PredValueByOneTree(const RegTree::FVec &p_feats, RegTree const &tree,
PredValueByOneTree(const RegTree::FVec &p_feats, RegTree const &tree,
RegTree::CategoricalSplitMatrix const &cats) { RegTree::CategoricalSplitMatrix const &cats) {
const bst_node_t leaf = p_feats.HasMissing() ? const bst_node_t leaf = p_feats.HasMissing()
GetLeafIndex<true, has_categorical>(tree, p_feats, cats) : ? GetLeafIndex<true, has_categorical>(tree, p_feats, cats)
GetLeafIndex<false, has_categorical>(tree, p_feats, cats); : GetLeafIndex<false, has_categorical>(tree, p_feats, cats);
return tree[leaf].LeafValue(); return tree[leaf].LeafValue();
} }
} // namespace scalar
void PredictByAllTrees(gbm::GBTreeModel const &model, const size_t tree_begin, namespace multi {
const size_t tree_end, std::vector<bst_float> *out_preds, template <bool has_missing, bool has_categorical>
const size_t predict_offset, const size_t num_group, bst_node_t GetLeafIndex(MultiTargetTree const &tree, const RegTree::FVec &feat,
const std::vector<RegTree::FVec> &thread_temp, RegTree::CategoricalSplitMatrix const &cats) {
const size_t offset, const size_t block_size) { bst_node_t nidx{0};
std::vector<bst_float> &preds = *out_preds; while (!tree.IsLeaf(nidx)) {
for (size_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) { unsigned split_index = tree.SplitIndex(nidx);
const size_t gid = model.tree_info[tree_id]; auto fvalue = feat.GetFvalue(split_index);
auto const &tree = *model.trees[tree_id]; nidx = GetNextNodeMulti<has_missing, has_categorical>(
tree, nidx, fvalue, has_missing && feat.IsMissing(split_index), cats);
}
return nidx;
}
template <bool has_categorical>
void PredValueByOneTree(RegTree::FVec const &p_feats, MultiTargetTree const &tree,
RegTree::CategoricalSplitMatrix const &cats,
linalg::VectorView<float> out_predt) {
bst_node_t const leaf = p_feats.HasMissing()
? GetLeafIndex<true, has_categorical>(tree, p_feats, cats)
: GetLeafIndex<false, has_categorical>(tree, p_feats, cats);
auto leaf_value = tree.LeafValue(leaf);
assert(out_predt.Shape(0) == leaf_value.Shape(0) && "shape mismatch.");
for (size_t i = 0; i < leaf_value.Size(); ++i) {
out_predt(i) += leaf_value(i);
}
}
} // namespace multi
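
In the multi-target path above, a leaf stores a vector of values (one per target) and prediction accumulates that whole vector into the sample's output row, instead of adding a scalar into a single group column. A small sketch of that accumulation, with made-up containers standing in for MultiTargetTree and linalg::VectorView:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::size_t const n_targets = 3;
  // Leaf values for two hypothetical trees; each leaf carries one value per target.
  std::vector<std::vector<double>> leaf_values{{0.1, -0.2, 0.3}, {0.05, 0.0, -0.1}};
  std::vector<double> out_row(n_targets, 0.5);  // base score for one sample

  for (auto const& leaf : leaf_values) {  // one reached leaf per tree
    for (std::size_t t = 0; t < n_targets; ++t) {
      out_row[t] += leaf[t];              // multi::PredValueByOneTree does this add
    }
  }
  for (double v : out_row) std::cout << v << " ";  // 0.65 0.3 0.7
  std::cout << "\n";
}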
namespace {
void PredictByAllTrees(gbm::GBTreeModel const &model, std::uint32_t const tree_begin,
std::uint32_t const tree_end, std::size_t const predict_offset,
std::vector<RegTree::FVec> const &thread_temp, std::size_t const offset,
std::size_t const block_size, linalg::MatrixView<float> out_predt) {
for (std::uint32_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) {
auto const &tree = *model.trees.at(tree_id);
auto const &cats = tree.GetCategoriesMatrix(); auto const &cats = tree.GetCategoriesMatrix();
auto has_categorical = tree.HasCategoricalSplit(); bool has_categorical = tree.HasCategoricalSplit();
if (tree.IsMultiTarget()) {
if (has_categorical) { if (has_categorical) {
for (size_t i = 0; i < block_size; ++i) { for (std::size_t i = 0; i < block_size; ++i) {
preds[(predict_offset + i) * num_group + gid] += auto t_predts = out_predt.Slice(predict_offset + i, linalg::All());
PredValueByOneTree<true>(thread_temp[offset + i], tree, cats); multi::PredValueByOneTree<true>(thread_temp[offset + i], *tree.GetMultiTargetTree(), cats,
t_predts);
} }
} else { } else {
for (size_t i = 0; i < block_size; ++i) { for (std::size_t i = 0; i < block_size; ++i) {
preds[(predict_offset + i) * num_group + gid] += auto t_predts = out_predt.Slice(predict_offset + i, linalg::All());
PredValueByOneTree<false>(thread_temp[offset + i], tree, cats); multi::PredValueByOneTree<false>(thread_temp[offset + i], *tree.GetMultiTargetTree(),
cats, t_predts);
}
}
} else {
auto const gid = model.tree_info[tree_id];
if (has_categorical) {
for (std::size_t i = 0; i < block_size; ++i) {
out_predt(predict_offset + i, gid) +=
scalar::PredValueByOneTree<true>(thread_temp[offset + i], tree, cats);
}
} else {
for (std::size_t i = 0; i < block_size; ++i) {
out_predt(predict_offset + i, gid) +=
scalar::PredValueByOneTree<true>(thread_temp[offset + i], tree, cats);
}
} }
} }
} }
@ -126,9 +182,7 @@ void FVecDrop(const size_t block_size, const size_t batch_offset, DataView* batc
} }
} }
namespace { static std::size_t constexpr kUnroll = 8;
static size_t constexpr kUnroll = 8;
} // anonymous namespace
struct SparsePageView { struct SparsePageView {
bst_row_t base_rowid; bst_row_t base_rowid;
@ -227,15 +281,13 @@ class AdapterView {
}; };
template <typename DataView, size_t block_of_rows_size> template <typename DataView, size_t block_of_rows_size>
void PredictBatchByBlockOfRowsKernel( void PredictBatchByBlockOfRowsKernel(DataView batch, gbm::GBTreeModel const &model,
DataView batch, std::vector<bst_float> *out_preds, std::uint32_t tree_begin, std::uint32_t tree_end,
gbm::GBTreeModel const &model, int32_t tree_begin, int32_t tree_end, std::vector<RegTree::FVec> *p_thread_temp, int32_t n_threads,
std::vector<RegTree::FVec> *p_thread_temp, int32_t n_threads) { linalg::TensorView<float, 2> out_predt) {
auto &thread_temp = *p_thread_temp; auto &thread_temp = *p_thread_temp;
int32_t const num_group = model.learner_model_param->num_output_group;
CHECK_EQ(model.param.size_leaf_vector, 0) CHECK_EQ(model.param.size_leaf_vector, 0) << "size_leaf_vector is enforced to 0 so far";
<< "size_leaf_vector is enforced to 0 so far";
// parallel over local batch // parallel over local batch
const auto nsize = static_cast<bst_omp_uint>(batch.Size()); const auto nsize = static_cast<bst_omp_uint>(batch.Size());
const int num_feature = model.learner_model_param->num_feature; const int num_feature = model.learner_model_param->num_feature;
@ -243,16 +295,13 @@ void PredictBatchByBlockOfRowsKernel(
common::ParallelFor(n_blocks, n_threads, [&](bst_omp_uint block_id) { common::ParallelFor(n_blocks, n_threads, [&](bst_omp_uint block_id) {
const size_t batch_offset = block_id * block_of_rows_size; const size_t batch_offset = block_id * block_of_rows_size;
const size_t block_size = const size_t block_size = std::min(nsize - batch_offset, block_of_rows_size);
std::min(nsize - batch_offset, block_of_rows_size);
const size_t fvec_offset = omp_get_thread_num() * block_of_rows_size; const size_t fvec_offset = omp_get_thread_num() * block_of_rows_size;
FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, p_thread_temp);
p_thread_temp);
// process block of rows through all trees to keep cache locality // process block of rows through all trees to keep cache locality
PredictByAllTrees(model, tree_begin, tree_end, out_preds, PredictByAllTrees(model, tree_begin, tree_end, batch_offset + batch.base_rowid, thread_temp,
batch_offset + batch.base_rowid, num_group, thread_temp, fvec_offset, block_size, out_predt);
fvec_offset, block_size);
FVecDrop(block_size, batch_offset, &batch, fvec_offset, p_thread_temp); FVecDrop(block_size, batch_offset, &batch, fvec_offset, p_thread_temp);
}); });
} }
@ -275,7 +324,7 @@ float FillNodeMeanValues(RegTree const *tree, bst_node_t nidx, std::vector<float
} }
void FillNodeMeanValues(RegTree const* tree, std::vector<float>* mean_values) { void FillNodeMeanValues(RegTree const* tree, std::vector<float>* mean_values) {
size_t num_nodes = tree->param.num_nodes; size_t num_nodes = tree->NumNodes();
if (mean_values->size() == num_nodes) { if (mean_values->size() == num_nodes) {
return; return;
} }
@ -283,7 +332,6 @@ void FillNodeMeanValues(RegTree const* tree, std::vector<float>* mean_values) {
FillNodeMeanValues(tree, 0, mean_values); FillNodeMeanValues(tree, 0, mean_values);
} }
namespace {
// init thread buffers // init thread buffers
static void InitThreadTemp(int nthread, std::vector<RegTree::FVec> *out) { static void InitThreadTemp(int nthread, std::vector<RegTree::FVec> *out) {
int prev_thread_temp_size = out->size(); int prev_thread_temp_size = out->size();
@ -557,33 +605,6 @@ class ColumnSplitHelper {
class CPUPredictor : public Predictor { class CPUPredictor : public Predictor {
protected: protected:
void PredictGHistIndex(DMatrix *p_fmat, gbm::GBTreeModel const &model, int32_t tree_begin,
int32_t tree_end, std::vector<bst_float> *out_preds) const {
auto const n_threads = this->ctx_->Threads();
constexpr double kDensityThresh = .5;
size_t total =
std::max(p_fmat->Info().num_row_ * p_fmat->Info().num_col_, static_cast<uint64_t>(1));
double density = static_cast<double>(p_fmat->Info().num_nonzero_) / static_cast<double>(total);
bool blocked = density > kDensityThresh;
std::vector<RegTree::FVec> feat_vecs;
InitThreadTemp(n_threads * (blocked ? kBlockOfRowsSize : 1), &feat_vecs);
std::vector<Entry> workspace(p_fmat->Info().num_col_ * kUnroll * n_threads);
auto ft = p_fmat->Info().feature_types.ConstHostVector();
for (auto const &batch : p_fmat->GetBatches<GHistIndexMatrix>({})) {
if (blocked) {
PredictBatchByBlockOfRowsKernel<GHistIndexMatrixView, kBlockOfRowsSize>(
GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads},
out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads);
} else {
PredictBatchByBlockOfRowsKernel<GHistIndexMatrixView, 1>(
GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads},
out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads);
}
}
}
void PredictDMatrix(DMatrix *p_fmat, std::vector<bst_float> *out_preds, void PredictDMatrix(DMatrix *p_fmat, std::vector<bst_float> *out_preds,
gbm::GBTreeModel const &model, int32_t tree_begin, int32_t tree_end) const { gbm::GBTreeModel const &model, int32_t tree_begin, int32_t tree_end) const {
if (p_fmat->IsColumnSplit()) { if (p_fmat->IsColumnSplit()) {
@ -592,11 +613,6 @@ class CPUPredictor : public Predictor {
return; return;
} }
if (!p_fmat->PageExists<SparsePage>()) {
this->PredictGHistIndex(p_fmat, model, tree_begin, tree_end, out_preds);
return;
}
auto const n_threads = this->ctx_->Threads(); auto const n_threads = this->ctx_->Threads();
constexpr double kDensityThresh = .5; constexpr double kDensityThresh = .5;
size_t total = size_t total =
@ -606,16 +622,38 @@ class CPUPredictor : public Predictor {
std::vector<RegTree::FVec> feat_vecs; std::vector<RegTree::FVec> feat_vecs;
InitThreadTemp(n_threads * (blocked ? kBlockOfRowsSize : 1), &feat_vecs); InitThreadTemp(n_threads * (blocked ? kBlockOfRowsSize : 1), &feat_vecs);
std::size_t n_samples = p_fmat->Info().num_row_;
std::size_t n_groups = model.learner_model_param->OutputLength();
CHECK_EQ(out_preds->size(), n_samples * n_groups);
linalg::TensorView<float, 2> out_predt{*out_preds, {n_samples, n_groups}, ctx_->gpu_id};
if (!p_fmat->PageExists<SparsePage>()) {
std::vector<Entry> workspace(p_fmat->Info().num_col_ * kUnroll * n_threads);
auto ft = p_fmat->Info().feature_types.ConstHostVector();
for (auto const &batch : p_fmat->GetBatches<GHistIndexMatrix>({})) {
if (blocked) {
PredictBatchByBlockOfRowsKernel<GHistIndexMatrixView, kBlockOfRowsSize>(
GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads}, model,
tree_begin, tree_end, &feat_vecs, n_threads, out_predt);
} else {
PredictBatchByBlockOfRowsKernel<GHistIndexMatrixView, 1>(
GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads}, model,
tree_begin, tree_end, &feat_vecs, n_threads, out_predt);
}
}
} else {
for (auto const &batch : p_fmat->GetBatches<SparsePage>()) { for (auto const &batch : p_fmat->GetBatches<SparsePage>()) {
CHECK_EQ(out_preds->size(),
p_fmat->Info().num_row_ * model.learner_model_param->num_output_group);
if (blocked) { if (blocked) {
PredictBatchByBlockOfRowsKernel<SparsePageView, kBlockOfRowsSize>( PredictBatchByBlockOfRowsKernel<SparsePageView, kBlockOfRowsSize>(
SparsePageView{&batch}, out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads); SparsePageView{&batch}, model, tree_begin, tree_end, &feat_vecs, n_threads,
out_predt);
} else { } else {
PredictBatchByBlockOfRowsKernel<SparsePageView, 1>( PredictBatchByBlockOfRowsKernel<SparsePageView, 1>(SparsePageView{&batch}, model,
SparsePageView{&batch}, out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads); tree_begin, tree_end, &feat_vecs,
n_threads, out_predt);
}
} }
} }
} }
@ -623,26 +661,24 @@ class CPUPredictor : public Predictor {
public: public:
explicit CPUPredictor(Context const *ctx) : Predictor::Predictor{ctx} {} explicit CPUPredictor(Context const *ctx) : Predictor::Predictor{ctx} {}
void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts, void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts, const gbm::GBTreeModel &model,
const gbm::GBTreeModel &model, uint32_t tree_begin, uint32_t tree_begin, uint32_t tree_end = 0) const override {
uint32_t tree_end = 0) const override {
auto *out_preds = &predts->predictions; auto *out_preds = &predts->predictions;
// This is actually already handled in gbm, but a large number of tests rely on the // This is actually already handled in gbm, but a large number of tests rely on the
// behaviour. // behaviour.
if (tree_end == 0) { if (tree_end == 0) {
tree_end = model.trees.size(); tree_end = model.trees.size();
} }
this->PredictDMatrix(dmat, &out_preds->HostVector(), model, tree_begin, this->PredictDMatrix(dmat, &out_preds->HostVector(), model, tree_begin, tree_end);
tree_end);
} }
template <typename Adapter, size_t kBlockSize> template <typename Adapter, size_t kBlockSize>
void DispatchedInplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m, void DispatchedInplacePredict(std::any const &x, std::shared_ptr<DMatrix> p_m,
const gbm::GBTreeModel &model, float missing, const gbm::GBTreeModel &model, float missing,
PredictionCacheEntry *out_preds, PredictionCacheEntry *out_preds, uint32_t tree_begin,
uint32_t tree_begin, uint32_t tree_end) const { uint32_t tree_end) const {
auto const n_threads = this->ctx_->Threads(); auto const n_threads = this->ctx_->Threads();
auto m = dmlc::get<std::shared_ptr<Adapter>>(x); auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature) CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
<< "Number of columns in data must equal to trained model."; << "Number of columns in data must equal to trained model.";
if (p_m) { if (p_m) {
@ -653,13 +689,16 @@ class CPUPredictor : public Predictor {
info.num_row_ = m->NumRows(); info.num_row_ = m->NumRows();
this->InitOutPredictions(info, &(out_preds->predictions), model); this->InitOutPredictions(info, &(out_preds->predictions), model);
} }
std::vector<Entry> workspace(m->NumColumns() * kUnroll * n_threads); std::vector<Entry> workspace(m->NumColumns() * kUnroll * n_threads);
auto &predictions = out_preds->predictions.HostVector(); auto &predictions = out_preds->predictions.HostVector();
std::vector<RegTree::FVec> thread_temp; std::vector<RegTree::FVec> thread_temp;
InitThreadTemp(n_threads * kBlockSize, &thread_temp); InitThreadTemp(n_threads * kBlockSize, &thread_temp);
std::size_t n_groups = model.learner_model_param->OutputLength();
linalg::TensorView<float, 2> out_predt{predictions, {m->NumRows(), n_groups}, Context::kCpuId};
PredictBatchByBlockOfRowsKernel<AdapterView<Adapter>, kBlockSize>( PredictBatchByBlockOfRowsKernel<AdapterView<Adapter>, kBlockSize>(
AdapterView<Adapter>(m.get(), missing, common::Span<Entry>{workspace}, n_threads), AdapterView<Adapter>(m.get(), missing, common::Span<Entry>{workspace}, n_threads), model,
&predictions, model, tree_begin, tree_end, &thread_temp, n_threads); tree_begin, tree_end, &thread_temp, n_threads, out_predt);
} }
bool InplacePredict(std::shared_ptr<DMatrix> p_m, const gbm::GBTreeModel &model, float missing, bool InplacePredict(std::shared_ptr<DMatrix> p_m, const gbm::GBTreeModel &model, float missing,
@ -689,6 +728,7 @@ class CPUPredictor : public Predictor {
void PredictInstance(const SparsePage::Inst& inst, void PredictInstance(const SparsePage::Inst& inst,
std::vector<bst_float>* out_preds, std::vector<bst_float>* out_preds,
const gbm::GBTreeModel& model, unsigned ntree_limit) const override { const gbm::GBTreeModel& model, unsigned ntree_limit) const override {
CHECK(!model.learner_model_param->IsVectorLeaf()) << "predict instance" << MTNotImplemented();
std::vector<RegTree::FVec> feat_vecs; std::vector<RegTree::FVec> feat_vecs;
feat_vecs.resize(1, RegTree::FVec()); feat_vecs.resize(1, RegTree::FVec());
feat_vecs[0].Init(model.learner_model_param->num_feature); feat_vecs[0].Init(model.learner_model_param->num_feature);
@ -701,8 +741,8 @@ class CPUPredictor : public Predictor {
auto base_score = model.learner_model_param->BaseScore(ctx_)(0); auto base_score = model.learner_model_param->BaseScore(ctx_)(0);
// loop over output groups // loop over output groups
for (uint32_t gid = 0; gid < model.learner_model_param->num_output_group; ++gid) { for (uint32_t gid = 0; gid < model.learner_model_param->num_output_group; ++gid) {
(*out_preds)[gid] = (*out_preds)[gid] = scalar::PredValue(inst, model.trees, model.tree_info, gid, &feat_vecs[0],
PredValue(inst, model.trees, model.tree_info, gid, &feat_vecs[0], 0, ntree_limit) + 0, ntree_limit) +
base_score; base_score;
} }
} }
@ -724,8 +764,7 @@ class CPUPredictor : public Predictor {
for (const auto &batch : p_fmat->GetBatches<SparsePage>()) { for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
// parallel over local batch // parallel over local batch
auto page = batch.GetView(); auto page = batch.GetView();
const auto nsize = static_cast<bst_omp_uint>(batch.Size()); common::ParallelFor(page.Size(), n_threads, [&](auto i) {
common::ParallelFor(nsize, n_threads, [&](bst_omp_uint i) {
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
auto ridx = static_cast<size_t>(batch.base_rowid + i); auto ridx = static_cast<size_t>(batch.base_rowid + i);
RegTree::FVec &feats = feat_vecs[tid]; RegTree::FVec &feats = feat_vecs[tid];
@ -733,23 +772,28 @@ class CPUPredictor : public Predictor {
feats.Init(num_feature); feats.Init(num_feature);
} }
feats.Fill(page[i]); feats.Fill(page[i]);
for (unsigned j = 0; j < ntree_limit; ++j) { for (std::uint32_t j = 0; j < ntree_limit; ++j) {
auto const &tree = *model.trees[j]; auto const &tree = *model.trees[j];
auto const &cats = tree.GetCategoriesMatrix(); auto const &cats = tree.GetCategoriesMatrix();
bst_node_t tid = GetLeafIndex<true, true>(tree, feats, cats); bst_node_t nidx;
preds[ridx * ntree_limit + j] = static_cast<bst_float>(tid); if (tree.IsMultiTarget()) {
nidx = multi::GetLeafIndex<true, true>(*tree.GetMultiTargetTree(), feats, cats);
} else {
nidx = scalar::GetLeafIndex<true, true>(tree, feats, cats);
}
preds[ridx * ntree_limit + j] = static_cast<bst_float>(nidx);
} }
feats.Drop(page[i]); feats.Drop(page[i]);
}); });
} }
} }
void PredictContribution(DMatrix *p_fmat, void PredictContribution(DMatrix *p_fmat, HostDeviceVector<float> *out_contribs,
HostDeviceVector<float> *out_contribs,
const gbm::GBTreeModel &model, uint32_t ntree_limit, const gbm::GBTreeModel &model, uint32_t ntree_limit,
std::vector<bst_float> const *tree_weights, std::vector<bst_float> const *tree_weights, bool approximate,
bool approximate, int condition, int condition, unsigned condition_feature) const override {
unsigned condition_feature) const override { CHECK(!model.learner_model_param->IsVectorLeaf())
<< "Predict contribution" << MTNotImplemented();
auto const n_threads = this->ctx_->Threads(); auto const n_threads = this->ctx_->Threads();
const int num_feature = model.learner_model_param->num_feature; const int num_feature = model.learner_model_param->num_feature;
std::vector<RegTree::FVec> feat_vecs; std::vector<RegTree::FVec> feat_vecs;
@ -825,11 +869,12 @@ class CPUPredictor : public Predictor {
} }
} }
void PredictInteractionContributions( void PredictInteractionContributions(DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
const gbm::GBTreeModel &model, unsigned ntree_limit, const gbm::GBTreeModel &model, unsigned ntree_limit,
std::vector<bst_float> const *tree_weights, std::vector<bst_float> const *tree_weights,
bool approximate) const override { bool approximate) const override {
CHECK(!model.learner_model_param->IsVectorLeaf())
<< "Predict interaction contribution" << MTNotImplemented();
const MetaInfo& info = p_fmat->Info(); const MetaInfo& info = p_fmat->Info();
const int ngroup = model.learner_model_param->num_output_group; const int ngroup = model.learner_model_param->num_output_group;
size_t const ncolumns = model.learner_model_param->num_feature; size_t const ncolumns = model.learner_model_param->num_feature;
@ -884,5 +929,4 @@ class CPUPredictor : public Predictor {
XGBOOST_REGISTER_PREDICTOR(CPUPredictor, "cpu_predictor") XGBOOST_REGISTER_PREDICTOR(CPUPredictor, "cpu_predictor")
.describe("Make predictions using CPU.") .describe("Make predictions using CPU.")
.set_body([](Context const *ctx) { return new CPUPredictor(ctx); }); .set_body([](Context const *ctx) { return new CPUPredictor(ctx); });
} // namespace predictor } // namespace xgboost::predictor
} // namespace xgboost

View File

@ -9,6 +9,7 @@
#include <thrust/fill.h> #include <thrust/fill.h>
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include <any> // for any, any_cast
#include <memory> #include <memory>
#include "../common/bitfield.h" #include "../common/bitfield.h"
@ -431,7 +432,7 @@ class DeviceModel {
this->tree_beg_ = tree_begin; this->tree_beg_ = tree_begin;
this->tree_end_ = tree_end; this->tree_end_ = tree_end;
this->num_group = model.learner_model_param->num_output_group; this->num_group = model.learner_model_param->OutputLength();
} }
}; };
@ -792,13 +793,13 @@ class GPUPredictor : public xgboost::Predictor {
} }
template <typename Adapter, typename Loader> template <typename Adapter, typename Loader>
void DispatchedInplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m, void DispatchedInplacePredict(std::any const& x, std::shared_ptr<DMatrix> p_m,
const gbm::GBTreeModel& model, float missing, const gbm::GBTreeModel& model, float missing,
PredictionCacheEntry *out_preds, PredictionCacheEntry* out_preds, uint32_t tree_begin,
uint32_t tree_begin, uint32_t tree_end) const { uint32_t tree_end) const {
uint32_t const output_groups = model.learner_model_param->num_output_group; uint32_t const output_groups = model.learner_model_param->num_output_group;
auto m = dmlc::get<std::shared_ptr<Adapter>>(x); auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature) CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
<< "Number of columns in data must equal to trained model."; << "Number of columns in data must equal to trained model.";
CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx()) CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx())

View File

@ -1,13 +1,12 @@
/*! /**
* Copyright 2021 by XGBoost Contributors * Copyright 2021-2023 by XGBoost Contributors
*/ */
#ifndef XGBOOST_PREDICTOR_PREDICT_FN_H_ #ifndef XGBOOST_PREDICTOR_PREDICT_FN_H_
#define XGBOOST_PREDICTOR_PREDICT_FN_H_ #define XGBOOST_PREDICTOR_PREDICT_FN_H_
#include "../common/categorical.h" #include "../common/categorical.h"
#include "xgboost/tree_model.h" #include "xgboost/tree_model.h"
namespace xgboost { namespace xgboost::predictor {
namespace predictor {
template <bool has_missing, bool has_categorical> template <bool has_missing, bool has_categorical>
inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bst_node_t nid, inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bst_node_t nid,
float fvalue, bool is_missing, float fvalue, bool is_missing,
@ -24,6 +23,25 @@ inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bs
} }
} }
} }
} // namespace predictor
} // namespace xgboost template <bool has_missing, bool has_categorical>
inline XGBOOST_DEVICE bst_node_t GetNextNodeMulti(MultiTargetTree const &tree,
bst_node_t const nidx, float fvalue,
bool is_missing,
RegTree::CategoricalSplitMatrix const &cats) {
if (has_missing && is_missing) {
return tree.DefaultChild(nidx);
} else {
if (has_categorical && common::IsCat(cats.split_type, nidx)) {
auto node_categories =
cats.categories.subspan(cats.node_ptr[nidx].beg, cats.node_ptr[nidx].size);
return common::Decision(node_categories, fvalue) ? tree.LeftChild(nidx)
: tree.RightChild(nidx);
} else {
return tree.LeftChild(nidx) + !(fvalue < tree.SplitCond(nidx));
}
}
}
} // namespace xgboost::predictor
#endif // XGBOOST_PREDICTOR_PREDICT_FN_H_ #endif // XGBOOST_PREDICTOR_PREDICT_FN_H_
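
The traversal step in GetNextNodeMulti reduces to: missing value -> default child; categorical split -> membership test on the node's category set; otherwise left child index plus !(fvalue < split_cond), which picks the right child when the value is greater than or equal to the split condition (children are assumed to be stored as adjacent node ids, left then right). A tiny numeric-split sketch:

#include <iostream>

// Hypothetical flat node layout: left child at `left`, right child at `left + 1`.
struct Node {
  int left;
  float split_cond;
  int default_child;
};

int NextNode(Node const& n, float fvalue, bool is_missing) {
  if (is_missing) return n.default_child;
  return n.left + !(fvalue < n.split_cond);  // false -> left, true -> right
}

int main() {
  Node n{/*left=*/1, /*split_cond=*/0.5f, /*default_child=*/1};
  std::cout << NextNode(n, 0.3f, false) << " "   // 1 (left)
            << NextNode(n, 0.7f, false) << " "   // 2 (right)
            << NextNode(n, 0.0f, true) << "\n";  // 1 (default)
}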

View File

@ -1,22 +1,26 @@
/*! /**
* Copyright 2021-2022 XGBoost contributors * Copyright 2021-2023 XGBoost contributors
* \file common_row_partitioner.h * \file common_row_partitioner.h
* \brief Common partitioner logic for hist and approx methods. * \brief Common partitioner logic for hist and approx methods.
*/ */
#ifndef XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_ #ifndef XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_
#define XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_ #define XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_
#include <algorithm> // std::all_of
#include <cinttypes> // std::uint32_t
#include <limits> // std::numeric_limits #include <limits> // std::numeric_limits
#include <vector> #include <vector>
#include "../collective/communicator-inl.h" #include "../collective/communicator-inl.h"
#include "../common/linalg_op.h" // cbegin
#include "../common/numeric.h" // Iota #include "../common/numeric.h" // Iota
#include "../common/partition_builder.h" #include "../common/partition_builder.h"
#include "hist/expand_entry.h" // CPUExpandEntry #include "hist/expand_entry.h" // CPUExpandEntry
#include "xgboost/base.h"
#include "xgboost/context.h" // Context #include "xgboost/context.h" // Context
#include "xgboost/linalg.h" // TensorView
namespace xgboost { namespace xgboost::tree {
namespace tree {
static constexpr size_t kPartitionBlockSize = 2048; static constexpr size_t kPartitionBlockSize = 2048;
@ -34,9 +38,10 @@ class ColumnSplitHelper {
missing_bits_ = BitVector(common::Span<BitVector::value_type>(missing_storage_)); missing_bits_ = BitVector(common::Span<BitVector::value_type>(missing_storage_));
} }
template <typename ExpandEntry>
void Partition(common::BlockedSpace2d const& space, std::int32_t n_threads, void Partition(common::BlockedSpace2d const& space, std::int32_t n_threads,
GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix, GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) { std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
// When data is split by column, we don't have all the feature values in the local worker, so // When data is split by column, we don't have all the feature values in the local worker, so
// we first collect all the decisions and whether the feature is missing into bit vectors. // we first collect all the decisions and whether the feature is missing into bit vectors.
std::fill(decision_storage_.begin(), decision_storage_.end(), 0); std::fill(decision_storage_.begin(), decision_storage_.end(), 0);
@ -97,41 +102,47 @@ class CommonRowPartitioner {
} }
} }
void FindSplitConditions(const std::vector<CPUExpandEntry>& nodes, const RegTree& tree, template <typename ExpandEntry>
void FindSplitConditions(const std::vector<ExpandEntry>& nodes, const RegTree& tree,
const GHistIndexMatrix& gmat, std::vector<int32_t>* split_conditions) { const GHistIndexMatrix& gmat, std::vector<int32_t>* split_conditions) {
for (size_t i = 0; i < nodes.size(); ++i) { auto const& ptrs = gmat.cut.Ptrs();
const int32_t nid = nodes[i].nid; auto const& vals = gmat.cut.Values();
const bst_uint fid = tree[nid].SplitIndex();
const bst_float split_pt = tree[nid].SplitCond(); for (std::size_t i = 0; i < nodes.size(); ++i) {
const uint32_t lower_bound = gmat.cut.Ptrs()[fid]; bst_node_t const nidx = nodes[i].nid;
const uint32_t upper_bound = gmat.cut.Ptrs()[fid + 1]; bst_feature_t const fidx = tree.SplitIndex(nidx);
float const split_pt = tree.SplitCond(nidx);
std::uint32_t const lower_bound = ptrs[fidx];
std::uint32_t const upper_bound = ptrs[fidx + 1];
bst_bin_t split_cond = -1; bst_bin_t split_cond = -1;
// convert floating-point split_pt into corresponding bin_id // convert floating-point split_pt into corresponding bin_id
// split_cond = -1 indicates that split_pt is less than all known cut points // split_cond = -1 indicates that split_pt is less than all known cut points
CHECK_LT(upper_bound, static_cast<uint32_t>(std::numeric_limits<int32_t>::max())); CHECK_LT(upper_bound, static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
for (auto bound = lower_bound; bound < upper_bound; ++bound) { for (auto bound = lower_bound; bound < upper_bound; ++bound) {
if (split_pt == gmat.cut.Values()[bound]) { if (split_pt == vals[bound]) {
split_cond = static_cast<int32_t>(bound); split_cond = static_cast<bst_bin_t>(bound);
} }
} }
(*split_conditions).at(i) = split_cond; (*split_conditions)[i] = split_cond;
} }
} }
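
FindSplitConditions maps each node's floating-point split value back to a histogram bin id: only the cut values belonging to the split feature, i.e. the range [ptrs[fidx], ptrs[fidx + 1]) of gmat.cut.Values(), are scanned, and -1 means the split point lies below every known cut. A stand-alone sketch of the lookup:

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical cut structure: per-feature offsets into a flat array of cut values.
std::int32_t SplitToBin(std::vector<std::uint32_t> const& ptrs, std::vector<float> const& vals,
                        std::uint32_t fidx, float split_pt) {
  std::int32_t split_cond = -1;  // split_pt below all known cuts for this feature
  for (std::uint32_t bound = ptrs[fidx]; bound < ptrs[fidx + 1]; ++bound) {
    if (split_pt == vals[bound]) {
      split_cond = static_cast<std::int32_t>(bound);
    }
  }
  return split_cond;
}

int main() {
  std::vector<std::uint32_t> ptrs{0, 3, 5};             // feature 0 -> bins [0, 3), feature 1 -> [3, 5)
  std::vector<float> vals{0.1f, 0.5f, 0.9f, 2.f, 4.f};  // cut points
  std::cout << SplitToBin(ptrs, vals, 0, 0.5f) << " "   // 1
            << SplitToBin(ptrs, vals, 1, 4.f) << "\n";  // 4
}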
void AddSplitsToRowSet(const std::vector<CPUExpandEntry>& nodes, RegTree const* p_tree) { template <typename ExpandEntry>
void AddSplitsToRowSet(const std::vector<ExpandEntry>& nodes, RegTree const* p_tree) {
const size_t n_nodes = nodes.size(); const size_t n_nodes = nodes.size();
for (unsigned int i = 0; i < n_nodes; ++i) { for (unsigned int i = 0; i < n_nodes; ++i) {
const int32_t nid = nodes[i].nid; const int32_t nidx = nodes[i].nid;
const size_t n_left = partition_builder_.GetNLeftElems(i); const size_t n_left = partition_builder_.GetNLeftElems(i);
const size_t n_right = partition_builder_.GetNRightElems(i); const size_t n_right = partition_builder_.GetNRightElems(i);
CHECK_EQ((*p_tree)[nid].LeftChild() + 1, (*p_tree)[nid].RightChild()); CHECK_EQ(p_tree->LeftChild(nidx) + 1, p_tree->RightChild(nidx));
row_set_collection_.AddSplit(nid, (*p_tree)[nid].LeftChild(), (*p_tree)[nid].RightChild(), row_set_collection_.AddSplit(nidx, p_tree->LeftChild(nidx), p_tree->RightChild(nidx), n_left,
n_left, n_right); n_right);
} }
} }
template <typename ExpandEntry>
void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat, void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) { std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
auto const& column_matrix = gmat.Transpose(); auto const& column_matrix = gmat.Transpose();
if (column_matrix.IsInitialized()) { if (column_matrix.IsInitialized()) {
if (gmat.cut.HasCategorical()) { if (gmat.cut.HasCategorical()) {
@ -149,10 +160,10 @@ class CommonRowPartitioner {
} }
} }
template <bool any_cat> template <bool any_cat, typename ExpandEntry>
void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat, void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix, const common::ColumnMatrix& column_matrix,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) { std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
if (column_matrix.AnyMissing()) { if (column_matrix.AnyMissing()) {
this->template UpdatePosition<true, any_cat>(ctx, gmat, column_matrix, nodes, p_tree); this->template UpdatePosition<true, any_cat>(ctx, gmat, column_matrix, nodes, p_tree);
} else { } else {
@ -160,33 +171,21 @@ class CommonRowPartitioner {
} }
} }
template <bool any_missing, bool any_cat> template <bool any_missing, bool any_cat, typename ExpandEntry>
void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat, void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix, const common::ColumnMatrix& column_matrix,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) { std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
switch (column_matrix.GetTypeSize()) { common::DispatchBinType(column_matrix.GetTypeSize(), [&](auto t) {
case common::kUint8BinsTypeSize: using T = decltype(t);
this->template UpdatePosition<uint8_t, any_missing, any_cat>(ctx, gmat, column_matrix, this->template UpdatePosition<T, any_missing, any_cat>(ctx, gmat, column_matrix, nodes,
nodes, p_tree); p_tree);
break; });
case common::kUint16BinsTypeSize:
this->template UpdatePosition<uint16_t, any_missing, any_cat>(ctx, gmat, column_matrix,
nodes, p_tree);
break;
case common::kUint32BinsTypeSize:
this->template UpdatePosition<uint32_t, any_missing, any_cat>(ctx, gmat, column_matrix,
nodes, p_tree);
break;
default:
// no default behavior
CHECK(false) << column_matrix.GetTypeSize();
}
} }
template <typename BinIdxType, bool any_missing, bool any_cat> template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat, void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix, const common::ColumnMatrix& column_matrix,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) { std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
// 1. Find split condition for each split // 1. Find split condition for each split
size_t n_nodes = nodes.size(); size_t n_nodes = nodes.size();
@ -248,9 +247,9 @@ class CommonRowPartitioner {
AddSplitsToRowSet(nodes, p_tree); AddSplitsToRowSet(nodes, p_tree);
} }
auto const& Partitions() const { return row_set_collection_; } [[nodiscard]] auto const& Partitions() const { return row_set_collection_; }
size_t Size() const { [[nodiscard]] std::size_t Size() const {
return std::distance(row_set_collection_.begin(), row_set_collection_.end()); return std::distance(row_set_collection_.begin(), row_set_collection_.end());
} }
@ -263,12 +262,29 @@ class CommonRowPartitioner {
[&](size_t idx) -> bool { return hess[idx] - .0f == .0f; }); [&](size_t idx) -> bool { return hess[idx] - .0f == .0f; });
} }
void LeafPartition(Context const* ctx, RegTree const& tree,
linalg::TensorView<GradientPair const, 2> gpair,
std::vector<bst_node_t>* p_out_position) const {
if (gpair.Shape(1) > 1) {
partition_builder_.LeafPartition(
ctx, tree, this->Partitions(), p_out_position, [&](std::size_t idx) -> bool {
auto sample = gpair.Slice(idx, linalg::All());
return std::all_of(linalg::cbegin(sample), linalg::cend(sample),
[](GradientPair const& g) { return g.GetHess() - .0f == .0f; });
});
} else {
auto s = gpair.Slice(linalg::All(), 0);
partition_builder_.LeafPartition(
ctx, tree, this->Partitions(), p_out_position,
[&](std::size_t idx) -> bool { return s(idx).GetHess() - .0f == .0f; });
}
}
void LeafPartition(Context const* ctx, RegTree const& tree, void LeafPartition(Context const* ctx, RegTree const& tree,
common::Span<GradientPair const> gpair, common::Span<GradientPair const> gpair,
std::vector<bst_node_t>* p_out_position) const { std::vector<bst_node_t>* p_out_position) const {
partition_builder_.LeafPartition( partition_builder_.LeafPartition(
ctx, tree, this->Partitions(), p_out_position, ctx, tree, this->Partitions(), p_out_position,
[&](size_t idx) -> bool { return gpair[idx].GetHess() - .0f == .0f; }); [&](std::size_t idx) -> bool { return gpair[idx].GetHess() - .0f == .0f; });
} }
private: private:
@ -278,6 +294,5 @@ class CommonRowPartitioner {
ColumnSplitHelper column_split_helper_; ColumnSplitHelper column_split_helper_;
}; };
} // namespace tree } // namespace xgboost::tree
} // namespace xgboost
#endif // XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_ #endif // XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_
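A quick note on the multi-target LeafPartition overload added above: a row is treated as sampled-out only when the hessian is zero for every target, whereas the scalar overload checks a single gradient pair. A minimal standalone sketch of that predicate, using a toy GradientPair rather than the real xgboost type:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // Illustrative stand-in for xgboost's GradientPair.
    struct GradientPair {
      float grad;
      float hess;
      float GetHess() const { return hess; }
    };

    // A row is skipped only when all targets have a zero hessian, mirroring the
    // multi-target branch of LeafPartition above.
    bool SampledOut(std::vector<GradientPair> const& row) {
      return std::all_of(row.cbegin(), row.cend(),
                         [](GradientPair const& g) { return g.GetHess() == 0.0f; });
    }

    int main() {
      assert(SampledOut({{0.1f, 0.0f}, {0.2f, 0.0f}}));   // both targets have zero hessian
      assert(!SampledOut({{0.1f, 0.0f}, {0.2f, 0.5f}}));  // one target still carries hessian
      return 0;
    }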

View File

@ -21,7 +21,8 @@
namespace xgboost { namespace xgboost {
namespace tree { namespace tree {
namespace cpu_impl { namespace cpu_impl {
void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair, void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair,
linalg::VectorView<float> out) { linalg::VectorView<float> out) {
auto n_targets = out.Size(); auto n_targets = out.Size();
CHECK_EQ(n_targets, gpair.Shape(1)); CHECK_EQ(n_targets, gpair.Shape(1));
@ -43,8 +44,12 @@ void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpai
} }
} }
CHECK(h_sum.CContiguous()); CHECK(h_sum.CContiguous());
// In vertical federated learning, only worker 0 needs to call this, no need to do an allreduce.
if (!collective::IsFederated() || info.data_split_mode != DataSplitMode::kCol) {
collective::Allreduce<collective::Operation::kSum>( collective::Allreduce<collective::Operation::kSum>(
reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2); reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2);
}
for (std::size_t i = 0; i < h_sum.Size(); ++i) { for (std::size_t i = 0; i < h_sum.Size(); ++i) {
out(i) = static_cast<float>(CalcUnregularizedWeight(h_sum(i).GetGrad(), h_sum(i).GetHess())); out(i) = static_cast<float>(CalcUnregularizedWeight(h_sum(i).GetGrad(), h_sum(i).GetHess()));
@ -64,7 +69,7 @@ inline void FitStump(Context const*, linalg::TensorView<GradientPair const, 2>,
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP) #endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
} // namespace cuda_impl } // namespace cuda_impl
void FitStump(Context const* ctx, HostDeviceVector<GradientPair> const& gpair, void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector<GradientPair> const& gpair,
bst_target_t n_targets, linalg::Vector<float>* out) { bst_target_t n_targets, linalg::Vector<float>* out) {
out->SetDevice(ctx->gpu_id); out->SetDevice(ctx->gpu_id);
out->Reshape(n_targets); out->Reshape(n_targets);
@ -72,7 +77,7 @@ void FitStump(Context const* ctx, HostDeviceVector<GradientPair> const& gpair,
gpair.SetDevice(ctx->gpu_id); gpair.SetDevice(ctx->gpu_id);
auto gpair_t = linalg::MakeTensorView(ctx, &gpair, n_samples, n_targets); auto gpair_t = linalg::MakeTensorView(ctx, &gpair, n_samples, n_targets);
ctx->IsCPU() ? cpu_impl::FitStump(ctx, gpair_t, out->HostView()) ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
: cuda_impl::FitStump(ctx, gpair_t, out->View(ctx->gpu_id)); : cuda_impl::FitStump(ctx, gpair_t, out->View(ctx->gpu_id));
} }
} // namespace tree } // namespace tree
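For context on the change above: FitStump estimates the base score per target from the summed gradients, and the new branch skips the allreduce under vertical federated learning where only worker 0 holds the labels. A standalone sketch of the single-target case, assuming CalcUnregularizedWeight is the usual Newton step -sum_grad / sum_hess:

    #include <cstdio>
    #include <vector>

    // Toy gradient pair; stands in for xgboost's GradientPairPrecise.
    struct GradPair {
      double grad;
      double hess;
    };

    // Unregularized Newton step, as assumed for CalcUnregularizedWeight.
    double UnregularizedWeight(double sum_grad, double sum_hess) {
      return sum_hess <= 0.0 ? 0.0 : -sum_grad / sum_hess;
    }

    int main() {
      // One column of gradients per target; here a single target with three rows.
      std::vector<GradPair> col{{0.5, 1.0}, {-0.25, 1.0}, {0.75, 1.0}};
      GradPair sum{0.0, 0.0};
      for (auto const& g : col) {
        sum.grad += g.grad;
        sum.hess += g.hess;
      }
      // In a distributed run the (grad, hess) sums would be allreduced here,
      // except under vertical federated learning where only worker 0 has labels.
      std::printf("base score estimate: %f\n", UnregularizedWeight(sum.grad, sum.hess));
      return 0;
    }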

View File

@ -16,6 +16,7 @@
#include "../common/common.h" // AssertGPUSupport #include "../common/common.h" // AssertGPUSupport
#include "xgboost/base.h" // GradientPair #include "xgboost/base.h" // GradientPair
#include "xgboost/context.h" // Context #include "xgboost/context.h" // Context
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h" // HostDeviceVector #include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/linalg.h" // TensorView #include "xgboost/linalg.h" // TensorView
@ -30,7 +31,7 @@ XGBOOST_DEVICE inline double CalcUnregularizedWeight(T sum_grad, T sum_hess) {
/** /**
* @brief Fit a tree stump as an estimation of base_score. * @brief Fit a tree stump as an estimation of base_score.
*/ */
void FitStump(Context const* ctx, HostDeviceVector<GradientPair> const& gpair, void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector<GradientPair> const& gpair,
bst_target_t n_targets, linalg::Vector<float>* out); bst_target_t n_targets, linalg::Vector<float>* out);
} // namespace tree } // namespace tree
} // namespace xgboost } // namespace xgboost

View File

@ -4,22 +4,25 @@
#ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_ #ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
#define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_ #define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
#include <algorithm> #include <algorithm> // for copy
#include <cstddef> // for size_t #include <cstddef> // for size_t
#include <limits> #include <limits> // for numeric_limits
#include <memory> #include <memory> // for shared_ptr
#include <numeric> #include <numeric> // for accumulate
#include <utility> #include <utility> // for move
#include <vector> #include <vector> // for vector
#include "../../common/categorical.h" #include "../../common/categorical.h" // for CatBitField
#include "../../common/hist_util.h" #include "../../common/hist_util.h" // for GHistRow, HistogramCuts
#include "../../common/random.h" #include "../../common/linalg_op.h" // for cbegin, cend, begin
#include "../../data/gradient_index.h" #include "../../common/random.h" // for ColumnSampler
#include "../constraints.h" #include "../constraints.h" // for FeatureInteractionConstraintHost
#include "../param.h" // for TrainParam #include "../param.h" // for TrainParam
#include "../split_evaluator.h" #include "../split_evaluator.h" // for TreeEvaluator
#include "xgboost/context.h" #include "expand_entry.h" // for MultiExpandEntry
#include "xgboost/base.h" // for bst_node_t, bst_target_t, bst_feature_t
#include "xgboost/context.h" // for COntext
#include "xgboost/linalg.h" // for Constants, Vector
namespace xgboost::tree { namespace xgboost::tree {
template <typename ExpandEntry> template <typename ExpandEntry>
@ -410,8 +413,6 @@ class HistEvaluator {
tree[candidate.nid].SplitIndex(), left_weight, tree[candidate.nid].SplitIndex(), left_weight,
right_weight); right_weight);
auto max_node = std::max(left_child, tree[candidate.nid].RightChild());
max_node = std::max(candidate.nid, max_node);
snode_.resize(tree.GetNodes().size()); snode_.resize(tree.GetNodes().size());
snode_.at(left_child).stats = candidate.split.left_sum; snode_.at(left_child).stats = candidate.split.left_sum;
snode_.at(left_child).root_gain = snode_.at(left_child).root_gain =
@ -456,6 +457,216 @@ class HistEvaluator {
} }
}; };
class HistMultiEvaluator {
std::vector<double> gain_;
linalg::Matrix<GradientPairPrecise> stats_;
TrainParam const *param_;
FeatureInteractionConstraintHost interaction_constraints_;
std::shared_ptr<common::ColumnSampler> column_sampler_;
Context const *ctx_;
private:
static double MultiCalcSplitGain(TrainParam const &param,
linalg::VectorView<GradientPairPrecise const> left_sum,
linalg::VectorView<GradientPairPrecise const> right_sum,
linalg::VectorView<float> left_weight,
linalg::VectorView<float> right_weight) {
CalcWeight(param, left_sum, left_weight);
CalcWeight(param, right_sum, right_weight);
auto left_gain = CalcGainGivenWeight(param, left_sum, left_weight);
auto right_gain = CalcGainGivenWeight(param, right_sum, right_weight);
return left_gain + right_gain;
}
template <bst_bin_t d_step>
bool EnumerateSplit(common::HistogramCuts const &cut, bst_feature_t fidx,
common::Span<common::GHistRow const> hist,
linalg::VectorView<GradientPairPrecise const> parent_sum, double parent_gain,
SplitEntryContainer<std::vector<GradientPairPrecise>> *p_best) const {
auto const &cut_ptr = cut.Ptrs();
auto const &cut_val = cut.Values();
auto const &min_val = cut.MinValues();
auto sum = linalg::Empty<GradientPairPrecise>(ctx_, 2, hist.size());
auto left_sum = sum.Slice(0, linalg::All());
auto right_sum = sum.Slice(1, linalg::All());
bst_bin_t ibegin, iend;
if (d_step > 0) {
ibegin = static_cast<bst_bin_t>(cut_ptr[fidx]);
iend = static_cast<bst_bin_t>(cut_ptr[fidx + 1]);
} else {
ibegin = static_cast<bst_bin_t>(cut_ptr[fidx + 1]) - 1;
iend = static_cast<bst_bin_t>(cut_ptr[fidx]) - 1;
}
const auto imin = static_cast<bst_bin_t>(cut_ptr[fidx]);
auto n_targets = hist.size();
auto weight = linalg::Empty<float>(ctx_, 2, n_targets);
auto left_weight = weight.Slice(0, linalg::All());
auto right_weight = weight.Slice(1, linalg::All());
for (bst_bin_t i = ibegin; i != iend; i += d_step) {
for (bst_target_t t = 0; t < n_targets; ++t) {
auto t_hist = hist[t];
auto t_p = parent_sum(t);
left_sum(t) += t_hist[i];
right_sum(t) = t_p - left_sum(t);
}
if (d_step > 0) {
auto split_pt = cut_val[i];
auto loss_chg =
MultiCalcSplitGain(*param_, right_sum, left_sum, right_weight, left_weight) -
parent_gain;
p_best->Update(loss_chg, fidx, split_pt, d_step == -1, false, left_sum, right_sum);
} else {
float split_pt;
if (i == imin) {
split_pt = min_val[fidx];
} else {
split_pt = cut_val[i - 1];
}
auto loss_chg =
MultiCalcSplitGain(*param_, right_sum, left_sum, left_weight, right_weight) -
parent_gain;
p_best->Update(loss_chg, fidx, split_pt, d_step == -1, false, right_sum, left_sum);
}
}
// Return true if there are missing values. Doesn't handle floating-point error well.
if (d_step == +1) {
return !std::equal(linalg::cbegin(left_sum), linalg::cend(left_sum),
linalg::cbegin(parent_sum));
}
return false;
}
public:
void EvaluateSplits(RegTree const &tree, common::Span<const common::HistCollection *> hist,
common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
auto &entries = *p_entries;
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(entries.size());
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
auto nidx = entries[nidx_in_set].nid;
features[nidx_in_set] = column_sampler_->GetFeatureSet(tree.GetDepth(nidx));
}
CHECK(!features.empty());
std::int32_t n_threads = ctx_->Threads();
std::size_t const grain_size = std::max<std::size_t>(1, features.front()->Size() / n_threads);
common::BlockedSpace2d space(
entries.size(), [&](std::size_t nidx_in_set) { return features[nidx_in_set]->Size(); },
grain_size);
std::vector<MultiExpandEntry> tloc_candidates(n_threads * entries.size());
for (std::size_t i = 0; i < entries.size(); ++i) {
for (std::int32_t j = 0; j < n_threads; ++j) {
tloc_candidates[i * n_threads + j] = entries[i];
}
}
common::ParallelFor2d(space, n_threads, [&](std::size_t nidx_in_set, common::Range1d r) {
auto tidx = omp_get_thread_num();
auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx];
auto best = &entry->split;
auto parent_sum = stats_.Slice(entry->nid, linalg::All());
std::vector<common::GHistRow> node_hist;
for (auto t_hist : hist) {
node_hist.push_back((*t_hist)[entry->nid]);
}
auto features_set = features[nidx_in_set]->ConstHostSpan();
for (auto fidx_in_set = r.begin(); fidx_in_set < r.end(); fidx_in_set++) {
auto fidx = features_set[fidx_in_set];
if (!interaction_constraints_.Query(entry->nid, fidx)) {
continue;
}
auto parent_gain = gain_[entry->nid];
bool missing =
this->EnumerateSplit<+1>(cut, fidx, node_hist, parent_sum, parent_gain, best);
if (missing) {
this->EnumerateSplit<-1>(cut, fidx, node_hist, parent_sum, parent_gain, best);
}
}
});
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
for (auto tidx = 0; tidx < n_threads; ++tidx) {
entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split);
}
}
}
linalg::Vector<float> InitRoot(linalg::VectorView<GradientPairPrecise const> root_sum) {
auto n_targets = root_sum.Size();
stats_ = linalg::Constant(ctx_, GradientPairPrecise{}, 1, n_targets);
gain_.resize(1);
linalg::Vector<float> weight({n_targets}, ctx_->gpu_id);
CalcWeight(*param_, root_sum, weight.HostView());
auto root_gain = CalcGainGivenWeight(*param_, root_sum, weight.HostView());
gain_.front() = root_gain;
auto h_stats = stats_.HostView();
std::copy(linalg::cbegin(root_sum), linalg::cend(root_sum), linalg::begin(h_stats));
return weight;
}
void ApplyTreeSplit(MultiExpandEntry const &candidate, RegTree *p_tree) {
auto n_targets = p_tree->NumTargets();
auto parent_sum = stats_.Slice(candidate.nid, linalg::All());
auto weight = linalg::Empty<float>(ctx_, 3, n_targets);
auto base_weight = weight.Slice(0, linalg::All());
CalcWeight(*param_, parent_sum, base_weight);
auto left_weight = weight.Slice(1, linalg::All());
auto left_sum =
linalg::MakeVec(candidate.split.left_sum.data(), candidate.split.left_sum.size());
CalcWeight(*param_, left_sum, param_->learning_rate, left_weight);
auto right_weight = weight.Slice(2, linalg::All());
auto right_sum =
linalg::MakeVec(candidate.split.right_sum.data(), candidate.split.right_sum.size());
CalcWeight(*param_, right_sum, param_->learning_rate, right_weight);
p_tree->ExpandNode(candidate.nid, candidate.split.SplitIndex(), candidate.split.split_value,
candidate.split.DefaultLeft(), base_weight, left_weight, right_weight);
CHECK(p_tree->IsMultiTarget());
auto left_child = p_tree->LeftChild(candidate.nid);
CHECK_GT(left_child, candidate.nid);
auto right_child = p_tree->RightChild(candidate.nid);
CHECK_GT(right_child, candidate.nid);
std::size_t n_nodes = p_tree->Size();
gain_.resize(n_nodes);
gain_[left_child] = CalcGainGivenWeight(*param_, left_sum, left_weight);
gain_[right_child] = CalcGainGivenWeight(*param_, right_sum, right_weight);
if (n_nodes >= stats_.Shape(0)) {
stats_.Reshape(n_nodes * 2, stats_.Shape(1));
}
CHECK_EQ(stats_.Shape(1), n_targets);
auto left_sum_stat = stats_.Slice(left_child, linalg::All());
std::copy(candidate.split.left_sum.cbegin(), candidate.split.left_sum.cend(),
linalg::begin(left_sum_stat));
auto right_sum_stat = stats_.Slice(right_child, linalg::All());
std::copy(candidate.split.right_sum.cbegin(), candidate.split.right_sum.cend(),
linalg::begin(right_sum_stat));
}
explicit HistMultiEvaluator(Context const *ctx, MetaInfo const &info, TrainParam const *param,
std::shared_ptr<common::ColumnSampler> sampler)
: param_{param}, column_sampler_{std::move(sampler)}, ctx_{ctx} {
interaction_constraints_.Configure(*param, info.num_col_);
column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
param_->colsample_bynode, param_->colsample_bylevel,
param_->colsample_bytree);
}
};
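To make the enumeration in HistMultiEvaluator::EnumerateSplit easier to follow, here is a self-contained sketch of the same idea with toy types: accumulate per-target left sums bin by bin, derive the right sums by subtracting from the parent, and score a split as the sum of per-target child gains. The ChildGain helper below is a simplified stand-in that assumes pure L2 regularization (no L1 thresholding, no min_child_weight handling):

    #include <cstdio>
    #include <vector>

    // Toy per-bin gradient statistics; one histogram per target for one feature.
    struct Stat {
      double grad;
      double hess;
    };

    // Child gain given its summed statistics, simplified to G^2 / (H + lambda).
    double ChildGain(Stat const& s, double lambda) {
      return s.grad * s.grad / (s.hess + lambda);
    }

    int main() {
      double const lambda = 1.0;
      // Two targets, three bins.
      std::vector<std::vector<Stat>> hist{{{1.0, 2.0}, {0.5, 1.0}, {-0.5, 1.0}},
                                          {{0.2, 2.0}, {0.1, 1.0}, {0.3, 1.0}}};
      std::size_t n_targets = hist.size(), n_bins = hist.front().size();

      // Parent sums per target.
      std::vector<Stat> parent(n_targets);
      for (std::size_t t = 0; t < n_targets; ++t) {
        for (auto const& b : hist[t]) {
          parent[t].grad += b.grad;
          parent[t].hess += b.hess;
        }
      }

      // Enumerate split points left to right, accumulating per-target left sums
      // and deriving right sums by subtraction, as EnumerateSplit does above.
      std::vector<Stat> left(n_targets);
      for (std::size_t i = 0; i + 1 < n_bins; ++i) {
        double gain = 0.0;
        for (std::size_t t = 0; t < n_targets; ++t) {
          left[t].grad += hist[t][i].grad;
          left[t].hess += hist[t][i].hess;
          Stat right{parent[t].grad - left[t].grad, parent[t].hess - left[t].hess};
          gain += ChildGain(left[t], lambda) + ChildGain(right, lambda) - ChildGain(parent[t], lambda);
        }
        std::printf("split after bin %zu: gain %f\n", i, gain);
      }
      return 0;
    }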
/** /**
* \brief CPU implementation of update prediction cache, which calculates the leaf value * \brief CPU implementation of update prediction cache, which calculates the leaf value
* for the last tree and accumulates it to prediction vector. * for the last tree and accumulates it to prediction vector.

View File

@ -1,29 +1,51 @@
/*! /**
* Copyright 2021 XGBoost contributors * Copyright 2021-2023 XGBoost contributors
*/ */
#ifndef XGBOOST_TREE_HIST_EXPAND_ENTRY_H_ #ifndef XGBOOST_TREE_HIST_EXPAND_ENTRY_H_
#define XGBOOST_TREE_HIST_EXPAND_ENTRY_H_ #define XGBOOST_TREE_HIST_EXPAND_ENTRY_H_
#include <utility> #include <algorithm> // for all_of
#include "../param.h" #include <ostream> // for ostream
#include <utility> // for move
#include <vector> // for vector
namespace xgboost { #include "../param.h" // for SplitEntry, SplitEntryContainer, TrainParam
namespace tree { #include "xgboost/base.h" // for GradientPairPrecise, bst_node_t
struct CPUExpandEntry { namespace xgboost::tree {
int nid; /**
int depth; * \brief Structure for storing tree split candidate.
SplitEntry split; */
CPUExpandEntry() = default; template <typename Impl>
XGBOOST_DEVICE struct ExpandEntryImpl {
CPUExpandEntry(int nid, int depth, SplitEntry split) bst_node_t nid;
: nid(nid), depth(depth), split(std::move(split)) {} bst_node_t depth;
CPUExpandEntry(int nid, int depth, float loss_chg)
: nid(nid), depth(depth) { [[nodiscard]] float GetLossChange() const {
split.loss_chg = loss_chg; return static_cast<Impl const*>(this)->split.loss_chg;
}
[[nodiscard]] bst_node_t GetNodeId() const { return nid; }
static bool ChildIsValid(TrainParam const& param, bst_node_t depth, bst_node_t num_leaves) {
if (param.max_depth > 0 && depth >= param.max_depth) return false;
if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false;
return true;
} }
bool IsValid(const TrainParam& param, int num_leaves) const { [[nodiscard]] bool IsValid(TrainParam const& param, bst_node_t num_leaves) const {
return static_cast<Impl const*>(this)->IsValidImpl(param, num_leaves);
}
};
struct CPUExpandEntry : public ExpandEntryImpl<CPUExpandEntry> {
SplitEntry split;
CPUExpandEntry() = default;
CPUExpandEntry(bst_node_t nidx, bst_node_t depth, SplitEntry split)
: ExpandEntryImpl{nidx, depth}, split(std::move(split)) {}
CPUExpandEntry(bst_node_t nidx, bst_node_t depth) : ExpandEntryImpl{nidx, depth} {}
[[nodiscard]] bool IsValidImpl(TrainParam const& param, bst_node_t num_leaves) const {
if (split.loss_chg <= kRtEps) return false; if (split.loss_chg <= kRtEps) return false;
if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) { if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {
return false; return false;
@ -40,16 +62,7 @@ struct CPUExpandEntry {
return true; return true;
} }
float GetLossChange() const { return split.loss_chg; } friend std::ostream& operator<<(std::ostream& os, CPUExpandEntry const& e) {
bst_node_t GetNodeId() const { return nid; }
static bool ChildIsValid(const TrainParam& param, int depth, int num_leaves) {
if (param.max_depth > 0 && depth >= param.max_depth) return false;
if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false;
return true;
}
friend std::ostream& operator<<(std::ostream& os, const CPUExpandEntry& e) {
os << "ExpandEntry:\n"; os << "ExpandEntry:\n";
os << "nidx: " << e.nid << "\n"; os << "nidx: " << e.nid << "\n";
os << "depth: " << e.depth << "\n"; os << "depth: " << e.depth << "\n";
@ -58,6 +71,54 @@ struct CPUExpandEntry {
return os; return os;
} }
}; };
} // namespace tree
} // namespace xgboost struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
SplitEntryContainer<std::vector<GradientPairPrecise>> split;
MultiExpandEntry() = default;
MultiExpandEntry(bst_node_t nidx, bst_node_t depth) : ExpandEntryImpl{nidx, depth} {}
[[nodiscard]] bool IsValidImpl(TrainParam const& param, bst_node_t num_leaves) const {
if (split.loss_chg <= kRtEps) return false;
auto is_zero = [](auto const& sum) {
return std::all_of(sum.cbegin(), sum.cend(),
[&](auto const& g) { return g.GetHess() - .0 == .0; });
};
if (is_zero(split.left_sum) || is_zero(split.right_sum)) {
return false;
}
if (split.loss_chg < param.min_split_loss) {
return false;
}
if (param.max_depth > 0 && depth == param.max_depth) {
return false;
}
if (param.max_leaves > 0 && num_leaves == param.max_leaves) {
return false;
}
return true;
}
friend std::ostream& operator<<(std::ostream& os, MultiExpandEntry const& e) {
os << "ExpandEntry: \n";
os << "nidx: " << e.nid << "\n";
os << "depth: " << e.depth << "\n";
os << "loss: " << e.split.loss_chg << "\n";
os << "split cond:" << e.split.split_value << "\n";
os << "split ind:" << e.split.SplitIndex() << "\n";
os << "left_sum: [";
for (auto v : e.split.left_sum) {
os << v << ", ";
}
os << "]\n";
os << "right_sum: [";
for (auto v : e.split.right_sum) {
os << v << ", ";
}
os << "]\n";
return os;
}
};
} // namespace xgboost::tree
#endif // XGBOOST_TREE_HIST_EXPAND_ENTRY_H_ #endif // XGBOOST_TREE_HIST_EXPAND_ENTRY_H_
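The ExpandEntryImpl base above uses CRTP: IsValid and GetLossChange reach into the derived entry through a static_cast, so CPUExpandEntry and MultiExpandEntry share the bookkeeping without virtual dispatch. A minimal sketch of the pattern with toy types (a plain int max_depth standing in for TrainParam):

    #include <cassert>

    // Minimal CRTP sketch of the ExpandEntryImpl pattern: the base calls into
    // the derived class via static_cast, so there is no virtual dispatch.
    template <typename Impl>
    struct EntryBase {
      int nid{0};
      int depth{0};
      bool IsValid(int max_depth) const {
        return static_cast<Impl const*>(this)->IsValidImpl(max_depth);
      }
    };

    struct ScalarEntry : public EntryBase<ScalarEntry> {
      float loss_chg{0.0f};
      bool IsValidImpl(int max_depth) const {
        return loss_chg > 0.0f && (max_depth == 0 || depth < max_depth);
      }
    };

    int main() {
      ScalarEntry e;
      e.depth = 2;
      e.loss_chg = 0.5f;
      assert(e.IsValid(6));   // below max depth, positive loss change
      assert(!e.IsValid(2));  // at max depth
      return 0;
    }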

View File

@ -306,9 +306,9 @@ class HistogramBuilder {
// Construct a work space for building histogram. Eventually we should move this // Construct a work space for building histogram. Eventually we should move this
// function into histogram builder once hist tree method supports external memory. // function into histogram builder once hist tree method supports external memory.
template <typename Partitioner> template <typename Partitioner, typename ExpandEntry = CPUExpandEntry>
common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners, common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
std::vector<CPUExpandEntry> const &nodes_to_build) { std::vector<ExpandEntry> const &nodes_to_build) {
std::vector<size_t> partition_size(nodes_to_build.size(), 0); std::vector<size_t> partition_size(nodes_to_build.size(), 0);
for (auto const &partition : partitioners) { for (auto const &partition : partitioners) {
size_t k = 0; size_t k = 0;

View File

@ -14,10 +14,12 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "xgboost/parameter.h"
#include "xgboost/data.h"
#include "../common/categorical.h" #include "../common/categorical.h"
#include "../common/linalg_op.h"
#include "../common/math.h" #include "../common/math.h"
#include "xgboost/data.h"
#include "xgboost/linalg.h"
#include "xgboost/parameter.h"
namespace xgboost { namespace xgboost {
namespace tree { namespace tree {
@ -197,12 +199,11 @@ struct TrainParam : public XGBoostParameter<TrainParam> {
} }
/*! \brief given the loss change, whether we need to invoke pruning */ /*! \brief given the loss change, whether we need to invoke pruning */
bool NeedPrune(double loss_chg, int depth) const { [[nodiscard]] bool NeedPrune(double loss_chg, int depth) const {
return loss_chg < this->min_split_loss || return loss_chg < this->min_split_loss || (this->max_depth != 0 && depth > this->max_depth);
(this->max_depth != 0 && depth > this->max_depth);
} }
bst_node_t MaxNodes() const { [[nodiscard]] bst_node_t MaxNodes() const {
if (this->max_depth == 0 && this->max_leaves == 0) { if (this->max_depth == 0 && this->max_leaves == 0) {
LOG(FATAL) << "Max leaves and max depth cannot both be unconstrained."; LOG(FATAL) << "Max leaves and max depth cannot both be unconstrained.";
} }
@ -292,6 +293,34 @@ XGBOOST_DEVICE inline float CalcWeight(const TrainingParams &p, GpairT sum_grad)
return CalcWeight(p, sum_grad.GetGrad(), sum_grad.GetHess()); return CalcWeight(p, sum_grad.GetGrad(), sum_grad.GetHess());
} }
/**
* \brief multi-target weight, calculated with learning rate.
*/
inline void CalcWeight(TrainParam const &p, linalg::VectorView<GradientPairPrecise const> grad_sum,
float eta, linalg::VectorView<float> out_w) {
for (bst_target_t i = 0; i < out_w.Size(); ++i) {
out_w(i) = CalcWeight(p, grad_sum(i).GetGrad(), grad_sum(i).GetHess()) * eta;
}
}
/**
* \brief multi-target weight
*/
inline void CalcWeight(TrainParam const &p, linalg::VectorView<GradientPairPrecise const> grad_sum,
linalg::VectorView<float> out_w) {
return CalcWeight(p, grad_sum, 1.0f, out_w);
}
inline double CalcGainGivenWeight(TrainParam const &p,
linalg::VectorView<GradientPairPrecise const> sum_grad,
linalg::VectorView<float const> weight) {
double gain{0};
for (bst_target_t i = 0; i < weight.Size(); ++i) {
gain += -weight(i) * ThresholdL1(sum_grad(i).GetGrad(), p.reg_alpha);
}
return gain;
}
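A small worked example of the two helpers added above, assuming reg_alpha is zero so ThresholdL1 is the identity: each target gets the usual weight -G_t / (H_t + lambda), and the node gain is the sum over targets of -w_t * G_t.

    #include <cstdio>
    #include <utility>
    #include <vector>

    // Toy version of the multi-target CalcWeight / CalcGainGivenWeight pair,
    // assuming only L2 regularization (reg_alpha = 0).
    int main() {
      double const lambda = 1.0, eta = 0.3;
      // Per-target (grad, hess) sums for one node, two targets.
      std::vector<std::pair<double, double>> sums{{4.0, 9.0}, {-2.0, 9.0}};

      double gain = 0.0;
      for (auto const& [g, h] : sums) {
        double w = -g / (h + lambda);        // per-target weight
        gain += -w * g;                      // per-target contribution to the gain
        std::printf("weight: %f (scaled by eta: %f)\n", w, w * eta);
      }
      std::printf("node gain: %f\n", gain);  // 1.6 + 0.4 = 2.0
      return 0;
    }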
/*! \brief core statistics used for tree construction */ /*! \brief core statistics used for tree construction */
struct XGBOOST_ALIGNAS(16) GradStats { struct XGBOOST_ALIGNAS(16) GradStats {
using GradType = double; using GradType = double;
@ -301,8 +330,8 @@ struct XGBOOST_ALIGNAS(16) GradStats {
GradType sum_hess { 0 }; GradType sum_hess { 0 };
public: public:
XGBOOST_DEVICE GradType GetGrad() const { return sum_grad; } [[nodiscard]] XGBOOST_DEVICE GradType GetGrad() const { return sum_grad; }
XGBOOST_DEVICE GradType GetHess() const { return sum_hess; } [[nodiscard]] XGBOOST_DEVICE GradType GetHess() const { return sum_hess; }
friend std::ostream& operator<<(std::ostream& os, GradStats s) { friend std::ostream& operator<<(std::ostream& os, GradStats s) {
os << s.GetGrad() << "/" << s.GetHess(); os << s.GetGrad() << "/" << s.GetHess();
@ -340,7 +369,7 @@ struct XGBOOST_ALIGNAS(16) GradStats {
sum_hess = a.sum_hess - b.sum_hess; sum_hess = a.sum_hess - b.sum_hess;
} }
/*! \return whether the statistics is not used yet */ /*! \return whether the statistics is not used yet */
inline bool Empty() const { return sum_hess == 0.0; } [[nodiscard]] bool Empty() const { return sum_hess == 0.0; }
/*! \brief add statistics to the data */ /*! \brief add statistics to the data */
inline void Add(GradType grad, GradType hess) { inline void Add(GradType grad, GradType hess) {
sum_grad += grad; sum_grad += grad;
@ -348,6 +377,19 @@ struct XGBOOST_ALIGNAS(16) GradStats {
} }
}; };
// Helper functions for copying gradient statistic, one for vector leaf, another for normal scalar.
template <typename T, typename U>
std::vector<T> &CopyStats(linalg::VectorView<U> const &src, std::vector<T> *dst) { // NOLINT
dst->resize(src.Size());
std::copy(linalg::cbegin(src), linalg::cend(src), dst->begin());
return *dst;
}
inline GradStats &CopyStats(GradStats const &src, GradStats *dst) { // NOLINT
*dst = src;
return *dst;
}
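The point of the two CopyStats overloads above is that the templated SplitEntryContainer::Update can now accept either a scalar GradStats or a per-target vector, with the right copy picked by overload resolution. A toy sketch of that mechanism (std::vector standing in for the linalg::VectorView used by the real vector-leaf overload):

    #include <cassert>
    #include <vector>

    // Toy scalar statistic.
    struct Scalar { double grad; double hess; };

    Scalar& CopyStats(Scalar const& src, Scalar* dst) {  // scalar leaf
      *dst = src;
      return *dst;
    }

    std::vector<double>& CopyStats(std::vector<double> const& src,
                                   std::vector<double>* dst) {  // vector leaf
      dst->assign(src.cbegin(), src.cend());
      return *dst;
    }

    int main() {
      Scalar s{1.0, 2.0}, s_out{0.0, 0.0};
      CopyStats(s, &s_out);                 // scalar overload
      std::vector<double> v{1.0, 2.0}, v_out;
      CopyStats(v, &v_out);                 // vector overload
      assert(s_out.hess == 2.0 && v_out.size() == 2);
      return 0;
    }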
/*! /*!
* \brief statistics that is helpful to store * \brief statistics that is helpful to store
* and represent a split solution for the tree * and represent a split solution for the tree
@ -378,9 +420,9 @@ struct SplitEntryContainer {
return os; return os;
} }
/*!\return feature index to split on */ /*!\return feature index to split on */
bst_feature_t SplitIndex() const { return sindex & ((1U << 31) - 1U); } [[nodiscard]] bst_feature_t SplitIndex() const { return sindex & ((1U << 31) - 1U); }
/*!\return whether missing value goes to left branch */ /*!\return whether missing value goes to left branch */
bool DefaultLeft() const { return (sindex >> 31) != 0; } [[nodiscard]] bool DefaultLeft() const { return (sindex >> 31) != 0; }
/*! /*!
* \brief decides whether we can replace current entry with the given statistics * \brief decides whether we can replace current entry with the given statistics
* *
@ -391,7 +433,7 @@ struct SplitEntryContainer {
* \param new_loss_chg the loss reduction get through the split * \param new_loss_chg the loss reduction get through the split
* \param split_index the feature index where the split is on * \param split_index the feature index where the split is on
*/ */
bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const { [[nodiscard]] bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const {
if (std::isinf(new_loss_chg)) { // in some cases new_loss_chg can be NaN or Inf, if (std::isinf(new_loss_chg)) { // in some cases new_loss_chg can be NaN or Inf,
// for example when lambda = 0 & min_child_weight = 0 // for example when lambda = 0 & min_child_weight = 0
// skip value in this case // skip value in this case
@ -429,9 +471,10 @@ struct SplitEntryContainer {
* \param default_left whether the missing value goes to left * \param default_left whether the missing value goes to left
* \return whether the proposed split is better and can replace current split * \return whether the proposed split is better and can replace current split
*/ */
bool Update(bst_float new_loss_chg, unsigned split_index, template <typename GradientSumT>
bst_float new_split_value, bool default_left, bool is_cat, bool Update(bst_float new_loss_chg, unsigned split_index, bst_float new_split_value,
const GradientT &left_sum, const GradientT &right_sum) { bool default_left, bool is_cat, GradientSumT const &left_sum,
GradientSumT const &right_sum) {
if (this->NeedReplace(new_loss_chg, split_index)) { if (this->NeedReplace(new_loss_chg, split_index)) {
this->loss_chg = new_loss_chg; this->loss_chg = new_loss_chg;
if (default_left) { if (default_left) {
@ -440,8 +483,8 @@ struct SplitEntryContainer {
this->sindex = split_index; this->sindex = split_index;
this->split_value = new_split_value; this->split_value = new_split_value;
this->is_cat = is_cat; this->is_cat = is_cat;
this->left_sum = left_sum; CopyStats(left_sum, &this->left_sum);
this->right_sum = right_sum; CopyStats(right_sum, &this->right_sum);
return true; return true;
} else { } else {
return false; return false;

View File

@ -815,9 +815,9 @@ void RegTree::ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split
linalg::VectorView<float const> left_weight, linalg::VectorView<float const> left_weight,
linalg::VectorView<float const> right_weight) { linalg::VectorView<float const> right_weight) {
CHECK(IsMultiTarget()); CHECK(IsMultiTarget());
CHECK_LT(split_index, this->param.num_feature); CHECK_LT(split_index, this->param_.num_feature);
CHECK(this->p_mt_tree_); CHECK(this->p_mt_tree_);
CHECK_GT(param.size_leaf_vector, 1); CHECK_GT(param_.size_leaf_vector, 1);
this->p_mt_tree_->Expand(nidx, split_index, split_cond, default_left, base_weight, left_weight, this->p_mt_tree_->Expand(nidx, split_index, split_cond, default_left, base_weight, left_weight,
right_weight); right_weight);
@ -826,7 +826,7 @@ void RegTree::ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split
split_categories_segments_.resize(this->Size()); split_categories_segments_.resize(this->Size());
this->split_types_.at(nidx) = FeatureType::kNumerical; this->split_types_.at(nidx) = FeatureType::kNumerical;
this->param.num_nodes = this->p_mt_tree_->Size(); this->param_.num_nodes = this->p_mt_tree_->Size();
} }
void RegTree::ExpandCategorical(bst_node_t nid, bst_feature_t split_index, void RegTree::ExpandCategorical(bst_node_t nid, bst_feature_t split_index,
@ -850,13 +850,13 @@ void RegTree::ExpandCategorical(bst_node_t nid, bst_feature_t split_index,
} }
void RegTree::Load(dmlc::Stream* fi) { void RegTree::Load(dmlc::Stream* fi) {
CHECK_EQ(fi->Read(&param, sizeof(TreeParam)), sizeof(TreeParam)); CHECK_EQ(fi->Read(&param_, sizeof(TreeParam)), sizeof(TreeParam));
if (!DMLC_IO_NO_ENDIAN_SWAP) { if (!DMLC_IO_NO_ENDIAN_SWAP) {
param = param.ByteSwap(); param_ = param_.ByteSwap();
} }
nodes_.resize(param.num_nodes); nodes_.resize(param_.num_nodes);
stats_.resize(param.num_nodes); stats_.resize(param_.num_nodes);
CHECK_NE(param.num_nodes, 0); CHECK_NE(param_.num_nodes, 0);
CHECK_EQ(fi->Read(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size()), CHECK_EQ(fi->Read(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size()),
sizeof(Node) * nodes_.size()); sizeof(Node) * nodes_.size());
if (!DMLC_IO_NO_ENDIAN_SWAP) { if (!DMLC_IO_NO_ENDIAN_SWAP) {
@ -873,29 +873,31 @@ void RegTree::Load(dmlc::Stream* fi) {
} }
// chg deleted nodes // chg deleted nodes
deleted_nodes_.resize(0); deleted_nodes_.resize(0);
for (int i = 1; i < param.num_nodes; ++i) { for (int i = 1; i < param_.num_nodes; ++i) {
if (nodes_[i].IsDeleted()) { if (nodes_[i].IsDeleted()) {
deleted_nodes_.push_back(i); deleted_nodes_.push_back(i);
} }
} }
CHECK_EQ(static_cast<int>(deleted_nodes_.size()), param.num_deleted); CHECK_EQ(static_cast<int>(deleted_nodes_.size()), param_.num_deleted);
split_types_.resize(param.num_nodes, FeatureType::kNumerical); split_types_.resize(param_.num_nodes, FeatureType::kNumerical);
split_categories_segments_.resize(param.num_nodes); split_categories_segments_.resize(param_.num_nodes);
} }
void RegTree::Save(dmlc::Stream* fo) const { void RegTree::Save(dmlc::Stream* fo) const {
CHECK_EQ(param.num_nodes, static_cast<int>(nodes_.size())); CHECK_EQ(param_.num_nodes, static_cast<int>(nodes_.size()));
CHECK_EQ(param.num_nodes, static_cast<int>(stats_.size())); CHECK_EQ(param_.num_nodes, static_cast<int>(stats_.size()));
CHECK_EQ(param.deprecated_num_roots, 1); CHECK_EQ(param_.deprecated_num_roots, 1);
CHECK_NE(param.num_nodes, 0); CHECK_NE(param_.num_nodes, 0);
CHECK(!IsMultiTarget())
<< "Please use JSON/UBJSON for saving models with multi-target trees.";
CHECK(!HasCategoricalSplit()) CHECK(!HasCategoricalSplit())
<< "Please use JSON/UBJSON for saving models with categorical splits."; << "Please use JSON/UBJSON for saving models with categorical splits.";
if (DMLC_IO_NO_ENDIAN_SWAP) { if (DMLC_IO_NO_ENDIAN_SWAP) {
fo->Write(&param, sizeof(TreeParam)); fo->Write(&param_, sizeof(TreeParam));
} else { } else {
TreeParam x = param.ByteSwap(); TreeParam x = param_.ByteSwap();
fo->Write(&x, sizeof(x)); fo->Write(&x, sizeof(x));
} }
@ -1081,7 +1083,7 @@ void RegTree::LoadModel(Json const& in) {
bool typed = IsA<I32Array>(in[tf::kParent]); bool typed = IsA<I32Array>(in[tf::kParent]);
auto const& in_obj = get<Object const>(in); auto const& in_obj = get<Object const>(in);
// basic properties // basic properties
FromJson(in["tree_param"], &param); FromJson(in["tree_param"], &param_);
// categorical splits // categorical splits
bool has_cat = in_obj.find("split_type") != in_obj.cend(); bool has_cat = in_obj.find("split_type") != in_obj.cend();
if (has_cat) { if (has_cat) {
@ -1092,55 +1094,55 @@ void RegTree::LoadModel(Json const& in) {
} }
} }
// multi-target // multi-target
if (param.size_leaf_vector > 1) { if (param_.size_leaf_vector > 1) {
this->p_mt_tree_.reset(new MultiTargetTree{&param}); this->p_mt_tree_.reset(new MultiTargetTree{&param_});
this->GetMultiTargetTree()->LoadModel(in); this->GetMultiTargetTree()->LoadModel(in);
return; return;
} }
bool feature_is_64 = IsA<I64Array>(in["split_indices"]); bool feature_is_64 = IsA<I64Array>(in["split_indices"]);
if (typed && feature_is_64) { if (typed && feature_is_64) {
LoadModelImpl<true, true>(in, param, &stats_, &nodes_); LoadModelImpl<true, true>(in, param_, &stats_, &nodes_);
} else if (typed && !feature_is_64) { } else if (typed && !feature_is_64) {
LoadModelImpl<true, false>(in, param, &stats_, &nodes_); LoadModelImpl<true, false>(in, param_, &stats_, &nodes_);
} else if (!typed && feature_is_64) { } else if (!typed && feature_is_64) {
LoadModelImpl<false, true>(in, param, &stats_, &nodes_); LoadModelImpl<false, true>(in, param_, &stats_, &nodes_);
} else { } else {
LoadModelImpl<false, false>(in, param, &stats_, &nodes_); LoadModelImpl<false, false>(in, param_, &stats_, &nodes_);
} }
if (!has_cat) { if (!has_cat) {
this->split_categories_segments_.resize(this->param.num_nodes); this->split_categories_segments_.resize(this->param_.num_nodes);
this->split_types_.resize(this->param.num_nodes); this->split_types_.resize(this->param_.num_nodes);
std::fill(split_types_.begin(), split_types_.end(), FeatureType::kNumerical); std::fill(split_types_.begin(), split_types_.end(), FeatureType::kNumerical);
} }
deleted_nodes_.clear(); deleted_nodes_.clear();
for (bst_node_t i = 1; i < param.num_nodes; ++i) { for (bst_node_t i = 1; i < param_.num_nodes; ++i) {
if (nodes_[i].IsDeleted()) { if (nodes_[i].IsDeleted()) {
deleted_nodes_.push_back(i); deleted_nodes_.push_back(i);
} }
} }
// easier access to [] operator // easier access to [] operator
auto& self = *this; auto& self = *this;
for (auto nid = 1; nid < param.num_nodes; ++nid) { for (auto nid = 1; nid < param_.num_nodes; ++nid) {
auto parent = self[nid].Parent(); auto parent = self[nid].Parent();
CHECK_NE(parent, RegTree::kInvalidNodeId); CHECK_NE(parent, RegTree::kInvalidNodeId);
self[nid].SetParent(self[nid].Parent(), self[parent].LeftChild() == nid); self[nid].SetParent(self[nid].Parent(), self[parent].LeftChild() == nid);
} }
CHECK_EQ(static_cast<bst_node_t>(deleted_nodes_.size()), param.num_deleted); CHECK_EQ(static_cast<bst_node_t>(deleted_nodes_.size()), param_.num_deleted);
CHECK_EQ(this->split_categories_segments_.size(), param.num_nodes); CHECK_EQ(this->split_categories_segments_.size(), param_.num_nodes);
} }
void RegTree::SaveModel(Json* p_out) const { void RegTree::SaveModel(Json* p_out) const {
auto& out = *p_out; auto& out = *p_out;
// basic properties // basic properties
out["tree_param"] = ToJson(param); out["tree_param"] = ToJson(param_);
// categorical splits // categorical splits
this->SaveCategoricalSplit(p_out); this->SaveCategoricalSplit(p_out);
// multi-target // multi-target
if (this->IsMultiTarget()) { if (this->IsMultiTarget()) {
CHECK_GT(param.size_leaf_vector, 1); CHECK_GT(param_.size_leaf_vector, 1);
this->GetMultiTargetTree()->SaveModel(p_out); this->GetMultiTargetTree()->SaveModel(p_out);
return; return;
} }
@ -1150,11 +1152,11 @@ void RegTree::SaveModel(Json* p_out) const {
* pruner, and this pruner can be used inside another updater so leaf are not necessary * pruner, and this pruner can be used inside another updater so leaf are not necessary
* at the end of node array. * at the end of node array.
*/ */
CHECK_EQ(param.num_nodes, static_cast<int>(nodes_.size())); CHECK_EQ(param_.num_nodes, static_cast<int>(nodes_.size()));
CHECK_EQ(param.num_nodes, static_cast<int>(stats_.size())); CHECK_EQ(param_.num_nodes, static_cast<int>(stats_.size()));
CHECK_EQ(get<String>(out["tree_param"]["num_nodes"]), std::to_string(param.num_nodes)); CHECK_EQ(get<String>(out["tree_param"]["num_nodes"]), std::to_string(param_.num_nodes));
auto n_nodes = param.num_nodes; auto n_nodes = param_.num_nodes;
// stats // stats
F32Array loss_changes(n_nodes); F32Array loss_changes(n_nodes);
@ -1168,7 +1170,7 @@ void RegTree::SaveModel(Json* p_out) const {
F32Array conds(n_nodes); F32Array conds(n_nodes);
U8Array default_left(n_nodes); U8Array default_left(n_nodes);
CHECK_EQ(this->split_types_.size(), param.num_nodes); CHECK_EQ(this->split_types_.size(), param_.num_nodes);
namespace tf = tree_field; namespace tf = tree_field;
@ -1189,7 +1191,7 @@ void RegTree::SaveModel(Json* p_out) const {
default_left.Set(i, static_cast<uint8_t>(!!n.DefaultLeft())); default_left.Set(i, static_cast<uint8_t>(!!n.DefaultLeft()));
} }
}; };
if (this->param.num_feature > static_cast<bst_feature_t>(std::numeric_limits<int32_t>::max())) { if (this->param_.num_feature > static_cast<bst_feature_t>(std::numeric_limits<int32_t>::max())) {
I64Array indices_64(n_nodes); I64Array indices_64(n_nodes);
save_tree(&indices_64); save_tree(&indices_64);
out[tf::kSplitIdx] = std::move(indices_64); out[tf::kSplitIdx] = std::move(indices_64);

View File

@ -226,8 +226,8 @@ class GloablApproxBuilder {
for (auto const &candidate : valid_candidates) { for (auto const &candidate : valid_candidates) {
int left_child_nidx = tree[candidate.nid].LeftChild(); int left_child_nidx = tree[candidate.nid].LeftChild();
int right_child_nidx = tree[candidate.nid].RightChild(); int right_child_nidx = tree[candidate.nid].RightChild();
CPUExpandEntry l_best{left_child_nidx, tree.GetDepth(left_child_nidx), {}}; CPUExpandEntry l_best{left_child_nidx, tree.GetDepth(left_child_nidx)};
CPUExpandEntry r_best{right_child_nidx, tree.GetDepth(right_child_nidx), {}}; CPUExpandEntry r_best{right_child_nidx, tree.GetDepth(right_child_nidx)};
best_splits.push_back(l_best); best_splits.push_back(l_best);
best_splits.push_back(r_best); best_splits.push_back(r_best);
} }

View File

@ -190,7 +190,7 @@ class ColMaker: public TreeUpdater {
(*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate); (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
} }
// remember auxiliary statistics in the tree node // remember auxiliary statistics in the tree node
for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) { for (int nid = 0; nid < p_tree->NumNodes(); ++nid) {
p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg; p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg;
p_tree->Stat(nid).base_weight = snode_[nid].weight; p_tree->Stat(nid).base_weight = snode_[nid].weight;
p_tree->Stat(nid).sum_hess = static_cast<float>(snode_[nid].stats.sum_hess); p_tree->Stat(nid).sum_hess = static_cast<float>(snode_[nid].stats.sum_hess);
@ -255,9 +255,9 @@ class ColMaker: public TreeUpdater {
{ {
// setup statistics space for each tree node // setup statistics space for each tree node
for (auto& i : stemp_) { for (auto& i : stemp_) {
i.resize(tree.param.num_nodes, ThreadEntry()); i.resize(tree.NumNodes(), ThreadEntry());
} }
snode_.resize(tree.param.num_nodes, NodeEntry()); snode_.resize(tree.NumNodes(), NodeEntry());
} }
const MetaInfo& info = fmat.Info(); const MetaInfo& info = fmat.Info();
// setup position // setup position

View File

@ -72,7 +72,7 @@ class TreePruner : public TreeUpdater {
void DoPrune(TrainParam const* param, RegTree* p_tree) { void DoPrune(TrainParam const* param, RegTree* p_tree) {
auto& tree = *p_tree; auto& tree = *p_tree;
bst_node_t npruned = 0; bst_node_t npruned = 0;
for (int nid = 0; nid < tree.param.num_nodes; ++nid) { for (int nid = 0; nid < tree.NumNodes(); ++nid) {
if (tree[nid].IsLeaf() && !tree[nid].IsDeleted()) { if (tree[nid].IsLeaf() && !tree[nid].IsDeleted()) {
npruned = this->TryPruneLeaf(param, p_tree, nid, tree.GetDepth(nid), npruned); npruned = this->TryPruneLeaf(param, p_tree, nid, tree.GetDepth(nid), npruned);
} }

View File

@ -4,69 +4,413 @@
* \brief use quantized feature values to construct a tree * \brief use quantized feature values to construct a tree
* \author Philip Cho, Tianqi Chen, Egor Smirnov * \author Philip Cho, Tianqi Chen, Egor Smirnov
*/ */
#include "./updater_quantile_hist.h" #include <algorithm> // for max, copy, transform
#include <cstddef> // for size_t
#include <cstdint> // for uint32_t, int32_t
#include <memory> // for unique_ptr, allocator, make_unique, shared_ptr
#include <numeric> // for accumulate
#include <ostream> // for basic_ostream, char_traits, operator<<
#include <utility> // for move, swap
#include <vector> // for vector
#include <algorithm> #include "../collective/communicator-inl.h" // for Allreduce, IsDistributed
#include <cstddef> #include "../collective/communicator.h" // for Operation
#include <memory> #include "../common/hist_util.h" // for HistogramCuts, HistCollection
#include <string> #include "../common/linalg_op.h" // for begin, cbegin, cend
#include <utility> #include "../common/random.h" // for ColumnSampler
#include <vector> #include "../common/threading_utils.h" // for ParallelFor
#include "../common/timer.h" // for Monitor
#include "../common/transform_iterator.h" // for IndexTransformIter, MakeIndexTransformIter
#include "../data/gradient_index.h" // for GHistIndexMatrix
#include "common_row_partitioner.h" // for CommonRowPartitioner
#include "dmlc/omp.h" // for omp_get_thread_num
#include "dmlc/registry.h" // for DMLC_REGISTRY_FILE_TAG
#include "driver.h" // for Driver
#include "hist/evaluate_splits.h" // for HistEvaluator, HistMultiEvaluator, UpdatePre...
#include "hist/expand_entry.h" // for MultiExpandEntry, CPUExpandEntry
#include "hist/histogram.h" // for HistogramBuilder, ConstructHistSpace
#include "hist/sampler.h" // for SampleGradient
#include "param.h" // for TrainParam, SplitEntryContainer, GradStats
#include "xgboost/base.h" // for GradientPairInternal, GradientPair, bst_targ...
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for BatchIterator, BatchSet, DMatrix, MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/linalg.h" // for All, MatrixView, TensorView, Matrix, Empty
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_GE
#include "xgboost/span.h" // for Span, operator!=, SpanIterator
#include "xgboost/string_view.h" // for operator<<
#include "xgboost/task.h" // for ObjInfo
#include "xgboost/tree_model.h" // for RegTree, MTNotImplemented, RTreeNodeStat
#include "xgboost/tree_updater.h" // for TreeUpdater, TreeUpdaterReg, XGBOOST_REGISTE...
#include "common_row_partitioner.h" namespace xgboost::tree {
#include "constraints.h"
#include "hist/evaluate_splits.h"
#include "hist/histogram.h"
#include "hist/sampler.h"
#include "param.h"
#include "xgboost/linalg.h"
#include "xgboost/logging.h"
#include "xgboost/tree_updater.h"
namespace xgboost {
namespace tree {
DMLC_REGISTRY_FILE_TAG(updater_quantile_hist); DMLC_REGISTRY_FILE_TAG(updater_quantile_hist);
void QuantileHistMaker::Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, BatchParam HistBatch(TrainParam const *param) { return {param->max_bin, param->sparse_threshold}; }
DMatrix *dmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree *> &trees) {
// build tree
const size_t n_trees = trees.size();
if (!pimpl_) {
pimpl_.reset(new Builder(n_trees, param, dmat, *task_, ctx_));
}
size_t t_idx{0}; template <typename ExpandEntry, typename Updater>
for (auto p_tree : trees) { void UpdateTree(common::Monitor *monitor_, linalg::MatrixView<GradientPair const> gpair,
auto &t_row_position = out_position[t_idx]; Updater *updater, DMatrix *p_fmat, TrainParam const *param,
this->pimpl_->UpdateTree(gpair, dmat, p_tree, &t_row_position); HostDeviceVector<bst_node_t> *p_out_position, RegTree *p_tree) {
++t_idx; monitor_->Start(__func__);
updater->InitData(p_fmat, p_tree);
Driver<ExpandEntry> driver{*param};
auto const &tree = *p_tree;
driver.Push(updater->InitRoot(p_fmat, gpair, p_tree));
auto expand_set = driver.Pop();
/**
* Note for update position
* Root:
* Not applied: No need to update position, since initialization already has all the rows ordered.
* Applied: Update position is run on applied nodes so the rows are partitioned.
* Non-root:
* Not applied: That node is the root of the subtree; same rule as the root.
* Applied: Ditto
*/
while (!expand_set.empty()) {
// candidates that can be further split.
std::vector<ExpandEntry> valid_candidates;
// candidates that can be applied.
std::vector<ExpandEntry> applied;
for (auto const &candidate : expand_set) {
updater->ApplyTreeSplit(candidate, p_tree);
CHECK_GT(p_tree->LeftChild(candidate.nid), candidate.nid);
applied.push_back(candidate);
if (driver.IsChildValid(candidate)) {
valid_candidates.emplace_back(candidate);
} }
} }
bool QuantileHistMaker::UpdatePredictionCache(const DMatrix *data, updater->UpdatePosition(p_fmat, p_tree, applied);
linalg::VectorView<float> out_preds) {
if (pimpl_) { std::vector<ExpandEntry> best_splits;
return pimpl_->UpdatePredictionCache(data, out_preds); if (!valid_candidates.empty()) {
updater->BuildHistogram(p_fmat, p_tree, valid_candidates, gpair);
for (auto const &candidate : valid_candidates) {
auto left_child_nidx = tree.LeftChild(candidate.nid);
auto right_child_nidx = tree.RightChild(candidate.nid);
ExpandEntry l_best{left_child_nidx, tree.GetDepth(left_child_nidx)};
ExpandEntry r_best{right_child_nidx, tree.GetDepth(right_child_nidx)};
best_splits.push_back(l_best);
best_splits.push_back(r_best);
}
updater->EvaluateSplits(p_fmat, p_tree, &best_splits);
}
driver.Push(best_splits.begin(), best_splits.end());
expand_set = driver.Pop();
}
auto &h_out_position = p_out_position->HostVector();
updater->LeafPartition(tree, gpair, &h_out_position);
monitor_->Stop(__func__);
}
/**
* \brief Updater for building multi-target trees. The implementation simply iterates over
* each target.
*/
class MultiTargetHistBuilder {
private:
common::Monitor *monitor_{nullptr};
TrainParam const *param_{nullptr};
std::shared_ptr<common::ColumnSampler> col_sampler_;
std::unique_ptr<HistMultiEvaluator> evaluator_;
// Histogram builder for each target.
std::vector<HistogramBuilder<MultiExpandEntry>> histogram_builder_;
Context const *ctx_{nullptr};
// Partitioner for each data batch.
std::vector<CommonRowPartitioner> partitioner_;
// Pointer to last updated tree, used for update prediction cache.
RegTree const *p_last_tree_{nullptr};
ObjInfo const *task_{nullptr};
public:
void UpdatePosition(DMatrix *p_fmat, RegTree const *p_tree,
std::vector<MultiExpandEntry> const &applied) {
monitor_->Start(__func__);
std::size_t page_id{0};
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(this->param_))) {
this->partitioner_.at(page_id).UpdatePosition(this->ctx_, page, applied, p_tree);
page_id++;
}
monitor_->Stop(__func__);
}
void ApplyTreeSplit(MultiExpandEntry const &candidate, RegTree *p_tree) {
this->evaluator_->ApplyTreeSplit(candidate, p_tree);
}
void InitData(DMatrix *p_fmat, RegTree const *p_tree) {
monitor_->Start(__func__);
std::size_t page_id = 0;
bst_bin_t n_total_bins = 0;
partitioner_.clear();
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
if (n_total_bins == 0) {
n_total_bins = page.cut.TotalBins();
} else { } else {
CHECK_EQ(n_total_bins, page.cut.TotalBins());
}
partitioner_.emplace_back(ctx_, page.Size(), page.base_rowid, p_fmat->IsColumnSplit());
page_id++;
}
bst_target_t n_targets = p_tree->NumTargets();
histogram_builder_.clear();
for (std::size_t i = 0; i < n_targets; ++i) {
histogram_builder_.emplace_back();
histogram_builder_.back().Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
collective::IsDistributed(), p_fmat->IsColumnSplit());
}
evaluator_ = std::make_unique<HistMultiEvaluator>(ctx_, p_fmat->Info(), param_, col_sampler_);
p_last_tree_ = p_tree;
monitor_->Stop(__func__);
}
MultiExpandEntry InitRoot(DMatrix *p_fmat, linalg::MatrixView<GradientPair const> gpair,
RegTree *p_tree) {
monitor_->Start(__func__);
MultiExpandEntry best;
best.nid = RegTree::kRoot;
best.depth = 0;
auto n_targets = p_tree->NumTargets();
linalg::Matrix<GradientPairPrecise> root_sum_tloc =
linalg::Empty<GradientPairPrecise>(ctx_, ctx_->Threads(), n_targets);
CHECK_EQ(root_sum_tloc.Shape(1), gpair.Shape(1));
auto h_root_sum_tloc = root_sum_tloc.HostView();
common::ParallelFor(gpair.Shape(0), ctx_->Threads(), [&](auto i) {
for (bst_target_t t{0}; t < n_targets; ++t) {
h_root_sum_tloc(omp_get_thread_num(), t) += GradientPairPrecise{gpair(i, t)};
}
});
// Aggregate to the first row.
auto root_sum = h_root_sum_tloc.Slice(0, linalg::All());
for (std::int32_t tidx{1}; tidx < ctx_->Threads(); ++tidx) {
for (bst_target_t t{0}; t < n_targets; ++t) {
root_sum(t) += h_root_sum_tloc(tidx, t);
}
}
CHECK(root_sum.CContiguous());
collective::Allreduce<collective::Operation::kSum>(
reinterpret_cast<double *>(root_sum.Values().data()), root_sum.Size() * 2);
std::vector<MultiExpandEntry> nodes{best};
std::size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
for (bst_target_t t{0}; t < n_targets; ++t) {
auto t_gpair = gpair.Slice(linalg::All(), t);
histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
nodes, {}, t_gpair.Values());
}
i++;
}
auto weight = evaluator_->InitRoot(root_sum);
auto weight_t = weight.HostView();
std::transform(linalg::cbegin(weight_t), linalg::cend(weight_t), linalg::begin(weight_t),
[&](float w) { return w * param_->learning_rate; });
p_tree->SetLeaf(RegTree::kRoot, weight_t);
std::vector<common::HistCollection const *> hists;
for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
hists.push_back(&histogram_builder_[t].Histogram());
}
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, &nodes);
break;
}
monitor_->Stop(__func__);
return nodes.front();
}
void BuildHistogram(DMatrix *p_fmat, RegTree const *p_tree,
std::vector<MultiExpandEntry> const &valid_candidates,
linalg::MatrixView<GradientPair const> gpair) {
monitor_->Start(__func__);
std::vector<MultiExpandEntry> nodes_to_build;
std::vector<MultiExpandEntry> nodes_to_sub;
for (auto const &c : valid_candidates) {
auto left_nidx = p_tree->LeftChild(c.nid);
auto right_nidx = p_tree->RightChild(c.nid);
auto build_nidx = left_nidx;
auto subtract_nidx = right_nidx;
auto lit =
common::MakeIndexTransformIter([&](auto i) { return c.split.left_sum[i].GetHess(); });
auto left_sum = std::accumulate(lit, lit + c.split.left_sum.size(), .0);
auto rit =
common::MakeIndexTransformIter([&](auto i) { return c.split.right_sum[i].GetHess(); });
auto right_sum = std::accumulate(rit, rit + c.split.right_sum.size(), .0);
auto fewer_right = right_sum < left_sum;
if (fewer_right) {
std::swap(build_nidx, subtract_nidx);
}
nodes_to_build.emplace_back(build_nidx, p_tree->GetDepth(build_nidx));
nodes_to_sub.emplace_back(subtract_nidx, p_tree->GetDepth(subtract_nidx));
}
std::size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
for (std::size_t t = 0; t < p_tree->NumTargets(); ++t) {
auto t_gpair = gpair.Slice(linalg::All(), t);
// Make sure the gradient matrix is f-order.
CHECK(t_gpair.Contiguous());
histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
nodes_to_build, nodes_to_sub, t_gpair.Values());
}
i++;
}
monitor_->Stop(__func__);
}
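BuildHistogram above relies on the standard histogram subtraction trick: for each split candidate, only the child with the smaller hessian mass is built from the raw data, and its sibling is obtained by subtracting that histogram from the parent's. A toy sketch of the subtraction step:

    #include <cstdio>
    #include <vector>

    // Per-bin gradient statistics of a histogram.
    struct Bin { double grad; double hess; };

    int main() {
      std::vector<Bin> parent{{3.0, 6.0}, {1.0, 2.0}, {-1.0, 4.0}};
      // Suppose the left child holds fewer rows (smaller hessian sum), so it is
      // built from the raw data and the right child is derived for free.
      std::vector<Bin> left{{1.0, 2.0}, {0.5, 1.0}, {-0.5, 1.0}};
      std::vector<Bin> right(parent.size());
      for (std::size_t i = 0; i < parent.size(); ++i) {
        right[i].grad = parent[i].grad - left[i].grad;
        right[i].hess = parent[i].hess - left[i].hess;
      }
      for (auto const& b : right) {
        std::printf("right bin: grad %.1f hess %.1f\n", b.grad, b.hess);
      }
      return 0;
    }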
void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
std::vector<MultiExpandEntry> *best_splits) {
monitor_->Start(__func__);
std::vector<common::HistCollection const *> hists;
for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
hists.push_back(&histogram_builder_[t].Histogram());
}
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, best_splits);
break;
}
monitor_->Stop(__func__);
}
void LeafPartition(RegTree const &tree, linalg::MatrixView<GradientPair const> gpair,
std::vector<bst_node_t> *p_out_position) {
monitor_->Start(__func__);
if (!task_->UpdateTreeLeaf()) {
return;
}
for (auto const &part : partitioner_) {
part.LeafPartition(ctx_, tree, gpair, p_out_position);
}
monitor_->Stop(__func__);
}
public:
explicit MultiTargetHistBuilder(Context const *ctx, MetaInfo const &info, TrainParam const *param,
std::shared_ptr<common::ColumnSampler> column_sampler,
ObjInfo const *task, common::Monitor *monitor)
: monitor_{monitor},
param_{param},
col_sampler_{std::move(column_sampler)},
evaluator_{std::make_unique<HistMultiEvaluator>(ctx, info, param, col_sampler_)},
ctx_{ctx},
task_{task} {
monitor_->Init(__func__);
}
};
class HistBuilder {
private:
common::Monitor *monitor_;
TrainParam const *param_;
std::shared_ptr<common::ColumnSampler> col_sampler_;
std::unique_ptr<HistEvaluator<CPUExpandEntry>> evaluator_;
std::vector<CommonRowPartitioner> partitioner_;
// back pointers to tree and data matrix
const RegTree *p_last_tree_{nullptr};
DMatrix const *const p_last_fmat_{nullptr};
std::unique_ptr<HistogramBuilder<CPUExpandEntry>> histogram_builder_;
ObjInfo const *task_{nullptr};
// Context for number of threads
Context const *ctx_{nullptr};
public:
explicit HistBuilder(Context const *ctx, std::shared_ptr<common::ColumnSampler> column_sampler,
TrainParam const *param, DMatrix const *fmat, ObjInfo const *task,
common::Monitor *monitor)
: monitor_{monitor},
param_{param},
col_sampler_{std::move(column_sampler)},
evaluator_{std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx, param, fmat->Info(),
col_sampler_)},
p_last_fmat_(fmat),
histogram_builder_{new HistogramBuilder<CPUExpandEntry>},
task_{task},
ctx_{ctx} {
monitor_->Init(__func__);
}
bool UpdatePredictionCache(DMatrix const *data, linalg::VectorView<float> out_preds) const {
// p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in
// conjunction with Update().
if (!p_last_fmat_ || !p_last_tree_ || data != p_last_fmat_) {
return false;
}
monitor_->Start(__func__);
CHECK_EQ(out_preds.Size(), data->Info().num_row_);
UpdatePredictionCacheImpl(ctx_, p_last_tree_, partitioner_, out_preds);
monitor_->Stop(__func__);
return true;
}
public:
// initialize temp data structure
void InitData(DMatrix *fmat, RegTree const *p_tree) {
monitor_->Start(__func__);
std::size_t page_id{0};
bst_bin_t n_total_bins{0};
partitioner_.clear();
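// One row partitioner per external-memory page; all pages must agree on the total number of bins.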
for (auto const &page : fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
if (n_total_bins == 0) {
n_total_bins = page.cut.TotalBins();
} else {
CHECK_EQ(n_total_bins, page.cut.TotalBins());
}
partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid, fmat->IsColumnSplit());
++page_id;
}
histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
collective::IsDistributed(), fmat->IsColumnSplit());
evaluator_ = std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx_, this->param_, fmat->Info(),
col_sampler_);
p_last_tree_ = p_tree;
}
void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
std::vector<CPUExpandEntry> *best_splits) {
monitor_->Start(__func__);
auto const &histograms = histogram_builder_->Histogram();
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
evaluator_->EvaluateSplits(histograms, gmat.cut, ft, *p_tree, best_splits);
break;
}
monitor_->Stop(__func__);
}
void ApplyTreeSplit(CPUExpandEntry const &candidate, RegTree *p_tree) {
this->evaluator_->ApplyTreeSplit(candidate, p_tree);
}
CPUExpandEntry InitRoot(DMatrix *p_fmat, linalg::MatrixView<GradientPair const> gpair,
RegTree *p_tree) {
CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0));
std::size_t page_id = 0;
auto space = ConstructHistSpace(partitioner_, {node});
for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
std::vector<CPUExpandEntry> nodes_to_build{node};
std::vector<CPUExpandEntry> nodes_to_sub;
this->histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
partitioner_.at(page_id).Partitions(), nodes_to_build,
nodes_to_sub, gpair.Slice(linalg::All(), 0).Values());
++page_id;
}
@ -78,21 +422,23 @@ CPUExpandEntry QuantileHistMaker::Builder::InitRoot(
* of gradient histogram is equal to snode[nid]
*/
auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_)).begin());
std::vector<std::uint32_t> const &row_ptr = gmat.cut.Ptrs();
CHECK_GE(row_ptr.size(), 2);
std::uint32_t const ibegin = row_ptr[0];
std::uint32_t const iend = row_ptr[1];
auto hist = this->histogram_builder_->Histogram()[RegTree::kRoot];
auto begin = hist.data();
for (std::uint32_t i = ibegin; i < iend; ++i) {
GradientPairPrecise const &et = begin[i];
grad_stat.Add(et.GetGrad(), et.GetHess());
}
} else {
auto gpair_h = gpair.Slice(linalg::All(), 0).Values();
for (auto const &grad : gpair_h) {
grad_stat.Add(grad.GetGrad(), grad.GetHess());
}
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&grad_stat),
2);
}
auto weight = evaluator_->InitRoot(GradStats{grad_stat});
@ -104,7 +450,8 @@ CPUExpandEntry QuantileHistMaker::Builder::InitRoot(
monitor_->Start("EvaluateSplits"); monitor_->Start("EvaluateSplits");
auto ft = p_fmat->Info().feature_types.ConstHostSpan(); auto ft = p_fmat->Info().feature_types.ConstHostSpan();
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) { for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree, &entries); evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree,
&entries);
break; break;
} }
monitor_->Stop("EvaluateSplits"); monitor_->Stop("EvaluateSplits");
@ -114,13 +461,13 @@ CPUExpandEntry QuantileHistMaker::Builder::InitRoot(
return node;
}
void BuildHistogram(DMatrix *p_fmat, RegTree *p_tree,
std::vector<CPUExpandEntry> const &valid_candidates,
linalg::MatrixView<GradientPair const> gpair) {
std::vector<CPUExpandEntry> nodes_to_build(valid_candidates.size());
std::vector<CPUExpandEntry> nodes_to_sub(valid_candidates.size());
std::size_t n_idx = 0;
for (auto const &c : valid_candidates) {
auto left_nidx = (*p_tree)[c.nid].LeftChild();
auto right_nidx = (*p_tree)[c.nid].RightChild();
@ -136,21 +483,31 @@ void QuantileHistMaker::Builder::BuildHistogram(DMatrix *p_fmat, RegTree *p_tree
n_idx++;
}
std::size_t page_id{0};
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
partitioner_.at(page_id).Partitions(), nodes_to_build,
nodes_to_sub, gpair.Values());
++page_id;
}
}
void UpdatePosition(DMatrix *p_fmat, RegTree const *p_tree,
std::vector<CPUExpandEntry> const &applied) {
monitor_->Start(__func__);
std::size_t page_id{0};
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(this->param_))) {
this->partitioner_.at(page_id).UpdatePosition(this->ctx_, page, applied, p_tree);
page_id++;
}
monitor_->Stop(__func__);
}
void LeafPartition(RegTree const &tree, linalg::MatrixView<GradientPair const> gpair,
std::vector<bst_node_t> *p_out_position) {
monitor_->Start(__func__);
if (!task_->UpdateTreeLeaf()) {
return;
}
for (auto const &part : partitioner_) {
@ -158,137 +515,90 @@ void QuantileHistMaker::Builder::LeafPartition(RegTree const &tree,
}
monitor_->Stop(__func__);
}
};
/*! \brief construct a tree using quantized feature values */
class QuantileHistMaker : public TreeUpdater {
std::unique_ptr<HistBuilder> p_impl_{nullptr};
std::unique_ptr<MultiTargetHistBuilder> p_mtimpl_{nullptr};
std::shared_ptr<common::ColumnSampler> column_sampler_ =
std::make_shared<common::ColumnSampler>();
common::Monitor monitor_;
ObjInfo const *task_{nullptr};
public:
explicit QuantileHistMaker(Context const *ctx, ObjInfo const *task)
: TreeUpdater{ctx}, task_{task} {}
void Configure(const Args &) override {}
void LoadConfig(Json const &) override {}
void SaveConfig(Json *) const override {}
[[nodiscard]] char const *Name() const override { return "grow_quantile_histmaker"; }
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree *> &trees) override {
if (trees.front()->IsMultiTarget()) {
CHECK(param->monotone_constraints.empty()) << "monotone constraint" << MTNotImplemented();
if (!p_mtimpl_) {
this->p_mtimpl_ = std::make_unique<MultiTargetHistBuilder>(
ctx_, p_fmat->Info(), param, column_sampler_, task_, &monitor_);
}
} else {
if (!p_impl_) {
p_impl_ =
std::make_unique<HistBuilder>(ctx_, column_sampler_, param, p_fmat, task_, &monitor_);
}
}
bst_target_t n_targets = trees.front()->NumTargets();
auto h_gpair =
linalg::MakeTensorView(ctx_, gpair->HostSpan(), p_fmat->Info().num_row_, n_targets);
linalg::Matrix<GradientPair> sample_out;
auto h_sample_out = h_gpair;
auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; };
if (need_copy()) {
// allocate buffer
sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->gpu_id, linalg::Order::kF};
h_sample_out = sample_out.HostView();
}
for (auto tree_it = trees.begin(); tree_it != trees.end(); ++tree_it) {
if (need_copy()) {
// Copy gradient into buffer for sampling. This converts C-order to F-order.
std::copy(linalg::cbegin(h_gpair), linalg::cend(h_gpair), linalg::begin(h_sample_out));
}
SampleGradient(ctx_, *param, h_sample_out);
auto *h_out_position = &out_position[tree_it - trees.begin()];
if ((*tree_it)->IsMultiTarget()) {
UpdateTree<MultiExpandEntry>(&monitor_, h_sample_out, p_mtimpl_.get(), p_fmat, param,
h_out_position, *tree_it);
} else {
UpdateTree<CPUExpandEntry>(&monitor_, h_sample_out, p_impl_.get(), p_fmat, param,
h_out_position, *tree_it);
}
}
}
bool UpdatePredictionCache(const DMatrix *data, linalg::VectorView<float> out_preds) override {
if (p_impl_) {
return p_impl_->UpdatePredictionCache(data, out_preds);
} else if (p_mtimpl_) {
// Not yet supported.
return false;
} else {
return false;
}
}
[[nodiscard]] bool HasNodePosition() const override { return true; }
};
XGBOOST_REGISTER_TREE_UPDATER(QuantileHistMaker, "grow_quantile_histmaker")
.describe("Grow tree using quantized histogram.")
.set_body([](Context const *ctx, ObjInfo const *task) {
return new QuantileHistMaker{ctx, task};
});
} // namespace xgboost::tree

View File

@ -1,133 +0,0 @@
/*!
* Copyright 2017-2022 by XGBoost Contributors
* \file updater_quantile_hist.h
* \brief use quantized feature values to construct a tree
* \author Philip Cho, Tianqi Chen, Egor Smirnov
*/
#ifndef XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
#define XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
#include <xgboost/tree_updater.h>
#include <algorithm>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/json.h"
#include "hist/evaluate_splits.h"
#include "hist/histogram.h"
#include "hist/expand_entry.h"
#include "common_row_partitioner.h"
#include "constraints.h"
#include "./param.h"
#include "./driver.h"
#include "../common/random.h"
#include "../common/timer.h"
#include "../common/hist_util.h"
#include "../common/row_set.h"
#include "../common/partition_builder.h"
#include "../common/column_matrix.h"
namespace xgboost::tree {
inline BatchParam HistBatch(TrainParam const* param) {
return {param->max_bin, param->sparse_threshold};
}
/*! \brief construct a tree using quantized feature values */
class QuantileHistMaker: public TreeUpdater {
public:
explicit QuantileHistMaker(Context const* ctx, ObjInfo const* task)
: TreeUpdater(ctx), task_{task} {}
void Configure(const Args&) override {}
void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree*>& trees) override;
bool UpdatePredictionCache(const DMatrix *data,
linalg::VectorView<float> out_preds) override;
void LoadConfig(Json const&) override {}
void SaveConfig(Json*) const override {}
[[nodiscard]] char const* Name() const override { return "grow_quantile_histmaker"; }
[[nodiscard]] bool HasNodePosition() const override { return true; }
protected:
// actual builder that runs the algorithm
struct Builder {
public:
// constructor
explicit Builder(const size_t n_trees, TrainParam const* param, DMatrix const* fmat,
ObjInfo task, Context const* ctx)
: n_trees_(n_trees),
param_(param),
p_last_fmat_(fmat),
histogram_builder_{new HistogramBuilder<CPUExpandEntry>},
task_{task},
ctx_{ctx},
monitor_{std::make_unique<common::Monitor>()} {
monitor_->Init("Quantile::Builder");
}
// update one tree, growing
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree,
HostDeviceVector<bst_node_t>* p_out_position);
bool UpdatePredictionCache(DMatrix const* data, linalg::VectorView<float> out_preds) const;
private:
// initialize temp data structure
void InitData(DMatrix* fmat, const RegTree& tree, std::vector<GradientPair>* gpair);
size_t GetNumberOfTrees();
CPUExpandEntry InitRoot(DMatrix* p_fmat, RegTree* p_tree,
const std::vector<GradientPair>& gpair_h);
void BuildHistogram(DMatrix* p_fmat, RegTree* p_tree,
std::vector<CPUExpandEntry> const& valid_candidates,
std::vector<GradientPair> const& gpair);
void LeafPartition(RegTree const& tree, common::Span<GradientPair const> gpair,
std::vector<bst_node_t>* p_out_position);
void ExpandTree(DMatrix* p_fmat, RegTree* p_tree, const std::vector<GradientPair>& gpair_h,
HostDeviceVector<bst_node_t>* p_out_position);
private:
const size_t n_trees_;
TrainParam const* param_;
std::shared_ptr<common::ColumnSampler> column_sampler_{
std::make_shared<common::ColumnSampler>()};
std::vector<GradientPair> gpair_local_;
std::unique_ptr<HistEvaluator<CPUExpandEntry>> evaluator_;
std::vector<CommonRowPartitioner> partitioner_;
// back pointers to tree and data matrix
const RegTree* p_last_tree_{nullptr};
DMatrix const* const p_last_fmat_;
std::unique_ptr<HistogramBuilder<CPUExpandEntry>> histogram_builder_;
ObjInfo task_;
// Context for number of threads
Context const* ctx_;
std::unique_ptr<common::Monitor> monitor_;
};
protected:
std::unique_ptr<Builder> pimpl_;
ObjInfo const* task_;
};
} // namespace xgboost::tree
#endif // XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_

View File

@ -50,11 +50,11 @@ class TreeRefresher : public TreeUpdater {
int tid = omp_get_thread_num();
int num_nodes = 0;
for (auto tree : trees) {
num_nodes += tree->NumNodes();
}
stemp[tid].resize(num_nodes, GradStats());
std::fill(stemp[tid].begin(), stemp[tid].end(), GradStats());
fvec_temp[tid].Init(trees[0]->NumFeatures());
});
}
exc.Rethrow();
@ -77,7 +77,7 @@ class TreeRefresher : public TreeUpdater {
for (auto tree : trees) {
AddStats(*tree, feats, gpair_h, info, ridx,
dmlc::BeginPtr(stemp[tid]) + offset);
offset += tree->NumNodes();
}
feats.Drop(inst);
});
@ -96,7 +96,7 @@ class TreeRefresher : public TreeUpdater {
int offset = 0;
for (auto tree : trees) {
this->Refresh(param, dmlc::BeginPtr(stemp[0]) + offset, 0, tree);
offset += tree->NumNodes();
}
}

View File

@ -12,13 +12,12 @@ tests/ci_build/ci_build.sh gpu nvidia-docker \
--build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
build/testxgboost
echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled"
rm -rfv build/
buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm
chmod +x build/testxgboost
tests/ci_build/ci_build.sh rmm nvidia-docker \
--build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
--build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
--build-arg NCCL_VERSION_ARG=$NCCL_VERSION bash -c \
"source activate gpu_test && build/testxgboost --use-rmm-pool"

View File

@ -3,7 +3,7 @@ import os
import subprocess
import sys
from multiprocessing import Pool, cpu_count
from typing import Dict, Tuple
from pylint import epylint
from test_utils import PY_PACKAGE, ROOT, cd, print_time, record_time
@ -15,7 +15,10 @@ SRCPATH = os.path.normpath(
@record_time
def run_black(rel_path: str, fix: bool) -> bool:
if fix:
cmd = ["black", "-q", rel_path]
else:
cmd = ["black", "-q", "--check", rel_path]
ret = subprocess.run(cmd).returncode
if ret != 0:
@ -31,7 +34,10 @@ Please run the following command on your machine to address the formatting error
@record_time
def run_isort(rel_path: str, fix: bool) -> bool:
if fix:
cmd = ["isort", f"--src={SRCPATH}", "--profile=black", rel_path]
else:
cmd = ["isort", f"--src={SRCPATH}", "--check", "--profile=black", rel_path]
ret = subprocess.run(cmd).returncode
if ret != 0:
@ -132,7 +138,7 @@ def run_pylint() -> bool:
def main(args: argparse.Namespace) -> None:
if args.format == 1:
black_results = [
run_black(path, args.fix)
for path in [
# core
"python-package/",
@ -166,7 +172,7 @@ def main(args: argparse.Namespace) -> None:
sys.exit(-1)
isort_results = [
run_isort(path, args.fix)
for path in [
# core
"python-package/",
@ -230,6 +236,11 @@ if __name__ == "__main__":
parser.add_argument("--format", type=int, choices=[0, 1], default=1) parser.add_argument("--format", type=int, choices=[0, 1], default=1)
parser.add_argument("--type-check", type=int, choices=[0, 1], default=1) parser.add_argument("--type-check", type=int, choices=[0, 1], default=1)
parser.add_argument("--pylint", type=int, choices=[0, 1], default=1) parser.add_argument("--pylint", type=int, choices=[0, 1], default=1)
parser.add_argument(
"--fix",
action="store_true",
help="Fix the formatting issues instead of emitting an error.",
)
args = parser.parse_args()
try:
main(args)

View File

@ -1,10 +1,12 @@
/**
* Copyright 2022-2023, XGBoost contributors
*/
#ifdef XGBOOST_USE_NCCL
#include <gtest/gtest.h>
#include <string> // for string
#include "../../../src/collective/nccl_device_communicator.cuh" #include "../../../src/collective/nccl_device_communicator.cuh"
namespace xgboost { namespace xgboost {
@ -20,7 +22,15 @@ TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) {
EXPECT_THROW(construct(), dmlc::Error);
}
TEST(NcclDeviceCommunicatorSimpleTest, SystemError) {
try {
dh::safe_nccl(ncclSystemError);
} catch (dmlc::Error const& e) {
auto str = std::string{e.what()};
ASSERT_TRUE(str.find("environment variables") != std::string::npos);
}
}
} // namespace collective
} // namespace xgboost
#endif // XGBOOST_USE_NCCL

View File

@ -1,15 +1,17 @@
/**
* Copyright 2020-2023 by XGBoost contributors
*/
#include <gtest/gtest.h>
#include <string>
#include <utility>
#include <vector>
#include "../../../src/common/partition_builder.h"
#include "../../../src/common/row_set.h"
#include "../helpers.h"
namespace xgboost::common {
TEST(PartitionBuilder, BasicTest) {
constexpr size_t kBlockSize = 16;
constexpr size_t kNodes = 5;
@ -74,6 +76,4 @@ TEST(PartitionBuilder, BasicTest) {
ASSERT_EQ(n_right, (kBlockSize - rows_for_left_node[nid]) * tasks[nid]);
}
}
} // namespace xgboost::common

View File

@ -1,16 +1,25 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
#include "test_ranking_utils.h"
#include <gtest/gtest.h> // for ASSERT_NEAR, ASSERT_T...
#include <xgboost/base.h> // for Args, bst_group_t, kRtEps
#include <xgboost/context.h> // for Context
#include <xgboost/data.h> // for MetaInfo, DMatrix
#include <xgboost/host_device_vector.h> // for HostDeviceVector
#include <xgboost/logging.h> // for Error
#include <xgboost/string_view.h> // for StringView
#include <cstddef> // for size_t
#include <cstdint> // for uint32_t
#include <numeric> // for iota
#include <utility> // for move
#include <vector> // for vector
#include "../../../src/common/numeric.h" // for Iota
#include "../../../src/common/ranking_utils.h" // for LambdaRankParam, ParseMetricName, MakeMet...
#include "../helpers.h" // for EmptyDMatrix
namespace xgboost::ltr {
TEST(RankingUtils, LambdaRankParam) {
@ -66,4 +75,138 @@ TEST(RankingUtils, MakeMetricName) {
name = MakeMetricName("map", 2, false);
ASSERT_EQ(name, "map@2");
}
void TestRankingCache(Context const* ctx) {
auto p_fmat = EmptyDMatrix();
MetaInfo& info = p_fmat->Info();
info.num_row_ = 16;
info.labels.Reshape(info.num_row_);
auto& h_label = info.labels.Data()->HostVector();
for (std::size_t i = 0; i < h_label.size(); ++i) {
h_label[i] = i % 2;
}
LambdaRankParam param;
param.UpdateAllowUnknown(Args{});
RankingCache cache{ctx, info, param};
HostDeviceVector<float> predt(info.num_row_, 0);
auto& h_predt = predt.HostVector();
std::iota(h_predt.begin(), h_predt.end(), 0.0f);
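// Predictions are strictly increasing, so the descending sorted index should simply reverse the row order.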
predt.SetDevice(ctx->gpu_id);
auto rank_idx =
cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
for (std::size_t i = 0; i < rank_idx.size(); ++i) {
ASSERT_EQ(rank_idx[i], rank_idx.size() - i - 1);
}
}
TEST(RankingCache, InitFromCPU) {
Context ctx;
TestRankingCache(&ctx);
}
void TestNDCGCache(Context const* ctx) {
auto p_fmat = EmptyDMatrix();
MetaInfo& info = p_fmat->Info();
LambdaRankParam param;
param.UpdateAllowUnknown(Args{});
{
// empty
NDCGCache cache{ctx, info, param};
ASSERT_EQ(cache.DataGroupPtr(ctx).size(), 2);
}
info.num_row_ = 3;
info.group_ptr_ = {static_cast<bst_group_t>(0), static_cast<bst_group_t>(info.num_row_)};
{
auto fail = [&]() { NDCGCache cache{ctx, info, param}; };
// empty label
ASSERT_THROW(fail(), dmlc::Error);
info.labels = linalg::Matrix<float>{{0.0f, 0.1f, 0.2f}, {3}, Context::kCpuId};
// invalid label
ASSERT_THROW(fail(), dmlc::Error);
auto h_labels = info.labels.HostView();
for (std::size_t i = 0; i < h_labels.Size(); ++i) {
h_labels(i) *= 10;
}
param.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}});
NDCGCache cache{ctx, info, param};
Context cpuctx;
auto inv_idcg = cache.InvIDCG(&cpuctx);
ASSERT_EQ(inv_idcg.Size(), 1);
ASSERT_NEAR(1.0 / inv_idcg(0), 2.63093, kRtEps);
}
{
param.UpdateAllowUnknown(Args{{"lambdarank_unbiased", "false"}});
std::vector<float> h_data(32);
common::Iota(ctx, h_data.begin(), h_data.end(), 0.0f);
info.labels.Reshape(h_data.size());
info.num_row_ = h_data.size();
info.group_ptr_.back() = info.num_row_;
info.labels.Data()->HostVector() = std::move(h_data);
{
NDCGCache cache{ctx, info, param};
Context cpuctx;
auto inv_idcg = cache.InvIDCG(&cpuctx);
ASSERT_NEAR(inv_idcg(0), 0.00551782, kRtEps);
}
param.UpdateAllowUnknown(
Args{{"lambdarank_num_pair_per_sample", "3"}, {"lambdarank_pair_method", "topk"}});
{
NDCGCache cache{ctx, info, param};
Context cpuctx;
auto inv_idcg = cache.InvIDCG(&cpuctx);
ASSERT_NEAR(inv_idcg(0), 0.01552123, kRtEps);
}
}
}
TEST(NDCGCache, InitFromCPU) {
Context ctx;
TestNDCGCache(&ctx);
}
void TestMAPCache(Context const* ctx) {
auto p_fmat = EmptyDMatrix();
MetaInfo& info = p_fmat->Info();
LambdaRankParam param;
param.UpdateAllowUnknown(Args{});
std::vector<float> h_data(32);
common::Iota(ctx, h_data.begin(), h_data.end(), 0.0f);
info.labels.Reshape(h_data.size());
info.num_row_ = h_data.size();
info.labels.Data()->HostVector() = std::move(h_data);
auto fail = [&]() { std::make_shared<MAPCache>(ctx, info, param); };
// binary label
ASSERT_THROW(fail(), dmlc::Error);
h_data = std::vector<float>(32, 0.0f);
h_data[1] = 1.0f;
info.labels.Data()->HostVector() = h_data;
auto p_cache = std::make_shared<MAPCache>(ctx, info, param);
ASSERT_EQ(p_cache->Acc(ctx).size(), info.num_row_);
ASSERT_EQ(p_cache->NumRelevant(ctx).size(), info.num_row_);
}
TEST(MAPCache, InitFromCPU) {
Context ctx;
ctx.Init(Args{});
TestMAPCache(&ctx);
}
} // namespace xgboost::ltr

View File

@ -0,0 +1,104 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/base.h> // for Args, XGBOOST_DEVICE, bst_group_t, kRtEps
#include <xgboost/context.h> // for Context
#include <xgboost/linalg.h> // for MakeTensorView, Vector
#include <cstddef> // for size_t
#include <memory> // for shared_ptr
#include <numeric> // for iota
#include <vector> // for vector
#include "../../../src/common/algorithm.cuh" // for SegmentedSequence
#include "../../../src/common/cuda_context.cuh" // for CUDAContext
#include "../../../src/common/device_helpers.cuh" // for device_vector, ToSpan
#include "../../../src/common/ranking_utils.cuh" // for CalcQueriesInvIDCG
#include "../../../src/common/ranking_utils.h" // for LambdaRankParam, RankingCache
#include "../helpers.h" // for EmptyDMatrix
#include "test_ranking_utils.h" // for TestNDCGCache
#include "xgboost/data.h" // for MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
namespace xgboost::ltr {
void TestCalcQueriesInvIDCG() {
Context ctx;
ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
std::size_t n_groups = 5, n_samples_per_group = 32;
dh::device_vector<float> scores(n_samples_per_group * n_groups);
dh::device_vector<bst_group_t> group_ptr(n_groups + 1);
auto d_group_ptr = dh::ToSpan(group_ptr);
dh::LaunchN(d_group_ptr.size(), ctx.CUDACtx()->Stream(),
[=] XGBOOST_DEVICE(std::size_t i) { d_group_ptr[i] = i * n_samples_per_group; });
auto d_scores = dh::ToSpan(scores);
common::SegmentedSequence(&ctx, d_group_ptr, d_scores);
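// Every group receives the same ascending sequence as labels, so all groups share one expected inverse IDCG.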
linalg::Vector<double> inv_IDCG({n_groups}, ctx.gpu_id);
ltr::LambdaRankParam p;
p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}});
cuda_impl::CalcQueriesInvIDCG(&ctx, linalg::MakeTensorView(&ctx, d_scores, d_scores.size()),
dh::ToSpan(group_ptr), inv_IDCG.View(ctx.gpu_id), p);
for (std::size_t i = 0; i < n_groups; ++i) {
double inv_idcg = inv_IDCG(i);
ASSERT_NEAR(inv_idcg, 0.00551782, kRtEps);
}
}
TEST(RankingUtils, CalcQueriesInvIDCG) { TestCalcQueriesInvIDCG(); }
namespace {
void TestRankingCache(Context const* ctx) {
auto p_fmat = EmptyDMatrix();
MetaInfo& info = p_fmat->Info();
info.num_row_ = 16;
info.labels.Reshape(info.num_row_);
auto& h_label = info.labels.Data()->HostVector();
for (std::size_t i = 0; i < h_label.size(); ++i) {
h_label[i] = i % 2;
}
LambdaRankParam param;
param.UpdateAllowUnknown(Args{});
RankingCache cache{ctx, info, param};
HostDeviceVector<float> predt(info.num_row_, 0);
auto& h_predt = predt.HostVector();
std::iota(h_predt.begin(), h_predt.end(), 0.0f);
predt.SetDevice(ctx->gpu_id);
auto rank_idx =
cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
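// SortedIdx returns a device span here; copy it back to the host before checking.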
std::vector<std::size_t> h_rank_idx(rank_idx.size());
dh::CopyDeviceSpanToVector(&h_rank_idx, rank_idx);
for (std::size_t i = 0; i < rank_idx.size(); ++i) {
ASSERT_EQ(h_rank_idx[i], h_rank_idx.size() - i - 1);
}
}
} // namespace
TEST(RankingCache, InitFromGPU) {
Context ctx;
ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
TestRankingCache(&ctx);
}
TEST(NDCGCache, InitFromGPU) {
Context ctx;
ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
TestNDCGCache(&ctx);
}
TEST(MAPCache, InitFromGPU) {
Context ctx;
ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
TestMAPCache(&ctx);
}
} // namespace xgboost::ltr

View File

@ -0,0 +1,11 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
#pragma once
#include <xgboost/context.h> // for Context
namespace xgboost::ltr {
void TestNDCGCache(Context const* ctx);
void TestMAPCache(Context const* ctx);
} // namespace xgboost::ltr

View File

@ -112,31 +112,12 @@ TEST(SparsePage, SortIndices) {
}
TEST(DMatrix, Uri) {
auto constexpr kRows {16};
auto constexpr kCols {8};
dmlc::TemporaryDirectory tmpdir;
auto const path = tmpdir.path + "/small.csv";
CreateTestCSV(path, kRows, kCols);
std::unique_ptr<DMatrix> dmat;
// FIXME(trivialfis): Enable the following test by restricting csv parser in dmlc-core.

View File

@ -1,8 +1,9 @@
/**
* Copyright 2021-2023 XGBoost contributors
*/
#include <gtest/gtest.h>
#include <any> // for any_cast
#include <memory>
#include "../../../src/data/adapter.h"
@ -11,15 +12,14 @@
#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h" #include "../helpers.h"
namespace xgboost { namespace xgboost::data {
namespace data {
TEST(FileIterator, Basic) { TEST(FileIterator, Basic) {
auto check_n_features = [](FileIterator *iter) { auto check_n_features = [](FileIterator *iter) {
size_t n_features = 0; size_t n_features = 0;
iter->Reset(); iter->Reset();
while (iter->Next()) { while (iter->Next()) {
auto proxy = MakeProxy(iter->Proxy()); auto proxy = MakeProxy(iter->Proxy());
auto csr = dmlc::get<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter()); auto csr = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter());
n_features = std::max(n_features, csr->NumColumns()); n_features = std::max(n_features, csr->NumColumns());
} }
ASSERT_EQ(n_features, 5); ASSERT_EQ(n_features, 5);
@ -42,5 +42,4 @@ TEST(FileIterator, Basic) {
check_n_features(&iter);
}
}
} // namespace xgboost::data

Some files were not shown because too many files have changed in this diff.