Initial support for multi-target tree. (#8616)
* Implement multi-target for hist.
  - Add new hist tree builder.
  - Move data fetchers for tests.
  - Dispatch function calls in gbm based on the tree type.
parent ea04d4c46c · commit 151882dd26
@@ -7,6 +7,12 @@ The demo is adopted from scikit-learn:
 
 https://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py
 
+See :doc:`/tutorials/multioutput` for more information.
+
+.. note::
+
+    The feature is experimental. For the `multi_output_tree` strategy, many features are
+    missing.
 
 """
 
 import argparse
@@ -40,11 +46,18 @@ def gen_circle() -> Tuple[np.ndarray, np.ndarray]:
     return X, y
 
 
-def rmse_model(plot_result: bool):
+def rmse_model(plot_result: bool, strategy: str):
     """Draw a circle with 2-dim coordinate as target variables."""
     X, y = gen_circle()
     # Train a regressor on it
-    reg = xgb.XGBRegressor(tree_method="hist", n_estimators=64)
+    reg = xgb.XGBRegressor(
+        tree_method="hist",
+        n_estimators=128,
+        n_jobs=16,
+        max_depth=8,
+        multi_strategy=strategy,
+        subsample=0.6,
+    )
     reg.fit(X, y, eval_set=[(X, y)])
 
     y_predt = reg.predict(X)
@@ -52,7 +65,7 @@ def rmse_model(plot_result: bool):
         plot_predt(y, y_predt, "multi")
 
 
-def custom_rmse_model(plot_result: bool) -> None:
+def custom_rmse_model(plot_result: bool, strategy: str) -> None:
     """Train using Python implementation of Squared Error."""
 
     # Given the experimental support status, the custom objective doesn't support matrix as
@@ -88,9 +101,10 @@ def custom_rmse_model(plot_result: bool) -> None:
         {
             "tree_method": "hist",
             "num_target": y.shape[1],
+            "multi_strategy": strategy,
         },
         dtrain=Xy,
-        num_boost_round=100,
+        num_boost_round=128,
         obj=squared_log,
         evals=[(Xy, "Train")],
         evals_result=results,
@@ -107,6 +121,16 @@ if __name__ == "__main__":
     parser.add_argument("--plot", choices=[0, 1], type=int, default=1)
     args = parser.parse_args()
     # Train with builtin RMSE objective
-    rmse_model(args.plot == 1)
+    # - One model per output.
+    rmse_model(args.plot == 1, "one_output_per_tree")
+
+    # - One model for all outputs, this is still a work in progress, many features
+    #   are missing.
+    rmse_model(args.plot == 1, "multi_output_tree")
+
     # Train with custom objective.
-    custom_rmse_model(args.plot == 1)
+    # - One model per output.
+    custom_rmse_model(args.plot == 1, "one_output_per_tree")
+    # - One model for all outputs, this is still a work in progress, many features
+    #   are missing.
+    custom_rmse_model(args.plot == 1, "multi_output_tree")
@@ -226,6 +226,18 @@ Parameters for Tree Booster
   list is a group of indices of features that are allowed to interact with each other.
   See :doc:`/tutorials/feature_interaction_constraint` for more information.
 
+* ``multi_strategy``, [default = ``one_output_per_tree``]
+
+  .. versionadded:: 2.0.0
+
+  .. note:: This parameter is a work in progress.
+
+  - The strategy used for training multi-target models, including multi-target regression
+    and multi-class classification. See :doc:`/tutorials/multioutput` for more information.
+
+    - ``one_output_per_tree``: One model for each target.
+    - ``multi_output_tree``: Use multi-target trees.
+
 .. _cat-param:
 
 Parameters for Categorical Feature
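The new parameter is also accepted by the native training interface. A minimal sketch
(synthetic data; values chosen only for illustration):

.. code-block:: python

  import numpy as np
  import xgboost as xgb

  # A toy 3-target regression problem.
  rng = np.random.default_rng(0)
  X = rng.normal(size=(256, 8))
  y = np.stack([X[:, 0], X[:, 1] * 2.0, X[:, 2] - X[:, 3]], axis=1)

  Xy = xgb.DMatrix(X, label=y)
  booster = xgb.train(
      {"tree_method": "hist", "multi_strategy": "multi_output_tree"},
      Xy,
      num_boost_round=8,
  )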
@@ -11,7 +11,11 @@ can be simultaneously classified as both sci-fi and comedy. For detailed explan
 terminologies related to different multi-output models please refer to the
 :doc:`scikit-learn user guide <sklearn:modules/multiclass>`.
 
-Internally, XGBoost builds one model for each target similar to sklearn meta estimators,
+**********************************
+Training with One-Model-Per-Target
+**********************************
+
+By default, XGBoost builds one model for each target similar to sklearn meta estimators,
 with the added benefit of reusing data and other integrated features like SHAP. For a
 worked example of regression, see
 :ref:`sphx_glr_python_examples_multioutput_regression.py`. For multi-label classification,
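The default strategy can be seen end to end through the sklearn interface. A minimal
sketch with synthetic data:

.. code-block:: python

  from sklearn.datasets import make_regression

  import xgboost as xgb

  # y has 3 columns; with the default ``one_output_per_tree`` strategy XGBoost
  # trains one tree per target per boosting round.
  X, y = make_regression(n_samples=128, n_features=4, n_targets=3, random_state=0)
  reg = xgb.XGBRegressor(tree_method="hist", n_estimators=8)
  reg.fit(X, y)
  assert reg.predict(X).shape == (128, 3)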
@@ -36,3 +40,26 @@ dense matrix for labels.
 
 
 The feature is still under development with limited support from objectives and metrics.
+
+*************************
+Training with Vector Leaf
+*************************
+
+.. versionadded:: 2.0
+
+.. note::
+
+   This is still a work in progress, and many features are missing.
+
+XGBoost can optionally build multi-output trees with the size of leaf equal to the number
+of targets when the tree method `hist` is used. The behavior can be controlled by the
+``multi_strategy`` training parameter, which can take the value `one_output_per_tree` (the
+default) for building one model per target, or `multi_output_tree` for building
+multi-output trees.
+
+.. code-block:: python
+
+  clf = xgb.XGBClassifier(tree_method="hist", multi_strategy="multi_output_tree")
+
+See :ref:`sphx_glr_python_examples_multioutput_regression.py` for a worked example with
+regression.
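The same switch applies to regression, and predictions keep the same
``(n_samples, n_targets)`` shape as the default strategy; only the internal tree
structure differs. A small sketch with synthetic data:

.. code-block:: python

  from sklearn.datasets import make_regression

  import xgboost as xgb

  X, y = make_regression(n_samples=128, n_features=4, n_targets=3, random_state=0)
  reg = xgb.XGBRegressor(
      tree_method="hist",
      multi_strategy="multi_output_tree",  # one tree with a vector leaf of size 3
      n_estimators=8,
  )
  reg.fit(X, y)
  assert reg.predict(X).shape == (128, 3)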
@@ -286,8 +286,8 @@ struct LearnerModelParamLegacy;
  * \brief Strategy for building multi-target models.
  */
 enum class MultiStrategy : std::int32_t {
-  kComposite = 0,
-  kMonolithic = 1,
+  kOneOutputPerTree = 0,
+  kMultiOutputTree = 1,
 };
 
 /**
@@ -317,7 +317,7 @@ struct LearnerModelParam {
   /**
    * \brief Strategy for building multi-target models.
    */
-  MultiStrategy multi_strategy{MultiStrategy::kComposite};
+  MultiStrategy multi_strategy{MultiStrategy::kOneOutputPerTree};
 
   LearnerModelParam() = default;
   // As the old `LearnerModelParamLegacy` is still used by binary IO, we keep
@@ -338,7 +338,7 @@ struct LearnerModelParam {
 
   void Copy(LearnerModelParam const& that);
   [[nodiscard]] bool IsVectorLeaf() const noexcept {
-    return multi_strategy == MultiStrategy::kMonolithic;
+    return multi_strategy == MultiStrategy::kMultiOutputTree;
   }
   [[nodiscard]] bst_target_t OutputLength() const noexcept { return this->num_output_group; }
   [[nodiscard]] bst_target_t LeafLength() const noexcept {
@@ -530,17 +530,17 @@ class TensorView {
   /**
    * \brief Number of items in the tensor.
    */
-  LINALG_HD [[nodiscard]] std::size_t Size() const { return size_; }
+  [[nodiscard]] LINALG_HD std::size_t Size() const { return size_; }
   /**
    * \brief Whether this is a contiguous array, both C and F contiguous returns true.
    */
-  LINALG_HD [[nodiscard]] bool Contiguous() const {
+  [[nodiscard]] LINALG_HD bool Contiguous() const {
     return data_.size() == this->Size() || this->CContiguous() || this->FContiguous();
   }
   /**
    * \brief Whether it's a c-contiguous array.
    */
-  LINALG_HD [[nodiscard]] bool CContiguous() const {
+  [[nodiscard]] LINALG_HD bool CContiguous() const {
     StrideT stride;
     static_assert(std::is_same<decltype(stride), decltype(stride_)>::value);
     // It's contiguous if the stride can be calculated from shape.
@@ -550,7 +550,7 @@ class TensorView {
   /**
    * \brief Whether it's a f-contiguous array.
    */
-  LINALG_HD [[nodiscard]] bool FContiguous() const {
+  [[nodiscard]] LINALG_HD bool FContiguous() const {
     StrideT stride;
     static_assert(std::is_same<decltype(stride), decltype(stride_)>::value);
     // It's contiguous if the stride can be calculated from shape.
@@ -312,6 +312,19 @@ __model_doc = f"""
         needs to be set to have categorical feature support. See :doc:`Categorical Data
         </tutorials/categorical>` and :ref:`cat-param` for details.
 
+    multi_strategy : Optional[str]
+
+        .. versionadded:: 2.0.0
+
+        .. note:: This parameter is a work in progress.
+
+        The strategy used for training multi-target models, including multi-target
+        regression and multi-class classification. See :doc:`/tutorials/multioutput` for
+        more information.
+
+        - ``one_output_per_tree``: One model for each target.
+        - ``multi_output_tree``: Use multi-target trees.
+
     eval_metric : Optional[Union[str, List[str], Callable]]
 
         .. versionadded:: 1.6.0
@@ -624,6 +637,7 @@ class XGBModel(XGBModelBase):
         feature_types: Optional[FeatureTypes] = None,
         max_cat_to_onehot: Optional[int] = None,
         max_cat_threshold: Optional[int] = None,
+        multi_strategy: Optional[str] = None,
         eval_metric: Optional[Union[str, List[str], Callable]] = None,
         early_stopping_rounds: Optional[int] = None,
         callbacks: Optional[List[TrainingCallback]] = None,
@@ -670,6 +684,7 @@ class XGBModel(XGBModelBase):
         self.feature_types = feature_types
         self.max_cat_to_onehot = max_cat_to_onehot
         self.max_cat_threshold = max_cat_threshold
+        self.multi_strategy = multi_strategy
         self.eval_metric = eval_metric
         self.early_stopping_rounds = early_stopping_rounds
         self.callbacks = callbacks
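For readers of this diff, a quick sketch of the new estimator argument in use. It is
stored on the estimator like any other constructor parameter and forwarded to the
booster (values illustrative):

.. code-block:: python

  import xgboost as xgb

  clf = xgb.XGBClassifier(tree_method="hist", multi_strategy="multi_output_tree")
  assert clf.get_params()["multi_strategy"] == "multi_output_tree"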
@@ -10,11 +10,9 @@ import os
 import platform
 import socket
 import sys
-import zipfile
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from io import StringIO
-from pathlib import Path
 from platform import system
 from typing import (
     Any,
@@ -29,7 +27,6 @@ from typing import (
     TypedDict,
     Union,
 )
-from urllib import request
 
 import numpy as np
 import pytest
@@ -38,6 +35,13 @@ from scipy import sparse
 import xgboost as xgb
 from xgboost.core import ArrayLike
 from xgboost.sklearn import SklObjective
+from xgboost.testing.data import (
+    get_california_housing,
+    get_cancer,
+    get_digits,
+    get_sparse,
+    memory,
+)
 
 hypothesis = pytest.importorskip("hypothesis")
@@ -45,13 +49,8 @@ hypothesis = pytest.importorskip("hypothesis")
 from hypothesis import strategies
 from hypothesis.extra.numpy import arrays
 
-joblib = pytest.importorskip("joblib")
 datasets = pytest.importorskip("sklearn.datasets")
 
-Memory = joblib.Memory
-
-memory = Memory("./cachedir", verbose=0)
-
 PytestSkip = TypedDict("PytestSkip", {"condition": bool, "reason": str})
@@ -353,137 +352,6 @@ class TestDataset:
         return self.name
 
 
-@memory.cache
-def get_california_housing() -> Tuple[np.ndarray, np.ndarray]:
-    data = datasets.fetch_california_housing()
-    return data.data, data.target
-
-
-@memory.cache
-def get_digits() -> Tuple[np.ndarray, np.ndarray]:
-    data = datasets.load_digits()
-    return data.data, data.target
-
-
-@memory.cache
-def get_cancer() -> Tuple[np.ndarray, np.ndarray]:
-    return datasets.load_breast_cancer(return_X_y=True)
-
-
-@memory.cache
-def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
-    rng = np.random.RandomState(199)
-    n = 2000
-    sparsity = 0.75
-    X, y = datasets.make_regression(n, random_state=rng)
-    flag = rng.binomial(1, sparsity, X.shape)
-    for i in range(X.shape[0]):
-        for j in range(X.shape[1]):
-            if flag[i, j]:
-                X[i, j] = np.nan
-    return X, y
-
-
-@memory.cache
-def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
-    """
-    Number of samples: 1460
-    Number of features: 20
-    Number of categorical features: 10
-    Number of numerical features: 10
-    """
-    from sklearn.datasets import fetch_openml
-
-    X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
-
-    categorical_columns_subset: List[str] = [
-        "BldgType",  # 5 cats, no nan
-        "GarageFinish",  # 3 cats, nan
-        "LotConfig",  # 5 cats, no nan
-        "Functional",  # 7 cats, no nan
-        "MasVnrType",  # 4 cats, nan
-        "HouseStyle",  # 8 cats, no nan
-        "FireplaceQu",  # 5 cats, nan
-        "ExterCond",  # 5 cats, no nan
-        "ExterQual",  # 4 cats, no nan
-        "PoolQC",  # 3 cats, nan
-    ]
-
-    numerical_columns_subset: List[str] = [
-        "3SsnPorch",
-        "Fireplaces",
-        "BsmtHalfBath",
-        "HalfBath",
-        "GarageCars",
-        "TotRmsAbvGrd",
-        "BsmtFinSF1",
-        "BsmtFinSF2",
-        "GrLivArea",
-        "ScreenPorch",
-    ]
-
-    X = X[categorical_columns_subset + numerical_columns_subset]
-    X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
-    return X, y
-
-
-@memory.cache
-def get_mq2008(
-    dpath: str,
-) -> Tuple[
-    sparse.csr_matrix,
-    np.ndarray,
-    np.ndarray,
-    sparse.csr_matrix,
-    np.ndarray,
-    np.ndarray,
-    sparse.csr_matrix,
-    np.ndarray,
-    np.ndarray,
-]:
-    from sklearn.datasets import load_svmlight_files
-
-    src = "https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip"
-    target = os.path.join(os.path.expanduser(dpath), "MQ2008.zip")
-    if not os.path.exists(target):
-        request.urlretrieve(url=src, filename=target)
-
-    with zipfile.ZipFile(target, "r") as f:
-        f.extractall(path=dpath)
-
-    (
-        x_train,
-        y_train,
-        qid_train,
-        x_test,
-        y_test,
-        qid_test,
-        x_valid,
-        y_valid,
-        qid_valid,
-    ) = load_svmlight_files(
-        (
-            Path(dpath) / "MQ2008" / "Fold1" / "train.txt",
-            Path(dpath) / "MQ2008" / "Fold1" / "test.txt",
-            Path(dpath) / "MQ2008" / "Fold1" / "vali.txt",
-        ),
-        query_id=True,
-        zero_based=False,
-    )
-
-    return (
-        x_train,
-        y_train,
-        qid_train,
-        x_test,
-        y_test,
-        qid_test,
-        x_valid,
-        y_valid,
-        qid_valid,
-    )
-
-
 # pylint: disable=too-many-arguments,too-many-locals
 @memory.cache
 def make_categorical(
@@ -738,20 +606,7 @@ _unweighted_datasets_strategy = strategies.sampled_from(
         TestDataset(
             "calif_housing-l1", get_california_housing, "reg:absoluteerror", "mae"
         ),
         TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
         TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
-        TestDataset(
-            "mtreg",
-            lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
-            "reg:squarederror",
-            "rmse",
-        ),
-        TestDataset(
-            "mtreg-l1",
-            lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
-            "reg:absoluteerror",
-            "mae",
-        ),
         TestDataset("sparse", get_sparse, "reg:squarederror", "rmse"),
         TestDataset("sparse-l1", get_sparse, "reg:absoluteerror", "mae"),
         TestDataset(
@@ -764,37 +619,71 @@ _unweighted_datasets_strategy = strategies.sampled_from(
 )
 
 
-@strategies.composite
-def _dataset_weight_margin(draw: Callable) -> TestDataset:
-    data: TestDataset = draw(_unweighted_datasets_strategy)
-    if draw(strategies.booleans()):
-        data.w = draw(
-            arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0))
-        )
-    if draw(strategies.booleans()):
-        num_class = 1
-        if data.objective == "multi:softmax":
-            num_class = int(np.max(data.y) + 1)
-        elif data.name.startswith("mtreg"):
-            num_class = data.y.shape[1]
-
-        data.margin = draw(
-            arrays(
-                np.float64,
-                (data.y.shape[0] * num_class),
-                elements=strategies.floats(0.5, 1.0),
-            )
-        )
-        assert data.margin is not None
-        if num_class != 1:
-            data.margin = data.margin.reshape(data.y.shape[0], num_class)
-
-    return data
+def make_datasets_with_margin(
+    unweighted_strategy: strategies.SearchStrategy,
+) -> Callable:
+    """Factory function for creating strategies that generate datasets with weight and
+    base margin.
+
+    """
+
+    @strategies.composite
+    def weight_margin(draw: Callable) -> TestDataset:
+        data: TestDataset = draw(unweighted_strategy)
+        if draw(strategies.booleans()):
+            data.w = draw(
+                arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0))
+            )
+        if draw(strategies.booleans()):
+            num_class = 1
+            if data.objective == "multi:softmax":
+                num_class = int(np.max(data.y) + 1)
+            elif data.name.startswith("mtreg"):
+                num_class = data.y.shape[1]
+
+            data.margin = draw(
+                arrays(
+                    np.float64,
+                    (data.y.shape[0] * num_class),
+                    elements=strategies.floats(0.5, 1.0),
+                )
+            )
+            assert data.margin is not None
+            if num_class != 1:
+                data.margin = data.margin.reshape(data.y.shape[0], num_class)
+
+        return data
+
+    return weight_margin
 
 
-# A strategy for drawing from a set of example datasets
-# May add random weights to the dataset
-dataset_strategy = _dataset_weight_margin()
+# A strategy for drawing from a set of example datasets. May add random weights to the
+# dataset
+dataset_strategy = make_datasets_with_margin(_unweighted_datasets_strategy)()
+
+
+_unweighted_multi_datasets_strategy = strategies.sampled_from(
+    [
+        TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
+        TestDataset(
+            "mtreg",
+            lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
+            "reg:squarederror",
+            "rmse",
+        ),
+        TestDataset(
+            "mtreg-l1",
+            lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
+            "reg:absoluteerror",
+            "mae",
+        ),
+    ]
+)
+
+# A strategy for drawing from a set of multi-target/multi-class datasets.
+multi_dataset_strategy = make_datasets_with_margin(
+    _unweighted_multi_datasets_strategy
+)()
 
 
 def non_increasing(L: Sequence[float], tolerance: float = 1e-4) -> bool:
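A sketch of how the strategies produced by this factory are typically consumed in a
hypothesis-driven test (test body elided; ``multi_dataset_strategy`` as defined above):

.. code-block:: python

  from hypothesis import given, settings

  @given(d=multi_dataset_strategy)
  @settings(deadline=None)
  def test_multi_target_dataset(d) -> None:
      # Each drawn TestDataset may carry random weights (``d.w``) and a base
      # margin (``d.margin``) reshaped to (n_samples, n_targets).
      ...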
@@ -1,13 +1,20 @@
 """Utilities for data generation."""
-from typing import Any, Generator, Tuple, Union
+import os
+import zipfile
+from typing import Any, Generator, List, Tuple, Union
+from urllib import request
 
 import numpy as np
+import pytest
 from numpy.random import Generator as RNG
+from scipy import sparse
 
 import xgboost
 from xgboost.data import pandas_pyarrow_mapper
 
+joblib = pytest.importorskip("joblib")
+memory = joblib.Memory("./cachedir", verbose=0)
+
 
 def np_dtypes(
     n_samples: int, n_features: int
@@ -195,3 +202,141 @@ def check_inf(rng: RNG) -> None:
 
     with pytest.raises(ValueError, match="Input data contains `inf`"):
         xgboost.DMatrix(X, y)
+
+
+@memory.cache
+def get_california_housing() -> Tuple[np.ndarray, np.ndarray]:
+    """Fetch the California housing dataset from sklearn."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    data = datasets.fetch_california_housing()
+    return data.data, data.target
+
+
+@memory.cache
+def get_digits() -> Tuple[np.ndarray, np.ndarray]:
+    """Fetch the digits dataset from sklearn."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    data = datasets.load_digits()
+    return data.data, data.target
+
+
+@memory.cache
+def get_cancer() -> Tuple[np.ndarray, np.ndarray]:
+    """Fetch the breast cancer dataset from sklearn."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    return datasets.load_breast_cancer(return_X_y=True)
+
+
+@memory.cache
+def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
+    """Generate a sparse dataset."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    rng = np.random.RandomState(199)
+    n = 2000
+    sparsity = 0.75
+    X, y = datasets.make_regression(n, random_state=rng)
+    flag = rng.binomial(1, sparsity, X.shape)
+    for i in range(X.shape[0]):
+        for j in range(X.shape[1]):
+            if flag[i, j]:
+                X[i, j] = np.nan
+    return X, y
+
+
+@memory.cache
+def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Number of samples: 1460
+    Number of features: 20
+    Number of categorical features: 10
+    Number of numerical features: 10
+    """
+    datasets = pytest.importorskip("sklearn.datasets")
+    X, y = datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
+
+    categorical_columns_subset: List[str] = [
+        "BldgType",  # 5 cats, no nan
+        "GarageFinish",  # 3 cats, nan
+        "LotConfig",  # 5 cats, no nan
+        "Functional",  # 7 cats, no nan
+        "MasVnrType",  # 4 cats, nan
+        "HouseStyle",  # 8 cats, no nan
+        "FireplaceQu",  # 5 cats, nan
+        "ExterCond",  # 5 cats, no nan
+        "ExterQual",  # 4 cats, no nan
+        "PoolQC",  # 3 cats, nan
+    ]
+
+    numerical_columns_subset: List[str] = [
+        "3SsnPorch",
+        "Fireplaces",
+        "BsmtHalfBath",
+        "HalfBath",
+        "GarageCars",
+        "TotRmsAbvGrd",
+        "BsmtFinSF1",
+        "BsmtFinSF2",
+        "GrLivArea",
+        "ScreenPorch",
+    ]
+
+    X = X[categorical_columns_subset + numerical_columns_subset]
+    X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
+    return X, y
+
+
+@memory.cache
+def get_mq2008(
+    dpath: str,
+) -> Tuple[
+    sparse.csr_matrix,
+    np.ndarray,
+    np.ndarray,
+    sparse.csr_matrix,
+    np.ndarray,
+    np.ndarray,
+    sparse.csr_matrix,
+    np.ndarray,
+    np.ndarray,
+]:
+    """Fetch the mq2008 dataset."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    src = "https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip"
+    target = os.path.join(dpath, "MQ2008.zip")
+    if not os.path.exists(target):
+        request.urlretrieve(url=src, filename=target)
+
+    with zipfile.ZipFile(target, "r") as f:
+        f.extractall(path=dpath)
+
+    (
+        x_train,
+        y_train,
+        qid_train,
+        x_test,
+        y_test,
+        qid_test,
+        x_valid,
+        y_valid,
+        qid_valid,
+    ) = datasets.load_svmlight_files(
+        (
+            os.path.join(dpath, "MQ2008/Fold1/train.txt"),
+            os.path.join(dpath, "MQ2008/Fold1/test.txt"),
+            os.path.join(dpath, "MQ2008/Fold1/vali.txt"),
+        ),
+        query_id=True,
+        zero_based=False,
+    )
+
+    return (
+        x_train,
+        y_train,
+        qid_train,
+        x_test,
+        y_test,
+        qid_test,
+        x_valid,
+        y_valid,
+        qid_valid,
+    )
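Usage sketch for the relocated fetchers (the unpack path is illustrative; results are
cached on disk by joblib after the first call):

.. code-block:: python

  from xgboost.testing.data import get_california_housing, get_mq2008

  X, y = get_california_housing()
  (
      x_train, y_train, qid_train,
      x_test, y_test, qid_test,
      x_valid, y_valid, qid_valid,
  ) = get_mq2008("./data")  # downloads and extracts MQ2008.zip into ./data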
@@ -4,8 +4,8 @@ from typing import cast
 
 import pytest
 
 hypothesis = pytest.importorskip("hypothesis")
-from hypothesis import strategies  # pylint:disable=wrong-import-position
+strategies = pytest.importorskip("hypothesis.strategies")
 
 
 exact_parameter_strategy = strategies.fixed_dictionaries(
     {
@@ -41,6 +41,26 @@ hist_parameter_strategy = strategies.fixed_dictionaries(
     and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide")
 )
 
+hist_multi_parameter_strategy = strategies.fixed_dictionaries(
+    {
+        "max_depth": strategies.integers(1, 11),
+        "max_leaves": strategies.integers(0, 1024),
+        "max_bin": strategies.integers(2, 512),
+        "multi_strategy": strategies.sampled_from(
+            ["multi_output_tree", "one_output_per_tree"]
+        ),
+        "grow_policy": strategies.sampled_from(["lossguide", "depthwise"]),
+        "min_child_weight": strategies.floats(0.5, 2.0),
+        # We cannot enable subsampling as the training loss can increase
+        # 'subsample': strategies.floats(0.5, 1.0),
+        "colsample_bytree": strategies.floats(0.5, 1.0),
+        "colsample_bylevel": strategies.floats(0.5, 1.0),
+    }
+).filter(
+    lambda x: (cast(int, x["max_depth"]) > 0 or cast(int, x["max_leaves"]) > 0)
+    and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide")
+)
+
 cat_parameter_strategy = strategies.fixed_dictionaries(
     {
         "max_cat_to_onehot": strategies.integers(1, 128),
@@ -55,6 +55,7 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows
         *out_dim = 2;
         shape.resize(*out_dim);
         shape.front() = rows;
+        // chunksize can be 1 if it's softmax
         shape.back() = std::min(groups, chunksize);
       }
       break;
@@ -359,6 +359,7 @@ void AddCutPoint(typename SketchType::SummaryContainer const &summary, int max_b
                  HistogramCuts *cuts) {
   size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin));
   auto &cut_values = cuts->cut_values_.HostVector();
+  // we use the min_value as the first (0th) element, hence starting from 1.
   for (size_t i = 1; i < required_cuts; ++i) {
     bst_float cpt = summary.data[i].value;
     if (i == 1 || cpt > cut_values.back()) {
@@ -419,8 +420,8 @@ void SketchContainerImpl<WQSketch>::MakeCuts(HistogramCuts* cuts) {
     } else {
       AddCutPoint<WQSketch>(a, max_num_bins, cuts);
       // push a value that is greater than anything
-      const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value
-                                         : cuts->min_vals_.HostVector()[fid];
+      const bst_float cpt =
+          (a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid];
       // this must be bigger than last value in a scale
       const bst_float last = cpt + (fabs(cpt) + 1e-5f);
       cuts->cut_values_.HostVector().push_back(last);
@@ -352,19 +352,6 @@ struct WQSummary {
       prev_rmax = data[i].rmax;
     }
   }
-  // check consistency of the summary
-  inline bool Check(const char *msg) const {
-    const float tol = 10.0f;
-    for (size_t i = 0; i < this->size; ++i) {
-      if (data[i].rmin + data[i].wmin > data[i].rmax + tol ||
-          data[i].rmin < -1e-6f || data[i].rmax < -1e-6f) {
-        LOG(INFO) << "---------- WQSummary::Check did not pass ----------";
-        this->Print();
-        return false;
-      }
-    }
-    return true;
-  }
 };
 
 /*! \brief try to do efficient pruning */
@@ -257,6 +257,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   }
   iter.Reset();
   CHECK_EQ(rbegin, Info().num_row_);
+  CHECK_EQ(this->ghist_->Features(), Info().num_col_);
 
   /**
    * Generate column matrix
@@ -10,6 +10,7 @@
 #include <dmlc/parameter.h>
 
 #include <algorithm>
+#include <cinttypes>  // for uint32_t
 #include <limits>
 #include <memory>
 #include <string>
@@ -27,9 +28,11 @@
 #include "xgboost/host_device_vector.h"
 #include "xgboost/json.h"
 #include "xgboost/logging.h"
+#include "xgboost/model.h"
 #include "xgboost/objective.h"
 #include "xgboost/predictor.h"
-#include "xgboost/string_view.h"
+#include "xgboost/string_view.h"  // for StringView
+#include "xgboost/tree_model.h"   // for RegTree
 #include "xgboost/tree_updater.h"
 
 namespace xgboost::gbm {
@@ -131,6 +134,12 @@ void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
     // set, since only experts are expected to do so.
     return;
   }
+  if (model_.learner_model_param->IsVectorLeaf()) {
+    CHECK(tparam_.tree_method == TreeMethod::kHist)
+        << "Only the hist tree method is supported for building multi-target trees with vector "
+           "leaf.";
+  }
 
   // tparam_ is set before calling this function.
   if (tparam_.tree_method != TreeMethod::kAuto) {
     return;
@@ -175,12 +184,12 @@ void GBTree::ConfigureUpdaters() {
     case TreeMethod::kExact:
       tparam_.updater_seq = "grow_colmaker,prune";
       break;
-    case TreeMethod::kHist:
-      LOG(INFO) <<
-          "Tree method is selected to be 'hist', which uses a "
-          "single updater grow_quantile_histmaker.";
+    case TreeMethod::kHist: {
+      LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater "
+                   "grow_quantile_histmaker.";
       tparam_.updater_seq = "grow_quantile_histmaker";
       break;
+    }
     case TreeMethod::kGPUHist: {
       common::AssertGPUSupport();
       tparam_.updater_seq = "grow_gpu_hist";
@@ -209,11 +218,9 @@ void CopyGradient(HostDeviceVector<GradientPair> const* in_gpair, int32_t n_thre
     GPUCopyGradient(in_gpair, n_groups, group_id, out_gpair);
   } else {
     std::vector<GradientPair> &tmp_h = out_gpair->HostVector();
-    auto nsize = static_cast<bst_omp_uint>(out_gpair->Size());
-    const auto &gpair_h = in_gpair->ConstHostVector();
-    common::ParallelFor(nsize, n_threads, [&](bst_omp_uint i) {
-      tmp_h[i] = gpair_h[i * n_groups + group_id];
-    });
+    const auto& gpair_h = in_gpair->ConstHostVector();
+    common::ParallelFor(out_gpair->Size(), n_threads,
+                        [&](auto i) { tmp_h[i] = gpair_h[i * n_groups + group_id]; });
   }
 }
 
@@ -234,6 +241,7 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
   CHECK_EQ(model_.param.num_parallel_tree, trees.size());
   CHECK_EQ(model_.param.num_parallel_tree, 1)
       << "Boosting random forest is not supported for current objective.";
+  CHECK(!trees.front()->IsMultiTarget()) << "Update tree leaf" << MTNotImplemented();
   CHECK_EQ(trees.size(), model_.param.num_parallel_tree);
   for (std::size_t tree_idx = 0; tree_idx < trees.size(); ++tree_idx) {
     auto const& position = node_position.at(tree_idx);
@@ -245,17 +253,18 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
 
 void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
                      PredictionCacheEntry* predt, ObjFunction const* obj) {
   std::vector<std::vector<std::unique_ptr<RegTree>>> new_trees;
-  const int ngroup = model_.learner_model_param->num_output_group;
+  const int ngroup = model_.learner_model_param->OutputLength();
   ConfigureWithKnownData(this->cfg_, p_fmat);
   monitor_.Start("BoostNewTrees");
 
   // Weird case that tree method is cpu-based but gpu_id is set.  Ideally we should let
   // `gpu_id` be the single source of determining what algorithms to run, but that will
   // break a lot of existing code.
   auto device = tparam_.tree_method != TreeMethod::kGPUHist ? Context::kCpuId : ctx_->gpu_id;
-  auto out = linalg::TensorView<float, 2>{
+  auto out = linalg::MakeTensorView(
+      device,
       device == Context::kCpuId ? predt->predictions.HostSpan() : predt->predictions.DeviceSpan(),
-      {static_cast<size_t>(p_fmat->Info().num_row_), static_cast<size_t>(ngroup)},
-      device};
+      p_fmat->Info().num_row_, model_.learner_model_param->OutputLength());
   CHECK_NE(ngroup, 0);
 
   if (!p_fmat->SingleColBlock() && obj->Task().UpdateTreeLeaf()) {
@@ -266,7 +275,13 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
   // position is negated if the row is sampled out.
   std::vector<HostDeviceVector<bst_node_t>> node_position;
 
-  if (ngroup == 1) {
+  if (model_.learner_model_param->IsVectorLeaf()) {
+    std::vector<std::unique_ptr<RegTree>> ret;
+    BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret);
+    UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret);
+    // No update prediction cache yet.
+    new_trees.push_back(std::move(ret));
+  } else if (model_.learner_model_param->OutputLength() == 1) {
     std::vector<std::unique_ptr<RegTree>> ret;
     BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret);
     UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret);
@@ -383,11 +398,15 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
   }
 
   // update the trees
-  CHECK_EQ(gpair->Size(), p_fmat->Info().num_row_)
-      << "Mismatching size between number of rows from input data and size of "
-         "gradient vector.";
+  auto n_out = model_.learner_model_param->OutputLength() * p_fmat->Info().num_row_;
+  StringView msg{
+      "Mismatching size between number of rows from input data and size of gradient vector."};
+  if (!model_.learner_model_param->IsVectorLeaf() && p_fmat->Info().num_row_ != 0) {
+    CHECK_EQ(n_out % gpair->Size(), 0) << msg;
+  } else {
+    CHECK_EQ(gpair->Size(), n_out) << msg;
+  }
 
   CHECK(out_position);
   out_position->resize(new_trees.size());
 
   // Rescale learning rate according to the size of trees
@@ -402,8 +421,12 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
 
 void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& new_trees) {
   monitor_.Start("CommitModel");
-  for (uint32_t gid = 0; gid < model_.learner_model_param->num_output_group; ++gid) {
-    model_.CommitModel(std::move(new_trees[gid]), gid);
+  if (this->model_.learner_model_param->IsVectorLeaf()) {
+    model_.CommitModel(std::move(new_trees[0]), 0);
+  } else {
+    for (std::uint32_t gid = 0; gid < model_.learner_model_param->OutputLength(); ++gid) {
+      model_.CommitModel(std::move(new_trees[gid]), gid);
+    }
   }
   monitor_.Stop("CommitModel");
 }
@@ -564,11 +587,10 @@ void GBTree::PredictBatch(DMatrix* p_fmat,
   if (out_preds->version == 0) {
     // out_preds->Size() can be non-zero as it's initialized here before any
     // tree is built at the 0^th iteration.
-    predictor->InitOutPredictions(p_fmat->Info(), &out_preds->predictions,
-                                  model_);
+    predictor->InitOutPredictions(p_fmat->Info(), &out_preds->predictions, model_);
   }
 
-  uint32_t tree_begin, tree_end;
+  std::uint32_t tree_begin, tree_end;
   std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
   CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
   if (tree_end > tree_begin) {
@@ -577,7 +599,7 @@ void GBTree::PredictBatch(DMatrix* p_fmat,
     if (reset) {
       out_preds->version = 0;
     } else {
-      uint32_t delta = layer_end - out_preds->version;
+      std::uint32_t delta = layer_end - out_preds->version;
       out_preds->Update(delta);
     }
   }
@@ -770,6 +792,7 @@ class Dart : public GBTree {
   void PredictBatchImpl(DMatrix *p_fmat, PredictionCacheEntry *p_out_preds,
                         bool training, unsigned layer_begin,
                         unsigned layer_end) const {
+    CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented();
     auto &predictor = this->GetPredictor(&p_out_preds->predictions, p_fmat);
     CHECK(predictor);
     predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
@@ -830,6 +853,7 @@ class Dart : public GBTree {
   void InplacePredict(std::shared_ptr<DMatrix> p_fmat, float missing,
                       PredictionCacheEntry* p_out_preds, uint32_t layer_begin,
                       unsigned layer_end) const override {
+    CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented();
    uint32_t tree_begin, tree_end;
    std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
    auto n_groups = model_.learner_model_param->num_output_group;
@@ -139,14 +139,22 @@ struct DartTrainParam : public XGBoostParameter<DartTrainParam> {
 
 namespace detail {
 // From here on, layer becomes concrete trees.
-inline std::pair<uint32_t, uint32_t> LayerToTree(gbm::GBTreeModel const &model,
-                                                 size_t layer_begin,
-                                                 size_t layer_end) {
-  bst_group_t groups = model.learner_model_param->num_output_group;
-  uint32_t tree_begin = layer_begin * groups * model.param.num_parallel_tree;
-  uint32_t tree_end = layer_end * groups * model.param.num_parallel_tree;
+inline std::pair<uint32_t, uint32_t> LayerToTree(gbm::GBTreeModel const& model,
+                                                 std::uint32_t layer_begin,
+                                                 std::uint32_t layer_end) {
+  std::uint32_t tree_begin;
+  std::uint32_t tree_end;
+  if (model.learner_model_param->IsVectorLeaf()) {
+    tree_begin = layer_begin * model.param.num_parallel_tree;
+    tree_end = layer_end * model.param.num_parallel_tree;
+  } else {
+    bst_group_t groups = model.learner_model_param->OutputLength();
+    tree_begin = layer_begin * groups * model.param.num_parallel_tree;
+    tree_end = layer_end * groups * model.param.num_parallel_tree;
+  }
 
   if (tree_end == 0) {
-    tree_end = static_cast<uint32_t>(model.trees.size());
+    tree_end = model.trees.size();
   }
   if (model.trees.size() != 0) {
     CHECK_LE(tree_begin, tree_end);
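The boundary arithmetic above, restated as a hypothetical Python helper for clarity
(names mirror the C++; this helper is not part of the patch):

.. code-block:: python

  def layer_to_tree(layer_begin, layer_end, num_parallel_tree, n_groups,
                    vector_leaf, n_trees_total):
      # A vector-leaf (multi-target) tree covers every output at once, so a
      # layer holds only num_parallel_tree trees; otherwise a layer holds one
      # tree per output group.
      per_layer = num_parallel_tree if vector_leaf else num_parallel_tree * n_groups
      tree_begin = layer_begin * per_layer
      tree_end = layer_end * per_layer
      if tree_end == 0:  # zero means "up to the last tree"
          tree_end = n_trees_total
      return tree_begin, tree_end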
@@ -234,22 +242,25 @@ class GBTree : public GradientBooster {
   void LoadModel(Json const& in) override;
 
   // Number of trees per layer.
-  auto LayerTrees() const {
-    auto n_trees = model_.learner_model_param->num_output_group * model_.param.num_parallel_tree;
-    return n_trees;
+  [[nodiscard]] std::uint32_t LayerTrees() const {
+    if (model_.learner_model_param->IsVectorLeaf()) {
+      return model_.param.num_parallel_tree;
+    }
+    return model_.param.num_parallel_tree * model_.learner_model_param->OutputLength();
   }
 
   // slice the trees, out must be already allocated
   void Slice(int32_t layer_begin, int32_t layer_end, int32_t step,
              GradientBooster *out, bool* out_of_bound) const override;
 
-  int32_t BoostedRounds() const override {
+  [[nodiscard]] std::int32_t BoostedRounds() const override {
     CHECK_NE(model_.param.num_parallel_tree, 0);
     CHECK_NE(model_.learner_model_param->num_output_group, 0);
 
     return model_.trees.size() / this->LayerTrees();
   }
 
-  bool ModelFitted() const override {
+  [[nodiscard]] bool ModelFitted() const override {
     return !model_.trees.empty() || !model_.trees_to_update.empty();
   }
@@ -326,7 +326,7 @@ struct LearnerTrainParam : public XGBoostParameter<LearnerTrainParam> {
   std::string booster;
   std::string objective;
   // This is a training parameter and is not saved (nor loaded) in the model.
-  MultiStrategy multi_strategy{MultiStrategy::kComposite};
+  MultiStrategy multi_strategy{MultiStrategy::kOneOutputPerTree};
 
   // declare parameters
   DMLC_DECLARE_PARAMETER(LearnerTrainParam) {
@@ -339,12 +339,12 @@ struct LearnerTrainParam : public XGBoostParameter<LearnerTrainParam> {
         .set_default("reg:squarederror")
         .describe("Objective function used for obtaining gradient.");
     DMLC_DECLARE_FIELD(multi_strategy)
-        .add_enum("composite", MultiStrategy::kComposite)
-        .add_enum("monolithic", MultiStrategy::kMonolithic)
-        .set_default(MultiStrategy::kComposite)
+        .add_enum("one_output_per_tree", MultiStrategy::kOneOutputPerTree)
+        .add_enum("multi_output_tree", MultiStrategy::kMultiOutputTree)
+        .set_default(MultiStrategy::kOneOutputPerTree)
         .describe(
-            "Strategy used for training multi-target models. `monolithic` means building one "
-            "single tree for all targets.");
+            "Strategy used for training multi-target models. `multi_output_tree` means building "
+            "one single tree for all targets.");
   }
 };
@@ -145,7 +145,6 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
   auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());
 
   auto d_group_ptr = p_cache->DataGroupPtr(ctx);
-  auto n_groups = info.group_ptr_.size() - 1;
 
   auto d_inv_idcg = p_cache->InvIDCG(ctx);
   auto d_sorted_idx = p_cache->SortedIdx(ctx, d_predt.Values());
@@ -171,7 +170,6 @@ PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
                             HostDeviceVector<float> const &predt, bool minus,
                             std::shared_ptr<ltr::MAPCache> p_cache) {
   auto d_group_ptr = p_cache->DataGroupPtr(ctx);
-  auto n_groups = info.group_ptr_.size() - 1;
   auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
 
   predt.SetDevice(ctx->gpu_id);
@@ -87,30 +87,6 @@ bst_float PredValueByOneTree(const RegTree::FVec &p_feats, RegTree const &tree,
                            : GetLeafIndex<false, has_categorical>(tree, p_feats, cats);
   return tree[leaf].LeafValue();
 }
-
-void PredictByAllTrees(gbm::GBTreeModel const &model, const size_t tree_begin,
-                       const size_t tree_end, const size_t predict_offset,
-                       const std::vector<RegTree::FVec> &thread_temp, const size_t offset,
-                       const size_t block_size, linalg::TensorView<float, 2> out_predt) {
-  for (size_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) {
-    const size_t gid = model.tree_info[tree_id];
-    auto const &tree = *model.trees[tree_id];
-    auto const &cats = tree.GetCategoriesMatrix();
-    auto has_categorical = tree.HasCategoricalSplit();
-
-    if (has_categorical) {
-      for (std::size_t i = 0; i < block_size; ++i) {
-        out_predt(predict_offset + i, gid) +=
-            PredValueByOneTree<true>(thread_temp[offset + i], tree, cats);
-      }
-    } else {
-      for (std::size_t i = 0; i < block_size; ++i) {
-        out_predt(predict_offset + i, gid) +=
-            PredValueByOneTree<false>(thread_temp[offset + i], tree, cats);
-      }
-    }
-  }
-}
 }  // namespace scalar
 
 namespace multi {
@@ -128,7 +104,7 @@ bst_node_t GetLeafIndex(MultiTargetTree const &tree, const RegTree::FVec &feat,
 }
 
 template <bool has_categorical>
-void PredValueByOneTree(const RegTree::FVec &p_feats, MultiTargetTree const &tree,
+void PredValueByOneTree(RegTree::FVec const &p_feats, MultiTargetTree const &tree,
                         RegTree::CategoricalSplitMatrix const &cats,
                         linalg::VectorView<float> out_predt) {
   bst_node_t const leaf = p_feats.HasMissing()
@@ -140,36 +116,52 @@ void PredValueByOneTree(const RegTree::FVec &p_feats, MultiTargetTree const &tre
     out_predt(i) += leaf_value(i);
   }
 }
+}  // namespace multi
 
-void PredictByAllTrees(gbm::GBTreeModel const &model, const size_t tree_begin,
-                       const size_t tree_end, const size_t predict_offset,
-                       const std::vector<RegTree::FVec> &thread_temp, const size_t offset,
-                       const size_t block_size, linalg::TensorView<float, 2> out_predt) {
-  for (size_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) {
+namespace {
+void PredictByAllTrees(gbm::GBTreeModel const &model, std::uint32_t const tree_begin,
+                       std::uint32_t const tree_end, std::size_t const predict_offset,
+                       std::vector<RegTree::FVec> const &thread_temp, std::size_t const offset,
+                       std::size_t const block_size, linalg::MatrixView<float> out_predt) {
+  for (std::uint32_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) {
     auto const &tree = *model.trees.at(tree_id);
-    auto cats = tree.GetCategoriesMatrix();
+    auto const &cats = tree.GetCategoriesMatrix();
     bool has_categorical = tree.HasCategoricalSplit();
 
-    if (has_categorical) {
-      for (std::size_t i = 0; i < block_size; ++i) {
-        auto t_predts = out_predt.Slice(predict_offset + i, linalg::All());
-        PredValueByOneTree<true>(thread_temp[offset + i], *tree.GetMultiTargetTree(), cats,
-                                 t_predts);
+    if (tree.IsMultiTarget()) {
+      if (has_categorical) {
+        for (std::size_t i = 0; i < block_size; ++i) {
+          auto t_predts = out_predt.Slice(predict_offset + i, linalg::All());
+          multi::PredValueByOneTree<true>(thread_temp[offset + i], *tree.GetMultiTargetTree(),
+                                          cats, t_predts);
+        }
+      } else {
+        for (std::size_t i = 0; i < block_size; ++i) {
+          auto t_predts = out_predt.Slice(predict_offset + i, linalg::All());
+          multi::PredValueByOneTree<false>(thread_temp[offset + i], *tree.GetMultiTargetTree(),
+                                           cats, t_predts);
+        }
      }
     } else {
-      for (std::size_t i = 0; i < block_size; ++i) {
-        auto t_predts = out_predt.Slice(predict_offset + i, linalg::All());
-        PredValueByOneTree<false>(thread_temp[offset + i], *tree.GetMultiTargetTree(), cats,
-                                  t_predts);
+      auto const gid = model.tree_info[tree_id];
+      if (has_categorical) {
+        for (std::size_t i = 0; i < block_size; ++i) {
+          out_predt(predict_offset + i, gid) +=
+              scalar::PredValueByOneTree<true>(thread_temp[offset + i], tree, cats);
+        }
+      } else {
+        for (std::size_t i = 0; i < block_size; ++i) {
+          out_predt(predict_offset + i, gid) +=
+              scalar::PredValueByOneTree<false>(thread_temp[offset + i], tree, cats);
+        }
       }
     }
   }
 }
-}  // namespace multi
 
 template <typename DataView>
 void FVecFill(const size_t block_size, const size_t batch_offset, const int num_feature,
-              DataView* batch, const size_t fvec_offset, std::vector<RegTree::FVec>* p_feats) {
+              DataView *batch, const size_t fvec_offset, std::vector<RegTree::FVec> *p_feats) {
   for (size_t i = 0; i < block_size; ++i) {
     RegTree::FVec &feats = (*p_feats)[fvec_offset + i];
     if (feats.Size() == 0) {
@@ -181,8 +173,8 @@ void FVecFill(const size_t block_size, const size_t batch_offset, const int num_
 }
 
 template <typename DataView>
-void FVecDrop(const size_t block_size, const size_t batch_offset, DataView* batch,
-              const size_t fvec_offset, std::vector<RegTree::FVec>* p_feats) {
+void FVecDrop(const size_t block_size, const size_t batch_offset, DataView *batch,
+              const size_t fvec_offset, std::vector<RegTree::FVec> *p_feats) {
   for (size_t i = 0; i < block_size; ++i) {
     RegTree::FVec &feats = (*p_feats)[fvec_offset + i];
     const SparsePage::Inst inst = (*batch)[batch_offset + i];
@@ -190,9 +182,7 @@ void FVecDrop(const size_t block_size, const size_t batch_offset, DataView* batc
   }
 }
 
-namespace {
 static std::size_t constexpr kUnroll = 8;
-}  // anonymous namespace
 
 struct SparsePageView {
   bst_row_t base_rowid;
@@ -292,7 +282,7 @@ class AdapterView {
 
 template <typename DataView, size_t block_of_rows_size>
 void PredictBatchByBlockOfRowsKernel(DataView batch, gbm::GBTreeModel const &model,
-                                     int32_t tree_begin, int32_t tree_end,
+                                     std::uint32_t tree_begin, std::uint32_t tree_end,
                                      std::vector<RegTree::FVec> *p_thread_temp, int32_t n_threads,
                                      linalg::TensorView<float, 2> out_predt) {
   auto &thread_temp = *p_thread_temp;
@@ -310,14 +300,8 @@ void PredictBatchByBlockOfRowsKernel(DataView batch, gbm::GBTreeModel const &mod
 
     FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, p_thread_temp);
     // process block of rows through all trees to keep cache locality
-    if (model.learner_model_param->IsVectorLeaf()) {
-      multi::PredictByAllTrees(model, tree_begin, tree_end, batch_offset + batch.base_rowid,
-                               thread_temp, fvec_offset, block_size, out_predt);
-    } else {
-      scalar::PredictByAllTrees(model, tree_begin, tree_end, batch_offset + batch.base_rowid,
-                                thread_temp, fvec_offset, block_size, out_predt);
-    }
-
+    PredictByAllTrees(model, tree_begin, tree_end, batch_offset + batch.base_rowid, thread_temp,
+                      fvec_offset, block_size, out_predt);
     FVecDrop(block_size, batch_offset, &batch, fvec_offset, p_thread_temp);
   });
 }
@@ -348,7 +332,6 @@ void FillNodeMeanValues(RegTree const* tree, std::vector<float>* mean_values) {
   FillNodeMeanValues(tree, 0, mean_values);
 }
 
-namespace {
 // init thread buffers
 static void InitThreadTemp(int nthread, std::vector<RegTree::FVec> *out) {
   int prev_thread_temp_size = out->size();
@@ -411,7 +411,7 @@ class DeviceModel {
 
     this->tree_beg_ = tree_begin;
     this->tree_end_ = tree_end;
-    this->num_group = model.learner_model_param->num_output_group;
+    this->num_group = model.learner_model_param->OutputLength();
   }
 };
@@ -306,9 +306,9 @@ class HistogramBuilder {
 
 // Construct a work space for building histogram.  Eventually we should move this
 // function into histogram builder once hist tree method supports external memory.
-template <typename Partitioner>
+template <typename Partitioner, typename ExpandEntry = CPUExpandEntry>
 common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
-                                          std::vector<CPUExpandEntry> const &nodes_to_build) {
+                                          std::vector<ExpandEntry> const &nodes_to_build) {
   std::vector<size_t> partition_size(nodes_to_build.size(), 0);
   for (auto const &partition : partitioners) {
     size_t k = 0;
@@ -889,6 +889,8 @@ void RegTree::Save(dmlc::Stream* fo) const {
   CHECK_EQ(param_.num_nodes, static_cast<int>(stats_.size()));
   CHECK_EQ(param_.deprecated_num_roots, 1);
   CHECK_NE(param_.num_nodes, 0);
+  CHECK(!IsMultiTarget())
+      << "Please use JSON/UBJSON for saving models with multi-target trees.";
   CHECK(!HasCategoricalSplit())
       << "Please use JSON/UBJSON for saving models with categorical splits.";
@@ -4,36 +4,39 @@
  * \brief use quantized feature values to construct a tree
  * \author Philip Cho, Tianqi Chen, Egor Smirnov
  */
-#include <algorithm>  // for max
+#include <algorithm>  // for max, copy, transform
 #include <cstddef>    // for size_t
-#include <cstdint>    // for uint32_t
-#include <memory>     // for unique_ptr, allocator, make_unique, make_shared
-#include <ostream>    // for operator<<, char_traits, basic_ostream
-#include <tuple>      // for apply
+#include <cstdint>    // for uint32_t, int32_t
+#include <memory>     // for unique_ptr, allocator, make_unique, shared_ptr
+#include <numeric>    // for accumulate
+#include <ostream>    // for basic_ostream, char_traits, operator<<
 #include <utility>    // for move, swap
 #include <vector>     // for vector
 
 #include "../collective/communicator-inl.h"  // for Allreduce, IsDistributed
 #include "../collective/communicator.h"      // for Operation
 #include "../common/hist_util.h"             // for HistogramCuts, HistCollection
+#include "../common/linalg_op.h"             // for begin, cbegin, cend
 #include "../common/random.h"                // for ColumnSampler
+#include "../common/threading_utils.h"       // for ParallelFor
 #include "../common/timer.h"                 // for Monitor
+#include "../common/transform_iterator.h"    // for IndexTransformIter, MakeIndexTransformIter
 #include "../data/gradient_index.h"          // for GHistIndexMatrix
 #include "common_row_partitioner.h"          // for CommonRowPartitioner
+#include "dmlc/omp.h"                        // for omp_get_thread_num
 #include "dmlc/registry.h"                   // for DMLC_REGISTRY_FILE_TAG
 #include "driver.h"                          // for Driver
-#include "hist/evaluate_splits.h"            // for HistEvaluator, UpdatePredictionCacheImpl
-#include "hist/expand_entry.h"               // for CPUExpandEntry
+#include "hist/evaluate_splits.h"            // for HistEvaluator, HistMultiEvaluator, UpdatePre...
+#include "hist/expand_entry.h"               // for MultiExpandEntry, CPUExpandEntry
 #include "hist/histogram.h"                  // for HistogramBuilder, ConstructHistSpace
 #include "hist/sampler.h"                    // for SampleGradient
-#include "param.h"                           // for TrainParam, GradStats
-#include "xgboost/base.h"                    // for GradientPair, GradientPairInternal, bst_node_t
+#include "param.h"                           // for TrainParam, SplitEntryContainer, GradStats
+#include "xgboost/base.h"                    // for GradientPairInternal, GradientPair, bst_targ...
 #include "xgboost/context.h"                 // for Context
 #include "xgboost/data.h"                    // for BatchIterator, BatchSet, DMatrix, MetaInfo
 #include "xgboost/host_device_vector.h"      // for HostDeviceVector
-#include "xgboost/linalg.h"                  // for TensorView, MatrixView, UnravelIndex, All
-#include "xgboost/logging.h"                 // for LogCheck_EQ, LogCheck_GE, CHECK_EQ, LOG, LOG...
+#include "xgboost/linalg.h"                  // for All, MatrixView, TensorView, Matrix, Empty
+#include "xgboost/logging.h"                 // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_GE
 #include "xgboost/span.h"                    // for Span, operator!=, SpanIterator
+#include "xgboost/string_view.h"             // for operator<<
 #include "xgboost/task.h"                    // for ObjInfo
@ -105,6 +108,212 @@ void UpdateTree(common::Monitor *monitor_, linalg::MatrixView<GradientPair const
  monitor_->Stop(__func__);
}

/**
 * \brief Updater for building multi-target trees. The implementation simply iterates over
 *        each target.
 */
class MultiTargetHistBuilder {
 private:
  common::Monitor *monitor_{nullptr};
  TrainParam const *param_{nullptr};
  std::shared_ptr<common::ColumnSampler> col_sampler_;
  std::unique_ptr<HistMultiEvaluator> evaluator_;
  // Histogram builder for each target.
  std::vector<HistogramBuilder<MultiExpandEntry>> histogram_builder_;
  Context const *ctx_{nullptr};
  // Partitioner for each data batch.
  std::vector<CommonRowPartitioner> partitioner_;
  // Pointer to last updated tree, used for update prediction cache.
  RegTree const *p_last_tree_{nullptr};

  ObjInfo const *task_{nullptr};

 public:
  void UpdatePosition(DMatrix *p_fmat, RegTree const *p_tree,
                      std::vector<MultiExpandEntry> const &applied) {
    monitor_->Start(__func__);
    std::size_t page_id{0};
    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(this->param_))) {
      this->partitioner_.at(page_id).UpdatePosition(this->ctx_, page, applied, p_tree);
      page_id++;
    }
    monitor_->Stop(__func__);
  }

  void ApplyTreeSplit(MultiExpandEntry const &candidate, RegTree *p_tree) {
    this->evaluator_->ApplyTreeSplit(candidate, p_tree);
  }

  void InitData(DMatrix *p_fmat, RegTree const *p_tree) {
    monitor_->Start(__func__);

    std::size_t page_id = 0;
    bst_bin_t n_total_bins = 0;
    partitioner_.clear();
    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
      if (n_total_bins == 0) {
        n_total_bins = page.cut.TotalBins();
      } else {
        CHECK_EQ(n_total_bins, page.cut.TotalBins());
      }
      partitioner_.emplace_back(ctx_, page.Size(), page.base_rowid, p_fmat->IsColumnSplit());
      page_id++;
    }

    bst_target_t n_targets = p_tree->NumTargets();
    histogram_builder_.clear();
    for (std::size_t i = 0; i < n_targets; ++i) {
      histogram_builder_.emplace_back();
      histogram_builder_.back().Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
                                      collective::IsDistributed(), p_fmat->IsColumnSplit());
    }

    evaluator_ = std::make_unique<HistMultiEvaluator>(ctx_, p_fmat->Info(), param_, col_sampler_);
    p_last_tree_ = p_tree;
    monitor_->Stop(__func__);
  }

  MultiExpandEntry InitRoot(DMatrix *p_fmat, linalg::MatrixView<GradientPair const> gpair,
                            RegTree *p_tree) {
    monitor_->Start(__func__);
    MultiExpandEntry best;
    best.nid = RegTree::kRoot;
    best.depth = 0;

    auto n_targets = p_tree->NumTargets();
    linalg::Matrix<GradientPairPrecise> root_sum_tloc =
        linalg::Empty<GradientPairPrecise>(ctx_, ctx_->Threads(), n_targets);
    CHECK_EQ(root_sum_tloc.Shape(1), gpair.Shape(1));
    auto h_root_sum_tloc = root_sum_tloc.HostView();
    common::ParallelFor(gpair.Shape(0), ctx_->Threads(), [&](auto i) {
      for (bst_target_t t{0}; t < n_targets; ++t) {
        h_root_sum_tloc(omp_get_thread_num(), t) += GradientPairPrecise{gpair(i, t)};
      }
    });
    // Aggregate to the first row.
    auto root_sum = h_root_sum_tloc.Slice(0, linalg::All());
    for (std::int32_t tidx{1}; tidx < ctx_->Threads(); ++tidx) {
      for (bst_target_t t{0}; t < n_targets; ++t) {
        root_sum(t) += h_root_sum_tloc(tidx, t);
      }
    }
    CHECK(root_sum.CContiguous());
    collective::Allreduce<collective::Operation::kSum>(
        reinterpret_cast<double *>(root_sum.Values().data()), root_sum.Size() * 2);

    std::vector<MultiExpandEntry> nodes{best};
    std::size_t i = 0;
    auto space = ConstructHistSpace(partitioner_, nodes);
    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
      for (bst_target_t t{0}; t < n_targets; ++t) {
        auto t_gpair = gpair.Slice(linalg::All(), t);
        histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
                                        nodes, {}, t_gpair.Values());
      }
      i++;
    }

    auto weight = evaluator_->InitRoot(root_sum);
    auto weight_t = weight.HostView();
    std::transform(linalg::cbegin(weight_t), linalg::cend(weight_t), linalg::begin(weight_t),
                   [&](float w) { return w * param_->learning_rate; });

    p_tree->SetLeaf(RegTree::kRoot, weight_t);
    std::vector<common::HistCollection const *> hists;
    for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
      hists.push_back(&histogram_builder_[t].Histogram());
    }
    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
      evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, &nodes);
      break;
    }
    monitor_->Stop(__func__);

    return nodes.front();
  }
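InitRoot reduces the per-row gradient pairs to one sum per target: each thread accumulates into its own row of root_sum_tloc, the rows are then folded into row 0, and the result is allreduced across workers. A minimal NumPy sketch of that reduction (illustrative shapes and thread scheduling, not the actual C++ API):

    import numpy as np

    n_threads, n_rows, n_targets = 4, 1024, 2
    gpair = np.random.rand(n_rows, n_targets)  # stand-in for (grad, hess) pairs

    tloc = np.zeros((n_threads, n_targets))    # root_sum_tloc analogue
    for i in range(n_rows):                    # ParallelFor over rows in the C++ code
        tloc[i % n_threads] += gpair[i]        # each thread owns one accumulator row

    root_sum = tloc.sum(axis=0)                # aggregate to the first row
    # distributed training then allreduces root_sum across workers
    assert np.allclose(root_sum, gpair.sum(axis=0))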

  void BuildHistogram(DMatrix *p_fmat, RegTree const *p_tree,
                      std::vector<MultiExpandEntry> const &valid_candidates,
                      linalg::MatrixView<GradientPair const> gpair) {
    monitor_->Start(__func__);
    std::vector<MultiExpandEntry> nodes_to_build;
    std::vector<MultiExpandEntry> nodes_to_sub;

    for (auto const &c : valid_candidates) {
      auto left_nidx = p_tree->LeftChild(c.nid);
      auto right_nidx = p_tree->RightChild(c.nid);

      auto build_nidx = left_nidx;
      auto subtract_nidx = right_nidx;
      auto lit =
          common::MakeIndexTransformIter([&](auto i) { return c.split.left_sum[i].GetHess(); });
      auto left_sum = std::accumulate(lit, lit + c.split.left_sum.size(), .0);
      auto rit =
          common::MakeIndexTransformIter([&](auto i) { return c.split.right_sum[i].GetHess(); });
      auto right_sum = std::accumulate(rit, rit + c.split.right_sum.size(), .0);
      auto fewer_right = right_sum < left_sum;
      if (fewer_right) {
        std::swap(build_nidx, subtract_nidx);
      }
      nodes_to_build.emplace_back(build_nidx, p_tree->GetDepth(build_nidx));
      nodes_to_sub.emplace_back(subtract_nidx, p_tree->GetDepth(subtract_nidx));
    }

    std::size_t i = 0;
    auto space = ConstructHistSpace(partitioner_, nodes_to_build);
    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
      for (std::size_t t = 0; t < p_tree->NumTargets(); ++t) {
        auto t_gpair = gpair.Slice(linalg::All(), t);
        // Make sure the gradient matrix is f-order.
        CHECK(t_gpair.Contiguous());
        histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
                                        nodes_to_build, nodes_to_sub, t_gpair.Values());
      }
      i++;
    }
    monitor_->Stop(__func__);
  }
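BuildHistogram applies the usual subtraction trick per candidate: only the child with the smaller Hessian sum has its histogram built from data, and the sibling is recovered as parent minus built child. A short NumPy illustration of the identity being exploited (toy numbers, not the actual API):

    import numpy as np

    parent = np.array([6.0, 4.0, 2.5])  # per-bin gradient sums at the parent node
    left = np.array([2.0, 1.5, 0.5])    # built directly for the cheaper child
    right = parent - left               # the sibling follows by subtraction
    assert np.allclose(left + right, parent)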

  void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
                      std::vector<MultiExpandEntry> *best_splits) {
    monitor_->Start(__func__);
    std::vector<common::HistCollection const *> hists;
    for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
      hists.push_back(&histogram_builder_[t].Histogram());
    }
    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
      evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, best_splits);
      break;
    }
    monitor_->Stop(__func__);
  }

  void LeafPartition(RegTree const &tree, linalg::MatrixView<GradientPair const> gpair,
                     std::vector<bst_node_t> *p_out_position) {
    monitor_->Start(__func__);
    if (!task_->UpdateTreeLeaf()) {
      return;
    }
    for (auto const &part : partitioner_) {
      part.LeafPartition(ctx_, tree, gpair, p_out_position);
    }
    monitor_->Stop(__func__);
  }

 public:
  explicit MultiTargetHistBuilder(Context const *ctx, MetaInfo const &info, TrainParam const *param,
                                  std::shared_ptr<common::ColumnSampler> column_sampler,
                                  ObjInfo const *task, common::Monitor *monitor)
      : monitor_{monitor},
        param_{param},
        col_sampler_{std::move(column_sampler)},
        evaluator_{std::make_unique<HistMultiEvaluator>(ctx, info, param, col_sampler_)},
        ctx_{ctx},
        task_{task} {
    monitor_->Init(__func__);
  }
};

class HistBuilder {
 private:
  common::Monitor *monitor_;
@ -155,8 +364,7 @@ class HistBuilder {
  // initialize temp data structure
  void InitData(DMatrix *fmat, RegTree const *p_tree) {
    monitor_->Start(__func__);

    size_t page_id{0};
    std::size_t page_id{0};
    bst_bin_t n_total_bins{0};
    partitioner_.clear();
    for (auto const &page : fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
@ -195,7 +403,7 @@ class HistBuilder {
                RegTree *p_tree) {
    CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0));

    size_t page_id = 0;
    std::size_t page_id = 0;
    auto space = ConstructHistSpace(partitioner_, {node});
    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
      std::vector<CPUExpandEntry> nodes_to_build{node};
@ -214,13 +422,13 @@ class HistBuilder {
     * of gradient histogram is equal to snode[nid]
     */
    auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_)).begin());
    std::vector<uint32_t> const &row_ptr = gmat.cut.Ptrs();
    std::vector<std::uint32_t> const &row_ptr = gmat.cut.Ptrs();
    CHECK_GE(row_ptr.size(), 2);
    uint32_t const ibegin = row_ptr[0];
    uint32_t const iend = row_ptr[1];
    std::uint32_t const ibegin = row_ptr[0];
    std::uint32_t const iend = row_ptr[1];
    auto hist = this->histogram_builder_->Histogram()[RegTree::kRoot];
    auto begin = hist.data();
    for (uint32_t i = ibegin; i < iend; ++i) {
    for (std::uint32_t i = ibegin; i < iend; ++i) {
      GradientPairPrecise const &et = begin[i];
      grad_stat.Add(et.GetGrad(), et.GetHess());
    }
@ -259,7 +467,7 @@ class HistBuilder {
    std::vector<CPUExpandEntry> nodes_to_build(valid_candidates.size());
    std::vector<CPUExpandEntry> nodes_to_sub(valid_candidates.size());

    size_t n_idx = 0;
    std::size_t n_idx = 0;
    for (auto const &c : valid_candidates) {
      auto left_nidx = (*p_tree)[c.nid].LeftChild();
      auto right_nidx = (*p_tree)[c.nid].RightChild();
@ -275,7 +483,7 @@ class HistBuilder {
      n_idx++;
    }

    size_t page_id{0};
    std::size_t page_id{0};
    auto space = ConstructHistSpace(partitioner_, nodes_to_build);
    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
      histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
@ -311,11 +519,12 @@ class HistBuilder {

/*! \brief construct a tree using quantized feature values */
class QuantileHistMaker : public TreeUpdater {
  std::unique_ptr<HistBuilder> p_impl_;
  std::unique_ptr<HistBuilder> p_impl_{nullptr};
  std::unique_ptr<MultiTargetHistBuilder> p_mtimpl_{nullptr};
  std::shared_ptr<common::ColumnSampler> column_sampler_ =
      std::make_shared<common::ColumnSampler>();
  common::Monitor monitor_;
  ObjInfo const *task_;
  ObjInfo const *task_{nullptr};

 public:
  explicit QuantileHistMaker(Context const *ctx, ObjInfo const *task)
@ -332,7 +541,10 @@ class QuantileHistMaker : public TreeUpdater {
              const std::vector<RegTree *> &trees) override {
    if (trees.front()->IsMultiTarget()) {
      CHECK(param->monotone_constraints.empty()) << "monotone constraint" << MTNotImplemented();
      LOG(FATAL) << "Not implemented.";
      if (!p_mtimpl_) {
        this->p_mtimpl_ = std::make_unique<MultiTargetHistBuilder>(
            ctx_, p_fmat->Info(), param, column_sampler_, task_, &monitor_);
      }
    } else {
      if (!p_impl_) {
        p_impl_ =
@ -355,13 +567,14 @@ class QuantileHistMaker : public TreeUpdater {

    for (auto tree_it = trees.begin(); tree_it != trees.end(); ++tree_it) {
      if (need_copy()) {
        // Copy gradient into buffer for sampling.
        // Copy gradient into buffer for sampling. This converts C-order to F-order.
        std::copy(linalg::cbegin(h_gpair), linalg::cend(h_gpair), linalg::begin(h_sample_out));
      }
      SampleGradient(ctx_, *param, h_sample_out);
      auto *h_out_position = &out_position[tree_it - trees.begin()];
      if ((*tree_it)->IsMultiTarget()) {
        LOG(FATAL) << "Not implemented.";
        UpdateTree<MultiExpandEntry>(&monitor_, h_sample_out, p_mtimpl_.get(), p_fmat, param,
                                     h_out_position, *tree_it);
      } else {
        UpdateTree<CPUExpandEntry>(&monitor_, h_sample_out, p_impl_.get(), p_fmat, param,
                                   h_out_position, *tree_it);
@ -372,6 +585,9 @@ class QuantileHistMaker : public TreeUpdater {
  bool UpdatePredictionCache(const DMatrix *data, linalg::VectorView<float> out_preds) override {
    if (p_impl_) {
      return p_impl_->UpdatePredictionCache(data, out_preds);
    } else if (p_mtimpl_) {
      // Not yet supported.
      return false;
    } else {
      return false;
    }
@ -383,6 +599,6 @@ class QuantileHistMaker : public TreeUpdater {
XGBOOST_REGISTER_TREE_UPDATER(QuantileHistMaker, "grow_quantile_histmaker")
    .describe("Grow tree using quantized histogram.")
    .set_body([](Context const *ctx, ObjInfo const *task) {
      return new QuantileHistMaker(ctx, task);
      return new QuantileHistMaker{ctx, task};
    });
}  // namespace xgboost::tree
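End to end, the new builder is reached whenever the tree being grown is multi-target. A hedged usage sketch through the scikit-learn wrapper (synthetic data; the strategy is experimental, per the note in the demo):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(128, 4)
    y = np.random.rand(128, 2)  # two regression targets
    reg = xgb.XGBRegressor(
        tree_method="hist", n_estimators=8, multi_strategy="multi_output_tree"
    )
    reg.fit(X, y)
    assert reg.predict(X).shape == (128, 2)  # one tree emits a vector leaf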

@ -3,7 +3,7 @@ import os
import subprocess
import sys
from multiprocessing import Pool, cpu_count
from typing import Dict, Optional, Tuple
from typing import Dict, Tuple

from pylint import epylint
from test_utils import PY_PACKAGE, ROOT, cd, print_time, record_time
@ -15,8 +15,11 @@ SRCPATH = os.path.normpath(


@record_time
def run_black(rel_path: str) -> bool:
    cmd = ["black", "-q", "--check", rel_path]
def run_black(rel_path: str, fix: bool) -> bool:
    if fix:
        cmd = ["black", "-q", rel_path]
    else:
        cmd = ["black", "-q", "--check", rel_path]
    ret = subprocess.run(cmd).returncode
    if ret != 0:
        subprocess.run(["black", "--version"])
@ -31,8 +34,11 @@ Please run the following command on your machine to address the formatting error


@record_time
def run_isort(rel_path: str) -> bool:
    cmd = ["isort", f"--src={SRCPATH}", "--check", "--profile=black", rel_path]
def run_isort(rel_path: str, fix: bool) -> bool:
    if fix:
        cmd = ["isort", f"--src={SRCPATH}", "--profile=black", rel_path]
    else:
        cmd = ["isort", f"--src={SRCPATH}", "--check", "--profile=black", rel_path]
    ret = subprocess.run(cmd).returncode
    if ret != 0:
        subprocess.run(["isort", "--version"])
@ -132,7 +138,7 @@ def run_pylint() -> bool:
def main(args: argparse.Namespace) -> None:
    if args.format == 1:
        black_results = [
            run_black(path)
            run_black(path, args.fix)
            for path in [
                # core
                "python-package/",
@ -166,7 +172,7 @@ def main(args: argparse.Namespace) -> None:
        sys.exit(-1)

    isort_results = [
        run_isort(path)
        run_isort(path, args.fix)
        for path in [
            # core
            "python-package/",
@ -230,6 +236,11 @@ if __name__ == "__main__":
    parser.add_argument("--format", type=int, choices=[0, 1], default=1)
    parser.add_argument("--type-check", type=int, choices=[0, 1], default=1)
    parser.add_argument("--pylint", type=int, choices=[0, 1], default=1)
    parser.add_argument(
        "--fix",
        action="store_true",
        help="Fix the formatting issues instead of emitting an error.",
    )
    args = parser.parse_args()
    try:
        main(args)
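With the new flag, the formatting helpers rewrite files in place instead of merely failing the check. A usage sketch, assuming the script lives at tests/ci_build/lint_python.py (adjust the path to where it sits in the repository):

    python tests/ci_build/lint_python.py --format=1 --type-check=0 --pylint=0        # check only
    python tests/ci_build/lint_python.py --format=1 --type-check=0 --pylint=0 --fix  # rewrite in place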

@ -412,7 +412,7 @@ std::pair<Json, Json> TestModelSlice(std::string booster) {
    j++;
  }

  // CHECK sliced model doesn't have dependency on old one
  // CHECK sliced model doesn't have dependency on the old one
  learner.reset();
  CHECK_EQ(sliced->GetNumFeature(), kCols);

@ -473,7 +473,7 @@ inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint
                                int32_t device = Context::kCpuId) {
  size_t shape[1]{1};
  LearnerModelParam mparam(n_features, linalg::Tensor<float, 1>{{base_score}, shape, device},
                           n_groups, 1, MultiStrategy::kComposite);
                           n_groups, 1, MultiStrategy::kOneOutputPerTree);
  return mparam;
}

@ -428,7 +428,7 @@ void TestVectorLeafPrediction(Context const *ctx) {

  LearnerModelParam mparam{static_cast<bst_feature_t>(kCols),
                           linalg::Vector<float>{{0.5}, {1}, Context::kCpuId}, 1, 3,
                           MultiStrategy::kMonolithic};
                           MultiStrategy::kMultiOutputTree};

  std::vector<std::unique_ptr<RegTree>> trees;
  trees.emplace_back(new RegTree{mparam.LeafLength(), mparam.num_feature});

@ -124,11 +124,11 @@ TEST(MultiStrategy, Configure) {
  auto p_fmat = RandomDataGenerator{12ul, 3ul, 0.0}.GenerateDMatrix();
  p_fmat->Info().labels.Reshape(p_fmat->Info().num_row_, 2);
  std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
  learner->SetParams(Args{{"multi_strategy", "monolithic"}, {"num_target", "2"}});
  learner->SetParams(Args{{"multi_strategy", "multi_output_tree"}, {"num_target", "2"}});
  learner->Configure();
  ASSERT_EQ(learner->Groups(), 2);

  learner->SetParams(Args{{"multi_strategy", "monolithic"}, {"num_target", "0"}});
  learner->SetParams(Args{{"multi_strategy", "multi_output_tree"}, {"num_target", "0"}});
  ASSERT_THROW({ learner->Configure(); }, dmlc::Error);
}
}  // namespace xgboost
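A hedged Python rendering of the same configuration check through the native interface: the strategy now spells `multi_output_tree`, and a zero `num_target` is expected to be rejected at configure time, mirroring the `ASSERT_THROW` above (synthetic data):

    import numpy as np
    import xgboost as xgb

    X, y = np.random.rand(12, 3), np.random.rand(12, 2)
    booster = xgb.train(
        {"tree_method": "hist", "multi_strategy": "multi_output_tree", "num_target": 2},
        xgb.DMatrix(X, y),
        num_boost_round=4,
    )
    assert booster.predict(xgb.DMatrix(X)).shape == (12, 2)  # one column per target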

@ -116,7 +116,7 @@ def test_with_mq2008(objective, metric) -> None:
        x_valid,
        y_valid,
        qid_valid,
    ) = tm.get_mq2008(os.path.join(os.path.join(tm.demo_dir(__file__), "rank")))
    ) = tm.data.get_mq2008(os.path.join(os.path.join(tm.demo_dir(__file__), "rank")))

    if metric.find("map") != -1 or objective.find("map") != -1:
        y_train[y_train <= 1] = 0.0

@ -32,6 +32,19 @@ def train_result(param, dmat: xgb.DMatrix, num_rounds: int) -> dict:
    return result


class TestGPUUpdatersMulti:
    @given(
        hist_parameter_strategy, strategies.integers(1, 20), tm.multi_dataset_strategy
    )
    @settings(deadline=None, max_examples=50, print_blob=True)
    def test_hist(self, param, num_rounds, dataset):
        param["tree_method"] = "gpu_hist"
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])


class TestGPUUpdaters:
    cputest = test_up.TestTreeMethod()

@ -101,7 +114,7 @@ class TestGPUUpdaters:
    ) -> None:
        cat_parameters.update(hist_parameters)
        dataset = tm.TestDataset(
            "ames_housing", tm.get_ames_housing, "reg:squarederror", "rmse"
            "ames_housing", tm.data.get_ames_housing, "reg:squarederror", "rmse"
        )
        cat_parameters["tree_method"] = "gpu_hist"
        results = train_result(cat_parameters, dataset.get_dmat(), 16)

@ -15,13 +15,17 @@ rng = np.random.RandomState(1994)


def json_model(model_path: str, parameters: dict) -> dict:
    X = np.random.random((10, 3))
    y = np.random.randint(2, size=(10,))
    datasets = pytest.importorskip("sklearn.datasets")

    X, y = datasets.make_classification(64, n_features=8, n_classes=3, n_informative=6)
    if parameters.get("objective", None) == "multi:softmax":
        parameters["num_class"] = 3

    dm1 = xgb.DMatrix(X, y)

    bst = xgb.train(parameters, dm1)
    bst.save_model(model_path)

    if model_path.endswith("ubj"):
        import ubjson
        with open(model_path, "rb") as ubjfd:
@ -326,24 +330,43 @@ class TestModels:
        from_ubjraw = xgb.Booster()
        from_ubjraw.load_model(ubj_raw)

        old_from_json = from_jraw.save_raw(raw_format="deprecated")
        old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")
        if parameters.get("multi_strategy", None) != "multi_output_tree":
            # old binary model is not supported.
            old_from_json = from_jraw.save_raw(raw_format="deprecated")
            old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")

        assert old_from_json == old_from_ubj
            assert old_from_json == old_from_ubj

        raw_json = bst.save_raw(raw_format="json")
        pretty = json.dumps(json.loads(raw_json), indent=2) + "\n\n"
        bst.load_model(bytearray(pretty, encoding="ascii"))

        old_from_json = from_jraw.save_raw(raw_format="deprecated")
        old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")
        if parameters.get("multi_strategy", None) != "multi_output_tree":
            # old binary model is not supported.
            old_from_json = from_jraw.save_raw(raw_format="deprecated")
            old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")

        assert old_from_json == old_from_ubj
            assert old_from_json == old_from_ubj

        rng = np.random.default_rng()
        X = rng.random(size=from_jraw.num_features() * 10).reshape(
            (10, from_jraw.num_features())
        )
        predt_from_jraw = from_jraw.predict(xgb.DMatrix(X))
        predt_from_bst = bst.predict(xgb.DMatrix(X))
        np.testing.assert_allclose(predt_from_jraw, predt_from_bst)

    @pytest.mark.parametrize("ext", ["json", "ubj"])
    def test_model_json_io(self, ext: str) -> None:
        parameters = {"booster": "gbtree", "tree_method": "hist"}
        self.run_model_json_io(parameters, ext)
        parameters = {
            "booster": "gbtree",
            "tree_method": "hist",
            "multi_strategy": "multi_output_tree",
            "objective": "multi:softmax",
        }
        self.run_model_json_io(parameters, ext)
        parameters = {"booster": "gblinear"}
        self.run_model_json_io(parameters, ext)
        parameters = {"booster": "dart", "tree_method": "hist"}

@ -465,7 +465,7 @@ class TestCallbacks:
        assert os.path.exists(os.path.join(tmpdir, "model_" + str(i) + ".pkl"))

    def test_callback_list(self):
        X, y = tm.get_california_housing()
        X, y = tm.data.get_california_housing()
        m = xgb.DMatrix(X, y)
        callbacks = [xgb.callback.EarlyStopping(rounds=10)]
        for i in range(4):

@ -82,7 +82,7 @@ class TestRanking:
        """
        cls.dpath = 'demo/rank/'
        (x_train, y_train, qid_train, x_test, y_test, qid_test,
         x_valid, y_valid, qid_valid) = tm.get_mq2008(cls.dpath)
         x_valid, y_valid, qid_valid) = tm.data.get_mq2008(cls.dpath)

        # instantiate the matrices
        cls.dtrain = xgboost.DMatrix(x_train, y_train)

@ -11,6 +11,7 @@ from xgboost import testing as tm
from xgboost.testing.params import (
    cat_parameter_strategy,
    exact_parameter_strategy,
    hist_multi_parameter_strategy,
    hist_parameter_strategy,
)
from xgboost.testing.updater import check_init_estimation, check_quantile_loss
@ -18,11 +19,70 @@ from xgboost.testing.updater import check_init_estimation, check_quantile_loss

def train_result(param, dmat, num_rounds):
    result = {}
    xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
              evals_result=result)
    booster = xgb.train(
        param,
        dmat,
        num_rounds,
        [(dmat, "train")],
        verbose_eval=False,
        evals_result=result,
    )
    assert booster.num_features() == dmat.num_col()
    assert booster.num_boosted_rounds() == num_rounds
    assert booster.feature_names == dmat.feature_names
    assert booster.feature_types == dmat.feature_types

    return result


class TestTreeMethodMulti:
    @given(
        exact_parameter_strategy, strategies.integers(1, 20), tm.multi_dataset_strategy
    )
    @settings(deadline=None, print_blob=True)
    def test_exact(self, param: dict, num_rounds: int, dataset: tm.TestDataset) -> None:
        if dataset.name.endswith("-l1"):
            return
        param["tree_method"] = "exact"
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        assert tm.non_increasing(result["train"][dataset.metric])

    @given(
        exact_parameter_strategy,
        hist_parameter_strategy,
        strategies.integers(1, 20),
        tm.multi_dataset_strategy,
    )
    @settings(deadline=None, print_blob=True)
    def test_approx(self, param, hist_param, num_rounds, dataset):
        param["tree_method"] = "approx"
        param = dataset.set_params(param)
        param.update(hist_param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])

    @given(
        exact_parameter_strategy,
        hist_multi_parameter_strategy,
        strategies.integers(1, 20),
        tm.multi_dataset_strategy,
    )
    @settings(deadline=None, print_blob=True)
    def test_hist(
        self, param: dict, hist_param: dict, num_rounds: int, dataset: tm.TestDataset
    ) -> None:
        if dataset.name.endswith("-l1"):
            return
        param["tree_method"] = "hist"
        param = dataset.set_params(param)
        param.update(hist_param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])


class TestTreeMethod:
    USE_ONEHOT = np.iinfo(np.int32).max
    USE_PART = 1
@ -77,10 +137,14 @@ class TestTreeMethod:
        # Second prune should not change the tree
        assert after_prune == second_prune

    @given(exact_parameter_strategy, hist_parameter_strategy, strategies.integers(1, 20),
           tm.dataset_strategy)
    @given(
        exact_parameter_strategy,
        hist_parameter_strategy,
        strategies.integers(1, 20),
        tm.dataset_strategy
    )
    @settings(deadline=None, print_blob=True)
    def test_hist(self, param, hist_param, num_rounds, dataset):
    def test_hist(self, param: dict, hist_param: dict, num_rounds: int, dataset: tm.TestDataset) -> None:
        param['tree_method'] = 'hist'
        param = dataset.set_params(param)
        param.update(hist_param)
@ -88,23 +152,6 @@ class TestTreeMethod:
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(tm.sparse_datasets_strategy)
    @settings(deadline=None, print_blob=True)
    def test_sparse(self, dataset):
        param = {"tree_method": "hist", "max_bin": 64}
        hist_result = train_result(param, dataset.get_dmat(), 16)
        note(hist_result)
        assert tm.non_increasing(hist_result['train'][dataset.metric])

        param = {"tree_method": "approx", "max_bin": 64}
        approx_result = train_result(param, dataset.get_dmat(), 16)
        note(approx_result)
        assert tm.non_increasing(approx_result['train'][dataset.metric])

        np.testing.assert_allclose(
            hist_result["train"]["rmse"], approx_result["train"]["rmse"]
        )

    def test_hist_categorical(self):
        # hist must be same as exact on all-categorial data
        dpath = 'demo/data/'
@ -143,6 +190,23 @@ class TestTreeMethod:
        w = [0, 0, 1, 0]
        model.fit(X, y, sample_weight=w)

    @given(tm.sparse_datasets_strategy)
    @settings(deadline=None, print_blob=True)
    def test_sparse(self, dataset):
        param = {"tree_method": "hist", "max_bin": 64}
        hist_result = train_result(param, dataset.get_dmat(), 16)
        note(hist_result)
        assert tm.non_increasing(hist_result['train'][dataset.metric])

        param = {"tree_method": "approx", "max_bin": 64}
        approx_result = train_result(param, dataset.get_dmat(), 16)
        note(approx_result)
        assert tm.non_increasing(approx_result['train'][dataset.metric])

        np.testing.assert_allclose(
            hist_result["train"]["rmse"], approx_result["train"]["rmse"]
        )

    def run_invalid_category(self, tree_method: str) -> None:
        rng = np.random.default_rng()
        # too large
@ -365,7 +429,7 @@ class TestTreeMethod:
    ) -> None:
        cat_parameters.update(hist_parameters)
        dataset = tm.TestDataset(
            "ames_housing", tm.get_ames_housing, "reg:squarederror", "rmse"
            "ames_housing", tm.data.get_ames_housing, "reg:squarederror", "rmse"
        )
        cat_parameters["tree_method"] = tree_method
        results = train_result(cat_parameters, dataset.get_dmat(), 16)

@ -1168,7 +1168,7 @@ def test_dask_aft_survival() -> None:

def test_dask_ranking(client: "Client") -> None:
    dpath = "demo/rank/"
    mq2008 = tm.get_mq2008(dpath)
    mq2008 = tm.data.get_mq2008(dpath)
    data = []
    for d in mq2008:
        if isinstance(d, scipy.sparse.csr_matrix):