Merge branch 'master' into dev-hui

amdsc21 2023-03-08 00:39:33 +01:00
commit ed45aa2816
221 changed files with 3122 additions and 1486 deletions

View File

@ -1,4 +1,4 @@
Checks: 'modernize-*,-modernize-make-*,-modernize-use-auto,-modernize-raw-string-literal,-modernize-avoid-c-arrays,-modernize-use-trailing-return-type,google-*,-google-default-arguments,-clang-diagnostic-#pragma-messages,readability-identifier-naming'
Checks: 'modernize-*,-modernize-use-nodiscard,-modernize-concat-nested-namespaces,-modernize-make-*,-modernize-use-auto,-modernize-raw-string-literal,-modernize-avoid-c-arrays,-modernize-use-trailing-return-type,google-*,-google-default-arguments,-clang-diagnostic-#pragma-messages,readability-identifier-naming'
CheckOptions:
- { key: readability-identifier-naming.ClassCase, value: CamelCase }
- { key: readability-identifier-naming.StructCase, value: CamelCase }

View File

@ -34,11 +34,11 @@ jobs:
python -m pip install awscli
- name: Cache Maven packages
uses: actions/cache@937d24475381cd9c75ae6db12cb4e79714b926ed # v3.0.11
uses: actions/cache@6998d139ddd3e68c71e9e398d8e40b71a2f39812 # v3.2.5
with:
path: ~/.m2
key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
restore-keys: ${{ runner.os }}-m2
restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
- name: Test XGBoost4J
run: |

View File

@ -144,7 +144,18 @@ jobs:
python -m pip install wheel setuptools cpplint pylint
- name: Run lint
run: |
python dmlc-core/scripts/lint.py xgboost cpp R-package/src
python3 dmlc-core/scripts/lint.py xgboost cpp R-package/src
python3 dmlc-core/scripts/lint.py --exclude_path \
python-package/xgboost/dmlc-core \
python-package/xgboost/include \
python-package/xgboost/lib \
python-package/xgboost/rabit \
python-package/xgboost/src \
--pylint-rc python-package/.pylintrc \
xgboost \
cpp \
include src python-package
sphinx:
runs-on: ubuntu-latest

View File

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(xgboost LANGUAGES CXX C VERSION 2.0.0)
include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
@ -212,9 +212,6 @@ find_package(Threads REQUIRED)
if (USE_OPENMP)
if (APPLE)
# Require CMake 3.16+ on Mac OSX, as previous versions of CMake had trouble locating
# OpenMP on Mac. See https://github.com/dmlc/xgboost/pull/5146#issuecomment-568312706
cmake_minimum_required(VERSION 3.16)
find_package(OpenMP)
if (NOT OpenMP_FOUND)
# Try again with extra path info; required for libomp 15+ from Homebrew

View File

@ -30,7 +30,7 @@ if (USE_OPENMP)
endif (USE_OPENMP)
set_target_properties(
xgboost-r PROPERTIES
CXX_STANDARD 14
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON)

View File

@ -66,4 +66,4 @@ Imports:
jsonlite (>= 1.0),
RoxygenNote: 7.2.3
Encoding: UTF-8
SystemRequirements: GNU make, C++14
SystemRequirements: GNU make, C++17

R-package/configure (vendored)
View File

@ -2096,9 +2096,9 @@ if test -z "${R_HOME}"; then
exit 1
fi
CXX14=`"${R_HOME}/bin/R" CMD config CXX14`
CXX14STD=`"${R_HOME}/bin/R" CMD config CXX14STD`
CXX="${CXX14} ${CXX14STD}"
CXX17=`"${R_HOME}/bin/R" CMD config CXX17`
CXX17STD=`"${R_HOME}/bin/R" CMD config CXX17STD`
CXX="${CXX17} ${CXX17STD}"
CXXFLAGS=`"${R_HOME}/bin/R" CMD config CXXFLAGS`
CC=`"${R_HOME}/bin/R" CMD config CC`

View File

@ -10,9 +10,9 @@ if test -z "${R_HOME}"; then
exit 1
fi
CXX14=`"${R_HOME}/bin/R" CMD config CXX14`
CXX14STD=`"${R_HOME}/bin/R" CMD config CXX14STD`
CXX="${CXX14} ${CXX14STD}"
CXX17=`"${R_HOME}/bin/R" CMD config CXX17`
CXX17STD=`"${R_HOME}/bin/R" CMD config CXX17STD`
CXX="${CXX17} ${CXX17STD}"
CXXFLAGS=`"${R_HOME}/bin/R" CMD config CXXFLAGS`
CC=`"${R_HOME}/bin/R" CMD config CC`

View File

@ -3,7 +3,7 @@ PKGROOT=../../
ENABLE_STD_THREAD=1
# _*_ mode: Makefile; _*_
CXX_STD = CXX14
CXX_STD = CXX17
XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
-DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\
@ -36,6 +36,8 @@ OBJECTS= \
$(PKGROOT)/src/objective/hinge.o \
$(PKGROOT)/src/objective/aft_obj.o \
$(PKGROOT)/src/objective/adaptive.o \
$(PKGROOT)/src/objective/init_estimation.o \
$(PKGROOT)/src/objective/quantile_obj.o \
$(PKGROOT)/src/gbm/gbm.o \
$(PKGROOT)/src/gbm/gbtree.o \
$(PKGROOT)/src/gbm/gbtree_model.o \

View File

@ -3,7 +3,7 @@ PKGROOT=../../
ENABLE_STD_THREAD=0
# _*_ mode: Makefile; _*_
CXX_STD = CXX14
CXX_STD = CXX17
XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
-DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\
@ -36,6 +36,8 @@ OBJECTS= \
$(PKGROOT)/src/objective/hinge.o \
$(PKGROOT)/src/objective/aft_obj.o \
$(PKGROOT)/src/objective/adaptive.o \
$(PKGROOT)/src/objective/init_estimation.o \
$(PKGROOT)/src/objective/quantile_obj.o \
$(PKGROOT)/src/gbm/gbm.o \
$(PKGROOT)/src/gbm/gbtree.o \
$(PKGROOT)/src/gbm/gbtree_model.o \

View File

@ -8,9 +8,6 @@ macro(enable_sanitizer sanitizer)
if(${sanitizer} MATCHES "address")
find_package(ASan)
set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=address")
if (ASan_FOUND)
link_libraries(${ASan_LIBRARY})
endif (ASan_FOUND)
elseif(${sanitizer} MATCHES "thread")
find_package(TSan)
@ -22,16 +19,10 @@ macro(enable_sanitizer sanitizer)
elseif(${sanitizer} MATCHES "leak")
find_package(LSan)
set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=leak")
if (LSan_FOUND)
link_libraries(${LSan_LIBRARY})
endif (LSan_FOUND)
elseif(${sanitizer} MATCHES "undefined")
find_package(UBSan)
set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=undefined -fno-sanitize-recover=undefined")
if (UBSan_FOUND)
link_libraries(${UBSan_LIBRARY})
endif (UBSan_FOUND)
else()
message(FATAL_ERROR "Sanitizer ${sanitizer} not supported.")

View File

@ -178,17 +178,10 @@ function(xgboost_set_cuda_flags target)
$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8>)
endif (MSVC)
if (PLUGIN_RMM)
set_target_properties(${target} PROPERTIES
CUDA_STANDARD 17
CUDA_STANDARD_REQUIRED ON
CUDA_SEPARABLE_COMPILATION OFF)
else ()
set_target_properties(${target} PROPERTIES
CUDA_STANDARD 14
CUDA_STANDARD_REQUIRED ON
CUDA_SEPARABLE_COMPILATION OFF)
endif (PLUGIN_RMM)
endfunction(xgboost_set_cuda_flags)
macro(xgboost_link_nccl target)
@ -205,17 +198,10 @@ endmacro(xgboost_link_nccl)
# compile options
macro(xgboost_target_properties target)
if (PLUGIN_RMM)
set_target_properties(${target} PROPERTIES
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON)
else ()
set_target_properties(${target} PROPERTIES
CXX_STANDARD 14
CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON)
endif (PLUGIN_RMM)
if (HIDE_CXX_SYMBOLS)
#-- Hide all C++ symbols

View File

@ -1,7 +1,7 @@
set(ASan_LIB_NAME ASan)
find_library(ASan_LIBRARY
NAMES libasan.so libasan.so.5 libasan.so.4 libasan.so.3 libasan.so.2 libasan.so.1 libasan.so.0
NAMES libasan.so libasan.so.6 libasan.so.5 libasan.so.4 libasan.so.3 libasan.so.2 libasan.so.1 libasan.so.0
PATHS ${SANITIZER_PATH} /usr/lib64 /usr/lib /usr/local/lib64 /usr/local/lib ${CMAKE_PREFIX_PATH}/lib)
include(FindPackageHandleStandardArgs)

View File

@ -1,11 +1,11 @@
/*!
* Copyright 2019 XGBoost contributors
/**
* Copyright 2019-2023 by XGBoost contributors
*/
#ifndef XGBOOST_VERSION_CONFIG_H_
#define XGBOOST_VERSION_CONFIG_H_
#define XGBOOST_VER_MAJOR @xgboost_VERSION_MAJOR@
#define XGBOOST_VER_MINOR @xgboost_VERSION_MINOR@
#define XGBOOST_VER_PATCH @xgboost_VERSION_PATCH@
#define XGBOOST_VER_MAJOR @xgboost_VERSION_MAJOR@ /* NOLINT */
#define XGBOOST_VER_MINOR @xgboost_VERSION_MINOR@ /* NOLINT */
#define XGBOOST_VER_PATCH @xgboost_VERSION_PATCH@ /* NOLINT */
#endif // XGBOOST_VERSION_CONFIG_H_

View File

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.13)
cmake_minimum_required(VERSION 3.18)
project(xgboost-c-examples)
add_subdirectory(basic)

View File

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.13)
cmake_minimum_required(VERSION 3.18)
project(external-memory-demo LANGUAGES C VERSION 0.0.1)
find_package(xgboost REQUIRED)

View File

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.13)
cmake_minimum_required(VERSION 3.18)
project(inference-demo LANGUAGES C VERSION 0.0.1)
find_package(xgboost REQUIRED)

View File

@ -8,9 +8,9 @@ import os
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
from xgboost.dask import DaskDMatrix
import xgboost as xgb
from xgboost.dask import DaskDMatrix
def main(client):

View File

@ -5,9 +5,9 @@ Example of training with Dask on CPU
"""
from dask import array as da
from dask.distributed import Client, LocalCluster
from xgboost.dask import DaskDMatrix
import xgboost as xgb
from xgboost.dask import DaskDMatrix
def main(client):

View File

@ -6,9 +6,9 @@ import numpy as np
from dask.distributed import Client, LocalCluster
from dask_ml.datasets import make_regression
from dask_ml.model_selection import train_test_split
from xgboost.dask import DaskDMatrix
import xgboost as xgb
from xgboost.dask import DaskDMatrix
def probability_for_going_backward(epoch):

View File

@ -7,10 +7,10 @@ from dask import array as da
from dask import dataframe as dd
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from xgboost.dask import DaskDMatrix
import xgboost as xgb
from xgboost import dask as dxgb
from xgboost.dask import DaskDMatrix
def using_dask_matrix(client: Client, X, y):

View File

@ -0,0 +1,122 @@
"""
Quantile Regression
===================
The script is inspired by this awesome example in sklearn:
https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_quantile.html
"""
import argparse
from typing import Dict
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
def f(x: np.ndarray) -> np.ndarray:
"""The function to predict."""
return x * np.sin(x)
def quantile_loss(args: argparse.Namespace) -> None:
"""Train a quantile regression model."""
rng = np.random.RandomState(1994)
# Generate a synthetic dataset for the demo; the generation process is taken from
# the sklearn example.
X = np.atleast_2d(rng.uniform(0, 10.0, size=1000)).T
expected_y = f(X).ravel()
sigma = 0.5 + X.ravel() / 10.0
noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2.0 / 2.0)
y = expected_y + noise
# Train on the 0.05, 0.5, and 0.95 quantiles. The model is similar to multi-class
# and multi-target models.
alpha = np.array([0.05, 0.5, 0.95])
evals_result: Dict[str, Dict] = {}
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
# We will be using the `hist` tree method; a quantile DMatrix can be used to
# preserve memory.
# Do not use the `exact` tree method for quantile regression; otherwise the
# performance might drop.
Xy = xgb.QuantileDMatrix(X_train, y_train)
# use Xy as a reference
Xy_test = xgb.QuantileDMatrix(X_test, y_test, ref=Xy)
booster = xgb.train(
{
# Use the quantile objective function.
"objective": "reg:quantileerror",
"tree_method": "hist",
"quantile_alpha": alpha,
# Let's try not to overfit.
"learning_rate": 0.04,
"max_depth": 5,
},
Xy,
num_boost_round=32,
early_stopping_rounds=2,
# The evaluation result is a weighted average across multiple quantiles.
evals=[(Xy, "Train"), (Xy_test, "Test")],
evals_result=evals_result,
)
xx = np.atleast_2d(np.linspace(0, 10, 1000)).T
scores = booster.inplace_predict(xx)
# dim 1 is the quantiles
assert scores.shape[0] == xx.shape[0]
assert scores.shape[1] == alpha.shape[0]
y_lower = scores[:, 0] # alpha=0.05
y_med = scores[:, 1] # alpha=0.5, median
y_upper = scores[:, 2] # alpha=0.95
# Train a mse model for comparison
booster = xgb.train(
{
"objective": "reg:squarederror",
"tree_method": "hist",
# Let's try not to overfit.
"learning_rate": 0.04,
"max_depth": 5,
},
Xy,
num_boost_round=32,
early_stopping_rounds=2,
evals=[(Xy, "Train"), (Xy_test, "Test")],
evals_result=evals_result,
)
xx = np.atleast_2d(np.linspace(0, 10, 1000)).T
y_pred = booster.inplace_predict(xx)
if args.plot:
from matplotlib import pyplot as plt
fig = plt.figure(figsize=(10, 10))
plt.plot(xx, f(xx), "g:", linewidth=3, label=r"$f(x) = x\,\sin(x)$")
plt.plot(X_test, y_test, "b.", markersize=10, label="Test observations")
plt.plot(xx, y_med, "r-", label="Predicted median")
plt.plot(xx, y_pred, "m-", label="Predicted mean")
plt.plot(xx, y_upper, "k-")
plt.plot(xx, y_lower, "k-")
plt.fill_between(
xx.ravel(), y_lower, y_upper, alpha=0.4, label="Predicted 90% interval"
)
plt.xlabel("$x$")
plt.ylabel("$f(x)$")
plt.ylim(-10, 25)
plt.legend(loc="upper left")
plt.show()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--plot",
action="store_true",
help="Specify it to enable plotting the outputs.",
)
args = parser.parse_args()
quantile_loss(args)

View File

@ -10,6 +10,7 @@ from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand
from sklearn.model_selection import train_test_split
from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
spark = SparkSession.builder.master("local[*]").getOrCreate()

View File

@ -4,7 +4,6 @@ Example of training controller with NVFlare
"""
import multiprocessing
import xgboost.federated
from nvflare.apis.client import Client
from nvflare.apis.fl_context import FLContext
from nvflare.apis.impl.controller import Controller, Task
@ -12,6 +11,8 @@ from nvflare.apis.shareable import Shareable
from nvflare.apis.signal import Signal
from trainer import SupportedTasks
import xgboost.federated
class XGBoostController(Controller):
def __init__(self, port: int, world_size: int, server_key_path: str,

@ -1 +1 @@
Subproject commit dfd9365264a060a5096734b7d892e1858b6d2722
Subproject commit 81db539486ce6525b31b971545edffee2754aced

View File

@ -440,6 +440,20 @@
},
"type": "object"
},
{
"properties": {
"name": {
"const": "reg:quantileerror"
},
"quantile_loss_param": {
"type": "object",
"properties": {
"quantle_alpha": {"type": "array"}
}
}
},
"type": "object"
},
{
"type": "object",
"properties": {

View File

@ -348,6 +348,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
- ``reg:logistic``: logistic regression.
- ``reg:pseudohubererror``: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
- ``reg:absoluteerror``: Regression with L1 error. When tree model is used, leaf value is refreshed after tree construction. If used in distributed training, the leaf value is calculated as the mean value from all workers, which is not guaranteed to be optimal.
- ``reg:quantileerror``: Quantile loss, also known as ``pinball loss``. See later sections for its parameter and :ref:`sphx_glr_python_examples_quantile_regression.py` for a worked example.
- ``binary:logistic``: logistic regression for binary classification, output probability
- ``binary:logitraw``: logistic regression for binary classification, output score before logistic transformation
- ``binary:hinge``: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
@ -441,6 +442,11 @@ Parameter for using Pseudo-Huber (``reg:pseudohubererror``)
* ``huber_slope`` : A parameter used for Pseudo-Huber loss to define the :math:`\delta` term. [default = 1.0]
Parameter for using Quantile Loss (``reg:quantileerror``)
=========================================================
* ``quantile_alpha``: A scalar or a list of targeted quantiles.
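A minimal sketch of how the objective and this parameter fit together, using placeholder synthetic data; the settings mirror the worked example linked above.

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    # Placeholder data; any regression dataset works.
    rng = np.random.RandomState(0)
    X = rng.uniform(0, 10, size=(256, 1))
    y = X.ravel() * np.sin(X.ravel()) + rng.normal(scale=0.5, size=X.shape[0])

    booster = xgb.train(
        {
            "objective": "reg:quantileerror",
            "tree_method": "hist",
            # A scalar or a list of target quantiles.
            "quantile_alpha": [0.05, 0.5, 0.95],
        },
        xgb.QuantileDMatrix(X, y),
        num_boost_round=16,
    )
    # One prediction column per requested quantile.
    assert booster.inplace_predict(X).shape == (X.shape[0], 3)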
***********************
Command Line Parameters
***********************

View File

@ -45,7 +45,7 @@ Use ``find_package()`` and ``target_link_libraries()`` in your application's CMa
.. code-block:: cmake
cmake_minimum_required(VERSION 3.13)
cmake_minimum_required(VERSION 3.18)
project(your_project_name LANGUAGES C CXX VERSION your_project_version)
find_package(xgboost REQUIRED)
add_executable(your_project_name /path/to/project_file.c)

View File

@ -48,21 +48,6 @@
#define XGBOOST_ALIGNAS(X)
#endif // defined(__GNUC__) && ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ > 4)
#if defined(__GNUC__) && ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ > 4) && \
!defined(__CUDACC__) && !defined(__sun) && !defined(sun)
#include <parallel/algorithm>
#define XGBOOST_PARALLEL_SORT(X, Y, Z) __gnu_parallel::sort((X), (Y), (Z))
#define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) \
__gnu_parallel::stable_sort((X), (Y), (Z))
#elif defined(_MSC_VER) && (!__INTEL_COMPILER)
#include <ppl.h>
#define XGBOOST_PARALLEL_SORT(X, Y, Z) concurrency::parallel_sort((X), (Y), (Z))
#define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) std::stable_sort((X), (Y), (Z))
#else
#define XGBOOST_PARALLEL_SORT(X, Y, Z) std::sort((X), (Y), (Z))
#define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) std::stable_sort((X), (Y), (Z))
#endif // GLIBC VERSION
#if defined(__GNUC__)
#define XGBOOST_EXPECT(cond, ret) __builtin_expect((cond), (ret))
#else

View File

@ -4,18 +4,21 @@
#ifndef XGBOOST_CACHE_H_
#define XGBOOST_CACHE_H_
#include <xgboost/logging.h> // CHECK_EQ
#include <xgboost/logging.h> // for CHECK_EQ, CHECK
#include <cstddef> // std::size_t
#include <memory> // std::weak_ptr,std::shared_ptr,std::make_shared
#include <queue> // std:queue
#include <unordered_map> // std::unordered_map
#include <vector> // std::vector
#include <cstddef> // for size_t
#include <memory> // for weak_ptr, shared_ptr, make_shared
#include <mutex> // for mutex, lock_guard
#include <queue> // for queue
#include <thread> // for thread
#include <unordered_map> // for unordered_map
#include <utility> // for move
#include <vector> // for vector
namespace xgboost {
class DMatrix;
/**
* \brief FIFO cache for DMatrix related data.
* \brief Thread-aware FIFO cache for DMatrix related data.
*
* \tparam CacheT The type that needs to be cached.
*/
@ -30,13 +33,37 @@ class DMatrixCache {
CacheT const& Value() const { return *value; }
CacheT& Value() { return *value; }
Item(std::shared_ptr<DMatrix> m, std::shared_ptr<CacheT> v) : ref{m}, value{std::move(v)} {}
};
static constexpr std::size_t DefaultSize() { return 32; }
private:
mutable std::mutex lock_;
protected:
std::unordered_map<DMatrix const*, Item> container_;
std::queue<DMatrix const*> queue_;
struct Key {
DMatrix const* ptr;
std::thread::id const thread_id;
bool operator==(Key const& that) const {
return ptr == that.ptr && thread_id == that.thread_id;
}
};
struct Hash {
std::size_t operator()(Key const& key) const noexcept {
std::size_t f = std::hash<DMatrix const*>()(key.ptr);
std::size_t s = std::hash<std::thread::id>()(key.thread_id);
if (f == s) {
return f;
}
return f ^ s;
}
};
std::unordered_map<Key, Item, Hash> container_;
std::queue<Key> queue_;
std::size_t max_size_;
void CheckConsistent() const { CHECK_EQ(queue_.size(), container_.size()); }
@ -44,8 +71,8 @@ class DMatrixCache {
void ClearExpired() {
// Clear expired entries
this->CheckConsistent();
std::vector<DMatrix const*> expired;
std::queue<DMatrix const*> remained;
std::vector<Key> expired;
std::queue<Key> remained;
while (!queue_.empty()) {
auto p_fmat = queue_.front();
@ -61,8 +88,8 @@ class DMatrixCache {
CHECK(queue_.empty());
CHECK_EQ(remained.size() + expired.size(), container_.size());
for (auto const* p_fmat : expired) {
container_.erase(p_fmat);
for (auto const& key : expired) {
container_.erase(key);
}
while (!remained.empty()) {
auto p_fmat = remained.front();
@ -74,7 +101,9 @@ class DMatrixCache {
void ClearExcess() {
this->CheckConsistent();
while (queue_.size() >= max_size_) {
// Clear half of the entries to prevent repeatedly clearing the cache.
std::size_t half_size = max_size_ / 2;
while (queue_.size() >= half_size && !queue_.empty()) {
auto p_fmat = queue_.front();
queue_.pop();
container_.erase(p_fmat);
@ -88,7 +117,7 @@ class DMatrixCache {
*/
explicit DMatrixCache(std::size_t cache_size) : max_size_{cache_size} {}
/**
* \brief Cache a new DMatrix if it's no in the cache already.
* \brief Cache a new DMatrix if it's not in the cache already.
*
* Passing in a `shared_ptr` is critical here. First to create a `weak_ptr` inside the
* entry this shared pointer is necessary. More importantly, the life time of this
@ -101,35 +130,42 @@ class DMatrixCache {
* created.
*/
template <typename... Args>
std::shared_ptr<CacheT>& CacheItem(std::shared_ptr<DMatrix> m, Args const&... args) {
std::shared_ptr<CacheT> CacheItem(std::shared_ptr<DMatrix> m, Args const&... args) {
CHECK(m);
std::lock_guard<std::mutex> guard{lock_};
this->ClearExpired();
if (container_.size() >= max_size_) {
this->ClearExcess();
}
// after clear, cache size < max_size
CHECK_LT(container_.size(), max_size_);
auto it = container_.find(m.get());
auto key = Key{m.get(), std::this_thread::get_id()};
auto it = container_.find(key);
if (it == container_.cend()) {
// after the new DMatrix, cache size is at most max_size
container_[m.get()] = {m, std::make_shared<CacheT>(args...)};
queue_.push(m.get());
container_.emplace(key, Item{m, std::make_shared<CacheT>(args...)});
queue_.emplace(key);
}
return container_.at(m.get()).value;
return container_.at(key).value;
}
/**
* \brief Get a const reference to the underlying hash map. Clear expired caches before
* returning.
*/
decltype(container_) const& Container() {
std::lock_guard<std::mutex> guard{lock_};
this->ClearExpired();
return container_;
}
std::shared_ptr<CacheT> Entry(DMatrix const* m) const {
CHECK(container_.find(m) != container_.cend());
CHECK(!container_.at(m).ref.expired());
return container_.at(m).value;
std::lock_guard<std::mutex> guard{lock_};
auto key = Key{m, std::this_thread::get_id()};
CHECK(container_.find(key) != container_.cend());
CHECK(!container_.at(key).ref.expired());
return container_.at(key).value;
}
};
} // namespace xgboost

View File

@ -124,18 +124,7 @@ class MetaInfo {
return weights_.Size() != 0 ? weights_.HostVector()[i] : 1.0f;
}
/*! \brief get sorted indexes (argsort) of labels by absolute value (used by cox loss) */
inline const std::vector<size_t>& LabelAbsSort() const {
if (label_order_cache_.size() == labels.Size()) {
return label_order_cache_;
}
label_order_cache_.resize(labels.Size());
std::iota(label_order_cache_.begin(), label_order_cache_.end(), 0);
const auto& l = labels.Data()->HostVector();
XGBOOST_PARALLEL_STABLE_SORT(label_order_cache_.begin(), label_order_cache_.end(),
[&l](size_t i1, size_t i2) {return std::abs(l[i1]) < std::abs(l[i2]);});
return label_order_cache_;
}
const std::vector<size_t>& LabelAbsSort(Context const* ctx) const;
/*! \brief clear all the information */
void Clear();
/*!
@ -540,6 +529,16 @@ class DMatrix {
return Info().num_nonzero_ == Info().num_row_ * Info().num_col_;
}
/*! \brief Whether the data is split row-wise. */
bool IsRowSplit() const {
return Info().data_split_mode == DataSplitMode::kRow;
}
/*! \brief Whether the data is split column-wise. */
bool IsColumnSplit() const {
return Info().data_split_mode == DataSplitMode::kCol;
}
/*!
* \brief Load DMatrix from URI.
* \param uri The URI of input.

View File

@ -1,5 +1,5 @@
/**
* Copyright by XGBoost Contributors 2019-2023
* Copyright 2019-2023 by XGBoost Contributors
*/
#ifndef XGBOOST_JSON_H_
#define XGBOOST_JSON_H_
@ -372,7 +372,7 @@ class Json {
/*! \brief Use your own JsonWriter. */
static void Dump(Json json, JsonWriter* writer);
Json() : ptr_{new JsonNull} {}
Json() = default;
// number
explicit Json(JsonNumber number) : ptr_{new JsonNumber(std::move(number))} {}
@ -462,7 +462,7 @@ class Json {
IntrusivePtr<Value> const& Ptr() const { return ptr_; }
private:
IntrusivePtr<Value> ptr_;
IntrusivePtr<Value> ptr_{new JsonNull};
};
/**

View File

@ -22,13 +22,13 @@ namespace detail {
// static_cast and std::to_string.
template <typename Char, std::enable_if_t<std::is_signed<Char>::value>* = nullptr>
std::string CharToStr(Char c) {
static_assert(std::is_same<Char, char>::value, "");
static_assert(std::is_same<Char, char>::value);
return std::string{c};
}
template <typename Char, std::enable_if_t<!std::is_signed<Char>::value>* = nullptr>
std::string CharToStr(Char c) {
static_assert(std::is_same<Char, char>::value, "");
static_assert(std::is_same<Char, char>::value);
return (c <= static_cast<char>(127) ? std::string{c} : std::to_string(c));
}
} // namespace detail

View File

@ -15,14 +15,19 @@
#include <algorithm>
#include <cassert>
#include <cinttypes> // std::int32_t
#include <cinttypes> // for int32_t
#include <cstddef> // for size_t
#include <limits>
#include <string>
#include <tuple>
#include <tuple> // for make_tuple
#include <type_traits>
#include <utility>
#include <vector>
#if defined(_MSC_VER)
#include <intrin.h>
#endif // defined(_MSC_VER)
// decouple it from xgboost.
#ifndef LINALG_HD
#if defined(__CUDA__) || defined(__NVCC__)
@ -32,8 +37,7 @@
#endif // defined (__CUDA__) || defined(__NVCC__)
#endif // LINALG_HD
namespace xgboost {
namespace linalg {
namespace xgboost::linalg {
namespace detail {
struct ArrayInterfaceHandler {
@ -47,14 +51,14 @@ struct ArrayInterfaceHandler {
template <size_t dim, typename S, typename Head, size_t D>
constexpr size_t Offset(S (&strides)[D], size_t n, Head head) {
static_assert(dim < D, "");
static_assert(dim < D);
return n + head * strides[dim];
}
template <size_t dim, typename S, size_t D, typename Head, typename... Tail>
constexpr std::enable_if_t<sizeof...(Tail) != 0, size_t> Offset(S (&strides)[D], size_t n,
Head head, Tail &&...rest) {
static_assert(dim < D, "");
static_assert(dim < D);
return Offset<dim + 1>(strides, n + (head * strides[dim]), std::forward<Tail>(rest)...);
}
@ -81,7 +85,7 @@ template <typename I>
struct RangeTag {
I beg;
I end;
constexpr size_t Size() const { return end - beg; }
[[nodiscard]] constexpr size_t Size() const { return end - beg; }
};
/**
@ -146,21 +150,41 @@ inline LINALG_HD int Popc(uint64_t v) {
return __popcll(v);
#elif defined(__GNUC__) || defined(__clang__)
return __builtin_popcountll(v);
#elif defined(_MSC_VER)
#elif defined(_MSC_VER) && defined(_M_X64)
return __popcnt64(v);
#else
return NativePopc(v);
#endif // compiler
}
template <std::size_t D, typename Head>
LINALG_HD void IndexToArr(std::size_t (&arr)[D], Head head) {
static_assert(std::is_integral<std::remove_reference_t<Head>>::value, "Invalid index type.");
arr[D - 1] = head;
}
/**
* \brief Convert index from parameter pack to C-style array.
*/
template <std::size_t D, typename Head, typename... Rest>
LINALG_HD void IndexToArr(std::size_t (&arr)[D], Head head, Rest &&...index) {
static_assert(sizeof...(Rest) < D, "Index overflow.");
static_assert(std::is_integral<std::remove_reference_t<Head>>::value, "Invalid index type.");
arr[D - sizeof...(Rest) - 1] = head;
IndexToArr(arr, std::forward<Rest>(index)...);
}
template <class T, std::size_t N, std::size_t... Idx>
constexpr auto Arr2Tup(T (&arr)[N], std::index_sequence<Idx...>) {
constexpr auto ArrToTuple(T (&arr)[N], std::index_sequence<Idx...>) {
return std::make_tuple(arr[Idx]...);
}
/**
* \brief Convert a C-style array to std::tuple.
*/
template <class T, std::size_t N>
constexpr auto Arr2Tup(T (&arr)[N]) {
return Arr2Tup(arr, std::make_index_sequence<N>{});
constexpr auto ArrToTuple(T (&arr)[N]) {
return ArrToTuple(arr, std::make_index_sequence<N>{});
}
// uint division optimization inspired by the CIndexer in cupy. Division operation is
@ -183,19 +207,19 @@ LINALG_HD auto UnravelImpl(I idx, common::Span<size_t const, D> shape) {
}
}
index[0] = idx;
return Arr2Tup(index);
return ArrToTuple(index);
}
template <size_t dim, typename I, int32_t D>
void ReshapeImpl(size_t (&out_shape)[D], I s) {
static_assert(dim < D, "");
static_assert(dim < D);
out_shape[dim] = s;
}
template <size_t dim, int32_t D, typename... S, typename I,
std::enable_if_t<sizeof...(S) != 0> * = nullptr>
void ReshapeImpl(size_t (&out_shape)[D], I &&s, S &&...rest) {
static_assert(dim < D, "");
static_assert(dim < D);
out_shape[dim] = s;
ReshapeImpl<dim + 1>(out_shape, std::forward<S>(rest)...);
}
@ -225,7 +249,8 @@ struct Conjunction : std::true_type {};
template <class B1>
struct Conjunction<B1> : B1 {};
template <class B1, class... Bn>
struct Conjunction<B1, Bn...> : std::conditional_t<bool(B1::value), Conjunction<Bn...>, B1> {};
struct Conjunction<B1, Bn...>
: std::conditional_t<static_cast<bool>(B1::value), Conjunction<Bn...>, B1> {};
template <typename... Index>
using IsAllIntegral = Conjunction<std::is_integral<std::remove_reference_t<Index>>...>;
@ -246,6 +271,11 @@ constexpr detail::RangeTag<I> Range(I beg, I end) {
return {beg, end};
}
enum Order : std::uint8_t {
kC, // Row major
kF, // Col major
};
/**
* \brief A tensor view with static type and dimension. It implements indexing and slicing.
*
@ -286,8 +316,8 @@ class TensorView {
template <size_t old_dim, size_t new_dim, int32_t D, typename I>
LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D],
detail::RangeTag<I> &&range) const {
static_assert(new_dim < D, "");
static_assert(old_dim < kDim, "");
static_assert(new_dim < D);
static_assert(old_dim < kDim);
new_stride[new_dim] = stride_[old_dim];
new_shape[new_dim] = range.Size();
assert(static_cast<decltype(shape_[old_dim])>(range.end) <= shape_[old_dim]);
@ -301,8 +331,8 @@ class TensorView {
template <size_t old_dim, size_t new_dim, int32_t D, typename I, typename... S>
LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D],
detail::RangeTag<I> &&range, S &&...slices) const {
static_assert(new_dim < D, "");
static_assert(old_dim < kDim, "");
static_assert(new_dim < D);
static_assert(old_dim < kDim);
new_stride[new_dim] = stride_[old_dim];
new_shape[new_dim] = range.Size();
assert(static_cast<decltype(shape_[old_dim])>(range.end) <= shape_[old_dim]);
@ -315,8 +345,8 @@ class TensorView {
template <size_t old_dim, size_t new_dim, int32_t D>
LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], detail::AllTag) const {
static_assert(new_dim < D, "");
static_assert(old_dim < kDim, "");
static_assert(new_dim < D);
static_assert(old_dim < kDim);
new_stride[new_dim] = stride_[old_dim];
new_shape[new_dim] = shape_[old_dim];
return 0;
@ -327,8 +357,8 @@ class TensorView {
template <size_t old_dim, size_t new_dim, int32_t D, typename... S>
LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], detail::AllTag,
S &&...slices) const {
static_assert(new_dim < D, "");
static_assert(old_dim < kDim, "");
static_assert(new_dim < D);
static_assert(old_dim < kDim);
new_stride[new_dim] = stride_[old_dim];
new_shape[new_dim] = shape_[old_dim];
return MakeSliceDim<old_dim + 1, new_dim + 1, D>(new_shape, new_stride,
@ -338,7 +368,7 @@ class TensorView {
template <size_t old_dim, size_t new_dim, int32_t D, typename Index>
LINALG_HD size_t MakeSliceDim(DMLC_ATTRIBUTE_UNUSED size_t new_shape[D],
DMLC_ATTRIBUTE_UNUSED size_t new_stride[D], Index i) const {
static_assert(old_dim < kDim, "");
static_assert(old_dim < kDim);
return stride_[old_dim] * i;
}
/**
@ -347,7 +377,7 @@ class TensorView {
template <size_t old_dim, size_t new_dim, int32_t D, typename Index, typename... S>
LINALG_HD std::enable_if_t<std::is_integral<Index>::value, size_t> MakeSliceDim(
size_t new_shape[D], size_t new_stride[D], Index i, S &&...slices) const {
static_assert(old_dim < kDim, "");
static_assert(old_dim < kDim);
auto offset = stride_[old_dim] * i;
auto res =
MakeSliceDim<old_dim + 1, new_dim, D>(new_shape, new_stride, std::forward<S>(slices)...);
@ -371,7 +401,11 @@ class TensorView {
* \param device Device ordinal
*/
template <typename I, int32_t D>
LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], int32_t device)
LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], std::int32_t device)
: TensorView{data, shape, device, Order::kC} {}
template <typename I, int32_t D>
LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], std::int32_t device, Order order)
: data_{data}, ptr_{data_.data()}, device_{device} {
static_assert(D > 0 && D <= kDim, "Invalid shape.");
// shape
@ -380,7 +414,19 @@ class TensorView {
shape_[i] = 1;
}
// stride
switch (order) {
case Order::kC: {
detail::CalcStride(shape_, stride_);
break;
}
case Order::kF: {
detail::CalcStride<kDim, true>(shape_, stride_);
break;
}
default: {
SPAN_CHECK(false);
}
}
// size
this->CalcSize();
}
@ -484,19 +530,19 @@ class TensorView {
/**
* \brief Number of items in the tensor.
*/
LINALG_HD size_t Size() const { return size_; }
LINALG_HD [[nodiscard]] std::size_t Size() const { return size_; }
/**
* \brief Whether this is a contiguous array, both C and F contiguous returns true.
*/
LINALG_HD bool Contiguous() const {
LINALG_HD [[nodiscard]] bool Contiguous() const {
return data_.size() == this->Size() || this->CContiguous() || this->FContiguous();
}
/**
* \brief Whether it's a c-contiguous array.
*/
LINALG_HD bool CContiguous() const {
LINALG_HD [[nodiscard]] bool CContiguous() const {
StrideT stride;
static_assert(std::is_same<decltype(stride), decltype(stride_)>::value, "");
static_assert(std::is_same<decltype(stride), decltype(stride_)>::value);
// It's contiguous if the stride can be calculated from shape.
detail::CalcStride(shape_, stride);
return common::Span<size_t const, kDim>{stride_} == common::Span<size_t const, kDim>{stride};
@ -504,9 +550,9 @@ class TensorView {
/**
* \brief Whether it's a f-contiguous array.
*/
LINALG_HD bool FContiguous() const {
LINALG_HD [[nodiscard]] bool FContiguous() const {
StrideT stride;
static_assert(std::is_same<decltype(stride), decltype(stride_)>::value, "");
static_assert(std::is_same<decltype(stride), decltype(stride_)>::value);
// It's contiguous if the stride can be calculated from shape.
detail::CalcStride<kDim, true>(shape_, stride);
return common::Span<size_t const, kDim>{stride_} == common::Span<size_t const, kDim>{stride};
@ -524,16 +570,38 @@ class TensorView {
/**
* \brief Constructor for automatic type deduction.
*/
template <typename Container, typename I, int32_t D,
std::enable_if_t<!common::detail::IsSpan<Container>::value> * = nullptr>
auto MakeTensorView(Container &data, I const (&shape)[D], int32_t device) { // NOLINT
template <typename Container, typename... S,
std::enable_if_t<!common::detail::IsSpan<Container>::value &&
!std::is_pointer_v<Container>> * = nullptr>
auto MakeTensorView(Context const *ctx, Container &data, S &&...shape) { // NOLINT
using T = typename Container::value_type;
return TensorView<T, D>{data, shape, device};
std::size_t in_shape[sizeof...(S)];
detail::IndexToArr(in_shape, std::forward<S>(shape)...);
return TensorView<T, sizeof...(S)>{data, in_shape, ctx->gpu_id};
}
template <typename T, typename I, int32_t D>
LINALG_HD auto MakeTensorView(common::Span<T> data, I const (&shape)[D], int32_t device) {
return TensorView<T, D>{data, shape, device};
template <typename T, typename... S>
LINALG_HD auto MakeTensorView(std::int32_t device, common::Span<T> data, S &&...shape) {
std::size_t in_shape[sizeof...(S)];
detail::IndexToArr(in_shape, std::forward<S>(shape)...);
return TensorView<T, sizeof...(S)>{data, in_shape, device};
}
template <typename T, typename... S>
auto MakeTensorView(Context const *ctx, common::Span<T> data, S &&...shape) {
return MakeTensorView(ctx->gpu_id, data, std::forward<S>(shape)...);
}
template <typename T, typename... S>
auto MakeTensorView(Context const *ctx, HostDeviceVector<T> *data, S &&...shape) {
auto span = ctx->IsCPU() ? data->HostSpan() : data->DeviceSpan();
return MakeTensorView(ctx->gpu_id, span, std::forward<S>(shape)...);
}
template <typename T, typename... S>
auto MakeTensorView(Context const *ctx, HostDeviceVector<T> const *data, S &&...shape) {
auto span = ctx->IsCPU() ? data->ConstHostSpan() : data->ConstDeviceSpan();
return MakeTensorView(ctx->gpu_id, span, std::forward<S>(shape)...);
}
/**
@ -548,6 +616,18 @@ LINALG_HD auto UnravelIndex(size_t idx, common::Span<size_t const, D> shape) {
}
}
template <size_t D>
LINALG_HD auto UnravelIndex(size_t idx, std::size_t const (&shape)[D]) {
return UnravelIndex(idx, common::Span<std::size_t const, D>(shape));
}
template <typename... S>
LINALG_HD auto UnravelIndex(std::size_t idx, S... shape) {
std::size_t s[sizeof...(S)];
detail::IndexToArr(s, shape...);
return UnravelIndex(idx, common::Span<std::size_t const, sizeof...(S)>(s));
}
/**
* \brief A view over a vector, specialization of Tensor
*
@ -615,7 +695,7 @@ Json ArrayInterface(TensorView<T const, D> const &t) {
array_interface["version"] = 3;
char constexpr kT = detail::ArrayInterfaceHandler::TypeChar<T>();
static_assert(kT != '\0', "");
static_assert(kT != '\0');
if (DMLC_LITTLE_ENDIAN) {
array_interface["typestr"] = String{"<" + (kT + std::to_string(sizeof(T)))};
} else {
@ -665,6 +745,7 @@ class Tensor {
private:
HostDeviceVector<T> data_;
ShapeT shape_{0};
Order order_{Order::kC};
template <typename I, std::int32_t D>
void Initialize(I const (&shape)[D], std::int32_t device) {
@ -690,11 +771,12 @@ class Tensor {
* See \ref TensorView for parameters of this constructor.
*/
template <typename I, int32_t D>
explicit Tensor(I const (&shape)[D], int32_t device)
: Tensor{common::Span<I const, D>{shape}, device} {}
explicit Tensor(I const (&shape)[D], std::int32_t device, Order order = kC)
: Tensor{common::Span<I const, D>{shape}, device, order} {}
template <typename I, size_t D>
explicit Tensor(common::Span<I const, D> shape, int32_t device) {
explicit Tensor(common::Span<I const, D> shape, std::int32_t device, Order order = kC)
: order_{order} {
// No device unroll as this is a host only function.
std::copy(shape.data(), shape.data() + D, shape_);
for (auto i = D; i < kDim; ++i) {
@ -713,7 +795,8 @@ class Tensor {
* Initialize from 2 host iterators.
*/
template <typename It, typename I, int32_t D>
explicit Tensor(It begin, It end, I const (&shape)[D], int32_t device) {
explicit Tensor(It begin, It end, I const (&shape)[D], std::int32_t device, Order order = kC)
: order_{order} {
auto &h_vec = data_.HostVector();
h_vec.insert(h_vec.begin(), begin, end);
// shape
@ -721,8 +804,9 @@ class Tensor {
}
template <typename I, int32_t D>
explicit Tensor(std::initializer_list<T> data, I const (&shape)[D],
int32_t device = Context::kCpuId) {
explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], std::int32_t device,
Order order = kC)
: order_{order} {
auto &h_vec = data_.HostVector();
h_vec = data;
// shape
@ -752,27 +836,27 @@ class Tensor {
if (device >= 0) {
data_.SetDevice(device);
auto span = data_.DeviceSpan();
return {span, shape_, device};
return {span, shape_, device, order_};
} else {
auto span = data_.HostSpan();
return {span, shape_, device};
return {span, shape_, device, order_};
}
}
TensorView<T const, kDim> View(int32_t device) const {
if (device >= 0) {
data_.SetDevice(device);
auto span = data_.ConstDeviceSpan();
return {span, shape_, device};
return {span, shape_, device, order_};
} else {
auto span = data_.ConstHostSpan();
return {span, shape_, device};
return {span, shape_, device, order_};
}
}
auto HostView() const { return this->View(-1); }
auto HostView() { return this->View(-1); }
size_t Size() const { return data_.Size(); }
[[nodiscard]] size_t Size() const { return data_.Size(); }
auto Shape() const { return common::Span<size_t const, kDim>{shape_}; }
auto Shape(size_t i) const { return shape_[i]; }
@ -826,12 +910,26 @@ class Tensor {
void Reshape(size_t (&shape)[D]) {
this->Reshape(common::Span<size_t const, D>{shape});
}
/**
* \brief Get a host view on the slice.
*/
template <typename... S>
auto Slice(S &&...slices) const {
return this->HostView().Slice(std::forward<S>(slices)...);
}
/**
* \brief Get a host view on the slice.
*/
template <typename... S>
auto Slice(S &&...slices) {
return this->HostView().Slice(std::forward<S>(slices)...);
}
/**
* \brief Set device ordinal for this tensor.
*/
void SetDevice(int32_t device) const { data_.SetDevice(device); }
int32_t DeviceIdx() const { return data_.DeviceIdx(); }
[[nodiscard]] int32_t DeviceIdx() const { return data_.DeviceIdx(); }
};
template <typename T>
@ -889,8 +987,7 @@ void Stack(Tensor<T, D> *l, Tensor<T, D> const &r) {
shape[0] = l->Shape(0) + r.Shape(0);
});
}
} // namespace linalg
} // namespace xgboost
} // namespace xgboost::linalg
#if defined(LINALG_HD)
#undef LINALG_HD

View File

@ -8,15 +8,16 @@
#define XGBOOST_METRIC_H_
#include <dmlc/registry.h>
#include <xgboost/model.h>
#include <xgboost/data.h>
#include <xgboost/base.h>
#include <xgboost/data.h>
#include <xgboost/host_device_vector.h>
#include <xgboost/model.h>
#include <vector>
#include <string>
#include <functional>
#include <memory> // shared_ptr
#include <string>
#include <utility>
#include <vector>
namespace xgboost {
struct Context;
@ -27,7 +28,7 @@ struct Context;
*/
class Metric : public Configurable {
protected:
Context const* ctx_;
Context const* ctx_{nullptr};
public:
/*!

View File

@ -116,12 +116,13 @@ class ObjFunction : public Configurable {
*
* \param position The leaf index for each row.
* \param info MetaInfo providing labels and weights.
* \param learning_rate The learning rate for current iteration.
* \param prediction Model prediction after transformation.
* \param group_idx The group index for this tree, 0 when it's not multi-target or multi-class.
* \param p_tree Tree that needs to be updated.
*/
virtual void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& /*position*/,
MetaInfo const& /*info*/,
MetaInfo const& /*info*/, float /*learning_rate*/,
HostDeviceVector<float> const& /*prediction*/,
std::int32_t /*group_idx*/, RegTree* /*p_tree*/) const {}

View File

@ -14,6 +14,8 @@
#include <functional> // std::function
#include <memory>
#include <string>
#include <thread> // for get_id
#include <utility> // for make_pair
#include <vector>
// Forward declarations
@ -48,18 +50,17 @@ struct PredictionCacheEntry {
* \brief A container for managed prediction caches.
*/
class PredictionContainer : public DMatrixCache<PredictionCacheEntry> {
// we cache up to 32 DMatrix
std::size_t static constexpr DefaultSize() { return 32; }
// We cache up to 64 DMatrix for all threads
std::size_t static constexpr DefaultSize() { return 64; }
public:
PredictionContainer() : DMatrixCache<PredictionCacheEntry>{DefaultSize()} {}
PredictionCacheEntry& Cache(std::shared_ptr<DMatrix> m, int32_t device) {
this->CacheItem(m);
auto p_cache = this->container_.find(m.get());
PredictionCacheEntry& Cache(std::shared_ptr<DMatrix> m, std::int32_t device) {
auto p_cache = this->CacheItem(m);
if (device != Context::kCpuId) {
p_cache->second.Value().predictions.SetDevice(device);
p_cache->predictions.SetDevice(device);
}
return p_cache->second.Value();
return *p_cache;
}
};

View File

@ -24,6 +24,9 @@
#include <vector>
namespace xgboost {
namespace tree {
struct TrainParam;
}
class Json;
struct Context;
@ -56,8 +59,10 @@ class TreeUpdater : public Configurable {
* tree can be used.
*/
virtual bool HasNodePosition() const { return false; }
/*!
/**
* \brief perform update to the tree models
*
* \param param Hyper-parameter for constructing trees.
* \param gpair the gradient pair statistics of the data
* \param data The data matrix passed to the updater.
* \param out_position The leaf index for each row. The index is negated if that row is
@ -67,8 +72,8 @@ class TreeUpdater : public Configurable {
* but maybe different random seeds, usually one tree is passed in at a time,
* there can be multiple trees when we train random forest style model
*/
virtual void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* data,
common::Span<HostDeviceVector<bst_node_t>> out_position,
virtual void Update(tree::TrainParam const* param, HostDeviceVector<GradientPair>* gpair,
DMatrix* data, common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree*>& out_trees) = 0;
/*!

View File

@ -1,11 +1,11 @@
/*!
* Copyright 2019 XGBoost contributors
/**
* Copyright 2019-2023 by XGBoost contributors
*/
#ifndef XGBOOST_VERSION_CONFIG_H_
#define XGBOOST_VERSION_CONFIG_H_
#define XGBOOST_VER_MAJOR 2
#define XGBOOST_VER_MINOR 0
#define XGBOOST_VER_PATCH 0
#define XGBOOST_VER_MAJOR 2 /* NOLINT */
#define XGBOOST_VER_MINOR 0 /* NOLINT */
#define XGBOOST_VER_PATCH 0 /* NOLINT */
#endif // XGBOOST_VERSION_CONFIG_H_

View File

@ -181,7 +181,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.4.2</version>
<version>3.5.0</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
@ -392,7 +392,7 @@
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.8.0</version>
<version>4.8.1</version>
<executions>
<execution>
<id>compile</id>
@ -455,7 +455,7 @@
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.8.0</version>
<version>4.8.1</version>
<configuration>
<jvmArgs>
<jvmArg>-Xms64m</jvmArg>

View File

@ -68,7 +68,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>3.4.1</version>
<version>3.5.0</version>
<configuration>
<show>protected</show>
<nohelp>true</nohelp>

View File

@ -56,7 +56,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>3.4.1</version>
<version>3.5.0</version>
<configuration>
<show>protected</show>
<nohelp>true</nohelp>

View File

@ -15,7 +15,7 @@ if (PLUGIN_UPDATER_ONEAPI)
target_link_libraries(oneapi_plugin PUBLIC -fsycl)
set_target_properties(oneapi_plugin PROPERTIES
COMPILE_FLAGS -fsycl
CXX_STANDARD 14
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON)
if (USE_OPENMP)

View File

@ -23,7 +23,13 @@ from typing import (
import numpy
from . import collective
from .core import Booster, DMatrix, XGBoostError, _get_booster_layer_trees
from .core import (
Booster,
DMatrix,
XGBoostError,
_get_booster_layer_trees,
_parse_eval_str,
)
__all__ = [
"TrainingCallback",
@ -250,11 +256,7 @@ class CallbackContainer:
for _, name in evals:
assert name.find("-") == -1, "Dataset name should not contain `-`"
score: str = model.eval_set(evals, epoch, self.metric, self._output_margin)
splited = score.split()[1:] # into datasets
# split up `test-error:0.1234`
metric_score_str = [tuple(s.split(":")) for s in splited]
# convert to float
metric_score = [(n, float(s)) for n, s in metric_score_str]
metric_score = _parse_eval_str(score)
self._update_history(metric_score, epoch)
ret = any(c.after_iteration(model, epoch, self.history) for c in self.callbacks)
return ret

View File

@ -231,7 +231,7 @@ def allreduce(data: np.ndarray, op: Op) -> np.ndarray: # pylint:disable=invalid
if buf.base is data.base:
buf = buf.copy()
if buf.dtype not in DTYPE_ENUM__:
raise Exception(f"data type {buf.dtype} not supported")
raise TypeError(f"data type {buf.dtype} not supported")
_check_call(
_LIB.XGCommunicatorAllreduce(
buf.ctypes.data_as(ctypes.c_void_p),

View File

@ -111,6 +111,16 @@ def make_jcargs(**kwargs: Any) -> bytes:
return from_pystr_to_cstr(json.dumps(kwargs))
def _parse_eval_str(result: str) -> List[Tuple[str, float]]:
"""Parse an eval result string from the booster."""
splited = result.split()[1:]
# split up `test-error:0.1234`
metric_score_str = [tuple(s.split(":")) for s in splited]
# convert to float
metric_score = [(n, float(s)) for n, s in metric_score_str]
return metric_score
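For illustration, a minimal standalone sketch of what this helper produces; the eval string below is made up, following the format implied by the comments above (a leading iteration tag, then whitespace-separated name:score pairs).

result = "[0]\tTrain-quantile:0.2136\tTest-quantile:0.2839"
pairs = [tuple(s.split(":")) for s in result.split()[1:]]
print([(name, float(score)) for name, score in pairs])
# [('Train-quantile', 0.2136), ('Test-quantile', 0.2839)]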
IterRange = TypeVar("IterRange", Optional[Tuple[int, int]], Tuple[int, int])
@ -1926,6 +1936,8 @@ class Booster:
elif isinstance(params, str) and value is not None:
params = [(params, value)]
for key, val in cast(Iterable[Tuple[str, str]], params):
if isinstance(val, np.ndarray):
val = val.tolist()
if val is not None:
_check_call(
_LIB.XGBoosterSetParam(self.handle, c_str(key), c_str(str(val)))
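A small sketch of what this conversion enables, assuming a toy random dataset: array-valued parameters such as ``quantile_alpha`` may be passed as NumPy arrays and are turned into plain lists before being forwarded to ``XGBoosterSetParam``.

import numpy as np
import xgboost as xgb

X, y = np.random.rand(64, 3), np.random.rand(64)
booster = xgb.train(
    {
        "objective": "reg:quantileerror",
        "tree_method": "hist",
        "quantile_alpha": np.array([0.1, 0.5, 0.9]),  # ndarray accepted here
    },
    xgb.DMatrix(X, label=y),
    num_boost_round=4,
)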

View File

@ -136,7 +136,7 @@ def allreduce( # pylint:disable=invalid-name
"""
if prepare_fun is None:
return collective.allreduce(data, collective.Op(op))
raise Exception("preprocessing function is no longer supported")
raise ValueError("preprocessing function is no longer supported")
def version_number() -> int:

View File

@ -43,8 +43,9 @@ from .core import (
XGBoostError,
_convert_ntree_limit,
_deprecate_positional_args,
_parse_eval_str,
)
from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array
from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array, _is_pandas_df
from .training import train
@ -1812,32 +1813,43 @@ class XGBRFRegressor(XGBRegressor):
return self
def _get_qid(
X: ArrayLike, qid: Optional[ArrayLike]
) -> Tuple[ArrayLike, Optional[ArrayLike]]:
"""Get the special qid column from X if exists."""
if (_is_pandas_df(X) or _is_cudf_df(X)) and hasattr(X, "qid"):
if qid is not None:
raise ValueError(
"Found both the special column `qid` in `X` and the `qid` from the"
"`fit` method. Please remove one of them."
)
q_x = X.qid
X = X.drop("qid", axis=1)
return X, q_x
return X, qid
@xgboost_model_doc(
"Implementation of the Scikit-Learn API for XGBoost Ranking.",
"""Implementation of the Scikit-Learn API for XGBoost Ranking.""",
["estimators", "model"],
end_note="""
.. note::
The default objective for XGBRanker is "rank:pairwise"
.. note::
A custom objective function is currently not supported by XGBRanker.
Likewise, a custom metric function is not supported either.
.. note::
Query group information is required for ranking tasks by either using the
`group` parameter or `qid` parameter in `fit` method. This information is
not required in 'predict' method and multiple groups can be predicted on
a single call to `predict`.
Query group information is only required for ranking training but not
prediction. Multiple groups can be predicted on a single call to
:py:meth:`predict`.
When fitting the model with the `group` parameter, your data needs to be sorted
by query group first. `group` must be an array that contains the size of each
by the query group first. `group` is an array that contains the size of each
query group.
When fitting the model with the `qid` parameter, your data does not need
sorting. `qid` must be an array that contains the group of each training
sample.
Similarly, when fitting the model with the `qid` parameter, the data should be
sorted according to query index and `qid` is an array that contains the query
index for each training sample.
For example, if your original data look like:
@ -1859,9 +1871,10 @@ class XGBRFRegressor(XGBRegressor):
| 2 | 1 | x_7 |
+-------+-----------+---------------+
then `fit` method can be called with either `group` array as ``[3, 4]``
or with `qid` as ``[`1, 1, 1, 2, 2, 2, 2]``, that is the qid column.
""",
then :py:meth:`fit` method can be called with either `group` array as ``[3, 4]``
or with `qid` as ``[1, 1, 1, 2, 2, 2, 2]``, that is the qid column. Also, the
`qid` can be a special column of input `X` instead of a separated parameter, see
:py:meth:`fit` for more info.""",
)
class XGBRanker(XGBModel, XGBRankerMixIn):
# pylint: disable=missing-docstring,too-many-arguments,invalid-name
@ -1873,6 +1886,16 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
if "rank:" not in objective:
raise ValueError("please use XGBRanker for ranking task")
def _create_ltr_dmatrix(
self, ref: Optional[DMatrix], data: ArrayLike, qid: ArrayLike, **kwargs: Any
) -> DMatrix:
data, qid = _get_qid(data, qid)
if kwargs.get("group", None) is None and qid is None:
raise ValueError("Either `group` or `qid` is required for ranking task")
return super()._create_dmatrix(ref=ref, data=data, qid=qid, **kwargs)
@_deprecate_positional_args
def fit(
self,
@ -1907,6 +1930,23 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
X :
Feature matrix. See :ref:`py-data` for a list of supported types.
When this is a :py:class:`pandas.DataFrame` or a :py:class:`cudf.DataFrame`,
it may contain a special column called ``qid`` for specifying the query
index. Using a special column is the same as using the `qid` parameter,
except for being compatible with sklearn utility functions like
:py:func:`sklearn.model_selection.cross_val_score`. The same convention
applies to the :py:meth:`XGBRanker.score` and :py:meth:`XGBRanker.predict`.
+-----+----------------+----------------+
| qid | feat_0 | feat_1 |
+-----+----------------+----------------+
| 0 | :math:`x_{00}` | :math:`x_{01}` |
+-----+----------------+----------------+
| 1 | :math:`x_{10}` | :math:`x_{11}` |
+-----+----------------+----------------+
| 1 | :math:`x_{20}` | :math:`x_{21}` |
+-----+----------------+----------------+
When the ``tree_method`` is set to ``hist`` or ``gpu_hist``, internally, the
:py:class:`QuantileDMatrix` will be used instead of the :py:class:`DMatrix`
for conserving memory. However, this has performance implications when the
@ -1916,12 +1956,12 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
y :
Labels
group :
Size of each query group of training data. Should have as many elements as the
query groups in the training data. If this is set to None, then user must
provide qid.
Size of each query group of training data. Should have as many elements as
the query groups in the training data. If this is set to None, then user
must provide qid.
qid :
Query ID for each training sample. Should have the size of n_samples. If
this is set to None, then user must provide group.
this is set to None, then user must provide group or a special column in X.
sample_weight :
Query group weights
@ -1929,8 +1969,9 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
In ranking task, one weight is assigned to each query group/id (not each
data point). This is because we only care about the relative ordering of
data points within each group, so it doesn't make sense to assign weights
to individual data points.
data points within each group, so it doesn't make sense to assign
weights to individual data points.
base_margin :
Global bias for each instance.
eval_set :
@ -1942,7 +1983,8 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
query groups in the ``i``-th pair in **eval_set**.
eval_qid :
A list in which ``eval_qid[i]`` is the array containing query ID of ``i``-th
pair in **eval_set**.
pair in **eval_set**. The special column convention in `X` applies to
validation datasets as well.
eval_metric : str, list of str, optional
.. deprecated:: 1.6.0
@ -1985,16 +2027,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
Use `callbacks` in :py:meth:`__init__` or :py:meth:`set_params` instead.
"""
# check if group information is provided
with config_context(verbosity=self.verbosity):
if group is None and qid is None:
raise ValueError("group or qid is required for ranking task")
if eval_set is not None:
if eval_group is None and eval_qid is None:
raise ValueError(
"eval_group or eval_qid is required if eval_set is not None"
)
train_dmatrix, evals = _wrap_evaluation_matrices(
missing=self.missing,
X=X,
@ -2009,7 +2042,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
base_margin_eval_set=base_margin_eval_set,
eval_group=eval_group,
eval_qid=eval_qid,
create_dmatrix=self._create_dmatrix,
create_dmatrix=self._create_ltr_dmatrix,
enable_categorical=self.enable_categorical,
feature_types=self.feature_types,
)
@ -2044,3 +2077,59 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
self._set_evaluation_result(evals_result)
return self
def predict(
self,
X: ArrayLike,
output_margin: bool = False,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> ArrayLike:
X, _ = _get_qid(X, None)
return super().predict(
X,
output_margin,
ntree_limit,
validate_features,
base_margin,
iteration_range,
)
def apply(
self,
X: ArrayLike,
ntree_limit: int = 0,
iteration_range: Optional[Tuple[int, int]] = None,
) -> ArrayLike:
X, _ = _get_qid(X, None)
return super().apply(X, ntree_limit, iteration_range)
def score(self, X: ArrayLike, y: ArrayLike) -> float:
"""Evaluate score for data using the last evaluation metric.
Parameters
----------
X : pd.DataFrame|cudf.DataFrame
Feature matrix. A DataFrame with a special `qid` column.
y :
Labels
Returns
-------
score :
The result of the last evaluation metric for the ranker.
"""
X, qid = _get_qid(X, None)
Xyq = DMatrix(X, y, qid=qid)
if callable(self.eval_metric):
metric = ltr_metric_decorator(self.eval_metric, self.n_jobs)
result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric)
else:
result_str = self.get_booster().eval(Xyq)
metric_score = _parse_eval_str(result_str)
return metric_score[-1][1]
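A small usage sketch tying the new pieces together, assuming a toy pandas frame: fit picks the query index up from the special ``qid`` column, predict drops the column before scoring rows, and score reports the last evaluation metric.

import numpy as np
import pandas as pd
import xgboost as xgb

# Two query groups packed into the frame via the special column.
df = pd.DataFrame({"qid": [1, 1, 1, 2, 2, 2, 2], "feat": [3.0, 2.0, 1.0, 5.0, 4.0, 2.0, 1.0]})
y = np.array([2, 1, 0, 3, 2, 1, 0])

ranker = xgb.XGBRanker(n_estimators=4, tree_method="hist", eval_metric="ndcg")
ranker.fit(df, y)            # qid is taken from the special column
print(ranker.predict(df))    # the qid column is dropped before prediction
print(ranker.score(df, y))   # last evaluation metric (NDCG here)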

View File

@ -34,12 +34,12 @@ from pyspark.sql.types import (
ShortType,
)
from scipy.special import expit, softmax # pylint: disable=no-name-in-module
from xgboost.compat import is_cudf_available
from xgboost.core import Booster
from xgboost.training import train as worker_train
import xgboost
from xgboost import XGBClassifier, XGBRanker, XGBRegressor
from xgboost.compat import is_cudf_available
from xgboost.core import Booster
from xgboost.training import train as worker_train
from .data import (
_read_csr_matrix_from_unwrapped_spark_vec,
@ -314,8 +314,19 @@ class _SparkXGBParams(
raise ValueError("Only string type 'objective' param is allowed.")
if self.getOrDefault(self.eval_metric) is not None:
if not isinstance(self.getOrDefault(self.eval_metric), str):
raise ValueError("Only string type 'eval_metric' param is allowed.")
if not (
isinstance(self.getOrDefault(self.eval_metric), str)
or (
isinstance(self.getOrDefault(self.eval_metric), List)
and all(
isinstance(metric, str)
for metric in self.getOrDefault(self.eval_metric)
)
)
):
raise ValueError(
"Only string type or list of string type 'eval_metric' param is allowed."
)
if self.getOrDefault(self.early_stopping_rounds) is not None:
if not (

View File

@ -6,9 +6,9 @@ from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tupl
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from xgboost.compat import concat
from xgboost import DataIter, DMatrix, QuantileDMatrix, XGBModel
from xgboost.compat import concat
from .._typing import ArrayLike
from ..core import _convert_ntree_limit

View File

@ -8,6 +8,7 @@ import uuid
from pyspark import SparkFiles, cloudpickle
from pyspark.ml.util import DefaultParamsReader, DefaultParamsWriter, MLReader, MLWriter
from pyspark.sql import SparkSession
from xgboost.core import Booster
from .utils import get_class_name, get_logger

View File

@ -8,9 +8,9 @@ from typing import Any, Callable, Dict, Set, Type
import pyspark
from pyspark import BarrierTaskContext, SparkContext
from pyspark.sql.session import SparkSession
from xgboost.tracker import RabitTracker
from xgboost import collective
from xgboost.tracker import RabitTracker
def get_class_name(cls: Type) -> str:

View File

@ -33,10 +33,10 @@ from urllib import request
import numpy as np
import pytest
from scipy import sparse
from xgboost.core import ArrayLike
from xgboost.sklearn import SklObjective
import xgboost as xgb
from xgboost.core import ArrayLike
from xgboost.sklearn import SklObjective
hypothesis = pytest.importorskip("hypothesis")

View File

@ -2,9 +2,9 @@
import numpy as np
from dask import array as da
from distributed import Client
from xgboost.testing.updater import get_basescore
import xgboost as xgb
from xgboost.testing.updater import get_basescore
def check_init_estimation_clf(tree_method: str, client: Client) -> None:

View File

@ -2,6 +2,7 @@
from typing import Any, Generator, Tuple, Union
import numpy as np
from xgboost.data import pandas_pyarrow_mapper

View File

@ -0,0 +1,72 @@
# pylint: disable=too-many-locals
"""Tests for learning to rank."""
from types import ModuleType
from typing import Any
import numpy as np
import pytest
import xgboost as xgb
from xgboost import testing as tm
def run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None:
"""Test ranking with qid packed into X."""
import scipy.sparse
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedGroupKFold, cross_val_score
X, y, q, _ = tm.make_ltr(n_samples=128, n_features=2, n_query_groups=8, max_rel=3)
# pack qid into x using dataframe
df = impl.DataFrame(X)
df["qid"] = q
ranker = xgb.XGBRanker(n_estimators=3, eval_metric="ndcg", tree_method=tree_method)
ranker.fit(df, y)
s = ranker.score(df, y)
assert s > 0.7
# works with validation datasets as well
valid_df = df.copy()
valid_df.iloc[0, 0] = 3.0
ranker.fit(df, y, eval_set=[(valid_df, y)])
# same as passing qid directly
ranker = xgb.XGBRanker(n_estimators=3, eval_metric="ndcg", tree_method=tree_method)
ranker.fit(X, y, qid=q)
s1 = ranker.score(df, y)
assert np.isclose(s, s1)
# Works with standard sklearn cv
if tree_method != "gpu_hist":
# we need cuML for this.
kfold = StratifiedGroupKFold(shuffle=False)
results = cross_val_score(ranker, df, y, cv=kfold, groups=df.qid)
assert len(results) == 5
# Works with custom metric
def neg_mse(*args: Any, **kwargs: Any) -> float:
return -float(mean_squared_error(*args, **kwargs))
ranker = xgb.XGBRanker(n_estimators=3, eval_metric=neg_mse, tree_method=tree_method)
ranker.fit(df, y, eval_set=[(valid_df, y)])
score = ranker.score(valid_df, y)
assert np.isclose(score, ranker.evals_result()["validation_0"]["neg_mse"][-1])
# Works with sparse data
if tree_method != "gpu_hist":
# no sparse with cuDF
X_csr = scipy.sparse.csr_matrix(X)
df = impl.DataFrame.sparse.from_spmatrix(
X_csr, columns=[str(i) for i in range(X.shape[1])]
)
df["qid"] = q
ranker = xgb.XGBRanker(
n_estimators=3, eval_metric="ndcg", tree_method=tree_method
)
ranker.fit(df, y)
s2 = ranker.score(df, y)
assert np.isclose(s2, s)
with pytest.raises(ValueError, match="Either `group` or `qid`."):
ranker.fit(df, y, eval_set=[(X, y)])

View File

@ -8,9 +8,9 @@ import tempfile
from typing import Any, Callable, Dict, Type
import numpy as np
from xgboost._typing import ArrayLike
import xgboost as xgb
from xgboost._typing import ArrayLike
def validate_leaf_output(leaf: np.ndarray, num_parallel_tree: int) -> None:

View File

@ -1,9 +1,12 @@
"""Tests for updaters."""
import json
from functools import partial, update_wrapper
from typing import Dict
import numpy as np
import xgboost as xgb
import xgboost.testing as tm
def get_basescore(model: xgb.XGBModel) -> float:
@ -68,3 +71,91 @@ def check_init_estimation(tree_method: str) -> None:
n_samples=4096, n_labels=3, n_classes=5, random_state=17
)
run_clf(X, y)
# pylint: disable=too-many-locals
def check_quantile_loss(tree_method: str, weighted: bool) -> None:
"""Test for quantile loss."""
from sklearn.datasets import make_regression
from sklearn.metrics import mean_pinball_loss
from xgboost.sklearn import _metric_decorator
n_samples = 4096
n_features = 8
n_estimators = 8
# A non-zero base score can cause floating point differences with the GPU predictor.
# Multi-class prediction has a small difference from the single-target case in the prediction kernel.
base_score = 0.0
rng = np.random.RandomState(1994)
# pylint: disable=unbalanced-tuple-unpacking
X, y = make_regression(
n_samples=n_samples,
n_features=n_features,
random_state=rng,
)
if weighted:
weight = rng.random(size=n_samples)
else:
weight = None
Xy = xgb.QuantileDMatrix(X, y, weight=weight)
alpha = np.array([0.1, 0.5])
evals_result: Dict[str, Dict] = {}
booster_multi = xgb.train(
{
"objective": "reg:quantileerror",
"tree_method": tree_method,
"quantile_alpha": alpha,
"base_score": base_score,
},
Xy,
num_boost_round=n_estimators,
evals=[(Xy, "Train")],
evals_result=evals_result,
)
predt_multi = booster_multi.predict(Xy, strict_shape=True)
assert tm.non_increasing(evals_result["Train"]["quantile"])
assert evals_result["Train"]["quantile"][-1] < 20.0
# check that there's a way to use custom metric and compare the results.
metrics = [
_metric_decorator(
update_wrapper(
partial(mean_pinball_loss, sample_weight=weight, alpha=alpha[i]),
mean_pinball_loss,
)
)
for i in range(alpha.size)
]
predts = np.empty(predt_multi.shape)
for i in range(alpha.shape[0]):
a = alpha[i]
booster_i = xgb.train(
{
"objective": "reg:quantileerror",
"tree_method": tree_method,
"quantile_alpha": a,
"base_score": base_score,
},
Xy,
num_boost_round=n_estimators,
evals=[(Xy, "Train")],
custom_metric=metrics[i],
evals_result=evals_result,
)
assert tm.non_increasing(evals_result["Train"]["quantile"])
assert evals_result["Train"]["quantile"][-1] < 30.0
np.testing.assert_allclose(
np.array(evals_result["Train"]["quantile"]),
np.array(evals_result["Train"]["mean_pinball_loss"]),
atol=1e-6,
rtol=1e-6,
)
predts[:, i] = booster_i.predict(Xy)
for i in range(alpha.shape[0]):
np.testing.assert_allclose(predts[:, i], predt_multi[:, i])
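As a reference for what the test compares against, a minimal NumPy sketch of the pinball (quantile) loss; the helper name is illustrative and `sklearn.metrics.mean_pinball_loss` remains the authoritative implementation used above.

import numpy as np

def pinball_loss(y_true, y_pred, alpha):
    # alpha * residual when under-predicting, (alpha - 1) * residual when over-predicting.
    diff = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.mean(np.maximum(alpha * diff, (alpha - 1.0) * diff)))

# alpha = 0.9 penalizes under-prediction nine times harder than over-prediction,
# so minimizing it pushes predictions toward the 90th percentile of y.
print(pinball_loss([1.0, 2.0, 3.0], [2.0, 2.0, 2.0], 0.5))  # 0.333...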

View File

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.3)
cmake_minimum_required(VERSION 3.18)
find_package(Threads REQUIRED)

View File

@ -455,7 +455,8 @@ XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char
xgboost_CHECK_C_ARG_PTR(indptr);
xgboost_CHECK_C_ARG_PTR(indices);
xgboost_CHECK_C_ARG_PTR(data);
data::CSCArrayAdapter adapter{StringView{indptr}, StringView{indices}, StringView{data}, nrow};
data::CSCArrayAdapter adapter{StringView{indptr}, StringView{indices}, StringView{data},
static_cast<std::size_t>(nrow)};
xgboost_CHECK_C_ARG_PTR(c_json_config);
auto config = Json::Load(StringView{c_json_config});
float missing = GetMissing(config);

View File

@ -1,5 +1,5 @@
/*!
* Copyright 2022 XGBoost contributors
/**
* Copyright 2022-2023 by XGBoost contributors
*/
#pragma once
#include <string>
@ -9,7 +9,7 @@
namespace xgboost {
namespace collective {
/*!
/**
* \brief Initialize the collective communicator.
*
* Currently the communicator API is experimental, function signatures may change in the future
@ -140,6 +140,19 @@ inline void Broadcast(std::string *sendrecv_data, int root) {
}
}
/**
* @brief Gathers data from all processes and distributes it to all processes.
*
* This assumes all ranks have the same size, and input data has been sliced into the
* corresponding position.
*
* @param send_receive_buffer Buffer storing the data.
* @param size Size of the data in bytes.
*/
inline void Allgather(void *send_receive_buffer, std::size_t size) {
Communicator::Get()->AllGather(send_receive_buffer, size);
}
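A small sketch of the contract described in the comment, simulated with plain Python lists instead of the collective API; the layout it relies on is the assumption spelled out above (every rank owns an equally sized slice placed at its own offset).

def simulated_allgather(per_rank_slices):
    # Each rank contributes its slice; afterwards every rank holds the
    # concatenation of all slices in rank order, so all buffers are identical.
    full = [x for s in per_rank_slices for x in s]
    return [list(full) for _ in per_rank_slices]

# rank 0 owns [1, 2], rank 1 owns [3, 4]; both finish with [1, 2, 3, 4]
print(simulated_allgather([[1, 2], [3, 4]]))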
/*!
* \brief Perform in-place allreduce. This function is NOT thread-safe.
*
@ -197,7 +210,7 @@ inline void Allreduce(uint64_t *send_receive_buffer, size_t count) {
template <Operation op, typename T,
typename = std::enable_if_t<std::is_same<size_t, T>{} && !std::is_same<uint64_t, T>{}> >
inline void Allreduce(T *send_receive_buffer, size_t count) {
static_assert(sizeof(T) == sizeof(uint64_t), "");
static_assert(sizeof(T) == sizeof(uint64_t));
Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kUInt64, op);
}

View File

@ -1,10 +1,32 @@
/*!
* Copyright 2022 by XGBoost Contributors
/**
* Copyright 2022-2023 by XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_ALGORITHM_H_
#define XGBOOST_COMMON_ALGORITHM_H_
#include <algorithm> // std::upper_bound
#include <cinttypes> // std::size_t
#include <algorithm> // upper_bound, stable_sort, sort, max
#include <cinttypes> // size_t
#include <functional> // less
#include <iterator> // iterator_traits, distance
#include <vector> // vector
#include "numeric.h" // Iota
#include "xgboost/context.h" // Context
// clang with libstdc++ works as well
#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__sun) && !defined(sun) && \
!defined(__APPLE__) && __has_include(<omp.h>)
#define GCC_HAS_PARALLEL 1
#endif // GCC_HAS_PARALLEL
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
#define MSVC_HAS_PARALLEL 1
#endif // MSC
#if defined(GCC_HAS_PARALLEL)
#include <parallel/algorithm>
#elif defined(MSVC_HAS_PARALLEL)
#include <ppl.h>
#endif // GLIBC VERSION
namespace xgboost {
namespace common {
@ -13,6 +35,63 @@ auto SegmentId(It first, It last, Idx idx) {
std::size_t segment_id = std::upper_bound(first, last, idx) - 1 - first;
return segment_id;
}
template <typename Iter, typename Comp>
void StableSort(Context const *ctx, Iter begin, Iter end, Comp &&comp) {
if (ctx->Threads() > 1) {
#if defined(GCC_HAS_PARALLEL)
__gnu_parallel::stable_sort(begin, end, comp,
__gnu_parallel::default_parallel_tag(ctx->Threads()));
#else
// For MSVC PPL the only stable parallel sort is radix sort, so fall back to the serial stable sort.
std::stable_sort(begin, end, comp);
#endif // GLIBC VERSION
} else {
std::stable_sort(begin, end, comp);
}
}
template <typename Iter, typename Comp>
void Sort(Context const *ctx, Iter begin, Iter end, Comp comp) {
if (ctx->Threads() > 1) {
#if defined(GCC_HAS_PARALLEL)
__gnu_parallel::sort(begin, end, comp, __gnu_parallel::default_parallel_tag(ctx->Threads()));
#elif defined(MSVC_HAS_PARALLEL)
auto n = std::distance(begin, end);
// Use the chunk size as a hint for the number of threads; the concurrency module takes no
// local policy/scheduler input.
std::size_t chunk_size = n / ctx->Threads();
// 2048 is the default of msvc ppl as of v2022.
chunk_size = std::max(chunk_size, static_cast<std::size_t>(2048));
concurrency::parallel_sort(begin, end, comp, chunk_size);
#else
std::sort(begin, end, comp);
#endif // GLIBC VERSION
} else {
std::sort(begin, end, comp);
}
}
template <typename Idx, typename Iter, typename V = typename std::iterator_traits<Iter>::value_type,
typename Comp = std::less<V>>
std::vector<Idx> ArgSort(Context const *ctx, Iter begin, Iter end, Comp comp = std::less<V>{}) {
CHECK(ctx->IsCPU());
auto n = std::distance(begin, end);
std::vector<Idx> result(n);
Iota(ctx, result.begin(), result.end(), 0);
auto op = [&](Idx const &l, Idx const &r) { return comp(begin[l], begin[r]); };
StableSort(ctx, result.begin(), result.end(), op);
return result;
}
} // namespace common
} // namespace xgboost
#if defined(GCC_HAS_PARALLEL)
#undef GCC_HAS_PARALLEL
#endif // defined(GCC_HAS_PARALLEL)
#if defined(MSVC_HAS_PARALLEL)
#undef MSVC_HAS_PARALLEL
#endif // defined(MSVC_HAS_PARALLEL)
#endif // XGBOOST_COMMON_ALGORITHM_H_
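The `ArgSort` helper above is a stable index sort; a minimal NumPy sketch of the same idea (ignoring the parallel dispatch and the custom comparator) might look like:

import numpy as np

def arg_sort(values):
    # result[i] is the index of the i-th smallest value; ties keep their input order.
    return np.argsort(values, kind="stable")

print(arg_sort(np.array([0.3, 0.1, 0.1, 0.2])))  # [1 2 3 0]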

View File

@ -42,9 +42,9 @@ constexpr inline bst_cat_t OutOfRangeCat() {
inline XGBOOST_DEVICE bool InvalidCat(float cat) {
constexpr auto kMaxCat = OutOfRangeCat();
static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat)) == kMaxCat, "");
static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat + 1)) != kMaxCat + 1, "");
static_assert(static_cast<float>(kMaxCat + 1) == kMaxCat, "");
static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat)) == kMaxCat);
static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat + 1)) != kMaxCat + 1);
static_assert(static_cast<float>(kMaxCat + 1) == kMaxCat);
return cat < 0 || cat >= kMaxCat;
}

View File

@ -270,7 +270,9 @@ struct RyuPowLogUtils {
*/
static uint32_t MulPow5InvDivPow2(const uint32_t m, const uint32_t q,
const int32_t j) noexcept(true) {
return MulShift(m, kFloatPow5InvSplit[q], j);
static_assert(sizeof(kFloatPow5InvSplit) == 55 * sizeof(std::uint64_t));
assert(q < 55);
return MulShift(m, kFloatPow5InvSplit[q], j); // NOLINT
}
/*
@ -495,12 +497,10 @@ class PowerBaseComputer {
static_cast<int32_t>(IEEE754::kFloatBias) -
static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
static_cast<int32_t>(2);
static_assert(static_cast<int32_t>(1) -
static_cast<int32_t>(IEEE754::kFloatBias) -
static_assert(static_cast<int32_t>(1) - static_cast<int32_t>(IEEE754::kFloatBias) -
static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
static_cast<int32_t>(2) ==
-151,
"");
-151);
mantissa_base2 = f.mantissa;
} else {
base2_range.exponent = static_cast<int32_t>(f.exponent) - IEEE754::kFloatBias -
@ -544,7 +544,7 @@ class RyuPrinter {
// Function precondition: v is not a 10-digit number.
// (f2s: 9 digits are sufficient for round-tripping.)
// (d2fixed: We print 9-digit blocks.)
static_assert(100000000 == Tens(8), "");
static_assert(100000000 == Tens(8));
assert(v < Tens(9));
if (v >= Tens(8)) {
return 9;
@ -911,7 +911,7 @@ from_chars_result FromCharFloatImpl(const char *buffer, const int len,
// the bias and also special-case the value 0.
int32_t shift = (f_e2 == 0 ? 1 : f_e2) - exp_b2 - IEEE754::kFloatBias -
IEEE754::kFloatMantissaBits;
assert(shift >= 0);
assert(shift >= 1);
// We need to round up if the exact value is more than 0.5 above the value we
// computed. That's equivalent to checking if the last removed bit was 1 and
@ -920,7 +920,7 @@ from_chars_result FromCharFloatImpl(const char *buffer, const int len,
//
// We need to update trailingZeros given that we have the exact output
// exponent ieee_e2 now.
trailing_zeros &= (mantissa_b2 & ((1u << (shift - 1)) - 1)) == 0;
trailing_zeros &= (mantissa_b2 & ((1u << (shift - 1)) - 1)) == 0; // NOLINT
uint32_t lastRemovedBit = (mantissa_b2 >> (shift - 1)) & 1;
bool roundup = (lastRemovedBit != 0) &&
(!trailing_zeros || (((mantissa_b2 >> shift) & 1) != 0));

View File

@ -87,7 +87,7 @@ inline to_chars_result to_chars(char *first, char *last, int64_t value) { // NOL
if (value < 0) {
*first = '-';
std::advance(first, 1);
unsigned_value = uint64_t(~value) + uint64_t(1);
unsigned_value = static_cast<uint64_t>(~value) + static_cast<uint64_t>(1);
}
return detail::ToCharsUnsignedImpl(first, last, unsigned_value);
}

View File

@ -46,7 +46,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
feature_offsets_[fid] = accum_index;
}
SetTypeSize(gmat.max_num_bins);
SetTypeSize(gmat.MaxNumBinPerFeat());
auto storage_size =
feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
index_.resize(storage_size, 0);

View File

@ -188,17 +188,6 @@ inline void SetDevice(std::int32_t device) {
}
#endif
template <typename Idx, typename Container,
typename V = typename Container::value_type,
typename Comp = std::less<V>>
std::vector<Idx> ArgSort(Container const &array, Comp comp = std::less<V>{}) {
std::vector<Idx> result(array.size());
std::iota(result.begin(), result.end(), 0);
auto op = [&array, comp](Idx const &l, Idx const &r) { return comp(array[l], array[r]); };
XGBOOST_PARALLEL_STABLE_SORT(result.begin(), result.end(), op);
return result;
}
/**
* Last index of a group in a CSR style of index pointer.
*/
@ -206,31 +195,6 @@ template <typename Indexable>
XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
return indptr[group + 1] - 1;
}
/**
* \brief A CRTP (curiously recurring template pattern) helper function.
*
* https://www.fluentcpp.com/2017/05/19/crtp-helper/
*
* Does two things:
* 1. Makes "crtp" explicit in the inheritance structure of a CRTP base class.
* 2. Avoids having to `static_cast` in a lot of places.
*
* \tparam T The derived class in a CRTP hierarchy.
*/
template <typename T>
struct Crtp {
T &Underlying() { return static_cast<T &>(*this); }
T const &Underlying() const { return static_cast<T const &>(*this); }
};
/**
* \brief C++17 std::as_const
*/
template <typename T>
typename std::add_const<T>::type &AsConst(T &v) noexcept { // NOLINT(runtime/references)
return v;
}
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_COMMON_H_

View File

@ -1,12 +1,13 @@
/*!
* Copyright 2017 by Contributors
/**
* Copyright 2017-2023 by XGBoost Contributors
* \file compressed_iterator.h
*/
#pragma once
#include <xgboost/base.h>
#include <cmath>
#include <cstddef>
#include <algorithm>
#include <cmath>
#include <cstddef> // for size_t
#include "common.h"
@ -36,7 +37,7 @@ static const int kPadding = 4; // Assign padding so we can read slightly off
// The number of bits required to represent a given unsigned range
inline XGBOOST_DEVICE size_t SymbolBits(size_t num_symbols) {
auto bits = std::ceil(log2(static_cast<double>(num_symbols)));
return common::Max(static_cast<size_t>(bits), size_t(1));
return common::Max(static_cast<size_t>(bits), static_cast<std::size_t>(1));
}
} // namespace detail

View File

@ -20,6 +20,7 @@
#include <algorithm>
#include <chrono>
#include <cstddef> // for size_t
#include <cub/cub.cuh>
#include <cub/util_allocator.cuh>
#include <numeric>
@ -178,7 +179,7 @@ inline size_t MaxSharedMemory(int device_idx) {
dh::safe_cuda(cudaDeviceGetAttribute
(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
device_idx));
return size_t(max_shared_memory);
return static_cast<std::size_t>(max_shared_memory);
}
/**
@ -195,7 +196,7 @@ inline size_t MaxSharedMemoryOptin(int device_idx) {
dh::safe_cuda(cudaDeviceGetAttribute
(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlockOptin,
device_idx));
return size_t(max_shared_memory);
return static_cast<std::size_t>(max_shared_memory);
}
inline void CheckComputeCapability() {

View File

@ -46,7 +46,7 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
if (!use_sorted) {
HostSketchContainer container(max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
HostSketchContainer::UseGroup(info),
m->Info().data_split_mode == DataSplitMode::kCol, n_threads);
m->IsColumnSplit(), n_threads);
for (auto const& page : m->GetBatches<SparsePage>()) {
container.PushRowPage(page, info, hessian);
}
@ -54,7 +54,7 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
} else {
SortedSketchContainer container{max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
HostSketchContainer::UseGroup(info),
m->Info().data_split_mode == DataSplitMode::kCol, n_threads};
m->IsColumnSplit(), n_threads};
for (auto const& page : m->GetBatches<SortedCSCPage>()) {
container.PushColPage(page, info, hessian);
}

View File

@ -1,33 +1,31 @@
/*!
* Copyright 2018~2020 XGBoost contributors
/**
* Copyright 2018~2023 by XGBoost contributors
*/
#include <xgboost/logging.h>
#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
#include <xgboost/logging.h>
#include <cstddef> // for size_t
#include <memory>
#include <mutex>
#include <utility>
#include <vector>
#include "categorical.h"
#include "device_helpers.cuh"
#include "hist_util.h"
#include "hist_util.cuh"
#include "hist_util.h"
#include "math.h" // NOLINT
#include "quantile.h"
#include "categorical.h"
#include "xgboost/host_device_vector.h"
namespace xgboost {
namespace common {
@ -318,7 +316,7 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
size_t batch_nnz = batch.data.Size();
auto const& info = dmat->Info();
for (auto begin = 0ull; begin < batch_nnz; begin += sketch_batch_num_elements) {
size_t end = std::min(batch_nnz, size_t(begin + sketch_batch_num_elements));
size_t end = std::min(batch_nnz, static_cast<std::size_t>(begin + sketch_batch_num_elements));
if (has_weights) {
bool is_ranking = HostSketchContainer::UseGroup(dmat->Info());
dh::caching_device_vector<uint32_t> groups(info.group_ptr_.cbegin(),

View File

@ -1,5 +1,5 @@
/*!
* Copyright 2020 XGBoost contributors
/**
* Copyright 2020-2023 by XGBoost contributors
*
* \brief Front end and utilities for GPU based sketching. Works on sliding window
* instead of stream.
@ -9,11 +9,13 @@
#include <thrust/host_vector.h>
#include <cstddef> // for size_t
#include "../data/device_adapter.cuh"
#include "device_helpers.cuh"
#include "hist_util.h"
#include "quantile.cuh"
#include "device_helpers.cuh"
#include "timer.h"
#include "../data/device_adapter.cuh"
namespace xgboost {
namespace common {
@ -304,7 +306,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, true);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
ProcessWeightedSlidingWindow(batch, info,
num_cuts_per_feature,
HostSketchContainer::UseGroup(info), missing, device, num_cols, begin, end,
@ -316,7 +319,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, false);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
sketch_container, num_cuts_per_feature);
}

View File

@ -50,7 +50,7 @@ size_t PeekableInStream::PeekRead(void* dptr, size_t size) {
}
}
FixedSizeStream::FixedSizeStream(PeekableInStream* stream) : PeekableInStream(stream), pointer_{0} {
FixedSizeStream::FixedSizeStream(PeekableInStream* stream) : PeekableInStream(stream) {
size_t constexpr kInitialSize = 4096;
size_t size{kInitialSize}, total{0};
buffer_.clear();

View File

@ -27,8 +27,7 @@ using MemoryBufferStream = rabit::utils::MemoryBufferStream;
*/
class PeekableInStream : public dmlc::Stream {
public:
explicit PeekableInStream(dmlc::Stream* strm)
: strm_(strm), buffer_ptr_(0) {}
explicit PeekableInStream(dmlc::Stream* strm) : strm_(strm) {}
size_t Read(void* dptr, size_t size) override;
virtual size_t PeekRead(void* dptr, size_t size);
@ -41,7 +40,7 @@ class PeekableInStream : public dmlc::Stream {
/*! \brief input stream */
dmlc::Stream *strm_;
/*! \brief current buffer pointer */
size_t buffer_ptr_;
size_t buffer_ptr_{0};
/*! \brief internal buffer */
std::string buffer_;
};
@ -72,7 +71,7 @@ class FixedSizeStream : public PeekableInStream {
void Take(std::string* out);
private:
size_t pointer_;
size_t pointer_{0};
std::string buffer_;
};

View File

@ -710,10 +710,10 @@ void Json::Dump(Json json, JsonWriter* writer) {
writer->Save(json);
}
static_assert(std::is_nothrow_move_constructible<Json>::value, "");
static_assert(std::is_nothrow_move_constructible<Object>::value, "");
static_assert(std::is_nothrow_move_constructible<Array>::value, "");
static_assert(std::is_nothrow_move_constructible<String>::value, "");
static_assert(std::is_nothrow_move_constructible<Json>::value);
static_assert(std::is_nothrow_move_constructible<Object>::value);
static_assert(std::is_nothrow_move_constructible<Array>::value);
static_assert(std::is_nothrow_move_constructible<String>::value);
Json UBJReader::ParseArray() {
auto marker = PeekNextChar();

View File

@ -14,7 +14,7 @@ double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
if (ctx->IsCPU()) {
auto const& h_values = values.ConstHostVector();
auto result = cpu_impl::Reduce(ctx, h_values.cbegin(), h_values.cend(), 0.0);
static_assert(std::is_same<decltype(result), double>::value, "");
static_assert(std::is_same<decltype(result), double>::value);
return result;
}
return cuda_impl::Reduce(ctx, values);

View File

@ -42,8 +42,8 @@ void RunLengthEncode(Iter begin, Iter end, std::vector<Idx>* p_out) {
*/
template <typename InIt, typename OutIt, typename T>
void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {
static_assert(std::is_same<T, typename std::iterator_traits<InIt>::value_type>::value, "");
static_assert(std::is_same<T, typename std::iterator_traits<OutIt>::value_type>::value, "");
static_assert(std::is_same<T, typename std::iterator_traits<InIt>::value_type>::value);
static_assert(std::is_same<T, typename std::iterator_traits<OutIt>::value_type>::value);
// The number of threads is pegged to the batch size. If the OMP block is parallelized
// on anything other than the batch/block size, it should be reassigned
auto n = static_cast<size_t>(std::distance(begin, end));

View File

@ -31,6 +31,8 @@ namespace common {
// BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature
template<size_t BlockSize>
class PartitionBuilder {
using BitVector = RBitField8;
public:
template<typename Func>
void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
@ -121,27 +123,11 @@ class PartitionBuilder {
bool default_left = tree[nid].DefaultLeft();
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
auto node_cats = tree.NodeCats(nid);
auto const& index = gmat.index;
auto const& cut_values = gmat.cut.Values();
auto const& cut_ptrs = gmat.cut.Ptrs();
auto gidx_calc = [&](auto ridx) {
auto begin = gmat.RowIdx(ridx);
if (gmat.IsDense()) {
return static_cast<bst_bin_t>(index[begin + fid]);
}
auto end = gmat.RowIdx(ridx + 1);
auto f_begin = cut_ptrs[fid];
auto f_end = cut_ptrs[fid + 1];
// bypassing the column matrix as we need the cut value instead of bin idx for categorical
// features.
return BinarySearchBin(begin, end, index, f_begin, f_end);
};
auto pred_hist = [&](auto ridx, auto bin_id) {
if (any_cat && is_cat) {
auto gidx = gidx_calc(ridx);
auto gidx = gmat.GetGindex(ridx, fid);
bool go_left = default_left;
if (gidx > -1) {
go_left = Decision(node_cats, cut_values[gidx]);
@ -153,7 +139,7 @@ class PartitionBuilder {
};
auto pred_approx = [&](auto ridx) {
auto gidx = gidx_calc(ridx);
auto gidx = gmat.GetGindex(ridx, fid);
bool go_left = default_left;
if (gidx > -1) {
if (is_cat) {
@ -199,6 +185,84 @@ class PartitionBuilder {
SetNRightElems(node_in_set, range.begin(), n_right);
}
/**
* @brief When data is split by column, we don't have all the features locally on the current
* worker, so we go through all the rows and mark two bit vectors: one records whether the decision
* is to go left, the other whether the feature value used for the split is missing.
*/
void MaskRows(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
const common::Range1d range, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix,
const RegTree& tree, const size_t* rid,
BitVector* decision_bits, BitVector* missing_bits) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
std::size_t nid = nodes[node_in_set].nid;
bst_feature_t fid = tree[nid].SplitIndex();
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
auto node_cats = tree.NodeCats(nid);
auto const& cut_values = gmat.cut.Values();
if (!column_matrix.IsInitialized()) {
for (auto row_id : rid_span) {
auto gidx = gmat.GetGindex(row_id, fid);
if (gidx > -1) {
bool go_left = false;
if (is_cat) {
go_left = Decision(node_cats, cut_values[gidx]);
} else {
go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
}
if (go_left) {
decision_bits->Set(row_id - gmat.base_rowid);
}
} else {
missing_bits->Set(row_id - gmat.base_rowid);
}
}
} else {
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
}
}
/**
* @brief Once we've aggregated the decision and missing bits from all the workers, we can then
* use them to partition the rows accordingly.
*/
void PartitionByMask(const size_t node_in_set,
std::vector<xgboost::tree::CPUExpandEntry> const& nodes,
const common::Range1d range, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix, const RegTree& tree,
const size_t* rid, BitVector const& decision_bits,
BitVector const& missing_bits) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
std::size_t nid = nodes[node_in_set].nid;
bool default_left = tree[nid].DefaultLeft();
auto pred_approx = [&](auto ridx) {
bool go_left = default_left;
bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
if (!is_missing) {
go_left = decision_bits.Check(ridx - gmat.base_rowid);
}
return go_left;
};
std::pair<size_t, size_t> child_nodes_sizes;
if (!column_matrix.IsInitialized()) {
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
} else {
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
}
const size_t n_left = child_nodes_sizes.first;
const size_t n_right = child_nodes_sizes.second;
SetNLeftElems(node_in_set, range.begin(), n_left);
SetNRightElems(node_in_set, range.begin(), n_right);
}
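A minimal pure-Python sketch of the two-phase scheme implemented by `MaskRows` and `PartitionByMask`: only the worker that owns the split feature can fill the bit vectors, the bits are then merged across workers (simulated here with a plain OR), and every worker partitions its rows from the merged bits. The names, the single-feature setup and the OR-merge are illustrative assumptions, not the actual collective call.

def mask_rows(n_rows, local_values, split_value):
    # local_values[r] is the split-feature value for row r (None if the value is
    # missing in the data); a worker that does not own the feature passes {}.
    decision = [False] * n_rows   # True => go left
    missing = [False] * n_rows
    for r, v in local_values.items():
        if v is None:
            missing[r] = True
        elif v <= split_value:
            decision[r] = True
    return decision, missing

def merge_bits(bit_vectors):
    # Stand-in for the collective step that combines per-worker bit vectors.
    return [any(bits) for bits in zip(*bit_vectors)]

def partition_by_mask(n_rows, decision, missing, default_left):
    left, right = [], []
    for r in range(n_rows):
        go_left = default_left if missing[r] else decision[r]
        (left if go_left else right).append(r)
    return left, right

# worker 0 owns the split feature, worker 1 does not.
d0, m0 = mask_rows(4, {0: 0.1, 1: None, 2: 0.9, 3: 0.2}, split_value=0.5)
d1, m1 = mask_rows(4, {}, split_value=0.5)
decision, missing = merge_bits([d0, d1]), merge_bits([m0, m1])
print(partition_by_mask(4, decision, missing, default_left=True))  # ([0, 1, 3], [2])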
// allocate thread local memory, should be called for each specific task
void AllocateForTask(size_t id) {
if (mem_blocks_[id].get() == nullptr) {

View File

@ -1,5 +1,5 @@
/*!
* Copyright 2020-2022 by XGBoost Contributors
/**
* Copyright 2020-2023 by XGBoost Contributors
*/
#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
@ -109,7 +109,7 @@ void PruneImpl(common::Span<SketchContainer::OffsetT const> cuts_ptr,
template <typename T, typename U>
void CopyTo(Span<T> out, Span<U> src) {
CHECK_EQ(out.size(), src.size());
static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value, "");
static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value);
dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
out.size_bytes(),
cudaMemcpyDefault));
@ -143,7 +143,7 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
thrust::make_zip_iterator(thrust::make_tuple(b_ind_iter, place_holder));
dh::XGBCachingDeviceAllocator<Tuple> alloc;
static_assert(sizeof(Tuple) == sizeof(SketchEntry), "");
static_assert(sizeof(Tuple) == sizeof(SketchEntry));
// We reuse the memory for storing merge path.
common::Span<Tuple> merge_path{reinterpret_cast<Tuple *>(out.data()), out.size()};
// Determine the merge path, 0 if element is from x, 1 if it's from y.

View File

@ -24,8 +24,9 @@ std::shared_ptr<HostDeviceVector<bst_feature_t>> ColumnSampler::ColSample(
for (size_t i = 0; i < h_features.size(); ++i) {
weights[i] = feature_weights_[h_features[i]];
}
CHECK(ctx_);
new_features.HostVector() =
WeightedSamplingWithoutReplacement(p_features->HostVector(), weights, n);
WeightedSamplingWithoutReplacement(ctx_, p_features->HostVector(), weights, n);
} else {
new_features.Resize(features.size());
std::copy(features.begin(), features.end(), new_features.HostVector().begin());

View File

@ -20,7 +20,9 @@
#include <vector>
#include "../collective/communicator-inl.h"
#include "algorithm.h" // ArgSort
#include "common.h"
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h"
namespace xgboost {
@ -87,8 +89,8 @@ GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
* https://timvieira.github.io/blog/post/2019/09/16/algorithms-for-sampling-without-replacement/
*/
template <typename T>
std::vector<T> WeightedSamplingWithoutReplacement(
std::vector<T> const &array, std::vector<float> const &weights, size_t n) {
std::vector<T> WeightedSamplingWithoutReplacement(Context const* ctx, std::vector<T> const& array,
std::vector<float> const& weights, size_t n) {
// ES sampling.
CHECK_EQ(array.size(), weights.size());
std::vector<float> keys(weights.size());
@ -100,7 +102,7 @@ std::vector<T> WeightedSamplingWithoutReplacement(
auto k = std::log(u) / w;
keys[i] = k;
}
auto ind = ArgSort<size_t>(Span<float>{keys}, std::greater<>{});
auto ind = ArgSort<std::size_t>(ctx, keys.data(), keys.data() + keys.size(), std::greater<>{});
ind.resize(n);
std::vector<T> results(ind.size());
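A minimal NumPy sketch of the exponential-sort sampling used here: each item draws u ~ Uniform(0, 1) and gets the key log(u) / w; larger weights make the (negative) key less negative, so taking the n largest keys favours heavier items. The function name and seed are illustrative.

import numpy as np

def weighted_sample_without_replacement(items, weights, n, rng):
    u = rng.uniform(size=len(items))
    keys = np.log(u) / np.asarray(weights, dtype=float)   # all keys are negative
    top = np.argsort(-keys, kind="stable")[:n]            # indices of the n largest keys
    return [items[i] for i in top]

rng = np.random.default_rng(0)
print(weighted_sample_without_replacement(["a", "b", "c", "d"], [1.0, 1.0, 5.0, 1.0], 2, rng))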
@ -126,6 +128,7 @@ class ColumnSampler {
float colsample_bytree_{1.0f};
float colsample_bynode_{1.0f};
GlobalRandomEngine rng_;
Context const* ctx_;
public:
std::shared_ptr<HostDeviceVector<bst_feature_t>> ColSample(
@ -157,12 +160,13 @@ class ColumnSampler {
* \param colsample_bytree
* \param skip_index_0 (Optional) True to skip index 0.
*/
void Init(int64_t num_col, std::vector<float> feature_weights, float colsample_bynode,
float colsample_bylevel, float colsample_bytree) {
void Init(Context const* ctx, int64_t num_col, std::vector<float> feature_weights,
float colsample_bynode, float colsample_bylevel, float colsample_bytree) {
feature_weights_ = std::move(feature_weights);
colsample_bylevel_ = colsample_bylevel;
colsample_bytree_ = colsample_bytree;
colsample_bynode_ = colsample_bynode;
ctx_ = ctx;
if (feature_set_tree_ == nullptr) {
feature_set_tree_ = std::make_shared<HostDeviceVector<bst_feature_t>>();

View File

@ -77,14 +77,14 @@ class RowSetCollection {
if (row_indices_.empty()) { // edge case: empty instance set
constexpr size_t* kBegin = nullptr;
constexpr size_t* kEnd = nullptr;
static_assert(kEnd - kBegin == 0, "");
elem_of_each_node_.emplace_back(Elem(kBegin, kEnd, 0));
static_assert(kEnd - kBegin == 0);
elem_of_each_node_.emplace_back(kBegin, kEnd, 0);
return;
}
const size_t* begin = dmlc::BeginPtr(row_indices_);
const size_t* end = dmlc::BeginPtr(row_indices_) + row_indices_.size();
elem_of_each_node_.emplace_back(Elem(begin, end, 0));
elem_of_each_node_.emplace_back(begin, end, 0);
}
std::vector<size_t>* Data() { return &row_indices_; }

View File

@ -35,11 +35,11 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
auto iter = linalg::cbegin(ti_v);
float q{0};
if (opt_weights.Empty()) {
q = common::Quantile(0.5, iter, iter + ti_v.Size());
q = common::Quantile(ctx, 0.5, iter, iter + ti_v.Size());
} else {
CHECK_NE(t_v.Shape(1), 0);
auto w_it = common::MakeIndexTransformIter([&](std::size_t i) { return opt_weights[i]; });
q = common::WeightedQuantile(0.5, iter, iter + ti_v.Size(), w_it);
q = common::WeightedQuantile(ctx, 0.5, iter, iter + ti_v.Size(), w_it);
}
h_out(i) = q;
}

View File

@ -4,46 +4,52 @@
#ifndef XGBOOST_COMMON_STATS_H_
#define XGBOOST_COMMON_STATS_H_
#include <algorithm>
#include <iterator>
#include <iterator> // for distance
#include <limits>
#include <vector>
#include "algorithm.h" // for StableSort
#include "common.h" // AssertGPUSupport, OptionalWeights
#include "optional_weight.h" // OptionalWeights
#include "transform_iterator.h" // MakeIndexTransformIter
#include "xgboost/context.h" // Context
#include "xgboost/linalg.h"
#include "xgboost/linalg.h" // TensorView,VectorView
#include "xgboost/logging.h" // CHECK_GE
namespace xgboost {
namespace common {
/**
* \brief Percentile with masked array using linear interpolation.
* @brief Quantile using linear interpolation.
*
* https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
*
* \param alpha Percentile, must be in range [0, 1].
* \param alpha Quantile, must be in range [0, 1].
* \param begin Iterator begin for input array.
* \param end Iterator end for input array.
*
* \return The result of interpolation.
*/
template <typename Iter>
float Quantile(double alpha, Iter const& begin, Iter const& end) {
float Quantile(Context const* ctx, double alpha, Iter const& begin, Iter const& end) {
CHECK(alpha >= 0 && alpha <= 1);
auto n = static_cast<double>(std::distance(begin, end));
if (n == 0) {
return std::numeric_limits<float>::quiet_NaN();
}
std::vector<size_t> sorted_idx(n);
std::vector<std::size_t> sorted_idx(n);
std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
if (omp_in_parallel()) {
std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
[&](size_t l, size_t r) { return *(begin + l) < *(begin + r); });
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
} else {
StableSort(ctx, sorted_idx.begin(), sorted_idx.end(),
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
}
auto val = [&](size_t i) { return *(begin + sorted_idx[i]); };
static_assert(std::is_same<decltype(val(0)), float>::value, "");
static_assert(std::is_same<decltype(val(0)), float>::value);
if (alpha <= (1 / (n + 1))) {
return val(0);
@ -51,7 +57,7 @@ float Quantile(double alpha, Iter const& begin, Iter const& end) {
if (alpha >= (n / (n + 1))) {
return val(sorted_idx.size() - 1);
}
assert(n != 0 && "The number of rows in a leaf can not be zero.");
double x = alpha * static_cast<double>((n + 1));
double k = std::floor(x) - 1;
CHECK_GE(k, 0);
@ -66,30 +72,35 @@ float Quantile(double alpha, Iter const& begin, Iter const& end) {
* \brief Calculate the weighted quantile with step function. Unlike the unweighted
* version, no interpolation is used.
*
* See https://aakinshin.net/posts/weighted-quantiles/ for some discussion on computing
* See https://aakinshin.net/posts/weighted-quantiles/ for some discussions on computing
* weighted quantile with interpolation.
*/
template <typename Iter, typename WeightIter>
float WeightedQuantile(double alpha, Iter begin, Iter end, WeightIter weights) {
float WeightedQuantile(Context const* ctx, double alpha, Iter begin, Iter end, WeightIter w_begin) {
auto n = static_cast<double>(std::distance(begin, end));
if (n == 0) {
return std::numeric_limits<float>::quiet_NaN();
}
std::vector<size_t> sorted_idx(n);
std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
if (omp_in_parallel()) {
std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
[&](size_t l, size_t r) { return *(begin + l) < *(begin + r); });
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
} else {
StableSort(ctx, sorted_idx.begin(), sorted_idx.end(),
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
}
auto val = [&](size_t i) { return *(begin + sorted_idx[i]); };
std::vector<float> weight_cdf(n); // S_n
// weighted cdf is sorted during construction
weight_cdf[0] = *(weights + sorted_idx[0]);
weight_cdf[0] = *(w_begin + sorted_idx[0]);
for (size_t i = 1; i < n; ++i) {
weight_cdf[i] = weight_cdf[i - 1] + *(weights + sorted_idx[i]);
weight_cdf[i] = weight_cdf[i - 1] + w_begin[sorted_idx[i]];
}
float thresh = weight_cdf.back() * alpha;
size_t idx =
std::size_t idx =
std::lower_bound(weight_cdf.cbegin(), weight_cdf.cend(), thresh) - weight_cdf.cbegin();
idx = std::min(idx, static_cast<size_t>(n - 1));
return val(idx);
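A compact NumPy sketch of both estimators above: the unweighted quantile with the (n + 1)-based linear interpolation from the NIST handbook, and the weighted quantile as a step function over the cumulative weights (no interpolation). Function names are illustrative.

import numpy as np

def quantile_interpolated(values, alpha):
    v = np.sort(np.asarray(values, dtype=float))
    n = v.size
    if alpha <= 1.0 / (n + 1):
        return v[0]
    if alpha >= n / (n + 1.0):
        return v[-1]
    x = alpha * (n + 1)            # 1-based fractional rank
    k = int(np.floor(x)) - 1       # 0-based index of the lower neighbour
    return v[k] + (x - np.floor(x)) * (v[k + 1] - v[k])

def quantile_weighted(values, weights, alpha):
    order = np.argsort(values, kind="stable")
    cdf = np.cumsum(np.asarray(weights, dtype=float)[order])
    idx = min(np.searchsorted(cdf, alpha * cdf[-1], side="left"), len(order) - 1)
    return float(np.asarray(values, dtype=float)[order][idx])

print(quantile_interpolated([1, 2, 3, 4], 0.5))             # 2.5
print(quantile_weighted([1, 2, 3, 4], [1, 1, 1, 1], 0.5))   # 2.0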

View File

@ -10,12 +10,13 @@
#include <cstring>
#include "../collective/communicator-inl.h"
#include "../common/algorithm.h" // StableSort
#include "../common/api_entry.h" // XGBAPIThreadLocalEntry
#include "../common/group_data.h"
#include "../common/io.h"
#include "../common/linalg_op.h"
#include "../common/math.h"
#include "../common/numeric.h"
#include "../common/numeric.h" // Iota
#include "../common/threading_utils.h"
#include "../common/version.h"
#include "../data/adapter.h"
@ -258,6 +259,19 @@ void LoadFeatureType(std::vector<std::string>const& type_names, std::vector<Feat
}
}
const std::vector<size_t>& MetaInfo::LabelAbsSort(Context const* ctx) const {
if (label_order_cache_.size() == labels.Size()) {
return label_order_cache_;
}
label_order_cache_.resize(labels.Size());
common::Iota(ctx, label_order_cache_.begin(), label_order_cache_.end(), 0);
const auto& l = labels.Data()->HostVector();
common::StableSort(ctx, label_order_cache_.begin(), label_order_cache_.end(),
[&l](size_t i1, size_t i2) { return std::abs(l[i1]) < std::abs(l[i2]); });
return label_order_cache_;
}
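The helper above caches a stable index ordering by absolute label value; a one-line NumPy equivalent of the sort itself (without the caching) is:

import numpy as np

labels = np.array([-3.0, 0.5, -1.0, 2.0])
print(np.argsort(np.abs(labels), kind="stable"))  # [1 2 3 0]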
void MetaInfo::LoadBinary(dmlc::Stream *fi) {
auto version = Version::Load(fi);
auto major = std::get<0>(version);
@ -898,6 +912,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
if (!cache_file.empty()) {
LOG(FATAL) << "Column-wise data split is not support for external memory.";
}
LOG(CONSOLE) << "Splitting data by column";
auto* sliced = dmat->SliceCol(npart, partid);
delete dmat;
return sliced;

View File

@ -1,12 +1,14 @@
/*!
* Copyright (c) 2019 by Contributors
/**
* Copyright 2019-2023 by XGBoost Contributors
* \file device_adapter.cuh
*/
#ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_
#define XGBOOST_DATA_DEVICE_ADAPTER_H_
#include <cstddef> // for size_t
#include <limits>
#include <memory>
#include <string>
#include "../common/device_helpers.cuh"
#include "../common/math.h"
#include "adapter.h"
@ -205,10 +207,10 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
}
});
dh::XGBCachingDeviceAllocator<char> alloc;
size_t row_stride = dh::Reduce(
thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(), size_t(0),
thrust::maximum<size_t>());
size_t row_stride =
dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<std::size_t>(0), thrust::maximum<size_t>());
return row_stride;
}
}; // namespace data

View File

@ -21,13 +21,13 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM
GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
double sparse_thresh, bool sorted_sketch, int32_t n_threads,
common::Span<float> hess) {
common::Span<float> hess)
: max_numeric_bins_per_feat{max_bins_per_feat} {
CHECK(p_fmat->SingleColBlock());
// We use sorted sketching for approx tree method since it's more efficient in
// computation time (but higher memory usage).
cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);
max_num_bins = max_bins_per_feat;
const uint32_t nbins = cut.Ptrs().back();
hit_count.resize(nbins, 0);
hit_count_tloc_.resize(n_threads * nbins, 0);
@ -64,7 +64,7 @@ GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &
: row_ptr(info.num_row_ + 1, 0),
hit_count(cuts.TotalBins(), 0),
cut{std::forward<common::HistogramCuts>(cuts)},
max_num_bins(max_bin_per_feat),
max_numeric_bins_per_feat(max_bin_per_feat),
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
#if !defined(XGBOOST_USE_CUDA)
@ -87,13 +87,13 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span<FeatureTy
}
GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<FeatureType const> ft,
common::HistogramCuts const &cuts, int32_t max_bins_per_feat,
bool isDense, double sparse_thresh, int32_t n_threads) {
common::HistogramCuts cuts, int32_t max_bins_per_feat,
bool isDense, double sparse_thresh, int32_t n_threads)
: cut{std::move(cuts)},
max_numeric_bins_per_feat{max_bins_per_feat},
base_rowid{batch.base_rowid},
isDense_{isDense} {
CHECK_GE(n_threads, 1);
base_rowid = batch.base_rowid;
isDense_ = isDense;
cut = cuts;
max_num_bins = max_bins_per_feat;
CHECK_EQ(row_ptr.size(), 0);
// The number of threads is pegged to the batch size. If the OMP
// block is parallelized on anything other than the batch/block size,
@ -128,12 +128,13 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
#undef INSTANTIATION_PUSH
void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
isDense) {
// compress dense index to uint8
index.SetBinTypeSize(common::kUint8BinsTypeSize);
index.Resize((sizeof(uint8_t)) * n_index);
} else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
} else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
isDense) {
// compress dense index to uint16
index.SetBinTypeSize(common::kUint16BinsTypeSize);
@ -149,16 +150,24 @@ common::ColumnMatrix const &GHistIndexMatrix::Transpose() const {
return *columns_;
}
bst_bin_t GHistIndexMatrix::GetGindex(size_t ridx, size_t fidx) const {
auto begin = RowIdx(ridx);
if (IsDense()) {
return static_cast<bst_bin_t>(index[begin + fidx]);
}
auto end = RowIdx(ridx + 1);
auto const& cut_ptrs = cut.Ptrs();
auto f_begin = cut_ptrs[fidx];
auto f_end = cut_ptrs[fidx + 1];
return BinarySearchBin(begin, end, index, f_begin, f_end);
}
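A small sketch of what `GetGindex` does for a sparse row, assuming the row stores its global bin ids in ascending order and feature `fidx` owns the half-open bin range [f_begin, f_end); names are illustrative and the dense fast path is omitted.

import bisect

def get_gindex(row_bins, f_begin, f_end):
    # Binary-search the row's bin ids for one that belongs to the feature's range.
    i = bisect.bisect_left(row_bins, f_begin)
    if i < len(row_bins) and row_bins[i] < f_end:
        return row_bins[i]
    return -1  # the feature is missing in this row

print(get_gindex([2, 7, 11], f_begin=5, f_end=9))    # 7
print(get_gindex([2, 7, 11], f_begin=8, f_end=10))   # -1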
float GHistIndexMatrix::GetFvalue(size_t ridx, size_t fidx, bool is_cat) const {
auto const &values = cut.Values();
auto const &mins = cut.MinValues();
auto const &ptrs = cut.Ptrs();
if (is_cat) {
auto f_begin = ptrs[fidx];
auto f_end = ptrs[fidx + 1];
auto begin = RowIdx(ridx);
auto end = RowIdx(ridx + 1);
auto gidx = BinarySearchBin(begin, end, index, f_begin, f_end);
auto gidx = GetGindex(ridx, fidx);
if (gidx == -1) {
return std::numeric_limits<float>::quiet_NaN();
}

View File

@ -65,7 +65,7 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
EllpackPage const& in_page, BatchParam const& p)
: max_num_bins{p.max_bin} {
: max_numeric_bins_per_feat{p.max_bin} {
auto page = in_page.Impl();
isDense_ = page->is_dense;

View File

@ -134,11 +134,15 @@ class GHistIndexMatrix {
std::vector<size_t> hit_count;
/*! \brief The corresponding cuts */
common::HistogramCuts cut;
/*! \brief max_bin for each feature. */
bst_bin_t max_num_bins;
/** \brief max_bin for each feature. */
bst_bin_t max_numeric_bins_per_feat;
/*! \brief base row index for current page (used by external memory) */
size_t base_rowid{0};
bst_bin_t MaxNumBinPerFeat() const {
return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
}
~GHistIndexMatrix();
/**
* \brief Constructor for SimpleDMatrix.
@ -161,7 +165,7 @@ class GHistIndexMatrix {
* \brief Constructor for external memory.
*/
GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense,
common::HistogramCuts cuts, int32_t max_bins_per_feat, bool is_dense,
double sparse_thresh, int32_t n_threads);
GHistIndexMatrix(); // also for ext mem, empty ctor so that we can read the cache back.
@ -224,6 +228,8 @@ class GHistIndexMatrix {
common::ColumnMatrix const& Transpose() const;
bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
private:

View File

@ -35,7 +35,7 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
if (!fi->Read(&page->hit_count)) {
return false;
}
if (!fi->Read(&page->max_num_bins)) {
if (!fi->Read(&page->max_numeric_bins_per_feat)) {
return false;
}
if (!fi->Read(&page->base_rowid)) {
@ -76,8 +76,8 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
sizeof(uint64_t);
// max_bins, base row, is_dense
fo->Write(page.max_num_bins);
bytes += sizeof(page.max_num_bins);
fo->Write(page.max_numeric_bins_per_feat);
bytes += sizeof(page.max_numeric_bins_per_feat);
fo->Write(page.base_rowid);
bytes += sizeof(page.base_rowid);
fo->Write(page.IsDense());

View File

@ -213,7 +213,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
SyncFeatureType(&h_ft);
p_sketch.reset(new common::HostSketchContainer{
batch_param_.max_bin, h_ft, column_sizes, !proxy->Info().group_ptr_.empty(),
proxy->Info().data_split_mode == DataSplitMode::kCol, ctx_.Threads()});
proxy->IsColumnSplit(), ctx_.Threads()});
}
HostAdapterDispatch(proxy, [&](auto const& batch) {
proxy->Info().num_nonzero_ = batch_nnz[i];

View File

@ -19,7 +19,7 @@ const MetaInfo &SparsePageDMatrix::Info() const { return info_; }
namespace detail {
// Use device dispatch
std::size_t NSamplesDevice(DMatrixProxy *)
std::size_t NSamplesDevice(DMatrixProxy *) // NOLINT
#if defined(XGBOOST_USE_CUDA)
; // NOLINT
#else
@ -28,7 +28,7 @@ std::size_t NSamplesDevice(DMatrixProxy *)
return 0;
}
#endif
std::size_t NFeaturesDevice(DMatrixProxy *)
std::size_t NFeaturesDevice(DMatrixProxy *) // NOLINT
#if defined(XGBOOST_USE_CUDA)
; // NOLINT
#else

View File

@ -75,10 +75,7 @@ class GBLinear : public GradientBooster {
: GradientBooster{ctx},
learner_model_param_{learner_model_param},
model_{learner_model_param},
previous_model_{learner_model_param},
sum_instance_weight_(0),
sum_weight_complete_(false),
is_converged_(false) {}
previous_model_{learner_model_param} {}
void Configure(const Args& cfg) override {
if (model_.weight.size() == 0) {
@ -344,10 +341,10 @@ class GBLinear : public GradientBooster {
GBLinearModel previous_model_;
GBLinearTrainParam param_;
std::unique_ptr<LinearUpdater> updater_;
double sum_instance_weight_;
bool sum_weight_complete_;
double sum_instance_weight_{};
bool sum_weight_complete_{false};
common::Monitor monitor_;
bool is_converged_;
bool is_converged_{false};
};
// register the objective functions

View File

@ -47,12 +47,12 @@ class GBLinearModel : public Model {
DeprecatedGBLinearModelParam param_;
public:
int32_t num_boosted_rounds;
int32_t num_boosted_rounds{0};
LearnerModelParam const* learner_model_param;
public:
explicit GBLinearModel(LearnerModelParam const* learner_model_param) :
num_boosted_rounds{0}, learner_model_param {learner_model_param} {}
explicit GBLinearModel(LearnerModelParam const *learner_model_param)
: learner_model_param{learner_model_param} {}
void Configure(Args const &) { }
// weight for each of feature, bias is the last one

View File

@ -32,15 +32,14 @@
#include "xgboost/string_view.h"
#include "xgboost/tree_updater.h"
namespace xgboost {
namespace gbm {
namespace xgboost::gbm {
DMLC_REGISTRY_FILE_TAG(gbtree);
void GBTree::Configure(const Args& cfg) {
void GBTree::Configure(Args const& cfg) {
this->cfg_ = cfg;
std::string updater_seq = tparam_.updater_seq;
tparam_.UpdateAllowUnknown(cfg);
tree_param_.UpdateAllowUnknown(cfg);
model_.Configure(cfg);
@ -235,9 +234,11 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
CHECK_EQ(model_.param.num_parallel_tree, trees.size());
CHECK_EQ(model_.param.num_parallel_tree, 1)
<< "Boosting random forest is not supported for current objective.";
CHECK_EQ(trees.size(), model_.param.num_parallel_tree);
for (std::size_t tree_idx = 0; tree_idx < trees.size(); ++tree_idx) {
auto const& position = node_position.at(tree_idx);
obj->UpdateTreeLeaf(position, p_fmat->Info(), predictions, group_idx, trees[tree_idx].get());
obj->UpdateTreeLeaf(position, p_fmat->Info(), tree_param_.learning_rate / trees.size(),
predictions, group_idx, trees[tree_idx].get());
}
}
@ -388,9 +389,15 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
CHECK(out_position);
out_position->resize(new_trees.size());
// Rescale learning rate according to the size of trees
auto lr = tree_param_.learning_rate;
tree_param_.learning_rate /= static_cast<float>(new_trees.size());
for (auto& up : updaters_) {
up->Update(gpair, p_fmat, common::Span<HostDeviceVector<bst_node_t>>{*out_position}, new_trees);
up->Update(&tree_param_, gpair, p_fmat,
common::Span<HostDeviceVector<bst_node_t>>{*out_position}, new_trees);
}
tree_param_.learning_rate = lr;
}
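A tiny worked example of the rescaling above, under the assumption that the trees boosted in one round should jointly contribute one step of the configured size.

eta = 0.3
num_parallel_tree = 3
per_tree_eta = eta / num_parallel_tree   # each tree is built with learning rate ~0.1
# The leaf values of the parallel trees are summed at prediction time, so the combined
# update is still roughly a single step of size eta; the rate is restored afterwards.
print(per_tree_eta)   # ~0.1 (floating point)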
void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& new_trees) {
@ -404,6 +411,8 @@ void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& ne
void GBTree::LoadConfig(Json const& in) {
CHECK_EQ(get<String>(in["name"]), "gbtree");
FromJson(in["gbtree_train_param"], &tparam_);
FromJson(in["tree_train_param"], &tree_param_);
// Process type cannot be kUpdate from loaded model
// This would cause all trees to be pushed to trees_to_update
// e.g. updating a model, then saving and loading it would result in an empty model
@ -451,6 +460,7 @@ void GBTree::SaveConfig(Json* p_out) const {
auto& out = *p_out;
out["name"] = String("gbtree");
out["gbtree_train_param"] = ToJson(tparam_);
out["tree_train_param"] = ToJson(tree_param_);
// Process type cannot be kUpdate from loaded model
// This would cause all trees to be pushed to trees_to_update
@ -1058,5 +1068,4 @@ XGBOOST_REGISTER_GBM(Dart, "dart")
GBTree* p = new Dart(booster_config, ctx);
return p;
});
} // namespace gbm
} // namespace xgboost
} // namespace xgboost::gbm

View File

@ -20,6 +20,7 @@
#include "../common/common.h"
#include "../common/timer.h"
#include "../tree/param.h" // TrainParam
#include "gbtree_model.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
@ -405,7 +406,7 @@ class GBTree : public GradientBooster {
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
}
std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
[[nodiscard]] std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
std::string format) const override {
return model_.DumpModel(fmap, with_stats, this->ctx_->Threads(), format);
}
@ -428,6 +429,8 @@ class GBTree : public GradientBooster {
GBTreeModel model_;
// training parameter
GBTreeTrainParam tparam_;
// Tree training parameter
tree::TrainParam tree_param_;
// ----training fields----
bool showed_updater_warning_ {false};
bool specified_updater_ {false};

View File

@ -21,7 +21,7 @@
#include <sstream>
#include <stack>
#include <string>
#include <utility>
#include <utility> // for as_const
#include <vector>
#include "collective/communicator-inl.h"
@ -257,11 +257,11 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
: LearnerModelParam{user_param, t} {
std::swap(base_score_, base_margin);
// Make sure read access everywhere for thread-safe prediction.
common::AsConst(base_score_).HostView();
std::as_const(base_score_).HostView();
if (!ctx->IsCPU()) {
common::AsConst(base_score_).View(ctx->gpu_id);
std::as_const(base_score_).View(ctx->gpu_id);
}
CHECK(common::AsConst(base_score_).Data()->HostCanRead());
CHECK(std::as_const(base_score_).Data()->HostCanRead());
}
linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(int32_t device) const {
@ -287,9 +287,9 @@ void LearnerModelParam::Copy(LearnerModelParam const& that) {
base_score_.Reshape(that.base_score_.Shape());
base_score_.Data()->SetDevice(that.base_score_.DeviceIdx());
base_score_.Data()->Copy(*that.base_score_.Data());
common::AsConst(base_score_).HostView();
std::as_const(base_score_).HostView();
if (that.base_score_.DeviceIdx() != Context::kCpuId) {
common::AsConst(base_score_).View(that.base_score_.DeviceIdx());
std::as_const(base_score_).View(that.base_score_.DeviceIdx());
}
CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead());
CHECK(base_score_.Data()->HostCanRead());
@ -328,9 +328,6 @@ DMLC_REGISTER_PARAMETER(LearnerTrainParam);
using LearnerAPIThreadLocalStore =
dmlc::ThreadLocalStore<std::map<Learner const *, XGBAPIThreadLocalEntry>>;
using ThreadLocalPredictionCache =
dmlc::ThreadLocalStore<std::map<Learner const *, PredictionContainer>>;
namespace {
StringView ModelMsg() {
return StringView{
@ -368,6 +365,8 @@ class LearnerConfiguration : public Learner {
LearnerModelParam learner_model_param_;
LearnerTrainParam tparam_;
// Initial prediction.
PredictionContainer prediction_container_;
std::vector<std::string> metric_names_;
void ConfigureModelParamWithoutBaseScore() {
@ -426,22 +425,15 @@ class LearnerConfiguration : public Learner {
}
public:
explicit LearnerConfiguration(std::vector<std::shared_ptr<DMatrix> > cache)
explicit LearnerConfiguration(std::vector<std::shared_ptr<DMatrix>> cache)
: need_configuration_{true} {
monitor_.Init("Learner");
auto& local_cache = (*ThreadLocalPredictionCache::Get())[this];
for (std::shared_ptr<DMatrix> const& d : cache) {
if (d) {
local_cache.Cache(d, Context::kCpuId);
prediction_container_.Cache(d, Context::kCpuId);
}
}
}
~LearnerConfiguration() override {
auto local_cache = ThreadLocalPredictionCache::Get();
if (local_cache->find(this) != local_cache->cend()) {
local_cache->erase(this);
}
}
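
The process-wide, thread-local map keyed by the Learner pointer (and the destructor that erased this learner's slot) is replaced by a plain prediction_container_ member, so the cache lives and dies with its learner. A rough sketch of such a container, keyed by the DMatrix pointer with a weak_ptr kept for expiry checks; the names differ from the real PredictionContainer and are illustrative only.

// Hedged sketch (not the real PredictionContainer): entries keyed by the raw DMatrix
// pointer, with a weak_ptr kept alongside so stale entries can be detected.
#include <cassert>
#include <cstddef>
#include <memory>
#include <unordered_map>
#include <vector>

struct DMatrixSketch { std::size_t num_col{0}; };
struct PredictionEntrySketch {
  std::weak_ptr<DMatrixSketch> ref;  // owner; lets stale entries be detected
  std::vector<float> predictions;    // cached raw predictions
};

class PredictionCacheSketch {
 public:
  PredictionEntrySketch& Cache(const std::shared_ptr<DMatrixSketch>& m) {
    auto& entry = container_[m.get()];
    entry.ref = m;
    return entry;
  }
  const std::unordered_map<DMatrixSketch*, PredictionEntrySketch>& Container() const {
    return container_;
  }

 private:
  std::unordered_map<DMatrixSketch*, PredictionEntrySketch> container_;
};

int main() {
  PredictionCacheSketch cache;  // a member of the learner, not a global/thread-local map
  auto train = std::make_shared<DMatrixSketch>();
  cache.Cache(train).predictions.assign(4, 0.5f);
  for (const auto& kv : cache.Container()) {
    assert(!kv.second.ref.expired());
    assert(kv.second.predictions.size() == 4);
  }
  return 0;
}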
// Configuration before data is known.
void Configure() override {
@ -499,10 +491,6 @@ class LearnerConfiguration : public Learner {
CHECK_NE(learner_model_param_.BaseScore(this->Ctx()).Size(), 0) << ModelNotFitted();
}
virtual PredictionContainer* GetPredictionCache() const {
return &((*ThreadLocalPredictionCache::Get())[this]);
}
void LoadConfig(Json const& in) override {
// If configuration is loaded, ensure that the model came from the same version
CHECK(IsA<Object>(in));
@ -741,11 +729,10 @@ class LearnerConfiguration : public Learner {
if (mparam_.num_feature == 0) {
// TODO(hcho3): Change num_feature to 64-bit integer
unsigned num_feature = 0;
auto local_cache = this->GetPredictionCache();
for (auto& matrix : local_cache->Container()) {
CHECK(matrix.first);
for (auto const& matrix : prediction_container_.Container()) {
CHECK(matrix.first.ptr);
CHECK(!matrix.second.ref.expired());
const uint64_t num_col = matrix.first->Info().num_col_;
const uint64_t num_col = matrix.first.ptr->Info().num_col_;
CHECK_LE(num_col, static_cast<uint64_t>(std::numeric_limits<unsigned>::max()))
<< "Unfortunately, XGBoost does not support data matrices with "
<< std::numeric_limits<unsigned>::max() << " features or greater";
@ -817,13 +804,13 @@ class LearnerConfiguration : public Learner {
*/
void ConfigureTargets() {
CHECK(this->obj_);
auto const& cache = this->GetPredictionCache()->Container();
auto const& cache = prediction_container_.Container();
size_t n_targets = 1;
for (auto const& d : cache) {
if (n_targets == 1) {
n_targets = this->obj_->Targets(d.first->Info());
n_targets = this->obj_->Targets(d.first.ptr->Info());
} else {
auto t = this->obj_->Targets(d.first->Info());
auto t = this->obj_->Targets(d.first.ptr->Info());
CHECK(n_targets == t || 1 == t) << "Inconsistent labels.";
}
}
@ -1275,8 +1262,7 @@ class LearnerImpl : public LearnerIO {
this->ValidateDMatrix(train.get(), true);
auto local_cache = this->GetPredictionCache();
auto& predt = local_cache->Cache(train, ctx_.gpu_id);
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
monitor_.Start("PredictRaw");
this->PredictRaw(train.get(), &predt, true, 0, 0);
@ -1303,8 +1289,7 @@ class LearnerImpl : public LearnerIO {
this->ValidateDMatrix(train.get(), true);
auto local_cache = this->GetPredictionCache();
auto& predt = local_cache->Cache(train, ctx_.gpu_id);
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get());
monitor_.Stop("BoostOneIter");
}
@ -1326,10 +1311,9 @@ class LearnerImpl : public LearnerIO {
metrics_.back()->Configure({cfg_.begin(), cfg_.end()});
}
auto local_cache = this->GetPredictionCache();
for (size_t i = 0; i < data_sets.size(); ++i) {
std::shared_ptr<DMatrix> m = data_sets[i];
auto &predt = local_cache->Cache(m, ctx_.gpu_id);
auto &predt = prediction_container_.Cache(m, ctx_.gpu_id);
this->ValidateDMatrix(m.get(), false);
this->PredictRaw(m.get(), &predt, false, 0, 0);
@ -1370,8 +1354,7 @@ class LearnerImpl : public LearnerIO {
} else if (pred_leaf) {
gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end);
} else {
auto local_cache = this->GetPredictionCache();
auto& prediction = local_cache->Cache(data, ctx_.gpu_id);
auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id);
this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end);
// Copy the prediction cache to output prediction. out_preds comes from C API
out_preds->SetDevice(ctx_.gpu_id);

View File

@ -14,9 +14,11 @@
#include <utility>
#include <vector>
#include "../common/algorithm.h" // ArgSort
#include "../common/math.h"
#include "../common/optional_weight.h" // OptionalWeights
#include "metric_common.h" // MetricNoCache
#include "xgboost/context.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/linalg.h"
#include "xgboost/metric.h"
@ -77,9 +79,8 @@ BinaryAUC(common::Span<float const> predts, linalg::VectorView<float const> labe
* Machine Learning Models
*/
template <typename BinaryAUC>
double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
size_t n_classes, int32_t n_threads,
BinaryAUC &&binary_auc) {
double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaInfo const &info,
size_t n_classes, int32_t n_threads, BinaryAUC &&binary_auc) {
CHECK_NE(n_classes, 0);
auto const labels = info.labels.View(Context::kCpuId);
if (labels.Shape(0) != 0) {
@ -108,7 +109,7 @@ double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
}
double fp;
std::tie(fp, tp(c), auc(c)) =
binary_auc(proba, linalg::MakeVec(response.data(), response.size(), -1), weights);
binary_auc(ctx, proba, linalg::MakeVec(response.data(), response.size(), -1), weights);
local_area(c) = fp * tp(c);
});
}
@ -139,23 +140,26 @@ double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
return auc_sum;
}
std::tuple<double, double, double> BinaryROCAUC(common::Span<float const> predts,
std::tuple<double, double, double> BinaryROCAUC(Context const *ctx,
common::Span<float const> predts,
linalg::VectorView<float const> labels,
common::OptionalWeights weights) {
auto const sorted_idx = common::ArgSort<size_t>(predts, std::greater<>{});
auto const sorted_idx =
common::ArgSort<size_t>(ctx, predts.data(), predts.data() + predts.size(), std::greater<>{});
return BinaryAUC(predts, labels, weights, sorted_idx, TrapezoidArea);
}
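
BinaryROCAUC now routes sorting through the shared ArgSort helper that takes the Context and an iterator pair; the computation itself is unchanged: sort predictions in descending order, sweep the thresholds, and accumulate trapezoids under the FP/TP curve. A standalone, unweighted, single-threaded sketch of that sweep:

// Standalone sketch of the sort-and-sweep ROC AUC: sort by prediction (descending),
// advance FP/TP counts, and close one trapezoid per distinct threshold.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

double BinaryROCAUCSketch(const std::vector<double>& predts, const std::vector<int>& labels) {
  std::vector<std::size_t> idx(predts.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::sort(idx.begin(), idx.end(),
            [&](std::size_t a, std::size_t b) { return predts[a] > predts[b]; });

  double fp = 0, tp = 0, prev_fp = 0, prev_tp = 0, auc = 0;
  for (std::size_t k = 0; k < idx.size(); ++k) {
    if (labels[idx[k]] == 1) { ++tp; } else { ++fp; }
    // Close a trapezoid only when the threshold actually changes (handles ties).
    if (k + 1 == idx.size() || predts[idx[k]] != predts[idx[k + 1]]) {
      auc += (fp - prev_fp) * (tp + prev_tp) / 2.0;  // trapezoid area
      prev_fp = fp;
      prev_tp = tp;
    }
  }
  return (fp > 0 && tp > 0) ? auc / (fp * tp) : 0.0;  // normalise by total area
}

int main() {
  std::vector<double> predts{0.9, 0.8, 0.3, 0.1};
  std::vector<int> labels{1, 0, 1, 0};
  std::cout << "ROC AUC: " << BinaryROCAUCSketch(predts, labels) << "\n";  // 0.75
  return 0;
}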
/**
* Calculate AUC for 1 ranking group;
*/
double GroupRankingROC(common::Span<float const> predts,
double GroupRankingROC(Context const* ctx, common::Span<float const> predts,
linalg::VectorView<float const> labels, float w) {
// on ranking, we just count all pairs.
double auc{0};
// argsort doesn't support tensor input yet.
auto raw_labels = labels.Values().subspan(0, labels.Size());
auto const sorted_idx = common::ArgSort<size_t>(raw_labels, std::greater<>{});
auto const sorted_idx = common::ArgSort<size_t>(
ctx, raw_labels.data(), raw_labels.data() + raw_labels.size(), std::greater<>{});
w = common::Sqr(w);
double sum_w = 0.0f;
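
As the comment says, within a single ranking group the AUC reduces to counting pairs: every pair with different relevance labels is comparable, a correctly ordered pair scores 1, and a tie in the predictions scores 0.5. A standalone sketch of that count, leaving out the group weight the real code squares and carries along:

// Standalone sketch of "count all pairs" inside one ranking group.
#include <cstddef>
#include <iostream>
#include <vector>

double GroupRankingAUCSketch(const std::vector<double>& predts,
                             const std::vector<double>& labels) {
  double correct = 0, total = 0;
  for (std::size_t i = 0; i < predts.size(); ++i) {
    for (std::size_t j = i + 1; j < predts.size(); ++j) {
      if (labels[i] == labels[j]) continue;  // equal relevance: not a comparable pair
      total += 1.0;
      bool higher_label_first = labels[i] > labels[j];
      bool higher_pred_first = predts[i] > predts[j];
      if (predts[i] == predts[j]) correct += 0.5;
      else if (higher_label_first == higher_pred_first) correct += 1.0;
    }
  }
  return total > 0 ? correct / total : 0.0;
}

int main() {
  // One query group: relevance labels and the model's scores.
  std::vector<double> labels{2.0, 1.0, 0.0};
  std::vector<double> predts{0.9, 0.4, 0.6};
  std::cout << "group AUC: " << GroupRankingAUCSketch(predts, labels) << "\n";  // 2/3
  return 0;
}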
@ -185,10 +189,11 @@ double GroupRankingROC(common::Span<float const> predts,
*
* https://doi.org/10.1371/journal.pone.0092209
*/
std::tuple<double, double, double> BinaryPRAUC(common::Span<float const> predts,
std::tuple<double, double, double> BinaryPRAUC(Context const *ctx, common::Span<float const> predts,
linalg::VectorView<float const> labels,
common::OptionalWeights weights) {
auto const sorted_idx = common::ArgSort<size_t>(predts, std::greater<>{});
auto const sorted_idx =
common::ArgSort<size_t>(ctx, predts.data(), predts.data() + predts.size(), std::greater<>{});
double total_pos{0}, total_neg{0};
for (size_t i = 0; i < labels.Size(); ++i) {
auto w = weights[i];
@ -211,9 +216,8 @@ std::tuple<double, double, double> BinaryPRAUC(common::Span<float const> predts,
* Cast LTR problem to binary classification problem by comparing pairs.
*/
template <bool is_roc>
std::pair<double, uint32_t> RankingAUC(std::vector<float> const &predts,
MetaInfo const &info,
int32_t n_threads) {
std::pair<double, uint32_t> RankingAUC(Context const *ctx, std::vector<float> const &predts,
MetaInfo const &info, int32_t n_threads) {
CHECK_GE(info.group_ptr_.size(), 2);
uint32_t n_groups = info.group_ptr_.size() - 1;
auto s_predts = common::Span<float const>{predts};
@ -237,9 +241,9 @@ std::pair<double, uint32_t> RankingAUC(std::vector<float> const &predts,
auc = 0;
} else {
if (is_roc) {
auc = GroupRankingROC(g_predts, g_labels, w);
auc = GroupRankingROC(ctx, g_predts, g_labels, w);
} else {
auc = std::get<2>(BinaryPRAUC(g_predts, g_labels, common::OptionalWeights{w}));
auc = std::get<2>(BinaryPRAUC(ctx, g_predts, g_labels, common::OptionalWeights{w}));
}
if (std::isnan(auc)) {
invalid_groups++;
@ -344,7 +348,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
auto n_threads = ctx_->Threads();
if (ctx_->gpu_id == Context::kCpuId) {
std::tie(auc, valid_groups) =
RankingAUC<true>(predts.ConstHostVector(), info, n_threads);
RankingAUC<true>(ctx_, predts.ConstHostVector(), info, n_threads);
} else {
std::tie(auc, valid_groups) =
GPURankingAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_);
@ -358,8 +362,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
auto n_threads = ctx_->Threads();
CHECK_NE(n_classes, 0);
if (ctx_->gpu_id == Context::kCpuId) {
auc = MultiClassOVR(predts.ConstHostVector(), info, n_classes, n_threads,
BinaryROCAUC);
auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC);
} else {
auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes);
}
@ -370,8 +373,8 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
double fp, tp, auc;
if (ctx_->gpu_id == Context::kCpuId) {
std::tie(fp, tp, auc) =
BinaryROCAUC(predts.ConstHostVector(), info.labels.HostView().Slice(linalg::All(), 0),
std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(),
info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
} else {
std::tie(fp, tp, auc) = GPUBinaryROCAUC(predts.ConstDeviceSpan(), info,
@ -422,7 +425,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
double pr, re, auc;
if (ctx_->gpu_id == Context::kCpuId) {
std::tie(pr, re, auc) =
BinaryPRAUC(predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
} else {
std::tie(pr, re, auc) = GPUBinaryPRAUC(predts.ConstDeviceSpan(), info,
@ -435,8 +438,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
size_t n_classes) {
if (ctx_->gpu_id == Context::kCpuId) {
auto n_threads = this->ctx_->Threads();
return MultiClassOVR(predts.ConstHostSpan(), info, n_classes, n_threads,
BinaryPRAUC);
return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC);
} else {
return GPUMultiClassPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_, n_classes);
}
@ -453,7 +455,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
InvalidLabels();
}
std::tie(auc, valid_groups) =
RankingAUC<false>(predts.ConstHostVector(), info, n_threads);
RankingAUC<false>(ctx_, predts.ConstHostVector(), info, n_threads);
} else {
std::tie(auc, valid_groups) =
GPURankingPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_);

View File

@ -5,7 +5,7 @@
#include <algorithm>
#include <cassert>
#include <cub/cub.cuh>
#include <cub/cub.cuh> // NOLINT
#include <limits>
#include <memory>
#include <tuple>

View File

@ -451,9 +451,8 @@ class QuantileError : public MetricNoCache {
auto alpha = ctx->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
std::size_t n_targets = preds.Size() / info.num_row_ / alpha_.Size();
CHECK_NE(n_targets, 0);
auto y_predt = linalg::MakeTensorView(
ctx->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan(),
{static_cast<std::size_t>(info.num_row_), alpha_.Size(), n_targets}, ctx->gpu_id);
auto y_predt = linalg::MakeTensorView(ctx, &preds, static_cast<std::size_t>(info.num_row_),
alpha_.Size(), n_targets);
info.weights_.SetDevice(ctx->gpu_id);
common::OptionalWeights weight{ctx->IsCPU() ? info.weights_.ConstHostSpan()

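MakeTensorView is now handed the Context and the HostDeviceVector directly, together with the shape (num_row, number of quantiles, n_targets). A standalone sketch of the row-major indexing such a (row, quantile, target) view implies over the flat prediction buffer; sizes and names are illustrative:

// Standalone sketch of the row-major (row, quantile, target) layout the tensor view
// above exposes over the flat prediction buffer.
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  const std::size_t num_row = 4, n_alphas = 3, n_targets = 2;
  std::vector<float> flat_predt(num_row * n_alphas * n_targets);

  // Fill so each element encodes its own (i, j, k) coordinates for the check below.
  for (std::size_t i = 0; i < num_row; ++i)
    for (std::size_t j = 0; j < n_alphas; ++j)
      for (std::size_t k = 0; k < n_targets; ++k)
        flat_predt[(i * n_alphas + j) * n_targets + k] =
            static_cast<float>(100 * i + 10 * j + k);

  // y_predt(i, j, k) in the view corresponds to this flat offset.
  auto at = [&](std::size_t i, std::size_t j, std::size_t k) {
    return flat_predt[(i * n_alphas + j) * n_targets + k];
  };
  assert(at(2, 1, 1) == 211.0f);
  return 0;
}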
Some files were not shown because too many files have changed in this diff